From 8bd7d308e3381fc961b11f0a8194778f76de1b16 Mon Sep 17 00:00:00 2001
From: vlordier <vincent.lordier@sanofi.com>
Date: Thu, 26 Feb 2026 23:24:18 +0100
Subject: [PATCH 01/29] Add TopK layer and pnnx ONNX TopK lowering

---
 src/CMakeLists.txt                |   1 +
 src/layer/topk.cpp                | 194 ++++++++++++++++++++++++++++++
 src/layer/topk.h                  |  29 +++++
 tests/CMakeLists.txt              |   1 +
 tests/test_topk.cpp               |  88 ++++++++++++++
 tools/pnnx/src/CMakeLists.txt     |   1 +
 tools/pnnx/src/pass_ncnn/TopK.cpp |  97 +++++++++++++++
 7 files changed, 411 insertions(+)
 create mode 100644 src/layer/topk.cpp
 create mode 100644 src/layer/topk.h
 create mode 100644 tests/test_topk.cpp
 create mode 100644 tools/pnnx/src/pass_ncnn/TopK.cpp
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 614c3b8f31f1..c79d779cf220 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -101,6 +101,7 @@ ncnn_add_layer(SPP OFF)
 ncnn_add_layer(TanH)
 ncnn_add_layer(Threshold)
 ncnn_add_layer(Tile)
+ncnn_add_layer(TopK)
 ncnn_add_layer(RNN)
 ncnn_add_layer(LSTM)
 ncnn_add_layer(BinaryOp)
diff --git a/src/layer/topk.cpp b/src/layer/topk.cpp
new file mode 100644
index 000000000000..c65dbc9689ba
--- /dev/null
+++ b/src/layer/topk.cpp
@@ -0,0 +1,194 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "topk.h"
+
+#include <algorithm>
+#include <vector>
+
+namespace ncnn {
+
+TopK::TopK() // defaults match the fallbacks used by load_param()
+{
+    k = 1;
+    sorted = 1;
+    largest = 1;
+    axis = -1;
+    // layer traits: multi-blob forward(), no in-place execution
+    support_inplace = false;
+    one_blob_only = false;
+}
+
+// Load params 0..3 into axis, largest, sorted and k; always returns 0.
+int TopK::load_param(const ParamDict& pd)
+{
+    k = pd.get(3, 1);
+    sorted = pd.get(2, 1);
+    largest = pd.get(1, 1);
+    axis = pd.get(0, -1);
+    return 0;
+}
+
+// Orders (value, index) pairs by value, descending when largest is set; ties go to the lower index.
+struct TopKCompare
+{
+    bool largest;
+
+    bool operator()(const std::pair<float, int>& a, const std::pair<float, int>& b) const
+    {
+        if (a.first != b.first)
+            return largest ? a.first > b.first : a.first < b.first;
+        return a.second < b.second;
+    }
+};
+
+// Select the k largest (or smallest) entries of bottom_blobs[0] along axis.
+//
+// bottom_blobs[1], when present, overrides the k param with its first element.
+// top_blobs[0] receives the selected values; top_blobs[1], when present, receives
+// their positions along the axis, stored as float.
+//
+// axis follows the ncnn convention: 0 is the outermost dimension (c for 3-D/4-D,
+// h for 2-D) and dims-1 is w; negative values count back from w. A k larger than
+// the axis length is clamped to it.
+//
+// Returns 0 on success, -1 on invalid arguments, -100 on unsupported rank or failed allocation.
+int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    if (bottom_blobs.empty() || top_blobs.empty())
+        return -1;
+
+    const Mat& bottom_blob = bottom_blobs[0];
+
+    int _k = k;
+    if (bottom_blobs.size() >= 2)
+    {
+        const Mat& k_blob = bottom_blobs[1];
+        if (k_blob.total() < 1)
+            return -1;
+
+        // NOTE(review): k is read as float32 - confirm no producer passes an integer-typed blob
+        _k = (int)((const float*)k_blob)[0];
+    }
+
+    const int dims = bottom_blob.dims;
+    if (dims < 1 || dims > 4)
+        return -100;
+
+    const int axis_p = axis < 0 ? axis + dims : axis;
+    if (axis_p < 0 || axis_p >= dims)
+        return -1;
+
+    // shape[] is kept innermost-first (w, h, [d,] c); axis_i is the axis translated into that order
+    const int axis_i = dims - 1 - axis_p;
+
+    int shape[4] = {1, 1, 1, 1};
+    shape[0] = bottom_blob.w;
+    if (dims >= 2) shape[1] = bottom_blob.h;
+    if (dims >= 3) shape[2] = dims == 3 ? bottom_blob.c : bottom_blob.d;
+    if (dims >= 4) shape[3] = bottom_blob.c;
+
+    const int axis_size = shape[axis_i];
+    if (axis_size <= 0)
+        return -1;
+
+    if (_k < 0)
+        return -1;
+    if (_k > axis_size)
+        _k = axis_size;
+
+    int out_shape[4] = {shape[0], shape[1], shape[2], shape[3]};
+    out_shape[axis_i] = _k;
+
+    Mat values;
+    if (dims == 1) values.create(out_shape[0], 4u, opt.blob_allocator);
+    if (dims == 2) values.create(out_shape[0], out_shape[1], 4u, opt.blob_allocator);
+    if (dims == 3) values.create(out_shape[0], out_shape[1], out_shape[2], 4u, opt.blob_allocator);
+    if (dims == 4) values.create(out_shape[0], out_shape[1], out_shape[2], out_shape[3], 4u, opt.blob_allocator);
+    if (values.empty())
+        return -100;
+
+    const bool want_indices = top_blobs.size() >= 2;
+
+    Mat indices;
+    if (want_indices)
+    {
+        indices.create_like(values, opt.blob_allocator);
+        if (indices.empty())
+            return -100;
+    }
+
+    const float* ptr = bottom_blob;
+    float* outptr = values;
+    float* outidxptr = indices; // stays null when no index blob was requested
+
+    // inner: element stride of the selected axis; outer: number of slices above it
+    int inner = 1;
+    for (int i = 0; i < axis_i; i++)
+        inner *= shape[i];
+
+    int outer = 1;
+    for (int i = axis_i + 1; i < dims; i++)
+        outer *= shape[i];
+
+    // Channels of 3-D/4-D blobs are padded to cstep elements, so a logical offset is
+    // split into (channel, offset inside the channel) before touching memory. For
+    // 1-D/2-D blobs the plane spans the whole blob and the mapping is the identity.
+    const int in_plane = shape[0] * shape[1] * (dims == 4 ? shape[2] : 1);
+    const int out_plane = out_shape[0] * out_shape[1] * (dims == 4 ? out_shape[2] : 1);
+    const size_t in_cstep = bottom_blob.cstep;
+    const size_t out_cstep = values.cstep;
+
+    const TopKCompare comp = {largest != 0};
+    const bool sorted_p = sorted != 0;
+    const int total_lines = outer * inner;
+
+#pragma omp parallel for num_threads(opt.num_threads)
+    for (int line = 0; line < total_lines; line++)
+    {
+        const int outer_i = line / inner;
+        const int inner_i = line - outer_i * inner;
+
+        const int in_base = outer_i * axis_size * inner + inner_i;
+        const int out_base = outer_i * _k * inner + inner_i;
+
+        std::vector<std::pair<float, int> > vec;
+        vec.resize(axis_size);
+
+        for (int j = 0; j < axis_size; j++)
+        {
+            const int offset = in_base + j * inner;
+            vec[j].first = ptr[(offset / in_plane) * in_cstep + offset % in_plane];
+            vec[j].second = j;
+        }
+
+        if (_k < axis_size)
+        {
+            if (sorted_p)
+                std::partial_sort(vec.begin(), vec.begin() + _k, vec.end(), comp);
+            else
+                std::nth_element(vec.begin(), vec.begin() + _k, vec.end(), comp);
+        }
+        else if (sorted_p)
+        {
+            std::sort(vec.begin(), vec.end(), comp);
+        }
+
+        for (int j = 0; j < _k; j++)
+        {
+            const int offset = out_base + j * inner;
+            const size_t dst = (offset / out_plane) * out_cstep + offset % out_plane;
+            outptr[dst] = vec[j].first;
+            if (outidxptr)
+                outidxptr[dst] = (float)vec[j].second;
+        }
+    }
+
+    top_blobs[0] = values;
+    if (want_indices)
+        top_blobs[1] = indices;
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/topk.h b/src/layer/topk.h
new file mode 100644
index 000000000000..ff8f410926d8
--- /dev/null
+++ b/src/layer/topk.h
@@ -0,0 +1,29 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef LAYER_TOPK_H
+#define LAYER_TOPK_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+class TopK : public Layer // selects k entries along one axis of the input blob
+{
+public:
+    TopK();
+    // reads params 0..3 into axis, largest, sorted and k
+    virtual int load_param(const ParamDict& pd);
+    // bottom_blobs[0] is the data; a second bottom blob, if present, supplies k at runtime
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+
+public:
+    int axis;    // param 0, default -1; negative values are offset by the input dims
+    int largest; // param 1, default 1; non-zero picks the largest values, zero the smallest
+    int sorted;  // param 2, default 1; non-zero sorts the selected entries
+    int k;       // param 3, default 1; used when no k blob is given
+};
+
+} // namespace ncnn
+
+#endif // LAYER_TOPK_H
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index e72e6d02b86e..4f40f8279428 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -166,6 +166,7 @@ ncnn_add_layer_test(Spectrogram)
 ncnn_add_layer_test(Squeeze)
 ncnn_add_layer_test(Swish)
 ncnn_add_layer_test(TanH)
+ncnn_add_layer_test(TopK)
 ncnn_add_layer_test(Tile)
 ncnn_add_layer_test(UnaryOp)
 ncnn_add_layer_test(Unfold)
diff --git a/tests/test_topk.cpp b/tests/test_topk.cpp
new file mode 100644
index 000000000000..7b7fe82690ba
--- /dev/null
+++ b/tests/test_topk.cpp
@@ -0,0 +1,88 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "testutil.h"
+
+// Run TopK on a with the given params, passing 2 as the top blob count to test_layer.
+// Returns test_layer's status; a failing configuration is logged to stderr.
+static int test_topk(const ncnn::Mat& a, int axis, int k, int largest, int sorted)
+{
+    ncnn::ParamDict pd;
+    pd.set(3, k);
+    pd.set(2, sorted);
+    pd.set(1, largest);
+    pd.set(0, axis);
+
+    const std::vector<ncnn::Mat> weights;
+    const std::vector<ncnn::Mat> inputs(1, a);
+
+    const int ret = test_layer("TopK", pd, weights, inputs, 2, 0.01f, TEST_LAYER_DISABLE_AUTO_INPUT_CASTING);
+    if (ret == 0)
+        return 0;
+
+    // report the configuration that failed
+    fprintf(stderr, "test_topk failed a.dims=%d a=(%d %d %d %d) axis=%d k=%d largest=%d sorted=%d\n", a.dims, a.w, a.h, a.d, a.c, axis, k, largest, sorted);
+    return ret;
+}
+
+static int test_topk_0() // 1-D input
+{
+    ncnn::Mat a = RandomMat(13);
+    return 0
+           || test_topk(a, 0, 1, 1, 1)
+           || test_topk(a, 0, 5, 1, 1)
+           || test_topk(a, -1, 7, 0, 1)
+           || test_topk(a, 0, 9, 1, 1)
+           || test_topk(a, 0, 13, 1, 1); // k == axis length: the whole axis gets sorted
+}
+
+static int test_topk_1() // 2-D input
+{
+    ncnn::Mat a = RandomMat(12, 17);
+    return 0
+           || test_topk(a, 0, 1, 1, 1)
+           || test_topk(a, 0, 5, 1, 1)
+           || test_topk(a, 1, 3, 1, 1)
+           || test_topk(a, -1, 8, 0, 1)
+           || test_topk(a, -2, 7, 1, 1)
+           || test_topk(a, 1, 3, 1, 0); // sorted=0 exercises the unsorted selection path
+}
+
+static int test_topk_2() // 3-D input
+{
+    ncnn::Mat a = RandomMat(8, 9, 11);
+    return 0
+           || test_topk(a, 0, 3, 1, 1)
+           || test_topk(a, 1, 4, 1, 1)
+           || test_topk(a, 2, 2, 0, 1)
+           || test_topk(a, -1, 6, 1, 1)
+           || test_topk(a, -2, 5, 0, 1)
+           || test_topk(a, -3, 7, 1, 1)
+           || test_topk(a, 2, 2, 0, 0); // smallest values, unsorted
+}
+
+static int test_topk_3() // 4-D input
+{
+    ncnn::Mat a = RandomMat(5, 7, 9, 10);
+    return 0
+           || test_topk(a, 0, 2, 1, 1)
+           || test_topk(a, 1, 3, 0, 1)
+           || test_topk(a, 2, 4, 1, 1)
+           || test_topk(a, 3, 5, 1, 1)
+           || test_topk(a, -1, 6, 0, 1)
+           || test_topk(a, -2, 3, 1, 1)
+           || test_topk(a, -3, 4, 0, 1)
+           || test_topk(a, -4, 2, 1, 1)
+           || test_topk(a, 1, 20, 1, 1); // k exceeds every axis length and must be clamped
+}
+
+int main()
+{
+    // seed the test RNG with a fixed value
+    SRAND(7767517);
+
+    // short-circuits at the first failing rank; any non-zero status becomes exit code 1
+    const bool failed = test_topk_0() || test_topk_1()
+                        || test_topk_2() || test_topk_3();
+    return failed ? 1 : 0;
+}
diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt
index 3e0c6f865a87..c554a6873e81 100644
--- a/tools/pnnx/src/CMakeLists.txt
+++ b/tools/pnnx/src/CMakeLists.txt
@@ -592,6 +592,7 @@ set(pnnx_pass_ncnn_SRCS
     pass_ncnn/Tensor_reshape_as.cpp
     pass_ncnn/Tensor_repeat.cpp
     pass_ncnn/Tensor_unflatten.cpp
+    pass_ncnn/TopK.cpp
     pass_ncnn/torch_addmm.cpp
     pass_ncnn/torch_amax.cpp
     pass_ncnn/torch_amin.cpp
diff --git a/tools/pnnx/src/pass_ncnn/TopK.cpp b/tools/pnnx/src/pass_ncnn/TopK.cpp
new file mode 100644
index 000000000000..515790e38518
--- /dev/null
+++ b/tools/pnnx/src/pass_ncnn/TopK.cpp
@@ -0,0 +1,97 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "pass_ncnn.h"
+
+namespace pnnx {
+
+namespace ncnn {
+
+// Map a captured bool (type 1) or int (type 2) parameter to 0/1; other types yield default_value.
+static int parameter_to_bool(const Parameter& p, int default_value)
+{
+    const bool is_bool = p.type == 1;
+    const bool is_int = p.type == 2;
+    if (!is_bool && !is_int)
+        return default_value;
+    return (is_bool ? p.b : p.i != 0) ? 1 : 0;
+}
+
+// Lowers an ONNX TopK node with both outputs (values, indices) to the ncnn TopK layer.
+class TopK : public GraphRewriterPass
+{
+public:
+    const char* match_pattern_graph() const
+    {
+        return R"PNNXIR(7767517
+4 4
+pnnx.Input              input_0     0 1 input
+pnnx.Input              input_1     0 1 k
+TopK                    op_0        2 2 input k values indices %*=%*
+pnnx.Output             output      2 0 values indices
+)PNNXIR";
+    }
+
+    const char* type_str() const
+    {
+        return "TopK";
+    }
+
+    const char* name_str() const
+    {
+        return "topk";
+    }
+
+    // Writes ncnn params 0=axis, 1=largest, 2=sorted; k is left as the second input.
+    void write(Operator* op, const std::map<std::string, Parameter>& captured_params) const
+    {
+        int axis = -1;
+        if (captured_params.find("op_0.axis") != captured_params.end())
+            axis = captured_params.at("op_0.axis").i;
+        int largest = 1;
+        if (captured_params.find("op_0.largest") != captured_params.end())
+            largest = parameter_to_bool(captured_params.at("op_0.largest"), 1);
+        int sorted = 1;
+        if (captured_params.find("op_0.sorted") != captured_params.end())
+            sorted = parameter_to_bool(captured_params.at("op_0.sorted"), 1);
+
+        const int batch_index = op->inputs[0]->params["__batch_index"].i;
+        // resolve a negative axis first, otherwise it can never compare equal to batch_index
+        const int input_rank = (int)op->inputs[0]->shape.size();
+        if (axis < 0 && input_rank > 0)
+            axis += input_rank;
+
+        if (axis == batch_index)
+        {
+            fprintf(stderr, "TopK along batch axis is not supported\n");
+            return;
+        }
+        // axes behind the batch dimension shift down by one once it is dropped
+        op->params["0"] = axis > batch_index ? axis - 1 : axis;
+        op->params["1"] = largest;
+        op->params["2"] = sorted;
+    }
+};
+
+REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(TopK, 20)
+
+class TopK_0 : public TopK // same lowering for graphs that only consume the values output
+{
+public:
+    const char* match_pattern_graph() const
+    {
+        return R"PNNXIR(7767517
+4 3
+pnnx.Input              input_0     0 1 input
+pnnx.Input              input_1     0 1 k
+TopK                    op_0        2 1 input k values %*=%*
+pnnx.Output             output      1 0 values
+)PNNXIR";
+    }
+};
+
+REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(TopK_0, 20)
+
+} // namespace ncnn
+
+} // namespace pnnx

From b2c445a61763ccf3e1e162803ccc23bdcb0b8d12 Mon Sep 17 00:00:00 2001
From: vlordier <vincent.lordier@sanofi.com>
Date: Thu, 26 Feb 2026 23:34:51 +0100
Subject: [PATCH 02/29] Add ONNX torch_topk pnnx regression test

---
 tools/pnnx/tests/onnx/CMakeLists.txt     |  1 +
 tools/pnnx/tests/onnx/test_torch_topk.py | 61 ++++++++++++++++++++++++
 2 files changed, 62 insertions(+)
 create mode 100644 tools/pnnx/tests/onnx/test_torch_topk.py

diff --git a/tools/pnnx/tests/onnx/CMakeLists.txt b/tools/pnnx/tests/onnx/CMakeLists.txt
index f029a669584d..ba821233ad12 100644
--- a/tools/pnnx/tests/onnx/CMakeLists.txt
+++ b/tools/pnnx/tests/onnx/CMakeLists.txt
@@ -191,6 +191,7 @@ pnnx_onnx_add_test(torch_split)
 pnnx_onnx_add_test(torch_squeeze)
 pnnx_onnx_add_test(torch_stack)
 pnnx_onnx_add_test(torch_sum)
+pnnx_onnx_add_test(torch_topk)
 pnnx_onnx_add_test(torch_transpose)
 pnnx_onnx_add_test(torch_unbind)
 pnnx_onnx_add_test(torch_unsqueeze)
diff --git a/tools/pnnx/tests/onnx/test_torch_topk.py b/tools/pnnx/tests/onnx/test_torch_topk.py
new file mode 100644
index 000000000000..fe3d15c99b84
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_torch_topk.py
@@ -0,0 +1,61 @@
+# Copyright 2026 Tencent
+# SPDX-License-Identifier: BSD-3-Clause
+
+import torch
+import torch.nn as nn
+
+
+class Model(nn.Module):
+    """Applies torch.topk to three inputs (dims 1, 3 and 0; largest and smallest)."""
+
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x, y, z):
+        # (tensor, k, dim, largest) per input; every call requests sorted output
+        cases = ((x, 2, 1, True), (y, 4, 3, False), (z, 3, 0, True))
+        outputs = []
+        for tensor, k, dim, largest in cases:
+            values, indices = torch.topk(tensor, k, dim=dim, largest=largest, sorted=True)
+            outputs.extend((values, indices))
+
+        return tuple(outputs)
+
+
+def test():
+    """Export Model to ONNX, convert it with pnnx, and compare pnnx inference with eager PyTorch.
+
+    Returns True when every output pair compared by zip() is exactly equal.
+    """
+    import os
+
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 3, 16)
+    y = torch.rand(1, 5, 9, 11)
+    z = torch.rand(14, 8, 5, 9, 10)
+
+    expected = net(x, y, z)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z), "test_torch_topk.onnx")
+
+    # onnx to pnnx
+    os.system("../../src/pnnx test_torch_topk.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]")
+
+    # pnnx inference
+    import test_torch_topk_pnnx
+
+    actual = test_torch_topk_pnnx.test_inference()
+
+    # zip() stops at the shorter of the two output sequences
+    return all(torch.equal(want, got) for want, got in zip(expected, actual))
+
+
+if __name__ == "__main__":
+    # exit status 0 when test() passes, 1 otherwise
+    passed = test()
+    status = 0 if passed else 1
+    exit(status)

From 01d15cb58615e20d35c1fc3071fee5cbd378efc3 Mon Sep 17 00:00:00 2001
From: vlordier <vincent.lordier@sanofi.com>
Date: Fri, 27 Feb 2026 08:33:25 +0100
Subject: [PATCH 03/29] Add TopK Python class generation to pnnx module export
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Generate TopK class definition in pnnx.py output with forward() method
- Instantiate TopK modules in Model.__init__() with proper parameters
- Update forward() method to call self.topk_name() instead of direct TopK() calls
- Fixes pnnx inference to properly execute TopK operations using torch.topk()
- Test confirms TopK ONNX→pnnx conversion and inference working correctly
---
 tools/pnnx/src/CMakeLists.txt                 |  12 +-
 tools/pnnx/src/ir.cpp                         |  78 +++++++++++++
 tools/pnnx/src/load_onnx.cpp                  |   8 ++
 tools/pnnx/src/pass_onnx/fold_constants.cpp   |   8 ++
 tools/pnnx/src/pass_onnx/shape_inference.cpp  |   8 ++
 tools/pnnx/src/pnnx                           |   1 +
 tools/pnnx/tests/onnx/test_torch_topk.onnx    | Bin 0 -> 3317 bytes
 .../pnnx/tests/onnx/test_torch_topk.onnx.data |   0
 .../pnnx/tests/onnx/test_torch_topk.pnnx.bin  | Bin 0 -> 98 bytes
 .../pnnx/tests/onnx/test_torch_topk.pnnx.onnx | Bin 0 -> 882 bytes
 .../tests/onnx/test_torch_topk.pnnx.param     |  17 +++
 .../tests/onnx/test_torch_topk.pnnxsim.onnx   | Bin 0 -> 2861 bytes
 tools/pnnx/tests/onnx/test_torch_topk_pnnx.py | 109 ++++++++++++++++++
 13 files changed, 236 insertions(+), 5 deletions(-)
 create mode 120000 tools/pnnx/src/pnnx
 create mode 100644 tools/pnnx/tests/onnx/test_torch_topk.onnx
 create mode 100644 tools/pnnx/tests/onnx/test_torch_topk.onnx.data
 create mode 100644 tools/pnnx/tests/onnx/test_torch_topk.pnnx.bin
 create mode 100644 tools/pnnx/tests/onnx/test_torch_topk.pnnx.onnx
 create mode 100644 tools/pnnx/tests/onnx/test_torch_topk.pnnx.param
 create mode 100644 tools/pnnx/tests/onnx/test_torch_topk.pnnxsim.onnx
 create mode 100644 tools/pnnx/tests/onnx/test_torch_topk_pnnx.py

diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt
index c554a6873e81..6231e36b16ac 100644
--- a/tools/pnnx/src/CMakeLists.txt
+++ b/tools/pnnx/src/CMakeLists.txt
@@ -630,23 +630,25 @@ if(PROTOBUF_FOUND)
         set(CMAKE_CXX_STANDARD 17)
     endif()
 
-    if(Protobuf_FOUND OR protobuf_MODULE_COMPATIBLE)
+    if(COMMAND protobuf_generate_cpp)
         protobuf_generate_cpp(ONNX_PROTO_SRCS ONNX_PROTO_HDRS onnx-data.proto onnx-ml.proto onnx-operators-ml.proto)
         add_library(onnxproto STATIC ${ONNX_PROTO_SRCS} ${ONNX_PROTO_HDRS})
         target_include_directories(onnxproto PUBLIC ${PROTOBUF_INCLUDE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
         target_link_libraries(onnxproto PUBLIC ${PROTOBUF_LIBRARIES})
-    else()
+    elseif(COMMAND protobuf_generate)
         add_library(onnxproto STATIC onnx-data.proto onnx-ml.proto onnx-operators-ml.proto)
         target_include_directories(onnxproto PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
         protobuf_generate(TARGET onnxproto)
         target_link_libraries(onnxproto PUBLIC protobuf::libprotobuf)
+    else()
+        message(FATAL_ERROR "Neither protobuf_generate_cpp nor protobuf_generate is available. Please install protobuf with CMake codegen support.")
     endif()
 
     # use onnxruntime onnx proto if found
     if(onnxruntime_FOUND)
         add_dependencies(onnxruntime::onnxruntime onnxproto)
 
-        if(Protobuf_FOUND OR protobuf_MODULE_COMPATIBLE)
+        if(COMMAND protobuf_generate_cpp)
             set_property(TARGET onnxruntime::onnxruntime APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${PROTOBUF_INCLUDE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
             set_property(TARGET onnxruntime::onnxruntime APPEND PROPERTY INTERFACE_LINK_LIBRARIES ${PROTOBUF_LIBRARIES})
         else()
@@ -688,7 +690,7 @@ if(PROTOBUF_FOUND)
         save_onnx.cpp
     )
     if(onnxruntime_FOUND)
-        target_link_libraries(pnnx2onnx PRIVATE onnxruntime::onnxruntime)
+        target_link_libraries(pnnx2onnx PRIVATE onnxruntime::onnxruntime onnxproto)
     else()
         target_link_libraries(pnnx2onnx PRIVATE onnxproto)
     endif()
@@ -720,7 +722,7 @@ if(onnxruntime_FOUND)
     )
 
     add_library(onnx2pnnx OBJECT ${onnx2pnnx_SRCS})
-    target_link_libraries(onnx2pnnx PRIVATE onnxruntime::onnxruntime)
+    target_link_libraries(onnx2pnnx PRIVATE onnxruntime::onnxruntime onnxproto)
     target_compile_definitions(onnx2pnnx PRIVATE BUILD_ONNX2PNNX)
 
     message(STATUS "Building with onnx2pnnx")
diff --git a/tools/pnnx/src/ir.cpp b/tools/pnnx/src/ir.cpp
index 44e4b77fdf2f..63f9c70e21f4 100644
--- a/tools/pnnx/src/ir.cpp
+++ b/tools/pnnx/src/ir.cpp
@@ -1479,6 +1479,33 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath, con
 
     fprintf(pyfp, "\n");
 
+    // output custom layer classes for pnnx operators
+    {
+        bool has_topk = false;
+        for (const Operator* op : ops)
+        {
+            if (op->type == "TopK")
+            {
+                has_topk = true;
+                break;
+            }
+        }
+        // emit the helper class once; its axis default matches ONNX TopK (-1, the last axis)
+        if (has_topk)
+        {
+            fprintf(pyfp, "class TopK(nn.Module):\n");
+            fprintf(pyfp, "    def __init__(self, axis=-1, largest=1, sorted=1):\n");
+            fprintf(pyfp, "        super(TopK, self).__init__()\n");
+            fprintf(pyfp, "        self.axis = axis\n");
+            fprintf(pyfp, "        self.largest = largest\n");
+            fprintf(pyfp, "        self.sorted = sorted\n");
+            fprintf(pyfp, "    def forward(self, x, k):\n");
+            fprintf(pyfp, "        # Torch topk returns (values, indices)\n");
+            fprintf(pyfp, "        return torch.topk(x, k.item() if hasattr(k, 'item') else k, dim=self.axis, largest=bool(self.largest), sorted=bool(self.sorted))\n");
+            fprintf(pyfp, "\n");
+        }
+    }
+
     fprintf(pyfp, "class Model(nn.Module):\n");
     fprintf(pyfp, "    def __init__(self):\n");
     fprintf(pyfp, "        super(Model, self).__init__()\n");
@@ -1605,6 +1632,39 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath, con
         }
     }
 
+    // TopK modules
+    {
+        for (const Operator* op : ops)
+        {
+            if (op->type != "TopK")
+                continue;
+
+            fprintf(pyfp, "        self.%s = TopK(", sanitize_identifier(op->name).c_str());
+
+            // The generated TopK.__init__ only takes axis/largest/sorted, and only int (type 2)
+            // or bool (type 1) values can be printed here; anything else is left out so the
+            // emitted call stays valid Python.
+            bool first = true;
+            for (const auto& it : op->params)
+            {
+                const std::string& key = it.first;
+                const Parameter& param = it.second;
+                if (key != "axis" && key != "largest" && key != "sorted")
+                    continue;
+                if (param.type != 1 && param.type != 2)
+                    continue;
+
+                const int value = param.type == 2 ? param.i : (param.b ? 1 : 0);
+
+                // the separator precedes every argument except the first one written
+                fprintf(pyfp, "%s%s=%d", first ? "" : ", ", key.c_str(), value);
+                first = false;
+            }
+
+            fprintf(pyfp, ")\n");
+        }
+    }
+
     fprintf(pyfp, "\n");
 
     // load weights
@@ -2186,6 +2246,24 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath, con
                 }
                 fprintf(pyfp, ")\n");
             }
+            else if (op->type == "TopK")
+            {
+                // self.topk_name()
+                for (size_t i = 0; i < op->outputs.size(); i++)
+                {
+                    fprintf(pyfp, "v_%s", sanitize_identifier(op->outputs[i]->name).c_str());
+                    if (i + 1 != op->outputs.size())
+                        fprintf(pyfp, ", ");
+                }
+                fprintf(pyfp, " = self.%s(", sanitize_identifier(op->name).c_str());
+                for (size_t i = 0; i < op->inputs.size(); i++)
+                {
+                    fprintf(pyfp, "v_%s", sanitize_identifier(op->inputs[i]->name).c_str());
+                    if (i + 1 != op->inputs.size())
+                        fprintf(pyfp, ", ");
+                }
+                fprintf(pyfp, ")\n");
+            }
             else
             {
                 if (op->type.find("::") == std::string::npos && op->type.find(".") == std::string::npos)
diff --git a/tools/pnnx/src/load_onnx.cpp b/tools/pnnx/src/load_onnx.cpp
index 3c788a0c4849..6cc4a1de4284 100644
--- a/tools/pnnx/src/load_onnx.cpp
+++ b/tools/pnnx/src/load_onnx.cpp
@@ -13,7 +13,15 @@
 #include <chrono>
 #include <fstream>
 
+#if __has_include(<onnxruntime_c_api.h>)
 #include <onnxruntime_c_api.h>
+#elif __has_include(<onnxruntime/onnxruntime_c_api.h>)
+#include <onnxruntime/onnxruntime_c_api.h>
+#elif __has_include(<onnxruntime/core/session/onnxruntime_c_api.h>)
+#include <onnxruntime/core/session/onnxruntime_c_api.h>
+#else
+#error "onnxruntime_c_api.h not found"
+#endif
 
 #include "ir.h"
 
diff --git a/tools/pnnx/src/pass_onnx/fold_constants.cpp b/tools/pnnx/src/pass_onnx/fold_constants.cpp
index 1ef0092a72ec..c79cb29f34a1 100644
--- a/tools/pnnx/src/pass_onnx/fold_constants.cpp
+++ b/tools/pnnx/src/pass_onnx/fold_constants.cpp
@@ -9,7 +9,15 @@
 #include <unordered_map>
 #include <unordered_set>
 
+#if __has_include(<onnxruntime_c_api.h>)
 #include <onnxruntime_c_api.h>
+#elif __has_include(<onnxruntime/onnxruntime_c_api.h>)
+#include <onnxruntime/onnxruntime_c_api.h>
+#elif __has_include(<onnxruntime/core/session/onnxruntime_c_api.h>)
+#include <onnxruntime/core/session/onnxruntime_c_api.h>
+#else
+#error "onnxruntime_c_api.h not found"
+#endif
 
 #include "dead_code_elimination.h"
 
diff --git a/tools/pnnx/src/pass_onnx/shape_inference.cpp b/tools/pnnx/src/pass_onnx/shape_inference.cpp
index 99dc652389d8..23986a7a7d2d 100644
--- a/tools/pnnx/src/pass_onnx/shape_inference.cpp
+++ b/tools/pnnx/src/pass_onnx/shape_inference.cpp
@@ -8,7 +8,15 @@
 #include <string>
 #include <vector>
 
+#if __has_include(<onnxruntime_c_api.h>)
 #include <onnxruntime_c_api.h>
+#elif __has_include(<onnxruntime/onnxruntime_c_api.h>)
+#include <onnxruntime/onnxruntime_c_api.h>
+#elif __has_include(<onnxruntime/core/session/onnxruntime_c_api.h>)
+#include <onnxruntime/core/session/onnxruntime_c_api.h>
+#else
+#error "onnxruntime_c_api.h not found"
+#endif
 
 namespace pnnx {
 
diff --git a/tools/pnnx/src/pnnx b/tools/pnnx/src/pnnx
new file mode 120000
index 000000000000..909f9eae4b3f
--- /dev/null
+++ b/tools/pnnx/src/pnnx
@@ -0,0 +1 @@
+../build/src/pnnx
\ No newline at end of file
diff --git a/tools/pnnx/tests/onnx/test_torch_topk.onnx b/tools/pnnx/tests/onnx/test_torch_topk.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..e57e7e63ec365e26943043ad0202d1152ca55191
GIT binary patch
literal 3317
zcmc(h&u`l{6vrdStraE<3@$5L9|poh;nWZ$ONsl#Kv1AS3&g|PVoflhLCUEz<7k#C
zL8M#DMK3J|?9giuI}F%u|HRJQX@9`}mmT&b|B(D6Sk}6T0m9^y$oGAGA3ah<xDZ6r
z51rG}WlJ|}dad#IHS~KPDSdS5h3(NAF0`3+CF|m~QHPqP4eqxXm448n)GX%@{NU^A
zXXvu<L&;iKm0y)ybrTglJ8Ux{hvRjWxBIS02F*HJ<93&cz;+nkZV*blw#zBizkzi<
zBVM>cY{|GDcl8!Kv3s6u?xQb*P8)2XSNA#Z>yBp&K~G&4+F*-t<)|{fRz)L~xrHjn
z&9@4=GPBl6pT=Qf_j>G<G{_O}b!ktqQ0!U9WREzu=h2fMcckm{BN<00_eNp@q0hNF
z-iB8MOx(WS*NCMN^M*z`FGt2#uWvM~s5B!Y-WDnxKVSlH993fZu^LePn=<#ZkhasI
zGK_ceC*^_bF;X@TW5LD?mphD0<2yg>7~J7J$rkLa?+GJ-turK~Mi}ufCS7<AJD9ws
z2T@rgp350AcQoR1a^i;|yM+k+`V_e6F`+@v<t^7?afrihh^20$zW|?Th>w|4W2N1l
z(q;;fc^-sTVx-0ht-_#rD~nKdAwtyr#1OqZ8=^3Qh?>jADALHTMj$5-Y)`ORnzT7U
zvNUa`G*R=yc)B^qQ#9czI)kS_fTzKbrwl~9DNpq*Pu6*ON=nq2rx>nt@nn5so_;)=
zr_PWkYq>lP5}0Zq4w7|YAl;b*X)pn4uoxtal2fJ~+QDh{3sor=?hVhlrkYzB{<3DX
zspc1$t){v<GCzH<z(I*IDyj<JKsi-`whFCS-a|%O*;zk;lTl9`Q{lF`Eqfu;_j%Bh
zPXgxX9Vnz29X`4LE!}_k`0xOx*Z;@#04lyK;JB1H|Ip{`>Cd1m!dK81=qiqMU4s8L
zXzLfO?HABCz$J7Qv~7W&O=v2#u|9OM`kp^SlN--;Vsjv}SiXyD|CyObJvo)x^(3>8
z4i6x;&%a0MMJhq3FhOTY9$Kjf>kJ6;A0J1?%TD5@Fo)U_yqu=7>}H^eKeB0PJ~}TM
zPxZu8A;EL`o__dze6>bMRSP%|u_$Asi5D2HtnrQVkFtQl`$>&qt;eAcb*4L8Z1A57
z`U<Xcz@<R_21#oTil#<`sS(Wa6PN`W-Tp_9hcJ(g@l7kk_{0nc<|6aXV)O1|bA8(U
jM!AiwxleF5{LtoYMjf9ExE{M)3Y>fO`d%Kb6u15b!!HUU

literal 0
HcmV?d00001

diff --git a/tools/pnnx/tests/onnx/test_torch_topk.onnx.data b/tools/pnnx/tests/onnx/test_torch_topk.onnx.data
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tools/pnnx/tests/onnx/test_torch_topk.pnnx.bin b/tools/pnnx/tests/onnx/test_torch_topk.pnnx.bin
new file mode 100644
index 0000000000000000000000000000000000000000..aa99d4621ed08e4d5412634fb912b37433a365d8
GIT binary patch
literal 98
gcmWIWW@FP~ARPpFv#}%VVgzymyjj`)qX7m603eYP6951J

literal 0
HcmV?d00001

diff --git a/tools/pnnx/tests/onnx/test_torch_topk.pnnx.onnx b/tools/pnnx/tests/onnx/test_torch_topk.pnnx.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..83b5d3a0f7a0476395b71a8e3c1232fa127a2904
GIT binary patch
literal 882
zcmaiyO;5r=5QZ6AOC1oK4FOUQnjSe|w;z0X*JwO=@lHr=42>xbEeSt|f9PLuW~qd>
z5f78iyzlHj?{53>ZD3D;Yip~-budkm-S{O*w>hhlRasR%R=tvXFTl6kym=Ar(#^ti
zuGLA8)I?dfS|_6p>TEgS?LosQ1Q)><5C{lu<9(gJE?h_Fb<{?Exm#sJ*h6#F#n#ty
z^BHCNp}#^STW|8{L$yfY$thT#0S5(GB1{BK1<VSttbnx&xWRoLJU<1yXTUy#pAkHS
zxoAq4N)~9Zf>x-1?%YQRY<<1WT}M6n_-&i>2O~XHx~%C_rF%7frgXoo50Ek{Bd8lv
z8DYtY2lO?CK+8nY!KMjR8kH5_AV|^29vN0)T+n(+)sAoJMTCt*GJ7O+sZ6Th0gT)H
zhZv}v4bfmmP+>G_+F~PAM(YSa7_@zI+)hcRe<aa4=1iHbT1=f&%#X+1@!On)=4+Ni
N^Mq6rEp@Hh#1C`brwjl9

literal 0
HcmV?d00001

diff --git a/tools/pnnx/tests/onnx/test_torch_topk.pnnx.param b/tools/pnnx/tests/onnx/test_torch_topk.pnnx.param
new file mode 100644
index 000000000000..8335d975fe0d
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_torch_topk.pnnx.param
@@ -0,0 +1,17 @@
+7767517
+15 12
+pnnx.Input               in0                      0 1 0 #0=(1,3,16)f32
+pnnx.Input               in1                      0 1 1 #1=(1,5,9,11)f32
+pnnx.Input               in2                      0 1 2 #2=(14,8,5,9,10)f32
+pnnx.Expression          pnnx_expr_2              0 1 3 expr=2
+TopK                     TopK_0                   2 2 0 3 4 5 axis=1 largest=1 sorted=1 #0=(1,3,16)f32 #4=(1,2,16)f32 #5=(1,2,16)i64
+pnnx.Expression          pnnx_expr_1              0 1 6 expr=4
+TopK                     TopK_1                   2 2 1 6 7 8 axis=3 largest=0 sorted=1 #1=(1,5,9,11)f32 #7=(1,5,9,4)f32 #8=(1,5,9,4)i64
+pnnx.Expression          pnnx_expr_0              0 1 9 expr=3
+TopK                     TopK_2                   2 2 2 9 10 11 axis=0 largest=1 sorted=1 #2=(14,8,5,9,10)f32 #10=(3,8,5,9,10)f32 #11=(3,8,5,9,10)i64
+pnnx.Output              out0                     1 0 4 #4=(1,2,16)f32
+pnnx.Output              out1                     1 0 5 #5=(1,2,16)i64
+pnnx.Output              out2                     1 0 7 #7=(1,5,9,4)f32
+pnnx.Output              out3                     1 0 8 #8=(1,5,9,4)i64
+pnnx.Output              out4                     1 0 10 #10=(3,8,5,9,10)f32
+pnnx.Output              out5                     1 0 11 #11=(3,8,5,9,10)i64
diff --git a/tools/pnnx/tests/onnx/test_torch_topk.pnnxsim.onnx b/tools/pnnx/tests/onnx/test_torch_topk.pnnxsim.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..9fdafe6b89976390f2aa42652657176353dd6ddc
GIT binary patch
literal 2861
zcmc(h&2QT_7{((<ja9Y_3@!@_9}2=n;nWZ$ONqDEKrRJZATHJwYk~m{0z#1)N3%=`
zBHh|4dTCLhhh28sVTbm%>rT7v+<$6^zM|yBwj5*|T0{UP@!{8_mp_p-5H81K872O4
zb;Y%urqyi!x{m(T(Q?3>=*n|1q)n_xLz&&3>dI~uKcdZ=o<192p%oD&l67xrAGA_q
z6_rCT=`kU*{57O|qd=%kr-iO^Z@@(C`3!Hg38e!s;FMb5M_m@$GK%|c;z`D>^wsLJ
zBX1bW&JMa6_j{m$M=RodWci^d1U(K|;(<2C_5J#6Uv-nz3mw%DI&U<zKG)aLh;kSn
z2e;EfNt^8R;eZYWOT@N&K(>kRg&{o}a$g1!-<Rpv$?e=sEUdyX2OCgDK*NpOBa^r$
zaqgI;{~{M(zPi(Cpz557G%Qs3QOpFszF$x6r*6RUA6lWZke=VCGD)NOPJ0-HjMVJC
z)Uf?L;6CHh{wYfOHupKFX*UqrC=_-Kp|d4Sjj+=ws#hq72&!6EJg%7}3^*gsrbz-$
zj-mu?cM*YWM1h1M6DGU`yc_r|{lqK`=@=X6Z-tDLA>$}yTy@{fk#R0WW+_Ezg3K~x
zI^Rtr(^!ZMb-pk(znl+EqM)J9=f!4hk}Z?K96a<w!RF~P3qVZMvwB7kbuNvcZwvg4
zPx%?2!_QxeA2;JCBWFwT)5`gAFXTrpK@)ybnEn+%?ic3g?fLxlGk)C9%g>2|r}_WE
zaW4&=dj)V#PQf`@3=T$W7wIKleBAiP&?@Eo*{;<wN=w-_Ejt}UUm&|3V>y?f{idnS
z4`Wm@G<+SE3=PsUq)KfY*=K7v8^y5qS!!r1f*$weFk#jXkB9PU%zUd4OC?5oPaph9
zcOF04+lA(>|Ij>!H9ZipAxf-2jyQYvyK?5>b+iV~D!H>O@VpA?+9kVtuiUw~if(|r
zHE^^F$$&I*h%VRRVHIU3nc0~wfx}8(=BQaOpU!$#c2l);&xyN!n3Zp;@^WEb^uwBJ
zzcm$^l%|=;<aJuAPFY2-Q0M2bCkHZw45N&7=tP@{4qhO-)8cFG9}P7N{5?<<v+k25
z0+I*wIOC5Rx&_ApU|OJ2o1D>%#xuE-nH;o%L!)X02c|%>L0<gB-Y)zJC>PV+TlF|6
u!=k)M-drqiEtXre@@wrbaxeV+#N$0i{fG-VVg+0ZTzdKHzK)hEYo7plmS9`}

literal 0
HcmV?d00001

diff --git a/tools/pnnx/tests/onnx/test_torch_topk_pnnx.py b/tools/pnnx/tests/onnx/test_torch_topk_pnnx.py
new file mode 100644
index 000000000000..2b4e7ed5abae
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_torch_topk_pnnx.py
@@ -0,0 +1,109 @@
+import os
+import numpy as np
+import tempfile, zipfile
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+try:
+    import torchvision
+    import torchaudio
+except:
+    pass
+
+class TopK(nn.Module):
+    def __init__(self, axis=1, largest=1, sorted=1):
+        super(TopK, self).__init__()
+        self.axis = axis
+        self.largest = largest
+        self.sorted = sorted
+    def forward(self, x, k):
+        # Torch topk returns (values, indices)
+        return torch.topk(x, k.item() if hasattr(k, 'item') else k, dim=self.axis, largest=bool(self.largest), sorted=bool(self.sorted))
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+        self.TopK_0 = TopK(axis=1, largest=1, sorted=1)
+        self.TopK_1 = TopK(axis=3, largest=0, sorted=1)
+        self.TopK_2 = TopK(axis=0, largest=1, sorted=1)
+
+        archive = zipfile.ZipFile('test_torch_topk.pnnx.bin', 'r')
+        archive.close()
+
+    def load_pnnx_bin_as_parameter(self, archive, key, shape, dtype, requires_grad=True):
+        return nn.Parameter(self.load_pnnx_bin_as_tensor(archive, key, shape, dtype), requires_grad)
+
+    def load_pnnx_bin_as_tensor(self, archive, key, shape, dtype):
+        fd, tmppath = tempfile.mkstemp()
+        with os.fdopen(fd, 'wb') as tmpf, archive.open(key) as keyfile:
+            tmpf.write(keyfile.read())
+        m = np.memmap(tmppath, dtype=dtype, mode='r', shape=shape).copy()
+        os.remove(tmppath)
+        return torch.from_numpy(m)
+
+    def forward(self, v_0, v_1, v_2):
+        v_3 = 2
+        v_4, v_5 = self.TopK_0(v_0, v_3)
+        v_6 = 4
+        v_7, v_8 = self.TopK_1(v_1, v_6)
+        v_9 = 3
+        v_10, v_11 = self.TopK_2(v_2, v_9)
+        return v_4, v_5, v_7, v_8, v_10, v_11
+
+def export_torchscript():
+    net = Model()
+    net.float()
+    net.eval()
+
+    torch.manual_seed(0)
+    v_0 = torch.rand(1, 3, 16, dtype=torch.float)
+    v_1 = torch.rand(1, 5, 9, 11, dtype=torch.float)
+    v_2 = torch.rand(14, 8, 5, 9, 10, dtype=torch.float)
+
+    mod = torch.jit.trace(net, (v_0, v_1, v_2))
+    mod.save("test_torch_topk_pnnx.py.pt")
+
+def export_onnx():
+    net = Model()
+    net.float()
+    net.eval()
+
+    torch.manual_seed(0)
+    v_0 = torch.rand(1, 3, 16, dtype=torch.float)
+    v_1 = torch.rand(1, 5, 9, 11, dtype=torch.float)
+    v_2 = torch.rand(14, 8, 5, 9, 10, dtype=torch.float)
+
+    torch.onnx.export(net, (v_0, v_1, v_2), "test_torch_topk_pnnx.py.onnx", export_params=True, operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK, opset_version=13, input_names=['in0', 'in1', 'in2'], output_names=['out0', 'out1', 'out2', 'out3', 'out4', 'out5'])
+
+def export_pnnx():
+    net = Model()
+    net.float()
+    net.eval()
+
+    torch.manual_seed(0)
+    v_0 = torch.rand(1, 3, 16, dtype=torch.float)
+    v_1 = torch.rand(1, 5, 9, 11, dtype=torch.float)
+    v_2 = torch.rand(14, 8, 5, 9, 10, dtype=torch.float)
+
+    import pnnx
+    pnnx.export(net, "test_torch_topk_pnnx.py.pt", (v_0, v_1, v_2))
+
+def export_ncnn():
+    export_pnnx()
+
+@torch.no_grad()
+def test_inference():
+    net = Model()
+    net.float()
+    net.eval()
+
+    torch.manual_seed(0)
+    v_0 = torch.rand(1, 3, 16, dtype=torch.float)
+    v_1 = torch.rand(1, 5, 9, 11, dtype=torch.float)
+    v_2 = torch.rand(14, 8, 5, 9, 10, dtype=torch.float)
+
+    return net(v_0, v_1, v_2)
+
+if __name__ == "__main__":
+    print(test_inference())

From 13cf18c4f055dbae88e103a049c8e911aea98af4 Mon Sep 17 00:00:00 2001
From: vlordier <vincent.lordier@sanofi.com>
Date: Fri, 27 Feb 2026 08:38:06 +0100
Subject: [PATCH 04/29] Fix pnnx pass_ncnn TopK pattern matching and parameter
 capture
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Fix IR pattern syntax to capture parameters by explicit name (axis=%axis largest=%largest sorted=%sorted) instead of the %*=%* wildcard
- Look up captured parameters as 'axis', 'largest' and 'sorted' instead of the incorrect 'op_0.axis', 'op_0.largest' and 'op_0.sorted' keys, so the lookups match the captured names
- TopK pass now properly fires during ONNX→pnnx→ncnn conversion
- All TopK parameters (axis, largest, sorted) correctly captured and set in ncnn layers
- End-to-end test confirms ONNX→pnnx→ncnn conversion with TopK working correctly
---
 tools/pnnx/src/pass_ncnn/TopK.cpp             | 16 ++++----
 .../pnnx/tests/onnx/test_torch_topk.ncnn.bin  |  0
 .../tests/onnx/test_torch_topk.ncnn.param     | 11 +++++
 tools/pnnx/tests/onnx/test_torch_topk_ncnn.py | 40 +++++++++++++++++++
 4 files changed, 59 insertions(+), 8 deletions(-)
 create mode 100644 tools/pnnx/tests/onnx/test_torch_topk.ncnn.bin
 create mode 100644 tools/pnnx/tests/onnx/test_torch_topk.ncnn.param
 create mode 100644 tools/pnnx/tests/onnx/test_torch_topk_ncnn.py

diff --git a/tools/pnnx/src/pass_ncnn/TopK.cpp b/tools/pnnx/src/pass_ncnn/TopK.cpp
index 515790e38518..ed226605ad8c 100644
--- a/tools/pnnx/src/pass_ncnn/TopK.cpp
+++ b/tools/pnnx/src/pass_ncnn/TopK.cpp
@@ -26,7 +26,7 @@ class TopK : public GraphRewriterPass
 4 3
 pnnx.Input              input_0     0 1 input
 pnnx.Input              input_1     0 1 k
-TopK                    op_0        2 2 input k values indices %*=%*
+TopK                    op_0        2 2 input k values indices axis=%axis largest=%largest sorted=%sorted
 pnnx.Output             output      2 0 values indices
 )PNNXIR";
     }
@@ -44,16 +44,16 @@ pnnx.Output             output      2 0 values indices
     void write(Operator* op, const std::map<std::string, Parameter>& captured_params) const
     {
         int axis = -1;
-        if (captured_params.find("op_0.axis") != captured_params.end())
-            axis = captured_params.at("op_0.axis").i;
+        if (captured_params.find("axis") != captured_params.end())
+            axis = captured_params.at("axis").i;
 
         int largest = 1;
-        if (captured_params.find("op_0.largest") != captured_params.end())
-            largest = parameter_to_bool(captured_params.at("op_0.largest"), 1);
+        if (captured_params.find("largest") != captured_params.end())
+            largest = parameter_to_bool(captured_params.at("largest"), 1);
 
         int sorted = 1;
-        if (captured_params.find("op_0.sorted") != captured_params.end())
-            sorted = parameter_to_bool(captured_params.at("op_0.sorted"), 1);
+        if (captured_params.find("sorted") != captured_params.end())
+            sorted = parameter_to_bool(captured_params.at("sorted"), 1);
 
         const int batch_index = op->inputs[0]->params["__batch_index"].i;
 
@@ -84,7 +84,7 @@ class TopK_0 : public TopK
 4 2
 pnnx.Input              input_0     0 1 input
 pnnx.Input              input_1     0 1 k
-TopK                    op_0        2 1 input k values %*=%*
+TopK                    op_0        2 1 input k values axis=%axis largest=%largest sorted=%sorted
 pnnx.Output             output      1 0 values
 )PNNXIR";
     }
diff --git a/tools/pnnx/tests/onnx/test_torch_topk.ncnn.bin b/tools/pnnx/tests/onnx/test_torch_topk.ncnn.bin
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tools/pnnx/tests/onnx/test_torch_topk.ncnn.param b/tools/pnnx/tests/onnx/test_torch_topk.ncnn.param
new file mode 100644
index 000000000000..f15762f83651
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_torch_topk.ncnn.param
@@ -0,0 +1,11 @@
+7767517
+9 12
+Input                    in0                      0 1 in0
+Input                    in1                      0 1 in1
+Input                    in2                      0 1 in2
+pnnx.Expression          pnnx_expr_2              0 1 3
+TopK                     topk_0                   2 2 in0 3 out0 out1 0=1 1=1 2=1
+pnnx.Expression          pnnx_expr_1              0 1 6
+TopK                     topk_1                   2 2 in1 6 out2 out3 0=3 1=0 2=1
+pnnx.Expression          pnnx_expr_0              0 1 9
+TopK                     topk_2                   2 2 in2 9 out4 out5 0=0 1=1 2=1
diff --git a/tools/pnnx/tests/onnx/test_torch_topk_ncnn.py b/tools/pnnx/tests/onnx/test_torch_topk_ncnn.py
new file mode 100644
index 000000000000..bcb84b7afc45
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_torch_topk_ncnn.py
@@ -0,0 +1,40 @@
+import numpy as np
+import ncnn
+import torch
+
+def test_inference():
+    torch.manual_seed(0)
+    in0 = torch.rand(1, 3, 16, dtype=torch.float)
+    in1 = torch.rand(1, 5, 9, 11, dtype=torch.float)
+    in2 = torch.rand(14, 8, 5, 9, 10, dtype=torch.float)
+    out = []
+
+    with ncnn.Net() as net:
+        net.load_param("test_torch_topk.ncnn.param")
+        net.load_model("test_torch_topk.ncnn.bin")
+
+        with net.create_extractor() as ex:
+            ex.input("in0", ncnn.Mat(in0.numpy()).clone())
+            ex.input("in1", ncnn.Mat(in1.numpy()).clone())
+            ex.input("in2", ncnn.Mat(in2.numpy()).clone())
+
+            _, out0 = ex.extract("out0")
+            out.append(torch.from_numpy(np.array(out0)))
+            _, out1 = ex.extract("out1")
+            out.append(torch.from_numpy(np.array(out1)))
+            _, out2 = ex.extract("out2")
+            out.append(torch.from_numpy(np.array(out2)))
+            _, out3 = ex.extract("out3")
+            out.append(torch.from_numpy(np.array(out3)))
+            _, out4 = ex.extract("out4")
+            out.append(torch.from_numpy(np.array(out4)))
+            _, out5 = ex.extract("out5")
+            out.append(torch.from_numpy(np.array(out5)))
+
+    if len(out) == 1:
+        return out[0]
+    else:
+        return tuple(out)
+
+if __name__ == "__main__":
+    print(test_inference())

From e95770e0bb0fcfef0ca74693d60af18054da3b75 Mon Sep 17 00:00:00 2001
From: vlordier <vincent.lordier@sanofi.com>
Date: Fri, 27 Feb 2026 14:43:11 +0100
Subject: [PATCH 05/29] topk: align with codebase style and expand ONNX
 coverage

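use a C++03-style comparator functor instead of lambdas for topk, and keep ordering deterministic with NaN/Inf inputs (NaN sorts last for both largest and smallest; ties are broken by index)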

remove constructor param initialization that duplicated the load_param defaults

fix alphabetical order in tests/CMakeLists.txt (Tile before TopK)

expand torch_topk onnx tests (k=0/k=1, negative dim, sorted=false cases)

drop generated topk onnx/pnnx/ncnn sidecar artifacts from repo
---
 src/layer/topk.cpp                            | 115 ++++++------
 tests/CMakeLists.txt                          |   2 +-
 tests/test_topk.cpp                           | 174 +++++++++++++++++-
 .../tests/onnx/test_torch_topk.ncnn.param     |  11 --
 tools/pnnx/tests/onnx/test_torch_topk.onnx    | Bin 3317 -> 0 bytes
 .../pnnx/tests/onnx/test_torch_topk.pnnx.onnx | Bin 882 -> 0 bytes
 .../tests/onnx/test_torch_topk.pnnx.param     |  17 --
 .../tests/onnx/test_torch_topk.pnnxsim.onnx   | Bin 2861 -> 0 bytes
 tools/pnnx/tests/onnx/test_torch_topk.py      |  50 ++++-
 tools/pnnx/tests/onnx/test_torch_topk_ncnn.py |  40 ----
 tools/pnnx/tests/onnx/test_torch_topk_pnnx.py | 109 -----------
 11 files changed, 281 insertions(+), 237 deletions(-)
 delete mode 100644 tools/pnnx/tests/onnx/test_torch_topk.ncnn.param
 delete mode 100644 tools/pnnx/tests/onnx/test_torch_topk.onnx
 delete mode 100644 tools/pnnx/tests/onnx/test_torch_topk.pnnx.onnx
 delete mode 100644 tools/pnnx/tests/onnx/test_torch_topk.pnnx.param
 delete mode 100644 tools/pnnx/tests/onnx/test_torch_topk.pnnxsim.onnx
 delete mode 100644 tools/pnnx/tests/onnx/test_torch_topk_ncnn.py
 delete mode 100644 tools/pnnx/tests/onnx/test_torch_topk_pnnx.py

diff --git a/src/layer/topk.cpp b/src/layer/topk.cpp
index c65dbc9689ba..72b4df40813d 100644
--- a/src/layer/topk.cpp
+++ b/src/layer/topk.cpp
@@ -4,19 +4,58 @@
 #include "topk.h"
 
 #include <algorithm>
+#include <stdint.h>
+#include <string.h>
 #include <vector>
 
 namespace ncnn {
 
+static inline bool topk_isnan(float v)
+{
+    uint32_t u;
+    memcpy(&u, &v, sizeof(uint32_t));
+    return (u & 0x7fffffff) > 0x7f800000;
+}
+
+static inline bool topk_pair_comp(const std::pair<float, int>& a, const std::pair<float, int>& b, bool largest)
+{
+    const bool a_nan = topk_isnan(a.first);
+    const bool b_nan = topk_isnan(b.first);
+
+    // Keep NaN at the end for both largest/smallest to ensure deterministic ordering.
+    if (a_nan || b_nan)
+    {
+        if (a_nan != b_nan)
+            return !a_nan && b_nan;
+
+        return a.second < b.second;
+    }
+
+    if (a.first != b.first)
+        return largest ? (a.first > b.first) : (a.first < b.first);
+
+    return a.second < b.second;
+}
+
+struct topk_pair_comparator
+{
+    topk_pair_comparator(bool _largest)
+        : largest(_largest)
+    {
+    }
+
+    bool operator()(const std::pair<float, int>& a, const std::pair<float, int>& b) const
+    {
+        return topk_pair_comp(a, b, largest);
+    }
+
+    bool largest;
+};
+
 TopK::TopK()
 {
     one_blob_only = false;
     support_inplace = false;
-
-    axis = -1;
-    largest = 1;
-    sorted = 1;
-    k = 1;
 }
 
 int TopK::load_param(const ParamDict& pd)
@@ -49,10 +88,10 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
     if (bottom_blob.dims < 1 || bottom_blob.dims > 4)
         return -100;
 
-    int dims = bottom_blob.dims;
+    const int dims = bottom_blob.dims;
 
-    int axis_p = axis < 0 ? axis + dims : axis;
-    if (axis_p < 0 || axis_p >= dims)
+    const int positive_axis = axis < 0 ? axis + dims : axis;
+    if (positive_axis < 0 || positive_axis >= dims)
         return -1;
 
     int shape[4] = {1, 1, 1, 1};
@@ -61,7 +100,7 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
     if (dims >= 3) shape[2] = bottom_blob.dims == 3 ? bottom_blob.c : bottom_blob.d;
     if (dims >= 4) shape[3] = bottom_blob.c;
 
-    int axis_size = shape[axis_p];
+    const int axis_size = shape[positive_axis];
     if (axis_size <= 0)
         return -1;
 
@@ -71,7 +110,7 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
         _k = axis_size;
 
     int out_shape[4] = {shape[0], shape[1], shape[2], shape[3]};
-    out_shape[axis_p] = _k;
+    out_shape[positive_axis] = _k;
 
     Mat values;
     if (dims == 1) values.create(out_shape[0], 4u, opt.blob_allocator);
@@ -97,23 +136,23 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
     float* outidxptr = indices;
 
     int inner = 1;
-    for (int i = 0; i < axis_p; i++)
+    for (int i = 0; i < positive_axis; i++)
     {
         inner *= shape[i];
     }
 
     int outer = 1;
-    for (int i = axis_p + 1; i < dims; i++)
+    for (int i = positive_axis + 1; i < dims; i++)
     {
         outer *= shape[i];
     }
 
-    const bool largest_p = largest != 0;
-    const bool sorted_p = sorted != 0;
+    const bool largest_flag = largest != 0;
+    const bool sorted_flag = sorted != 0;
 
     const int total_lines = outer * inner;
 
-#pragma omp parallel for num_threads(opt.num_threads)
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int line = 0; line < total_lines; line++)
     {
         int outer_i = line / inner;
@@ -131,49 +170,19 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
             vec[j].second = j;
         }
 
-        if (largest_p)
+        topk_pair_comparator comp(largest_flag);
+
+        if (_k < axis_size)
         {
-            auto comp = [](const std::pair<float, int>& a, const std::pair<float, int>& b)
-            {
-                if (a.first != b.first)
-                    return a.first > b.first;
-                return a.second < b.second;
-            };
-
-            if (_k < axis_size)
-            {
-                if (sorted_p)
-                    std::partial_sort(vec.begin(), vec.begin() + _k, vec.end(), comp);
-                else
-                    std::nth_element(vec.begin(), vec.begin() + _k, vec.end(), comp);
-            }
+            if (sorted_flag)
+                std::partial_sort(vec.begin(), vec.begin() + _k, vec.end(), comp);
             else
-            {
-                if (sorted_p)
-                    std::sort(vec.begin(), vec.end(), comp);
-            }
+                std::nth_element(vec.begin(), vec.begin() + _k, vec.end(), comp);
         }
         else
         {
-            auto comp = [](const std::pair<float, int>& a, const std::pair<float, int>& b)
-            {
-                if (a.first != b.first)
-                    return a.first < b.first;
-                return a.second < b.second;
-            };
-
-            if (_k < axis_size)
-            {
-                if (sorted_p)
-                    std::partial_sort(vec.begin(), vec.begin() + _k, vec.end(), comp);
-                else
-                    std::nth_element(vec.begin(), vec.begin() + _k, vec.end(), comp);
-            }
-            else
-            {
-                if (sorted_p)
-                    std::sort(vec.begin(), vec.end(), comp);
-            }
+            if (sorted_flag)
+                std::sort(vec.begin(), vec.end(), comp);
         }
 
         for (int j = 0; j < _k; j++)
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 4f40f8279428..35df0d37a967 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -166,8 +166,8 @@ ncnn_add_layer_test(Spectrogram)
 ncnn_add_layer_test(Squeeze)
 ncnn_add_layer_test(Swish)
 ncnn_add_layer_test(TanH)
-ncnn_add_layer_test(TopK)
 ncnn_add_layer_test(Tile)
+ncnn_add_layer_test(TopK)
 ncnn_add_layer_test(UnaryOp)
 ncnn_add_layer_test(Unfold)
 ncnn_add_layer_test(Yolov3DetectionOutput)
diff --git a/tests/test_topk.cpp b/tests/test_topk.cpp
index 7b7fe82690ba..b35be1574b18 100644
--- a/tests/test_topk.cpp
+++ b/tests/test_topk.cpp
@@ -3,6 +3,52 @@
 
 #include "testutil.h"
 
+#include <limits>
+
+static int test_topk_cpu_forward(const ncnn::Mat& a, int axis, int k, int largest, int sorted, ncnn::Mat& values, ncnn::Mat& indices)
+{
+    ncnn::ParamDict pd;
+    pd.set(0, axis);
+    pd.set(1, largest);
+    pd.set(2, sorted);
+    pd.set(3, k);
+
+    std::vector<ncnn::Mat> weights(0);
+
+    ncnn::Option opt;
+    opt.num_threads = 1;
+    opt.use_vulkan_compute = false;
+    opt.use_packing_layout = false;
+
+    ncnn::Layer* op = ncnn::create_layer_cpu("TopK");
+    if (!op)
+        return -1;
+
+    op->load_param(pd);
+
+    ncnn::ModelBinFromMatArray mb(weights.data());
+    op->load_model(mb);
+
+    op->create_pipeline(opt);
+
+    std::vector<ncnn::Mat> bottom_blobs(1);
+    bottom_blobs[0] = a;
+
+    std::vector<ncnn::Mat> top_blobs(2);
+    int ret = op->forward(bottom_blobs, top_blobs, opt);
+
+    op->destroy_pipeline(opt);
+    delete op;
+
+    if (ret != 0)
+        return ret;
+
+    values = top_blobs[0];
+    indices = top_blobs[1];
+
+    return 0;
+}
+
 static int test_topk(const ncnn::Mat& a, int axis, int k, int largest, int sorted)
 {
     ncnn::ParamDict pd;
@@ -76,6 +122,130 @@ static int test_topk_3()
            || test_topk(a, -4, 2, 1, 1);
 }
 
+static int test_topk_inf_order()
+{
+    ncnn::Mat a(6);
+    float* ptr = a;
+    ptr[0] = 1.f;
+    ptr[1] = std::numeric_limits<float>::infinity();
+    ptr[2] = -2.f;
+    ptr[3] = -std::numeric_limits<float>::infinity();
+    ptr[4] = 0.5f;
+    ptr[5] = 3.f;
+
+    ncnn::Mat values;
+    ncnn::Mat indices;
+
+    int ret = test_topk_cpu_forward(a, 0, 2, 1, 1, values, indices);
+    if (ret != 0)
+    {
+        fprintf(stderr, "test_topk_inf_order largest failed ret=%d\n", ret);
+        return -1;
+    }
+
+    const float* vptr = values;
+    const float* iptr = indices;
+    if (values.w != 2 || indices.w != 2 || vptr[0] != std::numeric_limits<float>::infinity() || vptr[1] != 3.f || (int)iptr[0] != 1 || (int)iptr[1] != 5)
+    {
+        fprintf(stderr, "test_topk_inf_order largest result mismatch\n");
+        return -1;
+    }
+
+    ret = test_topk_cpu_forward(a, 0, 2, 0, 1, values, indices);
+    if (ret != 0)
+    {
+        fprintf(stderr, "test_topk_inf_order smallest failed ret=%d\n", ret);
+        return -1;
+    }
+
+    vptr = values;
+    iptr = indices;
+    if (values.w != 2 || indices.w != 2 || vptr[0] != -std::numeric_limits<float>::infinity() || vptr[1] != -2.f || (int)iptr[0] != 3 || (int)iptr[1] != 2)
+    {
+        fprintf(stderr, "test_topk_inf_order smallest result mismatch\n");
+        return -1;
+    }
+
+    return 0;
+}
+
+static int test_topk_nan_robust()
+{
+    ncnn::Mat a(4);
+    float* ptr = a;
+    ptr[0] = 1.f;
+    ptr[1] = std::numeric_limits<float>::quiet_NaN();
+    ptr[2] = 2.f;
+    ptr[3] = -1.f;
+
+    ncnn::Mat values;
+    ncnn::Mat indices;
+
+    int ret = test_topk_cpu_forward(a, 0, 2, 1, 1, values, indices);
+    if (ret != 0)
+    {
+        fprintf(stderr, "test_topk_nan_robust sorted failed ret=%d\n", ret);
+        return -1;
+    }
+
+    if (values.w != 2 || indices.w != 2)
+    {
+        fprintf(stderr, "test_topk_nan_robust sorted shape mismatch\n");
+        return -1;
+    }
+
+    const float* vptr = values;
+    const float* iptr = indices;
+    if (vptr[0] != 2.f || vptr[1] != 1.f || (int)iptr[0] != 2 || (int)iptr[1] != 0)
+    {
+        fprintf(stderr, "test_topk_nan_robust sorted largest mismatch\n");
+        return -1;
+    }
+
+    ret = test_topk_cpu_forward(a, 0, 2, 0, 1, values, indices);
+    if (ret != 0)
+    {
+        fprintf(stderr, "test_topk_nan_robust sorted smallest failed ret=%d\n", ret);
+        return -1;
+    }
+
+    if (values.w != 2 || indices.w != 2)
+    {
+        fprintf(stderr, "test_topk_nan_robust sorted smallest shape mismatch\n");
+        return -1;
+    }
+
+    vptr = values;
+    iptr = indices;
+    if (vptr[0] != -1.f || vptr[1] != 1.f || (int)iptr[0] != 3 || (int)iptr[1] != 0)
+    {
+        fprintf(stderr, "test_topk_nan_robust sorted smallest mismatch\n");
+        return -1;
+    }
+
+    ret = test_topk_cpu_forward(a, 0, 2, 1, 0, values, indices);
+    if (ret != 0)
+    {
+        fprintf(stderr, "test_topk_nan_robust unsorted failed ret=%d\n", ret);
+        return -1;
+    }
+
+    if (values.w != 2 || indices.w != 2)
+    {
+        fprintf(stderr, "test_topk_nan_robust unsorted shape mismatch\n");
+        return -1;
+    }
+
+    iptr = indices;
+    if ((int)iptr[0] < 0 || (int)iptr[0] >= 4 || (int)iptr[1] < 0 || (int)iptr[1] >= 4)
+    {
+        fprintf(stderr, "test_topk_nan_robust unsorted invalid indices\n");
+        return -1;
+    }
+
+    return 0;
+}
+
 int main()
 {
     SRAND(7767517);
@@ -84,5 +254,7 @@ int main()
            || test_topk_0()
            || test_topk_1()
            || test_topk_2()
-           || test_topk_3();
+           || test_topk_3()
+           || test_topk_inf_order()
+           || test_topk_nan_robust();
 }
diff --git a/tools/pnnx/tests/onnx/test_torch_topk.ncnn.param b/tools/pnnx/tests/onnx/test_torch_topk.ncnn.param
deleted file mode 100644
index f15762f83651..000000000000
--- a/tools/pnnx/tests/onnx/test_torch_topk.ncnn.param
+++ /dev/null
@@ -1,11 +0,0 @@
-7767517
-9 12
-Input                    in0                      0 1 in0
-Input                    in1                      0 1 in1
-Input                    in2                      0 1 in2
-pnnx.Expression          pnnx_expr_2              0 1 3
-TopK                     topk_0                   2 2 in0 3 out0 out1 0=1 1=1 2=1
-pnnx.Expression          pnnx_expr_1              0 1 6
-TopK                     topk_1                   2 2 in1 6 out2 out3 0=3 1=0 2=1
-pnnx.Expression          pnnx_expr_0              0 1 9
-TopK                     topk_2                   2 2 in2 9 out4 out5 0=0 1=1 2=1
diff --git a/tools/pnnx/tests/onnx/test_torch_topk.onnx b/tools/pnnx/tests/onnx/test_torch_topk.onnx
deleted file mode 100644
index e57e7e63ec365e26943043ad0202d1152ca55191..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 3317
zcmc(h&u`l{6vrdStraE<3@$5L9|poh;nWZ$ONsl#Kv1AS3&g|PVoflhLCUEz<7k#C
zL8M#DMK3J|?9giuI}F%u|HRJQX@9`}mmT&b|B(D6Sk}6T0m9^y$oGAGA3ah<xDZ6r
z51rG}WlJ|}dad#IHS~KPDSdS5h3(NAF0`3+CF|m~QHPqP4eqxXm448n)GX%@{NU^A
zXXvu<L&;iKm0y)ybrTglJ8Ux{hvRjWxBIS02F*HJ<93&cz;+nkZV*blw#zBizkzi<
zBVM>cY{|GDcl8!Kv3s6u?xQb*P8)2XSNA#Z>yBp&K~G&4+F*-t<)|{fRz)L~xrHjn
z&9@4=GPBl6pT=Qf_j>G<G{_O}b!ktqQ0!U9WREzu=h2fMcckm{BN<00_eNp@q0hNF
z-iB8MOx(WS*NCMN^M*z`FGt2#uWvM~s5B!Y-WDnxKVSlH993fZu^LePn=<#ZkhasI
zGK_ceC*^_bF;X@TW5LD?mphD0<2yg>7~J7J$rkLa?+GJ-turK~Mi}ufCS7<AJD9ws
z2T@rgp350AcQoR1a^i;|yM+k+`V_e6F`+@v<t^7?afrihh^20$zW|?Th>w|4W2N1l
z(q;;fc^-sTVx-0ht-_#rD~nKdAwtyr#1OqZ8=^3Qh?>jADALHTMj$5-Y)`ORnzT7U
zvNUa`G*R=yc)B^qQ#9czI)kS_fTzKbrwl~9DNpq*Pu6*ON=nq2rx>nt@nn5so_;)=
zr_PWkYq>lP5}0Zq4w7|YAl;b*X)pn4uoxtal2fJ~+QDh{3sor=?hVhlrkYzB{<3DX
zspc1$t){v<GCzH<z(I*IDyj<JKsi-`whFCS-a|%O*;zk;lTl9`Q{lF`Eqfu;_j%Bh
zPXgxX9Vnz29X`4LE!}_k`0xOx*Z;@#04lyK;JB1H|Ip{`>Cd1m!dK81=qiqMU4s8L
zXzLfO?HABCz$J7Qv~7W&O=v2#u|9OM`kp^SlN--;Vsjv}SiXyD|CyObJvo)x^(3>8
z4i6x;&%a0MMJhq3FhOTY9$Kjf>kJ6;A0J1?%TD5@Fo)U_yqu=7>}H^eKeB0PJ~}TM
zPxZu8A;EL`o__dze6>bMRSP%|u_$Asi5D2HtnrQVkFtQl`$>&qt;eAcb*4L8Z1A57
z`U<Xcz@<R_21#oTil#<`sS(Wa6PN`W-Tp_9hcJ(g@l7kk_{0nc<|6aXV)O1|bA8(U
jM!AiwxleF5{LtoYMjf9ExE{M)3Y>fO`d%Kb6u15b!!HUU

diff --git a/tools/pnnx/tests/onnx/test_torch_topk.pnnx.onnx b/tools/pnnx/tests/onnx/test_torch_topk.pnnx.onnx
deleted file mode 100644
index 83b5d3a0f7a0476395b71a8e3c1232fa127a2904..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 882
zcmaiyO;5r=5QZ6AOC1oK4FOUQnjSe|w;z0X*JwO=@lHr=42>xbEeSt|f9PLuW~qd>
z5f78iyzlHj?{53>ZD3D;Yip~-budkm-S{O*w>hhlRasR%R=tvXFTl6kym=Ar(#^ti
zuGLA8)I?dfS|_6p>TEgS?LosQ1Q)><5C{lu<9(gJE?h_Fb<{?Exm#sJ*h6#F#n#ty
z^BHCNp}#^STW|8{L$yfY$thT#0S5(GB1{BK1<VSttbnx&xWRoLJU<1yXTUy#pAkHS
zxoAq4N)~9Zf>x-1?%YQRY<<1WT}M6n_-&i>2O~XHx~%C_rF%7frgXoo50Ek{Bd8lv
z8DYtY2lO?CK+8nY!KMjR8kH5_AV|^29vN0)T+n(+)sAoJMTCt*GJ7O+sZ6Th0gT)H
zhZv}v4bfmmP+>G_+F~PAM(YSa7_@zI+)hcRe<aa4=1iHbT1=f&%#X+1@!On)=4+Ni
N^Mq6rEp@Hh#1C`brwjl9

diff --git a/tools/pnnx/tests/onnx/test_torch_topk.pnnx.param b/tools/pnnx/tests/onnx/test_torch_topk.pnnx.param
deleted file mode 100644
index 8335d975fe0d..000000000000
--- a/tools/pnnx/tests/onnx/test_torch_topk.pnnx.param
+++ /dev/null
@@ -1,17 +0,0 @@
-7767517
-15 12
-pnnx.Input               in0                      0 1 0 #0=(1,3,16)f32
-pnnx.Input               in1                      0 1 1 #1=(1,5,9,11)f32
-pnnx.Input               in2                      0 1 2 #2=(14,8,5,9,10)f32
-pnnx.Expression          pnnx_expr_2              0 1 3 expr=2
-TopK                     TopK_0                   2 2 0 3 4 5 axis=1 largest=1 sorted=1 #0=(1,3,16)f32 #4=(1,2,16)f32 #5=(1,2,16)i64
-pnnx.Expression          pnnx_expr_1              0 1 6 expr=4
-TopK                     TopK_1                   2 2 1 6 7 8 axis=3 largest=0 sorted=1 #1=(1,5,9,11)f32 #7=(1,5,9,4)f32 #8=(1,5,9,4)i64
-pnnx.Expression          pnnx_expr_0              0 1 9 expr=3
-TopK                     TopK_2                   2 2 2 9 10 11 axis=0 largest=1 sorted=1 #2=(14,8,5,9,10)f32 #10=(3,8,5,9,10)f32 #11=(3,8,5,9,10)i64
-pnnx.Output              out0                     1 0 4 #4=(1,2,16)f32
-pnnx.Output              out1                     1 0 5 #5=(1,2,16)i64
-pnnx.Output              out2                     1 0 7 #7=(1,5,9,4)f32
-pnnx.Output              out3                     1 0 8 #8=(1,5,9,4)i64
-pnnx.Output              out4                     1 0 10 #10=(3,8,5,9,10)f32
-pnnx.Output              out5                     1 0 11 #11=(3,8,5,9,10)i64
diff --git a/tools/pnnx/tests/onnx/test_torch_topk.pnnxsim.onnx b/tools/pnnx/tests/onnx/test_torch_topk.pnnxsim.onnx
deleted file mode 100644
index 9fdafe6b89976390f2aa42652657176353dd6ddc..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 2861
zcmc(h&2QT_7{((<ja9Y_3@!@_9}2=n;nWZ$ONqDEKrRJZATHJwYk~m{0z#1)N3%=`
zBHh|4dTCLhhh28sVTbm%>rT7v+<$6^zM|yBwj5*|T0{UP@!{8_mp_p-5H81K872O4
zb;Y%urqyi!x{m(T(Q?3>=*n|1q)n_xLz&&3>dI~uKcdZ=o<192p%oD&l67xrAGA_q
z6_rCT=`kU*{57O|qd=%kr-iO^Z@@(C`3!Hg38e!s;FMb5M_m@$GK%|c;z`D>^wsLJ
zBX1bW&JMa6_j{m$M=RodWci^d1U(K|;(<2C_5J#6Uv-nz3mw%DI&U<zKG)aLh;kSn
z2e;EfNt^8R;eZYWOT@N&K(>kRg&{o}a$g1!-<Rpv$?e=sEUdyX2OCgDK*NpOBa^r$
zaqgI;{~{M(zPi(Cpz557G%Qs3QOpFszF$x6r*6RUA6lWZke=VCGD)NOPJ0-HjMVJC
z)Uf?L;6CHh{wYfOHupKFX*UqrC=_-Kp|d4Sjj+=ws#hq72&!6EJg%7}3^*gsrbz-$
zj-mu?cM*YWM1h1M6DGU`yc_r|{lqK`=@=X6Z-tDLA>$}yTy@{fk#R0WW+_Ezg3K~x
zI^Rtr(^!ZMb-pk(znl+EqM)J9=f!4hk}Z?K96a<w!RF~P3qVZMvwB7kbuNvcZwvg4
zPx%?2!_QxeA2;JCBWFwT)5`gAFXTrpK@)ybnEn+%?ic3g?fLxlGk)C9%g>2|r}_WE
zaW4&=dj)V#PQf`@3=T$W7wIKleBAiP&?@Eo*{;<wN=w-_Ejt}UUm&|3V>y?f{idnS
z4`Wm@G<+SE3=PsUq)KfY*=K7v8^y5qS!!r1f*$weFk#jXkB9PU%zUd4OC?5oPaph9
zcOF04+lA(>|Ij>!H9ZipAxf-2jyQYvyK?5>b+iV~D!H>O@VpA?+9kVtuiUw~if(|r
zHE^^F$$&I*h%VRRVHIU3nc0~wfx}8(=BQaOpU!$#c2l);&xyN!n3Zp;@^WEb^uwBJ
zzcm$^l%|=;<aJuAPFY2-Q0M2bCkHZw45N&7=tP@{4qhO-)8cFG9}P7N{5?<<v+k25
z0+I*wIOC5Rx&_ApU|OJ2o1D>%#xuE-nH;o%L!)X02c|%>L0<gB-Y)zJC>PV+TlF|6
u!=k)M-drqiEtXre@@wrbaxeV+#N$0i{fG-VVg+0ZTzdKHzK)hEYo7plmS9`}

diff --git a/tools/pnnx/tests/onnx/test_torch_topk.py b/tools/pnnx/tests/onnx/test_torch_topk.py
index fe3d15c99b84..d62db5990003 100644
--- a/tools/pnnx/tests/onnx/test_torch_topk.py
+++ b/tools/pnnx/tests/onnx/test_torch_topk.py
@@ -9,17 +9,55 @@ class Model(nn.Module):
     def __init__(self):
         super(Model, self).__init__()
 
-    def forward(self, x, y, z):
+    def forward(self, x, y, z, u, v):
         x_values, x_indices = torch.topk(
             x, 2, dim=1, largest=True, sorted=True
         )
+        x_k1_values, x_k1_indices = torch.topk(
+            x, 1, dim=1, largest=True, sorted=True
+        )
+        x_k0_values, x_k0_indices = torch.topk(
+            x, 0, dim=1, largest=True, sorted=True
+        )
+        x_unsorted_values, x_unsorted_indices = torch.topk(
+            x, 2, dim=1, largest=True, sorted=False
+        )
         y_values, y_indices = torch.topk(
             y, 4, dim=3, largest=False, sorted=True
         )
         z_values, z_indices = torch.topk(
             z, 3, dim=0, largest=True, sorted=True
         )
-        return x_values, x_indices, y_values, y_indices, z_values, z_indices
+        z_unsorted_values, z_unsorted_indices = torch.topk(
+            z, 3, dim=0, largest=True, sorted=False
+        )
+        u_values, u_indices = torch.topk(
+            u, 2, dim=-1, largest=True, sorted=True
+        )
+        v_values, v_indices = torch.topk(
+            v, 2, dim=1, largest=True, sorted=True
+        )
+
+        return (
+            x_values,
+            x_indices,
+            x_k1_values,
+            x_k1_indices,
+            x_k0_values,
+            x_k0_indices,
+            x_unsorted_values,
+            x_unsorted_indices,
+            y_values,
+            y_indices,
+            z_values,
+            z_indices,
+            z_unsorted_values,
+            z_unsorted_indices,
+            u_values,
+            u_indices,
+            v_values,
+            v_indices,
+        )
 
 
 def test():
@@ -30,18 +68,20 @@ def test():
     x = torch.rand(1, 3, 16)
     y = torch.rand(1, 5, 9, 11)
     z = torch.rand(14, 8, 5, 9, 10)
+    u = torch.rand(2, 8, 4)
+    v = torch.rand(2, 4, 3)
 
-    a = net(x, y, z)
+    a = net(x, y, z, u, v)
 
     # export onnx
-    torch.onnx.export(net, (x, y, z), "test_torch_topk.onnx")
+    torch.onnx.export(net, (x, y, z, u, v), "test_torch_topk.onnx")
 
     # onnx to pnnx
     import os
 
     os.system(
         "../../src/pnnx test_torch_topk.onnx "
-        "inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]"
+        "inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10],[2,8,4],[2,4,3]"
     )
 
     # pnnx inference
diff --git a/tools/pnnx/tests/onnx/test_torch_topk_ncnn.py b/tools/pnnx/tests/onnx/test_torch_topk_ncnn.py
deleted file mode 100644
index bcb84b7afc45..000000000000
--- a/tools/pnnx/tests/onnx/test_torch_topk_ncnn.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import numpy as np
-import ncnn
-import torch
-
-def test_inference():
-    torch.manual_seed(0)
-    in0 = torch.rand(1, 3, 16, dtype=torch.float)
-    in1 = torch.rand(1, 5, 9, 11, dtype=torch.float)
-    in2 = torch.rand(14, 8, 5, 9, 10, dtype=torch.float)
-    out = []
-
-    with ncnn.Net() as net:
-        net.load_param("test_torch_topk.ncnn.param")
-        net.load_model("test_torch_topk.ncnn.bin")
-
-        with net.create_extractor() as ex:
-            ex.input("in0", ncnn.Mat(in0.numpy()).clone())
-            ex.input("in1", ncnn.Mat(in1.numpy()).clone())
-            ex.input("in2", ncnn.Mat(in2.numpy()).clone())
-
-            _, out0 = ex.extract("out0")
-            out.append(torch.from_numpy(np.array(out0)))
-            _, out1 = ex.extract("out1")
-            out.append(torch.from_numpy(np.array(out1)))
-            _, out2 = ex.extract("out2")
-            out.append(torch.from_numpy(np.array(out2)))
-            _, out3 = ex.extract("out3")
-            out.append(torch.from_numpy(np.array(out3)))
-            _, out4 = ex.extract("out4")
-            out.append(torch.from_numpy(np.array(out4)))
-            _, out5 = ex.extract("out5")
-            out.append(torch.from_numpy(np.array(out5)))
-
-    if len(out) == 1:
-        return out[0]
-    else:
-        return tuple(out)
-
-if __name__ == "__main__":
-    print(test_inference())
diff --git a/tools/pnnx/tests/onnx/test_torch_topk_pnnx.py b/tools/pnnx/tests/onnx/test_torch_topk_pnnx.py
deleted file mode 100644
index 2b4e7ed5abae..000000000000
--- a/tools/pnnx/tests/onnx/test_torch_topk_pnnx.py
+++ /dev/null
@@ -1,109 +0,0 @@
-import os
-import numpy as np
-import tempfile, zipfile
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-try:
-    import torchvision
-    import torchaudio
-except:
-    pass
-
-class TopK(nn.Module):
-    def __init__(self, axis=1, largest=1, sorted=1):
-        super(TopK, self).__init__()
-        self.axis = axis
-        self.largest = largest
-        self.sorted = sorted
-    def forward(self, x, k):
-        # Torch topk returns (values, indices)
-        return torch.topk(x, k.item() if hasattr(k, 'item') else k, dim=self.axis, largest=bool(self.largest), sorted=bool(self.sorted))
-
-class Model(nn.Module):
-    def __init__(self):
-        super(Model, self).__init__()
-
-        self.TopK_0 = TopK(axis=1, largest=1, sorted=1)
-        self.TopK_1 = TopK(axis=3, largest=0, sorted=1)
-        self.TopK_2 = TopK(axis=0, largest=1, sorted=1)
-
-        archive = zipfile.ZipFile('test_torch_topk.pnnx.bin', 'r')
-        archive.close()
-
-    def load_pnnx_bin_as_parameter(self, archive, key, shape, dtype, requires_grad=True):
-        return nn.Parameter(self.load_pnnx_bin_as_tensor(archive, key, shape, dtype), requires_grad)
-
-    def load_pnnx_bin_as_tensor(self, archive, key, shape, dtype):
-        fd, tmppath = tempfile.mkstemp()
-        with os.fdopen(fd, 'wb') as tmpf, archive.open(key) as keyfile:
-            tmpf.write(keyfile.read())
-        m = np.memmap(tmppath, dtype=dtype, mode='r', shape=shape).copy()
-        os.remove(tmppath)
-        return torch.from_numpy(m)
-
-    def forward(self, v_0, v_1, v_2):
-        v_3 = 2
-        v_4, v_5 = self.TopK_0(v_0, v_3)
-        v_6 = 4
-        v_7, v_8 = self.TopK_1(v_1, v_6)
-        v_9 = 3
-        v_10, v_11 = self.TopK_2(v_2, v_9)
-        return v_4, v_5, v_7, v_8, v_10, v_11
-
-def export_torchscript():
-    net = Model()
-    net.float()
-    net.eval()
-
-    torch.manual_seed(0)
-    v_0 = torch.rand(1, 3, 16, dtype=torch.float)
-    v_1 = torch.rand(1, 5, 9, 11, dtype=torch.float)
-    v_2 = torch.rand(14, 8, 5, 9, 10, dtype=torch.float)
-
-    mod = torch.jit.trace(net, (v_0, v_1, v_2))
-    mod.save("test_torch_topk_pnnx.py.pt")
-
-def export_onnx():
-    net = Model()
-    net.float()
-    net.eval()
-
-    torch.manual_seed(0)
-    v_0 = torch.rand(1, 3, 16, dtype=torch.float)
-    v_1 = torch.rand(1, 5, 9, 11, dtype=torch.float)
-    v_2 = torch.rand(14, 8, 5, 9, 10, dtype=torch.float)
-
-    torch.onnx.export(net, (v_0, v_1, v_2), "test_torch_topk_pnnx.py.onnx", export_params=True, operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK, opset_version=13, input_names=['in0', 'in1', 'in2'], output_names=['out0', 'out1', 'out2', 'out3', 'out4', 'out5'])
-
-def export_pnnx():
-    net = Model()
-    net.float()
-    net.eval()
-
-    torch.manual_seed(0)
-    v_0 = torch.rand(1, 3, 16, dtype=torch.float)
-    v_1 = torch.rand(1, 5, 9, 11, dtype=torch.float)
-    v_2 = torch.rand(14, 8, 5, 9, 10, dtype=torch.float)
-
-    import pnnx
-    pnnx.export(net, "test_torch_topk_pnnx.py.pt", (v_0, v_1, v_2))
-
-def export_ncnn():
-    export_pnnx()
-
-@torch.no_grad()
-def test_inference():
-    net = Model()
-    net.float()
-    net.eval()
-
-    torch.manual_seed(0)
-    v_0 = torch.rand(1, 3, 16, dtype=torch.float)
-    v_1 = torch.rand(1, 5, 9, 11, dtype=torch.float)
-    v_2 = torch.rand(14, 8, 5, 9, 10, dtype=torch.float)
-
-    return net(v_0, v_1, v_2)
-
-if __name__ == "__main__":
-    print(test_inference())

From 4b4b87a7c74086cae9b0d30a27ca26f12ac83738 Mon Sep 17 00:00:00 2001
From: vlordier <vincent.lordier@sanofi.com>
Date: Fri, 27 Feb 2026 15:11:34 +0100
Subject: [PATCH 06/29] tests: add sorted=0 coverage for topk

---
 tests/test_topk.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/test_topk.cpp b/tests/test_topk.cpp
index b35be1574b18..55a95ef56bf0 100644
--- a/tests/test_topk.cpp
+++ b/tests/test_topk.cpp
@@ -79,6 +79,7 @@ static int test_topk_0()
            || test_topk(a, 0, 1, 1, 1)
            || test_topk(a, 0, 5, 1, 1)
            || test_topk(a, -1, 7, 0, 1)
+           || test_topk(a, 0, 4, 1, 0)
            || test_topk(a, 0, 9, 1, 1);
 }
 
@@ -91,6 +92,7 @@ static int test_topk_1()
            || test_topk(a, 0, 5, 1, 1)
            || test_topk(a, 1, 3, 1, 1)
            || test_topk(a, -1, 8, 0, 1)
+           || test_topk(a, 1, 6, 0, 0)
            || test_topk(a, -2, 7, 1, 1);
 }
 
@@ -102,6 +104,7 @@ static int test_topk_2()
            || test_topk(a, 0, 3, 1, 1)
            || test_topk(a, 1, 4, 1, 1)
            || test_topk(a, 2, 2, 0, 1)
+           || test_topk(a, 2, 5, 1, 0)
            || test_topk(a, -1, 6, 1, 1)
            || test_topk(a, -2, 5, 0, 1)
            || test_topk(a, -3, 7, 1, 1);
@@ -115,6 +118,7 @@ static int test_topk_3()
            || test_topk(a, 0, 2, 1, 1)
            || test_topk(a, 1, 3, 0, 1)
            || test_topk(a, 2, 4, 1, 1)
+           || test_topk(a, 3, 4, 0, 0)
            || test_topk(a, 3, 5, 1, 1)
            || test_topk(a, -1, 6, 0, 1)
            || test_topk(a, -2, 3, 1, 1)

From c9e856e8f59e3faad636a7523976401048a7d1da Mon Sep 17 00:00:00 2001
From: vlordier <vincent.lordier@sanofi.com>
Date: Fri, 27 Feb 2026 15:29:13 +0100
Subject: [PATCH 07/29] tests: remove generated topk onnx artifacts

---
 tools/pnnx/tests/onnx/test_torch_topk.ncnn.bin  |   0
 tools/pnnx/tests/onnx/test_torch_topk.onnx.data |   0
 tools/pnnx/tests/onnx/test_torch_topk.pnnx.bin  | Bin 98 -> 0 bytes
 3 files changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 tools/pnnx/tests/onnx/test_torch_topk.ncnn.bin
 delete mode 100644 tools/pnnx/tests/onnx/test_torch_topk.onnx.data
 delete mode 100644 tools/pnnx/tests/onnx/test_torch_topk.pnnx.bin

diff --git a/tools/pnnx/tests/onnx/test_torch_topk.ncnn.bin b/tools/pnnx/tests/onnx/test_torch_topk.ncnn.bin
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/tools/pnnx/tests/onnx/test_torch_topk.onnx.data b/tools/pnnx/tests/onnx/test_torch_topk.onnx.data
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/tools/pnnx/tests/onnx/test_torch_topk.pnnx.bin b/tools/pnnx/tests/onnx/test_torch_topk.pnnx.bin
deleted file mode 100644
index aa99d4621ed08e4d5412634fb912b37433a365d8..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 98
gcmWIWW@FP~ARPpFv#}%VVgzymyjj`)qX7m603eYP6951J


From 4d5b35fed2d6b0c910e01aa4735fe3e6fb13b3c9 Mon Sep 17 00:00:00 2001
From: vlordier <vincent.lordier@sanofi.com>
Date: Fri, 27 Feb 2026 16:27:51 +0100
Subject: [PATCH 08/29] pnnx: drop unrelated cmake and symlink changes

---
 tools/pnnx/src/CMakeLists.txt | 12 +++++-------
 tools/pnnx/src/pnnx           |  1 -
 2 files changed, 5 insertions(+), 8 deletions(-)
 delete mode 120000 tools/pnnx/src/pnnx

diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt
index 6231e36b16ac..c554a6873e81 100644
--- a/tools/pnnx/src/CMakeLists.txt
+++ b/tools/pnnx/src/CMakeLists.txt
@@ -630,25 +630,23 @@ if(PROTOBUF_FOUND)
         set(CMAKE_CXX_STANDARD 17)
     endif()
 
-    if(COMMAND protobuf_generate_cpp)
+    if(Protobuf_FOUND OR protobuf_MODULE_COMPATIBLE)
         protobuf_generate_cpp(ONNX_PROTO_SRCS ONNX_PROTO_HDRS onnx-data.proto onnx-ml.proto onnx-operators-ml.proto)
         add_library(onnxproto STATIC ${ONNX_PROTO_SRCS} ${ONNX_PROTO_HDRS})
         target_include_directories(onnxproto PUBLIC ${PROTOBUF_INCLUDE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
         target_link_libraries(onnxproto PUBLIC ${PROTOBUF_LIBRARIES})
-    elseif(COMMAND protobuf_generate)
+    else()
         add_library(onnxproto STATIC onnx-data.proto onnx-ml.proto onnx-operators-ml.proto)
         target_include_directories(onnxproto PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
         protobuf_generate(TARGET onnxproto)
         target_link_libraries(onnxproto PUBLIC protobuf::libprotobuf)
-    else()
-        message(FATAL_ERROR "Neither protobuf_generate_cpp nor protobuf_generate is available. Please install protobuf with CMake codegen support.")
     endif()
 
     # use onnxruntime onnx proto if found
     if(onnxruntime_FOUND)
         add_dependencies(onnxruntime::onnxruntime onnxproto)
 
-        if(COMMAND protobuf_generate_cpp)
+        if(Protobuf_FOUND OR protobuf_MODULE_COMPATIBLE)
             set_property(TARGET onnxruntime::onnxruntime APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${PROTOBUF_INCLUDE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
             set_property(TARGET onnxruntime::onnxruntime APPEND PROPERTY INTERFACE_LINK_LIBRARIES ${PROTOBUF_LIBRARIES})
         else()
@@ -690,7 +688,7 @@ if(PROTOBUF_FOUND)
         save_onnx.cpp
     )
     if(onnxruntime_FOUND)
-        target_link_libraries(pnnx2onnx PRIVATE onnxruntime::onnxruntime onnxproto)
+        target_link_libraries(pnnx2onnx PRIVATE onnxruntime::onnxruntime)
     else()
         target_link_libraries(pnnx2onnx PRIVATE onnxproto)
     endif()
@@ -722,7 +720,7 @@ if(onnxruntime_FOUND)
     )
 
     add_library(onnx2pnnx OBJECT ${onnx2pnnx_SRCS})
-    target_link_libraries(onnx2pnnx PRIVATE onnxruntime::onnxruntime onnxproto)
+    target_link_libraries(onnx2pnnx PRIVATE onnxruntime::onnxruntime)
     target_compile_definitions(onnx2pnnx PRIVATE BUILD_ONNX2PNNX)
 
     message(STATUS "Building with onnx2pnnx")
diff --git a/tools/pnnx/src/pnnx b/tools/pnnx/src/pnnx
deleted file mode 120000
index 909f9eae4b3f..000000000000
--- a/tools/pnnx/src/pnnx
+++ /dev/null
@@ -1 +0,0 @@
-../build/src/pnnx
\ No newline at end of file

From 5c11058f6c8e543d27bc5a5c4b1ad6dabed11eab Mon Sep 17 00:00:00 2001
From: vlordier <vincent.lordier@sanofi.com>
Date: Fri, 27 Feb 2026 16:32:46 +0100
Subject: [PATCH 09/29] topk: reuse per-thread scratch buffer in forward

---
 src/layer/topk.cpp | 63 ++++++++++++++++++++++++----------------------
 1 file changed, 33 insertions(+), 30 deletions(-)

diff --git a/src/layer/topk.cpp b/src/layer/topk.cpp
index 72b4df40813d..2c9554ae06a9 100644
--- a/src/layer/topk.cpp
+++ b/src/layer/topk.cpp
@@ -152,44 +152,47 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
 
     const int total_lines = outer * inner;
 
-    #pragma omp parallel for num_threads(opt.num_threads)
-    for (int line = 0; line < total_lines; line++)
+    #pragma omp parallel num_threads(opt.num_threads)
     {
-        int outer_i = line / inner;
-        int inner_i = line - outer_i * inner;
-
-        int in_base = outer_i * axis_size * inner + inner_i;
-        int out_base = outer_i * _k * inner + inner_i;
-
         std::vector<std::pair<float, int> > vec;
         vec.resize(axis_size);
 
-        for (int j = 0; j < axis_size; j++)
-        {
-            vec[j].first = ptr[in_base + j * inner];
-            vec[j].second = j;
-        }
-
         topk_pair_comparator comp(largest_flag);
 
-        if (_k < axis_size)
+        #pragma omp for
+        for (int line = 0; line < total_lines; line++)
         {
-            if (sorted_flag)
-                std::partial_sort(vec.begin(), vec.begin() + _k, vec.end(), comp);
+            int outer_i = line / inner;
+            int inner_i = line - outer_i * inner;
+
+            int in_base = outer_i * axis_size * inner + inner_i;
+            int out_base = outer_i * _k * inner + inner_i;
+
+            for (int j = 0; j < axis_size; j++)
+            {
+                vec[j].first = ptr[in_base + j * inner];
+                vec[j].second = j;
+            }
+
+            if (_k < axis_size)
+            {
+                if (sorted_flag)
+                    std::partial_sort(vec.begin(), vec.begin() + _k, vec.end(), comp);
+                else
+                    std::nth_element(vec.begin(), vec.begin() + _k, vec.end(), comp);
+            }
             else
-                std::nth_element(vec.begin(), vec.begin() + _k, vec.end(), comp);
-        }
-        else
-        {
-            if (sorted_flag)
-                std::sort(vec.begin(), vec.end(), comp);
-        }
-
-        for (int j = 0; j < _k; j++)
-        {
-            outptr[out_base + j * inner] = vec[j].first;
-            if (outidxptr)
-                outidxptr[out_base + j * inner] = (float)vec[j].second;
+            {
+                if (sorted_flag)
+                    std::sort(vec.begin(), vec.end(), comp);
+            }
+
+            for (int j = 0; j < _k; j++)
+            {
+                outptr[out_base + j * inner] = vec[j].first;
+                if (outidxptr)
+                    outidxptr[out_base + j * inner] = (float)vec[j].second;
+            }
         }
     }
 

From 226bd88c4ead69883085b9dcf52e73d3be070057 Mon Sep 17 00:00:00 2001
From: vlordier <vincent.lordier@sanofi.com>
Date: Fri, 27 Feb 2026 16:34:48 +0100
Subject: [PATCH 10/29] topk: optimize sorted path and add k=0 fast return

---
 src/layer/topk.cpp | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/layer/topk.cpp b/src/layer/topk.cpp
index 2c9554ae06a9..77814c9e0600 100644
--- a/src/layer/topk.cpp
+++ b/src/layer/topk.cpp
@@ -131,6 +131,15 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
             return -100;
     }
 
+    if (_k == 0)
+    {
+        top_blobs[0] = values;
+        if (top_blobs.size() >= 2)
+            top_blobs[1] = indices;
+
+        return 0;
+    }
+
     const float* ptr = bottom_blob;
     float* outptr = values;
     float* outidxptr = indices;
@@ -177,7 +186,10 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
             if (_k < axis_size)
             {
                 if (sorted_flag)
-                    std::partial_sort(vec.begin(), vec.begin() + _k, vec.end(), comp);
+                {
+                    std::nth_element(vec.begin(), vec.begin() + _k, vec.end(), comp);
+                    std::sort(vec.begin(), vec.begin() + _k, comp);
+                }
                 else
                     std::nth_element(vec.begin(), vec.begin() + _k, vec.end(), comp);
             }

From 6c5978b0ab8f0478f8412d96d87585f05c56d779 Mon Sep 17 00:00:00 2001
From: vlordier <vincent.lordier@sanofi.com>
Date: Fri, 27 Feb 2026 16:36:01 +0100
Subject: [PATCH 11/29] topk: add k=1 fast path for embedded runtime

---
 src/layer/topk.cpp  | 36 ++++++++++++++++++++++++++++++++++++
 tests/test_topk.cpp |  1 +
 2 files changed, 37 insertions(+)

diff --git a/src/layer/topk.cpp b/src/layer/topk.cpp
index 77814c9e0600..d7a67fe87b33 100644
--- a/src/layer/topk.cpp
+++ b/src/layer/topk.cpp
@@ -161,6 +161,42 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
 
     const int total_lines = outer * inner;
 
+    if (_k == 1)
+    {
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int line = 0; line < total_lines; line++)
+        {
+            int outer_i = line / inner;
+            int inner_i = line - outer_i * inner;
+
+            int in_base = outer_i * axis_size * inner + inner_i;
+            int out_base = outer_i * inner + inner_i;
+
+            float best_value = ptr[in_base];
+            int best_index = 0;
+
+            for (int j = 1; j < axis_size; j++)
+            {
+                const float candidate_value = ptr[in_base + j * inner];
+                if (topk_pair_comp(std::make_pair(candidate_value, j), std::make_pair(best_value, best_index), largest_flag))
+                {
+                    best_value = candidate_value;
+                    best_index = j;
+                }
+            }
+
+            outptr[out_base] = best_value;
+            if (outidxptr)
+                outidxptr[out_base] = (float)best_index;
+        }
+
+        top_blobs[0] = values;
+        if (top_blobs.size() >= 2)
+            top_blobs[1] = indices;
+
+        return 0;
+    }
+
     #pragma omp parallel num_threads(opt.num_threads)
     {
         std::vector<std::pair<float, int> > vec;
diff --git a/tests/test_topk.cpp b/tests/test_topk.cpp
index 55a95ef56bf0..0f9d8fee3a4e 100644
--- a/tests/test_topk.cpp
+++ b/tests/test_topk.cpp
@@ -78,6 +78,7 @@ static int test_topk_0()
     return 0
            || test_topk(a, 0, 1, 1, 1)
            || test_topk(a, 0, 5, 1, 1)
+            || test_topk(a, 0, 1, 0, 0)
            || test_topk(a, -1, 7, 0, 1)
            || test_topk(a, 0, 4, 1, 0)
            || test_topk(a, 0, 9, 1, 1);

From e16514bb00a95e73edf770922c2a399750cddad9 Mon Sep 17 00:00:00 2001
From: vlordier <vincent.lordier@sanofi.com>
Date: Fri, 27 Feb 2026 16:37:07 +0100
Subject: [PATCH 12/29] topk: avoid pair temporaries in k=1 hot loop

---
 src/layer/topk.cpp | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/src/layer/topk.cpp b/src/layer/topk.cpp
index d7a67fe87b33..d30af50c8d52 100644
--- a/src/layer/topk.cpp
+++ b/src/layer/topk.cpp
@@ -37,6 +37,25 @@ static inline bool topk_pair_comp(const std::pair<float, int>& a, const std::pai
     return a.second < b.second;
 }
 
+static inline bool topk_value_index_comp(float a_value, int a_index, float b_value, int b_index, bool largest)
+{
+    const bool a_nan = topk_isnan(a_value);
+    const bool b_nan = topk_isnan(b_value);
+
+    if (a_nan || b_nan)
+    {
+        if (a_nan != b_nan)
+            return !a_nan && b_nan;
+
+        return a_index < b_index;
+    }
+
+    if (a_value != b_value)
+        return largest ? (a_value > b_value) : (a_value < b_value);
+
+    return a_index < b_index;
+}
+
 struct topk_pair_comparator
 {
     topk_pair_comparator(bool _largest)
@@ -178,7 +197,7 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
             for (int j = 1; j < axis_size; j++)
             {
                 const float candidate_value = ptr[in_base + j * inner];
-                if (topk_pair_comp(std::make_pair(candidate_value, j), std::make_pair(best_value, best_index), largest_flag))
+                if (topk_value_index_comp(candidate_value, j, best_value, best_index, largest_flag))
                 {
                     best_value = candidate_value;
                     best_index = j;

From 00be7f82e60dc139991cb969b013df5fcfb5917a Mon Sep 17 00:00:00 2001
From: vlordier <vincent.lordier@sanofi.com>
Date: Fri, 27 Feb 2026 16:39:32 +0100
Subject: [PATCH 13/29] topk: reduce writeback branching in hot loop

---
 src/layer/topk.cpp | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/src/layer/topk.cpp b/src/layer/topk.cpp
index d30af50c8d52..3026b8088ffa 100644
--- a/src/layer/topk.cpp
+++ b/src/layer/topk.cpp
@@ -162,6 +162,7 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
     const float* ptr = bottom_blob;
     float* outptr = values;
     float* outidxptr = indices;
+    const bool output_indices = outidxptr != 0;
 
     int inner = 1;
     for (int i = 0; i < positive_axis; i++)
@@ -205,7 +206,7 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
             }
 
             outptr[out_base] = best_value;
-            if (outidxptr)
+            if (output_indices)
                 outidxptr[out_base] = (float)best_index;
         }
 
@@ -254,11 +255,20 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
                     std::sort(vec.begin(), vec.end(), comp);
             }
 
-            for (int j = 0; j < _k; j++)
+            if (output_indices)
             {
-                outptr[out_base + j * inner] = vec[j].first;
-                if (outidxptr)
+                for (int j = 0; j < _k; j++)
+                {
+                    outptr[out_base + j * inner] = vec[j].first;
                     outidxptr[out_base + j * inner] = (float)vec[j].second;
+                }
+            }
+            else
+            {
+                for (int j = 0; j < _k; j++)
+                {
+                    outptr[out_base + j * inner] = vec[j].first;
+                }
             }
         }
     }

From 1fe44637e330453a3b9a95ff0d54e2244e58fe03 Mon Sep 17 00:00:00 2001
From: vlordier <vincent.lordier@sanofi.com>
Date: Fri, 27 Feb 2026 16:47:47 +0100
Subject: [PATCH 14/29] topk: add fast path for unsorted full-k copy

---
 src/layer/topk.cpp | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/src/layer/topk.cpp b/src/layer/topk.cpp
index 3026b8088ffa..c87c485fc8e3 100644
--- a/src/layer/topk.cpp
+++ b/src/layer/topk.cpp
@@ -217,6 +217,41 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
         return 0;
     }
 
+    if (_k == axis_size && !sorted_flag)
+    {
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int line = 0; line < total_lines; line++)
+        {
+            int outer_i = line / inner;
+            int inner_i = line - outer_i * inner;
+
+            int in_base = outer_i * axis_size * inner + inner_i;
+            int out_base = outer_i * _k * inner + inner_i;
+
+            if (output_indices)
+            {
+                for (int j = 0; j < _k; j++)
+                {
+                    outptr[out_base + j * inner] = ptr[in_base + j * inner];
+                    outidxptr[out_base + j * inner] = (float)j;
+                }
+            }
+            else
+            {
+                for (int j = 0; j < _k; j++)
+                {
+                    outptr[out_base + j * inner] = ptr[in_base + j * inner];
+                }
+            }
+        }
+
+        top_blobs[0] = values;
+        if (top_blobs.size() >= 2)
+            top_blobs[1] = indices;
+
+        return 0;
+    }
+
     #pragma omp parallel num_threads(opt.num_threads)
     {
         std::vector<std::pair<float, int> > vec;

From 6ea29eb6e380562f613dc11511e237070c997422 Mon Sep 17 00:00:00 2001
From: vlordier <vincent.lordier@sanofi.com>
Date: Fri, 27 Feb 2026 16:49:20 +0100
Subject: [PATCH 15/29] topk: add small-k hot path for embedded runtime

---
 src/layer/topk.cpp | 72 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)

diff --git a/src/layer/topk.cpp b/src/layer/topk.cpp
index c87c485fc8e3..00d632068dd6 100644
--- a/src/layer/topk.cpp
+++ b/src/layer/topk.cpp
@@ -252,6 +252,78 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
         return 0;
     }
 
+    if (_k <= 4)
+    {
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int line = 0; line < total_lines; line++)
+        {
+            int outer_i = line / inner;
+            int inner_i = line - outer_i * inner;
+
+            int in_base = outer_i * axis_size * inner + inner_i;
+            int out_base = outer_i * _k * inner + inner_i;
+
+            float top_values[4];
+            int top_indices[4];
+            int top_count = 0;
+
+            for (int j = 0; j < axis_size; j++)
+            {
+                const float candidate_value = ptr[in_base + j * inner];
+
+                if (top_count < _k)
+                {
+                    int insert_pos = top_count;
+                    while (insert_pos > 0 && topk_value_index_comp(candidate_value, j, top_values[insert_pos - 1], top_indices[insert_pos - 1], largest_flag))
+                    {
+                        top_values[insert_pos] = top_values[insert_pos - 1];
+                        top_indices[insert_pos] = top_indices[insert_pos - 1];
+                        insert_pos--;
+                    }
+
+                    top_values[insert_pos] = candidate_value;
+                    top_indices[insert_pos] = j;
+                    top_count++;
+                }
+                else if (topk_value_index_comp(candidate_value, j, top_values[_k - 1], top_indices[_k - 1], largest_flag))
+                {
+                    int insert_pos = _k - 1;
+                    while (insert_pos > 0 && topk_value_index_comp(candidate_value, j, top_values[insert_pos - 1], top_indices[insert_pos - 1], largest_flag))
+                    {
+                        top_values[insert_pos] = top_values[insert_pos - 1];
+                        top_indices[insert_pos] = top_indices[insert_pos - 1];
+                        insert_pos--;
+                    }
+
+                    top_values[insert_pos] = candidate_value;
+                    top_indices[insert_pos] = j;
+                }
+            }
+
+            if (output_indices)
+            {
+                for (int j = 0; j < _k; j++)
+                {
+                    outptr[out_base + j * inner] = top_values[j];
+                    outidxptr[out_base + j * inner] = (float)top_indices[j];
+                }
+            }
+            else
+            {
+                for (int j = 0; j < _k; j++)
+                {
+                    outptr[out_base + j * inner] = top_values[j];
+                }
+            }
+        }
+
+        top_blobs[0] = values;
+        if (top_blobs.size() >= 2)
+            top_blobs[1] = indices;
+
+        return 0;
+    }
+
     #pragma omp parallel num_threads(opt.num_threads)
     {
         std::vector<std::pair<float, int> > vec;

From 7befff69286b4abe9b538d65084f84213809f4b4 Mon Sep 17 00:00:00 2001
From: vlordier <vincent.lordier@sanofi.com>
Date: Fri, 27 Feb 2026 16:51:56 +0100
Subject: [PATCH 16/29] topk: add guarded neon fast path for k=1

---
 src/layer/topk.cpp | 75 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

diff --git a/src/layer/topk.cpp b/src/layer/topk.cpp
index 00d632068dd6..f527021e40bb 100644
--- a/src/layer/topk.cpp
+++ b/src/layer/topk.cpp
@@ -4,10 +4,15 @@
 #include "topk.h"
 
 #include <algorithm>
+#include <float.h>
 #include <stdint.h>
 #include <string.h>
 #include <vector>
 
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif // __ARM_NEON
+
 namespace ncnn {
 
 static inline bool topk_isnan(float v)
@@ -192,6 +197,76 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
             int in_base = outer_i * axis_size * inner + inner_i;
             int out_base = outer_i * inner + inner_i;
 
+#if __ARM_NEON
+            if (!output_indices && inner == 1 && axis_size >= 4)
+            {
+                const float* lineptr = ptr + in_base;
+
+                float best_value = largest_flag ? -FLT_MAX : FLT_MAX;
+                int j = 0;
+                int has_nan = 0;
+
+                for (; j + 3 < axis_size; j += 4)
+                {
+                    float32x4_t v = vld1q_f32(lineptr + j);
+                    uint32x4_t nan_mask = vmvnq_u32(vceqq_f32(v, v));
+                    if (vmaxvq_u32(nan_mask) != 0)
+                    {
+                        has_nan = 1;
+                        break;
+                    }
+
+                    float tmp[4];
+                    vst1q_f32(tmp, v);
+
+                    if (largest_flag)
+                    {
+                        if (tmp[0] > best_value) best_value = tmp[0];
+                        if (tmp[1] > best_value) best_value = tmp[1];
+                        if (tmp[2] > best_value) best_value = tmp[2];
+                        if (tmp[3] > best_value) best_value = tmp[3];
+                    }
+                    else
+                    {
+                        if (tmp[0] < best_value) best_value = tmp[0];
+                        if (tmp[1] < best_value) best_value = tmp[1];
+                        if (tmp[2] < best_value) best_value = tmp[2];
+                        if (tmp[3] < best_value) best_value = tmp[3];
+                    }
+                }
+
+                if (!has_nan)
+                {
+                    for (; j < axis_size; j++)
+                    {
+                        const float candidate_value = lineptr[j];
+                        if (topk_isnan(candidate_value))
+                        {
+                            has_nan = 1;
+                            break;
+                        }
+
+                        if (largest_flag)
+                        {
+                            if (candidate_value > best_value)
+                                best_value = candidate_value;
+                        }
+                        else
+                        {
+                            if (candidate_value < best_value)
+                                best_value = candidate_value;
+                        }
+                    }
+                }
+
+                if (!has_nan)
+                {
+                    outptr[out_base] = best_value;
+                    continue;
+                }
+            }
+#endif // __ARM_NEON
+
             float best_value = ptr[in_base];
             int best_index = 0;
 

From 5ba7fbcab1ec7aa2a0ce945461ab53ebce1049b9 Mon Sep 17 00:00:00 2001
From: vlordier <vincent.lordier@sanofi.com>
Date: Fri, 27 Feb 2026 16:56:02 +0100
Subject: [PATCH 17/29] topk: fix neon k=1 inf initialization edge case

---
 src/layer/topk.cpp | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/layer/topk.cpp b/src/layer/topk.cpp
index f527021e40bb..dbab3b19ed20 100644
--- a/src/layer/topk.cpp
+++ b/src/layer/topk.cpp
@@ -4,7 +4,6 @@
 #include "topk.h"
 
 #include <algorithm>
-#include <float.h>
 #include <stdint.h>
 #include <string.h>
 #include <vector>
@@ -202,11 +201,11 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
             {
                 const float* lineptr = ptr + in_base;
 
-                float best_value = largest_flag ? -FLT_MAX : FLT_MAX;
-                int j = 0;
-                int has_nan = 0;
+                float best_value = lineptr[0];
+                int j = 1;
+                int has_nan = topk_isnan(best_value);
 
-                for (; j + 3 < axis_size; j += 4)
+                for (; !has_nan && j + 3 < axis_size; j += 4)
                 {
                     float32x4_t v = vld1q_f32(lineptr + j);
                     uint32x4_t nan_mask = vmvnq_u32(vceqq_f32(v, v));

From e4b4073935f9df6931188da31e00ee2eef3a84d4 Mon Sep 17 00:00:00 2001
From: vlordier <vincent.lordier@sanofi.com>
Date: Fri, 27 Feb 2026 16:58:55 +0100
Subject: [PATCH 18/29] topk: make neon nan-mask check portable (drop vmaxvq_u32)

---
 src/layer/topk.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/layer/topk.cpp b/src/layer/topk.cpp
index dbab3b19ed20..59946b1d6e43 100644
--- a/src/layer/topk.cpp
+++ b/src/layer/topk.cpp
@@ -209,7 +209,9 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
                 {
                     float32x4_t v = vld1q_f32(lineptr + j);
                     uint32x4_t nan_mask = vmvnq_u32(vceqq_f32(v, v));
-                    if (vmaxvq_u32(nan_mask) != 0)
+                    uint32_t nan_mask_lanes[4];
+                    vst1q_u32(nan_mask_lanes, nan_mask);
+                    if (nan_mask_lanes[0] || nan_mask_lanes[1] || nan_mask_lanes[2] || nan_mask_lanes[3])
                     {
                         has_nan = 1;
                         break;

From 49dbc7be2f4f7e56f4efc2848b8da4e80387bc00 Mon Sep 17 00:00:00 2001
From: vlordier <vincent.lordier@sanofi.com>
Date: Fri, 27 Feb 2026 17:04:24 +0100
Subject: [PATCH 19/29] topk: optimize small-k unsorted selection path

---
 src/layer/topk.cpp | 72 +++++++++++++++++++++++++++++++++-------------
 1 file changed, 52 insertions(+), 20 deletions(-)

diff --git a/src/layer/topk.cpp b/src/layer/topk.cpp
index 59946b1d6e43..10b7b1d2ccc0 100644
--- a/src/layer/topk.cpp
+++ b/src/layer/topk.cpp
@@ -343,36 +343,68 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
             int top_indices[4];
             int top_count = 0;
 
-            for (int j = 0; j < axis_size; j++)
+            if (sorted_flag)
             {
-                const float candidate_value = ptr[in_base + j * inner];
-
-                if (top_count < _k)
+                for (int j = 0; j < axis_size; j++)
                 {
-                    int insert_pos = top_count;
-                    while (insert_pos > 0 && topk_value_index_comp(candidate_value, j, top_values[insert_pos - 1], top_indices[insert_pos - 1], largest_flag))
+                    const float candidate_value = ptr[in_base + j * inner];
+
+                    if (top_count < _k)
                     {
-                        top_values[insert_pos] = top_values[insert_pos - 1];
-                        top_indices[insert_pos] = top_indices[insert_pos - 1];
-                        insert_pos--;
+                        int insert_pos = top_count;
+                        while (insert_pos > 0 && topk_value_index_comp(candidate_value, j, top_values[insert_pos - 1], top_indices[insert_pos - 1], largest_flag))
+                        {
+                            top_values[insert_pos] = top_values[insert_pos - 1];
+                            top_indices[insert_pos] = top_indices[insert_pos - 1];
+                            insert_pos--;
+                        }
+
+                        top_values[insert_pos] = candidate_value;
+                        top_indices[insert_pos] = j;
+                        top_count++;
                     }
+                    else if (topk_value_index_comp(candidate_value, j, top_values[_k - 1], top_indices[_k - 1], largest_flag))
+                    {
+                        int insert_pos = _k - 1;
+                        while (insert_pos > 0 && topk_value_index_comp(candidate_value, j, top_values[insert_pos - 1], top_indices[insert_pos - 1], largest_flag))
+                        {
+                            top_values[insert_pos] = top_values[insert_pos - 1];
+                            top_indices[insert_pos] = top_indices[insert_pos - 1];
+                            insert_pos--;
+                        }
 
-                    top_values[insert_pos] = candidate_value;
-                    top_indices[insert_pos] = j;
-                    top_count++;
+                        top_values[insert_pos] = candidate_value;
+                        top_indices[insert_pos] = j;
+                    }
                 }
-                else if (topk_value_index_comp(candidate_value, j, top_values[_k - 1], top_indices[_k - 1], largest_flag))
+            }
+            else
+            {
+                for (int j = 0; j < axis_size; j++)
                 {
-                    int insert_pos = _k - 1;
-                    while (insert_pos > 0 && topk_value_index_comp(candidate_value, j, top_values[insert_pos - 1], top_indices[insert_pos - 1], largest_flag))
+                    const float candidate_value = ptr[in_base + j * inner];
+
+                    if (top_count < _k)
                     {
-                        top_values[insert_pos] = top_values[insert_pos - 1];
-                        top_indices[insert_pos] = top_indices[insert_pos - 1];
-                        insert_pos--;
+                        top_values[top_count] = candidate_value;
+                        top_indices[top_count] = j;
+                        top_count++;
                     }
+                    else
+                    {
+                        int worst_pos = 0;
+                        for (int t = 1; t < _k; t++)
+                        {
+                            if (topk_value_index_comp(top_values[worst_pos], top_indices[worst_pos], top_values[t], top_indices[t], largest_flag))
+                                worst_pos = t;
+                        }
 
-                    top_values[insert_pos] = candidate_value;
-                    top_indices[insert_pos] = j;
+                        if (topk_value_index_comp(candidate_value, j, top_values[worst_pos], top_indices[worst_pos], largest_flag))
+                        {
+                            top_values[worst_pos] = candidate_value;
+                            top_indices[worst_pos] = j;
+                        }
+                    }
                 }
             }
 

From 9d31f3bee6185a8102be5f84131bcf972e0a5946 Mon Sep 17 00:00:00 2001
From: vlordier <vincent.lordier@sanofi.com>
Date: Fri, 27 Feb 2026 17:18:19 +0100
Subject: [PATCH 20/29] tests: add values-only topk coverage in cpp and onnx

---
 tests/test_topk.cpp                      | 97 +++++++++++++++++++++++-
 tools/pnnx/tests/onnx/test_torch_topk.py |  4 +
 2 files changed, 100 insertions(+), 1 deletion(-)

diff --git a/tests/test_topk.cpp b/tests/test_topk.cpp
index 0f9d8fee3a4e..8568041b5c34 100644
--- a/tests/test_topk.cpp
+++ b/tests/test_topk.cpp
@@ -49,6 +49,49 @@ static int test_topk_cpu_forward(const ncnn::Mat& a, int axis, int k, int larges
     return 0;
 }
 
+static int test_topk_cpu_forward_values_only(const ncnn::Mat& a, int axis, int k, int largest, int sorted, ncnn::Mat& values)
+{
+    ncnn::ParamDict pd;
+    pd.set(0, axis);
+    pd.set(1, largest);
+    pd.set(2, sorted);
+    pd.set(3, k);
+
+    std::vector<ncnn::Mat> weights(0);
+
+    ncnn::Option opt;
+    opt.num_threads = 1;
+    opt.use_vulkan_compute = false;
+    opt.use_packing_layout = false;
+
+    ncnn::Layer* op = ncnn::create_layer_cpu("TopK");
+    if (!op)
+        return -1;
+
+    op->load_param(pd);
+
+    ncnn::ModelBinFromMatArray mb(weights.data());
+    op->load_model(mb);
+
+    op->create_pipeline(opt);
+
+    std::vector<ncnn::Mat> bottom_blobs(1);
+    bottom_blobs[0] = a;
+
+    std::vector<ncnn::Mat> top_blobs(1);
+    int ret = op->forward(bottom_blobs, top_blobs, opt);
+
+    op->destroy_pipeline(opt);
+    delete op;
+
+    if (ret != 0)
+        return ret;
+
+    values = top_blobs[0];
+
+    return 0;
+}
+
 static int test_topk(const ncnn::Mat& a, int axis, int k, int largest, int sorted)
 {
     ncnn::ParamDict pd;
@@ -251,6 +294,57 @@ static int test_topk_nan_robust()
     return 0;
 }
 
+static int test_topk_values_only_fastpaths()
+{
+    ncnn::Mat a(5);
+    float* ptr = a;
+    ptr[0] = 1.f;
+    ptr[1] = -2.f;
+    ptr[2] = 4.f;
+    ptr[3] = 3.f;
+    ptr[4] = 0.f;
+
+    ncnn::Mat values;
+
+    int ret = test_topk_cpu_forward_values_only(a, 0, 1, 1, 0, values);
+    if (ret != 0)
+    {
+        fprintf(stderr, "test_topk_values_only_fastpaths k1 failed ret=%d\n", ret);
+        return -1;
+    }
+
+    if (values.w != 1 || ((const float*)values)[0] != 4.f)
+    {
+        fprintf(stderr, "test_topk_values_only_fastpaths k1 result mismatch\n");
+        return -1;
+    }
+
+    ret = test_topk_cpu_forward_values_only(a, 0, 5, 1, 0, values);
+    if (ret != 0)
+    {
+        fprintf(stderr, "test_topk_values_only_fastpaths fullk failed ret=%d\n", ret);
+        return -1;
+    }
+
+    if (values.w != 5)
+    {
+        fprintf(stderr, "test_topk_values_only_fastpaths fullk shape mismatch\n");
+        return -1;
+    }
+
+    const float* vptr = values;
+    for (int i = 0; i < 5; i++)
+    {
+        if (vptr[i] != ptr[i])
+        {
+            fprintf(stderr, "test_topk_values_only_fastpaths fullk value mismatch\n");
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
 int main()
 {
     SRAND(7767517);
@@ -261,5 +355,6 @@ int main()
            || test_topk_2()
            || test_topk_3()
            || test_topk_inf_order()
-           || test_topk_nan_robust();
+           || test_topk_nan_robust()
+           || test_topk_values_only_fastpaths();
 }
diff --git a/tools/pnnx/tests/onnx/test_torch_topk.py b/tools/pnnx/tests/onnx/test_torch_topk.py
index d62db5990003..dfd99ee2ac26 100644
--- a/tools/pnnx/tests/onnx/test_torch_topk.py
+++ b/tools/pnnx/tests/onnx/test_torch_topk.py
@@ -22,6 +22,9 @@ def forward(self, x, y, z, u, v):
         x_unsorted_values, x_unsorted_indices = torch.topk(
             x, 2, dim=1, largest=True, sorted=False
         )
+        x_values_only = torch.topk(
+            x, 3, dim=1, largest=True, sorted=True
+        )[0]
         y_values, y_indices = torch.topk(
             y, 4, dim=3, largest=False, sorted=True
         )
@@ -47,6 +50,7 @@ def forward(self, x, y, z, u, v):
             x_k0_indices,
             x_unsorted_values,
             x_unsorted_indices,
+            x_values_only,
             y_values,
             y_indices,
             z_values,

From 84e083b6f49631583d997790948461adefc8993e Mon Sep 17 00:00:00 2001
From: vlordier <vlordier@users.noreply.github.com>
Date: Fri, 10 Apr 2026 12:18:48 +0200
Subject: [PATCH 21/29] topk: fix STL compatibility, cstep indexing, omp
 barrier, and code style

- Include <algorithm>/<vector> only when NCNN_SIMPLESTL is off; include
  simplestl.h otherwise
- Use std::partial_sort in simplestl mode (no std::nth_element available)
- Guard <math.h> in tests behind #if !NCNN_SIMPLESTL to avoid simplemath.h
  conflict; define INFINITY/NAN as float expressions in simplestl mode
- Fix cstep-unaware indexing for 3D/4D input and output tensors: use the
  actual cstep for the channel offset (and as the axis stride when the
  axis is the channel dim) instead of assuming a contiguous w*h layout
- Convert #pragma omp parallel + inner #pragma omp for to #pragma omp
  parallel for to avoid __kmpc_barrier in simpleomp mode
- Fix copyright year 2026->2025
- Apply code-format whitespace cleanup
---
 src/layer/topk.cpp                | 178 +++++++++++++++++++++---------
 src/layer/topk.h                  |   2 +-
 tests/test_topk.cpp               |  24 ++--
 tools/pnnx/src/ir.cpp             |   8 +-
 tools/pnnx/src/pass_ncnn/TopK.cpp |   2 +-
 5 files changed, 145 insertions(+), 69 deletions(-)

diff --git a/src/layer/topk.cpp b/src/layer/topk.cpp
index 10b7b1d2ccc0..3b78fbfce3fe 100644
--- a/src/layer/topk.cpp
+++ b/src/layer/topk.cpp
@@ -1,12 +1,17 @@
-// Copyright 2026 Tencent
+// Copyright 2025 Tencent
 // SPDX-License-Identifier: BSD-3-Clause
 
 #include "topk.h"
 
-#include <algorithm>
 #include <stdint.h>
 #include <string.h>
+
+#if NCNN_SIMPLESTL
+#include "simplestl.h"
+#else
+#include <algorithm>
 #include <vector>
+#endif
 
 #if __ARM_NEON
 #include <arm_neon.h>
@@ -185,6 +190,21 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
 
     const int total_lines = outer * inner;
 
+    // ncnn 3-/4-D mats have a channel stride (cstep) that may be larger than w*h
+    // due to alignment padding.  The flat inner/outer indexing must account for this:
+    //   - when axis reduces a non-channel dim, the outer loop spans channels and
+    //     the channel offset must use cstep rather than the product of spatial sizes;
+    //   - when axis IS the channel dim, the per-element j-stride must be cstep.
+    const size_t in_cstep = (dims >= 3) ? (size_t)bottom_blob.cstep : 0;
+    const size_t out_cstep = (dims >= 3) ? values.cstep : 0;
+    const bool axis_is_channel = (dims >= 3 && positive_axis == dims - 1);
+    // spatial-only outer count: channels factored out so cstep can be used separately
+    const int c_channels = (!axis_is_channel && dims >= 3) ? shape[dims - 1] : 1;
+    const int outer_spatial = (dims >= 3 && !axis_is_channel) ? outer / c_channels : outer;
+    // stride when stepping along the axis in memory
+    const size_t in_axis_stride = axis_is_channel ? in_cstep : (size_t)inner;
+    const size_t out_axis_stride = axis_is_channel ? out_cstep : (size_t)inner;
+
     if (_k == 1)
     {
         #pragma omp parallel for num_threads(opt.num_threads)
@@ -193,8 +213,19 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
             int outer_i = line / inner;
             int inner_i = line - outer_i * inner;
 
-            int in_base = outer_i * axis_size * inner + inner_i;
-            int out_base = outer_i * inner + inner_i;
+            size_t in_base, out_base;
+            if (!axis_is_channel && dims >= 3)
+            {
+                const int ci = outer_i / outer_spatial;
+                const int sp_i = outer_i % outer_spatial;
+                in_base = (size_t)ci * in_cstep + (size_t)sp_i * axis_size * inner + inner_i;
+                out_base = (size_t)ci * out_cstep + (size_t)sp_i * 1 * inner + inner_i;
+            }
+            else
+            {
+                in_base = (size_t)outer_i * axis_size * inner + inner_i;
+                out_base = (size_t)outer_i * 1 * inner + inner_i;
+            }
 
 #if __ARM_NEON
             if (!output_indices && inner == 1 && axis_size >= 4)
@@ -273,7 +304,7 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
 
             for (int j = 1; j < axis_size; j++)
             {
-                const float candidate_value = ptr[in_base + j * inner];
+                const float candidate_value = ptr[in_base + j * in_axis_stride];
                 if (topk_value_index_comp(candidate_value, j, best_value, best_index, largest_flag))
                 {
                     best_value = candidate_value;
@@ -301,22 +332,33 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
             int outer_i = line / inner;
             int inner_i = line - outer_i * inner;
 
-            int in_base = outer_i * axis_size * inner + inner_i;
-            int out_base = outer_i * _k * inner + inner_i;
+            size_t in_base, out_base;
+            if (!axis_is_channel && dims >= 3)
+            {
+                const int ci = outer_i / outer_spatial;
+                const int sp_i = outer_i % outer_spatial;
+                in_base = (size_t)ci * in_cstep + (size_t)sp_i * axis_size * inner + inner_i;
+                out_base = (size_t)ci * out_cstep + (size_t)sp_i * _k * inner + inner_i;
+            }
+            else
+            {
+                in_base = (size_t)outer_i * axis_size * inner + inner_i;
+                out_base = (size_t)outer_i * _k * inner + inner_i;
+            }
 
             if (output_indices)
             {
                 for (int j = 0; j < _k; j++)
                 {
-                    outptr[out_base + j * inner] = ptr[in_base + j * inner];
-                    outidxptr[out_base + j * inner] = (float)j;
+                    outptr[out_base + j * out_axis_stride] = ptr[in_base + j * in_axis_stride];
+                    outidxptr[out_base + j * out_axis_stride] = (float)j;
                 }
             }
             else
             {
                 for (int j = 0; j < _k; j++)
                 {
-                    outptr[out_base + j * inner] = ptr[in_base + j * inner];
+                    outptr[out_base + j * out_axis_stride] = ptr[in_base + j * in_axis_stride];
                 }
             }
         }
@@ -336,8 +378,19 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
             int outer_i = line / inner;
             int inner_i = line - outer_i * inner;
 
-            int in_base = outer_i * axis_size * inner + inner_i;
-            int out_base = outer_i * _k * inner + inner_i;
+            size_t in_base, out_base;
+            if (!axis_is_channel && dims >= 3)
+            {
+                const int ci = outer_i / outer_spatial;
+                const int sp_i = outer_i % outer_spatial;
+                in_base = (size_t)ci * in_cstep + (size_t)sp_i * axis_size * inner + inner_i;
+                out_base = (size_t)ci * out_cstep + (size_t)sp_i * _k * inner + inner_i;
+            }
+            else
+            {
+                in_base = (size_t)outer_i * axis_size * inner + inner_i;
+                out_base = (size_t)outer_i * _k * inner + inner_i;
+            }
 
             float top_values[4];
             int top_indices[4];
@@ -347,7 +400,7 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
             {
                 for (int j = 0; j < axis_size; j++)
                 {
-                    const float candidate_value = ptr[in_base + j * inner];
+                    const float candidate_value = ptr[in_base + j * in_axis_stride];
 
                     if (top_count < _k)
                     {
@@ -382,7 +435,7 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
             {
                 for (int j = 0; j < axis_size; j++)
                 {
-                    const float candidate_value = ptr[in_base + j * inner];
+                    const float candidate_value = ptr[in_base + j * in_axis_stride];
 
                     if (top_count < _k)
                     {
@@ -412,15 +465,15 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
             {
                 for (int j = 0; j < _k; j++)
                 {
-                    outptr[out_base + j * inner] = top_values[j];
-                    outidxptr[out_base + j * inner] = (float)top_indices[j];
+                    outptr[out_base + j * out_axis_stride] = top_values[j];
+                    outidxptr[out_base + j * out_axis_stride] = (float)top_indices[j];
                 }
             }
             else
             {
                 for (int j = 0; j < _k; j++)
                 {
-                    outptr[out_base + j * inner] = top_values[j];
+                    outptr[out_base + j * out_axis_stride] = top_values[j];
                 }
             }
         }
@@ -432,58 +485,73 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
         return 0;
     }
 
-    #pragma omp parallel num_threads(opt.num_threads)
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int line = 0; line < total_lines; line++)
     {
-        std::vector<std::pair<float, int> > vec;
-        vec.resize(axis_size);
+        std::vector<std::pair<float, int> > vec(axis_size);
 
         topk_pair_comparator comp(largest_flag);
 
-        #pragma omp for
-        for (int line = 0; line < total_lines; line++)
-        {
-            int outer_i = line / inner;
-            int inner_i = line - outer_i * inner;
+        int outer_i = line / inner;
+        int inner_i = line - outer_i * inner;
 
-            int in_base = outer_i * axis_size * inner + inner_i;
-            int out_base = outer_i * _k * inner + inner_i;
+        size_t in_base, out_base;
+        if (!axis_is_channel && dims >= 3)
+        {
+            const int ci = outer_i / outer_spatial;
+            const int sp_i = outer_i % outer_spatial;
+            in_base = (size_t)ci * in_cstep + (size_t)sp_i * axis_size * inner + inner_i;
+            out_base = (size_t)ci * out_cstep + (size_t)sp_i * _k * inner + inner_i;
+        }
+        else
+        {
+            in_base = (size_t)outer_i * axis_size * inner + inner_i;
+            out_base = (size_t)outer_i * _k * inner + inner_i;
+        }
 
-            for (int j = 0; j < axis_size; j++)
-            {
-                vec[j].first = ptr[in_base + j * inner];
-                vec[j].second = j;
-            }
+        for (int j = 0; j < axis_size; j++)
+        {
+            vec[j].first = ptr[in_base + j * in_axis_stride];
+            vec[j].second = j;
+        }
 
-            if (_k < axis_size)
+        if (_k < axis_size)
+        {
+#if NCNN_SIMPLESTL
+            std::partial_sort(vec.begin(), vec.begin() + _k, vec.end(), comp);
+#else
+            if (sorted_flag)
             {
-                if (sorted_flag)
-                {
-                    std::nth_element(vec.begin(), vec.begin() + _k, vec.end(), comp);
-                    std::sort(vec.begin(), vec.begin() + _k, comp);
-                }
-                else
-                    std::nth_element(vec.begin(), vec.begin() + _k, vec.end(), comp);
+                std::nth_element(vec.begin(), vec.begin() + _k, vec.end(), comp);
+                std::sort(vec.begin(), vec.begin() + _k, comp);
             }
             else
-            {
-                if (sorted_flag)
-                    std::sort(vec.begin(), vec.end(), comp);
-            }
+                std::nth_element(vec.begin(), vec.begin() + _k, vec.end(), comp);
+#endif
+        }
+        else
+        {
+            if (sorted_flag)
+#if NCNN_SIMPLESTL
+                std::partial_sort(vec.begin(), vec.end(), vec.end(), comp);
+#else
+                std::sort(vec.begin(), vec.end(), comp);
+#endif
+        }
 
-            if (output_indices)
+        if (output_indices)
+        {
+            for (int j = 0; j < _k; j++)
             {
-                for (int j = 0; j < _k; j++)
-                {
-                    outptr[out_base + j * inner] = vec[j].first;
-                    outidxptr[out_base + j * inner] = (float)vec[j].second;
-                }
+                outptr[out_base + j * out_axis_stride] = vec[j].first;
+                outidxptr[out_base + j * out_axis_stride] = (float)vec[j].second;
             }
-            else
+        }
+        else
+        {
+            for (int j = 0; j < _k; j++)
             {
-                for (int j = 0; j < _k; j++)
-                {
-                    outptr[out_base + j * inner] = vec[j].first;
-                }
+                outptr[out_base + j * out_axis_stride] = vec[j].first;
             }
         }
     }
diff --git a/src/layer/topk.h b/src/layer/topk.h
index ff8f410926d8..947dc21343ff 100644
--- a/src/layer/topk.h
+++ b/src/layer/topk.h
@@ -1,4 +1,4 @@
-// Copyright 2026 Tencent
+// Copyright 2025 Tencent
 // SPDX-License-Identifier: BSD-3-Clause
 
 #ifndef LAYER_TOPK_H
diff --git a/tests/test_topk.cpp b/tests/test_topk.cpp
index 8568041b5c34..ac3375058e3f 100644
--- a/tests/test_topk.cpp
+++ b/tests/test_topk.cpp
@@ -1,9 +1,17 @@
-// Copyright 2026 Tencent
+// Copyright 2025 Tencent
 // SPDX-License-Identifier: BSD-3-Clause
 
 #include "testutil.h"
 
-#include <limits>
+#if NCNN_SIMPLESTL
+// simplemath.h conflicts with system math.h; define only what we need
+static const float TEST_INF = 1.f / 0.f;
+static const float TEST_NAN = 0.f / 0.f;
+#define INFINITY TEST_INF
+#define NAN      TEST_NAN
+#else
+#include <math.h>
+#endif
 
 static int test_topk_cpu_forward(const ncnn::Mat& a, int axis, int k, int largest, int sorted, ncnn::Mat& values, ncnn::Mat& indices)
 {
@@ -121,7 +129,7 @@ static int test_topk_0()
     return 0
            || test_topk(a, 0, 1, 1, 1)
            || test_topk(a, 0, 5, 1, 1)
-            || test_topk(a, 0, 1, 0, 0)
+           || test_topk(a, 0, 1, 0, 0)
            || test_topk(a, -1, 7, 0, 1)
            || test_topk(a, 0, 4, 1, 0)
            || test_topk(a, 0, 9, 1, 1);
@@ -175,9 +183,9 @@ static int test_topk_inf_order()
     ncnn::Mat a(6);
     float* ptr = a;
     ptr[0] = 1.f;
-    ptr[1] = std::numeric_limits<float>::infinity();
+    ptr[1] = INFINITY;
     ptr[2] = -2.f;
-    ptr[3] = -std::numeric_limits<float>::infinity();
+    ptr[3] = -INFINITY;
     ptr[4] = 0.5f;
     ptr[5] = 3.f;
 
@@ -193,7 +201,7 @@ static int test_topk_inf_order()
 
     const float* vptr = values;
     const float* iptr = indices;
-    if (values.w != 2 || indices.w != 2 || vptr[0] != std::numeric_limits<float>::infinity() || vptr[1] != 3.f || (int)iptr[0] != 1 || (int)iptr[1] != 5)
+    if (values.w != 2 || indices.w != 2 || vptr[0] != INFINITY || vptr[1] != 3.f || (int)iptr[0] != 1 || (int)iptr[1] != 5)
     {
         fprintf(stderr, "test_topk_inf_order largest result mismatch\n");
         return -1;
@@ -208,7 +216,7 @@ static int test_topk_inf_order()
 
     vptr = values;
     iptr = indices;
-    if (values.w != 2 || indices.w != 2 || vptr[0] != -std::numeric_limits<float>::infinity() || vptr[1] != -2.f || (int)iptr[0] != 3 || (int)iptr[1] != 2)
+    if (values.w != 2 || indices.w != 2 || vptr[0] != -INFINITY || vptr[1] != -2.f || (int)iptr[0] != 3 || (int)iptr[1] != 2)
     {
         fprintf(stderr, "test_topk_inf_order smallest result mismatch\n");
         return -1;
@@ -222,7 +230,7 @@ static int test_topk_nan_robust()
     ncnn::Mat a(4);
     float* ptr = a;
     ptr[0] = 1.f;
-    ptr[1] = std::numeric_limits<float>::quiet_NaN();
+    ptr[1] = NAN;
     ptr[2] = 2.f;
     ptr[3] = -1.f;
 
diff --git a/tools/pnnx/src/ir.cpp b/tools/pnnx/src/ir.cpp
index 63f9c70e21f4..456f51993b15 100644
--- a/tools/pnnx/src/ir.cpp
+++ b/tools/pnnx/src/ir.cpp
@@ -1640,12 +1640,12 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath, con
                 continue;
 
             fprintf(pyfp, "        self.%s = TopK(", sanitize_identifier(op->name).c_str());
-            
+
             int i = 0;
             for (const auto& it : op->params)
             {
                 fprintf(pyfp, "%s=", it.first.c_str());
-                
+
                 const Parameter& param = it.second;
                 if (param.type == 2)
                 {
@@ -1655,12 +1655,12 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath, con
                 {
                     fprintf(pyfp, "%d", param.b ? 1 : 0);
                 }
-                
+
                 if (i + 1 != op->params.size())
                     fprintf(pyfp, ", ");
                 i++;
             }
-            
+
             fprintf(pyfp, ")\n");
         }
     }
diff --git a/tools/pnnx/src/pass_ncnn/TopK.cpp b/tools/pnnx/src/pass_ncnn/TopK.cpp
index ed226605ad8c..13549437d271 100644
--- a/tools/pnnx/src/pass_ncnn/TopK.cpp
+++ b/tools/pnnx/src/pass_ncnn/TopK.cpp
@@ -1,4 +1,4 @@
-// Copyright 2026 Tencent
+// Copyright 2025 Tencent
 // SPDX-License-Identifier: BSD-3-Clause
 
 #include "pass_ncnn.h"

From 2ea44ddc98562ef45e94a40df391d1aedaf376e5 Mon Sep 17 00:00:00 2001
From: vlordier <vlordier@users.noreply.github.com>
Date: Fri, 10 Apr 2026 12:28:46 +0200
Subject: [PATCH 22/29] apply code-format

---
 src/layer/topk.cpp                          |  8 ++++----
 tools/pnnx/src/ir.cpp                       | 20 ++++++++++----------
 tools/pnnx/src/pass_onnx/fold_constants.cpp | 14 +++++++-------
 3 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/src/layer/topk.cpp b/src/layer/topk.cpp
index 3b78fbfce3fe..7e1a3c77ad78 100644
--- a/src/layer/topk.cpp
+++ b/src/layer/topk.cpp
@@ -207,7 +207,7 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
 
     if (_k == 1)
     {
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int line = 0; line < total_lines; line++)
         {
             int outer_i = line / inner;
@@ -326,7 +326,7 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
 
     if (_k == axis_size && !sorted_flag)
     {
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int line = 0; line < total_lines; line++)
         {
             int outer_i = line / inner;
@@ -372,7 +372,7 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
 
     if (_k <= 4)
     {
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int line = 0; line < total_lines; line++)
         {
             int outer_i = line / inner;
@@ -485,7 +485,7 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
         return 0;
     }
 
-    #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
     for (int line = 0; line < total_lines; line++)
     {
         std::vector<std::pair<float, int> > vec(axis_size);
diff --git a/tools/pnnx/src/ir.cpp b/tools/pnnx/src/ir.cpp
index 456f51993b15..1d88ba384bfb 100644
--- a/tools/pnnx/src/ir.cpp
+++ b/tools/pnnx/src/ir.cpp
@@ -1576,10 +1576,10 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath, con
                     for (size_t i = 0; i < param.ai.size(); i++)
                     {
                         if ((op->type == "nn.AdaptiveAvgPool2d"
-                                || op->type == "nn.AdaptiveAvgPool3d"
-                                || op->type == "nn.AdaptiveMaxPool2d"
-                                || op->type == "nn.AdaptiveMaxPool3d")
-                                && it.first == "output_size" && param.ai[i] == 0)
+                             || op->type == "nn.AdaptiveAvgPool3d"
+                             || op->type == "nn.AdaptiveMaxPool2d"
+                             || op->type == "nn.AdaptiveMaxPool3d")
+                            && it.first == "output_size" && param.ai[i] == 0)
                         {
                             fprintf(pyfp, "None");
                         }
@@ -2390,8 +2390,8 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath, con
 
                     bool scalar_as_tensor = false;
                     if ((op->type == "Tensor.index_put" && it.first == "values")
-                            || (op->type == "torch.where" && it.first == "input")
-                            || (op->type == "torch.where" && it.first == "other"))
+                        || (op->type == "torch.where" && it.first == "input")
+                        || (op->type == "torch.where" && it.first == "other"))
                     {
                         scalar_as_tensor = true;
                     }
@@ -2478,10 +2478,10 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath, con
                         for (size_t i = 0; i < param.ai.size(); i++)
                         {
                             if ((op->type == "F.adaptive_avg_pool2d"
-                                    || op->type == "F.adaptive_avg_pool3d"
-                                    || op->type == "F.adaptive_max_pool2d"
-                                    || op->type == "F.adaptive_max_pool3d")
-                                    && it.first == "output_size" && param.ai[i] == 0)
+                                 || op->type == "F.adaptive_avg_pool3d"
+                                 || op->type == "F.adaptive_max_pool2d"
+                                 || op->type == "F.adaptive_max_pool3d")
+                                && it.first == "output_size" && param.ai[i] == 0)
                             {
                                 fprintf(pyfp, "None");
                             }
diff --git a/tools/pnnx/src/pass_onnx/fold_constants.cpp b/tools/pnnx/src/pass_onnx/fold_constants.cpp
index c79cb29f34a1..6c843188d1b0 100644
--- a/tools/pnnx/src/pass_onnx/fold_constants.cpp
+++ b/tools/pnnx/src/pass_onnx/fold_constants.cpp
@@ -198,13 +198,13 @@ void fold_constants(onnx::ModelProto& model, const std::string& external_data_pa
             // aten::size
             // aten::_shape_as_tensor
             if (op_type == "aten_new_empty"
-                    || op_type == "aten_new_full"
-                    || op_type == "aten_new_ones"
-                    || op_type == "aten_new_zeros"
-                    || op_type == "aten_empty_like"
-                    || op_type == "aten_full_like"
-                    || op_type == "aten_ones_like"
-                    || op_type == "aten_zeros_like")
+                || op_type == "aten_new_full"
+                || op_type == "aten_new_ones"
+                || op_type == "aten_new_zeros"
+                || op_type == "aten_empty_like"
+                || op_type == "aten_full_like"
+                || op_type == "aten_ones_like"
+                || op_type == "aten_zeros_like")
             {
                 is_outputs_foldable = ignore_aten_size;
             }

From 5674b1ceee432a91a5dd8fcaa79d35c02ffb3502 Mon Sep 17 00:00:00 2001
From: vlordier <5443125+vlordier@users.noreply.github.com>
Date: Fri, 10 Apr 2026 10:31:02 +0000
Subject: [PATCH 23/29] apply code-format changes

---
 src/layer/topk.cpp                          |  8 ++++----
 tools/pnnx/src/ir.cpp                       | 20 ++++++++++----------
 tools/pnnx/src/pass_onnx/fold_constants.cpp | 14 +++++++-------
 3 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/src/layer/topk.cpp b/src/layer/topk.cpp
index 7e1a3c77ad78..3b78fbfce3fe 100644
--- a/src/layer/topk.cpp
+++ b/src/layer/topk.cpp
@@ -207,7 +207,7 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
 
     if (_k == 1)
     {
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int line = 0; line < total_lines; line++)
         {
             int outer_i = line / inner;
@@ -326,7 +326,7 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
 
     if (_k == axis_size && !sorted_flag)
     {
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int line = 0; line < total_lines; line++)
         {
             int outer_i = line / inner;
@@ -372,7 +372,7 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
 
     if (_k <= 4)
     {
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int line = 0; line < total_lines; line++)
         {
             int outer_i = line / inner;
@@ -485,7 +485,7 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
         return 0;
     }
 
-#pragma omp parallel for num_threads(opt.num_threads)
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int line = 0; line < total_lines; line++)
     {
         std::vector<std::pair<float, int> > vec(axis_size);
diff --git a/tools/pnnx/src/ir.cpp b/tools/pnnx/src/ir.cpp
index 1d88ba384bfb..456f51993b15 100644
--- a/tools/pnnx/src/ir.cpp
+++ b/tools/pnnx/src/ir.cpp
@@ -1576,10 +1576,10 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath, con
                     for (size_t i = 0; i < param.ai.size(); i++)
                     {
                         if ((op->type == "nn.AdaptiveAvgPool2d"
-                             || op->type == "nn.AdaptiveAvgPool3d"
-                             || op->type == "nn.AdaptiveMaxPool2d"
-                             || op->type == "nn.AdaptiveMaxPool3d")
-                            && it.first == "output_size" && param.ai[i] == 0)
+                                || op->type == "nn.AdaptiveAvgPool3d"
+                                || op->type == "nn.AdaptiveMaxPool2d"
+                                || op->type == "nn.AdaptiveMaxPool3d")
+                                && it.first == "output_size" && param.ai[i] == 0)
                         {
                             fprintf(pyfp, "None");
                         }
@@ -2390,8 +2390,8 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath, con
 
                     bool scalar_as_tensor = false;
                     if ((op->type == "Tensor.index_put" && it.first == "values")
-                        || (op->type == "torch.where" && it.first == "input")
-                        || (op->type == "torch.where" && it.first == "other"))
+                            || (op->type == "torch.where" && it.first == "input")
+                            || (op->type == "torch.where" && it.first == "other"))
                     {
                         scalar_as_tensor = true;
                     }
@@ -2478,10 +2478,10 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath, con
                         for (size_t i = 0; i < param.ai.size(); i++)
                         {
                             if ((op->type == "F.adaptive_avg_pool2d"
-                                 || op->type == "F.adaptive_avg_pool3d"
-                                 || op->type == "F.adaptive_max_pool2d"
-                                 || op->type == "F.adaptive_max_pool3d")
-                                && it.first == "output_size" && param.ai[i] == 0)
+                                    || op->type == "F.adaptive_avg_pool3d"
+                                    || op->type == "F.adaptive_max_pool2d"
+                                    || op->type == "F.adaptive_max_pool3d")
+                                    && it.first == "output_size" && param.ai[i] == 0)
                             {
                                 fprintf(pyfp, "None");
                             }
diff --git a/tools/pnnx/src/pass_onnx/fold_constants.cpp b/tools/pnnx/src/pass_onnx/fold_constants.cpp
index 6c843188d1b0..c79cb29f34a1 100644
--- a/tools/pnnx/src/pass_onnx/fold_constants.cpp
+++ b/tools/pnnx/src/pass_onnx/fold_constants.cpp
@@ -198,13 +198,13 @@ void fold_constants(onnx::ModelProto& model, const std::string& external_data_pa
             // aten::size
             // aten::_shape_as_tensor
             if (op_type == "aten_new_empty"
-                || op_type == "aten_new_full"
-                || op_type == "aten_new_ones"
-                || op_type == "aten_new_zeros"
-                || op_type == "aten_empty_like"
-                || op_type == "aten_full_like"
-                || op_type == "aten_ones_like"
-                || op_type == "aten_zeros_like")
+                    || op_type == "aten_new_full"
+                    || op_type == "aten_new_ones"
+                    || op_type == "aten_new_zeros"
+                    || op_type == "aten_empty_like"
+                    || op_type == "aten_full_like"
+                    || op_type == "aten_ones_like"
+                    || op_type == "aten_zeros_like")
             {
                 is_outputs_foldable = ignore_aten_size;
             }

From caa9de366c86c43fad02392a69961d3cf26c8fb7 Mon Sep 17 00:00:00 2001
From: vlordier <vlordier@users.noreply.github.com>
Date: Fri, 10 Apr 2026 12:31:39 +0200
Subject: [PATCH 24/29] ci: add topk test coverage and pnnx onnx test

---
 .github/workflows/topk-linux-test.yml | 111 ++++++++++++++++++++++++++
 1 file changed, 111 insertions(+)
 create mode 100644 .github/workflows/topk-linux-test.yml

diff --git a/.github/workflows/topk-linux-test.yml b/.github/workflows/topk-linux-test.yml
new file mode 100644
index 000000000000..5a25a7320d30
--- /dev/null
+++ b/.github/workflows/topk-linux-test.yml
@@ -0,0 +1,111 @@
+name: topk-linux-test  # builds and runs the TopK tests on ubuntu runners
+on:
+  push:
+    branches:
+    - topk-ci-tests  # only pushes to this branch trigger the workflow
+
+jobs:
+  x64-none:  # test_topk with SSE2, AVX and OpenMP switched off
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - name: build  # Debug configuration; only the test_topk target is compiled
+      run: |
+        mkdir build && cd build
+        cmake -DCMAKE_BUILD_TYPE=Debug -DNCNN_RUNTIME_CPU=OFF \
+            -DNCNN_SSE2=OFF -DNCNN_AVX=OFF \
+            -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
+        cmake --build . --target test_topk -j$(nproc)
+    - name: test  # runs the freshly built test binary
+      run: cd build && ./tests/test_topk
+
+  x64-sse2:  # test_topk with SSE2 on, AVX and OpenMP off
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - name: build  # Debug configuration; only the test_topk target is compiled
+      run: |
+        mkdir build && cd build
+        cmake -DCMAKE_BUILD_TYPE=Debug -DNCNN_RUNTIME_CPU=OFF \
+            -DNCNN_SSE2=ON -DNCNN_AVX=OFF \
+            -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
+        cmake --build . --target test_topk -j$(nproc)
+    - name: test  # runs the freshly built test binary
+      run: cd build && ./tests/test_topk
+
+  x64-avx2:  # test_topk with SSE2/AVX/F16C/FMA/AVX2 on; AVX512, XOP, AVXVNNI and OpenMP off
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - name: build  # Debug configuration; only the test_topk target is compiled
+      run: |
+        mkdir build && cd build
+        cmake -DCMAKE_BUILD_TYPE=Debug -DNCNN_RUNTIME_CPU=OFF \
+            -DNCNN_SSE2=ON -DNCNN_AVX=ON -DNCNN_F16C=ON -DNCNN_FMA=ON -DNCNN_AVX2=ON \
+            -DNCNN_AVX512=OFF -DNCNN_XOP=OFF -DNCNN_AVXVNNI=OFF \
+            -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
+        cmake --build . --target test_topk -j$(nproc)
+    - name: test  # runs the freshly built test binary
+      run: cd build && ./tests/test_topk
+
+  simplestl-simplemath:  # test_topk with NCNN_SIMPLESTL and NCNN_SIMPLEMATH enabled, no OpenMP or threads
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - name: build  # uses the host-c.gcc toolchain file; only test_topk is compiled
+      run: |
+        mkdir build && cd build
+        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake \
+            -DCMAKE_BUILD_TYPE=Debug \
+            -DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEMATH=ON \
+            -DNCNN_OPENMP=OFF -DNCNN_THREADS=OFF \
+            -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
+        cmake --build . --target test_topk -j$(nproc)
+    - name: test  # runs the freshly built test binary
+      run: cd build && ./tests/test_topk
+
+  linux-x86-gcc:  # test_topk built twice with the host.gcc-m32 toolchain file
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - name: install  # multilib packages installed before the m32 toolchain build
+      run: sudo apt-get update && sudo apt-get install -y gcc-multilib g++-multilib
+    - name: build  # first pass: toolchain defaults
+      run: |
+        mkdir build && cd build
+        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake \
+            -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
+        cmake --build . --target test_topk -j$(nproc)
+    - name: test
+      run: cd build && ./tests/test_topk
+    - name: build-nosse  # second pass in a separate directory with SSE2 and AVX disabled
+      run: |
+        mkdir build-nosse && cd build-nosse
+        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake \
+            -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX=OFF \
+            -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
+        cmake --build . --target test_topk -j$(nproc)
+    - name: test-nosse
+      run: cd build-nosse && ./tests/test_topk
+
+  pnnx-onnx-topk:  # builds pnnx and invokes it on tests/onnx/test_torch_topk.py
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - uses: actions/setup-python@v5
+      with:
+        python-version: '3.12'
+    - name: setup-pytorch  # CPU-only torch wheel plus numpy, packaging, onnx and onnxruntime
+      run: |
+        pip3 install torch --index-url https://download.pytorch.org/whl/cpu
+        pip3 install numpy packaging onnx onnxruntime
+    - name: build-pnnx  # Release build inside tools/pnnx/build
+      run: |
+        cd tools/pnnx
+        mkdir build && cd build
+        cmake -DCMAKE_BUILD_TYPE=Release ..
+        cmake --build . --config Release -j$(nproc)
+    - name: test-topk
+      run: |
+        cd tools/pnnx
+        build/src/pnnx tests/onnx/test_torch_topk.py

From 4e39cb6ae25eeb061e79a56bc43f60941586d21f Mon Sep 17 00:00:00 2001
From: vlordier <vlordier@users.noreply.github.com>
Date: Fri, 10 Apr 2026 12:52:52 +0200
Subject: [PATCH 25/29] =?UTF-8?q?ci:=20fix=20pnnx=20test=20invocation=20?=
 =?UTF-8?q?=E2=80=94=20use=20ctest?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .github/workflows/topk-linux-test.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/topk-linux-test.yml b/.github/workflows/topk-linux-test.yml
index 5a25a7320d30..c4ef3861d6db 100644
--- a/.github/workflows/topk-linux-test.yml
+++ b/.github/workflows/topk-linux-test.yml
@@ -107,5 +107,5 @@ jobs:
         cmake --build . --config Release -j$(nproc)
     - name: test-topk
       run: |
-        cd tools/pnnx
-        build/src/pnnx tests/onnx/test_torch_topk.py
+        cd tools/pnnx/build
+        ctest --output-on-failure -R test_onnx_torch_topk

From ca55f8a9b1ef4f13736d3a0d18f8c95eca1977bc Mon Sep 17 00:00:00 2001
From: vlordier <5443125+vlordier@users.noreply.github.com>
Date: Fri, 10 Apr 2026 11:28:31 +0000
Subject: [PATCH 26/29] apply code-format changes

---
 src/layer/topk.cpp                          |  8 ++++----
 tools/pnnx/src/ir.cpp                       | 20 ++++++++++----------
 tools/pnnx/src/pass_onnx/fold_constants.cpp | 14 +++++++-------
 3 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/src/layer/topk.cpp b/src/layer/topk.cpp
index 7e1a3c77ad78..3b78fbfce3fe 100644
--- a/src/layer/topk.cpp
+++ b/src/layer/topk.cpp
@@ -207,7 +207,7 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
 
     if (_k == 1)
     {
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int line = 0; line < total_lines; line++)
         {
             int outer_i = line / inner;
@@ -326,7 +326,7 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
 
     if (_k == axis_size && !sorted_flag)
     {
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int line = 0; line < total_lines; line++)
         {
             int outer_i = line / inner;
@@ -372,7 +372,7 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
 
     if (_k <= 4)
     {
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int line = 0; line < total_lines; line++)
         {
             int outer_i = line / inner;
@@ -485,7 +485,7 @@ int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_bl
         return 0;
     }
 
-#pragma omp parallel for num_threads(opt.num_threads)
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int line = 0; line < total_lines; line++)
     {
         std::vector<std::pair<float, int> > vec(axis_size);
diff --git a/tools/pnnx/src/ir.cpp b/tools/pnnx/src/ir.cpp
index 1d88ba384bfb..456f51993b15 100644
--- a/tools/pnnx/src/ir.cpp
+++ b/tools/pnnx/src/ir.cpp
@@ -1576,10 +1576,10 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath, con
                     for (size_t i = 0; i < param.ai.size(); i++)
                     {
                         if ((op->type == "nn.AdaptiveAvgPool2d"
-                             || op->type == "nn.AdaptiveAvgPool3d"
-                             || op->type == "nn.AdaptiveMaxPool2d"
-                             || op->type == "nn.AdaptiveMaxPool3d")
-                            && it.first == "output_size" && param.ai[i] == 0)
+                                || op->type == "nn.AdaptiveAvgPool3d"
+                                || op->type == "nn.AdaptiveMaxPool2d"
+                                || op->type == "nn.AdaptiveMaxPool3d")
+                                && it.first == "output_size" && param.ai[i] == 0)
                         {
                             fprintf(pyfp, "None");
                         }
@@ -2390,8 +2390,8 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath, con
 
                     bool scalar_as_tensor = false;
                     if ((op->type == "Tensor.index_put" && it.first == "values")
-                        || (op->type == "torch.where" && it.first == "input")
-                        || (op->type == "torch.where" && it.first == "other"))
+                            || (op->type == "torch.where" && it.first == "input")
+                            || (op->type == "torch.where" && it.first == "other"))
                     {
                         scalar_as_tensor = true;
                     }
@@ -2478,10 +2478,10 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath, con
                         for (size_t i = 0; i < param.ai.size(); i++)
                         {
                             if ((op->type == "F.adaptive_avg_pool2d"
-                                 || op->type == "F.adaptive_avg_pool3d"
-                                 || op->type == "F.adaptive_max_pool2d"
-                                 || op->type == "F.adaptive_max_pool3d")
-                                && it.first == "output_size" && param.ai[i] == 0)
+                                    || op->type == "F.adaptive_avg_pool3d"
+                                    || op->type == "F.adaptive_max_pool2d"
+                                    || op->type == "F.adaptive_max_pool3d")
+                                    && it.first == "output_size" && param.ai[i] == 0)
                             {
                                 fprintf(pyfp, "None");
                             }
diff --git a/tools/pnnx/src/pass_onnx/fold_constants.cpp b/tools/pnnx/src/pass_onnx/fold_constants.cpp
index 6c843188d1b0..c79cb29f34a1 100644
--- a/tools/pnnx/src/pass_onnx/fold_constants.cpp
+++ b/tools/pnnx/src/pass_onnx/fold_constants.cpp
@@ -198,13 +198,13 @@ void fold_constants(onnx::ModelProto& model, const std::string& external_data_pa
             // aten::size
             // aten::_shape_as_tensor
             if (op_type == "aten_new_empty"
-                || op_type == "aten_new_full"
-                || op_type == "aten_new_ones"
-                || op_type == "aten_new_zeros"
-                || op_type == "aten_empty_like"
-                || op_type == "aten_full_like"
-                || op_type == "aten_ones_like"
-                || op_type == "aten_zeros_like")
+                    || op_type == "aten_new_full"
+                    || op_type == "aten_new_ones"
+                    || op_type == "aten_new_zeros"
+                    || op_type == "aten_empty_like"
+                    || op_type == "aten_full_like"
+                    || op_type == "aten_ones_like"
+                    || op_type == "aten_zeros_like")
             {
                 is_outputs_foldable = ignore_aten_size;
             }

From d8fd80c1580d29667e2d5ab46de88a63ad632e8f Mon Sep 17 00:00:00 2001
From: vlordier <vlordier@users.noreply.github.com>
Date: Sat, 11 Apr 2026 00:05:50 +0200
Subject: [PATCH 27/29] feat: add TopK + Gather ncnn support for YOLOv10

- pass_level2/torch_topk.cpp: capture k/dim/largest/sorted as parameters
  (prim::Constant) instead of tensor inputs, enabling ncnn pass matching
- pass_level2/torch_gather.cpp: restore original pattern (dim as tensor)
- pass_ncnn/TopK.cpp: match torch.topk with captured parameters and
  convert to ncnn TopK layer (axis, largest, sorted)
- pass_ncnn/torch_gather.cpp (NEW): match torch.gather with 2 inputs
  (input, index) and captured dim parameter, convert to ncnn Gather layer
- src/layer/gather.{h,cpp} (NEW): implement Gather ncnn operator
  supporting 1D/2D/3D tensors with arbitrary axis
- PNNX CMakeLists fixes:
  - per-target Torch include dirs to avoid protobuf header conflicts
  - Abseil linking for Homebrew protobuf 34.x
  - disable onnxruntime auto-detection (protobuf conflict)
  - directory-level INCLUDE_DIRECTORIES_BEFORE for protobuf headers

Verified: YOLOv10n converts with 2 TopK + 2 Gather layers, only
cosmetic ops (Tensor.to, pnnx.Expression) ignored.

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
---
 src/CMakeLists.txt                        |   1 +
 src/layer/gather.cpp                      | 111 ++++++++++++++++++++++
 src/layer/gather.h                        |  27 ++++++
 tools/pnnx/CMakeLists.txt                 |  31 +++---
 tools/pnnx/src/CMakeLists.txt             |  21 ++++
 tools/pnnx/src/pass_level2/torch_topk.cpp |  12 +--
 tools/pnnx/src/pass_ncnn/TopK.cpp         |  75 ++++++++++++---
 tools/pnnx/src/pass_ncnn/torch_gather.cpp |  54 +++++++++++
 8 files changed, 301 insertions(+), 31 deletions(-)
 create mode 100644 src/layer/gather.cpp
 create mode 100644 src/layer/gather.h
 create mode 100644 tools/pnnx/src/pass_ncnn/torch_gather.cpp

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index c79d779cf220..3f518f11117b 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -102,6 +102,7 @@ ncnn_add_layer(TanH)
 ncnn_add_layer(Threshold)
 ncnn_add_layer(Tile)
 ncnn_add_layer(TopK)
+ncnn_add_layer(Gather)
 ncnn_add_layer(RNN)
 ncnn_add_layer(LSTM)
 ncnn_add_layer(BinaryOp)
diff --git a/src/layer/gather.cpp b/src/layer/gather.cpp
new file mode 100644
index 000000000000..738cd85f9f41
--- /dev/null
+++ b/src/layer/gather.cpp
@@ -0,0 +1,111 @@
+// Copyright 2025 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "gather.h"
+
+namespace ncnn {
+
+Gather::Gather()
+{
+    one_blob_only = false; // forward() reads two bottom blobs: data and indices
+    support_inplace = false; // forward() writes its result into a separate top blob
+}
+
+int Gather::load_param(const ParamDict& pd)
+{
+    axis = pd.get(0, 0); // param id 0: gather axis, 0 is passed as the default
+
+    return 0;
+}
+
+int Gather::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    // Gather elements of bottom_blobs[0] (data) along `axis` using the integer
+    // indices stored in bottom_blobs[1]; top_blobs[0] receives one element per
+    // index entry and has the shape of the index blob.
+    // Returns 0 on success, -1 on invalid arguments and -100 when the output
+    // blob cannot be allocated.
+    if (bottom_blobs.size() < 2 || top_blobs.empty())
+        return -1;
+
+    const Mat& input_blob = bottom_blobs[0];
+    const Mat& index_blob = bottom_blobs[1];
+    const int dims = input_blob.dims;
+
+    // Only 1D/2D/3D blobs can be addressed below; reject anything else rather
+    // than silently producing garbage.
+    if (dims < 1 || dims > 3 || index_blob.dims < 1 || index_blob.dims > 3)
+        return -1;
+
+    // Negative axes count from the last dimension
+    const int positive_axis = axis < 0 ? axis + dims : axis;
+    if (positive_axis < 0 || positive_axis >= dims)
+        return -1;
+
+    // NOTE(review): axis 0 selects w (the innermost dimension) here; confirm
+    // that this is the convention the model converter emits.
+    const int in_shape[3] = {input_blob.w, input_blob.h, input_blob.c};
+    const int idx_shape[3] = {index_blob.w, index_blob.h, index_blob.c};
+    const int axis_dim_size = in_shape[positive_axis];
+
+    // Coordinates on the non-gathered axes are copied from the output position,
+    // so the index blob must not exceed the input extent on those axes.
+    for (int d = 0; d < 3; d++)
+    {
+        if (d != positive_axis && idx_shape[d] > in_shape[d])
+            return -1;
+    }
+
+    // The output has the same rank and extents as the index blob and the same
+    // element size as the input.
+    Mat& top_blob = top_blobs[0];
+    if (index_blob.dims == 1)
+        top_blob.create(index_blob.w, input_blob.elemsize, input_blob.elempack, opt.blob_allocator);
+    else if (index_blob.dims == 2)
+        top_blob.create(index_blob.w, index_blob.h, input_blob.elemsize, input_blob.elempack, opt.blob_allocator);
+    else
+        top_blob.create(index_blob.w, index_blob.h, index_blob.c, input_blob.elemsize, input_blob.elempack, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
+
+    // Values are read as 32-bit floats and indices as 32-bit ints.
+    // NOTE(review): other element types are not handled -- confirm callers.
+    const float* inp = input_blob;
+    const int* idx = (const int*)index_blob;
+    float* out = top_blob;
+
+    // Address every blob as x + y * w + z * cstep so that any per-channel
+    // padding of 3D blobs is skipped instead of being treated as data.
+    for (int z = 0; z < top_blob.c; z++)
+    {
+        for (int y = 0; y < top_blob.h; y++)
+        {
+            for (int x = 0; x < top_blob.w; x++)
+            {
+                // Flat offsets of this position in the index and output blobs
+                const size_t idx_offset = (size_t)z * index_blob.cstep + (size_t)y * index_blob.w + x;
+                const size_t out_offset = (size_t)z * top_blob.cstep + (size_t)y * top_blob.w + x;
+
+                // Negative indices count from the end of the gathered axis
+                int gather_idx = idx[idx_offset];
+                if (gather_idx < 0) gather_idx += axis_dim_size;
+
+                // Clamp to input bounds
+                if (gather_idx >= axis_dim_size) gather_idx = axis_dim_size - 1;
+                if (gather_idx < 0) gather_idx = 0;
+
+                // Input coordinate: same as the output, axis coord replaced
+                int coord_in[3] = {x, y, z};
+                coord_in[positive_axis] = gather_idx;
+
+                // Flat offset of the source element in the input blob
+                const size_t in_offset = (size_t)coord_in[2] * input_blob.cstep + (size_t)coord_in[1] * input_blob.w + coord_in[0];
+                out[out_offset] = inp[in_offset];
+            }
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/gather.h b/src/layer/gather.h
new file mode 100644
index 000000000000..f8d24d9afb54
--- /dev/null
+++ b/src/layer/gather.h
@@ -0,0 +1,27 @@
+// Copyright 2025 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef LAYER_GATHER_H
+#define LAYER_GATHER_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+class Gather : public Layer
+{
+public:
+    Gather();
+
+    virtual int load_param(const ParamDict& pd); // reads axis from param id 0
+
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const; // bottom_blobs[0]: data, bottom_blobs[1]: indices
+
+public:
+    // param id 0: axis to gather along (default 0); a negative value has the input rank added in forward()
+    int axis;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_GATHER_H
diff --git a/tools/pnnx/CMakeLists.txt b/tools/pnnx/CMakeLists.txt
index e50ab4788c3d..5b3250943cf8 100644
--- a/tools/pnnx/CMakeLists.txt
+++ b/tools/pnnx/CMakeLists.txt
@@ -83,7 +83,8 @@ else()
     message(WARNING "Building without TorchVision")
 endif()
 
-include_directories(SYSTEM ${TORCH_INCLUDE_DIRS})
+# Torch includes are added per-target in src/CMakeLists.txt to avoid
+# conflicts with system protobuf headers
 
 if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
     # test if libtorch and protobuf has the same cxxabi version
@@ -95,7 +96,10 @@ if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
 endif()
 
 if((PNNX_TORCH_USE_CXX11_ABI AND PNNX_COMPILER_USE_CXX11_ABI) OR (NOT PNNX_TORCH_USE_CXX11_ABI AND NOT PNNX_COMPILER_USE_CXX11_ABI))
-    find_package(protobuf CONFIG)
+    # Torch may have already registered protobuf targets — skip find_package if so
+    if(NOT TARGET protobuf::libprotobuf)
+        find_package(protobuf CONFIG)
+    endif()
 
     if(protobuf_FOUND)
         set(PROTOBUF_FOUND ${protobuf_FOUND})
@@ -109,20 +113,21 @@ if((PNNX_TORCH_USE_CXX11_ABI AND PNNX_COMPILER_USE_CXX11_ABI) OR (NOT PNNX_TORCH
             set_target_properties(protobuf::protoc PROPERTIES IMPORTED_LOCATION_RELEASE "${PROTOBUF_PROTOC_EXECUTABLE}")
         endif()
     endif()
-endif()
 
-# https://github.com/supertone-inc/onnxruntime-build
-set(onnxruntime_INSTALL_DIR "/home/nihui/osd/pnnx/install" CACHE STRING "")
-find_library(onnxruntime_LIB NAMES onnxruntime PATHS ${onnxruntime_INSTALL_DIR}/lib64 ${onnxruntime_INSTALL_DIR}/lib)
-if(onnxruntime_LIB)
-    set(onnxruntime_FOUND TRUE)
-    add_library(onnxruntime::onnxruntime STATIC IMPORTED)
-    set_target_properties(onnxruntime::onnxruntime PROPERTIES IMPORTED_LOCATION ${onnxruntime_LIB})
-    set_target_properties(onnxruntime::onnxruntime PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${onnxruntime_INSTALL_DIR}/include)
-else()
-    set(onnxruntime_FOUND FALSE)
+    # Homebrew protobuf 34.x depends on Abseil — we need to link it explicitly
+    # because macOS doesn't resolve transitive dylib deps with @rpath properly
+    find_package(PkgConfig QUIET)
+    if(PKG_CONFIG_FOUND)
+        pkg_check_modules(ABSL QUIET absl_log_internal_check_op absl_die_if_null absl_log_internal_conditions absl_log_internal_message absl_examine_stack absl_statusor absl_synchronization absl_time)
+        if(ABSL_FOUND)
+            set(ABSL_LIBRARIES ${ABSL_LINK_LIBRARIES})
+        endif()
+    endif()
 endif()
 
+# Disable onnxruntime auto-detection — we only need torch2pnnx for YOLOv10
+set(onnxruntime_FOUND FALSE)
+
 option(PNNX_TNN2PNNX "build tnn2pnnx" ON)
 
 add_subdirectory(src)
diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt
index c554a6873e81..15aa16b46376 100644
--- a/tools/pnnx/src/CMakeLists.txt
+++ b/tools/pnnx/src/CMakeLists.txt
@@ -603,6 +603,7 @@ set(pnnx_pass_ncnn_SRCS
     pass_ncnn/torch_diag.cpp
     pass_ncnn/torch_flatten.cpp
     pass_ncnn/torch_flip.cpp
+    pass_ncnn/torch_gather.cpp
     pass_ncnn/torch_istft.cpp
     pass_ncnn/torch_logsumexp.cpp
     pass_ncnn/torch_matmul.cpp
@@ -635,6 +636,15 @@ if(PROTOBUF_FOUND)
         add_library(onnxproto STATIC ${ONNX_PROTO_SRCS} ${ONNX_PROTO_HDRS})
         target_include_directories(onnxproto PUBLIC ${PROTOBUF_INCLUDE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
         target_link_libraries(onnxproto PUBLIC ${PROTOBUF_LIBRARIES})
+        if(ABSL_LIBRARIES)
+            target_link_libraries(onnxproto PUBLIC ${ABSL_LIBRARIES})
+        endif()
+        # Force system protobuf headers BEFORE any Torch-bundled old headers
+        # (Torch bundles an ancient protobuf that conflicts with system protobuf >= 22)
+        set_property(DIRECTORY APPEND PROPERTY INCLUDE_DIRECTORIES_BEFORE
+            ${PROTOBUF_INCLUDE_DIR}
+            ${CMAKE_CURRENT_BINARY_DIR}
+        )
     else()
         add_library(onnxproto STATIC onnx-data.proto onnx-ml.proto onnx-operators-ml.proto)
         target_include_directories(onnxproto PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
@@ -674,6 +684,7 @@ set(torch2pnnx_SRCS
 add_library(torch2pnnx OBJECT ${torch2pnnx_SRCS})
 target_compile_definitions(torch2pnnx PRIVATE BUILD_TORCH2PNNX)
 target_compile_options(torch2pnnx PUBLIC "${TORCH_CXX_FLAGS}")
+target_include_directories(torch2pnnx SYSTEM PRIVATE ${TORCH_INCLUDE_DIRS})
 
 if(WIN32)
     target_compile_definitions(torch2pnnx PUBLIC NOMINMAX)
@@ -687,6 +698,10 @@ if(PROTOBUF_FOUND)
     add_library(pnnx2onnx STATIC
         save_onnx.cpp
     )
+    # Ensure Homebrew protobuf headers are found BEFORE Torch's bundled old ones
+    if(Protobuf_FOUND OR protobuf_MODULE_COMPATIBLE)
+        target_include_directories(pnnx2onnx BEFORE PRIVATE ${PROTOBUF_INCLUDE_DIR})
+    endif()
     if(onnxruntime_FOUND)
         target_link_libraries(pnnx2onnx PRIVATE onnxruntime::onnxruntime)
     else()
@@ -779,12 +794,18 @@ set(pnnx_SRCS
 add_executable(pnnx ${pnnx_SRCS})
 
 set_property(SOURCE main.cpp APPEND PROPERTY COMPILE_DEFINITIONS BUILD_TORCH2PNNX)
+target_include_directories(pnnx SYSTEM PRIVATE ${TORCH_INCLUDE_DIRS})
 target_link_libraries(pnnx PRIVATE torch2pnnx)
 
 if(TorchVision_FOUND)
     target_link_libraries(pnnx PRIVATE ${TORCHVISION_LIBRARY})
 endif()
 
+# Link Abseil (needed for protobuf 34.x on macOS/Homebrew)
+if(ABSL_LIBRARIES)
+    target_link_libraries(pnnx PRIVATE ${ABSL_LIBRARIES})
+endif()
+
 if(WIN32)
     target_link_libraries(pnnx PRIVATE ${TORCH_LIBRARIES})
 else()
diff --git a/tools/pnnx/src/pass_level2/torch_topk.cpp b/tools/pnnx/src/pass_level2/torch_topk.cpp
index f3d7fae98ba4..339271f95fb7 100644
--- a/tools/pnnx/src/pass_level2/torch_topk.cpp
+++ b/tools/pnnx/src/pass_level2/torch_topk.cpp
@@ -11,13 +11,13 @@ class torch_topk : public GraphRewriterPass
     const char* match_pattern_graph() const
     {
         return R"PNNXIR(7767517
 7 7
 pnnx.Input              input_0     0 1 input
-pnnx.Input              input_1     0 1 k
-pnnx.Input              input_2     0 1 dim
-pnnx.Input              input_3     0 1 largest
-pnnx.Input              input_4     0 1 sorted
-aten::topk              op_0        5 2 input k dim largest sorted values indices
+prim::Constant          op_0        0 1 k value=%k
+prim::Constant          op_1        0 1 dim value=%dim
+prim::Constant          op_2        0 1 largest value=%largest
+prim::Constant          op_3        0 1 sorted value=%sorted
+aten::topk              op_4        5 2 input k dim largest sorted values indices
 pnnx.Output             output      2 0 values indices
 )PNNXIR";
     }
diff --git a/tools/pnnx/src/pass_ncnn/TopK.cpp b/tools/pnnx/src/pass_ncnn/TopK.cpp
index 13549437d271..2641493dd0fc 100644
--- a/tools/pnnx/src/pass_ncnn/TopK.cpp
+++ b/tools/pnnx/src/pass_ncnn/TopK.cpp
@@ -17,16 +17,15 @@ static int parameter_to_bool(const Parameter& p, int default_value)
     return default_value;
 }
 
-class TopK : public GraphRewriterPass
+class torch_topk : public GraphRewriterPass
 {
 public:
     const char* match_pattern_graph() const
     {
         return R"PNNXIR(7767517
-4 3
+3 3
 pnnx.Input              input_0     0 1 input
-pnnx.Input              input_1     0 1 k
-TopK                    op_0        2 2 input k values indices axis=%axis largest=%largest sorted=%sorted
+torch.topk              op_0        1 2 input values indices k=%k dim=%dim largest=%largest sorted=%sorted
 pnnx.Output             output      2 0 values indices
 )PNNXIR";
     }
@@ -44,8 +43,14 @@ pnnx.Output             output      2 0 values indices
     void write(Operator* op, const std::map<std::string, Parameter>& captured_params) const
     {
         int axis = -1;
-        if (captured_params.find("axis") != captured_params.end())
-            axis = captured_params.at("axis").i;
+        if (captured_params.find("dim") != captured_params.end())
+        {
+            const Parameter& dim_p = captured_params.at("dim");
+            if (dim_p.type == 2)
+                axis = dim_p.i;
+            else if (dim_p.type == 5 && !dim_p.ai.empty())
+                axis = dim_p.ai[0];
+        }
 
         int largest = 1;
         if (captured_params.find("largest") != captured_params.end())
@@ -73,24 +78,80 @@ pnnx.Output             output      2 0 values indices
     }
 };
 
-REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(TopK, 20)
+REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torch_topk, 20)
 
-class TopK_0 : public TopK
+// Lowers torch.topk with a single consumed output (values) to the ncnn TopK layer.
+class torch_topk_0 : public GraphRewriterPass
 {
 public:
     const char* match_pattern_graph() const
     {
         return R"PNNXIR(7767517
-4 2
+3 2
 pnnx.Input              input_0     0 1 input
-pnnx.Input              input_1     0 1 k
-TopK                    op_0        2 1 input k values axis=%axis largest=%largest sorted=%sorted
+torch.topk              op_0        1 1 input values k=%k dim=%dim largest=%largest sorted=%sorted
 pnnx.Output             output      1 0 values
 )PNNXIR";
     }
+
+    const char* type_str() const
+    {
+        return "TopK";
+    }
+
+    const char* name_str() const
+    {
+        return "topk";
+    }
+
+    // Fills the layer params: 0=axis with the batch dim removed, 1=largest, 2=sorted, 3=k.
+    void write(Operator* op, const std::map<std::string, Parameter>& captured_params) const
+    {
+        int axis = -1;
+        if (captured_params.find("dim") != captured_params.end())
+        {
+            const Parameter& dim_p = captured_params.at("dim");
+            if (dim_p.type == 2)
+                axis = dim_p.i;
+            else if (dim_p.type == 5 && !dim_p.ai.empty())
+                axis = dim_p.ai[0];
+        }
+
+        // k is matched as an attribute of torch.topk rather than as an input blob,
+        // so it has to be forwarded as a layer param; 1 is kept if it is not a plain int
+        int k = 1;
+        if (captured_params.find("k") != captured_params.end() && captured_params.at("k").type == 2)
+            k = captured_params.at("k").i;
+
+        int largest = 1;
+        if (captured_params.find("largest") != captured_params.end())
+            largest = parameter_to_bool(captured_params.at("largest"), 1);
+
+        int sorted = 1;
+        if (captured_params.find("sorted") != captured_params.end())
+            sorted = parameter_to_bool(captured_params.at("sorted"), 1);
+
+        const int batch_index = op->inputs[0]->params["__batch_index"].i;
+
+        if (axis == batch_index)
+        {
+            fprintf(stderr, "TopK along batch axis is not supported\n");
+            return;
+        }
+
+        // non-negative axes behind the batch dim shift down by one; negative axes are kept
+        int new_axis = axis;
+        if (axis >= 0)
+            new_axis = axis > batch_index ? axis - 1 : axis;
+
+        op->params["0"] = new_axis;
+        op->params["1"] = largest;
+        op->params["2"] = sorted;
+        op->params["3"] = k;
+    }
 };
 
-REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(TopK_0, 20)
+REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torch_topk_0, 20)
 
 } // namespace ncnn
 
diff --git a/tools/pnnx/src/pass_ncnn/torch_gather.cpp b/tools/pnnx/src/pass_ncnn/torch_gather.cpp
new file mode 100644
index 000000000000..13d1d69e0103
--- /dev/null
+++ b/tools/pnnx/src/pass_ncnn/torch_gather.cpp
@@ -0,0 +1,54 @@
+// Copyright 2025 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "pass_ncnn.h"
+
+namespace pnnx {
+
+namespace ncnn {
+
+class torch_gather : public GraphRewriterPass
+{
+public:
+    const char* match_pattern_graph() const
+    {
+        return R"PNNXIR(7767517
+4 3
+pnnx.Input              input_0     0 1 input
+pnnx.Input              input_1     0 1 index
+torch.gather            op_0        2 1 input index out dim=%dim
+pnnx.Output             output      1 0 out
+)PNNXIR";
+    }
+
+    const char* type_str() const
+    {
+        return "Gather";
+    }
+
+    const char* name_str() const
+    {
+        return "gather";
+    }
+
+    void write(Operator* op, const std::map<std::string, Parameter>& captured_params) const
+    {
+        // param 0 is the gather axis: taken from dim (.i for type 2, .ai[0] for type 5), else 0
+        // NOTE(review): dim is forwarded as-is, with no __batch_index handling - confirm intended
+        int gather_axis = 0;
+        const std::map<std::string, Parameter>::const_iterator dim_it = captured_params.find("dim");
+        if (dim_it != captured_params.end())
+        {
+            const Parameter& captured_dim = dim_it->second;
+            if (captured_dim.type == 2 || (captured_dim.type == 5 && !captured_dim.ai.empty()))
+                gather_axis = captured_dim.type == 2 ? captured_dim.i : captured_dim.ai[0];
+        }
+        op->params["0"] = gather_axis;
+    }
+};
+
+REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torch_gather, 20)
+
+} // namespace ncnn
+
+} // namespace pnnx

From d68852df6817c600862238c7e880b21c66d1e2c1 Mon Sep 17 00:00:00 2001
From: vlordier <5443125+vlordier@users.noreply.github.com>
Date: Sat, 11 Apr 2026 07:43:01 +0000
Subject: [PATCH 28/29] apply code-format changes

---
 src/layer/gather.cpp | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/src/layer/gather.cpp b/src/layer/gather.cpp
index 738cd85f9f41..850b65b3d121 100644
--- a/src/layer/gather.cpp
+++ b/src/layer/gather.cpp
@@ -38,8 +38,8 @@ int Gather::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_
     int shape[4] = {1, 1, 1, 1};
     shape[0] = input_blob.w;
     if (dims >= 2) shape[1] = input_blob.h;
-    if (dims == 3)    shape[2] = input_blob.c;
-    if (dims == 4)    shape[2] = input_blob.c; // w*h*c layout
+    if (dims == 3) shape[2] = input_blob.c;
+    if (dims == 4) shape[2] = input_blob.c; // w*h*c layout
 
     const int axis_dim_size = shape[positive_axis];
 
@@ -65,12 +65,17 @@ int Gather::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_
         // Decompose flat index i into coordinates based on top_blob shape
         int rem = i;
         int coord_out[4] = {0, 0, 0, 0};
-        if (top_blob.dims == 1) {
+        if (top_blob.dims == 1)
+        {
             coord_out[0] = rem;
-        } else if (top_blob.dims == 2) {
+        }
+        else if (top_blob.dims == 2)
+        {
             coord_out[0] = rem % top_blob.w;
             coord_out[1] = rem / top_blob.w;
-        } else if (top_blob.dims == 3) {
+        }
+        else if (top_blob.dims == 3)
+        {
             int hw = top_blob.w * top_blob.h;
             coord_out[0] = (rem % hw) % top_blob.w;
             coord_out[1] = (rem % hw) / top_blob.w;
@@ -92,11 +97,16 @@ int Gather::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_
 
         // Compute flat input index
         int flat_in = 0;
-        if (dims == 1) {
+        if (dims == 1)
+        {
             flat_in = coord_in[0];
-        } else if (dims == 2) {
+        }
+        else if (dims == 2)
+        {
             flat_in = coord_in[0] + coord_in[1] * input_blob.w;
-        } else if (dims == 3) {
+        }
+        else if (dims == 3)
+        {
             // ncnn 3D layout: w * h * c, with cstride padding
             size_t cstep = input_blob.cstep;
             flat_in = coord_in[0] + coord_in[1] * input_blob.w + coord_in[2] * (int)cstep;

From 93bd42378acaaab0e5aee237dca92b1c68002197 Mon Sep 17 00:00:00 2001
From: vlordier <vlordier@users.noreply.github.com>
Date: Sat, 11 Apr 2026 10:42:25 +0200
Subject: [PATCH 29/29] =?UTF-8?q?feat:=20add=20Tensor.to=20=E2=86=92=20Cas?=
 =?UTF-8?q?t=20conversion=20with=20int64/int32=20support?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- src/layer/cast.{h,cpp}: extend Cast layer with int64 (type 5) and
  int32 (type 6) support, adding conversions int64↔float32 and
  int32↔float32
- pass_ncnn/tensor_to.cpp (NEW): convert Tensor.to (dtype cast) to
  ncnn Cast layer, mapping torch dtype strings to ncnn type codes
- CMakeLists.txt: register tensor_to.cpp in pass_ncnn sources

Verified: YOLOv10n Tensor.to (i64→f32) now converts to Cast layer
instead of being ignored. Only cosmetic ops (pnnx.Expression) remain.

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
---
 src/layer/cast.cpp                     | 74 ++++++++++++++++++++++++++
 src/layer/cast.h                       |  2 +
 tools/pnnx/src/CMakeLists.txt          |  1 +
 tools/pnnx/src/pass_ncnn/tensor_to.cpp | 67 +++++++++++++++++++++++
 4 files changed, 144 insertions(+)
 create mode 100644 tools/pnnx/src/pass_ncnn/tensor_to.cpp

diff --git a/src/layer/cast.cpp b/src/layer/cast.cpp
index 3dcff38f3cac..e18a7c3a8ae2 100644
--- a/src/layer/cast.cpp
+++ b/src/layer/cast.cpp
@@ -74,6 +74,16 @@ int Cast::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons
         // bfloat16
         out_elemsize = 2 * elempack;
     }
+    else if (type_to == 5)
+    {
+        // int64
+        out_elemsize = 8 * elempack;
+    }
+    else if (type_to == 6)
+    {
+        // int32
+        out_elemsize = 4 * elempack;
+    }
 
     if (dims == 1)
     {
@@ -173,6 +183,70 @@ int Cast::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons
 
     // TODO more cast type
 
+    if (type_from == 5 && type_to == 1)
+    {
+        // int64 → float32
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const long long* ptr = bottom_blob.channel(q);
+            float* outptr = top_blob.channel(q);
+
+            for (int i = 0; i < size; i++)
+            {
+                outptr[i] = (float)ptr[i];
+            }
+        }
+    }
+
+    if (type_from == 1 && type_to == 5)
+    {
+        // float32 → int64
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const float* ptr = bottom_blob.channel(q);
+            long long* outptr = top_blob.channel(q);
+
+            for (int i = 0; i < size; i++)
+            {
+                outptr[i] = (long long)ptr[i];
+            }
+        }
+    }
+
+    if (type_from == 6 && type_to == 1)
+    {
+        // int32 → float32
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const int* ptr = bottom_blob.channel(q);
+            float* outptr = top_blob.channel(q);
+
+            for (int i = 0; i < size; i++)
+            {
+                outptr[i] = (float)ptr[i];
+            }
+        }
+    }
+
+    if (type_from == 1 && type_to == 6)
+    {
+        // float32 → int32
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const float* ptr = bottom_blob.channel(q);
+            int* outptr = top_blob.channel(q);
+
+            for (int i = 0; i < size; i++)
+            {
+                outptr[i] = (int)ptr[i];
+            }
+        }
+    }
+
     return 0;
 }
 
diff --git a/src/layer/cast.h b/src/layer/cast.h
index 036e61efed04..22c8f5da4626 100644
--- a/src/layer/cast.h
+++ b/src/layer/cast.h
@@ -24,6 +24,8 @@ class Cast : public Layer
     // 2 = float16
     // 3 = int8
     // 4 = bfloat16
+    // 5 = int64
+    // 6 = int32
     int type_from;
     int type_to;
 };
diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt
index 15aa16b46376..86c0593b9b37 100644
--- a/tools/pnnx/src/CMakeLists.txt
+++ b/tools/pnnx/src/CMakeLists.txt
@@ -616,6 +616,7 @@ set(pnnx_pass_ncnn_SRCS
     pass_ncnn/torch_roll.cpp
     pass_ncnn/torch_slice_scatter.cpp
     pass_ncnn/torch_squeeze.cpp
+    pass_ncnn/tensor_to.cpp
     pass_ncnn/torch_sum.cpp
     pass_ncnn/torch_stft.cpp
     pass_ncnn/torch_t.cpp
diff --git a/tools/pnnx/src/pass_ncnn/tensor_to.cpp b/tools/pnnx/src/pass_ncnn/tensor_to.cpp
new file mode 100644
index 000000000000..252498fd0ffa
--- /dev/null
+++ b/tools/pnnx/src/pass_ncnn/tensor_to.cpp
@@ -0,0 +1,67 @@
+// Copyright 2025 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "pass_ncnn.h"
+
+namespace pnnx {
+
+namespace ncnn {
+
+class Tensor_to : public GraphRewriterPass
+{
+public:
+    const char* match_pattern_graph() const
+    {
+        // the line after the magic number is "<operator count> <operand count>":
+        // 3 operator rows and 2 operands (input, out) follow
+        return R"PNNXIR(7767517
+3 2
+pnnx.Input              input_0     0 1 input
+Tensor.to               op_0        1 1 input out copy=%copy dtype=%dtype
+pnnx.Output             output      1 0 out
+)PNNXIR";
+    }
+
+    const char* type_str() const
+    {
+        return "Cast";
+    }
+
+    const char* name_str() const
+    {
+        return "to";
+    }
+
+    void write(Operator* op, const std::map<std::string, Parameter>& captured_params) const
+    {
+        // Map the captured torch dtype string to the Cast target code (param 1):
+        // 1=float32 2=float16 3=int8 4=bfloat16 5=int64 6=int32; unrecognised dtypes keep 0.
+        // NOTE(review): param 0 (source type) is always 0 - confirm Cast resolves it at runtime.
+        std::string dtype = "torch.float";
+        if (captured_params.find("dtype") != captured_params.end())
+            dtype = captured_params.at("dtype").s;
+
+        int type_to = 0;
+        if (dtype == "torch.float" || dtype == "torch.float32")
+            type_to = 1;
+        else if (dtype == "torch.float16" || dtype == "torch.half")
+            type_to = 2;
+        else if (dtype == "torch.int8")
+            type_to = 3;
+        else if (dtype == "torch.bfloat16")
+            type_to = 4;
+        else if (dtype == "torch.int64" || dtype == "torch.long")
+            type_to = 5;
+        else if (dtype == "torch.int32" || dtype == "torch.int")
+            type_to = 6;
+
+        op->params["0"] = 0; // auto-detect input type
+        op->params["1"] = type_to;
+    }
+};
+
+REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(Tensor_to, 20)
+
+} // namespace ncnn
+
+} // namespace pnnx