From 8bd7d308e3381fc961b11f0a8194778f76de1b16 Mon Sep 17 00:00:00 2001 From: vlordier Date: Thu, 26 Feb 2026 23:24:18 +0100 Subject: [PATCH 01/29] Add TopK layer and pnnx ONNX TopK lowering --- src/CMakeLists.txt | 1 + src/layer/topk.cpp | 194 ++++++++++++++++++++++++++++++ src/layer/topk.h | 29 +++++ tests/CMakeLists.txt | 1 + tests/test_topk.cpp | 88 ++++++++++++++ tools/pnnx/src/CMakeLists.txt | 1 + tools/pnnx/src/pass_ncnn/TopK.cpp | 97 +++++++++++++++ 7 files changed, 411 insertions(+) create mode 100644 src/layer/topk.cpp create mode 100644 src/layer/topk.h create mode 100644 tests/test_topk.cpp create mode 100644 tools/pnnx/src/pass_ncnn/TopK.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 614c3b8f31f1..c79d779cf220 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -101,6 +101,7 @@ ncnn_add_layer(SPP OFF) ncnn_add_layer(TanH) ncnn_add_layer(Threshold) ncnn_add_layer(Tile) +ncnn_add_layer(TopK) ncnn_add_layer(RNN) ncnn_add_layer(LSTM) ncnn_add_layer(BinaryOp) diff --git a/src/layer/topk.cpp b/src/layer/topk.cpp new file mode 100644 index 000000000000..c65dbc9689ba --- /dev/null +++ b/src/layer/topk.cpp @@ -0,0 +1,194 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#include "topk.h" + +#include +#include + +namespace ncnn { + +TopK::TopK() +{ + one_blob_only = false; + support_inplace = false; + + axis = -1; + largest = 1; + sorted = 1; + k = 1; +} + +int TopK::load_param(const ParamDict& pd) +{ + axis = pd.get(0, -1); + largest = pd.get(1, 1); + sorted = pd.get(2, 1); + k = pd.get(3, 1); + + return 0; +} + +int TopK::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + if (bottom_blobs.empty()) + return -1; + + const Mat& bottom_blob = bottom_blobs[0]; + + int _k = k; + if (bottom_blobs.size() >= 2) + { + const Mat& k_blob = bottom_blobs[1]; + if (k_blob.total() < 1) + return -1; + + _k = (int)((const float*)k_blob)[0]; + } + + if (bottom_blob.dims < 1 || 
bottom_blob.dims > 4) + return -100; + + int dims = bottom_blob.dims; + + int axis_p = axis < 0 ? axis + dims : axis; + if (axis_p < 0 || axis_p >= dims) + return -1; + + int shape[4] = {1, 1, 1, 1}; + shape[0] = bottom_blob.w; + if (dims >= 2) shape[1] = bottom_blob.h; + if (dims >= 3) shape[2] = bottom_blob.dims == 3 ? bottom_blob.c : bottom_blob.d; + if (dims >= 4) shape[3] = bottom_blob.c; + + int axis_size = shape[axis_p]; + if (axis_size <= 0) + return -1; + + if (_k < 0) + return -1; + if (_k > axis_size) + _k = axis_size; + + int out_shape[4] = {shape[0], shape[1], shape[2], shape[3]}; + out_shape[axis_p] = _k; + + Mat values; + if (dims == 1) values.create(out_shape[0], 4u, opt.blob_allocator); + if (dims == 2) values.create(out_shape[0], out_shape[1], 4u, opt.blob_allocator); + if (dims == 3) values.create(out_shape[0], out_shape[1], out_shape[2], 4u, opt.blob_allocator); + if (dims == 4) values.create(out_shape[0], out_shape[1], out_shape[2], out_shape[3], 4u, opt.blob_allocator); + if (values.empty()) + return -100; + + Mat indices; + if (top_blobs.size() >= 2) + { + if (dims == 1) indices.create(out_shape[0], 4u, opt.blob_allocator); + if (dims == 2) indices.create(out_shape[0], out_shape[1], 4u, opt.blob_allocator); + if (dims == 3) indices.create(out_shape[0], out_shape[1], out_shape[2], 4u, opt.blob_allocator); + if (dims == 4) indices.create(out_shape[0], out_shape[1], out_shape[2], out_shape[3], 4u, opt.blob_allocator); + if (indices.empty()) + return -100; + } + + const float* ptr = bottom_blob; + float* outptr = values; + float* outidxptr = indices; + + int inner = 1; + for (int i = 0; i < axis_p; i++) + { + inner *= shape[i]; + } + + int outer = 1; + for (int i = axis_p + 1; i < dims; i++) + { + outer *= shape[i]; + } + + const bool largest_p = largest != 0; + const bool sorted_p = sorted != 0; + + const int total_lines = outer * inner; + +#pragma omp parallel for num_threads(opt.num_threads) + for (int line = 0; line < total_lines; line++) + { 
+ int outer_i = line / inner; + int inner_i = line - outer_i * inner; + + int in_base = outer_i * axis_size * inner + inner_i; + int out_base = outer_i * _k * inner + inner_i; + + std::vector > vec; + vec.resize(axis_size); + + for (int j = 0; j < axis_size; j++) + { + vec[j].first = ptr[in_base + j * inner]; + vec[j].second = j; + } + + if (largest_p) + { + auto comp = [](const std::pair& a, const std::pair& b) + { + if (a.first != b.first) + return a.first > b.first; + return a.second < b.second; + }; + + if (_k < axis_size) + { + if (sorted_p) + std::partial_sort(vec.begin(), vec.begin() + _k, vec.end(), comp); + else + std::nth_element(vec.begin(), vec.begin() + _k, vec.end(), comp); + } + else + { + if (sorted_p) + std::sort(vec.begin(), vec.end(), comp); + } + } + else + { + auto comp = [](const std::pair& a, const std::pair& b) + { + if (a.first != b.first) + return a.first < b.first; + return a.second < b.second; + }; + + if (_k < axis_size) + { + if (sorted_p) + std::partial_sort(vec.begin(), vec.begin() + _k, vec.end(), comp); + else + std::nth_element(vec.begin(), vec.begin() + _k, vec.end(), comp); + } + else + { + if (sorted_p) + std::sort(vec.begin(), vec.end(), comp); + } + } + + for (int j = 0; j < _k; j++) + { + outptr[out_base + j * inner] = vec[j].first; + if (outidxptr) + outidxptr[out_base + j * inner] = (float)vec[j].second; + } + } + + top_blobs[0] = values; + if (top_blobs.size() >= 2) + top_blobs[1] = indices; + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/topk.h b/src/layer/topk.h new file mode 100644 index 000000000000..ff8f410926d8 --- /dev/null +++ b/src/layer/topk.h @@ -0,0 +1,29 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef LAYER_TOPK_H +#define LAYER_TOPK_H + +#include "layer.h" + +namespace ncnn { + +class TopK : public Layer +{ +public: + TopK(); + + virtual int load_param(const ParamDict& pd); + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const 
Option& opt) const; + +public: + int axis; + int largest; + int sorted; + int k; +}; + +} // namespace ncnn + +#endif // LAYER_TOPK_H diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e72e6d02b86e..4f40f8279428 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -166,6 +166,7 @@ ncnn_add_layer_test(Spectrogram) ncnn_add_layer_test(Squeeze) ncnn_add_layer_test(Swish) ncnn_add_layer_test(TanH) +ncnn_add_layer_test(TopK) ncnn_add_layer_test(Tile) ncnn_add_layer_test(UnaryOp) ncnn_add_layer_test(Unfold) diff --git a/tests/test_topk.cpp b/tests/test_topk.cpp new file mode 100644 index 000000000000..7b7fe82690ba --- /dev/null +++ b/tests/test_topk.cpp @@ -0,0 +1,88 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#include "testutil.h" + +static int test_topk(const ncnn::Mat& a, int axis, int k, int largest, int sorted) +{ + ncnn::ParamDict pd; + pd.set(0, axis); + pd.set(1, largest); + pd.set(2, sorted); + pd.set(3, k); + + std::vector weights(0); + + std::vector a0(1); + a0[0] = a; + + int ret = test_layer("TopK", pd, weights, a0, 2, 0.01f, TEST_LAYER_DISABLE_AUTO_INPUT_CASTING); + if (ret != 0) + { + fprintf(stderr, "test_topk failed a.dims=%d a=(%d %d %d %d) axis=%d k=%d largest=%d sorted=%d\n", a.dims, a.w, a.h, a.d, a.c, axis, k, largest, sorted); + } + + return ret; +} + +static int test_topk_0() +{ + ncnn::Mat a = RandomMat(13); + + return 0 + || test_topk(a, 0, 1, 1, 1) + || test_topk(a, 0, 5, 1, 1) + || test_topk(a, -1, 7, 0, 1) + || test_topk(a, 0, 9, 1, 1); +} + +static int test_topk_1() +{ + ncnn::Mat a = RandomMat(12, 17); + + return 0 + || test_topk(a, 0, 1, 1, 1) + || test_topk(a, 0, 5, 1, 1) + || test_topk(a, 1, 3, 1, 1) + || test_topk(a, -1, 8, 0, 1) + || test_topk(a, -2, 7, 1, 1); +} + +static int test_topk_2() +{ + ncnn::Mat a = RandomMat(8, 9, 11); + + return 0 + || test_topk(a, 0, 3, 1, 1) + || test_topk(a, 1, 4, 1, 1) + || test_topk(a, 2, 2, 0, 1) + || test_topk(a, -1, 6, 1, 1) + || test_topk(a, -2, 
5, 0, 1) + || test_topk(a, -3, 7, 1, 1); +} + +static int test_topk_3() +{ + ncnn::Mat a = RandomMat(5, 7, 9, 10); + + return 0 + || test_topk(a, 0, 2, 1, 1) + || test_topk(a, 1, 3, 0, 1) + || test_topk(a, 2, 4, 1, 1) + || test_topk(a, 3, 5, 1, 1) + || test_topk(a, -1, 6, 0, 1) + || test_topk(a, -2, 3, 1, 1) + || test_topk(a, -3, 4, 0, 1) + || test_topk(a, -4, 2, 1, 1); +} + +int main() +{ + SRAND(7767517); + + return 0 + || test_topk_0() + || test_topk_1() + || test_topk_2() + || test_topk_3(); +} diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt index 3e0c6f865a87..c554a6873e81 100644 --- a/tools/pnnx/src/CMakeLists.txt +++ b/tools/pnnx/src/CMakeLists.txt @@ -592,6 +592,7 @@ set(pnnx_pass_ncnn_SRCS pass_ncnn/Tensor_reshape_as.cpp pass_ncnn/Tensor_repeat.cpp pass_ncnn/Tensor_unflatten.cpp + pass_ncnn/TopK.cpp pass_ncnn/torch_addmm.cpp pass_ncnn/torch_amax.cpp pass_ncnn/torch_amin.cpp diff --git a/tools/pnnx/src/pass_ncnn/TopK.cpp b/tools/pnnx/src/pass_ncnn/TopK.cpp new file mode 100644 index 000000000000..515790e38518 --- /dev/null +++ b/tools/pnnx/src/pass_ncnn/TopK.cpp @@ -0,0 +1,97 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#include "pass_ncnn.h" + +namespace pnnx { + +namespace ncnn { + +static int parameter_to_bool(const Parameter& p, int default_value) +{ + if (p.type == 1) + return p.b ? 1 : 0; + if (p.type == 2) + return p.i ? 
1 : 0; + + return default_value; +} + +class TopK : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input_0 0 1 input +pnnx.Input input_1 0 1 k +TopK op_0 2 2 input k values indices %*=%* +pnnx.Output output 2 0 values indices +)PNNXIR"; + } + + const char* type_str() const + { + return "TopK"; + } + + const char* name_str() const + { + return "topk"; + } + + void write(Operator* op, const std::map& captured_params) const + { + int axis = -1; + if (captured_params.find("op_0.axis") != captured_params.end()) + axis = captured_params.at("op_0.axis").i; + + int largest = 1; + if (captured_params.find("op_0.largest") != captured_params.end()) + largest = parameter_to_bool(captured_params.at("op_0.largest"), 1); + + int sorted = 1; + if (captured_params.find("op_0.sorted") != captured_params.end()) + sorted = parameter_to_bool(captured_params.at("op_0.sorted"), 1); + + const int batch_index = op->inputs[0]->params["__batch_index"].i; + + if (axis == batch_index) + { + fprintf(stderr, "TopK along batch axis is not supported\n"); + return; + } + + int new_axis = axis; + if (axis >= 0) + new_axis = axis > batch_index ? 
axis - 1 : axis; + + op->params["0"] = new_axis; + op->params["1"] = largest; + op->params["2"] = sorted; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(TopK, 20) + +class TopK_0 : public TopK +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 2 +pnnx.Input input_0 0 1 input +pnnx.Input input_1 0 1 k +TopK op_0 2 1 input k values %*=%* +pnnx.Output output 1 0 values +)PNNXIR"; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(TopK_0, 20) + +} // namespace ncnn + +} // namespace pnnx From b2c445a61763ccf3e1e162803ccc23bdcb0b8d12 Mon Sep 17 00:00:00 2001 From: vlordier Date: Thu, 26 Feb 2026 23:34:51 +0100 Subject: [PATCH 02/29] Add ONNX torch_topk pnnx regression test --- tools/pnnx/tests/onnx/CMakeLists.txt | 1 + tools/pnnx/tests/onnx/test_torch_topk.py | 61 ++++++++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 tools/pnnx/tests/onnx/test_torch_topk.py diff --git a/tools/pnnx/tests/onnx/CMakeLists.txt b/tools/pnnx/tests/onnx/CMakeLists.txt index f029a669584d..ba821233ad12 100644 --- a/tools/pnnx/tests/onnx/CMakeLists.txt +++ b/tools/pnnx/tests/onnx/CMakeLists.txt @@ -191,6 +191,7 @@ pnnx_onnx_add_test(torch_split) pnnx_onnx_add_test(torch_squeeze) pnnx_onnx_add_test(torch_stack) pnnx_onnx_add_test(torch_sum) +pnnx_onnx_add_test(torch_topk) pnnx_onnx_add_test(torch_transpose) pnnx_onnx_add_test(torch_unbind) pnnx_onnx_add_test(torch_unsqueeze) diff --git a/tools/pnnx/tests/onnx/test_torch_topk.py b/tools/pnnx/tests/onnx/test_torch_topk.py new file mode 100644 index 000000000000..fe3d15c99b84 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_topk.py @@ -0,0 +1,61 @@ +# Copyright 2026 Tencent +# SPDX-License-Identifier: BSD-3-Clause + +import torch +import torch.nn as nn + + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x_values, x_indices = torch.topk( + x, 2, dim=1, largest=True, sorted=True + ) + y_values, y_indices = 
torch.topk( + y, 4, dim=3, largest=False, sorted=True + ) + z_values, z_indices = torch.topk( + z, 3, dim=0, largest=True, sorted=True + ) + return x_values, x_indices, y_values, y_indices, z_values, z_indices + + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_topk.onnx") + + # onnx to pnnx + import os + + os.system( + "../../src/pnnx test_torch_topk.onnx " + "inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]" + ) + + # pnnx inference + import test_torch_topk_pnnx + b = test_torch_topk_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) From 01d15cb58615e20d35c1fc3071fee5cbd378efc3 Mon Sep 17 00:00:00 2001 From: vlordier Date: Fri, 27 Feb 2026 08:33:25 +0100 Subject: [PATCH 03/29] Add TopK Python class generation to pnnx module export MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Generate TopK class definition in pnnx.py output with forward() method - Instantiate TopK modules in Model.__init__() with proper parameters - Update forward() method to call self.topk_name() instead of direct TopK() calls - Fixes pnnx inference to properly execute TopK operations using torch.topk() - Test confirms TopK ONNX→pnnx conversion and inference working correctly --- tools/pnnx/src/CMakeLists.txt | 12 +- tools/pnnx/src/ir.cpp | 78 +++++++++++++ tools/pnnx/src/load_onnx.cpp | 8 ++ tools/pnnx/src/pass_onnx/fold_constants.cpp | 8 ++ tools/pnnx/src/pass_onnx/shape_inference.cpp | 8 ++ tools/pnnx/src/pnnx | 1 + tools/pnnx/tests/onnx/test_torch_topk.onnx | Bin 0 -> 3317 bytes .../pnnx/tests/onnx/test_torch_topk.onnx.data | 0 .../pnnx/tests/onnx/test_torch_topk.pnnx.bin | Bin 0 -> 98 bytes 
.../pnnx/tests/onnx/test_torch_topk.pnnx.onnx | Bin 0 -> 882 bytes .../tests/onnx/test_torch_topk.pnnx.param | 17 +++ .../tests/onnx/test_torch_topk.pnnxsim.onnx | Bin 0 -> 2861 bytes tools/pnnx/tests/onnx/test_torch_topk_pnnx.py | 109 ++++++++++++++++++ 13 files changed, 236 insertions(+), 5 deletions(-) create mode 120000 tools/pnnx/src/pnnx create mode 100644 tools/pnnx/tests/onnx/test_torch_topk.onnx create mode 100644 tools/pnnx/tests/onnx/test_torch_topk.onnx.data create mode 100644 tools/pnnx/tests/onnx/test_torch_topk.pnnx.bin create mode 100644 tools/pnnx/tests/onnx/test_torch_topk.pnnx.onnx create mode 100644 tools/pnnx/tests/onnx/test_torch_topk.pnnx.param create mode 100644 tools/pnnx/tests/onnx/test_torch_topk.pnnxsim.onnx create mode 100644 tools/pnnx/tests/onnx/test_torch_topk_pnnx.py diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt index c554a6873e81..6231e36b16ac 100644 --- a/tools/pnnx/src/CMakeLists.txt +++ b/tools/pnnx/src/CMakeLists.txt @@ -630,23 +630,25 @@ if(PROTOBUF_FOUND) set(CMAKE_CXX_STANDARD 17) endif() - if(Protobuf_FOUND OR protobuf_MODULE_COMPATIBLE) + if(COMMAND protobuf_generate_cpp) protobuf_generate_cpp(ONNX_PROTO_SRCS ONNX_PROTO_HDRS onnx-data.proto onnx-ml.proto onnx-operators-ml.proto) add_library(onnxproto STATIC ${ONNX_PROTO_SRCS} ${ONNX_PROTO_HDRS}) target_include_directories(onnxproto PUBLIC ${PROTOBUF_INCLUDE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) target_link_libraries(onnxproto PUBLIC ${PROTOBUF_LIBRARIES}) - else() + elseif(COMMAND protobuf_generate) add_library(onnxproto STATIC onnx-data.proto onnx-ml.proto onnx-operators-ml.proto) target_include_directories(onnxproto PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) protobuf_generate(TARGET onnxproto) target_link_libraries(onnxproto PUBLIC protobuf::libprotobuf) + else() + message(FATAL_ERROR "Neither protobuf_generate_cpp nor protobuf_generate is available. 
Please install protobuf with CMake codegen support.") endif() # use onnxruntime onnx proto if found if(onnxruntime_FOUND) add_dependencies(onnxruntime::onnxruntime onnxproto) - if(Protobuf_FOUND OR protobuf_MODULE_COMPATIBLE) + if(COMMAND protobuf_generate_cpp) set_property(TARGET onnxruntime::onnxruntime APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${PROTOBUF_INCLUDE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) set_property(TARGET onnxruntime::onnxruntime APPEND PROPERTY INTERFACE_LINK_LIBRARIES ${PROTOBUF_LIBRARIES}) else() @@ -688,7 +690,7 @@ if(PROTOBUF_FOUND) save_onnx.cpp ) if(onnxruntime_FOUND) - target_link_libraries(pnnx2onnx PRIVATE onnxruntime::onnxruntime) + target_link_libraries(pnnx2onnx PRIVATE onnxruntime::onnxruntime onnxproto) else() target_link_libraries(pnnx2onnx PRIVATE onnxproto) endif() @@ -720,7 +722,7 @@ if(onnxruntime_FOUND) ) add_library(onnx2pnnx OBJECT ${onnx2pnnx_SRCS}) - target_link_libraries(onnx2pnnx PRIVATE onnxruntime::onnxruntime) + target_link_libraries(onnx2pnnx PRIVATE onnxruntime::onnxruntime onnxproto) target_compile_definitions(onnx2pnnx PRIVATE BUILD_ONNX2PNNX) message(STATUS "Building with onnx2pnnx") diff --git a/tools/pnnx/src/ir.cpp b/tools/pnnx/src/ir.cpp index 44e4b77fdf2f..63f9c70e21f4 100644 --- a/tools/pnnx/src/ir.cpp +++ b/tools/pnnx/src/ir.cpp @@ -1479,6 +1479,33 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath, con fprintf(pyfp, "\n"); + // output custom layer classes for pnnx operators + { + bool has_topk = false; + for (const Operator* op : ops) + { + if (op->type == "TopK") + { + has_topk = true; + break; + } + } + + if (has_topk) + { + fprintf(pyfp, "class TopK(nn.Module):\n"); + fprintf(pyfp, " def __init__(self, axis=1, largest=1, sorted=1):\n"); + fprintf(pyfp, " super(TopK, self).__init__()\n"); + fprintf(pyfp, " self.axis = axis\n"); + fprintf(pyfp, " self.largest = largest\n"); + fprintf(pyfp, " self.sorted = sorted\n"); + fprintf(pyfp, " def forward(self, x, k):\n"); + 
fprintf(pyfp, " # Torch topk returns (values, indices)\n"); + fprintf(pyfp, " return torch.topk(x, k.item() if hasattr(k, 'item') else k, dim=self.axis, largest=bool(self.largest), sorted=bool(self.sorted))\n"); + fprintf(pyfp, "\n"); + } + } + fprintf(pyfp, "class Model(nn.Module):\n"); fprintf(pyfp, " def __init__(self):\n"); fprintf(pyfp, " super(Model, self).__init__()\n"); @@ -1605,6 +1632,39 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath, con } } + // TopK modules + { + for (const Operator* op : ops) + { + if (op->type != "TopK") + continue; + + fprintf(pyfp, " self.%s = TopK(", sanitize_identifier(op->name).c_str()); + + int i = 0; + for (const auto& it : op->params) + { + fprintf(pyfp, "%s=", it.first.c_str()); + + const Parameter& param = it.second; + if (param.type == 2) + { + fprintf(pyfp, "%d", param.i); + } + else if (param.type == 1) + { + fprintf(pyfp, "%d", param.b ? 1 : 0); + } + + if (i + 1 != op->params.size()) + fprintf(pyfp, ", "); + i++; + } + + fprintf(pyfp, ")\n"); + } + } + fprintf(pyfp, "\n"); // load weights @@ -2186,6 +2246,24 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath, con } fprintf(pyfp, ")\n"); } + else if (op->type == "TopK") + { + // self.topk_name() + for (size_t i = 0; i < op->outputs.size(); i++) + { + fprintf(pyfp, "v_%s", sanitize_identifier(op->outputs[i]->name).c_str()); + if (i + 1 != op->outputs.size()) + fprintf(pyfp, ", "); + } + fprintf(pyfp, " = self.%s(", sanitize_identifier(op->name).c_str()); + for (size_t i = 0; i < op->inputs.size(); i++) + { + fprintf(pyfp, "v_%s", sanitize_identifier(op->inputs[i]->name).c_str()); + if (i + 1 != op->inputs.size()) + fprintf(pyfp, ", "); + } + fprintf(pyfp, ")\n"); + } else { if (op->type.find("::") == std::string::npos && op->type.find(".") == std::string::npos) diff --git a/tools/pnnx/src/load_onnx.cpp b/tools/pnnx/src/load_onnx.cpp index 3c788a0c4849..6cc4a1de4284 100644 --- a/tools/pnnx/src/load_onnx.cpp +++ 
b/tools/pnnx/src/load_onnx.cpp @@ -13,7 +13,15 @@ #include #include +#if __has_include() #include +#elif __has_include() +#include +#elif __has_include() +#include +#else +#error "onnxruntime_c_api.h not found" +#endif #include "ir.h" diff --git a/tools/pnnx/src/pass_onnx/fold_constants.cpp b/tools/pnnx/src/pass_onnx/fold_constants.cpp index 1ef0092a72ec..c79cb29f34a1 100644 --- a/tools/pnnx/src/pass_onnx/fold_constants.cpp +++ b/tools/pnnx/src/pass_onnx/fold_constants.cpp @@ -9,7 +9,15 @@ #include #include +#if __has_include() #include +#elif __has_include() +#include +#elif __has_include() +#include +#else +#error "onnxruntime_c_api.h not found" +#endif #include "dead_code_elimination.h" diff --git a/tools/pnnx/src/pass_onnx/shape_inference.cpp b/tools/pnnx/src/pass_onnx/shape_inference.cpp index 99dc652389d8..23986a7a7d2d 100644 --- a/tools/pnnx/src/pass_onnx/shape_inference.cpp +++ b/tools/pnnx/src/pass_onnx/shape_inference.cpp @@ -8,7 +8,15 @@ #include #include +#if __has_include() #include +#elif __has_include() +#include +#elif __has_include() +#include +#else +#error "onnxruntime_c_api.h not found" +#endif namespace pnnx { diff --git a/tools/pnnx/src/pnnx b/tools/pnnx/src/pnnx new file mode 120000 index 000000000000..909f9eae4b3f --- /dev/null +++ b/tools/pnnx/src/pnnx @@ -0,0 +1 @@ +../build/src/pnnx \ No newline at end of file diff --git a/tools/pnnx/tests/onnx/test_torch_topk.onnx b/tools/pnnx/tests/onnx/test_torch_topk.onnx new file mode 100644 index 0000000000000000000000000000000000000000..e57e7e63ec365e26943043ad0202d1152ca55191 GIT binary patch literal 3317 zcmc(h&u`l{6vrdStraE<3@$5L9|poh;nWZ$ONsl#Kv1AS3&g|PVoflhLCUEz<7k#C zL8M#DMK3J|?9giuI}F%u|HRJQX@9`}mmT&b|B(D6Sk}6T0m9^y$oGAGA3ahcY{|GDcl8!Kv3s6u?xQb*P8)2XSNA#Z>yBp&K~G&4+F*-t<)|{fRz)L~xrHjn z&9@4=GPBl6pT=Qf_j>G7~J7J$rkLa?+GJ-turK~Mi}ufCS7w|4W2N1l z(q;;fc^-sTVx-0ht-_#rD~nKdAwtyr#1OqZ8=^3Qh?>jADALHTMj$5-Y)`ORnzT7U zvNUa`G*R=yc)B^qQ#9czI)kS_fTzKbrwl~9DNpq*Pu6*ON=nq2rx>nt@nn5so_;)= 
zr_PWkYq>lP5}0Zq4w7|YAl;b*X)pn4uoxtal2fJ~+QDh{3sor=?hVhlrkYzB{<3DX zspc1$t){vCd1m!dK81=qiqMU4s8L zXzLfO?HABCz$J7Qv~7W&O=v2#u|9OM`kp^SlN--;Vsjv}SiXyD|CyObJvo)x^(3>8 z4i6x;&%a0MMJhq3FhOTY9$Kjf>kJ6;A0J1?%TD5@Fo)U_yqu=7>}H^eKeB0PJ~}TM zPxZu8A;EL`o__dze6>bMRSP%|u_$Asi5D2HtnrQVkFtQl`$>&qt;eAcb*4L8Z1A57 z`UfO`d%Kb6u15b!!HUU literal 0 HcmV?d00001 diff --git a/tools/pnnx/tests/onnx/test_torch_topk.onnx.data b/tools/pnnx/tests/onnx/test_torch_topk.onnx.data new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tools/pnnx/tests/onnx/test_torch_topk.pnnx.bin b/tools/pnnx/tests/onnx/test_torch_topk.pnnx.bin new file mode 100644 index 0000000000000000000000000000000000000000..aa99d4621ed08e4d5412634fb912b37433a365d8 GIT binary patch literal 98 gcmWIWW@FP~ARPpFv#}%VVgzymyjj`)qX7m603eYP6951J literal 0 HcmV?d00001 diff --git a/tools/pnnx/tests/onnx/test_torch_topk.pnnx.onnx b/tools/pnnx/tests/onnx/test_torch_topk.pnnx.onnx new file mode 100644 index 0000000000000000000000000000000000000000..83b5d3a0f7a0476395b71a8e3c1232fa127a2904 GIT binary patch literal 882 zcmaiyO;5r=5QZ6AOC1oK4FOUQnjSe|w;z0X*JwO=@lHr=42>xbEeSt|f9PLuW~qd> z5f78iyzlHj?{53>ZD3D;Yip~-budkm-S{O*w>hhlRasR%R=tvXFTl6kym=Ar(#^ti zuGLA8)I?dfS|_6p>TEgS?LosQ1Q)><5C{lu<9(gJE?h_Fb<{?Exm#sJ*h6#F#n#ty z^BHCNp}#^STW|8{L$yfY$thT#0S5(GB1{BK1x-1?%YQRY<<1WT}M6n_-&i>2O~XHx~%C_rF%7frgXoo50Ek{Bd8lv z8DYtY2lO?CK+8nY!KMjR8kH5_AV|^29vN0)T+n(+)sAoJMTCt*GJ7O+sZ6Th0gT)H zhZv}v4bfmmP+>G_+F~PAM(YSa7_@zI+)hcRerT7v+<$6^zM|yBwj5*|T0{UP@!{8_mp_p-5H81K872O4 zb;Y%urqyi!x{m(T(Q?3>=*n|1q)n_xLz&&3>dI~uKcdZ=o<192p%oD&l67xrAGA_q z6_rCT=`kU*{57O|qd=%kr-iO^Z@@(C`3!Hg38e!s;FMb5M_m@$GK%|c;z`D>^wsLJ zBX1bW&JMa6_j{m$M=RodWci^d1U(K|;(<2C_5J#6Uv-nz3mw%DI&UkRg&{o}a$g1!-$}yTy@{fk#R0WW+_Ezg3K~x zI^Rtr(^!ZMb-pk(znl+EqM)J9=f!4hk}Z?K96a2|r}_WE zaW4&=dj)V#PQf`@3=T$W7wIKleBAiP&?@Eo*{;y?f{idnS z4`Wm@G<+SE3=PsUq)KfY*=K7v8^y5qS!!r1f*$weFk#jXkB9PU%zUd4OC?5oPaph9 zcOF04+lA(>|Ij>!H9ZipAxf-2jyQYvyK?5>b+iV~D!H>O@VpA?+9kVtuiUw~if(|r 
zHE^^F$$&I*h%VRRVHIU3nc0~wfx}8(=BQaOpU!$#c2l);&xyN!n3Zp;@^WEb^uwBJ zzcm$^l%|=;%#xuE-nH;o%L!)X02c|%>L0PV+TlF|6 u!=k)M-drqiEtXre@@wrbaxeV+#N$0i{fG-VVg+0ZTzdKHzK)hEYo7plmS9`} literal 0 HcmV?d00001 diff --git a/tools/pnnx/tests/onnx/test_torch_topk_pnnx.py b/tools/pnnx/tests/onnx/test_torch_topk_pnnx.py new file mode 100644 index 000000000000..2b4e7ed5abae --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_topk_pnnx.py @@ -0,0 +1,109 @@ +import os +import numpy as np +import tempfile, zipfile +import torch +import torch.nn as nn +import torch.nn.functional as F +try: + import torchvision + import torchaudio +except: + pass + +class TopK(nn.Module): + def __init__(self, axis=1, largest=1, sorted=1): + super(TopK, self).__init__() + self.axis = axis + self.largest = largest + self.sorted = sorted + def forward(self, x, k): + # Torch topk returns (values, indices) + return torch.topk(x, k.item() if hasattr(k, 'item') else k, dim=self.axis, largest=bool(self.largest), sorted=bool(self.sorted)) + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.TopK_0 = TopK(axis=1, largest=1, sorted=1) + self.TopK_1 = TopK(axis=3, largest=0, sorted=1) + self.TopK_2 = TopK(axis=0, largest=1, sorted=1) + + archive = zipfile.ZipFile('test_torch_topk.pnnx.bin', 'r') + archive.close() + + def load_pnnx_bin_as_parameter(self, archive, key, shape, dtype, requires_grad=True): + return nn.Parameter(self.load_pnnx_bin_as_tensor(archive, key, shape, dtype), requires_grad) + + def load_pnnx_bin_as_tensor(self, archive, key, shape, dtype): + fd, tmppath = tempfile.mkstemp() + with os.fdopen(fd, 'wb') as tmpf, archive.open(key) as keyfile: + tmpf.write(keyfile.read()) + m = np.memmap(tmppath, dtype=dtype, mode='r', shape=shape).copy() + os.remove(tmppath) + return torch.from_numpy(m) + + def forward(self, v_0, v_1, v_2): + v_3 = 2 + v_4, v_5 = self.TopK_0(v_0, v_3) + v_6 = 4 + v_7, v_8 = self.TopK_1(v_1, v_6) + v_9 = 3 + v_10, v_11 = self.TopK_2(v_2, v_9) + 
return v_4, v_5, v_7, v_8, v_10, v_11 + +def export_torchscript(): + net = Model() + net.float() + net.eval() + + torch.manual_seed(0) + v_0 = torch.rand(1, 3, 16, dtype=torch.float) + v_1 = torch.rand(1, 5, 9, 11, dtype=torch.float) + v_2 = torch.rand(14, 8, 5, 9, 10, dtype=torch.float) + + mod = torch.jit.trace(net, (v_0, v_1, v_2)) + mod.save("test_torch_topk_pnnx.py.pt") + +def export_onnx(): + net = Model() + net.float() + net.eval() + + torch.manual_seed(0) + v_0 = torch.rand(1, 3, 16, dtype=torch.float) + v_1 = torch.rand(1, 5, 9, 11, dtype=torch.float) + v_2 = torch.rand(14, 8, 5, 9, 10, dtype=torch.float) + + torch.onnx.export(net, (v_0, v_1, v_2), "test_torch_topk_pnnx.py.onnx", export_params=True, operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK, opset_version=13, input_names=['in0', 'in1', 'in2'], output_names=['out0', 'out1', 'out2', 'out3', 'out4', 'out5']) + +def export_pnnx(): + net = Model() + net.float() + net.eval() + + torch.manual_seed(0) + v_0 = torch.rand(1, 3, 16, dtype=torch.float) + v_1 = torch.rand(1, 5, 9, 11, dtype=torch.float) + v_2 = torch.rand(14, 8, 5, 9, 10, dtype=torch.float) + + import pnnx + pnnx.export(net, "test_torch_topk_pnnx.py.pt", (v_0, v_1, v_2)) + +def export_ncnn(): + export_pnnx() + +@torch.no_grad() +def test_inference(): + net = Model() + net.float() + net.eval() + + torch.manual_seed(0) + v_0 = torch.rand(1, 3, 16, dtype=torch.float) + v_1 = torch.rand(1, 5, 9, 11, dtype=torch.float) + v_2 = torch.rand(14, 8, 5, 9, 10, dtype=torch.float) + + return net(v_0, v_1, v_2) + +if __name__ == "__main__": + print(test_inference()) From 13cf18c4f055dbae88e103a049c8e911aea98af4 Mon Sep 17 00:00:00 2001 From: vlordier Date: Fri, 27 Feb 2026 08:38:06 +0100 Subject: [PATCH 04/29] Fix pnnx pass_ncnn TopK pattern matching and parameter capture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix IR pattern syntax to use explicit parameter names (axis=%, largest=%, 
sorted=%) - Replace incorrect parameter lookup from 'op_0.axis' to 'axis' to match captured names - TopK pass now properly fires during ONNX→pnnx→ncnn conversion - All TopK parameters (axis, largest, sorted) correctly captured and set in ncnn layers - End-to-end test confirms ONNX→pnnx→ncnn conversion with TopK working correctly --- tools/pnnx/src/pass_ncnn/TopK.cpp | 16 ++++---- .../pnnx/tests/onnx/test_torch_topk.ncnn.bin | 0 .../tests/onnx/test_torch_topk.ncnn.param | 11 +++++ tools/pnnx/tests/onnx/test_torch_topk_ncnn.py | 40 +++++++++++++++++++ 4 files changed, 59 insertions(+), 8 deletions(-) create mode 100644 tools/pnnx/tests/onnx/test_torch_topk.ncnn.bin create mode 100644 tools/pnnx/tests/onnx/test_torch_topk.ncnn.param create mode 100644 tools/pnnx/tests/onnx/test_torch_topk_ncnn.py diff --git a/tools/pnnx/src/pass_ncnn/TopK.cpp b/tools/pnnx/src/pass_ncnn/TopK.cpp index 515790e38518..ed226605ad8c 100644 --- a/tools/pnnx/src/pass_ncnn/TopK.cpp +++ b/tools/pnnx/src/pass_ncnn/TopK.cpp @@ -26,7 +26,7 @@ class TopK : public GraphRewriterPass 4 3 pnnx.Input input_0 0 1 input pnnx.Input input_1 0 1 k -TopK op_0 2 2 input k values indices %*=%* +TopK op_0 2 2 input k values indices axis=%axis largest=%largest sorted=%sorted pnnx.Output output 2 0 values indices )PNNXIR"; } @@ -44,16 +44,16 @@ pnnx.Output output 2 0 values indices void write(Operator* op, const std::map& captured_params) const { int axis = -1; - if (captured_params.find("op_0.axis") != captured_params.end()) - axis = captured_params.at("op_0.axis").i; + if (captured_params.find("axis") != captured_params.end()) + axis = captured_params.at("axis").i; int largest = 1; - if (captured_params.find("op_0.largest") != captured_params.end()) - largest = parameter_to_bool(captured_params.at("op_0.largest"), 1); + if (captured_params.find("largest") != captured_params.end()) + largest = parameter_to_bool(captured_params.at("largest"), 1); int sorted = 1; - if (captured_params.find("op_0.sorted") != 
captured_params.end()) - sorted = parameter_to_bool(captured_params.at("op_0.sorted"), 1); + if (captured_params.find("sorted") != captured_params.end()) + sorted = parameter_to_bool(captured_params.at("sorted"), 1); const int batch_index = op->inputs[0]->params["__batch_index"].i; @@ -84,7 +84,7 @@ class TopK_0 : public TopK 4 2 pnnx.Input input_0 0 1 input pnnx.Input input_1 0 1 k -TopK op_0 2 1 input k values %*=%* +TopK op_0 2 1 input k values axis=%axis largest=%largest sorted=%sorted pnnx.Output output 1 0 values )PNNXIR"; } diff --git a/tools/pnnx/tests/onnx/test_torch_topk.ncnn.bin b/tools/pnnx/tests/onnx/test_torch_topk.ncnn.bin new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tools/pnnx/tests/onnx/test_torch_topk.ncnn.param b/tools/pnnx/tests/onnx/test_torch_topk.ncnn.param new file mode 100644 index 000000000000..f15762f83651 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_topk.ncnn.param @@ -0,0 +1,11 @@ +7767517 +9 12 +Input in0 0 1 in0 +Input in1 0 1 in1 +Input in2 0 1 in2 +pnnx.Expression pnnx_expr_2 0 1 3 +TopK topk_0 2 2 in0 3 out0 out1 0=1 1=1 2=1 +pnnx.Expression pnnx_expr_1 0 1 6 +TopK topk_1 2 2 in1 6 out2 out3 0=3 1=0 2=1 +pnnx.Expression pnnx_expr_0 0 1 9 +TopK topk_2 2 2 in2 9 out4 out5 0=0 1=1 2=1 diff --git a/tools/pnnx/tests/onnx/test_torch_topk_ncnn.py b/tools/pnnx/tests/onnx/test_torch_topk_ncnn.py new file mode 100644 index 000000000000..bcb84b7afc45 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_topk_ncnn.py @@ -0,0 +1,40 @@ +import numpy as np +import ncnn +import torch + +def test_inference(): + torch.manual_seed(0) + in0 = torch.rand(1, 3, 16, dtype=torch.float) + in1 = torch.rand(1, 5, 9, 11, dtype=torch.float) + in2 = torch.rand(14, 8, 5, 9, 10, dtype=torch.float) + out = [] + + with ncnn.Net() as net: + net.load_param("test_torch_topk.ncnn.param") + net.load_model("test_torch_topk.ncnn.bin") + + with net.create_extractor() as ex: + ex.input("in0", ncnn.Mat(in0.numpy()).clone()) + ex.input("in1", 
ncnn.Mat(in1.numpy()).clone()) + ex.input("in2", ncnn.Mat(in2.numpy()).clone()) + + _, out0 = ex.extract("out0") + out.append(torch.from_numpy(np.array(out0))) + _, out1 = ex.extract("out1") + out.append(torch.from_numpy(np.array(out1))) + _, out2 = ex.extract("out2") + out.append(torch.from_numpy(np.array(out2))) + _, out3 = ex.extract("out3") + out.append(torch.from_numpy(np.array(out3))) + _, out4 = ex.extract("out4") + out.append(torch.from_numpy(np.array(out4))) + _, out5 = ex.extract("out5") + out.append(torch.from_numpy(np.array(out5))) + + if len(out) == 1: + return out[0] + else: + return tuple(out) + +if __name__ == "__main__": + print(test_inference()) From e95770e0bb0fcfef0ca74693d60af18054da3b75 Mon Sep 17 00:00:00 2001 From: vlordier Date: Fri, 27 Feb 2026 14:43:11 +0100 Subject: [PATCH 05/29] topk: align with codebase style and expand ONNX coverage use c++03-style topk comparator and keep deterministic nan/inf ordering remove redundant constructor param initialization fix tests cmakelists alphabetical order (Tile before TopK) expand torch_topk onnx tests (k=0/k=1, negative dim, sorted=false cases) drop generated topk onnx/pnnx/ncnn sidecar artifacts from repo --- src/layer/topk.cpp | 115 ++++++------ tests/CMakeLists.txt | 2 +- tests/test_topk.cpp | 174 +++++++++++++++++- .../tests/onnx/test_torch_topk.ncnn.param | 11 -- tools/pnnx/tests/onnx/test_torch_topk.onnx | Bin 3317 -> 0 bytes .../pnnx/tests/onnx/test_torch_topk.pnnx.onnx | Bin 882 -> 0 bytes .../tests/onnx/test_torch_topk.pnnx.param | 17 -- .../tests/onnx/test_torch_topk.pnnxsim.onnx | Bin 2861 -> 0 bytes tools/pnnx/tests/onnx/test_torch_topk.py | 50 ++++- tools/pnnx/tests/onnx/test_torch_topk_ncnn.py | 40 ---- tools/pnnx/tests/onnx/test_torch_topk_pnnx.py | 109 ----------- 11 files changed, 281 insertions(+), 237 deletions(-) delete mode 100644 tools/pnnx/tests/onnx/test_torch_topk.ncnn.param delete mode 100644 tools/pnnx/tests/onnx/test_torch_topk.onnx delete mode 100644 
tools/pnnx/tests/onnx/test_torch_topk.pnnx.onnx delete mode 100644 tools/pnnx/tests/onnx/test_torch_topk.pnnx.param delete mode 100644 tools/pnnx/tests/onnx/test_torch_topk.pnnxsim.onnx delete mode 100644 tools/pnnx/tests/onnx/test_torch_topk_ncnn.py delete mode 100644 tools/pnnx/tests/onnx/test_torch_topk_pnnx.py diff --git a/src/layer/topk.cpp b/src/layer/topk.cpp index c65dbc9689ba..72b4df40813d 100644 --- a/src/layer/topk.cpp +++ b/src/layer/topk.cpp @@ -4,19 +4,58 @@ #include "topk.h" #include +#include +#include #include namespace ncnn { +static inline bool topk_isnan(float v) +{ + uint32_t u; + memcpy(&u, &v, sizeof(uint32_t)); + return (u & 0x7fffffff) > 0x7f800000; +} + +static inline bool topk_pair_comp(const std::pair& a, const std::pair& b, bool largest) +{ + const bool a_nan = topk_isnan(a.first); + const bool b_nan = topk_isnan(b.first); + + // Keep NaN at the end for both largest/smallest to ensure deterministic ordering. + if (a_nan || b_nan) + { + if (a_nan != b_nan) + return !a_nan && b_nan; + + return a.second < b.second; + } + + if (a.first != b.first) + return largest ? (a.first > b.first) : (a.first < b.first); + + return a.second < b.second; +} + +struct topk_pair_comparator +{ + topk_pair_comparator(bool _largest) + : largest(_largest) + { + } + + bool operator()(const std::pair& a, const std::pair& b) const + { + return topk_pair_comp(a, b, largest); + } + + bool largest; +}; + TopK::TopK() { one_blob_only = false; support_inplace = false; - - axis = -1; - largest = 1; - sorted = 1; - k = 1; } int TopK::load_param(const ParamDict& pd) @@ -49,10 +88,10 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl if (bottom_blob.dims < 1 || bottom_blob.dims > 4) return -100; - int dims = bottom_blob.dims; + const int dims = bottom_blob.dims; - int axis_p = axis < 0 ? axis + dims : axis; - if (axis_p < 0 || axis_p >= dims) + const int positive_axis = axis < 0 ? 
axis + dims : axis; + if (positive_axis < 0 || positive_axis >= dims) return -1; int shape[4] = {1, 1, 1, 1}; @@ -61,7 +100,7 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl if (dims >= 3) shape[2] = bottom_blob.dims == 3 ? bottom_blob.c : bottom_blob.d; if (dims >= 4) shape[3] = bottom_blob.c; - int axis_size = shape[axis_p]; + const int axis_size = shape[positive_axis]; if (axis_size <= 0) return -1; @@ -71,7 +110,7 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl _k = axis_size; int out_shape[4] = {shape[0], shape[1], shape[2], shape[3]}; - out_shape[axis_p] = _k; + out_shape[positive_axis] = _k; Mat values; if (dims == 1) values.create(out_shape[0], 4u, opt.blob_allocator); @@ -97,23 +136,23 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl float* outidxptr = indices; int inner = 1; - for (int i = 0; i < axis_p; i++) + for (int i = 0; i < positive_axis; i++) { inner *= shape[i]; } int outer = 1; - for (int i = axis_p + 1; i < dims; i++) + for (int i = positive_axis + 1; i < dims; i++) { outer *= shape[i]; } - const bool largest_p = largest != 0; - const bool sorted_p = sorted != 0; + const bool largest_flag = largest != 0; + const bool sorted_flag = sorted != 0; const int total_lines = outer * inner; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int line = 0; line < total_lines; line++) { int outer_i = line / inner; @@ -131,49 +170,19 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl vec[j].second = j; } - if (largest_p) + topk_pair_comparator comp(largest_flag); + + if (_k < axis_size) { - auto comp = [](const std::pair& a, const std::pair& b) - { - if (a.first != b.first) - return a.first > b.first; - return a.second < b.second; - }; - - if (_k < axis_size) - { - if (sorted_p) - std::partial_sort(vec.begin(), vec.begin() + _k, vec.end(), comp); - else - std::nth_element(vec.begin(), 
vec.begin() + _k, vec.end(), comp); - } + if (sorted_flag) + std::partial_sort(vec.begin(), vec.begin() + _k, vec.end(), comp); else - { - if (sorted_p) - std::sort(vec.begin(), vec.end(), comp); - } + std::nth_element(vec.begin(), vec.begin() + _k, vec.end(), comp); } else { - auto comp = [](const std::pair& a, const std::pair& b) - { - if (a.first != b.first) - return a.first < b.first; - return a.second < b.second; - }; - - if (_k < axis_size) - { - if (sorted_p) - std::partial_sort(vec.begin(), vec.begin() + _k, vec.end(), comp); - else - std::nth_element(vec.begin(), vec.begin() + _k, vec.end(), comp); - } - else - { - if (sorted_p) - std::sort(vec.begin(), vec.end(), comp); - } + if (sorted_flag) + std::sort(vec.begin(), vec.end(), comp); } for (int j = 0; j < _k; j++) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 4f40f8279428..35df0d37a967 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -166,8 +166,8 @@ ncnn_add_layer_test(Spectrogram) ncnn_add_layer_test(Squeeze) ncnn_add_layer_test(Swish) ncnn_add_layer_test(TanH) -ncnn_add_layer_test(TopK) ncnn_add_layer_test(Tile) +ncnn_add_layer_test(TopK) ncnn_add_layer_test(UnaryOp) ncnn_add_layer_test(Unfold) ncnn_add_layer_test(Yolov3DetectionOutput) diff --git a/tests/test_topk.cpp b/tests/test_topk.cpp index 7b7fe82690ba..b35be1574b18 100644 --- a/tests/test_topk.cpp +++ b/tests/test_topk.cpp @@ -3,6 +3,52 @@ #include "testutil.h" +#include + +static int test_topk_cpu_forward(const ncnn::Mat& a, int axis, int k, int largest, int sorted, ncnn::Mat& values, ncnn::Mat& indices) +{ + ncnn::ParamDict pd; + pd.set(0, axis); + pd.set(1, largest); + pd.set(2, sorted); + pd.set(3, k); + + std::vector weights(0); + + ncnn::Option opt; + opt.num_threads = 1; + opt.use_vulkan_compute = false; + opt.use_packing_layout = false; + + ncnn::Layer* op = ncnn::create_layer_cpu("TopK"); + if (!op) + return -1; + + op->load_param(pd); + + ncnn::ModelBinFromMatArray mb(weights.data()); + 
op->load_model(mb); + + op->create_pipeline(opt); + + std::vector bottom_blobs(1); + bottom_blobs[0] = a; + + std::vector top_blobs(2); + int ret = op->forward(bottom_blobs, top_blobs, opt); + + op->destroy_pipeline(opt); + delete op; + + if (ret != 0) + return ret; + + values = top_blobs[0]; + indices = top_blobs[1]; + + return 0; +} + static int test_topk(const ncnn::Mat& a, int axis, int k, int largest, int sorted) { ncnn::ParamDict pd; @@ -76,6 +122,130 @@ static int test_topk_3() || test_topk(a, -4, 2, 1, 1); } +static int test_topk_inf_order() +{ + ncnn::Mat a(6); + float* ptr = a; + ptr[0] = 1.f; + ptr[1] = std::numeric_limits::infinity(); + ptr[2] = -2.f; + ptr[3] = -std::numeric_limits::infinity(); + ptr[4] = 0.5f; + ptr[5] = 3.f; + + ncnn::Mat values; + ncnn::Mat indices; + + int ret = test_topk_cpu_forward(a, 0, 2, 1, 1, values, indices); + if (ret != 0) + { + fprintf(stderr, "test_topk_inf_order largest failed ret=%d\n", ret); + return -1; + } + + const float* vptr = values; + const float* iptr = indices; + if (values.w != 2 || indices.w != 2 || vptr[0] != std::numeric_limits::infinity() || vptr[1] != 3.f || (int)iptr[0] != 1 || (int)iptr[1] != 5) + { + fprintf(stderr, "test_topk_inf_order largest result mismatch\n"); + return -1; + } + + ret = test_topk_cpu_forward(a, 0, 2, 0, 1, values, indices); + if (ret != 0) + { + fprintf(stderr, "test_topk_inf_order smallest failed ret=%d\n", ret); + return -1; + } + + vptr = values; + iptr = indices; + if (values.w != 2 || indices.w != 2 || vptr[0] != -std::numeric_limits::infinity() || vptr[1] != -2.f || (int)iptr[0] != 3 || (int)iptr[1] != 2) + { + fprintf(stderr, "test_topk_inf_order smallest result mismatch\n"); + return -1; + } + + return 0; +} + +static int test_topk_nan_robust() +{ + ncnn::Mat a(4); + float* ptr = a; + ptr[0] = 1.f; + ptr[1] = std::numeric_limits::quiet_NaN(); + ptr[2] = 2.f; + ptr[3] = -1.f; + + ncnn::Mat values; + ncnn::Mat indices; + + int ret = test_topk_cpu_forward(a, 0, 2, 1, 1, 
values, indices); + if (ret != 0) + { + fprintf(stderr, "test_topk_nan_robust sorted failed ret=%d\n", ret); + return -1; + } + + if (values.w != 2 || indices.w != 2) + { + fprintf(stderr, "test_topk_nan_robust sorted shape mismatch\n"); + return -1; + } + + const float* vptr = values; + const float* iptr = indices; + if (vptr[0] != 2.f || vptr[1] != 1.f || (int)iptr[0] != 2 || (int)iptr[1] != 0) + { + fprintf(stderr, "test_topk_nan_robust sorted largest mismatch\n"); + return -1; + } + + ret = test_topk_cpu_forward(a, 0, 2, 0, 1, values, indices); + if (ret != 0) + { + fprintf(stderr, "test_topk_nan_robust sorted smallest failed ret=%d\n", ret); + return -1; + } + + if (values.w != 2 || indices.w != 2) + { + fprintf(stderr, "test_topk_nan_robust sorted smallest shape mismatch\n"); + return -1; + } + + vptr = values; + iptr = indices; + if (vptr[0] != -1.f || vptr[1] != 1.f || (int)iptr[0] != 3 || (int)iptr[1] != 0) + { + fprintf(stderr, "test_topk_nan_robust sorted smallest mismatch\n"); + return -1; + } + + ret = test_topk_cpu_forward(a, 0, 2, 1, 0, values, indices); + if (ret != 0) + { + fprintf(stderr, "test_topk_nan_robust unsorted failed ret=%d\n", ret); + return -1; + } + + if (values.w != 2 || indices.w != 2) + { + fprintf(stderr, "test_topk_nan_robust unsorted shape mismatch\n"); + return -1; + } + + iptr = indices; + if ((int)iptr[0] < 0 || (int)iptr[0] >= 4 || (int)iptr[1] < 0 || (int)iptr[1] >= 4) + { + fprintf(stderr, "test_topk_nan_robust unsorted invalid indices\n"); + return -1; + } + + return 0; +} + int main() { SRAND(7767517); @@ -84,5 +254,7 @@ int main() || test_topk_0() || test_topk_1() || test_topk_2() - || test_topk_3(); + || test_topk_3() + || test_topk_inf_order() + || test_topk_nan_robust(); } diff --git a/tools/pnnx/tests/onnx/test_torch_topk.ncnn.param b/tools/pnnx/tests/onnx/test_torch_topk.ncnn.param deleted file mode 100644 index f15762f83651..000000000000 --- a/tools/pnnx/tests/onnx/test_torch_topk.ncnn.param +++ /dev/null @@ -1,11 
+0,0 @@ -7767517 -9 12 -Input in0 0 1 in0 -Input in1 0 1 in1 -Input in2 0 1 in2 -pnnx.Expression pnnx_expr_2 0 1 3 -TopK topk_0 2 2 in0 3 out0 out1 0=1 1=1 2=1 -pnnx.Expression pnnx_expr_1 0 1 6 -TopK topk_1 2 2 in1 6 out2 out3 0=3 1=0 2=1 -pnnx.Expression pnnx_expr_0 0 1 9 -TopK topk_2 2 2 in2 9 out4 out5 0=0 1=1 2=1 diff --git a/tools/pnnx/tests/onnx/test_torch_topk.onnx b/tools/pnnx/tests/onnx/test_torch_topk.onnx deleted file mode 100644 index e57e7e63ec365e26943043ad0202d1152ca55191..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3317 zcmc(h&u`l{6vrdStraE<3@$5L9|poh;nWZ$ONsl#Kv1AS3&g|PVoflhLCUEz<7k#C zL8M#DMK3J|?9giuI}F%u|HRJQX@9`}mmT&b|B(D6Sk}6T0m9^y$oGAGA3ahcY{|GDcl8!Kv3s6u?xQb*P8)2XSNA#Z>yBp&K~G&4+F*-t<)|{fRz)L~xrHjn z&9@4=GPBl6pT=Qf_j>G7~J7J$rkLa?+GJ-turK~Mi}ufCS7w|4W2N1l z(q;;fc^-sTVx-0ht-_#rD~nKdAwtyr#1OqZ8=^3Qh?>jADALHTMj$5-Y)`ORnzT7U zvNUa`G*R=yc)B^qQ#9czI)kS_fTzKbrwl~9DNpq*Pu6*ON=nq2rx>nt@nn5so_;)= zr_PWkYq>lP5}0Zq4w7|YAl;b*X)pn4uoxtal2fJ~+QDh{3sor=?hVhlrkYzB{<3DX zspc1$t){vCd1m!dK81=qiqMU4s8L zXzLfO?HABCz$J7Qv~7W&O=v2#u|9OM`kp^SlN--;Vsjv}SiXyD|CyObJvo)x^(3>8 z4i6x;&%a0MMJhq3FhOTY9$Kjf>kJ6;A0J1?%TD5@Fo)U_yqu=7>}H^eKeB0PJ~}TM zPxZu8A;EL`o__dze6>bMRSP%|u_$Asi5D2HtnrQVkFtQl`$>&qt;eAcb*4L8Z1A57 z`UfO`d%Kb6u15b!!HUU diff --git a/tools/pnnx/tests/onnx/test_torch_topk.pnnx.onnx b/tools/pnnx/tests/onnx/test_torch_topk.pnnx.onnx deleted file mode 100644 index 83b5d3a0f7a0476395b71a8e3c1232fa127a2904..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 882 zcmaiyO;5r=5QZ6AOC1oK4FOUQnjSe|w;z0X*JwO=@lHr=42>xbEeSt|f9PLuW~qd> z5f78iyzlHj?{53>ZD3D;Yip~-budkm-S{O*w>hhlRasR%R=tvXFTl6kym=Ar(#^ti zuGLA8)I?dfS|_6p>TEgS?LosQ1Q)><5C{lu<9(gJE?h_Fb<{?Exm#sJ*h6#F#n#ty z^BHCNp}#^STW|8{L$yfY$thT#0S5(GB1{BK1x-1?%YQRY<<1WT}M6n_-&i>2O~XHx~%C_rF%7frgXoo50Ek{Bd8lv z8DYtY2lO?CK+8nY!KMjR8kH5_AV|^29vN0)T+n(+)sAoJMTCt*GJ7O+sZ6Th0gT)H 
zhZv}v4bfmmP+>G_+F~PAM(YSa7_@zI+)hcRerT7v+<$6^zM|yBwj5*|T0{UP@!{8_mp_p-5H81K872O4 zb;Y%urqyi!x{m(T(Q?3>=*n|1q)n_xLz&&3>dI~uKcdZ=o<192p%oD&l67xrAGA_q z6_rCT=`kU*{57O|qd=%kr-iO^Z@@(C`3!Hg38e!s;FMb5M_m@$GK%|c;z`D>^wsLJ zBX1bW&JMa6_j{m$M=RodWci^d1U(K|;(<2C_5J#6Uv-nz3mw%DI&UkRg&{o}a$g1!-$}yTy@{fk#R0WW+_Ezg3K~x zI^Rtr(^!ZMb-pk(znl+EqM)J9=f!4hk}Z?K96a2|r}_WE zaW4&=dj)V#PQf`@3=T$W7wIKleBAiP&?@Eo*{;y?f{idnS z4`Wm@G<+SE3=PsUq)KfY*=K7v8^y5qS!!r1f*$weFk#jXkB9PU%zUd4OC?5oPaph9 zcOF04+lA(>|Ij>!H9ZipAxf-2jyQYvyK?5>b+iV~D!H>O@VpA?+9kVtuiUw~if(|r zHE^^F$$&I*h%VRRVHIU3nc0~wfx}8(=BQaOpU!$#c2l);&xyN!n3Zp;@^WEb^uwBJ zzcm$^l%|=;%#xuE-nH;o%L!)X02c|%>L0PV+TlF|6 u!=k)M-drqiEtXre@@wrbaxeV+#N$0i{fG-VVg+0ZTzdKHzK)hEYo7plmS9`} diff --git a/tools/pnnx/tests/onnx/test_torch_topk.py b/tools/pnnx/tests/onnx/test_torch_topk.py index fe3d15c99b84..d62db5990003 100644 --- a/tools/pnnx/tests/onnx/test_torch_topk.py +++ b/tools/pnnx/tests/onnx/test_torch_topk.py @@ -9,17 +9,55 @@ class Model(nn.Module): def __init__(self): super(Model, self).__init__() - def forward(self, x, y, z): + def forward(self, x, y, z, u, v): x_values, x_indices = torch.topk( x, 2, dim=1, largest=True, sorted=True ) + x_k1_values, x_k1_indices = torch.topk( + x, 1, dim=1, largest=True, sorted=True + ) + x_k0_values, x_k0_indices = torch.topk( + x, 0, dim=1, largest=True, sorted=True + ) + x_unsorted_values, x_unsorted_indices = torch.topk( + x, 2, dim=1, largest=True, sorted=False + ) y_values, y_indices = torch.topk( y, 4, dim=3, largest=False, sorted=True ) z_values, z_indices = torch.topk( z, 3, dim=0, largest=True, sorted=True ) - return x_values, x_indices, y_values, y_indices, z_values, z_indices + z_unsorted_values, z_unsorted_indices = torch.topk( + z, 3, dim=0, largest=True, sorted=False + ) + u_values, u_indices = torch.topk( + u, 2, dim=-1, largest=True, sorted=True + ) + v_values, v_indices = torch.topk( + v, 2, dim=1, largest=True, sorted=True + ) + + return ( + x_values, + x_indices, + x_k1_values, + x_k1_indices, 
+ x_k0_values, + x_k0_indices, + x_unsorted_values, + x_unsorted_indices, + y_values, + y_indices, + z_values, + z_indices, + z_unsorted_values, + z_unsorted_indices, + u_values, + u_indices, + v_values, + v_indices, + ) def test(): @@ -30,18 +68,20 @@ def test(): x = torch.rand(1, 3, 16) y = torch.rand(1, 5, 9, 11) z = torch.rand(14, 8, 5, 9, 10) + u = torch.rand(2, 8, 4) + v = torch.rand(2, 4, 3) - a = net(x, y, z) + a = net(x, y, z, u, v) # export onnx - torch.onnx.export(net, (x, y, z), "test_torch_topk.onnx") + torch.onnx.export(net, (x, y, z, u, v), "test_torch_topk.onnx") # onnx to pnnx import os os.system( "../../src/pnnx test_torch_topk.onnx " - "inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]" + "inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10],[2,8,4],[2,4,3]" ) # pnnx inference diff --git a/tools/pnnx/tests/onnx/test_torch_topk_ncnn.py b/tools/pnnx/tests/onnx/test_torch_topk_ncnn.py deleted file mode 100644 index bcb84b7afc45..000000000000 --- a/tools/pnnx/tests/onnx/test_torch_topk_ncnn.py +++ /dev/null @@ -1,40 +0,0 @@ -import numpy as np -import ncnn -import torch - -def test_inference(): - torch.manual_seed(0) - in0 = torch.rand(1, 3, 16, dtype=torch.float) - in1 = torch.rand(1, 5, 9, 11, dtype=torch.float) - in2 = torch.rand(14, 8, 5, 9, 10, dtype=torch.float) - out = [] - - with ncnn.Net() as net: - net.load_param("test_torch_topk.ncnn.param") - net.load_model("test_torch_topk.ncnn.bin") - - with net.create_extractor() as ex: - ex.input("in0", ncnn.Mat(in0.numpy()).clone()) - ex.input("in1", ncnn.Mat(in1.numpy()).clone()) - ex.input("in2", ncnn.Mat(in2.numpy()).clone()) - - _, out0 = ex.extract("out0") - out.append(torch.from_numpy(np.array(out0))) - _, out1 = ex.extract("out1") - out.append(torch.from_numpy(np.array(out1))) - _, out2 = ex.extract("out2") - out.append(torch.from_numpy(np.array(out2))) - _, out3 = ex.extract("out3") - out.append(torch.from_numpy(np.array(out3))) - _, out4 = ex.extract("out4") - out.append(torch.from_numpy(np.array(out4))) 
- _, out5 = ex.extract("out5") - out.append(torch.from_numpy(np.array(out5))) - - if len(out) == 1: - return out[0] - else: - return tuple(out) - -if __name__ == "__main__": - print(test_inference()) diff --git a/tools/pnnx/tests/onnx/test_torch_topk_pnnx.py b/tools/pnnx/tests/onnx/test_torch_topk_pnnx.py deleted file mode 100644 index 2b4e7ed5abae..000000000000 --- a/tools/pnnx/tests/onnx/test_torch_topk_pnnx.py +++ /dev/null @@ -1,109 +0,0 @@ -import os -import numpy as np -import tempfile, zipfile -import torch -import torch.nn as nn -import torch.nn.functional as F -try: - import torchvision - import torchaudio -except: - pass - -class TopK(nn.Module): - def __init__(self, axis=1, largest=1, sorted=1): - super(TopK, self).__init__() - self.axis = axis - self.largest = largest - self.sorted = sorted - def forward(self, x, k): - # Torch topk returns (values, indices) - return torch.topk(x, k.item() if hasattr(k, 'item') else k, dim=self.axis, largest=bool(self.largest), sorted=bool(self.sorted)) - -class Model(nn.Module): - def __init__(self): - super(Model, self).__init__() - - self.TopK_0 = TopK(axis=1, largest=1, sorted=1) - self.TopK_1 = TopK(axis=3, largest=0, sorted=1) - self.TopK_2 = TopK(axis=0, largest=1, sorted=1) - - archive = zipfile.ZipFile('test_torch_topk.pnnx.bin', 'r') - archive.close() - - def load_pnnx_bin_as_parameter(self, archive, key, shape, dtype, requires_grad=True): - return nn.Parameter(self.load_pnnx_bin_as_tensor(archive, key, shape, dtype), requires_grad) - - def load_pnnx_bin_as_tensor(self, archive, key, shape, dtype): - fd, tmppath = tempfile.mkstemp() - with os.fdopen(fd, 'wb') as tmpf, archive.open(key) as keyfile: - tmpf.write(keyfile.read()) - m = np.memmap(tmppath, dtype=dtype, mode='r', shape=shape).copy() - os.remove(tmppath) - return torch.from_numpy(m) - - def forward(self, v_0, v_1, v_2): - v_3 = 2 - v_4, v_5 = self.TopK_0(v_0, v_3) - v_6 = 4 - v_7, v_8 = self.TopK_1(v_1, v_6) - v_9 = 3 - v_10, v_11 = self.TopK_2(v_2, 
v_9) - return v_4, v_5, v_7, v_8, v_10, v_11 - -def export_torchscript(): - net = Model() - net.float() - net.eval() - - torch.manual_seed(0) - v_0 = torch.rand(1, 3, 16, dtype=torch.float) - v_1 = torch.rand(1, 5, 9, 11, dtype=torch.float) - v_2 = torch.rand(14, 8, 5, 9, 10, dtype=torch.float) - - mod = torch.jit.trace(net, (v_0, v_1, v_2)) - mod.save("test_torch_topk_pnnx.py.pt") - -def export_onnx(): - net = Model() - net.float() - net.eval() - - torch.manual_seed(0) - v_0 = torch.rand(1, 3, 16, dtype=torch.float) - v_1 = torch.rand(1, 5, 9, 11, dtype=torch.float) - v_2 = torch.rand(14, 8, 5, 9, 10, dtype=torch.float) - - torch.onnx.export(net, (v_0, v_1, v_2), "test_torch_topk_pnnx.py.onnx", export_params=True, operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK, opset_version=13, input_names=['in0', 'in1', 'in2'], output_names=['out0', 'out1', 'out2', 'out3', 'out4', 'out5']) - -def export_pnnx(): - net = Model() - net.float() - net.eval() - - torch.manual_seed(0) - v_0 = torch.rand(1, 3, 16, dtype=torch.float) - v_1 = torch.rand(1, 5, 9, 11, dtype=torch.float) - v_2 = torch.rand(14, 8, 5, 9, 10, dtype=torch.float) - - import pnnx - pnnx.export(net, "test_torch_topk_pnnx.py.pt", (v_0, v_1, v_2)) - -def export_ncnn(): - export_pnnx() - -@torch.no_grad() -def test_inference(): - net = Model() - net.float() - net.eval() - - torch.manual_seed(0) - v_0 = torch.rand(1, 3, 16, dtype=torch.float) - v_1 = torch.rand(1, 5, 9, 11, dtype=torch.float) - v_2 = torch.rand(14, 8, 5, 9, 10, dtype=torch.float) - - return net(v_0, v_1, v_2) - -if __name__ == "__main__": - print(test_inference()) From 4b4b87a7c74086cae9b0d30a27ca26f12ac83738 Mon Sep 17 00:00:00 2001 From: vlordier Date: Fri, 27 Feb 2026 15:11:34 +0100 Subject: [PATCH 06/29] tests: add sorted=0 coverage for topk --- tests/test_topk.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_topk.cpp b/tests/test_topk.cpp index b35be1574b18..55a95ef56bf0 100644 --- a/tests/test_topk.cpp 
+++ b/tests/test_topk.cpp @@ -79,6 +79,7 @@ static int test_topk_0() || test_topk(a, 0, 1, 1, 1) || test_topk(a, 0, 5, 1, 1) || test_topk(a, -1, 7, 0, 1) + || test_topk(a, 0, 4, 1, 0) || test_topk(a, 0, 9, 1, 1); } @@ -91,6 +92,7 @@ static int test_topk_1() || test_topk(a, 0, 5, 1, 1) || test_topk(a, 1, 3, 1, 1) || test_topk(a, -1, 8, 0, 1) + || test_topk(a, 1, 6, 0, 0) || test_topk(a, -2, 7, 1, 1); } @@ -102,6 +104,7 @@ static int test_topk_2() || test_topk(a, 0, 3, 1, 1) || test_topk(a, 1, 4, 1, 1) || test_topk(a, 2, 2, 0, 1) + || test_topk(a, 2, 5, 1, 0) || test_topk(a, -1, 6, 1, 1) || test_topk(a, -2, 5, 0, 1) || test_topk(a, -3, 7, 1, 1); @@ -115,6 +118,7 @@ static int test_topk_3() || test_topk(a, 0, 2, 1, 1) || test_topk(a, 1, 3, 0, 1) || test_topk(a, 2, 4, 1, 1) + || test_topk(a, 3, 4, 0, 0) || test_topk(a, 3, 5, 1, 1) || test_topk(a, -1, 6, 0, 1) || test_topk(a, -2, 3, 1, 1) From c9e856e8f59e3faad636a7523976401048a7d1da Mon Sep 17 00:00:00 2001 From: vlordier Date: Fri, 27 Feb 2026 15:29:13 +0100 Subject: [PATCH 07/29] tests: remove generated topk onnx artifacts --- tools/pnnx/tests/onnx/test_torch_topk.ncnn.bin | 0 tools/pnnx/tests/onnx/test_torch_topk.onnx.data | 0 tools/pnnx/tests/onnx/test_torch_topk.pnnx.bin | Bin 98 -> 0 bytes 3 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tools/pnnx/tests/onnx/test_torch_topk.ncnn.bin delete mode 100644 tools/pnnx/tests/onnx/test_torch_topk.onnx.data delete mode 100644 tools/pnnx/tests/onnx/test_torch_topk.pnnx.bin diff --git a/tools/pnnx/tests/onnx/test_torch_topk.ncnn.bin b/tools/pnnx/tests/onnx/test_torch_topk.ncnn.bin deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tools/pnnx/tests/onnx/test_torch_topk.onnx.data b/tools/pnnx/tests/onnx/test_torch_topk.onnx.data deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tools/pnnx/tests/onnx/test_torch_topk.pnnx.bin b/tools/pnnx/tests/onnx/test_torch_topk.pnnx.bin deleted file mode 100644 index 
aa99d4621ed08e4d5412634fb912b37433a365d8..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 98 gcmWIWW@FP~ARPpFv#}%VVgzymyjj`)qX7m603eYP6951J From 4d5b35fed2d6b0c910e01aa4735fe3e6fb13b3c9 Mon Sep 17 00:00:00 2001 From: vlordier Date: Fri, 27 Feb 2026 16:27:51 +0100 Subject: [PATCH 08/29] pnnx: drop unrelated cmake and symlink changes --- tools/pnnx/src/CMakeLists.txt | 12 +++++------- tools/pnnx/src/pnnx | 1 - 2 files changed, 5 insertions(+), 8 deletions(-) delete mode 120000 tools/pnnx/src/pnnx diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt index 6231e36b16ac..c554a6873e81 100644 --- a/tools/pnnx/src/CMakeLists.txt +++ b/tools/pnnx/src/CMakeLists.txt @@ -630,25 +630,23 @@ if(PROTOBUF_FOUND) set(CMAKE_CXX_STANDARD 17) endif() - if(COMMAND protobuf_generate_cpp) + if(Protobuf_FOUND OR protobuf_MODULE_COMPATIBLE) protobuf_generate_cpp(ONNX_PROTO_SRCS ONNX_PROTO_HDRS onnx-data.proto onnx-ml.proto onnx-operators-ml.proto) add_library(onnxproto STATIC ${ONNX_PROTO_SRCS} ${ONNX_PROTO_HDRS}) target_include_directories(onnxproto PUBLIC ${PROTOBUF_INCLUDE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) target_link_libraries(onnxproto PUBLIC ${PROTOBUF_LIBRARIES}) - elseif(COMMAND protobuf_generate) + else() add_library(onnxproto STATIC onnx-data.proto onnx-ml.proto onnx-operators-ml.proto) target_include_directories(onnxproto PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) protobuf_generate(TARGET onnxproto) target_link_libraries(onnxproto PUBLIC protobuf::libprotobuf) - else() - message(FATAL_ERROR "Neither protobuf_generate_cpp nor protobuf_generate is available. 
Please install protobuf with CMake codegen support.") endif() # use onnxruntime onnx proto if found if(onnxruntime_FOUND) add_dependencies(onnxruntime::onnxruntime onnxproto) - if(COMMAND protobuf_generate_cpp) + if(Protobuf_FOUND OR protobuf_MODULE_COMPATIBLE) set_property(TARGET onnxruntime::onnxruntime APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${PROTOBUF_INCLUDE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) set_property(TARGET onnxruntime::onnxruntime APPEND PROPERTY INTERFACE_LINK_LIBRARIES ${PROTOBUF_LIBRARIES}) else() @@ -690,7 +688,7 @@ if(PROTOBUF_FOUND) save_onnx.cpp ) if(onnxruntime_FOUND) - target_link_libraries(pnnx2onnx PRIVATE onnxruntime::onnxruntime onnxproto) + target_link_libraries(pnnx2onnx PRIVATE onnxruntime::onnxruntime) else() target_link_libraries(pnnx2onnx PRIVATE onnxproto) endif() @@ -722,7 +720,7 @@ if(onnxruntime_FOUND) ) add_library(onnx2pnnx OBJECT ${onnx2pnnx_SRCS}) - target_link_libraries(onnx2pnnx PRIVATE onnxruntime::onnxruntime onnxproto) + target_link_libraries(onnx2pnnx PRIVATE onnxruntime::onnxruntime) target_compile_definitions(onnx2pnnx PRIVATE BUILD_ONNX2PNNX) message(STATUS "Building with onnx2pnnx") diff --git a/tools/pnnx/src/pnnx b/tools/pnnx/src/pnnx deleted file mode 120000 index 909f9eae4b3f..000000000000 --- a/tools/pnnx/src/pnnx +++ /dev/null @@ -1 +0,0 @@ -../build/src/pnnx \ No newline at end of file From 5c11058f6c8e543d27bc5a5c4b1ad6dabed11eab Mon Sep 17 00:00:00 2001 From: vlordier Date: Fri, 27 Feb 2026 16:32:46 +0100 Subject: [PATCH 09/29] topk: reuse per-thread scratch buffer in forward --- src/layer/topk.cpp | 63 ++++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/src/layer/topk.cpp b/src/layer/topk.cpp index 72b4df40813d..2c9554ae06a9 100644 --- a/src/layer/topk.cpp +++ b/src/layer/topk.cpp @@ -152,44 +152,47 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl const int total_lines = outer * inner; - #pragma omp parallel for 
num_threads(opt.num_threads) - for (int line = 0; line < total_lines; line++) + #pragma omp parallel num_threads(opt.num_threads) { - int outer_i = line / inner; - int inner_i = line - outer_i * inner; - - int in_base = outer_i * axis_size * inner + inner_i; - int out_base = outer_i * _k * inner + inner_i; - std::vector > vec; vec.resize(axis_size); - for (int j = 0; j < axis_size; j++) - { - vec[j].first = ptr[in_base + j * inner]; - vec[j].second = j; - } - topk_pair_comparator comp(largest_flag); - if (_k < axis_size) + #pragma omp for + for (int line = 0; line < total_lines; line++) { - if (sorted_flag) - std::partial_sort(vec.begin(), vec.begin() + _k, vec.end(), comp); + int outer_i = line / inner; + int inner_i = line - outer_i * inner; + + int in_base = outer_i * axis_size * inner + inner_i; + int out_base = outer_i * _k * inner + inner_i; + + for (int j = 0; j < axis_size; j++) + { + vec[j].first = ptr[in_base + j * inner]; + vec[j].second = j; + } + + if (_k < axis_size) + { + if (sorted_flag) + std::partial_sort(vec.begin(), vec.begin() + _k, vec.end(), comp); + else + std::nth_element(vec.begin(), vec.begin() + _k, vec.end(), comp); + } else - std::nth_element(vec.begin(), vec.begin() + _k, vec.end(), comp); - } - else - { - if (sorted_flag) - std::sort(vec.begin(), vec.end(), comp); - } - - for (int j = 0; j < _k; j++) - { - outptr[out_base + j * inner] = vec[j].first; - if (outidxptr) - outidxptr[out_base + j * inner] = (float)vec[j].second; + { + if (sorted_flag) + std::sort(vec.begin(), vec.end(), comp); + } + + for (int j = 0; j < _k; j++) + { + outptr[out_base + j * inner] = vec[j].first; + if (outidxptr) + outidxptr[out_base + j * inner] = (float)vec[j].second; + } } } From 226bd88c4ead69883085b9dcf52e73d3be070057 Mon Sep 17 00:00:00 2001 From: vlordier Date: Fri, 27 Feb 2026 16:34:48 +0100 Subject: [PATCH 10/29] topk: optimize sorted path and k=0 fast return --- src/layer/topk.cpp | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 
deletion(-) diff --git a/src/layer/topk.cpp b/src/layer/topk.cpp index 2c9554ae06a9..77814c9e0600 100644 --- a/src/layer/topk.cpp +++ b/src/layer/topk.cpp @@ -131,6 +131,15 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl return -100; } + if (_k == 0) + { + top_blobs[0] = values; + if (top_blobs.size() >= 2) + top_blobs[1] = indices; + + return 0; + } + const float* ptr = bottom_blob; float* outptr = values; float* outidxptr = indices; @@ -177,7 +186,10 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl if (_k < axis_size) { if (sorted_flag) - std::partial_sort(vec.begin(), vec.begin() + _k, vec.end(), comp); + { + std::nth_element(vec.begin(), vec.begin() + _k, vec.end(), comp); + std::sort(vec.begin(), vec.begin() + _k, comp); + } else std::nth_element(vec.begin(), vec.begin() + _k, vec.end(), comp); } From 6c5978b0ab8f0478f8412d96d87585f05c56d779 Mon Sep 17 00:00:00 2001 From: vlordier Date: Fri, 27 Feb 2026 16:36:01 +0100 Subject: [PATCH 11/29] topk: add k=1 fast path for embedded runtime --- src/layer/topk.cpp | 36 ++++++++++++++++++++++++++++++++++++ tests/test_topk.cpp | 1 + 2 files changed, 37 insertions(+) diff --git a/src/layer/topk.cpp b/src/layer/topk.cpp index 77814c9e0600..d7a67fe87b33 100644 --- a/src/layer/topk.cpp +++ b/src/layer/topk.cpp @@ -161,6 +161,42 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl const int total_lines = outer * inner; + if (_k == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int line = 0; line < total_lines; line++) + { + int outer_i = line / inner; + int inner_i = line - outer_i * inner; + + int in_base = outer_i * axis_size * inner + inner_i; + int out_base = outer_i * inner + inner_i; + + float best_value = ptr[in_base]; + int best_index = 0; + + for (int j = 1; j < axis_size; j++) + { + const float candidate_value = ptr[in_base + j * inner]; + if (topk_pair_comp(std::make_pair(candidate_value, j), std::make_pair(best_value, 
best_index), largest_flag)) + { + best_value = candidate_value; + best_index = j; + } + } + + outptr[out_base] = best_value; + if (outidxptr) + outidxptr[out_base] = (float)best_index; + } + + top_blobs[0] = values; + if (top_blobs.size() >= 2) + top_blobs[1] = indices; + + return 0; + } + #pragma omp parallel num_threads(opt.num_threads) { std::vector > vec; diff --git a/tests/test_topk.cpp b/tests/test_topk.cpp index 55a95ef56bf0..0f9d8fee3a4e 100644 --- a/tests/test_topk.cpp +++ b/tests/test_topk.cpp @@ -78,6 +78,7 @@ static int test_topk_0() return 0 || test_topk(a, 0, 1, 1, 1) || test_topk(a, 0, 5, 1, 1) + || test_topk(a, 0, 1, 0, 0) || test_topk(a, -1, 7, 0, 1) || test_topk(a, 0, 4, 1, 0) || test_topk(a, 0, 9, 1, 1); From e16514bb00a95e73edf770922c2a399750cddad9 Mon Sep 17 00:00:00 2001 From: vlordier Date: Fri, 27 Feb 2026 16:37:07 +0100 Subject: [PATCH 12/29] topk: avoid pair temporaries in k=1 hot loop --- src/layer/topk.cpp | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/layer/topk.cpp b/src/layer/topk.cpp index d7a67fe87b33..d30af50c8d52 100644 --- a/src/layer/topk.cpp +++ b/src/layer/topk.cpp @@ -37,6 +37,25 @@ static inline bool topk_pair_comp(const std::pair& a, const std::pai return a.second < b.second; } +static inline bool topk_value_index_comp(float a_value, int a_index, float b_value, int b_index, bool largest) +{ + const bool a_nan = topk_isnan(a_value); + const bool b_nan = topk_isnan(b_value); + + if (a_nan || b_nan) + { + if (a_nan != b_nan) + return !a_nan && b_nan; + + return a_index < b_index; + } + + if (a_value != b_value) + return largest ? 
(a_value > b_value) : (a_value < b_value); + + return a_index < b_index; +} + struct topk_pair_comparator { topk_pair_comparator(bool _largest) @@ -178,7 +197,7 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl for (int j = 1; j < axis_size; j++) { const float candidate_value = ptr[in_base + j * inner]; - if (topk_pair_comp(std::make_pair(candidate_value, j), std::make_pair(best_value, best_index), largest_flag)) + if (topk_value_index_comp(candidate_value, j, best_value, best_index, largest_flag)) { best_value = candidate_value; best_index = j; From 00be7f82e60dc139991cb969b013df5fcfb5917a Mon Sep 17 00:00:00 2001 From: vlordier Date: Fri, 27 Feb 2026 16:39:32 +0100 Subject: [PATCH 13/29] topk: reduce writeback branching in hot loop --- src/layer/topk.cpp | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/layer/topk.cpp b/src/layer/topk.cpp index d30af50c8d52..3026b8088ffa 100644 --- a/src/layer/topk.cpp +++ b/src/layer/topk.cpp @@ -162,6 +162,7 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl const float* ptr = bottom_blob; float* outptr = values; float* outidxptr = indices; + const bool output_indices = outidxptr != 0; int inner = 1; for (int i = 0; i < positive_axis; i++) @@ -205,7 +206,7 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl } outptr[out_base] = best_value; - if (outidxptr) + if (output_indices) outidxptr[out_base] = (float)best_index; } @@ -254,11 +255,20 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl std::sort(vec.begin(), vec.end(), comp); } - for (int j = 0; j < _k; j++) + if (output_indices) { - outptr[out_base + j * inner] = vec[j].first; - if (outidxptr) + for (int j = 0; j < _k; j++) + { + outptr[out_base + j * inner] = vec[j].first; outidxptr[out_base + j * inner] = (float)vec[j].second; + } + } + else + { + for (int j = 0; j < _k; j++) + { + outptr[out_base + j * inner] = vec[j].first; + } } } } From 
1fe44637e330453a3b9a95ff0d54e2244e58fe03 Mon Sep 17 00:00:00 2001 From: vlordier Date: Fri, 27 Feb 2026 16:47:47 +0100 Subject: [PATCH 14/29] topk: fast path unsorted full-k copy --- src/layer/topk.cpp | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/src/layer/topk.cpp b/src/layer/topk.cpp index 3026b8088ffa..c87c485fc8e3 100644 --- a/src/layer/topk.cpp +++ b/src/layer/topk.cpp @@ -217,6 +217,41 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl return 0; } + if (_k == axis_size && !sorted_flag) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int line = 0; line < total_lines; line++) + { + int outer_i = line / inner; + int inner_i = line - outer_i * inner; + + int in_base = outer_i * axis_size * inner + inner_i; + int out_base = outer_i * _k * inner + inner_i; + + if (output_indices) + { + for (int j = 0; j < _k; j++) + { + outptr[out_base + j * inner] = ptr[in_base + j * inner]; + outidxptr[out_base + j * inner] = (float)j; + } + } + else + { + for (int j = 0; j < _k; j++) + { + outptr[out_base + j * inner] = ptr[in_base + j * inner]; + } + } + } + + top_blobs[0] = values; + if (top_blobs.size() >= 2) + top_blobs[1] = indices; + + return 0; + } + #pragma omp parallel num_threads(opt.num_threads) { std::vector > vec; From 6ea29eb6e380562f613dc11511e237070c997422 Mon Sep 17 00:00:00 2001 From: vlordier Date: Fri, 27 Feb 2026 16:49:20 +0100 Subject: [PATCH 15/29] topk: add small-k hot path for embedded runtime --- src/layer/topk.cpp | 72 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/src/layer/topk.cpp b/src/layer/topk.cpp index c87c485fc8e3..00d632068dd6 100644 --- a/src/layer/topk.cpp +++ b/src/layer/topk.cpp @@ -252,6 +252,78 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl return 0; } + if (_k <= 4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int line = 0; line < total_lines; line++) + { + 
int outer_i = line / inner; + int inner_i = line - outer_i * inner; + + int in_base = outer_i * axis_size * inner + inner_i; + int out_base = outer_i * _k * inner + inner_i; + + float top_values[4]; + int top_indices[4]; + int top_count = 0; + + for (int j = 0; j < axis_size; j++) + { + const float candidate_value = ptr[in_base + j * inner]; + + if (top_count < _k) + { + int insert_pos = top_count; + while (insert_pos > 0 && topk_value_index_comp(candidate_value, j, top_values[insert_pos - 1], top_indices[insert_pos - 1], largest_flag)) + { + top_values[insert_pos] = top_values[insert_pos - 1]; + top_indices[insert_pos] = top_indices[insert_pos - 1]; + insert_pos--; + } + + top_values[insert_pos] = candidate_value; + top_indices[insert_pos] = j; + top_count++; + } + else if (topk_value_index_comp(candidate_value, j, top_values[_k - 1], top_indices[_k - 1], largest_flag)) + { + int insert_pos = _k - 1; + while (insert_pos > 0 && topk_value_index_comp(candidate_value, j, top_values[insert_pos - 1], top_indices[insert_pos - 1], largest_flag)) + { + top_values[insert_pos] = top_values[insert_pos - 1]; + top_indices[insert_pos] = top_indices[insert_pos - 1]; + insert_pos--; + } + + top_values[insert_pos] = candidate_value; + top_indices[insert_pos] = j; + } + } + + if (output_indices) + { + for (int j = 0; j < _k; j++) + { + outptr[out_base + j * inner] = top_values[j]; + outidxptr[out_base + j * inner] = (float)top_indices[j]; + } + } + else + { + for (int j = 0; j < _k; j++) + { + outptr[out_base + j * inner] = top_values[j]; + } + } + } + + top_blobs[0] = values; + if (top_blobs.size() >= 2) + top_blobs[1] = indices; + + return 0; + } + #pragma omp parallel num_threads(opt.num_threads) { std::vector > vec; From 7befff69286b4abe9b538d65084f84213809f4b4 Mon Sep 17 00:00:00 2001 From: vlordier Date: Fri, 27 Feb 2026 16:51:56 +0100 Subject: [PATCH 16/29] topk: add guarded neon fast path for k=1 --- src/layer/topk.cpp | 75 ++++++++++++++++++++++++++++++++++++++++++++++ 1 
file changed, 75 insertions(+) diff --git a/src/layer/topk.cpp b/src/layer/topk.cpp index 00d632068dd6..f527021e40bb 100644 --- a/src/layer/topk.cpp +++ b/src/layer/topk.cpp @@ -4,10 +4,15 @@ #include "topk.h" #include +#include #include #include #include +#if __ARM_NEON +#include +#endif // __ARM_NEON + namespace ncnn { static inline bool topk_isnan(float v) @@ -192,6 +197,76 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl int in_base = outer_i * axis_size * inner + inner_i; int out_base = outer_i * inner + inner_i; +#if __ARM_NEON + if (!output_indices && inner == 1 && axis_size >= 4) + { + const float* lineptr = ptr + in_base; + + float best_value = largest_flag ? -FLT_MAX : FLT_MAX; + int j = 0; + int has_nan = 0; + + for (; j + 3 < axis_size; j += 4) + { + float32x4_t v = vld1q_f32(lineptr + j); + uint32x4_t nan_mask = vmvnq_u32(vceqq_f32(v, v)); + if (vmaxvq_u32(nan_mask) != 0) + { + has_nan = 1; + break; + } + + float tmp[4]; + vst1q_f32(tmp, v); + + if (largest_flag) + { + if (tmp[0] > best_value) best_value = tmp[0]; + if (tmp[1] > best_value) best_value = tmp[1]; + if (tmp[2] > best_value) best_value = tmp[2]; + if (tmp[3] > best_value) best_value = tmp[3]; + } + else + { + if (tmp[0] < best_value) best_value = tmp[0]; + if (tmp[1] < best_value) best_value = tmp[1]; + if (tmp[2] < best_value) best_value = tmp[2]; + if (tmp[3] < best_value) best_value = tmp[3]; + } + } + + if (!has_nan) + { + for (; j < axis_size; j++) + { + const float candidate_value = lineptr[j]; + if (topk_isnan(candidate_value)) + { + has_nan = 1; + break; + } + + if (largest_flag) + { + if (candidate_value > best_value) + best_value = candidate_value; + } + else + { + if (candidate_value < best_value) + best_value = candidate_value; + } + } + } + + if (!has_nan) + { + outptr[out_base] = best_value; + continue; + } + } +#endif // __ARM_NEON + float best_value = ptr[in_base]; int best_index = 0; From 5ba7fbcab1ec7aa2a0ce945461ab53ebce1049b9 Mon Sep 17 00:00:00 
2001 From: vlordier Date: Fri, 27 Feb 2026 16:56:02 +0100 Subject: [PATCH 17/29] topk: fix neon k=1 inf initialization edge case --- src/layer/topk.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/layer/topk.cpp b/src/layer/topk.cpp index f527021e40bb..dbab3b19ed20 100644 --- a/src/layer/topk.cpp +++ b/src/layer/topk.cpp @@ -4,7 +4,6 @@ #include "topk.h" #include -#include #include #include #include @@ -202,11 +201,11 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl { const float* lineptr = ptr + in_base; - float best_value = largest_flag ? -FLT_MAX : FLT_MAX; - int j = 0; - int has_nan = 0; + float best_value = lineptr[0]; + int j = 1; + int has_nan = topk_isnan(best_value); - for (; j + 3 < axis_size; j += 4) + for (; !has_nan && j + 3 < axis_size; j += 4) { float32x4_t v = vld1q_f32(lineptr + j); uint32x4_t nan_mask = vmvnq_u32(vceqq_f32(v, v)); From e4b4073935f9df6931188da31e00ee2eef3a84d4 Mon Sep 17 00:00:00 2001 From: vlordier Date: Fri, 27 Feb 2026 16:58:55 +0100 Subject: [PATCH 18/29] topk: make neon mask check arm-portable --- src/layer/topk.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/layer/topk.cpp b/src/layer/topk.cpp index dbab3b19ed20..59946b1d6e43 100644 --- a/src/layer/topk.cpp +++ b/src/layer/topk.cpp @@ -209,7 +209,9 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl { float32x4_t v = vld1q_f32(lineptr + j); uint32x4_t nan_mask = vmvnq_u32(vceqq_f32(v, v)); - if (vmaxvq_u32(nan_mask) != 0) + uint32_t nan_mask_lanes[4]; + vst1q_u32(nan_mask_lanes, nan_mask); + if (nan_mask_lanes[0] || nan_mask_lanes[1] || nan_mask_lanes[2] || nan_mask_lanes[3]) { has_nan = 1; break; From 49dbc7be2f4f7e56f4efc2848b8da4e80387bc00 Mon Sep 17 00:00:00 2001 From: vlordier Date: Fri, 27 Feb 2026 17:04:24 +0100 Subject: [PATCH 19/29] topk: optimize small-k unsorted selection path --- src/layer/topk.cpp | 72 +++++++++++++++++++++++++++++++++------------- 1 file 
changed, 52 insertions(+), 20 deletions(-) diff --git a/src/layer/topk.cpp b/src/layer/topk.cpp index 59946b1d6e43..10b7b1d2ccc0 100644 --- a/src/layer/topk.cpp +++ b/src/layer/topk.cpp @@ -343,36 +343,68 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl int top_indices[4]; int top_count = 0; - for (int j = 0; j < axis_size; j++) + if (sorted_flag) { - const float candidate_value = ptr[in_base + j * inner]; - - if (top_count < _k) + for (int j = 0; j < axis_size; j++) { - int insert_pos = top_count; - while (insert_pos > 0 && topk_value_index_comp(candidate_value, j, top_values[insert_pos - 1], top_indices[insert_pos - 1], largest_flag)) + const float candidate_value = ptr[in_base + j * inner]; + + if (top_count < _k) { - top_values[insert_pos] = top_values[insert_pos - 1]; - top_indices[insert_pos] = top_indices[insert_pos - 1]; - insert_pos--; + int insert_pos = top_count; + while (insert_pos > 0 && topk_value_index_comp(candidate_value, j, top_values[insert_pos - 1], top_indices[insert_pos - 1], largest_flag)) + { + top_values[insert_pos] = top_values[insert_pos - 1]; + top_indices[insert_pos] = top_indices[insert_pos - 1]; + insert_pos--; + } + + top_values[insert_pos] = candidate_value; + top_indices[insert_pos] = j; + top_count++; } + else if (topk_value_index_comp(candidate_value, j, top_values[_k - 1], top_indices[_k - 1], largest_flag)) + { + int insert_pos = _k - 1; + while (insert_pos > 0 && topk_value_index_comp(candidate_value, j, top_values[insert_pos - 1], top_indices[insert_pos - 1], largest_flag)) + { + top_values[insert_pos] = top_values[insert_pos - 1]; + top_indices[insert_pos] = top_indices[insert_pos - 1]; + insert_pos--; + } - top_values[insert_pos] = candidate_value; - top_indices[insert_pos] = j; - top_count++; + top_values[insert_pos] = candidate_value; + top_indices[insert_pos] = j; + } } - else if (topk_value_index_comp(candidate_value, j, top_values[_k - 1], top_indices[_k - 1], largest_flag)) + } + else + { + 
for (int j = 0; j < axis_size; j++) { - int insert_pos = _k - 1; - while (insert_pos > 0 && topk_value_index_comp(candidate_value, j, top_values[insert_pos - 1], top_indices[insert_pos - 1], largest_flag)) + const float candidate_value = ptr[in_base + j * inner]; + + if (top_count < _k) { - top_values[insert_pos] = top_values[insert_pos - 1]; - top_indices[insert_pos] = top_indices[insert_pos - 1]; - insert_pos--; + top_values[top_count] = candidate_value; + top_indices[top_count] = j; + top_count++; } + else + { + int worst_pos = 0; + for (int t = 1; t < _k; t++) + { + if (topk_value_index_comp(top_values[worst_pos], top_indices[worst_pos], top_values[t], top_indices[t], largest_flag)) + worst_pos = t; + } - top_values[insert_pos] = candidate_value; - top_indices[insert_pos] = j; + if (topk_value_index_comp(candidate_value, j, top_values[worst_pos], top_indices[worst_pos], largest_flag)) + { + top_values[worst_pos] = candidate_value; + top_indices[worst_pos] = j; + } + } } } From 9d31f3bee6185a8102be5f84131bcf972e0a5946 Mon Sep 17 00:00:00 2001 From: vlordier Date: Fri, 27 Feb 2026 17:18:19 +0100 Subject: [PATCH 20/29] tests: add values-only topk coverage in cpp and onnx --- tests/test_topk.cpp | 97 +++++++++++++++++++++++- tools/pnnx/tests/onnx/test_torch_topk.py | 4 + 2 files changed, 100 insertions(+), 1 deletion(-) diff --git a/tests/test_topk.cpp b/tests/test_topk.cpp index 0f9d8fee3a4e..8568041b5c34 100644 --- a/tests/test_topk.cpp +++ b/tests/test_topk.cpp @@ -49,6 +49,49 @@ static int test_topk_cpu_forward(const ncnn::Mat& a, int axis, int k, int larges return 0; } +static int test_topk_cpu_forward_values_only(const ncnn::Mat& a, int axis, int k, int largest, int sorted, ncnn::Mat& values) +{ + ncnn::ParamDict pd; + pd.set(0, axis); + pd.set(1, largest); + pd.set(2, sorted); + pd.set(3, k); + + std::vector weights(0); + + ncnn::Option opt; + opt.num_threads = 1; + opt.use_vulkan_compute = false; + opt.use_packing_layout = false; + + ncnn::Layer* op = 
ncnn::create_layer_cpu("TopK"); + if (!op) + return -1; + + op->load_param(pd); + + ncnn::ModelBinFromMatArray mb(weights.data()); + op->load_model(mb); + + op->create_pipeline(opt); + + std::vector bottom_blobs(1); + bottom_blobs[0] = a; + + std::vector top_blobs(1); + int ret = op->forward(bottom_blobs, top_blobs, opt); + + op->destroy_pipeline(opt); + delete op; + + if (ret != 0) + return ret; + + values = top_blobs[0]; + + return 0; +} + static int test_topk(const ncnn::Mat& a, int axis, int k, int largest, int sorted) { ncnn::ParamDict pd; @@ -251,6 +294,57 @@ static int test_topk_nan_robust() return 0; } +static int test_topk_values_only_fastpaths() +{ + ncnn::Mat a(5); + float* ptr = a; + ptr[0] = 1.f; + ptr[1] = -2.f; + ptr[2] = 4.f; + ptr[3] = 3.f; + ptr[4] = 0.f; + + ncnn::Mat values; + + int ret = test_topk_cpu_forward_values_only(a, 0, 1, 1, 0, values); + if (ret != 0) + { + fprintf(stderr, "test_topk_values_only_fastpaths k1 failed ret=%d\n", ret); + return -1; + } + + if (values.w != 1 || ((const float*)values)[0] != 4.f) + { + fprintf(stderr, "test_topk_values_only_fastpaths k1 result mismatch\n"); + return -1; + } + + ret = test_topk_cpu_forward_values_only(a, 0, 5, 1, 0, values); + if (ret != 0) + { + fprintf(stderr, "test_topk_values_only_fastpaths fullk failed ret=%d\n", ret); + return -1; + } + + if (values.w != 5) + { + fprintf(stderr, "test_topk_values_only_fastpaths fullk shape mismatch\n"); + return -1; + } + + const float* vptr = values; + for (int i = 0; i < 5; i++) + { + if (vptr[i] != ptr[i]) + { + fprintf(stderr, "test_topk_values_only_fastpaths fullk value mismatch\n"); + return -1; + } + } + + return 0; +} + int main() { SRAND(7767517); @@ -261,5 +355,6 @@ int main() || test_topk_2() || test_topk_3() || test_topk_inf_order() - || test_topk_nan_robust(); + || test_topk_nan_robust() + || test_topk_values_only_fastpaths(); } diff --git a/tools/pnnx/tests/onnx/test_torch_topk.py b/tools/pnnx/tests/onnx/test_torch_topk.py index 
d62db5990003..dfd99ee2ac26 100644 --- a/tools/pnnx/tests/onnx/test_torch_topk.py +++ b/tools/pnnx/tests/onnx/test_torch_topk.py @@ -22,6 +22,9 @@ def forward(self, x, y, z, u, v): x_unsorted_values, x_unsorted_indices = torch.topk( x, 2, dim=1, largest=True, sorted=False ) + x_values_only = torch.topk( + x, 3, dim=1, largest=True, sorted=True + )[0] y_values, y_indices = torch.topk( y, 4, dim=3, largest=False, sorted=True ) @@ -47,6 +50,7 @@ def forward(self, x, y, z, u, v): x_k0_indices, x_unsorted_values, x_unsorted_indices, + x_values_only, y_values, y_indices, z_values, From 84e083b6f49631583d997790948461adefc8993e Mon Sep 17 00:00:00 2001 From: vlordier Date: Fri, 10 Apr 2026 12:18:48 +0200 Subject: [PATCH 21/29] topk: fix STL compatibility, cstep indexing, omp barrier, and code style - Guard the STL headers behind #if NCNN_SIMPLESTL, include simplestl.h - Use std::partial_sort in simplestl mode (no std::nth_element available) - Guard the system math header in tests behind #if !NCNN_SIMPLESTL to avoid simplemath.h conflict; define INFINITY/NAN as float expressions in simplestl mode - Fix cstep-unaware indexing for 3D/4D output tensors: use actual cstep for channel offset instead of assuming contiguous w*h layout - Convert #pragma omp parallel + inner #pragma omp for to #pragma omp parallel for to avoid __kmpc_barrier in simpleomp mode - Fix copyright year 2026->2025 - Apply code-format whitespace cleanup --- src/layer/topk.cpp | 178 +++++++++++++++++++++--------- src/layer/topk.h | 2 +- tests/test_topk.cpp | 24 ++-- tools/pnnx/src/ir.cpp | 8 +- tools/pnnx/src/pass_ncnn/TopK.cpp | 2 +- 5 files changed, 145 insertions(+), 69 deletions(-) diff --git a/src/layer/topk.cpp b/src/layer/topk.cpp index 10b7b1d2ccc0..3b78fbfce3fe 100644 --- a/src/layer/topk.cpp +++ b/src/layer/topk.cpp @@ -1,12 +1,17 @@ -// Copyright 2026 Tencent +// Copyright 2025 Tencent // SPDX-License-Identifier: BSD-3-Clause #include "topk.h" -#include #include #include + +#if NCNN_SIMPLESTL +#include "simplestl.h" +#else +#include
#include +#endif #if __ARM_NEON #include @@ -185,6 +190,21 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl const int total_lines = outer * inner; + // ncnn 3-/4-D mats have a channel stride (cstep) that may be larger than w*h + // due to alignment padding. The flat inner/outer indexing must account for this: + // - when axis reduces a non-channel dim, the outer loop spans channels and + // the channel offset must use cstep rather than the product of spatial sizes; + // - when axis IS the channel dim, the per-element j-stride must be cstep. + const size_t in_cstep = (dims >= 3) ? (size_t)bottom_blob.cstep : 0; + const size_t out_cstep = (dims >= 3) ? values.cstep : 0; + const bool axis_is_channel = (dims >= 3 && positive_axis == dims - 1); + // spatial-only outer count: channels factored out so cstep can be used separately + const int c_channels = (!axis_is_channel && dims >= 3) ? shape[dims - 1] : 1; + const int outer_spatial = (dims >= 3 && !axis_is_channel) ? outer / c_channels : outer; + // stride when stepping along the axis in memory + const size_t in_axis_stride = axis_is_channel ? in_cstep : (size_t)inner; + const size_t out_axis_stride = axis_is_channel ? 
out_cstep : (size_t)inner; + if (_k == 1) { #pragma omp parallel for num_threads(opt.num_threads) @@ -193,8 +213,19 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl int outer_i = line / inner; int inner_i = line - outer_i * inner; - int in_base = outer_i * axis_size * inner + inner_i; - int out_base = outer_i * inner + inner_i; + size_t in_base, out_base; + if (!axis_is_channel && dims >= 3) + { + const int ci = outer_i / outer_spatial; + const int sp_i = outer_i % outer_spatial; + in_base = (size_t)ci * in_cstep + (size_t)sp_i * axis_size * inner + inner_i; + out_base = (size_t)ci * out_cstep + (size_t)sp_i * 1 * inner + inner_i; + } + else + { + in_base = (size_t)outer_i * axis_size * inner + inner_i; + out_base = (size_t)outer_i * 1 * inner + inner_i; + } #if __ARM_NEON if (!output_indices && inner == 1 && axis_size >= 4) @@ -273,7 +304,7 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl for (int j = 1; j < axis_size; j++) { - const float candidate_value = ptr[in_base + j * inner]; + const float candidate_value = ptr[in_base + j * in_axis_stride]; if (topk_value_index_comp(candidate_value, j, best_value, best_index, largest_flag)) { best_value = candidate_value; @@ -301,22 +332,33 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl int outer_i = line / inner; int inner_i = line - outer_i * inner; - int in_base = outer_i * axis_size * inner + inner_i; - int out_base = outer_i * _k * inner + inner_i; + size_t in_base, out_base; + if (!axis_is_channel && dims >= 3) + { + const int ci = outer_i / outer_spatial; + const int sp_i = outer_i % outer_spatial; + in_base = (size_t)ci * in_cstep + (size_t)sp_i * axis_size * inner + inner_i; + out_base = (size_t)ci * out_cstep + (size_t)sp_i * _k * inner + inner_i; + } + else + { + in_base = (size_t)outer_i * axis_size * inner + inner_i; + out_base = (size_t)outer_i * _k * inner + inner_i; + } if (output_indices) { for (int j = 0; j < _k; j++) { - 
outptr[out_base + j * inner] = ptr[in_base + j * inner]; - outidxptr[out_base + j * inner] = (float)j; + outptr[out_base + j * out_axis_stride] = ptr[in_base + j * in_axis_stride]; + outidxptr[out_base + j * out_axis_stride] = (float)j; } } else { for (int j = 0; j < _k; j++) { - outptr[out_base + j * inner] = ptr[in_base + j * inner]; + outptr[out_base + j * out_axis_stride] = ptr[in_base + j * in_axis_stride]; } } } @@ -336,8 +378,19 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl int outer_i = line / inner; int inner_i = line - outer_i * inner; - int in_base = outer_i * axis_size * inner + inner_i; - int out_base = outer_i * _k * inner + inner_i; + size_t in_base, out_base; + if (!axis_is_channel && dims >= 3) + { + const int ci = outer_i / outer_spatial; + const int sp_i = outer_i % outer_spatial; + in_base = (size_t)ci * in_cstep + (size_t)sp_i * axis_size * inner + inner_i; + out_base = (size_t)ci * out_cstep + (size_t)sp_i * _k * inner + inner_i; + } + else + { + in_base = (size_t)outer_i * axis_size * inner + inner_i; + out_base = (size_t)outer_i * _k * inner + inner_i; + } float top_values[4]; int top_indices[4]; @@ -347,7 +400,7 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl { for (int j = 0; j < axis_size; j++) { - const float candidate_value = ptr[in_base + j * inner]; + const float candidate_value = ptr[in_base + j * in_axis_stride]; if (top_count < _k) { @@ -382,7 +435,7 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl { for (int j = 0; j < axis_size; j++) { - const float candidate_value = ptr[in_base + j * inner]; + const float candidate_value = ptr[in_base + j * in_axis_stride]; if (top_count < _k) { @@ -412,15 +465,15 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl { for (int j = 0; j < _k; j++) { - outptr[out_base + j * inner] = top_values[j]; - outidxptr[out_base + j * inner] = (float)top_indices[j]; + outptr[out_base + j * out_axis_stride] = 
top_values[j]; + outidxptr[out_base + j * out_axis_stride] = (float)top_indices[j]; } } else { for (int j = 0; j < _k; j++) { - outptr[out_base + j * inner] = top_values[j]; + outptr[out_base + j * out_axis_stride] = top_values[j]; } } } @@ -432,58 +485,73 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl return 0; } - #pragma omp parallel num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) + for (int line = 0; line < total_lines; line++) { - std::vector > vec; - vec.resize(axis_size); + std::vector > vec(axis_size); topk_pair_comparator comp(largest_flag); - #pragma omp for - for (int line = 0; line < total_lines; line++) - { - int outer_i = line / inner; - int inner_i = line - outer_i * inner; + int outer_i = line / inner; + int inner_i = line - outer_i * inner; - int in_base = outer_i * axis_size * inner + inner_i; - int out_base = outer_i * _k * inner + inner_i; + size_t in_base, out_base; + if (!axis_is_channel && dims >= 3) + { + const int ci = outer_i / outer_spatial; + const int sp_i = outer_i % outer_spatial; + in_base = (size_t)ci * in_cstep + (size_t)sp_i * axis_size * inner + inner_i; + out_base = (size_t)ci * out_cstep + (size_t)sp_i * _k * inner + inner_i; + } + else + { + in_base = (size_t)outer_i * axis_size * inner + inner_i; + out_base = (size_t)outer_i * _k * inner + inner_i; + } - for (int j = 0; j < axis_size; j++) - { - vec[j].first = ptr[in_base + j * inner]; - vec[j].second = j; - } + for (int j = 0; j < axis_size; j++) + { + vec[j].first = ptr[in_base + j * in_axis_stride]; + vec[j].second = j; + } - if (_k < axis_size) + if (_k < axis_size) + { +#if NCNN_SIMPLESTL + std::partial_sort(vec.begin(), vec.begin() + _k, vec.end(), comp); +#else + if (sorted_flag) { - if (sorted_flag) - { - std::nth_element(vec.begin(), vec.begin() + _k, vec.end(), comp); - std::sort(vec.begin(), vec.begin() + _k, comp); - } - else - std::nth_element(vec.begin(), vec.begin() + _k, vec.end(), comp); + 
std::nth_element(vec.begin(), vec.begin() + _k, vec.end(), comp); + std::sort(vec.begin(), vec.begin() + _k, comp); } else - { - if (sorted_flag) - std::sort(vec.begin(), vec.end(), comp); - } + std::nth_element(vec.begin(), vec.begin() + _k, vec.end(), comp); +#endif + } + else + { + if (sorted_flag) +#if NCNN_SIMPLESTL + std::partial_sort(vec.begin(), vec.end(), vec.end(), comp); +#else + std::sort(vec.begin(), vec.end(), comp); +#endif + } - if (output_indices) + if (output_indices) + { + for (int j = 0; j < _k; j++) { - for (int j = 0; j < _k; j++) - { - outptr[out_base + j * inner] = vec[j].first; - outidxptr[out_base + j * inner] = (float)vec[j].second; - } + outptr[out_base + j * out_axis_stride] = vec[j].first; + outidxptr[out_base + j * out_axis_stride] = (float)vec[j].second; } - else + } + else + { + for (int j = 0; j < _k; j++) { - for (int j = 0; j < _k; j++) - { - outptr[out_base + j * inner] = vec[j].first; - } + outptr[out_base + j * out_axis_stride] = vec[j].first; } } } diff --git a/src/layer/topk.h b/src/layer/topk.h index ff8f410926d8..947dc21343ff 100644 --- a/src/layer/topk.h +++ b/src/layer/topk.h @@ -1,4 +1,4 @@ -// Copyright 2026 Tencent +// Copyright 2025 Tencent // SPDX-License-Identifier: BSD-3-Clause #ifndef LAYER_TOPK_H diff --git a/tests/test_topk.cpp b/tests/test_topk.cpp index 8568041b5c34..ac3375058e3f 100644 --- a/tests/test_topk.cpp +++ b/tests/test_topk.cpp @@ -1,9 +1,17 @@ -// Copyright 2026 Tencent +// Copyright 2025 Tencent // SPDX-License-Identifier: BSD-3-Clause #include "testutil.h" -#include +#if NCNN_SIMPLESTL +// simplemath.h conflicts with system math.h; define only what we need +static const float TEST_INF = 1.f / 0.f; +static const float TEST_NAN = 0.f / 0.f; +#define INFINITY TEST_INF +#define NAN TEST_NAN +#else +#include +#endif static int test_topk_cpu_forward(const ncnn::Mat& a, int axis, int k, int largest, int sorted, ncnn::Mat& values, ncnn::Mat& indices) { @@ -121,7 +129,7 @@ static int test_topk_0() return 
0 || test_topk(a, 0, 1, 1, 1) || test_topk(a, 0, 5, 1, 1) - || test_topk(a, 0, 1, 0, 0) + || test_topk(a, 0, 1, 0, 0) || test_topk(a, -1, 7, 0, 1) || test_topk(a, 0, 4, 1, 0) || test_topk(a, 0, 9, 1, 1); @@ -175,9 +183,9 @@ static int test_topk_inf_order() ncnn::Mat a(6); float* ptr = a; ptr[0] = 1.f; - ptr[1] = std::numeric_limits::infinity(); + ptr[1] = INFINITY; ptr[2] = -2.f; - ptr[3] = -std::numeric_limits::infinity(); + ptr[3] = -INFINITY; ptr[4] = 0.5f; ptr[5] = 3.f; @@ -193,7 +201,7 @@ static int test_topk_inf_order() const float* vptr = values; const float* iptr = indices; - if (values.w != 2 || indices.w != 2 || vptr[0] != std::numeric_limits::infinity() || vptr[1] != 3.f || (int)iptr[0] != 1 || (int)iptr[1] != 5) + if (values.w != 2 || indices.w != 2 || vptr[0] != INFINITY || vptr[1] != 3.f || (int)iptr[0] != 1 || (int)iptr[1] != 5) { fprintf(stderr, "test_topk_inf_order largest result mismatch\n"); return -1; @@ -208,7 +216,7 @@ static int test_topk_inf_order() vptr = values; iptr = indices; - if (values.w != 2 || indices.w != 2 || vptr[0] != -std::numeric_limits::infinity() || vptr[1] != -2.f || (int)iptr[0] != 3 || (int)iptr[1] != 2) + if (values.w != 2 || indices.w != 2 || vptr[0] != -INFINITY || vptr[1] != -2.f || (int)iptr[0] != 3 || (int)iptr[1] != 2) { fprintf(stderr, "test_topk_inf_order smallest result mismatch\n"); return -1; @@ -222,7 +230,7 @@ static int test_topk_nan_robust() ncnn::Mat a(4); float* ptr = a; ptr[0] = 1.f; - ptr[1] = std::numeric_limits::quiet_NaN(); + ptr[1] = NAN; ptr[2] = 2.f; ptr[3] = -1.f; diff --git a/tools/pnnx/src/ir.cpp b/tools/pnnx/src/ir.cpp index 63f9c70e21f4..456f51993b15 100644 --- a/tools/pnnx/src/ir.cpp +++ b/tools/pnnx/src/ir.cpp @@ -1640,12 +1640,12 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath, con continue; fprintf(pyfp, " self.%s = TopK(", sanitize_identifier(op->name).c_str()); - + int i = 0; for (const auto& it : op->params) { fprintf(pyfp, "%s=", it.first.c_str()); - + 
const Parameter& param = it.second; if (param.type == 2) { @@ -1655,12 +1655,12 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath, con { fprintf(pyfp, "%d", param.b ? 1 : 0); } - + if (i + 1 != op->params.size()) fprintf(pyfp, ", "); i++; } - + fprintf(pyfp, ")\n"); } } diff --git a/tools/pnnx/src/pass_ncnn/TopK.cpp b/tools/pnnx/src/pass_ncnn/TopK.cpp index ed226605ad8c..13549437d271 100644 --- a/tools/pnnx/src/pass_ncnn/TopK.cpp +++ b/tools/pnnx/src/pass_ncnn/TopK.cpp @@ -1,4 +1,4 @@ -// Copyright 2026 Tencent +// Copyright 2025 Tencent // SPDX-License-Identifier: BSD-3-Clause #include "pass_ncnn.h" From 2ea44ddc98562ef45e94a40df391d1aedaf376e5 Mon Sep 17 00:00:00 2001 From: vlordier Date: Fri, 10 Apr 2026 12:28:46 +0200 Subject: [PATCH 22/29] apply code-format --- src/layer/topk.cpp | 8 ++++---- tools/pnnx/src/ir.cpp | 20 ++++++++++---------- tools/pnnx/src/pass_onnx/fold_constants.cpp | 14 +++++++------- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/layer/topk.cpp b/src/layer/topk.cpp index 3b78fbfce3fe..7e1a3c77ad78 100644 --- a/src/layer/topk.cpp +++ b/src/layer/topk.cpp @@ -207,7 +207,7 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl if (_k == 1) { - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int line = 0; line < total_lines; line++) { int outer_i = line / inner; @@ -326,7 +326,7 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl if (_k == axis_size && !sorted_flag) { - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int line = 0; line < total_lines; line++) { int outer_i = line / inner; @@ -372,7 +372,7 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl if (_k <= 4) { - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int line = 0; line < 
total_lines; line++) { int outer_i = line / inner; @@ -485,7 +485,7 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl return 0; } - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int line = 0; line < total_lines; line++) { std::vector > vec(axis_size); diff --git a/tools/pnnx/src/ir.cpp b/tools/pnnx/src/ir.cpp index 456f51993b15..1d88ba384bfb 100644 --- a/tools/pnnx/src/ir.cpp +++ b/tools/pnnx/src/ir.cpp @@ -1576,10 +1576,10 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath, con for (size_t i = 0; i < param.ai.size(); i++) { if ((op->type == "nn.AdaptiveAvgPool2d" - || op->type == "nn.AdaptiveAvgPool3d" - || op->type == "nn.AdaptiveMaxPool2d" - || op->type == "nn.AdaptiveMaxPool3d") - && it.first == "output_size" && param.ai[i] == 0) + || op->type == "nn.AdaptiveAvgPool3d" + || op->type == "nn.AdaptiveMaxPool2d" + || op->type == "nn.AdaptiveMaxPool3d") + && it.first == "output_size" && param.ai[i] == 0) { fprintf(pyfp, "None"); } @@ -2390,8 +2390,8 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath, con bool scalar_as_tensor = false; if ((op->type == "Tensor.index_put" && it.first == "values") - || (op->type == "torch.where" && it.first == "input") - || (op->type == "torch.where" && it.first == "other")) + || (op->type == "torch.where" && it.first == "input") + || (op->type == "torch.where" && it.first == "other")) { scalar_as_tensor = true; } @@ -2478,10 +2478,10 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath, con for (size_t i = 0; i < param.ai.size(); i++) { if ((op->type == "F.adaptive_avg_pool2d" - || op->type == "F.adaptive_avg_pool3d" - || op->type == "F.adaptive_max_pool2d" - || op->type == "F.adaptive_max_pool3d") - && it.first == "output_size" && param.ai[i] == 0) + || op->type == "F.adaptive_avg_pool3d" + || op->type == "F.adaptive_max_pool2d" + || op->type == 
"F.adaptive_max_pool3d") + && it.first == "output_size" && param.ai[i] == 0) { fprintf(pyfp, "None"); } diff --git a/tools/pnnx/src/pass_onnx/fold_constants.cpp b/tools/pnnx/src/pass_onnx/fold_constants.cpp index c79cb29f34a1..6c843188d1b0 100644 --- a/tools/pnnx/src/pass_onnx/fold_constants.cpp +++ b/tools/pnnx/src/pass_onnx/fold_constants.cpp @@ -198,13 +198,13 @@ void fold_constants(onnx::ModelProto& model, const std::string& external_data_pa // aten::size // aten::_shape_as_tensor if (op_type == "aten_new_empty" - || op_type == "aten_new_full" - || op_type == "aten_new_ones" - || op_type == "aten_new_zeros" - || op_type == "aten_empty_like" - || op_type == "aten_full_like" - || op_type == "aten_ones_like" - || op_type == "aten_zeros_like") + || op_type == "aten_new_full" + || op_type == "aten_new_ones" + || op_type == "aten_new_zeros" + || op_type == "aten_empty_like" + || op_type == "aten_full_like" + || op_type == "aten_ones_like" + || op_type == "aten_zeros_like") { is_outputs_foldable = ignore_aten_size; } From 5674b1ceee432a91a5dd8fcaa79d35c02ffb3502 Mon Sep 17 00:00:00 2001 From: vlordier <5443125+vlordier@users.noreply.github.com> Date: Fri, 10 Apr 2026 10:31:02 +0000 Subject: [PATCH 23/29] apply code-format changes --- src/layer/topk.cpp | 8 ++++---- tools/pnnx/src/ir.cpp | 20 ++++++++++---------- tools/pnnx/src/pass_onnx/fold_constants.cpp | 14 +++++++------- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/layer/topk.cpp b/src/layer/topk.cpp index 7e1a3c77ad78..3b78fbfce3fe 100644 --- a/src/layer/topk.cpp +++ b/src/layer/topk.cpp @@ -207,7 +207,7 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl if (_k == 1) { -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int line = 0; line < total_lines; line++) { int outer_i = line / inner; @@ -326,7 +326,7 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl if (_k == axis_size && 
!sorted_flag) { -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int line = 0; line < total_lines; line++) { int outer_i = line / inner; @@ -372,7 +372,7 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl if (_k <= 4) { -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int line = 0; line < total_lines; line++) { int outer_i = line / inner; @@ -485,7 +485,7 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl return 0; } -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int line = 0; line < total_lines; line++) { std::vector > vec(axis_size); diff --git a/tools/pnnx/src/ir.cpp b/tools/pnnx/src/ir.cpp index 1d88ba384bfb..456f51993b15 100644 --- a/tools/pnnx/src/ir.cpp +++ b/tools/pnnx/src/ir.cpp @@ -1576,10 +1576,10 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath, con for (size_t i = 0; i < param.ai.size(); i++) { if ((op->type == "nn.AdaptiveAvgPool2d" - || op->type == "nn.AdaptiveAvgPool3d" - || op->type == "nn.AdaptiveMaxPool2d" - || op->type == "nn.AdaptiveMaxPool3d") - && it.first == "output_size" && param.ai[i] == 0) + || op->type == "nn.AdaptiveAvgPool3d" + || op->type == "nn.AdaptiveMaxPool2d" + || op->type == "nn.AdaptiveMaxPool3d") + && it.first == "output_size" && param.ai[i] == 0) { fprintf(pyfp, "None"); } @@ -2390,8 +2390,8 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath, con bool scalar_as_tensor = false; if ((op->type == "Tensor.index_put" && it.first == "values") - || (op->type == "torch.where" && it.first == "input") - || (op->type == "torch.where" && it.first == "other")) + || (op->type == "torch.where" && it.first == "input") + || (op->type == "torch.where" && it.first == "other")) { scalar_as_tensor = true; } @@ -2478,10 +2478,10 @@ int 
Graph::python(const std::string& pypath, const std::string& pnnxbinpath, con for (size_t i = 0; i < param.ai.size(); i++) { if ((op->type == "F.adaptive_avg_pool2d" - || op->type == "F.adaptive_avg_pool3d" - || op->type == "F.adaptive_max_pool2d" - || op->type == "F.adaptive_max_pool3d") - && it.first == "output_size" && param.ai[i] == 0) + || op->type == "F.adaptive_avg_pool3d" + || op->type == "F.adaptive_max_pool2d" + || op->type == "F.adaptive_max_pool3d") + && it.first == "output_size" && param.ai[i] == 0) { fprintf(pyfp, "None"); } diff --git a/tools/pnnx/src/pass_onnx/fold_constants.cpp b/tools/pnnx/src/pass_onnx/fold_constants.cpp index 6c843188d1b0..c79cb29f34a1 100644 --- a/tools/pnnx/src/pass_onnx/fold_constants.cpp +++ b/tools/pnnx/src/pass_onnx/fold_constants.cpp @@ -198,13 +198,13 @@ void fold_constants(onnx::ModelProto& model, const std::string& external_data_pa // aten::size // aten::_shape_as_tensor if (op_type == "aten_new_empty" - || op_type == "aten_new_full" - || op_type == "aten_new_ones" - || op_type == "aten_new_zeros" - || op_type == "aten_empty_like" - || op_type == "aten_full_like" - || op_type == "aten_ones_like" - || op_type == "aten_zeros_like") + || op_type == "aten_new_full" + || op_type == "aten_new_ones" + || op_type == "aten_new_zeros" + || op_type == "aten_empty_like" + || op_type == "aten_full_like" + || op_type == "aten_ones_like" + || op_type == "aten_zeros_like") { is_outputs_foldable = ignore_aten_size; } From caa9de366c86c43fad02392a69961d3cf26c8fb7 Mon Sep 17 00:00:00 2001 From: vlordier Date: Fri, 10 Apr 2026 12:31:39 +0200 Subject: [PATCH 24/29] ci: add topk test coverage and pnnx onnx test --- .github/workflows/topk-linux-test.yml | 111 ++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 .github/workflows/topk-linux-test.yml diff --git a/.github/workflows/topk-linux-test.yml b/.github/workflows/topk-linux-test.yml new file mode 100644 index 000000000000..5a25a7320d30 --- /dev/null +++ 
b/.github/workflows/topk-linux-test.yml @@ -0,0 +1,111 @@ +name: topk-linux-test +on: + push: + branches: + - topk-ci-tests + +jobs: + x64-none: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_BUILD_TYPE=Debug -DNCNN_RUNTIME_CPU=OFF \ + -DNCNN_SSE2=OFF -DNCNN_AVX=OFF \ + -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . --target test_topk -j$(nproc) + - name: test + run: cd build && ./tests/test_topk + + x64-sse2: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_BUILD_TYPE=Debug -DNCNN_RUNTIME_CPU=OFF \ + -DNCNN_SSE2=ON -DNCNN_AVX=OFF \ + -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . --target test_topk -j$(nproc) + - name: test + run: cd build && ./tests/test_topk + + x64-avx2: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_BUILD_TYPE=Debug -DNCNN_RUNTIME_CPU=OFF \ + -DNCNN_SSE2=ON -DNCNN_AVX=ON -DNCNN_F16C=ON -DNCNN_FMA=ON -DNCNN_AVX2=ON \ + -DNCNN_AVX512=OFF -DNCNN_XOP=OFF -DNCNN_AVXVNNI=OFF \ + -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . --target test_topk -j$(nproc) + - name: test + run: cd build && ./tests/test_topk + + simplestl-simplemath: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake \ + -DCMAKE_BUILD_TYPE=Debug \ + -DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEMATH=ON \ + -DNCNN_OPENMP=OFF -DNCNN_THREADS=OFF \ + -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . 
--target test_topk -j$(nproc) + - name: test + run: cd build && ./tests/test_topk + + linux-x86-gcc: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: install + run: sudo apt-get update && sudo apt-get install -y gcc-multilib g++-multilib + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake \ + -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . --target test_topk -j$(nproc) + - name: test + run: cd build && ./tests/test_topk + - name: build-nosse + run: | + mkdir build-nosse && cd build-nosse + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake \ + -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX=OFF \ + -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . --target test_topk -j$(nproc) + - name: test-nosse + run: cd build-nosse && ./tests/test_topk + + pnnx-onnx-topk: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.12' + - name: setup-pytorch + run: | + pip3 install torch --index-url https://download.pytorch.org/whl/cpu + pip3 install numpy packaging onnx onnxruntime + - name: build-pnnx + run: | + cd tools/pnnx + mkdir build && cd build + cmake -DCMAKE_BUILD_TYPE=Release .. + cmake --build . 
--config Release -j$(nproc) + - name: test-topk + run: | + cd tools/pnnx + build/src/pnnx tests/onnx/test_torch_topk.py From 4e39cb6ae25eeb061e79a56bc43f60941586d21f Mon Sep 17 00:00:00 2001 From: vlordier Date: Fri, 10 Apr 2026 12:52:52 +0200 Subject: [PATCH 25/29] =?UTF-8?q?ci:=20fix=20pnnx=20test=20invocation=20?= =?UTF-8?q?=E2=80=94=20use=20ctest?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/topk-linux-test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/topk-linux-test.yml b/.github/workflows/topk-linux-test.yml index 5a25a7320d30..c4ef3861d6db 100644 --- a/.github/workflows/topk-linux-test.yml +++ b/.github/workflows/topk-linux-test.yml @@ -107,5 +107,5 @@ jobs: cmake --build . --config Release -j$(nproc) - name: test-topk run: | - cd tools/pnnx - build/src/pnnx tests/onnx/test_torch_topk.py + cd tools/pnnx/build + ctest --output-on-failure -R test_onnx_torch_topk From ca55f8a9b1ef4f13736d3a0d18f8c95eca1977bc Mon Sep 17 00:00:00 2001 From: vlordier <5443125+vlordier@users.noreply.github.com> Date: Fri, 10 Apr 2026 11:28:31 +0000 Subject: [PATCH 26/29] apply code-format changes --- src/layer/topk.cpp | 8 ++++---- tools/pnnx/src/ir.cpp | 20 ++++++++++---------- tools/pnnx/src/pass_onnx/fold_constants.cpp | 14 +++++++------- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/layer/topk.cpp b/src/layer/topk.cpp index 7e1a3c77ad78..3b78fbfce3fe 100644 --- a/src/layer/topk.cpp +++ b/src/layer/topk.cpp @@ -207,7 +207,7 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl if (_k == 1) { -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int line = 0; line < total_lines; line++) { int outer_i = line / inner; @@ -326,7 +326,7 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl if (_k == axis_size && !sorted_flag) { -#pragma omp parallel 
for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int line = 0; line < total_lines; line++) { int outer_i = line / inner; @@ -372,7 +372,7 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl if (_k <= 4) { -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int line = 0; line < total_lines; line++) { int outer_i = line / inner; @@ -485,7 +485,7 @@ int TopK::forward(const std::vector& bottom_blobs, std::vector& top_bl return 0; } -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int line = 0; line < total_lines; line++) { std::vector > vec(axis_size); diff --git a/tools/pnnx/src/ir.cpp b/tools/pnnx/src/ir.cpp index 1d88ba384bfb..456f51993b15 100644 --- a/tools/pnnx/src/ir.cpp +++ b/tools/pnnx/src/ir.cpp @@ -1576,10 +1576,10 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath, con for (size_t i = 0; i < param.ai.size(); i++) { if ((op->type == "nn.AdaptiveAvgPool2d" - || op->type == "nn.AdaptiveAvgPool3d" - || op->type == "nn.AdaptiveMaxPool2d" - || op->type == "nn.AdaptiveMaxPool3d") - && it.first == "output_size" && param.ai[i] == 0) + || op->type == "nn.AdaptiveAvgPool3d" + || op->type == "nn.AdaptiveMaxPool2d" + || op->type == "nn.AdaptiveMaxPool3d") + && it.first == "output_size" && param.ai[i] == 0) { fprintf(pyfp, "None"); } @@ -2390,8 +2390,8 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath, con bool scalar_as_tensor = false; if ((op->type == "Tensor.index_put" && it.first == "values") - || (op->type == "torch.where" && it.first == "input") - || (op->type == "torch.where" && it.first == "other")) + || (op->type == "torch.where" && it.first == "input") + || (op->type == "torch.where" && it.first == "other")) { scalar_as_tensor = true; } @@ -2478,10 +2478,10 @@ int Graph::python(const std::string& pypath, const 
std::string& pnnxbinpath, con for (size_t i = 0; i < param.ai.size(); i++) { if ((op->type == "F.adaptive_avg_pool2d" - || op->type == "F.adaptive_avg_pool3d" - || op->type == "F.adaptive_max_pool2d" - || op->type == "F.adaptive_max_pool3d") - && it.first == "output_size" && param.ai[i] == 0) + || op->type == "F.adaptive_avg_pool3d" + || op->type == "F.adaptive_max_pool2d" + || op->type == "F.adaptive_max_pool3d") + && it.first == "output_size" && param.ai[i] == 0) { fprintf(pyfp, "None"); } diff --git a/tools/pnnx/src/pass_onnx/fold_constants.cpp b/tools/pnnx/src/pass_onnx/fold_constants.cpp index 6c843188d1b0..c79cb29f34a1 100644 --- a/tools/pnnx/src/pass_onnx/fold_constants.cpp +++ b/tools/pnnx/src/pass_onnx/fold_constants.cpp @@ -198,13 +198,13 @@ void fold_constants(onnx::ModelProto& model, const std::string& external_data_pa // aten::size // aten::_shape_as_tensor if (op_type == "aten_new_empty" - || op_type == "aten_new_full" - || op_type == "aten_new_ones" - || op_type == "aten_new_zeros" - || op_type == "aten_empty_like" - || op_type == "aten_full_like" - || op_type == "aten_ones_like" - || op_type == "aten_zeros_like") + || op_type == "aten_new_full" + || op_type == "aten_new_ones" + || op_type == "aten_new_zeros" + || op_type == "aten_empty_like" + || op_type == "aten_full_like" + || op_type == "aten_ones_like" + || op_type == "aten_zeros_like") { is_outputs_foldable = ignore_aten_size; } From d8fd80c1580d29667e2d5ab46de88a63ad632e8f Mon Sep 17 00:00:00 2001 From: vlordier Date: Sat, 11 Apr 2026 00:05:50 +0200 Subject: [PATCH 27/29] feat: add TopK + Gather ncnn support for YOLOv10 - pass_level2/torch_topk.cpp: capture k/dim/largest/sorted as parameters (prim::Constant) instead of tensor inputs, enabling ncnn pass matching - pass_level2/torch_gather.cpp: restore original pattern (dim as tensor) - pass_ncnn/TopK.cpp: match torch.topk with captured parameters and convert to ncnn TopK layer (axis, largest, sorted) - pass_ncnn/torch_gather.cpp (NEW): match 
torch.gather with 2 inputs (input, index) and captured dim parameter, convert to ncnn Gather layer - src/layer/gather.{h,cpp} (NEW): implement Gather ncnn operator supporting 1D/2D/3D tensors with arbitrary axis - PNNX CMakeLists fixes: - per-target Torch include dirs to avoid protobuf header conflicts - Abseil linking for Homebrew protobuf 34.x - disable onnxruntime auto-detection (protobuf conflict) - directory-level INCLUDE_DIRECTORIES_BEFORE for protobuf headers Verified: YOLOv10n converts with 2 TopK + 2 Gather layers, only cosmetic ops (Tensor.to, pnnx.Expression) ignored. Co-authored-by: Qwen-Coder --- src/CMakeLists.txt | 1 + src/layer/gather.cpp | 111 ++++++++++++++++++++++ src/layer/gather.h | 27 ++++++ tools/pnnx/CMakeLists.txt | 31 +++--- tools/pnnx/src/CMakeLists.txt | 21 ++++ tools/pnnx/src/pass_level2/torch_topk.cpp | 12 +-- tools/pnnx/src/pass_ncnn/TopK.cpp | 75 ++++++++++++--- tools/pnnx/src/pass_ncnn/torch_gather.cpp | 54 +++++++++++ 8 files changed, 301 insertions(+), 31 deletions(-) create mode 100644 src/layer/gather.cpp create mode 100644 src/layer/gather.h create mode 100644 tools/pnnx/src/pass_ncnn/torch_gather.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c79d779cf220..3f518f11117b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -102,6 +102,7 @@ ncnn_add_layer(TanH) ncnn_add_layer(Threshold) ncnn_add_layer(Tile) ncnn_add_layer(TopK) +ncnn_add_layer(Gather) ncnn_add_layer(RNN) ncnn_add_layer(LSTM) ncnn_add_layer(BinaryOp) diff --git a/src/layer/gather.cpp b/src/layer/gather.cpp new file mode 100644 index 000000000000..738cd85f9f41 --- /dev/null +++ b/src/layer/gather.cpp @@ -0,0 +1,111 @@ +// Copyright 2025 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#include "gather.h" + +namespace ncnn { + +Gather::Gather() +{ + one_blob_only = false; + support_inplace = false; +} + +int Gather::load_param(const ParamDict& pd) +{ + axis = pd.get(0, 0); + + return 0; +} + +int Gather::forward(const std::vector& 
bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + if (bottom_blobs.size() < 2) + return -1; + + const Mat& input_blob = bottom_blobs[0]; + const Mat& index_blob = bottom_blobs[1]; + const int dims = input_blob.dims; + + // index_blob should contain int64 or int32 indices + // For simplicity we treat it as float and cast + const int index_size = (int)index_blob.total(); + + int positive_axis = axis < 0 ? axis + dims : axis; + if (positive_axis < 0 || positive_axis >= dims) + return -1; + + int shape[4] = {1, 1, 1, 1}; + shape[0] = input_blob.w; + if (dims >= 2) shape[1] = input_blob.h; + if (dims == 3) shape[2] = input_blob.c; + if (dims == 4) shape[2] = input_blob.c; // w*h*c layout + + const int axis_dim_size = shape[positive_axis]; + + // Output shape matches index_blob shape + const Mat& out_shape = index_blob; + + // Allocate output (same dtype as input, shape matches index) + Mat& top_blob = top_blobs[0]; + top_blob.create(out_shape.w, out_shape.h, out_shape.c, input_blob.elemsize, input_blob.elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const float* inp = input_blob; + const int* idx = (const int*)index_blob; + float* out = top_blob; + + // General case: iterate over all output positions + // Map flat output index to multi-dimensional coords, + // then compute corresponding input position with index substitution + const int total_out = (int)top_blob.total(); + for (int i = 0; i < total_out; i++) + { + // Decompose flat index i into coordinates based on top_blob shape + int rem = i; + int coord_out[4] = {0, 0, 0, 0}; + if (top_blob.dims == 1) { + coord_out[0] = rem; + } else if (top_blob.dims == 2) { + coord_out[0] = rem % top_blob.w; + coord_out[1] = rem / top_blob.w; + } else if (top_blob.dims == 3) { + int hw = top_blob.w * top_blob.h; + coord_out[0] = (rem % hw) % top_blob.w; + coord_out[1] = (rem % hw) / top_blob.w; + coord_out[2] = rem / hw; + } + + // Get index value at this output position + int gather_idx 
= idx[i]; + // Handle negative indices + if (gather_idx < 0) gather_idx += axis_dim_size; + + // Build input coordinate (same as output, but axis coord replaced) + int coord_in[4] = {coord_out[0], coord_out[1], coord_out[2], coord_out[3]}; + coord_in[positive_axis] = gather_idx; + + // Clamp to input bounds + if (coord_in[positive_axis] >= axis_dim_size) coord_in[positive_axis] = axis_dim_size - 1; + if (coord_in[positive_axis] < 0) coord_in[positive_axis] = 0; + + // Compute flat input index + int flat_in = 0; + if (dims == 1) { + flat_in = coord_in[0]; + } else if (dims == 2) { + flat_in = coord_in[0] + coord_in[1] * input_blob.w; + } else if (dims == 3) { + // ncnn 3D layout: w * h * c, with cstride padding + size_t cstep = input_blob.cstep; + flat_in = coord_in[0] + coord_in[1] * input_blob.w + coord_in[2] * (int)cstep; + } + + out[i] = inp[flat_in]; + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/gather.h b/src/layer/gather.h new file mode 100644 index 000000000000..f8d24d9afb54 --- /dev/null +++ b/src/layer/gather.h @@ -0,0 +1,27 @@ +// Copyright 2025 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef LAYER_GATHER_H +#define LAYER_GATHER_H + +#include "layer.h" + +namespace ncnn { + +class Gather : public Layer +{ +public: + Gather(); + + virtual int load_param(const ParamDict& pd); + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; + +public: + // param_0 = axis (default 0) + int axis; +}; + +} // namespace ncnn + +#endif // LAYER_GATHER_H diff --git a/tools/pnnx/CMakeLists.txt b/tools/pnnx/CMakeLists.txt index e50ab4788c3d..5b3250943cf8 100644 --- a/tools/pnnx/CMakeLists.txt +++ b/tools/pnnx/CMakeLists.txt @@ -83,7 +83,8 @@ else() message(WARNING "Building without TorchVision") endif() -include_directories(SYSTEM ${TORCH_INCLUDE_DIRS}) +# Torch includes are added per-target in src/CMakeLists.txt to avoid +# conflicts with system protobuf headers 
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") # test if libtorch and protobuf has the same cxxabi version @@ -95,7 +96,10 @@ if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") endif() if((PNNX_TORCH_USE_CXX11_ABI AND PNNX_COMPILER_USE_CXX11_ABI) OR (NOT PNNX_TORCH_USE_CXX11_ABI AND NOT PNNX_COMPILER_USE_CXX11_ABI)) - find_package(protobuf CONFIG) + # Torch may have already registered protobuf targets — skip find_package if so + if(NOT TARGET protobuf::libprotobuf) + find_package(protobuf CONFIG) + endif() if(protobuf_FOUND) set(PROTOBUF_FOUND ${protobuf_FOUND}) @@ -109,20 +113,21 @@ if((PNNX_TORCH_USE_CXX11_ABI AND PNNX_COMPILER_USE_CXX11_ABI) OR (NOT PNNX_TORCH set_target_properties(protobuf::protoc PROPERTIES IMPORTED_LOCATION_RELEASE "${PROTOBUF_PROTOC_EXECUTABLE}") endif() endif() -endif() -# https://github.com/supertone-inc/onnxruntime-build -set(onnxruntime_INSTALL_DIR "/home/nihui/osd/pnnx/install" CACHE STRING "") -find_library(onnxruntime_LIB NAMES onnxruntime PATHS ${onnxruntime_INSTALL_DIR}/lib64 ${onnxruntime_INSTALL_DIR}/lib) -if(onnxruntime_LIB) - set(onnxruntime_FOUND TRUE) - add_library(onnxruntime::onnxruntime STATIC IMPORTED) - set_target_properties(onnxruntime::onnxruntime PROPERTIES IMPORTED_LOCATION ${onnxruntime_LIB}) - set_target_properties(onnxruntime::onnxruntime PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${onnxruntime_INSTALL_DIR}/include) -else() - set(onnxruntime_FOUND FALSE) + # Homebrew protobuf 34.x depends on Abseil — we need to link it explicitly + # because macOS doesn't resolve transitive dylib deps with @rpath properly + find_package(PkgConfig QUIET) + if(PKG_CONFIG_FOUND) + pkg_check_modules(ABSL QUIET absl_log_internal_check_op absl_die_if_null absl_log_internal_conditions absl_log_internal_message absl_examine_stack absl_statusor absl_synchronization absl_time) + if(ABSL_FOUND) + set(ABSL_LIBRARIES ${ABSL_LINK_LIBRARIES}) + endif() + endif() endif() +# Disable onnxruntime auto-detection — we only need torch2pnnx for YOLOv10 
+set(onnxruntime_FOUND FALSE) + option(PNNX_TNN2PNNX "build tnn2pnnx" ON) add_subdirectory(src) diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt index c554a6873e81..15aa16b46376 100644 --- a/tools/pnnx/src/CMakeLists.txt +++ b/tools/pnnx/src/CMakeLists.txt @@ -603,6 +603,7 @@ set(pnnx_pass_ncnn_SRCS pass_ncnn/torch_diag.cpp pass_ncnn/torch_flatten.cpp pass_ncnn/torch_flip.cpp + pass_ncnn/torch_gather.cpp pass_ncnn/torch_istft.cpp pass_ncnn/torch_logsumexp.cpp pass_ncnn/torch_matmul.cpp @@ -635,6 +636,15 @@ if(PROTOBUF_FOUND) add_library(onnxproto STATIC ${ONNX_PROTO_SRCS} ${ONNX_PROTO_HDRS}) target_include_directories(onnxproto PUBLIC ${PROTOBUF_INCLUDE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) target_link_libraries(onnxproto PUBLIC ${PROTOBUF_LIBRARIES}) + if(ABSL_LIBRARIES) + target_link_libraries(onnxproto PUBLIC ${ABSL_LIBRARIES}) + endif() + # Force system protobuf headers BEFORE any Torch-bundled old headers + # (Torch bundles an ancient protobuf that conflicts with system protobuf >= 22) + set_property(DIRECTORY APPEND PROPERTY INCLUDE_DIRECTORIES_BEFORE + ${PROTOBUF_INCLUDE_DIR} + ${CMAKE_CURRENT_BINARY_DIR} + ) else() add_library(onnxproto STATIC onnx-data.proto onnx-ml.proto onnx-operators-ml.proto) target_include_directories(onnxproto PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) @@ -674,6 +684,7 @@ set(torch2pnnx_SRCS add_library(torch2pnnx OBJECT ${torch2pnnx_SRCS}) target_compile_definitions(torch2pnnx PRIVATE BUILD_TORCH2PNNX) target_compile_options(torch2pnnx PUBLIC "${TORCH_CXX_FLAGS}") +target_include_directories(torch2pnnx SYSTEM PRIVATE ${TORCH_INCLUDE_DIRS}) if(WIN32) target_compile_definitions(torch2pnnx PUBLIC NOMINMAX) @@ -687,6 +698,10 @@ if(PROTOBUF_FOUND) add_library(pnnx2onnx STATIC save_onnx.cpp ) + # Ensure Homebrew protobuf headers are found BEFORE Torch's bundled old ones + if(Protobuf_FOUND OR protobuf_MODULE_COMPATIBLE) + target_include_directories(pnnx2onnx BEFORE PRIVATE ${PROTOBUF_INCLUDE_DIR}) + endif() 
if(onnxruntime_FOUND) target_link_libraries(pnnx2onnx PRIVATE onnxruntime::onnxruntime) else() @@ -779,12 +794,18 @@ set(pnnx_SRCS add_executable(pnnx ${pnnx_SRCS}) set_property(SOURCE main.cpp APPEND PROPERTY COMPILE_DEFINITIONS BUILD_TORCH2PNNX) +target_include_directories(pnnx SYSTEM PRIVATE ${TORCH_INCLUDE_DIRS}) target_link_libraries(pnnx PRIVATE torch2pnnx) if(TorchVision_FOUND) target_link_libraries(pnnx PRIVATE ${TORCHVISION_LIBRARY}) endif() +# Link Abseil (needed for protobuf 34.x on macOS/Homebrew) +if(ABSL_LIBRARIES) + target_link_libraries(pnnx PRIVATE ${ABSL_LIBRARIES}) +endif() + if(WIN32) target_link_libraries(pnnx PRIVATE ${TORCH_LIBRARIES}) else() diff --git a/tools/pnnx/src/pass_level2/torch_topk.cpp b/tools/pnnx/src/pass_level2/torch_topk.cpp index f3d7fae98ba4..339271f95fb7 100644 --- a/tools/pnnx/src/pass_level2/torch_topk.cpp +++ b/tools/pnnx/src/pass_level2/torch_topk.cpp @@ -11,13 +11,13 @@ class torch_topk : public GraphRewriterPass const char* match_pattern_graph() const { return R"PNNXIR(7767517 -7 7 +12 7 pnnx.Input input_0 0 1 input -pnnx.Input input_1 0 1 k -pnnx.Input input_2 0 1 dim -pnnx.Input input_3 0 1 largest -pnnx.Input input_4 0 1 sorted -aten::topk op_0 5 2 input k dim largest sorted values indices +prim::Constant op_0 0 1 k value=%k +prim::Constant op_1 0 1 dim value=%dim +prim::Constant op_2 0 1 largest value=%largest +prim::Constant op_3 0 1 sorted value=%sorted +aten::topk op_4 5 2 input k dim largest sorted values indices pnnx.Output output 2 0 values indices )PNNXIR"; } diff --git a/tools/pnnx/src/pass_ncnn/TopK.cpp b/tools/pnnx/src/pass_ncnn/TopK.cpp index 13549437d271..2641493dd0fc 100644 --- a/tools/pnnx/src/pass_ncnn/TopK.cpp +++ b/tools/pnnx/src/pass_ncnn/TopK.cpp @@ -17,16 +17,15 @@ static int parameter_to_bool(const Parameter& p, int default_value) return default_value; } -class TopK : public GraphRewriterPass +class torch_topk : public GraphRewriterPass { public: const char* match_pattern_graph() const { return 
R"PNNXIR(7767517 -4 3 +3 2 pnnx.Input input_0 0 1 input -pnnx.Input input_1 0 1 k -TopK op_0 2 2 input k values indices axis=%axis largest=%largest sorted=%sorted +torch.topk op_0 1 2 input values indices k=%k dim=%dim largest=%largest sorted=%sorted pnnx.Output output 2 0 values indices )PNNXIR"; } @@ -44,8 +43,14 @@ pnnx.Output output 2 0 values indices void write(Operator* op, const std::map& captured_params) const { int axis = -1; - if (captured_params.find("axis") != captured_params.end()) - axis = captured_params.at("axis").i; + if (captured_params.find("dim") != captured_params.end()) + { + const Parameter& dim_p = captured_params.at("dim"); + if (dim_p.type == 2) + axis = dim_p.i; + else if (dim_p.type == 5 && !dim_p.ai.empty()) + axis = dim_p.ai[0]; + } int largest = 1; if (captured_params.find("largest") != captured_params.end()) @@ -73,24 +78,70 @@ pnnx.Output output 2 0 values indices } }; -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(TopK, 20) +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torch_topk, 20) -class TopK_0 : public TopK +class torch_topk_0 : public GraphRewriterPass { public: const char* match_pattern_graph() const { return R"PNNXIR(7767517 -4 2 +3 1 pnnx.Input input_0 0 1 input -pnnx.Input input_1 0 1 k -TopK op_0 2 1 input k values axis=%axis largest=%largest sorted=%sorted +torch.topk op_0 1 1 input values k=%k dim=%dim largest=%largest sorted=%sorted pnnx.Output output 1 0 values )PNNXIR"; } + + const char* type_str() const + { + return "TopK"; + } + + const char* name_str() const + { + return "topk"; + } + + void write(Operator* op, const std::map& captured_params) const + { + int axis = -1; + if (captured_params.find("dim") != captured_params.end()) + { + const Parameter& dim_p = captured_params.at("dim"); + if (dim_p.type == 2) + axis = dim_p.i; + else if (dim_p.type == 5 && !dim_p.ai.empty()) + axis = dim_p.ai[0]; + } + + int largest = 1; + if (captured_params.find("largest") != captured_params.end()) + largest = 
parameter_to_bool(captured_params.at("largest"), 1); + + int sorted = 1; + if (captured_params.find("sorted") != captured_params.end()) + sorted = parameter_to_bool(captured_params.at("sorted"), 1); + + const int batch_index = op->inputs[0]->params["__batch_index"].i; + + if (axis == batch_index) + { + fprintf(stderr, "TopK along batch axis is not supported\n"); + return; + } + + int new_axis = axis; + if (axis >= 0) + new_axis = axis > batch_index ? axis - 1 : axis; + + op->params["0"] = new_axis; + op->params["1"] = largest; + op->params["2"] = sorted; + } }; -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(TopK_0, 20) +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torch_topk_0, 20) } // namespace ncnn diff --git a/tools/pnnx/src/pass_ncnn/torch_gather.cpp b/tools/pnnx/src/pass_ncnn/torch_gather.cpp new file mode 100644 index 000000000000..13d1d69e0103 --- /dev/null +++ b/tools/pnnx/src/pass_ncnn/torch_gather.cpp @@ -0,0 +1,54 @@ +// Copyright 2025 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#include "pass_ncnn.h" + +namespace pnnx { + +namespace ncnn { + +class torch_gather : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input_0 0 1 input +pnnx.Input input_1 0 1 index +torch.gather op_0 2 1 input index out dim=%dim +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "Gather"; + } + + const char* name_str() const + { + return "gather"; + } + + void write(Operator* op, const std::map& captured_params) const + { + int axis = 0; + if (captured_params.find("dim") != captured_params.end()) + { + const Parameter& dim_p = captured_params.at("dim"); + if (dim_p.type == 2) + axis = dim_p.i; + else if (dim_p.type == 5 && !dim_p.ai.empty()) + axis = dim_p.ai[0]; + } + + op->params["0"] = axis; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torch_gather, 20) + +} // namespace ncnn + +} // namespace pnnx From 
d68852df6817c600862238c7e880b21c66d1e2c1 Mon Sep 17 00:00:00 2001 From: vlordier <5443125+vlordier@users.noreply.github.com> Date: Sat, 11 Apr 2026 07:43:01 +0000 Subject: [PATCH 28/29] apply code-format changes --- src/layer/gather.cpp | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/src/layer/gather.cpp b/src/layer/gather.cpp index 738cd85f9f41..850b65b3d121 100644 --- a/src/layer/gather.cpp +++ b/src/layer/gather.cpp @@ -38,8 +38,8 @@ int Gather::forward(const std::vector& bottom_blobs, std::vector& top_ int shape[4] = {1, 1, 1, 1}; shape[0] = input_blob.w; if (dims >= 2) shape[1] = input_blob.h; - if (dims == 3) shape[2] = input_blob.c; - if (dims == 4) shape[2] = input_blob.c; // w*h*c layout + if (dims == 3) shape[2] = input_blob.c; + if (dims == 4) shape[2] = input_blob.c; // w*h*c layout const int axis_dim_size = shape[positive_axis]; @@ -65,12 +65,17 @@ int Gather::forward(const std::vector& bottom_blobs, std::vector& top_ // Decompose flat index i into coordinates based on top_blob shape int rem = i; int coord_out[4] = {0, 0, 0, 0}; - if (top_blob.dims == 1) { + if (top_blob.dims == 1) + { coord_out[0] = rem; - } else if (top_blob.dims == 2) { + } + else if (top_blob.dims == 2) + { coord_out[0] = rem % top_blob.w; coord_out[1] = rem / top_blob.w; - } else if (top_blob.dims == 3) { + } + else if (top_blob.dims == 3) + { int hw = top_blob.w * top_blob.h; coord_out[0] = (rem % hw) % top_blob.w; coord_out[1] = (rem % hw) / top_blob.w; @@ -92,11 +97,16 @@ int Gather::forward(const std::vector& bottom_blobs, std::vector& top_ // Compute flat input index int flat_in = 0; - if (dims == 1) { + if (dims == 1) + { flat_in = coord_in[0]; - } else if (dims == 2) { + } + else if (dims == 2) + { flat_in = coord_in[0] + coord_in[1] * input_blob.w; - } else if (dims == 3) { + } + else if (dims == 3) + { // ncnn 3D layout: w * h * c, with cstride padding size_t cstep = input_blob.cstep; flat_in = coord_in[0] + coord_in[1] * 
input_blob.w + coord_in[2] * (int)cstep; From 93bd42378acaaab0e5aee237dca92b1c68002197 Mon Sep 17 00:00:00 2001 From: vlordier Date: Sat, 11 Apr 2026 10:42:25 +0200 Subject: [PATCH 29/29] =?UTF-8?q?feat:=20add=20Tensor.to=20=E2=86=92=20Cas?= =?UTF-8?q?t=20conversion=20with=20int64/int32=20support?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - src/layer/cast.{h,cpp}: extend Cast layer with int64 (type 5) and int32 (type 6) support, adding conversions int64↔float32 and int32↔float32 - pass_ncnn/tensor_to.cpp (NEW): convert Tensor.to (dtype cast) to ncnn Cast layer, mapping torch dtype strings to ncnn type codes - CMakeLists.txt: register tensor_to.cpp in pass_ncnn sources Verified: YOLOv10n Tensor.to (i64→f32) now converts to Cast layer instead of being ignored. Only cosmetic ops (pnnx.Expression) remain. Co-authored-by: Qwen-Coder --- src/layer/cast.cpp | 74 ++++++++++++++++++++++++++ src/layer/cast.h | 2 + tools/pnnx/src/CMakeLists.txt | 1 + tools/pnnx/src/pass_ncnn/tensor_to.cpp | 67 +++++++++++++++++++++++ 4 files changed, 144 insertions(+) create mode 100644 tools/pnnx/src/pass_ncnn/tensor_to.cpp diff --git a/src/layer/cast.cpp b/src/layer/cast.cpp index 3dcff38f3cac..e18a7c3a8ae2 100644 --- a/src/layer/cast.cpp +++ b/src/layer/cast.cpp @@ -74,6 +74,16 @@ int Cast::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons // bfloat16 out_elemsize = 2 * elempack; } + else if (type_to == 5) + { + // int64 + out_elemsize = 8 * elempack; + } + else if (type_to == 6) + { + // int32 + out_elemsize = 4 * elempack; + } if (dims == 1) { @@ -173,6 +183,70 @@ int Cast::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons // TODO more cast type + if (type_from == 5 && type_to == 1) + { + // int64 → float32 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const long long* ptr = bottom_blob.channel(q); + float* outptr = top_blob.channel(q); + + for 
(int i = 0; i < size; i++) + { + outptr[i] = (float)ptr[i]; + } + } + } + + if (type_from == 1 && type_to == 5) + { + // float32 → int64 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + long long* outptr = top_blob.channel(q); + + for (int i = 0; i < size; i++) + { + outptr[i] = (long long)ptr[i]; + } + } + } + + if (type_from == 6 && type_to == 1) + { + // int32 → float32 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* ptr = bottom_blob.channel(q); + float* outptr = top_blob.channel(q); + + for (int i = 0; i < size; i++) + { + outptr[i] = (float)ptr[i]; + } + } + } + + if (type_from == 1 && type_to == 6) + { + // float32 → int32 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + int* outptr = top_blob.channel(q); + + for (int i = 0; i < size; i++) + { + outptr[i] = (int)ptr[i]; + } + } + } + return 0; } diff --git a/src/layer/cast.h b/src/layer/cast.h index 036e61efed04..22c8f5da4626 100644 --- a/src/layer/cast.h +++ b/src/layer/cast.h @@ -24,6 +24,8 @@ class Cast : public Layer // 2 = float16 // 3 = int8 // 4 = bfloat16 + // 5 = int64 + // 6 = int32 int type_from; int type_to; }; diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt index 15aa16b46376..86c0593b9b37 100644 --- a/tools/pnnx/src/CMakeLists.txt +++ b/tools/pnnx/src/CMakeLists.txt @@ -616,6 +616,7 @@ set(pnnx_pass_ncnn_SRCS pass_ncnn/torch_roll.cpp pass_ncnn/torch_slice_scatter.cpp pass_ncnn/torch_squeeze.cpp + pass_ncnn/tensor_to.cpp pass_ncnn/torch_sum.cpp pass_ncnn/torch_stft.cpp pass_ncnn/torch_t.cpp diff --git a/tools/pnnx/src/pass_ncnn/tensor_to.cpp b/tools/pnnx/src/pass_ncnn/tensor_to.cpp new file mode 100644 index 000000000000..252498fd0ffa --- /dev/null +++ b/tools/pnnx/src/pass_ncnn/tensor_to.cpp @@ -0,0 +1,67 @@ +// 
Copyright 2025 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +#include "pass_ncnn.h" + +namespace pnnx { + +namespace ncnn { + +class Tensor_to : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 2 +pnnx.Input input_0 0 1 input +Tensor.to op_0 1 1 input out copy=%copy dtype=%dtype +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "Cast"; + } + + const char* name_str() const + { + return "to"; + } + + void write(Operator* op, const std::map& captured_params) const + { + // Map torch dtype to ncnn cast type + // torch.float = 1 (float32), torch.int64 = 5 (int64), torch.int32 = 6 (int32), etc. + // The input type is auto-detected, we only need to set the target type + std::string dtype = "torch.float"; + if (captured_params.find("dtype") != captured_params.end()) + { + dtype = captured_params.at("dtype").s; + } + + int type_to = 0; + if (dtype == "torch.float" || dtype == "torch.float32") + type_to = 1; + else if (dtype == "torch.float16" || dtype == "torch.half") + type_to = 2; + else if (dtype == "torch.int8") + type_to = 3; + else if (dtype == "torch.bfloat16") + type_to = 4; + else if (dtype == "torch.int64" || dtype == "torch.long") + type_to = 5; + else if (dtype == "torch.int32" || dtype == "torch.int") + type_to = 6; + + op->params["0"] = 0; // auto-detect input type + op->params["1"] = type_to; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(Tensor_to, 20) + +} // namespace ncnn + +} // namespace pnnx