From 917dfd5fb8b78da43ff7fc6e42632c946e65e21f Mon Sep 17 00:00:00 2001 From: chenglimin Date: Tue, 10 Feb 2026 16:00:52 +0800 Subject: [PATCH 01/10] add the deformableconv2d operator for RVV backend, with 12.94x-20.16x speedup over scaler implementation --- src/layer/riscv/deformableconv2d_pack1ton.h | 160 +++++++ src/layer/riscv/deformableconv2d_packn.h | 182 ++++++++ src/layer/riscv/deformableconv2d_packnto1.h | 174 +++++++ src/layer/riscv/deformableconv2d_riscv.cpp | 482 ++++++++++++++++++++ src/layer/riscv/deformableconv2d_riscv.h | 43 ++ 5 files changed, 1041 insertions(+) create mode 100644 src/layer/riscv/deformableconv2d_pack1ton.h create mode 100644 src/layer/riscv/deformableconv2d_packn.h create mode 100644 src/layer/riscv/deformableconv2d_packnto1.h create mode 100644 src/layer/riscv/deformableconv2d_riscv.cpp create mode 100644 src/layer/riscv/deformableconv2d_riscv.h diff --git a/src/layer/riscv/deformableconv2d_pack1ton.h b/src/layer/riscv/deformableconv2d_pack1ton.h new file mode 100644 index 000000000000..dfa046c15e57 --- /dev/null +++ b/src/layer/riscv/deformableconv2d_pack1ton.h @@ -0,0 +1,160 @@ + + +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack1ton(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const float* bias_data_ptr = bias_data; + const int packn = csrr_vlenb() / 4; + const size_t vl = __riscv_vsetvl_e32m1(packn); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl); + if (bias_data_ptr) + _sum = __riscv_vle32_v_f32m1(bias_data_ptr + oc * packn, vl); + + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = (int)floorf(h_im); + int w_low = (int)floorf(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + + if (cond) + { + float v_in = 0.f; + if (v1_cond) v_in += data_im_ptr[v1_pos] * w1; + if (v2_cond) v_in += data_im_ptr[v2_pos] * w2; + if (v3_cond) v_in += data_im_ptr[v3_pos] * w3; + if (v4_cond) v_in += data_im_ptr[v4_pos] * w4; + + if (has_mask) v_in *= mask_; + + vfloat32m1_t _w = __riscv_vle32_v_f32m1(kptr, vl); + _sum = __riscv_vfmacc_vf_f32m1(_sum, v_in, _w, vl); + } + + kptr += packn; + } + } + } + _sum = activation_ps(_sum, activation_type, activation_params, vl); + __riscv_vse32_v_f32m1(outptr + (h_col * outw + w_col) * packn, _sum, vl); + } + } + } +} + diff --git a/src/layer/riscv/deformableconv2d_packn.h b/src/layer/riscv/deformableconv2d_packn.h new file mode 100644 index 000000000000..c44e8bacc19e --- /dev/null +++ b/src/layer/riscv/deformableconv2d_packn.h @@ -0,0 +1,182 @@ + + +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_packn(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const float* bias_data_ptr = bias_data; + const int packn = csrr_vlenb() / 4; + const size_t vl = __riscv_vsetvl_e32m1(packn); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl); + if (bias_data_ptr) + _sum = __riscv_vle32_v_f32m1(bias_data_ptr + oc * packn, vl); + + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = (int)floorf(h_im); + int w_low = (int)floorf(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + + if (cond) + { + vfloat32m1_t _val = __riscv_vfmv_v_f_f32m1(0.f, vl); + + // Since we are iterating over input channels which are packed, + // we need to handle each element in the pack. + // However, the weight layout for packn is: + // [outch/packn][kh][kw][inch/packn][packn_in][packn_out] + // Wait, let's check the weight transformation in deformableconv2d_riscv.cpp + // weight_data_tm.create(num_input * maxk * num_output / (elempack * out_elempack), (size_t)4u * elempack * out_elempack, elempack * out_elempack); + // It seems the weight is packed as [packn_in * packn_out] + + // For each input channel pack (size packn), we have packn input values. + // Each input value contributes to all packn output values. + // So we have packn * packn weights for this block. + + // Let's look at x86 implementation again. + // _val_channel0..3 corresponds to the 4 input values in the pack. + // _conv_w0..3 corresponds to the weights for these input values. + // Each _conv_w is a vector of size 4 (out_elempack), representing weights for one input channel to all 4 output channels. + + for (int k = 0; k < packn; k++) + { + float v_in = 0.f; + if (v1_cond) v_in += data_im_ptr[v1_pos * packn + k] * w1; + if (v2_cond) v_in += data_im_ptr[v2_pos * packn + k] * w2; + if (v3_cond) v_in += data_im_ptr[v3_pos * packn + k] * w3; + if (v4_cond) v_in += data_im_ptr[v4_pos * packn + k] * w4; + + if (has_mask) v_in *= mask_; + + vfloat32m1_t _w = __riscv_vle32_v_f32m1(kptr + k * packn, vl); + _sum = __riscv_vfmacc_vf_f32m1(_sum, v_in, _w, vl); + } + } + + kptr += packn * packn; + } + } + } + _sum = activation_ps(_sum, activation_type, activation_params, vl); + __riscv_vse32_v_f32m1(outptr + (h_col * outw + w_col) * packn, _sum, vl); + } + } + } +} + diff --git a/src/layer/riscv/deformableconv2d_packnto1.h b/src/layer/riscv/deformableconv2d_packnto1.h new file mode 100644 index 000000000000..9c04b28c35ca --- /dev/null +++ b/src/layer/riscv/deformableconv2d_packnto1.h @@ -0,0 +1,174 @@ + + + +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_packnto1(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const float* bias_data_ptr = bias_data; + const int packn = csrr_vlenb() / 4; + const size_t vl = __riscv_vsetvl_e32m1(packn); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + float sum = 0.f; + if (bias_data_ptr) + sum = bias_data_ptr[oc]; + + vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl); + + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = (int)floorf(h_im); + int w_low = (int)floorf(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + + if (cond) + { + vfloat32m1_t _v1 = v1_cond ? __riscv_vle32_v_f32m1(data_im_ptr + v1_pos * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _v2 = v2_cond ? __riscv_vle32_v_f32m1(data_im_ptr + v2_pos * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _v3 = v3_cond ? __riscv_vle32_v_f32m1(data_im_ptr + v3_pos * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _v4 = v4_cond ? __riscv_vle32_v_f32m1(data_im_ptr + v4_pos * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl); + + vfloat32m1_t _val = __riscv_vfmv_v_f_f32m1(0.f, vl); + _val = __riscv_vfmacc_vf_f32m1(_val, w1, _v1, vl); + _val = __riscv_vfmacc_vf_f32m1(_val, w2, _v2, vl); + _val = __riscv_vfmacc_vf_f32m1(_val, w3, _v3, vl); + _val = __riscv_vfmacc_vf_f32m1(_val, w4, _v4, vl); + + if (has_mask) + _val = __riscv_vfmul_vf_f32m1(_val, mask_, vl); + + vfloat32m1_t _w = __riscv_vle32_v_f32m1(kptr, vl); + _sum = __riscv_vfmacc_vv_f32m1(_sum, _val, _w, vl); + } + + kptr += packn; + } + } + } + + vfloat32m1_t _v_sum = __riscv_vfredusum_vs_f32m1_f32m1(_sum, __riscv_vfmv_v_f_f32m1(0.f, vl), vl); + sum += __riscv_vfmv_f_s_f32m1_f32(_v_sum); + + sum = activation_ss(sum, activation_type, activation_params); + outptr[h_col * outw + w_col] = sum; + } + } + } +} + + diff --git a/src/layer/riscv/deformableconv2d_riscv.cpp b/src/layer/riscv/deformableconv2d_riscv.cpp new file mode 100644 index 000000000000..d88eda0e4c7c --- /dev/null +++ b/src/layer/riscv/deformableconv2d_riscv.cpp @@ -0,0 +1,482 @@ + +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "deformableconv2d_riscv.h" + +#if __riscv_vector +#include +#endif // __riscv_vector + +#include "riscv_activation.h" +#include "riscv_usability.h" + +#include "benchmark.h" +#include "cpu.h" +#include "layer_type.h" + +namespace ncnn { + +#if __riscv_vector +#include "deformableconv2d_packn.h" +#include "deformableconv2d_pack1ton.h" +#include "deformableconv2d_packnto1.h" +#endif // __riscv_vector + +DeformableConv2D_riscv::DeformableConv2D_riscv() +{ +#if __riscv_vector + support_packing = true; +#endif // __riscv_vector + + activation = 0; + gemm = 0; +} + +static int _4Dindex_to_1Dindex(int i0, int i1, int i2, int i3, int l1, int l2, int l3) +{ + return ((i0 * l1 + i1) * l2 + i2) * l3 + i3; +} + +static int _6Dindex_to_1Dindex(int i0, int i1, int i2, int i3, int i4, int i5, int l1, int l2, int l3, int l4, int l5) +{ + return ((((i0 * l1 + i1) * l2 + i2) * l3 + i3) * l4 + i4) * l5 + i5; +} + +#if __riscv_vector +static void deformableconv2d_transform_kernel_packed_riscv(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack) +{ + const int maxk = kernel_w * kernel_h; + + // src = kw-kh-inch-outch + // dst = pb-pa-inch/pa-kw-kh-outch/pb + { + const float* weight_ptr = weight_data; + + weight_data_tm.create(num_input * maxk * num_output / (elempack * out_elempack), (size_t)4u * elempack * out_elempack, elempack * out_elempack); + float* ptr = weight_data_tm; + for (int oc = 0; oc < num_output; oc++) + { + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + for (int ic = 0; ic < num_input; ic++) + { + ptr[_6Dindex_to_1Dindex(oc / out_elempack, i, j, ic / elempack, ic % elempack, oc % out_elempack, kernel_h, kernel_w, num_input / elempack, elempack, out_elempack)] = weight_ptr[_4Dindex_to_1Dindex(oc, ic, i, j, num_input, kernel_h, kernel_w)]; + } + } + } + } + weight_data_tm = weight_data_tm.reshape(num_input / elempack, maxk, num_output / out_elempack); + } +} +#endif // __riscv_vector + +int DeformableConv2D_riscv::create_pipeline(const Option& opt) +{ + activation = create_activation_layer(activation_type, activation_params, opt); + + int kernel_size = kernel_w * kernel_h; + int num_input = weight_data_size / kernel_size / num_output; + + int elempack = 1; + int out_elempack = 1; + +#if __riscv_vector + if (opt.use_packing_layout) + { + const int packn = csrr_vlenb() / 4; + elempack = num_input % packn == 0 ? packn : 1; + out_elempack = num_output % packn == 0 ? packn : 1; + } +#endif // __riscv_vector + + if (opt.use_sgemm_convolution) + { + const int maxk = kernel_w * kernel_h; + + gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm); + + ncnn::ParamDict pd; + pd.set(2, 0); // transA + pd.set(3, 0); // transB + pd.set(4, 1); // constantA + pd.set(5, 0); // constantB + pd.set(6, 1); // constantC + pd.set(7, num_output); // M = outch + pd.set(8, 0); // N = size + pd.set(9, maxk * num_input); // K = maxk*inch + pd.set(10, bias_term ? 1 : -1); // constant_broadcast_type_C = (M) + pd.set(11, 1); // output_N1M + + gemm->load_param(pd); + + // maxk-inch-outch to pa-maxk-inch/pa-outch + Mat tmp; + { + Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); + + tmp.create(maxk * num_input, num_output); + + for (int q = 0; q < num_output; q += 1) + { + float* g00 = tmp.row(q); + + for (int p = 0; p + (elempack - 1) < num_input; p += elempack) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < elempack; i++) + { + const float* k00 = weight_data_r2.channel(q).row(p + i); + g00[0] = k00[k]; + g00++; + } + } + } + } + } + + if (bias_term) + { + ncnn::Mat weights[2]; + weights[0] = tmp; + weights[1] = bias_data; + + gemm->load_model(ModelBinFromMatArray(weights)); + } + else + { + ncnn::Mat weights[1]; + weights[0] = tmp; + + gemm->load_model(ModelBinFromMatArray(weights)); + } + + gemm->create_pipeline(opt); + } + else if (elempack == 1 && out_elempack == 1) + { + weight_data_tm = weight_data; + } + else + { +#if __riscv_vector + deformableconv2d_transform_kernel_packed_riscv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); +#endif // __riscv_vector + } + + if (opt.lightmode) + { + if (!(elempack == 1 && out_elempack == 1)) + weight_data.release(); + } + + return 0; +} + +int DeformableConv2D_riscv::destroy_pipeline(const Option& opt) +{ + if (activation) + { + activation->destroy_pipeline(opt); + delete activation; + activation = 0; + } + + if (gemm) + { + gemm->destroy_pipeline(opt); + delete gemm; + gemm = 0; + } + + return 0; +} + +int DeformableConv2D_riscv::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + Mat& top_blob = top_blobs[0]; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + const int outw = (w + pad_left + pad_right - kernel_extent_w) / stride_w + 1; + const int outh = (h + pad_top + pad_bottom - kernel_extent_h) / stride_h + 1; + + int out_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + const int packn = csrr_vlenb() / 4; + out_elempack = num_output % packn == 0 ? packn : 1; + } +#endif // __riscv_vector + size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (opt.use_sgemm_convolution) + { + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + Mat offset_unpacked; + convert_packing(offset, offset_unpacked, 1, opt); + + Mat mask_unpacked; + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + convert_packing(mask, mask_unpacked, 1, opt); + } + + // im2col + Mat bottom_im2col(size, maxk * channels, elemsize, elempack, opt.workspace_allocator); + +#if __riscv_vector + const int packn = csrr_vlenb() / 4; + if (elempack == packn) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const Mat img = bottom_blob.channel(p); + float* ptr = bottom_im2col.row(p * maxk); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const Mat offset_h_k = offset_unpacked.channel((u * kernel_w + v) * 2); + const Mat offset_w_k = offset_unpacked.channel((u * kernel_w + v) * 2 + 1); + const Mat mask_k = has_mask ? mask_unpacked.channel(u * kernel_w + v) : 0; + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float offset_h = offset_h_k.row(i)[j]; + float offset_w = offset_w_k.row(i)[j]; + + int h_in = i * stride_h - pad_top; + int w_in = j * stride_w - pad_left; + + const float h_im = h_in + u * dilation_h + offset_h; + const float w_im = w_in + v * dilation_w + offset_w; + + // Bilinear + size_t vl = __riscv_vsetvl_e32m1(packn); + vfloat32m1_t _val = __riscv_vfmv_v_f_f32m1(0.f, vl); + bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + bool v1_cond = (h_low >= 0 && w_low >= 0); + bool v2_cond = (h_low >= 0 && w_high <= w - 1); + bool v3_cond = (h_high <= h - 1 && w_low >= 0); + bool v4_cond = (h_high <= h - 1 && w_high <= w - 1); + + float w1 = hh * hw; + float w2 = hh * lw; + float w3 = lh * hw; + float w4 = lh * lw; + + vfloat32m1_t _v1 = v1_cond ? __riscv_vle32_v_f32m1((const float*)img.row(h_low) + w_low * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _v2 = v2_cond ? __riscv_vle32_v_f32m1((const float*)img.row(h_low) + w_high * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _v3 = v3_cond ? __riscv_vle32_v_f32m1((const float*)img.row(h_high) + w_low * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _v4 = v4_cond ? __riscv_vle32_v_f32m1((const float*)img.row(h_high) + w_high * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl); + + _val = __riscv_vfmacc_vf_f32m1(_val, w1, _v1, vl); + _val = __riscv_vfmacc_vf_f32m1(_val, w2, _v2, vl); + _val = __riscv_vfmacc_vf_f32m1(_val, w3, _v3, vl); + _val = __riscv_vfmacc_vf_f32m1(_val, w4, _v4, vl); + + if (has_mask) + _val = __riscv_vfmul_vf_f32m1(_val, mask_k.row(i)[j], vl); + } + + __riscv_vse32_v_f32m1(ptr, _val, vl); + + ptr += packn; + } + } + } + } + } + } +#endif // __riscv_vector + + if (elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const Mat img = bottom_blob.channel(p); + float* ptr = bottom_im2col.row(p * maxk); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const Mat offset_h_k = offset_unpacked.channel((u * kernel_w + v) * 2); + const Mat offset_w_k = offset_unpacked.channel((u * kernel_w + v) * 2 + 1); + const Mat mask_k = has_mask ? mask_unpacked.channel(u * kernel_w + v) : 0; + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float offset_h = offset_h_k.row(i)[j]; + float offset_w = offset_w_k.row(i)[j]; + + int h_in = i * stride_h - pad_top; + int w_in = j * stride_w - pad_left; + + const float h_im = h_in + u * dilation_h + offset_h; + const float w_im = w_in + v * dilation_w + offset_w; + + // Bilinear + float val = 0.f; + bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + if (cond) + { + int h_low = (int)floorf(h_im); + int w_low = (int)floorf(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + bool v1_cond = (h_low >= 0 && w_low >= 0); + bool v2_cond = (h_low >= 0 && w_high <= w - 1); + bool v3_cond = (h_high <= h - 1 && w_low >= 0); + bool v4_cond = (h_high <= h - 1 && w_high <= w - 1); + + float w1 = hh * hw; + float w2 = hh * lw; + float w3 = lh * hw; + float w4 = lh * lw; + + float v1 = v1_cond ? img.row(h_low)[w_low] : 0.f; + float v2 = v2_cond ? img.row(h_low)[w_high] : 0.f; + float v3 = v3_cond ? img.row(h_high)[w_low] : 0.f; + float v4 = v4_cond ? img.row(h_high)[w_high] : 0.f; + val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; + + if (has_mask) + val *= mask_k.row(i)[j]; + } + + ptr[0] = val; + + ptr += 1; + } + } + } + } + } + } + + // sgemm + { + top_blob.w = outw * outh; + top_blob.h = 1; + } + Option opt_b = opt; + opt_b.blob_allocator = opt.workspace_allocator; + gemm->forward(bottom_im2col, top_blob, opt_b); + { + top_blob.w = outw; + top_blob.h = outh; + } + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { +#if __riscv_vector + const int packn = csrr_vlenb() / 4; + + if (elempack == packn && out_elempack == packn) + { + deformableconv2d_packn(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + + if (elempack == 1 && out_elempack == packn) + { + deformableconv2d_pack1ton(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + + if (elempack == packn && out_elempack == 1) + { + deformableconv2d_packnto1(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + + if (elempack == 1 && out_elempack == 1) + { + std::vector bottom_blobs_unpacked = bottom_blobs; + Mat offset_unpacked; + if (offset.elempack != 1) + { + convert_packing(offset, offset_unpacked, 1, opt); + bottom_blobs_unpacked[1] = offset_unpacked; + } + + if (bottom_blobs.size() == 3) + { + const Mat& mask = bottom_blobs[2]; + if (mask.elempack != 1) + { + Mat mask_unpacked; + convert_packing(mask, mask_unpacked, 1, opt); + bottom_blobs_unpacked[2] = mask_unpacked; + } + } + + return DeformableConv2D::forward(bottom_blobs_unpacked, top_blobs, opt); + } +#endif // __riscv_vector + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/riscv/deformableconv2d_riscv.h b/src/layer/riscv/deformableconv2d_riscv.h new file mode 100644 index 000000000000..5d6a3b7765be --- /dev/null +++ b/src/layer/riscv/deformableconv2d_riscv.h @@ -0,0 +1,43 @@ + +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_DEFORMABLECONV2D_RISCV_H +#define LAYER_DEFORMABLECONV2D_RISCV_H + +#include "deformableconv2d.h" + +namespace ncnn { + +class DeformableConv2D_riscv : public DeformableConv2D +{ +public: + DeformableConv2D_riscv(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; + +public: + Layer* activation; + + Mat weight_data_tm; + + Layer* gemm; +}; + +} // namespace ncnn + +#endif // LAYER_DEFORMABLECONV2D_RISCV_H From 2934858505513cf601a43ca3568793802876be9e Mon Sep 17 00:00:00 2001 From: chenglimin <18213449+chenglimin@users.noreply.github.com> Date: Tue, 10 Feb 2026 08:54:17 +0000 Subject: [PATCH 02/10] apply code-format changes --- src/layer/riscv/deformableconv2d_pack1ton.h | 11 +++++------ src/layer/riscv/deformableconv2d_packn.h | 19 +++++++++---------- src/layer/riscv/deformableconv2d_packnto1.h | 21 +++++++++------------ 3 files changed, 23 insertions(+), 28 deletions(-) diff --git a/src/layer/riscv/deformableconv2d_pack1ton.h b/src/layer/riscv/deformableconv2d_pack1ton.h index dfa046c15e57..628a9f7ae470 100644 --- a/src/layer/riscv/deformableconv2d_pack1ton.h +++ b/src/layer/riscv/deformableconv2d_pack1ton.h @@ -48,7 +48,7 @@ static void deformableconv2d_pack1ton(const std::vector& bottom_blobs, Mat& vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl); if (bias_data_ptr) _sum = __riscv_vle32_v_f32m1(bias_data_ptr + oc * packn, vl); - + for (int i = 0; i < kernel_h; i++) { for (int j = 0; j < kernel_w; j++) @@ -132,7 +132,7 @@ static void deformableconv2d_pack1ton(const std::vector& bottom_blobs, Mat& for (int ic = 0; ic < inch; ic++) { const float* data_im_ptr = bottom_blob.channel(ic); - + if (cond) { float v_in = 0.f; @@ -140,13 +140,13 @@ static void deformableconv2d_pack1ton(const std::vector& bottom_blobs, Mat& if (v2_cond) v_in += data_im_ptr[v2_pos] * w2; if (v3_cond) v_in += data_im_ptr[v3_pos] * w3; if (v4_cond) v_in += data_im_ptr[v4_pos] * w4; - + if (has_mask) v_in *= mask_; - + vfloat32m1_t _w = __riscv_vle32_v_f32m1(kptr, vl); _sum = __riscv_vfmacc_vf_f32m1(_sum, v_in, _w, vl); } - + kptr += packn; } } @@ -157,4 +157,3 @@ static void deformableconv2d_pack1ton(const std::vector& bottom_blobs, Mat& } } } - diff --git a/src/layer/riscv/deformableconv2d_packn.h b/src/layer/riscv/deformableconv2d_packn.h index c44e8bacc19e..50346e6816bd 100644 --- a/src/layer/riscv/deformableconv2d_packn.h +++ b/src/layer/riscv/deformableconv2d_packn.h @@ -48,7 +48,7 @@ static void deformableconv2d_packn(const std::vector& bottom_blobs, Mat& to vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl); if (bias_data_ptr) _sum = __riscv_vle32_v_f32m1(bias_data_ptr + oc * packn, vl); - + for (int i = 0; i < kernel_h; i++) { for (int j = 0; j < kernel_w; j++) @@ -132,11 +132,11 @@ static void deformableconv2d_packn(const std::vector& bottom_blobs, Mat& to for (int ic = 0; ic < inch; ic++) { const float* data_im_ptr = bottom_blob.channel(ic); - + if (cond) { vfloat32m1_t _val = __riscv_vfmv_v_f_f32m1(0.f, vl); - + // Since we are iterating over input channels which are packed, // we need to handle each element in the pack. // However, the weight layout for packn is: @@ -144,16 +144,16 @@ static void deformableconv2d_packn(const std::vector& bottom_blobs, Mat& to // Wait, let's check the weight transformation in deformableconv2d_riscv.cpp // weight_data_tm.create(num_input * maxk * num_output / (elempack * out_elempack), (size_t)4u * elempack * out_elempack, elempack * out_elempack); // It seems the weight is packed as [packn_in * packn_out] - + // For each input channel pack (size packn), we have packn input values. // Each input value contributes to all packn output values. // So we have packn * packn weights for this block. - + // Let's look at x86 implementation again. // _val_channel0..3 corresponds to the 4 input values in the pack. // _conv_w0..3 corresponds to the weights for these input values. // Each _conv_w is a vector of size 4 (out_elempack), representing weights for one input channel to all 4 output channels. - + for (int k = 0; k < packn; k++) { float v_in = 0.f; @@ -161,14 +161,14 @@ static void deformableconv2d_packn(const std::vector& bottom_blobs, Mat& to if (v2_cond) v_in += data_im_ptr[v2_pos * packn + k] * w2; if (v3_cond) v_in += data_im_ptr[v3_pos * packn + k] * w3; if (v4_cond) v_in += data_im_ptr[v4_pos * packn + k] * w4; - + if (has_mask) v_in *= mask_; - + vfloat32m1_t _w = __riscv_vle32_v_f32m1(kptr + k * packn, vl); _sum = __riscv_vfmacc_vf_f32m1(_sum, v_in, _w, vl); } } - + kptr += packn * packn; } } @@ -179,4 +179,3 @@ static void deformableconv2d_packn(const std::vector& bottom_blobs, Mat& to } } } - diff --git a/src/layer/riscv/deformableconv2d_packnto1.h b/src/layer/riscv/deformableconv2d_packnto1.h index 9c04b28c35ca..2f5896c04f79 100644 --- a/src/layer/riscv/deformableconv2d_packnto1.h +++ b/src/layer/riscv/deformableconv2d_packnto1.h @@ -1,6 +1,5 @@ - // Tencent is pleased to support the open source community by making ncnn available. // // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. @@ -49,9 +48,9 @@ static void deformableconv2d_packnto1(const std::vector& bottom_blobs, Mat& float sum = 0.f; if (bias_data_ptr) sum = bias_data_ptr[oc]; - + vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl); - + for (int i = 0; i < kernel_h; i++) { for (int j = 0; j < kernel_w; j++) @@ -135,40 +134,38 @@ static void deformableconv2d_packnto1(const std::vector& bottom_blobs, Mat& for (int ic = 0; ic < inch; ic++) { const float* data_im_ptr = bottom_blob.channel(ic); - + if (cond) { vfloat32m1_t _v1 = v1_cond ? __riscv_vle32_v_f32m1(data_im_ptr + v1_pos * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl); vfloat32m1_t _v2 = v2_cond ? __riscv_vle32_v_f32m1(data_im_ptr + v2_pos * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl); vfloat32m1_t _v3 = v3_cond ? __riscv_vle32_v_f32m1(data_im_ptr + v3_pos * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl); vfloat32m1_t _v4 = v4_cond ? __riscv_vle32_v_f32m1(data_im_ptr + v4_pos * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl); - + vfloat32m1_t _val = __riscv_vfmv_v_f_f32m1(0.f, vl); _val = __riscv_vfmacc_vf_f32m1(_val, w1, _v1, vl); _val = __riscv_vfmacc_vf_f32m1(_val, w2, _v2, vl); _val = __riscv_vfmacc_vf_f32m1(_val, w3, _v3, vl); _val = __riscv_vfmacc_vf_f32m1(_val, w4, _v4, vl); - + if (has_mask) _val = __riscv_vfmul_vf_f32m1(_val, mask_, vl); - + vfloat32m1_t _w = __riscv_vle32_v_f32m1(kptr, vl); _sum = __riscv_vfmacc_vv_f32m1(_sum, _val, _w, vl); } - + kptr += packn; } } } - + vfloat32m1_t _v_sum = __riscv_vfredusum_vs_f32m1_f32m1(_sum, __riscv_vfmv_v_f_f32m1(0.f, vl), vl); sum += __riscv_vfmv_f_s_f32m1_f32(_v_sum); - + sum = activation_ss(sum, activation_type, activation_params); outptr[h_col * outw + w_col] = sum; } } } } - - From 8ceffbed9d2a5804eabd71b959a0be215fdeb377 Mon Sep 17 00:00:00 2001 From: chenglimin Date: Tue, 10 Feb 2026 18:23:21 +0800 Subject: [PATCH 03/10] Fix deformableconv2d RVV implementation --- src/layer/riscv/deformableconv2d_pack1ton.h | 28 +++++----------- src/layer/riscv/deformableconv2d_packn.h | 36 +++++++------------- src/layer/riscv/deformableconv2d_packnto1.h | 37 ++++++++------------- src/layer/riscv/deformableconv2d_riscv.cpp | 16 ++------- src/layer/riscv/deformableconv2d_riscv.h | 16 ++------- 5 files changed, 37 insertions(+), 96 deletions(-) diff --git a/src/layer/riscv/deformableconv2d_pack1ton.h b/src/layer/riscv/deformableconv2d_pack1ton.h index 628a9f7ae470..945d2b5e39c0 100644 --- a/src/layer/riscv/deformableconv2d_pack1ton.h +++ b/src/layer/riscv/deformableconv2d_pack1ton.h @@ -1,18 +1,5 @@ - - -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause static void deformableconv2d_pack1ton(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) { @@ -48,7 +35,7 @@ static void deformableconv2d_pack1ton(const std::vector& bottom_blobs, Mat& vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl); if (bias_data_ptr) _sum = __riscv_vle32_v_f32m1(bias_data_ptr + oc * packn, vl); - + for (int i = 0; i < kernel_h; i++) { for (int j = 0; j < kernel_w; j++) @@ -132,7 +119,7 @@ static void deformableconv2d_pack1ton(const std::vector& bottom_blobs, Mat& for (int ic = 0; ic < inch; ic++) { const float* data_im_ptr = bottom_blob.channel(ic); - + if (cond) { float v_in = 0.f; @@ -140,13 +127,13 @@ static void deformableconv2d_pack1ton(const std::vector& bottom_blobs, Mat& if (v2_cond) v_in += data_im_ptr[v2_pos] * w2; if (v3_cond) v_in += data_im_ptr[v3_pos] * w3; if (v4_cond) v_in += data_im_ptr[v4_pos] * w4; - + if (has_mask) v_in *= mask_; - + vfloat32m1_t _w = __riscv_vle32_v_f32m1(kptr, vl); _sum = __riscv_vfmacc_vf_f32m1(_sum, v_in, _w, vl); } - + kptr += packn; } } @@ -157,3 +144,4 @@ static void deformableconv2d_pack1ton(const std::vector& bottom_blobs, Mat& } } } + diff --git a/src/layer/riscv/deformableconv2d_packn.h b/src/layer/riscv/deformableconv2d_packn.h index 50346e6816bd..61612a260aea 100644 --- a/src/layer/riscv/deformableconv2d_packn.h +++ b/src/layer/riscv/deformableconv2d_packn.h @@ -1,18 +1,5 @@ - - -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause static void deformableconv2d_packn(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) { @@ -48,7 +35,7 @@ static void deformableconv2d_packn(const std::vector& bottom_blobs, Mat& to vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl); if (bias_data_ptr) _sum = __riscv_vle32_v_f32m1(bias_data_ptr + oc * packn, vl); - + for (int i = 0; i < kernel_h; i++) { for (int j = 0; j < kernel_w; j++) @@ -132,11 +119,11 @@ static void deformableconv2d_packn(const std::vector& bottom_blobs, Mat& to for (int ic = 0; ic < inch; ic++) { const float* data_im_ptr = bottom_blob.channel(ic); - + if (cond) { vfloat32m1_t _val = __riscv_vfmv_v_f_f32m1(0.f, vl); - + // Since we are iterating over input channels which are packed, // we need to handle each element in the pack. // However, the weight layout for packn is: @@ -144,16 +131,16 @@ static void deformableconv2d_packn(const std::vector& bottom_blobs, Mat& to // Wait, let's check the weight transformation in deformableconv2d_riscv.cpp // weight_data_tm.create(num_input * maxk * num_output / (elempack * out_elempack), (size_t)4u * elempack * out_elempack, elempack * out_elempack); // It seems the weight is packed as [packn_in * packn_out] - + // For each input channel pack (size packn), we have packn input values. // Each input value contributes to all packn output values. // So we have packn * packn weights for this block. - + // Let's look at x86 implementation again. // _val_channel0..3 corresponds to the 4 input values in the pack. // _conv_w0..3 corresponds to the weights for these input values. // Each _conv_w is a vector of size 4 (out_elempack), representing weights for one input channel to all 4 output channels. - + for (int k = 0; k < packn; k++) { float v_in = 0.f; @@ -161,14 +148,14 @@ static void deformableconv2d_packn(const std::vector& bottom_blobs, Mat& to if (v2_cond) v_in += data_im_ptr[v2_pos * packn + k] * w2; if (v3_cond) v_in += data_im_ptr[v3_pos * packn + k] * w3; if (v4_cond) v_in += data_im_ptr[v4_pos * packn + k] * w4; - + if (has_mask) v_in *= mask_; - + vfloat32m1_t _w = __riscv_vle32_v_f32m1(kptr + k * packn, vl); _sum = __riscv_vfmacc_vf_f32m1(_sum, v_in, _w, vl); } } - + kptr += packn * packn; } } @@ -179,3 +166,4 @@ static void deformableconv2d_packn(const std::vector& bottom_blobs, Mat& to } } } + diff --git a/src/layer/riscv/deformableconv2d_packnto1.h b/src/layer/riscv/deformableconv2d_packnto1.h index 2f5896c04f79..577058ea1d41 100644 --- a/src/layer/riscv/deformableconv2d_packnto1.h +++ b/src/layer/riscv/deformableconv2d_packnto1.h @@ -1,18 +1,5 @@ - - -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause static void deformableconv2d_packnto1(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) { @@ -48,9 +35,9 @@ static void deformableconv2d_packnto1(const std::vector& bottom_blobs, Mat& float sum = 0.f; if (bias_data_ptr) sum = bias_data_ptr[oc]; - + vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl); - + for (int i = 0; i < kernel_h; i++) { for (int j = 0; j < kernel_w; j++) @@ -134,38 +121,40 @@ static void deformableconv2d_packnto1(const std::vector& bottom_blobs, Mat& for (int ic = 0; ic < inch; ic++) { const float* data_im_ptr = bottom_blob.channel(ic); - + if (cond) { vfloat32m1_t _v1 = v1_cond ? __riscv_vle32_v_f32m1(data_im_ptr + v1_pos * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl); vfloat32m1_t _v2 = v2_cond ? __riscv_vle32_v_f32m1(data_im_ptr + v2_pos * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl); vfloat32m1_t _v3 = v3_cond ? __riscv_vle32_v_f32m1(data_im_ptr + v3_pos * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl); vfloat32m1_t _v4 = v4_cond ? __riscv_vle32_v_f32m1(data_im_ptr + v4_pos * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl); - + vfloat32m1_t _val = __riscv_vfmv_v_f_f32m1(0.f, vl); _val = __riscv_vfmacc_vf_f32m1(_val, w1, _v1, vl); _val = __riscv_vfmacc_vf_f32m1(_val, w2, _v2, vl); _val = __riscv_vfmacc_vf_f32m1(_val, w3, _v3, vl); _val = __riscv_vfmacc_vf_f32m1(_val, w4, _v4, vl); - + if (has_mask) _val = __riscv_vfmul_vf_f32m1(_val, mask_, vl); - + vfloat32m1_t _w = __riscv_vle32_v_f32m1(kptr, vl); _sum = __riscv_vfmacc_vv_f32m1(_sum, _val, _w, vl); } - + kptr += packn; } } } - + vfloat32m1_t _v_sum = __riscv_vfredusum_vs_f32m1_f32m1(_sum, __riscv_vfmv_v_f_f32m1(0.f, vl), vl); sum += __riscv_vfmv_f_s_f32m1_f32(_v_sum); - + sum = activation_ss(sum, activation_type, activation_params); outptr[h_col * outw + w_col] = sum; } } } } + + diff --git a/src/layer/riscv/deformableconv2d_riscv.cpp b/src/layer/riscv/deformableconv2d_riscv.cpp index d88eda0e4c7c..e0975e1e6054 100644 --- a/src/layer/riscv/deformableconv2d_riscv.cpp +++ b/src/layer/riscv/deformableconv2d_riscv.cpp @@ -1,17 +1,5 @@ - -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause #include "deformableconv2d_riscv.h" diff --git a/src/layer/riscv/deformableconv2d_riscv.h b/src/layer/riscv/deformableconv2d_riscv.h index 5d6a3b7765be..d538e5097075 100644 --- a/src/layer/riscv/deformableconv2d_riscv.h +++ b/src/layer/riscv/deformableconv2d_riscv.h @@ -1,17 +1,5 @@ - -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause #ifndef LAYER_DEFORMABLECONV2D_RISCV_H #define LAYER_DEFORMABLECONV2D_RISCV_H From f8dd5ab068c17121f8f8e93817c505a48c84a153 Mon Sep 17 00:00:00 2001 From: chenglimin <18213449+chenglimin@users.noreply.github.com> Date: Tue, 10 Feb 2026 10:27:31 +0000 Subject: [PATCH 04/10] apply code-format changes --- src/layer/riscv/deformableconv2d_pack1ton.h | 11 +++++------ src/layer/riscv/deformableconv2d_packn.h | 19 +++++++++---------- src/layer/riscv/deformableconv2d_packnto1.h | 20 +++++++++----------- 3 files changed, 23 insertions(+), 27 deletions(-) diff --git a/src/layer/riscv/deformableconv2d_pack1ton.h b/src/layer/riscv/deformableconv2d_pack1ton.h index 945d2b5e39c0..e5ba658c9551 100644 --- a/src/layer/riscv/deformableconv2d_pack1ton.h +++ b/src/layer/riscv/deformableconv2d_pack1ton.h @@ -35,7 +35,7 @@ static void deformableconv2d_pack1ton(const std::vector& bottom_blobs, Mat& vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl); if (bias_data_ptr) _sum = __riscv_vle32_v_f32m1(bias_data_ptr + oc * packn, vl); - + for (int i = 0; i < kernel_h; i++) { for (int j = 0; j < kernel_w; j++) @@ -119,7 +119,7 @@ static void deformableconv2d_pack1ton(const std::vector& bottom_blobs, Mat& for (int ic = 0; ic < inch; ic++) { const float* data_im_ptr = bottom_blob.channel(ic); - + if (cond) { float v_in = 0.f; @@ -127,13 +127,13 @@ static void deformableconv2d_pack1ton(const std::vector& bottom_blobs, Mat& if (v2_cond) v_in += data_im_ptr[v2_pos] * w2; if (v3_cond) v_in += data_im_ptr[v3_pos] * w3; if (v4_cond) v_in += data_im_ptr[v4_pos] * w4; - + if (has_mask) v_in *= mask_; - + vfloat32m1_t _w = __riscv_vle32_v_f32m1(kptr, vl); _sum = __riscv_vfmacc_vf_f32m1(_sum, v_in, _w, vl); } - + kptr += packn; } } @@ -144,4 +144,3 @@ static void deformableconv2d_pack1ton(const std::vector& bottom_blobs, Mat& } } } - diff --git a/src/layer/riscv/deformableconv2d_packn.h b/src/layer/riscv/deformableconv2d_packn.h index 61612a260aea..4b2b0e7f194e 100644 --- a/src/layer/riscv/deformableconv2d_packn.h +++ b/src/layer/riscv/deformableconv2d_packn.h @@ -35,7 +35,7 @@ static void deformableconv2d_packn(const std::vector& bottom_blobs, Mat& to vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl); if (bias_data_ptr) _sum = __riscv_vle32_v_f32m1(bias_data_ptr + oc * packn, vl); - + for (int i = 0; i < kernel_h; i++) { for (int j = 0; j < kernel_w; j++) @@ -119,11 +119,11 @@ static void deformableconv2d_packn(const std::vector& bottom_blobs, Mat& to for (int ic = 0; ic < inch; ic++) { const float* data_im_ptr = bottom_blob.channel(ic); - + if (cond) { vfloat32m1_t _val = __riscv_vfmv_v_f_f32m1(0.f, vl); - + // Since we are iterating over input channels which are packed, // we need to handle each element in the pack. // However, the weight layout for packn is: @@ -131,16 +131,16 @@ static void deformableconv2d_packn(const std::vector& bottom_blobs, Mat& to // Wait, let's check the weight transformation in deformableconv2d_riscv.cpp // weight_data_tm.create(num_input * maxk * num_output / (elempack * out_elempack), (size_t)4u * elempack * out_elempack, elempack * out_elempack); // It seems the weight is packed as [packn_in * packn_out] - + // For each input channel pack (size packn), we have packn input values. // Each input value contributes to all packn output values. // So we have packn * packn weights for this block. - + // Let's look at x86 implementation again. // _val_channel0..3 corresponds to the 4 input values in the pack. // _conv_w0..3 corresponds to the weights for these input values. // Each _conv_w is a vector of size 4 (out_elempack), representing weights for one input channel to all 4 output channels. - + for (int k = 0; k < packn; k++) { float v_in = 0.f; @@ -148,14 +148,14 @@ static void deformableconv2d_packn(const std::vector& bottom_blobs, Mat& to if (v2_cond) v_in += data_im_ptr[v2_pos * packn + k] * w2; if (v3_cond) v_in += data_im_ptr[v3_pos * packn + k] * w3; if (v4_cond) v_in += data_im_ptr[v4_pos * packn + k] * w4; - + if (has_mask) v_in *= mask_; - + vfloat32m1_t _w = __riscv_vle32_v_f32m1(kptr + k * packn, vl); _sum = __riscv_vfmacc_vf_f32m1(_sum, v_in, _w, vl); } } - + kptr += packn * packn; } } @@ -166,4 +166,3 @@ static void deformableconv2d_packn(const std::vector& bottom_blobs, Mat& to } } } - diff --git a/src/layer/riscv/deformableconv2d_packnto1.h b/src/layer/riscv/deformableconv2d_packnto1.h index 577058ea1d41..d84ccd0a77c8 100644 --- a/src/layer/riscv/deformableconv2d_packnto1.h +++ b/src/layer/riscv/deformableconv2d_packnto1.h @@ -35,9 +35,9 @@ static void deformableconv2d_packnto1(const std::vector& bottom_blobs, Mat& float sum = 0.f; if (bias_data_ptr) sum = bias_data_ptr[oc]; - + vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl); - + for (int i = 0; i < kernel_h; i++) { for (int j = 0; j < kernel_w; j++) @@ -121,40 +121,38 @@ static void deformableconv2d_packnto1(const std::vector& bottom_blobs, Mat& for (int ic = 0; ic < inch; ic++) { const float* data_im_ptr = bottom_blob.channel(ic); - + if (cond) { vfloat32m1_t _v1 = v1_cond ? __riscv_vle32_v_f32m1(data_im_ptr + v1_pos * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl); vfloat32m1_t _v2 = v2_cond ? __riscv_vle32_v_f32m1(data_im_ptr + v2_pos * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl); vfloat32m1_t _v3 = v3_cond ? __riscv_vle32_v_f32m1(data_im_ptr + v3_pos * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl); vfloat32m1_t _v4 = v4_cond ? __riscv_vle32_v_f32m1(data_im_ptr + v4_pos * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl); - + vfloat32m1_t _val = __riscv_vfmv_v_f_f32m1(0.f, vl); _val = __riscv_vfmacc_vf_f32m1(_val, w1, _v1, vl); _val = __riscv_vfmacc_vf_f32m1(_val, w2, _v2, vl); _val = __riscv_vfmacc_vf_f32m1(_val, w3, _v3, vl); _val = __riscv_vfmacc_vf_f32m1(_val, w4, _v4, vl); - + if (has_mask) _val = __riscv_vfmul_vf_f32m1(_val, mask_, vl); - + vfloat32m1_t _w = __riscv_vle32_v_f32m1(kptr, vl); _sum = __riscv_vfmacc_vv_f32m1(_sum, _val, _w, vl); } - + kptr += packn; } } } - + vfloat32m1_t _v_sum = __riscv_vfredusum_vs_f32m1_f32m1(_sum, __riscv_vfmv_v_f_f32m1(0.f, vl), vl); sum += __riscv_vfmv_f_s_f32m1_f32(_v_sum); - + sum = activation_ss(sum, activation_type, activation_params); outptr[h_col * outw + w_col] = sum; } } } } - - From d94348c1af85e7098f0d7bd349ef90547a83f0f6 Mon Sep 17 00:00:00 2001 From: chenglimin Date: Thu, 12 Feb 2026 15:50:47 +0800 Subject: [PATCH 05/10] Update src/layer/riscv/deformableconv2d_packn.h Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/layer/riscv/deformableconv2d_packn.h | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/src/layer/riscv/deformableconv2d_packn.h b/src/layer/riscv/deformableconv2d_packn.h index 4b2b0e7f194e..3a03e9b0b552 100644 --- a/src/layer/riscv/deformableconv2d_packn.h +++ b/src/layer/riscv/deformableconv2d_packn.h @@ -124,23 +124,13 @@ static void deformableconv2d_packn(const std::vector& bottom_blobs, Mat& to { vfloat32m1_t _val = __riscv_vfmv_v_f_f32m1(0.f, vl); - // Since we are iterating over input channels which are packed, - // we need to handle each element in the pack. - // However, the weight layout for packn is: - // [outch/packn][kh][kw][inch/packn][packn_in][packn_out] - // Wait, let's check the weight transformation in deformableconv2d_riscv.cpp - // weight_data_tm.create(num_input * maxk * num_output / (elempack * out_elempack), (size_t)4u * elempack * out_elempack, elempack * out_elempack); - // It seems the weight is packed as [packn_in * packn_out] - - // For each input channel pack (size packn), we have packn input values. - // Each input value contributes to all packn output values. - // So we have packn * packn weights for this block. - - // Let's look at x86 implementation again. - // _val_channel0..3 corresponds to the 4 input values in the pack. - // _conv_w0..3 corresponds to the weights for these input values. - // Each _conv_w is a vector of size 4 (out_elempack), representing weights for one input channel to all 4 output channels. - + // Packed-weight memory layout for packn: + // For each output-channel pack, kernel position (kh, kw) and input-channel pack, + // the weights are stored as a contiguous block of size packn_in * packn_out + // (with packn_in == packn_out == packn here). Within this block, lane k in + // the input pack uses the vector loaded from kptr + k * packn, which contains + // the weights from that input lane to all packn output channels. After all + // packn input lanes are processed, kptr is advanced by packn * packn. for (int k = 0; k < packn; k++) { float v_in = 0.f; From 09bbbc98df5b4eb63f5db386ae194b1122e76b8c Mon Sep 17 00:00:00 2001 From: chenglimin Date: Thu, 12 Feb 2026 16:59:18 +0800 Subject: [PATCH 06/10] add scaler fallback --- src/layer/riscv/deformableconv2d_riscv.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/layer/riscv/deformableconv2d_riscv.cpp b/src/layer/riscv/deformableconv2d_riscv.cpp index e0975e1e6054..86e94b74c58e 100644 --- a/src/layer/riscv/deformableconv2d_riscv.cpp +++ b/src/layer/riscv/deformableconv2d_riscv.cpp @@ -437,7 +437,7 @@ int DeformableConv2D_riscv::forward(const std::vector& bottom_blobs, std::v { deformableconv2d_packnto1(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); } - +#endif // __riscv_vector if (elempack == 1 && out_elempack == 1) { std::vector bottom_blobs_unpacked = bottom_blobs; @@ -461,7 +461,7 @@ int DeformableConv2D_riscv::forward(const std::vector& bottom_blobs, std::v return DeformableConv2D::forward(bottom_blobs_unpacked, top_blobs, opt); } -#endif // __riscv_vector + } return 0; From feff32e4574ec6d8e52c05de467b2ba42839866b Mon Sep 17 00:00:00 2001 From: chenglimin <18213449+chenglimin@users.noreply.github.com> Date: Thu, 12 Feb 2026 09:05:37 +0000 Subject: [PATCH 07/10] apply code-format changes --- src/layer/riscv/deformableconv2d_riscv.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/layer/riscv/deformableconv2d_riscv.cpp b/src/layer/riscv/deformableconv2d_riscv.cpp index 86e94b74c58e..8e6f7aa2b697 100644 --- a/src/layer/riscv/deformableconv2d_riscv.cpp +++ b/src/layer/riscv/deformableconv2d_riscv.cpp @@ -461,7 +461,6 @@ int DeformableConv2D_riscv::forward(const std::vector& bottom_blobs, std::v return DeformableConv2D::forward(bottom_blobs_unpacked, top_blobs, opt); } - } return 0; From 3380ab2053caa3322f9ea77a41d8459a4b05d638 Mon Sep 17 00:00:00 2001 From: chenglimin Date: Thu, 12 Feb 2026 18:49:10 +0800 Subject: [PATCH 08/10] add always release weight_data --- src/layer/riscv/deformableconv2d_riscv.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/layer/riscv/deformableconv2d_riscv.cpp b/src/layer/riscv/deformableconv2d_riscv.cpp index 8e6f7aa2b697..3355238e72a1 100644 --- a/src/layer/riscv/deformableconv2d_riscv.cpp +++ b/src/layer/riscv/deformableconv2d_riscv.cpp @@ -168,8 +168,7 @@ int DeformableConv2D_riscv::create_pipeline(const Option& opt) if (opt.lightmode) { - if (!(elempack == 1 && out_elempack == 1)) - weight_data.release(); + weight_data.release(); } return 0; From f0def8687ec09a3becbba397060562debd63a458 Mon Sep 17 00:00:00 2001 From: nihui Date: Fri, 20 Feb 2026 21:32:52 +0800 Subject: [PATCH 09/10] add pack1 path --- src/layer/riscv/deformableconv2d_riscv.cpp | 120 ++++++++++++++++++--- 1 file changed, 104 insertions(+), 16 deletions(-) diff --git a/src/layer/riscv/deformableconv2d_riscv.cpp b/src/layer/riscv/deformableconv2d_riscv.cpp index 3355238e72a1..a0695abe86b3 100644 --- a/src/layer/riscv/deformableconv2d_riscv.cpp +++ b/src/layer/riscv/deformableconv2d_riscv.cpp @@ -439,26 +439,114 @@ int DeformableConv2D_riscv::forward(const std::vector& bottom_blobs, std::v #endif // __riscv_vector if (elempack == 1 && out_elempack == 1) { - std::vector bottom_blobs_unpacked = bottom_blobs; - Mat offset_unpacked; - if (offset.elempack != 1) - { - convert_packing(offset, offset_unpacked, 1, opt); - bottom_blobs_unpacked[1] = offset_unpacked; - } - - if (bottom_blobs.size() == 3) + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + const float* weight_ptr = weight_data_tm; + + // naive deformable conv + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) { - const Mat& mask = bottom_blobs[2]; - if (mask.elempack != 1) + for (int w_col = 0; w_col < outw; w_col++) { - Mat mask_unpacked; - convert_packing(mask, mask_unpacked, 1, opt); - bottom_blobs_unpacked[2] = mask_unpacked; + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < num_output; oc++) + { + float sum = 0.f; + if (bias_term) + sum = bias_data[oc]; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + int h_low = 0; + int w_low = 0; + int h_high = 0; + int w_high = 0; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + if (cond) + { + h_low = (int)floorf(h_im); + w_low = (int)floorf(w_im); + h_high = h_low + 1; + w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + + for (int ic = 0; ic < channels; ic++) + { + float val = 0.f; + if (cond) + { + float v1 = v1_cond ? bottom_blob.channel(ic).row(h_low)[w_low] : 0.f; + float v2 = v2_cond ? bottom_blob.channel(ic).row(h_low)[w_high] : 0.f; + float v3 = v3_cond ? bottom_blob.channel(ic).row(h_high)[w_low] : 0.f; + float v4 = v4_cond ? bottom_blob.channel(ic).row(h_high)[w_high] : 0.f; + val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; + } + sum += val * mask_ * weight_ptr[((oc * channels + ic) * kernel_h + i) * kernel_w + j]; + } + } + } + top_blob.channel(oc).row(h_col)[w_col] = activation_ss(sum, activation_type, activation_params); + } } } - - return DeformableConv2D::forward(bottom_blobs_unpacked, top_blobs, opt); } } From 5e1a76315208e9743083153e066d8b937cac36c3 Mon Sep 17 00:00:00 2001 From: nihui <171016+nihui@users.noreply.github.com> Date: Fri, 20 Feb 2026 13:34:37 +0000 Subject: [PATCH 10/10] apply code-format changes --- src/layer/riscv/deformableconv2d_riscv.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/layer/riscv/deformableconv2d_riscv.cpp b/src/layer/riscv/deformableconv2d_riscv.cpp index a0695abe86b3..1bf560afc428 100644 --- a/src/layer/riscv/deformableconv2d_riscv.cpp +++ b/src/layer/riscv/deformableconv2d_riscv.cpp @@ -442,7 +442,7 @@ int DeformableConv2D_riscv::forward(const std::vector& bottom_blobs, std::v const bool offset_not_pack = offset.elempack == 1; const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; const float* weight_ptr = weight_data_tm; - + // naive deformable conv #pragma omp parallel for num_threads(opt.num_threads) for (int h_col = 0; h_col < outh; h_col++) @@ -490,7 +490,7 @@ int DeformableConv2D_riscv::forward(const std::vector& bottom_blobs, std::v } const float h_im = h_in + i * dilation_h + offset_h; const float w_im = w_in + j * dilation_w + offset_w; - + // Bilinear const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; int h_low = 0; @@ -511,23 +511,23 @@ int DeformableConv2D_riscv::forward(const std::vector& bottom_blobs, std::v w_low = (int)floorf(w_im); h_high = h_low + 1; w_high = w_low + 1; - + float lh = h_im - h_low; float lw = w_im - w_low; float hh = 1 - lh; float hw = 1 - lw; - + v1_cond = (h_low >= 0 && w_low >= 0); v2_cond = (h_low >= 0 && w_high <= w - 1); v3_cond = (h_high <= h - 1 && w_low >= 0); v4_cond = (h_high <= h - 1 && w_high <= w - 1); - + w1 = hh * hw; w2 = hh * lw; w3 = lh * hw; w4 = lh * lw; } - + for (int ic = 0; ic < channels; ic++) { float val = 0.f;