From 917dfd5fb8b78da43ff7fc6e42632c946e65e21f Mon Sep 17 00:00:00 2001
From: chenglimin <clmngu@163.com>
Date: Tue, 10 Feb 2026 16:00:52 +0800
Subject: [PATCH 01/10] add the deformableconv2d operator for RVV backend, with
 12.94x-20.16x speedup over scaler implementation

---
 src/layer/riscv/deformableconv2d_pack1ton.h | 160 +++++++
 src/layer/riscv/deformableconv2d_packn.h    | 182 ++++++++
 src/layer/riscv/deformableconv2d_packnto1.h | 174 +++++++
 src/layer/riscv/deformableconv2d_riscv.cpp  | 482 ++++++++++++++++++++
 src/layer/riscv/deformableconv2d_riscv.h    |  43 ++
 5 files changed, 1041 insertions(+)
 create mode 100644 src/layer/riscv/deformableconv2d_pack1ton.h
 create mode 100644 src/layer/riscv/deformableconv2d_packn.h
 create mode 100644 src/layer/riscv/deformableconv2d_packnto1.h
 create mode 100644 src/layer/riscv/deformableconv2d_riscv.cpp
 create mode 100644 src/layer/riscv/deformableconv2d_riscv.h
diff --git a/src/layer/riscv/deformableconv2d_pack1ton.h b/src/layer/riscv/deformableconv2d_pack1ton.h
new file mode 100644
index 000000000000..dfa046c15e57
--- /dev/null
+++ b/src/layer/riscv/deformableconv2d_pack1ton.h
@@ -0,0 +1,160 @@
+
+
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void deformableconv2d_pack1ton(const std::vector<Mat>& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt)
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    const Mat& offset = bottom_blobs[1];
+    const bool has_mask = (bottom_blobs.size() == 3);
+    const bool offset_not_pack = offset.elempack == 1;
+    const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true;
+
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+
+    const float* bias_data_ptr = bias_data;
+    const int packn = csrr_vlenb() / 4;
+    const size_t vl = __riscv_vsetvl_e32m1(packn);
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int h_col = 0; h_col < outh; h_col++)
+    {
+        for (int w_col = 0; w_col < outw; w_col++)
+        {
+            int h_in = h_col * stride_h - pad_top;
+            int w_in = w_col * stride_w - pad_left;
+            for (int oc = 0; oc < outch; oc++)
+            {
+                const float* kptr = weight_data_packed.channel(oc);
+                float* outptr = top_blob.channel(oc);
+                vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl);
+                if (bias_data_ptr)
+                    _sum = __riscv_vle32_v_f32m1(bias_data_ptr + oc * packn, vl);
+                
+                for (int i = 0; i < kernel_h; i++)
+                {
+                    for (int j = 0; j < kernel_w; j++)
+                    {
+                        float offset_h = 0.f;
+                        float offset_w = 0.f;
+                        float mask_ = 1.f;
+                        if (offset_not_pack)
+                        {
+                            offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col];
+                            offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col];
+                        }
+                        else
+                        {
+                            const int y_c = (i * kernel_w + j) * 2;
+                            const int x_c = (i * kernel_w + j) * 2 + 1;
+                            offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack];
+                            offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack];
+                        }
+                        if (has_mask)
+                        {
+                            const Mat& mask = bottom_blobs[2];
+                            if (mask_not_pack)
+                            {
+                                mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col];
+                            }
+                            else
+                            {
+                                const int m_c = i * kernel_w + j;
+                                mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack];
+                            }
+                        }
+                        const float h_im = h_in + i * dilation_h + offset_h;
+                        const float w_im = w_in + j * dilation_w + offset_w;
+
+                        // Bilinear
+                        const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w;
+                        float w1 = 0.f;
+                        float w2 = 0.f;
+                        float w3 = 0.f;
+                        float w4 = 0.f;
+                        bool v1_cond = false;
+                        bool v2_cond = false;
+                        bool v3_cond = false;
+                        bool v4_cond = false;
+                        int v1_pos = 0;
+                        int v2_pos = 0;
+                        int v3_pos = 0;
+                        int v4_pos = 0;
+                        if (cond)
+                        {
+                            int h_low = (int)floorf(h_im);
+                            int w_low = (int)floorf(w_im);
+                            int h_high = h_low + 1;
+                            int w_high = w_low + 1;
+
+                            float lh = h_im - h_low;
+                            float lw = w_im - w_low;
+                            float hh = 1 - lh;
+                            float hw = 1 - lw;
+
+                            v1_cond = (h_low >= 0 && w_low >= 0);
+                            v2_cond = (h_low >= 0 && w_high <= w - 1);
+                            v3_cond = (h_high <= h - 1 && w_low >= 0);
+                            v4_cond = (h_high <= h - 1 && w_high <= w - 1);
+                            if (v1_cond)
+                                v1_pos = h_low * w + w_low;
+                            if (v2_cond)
+                                v2_pos = h_low * w + w_high;
+                            if (v3_cond)
+                                v3_pos = h_high * w + w_low;
+                            if (v4_cond)
+                                v4_pos = h_high * w + w_high;
+
+                            w1 = hh * hw;
+                            w2 = hh * lw;
+                            w3 = lh * hw;
+                            w4 = lh * lw;
+                        }
+
+                        for (int ic = 0; ic < inch; ic++)
+                        {
+                            const float* data_im_ptr = bottom_blob.channel(ic);
+                            
+                            if (cond)
+                            {
+                                float v_in = 0.f;
+                                if (v1_cond) v_in += data_im_ptr[v1_pos] * w1;
+                                if (v2_cond) v_in += data_im_ptr[v2_pos] * w2;
+                                if (v3_cond) v_in += data_im_ptr[v3_pos] * w3;
+                                if (v4_cond) v_in += data_im_ptr[v4_pos] * w4;
+                                
+                                if (has_mask) v_in *= mask_;
+                                
+                                vfloat32m1_t _w = __riscv_vle32_v_f32m1(kptr, vl);
+                                _sum = __riscv_vfmacc_vf_f32m1(_sum, v_in, _w, vl);
+                            }
+                            
+                            kptr += packn;
+                        }
+                    }
+                }
+                _sum = activation_ps(_sum, activation_type, activation_params, vl);
+                __riscv_vse32_v_f32m1(outptr + (h_col * outw + w_col) * packn, _sum, vl);
+            }
+        }
+    }
+}
+
diff --git a/src/layer/riscv/deformableconv2d_packn.h b/src/layer/riscv/deformableconv2d_packn.h
new file mode 100644
index 000000000000..c44e8bacc19e
--- /dev/null
+++ b/src/layer/riscv/deformableconv2d_packn.h
@@ -0,0 +1,182 @@
+
+
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void deformableconv2d_packn(const std::vector<Mat>& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt)
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    const Mat& offset = bottom_blobs[1];
+    const bool has_mask = (bottom_blobs.size() == 3);
+    const bool offset_not_pack = offset.elempack == 1;
+    const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true;
+
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+
+    const float* bias_data_ptr = bias_data;
+    const int packn = csrr_vlenb() / 4;
+    const size_t vl = __riscv_vsetvl_e32m1(packn);
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int h_col = 0; h_col < outh; h_col++)
+    {
+        for (int w_col = 0; w_col < outw; w_col++)
+        {
+            int h_in = h_col * stride_h - pad_top;
+            int w_in = w_col * stride_w - pad_left;
+            for (int oc = 0; oc < outch; oc++)
+            {
+                const float* kptr = weight_data_packed.channel(oc);
+                float* outptr = top_blob.channel(oc);
+                vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl);
+                if (bias_data_ptr)
+                    _sum = __riscv_vle32_v_f32m1(bias_data_ptr + oc * packn, vl);
+                
+                for (int i = 0; i < kernel_h; i++)
+                {
+                    for (int j = 0; j < kernel_w; j++)
+                    {
+                        float offset_h = 0.f;
+                        float offset_w = 0.f;
+                        float mask_ = 1.f;
+                        if (offset_not_pack)
+                        {
+                            offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col];
+                            offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col];
+                        }
+                        else
+                        {
+                            const int y_c = (i * kernel_w + j) * 2;
+                            const int x_c = (i * kernel_w + j) * 2 + 1;
+                            offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack];
+                            offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack];
+                        }
+                        if (has_mask)
+                        {
+                            const Mat& mask = bottom_blobs[2];
+                            if (mask_not_pack)
+                            {
+                                mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col];
+                            }
+                            else
+                            {
+                                const int m_c = i * kernel_w + j;
+                                mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack];
+                            }
+                        }
+                        const float h_im = h_in + i * dilation_h + offset_h;
+                        const float w_im = w_in + j * dilation_w + offset_w;
+
+                        // Bilinear
+                        const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w;
+                        float w1 = 0.f;
+                        float w2 = 0.f;
+                        float w3 = 0.f;
+                        float w4 = 0.f;
+                        bool v1_cond = false;
+                        bool v2_cond = false;
+                        bool v3_cond = false;
+                        bool v4_cond = false;
+                        int v1_pos = 0;
+                        int v2_pos = 0;
+                        int v3_pos = 0;
+                        int v4_pos = 0;
+                        if (cond)
+                        {
+                            int h_low = (int)floorf(h_im);
+                            int w_low = (int)floorf(w_im);
+                            int h_high = h_low + 1;
+                            int w_high = w_low + 1;
+
+                            float lh = h_im - h_low;
+                            float lw = w_im - w_low;
+                            float hh = 1 - lh;
+                            float hw = 1 - lw;
+
+                            v1_cond = (h_low >= 0 && w_low >= 0);
+                            v2_cond = (h_low >= 0 && w_high <= w - 1);
+                            v3_cond = (h_high <= h - 1 && w_low >= 0);
+                            v4_cond = (h_high <= h - 1 && w_high <= w - 1);
+                            if (v1_cond)
+                                v1_pos = h_low * w + w_low;
+                            if (v2_cond)
+                                v2_pos = h_low * w + w_high;
+                            if (v3_cond)
+                                v3_pos = h_high * w + w_low;
+                            if (v4_cond)
+                                v4_pos = h_high * w + w_high;
+
+                            w1 = hh * hw;
+                            w2 = hh * lw;
+                            w3 = lh * hw;
+                            w4 = lh * lw;
+                        }
+
+                        for (int ic = 0; ic < inch; ic++)
+                        {
+                            const float* data_im_ptr = bottom_blob.channel(ic);
+                            
+                            if (cond)
+                            {
+                                vfloat32m1_t _val = __riscv_vfmv_v_f_f32m1(0.f, vl);
+                                
+                                // Since we are iterating over input channels which are packed,
+                                // we need to handle each element in the pack.
+                                // However, the weight layout for packn is:
+                                // [outch/packn][kh][kw][inch/packn][packn_in][packn_out]
+                                // Wait, let's check the weight transformation in deformableconv2d_riscv.cpp
+                                // weight_data_tm.create(num_input * maxk * num_output / (elempack * out_elempack), (size_t)4u * elempack * out_elempack, elempack * out_elempack);
+                                // It seems the weight is packed as [packn_in * packn_out]
+                                
+                                // For each input channel pack (size packn), we have packn input values.
+                                // Each input value contributes to all packn output values.
+                                // So we have packn * packn weights for this block.
+                                
+                                // Let's look at x86 implementation again.
+                                // _val_channel0..3 corresponds to the 4 input values in the pack.
+                                // _conv_w0..3 corresponds to the weights for these input values.
+                                // Each _conv_w is a vector of size 4 (out_elempack), representing weights for one input channel to all 4 output channels.
+                                
+                                for (int k = 0; k < packn; k++)
+                                {
+                                    float v_in = 0.f;
+                                    if (v1_cond) v_in += data_im_ptr[v1_pos * packn + k] * w1;
+                                    if (v2_cond) v_in += data_im_ptr[v2_pos * packn + k] * w2;
+                                    if (v3_cond) v_in += data_im_ptr[v3_pos * packn + k] * w3;
+                                    if (v4_cond) v_in += data_im_ptr[v4_pos * packn + k] * w4;
+                                    
+                                    if (has_mask) v_in *= mask_;
+                                    
+                                    vfloat32m1_t _w = __riscv_vle32_v_f32m1(kptr + k * packn, vl);
+                                    _sum = __riscv_vfmacc_vf_f32m1(_sum, v_in, _w, vl);
+                                }
+                            }
+                            
+                            kptr += packn * packn;
+                        }
+                    }
+                }
+                _sum = activation_ps(_sum, activation_type, activation_params, vl);
+                __riscv_vse32_v_f32m1(outptr + (h_col * outw + w_col) * packn, _sum, vl);
+            }
+        }
+    }
+}
+
diff --git a/src/layer/riscv/deformableconv2d_packnto1.h b/src/layer/riscv/deformableconv2d_packnto1.h
new file mode 100644
index 000000000000..9c04b28c35ca
--- /dev/null
+++ b/src/layer/riscv/deformableconv2d_packnto1.h
@@ -0,0 +1,174 @@
+
+
+
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void deformableconv2d_packnto1(const std::vector<Mat>& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt)
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    const Mat& offset = bottom_blobs[1];
+    const bool has_mask = (bottom_blobs.size() == 3);
+    const bool offset_not_pack = offset.elempack == 1;
+    const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true;
+
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+
+    const float* bias_data_ptr = bias_data;
+    const int packn = csrr_vlenb() / 4;
+    const size_t vl = __riscv_vsetvl_e32m1(packn);
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int h_col = 0; h_col < outh; h_col++)
+    {
+        for (int w_col = 0; w_col < outw; w_col++)
+        {
+            int h_in = h_col * stride_h - pad_top;
+            int w_in = w_col * stride_w - pad_left;
+            for (int oc = 0; oc < outch; oc++)
+            {
+                const float* kptr = weight_data_packed.channel(oc);
+                float* outptr = top_blob.channel(oc);
+                float sum = 0.f;
+                if (bias_data_ptr)
+                    sum = bias_data_ptr[oc];
+                
+                vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl);
+                
+                for (int i = 0; i < kernel_h; i++)
+                {
+                    for (int j = 0; j < kernel_w; j++)
+                    {
+                        float offset_h = 0.f;
+                        float offset_w = 0.f;
+                        float mask_ = 1.f;
+                        if (offset_not_pack)
+                        {
+                            offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col];
+                            offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col];
+                        }
+                        else
+                        {
+                            const int y_c = (i * kernel_w + j) * 2;
+                            const int x_c = (i * kernel_w + j) * 2 + 1;
+                            offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack];
+                            offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack];
+                        }
+                        if (has_mask)
+                        {
+                            const Mat& mask = bottom_blobs[2];
+                            if (mask_not_pack)
+                            {
+                                mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col];
+                            }
+                            else
+                            {
+                                const int m_c = i * kernel_w + j;
+                                mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack];
+                            }
+                        }
+                        const float h_im = h_in + i * dilation_h + offset_h;
+                        const float w_im = w_in + j * dilation_w + offset_w;
+
+                        // Bilinear
+                        const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w;
+                        float w1 = 0.f;
+                        float w2 = 0.f;
+                        float w3 = 0.f;
+                        float w4 = 0.f;
+                        bool v1_cond = false;
+                        bool v2_cond = false;
+                        bool v3_cond = false;
+                        bool v4_cond = false;
+                        int v1_pos = 0;
+                        int v2_pos = 0;
+                        int v3_pos = 0;
+                        int v4_pos = 0;
+                        if (cond)
+                        {
+                            int h_low = (int)floorf(h_im);
+                            int w_low = (int)floorf(w_im);
+                            int h_high = h_low + 1;
+                            int w_high = w_low + 1;
+
+                            float lh = h_im - h_low;
+                            float lw = w_im - w_low;
+                            float hh = 1 - lh;
+                            float hw = 1 - lw;
+
+                            v1_cond = (h_low >= 0 && w_low >= 0);
+                            v2_cond = (h_low >= 0 && w_high <= w - 1);
+                            v3_cond = (h_high <= h - 1 && w_low >= 0);
+                            v4_cond = (h_high <= h - 1 && w_high <= w - 1);
+                            if (v1_cond)
+                                v1_pos = h_low * w + w_low;
+                            if (v2_cond)
+                                v2_pos = h_low * w + w_high;
+                            if (v3_cond)
+                                v3_pos = h_high * w + w_low;
+                            if (v4_cond)
+                                v4_pos = h_high * w + w_high;
+
+                            w1 = hh * hw;
+                            w2 = hh * lw;
+                            w3 = lh * hw;
+                            w4 = lh * lw;
+                        }
+
+                        for (int ic = 0; ic < inch; ic++)
+                        {
+                            const float* data_im_ptr = bottom_blob.channel(ic);
+                            
+                            if (cond)
+                            {
+                                vfloat32m1_t _v1 = v1_cond ? __riscv_vle32_v_f32m1(data_im_ptr + v1_pos * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl);
+                                vfloat32m1_t _v2 = v2_cond ? __riscv_vle32_v_f32m1(data_im_ptr + v2_pos * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl);
+                                vfloat32m1_t _v3 = v3_cond ? __riscv_vle32_v_f32m1(data_im_ptr + v3_pos * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl);
+                                vfloat32m1_t _v4 = v4_cond ? __riscv_vle32_v_f32m1(data_im_ptr + v4_pos * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl);
+                                
+                                vfloat32m1_t _val = __riscv_vfmv_v_f_f32m1(0.f, vl);
+                                _val = __riscv_vfmacc_vf_f32m1(_val, w1, _v1, vl);
+                                _val = __riscv_vfmacc_vf_f32m1(_val, w2, _v2, vl);
+                                _val = __riscv_vfmacc_vf_f32m1(_val, w3, _v3, vl);
+                                _val = __riscv_vfmacc_vf_f32m1(_val, w4, _v4, vl);
+                                
+                                if (has_mask)
+                                    _val = __riscv_vfmul_vf_f32m1(_val, mask_, vl);
+                                
+                                vfloat32m1_t _w = __riscv_vle32_v_f32m1(kptr, vl);
+                                _sum = __riscv_vfmacc_vv_f32m1(_sum, _val, _w, vl);
+                            }
+                            
+                            kptr += packn;
+                        }
+                    }
+                }
+                
+                vfloat32m1_t _v_sum = __riscv_vfredusum_vs_f32m1_f32m1(_sum, __riscv_vfmv_v_f_f32m1(0.f, vl), vl);
+                sum += __riscv_vfmv_f_s_f32m1_f32(_v_sum);
+                
+                sum = activation_ss(sum, activation_type, activation_params);
+                outptr[h_col * outw + w_col] = sum;
+            }
+        }
+    }
+}
+
+
diff --git a/src/layer/riscv/deformableconv2d_riscv.cpp b/src/layer/riscv/deformableconv2d_riscv.cpp
new file mode 100644
index 000000000000..d88eda0e4c7c
--- /dev/null
+++ b/src/layer/riscv/deformableconv2d_riscv.cpp
@@ -0,0 +1,482 @@
+
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "deformableconv2d_riscv.h"
+
+#if __riscv_vector
+#include <riscv_vector.h>
+#endif // __riscv_vector
+
+#include "riscv_activation.h"
+#include "riscv_usability.h"
+
+#include "benchmark.h"
+#include "cpu.h"
+#include "layer_type.h"
+
+namespace ncnn {
+
+#if __riscv_vector
+#include "deformableconv2d_packn.h"
+#include "deformableconv2d_pack1ton.h"
+#include "deformableconv2d_packnto1.h"
+#endif // __riscv_vector
+
+DeformableConv2D_riscv::DeformableConv2D_riscv()
+{
+#if __riscv_vector
+    support_packing = true;
+#endif // __riscv_vector
+
+    activation = 0;
+    gemm = 0;
+}
+
+static int _4Dindex_to_1Dindex(int i0, int i1, int i2, int i3, int l1, int l2, int l3)
+{
+    return ((i0 * l1 + i1) * l2 + i2) * l3 + i3;
+}
+
+static int _6Dindex_to_1Dindex(int i0, int i1, int i2, int i3, int i4, int i5, int l1, int l2, int l3, int l4, int l5)
+{
+    return ((((i0 * l1 + i1) * l2 + i2) * l3 + i3) * l4 + i4) * l5 + i5;
+}
+
+#if __riscv_vector
+static void deformableconv2d_transform_kernel_packed_riscv(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack)
+{
+    const int maxk = kernel_w * kernel_h;
+
+    // src = kw-kh-inch-outch
+    // dst = pb-pa-inch/pa-kw-kh-outch/pb
+    {
+        const float* weight_ptr = weight_data;
+
+        weight_data_tm.create(num_input * maxk * num_output / (elempack * out_elempack), (size_t)4u * elempack * out_elempack, elempack * out_elempack);
+        float* ptr = weight_data_tm;
+        for (int oc = 0; oc < num_output; oc++)
+        {
+            for (int i = 0; i < kernel_h; i++)
+            {
+                for (int j = 0; j < kernel_w; j++)
+                {
+                    for (int ic = 0; ic < num_input; ic++)
+                    {
+                        ptr[_6Dindex_to_1Dindex(oc / out_elempack, i, j, ic / elempack, ic % elempack, oc % out_elempack, kernel_h, kernel_w, num_input / elempack, elempack, out_elempack)] = weight_ptr[_4Dindex_to_1Dindex(oc, ic, i, j, num_input, kernel_h, kernel_w)];
+                    }
+                }
+            }
+        }
+        weight_data_tm = weight_data_tm.reshape(num_input / elempack, maxk, num_output / out_elempack);
+    }
+}
+#endif // __riscv_vector
+
+int DeformableConv2D_riscv::create_pipeline(const Option& opt)
+{
+    activation = create_activation_layer(activation_type, activation_params, opt);
+
+    int kernel_size = kernel_w * kernel_h;
+    int num_input = weight_data_size / kernel_size / num_output;
+
+    int elempack = 1;
+    int out_elempack = 1;
+
+#if __riscv_vector
+    if (opt.use_packing_layout)
+    {
+        const int packn = csrr_vlenb() / 4;
+        elempack = num_input % packn == 0 ? packn : 1;
+        out_elempack = num_output % packn == 0 ? packn : 1;
+    }
+#endif // __riscv_vector
+
+    if (opt.use_sgemm_convolution)
+    {
+        const int maxk = kernel_w * kernel_h;
+
+        gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
+
+        ncnn::ParamDict pd;
+        pd.set(2, 0);                   // transA
+        pd.set(3, 0);                   // transB
+        pd.set(4, 1);                   // constantA
+        pd.set(5, 0);                   // constantB
+        pd.set(6, 1);                   // constantC
+        pd.set(7, num_output);          // M = outch
+        pd.set(8, 0);                   // N = size
+        pd.set(9, maxk * num_input);    // K = maxk*inch
+        pd.set(10, bias_term ? 1 : -1); // constant_broadcast_type_C = (M)
+        pd.set(11, 1);                  // output_N1M
+
+        gemm->load_param(pd);
+
+        // maxk-inch-outch to pa-maxk-inch/pa-outch
+        Mat tmp;
+        {
+            Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);
+
+            tmp.create(maxk * num_input, num_output);
+
+            for (int q = 0; q < num_output; q += 1)
+            {
+                float* g00 = tmp.row(q);
+
+                for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
+                {
+                    for (int k = 0; k < maxk; k++)
+                    {
+                        for (int i = 0; i < elempack; i++)
+                        {
+                            const float* k00 = weight_data_r2.channel(q).row(p + i);
+                            g00[0] = k00[k];
+                            g00++;
+                        }
+                    }
+                }
+            }
+        }
+
+        if (bias_term)
+        {
+            ncnn::Mat weights[2];
+            weights[0] = tmp;
+            weights[1] = bias_data;
+
+            gemm->load_model(ModelBinFromMatArray(weights));
+        }
+        else
+        {
+            ncnn::Mat weights[1];
+            weights[0] = tmp;
+
+            gemm->load_model(ModelBinFromMatArray(weights));
+        }
+
+        gemm->create_pipeline(opt);
+    }
+    else if (elempack == 1 && out_elempack == 1)
+    {
+        weight_data_tm = weight_data;
+    }
+    else
+    {
+#if __riscv_vector
+        deformableconv2d_transform_kernel_packed_riscv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
+#endif // __riscv_vector
+    }
+
+    if (opt.lightmode)
+    {
+        if (!(elempack == 1 && out_elempack == 1))
+            weight_data.release();
+    }
+
+    return 0;
+}
+
+int DeformableConv2D_riscv::destroy_pipeline(const Option& opt)
+{
+    if (activation)
+    {
+        activation->destroy_pipeline(opt);
+        delete activation;
+        activation = 0;
+    }
+
+    if (gemm)
+    {
+        gemm->destroy_pipeline(opt);
+        delete gemm;
+        gemm = 0;
+    }
+
+    return 0;
+}
+
+int DeformableConv2D_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    const Mat& offset = bottom_blobs[1];
+    const bool has_mask = (bottom_blobs.size() == 3);
+    Mat& top_blob = top_blobs[0];
+
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
+    int elempack = bottom_blob.elempack;
+
+    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
+    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
+    const int outw = (w + pad_left + pad_right - kernel_extent_w) / stride_w + 1;
+    const int outh = (h + pad_top + pad_bottom - kernel_extent_h) / stride_h + 1;
+
+    int out_elempack = 1;
+#if __riscv_vector
+    if (opt.use_packing_layout)
+    {
+        const int packn = csrr_vlenb() / 4;
+        out_elempack = num_output % packn == 0 ? packn : 1;
+    }
+#endif // __riscv_vector
+    size_t out_elemsize = elemsize / elempack * out_elempack;
+
+    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
+
+    if (opt.use_sgemm_convolution)
+    {
+        const int size = outw * outh;
+        const int maxk = kernel_w * kernel_h;
+
+        Mat offset_unpacked;
+        convert_packing(offset, offset_unpacked, 1, opt);
+
+        Mat mask_unpacked;
+        if (has_mask)
+        {
+            const Mat& mask = bottom_blobs[2];
+            convert_packing(mask, mask_unpacked, 1, opt);
+        }
+
+        // im2col
+        Mat bottom_im2col(size, maxk * channels, elemsize, elempack, opt.workspace_allocator);
+
+#if __riscv_vector
+        const int packn = csrr_vlenb() / 4;
+        if (elempack == packn)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int p = 0; p < channels; p++)
+            {
+                const Mat img = bottom_blob.channel(p);
+                float* ptr = bottom_im2col.row(p * maxk);
+
+                for (int u = 0; u < kernel_h; u++)
+                {
+                    for (int v = 0; v < kernel_w; v++)
+                    {
+                        const Mat offset_h_k = offset_unpacked.channel((u * kernel_w + v) * 2);
+                        const Mat offset_w_k = offset_unpacked.channel((u * kernel_w + v) * 2 + 1);
+                        const Mat mask_k = has_mask ? mask_unpacked.channel(u * kernel_w + v) : 0;
+
+                        for (int i = 0; i < outh; i++)
+                        {
+                            for (int j = 0; j < outw; j++)
+                            {
+                                float offset_h = offset_h_k.row(i)[j];
+                                float offset_w = offset_w_k.row(i)[j];
+
+                                int h_in = i * stride_h - pad_top;
+                                int w_in = j * stride_w - pad_left;
+
+                                const float h_im = h_in + u * dilation_h + offset_h;
+                                const float w_im = w_in + v * dilation_w + offset_w;
+
+                                // Bilinear
+                                size_t vl = __riscv_vsetvl_e32m1(packn);
+                                vfloat32m1_t _val = __riscv_vfmv_v_f_f32m1(0.f, vl);
+                                bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w;
+                                if (cond)
+                                {
+                                    int h_low = floor(h_im);
+                                    int w_low = floor(w_im);
+                                    int h_high = h_low + 1;
+                                    int w_high = w_low + 1;
+
+                                    float lh = h_im - h_low;
+                                    float lw = w_im - w_low;
+                                    float hh = 1 - lh;
+                                    float hw = 1 - lw;
+
+                                    bool v1_cond = (h_low >= 0 && w_low >= 0);
+                                    bool v2_cond = (h_low >= 0 && w_high <= w - 1);
+                                    bool v3_cond = (h_high <= h - 1 && w_low >= 0);
+                                    bool v4_cond = (h_high <= h - 1 && w_high <= w - 1);
+
+                                    float w1 = hh * hw;
+                                    float w2 = hh * lw;
+                                    float w3 = lh * hw;
+                                    float w4 = lh * lw;
+
+                                    vfloat32m1_t _v1 = v1_cond ? __riscv_vle32_v_f32m1((const float*)img.row(h_low) + w_low * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl);
+                                    vfloat32m1_t _v2 = v2_cond ? __riscv_vle32_v_f32m1((const float*)img.row(h_low) + w_high * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl);
+                                    vfloat32m1_t _v3 = v3_cond ? __riscv_vle32_v_f32m1((const float*)img.row(h_high) + w_low * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl);
+                                    vfloat32m1_t _v4 = v4_cond ? __riscv_vle32_v_f32m1((const float*)img.row(h_high) + w_high * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl);
+
+                                    _val = __riscv_vfmacc_vf_f32m1(_val, w1, _v1, vl);
+                                    _val = __riscv_vfmacc_vf_f32m1(_val, w2, _v2, vl);
+                                    _val = __riscv_vfmacc_vf_f32m1(_val, w3, _v3, vl);
+                                    _val = __riscv_vfmacc_vf_f32m1(_val, w4, _v4, vl);
+
+                                    if (has_mask)
+                                        _val = __riscv_vfmul_vf_f32m1(_val, mask_k.row(i)[j], vl);
+                                }
+
+                                __riscv_vse32_v_f32m1(ptr, _val, vl);
+
+                                ptr += packn;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+#endif // __riscv_vector
+
+        if (elempack == 1)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int p = 0; p < channels; p++)
+            {
+                const Mat img = bottom_blob.channel(p);
+                float* ptr = bottom_im2col.row(p * maxk);
+
+                for (int u = 0; u < kernel_h; u++)
+                {
+                    for (int v = 0; v < kernel_w; v++)
+                    {
+                        const Mat offset_h_k = offset_unpacked.channel((u * kernel_w + v) * 2);
+                        const Mat offset_w_k = offset_unpacked.channel((u * kernel_w + v) * 2 + 1);
+                        const Mat mask_k = has_mask ? mask_unpacked.channel(u * kernel_w + v) : 0;
+
+                        for (int i = 0; i < outh; i++)
+                        {
+                            for (int j = 0; j < outw; j++)
+                            {
+                                float offset_h = offset_h_k.row(i)[j];
+                                float offset_w = offset_w_k.row(i)[j];
+
+                                int h_in = i * stride_h - pad_top;
+                                int w_in = j * stride_w - pad_left;
+
+                                const float h_im = h_in + u * dilation_h + offset_h;
+                                const float w_im = w_in + v * dilation_w + offset_w;
+
+                                // Bilinear
+                                float val = 0.f;
+                                bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w;
+                                if (cond)
+                                {
+                                    int h_low = (int)floorf(h_im);
+                                    int w_low = (int)floorf(w_im);
+                                    int h_high = h_low + 1;
+                                    int w_high = w_low + 1;
+
+                                    float lh = h_im - h_low;
+                                    float lw = w_im - w_low;
+                                    float hh = 1 - lh;
+                                    float hw = 1 - lw;
+
+                                    bool v1_cond = (h_low >= 0 && w_low >= 0);
+                                    bool v2_cond = (h_low >= 0 && w_high <= w - 1);
+                                    bool v3_cond = (h_high <= h - 1 && w_low >= 0);
+                                    bool v4_cond = (h_high <= h - 1 && w_high <= w - 1);
+
+                                    float w1 = hh * hw;
+                                    float w2 = hh * lw;
+                                    float w3 = lh * hw;
+                                    float w4 = lh * lw;
+
+                                    float v1 = v1_cond ? img.row(h_low)[w_low] : 0.f;
+                                    float v2 = v2_cond ? img.row(h_low)[w_high] : 0.f;
+                                    float v3 = v3_cond ? img.row(h_high)[w_low] : 0.f;
+                                    float v4 = v4_cond ? img.row(h_high)[w_high] : 0.f;
+                                    val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4;
+
+                                    if (has_mask)
+                                        val *= mask_k.row(i)[j];
+                                }
+
+                                ptr[0] = val;
+
+                                ptr += 1;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        // sgemm
+        {
+            top_blob.w = outw * outh;
+            top_blob.h = 1;
+        }
+        Option opt_b = opt;
+        opt_b.blob_allocator = opt.workspace_allocator;
+        gemm->forward(bottom_im2col, top_blob, opt_b);
+        {
+            top_blob.w = outw;
+            top_blob.h = outh;
+        }
+
+        if (activation)
+        {
+            activation->forward_inplace(top_blob, opt);
+        }
+    }
+    else
+    {
+#if __riscv_vector
+        const int packn = csrr_vlenb() / 4;
+
+        if (elempack == packn && out_elempack == packn)
+        {
+            deformableconv2d_packn(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt);
+        }
+
+        if (elempack == 1 && out_elempack == packn)
+        {
+            deformableconv2d_pack1ton(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt);
+        }
+
+        if (elempack == packn && out_elempack == 1)
+        {
+            deformableconv2d_packnto1(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt);
+        }
+
+        if (elempack == 1 && out_elempack == 1)
+        {
+            std::vector<Mat> bottom_blobs_unpacked = bottom_blobs;
+            Mat offset_unpacked;
+            if (offset.elempack != 1)
+            {
+                convert_packing(offset, offset_unpacked, 1, opt);
+                bottom_blobs_unpacked[1] = offset_unpacked;
+            }
+
+            if (bottom_blobs.size() == 3)
+            {
+                const Mat& mask = bottom_blobs[2];
+                if (mask.elempack != 1)
+                {
+                    Mat mask_unpacked;
+                    convert_packing(mask, mask_unpacked, 1, opt);
+                    bottom_blobs_unpacked[2] = mask_unpacked;
+                }
+            }
+
+            return DeformableConv2D::forward(bottom_blobs_unpacked, top_blobs, opt);
+        }
+#endif // __riscv_vector
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/riscv/deformableconv2d_riscv.h b/src/layer/riscv/deformableconv2d_riscv.h
new file mode 100644
index 000000000000..5d6a3b7765be
--- /dev/null
+++ b/src/layer/riscv/deformableconv2d_riscv.h
@@ -0,0 +1,43 @@
+
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_DEFORMABLECONV2D_RISCV_H
+#define LAYER_DEFORMABLECONV2D_RISCV_H
+
+#include "deformableconv2d.h"
+
+namespace ncnn {
+
+class DeformableConv2D_riscv : public DeformableConv2D
+{
+public:
+    DeformableConv2D_riscv();
+
+    virtual int create_pipeline(const Option& opt);
+    virtual int destroy_pipeline(const Option& opt);
+
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+
+public:
+    Layer* activation;
+
+    Mat weight_data_tm;
+
+    Layer* gemm;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_DEFORMABLECONV2D_RISCV_H

From 2934858505513cf601a43ca3568793802876be9e Mon Sep 17 00:00:00 2001
From: chenglimin <18213449+chenglimin@users.noreply.github.com>
Date: Tue, 10 Feb 2026 08:54:17 +0000
Subject: [PATCH 02/10] apply code-format changes

---
 src/layer/riscv/deformableconv2d_pack1ton.h | 11 +++++------
 src/layer/riscv/deformableconv2d_packn.h    | 19 +++++++++----------
 src/layer/riscv/deformableconv2d_packnto1.h | 21 +++++++++------------
 3 files changed, 23 insertions(+), 28 deletions(-)

diff --git a/src/layer/riscv/deformableconv2d_pack1ton.h b/src/layer/riscv/deformableconv2d_pack1ton.h
index dfa046c15e57..628a9f7ae470 100644
--- a/src/layer/riscv/deformableconv2d_pack1ton.h
+++ b/src/layer/riscv/deformableconv2d_pack1ton.h
@@ -48,7 +48,7 @@ static void deformableconv2d_pack1ton(const std::vector<Mat>& bottom_blobs, Mat&
                 vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl);
                 if (bias_data_ptr)
                     _sum = __riscv_vle32_v_f32m1(bias_data_ptr + oc * packn, vl);
-                
+
                 for (int i = 0; i < kernel_h; i++)
                 {
                     for (int j = 0; j < kernel_w; j++)
@@ -132,7 +132,7 @@ static void deformableconv2d_pack1ton(const std::vector<Mat>& bottom_blobs, Mat&
                         for (int ic = 0; ic < inch; ic++)
                         {
                             const float* data_im_ptr = bottom_blob.channel(ic);
-                            
+
                             if (cond)
                             {
                                 float v_in = 0.f;
@@ -140,13 +140,13 @@ static void deformableconv2d_pack1ton(const std::vector<Mat>& bottom_blobs, Mat&
                                 if (v2_cond) v_in += data_im_ptr[v2_pos] * w2;
                                 if (v3_cond) v_in += data_im_ptr[v3_pos] * w3;
                                 if (v4_cond) v_in += data_im_ptr[v4_pos] * w4;
-                                
+
                                 if (has_mask) v_in *= mask_;
-                                
+
                                 vfloat32m1_t _w = __riscv_vle32_v_f32m1(kptr, vl);
                                 _sum = __riscv_vfmacc_vf_f32m1(_sum, v_in, _w, vl);
                             }
-                            
+
                             kptr += packn;
                         }
                     }
@@ -157,4 +157,3 @@ static void deformableconv2d_pack1ton(const std::vector<Mat>& bottom_blobs, Mat&
         }
     }
 }
-
diff --git a/src/layer/riscv/deformableconv2d_packn.h b/src/layer/riscv/deformableconv2d_packn.h
index c44e8bacc19e..50346e6816bd 100644
--- a/src/layer/riscv/deformableconv2d_packn.h
+++ b/src/layer/riscv/deformableconv2d_packn.h
@@ -48,7 +48,7 @@ static void deformableconv2d_packn(const std::vector<Mat>& bottom_blobs, Mat& to
                 vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl);
                 if (bias_data_ptr)
                     _sum = __riscv_vle32_v_f32m1(bias_data_ptr + oc * packn, vl);
-                
+
                 for (int i = 0; i < kernel_h; i++)
                 {
                     for (int j = 0; j < kernel_w; j++)
@@ -132,11 +132,11 @@ static void deformableconv2d_packn(const std::vector<Mat>& bottom_blobs, Mat& to
                         for (int ic = 0; ic < inch; ic++)
                         {
                             const float* data_im_ptr = bottom_blob.channel(ic);
-                            
+
                             if (cond)
                             {
                                 vfloat32m1_t _val = __riscv_vfmv_v_f_f32m1(0.f, vl);
-                                
+
                                 // Since we are iterating over input channels which are packed,
                                 // we need to handle each element in the pack.
                                 // However, the weight layout for packn is:
@@ -144,16 +144,16 @@ static void deformableconv2d_packn(const std::vector<Mat>& bottom_blobs, Mat& to
                                 // Wait, let's check the weight transformation in deformableconv2d_riscv.cpp
                                 // weight_data_tm.create(num_input * maxk * num_output / (elempack * out_elempack), (size_t)4u * elempack * out_elempack, elempack * out_elempack);
                                 // It seems the weight is packed as [packn_in * packn_out]
-                                
+
                                 // For each input channel pack (size packn), we have packn input values.
                                 // Each input value contributes to all packn output values.
                                 // So we have packn * packn weights for this block.
-                                
+
                                 // Let's look at x86 implementation again.
                                 // _val_channel0..3 corresponds to the 4 input values in the pack.
                                 // _conv_w0..3 corresponds to the weights for these input values.
                                 // Each _conv_w is a vector of size 4 (out_elempack), representing weights for one input channel to all 4 output channels.
-                                
+
                                 for (int k = 0; k < packn; k++)
                                 {
                                     float v_in = 0.f;
@@ -161,14 +161,14 @@ static void deformableconv2d_packn(const std::vector<Mat>& bottom_blobs, Mat& to
                                     if (v2_cond) v_in += data_im_ptr[v2_pos * packn + k] * w2;
                                     if (v3_cond) v_in += data_im_ptr[v3_pos * packn + k] * w3;
                                     if (v4_cond) v_in += data_im_ptr[v4_pos * packn + k] * w4;
-                                    
+
                                     if (has_mask) v_in *= mask_;
-                                    
+
                                     vfloat32m1_t _w = __riscv_vle32_v_f32m1(kptr + k * packn, vl);
                                     _sum = __riscv_vfmacc_vf_f32m1(_sum, v_in, _w, vl);
                                 }
                             }
-                            
+
                             kptr += packn * packn;
                         }
                     }
@@ -179,4 +179,3 @@ static void deformableconv2d_packn(const std::vector<Mat>& bottom_blobs, Mat& to
         }
     }
 }
-
diff --git a/src/layer/riscv/deformableconv2d_packnto1.h b/src/layer/riscv/deformableconv2d_packnto1.h
index 9c04b28c35ca..2f5896c04f79 100644
--- a/src/layer/riscv/deformableconv2d_packnto1.h
+++ b/src/layer/riscv/deformableconv2d_packnto1.h
@@ -1,6 +1,5 @@
 
 
-
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
@@ -49,9 +48,9 @@ static void deformableconv2d_packnto1(const std::vector<Mat>& bottom_blobs, Mat&
                 float sum = 0.f;
                 if (bias_data_ptr)
                     sum = bias_data_ptr[oc];
-                
+
                 vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl);
-                
+
                 for (int i = 0; i < kernel_h; i++)
                 {
                     for (int j = 0; j < kernel_w; j++)
@@ -135,40 +134,38 @@ static void deformableconv2d_packnto1(const std::vector<Mat>& bottom_blobs, Mat&
                         for (int ic = 0; ic < inch; ic++)
                         {
                             const float* data_im_ptr = bottom_blob.channel(ic);
-                            
+
                             if (cond)
                             {
                                 vfloat32m1_t _v1 = v1_cond ? __riscv_vle32_v_f32m1(data_im_ptr + v1_pos * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl);
                                 vfloat32m1_t _v2 = v2_cond ? __riscv_vle32_v_f32m1(data_im_ptr + v2_pos * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl);
                                 vfloat32m1_t _v3 = v3_cond ? __riscv_vle32_v_f32m1(data_im_ptr + v3_pos * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl);
                                 vfloat32m1_t _v4 = v4_cond ? __riscv_vle32_v_f32m1(data_im_ptr + v4_pos * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl);
-                                
+
                                 vfloat32m1_t _val = __riscv_vfmv_v_f_f32m1(0.f, vl);
                                 _val = __riscv_vfmacc_vf_f32m1(_val, w1, _v1, vl);
                                 _val = __riscv_vfmacc_vf_f32m1(_val, w2, _v2, vl);
                                 _val = __riscv_vfmacc_vf_f32m1(_val, w3, _v3, vl);
                                 _val = __riscv_vfmacc_vf_f32m1(_val, w4, _v4, vl);
-                                
+
                                 if (has_mask)
                                     _val = __riscv_vfmul_vf_f32m1(_val, mask_, vl);
-                                
+
                                 vfloat32m1_t _w = __riscv_vle32_v_f32m1(kptr, vl);
                                 _sum = __riscv_vfmacc_vv_f32m1(_sum, _val, _w, vl);
                             }
-                            
+
                             kptr += packn;
                         }
                     }
                 }
-                
+
                 vfloat32m1_t _v_sum = __riscv_vfredusum_vs_f32m1_f32m1(_sum, __riscv_vfmv_v_f_f32m1(0.f, vl), vl);
                 sum += __riscv_vfmv_f_s_f32m1_f32(_v_sum);
-                
+
                 sum = activation_ss(sum, activation_type, activation_params);
                 outptr[h_col * outw + w_col] = sum;
             }
         }
     }
 }
-
-

From 8ceffbed9d2a5804eabd71b959a0be215fdeb377 Mon Sep 17 00:00:00 2001
From: chenglimin <clmngu@163.com>
Date: Tue, 10 Feb 2026 18:23:21 +0800
Subject: [PATCH 03/10] Fix deformableconv2d RVV implementation

---
 src/layer/riscv/deformableconv2d_pack1ton.h | 28 +++++-----------
 src/layer/riscv/deformableconv2d_packn.h    | 36 +++++++-------------
 src/layer/riscv/deformableconv2d_packnto1.h | 37 ++++++++-------------
 src/layer/riscv/deformableconv2d_riscv.cpp  | 16 ++-------
 src/layer/riscv/deformableconv2d_riscv.h    | 16 ++-------
 5 files changed, 37 insertions(+), 96 deletions(-)

diff --git a/src/layer/riscv/deformableconv2d_pack1ton.h b/src/layer/riscv/deformableconv2d_pack1ton.h
index 628a9f7ae470..945d2b5e39c0 100644
--- a/src/layer/riscv/deformableconv2d_pack1ton.h
+++ b/src/layer/riscv/deformableconv2d_pack1ton.h
@@ -1,18 +1,5 @@
-
-
-// Tencent is pleased to support the open source community by making ncnn available.
-//
-// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
-//
-// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
-// in compliance with the License. You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software distributed
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations under the License.
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
 
 static void deformableconv2d_pack1ton(const std::vector<Mat>& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt)
 {
@@ -48,7 +35,7 @@ static void deformableconv2d_pack1ton(const std::vector<Mat>& bottom_blobs, Mat&
                 vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl);
                 if (bias_data_ptr)
                     _sum = __riscv_vle32_v_f32m1(bias_data_ptr + oc * packn, vl);
-
+                
                 for (int i = 0; i < kernel_h; i++)
                 {
                     for (int j = 0; j < kernel_w; j++)
@@ -132,7 +119,7 @@ static void deformableconv2d_pack1ton(const std::vector<Mat>& bottom_blobs, Mat&
                         for (int ic = 0; ic < inch; ic++)
                         {
                             const float* data_im_ptr = bottom_blob.channel(ic);
-
+                            
                             if (cond)
                             {
                                 float v_in = 0.f;
@@ -140,13 +127,13 @@ static void deformableconv2d_pack1ton(const std::vector<Mat>& bottom_blobs, Mat&
                                 if (v2_cond) v_in += data_im_ptr[v2_pos] * w2;
                                 if (v3_cond) v_in += data_im_ptr[v3_pos] * w3;
                                 if (v4_cond) v_in += data_im_ptr[v4_pos] * w4;
-
+                                
                                 if (has_mask) v_in *= mask_;
-
+                                
                                 vfloat32m1_t _w = __riscv_vle32_v_f32m1(kptr, vl);
                                 _sum = __riscv_vfmacc_vf_f32m1(_sum, v_in, _w, vl);
                             }
-
+                            
                             kptr += packn;
                         }
                     }
@@ -157,3 +144,4 @@ static void deformableconv2d_pack1ton(const std::vector<Mat>& bottom_blobs, Mat&
         }
     }
 }
+
diff --git a/src/layer/riscv/deformableconv2d_packn.h b/src/layer/riscv/deformableconv2d_packn.h
index 50346e6816bd..61612a260aea 100644
--- a/src/layer/riscv/deformableconv2d_packn.h
+++ b/src/layer/riscv/deformableconv2d_packn.h
@@ -1,18 +1,5 @@
-
-
-// Tencent is pleased to support the open source community by making ncnn available.
-//
-// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
-//
-// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
-// in compliance with the License. You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software distributed
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations under the License.
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
 
 static void deformableconv2d_packn(const std::vector<Mat>& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt)
 {
@@ -48,7 +35,7 @@ static void deformableconv2d_packn(const std::vector<Mat>& bottom_blobs, Mat& to
                 vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl);
                 if (bias_data_ptr)
                     _sum = __riscv_vle32_v_f32m1(bias_data_ptr + oc * packn, vl);
-
+                
                 for (int i = 0; i < kernel_h; i++)
                 {
                     for (int j = 0; j < kernel_w; j++)
@@ -132,11 +119,11 @@ static void deformableconv2d_packn(const std::vector<Mat>& bottom_blobs, Mat& to
                         for (int ic = 0; ic < inch; ic++)
                         {
                             const float* data_im_ptr = bottom_blob.channel(ic);
-
+                            
                             if (cond)
                             {
                                 vfloat32m1_t _val = __riscv_vfmv_v_f_f32m1(0.f, vl);
-
+                                
                                 // Since we are iterating over input channels which are packed,
                                 // we need to handle each element in the pack.
                                 // However, the weight layout for packn is:
@@ -144,16 +131,16 @@ static void deformableconv2d_packn(const std::vector<Mat>& bottom_blobs, Mat& to
                                 // Wait, let's check the weight transformation in deformableconv2d_riscv.cpp
                                 // weight_data_tm.create(num_input * maxk * num_output / (elempack * out_elempack), (size_t)4u * elempack * out_elempack, elempack * out_elempack);
                                 // It seems the weight is packed as [packn_in * packn_out]
-
+                                
                                 // For each input channel pack (size packn), we have packn input values.
                                 // Each input value contributes to all packn output values.
                                 // So we have packn * packn weights for this block.
-
+                                
                                 // Let's look at x86 implementation again.
                                 // _val_channel0..3 corresponds to the 4 input values in the pack.
                                 // _conv_w0..3 corresponds to the weights for these input values.
                                 // Each _conv_w is a vector of size 4 (out_elempack), representing weights for one input channel to all 4 output channels.
-
+                                
                                 for (int k = 0; k < packn; k++)
                                 {
                                     float v_in = 0.f;
@@ -161,14 +148,14 @@ static void deformableconv2d_packn(const std::vector<Mat>& bottom_blobs, Mat& to
                                     if (v2_cond) v_in += data_im_ptr[v2_pos * packn + k] * w2;
                                     if (v3_cond) v_in += data_im_ptr[v3_pos * packn + k] * w3;
                                     if (v4_cond) v_in += data_im_ptr[v4_pos * packn + k] * w4;
-
+                                    
                                     if (has_mask) v_in *= mask_;
-
+                                    
                                     vfloat32m1_t _w = __riscv_vle32_v_f32m1(kptr + k * packn, vl);
                                     _sum = __riscv_vfmacc_vf_f32m1(_sum, v_in, _w, vl);
                                 }
                             }
-
+                            
                             kptr += packn * packn;
                         }
                     }
@@ -179,3 +166,4 @@ static void deformableconv2d_packn(const std::vector<Mat>& bottom_blobs, Mat& to
         }
     }
 }
+
diff --git a/src/layer/riscv/deformableconv2d_packnto1.h b/src/layer/riscv/deformableconv2d_packnto1.h
index 2f5896c04f79..577058ea1d41 100644
--- a/src/layer/riscv/deformableconv2d_packnto1.h
+++ b/src/layer/riscv/deformableconv2d_packnto1.h
@@ -1,18 +1,5 @@
-
-
-// Tencent is pleased to support the open source community by making ncnn available.
-//
-// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
-//
-// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
-// in compliance with the License. You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software distributed
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations under the License.
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
 
 static void deformableconv2d_packnto1(const std::vector<Mat>& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt)
 {
@@ -48,9 +35,9 @@ static void deformableconv2d_packnto1(const std::vector<Mat>& bottom_blobs, Mat&
                 float sum = 0.f;
                 if (bias_data_ptr)
                     sum = bias_data_ptr[oc];
-
+                
                 vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl);
-
+                
                 for (int i = 0; i < kernel_h; i++)
                 {
                     for (int j = 0; j < kernel_w; j++)
@@ -134,38 +121,40 @@ static void deformableconv2d_packnto1(const std::vector<Mat>& bottom_blobs, Mat&
                         for (int ic = 0; ic < inch; ic++)
                         {
                             const float* data_im_ptr = bottom_blob.channel(ic);
-
+                            
                             if (cond)
                             {
                                 vfloat32m1_t _v1 = v1_cond ? __riscv_vle32_v_f32m1(data_im_ptr + v1_pos * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl);
                                 vfloat32m1_t _v2 = v2_cond ? __riscv_vle32_v_f32m1(data_im_ptr + v2_pos * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl);
                                 vfloat32m1_t _v3 = v3_cond ? __riscv_vle32_v_f32m1(data_im_ptr + v3_pos * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl);
                                 vfloat32m1_t _v4 = v4_cond ? __riscv_vle32_v_f32m1(data_im_ptr + v4_pos * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl);
-
+                                
                                 vfloat32m1_t _val = __riscv_vfmv_v_f_f32m1(0.f, vl);
                                 _val = __riscv_vfmacc_vf_f32m1(_val, w1, _v1, vl);
                                 _val = __riscv_vfmacc_vf_f32m1(_val, w2, _v2, vl);
                                 _val = __riscv_vfmacc_vf_f32m1(_val, w3, _v3, vl);
                                 _val = __riscv_vfmacc_vf_f32m1(_val, w4, _v4, vl);
-
+                                
                                 if (has_mask)
                                     _val = __riscv_vfmul_vf_f32m1(_val, mask_, vl);
-
+                                
                                 vfloat32m1_t _w = __riscv_vle32_v_f32m1(kptr, vl);
                                 _sum = __riscv_vfmacc_vv_f32m1(_sum, _val, _w, vl);
                             }
-
+                            
                             kptr += packn;
                         }
                     }
                 }
-
+                
                 vfloat32m1_t _v_sum = __riscv_vfredusum_vs_f32m1_f32m1(_sum, __riscv_vfmv_v_f_f32m1(0.f, vl), vl);
                 sum += __riscv_vfmv_f_s_f32m1_f32(_v_sum);
-
+                
                 sum = activation_ss(sum, activation_type, activation_params);
                 outptr[h_col * outw + w_col] = sum;
             }
         }
     }
 }
+
+
diff --git a/src/layer/riscv/deformableconv2d_riscv.cpp b/src/layer/riscv/deformableconv2d_riscv.cpp
index d88eda0e4c7c..e0975e1e6054 100644
--- a/src/layer/riscv/deformableconv2d_riscv.cpp
+++ b/src/layer/riscv/deformableconv2d_riscv.cpp
@@ -1,17 +1,5 @@
-
-// Tencent is pleased to support the open source community by making ncnn available.
-//
-// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
-//
-// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
-// in compliance with the License. You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software distributed
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations under the License.
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
 
 #include "deformableconv2d_riscv.h"
 
diff --git a/src/layer/riscv/deformableconv2d_riscv.h b/src/layer/riscv/deformableconv2d_riscv.h
index 5d6a3b7765be..d538e5097075 100644
--- a/src/layer/riscv/deformableconv2d_riscv.h
+++ b/src/layer/riscv/deformableconv2d_riscv.h
@@ -1,17 +1,5 @@
-
-// Tencent is pleased to support the open source community by making ncnn available.
-//
-// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
-//
-// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
-// in compliance with the License. You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software distributed
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations under the License.
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
 
 #ifndef LAYER_DEFORMABLECONV2D_RISCV_H
 #define LAYER_DEFORMABLECONV2D_RISCV_H

From f8dd5ab068c17121f8f8e93817c505a48c84a153 Mon Sep 17 00:00:00 2001
From: chenglimin <18213449+chenglimin@users.noreply.github.com>
Date: Tue, 10 Feb 2026 10:27:31 +0000
Subject: [PATCH 04/10] apply code-format changes

---
 src/layer/riscv/deformableconv2d_pack1ton.h | 11 +++++------
 src/layer/riscv/deformableconv2d_packn.h    | 19 +++++++++----------
 src/layer/riscv/deformableconv2d_packnto1.h | 20 +++++++++-----------
 3 files changed, 23 insertions(+), 27 deletions(-)

diff --git a/src/layer/riscv/deformableconv2d_pack1ton.h b/src/layer/riscv/deformableconv2d_pack1ton.h
index 945d2b5e39c0..e5ba658c9551 100644
--- a/src/layer/riscv/deformableconv2d_pack1ton.h
+++ b/src/layer/riscv/deformableconv2d_pack1ton.h
@@ -35,7 +35,7 @@ static void deformableconv2d_pack1ton(const std::vector<Mat>& bottom_blobs, Mat&
                 vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl);
                 if (bias_data_ptr)
                     _sum = __riscv_vle32_v_f32m1(bias_data_ptr + oc * packn, vl);
-                
+
                 for (int i = 0; i < kernel_h; i++)
                 {
                     for (int j = 0; j < kernel_w; j++)
@@ -119,7 +119,7 @@ static void deformableconv2d_pack1ton(const std::vector<Mat>& bottom_blobs, Mat&
                         for (int ic = 0; ic < inch; ic++)
                         {
                             const float* data_im_ptr = bottom_blob.channel(ic);
-                            
+
                             if (cond)
                             {
                                 float v_in = 0.f;
@@ -127,13 +127,13 @@ static void deformableconv2d_pack1ton(const std::vector<Mat>& bottom_blobs, Mat&
                                 if (v2_cond) v_in += data_im_ptr[v2_pos] * w2;
                                 if (v3_cond) v_in += data_im_ptr[v3_pos] * w3;
                                 if (v4_cond) v_in += data_im_ptr[v4_pos] * w4;
-                                
+
                                 if (has_mask) v_in *= mask_;
-                                
+
                                 vfloat32m1_t _w = __riscv_vle32_v_f32m1(kptr, vl);
                                 _sum = __riscv_vfmacc_vf_f32m1(_sum, v_in, _w, vl);
                             }
-                            
+
                             kptr += packn;
                         }
                     }
@@ -144,4 +144,3 @@ static void deformableconv2d_pack1ton(const std::vector<Mat>& bottom_blobs, Mat&
         }
     }
 }
-
diff --git a/src/layer/riscv/deformableconv2d_packn.h b/src/layer/riscv/deformableconv2d_packn.h
index 61612a260aea..4b2b0e7f194e 100644
--- a/src/layer/riscv/deformableconv2d_packn.h
+++ b/src/layer/riscv/deformableconv2d_packn.h
@@ -35,7 +35,7 @@ static void deformableconv2d_packn(const std::vector<Mat>& bottom_blobs, Mat& to
                 vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl);
                 if (bias_data_ptr)
                     _sum = __riscv_vle32_v_f32m1(bias_data_ptr + oc * packn, vl);
-                
+
                 for (int i = 0; i < kernel_h; i++)
                 {
                     for (int j = 0; j < kernel_w; j++)
@@ -119,11 +119,11 @@ static void deformableconv2d_packn(const std::vector<Mat>& bottom_blobs, Mat& to
                         for (int ic = 0; ic < inch; ic++)
                         {
                             const float* data_im_ptr = bottom_blob.channel(ic);
-                            
+
                             if (cond)
                             {
                                 vfloat32m1_t _val = __riscv_vfmv_v_f_f32m1(0.f, vl);
-                                
+
                                 // Since we are iterating over input channels which are packed,
                                 // we need to handle each element in the pack.
                                 // However, the weight layout for packn is:
@@ -131,16 +131,16 @@ static void deformableconv2d_packn(const std::vector<Mat>& bottom_blobs, Mat& to
                                 // Wait, let's check the weight transformation in deformableconv2d_riscv.cpp
                                 // weight_data_tm.create(num_input * maxk * num_output / (elempack * out_elempack), (size_t)4u * elempack * out_elempack, elempack * out_elempack);
                                 // It seems the weight is packed as [packn_in * packn_out]
-                                
+
                                 // For each input channel pack (size packn), we have packn input values.
                                 // Each input value contributes to all packn output values.
                                 // So we have packn * packn weights for this block.
-                                
+
                                 // Let's look at x86 implementation again.
                                 // _val_channel0..3 corresponds to the 4 input values in the pack.
                                 // _conv_w0..3 corresponds to the weights for these input values.
                                 // Each _conv_w is a vector of size 4 (out_elempack), representing weights for one input channel to all 4 output channels.
-                                
+
                                 for (int k = 0; k < packn; k++)
                                 {
                                     float v_in = 0.f;
@@ -148,14 +148,14 @@ static void deformableconv2d_packn(const std::vector<Mat>& bottom_blobs, Mat& to
                                     if (v2_cond) v_in += data_im_ptr[v2_pos * packn + k] * w2;
                                     if (v3_cond) v_in += data_im_ptr[v3_pos * packn + k] * w3;
                                     if (v4_cond) v_in += data_im_ptr[v4_pos * packn + k] * w4;
-                                    
+
                                     if (has_mask) v_in *= mask_;
-                                    
+
                                     vfloat32m1_t _w = __riscv_vle32_v_f32m1(kptr + k * packn, vl);
                                     _sum = __riscv_vfmacc_vf_f32m1(_sum, v_in, _w, vl);
                                 }
                             }
-                            
+
                             kptr += packn * packn;
                         }
                     }
@@ -166,4 +166,3 @@ static void deformableconv2d_packn(const std::vector<Mat>& bottom_blobs, Mat& to
         }
     }
 }
-
diff --git a/src/layer/riscv/deformableconv2d_packnto1.h b/src/layer/riscv/deformableconv2d_packnto1.h
index 577058ea1d41..d84ccd0a77c8 100644
--- a/src/layer/riscv/deformableconv2d_packnto1.h
+++ b/src/layer/riscv/deformableconv2d_packnto1.h
@@ -35,9 +35,9 @@ static void deformableconv2d_packnto1(const std::vector<Mat>& bottom_blobs, Mat&
                 float sum = 0.f;
                 if (bias_data_ptr)
                     sum = bias_data_ptr[oc];
-                
+
                 vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl);
-                
+
                 for (int i = 0; i < kernel_h; i++)
                 {
                     for (int j = 0; j < kernel_w; j++)
@@ -121,40 +121,38 @@ static void deformableconv2d_packnto1(const std::vector<Mat>& bottom_blobs, Mat&
                         for (int ic = 0; ic < inch; ic++)
                         {
                             const float* data_im_ptr = bottom_blob.channel(ic);
-                            
+
                             if (cond)
                             {
                                 vfloat32m1_t _v1 = v1_cond ? __riscv_vle32_v_f32m1(data_im_ptr + v1_pos * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl);
                                 vfloat32m1_t _v2 = v2_cond ? __riscv_vle32_v_f32m1(data_im_ptr + v2_pos * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl);
                                 vfloat32m1_t _v3 = v3_cond ? __riscv_vle32_v_f32m1(data_im_ptr + v3_pos * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl);
                                 vfloat32m1_t _v4 = v4_cond ? __riscv_vle32_v_f32m1(data_im_ptr + v4_pos * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl);
-                                
+
                                 vfloat32m1_t _val = __riscv_vfmv_v_f_f32m1(0.f, vl);
                                 _val = __riscv_vfmacc_vf_f32m1(_val, w1, _v1, vl);
                                 _val = __riscv_vfmacc_vf_f32m1(_val, w2, _v2, vl);
                                 _val = __riscv_vfmacc_vf_f32m1(_val, w3, _v3, vl);
                                 _val = __riscv_vfmacc_vf_f32m1(_val, w4, _v4, vl);
-                                
+
                                 if (has_mask)
                                     _val = __riscv_vfmul_vf_f32m1(_val, mask_, vl);
-                                
+
                                 vfloat32m1_t _w = __riscv_vle32_v_f32m1(kptr, vl);
                                 _sum = __riscv_vfmacc_vv_f32m1(_sum, _val, _w, vl);
                             }
-                            
+
                             kptr += packn;
                         }
                     }
                 }
-                
+
                 vfloat32m1_t _v_sum = __riscv_vfredusum_vs_f32m1_f32m1(_sum, __riscv_vfmv_v_f_f32m1(0.f, vl), vl);
                 sum += __riscv_vfmv_f_s_f32m1_f32(_v_sum);
-                
+
                 sum = activation_ss(sum, activation_type, activation_params);
                 outptr[h_col * outw + w_col] = sum;
             }
         }
     }
 }
-
-

From d94348c1af85e7098f0d7bd349ef90547a83f0f6 Mon Sep 17 00:00:00 2001
From: chenglimin <clmngu@163.com>
Date: Thu, 12 Feb 2026 15:50:47 +0800
Subject: [PATCH 05/10] Update src/layer/riscv/deformableconv2d_packn.h

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 src/layer/riscv/deformableconv2d_packn.h | 24 +++++++-----------------
 1 file changed, 7 insertions(+), 17 deletions(-)

diff --git a/src/layer/riscv/deformableconv2d_packn.h b/src/layer/riscv/deformableconv2d_packn.h
index 4b2b0e7f194e..3a03e9b0b552 100644
--- a/src/layer/riscv/deformableconv2d_packn.h
+++ b/src/layer/riscv/deformableconv2d_packn.h
@@ -124,23 +124,13 @@ static void deformableconv2d_packn(const std::vector<Mat>& bottom_blobs, Mat& to
                             {
                                 vfloat32m1_t _val = __riscv_vfmv_v_f_f32m1(0.f, vl);
 
-                                // Since we are iterating over input channels which are packed,
-                                // we need to handle each element in the pack.
-                                // However, the weight layout for packn is:
-                                // [outch/packn][kh][kw][inch/packn][packn_in][packn_out]
-                                // Wait, let's check the weight transformation in deformableconv2d_riscv.cpp
-                                // weight_data_tm.create(num_input * maxk * num_output / (elempack * out_elempack), (size_t)4u * elempack * out_elempack, elempack * out_elempack);
-                                // It seems the weight is packed as [packn_in * packn_out]
-
-                                // For each input channel pack (size packn), we have packn input values.
-                                // Each input value contributes to all packn output values.
-                                // So we have packn * packn weights for this block.
-
-                                // Let's look at x86 implementation again.
-                                // _val_channel0..3 corresponds to the 4 input values in the pack.
-                                // _conv_w0..3 corresponds to the weights for these input values.
-                                // Each _conv_w is a vector of size 4 (out_elempack), representing weights for one input channel to all 4 output channels.
-
+                                // Packed-weight memory layout for packn:
+                                // For each output-channel pack, kernel position (kh, kw) and input-channel pack,
+                                // the weights are stored as a contiguous block of size packn_in * packn_out
+                                // (with packn_in == packn_out == packn here). Within this block, lane k in
+                                // the input pack uses the vector loaded from kptr + k * packn, which contains
+                                // the weights from that input lane to all packn output channels. After all
+                                // packn input lanes are processed, kptr is advanced by packn * packn.
                                 for (int k = 0; k < packn; k++)
                                 {
                                     float v_in = 0.f;

From 09bbbc98df5b4eb63f5db386ae194b1122e76b8c Mon Sep 17 00:00:00 2001
From: chenglimin <clmngu@163.com>
Date: Thu, 12 Feb 2026 16:59:18 +0800
Subject: [PATCH 06/10] add scaler fallback

---
 src/layer/riscv/deformableconv2d_riscv.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/layer/riscv/deformableconv2d_riscv.cpp b/src/layer/riscv/deformableconv2d_riscv.cpp
index e0975e1e6054..86e94b74c58e 100644
--- a/src/layer/riscv/deformableconv2d_riscv.cpp
+++ b/src/layer/riscv/deformableconv2d_riscv.cpp
@@ -437,7 +437,7 @@ int DeformableConv2D_riscv::forward(const std::vector<Mat>& bottom_blobs, std::v
         {
             deformableconv2d_packnto1(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt);
         }
-
+#endif // __riscv_vector
         if (elempack == 1 && out_elempack == 1)
         {
             std::vector<Mat> bottom_blobs_unpacked = bottom_blobs;
@@ -461,7 +461,7 @@ int DeformableConv2D_riscv::forward(const std::vector<Mat>& bottom_blobs, std::v
 
             return DeformableConv2D::forward(bottom_blobs_unpacked, top_blobs, opt);
         }
-#endif // __riscv_vector
+
     }
 
     return 0;

From feff32e4574ec6d8e52c05de467b2ba42839866b Mon Sep 17 00:00:00 2001
From: chenglimin <18213449+chenglimin@users.noreply.github.com>
Date: Thu, 12 Feb 2026 09:05:37 +0000
Subject: [PATCH 07/10] apply code-format changes

---
 src/layer/riscv/deformableconv2d_riscv.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/layer/riscv/deformableconv2d_riscv.cpp b/src/layer/riscv/deformableconv2d_riscv.cpp
index 86e94b74c58e..8e6f7aa2b697 100644
--- a/src/layer/riscv/deformableconv2d_riscv.cpp
+++ b/src/layer/riscv/deformableconv2d_riscv.cpp
@@ -461,7 +461,6 @@ int DeformableConv2D_riscv::forward(const std::vector<Mat>& bottom_blobs, std::v
 
             return DeformableConv2D::forward(bottom_blobs_unpacked, top_blobs, opt);
         }
-
     }
 
     return 0;

From 3380ab2053caa3322f9ea77a41d8459a4b05d638 Mon Sep 17 00:00:00 2001
From: chenglimin <clmngu@163.com>
Date: Thu, 12 Feb 2026 18:49:10 +0800
Subject: [PATCH 08/10] add always release weight_data

---
 src/layer/riscv/deformableconv2d_riscv.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/layer/riscv/deformableconv2d_riscv.cpp b/src/layer/riscv/deformableconv2d_riscv.cpp
index 8e6f7aa2b697..3355238e72a1 100644
--- a/src/layer/riscv/deformableconv2d_riscv.cpp
+++ b/src/layer/riscv/deformableconv2d_riscv.cpp
@@ -168,8 +168,7 @@ int DeformableConv2D_riscv::create_pipeline(const Option& opt)
 
     if (opt.lightmode)
     {
-        if (!(elempack == 1 && out_elempack == 1))
-            weight_data.release();
+        weight_data.release();
     }
 
     return 0;

From f0def8687ec09a3becbba397060562debd63a458 Mon Sep 17 00:00:00 2001
From: nihui <shuizhuyuanluo@126.com>
Date: Fri, 20 Feb 2026 21:32:52 +0800
Subject: [PATCH 09/10] add pack1 path

---
 src/layer/riscv/deformableconv2d_riscv.cpp | 120 ++++++++++++++++++---
 1 file changed, 104 insertions(+), 16 deletions(-)

diff --git a/src/layer/riscv/deformableconv2d_riscv.cpp b/src/layer/riscv/deformableconv2d_riscv.cpp
index 3355238e72a1..a0695abe86b3 100644
--- a/src/layer/riscv/deformableconv2d_riscv.cpp
+++ b/src/layer/riscv/deformableconv2d_riscv.cpp
@@ -439,26 +439,114 @@ int DeformableConv2D_riscv::forward(const std::vector<Mat>& bottom_blobs, std::v
 #endif // __riscv_vector
         if (elempack == 1 && out_elempack == 1)
         {
-            std::vector<Mat> bottom_blobs_unpacked = bottom_blobs;
-            Mat offset_unpacked;
-            if (offset.elempack != 1)
-            {
-                convert_packing(offset, offset_unpacked, 1, opt);
-                bottom_blobs_unpacked[1] = offset_unpacked;
-            }
-
-            if (bottom_blobs.size() == 3)
+            const bool offset_not_pack = offset.elempack == 1;
+            const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true;
+            const float* weight_ptr = weight_data_tm;
+    
+            // naive deformable conv
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int h_col = 0; h_col < outh; h_col++)
             {
-                const Mat& mask = bottom_blobs[2];
-                if (mask.elempack != 1)
+                for (int w_col = 0; w_col < outw; w_col++)
                 {
-                    Mat mask_unpacked;
-                    convert_packing(mask, mask_unpacked, 1, opt);
-                    bottom_blobs_unpacked[2] = mask_unpacked;
+                    int h_in = h_col * stride_h - pad_top;
+                    int w_in = w_col * stride_w - pad_left;
+                    for (int oc = 0; oc < num_output; oc++)
+                    {
+                        float sum = 0.f;
+                        if (bias_term)
+                            sum = bias_data[oc];
+                        for (int i = 0; i < kernel_h; i++)
+                        {
+                            for (int j = 0; j < kernel_w; j++)
+                            {
+                                float offset_h = 0.f;
+                                float offset_w = 0.f;
+                                float mask_ = 1.f;
+                                if (offset_not_pack)
+                                {
+                                    offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col];
+                                    offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col];
+                                }
+                                else
+                                {
+                                    const int y_c = (i * kernel_w + j) * 2;
+                                    const int x_c = (i * kernel_w + j) * 2 + 1;
+                                    offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack];
+                                    offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack];
+                                }
+                                if (has_mask)
+                                {
+                                    const Mat& mask = bottom_blobs[2];
+                                    if (mask_not_pack)
+                                    {
+                                        mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col];
+                                    }
+                                    else
+                                    {
+                                        const int m_c = i * kernel_w + j;
+                                        mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack];
+                                    }
+                                }
+                                const float h_im = h_in + i * dilation_h + offset_h;
+                                const float w_im = w_in + j * dilation_w + offset_w;
+    
+                                // Bilinear
+                                const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w;
+                                int h_low = 0;
+                                int w_low = 0;
+                                int h_high = 0;
+                                int w_high = 0;
+                                float w1 = 0.f;
+                                float w2 = 0.f;
+                                float w3 = 0.f;
+                                float w4 = 0.f;
+                                bool v1_cond = false;
+                                bool v2_cond = false;
+                                bool v3_cond = false;
+                                bool v4_cond = false;
+                                if (cond)
+                                {
+                                    h_low = (int)floorf(h_im);
+                                    w_low = (int)floorf(w_im);
+                                    h_high = h_low + 1;
+                                    w_high = w_low + 1;
+    
+                                    float lh = h_im - h_low;
+                                    float lw = w_im - w_low;
+                                    float hh = 1 - lh;
+                                    float hw = 1 - lw;
+    
+                                    v1_cond = (h_low >= 0 && w_low >= 0);
+                                    v2_cond = (h_low >= 0 && w_high <= w - 1);
+                                    v3_cond = (h_high <= h - 1 && w_low >= 0);
+                                    v4_cond = (h_high <= h - 1 && w_high <= w - 1);
+    
+                                    w1 = hh * hw;
+                                    w2 = hh * lw;
+                                    w3 = lh * hw;
+                                    w4 = lh * lw;
+                                }
+    
+                                for (int ic = 0; ic < channels; ic++)
+                                {
+                                    float val = 0.f;
+                                    if (cond)
+                                    {
+                                        float v1 = v1_cond ? bottom_blob.channel(ic).row(h_low)[w_low] : 0.f;
+                                        float v2 = v2_cond ? bottom_blob.channel(ic).row(h_low)[w_high] : 0.f;
+                                        float v3 = v3_cond ? bottom_blob.channel(ic).row(h_high)[w_low] : 0.f;
+                                        float v4 = v4_cond ? bottom_blob.channel(ic).row(h_high)[w_high] : 0.f;
+                                        val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4;
+                                    }
+                                    sum += val * mask_ * weight_ptr[((oc * channels + ic) * kernel_h + i) * kernel_w + j];
+                                }
+                            }
+                        }
+                        top_blob.channel(oc).row(h_col)[w_col] = activation_ss(sum, activation_type, activation_params);
+                    }
                 }
             }
-
-            return DeformableConv2D::forward(bottom_blobs_unpacked, top_blobs, opt);
         }
     }
 

From 5e1a76315208e9743083153e066d8b937cac36c3 Mon Sep 17 00:00:00 2001
From: nihui <171016+nihui@users.noreply.github.com>
Date: Fri, 20 Feb 2026 13:34:37 +0000
Subject: [PATCH 10/10] apply code-format changes

---
 src/layer/riscv/deformableconv2d_riscv.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/layer/riscv/deformableconv2d_riscv.cpp b/src/layer/riscv/deformableconv2d_riscv.cpp
index a0695abe86b3..1bf560afc428 100644
--- a/src/layer/riscv/deformableconv2d_riscv.cpp
+++ b/src/layer/riscv/deformableconv2d_riscv.cpp
@@ -442,7 +442,7 @@ int DeformableConv2D_riscv::forward(const std::vector<Mat>& bottom_blobs, std::v
             const bool offset_not_pack = offset.elempack == 1;
             const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true;
             const float* weight_ptr = weight_data_tm;
-    
+
             // naive deformable conv
             #pragma omp parallel for num_threads(opt.num_threads)
             for (int h_col = 0; h_col < outh; h_col++)
@@ -490,7 +490,7 @@ int DeformableConv2D_riscv::forward(const std::vector<Mat>& bottom_blobs, std::v
                                 }
                                 const float h_im = h_in + i * dilation_h + offset_h;
                                 const float w_im = w_in + j * dilation_w + offset_w;
-    
+
                                 // Bilinear
                                 const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w;
                                 int h_low = 0;
@@ -511,23 +511,23 @@ int DeformableConv2D_riscv::forward(const std::vector<Mat>& bottom_blobs, std::v
                                     w_low = (int)floorf(w_im);
                                     h_high = h_low + 1;
                                     w_high = w_low + 1;
-    
+
                                     float lh = h_im - h_low;
                                     float lw = w_im - w_low;
                                     float hh = 1 - lh;
                                     float hw = 1 - lw;
-    
+
                                     v1_cond = (h_low >= 0 && w_low >= 0);
                                     v2_cond = (h_low >= 0 && w_high <= w - 1);
                                     v3_cond = (h_high <= h - 1 && w_low >= 0);
                                     v4_cond = (h_high <= h - 1 && w_high <= w - 1);
-    
+
                                     w1 = hh * hw;
                                     w2 = hh * lw;
                                     w3 = lh * hw;
                                     w4 = lh * lw;
                                 }
-    
+
                                 for (int ic = 0; ic < channels; ic++)
                                 {
                                     float val = 0.f;