From b606c1e691434f5780ef97e9c4868abcce3ee2f9 Mon Sep 17 00:00:00 2001 From: velonica0 Date: Tue, 14 Apr 2026 09:51:11 +0800 Subject: [PATCH 1/4] [RVV] add rvv f32 kernel for ppmm Add RVV kernel for f32-ppmm with MR={1,2,4} and NR={m1,m2,m4} variants. Tested on SpacemiT K1 CPU and K3 CPU, both VLEN=256. --- bench/f32-bgemm.cc | 22 ++++ cmake/gen/rvv_microkernels.cmake | 13 ++- gen/rvv_microkernels.bzl | 9 ++ scripts/generate-f32-ppmm.sh | 11 ++ src/f32-ppmm/gen/f32-ppmm-1x1v-minmax-rvv.c | 80 ++++++++++++++ src/f32-ppmm/gen/f32-ppmm-1x2v-minmax-rvv.c | 80 ++++++++++++++ src/f32-ppmm/gen/f32-ppmm-1x4v-minmax-rvv.c | 80 ++++++++++++++ src/f32-ppmm/gen/f32-ppmm-2x1v-minmax-rvv.c | 91 ++++++++++++++++ src/f32-ppmm/gen/f32-ppmm-2x2v-minmax-rvv.c | 91 ++++++++++++++++ src/f32-ppmm/gen/f32-ppmm-2x4v-minmax-rvv.c | 91 ++++++++++++++++ src/f32-ppmm/gen/f32-ppmm-4x1v-minmax-rvv.c | 113 ++++++++++++++++++++ src/f32-ppmm/gen/f32-ppmm-4x2v-minmax-rvv.c | 113 ++++++++++++++++++++ src/f32-ppmm/gen/f32-ppmm-4x4v-minmax-rvv.c | 113 ++++++++++++++++++++ src/f32-ppmm/rvv.c.in | 99 +++++++++++++++++ src/xnnpack/ppmm.h | 19 ++++ 15 files changed, 1023 insertions(+), 2 deletions(-) create mode 100644 src/f32-ppmm/gen/f32-ppmm-1x1v-minmax-rvv.c create mode 100644 src/f32-ppmm/gen/f32-ppmm-1x2v-minmax-rvv.c create mode 100644 src/f32-ppmm/gen/f32-ppmm-1x4v-minmax-rvv.c create mode 100644 src/f32-ppmm/gen/f32-ppmm-2x1v-minmax-rvv.c create mode 100644 src/f32-ppmm/gen/f32-ppmm-2x2v-minmax-rvv.c create mode 100644 src/f32-ppmm/gen/f32-ppmm-2x4v-minmax-rvv.c create mode 100644 src/f32-ppmm/gen/f32-ppmm-4x1v-minmax-rvv.c create mode 100644 src/f32-ppmm/gen/f32-ppmm-4x2v-minmax-rvv.c create mode 100644 src/f32-ppmm/gen/f32-ppmm-4x4v-minmax-rvv.c create mode 100644 src/f32-ppmm/rvv.c.in diff --git a/bench/f32-bgemm.cc b/bench/f32-bgemm.cc index 32a444339c0..2288328cf11 100644 --- a/bench/f32-bgemm.cc +++ b/bench/f32-bgemm.cc @@ -1535,6 +1535,28 @@ BENCHMARK_BGEMM(f32_ppmm_4x2_twopass__scalar) BENCHMARK_BGEMM(f32_ppmm_4x4_twopass__scalar) BENCHMARK_BGEMM(f32_ppmm_3x3_twopass__scalar) +#if XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV +static void f32_ppmm_4x4v_unipass__rvv(benchmark::State& state) { + f32_ppmm1p(state, xnn_x32_packw_gemm_goi_ukernel_x4__scalar_float_u4, + xnn_x32_packx_ukernel_4x__scalar, + xnn_f32_ppmm_minmax_ukernel_4x4v__rvv, + xnn_init_f32_minmax_scalar_params, + /*mr=*/4, /*nr=*/4 * xnn_init_hardware_config()->vlenb / sizeof(float), + /*kr=*/1, /*sr=*/1); +} +static void f32_ppmm_4x4v_twopass__rvv(benchmark::State& state) { + f32_ppmm2p(state, xnn_x32_packw_gemm_goi_ukernel_x4__scalar_float_u4, + xnn_x32_packx_ukernel_4x__scalar, + xnn_f32_ppmm_minmax_ukernel_4x4v__rvv, + xnn_init_f32_minmax_scalar_params, + /*mr=*/4, /*nr=*/4 * xnn_init_hardware_config()->vlenb / sizeof(float), + /*kr=*/1, /*sr=*/1); +} + +BENCHMARK_BGEMM(f32_ppmm_4x4v_unipass__rvv) +BENCHMARK_BGEMM(f32_ppmm_4x4v_twopass__rvv) +#endif // XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV + #ifdef BENCHMARK_RUY BENCHMARK_BGEMM(ruy_st) #endif // BENCHMARK_RUY diff --git a/cmake/gen/rvv_microkernels.cmake b/cmake/gen/rvv_microkernels.cmake index e67a14221e0..3a5f0c28bc9 100644 --- a/cmake/gen/rvv_microkernels.cmake +++ b/cmake/gen/rvv_microkernels.cmake @@ -145,7 +145,8 @@ SET(PROD_RVV_MICROKERNEL_SRCS src/x32-transposec/gen/x32-transposec-4x4-rvv.c src/x32-transposec/gen/x32-transposec-8x8-rvv.c src/x32-transposec/gen/x32-transposec-16x8-rvv.c - src/x32-transposec/gen/x32-transposec-32x8-rvv.c) + src/x32-transposec/gen/x32-transposec-32x8-rvv.c + src/f32-ppmm/gen/f32-ppmm-4x4v-minmax-rvv.c) SET(NON_PROD_RVV_MICROKERNEL_SRCS src/f32-conv-hwc2chw/f32-conv-hwc2chw-3x3s2p1c3x2v-rvv-1x1.c @@ -356,6 +357,14 @@ SET(NON_PROD_RVV_MICROKERNEL_SRCS src/x32-packw/gen/x32-packw-x8v-gemm-goi-rvv-u8.c src/x32-transposec/gen/x32-transposec-8xv1-rvv.c src/x32-transposec/gen/x32-transposec-8xv2-rvv.c - src/x32-transposec/gen/x32-transposec-8xv4-rvv.c) + src/x32-transposec/gen/x32-transposec-8xv4-rvv.c + src/f32-ppmm/gen/f32-ppmm-1x1v-minmax-rvv.c + src/f32-ppmm/gen/f32-ppmm-1x2v-minmax-rvv.c + src/f32-ppmm/gen/f32-ppmm-1x4v-minmax-rvv.c + src/f32-ppmm/gen/f32-ppmm-2x1v-minmax-rvv.c + src/f32-ppmm/gen/f32-ppmm-2x2v-minmax-rvv.c + src/f32-ppmm/gen/f32-ppmm-2x4v-minmax-rvv.c + src/f32-ppmm/gen/f32-ppmm-4x1v-minmax-rvv.c + src/f32-ppmm/gen/f32-ppmm-4x2v-minmax-rvv.c) SET(ALL_RVV_MICROKERNEL_SRCS ${PROD_RVV_MICROKERNEL_SRCS} ${NON_PROD_RVV_MICROKERNEL_SRCS}) diff --git a/gen/rvv_microkernels.bzl b/gen/rvv_microkernels.bzl index cb5b1dfb9e9..c18efa693b3 100644 --- a/gen/rvv_microkernels.bzl +++ b/gen/rvv_microkernels.bzl @@ -142,6 +142,7 @@ PROD_RVV_MICROKERNEL_SRCS = [ "src/x32-transposec/gen/x32-transposec-8x8-rvv.c", "src/x32-transposec/gen/x32-transposec-16x8-rvv.c", "src/x32-transposec/gen/x32-transposec-32x8-rvv.c", + "src/f32-ppmm/gen/f32-ppmm-4x4v-minmax-rvv.c", ] NON_PROD_RVV_MICROKERNEL_SRCS = [ @@ -354,6 +355,14 @@ NON_PROD_RVV_MICROKERNEL_SRCS = [ "src/x32-transposec/gen/x32-transposec-8xv1-rvv.c", "src/x32-transposec/gen/x32-transposec-8xv2-rvv.c", "src/x32-transposec/gen/x32-transposec-8xv4-rvv.c", + "src/f32-ppmm/gen/f32-ppmm-1x1v-minmax-rvv.c", + "src/f32-ppmm/gen/f32-ppmm-1x2v-minmax-rvv.c", + "src/f32-ppmm/gen/f32-ppmm-1x4v-minmax-rvv.c", + "src/f32-ppmm/gen/f32-ppmm-2x1v-minmax-rvv.c", + "src/f32-ppmm/gen/f32-ppmm-2x2v-minmax-rvv.c", + "src/f32-ppmm/gen/f32-ppmm-2x4v-minmax-rvv.c", + "src/f32-ppmm/gen/f32-ppmm-4x1v-minmax-rvv.c", + "src/f32-ppmm/gen/f32-ppmm-4x2v-minmax-rvv.c", ] ALL_RVV_MICROKERNEL_SRCS = PROD_RVV_MICROKERNEL_SRCS + NON_PROD_RVV_MICROKERNEL_SRCS diff --git a/scripts/generate-f32-ppmm.sh b/scripts/generate-f32-ppmm.sh index 105be6ccc09..5730f18d024 100755 --- a/scripts/generate-f32-ppmm.sh +++ b/scripts/generate-f32-ppmm.sh @@ -43,4 +43,15 @@ tools/xngen src/f32-ppmm/8x8-aarch64-neonfma-cortex-a75.S.in -D PREFETCH=1 -o sr ################################### x86 SSE ################################### tools/xngen src/f32-ppmm/sse.c.in -D MR=4 -D NR=8 -o src/f32-ppmm/gen/f32-ppmm-4x8-minmax-sse.c & +################################## RISC-V RVV ################################# +tools/xngen src/f32-ppmm/rvv.c.in -D MR=1 -D NR=m1 -o src/f32-ppmm/gen/f32-ppmm-1x1v-minmax-rvv.c & +tools/xngen src/f32-ppmm/rvv.c.in -D MR=1 -D NR=m2 -o src/f32-ppmm/gen/f32-ppmm-1x2v-minmax-rvv.c & +tools/xngen src/f32-ppmm/rvv.c.in -D MR=1 -D NR=m4 -o src/f32-ppmm/gen/f32-ppmm-1x4v-minmax-rvv.c & +tools/xngen src/f32-ppmm/rvv.c.in -D MR=2 -D NR=m1 -o src/f32-ppmm/gen/f32-ppmm-2x1v-minmax-rvv.c & +tools/xngen src/f32-ppmm/rvv.c.in -D MR=2 -D NR=m2 -o src/f32-ppmm/gen/f32-ppmm-2x2v-minmax-rvv.c & +tools/xngen src/f32-ppmm/rvv.c.in -D MR=2 -D NR=m4 -o src/f32-ppmm/gen/f32-ppmm-2x4v-minmax-rvv.c & +tools/xngen src/f32-ppmm/rvv.c.in -D MR=4 -D NR=m1 -o src/f32-ppmm/gen/f32-ppmm-4x1v-minmax-rvv.c & +tools/xngen src/f32-ppmm/rvv.c.in -D MR=4 -D NR=m2 -o src/f32-ppmm/gen/f32-ppmm-4x2v-minmax-rvv.c & +tools/xngen src/f32-ppmm/rvv.c.in -D MR=4 -D NR=m4 -o src/f32-ppmm/gen/f32-ppmm-4x4v-minmax-rvv.c & + wait diff --git a/src/f32-ppmm/gen/f32-ppmm-1x1v-minmax-rvv.c b/src/f32-ppmm/gen/f32-ppmm-1x1v-minmax-rvv.c new file mode 100644 index 00000000000..56febbd05e3 --- /dev/null +++ b/src/f32-ppmm/gen/f32-ppmm-1x1v-minmax-rvv.c @@ -0,0 +1,80 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-ppmm/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/ppmm.h" + + +void xnn_f32_ppmm_minmax_ukernel_1x1v__rvv( + size_t mr, + size_t nc, + size_t kc, + const float* restrict a, + const float* restrict w, + float* restrict c, + size_t cm_stride, + size_t cn_stride, + const struct xnn_f32_minmax_params* restrict params) +{ + assert(mr != 0); + assert(mr <= 1); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(float) == 0); + + const float vmin = params->scalar.min; + const float vmax = params->scalar.max; + + float* c0 = c; + + const size_t nr = __riscv_vsetvlmax_e32m1(); + size_t vl = nr; + do { + if XNN_UNLIKELY(nc < nr) { + vl = __riscv_vsetvl_e32m1(nc); + } + nc -= vl; + + // Load bias. + vfloat32m1_t vacc0 = __riscv_vle32_v_f32m1(w, vl); + w += nr; + + // Inner product loop. + size_t k = kc; + do { + const float va0 = a[0]; + a += 1; + + vfloat32m1_t vb = __riscv_vle32_v_f32m1(w, vl); + w += nr; + + vacc0 = __riscv_vfmacc_vf_f32m1(vacc0, va0, vb, vl); + + k -= sizeof(float); + } while (k != 0); + + // Clamp with min & max. + vacc0 = __riscv_vfmax_vf_f32m1(vacc0, vmin, vl); + + vacc0 = __riscv_vfmin_vf_f32m1(vacc0, vmax, vl); + + // Store results. + __riscv_vse32_v_f32m1(c0, vacc0, vl); + c0 = (float*) ((uintptr_t) c0 + cn_stride); + + a = (const float*) ((uintptr_t) a - kc * 1); + } while (nc != 0); +} diff --git a/src/f32-ppmm/gen/f32-ppmm-1x2v-minmax-rvv.c b/src/f32-ppmm/gen/f32-ppmm-1x2v-minmax-rvv.c new file mode 100644 index 00000000000..768951bd38b --- /dev/null +++ b/src/f32-ppmm/gen/f32-ppmm-1x2v-minmax-rvv.c @@ -0,0 +1,80 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-ppmm/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/ppmm.h" + + +void xnn_f32_ppmm_minmax_ukernel_1x2v__rvv( + size_t mr, + size_t nc, + size_t kc, + const float* restrict a, + const float* restrict w, + float* restrict c, + size_t cm_stride, + size_t cn_stride, + const struct xnn_f32_minmax_params* restrict params) +{ + assert(mr != 0); + assert(mr <= 1); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(float) == 0); + + const float vmin = params->scalar.min; + const float vmax = params->scalar.max; + + float* c0 = c; + + const size_t nr = __riscv_vsetvlmax_e32m2(); + size_t vl = nr; + do { + if XNN_UNLIKELY(nc < nr) { + vl = __riscv_vsetvl_e32m2(nc); + } + nc -= vl; + + // Load bias. + vfloat32m2_t vacc0 = __riscv_vle32_v_f32m2(w, vl); + w += nr; + + // Inner product loop. + size_t k = kc; + do { + const float va0 = a[0]; + a += 1; + + vfloat32m2_t vb = __riscv_vle32_v_f32m2(w, vl); + w += nr; + + vacc0 = __riscv_vfmacc_vf_f32m2(vacc0, va0, vb, vl); + + k -= sizeof(float); + } while (k != 0); + + // Clamp with min & max. + vacc0 = __riscv_vfmax_vf_f32m2(vacc0, vmin, vl); + + vacc0 = __riscv_vfmin_vf_f32m2(vacc0, vmax, vl); + + // Store results. + __riscv_vse32_v_f32m2(c0, vacc0, vl); + c0 = (float*) ((uintptr_t) c0 + cn_stride); + + a = (const float*) ((uintptr_t) a - kc * 1); + } while (nc != 0); +} diff --git a/src/f32-ppmm/gen/f32-ppmm-1x4v-minmax-rvv.c b/src/f32-ppmm/gen/f32-ppmm-1x4v-minmax-rvv.c new file mode 100644 index 00000000000..791b0679b35 --- /dev/null +++ b/src/f32-ppmm/gen/f32-ppmm-1x4v-minmax-rvv.c @@ -0,0 +1,80 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-ppmm/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/ppmm.h" + + +void xnn_f32_ppmm_minmax_ukernel_1x4v__rvv( + size_t mr, + size_t nc, + size_t kc, + const float* restrict a, + const float* restrict w, + float* restrict c, + size_t cm_stride, + size_t cn_stride, + const struct xnn_f32_minmax_params* restrict params) +{ + assert(mr != 0); + assert(mr <= 1); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(float) == 0); + + const float vmin = params->scalar.min; + const float vmax = params->scalar.max; + + float* c0 = c; + + const size_t nr = __riscv_vsetvlmax_e32m4(); + size_t vl = nr; + do { + if XNN_UNLIKELY(nc < nr) { + vl = __riscv_vsetvl_e32m4(nc); + } + nc -= vl; + + // Load bias. + vfloat32m4_t vacc0 = __riscv_vle32_v_f32m4(w, vl); + w += nr; + + // Inner product loop. + size_t k = kc; + do { + const float va0 = a[0]; + a += 1; + + vfloat32m4_t vb = __riscv_vle32_v_f32m4(w, vl); + w += nr; + + vacc0 = __riscv_vfmacc_vf_f32m4(vacc0, va0, vb, vl); + + k -= sizeof(float); + } while (k != 0); + + // Clamp with min & max. + vacc0 = __riscv_vfmax_vf_f32m4(vacc0, vmin, vl); + + vacc0 = __riscv_vfmin_vf_f32m4(vacc0, vmax, vl); + + // Store results. + __riscv_vse32_v_f32m4(c0, vacc0, vl); + c0 = (float*) ((uintptr_t) c0 + cn_stride); + + a = (const float*) ((uintptr_t) a - kc * 1); + } while (nc != 0); +} diff --git a/src/f32-ppmm/gen/f32-ppmm-2x1v-minmax-rvv.c b/src/f32-ppmm/gen/f32-ppmm-2x1v-minmax-rvv.c new file mode 100644 index 00000000000..d518f894b44 --- /dev/null +++ b/src/f32-ppmm/gen/f32-ppmm-2x1v-minmax-rvv.c @@ -0,0 +1,91 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-ppmm/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/ppmm.h" + + +void xnn_f32_ppmm_minmax_ukernel_2x1v__rvv( + size_t mr, + size_t nc, + size_t kc, + const float* restrict a, + const float* restrict w, + float* restrict c, + size_t cm_stride, + size_t cn_stride, + const struct xnn_f32_minmax_params* restrict params) +{ + assert(mr != 0); + assert(mr <= 2); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(float) == 0); + + const float vmin = params->scalar.min; + const float vmax = params->scalar.max; + + float* c0 = c; + float* c1 = (float*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr != 2) { + c1 = c0; + } + + const size_t nr = __riscv_vsetvlmax_e32m1(); + size_t vl = nr; + do { + if XNN_UNLIKELY(nc < nr) { + vl = __riscv_vsetvl_e32m1(nc); + } + nc -= vl; + + // Load bias. + vfloat32m1_t vacc0 = __riscv_vle32_v_f32m1(w, vl); + w += nr; + vfloat32m1_t vacc1 = __riscv_vmv_v_v_f32m1(vacc0, vl); + + // Inner product loop. + size_t k = kc; + do { + const float va0 = a[0]; + const float va1 = a[1]; + a += 2; + + vfloat32m1_t vb = __riscv_vle32_v_f32m1(w, vl); + w += nr; + + vacc0 = __riscv_vfmacc_vf_f32m1(vacc0, va0, vb, vl); + vacc1 = __riscv_vfmacc_vf_f32m1(vacc1, va1, vb, vl); + + k -= sizeof(float); + } while (k != 0); + + // Clamp with min & max. + vacc0 = __riscv_vfmax_vf_f32m1(vacc0, vmin, vl); + vacc1 = __riscv_vfmax_vf_f32m1(vacc1, vmin, vl); + + vacc0 = __riscv_vfmin_vf_f32m1(vacc0, vmax, vl); + vacc1 = __riscv_vfmin_vf_f32m1(vacc1, vmax, vl); + + // Store results. + __riscv_vse32_v_f32m1(c0, vacc0, vl); + c0 = (float*) ((uintptr_t) c0 + cn_stride); + __riscv_vse32_v_f32m1(c1, vacc1, vl); + c1 = (float*) ((uintptr_t) c1 + cn_stride); + + a = (const float*) ((uintptr_t) a - kc * 2); + } while (nc != 0); +} diff --git a/src/f32-ppmm/gen/f32-ppmm-2x2v-minmax-rvv.c b/src/f32-ppmm/gen/f32-ppmm-2x2v-minmax-rvv.c new file mode 100644 index 00000000000..07300e7a962 --- /dev/null +++ b/src/f32-ppmm/gen/f32-ppmm-2x2v-minmax-rvv.c @@ -0,0 +1,91 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-ppmm/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/ppmm.h" + + +void xnn_f32_ppmm_minmax_ukernel_2x2v__rvv( + size_t mr, + size_t nc, + size_t kc, + const float* restrict a, + const float* restrict w, + float* restrict c, + size_t cm_stride, + size_t cn_stride, + const struct xnn_f32_minmax_params* restrict params) +{ + assert(mr != 0); + assert(mr <= 2); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(float) == 0); + + const float vmin = params->scalar.min; + const float vmax = params->scalar.max; + + float* c0 = c; + float* c1 = (float*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr != 2) { + c1 = c0; + } + + const size_t nr = __riscv_vsetvlmax_e32m2(); + size_t vl = nr; + do { + if XNN_UNLIKELY(nc < nr) { + vl = __riscv_vsetvl_e32m2(nc); + } + nc -= vl; + + // Load bias. + vfloat32m2_t vacc0 = __riscv_vle32_v_f32m2(w, vl); + w += nr; + vfloat32m2_t vacc1 = __riscv_vmv_v_v_f32m2(vacc0, vl); + + // Inner product loop. + size_t k = kc; + do { + const float va0 = a[0]; + const float va1 = a[1]; + a += 2; + + vfloat32m2_t vb = __riscv_vle32_v_f32m2(w, vl); + w += nr; + + vacc0 = __riscv_vfmacc_vf_f32m2(vacc0, va0, vb, vl); + vacc1 = __riscv_vfmacc_vf_f32m2(vacc1, va1, vb, vl); + + k -= sizeof(float); + } while (k != 0); + + // Clamp with min & max. + vacc0 = __riscv_vfmax_vf_f32m2(vacc0, vmin, vl); + vacc1 = __riscv_vfmax_vf_f32m2(vacc1, vmin, vl); + + vacc0 = __riscv_vfmin_vf_f32m2(vacc0, vmax, vl); + vacc1 = __riscv_vfmin_vf_f32m2(vacc1, vmax, vl); + + // Store results. + __riscv_vse32_v_f32m2(c0, vacc0, vl); + c0 = (float*) ((uintptr_t) c0 + cn_stride); + __riscv_vse32_v_f32m2(c1, vacc1, vl); + c1 = (float*) ((uintptr_t) c1 + cn_stride); + + a = (const float*) ((uintptr_t) a - kc * 2); + } while (nc != 0); +} diff --git a/src/f32-ppmm/gen/f32-ppmm-2x4v-minmax-rvv.c b/src/f32-ppmm/gen/f32-ppmm-2x4v-minmax-rvv.c new file mode 100644 index 00000000000..d64f82d4563 --- /dev/null +++ b/src/f32-ppmm/gen/f32-ppmm-2x4v-minmax-rvv.c @@ -0,0 +1,91 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-ppmm/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/ppmm.h" + + +void xnn_f32_ppmm_minmax_ukernel_2x4v__rvv( + size_t mr, + size_t nc, + size_t kc, + const float* restrict a, + const float* restrict w, + float* restrict c, + size_t cm_stride, + size_t cn_stride, + const struct xnn_f32_minmax_params* restrict params) +{ + assert(mr != 0); + assert(mr <= 2); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(float) == 0); + + const float vmin = params->scalar.min; + const float vmax = params->scalar.max; + + float* c0 = c; + float* c1 = (float*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr != 2) { + c1 = c0; + } + + const size_t nr = __riscv_vsetvlmax_e32m4(); + size_t vl = nr; + do { + if XNN_UNLIKELY(nc < nr) { + vl = __riscv_vsetvl_e32m4(nc); + } + nc -= vl; + + // Load bias. + vfloat32m4_t vacc0 = __riscv_vle32_v_f32m4(w, vl); + w += nr; + vfloat32m4_t vacc1 = __riscv_vmv_v_v_f32m4(vacc0, vl); + + // Inner product loop. + size_t k = kc; + do { + const float va0 = a[0]; + const float va1 = a[1]; + a += 2; + + vfloat32m4_t vb = __riscv_vle32_v_f32m4(w, vl); + w += nr; + + vacc0 = __riscv_vfmacc_vf_f32m4(vacc0, va0, vb, vl); + vacc1 = __riscv_vfmacc_vf_f32m4(vacc1, va1, vb, vl); + + k -= sizeof(float); + } while (k != 0); + + // Clamp with min & max. + vacc0 = __riscv_vfmax_vf_f32m4(vacc0, vmin, vl); + vacc1 = __riscv_vfmax_vf_f32m4(vacc1, vmin, vl); + + vacc0 = __riscv_vfmin_vf_f32m4(vacc0, vmax, vl); + vacc1 = __riscv_vfmin_vf_f32m4(vacc1, vmax, vl); + + // Store results. + __riscv_vse32_v_f32m4(c0, vacc0, vl); + c0 = (float*) ((uintptr_t) c0 + cn_stride); + __riscv_vse32_v_f32m4(c1, vacc1, vl); + c1 = (float*) ((uintptr_t) c1 + cn_stride); + + a = (const float*) ((uintptr_t) a - kc * 2); + } while (nc != 0); +} diff --git a/src/f32-ppmm/gen/f32-ppmm-4x1v-minmax-rvv.c b/src/f32-ppmm/gen/f32-ppmm-4x1v-minmax-rvv.c new file mode 100644 index 00000000000..0510c840e01 --- /dev/null +++ b/src/f32-ppmm/gen/f32-ppmm-4x1v-minmax-rvv.c @@ -0,0 +1,113 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-ppmm/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/ppmm.h" + + +void xnn_f32_ppmm_minmax_ukernel_4x1v__rvv( + size_t mr, + size_t nc, + size_t kc, + const float* restrict a, + const float* restrict w, + float* restrict c, + size_t cm_stride, + size_t cn_stride, + const struct xnn_f32_minmax_params* restrict params) +{ + assert(mr != 0); + assert(mr <= 4); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(float) == 0); + + const float vmin = params->scalar.min; + const float vmax = params->scalar.max; + + float* c0 = c; + float* c1 = (float*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr < 2) { + c1 = c0; + } + float* c2 = (float*) ((uintptr_t) c1 + cm_stride); + if XNN_UNPREDICTABLE(mr <= 2) { + c2 = c1; + } + float* c3 = (float*) ((uintptr_t) c2 + cm_stride); + if XNN_UNPREDICTABLE(mr != 4) { + c3 = c2; + } + + const size_t nr = __riscv_vsetvlmax_e32m1(); + size_t vl = nr; + do { + if XNN_UNLIKELY(nc < nr) { + vl = __riscv_vsetvl_e32m1(nc); + } + nc -= vl; + + // Load bias. + vfloat32m1_t vacc0 = __riscv_vle32_v_f32m1(w, vl); + w += nr; + vfloat32m1_t vacc1 = __riscv_vmv_v_v_f32m1(vacc0, vl); + vfloat32m1_t vacc2 = __riscv_vmv_v_v_f32m1(vacc0, vl); + vfloat32m1_t vacc3 = __riscv_vmv_v_v_f32m1(vacc0, vl); + + // Inner product loop. + size_t k = kc; + do { + const float va0 = a[0]; + const float va1 = a[1]; + const float va2 = a[2]; + const float va3 = a[3]; + a += 4; + + vfloat32m1_t vb = __riscv_vle32_v_f32m1(w, vl); + w += nr; + + vacc0 = __riscv_vfmacc_vf_f32m1(vacc0, va0, vb, vl); + vacc1 = __riscv_vfmacc_vf_f32m1(vacc1, va1, vb, vl); + vacc2 = __riscv_vfmacc_vf_f32m1(vacc2, va2, vb, vl); + vacc3 = __riscv_vfmacc_vf_f32m1(vacc3, va3, vb, vl); + + k -= sizeof(float); + } while (k != 0); + + // Clamp with min & max. + vacc0 = __riscv_vfmax_vf_f32m1(vacc0, vmin, vl); + vacc1 = __riscv_vfmax_vf_f32m1(vacc1, vmin, vl); + vacc2 = __riscv_vfmax_vf_f32m1(vacc2, vmin, vl); + vacc3 = __riscv_vfmax_vf_f32m1(vacc3, vmin, vl); + + vacc0 = __riscv_vfmin_vf_f32m1(vacc0, vmax, vl); + vacc1 = __riscv_vfmin_vf_f32m1(vacc1, vmax, vl); + vacc2 = __riscv_vfmin_vf_f32m1(vacc2, vmax, vl); + vacc3 = __riscv_vfmin_vf_f32m1(vacc3, vmax, vl); + + // Store results. + __riscv_vse32_v_f32m1(c0, vacc0, vl); + c0 = (float*) ((uintptr_t) c0 + cn_stride); + __riscv_vse32_v_f32m1(c1, vacc1, vl); + c1 = (float*) ((uintptr_t) c1 + cn_stride); + __riscv_vse32_v_f32m1(c2, vacc2, vl); + c2 = (float*) ((uintptr_t) c2 + cn_stride); + __riscv_vse32_v_f32m1(c3, vacc3, vl); + c3 = (float*) ((uintptr_t) c3 + cn_stride); + + a = (const float*) ((uintptr_t) a - kc * 4); + } while (nc != 0); +} diff --git a/src/f32-ppmm/gen/f32-ppmm-4x2v-minmax-rvv.c b/src/f32-ppmm/gen/f32-ppmm-4x2v-minmax-rvv.c new file mode 100644 index 00000000000..4d3e5d16015 --- /dev/null +++ b/src/f32-ppmm/gen/f32-ppmm-4x2v-minmax-rvv.c @@ -0,0 +1,113 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-ppmm/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/ppmm.h" + + +void xnn_f32_ppmm_minmax_ukernel_4x2v__rvv( + size_t mr, + size_t nc, + size_t kc, + const float* restrict a, + const float* restrict w, + float* restrict c, + size_t cm_stride, + size_t cn_stride, + const struct xnn_f32_minmax_params* restrict params) +{ + assert(mr != 0); + assert(mr <= 4); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(float) == 0); + + const float vmin = params->scalar.min; + const float vmax = params->scalar.max; + + float* c0 = c; + float* c1 = (float*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr < 2) { + c1 = c0; + } + float* c2 = (float*) ((uintptr_t) c1 + cm_stride); + if XNN_UNPREDICTABLE(mr <= 2) { + c2 = c1; + } + float* c3 = (float*) ((uintptr_t) c2 + cm_stride); + if XNN_UNPREDICTABLE(mr != 4) { + c3 = c2; + } + + const size_t nr = __riscv_vsetvlmax_e32m2(); + size_t vl = nr; + do { + if XNN_UNLIKELY(nc < nr) { + vl = __riscv_vsetvl_e32m2(nc); + } + nc -= vl; + + // Load bias. + vfloat32m2_t vacc0 = __riscv_vle32_v_f32m2(w, vl); + w += nr; + vfloat32m2_t vacc1 = __riscv_vmv_v_v_f32m2(vacc0, vl); + vfloat32m2_t vacc2 = __riscv_vmv_v_v_f32m2(vacc0, vl); + vfloat32m2_t vacc3 = __riscv_vmv_v_v_f32m2(vacc0, vl); + + // Inner product loop. + size_t k = kc; + do { + const float va0 = a[0]; + const float va1 = a[1]; + const float va2 = a[2]; + const float va3 = a[3]; + a += 4; + + vfloat32m2_t vb = __riscv_vle32_v_f32m2(w, vl); + w += nr; + + vacc0 = __riscv_vfmacc_vf_f32m2(vacc0, va0, vb, vl); + vacc1 = __riscv_vfmacc_vf_f32m2(vacc1, va1, vb, vl); + vacc2 = __riscv_vfmacc_vf_f32m2(vacc2, va2, vb, vl); + vacc3 = __riscv_vfmacc_vf_f32m2(vacc3, va3, vb, vl); + + k -= sizeof(float); + } while (k != 0); + + // Clamp with min & max. + vacc0 = __riscv_vfmax_vf_f32m2(vacc0, vmin, vl); + vacc1 = __riscv_vfmax_vf_f32m2(vacc1, vmin, vl); + vacc2 = __riscv_vfmax_vf_f32m2(vacc2, vmin, vl); + vacc3 = __riscv_vfmax_vf_f32m2(vacc3, vmin, vl); + + vacc0 = __riscv_vfmin_vf_f32m2(vacc0, vmax, vl); + vacc1 = __riscv_vfmin_vf_f32m2(vacc1, vmax, vl); + vacc2 = __riscv_vfmin_vf_f32m2(vacc2, vmax, vl); + vacc3 = __riscv_vfmin_vf_f32m2(vacc3, vmax, vl); + + // Store results. + __riscv_vse32_v_f32m2(c0, vacc0, vl); + c0 = (float*) ((uintptr_t) c0 + cn_stride); + __riscv_vse32_v_f32m2(c1, vacc1, vl); + c1 = (float*) ((uintptr_t) c1 + cn_stride); + __riscv_vse32_v_f32m2(c2, vacc2, vl); + c2 = (float*) ((uintptr_t) c2 + cn_stride); + __riscv_vse32_v_f32m2(c3, vacc3, vl); + c3 = (float*) ((uintptr_t) c3 + cn_stride); + + a = (const float*) ((uintptr_t) a - kc * 4); + } while (nc != 0); +} diff --git a/src/f32-ppmm/gen/f32-ppmm-4x4v-minmax-rvv.c b/src/f32-ppmm/gen/f32-ppmm-4x4v-minmax-rvv.c new file mode 100644 index 00000000000..6e5db8bf8a1 --- /dev/null +++ b/src/f32-ppmm/gen/f32-ppmm-4x4v-minmax-rvv.c @@ -0,0 +1,113 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-ppmm/rvv.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/ppmm.h" + + +void xnn_f32_ppmm_minmax_ukernel_4x4v__rvv( + size_t mr, + size_t nc, + size_t kc, + const float* restrict a, + const float* restrict w, + float* restrict c, + size_t cm_stride, + size_t cn_stride, + const struct xnn_f32_minmax_params* restrict params) +{ + assert(mr != 0); + assert(mr <= 4); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(float) == 0); + + const float vmin = params->scalar.min; + const float vmax = params->scalar.max; + + float* c0 = c; + float* c1 = (float*) ((uintptr_t) c0 + cm_stride); + if XNN_UNPREDICTABLE(mr < 2) { + c1 = c0; + } + float* c2 = (float*) ((uintptr_t) c1 + cm_stride); + if XNN_UNPREDICTABLE(mr <= 2) { + c2 = c1; + } + float* c3 = (float*) ((uintptr_t) c2 + cm_stride); + if XNN_UNPREDICTABLE(mr != 4) { + c3 = c2; + } + + const size_t nr = __riscv_vsetvlmax_e32m4(); + size_t vl = nr; + do { + if XNN_UNLIKELY(nc < nr) { + vl = __riscv_vsetvl_e32m4(nc); + } + nc -= vl; + + // Load bias. + vfloat32m4_t vacc0 = __riscv_vle32_v_f32m4(w, vl); + w += nr; + vfloat32m4_t vacc1 = __riscv_vmv_v_v_f32m4(vacc0, vl); + vfloat32m4_t vacc2 = __riscv_vmv_v_v_f32m4(vacc0, vl); + vfloat32m4_t vacc3 = __riscv_vmv_v_v_f32m4(vacc0, vl); + + // Inner product loop. + size_t k = kc; + do { + const float va0 = a[0]; + const float va1 = a[1]; + const float va2 = a[2]; + const float va3 = a[3]; + a += 4; + + vfloat32m4_t vb = __riscv_vle32_v_f32m4(w, vl); + w += nr; + + vacc0 = __riscv_vfmacc_vf_f32m4(vacc0, va0, vb, vl); + vacc1 = __riscv_vfmacc_vf_f32m4(vacc1, va1, vb, vl); + vacc2 = __riscv_vfmacc_vf_f32m4(vacc2, va2, vb, vl); + vacc3 = __riscv_vfmacc_vf_f32m4(vacc3, va3, vb, vl); + + k -= sizeof(float); + } while (k != 0); + + // Clamp with min & max. + vacc0 = __riscv_vfmax_vf_f32m4(vacc0, vmin, vl); + vacc1 = __riscv_vfmax_vf_f32m4(vacc1, vmin, vl); + vacc2 = __riscv_vfmax_vf_f32m4(vacc2, vmin, vl); + vacc3 = __riscv_vfmax_vf_f32m4(vacc3, vmin, vl); + + vacc0 = __riscv_vfmin_vf_f32m4(vacc0, vmax, vl); + vacc1 = __riscv_vfmin_vf_f32m4(vacc1, vmax, vl); + vacc2 = __riscv_vfmin_vf_f32m4(vacc2, vmax, vl); + vacc3 = __riscv_vfmin_vf_f32m4(vacc3, vmax, vl); + + // Store results. + __riscv_vse32_v_f32m4(c0, vacc0, vl); + c0 = (float*) ((uintptr_t) c0 + cn_stride); + __riscv_vse32_v_f32m4(c1, vacc1, vl); + c1 = (float*) ((uintptr_t) c1 + cn_stride); + __riscv_vse32_v_f32m4(c2, vacc2, vl); + c2 = (float*) ((uintptr_t) c2 + cn_stride); + __riscv_vse32_v_f32m4(c3, vacc3, vl); + c3 = (float*) ((uintptr_t) c3 + cn_stride); + + a = (const float*) ((uintptr_t) a - kc * 4); + } while (nc != 0); +} diff --git a/src/f32-ppmm/rvv.c.in b/src/f32-ppmm/rvv.c.in new file mode 100644 index 00000000000..b284f82a628 --- /dev/null +++ b/src/f32-ppmm/rvv.c.in @@ -0,0 +1,99 @@ +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +$assert MR >= 1 +$assert NR in ["m1", "m2", "m4", "m8"] +$LMUL = NR[1] +#include +#include +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/ppmm.h" + + +void xnn_f32_ppmm_minmax_ukernel_${MR}x${LMUL}v__rvv( + size_t mr, + size_t nc, + size_t kc, + const float* restrict a, + const float* restrict w, + float* restrict c, + size_t cm_stride, + size_t cn_stride, + const struct xnn_f32_minmax_params* restrict params) +{ + assert(mr != 0); + assert(mr <= ${MR}); + assert(nc != 0); + assert(kc != 0); + assert(kc % sizeof(float) == 0); + + const float vmin = params->scalar.min; + const float vmax = params->scalar.max; + + float* c0 = c; + $for M in range(1, MR): + float* c${M} = (float*) ((uintptr_t) c${M-1} + cm_stride); + $if M % 2 == 0: + if XNN_UNPREDICTABLE(mr <= ${M}) { + c${M} = c${M-1}; + } + $elif M + 1 == MR: + if XNN_UNPREDICTABLE(mr != ${M+1}) { + c${M} = c${M-1}; + } + $else: + if XNN_UNPREDICTABLE(mr < ${M+1}) { + c${M} = c${M-1}; + } + + const size_t nr = __riscv_vsetvlmax_e32m${LMUL}(); + size_t vl = nr; + do { + if XNN_UNLIKELY(nc < nr) { + vl = __riscv_vsetvl_e32m${LMUL}(nc); + } + nc -= vl; + + // Load bias. + vfloat32m${LMUL}_t vacc0 = __riscv_vle32_v_f32m${LMUL}(w, vl); + w += nr; + $for M in range(1, MR): + vfloat32m${LMUL}_t vacc${M} = __riscv_vmv_v_v_f32m${LMUL}(vacc0, vl); + + // Inner product loop. + size_t k = kc; + do { + $for M in range(MR): + const float va${M} = a[${M}]; + a += ${MR}; + + vfloat32m${LMUL}_t vb = __riscv_vle32_v_f32m${LMUL}(w, vl); + w += nr; + + $for M in range(MR): + vacc${M} = __riscv_vfmacc_vf_f32m${LMUL}(vacc${M}, va${M}, vb, vl); + + k -= sizeof(float); + } while (k != 0); + + // Clamp with min & max. + $for M in range(MR): + vacc${M} = __riscv_vfmax_vf_f32m${LMUL}(vacc${M}, vmin, vl); + + $for M in range(MR): + vacc${M} = __riscv_vfmin_vf_f32m${LMUL}(vacc${M}, vmax, vl); + + // Store results. + $for M in range(MR): + __riscv_vse32_v_f32m${LMUL}(c${M}, vacc${M}, vl); + c${M} = (float*) ((uintptr_t) c${M} + cn_stride); + + a = (const float*) ((uintptr_t) a - kc * ${MR}); + } while (nc != 0); +} diff --git a/src/xnnpack/ppmm.h b/src/xnnpack/ppmm.h index c858634699e..6ced7451199 100644 --- a/src/xnnpack/ppmm.h +++ b/src/xnnpack/ppmm.h @@ -77,6 +77,25 @@ DECLARE_F32_PPMM_MINMAX_UKERNEL_FUNCTION( DECLARE_F32_PPMM_MINMAX_UKERNEL_FUNCTION( xnn_f32_ppmm_minmax_ukernel_4x4__scalar) +DECLARE_F32_PPMM_MINMAX_UKERNEL_FUNCTION( + xnn_f32_ppmm_minmax_ukernel_1x1v__rvv) +DECLARE_F32_PPMM_MINMAX_UKERNEL_FUNCTION( + xnn_f32_ppmm_minmax_ukernel_1x2v__rvv) +DECLARE_F32_PPMM_MINMAX_UKERNEL_FUNCTION( + xnn_f32_ppmm_minmax_ukernel_1x4v__rvv) +DECLARE_F32_PPMM_MINMAX_UKERNEL_FUNCTION( + xnn_f32_ppmm_minmax_ukernel_2x1v__rvv) +DECLARE_F32_PPMM_MINMAX_UKERNEL_FUNCTION( + xnn_f32_ppmm_minmax_ukernel_2x2v__rvv) +DECLARE_F32_PPMM_MINMAX_UKERNEL_FUNCTION( + xnn_f32_ppmm_minmax_ukernel_2x4v__rvv) +DECLARE_F32_PPMM_MINMAX_UKERNEL_FUNCTION( + xnn_f32_ppmm_minmax_ukernel_4x1v__rvv) +DECLARE_F32_PPMM_MINMAX_UKERNEL_FUNCTION( + xnn_f32_ppmm_minmax_ukernel_4x2v__rvv) +DECLARE_F32_PPMM_MINMAX_UKERNEL_FUNCTION( + xnn_f32_ppmm_minmax_ukernel_4x4v__rvv) + #ifdef __cplusplus } // extern "C" #endif From 025108a1de0006c627baf1f92070ce053697a841 Mon Sep 17 00:00:00 2001 From: velonica0 Date: Wed, 15 Apr 2026 09:31:50 +0800 Subject: [PATCH 2/4] Alphabetize f32-ppmm entries in cmake/bzl lists Move f32-ppmm entries from end of PROD/NON_PROD lists to alphabetical position (between f32-maxpool and f32-qs8-vcvt) to avoid merge conflicts with other RVV PRs. --- cmake/gen/rvv_microkernels.cmake | 12 +++++++++++- gen/rvv_microkernels.bzl | 10 +++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/cmake/gen/rvv_microkernels.cmake b/cmake/gen/rvv_microkernels.cmake index 3a5f0c28bc9..560b6640e1c 100644 --- a/cmake/gen/rvv_microkernels.cmake +++ b/cmake/gen/rvv_microkernels.cmake @@ -28,6 +28,7 @@ SET(PROD_RVV_MICROKERNEL_SRCS src/f32-igemm/gen/f32-igemm-1x4v-minmax-rvv.c src/f32-igemm/gen/f32-igemm-7x4v-minmax-rvv.c src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u2v.c + src/f32-ppmm/gen/f32-ppmm-4x4v-minmax-rvv.c src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u2v.c src/f32-qu8-vcvt/gen/f32-qu8-vcvt-rvv-u2v.c src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-rvv-rr2-p6-u4v.c @@ -146,7 +147,6 @@ SET(PROD_RVV_MICROKERNEL_SRCS src/x32-transposec/gen/x32-transposec-8x8-rvv.c src/x32-transposec/gen/x32-transposec-16x8-rvv.c src/x32-transposec/gen/x32-transposec-32x8-rvv.c - src/f32-ppmm/gen/f32-ppmm-4x4v-minmax-rvv.c) SET(NON_PROD_RVV_MICROKERNEL_SRCS src/f32-conv-hwc2chw/f32-conv-hwc2chw-3x3s2p1c3x2v-rvv-1x1.c @@ -169,6 +169,16 @@ SET(NON_PROD_RVV_MICROKERNEL_SRCS src/f32-gemm/gen/f32-gemm-7x4v-rvv.c src/f32-igemm/gen/f32-igemm-1x4v-rvv.c src/f32-igemm/gen/f32-igemm-7x4v-rvv.c + + src/f32-ppmm/gen/f32-ppmm-1x1v-minmax-rvv.c + src/f32-ppmm/gen/f32-ppmm-1x2v-minmax-rvv.c + src/f32-ppmm/gen/f32-ppmm-1x4v-minmax-rvv.c + src/f32-ppmm/gen/f32-ppmm-2x1v-minmax-rvv.c + src/f32-ppmm/gen/f32-ppmm-2x2v-minmax-rvv.c + src/f32-ppmm/gen/f32-ppmm-2x4v-minmax-rvv.c + src/f32-ppmm/gen/f32-ppmm-4x1v-minmax-rvv.c + src/f32-ppmm/gen/f32-ppmm-4x2v-minmax-rvv.c + src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u1v.c src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u4v.c src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u8v.c diff --git a/gen/rvv_microkernels.bzl b/gen/rvv_microkernels.bzl index c18efa693b3..9980e53f4bd 100644 --- a/gen/rvv_microkernels.bzl +++ b/gen/rvv_microkernels.bzl @@ -24,6 +24,7 @@ PROD_RVV_MICROKERNEL_SRCS = [ "src/f32-igemm/gen/f32-igemm-1x4v-minmax-rvv.c", "src/f32-igemm/gen/f32-igemm-7x4v-minmax-rvv.c", "src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u2v.c", + "src/f32-ppmm/gen/f32-ppmm-4x4v-minmax-rvv.c", "src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u2v.c", "src/f32-qu8-vcvt/gen/f32-qu8-vcvt-rvv-u2v.c", "src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-rvv-rr2-p6-u4v.c", @@ -142,7 +143,6 @@ PROD_RVV_MICROKERNEL_SRCS = [ "src/x32-transposec/gen/x32-transposec-8x8-rvv.c", "src/x32-transposec/gen/x32-transposec-16x8-rvv.c", "src/x32-transposec/gen/x32-transposec-32x8-rvv.c", - "src/f32-ppmm/gen/f32-ppmm-4x4v-minmax-rvv.c", ] NON_PROD_RVV_MICROKERNEL_SRCS = [ @@ -166,6 +166,14 @@ NON_PROD_RVV_MICROKERNEL_SRCS = [ "src/f32-gemm/gen/f32-gemm-7x4v-rvv.c", "src/f32-igemm/gen/f32-igemm-1x4v-rvv.c", "src/f32-igemm/gen/f32-igemm-7x4v-rvv.c", + "src/f32-ppmm/gen/f32-ppmm-1x1v-minmax-rvv.c", + "src/f32-ppmm/gen/f32-ppmm-1x2v-minmax-rvv.c", + "src/f32-ppmm/gen/f32-ppmm-1x4v-minmax-rvv.c", + "src/f32-ppmm/gen/f32-ppmm-2x1v-minmax-rvv.c", + "src/f32-ppmm/gen/f32-ppmm-2x2v-minmax-rvv.c", + "src/f32-ppmm/gen/f32-ppmm-2x4v-minmax-rvv.c", + "src/f32-ppmm/gen/f32-ppmm-4x1v-minmax-rvv.c", + "src/f32-ppmm/gen/f32-ppmm-4x2v-minmax-rvv.c", "src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u1v.c", "src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u4v.c", "src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u8v.c", From 7dbe384826834eed0d2029f6d7e24c7433e388fd Mon Sep 17 00:00:00 2001 From: velonica0 Date: Mon, 20 Apr 2026 11:01:21 +0800 Subject: [PATCH 3/4] fix cmake bug --- .codex | 0 cmake/gen/rvv_microkernels.cmake | 4 +--- 2 files changed, 1 insertion(+), 3 deletions(-) create mode 100644 .codex diff --git a/.codex b/.codex new file mode 100644 index 00000000000..e69de29bb2d diff --git a/cmake/gen/rvv_microkernels.cmake b/cmake/gen/rvv_microkernels.cmake index 560b6640e1c..17f548dfb37 100644 --- a/cmake/gen/rvv_microkernels.cmake +++ b/cmake/gen/rvv_microkernels.cmake @@ -146,7 +146,7 @@ SET(PROD_RVV_MICROKERNEL_SRCS src/x32-transposec/gen/x32-transposec-4x4-rvv.c src/x32-transposec/gen/x32-transposec-8x8-rvv.c src/x32-transposec/gen/x32-transposec-16x8-rvv.c - src/x32-transposec/gen/x32-transposec-32x8-rvv.c + src/x32-transposec/gen/x32-transposec-32x8-rvv.c) SET(NON_PROD_RVV_MICROKERNEL_SRCS src/f32-conv-hwc2chw/f32-conv-hwc2chw-3x3s2p1c3x2v-rvv-1x1.c @@ -169,7 +169,6 @@ SET(NON_PROD_RVV_MICROKERNEL_SRCS src/f32-gemm/gen/f32-gemm-7x4v-rvv.c src/f32-igemm/gen/f32-igemm-1x4v-rvv.c src/f32-igemm/gen/f32-igemm-7x4v-rvv.c - src/f32-ppmm/gen/f32-ppmm-1x1v-minmax-rvv.c src/f32-ppmm/gen/f32-ppmm-1x2v-minmax-rvv.c src/f32-ppmm/gen/f32-ppmm-1x4v-minmax-rvv.c @@ -178,7 +177,6 @@ SET(NON_PROD_RVV_MICROKERNEL_SRCS src/f32-ppmm/gen/f32-ppmm-2x4v-minmax-rvv.c src/f32-ppmm/gen/f32-ppmm-4x1v-minmax-rvv.c src/f32-ppmm/gen/f32-ppmm-4x2v-minmax-rvv.c - src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u1v.c src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u4v.c src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u8v.c From 49eba529e283e5e57130ad03c2b1ac934802206f Mon Sep 17 00:00:00 2001 From: velonica0 Date: Mon, 18 May 2026 14:03:00 +0800 Subject: [PATCH 4/4] test --- test/f32-ppmm-minmax.cc | 423 ++++++++++++++++++++++++++++++++++++++ test/f32-ppmm-minmax.yaml | 38 ++++ 2 files changed, 461 insertions(+) diff --git a/test/f32-ppmm-minmax.cc b/test/f32-ppmm-minmax.cc index 7f7bc5fe430..010e5eeb54a 100644 --- a/test/f32-ppmm-minmax.cc +++ b/test/f32-ppmm-minmax.cc @@ -293,6 +293,255 @@ std::vector CreateTests1( return gemm_tests; } +#if XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV + // NOLINTNEXTLINE(clang-diagnostic-unused-function) + std::vector CreateTests2( + size_t k_block, size_t adj_k_block, + ConstantOrFunction mr, ConstantOrFunction nr, size_t kr, size_t sr, + bool is_igemm, + bool unsigned_inputs, + uint8_t planes, + std::function test_func, + uint64_t arch_flags = 0) { + std::string kbs = std::to_string(k_block); + std::string kb2s = std::to_string(k_block * 2); + std::string akbs = std::to_string(adj_k_block); + nr = nr * xnn_init_hardware_config()->vlenb / sizeof(float); + std::string nrs = std::to_string(nr); + + const GemmMicrokernelTester tester = GemmMicrokernelTester() + .mr(mr).nr(nr).kr(kr).sr(sr).unsigned_inputs(unsigned_inputs).planes(planes); + + std::vector gemm_tests; + gemm_tests.reserve(42); + + gemm_tests.push_back(GemmTestParams( + "k_eq_" + kbs, + tester.clone() + .m(mr).n(nr).k(k_block) + , test_func, arch_flags)); + if (!is_igemm) { + gemm_tests.push_back(GemmTestParams( + "k_eq_" + kbs + "_strided_a", + tester.clone() + .m(mr).n(nr).k(k_block) + .a_stride(xnnpack::NextPrime(k_block + 1)) + , test_func, arch_flags)); + } + gemm_tests.push_back(GemmTestParams( + "k_eq_" + kbs + "_subtile", + tester.clone() + .k(k_block) + , test_func, arch_flags) + .loop_n(1, nr) + .loop_m(1, mr)); + gemm_tests.push_back(GemmTestParams( + "k_eq_" + kbs + "_subtile_m", + tester.clone() + .n(nr).k(k_block) + , test_func, arch_flags) + .loop_m(1, mr)); + gemm_tests.push_back(GemmTestParams( + "k_eq_" + kbs + "_subtile_n", + tester.clone() + .m(mr).k(k_block) + , test_func, arch_flags) + .loop_n(1, nr)); + if (k_block > 1) { + gemm_tests.push_back(GemmTestParams( + "k_lt_" + akbs, + tester.clone() + .m(mr).n(nr) + , test_func, arch_flags) + .loop_k(1, adj_k_block - 1)); + if (!is_igemm) { + gemm_tests.push_back(GemmTestParams( + "k_lt_" + akbs + "_strided_a", + tester.clone() + .m(mr).n(nr) + .a_stride(xnnpack::NextPrime(adj_k_block + 1)) + , test_func, arch_flags) + .loop_k(1, adj_k_block - 1)); + } + gemm_tests.push_back(GemmTestParams( + "k_lt_" + akbs + "_subtile", + tester.clone() + , test_func, arch_flags) + .loop_k(1, adj_k_block - 1) + .loop_n(1, nr) + .loop_m(1, mr)); + } + gemm_tests.push_back(GemmTestParams( + "k_gt_" + akbs, + tester.clone() + .m(mr).n(nr) + , test_func, arch_flags) + .loop_k(adj_k_block + 1, adj_k_block * 2 - 1, k_block)); + if (is_igemm) { + gemm_tests.push_back(GemmTestParams( + "k_gt_" + akbs + "_strided_a", + tester.clone() + .m(mr).n(nr) + .a_stride(xnnpack::NextPrime(adj_k_block * 2 + 1)) + , test_func, arch_flags) + .loop_k(adj_k_block + 1, adj_k_block * 2 - 1, k_block)); + } + gemm_tests.push_back(GemmTestParams( + "k_gt_" + akbs + "_subtile", + tester.clone() + , test_func, arch_flags) + .loop_k(adj_k_block + 1, adj_k_block * 2 - 1, k_block) + .loop_n(1, nr) + .loop_m(1, mr)); + if (k_block > 1) { + gemm_tests.push_back(GemmTestParams( + "k_div_" + kbs, + tester.clone() + .m(mr).n(nr) + , test_func, arch_flags) + .loop_k(adj_k_block + k_block, k_block * 5, k_block)); + if (is_igemm) { + gemm_tests.push_back(GemmTestParams( + "k_div_" + kbs + "_strided_a", + tester.clone() + .m(mr).n(nr) + .a_stride(xnnpack::NextPrime(k_block * 3 + 1)) + , test_func, arch_flags) + .loop_k(adj_k_block + k_block, k_block * 3, k_block)); + } + gemm_tests.push_back(GemmTestParams( + "k_div_" + kbs + "_subtile", + tester.clone() + , test_func, arch_flags) + .loop_k(adj_k_block + k_block, k_block * 5, k_block) + .loop_n(1, nr) + .loop_m(1, mr)); + } + gemm_tests.push_back(GemmTestParams( + "n_gt_" + nrs, + tester.clone() + .m(mr) + , test_func, arch_flags) + .loop_n(nr + 1, nr * 2 - 1, 4) + .loop_k(1, k_block * 3, k_block + 1)); + if (!is_igemm) { + gemm_tests.push_back(GemmTestParams( + "n_gt_" + nrs + "_strided_a", + tester.clone() + .m(mr) + .a_stride(xnnpack::NextPrime(k_block * 3 + 1)) + , test_func, arch_flags) + .loop_n(nr + 1, nr * 2 - 1, 4) + .loop_k(1, k_block * 3, k_block)); + } + gemm_tests.push_back(GemmTestParams( + "n_gt_" + nrs + "_subtile", + tester.clone() + , test_func, arch_flags) + .loop_n(nr + 1, nr * 2 - 1, 4) + .loop_k(1, k_block * 3, k_block + 1) + .loop_m(1, mr)); + gemm_tests.push_back(GemmTestParams( + "n_div_" + nrs, + tester.clone() + .m(mr) + , test_func, arch_flags) + .loop_n(nr * 2, nr * 3, nr) + .loop_k(1, k_block * 3, k_block + 1)); + if (!is_igemm) { + gemm_tests.push_back(GemmTestParams( + "n_div_" + nrs + "_strided_a", + tester.clone() + .m(mr) + .a_stride(xnnpack::NextPrime(k_block * 3 + 1)) + , test_func, arch_flags) + .loop_n(nr * 2, nr * 3, nr) + .loop_k(1, k_block * 3, k_block)); + } + gemm_tests.push_back(GemmTestParams( + "n_div_" + nrs + "_subtile", + tester.clone() + , test_func, arch_flags) + .loop_n(nr * 2, nr * 3, nr) + .loop_k(1, k_block * 3, k_block + 1) + .loop_m(1, mr)); + if (is_igemm) { + gemm_tests.push_back(GemmTestParams( + "small_kernel", + tester.clone() + .m(mr).n(nr).ks(3) + , test_func, arch_flags) + .loop_k(1, k_block * 3, k_block + 1)); + gemm_tests.push_back(GemmTestParams( + "small_kernel_subtile", + tester.clone() + .ks(3) + , test_func, arch_flags) + .loop_k(1, k_block * 3, k_block + 1) + .loop_n(1, nr) + .loop_m(1, mr)); + gemm_tests.push_back(GemmTestParams( + "n_gt_" + nrs + "_small_kernel", + tester.clone() + .m(mr).ks(3) + , test_func, arch_flags) + .loop_n(nr + 1, nr * 2 - 1, 4) + .loop_k(1, k_block * 3, k_block + 1)); + gemm_tests.push_back(GemmTestParams( + "n_div_" + nrs + "_small_kernel", + tester.clone() + .m(mr).ks(3) + , test_func, arch_flags) + .loop_n(nr * 2, nr * 3, nr) + .loop_k(1, k_block * 3, k_block + 1)); + } + gemm_tests.push_back(GemmTestParams( + "strided_cm_subtile", + tester.clone() + .mr(mr).nr(nr).kr(kr).sr(sr) + .cm_stride(xnnpack::NextPrime(nr + 1)) + , test_func, arch_flags) + .loop_k(1, k_block * 3, k_block + 1) + .loop_n(1, nr) + .loop_m(1, mr)); + if (is_igemm) { + gemm_tests.push_back(GemmTestParams( + "a_offset", + tester.clone() + .m(mr).n(nr).ks(3) + .a_offset(xnnpack::NextPrime(mr * k_block * 3 + 1)) + , test_func, arch_flags) + .loop_k(1, k_block * 3, k_block + 1)); + gemm_tests.push_back(GemmTestParams( + "zero", + tester.clone() + .m(mr).n(nr).ks(3) + .a_offset(xnnpack::NextPrime(mr * k_block * 3 + 1)) + , test_func, arch_flags) + .loop_k(1, k_block * 3, k_block + 1) + .loop_zi(0, mr - 1)); + } + gemm_tests.push_back(GemmTestParams( + "min", + tester.clone() + .m(mr).n(nr).k(k_block).min(0.0f) + , test_func, arch_flags)); + gemm_tests.push_back(GemmTestParams( + "max", + tester.clone() + .m(mr).n(nr).k(k_block).max(0.0f) + , test_func, arch_flags)); + gemm_tests.push_back(GemmTestParams( + "strided_cm", + tester.clone() + .m(mr).n(nr).k(k_block) + .cm_stride(xnnpack::NextPrime(nr + 1)) + , test_func, arch_flags)); + + return gemm_tests; + } +#endif // XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV + } // namespace @@ -840,4 +1089,178 @@ INSTANTIATE_TEST_SUITE_P( }); +#if XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV + INSTANTIATE_TEST_SUITE_P( + F32_PPMM_MINMAX_1X1V__RVV, GemmTest, + testing::ValuesIn(CreateTests2( + /*k_block=*/1, + /*adj_k_block=*/1, + /*mr=*/1, /*nr=*/1, /*kr=*/1, /*sr=*/1, + /*is_igemm=*/false, + /*unsigned_inputs=*/false, + /*planes=*/1, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_f32_ppmm_minmax_ukernel_1x1v__rvv, + xnn_init_f32_minmax_scalar_params, + xnn_pack_f32_gemm_goi_w); + }, + xnn_arch_riscv_vector)), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + F32_PPMM_MINMAX_1X2V__RVV, GemmTest, + testing::ValuesIn(CreateTests2( + /*k_block=*/1, + /*adj_k_block=*/1, + /*mr=*/1, /*nr=*/2, /*kr=*/1, /*sr=*/1, + /*is_igemm=*/false, + /*unsigned_inputs=*/false, + /*planes=*/1, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_f32_ppmm_minmax_ukernel_1x2v__rvv, + xnn_init_f32_minmax_scalar_params, + xnn_pack_f32_gemm_goi_w); + }, + xnn_arch_riscv_vector)), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + F32_PPMM_MINMAX_1X4V__RVV, GemmTest, + testing::ValuesIn(CreateTests2( + /*k_block=*/1, + /*adj_k_block=*/1, + /*mr=*/1, /*nr=*/4, /*kr=*/1, /*sr=*/1, + /*is_igemm=*/false, + /*unsigned_inputs=*/false, + /*planes=*/1, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_f32_ppmm_minmax_ukernel_1x4v__rvv, + xnn_init_f32_minmax_scalar_params, + xnn_pack_f32_gemm_goi_w); + }, + xnn_arch_riscv_vector)), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + F32_PPMM_MINMAX_2X1V__RVV, GemmTest, + testing::ValuesIn(CreateTests2( + /*k_block=*/1, + /*adj_k_block=*/1, + /*mr=*/2, /*nr=*/1, /*kr=*/1, /*sr=*/1, + /*is_igemm=*/false, + /*unsigned_inputs=*/false, + /*planes=*/1, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_f32_ppmm_minmax_ukernel_2x1v__rvv, + xnn_init_f32_minmax_scalar_params, + xnn_pack_f32_gemm_goi_w); + }, + xnn_arch_riscv_vector)), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + F32_PPMM_MINMAX_2X2V__RVV, GemmTest, + testing::ValuesIn(CreateTests2( + /*k_block=*/1, + /*adj_k_block=*/1, + /*mr=*/2, /*nr=*/2, /*kr=*/1, /*sr=*/1, + /*is_igemm=*/false, + /*unsigned_inputs=*/false, + /*planes=*/1, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_f32_ppmm_minmax_ukernel_2x2v__rvv, + xnn_init_f32_minmax_scalar_params, + xnn_pack_f32_gemm_goi_w); + }, + xnn_arch_riscv_vector)), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + F32_PPMM_MINMAX_2X4V__RVV, GemmTest, + testing::ValuesIn(CreateTests2( + /*k_block=*/1, + /*adj_k_block=*/1, + /*mr=*/2, /*nr=*/4, /*kr=*/1, /*sr=*/1, + /*is_igemm=*/false, + /*unsigned_inputs=*/false, + /*planes=*/1, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_f32_ppmm_minmax_ukernel_2x4v__rvv, + xnn_init_f32_minmax_scalar_params, + xnn_pack_f32_gemm_goi_w); + }, + xnn_arch_riscv_vector)), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + F32_PPMM_MINMAX_4X1V__RVV, GemmTest, + testing::ValuesIn(CreateTests2( + /*k_block=*/1, + /*adj_k_block=*/1, + /*mr=*/4, /*nr=*/1, /*kr=*/1, /*sr=*/1, + /*is_igemm=*/false, + /*unsigned_inputs=*/false, + /*planes=*/1, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_f32_ppmm_minmax_ukernel_4x1v__rvv, + xnn_init_f32_minmax_scalar_params, + xnn_pack_f32_gemm_goi_w); + }, + xnn_arch_riscv_vector)), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + F32_PPMM_MINMAX_4X2V__RVV, GemmTest, + testing::ValuesIn(CreateTests2( + /*k_block=*/1, + /*adj_k_block=*/1, + /*mr=*/4, /*nr=*/2, /*kr=*/1, /*sr=*/1, + /*is_igemm=*/false, + /*unsigned_inputs=*/false, + /*planes=*/1, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_f32_ppmm_minmax_ukernel_4x2v__rvv, + xnn_init_f32_minmax_scalar_params, + xnn_pack_f32_gemm_goi_w); + }, + xnn_arch_riscv_vector)), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); + + INSTANTIATE_TEST_SUITE_P( + F32_PPMM_MINMAX_4X4V__RVV, GemmTest, + testing::ValuesIn(CreateTests2( + /*k_block=*/1, + /*adj_k_block=*/1, + /*mr=*/4, /*nr=*/4, /*kr=*/1, /*sr=*/1, + /*is_igemm=*/false, + /*unsigned_inputs=*/false, + /*planes=*/1, + [](GemmMicrokernelTester& tester) { + tester.Test(xnn_f32_ppmm_minmax_ukernel_4x4v__rvv, + xnn_init_f32_minmax_scalar_params, + xnn_pack_f32_gemm_goi_w); + }, + xnn_arch_riscv_vector)), + [](const testing::TestParamInfo& info) { + return info.param.test_name; + }); +#endif // XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV + + } // namespace diff --git a/test/f32-ppmm-minmax.yaml b/test/f32-ppmm-minmax.yaml index 532f39d7f84..44f1424f793 100644 --- a/test/f32-ppmm-minmax.yaml +++ b/test/f32-ppmm-minmax.yaml @@ -120,3 +120,41 @@ init: xnn_init_f32_minmax_scalar_params pack: xnn_pack_f32_gemm_goi_w k-block: 1 + +# RISC-V +- name: xnn_f32_ppmm_minmax_ukernel_1x1v__rvv + init: xnn_init_f32_minmax_scalar_params + pack: xnn_pack_f32_gemm_goi_w + k-block: 1 +- name: xnn_f32_ppmm_minmax_ukernel_1x2v__rvv + init: xnn_init_f32_minmax_scalar_params + pack: xnn_pack_f32_gemm_goi_w + k-block: 1 +- name: xnn_f32_ppmm_minmax_ukernel_1x4v__rvv + init: xnn_init_f32_minmax_scalar_params + pack: xnn_pack_f32_gemm_goi_w + k-block: 1 +- name: xnn_f32_ppmm_minmax_ukernel_2x1v__rvv + init: xnn_init_f32_minmax_scalar_params + pack: xnn_pack_f32_gemm_goi_w + k-block: 1 +- name: xnn_f32_ppmm_minmax_ukernel_2x2v__rvv + init: xnn_init_f32_minmax_scalar_params + pack: xnn_pack_f32_gemm_goi_w + k-block: 1 +- name: xnn_f32_ppmm_minmax_ukernel_2x4v__rvv + init: xnn_init_f32_minmax_scalar_params + pack: xnn_pack_f32_gemm_goi_w + k-block: 1 +- name: xnn_f32_ppmm_minmax_ukernel_4x1v__rvv + init: xnn_init_f32_minmax_scalar_params + pack: xnn_pack_f32_gemm_goi_w + k-block: 1 +- name: xnn_f32_ppmm_minmax_ukernel_4x2v__rvv + init: xnn_init_f32_minmax_scalar_params + pack: xnn_pack_f32_gemm_goi_w + k-block: 1 +- name: xnn_f32_ppmm_minmax_ukernel_4x4v__rvv + init: xnn_init_f32_minmax_scalar_params + pack: xnn_pack_f32_gemm_goi_w + k-block: 1