From d3121ea9dde74fecd650a9eaa4e290fc411117b9 Mon Sep 17 00:00:00 2001
From: velonica0 <like@mail.nankai.edu.cn>
Date: Tue, 14 Apr 2026 09:51:44 +0800
Subject: [PATCH 1/3] [RVV] add rvv f32 kernels for velu, vgelu, vapproxgelu

Add RVV kernels for three elementwise activation ops:
- f32-velu (ELU)       - rr2_p6 polynomial approximation
- f32-vgelu (GELU)     - rational 12/10 approximation (div + nr variants)
- f32-vapproxgelu      - rational 12/10 approximation (div + nr variants)

Tested on SpacemiT K1 CPU and K3 CPU, both VLEN=256.
---
 cmake/gen/rvv_microkernels.cmake              |  24 +++-
 gen/rvv_microkernels.bzl                      |  20 ++++
 scripts/generate-f32-vapproxgelu.sh           |  10 ++
 scripts/generate-f32-velu.sh                  |   6 +
 scripts/generate-f32-vgelu.sh                 |  10 ++
 src/configs/unary-elementwise-config.c        |  43 ++++++-
 src/f32-vapproxgelu/f32-vapproxgelu.inc       |   7 ++
 ...2-vapproxgelu-rvv-rational-12-10-div-u1v.c |  96 ++++++++++++++++
 ...2-vapproxgelu-rvv-rational-12-10-div-u2v.c |  96 ++++++++++++++++
 ...2-vapproxgelu-rvv-rational-12-10-div-u4v.c |  96 ++++++++++++++++
 ...2-vapproxgelu-rvv-rational-12-10-div-u8v.c |  96 ++++++++++++++++
 ...32-vapproxgelu-rvv-rational-12-10-nr-u1v.c | 103 +++++++++++++++++
 ...32-vapproxgelu-rvv-rational-12-10-nr-u2v.c | 103 +++++++++++++++++
 ...32-vapproxgelu-rvv-rational-12-10-nr-u4v.c | 103 +++++++++++++++++
 ...32-vapproxgelu-rvv-rational-12-10-nr-u8v.c | 103 +++++++++++++++++
 src/f32-vapproxgelu/rvv-rational-12-10.c.in   | 107 ++++++++++++++++++
 src/f32-velu/f32-velu.inc                     |   7 ++
 src/f32-velu/gen/f32-velu-rvv-rr2-p6-u1v.c    | 105 +++++++++++++++++
 src/f32-velu/gen/f32-velu-rvv-rr2-p6-u2v.c    | 105 +++++++++++++++++
 src/f32-velu/gen/f32-velu-rvv-rr2-p6-u4v.c    | 105 +++++++++++++++++
 src/f32-velu/gen/f32-velu-rvv-rr2-p6-u8v.c    | 105 +++++++++++++++++
 src/f32-velu/rvv-rr2-p6.c.in                  | 101 +++++++++++++++++
 src/f32-vgelu/f32-vgelu.inc                   |   7 ++
 .../f32-vgelu-rvv-rational-12-10-div-u1v.c    |  95 ++++++++++++++++
 .../f32-vgelu-rvv-rational-12-10-div-u2v.c    |  95 ++++++++++++++++
 .../f32-vgelu-rvv-rational-12-10-div-u4v.c    |  95 ++++++++++++++++
 .../f32-vgelu-rvv-rational-12-10-div-u8v.c    |  95 ++++++++++++++++
 .../gen/f32-vgelu-rvv-rational-12-10-nr-u1v.c | 102 +++++++++++++++++
 .../gen/f32-vgelu-rvv-rational-12-10-nr-u2v.c | 102 +++++++++++++++++
 .../gen/f32-vgelu-rvv-rational-12-10-nr-u4v.c | 102 +++++++++++++++++
 .../gen/f32-vgelu-rvv-rational-12-10-nr-u8v.c | 102 +++++++++++++++++
 src/f32-vgelu/rvv-rational-12-10.c.in         | 105 +++++++++++++++++
 32 files changed, 2446 insertions(+), 5 deletions(-)
 create mode 100644 src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u1v.c
 create mode 100644 src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u2v.c
 create mode 100644 src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u4v.c
 create mode 100644 src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u8v.c
 create mode 100644 src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u1v.c
 create mode 100644 src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u2v.c
 create mode 100644 src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u4v.c
 create mode 100644 src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u8v.c
 create mode 100644 src/f32-vapproxgelu/rvv-rational-12-10.c.in
 create mode 100644 src/f32-velu/gen/f32-velu-rvv-rr2-p6-u1v.c
 create mode 100644 src/f32-velu/gen/f32-velu-rvv-rr2-p6-u2v.c
 create mode 100644 src/f32-velu/gen/f32-velu-rvv-rr2-p6-u4v.c
 create mode 100644 src/f32-velu/gen/f32-velu-rvv-rr2-p6-u8v.c
 create mode 100644 src/f32-velu/rvv-rr2-p6.c.in
 create mode 100644 src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u1v.c
 create mode 100644 src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u2v.c
 create mode 100644 src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u4v.c
 create mode 100644 src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u8v.c
 create mode 100644 src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u1v.c
 create mode 100644 src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u2v.c
 create mode 100644 src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u4v.c
 create mode 100644 src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u8v.c
 create mode 100644 src/f32-vgelu/rvv-rational-12-10.c.in

diff --git a/cmake/gen/rvv_microkernels.cmake b/cmake/gen/rvv_microkernels.cmake
index e290b67f0da..f33a3a3a92b 100644
--- a/cmake/gen/rvv_microkernels.cmake
+++ b/cmake/gen/rvv_microkernels.cmake
@@ -142,7 +142,10 @@ SET(PROD_RVV_MICROKERNEL_SRCS
   src/x32-transposec/gen/x32-transposec-4x4-rvv.c
   src/x32-transposec/gen/x32-transposec-8x8-rvv.c
   src/x32-transposec/gen/x32-transposec-16x8-rvv.c
-  src/x32-transposec/gen/x32-transposec-32x8-rvv.c)
+  src/x32-transposec/gen/x32-transposec-32x8-rvv.c
+  src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u4v.c
+  src/f32-velu/gen/f32-velu-rvv-rr2-p6-u4v.c
+  src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u4v.c)
 
 SET(NON_PROD_RVV_MICROKERNEL_SRCS
   src/f32-conv-hwc2chw/f32-conv-hwc2chw-3x3s2p1c3x2v-rvv-1x1.c
@@ -333,6 +336,23 @@ SET(NON_PROD_RVV_MICROKERNEL_SRCS
   src/x32-packw/gen/x32-packw-x4v-gemm-goi-rvv-u4.c
   src/x32-packw/gen/x32-packw-x8v-gemm-goi-rvv-u2.c
   src/x32-packw/gen/x32-packw-x8v-gemm-goi-rvv-u4.c
-  src/x32-packw/gen/x32-packw-x8v-gemm-goi-rvv-u8.c)
+  src/x32-packw/gen/x32-packw-x8v-gemm-goi-rvv-u8.c
+  src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u1v.c
+  src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u2v.c
+  src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u8v.c
+  src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u1v.c
+  src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u2v.c
+  src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u4v.c
+  src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u8v.c
+  src/f32-velu/gen/f32-velu-rvv-rr2-p6-u1v.c
+  src/f32-velu/gen/f32-velu-rvv-rr2-p6-u2v.c
+  src/f32-velu/gen/f32-velu-rvv-rr2-p6-u8v.c
+  src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u1v.c
+  src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u2v.c
+  src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u8v.c
+  src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u1v.c
+  src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u2v.c
+  src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u4v.c
+  src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u8v.c)
 
 SET(ALL_RVV_MICROKERNEL_SRCS ${PROD_RVV_MICROKERNEL_SRCS} + ${NON_PROD_RVV_MICROKERNEL_SRCS})
diff --git a/gen/rvv_microkernels.bzl b/gen/rvv_microkernels.bzl
index cea694c83c7..e86c895b66b 100644
--- a/gen/rvv_microkernels.bzl
+++ b/gen/rvv_microkernels.bzl
@@ -139,6 +139,9 @@ PROD_RVV_MICROKERNEL_SRCS = [
     "src/x32-transposec/gen/x32-transposec-8x8-rvv.c",
     "src/x32-transposec/gen/x32-transposec-16x8-rvv.c",
     "src/x32-transposec/gen/x32-transposec-32x8-rvv.c",
+    "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u4v.c",
+    "src/f32-velu/gen/f32-velu-rvv-rr2-p6-u4v.c",
+    "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u4v.c",
 ]
 
 NON_PROD_RVV_MICROKERNEL_SRCS = [
@@ -331,6 +334,23 @@ NON_PROD_RVV_MICROKERNEL_SRCS = [
     "src/x32-packw/gen/x32-packw-x8v-gemm-goi-rvv-u2.c",
     "src/x32-packw/gen/x32-packw-x8v-gemm-goi-rvv-u4.c",
     "src/x32-packw/gen/x32-packw-x8v-gemm-goi-rvv-u8.c",
+    "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u1v.c",
+    "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u2v.c",
+    "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u8v.c",
+    "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u1v.c",
+    "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u2v.c",
+    "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u4v.c",
+    "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u8v.c",
+    "src/f32-velu/gen/f32-velu-rvv-rr2-p6-u1v.c",
+    "src/f32-velu/gen/f32-velu-rvv-rr2-p6-u2v.c",
+    "src/f32-velu/gen/f32-velu-rvv-rr2-p6-u8v.c",
+    "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u1v.c",
+    "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u2v.c",
+    "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u8v.c",
+    "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u1v.c",
+    "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u2v.c",
+    "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u4v.c",
+    "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u8v.c",
 ]
 
 ALL_RVV_MICROKERNEL_SRCS = PROD_RVV_MICROKERNEL_SRCS + NON_PROD_RVV_MICROKERNEL_SRCS
diff --git a/scripts/generate-f32-vapproxgelu.sh b/scripts/generate-f32-vapproxgelu.sh
index 3c08f62388a..69ffec45d06 100755
--- a/scripts/generate-f32-vapproxgelu.sh
+++ b/scripts/generate-f32-vapproxgelu.sh
@@ -20,4 +20,14 @@ tools/xngen src/f32-vapproxgelu/rational-12-10.c.in -D ARCH=hvx -D BATCH_TILES=3
 
 tools/xngen src/f32-vapproxgelu/rational-12-10.c.in -D ARCH=avx512f -D BATCH_TILES=16,32,48,64 -D DIV=NR -o src/f32-vapproxgelu/gen/f32-vapproxgelu-avx512f-rational-12-10-nr.c &
 
+################################## RISC-V RVV #################################
+tools/xngen src/f32-vapproxgelu/rvv-rational-12-10.c.in -D LMUL=1 -D DIV=DIV -o src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u1v.c &
+tools/xngen src/f32-vapproxgelu/rvv-rational-12-10.c.in -D LMUL=2 -D DIV=DIV -o src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u2v.c &
+tools/xngen src/f32-vapproxgelu/rvv-rational-12-10.c.in -D LMUL=4 -D DIV=DIV -o src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u4v.c &
+tools/xngen src/f32-vapproxgelu/rvv-rational-12-10.c.in -D LMUL=8 -D DIV=DIV -o src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u8v.c &
+tools/xngen src/f32-vapproxgelu/rvv-rational-12-10.c.in -D LMUL=1 -D DIV=NR  -o src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u1v.c &
+tools/xngen src/f32-vapproxgelu/rvv-rational-12-10.c.in -D LMUL=2 -D DIV=NR  -o src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u2v.c &
+tools/xngen src/f32-vapproxgelu/rvv-rational-12-10.c.in -D LMUL=4 -D DIV=NR  -o src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u4v.c &
+tools/xngen src/f32-vapproxgelu/rvv-rational-12-10.c.in -D LMUL=8 -D DIV=NR  -o src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u8v.c &
+
 wait
diff --git a/scripts/generate-f32-velu.sh b/scripts/generate-f32-velu.sh
index 709add00d4e..4ac9ed34522 100755
--- a/scripts/generate-f32-velu.sh
+++ b/scripts/generate-f32-velu.sh
@@ -149,4 +149,10 @@ tools/xngen src/f32-velu/avx512f-rr1-p6.c.in -D BATCH_TILE=32  -o src/f32-velu/g
 tools/xngen src/f32-velu/avx512f-rr1-p6.c.in -D BATCH_TILE=48  -o src/f32-velu/gen/f32-velu-avx512f-rr1-p6-u48.c &
 tools/xngen src/f32-velu/avx512f-rr1-p6.c.in -D BATCH_TILE=64  -o src/f32-velu/gen/f32-velu-avx512f-rr1-p6-u64.c &
 
+################################## RISC-V RVV #################################
+tools/xngen src/f32-velu/rvv-rr2-p6.c.in -D LMUL=1 -o src/f32-velu/gen/f32-velu-rvv-rr2-p6-u1v.c &
+tools/xngen src/f32-velu/rvv-rr2-p6.c.in -D LMUL=2 -o src/f32-velu/gen/f32-velu-rvv-rr2-p6-u2v.c &
+tools/xngen src/f32-velu/rvv-rr2-p6.c.in -D LMUL=4 -o src/f32-velu/gen/f32-velu-rvv-rr2-p6-u4v.c &
+tools/xngen src/f32-velu/rvv-rr2-p6.c.in -D LMUL=8 -o src/f32-velu/gen/f32-velu-rvv-rr2-p6-u8v.c &
+
 wait
diff --git a/scripts/generate-f32-vgelu.sh b/scripts/generate-f32-vgelu.sh
index b4db27d1767..8929bba6b27 100755
--- a/scripts/generate-f32-vgelu.sh
+++ b/scripts/generate-f32-vgelu.sh
@@ -21,4 +21,14 @@ tools/xngen src/f32-vgelu/rational-12-10.c.in -D ARCH=hvx      -D BATCH_TILES=32
 tools/xngen src/f32-vgelu/rational-12-10.c.in -D ARCH=avx512f  -D BATCH_TILES=16,32,48,64 -D DIV=NR  -o src/f32-vgelu/gen/f32-vgelu-avx512f-rational-12-10-nr.c &
 tools/xngen src/f32-vgelu/rational-12-10.c.in -D ARCH=hvx      -D BATCH_TILES=32,64,128   -D DIV=NR  -o src/f32-vgelu/gen/f32-vgelu-hvx-rational-12-10-nr.c &
 
+################################## RISC-V RVV #################################
+tools/xngen src/f32-vgelu/rvv-rational-12-10.c.in -D LMUL=1 -D DIV=DIV -o src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u1v.c &
+tools/xngen src/f32-vgelu/rvv-rational-12-10.c.in -D LMUL=2 -D DIV=DIV -o src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u2v.c &
+tools/xngen src/f32-vgelu/rvv-rational-12-10.c.in -D LMUL=4 -D DIV=DIV -o src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u4v.c &
+tools/xngen src/f32-vgelu/rvv-rational-12-10.c.in -D LMUL=8 -D DIV=DIV -o src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u8v.c &
+tools/xngen src/f32-vgelu/rvv-rational-12-10.c.in -D LMUL=1 -D DIV=NR  -o src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u1v.c &
+tools/xngen src/f32-vgelu/rvv-rational-12-10.c.in -D LMUL=2 -D DIV=NR  -o src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u2v.c &
+tools/xngen src/f32-vgelu/rvv-rational-12-10.c.in -D LMUL=4 -D DIV=NR  -o src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u4v.c &
+tools/xngen src/f32-vgelu/rvv-rational-12-10.c.in -D LMUL=8 -D DIV=NR  -o src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u8v.c &
+
 wait
diff --git a/src/configs/unary-elementwise-config.c b/src/configs/unary-elementwise-config.c
index 11062b7d6d7..2c84009cf34 100644
--- a/src/configs/unary-elementwise-config.c
+++ b/src/configs/unary-elementwise-config.c
@@ -1034,6 +1034,19 @@ static void init_f32_approxgelu_config_impl(struct xnn_unary_elementwise_config*
       config->ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_vapproxgelu_ukernel__hvx_rational_12_10_div_u128);
       config->element_tile = 128;
     }
+  #elif XNN_ARCH_RISCV
+    #if XNN_ENABLE_RISCV_VECTOR
+      const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
+      assert(hardware_config != NULL);
+      if (hardware_config->arch_flags & xnn_arch_riscv_vector) {
+        config->ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_vapproxgelu_ukernel__rvv_rational_12_10_div_u4v);
+        config->element_tile = 4 * hardware_config->vlenb / sizeof(float);
+      } else
+    #endif
+    {
+      config->ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_vapproxgelu_ukernel__scalar_rational_12_10_div_u1);
+      config->element_tile = 1;
+    }
   #else
     config->ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_vapproxgelu_ukernel__scalar_rational_12_10_div_u1);
     config->element_tile = 1;
@@ -1292,9 +1305,20 @@ static void init_f32_elu_config(void) {
       }
     #endif
   #elif XNN_ARCH_RISCV
-    f32_elu_config.ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_u4);
-    f32_elu_config.element_tile = 4;
-    f32_elu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_elu_scalar_params;
+    #if XNN_ENABLE_RISCV_VECTOR
+      const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
+      assert(hardware_config != NULL);
+      if (hardware_config->arch_flags & xnn_arch_riscv_vector) {
+        f32_elu_config.ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_velu_ukernel__rvv_rr2_p6_u4v);
+        f32_elu_config.element_tile = 4 * hardware_config->vlenb / sizeof(float);
+        f32_elu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_elu_scalar_params;
+      } else
+    #endif
+    {
+      f32_elu_config.ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_u4);
+      f32_elu_config.element_tile = 4;
+      f32_elu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_elu_scalar_params;
+    }
   #else
     f32_elu_config.ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_u4);
     f32_elu_config.element_tile = 4;
@@ -1367,6 +1391,19 @@ static void init_f32_gelu_config_impl(struct xnn_unary_elementwise_config* confi
       config->ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_vgelu_ukernel__hvx_rational_12_10_div_u128);
       config->element_tile = 128;
     }
+  #elif XNN_ARCH_RISCV
+    #if XNN_ENABLE_RISCV_VECTOR
+      const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
+      assert(hardware_config != NULL);
+      if (hardware_config->arch_flags & xnn_arch_riscv_vector) {
+        config->ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_vgelu_ukernel__rvv_rational_12_10_div_u4v);
+        config->element_tile = 4 * hardware_config->vlenb / sizeof(float);
+      } else
+    #endif
+    {
+      config->ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_vgelu_ukernel__scalar_rational_12_10_div_u1);
+      config->element_tile = 1;
+    }
   #else
     config->ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_vgelu_ukernel__scalar_rational_12_10_div_u1);
     config->element_tile = 1;
diff --git a/src/f32-vapproxgelu/f32-vapproxgelu.inc b/src/f32-vapproxgelu/f32-vapproxgelu.inc
index 5bae52e23f8..68c2658a354 100644
--- a/src/f32-vapproxgelu/f32-vapproxgelu.inc
+++ b/src/f32-vapproxgelu/f32-vapproxgelu.inc
@@ -65,3 +65,10 @@ XNN_UKERNEL(xnn_arch_none, xnn_f32_vapproxgelu_ukernel__wasmsimd_rational_12_10_
 XNN_UKERNEL(xnn_arch_none, xnn_f32_vapproxgelu_ukernel__wasmsimd_rational_12_10_div_u12, 12, false, float, struct xnn_f32_default_params, NULL)
 XNN_UKERNEL(xnn_arch_none, xnn_f32_vapproxgelu_ukernel__wasmsimd_rational_12_10_div_u16, 16, false, float, struct xnn_f32_default_params, NULL)
 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+
+#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
+XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_vapproxgelu_ukernel__rvv_rational_12_10_div_u1v, 1, true, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_vapproxgelu_ukernel__rvv_rational_12_10_div_u2v, 2, true, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_vapproxgelu_ukernel__rvv_rational_12_10_div_u4v, 4, true, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_vapproxgelu_ukernel__rvv_rational_12_10_div_u8v, 8, true, float, struct xnn_f32_default_params, NULL)
+#endif  // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
diff --git a/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u1v.c b/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u1v.c
new file mode 100644
index 00000000000..d78fb789043
--- /dev/null
+++ b/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u1v.c
@@ -0,0 +1,96 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vapproxgelu/rvv-rational-12-10.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2024 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/vunary.h"
+
+
+void xnn_f32_vapproxgelu_ukernel__rvv_rational_12_10_div_u1v(
+    size_t batch,
+    const float* input,
+    float* output,
+    const struct xnn_f32_default_params* unused_params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(input != NULL);
+  assert(output != NULL);
+
+  // Cap the inputs to this value as `erf(x/sqrt(2))` will always be `+/-1.0f`
+  // beyond this point. This value is chosen as the first floating point
+  // number as of which the interpolation returns +/-1.0f.
+  const float vmax_x = 4.84974098e+00f;
+  const float vmin_x = -4.84974098e+00f;
+
+  // The monomial coefficients of the numerator polynomial (odd).
+  const float valpha_1 = 7.9788458347e-01f;
+  const float valpha_3 = 6.0803253204e-02f;
+  const float valpha_5 = 7.2898347862e-03f;
+  const float valpha_7 = 2.6887017884e-04f;
+  const float valpha_9 = 1.4302649106e-05f;
+  const float valpha_11 = 4.9544411240e-08f;
+
+  // The monomial coefficients of the denominator polynomial (even).
+  const float vbeta_2 = 2.4369759858e-01f;
+  const float vbeta_4 = 2.4381054565e-02f;
+  const float vbeta_6 = 1.3060354395e-03f;
+  const float vbeta_8 = 7.6477612311e-05f;
+  const float vbeta_10 = 1.3433452750e-06f;
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t n = __riscv_vsetvl_e32m1(batch);
+
+    vfloat32m1_t vx_orig = __riscv_vle32_v_f32m1(input, n);
+    input += n;
+
+    // Clamp the inputs to the interpolation range.
+    vfloat32m1_t vx = __riscv_vfmin_vf_f32m1(vx_orig, vmax_x, n);
+    vx = __riscv_vfmax_vf_f32m1(vx, vmin_x, n);
+
+    // Since the polynomials are odd/even, we need x^2.
+    vfloat32m1_t vx2 = __riscv_vfmul_vv_f32m1(vx, vx, n);
+
+    // Evaluate the numerator polynomial p.
+    vfloat32m1_t vp = __riscv_vfmv_v_f_f32m1(valpha_9, n);
+    vp = __riscv_vfmacc_vf_f32m1(vp, valpha_11, vx2, n);
+    vp = __riscv_vfmadd_vv_f32m1(vp, vx2, __riscv_vfmv_v_f_f32m1(valpha_7, n), n);
+    vp = __riscv_vfmadd_vv_f32m1(vp, vx2, __riscv_vfmv_v_f_f32m1(valpha_5, n), n);
+    vp = __riscv_vfmadd_vv_f32m1(vp, vx2, __riscv_vfmv_v_f_f32m1(valpha_3, n), n);
+    vp = __riscv_vfmadd_vv_f32m1(vp, vx2, __riscv_vfmv_v_f_f32m1(valpha_1, n), n);
+    vp = __riscv_vfmul_vv_f32m1(vp, vx, n);
+
+    // Evaluate the denominator polynomial q.
+    vfloat32m1_t vq = __riscv_vfmv_v_f_f32m1(vbeta_8, n);
+    vq = __riscv_vfmacc_vf_f32m1(vq, vbeta_10, vx2, n);
+    vq = __riscv_vfmadd_vv_f32m1(vq, vx2, __riscv_vfmv_v_f_f32m1(vbeta_6, n), n);
+    vq = __riscv_vfmadd_vv_f32m1(vq, vx2, __riscv_vfmv_v_f_f32m1(vbeta_4, n), n);
+    vq = __riscv_vfmadd_vv_f32m1(vq, vx2, __riscv_vfmv_v_f_f32m1(vbeta_2, n), n);
+    vq = __riscv_vfmadd_vv_f32m1(vq, vx2, __riscv_vfmv_v_f_f32m1(1.0f, n), n);
+
+    // Divide the numerator by the denominator.
+    vfloat32m1_t verf = __riscv_vfdiv_vv_f32m1(vp, vq, n);
+
+    // Add one to the rational interpolant, and multiply by 0.5 times the
+    // original input.
+    vfloat32m1_t vy = __riscv_vfadd_vf_f32m1(verf, 1.0f, n);
+    vy = __riscv_vfmul_vf_f32m1(vy, 0.5f, n);
+    vy = __riscv_vfmul_vv_f32m1(vy, vx_orig, n);
+
+    __riscv_vse32_v_f32m1(output, vy, n);
+    output += n;
+
+    batch -= n;
+  } while (batch != 0);
+}
diff --git a/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u2v.c b/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u2v.c
new file mode 100644
index 00000000000..f93fe59994c
--- /dev/null
+++ b/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u2v.c
@@ -0,0 +1,96 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vapproxgelu/rvv-rational-12-10.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2024 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/vunary.h"
+
+
+void xnn_f32_vapproxgelu_ukernel__rvv_rational_12_10_div_u2v(
+    size_t batch,
+    const float* input,
+    float* output,
+    const struct xnn_f32_default_params* unused_params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(input != NULL);
+  assert(output != NULL);
+
+  // Cap the inputs to this value as `erf(x/sqrt(2))` will always be `+/-1.0f`
+  // beyond this point. This value is chosen as the first floating point
+  // number as of which the interpolation returns +/-1.0f.
+  const float vmax_x = 4.84974098e+00f;
+  const float vmin_x = -4.84974098e+00f;
+
+  // The monomial coefficients of the numerator polynomial (odd).
+  const float valpha_1 = 7.9788458347e-01f;
+  const float valpha_3 = 6.0803253204e-02f;
+  const float valpha_5 = 7.2898347862e-03f;
+  const float valpha_7 = 2.6887017884e-04f;
+  const float valpha_9 = 1.4302649106e-05f;
+  const float valpha_11 = 4.9544411240e-08f;
+
+  // The monomial coefficients of the denominator polynomial (even).
+  const float vbeta_2 = 2.4369759858e-01f;
+  const float vbeta_4 = 2.4381054565e-02f;
+  const float vbeta_6 = 1.3060354395e-03f;
+  const float vbeta_8 = 7.6477612311e-05f;
+  const float vbeta_10 = 1.3433452750e-06f;
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t n = __riscv_vsetvl_e32m2(batch);
+
+    vfloat32m2_t vx_orig = __riscv_vle32_v_f32m2(input, n);
+    input += n;
+
+    // Clamp the inputs to the interpolation range.
+    vfloat32m2_t vx = __riscv_vfmin_vf_f32m2(vx_orig, vmax_x, n);
+    vx = __riscv_vfmax_vf_f32m2(vx, vmin_x, n);
+
+    // Since the polynomials are odd/even, we need x^2.
+    vfloat32m2_t vx2 = __riscv_vfmul_vv_f32m2(vx, vx, n);
+
+    // Evaluate the numerator polynomial p.
+    vfloat32m2_t vp = __riscv_vfmv_v_f_f32m2(valpha_9, n);
+    vp = __riscv_vfmacc_vf_f32m2(vp, valpha_11, vx2, n);
+    vp = __riscv_vfmadd_vv_f32m2(vp, vx2, __riscv_vfmv_v_f_f32m2(valpha_7, n), n);
+    vp = __riscv_vfmadd_vv_f32m2(vp, vx2, __riscv_vfmv_v_f_f32m2(valpha_5, n), n);
+    vp = __riscv_vfmadd_vv_f32m2(vp, vx2, __riscv_vfmv_v_f_f32m2(valpha_3, n), n);
+    vp = __riscv_vfmadd_vv_f32m2(vp, vx2, __riscv_vfmv_v_f_f32m2(valpha_1, n), n);
+    vp = __riscv_vfmul_vv_f32m2(vp, vx, n);
+
+    // Evaluate the denominator polynomial q.
+    vfloat32m2_t vq = __riscv_vfmv_v_f_f32m2(vbeta_8, n);
+    vq = __riscv_vfmacc_vf_f32m2(vq, vbeta_10, vx2, n);
+    vq = __riscv_vfmadd_vv_f32m2(vq, vx2, __riscv_vfmv_v_f_f32m2(vbeta_6, n), n);
+    vq = __riscv_vfmadd_vv_f32m2(vq, vx2, __riscv_vfmv_v_f_f32m2(vbeta_4, n), n);
+    vq = __riscv_vfmadd_vv_f32m2(vq, vx2, __riscv_vfmv_v_f_f32m2(vbeta_2, n), n);
+    vq = __riscv_vfmadd_vv_f32m2(vq, vx2, __riscv_vfmv_v_f_f32m2(1.0f, n), n);
+
+    // Divide the numerator by the denominator.
+    vfloat32m2_t verf = __riscv_vfdiv_vv_f32m2(vp, vq, n);
+
+    // Add one to the rational interpolant, and multiply by 0.5 times the
+    // original input.
+    vfloat32m2_t vy = __riscv_vfadd_vf_f32m2(verf, 1.0f, n);
+    vy = __riscv_vfmul_vf_f32m2(vy, 0.5f, n);
+    vy = __riscv_vfmul_vv_f32m2(vy, vx_orig, n);
+
+    __riscv_vse32_v_f32m2(output, vy, n);
+    output += n;
+
+    batch -= n;
+  } while (batch != 0);
+}
diff --git a/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u4v.c b/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u4v.c
new file mode 100644
index 00000000000..175aa184519
--- /dev/null
+++ b/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u4v.c
@@ -0,0 +1,96 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vapproxgelu/rvv-rational-12-10.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2024 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/vunary.h"
+
+
+void xnn_f32_vapproxgelu_ukernel__rvv_rational_12_10_div_u4v(
+    size_t batch,
+    const float* input,
+    float* output,
+    const struct xnn_f32_default_params* unused_params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(input != NULL);
+  assert(output != NULL);
+
+  // Cap the inputs to this value as `erf(x/sqrt(2))` will always be `+/-1.0f`
+  // beyond this point. This value is chosen as the first floating point
+  // number as of which the interpolation returns +/-1.0f.
+  const float vmax_x = 4.84974098e+00f;
+  const float vmin_x = -4.84974098e+00f;
+
+  // The monomial coefficients of the numerator polynomial (odd).
+  const float valpha_1 = 7.9788458347e-01f;
+  const float valpha_3 = 6.0803253204e-02f;
+  const float valpha_5 = 7.2898347862e-03f;
+  const float valpha_7 = 2.6887017884e-04f;
+  const float valpha_9 = 1.4302649106e-05f;
+  const float valpha_11 = 4.9544411240e-08f;
+
+  // The monomial coefficients of the denominator polynomial (even).
+  const float vbeta_2 = 2.4369759858e-01f;
+  const float vbeta_4 = 2.4381054565e-02f;
+  const float vbeta_6 = 1.3060354395e-03f;
+  const float vbeta_8 = 7.6477612311e-05f;
+  const float vbeta_10 = 1.3433452750e-06f;
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t n = __riscv_vsetvl_e32m4(batch);
+
+    vfloat32m4_t vx_orig = __riscv_vle32_v_f32m4(input, n);
+    input += n;
+
+    // Clamp the inputs to the interpolation range.
+    vfloat32m4_t vx = __riscv_vfmin_vf_f32m4(vx_orig, vmax_x, n);
+    vx = __riscv_vfmax_vf_f32m4(vx, vmin_x, n);
+
+    // Since the polynomials are odd/even, we need x^2.
+    vfloat32m4_t vx2 = __riscv_vfmul_vv_f32m4(vx, vx, n);
+
+    // Evaluate the numerator polynomial p.
+    vfloat32m4_t vp = __riscv_vfmv_v_f_f32m4(valpha_9, n);
+    vp = __riscv_vfmacc_vf_f32m4(vp, valpha_11, vx2, n);
+    vp = __riscv_vfmadd_vv_f32m4(vp, vx2, __riscv_vfmv_v_f_f32m4(valpha_7, n), n);
+    vp = __riscv_vfmadd_vv_f32m4(vp, vx2, __riscv_vfmv_v_f_f32m4(valpha_5, n), n);
+    vp = __riscv_vfmadd_vv_f32m4(vp, vx2, __riscv_vfmv_v_f_f32m4(valpha_3, n), n);
+    vp = __riscv_vfmadd_vv_f32m4(vp, vx2, __riscv_vfmv_v_f_f32m4(valpha_1, n), n);
+    vp = __riscv_vfmul_vv_f32m4(vp, vx, n);
+
+    // Evaluate the denominator polynomial q.
+    vfloat32m4_t vq = __riscv_vfmv_v_f_f32m4(vbeta_8, n);
+    vq = __riscv_vfmacc_vf_f32m4(vq, vbeta_10, vx2, n);
+    vq = __riscv_vfmadd_vv_f32m4(vq, vx2, __riscv_vfmv_v_f_f32m4(vbeta_6, n), n);
+    vq = __riscv_vfmadd_vv_f32m4(vq, vx2, __riscv_vfmv_v_f_f32m4(vbeta_4, n), n);
+    vq = __riscv_vfmadd_vv_f32m4(vq, vx2, __riscv_vfmv_v_f_f32m4(vbeta_2, n), n);
+    vq = __riscv_vfmadd_vv_f32m4(vq, vx2, __riscv_vfmv_v_f_f32m4(1.0f, n), n);
+
+    // Divide the numerator by the denominator.
+    vfloat32m4_t verf = __riscv_vfdiv_vv_f32m4(vp, vq, n);
+
+    // Add one to the rational interpolant, and multiply by 0.5 times the
+    // original input.
+    vfloat32m4_t vy = __riscv_vfadd_vf_f32m4(verf, 1.0f, n);
+    vy = __riscv_vfmul_vf_f32m4(vy, 0.5f, n);
+    vy = __riscv_vfmul_vv_f32m4(vy, vx_orig, n);
+
+    __riscv_vse32_v_f32m4(output, vy, n);
+    output += n;
+
+    batch -= n;
+  } while (batch != 0);
+}
diff --git a/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u8v.c b/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u8v.c
new file mode 100644
index 00000000000..bc3d665e050
--- /dev/null
+++ b/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u8v.c
@@ -0,0 +1,96 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vapproxgelu/rvv-rational-12-10.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2024 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/vunary.h"
+
+
+void xnn_f32_vapproxgelu_ukernel__rvv_rational_12_10_div_u8v(
+    size_t batch,
+    const float* input,
+    float* output,
+    const struct xnn_f32_default_params* unused_params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(input != NULL);
+  assert(output != NULL);
+
+  // Cap the inputs to this value as `erf(x/sqrt(2))` will always be `+/-1.0f`
+  // beyond this point. This value is chosen as the first floating point
+  // number as of which the interpolation returns +/-1.0f.
+  const float vmax_x = 4.84974098e+00f;
+  const float vmin_x = -4.84974098e+00f;
+
+  // The monomial coefficients of the numerator polynomial (odd).
+  const float valpha_1 = 7.9788458347e-01f;
+  const float valpha_3 = 6.0803253204e-02f;
+  const float valpha_5 = 7.2898347862e-03f;
+  const float valpha_7 = 2.6887017884e-04f;
+  const float valpha_9 = 1.4302649106e-05f;
+  const float valpha_11 = 4.9544411240e-08f;
+
+  // The monomial coefficients of the denominator polynomial (even).
+  const float vbeta_2 = 2.4369759858e-01f;
+  const float vbeta_4 = 2.4381054565e-02f;
+  const float vbeta_6 = 1.3060354395e-03f;
+  const float vbeta_8 = 7.6477612311e-05f;
+  const float vbeta_10 = 1.3433452750e-06f;
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t n = __riscv_vsetvl_e32m8(batch);
+
+    vfloat32m8_t vx_orig = __riscv_vle32_v_f32m8(input, n);
+    input += n;
+
+    // Clamp the inputs to the interpolation range.
+    vfloat32m8_t vx = __riscv_vfmin_vf_f32m8(vx_orig, vmax_x, n);
+    vx = __riscv_vfmax_vf_f32m8(vx, vmin_x, n);
+
+    // Since the polynomials are odd/even, we need x^2.
+    vfloat32m8_t vx2 = __riscv_vfmul_vv_f32m8(vx, vx, n);
+
+    // Evaluate the numerator polynomial p.
+    vfloat32m8_t vp = __riscv_vfmv_v_f_f32m8(valpha_9, n);
+    vp = __riscv_vfmacc_vf_f32m8(vp, valpha_11, vx2, n);
+    vp = __riscv_vfmadd_vv_f32m8(vp, vx2, __riscv_vfmv_v_f_f32m8(valpha_7, n), n);
+    vp = __riscv_vfmadd_vv_f32m8(vp, vx2, __riscv_vfmv_v_f_f32m8(valpha_5, n), n);
+    vp = __riscv_vfmadd_vv_f32m8(vp, vx2, __riscv_vfmv_v_f_f32m8(valpha_3, n), n);
+    vp = __riscv_vfmadd_vv_f32m8(vp, vx2, __riscv_vfmv_v_f_f32m8(valpha_1, n), n);
+    vp = __riscv_vfmul_vv_f32m8(vp, vx, n);
+
+    // Evaluate the denominator polynomial q.
+    vfloat32m8_t vq = __riscv_vfmv_v_f_f32m8(vbeta_8, n);
+    vq = __riscv_vfmacc_vf_f32m8(vq, vbeta_10, vx2, n);
+    vq = __riscv_vfmadd_vv_f32m8(vq, vx2, __riscv_vfmv_v_f_f32m8(vbeta_6, n), n);
+    vq = __riscv_vfmadd_vv_f32m8(vq, vx2, __riscv_vfmv_v_f_f32m8(vbeta_4, n), n);
+    vq = __riscv_vfmadd_vv_f32m8(vq, vx2, __riscv_vfmv_v_f_f32m8(vbeta_2, n), n);
+    vq = __riscv_vfmadd_vv_f32m8(vq, vx2, __riscv_vfmv_v_f_f32m8(1.0f, n), n);
+
+    // Divide the numerator by the denominator.
+    vfloat32m8_t verf = __riscv_vfdiv_vv_f32m8(vp, vq, n);
+
+    // Add one to the rational interpolant, and multiply by 0.5 times the
+    // original input.
+    vfloat32m8_t vy = __riscv_vfadd_vf_f32m8(verf, 1.0f, n);
+    vy = __riscv_vfmul_vf_f32m8(vy, 0.5f, n);
+    vy = __riscv_vfmul_vv_f32m8(vy, vx_orig, n);
+
+    __riscv_vse32_v_f32m8(output, vy, n);
+    output += n;
+
+    batch -= n;
+  } while (batch != 0);
+}
diff --git a/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u1v.c b/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u1v.c
new file mode 100644
index 00000000000..9fce14f9c13
--- /dev/null
+++ b/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u1v.c
@@ -0,0 +1,103 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vapproxgelu/rvv-rational-12-10.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2024 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/vunary.h"
+
+
+void xnn_f32_vapproxgelu_ukernel__rvv_rational_12_10_nr_u1v(
+    size_t batch,
+    const float* input,
+    float* output,
+    const struct xnn_f32_default_params* unused_params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(input != NULL);
+  assert(output != NULL);
+
+  // Cap the inputs to this value as `erf(x/sqrt(2))` will always be `+/-1.0f`
+  // beyond this point. This value is chosen as the first floating point
+  // number as of which the interpolation returns +/-1.0f.
+  const float vmax_x = 4.79519796e+00f;
+  const float vmin_x = -4.84563780e+00f;
+
+  // The monomial coefficients of the numerator polynomial (odd).
+  const float valpha_1 = 7.9788458347e-01f;
+  const float valpha_3 = 6.0803253204e-02f;
+  const float valpha_5 = 7.2898347862e-03f;
+  const float valpha_7 = 2.6887017884e-04f;
+  const float valpha_9 = 1.4302649106e-05f;
+  const float valpha_11 = 4.9544411240e-08f;
+
+  // The monomial coefficients of the denominator polynomial (even).
+  const float vbeta_2 = 2.4369759858e-01f;
+  const float vbeta_4 = 2.4381054565e-02f;
+  const float vbeta_6 = 1.3060354395e-03f;
+  const float vbeta_8 = 7.6477612311e-05f;
+  const float vbeta_10 = 1.3433452750e-06f;
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t n = __riscv_vsetvl_e32m1(batch);
+
+    vfloat32m1_t vx_orig = __riscv_vle32_v_f32m1(input, n);
+    input += n;
+
+    // Clamp the inputs to the interpolation range.
+    vfloat32m1_t vx = __riscv_vfmin_vf_f32m1(vx_orig, vmax_x, n);
+    vx = __riscv_vfmax_vf_f32m1(vx, vmin_x, n);
+
+    // Since the polynomials are odd/even, we need x^2.
+    vfloat32m1_t vx2 = __riscv_vfmul_vv_f32m1(vx, vx, n);
+
+    // Evaluate the numerator polynomial p.
+    vfloat32m1_t vp = __riscv_vfmv_v_f_f32m1(valpha_9, n);
+    vp = __riscv_vfmacc_vf_f32m1(vp, valpha_11, vx2, n);
+    vp = __riscv_vfmadd_vv_f32m1(vp, vx2, __riscv_vfmv_v_f_f32m1(valpha_7, n), n);
+    vp = __riscv_vfmadd_vv_f32m1(vp, vx2, __riscv_vfmv_v_f_f32m1(valpha_5, n), n);
+    vp = __riscv_vfmadd_vv_f32m1(vp, vx2, __riscv_vfmv_v_f_f32m1(valpha_3, n), n);
+    vp = __riscv_vfmadd_vv_f32m1(vp, vx2, __riscv_vfmv_v_f_f32m1(valpha_1, n), n);
+    vp = __riscv_vfmul_vv_f32m1(vp, vx, n);
+
+    // Evaluate the denominator polynomial q.
+    vfloat32m1_t vq = __riscv_vfmv_v_f_f32m1(vbeta_8, n);
+    vq = __riscv_vfmacc_vf_f32m1(vq, vbeta_10, vx2, n);
+    vq = __riscv_vfmadd_vv_f32m1(vq, vx2, __riscv_vfmv_v_f_f32m1(vbeta_6, n), n);
+    vq = __riscv_vfmadd_vv_f32m1(vq, vx2, __riscv_vfmv_v_f_f32m1(vbeta_4, n), n);
+    vq = __riscv_vfmadd_vv_f32m1(vq, vx2, __riscv_vfmv_v_f_f32m1(vbeta_2, n), n);
+    vq = __riscv_vfmadd_vv_f32m1(vq, vx2, __riscv_vfmv_v_f_f32m1(1.0f, n), n);
+
+    // Divide the numerator by the denominator.
+    // Newton-Raphson iteration for reciprocal.
+    vfloat32m1_t vrq = __riscv_vfrec7_v_f32m1(vq, n);
+    for (size_t iter = 0; iter < 2; iter++) {
+      vfloat32m1_t verr = __riscv_vfnmsac_vv_f32m1(
+          __riscv_vfmv_v_f_f32m1(2.0f, n), vrq, vq, n);
+      vrq = __riscv_vfmul_vv_f32m1(vrq, verr, n);
+    }
+    vfloat32m1_t verf = __riscv_vfmul_vv_f32m1(vp, vrq, n);
+
+    // Add one to the rational interpolant, and multiply by 0.5 times the
+    // original input.
+    vfloat32m1_t vy = __riscv_vfadd_vf_f32m1(verf, 1.0f, n);
+    vy = __riscv_vfmul_vf_f32m1(vy, 0.5f, n);
+    vy = __riscv_vfmul_vv_f32m1(vy, vx_orig, n);
+
+    __riscv_vse32_v_f32m1(output, vy, n);
+    output += n;
+
+    batch -= n;
+  } while (batch != 0);
+}
diff --git a/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u2v.c b/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u2v.c
new file mode 100644
index 00000000000..d281765f915
--- /dev/null
+++ b/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u2v.c
@@ -0,0 +1,103 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vapproxgelu/rvv-rational-12-10.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2024 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/vunary.h"
+
+
+void xnn_f32_vapproxgelu_ukernel__rvv_rational_12_10_nr_u2v(
+    size_t batch,
+    const float* input,
+    float* output,
+    const struct xnn_f32_default_params* unused_params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(input != NULL);
+  assert(output != NULL);
+
+  // Cap the inputs to this value as `erf(x/sqrt(2))` will always be `+/-1.0f`
+  // beyond this point. This value is chosen as the first floating point
+  // number as of which the interpolation returns +/-1.0f.
+  const float vmax_x = 4.79519796e+00f;
+  const float vmin_x = -4.84563780e+00f;
+
+  // The monomial coefficients of the numerator polynomial (odd).
+  const float valpha_1 = 7.9788458347e-01f;
+  const float valpha_3 = 6.0803253204e-02f;
+  const float valpha_5 = 7.2898347862e-03f;
+  const float valpha_7 = 2.6887017884e-04f;
+  const float valpha_9 = 1.4302649106e-05f;
+  const float valpha_11 = 4.9544411240e-08f;
+
+  // The monomial coefficients of the denominator polynomial (even).
+  const float vbeta_2 = 2.4369759858e-01f;
+  const float vbeta_4 = 2.4381054565e-02f;
+  const float vbeta_6 = 1.3060354395e-03f;
+  const float vbeta_8 = 7.6477612311e-05f;
+  const float vbeta_10 = 1.3433452750e-06f;
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t n = __riscv_vsetvl_e32m2(batch);
+
+    vfloat32m2_t vx_orig = __riscv_vle32_v_f32m2(input, n);
+    input += n;
+
+    // Clamp the inputs to the interpolation range.
+    vfloat32m2_t vx = __riscv_vfmin_vf_f32m2(vx_orig, vmax_x, n);
+    vx = __riscv_vfmax_vf_f32m2(vx, vmin_x, n);
+
+    // Since the polynomials are odd/even, we need x^2.
+    vfloat32m2_t vx2 = __riscv_vfmul_vv_f32m2(vx, vx, n);
+
+    // Evaluate the numerator polynomial p.
+    vfloat32m2_t vp = __riscv_vfmv_v_f_f32m2(valpha_9, n);
+    vp = __riscv_vfmacc_vf_f32m2(vp, valpha_11, vx2, n);
+    vp = __riscv_vfmadd_vv_f32m2(vp, vx2, __riscv_vfmv_v_f_f32m2(valpha_7, n), n);
+    vp = __riscv_vfmadd_vv_f32m2(vp, vx2, __riscv_vfmv_v_f_f32m2(valpha_5, n), n);
+    vp = __riscv_vfmadd_vv_f32m2(vp, vx2, __riscv_vfmv_v_f_f32m2(valpha_3, n), n);
+    vp = __riscv_vfmadd_vv_f32m2(vp, vx2, __riscv_vfmv_v_f_f32m2(valpha_1, n), n);
+    vp = __riscv_vfmul_vv_f32m2(vp, vx, n);
+
+    // Evaluate the denominator polynomial q.
+    vfloat32m2_t vq = __riscv_vfmv_v_f_f32m2(vbeta_8, n);
+    vq = __riscv_vfmacc_vf_f32m2(vq, vbeta_10, vx2, n);
+    vq = __riscv_vfmadd_vv_f32m2(vq, vx2, __riscv_vfmv_v_f_f32m2(vbeta_6, n), n);
+    vq = __riscv_vfmadd_vv_f32m2(vq, vx2, __riscv_vfmv_v_f_f32m2(vbeta_4, n), n);
+    vq = __riscv_vfmadd_vv_f32m2(vq, vx2, __riscv_vfmv_v_f_f32m2(vbeta_2, n), n);
+    vq = __riscv_vfmadd_vv_f32m2(vq, vx2, __riscv_vfmv_v_f_f32m2(1.0f, n), n);
+
+    // Divide the numerator by the denominator.
+    // Newton-Raphson iteration for reciprocal.
+    vfloat32m2_t vrq = __riscv_vfrec7_v_f32m2(vq, n);
+    for (size_t iter = 0; iter < 2; iter++) {
+      vfloat32m2_t verr = __riscv_vfnmsac_vv_f32m2(
+          __riscv_vfmv_v_f_f32m2(2.0f, n), vrq, vq, n);
+      vrq = __riscv_vfmul_vv_f32m2(vrq, verr, n);
+    }
+    vfloat32m2_t verf = __riscv_vfmul_vv_f32m2(vp, vrq, n);
+
+    // Add one to the rational interpolant, and multiply by 0.5 times the
+    // original input.
+    vfloat32m2_t vy = __riscv_vfadd_vf_f32m2(verf, 1.0f, n);
+    vy = __riscv_vfmul_vf_f32m2(vy, 0.5f, n);
+    vy = __riscv_vfmul_vv_f32m2(vy, vx_orig, n);
+
+    __riscv_vse32_v_f32m2(output, vy, n);
+    output += n;
+
+    batch -= n;
+  } while (batch != 0);
+}
diff --git a/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u4v.c b/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u4v.c
new file mode 100644
index 00000000000..e66bb8c4b83
--- /dev/null
+++ b/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u4v.c
@@ -0,0 +1,103 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vapproxgelu/rvv-rational-12-10.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2024 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/vunary.h"
+
+
+void xnn_f32_vapproxgelu_ukernel__rvv_rational_12_10_nr_u4v(
+    size_t batch,
+    const float* input,
+    float* output,
+    const struct xnn_f32_default_params* unused_params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(input != NULL);
+  assert(output != NULL);
+
+  // Cap the inputs to this value as `erf(x/sqrt(2))` will always be `+/-1.0f`
+  // beyond this point. This value is chosen as the first floating point
+  // number as of which the interpolation returns +/-1.0f.
+  const float vmax_x = 4.79519796e+00f;
+  const float vmin_x = -4.84563780e+00f;
+
+  // The monomial coefficients of the numerator polynomial (odd).
+  const float valpha_1 = 7.9788458347e-01f;
+  const float valpha_3 = 6.0803253204e-02f;
+  const float valpha_5 = 7.2898347862e-03f;
+  const float valpha_7 = 2.6887017884e-04f;
+  const float valpha_9 = 1.4302649106e-05f;
+  const float valpha_11 = 4.9544411240e-08f;
+
+  // The monomial coefficients of the denominator polynomial (even).
+  const float vbeta_2 = 2.4369759858e-01f;
+  const float vbeta_4 = 2.4381054565e-02f;
+  const float vbeta_6 = 1.3060354395e-03f;
+  const float vbeta_8 = 7.6477612311e-05f;
+  const float vbeta_10 = 1.3433452750e-06f;
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t n = __riscv_vsetvl_e32m4(batch);
+
+    vfloat32m4_t vx_orig = __riscv_vle32_v_f32m4(input, n);
+    input += n;
+
+    // Clamp the inputs to the interpolation range.
+    vfloat32m4_t vx = __riscv_vfmin_vf_f32m4(vx_orig, vmax_x, n);
+    vx = __riscv_vfmax_vf_f32m4(vx, vmin_x, n);
+
+    // Since the polynomials are odd/even, we need x^2.
+    vfloat32m4_t vx2 = __riscv_vfmul_vv_f32m4(vx, vx, n);
+
+    // Evaluate the numerator polynomial p.
+    vfloat32m4_t vp = __riscv_vfmv_v_f_f32m4(valpha_9, n);
+    vp = __riscv_vfmacc_vf_f32m4(vp, valpha_11, vx2, n);
+    vp = __riscv_vfmadd_vv_f32m4(vp, vx2, __riscv_vfmv_v_f_f32m4(valpha_7, n), n);
+    vp = __riscv_vfmadd_vv_f32m4(vp, vx2, __riscv_vfmv_v_f_f32m4(valpha_5, n), n);
+    vp = __riscv_vfmadd_vv_f32m4(vp, vx2, __riscv_vfmv_v_f_f32m4(valpha_3, n), n);
+    vp = __riscv_vfmadd_vv_f32m4(vp, vx2, __riscv_vfmv_v_f_f32m4(valpha_1, n), n);
+    vp = __riscv_vfmul_vv_f32m4(vp, vx, n);
+
+    // Evaluate the denominator polynomial q.
+    vfloat32m4_t vq = __riscv_vfmv_v_f_f32m4(vbeta_8, n);
+    vq = __riscv_vfmacc_vf_f32m4(vq, vbeta_10, vx2, n);
+    vq = __riscv_vfmadd_vv_f32m4(vq, vx2, __riscv_vfmv_v_f_f32m4(vbeta_6, n), n);
+    vq = __riscv_vfmadd_vv_f32m4(vq, vx2, __riscv_vfmv_v_f_f32m4(vbeta_4, n), n);
+    vq = __riscv_vfmadd_vv_f32m4(vq, vx2, __riscv_vfmv_v_f_f32m4(vbeta_2, n), n);
+    vq = __riscv_vfmadd_vv_f32m4(vq, vx2, __riscv_vfmv_v_f_f32m4(1.0f, n), n);
+
+    // Divide the numerator by the denominator.
+    // Newton-Raphson iteration for reciprocal.
+    vfloat32m4_t vrq = __riscv_vfrec7_v_f32m4(vq, n);
+    for (size_t iter = 0; iter < 2; iter++) {
+      vfloat32m4_t verr = __riscv_vfnmsac_vv_f32m4(
+          __riscv_vfmv_v_f_f32m4(2.0f, n), vrq, vq, n);
+      vrq = __riscv_vfmul_vv_f32m4(vrq, verr, n);
+    }
+    vfloat32m4_t verf = __riscv_vfmul_vv_f32m4(vp, vrq, n);
+
+    // Add one to the rational interpolant, and multiply by 0.5 times the
+    // original input.
+    vfloat32m4_t vy = __riscv_vfadd_vf_f32m4(verf, 1.0f, n);
+    vy = __riscv_vfmul_vf_f32m4(vy, 0.5f, n);
+    vy = __riscv_vfmul_vv_f32m4(vy, vx_orig, n);
+
+    __riscv_vse32_v_f32m4(output, vy, n);
+    output += n;
+
+    batch -= n;
+  } while (batch != 0);
+}
diff --git a/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u8v.c b/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u8v.c
new file mode 100644
index 00000000000..2b17e3a673d
--- /dev/null
+++ b/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u8v.c
@@ -0,0 +1,103 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vapproxgelu/rvv-rational-12-10.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2024 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/vunary.h"
+
+
+void xnn_f32_vapproxgelu_ukernel__rvv_rational_12_10_nr_u8v(
+    size_t batch,
+    const float* input,
+    float* output,
+    const struct xnn_f32_default_params* unused_params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(input != NULL);
+  assert(output != NULL);
+
+  // Cap the inputs to this value as `erf(x/sqrt(2))` will always be `+/-1.0f`
+  // beyond this point. This value is chosen as the first floating point
+  // number as of which the interpolation returns +/-1.0f.
+  const float vmax_x = 4.79519796e+00f;
+  const float vmin_x = -4.84563780e+00f;
+
+  // The monomial coefficients of the numerator polynomial (odd).
+  const float valpha_1 = 7.9788458347e-01f;
+  const float valpha_3 = 6.0803253204e-02f;
+  const float valpha_5 = 7.2898347862e-03f;
+  const float valpha_7 = 2.6887017884e-04f;
+  const float valpha_9 = 1.4302649106e-05f;
+  const float valpha_11 = 4.9544411240e-08f;
+
+  // The monomial coefficients of the denominator polynomial (even).
+  const float vbeta_2 = 2.4369759858e-01f;
+  const float vbeta_4 = 2.4381054565e-02f;
+  const float vbeta_6 = 1.3060354395e-03f;
+  const float vbeta_8 = 7.6477612311e-05f;
+  const float vbeta_10 = 1.3433452750e-06f;
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t n = __riscv_vsetvl_e32m8(batch);
+
+    vfloat32m8_t vx_orig = __riscv_vle32_v_f32m8(input, n);
+    input += n;
+
+    // Clamp the inputs to the interpolation range.
+    vfloat32m8_t vx = __riscv_vfmin_vf_f32m8(vx_orig, vmax_x, n);
+    vx = __riscv_vfmax_vf_f32m8(vx, vmin_x, n);
+
+    // Since the polynomials are odd/even, we need x^2.
+    vfloat32m8_t vx2 = __riscv_vfmul_vv_f32m8(vx, vx, n);
+
+    // Evaluate the numerator polynomial p.
+    vfloat32m8_t vp = __riscv_vfmv_v_f_f32m8(valpha_9, n);
+    vp = __riscv_vfmacc_vf_f32m8(vp, valpha_11, vx2, n);
+    vp = __riscv_vfmadd_vv_f32m8(vp, vx2, __riscv_vfmv_v_f_f32m8(valpha_7, n), n);
+    vp = __riscv_vfmadd_vv_f32m8(vp, vx2, __riscv_vfmv_v_f_f32m8(valpha_5, n), n);
+    vp = __riscv_vfmadd_vv_f32m8(vp, vx2, __riscv_vfmv_v_f_f32m8(valpha_3, n), n);
+    vp = __riscv_vfmadd_vv_f32m8(vp, vx2, __riscv_vfmv_v_f_f32m8(valpha_1, n), n);
+    vp = __riscv_vfmul_vv_f32m8(vp, vx, n);
+
+    // Evaluate the denominator polynomial q.
+    vfloat32m8_t vq = __riscv_vfmv_v_f_f32m8(vbeta_8, n);
+    vq = __riscv_vfmacc_vf_f32m8(vq, vbeta_10, vx2, n);
+    vq = __riscv_vfmadd_vv_f32m8(vq, vx2, __riscv_vfmv_v_f_f32m8(vbeta_6, n), n);
+    vq = __riscv_vfmadd_vv_f32m8(vq, vx2, __riscv_vfmv_v_f_f32m8(vbeta_4, n), n);
+    vq = __riscv_vfmadd_vv_f32m8(vq, vx2, __riscv_vfmv_v_f_f32m8(vbeta_2, n), n);
+    vq = __riscv_vfmadd_vv_f32m8(vq, vx2, __riscv_vfmv_v_f_f32m8(1.0f, n), n);
+
+    // Divide the numerator by the denominator.
+    // Newton-Raphson iteration for reciprocal.
+    vfloat32m8_t vrq = __riscv_vfrec7_v_f32m8(vq, n);
+    for (size_t iter = 0; iter < 2; iter++) {
+      vfloat32m8_t verr = __riscv_vfnmsac_vv_f32m8(
+          __riscv_vfmv_v_f_f32m8(2.0f, n), vrq, vq, n);
+      vrq = __riscv_vfmul_vv_f32m8(vrq, verr, n);
+    }
+    vfloat32m8_t verf = __riscv_vfmul_vv_f32m8(vp, vrq, n);
+
+    // Add one to the rational interpolant, and multiply by 0.5 times the
+    // original input.
+    vfloat32m8_t vy = __riscv_vfadd_vf_f32m8(verf, 1.0f, n);
+    vy = __riscv_vfmul_vf_f32m8(vy, 0.5f, n);
+    vy = __riscv_vfmul_vv_f32m8(vy, vx_orig, n);
+
+    __riscv_vse32_v_f32m8(output, vy, n);
+    output += n;
+
+    batch -= n;
+  } while (batch != 0);
+}
diff --git a/src/f32-vapproxgelu/rvv-rational-12-10.c.in b/src/f32-vapproxgelu/rvv-rational-12-10.c.in
new file mode 100644
index 00000000000..e2294943328
--- /dev/null
+++ b/src/f32-vapproxgelu/rvv-rational-12-10.c.in
@@ -0,0 +1,107 @@
+// Copyright 2024 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert LMUL in [1, 2, 4, 8]
+$assert DIV in ("DIV", "NR")
+#include <assert.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/vunary.h"
+
+
+void xnn_f32_vapproxgelu_ukernel__rvv_rational_12_10_${DIV.lower()}_u${LMUL}v(
+    size_t batch,
+    const float* input,
+    float* output,
+    const struct xnn_f32_default_params* unused_params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(input != NULL);
+  assert(output != NULL);
+
+  // Cap the inputs to this value as `erf(x/sqrt(2))` will always be `+/-1.0f`
+  // beyond this point. This value is chosen as the first floating point
+  // number as of which the interpolation returns +/-1.0f.
+  $if DIV == "NR":
+    const float vmax_x = 4.79519796e+00f;
+    const float vmin_x = -4.84563780e+00f;
+  $else:
+    const float vmax_x = 4.84974098e+00f;
+    const float vmin_x = -4.84974098e+00f;
+
+  // The monomial coefficients of the numerator polynomial (odd).
+  const float valpha_1 = 7.9788458347e-01f;
+  const float valpha_3 = 6.0803253204e-02f;
+  const float valpha_5 = 7.2898347862e-03f;
+  const float valpha_7 = 2.6887017884e-04f;
+  const float valpha_9 = 1.4302649106e-05f;
+  const float valpha_11 = 4.9544411240e-08f;
+
+  // The monomial coefficients of the denominator polynomial (even).
+  const float vbeta_2 = 2.4369759858e-01f;
+  const float vbeta_4 = 2.4381054565e-02f;
+  const float vbeta_6 = 1.3060354395e-03f;
+  const float vbeta_8 = 7.6477612311e-05f;
+  const float vbeta_10 = 1.3433452750e-06f;
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t n = __riscv_vsetvl_e32m${LMUL}(batch);
+
+    vfloat32m${LMUL}_t vx_orig = __riscv_vle32_v_f32m${LMUL}(input, n);
+    input += n;
+
+    // Clamp the inputs to the interpolation range.
+    vfloat32m${LMUL}_t vx = __riscv_vfmin_vf_f32m${LMUL}(vx_orig, vmax_x, n);
+    vx = __riscv_vfmax_vf_f32m${LMUL}(vx, vmin_x, n);
+
+    // Since the polynomials are odd/even, we need x^2.
+    vfloat32m${LMUL}_t vx2 = __riscv_vfmul_vv_f32m${LMUL}(vx, vx, n);
+
+    // Evaluate the numerator polynomial p.
+    vfloat32m${LMUL}_t vp = __riscv_vfmv_v_f_f32m${LMUL}(valpha_9, n);
+    vp = __riscv_vfmacc_vf_f32m${LMUL}(vp, valpha_11, vx2, n);
+    vp = __riscv_vfmadd_vv_f32m${LMUL}(vp, vx2, __riscv_vfmv_v_f_f32m${LMUL}(valpha_7, n), n);
+    vp = __riscv_vfmadd_vv_f32m${LMUL}(vp, vx2, __riscv_vfmv_v_f_f32m${LMUL}(valpha_5, n), n);
+    vp = __riscv_vfmadd_vv_f32m${LMUL}(vp, vx2, __riscv_vfmv_v_f_f32m${LMUL}(valpha_3, n), n);
+    vp = __riscv_vfmadd_vv_f32m${LMUL}(vp, vx2, __riscv_vfmv_v_f_f32m${LMUL}(valpha_1, n), n);
+    vp = __riscv_vfmul_vv_f32m${LMUL}(vp, vx, n);
+
+    // Evaluate the denominator polynomial q.
+    vfloat32m${LMUL}_t vq = __riscv_vfmv_v_f_f32m${LMUL}(vbeta_8, n);
+    vq = __riscv_vfmacc_vf_f32m${LMUL}(vq, vbeta_10, vx2, n);
+    vq = __riscv_vfmadd_vv_f32m${LMUL}(vq, vx2, __riscv_vfmv_v_f_f32m${LMUL}(vbeta_6, n), n);
+    vq = __riscv_vfmadd_vv_f32m${LMUL}(vq, vx2, __riscv_vfmv_v_f_f32m${LMUL}(vbeta_4, n), n);
+    vq = __riscv_vfmadd_vv_f32m${LMUL}(vq, vx2, __riscv_vfmv_v_f_f32m${LMUL}(vbeta_2, n), n);
+    vq = __riscv_vfmadd_vv_f32m${LMUL}(vq, vx2, __riscv_vfmv_v_f_f32m${LMUL}(1.0f, n), n);
+
+    // Divide the numerator by the denominator.
+    $if DIV == "DIV":
+      vfloat32m${LMUL}_t verf = __riscv_vfdiv_vv_f32m${LMUL}(vp, vq, n);
+    $else:
+      // Newton-Raphson iteration for reciprocal.
+      vfloat32m${LMUL}_t vrq = __riscv_vfrec7_v_f32m${LMUL}(vq, n);
+      for (size_t iter = 0; iter < 2; iter++) {
+        vfloat32m${LMUL}_t verr = __riscv_vfnmsac_vv_f32m${LMUL}(
+            __riscv_vfmv_v_f_f32m${LMUL}(2.0f, n), vrq, vq, n);
+        vrq = __riscv_vfmul_vv_f32m${LMUL}(vrq, verr, n);
+      }
+      vfloat32m${LMUL}_t verf = __riscv_vfmul_vv_f32m${LMUL}(vp, vrq, n);
+
+    // Add one to the rational interpolant, and multiply by 0.5 times the
+    // original input.
+    vfloat32m${LMUL}_t vy = __riscv_vfadd_vf_f32m${LMUL}(verf, 1.0f, n);
+    vy = __riscv_vfmul_vf_f32m${LMUL}(vy, 0.5f, n);
+    vy = __riscv_vfmul_vv_f32m${LMUL}(vy, vx_orig, n);
+
+    __riscv_vse32_v_f32m${LMUL}(output, vy, n);
+    output += n;
+
+    batch -= n;
+  } while (batch != 0);
+}
diff --git a/src/f32-velu/f32-velu.inc b/src/f32-velu/f32-velu.inc
index 23024c853fa..7a4d3daad2d 100644
--- a/src/f32-velu/f32-velu.inc
+++ b/src/f32-velu/f32-velu.inc
@@ -128,6 +128,13 @@ XNN_UKERNEL(xnn_arch_none, xnn_f32_velu_ukernel__wasmrelaxedsimd_rr2_p6_u12, 12,
 XNN_UKERNEL(xnn_arch_none, xnn_f32_velu_ukernel__wasmrelaxedsimd_rr2_p6_u16, 16, false, float, struct xnn_f32_elu_params, xnn_init_f32_elu_scalar_params)
 #endif  // XNN_ARCH_WASMRELAXEDSIMD
 
+#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
+XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_velu_ukernel__rvv_rr2_p6_u1v, 1, true, float, struct xnn_f32_elu_params, xnn_init_f32_elu_scalar_params)
+XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_velu_ukernel__rvv_rr2_p6_u2v, 2, true, float, struct xnn_f32_elu_params, xnn_init_f32_elu_scalar_params)
+XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_velu_ukernel__rvv_rr2_p6_u4v, 4, true, float, struct xnn_f32_elu_params, xnn_init_f32_elu_scalar_params)
+XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_velu_ukernel__rvv_rr2_p6_u8v, 8, true, float, struct xnn_f32_elu_params, xnn_init_f32_elu_scalar_params)
+#endif  // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
+
 XNN_UKERNEL(xnn_arch_none, xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_u1, 1, false, float, struct xnn_f32_elu_params, xnn_init_f32_elu_scalar_params)
 XNN_UKERNEL(xnn_arch_none, xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_u2, 2, false, float, struct xnn_f32_elu_params, xnn_init_f32_elu_scalar_params)
 XNN_UKERNEL(xnn_arch_none, xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_u3, 3, false, float, struct xnn_f32_elu_params, xnn_init_f32_elu_scalar_params)
diff --git a/src/f32-velu/gen/f32-velu-rvv-rr2-p6-u1v.c b/src/f32-velu/gen/f32-velu-rvv-rr2-p6-u1v.c
new file mode 100644
index 00000000000..8648b6bd409
--- /dev/null
+++ b/src/f32-velu/gen/f32-velu-rvv-rr2-p6-u1v.c
@@ -0,0 +1,105 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-velu/rvv-rr2-p6.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2024 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/vunary.h"
+
+
+void xnn_f32_velu_ukernel__rvv_rr2_p6_u1v(
+    size_t batch,
+    const float* input,
+    float* output,
+    const struct xnn_f32_elu_params* restrict params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(input != NULL);
+  assert(output != NULL);
+
+  const float vprescale = params->scalar.prescale;
+  const float valpha = params->scalar.alpha;
+  const float vbeta = params->scalar.beta;
+
+  const float vsat_cutoff = -0x1.154246p+4f;
+  const float vmagic_bias = 0x1.8000FEp23f;
+  const float vlog2e = 0x1.715476p+0f;
+  const float vminus_ln2_hi = -0x1.62E440p-1f;
+  const float vminus_ln2_lo = 0x1.0105C6p-21f;
+  const float vc6 = 0x1.6b7338p-10f;
+  const float vc5 = 0x1.12278Ep-7f;
+  const float vc4 = 0x1.555716p-5f;
+  const float vc3 = 0x1.5554B0p-3f;
+  const float vc2 = 0x1.FFFFFEp-2f;
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t n = __riscv_vsetvl_e32m1(batch);
+
+    vfloat32m1_t vx = __riscv_vle32_v_f32m1(input, n);
+    input += n;
+
+    // Compute reduced argument z = max(prescale * x, sat_cutoff).
+    vfloat32m1_t vz = __riscv_vfmul_vf_f32m1(vx, vprescale, n);
+    vz = __riscv_vfmax_vf_f32m1(vz, vsat_cutoff, n);
+
+    // Compute reduced argument n = round(z / ln(2)).
+    // Use magic bias to get rounding for free.
+    vfloat32m1_t vn = __riscv_vfmacc_vf_f32m1(
+        __riscv_vfmv_v_f_f32m1(vmagic_bias, n), vlog2e, vz, n);
+
+    // Create 2^n by shifting n (as integer) into the exponent field.
+    vint32m1_t ven = __riscv_vsll_vx_i32m1(
+        __riscv_vreinterpret_v_f32m1_i32m1(vn), 23, n);
+    vfloat32m1_t vs = __riscv_vreinterpret_v_i32m1_f32m1(ven);
+
+    // Subtract magic bias to get the reduced argument.
+    vn = __riscv_vfsub_vf_f32m1(vn, vmagic_bias, n);
+
+    // Compute reduced argument t = z - n * ln(2).
+    // Use Cody-Waite range reduction (two constants to represent ln(2)).
+    vfloat32m1_t vt = __riscv_vfmacc_vf_f32m1(vz, vminus_ln2_hi, vn, n);
+    vt = __riscv_vfmacc_vf_f32m1(vt, vminus_ln2_lo, vn, n);
+
+    // Compute degree-6 polynomial approximation for exp(t) - 1 using Horner's method.
+    //   p = t * (t * (c2 + t * (c3 + t * (c4 + t * (c5 + t * c6)))))
+    vfloat32m1_t vp = __riscv_vfmv_v_f_f32m1(vc5, n);
+    vp = __riscv_vfmacc_vf_f32m1(vp, vc6, vt, n);
+    vp = __riscv_vfmadd_vv_f32m1(vp, vt, __riscv_vfmv_v_f_f32m1(vc4, n), n);
+    vp = __riscv_vfmadd_vv_f32m1(vp, vt, __riscv_vfmv_v_f_f32m1(vc3, n), n);
+    vp = __riscv_vfmadd_vv_f32m1(vp, vt, __riscv_vfmv_v_f_f32m1(vc2, n), n);
+    vp = __riscv_vfmul_vv_f32m1(vp, vt, n);
+
+    // Reconstruct the exp(z) value:
+    //   t * s
+    //   s - 1
+    //   p = (p * t) + t
+    //   e = (p + (s - 1)) * alpha
+    vt = __riscv_vfmul_vv_f32m1(vt, vs, n);
+    vs = __riscv_vfsub_vf_f32m1(vs, 1.0f, n);
+    vp = __riscv_vfmadd_vv_f32m1(vp, vt, vt, n);
+    vfloat32m1_t ve = __riscv_vfmul_vf_f32m1(
+        __riscv_vfadd_vv_f32m1(vp, vs, n), valpha, n);
+
+    // Select between the ELU and linear parts:
+    //   y = x < 0 ? e : x * beta
+    vfloat32m1_t vy = __riscv_vfmul_vf_f32m1(vx, vbeta, n);
+    vbool32_t mask = __riscv_vmflt_vf_f32m1_b32(vx, 0.0f, n);
+    vy = __riscv_vmerge_vvm_f32m1(vy, ve, mask, n);
+
+    __riscv_vse32_v_f32m1(output, vy, n);
+    output += n;
+
+    batch -= n;
+  } while (batch != 0);
+}
diff --git a/src/f32-velu/gen/f32-velu-rvv-rr2-p6-u2v.c b/src/f32-velu/gen/f32-velu-rvv-rr2-p6-u2v.c
new file mode 100644
index 00000000000..194d3bfcfc5
--- /dev/null
+++ b/src/f32-velu/gen/f32-velu-rvv-rr2-p6-u2v.c
@@ -0,0 +1,105 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-velu/rvv-rr2-p6.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2024 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/vunary.h"
+
+
+void xnn_f32_velu_ukernel__rvv_rr2_p6_u2v(
+    size_t batch,
+    const float* input,
+    float* output,
+    const struct xnn_f32_elu_params* restrict params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(input != NULL);
+  assert(output != NULL);
+
+  const float vprescale = params->scalar.prescale;
+  const float valpha = params->scalar.alpha;
+  const float vbeta = params->scalar.beta;
+
+  const float vsat_cutoff = -0x1.154246p+4f;
+  const float vmagic_bias = 0x1.8000FEp23f;
+  const float vlog2e = 0x1.715476p+0f;
+  const float vminus_ln2_hi = -0x1.62E440p-1f;
+  const float vminus_ln2_lo = 0x1.0105C6p-21f;
+  const float vc6 = 0x1.6b7338p-10f;
+  const float vc5 = 0x1.12278Ep-7f;
+  const float vc4 = 0x1.555716p-5f;
+  const float vc3 = 0x1.5554B0p-3f;
+  const float vc2 = 0x1.FFFFFEp-2f;
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t n = __riscv_vsetvl_e32m2(batch);
+
+    vfloat32m2_t vx = __riscv_vle32_v_f32m2(input, n);
+    input += n;
+
+    // Compute reduced argument z = max(prescale * x, sat_cutoff).
+    vfloat32m2_t vz = __riscv_vfmul_vf_f32m2(vx, vprescale, n);
+    vz = __riscv_vfmax_vf_f32m2(vz, vsat_cutoff, n);
+
+    // Compute reduced argument n = round(z / ln(2)).
+    // Use magic bias to get rounding for free.
+    vfloat32m2_t vn = __riscv_vfmacc_vf_f32m2(
+        __riscv_vfmv_v_f_f32m2(vmagic_bias, n), vlog2e, vz, n);
+
+    // Create 2^n by shifting n (as integer) into the exponent field.
+    vint32m2_t ven = __riscv_vsll_vx_i32m2(
+        __riscv_vreinterpret_v_f32m2_i32m2(vn), 23, n);
+    vfloat32m2_t vs = __riscv_vreinterpret_v_i32m2_f32m2(ven);
+
+    // Subtract magic bias to get the reduced argument.
+    vn = __riscv_vfsub_vf_f32m2(vn, vmagic_bias, n);
+
+    // Compute reduced argument t = z - n * ln(2).
+    // Use Cody-Waite range reduction (two constants to represent ln(2)).
+    vfloat32m2_t vt = __riscv_vfmacc_vf_f32m2(vz, vminus_ln2_hi, vn, n);
+    vt = __riscv_vfmacc_vf_f32m2(vt, vminus_ln2_lo, vn, n);
+
+    // Compute degree-6 polynomial approximation for exp(t) - 1 using Horner's method.
+    //   p = t * (t * (c2 + t * (c3 + t * (c4 + t * (c5 + t * c6)))))
+    vfloat32m2_t vp = __riscv_vfmv_v_f_f32m2(vc5, n);
+    vp = __riscv_vfmacc_vf_f32m2(vp, vc6, vt, n);
+    vp = __riscv_vfmadd_vv_f32m2(vp, vt, __riscv_vfmv_v_f_f32m2(vc4, n), n);
+    vp = __riscv_vfmadd_vv_f32m2(vp, vt, __riscv_vfmv_v_f_f32m2(vc3, n), n);
+    vp = __riscv_vfmadd_vv_f32m2(vp, vt, __riscv_vfmv_v_f_f32m2(vc2, n), n);
+    vp = __riscv_vfmul_vv_f32m2(vp, vt, n);
+
+    // Reconstruct the exp(z) value:
+    //   t * s
+    //   s - 1
+    //   p = (p * t) + t
+    //   e = (p + (s - 1)) * alpha
+    vt = __riscv_vfmul_vv_f32m2(vt, vs, n);
+    vs = __riscv_vfsub_vf_f32m2(vs, 1.0f, n);
+    vp = __riscv_vfmadd_vv_f32m2(vp, vt, vt, n);
+    vfloat32m2_t ve = __riscv_vfmul_vf_f32m2(
+        __riscv_vfadd_vv_f32m2(vp, vs, n), valpha, n);
+
+    // Select between the ELU and linear parts:
+    //   y = x < 0 ? e : x * beta
+    vfloat32m2_t vy = __riscv_vfmul_vf_f32m2(vx, vbeta, n);
+    vbool16_t mask = __riscv_vmflt_vf_f32m2_b16(vx, 0.0f, n);
+    vy = __riscv_vmerge_vvm_f32m2(vy, ve, mask, n);
+
+    __riscv_vse32_v_f32m2(output, vy, n);
+    output += n;
+
+    batch -= n;
+  } while (batch != 0);
+}
diff --git a/src/f32-velu/gen/f32-velu-rvv-rr2-p6-u4v.c b/src/f32-velu/gen/f32-velu-rvv-rr2-p6-u4v.c
new file mode 100644
index 00000000000..8ad83d6482d
--- /dev/null
+++ b/src/f32-velu/gen/f32-velu-rvv-rr2-p6-u4v.c
@@ -0,0 +1,105 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-velu/rvv-rr2-p6.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2024 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/vunary.h"
+
+
+void xnn_f32_velu_ukernel__rvv_rr2_p6_u4v(
+    size_t batch,
+    const float* input,
+    float* output,
+    const struct xnn_f32_elu_params* restrict params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(input != NULL);
+  assert(output != NULL);
+
+  const float vprescale = params->scalar.prescale;
+  const float valpha = params->scalar.alpha;
+  const float vbeta = params->scalar.beta;
+
+  const float vsat_cutoff = -0x1.154246p+4f;
+  const float vmagic_bias = 0x1.8000FEp23f;
+  const float vlog2e = 0x1.715476p+0f;
+  const float vminus_ln2_hi = -0x1.62E440p-1f;
+  const float vminus_ln2_lo = 0x1.0105C6p-21f;
+  const float vc6 = 0x1.6b7338p-10f;
+  const float vc5 = 0x1.12278Ep-7f;
+  const float vc4 = 0x1.555716p-5f;
+  const float vc3 = 0x1.5554B0p-3f;
+  const float vc2 = 0x1.FFFFFEp-2f;
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t n = __riscv_vsetvl_e32m4(batch);
+
+    vfloat32m4_t vx = __riscv_vle32_v_f32m4(input, n);
+    input += n;
+
+    // Compute reduced argument z = max(prescale * x, sat_cutoff).
+    vfloat32m4_t vz = __riscv_vfmul_vf_f32m4(vx, vprescale, n);
+    vz = __riscv_vfmax_vf_f32m4(vz, vsat_cutoff, n);
+
+    // Compute reduced argument n = round(z / ln(2)).
+    // Use magic bias to get rounding for free.
+    vfloat32m4_t vn = __riscv_vfmacc_vf_f32m4(
+        __riscv_vfmv_v_f_f32m4(vmagic_bias, n), vlog2e, vz, n);
+
+    // Create 2^n by shifting n (as integer) into the exponent field.
+    vint32m4_t ven = __riscv_vsll_vx_i32m4(
+        __riscv_vreinterpret_v_f32m4_i32m4(vn), 23, n);
+    vfloat32m4_t vs = __riscv_vreinterpret_v_i32m4_f32m4(ven);
+
+    // Subtract magic bias to get the reduced argument.
+    vn = __riscv_vfsub_vf_f32m4(vn, vmagic_bias, n);
+
+    // Compute reduced argument t = z - n * ln(2).
+    // Use Cody-Waite range reduction (two constants to represent ln(2)).
+    vfloat32m4_t vt = __riscv_vfmacc_vf_f32m4(vz, vminus_ln2_hi, vn, n);
+    vt = __riscv_vfmacc_vf_f32m4(vt, vminus_ln2_lo, vn, n);
+
+    // Compute degree-6 polynomial approximation for exp(t) - 1 using Horner's method.
+    //   p = t * (t * (c2 + t * (c3 + t * (c4 + t * (c5 + t * c6)))))
+    vfloat32m4_t vp = __riscv_vfmv_v_f_f32m4(vc5, n);
+    vp = __riscv_vfmacc_vf_f32m4(vp, vc6, vt, n);
+    vp = __riscv_vfmadd_vv_f32m4(vp, vt, __riscv_vfmv_v_f_f32m4(vc4, n), n);
+    vp = __riscv_vfmadd_vv_f32m4(vp, vt, __riscv_vfmv_v_f_f32m4(vc3, n), n);
+    vp = __riscv_vfmadd_vv_f32m4(vp, vt, __riscv_vfmv_v_f_f32m4(vc2, n), n);
+    vp = __riscv_vfmul_vv_f32m4(vp, vt, n);
+
+    // Reconstruct the exp(z) value:
+    //   t * s
+    //   s - 1
+    //   p = (p * t) + t
+    //   e = (p + (s - 1)) * alpha
+    vt = __riscv_vfmul_vv_f32m4(vt, vs, n);
+    vs = __riscv_vfsub_vf_f32m4(vs, 1.0f, n);
+    vp = __riscv_vfmadd_vv_f32m4(vp, vt, vt, n);
+    vfloat32m4_t ve = __riscv_vfmul_vf_f32m4(
+        __riscv_vfadd_vv_f32m4(vp, vs, n), valpha, n);
+
+    // Select between the ELU and linear parts:
+    //   y = x < 0 ? e : x * beta
+    vfloat32m4_t vy = __riscv_vfmul_vf_f32m4(vx, vbeta, n);
+    vbool8_t mask = __riscv_vmflt_vf_f32m4_b8(vx, 0.0f, n);
+    vy = __riscv_vmerge_vvm_f32m4(vy, ve, mask, n);
+
+    __riscv_vse32_v_f32m4(output, vy, n);
+    output += n;
+
+    batch -= n;
+  } while (batch != 0);
+}
diff --git a/src/f32-velu/gen/f32-velu-rvv-rr2-p6-u8v.c b/src/f32-velu/gen/f32-velu-rvv-rr2-p6-u8v.c
new file mode 100644
index 00000000000..2def057e537
--- /dev/null
+++ b/src/f32-velu/gen/f32-velu-rvv-rr2-p6-u8v.c
@@ -0,0 +1,105 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-velu/rvv-rr2-p6.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2024 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/vunary.h"
+
+
+void xnn_f32_velu_ukernel__rvv_rr2_p6_u8v(
+    size_t batch,
+    const float* input,
+    float* output,
+    const struct xnn_f32_elu_params* restrict params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(input != NULL);
+  assert(output != NULL);
+
+  const float vprescale = params->scalar.prescale;
+  const float valpha = params->scalar.alpha;
+  const float vbeta = params->scalar.beta;
+
+  const float vsat_cutoff = -0x1.154246p+4f;
+  const float vmagic_bias = 0x1.8000FEp23f;
+  const float vlog2e = 0x1.715476p+0f;
+  const float vminus_ln2_hi = -0x1.62E440p-1f;
+  const float vminus_ln2_lo = 0x1.0105C6p-21f;
+  const float vc6 = 0x1.6b7338p-10f;
+  const float vc5 = 0x1.12278Ep-7f;
+  const float vc4 = 0x1.555716p-5f;
+  const float vc3 = 0x1.5554B0p-3f;
+  const float vc2 = 0x1.FFFFFEp-2f;
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t n = __riscv_vsetvl_e32m8(batch);
+
+    vfloat32m8_t vx = __riscv_vle32_v_f32m8(input, n);
+    input += n;
+
+    // Compute reduced argument z = max(prescale * x, sat_cutoff).
+    vfloat32m8_t vz = __riscv_vfmul_vf_f32m8(vx, vprescale, n);
+    vz = __riscv_vfmax_vf_f32m8(vz, vsat_cutoff, n);
+
+    // Compute reduced argument n = round(z / ln(2)).
+    // Use magic bias to get rounding for free.
+    vfloat32m8_t vn = __riscv_vfmacc_vf_f32m8(
+        __riscv_vfmv_v_f_f32m8(vmagic_bias, n), vlog2e, vz, n);
+
+    // Create 2^n by shifting n (as integer) into the exponent field.
+    vint32m8_t ven = __riscv_vsll_vx_i32m8(
+        __riscv_vreinterpret_v_f32m8_i32m8(vn), 23, n);
+    vfloat32m8_t vs = __riscv_vreinterpret_v_i32m8_f32m8(ven);
+
+    // Subtract magic bias to get the reduced argument.
+    vn = __riscv_vfsub_vf_f32m8(vn, vmagic_bias, n);
+
+    // Compute reduced argument t = z - n * ln(2).
+    // Use Cody-Waite range reduction (two constants to represent ln(2)).
+    vfloat32m8_t vt = __riscv_vfmacc_vf_f32m8(vz, vminus_ln2_hi, vn, n);
+    vt = __riscv_vfmacc_vf_f32m8(vt, vminus_ln2_lo, vn, n);
+
+    // Compute degree-6 polynomial approximation for exp(t) - 1 using Horner's method.
+    //   p = t * (t * (c2 + t * (c3 + t * (c4 + t * (c5 + t * c6)))))
+    vfloat32m8_t vp = __riscv_vfmv_v_f_f32m8(vc5, n);
+    vp = __riscv_vfmacc_vf_f32m8(vp, vc6, vt, n);
+    vp = __riscv_vfmadd_vv_f32m8(vp, vt, __riscv_vfmv_v_f_f32m8(vc4, n), n);
+    vp = __riscv_vfmadd_vv_f32m8(vp, vt, __riscv_vfmv_v_f_f32m8(vc3, n), n);
+    vp = __riscv_vfmadd_vv_f32m8(vp, vt, __riscv_vfmv_v_f_f32m8(vc2, n), n);
+    vp = __riscv_vfmul_vv_f32m8(vp, vt, n);
+
+    // Reconstruct the exp(z) value:
+    //   t * s
+    //   s - 1
+    //   p = (p * t) + t
+    //   e = (p + (s - 1)) * alpha
+    vt = __riscv_vfmul_vv_f32m8(vt, vs, n);
+    vs = __riscv_vfsub_vf_f32m8(vs, 1.0f, n);
+    vp = __riscv_vfmadd_vv_f32m8(vp, vt, vt, n);
+    vfloat32m8_t ve = __riscv_vfmul_vf_f32m8(
+        __riscv_vfadd_vv_f32m8(vp, vs, n), valpha, n);
+
+    // Select between the ELU and linear parts:
+    //   y = x < 0 ? e : x * beta
+    vfloat32m8_t vy = __riscv_vfmul_vf_f32m8(vx, vbeta, n);
+    vbool4_t mask = __riscv_vmflt_vf_f32m8_b4(vx, 0.0f, n);
+    vy = __riscv_vmerge_vvm_f32m8(vy, ve, mask, n);
+
+    __riscv_vse32_v_f32m8(output, vy, n);
+    output += n;
+
+    batch -= n;
+  } while (batch != 0);
+}
diff --git a/src/f32-velu/rvv-rr2-p6.c.in b/src/f32-velu/rvv-rr2-p6.c.in
new file mode 100644
index 00000000000..5cb16183169
--- /dev/null
+++ b/src/f32-velu/rvv-rr2-p6.c.in
@@ -0,0 +1,101 @@
+// Copyright 2024 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert LMUL in [1, 2, 4, 8]
+#include <assert.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/vunary.h"
+
+
+void xnn_f32_velu_ukernel__rvv_rr2_p6_u${LMUL}v(
+    size_t batch,
+    const float* input,
+    float* output,
+    const struct xnn_f32_elu_params* restrict params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(input != NULL);
+  assert(output != NULL);
+
+  const float vprescale = params->scalar.prescale;
+  const float valpha = params->scalar.alpha;
+  const float vbeta = params->scalar.beta;
+
+  const float vsat_cutoff = -0x1.154246p+4f;
+  const float vmagic_bias = 0x1.8000FEp23f;
+  const float vlog2e = 0x1.715476p+0f;
+  const float vminus_ln2_hi = -0x1.62E440p-1f;
+  const float vminus_ln2_lo = 0x1.0105C6p-21f;
+  const float vc6 = 0x1.6b7338p-10f;
+  const float vc5 = 0x1.12278Ep-7f;
+  const float vc4 = 0x1.555716p-5f;
+  const float vc3 = 0x1.5554B0p-3f;
+  const float vc2 = 0x1.FFFFFEp-2f;
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t n = __riscv_vsetvl_e32m${LMUL}(batch);
+
+    vfloat32m${LMUL}_t vx = __riscv_vle32_v_f32m${LMUL}(input, n);
+    input += n;
+
+    // Compute reduced argument z = max(prescale * x, sat_cutoff).
+    vfloat32m${LMUL}_t vz = __riscv_vfmul_vf_f32m${LMUL}(vx, vprescale, n);
+    vz = __riscv_vfmax_vf_f32m${LMUL}(vz, vsat_cutoff, n);
+
+    // Compute reduced argument n = round(z / ln(2)).
+    // Use magic bias to get rounding for free.
+    vfloat32m${LMUL}_t vn = __riscv_vfmacc_vf_f32m${LMUL}(
+        __riscv_vfmv_v_f_f32m${LMUL}(vmagic_bias, n), vlog2e, vz, n);
+
+    // Create 2^n by shifting n (as integer) into the exponent field.
+    vint32m${LMUL}_t ven = __riscv_vsll_vx_i32m${LMUL}(
+        __riscv_vreinterpret_v_f32m${LMUL}_i32m${LMUL}(vn), 23, n);
+    vfloat32m${LMUL}_t vs = __riscv_vreinterpret_v_i32m${LMUL}_f32m${LMUL}(ven);
+
+    // Subtract magic bias to get the reduced argument.
+    vn = __riscv_vfsub_vf_f32m${LMUL}(vn, vmagic_bias, n);
+
+    // Compute reduced argument t = z - n * ln(2).
+    // Use Cody-Waite range reduction (two constants to represent ln(2)).
+    vfloat32m${LMUL}_t vt = __riscv_vfmacc_vf_f32m${LMUL}(vz, vminus_ln2_hi, vn, n);
+    vt = __riscv_vfmacc_vf_f32m${LMUL}(vt, vminus_ln2_lo, vn, n);
+
+    // Compute degree-6 polynomial approximation for exp(t) - 1 using Horner's method.
+    //   p = t * (t * (c2 + t * (c3 + t * (c4 + t * (c5 + t * c6)))))
+    vfloat32m${LMUL}_t vp = __riscv_vfmv_v_f_f32m${LMUL}(vc5, n);
+    vp = __riscv_vfmacc_vf_f32m${LMUL}(vp, vc6, vt, n);
+    vp = __riscv_vfmadd_vv_f32m${LMUL}(vp, vt, __riscv_vfmv_v_f_f32m${LMUL}(vc4, n), n);
+    vp = __riscv_vfmadd_vv_f32m${LMUL}(vp, vt, __riscv_vfmv_v_f_f32m${LMUL}(vc3, n), n);
+    vp = __riscv_vfmadd_vv_f32m${LMUL}(vp, vt, __riscv_vfmv_v_f_f32m${LMUL}(vc2, n), n);
+    vp = __riscv_vfmul_vv_f32m${LMUL}(vp, vt, n);
+
+    // Reconstruct the exp(z) value:
+    //   t * s
+    //   s - 1
+    //   p = (p * t) + t
+    //   e = (p + (s - 1)) * alpha
+    vt = __riscv_vfmul_vv_f32m${LMUL}(vt, vs, n);
+    vs = __riscv_vfsub_vf_f32m${LMUL}(vs, 1.0f, n);
+    vp = __riscv_vfmadd_vv_f32m${LMUL}(vp, vt, vt, n);
+    vfloat32m${LMUL}_t ve = __riscv_vfmul_vf_f32m${LMUL}(
+        __riscv_vfadd_vv_f32m${LMUL}(vp, vs, n), valpha, n);
+
+    // Select between the ELU and linear parts:
+    //   y = x < 0 ? e : x * beta
+    vfloat32m${LMUL}_t vy = __riscv_vfmul_vf_f32m${LMUL}(vx, vbeta, n);
+    vbool${int(32/LMUL)}_t mask = __riscv_vmflt_vf_f32m${LMUL}_b${int(32/LMUL)}(vx, 0.0f, n);
+    vy = __riscv_vmerge_vvm_f32m${LMUL}(vy, ve, mask, n);
+
+    __riscv_vse32_v_f32m${LMUL}(output, vy, n);
+    output += n;
+
+    batch -= n;
+  } while (batch != 0);
+}
diff --git a/src/f32-vgelu/f32-vgelu.inc b/src/f32-vgelu/f32-vgelu.inc
index 6542997c43b..bb76de70004 100644
--- a/src/f32-vgelu/f32-vgelu.inc
+++ b/src/f32-vgelu/f32-vgelu.inc
@@ -68,3 +68,10 @@ XNN_UKERNEL(xnn_arch_hvx, xnn_f32_vgelu_ukernel__hvx_rational_12_10_nr_u32, 32,
 XNN_UKERNEL(xnn_arch_hvx, xnn_f32_vgelu_ukernel__hvx_rational_12_10_nr_u64, 64, false, float, struct xnn_f32_default_params, NULL)
 XNN_UKERNEL(xnn_arch_hvx, xnn_f32_vgelu_ukernel__hvx_rational_12_10_nr_u128, 128, false, float, struct xnn_f32_default_params, NULL)
 #endif  // XNN_ENABLE_HVX && XNN_ARCH_HEXAGON
+
+#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
+XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_vgelu_ukernel__rvv_rational_12_10_div_u1v, 1, true, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_vgelu_ukernel__rvv_rational_12_10_div_u2v, 2, true, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_vgelu_ukernel__rvv_rational_12_10_div_u4v, 4, true, float, struct xnn_f32_default_params, NULL)
+XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_vgelu_ukernel__rvv_rational_12_10_div_u8v, 8, true, float, struct xnn_f32_default_params, NULL)
+#endif  // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
diff --git a/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u1v.c b/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u1v.c
new file mode 100644
index 00000000000..e5e29e1a535
--- /dev/null
+++ b/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u1v.c
@@ -0,0 +1,95 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vgelu/rvv-rational-12-10.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2024 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/vunary.h"
+
+
+void xnn_f32_vgelu_ukernel__rvv_rational_12_10_div_u1v(
+    size_t batch,
+    const float* input,
+    float* output,
+    const struct xnn_f32_default_params* unused_params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(input != NULL);
+  assert(output != NULL);
+
+  // Cap the inputs to this value as `erf(x/sqrt(2))` will always be `+/-1.0f`
+  // beyond this point. This value is chosen as the first floating point
+  // number as of which the interpolation returns +/-1.0f.
+  const float vmax_abs_x = 5.1638283730e+00f;
+
+  // The monomial coefficients of the numerator polynomial (odd).
+  const float valpha_1 = 7.9788452387e-01f;
+  const float valpha_3 = 6.6972173750e-02f;
+  const float valpha_5 = 9.3065137044e-03f;
+  const float valpha_7 = 3.2973114867e-04f;
+  const float valpha_9 = 1.2609783880e-05f;
+  const float valpha_11 = 4.5835321316e-08f;
+
+  // The monomial coefficients of the denominator polynomial (even).
+  const float vbeta_2 = 2.5060352683e-01f;
+  const float vbeta_4 = 2.8431978077e-02f;
+  const float vbeta_6 = 1.8622842617e-03f;
+  const float vbeta_8 = 7.2267655923e-05f;
+  const float vbeta_10 = 1.1988805682e-06f;
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t n = __riscv_vsetvl_e32m1(batch);
+
+    vfloat32m1_t vx_orig = __riscv_vle32_v_f32m1(input, n);
+    input += n;
+
+    // Clamp the inputs to the interpolation range.
+    vfloat32m1_t vx = __riscv_vfmin_vf_f32m1(vx_orig, vmax_abs_x, n);
+    vx = __riscv_vfmax_vf_f32m1(vx, -vmax_abs_x, n);
+
+    // Since the polynomials are odd/even, we need x^2.
+    vfloat32m1_t vx2 = __riscv_vfmul_vv_f32m1(vx, vx, n);
+
+    // Evaluate the numerator polynomial p.
+    vfloat32m1_t vp = __riscv_vfmv_v_f_f32m1(valpha_9, n);
+    vp = __riscv_vfmacc_vf_f32m1(vp, valpha_11, vx2, n);
+    vp = __riscv_vfmadd_vv_f32m1(vp, vx2, __riscv_vfmv_v_f_f32m1(valpha_7, n), n);
+    vp = __riscv_vfmadd_vv_f32m1(vp, vx2, __riscv_vfmv_v_f_f32m1(valpha_5, n), n);
+    vp = __riscv_vfmadd_vv_f32m1(vp, vx2, __riscv_vfmv_v_f_f32m1(valpha_3, n), n);
+    vp = __riscv_vfmadd_vv_f32m1(vp, vx2, __riscv_vfmv_v_f_f32m1(valpha_1, n), n);
+    vp = __riscv_vfmul_vv_f32m1(vp, vx, n);
+
+    // Evaluate the denominator polynomial q.
+    vfloat32m1_t vq = __riscv_vfmv_v_f_f32m1(vbeta_8, n);
+    vq = __riscv_vfmacc_vf_f32m1(vq, vbeta_10, vx2, n);
+    vq = __riscv_vfmadd_vv_f32m1(vq, vx2, __riscv_vfmv_v_f_f32m1(vbeta_6, n), n);
+    vq = __riscv_vfmadd_vv_f32m1(vq, vx2, __riscv_vfmv_v_f_f32m1(vbeta_4, n), n);
+    vq = __riscv_vfmadd_vv_f32m1(vq, vx2, __riscv_vfmv_v_f_f32m1(vbeta_2, n), n);
+    vq = __riscv_vfmadd_vv_f32m1(vq, vx2, __riscv_vfmv_v_f_f32m1(1.0f, n), n);
+
+    // Divide the numerator by the denominator.
+    vfloat32m1_t verf = __riscv_vfdiv_vv_f32m1(vp, vq, n);
+
+    // Add one to the rational interpolant, and multiply by 0.5 times the
+    // original input.
+    vfloat32m1_t vy = __riscv_vfadd_vf_f32m1(verf, 1.0f, n);
+    vy = __riscv_vfmul_vf_f32m1(vy, 0.5f, n);
+    vy = __riscv_vfmul_vv_f32m1(vy, vx_orig, n);
+
+    __riscv_vse32_v_f32m1(output, vy, n);
+    output += n;
+
+    batch -= n;
+  } while (batch != 0);
+}
diff --git a/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u2v.c b/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u2v.c
new file mode 100644
index 00000000000..c7284af1a85
--- /dev/null
+++ b/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u2v.c
@@ -0,0 +1,95 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vgelu/rvv-rational-12-10.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2024 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/vunary.h"
+
+
+void xnn_f32_vgelu_ukernel__rvv_rational_12_10_div_u2v(
+    size_t batch,
+    const float* input,
+    float* output,
+    const struct xnn_f32_default_params* unused_params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(input != NULL);
+  assert(output != NULL);
+
+  // Cap the inputs to this value as `erf(x/sqrt(2))` will always be `+/-1.0f`
+  // beyond this point. This value is chosen as the first floating point
+  // number as of which the interpolation returns +/-1.0f.
+  const float vmax_abs_x = 5.1638283730e+00f;
+
+  // The monomial coefficients of the numerator polynomial (odd).
+  const float valpha_1 = 7.9788452387e-01f;
+  const float valpha_3 = 6.6972173750e-02f;
+  const float valpha_5 = 9.3065137044e-03f;
+  const float valpha_7 = 3.2973114867e-04f;
+  const float valpha_9 = 1.2609783880e-05f;
+  const float valpha_11 = 4.5835321316e-08f;
+
+  // The monomial coefficients of the denominator polynomial (even).
+  const float vbeta_2 = 2.5060352683e-01f;
+  const float vbeta_4 = 2.8431978077e-02f;
+  const float vbeta_6 = 1.8622842617e-03f;
+  const float vbeta_8 = 7.2267655923e-05f;
+  const float vbeta_10 = 1.1988805682e-06f;
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t n = __riscv_vsetvl_e32m2(batch);
+
+    vfloat32m2_t vx_orig = __riscv_vle32_v_f32m2(input, n);
+    input += n;
+
+    // Clamp the inputs to the interpolation range.
+    vfloat32m2_t vx = __riscv_vfmin_vf_f32m2(vx_orig, vmax_abs_x, n);
+    vx = __riscv_vfmax_vf_f32m2(vx, -vmax_abs_x, n);
+
+    // Since the polynomials are odd/even, we need x^2.
+    vfloat32m2_t vx2 = __riscv_vfmul_vv_f32m2(vx, vx, n);
+
+    // Evaluate the numerator polynomial p.
+    vfloat32m2_t vp = __riscv_vfmv_v_f_f32m2(valpha_9, n);
+    vp = __riscv_vfmacc_vf_f32m2(vp, valpha_11, vx2, n);
+    vp = __riscv_vfmadd_vv_f32m2(vp, vx2, __riscv_vfmv_v_f_f32m2(valpha_7, n), n);
+    vp = __riscv_vfmadd_vv_f32m2(vp, vx2, __riscv_vfmv_v_f_f32m2(valpha_5, n), n);
+    vp = __riscv_vfmadd_vv_f32m2(vp, vx2, __riscv_vfmv_v_f_f32m2(valpha_3, n), n);
+    vp = __riscv_vfmadd_vv_f32m2(vp, vx2, __riscv_vfmv_v_f_f32m2(valpha_1, n), n);
+    vp = __riscv_vfmul_vv_f32m2(vp, vx, n);
+
+    // Evaluate the denominator polynomial q.
+    vfloat32m2_t vq = __riscv_vfmv_v_f_f32m2(vbeta_8, n);
+    vq = __riscv_vfmacc_vf_f32m2(vq, vbeta_10, vx2, n);
+    vq = __riscv_vfmadd_vv_f32m2(vq, vx2, __riscv_vfmv_v_f_f32m2(vbeta_6, n), n);
+    vq = __riscv_vfmadd_vv_f32m2(vq, vx2, __riscv_vfmv_v_f_f32m2(vbeta_4, n), n);
+    vq = __riscv_vfmadd_vv_f32m2(vq, vx2, __riscv_vfmv_v_f_f32m2(vbeta_2, n), n);
+    vq = __riscv_vfmadd_vv_f32m2(vq, vx2, __riscv_vfmv_v_f_f32m2(1.0f, n), n);
+
+    // Divide the numerator by the denominator.
+    vfloat32m2_t verf = __riscv_vfdiv_vv_f32m2(vp, vq, n);
+
+    // Add one to the rational interpolant, and multiply by 0.5 times the
+    // original input.
+    vfloat32m2_t vy = __riscv_vfadd_vf_f32m2(verf, 1.0f, n);
+    vy = __riscv_vfmul_vf_f32m2(vy, 0.5f, n);
+    vy = __riscv_vfmul_vv_f32m2(vy, vx_orig, n);
+
+    __riscv_vse32_v_f32m2(output, vy, n);
+    output += n;
+
+    batch -= n;
+  } while (batch != 0);
+}
diff --git a/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u4v.c b/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u4v.c
new file mode 100644
index 00000000000..2961924ddea
--- /dev/null
+++ b/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u4v.c
@@ -0,0 +1,95 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vgelu/rvv-rational-12-10.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2024 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/vunary.h"
+
+
+void xnn_f32_vgelu_ukernel__rvv_rational_12_10_div_u4v(
+    size_t batch,
+    const float* input,
+    float* output,
+    const struct xnn_f32_default_params* unused_params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(input != NULL);
+  assert(output != NULL);
+
+  // Cap the inputs to this value as `erf(x/sqrt(2))` will always be `+/-1.0f`
+  // beyond this point. This value is chosen as the first floating point
+  // number as of which the interpolation returns +/-1.0f.
+  const float vmax_abs_x = 5.1638283730e+00f;
+
+  // The monomial coefficients of the numerator polynomial (odd).
+  const float valpha_1 = 7.9788452387e-01f;
+  const float valpha_3 = 6.6972173750e-02f;
+  const float valpha_5 = 9.3065137044e-03f;
+  const float valpha_7 = 3.2973114867e-04f;
+  const float valpha_9 = 1.2609783880e-05f;
+  const float valpha_11 = 4.5835321316e-08f;
+
+  // The monomial coefficients of the denominator polynomial (even).
+  const float vbeta_2 = 2.5060352683e-01f;
+  const float vbeta_4 = 2.8431978077e-02f;
+  const float vbeta_6 = 1.8622842617e-03f;
+  const float vbeta_8 = 7.2267655923e-05f;
+  const float vbeta_10 = 1.1988805682e-06f;
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t n = __riscv_vsetvl_e32m4(batch);
+
+    vfloat32m4_t vx_orig = __riscv_vle32_v_f32m4(input, n);
+    input += n;
+
+    // Clamp the inputs to the interpolation range.
+    vfloat32m4_t vx = __riscv_vfmin_vf_f32m4(vx_orig, vmax_abs_x, n);
+    vx = __riscv_vfmax_vf_f32m4(vx, -vmax_abs_x, n);
+
+    // Since the polynomials are odd/even, we need x^2.
+    vfloat32m4_t vx2 = __riscv_vfmul_vv_f32m4(vx, vx, n);
+
+    // Evaluate the numerator polynomial p.
+    vfloat32m4_t vp = __riscv_vfmv_v_f_f32m4(valpha_9, n);
+    vp = __riscv_vfmacc_vf_f32m4(vp, valpha_11, vx2, n);
+    vp = __riscv_vfmadd_vv_f32m4(vp, vx2, __riscv_vfmv_v_f_f32m4(valpha_7, n), n);
+    vp = __riscv_vfmadd_vv_f32m4(vp, vx2, __riscv_vfmv_v_f_f32m4(valpha_5, n), n);
+    vp = __riscv_vfmadd_vv_f32m4(vp, vx2, __riscv_vfmv_v_f_f32m4(valpha_3, n), n);
+    vp = __riscv_vfmadd_vv_f32m4(vp, vx2, __riscv_vfmv_v_f_f32m4(valpha_1, n), n);
+    vp = __riscv_vfmul_vv_f32m4(vp, vx, n);
+
+    // Evaluate the denominator polynomial q.
+    vfloat32m4_t vq = __riscv_vfmv_v_f_f32m4(vbeta_8, n);
+    vq = __riscv_vfmacc_vf_f32m4(vq, vbeta_10, vx2, n);
+    vq = __riscv_vfmadd_vv_f32m4(vq, vx2, __riscv_vfmv_v_f_f32m4(vbeta_6, n), n);
+    vq = __riscv_vfmadd_vv_f32m4(vq, vx2, __riscv_vfmv_v_f_f32m4(vbeta_4, n), n);
+    vq = __riscv_vfmadd_vv_f32m4(vq, vx2, __riscv_vfmv_v_f_f32m4(vbeta_2, n), n);
+    vq = __riscv_vfmadd_vv_f32m4(vq, vx2, __riscv_vfmv_v_f_f32m4(1.0f, n), n);
+
+    // Divide the numerator by the denominator.
+    vfloat32m4_t verf = __riscv_vfdiv_vv_f32m4(vp, vq, n);
+
+    // Add one to the rational interpolant, and multiply by 0.5 times the
+    // original input.
+    vfloat32m4_t vy = __riscv_vfadd_vf_f32m4(verf, 1.0f, n);
+    vy = __riscv_vfmul_vf_f32m4(vy, 0.5f, n);
+    vy = __riscv_vfmul_vv_f32m4(vy, vx_orig, n);
+
+    __riscv_vse32_v_f32m4(output, vy, n);
+    output += n;
+
+    batch -= n;
+  } while (batch != 0);
+}
diff --git a/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u8v.c b/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u8v.c
new file mode 100644
index 00000000000..417cc1f7501
--- /dev/null
+++ b/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u8v.c
@@ -0,0 +1,95 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vgelu/rvv-rational-12-10.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2024 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/vunary.h"
+
+
+void xnn_f32_vgelu_ukernel__rvv_rational_12_10_div_u8v(
+    size_t batch,
+    const float* input,
+    float* output,
+    const struct xnn_f32_default_params* unused_params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(input != NULL);
+  assert(output != NULL);
+
+  // Cap the inputs to this value as `erf(x/sqrt(2))` will always be `+/-1.0f`
+  // beyond this point. This value is chosen as the first floating point
+  // number as of which the interpolation returns +/-1.0f.
+  const float vmax_abs_x = 5.1638283730e+00f;
+
+  // The monomial coefficients of the numerator polynomial (odd).
+  const float valpha_1 = 7.9788452387e-01f;
+  const float valpha_3 = 6.6972173750e-02f;
+  const float valpha_5 = 9.3065137044e-03f;
+  const float valpha_7 = 3.2973114867e-04f;
+  const float valpha_9 = 1.2609783880e-05f;
+  const float valpha_11 = 4.5835321316e-08f;
+
+  // The monomial coefficients of the denominator polynomial (even).
+  const float vbeta_2 = 2.5060352683e-01f;
+  const float vbeta_4 = 2.8431978077e-02f;
+  const float vbeta_6 = 1.8622842617e-03f;
+  const float vbeta_8 = 7.2267655923e-05f;
+  const float vbeta_10 = 1.1988805682e-06f;
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t n = __riscv_vsetvl_e32m8(batch);
+
+    vfloat32m8_t vx_orig = __riscv_vle32_v_f32m8(input, n);
+    input += n;
+
+    // Clamp the inputs to the interpolation range.
+    vfloat32m8_t vx = __riscv_vfmin_vf_f32m8(vx_orig, vmax_abs_x, n);
+    vx = __riscv_vfmax_vf_f32m8(vx, -vmax_abs_x, n);
+
+    // Since the polynomials are odd/even, we need x^2.
+    vfloat32m8_t vx2 = __riscv_vfmul_vv_f32m8(vx, vx, n);
+
+    // Evaluate the numerator polynomial p.
+    vfloat32m8_t vp = __riscv_vfmv_v_f_f32m8(valpha_9, n);
+    vp = __riscv_vfmacc_vf_f32m8(vp, valpha_11, vx2, n);
+    vp = __riscv_vfmadd_vv_f32m8(vp, vx2, __riscv_vfmv_v_f_f32m8(valpha_7, n), n);
+    vp = __riscv_vfmadd_vv_f32m8(vp, vx2, __riscv_vfmv_v_f_f32m8(valpha_5, n), n);
+    vp = __riscv_vfmadd_vv_f32m8(vp, vx2, __riscv_vfmv_v_f_f32m8(valpha_3, n), n);
+    vp = __riscv_vfmadd_vv_f32m8(vp, vx2, __riscv_vfmv_v_f_f32m8(valpha_1, n), n);
+    vp = __riscv_vfmul_vv_f32m8(vp, vx, n);
+
+    // Evaluate the denominator polynomial q.
+    vfloat32m8_t vq = __riscv_vfmv_v_f_f32m8(vbeta_8, n);
+    vq = __riscv_vfmacc_vf_f32m8(vq, vbeta_10, vx2, n);
+    vq = __riscv_vfmadd_vv_f32m8(vq, vx2, __riscv_vfmv_v_f_f32m8(vbeta_6, n), n);
+    vq = __riscv_vfmadd_vv_f32m8(vq, vx2, __riscv_vfmv_v_f_f32m8(vbeta_4, n), n);
+    vq = __riscv_vfmadd_vv_f32m8(vq, vx2, __riscv_vfmv_v_f_f32m8(vbeta_2, n), n);
+    vq = __riscv_vfmadd_vv_f32m8(vq, vx2, __riscv_vfmv_v_f_f32m8(1.0f, n), n);
+
+    // Divide the numerator by the denominator.
+    vfloat32m8_t verf = __riscv_vfdiv_vv_f32m8(vp, vq, n);
+
+    // Add one to the rational interpolant, and multiply by 0.5 times the
+    // original input.
+    vfloat32m8_t vy = __riscv_vfadd_vf_f32m8(verf, 1.0f, n);
+    vy = __riscv_vfmul_vf_f32m8(vy, 0.5f, n);
+    vy = __riscv_vfmul_vv_f32m8(vy, vx_orig, n);
+
+    __riscv_vse32_v_f32m8(output, vy, n);
+    output += n;
+
+    batch -= n;
+  } while (batch != 0);
+}
diff --git a/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u1v.c b/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u1v.c
new file mode 100644
index 00000000000..1af619bf8a5
--- /dev/null
+++ b/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u1v.c
@@ -0,0 +1,102 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vgelu/rvv-rational-12-10.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2024 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/vunary.h"
+
+
+void xnn_f32_vgelu_ukernel__rvv_rational_12_10_nr_u1v(
+    size_t batch,
+    const float* input,
+    float* output,
+    const struct xnn_f32_default_params* unused_params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(input != NULL);
+  assert(output != NULL);
+
+  // Cap the inputs to this value as `erf(x/sqrt(2))` will always be `+/-1.0f`
+  // beyond this point. This value is chosen as the first floating point
+  // number as of which the interpolation returns +/-1.0f.
+  const float vmax_abs_x = 5.1164608002e+00f;
+
+  // The monomial coefficients of the numerator polynomial (odd).
+  const float valpha_1 = 7.9788452387e-01f;
+  const float valpha_3 = 6.6972173750e-02f;
+  const float valpha_5 = 9.3065137044e-03f;
+  const float valpha_7 = 3.2973114867e-04f;
+  const float valpha_9 = 1.2609783880e-05f;
+  const float valpha_11 = 4.5835321316e-08f;
+
+  // The monomial coefficients of the denominator polynomial (even).
+  const float vbeta_2 = 2.5060352683e-01f;
+  const float vbeta_4 = 2.8431978077e-02f;
+  const float vbeta_6 = 1.8622842617e-03f;
+  const float vbeta_8 = 7.2267655923e-05f;
+  const float vbeta_10 = 1.1988805682e-06f;
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t n = __riscv_vsetvl_e32m1(batch);
+
+    vfloat32m1_t vx_orig = __riscv_vle32_v_f32m1(input, n);
+    input += n;
+
+    // Clamp the inputs to the interpolation range.
+    vfloat32m1_t vx = __riscv_vfmin_vf_f32m1(vx_orig, vmax_abs_x, n);
+    vx = __riscv_vfmax_vf_f32m1(vx, -vmax_abs_x, n);
+
+    // Since the polynomials are odd/even, we need x^2.
+    vfloat32m1_t vx2 = __riscv_vfmul_vv_f32m1(vx, vx, n);
+
+    // Evaluate the numerator polynomial p.
+    vfloat32m1_t vp = __riscv_vfmv_v_f_f32m1(valpha_9, n);
+    vp = __riscv_vfmacc_vf_f32m1(vp, valpha_11, vx2, n);
+    vp = __riscv_vfmadd_vv_f32m1(vp, vx2, __riscv_vfmv_v_f_f32m1(valpha_7, n), n);
+    vp = __riscv_vfmadd_vv_f32m1(vp, vx2, __riscv_vfmv_v_f_f32m1(valpha_5, n), n);
+    vp = __riscv_vfmadd_vv_f32m1(vp, vx2, __riscv_vfmv_v_f_f32m1(valpha_3, n), n);
+    vp = __riscv_vfmadd_vv_f32m1(vp, vx2, __riscv_vfmv_v_f_f32m1(valpha_1, n), n);
+    vp = __riscv_vfmul_vv_f32m1(vp, vx, n);
+
+    // Evaluate the denominator polynomial q.
+    vfloat32m1_t vq = __riscv_vfmv_v_f_f32m1(vbeta_8, n);
+    vq = __riscv_vfmacc_vf_f32m1(vq, vbeta_10, vx2, n);
+    vq = __riscv_vfmadd_vv_f32m1(vq, vx2, __riscv_vfmv_v_f_f32m1(vbeta_6, n), n);
+    vq = __riscv_vfmadd_vv_f32m1(vq, vx2, __riscv_vfmv_v_f_f32m1(vbeta_4, n), n);
+    vq = __riscv_vfmadd_vv_f32m1(vq, vx2, __riscv_vfmv_v_f_f32m1(vbeta_2, n), n);
+    vq = __riscv_vfmadd_vv_f32m1(vq, vx2, __riscv_vfmv_v_f_f32m1(1.0f, n), n);
+
+    // Divide the numerator by the denominator.
+    // Newton-Raphson iteration for reciprocal.
+    vfloat32m1_t vrq = __riscv_vfrec7_v_f32m1(vq, n);
+    for (size_t iter = 0; iter < 2; iter++) {
+      vfloat32m1_t verr = __riscv_vfnmsac_vv_f32m1(
+          __riscv_vfmv_v_f_f32m1(2.0f, n), vrq, vq, n);
+      vrq = __riscv_vfmul_vv_f32m1(vrq, verr, n);
+    }
+    vfloat32m1_t verf = __riscv_vfmul_vv_f32m1(vp, vrq, n);
+
+    // Add one to the rational interpolant, and multiply by 0.5 times the
+    // original input.
+    vfloat32m1_t vy = __riscv_vfadd_vf_f32m1(verf, 1.0f, n);
+    vy = __riscv_vfmul_vf_f32m1(vy, 0.5f, n);
+    vy = __riscv_vfmul_vv_f32m1(vy, vx_orig, n);
+
+    __riscv_vse32_v_f32m1(output, vy, n);
+    output += n;
+
+    batch -= n;
+  } while (batch != 0);
+}
diff --git a/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u2v.c b/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u2v.c
new file mode 100644
index 00000000000..a29c5a312d3
--- /dev/null
+++ b/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u2v.c
@@ -0,0 +1,102 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vgelu/rvv-rational-12-10.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2024 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/vunary.h"
+
+
+void xnn_f32_vgelu_ukernel__rvv_rational_12_10_nr_u2v(
+    size_t batch,
+    const float* input,
+    float* output,
+    const struct xnn_f32_default_params* unused_params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(input != NULL);
+  assert(output != NULL);
+
+  // Cap the inputs to this value as `erf(x/sqrt(2))` will always be `+/-1.0f`
+  // beyond this point. This value is chosen as the first floating point
+  // number as of which the interpolation returns +/-1.0f.
+  const float vmax_abs_x = 5.1164608002e+00f;
+
+  // The monomial coefficients of the numerator polynomial (odd).
+  const float valpha_1 = 7.9788452387e-01f;
+  const float valpha_3 = 6.6972173750e-02f;
+  const float valpha_5 = 9.3065137044e-03f;
+  const float valpha_7 = 3.2973114867e-04f;
+  const float valpha_9 = 1.2609783880e-05f;
+  const float valpha_11 = 4.5835321316e-08f;
+
+  // The monomial coefficients of the denominator polynomial (even).
+  const float vbeta_2 = 2.5060352683e-01f;
+  const float vbeta_4 = 2.8431978077e-02f;
+  const float vbeta_6 = 1.8622842617e-03f;
+  const float vbeta_8 = 7.2267655923e-05f;
+  const float vbeta_10 = 1.1988805682e-06f;
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t n = __riscv_vsetvl_e32m2(batch);
+
+    vfloat32m2_t vx_orig = __riscv_vle32_v_f32m2(input, n);
+    input += n;
+
+    // Clamp the inputs to the interpolation range.
+    vfloat32m2_t vx = __riscv_vfmin_vf_f32m2(vx_orig, vmax_abs_x, n);
+    vx = __riscv_vfmax_vf_f32m2(vx, -vmax_abs_x, n);
+
+    // Since the polynomials are odd/even, we need x^2.
+    vfloat32m2_t vx2 = __riscv_vfmul_vv_f32m2(vx, vx, n);
+
+    // Evaluate the numerator polynomial p.
+    vfloat32m2_t vp = __riscv_vfmv_v_f_f32m2(valpha_9, n);
+    vp = __riscv_vfmacc_vf_f32m2(vp, valpha_11, vx2, n);
+    vp = __riscv_vfmadd_vv_f32m2(vp, vx2, __riscv_vfmv_v_f_f32m2(valpha_7, n), n);
+    vp = __riscv_vfmadd_vv_f32m2(vp, vx2, __riscv_vfmv_v_f_f32m2(valpha_5, n), n);
+    vp = __riscv_vfmadd_vv_f32m2(vp, vx2, __riscv_vfmv_v_f_f32m2(valpha_3, n), n);
+    vp = __riscv_vfmadd_vv_f32m2(vp, vx2, __riscv_vfmv_v_f_f32m2(valpha_1, n), n);
+    vp = __riscv_vfmul_vv_f32m2(vp, vx, n);
+
+    // Evaluate the denominator polynomial q.
+    vfloat32m2_t vq = __riscv_vfmv_v_f_f32m2(vbeta_8, n);
+    vq = __riscv_vfmacc_vf_f32m2(vq, vbeta_10, vx2, n);
+    vq = __riscv_vfmadd_vv_f32m2(vq, vx2, __riscv_vfmv_v_f_f32m2(vbeta_6, n), n);
+    vq = __riscv_vfmadd_vv_f32m2(vq, vx2, __riscv_vfmv_v_f_f32m2(vbeta_4, n), n);
+    vq = __riscv_vfmadd_vv_f32m2(vq, vx2, __riscv_vfmv_v_f_f32m2(vbeta_2, n), n);
+    vq = __riscv_vfmadd_vv_f32m2(vq, vx2, __riscv_vfmv_v_f_f32m2(1.0f, n), n);
+
+    // Divide the numerator by the denominator.
+    // Newton-Raphson iteration for reciprocal.
+    vfloat32m2_t vrq = __riscv_vfrec7_v_f32m2(vq, n);
+    for (size_t iter = 0; iter < 2; iter++) {
+      vfloat32m2_t verr = __riscv_vfnmsac_vv_f32m2(
+          __riscv_vfmv_v_f_f32m2(2.0f, n), vrq, vq, n);
+      vrq = __riscv_vfmul_vv_f32m2(vrq, verr, n);
+    }
+    vfloat32m2_t verf = __riscv_vfmul_vv_f32m2(vp, vrq, n);
+
+    // Add one to the rational interpolant, and multiply by 0.5 times the
+    // original input.
+    vfloat32m2_t vy = __riscv_vfadd_vf_f32m2(verf, 1.0f, n);
+    vy = __riscv_vfmul_vf_f32m2(vy, 0.5f, n);
+    vy = __riscv_vfmul_vv_f32m2(vy, vx_orig, n);
+
+    __riscv_vse32_v_f32m2(output, vy, n);
+    output += n;
+
+    batch -= n;
+  } while (batch != 0);
+}
diff --git a/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u4v.c b/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u4v.c
new file mode 100644
index 00000000000..70c3f8d0e53
--- /dev/null
+++ b/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u4v.c
@@ -0,0 +1,102 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vgelu/rvv-rational-12-10.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2024 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/vunary.h"
+
+
+void xnn_f32_vgelu_ukernel__rvv_rational_12_10_nr_u4v(
+    size_t batch,
+    const float* input,
+    float* output,
+    const struct xnn_f32_default_params* unused_params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(input != NULL);
+  assert(output != NULL);
+
+  // Cap the inputs to this value as `erf(x/sqrt(2))` will always be `+/-1.0f`
+  // beyond this point. This value is chosen as the first floating point
+  // number as of which the interpolation returns +/-1.0f.
+  const float vmax_abs_x = 5.1164608002e+00f;
+
+  // The monomial coefficients of the numerator polynomial (odd).
+  const float valpha_1 = 7.9788452387e-01f;
+  const float valpha_3 = 6.6972173750e-02f;
+  const float valpha_5 = 9.3065137044e-03f;
+  const float valpha_7 = 3.2973114867e-04f;
+  const float valpha_9 = 1.2609783880e-05f;
+  const float valpha_11 = 4.5835321316e-08f;
+
+  // The monomial coefficients of the denominator polynomial (even).
+  const float vbeta_2 = 2.5060352683e-01f;
+  const float vbeta_4 = 2.8431978077e-02f;
+  const float vbeta_6 = 1.8622842617e-03f;
+  const float vbeta_8 = 7.2267655923e-05f;
+  const float vbeta_10 = 1.1988805682e-06f;
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t n = __riscv_vsetvl_e32m4(batch);
+
+    vfloat32m4_t vx_orig = __riscv_vle32_v_f32m4(input, n);
+    input += n;
+
+    // Clamp the inputs to the interpolation range.
+    vfloat32m4_t vx = __riscv_vfmin_vf_f32m4(vx_orig, vmax_abs_x, n);
+    vx = __riscv_vfmax_vf_f32m4(vx, -vmax_abs_x, n);
+
+    // Since the polynomials are odd/even, we need x^2.
+    vfloat32m4_t vx2 = __riscv_vfmul_vv_f32m4(vx, vx, n);
+
+    // Evaluate the numerator polynomial p.
+    vfloat32m4_t vp = __riscv_vfmv_v_f_f32m4(valpha_9, n);
+    vp = __riscv_vfmacc_vf_f32m4(vp, valpha_11, vx2, n);
+    vp = __riscv_vfmadd_vv_f32m4(vp, vx2, __riscv_vfmv_v_f_f32m4(valpha_7, n), n);
+    vp = __riscv_vfmadd_vv_f32m4(vp, vx2, __riscv_vfmv_v_f_f32m4(valpha_5, n), n);
+    vp = __riscv_vfmadd_vv_f32m4(vp, vx2, __riscv_vfmv_v_f_f32m4(valpha_3, n), n);
+    vp = __riscv_vfmadd_vv_f32m4(vp, vx2, __riscv_vfmv_v_f_f32m4(valpha_1, n), n);
+    vp = __riscv_vfmul_vv_f32m4(vp, vx, n);
+
+    // Evaluate the denominator polynomial q.
+    vfloat32m4_t vq = __riscv_vfmv_v_f_f32m4(vbeta_8, n);
+    vq = __riscv_vfmacc_vf_f32m4(vq, vbeta_10, vx2, n);
+    vq = __riscv_vfmadd_vv_f32m4(vq, vx2, __riscv_vfmv_v_f_f32m4(vbeta_6, n), n);
+    vq = __riscv_vfmadd_vv_f32m4(vq, vx2, __riscv_vfmv_v_f_f32m4(vbeta_4, n), n);
+    vq = __riscv_vfmadd_vv_f32m4(vq, vx2, __riscv_vfmv_v_f_f32m4(vbeta_2, n), n);
+    vq = __riscv_vfmadd_vv_f32m4(vq, vx2, __riscv_vfmv_v_f_f32m4(1.0f, n), n);
+
+    // Divide the numerator by the denominator.
+    // Newton-Raphson iteration for reciprocal.
+    vfloat32m4_t vrq = __riscv_vfrec7_v_f32m4(vq, n);
+    for (size_t iter = 0; iter < 2; iter++) {
+      vfloat32m4_t verr = __riscv_vfnmsac_vv_f32m4(
+          __riscv_vfmv_v_f_f32m4(2.0f, n), vrq, vq, n);
+      vrq = __riscv_vfmul_vv_f32m4(vrq, verr, n);
+    }
+    vfloat32m4_t verf = __riscv_vfmul_vv_f32m4(vp, vrq, n);
+
+    // Add one to the rational interpolant, and multiply by 0.5 times the
+    // original input.
+    vfloat32m4_t vy = __riscv_vfadd_vf_f32m4(verf, 1.0f, n);
+    vy = __riscv_vfmul_vf_f32m4(vy, 0.5f, n);
+    vy = __riscv_vfmul_vv_f32m4(vy, vx_orig, n);
+
+    __riscv_vse32_v_f32m4(output, vy, n);
+    output += n;
+
+    batch -= n;
+  } while (batch != 0);
+}
diff --git a/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u8v.c b/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u8v.c
new file mode 100644
index 00000000000..b8d62ab0cb4
--- /dev/null
+++ b/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u8v.c
@@ -0,0 +1,102 @@
+// clang-format off
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vgelu/rvv-rational-12-10.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2024 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/vunary.h"
+
+
+void xnn_f32_vgelu_ukernel__rvv_rational_12_10_nr_u8v(
+    size_t batch,
+    const float* input,
+    float* output,
+    const struct xnn_f32_default_params* unused_params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(input != NULL);
+  assert(output != NULL);
+
+  // Cap the inputs to this value as `erf(x/sqrt(2))` will always be `+/-1.0f`
+  // beyond this point. This value is chosen as the first floating point
+  // number as of which the interpolation returns +/-1.0f.
+  const float vmax_abs_x = 5.1164608002e+00f;
+
+  // The monomial coefficients of the numerator polynomial (odd).
+  const float valpha_1 = 7.9788452387e-01f;
+  const float valpha_3 = 6.6972173750e-02f;
+  const float valpha_5 = 9.3065137044e-03f;
+  const float valpha_7 = 3.2973114867e-04f;
+  const float valpha_9 = 1.2609783880e-05f;
+  const float valpha_11 = 4.5835321316e-08f;
+
+  // The monomial coefficients of the denominator polynomial (even).
+  const float vbeta_2 = 2.5060352683e-01f;
+  const float vbeta_4 = 2.8431978077e-02f;
+  const float vbeta_6 = 1.8622842617e-03f;
+  const float vbeta_8 = 7.2267655923e-05f;
+  const float vbeta_10 = 1.1988805682e-06f;
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t n = __riscv_vsetvl_e32m8(batch);
+
+    vfloat32m8_t vx_orig = __riscv_vle32_v_f32m8(input, n);
+    input += n;
+
+    // Clamp the inputs to the interpolation range.
+    vfloat32m8_t vx = __riscv_vfmin_vf_f32m8(vx_orig, vmax_abs_x, n);
+    vx = __riscv_vfmax_vf_f32m8(vx, -vmax_abs_x, n);
+
+    // Since the polynomials are odd/even, we need x^2.
+    vfloat32m8_t vx2 = __riscv_vfmul_vv_f32m8(vx, vx, n);
+
+    // Evaluate the numerator polynomial p.
+    vfloat32m8_t vp = __riscv_vfmv_v_f_f32m8(valpha_9, n);
+    vp = __riscv_vfmacc_vf_f32m8(vp, valpha_11, vx2, n);
+    vp = __riscv_vfmadd_vv_f32m8(vp, vx2, __riscv_vfmv_v_f_f32m8(valpha_7, n), n);
+    vp = __riscv_vfmadd_vv_f32m8(vp, vx2, __riscv_vfmv_v_f_f32m8(valpha_5, n), n);
+    vp = __riscv_vfmadd_vv_f32m8(vp, vx2, __riscv_vfmv_v_f_f32m8(valpha_3, n), n);
+    vp = __riscv_vfmadd_vv_f32m8(vp, vx2, __riscv_vfmv_v_f_f32m8(valpha_1, n), n);
+    vp = __riscv_vfmul_vv_f32m8(vp, vx, n);
+
+    // Evaluate the denominator polynomial q.
+    vfloat32m8_t vq = __riscv_vfmv_v_f_f32m8(vbeta_8, n);
+    vq = __riscv_vfmacc_vf_f32m8(vq, vbeta_10, vx2, n);
+    vq = __riscv_vfmadd_vv_f32m8(vq, vx2, __riscv_vfmv_v_f_f32m8(vbeta_6, n), n);
+    vq = __riscv_vfmadd_vv_f32m8(vq, vx2, __riscv_vfmv_v_f_f32m8(vbeta_4, n), n);
+    vq = __riscv_vfmadd_vv_f32m8(vq, vx2, __riscv_vfmv_v_f_f32m8(vbeta_2, n), n);
+    vq = __riscv_vfmadd_vv_f32m8(vq, vx2, __riscv_vfmv_v_f_f32m8(1.0f, n), n);
+
+    // Divide the numerator by the denominator.
+    // Newton-Raphson iteration for reciprocal.
+    vfloat32m8_t vrq = __riscv_vfrec7_v_f32m8(vq, n);
+    for (size_t iter = 0; iter < 2; iter++) {
+      vfloat32m8_t verr = __riscv_vfnmsac_vv_f32m8(
+          __riscv_vfmv_v_f_f32m8(2.0f, n), vrq, vq, n);
+      vrq = __riscv_vfmul_vv_f32m8(vrq, verr, n);
+    }
+    vfloat32m8_t verf = __riscv_vfmul_vv_f32m8(vp, vrq, n);
+
+    // Add one to the rational interpolant, and multiply by 0.5 times the
+    // original input.
+    vfloat32m8_t vy = __riscv_vfadd_vf_f32m8(verf, 1.0f, n);
+    vy = __riscv_vfmul_vf_f32m8(vy, 0.5f, n);
+    vy = __riscv_vfmul_vv_f32m8(vy, vx_orig, n);
+
+    __riscv_vse32_v_f32m8(output, vy, n);
+    output += n;
+
+    batch -= n;
+  } while (batch != 0);
+}
diff --git a/src/f32-vgelu/rvv-rational-12-10.c.in b/src/f32-vgelu/rvv-rational-12-10.c.in
new file mode 100644
index 00000000000..d38c4565697
--- /dev/null
+++ b/src/f32-vgelu/rvv-rational-12-10.c.in
@@ -0,0 +1,105 @@
+// Copyright 2024 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert LMUL in [1, 2, 4, 8]
+$assert DIV in ("DIV", "NR")
+#include <assert.h>
+
+#include <riscv_vector.h>
+
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/vunary.h"
+
+
+void xnn_f32_vgelu_ukernel__rvv_rational_12_10_${DIV.lower()}_u${LMUL}v(
+    size_t batch,
+    const float* input,
+    float* output,
+    const struct xnn_f32_default_params* unused_params)
+{
+  assert(batch != 0);
+  assert(batch % sizeof(float) == 0);
+  assert(input != NULL);
+  assert(output != NULL);
+
+  // Cap the inputs to this value as `erf(x/sqrt(2))` will always be `+/-1.0f`
+  // beyond this point. This value is chosen as the first floating point
+  // number as of which the interpolation returns +/-1.0f.
+  $if DIV == "NR":
+    const float vmax_abs_x = 5.1164608002e+00f;
+  $else:
+    const float vmax_abs_x = 5.1638283730e+00f;
+
+  // The monomial coefficients of the numerator polynomial (odd).
+  const float valpha_1 = 7.9788452387e-01f;
+  const float valpha_3 = 6.6972173750e-02f;
+  const float valpha_5 = 9.3065137044e-03f;
+  const float valpha_7 = 3.2973114867e-04f;
+  const float valpha_9 = 1.2609783880e-05f;
+  const float valpha_11 = 4.5835321316e-08f;
+
+  // The monomial coefficients of the denominator polynomial (even).
+  const float vbeta_2 = 2.5060352683e-01f;
+  const float vbeta_4 = 2.8431978077e-02f;
+  const float vbeta_6 = 1.8622842617e-03f;
+  const float vbeta_8 = 7.2267655923e-05f;
+  const float vbeta_10 = 1.1988805682e-06f;
+
+  batch >>= XNN_LOG2_SIZEOF_FLOAT;
+  do {
+    const size_t n = __riscv_vsetvl_e32m${LMUL}(batch);
+
+    vfloat32m${LMUL}_t vx_orig = __riscv_vle32_v_f32m${LMUL}(input, n);
+    input += n;
+
+    // Clamp the inputs to the interpolation range.
+    vfloat32m${LMUL}_t vx = __riscv_vfmin_vf_f32m${LMUL}(vx_orig, vmax_abs_x, n);
+    vx = __riscv_vfmax_vf_f32m${LMUL}(vx, -vmax_abs_x, n);
+
+    // Since the polynomials are odd/even, we need x^2.
+    vfloat32m${LMUL}_t vx2 = __riscv_vfmul_vv_f32m${LMUL}(vx, vx, n);
+
+    // Evaluate the numerator polynomial p.
+    vfloat32m${LMUL}_t vp = __riscv_vfmv_v_f_f32m${LMUL}(valpha_9, n);
+    vp = __riscv_vfmacc_vf_f32m${LMUL}(vp, valpha_11, vx2, n);
+    vp = __riscv_vfmadd_vv_f32m${LMUL}(vp, vx2, __riscv_vfmv_v_f_f32m${LMUL}(valpha_7, n), n);
+    vp = __riscv_vfmadd_vv_f32m${LMUL}(vp, vx2, __riscv_vfmv_v_f_f32m${LMUL}(valpha_5, n), n);
+    vp = __riscv_vfmadd_vv_f32m${LMUL}(vp, vx2, __riscv_vfmv_v_f_f32m${LMUL}(valpha_3, n), n);
+    vp = __riscv_vfmadd_vv_f32m${LMUL}(vp, vx2, __riscv_vfmv_v_f_f32m${LMUL}(valpha_1, n), n);
+    vp = __riscv_vfmul_vv_f32m${LMUL}(vp, vx, n);
+
+    // Evaluate the denominator polynomial q.
+    vfloat32m${LMUL}_t vq = __riscv_vfmv_v_f_f32m${LMUL}(vbeta_8, n);
+    vq = __riscv_vfmacc_vf_f32m${LMUL}(vq, vbeta_10, vx2, n);
+    vq = __riscv_vfmadd_vv_f32m${LMUL}(vq, vx2, __riscv_vfmv_v_f_f32m${LMUL}(vbeta_6, n), n);
+    vq = __riscv_vfmadd_vv_f32m${LMUL}(vq, vx2, __riscv_vfmv_v_f_f32m${LMUL}(vbeta_4, n), n);
+    vq = __riscv_vfmadd_vv_f32m${LMUL}(vq, vx2, __riscv_vfmv_v_f_f32m${LMUL}(vbeta_2, n), n);
+    vq = __riscv_vfmadd_vv_f32m${LMUL}(vq, vx2, __riscv_vfmv_v_f_f32m${LMUL}(1.0f, n), n);
+
+    // Divide the numerator by the denominator.
+    $if DIV == "DIV":
+      vfloat32m${LMUL}_t verf = __riscv_vfdiv_vv_f32m${LMUL}(vp, vq, n);
+    $else:
+      // Newton-Raphson iteration for reciprocal.
+      vfloat32m${LMUL}_t vrq = __riscv_vfrec7_v_f32m${LMUL}(vq, n);
+      for (size_t iter = 0; iter < 2; iter++) {
+        vfloat32m${LMUL}_t verr = __riscv_vfnmsac_vv_f32m${LMUL}(
+            __riscv_vfmv_v_f_f32m${LMUL}(2.0f, n), vrq, vq, n);
+        vrq = __riscv_vfmul_vv_f32m${LMUL}(vrq, verr, n);
+      }
+      vfloat32m${LMUL}_t verf = __riscv_vfmul_vv_f32m${LMUL}(vp, vrq, n);
+
+    // Add one to the rational interpolant, and multiply by 0.5 times the
+    // original input.
+    vfloat32m${LMUL}_t vy = __riscv_vfadd_vf_f32m${LMUL}(verf, 1.0f, n);
+    vy = __riscv_vfmul_vf_f32m${LMUL}(vy, 0.5f, n);
+    vy = __riscv_vfmul_vv_f32m${LMUL}(vy, vx_orig, n);
+
+    __riscv_vse32_v_f32m${LMUL}(output, vy, n);
+    output += n;
+
+    batch -= n;
+  } while (batch != 0);
+}

From ae231328e5a16ff989ae8dbde7d1f2e48a372dd5 Mon Sep 17 00:00:00 2001
From: velonica0 <like@mail.nankai.edu.cn>
Date: Wed, 15 Apr 2026 09:32:22 +0800
Subject: [PATCH 2/3] Alphabetize RVV elementwise entries in cmake/bzl lists

Move f32-velu, f32-vgelu, f32-vapproxgelu entries from end of
PROD/NON_PROD lists to alphabetical positions to avoid merge conflicts
with other RVV PRs.
---
 cmake/gen/rvv_microkernels.cmake | 40 ++++++++++++++++----------------
 gen/rvv_microkernels.bzl         | 40 ++++++++++++++++----------------
 2 files changed, 40 insertions(+), 40 deletions(-)

diff --git a/cmake/gen/rvv_microkernels.cmake b/cmake/gen/rvv_microkernels.cmake
index f33a3a3a92b..55dda525b9d 100644
--- a/cmake/gen/rvv_microkernels.cmake
+++ b/cmake/gen/rvv_microkernels.cmake
@@ -43,6 +43,7 @@ SET(PROD_RVV_MICROKERNEL_SRCS
   src/f32-spmm/gen/f32-spmm-4vx4-minmax-rvv.c
   src/f32-spmm/gen/f32-spmm-8vx1-minmax-rvv.c
   src/f32-spmm/gen/f32-spmm-8vx2-minmax-rvv.c
+  src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u4v.c
   src/f32-vbinary/gen/f32-vadd-rvv-u8v.c
   src/f32-vbinary/gen/f32-vaddc-rvv-u8v.c
   src/f32-vbinary/gen/f32-vdiv-rvv-u8v.c
@@ -69,6 +70,8 @@ SET(PROD_RVV_MICROKERNEL_SRCS
   src/f32-vcopysign/gen/f32-vrcopysignc-rvv-u8v.c
   src/f32-vcos/gen/f32-vcos-rvv-rational-5-4-div-u8v.c
   src/f32-vexp/gen/f32-vexp-rvv-rational-3-2-div-u8v.c
+  src/f32-velu/gen/f32-velu-rvv-rr2-p6-u4v.c
+  src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u4v.c
   src/f32-vhswish/gen/f32-vhswish-rvv-u4v.c
   src/f32-vlog/gen/f32-vlog-rvv-rational-3-3-div-u8v.c
   src/f32-vlrelu/gen/f32-vlrelu-rvv-u4v.c
@@ -143,9 +146,6 @@ SET(PROD_RVV_MICROKERNEL_SRCS
   src/x32-transposec/gen/x32-transposec-8x8-rvv.c
   src/x32-transposec/gen/x32-transposec-16x8-rvv.c
   src/x32-transposec/gen/x32-transposec-32x8-rvv.c
-  src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u4v.c
-  src/f32-velu/gen/f32-velu-rvv-rr2-p6-u4v.c
-  src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u4v.c)
 
 SET(NON_PROD_RVV_MICROKERNEL_SRCS
   src/f32-conv-hwc2chw/f32-conv-hwc2chw-3x3s2p1c3x2v-rvv-1x1.c
@@ -201,6 +201,13 @@ SET(NON_PROD_RVV_MICROKERNEL_SRCS
   src/f32-spmm/gen/f32-spmm-4vx1-minmax-rvv.c
   src/f32-spmm/gen/f32-spmm-4vx2-minmax-rvv.c
   src/f32-spmm/gen/f32-spmm-8vx4-minmax-rvv.c
+  src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u1v.c
+  src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u2v.c
+  src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u8v.c
+  src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u1v.c
+  src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u2v.c
+  src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u4v.c
+  src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u8v.c
   src/f32-vbinary/gen/f32-vadd-rvv-u4v.c
   src/f32-vbinary/gen/f32-vaddc-rvv-u4v.c
   src/f32-vbinary/gen/f32-vdiv-rvv-u4v.c
@@ -234,6 +241,16 @@ SET(NON_PROD_RVV_MICROKERNEL_SRCS
   src/f32-vexp/gen/f32-vexp-rvv-rational-3-2-div-u1v.c
   src/f32-vexp/gen/f32-vexp-rvv-rational-3-2-div-u2v.c
   src/f32-vexp/gen/f32-vexp-rvv-rational-3-2-div-u4v.c
+  src/f32-velu/gen/f32-velu-rvv-rr2-p6-u1v.c
+  src/f32-velu/gen/f32-velu-rvv-rr2-p6-u2v.c
+  src/f32-velu/gen/f32-velu-rvv-rr2-p6-u8v.c
+  src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u1v.c
+  src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u2v.c
+  src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u8v.c
+  src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u1v.c
+  src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u2v.c
+  src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u4v.c
+  src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u8v.c
   src/f32-vhswish/gen/f32-vhswish-rvv-u1v.c
   src/f32-vhswish/gen/f32-vhswish-rvv-u2v.c
   src/f32-vhswish/gen/f32-vhswish-rvv-u8v.c
@@ -337,22 +354,5 @@ SET(NON_PROD_RVV_MICROKERNEL_SRCS
   src/x32-packw/gen/x32-packw-x8v-gemm-goi-rvv-u2.c
   src/x32-packw/gen/x32-packw-x8v-gemm-goi-rvv-u4.c
   src/x32-packw/gen/x32-packw-x8v-gemm-goi-rvv-u8.c
-  src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u1v.c
-  src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u2v.c
-  src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u8v.c
-  src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u1v.c
-  src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u2v.c
-  src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u4v.c
-  src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u8v.c
-  src/f32-velu/gen/f32-velu-rvv-rr2-p6-u1v.c
-  src/f32-velu/gen/f32-velu-rvv-rr2-p6-u2v.c
-  src/f32-velu/gen/f32-velu-rvv-rr2-p6-u8v.c
-  src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u1v.c
-  src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u2v.c
-  src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u8v.c
-  src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u1v.c
-  src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u2v.c
-  src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u4v.c
-  src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u8v.c)
 
 SET(ALL_RVV_MICROKERNEL_SRCS ${PROD_RVV_MICROKERNEL_SRCS} + ${NON_PROD_RVV_MICROKERNEL_SRCS})
diff --git a/gen/rvv_microkernels.bzl b/gen/rvv_microkernels.bzl
index e86c895b66b..cfe6c5daa14 100644
--- a/gen/rvv_microkernels.bzl
+++ b/gen/rvv_microkernels.bzl
@@ -39,6 +39,7 @@ PROD_RVV_MICROKERNEL_SRCS = [
     "src/f32-spmm/gen/f32-spmm-4vx4-minmax-rvv.c",
     "src/f32-spmm/gen/f32-spmm-8vx1-minmax-rvv.c",
     "src/f32-spmm/gen/f32-spmm-8vx2-minmax-rvv.c",
+    "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u4v.c",
     "src/f32-vbinary/gen/f32-vadd-rvv-u8v.c",
     "src/f32-vbinary/gen/f32-vaddc-rvv-u8v.c",
     "src/f32-vbinary/gen/f32-vdiv-rvv-u8v.c",
@@ -65,6 +66,8 @@ PROD_RVV_MICROKERNEL_SRCS = [
     "src/f32-vcopysign/gen/f32-vrcopysignc-rvv-u8v.c",
     "src/f32-vcos/gen/f32-vcos-rvv-rational-5-4-div-u8v.c",
     "src/f32-vexp/gen/f32-vexp-rvv-rational-3-2-div-u8v.c",
+    "src/f32-velu/gen/f32-velu-rvv-rr2-p6-u4v.c",
+    "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u4v.c",
     "src/f32-vhswish/gen/f32-vhswish-rvv-u4v.c",
     "src/f32-vlog/gen/f32-vlog-rvv-rational-3-3-div-u8v.c",
     "src/f32-vlrelu/gen/f32-vlrelu-rvv-u4v.c",
@@ -139,9 +142,6 @@ PROD_RVV_MICROKERNEL_SRCS = [
     "src/x32-transposec/gen/x32-transposec-8x8-rvv.c",
     "src/x32-transposec/gen/x32-transposec-16x8-rvv.c",
     "src/x32-transposec/gen/x32-transposec-32x8-rvv.c",
-    "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u4v.c",
-    "src/f32-velu/gen/f32-velu-rvv-rr2-p6-u4v.c",
-    "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u4v.c",
 ]
 
 NON_PROD_RVV_MICROKERNEL_SRCS = [
@@ -198,6 +198,13 @@ NON_PROD_RVV_MICROKERNEL_SRCS = [
     "src/f32-spmm/gen/f32-spmm-4vx1-minmax-rvv.c",
     "src/f32-spmm/gen/f32-spmm-4vx2-minmax-rvv.c",
     "src/f32-spmm/gen/f32-spmm-8vx4-minmax-rvv.c",
+    "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u1v.c",
+    "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u2v.c",
+    "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u8v.c",
+    "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u1v.c",
+    "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u2v.c",
+    "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u4v.c",
+    "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u8v.c",
     "src/f32-vbinary/gen/f32-vadd-rvv-u4v.c",
     "src/f32-vbinary/gen/f32-vaddc-rvv-u4v.c",
     "src/f32-vbinary/gen/f32-vdiv-rvv-u4v.c",
@@ -231,6 +238,16 @@ NON_PROD_RVV_MICROKERNEL_SRCS = [
     "src/f32-vexp/gen/f32-vexp-rvv-rational-3-2-div-u1v.c",
     "src/f32-vexp/gen/f32-vexp-rvv-rational-3-2-div-u2v.c",
     "src/f32-vexp/gen/f32-vexp-rvv-rational-3-2-div-u4v.c",
+    "src/f32-velu/gen/f32-velu-rvv-rr2-p6-u1v.c",
+    "src/f32-velu/gen/f32-velu-rvv-rr2-p6-u2v.c",
+    "src/f32-velu/gen/f32-velu-rvv-rr2-p6-u8v.c",
+    "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u1v.c",
+    "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u2v.c",
+    "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u8v.c",
+    "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u1v.c",
+    "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u2v.c",
+    "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u4v.c",
+    "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u8v.c",
     "src/f32-vhswish/gen/f32-vhswish-rvv-u1v.c",
     "src/f32-vhswish/gen/f32-vhswish-rvv-u2v.c",
     "src/f32-vhswish/gen/f32-vhswish-rvv-u8v.c",
@@ -334,23 +351,6 @@ NON_PROD_RVV_MICROKERNEL_SRCS = [
     "src/x32-packw/gen/x32-packw-x8v-gemm-goi-rvv-u2.c",
     "src/x32-packw/gen/x32-packw-x8v-gemm-goi-rvv-u4.c",
     "src/x32-packw/gen/x32-packw-x8v-gemm-goi-rvv-u8.c",
-    "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u1v.c",
-    "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u2v.c",
-    "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u8v.c",
-    "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u1v.c",
-    "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u2v.c",
-    "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u4v.c",
-    "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u8v.c",
-    "src/f32-velu/gen/f32-velu-rvv-rr2-p6-u1v.c",
-    "src/f32-velu/gen/f32-velu-rvv-rr2-p6-u2v.c",
-    "src/f32-velu/gen/f32-velu-rvv-rr2-p6-u8v.c",
-    "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u1v.c",
-    "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u2v.c",
-    "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u8v.c",
-    "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u1v.c",
-    "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u2v.c",
-    "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u4v.c",
-    "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u8v.c",
 ]
 
 ALL_RVV_MICROKERNEL_SRCS = PROD_RVV_MICROKERNEL_SRCS + NON_PROD_RVV_MICROKERNEL_SRCS

From 0b6f61afd92adefbee1d0f8dffa36cb67558be13 Mon Sep 17 00:00:00 2001
From: velonica0 <like@mail.nankai.edu.cn>
Date: Mon, 20 Apr 2026 10:34:19 +0800
Subject: [PATCH 3/3] fix cmake bug

---
 cmake/gen/rvv_microkernels.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/gen/rvv_microkernels.cmake b/cmake/gen/rvv_microkernels.cmake
index 55dda525b9d..2491a629887 100644
--- a/cmake/gen/rvv_microkernels.cmake
+++ b/cmake/gen/rvv_microkernels.cmake
@@ -145,7 +145,7 @@ SET(PROD_RVV_MICROKERNEL_SRCS
   src/x32-transposec/gen/x32-transposec-4x4-rvv.c
   src/x32-transposec/gen/x32-transposec-8x8-rvv.c
   src/x32-transposec/gen/x32-transposec-16x8-rvv.c
-  src/x32-transposec/gen/x32-transposec-32x8-rvv.c
+  src/x32-transposec/gen/x32-transposec-32x8-rvv.c)
 
 SET(NON_PROD_RVV_MICROKERNEL_SRCS
   src/f32-conv-hwc2chw/f32-conv-hwc2chw-3x3s2p1c3x2v-rvv-1x1.c
@@ -353,6 +353,6 @@ SET(NON_PROD_RVV_MICROKERNEL_SRCS
   src/x32-packw/gen/x32-packw-x4v-gemm-goi-rvv-u4.c
   src/x32-packw/gen/x32-packw-x8v-gemm-goi-rvv-u2.c
   src/x32-packw/gen/x32-packw-x8v-gemm-goi-rvv-u4.c
-  src/x32-packw/gen/x32-packw-x8v-gemm-goi-rvv-u8.c
+  src/x32-packw/gen/x32-packw-x8v-gemm-goi-rvv-u8.c)
 
 SET(ALL_RVV_MICROKERNEL_SRCS ${PROD_RVV_MICROKERNEL_SRCS} + ${NON_PROD_RVV_MICROKERNEL_SRCS})