From d3121ea9dde74fecd650a9eaa4e290fc411117b9 Mon Sep 17 00:00:00 2001 From: velonica0 Date: Tue, 14 Apr 2026 09:51:44 +0800 Subject: [PATCH 1/3] [RVV] add rvv f32 kernels for velu, vgelu, vapproxgelu Add RVV kernels for three elementwise activation ops: - f32-velu (ELU) - rr2_p6 polynomial approximation - f32-vgelu (GELU) - rational 12/10 approximation (div + nr variants) - f32-vapproxgelu - rational 12/10 approximation (div + nr variants) Tested on SpacemiT K1 CPU and K3 CPU, both VLEN=256. --- cmake/gen/rvv_microkernels.cmake | 24 +++- gen/rvv_microkernels.bzl | 20 ++++ scripts/generate-f32-vapproxgelu.sh | 10 ++ scripts/generate-f32-velu.sh | 6 + scripts/generate-f32-vgelu.sh | 10 ++ src/configs/unary-elementwise-config.c | 43 ++++++- src/f32-vapproxgelu/f32-vapproxgelu.inc | 7 ++ ...2-vapproxgelu-rvv-rational-12-10-div-u1v.c | 96 ++++++++++++++++ ...2-vapproxgelu-rvv-rational-12-10-div-u2v.c | 96 ++++++++++++++++ ...2-vapproxgelu-rvv-rational-12-10-div-u4v.c | 96 ++++++++++++++++ ...2-vapproxgelu-rvv-rational-12-10-div-u8v.c | 96 ++++++++++++++++ ...32-vapproxgelu-rvv-rational-12-10-nr-u1v.c | 103 +++++++++++++++++ ...32-vapproxgelu-rvv-rational-12-10-nr-u2v.c | 103 +++++++++++++++++ ...32-vapproxgelu-rvv-rational-12-10-nr-u4v.c | 103 +++++++++++++++++ ...32-vapproxgelu-rvv-rational-12-10-nr-u8v.c | 103 +++++++++++++++++ src/f32-vapproxgelu/rvv-rational-12-10.c.in | 107 ++++++++++++++++++ src/f32-velu/f32-velu.inc | 7 ++ src/f32-velu/gen/f32-velu-rvv-rr2-p6-u1v.c | 105 +++++++++++++++++ src/f32-velu/gen/f32-velu-rvv-rr2-p6-u2v.c | 105 +++++++++++++++++ src/f32-velu/gen/f32-velu-rvv-rr2-p6-u4v.c | 105 +++++++++++++++++ src/f32-velu/gen/f32-velu-rvv-rr2-p6-u8v.c | 105 +++++++++++++++++ src/f32-velu/rvv-rr2-p6.c.in | 101 +++++++++++++++++ src/f32-vgelu/f32-vgelu.inc | 7 ++ .../f32-vgelu-rvv-rational-12-10-div-u1v.c | 95 ++++++++++++++++ .../f32-vgelu-rvv-rational-12-10-div-u2v.c | 95 ++++++++++++++++ .../f32-vgelu-rvv-rational-12-10-div-u4v.c | 95 ++++++++++++++++ .../f32-vgelu-rvv-rational-12-10-div-u8v.c | 95 ++++++++++++++++ .../gen/f32-vgelu-rvv-rational-12-10-nr-u1v.c | 102 +++++++++++++++++ .../gen/f32-vgelu-rvv-rational-12-10-nr-u2v.c | 102 +++++++++++++++++ .../gen/f32-vgelu-rvv-rational-12-10-nr-u4v.c | 102 +++++++++++++++++ .../gen/f32-vgelu-rvv-rational-12-10-nr-u8v.c | 102 +++++++++++++++++ src/f32-vgelu/rvv-rational-12-10.c.in | 105 +++++++++++++++++ 32 files changed, 2446 insertions(+), 5 deletions(-) create mode 100644 src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u1v.c create mode 100644 src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u2v.c create mode 100644 src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u4v.c create mode 100644 src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u8v.c create mode 100644 src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u1v.c create mode 100644 src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u2v.c create mode 100644 src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u4v.c create mode 100644 src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u8v.c create mode 100644 src/f32-vapproxgelu/rvv-rational-12-10.c.in create mode 100644 src/f32-velu/gen/f32-velu-rvv-rr2-p6-u1v.c create mode 100644 src/f32-velu/gen/f32-velu-rvv-rr2-p6-u2v.c create mode 100644 src/f32-velu/gen/f32-velu-rvv-rr2-p6-u4v.c create mode 100644 src/f32-velu/gen/f32-velu-rvv-rr2-p6-u8v.c create mode 100644 src/f32-velu/rvv-rr2-p6.c.in create mode 100644 src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u1v.c create mode 100644 src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u2v.c create mode 100644 src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u4v.c create mode 100644 src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u8v.c create mode 100644 src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u1v.c create mode 100644 src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u2v.c create mode 100644 src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u4v.c create mode 100644 src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u8v.c create mode 100644 src/f32-vgelu/rvv-rational-12-10.c.in diff --git a/cmake/gen/rvv_microkernels.cmake b/cmake/gen/rvv_microkernels.cmake index e290b67f0da..f33a3a3a92b 100644 --- a/cmake/gen/rvv_microkernels.cmake +++ b/cmake/gen/rvv_microkernels.cmake @@ -142,7 +142,10 @@ SET(PROD_RVV_MICROKERNEL_SRCS src/x32-transposec/gen/x32-transposec-4x4-rvv.c src/x32-transposec/gen/x32-transposec-8x8-rvv.c src/x32-transposec/gen/x32-transposec-16x8-rvv.c - src/x32-transposec/gen/x32-transposec-32x8-rvv.c) + src/x32-transposec/gen/x32-transposec-32x8-rvv.c + src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u4v.c + src/f32-velu/gen/f32-velu-rvv-rr2-p6-u4v.c + src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u4v.c) SET(NON_PROD_RVV_MICROKERNEL_SRCS src/f32-conv-hwc2chw/f32-conv-hwc2chw-3x3s2p1c3x2v-rvv-1x1.c @@ -333,6 +336,23 @@ SET(NON_PROD_RVV_MICROKERNEL_SRCS src/x32-packw/gen/x32-packw-x4v-gemm-goi-rvv-u4.c src/x32-packw/gen/x32-packw-x8v-gemm-goi-rvv-u2.c src/x32-packw/gen/x32-packw-x8v-gemm-goi-rvv-u4.c - src/x32-packw/gen/x32-packw-x8v-gemm-goi-rvv-u8.c) + src/x32-packw/gen/x32-packw-x8v-gemm-goi-rvv-u8.c + src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u1v.c + src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u2v.c + src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u8v.c + src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u1v.c + src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u2v.c + src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u4v.c + src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u8v.c + src/f32-velu/gen/f32-velu-rvv-rr2-p6-u1v.c + src/f32-velu/gen/f32-velu-rvv-rr2-p6-u2v.c + src/f32-velu/gen/f32-velu-rvv-rr2-p6-u8v.c + src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u1v.c + src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u2v.c + src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u8v.c + src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u1v.c + src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u2v.c + src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u4v.c + src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u8v.c) SET(ALL_RVV_MICROKERNEL_SRCS ${PROD_RVV_MICROKERNEL_SRCS} + ${NON_PROD_RVV_MICROKERNEL_SRCS}) diff --git a/gen/rvv_microkernels.bzl b/gen/rvv_microkernels.bzl index cea694c83c7..e86c895b66b 100644 --- a/gen/rvv_microkernels.bzl +++ b/gen/rvv_microkernels.bzl @@ -139,6 +139,9 @@ PROD_RVV_MICROKERNEL_SRCS = [ "src/x32-transposec/gen/x32-transposec-8x8-rvv.c", "src/x32-transposec/gen/x32-transposec-16x8-rvv.c", "src/x32-transposec/gen/x32-transposec-32x8-rvv.c", + "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u4v.c", + "src/f32-velu/gen/f32-velu-rvv-rr2-p6-u4v.c", + "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u4v.c", ] NON_PROD_RVV_MICROKERNEL_SRCS = [ @@ -331,6 +334,23 @@ NON_PROD_RVV_MICROKERNEL_SRCS = [ "src/x32-packw/gen/x32-packw-x8v-gemm-goi-rvv-u2.c", "src/x32-packw/gen/x32-packw-x8v-gemm-goi-rvv-u4.c", "src/x32-packw/gen/x32-packw-x8v-gemm-goi-rvv-u8.c", + "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u1v.c", + "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u2v.c", + "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u8v.c", + "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u1v.c", + "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u2v.c", + "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u4v.c", + "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u8v.c", + "src/f32-velu/gen/f32-velu-rvv-rr2-p6-u1v.c", + "src/f32-velu/gen/f32-velu-rvv-rr2-p6-u2v.c", + "src/f32-velu/gen/f32-velu-rvv-rr2-p6-u8v.c", + "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u1v.c", + "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u2v.c", + "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u8v.c", + "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u1v.c", + "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u2v.c", + "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u4v.c", + "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u8v.c", ] ALL_RVV_MICROKERNEL_SRCS = PROD_RVV_MICROKERNEL_SRCS + NON_PROD_RVV_MICROKERNEL_SRCS diff --git a/scripts/generate-f32-vapproxgelu.sh b/scripts/generate-f32-vapproxgelu.sh index 3c08f62388a..69ffec45d06 100755 --- a/scripts/generate-f32-vapproxgelu.sh +++ b/scripts/generate-f32-vapproxgelu.sh @@ -20,4 +20,14 @@ tools/xngen src/f32-vapproxgelu/rational-12-10.c.in -D ARCH=hvx -D BATCH_TILES=3 tools/xngen src/f32-vapproxgelu/rational-12-10.c.in -D ARCH=avx512f -D BATCH_TILES=16,32,48,64 -D DIV=NR -o src/f32-vapproxgelu/gen/f32-vapproxgelu-avx512f-rational-12-10-nr.c & +################################## RISC-V RVV ################################# +tools/xngen src/f32-vapproxgelu/rvv-rational-12-10.c.in -D LMUL=1 -D DIV=DIV -o src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u1v.c & +tools/xngen src/f32-vapproxgelu/rvv-rational-12-10.c.in -D LMUL=2 -D DIV=DIV -o src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u2v.c & +tools/xngen src/f32-vapproxgelu/rvv-rational-12-10.c.in -D LMUL=4 -D DIV=DIV -o src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u4v.c & +tools/xngen src/f32-vapproxgelu/rvv-rational-12-10.c.in -D LMUL=8 -D DIV=DIV -o src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u8v.c & +tools/xngen src/f32-vapproxgelu/rvv-rational-12-10.c.in -D LMUL=1 -D DIV=NR -o src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u1v.c & +tools/xngen src/f32-vapproxgelu/rvv-rational-12-10.c.in -D LMUL=2 -D DIV=NR -o src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u2v.c & +tools/xngen src/f32-vapproxgelu/rvv-rational-12-10.c.in -D LMUL=4 -D DIV=NR -o src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u4v.c & +tools/xngen src/f32-vapproxgelu/rvv-rational-12-10.c.in -D LMUL=8 -D DIV=NR -o src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u8v.c & + wait diff --git a/scripts/generate-f32-velu.sh b/scripts/generate-f32-velu.sh index 709add00d4e..4ac9ed34522 100755 --- a/scripts/generate-f32-velu.sh +++ b/scripts/generate-f32-velu.sh @@ -149,4 +149,10 @@ tools/xngen src/f32-velu/avx512f-rr1-p6.c.in -D BATCH_TILE=32 -o src/f32-velu/g tools/xngen src/f32-velu/avx512f-rr1-p6.c.in -D BATCH_TILE=48 -o src/f32-velu/gen/f32-velu-avx512f-rr1-p6-u48.c & tools/xngen src/f32-velu/avx512f-rr1-p6.c.in -D BATCH_TILE=64 -o src/f32-velu/gen/f32-velu-avx512f-rr1-p6-u64.c & +################################## RISC-V RVV ################################# +tools/xngen src/f32-velu/rvv-rr2-p6.c.in -D LMUL=1 -o src/f32-velu/gen/f32-velu-rvv-rr2-p6-u1v.c & +tools/xngen src/f32-velu/rvv-rr2-p6.c.in -D LMUL=2 -o src/f32-velu/gen/f32-velu-rvv-rr2-p6-u2v.c & +tools/xngen src/f32-velu/rvv-rr2-p6.c.in -D LMUL=4 -o src/f32-velu/gen/f32-velu-rvv-rr2-p6-u4v.c & +tools/xngen src/f32-velu/rvv-rr2-p6.c.in -D LMUL=8 -o src/f32-velu/gen/f32-velu-rvv-rr2-p6-u8v.c & + wait diff --git a/scripts/generate-f32-vgelu.sh b/scripts/generate-f32-vgelu.sh index b4db27d1767..8929bba6b27 100755 --- a/scripts/generate-f32-vgelu.sh +++ b/scripts/generate-f32-vgelu.sh @@ -21,4 +21,14 @@ tools/xngen src/f32-vgelu/rational-12-10.c.in -D ARCH=hvx -D BATCH_TILES=32 tools/xngen src/f32-vgelu/rational-12-10.c.in -D ARCH=avx512f -D BATCH_TILES=16,32,48,64 -D DIV=NR -o src/f32-vgelu/gen/f32-vgelu-avx512f-rational-12-10-nr.c & tools/xngen src/f32-vgelu/rational-12-10.c.in -D ARCH=hvx -D BATCH_TILES=32,64,128 -D DIV=NR -o src/f32-vgelu/gen/f32-vgelu-hvx-rational-12-10-nr.c & +################################## RISC-V RVV ################################# +tools/xngen src/f32-vgelu/rvv-rational-12-10.c.in -D LMUL=1 -D DIV=DIV -o src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u1v.c & +tools/xngen src/f32-vgelu/rvv-rational-12-10.c.in -D LMUL=2 -D DIV=DIV -o src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u2v.c & +tools/xngen src/f32-vgelu/rvv-rational-12-10.c.in -D LMUL=4 -D DIV=DIV -o src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u4v.c & +tools/xngen src/f32-vgelu/rvv-rational-12-10.c.in -D LMUL=8 -D DIV=DIV -o src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u8v.c & +tools/xngen src/f32-vgelu/rvv-rational-12-10.c.in -D LMUL=1 -D DIV=NR -o src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u1v.c & +tools/xngen src/f32-vgelu/rvv-rational-12-10.c.in -D LMUL=2 -D DIV=NR -o src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u2v.c & +tools/xngen src/f32-vgelu/rvv-rational-12-10.c.in -D LMUL=4 -D DIV=NR -o src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u4v.c & +tools/xngen src/f32-vgelu/rvv-rational-12-10.c.in -D LMUL=8 -D DIV=NR -o src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u8v.c & + wait diff --git a/src/configs/unary-elementwise-config.c b/src/configs/unary-elementwise-config.c index 11062b7d6d7..2c84009cf34 100644 --- a/src/configs/unary-elementwise-config.c +++ b/src/configs/unary-elementwise-config.c @@ -1034,6 +1034,19 @@ static void init_f32_approxgelu_config_impl(struct xnn_unary_elementwise_config* config->ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_vapproxgelu_ukernel__hvx_rational_12_10_div_u128); config->element_tile = 128; } + #elif XNN_ARCH_RISCV + #if XNN_ENABLE_RISCV_VECTOR + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + if (hardware_config->arch_flags & xnn_arch_riscv_vector) { + config->ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_vapproxgelu_ukernel__rvv_rational_12_10_div_u4v); + config->element_tile = 4 * hardware_config->vlenb / sizeof(float); + } else + #endif + { + config->ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_vapproxgelu_ukernel__scalar_rational_12_10_div_u1); + config->element_tile = 1; + } #else config->ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_vapproxgelu_ukernel__scalar_rational_12_10_div_u1); config->element_tile = 1; @@ -1292,9 +1305,20 @@ static void init_f32_elu_config(void) { } #endif #elif XNN_ARCH_RISCV - f32_elu_config.ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_u4); - f32_elu_config.element_tile = 4; - f32_elu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_elu_scalar_params; + #if XNN_ENABLE_RISCV_VECTOR + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + if (hardware_config->arch_flags & xnn_arch_riscv_vector) { + f32_elu_config.ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_velu_ukernel__rvv_rr2_p6_u4v); + f32_elu_config.element_tile = 4 * hardware_config->vlenb / sizeof(float); + f32_elu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_elu_scalar_params; + } else + #endif + { + f32_elu_config.ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_u4); + f32_elu_config.element_tile = 4; + f32_elu_config.init = (xnn_init_unary_uparams_fn) xnn_init_f32_elu_scalar_params; + } #else f32_elu_config.ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_u4); f32_elu_config.element_tile = 4; @@ -1367,6 +1391,19 @@ static void init_f32_gelu_config_impl(struct xnn_unary_elementwise_config* confi config->ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_vgelu_ukernel__hvx_rational_12_10_div_u128); config->element_tile = 128; } + #elif XNN_ARCH_RISCV + #if XNN_ENABLE_RISCV_VECTOR + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + assert(hardware_config != NULL); + if (hardware_config->arch_flags & xnn_arch_riscv_vector) { + config->ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_vgelu_ukernel__rvv_rational_12_10_div_u4v); + config->element_tile = 4 * hardware_config->vlenb / sizeof(float); + } else + #endif + { + config->ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_vgelu_ukernel__scalar_rational_12_10_div_u1); + config->element_tile = 1; + } #else config->ukernel = XNN_INIT_UNARY_UKERNEL(xnn_f32_vgelu_ukernel__scalar_rational_12_10_div_u1); config->element_tile = 1; diff --git a/src/f32-vapproxgelu/f32-vapproxgelu.inc b/src/f32-vapproxgelu/f32-vapproxgelu.inc index 5bae52e23f8..68c2658a354 100644 --- a/src/f32-vapproxgelu/f32-vapproxgelu.inc +++ b/src/f32-vapproxgelu/f32-vapproxgelu.inc @@ -65,3 +65,10 @@ XNN_UKERNEL(xnn_arch_none, xnn_f32_vapproxgelu_ukernel__wasmsimd_rational_12_10_ XNN_UKERNEL(xnn_arch_none, xnn_f32_vapproxgelu_ukernel__wasmsimd_rational_12_10_div_u12, 12, false, float, struct xnn_f32_default_params, NULL) XNN_UKERNEL(xnn_arch_none, xnn_f32_vapproxgelu_ukernel__wasmsimd_rational_12_10_div_u16, 16, false, float, struct xnn_f32_default_params, NULL) #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD + +#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR +XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_vapproxgelu_ukernel__rvv_rational_12_10_div_u1v, 1, true, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_vapproxgelu_ukernel__rvv_rational_12_10_div_u2v, 2, true, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_vapproxgelu_ukernel__rvv_rational_12_10_div_u4v, 4, true, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_vapproxgelu_ukernel__rvv_rational_12_10_div_u8v, 8, true, float, struct xnn_f32_default_params, NULL) +#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR diff --git a/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u1v.c b/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u1v.c new file mode 100644 index 00000000000..d78fb789043 --- /dev/null +++ b/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u1v.c @@ -0,0 +1,96 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-vapproxgelu/rvv-rational-12-10.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f32_vapproxgelu_ukernel__rvv_rational_12_10_div_u1v( + size_t batch, + const float* input, + float* output, + const struct xnn_f32_default_params* unused_params) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(output != NULL); + + // Cap the inputs to this value as `erf(x/sqrt(2))` will always be `+/-1.0f` + // beyond this point. This value is chosen as the first floating point + // number as of which the interpolation returns +/-1.0f. + const float vmax_x = 4.84974098e+00f; + const float vmin_x = -4.84974098e+00f; + + // The monomial coefficients of the numerator polynomial (odd). + const float valpha_1 = 7.9788458347e-01f; + const float valpha_3 = 6.0803253204e-02f; + const float valpha_5 = 7.2898347862e-03f; + const float valpha_7 = 2.6887017884e-04f; + const float valpha_9 = 1.4302649106e-05f; + const float valpha_11 = 4.9544411240e-08f; + + // The monomial coefficients of the denominator polynomial (even). + const float vbeta_2 = 2.4369759858e-01f; + const float vbeta_4 = 2.4381054565e-02f; + const float vbeta_6 = 1.3060354395e-03f; + const float vbeta_8 = 7.6477612311e-05f; + const float vbeta_10 = 1.3433452750e-06f; + + batch >>= XNN_LOG2_SIZEOF_FLOAT; + do { + const size_t n = __riscv_vsetvl_e32m1(batch); + + vfloat32m1_t vx_orig = __riscv_vle32_v_f32m1(input, n); + input += n; + + // Clamp the inputs to the interpolation range. + vfloat32m1_t vx = __riscv_vfmin_vf_f32m1(vx_orig, vmax_x, n); + vx = __riscv_vfmax_vf_f32m1(vx, vmin_x, n); + + // Since the polynomials are odd/even, we need x^2. + vfloat32m1_t vx2 = __riscv_vfmul_vv_f32m1(vx, vx, n); + + // Evaluate the numerator polynomial p. + vfloat32m1_t vp = __riscv_vfmv_v_f_f32m1(valpha_9, n); + vp = __riscv_vfmacc_vf_f32m1(vp, valpha_11, vx2, n); + vp = __riscv_vfmadd_vv_f32m1(vp, vx2, __riscv_vfmv_v_f_f32m1(valpha_7, n), n); + vp = __riscv_vfmadd_vv_f32m1(vp, vx2, __riscv_vfmv_v_f_f32m1(valpha_5, n), n); + vp = __riscv_vfmadd_vv_f32m1(vp, vx2, __riscv_vfmv_v_f_f32m1(valpha_3, n), n); + vp = __riscv_vfmadd_vv_f32m1(vp, vx2, __riscv_vfmv_v_f_f32m1(valpha_1, n), n); + vp = __riscv_vfmul_vv_f32m1(vp, vx, n); + + // Evaluate the denominator polynomial q. + vfloat32m1_t vq = __riscv_vfmv_v_f_f32m1(vbeta_8, n); + vq = __riscv_vfmacc_vf_f32m1(vq, vbeta_10, vx2, n); + vq = __riscv_vfmadd_vv_f32m1(vq, vx2, __riscv_vfmv_v_f_f32m1(vbeta_6, n), n); + vq = __riscv_vfmadd_vv_f32m1(vq, vx2, __riscv_vfmv_v_f_f32m1(vbeta_4, n), n); + vq = __riscv_vfmadd_vv_f32m1(vq, vx2, __riscv_vfmv_v_f_f32m1(vbeta_2, n), n); + vq = __riscv_vfmadd_vv_f32m1(vq, vx2, __riscv_vfmv_v_f_f32m1(1.0f, n), n); + + // Divide the numerator by the denominator. + vfloat32m1_t verf = __riscv_vfdiv_vv_f32m1(vp, vq, n); + + // Add one to the rational interpolant, and multiply by 0.5 times the + // original input. + vfloat32m1_t vy = __riscv_vfadd_vf_f32m1(verf, 1.0f, n); + vy = __riscv_vfmul_vf_f32m1(vy, 0.5f, n); + vy = __riscv_vfmul_vv_f32m1(vy, vx_orig, n); + + __riscv_vse32_v_f32m1(output, vy, n); + output += n; + + batch -= n; + } while (batch != 0); +} diff --git a/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u2v.c b/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u2v.c new file mode 100644 index 00000000000..f93fe59994c --- /dev/null +++ b/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u2v.c @@ -0,0 +1,96 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-vapproxgelu/rvv-rational-12-10.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f32_vapproxgelu_ukernel__rvv_rational_12_10_div_u2v( + size_t batch, + const float* input, + float* output, + const struct xnn_f32_default_params* unused_params) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(output != NULL); + + // Cap the inputs to this value as `erf(x/sqrt(2))` will always be `+/-1.0f` + // beyond this point. This value is chosen as the first floating point + // number as of which the interpolation returns +/-1.0f. + const float vmax_x = 4.84974098e+00f; + const float vmin_x = -4.84974098e+00f; + + // The monomial coefficients of the numerator polynomial (odd). + const float valpha_1 = 7.9788458347e-01f; + const float valpha_3 = 6.0803253204e-02f; + const float valpha_5 = 7.2898347862e-03f; + const float valpha_7 = 2.6887017884e-04f; + const float valpha_9 = 1.4302649106e-05f; + const float valpha_11 = 4.9544411240e-08f; + + // The monomial coefficients of the denominator polynomial (even). + const float vbeta_2 = 2.4369759858e-01f; + const float vbeta_4 = 2.4381054565e-02f; + const float vbeta_6 = 1.3060354395e-03f; + const float vbeta_8 = 7.6477612311e-05f; + const float vbeta_10 = 1.3433452750e-06f; + + batch >>= XNN_LOG2_SIZEOF_FLOAT; + do { + const size_t n = __riscv_vsetvl_e32m2(batch); + + vfloat32m2_t vx_orig = __riscv_vle32_v_f32m2(input, n); + input += n; + + // Clamp the inputs to the interpolation range. + vfloat32m2_t vx = __riscv_vfmin_vf_f32m2(vx_orig, vmax_x, n); + vx = __riscv_vfmax_vf_f32m2(vx, vmin_x, n); + + // Since the polynomials are odd/even, we need x^2. + vfloat32m2_t vx2 = __riscv_vfmul_vv_f32m2(vx, vx, n); + + // Evaluate the numerator polynomial p. + vfloat32m2_t vp = __riscv_vfmv_v_f_f32m2(valpha_9, n); + vp = __riscv_vfmacc_vf_f32m2(vp, valpha_11, vx2, n); + vp = __riscv_vfmadd_vv_f32m2(vp, vx2, __riscv_vfmv_v_f_f32m2(valpha_7, n), n); + vp = __riscv_vfmadd_vv_f32m2(vp, vx2, __riscv_vfmv_v_f_f32m2(valpha_5, n), n); + vp = __riscv_vfmadd_vv_f32m2(vp, vx2, __riscv_vfmv_v_f_f32m2(valpha_3, n), n); + vp = __riscv_vfmadd_vv_f32m2(vp, vx2, __riscv_vfmv_v_f_f32m2(valpha_1, n), n); + vp = __riscv_vfmul_vv_f32m2(vp, vx, n); + + // Evaluate the denominator polynomial q. + vfloat32m2_t vq = __riscv_vfmv_v_f_f32m2(vbeta_8, n); + vq = __riscv_vfmacc_vf_f32m2(vq, vbeta_10, vx2, n); + vq = __riscv_vfmadd_vv_f32m2(vq, vx2, __riscv_vfmv_v_f_f32m2(vbeta_6, n), n); + vq = __riscv_vfmadd_vv_f32m2(vq, vx2, __riscv_vfmv_v_f_f32m2(vbeta_4, n), n); + vq = __riscv_vfmadd_vv_f32m2(vq, vx2, __riscv_vfmv_v_f_f32m2(vbeta_2, n), n); + vq = __riscv_vfmadd_vv_f32m2(vq, vx2, __riscv_vfmv_v_f_f32m2(1.0f, n), n); + + // Divide the numerator by the denominator. + vfloat32m2_t verf = __riscv_vfdiv_vv_f32m2(vp, vq, n); + + // Add one to the rational interpolant, and multiply by 0.5 times the + // original input. + vfloat32m2_t vy = __riscv_vfadd_vf_f32m2(verf, 1.0f, n); + vy = __riscv_vfmul_vf_f32m2(vy, 0.5f, n); + vy = __riscv_vfmul_vv_f32m2(vy, vx_orig, n); + + __riscv_vse32_v_f32m2(output, vy, n); + output += n; + + batch -= n; + } while (batch != 0); +} diff --git a/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u4v.c b/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u4v.c new file mode 100644 index 00000000000..175aa184519 --- /dev/null +++ b/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u4v.c @@ -0,0 +1,96 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-vapproxgelu/rvv-rational-12-10.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f32_vapproxgelu_ukernel__rvv_rational_12_10_div_u4v( + size_t batch, + const float* input, + float* output, + const struct xnn_f32_default_params* unused_params) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(output != NULL); + + // Cap the inputs to this value as `erf(x/sqrt(2))` will always be `+/-1.0f` + // beyond this point. This value is chosen as the first floating point + // number as of which the interpolation returns +/-1.0f. + const float vmax_x = 4.84974098e+00f; + const float vmin_x = -4.84974098e+00f; + + // The monomial coefficients of the numerator polynomial (odd). + const float valpha_1 = 7.9788458347e-01f; + const float valpha_3 = 6.0803253204e-02f; + const float valpha_5 = 7.2898347862e-03f; + const float valpha_7 = 2.6887017884e-04f; + const float valpha_9 = 1.4302649106e-05f; + const float valpha_11 = 4.9544411240e-08f; + + // The monomial coefficients of the denominator polynomial (even). + const float vbeta_2 = 2.4369759858e-01f; + const float vbeta_4 = 2.4381054565e-02f; + const float vbeta_6 = 1.3060354395e-03f; + const float vbeta_8 = 7.6477612311e-05f; + const float vbeta_10 = 1.3433452750e-06f; + + batch >>= XNN_LOG2_SIZEOF_FLOAT; + do { + const size_t n = __riscv_vsetvl_e32m4(batch); + + vfloat32m4_t vx_orig = __riscv_vle32_v_f32m4(input, n); + input += n; + + // Clamp the inputs to the interpolation range. + vfloat32m4_t vx = __riscv_vfmin_vf_f32m4(vx_orig, vmax_x, n); + vx = __riscv_vfmax_vf_f32m4(vx, vmin_x, n); + + // Since the polynomials are odd/even, we need x^2. + vfloat32m4_t vx2 = __riscv_vfmul_vv_f32m4(vx, vx, n); + + // Evaluate the numerator polynomial p. + vfloat32m4_t vp = __riscv_vfmv_v_f_f32m4(valpha_9, n); + vp = __riscv_vfmacc_vf_f32m4(vp, valpha_11, vx2, n); + vp = __riscv_vfmadd_vv_f32m4(vp, vx2, __riscv_vfmv_v_f_f32m4(valpha_7, n), n); + vp = __riscv_vfmadd_vv_f32m4(vp, vx2, __riscv_vfmv_v_f_f32m4(valpha_5, n), n); + vp = __riscv_vfmadd_vv_f32m4(vp, vx2, __riscv_vfmv_v_f_f32m4(valpha_3, n), n); + vp = __riscv_vfmadd_vv_f32m4(vp, vx2, __riscv_vfmv_v_f_f32m4(valpha_1, n), n); + vp = __riscv_vfmul_vv_f32m4(vp, vx, n); + + // Evaluate the denominator polynomial q. + vfloat32m4_t vq = __riscv_vfmv_v_f_f32m4(vbeta_8, n); + vq = __riscv_vfmacc_vf_f32m4(vq, vbeta_10, vx2, n); + vq = __riscv_vfmadd_vv_f32m4(vq, vx2, __riscv_vfmv_v_f_f32m4(vbeta_6, n), n); + vq = __riscv_vfmadd_vv_f32m4(vq, vx2, __riscv_vfmv_v_f_f32m4(vbeta_4, n), n); + vq = __riscv_vfmadd_vv_f32m4(vq, vx2, __riscv_vfmv_v_f_f32m4(vbeta_2, n), n); + vq = __riscv_vfmadd_vv_f32m4(vq, vx2, __riscv_vfmv_v_f_f32m4(1.0f, n), n); + + // Divide the numerator by the denominator. + vfloat32m4_t verf = __riscv_vfdiv_vv_f32m4(vp, vq, n); + + // Add one to the rational interpolant, and multiply by 0.5 times the + // original input. + vfloat32m4_t vy = __riscv_vfadd_vf_f32m4(verf, 1.0f, n); + vy = __riscv_vfmul_vf_f32m4(vy, 0.5f, n); + vy = __riscv_vfmul_vv_f32m4(vy, vx_orig, n); + + __riscv_vse32_v_f32m4(output, vy, n); + output += n; + + batch -= n; + } while (batch != 0); +} diff --git a/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u8v.c b/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u8v.c new file mode 100644 index 00000000000..bc3d665e050 --- /dev/null +++ b/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u8v.c @@ -0,0 +1,96 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-vapproxgelu/rvv-rational-12-10.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f32_vapproxgelu_ukernel__rvv_rational_12_10_div_u8v( + size_t batch, + const float* input, + float* output, + const struct xnn_f32_default_params* unused_params) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(output != NULL); + + // Cap the inputs to this value as `erf(x/sqrt(2))` will always be `+/-1.0f` + // beyond this point. This value is chosen as the first floating point + // number as of which the interpolation returns +/-1.0f. + const float vmax_x = 4.84974098e+00f; + const float vmin_x = -4.84974098e+00f; + + // The monomial coefficients of the numerator polynomial (odd). + const float valpha_1 = 7.9788458347e-01f; + const float valpha_3 = 6.0803253204e-02f; + const float valpha_5 = 7.2898347862e-03f; + const float valpha_7 = 2.6887017884e-04f; + const float valpha_9 = 1.4302649106e-05f; + const float valpha_11 = 4.9544411240e-08f; + + // The monomial coefficients of the denominator polynomial (even). + const float vbeta_2 = 2.4369759858e-01f; + const float vbeta_4 = 2.4381054565e-02f; + const float vbeta_6 = 1.3060354395e-03f; + const float vbeta_8 = 7.6477612311e-05f; + const float vbeta_10 = 1.3433452750e-06f; + + batch >>= XNN_LOG2_SIZEOF_FLOAT; + do { + const size_t n = __riscv_vsetvl_e32m8(batch); + + vfloat32m8_t vx_orig = __riscv_vle32_v_f32m8(input, n); + input += n; + + // Clamp the inputs to the interpolation range. + vfloat32m8_t vx = __riscv_vfmin_vf_f32m8(vx_orig, vmax_x, n); + vx = __riscv_vfmax_vf_f32m8(vx, vmin_x, n); + + // Since the polynomials are odd/even, we need x^2. + vfloat32m8_t vx2 = __riscv_vfmul_vv_f32m8(vx, vx, n); + + // Evaluate the numerator polynomial p. + vfloat32m8_t vp = __riscv_vfmv_v_f_f32m8(valpha_9, n); + vp = __riscv_vfmacc_vf_f32m8(vp, valpha_11, vx2, n); + vp = __riscv_vfmadd_vv_f32m8(vp, vx2, __riscv_vfmv_v_f_f32m8(valpha_7, n), n); + vp = __riscv_vfmadd_vv_f32m8(vp, vx2, __riscv_vfmv_v_f_f32m8(valpha_5, n), n); + vp = __riscv_vfmadd_vv_f32m8(vp, vx2, __riscv_vfmv_v_f_f32m8(valpha_3, n), n); + vp = __riscv_vfmadd_vv_f32m8(vp, vx2, __riscv_vfmv_v_f_f32m8(valpha_1, n), n); + vp = __riscv_vfmul_vv_f32m8(vp, vx, n); + + // Evaluate the denominator polynomial q. + vfloat32m8_t vq = __riscv_vfmv_v_f_f32m8(vbeta_8, n); + vq = __riscv_vfmacc_vf_f32m8(vq, vbeta_10, vx2, n); + vq = __riscv_vfmadd_vv_f32m8(vq, vx2, __riscv_vfmv_v_f_f32m8(vbeta_6, n), n); + vq = __riscv_vfmadd_vv_f32m8(vq, vx2, __riscv_vfmv_v_f_f32m8(vbeta_4, n), n); + vq = __riscv_vfmadd_vv_f32m8(vq, vx2, __riscv_vfmv_v_f_f32m8(vbeta_2, n), n); + vq = __riscv_vfmadd_vv_f32m8(vq, vx2, __riscv_vfmv_v_f_f32m8(1.0f, n), n); + + // Divide the numerator by the denominator. + vfloat32m8_t verf = __riscv_vfdiv_vv_f32m8(vp, vq, n); + + // Add one to the rational interpolant, and multiply by 0.5 times the + // original input. + vfloat32m8_t vy = __riscv_vfadd_vf_f32m8(verf, 1.0f, n); + vy = __riscv_vfmul_vf_f32m8(vy, 0.5f, n); + vy = __riscv_vfmul_vv_f32m8(vy, vx_orig, n); + + __riscv_vse32_v_f32m8(output, vy, n); + output += n; + + batch -= n; + } while (batch != 0); +} diff --git a/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u1v.c b/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u1v.c new file mode 100644 index 00000000000..9fce14f9c13 --- /dev/null +++ b/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u1v.c @@ -0,0 +1,103 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-vapproxgelu/rvv-rational-12-10.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f32_vapproxgelu_ukernel__rvv_rational_12_10_nr_u1v( + size_t batch, + const float* input, + float* output, + const struct xnn_f32_default_params* unused_params) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(output != NULL); + + // Cap the inputs to this value as `erf(x/sqrt(2))` will always be `+/-1.0f` + // beyond this point. This value is chosen as the first floating point + // number as of which the interpolation returns +/-1.0f. + const float vmax_x = 4.79519796e+00f; + const float vmin_x = -4.84563780e+00f; + + // The monomial coefficients of the numerator polynomial (odd). + const float valpha_1 = 7.9788458347e-01f; + const float valpha_3 = 6.0803253204e-02f; + const float valpha_5 = 7.2898347862e-03f; + const float valpha_7 = 2.6887017884e-04f; + const float valpha_9 = 1.4302649106e-05f; + const float valpha_11 = 4.9544411240e-08f; + + // The monomial coefficients of the denominator polynomial (even). + const float vbeta_2 = 2.4369759858e-01f; + const float vbeta_4 = 2.4381054565e-02f; + const float vbeta_6 = 1.3060354395e-03f; + const float vbeta_8 = 7.6477612311e-05f; + const float vbeta_10 = 1.3433452750e-06f; + + batch >>= XNN_LOG2_SIZEOF_FLOAT; + do { + const size_t n = __riscv_vsetvl_e32m1(batch); + + vfloat32m1_t vx_orig = __riscv_vle32_v_f32m1(input, n); + input += n; + + // Clamp the inputs to the interpolation range. + vfloat32m1_t vx = __riscv_vfmin_vf_f32m1(vx_orig, vmax_x, n); + vx = __riscv_vfmax_vf_f32m1(vx, vmin_x, n); + + // Since the polynomials are odd/even, we need x^2. + vfloat32m1_t vx2 = __riscv_vfmul_vv_f32m1(vx, vx, n); + + // Evaluate the numerator polynomial p. + vfloat32m1_t vp = __riscv_vfmv_v_f_f32m1(valpha_9, n); + vp = __riscv_vfmacc_vf_f32m1(vp, valpha_11, vx2, n); + vp = __riscv_vfmadd_vv_f32m1(vp, vx2, __riscv_vfmv_v_f_f32m1(valpha_7, n), n); + vp = __riscv_vfmadd_vv_f32m1(vp, vx2, __riscv_vfmv_v_f_f32m1(valpha_5, n), n); + vp = __riscv_vfmadd_vv_f32m1(vp, vx2, __riscv_vfmv_v_f_f32m1(valpha_3, n), n); + vp = __riscv_vfmadd_vv_f32m1(vp, vx2, __riscv_vfmv_v_f_f32m1(valpha_1, n), n); + vp = __riscv_vfmul_vv_f32m1(vp, vx, n); + + // Evaluate the denominator polynomial q. + vfloat32m1_t vq = __riscv_vfmv_v_f_f32m1(vbeta_8, n); + vq = __riscv_vfmacc_vf_f32m1(vq, vbeta_10, vx2, n); + vq = __riscv_vfmadd_vv_f32m1(vq, vx2, __riscv_vfmv_v_f_f32m1(vbeta_6, n), n); + vq = __riscv_vfmadd_vv_f32m1(vq, vx2, __riscv_vfmv_v_f_f32m1(vbeta_4, n), n); + vq = __riscv_vfmadd_vv_f32m1(vq, vx2, __riscv_vfmv_v_f_f32m1(vbeta_2, n), n); + vq = __riscv_vfmadd_vv_f32m1(vq, vx2, __riscv_vfmv_v_f_f32m1(1.0f, n), n); + + // Divide the numerator by the denominator. + // Newton-Raphson iteration for reciprocal. + vfloat32m1_t vrq = __riscv_vfrec7_v_f32m1(vq, n); + for (size_t iter = 0; iter < 2; iter++) { + vfloat32m1_t verr = __riscv_vfnmsac_vv_f32m1( + __riscv_vfmv_v_f_f32m1(2.0f, n), vrq, vq, n); + vrq = __riscv_vfmul_vv_f32m1(vrq, verr, n); + } + vfloat32m1_t verf = __riscv_vfmul_vv_f32m1(vp, vrq, n); + + // Add one to the rational interpolant, and multiply by 0.5 times the + // original input. + vfloat32m1_t vy = __riscv_vfadd_vf_f32m1(verf, 1.0f, n); + vy = __riscv_vfmul_vf_f32m1(vy, 0.5f, n); + vy = __riscv_vfmul_vv_f32m1(vy, vx_orig, n); + + __riscv_vse32_v_f32m1(output, vy, n); + output += n; + + batch -= n; + } while (batch != 0); +} diff --git a/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u2v.c b/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u2v.c new file mode 100644 index 00000000000..d281765f915 --- /dev/null +++ b/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u2v.c @@ -0,0 +1,103 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-vapproxgelu/rvv-rational-12-10.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f32_vapproxgelu_ukernel__rvv_rational_12_10_nr_u2v( + size_t batch, + const float* input, + float* output, + const struct xnn_f32_default_params* unused_params) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(output != NULL); + + // Cap the inputs to this value as `erf(x/sqrt(2))` will always be `+/-1.0f` + // beyond this point. This value is chosen as the first floating point + // number as of which the interpolation returns +/-1.0f. + const float vmax_x = 4.79519796e+00f; + const float vmin_x = -4.84563780e+00f; + + // The monomial coefficients of the numerator polynomial (odd). + const float valpha_1 = 7.9788458347e-01f; + const float valpha_3 = 6.0803253204e-02f; + const float valpha_5 = 7.2898347862e-03f; + const float valpha_7 = 2.6887017884e-04f; + const float valpha_9 = 1.4302649106e-05f; + const float valpha_11 = 4.9544411240e-08f; + + // The monomial coefficients of the denominator polynomial (even). + const float vbeta_2 = 2.4369759858e-01f; + const float vbeta_4 = 2.4381054565e-02f; + const float vbeta_6 = 1.3060354395e-03f; + const float vbeta_8 = 7.6477612311e-05f; + const float vbeta_10 = 1.3433452750e-06f; + + batch >>= XNN_LOG2_SIZEOF_FLOAT; + do { + const size_t n = __riscv_vsetvl_e32m2(batch); + + vfloat32m2_t vx_orig = __riscv_vle32_v_f32m2(input, n); + input += n; + + // Clamp the inputs to the interpolation range. + vfloat32m2_t vx = __riscv_vfmin_vf_f32m2(vx_orig, vmax_x, n); + vx = __riscv_vfmax_vf_f32m2(vx, vmin_x, n); + + // Since the polynomials are odd/even, we need x^2. + vfloat32m2_t vx2 = __riscv_vfmul_vv_f32m2(vx, vx, n); + + // Evaluate the numerator polynomial p. + vfloat32m2_t vp = __riscv_vfmv_v_f_f32m2(valpha_9, n); + vp = __riscv_vfmacc_vf_f32m2(vp, valpha_11, vx2, n); + vp = __riscv_vfmadd_vv_f32m2(vp, vx2, __riscv_vfmv_v_f_f32m2(valpha_7, n), n); + vp = __riscv_vfmadd_vv_f32m2(vp, vx2, __riscv_vfmv_v_f_f32m2(valpha_5, n), n); + vp = __riscv_vfmadd_vv_f32m2(vp, vx2, __riscv_vfmv_v_f_f32m2(valpha_3, n), n); + vp = __riscv_vfmadd_vv_f32m2(vp, vx2, __riscv_vfmv_v_f_f32m2(valpha_1, n), n); + vp = __riscv_vfmul_vv_f32m2(vp, vx, n); + + // Evaluate the denominator polynomial q. + vfloat32m2_t vq = __riscv_vfmv_v_f_f32m2(vbeta_8, n); + vq = __riscv_vfmacc_vf_f32m2(vq, vbeta_10, vx2, n); + vq = __riscv_vfmadd_vv_f32m2(vq, vx2, __riscv_vfmv_v_f_f32m2(vbeta_6, n), n); + vq = __riscv_vfmadd_vv_f32m2(vq, vx2, __riscv_vfmv_v_f_f32m2(vbeta_4, n), n); + vq = __riscv_vfmadd_vv_f32m2(vq, vx2, __riscv_vfmv_v_f_f32m2(vbeta_2, n), n); + vq = __riscv_vfmadd_vv_f32m2(vq, vx2, __riscv_vfmv_v_f_f32m2(1.0f, n), n); + + // Divide the numerator by the denominator. + // Newton-Raphson iteration for reciprocal. + vfloat32m2_t vrq = __riscv_vfrec7_v_f32m2(vq, n); + for (size_t iter = 0; iter < 2; iter++) { + vfloat32m2_t verr = __riscv_vfnmsac_vv_f32m2( + __riscv_vfmv_v_f_f32m2(2.0f, n), vrq, vq, n); + vrq = __riscv_vfmul_vv_f32m2(vrq, verr, n); + } + vfloat32m2_t verf = __riscv_vfmul_vv_f32m2(vp, vrq, n); + + // Add one to the rational interpolant, and multiply by 0.5 times the + // original input. + vfloat32m2_t vy = __riscv_vfadd_vf_f32m2(verf, 1.0f, n); + vy = __riscv_vfmul_vf_f32m2(vy, 0.5f, n); + vy = __riscv_vfmul_vv_f32m2(vy, vx_orig, n); + + __riscv_vse32_v_f32m2(output, vy, n); + output += n; + + batch -= n; + } while (batch != 0); +} diff --git a/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u4v.c b/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u4v.c new file mode 100644 index 00000000000..e66bb8c4b83 --- /dev/null +++ b/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u4v.c @@ -0,0 +1,103 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-vapproxgelu/rvv-rational-12-10.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f32_vapproxgelu_ukernel__rvv_rational_12_10_nr_u4v( + size_t batch, + const float* input, + float* output, + const struct xnn_f32_default_params* unused_params) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(output != NULL); + + // Cap the inputs to this value as `erf(x/sqrt(2))` will always be `+/-1.0f` + // beyond this point. This value is chosen as the first floating point + // number as of which the interpolation returns +/-1.0f. + const float vmax_x = 4.79519796e+00f; + const float vmin_x = -4.84563780e+00f; + + // The monomial coefficients of the numerator polynomial (odd). + const float valpha_1 = 7.9788458347e-01f; + const float valpha_3 = 6.0803253204e-02f; + const float valpha_5 = 7.2898347862e-03f; + const float valpha_7 = 2.6887017884e-04f; + const float valpha_9 = 1.4302649106e-05f; + const float valpha_11 = 4.9544411240e-08f; + + // The monomial coefficients of the denominator polynomial (even). + const float vbeta_2 = 2.4369759858e-01f; + const float vbeta_4 = 2.4381054565e-02f; + const float vbeta_6 = 1.3060354395e-03f; + const float vbeta_8 = 7.6477612311e-05f; + const float vbeta_10 = 1.3433452750e-06f; + + batch >>= XNN_LOG2_SIZEOF_FLOAT; + do { + const size_t n = __riscv_vsetvl_e32m4(batch); + + vfloat32m4_t vx_orig = __riscv_vle32_v_f32m4(input, n); + input += n; + + // Clamp the inputs to the interpolation range. + vfloat32m4_t vx = __riscv_vfmin_vf_f32m4(vx_orig, vmax_x, n); + vx = __riscv_vfmax_vf_f32m4(vx, vmin_x, n); + + // Since the polynomials are odd/even, we need x^2. + vfloat32m4_t vx2 = __riscv_vfmul_vv_f32m4(vx, vx, n); + + // Evaluate the numerator polynomial p. + vfloat32m4_t vp = __riscv_vfmv_v_f_f32m4(valpha_9, n); + vp = __riscv_vfmacc_vf_f32m4(vp, valpha_11, vx2, n); + vp = __riscv_vfmadd_vv_f32m4(vp, vx2, __riscv_vfmv_v_f_f32m4(valpha_7, n), n); + vp = __riscv_vfmadd_vv_f32m4(vp, vx2, __riscv_vfmv_v_f_f32m4(valpha_5, n), n); + vp = __riscv_vfmadd_vv_f32m4(vp, vx2, __riscv_vfmv_v_f_f32m4(valpha_3, n), n); + vp = __riscv_vfmadd_vv_f32m4(vp, vx2, __riscv_vfmv_v_f_f32m4(valpha_1, n), n); + vp = __riscv_vfmul_vv_f32m4(vp, vx, n); + + // Evaluate the denominator polynomial q. + vfloat32m4_t vq = __riscv_vfmv_v_f_f32m4(vbeta_8, n); + vq = __riscv_vfmacc_vf_f32m4(vq, vbeta_10, vx2, n); + vq = __riscv_vfmadd_vv_f32m4(vq, vx2, __riscv_vfmv_v_f_f32m4(vbeta_6, n), n); + vq = __riscv_vfmadd_vv_f32m4(vq, vx2, __riscv_vfmv_v_f_f32m4(vbeta_4, n), n); + vq = __riscv_vfmadd_vv_f32m4(vq, vx2, __riscv_vfmv_v_f_f32m4(vbeta_2, n), n); + vq = __riscv_vfmadd_vv_f32m4(vq, vx2, __riscv_vfmv_v_f_f32m4(1.0f, n), n); + + // Divide the numerator by the denominator. + // Newton-Raphson iteration for reciprocal. + vfloat32m4_t vrq = __riscv_vfrec7_v_f32m4(vq, n); + for (size_t iter = 0; iter < 2; iter++) { + vfloat32m4_t verr = __riscv_vfnmsac_vv_f32m4( + __riscv_vfmv_v_f_f32m4(2.0f, n), vrq, vq, n); + vrq = __riscv_vfmul_vv_f32m4(vrq, verr, n); + } + vfloat32m4_t verf = __riscv_vfmul_vv_f32m4(vp, vrq, n); + + // Add one to the rational interpolant, and multiply by 0.5 times the + // original input. + vfloat32m4_t vy = __riscv_vfadd_vf_f32m4(verf, 1.0f, n); + vy = __riscv_vfmul_vf_f32m4(vy, 0.5f, n); + vy = __riscv_vfmul_vv_f32m4(vy, vx_orig, n); + + __riscv_vse32_v_f32m4(output, vy, n); + output += n; + + batch -= n; + } while (batch != 0); +} diff --git a/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u8v.c b/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u8v.c new file mode 100644 index 00000000000..2b17e3a673d --- /dev/null +++ b/src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u8v.c @@ -0,0 +1,103 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-vapproxgelu/rvv-rational-12-10.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f32_vapproxgelu_ukernel__rvv_rational_12_10_nr_u8v( + size_t batch, + const float* input, + float* output, + const struct xnn_f32_default_params* unused_params) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(output != NULL); + + // Cap the inputs to this value as `erf(x/sqrt(2))` will always be `+/-1.0f` + // beyond this point. This value is chosen as the first floating point + // number as of which the interpolation returns +/-1.0f. + const float vmax_x = 4.79519796e+00f; + const float vmin_x = -4.84563780e+00f; + + // The monomial coefficients of the numerator polynomial (odd). + const float valpha_1 = 7.9788458347e-01f; + const float valpha_3 = 6.0803253204e-02f; + const float valpha_5 = 7.2898347862e-03f; + const float valpha_7 = 2.6887017884e-04f; + const float valpha_9 = 1.4302649106e-05f; + const float valpha_11 = 4.9544411240e-08f; + + // The monomial coefficients of the denominator polynomial (even). + const float vbeta_2 = 2.4369759858e-01f; + const float vbeta_4 = 2.4381054565e-02f; + const float vbeta_6 = 1.3060354395e-03f; + const float vbeta_8 = 7.6477612311e-05f; + const float vbeta_10 = 1.3433452750e-06f; + + batch >>= XNN_LOG2_SIZEOF_FLOAT; + do { + const size_t n = __riscv_vsetvl_e32m8(batch); + + vfloat32m8_t vx_orig = __riscv_vle32_v_f32m8(input, n); + input += n; + + // Clamp the inputs to the interpolation range. + vfloat32m8_t vx = __riscv_vfmin_vf_f32m8(vx_orig, vmax_x, n); + vx = __riscv_vfmax_vf_f32m8(vx, vmin_x, n); + + // Since the polynomials are odd/even, we need x^2. + vfloat32m8_t vx2 = __riscv_vfmul_vv_f32m8(vx, vx, n); + + // Evaluate the numerator polynomial p. + vfloat32m8_t vp = __riscv_vfmv_v_f_f32m8(valpha_9, n); + vp = __riscv_vfmacc_vf_f32m8(vp, valpha_11, vx2, n); + vp = __riscv_vfmadd_vv_f32m8(vp, vx2, __riscv_vfmv_v_f_f32m8(valpha_7, n), n); + vp = __riscv_vfmadd_vv_f32m8(vp, vx2, __riscv_vfmv_v_f_f32m8(valpha_5, n), n); + vp = __riscv_vfmadd_vv_f32m8(vp, vx2, __riscv_vfmv_v_f_f32m8(valpha_3, n), n); + vp = __riscv_vfmadd_vv_f32m8(vp, vx2, __riscv_vfmv_v_f_f32m8(valpha_1, n), n); + vp = __riscv_vfmul_vv_f32m8(vp, vx, n); + + // Evaluate the denominator polynomial q. + vfloat32m8_t vq = __riscv_vfmv_v_f_f32m8(vbeta_8, n); + vq = __riscv_vfmacc_vf_f32m8(vq, vbeta_10, vx2, n); + vq = __riscv_vfmadd_vv_f32m8(vq, vx2, __riscv_vfmv_v_f_f32m8(vbeta_6, n), n); + vq = __riscv_vfmadd_vv_f32m8(vq, vx2, __riscv_vfmv_v_f_f32m8(vbeta_4, n), n); + vq = __riscv_vfmadd_vv_f32m8(vq, vx2, __riscv_vfmv_v_f_f32m8(vbeta_2, n), n); + vq = __riscv_vfmadd_vv_f32m8(vq, vx2, __riscv_vfmv_v_f_f32m8(1.0f, n), n); + + // Divide the numerator by the denominator. + // Newton-Raphson iteration for reciprocal. + vfloat32m8_t vrq = __riscv_vfrec7_v_f32m8(vq, n); + for (size_t iter = 0; iter < 2; iter++) { + vfloat32m8_t verr = __riscv_vfnmsac_vv_f32m8( + __riscv_vfmv_v_f_f32m8(2.0f, n), vrq, vq, n); + vrq = __riscv_vfmul_vv_f32m8(vrq, verr, n); + } + vfloat32m8_t verf = __riscv_vfmul_vv_f32m8(vp, vrq, n); + + // Add one to the rational interpolant, and multiply by 0.5 times the + // original input. + vfloat32m8_t vy = __riscv_vfadd_vf_f32m8(verf, 1.0f, n); + vy = __riscv_vfmul_vf_f32m8(vy, 0.5f, n); + vy = __riscv_vfmul_vv_f32m8(vy, vx_orig, n); + + __riscv_vse32_v_f32m8(output, vy, n); + output += n; + + batch -= n; + } while (batch != 0); +} diff --git a/src/f32-vapproxgelu/rvv-rational-12-10.c.in b/src/f32-vapproxgelu/rvv-rational-12-10.c.in new file mode 100644 index 00000000000..e2294943328 --- /dev/null +++ b/src/f32-vapproxgelu/rvv-rational-12-10.c.in @@ -0,0 +1,107 @@ +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +$assert LMUL in [1, 2, 4, 8] +$assert DIV in ("DIV", "NR") +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f32_vapproxgelu_ukernel__rvv_rational_12_10_${DIV.lower()}_u${LMUL}v( + size_t batch, + const float* input, + float* output, + const struct xnn_f32_default_params* unused_params) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(output != NULL); + + // Cap the inputs to this value as `erf(x/sqrt(2))` will always be `+/-1.0f` + // beyond this point. This value is chosen as the first floating point + // number as of which the interpolation returns +/-1.0f. + $if DIV == "NR": + const float vmax_x = 4.79519796e+00f; + const float vmin_x = -4.84563780e+00f; + $else: + const float vmax_x = 4.84974098e+00f; + const float vmin_x = -4.84974098e+00f; + + // The monomial coefficients of the numerator polynomial (odd). + const float valpha_1 = 7.9788458347e-01f; + const float valpha_3 = 6.0803253204e-02f; + const float valpha_5 = 7.2898347862e-03f; + const float valpha_7 = 2.6887017884e-04f; + const float valpha_9 = 1.4302649106e-05f; + const float valpha_11 = 4.9544411240e-08f; + + // The monomial coefficients of the denominator polynomial (even). + const float vbeta_2 = 2.4369759858e-01f; + const float vbeta_4 = 2.4381054565e-02f; + const float vbeta_6 = 1.3060354395e-03f; + const float vbeta_8 = 7.6477612311e-05f; + const float vbeta_10 = 1.3433452750e-06f; + + batch >>= XNN_LOG2_SIZEOF_FLOAT; + do { + const size_t n = __riscv_vsetvl_e32m${LMUL}(batch); + + vfloat32m${LMUL}_t vx_orig = __riscv_vle32_v_f32m${LMUL}(input, n); + input += n; + + // Clamp the inputs to the interpolation range. + vfloat32m${LMUL}_t vx = __riscv_vfmin_vf_f32m${LMUL}(vx_orig, vmax_x, n); + vx = __riscv_vfmax_vf_f32m${LMUL}(vx, vmin_x, n); + + // Since the polynomials are odd/even, we need x^2. + vfloat32m${LMUL}_t vx2 = __riscv_vfmul_vv_f32m${LMUL}(vx, vx, n); + + // Evaluate the numerator polynomial p. + vfloat32m${LMUL}_t vp = __riscv_vfmv_v_f_f32m${LMUL}(valpha_9, n); + vp = __riscv_vfmacc_vf_f32m${LMUL}(vp, valpha_11, vx2, n); + vp = __riscv_vfmadd_vv_f32m${LMUL}(vp, vx2, __riscv_vfmv_v_f_f32m${LMUL}(valpha_7, n), n); + vp = __riscv_vfmadd_vv_f32m${LMUL}(vp, vx2, __riscv_vfmv_v_f_f32m${LMUL}(valpha_5, n), n); + vp = __riscv_vfmadd_vv_f32m${LMUL}(vp, vx2, __riscv_vfmv_v_f_f32m${LMUL}(valpha_3, n), n); + vp = __riscv_vfmadd_vv_f32m${LMUL}(vp, vx2, __riscv_vfmv_v_f_f32m${LMUL}(valpha_1, n), n); + vp = __riscv_vfmul_vv_f32m${LMUL}(vp, vx, n); + + // Evaluate the denominator polynomial q. + vfloat32m${LMUL}_t vq = __riscv_vfmv_v_f_f32m${LMUL}(vbeta_8, n); + vq = __riscv_vfmacc_vf_f32m${LMUL}(vq, vbeta_10, vx2, n); + vq = __riscv_vfmadd_vv_f32m${LMUL}(vq, vx2, __riscv_vfmv_v_f_f32m${LMUL}(vbeta_6, n), n); + vq = __riscv_vfmadd_vv_f32m${LMUL}(vq, vx2, __riscv_vfmv_v_f_f32m${LMUL}(vbeta_4, n), n); + vq = __riscv_vfmadd_vv_f32m${LMUL}(vq, vx2, __riscv_vfmv_v_f_f32m${LMUL}(vbeta_2, n), n); + vq = __riscv_vfmadd_vv_f32m${LMUL}(vq, vx2, __riscv_vfmv_v_f_f32m${LMUL}(1.0f, n), n); + + // Divide the numerator by the denominator. + $if DIV == "DIV": + vfloat32m${LMUL}_t verf = __riscv_vfdiv_vv_f32m${LMUL}(vp, vq, n); + $else: + // Newton-Raphson iteration for reciprocal. + vfloat32m${LMUL}_t vrq = __riscv_vfrec7_v_f32m${LMUL}(vq, n); + for (size_t iter = 0; iter < 2; iter++) { + vfloat32m${LMUL}_t verr = __riscv_vfnmsac_vv_f32m${LMUL}( + __riscv_vfmv_v_f_f32m${LMUL}(2.0f, n), vrq, vq, n); + vrq = __riscv_vfmul_vv_f32m${LMUL}(vrq, verr, n); + } + vfloat32m${LMUL}_t verf = __riscv_vfmul_vv_f32m${LMUL}(vp, vrq, n); + + // Add one to the rational interpolant, and multiply by 0.5 times the + // original input. + vfloat32m${LMUL}_t vy = __riscv_vfadd_vf_f32m${LMUL}(verf, 1.0f, n); + vy = __riscv_vfmul_vf_f32m${LMUL}(vy, 0.5f, n); + vy = __riscv_vfmul_vv_f32m${LMUL}(vy, vx_orig, n); + + __riscv_vse32_v_f32m${LMUL}(output, vy, n); + output += n; + + batch -= n; + } while (batch != 0); +} diff --git a/src/f32-velu/f32-velu.inc b/src/f32-velu/f32-velu.inc index 23024c853fa..7a4d3daad2d 100644 --- a/src/f32-velu/f32-velu.inc +++ b/src/f32-velu/f32-velu.inc @@ -128,6 +128,13 @@ XNN_UKERNEL(xnn_arch_none, xnn_f32_velu_ukernel__wasmrelaxedsimd_rr2_p6_u12, 12, XNN_UKERNEL(xnn_arch_none, xnn_f32_velu_ukernel__wasmrelaxedsimd_rr2_p6_u16, 16, false, float, struct xnn_f32_elu_params, xnn_init_f32_elu_scalar_params) #endif // XNN_ARCH_WASMRELAXEDSIMD +#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR +XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_velu_ukernel__rvv_rr2_p6_u1v, 1, true, float, struct xnn_f32_elu_params, xnn_init_f32_elu_scalar_params) +XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_velu_ukernel__rvv_rr2_p6_u2v, 2, true, float, struct xnn_f32_elu_params, xnn_init_f32_elu_scalar_params) +XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_velu_ukernel__rvv_rr2_p6_u4v, 4, true, float, struct xnn_f32_elu_params, xnn_init_f32_elu_scalar_params) +XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_velu_ukernel__rvv_rr2_p6_u8v, 8, true, float, struct xnn_f32_elu_params, xnn_init_f32_elu_scalar_params) +#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR + XNN_UKERNEL(xnn_arch_none, xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_u1, 1, false, float, struct xnn_f32_elu_params, xnn_init_f32_elu_scalar_params) XNN_UKERNEL(xnn_arch_none, xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_u2, 2, false, float, struct xnn_f32_elu_params, xnn_init_f32_elu_scalar_params) XNN_UKERNEL(xnn_arch_none, xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_u3, 3, false, float, struct xnn_f32_elu_params, xnn_init_f32_elu_scalar_params) diff --git a/src/f32-velu/gen/f32-velu-rvv-rr2-p6-u1v.c b/src/f32-velu/gen/f32-velu-rvv-rr2-p6-u1v.c new file mode 100644 index 00000000000..8648b6bd409 --- /dev/null +++ b/src/f32-velu/gen/f32-velu-rvv-rr2-p6-u1v.c @@ -0,0 +1,105 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-velu/rvv-rr2-p6.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f32_velu_ukernel__rvv_rr2_p6_u1v( + size_t batch, + const float* input, + float* output, + const struct xnn_f32_elu_params* restrict params) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(output != NULL); + + const float vprescale = params->scalar.prescale; + const float valpha = params->scalar.alpha; + const float vbeta = params->scalar.beta; + + const float vsat_cutoff = -0x1.154246p+4f; + const float vmagic_bias = 0x1.8000FEp23f; + const float vlog2e = 0x1.715476p+0f; + const float vminus_ln2_hi = -0x1.62E440p-1f; + const float vminus_ln2_lo = 0x1.0105C6p-21f; + const float vc6 = 0x1.6b7338p-10f; + const float vc5 = 0x1.12278Ep-7f; + const float vc4 = 0x1.555716p-5f; + const float vc3 = 0x1.5554B0p-3f; + const float vc2 = 0x1.FFFFFEp-2f; + + batch >>= XNN_LOG2_SIZEOF_FLOAT; + do { + const size_t n = __riscv_vsetvl_e32m1(batch); + + vfloat32m1_t vx = __riscv_vle32_v_f32m1(input, n); + input += n; + + // Compute reduced argument z = max(prescale * x, sat_cutoff). + vfloat32m1_t vz = __riscv_vfmul_vf_f32m1(vx, vprescale, n); + vz = __riscv_vfmax_vf_f32m1(vz, vsat_cutoff, n); + + // Compute reduced argument n = round(z / ln(2)). + // Use magic bias to get rounding for free. + vfloat32m1_t vn = __riscv_vfmacc_vf_f32m1( + __riscv_vfmv_v_f_f32m1(vmagic_bias, n), vlog2e, vz, n); + + // Create 2^n by shifting n (as integer) into the exponent field. + vint32m1_t ven = __riscv_vsll_vx_i32m1( + __riscv_vreinterpret_v_f32m1_i32m1(vn), 23, n); + vfloat32m1_t vs = __riscv_vreinterpret_v_i32m1_f32m1(ven); + + // Subtract magic bias to get the reduced argument. + vn = __riscv_vfsub_vf_f32m1(vn, vmagic_bias, n); + + // Compute reduced argument t = z - n * ln(2). + // Use Cody-Waite range reduction (two constants to represent ln(2)). + vfloat32m1_t vt = __riscv_vfmacc_vf_f32m1(vz, vminus_ln2_hi, vn, n); + vt = __riscv_vfmacc_vf_f32m1(vt, vminus_ln2_lo, vn, n); + + // Compute degree-6 polynomial approximation for exp(t) - 1 using Horner's method. + // p = t * (t * (c2 + t * (c3 + t * (c4 + t * (c5 + t * c6))))) + vfloat32m1_t vp = __riscv_vfmv_v_f_f32m1(vc5, n); + vp = __riscv_vfmacc_vf_f32m1(vp, vc6, vt, n); + vp = __riscv_vfmadd_vv_f32m1(vp, vt, __riscv_vfmv_v_f_f32m1(vc4, n), n); + vp = __riscv_vfmadd_vv_f32m1(vp, vt, __riscv_vfmv_v_f_f32m1(vc3, n), n); + vp = __riscv_vfmadd_vv_f32m1(vp, vt, __riscv_vfmv_v_f_f32m1(vc2, n), n); + vp = __riscv_vfmul_vv_f32m1(vp, vt, n); + + // Reconstruct the exp(z) value: + // t * s + // s - 1 + // p = (p * t) + t + // e = (p + (s - 1)) * alpha + vt = __riscv_vfmul_vv_f32m1(vt, vs, n); + vs = __riscv_vfsub_vf_f32m1(vs, 1.0f, n); + vp = __riscv_vfmadd_vv_f32m1(vp, vt, vt, n); + vfloat32m1_t ve = __riscv_vfmul_vf_f32m1( + __riscv_vfadd_vv_f32m1(vp, vs, n), valpha, n); + + // Select between the ELU and linear parts: + // y = x < 0 ? e : x * beta + vfloat32m1_t vy = __riscv_vfmul_vf_f32m1(vx, vbeta, n); + vbool32_t mask = __riscv_vmflt_vf_f32m1_b32(vx, 0.0f, n); + vy = __riscv_vmerge_vvm_f32m1(vy, ve, mask, n); + + __riscv_vse32_v_f32m1(output, vy, n); + output += n; + + batch -= n; + } while (batch != 0); +} diff --git a/src/f32-velu/gen/f32-velu-rvv-rr2-p6-u2v.c b/src/f32-velu/gen/f32-velu-rvv-rr2-p6-u2v.c new file mode 100644 index 00000000000..194d3bfcfc5 --- /dev/null +++ b/src/f32-velu/gen/f32-velu-rvv-rr2-p6-u2v.c @@ -0,0 +1,105 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-velu/rvv-rr2-p6.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f32_velu_ukernel__rvv_rr2_p6_u2v( + size_t batch, + const float* input, + float* output, + const struct xnn_f32_elu_params* restrict params) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(output != NULL); + + const float vprescale = params->scalar.prescale; + const float valpha = params->scalar.alpha; + const float vbeta = params->scalar.beta; + + const float vsat_cutoff = -0x1.154246p+4f; + const float vmagic_bias = 0x1.8000FEp23f; + const float vlog2e = 0x1.715476p+0f; + const float vminus_ln2_hi = -0x1.62E440p-1f; + const float vminus_ln2_lo = 0x1.0105C6p-21f; + const float vc6 = 0x1.6b7338p-10f; + const float vc5 = 0x1.12278Ep-7f; + const float vc4 = 0x1.555716p-5f; + const float vc3 = 0x1.5554B0p-3f; + const float vc2 = 0x1.FFFFFEp-2f; + + batch >>= XNN_LOG2_SIZEOF_FLOAT; + do { + const size_t n = __riscv_vsetvl_e32m2(batch); + + vfloat32m2_t vx = __riscv_vle32_v_f32m2(input, n); + input += n; + + // Compute reduced argument z = max(prescale * x, sat_cutoff). + vfloat32m2_t vz = __riscv_vfmul_vf_f32m2(vx, vprescale, n); + vz = __riscv_vfmax_vf_f32m2(vz, vsat_cutoff, n); + + // Compute reduced argument n = round(z / ln(2)). + // Use magic bias to get rounding for free. + vfloat32m2_t vn = __riscv_vfmacc_vf_f32m2( + __riscv_vfmv_v_f_f32m2(vmagic_bias, n), vlog2e, vz, n); + + // Create 2^n by shifting n (as integer) into the exponent field. + vint32m2_t ven = __riscv_vsll_vx_i32m2( + __riscv_vreinterpret_v_f32m2_i32m2(vn), 23, n); + vfloat32m2_t vs = __riscv_vreinterpret_v_i32m2_f32m2(ven); + + // Subtract magic bias to get the reduced argument. + vn = __riscv_vfsub_vf_f32m2(vn, vmagic_bias, n); + + // Compute reduced argument t = z - n * ln(2). + // Use Cody-Waite range reduction (two constants to represent ln(2)). + vfloat32m2_t vt = __riscv_vfmacc_vf_f32m2(vz, vminus_ln2_hi, vn, n); + vt = __riscv_vfmacc_vf_f32m2(vt, vminus_ln2_lo, vn, n); + + // Compute degree-6 polynomial approximation for exp(t) - 1 using Horner's method. + // p = t * (t * (c2 + t * (c3 + t * (c4 + t * (c5 + t * c6))))) + vfloat32m2_t vp = __riscv_vfmv_v_f_f32m2(vc5, n); + vp = __riscv_vfmacc_vf_f32m2(vp, vc6, vt, n); + vp = __riscv_vfmadd_vv_f32m2(vp, vt, __riscv_vfmv_v_f_f32m2(vc4, n), n); + vp = __riscv_vfmadd_vv_f32m2(vp, vt, __riscv_vfmv_v_f_f32m2(vc3, n), n); + vp = __riscv_vfmadd_vv_f32m2(vp, vt, __riscv_vfmv_v_f_f32m2(vc2, n), n); + vp = __riscv_vfmul_vv_f32m2(vp, vt, n); + + // Reconstruct the exp(z) value: + // t * s + // s - 1 + // p = (p * t) + t + // e = (p + (s - 1)) * alpha + vt = __riscv_vfmul_vv_f32m2(vt, vs, n); + vs = __riscv_vfsub_vf_f32m2(vs, 1.0f, n); + vp = __riscv_vfmadd_vv_f32m2(vp, vt, vt, n); + vfloat32m2_t ve = __riscv_vfmul_vf_f32m2( + __riscv_vfadd_vv_f32m2(vp, vs, n), valpha, n); + + // Select between the ELU and linear parts: + // y = x < 0 ? e : x * beta + vfloat32m2_t vy = __riscv_vfmul_vf_f32m2(vx, vbeta, n); + vbool16_t mask = __riscv_vmflt_vf_f32m2_b16(vx, 0.0f, n); + vy = __riscv_vmerge_vvm_f32m2(vy, ve, mask, n); + + __riscv_vse32_v_f32m2(output, vy, n); + output += n; + + batch -= n; + } while (batch != 0); +} diff --git a/src/f32-velu/gen/f32-velu-rvv-rr2-p6-u4v.c b/src/f32-velu/gen/f32-velu-rvv-rr2-p6-u4v.c new file mode 100644 index 00000000000..8ad83d6482d --- /dev/null +++ b/src/f32-velu/gen/f32-velu-rvv-rr2-p6-u4v.c @@ -0,0 +1,105 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-velu/rvv-rr2-p6.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f32_velu_ukernel__rvv_rr2_p6_u4v( + size_t batch, + const float* input, + float* output, + const struct xnn_f32_elu_params* restrict params) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(output != NULL); + + const float vprescale = params->scalar.prescale; + const float valpha = params->scalar.alpha; + const float vbeta = params->scalar.beta; + + const float vsat_cutoff = -0x1.154246p+4f; + const float vmagic_bias = 0x1.8000FEp23f; + const float vlog2e = 0x1.715476p+0f; + const float vminus_ln2_hi = -0x1.62E440p-1f; + const float vminus_ln2_lo = 0x1.0105C6p-21f; + const float vc6 = 0x1.6b7338p-10f; + const float vc5 = 0x1.12278Ep-7f; + const float vc4 = 0x1.555716p-5f; + const float vc3 = 0x1.5554B0p-3f; + const float vc2 = 0x1.FFFFFEp-2f; + + batch >>= XNN_LOG2_SIZEOF_FLOAT; + do { + const size_t n = __riscv_vsetvl_e32m4(batch); + + vfloat32m4_t vx = __riscv_vle32_v_f32m4(input, n); + input += n; + + // Compute reduced argument z = max(prescale * x, sat_cutoff). + vfloat32m4_t vz = __riscv_vfmul_vf_f32m4(vx, vprescale, n); + vz = __riscv_vfmax_vf_f32m4(vz, vsat_cutoff, n); + + // Compute reduced argument n = round(z / ln(2)). + // Use magic bias to get rounding for free. + vfloat32m4_t vn = __riscv_vfmacc_vf_f32m4( + __riscv_vfmv_v_f_f32m4(vmagic_bias, n), vlog2e, vz, n); + + // Create 2^n by shifting n (as integer) into the exponent field. + vint32m4_t ven = __riscv_vsll_vx_i32m4( + __riscv_vreinterpret_v_f32m4_i32m4(vn), 23, n); + vfloat32m4_t vs = __riscv_vreinterpret_v_i32m4_f32m4(ven); + + // Subtract magic bias to get the reduced argument. + vn = __riscv_vfsub_vf_f32m4(vn, vmagic_bias, n); + + // Compute reduced argument t = z - n * ln(2). + // Use Cody-Waite range reduction (two constants to represent ln(2)). + vfloat32m4_t vt = __riscv_vfmacc_vf_f32m4(vz, vminus_ln2_hi, vn, n); + vt = __riscv_vfmacc_vf_f32m4(vt, vminus_ln2_lo, vn, n); + + // Compute degree-6 polynomial approximation for exp(t) - 1 using Horner's method. + // p = t * (t * (c2 + t * (c3 + t * (c4 + t * (c5 + t * c6))))) + vfloat32m4_t vp = __riscv_vfmv_v_f_f32m4(vc5, n); + vp = __riscv_vfmacc_vf_f32m4(vp, vc6, vt, n); + vp = __riscv_vfmadd_vv_f32m4(vp, vt, __riscv_vfmv_v_f_f32m4(vc4, n), n); + vp = __riscv_vfmadd_vv_f32m4(vp, vt, __riscv_vfmv_v_f_f32m4(vc3, n), n); + vp = __riscv_vfmadd_vv_f32m4(vp, vt, __riscv_vfmv_v_f_f32m4(vc2, n), n); + vp = __riscv_vfmul_vv_f32m4(vp, vt, n); + + // Reconstruct the exp(z) value: + // t * s + // s - 1 + // p = (p * t) + t + // e = (p + (s - 1)) * alpha + vt = __riscv_vfmul_vv_f32m4(vt, vs, n); + vs = __riscv_vfsub_vf_f32m4(vs, 1.0f, n); + vp = __riscv_vfmadd_vv_f32m4(vp, vt, vt, n); + vfloat32m4_t ve = __riscv_vfmul_vf_f32m4( + __riscv_vfadd_vv_f32m4(vp, vs, n), valpha, n); + + // Select between the ELU and linear parts: + // y = x < 0 ? e : x * beta + vfloat32m4_t vy = __riscv_vfmul_vf_f32m4(vx, vbeta, n); + vbool8_t mask = __riscv_vmflt_vf_f32m4_b8(vx, 0.0f, n); + vy = __riscv_vmerge_vvm_f32m4(vy, ve, mask, n); + + __riscv_vse32_v_f32m4(output, vy, n); + output += n; + + batch -= n; + } while (batch != 0); +} diff --git a/src/f32-velu/gen/f32-velu-rvv-rr2-p6-u8v.c b/src/f32-velu/gen/f32-velu-rvv-rr2-p6-u8v.c new file mode 100644 index 00000000000..2def057e537 --- /dev/null +++ b/src/f32-velu/gen/f32-velu-rvv-rr2-p6-u8v.c @@ -0,0 +1,105 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-velu/rvv-rr2-p6.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f32_velu_ukernel__rvv_rr2_p6_u8v( + size_t batch, + const float* input, + float* output, + const struct xnn_f32_elu_params* restrict params) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(output != NULL); + + const float vprescale = params->scalar.prescale; + const float valpha = params->scalar.alpha; + const float vbeta = params->scalar.beta; + + const float vsat_cutoff = -0x1.154246p+4f; + const float vmagic_bias = 0x1.8000FEp23f; + const float vlog2e = 0x1.715476p+0f; + const float vminus_ln2_hi = -0x1.62E440p-1f; + const float vminus_ln2_lo = 0x1.0105C6p-21f; + const float vc6 = 0x1.6b7338p-10f; + const float vc5 = 0x1.12278Ep-7f; + const float vc4 = 0x1.555716p-5f; + const float vc3 = 0x1.5554B0p-3f; + const float vc2 = 0x1.FFFFFEp-2f; + + batch >>= XNN_LOG2_SIZEOF_FLOAT; + do { + const size_t n = __riscv_vsetvl_e32m8(batch); + + vfloat32m8_t vx = __riscv_vle32_v_f32m8(input, n); + input += n; + + // Compute reduced argument z = max(prescale * x, sat_cutoff). + vfloat32m8_t vz = __riscv_vfmul_vf_f32m8(vx, vprescale, n); + vz = __riscv_vfmax_vf_f32m8(vz, vsat_cutoff, n); + + // Compute reduced argument n = round(z / ln(2)). + // Use magic bias to get rounding for free. + vfloat32m8_t vn = __riscv_vfmacc_vf_f32m8( + __riscv_vfmv_v_f_f32m8(vmagic_bias, n), vlog2e, vz, n); + + // Create 2^n by shifting n (as integer) into the exponent field. + vint32m8_t ven = __riscv_vsll_vx_i32m8( + __riscv_vreinterpret_v_f32m8_i32m8(vn), 23, n); + vfloat32m8_t vs = __riscv_vreinterpret_v_i32m8_f32m8(ven); + + // Subtract magic bias to get the reduced argument. + vn = __riscv_vfsub_vf_f32m8(vn, vmagic_bias, n); + + // Compute reduced argument t = z - n * ln(2). + // Use Cody-Waite range reduction (two constants to represent ln(2)). + vfloat32m8_t vt = __riscv_vfmacc_vf_f32m8(vz, vminus_ln2_hi, vn, n); + vt = __riscv_vfmacc_vf_f32m8(vt, vminus_ln2_lo, vn, n); + + // Compute degree-6 polynomial approximation for exp(t) - 1 using Horner's method. + // p = t * (t * (c2 + t * (c3 + t * (c4 + t * (c5 + t * c6))))) + vfloat32m8_t vp = __riscv_vfmv_v_f_f32m8(vc5, n); + vp = __riscv_vfmacc_vf_f32m8(vp, vc6, vt, n); + vp = __riscv_vfmadd_vv_f32m8(vp, vt, __riscv_vfmv_v_f_f32m8(vc4, n), n); + vp = __riscv_vfmadd_vv_f32m8(vp, vt, __riscv_vfmv_v_f_f32m8(vc3, n), n); + vp = __riscv_vfmadd_vv_f32m8(vp, vt, __riscv_vfmv_v_f_f32m8(vc2, n), n); + vp = __riscv_vfmul_vv_f32m8(vp, vt, n); + + // Reconstruct the exp(z) value: + // t * s + // s - 1 + // p = (p * t) + t + // e = (p + (s - 1)) * alpha + vt = __riscv_vfmul_vv_f32m8(vt, vs, n); + vs = __riscv_vfsub_vf_f32m8(vs, 1.0f, n); + vp = __riscv_vfmadd_vv_f32m8(vp, vt, vt, n); + vfloat32m8_t ve = __riscv_vfmul_vf_f32m8( + __riscv_vfadd_vv_f32m8(vp, vs, n), valpha, n); + + // Select between the ELU and linear parts: + // y = x < 0 ? e : x * beta + vfloat32m8_t vy = __riscv_vfmul_vf_f32m8(vx, vbeta, n); + vbool4_t mask = __riscv_vmflt_vf_f32m8_b4(vx, 0.0f, n); + vy = __riscv_vmerge_vvm_f32m8(vy, ve, mask, n); + + __riscv_vse32_v_f32m8(output, vy, n); + output += n; + + batch -= n; + } while (batch != 0); +} diff --git a/src/f32-velu/rvv-rr2-p6.c.in b/src/f32-velu/rvv-rr2-p6.c.in new file mode 100644 index 00000000000..5cb16183169 --- /dev/null +++ b/src/f32-velu/rvv-rr2-p6.c.in @@ -0,0 +1,101 @@ +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +$assert LMUL in [1, 2, 4, 8] +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f32_velu_ukernel__rvv_rr2_p6_u${LMUL}v( + size_t batch, + const float* input, + float* output, + const struct xnn_f32_elu_params* restrict params) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(output != NULL); + + const float vprescale = params->scalar.prescale; + const float valpha = params->scalar.alpha; + const float vbeta = params->scalar.beta; + + const float vsat_cutoff = -0x1.154246p+4f; + const float vmagic_bias = 0x1.8000FEp23f; + const float vlog2e = 0x1.715476p+0f; + const float vminus_ln2_hi = -0x1.62E440p-1f; + const float vminus_ln2_lo = 0x1.0105C6p-21f; + const float vc6 = 0x1.6b7338p-10f; + const float vc5 = 0x1.12278Ep-7f; + const float vc4 = 0x1.555716p-5f; + const float vc3 = 0x1.5554B0p-3f; + const float vc2 = 0x1.FFFFFEp-2f; + + batch >>= XNN_LOG2_SIZEOF_FLOAT; + do { + const size_t n = __riscv_vsetvl_e32m${LMUL}(batch); + + vfloat32m${LMUL}_t vx = __riscv_vle32_v_f32m${LMUL}(input, n); + input += n; + + // Compute reduced argument z = max(prescale * x, sat_cutoff). + vfloat32m${LMUL}_t vz = __riscv_vfmul_vf_f32m${LMUL}(vx, vprescale, n); + vz = __riscv_vfmax_vf_f32m${LMUL}(vz, vsat_cutoff, n); + + // Compute reduced argument n = round(z / ln(2)). + // Use magic bias to get rounding for free. + vfloat32m${LMUL}_t vn = __riscv_vfmacc_vf_f32m${LMUL}( + __riscv_vfmv_v_f_f32m${LMUL}(vmagic_bias, n), vlog2e, vz, n); + + // Create 2^n by shifting n (as integer) into the exponent field. + vint32m${LMUL}_t ven = __riscv_vsll_vx_i32m${LMUL}( + __riscv_vreinterpret_v_f32m${LMUL}_i32m${LMUL}(vn), 23, n); + vfloat32m${LMUL}_t vs = __riscv_vreinterpret_v_i32m${LMUL}_f32m${LMUL}(ven); + + // Subtract magic bias to get the reduced argument. + vn = __riscv_vfsub_vf_f32m${LMUL}(vn, vmagic_bias, n); + + // Compute reduced argument t = z - n * ln(2). + // Use Cody-Waite range reduction (two constants to represent ln(2)). + vfloat32m${LMUL}_t vt = __riscv_vfmacc_vf_f32m${LMUL}(vz, vminus_ln2_hi, vn, n); + vt = __riscv_vfmacc_vf_f32m${LMUL}(vt, vminus_ln2_lo, vn, n); + + // Compute degree-6 polynomial approximation for exp(t) - 1 using Horner's method. + // p = t * (t * (c2 + t * (c3 + t * (c4 + t * (c5 + t * c6))))) + vfloat32m${LMUL}_t vp = __riscv_vfmv_v_f_f32m${LMUL}(vc5, n); + vp = __riscv_vfmacc_vf_f32m${LMUL}(vp, vc6, vt, n); + vp = __riscv_vfmadd_vv_f32m${LMUL}(vp, vt, __riscv_vfmv_v_f_f32m${LMUL}(vc4, n), n); + vp = __riscv_vfmadd_vv_f32m${LMUL}(vp, vt, __riscv_vfmv_v_f_f32m${LMUL}(vc3, n), n); + vp = __riscv_vfmadd_vv_f32m${LMUL}(vp, vt, __riscv_vfmv_v_f_f32m${LMUL}(vc2, n), n); + vp = __riscv_vfmul_vv_f32m${LMUL}(vp, vt, n); + + // Reconstruct the exp(z) value: + // t * s + // s - 1 + // p = (p * t) + t + // e = (p + (s - 1)) * alpha + vt = __riscv_vfmul_vv_f32m${LMUL}(vt, vs, n); + vs = __riscv_vfsub_vf_f32m${LMUL}(vs, 1.0f, n); + vp = __riscv_vfmadd_vv_f32m${LMUL}(vp, vt, vt, n); + vfloat32m${LMUL}_t ve = __riscv_vfmul_vf_f32m${LMUL}( + __riscv_vfadd_vv_f32m${LMUL}(vp, vs, n), valpha, n); + + // Select between the ELU and linear parts: + // y = x < 0 ? e : x * beta + vfloat32m${LMUL}_t vy = __riscv_vfmul_vf_f32m${LMUL}(vx, vbeta, n); + vbool${int(32/LMUL)}_t mask = __riscv_vmflt_vf_f32m${LMUL}_b${int(32/LMUL)}(vx, 0.0f, n); + vy = __riscv_vmerge_vvm_f32m${LMUL}(vy, ve, mask, n); + + __riscv_vse32_v_f32m${LMUL}(output, vy, n); + output += n; + + batch -= n; + } while (batch != 0); +} diff --git a/src/f32-vgelu/f32-vgelu.inc b/src/f32-vgelu/f32-vgelu.inc index 6542997c43b..bb76de70004 100644 --- a/src/f32-vgelu/f32-vgelu.inc +++ b/src/f32-vgelu/f32-vgelu.inc @@ -68,3 +68,10 @@ XNN_UKERNEL(xnn_arch_hvx, xnn_f32_vgelu_ukernel__hvx_rational_12_10_nr_u32, 32, XNN_UKERNEL(xnn_arch_hvx, xnn_f32_vgelu_ukernel__hvx_rational_12_10_nr_u64, 64, false, float, struct xnn_f32_default_params, NULL) XNN_UKERNEL(xnn_arch_hvx, xnn_f32_vgelu_ukernel__hvx_rational_12_10_nr_u128, 128, false, float, struct xnn_f32_default_params, NULL) #endif // XNN_ENABLE_HVX && XNN_ARCH_HEXAGON + +#if XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR +XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_vgelu_ukernel__rvv_rational_12_10_div_u1v, 1, true, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_vgelu_ukernel__rvv_rational_12_10_div_u2v, 2, true, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_vgelu_ukernel__rvv_rational_12_10_div_u4v, 4, true, float, struct xnn_f32_default_params, NULL) +XNN_UKERNEL(xnn_arch_riscv_vector, xnn_f32_vgelu_ukernel__rvv_rational_12_10_div_u8v, 8, true, float, struct xnn_f32_default_params, NULL) +#endif // XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR diff --git a/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u1v.c b/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u1v.c new file mode 100644 index 00000000000..e5e29e1a535 --- /dev/null +++ b/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u1v.c @@ -0,0 +1,95 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-vgelu/rvv-rational-12-10.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f32_vgelu_ukernel__rvv_rational_12_10_div_u1v( + size_t batch, + const float* input, + float* output, + const struct xnn_f32_default_params* unused_params) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(output != NULL); + + // Cap the inputs to this value as `erf(x/sqrt(2))` will always be `+/-1.0f` + // beyond this point. This value is chosen as the first floating point + // number as of which the interpolation returns +/-1.0f. + const float vmax_abs_x = 5.1638283730e+00f; + + // The monomial coefficients of the numerator polynomial (odd). + const float valpha_1 = 7.9788452387e-01f; + const float valpha_3 = 6.6972173750e-02f; + const float valpha_5 = 9.3065137044e-03f; + const float valpha_7 = 3.2973114867e-04f; + const float valpha_9 = 1.2609783880e-05f; + const float valpha_11 = 4.5835321316e-08f; + + // The monomial coefficients of the denominator polynomial (even). + const float vbeta_2 = 2.5060352683e-01f; + const float vbeta_4 = 2.8431978077e-02f; + const float vbeta_6 = 1.8622842617e-03f; + const float vbeta_8 = 7.2267655923e-05f; + const float vbeta_10 = 1.1988805682e-06f; + + batch >>= XNN_LOG2_SIZEOF_FLOAT; + do { + const size_t n = __riscv_vsetvl_e32m1(batch); + + vfloat32m1_t vx_orig = __riscv_vle32_v_f32m1(input, n); + input += n; + + // Clamp the inputs to the interpolation range. + vfloat32m1_t vx = __riscv_vfmin_vf_f32m1(vx_orig, vmax_abs_x, n); + vx = __riscv_vfmax_vf_f32m1(vx, -vmax_abs_x, n); + + // Since the polynomials are odd/even, we need x^2. + vfloat32m1_t vx2 = __riscv_vfmul_vv_f32m1(vx, vx, n); + + // Evaluate the numerator polynomial p. + vfloat32m1_t vp = __riscv_vfmv_v_f_f32m1(valpha_9, n); + vp = __riscv_vfmacc_vf_f32m1(vp, valpha_11, vx2, n); + vp = __riscv_vfmadd_vv_f32m1(vp, vx2, __riscv_vfmv_v_f_f32m1(valpha_7, n), n); + vp = __riscv_vfmadd_vv_f32m1(vp, vx2, __riscv_vfmv_v_f_f32m1(valpha_5, n), n); + vp = __riscv_vfmadd_vv_f32m1(vp, vx2, __riscv_vfmv_v_f_f32m1(valpha_3, n), n); + vp = __riscv_vfmadd_vv_f32m1(vp, vx2, __riscv_vfmv_v_f_f32m1(valpha_1, n), n); + vp = __riscv_vfmul_vv_f32m1(vp, vx, n); + + // Evaluate the denominator polynomial q. + vfloat32m1_t vq = __riscv_vfmv_v_f_f32m1(vbeta_8, n); + vq = __riscv_vfmacc_vf_f32m1(vq, vbeta_10, vx2, n); + vq = __riscv_vfmadd_vv_f32m1(vq, vx2, __riscv_vfmv_v_f_f32m1(vbeta_6, n), n); + vq = __riscv_vfmadd_vv_f32m1(vq, vx2, __riscv_vfmv_v_f_f32m1(vbeta_4, n), n); + vq = __riscv_vfmadd_vv_f32m1(vq, vx2, __riscv_vfmv_v_f_f32m1(vbeta_2, n), n); + vq = __riscv_vfmadd_vv_f32m1(vq, vx2, __riscv_vfmv_v_f_f32m1(1.0f, n), n); + + // Divide the numerator by the denominator. + vfloat32m1_t verf = __riscv_vfdiv_vv_f32m1(vp, vq, n); + + // Add one to the rational interpolant, and multiply by 0.5 times the + // original input. + vfloat32m1_t vy = __riscv_vfadd_vf_f32m1(verf, 1.0f, n); + vy = __riscv_vfmul_vf_f32m1(vy, 0.5f, n); + vy = __riscv_vfmul_vv_f32m1(vy, vx_orig, n); + + __riscv_vse32_v_f32m1(output, vy, n); + output += n; + + batch -= n; + } while (batch != 0); +} diff --git a/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u2v.c b/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u2v.c new file mode 100644 index 00000000000..c7284af1a85 --- /dev/null +++ b/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u2v.c @@ -0,0 +1,95 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-vgelu/rvv-rational-12-10.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f32_vgelu_ukernel__rvv_rational_12_10_div_u2v( + size_t batch, + const float* input, + float* output, + const struct xnn_f32_default_params* unused_params) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(output != NULL); + + // Cap the inputs to this value as `erf(x/sqrt(2))` will always be `+/-1.0f` + // beyond this point. This value is chosen as the first floating point + // number as of which the interpolation returns +/-1.0f. + const float vmax_abs_x = 5.1638283730e+00f; + + // The monomial coefficients of the numerator polynomial (odd). + const float valpha_1 = 7.9788452387e-01f; + const float valpha_3 = 6.6972173750e-02f; + const float valpha_5 = 9.3065137044e-03f; + const float valpha_7 = 3.2973114867e-04f; + const float valpha_9 = 1.2609783880e-05f; + const float valpha_11 = 4.5835321316e-08f; + + // The monomial coefficients of the denominator polynomial (even). + const float vbeta_2 = 2.5060352683e-01f; + const float vbeta_4 = 2.8431978077e-02f; + const float vbeta_6 = 1.8622842617e-03f; + const float vbeta_8 = 7.2267655923e-05f; + const float vbeta_10 = 1.1988805682e-06f; + + batch >>= XNN_LOG2_SIZEOF_FLOAT; + do { + const size_t n = __riscv_vsetvl_e32m2(batch); + + vfloat32m2_t vx_orig = __riscv_vle32_v_f32m2(input, n); + input += n; + + // Clamp the inputs to the interpolation range. + vfloat32m2_t vx = __riscv_vfmin_vf_f32m2(vx_orig, vmax_abs_x, n); + vx = __riscv_vfmax_vf_f32m2(vx, -vmax_abs_x, n); + + // Since the polynomials are odd/even, we need x^2. + vfloat32m2_t vx2 = __riscv_vfmul_vv_f32m2(vx, vx, n); + + // Evaluate the numerator polynomial p. + vfloat32m2_t vp = __riscv_vfmv_v_f_f32m2(valpha_9, n); + vp = __riscv_vfmacc_vf_f32m2(vp, valpha_11, vx2, n); + vp = __riscv_vfmadd_vv_f32m2(vp, vx2, __riscv_vfmv_v_f_f32m2(valpha_7, n), n); + vp = __riscv_vfmadd_vv_f32m2(vp, vx2, __riscv_vfmv_v_f_f32m2(valpha_5, n), n); + vp = __riscv_vfmadd_vv_f32m2(vp, vx2, __riscv_vfmv_v_f_f32m2(valpha_3, n), n); + vp = __riscv_vfmadd_vv_f32m2(vp, vx2, __riscv_vfmv_v_f_f32m2(valpha_1, n), n); + vp = __riscv_vfmul_vv_f32m2(vp, vx, n); + + // Evaluate the denominator polynomial q. + vfloat32m2_t vq = __riscv_vfmv_v_f_f32m2(vbeta_8, n); + vq = __riscv_vfmacc_vf_f32m2(vq, vbeta_10, vx2, n); + vq = __riscv_vfmadd_vv_f32m2(vq, vx2, __riscv_vfmv_v_f_f32m2(vbeta_6, n), n); + vq = __riscv_vfmadd_vv_f32m2(vq, vx2, __riscv_vfmv_v_f_f32m2(vbeta_4, n), n); + vq = __riscv_vfmadd_vv_f32m2(vq, vx2, __riscv_vfmv_v_f_f32m2(vbeta_2, n), n); + vq = __riscv_vfmadd_vv_f32m2(vq, vx2, __riscv_vfmv_v_f_f32m2(1.0f, n), n); + + // Divide the numerator by the denominator. + vfloat32m2_t verf = __riscv_vfdiv_vv_f32m2(vp, vq, n); + + // Add one to the rational interpolant, and multiply by 0.5 times the + // original input. + vfloat32m2_t vy = __riscv_vfadd_vf_f32m2(verf, 1.0f, n); + vy = __riscv_vfmul_vf_f32m2(vy, 0.5f, n); + vy = __riscv_vfmul_vv_f32m2(vy, vx_orig, n); + + __riscv_vse32_v_f32m2(output, vy, n); + output += n; + + batch -= n; + } while (batch != 0); +} diff --git a/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u4v.c b/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u4v.c new file mode 100644 index 00000000000..2961924ddea --- /dev/null +++ b/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u4v.c @@ -0,0 +1,95 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-vgelu/rvv-rational-12-10.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f32_vgelu_ukernel__rvv_rational_12_10_div_u4v( + size_t batch, + const float* input, + float* output, + const struct xnn_f32_default_params* unused_params) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(output != NULL); + + // Cap the inputs to this value as `erf(x/sqrt(2))` will always be `+/-1.0f` + // beyond this point. This value is chosen as the first floating point + // number as of which the interpolation returns +/-1.0f. + const float vmax_abs_x = 5.1638283730e+00f; + + // The monomial coefficients of the numerator polynomial (odd). + const float valpha_1 = 7.9788452387e-01f; + const float valpha_3 = 6.6972173750e-02f; + const float valpha_5 = 9.3065137044e-03f; + const float valpha_7 = 3.2973114867e-04f; + const float valpha_9 = 1.2609783880e-05f; + const float valpha_11 = 4.5835321316e-08f; + + // The monomial coefficients of the denominator polynomial (even). + const float vbeta_2 = 2.5060352683e-01f; + const float vbeta_4 = 2.8431978077e-02f; + const float vbeta_6 = 1.8622842617e-03f; + const float vbeta_8 = 7.2267655923e-05f; + const float vbeta_10 = 1.1988805682e-06f; + + batch >>= XNN_LOG2_SIZEOF_FLOAT; + do { + const size_t n = __riscv_vsetvl_e32m4(batch); + + vfloat32m4_t vx_orig = __riscv_vle32_v_f32m4(input, n); + input += n; + + // Clamp the inputs to the interpolation range. + vfloat32m4_t vx = __riscv_vfmin_vf_f32m4(vx_orig, vmax_abs_x, n); + vx = __riscv_vfmax_vf_f32m4(vx, -vmax_abs_x, n); + + // Since the polynomials are odd/even, we need x^2. + vfloat32m4_t vx2 = __riscv_vfmul_vv_f32m4(vx, vx, n); + + // Evaluate the numerator polynomial p. + vfloat32m4_t vp = __riscv_vfmv_v_f_f32m4(valpha_9, n); + vp = __riscv_vfmacc_vf_f32m4(vp, valpha_11, vx2, n); + vp = __riscv_vfmadd_vv_f32m4(vp, vx2, __riscv_vfmv_v_f_f32m4(valpha_7, n), n); + vp = __riscv_vfmadd_vv_f32m4(vp, vx2, __riscv_vfmv_v_f_f32m4(valpha_5, n), n); + vp = __riscv_vfmadd_vv_f32m4(vp, vx2, __riscv_vfmv_v_f_f32m4(valpha_3, n), n); + vp = __riscv_vfmadd_vv_f32m4(vp, vx2, __riscv_vfmv_v_f_f32m4(valpha_1, n), n); + vp = __riscv_vfmul_vv_f32m4(vp, vx, n); + + // Evaluate the denominator polynomial q. + vfloat32m4_t vq = __riscv_vfmv_v_f_f32m4(vbeta_8, n); + vq = __riscv_vfmacc_vf_f32m4(vq, vbeta_10, vx2, n); + vq = __riscv_vfmadd_vv_f32m4(vq, vx2, __riscv_vfmv_v_f_f32m4(vbeta_6, n), n); + vq = __riscv_vfmadd_vv_f32m4(vq, vx2, __riscv_vfmv_v_f_f32m4(vbeta_4, n), n); + vq = __riscv_vfmadd_vv_f32m4(vq, vx2, __riscv_vfmv_v_f_f32m4(vbeta_2, n), n); + vq = __riscv_vfmadd_vv_f32m4(vq, vx2, __riscv_vfmv_v_f_f32m4(1.0f, n), n); + + // Divide the numerator by the denominator. + vfloat32m4_t verf = __riscv_vfdiv_vv_f32m4(vp, vq, n); + + // Add one to the rational interpolant, and multiply by 0.5 times the + // original input. + vfloat32m4_t vy = __riscv_vfadd_vf_f32m4(verf, 1.0f, n); + vy = __riscv_vfmul_vf_f32m4(vy, 0.5f, n); + vy = __riscv_vfmul_vv_f32m4(vy, vx_orig, n); + + __riscv_vse32_v_f32m4(output, vy, n); + output += n; + + batch -= n; + } while (batch != 0); +} diff --git a/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u8v.c b/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u8v.c new file mode 100644 index 00000000000..417cc1f7501 --- /dev/null +++ b/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u8v.c @@ -0,0 +1,95 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-vgelu/rvv-rational-12-10.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f32_vgelu_ukernel__rvv_rational_12_10_div_u8v( + size_t batch, + const float* input, + float* output, + const struct xnn_f32_default_params* unused_params) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(output != NULL); + + // Cap the inputs to this value as `erf(x/sqrt(2))` will always be `+/-1.0f` + // beyond this point. This value is chosen as the first floating point + // number as of which the interpolation returns +/-1.0f. + const float vmax_abs_x = 5.1638283730e+00f; + + // The monomial coefficients of the numerator polynomial (odd). + const float valpha_1 = 7.9788452387e-01f; + const float valpha_3 = 6.6972173750e-02f; + const float valpha_5 = 9.3065137044e-03f; + const float valpha_7 = 3.2973114867e-04f; + const float valpha_9 = 1.2609783880e-05f; + const float valpha_11 = 4.5835321316e-08f; + + // The monomial coefficients of the denominator polynomial (even). + const float vbeta_2 = 2.5060352683e-01f; + const float vbeta_4 = 2.8431978077e-02f; + const float vbeta_6 = 1.8622842617e-03f; + const float vbeta_8 = 7.2267655923e-05f; + const float vbeta_10 = 1.1988805682e-06f; + + batch >>= XNN_LOG2_SIZEOF_FLOAT; + do { + const size_t n = __riscv_vsetvl_e32m8(batch); + + vfloat32m8_t vx_orig = __riscv_vle32_v_f32m8(input, n); + input += n; + + // Clamp the inputs to the interpolation range. + vfloat32m8_t vx = __riscv_vfmin_vf_f32m8(vx_orig, vmax_abs_x, n); + vx = __riscv_vfmax_vf_f32m8(vx, -vmax_abs_x, n); + + // Since the polynomials are odd/even, we need x^2. + vfloat32m8_t vx2 = __riscv_vfmul_vv_f32m8(vx, vx, n); + + // Evaluate the numerator polynomial p. + vfloat32m8_t vp = __riscv_vfmv_v_f_f32m8(valpha_9, n); + vp = __riscv_vfmacc_vf_f32m8(vp, valpha_11, vx2, n); + vp = __riscv_vfmadd_vv_f32m8(vp, vx2, __riscv_vfmv_v_f_f32m8(valpha_7, n), n); + vp = __riscv_vfmadd_vv_f32m8(vp, vx2, __riscv_vfmv_v_f_f32m8(valpha_5, n), n); + vp = __riscv_vfmadd_vv_f32m8(vp, vx2, __riscv_vfmv_v_f_f32m8(valpha_3, n), n); + vp = __riscv_vfmadd_vv_f32m8(vp, vx2, __riscv_vfmv_v_f_f32m8(valpha_1, n), n); + vp = __riscv_vfmul_vv_f32m8(vp, vx, n); + + // Evaluate the denominator polynomial q. + vfloat32m8_t vq = __riscv_vfmv_v_f_f32m8(vbeta_8, n); + vq = __riscv_vfmacc_vf_f32m8(vq, vbeta_10, vx2, n); + vq = __riscv_vfmadd_vv_f32m8(vq, vx2, __riscv_vfmv_v_f_f32m8(vbeta_6, n), n); + vq = __riscv_vfmadd_vv_f32m8(vq, vx2, __riscv_vfmv_v_f_f32m8(vbeta_4, n), n); + vq = __riscv_vfmadd_vv_f32m8(vq, vx2, __riscv_vfmv_v_f_f32m8(vbeta_2, n), n); + vq = __riscv_vfmadd_vv_f32m8(vq, vx2, __riscv_vfmv_v_f_f32m8(1.0f, n), n); + + // Divide the numerator by the denominator. + vfloat32m8_t verf = __riscv_vfdiv_vv_f32m8(vp, vq, n); + + // Add one to the rational interpolant, and multiply by 0.5 times the + // original input. + vfloat32m8_t vy = __riscv_vfadd_vf_f32m8(verf, 1.0f, n); + vy = __riscv_vfmul_vf_f32m8(vy, 0.5f, n); + vy = __riscv_vfmul_vv_f32m8(vy, vx_orig, n); + + __riscv_vse32_v_f32m8(output, vy, n); + output += n; + + batch -= n; + } while (batch != 0); +} diff --git a/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u1v.c b/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u1v.c new file mode 100644 index 00000000000..1af619bf8a5 --- /dev/null +++ b/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u1v.c @@ -0,0 +1,102 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-vgelu/rvv-rational-12-10.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f32_vgelu_ukernel__rvv_rational_12_10_nr_u1v( + size_t batch, + const float* input, + float* output, + const struct xnn_f32_default_params* unused_params) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(output != NULL); + + // Cap the inputs to this value as `erf(x/sqrt(2))` will always be `+/-1.0f` + // beyond this point. This value is chosen as the first floating point + // number as of which the interpolation returns +/-1.0f. + const float vmax_abs_x = 5.1164608002e+00f; + + // The monomial coefficients of the numerator polynomial (odd). + const float valpha_1 = 7.9788452387e-01f; + const float valpha_3 = 6.6972173750e-02f; + const float valpha_5 = 9.3065137044e-03f; + const float valpha_7 = 3.2973114867e-04f; + const float valpha_9 = 1.2609783880e-05f; + const float valpha_11 = 4.5835321316e-08f; + + // The monomial coefficients of the denominator polynomial (even). + const float vbeta_2 = 2.5060352683e-01f; + const float vbeta_4 = 2.8431978077e-02f; + const float vbeta_6 = 1.8622842617e-03f; + const float vbeta_8 = 7.2267655923e-05f; + const float vbeta_10 = 1.1988805682e-06f; + + batch >>= XNN_LOG2_SIZEOF_FLOAT; + do { + const size_t n = __riscv_vsetvl_e32m1(batch); + + vfloat32m1_t vx_orig = __riscv_vle32_v_f32m1(input, n); + input += n; + + // Clamp the inputs to the interpolation range. + vfloat32m1_t vx = __riscv_vfmin_vf_f32m1(vx_orig, vmax_abs_x, n); + vx = __riscv_vfmax_vf_f32m1(vx, -vmax_abs_x, n); + + // Since the polynomials are odd/even, we need x^2. + vfloat32m1_t vx2 = __riscv_vfmul_vv_f32m1(vx, vx, n); + + // Evaluate the numerator polynomial p. + vfloat32m1_t vp = __riscv_vfmv_v_f_f32m1(valpha_9, n); + vp = __riscv_vfmacc_vf_f32m1(vp, valpha_11, vx2, n); + vp = __riscv_vfmadd_vv_f32m1(vp, vx2, __riscv_vfmv_v_f_f32m1(valpha_7, n), n); + vp = __riscv_vfmadd_vv_f32m1(vp, vx2, __riscv_vfmv_v_f_f32m1(valpha_5, n), n); + vp = __riscv_vfmadd_vv_f32m1(vp, vx2, __riscv_vfmv_v_f_f32m1(valpha_3, n), n); + vp = __riscv_vfmadd_vv_f32m1(vp, vx2, __riscv_vfmv_v_f_f32m1(valpha_1, n), n); + vp = __riscv_vfmul_vv_f32m1(vp, vx, n); + + // Evaluate the denominator polynomial q. + vfloat32m1_t vq = __riscv_vfmv_v_f_f32m1(vbeta_8, n); + vq = __riscv_vfmacc_vf_f32m1(vq, vbeta_10, vx2, n); + vq = __riscv_vfmadd_vv_f32m1(vq, vx2, __riscv_vfmv_v_f_f32m1(vbeta_6, n), n); + vq = __riscv_vfmadd_vv_f32m1(vq, vx2, __riscv_vfmv_v_f_f32m1(vbeta_4, n), n); + vq = __riscv_vfmadd_vv_f32m1(vq, vx2, __riscv_vfmv_v_f_f32m1(vbeta_2, n), n); + vq = __riscv_vfmadd_vv_f32m1(vq, vx2, __riscv_vfmv_v_f_f32m1(1.0f, n), n); + + // Divide the numerator by the denominator. + // Newton-Raphson iteration for reciprocal. + vfloat32m1_t vrq = __riscv_vfrec7_v_f32m1(vq, n); + for (size_t iter = 0; iter < 2; iter++) { + vfloat32m1_t verr = __riscv_vfnmsac_vv_f32m1( + __riscv_vfmv_v_f_f32m1(2.0f, n), vrq, vq, n); + vrq = __riscv_vfmul_vv_f32m1(vrq, verr, n); + } + vfloat32m1_t verf = __riscv_vfmul_vv_f32m1(vp, vrq, n); + + // Add one to the rational interpolant, and multiply by 0.5 times the + // original input. + vfloat32m1_t vy = __riscv_vfadd_vf_f32m1(verf, 1.0f, n); + vy = __riscv_vfmul_vf_f32m1(vy, 0.5f, n); + vy = __riscv_vfmul_vv_f32m1(vy, vx_orig, n); + + __riscv_vse32_v_f32m1(output, vy, n); + output += n; + + batch -= n; + } while (batch != 0); +} diff --git a/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u2v.c b/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u2v.c new file mode 100644 index 00000000000..a29c5a312d3 --- /dev/null +++ b/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u2v.c @@ -0,0 +1,102 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-vgelu/rvv-rational-12-10.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f32_vgelu_ukernel__rvv_rational_12_10_nr_u2v( + size_t batch, + const float* input, + float* output, + const struct xnn_f32_default_params* unused_params) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(output != NULL); + + // Cap the inputs to this value as `erf(x/sqrt(2))` will always be `+/-1.0f` + // beyond this point. This value is chosen as the first floating point + // number as of which the interpolation returns +/-1.0f. + const float vmax_abs_x = 5.1164608002e+00f; + + // The monomial coefficients of the numerator polynomial (odd). + const float valpha_1 = 7.9788452387e-01f; + const float valpha_3 = 6.6972173750e-02f; + const float valpha_5 = 9.3065137044e-03f; + const float valpha_7 = 3.2973114867e-04f; + const float valpha_9 = 1.2609783880e-05f; + const float valpha_11 = 4.5835321316e-08f; + + // The monomial coefficients of the denominator polynomial (even). + const float vbeta_2 = 2.5060352683e-01f; + const float vbeta_4 = 2.8431978077e-02f; + const float vbeta_6 = 1.8622842617e-03f; + const float vbeta_8 = 7.2267655923e-05f; + const float vbeta_10 = 1.1988805682e-06f; + + batch >>= XNN_LOG2_SIZEOF_FLOAT; + do { + const size_t n = __riscv_vsetvl_e32m2(batch); + + vfloat32m2_t vx_orig = __riscv_vle32_v_f32m2(input, n); + input += n; + + // Clamp the inputs to the interpolation range. + vfloat32m2_t vx = __riscv_vfmin_vf_f32m2(vx_orig, vmax_abs_x, n); + vx = __riscv_vfmax_vf_f32m2(vx, -vmax_abs_x, n); + + // Since the polynomials are odd/even, we need x^2. + vfloat32m2_t vx2 = __riscv_vfmul_vv_f32m2(vx, vx, n); + + // Evaluate the numerator polynomial p. + vfloat32m2_t vp = __riscv_vfmv_v_f_f32m2(valpha_9, n); + vp = __riscv_vfmacc_vf_f32m2(vp, valpha_11, vx2, n); + vp = __riscv_vfmadd_vv_f32m2(vp, vx2, __riscv_vfmv_v_f_f32m2(valpha_7, n), n); + vp = __riscv_vfmadd_vv_f32m2(vp, vx2, __riscv_vfmv_v_f_f32m2(valpha_5, n), n); + vp = __riscv_vfmadd_vv_f32m2(vp, vx2, __riscv_vfmv_v_f_f32m2(valpha_3, n), n); + vp = __riscv_vfmadd_vv_f32m2(vp, vx2, __riscv_vfmv_v_f_f32m2(valpha_1, n), n); + vp = __riscv_vfmul_vv_f32m2(vp, vx, n); + + // Evaluate the denominator polynomial q. + vfloat32m2_t vq = __riscv_vfmv_v_f_f32m2(vbeta_8, n); + vq = __riscv_vfmacc_vf_f32m2(vq, vbeta_10, vx2, n); + vq = __riscv_vfmadd_vv_f32m2(vq, vx2, __riscv_vfmv_v_f_f32m2(vbeta_6, n), n); + vq = __riscv_vfmadd_vv_f32m2(vq, vx2, __riscv_vfmv_v_f_f32m2(vbeta_4, n), n); + vq = __riscv_vfmadd_vv_f32m2(vq, vx2, __riscv_vfmv_v_f_f32m2(vbeta_2, n), n); + vq = __riscv_vfmadd_vv_f32m2(vq, vx2, __riscv_vfmv_v_f_f32m2(1.0f, n), n); + + // Divide the numerator by the denominator. + // Newton-Raphson iteration for reciprocal. + vfloat32m2_t vrq = __riscv_vfrec7_v_f32m2(vq, n); + for (size_t iter = 0; iter < 2; iter++) { + vfloat32m2_t verr = __riscv_vfnmsac_vv_f32m2( + __riscv_vfmv_v_f_f32m2(2.0f, n), vrq, vq, n); + vrq = __riscv_vfmul_vv_f32m2(vrq, verr, n); + } + vfloat32m2_t verf = __riscv_vfmul_vv_f32m2(vp, vrq, n); + + // Add one to the rational interpolant, and multiply by 0.5 times the + // original input. + vfloat32m2_t vy = __riscv_vfadd_vf_f32m2(verf, 1.0f, n); + vy = __riscv_vfmul_vf_f32m2(vy, 0.5f, n); + vy = __riscv_vfmul_vv_f32m2(vy, vx_orig, n); + + __riscv_vse32_v_f32m2(output, vy, n); + output += n; + + batch -= n; + } while (batch != 0); +} diff --git a/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u4v.c b/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u4v.c new file mode 100644 index 00000000000..70c3f8d0e53 --- /dev/null +++ b/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u4v.c @@ -0,0 +1,102 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-vgelu/rvv-rational-12-10.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f32_vgelu_ukernel__rvv_rational_12_10_nr_u4v( + size_t batch, + const float* input, + float* output, + const struct xnn_f32_default_params* unused_params) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(output != NULL); + + // Cap the inputs to this value as `erf(x/sqrt(2))` will always be `+/-1.0f` + // beyond this point. This value is chosen as the first floating point + // number as of which the interpolation returns +/-1.0f. + const float vmax_abs_x = 5.1164608002e+00f; + + // The monomial coefficients of the numerator polynomial (odd). + const float valpha_1 = 7.9788452387e-01f; + const float valpha_3 = 6.6972173750e-02f; + const float valpha_5 = 9.3065137044e-03f; + const float valpha_7 = 3.2973114867e-04f; + const float valpha_9 = 1.2609783880e-05f; + const float valpha_11 = 4.5835321316e-08f; + + // The monomial coefficients of the denominator polynomial (even). + const float vbeta_2 = 2.5060352683e-01f; + const float vbeta_4 = 2.8431978077e-02f; + const float vbeta_6 = 1.8622842617e-03f; + const float vbeta_8 = 7.2267655923e-05f; + const float vbeta_10 = 1.1988805682e-06f; + + batch >>= XNN_LOG2_SIZEOF_FLOAT; + do { + const size_t n = __riscv_vsetvl_e32m4(batch); + + vfloat32m4_t vx_orig = __riscv_vle32_v_f32m4(input, n); + input += n; + + // Clamp the inputs to the interpolation range. + vfloat32m4_t vx = __riscv_vfmin_vf_f32m4(vx_orig, vmax_abs_x, n); + vx = __riscv_vfmax_vf_f32m4(vx, -vmax_abs_x, n); + + // Since the polynomials are odd/even, we need x^2. + vfloat32m4_t vx2 = __riscv_vfmul_vv_f32m4(vx, vx, n); + + // Evaluate the numerator polynomial p. + vfloat32m4_t vp = __riscv_vfmv_v_f_f32m4(valpha_9, n); + vp = __riscv_vfmacc_vf_f32m4(vp, valpha_11, vx2, n); + vp = __riscv_vfmadd_vv_f32m4(vp, vx2, __riscv_vfmv_v_f_f32m4(valpha_7, n), n); + vp = __riscv_vfmadd_vv_f32m4(vp, vx2, __riscv_vfmv_v_f_f32m4(valpha_5, n), n); + vp = __riscv_vfmadd_vv_f32m4(vp, vx2, __riscv_vfmv_v_f_f32m4(valpha_3, n), n); + vp = __riscv_vfmadd_vv_f32m4(vp, vx2, __riscv_vfmv_v_f_f32m4(valpha_1, n), n); + vp = __riscv_vfmul_vv_f32m4(vp, vx, n); + + // Evaluate the denominator polynomial q. + vfloat32m4_t vq = __riscv_vfmv_v_f_f32m4(vbeta_8, n); + vq = __riscv_vfmacc_vf_f32m4(vq, vbeta_10, vx2, n); + vq = __riscv_vfmadd_vv_f32m4(vq, vx2, __riscv_vfmv_v_f_f32m4(vbeta_6, n), n); + vq = __riscv_vfmadd_vv_f32m4(vq, vx2, __riscv_vfmv_v_f_f32m4(vbeta_4, n), n); + vq = __riscv_vfmadd_vv_f32m4(vq, vx2, __riscv_vfmv_v_f_f32m4(vbeta_2, n), n); + vq = __riscv_vfmadd_vv_f32m4(vq, vx2, __riscv_vfmv_v_f_f32m4(1.0f, n), n); + + // Divide the numerator by the denominator. + // Newton-Raphson iteration for reciprocal. + vfloat32m4_t vrq = __riscv_vfrec7_v_f32m4(vq, n); + for (size_t iter = 0; iter < 2; iter++) { + vfloat32m4_t verr = __riscv_vfnmsac_vv_f32m4( + __riscv_vfmv_v_f_f32m4(2.0f, n), vrq, vq, n); + vrq = __riscv_vfmul_vv_f32m4(vrq, verr, n); + } + vfloat32m4_t verf = __riscv_vfmul_vv_f32m4(vp, vrq, n); + + // Add one to the rational interpolant, and multiply by 0.5 times the + // original input. + vfloat32m4_t vy = __riscv_vfadd_vf_f32m4(verf, 1.0f, n); + vy = __riscv_vfmul_vf_f32m4(vy, 0.5f, n); + vy = __riscv_vfmul_vv_f32m4(vy, vx_orig, n); + + __riscv_vse32_v_f32m4(output, vy, n); + output += n; + + batch -= n; + } while (batch != 0); +} diff --git a/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u8v.c b/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u8v.c new file mode 100644 index 00000000000..b8d62ab0cb4 --- /dev/null +++ b/src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u8v.c @@ -0,0 +1,102 @@ +// clang-format off +// Auto-generated file. Do not edit! +// Template: src/f32-vgelu/rvv-rational-12-10.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f32_vgelu_ukernel__rvv_rational_12_10_nr_u8v( + size_t batch, + const float* input, + float* output, + const struct xnn_f32_default_params* unused_params) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(output != NULL); + + // Cap the inputs to this value as `erf(x/sqrt(2))` will always be `+/-1.0f` + // beyond this point. This value is chosen as the first floating point + // number as of which the interpolation returns +/-1.0f. + const float vmax_abs_x = 5.1164608002e+00f; + + // The monomial coefficients of the numerator polynomial (odd). + const float valpha_1 = 7.9788452387e-01f; + const float valpha_3 = 6.6972173750e-02f; + const float valpha_5 = 9.3065137044e-03f; + const float valpha_7 = 3.2973114867e-04f; + const float valpha_9 = 1.2609783880e-05f; + const float valpha_11 = 4.5835321316e-08f; + + // The monomial coefficients of the denominator polynomial (even). + const float vbeta_2 = 2.5060352683e-01f; + const float vbeta_4 = 2.8431978077e-02f; + const float vbeta_6 = 1.8622842617e-03f; + const float vbeta_8 = 7.2267655923e-05f; + const float vbeta_10 = 1.1988805682e-06f; + + batch >>= XNN_LOG2_SIZEOF_FLOAT; + do { + const size_t n = __riscv_vsetvl_e32m8(batch); + + vfloat32m8_t vx_orig = __riscv_vle32_v_f32m8(input, n); + input += n; + + // Clamp the inputs to the interpolation range. + vfloat32m8_t vx = __riscv_vfmin_vf_f32m8(vx_orig, vmax_abs_x, n); + vx = __riscv_vfmax_vf_f32m8(vx, -vmax_abs_x, n); + + // Since the polynomials are odd/even, we need x^2. + vfloat32m8_t vx2 = __riscv_vfmul_vv_f32m8(vx, vx, n); + + // Evaluate the numerator polynomial p. + vfloat32m8_t vp = __riscv_vfmv_v_f_f32m8(valpha_9, n); + vp = __riscv_vfmacc_vf_f32m8(vp, valpha_11, vx2, n); + vp = __riscv_vfmadd_vv_f32m8(vp, vx2, __riscv_vfmv_v_f_f32m8(valpha_7, n), n); + vp = __riscv_vfmadd_vv_f32m8(vp, vx2, __riscv_vfmv_v_f_f32m8(valpha_5, n), n); + vp = __riscv_vfmadd_vv_f32m8(vp, vx2, __riscv_vfmv_v_f_f32m8(valpha_3, n), n); + vp = __riscv_vfmadd_vv_f32m8(vp, vx2, __riscv_vfmv_v_f_f32m8(valpha_1, n), n); + vp = __riscv_vfmul_vv_f32m8(vp, vx, n); + + // Evaluate the denominator polynomial q. + vfloat32m8_t vq = __riscv_vfmv_v_f_f32m8(vbeta_8, n); + vq = __riscv_vfmacc_vf_f32m8(vq, vbeta_10, vx2, n); + vq = __riscv_vfmadd_vv_f32m8(vq, vx2, __riscv_vfmv_v_f_f32m8(vbeta_6, n), n); + vq = __riscv_vfmadd_vv_f32m8(vq, vx2, __riscv_vfmv_v_f_f32m8(vbeta_4, n), n); + vq = __riscv_vfmadd_vv_f32m8(vq, vx2, __riscv_vfmv_v_f_f32m8(vbeta_2, n), n); + vq = __riscv_vfmadd_vv_f32m8(vq, vx2, __riscv_vfmv_v_f_f32m8(1.0f, n), n); + + // Divide the numerator by the denominator. + // Newton-Raphson iteration for reciprocal. + vfloat32m8_t vrq = __riscv_vfrec7_v_f32m8(vq, n); + for (size_t iter = 0; iter < 2; iter++) { + vfloat32m8_t verr = __riscv_vfnmsac_vv_f32m8( + __riscv_vfmv_v_f_f32m8(2.0f, n), vrq, vq, n); + vrq = __riscv_vfmul_vv_f32m8(vrq, verr, n); + } + vfloat32m8_t verf = __riscv_vfmul_vv_f32m8(vp, vrq, n); + + // Add one to the rational interpolant, and multiply by 0.5 times the + // original input. + vfloat32m8_t vy = __riscv_vfadd_vf_f32m8(verf, 1.0f, n); + vy = __riscv_vfmul_vf_f32m8(vy, 0.5f, n); + vy = __riscv_vfmul_vv_f32m8(vy, vx_orig, n); + + __riscv_vse32_v_f32m8(output, vy, n); + output += n; + + batch -= n; + } while (batch != 0); +} diff --git a/src/f32-vgelu/rvv-rational-12-10.c.in b/src/f32-vgelu/rvv-rational-12-10.c.in new file mode 100644 index 00000000000..d38c4565697 --- /dev/null +++ b/src/f32-vgelu/rvv-rational-12-10.c.in @@ -0,0 +1,105 @@ +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +$assert LMUL in [1, 2, 4, 8] +$assert DIV in ("DIV", "NR") +#include + +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/vunary.h" + + +void xnn_f32_vgelu_ukernel__rvv_rational_12_10_${DIV.lower()}_u${LMUL}v( + size_t batch, + const float* input, + float* output, + const struct xnn_f32_default_params* unused_params) +{ + assert(batch != 0); + assert(batch % sizeof(float) == 0); + assert(input != NULL); + assert(output != NULL); + + // Cap the inputs to this value as `erf(x/sqrt(2))` will always be `+/-1.0f` + // beyond this point. This value is chosen as the first floating point + // number as of which the interpolation returns +/-1.0f. + $if DIV == "NR": + const float vmax_abs_x = 5.1164608002e+00f; + $else: + const float vmax_abs_x = 5.1638283730e+00f; + + // The monomial coefficients of the numerator polynomial (odd). + const float valpha_1 = 7.9788452387e-01f; + const float valpha_3 = 6.6972173750e-02f; + const float valpha_5 = 9.3065137044e-03f; + const float valpha_7 = 3.2973114867e-04f; + const float valpha_9 = 1.2609783880e-05f; + const float valpha_11 = 4.5835321316e-08f; + + // The monomial coefficients of the denominator polynomial (even). + const float vbeta_2 = 2.5060352683e-01f; + const float vbeta_4 = 2.8431978077e-02f; + const float vbeta_6 = 1.8622842617e-03f; + const float vbeta_8 = 7.2267655923e-05f; + const float vbeta_10 = 1.1988805682e-06f; + + batch >>= XNN_LOG2_SIZEOF_FLOAT; + do { + const size_t n = __riscv_vsetvl_e32m${LMUL}(batch); + + vfloat32m${LMUL}_t vx_orig = __riscv_vle32_v_f32m${LMUL}(input, n); + input += n; + + // Clamp the inputs to the interpolation range. + vfloat32m${LMUL}_t vx = __riscv_vfmin_vf_f32m${LMUL}(vx_orig, vmax_abs_x, n); + vx = __riscv_vfmax_vf_f32m${LMUL}(vx, -vmax_abs_x, n); + + // Since the polynomials are odd/even, we need x^2. + vfloat32m${LMUL}_t vx2 = __riscv_vfmul_vv_f32m${LMUL}(vx, vx, n); + + // Evaluate the numerator polynomial p. + vfloat32m${LMUL}_t vp = __riscv_vfmv_v_f_f32m${LMUL}(valpha_9, n); + vp = __riscv_vfmacc_vf_f32m${LMUL}(vp, valpha_11, vx2, n); + vp = __riscv_vfmadd_vv_f32m${LMUL}(vp, vx2, __riscv_vfmv_v_f_f32m${LMUL}(valpha_7, n), n); + vp = __riscv_vfmadd_vv_f32m${LMUL}(vp, vx2, __riscv_vfmv_v_f_f32m${LMUL}(valpha_5, n), n); + vp = __riscv_vfmadd_vv_f32m${LMUL}(vp, vx2, __riscv_vfmv_v_f_f32m${LMUL}(valpha_3, n), n); + vp = __riscv_vfmadd_vv_f32m${LMUL}(vp, vx2, __riscv_vfmv_v_f_f32m${LMUL}(valpha_1, n), n); + vp = __riscv_vfmul_vv_f32m${LMUL}(vp, vx, n); + + // Evaluate the denominator polynomial q. + vfloat32m${LMUL}_t vq = __riscv_vfmv_v_f_f32m${LMUL}(vbeta_8, n); + vq = __riscv_vfmacc_vf_f32m${LMUL}(vq, vbeta_10, vx2, n); + vq = __riscv_vfmadd_vv_f32m${LMUL}(vq, vx2, __riscv_vfmv_v_f_f32m${LMUL}(vbeta_6, n), n); + vq = __riscv_vfmadd_vv_f32m${LMUL}(vq, vx2, __riscv_vfmv_v_f_f32m${LMUL}(vbeta_4, n), n); + vq = __riscv_vfmadd_vv_f32m${LMUL}(vq, vx2, __riscv_vfmv_v_f_f32m${LMUL}(vbeta_2, n), n); + vq = __riscv_vfmadd_vv_f32m${LMUL}(vq, vx2, __riscv_vfmv_v_f_f32m${LMUL}(1.0f, n), n); + + // Divide the numerator by the denominator. + $if DIV == "DIV": + vfloat32m${LMUL}_t verf = __riscv_vfdiv_vv_f32m${LMUL}(vp, vq, n); + $else: + // Newton-Raphson iteration for reciprocal. + vfloat32m${LMUL}_t vrq = __riscv_vfrec7_v_f32m${LMUL}(vq, n); + for (size_t iter = 0; iter < 2; iter++) { + vfloat32m${LMUL}_t verr = __riscv_vfnmsac_vv_f32m${LMUL}( + __riscv_vfmv_v_f_f32m${LMUL}(2.0f, n), vrq, vq, n); + vrq = __riscv_vfmul_vv_f32m${LMUL}(vrq, verr, n); + } + vfloat32m${LMUL}_t verf = __riscv_vfmul_vv_f32m${LMUL}(vp, vrq, n); + + // Add one to the rational interpolant, and multiply by 0.5 times the + // original input. + vfloat32m${LMUL}_t vy = __riscv_vfadd_vf_f32m${LMUL}(verf, 1.0f, n); + vy = __riscv_vfmul_vf_f32m${LMUL}(vy, 0.5f, n); + vy = __riscv_vfmul_vv_f32m${LMUL}(vy, vx_orig, n); + + __riscv_vse32_v_f32m${LMUL}(output, vy, n); + output += n; + + batch -= n; + } while (batch != 0); +} From ae231328e5a16ff989ae8dbde7d1f2e48a372dd5 Mon Sep 17 00:00:00 2001 From: velonica0 Date: Wed, 15 Apr 2026 09:32:22 +0800 Subject: [PATCH 2/3] Alphabetize RVV elementwise entries in cmake/bzl lists Move f32-velu, f32-vgelu, f32-vapproxgelu entries from end of PROD/NON_PROD lists to alphabetical positions to avoid merge conflicts with other RVV PRs. --- cmake/gen/rvv_microkernels.cmake | 40 ++++++++++++++++---------------- gen/rvv_microkernels.bzl | 40 ++++++++++++++++---------------- 2 files changed, 40 insertions(+), 40 deletions(-) diff --git a/cmake/gen/rvv_microkernels.cmake b/cmake/gen/rvv_microkernels.cmake index f33a3a3a92b..55dda525b9d 100644 --- a/cmake/gen/rvv_microkernels.cmake +++ b/cmake/gen/rvv_microkernels.cmake @@ -43,6 +43,7 @@ SET(PROD_RVV_MICROKERNEL_SRCS src/f32-spmm/gen/f32-spmm-4vx4-minmax-rvv.c src/f32-spmm/gen/f32-spmm-8vx1-minmax-rvv.c src/f32-spmm/gen/f32-spmm-8vx2-minmax-rvv.c + src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u4v.c src/f32-vbinary/gen/f32-vadd-rvv-u8v.c src/f32-vbinary/gen/f32-vaddc-rvv-u8v.c src/f32-vbinary/gen/f32-vdiv-rvv-u8v.c @@ -69,6 +70,8 @@ SET(PROD_RVV_MICROKERNEL_SRCS src/f32-vcopysign/gen/f32-vrcopysignc-rvv-u8v.c src/f32-vcos/gen/f32-vcos-rvv-rational-5-4-div-u8v.c src/f32-vexp/gen/f32-vexp-rvv-rational-3-2-div-u8v.c + src/f32-velu/gen/f32-velu-rvv-rr2-p6-u4v.c + src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u4v.c src/f32-vhswish/gen/f32-vhswish-rvv-u4v.c src/f32-vlog/gen/f32-vlog-rvv-rational-3-3-div-u8v.c src/f32-vlrelu/gen/f32-vlrelu-rvv-u4v.c @@ -143,9 +146,6 @@ SET(PROD_RVV_MICROKERNEL_SRCS src/x32-transposec/gen/x32-transposec-8x8-rvv.c src/x32-transposec/gen/x32-transposec-16x8-rvv.c src/x32-transposec/gen/x32-transposec-32x8-rvv.c - src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u4v.c - src/f32-velu/gen/f32-velu-rvv-rr2-p6-u4v.c - src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u4v.c) SET(NON_PROD_RVV_MICROKERNEL_SRCS src/f32-conv-hwc2chw/f32-conv-hwc2chw-3x3s2p1c3x2v-rvv-1x1.c @@ -201,6 +201,13 @@ SET(NON_PROD_RVV_MICROKERNEL_SRCS src/f32-spmm/gen/f32-spmm-4vx1-minmax-rvv.c src/f32-spmm/gen/f32-spmm-4vx2-minmax-rvv.c src/f32-spmm/gen/f32-spmm-8vx4-minmax-rvv.c + src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u1v.c + src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u2v.c + src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u8v.c + src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u1v.c + src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u2v.c + src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u4v.c + src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u8v.c src/f32-vbinary/gen/f32-vadd-rvv-u4v.c src/f32-vbinary/gen/f32-vaddc-rvv-u4v.c src/f32-vbinary/gen/f32-vdiv-rvv-u4v.c @@ -234,6 +241,16 @@ SET(NON_PROD_RVV_MICROKERNEL_SRCS src/f32-vexp/gen/f32-vexp-rvv-rational-3-2-div-u1v.c src/f32-vexp/gen/f32-vexp-rvv-rational-3-2-div-u2v.c src/f32-vexp/gen/f32-vexp-rvv-rational-3-2-div-u4v.c + src/f32-velu/gen/f32-velu-rvv-rr2-p6-u1v.c + src/f32-velu/gen/f32-velu-rvv-rr2-p6-u2v.c + src/f32-velu/gen/f32-velu-rvv-rr2-p6-u8v.c + src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u1v.c + src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u2v.c + src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u8v.c + src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u1v.c + src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u2v.c + src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u4v.c + src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u8v.c src/f32-vhswish/gen/f32-vhswish-rvv-u1v.c src/f32-vhswish/gen/f32-vhswish-rvv-u2v.c src/f32-vhswish/gen/f32-vhswish-rvv-u8v.c @@ -337,22 +354,5 @@ SET(NON_PROD_RVV_MICROKERNEL_SRCS src/x32-packw/gen/x32-packw-x8v-gemm-goi-rvv-u2.c src/x32-packw/gen/x32-packw-x8v-gemm-goi-rvv-u4.c src/x32-packw/gen/x32-packw-x8v-gemm-goi-rvv-u8.c - src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u1v.c - src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u2v.c - src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u8v.c - src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u1v.c - src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u2v.c - src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u4v.c - src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u8v.c - src/f32-velu/gen/f32-velu-rvv-rr2-p6-u1v.c - src/f32-velu/gen/f32-velu-rvv-rr2-p6-u2v.c - src/f32-velu/gen/f32-velu-rvv-rr2-p6-u8v.c - src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u1v.c - src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u2v.c - src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u8v.c - src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u1v.c - src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u2v.c - src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u4v.c - src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u8v.c) SET(ALL_RVV_MICROKERNEL_SRCS ${PROD_RVV_MICROKERNEL_SRCS} + ${NON_PROD_RVV_MICROKERNEL_SRCS}) diff --git a/gen/rvv_microkernels.bzl b/gen/rvv_microkernels.bzl index e86c895b66b..cfe6c5daa14 100644 --- a/gen/rvv_microkernels.bzl +++ b/gen/rvv_microkernels.bzl @@ -39,6 +39,7 @@ PROD_RVV_MICROKERNEL_SRCS = [ "src/f32-spmm/gen/f32-spmm-4vx4-minmax-rvv.c", "src/f32-spmm/gen/f32-spmm-8vx1-minmax-rvv.c", "src/f32-spmm/gen/f32-spmm-8vx2-minmax-rvv.c", + "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u4v.c", "src/f32-vbinary/gen/f32-vadd-rvv-u8v.c", "src/f32-vbinary/gen/f32-vaddc-rvv-u8v.c", "src/f32-vbinary/gen/f32-vdiv-rvv-u8v.c", @@ -65,6 +66,8 @@ PROD_RVV_MICROKERNEL_SRCS = [ "src/f32-vcopysign/gen/f32-vrcopysignc-rvv-u8v.c", "src/f32-vcos/gen/f32-vcos-rvv-rational-5-4-div-u8v.c", "src/f32-vexp/gen/f32-vexp-rvv-rational-3-2-div-u8v.c", + "src/f32-velu/gen/f32-velu-rvv-rr2-p6-u4v.c", + "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u4v.c", "src/f32-vhswish/gen/f32-vhswish-rvv-u4v.c", "src/f32-vlog/gen/f32-vlog-rvv-rational-3-3-div-u8v.c", "src/f32-vlrelu/gen/f32-vlrelu-rvv-u4v.c", @@ -139,9 +142,6 @@ PROD_RVV_MICROKERNEL_SRCS = [ "src/x32-transposec/gen/x32-transposec-8x8-rvv.c", "src/x32-transposec/gen/x32-transposec-16x8-rvv.c", "src/x32-transposec/gen/x32-transposec-32x8-rvv.c", - "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u4v.c", - "src/f32-velu/gen/f32-velu-rvv-rr2-p6-u4v.c", - "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u4v.c", ] NON_PROD_RVV_MICROKERNEL_SRCS = [ @@ -198,6 +198,13 @@ NON_PROD_RVV_MICROKERNEL_SRCS = [ "src/f32-spmm/gen/f32-spmm-4vx1-minmax-rvv.c", "src/f32-spmm/gen/f32-spmm-4vx2-minmax-rvv.c", "src/f32-spmm/gen/f32-spmm-8vx4-minmax-rvv.c", + "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u1v.c", + "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u2v.c", + "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u8v.c", + "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u1v.c", + "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u2v.c", + "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u4v.c", + "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u8v.c", "src/f32-vbinary/gen/f32-vadd-rvv-u4v.c", "src/f32-vbinary/gen/f32-vaddc-rvv-u4v.c", "src/f32-vbinary/gen/f32-vdiv-rvv-u4v.c", @@ -231,6 +238,16 @@ NON_PROD_RVV_MICROKERNEL_SRCS = [ "src/f32-vexp/gen/f32-vexp-rvv-rational-3-2-div-u1v.c", "src/f32-vexp/gen/f32-vexp-rvv-rational-3-2-div-u2v.c", "src/f32-vexp/gen/f32-vexp-rvv-rational-3-2-div-u4v.c", + "src/f32-velu/gen/f32-velu-rvv-rr2-p6-u1v.c", + "src/f32-velu/gen/f32-velu-rvv-rr2-p6-u2v.c", + "src/f32-velu/gen/f32-velu-rvv-rr2-p6-u8v.c", + "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u1v.c", + "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u2v.c", + "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u8v.c", + "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u1v.c", + "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u2v.c", + "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u4v.c", + "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u8v.c", "src/f32-vhswish/gen/f32-vhswish-rvv-u1v.c", "src/f32-vhswish/gen/f32-vhswish-rvv-u2v.c", "src/f32-vhswish/gen/f32-vhswish-rvv-u8v.c", @@ -334,23 +351,6 @@ NON_PROD_RVV_MICROKERNEL_SRCS = [ "src/x32-packw/gen/x32-packw-x8v-gemm-goi-rvv-u2.c", "src/x32-packw/gen/x32-packw-x8v-gemm-goi-rvv-u4.c", "src/x32-packw/gen/x32-packw-x8v-gemm-goi-rvv-u8.c", - "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u1v.c", - "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u2v.c", - "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-div-u8v.c", - "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u1v.c", - "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u2v.c", - "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u4v.c", - "src/f32-vapproxgelu/gen/f32-vapproxgelu-rvv-rational-12-10-nr-u8v.c", - "src/f32-velu/gen/f32-velu-rvv-rr2-p6-u1v.c", - "src/f32-velu/gen/f32-velu-rvv-rr2-p6-u2v.c", - "src/f32-velu/gen/f32-velu-rvv-rr2-p6-u8v.c", - "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u1v.c", - "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u2v.c", - "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-div-u8v.c", - "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u1v.c", - "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u2v.c", - "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u4v.c", - "src/f32-vgelu/gen/f32-vgelu-rvv-rational-12-10-nr-u8v.c", ] ALL_RVV_MICROKERNEL_SRCS = PROD_RVV_MICROKERNEL_SRCS + NON_PROD_RVV_MICROKERNEL_SRCS From 0b6f61afd92adefbee1d0f8dffa36cb67558be13 Mon Sep 17 00:00:00 2001 From: velonica0 Date: Mon, 20 Apr 2026 10:34:19 +0800 Subject: [PATCH 3/3] fix cmake bug --- cmake/gen/rvv_microkernels.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/gen/rvv_microkernels.cmake b/cmake/gen/rvv_microkernels.cmake index 55dda525b9d..2491a629887 100644 --- a/cmake/gen/rvv_microkernels.cmake +++ b/cmake/gen/rvv_microkernels.cmake @@ -145,7 +145,7 @@ SET(PROD_RVV_MICROKERNEL_SRCS src/x32-transposec/gen/x32-transposec-4x4-rvv.c src/x32-transposec/gen/x32-transposec-8x8-rvv.c src/x32-transposec/gen/x32-transposec-16x8-rvv.c - src/x32-transposec/gen/x32-transposec-32x8-rvv.c + src/x32-transposec/gen/x32-transposec-32x8-rvv.c) SET(NON_PROD_RVV_MICROKERNEL_SRCS src/f32-conv-hwc2chw/f32-conv-hwc2chw-3x3s2p1c3x2v-rvv-1x1.c @@ -353,6 +353,6 @@ SET(NON_PROD_RVV_MICROKERNEL_SRCS src/x32-packw/gen/x32-packw-x4v-gemm-goi-rvv-u4.c src/x32-packw/gen/x32-packw-x8v-gemm-goi-rvv-u2.c src/x32-packw/gen/x32-packw-x8v-gemm-goi-rvv-u4.c - src/x32-packw/gen/x32-packw-x8v-gemm-goi-rvv-u8.c + src/x32-packw/gen/x32-packw-x8v-gemm-goi-rvv-u8.c) SET(ALL_RVV_MICROKERNEL_SRCS ${PROD_RVV_MICROKERNEL_SRCS} + ${NON_PROD_RVV_MICROKERNEL_SRCS})