Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file added .codex
Empty file.
22 changes: 22 additions & 0 deletions bench/f32-bgemm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1535,6 +1535,28 @@ BENCHMARK_BGEMM(f32_ppmm_4x2_twopass__scalar)
BENCHMARK_BGEMM(f32_ppmm_4x4_twopass__scalar)
BENCHMARK_BGEMM(f32_ppmm_3x3_twopass__scalar)

#if XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV
static void f32_ppmm_4x4v_unipass__rvv(benchmark::State& state) {
f32_ppmm1p(state, xnn_x32_packw_gemm_goi_ukernel_x4__scalar_float_u4,
xnn_x32_packx_ukernel_4x__scalar,
xnn_f32_ppmm_minmax_ukernel_4x4v__rvv,
xnn_init_f32_minmax_scalar_params,
/*mr=*/4, /*nr=*/4 * xnn_init_hardware_config()->vlenb / sizeof(float),
/*kr=*/1, /*sr=*/1);
}
static void f32_ppmm_4x4v_twopass__rvv(benchmark::State& state) {
f32_ppmm2p(state, xnn_x32_packw_gemm_goi_ukernel_x4__scalar_float_u4,
xnn_x32_packx_ukernel_4x__scalar,
xnn_f32_ppmm_minmax_ukernel_4x4v__rvv,
xnn_init_f32_minmax_scalar_params,
/*mr=*/4, /*nr=*/4 * xnn_init_hardware_config()->vlenb / sizeof(float),
/*kr=*/1, /*sr=*/1);
}

BENCHMARK_BGEMM(f32_ppmm_4x4v_unipass__rvv)
BENCHMARK_BGEMM(f32_ppmm_4x4v_twopass__rvv)
#endif // XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV

#ifdef BENCHMARK_RUY
BENCHMARK_BGEMM(ruy_st)
#endif // BENCHMARK_RUY
Expand Down
19 changes: 18 additions & 1 deletion cmake/gen/rvv_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ SET(PROD_RVV_MICROKERNEL_SRCS
src/f32-igemm/gen/f32-igemm-1x4v-minmax-rvv.c
src/f32-igemm/gen/f32-igemm-7x4v-minmax-rvv.c
src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u2v.c
src/f32-ppmm/gen/f32-ppmm-4x4v-minmax-rvv.c
src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u2v.c
src/f32-qu8-vcvt/gen/f32-qu8-vcvt-rvv-u2v.c
src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-rvv-rr2-p6-u4v.c
Expand Down Expand Up @@ -168,6 +169,14 @@ SET(NON_PROD_RVV_MICROKERNEL_SRCS
src/f32-gemm/gen/f32-gemm-7x4v-rvv.c
src/f32-igemm/gen/f32-igemm-1x4v-rvv.c
src/f32-igemm/gen/f32-igemm-7x4v-rvv.c
src/f32-ppmm/gen/f32-ppmm-1x1v-minmax-rvv.c
src/f32-ppmm/gen/f32-ppmm-1x2v-minmax-rvv.c
src/f32-ppmm/gen/f32-ppmm-1x4v-minmax-rvv.c
src/f32-ppmm/gen/f32-ppmm-2x1v-minmax-rvv.c
src/f32-ppmm/gen/f32-ppmm-2x2v-minmax-rvv.c
src/f32-ppmm/gen/f32-ppmm-2x4v-minmax-rvv.c
src/f32-ppmm/gen/f32-ppmm-4x1v-minmax-rvv.c
src/f32-ppmm/gen/f32-ppmm-4x2v-minmax-rvv.c
src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u1v.c
src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u4v.c
src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u8v.c
Expand Down Expand Up @@ -356,6 +365,14 @@ SET(NON_PROD_RVV_MICROKERNEL_SRCS
src/x32-packw/gen/x32-packw-x8v-gemm-goi-rvv-u8.c
src/x32-transposec/gen/x32-transposec-8xv1-rvv.c
src/x32-transposec/gen/x32-transposec-8xv2-rvv.c
src/x32-transposec/gen/x32-transposec-8xv4-rvv.c)
src/x32-transposec/gen/x32-transposec-8xv4-rvv.c
src/f32-ppmm/gen/f32-ppmm-1x1v-minmax-rvv.c
src/f32-ppmm/gen/f32-ppmm-1x2v-minmax-rvv.c
src/f32-ppmm/gen/f32-ppmm-1x4v-minmax-rvv.c
src/f32-ppmm/gen/f32-ppmm-2x1v-minmax-rvv.c
src/f32-ppmm/gen/f32-ppmm-2x2v-minmax-rvv.c
src/f32-ppmm/gen/f32-ppmm-2x4v-minmax-rvv.c
src/f32-ppmm/gen/f32-ppmm-4x1v-minmax-rvv.c
src/f32-ppmm/gen/f32-ppmm-4x2v-minmax-rvv.c)

SET(ALL_RVV_MICROKERNEL_SRCS ${PROD_RVV_MICROKERNEL_SRCS} ${NON_PROD_RVV_MICROKERNEL_SRCS})
17 changes: 17 additions & 0 deletions gen/rvv_microkernels.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ PROD_RVV_MICROKERNEL_SRCS = [
"src/f32-igemm/gen/f32-igemm-1x4v-minmax-rvv.c",
"src/f32-igemm/gen/f32-igemm-7x4v-minmax-rvv.c",
"src/f32-maxpool/gen/f32-maxpool-9p-minmax-rvv-u2v.c",
"src/f32-ppmm/gen/f32-ppmm-4x4v-minmax-rvv.c",
"src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u2v.c",
"src/f32-qu8-vcvt/gen/f32-qu8-vcvt-rvv-u2v.c",
"src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-rvv-rr2-p6-u4v.c",
Expand Down Expand Up @@ -165,6 +166,14 @@ NON_PROD_RVV_MICROKERNEL_SRCS = [
"src/f32-gemm/gen/f32-gemm-7x4v-rvv.c",
"src/f32-igemm/gen/f32-igemm-1x4v-rvv.c",
"src/f32-igemm/gen/f32-igemm-7x4v-rvv.c",
"src/f32-ppmm/gen/f32-ppmm-1x1v-minmax-rvv.c",
"src/f32-ppmm/gen/f32-ppmm-1x2v-minmax-rvv.c",
"src/f32-ppmm/gen/f32-ppmm-1x4v-minmax-rvv.c",
"src/f32-ppmm/gen/f32-ppmm-2x1v-minmax-rvv.c",
"src/f32-ppmm/gen/f32-ppmm-2x2v-minmax-rvv.c",
"src/f32-ppmm/gen/f32-ppmm-2x4v-minmax-rvv.c",
"src/f32-ppmm/gen/f32-ppmm-4x1v-minmax-rvv.c",
"src/f32-ppmm/gen/f32-ppmm-4x2v-minmax-rvv.c",
"src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u1v.c",
"src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u4v.c",
"src/f32-qs8-vcvt/gen/f32-qs8-vcvt-rvv-u8v.c",
Expand Down Expand Up @@ -354,6 +363,14 @@ NON_PROD_RVV_MICROKERNEL_SRCS = [
"src/x32-transposec/gen/x32-transposec-8xv1-rvv.c",
"src/x32-transposec/gen/x32-transposec-8xv2-rvv.c",
"src/x32-transposec/gen/x32-transposec-8xv4-rvv.c",
"src/f32-ppmm/gen/f32-ppmm-1x1v-minmax-rvv.c",
"src/f32-ppmm/gen/f32-ppmm-1x2v-minmax-rvv.c",
"src/f32-ppmm/gen/f32-ppmm-1x4v-minmax-rvv.c",
"src/f32-ppmm/gen/f32-ppmm-2x1v-minmax-rvv.c",
"src/f32-ppmm/gen/f32-ppmm-2x2v-minmax-rvv.c",
"src/f32-ppmm/gen/f32-ppmm-2x4v-minmax-rvv.c",
"src/f32-ppmm/gen/f32-ppmm-4x1v-minmax-rvv.c",
"src/f32-ppmm/gen/f32-ppmm-4x2v-minmax-rvv.c",
]

ALL_RVV_MICROKERNEL_SRCS = PROD_RVV_MICROKERNEL_SRCS + NON_PROD_RVV_MICROKERNEL_SRCS
11 changes: 11 additions & 0 deletions scripts/generate-f32-ppmm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,15 @@ tools/xngen src/f32-ppmm/8x8-aarch64-neonfma-cortex-a75.S.in -D PREFETCH=1 -o sr
################################### x86 SSE ###################################
tools/xngen src/f32-ppmm/sse.c.in -D MR=4 -D NR=8 -o src/f32-ppmm/gen/f32-ppmm-4x8-minmax-sse.c &

################################## RISC-V RVV #################################
tools/xngen src/f32-ppmm/rvv.c.in -D MR=1 -D NR=m1 -o src/f32-ppmm/gen/f32-ppmm-1x1v-minmax-rvv.c &
tools/xngen src/f32-ppmm/rvv.c.in -D MR=1 -D NR=m2 -o src/f32-ppmm/gen/f32-ppmm-1x2v-minmax-rvv.c &
tools/xngen src/f32-ppmm/rvv.c.in -D MR=1 -D NR=m4 -o src/f32-ppmm/gen/f32-ppmm-1x4v-minmax-rvv.c &
tools/xngen src/f32-ppmm/rvv.c.in -D MR=2 -D NR=m1 -o src/f32-ppmm/gen/f32-ppmm-2x1v-minmax-rvv.c &
tools/xngen src/f32-ppmm/rvv.c.in -D MR=2 -D NR=m2 -o src/f32-ppmm/gen/f32-ppmm-2x2v-minmax-rvv.c &
tools/xngen src/f32-ppmm/rvv.c.in -D MR=2 -D NR=m4 -o src/f32-ppmm/gen/f32-ppmm-2x4v-minmax-rvv.c &
tools/xngen src/f32-ppmm/rvv.c.in -D MR=4 -D NR=m1 -o src/f32-ppmm/gen/f32-ppmm-4x1v-minmax-rvv.c &
tools/xngen src/f32-ppmm/rvv.c.in -D MR=4 -D NR=m2 -o src/f32-ppmm/gen/f32-ppmm-4x2v-minmax-rvv.c &
tools/xngen src/f32-ppmm/rvv.c.in -D MR=4 -D NR=m4 -o src/f32-ppmm/gen/f32-ppmm-4x4v-minmax-rvv.c &

wait
80 changes: 80 additions & 0 deletions src/f32-ppmm/gen/f32-ppmm-1x1v-minmax-rvv.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// clang-format off
// Auto-generated file. Do not edit!
// Template: src/f32-ppmm/rvv.c.in
// Generator: tools/xngen
//
// Copyright 2024 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include <riscv_vector.h>

#include "src/xnnpack/common.h"
#include "src/xnnpack/ppmm.h"


void xnn_f32_ppmm_minmax_ukernel_1x1v__rvv(
size_t mr,
size_t nc,
size_t kc,
const float* restrict a,
const float* restrict w,
float* restrict c,
size_t cm_stride,
size_t cn_stride,
const struct xnn_f32_minmax_params* restrict params)
{
assert(mr != 0);
assert(mr <= 1);
assert(nc != 0);
assert(kc != 0);
assert(kc % sizeof(float) == 0);

const float vmin = params->scalar.min;
const float vmax = params->scalar.max;

float* c0 = c;

const size_t nr = __riscv_vsetvlmax_e32m1();
size_t vl = nr;
do {
if XNN_UNLIKELY(nc < nr) {
vl = __riscv_vsetvl_e32m1(nc);
}
nc -= vl;

// Load bias.
vfloat32m1_t vacc0 = __riscv_vle32_v_f32m1(w, vl);
w += nr;

// Inner product loop.
size_t k = kc;
do {
const float va0 = a[0];
a += 1;

vfloat32m1_t vb = __riscv_vle32_v_f32m1(w, vl);
w += nr;

vacc0 = __riscv_vfmacc_vf_f32m1(vacc0, va0, vb, vl);

k -= sizeof(float);
} while (k != 0);

// Clamp with min & max.
vacc0 = __riscv_vfmax_vf_f32m1(vacc0, vmin, vl);

vacc0 = __riscv_vfmin_vf_f32m1(vacc0, vmax, vl);

// Store results.
__riscv_vse32_v_f32m1(c0, vacc0, vl);
c0 = (float*) ((uintptr_t) c0 + cn_stride);

a = (const float*) ((uintptr_t) a - kc * 1);
} while (nc != 0);
}
80 changes: 80 additions & 0 deletions src/f32-ppmm/gen/f32-ppmm-1x2v-minmax-rvv.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// clang-format off
// Auto-generated file. Do not edit!
// Template: src/f32-ppmm/rvv.c.in
// Generator: tools/xngen
//
// Copyright 2024 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include <riscv_vector.h>

#include "src/xnnpack/common.h"
#include "src/xnnpack/ppmm.h"


void xnn_f32_ppmm_minmax_ukernel_1x2v__rvv(
size_t mr,
size_t nc,
size_t kc,
const float* restrict a,
const float* restrict w,
float* restrict c,
size_t cm_stride,
size_t cn_stride,
const struct xnn_f32_minmax_params* restrict params)
{
assert(mr != 0);
assert(mr <= 1);
assert(nc != 0);
assert(kc != 0);
assert(kc % sizeof(float) == 0);

const float vmin = params->scalar.min;
const float vmax = params->scalar.max;

float* c0 = c;

const size_t nr = __riscv_vsetvlmax_e32m2();
size_t vl = nr;
do {
if XNN_UNLIKELY(nc < nr) {
vl = __riscv_vsetvl_e32m2(nc);
}
nc -= vl;

// Load bias.
vfloat32m2_t vacc0 = __riscv_vle32_v_f32m2(w, vl);
w += nr;

// Inner product loop.
size_t k = kc;
do {
const float va0 = a[0];
a += 1;

vfloat32m2_t vb = __riscv_vle32_v_f32m2(w, vl);
w += nr;

vacc0 = __riscv_vfmacc_vf_f32m2(vacc0, va0, vb, vl);

k -= sizeof(float);
} while (k != 0);

// Clamp with min & max.
vacc0 = __riscv_vfmax_vf_f32m2(vacc0, vmin, vl);

vacc0 = __riscv_vfmin_vf_f32m2(vacc0, vmax, vl);

// Store results.
__riscv_vse32_v_f32m2(c0, vacc0, vl);
c0 = (float*) ((uintptr_t) c0 + cn_stride);

a = (const float*) ((uintptr_t) a - kc * 1);
} while (nc != 0);
}
80 changes: 80 additions & 0 deletions src/f32-ppmm/gen/f32-ppmm-1x4v-minmax-rvv.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// clang-format off
// Auto-generated file. Do not edit!
// Template: src/f32-ppmm/rvv.c.in
// Generator: tools/xngen
//
// Copyright 2024 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include <riscv_vector.h>

#include "src/xnnpack/common.h"
#include "src/xnnpack/ppmm.h"


void xnn_f32_ppmm_minmax_ukernel_1x4v__rvv(
size_t mr,
size_t nc,
size_t kc,
const float* restrict a,
const float* restrict w,
float* restrict c,
size_t cm_stride,
size_t cn_stride,
const struct xnn_f32_minmax_params* restrict params)
{
assert(mr != 0);
assert(mr <= 1);
assert(nc != 0);
assert(kc != 0);
assert(kc % sizeof(float) == 0);

const float vmin = params->scalar.min;
const float vmax = params->scalar.max;

float* c0 = c;

const size_t nr = __riscv_vsetvlmax_e32m4();
size_t vl = nr;
do {
if XNN_UNLIKELY(nc < nr) {
vl = __riscv_vsetvl_e32m4(nc);
}
nc -= vl;

// Load bias.
vfloat32m4_t vacc0 = __riscv_vle32_v_f32m4(w, vl);
w += nr;

// Inner product loop.
size_t k = kc;
do {
const float va0 = a[0];
a += 1;

vfloat32m4_t vb = __riscv_vle32_v_f32m4(w, vl);
w += nr;

vacc0 = __riscv_vfmacc_vf_f32m4(vacc0, va0, vb, vl);

k -= sizeof(float);
} while (k != 0);

// Clamp with min & max.
vacc0 = __riscv_vfmax_vf_f32m4(vacc0, vmin, vl);

vacc0 = __riscv_vfmin_vf_f32m4(vacc0, vmax, vl);

// Store results.
__riscv_vse32_v_f32m4(c0, vacc0, vl);
c0 = (float*) ((uintptr_t) c0 + cn_stride);

a = (const float*) ((uintptr_t) a - kc * 1);
} while (nc != 0);
}
Loading