Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions bench/vunary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,7 @@ void vlrelu(benchmark::State& state, uint64_t arch_flags,
->UseRealTime();
#include "src/bf16-f32-vcvt/bf16-f32-vcvt.inc"
#include "src/bf16-qs8-vcvt/bf16-qs8-vcvt.inc"
#include "src/bf16-qu8-vcvt/bf16-qu8-vcvt.inc"
#include "src/f32-bf16-vcvt/f32-bf16-vcvt.inc"
#include "src/f16-f32-vcvt/f16-f32-vcvt.inc"
#include "src/f16-qs8-vcvt/f16-qs8-vcvt.inc"
Expand Down
1 change: 1 addition & 0 deletions build_srcs.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ LOGGING_SRCS = [
MICROKERNEL_DEFS = [
"src/bf16-f32-vcvt/bf16-f32-vcvt.inc",
"src/bf16-qs8-vcvt/bf16-qs8-vcvt.inc",
"src/bf16-qu8-vcvt/bf16-qu8-vcvt.inc",
"src/bf16-rminmax/bf16-rmax.inc",
"src/bf16-rminmax/bf16-rmin.inc",
"src/bf16-rminmax/bf16-rminmax.inc",
Expand Down
4 changes: 4 additions & 0 deletions cmake/gen/scalar_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
SET(PROD_SCALAR_MICROKERNEL_SRCS
src/bf16-f32-vcvt/gen/bf16-f32-vcvt-scalar-u2.c
src/bf16-qs8-vcvt/gen/bf16-qs8-vcvt-scalar-imagic-u4.c
src/bf16-qu8-vcvt/gen/bf16-qu8-vcvt-scalar-imagic-u4.c
src/bf16-rminmax/gen/bf16-rmax-scalar-u2-acc2.c
src/bf16-rminmax/gen/bf16-rmin-scalar-u2-acc2.c
src/bf16-rminmax/gen/bf16-rminmax-scalar-u2-acc2.c
Expand Down Expand Up @@ -259,6 +260,9 @@ SET(NON_PROD_SCALAR_MICROKERNEL_SRCS
src/bf16-qs8-vcvt/gen/bf16-qs8-vcvt-scalar-imagic-u1.c
src/bf16-qs8-vcvt/gen/bf16-qs8-vcvt-scalar-imagic-u2.c
src/bf16-qs8-vcvt/gen/bf16-qs8-vcvt-scalar-imagic-u3.c
src/bf16-qu8-vcvt/gen/bf16-qu8-vcvt-scalar-imagic-u1.c
src/bf16-qu8-vcvt/gen/bf16-qu8-vcvt-scalar-imagic-u2.c
src/bf16-qu8-vcvt/gen/bf16-qu8-vcvt-scalar-imagic-u3.c
src/bf16-rminmax/gen/bf16-rmax-scalar-u1.c
src/bf16-rminmax/gen/bf16-rmax-scalar-u3-acc3.c
src/bf16-rminmax/gen/bf16-rmax-scalar-u4-acc2.c
Expand Down
4 changes: 4 additions & 0 deletions gen/scalar_microkernels.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
PROD_SCALAR_MICROKERNEL_SRCS = [
"src/bf16-f32-vcvt/gen/bf16-f32-vcvt-scalar-u2.c",
"src/bf16-qs8-vcvt/gen/bf16-qs8-vcvt-scalar-imagic-u4.c",
"src/bf16-qu8-vcvt/gen/bf16-qu8-vcvt-scalar-imagic-u4.c",
"src/bf16-rminmax/gen/bf16-rmax-scalar-u2-acc2.c",
"src/bf16-rminmax/gen/bf16-rmin-scalar-u2-acc2.c",
"src/bf16-rminmax/gen/bf16-rminmax-scalar-u2-acc2.c",
Expand Down Expand Up @@ -256,6 +257,9 @@ NON_PROD_SCALAR_MICROKERNEL_SRCS = [
"src/bf16-qs8-vcvt/gen/bf16-qs8-vcvt-scalar-imagic-u1.c",
"src/bf16-qs8-vcvt/gen/bf16-qs8-vcvt-scalar-imagic-u2.c",
"src/bf16-qs8-vcvt/gen/bf16-qs8-vcvt-scalar-imagic-u3.c",
"src/bf16-qu8-vcvt/gen/bf16-qu8-vcvt-scalar-imagic-u1.c",
"src/bf16-qu8-vcvt/gen/bf16-qu8-vcvt-scalar-imagic-u2.c",
"src/bf16-qu8-vcvt/gen/bf16-qu8-vcvt-scalar-imagic-u3.c",
"src/bf16-rminmax/gen/bf16-rmax-scalar-u1.c",
"src/bf16-rminmax/gen/bf16-rmax-scalar-u3-acc3.c",
"src/bf16-rminmax/gen/bf16-rmax-scalar-u4-acc2.c",
Expand Down
5 changes: 5 additions & 0 deletions scripts/generate-f32-qs8-vcvt.sh
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,11 @@ tools/xngen src/f32-qs8-vcvt/scalar-imagic.c.in -D BATCH_TILE=2 -D IDATATYPE=BF1
tools/xngen src/f32-qs8-vcvt/scalar-imagic.c.in -D BATCH_TILE=3 -D IDATATYPE=BF16 -D ODATATYPE=QS8 -D WASM=0 -o src/bf16-qs8-vcvt/gen/bf16-qs8-vcvt-scalar-imagic-u3.c &
tools/xngen src/f32-qs8-vcvt/scalar-imagic.c.in -D BATCH_TILE=4 -D IDATATYPE=BF16 -D ODATATYPE=QS8 -D WASM=0 -o src/bf16-qs8-vcvt/gen/bf16-qs8-vcvt-scalar-imagic-u4.c &

tools/xngen src/f32-qs8-vcvt/scalar-imagic.c.in -D BATCH_TILE=1 -D IDATATYPE=BF16 -D ODATATYPE=QU8 -D WASM=0 -o src/bf16-qu8-vcvt/gen/bf16-qu8-vcvt-scalar-imagic-u1.c &
tools/xngen src/f32-qs8-vcvt/scalar-imagic.c.in -D BATCH_TILE=2 -D IDATATYPE=BF16 -D ODATATYPE=QU8 -D WASM=0 -o src/bf16-qu8-vcvt/gen/bf16-qu8-vcvt-scalar-imagic-u2.c &
tools/xngen src/f32-qs8-vcvt/scalar-imagic.c.in -D BATCH_TILE=3 -D IDATATYPE=BF16 -D ODATATYPE=QU8 -D WASM=0 -o src/bf16-qu8-vcvt/gen/bf16-qu8-vcvt-scalar-imagic-u3.c &
tools/xngen src/f32-qs8-vcvt/scalar-imagic.c.in -D BATCH_TILE=4 -D IDATATYPE=BF16 -D ODATATYPE=QU8 -D WASM=0 -o src/bf16-qu8-vcvt/gen/bf16-qu8-vcvt-scalar-imagic-u4.c &

tools/xngen src/f32-qs8-vcvt/scalar-imagic.c.in -D BATCH_TILE=1 -D IDATATYPE=F16 -D ODATATYPE=QU8 -D WASM=0 -o src/f16-qu8-vcvt/gen/f16-qu8-vcvt-scalar-imagic-u1.c &
tools/xngen src/f32-qs8-vcvt/scalar-imagic.c.in -D BATCH_TILE=2 -D IDATATYPE=F16 -D ODATATYPE=QU8 -D WASM=0 -o src/f16-qu8-vcvt/gen/f16-qu8-vcvt-scalar-imagic-u2.c &
tools/xngen src/f32-qs8-vcvt/scalar-imagic.c.in -D BATCH_TILE=3 -D IDATATYPE=F16 -D ODATATYPE=QU8 -D WASM=0 -o src/f16-qu8-vcvt/gen/f16-qu8-vcvt-scalar-imagic-u3.c &
Expand Down
21 changes: 21 additions & 0 deletions src/bf16-qu8-vcvt/bf16-qu8-vcvt.inc
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// clang-format off
// Copyright 2025 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

// Table of the scalar "imagic" bf16 -> qu8 conversion microkernels, expressed
// as XNN_UKERNEL(...) invocations. XNN_UKERNEL is not defined in this file, so
// the including translation unit must define it before including this table.

// If the includer has not provided XNN_QUANTIZED, fall back to an identity
// wrapper so that XNN_QUANTIZED(uint8_t) expands to plain uint8_t.
// XNN_DEFINED_QUANTIZED records that the fallback came from this file so it
// can be removed again at the bottom.
#ifndef XNN_QUANTIZED
#define XNN_QUANTIZED(T) T
#define XNN_DEFINED_QUANTIZED
#endif

// One entry per unroll variant (u1..u4). The third argument matches the _uN
// suffix of the kernel name (presumably the batch tile -- confirm against the
// XNN_UKERNEL definition). NOTE(review): the meaning of the `false` argument
// is not visible from this file; check the macro definition before changing it.
XNN_UKERNEL(xnn_arch_none, xnn_bf16_qu8_vcvt_ukernel__scalar_imagic_u1, 1, false, xnn_bfloat16, XNN_QUANTIZED(uint8_t), struct xnn_bf16_qu8_cvt_params, xnn_init_bf16_qu8_cvt_scalar_params)
XNN_UKERNEL(xnn_arch_none, xnn_bf16_qu8_vcvt_ukernel__scalar_imagic_u2, 2, false, xnn_bfloat16, XNN_QUANTIZED(uint8_t), struct xnn_bf16_qu8_cvt_params, xnn_init_bf16_qu8_cvt_scalar_params)
XNN_UKERNEL(xnn_arch_none, xnn_bf16_qu8_vcvt_ukernel__scalar_imagic_u3, 3, false, xnn_bfloat16, XNN_QUANTIZED(uint8_t), struct xnn_bf16_qu8_cvt_params, xnn_init_bf16_qu8_cvt_scalar_params)
XNN_UKERNEL(xnn_arch_none, xnn_bf16_qu8_vcvt_ukernel__scalar_imagic_u4, 4, false, xnn_bfloat16, XNN_QUANTIZED(uint8_t), struct xnn_bf16_qu8_cvt_params, xnn_init_bf16_qu8_cvt_scalar_params)


// Remove the fallback XNN_QUANTIZED only if this file introduced it, leaving
// an includer-supplied definition untouched.
#ifdef XNN_DEFINED_QUANTIZED
#undef XNN_DEFINED_QUANTIZED
#undef XNN_QUANTIZED
#endif
57 changes: 57 additions & 0 deletions src/bf16-qu8-vcvt/gen/bf16-qu8-vcvt-scalar-imagic-u1.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// clang-format off
// Auto-generated file. Do not edit!
// Template: src/f32-qs8-vcvt/scalar-imagic.c.in
// Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <float.h>
#include <stddef.h>
#include <stdint.h>

#include "src/xnnpack/math.h"
#include "src/xnnpack/microparams.h"
#include "src/xnnpack/vcvt.h"

void xnn_bf16_qu8_vcvt_ukernel__scalar_imagic_u1(
size_t batch,
const xnn_bfloat16* input,
uint8_t* output,
const struct xnn_bf16_qu8_cvt_params* restrict params)
{
assert(batch != 0);
assert(batch % sizeof(xnn_bfloat16) == 0);
assert(input != NULL);
assert(output != NULL);

const xnn_bfloat16* i = input;
// Don't let the scale be 0, which can happen for large scales, and should
// not happen because this value is a reciprocal.
const float vscale = math_max_f32(FLT_MIN, xnn_bfloat16_to_float(params->scalar.scale));
const float vmagic_bias = 12582912.0f;
const float output_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point);
const float output_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point);
const int32_t vmagic_min = (int32_t) float_as_uint32(vmagic_bias + output_min_less_zero_point);
const int32_t vmagic_max = (int32_t) float_as_uint32(vmagic_bias + output_max_less_zero_point);
const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point;

do {
float vx = xnn_bfloat16_to_float(*i++);
vx *= vscale;
vx += vmagic_bias;

int32_t vy = (int32_t) float_as_uint32(vx);
vy = math_max_s32(vy, vmagic_min);
vy = math_min_s32(vy, vmagic_max);
vy -= vmagic_bias_less_zero_point;

*output++ = (uint8_t) vy;

batch -= sizeof(xnn_bfloat16);
} while (batch != 0);
}
83 changes: 83 additions & 0 deletions src/bf16-qu8-vcvt/gen/bf16-qu8-vcvt-scalar-imagic-u2.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
// clang-format off
// Auto-generated file. Do not edit!
// Template: src/f32-qs8-vcvt/scalar-imagic.c.in
// Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <float.h>
#include <stddef.h>
#include <stdint.h>

#include "src/xnnpack/common.h"
#include "src/xnnpack/math.h"
#include "src/xnnpack/microparams.h"
#include "src/xnnpack/vcvt.h"

void xnn_bf16_qu8_vcvt_ukernel__scalar_imagic_u2(
size_t batch,
const xnn_bfloat16* input,
uint8_t* output,
const struct xnn_bf16_qu8_cvt_params* restrict params)
{
assert(batch != 0);
assert(batch % sizeof(xnn_bfloat16) == 0);
assert(input != NULL);
assert(output != NULL);

const xnn_bfloat16* i = input;
// Don't let the scale be 0, which can happen for large scales, and should
// not happen because this value is a reciprocal.
const float vscale = math_max_f32(FLT_MIN, xnn_bfloat16_to_float(params->scalar.scale));
const float vmagic_bias = 12582912.0f;
const float output_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point);
const float output_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point);
const int32_t vmagic_min = (int32_t) float_as_uint32(vmagic_bias + output_min_less_zero_point);
const int32_t vmagic_max = (int32_t) float_as_uint32(vmagic_bias + output_max_less_zero_point);
const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point;

for (; batch >= 2 * sizeof(xnn_bfloat16); batch -= 2 * sizeof(xnn_bfloat16)) {
float vx0 = xnn_bfloat16_to_float(i[0]);
float vx1 = xnn_bfloat16_to_float(i[1]);
i += 2;

vx0 *= vscale;
vx1 *= vscale;

vx0 += vmagic_bias;
vx1 += vmagic_bias;

int32_t vy0 = (int32_t) float_as_uint32(vx0);
int32_t vy1 = (int32_t) float_as_uint32(vx1);

vy0 = math_max_s32(vy0, vmagic_min);
vy1 = math_max_s32(vy1, vmagic_min);

vy0 = math_min_s32(vy0, vmagic_max);
vy1 = math_min_s32(vy1, vmagic_max);

vy0 -= vmagic_bias_less_zero_point;
vy1 -= vmagic_bias_less_zero_point;

output[0] = (uint8_t) vy0;
output[1] = (uint8_t) vy1;
output += 2;
}
if XNN_UNLIKELY(batch != 0) {
float vx = xnn_bfloat16_to_float(*i);
vx *= vscale;
vx += vmagic_bias;

int32_t vy = (int32_t) float_as_uint32(vx);
vy = math_max_s32(vy, vmagic_min);
vy = math_min_s32(vy, vmagic_max);
vy -= vmagic_bias_less_zero_point;

*output = (uint8_t) vy;
}
}
95 changes: 95 additions & 0 deletions src/bf16-qu8-vcvt/gen/bf16-qu8-vcvt-scalar-imagic-u3.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
// clang-format off
// Auto-generated file. Do not edit!
// Template: src/f32-qs8-vcvt/scalar-imagic.c.in
// Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <float.h>
#include <stddef.h>
#include <stdint.h>

#include "src/xnnpack/common.h"
#include "src/xnnpack/math.h"
#include "src/xnnpack/microparams.h"
#include "src/xnnpack/vcvt.h"

void xnn_bf16_qu8_vcvt_ukernel__scalar_imagic_u3(
size_t batch,
const xnn_bfloat16* input,
uint8_t* output,
const struct xnn_bf16_qu8_cvt_params* restrict params)
{
assert(batch != 0);
assert(batch % sizeof(xnn_bfloat16) == 0);
assert(input != NULL);
assert(output != NULL);

const xnn_bfloat16* i = input;
// Don't let the scale be 0, which can happen for large scales, and should
// not happen because this value is a reciprocal.
const float vscale = math_max_f32(FLT_MIN, xnn_bfloat16_to_float(params->scalar.scale));
const float vmagic_bias = 12582912.0f;
const float output_min_less_zero_point = (float) ((int32_t) 0 - (int32_t) params->scalar.output_zero_point);
const float output_max_less_zero_point = (float) ((int32_t) 255 - (int32_t) params->scalar.output_zero_point);
const int32_t vmagic_min = (int32_t) float_as_uint32(vmagic_bias + output_min_less_zero_point);
const int32_t vmagic_max = (int32_t) float_as_uint32(vmagic_bias + output_max_less_zero_point);
const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) params->scalar.output_zero_point;

for (; batch >= 3 * sizeof(xnn_bfloat16); batch -= 3 * sizeof(xnn_bfloat16)) {
float vx0 = xnn_bfloat16_to_float(i[0]);
float vx1 = xnn_bfloat16_to_float(i[1]);
float vx2 = xnn_bfloat16_to_float(i[2]);
i += 3;

vx0 *= vscale;
vx1 *= vscale;
vx2 *= vscale;

vx0 += vmagic_bias;
vx1 += vmagic_bias;
vx2 += vmagic_bias;

int32_t vy0 = (int32_t) float_as_uint32(vx0);
int32_t vy1 = (int32_t) float_as_uint32(vx1);
int32_t vy2 = (int32_t) float_as_uint32(vx2);

vy0 = math_max_s32(vy0, vmagic_min);
vy1 = math_max_s32(vy1, vmagic_min);
vy2 = math_max_s32(vy2, vmagic_min);

vy0 = math_min_s32(vy0, vmagic_max);
vy1 = math_min_s32(vy1, vmagic_max);
vy2 = math_min_s32(vy2, vmagic_max);

vy0 -= vmagic_bias_less_zero_point;
vy1 -= vmagic_bias_less_zero_point;
vy2 -= vmagic_bias_less_zero_point;

output[0] = (uint8_t) vy0;
output[1] = (uint8_t) vy1;
output[2] = (uint8_t) vy2;
output += 3;
}
if XNN_UNLIKELY(batch != 0) {
do {
float vx = xnn_bfloat16_to_float(*i++);
vx *= vscale;
vx += vmagic_bias;

int32_t vy = (int32_t) float_as_uint32(vx);
vy = math_max_s32(vy, vmagic_min);
vy = math_min_s32(vy, vmagic_max);
vy -= vmagic_bias_less_zero_point;

*output++ = (uint8_t) vy;

batch -= sizeof(xnn_bfloat16);
} while (batch != 0);
}
}
Loading
Loading