diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 000000000..c62273f21 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,78 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +Wuffs (Wrangling Untrusted File Formats Safely) is a memory-safe programming language and standard library for decoding/encoding untrusted file formats. Wuffs source (`.wuffs` files) transpiles to C99 code. The generated C is distributed as a single-file library (`release/c/wuffs-*.c`). Safety guarantees (no buffer overflows, no integer overflows, no null dereferences) are enforced at compile time with zero runtime overhead. + +## Build Commands + +```bash +# Install Go-based toolchain (wuffs, wuffsfmt) +go install ./cmd/wuffs* + +# Regenerate C code after editing .wuffs files +wuffs gen std/... # all modules +wuffs gen std/gif # single module + +# Run tests +wuffs test # all tests +wuffs test std/gif # single module +wuffs test -mimic # compare against reference C libraries (giflib, libpng, etc.) + +# Run benchmarks +wuffs bench std/gif # single module +wuffs bench -mimic # compare performance vs reference libs + +# Run Go unit tests (for toolchain code in lang/) +go test ./... 
+ +# Build example programs +./build-example.sh example/zcat # single example +./build-example.sh # all examples + +# Build fuzz harnesses +./build-fuzz.sh + +# Full CI check (run before submitting a PR) +./build-all.sh +``` + +## Architecture + +**Toolchain (`lang/`)** — Go code implementing the Wuffs-to-C compiler: +- `lang/parse` — parser producing AST +- `lang/check` — type checker, bounds checker, proof/assertion verifier +- `lang/generate` — C code generation orchestration +- `lang/ast` — AST node definitions +- `lang/builtin` — built-in type and function signatures +- `lang/token` — tokenizer +- `lang/wuffsroot` — repository root discovery + +**Standard Library (`std/`)** — Wuffs source for codecs: image formats (gif, png, jpeg, bmp, webp, qoi, targa, wbmp, vp8, etc2, thumbhash), compression (deflate, gzip, zlib, bzip2, lzma, lzip, lzw, xz), checksums/hashes (crc32, crc64, adler32, sha256, xxhash32/64), data formats (json, cbor, netpbm, nie). + +**Generated Output (`release/c/`)** — Pre-generated single-file C libraries checked into the repo. Users `#include` these directly; define `WUFFS_IMPLEMENTATION` to compile the implementation, not just headers. + +**Internal C Templates (`internal/cgen/`)** — Base C code and auxiliary C++ helpers that get incorporated into generated output. + +**Tests (`test/c/`)** — C test files per codec in `test/c/std/`. Mimic tests in `test/c/mimiclib/` compare against third-party C libraries. Test data in `test/data/`. + +**CLI Tools (`cmd/`)** — `wuffs` (gen/test/bench/genlib), `wuffs-c`, `wuffsfmt` (auto-formatter), `ractool`, `dumbindent`. + +**Supporting Go Libraries (`lib/`)** — Go wrappers and utilities used by tools and examples. + +## Key Language Concepts + +- **Hermetic**: No I/O, no memory allocation, no syscalls. Callers provide all buffers. +- **Coroutines**: Methods marked `?` can suspend on `$short read`/`$short write`; callers refill buffers and resume. +- **Refinement types**: e.g. 
`base.u32[..= 255]` constrains value ranges, verified at compile time via interval arithmetic. +- **Facts and assertions**: Compile-time proof system; `assert` statements with named axioms for bounds safety. +- **Effects**: `!` marks impure methods, `?` marks coroutines. +- **Syntax differences from C**: `and`/`or`/`not` for logical ops, `<>` for not-equals, `~mod+`/`~sat+` for modular/saturating arithmetic, no operator precedence (explicit parens required). + +## Code Style + +- Wuffs source: auto-formatted with `wuffsfmt` +- C/C++ code: Chromium style (`.clang-format` in repo root) +- License: Apache-2.0 OR MIT (dual-licensed) diff --git a/fuzz/c/std/webp_fuzzer.c b/fuzz/c/std/webp_fuzzer.c new file mode 100644 index 000000000..4c4419038 --- /dev/null +++ b/fuzz/c/std/webp_fuzzer.c @@ -0,0 +1,93 @@ +// Copyright 2024 The Wuffs Authors. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. +// +// SPDX-License-Identifier: Apache-2.0 OR MIT + +// ---------------- + +// Silence the nested slash-star warning for the next comment's command line. +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wcomment" + +/* +This fuzzer (the fuzz function) is typically run indirectly, by a framework +such as https://github.com/google/oss-fuzz calling LLVMFuzzerTestOneInput. + +When working on the fuzz implementation, or as a coherence check, defining +WUFFS_CONFIG__FUZZLIB_MAIN will let you manually run fuzz over a set of files: + +gcc -DWUFFS_CONFIG__FUZZLIB_MAIN webp_fuzzer.c +./a.out ../../../test/data/*.webp +rm -f ./a.out + +It should print "PASS", amongst other information, and exit(0). 
+*/ + +#pragma clang diagnostic pop + +// Wuffs ships as a "single file C library" or "header file library" as per +// https://github.com/nothings/stb/blob/master/docs/stb_howto.txt +// +// To use that single file as a "foo.c"-like implementation, instead of a +// "foo.h"-like header, #define WUFFS_IMPLEMENTATION before #include'ing or +// compiling it. +#define WUFFS_IMPLEMENTATION + +#if defined(WUFFS_CONFIG__FUZZLIB_MAIN) +// Defining the WUFFS_CONFIG__STATIC_FUNCTIONS macro is optional, but when +// combined with WUFFS_IMPLEMENTATION, it demonstrates making all of Wuffs' +// functions have static storage. +// +// This can help the compiler ignore or discard unused code, which can produce +// faster compiles and smaller binaries. Other motivations are discussed in the +// "ALLOW STATIC IMPLEMENTATION" section of +// https://raw.githubusercontent.com/nothings/stb/master/docs/stb_howto.txt +#define WUFFS_CONFIG__STATIC_FUNCTIONS +#endif // defined(WUFFS_CONFIG__FUZZLIB_MAIN) + +// Defining the WUFFS_CONFIG__MODULE* macros are optional, but it lets users of +// release/c/etc.c choose which parts of Wuffs to build. That file contains the +// entire Wuffs standard library, implementing a variety of codecs and file +// formats. Without this macro definition, an optimizing compiler or linker may +// very well discard Wuffs code for unused codecs, but listing the Wuffs +// modules we use makes that process explicit. Preprocessing means that such +// code simply isn't compiled. +#define WUFFS_CONFIG__MODULES +#define WUFFS_CONFIG__MODULE__BASE +#define WUFFS_CONFIG__MODULE__VP8 +#define WUFFS_CONFIG__MODULE__WEBP + +// If building this program in an environment that doesn't easily accommodate +// relative includes, you can use the script/inline-c-relative-includes.go +// program to generate a stand-alone C file. 
+#include "../../../release/c/wuffs-unsupported-snapshot.c" +#include "../fuzzlib/fuzzlib.c" +#include "../fuzzlib/fuzzlib_image_decoder.c" + +const char* // +fuzz(wuffs_base__io_buffer* src, uint64_t hash) { + // Heap-allocate: the WebP decoder struct is too large for the stack. + wuffs_webp__decoder* dec = + (wuffs_webp__decoder*)calloc(1, sizeof(wuffs_webp__decoder)); + if (!dec) { + return "out of memory"; + } + wuffs_base__status status = wuffs_webp__decoder__initialize( + dec, sizeof *dec, WUFFS_VERSION, + (hash & 1) ? WUFFS_INITIALIZE__LEAVE_INTERNAL_BUFFERS_UNINITIALIZED : 0); + hash = wuffs_base__u64__rotate_right(hash, 1); + if (!wuffs_base__status__is_ok(&status)) { + free(dec); + return wuffs_base__status__message(&status); + } + const char* ret = fuzz_image_decoder( + src, hash, + wuffs_webp__decoder__upcast_as__wuffs_base__image_decoder(dec)); + free(dec); + return ret; +} diff --git a/internal/cgen/base/all-impl.c b/internal/cgen/base/all-impl.c index 01cdc56e3..bdf96ecb4 100644 --- a/internal/cgen/base/all-impl.c +++ b/internal/cgen/base/all-impl.c @@ -164,6 +164,8 @@ const uint32_t wuffs_private_impl__pixel_format__bits_per_channel[16] = { // ¡ INSERT base/pixconv-submodule-x86-avx2.c. +// ¡ INSERT base/pixconv-submodule-arm-neon.c. + #endif // !defined(WUFFS_CONFIG__MODULES) || // defined(WUFFS_CONFIG__MODULE__BASE) || // defined(WUFFS_CONFIG__MODULE__BASE__PIXCONV) diff --git a/internal/cgen/base/fundamental-public.h b/internal/cgen/base/fundamental-public.h index 0ad1ebcdd..eb0ea4d6f 100644 --- a/internal/cgen/base/fundamental-public.h +++ b/internal/cgen/base/fundamental-public.h @@ -408,6 +408,31 @@ wuffs_base__cpu_arch__have_x86_sse42(void) { #define WUFFS_BASE__GENERATED_C_CODE #endif +// WUFFS_BASE__GENERATED_C_CODE_NOINLINE is WUFFS_BASE__GENERATED_C_CODE with +// an additional noinline hint. It is used for cold helper functions (e.g. 
byte +// loading) that should not be inlined into their callers, so that the callers +// remain small enough for the compiler to inline them at their call sites. +#if defined(__GNUC__) || defined(__clang__) +#define WUFFS_BASE__GENERATED_C_CODE_NOINLINE \ + WUFFS_BASE__GENERATED_C_CODE __attribute__((noinline)) +#elif defined(_MSC_VER) +#define WUFFS_BASE__GENERATED_C_CODE_NOINLINE \ + WUFFS_BASE__GENERATED_C_CODE __declspec(noinline) +#else +#define WUFFS_BASE__GENERATED_C_CODE_NOINLINE WUFFS_BASE__GENERATED_C_CODE +#endif + +// WUFFS_BASE__GENERATED_C_CODE_ALWAYS_INLINE is +// WUFFS_BASE__GENERATED_C_CODE with an additional always_inline hint. It is +// used for hot helper functions that should always be inlined into their +// callers (e.g. coefficient decoding in boolean decoders). +#if defined(__GNUC__) || defined(__clang__) +#define WUFFS_BASE__GENERATED_C_CODE_ALWAYS_INLINE \ + WUFFS_BASE__GENERATED_C_CODE inline __attribute__((always_inline)) +#else +#define WUFFS_BASE__GENERATED_C_CODE_ALWAYS_INLINE WUFFS_BASE__GENERATED_C_CODE +#endif + // -------- // Options (bitwise or'ed together) for wuffs_foo__bar__initialize functions. @@ -1077,6 +1102,38 @@ wuffs_base__count_leading_zeroes_u64(uint64_t u) { #endif // (defined(__GNUC__) || defined(__clang__)) && (__SIZEOF_LONG__ == 8) +static inline uint32_t // +wuffs_base__count_leading_zeroes_u32(uint32_t u) { +#if defined(__GNUC__) || defined(__clang__) + return u ? 
((uint32_t)(__builtin_clz(u))) : 32u; +#else + if (u == 0) { + return 32; + } + uint32_t n = 0; + if ((u >> 16) == 0) { + n |= 16; + u <<= 16; + } + if ((u >> 24) == 0) { + n |= 8; + u <<= 8; + } + if ((u >> 28) == 0) { + n |= 4; + u <<= 4; + } + if ((u >> 30) == 0) { + n |= 2; + u <<= 2; + } + if ((u >> 31) == 0) { + n |= 1; + } + return n; +#endif +} + // -------- // Normally, the wuffs_base__peek_etc and wuffs_base__poke_etc implementations @@ -1111,6 +1168,10 @@ wuffs_base__peek_u16be__no_bounds_check(const uint8_t* p) { uint16_t x; memcpy(&x, p, 2); return _byteswap_ushort(x); +#elif (defined(__GNUC__) || defined(__clang__)) && defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) + uint16_t x; + memcpy(&x, p, 2); + return __builtin_bswap16(x); #else return (uint16_t)(((uint16_t)(p[0]) << 8) | ((uint16_t)(p[1]) << 0)); #endif @@ -1145,6 +1206,13 @@ wuffs_base__peek_u32be__no_bounds_check(const uint8_t* p) { uint32_t x; memcpy(&x, p, 4); return _byteswap_ulong(x); +#elif (defined(__GNUC__) || defined(__clang__)) && defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) + // Little-endian hosts only: memcpy + bswap guarantees a single 32-bit load. + // The byte-shift pattern below is endian-agnostic and equivalent, but + // compilers may fail to merge its four byte loads in large functions. 
+ uint32_t x; + memcpy(&x, p, 4); + return __builtin_bswap32(x); #else return ((uint32_t)(p[0]) << 24) | ((uint32_t)(p[1]) << 16) | ((uint32_t)(p[2]) << 8) | ((uint32_t)(p[3]) << 0); @@ -1213,6 +1281,10 @@ wuffs_base__peek_u64be__no_bounds_check(const uint8_t* p) { uint64_t x; memcpy(&x, p, 8); return _byteswap_uint64(x); +#elif (defined(__GNUC__) || defined(__clang__)) && defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) + uint64_t x; + memcpy(&x, p, 8); + return __builtin_bswap64(x); #else return ((uint64_t)(p[0]) << 56) | ((uint64_t)(p[1]) << 48) | ((uint64_t)(p[2]) << 40) | ((uint64_t)(p[3]) << 32) | diff --git a/internal/cgen/base/image-private.h b/internal/cgen/base/image-private.h index 4547b0f28..deaa83a45 100644 --- a/internal/cgen/base/image-private.h +++ b/internal/cgen/base/image-private.h @@ -69,6 +69,7 @@ wuffs_base__pixel_swizzler__swizzle_ycck( uint8_t v3, bool is_rgb_or_cmyk, bool triangle_filter_for_2to1, + bool src_is_bt601, wuffs_base__slice_u8 scratch_buffer_2k); // ---------------- Images (Utility) diff --git a/internal/cgen/base/image-public.h b/internal/cgen/base/image-public.h index 8a9e424a0..8df387515 100644 --- a/internal/cgen/base/image-public.h +++ b/internal/cgen/base/image-public.h @@ -370,6 +370,63 @@ wuffs_base__color_ycc__as__color_u32_abgr(uint8_t yy, uint8_t cb, uint8_t cr) { ((0x00FF0000 & rr32) >> 16); } +// wuffs_base__color_ycc_bt601__as__color_u32 converts from BT.601 studio-range +// YCbCr (as used by VP8, H.264, etc.) to 0xAARRGGBB. The alpha bits are +// always 0xFF. +// +// This uses the studio-range formula from ITU-R BT.601 / RFC 6386 section 13: +// R = 1.164*(Y-16) + 1.596*(Cr-128) +// G = 1.164*(Y-16) - 0.391*(Cb-128) - 0.813*(Cr-128) +// B = 1.164*(Y-16) + 2.018*(Cb-128) +// +// The fixed-point arithmetic matches libwebp's VP8YUVToR/G/B for bit-exact +// results. 
+static inline wuffs_base__color_u32_argb_premul // +wuffs_base__color_ycc_bt601__as__color_u32(uint8_t yy, + uint8_t cb, + uint8_t cr) { + int32_t yc = ((int32_t)yy * 19077) >> 8; + int32_t rc = ((int32_t)cr * 26149) >> 8; + int32_t gc_u = ((int32_t)cb * 6419) >> 8; + int32_t gc_v = ((int32_t)cr * 13320) >> 8; + int32_t bc = ((int32_t)cb * 33050) >> 8; + + int32_t rr = yc + rc - 14234; + int32_t gg = yc - gc_u - gc_v + 8708; + int32_t bb = yc + bc - 17685; + + // Clip to [0, 255]: if in range [0, 16320], shift right by 6. + uint32_t r = (rr < 0) ? 0u : (rr > 16320) ? 255u : ((uint32_t)rr >> 6); + uint32_t g = (gg < 0) ? 0u : (gg > 16320) ? 255u : ((uint32_t)gg >> 6); + uint32_t b = (bb < 0) ? 0u : (bb > 16320) ? 255u : ((uint32_t)bb >> 6); + + return 0xFF000000u | (r << 16) | (g << 8) | b; +} + +// wuffs_base__color_ycc_bt601__as__color_u32_abgr is like +// wuffs_base__color_ycc_bt601__as__color_u32 but the uint32_t returned is in +// 0xAABBGGRR order, not 0xAARRGGBB. +static inline uint32_t // +wuffs_base__color_ycc_bt601__as__color_u32_abgr(uint8_t yy, + uint8_t cb, + uint8_t cr) { + int32_t yc = ((int32_t)yy * 19077) >> 8; + int32_t rc = ((int32_t)cr * 26149) >> 8; + int32_t gc_u = ((int32_t)cb * 6419) >> 8; + int32_t gc_v = ((int32_t)cr * 13320) >> 8; + int32_t bc = ((int32_t)cb * 33050) >> 8; + + int32_t rr = yc + rc - 14234; + int32_t gg = yc - gc_u - gc_v + 8708; + int32_t bb = yc + bc - 17685; + + uint32_t r = (rr < 0) ? 0u : (rr > 16320) ? 255u : ((uint32_t)rr >> 6); + uint32_t g = (gg < 0) ? 0u : (gg > 16320) ? 255u : ((uint32_t)gg >> 6); + uint32_t b = (bb < 0) ? 0u : (bb > 16320) ? 
255u : ((uint32_t)bb >> 6); + + return 0xFF000000u | (b << 16) | (g << 8) | r; +} + // -------- typedef uint8_t wuffs_base__pixel_blend; diff --git a/internal/cgen/base/pixconv-submodule-arm-neon.c b/internal/cgen/base/pixconv-submodule-arm-neon.c new file mode 100644 index 000000000..9b32c3b41 --- /dev/null +++ b/internal/cgen/base/pixconv-submodule-arm-neon.c @@ -0,0 +1,216 @@ +// Copyright 2024 The Wuffs Authors. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. +// +// SPDX-License-Identifier: Apache-2.0 OR MIT + +// -------- + +// ‼ WUFFS MULTI-FILE SECTION +arm_neon +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) + +static void // +wuffs_private_impl__swizzle_ycc__convert_3_bgrx_arm_neon( + wuffs_base__pixel_buffer* dst, + uint32_t x, + uint32_t x_end, + uint32_t y, + const uint8_t* up0, + const uint8_t* up1, + const uint8_t* up2) { + size_t dst_stride = dst->private_impl.planes[0].stride; + uint8_t* dst_iter = dst->private_impl.planes[0].ptr + + (dst_stride * ((size_t)y)) + (4u * ((size_t)x)); + + // Per wuffs_base__color_ycc__as__color_u32, the formulae: + // + // R = Y + 1.40200 * Cr + // G = Y - 0.34414 * Cb - 0.71414 * Cr + // B = Y + 1.77200 * Cb + // + // When scaled by 1<<16: + // + // 0.34414 becomes 0x0581A = 22554. + // 0.71414 becomes 0x0B6D2 = 46802. + // 1.40200 becomes 0x166E9 = 91881. + // 1.77200 becomes 0x1C5A2 = 116130. + // + // Separate the integer and fractional parts, since we work with signed + // 16-bit SIMD lanes (int16x4_t for vmull_n_s16). + // + // -0x3A5E = -0x20000 + 0x1C5A2 The B:Cb factor. + // +0x66E9 = -0x10000 + 0x166E9 The R:Cr factor. + // -0x581A = +0x00000 - 0x0581A The G:Cb factor. + // +0x492E = +0x10000 - 0x0B6D2 The G:Cr factor. 
+ // + // B-Y = frac_B * Cb / 65536 + 2 * Cb + // R-Y = frac_R * Cr / 65536 + 1 * Cr + // G-Y = (frac_Gcb * Cb + frac_Gcr * Cr) / 65536 - 1 * Cr + + const int16_t k_frac_b_cb = -0x3A5E; // -14942 + const int16_t k_frac_r_cr = +0x66E9; // +26345 + const int16_t k_frac_g_cb = -0x581A; // -22554 + const int16_t k_frac_g_cr = +0x492E; // +18734 + + const int16x8_t bias = vdupq_n_s16(128); + const uint8x8_t alpha = vdup_n_u8(0xFF); + + while ((x + 8u) <= x_end) { + // Load 8 pixels of Y, Cb, Cr. + uint8x8_t y_u8 = vld1_u8(up0); + uint8x8_t cb_u8 = vld1_u8(up1); + uint8x8_t cr_u8 = vld1_u8(up2); + + // Widen to int16 and center chroma around zero. + int16x8_t yy = vreinterpretq_s16_u16(vmovl_u8(y_u8)); + int16x8_t cb = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(cb_u8)), bias); + int16x8_t cr = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(cr_u8)), bias); + + // Split into lo/hi halves for 32-bit precision multiplies. + int16x4_t cb_lo = vget_low_s16(cb); + int16x4_t cb_hi = vget_high_s16(cb); + int16x4_t cr_lo = vget_low_s16(cr); + int16x4_t cr_hi = vget_high_s16(cr); + + // R-Y = round(frac_R * Cr / 65536) + Cr + int16x8_t ry = vcombine_s16( + vrshrn_n_s32(vmull_n_s16(cr_lo, k_frac_r_cr), 16), + vrshrn_n_s32(vmull_n_s16(cr_hi, k_frac_r_cr), 16)); + ry = vaddq_s16(ry, cr); + + // B-Y = round(frac_B * Cb / 65536) + 2 * Cb + int16x8_t by = vcombine_s16( + vrshrn_n_s32(vmull_n_s16(cb_lo, k_frac_b_cb), 16), + vrshrn_n_s32(vmull_n_s16(cb_hi, k_frac_b_cb), 16)); + by = vaddq_s16(by, vaddq_s16(cb, cb)); + + // G-Y = round((frac_Gcb * Cb + frac_Gcr * Cr) / 65536) - Cr + int32x4_t gy32_lo = vmull_n_s16(cb_lo, k_frac_g_cb); + gy32_lo = vmlal_n_s16(gy32_lo, cr_lo, k_frac_g_cr); + int32x4_t gy32_hi = vmull_n_s16(cb_hi, k_frac_g_cb); + gy32_hi = vmlal_n_s16(gy32_hi, cr_hi, k_frac_g_cr); + int16x8_t gy = vcombine_s16( + vrshrn_n_s32(gy32_lo, 16), + vrshrn_n_s32(gy32_hi, 16)); + gy = vsubq_s16(gy, cr); + + // Add Y and clamp to [0, 255] via saturating unsigned narrow. 
+ uint8x8_t r = vqmovun_s16(vaddq_s16(yy, ry)); + uint8x8_t g = vqmovun_s16(vaddq_s16(yy, gy)); + uint8x8_t b = vqmovun_s16(vaddq_s16(yy, by)); + + // Interleave to BGRX and store 8 pixels (32 bytes). + uint8x8x4_t bgrx; + bgrx.val[0] = b; + bgrx.val[1] = g; + bgrx.val[2] = r; + bgrx.val[3] = alpha; + vst4_u8(dst_iter, bgrx); + + dst_iter += 32u; + up0 += 8u; + up1 += 8u; + up2 += 8u; + x += 8u; + } + + // Scalar tail. + for (; x < x_end; x++) { + uint32_t color = // + wuffs_base__color_ycc__as__color_u32( // + *up0++, *up1++, *up2++); + wuffs_base__poke_u32le__no_bounds_check(dst_iter, color); + dst_iter += 4u; + } +} + +// The rgbx flavor is exactly the same as the bgrx flavor except that the +// interleave order is {r, g, b, alpha} instead of {b, g, r, alpha}. +static void // +wuffs_private_impl__swizzle_ycc__convert_3_rgbx_arm_neon( + wuffs_base__pixel_buffer* dst, + uint32_t x, + uint32_t x_end, + uint32_t y, + const uint8_t* up0, + const uint8_t* up1, + const uint8_t* up2) { + size_t dst_stride = dst->private_impl.planes[0].stride; + uint8_t* dst_iter = dst->private_impl.planes[0].ptr + + (dst_stride * ((size_t)y)) + (4u * ((size_t)x)); + + const int16_t k_frac_b_cb = -0x3A5E; + const int16_t k_frac_r_cr = +0x66E9; + const int16_t k_frac_g_cb = -0x581A; + const int16_t k_frac_g_cr = +0x492E; + + const int16x8_t bias = vdupq_n_s16(128); + const uint8x8_t alpha = vdup_n_u8(0xFF); + + while ((x + 8u) <= x_end) { + uint8x8_t y_u8 = vld1_u8(up0); + uint8x8_t cb_u8 = vld1_u8(up1); + uint8x8_t cr_u8 = vld1_u8(up2); + + int16x8_t yy = vreinterpretq_s16_u16(vmovl_u8(y_u8)); + int16x8_t cb = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(cb_u8)), bias); + int16x8_t cr = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(cr_u8)), bias); + + int16x4_t cb_lo = vget_low_s16(cb); + int16x4_t cb_hi = vget_high_s16(cb); + int16x4_t cr_lo = vget_low_s16(cr); + int16x4_t cr_hi = vget_high_s16(cr); + + int16x8_t ry = vcombine_s16( + vrshrn_n_s32(vmull_n_s16(cr_lo, k_frac_r_cr), 16), + 
vrshrn_n_s32(vmull_n_s16(cr_hi, k_frac_r_cr), 16)); + ry = vaddq_s16(ry, cr); + + int16x8_t by = vcombine_s16( + vrshrn_n_s32(vmull_n_s16(cb_lo, k_frac_b_cb), 16), + vrshrn_n_s32(vmull_n_s16(cb_hi, k_frac_b_cb), 16)); + by = vaddq_s16(by, vaddq_s16(cb, cb)); + + int32x4_t gy32_lo = vmull_n_s16(cb_lo, k_frac_g_cb); + gy32_lo = vmlal_n_s16(gy32_lo, cr_lo, k_frac_g_cr); + int32x4_t gy32_hi = vmull_n_s16(cb_hi, k_frac_g_cb); + gy32_hi = vmlal_n_s16(gy32_hi, cr_hi, k_frac_g_cr); + int16x8_t gy = vcombine_s16( + vrshrn_n_s32(gy32_lo, 16), + vrshrn_n_s32(gy32_hi, 16)); + gy = vsubq_s16(gy, cr); + + uint8x8_t r = vqmovun_s16(vaddq_s16(yy, ry)); + uint8x8_t g = vqmovun_s16(vaddq_s16(yy, gy)); + uint8x8_t b = vqmovun_s16(vaddq_s16(yy, by)); + + // Interleave to RGBX and store 8 pixels (32 bytes). + uint8x8x4_t rgbx; + rgbx.val[0] = r; + rgbx.val[1] = g; + rgbx.val[2] = b; + rgbx.val[3] = alpha; + vst4_u8(dst_iter, rgbx); + + dst_iter += 32u; + up0 += 8u; + up1 += 8u; + up2 += 8u; + x += 8u; + } + + for (; x < x_end; x++) { + uint32_t color = // + wuffs_base__color_ycc__as__color_u32_abgr( // + *up0++, *up1++, *up2++); + wuffs_base__poke_u32le__no_bounds_check(dst_iter, color); + dst_iter += 4u; + } +} + +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +// ‼ WUFFS MULTI-FILE SECTION -arm_neon diff --git a/internal/cgen/base/pixconv-submodule-x86-avx2.c b/internal/cgen/base/pixconv-submodule-x86-avx2.c index bba8dadc7..cf1c272da 100644 --- a/internal/cgen/base/pixconv-submodule-x86-avx2.c +++ b/internal/cgen/base/pixconv-submodule-x86-avx2.c @@ -433,6 +433,311 @@ wuffs_private_impl__swizzle_ycc__convert_3_rgbx_x86_avx2( } } +// -------- + +// BT.601 studio-range YCbCr to BGRX/RGBX, AVX2. 
+// +// This matches the scalar wuffs_base__color_ycc_bt601__as__color_u32 formula: +// yc = (Y * 19077) >> 8 +// rc = (Cr * 26149) >> 8 +// gc_u = (Cb * 6419) >> 8 +// gc_v = (Cr * 13320) >> 8 +// bc = (Cb * 33050) >> 8 +// R = clip((yc + rc - 14234) >> 6, 0, 255) +// G = clip((yc - gc_u - gc_v + 8708) >> 6, 0, 255) +// B = clip((yc + bc - 17685) >> 6, 0, 255) +// +// SIMD approach: compute (X * K) >> 8 via mullo+mulhi_epu16, combine as i16, +// shift right by 6, and use packus_epi16 for [0,255] clamping. The B channel +// uses adds_epi16 to avoid overflow (yc+bc can exceed i16 max). + +// Helper: compute (X * K) >> 8 in i16 lanes, where X is u8-in-i16 [0..255] +// and K is a u16 constant. Uses mullo_epi16 (low 16 bits) and mulhi_epu16 +// (unsigned high 16 bits) to form the result. +#define WUFFS_PRIVATE_IMPL__MULDIV256(x, k) \ + _mm256_or_si256(_mm256_srli_epi16(_mm256_mullo_epi16((x), (k)), 8), \ + _mm256_slli_epi16(_mm256_mulhi_epu16((x), (k)), 8)) + +WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2,avx2") +static void // +wuffs_private_impl__swizzle_ycc_bt601__convert_3_bgrx_x86_avx2( + wuffs_base__pixel_buffer* dst, + uint32_t x, + uint32_t x_end, + uint32_t y, + const uint8_t* up0, + const uint8_t* up1, + const uint8_t* up2) { + if ((x + 32u) > x_end) { + wuffs_private_impl__swizzle_ycc_bt601__convert_3_bgrx( // + dst, x, x_end, y, up0, up1, up2); + return; + } + + size_t dst_stride = dst->private_impl.planes[0].stride; + uint8_t* dst_iter = dst->private_impl.planes[0].ptr + + (dst_stride * ((size_t)y)) + (4u * ((size_t)x)); + + const __m256i u00FF = _mm256_set1_epi16(+0x00FF); + const __m256i uFFFF = _mm256_set1_epi16(-0x0001); + + // BT.601 fixed-point constants for (X * K) >> 8 via MULDIV256. 
+ const __m256i k_19077 = _mm256_set1_epi16(19077); + const __m256i k_26149 = _mm256_set1_epi16(26149); + const __m256i k_6419 = _mm256_set1_epi16(6419); + const __m256i k_13320 = _mm256_set1_epi16(13320); + + // For the B channel, (Cb * 33050) >> 8 can exceed i16 max (32920 for Cb=255). + // We center: bc_c = ((Cb-128) * 33050) >> 8, range [-16525, 16395]. + // Decompose 33050 = 129*256 + 26, so (x*33050)>>8 = x*129 + (x*26)>>8. + const __m256i k_128 = _mm256_set1_epi16(128); + const __m256i k_26 = _mm256_set1_epi16(26); + + const __m256i k_r_off = _mm256_set1_epi16(-14234); + const __m256i k_g_off = _mm256_set1_epi16(+8708); + // B offset: (128*33050)>>8 = 16525, so 16525 - 17685 = -1160. + const __m256i k_b_off = _mm256_set1_epi16(-1160); + + while (x < x_end) { + __m256i yy_all = _mm256_lddqu_si256((const __m256i*)(const void*)up0); + __m256i cb_all = _mm256_lddqu_si256((const __m256i*)(const void*)up1); + __m256i cr_all = _mm256_lddqu_si256((const __m256i*)(const void*)up2); + + // Split into even and odd i16 lanes. + __m256i yy_eve = _mm256_and_si256(yy_all, u00FF); + __m256i yy_odd = _mm256_srli_epi16(yy_all, 8); + __m256i cb_eve = _mm256_and_si256(cb_all, u00FF); + __m256i cb_odd = _mm256_srli_epi16(cb_all, 8); + __m256i cr_eve = _mm256_and_si256(cr_all, u00FF); + __m256i cr_odd = _mm256_srli_epi16(cr_all, 8); + + // yc = (Y * 19077) >> 8. Range [0, 19002]. + __m256i yc_eve = WUFFS_PRIVATE_IMPL__MULDIV256(yy_eve, k_19077); + __m256i yc_odd = WUFFS_PRIVATE_IMPL__MULDIV256(yy_odd, k_19077); + + // rc = (Cr * 26149) >> 8. Range [0, 26046]. + __m256i rc_eve = WUFFS_PRIVATE_IMPL__MULDIV256(cr_eve, k_26149); + __m256i rc_odd = WUFFS_PRIVATE_IMPL__MULDIV256(cr_odd, k_26149); + + // gc_u = (Cb * 6419) >> 8. Range [0, 6393]. + __m256i gc_u_eve = WUFFS_PRIVATE_IMPL__MULDIV256(cb_eve, k_6419); + __m256i gc_u_odd = WUFFS_PRIVATE_IMPL__MULDIV256(cb_odd, k_6419); + + // gc_v = (Cr * 13320) >> 8. Range [0, 13267]. 
+ __m256i gc_v_eve = WUFFS_PRIVATE_IMPL__MULDIV256(cr_eve, k_13320); + __m256i gc_v_odd = WUFFS_PRIVATE_IMPL__MULDIV256(cr_odd, k_13320); + + // bc_c = ((Cb-128) * 33050) >> 8 = (Cb-128)*129 + ((Cb-128)*26)>>8. + // Range [-16525, 16395], fits i16. + __m256i cb_c_eve = _mm256_sub_epi16(cb_eve, k_128); + __m256i cb_c_odd = _mm256_sub_epi16(cb_odd, k_128); + __m256i bc_c_eve = _mm256_add_epi16( + _mm256_add_epi16(_mm256_slli_epi16(cb_c_eve, 7), cb_c_eve), + _mm256_srai_epi16(_mm256_mullo_epi16(cb_c_eve, k_26), 8)); + __m256i bc_c_odd = _mm256_add_epi16( + _mm256_add_epi16(_mm256_slli_epi16(cb_c_odd, 7), cb_c_odd), + _mm256_srai_epi16(_mm256_mullo_epi16(cb_c_odd, k_26), 8)); + + // R = (yc + rc - 14234) >> 6. Max = 19002+26046-14234 = 30814 < 32767. + __m256i r_eve = _mm256_srai_epi16( + _mm256_add_epi16(_mm256_add_epi16(yc_eve, rc_eve), k_r_off), 6); + __m256i r_odd = _mm256_srai_epi16( + _mm256_add_epi16(_mm256_add_epi16(yc_odd, rc_odd), k_r_off), 6); + + // G = (yc - gc_u - gc_v + 8708) >> 6. Range [-10952, 27710], fits i16. + __m256i g_eve = _mm256_srai_epi16( + _mm256_add_epi16( + _mm256_sub_epi16(_mm256_sub_epi16(yc_eve, gc_u_eve), gc_v_eve), + k_g_off), + 6); + __m256i g_odd = _mm256_srai_epi16( + _mm256_add_epi16( + _mm256_sub_epi16(_mm256_sub_epi16(yc_odd, gc_u_odd), gc_v_odd), + k_g_off), + 6); + + // B = (yc + bc_c - 1160) >> 6. Range [-17685, 34237]. Use adds_epi16 + // for the final sum: saturates to 32767 for sums > 32767, which after + // >>6 = 511 gets clamped to 255 by packus. Correct. + __m256i b_eve = _mm256_srai_epi16( + _mm256_adds_epi16(_mm256_add_epi16(yc_eve, k_b_off), bc_c_eve), 6); + __m256i b_odd = _mm256_srai_epi16( + _mm256_adds_epi16(_mm256_add_epi16(yc_odd, k_b_off), bc_c_odd), 6); + + // Pack i16 to u8 with saturation. 
+ __m256i packed_b_eve = _mm256_packus_epi16(b_eve, b_eve); + __m256i packed_b_odd = _mm256_packus_epi16(b_odd, b_odd); + __m256i packed_g_eve = _mm256_packus_epi16(g_eve, g_eve); + __m256i packed_g_odd = _mm256_packus_epi16(g_odd, g_odd); + __m256i packed_r_eve = _mm256_packus_epi16(r_eve, r_eve); + __m256i packed_r_odd = _mm256_packus_epi16(r_odd, r_odd); + + // Interleave to BGRX, same as the JFIF converter. + __m256i mix00 = _mm256_unpacklo_epi8(packed_b_eve, packed_g_eve); + __m256i mix01 = _mm256_unpacklo_epi8(packed_b_odd, packed_g_odd); + __m256i mix02 = _mm256_unpacklo_epi8(packed_r_eve, uFFFF); + __m256i mix03 = _mm256_unpacklo_epi8(packed_r_odd, uFFFF); + + __m256i mix10 = _mm256_unpacklo_epi16(mix00, mix02); + __m256i mix11 = _mm256_unpacklo_epi16(mix01, mix03); + __m256i mix12 = _mm256_unpackhi_epi16(mix00, mix02); + __m256i mix13 = _mm256_unpackhi_epi16(mix01, mix03); + + __m256i mix20 = _mm256_unpacklo_epi32(mix10, mix11); + __m256i mix21 = _mm256_unpackhi_epi32(mix10, mix11); + __m256i mix22 = _mm256_unpacklo_epi32(mix12, mix13); + __m256i mix23 = _mm256_unpackhi_epi32(mix12, mix13); + + __m256i mix30 = _mm256_permute2x128_si256(mix20, mix21, 0x20); + __m256i mix31 = _mm256_permute2x128_si256(mix22, mix23, 0x20); + __m256i mix32 = _mm256_permute2x128_si256(mix20, mix21, 0x31); + __m256i mix33 = _mm256_permute2x128_si256(mix22, mix23, 0x31); + + _mm256_storeu_si256((__m256i*)(void*)(dst_iter + 0x00), mix30); + _mm256_storeu_si256((__m256i*)(void*)(dst_iter + 0x20), mix31); + _mm256_storeu_si256((__m256i*)(void*)(dst_iter + 0x40), mix32); + _mm256_storeu_si256((__m256i*)(void*)(dst_iter + 0x60), mix33); + + uint32_t n = 32u - (31u & (x - x_end)); + dst_iter += 4u * n; + up0 += n; + up1 += n; + up2 += n; + x += n; + } +} + +// The rgbx flavor is the same as the bgrx flavor above but swaps B and R in +// the interleave stage. 
+WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2,avx2") +static void // +wuffs_private_impl__swizzle_ycc_bt601__convert_3_rgbx_x86_avx2( + wuffs_base__pixel_buffer* dst, + uint32_t x, + uint32_t x_end, + uint32_t y, + const uint8_t* up0, + const uint8_t* up1, + const uint8_t* up2) { + if ((x + 32u) > x_end) { + wuffs_private_impl__swizzle_ycc_bt601__convert_3_rgbx( // + dst, x, x_end, y, up0, up1, up2); + return; + } + + size_t dst_stride = dst->private_impl.planes[0].stride; + uint8_t* dst_iter = dst->private_impl.planes[0].ptr + + (dst_stride * ((size_t)y)) + (4u * ((size_t)x)); + + const __m256i u00FF = _mm256_set1_epi16(+0x00FF); + const __m256i uFFFF = _mm256_set1_epi16(-0x0001); + + const __m256i k_19077 = _mm256_set1_epi16(19077); + const __m256i k_26149 = _mm256_set1_epi16(26149); + const __m256i k_6419 = _mm256_set1_epi16(6419); + const __m256i k_13320 = _mm256_set1_epi16(13320); + const __m256i k_128 = _mm256_set1_epi16(128); + const __m256i k_26 = _mm256_set1_epi16(26); + + const __m256i k_r_off = _mm256_set1_epi16(-14234); + const __m256i k_g_off = _mm256_set1_epi16(+8708); + const __m256i k_b_off = _mm256_set1_epi16(-1160); + + while (x < x_end) { + __m256i yy_all = _mm256_lddqu_si256((const __m256i*)(const void*)up0); + __m256i cb_all = _mm256_lddqu_si256((const __m256i*)(const void*)up1); + __m256i cr_all = _mm256_lddqu_si256((const __m256i*)(const void*)up2); + + __m256i yy_eve = _mm256_and_si256(yy_all, u00FF); + __m256i yy_odd = _mm256_srli_epi16(yy_all, 8); + __m256i cb_eve = _mm256_and_si256(cb_all, u00FF); + __m256i cb_odd = _mm256_srli_epi16(cb_all, 8); + __m256i cr_eve = _mm256_and_si256(cr_all, u00FF); + __m256i cr_odd = _mm256_srli_epi16(cr_all, 8); + + __m256i yc_eve = WUFFS_PRIVATE_IMPL__MULDIV256(yy_eve, k_19077); + __m256i yc_odd = WUFFS_PRIVATE_IMPL__MULDIV256(yy_odd, k_19077); + __m256i rc_eve = WUFFS_PRIVATE_IMPL__MULDIV256(cr_eve, k_26149); + __m256i rc_odd = WUFFS_PRIVATE_IMPL__MULDIV256(cr_odd, k_26149); + __m256i 
gc_u_eve = WUFFS_PRIVATE_IMPL__MULDIV256(cb_eve, k_6419); + __m256i gc_u_odd = WUFFS_PRIVATE_IMPL__MULDIV256(cb_odd, k_6419); + __m256i gc_v_eve = WUFFS_PRIVATE_IMPL__MULDIV256(cr_eve, k_13320); + __m256i gc_v_odd = WUFFS_PRIVATE_IMPL__MULDIV256(cr_odd, k_13320); + + __m256i cb_c_eve = _mm256_sub_epi16(cb_eve, k_128); + __m256i cb_c_odd = _mm256_sub_epi16(cb_odd, k_128); + __m256i bc_c_eve = _mm256_add_epi16( + _mm256_add_epi16(_mm256_slli_epi16(cb_c_eve, 7), cb_c_eve), + _mm256_srai_epi16(_mm256_mullo_epi16(cb_c_eve, k_26), 8)); + __m256i bc_c_odd = _mm256_add_epi16( + _mm256_add_epi16(_mm256_slli_epi16(cb_c_odd, 7), cb_c_odd), + _mm256_srai_epi16(_mm256_mullo_epi16(cb_c_odd, k_26), 8)); + + __m256i r_eve = _mm256_srai_epi16( + _mm256_add_epi16(_mm256_add_epi16(yc_eve, rc_eve), k_r_off), 6); + __m256i r_odd = _mm256_srai_epi16( + _mm256_add_epi16(_mm256_add_epi16(yc_odd, rc_odd), k_r_off), 6); + + __m256i g_eve = _mm256_srai_epi16( + _mm256_add_epi16( + _mm256_sub_epi16(_mm256_sub_epi16(yc_eve, gc_u_eve), gc_v_eve), + k_g_off), + 6); + __m256i g_odd = _mm256_srai_epi16( + _mm256_add_epi16( + _mm256_sub_epi16(_mm256_sub_epi16(yc_odd, gc_u_odd), gc_v_odd), + k_g_off), + 6); + + __m256i b_eve = _mm256_srai_epi16( + _mm256_adds_epi16(_mm256_add_epi16(yc_eve, k_b_off), bc_c_eve), 6); + __m256i b_odd = _mm256_srai_epi16( + _mm256_adds_epi16(_mm256_add_epi16(yc_odd, k_b_off), bc_c_odd), 6); + + __m256i packed_b_eve = _mm256_packus_epi16(b_eve, b_eve); + __m256i packed_b_odd = _mm256_packus_epi16(b_odd, b_odd); + __m256i packed_g_eve = _mm256_packus_epi16(g_eve, g_eve); + __m256i packed_g_odd = _mm256_packus_epi16(g_odd, g_odd); + __m256i packed_r_eve = _mm256_packus_epi16(r_eve, r_eve); + __m256i packed_r_odd = _mm256_packus_epi16(r_odd, r_odd); + + // § Note the swapped B and R channels compared to bgrx. 
+ __m256i mix00 = _mm256_unpacklo_epi8(packed_r_eve, packed_g_eve); + __m256i mix01 = _mm256_unpacklo_epi8(packed_r_odd, packed_g_odd); + __m256i mix02 = _mm256_unpacklo_epi8(packed_b_eve, uFFFF); + __m256i mix03 = _mm256_unpacklo_epi8(packed_b_odd, uFFFF); + + __m256i mix10 = _mm256_unpacklo_epi16(mix00, mix02); + __m256i mix11 = _mm256_unpacklo_epi16(mix01, mix03); + __m256i mix12 = _mm256_unpackhi_epi16(mix00, mix02); + __m256i mix13 = _mm256_unpackhi_epi16(mix01, mix03); + + __m256i mix20 = _mm256_unpacklo_epi32(mix10, mix11); + __m256i mix21 = _mm256_unpackhi_epi32(mix10, mix11); + __m256i mix22 = _mm256_unpacklo_epi32(mix12, mix13); + __m256i mix23 = _mm256_unpackhi_epi32(mix12, mix13); + + __m256i mix30 = _mm256_permute2x128_si256(mix20, mix21, 0x20); + __m256i mix31 = _mm256_permute2x128_si256(mix22, mix23, 0x20); + __m256i mix32 = _mm256_permute2x128_si256(mix20, mix21, 0x31); + __m256i mix33 = _mm256_permute2x128_si256(mix22, mix23, 0x31); + + _mm256_storeu_si256((__m256i*)(void*)(dst_iter + 0x00), mix30); + _mm256_storeu_si256((__m256i*)(void*)(dst_iter + 0x20), mix31); + _mm256_storeu_si256((__m256i*)(void*)(dst_iter + 0x40), mix32); + _mm256_storeu_si256((__m256i*)(void*)(dst_iter + 0x60), mix33); + + uint32_t n = 32u - (31u & (x - x_end)); + dst_iter += 4u * n; + up0 += n; + up1 += n; + up2 += n; + x += n; + } +} + +#undef WUFFS_PRIVATE_IMPL__MULDIV256 + #if defined(__GNUC__) && !defined(__clang__) // No-op. 
#else diff --git a/internal/cgen/base/pixconv-submodule-ycck.c b/internal/cgen/base/pixconv-submodule-ycck.c index f757cc191..94370a268 100644 --- a/internal/cgen/base/pixconv-submodule-ycck.c +++ b/internal/cgen/base/pixconv-submodule-ycck.c @@ -33,6 +33,28 @@ wuffs_private_impl__swizzle_ycc__convert_3_rgbx_x86_avx2( const uint8_t* up1, const uint8_t* up2); +WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2,avx2") +static void // +wuffs_private_impl__swizzle_ycc_bt601__convert_3_bgrx_x86_avx2( + wuffs_base__pixel_buffer* dst, + uint32_t x, + uint32_t x_end, + uint32_t y, + const uint8_t* up0, + const uint8_t* up1, + const uint8_t* up2); + +WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2,avx2") +static void // +wuffs_private_impl__swizzle_ycc_bt601__convert_3_rgbx_x86_avx2( + wuffs_base__pixel_buffer* dst, + uint32_t x, + uint32_t x_end, + uint32_t y, + const uint8_t* up0, + const uint8_t* up1, + const uint8_t* up2); + #if defined(__GNUC__) && !defined(__clang__) // No-op. #else @@ -49,6 +71,28 @@ wuffs_private_impl__swizzle_ycc__upsample_inv_h2v2_triangle_x86_avx2( #endif #endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +static void // +wuffs_private_impl__swizzle_ycc__convert_3_bgrx_arm_neon( + wuffs_base__pixel_buffer* dst, + uint32_t x, + uint32_t x_end, + uint32_t y, + const uint8_t* up0, + const uint8_t* up1, + const uint8_t* up2); + +static void // +wuffs_private_impl__swizzle_ycc__convert_3_rgbx_arm_neon( + wuffs_base__pixel_buffer* dst, + uint32_t x, + uint32_t x_end, + uint32_t y, + const uint8_t* up0, + const uint8_t* up1, + const uint8_t* up2); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) + // -------- static inline uint32_t // @@ -227,6 +271,69 @@ wuffs_private_impl__swizzle_ycc__convert_3_rgbx(wuffs_base__pixel_buffer* dst, } } +// BT.601 studio-range variants for VP8/H.264. 
+ +static void // +wuffs_private_impl__swizzle_ycc_bt601__convert_3_general( + wuffs_base__pixel_buffer* dst, + uint32_t x, + uint32_t x_end, + uint32_t y, + const uint8_t* up0, + const uint8_t* up1, + const uint8_t* up2) { + for (; x < x_end; x++) { + uint32_t color = // + wuffs_base__color_ycc_bt601__as__color_u32( // + *up0++, *up1++, *up2++); + wuffs_base__pixel_buffer__set_color_u32_at(dst, x, y, color); + } +} + +static void // +wuffs_private_impl__swizzle_ycc_bt601__convert_3_bgrx( + wuffs_base__pixel_buffer* dst, + uint32_t x, + uint32_t x_end, + uint32_t y, + const uint8_t* up0, + const uint8_t* up1, + const uint8_t* up2) { + size_t dst_stride = dst->private_impl.planes[0].stride; + uint8_t* dst_iter = dst->private_impl.planes[0].ptr + + (dst_stride * ((size_t)y)) + (4u * ((size_t)x)); + + for (; x < x_end; x++) { + uint32_t color = // + wuffs_base__color_ycc_bt601__as__color_u32( // + *up0++, *up1++, *up2++); + wuffs_base__poke_u32le__no_bounds_check(dst_iter, color); + dst_iter += 4u; + } +} + +static void // +wuffs_private_impl__swizzle_ycc_bt601__convert_3_rgbx( + wuffs_base__pixel_buffer* dst, + uint32_t x, + uint32_t x_end, + uint32_t y, + const uint8_t* up0, + const uint8_t* up1, + const uint8_t* up2) { + size_t dst_stride = dst->private_impl.planes[0].stride; + uint8_t* dst_iter = dst->private_impl.planes[0].ptr + + (dst_stride * ((size_t)y)) + (4u * ((size_t)x)); + + for (; x < x_end; x++) { + uint32_t color = // + wuffs_base__color_ycc_bt601__as__color_u32_abgr( // + *up0++, *up1++, *up2++); + wuffs_base__poke_u32le__no_bounds_check(dst_iter, color); + dst_iter += 4u; + } +} + // -------- // wuffs_private_impl__swizzle_ycc__upsample_func upsamples to a destination @@ -1226,6 +1333,7 @@ wuffs_base__pixel_swizzler__swizzle_ycck( uint8_t v3, bool is_rgb_or_cmyk, bool triangle_filter_for_2to1, + bool src_is_bt601, wuffs_base__slice_u8 scratch_buffer_2k) { if (!p) { return wuffs_base__make_status(wuffs_base__error__bad_receiver); @@ -1438,6 +1546,37 
@@ wuffs_base__pixel_swizzler__swizzle_ycck( if (is_rgb_or_cmyk) { conv3func = &wuffs_private_impl__swizzle_rgb__convert_3_general; + } else if (src_is_bt601) { + // BT.601 studio-range YCbCr (VP8, H.264). + switch (dst->pixcfg.private_impl.pixfmt.repr) { + case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL: + case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL: + case WUFFS_BASE__PIXEL_FORMAT__BGRX: +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) + if (wuffs_base__cpu_arch__have_x86_avx2()) { + conv3func = + &wuffs_private_impl__swizzle_ycc_bt601__convert_3_bgrx_x86_avx2; + break; + } +#endif + conv3func = &wuffs_private_impl__swizzle_ycc_bt601__convert_3_bgrx; + break; + case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL: + case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL: + case WUFFS_BASE__PIXEL_FORMAT__RGBX: +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) + if (wuffs_base__cpu_arch__have_x86_avx2()) { + conv3func = + &wuffs_private_impl__swizzle_ycc_bt601__convert_3_rgbx_x86_avx2; + break; + } +#endif + conv3func = &wuffs_private_impl__swizzle_ycc_bt601__convert_3_rgbx; + break; + default: + conv3func = &wuffs_private_impl__swizzle_ycc_bt601__convert_3_general; + break; + } } else { switch (dst->pixcfg.private_impl.pixfmt.repr) { case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL: @@ -1448,6 +1587,10 @@ wuffs_base__pixel_swizzler__swizzle_ycck( conv3func = &wuffs_private_impl__swizzle_ycc__convert_3_bgrx_x86_avx2; break; } +#endif +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) + conv3func = &wuffs_private_impl__swizzle_ycc__convert_3_bgrx_arm_neon; + break; #endif conv3func = &wuffs_private_impl__swizzle_ycc__convert_3_bgrx; break; @@ -1459,6 +1602,10 @@ wuffs_base__pixel_swizzler__swizzle_ycck( conv3func = &wuffs_private_impl__swizzle_ycc__convert_3_rgbx_x86_avx2; break; } +#endif +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) + conv3func = &wuffs_private_impl__swizzle_ycc__convert_3_rgbx_arm_neon; + break; #endif conv3func = 
&wuffs_private_impl__swizzle_ycc__convert_3_rgbx; break; diff --git a/internal/cgen/builtin.go b/internal/cgen/builtin.go index 18e2e99d6..d8548294b 100644 --- a/internal/cgen/builtin.go +++ b/internal/cgen/builtin.go @@ -478,7 +478,7 @@ func (g *gen) writeBuiltinCPUArch(b *buffer, recv *a.Expr, method t.ID, returnTy case id == t.IDARMCRC32Utility, id == t.IDARMCRC32U32: return g.writeBuiltinCPUArchARMCRC32(b, recv, method, args, sideEffectsOnly, depth) case id.IsBuiltInCPUArchARMNeon(): - return g.writeBuiltinCPUArchARMNeon(b, recv, method, args, sideEffectsOnly, depth) + return g.writeBuiltinCPUArchARMNeon(b, recv, method, returnType, args, sideEffectsOnly, depth) case id == t.IDX86SSE42Utility, id == t.IDX86M128I, id == t.IDX86AVX2Utility, id == t.IDX86M256I: return g.writeBuiltinCPUArchX86(b, recv, method, returnType, args, sideEffectsOnly, depth) @@ -511,7 +511,7 @@ func (g *gen) writeBuiltinCPUArchARMCRC32(b *buffer, recv *a.Expr, method t.ID, return nil } -func (g *gen) writeBuiltinCPUArchARMNeon(b *buffer, recv *a.Expr, method t.ID, args []*a.Node, sideEffectsOnly bool, depth uint32) error { +func (g *gen) writeBuiltinCPUArchARMNeon(b *buffer, recv *a.Expr, method t.ID, returnType *a.TypeExpr, args []*a.Node, sideEffectsOnly bool, depth uint32) error { methodStr := method.Str(g.tm) if strings.HasPrefix(methodStr, "make_") { before, after, ptr := "", ")", false @@ -557,6 +557,8 @@ func (g *gen) writeBuiltinCPUArchARMNeon(b *buffer, recv *a.Expr, method t.ID, a before, ptr = "vld1_u8(", true case "make_u8x16_slice128": before, ptr = "vld1q_u8(", true + case "make_u32x4_slice_u32lex4": + before, ptr = "vld1q_u32(", true default: return fmt.Errorf("internal error: unsupported cpu_arch method %q", methodStr) } @@ -579,6 +581,34 @@ func (g *gen) writeBuiltinCPUArchARMNeon(b *buffer, recv *a.Expr, method t.ID, a b.writes(after) return nil + } else if strings.HasPrefix(methodStr, "store_") { + if !sideEffectsOnly { + b.writes("(") + } + prefix := "" + switch 
methodStr { + case "store_slice64": + prefix = "vst1_u8(" + case "store_slice128": + prefix = "vst1q_u8(" + } + if prefix == "" { + return fmt.Errorf("internal error: unsupported cpu_arch method %q", methodStr) + } + b.writes(prefix) + if err := g.writeExprDotPtr(b, args[0].AsArg().Value(), false, depth); err != nil { + return err + } + b.writes(", ") + if err := g.writeExpr(b, recv, false, depth); err != nil { + return err + } + b.writes(")") + if !sideEffectsOnly { + b.writes(", wuffs_base__make_empty_struct())") + } + return nil + } else if strings.HasPrefix(methodStr, "as_") { switch recv.MType().QID()[1] { case t.IDARMNeonU8x8: @@ -614,6 +644,102 @@ func (g *gen) writeBuiltinCPUArchARMNeon(b *buffer, recv *a.Expr, method t.ID, a } } + // Signed reinterpret operations: call signed NEON intrinsics on unsigned + // types by wrapping with vreinterpret casts. Handles _s8 and _s16 suffixes. + if strings.HasSuffix(methodStr, "_s8") || strings.HasSuffix(methodStr, "_s16") { + intrinsic := methodStr + + // Determine the signed cast for the receiver based on its type. + recvToSigned := "" + switch recv.MType().QID()[1] { + case t.IDARMNeonU8x8: + recvToSigned = "vreinterpret_s8_u8" + case t.IDARMNeonU8x16: + recvToSigned = "vreinterpretq_s8_u8" + case t.IDARMNeonU16x4: + recvToSigned = "vreinterpret_s16_u16" + case t.IDARMNeonU16x8: + recvToSigned = "vreinterpretq_s16_u16" + } + + // Determine the unsigned cast for the result based on return type. + // Most _s8/_s16 intrinsics return a signed type that needs casting back + // to unsigned. Exception: "unsigned narrowing" intrinsics like vqmovun_s16 + // already return an unsigned type (uint8x8_t), so no result cast is needed. 
+ resultToUnsigned := "" + isUnsignedNarrow := strings.Contains(intrinsic, "un_s") + if returnType != nil { + switch returnType.QID()[1] { + case t.IDARMNeonU8x8: + if !isUnsignedNarrow { + resultToUnsigned = "vreinterpret_u8_s8" + } + case t.IDARMNeonU8x16: + if !isUnsignedNarrow { + resultToUnsigned = "vreinterpretq_u8_s8" + } + case t.IDARMNeonU16x4: + resultToUnsigned = "vreinterpret_u16_s16" + case t.IDARMNeonU16x8: + resultToUnsigned = "vreinterpretq_u16_s16" + } + } + + if recvToSigned != "" { + if resultToUnsigned != "" { + b.printf("%s(", resultToUnsigned) + } + b.printf("%s(%s(", intrinsic, recvToSigned) + if err := g.writeExpr(b, recv, false, depth); err != nil { + return err + } + b.writes(")") + for _, o := range args { + b.writes(", ") + oVal := o.AsArg().Value() + // Only reinterpret NEON vector args, not scalar shift amounts. + qid1 := oVal.MType().QID()[1] + isNeonType := (qid1 >= t.IDARMNeonU8x8 && qid1 <= t.IDARMNeonU64x1) || + (qid1 >= t.IDARMNeonU8x16 && qid1 <= t.IDARMNeonU64x2) + if isNeonType { + // Determine the signed cast for this arg's type. + argToSigned := "" + switch qid1 { + case t.IDARMNeonU8x8: + argToSigned = "vreinterpret_s8_u8" + case t.IDARMNeonU8x16: + argToSigned = "vreinterpretq_s8_u8" + case t.IDARMNeonU16x4: + argToSigned = "vreinterpret_s16_u16" + case t.IDARMNeonU16x8: + argToSigned = "vreinterpretq_s16_u16" + } + if argToSigned != "" { + b.printf("%s(", argToSigned) + if err := g.writeExpr(b, oVal, false, depth); err != nil { + return err + } + b.writes(")") + } else { + if err := g.writeExpr(b, oVal, false, depth); err != nil { + return err + } + } + } else { + // Scalar argument (shift amount, etc.). 
+ if err := g.writeExpr(b, oVal, false, depth); err != nil { + return err + } + } + } + b.writes(")") + if resultToUnsigned != "" { + b.writes(")") + } + return nil + } + } + b.writes(methodStr) b.writes("(") if err := g.writeExpr(b, recv, false, depth); err != nil { @@ -657,7 +783,9 @@ func (g *gen) writeBuiltinCPUArchX86(b *buffer, recv *a.Expr, method t.ID, retur fName, tName = "_mm_cvtsi32_si128", "int32_t" case "make_m128i_single_u64": fName, tName = "_mm_cvtsi64_si128", "int64_t" - case "make_m128i_slice128", "make_m128i_slice_u16lex8": + case "make_m128i_slice64": + fName, tName, ptr = "_mm_loadl_epi64", "const __m128i*)(const void*", true + case "make_m128i_slice128", "make_m128i_slice_u16lex8", "make_m128i_slice_u32lex4": fName, tName, ptr = "_mm_lddqu_si128", "const __m128i*)(const void*", true case "make_m128i_zeroes": fName, tName = "_mm_setzero_si128", "" @@ -828,6 +956,37 @@ func (g *gen) writeExprDotPtr(b *buffer, n *a.Expr, sideEffectsOnly bool, depth func (g *gen) writeBuiltinNumType(b *buffer, recv *a.Expr, method t.ID, args []*a.Node, depth uint32) error { switch method { + case t.IDCountLeadingZeroes: + // "recv.count_leading_zeroes()" in C is: + // - For u8: "wuffs_base__count_leading_zeroes_u32(((uint32_t)(recv))) - 24u" + // - For u16: "wuffs_base__count_leading_zeroes_u32(((uint32_t)(recv))) - 16u" + // - For u32: "wuffs_base__count_leading_zeroes_u32(((uint32_t)(recv)))" + // - For u64: "((uint32_t)(wuffs_base__count_leading_zeroes_u64(((uint64_t)(recv)))))" + sz, err := g.sizeof(recv.MType()) + if err != nil { + return err + } + if sz == 8 { + b.writes("((uint32_t)(wuffs_base__count_leading_zeroes_u64(((uint64_t)(") + if err := g.writeExpr(b, recv, false, depth); err != nil { + return err + } + b.writes(")))))") + } else { + if sz < 4 { + b.writes("(") + } + b.writes("wuffs_base__count_leading_zeroes_u32(((uint32_t)(") + if err := g.writeExpr(b, recv, false, depth); err != nil { + return err + } + b.writes(")))") + if sz < 4 { + b.printf(" 
- %du)", (4-sz)*8) + } + } + return nil + case t.IDLowBits: // "recv.low_bits(n:etc)" in C is one of: // - "((recv) & constant)" diff --git a/internal/cgen/cgen.go b/internal/cgen/cgen.go index 8c9134c16..8e7732e15 100644 --- a/internal/cgen/cgen.go +++ b/internal/cgen/cgen.go @@ -132,6 +132,7 @@ func Do(args []string) error { "// ¡ INSERT base/intconv-submodule.c.\n": insertBaseIntConvSubmoduleC, "// ¡ INSERT base/magic-submodule.c.\n": insertBaseMagicSubmoduleC, "// ¡ INSERT base/pixconv-submodule-regular.c.\n": insertBasePixConvSubmoduleRegularC, + "// ¡ INSERT base/pixconv-submodule-arm-neon.c.\n": insertBasePixConvSubmoduleArmNeonC, "// ¡ INSERT base/pixconv-submodule-x86-avx2.c.\n": insertBasePixConvSubmoduleX86Avx2C, "// ¡ INSERT base/pixconv-submodule-ycck.c.\n": insertBasePixConvSubmoduleYcckC, "// ¡ INSERT base/utf8-submodule.c.\n": insertBaseUTF8SubmoduleC, @@ -396,6 +397,11 @@ func insertBasePixConvSubmoduleRegularC(buf *buffer) error { return nil } +func insertBasePixConvSubmoduleArmNeonC(buf *buffer) error { + buf.writes(embedBasePixConvSubmoduleArmNeonC.Trim()) + return nil +} + func insertBasePixConvSubmoduleX86Avx2C(buf *buffer) error { buf.writes(embedBasePixConvSubmoduleX86Avx2C.Trim()) return nil diff --git a/internal/cgen/embed.go b/internal/cgen/embed.go index 8f2056f89..f7823beb5 100644 --- a/internal/cgen/embed.go +++ b/internal/cgen/embed.go @@ -96,6 +96,9 @@ var embedBaseMagicSubmoduleC EmbeddedString //go:embed base/pixconv-submodule-regular.c var embedBasePixConvSubmoduleRegularC EmbeddedString +//go:embed base/pixconv-submodule-arm-neon.c +var embedBasePixConvSubmoduleArmNeonC EmbeddedString + //go:embed base/pixconv-submodule-x86-avx2.c var embedBasePixConvSubmoduleX86Avx2C EmbeddedString diff --git a/internal/cgen/func.go b/internal/cgen/func.go index 598d0d065..d4de38b82 100644 --- a/internal/cgen/func.go +++ b/internal/cgen/func.go @@ -14,6 +14,7 @@ import ( "fmt" "math/big" "strconv" + "strings" a "github.com/google/wuffs/lang/ast" 
t "github.com/google/wuffs/lang/token" @@ -87,7 +88,25 @@ const ( func (g *gen) writeFuncSignature(b *buffer, n *a.Func, wfs uint32) error { switch wfs { case wfsCDecl: - b.writes("WUFFS_BASE__GENERATED_C_CODE\n") + // Use NOINLINE for cold helper functions (e.g. byte-loading helpers + // for boolean decoders) so the compiler keeps their callers small + // enough to inline at call sites. + // Also use NOINLINE for large per-macroblock functions (e.g. + // decode_one_mb) and coefficient dispatch functions (e.g. + // decode_mb_coefficients) so their callers stay compact. + // Use ALWAYS_INLINE for hot inner functions (e.g. + // decode_block_coeffs) that should be inlined into their callers + // to avoid per-call struct sync overhead. 
+ funcName := n.FuncName().Str(g.tm) + if strings.HasSuffix(funcName, "_load_bytes") || + funcName == "decode_one_mb" || + funcName == "decode_mb_coefficients" { + b.writes("WUFFS_BASE__GENERATED_C_CODE_NOINLINE\n") + } else if funcName == "decode_block_coeffs" { + b.writes("WUFFS_BASE__GENERATED_C_CODE_ALWAYS_INLINE\n") + } else { + b.writes("WUFFS_BASE__GENERATED_C_CODE\n") + } if n.Public() { b.writes("WUFFS_BASE__MAYBE_STATIC ") } else { diff --git a/lang/builtin/builtin.go b/lang/builtin/builtin.go index 9b2f81883..17fd2389c 100644 --- a/lang/builtin/builtin.go +++ b/lang/builtin/builtin.go @@ -357,21 +357,25 @@ var Funcs = [][]string{ } var funcsOther = [...]string{ + "u8.count_leading_zeroes() u32[..= 8]", "u8.high_bits(n: u32[..= 7]) u8", "u8.low_bits(n: u32[..= 7]) u8", "u8.max(no_less_than: u8) u8", "u8.min(no_more_than: u8) u8", + "u16.count_leading_zeroes() u32[..= 16]", "u16.high_bits(n: u32[..= 15]) u16", "u16.low_bits(n: u32[..= 15]) u16", "u16.max(no_less_than: u16) u16", "u16.min(no_more_than: u16) u16", + "u32.count_leading_zeroes() u32[..= 32]", "u32.high_bits(n: u32[..= 31]) u32", "u32.low_bits(n: u32[..= 31]) u32", "u32.max(no_less_than: u32) u32", "u32.min(no_more_than: u32) u32", + "u64.count_leading_zeroes() u32[..= 64]", "u64.high_bits(n: u32[..= 63]) u64", "u64.low_bits(n: u32[..= 63]) u64", "u64.max(no_less_than: u64) u64", @@ -734,6 +738,7 @@ var funcsOther = [...]string{ "v3: u8[..= 4]," + "is_rgb_or_cmyk: bool," + "triangle_filter_for_2to1: bool," + + "src_is_bt601: bool," + "scratch_buffer_2k: slice u8) status", // ---- arm_crc32_utility @@ -785,6 +790,12 @@ var funcsOther = [...]string{ "arm_neon_utility.make_u8x8_slice64(a: roslice base.u8) arm_neon_u8x8", "arm_neon_utility.make_u8x16_slice128(a: roslice base.u8) arm_neon_u8x16", + "arm_neon_utility.make_u32x4_slice_u32lex4(a: roslice base.u32) arm_neon_u32x4", + + // ---- arm_neon store operations + + "arm_neon_u8x8.store_slice64!(a: slice base.u8)", + 
"arm_neon_u8x16.store_slice128!(a: slice base.u8)", // ---- arm_neon_uAxB.as_uCxD @@ -827,8 +838,10 @@ var funcsOther = [...]string{ "x86_sse42_utility.make_m128i_single_u32(a: u32) x86_m128i", "x86_sse42_utility.make_m128i_single_u64(a: u64) x86_m128i", + "x86_sse42_utility.make_m128i_slice64(a: roslice base.u8) x86_m128i", "x86_sse42_utility.make_m128i_slice128(a: roslice base.u8) x86_m128i", "x86_sse42_utility.make_m128i_slice_u16lex8(a: roslice base.u16) x86_m128i", + "x86_sse42_utility.make_m128i_slice_u32lex4(a: roslice base.u32) x86_m128i", "x86_sse42_utility.make_m128i_zeroes() x86_m128i", @@ -849,7 +862,11 @@ var funcsOther = [...]string{ "x86_m128i._mm_add_epi32(b: x86_m128i) x86_m128i", "x86_m128i._mm_add_epi64(b: x86_m128i) x86_m128i", "x86_m128i._mm_add_epi8(b: x86_m128i) x86_m128i", + "x86_m128i._mm_adds_epi8(b: x86_m128i) x86_m128i", + "x86_m128i._mm_adds_epi16(b: x86_m128i) x86_m128i", + "x86_m128i._mm_adds_epu8(b: x86_m128i) x86_m128i", "x86_m128i._mm_and_si128(b: x86_m128i) x86_m128i", + "x86_m128i._mm_andnot_si128(b: x86_m128i) x86_m128i", "x86_m128i._mm_avg_epu16(b: x86_m128i) x86_m128i", "x86_m128i._mm_avg_epu8(b: x86_m128i) x86_m128i", "x86_m128i._mm_blend_epi16(b: x86_m128i, imm8: u32) x86_m128i", @@ -859,6 +876,9 @@ var funcsOther = [...]string{ "x86_m128i._mm_cmpeq_epi32(b: x86_m128i) x86_m128i", "x86_m128i._mm_cmpeq_epi64(b: x86_m128i) x86_m128i", "x86_m128i._mm_cmpeq_epi8(b: x86_m128i) x86_m128i", + "x86_m128i._mm_cmpgt_epi8(b: x86_m128i) x86_m128i", + "x86_m128i._mm_cmpgt_epi16(b: x86_m128i) x86_m128i", + "x86_m128i._mm_cmpgt_epi32(b: x86_m128i) x86_m128i", "x86_m128i._mm_extract_epi16(imm8: u32) u16", "x86_m128i._mm_extract_epi32(imm8: u32) u32", "x86_m128i._mm_extract_epi64(imm8: u32) u64", @@ -877,9 +897,12 @@ var funcsOther = [...]string{ "x86_m128i._mm_min_epu16(b: x86_m128i) x86_m128i", "x86_m128i._mm_min_epu32(b: x86_m128i) x86_m128i", "x86_m128i._mm_min_epu8(b: x86_m128i) x86_m128i", + "x86_m128i._mm_mulhi_epi16(b: x86_m128i) 
x86_m128i", + "x86_m128i._mm_mullo_epi16(b: x86_m128i) x86_m128i", "x86_m128i._mm_mullo_epi32(b: x86_m128i) x86_m128i", "x86_m128i._mm_or_si128(b: x86_m128i) x86_m128i", "x86_m128i._mm_packs_epi16(b: x86_m128i) x86_m128i", + "x86_m128i._mm_packs_epi32(b: x86_m128i) x86_m128i", "x86_m128i._mm_packus_epi16(b: x86_m128i) x86_m128i", "x86_m128i._mm_sad_epu8(b: x86_m128i) x86_m128i", "x86_m128i._mm_shuffle_epi32(imm8: u32) x86_m128i", @@ -888,6 +911,8 @@ var funcsOther = [...]string{ "x86_m128i._mm_slli_epi32(imm8: u32) x86_m128i", "x86_m128i._mm_slli_epi64(imm8: u32) x86_m128i", "x86_m128i._mm_slli_si128(imm8: u32) x86_m128i", + "x86_m128i._mm_srai_epi16(imm8: u32) x86_m128i", + "x86_m128i._mm_srai_epi32(imm8: u32) x86_m128i", "x86_m128i._mm_srli_epi16(imm8: u32) x86_m128i", "x86_m128i._mm_srli_epi32(imm8: u32) x86_m128i", "x86_m128i._mm_srli_epi64(imm8: u32) x86_m128i", @@ -896,6 +921,10 @@ var funcsOther = [...]string{ "x86_m128i._mm_sub_epi32(b: x86_m128i) x86_m128i", "x86_m128i._mm_sub_epi64(b: x86_m128i) x86_m128i", "x86_m128i._mm_sub_epi8(b: x86_m128i) x86_m128i", + "x86_m128i._mm_subs_epi8(b: x86_m128i) x86_m128i", + "x86_m128i._mm_subs_epi16(b: x86_m128i) x86_m128i", + "x86_m128i._mm_subs_epu8(b: x86_m128i) x86_m128i", + "x86_m128i._mm_subs_epu16(b: x86_m128i) x86_m128i", "x86_m128i._mm_unpackhi_epi16(b: x86_m128i) x86_m128i", "x86_m128i._mm_unpackhi_epi32(b: x86_m128i) x86_m128i", "x86_m128i._mm_unpackhi_epi64(b: x86_m128i) x86_m128i", @@ -905,6 +934,7 @@ var funcsOther = [...]string{ "x86_m128i._mm_unpacklo_epi64(b: x86_m128i) x86_m128i", "x86_m128i._mm_unpacklo_epi8(b: x86_m128i) x86_m128i", "x86_m128i._mm_xor_si128(b: x86_m128i) x86_m128i", + "x86_m128i._mm256_castsi128_si256() x86_m256i", // ---- x86_avx2_utility @@ -952,21 +982,28 @@ var funcsOther = [...]string{ "x86_m256i._mm256_add_epi32(b: x86_m256i) x86_m256i", "x86_m256i._mm256_add_epi64(b: x86_m256i) x86_m256i", "x86_m256i._mm256_add_epi8(b: x86_m256i) x86_m256i", + "x86_m256i._mm256_adds_epi8(b: 
x86_m256i) x86_m256i", + "x86_m256i._mm256_adds_epu8(b: x86_m256i) x86_m256i", "x86_m256i._mm256_and_si256(b: x86_m256i) x86_m256i", + "x86_m256i._mm256_andnot_si256(b: x86_m256i) x86_m256i", "x86_m256i._mm256_castsi256_si128() x86_m128i", + "x86_m256i._mm256_cmpeq_epi8(b: x86_m256i) x86_m256i", "x86_m256i._mm256_extract_epi64(index: u32) u64", "x86_m256i._mm256_extracti128_si256(imm8: u32) x86_m128i", "x86_m256i._mm256_inserti128_si256(b: x86_m128i, imm8: u32) x86_m256i", "x86_m256i._mm256_madd_epi16(b: x86_m256i) x86_m256i", "x86_m256i._mm256_maddubs_epi16(b: x86_m256i) x86_m256i", + "x86_m256i._mm256_mulhi_epi16(b: x86_m256i) x86_m256i", "x86_m256i._mm256_mullo_epi16(b: x86_m256i) x86_m256i", "x86_m256i._mm256_or_si256(b: x86_m256i) x86_m256i", "x86_m256i._mm256_packs_epi16(b: x86_m256i) x86_m256i", "x86_m256i._mm256_packs_epi32(b: x86_m256i) x86_m256i", + "x86_m256i._mm256_packus_epi16(b: x86_m256i) x86_m256i", "x86_m256i._mm256_permute2x128_si256(b: x86_m256i, imm8: u32) x86_m256i", "x86_m256i._mm256_permute4x64_epi64(imm8: u32) x86_m256i", "x86_m256i._mm256_sad_epu8(b: x86_m256i) x86_m256i", "x86_m256i._mm256_shuffle_epi32(imm8: u32) x86_m256i", + "x86_m256i._mm256_shuffle_epi8(b: x86_m256i) x86_m256i", "x86_m256i._mm256_sign_epi16(b: x86_m256i) x86_m256i", "x86_m256i._mm256_slli_epi16(imm8: u32) x86_m256i", "x86_m256i._mm256_slli_epi32(imm8: u32) x86_m256i", @@ -984,6 +1021,8 @@ var funcsOther = [...]string{ "x86_m256i._mm256_sub_epi32(b: x86_m256i) x86_m256i", "x86_m256i._mm256_sub_epi64(b: x86_m256i) x86_m256i", "x86_m256i._mm256_sub_epi8(b: x86_m256i) x86_m256i", + "x86_m256i._mm256_subs_epi8(b: x86_m256i) x86_m256i", + "x86_m256i._mm256_subs_epu8(b: x86_m256i) x86_m256i", "x86_m256i._mm256_testz_si256(b: x86_m256i) u32", "x86_m256i._mm256_unpackhi_epi16(b: x86_m256i) x86_m256i", "x86_m256i._mm256_unpackhi_epi32(b: x86_m256i) x86_m256i", diff --git a/lang/builtin/data.go b/lang/builtin/data.go index 79a5c66e2..98a6f327c 100644 --- a/lang/builtin/data.go 
+++ b/lang/builtin/data.go @@ -52,6 +52,7 @@ var funcsARMNeon = [...]string{ "arm_neon_u8x8.vminv_u8() u8", "arm_neon_u8x8.vmla_u8(b: arm_neon_u8x8, c: arm_neon_u8x8) arm_neon_u8x8", "arm_neon_u8x8.vmls_u8(b: arm_neon_u8x8, c: arm_neon_u8x8) arm_neon_u8x8", + "arm_neon_u8x8.vmovl_s8() arm_neon_u16x8", "arm_neon_u8x8.vmovl_u8() arm_neon_u16x8", "arm_neon_u8x8.vmovn_high_u16(b: arm_neon_u16x8) arm_neon_u8x16", "arm_neon_u8x8.vmul_u8(b: arm_neon_u8x8) arm_neon_u8x8", @@ -64,6 +65,7 @@ var funcsARMNeon = [...]string{ "arm_neon_u8x8.vpmax_u8(b: arm_neon_u8x8) arm_neon_u8x8", "arm_neon_u8x8.vpmin_u8(b: arm_neon_u8x8) arm_neon_u8x8", "arm_neon_u8x8.vqadd_u8(b: arm_neon_u8x8) arm_neon_u8x8", + "arm_neon_u8x8.vqadd_s8(b: arm_neon_u8x8) arm_neon_u8x8", "arm_neon_u8x8.vqmovn_high_u16(b: arm_neon_u16x8) arm_neon_u8x16", // arm_neon_u8x8.vqmovun_high_s16(etc) // arm_neon_u8x8.vqrshl_u8(etc) @@ -74,6 +76,7 @@ var funcsARMNeon = [...]string{ "arm_neon_u8x8.vqshrn_high_n_u16(b: arm_neon_u16x8, c: u32) arm_neon_u8x16", // arm_neon_u8x8.vqshrun_high_n_s16(etc) "arm_neon_u8x8.vqsub_u8(b: arm_neon_u8x8) arm_neon_u8x8", + "arm_neon_u8x8.vqsub_s8(b: arm_neon_u8x8) arm_neon_u8x8", "arm_neon_u8x8.vqtbx1_u8(tab: arm_neon_u8x16, idx: arm_neon_u8x8) arm_neon_u8x8", // arm_neon_u8x8.vqtbx2_u8(etc) // arm_neon_u8x8.vqtbx3_u8(etc) @@ -93,6 +96,7 @@ var funcsARMNeon = [...]string{ // arm_neon_u8x8.vshl_u8(etc) "arm_neon_u8x8.vshll_n_u8(b: u32) arm_neon_u16x8", "arm_neon_u8x8.vshr_n_u8(b: u32) arm_neon_u8x8", + "arm_neon_u8x8.vshr_n_s8(b: u32) arm_neon_u8x8", "arm_neon_u8x8.vshrn_high_n_u16(b: arm_neon_u16x8, c: u32) arm_neon_u8x16", "arm_neon_u8x8.vsli_n_u8(b: arm_neon_u8x8, c: u32) arm_neon_u8x8", // arm_neon_u8x8.vsqadd_u8(etc) @@ -431,10 +435,12 @@ var funcsARMNeon = [...]string{ "arm_neon_u8x16.vpmaxq_u8(b: arm_neon_u8x16) arm_neon_u8x16", "arm_neon_u8x16.vpminq_u8(b: arm_neon_u8x16) arm_neon_u8x16", "arm_neon_u8x16.vqaddq_u8(b: arm_neon_u8x16) arm_neon_u8x16", + "arm_neon_u8x16.vqaddq_s8(b: 
arm_neon_u8x16) arm_neon_u8x16", // arm_neon_u8x16.vqrshlq_u8(etc) "arm_neon_u8x16.vqshlq_n_u8(b: u32) arm_neon_u8x16", // arm_neon_u8x16.vqshlq_u8(etc) "arm_neon_u8x16.vqsubq_u8(b: arm_neon_u8x16) arm_neon_u8x16", + "arm_neon_u8x16.vqsubq_s8(b: arm_neon_u8x16) arm_neon_u8x16", "arm_neon_u8x16.vqtbl1_u8(b: arm_neon_u8x8) arm_neon_u8x8", "arm_neon_u8x16.vqtbl1q_u8(b: arm_neon_u8x16) arm_neon_u8x16", "arm_neon_u8x16.vqtbx1q_u8(tab: arm_neon_u8x16, idx: arm_neon_u8x16) arm_neon_u8x16", @@ -453,6 +459,7 @@ var funcsARMNeon = [...]string{ "arm_neon_u8x16.vshlq_n_u8(b: u32) arm_neon_u8x16", // arm_neon_u8x16.vshlq_u8(etc) "arm_neon_u8x16.vshrq_n_u8(b: u32) arm_neon_u8x16", + "arm_neon_u8x16.vshrq_n_s8(b: u32) arm_neon_u8x16", "arm_neon_u8x16.vsliq_n_u8(b: arm_neon_u8x16, c: u32) arm_neon_u8x16", // arm_neon_u8x16.vsqaddq_u8(etc) "arm_neon_u8x16.vsraq_n_u8(b: arm_neon_u8x16, c: u32) arm_neon_u8x16", @@ -544,13 +551,18 @@ var funcsARMNeon = [...]string{ "arm_neon_u16x8.vpaddq_u16(b: arm_neon_u16x8) arm_neon_u16x8", "arm_neon_u16x8.vpmaxq_u16(b: arm_neon_u16x8) arm_neon_u16x8", "arm_neon_u16x8.vpminq_u16(b: arm_neon_u16x8) arm_neon_u16x8", + "arm_neon_u16x8.vqaddq_s16(b: arm_neon_u16x8) arm_neon_u16x8", "arm_neon_u16x8.vqaddq_u16(b: arm_neon_u16x8) arm_neon_u16x8", + "arm_neon_u16x8.vqdmulhq_n_s16(b: u16) arm_neon_u16x8", + "arm_neon_u16x8.vqmovn_s16() arm_neon_u8x8", "arm_neon_u16x8.vqmovn_u16() arm_neon_u8x8", + "arm_neon_u16x8.vqmovun_s16() arm_neon_u8x8", // arm_neon_u16x8.vqrshlq_u16(etc) "arm_neon_u16x8.vqrshrn_n_u16(b: u32) arm_neon_u8x8", "arm_neon_u16x8.vqshlq_n_u16(b: u32) arm_neon_u16x8", // arm_neon_u16x8.vqshlq_u16(etc) "arm_neon_u16x8.vqshrn_n_u16(b: u32) arm_neon_u8x8", + "arm_neon_u16x8.vqsubq_s16(b: arm_neon_u16x8) arm_neon_u16x8", "arm_neon_u16x8.vqsubq_u16(b: arm_neon_u16x8) arm_neon_u16x8", "arm_neon_u16x8.vraddhn_u16(b: arm_neon_u16x8) arm_neon_u8x8", "arm_neon_u16x8.vrev32q_u16() arm_neon_u16x8", @@ -565,6 +577,7 @@ var funcsARMNeon = [...]string{ 
"arm_neon_u16x8.vshlq_n_u16(b: u32) arm_neon_u16x8", // arm_neon_u16x8.vshlq_u16(etc) "arm_neon_u16x8.vshrn_n_u16(b: u32) arm_neon_u8x8", + "arm_neon_u16x8.vshrq_n_s16(b: u32) arm_neon_u16x8", "arm_neon_u16x8.vshrq_n_u16(b: u32) arm_neon_u16x8", "arm_neon_u16x8.vsliq_n_u16(b: arm_neon_u16x8, c: u32) arm_neon_u16x8", // arm_neon_u16x8.vsqaddq_u16(etc) diff --git a/lang/check/axioms.md b/lang/check/axioms.md index d75705d5c..f343c5f2c 100644 --- a/lang/check/axioms.md +++ b/lang/check/axioms.md @@ -44,6 +44,7 @@ editing this list. --- +- `"a < (a + b): 0 < b"` - `"a <= (a + b): 0 <= b"` --- diff --git a/lang/check/bounds.go b/lang/check/bounds.go index 4811cde55..1b10013f7 100644 --- a/lang/check/bounds.go +++ b/lang/check/bounds.go @@ -1309,6 +1309,8 @@ func (q *checker) bcheckExprCallSpecialCases(n *a.Expr, depth uint32) (bounds, e advance = sixteen case strings.HasSuffix(s, "_slice_u16lex32"): // 32 u16 values is 64 bytes. advance = thirtyTwo + case strings.HasSuffix(s, "_slice_u32lex4"): // 4 u32 values is 16 bytes. 
+ advance = four case strings.Contains(s, "_slice"): return bounds{}, fmt.Errorf("check: internal error: unrecognized %s method", s) } diff --git a/lang/check/data.go b/lang/check/data.go index e381f8dea..ef2e74e75 100644 --- a/lang/check/data.go +++ b/lang/check/data.go @@ -375,6 +375,26 @@ var reasons = [...]struct { return nil }}, + {`"a < (a + b): 0 < b"`, func(q *checker, n *a.Assert) error { + op, xa, t0 := parseBinaryOp(n.Condition()) + if op != t.IDXBinaryLessThan { + return errFailed + } + op, t1, xb := parseBinaryOp(t0) + if op != t.IDXBinaryPlus { + return errFailed + } + // Both occurrences of "a" must be the same expression. + if !xa.Eq(t1) { + return errFailed + } + // 0 < b + if err := proveReasonRequirement(q, t.IDXBinaryLessThan, zeroExpr, xb); err != nil { + return err + } + return nil + }}, + {`"a <= (a + b): 0 <= b"`, func(q *checker, n *a.Assert) error { op, xa, t0 := parseBinaryOp(n.Condition()) if op != t.IDXBinaryLessEq { diff --git a/lang/check/optimize.go b/lang/check/optimize.go index 86974d511..e814ca858 100644 --- a/lang/check/optimize.go +++ b/lang/check/optimize.go @@ -82,6 +82,25 @@ func (q *checker) optimizeIOMethodAdvance(receiver *a.Expr, advance *big.Int, ad if n != nil && (n.Cmp(advance) >= 0) { retOK = true } + + // OK if i is (base + c1) and j is (base + c2) with + // the same base expression, and ((c2 - c1) >= advance). 
+ if !retOK && (i.Operator() == t.IDXBinaryPlus) && (j.Operator() == t.IDXBinaryPlus) { + iBase, iConst := i.LHS().AsExpr(), i.RHS().AsExpr().ConstValue() + if iConst == nil { + iBase, iConst = i.RHS().AsExpr(), i.LHS().AsExpr().ConstValue() + } + jBase, jConst := j.LHS().AsExpr(), j.RHS().AsExpr().ConstValue() + if jConst == nil { + jBase, jConst = j.RHS().AsExpr(), j.LHS().AsExpr().ConstValue() + } + if (iConst != nil) && (jConst != nil) && iBase.Eq(jBase) { + n := big.NewInt(0).Sub(jConst, iConst) + if n.Cmp(advance) >= 0 { + retOK = true + } + } + } } } diff --git a/lang/token/list.go b/lang/token/list.go index 8daf2e606..a4d49e8e4 100644 --- a/lang/token/list.go +++ b/lang/token/list.go @@ -677,10 +677,11 @@ const ( // TODO: range/rect methods like intersection and contains? - IDHighBits = ID(0x220) - IDLowBits = ID(0x221) - IDMax = ID(0x222) - IDMin = ID(0x223) + IDCountLeadingZeroes = ID(0x224) + IDHighBits = ID(0x220) + IDLowBits = ID(0x221) + IDMax = ID(0x222) + IDMin = ID(0x223) IDIsError = ID(0x230) IDIsOK = ID(0x231) @@ -1126,10 +1127,11 @@ var builtInsByID = [nBuiltInIDs]string{ IDUnroll: "unroll", IDUpdate: "update", - IDHighBits: "high_bits", - IDLowBits: "low_bits", - IDMax: "max", - IDMin: "min", + IDCountLeadingZeroes: "count_leading_zeroes", + IDHighBits: "high_bits", + IDLowBits: "low_bits", + IDMax: "max", + IDMin: "min", IDIsError: "is_error", IDIsOK: "is_ok", diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c index 32565449d..3f9d2143d 100644 --- a/release/c/wuffs-unsupported-snapshot.c +++ b/release/c/wuffs-unsupported-snapshot.c @@ -464,6 +464,31 @@ wuffs_base__cpu_arch__have_x86_sse42(void) { #define WUFFS_BASE__GENERATED_C_CODE #endif +// WUFFS_BASE__GENERATED_C_CODE_NOINLINE is WUFFS_BASE__GENERATED_C_CODE with +// an additional noinline hint. It is used for cold helper functions (e.g. 
// wuffs_base__count_leading_zeroes_u32 returns the number of consecutive zero
// bits at the most significant end of u. It returns 32 when u is zero.
static inline uint32_t  //
wuffs_base__count_leading_zeroes_u32(uint32_t u) {
#if defined(__GNUC__) || defined(__clang__)
  // __builtin_clz is undefined for a zero argument, so handle that explicitly.
  if (u == 0u) {
    return 32u;
  }
  return (uint32_t)(__builtin_clz(u));
#else
  if (u == 0u) {
    return 32u;
  }
  // Binary search: at each step, if the top `width` bits are all zero, count
  // them and shift them out. The widths 16, 8, 4, 2, 1 are distinct powers of
  // two, so adding them is the same as or-ing them.
  uint32_t count = 0u;
  uint32_t width = 16u;
  while (width != 0u) {
    if ((u >> (32u - width)) == 0u) {
      count += width;
      u <<= width;
    }
    width >>= 1;
  }
  return count;
#endif
}
// wuffs_base__color_ycc_bt601__as__color_u32_abgr converts one YCbCr sample
// to a uint32_t in 0xAABBGGRR order. The alpha bits are always 0xFF.
//
// Each channel is first computed as a fixed-point value with 6 fractional
// bits:
//   R = ((Y * 19077) >> 8) + ((Cr * 26149) >> 8) - 14234
//   G = ((Y * 19077) >> 8) - ((Cb * 6419) >> 8) - ((Cr * 13320) >> 8) + 8708
//   B = ((Y * 19077) >> 8) + ((Cb * 33050) >> 8) - 17685
// and is then clipped to [0, 255] after dropping those 6 bits.
static inline uint32_t  //
wuffs_base__color_ycc_bt601__as__color_u32_abgr(uint8_t yy,
                                                uint8_t cb,
                                                uint8_t cr) {
  const int32_t y32 = (int32_t)yy;
  const int32_t cb32 = (int32_t)cb;
  const int32_t cr32 = (int32_t)cr;

  const int32_t luma = (y32 * 19077) >> 8;
  const int32_t red = luma + ((cr32 * 26149) >> 8) - 14234;
  const int32_t green =
      luma - ((cb32 * 6419) >> 8) - ((cr32 * 13320) >> 8) + 8708;
  const int32_t blue = luma + ((cb32 * 33050) >> 8) - 17685;

  // Clip to [0, 255]. A value in [0, 16383] has no bits set at or above bit
  // 14, so shifting it right by 6 already lands in [0, 255]. Anything else is
  // either negative (clips to 0) or too large (clips to 255).
  uint32_t r = 255u;
  if ((red & ~16383) == 0) {
    r = ((uint32_t)red) >> 6;
  } else if (red < 0) {
    r = 0u;
  }

  uint32_t g = 255u;
  if ((green & ~16383) == 0) {
    g = ((uint32_t)green) >> 6;
  } else if (green < 0) {
    g = 0u;
  }

  uint32_t b = 255u;
  if ((blue & ~16383) == 0) {
    b = ((uint32_t)blue) >> 6;
  } else if (blue < 0) {
    b = 0u;
  }

  // 0xAABBGGRR: blue in bits 16..23, green in bits 8..15, red in bits 0..7.
  return 0xFF000000u | (b << 16) | (g << 8) | r;
}
f_quant_uv_ac_delta; + uint32_t f_dequant_y_dc[4]; + uint32_t f_dequant_y_ac[4]; + uint32_t f_dequant_y2_dc[4]; + uint32_t f_dequant_y2_ac[4]; + uint32_t f_dequant_uv_dc[4]; + uint32_t f_dequant_uv_ac[4]; + uint32_t f_seg_filter_level[4]; + uint8_t f_fstrength_level[8]; + uint8_t f_fstrength_ilevel[8]; + uint8_t f_fstrength_hlevel[8]; + uint32_t f_num_partitions; + bool f_multi_partition; + uint32_t f_current_partition; + uint32_t f_part_range[8]; + uint64_t f_part_value[8]; + uint32_t f_part_bits[8]; + uint32_t f_part_wbuf_ri[8]; + uint32_t f_part_wbuf_size[8]; + uint64_t f_part_wbuf_offset[8]; + uint32_t f_current_part_wbuf_ri; + uint32_t f_mb_x; + uint32_t f_mb_y; + uint8_t f_segment_id; + bool f_is_skip_coeff; + bool f_mb_no_skip_coeff; + uint8_t f_prob_skip_false; + uint8_t f_mb_luma_mode; + uint8_t f_mb_chroma_mode; + uint8_t f_left_nz_y2; + uint32_t f_y_stride; + uint32_t f_uv_stride; + uint64_t f_workbuf_offset_y_end; + uint64_t f_workbuf_offset_u_end; + uint64_t f_workbuf_offset_v_end; + uint32_t f_p0_wbuf_ri; + uint32_t f_p0_wbuf_count; uint32_t f_dst_x; uint32_t f_dst_y; wuffs_base__pixel_swizzler f_swizzler; + wuffs_base__empty_struct (*choosy_simple_vfilter_16)( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_limit); + wuffs_base__empty_struct (*choosy_normal_vfilter_inner_16)( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); + wuffs_base__empty_struct (*choosy_normal_vfilter_mb_16)( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); + wuffs_base__empty_struct (*choosy_normal_vfilter_mb_8)( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); + wuffs_base__empty_struct (*choosy_normal_hfilter_mb_16)( + wuffs_vp8__decoder* 
self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); + wuffs_base__empty_struct (*choosy_normal_hfilter_mb_8)( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); + wuffs_base__empty_struct (*choosy_normal_hfilter_inner_16)( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); + wuffs_base__empty_struct (*choosy_normal_hfilter_inner_8)( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); + wuffs_base__empty_struct (*choosy_normal_vfilter_inner_8)( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); + wuffs_base__empty_struct (*choosy_normal_vfilter_mb_uv)( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_u_off, + uint64_t a_v_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); + wuffs_base__empty_struct (*choosy_normal_hfilter_mb_uv)( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_u_off, + uint64_t a_v_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); + wuffs_base__empty_struct (*choosy_normal_vfilter_inner_uv)( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_u_off, + uint64_t a_v_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); + wuffs_base__empty_struct (*choosy_normal_hfilter_inner_uv)( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_u_off, + uint64_t a_v_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); + wuffs_base__empty_struct (*choosy_idct_add)( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_dst, + uint32_t a_stride, + uint32_t 
a_coeff_offset); + wuffs_base__empty_struct (*choosy_idct_dc_add)( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_dst, + uint32_t a_stride, + uint32_t a_coeff_offset); + wuffs_base__empty_struct (*choosy_idct_add_pair)( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_dst, + uint32_t a_stride, + uint32_t a_coeff_offset_a, + uint32_t a_coeff_offset_b); + wuffs_base__empty_struct (*choosy_idct_dc_add_pair)( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_dst, + uint32_t a_stride, + uint32_t a_coeff_offset_a, + uint32_t a_coeff_offset_b); + wuffs_base__empty_struct (*choosy_predict_16x16)( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint8_t a_mode); + wuffs_base__empty_struct (*choosy_predict_8x8)( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint8_t a_mode, + uint64_t a_plane_offset); uint32_t p_decode_image_config; uint32_t p_do_decode_image_config; uint32_t p_decode_frame_config; @@ -15493,6 +15821,26 @@ struct wuffs_vp8__decoder__struct { } private_impl; struct { + uint8_t f_bool_buffer[4096]; + uint8_t f_p1_buffer[4096]; + uint32_t f_mb_coeffs[400]; + uint8_t f_mb_y_ac_nz[16]; + uint8_t f_mb_uv_nz[8]; + uint32_t f_block_ac_nz; + uint8_t f_coeff_probs[1056]; + uint8_t f_scratch_buffer_2k[2048]; + uint8_t f_above_nz[8200]; + uint8_t f_left_nz[8]; + uint8_t f_above_modes[4096]; + uint8_t f_left_modes[4]; + uint8_t f_sub_modes[16]; + uint8_t f_mb_upper_right[4]; + uint8_t f_above_nz_y2[1025]; + uint8_t f_mb_filter_level[2048]; + uint8_t f_mb_filter_ilevel[2048]; + uint8_t f_mb_filter_hlevel[2048]; + uint8_t f_mb_filter_inner[2048]; + struct { uint64_t scratch; } s_do_decode_image_config; @@ -15644,6 +15992,11 @@ struct wuffs_vp8__decoder__struct { return wuffs_vp8__decoder__workbuf_len(this); } + inline uint64_t + workbuf_len_total() const { + return wuffs_vp8__decoder__workbuf_len_total(this); + } + #endif // __cplusplus }; // struct wuffs_vp8__decoder__struct @@ -16020,7 +16373,6 @@ extern const char 
wuffs_webp__error__bad_transform[]; extern const char wuffs_webp__error__short_chunk[]; extern const char wuffs_webp__error__truncated_input[]; extern const char wuffs_webp__error__unsupported_number_of_huffman_groups[]; -extern const char wuffs_webp__error__unsupported_transform_after_color_indexing_transform[]; extern const char wuffs_webp__error__unsupported_webp_file[]; // ---------------- Public Consts @@ -16197,21 +16549,34 @@ struct wuffs_webp__decoder__struct { uint8_t f_code_length_code_lengths[19]; bool f_sub_chunk_has_padding; bool f_is_vp8_lossy; + bool f_is_vp8x; + bool f_has_alpha; + uint64_t f_vp8x_workbuf_len; + uint64_t f_vp8l_alpha_workbuf_len; uint64_t f_frame_config_io_position; uint32_t f_riff_chunk_length; uint32_t f_sub_chunk_length; uint32_t f_bits; uint32_t f_n_bits; + uint64_t f_pix_p; + uint32_t f_pix_x; + uint32_t f_pix_y; + uint64_t f_pix_cc_p; bool f_seen_transform[4]; uint8_t f_transform_type[4]; uint8_t f_transform_tile_size_log2[4]; uint32_t f_n_transforms; + bool f_fuse_subtract_green; uint32_t f_color_cache_bits; uint32_t f_overall_color_cache_bits; uint32_t f_overall_tile_size_log2; uint32_t f_overall_n_huffman_groups; + bool f_hg_compacted; + uint32_t f_hg_bitstream_groups; + uint32_t f_hg_n_sorted; uint32_t f_ht_n_symbols; uint32_t f_ht_code_lengths_remaining; + uint32_t f_ht_next_top; uint32_t f_color_indexing_palette_size; uint32_t f_color_indexing_width; uint32_t f_workbuf_offset_for_transform[4]; @@ -16224,6 +16589,17 @@ struct wuffs_webp__decoder__struct { uint32_t p_decode_code_length_code_lengths; uint32_t p_build_code_lengths; uint32_t p_decode_pixels_slow; + wuffs_base__empty_struct (*choosy_apply_transform_predictor)( + wuffs_webp__decoder* self, + wuffs_base__slice_u8 a_pix, + wuffs_base__slice_u8 a_tile_data); + wuffs_base__empty_struct (*choosy_apply_transform_cross_color)( + wuffs_webp__decoder* self, + wuffs_base__slice_u8 a_pix, + wuffs_base__slice_u8 a_tile_data); + wuffs_base__empty_struct 
(*choosy_apply_transform_subtract_green)( + wuffs_webp__decoder* self, + wuffs_base__slice_u8 a_pix); uint32_t p_decode_image_config; uint32_t p_do_decode_image_config; uint32_t p_do_decode_image_config_limited; @@ -16231,6 +16607,7 @@ struct wuffs_webp__decoder__struct { uint32_t p_decode_frame_config; uint32_t p_do_decode_frame_config; uint32_t p_decode_frame; + uint32_t p_do_decode_frame_vp8x; uint32_t p_do_decode_frame; uint32_t p_decode_transform; uint32_t p_decode_color_cache_parameters; @@ -16242,14 +16619,21 @@ struct wuffs_webp__decoder__struct { wuffs_vp8__decoder f_vp8; uint8_t f_palette[1024]; uint32_t f_color_cache[2048]; + uint16_t f_hg_sorted[1024]; uint16_t f_codes[2328]; uint16_t f_code_lengths[2328]; uint16_t f_code_lengths_huffman_nodes[37]; - uint16_t f_huffman_nodes[256][6267]; + uint32_t f_huffman_tables[1025][4096]; + uint16_t f_huffman_table_base_offsets[1025][5]; + uint8_t f_hg_trivial[1025]; + uint32_t f_hg_literal_arb[1025]; struct { uint32_t v_hg; uint32_t v_ht; + uint32_t v_target; + uint32_t v_sorted_idx; + uint32_t v_raw_hg; } s_decode_huffman_groups; struct { uint32_t v_use_second_symbol; @@ -16278,7 +16662,7 @@ struct wuffs_webp__decoder__struct { uint32_t v_x; uint32_t v_y; uint32_t v_hg; - uint16_t v_node; + uint32_t v_table_entry; uint32_t v_color; uint32_t v_back_ref_len_n_bits; uint32_t v_back_ref_len_minus_1; @@ -16295,16 +16679,31 @@ struct wuffs_webp__decoder__struct { struct { uint64_t scratch; } s_do_decode_image_config_limited_vp8l; + struct { + uint32_t v_c32; + uint32_t v_chunk_length; + bool v_chunk_padding; + uint64_t v_alpha_offset; + uint32_t v_alph_length; + uint8_t v_alph_filter; + uint64_t v_alpha_i; + uint64_t v_alpha_n; + uint64_t scratch; + } s_do_decode_frame_vp8x; struct { uint32_t v_width; } s_do_decode_frame; struct { uint32_t v_transform_type; uint32_t v_tile_size_log2; + uint32_t v_effective_width; } s_decode_transform; struct { uint32_t v_tile_size_log2; } s_decode_hg_table; + struct { + uint64_t 
v_p_max; + } s_decode_pixels; } private_data; #ifdef __cplusplus @@ -18967,6 +19366,7 @@ wuffs_base__pixel_swizzler__swizzle_ycck( uint8_t v3, bool is_rgb_or_cmyk, bool triangle_filter_for_2to1, + bool src_is_bt601, wuffs_base__slice_u8 scratch_buffer_2k); // ---------------- Images (Utility) @@ -32164,6 +32564,28 @@ wuffs_private_impl__swizzle_ycc__convert_3_rgbx_x86_avx2( const uint8_t* up1, const uint8_t* up2); +WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2,avx2") +static void // +wuffs_private_impl__swizzle_ycc_bt601__convert_3_bgrx_x86_avx2( + wuffs_base__pixel_buffer* dst, + uint32_t x, + uint32_t x_end, + uint32_t y, + const uint8_t* up0, + const uint8_t* up1, + const uint8_t* up2); + +WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2,avx2") +static void // +wuffs_private_impl__swizzle_ycc_bt601__convert_3_rgbx_x86_avx2( + wuffs_base__pixel_buffer* dst, + uint32_t x, + uint32_t x_end, + uint32_t y, + const uint8_t* up0, + const uint8_t* up1, + const uint8_t* up2); + #if defined(__GNUC__) && !defined(__clang__) // No-op. #else @@ -32180,6 +32602,28 @@ wuffs_private_impl__swizzle_ycc__upsample_inv_h2v2_triangle_x86_avx2( #endif #endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +static void // +wuffs_private_impl__swizzle_ycc__convert_3_bgrx_arm_neon( + wuffs_base__pixel_buffer* dst, + uint32_t x, + uint32_t x_end, + uint32_t y, + const uint8_t* up0, + const uint8_t* up1, + const uint8_t* up2); + +static void // +wuffs_private_impl__swizzle_ycc__convert_3_rgbx_arm_neon( + wuffs_base__pixel_buffer* dst, + uint32_t x, + uint32_t x_end, + uint32_t y, + const uint8_t* up0, + const uint8_t* up1, + const uint8_t* up2); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) + // -------- static inline uint32_t // @@ -32358,6 +32802,69 @@ wuffs_private_impl__swizzle_ycc__convert_3_rgbx(wuffs_base__pixel_buffer* dst, } } +// BT.601 studio-range variants for VP8/H.264. 
+ +static void // +wuffs_private_impl__swizzle_ycc_bt601__convert_3_general( + wuffs_base__pixel_buffer* dst, + uint32_t x, + uint32_t x_end, + uint32_t y, + const uint8_t* up0, + const uint8_t* up1, + const uint8_t* up2) { + for (; x < x_end; x++) { + uint32_t color = // + wuffs_base__color_ycc_bt601__as__color_u32( // + *up0++, *up1++, *up2++); + wuffs_base__pixel_buffer__set_color_u32_at(dst, x, y, color); + } +} + +static void // +wuffs_private_impl__swizzle_ycc_bt601__convert_3_bgrx( + wuffs_base__pixel_buffer* dst, + uint32_t x, + uint32_t x_end, + uint32_t y, + const uint8_t* up0, + const uint8_t* up1, + const uint8_t* up2) { + size_t dst_stride = dst->private_impl.planes[0].stride; + uint8_t* dst_iter = dst->private_impl.planes[0].ptr + + (dst_stride * ((size_t)y)) + (4u * ((size_t)x)); + + for (; x < x_end; x++) { + uint32_t color = // + wuffs_base__color_ycc_bt601__as__color_u32( // + *up0++, *up1++, *up2++); + wuffs_base__poke_u32le__no_bounds_check(dst_iter, color); + dst_iter += 4u; + } +} + +static void // +wuffs_private_impl__swizzle_ycc_bt601__convert_3_rgbx( + wuffs_base__pixel_buffer* dst, + uint32_t x, + uint32_t x_end, + uint32_t y, + const uint8_t* up0, + const uint8_t* up1, + const uint8_t* up2) { + size_t dst_stride = dst->private_impl.planes[0].stride; + uint8_t* dst_iter = dst->private_impl.planes[0].ptr + + (dst_stride * ((size_t)y)) + (4u * ((size_t)x)); + + for (; x < x_end; x++) { + uint32_t color = // + wuffs_base__color_ycc_bt601__as__color_u32_abgr( // + *up0++, *up1++, *up2++); + wuffs_base__poke_u32le__no_bounds_check(dst_iter, color); + dst_iter += 4u; + } +} + // -------- // wuffs_private_impl__swizzle_ycc__upsample_func upsamples to a destination @@ -33357,6 +33864,7 @@ wuffs_base__pixel_swizzler__swizzle_ycck( uint8_t v3, bool is_rgb_or_cmyk, bool triangle_filter_for_2to1, + bool src_is_bt601, wuffs_base__slice_u8 scratch_buffer_2k) { if (!p) { return wuffs_base__make_status(wuffs_base__error__bad_receiver); @@ -33569,6 
+34077,37 @@ wuffs_base__pixel_swizzler__swizzle_ycck( if (is_rgb_or_cmyk) { conv3func = &wuffs_private_impl__swizzle_rgb__convert_3_general; + } else if (src_is_bt601) { + // BT.601 studio-range YCbCr (VP8, H.264). + switch (dst->pixcfg.private_impl.pixfmt.repr) { + case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL: + case WUFFS_BASE__PIXEL_FORMAT__BGRA_PREMUL: + case WUFFS_BASE__PIXEL_FORMAT__BGRX: +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) + if (wuffs_base__cpu_arch__have_x86_avx2()) { + conv3func = + &wuffs_private_impl__swizzle_ycc_bt601__convert_3_bgrx_x86_avx2; + break; + } +#endif + conv3func = &wuffs_private_impl__swizzle_ycc_bt601__convert_3_bgrx; + break; + case WUFFS_BASE__PIXEL_FORMAT__RGBA_NONPREMUL: + case WUFFS_BASE__PIXEL_FORMAT__RGBA_PREMUL: + case WUFFS_BASE__PIXEL_FORMAT__RGBX: +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) + if (wuffs_base__cpu_arch__have_x86_avx2()) { + conv3func = + &wuffs_private_impl__swizzle_ycc_bt601__convert_3_rgbx_x86_avx2; + break; + } +#endif + conv3func = &wuffs_private_impl__swizzle_ycc_bt601__convert_3_rgbx; + break; + default: + conv3func = &wuffs_private_impl__swizzle_ycc_bt601__convert_3_general; + break; + } } else { switch (dst->pixcfg.private_impl.pixfmt.repr) { case WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL: @@ -33579,6 +34118,10 @@ wuffs_base__pixel_swizzler__swizzle_ycck( conv3func = &wuffs_private_impl__swizzle_ycc__convert_3_bgrx_x86_avx2; break; } +#endif +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) + conv3func = &wuffs_private_impl__swizzle_ycc__convert_3_bgrx_arm_neon; + break; #endif conv3func = &wuffs_private_impl__swizzle_ycc__convert_3_bgrx; break; @@ -33590,6 +34133,10 @@ wuffs_base__pixel_swizzler__swizzle_ycck( conv3func = &wuffs_private_impl__swizzle_ycc__convert_3_rgbx_x86_avx2; break; } +#endif +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) + conv3func = &wuffs_private_impl__swizzle_ycc__convert_3_rgbx_arm_neon; + break; #endif conv3func = 
// BT.601 studio-range YCbCr to BGRX/RGBX, AVX2.
//
// This matches the scalar wuffs_base__color_ycc_bt601__as__color_u32 formula:
//   yc   = (Y  * 19077) >> 8
//   rc   = (Cr * 26149) >> 8
//   gc_u = (Cb * 6419) >> 8
//   gc_v = (Cr * 13320) >> 8
//   bc   = (Cb * 33050) >> 8
//   R = clip((yc + rc - 14234) >> 6, 0, 255)
//   G = clip((yc - gc_u - gc_v + 8708) >> 6, 0, 255)
//   B = clip((yc + bc - 17685) >> 6, 0, 255)
//
// SIMD approach: compute (X * K) >> 8 via mullo+mulhi_epu16, combine as i16,
// shift right by 6, and use packus_epi16 for [0,255] clamping. The B channel
// uses adds_epi16 to avoid overflow (yc+bc can exceed i16 max).

// Helper: compute (X * K) >> 8 in i16 lanes, where X is u8-in-i16 [0..255]
// and K is a u16 constant. mullo_epi16 yields bits 0..15 of the product and
// mulhi_epu16 yields bits 16..31 (unsigned). The result is bits 8..23 of the
// product, which is exact as long as the product is below (1 << 24).
#define WUFFS_PRIVATE_IMPL__MULDIV256(x, k)                           \
  _mm256_or_si256(_mm256_srli_epi16(_mm256_mullo_epi16((x), (k)), 8), \
                  _mm256_slli_epi16(_mm256_mulhi_epu16((x), (k)), 8))

// wuffs_private_impl__swizzle_ycc_bt601__convert_3_bgrx_x86_avx2 is the AVX2
// flavor of wuffs_private_impl__swizzle_ycc_bt601__convert_3_bgrx. It
// converts columns [x, x_end) of row y, reading one byte per column from each
// of up0 (Y), up1 (Cb) and up2 (Cr) and writing 4 bytes per pixel into plane
// 0 of dst. Spans shorter than 32 pixels are handed to the scalar flavor.
WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2,avx2")
static void  //
wuffs_private_impl__swizzle_ycc_bt601__convert_3_bgrx_x86_avx2(
    wuffs_base__pixel_buffer* dst,
    uint32_t x,
    uint32_t x_end,
    uint32_t y,
    const uint8_t* up0,
    const uint8_t* up1,
    const uint8_t* up2) {
  // Every loop iteration below loads 32 source bytes per plane and stores 32
  // pixels, so at least 32 pixels must remain on entry.
  if ((x + 32u) > x_end) {
    wuffs_private_impl__swizzle_ycc_bt601__convert_3_bgrx(  //
        dst, x, x_end, y, up0, up1, up2);
    return;
  }

  size_t dst_stride = dst->private_impl.planes[0].stride;
  uint8_t* dst_iter = dst->private_impl.planes[0].ptr +
                      (dst_stride * ((size_t)y)) + (4u * ((size_t)x));

  // u00FF masks the low byte of each 16-bit lane. uFFFF supplies the 0xFF
  // bytes that fill the fourth (X/alpha) byte of each output pixel.
  const __m256i u00FF = _mm256_set1_epi16(+0x00FF);
  const __m256i uFFFF = _mm256_set1_epi16(-0x0001);

  // BT.601 fixed-point constants for (X * K) >> 8 via MULDIV256.
  const __m256i k_19077 = _mm256_set1_epi16(19077);
  const __m256i k_26149 = _mm256_set1_epi16(26149);
  const __m256i k_6419 = _mm256_set1_epi16(6419);
  const __m256i k_13320 = _mm256_set1_epi16(13320);

  // For the B channel, (Cb * 33050) >> 8 can exceed i16 max (32920 for Cb=255).
  // We center: bc_c = ((Cb-128) * 33050) >> 8, range [-16525, 16395].
  // Decompose 33050 = 129*256 + 26, so (x*33050)>>8 = x*129 + ((x*26)>>8).
  const __m256i k_128 = _mm256_set1_epi16(128);
  const __m256i k_26 = _mm256_set1_epi16(26);

  const __m256i k_r_off = _mm256_set1_epi16(-14234);
  const __m256i k_g_off = _mm256_set1_epi16(+8708);
  // B offset: (128*33050)>>8 = 16525 exactly, so 16525 - 17685 = -1160.
  const __m256i k_b_off = _mm256_set1_epi16(-1160);

  while (x < x_end) {
    __m256i yy_all = _mm256_lddqu_si256((const __m256i*)(const void*)up0);
    __m256i cb_all = _mm256_lddqu_si256((const __m256i*)(const void*)up1);
    __m256i cr_all = _mm256_lddqu_si256((const __m256i*)(const void*)up2);

    // Split into even and odd i16 lanes: each 16-bit lane holds two source
    // bytes, so the low byte goes to "eve" and the high byte goes to "odd".
    __m256i yy_eve = _mm256_and_si256(yy_all, u00FF);
    __m256i yy_odd = _mm256_srli_epi16(yy_all, 8);
    __m256i cb_eve = _mm256_and_si256(cb_all, u00FF);
    __m256i cb_odd = _mm256_srli_epi16(cb_all, 8);
    __m256i cr_eve = _mm256_and_si256(cr_all, u00FF);
    __m256i cr_odd = _mm256_srli_epi16(cr_all, 8);

    // yc = (Y * 19077) >> 8. Range [0, 19002].
    __m256i yc_eve = WUFFS_PRIVATE_IMPL__MULDIV256(yy_eve, k_19077);
    __m256i yc_odd = WUFFS_PRIVATE_IMPL__MULDIV256(yy_odd, k_19077);

    // rc = (Cr * 26149) >> 8. Range [0, 26046].
    __m256i rc_eve = WUFFS_PRIVATE_IMPL__MULDIV256(cr_eve, k_26149);
    __m256i rc_odd = WUFFS_PRIVATE_IMPL__MULDIV256(cr_odd, k_26149);

    // gc_u = (Cb * 6419) >> 8. Range [0, 6393].
    __m256i gc_u_eve = WUFFS_PRIVATE_IMPL__MULDIV256(cb_eve, k_6419);
    __m256i gc_u_odd = WUFFS_PRIVATE_IMPL__MULDIV256(cb_odd, k_6419);

    // gc_v = (Cr * 13320) >> 8. Range [0, 13267].
    __m256i gc_v_eve = WUFFS_PRIVATE_IMPL__MULDIV256(cr_eve, k_13320);
    __m256i gc_v_odd = WUFFS_PRIVATE_IMPL__MULDIV256(cr_odd, k_13320);

    // bc_c = ((Cb-128) * 33050) >> 8 = (Cb-128)*129 + (((Cb-128)*26)>>8).
    // Range [-16525, 16395], fits i16. The shift is arithmetic (srai) because
    // (Cb-128) can be negative.
    __m256i cb_c_eve = _mm256_sub_epi16(cb_eve, k_128);
    __m256i cb_c_odd = _mm256_sub_epi16(cb_odd, k_128);
    __m256i bc_c_eve = _mm256_add_epi16(
        _mm256_add_epi16(_mm256_slli_epi16(cb_c_eve, 7), cb_c_eve),
        _mm256_srai_epi16(_mm256_mullo_epi16(cb_c_eve, k_26), 8));
    __m256i bc_c_odd = _mm256_add_epi16(
        _mm256_add_epi16(_mm256_slli_epi16(cb_c_odd, 7), cb_c_odd),
        _mm256_srai_epi16(_mm256_mullo_epi16(cb_c_odd, k_26), 8));

    // R = (yc + rc - 14234) >> 6. Max = 19002+26046-14234 = 30814 < 32767.
    // The intermediate (yc + rc) may wrap around in i16, but add_epi16 is
    // modular, so the final sum is still correct because it fits in i16.
    __m256i r_eve = _mm256_srai_epi16(
        _mm256_add_epi16(_mm256_add_epi16(yc_eve, rc_eve), k_r_off), 6);
    __m256i r_odd = _mm256_srai_epi16(
        _mm256_add_epi16(_mm256_add_epi16(yc_odd, rc_odd), k_r_off), 6);

    // G = (yc - gc_u - gc_v + 8708) >> 6. Range [-10952, 27710], fits i16.
    __m256i g_eve = _mm256_srai_epi16(
        _mm256_add_epi16(
            _mm256_sub_epi16(_mm256_sub_epi16(yc_eve, gc_u_eve), gc_v_eve),
            k_g_off),
        6);
    __m256i g_odd = _mm256_srai_epi16(
        _mm256_add_epi16(
            _mm256_sub_epi16(_mm256_sub_epi16(yc_odd, gc_u_odd), gc_v_odd),
            k_g_off),
        6);

    // B = (yc + bc_c - 1160) >> 6. Range [-17685, 34237], which does not fit
    // i16. Use adds_epi16 for the final sum: it saturates to 32767, and
    // (32767 >> 6) = 511 is then clamped to 255 by packus, which is the
    // result the scalar formula gives for any value above 16320.
    __m256i b_eve = _mm256_srai_epi16(
        _mm256_adds_epi16(_mm256_add_epi16(yc_eve, k_b_off), bc_c_eve), 6);
    __m256i b_odd = _mm256_srai_epi16(
        _mm256_adds_epi16(_mm256_add_epi16(yc_odd, k_b_off), bc_c_odd), 6);

    // Pack i16 to u8 with unsigned saturation: negative lanes become 0 and
    // lanes above 255 become 255.
    __m256i packed_b_eve = _mm256_packus_epi16(b_eve, b_eve);
    __m256i packed_b_odd = _mm256_packus_epi16(b_odd, b_odd);
    __m256i packed_g_eve = _mm256_packus_epi16(g_eve, g_eve);
    __m256i packed_g_odd = _mm256_packus_epi16(g_odd, g_odd);
    __m256i packed_r_eve = _mm256_packus_epi16(r_eve, r_eve);
    __m256i packed_r_odd = _mm256_packus_epi16(r_odd, r_odd);

    // Interleave to BGRX, same as the JFIF converter. First form (B, G) and
    // (R, 0xFF) byte pairs, then combine them into 4-byte pixels, then
    // re-interleave the even-column and odd-column pixels.
    __m256i mix00 = _mm256_unpacklo_epi8(packed_b_eve, packed_g_eve);
    __m256i mix01 = _mm256_unpacklo_epi8(packed_b_odd, packed_g_odd);
    __m256i mix02 = _mm256_unpacklo_epi8(packed_r_eve, uFFFF);
    __m256i mix03 = _mm256_unpacklo_epi8(packed_r_odd, uFFFF);

    __m256i mix10 = _mm256_unpacklo_epi16(mix00, mix02);
    __m256i mix11 = _mm256_unpacklo_epi16(mix01, mix03);
    __m256i mix12 = _mm256_unpackhi_epi16(mix00, mix02);
    __m256i mix13 = _mm256_unpackhi_epi16(mix01, mix03);

    __m256i mix20 = _mm256_unpacklo_epi32(mix10, mix11);
    __m256i mix21 = _mm256_unpackhi_epi32(mix10, mix11);
    __m256i mix22 = _mm256_unpacklo_epi32(mix12, mix13);
    __m256i mix23 = _mm256_unpackhi_epi32(mix12, mix13);

    // The unpack instructions above work within each 128-bit half, so a
    // final cross-half permute is needed to put the pixels in column order.
    __m256i mix30 = _mm256_permute2x128_si256(mix20, mix21, 0x20);
    __m256i mix31 = _mm256_permute2x128_si256(mix22, mix23, 0x20);
    __m256i mix32 = _mm256_permute2x128_si256(mix20, mix21, 0x31);
    __m256i mix33 = _mm256_permute2x128_si256(mix22, mix23, 0x31);

    // Store 32 pixels (128 bytes).
    _mm256_storeu_si256((__m256i*)(void*)(dst_iter + 0x00), mix30);
    _mm256_storeu_si256((__m256i*)(void*)(dst_iter + 0x20), mix31);
    _mm256_storeu_si256((__m256i*)(void*)(dst_iter + 0x40), mix32);
    _mm256_storeu_si256((__m256i*)(void*)(dst_iter + 0x60), mix33);

    // Advance by ((x_end - x) % 32) pixels, or by 32 when that remainder is
    // zero. After the first iteration the remaining count is a multiple of
    // 32. When the first step is shorter than 32, the next iteration simply
    // rewrites the overlapping pixels with the same values.
    uint32_t n = 32u - (31u & (x - x_end));
    dst_iter += 4u * n;
    up0 += n;
    up1 += n;
    up2 += n;
    x += n;
  }
}
// wuffs_private_impl__swizzle_ycc_bt601__convert_3_rgbx_x86_avx2 is the AVX2
// flavor of wuffs_private_impl__swizzle_ycc_bt601__convert_3_rgbx. It
// converts columns [x, x_end) of row y, reading one byte per column from each
// of up0 (Y), up1 (Cb) and up2 (Cr) and writing 4 bytes per pixel into plane
// 0 of dst. Spans shorter than 32 pixels are handed to the scalar flavor.
//
// The arithmetic is the same as the bgrx flavor. Only the interleave stage
// differs, swapping B and R.
WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2,avx2")
static void  //
wuffs_private_impl__swizzle_ycc_bt601__convert_3_rgbx_x86_avx2(
    wuffs_base__pixel_buffer* dst,
    uint32_t x,
    uint32_t x_end,
    uint32_t y,
    const uint8_t* up0,
    const uint8_t* up1,
    const uint8_t* up2) {
  // Every loop iteration below loads 32 source bytes per plane and stores 32
  // pixels, so at least 32 pixels must remain on entry.
  if ((x + 32u) > x_end) {
    wuffs_private_impl__swizzle_ycc_bt601__convert_3_rgbx(  //
        dst, x, x_end, y, up0, up1, up2);
    return;
  }

  size_t dst_stride = dst->private_impl.planes[0].stride;
  uint8_t* dst_iter = dst->private_impl.planes[0].ptr +
                      (dst_stride * ((size_t)y)) + (4u * ((size_t)x));

  // u00FF masks the low byte of each 16-bit lane. uFFFF supplies the 0xFF
  // bytes that fill the fourth (X/alpha) byte of each output pixel.
  const __m256i u00FF = _mm256_set1_epi16(+0x00FF);
  const __m256i uFFFF = _mm256_set1_epi16(-0x0001);

  // Fixed-point multipliers for the (X * K) >> 8 terms. The Cb multiplier
  // for blue, 33050, is split as 129*256 + 26 (k_26) and applied to the
  // centered (Cb - 128) so that the intermediate fits in i16.
  const __m256i k_19077 = _mm256_set1_epi16(19077);
  const __m256i k_26149 = _mm256_set1_epi16(26149);
  const __m256i k_6419 = _mm256_set1_epi16(6419);
  const __m256i k_13320 = _mm256_set1_epi16(13320);
  const __m256i k_128 = _mm256_set1_epi16(128);
  const __m256i k_26 = _mm256_set1_epi16(26);

  // Per-channel offsets. The B offset folds in the centering:
  // ((128 * 33050) >> 8) - 17685 = 16525 - 17685 = -1160.
  const __m256i k_r_off = _mm256_set1_epi16(-14234);
  const __m256i k_g_off = _mm256_set1_epi16(+8708);
  const __m256i k_b_off = _mm256_set1_epi16(-1160);

  while (x < x_end) {
    __m256i yy_all = _mm256_lddqu_si256((const __m256i*)(const void*)up0);
    __m256i cb_all = _mm256_lddqu_si256((const __m256i*)(const void*)up1);
    __m256i cr_all = _mm256_lddqu_si256((const __m256i*)(const void*)up2);

    // Split into even and odd i16 lanes (low byte and high byte of each
    // 16-bit lane).
    __m256i yy_eve = _mm256_and_si256(yy_all, u00FF);
    __m256i yy_odd = _mm256_srli_epi16(yy_all, 8);
    __m256i cb_eve = _mm256_and_si256(cb_all, u00FF);
    __m256i cb_odd = _mm256_srli_epi16(cb_all, 8);
    __m256i cr_eve = _mm256_and_si256(cr_all, u00FF);
    __m256i cr_odd = _mm256_srli_epi16(cr_all, 8);

    // yc = (Y * 19077) >> 8, rc = (Cr * 26149) >> 8,
    // gc_u = (Cb * 6419) >> 8, gc_v = (Cr * 13320) >> 8.
    __m256i yc_eve = WUFFS_PRIVATE_IMPL__MULDIV256(yy_eve, k_19077);
    __m256i yc_odd = WUFFS_PRIVATE_IMPL__MULDIV256(yy_odd, k_19077);
    __m256i rc_eve = WUFFS_PRIVATE_IMPL__MULDIV256(cr_eve, k_26149);
    __m256i rc_odd = WUFFS_PRIVATE_IMPL__MULDIV256(cr_odd, k_26149);
    __m256i gc_u_eve = WUFFS_PRIVATE_IMPL__MULDIV256(cb_eve, k_6419);
    __m256i gc_u_odd = WUFFS_PRIVATE_IMPL__MULDIV256(cb_odd, k_6419);
    __m256i gc_v_eve = WUFFS_PRIVATE_IMPL__MULDIV256(cr_eve, k_13320);
    __m256i gc_v_odd = WUFFS_PRIVATE_IMPL__MULDIV256(cr_odd, k_13320);

    // bc_c = ((Cb-128) * 33050) >> 8 = (Cb-128)*129 + (((Cb-128)*26)>>8).
    __m256i cb_c_eve = _mm256_sub_epi16(cb_eve, k_128);
    __m256i cb_c_odd = _mm256_sub_epi16(cb_odd, k_128);
    __m256i bc_c_eve = _mm256_add_epi16(
        _mm256_add_epi16(_mm256_slli_epi16(cb_c_eve, 7), cb_c_eve),
        _mm256_srai_epi16(_mm256_mullo_epi16(cb_c_eve, k_26), 8));
    __m256i bc_c_odd = _mm256_add_epi16(
        _mm256_add_epi16(_mm256_slli_epi16(cb_c_odd, 7), cb_c_odd),
        _mm256_srai_epi16(_mm256_mullo_epi16(cb_c_odd, k_26), 8));

    // R = (yc + rc - 14234) >> 6. The final sum fits in i16 (at most 30814).
    __m256i r_eve = _mm256_srai_epi16(
        _mm256_add_epi16(_mm256_add_epi16(yc_eve, rc_eve), k_r_off), 6);
    __m256i r_odd = _mm256_srai_epi16(
        _mm256_add_epi16(_mm256_add_epi16(yc_odd, rc_odd), k_r_off), 6);

    // G = (yc - gc_u - gc_v + 8708) >> 6.
    __m256i g_eve = _mm256_srai_epi16(
        _mm256_add_epi16(
            _mm256_sub_epi16(_mm256_sub_epi16(yc_eve, gc_u_eve), gc_v_eve),
            k_g_off),
        6);
    __m256i g_odd = _mm256_srai_epi16(
        _mm256_add_epi16(
            _mm256_sub_epi16(_mm256_sub_epi16(yc_odd, gc_u_odd), gc_v_odd),
            k_g_off),
        6);

    // B = (yc + bc_c - 1160) >> 6. The sum can exceed i16 max, so the final
    // add saturates (adds_epi16). The saturated value still clamps to 255
    // when packed below.
    __m256i b_eve = _mm256_srai_epi16(
        _mm256_adds_epi16(_mm256_add_epi16(yc_eve, k_b_off), bc_c_eve), 6);
    __m256i b_odd = _mm256_srai_epi16(
        _mm256_adds_epi16(_mm256_add_epi16(yc_odd, k_b_off), bc_c_odd), 6);

    // Pack i16 to u8 with unsigned saturation, which clamps to [0, 255].
    __m256i packed_b_eve = _mm256_packus_epi16(b_eve, b_eve);
    __m256i packed_b_odd = _mm256_packus_epi16(b_odd, b_odd);
    __m256i packed_g_eve = _mm256_packus_epi16(g_eve, g_eve);
    __m256i packed_g_odd = _mm256_packus_epi16(g_odd, g_odd);
    __m256i packed_r_eve = _mm256_packus_epi16(r_eve, r_eve);
    __m256i packed_r_odd = _mm256_packus_epi16(r_odd, r_odd);

    // Interleave to RGBX. Note the swapped B and R channels compared to the
    // bgrx flavor: the byte pairs are (R, G) and (B, 0xFF).
    __m256i mix00 = _mm256_unpacklo_epi8(packed_r_eve, packed_g_eve);
    __m256i mix01 = _mm256_unpacklo_epi8(packed_r_odd, packed_g_odd);
    __m256i mix02 = _mm256_unpacklo_epi8(packed_b_eve, uFFFF);
    __m256i mix03 = _mm256_unpacklo_epi8(packed_b_odd, uFFFF);

    __m256i mix10 = _mm256_unpacklo_epi16(mix00, mix02);
    __m256i mix11 = _mm256_unpacklo_epi16(mix01, mix03);
    __m256i mix12 = _mm256_unpackhi_epi16(mix00, mix02);
    __m256i mix13 = _mm256_unpackhi_epi16(mix01, mix03);

    __m256i mix20 = _mm256_unpacklo_epi32(mix10, mix11);
    __m256i mix21 = _mm256_unpackhi_epi32(mix10, mix11);
    __m256i mix22 = _mm256_unpacklo_epi32(mix12, mix13);
    __m256i mix23 = _mm256_unpackhi_epi32(mix12, mix13);

    // The unpack instructions above work within each 128-bit half, so a
    // final cross-half permute is needed to put the pixels in column order.
    __m256i mix30 = _mm256_permute2x128_si256(mix20, mix21, 0x20);
    __m256i mix31 = _mm256_permute2x128_si256(mix22, mix23, 0x20);
    __m256i mix32 = _mm256_permute2x128_si256(mix20, mix21, 0x31);
    __m256i mix33 = _mm256_permute2x128_si256(mix22, mix23, 0x31);

    // Store 32 pixels (128 bytes).
    _mm256_storeu_si256((__m256i*)(void*)(dst_iter + 0x00), mix30);
    _mm256_storeu_si256((__m256i*)(void*)(dst_iter + 0x20), mix31);
    _mm256_storeu_si256((__m256i*)(void*)(dst_iter + 0x40), mix32);
    _mm256_storeu_si256((__m256i*)(void*)(dst_iter + 0x60), mix33);

    // Advance by ((x_end - x) % 32) pixels, or by 32 when that remainder is
    // zero. After the first iteration the remaining count is a multiple of
    // 32. When the first step is shorter than 32, the next iteration simply
    // rewrites the overlapping pixels with the same values.
    uint32_t n = 32u - (31u & (x - x_end));
    dst_iter += 4u * n;
    up0 += n;
    up1 += n;
    up2 += n;
    x += n;
  }
}

// The helper macro is private to the two BT.601 AVX2 converters.
#undef WUFFS_PRIVATE_IMPL__MULDIV256
#else
@@ -34347,6 +35199,213 @@ wuffs_private_impl__swizzle_ycc__upsample_inv_h2v2_triangle_x86_avx2(
#endif  // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3)
// ‼ WUFFS MULTI-FILE SECTION -x86_avx2
// --------

// ‼ WUFFS MULTI-FILE SECTION +arm_neon
#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON)

// Converts Y, Cb, Cr bytes (one per column, read from up0, up1 and up2) into
// 4-byte BGRX pixels in plane 0 of dst, for row y and columns x (inclusive)
// to x_end (exclusive). In the vector path the X byte is written as 0xFF.
//
// Columns are processed eight at a time with NEON. Whatever is left over
// (fewer than eight columns) goes through
// wuffs_base__color_ycc__as__color_u32, one pixel per call.
//
// The vector path is a fixed-point rendering of these formulae (the ones
// used by wuffs_base__color_ycc__as__color_u32), with Cb and Cr first
// re-centered by subtracting 128:
//
//   R = Y + 1.40200 * Cr
//   G = Y - 0.34414 * Cb - 0.71414 * Cr
//   B = Y + 1.77200 * Cb
//
// Scaled by 1<<16 the factors are 22554 (0.34414), 46802 (0.71414), 91881
// (1.40200) and 116130 (1.77200). Only 91881 and 116130 exceed the range of
// a signed 16-bit multiplier, but all four are split into an integer part,
// applied with plain 16-bit adds and subtracts, and a signed 16-bit
// fractional part, applied with a widening multiply and a rounding shift
// right by 16:
//
//   B - Y = round(-14942 * Cb / 65536) + 2 * Cb
//   R - Y = round(+26345 * Cr / 65536) + 1 * Cr
//   G - Y = round((-22554 * Cb + 18734 * Cr) / 65536) - 1 * Cr
static void  //
wuffs_private_impl__swizzle_ycc__convert_3_bgrx_arm_neon(
    wuffs_base__pixel_buffer* dst,
    uint32_t x,
    uint32_t x_end,
    uint32_t y,
    const uint8_t* up0,
    const uint8_t* up1,
    const uint8_t* up2) {
  const size_t row_pitch = dst->private_impl.planes[0].stride;
  uint8_t* out = dst->private_impl.planes[0].ptr;
  out += row_pitch * ((size_t)y);
  out += 4u * ((size_t)x);

  // Fractional parts of the chroma factors, in 1/65536 units.
  const int16_t frac_cb_to_b = -14942;  // -0x3A5E = 0x1C5A2 - 0x20000
  const int16_t frac_cr_to_r = 26345;   // +0x66E9 = 0x166E9 - 0x10000
  const int16_t frac_cb_to_g = -22554;  // -0x581A
  const int16_t frac_cr_to_g = 18734;   // +0x492E = 0x10000 - 0x0B6D2

  const int16x8_t chroma_mid = vdupq_n_s16(128);
  const uint8x8_t opaque = vdup_n_u8(0xFF);

  for (; (x + 8u) <= x_end; x += 8u) {
    // Widen eight samples of each plane to signed 16 bits, centering chroma
    // on zero.
    const int16x8_t luma = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(up0)));
    const int16x8_t cb_c = vsubq_s16(
        vreinterpretq_s16_u16(vmovl_u8(vld1_u8(up1))), chroma_mid);
    const int16x8_t cr_c = vsubq_s16(
        vreinterpretq_s16_u16(vmovl_u8(vld1_u8(up2))), chroma_mid);

    // The widening multiplies work on four lanes at a time.
    const int16x4_t cb_c0 = vget_low_s16(cb_c);
    const int16x4_t cb_c1 = vget_high_s16(cb_c);
    const int16x4_t cr_c0 = vget_low_s16(cr_c);
    const int16x4_t cr_c1 = vget_high_s16(cr_c);

    // R - Y.
    const int16x8_t r_frac = vcombine_s16(
        vrshrn_n_s32(vmull_n_s16(cr_c0, frac_cr_to_r), 16),
        vrshrn_n_s32(vmull_n_s16(cr_c1, frac_cr_to_r), 16));
    const int16x8_t r_delta = vaddq_s16(r_frac, cr_c);

    // B - Y.
    const int16x8_t b_frac = vcombine_s16(
        vrshrn_n_s32(vmull_n_s16(cb_c0, frac_cb_to_b), 16),
        vrshrn_n_s32(vmull_n_s16(cb_c1, frac_cb_to_b), 16));
    const int16x8_t b_delta = vaddq_s16(b_frac, vaddq_s16(cb_c, cb_c));

    // G - Y. Both products are summed at 32 bits before the single rounding
    // shift.
    const int32x4_t g_acc0 = vmlal_n_s16(vmull_n_s16(cb_c0, frac_cb_to_g),
                                         cr_c0, frac_cr_to_g);
    const int32x4_t g_acc1 = vmlal_n_s16(vmull_n_s16(cb_c1, frac_cb_to_g),
                                         cr_c1, frac_cr_to_g);
    const int16x8_t g_frac =
        vcombine_s16(vrshrn_n_s32(g_acc0, 16), vrshrn_n_s32(g_acc1, 16));
    const int16x8_t g_delta = vsubq_s16(g_frac, cr_c);

    // Add luma back in; the saturating unsigned narrow clamps to [0, 255].
    uint8x8x4_t px;
    px.val[0] = vqmovun_s16(vaddq_s16(luma, b_delta));
    px.val[1] = vqmovun_s16(vaddq_s16(luma, g_delta));
    px.val[2] = vqmovun_s16(vaddq_s16(luma, r_delta));
    px.val[3] = opaque;

    // vst4_u8 interleaves the four vectors: 8 pixels, 32 bytes, B G R X.
    vst4_u8(out, px);

    out += 32u;
    up0 += 8u;
    up1 += 8u;
    up2 += 8u;
  }

  // Leftover columns, one pixel at a time.
  while (x < x_end) {
    const uint8_t yy = *up0;
    const uint8_t cb = *up1;
    const uint8_t cr = *up2;
    up0++;
    up1++;
    up2++;
    const uint32_t color = wuffs_base__color_ycc__as__color_u32(yy, cb, cr);
    wuffs_base__poke_u32le__no_bounds_check(out, color);
    out += 4u;
    x++;
  }
}

// The RGBX counterpart of
// wuffs_private_impl__swizzle_ycc__convert_3_bgrx_arm_neon: the same inputs
// and the same fixed-point arithmetic. Only the output differs: the vector
// path stores the bytes R, G, B, 0xFF per pixel, and the leftover columns use
// wuffs_base__color_ycc__as__color_u32_abgr.
static void  //
wuffs_private_impl__swizzle_ycc__convert_3_rgbx_arm_neon(
    wuffs_base__pixel_buffer* dst,
    uint32_t x,
    uint32_t x_end,
    uint32_t y,
    const uint8_t* up0,
    const uint8_t* up1,
    const uint8_t* up2) {
  const size_t row_pitch = dst->private_impl.planes[0].stride;
  uint8_t* out = dst->private_impl.planes[0].ptr;
  out += row_pitch * ((size_t)y);
  out += 4u * ((size_t)x);

  // Fractional parts of the chroma factors, in 1/65536 units.
  const int16_t frac_cb_to_b = -14942;  // -0x3A5E
  const int16_t frac_cr_to_r = 26345;   // +0x66E9
  const int16_t frac_cb_to_g = -22554;  // -0x581A
  const int16_t frac_cr_to_g = 18734;   // +0x492E

  const int16x8_t chroma_mid = vdupq_n_s16(128);
  const uint8x8_t opaque = vdup_n_u8(0xFF);

  for (; (x + 8u) <= x_end; x += 8u) {
    // Widen eight samples of each plane to signed 16 bits, centering chroma
    // on zero.
    const int16x8_t luma = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(up0)));
    const int16x8_t cb_c = vsubq_s16(
        vreinterpretq_s16_u16(vmovl_u8(vld1_u8(up1))), chroma_mid);
    const int16x8_t cr_c = vsubq_s16(
        vreinterpretq_s16_u16(vmovl_u8(vld1_u8(up2))), chroma_mid);

    const int16x4_t cb_c0 = vget_low_s16(cb_c);
    const int16x4_t cb_c1 = vget_high_s16(cb_c);
    const int16x4_t cr_c0 = vget_low_s16(cr_c);
    const int16x4_t cr_c1 = vget_high_s16(cr_c);

    // R - Y = round(frac * Cr / 65536) + Cr.
    const int16x8_t r_frac = vcombine_s16(
        vrshrn_n_s32(vmull_n_s16(cr_c0, frac_cr_to_r), 16),
        vrshrn_n_s32(vmull_n_s16(cr_c1, frac_cr_to_r), 16));
    const int16x8_t r_delta = vaddq_s16(r_frac, cr_c);

    // B - Y = round(frac * Cb / 65536) + 2 * Cb.
    const int16x8_t b_frac = vcombine_s16(
        vrshrn_n_s32(vmull_n_s16(cb_c0, frac_cb_to_b), 16),
        vrshrn_n_s32(vmull_n_s16(cb_c1, frac_cb_to_b), 16));
    const int16x8_t b_delta = vaddq_s16(b_frac, vaddq_s16(cb_c, cb_c));

    // G - Y = round((frac_cb * Cb + frac_cr * Cr) / 65536) - Cr.
    const int32x4_t g_acc0 = vmlal_n_s16(vmull_n_s16(cb_c0, frac_cb_to_g),
                                         cr_c0, frac_cr_to_g);
    const int32x4_t g_acc1 = vmlal_n_s16(vmull_n_s16(cb_c1, frac_cb_to_g),
                                         cr_c1, frac_cr_to_g);
    const int16x8_t g_frac =
        vcombine_s16(vrshrn_n_s32(g_acc0, 16), vrshrn_n_s32(g_acc1, 16));
    const int16x8_t g_delta = vsubq_s16(g_frac, cr_c);

    // Add luma back in; the saturating unsigned narrow clamps to [0, 255].
    uint8x8x4_t px;
    px.val[0] = vqmovun_s16(vaddq_s16(luma, r_delta));
    px.val[1] = vqmovun_s16(vaddq_s16(luma, g_delta));
    px.val[2] = vqmovun_s16(vaddq_s16(luma, b_delta));
    px.val[3] = opaque;

    // vst4_u8 interleaves the four vectors: 8 pixels, 32 bytes, R G B X.
    vst4_u8(out, px);

    out += 32u;
    up0 += 8u;
    up1 += 8u;
    up2 += 8u;
  }

  // Leftover columns, one pixel at a time.
  while (x < x_end) {
    const uint8_t yy = *up0;
    const uint8_t cb = *up1;
    const uint8_t cr = *up2;
    up0++;
    up1++;
    up2++;
    const uint32_t color =
        wuffs_base__color_ycc__as__color_u32_abgr(yy, cb, cr);
    wuffs_base__poke_u32le__no_bounds_check(out, color);
    out += 4u;
    x++;
  }
}

#endif  // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON)
// ‼ WUFFS MULTI-FILE SECTION -arm_neon

#endif  // !defined(WUFFS_CONFIG__MODULES) ||
        // defined(WUFFS_CONFIG__MODULE__BASE) ||
        // defined(WUFFS_CONFIG__MODULE__BASE__PIXCONV)
@@ -57063,6 +58122,7 @@ wuffs_jpeg__decoder__swizzle_colorful(
       self->private_impl.f_components_v[3u],
       self->private_impl.f_is_rgb_or_cmyk,
       !
self->private_impl.f_use_lower_quality, + false, wuffs_base__make_slice_u8(self->private_data.f_swizzle_ycck_scratch_buffer_2k, 2048)); return wuffs_private_impl__status__ensure_not_a_suspension(v_status); } @@ -80333,234 +81393,13596 @@ wuffs_thumbhash__decoder__workbuf_len( // ---------------- Status Codes Implementations const char wuffs_vp8__error__bad_header[] = "#vp8: bad header"; +const char wuffs_vp8__error__bad_coefficient[] = "#vp8: bad coefficient"; const char wuffs_vp8__error__truncated_input[] = "#vp8: truncated input"; const char wuffs_vp8__error__unsupported_vp8_file[] = "#vp8: unsupported VP8 file"; +const char wuffs_vp8__error__internal_error_inconsistent_decoder_state[] = "#vp8: internal error: inconsistent decoder state"; // ---------------- Private Consts +static const uint16_t +WUFFS_VP8__DC_QUANT[128] WUFFS_BASE__POTENTIALLY_UNUSED = { + 4u, 5u, 6u, 7u, 8u, 9u, 10u, 10u, + 11u, 12u, 13u, 14u, 15u, 16u, 17u, 17u, + 18u, 19u, 20u, 20u, 21u, 21u, 22u, 22u, + 23u, 23u, 24u, 25u, 25u, 26u, 27u, 28u, + 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, + 37u, 37u, 38u, 39u, 40u, 41u, 42u, 43u, + 44u, 45u, 46u, 46u, 47u, 48u, 49u, 50u, + 51u, 52u, 53u, 54u, 55u, 56u, 57u, 58u, + 59u, 60u, 61u, 62u, 63u, 64u, 65u, 66u, + 67u, 68u, 69u, 70u, 71u, 72u, 73u, 74u, + 75u, 76u, 76u, 77u, 78u, 79u, 80u, 81u, + 82u, 83u, 84u, 85u, 86u, 87u, 88u, 89u, + 91u, 93u, 95u, 96u, 98u, 100u, 101u, 102u, + 104u, 106u, 108u, 110u, 112u, 114u, 116u, 118u, + 122u, 124u, 126u, 128u, 130u, 132u, 134u, 136u, + 138u, 140u, 143u, 145u, 148u, 151u, 154u, 157u, +}; + +static const uint16_t +WUFFS_VP8__AC_QUANT[128] WUFFS_BASE__POTENTIALLY_UNUSED = { + 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, + 12u, 13u, 14u, 15u, 16u, 17u, 18u, 19u, + 20u, 21u, 22u, 23u, 24u, 25u, 26u, 27u, + 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, + 36u, 37u, 38u, 39u, 40u, 41u, 42u, 43u, + 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, + 52u, 53u, 54u, 55u, 56u, 57u, 58u, 60u, + 62u, 64u, 66u, 68u, 70u, 72u, 74u, 76u, + 78u, 80u, 82u, 
84u, 86u, 88u, 90u, 92u, + 94u, 96u, 98u, 100u, 102u, 104u, 106u, 108u, + 110u, 112u, 114u, 116u, 119u, 122u, 125u, 128u, + 131u, 134u, 137u, 140u, 143u, 146u, 149u, 152u, + 155u, 158u, 161u, 164u, 167u, 170u, 173u, 177u, + 181u, 185u, 189u, 193u, 197u, 201u, 205u, 209u, + 213u, 217u, 221u, 225u, 229u, 234u, 239u, 245u, + 249u, 254u, 259u, 264u, 269u, 274u, 279u, 284u, +}; + +static const uint8_t +WUFFS_VP8__COEFF_BANDS[16] WUFFS_BASE__POTENTIALLY_UNUSED = { + 0u, 1u, 2u, 3u, 6u, 4u, 5u, 6u, + 6u, 6u, 6u, 6u, 6u, 6u, 6u, 7u, +}; + +static const uint8_t +WUFFS_VP8__COEFF_BAND_OFFSET[16] WUFFS_BASE__POTENTIALLY_UNUSED = { + 0u, 33u, 66u, 99u, 198u, 132u, 165u, 198u, + 198u, 198u, 198u, 198u, 198u, 198u, 198u, 231u, +}; + +static const uint8_t +WUFFS_VP8__ZIGZAG[16] WUFFS_BASE__POTENTIALLY_UNUSED = { + 0u, 1u, 4u, 8u, 5u, 2u, 3u, 6u, + 9u, 12u, 13u, 10u, 7u, 11u, 14u, 15u, +}; + +static const uint8_t +WUFFS_VP8__DEFAULT_COEFF_PROBS[1056] WUFFS_BASE__POTENTIALLY_UNUSED = { + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 253u, 136u, 254u, 255u, 228u, 219u, 128u, + 128u, 128u, 128u, 128u, 189u, 129u, 242u, 255u, + 227u, 213u, 255u, 219u, 128u, 128u, 128u, 106u, + 126u, 227u, 252u, 214u, 209u, 255u, 255u, 128u, + 128u, 128u, 1u, 98u, 248u, 255u, 236u, 226u, + 255u, 255u, 128u, 128u, 128u, 181u, 133u, 238u, + 254u, 221u, 234u, 255u, 154u, 128u, 128u, 128u, + 78u, 134u, 202u, 247u, 198u, 180u, 255u, 219u, + 128u, 128u, 128u, 1u, 185u, 249u, 255u, 243u, + 255u, 128u, 128u, 128u, 128u, 128u, 184u, 150u, + 247u, 255u, 236u, 224u, 128u, 128u, 128u, 128u, + 128u, 77u, 110u, 216u, 255u, 236u, 230u, 128u, + 128u, 128u, 128u, 128u, 1u, 101u, 251u, 255u, + 241u, 255u, 128u, 128u, 128u, 128u, 128u, 170u, + 139u, 241u, 252u, 236u, 209u, 255u, 255u, 128u, + 128u, 128u, 37u, 116u, 196u, 243u, 228u, 255u, + 255u, 255u, 128u, 128u, 
128u, 1u, 204u, 254u, + 255u, 245u, 255u, 128u, 128u, 128u, 128u, 128u, + 207u, 160u, 250u, 255u, 238u, 128u, 128u, 128u, + 128u, 128u, 128u, 102u, 103u, 231u, 255u, 211u, + 171u, 128u, 128u, 128u, 128u, 128u, 1u, 152u, + 252u, 255u, 240u, 255u, 128u, 128u, 128u, 128u, + 128u, 177u, 135u, 243u, 255u, 234u, 225u, 128u, + 128u, 128u, 128u, 128u, 80u, 129u, 211u, 255u, + 194u, 224u, 128u, 128u, 128u, 128u, 128u, 1u, + 1u, 255u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 246u, 1u, 255u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 255u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 198u, 35u, 237u, 223u, 193u, 187u, 162u, 160u, + 145u, 155u, 62u, 131u, 45u, 198u, 221u, 172u, + 176u, 220u, 157u, 252u, 221u, 1u, 68u, 47u, + 146u, 208u, 149u, 167u, 221u, 162u, 255u, 223u, + 128u, 1u, 149u, 241u, 255u, 221u, 224u, 255u, + 255u, 128u, 128u, 128u, 184u, 141u, 234u, 253u, + 222u, 220u, 255u, 199u, 128u, 128u, 128u, 81u, + 99u, 181u, 242u, 176u, 190u, 249u, 202u, 255u, + 255u, 128u, 1u, 129u, 232u, 253u, 214u, 197u, + 242u, 196u, 255u, 255u, 128u, 99u, 121u, 210u, + 250u, 201u, 198u, 255u, 202u, 128u, 128u, 128u, + 23u, 91u, 163u, 242u, 170u, 187u, 247u, 210u, + 255u, 255u, 128u, 1u, 200u, 246u, 255u, 234u, + 255u, 128u, 128u, 128u, 128u, 128u, 109u, 178u, + 241u, 255u, 231u, 245u, 255u, 255u, 128u, 128u, + 128u, 44u, 130u, 201u, 253u, 205u, 192u, 255u, + 255u, 128u, 128u, 128u, 1u, 132u, 239u, 251u, + 219u, 209u, 255u, 165u, 128u, 128u, 128u, 94u, + 136u, 225u, 251u, 218u, 190u, 255u, 255u, 128u, + 128u, 128u, 22u, 100u, 174u, 245u, 186u, 161u, + 255u, 199u, 128u, 128u, 128u, 1u, 182u, 249u, + 255u, 232u, 235u, 128u, 128u, 128u, 128u, 128u, + 124u, 143u, 241u, 255u, 227u, 234u, 128u, 128u, + 128u, 128u, 128u, 35u, 77u, 181u, 251u, 193u, + 211u, 255u, 205u, 128u, 128u, 128u, 1u, 157u, + 247u, 255u, 236u, 231u, 255u, 255u, 128u, 128u, + 128u, 121u, 141u, 235u, 255u, 225u, 227u, 255u, + 255u, 128u, 128u, 128u, 45u, 99u, 188u, 251u, + 195u, 217u, 
255u, 224u, 128u, 128u, 128u, 1u, + 1u, 251u, 255u, 213u, 255u, 128u, 128u, 128u, + 128u, 128u, 203u, 1u, 248u, 255u, 255u, 128u, + 128u, 128u, 128u, 128u, 128u, 137u, 1u, 177u, + 255u, 224u, 255u, 128u, 128u, 128u, 128u, 128u, + 253u, 9u, 248u, 251u, 207u, 208u, 255u, 192u, + 128u, 128u, 128u, 175u, 13u, 224u, 243u, 193u, + 185u, 249u, 198u, 255u, 255u, 128u, 73u, 17u, + 171u, 221u, 161u, 179u, 236u, 167u, 255u, 234u, + 128u, 1u, 95u, 247u, 253u, 212u, 183u, 255u, + 255u, 128u, 128u, 128u, 239u, 90u, 244u, 250u, + 211u, 209u, 255u, 255u, 128u, 128u, 128u, 155u, + 77u, 195u, 248u, 188u, 195u, 255u, 255u, 128u, + 128u, 128u, 1u, 24u, 239u, 251u, 218u, 219u, + 255u, 205u, 128u, 128u, 128u, 201u, 51u, 219u, + 255u, 196u, 186u, 128u, 128u, 128u, 128u, 128u, + 69u, 46u, 190u, 239u, 201u, 218u, 255u, 228u, + 128u, 128u, 128u, 1u, 191u, 251u, 255u, 255u, + 128u, 128u, 128u, 128u, 128u, 128u, 223u, 165u, + 249u, 255u, 213u, 255u, 128u, 128u, 128u, 128u, + 128u, 141u, 124u, 248u, 255u, 255u, 128u, 128u, + 128u, 128u, 128u, 128u, 1u, 16u, 248u, 255u, + 255u, 128u, 128u, 128u, 128u, 128u, 128u, 190u, + 36u, 230u, 255u, 236u, 255u, 128u, 128u, 128u, + 128u, 128u, 149u, 1u, 255u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 1u, 226u, 255u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 247u, 192u, 255u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 240u, 128u, 255u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 1u, 134u, + 252u, 255u, 255u, 128u, 128u, 128u, 128u, 128u, + 128u, 213u, 62u, 250u, 255u, 255u, 128u, 128u, + 128u, 128u, 128u, 128u, 55u, 93u, 255u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 202u, 24u, 213u, 235u, 186u, 191u, 220u, 160u, + 240u, 175u, 255u, 126u, 38u, 182u, 232u, 169u, + 184u, 228u, 174u, 255u, 187u, 128u, 61u, 46u, + 
138u, 219u, 151u, 178u, 240u, 170u, 255u, 216u, + 128u, 1u, 112u, 230u, 250u, 199u, 191u, 247u, + 159u, 255u, 255u, 128u, 166u, 109u, 228u, 252u, + 211u, 215u, 255u, 174u, 128u, 128u, 128u, 39u, + 77u, 162u, 232u, 172u, 180u, 245u, 178u, 255u, + 255u, 128u, 1u, 52u, 220u, 246u, 198u, 199u, + 249u, 220u, 255u, 255u, 128u, 124u, 74u, 191u, + 243u, 183u, 193u, 250u, 221u, 255u, 255u, 128u, + 24u, 71u, 130u, 219u, 154u, 170u, 243u, 182u, + 255u, 255u, 128u, 1u, 182u, 225u, 249u, 219u, + 240u, 255u, 224u, 128u, 128u, 128u, 149u, 150u, + 226u, 252u, 216u, 205u, 255u, 171u, 128u, 128u, + 128u, 28u, 108u, 170u, 242u, 183u, 194u, 254u, + 223u, 255u, 255u, 128u, 1u, 81u, 230u, 252u, + 204u, 203u, 255u, 192u, 128u, 128u, 128u, 123u, + 102u, 209u, 247u, 188u, 196u, 255u, 233u, 128u, + 128u, 128u, 20u, 95u, 153u, 243u, 164u, 173u, + 255u, 203u, 128u, 128u, 128u, 1u, 222u, 248u, + 255u, 216u, 213u, 128u, 128u, 128u, 128u, 128u, + 168u, 175u, 246u, 252u, 235u, 205u, 255u, 255u, + 128u, 128u, 128u, 47u, 116u, 215u, 255u, 211u, + 212u, 255u, 255u, 128u, 128u, 128u, 1u, 121u, + 236u, 253u, 212u, 214u, 255u, 255u, 128u, 128u, + 128u, 141u, 84u, 213u, 252u, 201u, 202u, 255u, + 219u, 128u, 128u, 128u, 42u, 80u, 160u, 240u, + 162u, 185u, 255u, 205u, 128u, 128u, 128u, 1u, + 1u, 255u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 244u, 1u, 255u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 238u, 1u, 255u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, +}; + +static const uint8_t +WUFFS_VP8__COEFF_UPDATE_PROBS[1056] WUFFS_BASE__POTENTIALLY_UNUSED = { + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 176u, 246u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 223u, 241u, 252u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 249u, + 253u, 253u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 244u, 252u, 
255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 234u, 254u, 254u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 253u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 246u, 254u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 239u, 253u, + 254u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 254u, 255u, 254u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 248u, 254u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 251u, + 255u, 254u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 253u, 254u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 251u, 254u, 254u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 254u, 255u, 254u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 254u, + 253u, 255u, 254u, 255u, 255u, 255u, 255u, 255u, + 255u, 250u, 255u, 254u, 255u, 254u, 255u, 255u, + 255u, 255u, 255u, 255u, 254u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 217u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 225u, 252u, 241u, 253u, 255u, + 255u, 254u, 255u, 255u, 255u, 255u, 234u, 250u, + 241u, 250u, 253u, 255u, 253u, 254u, 255u, 255u, + 255u, 255u, 254u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 223u, 254u, 254u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 238u, + 253u, 254u, 254u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 248u, 254u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 249u, 254u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 253u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 247u, 254u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 
255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 253u, 254u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 252u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 254u, 254u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 253u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 254u, + 253u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 250u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 254u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 186u, 251u, 250u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 234u, 251u, 244u, 254u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 251u, 251u, + 243u, 253u, 254u, 255u, 254u, 255u, 255u, 255u, + 255u, 255u, 253u, 254u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 236u, 253u, 254u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 251u, + 253u, 253u, 254u, 254u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 254u, 254u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 254u, 254u, 254u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 254u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 254u, 254u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 254u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 254u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 
255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 248u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 250u, 254u, 252u, 254u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 248u, 254u, + 249u, 253u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 253u, 253u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 246u, 253u, 253u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 252u, + 254u, 251u, 254u, 254u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 254u, 252u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 248u, 254u, 253u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 253u, 255u, 254u, 254u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 251u, 254u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 245u, 251u, + 254u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 253u, 253u, 254u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 251u, 253u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 252u, + 253u, 254u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 254u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 252u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 249u, 255u, 254u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 254u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 253u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 250u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 
255u, 255u, 255u, + 255u, 255u, 254u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, + 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, +}; + +static const uint8_t +WUFFS_VP8__MV_UPDATE_PROBS[38] WUFFS_BASE__POTENTIALLY_UNUSED = { + 237u, 246u, 253u, 253u, 254u, 254u, 254u, 254u, + 254u, 254u, 254u, 254u, 254u, 254u, 250u, 250u, + 252u, 254u, 254u, 231u, 243u, 245u, 253u, 254u, + 254u, 254u, 254u, 254u, 254u, 254u, 254u, 254u, + 254u, 251u, 251u, 254u, 254u, 254u, +}; + +static const uint8_t +WUFFS_VP8__DEFAULT_MV_PROBS[38] WUFFS_BASE__POTENTIALLY_UNUSED = { + 162u, 128u, 225u, 146u, 172u, 147u, 214u, 39u, + 156u, 128u, 129u, 132u, 75u, 145u, 178u, 206u, + 239u, 254u, 254u, 164u, 128u, 204u, 170u, 119u, + 235u, 140u, 230u, 228u, 128u, 130u, 130u, 74u, + 148u, 180u, 203u, 236u, 254u, 254u, +}; + +static const uint8_t +WUFFS_VP8__NORM_LUT[256] WUFFS_BASE__POTENTIALLY_UNUSED = { + 7u, 6u, 6u, 5u, 5u, 5u, 5u, 4u, + 4u, 4u, 4u, 4u, 4u, 4u, 4u, 3u, + 3u, 3u, 3u, 3u, 3u, 3u, 3u, 3u, + 3u, 3u, 3u, 3u, 3u, 3u, 3u, 2u, + 2u, 2u, 2u, 2u, 2u, 2u, 2u, 2u, + 2u, 2u, 2u, 2u, 2u, 2u, 2u, 2u, + 2u, 2u, 2u, 2u, 2u, 2u, 2u, 2u, + 2u, 2u, 2u, 2u, 2u, 2u, 2u, 1u, + 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, + 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, + 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, + 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, + 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, + 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, + 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, + 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, + 0u, 0u, 0u, 0u, 0u, 
0u, 0u, 0u, +}; + +static const uint8_t +WUFFS_VP8__TOKEN_EXTRA_BITS[12] WUFFS_BASE__POTENTIALLY_UNUSED = { + 0u, 0u, 0u, 0u, 0u, 1u, 2u, 3u, + 4u, 5u, 6u, 11u, +}; + +static const uint16_t +WUFFS_VP8__TOKEN_EXTRA_BASE[12] WUFFS_BASE__POTENTIALLY_UNUSED = { + 0u, 1u, 2u, 3u, 4u, 5u, 7u, 11u, + 19u, 35u, 67u, 2048u, +}; + +static const uint8_t +WUFFS_VP8__CAT_PROBS[26] WUFFS_BASE__POTENTIALLY_UNUSED = { + 159u, 165u, 145u, 173u, 148u, 140u, 176u, 155u, + 140u, 135u, 180u, 157u, 141u, 134u, 130u, 254u, + 254u, 243u, 230u, 196u, 177u, 153u, 140u, 133u, + 130u, 129u, +}; + +static const uint8_t +WUFFS_VP8__CAT_PROBS_OFFSET[6] WUFFS_BASE__POTENTIALLY_UNUSED = { + 0u, 1u, 3u, 6u, 10u, 15u, +}; + +static const uint8_t +WUFFS_VP8__CAT_EXTRA_BITS[6] WUFFS_BASE__POTENTIALLY_UNUSED = { + 1u, 2u, 3u, 4u, 5u, 11u, +}; + +static const uint16_t +WUFFS_VP8__CAT_BASE_VALUE[6] WUFFS_BASE__POTENTIALLY_UNUSED = { + 5u, 7u, 11u, 19u, 35u, 67u, +}; + +static const uint8_t +WUFFS_VP8__KF_Y_MODE_PROBS[4] WUFFS_BASE__POTENTIALLY_UNUSED = { + 145u, 156u, 163u, 128u, +}; + +static const uint8_t +WUFFS_VP8__KF_UV_MODE_PROBS[3] WUFFS_BASE__POTENTIALLY_UNUSED = { + 142u, 114u, 183u, +}; + +static const uint8_t +WUFFS_VP8__KF_B_MODE_PROBS[900] WUFFS_BASE__POTENTIALLY_UNUSED = { + 231u, 120u, 48u, 89u, 115u, 113u, 120u, 152u, + 112u, 152u, 179u, 64u, 126u, 170u, 118u, 46u, + 70u, 95u, 175u, 69u, 143u, 80u, 85u, 82u, + 72u, 155u, 103u, 56u, 58u, 10u, 171u, 218u, + 189u, 17u, 13u, 152u, 144u, 71u, 10u, 38u, + 171u, 213u, 144u, 34u, 26u, 114u, 26u, 17u, + 163u, 44u, 195u, 21u, 10u, 173u, 121u, 24u, + 80u, 195u, 26u, 62u, 44u, 64u, 85u, 170u, + 46u, 55u, 19u, 136u, 160u, 33u, 206u, 71u, + 63u, 20u, 8u, 114u, 114u, 208u, 12u, 9u, + 226u, 81u, 40u, 11u, 96u, 182u, 84u, 29u, + 16u, 36u, 134u, 183u, 89u, 137u, 98u, 101u, + 106u, 165u, 148u, 72u, 187u, 100u, 130u, 157u, + 111u, 32u, 75u, 80u, 66u, 102u, 167u, 99u, + 74u, 62u, 40u, 234u, 128u, 41u, 53u, 9u, + 178u, 241u, 141u, 26u, 8u, 107u, 104u, 79u, + 
12u, 27u, 217u, 255u, 87u, 17u, 7u, 74u, + 43u, 26u, 146u, 73u, 166u, 49u, 23u, 157u, + 65u, 38u, 105u, 160u, 51u, 52u, 31u, 115u, + 128u, 87u, 68u, 71u, 44u, 114u, 51u, 15u, + 186u, 23u, 47u, 41u, 14u, 110u, 182u, 183u, + 21u, 17u, 194u, 66u, 45u, 25u, 102u, 197u, + 189u, 23u, 18u, 22u, 88u, 88u, 147u, 150u, + 42u, 46u, 45u, 196u, 205u, 43u, 97u, 183u, + 117u, 85u, 38u, 35u, 179u, 61u, 39u, 53u, + 200u, 87u, 26u, 21u, 43u, 232u, 171u, 56u, + 34u, 51u, 104u, 114u, 102u, 29u, 93u, 77u, + 107u, 54u, 32u, 26u, 51u, 1u, 81u, 43u, + 31u, 39u, 28u, 85u, 171u, 58u, 165u, 90u, + 98u, 64u, 34u, 22u, 116u, 206u, 23u, 34u, + 43u, 166u, 73u, 68u, 25u, 106u, 22u, 64u, + 171u, 36u, 225u, 114u, 34u, 19u, 21u, 102u, + 132u, 188u, 16u, 76u, 124u, 62u, 18u, 78u, + 95u, 85u, 57u, 50u, 48u, 51u, 193u, 101u, + 35u, 159u, 215u, 111u, 89u, 46u, 111u, 60u, + 148u, 31u, 172u, 219u, 228u, 21u, 18u, 111u, + 112u, 113u, 77u, 85u, 179u, 255u, 38u, 120u, + 114u, 40u, 42u, 1u, 196u, 245u, 209u, 10u, + 25u, 109u, 100u, 80u, 8u, 43u, 154u, 1u, + 51u, 26u, 71u, 88u, 43u, 29u, 140u, 166u, + 213u, 37u, 43u, 154u, 61u, 63u, 30u, 155u, + 67u, 45u, 68u, 1u, 209u, 142u, 78u, 78u, + 16u, 255u, 128u, 34u, 197u, 171u, 41u, 40u, + 5u, 102u, 211u, 183u, 4u, 1u, 221u, 51u, + 50u, 17u, 168u, 209u, 192u, 23u, 25u, 82u, + 125u, 98u, 42u, 88u, 104u, 85u, 117u, 175u, + 82u, 95u, 84u, 53u, 89u, 128u, 100u, 113u, + 101u, 45u, 75u, 79u, 123u, 47u, 51u, 128u, + 81u, 171u, 1u, 57u, 17u, 5u, 71u, 102u, + 57u, 53u, 41u, 49u, 115u, 21u, 2u, 10u, + 102u, 255u, 166u, 23u, 6u, 38u, 33u, 13u, + 121u, 57u, 73u, 26u, 1u, 85u, 41u, 10u, + 67u, 138u, 77u, 110u, 90u, 47u, 114u, 101u, + 29u, 16u, 10u, 85u, 128u, 101u, 196u, 26u, + 57u, 18u, 10u, 102u, 102u, 213u, 34u, 20u, + 43u, 117u, 20u, 15u, 36u, 163u, 128u, 68u, + 1u, 26u, 138u, 31u, 36u, 171u, 27u, 166u, + 38u, 44u, 229u, 67u, 87u, 58u, 169u, 82u, + 115u, 26u, 59u, 179u, 63u, 59u, 90u, 180u, + 59u, 166u, 93u, 73u, 154u, 40u, 40u, 21u, + 116u, 143u, 209u, 34u, 39u, 175u, 57u, 
46u, + 22u, 24u, 128u, 1u, 54u, 17u, 37u, 47u, + 15u, 16u, 183u, 34u, 223u, 49u, 45u, 183u, + 46u, 17u, 33u, 183u, 6u, 98u, 15u, 32u, + 183u, 65u, 32u, 73u, 115u, 28u, 128u, 23u, + 128u, 205u, 40u, 3u, 9u, 115u, 51u, 192u, + 18u, 6u, 223u, 87u, 37u, 9u, 115u, 59u, + 77u, 64u, 21u, 47u, 104u, 55u, 44u, 218u, + 9u, 54u, 53u, 130u, 226u, 64u, 90u, 70u, + 205u, 40u, 41u, 23u, 26u, 57u, 54u, 57u, + 112u, 184u, 5u, 41u, 38u, 166u, 213u, 30u, + 34u, 26u, 133u, 152u, 116u, 10u, 32u, 134u, + 75u, 32u, 12u, 51u, 192u, 255u, 160u, 43u, + 51u, 39u, 19u, 53u, 221u, 26u, 114u, 32u, + 73u, 255u, 31u, 9u, 65u, 234u, 2u, 15u, + 1u, 118u, 73u, 88u, 31u, 35u, 67u, 102u, + 85u, 55u, 186u, 85u, 56u, 21u, 23u, 111u, + 59u, 205u, 45u, 37u, 192u, 55u, 38u, 70u, + 124u, 73u, 102u, 1u, 34u, 98u, 102u, 61u, + 71u, 37u, 34u, 53u, 31u, 243u, 192u, 69u, + 60u, 71u, 38u, 73u, 119u, 28u, 222u, 37u, + 68u, 45u, 128u, 34u, 1u, 47u, 11u, 245u, + 171u, 62u, 17u, 19u, 70u, 146u, 85u, 55u, + 62u, 70u, 75u, 15u, 9u, 9u, 64u, 255u, + 184u, 119u, 16u, 37u, 43u, 37u, 154u, 100u, + 163u, 85u, 160u, 1u, 63u, 9u, 92u, 136u, + 28u, 64u, 32u, 201u, 85u, 86u, 6u, 28u, + 5u, 64u, 255u, 25u, 248u, 1u, 56u, 8u, + 17u, 132u, 137u, 255u, 55u, 116u, 128u, 58u, + 15u, 20u, 82u, 135u, 57u, 26u, 121u, 40u, + 164u, 50u, 31u, 137u, 154u, 133u, 25u, 35u, + 218u, 51u, 103u, 44u, 131u, 131u, 123u, 31u, + 6u, 158u, 86u, 40u, 64u, 135u, 148u, 224u, + 45u, 183u, 128u, 22u, 26u, 17u, 131u, 240u, + 154u, 14u, 1u, 209u, 83u, 12u, 13u, 54u, + 192u, 255u, 68u, 47u, 28u, 45u, 16u, 21u, + 91u, 64u, 222u, 7u, 1u, 197u, 56u, 21u, + 39u, 155u, 60u, 138u, 23u, 102u, 213u, 85u, + 26u, 85u, 85u, 128u, 128u, 32u, 146u, 171u, + 18u, 11u, 7u, 63u, 144u, 171u, 4u, 4u, + 246u, 35u, 27u, 10u, 146u, 174u, 171u, 12u, + 26u, 128u, 190u, 80u, 35u, 99u, 180u, 80u, + 126u, 54u, 45u, 85u, 126u, 47u, 87u, 176u, + 51u, 41u, 20u, 32u, 101u, 75u, 128u, 139u, + 118u, 146u, 116u, 128u, 85u, 56u, 41u, 15u, + 176u, 236u, 85u, 37u, 9u, 62u, 146u, 36u, + 19u, 30u, 
171u, 255u, 97u, 27u, 20u, 71u, + 30u, 17u, 119u, 118u, 255u, 17u, 18u, 138u, + 101u, 38u, 60u, 138u, 55u, 70u, 43u, 26u, + 142u, 138u, 45u, 61u, 62u, 219u, 1u, 81u, + 188u, 64u, 32u, 41u, 20u, 117u, 151u, 142u, + 20u, 21u, 163u, 112u, 19u, 12u, 61u, 195u, + 128u, 48u, 4u, 24u, +}; + +static const uint8_t +WUFFS_VP8__RENORM_SHIFT_256[256] WUFFS_BASE__POTENTIALLY_UNUSED = { + 7u, 6u, 6u, 5u, 5u, 5u, 5u, 4u, + 4u, 4u, 4u, 4u, 4u, 4u, 4u, 3u, + 3u, 3u, 3u, 3u, 3u, 3u, 3u, 3u, + 3u, 3u, 3u, 3u, 3u, 3u, 3u, 2u, + 2u, 2u, 2u, 2u, 2u, 2u, 2u, 2u, + 2u, 2u, 2u, 2u, 2u, 2u, 2u, 2u, + 2u, 2u, 2u, 2u, 2u, 2u, 2u, 2u, + 2u, 2u, 2u, 2u, 2u, 2u, 2u, 1u, + 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, + 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, + 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, + 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, + 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, + 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, + 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, + 1u, 1u, 1u, 1u, 1u, 1u, 1u, 0u, + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, +}; + +static const uint8_t +WUFFS_VP8__RENORM_RANGE_256[256] WUFFS_BASE__POTENTIALLY_UNUSED = { + 127u, 127u, 191u, 127u, 159u, 191u, 223u, 127u, + 143u, 159u, 175u, 191u, 207u, 223u, 239u, 127u, + 135u, 143u, 151u, 159u, 167u, 175u, 183u, 191u, + 199u, 207u, 215u, 223u, 231u, 239u, 247u, 127u, + 131u, 135u, 139u, 143u, 147u, 151u, 155u, 159u, + 163u, 167u, 171u, 175u, 179u, 183u, 187u, 191u, + 195u, 199u, 203u, 207u, 211u, 215u, 219u, 223u, + 227u, 231u, 235u, 239u, 243u, 247u, 251u, 127u, + 129u, 131u, 133u, 135u, 137u, 139u, 
141u, 143u, + 145u, 147u, 149u, 151u, 153u, 155u, 157u, 159u, + 161u, 163u, 165u, 167u, 169u, 171u, 173u, 175u, + 177u, 179u, 181u, 183u, 185u, 187u, 189u, 191u, + 193u, 195u, 197u, 199u, 201u, 203u, 205u, 207u, + 209u, 211u, 213u, 215u, 217u, 219u, 221u, 223u, + 225u, 227u, 229u, 231u, 233u, 235u, 237u, 239u, + 241u, 243u, 245u, 247u, 249u, 251u, 253u, 127u, + 128u, 129u, 130u, 131u, 132u, 133u, 134u, 135u, + 136u, 137u, 138u, 139u, 140u, 141u, 142u, 143u, + 144u, 145u, 146u, 147u, 148u, 149u, 150u, 151u, + 152u, 153u, 154u, 155u, 156u, 157u, 158u, 159u, + 160u, 161u, 162u, 163u, 164u, 165u, 166u, 167u, + 168u, 169u, 170u, 171u, 172u, 173u, 174u, 175u, + 176u, 177u, 178u, 179u, 180u, 181u, 182u, 183u, + 184u, 185u, 186u, 187u, 188u, 189u, 190u, 191u, + 192u, 193u, 194u, 195u, 196u, 197u, 198u, 199u, + 200u, 201u, 202u, 203u, 204u, 205u, 206u, 207u, + 208u, 209u, 210u, 211u, 212u, 213u, 214u, 215u, + 216u, 217u, 218u, 219u, 220u, 221u, 222u, 223u, + 224u, 225u, 226u, 227u, 228u, 229u, 230u, 231u, + 232u, 233u, 234u, 235u, 236u, 237u, 238u, 239u, + 240u, 241u, 242u, 243u, 244u, 245u, 246u, 247u, + 248u, 249u, 250u, 251u, 252u, 253u, 254u, 254u, +}; + // ---------------- Private Initializer Prototypes // ---------------- Private Function Prototypes WUFFS_BASE__GENERATED_C_CODE -static wuffs_base__status -wuffs_vp8__decoder__do_decode_image_config( +static wuffs_base__empty_struct +wuffs_vp8__decoder__bool_init( + wuffs_vp8__decoder* self); + +WUFFS_BASE__GENERATED_C_CODE +static uint32_t +wuffs_vp8__decoder__bool_read_bool( wuffs_vp8__decoder* self, - wuffs_base__image_config* a_dst, - wuffs_base__io_buffer* a_src); + uint8_t a_prob); WUFFS_BASE__GENERATED_C_CODE -static wuffs_base__status -wuffs_vp8__decoder__do_decode_frame_config( +static uint32_t +wuffs_vp8__decoder__bool_read_literal( wuffs_vp8__decoder* self, - wuffs_base__frame_config* a_dst, - wuffs_base__io_buffer* a_src); + uint32_t a_n); WUFFS_BASE__GENERATED_C_CODE -static wuffs_base__status 
-wuffs_vp8__decoder__do_decode_frame( +static int32_t +wuffs_vp8__decoder__bool_read_signed( wuffs_vp8__decoder* self, - wuffs_base__pixel_buffer* a_dst, - wuffs_base__io_buffer* a_src, - wuffs_base__pixel_blend a_blend, - wuffs_base__slice_u8 a_workbuf, - wuffs_base__decode_frame_options* a_opts); + uint32_t a_n); WUFFS_BASE__GENERATED_C_CODE -static wuffs_base__status -wuffs_vp8__decoder__make_a_placeholder_gradient( +static wuffs_base__empty_struct +wuffs_vp8__decoder__p1_init( + wuffs_vp8__decoder* self); + +WUFFS_BASE__GENERATED_C_CODE +static uint32_t +wuffs_vp8__decoder__p1_read_bool( wuffs_vp8__decoder* self, - wuffs_base__pixel_buffer* a_dst); + uint8_t a_prob); -// ---------------- VTables +WUFFS_BASE__GENERATED_C_CODE +static uint32_t +wuffs_vp8__decoder__p1_read_sign( + wuffs_vp8__decoder* self); -const wuffs_base__image_decoder__func_ptrs -wuffs_vp8__decoder__func_ptrs_for__wuffs_base__image_decoder = { - (wuffs_base__status(*)(void*, - wuffs_base__pixel_buffer*, - wuffs_base__io_buffer*, - wuffs_base__pixel_blend, - wuffs_base__slice_u8, - wuffs_base__decode_frame_options*))(&wuffs_vp8__decoder__decode_frame), - (wuffs_base__status(*)(void*, - wuffs_base__frame_config*, - wuffs_base__io_buffer*))(&wuffs_vp8__decoder__decode_frame_config), - (wuffs_base__status(*)(void*, - wuffs_base__image_config*, - wuffs_base__io_buffer*))(&wuffs_vp8__decoder__decode_image_config), - (wuffs_base__rect_ie_u32(*)(const void*))(&wuffs_vp8__decoder__frame_dirty_rect), - (uint64_t(*)(const void*, - uint32_t))(&wuffs_vp8__decoder__get_quirk), - (uint32_t(*)(const void*))(&wuffs_vp8__decoder__num_animation_loops), - (uint64_t(*)(const void*))(&wuffs_vp8__decoder__num_decoded_frame_configs), - (uint64_t(*)(const void*))(&wuffs_vp8__decoder__num_decoded_frames), - (wuffs_base__status(*)(void*, - uint64_t, - uint64_t))(&wuffs_vp8__decoder__restart_frame), - (wuffs_base__status(*)(void*, - uint32_t, - uint64_t))(&wuffs_vp8__decoder__set_quirk), - 
(wuffs_base__empty_struct(*)(void*, - uint32_t, - bool))(&wuffs_vp8__decoder__set_report_metadata), - (wuffs_base__status(*)(void*, - wuffs_base__io_buffer*, - wuffs_base__more_information*, - wuffs_base__io_buffer*))(&wuffs_vp8__decoder__tell_me_more), - (wuffs_base__range_ii_u64(*)(const void*))(&wuffs_vp8__decoder__workbuf_len), -}; +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__p1_fill_buffer( + wuffs_vp8__decoder* self, + wuffs_base__io_buffer* a_src, + uint32_t a_n); -// ---------------- Initializer Implementations +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__p1_fill_from_workbuf( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf); -wuffs_base__status WUFFS_BASE__WARN_UNUSED_RESULT -wuffs_vp8__decoder__initialize( +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__bool_fill_from_workbuf( wuffs_vp8__decoder* self, - size_t sizeof_star_self, - uint64_t wuffs_version, - uint32_t options){ - if (!self) { - return wuffs_base__make_status(wuffs_base__error__bad_receiver); - } - if (sizeof(*self) != sizeof_star_self) { - return wuffs_base__make_status(wuffs_base__error__bad_sizeof_receiver); - } - if (((wuffs_version >> 32) != WUFFS_VERSION_MAJOR) || - (((wuffs_version >> 16) & 0xFFFF) > WUFFS_VERSION_MINOR)) { - return wuffs_base__make_status(wuffs_base__error__bad_wuffs_version); - } + wuffs_base__slice_u8 a_workbuf); - if ((options & WUFFS_INITIALIZE__ALREADY_ZEROED) != 0) { - // The whole point of this if-check is to detect an uninitialized *self. - // We disable the warning on GCC. Clang-5.0 does not have this warning. 
-#if !defined(__clang__) && defined(__GNUC__) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#endif - if (self->private_impl.magic != 0) { - return wuffs_base__make_status(wuffs_base__error__initialize_falsely_claimed_already_zeroed); - } -#if !defined(__clang__) && defined(__GNUC__) -#pragma GCC diagnostic pop -#endif - } else { - if ((options & WUFFS_INITIALIZE__LEAVE_INTERNAL_BUFFERS_UNINITIALIZED) == 0) { - memset(self, 0, sizeof(*self)); - options |= WUFFS_INITIALIZE__ALREADY_ZEROED; - } else { - memset(&(self->private_impl), 0, sizeof(self->private_impl)); - } - } +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__apply_simple_filter_all( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf); - self->private_impl.magic = WUFFS_BASE__MAGIC; - self->private_impl.vtable_for__wuffs_base__image_decoder.vtable_name = - wuffs_base__image_decoder__vtable_name; - self->private_impl.vtable_for__wuffs_base__image_decoder.function_pointers = - (const void*)(&wuffs_vp8__decoder__func_ptrs_for__wuffs_base__image_decoder); - return wuffs_base__make_status(NULL); -} +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__apply_simple_filter_row( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint32_t a_mby); -wuffs_vp8__decoder* -wuffs_vp8__decoder__alloc(void) { - wuffs_vp8__decoder* x = - (wuffs_vp8__decoder*)(calloc(1, sizeof(wuffs_vp8__decoder))); - if (!x) { - return NULL; - } - if (wuffs_vp8__decoder__initialize( - x, sizeof(wuffs_vp8__decoder), WUFFS_VERSION, WUFFS_INITIALIZE__ALREADY_ZEROED).repr) { - free(x); - return NULL; - } - return x; -} +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__simple_vfilter_16( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_limit); -size_t -sizeof__wuffs_vp8__decoder(void) { - return sizeof(wuffs_vp8__decoder); -} 
+WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__simple_vfilter_16__choosy_default( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_limit); -// ---------------- Function Implementations +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_inner_16( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); -// -------- func vp8.decoder.get_quirk +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_inner_16__choosy_default( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); WUFFS_BASE__GENERATED_C_CODE -WUFFS_BASE__MAYBE_STATIC uint64_t -wuffs_vp8__decoder__get_quirk( - const wuffs_vp8__decoder* self, - uint32_t a_key) { - if (!self) { - return 0; - } - if ((self->private_impl.magic != WUFFS_BASE__MAGIC) && - (self->private_impl.magic != WUFFS_BASE__DISABLED)) { - return 0; - } +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_mb_16( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); - return 0u; -} +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_mb_16__choosy_default( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); -// -------- func vp8.decoder.set_quirk +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_mb_8( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); WUFFS_BASE__GENERATED_C_CODE -WUFFS_BASE__MAYBE_STATIC 
wuffs_base__status -wuffs_vp8__decoder__set_quirk( +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_mb_8__choosy_default( wuffs_vp8__decoder* self, - uint32_t a_key, - uint64_t a_value) { - if (!self) { - return wuffs_base__make_status(wuffs_base__error__bad_receiver); - } - if (self->private_impl.magic != WUFFS_BASE__MAGIC) { - return wuffs_base__make_status( - (self->private_impl.magic == WUFFS_BASE__DISABLED) - ? wuffs_base__error__disabled_by_previous_error - : wuffs_base__error__initialize_not_called); - } + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); - return wuffs_base__make_status(wuffs_base__error__unsupported_option); -} +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_mb_16( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); -// -------- func vp8.decoder.decode_image_config +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_mb_16__choosy_default( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); WUFFS_BASE__GENERATED_C_CODE -WUFFS_BASE__MAYBE_STATIC wuffs_base__status -wuffs_vp8__decoder__decode_image_config( +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_mb_8( wuffs_vp8__decoder* self, - wuffs_base__image_config* a_dst, - wuffs_base__io_buffer* a_src) { - if (!self) { - return wuffs_base__make_status(wuffs_base__error__bad_receiver); - } - if (self->private_impl.magic != WUFFS_BASE__MAGIC) { - return wuffs_base__make_status( - (self->private_impl.magic == WUFFS_BASE__DISABLED) - ? 
wuffs_base__error__disabled_by_previous_error - : wuffs_base__error__initialize_not_called); - } - if (!a_src) { - self->private_impl.magic = WUFFS_BASE__DISABLED; - return wuffs_base__make_status(wuffs_base__error__bad_argument); - } - if ((self->private_impl.active_coroutine != 0) && - (self->private_impl.active_coroutine != 1)) { - self->private_impl.magic = WUFFS_BASE__DISABLED; - return wuffs_base__make_status(wuffs_base__error__interleaved_coroutine_calls); - } - self->private_impl.active_coroutine = 0; - wuffs_base__status status = wuffs_base__make_status(NULL); + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); - wuffs_base__status v_status = wuffs_base__make_status(NULL); +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_mb_8__choosy_default( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); - uint32_t coro_susp_point = self->private_impl.p_decode_image_config; - switch (coro_susp_point) { - WUFFS_BASE__COROUTINE_SUSPENSION_POINT_0; +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_inner_16( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); - while (true) { - { - wuffs_base__status t_0 = wuffs_vp8__decoder__do_decode_image_config(self, a_dst, a_src); - v_status = t_0; - } - if ((v_status.repr == wuffs_base__suspension__short_read) && (a_src && a_src->meta.closed)) { +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_inner_16__choosy_default( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct 
+wuffs_vp8__decoder__normal_hfilter_inner_8( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_inner_8__choosy_default( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_inner_8( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_inner_8__choosy_default( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_mb_uv( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_u_off, + uint64_t a_v_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_mb_uv__choosy_default( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_u_off, + uint64_t a_v_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_mb_uv( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_u_off, + uint64_t a_v_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_mb_uv__choosy_default( + wuffs_vp8__decoder* self, + 
wuffs_base__slice_u8 a_workbuf, + uint64_t a_u_off, + uint64_t a_v_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_inner_uv( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_u_off, + uint64_t a_v_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_inner_uv__choosy_default( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_u_off, + uint64_t a_v_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_inner_uv( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_u_off, + uint64_t a_v_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_inner_uv__choosy_default( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_u_off, + uint64_t a_v_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__filter2( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_idx, + uint64_t a_step, + uint32_t a_limit); + +WUFFS_BASE__GENERATED_C_CODE +static uint32_t +wuffs_vp8__decoder__clamp15_asr3( + wuffs_vp8__decoder* self, + uint32_t a_v); + +WUFFS_BASE__GENERATED_C_CODE +static uint32_t +wuffs_vp8__decoder__clamp127( + wuffs_vp8__decoder* self, + uint32_t a_v); + +WUFFS_BASE__GENERATED_C_CODE +static uint32_t +wuffs_vp8__decoder__abs_u32( + wuffs_vp8__decoder* self, + uint32_t a_v); + +WUFFS_BASE__GENERATED_C_CODE +static uint32_t +wuffs_vp8__decoder__clamp255( + wuffs_vp8__decoder* self, + uint32_t a_v); + 
+WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__apply_normal_filter_all( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__apply_normal_filter_row( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint32_t a_mby); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__filter246( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_idx, + uint64_t a_step, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel, + bool a_four_not_six); + +WUFFS_BASE__GENERATED_C_CODE +static uint32_t +wuffs_vp8__decoder__signed_shift_right_7( + wuffs_vp8__decoder* self, + uint32_t a_v); + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__simple_vfilter_16_arm_neon( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_limit); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_inner_16_arm_neon( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_mb_16_arm_neon( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct 
+wuffs_vp8__decoder__normal_vfilter_mb_8_arm_neon( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_inner_8_arm_neon( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_mb_8_arm_neon( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_inner_8_arm_neon( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_mb_16_arm_neon( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_inner_16_arm_neon( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + 
uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_mb_uv_x86_avx2( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_u_off, + uint64_t a_v_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_inner_uv_x86_avx2( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_u_off, + uint64_t a_v_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_mb_uv_x86_avx2( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_u_off, + uint64_t a_v_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_inner_uv_x86_avx2( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_u_off, + uint64_t a_v_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__simple_vfilter_16_x86_sse42( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t 
a_limit); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_inner_16_x86_sse42( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_mb_16_x86_sse42( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_mb_8_x86_sse42( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_mb_16_x86_sse42( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_mb_8_x86_sse42( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) + +#if 
defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_inner_16_x86_sse42( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_inner_8_x86_sse42( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_inner_8_x86_sse42( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__decode_partition0( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__decode_segmentation( + wuffs_vp8__decoder* self); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__decode_loop_filter( + wuffs_vp8__decoder* self); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__decode_partitions( + wuffs_vp8__decoder* self); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__decode_quant_indices( + wuffs_vp8__decoder* self); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__decode_coeff_prob_updates( + wuffs_vp8__decoder* self); + 
+WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__decode_mb_skip_coeff( + wuffs_vp8__decoder* self); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__compute_dequant_values( + wuffs_vp8__decoder* self); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__precompute_filter_strengths( + wuffs_vp8__decoder* self); + +WUFFS_BASE__GENERATED_C_CODE +static uint32_t +wuffs_vp8__decoder__clamp_qi( + wuffs_vp8__decoder* self, + uint32_t a_qi, + int32_t a_delta); + +WUFFS_BASE__GENERATED_C_CODE +static uint32_t +wuffs_vp8__decoder__asr16( + wuffs_vp8__decoder* self, + uint32_t a_v); + +WUFFS_BASE__GENERATED_C_CODE +static uint32_t +wuffs_vp8__decoder__asr3( + wuffs_vp8__decoder* self, + uint32_t a_v); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__idct_add( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_dst, + uint32_t a_stride, + uint32_t a_coeff_offset); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__idct_add__choosy_default( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_dst, + uint32_t a_stride, + uint32_t a_coeff_offset); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__idct_dc_add( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_dst, + uint32_t a_stride, + uint32_t a_coeff_offset); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__idct_dc_add__choosy_default( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_dst, + uint32_t a_stride, + uint32_t a_coeff_offset); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__idct_add_pair( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_dst, + uint32_t a_stride, + uint32_t a_coeff_offset_a, + uint32_t a_coeff_offset_b); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct 
+wuffs_vp8__decoder__idct_add_pair__choosy_default( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_dst, + uint32_t a_stride, + uint32_t a_coeff_offset_a, + uint32_t a_coeff_offset_b); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__idct_dc_add_pair( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_dst, + uint32_t a_stride, + uint32_t a_coeff_offset_a, + uint32_t a_coeff_offset_b); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__idct_dc_add_pair__choosy_default( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_dst, + uint32_t a_stride, + uint32_t a_coeff_offset_a, + uint32_t a_coeff_offset_b); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__wht( + wuffs_vp8__decoder* self, + uint32_t a_coeff_offset); + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__idct_add_arm_neon( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_dst, + uint32_t a_stride, + uint32_t a_coeff_offset); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__idct_dc_add_arm_neon( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_dst, + uint32_t a_stride, + uint32_t a_coeff_offset); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__idct_add_pair_x86_avx2( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_dst, + uint32_t a_stride, + uint32_t a_coeff_offset_a, + uint32_t a_coeff_offset_b); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct 
+wuffs_vp8__decoder__idct_dc_add_pair_x86_avx2( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_dst, + uint32_t a_stride, + uint32_t a_coeff_offset_a, + uint32_t a_coeff_offset_b); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__idct_add_x86_sse42( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_dst, + uint32_t a_stride, + uint32_t a_coeff_offset); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__idct_dc_add_x86_sse42( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_dst, + uint32_t a_stride, + uint32_t a_coeff_offset); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__status +wuffs_vp8__decoder__decode_frame_mb( + wuffs_vp8__decoder* self, + wuffs_base__io_buffer* a_src, + wuffs_base__pixel_buffer* a_dst, + wuffs_base__slice_u8 a_workbuf); + +WUFFS_BASE__GENERATED_C_CODE_NOINLINE +static wuffs_base__empty_struct +wuffs_vp8__decoder__decode_one_mb( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__decode_luma_mode( + wuffs_vp8__decoder* self); + +WUFFS_BASE__GENERATED_C_CODE +static uint32_t +wuffs_vp8__decoder__decode_sub_block_mode( + wuffs_vp8__decoder* self, + uint32_t a_prob_offset); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__decode_chroma_mode( + wuffs_vp8__decoder* self); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__clear_mb_nz_context( + wuffs_vp8__decoder* self); + +WUFFS_BASE__GENERATED_C_CODE_NOINLINE +static wuffs_base__empty_struct +wuffs_vp8__decoder__decode_mb_coefficients( + wuffs_vp8__decoder* 
self); + +WUFFS_BASE__GENERATED_C_CODE +static uint32_t +wuffs_vp8__decoder__decode_coeff_category( + wuffs_vp8__decoder* self, + uint32_t a_prob_idx); + +WUFFS_BASE__GENERATED_C_CODE_ALWAYS_INLINE +static uint32_t +wuffs_vp8__decoder__decode_block_coeffs( + wuffs_vp8__decoder* self, + uint32_t a_block_offset, + uint32_t a_block_type, + uint32_t a_start_coeff, + uint32_t a_init_ctx); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__predict_16x16( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint8_t a_mode); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__predict_16x16__choosy_default( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint8_t a_mode); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__predict_8x8( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint8_t a_mode, + uint64_t a_plane_offset); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__predict_8x8__choosy_default( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint8_t a_mode, + uint64_t a_plane_offset); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__predict_4x4( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint32_t a_block_idx, + uint8_t a_mode); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__pred4x4_store( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_off, + uint32_t a_v00, + uint32_t a_v01, + uint32_t a_v02, + uint32_t a_v03, + uint32_t a_v10, + uint32_t a_v11, + uint32_t a_v12, + uint32_t a_v13, + uint32_t a_v20, + uint32_t a_v21, + uint32_t a_v22, + uint32_t a_v23, + uint32_t a_v30, + uint32_t a_v31, + uint32_t a_v32, + uint32_t a_v33); + +WUFFS_BASE__GENERATED_C_CODE +static uint32_t +wuffs_vp8__decoder__avg2( + const wuffs_vp8__decoder* self, + uint32_t a_a, + 
uint32_t a_b); + +WUFFS_BASE__GENERATED_C_CODE +static uint32_t +wuffs_vp8__decoder__avg3( + const wuffs_vp8__decoder* self, + uint32_t a_a, + uint32_t a_b, + uint32_t a_c); + +WUFFS_BASE__GENERATED_C_CODE +static uint32_t +wuffs_vp8__decoder__clip8( + const wuffs_vp8__decoder* self, + uint32_t a_v); + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__predict_16x16_arm_neon( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint8_t a_mode); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__predict_8x8_arm_neon( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint8_t a_mode, + uint64_t a_plane_offset); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__predict_16x16_x86_sse42( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint8_t a_mode); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__predict_8x8_x86_sse42( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint8_t a_mode, + uint64_t a_plane_offset); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__status +wuffs_vp8__decoder__do_decode_image_config( + wuffs_vp8__decoder* self, + wuffs_base__image_config* a_dst, + wuffs_base__io_buffer* a_src); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__status +wuffs_vp8__decoder__do_decode_frame_config( + wuffs_vp8__decoder* self, + wuffs_base__frame_config* a_dst, + wuffs_base__io_buffer* a_src); + 
+WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__status +wuffs_vp8__decoder__do_decode_frame( + wuffs_vp8__decoder* self, + wuffs_base__pixel_buffer* a_dst, + wuffs_base__io_buffer* a_src, + wuffs_base__pixel_blend a_blend, + wuffs_base__slice_u8 a_workbuf, + wuffs_base__decode_frame_options* a_opts); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__init_mb_coeffs( + wuffs_vp8__decoder* self); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__init_coeff_probs( + wuffs_vp8__decoder* self); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__status +wuffs_vp8__decoder__swizzle_mb_row( + wuffs_vp8__decoder* self, + wuffs_base__pixel_buffer* a_dst, + wuffs_base__slice_u8 a_workbuf, + uint32_t a_mby, + bool a_is_last); + +// ---------------- VTables + +const wuffs_base__image_decoder__func_ptrs +wuffs_vp8__decoder__func_ptrs_for__wuffs_base__image_decoder = { + (wuffs_base__status(*)(void*, + wuffs_base__pixel_buffer*, + wuffs_base__io_buffer*, + wuffs_base__pixel_blend, + wuffs_base__slice_u8, + wuffs_base__decode_frame_options*))(&wuffs_vp8__decoder__decode_frame), + (wuffs_base__status(*)(void*, + wuffs_base__frame_config*, + wuffs_base__io_buffer*))(&wuffs_vp8__decoder__decode_frame_config), + (wuffs_base__status(*)(void*, + wuffs_base__image_config*, + wuffs_base__io_buffer*))(&wuffs_vp8__decoder__decode_image_config), + (wuffs_base__rect_ie_u32(*)(const void*))(&wuffs_vp8__decoder__frame_dirty_rect), + (uint64_t(*)(const void*, + uint32_t))(&wuffs_vp8__decoder__get_quirk), + (uint32_t(*)(const void*))(&wuffs_vp8__decoder__num_animation_loops), + (uint64_t(*)(const void*))(&wuffs_vp8__decoder__num_decoded_frame_configs), + (uint64_t(*)(const void*))(&wuffs_vp8__decoder__num_decoded_frames), + (wuffs_base__status(*)(void*, + uint64_t, + uint64_t))(&wuffs_vp8__decoder__restart_frame), + (wuffs_base__status(*)(void*, + uint32_t, + uint64_t))(&wuffs_vp8__decoder__set_quirk), + 
(wuffs_base__empty_struct(*)(void*, + uint32_t, + bool))(&wuffs_vp8__decoder__set_report_metadata), + (wuffs_base__status(*)(void*, + wuffs_base__io_buffer*, + wuffs_base__more_information*, + wuffs_base__io_buffer*))(&wuffs_vp8__decoder__tell_me_more), + (wuffs_base__range_ii_u64(*)(const void*))(&wuffs_vp8__decoder__workbuf_len), +}; + +// ---------------- Initializer Implementations + +wuffs_base__status WUFFS_BASE__WARN_UNUSED_RESULT +wuffs_vp8__decoder__initialize( + wuffs_vp8__decoder* self, + size_t sizeof_star_self, + uint64_t wuffs_version, + uint32_t options){ + if (!self) { + return wuffs_base__make_status(wuffs_base__error__bad_receiver); + } + if (sizeof(*self) != sizeof_star_self) { + return wuffs_base__make_status(wuffs_base__error__bad_sizeof_receiver); + } + if (((wuffs_version >> 32) != WUFFS_VERSION_MAJOR) || + (((wuffs_version >> 16) & 0xFFFF) > WUFFS_VERSION_MINOR)) { + return wuffs_base__make_status(wuffs_base__error__bad_wuffs_version); + } + + if ((options & WUFFS_INITIALIZE__ALREADY_ZEROED) != 0) { + // The whole point of this if-check is to detect an uninitialized *self. + // We disable the warning on GCC. Clang-5.0 does not have this warning. 
+#if !defined(__clang__) && defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + if (self->private_impl.magic != 0) { + return wuffs_base__make_status(wuffs_base__error__initialize_falsely_claimed_already_zeroed); + } +#if !defined(__clang__) && defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + } else { + if ((options & WUFFS_INITIALIZE__LEAVE_INTERNAL_BUFFERS_UNINITIALIZED) == 0) { + memset(self, 0, sizeof(*self)); + options |= WUFFS_INITIALIZE__ALREADY_ZEROED; + } else { + memset(&(self->private_impl), 0, sizeof(self->private_impl)); + } + } + + self->private_impl.choosy_simple_vfilter_16 = &wuffs_vp8__decoder__simple_vfilter_16__choosy_default; + self->private_impl.choosy_normal_vfilter_inner_16 = &wuffs_vp8__decoder__normal_vfilter_inner_16__choosy_default; + self->private_impl.choosy_normal_vfilter_mb_16 = &wuffs_vp8__decoder__normal_vfilter_mb_16__choosy_default; + self->private_impl.choosy_normal_vfilter_mb_8 = &wuffs_vp8__decoder__normal_vfilter_mb_8__choosy_default; + self->private_impl.choosy_normal_hfilter_mb_16 = &wuffs_vp8__decoder__normal_hfilter_mb_16__choosy_default; + self->private_impl.choosy_normal_hfilter_mb_8 = &wuffs_vp8__decoder__normal_hfilter_mb_8__choosy_default; + self->private_impl.choosy_normal_hfilter_inner_16 = &wuffs_vp8__decoder__normal_hfilter_inner_16__choosy_default; + self->private_impl.choosy_normal_hfilter_inner_8 = &wuffs_vp8__decoder__normal_hfilter_inner_8__choosy_default; + self->private_impl.choosy_normal_vfilter_inner_8 = &wuffs_vp8__decoder__normal_vfilter_inner_8__choosy_default; + self->private_impl.choosy_normal_vfilter_mb_uv = &wuffs_vp8__decoder__normal_vfilter_mb_uv__choosy_default; + self->private_impl.choosy_normal_hfilter_mb_uv = &wuffs_vp8__decoder__normal_hfilter_mb_uv__choosy_default; + self->private_impl.choosy_normal_vfilter_inner_uv = &wuffs_vp8__decoder__normal_vfilter_inner_uv__choosy_default; + 
self->private_impl.choosy_normal_hfilter_inner_uv = &wuffs_vp8__decoder__normal_hfilter_inner_uv__choosy_default; + self->private_impl.choosy_idct_add = &wuffs_vp8__decoder__idct_add__choosy_default; + self->private_impl.choosy_idct_dc_add = &wuffs_vp8__decoder__idct_dc_add__choosy_default; + self->private_impl.choosy_idct_add_pair = &wuffs_vp8__decoder__idct_add_pair__choosy_default; + self->private_impl.choosy_idct_dc_add_pair = &wuffs_vp8__decoder__idct_dc_add_pair__choosy_default; + self->private_impl.choosy_predict_16x16 = &wuffs_vp8__decoder__predict_16x16__choosy_default; + self->private_impl.choosy_predict_8x8 = &wuffs_vp8__decoder__predict_8x8__choosy_default; + + self->private_impl.magic = WUFFS_BASE__MAGIC; + self->private_impl.vtable_for__wuffs_base__image_decoder.vtable_name = + wuffs_base__image_decoder__vtable_name; + self->private_impl.vtable_for__wuffs_base__image_decoder.function_pointers = + (const void*)(&wuffs_vp8__decoder__func_ptrs_for__wuffs_base__image_decoder); + return wuffs_base__make_status(NULL); +} + +wuffs_vp8__decoder* +wuffs_vp8__decoder__alloc(void) { + wuffs_vp8__decoder* x = + (wuffs_vp8__decoder*)(calloc(1, sizeof(wuffs_vp8__decoder))); + if (!x) { + return NULL; + } + if (wuffs_vp8__decoder__initialize( + x, sizeof(wuffs_vp8__decoder), WUFFS_VERSION, WUFFS_INITIALIZE__ALREADY_ZEROED).repr) { + free(x); + return NULL; + } + return x; +} + +size_t +sizeof__wuffs_vp8__decoder(void) { + return sizeof(wuffs_vp8__decoder); +} + +// ---------------- Function Implementations + +// -------- func vp8.decoder.bool_init + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__bool_init( + wuffs_vp8__decoder* self) { + uint64_t v_bb = 0; + + self->private_impl.f_bool_range = 254u; + self->private_impl.f_bool_value = 0u; + self->private_impl.f_bool_bits = 0u; + while ((self->private_impl.f_bool_bits <= 48u) && (self->private_impl.f_bool_ri < self->private_impl.f_bool_wi)) { + v_bb = 
((uint64_t)(self->private_data.f_bool_buffer[self->private_impl.f_bool_ri])); + self->private_impl.f_bool_ri += 1u; + self->private_impl.f_bool_value = (((uint64_t)(self->private_impl.f_bool_value << 8u)) | v_bb); + self->private_impl.f_bool_bits += 8u; + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.bool_read_bool + +WUFFS_BASE__GENERATED_C_CODE +static uint32_t +wuffs_vp8__decoder__bool_read_bool( + wuffs_vp8__decoder* self, + uint8_t a_prob) { + uint32_t v_s = 0; + uint32_t v_retval = 0; + uint32_t v_v = 0; + uint32_t v_shift = 0; + uint64_t v_bb = 0; + uint32_t v_pos = 0; + + if (self->private_impl.f_bool_bits < 16u) { + while ((self->private_impl.f_bool_bits <= 48u) && (self->private_impl.f_bool_ri < self->private_impl.f_bool_wi)) { + v_bb = ((uint64_t)(self->private_data.f_bool_buffer[self->private_impl.f_bool_ri])); + self->private_impl.f_bool_ri += 1u; + self->private_impl.f_bool_value = (((uint64_t)(self->private_impl.f_bool_value << 8u)) | v_bb); + self->private_impl.f_bool_bits += 8u; + } + } + v_s = ((self->private_impl.f_bool_range * ((uint32_t)(a_prob))) >> 8u); + v_pos = (((uint32_t)(self->private_impl.f_bool_bits - 8u)) & 63u); + v_v = ((uint32_t)((self->private_impl.f_bool_value >> v_pos))); + if (v_v > v_s) { + v_retval = 1u; + self->private_impl.f_bool_value -= ((uint64_t)(((uint64_t)(((uint32_t)(v_s + 1u)))) << v_pos)); + self->private_impl.f_bool_range = (((uint32_t)(((uint32_t)(self->private_impl.f_bool_range - v_s)) - 1u)) & 255u); + } else { + v_retval = 0u; + self->private_impl.f_bool_range = v_s; + } + v_shift = ((uint32_t)(WUFFS_VP8__RENORM_SHIFT_256[(self->private_impl.f_bool_range & 255u)])); + self->private_impl.f_bool_range = ((uint32_t)(WUFFS_VP8__RENORM_RANGE_256[(self->private_impl.f_bool_range & 255u)])); + self->private_impl.f_bool_bits -= v_shift; + return v_retval; +} + +// -------- func vp8.decoder.bool_read_literal + +WUFFS_BASE__GENERATED_C_CODE +static uint32_t 
+wuffs_vp8__decoder__bool_read_literal( + wuffs_vp8__decoder* self, + uint32_t a_n) { + uint32_t v_result = 0; + uint32_t v_i = 0; + uint32_t v_bit = 0; + + v_result = 0u; + v_i = 0u; + while (v_i < a_n) { + v_bit = wuffs_vp8__decoder__bool_read_bool(self, 128u); + v_result = (((uint32_t)(v_result << 1u)) | v_bit); + v_i += 1u; + } + return v_result; +} + +// -------- func vp8.decoder.bool_read_signed + +WUFFS_BASE__GENERATED_C_CODE +static int32_t +wuffs_vp8__decoder__bool_read_signed( + wuffs_vp8__decoder* self, + uint32_t a_n) { + uint32_t v_flag = 0; + uint32_t v_magnitude = 0; + uint32_t v_sign = 0; + + v_flag = wuffs_vp8__decoder__bool_read_bool(self, 128u); + if (v_flag == 0u) { + return 0u; + } + v_magnitude = wuffs_vp8__decoder__bool_read_literal(self, a_n); + v_magnitude &= 2147483647u; + v_sign = wuffs_vp8__decoder__bool_read_bool(self, 128u); + if (v_sign != 0u) { + return - ((int32_t)(v_magnitude)); + } + return ((int32_t)(v_magnitude)); +} + +// -------- func vp8.decoder.p1_init + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__p1_init( + wuffs_vp8__decoder* self) { + self->private_impl.f_p1_range = 254u; + self->private_impl.f_p1_value = 0u; + self->private_impl.f_p1_bits = 0u; + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.p1_read_bool + +WUFFS_BASE__GENERATED_C_CODE +static uint32_t +wuffs_vp8__decoder__p1_read_bool( + wuffs_vp8__decoder* self, + uint8_t a_prob) { + uint32_t v_s = 0; + uint32_t v_retval = 0; + uint32_t v_v = 0; + uint32_t v_shift = 0; + uint64_t v_bb = 0; + uint32_t v_pos = 0; + + if (self->private_impl.f_p1_bits < 16u) { + while ((self->private_impl.f_p1_bits <= 48u) && (self->private_impl.f_p1_ri < self->private_impl.f_p1_wi)) { + v_bb = ((uint64_t)(self->private_data.f_p1_buffer[self->private_impl.f_p1_ri])); + self->private_impl.f_p1_ri += 1u; + self->private_impl.f_p1_value = (((uint64_t)(self->private_impl.f_p1_value << 8u)) | v_bb); + 
self->private_impl.f_p1_bits += 8u; + } + } + v_s = ((self->private_impl.f_p1_range * ((uint32_t)(a_prob))) >> 8u); + v_pos = (((uint32_t)(self->private_impl.f_p1_bits - 8u)) & 63u); + v_v = ((uint32_t)((self->private_impl.f_p1_value >> v_pos))); + if (v_v > v_s) { + v_retval = 1u; + self->private_impl.f_p1_value -= ((uint64_t)(((uint64_t)(((uint32_t)(v_s + 1u)))) << v_pos)); + self->private_impl.f_p1_range = (((uint32_t)(((uint32_t)(self->private_impl.f_p1_range - v_s)) - 1u)) & 255u); + } else { + v_retval = 0u; + self->private_impl.f_p1_range = v_s; + } + v_shift = ((uint32_t)(WUFFS_VP8__RENORM_SHIFT_256[(self->private_impl.f_p1_range & 255u)])); + self->private_impl.f_p1_range = ((uint32_t)(WUFFS_VP8__RENORM_RANGE_256[(self->private_impl.f_p1_range & 255u)])); + if (v_shift > self->private_impl.f_p1_bits) { + self->private_impl.f_p1_value = 0u; + self->private_impl.f_p1_bits = 56u; + } else { + self->private_impl.f_p1_bits -= v_shift; + } + return v_retval; +} + +// -------- func vp8.decoder.p1_read_sign + +WUFFS_BASE__GENERATED_C_CODE +static uint32_t +wuffs_vp8__decoder__p1_read_sign( + wuffs_vp8__decoder* self) { + uint32_t v_s = 0; + uint32_t v_retval = 0; + uint32_t v_v = 0; + uint32_t v_shift = 0; + uint64_t v_bb = 0; + uint32_t v_pos = 0; + + if (self->private_impl.f_p1_bits < 16u) { + if ((((uint32_t)(self->private_impl.f_p1_ri + 4u)) <= self->private_impl.f_p1_wi) && (self->private_impl.f_p1_ri < 4093u)) { + self->private_impl.f_p1_value = (((uint64_t)(self->private_impl.f_p1_value << 32u)) | + ((uint64_t)(((uint64_t)(self->private_data.f_p1_buffer[(self->private_impl.f_p1_ri + 0u)])) << 24u)) | + ((uint64_t)(((uint64_t)(self->private_data.f_p1_buffer[(self->private_impl.f_p1_ri + 1u)])) << 16u)) | + ((uint64_t)(((uint64_t)(self->private_data.f_p1_buffer[(self->private_impl.f_p1_ri + 2u)])) << 8u)) | + ((uint64_t)(self->private_data.f_p1_buffer[(self->private_impl.f_p1_ri + 3u)]))); + self->private_impl.f_p1_ri += 4u; + self->private_impl.f_p1_bits += 
32u; + } else { + while ((self->private_impl.f_p1_bits <= 48u) && (self->private_impl.f_p1_ri < self->private_impl.f_p1_wi)) { + v_bb = ((uint64_t)(self->private_data.f_p1_buffer[self->private_impl.f_p1_ri])); + self->private_impl.f_p1_ri += 1u; + self->private_impl.f_p1_value = (((uint64_t)(self->private_impl.f_p1_value << 8u)) | v_bb); + self->private_impl.f_p1_bits += 8u; + } + } + } + v_s = (self->private_impl.f_p1_range >> 1u); + v_pos = (((uint32_t)(self->private_impl.f_p1_bits - 8u)) & 63u); + v_v = ((uint32_t)((self->private_impl.f_p1_value >> v_pos))); + if (v_v > v_s) { + v_retval = 1u; + self->private_impl.f_p1_value -= ((uint64_t)(((uint64_t)(((uint32_t)(v_s + 1u)))) << v_pos)); + self->private_impl.f_p1_range = (((uint32_t)(((uint32_t)(self->private_impl.f_p1_range - v_s)) - 1u)) & 255u); + } else { + v_retval = 0u; + self->private_impl.f_p1_range = v_s; + } + v_shift = ((uint32_t)(WUFFS_VP8__RENORM_SHIFT_256[(self->private_impl.f_p1_range & 255u)])); + self->private_impl.f_p1_range = ((uint32_t)(WUFFS_VP8__RENORM_RANGE_256[(self->private_impl.f_p1_range & 255u)])); + if (v_shift > self->private_impl.f_p1_bits) { + self->private_impl.f_p1_value = 0u; + self->private_impl.f_p1_bits = 56u; + } else { + self->private_impl.f_p1_bits -= v_shift; + } + return v_retval; +} + +// -------- func vp8.decoder.p1_fill_buffer + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__p1_fill_buffer( + wuffs_vp8__decoder* self, + wuffs_base__io_buffer* a_src, + uint32_t a_n) { + uint32_t v_remaining = 0; + uint8_t v_c8 = 0; + + const uint8_t* iop_a_src = NULL; + const uint8_t* io0_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL; + const uint8_t* io1_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL; + const uint8_t* io2_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL; + if (a_src && a_src->data.ptr) { + io0_a_src = a_src->data.ptr; + io1_a_src = io0_a_src + a_src->meta.ri; + iop_a_src = io1_a_src; + io2_a_src = io0_a_src + a_src->meta.wi; + } + + if 
((self->private_impl.f_p1_ri > 0u) && (self->private_impl.f_p1_ri <= self->private_impl.f_p1_wi)) { + wuffs_private_impl__slice_u8__copy_from_slice(wuffs_base__make_slice_u8(self->private_data.f_p1_buffer, 4096), wuffs_base__make_slice_u8_ij(self->private_data.f_p1_buffer, + self->private_impl.f_p1_ri, + self->private_impl.f_p1_wi)); + wuffs_private_impl__u32__sat_sub_indirect(&self->private_impl.f_p1_wi, self->private_impl.f_p1_ri); + self->private_impl.f_p1_ri = 0u; + } + v_remaining = a_n; + while ((v_remaining > 0u) && (self->private_impl.f_p1_wi < 4096u) && (((uint64_t)(io2_a_src - iop_a_src)) > 0u)) { + v_c8 = wuffs_base__peek_u8be__no_bounds_check(iop_a_src); + iop_a_src += 1u; + if (self->private_impl.f_p1_wi < 4096u) { + self->private_data.f_p1_buffer[self->private_impl.f_p1_wi] = v_c8; + self->private_impl.f_p1_wi += 1u; + } + v_remaining -= 1u; + } + if (a_src && a_src->data.ptr) { + a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr)); + } + + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.p1_fill_from_workbuf + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__p1_fill_from_workbuf( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf) { + uint64_t v_idx = 0; + uint32_t v_p = 0; + uint64_t v_poff = 0; + + if ((self->private_impl.f_p1_ri > 0u) && (self->private_impl.f_p1_ri <= self->private_impl.f_p1_wi)) { + wuffs_private_impl__slice_u8__copy_from_slice(wuffs_base__make_slice_u8(self->private_data.f_p1_buffer, 4096), wuffs_base__make_slice_u8_ij(self->private_data.f_p1_buffer, + self->private_impl.f_p1_ri, + self->private_impl.f_p1_wi)); + wuffs_private_impl__u32__sat_sub_indirect(&self->private_impl.f_p1_wi, self->private_impl.f_p1_ri); + self->private_impl.f_p1_ri = 0u; + } + v_p = self->private_impl.f_current_partition; + v_poff = self->private_impl.f_part_wbuf_offset[v_p]; + while ((self->private_impl.f_p1_wi < 4096u) && (self->private_impl.f_current_part_wbuf_ri < 
self->private_impl.f_part_wbuf_size[v_p])) { + v_idx = ((uint64_t)(v_poff + ((uint64_t)(self->private_impl.f_current_part_wbuf_ri)))); + if (v_idx >= ((uint64_t)(a_workbuf.len))) { + break; + } + self->private_data.f_p1_buffer[self->private_impl.f_p1_wi] = a_workbuf.ptr[v_idx]; + self->private_impl.f_p1_wi += 1u; + self->private_impl.f_current_part_wbuf_ri += 1u; + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.bool_fill_from_workbuf + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__bool_fill_from_workbuf( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf) { + uint64_t v_idx = 0; + + if ((self->private_impl.f_bool_ri > 0u) && (self->private_impl.f_bool_ri <= self->private_impl.f_bool_wi)) { + wuffs_private_impl__slice_u8__copy_from_slice(wuffs_base__make_slice_u8(self->private_data.f_bool_buffer, 4096), wuffs_base__make_slice_u8_ij(self->private_data.f_bool_buffer, + self->private_impl.f_bool_ri, + self->private_impl.f_bool_wi)); + wuffs_private_impl__u32__sat_sub_indirect(&self->private_impl.f_bool_wi, self->private_impl.f_bool_ri); + self->private_impl.f_bool_ri = 0u; + } + while ((self->private_impl.f_bool_wi < 4096u) && (self->private_impl.f_p0_wbuf_ri < self->private_impl.f_p0_wbuf_count)) { + v_idx = ((uint64_t)(self->private_impl.f_workbuf_offset_v_end + ((uint64_t)(self->private_impl.f_p0_wbuf_ri)))); + if (v_idx >= ((uint64_t)(a_workbuf.len))) { + break; + } + self->private_data.f_bool_buffer[self->private_impl.f_bool_wi] = a_workbuf.ptr[v_idx]; + self->private_impl.f_bool_wi += 1u; + self->private_impl.f_p0_wbuf_ri += 1u; + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.apply_simple_filter_all + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__apply_simple_filter_all( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf) { + uint32_t v_mby = 0; + + v_mby = 0u; + while (v_mby < 
self->private_impl.f_mb_height) { + wuffs_vp8__decoder__apply_simple_filter_row(self, a_workbuf, v_mby); + if (v_mby < 1023u) { + v_mby += 1u; + } + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.apply_simple_filter_row + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__apply_simple_filter_row( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint32_t a_mby) { + uint32_t v_mbx = 0; + uint32_t v_mb_idx = 0; + uint32_t v_f_level = 0; + bool v_has_inner = false; + uint32_t v_mb_lim = 0; + uint32_t v_sub_lim = 0; + uint64_t v_y_off = 0; + uint32_t v_r = 0; + uint64_t v_idx = 0; + + v_mbx = 0u; + while (v_mbx < self->private_impl.f_mb_width) { + v_mb_idx = ((uint32_t)(((a_mby & 1u) * 1024u) + v_mbx)); + if (v_mb_idx >= 2048u) { + v_mbx += 1u; + continue; + } + v_f_level = ((uint32_t)(self->private_data.f_mb_filter_level[v_mb_idx])); + if (v_f_level == 0u) { + v_mbx += 1u; + continue; + } + v_has_inner = (self->private_data.f_mb_filter_inner[v_mb_idx] != 0u); + v_sub_lim = v_f_level; + v_mb_lim = ((uint32_t)(v_sub_lim + 4u)); + v_y_off = ((((uint64_t)(a_mby)) * 16u * ((uint64_t)(self->private_impl.f_y_stride))) + (((uint64_t)(v_mbx)) * 16u)); + if (v_mbx > 0u) { + v_r = 0u; + while (v_r < 16u) { + v_idx = ((uint64_t)(v_y_off + (((uint64_t)(v_r)) * ((uint64_t)(self->private_impl.f_y_stride))))); + wuffs_vp8__decoder__filter2(self, + a_workbuf, + v_idx, + 1u, + v_mb_lim); + v_r += 1u; + } + } + if (v_has_inner) { + v_r = 0u; + while (v_r < 16u) { + v_idx = ((uint64_t)(v_y_off + (((uint64_t)(v_r)) * ((uint64_t)(self->private_impl.f_y_stride))))); + wuffs_vp8__decoder__filter2(self, + a_workbuf, + ((uint64_t)(v_idx + 4u)), + 1u, + v_sub_lim); + wuffs_vp8__decoder__filter2(self, + a_workbuf, + ((uint64_t)(v_idx + 8u)), + 1u, + v_sub_lim); + wuffs_vp8__decoder__filter2(self, + a_workbuf, + ((uint64_t)(v_idx + 12u)), + 1u, + v_sub_lim); + v_r += 1u; + } + } + if (a_mby > 0u) { + 
wuffs_vp8__decoder__simple_vfilter_16(self, a_workbuf, v_y_off, v_mb_lim); + } + if (v_has_inner) { + wuffs_vp8__decoder__simple_vfilter_16(self, a_workbuf, ((uint64_t)(v_y_off + (4u * ((uint64_t)(self->private_impl.f_y_stride))))), v_sub_lim); + wuffs_vp8__decoder__simple_vfilter_16(self, a_workbuf, ((uint64_t)(v_y_off + (8u * ((uint64_t)(self->private_impl.f_y_stride))))), v_sub_lim); + wuffs_vp8__decoder__simple_vfilter_16(self, a_workbuf, ((uint64_t)(v_y_off + (12u * ((uint64_t)(self->private_impl.f_y_stride))))), v_sub_lim); + } + if (v_mbx < 1023u) { + v_mbx += 1u; + } + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.simple_vfilter_16 + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__simple_vfilter_16( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_limit) { + return (*self->private_impl.choosy_simple_vfilter_16)(self, a_workbuf, a_q0_off, a_limit); +} + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__simple_vfilter_16__choosy_default( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_limit) { + uint32_t v_r = 0; + + v_r = 0u; + while (v_r < 16u) { + wuffs_vp8__decoder__filter2(self, + a_workbuf, + ((uint64_t)(a_q0_off + ((uint64_t)(v_r)))), + ((uint64_t)(self->private_impl.f_y_stride)), + a_limit); + v_r += 1u; + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.normal_vfilter_inner_16 + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_inner_16( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + return (*self->private_impl.choosy_normal_vfilter_inner_16)(self, a_workbuf, a_q0_off, a_level, a_ilevel, a_hlevel); +} + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct 
+wuffs_vp8__decoder__normal_vfilter_inner_16__choosy_default( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + uint32_t v_r = 0; + + v_r = 0u; + while (v_r < 16u) { + wuffs_vp8__decoder__filter246(self, + a_workbuf, + ((uint64_t)(a_q0_off + ((uint64_t)(v_r)))), + ((uint64_t)(self->private_impl.f_y_stride)), + a_level, + a_ilevel, + a_hlevel, + true); + v_r += 1u; + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.normal_vfilter_mb_16 + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_mb_16( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + return (*self->private_impl.choosy_normal_vfilter_mb_16)(self, a_workbuf, a_q0_off, a_level, a_ilevel, a_hlevel); +} + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_mb_16__choosy_default( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + uint32_t v_r = 0; + + v_r = 0u; + while (v_r < 16u) { + wuffs_vp8__decoder__filter246(self, + a_workbuf, + ((uint64_t)(a_q0_off + ((uint64_t)(v_r)))), + ((uint64_t)(self->private_impl.f_y_stride)), + a_level, + a_ilevel, + a_hlevel, + false); + v_r += 1u; + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.normal_vfilter_mb_8 + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_mb_8( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + return (*self->private_impl.choosy_normal_vfilter_mb_8)(self, a_workbuf, a_q0_off, a_level, a_ilevel, a_hlevel); +} + +WUFFS_BASE__GENERATED_C_CODE +static 
wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_mb_8__choosy_default( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + uint32_t v_r = 0; + + v_r = 0u; + while (v_r < 8u) { + wuffs_vp8__decoder__filter246(self, + a_workbuf, + ((uint64_t)(a_q0_off + ((uint64_t)(v_r)))), + ((uint64_t)(self->private_impl.f_uv_stride)), + a_level, + a_ilevel, + a_hlevel, + false); + v_r += 1u; + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.normal_hfilter_mb_16 + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_mb_16( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + return (*self->private_impl.choosy_normal_hfilter_mb_16)(self, a_workbuf, a_q0_off, a_level, a_ilevel, a_hlevel); +} + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_mb_16__choosy_default( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + uint32_t v_r = 0; + + v_r = 0u; + while (v_r < 16u) { + wuffs_vp8__decoder__filter246(self, + a_workbuf, + ((uint64_t)(a_q0_off + (((uint64_t)(v_r)) * ((uint64_t)(self->private_impl.f_y_stride))))), + 1u, + a_level, + a_ilevel, + a_hlevel, + false); + v_r += 1u; + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.normal_hfilter_mb_8 + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_mb_8( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + return (*self->private_impl.choosy_normal_hfilter_mb_8)(self, a_workbuf, a_q0_off, a_level, a_ilevel, a_hlevel); +} + 
+WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_mb_8__choosy_default( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + uint32_t v_r = 0; + + v_r = 0u; + while (v_r < 8u) { + wuffs_vp8__decoder__filter246(self, + a_workbuf, + ((uint64_t)(a_q0_off + (((uint64_t)(v_r)) * ((uint64_t)(self->private_impl.f_uv_stride))))), + 1u, + a_level, + a_ilevel, + a_hlevel, + false); + v_r += 1u; + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.normal_hfilter_inner_16 + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_inner_16( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + return (*self->private_impl.choosy_normal_hfilter_inner_16)(self, a_workbuf, a_q0_off, a_level, a_ilevel, a_hlevel); +} + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_inner_16__choosy_default( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + uint32_t v_r = 0; + + v_r = 0u; + while (v_r < 16u) { + wuffs_vp8__decoder__filter246(self, + a_workbuf, + ((uint64_t)(a_q0_off + (((uint64_t)(v_r)) * ((uint64_t)(self->private_impl.f_y_stride))))), + 1u, + a_level, + a_ilevel, + a_hlevel, + true); + v_r += 1u; + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.normal_hfilter_inner_8 + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_inner_8( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + return (*self->private_impl.choosy_normal_hfilter_inner_8)(self, a_workbuf, 
a_q0_off, a_level, a_ilevel, a_hlevel); +} + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_inner_8__choosy_default( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + uint32_t v_r = 0; + + v_r = 0u; + while (v_r < 8u) { + wuffs_vp8__decoder__filter246(self, + a_workbuf, + ((uint64_t)(a_q0_off + (((uint64_t)(v_r)) * ((uint64_t)(self->private_impl.f_uv_stride))))), + 1u, + a_level, + a_ilevel, + a_hlevel, + true); + v_r += 1u; + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.normal_vfilter_inner_8 + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_inner_8( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + return (*self->private_impl.choosy_normal_vfilter_inner_8)(self, a_workbuf, a_q0_off, a_level, a_ilevel, a_hlevel); +} + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_inner_8__choosy_default( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + uint32_t v_r = 0; + + v_r = 0u; + while (v_r < 8u) { + wuffs_vp8__decoder__filter246(self, + a_workbuf, + ((uint64_t)(a_q0_off + ((uint64_t)(v_r)))), + ((uint64_t)(self->private_impl.f_uv_stride)), + a_level, + a_ilevel, + a_hlevel, + true); + v_r += 1u; + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.normal_vfilter_mb_uv + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_mb_uv( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_u_off, + uint64_t a_v_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + return 
(*self->private_impl.choosy_normal_vfilter_mb_uv)(self, a_workbuf, a_u_off, a_v_off, a_level, a_ilevel, a_hlevel); +} + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_mb_uv__choosy_default( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_u_off, + uint64_t a_v_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + wuffs_vp8__decoder__normal_vfilter_mb_8(self, + a_workbuf, + a_u_off, + a_level, + a_ilevel, + a_hlevel); + wuffs_vp8__decoder__normal_vfilter_mb_8(self, + a_workbuf, + a_v_off, + a_level, + a_ilevel, + a_hlevel); + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.normal_hfilter_mb_uv + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_mb_uv( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_u_off, + uint64_t a_v_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + return (*self->private_impl.choosy_normal_hfilter_mb_uv)(self, a_workbuf, a_u_off, a_v_off, a_level, a_ilevel, a_hlevel); +} + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_mb_uv__choosy_default( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_u_off, + uint64_t a_v_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + wuffs_vp8__decoder__normal_hfilter_mb_8(self, + a_workbuf, + a_u_off, + a_level, + a_ilevel, + a_hlevel); + wuffs_vp8__decoder__normal_hfilter_mb_8(self, + a_workbuf, + a_v_off, + a_level, + a_ilevel, + a_hlevel); + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.normal_vfilter_inner_uv + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_inner_uv( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_u_off, + uint64_t a_v_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t 
a_hlevel) { + return (*self->private_impl.choosy_normal_vfilter_inner_uv)(self, a_workbuf, a_u_off, a_v_off, a_level, a_ilevel, a_hlevel); +} + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_inner_uv__choosy_default( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_u_off, + uint64_t a_v_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + wuffs_vp8__decoder__normal_vfilter_inner_8(self, + a_workbuf, + a_u_off, + a_level, + a_ilevel, + a_hlevel); + wuffs_vp8__decoder__normal_vfilter_inner_8(self, + a_workbuf, + a_v_off, + a_level, + a_ilevel, + a_hlevel); + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.normal_hfilter_inner_uv + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_inner_uv( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_u_off, + uint64_t a_v_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + return (*self->private_impl.choosy_normal_hfilter_inner_uv)(self, a_workbuf, a_u_off, a_v_off, a_level, a_ilevel, a_hlevel); +} + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_inner_uv__choosy_default( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_u_off, + uint64_t a_v_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + wuffs_vp8__decoder__normal_hfilter_inner_8(self, + a_workbuf, + a_u_off, + a_level, + a_ilevel, + a_hlevel); + wuffs_vp8__decoder__normal_hfilter_inner_8(self, + a_workbuf, + a_v_off, + a_level, + a_ilevel, + a_hlevel); + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.filter2 + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__filter2( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_idx, + uint64_t a_step, + uint32_t a_limit) { + uint64_t 
v_p1_idx = 0; + uint64_t v_p0_idx = 0; + uint64_t v_q1_idx = 0; + uint32_t v_p1 = 0; + uint32_t v_p0 = 0; + uint32_t v_q0 = 0; + uint32_t v_q1 = 0; + uint32_t v_dp0q0 = 0; + uint32_t v_dp1q1 = 0; + uint32_t v_thresh = 0; + uint32_t v_a = 0; + uint32_t v_a1 = 0; + uint32_t v_a2 = 0; + uint32_t v_pq_diff = 0; + uint32_t v_val = 0; + + if (a_q0_idx < a_step) { + return wuffs_base__make_empty_struct(); + } + v_p0_idx = (a_q0_idx - a_step); + if (v_p0_idx < a_step) { + return wuffs_base__make_empty_struct(); + } + v_p1_idx = (v_p0_idx - a_step); + v_q1_idx = ((uint64_t)(a_q0_idx + a_step)); + if ((v_q1_idx >= ((uint64_t)(a_workbuf.len))) || + (a_q0_idx >= ((uint64_t)(a_workbuf.len))) || + (v_p0_idx >= ((uint64_t)(a_workbuf.len))) || + (v_p1_idx >= ((uint64_t)(a_workbuf.len)))) { + return wuffs_base__make_empty_struct(); + } + v_p1 = ((uint32_t)(a_workbuf.ptr[v_p1_idx])); + v_p0 = ((uint32_t)(a_workbuf.ptr[v_p0_idx])); + v_q0 = ((uint32_t)(a_workbuf.ptr[a_q0_idx])); + v_q1 = ((uint32_t)(a_workbuf.ptr[v_q1_idx])); + v_dp0q0 = ((uint32_t)(v_p0 - v_q0)); + if ((v_dp0q0 & 2147483648u) != 0u) { + v_dp0q0 = ((uint32_t)(0u - v_dp0q0)); + } + v_dp0q0 = (v_dp0q0 & 255u); + v_dp1q1 = ((uint32_t)(v_p1 - v_q1)); + if ((v_dp1q1 & 2147483648u) != 0u) { + v_dp1q1 = ((uint32_t)(0u - v_dp1q1)); + } + v_dp1q1 = (v_dp1q1 & 255u); + v_thresh = ((v_dp0q0 * 2u) + (v_dp1q1 >> 1u)); + if (v_thresh > a_limit) { + return wuffs_base__make_empty_struct(); + } + v_pq_diff = ((uint32_t)(v_p1 - v_q1)); + if ((v_pq_diff & 2147483648u) != 0u) { + if (v_pq_diff < 4294967168u) { + v_pq_diff = 4294967168u; + } + } else { + if (v_pq_diff > 127u) { + v_pq_diff = 127u; + } + } + v_a = ((uint32_t)(((uint32_t)(3u * ((uint32_t)(v_q0 - v_p0)))) + v_pq_diff)); + v_a1 = wuffs_vp8__decoder__clamp15_asr3(self, ((uint32_t)(v_a + 4u))); + v_a2 = wuffs_vp8__decoder__clamp15_asr3(self, ((uint32_t)(v_a + 3u))); + v_val = ((uint32_t)(v_p0 + v_a2)); + if (v_val > 255u) { + if ((v_val & 2147483648u) != 0u) { + v_val = 0u; + 
} else { + v_val = 255u; + } + } + a_workbuf.ptr[v_p0_idx] = ((uint8_t)(v_val)); + v_val = ((uint32_t)(v_q0 - v_a1)); + if (v_val > 255u) { + if ((v_val & 2147483648u) != 0u) { + v_val = 0u; + } else { + v_val = 255u; + } + } + a_workbuf.ptr[a_q0_idx] = ((uint8_t)(v_val)); + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.clamp15_asr3 + +WUFFS_BASE__GENERATED_C_CODE +static uint32_t +wuffs_vp8__decoder__clamp15_asr3( + wuffs_vp8__decoder* self, + uint32_t a_v) { + uint32_t v_result = 0; + + if ((a_v & 2147483648u) != 0u) { + v_result = ((a_v >> 3u) | 3758096384u); + } else { + v_result = (a_v >> 3u); + } + if ((v_result & 2147483648u) != 0u) { + if (v_result < 4294967280u) { + v_result = 4294967280u; + } + } else { + if (v_result > 15u) { + v_result = 15u; + } + } + return v_result; +} + +// -------- func vp8.decoder.clamp127 + +WUFFS_BASE__GENERATED_C_CODE +static uint32_t +wuffs_vp8__decoder__clamp127( + wuffs_vp8__decoder* self, + uint32_t a_v) { + if ((a_v & 2147483648u) != 0u) { + if (a_v < 4294967168u) { + return 4294967168u; + } + } else { + if (a_v > 127u) { + return 127u; + } + } + return a_v; +} + +// -------- func vp8.decoder.abs_u32 + +WUFFS_BASE__GENERATED_C_CODE +static uint32_t +wuffs_vp8__decoder__abs_u32( + wuffs_vp8__decoder* self, + uint32_t a_v) { + if ((a_v & 2147483648u) != 0u) { + return ((uint32_t)(0u - a_v)); + } + return a_v; +} + +// -------- func vp8.decoder.clamp255 + +WUFFS_BASE__GENERATED_C_CODE +static uint32_t +wuffs_vp8__decoder__clamp255( + wuffs_vp8__decoder* self, + uint32_t a_v) { + if ((a_v & 2147483648u) != 0u) { + return 0u; + } + if (a_v > 255u) { + return 255u; + } + return a_v; +} + +// -------- func vp8.decoder.apply_normal_filter_all + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__apply_normal_filter_all( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf) { + uint32_t v_mby = 0; + + v_mby = 0u; + while (v_mby < self->private_impl.f_mb_height) { 
+ wuffs_vp8__decoder__apply_normal_filter_row(self, a_workbuf, v_mby); + if (v_mby < 1023u) { + v_mby += 1u; + } + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.apply_normal_filter_row + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__apply_normal_filter_row( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint32_t a_mby) { + uint32_t v_mbx = 0; + uint32_t v_mb_idx = 0; + uint32_t v_f_level = 0; + uint32_t v_f_ilevel = 0; + uint32_t v_f_hlevel = 0; + bool v_has_inner = false; + uint64_t v_y_off = 0; + uint64_t v_u_off = 0; + uint64_t v_v_off = 0; + + v_mbx = 0u; + while (v_mbx < self->private_impl.f_mb_width) { + v_mb_idx = ((uint32_t)(((a_mby & 1u) * 1024u) + v_mbx)); + if (v_mb_idx >= 2048u) { + v_mbx += 1u; + continue; + } + v_f_level = ((uint32_t)(self->private_data.f_mb_filter_level[v_mb_idx])); + if (v_f_level == 0u) { + v_mbx += 1u; + continue; + } + v_f_ilevel = ((uint32_t)(self->private_data.f_mb_filter_ilevel[v_mb_idx])); + v_f_hlevel = ((uint32_t)(self->private_data.f_mb_filter_hlevel[v_mb_idx])); + v_has_inner = (self->private_data.f_mb_filter_inner[v_mb_idx] != 0u); + v_y_off = ((((uint64_t)(a_mby)) * 16u * ((uint64_t)(self->private_impl.f_y_stride))) + (((uint64_t)(v_mbx)) * 16u)); + v_u_off = (self->private_impl.f_workbuf_offset_y_end + (((uint64_t)(a_mby)) * 8u * ((uint64_t)(self->private_impl.f_uv_stride))) + (((uint64_t)(v_mbx)) * 8u)); + v_v_off = (self->private_impl.f_workbuf_offset_u_end + (((uint64_t)(a_mby)) * 8u * ((uint64_t)(self->private_impl.f_uv_stride))) + (((uint64_t)(v_mbx)) * 8u)); + if (v_mbx > 0u) { + wuffs_vp8__decoder__normal_hfilter_mb_16(self, + a_workbuf, + v_y_off, + ((uint32_t)(v_f_level + 4u)), + v_f_ilevel, + v_f_hlevel); + wuffs_vp8__decoder__normal_hfilter_mb_uv(self, + a_workbuf, + v_u_off, + v_v_off, + ((uint32_t)(v_f_level + 4u)), + v_f_ilevel, + v_f_hlevel); + } + if (v_has_inner) { + wuffs_vp8__decoder__normal_hfilter_inner_16(self, 
+ a_workbuf, + ((uint64_t)(v_y_off + 4u)), + v_f_level, + v_f_ilevel, + v_f_hlevel); + wuffs_vp8__decoder__normal_hfilter_inner_16(self, + a_workbuf, + ((uint64_t)(v_y_off + 8u)), + v_f_level, + v_f_ilevel, + v_f_hlevel); + wuffs_vp8__decoder__normal_hfilter_inner_16(self, + a_workbuf, + ((uint64_t)(v_y_off + 12u)), + v_f_level, + v_f_ilevel, + v_f_hlevel); + wuffs_vp8__decoder__normal_hfilter_inner_uv(self, + a_workbuf, + ((uint64_t)(v_u_off + 4u)), + ((uint64_t)(v_v_off + 4u)), + v_f_level, + v_f_ilevel, + v_f_hlevel); + } + if (a_mby > 0u) { + wuffs_vp8__decoder__normal_vfilter_mb_16(self, + a_workbuf, + v_y_off, + ((uint32_t)(v_f_level + 4u)), + v_f_ilevel, + v_f_hlevel); + wuffs_vp8__decoder__normal_vfilter_mb_uv(self, + a_workbuf, + v_u_off, + v_v_off, + ((uint32_t)(v_f_level + 4u)), + v_f_ilevel, + v_f_hlevel); + } + if (v_has_inner) { + wuffs_vp8__decoder__normal_vfilter_inner_16(self, + a_workbuf, + ((uint64_t)(v_y_off + (4u * ((uint64_t)(self->private_impl.f_y_stride))))), + v_f_level, + v_f_ilevel, + v_f_hlevel); + wuffs_vp8__decoder__normal_vfilter_inner_16(self, + a_workbuf, + ((uint64_t)(v_y_off + (8u * ((uint64_t)(self->private_impl.f_y_stride))))), + v_f_level, + v_f_ilevel, + v_f_hlevel); + wuffs_vp8__decoder__normal_vfilter_inner_16(self, + a_workbuf, + ((uint64_t)(v_y_off + (12u * ((uint64_t)(self->private_impl.f_y_stride))))), + v_f_level, + v_f_ilevel, + v_f_hlevel); + wuffs_vp8__decoder__normal_vfilter_inner_uv(self, + a_workbuf, + ((uint64_t)(v_u_off + (4u * ((uint64_t)(self->private_impl.f_uv_stride))))), + ((uint64_t)(v_v_off + (4u * ((uint64_t)(self->private_impl.f_uv_stride))))), + v_f_level, + v_f_ilevel, + v_f_hlevel); + } + if (v_mbx < 1023u) { + v_mbx += 1u; + } + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.filter246 + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__filter246( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_idx, + 
uint64_t a_step, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel, + bool a_four_not_six) { + uint64_t v_p3_idx = 0; + uint64_t v_p2_idx = 0; + uint64_t v_p1_idx = 0; + uint64_t v_p0_idx = 0; + uint64_t v_q1_idx = 0; + uint64_t v_q2_idx = 0; + uint64_t v_q3_idx = 0; + uint32_t v_p3 = 0; + uint32_t v_p2 = 0; + uint32_t v_p1 = 0; + uint32_t v_p0 = 0; + uint32_t v_q0 = 0; + uint32_t v_q1 = 0; + uint32_t v_q2 = 0; + uint32_t v_q3 = 0; + uint32_t v_a = 0; + uint32_t v_a1 = 0; + uint32_t v_a2 = 0; + uint32_t v_a3 = 0; + uint32_t v_t1 = 0; + uint32_t v_t2 = 0; + + if (a_q0_idx < a_step) { + return wuffs_base__make_empty_struct(); + } + v_p0_idx = (a_q0_idx - a_step); + if (v_p0_idx < a_step) { + return wuffs_base__make_empty_struct(); + } + v_p1_idx = (v_p0_idx - a_step); + if (v_p1_idx < a_step) { + return wuffs_base__make_empty_struct(); + } + v_p2_idx = (v_p1_idx - a_step); + if (v_p2_idx < a_step) { + return wuffs_base__make_empty_struct(); + } + v_p3_idx = (v_p2_idx - a_step); + v_q1_idx = ((uint64_t)(a_q0_idx + a_step)); + v_q2_idx = ((uint64_t)(v_q1_idx + a_step)); + v_q3_idx = ((uint64_t)(v_q2_idx + a_step)); + if ((v_q3_idx >= ((uint64_t)(a_workbuf.len))) || + (v_q2_idx >= ((uint64_t)(a_workbuf.len))) || + (v_q1_idx >= ((uint64_t)(a_workbuf.len))) || + (a_q0_idx >= ((uint64_t)(a_workbuf.len))) || + (v_p0_idx >= ((uint64_t)(a_workbuf.len))) || + (v_p1_idx >= ((uint64_t)(a_workbuf.len))) || + (v_p2_idx >= ((uint64_t)(a_workbuf.len))) || + (v_p3_idx >= ((uint64_t)(a_workbuf.len)))) { + return wuffs_base__make_empty_struct(); + } + v_p3 = ((uint32_t)(a_workbuf.ptr[v_p3_idx])); + v_p2 = ((uint32_t)(a_workbuf.ptr[v_p2_idx])); + v_p1 = ((uint32_t)(a_workbuf.ptr[v_p1_idx])); + v_p0 = ((uint32_t)(a_workbuf.ptr[v_p0_idx])); + v_q0 = ((uint32_t)(a_workbuf.ptr[a_q0_idx])); + v_q1 = ((uint32_t)(a_workbuf.ptr[v_q1_idx])); + v_q2 = ((uint32_t)(a_workbuf.ptr[v_q2_idx])); + v_q3 = ((uint32_t)(a_workbuf.ptr[v_q3_idx])); + v_t1 = wuffs_vp8__decoder__abs_u32(self, 
((uint32_t)(v_p0 - v_q0))); + v_t1 = (v_t1 & 255u); + v_t2 = wuffs_vp8__decoder__abs_u32(self, ((uint32_t)(v_p1 - v_q1))); + v_t2 = (v_t2 & 255u); + if (((v_t1 * 2u) + (v_t2 >> 1u)) > a_level) { + return wuffs_base__make_empty_struct(); + } + v_t1 = wuffs_vp8__decoder__abs_u32(self, ((uint32_t)(v_p3 - v_p2))); + if (v_t1 > a_ilevel) { + return wuffs_base__make_empty_struct(); + } + v_t1 = wuffs_vp8__decoder__abs_u32(self, ((uint32_t)(v_p2 - v_p1))); + if (v_t1 > a_ilevel) { + return wuffs_base__make_empty_struct(); + } + v_t1 = wuffs_vp8__decoder__abs_u32(self, ((uint32_t)(v_p1 - v_p0))); + if (v_t1 > a_ilevel) { + return wuffs_base__make_empty_struct(); + } + v_t1 = wuffs_vp8__decoder__abs_u32(self, ((uint32_t)(v_q1 - v_q0))); + if (v_t1 > a_ilevel) { + return wuffs_base__make_empty_struct(); + } + v_t1 = wuffs_vp8__decoder__abs_u32(self, ((uint32_t)(v_q2 - v_q1))); + if (v_t1 > a_ilevel) { + return wuffs_base__make_empty_struct(); + } + v_t1 = wuffs_vp8__decoder__abs_u32(self, ((uint32_t)(v_q3 - v_q2))); + if (v_t1 > a_ilevel) { + return wuffs_base__make_empty_struct(); + } + v_t1 = wuffs_vp8__decoder__abs_u32(self, ((uint32_t)(v_p1 - v_p0))); + v_t2 = wuffs_vp8__decoder__abs_u32(self, ((uint32_t)(v_q1 - v_q0))); + if ((v_t1 > a_hlevel) || (v_t2 > a_hlevel)) { + v_t1 = wuffs_vp8__decoder__clamp127(self, ((uint32_t)(v_p1 - v_q1))); + v_a = ((uint32_t)(((uint32_t)(3u * ((uint32_t)(v_q0 - v_p0)))) + v_t1)); + v_a1 = wuffs_vp8__decoder__clamp15_asr3(self, ((uint32_t)(v_a + 4u))); + v_a2 = wuffs_vp8__decoder__clamp15_asr3(self, ((uint32_t)(v_a + 3u))); + v_t1 = wuffs_vp8__decoder__clamp255(self, ((uint32_t)(v_p0 + v_a2))); + a_workbuf.ptr[v_p0_idx] = ((uint8_t)(v_t1)); + v_t1 = wuffs_vp8__decoder__clamp255(self, ((uint32_t)(v_q0 - v_a1))); + a_workbuf.ptr[a_q0_idx] = ((uint8_t)(v_t1)); + } else if (a_four_not_six) { + v_a = ((uint32_t)(3u * ((uint32_t)(v_q0 - v_p0)))); + v_a1 = wuffs_vp8__decoder__clamp15_asr3(self, ((uint32_t)(v_a + 4u))); + v_a2 = 
wuffs_vp8__decoder__clamp15_asr3(self, ((uint32_t)(v_a + 3u))); + v_a3 = ((uint32_t)(v_a1 + 1u)); + if ((v_a3 & 2147483648u) != 0u) { + v_a3 = ((v_a3 >> 1u) | 2147483648u); + } else { + v_a3 >>= 1u; + } + v_t1 = wuffs_vp8__decoder__clamp255(self, ((uint32_t)(v_p1 + v_a3))); + a_workbuf.ptr[v_p1_idx] = ((uint8_t)(v_t1)); + v_t1 = wuffs_vp8__decoder__clamp255(self, ((uint32_t)(v_p0 + v_a2))); + a_workbuf.ptr[v_p0_idx] = ((uint8_t)(v_t1)); + v_t1 = wuffs_vp8__decoder__clamp255(self, ((uint32_t)(v_q0 - v_a1))); + a_workbuf.ptr[a_q0_idx] = ((uint8_t)(v_t1)); + v_t1 = wuffs_vp8__decoder__clamp255(self, ((uint32_t)(v_q1 - v_a3))); + a_workbuf.ptr[v_q1_idx] = ((uint8_t)(v_t1)); + } else { + v_t1 = wuffs_vp8__decoder__clamp127(self, ((uint32_t)(v_p1 - v_q1))); + v_t2 = ((uint32_t)(((uint32_t)(3u * ((uint32_t)(v_q0 - v_p0)))) + v_t1)); + v_a = wuffs_vp8__decoder__clamp127(self, v_t2); + v_a1 = wuffs_vp8__decoder__signed_shift_right_7(self, ((uint32_t)(((uint32_t)(27u * v_a)) + 63u))); + v_a2 = wuffs_vp8__decoder__signed_shift_right_7(self, ((uint32_t)(((uint32_t)(18u * v_a)) + 63u))); + v_a3 = wuffs_vp8__decoder__signed_shift_right_7(self, ((uint32_t)(((uint32_t)(9u * v_a)) + 63u))); + v_t1 = wuffs_vp8__decoder__clamp255(self, ((uint32_t)(v_p2 + v_a3))); + a_workbuf.ptr[v_p2_idx] = ((uint8_t)(v_t1)); + v_t1 = wuffs_vp8__decoder__clamp255(self, ((uint32_t)(v_p1 + v_a2))); + a_workbuf.ptr[v_p1_idx] = ((uint8_t)(v_t1)); + v_t1 = wuffs_vp8__decoder__clamp255(self, ((uint32_t)(v_p0 + v_a1))); + a_workbuf.ptr[v_p0_idx] = ((uint8_t)(v_t1)); + v_t1 = wuffs_vp8__decoder__clamp255(self, ((uint32_t)(v_q0 - v_a1))); + a_workbuf.ptr[a_q0_idx] = ((uint8_t)(v_t1)); + v_t1 = wuffs_vp8__decoder__clamp255(self, ((uint32_t)(v_q1 - v_a2))); + a_workbuf.ptr[v_q1_idx] = ((uint8_t)(v_t1)); + v_t1 = wuffs_vp8__decoder__clamp255(self, ((uint32_t)(v_q2 - v_a3))); + a_workbuf.ptr[v_q2_idx] = ((uint8_t)(v_t1)); + } + return wuffs_base__make_empty_struct(); +} + +// -------- func 
vp8.decoder.signed_shift_right_7 + +WUFFS_BASE__GENERATED_C_CODE +static uint32_t +wuffs_vp8__decoder__signed_shift_right_7( + wuffs_vp8__decoder* self, + uint32_t a_v) { + if ((a_v & 2147483648u) != 0u) { + return ((a_v >> 7u) | 4261412864u); + } + return (a_v >> 7u); +} + +// ‼ WUFFS MULTI-FILE SECTION +arm_neon +// -------- func vp8.decoder.simple_vfilter_16_arm_neon + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__simple_vfilter_16_arm_neon( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_limit) { + uint8x16_t v_p1 = {0}; + uint8x16_t v_p0 = {0}; + uint8x16_t v_q0 = {0}; + uint8x16_t v_q1 = {0}; + uint8x16_t v_sign_bit = {0}; + uint8x16_t v_kFE = {0}; + uint8x16_t v_m_thresh = {0}; + uint8x16_t v_k3 = {0}; + uint8x16_t v_k4 = {0}; + uint8x16_t v_mask = {0}; + uint8x16_t v_t1 = {0}; + uint8x16_t v_t2 = {0}; + uint8x16_t v_t3 = {0}; + uint8x16_t v_delta = {0}; + uint8x16_t v_v3 = {0}; + uint8x16_t v_v4 = {0}; + uint8x16_t v_zero = {0}; + wuffs_base__slice_u8 v_wb = {0}; + + if (a_q0_off < (2u * ((uint64_t)(self->private_impl.f_y_stride)))) { + return wuffs_base__make_empty_struct(); + } + v_wb = a_workbuf; + if ((a_q0_off - (2u * ((uint64_t)(self->private_impl.f_y_stride)))) <= ((uint64_t)(v_wb.len))) { + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, (a_q0_off - (2u * ((uint64_t)(self->private_impl.f_y_stride))))); + } else { + return wuffs_base__make_empty_struct(); + } + if (16u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_p1 = vld1q_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (16u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_p0 = vld1q_u8(v_wb.ptr); + if 
(((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (16u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_q0 = vld1q_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (16u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_q1 = vld1q_u8(v_wb.ptr); + v_zero = vdupq_n_u8(0u); + v_sign_bit = vdupq_n_u8(128u); + v_kFE = vdupq_n_u8(254u); + v_m_thresh = vdupq_n_u8(((uint8_t)(a_limit))); + v_k3 = vdupq_n_u8(3u); + v_k4 = vdupq_n_u8(4u); + v_t1 = vabdq_u8(v_p1, v_q1); + v_t2 = vandq_u8(v_t1, v_kFE); + v_t2 = vshrq_n_u8(v_t2, 1u); + v_t3 = vabdq_u8(v_p0, v_q0); + v_t3 = vqaddq_u8(v_t3, v_t3); + v_t3 = vqaddq_u8(v_t3, v_t2); + v_mask = vqsubq_u8(v_t3, v_m_thresh); + v_mask = vceqq_u8(v_mask, v_zero); + v_p1 = veorq_u8(v_p1, v_sign_bit); + v_p0 = veorq_u8(v_p0, v_sign_bit); + v_q0 = veorq_u8(v_q0, v_sign_bit); + v_q1 = veorq_u8(v_q1, v_sign_bit); + v_t1 = vreinterpretq_u8_s8(vqsubq_s8(vreinterpretq_s8_u8(v_p1), vreinterpretq_s8_u8(v_q1))); + v_t2 = vreinterpretq_u8_s8(vqsubq_s8(vreinterpretq_s8_u8(v_q0), vreinterpretq_s8_u8(v_p0))); + v_t1 = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_t1), vreinterpretq_s8_u8(v_t2))); + v_t1 = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_t1), vreinterpretq_s8_u8(v_t2))); + v_delta = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_t1), vreinterpretq_s8_u8(v_t2))); + v_delta = vandq_u8(v_delta, v_mask); + v_v4 = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_delta), vreinterpretq_s8_u8(v_k4))); + v_v4 = vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u8(v_v4), 3u)); + v_v3 = 
vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_delta), vreinterpretq_s8_u8(v_k3))); + v_v3 = vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u8(v_v3), 3u)); + v_q0 = vreinterpretq_u8_s8(vqsubq_s8(vreinterpretq_s8_u8(v_q0), vreinterpretq_s8_u8(v_v4))); + v_p0 = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_p0), vreinterpretq_s8_u8(v_v3))); + v_p0 = veorq_u8(v_p0, v_sign_bit); + v_q0 = veorq_u8(v_q0, v_sign_bit); + if (a_q0_off < ((uint64_t)(self->private_impl.f_y_stride))) { + return wuffs_base__make_empty_struct(); + } + if ((a_q0_off - ((uint64_t)(self->private_impl.f_y_stride))) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, (a_q0_off - ((uint64_t)(self->private_impl.f_y_stride)))); + } else { + return wuffs_base__make_empty_struct(); + } + if (16u <= ((uint64_t)(a_workbuf.len))) { + vst1q_u8(a_workbuf.ptr, v_p0); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + if (16u <= ((uint64_t)(a_workbuf.len))) { + vst1q_u8(a_workbuf.ptr, v_q0); + } + return wuffs_base__make_empty_struct(); +} +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +// ‼ WUFFS MULTI-FILE SECTION -arm_neon + +// ‼ WUFFS MULTI-FILE SECTION +arm_neon +// -------- func vp8.decoder.normal_vfilter_inner_16_arm_neon + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_inner_16_arm_neon( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + wuffs_base__slice_u8 v_wb = {0}; + uint8x16_t v_p3 = {0}; + uint8x16_t v_p2 = {0}; + uint8x16_t v_p1 = {0}; + uint8x16_t v_p0 = {0}; + uint8x16_t v_q0 = {0}; + uint8x16_t v_q1 = {0}; + uint8x16_t v_q2 = {0}; + uint8x16_t v_q3 = {0}; + uint8x16_t v_zero = {0}; + 
uint8x16_t v_sign_bit = {0}; + uint8x16_t v_kFE = {0}; + uint8x16_t v_m_thresh = {0}; + uint8x16_t v_m_ithresh = {0}; + uint8x16_t v_m_hthresh = {0}; + uint8x16_t v_k1 = {0}; + uint8x16_t v_k3 = {0}; + uint8x16_t v_k4 = {0}; + uint8x16_t v_mask = {0}; + uint8x16_t v_not_hev = {0}; + uint8x16_t v_delta = {0}; + uint8x16_t v_v3 = {0}; + uint8x16_t v_v4 = {0}; + uint8x16_t v_a3 = {0}; + uint8x16_t v_t1 = {0}; + uint8x16_t v_t2 = {0}; + uint8x16_t v_t3 = {0}; + + if (a_q0_off < (4u * ((uint64_t)(self->private_impl.f_y_stride)))) { + return wuffs_base__make_empty_struct(); + } + v_wb = a_workbuf; + if ((a_q0_off - (4u * ((uint64_t)(self->private_impl.f_y_stride)))) <= ((uint64_t)(v_wb.len))) { + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, (a_q0_off - (4u * ((uint64_t)(self->private_impl.f_y_stride))))); + } else { + return wuffs_base__make_empty_struct(); + } + if (16u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_p3 = vld1q_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (16u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_p2 = vld1q_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (16u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_p1 = vld1q_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (16u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_p0 = vld1q_u8(v_wb.ptr); + if 
(((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (16u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_q0 = vld1q_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (16u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_q1 = vld1q_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (16u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_q2 = vld1q_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (16u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_q3 = vld1q_u8(v_wb.ptr); + v_zero = vdupq_n_u8(0u); + v_sign_bit = vdupq_n_u8(128u); + v_kFE = vdupq_n_u8(254u); + v_m_thresh = vdupq_n_u8(((uint8_t)(a_level))); + v_m_ithresh = vdupq_n_u8(((uint8_t)(a_ilevel))); + v_m_hthresh = vdupq_n_u8(((uint8_t)(a_hlevel))); + v_k1 = vdupq_n_u8(1u); + v_k3 = vdupq_n_u8(3u); + v_k4 = vdupq_n_u8(4u); + v_t1 = vabdq_u8(v_p1, v_q1); + v_t2 = vandq_u8(v_t1, v_kFE); + v_t2 = vshrq_n_u8(v_t2, 1u); + v_t3 = vabdq_u8(v_p0, v_q0); + v_t3 = vqaddq_u8(v_t3, v_t3); + v_t3 = vqaddq_u8(v_t3, v_t2); + v_mask = vqsubq_u8(v_t3, v_m_thresh); + v_mask = vceqq_u8(v_mask, v_zero); + v_t1 = vabdq_u8(v_p3, v_p2); + v_mask = vandq_u8(v_mask, vceqq_u8(vqsubq_u8(v_t1, v_m_ithresh), 
v_zero)); + v_t1 = vabdq_u8(v_p2, v_p1); + v_mask = vandq_u8(v_mask, vceqq_u8(vqsubq_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabdq_u8(v_p1, v_p0); + v_mask = vandq_u8(v_mask, vceqq_u8(vqsubq_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabdq_u8(v_q0, v_q1); + v_mask = vandq_u8(v_mask, vceqq_u8(vqsubq_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabdq_u8(v_q1, v_q2); + v_mask = vandq_u8(v_mask, vceqq_u8(vqsubq_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabdq_u8(v_q2, v_q3); + v_mask = vandq_u8(v_mask, vceqq_u8(vqsubq_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabdq_u8(v_p1, v_p0); + v_t2 = vabdq_u8(v_q1, v_q0); + v_t3 = vorrq_u8(vqsubq_u8(v_t1, v_m_hthresh), vqsubq_u8(v_t2, v_m_hthresh)); + v_not_hev = vceqq_u8(v_t3, v_zero); + v_p1 = veorq_u8(v_p1, v_sign_bit); + v_p0 = veorq_u8(v_p0, v_sign_bit); + v_q0 = veorq_u8(v_q0, v_sign_bit); + v_q1 = veorq_u8(v_q1, v_sign_bit); + v_t1 = vreinterpretq_u8_s8(vqsubq_s8(vreinterpretq_s8_u8(v_p1), vreinterpretq_s8_u8(v_q1))); + v_t1 = vbicq_u8(v_t1, v_not_hev); + v_t2 = vreinterpretq_u8_s8(vqsubq_s8(vreinterpretq_s8_u8(v_q0), vreinterpretq_s8_u8(v_p0))); + v_t1 = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_t1), vreinterpretq_s8_u8(v_t2))); + v_t1 = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_t1), vreinterpretq_s8_u8(v_t2))); + v_delta = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_t1), vreinterpretq_s8_u8(v_t2))); + v_delta = vandq_u8(v_delta, v_mask); + v_v4 = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_delta), vreinterpretq_s8_u8(v_k4))); + v_v4 = vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u8(v_v4), 3u)); + v_v3 = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_delta), vreinterpretq_s8_u8(v_k3))); + v_v3 = vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u8(v_v3), 3u)); + v_q0 = vreinterpretq_u8_s8(vqsubq_s8(vreinterpretq_s8_u8(v_q0), vreinterpretq_s8_u8(v_v4))); + v_p0 = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_p0), vreinterpretq_s8_u8(v_v3))); + v_a3 = 
vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_v4), vreinterpretq_s8_u8(v_k1))); + v_a3 = vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u8(v_a3), 1u)); + v_a3 = vandq_u8(v_a3, v_not_hev); + v_q1 = vreinterpretq_u8_s8(vqsubq_s8(vreinterpretq_s8_u8(v_q1), vreinterpretq_s8_u8(v_a3))); + v_p1 = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_p1), vreinterpretq_s8_u8(v_a3))); + v_p1 = veorq_u8(v_p1, v_sign_bit); + v_p0 = veorq_u8(v_p0, v_sign_bit); + v_q0 = veorq_u8(v_q0, v_sign_bit); + v_q1 = veorq_u8(v_q1, v_sign_bit); + if (a_q0_off < (2u * ((uint64_t)(self->private_impl.f_y_stride)))) { + return wuffs_base__make_empty_struct(); + } + if ((a_q0_off - (2u * ((uint64_t)(self->private_impl.f_y_stride)))) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, (a_q0_off - (2u * ((uint64_t)(self->private_impl.f_y_stride))))); + } else { + return wuffs_base__make_empty_struct(); + } + if (16u <= ((uint64_t)(a_workbuf.len))) { + vst1q_u8(a_workbuf.ptr, v_p1); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + if (16u <= ((uint64_t)(a_workbuf.len))) { + vst1q_u8(a_workbuf.ptr, v_p0); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + if (16u <= ((uint64_t)(a_workbuf.len))) { + vst1q_u8(a_workbuf.ptr, v_q0); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + if (16u <= ((uint64_t)(a_workbuf.len))) { + vst1q_u8(a_workbuf.ptr, v_q1); + } + return wuffs_base__make_empty_struct(); +} +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +// ‼ WUFFS MULTI-FILE SECTION -arm_neon + +// ‼ WUFFS 
MULTI-FILE SECTION +arm_neon +// -------- func vp8.decoder.normal_vfilter_mb_16_arm_neon + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_mb_16_arm_neon( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) {/* Filters across the boundary between rows p0 and q0 for 16 adjacent byte columns of a_workbuf, with rows f_y_stride bytes apart. Loads eight 16-byte rows p3,p2,p1,p0,q0,q1,q2,q3 starting at byte offset a_q0_off - 4*f_y_stride, adjusts p2..q2 in the lanes that pass the threshold tests, and stores those six rows back. Only the low 8 bits of a_level, a_ilevel and a_hlevel are used (they are cast to uint8_t). If any load-side bounds check fails the function returns before writing anything. Always returns the empty struct. */ + wuffs_base__slice_u8 v_wb = {0}; + uint8x16_t v_p3 = {0}; + uint8x16_t v_p2 = {0}; + uint8x16_t v_p1 = {0}; + uint8x16_t v_p0 = {0}; + uint8x16_t v_q0 = {0}; + uint8x16_t v_q1 = {0}; + uint8x16_t v_q2 = {0}; + uint8x16_t v_q3 = {0}; + uint8x16_t v_zero = {0}; + uint8x16_t v_sign_bit = {0}; + uint8x16_t v_kFE = {0}; + uint8x16_t v_m_thresh = {0}; + uint8x16_t v_m_ithresh = {0}; + uint8x16_t v_m_hthresh = {0}; + uint8x16_t v_k3 = {0}; + uint8x16_t v_k4 = {0}; + uint8x16_t v_mask = {0}; + uint8x16_t v_not_hev = {0}; + uint8x16_t v_delta = {0}; + uint8x16_t v_v3 = {0}; + uint8x16_t v_v4 = {0}; + uint8x16_t v_a1 = {0}; + uint8x16_t v_a2 = {0}; + uint8x16_t v_a3 = {0}; + uint8x16_t v_t1 = {0}; + uint8x16_t v_t2 = {0}; + uint8x16_t v_t3 = {0}; + uint8x16_t v_p0_adj = {0}; + uint8x16_t v_q0_adj = {0}; + uint8x8_t v_d_lo = {0}; + uint8x8_t v_d_hi = {0}; + uint16x8_t v_lo = {0}; + uint16x8_t v_hi = {0}; + uint16x8_t v_k63_16 = {0}; + uint16x8_t v_tmp_lo = {0}; + uint16x8_t v_tmp_hi = {0}; + uint8x8_t v_narrow_lo = {0}; + uint8x8_t v_narrow_hi = {0};/* Guard a_q0_off - 4*stride against unsigned underflow, then walk v_wb down eight rows. Before each load the code checks that a full stride and then 16 bytes remain, and returns on failure. */ + + if (a_q0_off < (4u * ((uint64_t)(self->private_impl.f_y_stride)))) { + return wuffs_base__make_empty_struct(); + } + v_wb = a_workbuf; + if ((a_q0_off - (4u * ((uint64_t)(self->private_impl.f_y_stride)))) <= ((uint64_t)(v_wb.len))) { + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, (a_q0_off - (4u * ((uint64_t)(self->private_impl.f_y_stride))))); + } else { + return wuffs_base__make_empty_struct(); + } + if (16u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_p3 = vld1q_u8(v_wb.ptr); + if
(((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (16u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_p2 = vld1q_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (16u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_p1 = vld1q_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (16u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_p0 = vld1q_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (16u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_q0 = vld1q_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (16u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_q1 = vld1q_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (16u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_q2 = vld1q_u8(v_wb.ptr); + if
(((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (16u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_q3 = vld1q_u8(v_wb.ptr);/* All eight rows are loaded; broadcast the constants and the three thresholds. */ + v_zero = vdupq_n_u8(0u); + v_sign_bit = vdupq_n_u8(128u); + v_kFE = vdupq_n_u8(254u); + v_m_thresh = vdupq_n_u8(((uint8_t)(a_level))); + v_m_ithresh = vdupq_n_u8(((uint8_t)(a_ilevel))); + v_m_hthresh = vdupq_n_u8(((uint8_t)(a_hlevel))); + v_k3 = vdupq_n_u8(3u); + v_k4 = vdupq_n_u8(4u); + v_k63_16 = vdupq_n_u16(63u);/* v_mask lanes become all-ones where sat(sat(2*|p0-q0|) + (|p1-q1| >> 1)) <= a_level and each of |p3-p2|, |p2-p1|, |p1-p0|, |q0-q1|, |q1-q2|, |q2-q3| is <= a_ilevel. The test "vqsub(x, t) == 0" is an unsigned "x <= t". */ + v_t1 = vabdq_u8(v_p1, v_q1); + v_t2 = vshrq_n_u8(vandq_u8(v_t1, v_kFE), 1u); + v_t3 = vabdq_u8(v_p0, v_q0); + v_t3 = vqaddq_u8(v_t3, v_t3); + v_t3 = vqaddq_u8(v_t3, v_t2); + v_mask = vceqq_u8(vqsubq_u8(v_t3, v_m_thresh), v_zero); + v_t1 = vabdq_u8(v_p3, v_p2); + v_mask = vandq_u8(v_mask, vceqq_u8(vqsubq_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabdq_u8(v_p2, v_p1); + v_mask = vandq_u8(v_mask, vceqq_u8(vqsubq_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabdq_u8(v_p1, v_p0); + v_mask = vandq_u8(v_mask, vceqq_u8(vqsubq_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabdq_u8(v_q0, v_q1); + v_mask = vandq_u8(v_mask, vceqq_u8(vqsubq_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabdq_u8(v_q1, v_q2); + v_mask = vandq_u8(v_mask, vceqq_u8(vqsubq_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabdq_u8(v_q2, v_q3); + v_mask = vandq_u8(v_mask, vceqq_u8(vqsubq_u8(v_t1, v_m_ithresh), v_zero));/* v_not_hev lanes are all-ones where both |p1-p0| and |q1-q0| are <= a_hlevel. */ + v_t1 = vabdq_u8(v_p1, v_p0); + v_t2 = vabdq_u8(v_q1, v_q0); + v_t3 = vorrq_u8(vqsubq_u8(v_t1, v_m_hthresh), vqsubq_u8(v_t2, v_m_hthresh)); + v_not_hev = vceqq_u8(v_t3, v_zero);/* XOR with 0x80 re-biases the unsigned bytes to signed (x - 128) so the signed saturating intrinsics apply. */ + v_p2 = veorq_u8(v_p2, v_sign_bit); + v_p1 = veorq_u8(v_p1, v_sign_bit); + v_p0 = veorq_u8(v_p0, v_sign_bit); + v_q0 = veorq_u8(v_q0, v_sign_bit); + v_q1 = veorq_u8(v_q1, v_sign_bit); + v_q2 = veorq_u8(v_q2, v_sign_bit);/* v_delta = (p1 - q1) + 3*(q0 - p0), built from saturating subtracts and three saturating adds, then zeroed in lanes outside v_mask. */ + v_t1 = vreinterpretq_u8_s8(vqsubq_s8(vreinterpretq_s8_u8(v_p1),
vreinterpretq_s8_u8(v_q1))); + v_t2 = vreinterpretq_u8_s8(vqsubq_s8(vreinterpretq_s8_u8(v_q0), vreinterpretq_s8_u8(v_p0))); + v_t1 = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_t1), vreinterpretq_s8_u8(v_t2))); + v_t1 = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_t1), vreinterpretq_s8_u8(v_t2))); + v_delta = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_t1), vreinterpretq_s8_u8(v_t2))); + v_delta = vandq_u8(v_delta, v_mask);/* v_v4 = (delta + 4) >> 3 and v_v3 = (delta + 3) >> 3, using a saturating signed add and an arithmetic shift. */ + v_v4 = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_delta), vreinterpretq_s8_u8(v_k4))); + v_v4 = vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u8(v_v4), 3u)); + v_v3 = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_delta), vreinterpretq_s8_u8(v_k3))); + v_v3 = vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u8(v_v3), 3u));/* Sign-extend delta into two 8x16-bit halves and compute a1 = (27*delta + 63) >> 7, a2 = (18*delta + 63) >> 7, a3 = (9*delta + 63) >> 7, narrowing each result back to 8 bits with signed saturation. */ + v_d_lo = vget_low_u8(v_delta); + v_d_hi = vget_high_u8(v_delta); + v_lo = vreinterpretq_u16_s16(vmovl_s8(vreinterpret_s8_u8(v_d_lo))); + v_hi = vreinterpretq_u16_s16(vmovl_s8(vreinterpret_s8_u8(v_d_hi))); + v_tmp_lo = vmulq_n_u16(v_lo, 27u); + v_tmp_lo = vaddq_u16(v_tmp_lo, v_k63_16); + v_tmp_lo = vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(v_tmp_lo), 7u)); + v_tmp_hi = vmulq_n_u16(v_hi, 27u); + v_tmp_hi = vaddq_u16(v_tmp_hi, v_k63_16); + v_tmp_hi = vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(v_tmp_hi), 7u)); + v_narrow_lo = vreinterpret_u8_s8(vqmovn_s16(vreinterpretq_s16_u16(v_tmp_lo))); + v_narrow_hi = vreinterpret_u8_s8(vqmovn_s16(vreinterpretq_s16_u16(v_tmp_hi))); + v_a1 = vcombine_u8(v_narrow_lo, v_narrow_hi); + v_tmp_lo = vmulq_n_u16(v_lo, 18u); + v_tmp_lo = vaddq_u16(v_tmp_lo, v_k63_16); + v_tmp_lo = vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(v_tmp_lo), 7u)); + v_tmp_hi = vmulq_n_u16(v_hi, 18u); + v_tmp_hi = vaddq_u16(v_tmp_hi, v_k63_16); + v_tmp_hi = vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(v_tmp_hi), 7u)); + v_narrow_lo = vreinterpret_u8_s8(vqmovn_s16(vreinterpretq_s16_u16(v_tmp_lo))); + v_narrow_hi =
vreinterpret_u8_s8(vqmovn_s16(vreinterpretq_s16_u16(v_tmp_hi))); + v_a2 = vcombine_u8(v_narrow_lo, v_narrow_hi); + v_tmp_lo = vmulq_n_u16(v_lo, 9u); + v_tmp_lo = vaddq_u16(v_tmp_lo, v_k63_16); + v_tmp_lo = vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(v_tmp_lo), 7u)); + v_tmp_hi = vmulq_n_u16(v_hi, 9u); + v_tmp_hi = vaddq_u16(v_tmp_hi, v_k63_16); + v_tmp_hi = vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(v_tmp_hi), 7u)); + v_narrow_lo = vreinterpret_u8_s8(vqmovn_s16(vreinterpretq_s16_u16(v_tmp_lo))); + v_narrow_hi = vreinterpret_u8_s8(vqmovn_s16(vreinterpretq_s16_u16(v_tmp_hi))); + v_a3 = vcombine_u8(v_narrow_lo, v_narrow_hi);/* Per lane: p0 += (not_hev ? a1 : v3) and q0 -= (not_hev ? a1 : v4). p1/q1 move by a2 and p2/q2 by a3 only where not_hev is set. All adds and subtracts are signed-saturating. */ + v_p0_adj = vbicq_u8(v_v3, v_not_hev); + v_p0_adj = vorrq_u8(v_p0_adj, vandq_u8(v_a1, v_not_hev)); + v_p0 = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_p0), vreinterpretq_s8_u8(v_p0_adj))); + v_q0_adj = vbicq_u8(v_v4, v_not_hev); + v_q0_adj = vorrq_u8(v_q0_adj, vandq_u8(v_a1, v_not_hev)); + v_q0 = vreinterpretq_u8_s8(vqsubq_s8(vreinterpretq_s8_u8(v_q0), vreinterpretq_s8_u8(v_q0_adj))); + v_p1 = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_p1), vreinterpretq_s8_u8(vandq_u8(v_a2, v_not_hev)))); + v_q1 = vreinterpretq_u8_s8(vqsubq_s8(vreinterpretq_s8_u8(v_q1), vreinterpretq_s8_u8(vandq_u8(v_a2, v_not_hev)))); + v_p2 = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_p2), vreinterpretq_s8_u8(vandq_u8(v_a3, v_not_hev)))); + v_q2 = vreinterpretq_u8_s8(vqsubq_s8(vreinterpretq_s8_u8(v_q2), vreinterpretq_s8_u8(vandq_u8(v_a3, v_not_hev))));/* Undo the 0x80 bias to get unsigned bytes back. */ + v_p2 = veorq_u8(v_p2, v_sign_bit); + v_p1 = veorq_u8(v_p1, v_sign_bit); + v_p0 = veorq_u8(v_p0, v_sign_bit); + v_q0 = veorq_u8(v_q0, v_sign_bit); + v_q1 = veorq_u8(v_q1, v_sign_bit); + v_q2 = veorq_u8(v_q2, v_sign_bit);/* Store p2,p1,p0,q0,q1,q2 starting at a_q0_off - 3*stride. Unlike the loads, a failed 16-byte or stride check below skips that one store or advance instead of returning. */ + if (a_q0_off < (3u * ((uint64_t)(self->private_impl.f_y_stride)))) { + return wuffs_base__make_empty_struct(); + } + if ((a_q0_off - (3u * ((uint64_t)(self->private_impl.f_y_stride)))) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf =
wuffs_base__slice_u8__subslice_i(a_workbuf, (a_q0_off - (3u * ((uint64_t)(self->private_impl.f_y_stride))))); + } else { + return wuffs_base__make_empty_struct(); + } + if (16u <= ((uint64_t)(a_workbuf.len))) { + vst1q_u8(a_workbuf.ptr, v_p2); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + if (16u <= ((uint64_t)(a_workbuf.len))) { + vst1q_u8(a_workbuf.ptr, v_p1); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + if (16u <= ((uint64_t)(a_workbuf.len))) { + vst1q_u8(a_workbuf.ptr, v_p0); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + if (16u <= ((uint64_t)(a_workbuf.len))) { + vst1q_u8(a_workbuf.ptr, v_q0); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + if (16u <= ((uint64_t)(a_workbuf.len))) { + vst1q_u8(a_workbuf.ptr, v_q1); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + if (16u <= ((uint64_t)(a_workbuf.len))) { + vst1q_u8(a_workbuf.ptr, v_q2); + } + return wuffs_base__make_empty_struct(); +} +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +// ‼ WUFFS MULTI-FILE SECTION -arm_neon + +// ‼ WUFFS MULTI-FILE SECTION +arm_neon +// -------- func vp8.decoder.normal_vfilter_mb_8_arm_neon + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct
+wuffs_vp8__decoder__normal_vfilter_mb_8_arm_neon( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) {/* Filters across the boundary between rows p0 and q0 for 8 adjacent byte columns of a_workbuf, with rows f_uv_stride bytes apart. Loads eight 8-byte rows p3,p2,p1,p0,q0,q1,q2,q3 starting at byte offset a_q0_off - 4*f_uv_stride, adjusts p2..q2 in the lanes that pass the threshold tests, and stores those six rows back. Only the low 8 bits of a_level, a_ilevel and a_hlevel are used (they are cast to uint8_t). If any load-side bounds check fails the function returns before writing anything. Always returns the empty struct. */ + wuffs_base__slice_u8 v_wb = {0}; + uint8x8_t v_p3 = {0}; + uint8x8_t v_p2 = {0}; + uint8x8_t v_p1 = {0}; + uint8x8_t v_p0 = {0}; + uint8x8_t v_q0 = {0}; + uint8x8_t v_q1 = {0}; + uint8x8_t v_q2 = {0}; + uint8x8_t v_q3 = {0}; + uint8x8_t v_zero = {0}; + uint8x8_t v_sign_bit = {0}; + uint8x8_t v_kFE = {0}; + uint8x8_t v_m_thresh = {0}; + uint8x8_t v_m_ithresh = {0}; + uint8x8_t v_m_hthresh = {0}; + uint8x8_t v_k3 = {0}; + uint8x8_t v_k4 = {0}; + uint8x8_t v_mask = {0}; + uint8x8_t v_not_hev = {0}; + uint8x8_t v_delta = {0}; + uint8x8_t v_v3 = {0}; + uint8x8_t v_v4 = {0}; + uint8x8_t v_a1 = {0}; + uint8x8_t v_a2 = {0}; + uint8x8_t v_a3 = {0}; + uint8x8_t v_t1 = {0}; + uint8x8_t v_t2 = {0}; + uint8x8_t v_t3 = {0}; + uint8x8_t v_p0_adj = {0}; + uint8x8_t v_q0_adj = {0}; + uint16x8_t v_wide = {0}; + uint16x8_t v_tmp = {0}; + uint16x8_t v_k63_16 = {0};/* Guard a_q0_off - 4*stride against unsigned underflow, then walk v_wb down eight rows. Before each load the code checks that a full stride and then 8 bytes remain, and returns on failure. */ + + if (a_q0_off < (4u * ((uint64_t)(self->private_impl.f_uv_stride)))) { + return wuffs_base__make_empty_struct(); + } + v_wb = a_workbuf; + if ((a_q0_off - (4u * ((uint64_t)(self->private_impl.f_uv_stride)))) <= ((uint64_t)(v_wb.len))) { + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, (a_q0_off - (4u * ((uint64_t)(self->private_impl.f_uv_stride))))); + } else { + return wuffs_base__make_empty_struct(); + } + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_p3 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_p2 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_wb.len))) { + return
wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_p1 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_p0 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_q0 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_q1 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_q2 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_q3 = vld1_u8(v_wb.ptr);/* All eight rows are loaded; broadcast the constants and the three thresholds. */ + v_zero = vdup_n_u8(0u); + v_sign_bit = vdup_n_u8(128u); + v_kFE = vdup_n_u8(254u); + v_m_thresh =
vdup_n_u8(((uint8_t)(a_level))); + v_m_ithresh = vdup_n_u8(((uint8_t)(a_ilevel))); + v_m_hthresh = vdup_n_u8(((uint8_t)(a_hlevel))); + v_k3 = vdup_n_u8(3u); + v_k4 = vdup_n_u8(4u); + v_k63_16 = vdupq_n_u16(63u);/* v_mask lanes become all-ones where sat(sat(2*|p0-q0|) + (|p1-q1| >> 1)) <= a_level and each of |p3-p2|, |p2-p1|, |p1-p0|, |q0-q1|, |q1-q2|, |q2-q3| is <= a_ilevel. The test "vqsub(x, t) == 0" is an unsigned "x <= t". */ + v_t1 = vabd_u8(v_p1, v_q1); + v_t2 = vshr_n_u8(vand_u8(v_t1, v_kFE), 1u); + v_t3 = vabd_u8(v_p0, v_q0); + v_t3 = vqadd_u8(v_t3, v_t3); + v_t3 = vqadd_u8(v_t3, v_t2); + v_mask = vceq_u8(vqsub_u8(v_t3, v_m_thresh), v_zero); + v_t1 = vabd_u8(v_p3, v_p2); + v_mask = vand_u8(v_mask, vceq_u8(vqsub_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabd_u8(v_p2, v_p1); + v_mask = vand_u8(v_mask, vceq_u8(vqsub_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabd_u8(v_p1, v_p0); + v_mask = vand_u8(v_mask, vceq_u8(vqsub_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabd_u8(v_q0, v_q1); + v_mask = vand_u8(v_mask, vceq_u8(vqsub_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabd_u8(v_q1, v_q2); + v_mask = vand_u8(v_mask, vceq_u8(vqsub_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabd_u8(v_q2, v_q3); + v_mask = vand_u8(v_mask, vceq_u8(vqsub_u8(v_t1, v_m_ithresh), v_zero));/* v_not_hev lanes are all-ones where both |p1-p0| and |q1-q0| are <= a_hlevel. */ + v_t1 = vabd_u8(v_p1, v_p0); + v_t2 = vabd_u8(v_q1, v_q0); + v_t3 = vorr_u8(vqsub_u8(v_t1, v_m_hthresh), vqsub_u8(v_t2, v_m_hthresh)); + v_not_hev = vceq_u8(v_t3, v_zero);/* XOR with 0x80 re-biases the unsigned bytes to signed (x - 128) so the signed saturating intrinsics apply. */ + v_p2 = veor_u8(v_p2, v_sign_bit); + v_p1 = veor_u8(v_p1, v_sign_bit); + v_p0 = veor_u8(v_p0, v_sign_bit); + v_q0 = veor_u8(v_q0, v_sign_bit); + v_q1 = veor_u8(v_q1, v_sign_bit); + v_q2 = veor_u8(v_q2, v_sign_bit);/* v_delta = (p1 - q1) + 3*(q0 - p0), built from saturating subtracts and three saturating adds, then zeroed in lanes outside v_mask. */ + v_t1 = vreinterpret_u8_s8(vqsub_s8(vreinterpret_s8_u8(v_p1), vreinterpret_s8_u8(v_q1))); + v_t2 = vreinterpret_u8_s8(vqsub_s8(vreinterpret_s8_u8(v_q0), vreinterpret_s8_u8(v_p0))); + v_t1 = vreinterpret_u8_s8(vqadd_s8(vreinterpret_s8_u8(v_t1), vreinterpret_s8_u8(v_t2))); + v_t1 = vreinterpret_u8_s8(vqadd_s8(vreinterpret_s8_u8(v_t1), vreinterpret_s8_u8(v_t2))); + v_delta = vreinterpret_u8_s8(vqadd_s8(vreinterpret_s8_u8(v_t1), vreinterpret_s8_u8(v_t2))); + v_delta = vand_u8(v_delta, v_mask);/* Next: v_v4 = (delta + 4) >> 3 and v_v3 = (delta + 3) >> 3, using a saturating signed add and an arithmetic shift. */ + v_v4 =
vreinterpret_u8_s8(vqadd_s8(vreinterpret_s8_u8(v_delta), vreinterpret_s8_u8(v_k4))); + v_v4 = vreinterpret_u8_s8(vshr_n_s8(vreinterpret_s8_u8(v_v4), 3u)); + v_v3 = vreinterpret_u8_s8(vqadd_s8(vreinterpret_s8_u8(v_delta), vreinterpret_s8_u8(v_k3))); + v_v3 = vreinterpret_u8_s8(vshr_n_s8(vreinterpret_s8_u8(v_v3), 3u));/* Sign-extend delta to 16-bit lanes and compute a1 = (27*delta + 63) >> 7, a2 = (18*delta + 63) >> 7, a3 = (9*delta + 63) >> 7, narrowing each result back to 8 bits with signed saturation. */ + v_wide = vreinterpretq_u16_s16(vmovl_s8(vreinterpret_s8_u8(v_delta))); + v_tmp = vmulq_n_u16(v_wide, 27u); + v_tmp = vaddq_u16(v_tmp, v_k63_16); + v_tmp = vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(v_tmp), 7u)); + v_a1 = vreinterpret_u8_s8(vqmovn_s16(vreinterpretq_s16_u16(v_tmp))); + v_tmp = vmulq_n_u16(v_wide, 18u); + v_tmp = vaddq_u16(v_tmp, v_k63_16); + v_tmp = vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(v_tmp), 7u)); + v_a2 = vreinterpret_u8_s8(vqmovn_s16(vreinterpretq_s16_u16(v_tmp))); + v_tmp = vmulq_n_u16(v_wide, 9u); + v_tmp = vaddq_u16(v_tmp, v_k63_16); + v_tmp = vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(v_tmp), 7u)); + v_a3 = vreinterpret_u8_s8(vqmovn_s16(vreinterpretq_s16_u16(v_tmp)));/* Per lane: p0 += (not_hev ? a1 : v3) and q0 -= (not_hev ? a1 : v4). p1/q1 move by a2 and p2/q2 by a3 only where not_hev is set. All adds and subtracts are signed-saturating. */ + v_p0_adj = vbic_u8(v_v3, v_not_hev); + v_p0_adj = vorr_u8(v_p0_adj, vand_u8(v_a1, v_not_hev)); + v_p0 = vreinterpret_u8_s8(vqadd_s8(vreinterpret_s8_u8(v_p0), vreinterpret_s8_u8(v_p0_adj))); + v_q0_adj = vbic_u8(v_v4, v_not_hev); + v_q0_adj = vorr_u8(v_q0_adj, vand_u8(v_a1, v_not_hev)); + v_q0 = vreinterpret_u8_s8(vqsub_s8(vreinterpret_s8_u8(v_q0), vreinterpret_s8_u8(v_q0_adj))); + v_p1 = vreinterpret_u8_s8(vqadd_s8(vreinterpret_s8_u8(v_p1), vreinterpret_s8_u8(vand_u8(v_a2, v_not_hev)))); + v_q1 = vreinterpret_u8_s8(vqsub_s8(vreinterpret_s8_u8(v_q1), vreinterpret_s8_u8(vand_u8(v_a2, v_not_hev)))); + v_p2 = vreinterpret_u8_s8(vqadd_s8(vreinterpret_s8_u8(v_p2), vreinterpret_s8_u8(vand_u8(v_a3, v_not_hev)))); + v_q2 = vreinterpret_u8_s8(vqsub_s8(vreinterpret_s8_u8(v_q2), vreinterpret_s8_u8(vand_u8(v_a3, v_not_hev))));/* Undo the 0x80 bias to get unsigned bytes back. */ + v_p2 = veor_u8(v_p2, v_sign_bit); + v_p1 = veor_u8(v_p1, v_sign_bit); + v_p0 = veor_u8(v_p0,
v_sign_bit); + v_q0 = veor_u8(v_q0, v_sign_bit); + v_q1 = veor_u8(v_q1, v_sign_bit); + v_q2 = veor_u8(v_q2, v_sign_bit);/* Store p2,p1,p0,q0,q1,q2 starting at a_q0_off - 3*stride. Unlike the loads, a failed 8-byte or stride check below skips that one store or advance instead of returning. */ + if (a_q0_off < (3u * ((uint64_t)(self->private_impl.f_uv_stride)))) { + return wuffs_base__make_empty_struct(); + } + if ((a_q0_off - (3u * ((uint64_t)(self->private_impl.f_uv_stride)))) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, (a_q0_off - (3u * ((uint64_t)(self->private_impl.f_uv_stride))))); + } else { + return wuffs_base__make_empty_struct(); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_p2); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_uv_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_p1); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_uv_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_p0); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_uv_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_q0); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_uv_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_q1); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_uv_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_q2); + } + return
wuffs_base__make_empty_struct(); +} +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +// ‼ WUFFS MULTI-FILE SECTION -arm_neon + +// ‼ WUFFS MULTI-FILE SECTION +arm_neon +// -------- func vp8.decoder.normal_vfilter_inner_8_arm_neon + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_inner_8_arm_neon( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) {/* Filters across the boundary between rows p0 and q0 for 8 adjacent byte columns of a_workbuf, with rows f_uv_stride bytes apart. Loads eight 8-byte rows p3..q3 starting at byte offset a_q0_off - 4*f_uv_stride to evaluate the thresholds, but modifies and stores only the four rows p1,p0,q0,q1. Only the low 8 bits of a_level, a_ilevel and a_hlevel are used. If any load-side bounds check fails the function returns before writing anything. Always returns the empty struct. */ + wuffs_base__slice_u8 v_wb = {0}; + uint8x8_t v_p3 = {0}; + uint8x8_t v_p2 = {0}; + uint8x8_t v_p1 = {0}; + uint8x8_t v_p0 = {0}; + uint8x8_t v_q0 = {0}; + uint8x8_t v_q1 = {0}; + uint8x8_t v_q2 = {0}; + uint8x8_t v_q3 = {0}; + uint8x8_t v_zero = {0}; + uint8x8_t v_sign_bit = {0}; + uint8x8_t v_kFE = {0}; + uint8x8_t v_m_thresh = {0}; + uint8x8_t v_m_ithresh = {0}; + uint8x8_t v_m_hthresh = {0}; + uint8x8_t v_k1 = {0}; + uint8x8_t v_k3 = {0}; + uint8x8_t v_k4 = {0}; + uint8x8_t v_mask = {0}; + uint8x8_t v_not_hev = {0}; + uint8x8_t v_delta = {0}; + uint8x8_t v_v3 = {0}; + uint8x8_t v_v4 = {0}; + uint8x8_t v_a3 = {0}; + uint8x8_t v_t1 = {0}; + uint8x8_t v_t2 = {0}; + uint8x8_t v_t3 = {0};/* Guard a_q0_off - 4*stride against unsigned underflow before positioning v_wb on row p3. */ + + if (a_q0_off < (4u * ((uint64_t)(self->private_impl.f_uv_stride)))) { + return wuffs_base__make_empty_struct(); + } + v_wb = a_workbuf; + if ((a_q0_off - (4u * ((uint64_t)(self->private_impl.f_uv_stride)))) <= ((uint64_t)(v_wb.len))) { + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, (a_q0_off - (4u * ((uint64_t)(self->private_impl.f_uv_stride))))); + } else { + return wuffs_base__make_empty_struct(); + } + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_p3 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb,
((uint64_t)(self->private_impl.f_uv_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_p2 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_p1 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_p0 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_q0 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_q1 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_q2 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, 
((uint64_t)(self->private_impl.f_uv_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_q3 = vld1_u8(v_wb.ptr); + v_zero = vdup_n_u8(0u); + v_sign_bit = vdup_n_u8(128u); + v_kFE = vdup_n_u8(254u); + v_m_thresh = vdup_n_u8(((uint8_t)(a_level))); + v_m_ithresh = vdup_n_u8(((uint8_t)(a_ilevel))); + v_m_hthresh = vdup_n_u8(((uint8_t)(a_hlevel))); + v_k1 = vdup_n_u8(1u); + v_k3 = vdup_n_u8(3u); + v_k4 = vdup_n_u8(4u); + v_t1 = vabd_u8(v_p1, v_q1); + v_t2 = vshr_n_u8(vand_u8(v_t1, v_kFE), 1u); + v_t3 = vabd_u8(v_p0, v_q0); + v_t3 = vqadd_u8(v_t3, v_t3); + v_t3 = vqadd_u8(v_t3, v_t2); + v_mask = vceq_u8(vqsub_u8(v_t3, v_m_thresh), v_zero); + v_t1 = vabd_u8(v_p3, v_p2); + v_mask = vand_u8(v_mask, vceq_u8(vqsub_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabd_u8(v_p2, v_p1); + v_mask = vand_u8(v_mask, vceq_u8(vqsub_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabd_u8(v_p1, v_p0); + v_mask = vand_u8(v_mask, vceq_u8(vqsub_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabd_u8(v_q0, v_q1); + v_mask = vand_u8(v_mask, vceq_u8(vqsub_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabd_u8(v_q1, v_q2); + v_mask = vand_u8(v_mask, vceq_u8(vqsub_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabd_u8(v_q2, v_q3); + v_mask = vand_u8(v_mask, vceq_u8(vqsub_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabd_u8(v_p1, v_p0); + v_t2 = vabd_u8(v_q1, v_q0); + v_t3 = vorr_u8(vqsub_u8(v_t1, v_m_hthresh), vqsub_u8(v_t2, v_m_hthresh)); + v_not_hev = vceq_u8(v_t3, v_zero); + v_p1 = veor_u8(v_p1, v_sign_bit); + v_p0 = veor_u8(v_p0, v_sign_bit); + v_q0 = veor_u8(v_q0, v_sign_bit); + v_q1 = veor_u8(v_q1, v_sign_bit); + v_t1 = vreinterpret_u8_s8(vqsub_s8(vreinterpret_s8_u8(v_p1), vreinterpret_s8_u8(v_q1))); + v_t1 = vbic_u8(v_t1, v_not_hev); + v_t2 = vreinterpret_u8_s8(vqsub_s8(vreinterpret_s8_u8(v_q0), vreinterpret_s8_u8(v_p0))); + v_t1 = vreinterpret_u8_s8(vqadd_s8(vreinterpret_s8_u8(v_t1), vreinterpret_s8_u8(v_t2))); + v_t1 = vreinterpret_u8_s8(vqadd_s8(vreinterpret_s8_u8(v_t1), 
vreinterpret_s8_u8(v_t2))); + v_delta = vreinterpret_u8_s8(vqadd_s8(vreinterpret_s8_u8(v_t1), vreinterpret_s8_u8(v_t2))); + v_delta = vand_u8(v_delta, v_mask); + v_v4 = vreinterpret_u8_s8(vqadd_s8(vreinterpret_s8_u8(v_delta), vreinterpret_s8_u8(v_k4))); + v_v4 = vreinterpret_u8_s8(vshr_n_s8(vreinterpret_s8_u8(v_v4), 3u)); + v_v3 = vreinterpret_u8_s8(vqadd_s8(vreinterpret_s8_u8(v_delta), vreinterpret_s8_u8(v_k3))); + v_v3 = vreinterpret_u8_s8(vshr_n_s8(vreinterpret_s8_u8(v_v3), 3u)); + v_q0 = vreinterpret_u8_s8(vqsub_s8(vreinterpret_s8_u8(v_q0), vreinterpret_s8_u8(v_v4))); + v_p0 = vreinterpret_u8_s8(vqadd_s8(vreinterpret_s8_u8(v_p0), vreinterpret_s8_u8(v_v3))); + v_a3 = vreinterpret_u8_s8(vqadd_s8(vreinterpret_s8_u8(v_v4), vreinterpret_s8_u8(v_k1))); + v_a3 = vreinterpret_u8_s8(vshr_n_s8(vreinterpret_s8_u8(v_a3), 1u)); + v_a3 = vand_u8(v_a3, v_not_hev); + v_q1 = vreinterpret_u8_s8(vqsub_s8(vreinterpret_s8_u8(v_q1), vreinterpret_s8_u8(v_a3))); + v_p1 = vreinterpret_u8_s8(vqadd_s8(vreinterpret_s8_u8(v_p1), vreinterpret_s8_u8(v_a3))); + v_p1 = veor_u8(v_p1, v_sign_bit); + v_p0 = veor_u8(v_p0, v_sign_bit); + v_q0 = veor_u8(v_q0, v_sign_bit); + v_q1 = veor_u8(v_q1, v_sign_bit); + if (a_q0_off < (2u * ((uint64_t)(self->private_impl.f_uv_stride)))) { + return wuffs_base__make_empty_struct(); + } + if ((a_q0_off - (2u * ((uint64_t)(self->private_impl.f_uv_stride)))) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, (a_q0_off - (2u * ((uint64_t)(self->private_impl.f_uv_stride))))); + } else { + return wuffs_base__make_empty_struct(); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_p1); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_uv_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_p0); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) 
<= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_uv_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_q0); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_uv_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_q1); + } + return wuffs_base__make_empty_struct(); +} +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +// ‼ WUFFS MULTI-FILE SECTION -arm_neon + +// ‼ WUFFS MULTI-FILE SECTION +arm_neon +// -------- func vp8.decoder.normal_hfilter_mb_8_arm_neon +// +// Filters one 8x8 byte tile of a_workbuf in place. The tile's eight rows +// start at byte offset (a_q0_off - 4) and are f_uv_stride bytes apart. The +// rows are transposed in registers so that the eight byte columns can be +// handled as p3, p2, p1, p0, q0, q1, q2, q3; p2..q2 may then be adjusted, and +// the tile is transposed back and stored. a_level, a_ilevel and a_hlevel are +// truncated to 8 bits and broadcast as the three per-lane thresholds. +// +// Returns early, before anything is stored, if a_q0_off < 4 or if any of the +// eight 8-byte row loads would run past the end of a_workbuf. +// +// NOTE(review): presumably the VP8 "normal" loop filter for a macroblock +// edge between columns p0 and q0 of a chroma plane (hence f_uv_stride) - +// confirm against the .wuffs source. + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_mb_8_arm_neon( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + wuffs_base__slice_u8 v_wb = {0}; + uint8x8_t v_r0 = {0}; + uint8x8_t v_r1 = {0}; + uint8x8_t v_r2 = {0}; + uint8x8_t v_r3 = {0}; + uint8x8_t v_r4 = {0}; + uint8x8_t v_r5 = {0}; + uint8x8_t v_r6 = {0}; + uint8x8_t v_r7 = {0}; + uint8x8_t v_s0 = {0}; + uint8x8_t v_s1 = {0}; + uint8x8_t v_s2 = {0}; + uint8x8_t v_s3 = {0}; + uint8x8_t v_s4 = {0}; + uint8x8_t v_s5 = {0}; + uint8x8_t v_s6 = {0}; + uint8x8_t v_s7 = {0}; + uint8x8_t v_p3 = {0}; + uint8x8_t v_p2 = {0}; + uint8x8_t v_p1 = {0}; + uint8x8_t v_p0 = {0}; + uint8x8_t v_q0 = {0}; + uint8x8_t v_q1 = {0}; + uint8x8_t v_q2 = {0}; + uint8x8_t v_q3 = {0}; + uint8x8_t v_zero = {0}; + uint8x8_t v_sign_bit = {0}; + uint8x8_t v_kFE = {0}; + uint8x8_t v_m_thresh = {0}; + uint8x8_t v_m_ithresh = {0}; + uint8x8_t v_m_hthresh = {0}; + uint8x8_t v_k3 = {0}; + uint8x8_t v_k4 = {0}; + uint8x8_t v_mask = {0}; + uint8x8_t v_not_hev = {0}; + uint8x8_t v_delta = {0}; + uint8x8_t v_v3 = {0}; +
uint8x8_t v_v4 = {0}; + uint8x8_t v_a1 = {0}; + uint8x8_t v_a2 = {0}; + uint8x8_t v_a3 = {0}; + uint8x8_t v_t1 = {0}; + uint8x8_t v_t2 = {0}; + uint8x8_t v_t3 = {0}; + uint8x8_t v_p0_adj = {0}; + uint8x8_t v_q0_adj = {0}; + uint16x8_t v_wide = {0}; + uint16x8_t v_tmp = {0}; + uint16x8_t v_k63_16 = {0}; + + /* Load phase: read eight 8-byte rows through v_wb, a private copy of a_workbuf. */ + /* Every failed bounds check below returns before anything has been stored. */ + if (a_q0_off < 4u) { + return wuffs_base__make_empty_struct(); + } + v_wb = a_workbuf; + if ((a_q0_off - 4u) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, (a_q0_off - 4u)); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r0 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r1 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r2 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r3 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r4 = vld1_u8(v_wb.ptr); + if
(((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r5 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r6 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r7 = vld1_u8(v_wb.ptr); + /* Transpose the 8x8 tile in three vtrn stages (8-, 16- and 32-bit lanes). */ + /* Afterwards v_p3, v_p2, v_p1, v_p0, v_q0, v_q1, v_q2, v_q3 hold byte columns 0..7, one lane per row. */ + v_s0 = vtrn1_u8(v_r0, v_r1); + v_s1 = vtrn2_u8(v_r0, v_r1); + v_s2 = vtrn1_u8(v_r2, v_r3); + v_s3 = vtrn2_u8(v_r2, v_r3); + v_s4 = vtrn1_u8(v_r4, v_r5); + v_s5 = vtrn2_u8(v_r4, v_r5); + v_s6 = vtrn1_u8(v_r6, v_r7); + v_s7 = vtrn2_u8(v_r6, v_r7); + v_r0 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s0), vreinterpret_u16_u8(v_s2))); + v_r2 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s0), vreinterpret_u16_u8(v_s2))); + v_r1 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s1), vreinterpret_u16_u8(v_s3))); + v_r3 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s1), vreinterpret_u16_u8(v_s3))); + v_r4 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s4), vreinterpret_u16_u8(v_s6))); + v_r6 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s4), vreinterpret_u16_u8(v_s6))); + v_r5 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s5), vreinterpret_u16_u8(v_s7))); + v_r7 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s5), vreinterpret_u16_u8(v_s7))); +
v_p3 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r0), vreinterpret_u32_u8(v_r4))); + v_q0 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r0), vreinterpret_u32_u8(v_r4))); + v_p2 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r1), vreinterpret_u32_u8(v_r5))); + v_q1 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r1), vreinterpret_u32_u8(v_r5))); + v_p1 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r2), vreinterpret_u32_u8(v_r6))); + v_q2 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r2), vreinterpret_u32_u8(v_r6))); + v_p0 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r3), vreinterpret_u32_u8(v_r7))); + v_q3 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r3), vreinterpret_u32_u8(v_r7))); + /* Broadcast the constants and the three thresholds (each truncated to 8 bits). */ + v_zero = vdup_n_u8(0u); + v_sign_bit = vdup_n_u8(128u); + v_kFE = vdup_n_u8(254u); + v_m_thresh = vdup_n_u8(((uint8_t)(a_level))); + v_m_ithresh = vdup_n_u8(((uint8_t)(a_ilevel))); + v_m_hthresh = vdup_n_u8(((uint8_t)(a_hlevel))); + v_k3 = vdup_n_u8(3u); + v_k4 = vdup_n_u8(4u); + v_k63_16 = vdupq_n_u16(63u); + /* v_mask lanes become 0xFF where the saturating sum of 2*|p0-q0| and |p1-q1|/2 is <= level */ + /* and each of |p3-p2|, |p2-p1|, |p1-p0|, |q0-q1|, |q1-q2|, |q2-q3| is <= ilevel. */ + /* "vqsub(x, t) == 0" is the unsigned test x <= t. */ + v_t1 = vabd_u8(v_p1, v_q1); + v_t2 = vshr_n_u8(vand_u8(v_t1, v_kFE), 1u); + v_t3 = vabd_u8(v_p0, v_q0); + v_t3 = vqadd_u8(v_t3, v_t3); + v_t3 = vqadd_u8(v_t3, v_t2); + v_mask = vceq_u8(vqsub_u8(v_t3, v_m_thresh), v_zero); + v_t1 = vabd_u8(v_p3, v_p2); + v_mask = vand_u8(v_mask, vceq_u8(vqsub_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabd_u8(v_p2, v_p1); + v_mask = vand_u8(v_mask, vceq_u8(vqsub_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabd_u8(v_p1, v_p0); + v_mask = vand_u8(v_mask, vceq_u8(vqsub_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabd_u8(v_q0, v_q1); + v_mask = vand_u8(v_mask, vceq_u8(vqsub_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabd_u8(v_q1, v_q2); + v_mask = vand_u8(v_mask, vceq_u8(vqsub_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabd_u8(v_q2, v_q3); + v_mask = vand_u8(v_mask, vceq_u8(vqsub_u8(v_t1, v_m_ithresh), v_zero)); + /* v_not_hev lanes become 0xFF where both |p1-p0| and |q1-q0| are <= hlevel. */ + v_t1 = vabd_u8(v_p1, v_p0); + v_t2 = vabd_u8(v_q1, v_q0); + v_t3 =
vorr_u8(vqsub_u8(v_t1, v_m_hthresh), vqsub_u8(v_t2, v_m_hthresh)); + v_not_hev = vceq_u8(v_t3, v_zero); + /* XOR with 0x80 re-biases the unsigned samples so they can go through signed saturating arithmetic. */ + v_p2 = veor_u8(v_p2, v_sign_bit); + v_p1 = veor_u8(v_p1, v_sign_bit); + v_p0 = veor_u8(v_p0, v_sign_bit); + v_q0 = veor_u8(v_q0, v_sign_bit); + v_q1 = veor_u8(v_q1, v_sign_bit); + v_q2 = veor_u8(v_q2, v_sign_bit); + /* v_delta = sat(p1 - q1) plus three saturating additions of sat(q0 - p0), then zeroed in lanes where v_mask is clear. */ + v_t1 = vreinterpret_u8_s8(vqsub_s8(vreinterpret_s8_u8(v_p1), vreinterpret_s8_u8(v_q1))); + v_t2 = vreinterpret_u8_s8(vqsub_s8(vreinterpret_s8_u8(v_q0), vreinterpret_s8_u8(v_p0))); + v_t1 = vreinterpret_u8_s8(vqadd_s8(vreinterpret_s8_u8(v_t1), vreinterpret_s8_u8(v_t2))); + v_t1 = vreinterpret_u8_s8(vqadd_s8(vreinterpret_s8_u8(v_t1), vreinterpret_s8_u8(v_t2))); + v_delta = vreinterpret_u8_s8(vqadd_s8(vreinterpret_s8_u8(v_t1), vreinterpret_s8_u8(v_t2))); + v_delta = vand_u8(v_delta, v_mask); + /* v_v4 = sat(delta + 4) >> 3 and v_v3 = sat(delta + 3) >> 3, using arithmetic shifts. */ + v_v4 = vreinterpret_u8_s8(vqadd_s8(vreinterpret_s8_u8(v_delta), vreinterpret_s8_u8(v_k4))); + v_v4 = vreinterpret_u8_s8(vshr_n_s8(vreinterpret_s8_u8(v_v4), 3u)); + v_v3 = vreinterpret_u8_s8(vqadd_s8(vreinterpret_s8_u8(v_delta), vreinterpret_s8_u8(v_k3))); + v_v3 = vreinterpret_u8_s8(vshr_n_s8(vreinterpret_s8_u8(v_v3), 3u)); + /* Sign-extend delta to 16 bits, then v_a1, v_a2, v_a3 = (27, 18, 9 times delta, plus 63) >> 7, each narrowed back with signed saturation. */ + v_wide = vreinterpretq_u16_s16(vmovl_s8(vreinterpret_s8_u8(v_delta))); + v_tmp = vmulq_n_u16(v_wide, 27u); + v_tmp = vaddq_u16(v_tmp, v_k63_16); + v_tmp = vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(v_tmp), 7u)); + v_a1 = vreinterpret_u8_s8(vqmovn_s16(vreinterpretq_s16_u16(v_tmp))); + v_tmp = vmulq_n_u16(v_wide, 18u); + v_tmp = vaddq_u16(v_tmp, v_k63_16); + v_tmp = vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(v_tmp), 7u)); + v_a2 = vreinterpret_u8_s8(vqmovn_s16(vreinterpretq_s16_u16(v_tmp))); + v_tmp = vmulq_n_u16(v_wide, 9u); + v_tmp = vaddq_u16(v_tmp, v_k63_16); + v_tmp = vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(v_tmp), 7u)); + v_a3 = vreinterpret_u8_s8(vqmovn_s16(vreinterpretq_s16_u16(v_tmp))); + /* Per lane: where v_not_hev is clear, p0 gains v_v3 and q0 loses v_v4 and nothing else changes. */ + /* Where v_not_hev is set, p0/q0 move by v_a1, p1/q1 by v_a2 and p2/q2 by v_a3 (p side added, q side subtracted, all saturating). */ + v_p0_adj = vbic_u8(v_v3, v_not_hev); + v_p0_adj = vorr_u8(v_p0_adj, vand_u8(v_a1, v_not_hev)); + v_p0 =
vreinterpret_u8_s8(vqadd_s8(vreinterpret_s8_u8(v_p0), vreinterpret_s8_u8(v_p0_adj))); + v_q0_adj = vbic_u8(v_v4, v_not_hev); + v_q0_adj = vorr_u8(v_q0_adj, vand_u8(v_a1, v_not_hev)); + v_q0 = vreinterpret_u8_s8(vqsub_s8(vreinterpret_s8_u8(v_q0), vreinterpret_s8_u8(v_q0_adj))); + v_p1 = vreinterpret_u8_s8(vqadd_s8(vreinterpret_s8_u8(v_p1), vreinterpret_s8_u8(vand_u8(v_a2, v_not_hev)))); + v_q1 = vreinterpret_u8_s8(vqsub_s8(vreinterpret_s8_u8(v_q1), vreinterpret_s8_u8(vand_u8(v_a2, v_not_hev)))); + v_p2 = vreinterpret_u8_s8(vqadd_s8(vreinterpret_s8_u8(v_p2), vreinterpret_s8_u8(vand_u8(v_a3, v_not_hev)))); + v_q2 = vreinterpret_u8_s8(vqsub_s8(vreinterpret_s8_u8(v_q2), vreinterpret_s8_u8(vand_u8(v_a3, v_not_hev)))); + /* Undo the 0x80 bias on the six columns that were re-biased. */ + v_p2 = veor_u8(v_p2, v_sign_bit); + v_p1 = veor_u8(v_p1, v_sign_bit); + v_p0 = veor_u8(v_p0, v_sign_bit); + v_q0 = veor_u8(v_q0, v_sign_bit); + v_q1 = veor_u8(v_q1, v_sign_bit); + v_q2 = veor_u8(v_q2, v_sign_bit); + /* Transpose back with the same three vtrn stages; v_s0..v_s7 end up as tile rows 0..7. */ + v_s0 = vtrn1_u8(v_p3, v_p2); + v_s1 = vtrn2_u8(v_p3, v_p2); + v_s2 = vtrn1_u8(v_p1, v_p0); + v_s3 = vtrn2_u8(v_p1, v_p0); + v_s4 = vtrn1_u8(v_q0, v_q1); + v_s5 = vtrn2_u8(v_q0, v_q1); + v_s6 = vtrn1_u8(v_q2, v_q3); + v_s7 = vtrn2_u8(v_q2, v_q3); + v_r0 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s0), vreinterpret_u16_u8(v_s2))); + v_r2 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s0), vreinterpret_u16_u8(v_s2))); + v_r1 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s1), vreinterpret_u16_u8(v_s3))); + v_r3 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s1), vreinterpret_u16_u8(v_s3))); + v_r4 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s4), vreinterpret_u16_u8(v_s6))); + v_r6 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s4), vreinterpret_u16_u8(v_s6))); + v_r5 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s5), vreinterpret_u16_u8(v_s7))); + v_r7 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s5), vreinterpret_u16_u8(v_s7))); + v_s0 =
vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r0), vreinterpret_u32_u8(v_r4))); + v_s4 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r0), vreinterpret_u32_u8(v_r4))); + v_s1 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r1), vreinterpret_u32_u8(v_r5))); + v_s5 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r1), vreinterpret_u32_u8(v_r5))); + v_s2 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r2), vreinterpret_u32_u8(v_r6))); + v_s6 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r2), vreinterpret_u32_u8(v_r6))); + v_s3 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r3), vreinterpret_u32_u8(v_r7))); + v_s7 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r3), vreinterpret_u32_u8(v_r7))); + /* Store phase: walk a_workbuf through the same offsets as the loads and write the eight rows back. */ + if ((a_q0_off - 4u) > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, (a_q0_off - 4u)); + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s0); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_uv_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s1); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_uv_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s2); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_uv_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s3); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf,
((uint64_t)(self->private_impl.f_uv_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s4); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_uv_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s5); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_uv_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s6); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_uv_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s7); + } + return wuffs_base__make_empty_struct(); +} +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +// ‼ WUFFS MULTI-FILE SECTION -arm_neon + +// ‼ WUFFS MULTI-FILE SECTION +arm_neon +// -------- func vp8.decoder.normal_hfilter_inner_8_arm_neon + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_inner_8_arm_neon( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + wuffs_base__slice_u8 v_wb = {0}; + uint8x8_t v_r0 = {0}; + uint8x8_t v_r1 = {0}; + uint8x8_t v_r2 = {0}; + uint8x8_t v_r3 = {0}; + uint8x8_t v_r4 = {0}; + uint8x8_t v_r5 = {0}; + uint8x8_t v_r6 = {0}; + uint8x8_t v_r7 = {0}; + uint8x8_t v_s0 = {0}; + uint8x8_t v_s1 = {0}; + uint8x8_t v_s2 = {0}; + uint8x8_t v_s3 = {0}; + uint8x8_t v_s4 = {0}; + uint8x8_t v_s5 = {0}; + uint8x8_t v_s6 = {0}; + uint8x8_t v_s7 = {0}; + uint8x8_t v_p3 = {0}; + uint8x8_t v_p2 = {0}; + uint8x8_t
v_p1 = {0}; + uint8x8_t v_p0 = {0}; + uint8x8_t v_q0 = {0}; + uint8x8_t v_q1 = {0}; + uint8x8_t v_q2 = {0}; + uint8x8_t v_q3 = {0}; + uint8x8_t v_zero = {0}; + uint8x8_t v_sign_bit = {0}; + uint8x8_t v_kFE = {0}; + uint8x8_t v_m_thresh = {0}; + uint8x8_t v_m_ithresh = {0}; + uint8x8_t v_m_hthresh = {0}; + uint8x8_t v_k1 = {0}; + uint8x8_t v_k3 = {0}; + uint8x8_t v_k4 = {0}; + uint8x8_t v_mask = {0}; + uint8x8_t v_not_hev = {0}; + uint8x8_t v_delta = {0}; + uint8x8_t v_v3 = {0}; + uint8x8_t v_v4 = {0}; + uint8x8_t v_a3 = {0}; + uint8x8_t v_t1 = {0}; + uint8x8_t v_t2 = {0}; + uint8x8_t v_t3 = {0}; + + /* normal_hfilter_inner_8_arm_neon filters one 8x8 byte tile of a_workbuf in place. */ + /* The tile's eight rows start at byte offset (a_q0_off - 4) and are f_uv_stride bytes apart. */ + /* After an in-register transpose the byte columns are handled as p3..q3; only p1, p0, q0 and q1 are adjusted. */ + /* It returns early, before anything is stored, if a_q0_off < 4 or any 8-byte row load would run past a_workbuf. */ + /* NOTE(review): presumably the VP8 "normal" loop filter for an interior vertical edge of a chroma plane - confirm against the .wuffs source. */ + /* Load phase: read eight 8-byte rows through v_wb, a private copy of a_workbuf. */ + if (a_q0_off < 4u) { + return wuffs_base__make_empty_struct(); + } + v_wb = a_workbuf; + if ((a_q0_off - 4u) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, (a_q0_off - 4u)); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r0 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r1 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r2 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r3 = vld1_u8(v_wb.ptr); + if
(((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r4 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r5 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r6 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r7 = vld1_u8(v_wb.ptr); + /* Transpose the 8x8 tile in three vtrn stages (8-, 16- and 32-bit lanes). */ + /* Afterwards v_p3, v_p2, v_p1, v_p0, v_q0, v_q1, v_q2, v_q3 hold byte columns 0..7, one lane per row. */ + v_s0 = vtrn1_u8(v_r0, v_r1); + v_s1 = vtrn2_u8(v_r0, v_r1); + v_s2 = vtrn1_u8(v_r2, v_r3); + v_s3 = vtrn2_u8(v_r2, v_r3); + v_s4 = vtrn1_u8(v_r4, v_r5); + v_s5 = vtrn2_u8(v_r4, v_r5); + v_s6 = vtrn1_u8(v_r6, v_r7); + v_s7 = vtrn2_u8(v_r6, v_r7); + v_r0 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s0), vreinterpret_u16_u8(v_s2))); + v_r2 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s0), vreinterpret_u16_u8(v_s2))); + v_r1 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s1), vreinterpret_u16_u8(v_s3))); + v_r3 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s1), vreinterpret_u16_u8(v_s3))); + v_r4 =
vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s4), vreinterpret_u16_u8(v_s6))); + v_r6 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s4), vreinterpret_u16_u8(v_s6))); + v_r5 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s5), vreinterpret_u16_u8(v_s7))); + v_r7 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s5), vreinterpret_u16_u8(v_s7))); + v_p3 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r0), vreinterpret_u32_u8(v_r4))); + v_q0 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r0), vreinterpret_u32_u8(v_r4))); + v_p2 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r1), vreinterpret_u32_u8(v_r5))); + v_q1 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r1), vreinterpret_u32_u8(v_r5))); + v_p1 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r2), vreinterpret_u32_u8(v_r6))); + v_q2 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r2), vreinterpret_u32_u8(v_r6))); + v_p0 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r3), vreinterpret_u32_u8(v_r7))); + v_q3 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r3), vreinterpret_u32_u8(v_r7))); + /* Broadcast the constants and the three thresholds (each truncated to 8 bits). */ + v_zero = vdup_n_u8(0u); + v_sign_bit = vdup_n_u8(128u); + v_kFE = vdup_n_u8(254u); + v_m_thresh = vdup_n_u8(((uint8_t)(a_level))); + v_m_ithresh = vdup_n_u8(((uint8_t)(a_ilevel))); + v_m_hthresh = vdup_n_u8(((uint8_t)(a_hlevel))); + v_k1 = vdup_n_u8(1u); + v_k3 = vdup_n_u8(3u); + v_k4 = vdup_n_u8(4u); + /* v_mask lanes become 0xFF where the saturating sum of 2*|p0-q0| and |p1-q1|/2 is <= level */ + /* and each of |p3-p2|, |p2-p1|, |p1-p0|, |q0-q1|, |q1-q2|, |q2-q3| is <= ilevel. */ + /* "vqsub(x, t) == 0" is the unsigned test x <= t. */ + v_t1 = vabd_u8(v_p1, v_q1); + v_t2 = vshr_n_u8(vand_u8(v_t1, v_kFE), 1u); + v_t3 = vabd_u8(v_p0, v_q0); + v_t3 = vqadd_u8(v_t3, v_t3); + v_t3 = vqadd_u8(v_t3, v_t2); + v_mask = vceq_u8(vqsub_u8(v_t3, v_m_thresh), v_zero); + v_t1 = vabd_u8(v_p3, v_p2); + v_mask = vand_u8(v_mask, vceq_u8(vqsub_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabd_u8(v_p2, v_p1); + v_mask = vand_u8(v_mask, vceq_u8(vqsub_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabd_u8(v_p1, v_p0); + v_mask = vand_u8(v_mask, vceq_u8(vqsub_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabd_u8(v_q0,
v_q1); + v_mask = vand_u8(v_mask, vceq_u8(vqsub_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabd_u8(v_q1, v_q2); + v_mask = vand_u8(v_mask, vceq_u8(vqsub_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabd_u8(v_q2, v_q3); + v_mask = vand_u8(v_mask, vceq_u8(vqsub_u8(v_t1, v_m_ithresh), v_zero)); + /* v_not_hev lanes become 0xFF where both |p1-p0| and |q1-q0| are <= hlevel. */ + v_t1 = vabd_u8(v_p1, v_p0); + v_t2 = vabd_u8(v_q1, v_q0); + v_t3 = vorr_u8(vqsub_u8(v_t1, v_m_hthresh), vqsub_u8(v_t2, v_m_hthresh)); + v_not_hev = vceq_u8(v_t3, v_zero); + /* XOR with 0x80 re-biases the unsigned samples so they can go through signed saturating arithmetic. */ + v_p1 = veor_u8(v_p1, v_sign_bit); + v_p0 = veor_u8(v_p0, v_sign_bit); + v_q0 = veor_u8(v_q0, v_sign_bit); + v_q1 = veor_u8(v_q1, v_sign_bit); + /* v_delta starts from sat(p1 - q1), kept only in lanes where v_not_hev is clear, */ + /* then takes three saturating additions of sat(q0 - p0) and is zeroed in lanes where v_mask is clear. */ + v_t1 = vreinterpret_u8_s8(vqsub_s8(vreinterpret_s8_u8(v_p1), vreinterpret_s8_u8(v_q1))); + v_t1 = vbic_u8(v_t1, v_not_hev); + v_t2 = vreinterpret_u8_s8(vqsub_s8(vreinterpret_s8_u8(v_q0), vreinterpret_s8_u8(v_p0))); + v_t1 = vreinterpret_u8_s8(vqadd_s8(vreinterpret_s8_u8(v_t1), vreinterpret_s8_u8(v_t2))); + v_t1 = vreinterpret_u8_s8(vqadd_s8(vreinterpret_s8_u8(v_t1), vreinterpret_s8_u8(v_t2))); + v_delta = vreinterpret_u8_s8(vqadd_s8(vreinterpret_s8_u8(v_t1), vreinterpret_s8_u8(v_t2))); + v_delta = vand_u8(v_delta, v_mask); + /* v_v4 = sat(delta + 4) >> 3 and v_v3 = sat(delta + 3) >> 3, using arithmetic shifts. */ + v_v4 = vreinterpret_u8_s8(vqadd_s8(vreinterpret_s8_u8(v_delta), vreinterpret_s8_u8(v_k4))); + v_v4 = vreinterpret_u8_s8(vshr_n_s8(vreinterpret_s8_u8(v_v4), 3u)); + v_v3 = vreinterpret_u8_s8(vqadd_s8(vreinterpret_s8_u8(v_delta), vreinterpret_s8_u8(v_k3))); + v_v3 = vreinterpret_u8_s8(vshr_n_s8(vreinterpret_s8_u8(v_v3), 3u)); + /* q0 loses v_v4 and p0 gains v_v3 in every lane; v_a3 = sat(v_v4 + 1) >> 1 moves q1 and p1 only where v_not_hev is set. */ + v_q0 = vreinterpret_u8_s8(vqsub_s8(vreinterpret_s8_u8(v_q0), vreinterpret_s8_u8(v_v4))); + v_p0 = vreinterpret_u8_s8(vqadd_s8(vreinterpret_s8_u8(v_p0), vreinterpret_s8_u8(v_v3))); + v_a3 = vreinterpret_u8_s8(vqadd_s8(vreinterpret_s8_u8(v_v4), vreinterpret_s8_u8(v_k1))); + v_a3 = vreinterpret_u8_s8(vshr_n_s8(vreinterpret_s8_u8(v_a3), 1u)); + v_a3 = vand_u8(v_a3, v_not_hev); + v_q1 = vreinterpret_u8_s8(vqsub_s8(vreinterpret_s8_u8(v_q1), vreinterpret_s8_u8(v_a3))); + v_p1 = vreinterpret_u8_s8(vqadd_s8(vreinterpret_s8_u8(v_p1),
vreinterpret_s8_u8(v_a3))); + /* Undo the 0x80 bias on the four columns that were re-biased. */ + v_p1 = veor_u8(v_p1, v_sign_bit); + v_p0 = veor_u8(v_p0, v_sign_bit); + v_q0 = veor_u8(v_q0, v_sign_bit); + v_q1 = veor_u8(v_q1, v_sign_bit); + /* Transpose back with the same three vtrn stages; v_s0..v_s7 end up as tile rows 0..7. */ + v_s0 = vtrn1_u8(v_p3, v_p2); + v_s1 = vtrn2_u8(v_p3, v_p2); + v_s2 = vtrn1_u8(v_p1, v_p0); + v_s3 = vtrn2_u8(v_p1, v_p0); + v_s4 = vtrn1_u8(v_q0, v_q1); + v_s5 = vtrn2_u8(v_q0, v_q1); + v_s6 = vtrn1_u8(v_q2, v_q3); + v_s7 = vtrn2_u8(v_q2, v_q3); + v_r0 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s0), vreinterpret_u16_u8(v_s2))); + v_r2 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s0), vreinterpret_u16_u8(v_s2))); + v_r1 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s1), vreinterpret_u16_u8(v_s3))); + v_r3 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s1), vreinterpret_u16_u8(v_s3))); + v_r4 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s4), vreinterpret_u16_u8(v_s6))); + v_r6 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s4), vreinterpret_u16_u8(v_s6))); + v_r5 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s5), vreinterpret_u16_u8(v_s7))); + v_r7 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s5), vreinterpret_u16_u8(v_s7))); + v_s0 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r0), vreinterpret_u32_u8(v_r4))); + v_s4 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r0), vreinterpret_u32_u8(v_r4))); + v_s1 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r1), vreinterpret_u32_u8(v_r5))); + v_s5 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r1), vreinterpret_u32_u8(v_r5))); + v_s2 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r2), vreinterpret_u32_u8(v_r6))); + v_s6 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r2), vreinterpret_u32_u8(v_r6))); + v_s3 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r3), vreinterpret_u32_u8(v_r7))); + v_s7 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r3), vreinterpret_u32_u8(v_r7))); + /* Store phase: walk a_workbuf through the same offsets as the loads and write the eight rows back. */ + if ((a_q0_off - 4u) > ((uint64_t)(a_workbuf.len))) { +
return wuffs_base__make_empty_struct(); + } + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, (a_q0_off - 4u)); + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s0); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_uv_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s1); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_uv_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s2); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_uv_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s3); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_uv_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s4); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_uv_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s5); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_uv_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s6); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf,
((uint64_t)(self->private_impl.f_uv_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s7); + } + return wuffs_base__make_empty_struct(); +} +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +// ‼ WUFFS MULTI-FILE SECTION -arm_neon + +// ‼ WUFFS MULTI-FILE SECTION +arm_neon +// -------- func vp8.decoder.normal_hfilter_mb_16_arm_neon + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_mb_16_arm_neon( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + wuffs_base__slice_u8 v_wb = {0}; + uint8x8_t v_r0 = {0}; + uint8x8_t v_r1 = {0}; + uint8x8_t v_r2 = {0}; + uint8x8_t v_r3 = {0}; + uint8x8_t v_r4 = {0}; + uint8x8_t v_r5 = {0}; + uint8x8_t v_r6 = {0}; + uint8x8_t v_r7 = {0}; + uint8x8_t v_s0 = {0}; + uint8x8_t v_s1 = {0}; + uint8x8_t v_s2 = {0}; + uint8x8_t v_s3 = {0}; + uint8x8_t v_s4 = {0}; + uint8x8_t v_s5 = {0}; + uint8x8_t v_s6 = {0}; + uint8x8_t v_s7 = {0}; + uint8x8_t v_p3_lo = {0}; + uint8x8_t v_p2_lo = {0}; + uint8x8_t v_p1_lo = {0}; + uint8x8_t v_p0_lo = {0}; + uint8x8_t v_q0_lo = {0}; + uint8x8_t v_q1_lo = {0}; + uint8x8_t v_q2_lo = {0}; + uint8x8_t v_q3_lo = {0}; + uint8x16_t v_p3 = {0}; + uint8x16_t v_p2 = {0}; + uint8x16_t v_p1 = {0}; + uint8x16_t v_p0 = {0}; + uint8x16_t v_q0 = {0}; + uint8x16_t v_q1 = {0}; + uint8x16_t v_q2 = {0}; + uint8x16_t v_q3 = {0}; + uint8x16_t v_zero = {0}; + uint8x16_t v_sign_bit = {0}; + uint8x16_t v_kFE = {0}; + uint8x16_t v_m_thresh = {0}; + uint8x16_t v_m_ithresh = {0}; + uint8x16_t v_m_hthresh = {0}; + uint8x16_t v_k3 = {0}; + uint8x16_t v_k4 = {0}; + uint8x16_t v_mask = {0}; + uint8x16_t v_not_hev = {0}; + uint8x16_t v_delta = {0}; + uint8x16_t v_v3 = {0}; + uint8x16_t v_v4 = {0}; + uint8x16_t v_a1 = {0}; + uint8x16_t v_a2 = {0}; + uint8x16_t v_a3 = {0}; + uint8x16_t v_t1 = {0}; +
uint8x16_t v_t2 = {0}; + uint8x16_t v_t3 = {0}; + uint8x16_t v_p0_adj = {0}; + uint8x16_t v_q0_adj = {0}; + uint8x8_t v_d_lo = {0}; + uint8x8_t v_d_hi = {0}; + uint16x8_t v_lo = {0}; + uint16x8_t v_hi = {0}; + uint16x8_t v_k63_16 = {0}; + uint16x8_t v_tmp_lo = {0}; + uint16x8_t v_tmp_hi = {0}; + uint8x8_t v_narrow_lo = {0}; + uint8x8_t v_narrow_hi = {0}; + + if (a_q0_off < 4u) { + return wuffs_base__make_empty_struct(); + } + v_wb = a_workbuf; + if ((a_q0_off - 4u) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, (a_q0_off - 4u)); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r0 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r1 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r2 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r3 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return 
wuffs_base__make_empty_struct(); + } + v_r4 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r5 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r6 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r7 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + v_s0 = vtrn1_u8(v_r0, v_r1); + v_s1 = vtrn2_u8(v_r0, v_r1); + v_s2 = vtrn1_u8(v_r2, v_r3); + v_s3 = vtrn2_u8(v_r2, v_r3); + v_s4 = vtrn1_u8(v_r4, v_r5); + v_s5 = vtrn2_u8(v_r4, v_r5); + v_s6 = vtrn1_u8(v_r6, v_r7); + v_s7 = vtrn2_u8(v_r6, v_r7); + v_r0 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s0), vreinterpret_u16_u8(v_s2))); + v_r2 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s0), vreinterpret_u16_u8(v_s2))); + v_r1 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s1), vreinterpret_u16_u8(v_s3))); + v_r3 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s1), vreinterpret_u16_u8(v_s3))); + v_r4 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s4), vreinterpret_u16_u8(v_s6))); + v_r6 
= vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s4), vreinterpret_u16_u8(v_s6))); + v_r5 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s5), vreinterpret_u16_u8(v_s7))); + v_r7 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s5), vreinterpret_u16_u8(v_s7))); + v_p3_lo = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r0), vreinterpret_u32_u8(v_r4))); + v_q0_lo = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r0), vreinterpret_u32_u8(v_r4))); + v_p2_lo = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r1), vreinterpret_u32_u8(v_r5))); + v_q1_lo = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r1), vreinterpret_u32_u8(v_r5))); + v_p1_lo = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r2), vreinterpret_u32_u8(v_r6))); + v_q2_lo = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r2), vreinterpret_u32_u8(v_r6))); + v_p0_lo = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r3), vreinterpret_u32_u8(v_r7))); + v_q3_lo = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r3), vreinterpret_u32_u8(v_r7))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r0 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r1 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r2 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = 
wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r3 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r4 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r5 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r6 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r7 = vld1_u8(v_wb.ptr); + v_s0 = vtrn1_u8(v_r0, v_r1); + v_s1 = vtrn2_u8(v_r0, v_r1); + v_s2 = vtrn1_u8(v_r2, v_r3); + v_s3 = vtrn2_u8(v_r2, v_r3); + v_s4 = vtrn1_u8(v_r4, v_r5); + v_s5 = vtrn2_u8(v_r4, v_r5); + v_s6 = vtrn1_u8(v_r6, v_r7); + v_s7 = vtrn2_u8(v_r6, v_r7); + v_r0 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s0), vreinterpret_u16_u8(v_s2))); + v_r2 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s0), vreinterpret_u16_u8(v_s2))); + v_r1 = 
vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s1), vreinterpret_u16_u8(v_s3))); + v_r3 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s1), vreinterpret_u16_u8(v_s3))); + v_r4 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s4), vreinterpret_u16_u8(v_s6))); + v_r6 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s4), vreinterpret_u16_u8(v_s6))); + v_r5 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s5), vreinterpret_u16_u8(v_s7))); + v_r7 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s5), vreinterpret_u16_u8(v_s7))); + v_s0 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r0), vreinterpret_u32_u8(v_r4))); + v_p3 = vcombine_u8(v_p3_lo, v_s0); + v_s0 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r0), vreinterpret_u32_u8(v_r4))); + v_q0 = vcombine_u8(v_q0_lo, v_s0); + v_s1 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r1), vreinterpret_u32_u8(v_r5))); + v_p2 = vcombine_u8(v_p2_lo, v_s1); + v_s1 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r1), vreinterpret_u32_u8(v_r5))); + v_q1 = vcombine_u8(v_q1_lo, v_s1); + v_s2 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r2), vreinterpret_u32_u8(v_r6))); + v_p1 = vcombine_u8(v_p1_lo, v_s2); + v_s2 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r2), vreinterpret_u32_u8(v_r6))); + v_q2 = vcombine_u8(v_q2_lo, v_s2); + v_s3 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r3), vreinterpret_u32_u8(v_r7))); + v_p0 = vcombine_u8(v_p0_lo, v_s3); + v_s3 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r3), vreinterpret_u32_u8(v_r7))); + v_q3 = vcombine_u8(v_q3_lo, v_s3); + v_zero = vdupq_n_u8(0u); + v_sign_bit = vdupq_n_u8(128u); + v_kFE = vdupq_n_u8(254u); + v_m_thresh = vdupq_n_u8(((uint8_t)(a_level))); + v_m_ithresh = vdupq_n_u8(((uint8_t)(a_ilevel))); + v_m_hthresh = vdupq_n_u8(((uint8_t)(a_hlevel))); + v_k3 = vdupq_n_u8(3u); + v_k4 = vdupq_n_u8(4u); + v_k63_16 = vdupq_n_u16(63u); + v_t1 = vabdq_u8(v_p1, v_q1); + v_t2 = 
vshrq_n_u8(vandq_u8(v_t1, v_kFE), 1u); + v_t3 = vabdq_u8(v_p0, v_q0); + v_t3 = vqaddq_u8(v_t3, v_t3); + v_t3 = vqaddq_u8(v_t3, v_t2); + v_mask = vceqq_u8(vqsubq_u8(v_t3, v_m_thresh), v_zero); + v_t1 = vabdq_u8(v_p3, v_p2); + v_mask = vandq_u8(v_mask, vceqq_u8(vqsubq_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabdq_u8(v_p2, v_p1); + v_mask = vandq_u8(v_mask, vceqq_u8(vqsubq_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabdq_u8(v_p1, v_p0); + v_mask = vandq_u8(v_mask, vceqq_u8(vqsubq_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabdq_u8(v_q0, v_q1); + v_mask = vandq_u8(v_mask, vceqq_u8(vqsubq_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabdq_u8(v_q1, v_q2); + v_mask = vandq_u8(v_mask, vceqq_u8(vqsubq_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabdq_u8(v_q2, v_q3); + v_mask = vandq_u8(v_mask, vceqq_u8(vqsubq_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabdq_u8(v_p1, v_p0); + v_t2 = vabdq_u8(v_q1, v_q0); + v_t3 = vorrq_u8(vqsubq_u8(v_t1, v_m_hthresh), vqsubq_u8(v_t2, v_m_hthresh)); + v_not_hev = vceqq_u8(v_t3, v_zero); + v_p2 = veorq_u8(v_p2, v_sign_bit); + v_p1 = veorq_u8(v_p1, v_sign_bit); + v_p0 = veorq_u8(v_p0, v_sign_bit); + v_q0 = veorq_u8(v_q0, v_sign_bit); + v_q1 = veorq_u8(v_q1, v_sign_bit); + v_q2 = veorq_u8(v_q2, v_sign_bit); + v_t1 = vreinterpretq_u8_s8(vqsubq_s8(vreinterpretq_s8_u8(v_p1), vreinterpretq_s8_u8(v_q1))); + v_t2 = vreinterpretq_u8_s8(vqsubq_s8(vreinterpretq_s8_u8(v_q0), vreinterpretq_s8_u8(v_p0))); + v_t1 = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_t1), vreinterpretq_s8_u8(v_t2))); + v_t1 = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_t1), vreinterpretq_s8_u8(v_t2))); + v_delta = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_t1), vreinterpretq_s8_u8(v_t2))); + v_delta = vandq_u8(v_delta, v_mask); + v_v4 = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_delta), vreinterpretq_s8_u8(v_k4))); + v_v4 = vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u8(v_v4), 3u)); + v_v3 = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_delta), 
vreinterpretq_s8_u8(v_k3))); + v_v3 = vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u8(v_v3), 3u)); + v_d_lo = vget_low_u8(v_delta); + v_d_hi = vget_high_u8(v_delta); + v_lo = vreinterpretq_u16_s16(vmovl_s8(vreinterpret_s8_u8(v_d_lo))); + v_hi = vreinterpretq_u16_s16(vmovl_s8(vreinterpret_s8_u8(v_d_hi))); + v_tmp_lo = vmulq_n_u16(v_lo, 27u); + v_tmp_lo = vaddq_u16(v_tmp_lo, v_k63_16); + v_tmp_lo = vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(v_tmp_lo), 7u)); + v_tmp_hi = vmulq_n_u16(v_hi, 27u); + v_tmp_hi = vaddq_u16(v_tmp_hi, v_k63_16); + v_tmp_hi = vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(v_tmp_hi), 7u)); + v_narrow_lo = vreinterpret_u8_s8(vqmovn_s16(vreinterpretq_s16_u16(v_tmp_lo))); + v_narrow_hi = vreinterpret_u8_s8(vqmovn_s16(vreinterpretq_s16_u16(v_tmp_hi))); + v_a1 = vcombine_u8(v_narrow_lo, v_narrow_hi); + v_tmp_lo = vmulq_n_u16(v_lo, 18u); + v_tmp_lo = vaddq_u16(v_tmp_lo, v_k63_16); + v_tmp_lo = vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(v_tmp_lo), 7u)); + v_tmp_hi = vmulq_n_u16(v_hi, 18u); + v_tmp_hi = vaddq_u16(v_tmp_hi, v_k63_16); + v_tmp_hi = vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(v_tmp_hi), 7u)); + v_narrow_lo = vreinterpret_u8_s8(vqmovn_s16(vreinterpretq_s16_u16(v_tmp_lo))); + v_narrow_hi = vreinterpret_u8_s8(vqmovn_s16(vreinterpretq_s16_u16(v_tmp_hi))); + v_a2 = vcombine_u8(v_narrow_lo, v_narrow_hi); + v_tmp_lo = vmulq_n_u16(v_lo, 9u); + v_tmp_lo = vaddq_u16(v_tmp_lo, v_k63_16); + v_tmp_lo = vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(v_tmp_lo), 7u)); + v_tmp_hi = vmulq_n_u16(v_hi, 9u); + v_tmp_hi = vaddq_u16(v_tmp_hi, v_k63_16); + v_tmp_hi = vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(v_tmp_hi), 7u)); + v_narrow_lo = vreinterpret_u8_s8(vqmovn_s16(vreinterpretq_s16_u16(v_tmp_lo))); + v_narrow_hi = vreinterpret_u8_s8(vqmovn_s16(vreinterpretq_s16_u16(v_tmp_hi))); + v_a3 = vcombine_u8(v_narrow_lo, v_narrow_hi); + v_p0_adj = vbicq_u8(v_v3, v_not_hev); + v_p0_adj = 
vorrq_u8(v_p0_adj, vandq_u8(v_a1, v_not_hev)); + v_p0 = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_p0), vreinterpretq_s8_u8(v_p0_adj))); + v_q0_adj = vbicq_u8(v_v4, v_not_hev); + v_q0_adj = vorrq_u8(v_q0_adj, vandq_u8(v_a1, v_not_hev)); + v_q0 = vreinterpretq_u8_s8(vqsubq_s8(vreinterpretq_s8_u8(v_q0), vreinterpretq_s8_u8(v_q0_adj))); + v_p1 = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_p1), vreinterpretq_s8_u8(vandq_u8(v_a2, v_not_hev)))); + v_q1 = vreinterpretq_u8_s8(vqsubq_s8(vreinterpretq_s8_u8(v_q1), vreinterpretq_s8_u8(vandq_u8(v_a2, v_not_hev)))); + v_p2 = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_p2), vreinterpretq_s8_u8(vandq_u8(v_a3, v_not_hev)))); + v_q2 = vreinterpretq_u8_s8(vqsubq_s8(vreinterpretq_s8_u8(v_q2), vreinterpretq_s8_u8(vandq_u8(v_a3, v_not_hev)))); + v_p2 = veorq_u8(v_p2, v_sign_bit); + v_p1 = veorq_u8(v_p1, v_sign_bit); + v_p0 = veorq_u8(v_p0, v_sign_bit); + v_q0 = veorq_u8(v_q0, v_sign_bit); + v_q1 = veorq_u8(v_q1, v_sign_bit); + v_q2 = veorq_u8(v_q2, v_sign_bit); + v_p3_lo = vget_low_u8(v_p3); + v_p2_lo = vget_low_u8(v_p2); + v_p1_lo = vget_low_u8(v_p1); + v_p0_lo = vget_low_u8(v_p0); + v_q0_lo = vget_low_u8(v_q0); + v_q1_lo = vget_low_u8(v_q1); + v_q2_lo = vget_low_u8(v_q2); + v_q3_lo = vget_low_u8(v_q3); + v_s0 = vtrn1_u8(v_p3_lo, v_p2_lo); + v_s1 = vtrn2_u8(v_p3_lo, v_p2_lo); + v_s2 = vtrn1_u8(v_p1_lo, v_p0_lo); + v_s3 = vtrn2_u8(v_p1_lo, v_p0_lo); + v_s4 = vtrn1_u8(v_q0_lo, v_q1_lo); + v_s5 = vtrn2_u8(v_q0_lo, v_q1_lo); + v_s6 = vtrn1_u8(v_q2_lo, v_q3_lo); + v_s7 = vtrn2_u8(v_q2_lo, v_q3_lo); + v_r0 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s0), vreinterpret_u16_u8(v_s2))); + v_r2 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s0), vreinterpret_u16_u8(v_s2))); + v_r1 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s1), vreinterpret_u16_u8(v_s3))); + v_r3 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s1), vreinterpret_u16_u8(v_s3))); + v_r4 = 
vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s4), vreinterpret_u16_u8(v_s6))); + v_r6 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s4), vreinterpret_u16_u8(v_s6))); + v_r5 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s5), vreinterpret_u16_u8(v_s7))); + v_r7 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s5), vreinterpret_u16_u8(v_s7))); + v_s0 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r0), vreinterpret_u32_u8(v_r4))); + v_s4 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r0), vreinterpret_u32_u8(v_r4))); + v_s1 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r1), vreinterpret_u32_u8(v_r5))); + v_s5 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r1), vreinterpret_u32_u8(v_r5))); + v_s2 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r2), vreinterpret_u32_u8(v_r6))); + v_s6 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r2), vreinterpret_u32_u8(v_r6))); + v_s3 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r3), vreinterpret_u32_u8(v_r7))); + v_s7 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r3), vreinterpret_u32_u8(v_r7))); + if ((a_q0_off - 4u) > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, (a_q0_off - 4u)); + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s0); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s1); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s2); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= 
((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s3); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s4); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s5); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s6); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s7); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + v_r0 = vget_high_u8(v_p3); + v_r1 = vget_high_u8(v_p2); + v_r2 = vget_high_u8(v_p1); + v_r3 = vget_high_u8(v_p0); + v_r4 = vget_high_u8(v_q0); + v_r5 = vget_high_u8(v_q1); + v_r6 = vget_high_u8(v_q2); + v_r7 = vget_high_u8(v_q3); + v_s0 = vtrn1_u8(v_r0, v_r1); + v_s1 = vtrn2_u8(v_r0, v_r1); + v_s2 = vtrn1_u8(v_r2, v_r3); + v_s3 = vtrn2_u8(v_r2, v_r3); + v_s4 = vtrn1_u8(v_r4, v_r5); + v_s5 = vtrn2_u8(v_r4, v_r5); + v_s6 = vtrn1_u8(v_r6, v_r7); + v_s7 = vtrn2_u8(v_r6, v_r7); + v_r0 = 
vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s0), vreinterpret_u16_u8(v_s2))); + v_r2 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s0), vreinterpret_u16_u8(v_s2))); + v_r1 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s1), vreinterpret_u16_u8(v_s3))); + v_r3 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s1), vreinterpret_u16_u8(v_s3))); + v_r4 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s4), vreinterpret_u16_u8(v_s6))); + v_r6 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s4), vreinterpret_u16_u8(v_s6))); + v_r5 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s5), vreinterpret_u16_u8(v_s7))); + v_r7 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s5), vreinterpret_u16_u8(v_s7))); + v_s0 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r0), vreinterpret_u32_u8(v_r4))); + v_s4 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r0), vreinterpret_u32_u8(v_r4))); + v_s1 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r1), vreinterpret_u32_u8(v_r5))); + v_s5 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r1), vreinterpret_u32_u8(v_r5))); + v_s2 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r2), vreinterpret_u32_u8(v_r6))); + v_s6 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r2), vreinterpret_u32_u8(v_r6))); + v_s3 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r3), vreinterpret_u32_u8(v_r7))); + v_s7 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r3), vreinterpret_u32_u8(v_r7))); + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s0); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s1); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = 
wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s2); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s3); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s4); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s5); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s6); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s7); + } + return wuffs_base__make_empty_struct(); +} +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +// ‼ WUFFS MULTI-FILE SECTION -arm_neon + +// ‼ WUFFS MULTI-FILE SECTION +arm_neon +// -------- func vp8.decoder.normal_hfilter_inner_16_arm_neon + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_inner_16_arm_neon( + wuffs_vp8__decoder* self, + 
wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + wuffs_base__slice_u8 v_wb = {0}; + uint8x8_t v_r0 = {0}; + uint8x8_t v_r1 = {0}; + uint8x8_t v_r2 = {0}; + uint8x8_t v_r3 = {0}; + uint8x8_t v_r4 = {0}; + uint8x8_t v_r5 = {0}; + uint8x8_t v_r6 = {0}; + uint8x8_t v_r7 = {0}; + uint8x8_t v_s0 = {0}; + uint8x8_t v_s1 = {0}; + uint8x8_t v_s2 = {0}; + uint8x8_t v_s3 = {0}; + uint8x8_t v_s4 = {0}; + uint8x8_t v_s5 = {0}; + uint8x8_t v_s6 = {0}; + uint8x8_t v_s7 = {0}; + uint8x8_t v_p3_lo = {0}; + uint8x8_t v_p2_lo = {0}; + uint8x8_t v_p1_lo = {0}; + uint8x8_t v_p0_lo = {0}; + uint8x8_t v_q0_lo = {0}; + uint8x8_t v_q1_lo = {0}; + uint8x8_t v_q2_lo = {0}; + uint8x8_t v_q3_lo = {0}; + uint8x16_t v_p3 = {0}; + uint8x16_t v_p2 = {0}; + uint8x16_t v_p1 = {0}; + uint8x16_t v_p0 = {0}; + uint8x16_t v_q0 = {0}; + uint8x16_t v_q1 = {0}; + uint8x16_t v_q2 = {0}; + uint8x16_t v_q3 = {0}; + uint8x16_t v_zero = {0}; + uint8x16_t v_sign_bit = {0}; + uint8x16_t v_kFE = {0}; + uint8x16_t v_m_thresh = {0}; + uint8x16_t v_m_ithresh = {0}; + uint8x16_t v_m_hthresh = {0}; + uint8x16_t v_k1 = {0}; + uint8x16_t v_k3 = {0}; + uint8x16_t v_k4 = {0}; + uint8x16_t v_mask = {0}; + uint8x16_t v_not_hev = {0}; + uint8x16_t v_delta = {0}; + uint8x16_t v_v3 = {0}; + uint8x16_t v_v4 = {0}; + uint8x16_t v_a3 = {0}; + uint8x16_t v_t1 = {0}; + uint8x16_t v_t2 = {0}; + uint8x16_t v_t3 = {0}; + + if (a_q0_off < 4u) { + return wuffs_base__make_empty_struct(); + } + v_wb = a_workbuf; + if ((a_q0_off - 4u) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, (a_q0_off - 4u)); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r0 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, 
((uint64_t)(self->private_impl.f_y_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r1 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r2 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r3 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r4 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r5 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r6 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, 
((uint64_t)(self->private_impl.f_y_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r7 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + v_s0 = vtrn1_u8(v_r0, v_r1); + v_s1 = vtrn2_u8(v_r0, v_r1); + v_s2 = vtrn1_u8(v_r2, v_r3); + v_s3 = vtrn2_u8(v_r2, v_r3); + v_s4 = vtrn1_u8(v_r4, v_r5); + v_s5 = vtrn2_u8(v_r4, v_r5); + v_s6 = vtrn1_u8(v_r6, v_r7); + v_s7 = vtrn2_u8(v_r6, v_r7); + v_r0 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s0), vreinterpret_u16_u8(v_s2))); + v_r2 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s0), vreinterpret_u16_u8(v_s2))); + v_r1 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s1), vreinterpret_u16_u8(v_s3))); + v_r3 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s1), vreinterpret_u16_u8(v_s3))); + v_r4 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s4), vreinterpret_u16_u8(v_s6))); + v_r6 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s4), vreinterpret_u16_u8(v_s6))); + v_r5 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s5), vreinterpret_u16_u8(v_s7))); + v_r7 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s5), vreinterpret_u16_u8(v_s7))); + v_p3_lo = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r0), vreinterpret_u32_u8(v_r4))); + v_q0_lo = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r0), vreinterpret_u32_u8(v_r4))); + v_p2_lo = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r1), vreinterpret_u32_u8(v_r5))); + v_q1_lo = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r1), vreinterpret_u32_u8(v_r5))); + v_p1_lo = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r2), vreinterpret_u32_u8(v_r6))); + v_q2_lo = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r2), vreinterpret_u32_u8(v_r6))); + v_p0_lo = 
vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r3), vreinterpret_u32_u8(v_r7))); + v_q3_lo = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r3), vreinterpret_u32_u8(v_r7))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r0 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r1 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r2 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r3 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r4 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r5 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > 
((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r6 = vld1_u8(v_wb.ptr); + if (((uint64_t)(self->private_impl.f_y_stride)) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, ((uint64_t)(self->private_impl.f_y_stride))); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_r7 = vld1_u8(v_wb.ptr); + v_s0 = vtrn1_u8(v_r0, v_r1); + v_s1 = vtrn2_u8(v_r0, v_r1); + v_s2 = vtrn1_u8(v_r2, v_r3); + v_s3 = vtrn2_u8(v_r2, v_r3); + v_s4 = vtrn1_u8(v_r4, v_r5); + v_s5 = vtrn2_u8(v_r4, v_r5); + v_s6 = vtrn1_u8(v_r6, v_r7); + v_s7 = vtrn2_u8(v_r6, v_r7); + v_r0 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s0), vreinterpret_u16_u8(v_s2))); + v_r2 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s0), vreinterpret_u16_u8(v_s2))); + v_r1 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s1), vreinterpret_u16_u8(v_s3))); + v_r3 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s1), vreinterpret_u16_u8(v_s3))); + v_r4 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s4), vreinterpret_u16_u8(v_s6))); + v_r6 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s4), vreinterpret_u16_u8(v_s6))); + v_r5 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s5), vreinterpret_u16_u8(v_s7))); + v_r7 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s5), vreinterpret_u16_u8(v_s7))); + v_s0 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r0), vreinterpret_u32_u8(v_r4))); + v_p3 = vcombine_u8(v_p3_lo, v_s0); + v_s0 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r0), vreinterpret_u32_u8(v_r4))); + v_q0 = vcombine_u8(v_q0_lo, v_s0); + v_s1 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r1), vreinterpret_u32_u8(v_r5))); + v_p2 = 
vcombine_u8(v_p2_lo, v_s1); + v_s1 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r1), vreinterpret_u32_u8(v_r5))); + v_q1 = vcombine_u8(v_q1_lo, v_s1); + v_s2 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r2), vreinterpret_u32_u8(v_r6))); + v_p1 = vcombine_u8(v_p1_lo, v_s2); + v_s2 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r2), vreinterpret_u32_u8(v_r6))); + v_q2 = vcombine_u8(v_q2_lo, v_s2); + v_s3 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r3), vreinterpret_u32_u8(v_r7))); + v_p0 = vcombine_u8(v_p0_lo, v_s3); + v_s3 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r3), vreinterpret_u32_u8(v_r7))); + v_q3 = vcombine_u8(v_q3_lo, v_s3); + v_zero = vdupq_n_u8(0u); + v_sign_bit = vdupq_n_u8(128u); + v_kFE = vdupq_n_u8(254u); + v_m_thresh = vdupq_n_u8(((uint8_t)(a_level))); + v_m_ithresh = vdupq_n_u8(((uint8_t)(a_ilevel))); + v_m_hthresh = vdupq_n_u8(((uint8_t)(a_hlevel))); + v_k1 = vdupq_n_u8(1u); + v_k3 = vdupq_n_u8(3u); + v_k4 = vdupq_n_u8(4u); + v_t1 = vabdq_u8(v_p1, v_q1); + v_t2 = vshrq_n_u8(vandq_u8(v_t1, v_kFE), 1u); + v_t3 = vabdq_u8(v_p0, v_q0); + v_t3 = vqaddq_u8(v_t3, v_t3); + v_t3 = vqaddq_u8(v_t3, v_t2); + v_mask = vceqq_u8(vqsubq_u8(v_t3, v_m_thresh), v_zero); + v_t1 = vabdq_u8(v_p3, v_p2); + v_mask = vandq_u8(v_mask, vceqq_u8(vqsubq_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabdq_u8(v_p2, v_p1); + v_mask = vandq_u8(v_mask, vceqq_u8(vqsubq_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabdq_u8(v_p1, v_p0); + v_mask = vandq_u8(v_mask, vceqq_u8(vqsubq_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabdq_u8(v_q0, v_q1); + v_mask = vandq_u8(v_mask, vceqq_u8(vqsubq_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabdq_u8(v_q1, v_q2); + v_mask = vandq_u8(v_mask, vceqq_u8(vqsubq_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabdq_u8(v_q2, v_q3); + v_mask = vandq_u8(v_mask, vceqq_u8(vqsubq_u8(v_t1, v_m_ithresh), v_zero)); + v_t1 = vabdq_u8(v_p1, v_p0); + v_t2 = vabdq_u8(v_q1, v_q0); + v_t3 = vorrq_u8(vqsubq_u8(v_t1, v_m_hthresh), 
vqsubq_u8(v_t2, v_m_hthresh)); + v_not_hev = vceqq_u8(v_t3, v_zero); + v_p1 = veorq_u8(v_p1, v_sign_bit); + v_p0 = veorq_u8(v_p0, v_sign_bit); + v_q0 = veorq_u8(v_q0, v_sign_bit); + v_q1 = veorq_u8(v_q1, v_sign_bit); + v_t1 = vreinterpretq_u8_s8(vqsubq_s8(vreinterpretq_s8_u8(v_p1), vreinterpretq_s8_u8(v_q1))); + v_t1 = vbicq_u8(v_t1, v_not_hev); + v_t2 = vreinterpretq_u8_s8(vqsubq_s8(vreinterpretq_s8_u8(v_q0), vreinterpretq_s8_u8(v_p0))); + v_t1 = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_t1), vreinterpretq_s8_u8(v_t2))); + v_t1 = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_t1), vreinterpretq_s8_u8(v_t2))); + v_delta = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_t1), vreinterpretq_s8_u8(v_t2))); + v_delta = vandq_u8(v_delta, v_mask); + v_v4 = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_delta), vreinterpretq_s8_u8(v_k4))); + v_v4 = vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u8(v_v4), 3u)); + v_v3 = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_delta), vreinterpretq_s8_u8(v_k3))); + v_v3 = vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u8(v_v3), 3u)); + v_q0 = vreinterpretq_u8_s8(vqsubq_s8(vreinterpretq_s8_u8(v_q0), vreinterpretq_s8_u8(v_v4))); + v_p0 = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_p0), vreinterpretq_s8_u8(v_v3))); + v_a3 = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_v4), vreinterpretq_s8_u8(v_k1))); + v_a3 = vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_u8(v_a3), 1u)); + v_a3 = vandq_u8(v_a3, v_not_hev); + v_q1 = vreinterpretq_u8_s8(vqsubq_s8(vreinterpretq_s8_u8(v_q1), vreinterpretq_s8_u8(v_a3))); + v_p1 = vreinterpretq_u8_s8(vqaddq_s8(vreinterpretq_s8_u8(v_p1), vreinterpretq_s8_u8(v_a3))); + v_p1 = veorq_u8(v_p1, v_sign_bit); + v_p0 = veorq_u8(v_p0, v_sign_bit); + v_q0 = veorq_u8(v_q0, v_sign_bit); + v_q1 = veorq_u8(v_q1, v_sign_bit); + v_p3_lo = vget_low_u8(v_p3); + v_p2_lo = vget_low_u8(v_p2); + v_p1_lo = vget_low_u8(v_p1); + v_p0_lo = vget_low_u8(v_p0); + v_q0_lo = vget_low_u8(v_q0); 
+ v_q1_lo = vget_low_u8(v_q1); + v_q2_lo = vget_low_u8(v_q2); + v_q3_lo = vget_low_u8(v_q3); + v_s0 = vtrn1_u8(v_p3_lo, v_p2_lo); + v_s1 = vtrn2_u8(v_p3_lo, v_p2_lo); + v_s2 = vtrn1_u8(v_p1_lo, v_p0_lo); + v_s3 = vtrn2_u8(v_p1_lo, v_p0_lo); + v_s4 = vtrn1_u8(v_q0_lo, v_q1_lo); + v_s5 = vtrn2_u8(v_q0_lo, v_q1_lo); + v_s6 = vtrn1_u8(v_q2_lo, v_q3_lo); + v_s7 = vtrn2_u8(v_q2_lo, v_q3_lo); + v_r0 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s0), vreinterpret_u16_u8(v_s2))); + v_r2 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s0), vreinterpret_u16_u8(v_s2))); + v_r1 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s1), vreinterpret_u16_u8(v_s3))); + v_r3 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s1), vreinterpret_u16_u8(v_s3))); + v_r4 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s4), vreinterpret_u16_u8(v_s6))); + v_r6 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s4), vreinterpret_u16_u8(v_s6))); + v_r5 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s5), vreinterpret_u16_u8(v_s7))); + v_r7 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s5), vreinterpret_u16_u8(v_s7))); + v_s0 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r0), vreinterpret_u32_u8(v_r4))); + v_s4 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r0), vreinterpret_u32_u8(v_r4))); + v_s1 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r1), vreinterpret_u32_u8(v_r5))); + v_s5 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r1), vreinterpret_u32_u8(v_r5))); + v_s2 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r2), vreinterpret_u32_u8(v_r6))); + v_s6 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r2), vreinterpret_u32_u8(v_r6))); + v_s3 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r3), vreinterpret_u32_u8(v_r7))); + v_s7 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r3), vreinterpret_u32_u8(v_r7))); + if ((a_q0_off - 4u) > ((uint64_t)(a_workbuf.len))) { + return 
wuffs_base__make_empty_struct(); + } + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, (a_q0_off - 4u)); + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s0); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s1); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s2); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s3); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s4); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s5); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s6); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, 
((uint64_t)(self->private_impl.f_y_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s7); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + v_r0 = vget_high_u8(v_p3); + v_r1 = vget_high_u8(v_p2); + v_r2 = vget_high_u8(v_p1); + v_r3 = vget_high_u8(v_p0); + v_r4 = vget_high_u8(v_q0); + v_r5 = vget_high_u8(v_q1); + v_r6 = vget_high_u8(v_q2); + v_r7 = vget_high_u8(v_q3); + v_s0 = vtrn1_u8(v_r0, v_r1); + v_s1 = vtrn2_u8(v_r0, v_r1); + v_s2 = vtrn1_u8(v_r2, v_r3); + v_s3 = vtrn2_u8(v_r2, v_r3); + v_s4 = vtrn1_u8(v_r4, v_r5); + v_s5 = vtrn2_u8(v_r4, v_r5); + v_s6 = vtrn1_u8(v_r6, v_r7); + v_s7 = vtrn2_u8(v_r6, v_r7); + v_r0 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s0), vreinterpret_u16_u8(v_s2))); + v_r2 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s0), vreinterpret_u16_u8(v_s2))); + v_r1 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s1), vreinterpret_u16_u8(v_s3))); + v_r3 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s1), vreinterpret_u16_u8(v_s3))); + v_r4 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s4), vreinterpret_u16_u8(v_s6))); + v_r6 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s4), vreinterpret_u16_u8(v_s6))); + v_r5 = vreinterpret_u8_u16(vtrn1_u16(vreinterpret_u16_u8(v_s5), vreinterpret_u16_u8(v_s7))); + v_r7 = vreinterpret_u8_u16(vtrn2_u16(vreinterpret_u16_u8(v_s5), vreinterpret_u16_u8(v_s7))); + v_s0 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r0), vreinterpret_u32_u8(v_r4))); + v_s4 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r0), vreinterpret_u32_u8(v_r4))); + v_s1 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r1), vreinterpret_u32_u8(v_r5))); + v_s5 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r1), vreinterpret_u32_u8(v_r5))); + v_s2 = 
vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r2), vreinterpret_u32_u8(v_r6))); + v_s6 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r2), vreinterpret_u32_u8(v_r6))); + v_s3 = vreinterpret_u8_u32(vtrn1_u32(vreinterpret_u32_u8(v_r3), vreinterpret_u32_u8(v_r7))); + v_s7 = vreinterpret_u8_u32(vtrn2_u32(vreinterpret_u32_u8(v_r3), vreinterpret_u32_u8(v_r7))); + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s0); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s1); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s2); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s3); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s4); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s5); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + 
if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s6); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_s7); + } + return wuffs_base__make_empty_struct(); +} +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +// ‼ WUFFS MULTI-FILE SECTION -arm_neon + +// ‼ WUFFS MULTI-FILE SECTION +x86_avx2 +// -------- func vp8.decoder.normal_vfilter_mb_uv_x86_avx2 + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) +WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2,avx2") +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_mb_uv_x86_avx2( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_u_off, + uint64_t a_v_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + /* Filters 8 columns of the U plane and 8 columns of the V plane in one pass, with taps taken vertically: rows p3, p2, p1, p0, q0, q1, q2, q3 are one f_uv_stride apart. a_u_off and a_v_off are the offsets into a_workbuf of row q0 of each plane, so reading starts 4 strides before them. Each row's 8 U bytes sit in the low 128-bit lane and its 8 V bytes in the high 128-bit lane of one __m256i. a_level bounds 2*|p0-q0| + |p1-q1|/2, a_ilevel bounds every neighbouring-row difference, and a_hlevel is the hev threshold (presumably "high edge variance"); only the low 8 bits of each are used. Rows p2..q2 are written back. The function returns early, before anything is stored, if any row to be loaded lies outside a_workbuf. */ wuffs_base__slice_u8 v_u_wb = {0}; + wuffs_base__slice_u8 v_v_wb = {0}; + __m128i v_u_128 = {0}; + __m128i v_v_128 = {0}; + __m256i v_p3 = {0}; + __m256i v_p2 = {0}; + __m256i v_p1 = {0}; + __m256i v_p0 = {0}; + __m256i v_q0 = {0}; + __m256i v_q1 = {0}; + __m256i v_q2 = {0}; + __m256i v_q3 = {0}; + __m256i v_zero = {0}; + __m256i v_sign_bit = {0}; + __m256i v_kFE = {0}; + __m256i v_m_thresh = {0}; + __m256i v_m_ithresh = {0}; + __m256i v_m_hthresh = {0}; + __m256i v_k3 = {0}; + __m256i v_k4 = {0}; + __m256i v_k63 = {0}; + __m256i v_k27 = {0}; + __m256i v_k18 = {0}; + __m256i v_k9 = {0}; + __m256i v_mask = {0}; + __m256i v_not_hev = {0}; + __m256i v_delta = {0}; + __m256i v_v3 = {0}; + __m256i v_v4 = {0}; + __m256i v_a1 = {0}; + __m256i v_a2 = {0}; + __m256i v_a3 = {0}; + __m256i v_t1 = {0}; + __m256i v_t2 = {0}; + __m256i v_t3 = {0}; + __m256i v_lo = {0}; + __m256i v_hi = {0}; + __m256i v_d_lo = {0}; + __m256i v_d_hi = {0}; + __m256i v_p0_adj = {0}; + __m256i v_q0_adj = {0}; + + /* Reading starts 4 strides before each offset, so both offsets must be at least that large (this also keeps the subtractions below from wrapping). */ if
(a_u_off < (4u * ((uint64_t)(self->private_impl.f_uv_stride)))) { + return wuffs_base__make_empty_struct(); + } + if (a_v_off < (4u * ((uint64_t)(self->private_impl.f_uv_stride)))) { + return wuffs_base__make_empty_struct(); + } + /* Point v_u_wb and v_v_wb at row p3 of their planes. */ v_u_wb = a_workbuf; + if ((a_u_off - (4u * ((uint64_t)(self->private_impl.f_uv_stride)))) <= ((uint64_t)(v_u_wb.len))) { + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, (a_u_off - (4u * ((uint64_t)(self->private_impl.f_uv_stride))))); + } else { + return wuffs_base__make_empty_struct(); + } + v_v_wb = a_workbuf; + if ((a_v_off - (4u * ((uint64_t)(self->private_impl.f_uv_stride)))) <= ((uint64_t)(v_v_wb.len))) { + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, (a_v_off - (4u * ((uint64_t)(self->private_impl.f_uv_stride))))); + } else { + return wuffs_base__make_empty_struct(); + } + /* Load the eight rows p3..q3. Every step checks that 8 bytes are readable in both planes, loads 8 U bytes and 8 V bytes, merges them into one register (U in the low lane, V in the high lane), then advances both cursors by one stride. */ if ((8u > ((uint64_t)(v_u_wb.len))) || (8u > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_128 = _mm_loadl_epi64((const __m128i*)(const void*)(v_u_wb.ptr)); + v_v_128 = _mm_loadl_epi64((const __m128i*)(const void*)(v_v_wb.ptr)); + v_p3 = _mm256_inserti128_si256(_mm256_castsi128_si256(v_u_128), v_v_128, (int32_t)(1u)); + if ((((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_u_wb.len))) || (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if ((8u > ((uint64_t)(v_u_wb.len))) || (8u > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_128 = _mm_loadl_epi64((const __m128i*)(const void*)(v_u_wb.ptr)); + v_v_128 = _mm_loadl_epi64((const __m128i*)(const void*)(v_v_wb.ptr)); + v_p2 = _mm256_inserti128_si256(_mm256_castsi128_si256(v_u_128), v_v_128, (int32_t)(1u)); + if ((((uint64_t)(self->private_impl.f_uv_stride)) >
((uint64_t)(v_u_wb.len))) || (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if ((8u > ((uint64_t)(v_u_wb.len))) || (8u > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_128 = _mm_loadl_epi64((const __m128i*)(const void*)(v_u_wb.ptr)); + v_v_128 = _mm_loadl_epi64((const __m128i*)(const void*)(v_v_wb.ptr)); + v_p1 = _mm256_inserti128_si256(_mm256_castsi128_si256(v_u_128), v_v_128, (int32_t)(1u)); + if ((((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_u_wb.len))) || (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if ((8u > ((uint64_t)(v_u_wb.len))) || (8u > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_128 = _mm_loadl_epi64((const __m128i*)(const void*)(v_u_wb.ptr)); + v_v_128 = _mm_loadl_epi64((const __m128i*)(const void*)(v_v_wb.ptr)); + v_p0 = _mm256_inserti128_si256(_mm256_castsi128_si256(v_u_128), v_v_128, (int32_t)(1u)); + if ((((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_u_wb.len))) || (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if ((8u > ((uint64_t)(v_u_wb.len))) || (8u > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } +
v_u_128 = _mm_loadl_epi64((const __m128i*)(const void*)(v_u_wb.ptr)); + v_v_128 = _mm_loadl_epi64((const __m128i*)(const void*)(v_v_wb.ptr)); + v_q0 = _mm256_inserti128_si256(_mm256_castsi128_si256(v_u_128), v_v_128, (int32_t)(1u)); + if ((((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_u_wb.len))) || (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if ((8u > ((uint64_t)(v_u_wb.len))) || (8u > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_128 = _mm_loadl_epi64((const __m128i*)(const void*)(v_u_wb.ptr)); + v_v_128 = _mm_loadl_epi64((const __m128i*)(const void*)(v_v_wb.ptr)); + v_q1 = _mm256_inserti128_si256(_mm256_castsi128_si256(v_u_128), v_v_128, (int32_t)(1u)); + if ((((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_u_wb.len))) || (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if ((8u > ((uint64_t)(v_u_wb.len))) || (8u > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_128 = _mm_loadl_epi64((const __m128i*)(const void*)(v_u_wb.ptr)); + v_v_128 = _mm_loadl_epi64((const __m128i*)(const void*)(v_v_wb.ptr)); + v_q2 = _mm256_inserti128_si256(_mm256_castsi128_si256(v_u_128), v_v_128, (int32_t)(1u)); + if ((((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_u_wb.len))) || (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_wb =
wuffs_base__slice_u8__subslice_i(v_u_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if ((8u > ((uint64_t)(v_u_wb.len))) || (8u > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_128 = _mm_loadl_epi64((const __m128i*)(const void*)(v_u_wb.ptr)); + v_v_128 = _mm_loadl_epi64((const __m128i*)(const void*)(v_v_wb.ptr)); + v_q3 = _mm256_inserti128_si256(_mm256_castsi128_si256(v_u_128), v_v_128, (int32_t)(1u)); + /* Broadcast constants. The three thresholds are truncated to 8 bits; k63, k27, k18 and k9 use 16-bit lanes for the widened arithmetic further down. */ v_zero = _mm256_setzero_si256(); + v_sign_bit = _mm256_set1_epi8((int8_t)(128u)); + v_kFE = _mm256_set1_epi8((int8_t)(254u)); + v_m_thresh = _mm256_set1_epi8((int8_t)(((uint8_t)(a_level)))); + v_m_ithresh = _mm256_set1_epi8((int8_t)(((uint8_t)(a_ilevel)))); + v_m_hthresh = _mm256_set1_epi8((int8_t)(((uint8_t)(a_hlevel)))); + v_k3 = _mm256_set1_epi8((int8_t)(3u)); + v_k4 = _mm256_set1_epi8((int8_t)(4u)); + v_k63 = _mm256_set1_epi16((int16_t)(63u)); + v_k27 = _mm256_set1_epi16((int16_t)(27u)); + v_k18 = _mm256_set1_epi16((int16_t)(18u)); + v_k9 = _mm256_set1_epi16((int16_t)(9u)); + /* v_mask: bytes where the saturating sum 2*|p0-q0| + |p1-q1|/2 is <= a_level and every neighbouring-row difference is <= a_ilevel. |a-b| is built as subs_epu8(a,b) | subs_epu8(b,a); "x <= t" is tested as subs_epu8(x,t) == 0. Clearing bit 0 with v_kFE before the 16-bit shift keeps bits from leaking between adjacent bytes. */ v_t1 = _mm256_or_si256(_mm256_subs_epu8(v_p1, v_q1), _mm256_subs_epu8(v_q1, v_p1)); + v_t2 = _mm256_srli_epi16(_mm256_and_si256(v_t1, v_kFE), (int32_t)(1u)); + v_t3 = _mm256_or_si256(_mm256_subs_epu8(v_p0, v_q0), _mm256_subs_epu8(v_q0, v_p0)); + v_t3 = _mm256_adds_epu8(v_t3, v_t3); + v_t3 = _mm256_adds_epu8(v_t3, v_t2); + v_mask = _mm256_cmpeq_epi8(_mm256_subs_epu8(v_t3, v_m_thresh), v_zero); + v_t1 = _mm256_or_si256(_mm256_subs_epu8(v_p3, v_p2), _mm256_subs_epu8(v_p2, v_p3)); + v_mask = _mm256_and_si256(v_mask, _mm256_cmpeq_epi8(_mm256_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm256_or_si256(_mm256_subs_epu8(v_p2, v_p1), _mm256_subs_epu8(v_p1, v_p2)); + v_mask = _mm256_and_si256(v_mask, _mm256_cmpeq_epi8(_mm256_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm256_or_si256(_mm256_subs_epu8(v_p1, v_p0), _mm256_subs_epu8(v_p0, v_p1)); + v_mask =
_mm256_and_si256(v_mask, _mm256_cmpeq_epi8(_mm256_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm256_or_si256(_mm256_subs_epu8(v_q0, v_q1), _mm256_subs_epu8(v_q1, v_q0)); + v_mask = _mm256_and_si256(v_mask, _mm256_cmpeq_epi8(_mm256_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm256_or_si256(_mm256_subs_epu8(v_q1, v_q2), _mm256_subs_epu8(v_q2, v_q1)); + v_mask = _mm256_and_si256(v_mask, _mm256_cmpeq_epi8(_mm256_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm256_or_si256(_mm256_subs_epu8(v_q2, v_q3), _mm256_subs_epu8(v_q3, v_q2)); + v_mask = _mm256_and_si256(v_mask, _mm256_cmpeq_epi8(_mm256_subs_epu8(v_t1, v_m_ithresh), v_zero)); + /* v_not_hev: bytes where both |p1-p0| and |q1-q0| are <= a_hlevel. */ v_t1 = _mm256_or_si256(_mm256_subs_epu8(v_p1, v_p0), _mm256_subs_epu8(v_p0, v_p1)); + v_t2 = _mm256_or_si256(_mm256_subs_epu8(v_q1, v_q0), _mm256_subs_epu8(v_q0, v_q1)); + v_t3 = _mm256_or_si256(_mm256_subs_epu8(v_t1, v_m_hthresh), _mm256_subs_epu8(v_t2, v_m_hthresh)); + v_not_hev = _mm256_cmpeq_epi8(v_t3, v_zero); + /* XOR with 0x80 re-biases the unsigned pixels to signed bytes so the saturating signed adds and subtracts below can be used. p3 and q3 are never modified, so they are left alone. */ v_p2 = _mm256_xor_si256(v_p2, v_sign_bit); + v_p1 = _mm256_xor_si256(v_p1, v_sign_bit); + v_p0 = _mm256_xor_si256(v_p0, v_sign_bit); + v_q0 = _mm256_xor_si256(v_q0, v_sign_bit); + v_q1 = _mm256_xor_si256(v_q1, v_sign_bit); + v_q2 = _mm256_xor_si256(v_q2, v_sign_bit); + /* delta = (p1 - q1) + 3 * (q0 - p0), saturating at every step, then zeroed in bytes where v_mask is clear. */ v_t1 = _mm256_subs_epi8(v_p1, v_q1); + v_t2 = _mm256_subs_epi8(v_q0, v_p0); + v_t1 = _mm256_adds_epi8(v_t1, v_t2); + v_t1 = _mm256_adds_epi8(v_t1, v_t2); + v_delta = _mm256_adds_epi8(v_t1, v_t2); + v_delta = _mm256_and_si256(v_delta, v_mask); + /* v4 = (delta + 4) >> 3 and v3 = (delta + 3) >> 3 as signed bytes. Unpacking against zero puts each byte in the high half of a 16-bit lane, so an arithmetic shift by 11 (8 + 3) is a signed per-byte shift by 3. */ v_v4 = _mm256_adds_epi8(v_delta, v_k4); + v_lo = _mm256_unpacklo_epi8(v_zero, v_v4); + v_hi = _mm256_unpackhi_epi8(v_zero, v_v4); + v_lo = _mm256_srai_epi16(v_lo, (int32_t)(11u)); + v_hi = _mm256_srai_epi16(v_hi, (int32_t)(11u)); + v_v4 = _mm256_packs_epi16(v_lo, v_hi); + v_v3 = _mm256_adds_epi8(v_delta, v_k3); + v_lo = _mm256_unpacklo_epi8(v_zero, v_v3); + v_hi = _mm256_unpackhi_epi8(v_zero, v_v3); + v_lo = _mm256_srai_epi16(v_lo, (int32_t)(11u)); + v_hi = _mm256_srai_epi16(v_hi, (int32_t)(11u)); + v_v3 =
_mm256_packs_epi16(v_lo, v_hi); + /* Sign-extend delta to 16 bits (shift by 8 after unpacking), then compute a1 = (27*delta + 63) >> 7, a2 = (18*delta + 63) >> 7 and a3 = (9*delta + 63) >> 7, each packed back to signed bytes. */ v_d_lo = _mm256_srai_epi16(_mm256_unpacklo_epi8(v_zero, v_delta), (int32_t)(8u)); + v_d_hi = _mm256_srai_epi16(_mm256_unpackhi_epi8(v_zero, v_delta), (int32_t)(8u)); + v_lo = _mm256_srai_epi16(_mm256_add_epi16(_mm256_mullo_epi16(v_d_lo, v_k27), v_k63), (int32_t)(7u)); + v_hi = _mm256_srai_epi16(_mm256_add_epi16(_mm256_mullo_epi16(v_d_hi, v_k27), v_k63), (int32_t)(7u)); + v_a1 = _mm256_packs_epi16(v_lo, v_hi); + v_lo = _mm256_srai_epi16(_mm256_add_epi16(_mm256_mullo_epi16(v_d_lo, v_k18), v_k63), (int32_t)(7u)); + v_hi = _mm256_srai_epi16(_mm256_add_epi16(_mm256_mullo_epi16(v_d_hi, v_k18), v_k63), (int32_t)(7u)); + v_a2 = _mm256_packs_epi16(v_lo, v_hi); + v_lo = _mm256_srai_epi16(_mm256_add_epi16(_mm256_mullo_epi16(v_d_lo, v_k9), v_k63), (int32_t)(7u)); + v_hi = _mm256_srai_epi16(_mm256_add_epi16(_mm256_mullo_epi16(v_d_hi, v_k9), v_k63), (int32_t)(7u)); + v_a3 = _mm256_packs_epi16(v_lo, v_hi); + /* Per-byte select on v_not_hev. Where it is set, p0/q0 move by a1, p1/q1 by a2 and p2/q2 by a3. Where it is clear, p0 gains v3, q0 loses v4, and the four outer rows are unchanged. */ v_p0_adj = _mm256_or_si256(_mm256_andnot_si256(v_not_hev, v_v3), _mm256_and_si256(v_a1, v_not_hev)); + v_p0 = _mm256_adds_epi8(v_p0, v_p0_adj); + v_q0_adj = _mm256_or_si256(_mm256_andnot_si256(v_not_hev, v_v4), _mm256_and_si256(v_a1, v_not_hev)); + v_q0 = _mm256_subs_epi8(v_q0, v_q0_adj); + v_p1 = _mm256_adds_epi8(v_p1, _mm256_and_si256(v_a2, v_not_hev)); + v_q1 = _mm256_subs_epi8(v_q1, _mm256_and_si256(v_a2, v_not_hev)); + v_p2 = _mm256_adds_epi8(v_p2, _mm256_and_si256(v_a3, v_not_hev)); + v_q2 = _mm256_subs_epi8(v_q2, _mm256_and_si256(v_a3, v_not_hev)); + /* Undo the sign-bit bias to get unsigned pixel values again. */ v_p2 = _mm256_xor_si256(v_p2, v_sign_bit); + v_p1 = _mm256_xor_si256(v_p1, v_sign_bit); + v_p0 = _mm256_xor_si256(v_p0, v_sign_bit); + v_q0 = _mm256_xor_si256(v_q0, v_sign_bit); + v_q1 = _mm256_xor_si256(v_q1, v_sign_bit); + v_q2 = _mm256_xor_si256(v_q2, v_sign_bit); + /* Write rows p2..q2 back, starting 3 strides before each offset. For every row the low 128-bit lane supplies the 8 U bytes and the high lane the 8 V bytes; each store and each cursor advance is individually bounds-guarded. */ if (a_u_off < (3u * ((uint64_t)(self->private_impl.f_uv_stride)))) { + return wuffs_base__make_empty_struct(); + } + v_u_wb = a_workbuf; + if ((a_u_off - (3u * ((uint64_t)(self->private_impl.f_uv_stride)))) <=
((uint64_t)(v_u_wb.len))) { + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, (a_u_off - (3u * ((uint64_t)(self->private_impl.f_uv_stride))))); + } else { + return wuffs_base__make_empty_struct(); + } + if (a_v_off < (3u * ((uint64_t)(self->private_impl.f_uv_stride)))) { + return wuffs_base__make_empty_struct(); + } + v_v_wb = a_workbuf; + if ((a_v_off - (3u * ((uint64_t)(self->private_impl.f_uv_stride)))) <= ((uint64_t)(v_v_wb.len))) { + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, (a_v_off - (3u * ((uint64_t)(self->private_impl.f_uv_stride))))); + } else { + return wuffs_base__make_empty_struct(); + } + v_u_128 = _mm256_castsi256_si128(v_p2); + v_v_128 = _mm256_extracti128_si256(v_p2, (int32_t)(1u)); + if (8u <= ((uint64_t)(v_u_wb.len))) { + _mm_storeu_si64((void*)(v_u_wb.ptr), v_u_128); + } + if (8u <= ((uint64_t)(v_v_wb.len))) { + _mm_storeu_si64((void*)(v_v_wb.ptr), v_v_128); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(v_u_wb.len))) { + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(v_v_wb.len))) { + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + } + v_u_128 = _mm256_castsi256_si128(v_p1); + v_v_128 = _mm256_extracti128_si256(v_p1, (int32_t)(1u)); + if (8u <= ((uint64_t)(v_u_wb.len))) { + _mm_storeu_si64((void*)(v_u_wb.ptr), v_u_128); + } + if (8u <= ((uint64_t)(v_v_wb.len))) { + _mm_storeu_si64((void*)(v_v_wb.ptr), v_v_128); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(v_u_wb.len))) { + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(v_v_wb.len))) { + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + } + v_u_128 = _mm256_castsi256_si128(v_p0); + v_v_128 =
_mm256_extracti128_si256(v_p0, (int32_t)(1u)); + if (8u <= ((uint64_t)(v_u_wb.len))) { + _mm_storeu_si64((void*)(v_u_wb.ptr), v_u_128); + } + if (8u <= ((uint64_t)(v_v_wb.len))) { + _mm_storeu_si64((void*)(v_v_wb.ptr), v_v_128); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(v_u_wb.len))) { + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(v_v_wb.len))) { + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + } + v_u_128 = _mm256_castsi256_si128(v_q0); + v_v_128 = _mm256_extracti128_si256(v_q0, (int32_t)(1u)); + if (8u <= ((uint64_t)(v_u_wb.len))) { + _mm_storeu_si64((void*)(v_u_wb.ptr), v_u_128); + } + if (8u <= ((uint64_t)(v_v_wb.len))) { + _mm_storeu_si64((void*)(v_v_wb.ptr), v_v_128); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(v_u_wb.len))) { + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(v_v_wb.len))) { + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + } + v_u_128 = _mm256_castsi256_si128(v_q1); + v_v_128 = _mm256_extracti128_si256(v_q1, (int32_t)(1u)); + if (8u <= ((uint64_t)(v_u_wb.len))) { + _mm_storeu_si64((void*)(v_u_wb.ptr), v_u_128); + } + if (8u <= ((uint64_t)(v_v_wb.len))) { + _mm_storeu_si64((void*)(v_v_wb.ptr), v_v_128); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(v_u_wb.len))) { + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(v_v_wb.len))) { + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + } + v_u_128 = _mm256_castsi256_si128(v_q2); + v_v_128 =
_mm256_extracti128_si256(v_q2, (int32_t)(1u)); + if (8u <= ((uint64_t)(v_u_wb.len))) { + _mm_storeu_si64((void*)(v_u_wb.ptr), v_u_128); + } + if (8u <= ((uint64_t)(v_v_wb.len))) { + _mm_storeu_si64((void*)(v_v_wb.ptr), v_v_128); + } + return wuffs_base__make_empty_struct(); +} +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) +// ‼ WUFFS MULTI-FILE SECTION -x86_avx2 + +// ‼ WUFFS MULTI-FILE SECTION +x86_avx2 +// -------- func vp8.decoder.normal_vfilter_inner_uv_x86_avx2 + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) +WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2,avx2") +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_inner_uv_x86_avx2( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_u_off, + uint64_t a_v_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + wuffs_base__slice_u8 v_u_wb = {0}; + wuffs_base__slice_u8 v_v_wb = {0}; + __m128i v_u_128 = {0}; + __m128i v_v_128 = {0}; + __m256i v_p3 = {0}; + __m256i v_p2 = {0}; + __m256i v_p1 = {0}; + __m256i v_p0 = {0}; + __m256i v_q0 = {0}; + __m256i v_q1 = {0}; + __m256i v_q2 = {0}; + __m256i v_q3 = {0}; + __m256i v_zero = {0}; + __m256i v_sign_bit = {0}; + __m256i v_kFE = {0}; + __m256i v_m_thresh = {0}; + __m256i v_m_ithresh = {0}; + __m256i v_m_hthresh = {0}; + __m256i v_k1 = {0}; + __m256i v_k3 = {0}; + __m256i v_k4 = {0}; + __m256i v_mask = {0}; + __m256i v_not_hev = {0}; + __m256i v_delta = {0}; + __m256i v_v3 = {0}; + __m256i v_v4 = {0}; + __m256i v_a3 = {0}; + __m256i v_t1 = {0}; + __m256i v_t2 = {0}; + __m256i v_t3 = {0}; + __m256i v_lo = {0}; + __m256i v_hi = {0}; + + if (a_u_off < (4u * ((uint64_t)(self->private_impl.f_uv_stride)))) { + return wuffs_base__make_empty_struct(); + } + if (a_v_off < (4u * ((uint64_t)(self->private_impl.f_uv_stride)))) { + return wuffs_base__make_empty_struct(); + } + v_u_wb = a_workbuf; + if ((a_u_off - (4u * ((uint64_t)(self->private_impl.f_uv_stride)))) <=
((uint64_t)(v_u_wb.len))) { + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, (a_u_off - (4u * ((uint64_t)(self->private_impl.f_uv_stride))))); + } else { + return wuffs_base__make_empty_struct(); + } + v_v_wb = a_workbuf; + if ((a_v_off - (4u * ((uint64_t)(self->private_impl.f_uv_stride)))) <= ((uint64_t)(v_v_wb.len))) { + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, (a_v_off - (4u * ((uint64_t)(self->private_impl.f_uv_stride))))); + } else { + return wuffs_base__make_empty_struct(); + } + if ((8u > ((uint64_t)(v_u_wb.len))) || (8u > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_128 = _mm_loadl_epi64((const __m128i*)(const void*)(v_u_wb.ptr)); + v_v_128 = _mm_loadl_epi64((const __m128i*)(const void*)(v_v_wb.ptr)); + v_p3 = _mm256_inserti128_si256(_mm256_castsi128_si256(v_u_128), v_v_128, (int32_t)(1u)); + if ((((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_u_wb.len))) || (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if ((8u > ((uint64_t)(v_u_wb.len))) || (8u > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_128 = _mm_loadl_epi64((const __m128i*)(const void*)(v_u_wb.ptr)); + v_v_128 = _mm_loadl_epi64((const __m128i*)(const void*)(v_v_wb.ptr)); + v_p2 = _mm256_inserti128_si256(_mm256_castsi128_si256(v_u_128), v_v_128, (int32_t)(1u)); + if ((((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_u_wb.len))) || (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, 
((uint64_t)(self->private_impl.f_uv_stride))); + if ((8u > ((uint64_t)(v_u_wb.len))) || (8u > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_128 = _mm_loadl_epi64((const __m128i*)(const void*)(v_u_wb.ptr)); + v_v_128 = _mm_loadl_epi64((const __m128i*)(const void*)(v_v_wb.ptr)); + v_p1 = _mm256_inserti128_si256(_mm256_castsi128_si256(v_u_128), v_v_128, (int32_t)(1u)); + if ((((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_u_wb.len))) || (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if ((8u > ((uint64_t)(v_u_wb.len))) || (8u > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_128 = _mm_loadl_epi64((const __m128i*)(const void*)(v_u_wb.ptr)); + v_v_128 = _mm_loadl_epi64((const __m128i*)(const void*)(v_v_wb.ptr)); + v_p0 = _mm256_inserti128_si256(_mm256_castsi128_si256(v_u_128), v_v_128, (int32_t)(1u)); + if ((((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_u_wb.len))) || (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if ((8u > ((uint64_t)(v_u_wb.len))) || (8u > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_128 = _mm_loadl_epi64((const __m128i*)(const void*)(v_u_wb.ptr)); + v_v_128 = _mm_loadl_epi64((const __m128i*)(const void*)(v_v_wb.ptr)); + v_q0 = _mm256_inserti128_si256(_mm256_castsi128_si256(v_u_128), v_v_128, (int32_t)(1u)); + if ((((uint64_t)(self->private_impl.f_uv_stride)) > 
((uint64_t)(v_u_wb.len))) || (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if ((8u > ((uint64_t)(v_u_wb.len))) || (8u > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_128 = _mm_loadl_epi64((const __m128i*)(const void*)(v_u_wb.ptr)); + v_v_128 = _mm_loadl_epi64((const __m128i*)(const void*)(v_v_wb.ptr)); + v_q1 = _mm256_inserti128_si256(_mm256_castsi128_si256(v_u_128), v_v_128, (int32_t)(1u)); + if ((((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_u_wb.len))) || (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if ((8u > ((uint64_t)(v_u_wb.len))) || (8u > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_128 = _mm_loadl_epi64((const __m128i*)(const void*)(v_u_wb.ptr)); + v_v_128 = _mm_loadl_epi64((const __m128i*)(const void*)(v_v_wb.ptr)); + v_q2 = _mm256_inserti128_si256(_mm256_castsi128_si256(v_u_128), v_v_128, (int32_t)(1u)); + if ((((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_u_wb.len))) || (((uint64_t)(self->private_impl.f_uv_stride)) > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + if ((8u > ((uint64_t)(v_u_wb.len))) || (8u > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + 
v_u_128 = _mm_loadl_epi64((const __m128i*)(const void*)(v_u_wb.ptr)); + v_v_128 = _mm_loadl_epi64((const __m128i*)(const void*)(v_v_wb.ptr)); + v_q3 = _mm256_inserti128_si256(_mm256_castsi128_si256(v_u_128), v_v_128, (int32_t)(1u)); + v_zero = _mm256_setzero_si256(); + v_sign_bit = _mm256_set1_epi8((int8_t)(128u)); + v_kFE = _mm256_set1_epi8((int8_t)(254u)); + v_m_thresh = _mm256_set1_epi8((int8_t)(((uint8_t)(a_level)))); + v_m_ithresh = _mm256_set1_epi8((int8_t)(((uint8_t)(a_ilevel)))); + v_m_hthresh = _mm256_set1_epi8((int8_t)(((uint8_t)(a_hlevel)))); + v_k1 = _mm256_set1_epi8((int8_t)(1u)); + v_k3 = _mm256_set1_epi8((int8_t)(3u)); + v_k4 = _mm256_set1_epi8((int8_t)(4u)); + v_t1 = _mm256_or_si256(_mm256_subs_epu8(v_p1, v_q1), _mm256_subs_epu8(v_q1, v_p1)); + v_t2 = _mm256_srli_epi16(_mm256_and_si256(v_t1, v_kFE), (int32_t)(1u)); + v_t3 = _mm256_or_si256(_mm256_subs_epu8(v_p0, v_q0), _mm256_subs_epu8(v_q0, v_p0)); + v_t3 = _mm256_adds_epu8(v_t3, v_t3); + v_t3 = _mm256_adds_epu8(v_t3, v_t2); + v_mask = _mm256_cmpeq_epi8(_mm256_subs_epu8(v_t3, v_m_thresh), v_zero); + v_t1 = _mm256_or_si256(_mm256_subs_epu8(v_p3, v_p2), _mm256_subs_epu8(v_p2, v_p3)); + v_mask = _mm256_and_si256(v_mask, _mm256_cmpeq_epi8(_mm256_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm256_or_si256(_mm256_subs_epu8(v_p2, v_p1), _mm256_subs_epu8(v_p1, v_p2)); + v_mask = _mm256_and_si256(v_mask, _mm256_cmpeq_epi8(_mm256_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm256_or_si256(_mm256_subs_epu8(v_p1, v_p0), _mm256_subs_epu8(v_p0, v_p1)); + v_mask = _mm256_and_si256(v_mask, _mm256_cmpeq_epi8(_mm256_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm256_or_si256(_mm256_subs_epu8(v_q0, v_q1), _mm256_subs_epu8(v_q1, v_q0)); + v_mask = _mm256_and_si256(v_mask, _mm256_cmpeq_epi8(_mm256_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm256_or_si256(_mm256_subs_epu8(v_q1, v_q2), _mm256_subs_epu8(v_q2, v_q1)); + v_mask = _mm256_and_si256(v_mask, _mm256_cmpeq_epi8(_mm256_subs_epu8(v_t1, 
v_m_ithresh), v_zero)); + v_t1 = _mm256_or_si256(_mm256_subs_epu8(v_q2, v_q3), _mm256_subs_epu8(v_q3, v_q2)); + v_mask = _mm256_and_si256(v_mask, _mm256_cmpeq_epi8(_mm256_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm256_or_si256(_mm256_subs_epu8(v_p1, v_p0), _mm256_subs_epu8(v_p0, v_p1)); + v_t2 = _mm256_or_si256(_mm256_subs_epu8(v_q1, v_q0), _mm256_subs_epu8(v_q0, v_q1)); + v_t3 = _mm256_or_si256(_mm256_subs_epu8(v_t1, v_m_hthresh), _mm256_subs_epu8(v_t2, v_m_hthresh)); + v_not_hev = _mm256_cmpeq_epi8(v_t3, v_zero); + v_p1 = _mm256_xor_si256(v_p1, v_sign_bit); + v_p0 = _mm256_xor_si256(v_p0, v_sign_bit); + v_q0 = _mm256_xor_si256(v_q0, v_sign_bit); + v_q1 = _mm256_xor_si256(v_q1, v_sign_bit); + v_t1 = _mm256_subs_epi8(v_p1, v_q1); + v_t1 = _mm256_andnot_si256(v_not_hev, v_t1); + v_t2 = _mm256_subs_epi8(v_q0, v_p0); + v_t1 = _mm256_adds_epi8(v_t1, v_t2); + v_t1 = _mm256_adds_epi8(v_t1, v_t2); + v_delta = _mm256_adds_epi8(v_t1, v_t2); + v_delta = _mm256_and_si256(v_delta, v_mask); + v_v4 = _mm256_adds_epi8(v_delta, v_k4); + v_lo = _mm256_unpacklo_epi8(v_zero, v_v4); + v_hi = _mm256_unpackhi_epi8(v_zero, v_v4); + v_lo = _mm256_srai_epi16(v_lo, (int32_t)(11u)); + v_hi = _mm256_srai_epi16(v_hi, (int32_t)(11u)); + v_v4 = _mm256_packs_epi16(v_lo, v_hi); + v_v3 = _mm256_adds_epi8(v_delta, v_k3); + v_lo = _mm256_unpacklo_epi8(v_zero, v_v3); + v_hi = _mm256_unpackhi_epi8(v_zero, v_v3); + v_lo = _mm256_srai_epi16(v_lo, (int32_t)(11u)); + v_hi = _mm256_srai_epi16(v_hi, (int32_t)(11u)); + v_v3 = _mm256_packs_epi16(v_lo, v_hi); + v_q0 = _mm256_subs_epi8(v_q0, v_v4); + v_p0 = _mm256_adds_epi8(v_p0, v_v3); + v_a3 = _mm256_adds_epi8(v_v4, v_k1); + v_lo = _mm256_unpacklo_epi8(v_zero, v_a3); + v_hi = _mm256_unpackhi_epi8(v_zero, v_a3); + v_lo = _mm256_srai_epi16(v_lo, (int32_t)(9u)); + v_hi = _mm256_srai_epi16(v_hi, (int32_t)(9u)); + v_a3 = _mm256_packs_epi16(v_lo, v_hi); + v_a3 = _mm256_and_si256(v_a3, v_not_hev); + v_q1 = _mm256_subs_epi8(v_q1, v_a3); + v_p1 = 
_mm256_adds_epi8(v_p1, v_a3); + v_p1 = _mm256_xor_si256(v_p1, v_sign_bit); + v_p0 = _mm256_xor_si256(v_p0, v_sign_bit); + v_q0 = _mm256_xor_si256(v_q0, v_sign_bit); + v_q1 = _mm256_xor_si256(v_q1, v_sign_bit); + if (a_u_off < (2u * ((uint64_t)(self->private_impl.f_uv_stride)))) { + return wuffs_base__make_empty_struct(); + } + v_u_wb = a_workbuf; + if ((a_u_off - (2u * ((uint64_t)(self->private_impl.f_uv_stride)))) <= ((uint64_t)(v_u_wb.len))) { + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, (a_u_off - (2u * ((uint64_t)(self->private_impl.f_uv_stride))))); + } else { + return wuffs_base__make_empty_struct(); + } + if (a_v_off < (2u * ((uint64_t)(self->private_impl.f_uv_stride)))) { + return wuffs_base__make_empty_struct(); + } + v_v_wb = a_workbuf; + if ((a_v_off - (2u * ((uint64_t)(self->private_impl.f_uv_stride)))) <= ((uint64_t)(v_v_wb.len))) { + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, (a_v_off - (2u * ((uint64_t)(self->private_impl.f_uv_stride))))); + } else { + return wuffs_base__make_empty_struct(); + } + v_u_128 = _mm256_castsi256_si128(v_p1); + v_v_128 = _mm256_extracti128_si256(v_p1, (int32_t)(1u)); + if (8u <= ((uint64_t)(v_u_wb.len))) { + _mm_storeu_si64((void*)(v_u_wb.ptr), v_u_128); + } + if (8u <= ((uint64_t)(v_v_wb.len))) { + _mm_storeu_si64((void*)(v_v_wb.ptr), v_v_128); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(v_u_wb.len))) { + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(v_v_wb.len))) { + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + } + v_u_128 = _mm256_castsi256_si128(v_p0); + v_v_128 = _mm256_extracti128_si256(v_p0, (int32_t)(1u)); + if (8u <= ((uint64_t)(v_u_wb.len))) { + _mm_storeu_si64((void*)(v_u_wb.ptr), v_u_128); + } + if (8u <= ((uint64_t)(v_v_wb.len))) { + _mm_storeu_si64((void*)(v_v_wb.ptr), v_v_128); + } + if 
(((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(v_u_wb.len))) { + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(v_v_wb.len))) { + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + } + v_u_128 = _mm256_castsi256_si128(v_q0); + v_v_128 = _mm256_extracti128_si256(v_q0, (int32_t)(1u)); + if (8u <= ((uint64_t)(v_u_wb.len))) { + _mm_storeu_si64((void*)(v_u_wb.ptr), v_u_128); + } + if (8u <= ((uint64_t)(v_v_wb.len))) { + _mm_storeu_si64((void*)(v_v_wb.ptr), v_v_128); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(v_u_wb.len))) { + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(v_v_wb.len))) { + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, ((uint64_t)(self->private_impl.f_uv_stride))); + } + v_u_128 = _mm256_castsi256_si128(v_q1); + v_v_128 = _mm256_extracti128_si256(v_q1, (int32_t)(1u)); + if (8u <= ((uint64_t)(v_u_wb.len))) { + _mm_storeu_si64((void*)(v_u_wb.ptr), v_u_128); + } + if (8u <= ((uint64_t)(v_v_wb.len))) { + _mm_storeu_si64((void*)(v_v_wb.ptr), v_v_128); + } + return wuffs_base__make_empty_struct(); +} +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) +// ‼ WUFFS MULTI-FILE SECTION -x86_avx2 + +// ‼ WUFFS MULTI-FILE SECTION +x86_avx2 +// -------- func vp8.decoder.normal_hfilter_mb_uv_x86_avx2 + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) +WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2,avx2") +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_mb_uv_x86_avx2( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_u_off, + uint64_t a_v_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + wuffs_base__slice_u8 v_u_wb = {0}; +
// wuffs_vp8__decoder__normal_hfilter_mb_uv_x86_avx2: processes two windows of
// a_workbuf in one pass, each 8 rows by 8 bytes. One window starts at
// a_u_off - 4 and the other at a_v_off - 4 (presumably the U and V chroma
// planes, going by the names - confirm against the caller). Consecutive rows
// are self->private_impl.f_uv_stride bytes apart. The a_u_off window travels
// in the low 128-bit lane and the a_v_off window in the high lane of every
// __m256i. a_level, a_ilevel and a_hlevel are truncated to uint8_t and
// broadcast as per-byte thresholds. The 8 byte columns of a window are named
// p3 p2 p1 p0 q0 q1 q2 q3; only p2..q2 can change, p3 and q3 are written back
// as loaded. Every early return happens before the first store, so a failed
// bounds check leaves a_workbuf untouched.
wuffs_base__slice_u8 v_v_wb = {0}; + uint64_t v_stride = 0; + __m128i v_u_ra = {0}; + __m128i v_u_rb = {0}; + __m128i v_v_ra = {0}; + __m128i v_v_rb = {0}; + __m128i v_u_128 = {0}; + __m128i v_v_128 = {0}; + __m256i v_f0 = {0}; + __m256i v_f1 = {0}; + __m256i v_f2 = {0}; + __m256i v_f3 = {0}; + __m256i v_g0 = {0}; + __m256i v_g1 = {0}; + __m256i v_g2 = {0}; + __m256i v_g3 = {0}; + __m256i v_p3 = {0}; + __m256i v_p2 = {0}; + __m256i v_p1 = {0}; + __m256i v_p0 = {0}; + __m256i v_q0 = {0}; + __m256i v_q1 = {0}; + __m256i v_q2 = {0}; + __m256i v_q3 = {0}; + __m256i v_zero = {0}; + __m256i v_sign_bit = {0}; + __m256i v_kFE = {0}; + __m256i v_m_thresh = {0}; + __m256i v_m_ithresh = {0}; + __m256i v_m_hthresh = {0}; + __m256i v_k3 = {0}; + __m256i v_k4 = {0}; + __m256i v_k63 = {0}; + __m256i v_k27 = {0}; + __m256i v_k18 = {0}; + __m256i v_k9 = {0}; + __m256i v_mask = {0}; + __m256i v_not_hev = {0}; + __m256i v_delta = {0}; + __m256i v_v3 = {0}; + __m256i v_v4 = {0}; + __m256i v_a1 = {0}; + __m256i v_a2 = {0}; + __m256i v_a3 = {0}; + __m256i v_t1 = {0}; + __m256i v_t2 = {0}; + __m256i v_t3 = {0}; + __m256i v_lo = {0}; + __m256i v_hi = {0}; + __m256i v_d_lo = {0}; + __m256i v_d_hi = {0}; + __m256i v_p0_adj = {0}; + __m256i v_q0_adj = {0}; + __m256i v_ra = {0}; + + v_stride = ((uint64_t)(self->private_impl.f_uv_stride)); + if ((a_u_off < 4u) || (a_v_off < 4u)) { + return wuffs_base__make_empty_struct(); + } + v_u_wb = a_workbuf; + if ((a_u_off - 4u) > ((uint64_t)(v_u_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, (a_u_off - 4u)); + v_v_wb = a_workbuf; + if ((a_v_off - 4u) > ((uint64_t)(v_v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, (a_v_off - 4u)); + if ((8u > ((uint64_t)(v_u_wb.len))) || (8u > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_ra = _mm_loadl_epi64((const __m128i*)(const void*)(v_u_wb.ptr)); + v_v_ra
// Load phase: eight rows are read, 8 bytes per row per window, and both slices
// advance by v_stride after each row. Every 8-byte load and every advance is
// preceded by a length check that returns early when the slice is too short.
// Each pair of rows is combined (low lane from the a_u_off window, high lane
// from the a_v_off window) and byte-interleaved into v_f0 .. v_f3.
= _mm_loadl_epi64((const __m128i*)(const void*)(v_v_wb.ptr)); + if ((v_stride > ((uint64_t)(v_u_wb.len))) || (v_stride > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, v_stride); + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, v_stride); + if ((8u > ((uint64_t)(v_u_wb.len))) || (8u > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_rb = _mm_loadl_epi64((const __m128i*)(const void*)(v_u_wb.ptr)); + v_v_rb = _mm_loadl_epi64((const __m128i*)(const void*)(v_v_wb.ptr)); + if ((v_stride > ((uint64_t)(v_u_wb.len))) || (v_stride > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, v_stride); + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, v_stride); + v_ra = _mm256_inserti128_si256(_mm256_castsi128_si256(v_u_ra), v_v_ra, (int32_t)(1u)); + v_t1 = _mm256_inserti128_si256(_mm256_castsi128_si256(v_u_rb), v_v_rb, (int32_t)(1u)); + v_f0 = _mm256_unpacklo_epi8(v_ra, v_t1); + if ((8u > ((uint64_t)(v_u_wb.len))) || (8u > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_ra = _mm_loadl_epi64((const __m128i*)(const void*)(v_u_wb.ptr)); + v_v_ra = _mm_loadl_epi64((const __m128i*)(const void*)(v_v_wb.ptr)); + if ((v_stride > ((uint64_t)(v_u_wb.len))) || (v_stride > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, v_stride); + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, v_stride); + if ((8u > ((uint64_t)(v_u_wb.len))) || (8u > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_rb = _mm_loadl_epi64((const __m128i*)(const void*)(v_u_wb.ptr)); + v_v_rb = _mm_loadl_epi64((const __m128i*)(const void*)(v_v_wb.ptr)); + if ((v_stride > ((uint64_t)(v_u_wb.len))) || (v_stride > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_wb =
wuffs_base__slice_u8__subslice_i(v_u_wb, v_stride); + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, v_stride); + v_ra = _mm256_inserti128_si256(_mm256_castsi128_si256(v_u_ra), v_v_ra, (int32_t)(1u)); + v_t1 = _mm256_inserti128_si256(_mm256_castsi128_si256(v_u_rb), v_v_rb, (int32_t)(1u)); + v_f1 = _mm256_unpacklo_epi8(v_ra, v_t1); + if ((8u > ((uint64_t)(v_u_wb.len))) || (8u > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_ra = _mm_loadl_epi64((const __m128i*)(const void*)(v_u_wb.ptr)); + v_v_ra = _mm_loadl_epi64((const __m128i*)(const void*)(v_v_wb.ptr)); + if ((v_stride > ((uint64_t)(v_u_wb.len))) || (v_stride > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, v_stride); + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, v_stride); + if ((8u > ((uint64_t)(v_u_wb.len))) || (8u > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_rb = _mm_loadl_epi64((const __m128i*)(const void*)(v_u_wb.ptr)); + v_v_rb = _mm_loadl_epi64((const __m128i*)(const void*)(v_v_wb.ptr)); + if ((v_stride > ((uint64_t)(v_u_wb.len))) || (v_stride > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, v_stride); + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, v_stride); + v_ra = _mm256_inserti128_si256(_mm256_castsi128_si256(v_u_ra), v_v_ra, (int32_t)(1u)); + v_t1 = _mm256_inserti128_si256(_mm256_castsi128_si256(v_u_rb), v_v_rb, (int32_t)(1u)); + v_f2 = _mm256_unpacklo_epi8(v_ra, v_t1); + if ((8u > ((uint64_t)(v_u_wb.len))) || (8u > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_ra = _mm_loadl_epi64((const __m128i*)(const void*)(v_u_wb.ptr)); + v_v_ra = _mm_loadl_epi64((const __m128i*)(const void*)(v_v_wb.ptr)); + if (v_stride > ((uint64_t)(v_u_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_u_wb =
// Transpose: once the last row pair is in v_f3, the 16-bit and 32-bit unpacks
// regroup the bytes so that, per lane, v_f0 holds byte columns 0 and 1, v_f1
// columns 2 and 3, v_f2 columns 4 and 5, v_f3 columns 6 and 7 (8 rows each).
// v_p3/v_p1/v_q0/v_q2 alias the low 8 bytes of those registers, and the
// unpackhi_epi64 calls move the odd columns down into v_p2/v_p0/v_q1/v_q3.
// Only the low 8 bytes of each lane are meaningful from here on.
wuffs_base__slice_u8__subslice_i(v_u_wb, v_stride); + if (v_stride > ((uint64_t)(v_v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, v_stride); + if ((8u > ((uint64_t)(v_u_wb.len))) || (8u > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_rb = _mm_loadl_epi64((const __m128i*)(const void*)(v_u_wb.ptr)); + v_v_rb = _mm_loadl_epi64((const __m128i*)(const void*)(v_v_wb.ptr)); + v_ra = _mm256_inserti128_si256(_mm256_castsi128_si256(v_u_ra), v_v_ra, (int32_t)(1u)); + v_t1 = _mm256_inserti128_si256(_mm256_castsi128_si256(v_u_rb), v_v_rb, (int32_t)(1u)); + v_f3 = _mm256_unpacklo_epi8(v_ra, v_t1); + v_g0 = _mm256_unpacklo_epi16(v_f0, v_f1); + v_g1 = _mm256_unpackhi_epi16(v_f0, v_f1); + v_g2 = _mm256_unpacklo_epi16(v_f2, v_f3); + v_g3 = _mm256_unpackhi_epi16(v_f2, v_f3); + v_f0 = _mm256_unpacklo_epi32(v_g0, v_g2); + v_f1 = _mm256_unpackhi_epi32(v_g0, v_g2); + v_f2 = _mm256_unpacklo_epi32(v_g1, v_g3); + v_f3 = _mm256_unpackhi_epi32(v_g1, v_g3); + v_p3 = v_f0; + v_t1 = v_f0; + v_p2 = _mm256_unpackhi_epi64(v_t1, v_t1); + v_p1 = v_f1; + v_t1 = v_f1; + v_p0 = _mm256_unpackhi_epi64(v_t1, v_t1); + v_q0 = v_f2; + v_t1 = v_f2; + v_q1 = _mm256_unpackhi_epi64(v_t1, v_t1); + v_q2 = v_f3; + v_t1 = v_f3; + v_q3 = _mm256_unpackhi_epi64(v_t1, v_t1); + v_zero = _mm256_setzero_si256(); + v_sign_bit = _mm256_set1_epi8((int8_t)(128u)); + v_kFE = _mm256_set1_epi8((int8_t)(254u)); + v_m_thresh = _mm256_set1_epi8((int8_t)(((uint8_t)(a_level)))); + v_m_ithresh = _mm256_set1_epi8((int8_t)(((uint8_t)(a_ilevel)))); + v_m_hthresh = _mm256_set1_epi8((int8_t)(((uint8_t)(a_hlevel)))); + v_k3 = _mm256_set1_epi8((int8_t)(3u)); + v_k4 = _mm256_set1_epi8((int8_t)(4u)); + v_k63 = _mm256_set1_epi16((int16_t)(63u)); + v_k27 = _mm256_set1_epi16((int16_t)(27u)); + v_k18 = _mm256_set1_epi16((int16_t)(18u)); + v_k9 = _mm256_set1_epi16((int16_t)(9u)); + v_t1 = _mm256_or_si256(_mm256_subs_epu8(v_p1, v_q1),
// Masks (unsigned saturating arithmetic; |a-b| is built as subs(a,b)|subs(b,a)):
//   v_mask    = 0xFF where sat(2*|p0-q0|) + |p1-q1|/2 <= a_level and each of
//               |p3-p2| |p2-p1| |p1-p0| |q0-q1| |q1-q2| |q2-q3| <= a_ilevel.
//   v_not_hev = 0xFF where |p1-p0| <= a_hlevel and |q1-q0| <= a_hlevel.
// The AND with v_kFE before the 16-bit right shift stops a bit from leaking
// across byte boundaries, giving a per-byte shift by one.
_mm256_subs_epu8(v_q1, v_p1)); + v_t2 = _mm256_srli_epi16(_mm256_and_si256(v_t1, v_kFE), (int32_t)(1u)); + v_t3 = _mm256_or_si256(_mm256_subs_epu8(v_p0, v_q0), _mm256_subs_epu8(v_q0, v_p0)); + v_t3 = _mm256_adds_epu8(v_t3, v_t3); + v_t3 = _mm256_adds_epu8(v_t3, v_t2); + v_mask = _mm256_cmpeq_epi8(_mm256_subs_epu8(v_t3, v_m_thresh), v_zero); + v_t1 = _mm256_or_si256(_mm256_subs_epu8(v_p3, v_p2), _mm256_subs_epu8(v_p2, v_p3)); + v_mask = _mm256_and_si256(v_mask, _mm256_cmpeq_epi8(_mm256_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm256_or_si256(_mm256_subs_epu8(v_p2, v_p1), _mm256_subs_epu8(v_p1, v_p2)); + v_mask = _mm256_and_si256(v_mask, _mm256_cmpeq_epi8(_mm256_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm256_or_si256(_mm256_subs_epu8(v_p1, v_p0), _mm256_subs_epu8(v_p0, v_p1)); + v_mask = _mm256_and_si256(v_mask, _mm256_cmpeq_epi8(_mm256_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm256_or_si256(_mm256_subs_epu8(v_q0, v_q1), _mm256_subs_epu8(v_q1, v_q0)); + v_mask = _mm256_and_si256(v_mask, _mm256_cmpeq_epi8(_mm256_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm256_or_si256(_mm256_subs_epu8(v_q1, v_q2), _mm256_subs_epu8(v_q2, v_q1)); + v_mask = _mm256_and_si256(v_mask, _mm256_cmpeq_epi8(_mm256_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm256_or_si256(_mm256_subs_epu8(v_q2, v_q3), _mm256_subs_epu8(v_q3, v_q2)); + v_mask = _mm256_and_si256(v_mask, _mm256_cmpeq_epi8(_mm256_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm256_or_si256(_mm256_subs_epu8(v_p1, v_p0), _mm256_subs_epu8(v_p0, v_p1)); + v_t2 = _mm256_or_si256(_mm256_subs_epu8(v_q1, v_q0), _mm256_subs_epu8(v_q0, v_q1)); + v_t3 = _mm256_or_si256(_mm256_subs_epu8(v_t1, v_m_hthresh), _mm256_subs_epu8(v_t2, v_m_hthresh)); + v_not_hev = _mm256_cmpeq_epi8(v_t3, v_zero); + v_p2 = _mm256_xor_si256(v_p2, v_sign_bit); + v_p1 = _mm256_xor_si256(v_p1, v_sign_bit); + v_p0 = _mm256_xor_si256(v_p0, v_sign_bit); + v_q0 = _mm256_xor_si256(v_q0, v_sign_bit); + v_q1 = _mm256_xor_si256(v_q1,
// XOR with v_sign_bit (0x80) re-biases p2..q2 so the signed saturating
// adds/subs can be used. v_delta = sat(sat(p1 - q1) + 3 * (q0 - p0)), zeroed
// where v_mask is clear. v_v4 = sat(delta + 4) >> 3 and v_v3 = sat(delta + 3) >> 3:
// unpacking against v_zero puts each byte in the high half of a 16-bit lane,
// so the arithmetic shift by 11 is a sign-preserving shift by 3. v_d_lo/v_d_hi
// are delta sign-extended to 16 bits; v_a1, v_a2, v_a3 are
// (delta * 27 + 63) >> 7, (delta * 18 + 63) >> 7, (delta * 9 + 63) >> 7,
// packed back to bytes with signed saturation.
v_sign_bit); + v_q2 = _mm256_xor_si256(v_q2, v_sign_bit); + v_t1 = _mm256_subs_epi8(v_p1, v_q1); + v_t2 = _mm256_subs_epi8(v_q0, v_p0); + v_t1 = _mm256_adds_epi8(v_t1, v_t2); + v_t1 = _mm256_adds_epi8(v_t1, v_t2); + v_delta = _mm256_adds_epi8(v_t1, v_t2); + v_delta = _mm256_and_si256(v_delta, v_mask); + v_v4 = _mm256_adds_epi8(v_delta, v_k4); + v_lo = _mm256_unpacklo_epi8(v_zero, v_v4); + v_hi = _mm256_unpackhi_epi8(v_zero, v_v4); + v_lo = _mm256_srai_epi16(v_lo, (int32_t)(11u)); + v_hi = _mm256_srai_epi16(v_hi, (int32_t)(11u)); + v_v4 = _mm256_packs_epi16(v_lo, v_hi); + v_v3 = _mm256_adds_epi8(v_delta, v_k3); + v_lo = _mm256_unpacklo_epi8(v_zero, v_v3); + v_hi = _mm256_unpackhi_epi8(v_zero, v_v3); + v_lo = _mm256_srai_epi16(v_lo, (int32_t)(11u)); + v_hi = _mm256_srai_epi16(v_hi, (int32_t)(11u)); + v_v3 = _mm256_packs_epi16(v_lo, v_hi); + v_d_lo = _mm256_srai_epi16(_mm256_unpacklo_epi8(v_zero, v_delta), (int32_t)(8u)); + v_d_hi = _mm256_srai_epi16(_mm256_unpackhi_epi8(v_zero, v_delta), (int32_t)(8u)); + v_lo = _mm256_srai_epi16(_mm256_add_epi16(_mm256_mullo_epi16(v_d_lo, v_k27), v_k63), (int32_t)(7u)); + v_hi = _mm256_srai_epi16(_mm256_add_epi16(_mm256_mullo_epi16(v_d_hi, v_k27), v_k63), (int32_t)(7u)); + v_a1 = _mm256_packs_epi16(v_lo, v_hi); + v_lo = _mm256_srai_epi16(_mm256_add_epi16(_mm256_mullo_epi16(v_d_lo, v_k18), v_k63), (int32_t)(7u)); + v_hi = _mm256_srai_epi16(_mm256_add_epi16(_mm256_mullo_epi16(v_d_hi, v_k18), v_k63), (int32_t)(7u)); + v_a2 = _mm256_packs_epi16(v_lo, v_hi); + v_lo = _mm256_srai_epi16(_mm256_add_epi16(_mm256_mullo_epi16(v_d_lo, v_k9), v_k63), (int32_t)(7u)); + v_hi = _mm256_srai_epi16(_mm256_add_epi16(_mm256_mullo_epi16(v_d_hi, v_k9), v_k63), (int32_t)(7u)); + v_a3 = _mm256_packs_epi16(v_lo, v_hi); + v_p0_adj = _mm256_or_si256(_mm256_andnot_si256(v_not_hev, v_v3), _mm256_and_si256(v_a1, v_not_hev)); + v_p0 = _mm256_adds_epi8(v_p0, v_p0_adj); + v_q0_adj = _mm256_or_si256(_mm256_andnot_si256(v_not_hev, v_v4), _mm256_and_si256(v_a1,
// Apply: where v_not_hev is set, p0/q0 move by v_a1, p1/q1 by v_a2 and p2/q2
// by v_a3 (added on the p side, subtracted on the q side). Where it is clear,
// only p0 (+ v_v3) and q0 (- v_v4) change. The sign bit is then flipped back
// and the columns are transposed into rows again: v_f0 = rows 0 and 1,
// v_f1 = rows 2 and 3, v_f2 = rows 4 and 5, v_f3 = rows 6 and 7 (even row in
// the low 8 bytes of each lane, odd row in the high 8 bytes).
// Store phase: the slices are re-derived from a_workbuf at a_u_off - 4 and
// a_v_off - 4. Unlike the loads, a too-short slice does not return early here:
// each 8-byte store and each v_stride advance is individually skipped.
v_not_hev)); + v_q0 = _mm256_subs_epi8(v_q0, v_q0_adj); + v_p1 = _mm256_adds_epi8(v_p1, _mm256_and_si256(v_a2, v_not_hev)); + v_q1 = _mm256_subs_epi8(v_q1, _mm256_and_si256(v_a2, v_not_hev)); + v_p2 = _mm256_adds_epi8(v_p2, _mm256_and_si256(v_a3, v_not_hev)); + v_q2 = _mm256_subs_epi8(v_q2, _mm256_and_si256(v_a3, v_not_hev)); + v_p2 = _mm256_xor_si256(v_p2, v_sign_bit); + v_p1 = _mm256_xor_si256(v_p1, v_sign_bit); + v_p0 = _mm256_xor_si256(v_p0, v_sign_bit); + v_q0 = _mm256_xor_si256(v_q0, v_sign_bit); + v_q1 = _mm256_xor_si256(v_q1, v_sign_bit); + v_q2 = _mm256_xor_si256(v_q2, v_sign_bit); + v_f0 = _mm256_unpacklo_epi8(v_p3, v_p2); + v_f1 = _mm256_unpacklo_epi8(v_p1, v_p0); + v_f2 = _mm256_unpacklo_epi8(v_q0, v_q1); + v_f3 = _mm256_unpacklo_epi8(v_q2, v_q3); + v_g0 = _mm256_unpacklo_epi16(v_f0, v_f1); + v_g1 = _mm256_unpackhi_epi16(v_f0, v_f1); + v_g2 = _mm256_unpacklo_epi16(v_f2, v_f3); + v_g3 = _mm256_unpackhi_epi16(v_f2, v_f3); + v_f0 = _mm256_unpacklo_epi32(v_g0, v_g2); + v_f1 = _mm256_unpackhi_epi32(v_g0, v_g2); + v_f2 = _mm256_unpacklo_epi32(v_g1, v_g3); + v_f3 = _mm256_unpackhi_epi32(v_g1, v_g3); + if ((a_u_off - 4u) > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + v_u_wb = wuffs_base__slice_u8__subslice_i(a_workbuf, (a_u_off - 4u)); + if ((a_v_off - 4u) > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + v_v_wb = wuffs_base__slice_u8__subslice_i(a_workbuf, (a_v_off - 4u)); + v_u_128 = _mm256_castsi256_si128(v_f0); + v_v_128 = _mm256_extracti128_si256(v_f0, (int32_t)(1u)); + if (8u <= ((uint64_t)(v_u_wb.len))) { + _mm_storeu_si64((void*)(v_u_wb.ptr), v_u_128); + } + if (8u <= ((uint64_t)(v_v_wb.len))) { + _mm_storeu_si64((void*)(v_v_wb.ptr), v_v_128); + } + if (v_stride <= ((uint64_t)(v_u_wb.len))) { + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, v_stride); + } + if (v_stride <= ((uint64_t)(v_v_wb.len))) { + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, v_stride); + } + v_u_128 =
_mm_unpackhi_epi64(_mm256_castsi256_si128(v_f0), _mm256_castsi256_si128(v_f0)); + v_v_128 = _mm_unpackhi_epi64(_mm256_extracti128_si256(v_f0, (int32_t)(1u)), _mm256_extracti128_si256(v_f0, (int32_t)(1u))); + if (8u <= ((uint64_t)(v_u_wb.len))) { + _mm_storeu_si64((void*)(v_u_wb.ptr), v_u_128); + } + if (8u <= ((uint64_t)(v_v_wb.len))) { + _mm_storeu_si64((void*)(v_v_wb.ptr), v_v_128); + } + if (v_stride <= ((uint64_t)(v_u_wb.len))) { + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, v_stride); + } + if (v_stride <= ((uint64_t)(v_v_wb.len))) { + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, v_stride); + } + v_u_128 = _mm256_castsi256_si128(v_f1); + v_v_128 = _mm256_extracti128_si256(v_f1, (int32_t)(1u)); + if (8u <= ((uint64_t)(v_u_wb.len))) { + _mm_storeu_si64((void*)(v_u_wb.ptr), v_u_128); + } + if (8u <= ((uint64_t)(v_v_wb.len))) { + _mm_storeu_si64((void*)(v_v_wb.ptr), v_v_128); + } + if (v_stride <= ((uint64_t)(v_u_wb.len))) { + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, v_stride); + } + if (v_stride <= ((uint64_t)(v_v_wb.len))) { + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, v_stride); + } + v_u_128 = _mm_unpackhi_epi64(_mm256_castsi256_si128(v_f1), _mm256_castsi256_si128(v_f1)); + v_v_128 = _mm_unpackhi_epi64(_mm256_extracti128_si256(v_f1, (int32_t)(1u)), _mm256_extracti128_si256(v_f1, (int32_t)(1u))); + if (8u <= ((uint64_t)(v_u_wb.len))) { + _mm_storeu_si64((void*)(v_u_wb.ptr), v_u_128); + } + if (8u <= ((uint64_t)(v_v_wb.len))) { + _mm_storeu_si64((void*)(v_v_wb.ptr), v_v_128); + } + if (v_stride <= ((uint64_t)(v_u_wb.len))) { + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, v_stride); + } + if (v_stride <= ((uint64_t)(v_v_wb.len))) { + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, v_stride); + } + v_u_128 = _mm256_castsi256_si128(v_f2); + v_v_128 = _mm256_extracti128_si256(v_f2, (int32_t)(1u)); + if (8u <= ((uint64_t)(v_u_wb.len))) { + _mm_storeu_si64((void*)(v_u_wb.ptr), v_u_128); + } + if (8u <= ((uint64_t)(v_v_wb.len))) { +
_mm_storeu_si64((void*)(v_v_wb.ptr), v_v_128); + } + if (v_stride <= ((uint64_t)(v_u_wb.len))) { + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, v_stride); + } + if (v_stride <= ((uint64_t)(v_v_wb.len))) { + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, v_stride); + } + v_u_128 = _mm_unpackhi_epi64(_mm256_castsi256_si128(v_f2), _mm256_castsi256_si128(v_f2)); + v_v_128 = _mm_unpackhi_epi64(_mm256_extracti128_si256(v_f2, (int32_t)(1u)), _mm256_extracti128_si256(v_f2, (int32_t)(1u))); + if (8u <= ((uint64_t)(v_u_wb.len))) { + _mm_storeu_si64((void*)(v_u_wb.ptr), v_u_128); + } + if (8u <= ((uint64_t)(v_v_wb.len))) { + _mm_storeu_si64((void*)(v_v_wb.ptr), v_v_128); + } + if (v_stride <= ((uint64_t)(v_u_wb.len))) { + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, v_stride); + } + if (v_stride <= ((uint64_t)(v_v_wb.len))) { + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, v_stride); + } + v_u_128 = _mm256_castsi256_si128(v_f3); + v_v_128 = _mm256_extracti128_si256(v_f3, (int32_t)(1u)); + if (8u <= ((uint64_t)(v_u_wb.len))) { + _mm_storeu_si64((void*)(v_u_wb.ptr), v_u_128); + } + if (8u <= ((uint64_t)(v_v_wb.len))) { + _mm_storeu_si64((void*)(v_v_wb.ptr), v_v_128); + } + if (v_stride <= ((uint64_t)(v_u_wb.len))) { + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, v_stride); + } + if (v_stride <= ((uint64_t)(v_v_wb.len))) { + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, v_stride); + } + v_u_128 = _mm_unpackhi_epi64(_mm256_castsi256_si128(v_f3), _mm256_castsi256_si128(v_f3)); + v_v_128 = _mm_unpackhi_epi64(_mm256_extracti128_si256(v_f3, (int32_t)(1u)), _mm256_extracti128_si256(v_f3, (int32_t)(1u))); + if (8u <= ((uint64_t)(v_u_wb.len))) { + _mm_storeu_si64((void*)(v_u_wb.ptr), v_u_128); + } + if (8u <= ((uint64_t)(v_v_wb.len))) { + _mm_storeu_si64((void*)(v_v_wb.ptr), v_v_128); + } + return wuffs_base__make_empty_struct(); +} +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) +// ‼ WUFFS MULTI-FILE SECTION -x86_avx2 + +// ‼ WUFFS MULTI-FILE
SECTION +x86_avx2 +// -------- func vp8.decoder.normal_hfilter_inner_uv_x86_avx2 + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) +WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2,avx2") +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_inner_uv_x86_avx2( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_u_off, + uint64_t a_v_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + wuffs_base__slice_u8 v_u_wb = {0}; + wuffs_base__slice_u8 v_v_wb = {0}; + uint64_t v_stride = 0; + __m128i v_u_ra = {0}; + __m128i v_u_rb = {0}; + __m128i v_v_ra = {0}; + __m128i v_v_rb = {0}; + __m128i v_u_128 = {0}; + __m128i v_v_128 = {0}; + __m256i v_f0 = {0}; + __m256i v_f1 = {0}; + __m256i v_f2 = {0}; + __m256i v_f3 = {0}; + __m256i v_g0 = {0}; + __m256i v_g1 = {0}; + __m256i v_g2 = {0}; + __m256i v_g3 = {0}; + __m256i v_p3 = {0}; + __m256i v_p2 = {0}; + __m256i v_p1 = {0}; + __m256i v_p0 = {0}; + __m256i v_q0 = {0}; + __m256i v_q1 = {0}; + __m256i v_q2 = {0}; + __m256i v_q3 = {0}; + __m256i v_zero = {0}; + __m256i v_sign_bit = {0}; + __m256i v_kFE = {0}; + __m256i v_m_thresh = {0}; + __m256i v_m_ithresh = {0}; + __m256i v_m_hthresh = {0}; + __m256i v_k1 = {0}; + __m256i v_k3 = {0}; + __m256i v_k4 = {0}; + __m256i v_mask = {0}; + __m256i v_not_hev = {0}; + __m256i v_delta = {0}; + __m256i v_v3 = {0}; + __m256i v_v4 = {0}; + __m256i v_a3 = {0}; + __m256i v_t1 = {0}; + __m256i v_t2 = {0}; + __m256i v_t3 = {0}; + __m256i v_lo = {0}; + __m256i v_hi = {0}; + __m256i v_ra = {0}; + + v_stride = ((uint64_t)(self->private_impl.f_uv_stride)); + if ((a_u_off < 4u) || (a_v_off < 4u)) { + return wuffs_base__make_empty_struct(); + } + v_u_wb = a_workbuf; + if ((a_u_off - 4u) > ((uint64_t)(v_u_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, (a_u_off - 4u)); + v_v_wb = a_workbuf; + if ((a_v_off - 4u) > ((uint64_t)(v_v_wb.len))) { + 
return wuffs_base__make_empty_struct(); + } + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, (a_v_off - 4u)); + if ((8u > ((uint64_t)(v_u_wb.len))) || (8u > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_ra = _mm_loadl_epi64((const __m128i*)(const void*)(v_u_wb.ptr)); + v_v_ra = _mm_loadl_epi64((const __m128i*)(const void*)(v_v_wb.ptr)); + if ((v_stride > ((uint64_t)(v_u_wb.len))) || (v_stride > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, v_stride); + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, v_stride); + if ((8u > ((uint64_t)(v_u_wb.len))) || (8u > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_rb = _mm_loadl_epi64((const __m128i*)(const void*)(v_u_wb.ptr)); + v_v_rb = _mm_loadl_epi64((const __m128i*)(const void*)(v_v_wb.ptr)); + if ((v_stride > ((uint64_t)(v_u_wb.len))) || (v_stride > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, v_stride); + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, v_stride); + v_ra = _mm256_inserti128_si256(_mm256_castsi128_si256(v_u_ra), v_v_ra, (int32_t)(1u)); + v_t1 = _mm256_inserti128_si256(_mm256_castsi128_si256(v_u_rb), v_v_rb, (int32_t)(1u)); + v_f0 = _mm256_unpacklo_epi8(v_ra, v_t1); + if ((8u > ((uint64_t)(v_u_wb.len))) || (8u > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_ra = _mm_loadl_epi64((const __m128i*)(const void*)(v_u_wb.ptr)); + v_v_ra = _mm_loadl_epi64((const __m128i*)(const void*)(v_v_wb.ptr)); + if ((v_stride > ((uint64_t)(v_u_wb.len))) || (v_stride > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, v_stride); + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, v_stride); + if ((8u > ((uint64_t)(v_u_wb.len))) || (8u > ((uint64_t)(v_v_wb.len)))) { + return 
wuffs_base__make_empty_struct(); + } + v_u_rb = _mm_loadl_epi64((const __m128i*)(const void*)(v_u_wb.ptr)); + v_v_rb = _mm_loadl_epi64((const __m128i*)(const void*)(v_v_wb.ptr)); + if ((v_stride > ((uint64_t)(v_u_wb.len))) || (v_stride > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, v_stride); + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, v_stride); + v_ra = _mm256_inserti128_si256(_mm256_castsi128_si256(v_u_ra), v_v_ra, (int32_t)(1u)); + v_t1 = _mm256_inserti128_si256(_mm256_castsi128_si256(v_u_rb), v_v_rb, (int32_t)(1u)); + v_f1 = _mm256_unpacklo_epi8(v_ra, v_t1); + if ((8u > ((uint64_t)(v_u_wb.len))) || (8u > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_ra = _mm_loadl_epi64((const __m128i*)(const void*)(v_u_wb.ptr)); + v_v_ra = _mm_loadl_epi64((const __m128i*)(const void*)(v_v_wb.ptr)); + if ((v_stride > ((uint64_t)(v_u_wb.len))) || (v_stride > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, v_stride); + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, v_stride); + if ((8u > ((uint64_t)(v_u_wb.len))) || (8u > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_rb = _mm_loadl_epi64((const __m128i*)(const void*)(v_u_wb.ptr)); + v_v_rb = _mm_loadl_epi64((const __m128i*)(const void*)(v_v_wb.ptr)); + if ((v_stride > ((uint64_t)(v_u_wb.len))) || (v_stride > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, v_stride); + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, v_stride); + v_ra = _mm256_inserti128_si256(_mm256_castsi128_si256(v_u_ra), v_v_ra, (int32_t)(1u)); + v_t1 = _mm256_inserti128_si256(_mm256_castsi128_si256(v_u_rb), v_v_rb, (int32_t)(1u)); + v_f2 = _mm256_unpacklo_epi8(v_ra, v_t1); + if ((8u > ((uint64_t)(v_u_wb.len))) || (8u > 
((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_ra = _mm_loadl_epi64((const __m128i*)(const void*)(v_u_wb.ptr)); + v_v_ra = _mm_loadl_epi64((const __m128i*)(const void*)(v_v_wb.ptr)); + if (v_stride > ((uint64_t)(v_u_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, v_stride); + if (v_stride > ((uint64_t)(v_v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, v_stride); + if ((8u > ((uint64_t)(v_u_wb.len))) || (8u > ((uint64_t)(v_v_wb.len)))) { + return wuffs_base__make_empty_struct(); + } + v_u_rb = _mm_loadl_epi64((const __m128i*)(const void*)(v_u_wb.ptr)); + v_v_rb = _mm_loadl_epi64((const __m128i*)(const void*)(v_v_wb.ptr)); + v_ra = _mm256_inserti128_si256(_mm256_castsi128_si256(v_u_ra), v_v_ra, (int32_t)(1u)); + v_t1 = _mm256_inserti128_si256(_mm256_castsi128_si256(v_u_rb), v_v_rb, (int32_t)(1u)); + v_f3 = _mm256_unpacklo_epi8(v_ra, v_t1); + v_g0 = _mm256_unpacklo_epi16(v_f0, v_f1); + v_g1 = _mm256_unpackhi_epi16(v_f0, v_f1); + v_g2 = _mm256_unpacklo_epi16(v_f2, v_f3); + v_g3 = _mm256_unpackhi_epi16(v_f2, v_f3); + v_f0 = _mm256_unpacklo_epi32(v_g0, v_g2); + v_f1 = _mm256_unpackhi_epi32(v_g0, v_g2); + v_f2 = _mm256_unpacklo_epi32(v_g1, v_g3); + v_f3 = _mm256_unpackhi_epi32(v_g1, v_g3); + v_p3 = v_f0; + v_t1 = v_f0; + v_p2 = _mm256_unpackhi_epi64(v_t1, v_t1); + v_p1 = v_f1; + v_t1 = v_f1; + v_p0 = _mm256_unpackhi_epi64(v_t1, v_t1); + v_q0 = v_f2; + v_t1 = v_f2; + v_q1 = _mm256_unpackhi_epi64(v_t1, v_t1); + v_q2 = v_f3; + v_t1 = v_f3; + v_q3 = _mm256_unpackhi_epi64(v_t1, v_t1); + v_zero = _mm256_setzero_si256(); + v_sign_bit = _mm256_set1_epi8((int8_t)(128u)); + v_kFE = _mm256_set1_epi8((int8_t)(254u)); + v_m_thresh = _mm256_set1_epi8((int8_t)(((uint8_t)(a_level)))); + v_m_ithresh = _mm256_set1_epi8((int8_t)(((uint8_t)(a_ilevel)))); + v_m_hthresh = _mm256_set1_epi8((int8_t)(((uint8_t)(a_hlevel)))); + v_k1 = 
_mm256_set1_epi8((int8_t)(1u)); + v_k3 = _mm256_set1_epi8((int8_t)(3u)); + v_k4 = _mm256_set1_epi8((int8_t)(4u)); + v_t1 = _mm256_or_si256(_mm256_subs_epu8(v_p1, v_q1), _mm256_subs_epu8(v_q1, v_p1)); + v_t2 = _mm256_srli_epi16(_mm256_and_si256(v_t1, v_kFE), (int32_t)(1u)); + v_t3 = _mm256_or_si256(_mm256_subs_epu8(v_p0, v_q0), _mm256_subs_epu8(v_q0, v_p0)); + v_t3 = _mm256_adds_epu8(v_t3, v_t3); + v_t3 = _mm256_adds_epu8(v_t3, v_t2); + v_mask = _mm256_cmpeq_epi8(_mm256_subs_epu8(v_t3, v_m_thresh), v_zero); + v_t1 = _mm256_or_si256(_mm256_subs_epu8(v_p3, v_p2), _mm256_subs_epu8(v_p2, v_p3)); + v_mask = _mm256_and_si256(v_mask, _mm256_cmpeq_epi8(_mm256_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm256_or_si256(_mm256_subs_epu8(v_p2, v_p1), _mm256_subs_epu8(v_p1, v_p2)); + v_mask = _mm256_and_si256(v_mask, _mm256_cmpeq_epi8(_mm256_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm256_or_si256(_mm256_subs_epu8(v_p1, v_p0), _mm256_subs_epu8(v_p0, v_p1)); + v_mask = _mm256_and_si256(v_mask, _mm256_cmpeq_epi8(_mm256_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm256_or_si256(_mm256_subs_epu8(v_q0, v_q1), _mm256_subs_epu8(v_q1, v_q0)); + v_mask = _mm256_and_si256(v_mask, _mm256_cmpeq_epi8(_mm256_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm256_or_si256(_mm256_subs_epu8(v_q1, v_q2), _mm256_subs_epu8(v_q2, v_q1)); + v_mask = _mm256_and_si256(v_mask, _mm256_cmpeq_epi8(_mm256_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm256_or_si256(_mm256_subs_epu8(v_q2, v_q3), _mm256_subs_epu8(v_q3, v_q2)); + v_mask = _mm256_and_si256(v_mask, _mm256_cmpeq_epi8(_mm256_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm256_or_si256(_mm256_subs_epu8(v_p1, v_p0), _mm256_subs_epu8(v_p0, v_p1)); + v_t2 = _mm256_or_si256(_mm256_subs_epu8(v_q1, v_q0), _mm256_subs_epu8(v_q0, v_q1)); + v_t3 = _mm256_or_si256(_mm256_subs_epu8(v_t1, v_m_hthresh), _mm256_subs_epu8(v_t2, v_m_hthresh)); + v_not_hev = _mm256_cmpeq_epi8(v_t3, v_zero); + v_p1 = _mm256_xor_si256(v_p1, v_sign_bit); + 
v_p0 = _mm256_xor_si256(v_p0, v_sign_bit); + v_q0 = _mm256_xor_si256(v_q0, v_sign_bit); + v_q1 = _mm256_xor_si256(v_q1, v_sign_bit); + v_t1 = _mm256_subs_epi8(v_p1, v_q1); + v_t1 = _mm256_andnot_si256(v_not_hev, v_t1); + v_t2 = _mm256_subs_epi8(v_q0, v_p0); + v_t1 = _mm256_adds_epi8(v_t1, v_t2); + v_t1 = _mm256_adds_epi8(v_t1, v_t2); + v_delta = _mm256_adds_epi8(v_t1, v_t2); + v_delta = _mm256_and_si256(v_delta, v_mask); + v_v4 = _mm256_adds_epi8(v_delta, v_k4); + v_lo = _mm256_unpacklo_epi8(v_zero, v_v4); + v_hi = _mm256_unpackhi_epi8(v_zero, v_v4); + v_lo = _mm256_srai_epi16(v_lo, (int32_t)(11u)); + v_hi = _mm256_srai_epi16(v_hi, (int32_t)(11u)); + v_v4 = _mm256_packs_epi16(v_lo, v_hi); + v_v3 = _mm256_adds_epi8(v_delta, v_k3); + v_lo = _mm256_unpacklo_epi8(v_zero, v_v3); + v_hi = _mm256_unpackhi_epi8(v_zero, v_v3); + v_lo = _mm256_srai_epi16(v_lo, (int32_t)(11u)); + v_hi = _mm256_srai_epi16(v_hi, (int32_t)(11u)); + v_v3 = _mm256_packs_epi16(v_lo, v_hi); + v_q0 = _mm256_subs_epi8(v_q0, v_v4); + v_p0 = _mm256_adds_epi8(v_p0, v_v3); + v_a3 = _mm256_adds_epi8(v_v4, v_k1); + v_lo = _mm256_unpacklo_epi8(v_zero, v_a3); + v_hi = _mm256_unpackhi_epi8(v_zero, v_a3); + v_lo = _mm256_srai_epi16(v_lo, (int32_t)(9u)); + v_hi = _mm256_srai_epi16(v_hi, (int32_t)(9u)); + v_a3 = _mm256_packs_epi16(v_lo, v_hi); + v_a3 = _mm256_and_si256(v_a3, v_not_hev); + v_q1 = _mm256_subs_epi8(v_q1, v_a3); + v_p1 = _mm256_adds_epi8(v_p1, v_a3); + v_p1 = _mm256_xor_si256(v_p1, v_sign_bit); + v_p0 = _mm256_xor_si256(v_p0, v_sign_bit); + v_q0 = _mm256_xor_si256(v_q0, v_sign_bit); + v_q1 = _mm256_xor_si256(v_q1, v_sign_bit); + v_f0 = _mm256_unpacklo_epi8(v_p3, v_p2); + v_f1 = _mm256_unpacklo_epi8(v_p1, v_p0); + v_f2 = _mm256_unpacklo_epi8(v_q0, v_q1); + v_f3 = _mm256_unpacklo_epi8(v_q2, v_q3); + v_g0 = _mm256_unpacklo_epi16(v_f0, v_f1); + v_g1 = _mm256_unpackhi_epi16(v_f0, v_f1); + v_g2 = _mm256_unpacklo_epi16(v_f2, v_f3); + v_g3 = _mm256_unpackhi_epi16(v_f2, v_f3); + v_f0 = 
_mm256_unpacklo_epi32(v_g0, v_g2); + v_f1 = _mm256_unpackhi_epi32(v_g0, v_g2); + v_f2 = _mm256_unpacklo_epi32(v_g1, v_g3); + v_f3 = _mm256_unpackhi_epi32(v_g1, v_g3); + if ((a_u_off - 4u) > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + v_u_wb = wuffs_base__slice_u8__subslice_i(a_workbuf, (a_u_off - 4u)); + if ((a_v_off - 4u) > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + v_v_wb = wuffs_base__slice_u8__subslice_i(a_workbuf, (a_v_off - 4u)); + v_u_128 = _mm256_castsi256_si128(v_f0); + v_v_128 = _mm256_extracti128_si256(v_f0, (int32_t)(1u)); + if (8u <= ((uint64_t)(v_u_wb.len))) { + _mm_storeu_si64((void*)(v_u_wb.ptr), v_u_128); + } + if (8u <= ((uint64_t)(v_v_wb.len))) { + _mm_storeu_si64((void*)(v_v_wb.ptr), v_v_128); + } + if (v_stride <= ((uint64_t)(v_u_wb.len))) { + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, v_stride); + } + if (v_stride <= ((uint64_t)(v_v_wb.len))) { + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, v_stride); + } + v_u_128 = _mm_unpackhi_epi64(_mm256_castsi256_si128(v_f0), _mm256_castsi256_si128(v_f0)); + v_v_128 = _mm_unpackhi_epi64(_mm256_extracti128_si256(v_f0, (int32_t)(1u)), _mm256_extracti128_si256(v_f0, (int32_t)(1u))); + if (8u <= ((uint64_t)(v_u_wb.len))) { + _mm_storeu_si64((void*)(v_u_wb.ptr), v_u_128); + } + if (8u <= ((uint64_t)(v_v_wb.len))) { + _mm_storeu_si64((void*)(v_v_wb.ptr), v_v_128); + } + if (v_stride <= ((uint64_t)(v_u_wb.len))) { + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, v_stride); + } + if (v_stride <= ((uint64_t)(v_v_wb.len))) { + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, v_stride); + } + v_u_128 = _mm256_castsi256_si128(v_f1); + v_v_128 = _mm256_extracti128_si256(v_f1, (int32_t)(1u)); + if (8u <= ((uint64_t)(v_u_wb.len))) { + _mm_storeu_si64((void*)(v_u_wb.ptr), v_u_128); + } + if (8u <= ((uint64_t)(v_v_wb.len))) { + _mm_storeu_si64((void*)(v_v_wb.ptr), v_v_128); + } + if (v_stride <= ((uint64_t)(v_u_wb.len))) { + v_u_wb = 
wuffs_base__slice_u8__subslice_i(v_u_wb, v_stride); + } + if (v_stride <= ((uint64_t)(v_v_wb.len))) { + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, v_stride); + } + v_u_128 = _mm_unpackhi_epi64(_mm256_castsi256_si128(v_f1), _mm256_castsi256_si128(v_f1)); + v_v_128 = _mm_unpackhi_epi64(_mm256_extracti128_si256(v_f1, (int32_t)(1u)), _mm256_extracti128_si256(v_f1, (int32_t)(1u))); + if (8u <= ((uint64_t)(v_u_wb.len))) { + _mm_storeu_si64((void*)(v_u_wb.ptr), v_u_128); + } + if (8u <= ((uint64_t)(v_v_wb.len))) { + _mm_storeu_si64((void*)(v_v_wb.ptr), v_v_128); + } + if (v_stride <= ((uint64_t)(v_u_wb.len))) { + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, v_stride); + } + if (v_stride <= ((uint64_t)(v_v_wb.len))) { + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, v_stride); + } + v_u_128 = _mm256_castsi256_si128(v_f2); + v_v_128 = _mm256_extracti128_si256(v_f2, (int32_t)(1u)); + if (8u <= ((uint64_t)(v_u_wb.len))) { + _mm_storeu_si64((void*)(v_u_wb.ptr), v_u_128); + } + if (8u <= ((uint64_t)(v_v_wb.len))) { + _mm_storeu_si64((void*)(v_v_wb.ptr), v_v_128); + } + if (v_stride <= ((uint64_t)(v_u_wb.len))) { + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, v_stride); + } + if (v_stride <= ((uint64_t)(v_v_wb.len))) { + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, v_stride); + } + v_u_128 = _mm_unpackhi_epi64(_mm256_castsi256_si128(v_f2), _mm256_castsi256_si128(v_f2)); + v_v_128 = _mm_unpackhi_epi64(_mm256_extracti128_si256(v_f2, (int32_t)(1u)), _mm256_extracti128_si256(v_f2, (int32_t)(1u))); + if (8u <= ((uint64_t)(v_u_wb.len))) { + _mm_storeu_si64((void*)(v_u_wb.ptr), v_u_128); + } + if (8u <= ((uint64_t)(v_v_wb.len))) { + _mm_storeu_si64((void*)(v_v_wb.ptr), v_v_128); + } + if (v_stride <= ((uint64_t)(v_u_wb.len))) { + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, v_stride); + } + if (v_stride <= ((uint64_t)(v_v_wb.len))) { + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, v_stride); + } + v_u_128 = _mm256_castsi256_si128(v_f3); + v_v_128 = 
_mm256_extracti128_si256(v_f3, (int32_t)(1u)); + if (8u <= ((uint64_t)(v_u_wb.len))) { + _mm_storeu_si64((void*)(v_u_wb.ptr), v_u_128); + } + if (8u <= ((uint64_t)(v_v_wb.len))) { + _mm_storeu_si64((void*)(v_v_wb.ptr), v_v_128); + } + if (v_stride <= ((uint64_t)(v_u_wb.len))) { + v_u_wb = wuffs_base__slice_u8__subslice_i(v_u_wb, v_stride); + } + if (v_stride <= ((uint64_t)(v_v_wb.len))) { + v_v_wb = wuffs_base__slice_u8__subslice_i(v_v_wb, v_stride); + } + v_u_128 = _mm_unpackhi_epi64(_mm256_castsi256_si128(v_f3), _mm256_castsi256_si128(v_f3)); + v_v_128 = _mm_unpackhi_epi64(_mm256_extracti128_si256(v_f3, (int32_t)(1u)), _mm256_extracti128_si256(v_f3, (int32_t)(1u))); + if (8u <= ((uint64_t)(v_u_wb.len))) { + _mm_storeu_si64((void*)(v_u_wb.ptr), v_u_128); + } + if (8u <= ((uint64_t)(v_v_wb.len))) { + _mm_storeu_si64((void*)(v_v_wb.ptr), v_v_128); + } + return wuffs_base__make_empty_struct(); +} +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) +// ‼ WUFFS MULTI-FILE SECTION -x86_avx2 + +// ‼ WUFFS MULTI-FILE SECTION +x86_sse42 +// -------- func vp8.decoder.simple_vfilter_16_x86_sse42 + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) +WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2") +WUFFS_BASE__GENERATED_C_CODE /* simple_vfilter_16_x86_sse42 adjusts the two rows of a_workbuf on either side of byte offset a_q0_off, 16 bytes per row, with rows f_y_stride bytes apart. It loads rows p1, p0, q0, q1 (starting two strides before a_q0_off), computes a masked per-byte delta and stores new p0 and q0. Only the low 8 bits of a_limit are used, as the per-byte threshold. It returns early if the stride is below 16, if a_q0_off is less than two strides, or if a length check before any load or store fails. The return value carries no data. */ +static wuffs_base__empty_struct +wuffs_vp8__decoder__simple_vfilter_16_x86_sse42( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_limit) { + wuffs_base__slice_u8 v_wb = {0}; + uint64_t v_stride = 0; + __m128i v_p1 = {0}; + __m128i v_p0 = {0}; + __m128i v_q0 = {0}; + __m128i v_q1 = {0}; + __m128i v_sign_bit = {0}; + __m128i v_zero = {0}; + __m128i v_kFE = {0}; + __m128i v_m_thresh = {0}; + __m128i v_k3 = {0}; + __m128i v_k4 = {0}; + __m128i v_mask = {0}; + __m128i v_t1 = {0}; + __m128i v_t2 = {0}; + __m128i v_t3 = {0}; + __m128i v_delta = {0}; + __m128i v_v3 = {0}; + __m128i v_v4 = {0}; + __m128i v_lo = {0}; + __m128i v_hi = {0}; + __m128i v_p1s = {0}; + __m128i v_q1s = {0}; + +
/* Bounds-check, then load 16 bytes from each of rows p1, p0, q0, q1. */ v_stride = ((uint64_t)(self->private_impl.f_y_stride)); + if (v_stride < 16u) { + return wuffs_base__make_empty_struct(); + } + if (a_q0_off < (2u * v_stride)) { + return wuffs_base__make_empty_struct(); + } + v_wb = a_workbuf; + if ((a_q0_off - (2u * v_stride)) <= ((uint64_t)(v_wb.len))) { + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, (a_q0_off - (2u * v_stride))); + } else { + return wuffs_base__make_empty_struct(); + } + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_p1 = _mm_lddqu_si128((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_p0 = _mm_lddqu_si128((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_q0 = _mm_lddqu_si128((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (16u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_q1 = _mm_lddqu_si128((const __m128i*)(const void*)(v_wb.ptr)); /* Broadcast the byte constants and the threshold. */ + v_zero = _mm_setzero_si128(); + v_sign_bit = _mm_set1_epi8((int8_t)(128u)); + v_kFE = _mm_set1_epi8((int8_t)(254u)); + v_m_thresh = _mm_set1_epi8((int8_t)(((uint8_t)(a_limit)))); + v_k3 = _mm_set1_epi8((int8_t)(3u)); + v_k4 = _mm_set1_epi8((int8_t)(4u)); /* mask = 0xFF in bytes where 2*|p0-q0| + (|p1-q1| >> 1), computed with unsigned saturation, is <= thresh. The AND with 0xFE stops the 16-bit shift from moving a bit into the neighbouring byte. */ + v_t1 = _mm_or_si128(_mm_subs_epu8(v_p1, v_q1), _mm_subs_epu8(v_q1, v_p1)); + v_t2 = _mm_and_si128(v_t1, v_kFE); + v_t2 = _mm_srli_epi16(v_t2, (int32_t)(1u)); + v_t3 = _mm_or_si128(_mm_subs_epu8(v_p0, v_q0), _mm_subs_epu8(v_q0, v_p0)); + v_t3 = _mm_adds_epu8(v_t3, v_t3); + v_t3 = _mm_adds_epu8(v_t3, v_t2); + v_mask = _mm_subs_epu8(v_t3, v_m_thresh); + v_mask = _mm_cmpeq_epi8(v_mask, v_zero); /* XOR with 0x80 so the signed saturating byte ops below can be used; p0 and q0 are flipped back before the store. */ + v_p1s = _mm_xor_si128(v_p1, v_sign_bit); + v_q1s = _mm_xor_si128(v_q1, v_sign_bit); + v_p0 = _mm_xor_si128(v_p0,
v_sign_bit); + v_q0 = _mm_xor_si128(v_q0, v_sign_bit); /* delta = (p1 - q1) + 3 * (q0 - p0) via signed saturating byte ops, zeroed where mask is clear. */ + v_t1 = _mm_subs_epi8(v_p1s, v_q1s); + v_t2 = _mm_subs_epi8(v_q0, v_p0); + v_t1 = _mm_adds_epi8(v_t1, v_t2); + v_t1 = _mm_adds_epi8(v_t1, v_t2); + v_delta = _mm_adds_epi8(v_t1, v_t2); + v_delta = _mm_and_si128(v_delta, v_mask); /* v4 = (delta + 4) >> 3 and v3 = (delta + 3) >> 3. Unpacking against zero puts each byte in the high half of a 16-bit lane, so the arithmetic shift by 11 (8 + 3) is a sign-preserving shift by 3. */ + v_v4 = _mm_adds_epi8(v_delta, v_k4); + v_lo = _mm_unpacklo_epi8(v_zero, v_v4); + v_hi = _mm_unpackhi_epi8(v_zero, v_v4); + v_lo = _mm_srai_epi16(v_lo, (int32_t)(11u)); + v_hi = _mm_srai_epi16(v_hi, (int32_t)(11u)); + v_v4 = _mm_packs_epi16(v_lo, v_hi); + v_v3 = _mm_adds_epi8(v_delta, v_k3); + v_lo = _mm_unpacklo_epi8(v_zero, v_v3); + v_hi = _mm_unpackhi_epi8(v_zero, v_v3); + v_lo = _mm_srai_epi16(v_lo, (int32_t)(11u)); + v_hi = _mm_srai_epi16(v_hi, (int32_t)(11u)); + v_v3 = _mm_packs_epi16(v_lo, v_hi); /* q0 -= v4, p0 += v3, undo the sign flip, then store p0 one stride before a_q0_off and q0 at a_q0_off. */ + v_q0 = _mm_subs_epi8(v_q0, v_v4); + v_p0 = _mm_adds_epi8(v_p0, v_v3); + v_p0 = _mm_xor_si128(v_p0, v_sign_bit); + v_q0 = _mm_xor_si128(v_q0, v_sign_bit); + if (a_q0_off < v_stride) { + return wuffs_base__make_empty_struct(); + } + if ((a_q0_off - v_stride) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, (a_q0_off - v_stride)); + } else { + return wuffs_base__make_empty_struct(); + } + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si128((__m128i*)(void*)(a_workbuf.ptr), v_p0); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (16u > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si128((__m128i*)(void*)(a_workbuf.ptr), v_q0); + return wuffs_base__make_empty_struct(); +} +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) +// ‼ WUFFS MULTI-FILE SECTION -x86_sse42 + +// ‼ WUFFS MULTI-FILE SECTION +x86_sse42 +// -------- func vp8.decoder.normal_vfilter_inner_16_x86_sse42 + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) +WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2") +WUFFS_BASE__GENERATED_C_CODE
/* normal_vfilter_inner_16_x86_sse42 adjusts rows p1, p0, q0, q1 of a_workbuf around byte offset a_q0_off, 16 bytes per row, with rows f_y_stride bytes apart. It loads eight rows p3..q3 (starting four strides before a_q0_off) to build its masks. Only the low 8 bits of a_level, a_ilevel and a_hlevel are used, as per-byte thresholds. It returns early if the stride is below 16, if a_q0_off is less than four strides, or if a length check before any load or store fails. The return value carries no data. */ +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_inner_16_x86_sse42( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + wuffs_base__slice_u8 v_wb = {0}; + uint64_t v_stride = 0; + __m128i v_p3 = {0}; + __m128i v_p2 = {0}; + __m128i v_p1 = {0}; + __m128i v_p0 = {0}; + __m128i v_q0 = {0}; + __m128i v_q1 = {0}; + __m128i v_q2 = {0}; + __m128i v_q3 = {0}; + __m128i v_zero = {0}; + __m128i v_sign_bit = {0}; + __m128i v_kFE = {0}; + __m128i v_m_thresh = {0}; + __m128i v_m_ithresh = {0}; + __m128i v_m_hthresh = {0}; + __m128i v_k1 = {0}; + __m128i v_k3 = {0}; + __m128i v_k4 = {0}; + __m128i v_mask = {0}; + __m128i v_not_hev = {0}; + __m128i v_delta = {0}; + __m128i v_v3 = {0}; + __m128i v_v4 = {0}; + __m128i v_a3 = {0}; + __m128i v_t1 = {0}; + __m128i v_t2 = {0}; + __m128i v_t3 = {0}; + __m128i v_lo = {0}; + __m128i v_hi = {0}; + /* Bounds-check, then load 16 bytes from each of the eight rows p3..q3. */ + v_stride = ((uint64_t)(self->private_impl.f_y_stride)); + if (v_stride < 16u) { + return wuffs_base__make_empty_struct(); + } + if (a_q0_off < (4u * v_stride)) { + return wuffs_base__make_empty_struct(); + } + v_wb = a_workbuf; + if ((a_q0_off - (4u * v_stride)) <= ((uint64_t)(v_wb.len))) { + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, (a_q0_off - (4u * v_stride))); + } else { + return wuffs_base__make_empty_struct(); + } + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_p3 = _mm_lddqu_si128((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_p2 = _mm_lddqu_si128((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_p1 = _mm_lddqu_si128((const __m128i*)(const void*)(v_wb.ptr)); + v_wb =
wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_p0 = _mm_lddqu_si128((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_q0 = _mm_lddqu_si128((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_q1 = _mm_lddqu_si128((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_q2 = _mm_lddqu_si128((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (16u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_q3 = _mm_lddqu_si128((const __m128i*)(const void*)(v_wb.ptr)); /* Broadcast the byte constants and the low 8 bits of each threshold argument. */ + v_zero = _mm_setzero_si128(); + v_sign_bit = _mm_set1_epi8((int8_t)(128u)); + v_kFE = _mm_set1_epi8((int8_t)(254u)); + v_m_thresh = _mm_set1_epi8((int8_t)(((uint8_t)(a_level)))); + v_m_ithresh = _mm_set1_epi8((int8_t)(((uint8_t)(a_ilevel)))); + v_m_hthresh = _mm_set1_epi8((int8_t)(((uint8_t)(a_hlevel)))); + v_k1 = _mm_set1_epi8((int8_t)(1u)); + v_k3 = _mm_set1_epi8((int8_t)(3u)); + v_k4 = _mm_set1_epi8((int8_t)(4u)); /* mask = 0xFF in bytes where 2*|p0-q0| + (|p1-q1| >> 1), with unsigned saturation, is <= thresh and each of |p3-p2|, |p2-p1|, |p1-p0|, |q0-q1|, |q1-q2|, |q2-q3| is <= ithresh. */ + v_t1 = _mm_or_si128(_mm_subs_epu8(v_p1, v_q1), _mm_subs_epu8(v_q1, v_p1)); + v_t2 = _mm_srli_epi16(_mm_and_si128(v_t1, v_kFE), (int32_t)(1u)); + v_t3 = _mm_or_si128(_mm_subs_epu8(v_p0, v_q0), _mm_subs_epu8(v_q0, v_p0)); + v_t3 = _mm_adds_epu8(v_t3, v_t3); + v_t3 = _mm_adds_epu8(v_t3, v_t2); + v_mask = _mm_cmpeq_epi8(_mm_subs_epu8(v_t3, v_m_thresh), v_zero); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_p3, v_p2), _mm_subs_epu8(v_p2, v_p3)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1,
v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_p2, v_p1), _mm_subs_epu8(v_p1, v_p2)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_p1, v_p0), _mm_subs_epu8(v_p0, v_p1)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_q0, v_q1), _mm_subs_epu8(v_q1, v_q0)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_q1, v_q2), _mm_subs_epu8(v_q2, v_q1)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_q2, v_q3), _mm_subs_epu8(v_q3, v_q2)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); /* not_hev = 0xFF in bytes where both |p1-p0| and |q1-q0| are <= hthresh. */ + v_t1 = _mm_or_si128(_mm_subs_epu8(v_p1, v_p0), _mm_subs_epu8(v_p0, v_p1)); + v_t2 = _mm_or_si128(_mm_subs_epu8(v_q1, v_q0), _mm_subs_epu8(v_q0, v_q1)); + v_t3 = _mm_or_si128(_mm_subs_epu8(v_t1, v_m_hthresh), _mm_subs_epu8(v_t2, v_m_hthresh)); + v_not_hev = _mm_cmpeq_epi8(v_t3, v_zero); /* XOR with 0x80 so the signed saturating byte ops below can be used; the four rows are flipped back before the store. */ + v_p1 = _mm_xor_si128(v_p1, v_sign_bit); + v_p0 = _mm_xor_si128(v_p0, v_sign_bit); + v_q0 = _mm_xor_si128(v_q0, v_sign_bit); + v_q1 = _mm_xor_si128(v_q1, v_sign_bit); /* delta = ((p1 - q1) where not_hev is clear, else 0) + 3 * (q0 - p0), via signed saturating byte ops, zeroed where mask is clear. */ + v_t1 = _mm_subs_epi8(v_p1, v_q1); + v_t1 = _mm_andnot_si128(v_not_hev, v_t1); + v_t2 = _mm_subs_epi8(v_q0, v_p0); + v_t1 = _mm_adds_epi8(v_t1, v_t2); + v_t1 = _mm_adds_epi8(v_t1, v_t2); + v_delta = _mm_adds_epi8(v_t1, v_t2); + v_delta = _mm_and_si128(v_delta, v_mask); /* v4 = (delta + 4) >> 3 and v3 = (delta + 3) >> 3. Unpacking against zero puts each byte in the high half of a 16-bit lane, so the arithmetic shift by 11 (8 + 3) is a sign-preserving shift by 3. */ + v_v4 = _mm_adds_epi8(v_delta, v_k4); + v_lo = _mm_unpacklo_epi8(v_zero, v_v4); + v_hi = _mm_unpackhi_epi8(v_zero, v_v4); + v_lo = _mm_srai_epi16(v_lo, (int32_t)(11u)); + v_hi = _mm_srai_epi16(v_hi, (int32_t)(11u)); + v_v4 = _mm_packs_epi16(v_lo, v_hi); + v_v3 = _mm_adds_epi8(v_delta, v_k3); + v_lo = _mm_unpacklo_epi8(v_zero, v_v3); + v_hi = _mm_unpackhi_epi8(v_zero, v_v3); + v_lo =
_mm_srai_epi16(v_lo, (int32_t)(11u)); + v_hi = _mm_srai_epi16(v_hi, (int32_t)(11u)); + v_v3 = _mm_packs_epi16(v_lo, v_hi); /* q0 -= v4 and p0 += v3, with signed saturation. */ + v_q0 = _mm_subs_epi8(v_q0, v_v4); + v_p0 = _mm_adds_epi8(v_p0, v_v3); /* a3 = (v4 + 1) >> 1 (shift by 9 = 8 + 1), kept only where not_hev is set; q1 -= a3 and p1 += a3. */ + v_a3 = _mm_adds_epi8(v_v4, v_k1); + v_lo = _mm_unpacklo_epi8(v_zero, v_a3); + v_hi = _mm_unpackhi_epi8(v_zero, v_a3); + v_lo = _mm_srai_epi16(v_lo, (int32_t)(9u)); + v_hi = _mm_srai_epi16(v_hi, (int32_t)(9u)); + v_a3 = _mm_packs_epi16(v_lo, v_hi); + v_a3 = _mm_and_si128(v_a3, v_not_hev); + v_q1 = _mm_subs_epi8(v_q1, v_a3); + v_p1 = _mm_adds_epi8(v_p1, v_a3); + v_p1 = _mm_xor_si128(v_p1, v_sign_bit); + v_p0 = _mm_xor_si128(v_p0, v_sign_bit); + v_q0 = _mm_xor_si128(v_q0, v_sign_bit); + v_q1 = _mm_xor_si128(v_q1, v_sign_bit); /* Store p1, p0, q0, q1 as four consecutive rows starting two strides before a_q0_off. */ + if (a_q0_off < (2u * v_stride)) { + return wuffs_base__make_empty_struct(); + } + if ((a_q0_off - (2u * v_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, (a_q0_off - (2u * v_stride))); + } else { + return wuffs_base__make_empty_struct(); + } + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si128((__m128i*)(void*)(a_workbuf.ptr), v_p1); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si128((__m128i*)(void*)(a_workbuf.ptr), v_p0); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si128((__m128i*)(void*)(a_workbuf.ptr), v_q0); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (16u > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si128((__m128i*)(void*)(a_workbuf.ptr), v_q1); + return wuffs_base__make_empty_struct(); +} +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) +// ‼ WUFFS MULTI-FILE SECTION -x86_sse42 + +// ‼
WUFFS MULTI-FILE SECTION +x86_sse42 +// -------- func vp8.decoder.normal_vfilter_mb_16_x86_sse42 + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) +WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2") +WUFFS_BASE__GENERATED_C_CODE /* normal_vfilter_mb_16_x86_sse42 adjusts rows p2, p1, p0, q0, q1, q2 of a_workbuf around byte offset a_q0_off, 16 bytes per row, with rows f_y_stride bytes apart. It loads eight rows p3..q3 (starting four strides before a_q0_off) to build its masks. Only the low 8 bits of a_level, a_ilevel and a_hlevel are used, as per-byte thresholds. It returns early if the stride is below 16, if a_q0_off is less than four strides, or if a length check before any load or store fails. The return value carries no data. */ +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_mb_16_x86_sse42( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + wuffs_base__slice_u8 v_wb = {0}; + uint64_t v_stride = 0; + __m128i v_p3 = {0}; + __m128i v_p2 = {0}; + __m128i v_p1 = {0}; + __m128i v_p0 = {0}; + __m128i v_q0 = {0}; + __m128i v_q1 = {0}; + __m128i v_q2 = {0}; + __m128i v_q3 = {0}; + __m128i v_zero = {0}; + __m128i v_sign_bit = {0}; + __m128i v_kFE = {0}; + __m128i v_m_thresh = {0}; + __m128i v_m_ithresh = {0}; + __m128i v_m_hthresh = {0}; + __m128i v_k3 = {0}; + __m128i v_k4 = {0}; + __m128i v_k63 = {0}; + __m128i v_k27 = {0}; + __m128i v_k18 = {0}; + __m128i v_k9 = {0}; + __m128i v_mask = {0}; + __m128i v_not_hev = {0}; + __m128i v_delta = {0}; + __m128i v_v3 = {0}; + __m128i v_v4 = {0}; + __m128i v_a1 = {0}; + __m128i v_a2 = {0}; + __m128i v_a3 = {0}; + __m128i v_t1 = {0}; + __m128i v_t2 = {0}; + __m128i v_t3 = {0}; + __m128i v_lo = {0}; + __m128i v_hi = {0}; + __m128i v_d_lo = {0}; + __m128i v_d_hi = {0}; + __m128i v_p0_adj = {0}; + __m128i v_q0_adj = {0}; + /* Bounds-check, then load 16 bytes from each of the eight rows p3..q3. */ + v_stride = ((uint64_t)(self->private_impl.f_y_stride)); + if (v_stride < 16u) { + return wuffs_base__make_empty_struct(); + } + if (a_q0_off < (4u * v_stride)) { + return wuffs_base__make_empty_struct(); + } + v_wb = a_workbuf; + if ((a_q0_off - (4u * v_stride)) <= ((uint64_t)(v_wb.len))) { + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, (a_q0_off - (4u * v_stride))); + } else { + return wuffs_base__make_empty_struct(); + } + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_p3 = _mm_lddqu_si128((const __m128i*)(const void*)(v_wb.ptr)); + v_wb
= wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_p2 = _mm_lddqu_si128((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_p1 = _mm_lddqu_si128((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_p0 = _mm_lddqu_si128((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_q0 = _mm_lddqu_si128((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_q1 = _mm_lddqu_si128((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_q2 = _mm_lddqu_si128((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (16u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_q3 = _mm_lddqu_si128((const __m128i*)(const void*)(v_wb.ptr)); /* Broadcast constants: byte lanes for the thresholds, 3 and 4; 16-bit lanes for 63, 27, 18 and 9. */ + v_zero = _mm_setzero_si128(); + v_sign_bit = _mm_set1_epi8((int8_t)(128u)); + v_kFE = _mm_set1_epi8((int8_t)(254u)); + v_m_thresh = _mm_set1_epi8((int8_t)(((uint8_t)(a_level)))); + v_m_ithresh = _mm_set1_epi8((int8_t)(((uint8_t)(a_ilevel)))); + v_m_hthresh = _mm_set1_epi8((int8_t)(((uint8_t)(a_hlevel)))); + v_k3 = _mm_set1_epi8((int8_t)(3u)); + v_k4 = _mm_set1_epi8((int8_t)(4u)); + v_k63 = _mm_set1_epi16((int16_t)(63u)); + v_k27 = _mm_set1_epi16((int16_t)(27u)); + v_k18 =
_mm_set1_epi16((int16_t)(18u)); + v_k9 = _mm_set1_epi16((int16_t)(9u)); /* mask = 0xFF in bytes where 2*|p0-q0| + (|p1-q1| >> 1), with unsigned saturation, is <= thresh and each of |p3-p2|, |p2-p1|, |p1-p0|, |q0-q1|, |q1-q2|, |q2-q3| is <= ithresh. */ + v_t1 = _mm_or_si128(_mm_subs_epu8(v_p1, v_q1), _mm_subs_epu8(v_q1, v_p1)); + v_t2 = _mm_srli_epi16(_mm_and_si128(v_t1, v_kFE), (int32_t)(1u)); + v_t3 = _mm_or_si128(_mm_subs_epu8(v_p0, v_q0), _mm_subs_epu8(v_q0, v_p0)); + v_t3 = _mm_adds_epu8(v_t3, v_t3); + v_t3 = _mm_adds_epu8(v_t3, v_t2); + v_mask = _mm_cmpeq_epi8(_mm_subs_epu8(v_t3, v_m_thresh), v_zero); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_p3, v_p2), _mm_subs_epu8(v_p2, v_p3)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_p2, v_p1), _mm_subs_epu8(v_p1, v_p2)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_p1, v_p0), _mm_subs_epu8(v_p0, v_p1)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_q0, v_q1), _mm_subs_epu8(v_q1, v_q0)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_q1, v_q2), _mm_subs_epu8(v_q2, v_q1)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_q2, v_q3), _mm_subs_epu8(v_q3, v_q2)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); /* not_hev = 0xFF in bytes where both |p1-p0| and |q1-q0| are <= hthresh. */ + v_t1 = _mm_or_si128(_mm_subs_epu8(v_p1, v_p0), _mm_subs_epu8(v_p0, v_p1)); + v_t2 = _mm_or_si128(_mm_subs_epu8(v_q1, v_q0), _mm_subs_epu8(v_q0, v_q1)); + v_t3 = _mm_or_si128(_mm_subs_epu8(v_t1, v_m_hthresh), _mm_subs_epu8(v_t2, v_m_hthresh)); + v_not_hev = _mm_cmpeq_epi8(v_t3, v_zero); /* XOR rows p2..q2 with 0x80 so the signed saturating byte ops below can be used, then delta = (p1 - q1) + 3 * (q0 - p0), zeroed where mask is clear. */ + v_p2 = _mm_xor_si128(v_p2, v_sign_bit); + v_p1 = _mm_xor_si128(v_p1, v_sign_bit); + v_p0 = _mm_xor_si128(v_p0, v_sign_bit); + v_q0 = _mm_xor_si128(v_q0, v_sign_bit); + v_q1 = _mm_xor_si128(v_q1, v_sign_bit); + v_q2 = _mm_xor_si128(v_q2, v_sign_bit); + v_t1 =
_mm_subs_epi8(v_p1, v_q1); + v_t2 = _mm_subs_epi8(v_q0, v_p0); + v_t1 = _mm_adds_epi8(v_t1, v_t2); + v_t1 = _mm_adds_epi8(v_t1, v_t2); + v_delta = _mm_adds_epi8(v_t1, v_t2); + v_delta = _mm_and_si128(v_delta, v_mask); /* v4 = (delta + 4) >> 3 and v3 = (delta + 3) >> 3. Unpacking against zero puts each byte in the high half of a 16-bit lane, so the arithmetic shift by 11 (8 + 3) is a sign-preserving shift by 3. */ + v_v4 = _mm_adds_epi8(v_delta, v_k4); + v_lo = _mm_unpacklo_epi8(v_zero, v_v4); + v_hi = _mm_unpackhi_epi8(v_zero, v_v4); + v_lo = _mm_srai_epi16(v_lo, (int32_t)(11u)); + v_hi = _mm_srai_epi16(v_hi, (int32_t)(11u)); + v_v4 = _mm_packs_epi16(v_lo, v_hi); + v_v3 = _mm_adds_epi8(v_delta, v_k3); + v_lo = _mm_unpacklo_epi8(v_zero, v_v3); + v_hi = _mm_unpackhi_epi8(v_zero, v_v3); + v_lo = _mm_srai_epi16(v_lo, (int32_t)(11u)); + v_hi = _mm_srai_epi16(v_hi, (int32_t)(11u)); + v_v3 = _mm_packs_epi16(v_lo, v_hi); /* d_lo and d_hi hold delta sign-extended to 16-bit lanes. a1, a2, a3 = (27*d + 63) >> 7, (18*d + 63) >> 7, (9*d + 63) >> 7, packed back to bytes with signed saturation. */ + v_d_lo = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_delta), (int32_t)(8u)); + v_d_hi = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero, v_delta), (int32_t)(8u)); + v_lo = _mm_srai_epi16(_mm_add_epi16(_mm_mullo_epi16(v_d_lo, v_k27), v_k63), (int32_t)(7u)); + v_hi = _mm_srai_epi16(_mm_add_epi16(_mm_mullo_epi16(v_d_hi, v_k27), v_k63), (int32_t)(7u)); + v_a1 = _mm_packs_epi16(v_lo, v_hi); + v_lo = _mm_srai_epi16(_mm_add_epi16(_mm_mullo_epi16(v_d_lo, v_k18), v_k63), (int32_t)(7u)); + v_hi = _mm_srai_epi16(_mm_add_epi16(_mm_mullo_epi16(v_d_hi, v_k18), v_k63), (int32_t)(7u)); + v_a2 = _mm_packs_epi16(v_lo, v_hi); + v_lo = _mm_srai_epi16(_mm_add_epi16(_mm_mullo_epi16(v_d_lo, v_k9), v_k63), (int32_t)(7u)); + v_hi = _mm_srai_epi16(_mm_add_epi16(_mm_mullo_epi16(v_d_hi, v_k9), v_k63), (int32_t)(7u)); + v_a3 = _mm_packs_epi16(v_lo, v_hi); /* Where not_hev is clear: p0 += v3 and q0 -= v4. Where it is set: p0 += a1, q0 -= a1, p1 += a2, q1 -= a2, p2 += a3, q2 -= a3. All use signed saturation. */ + v_p0_adj = _mm_or_si128(_mm_andnot_si128(v_not_hev, v_v3), _mm_and_si128(v_a1, v_not_hev)); + v_p0 = _mm_adds_epi8(v_p0, v_p0_adj); + v_q0_adj = _mm_or_si128(_mm_andnot_si128(v_not_hev, v_v4), _mm_and_si128(v_a1, v_not_hev)); + v_q0 = _mm_subs_epi8(v_q0, v_q0_adj); + v_p1 = _mm_adds_epi8(v_p1, _mm_and_si128(v_a2, v_not_hev)); + v_q1 = _mm_subs_epi8(v_q1, _mm_and_si128(v_a2, v_not_hev)); + v_p2 = _mm_adds_epi8(v_p2,
_mm_and_si128(v_a3, v_not_hev)); + v_q2 = _mm_subs_epi8(v_q2, _mm_and_si128(v_a3, v_not_hev)); + v_p2 = _mm_xor_si128(v_p2, v_sign_bit); + v_p1 = _mm_xor_si128(v_p1, v_sign_bit); + v_p0 = _mm_xor_si128(v_p0, v_sign_bit); + v_q0 = _mm_xor_si128(v_q0, v_sign_bit); + v_q1 = _mm_xor_si128(v_q1, v_sign_bit); + v_q2 = _mm_xor_si128(v_q2, v_sign_bit); /* Store p2, p1, p0, q0, q1, q2 as six consecutive rows starting three strides before a_q0_off. */ + if (a_q0_off < (3u * v_stride)) { + return wuffs_base__make_empty_struct(); + } + if ((a_q0_off - (3u * v_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, (a_q0_off - (3u * v_stride))); + } else { + return wuffs_base__make_empty_struct(); + } + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si128((__m128i*)(void*)(a_workbuf.ptr), v_p2); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si128((__m128i*)(void*)(a_workbuf.ptr), v_p1); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si128((__m128i*)(void*)(a_workbuf.ptr), v_p0); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si128((__m128i*)(void*)(a_workbuf.ptr), v_q0); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si128((__m128i*)(void*)(a_workbuf.ptr), v_q1); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (16u > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si128((__m128i*)(void*)(a_workbuf.ptr), v_q2); + return wuffs_base__make_empty_struct(); +} +#endif //
defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) +// ‼ WUFFS MULTI-FILE SECTION -x86_sse42 + +// ‼ WUFFS MULTI-FILE SECTION +x86_sse42 +// -------- func vp8.decoder.normal_vfilter_mb_8_x86_sse42 + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) +WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2") +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_mb_8_x86_sse42( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + wuffs_base__slice_u8 v_wb = {0}; + uint64_t v_stride = 0; + __m128i v_p3 = {0}; + __m128i v_p2 = {0}; + __m128i v_p1 = {0}; + __m128i v_p0 = {0}; + __m128i v_q0 = {0}; + __m128i v_q1 = {0}; + __m128i v_q2 = {0}; + __m128i v_q3 = {0}; + __m128i v_zero = {0}; + __m128i v_sign_bit = {0}; + __m128i v_kFE = {0}; + __m128i v_m_thresh = {0}; + __m128i v_m_ithresh = {0}; + __m128i v_m_hthresh = {0}; + __m128i v_k3 = {0}; + __m128i v_k4 = {0}; + __m128i v_k63 = {0}; + __m128i v_k27 = {0}; + __m128i v_k18 = {0}; + __m128i v_k9 = {0}; + __m128i v_mask = {0}; + __m128i v_not_hev = {0}; + __m128i v_delta = {0}; + __m128i v_v3 = {0}; + __m128i v_v4 = {0}; + __m128i v_a1 = {0}; + __m128i v_a2 = {0}; + __m128i v_a3 = {0}; + __m128i v_t1 = {0}; + __m128i v_t2 = {0}; + __m128i v_t3 = {0}; + __m128i v_lo = {0}; + __m128i v_hi = {0}; + __m128i v_d_lo = {0}; + __m128i v_d_hi = {0}; + __m128i v_p0_adj = {0}; + __m128i v_q0_adj = {0}; + + v_stride = ((uint64_t)(self->private_impl.f_uv_stride)); + if (v_stride < 8u) { + return wuffs_base__make_empty_struct(); + } + if (a_q0_off < (4u * v_stride)) { + return wuffs_base__make_empty_struct(); + } + v_wb = a_workbuf; + if ((a_q0_off - (4u * v_stride)) <= ((uint64_t)(v_wb.len))) { + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, (a_q0_off - (4u * v_stride))); + } else { + return wuffs_base__make_empty_struct(); + } + if (v_stride > ((uint64_t)(v_wb.len))) { + return 
wuffs_base__make_empty_struct(); + } + v_p3 = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_p2 = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_p1 = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_p0 = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_q0 = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_q1 = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_q2 = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_q3 = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_zero = _mm_setzero_si128(); + v_sign_bit = _mm_set1_epi8((int8_t)(128u)); + v_kFE = _mm_set1_epi8((int8_t)(254u)); + v_m_thresh = _mm_set1_epi8((int8_t)(((uint8_t)(a_level)))); + v_m_ithresh = _mm_set1_epi8((int8_t)(((uint8_t)(a_ilevel)))); + v_m_hthresh = _mm_set1_epi8((int8_t)(((uint8_t)(a_hlevel)))); + v_k3 = _mm_set1_epi8((int8_t)(3u)); + v_k4 = _mm_set1_epi8((int8_t)(4u)); + v_k63 = 
_mm_set1_epi16((int16_t)(63u)); + v_k27 = _mm_set1_epi16((int16_t)(27u)); + v_k18 = _mm_set1_epi16((int16_t)(18u)); + v_k9 = _mm_set1_epi16((int16_t)(9u)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_p1, v_q1), _mm_subs_epu8(v_q1, v_p1)); + v_t2 = _mm_srli_epi16(_mm_and_si128(v_t1, v_kFE), (int32_t)(1u)); + v_t3 = _mm_or_si128(_mm_subs_epu8(v_p0, v_q0), _mm_subs_epu8(v_q0, v_p0)); + v_t3 = _mm_adds_epu8(v_t3, v_t3); + v_t3 = _mm_adds_epu8(v_t3, v_t2); + v_mask = _mm_cmpeq_epi8(_mm_subs_epu8(v_t3, v_m_thresh), v_zero); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_p3, v_p2), _mm_subs_epu8(v_p2, v_p3)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_p2, v_p1), _mm_subs_epu8(v_p1, v_p2)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_p1, v_p0), _mm_subs_epu8(v_p0, v_p1)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_q0, v_q1), _mm_subs_epu8(v_q1, v_q0)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_q1, v_q2), _mm_subs_epu8(v_q2, v_q1)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_q2, v_q3), _mm_subs_epu8(v_q3, v_q2)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_p1, v_p0), _mm_subs_epu8(v_p0, v_p1)); + v_t2 = _mm_or_si128(_mm_subs_epu8(v_q1, v_q0), _mm_subs_epu8(v_q0, v_q1)); + v_t3 = _mm_or_si128(_mm_subs_epu8(v_t1, v_m_hthresh), _mm_subs_epu8(v_t2, v_m_hthresh)); + v_not_hev = _mm_cmpeq_epi8(v_t3, v_zero); + v_p2 = _mm_xor_si128(v_p2, v_sign_bit); + v_p1 = _mm_xor_si128(v_p1, v_sign_bit); + v_p0 = _mm_xor_si128(v_p0, v_sign_bit); + v_q0 = _mm_xor_si128(v_q0, v_sign_bit); + v_q1 = 
_mm_xor_si128(v_q1, v_sign_bit); + v_q2 = _mm_xor_si128(v_q2, v_sign_bit); + v_t1 = _mm_subs_epi8(v_p1, v_q1); + v_t2 = _mm_subs_epi8(v_q0, v_p0); + v_t1 = _mm_adds_epi8(v_t1, v_t2); + v_t1 = _mm_adds_epi8(v_t1, v_t2); + v_delta = _mm_adds_epi8(v_t1, v_t2); + v_delta = _mm_and_si128(v_delta, v_mask); + v_v4 = _mm_adds_epi8(v_delta, v_k4); + v_lo = _mm_unpacklo_epi8(v_zero, v_v4); + v_hi = _mm_unpackhi_epi8(v_zero, v_v4); + v_lo = _mm_srai_epi16(v_lo, (int32_t)(11u)); + v_hi = _mm_srai_epi16(v_hi, (int32_t)(11u)); + v_v4 = _mm_packs_epi16(v_lo, v_hi); + v_v3 = _mm_adds_epi8(v_delta, v_k3); + v_lo = _mm_unpacklo_epi8(v_zero, v_v3); + v_hi = _mm_unpackhi_epi8(v_zero, v_v3); + v_lo = _mm_srai_epi16(v_lo, (int32_t)(11u)); + v_hi = _mm_srai_epi16(v_hi, (int32_t)(11u)); + v_v3 = _mm_packs_epi16(v_lo, v_hi); + v_d_lo = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_delta), (int32_t)(8u)); + v_d_hi = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero, v_delta), (int32_t)(8u)); + v_lo = _mm_srai_epi16(_mm_add_epi16(_mm_mullo_epi16(v_d_lo, v_k27), v_k63), (int32_t)(7u)); + v_hi = _mm_srai_epi16(_mm_add_epi16(_mm_mullo_epi16(v_d_hi, v_k27), v_k63), (int32_t)(7u)); + v_a1 = _mm_packs_epi16(v_lo, v_hi); + v_lo = _mm_srai_epi16(_mm_add_epi16(_mm_mullo_epi16(v_d_lo, v_k18), v_k63), (int32_t)(7u)); + v_hi = _mm_srai_epi16(_mm_add_epi16(_mm_mullo_epi16(v_d_hi, v_k18), v_k63), (int32_t)(7u)); + v_a2 = _mm_packs_epi16(v_lo, v_hi); + v_lo = _mm_srai_epi16(_mm_add_epi16(_mm_mullo_epi16(v_d_lo, v_k9), v_k63), (int32_t)(7u)); + v_hi = _mm_srai_epi16(_mm_add_epi16(_mm_mullo_epi16(v_d_hi, v_k9), v_k63), (int32_t)(7u)); + v_a3 = _mm_packs_epi16(v_lo, v_hi); + v_p0_adj = _mm_or_si128(_mm_andnot_si128(v_not_hev, v_v3), _mm_and_si128(v_a1, v_not_hev)); + v_p0 = _mm_adds_epi8(v_p0, v_p0_adj); + v_q0_adj = _mm_or_si128(_mm_andnot_si128(v_not_hev, v_v4), _mm_and_si128(v_a1, v_not_hev)); + v_q0 = _mm_subs_epi8(v_q0, v_q0_adj); + v_p1 = _mm_adds_epi8(v_p1, _mm_and_si128(v_a2, v_not_hev)); + v_q1 = 
_mm_subs_epi8(v_q1, _mm_and_si128(v_a2, v_not_hev)); + v_p2 = _mm_adds_epi8(v_p2, _mm_and_si128(v_a3, v_not_hev)); + v_q2 = _mm_subs_epi8(v_q2, _mm_and_si128(v_a3, v_not_hev)); + v_p2 = _mm_xor_si128(v_p2, v_sign_bit); + v_p1 = _mm_xor_si128(v_p1, v_sign_bit); + v_p0 = _mm_xor_si128(v_p0, v_sign_bit); + v_q0 = _mm_xor_si128(v_q0, v_sign_bit); + v_q1 = _mm_xor_si128(v_q1, v_sign_bit); + v_q2 = _mm_xor_si128(v_q2, v_sign_bit); + if (a_q0_off < (3u * v_stride)) { + return wuffs_base__make_empty_struct(); + } + if ((a_q0_off - (3u * v_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, (a_q0_off - (3u * v_stride))); + } else { + return wuffs_base__make_empty_struct(); + } + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si64((void*)(a_workbuf.ptr), v_p2); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si64((void*)(a_workbuf.ptr), v_p1); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si64((void*)(a_workbuf.ptr), v_p0); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si64((void*)(a_workbuf.ptr), v_q0); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si64((void*)(a_workbuf.ptr), v_q1); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (8u > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si64((void*)(a_workbuf.ptr), v_q2); + return wuffs_base__make_empty_struct(); +} +#endif // 
defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) +// ‼ WUFFS MULTI-FILE SECTION -x86_sse42 + +// ‼ WUFFS MULTI-FILE SECTION +x86_sse42 +// -------- func vp8.decoder.normal_hfilter_mb_16_x86_sse42 + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) +WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2") +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_mb_16_x86_sse42( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + wuffs_base__slice_u8 v_wb = {0}; + uint64_t v_stride = 0; + __m128i v_ra = {0}; + __m128i v_rb = {0}; + __m128i v_f0 = {0}; + __m128i v_f1 = {0}; + __m128i v_f2 = {0}; + __m128i v_f3 = {0}; + __m128i v_f4 = {0}; + __m128i v_f5 = {0}; + __m128i v_f6 = {0}; + __m128i v_f7 = {0}; + __m128i v_g0 = {0}; + __m128i v_g1 = {0}; + __m128i v_g2 = {0}; + __m128i v_g3 = {0}; + __m128i v_g4 = {0}; + __m128i v_g5 = {0}; + __m128i v_g6 = {0}; + __m128i v_g7 = {0}; + __m128i v_p3 = {0}; + __m128i v_p2 = {0}; + __m128i v_p1 = {0}; + __m128i v_p0 = {0}; + __m128i v_q0 = {0}; + __m128i v_q1 = {0}; + __m128i v_q2 = {0}; + __m128i v_q3 = {0}; + __m128i v_zero = {0}; + __m128i v_sign_bit = {0}; + __m128i v_kFE = {0}; + __m128i v_m_thresh = {0}; + __m128i v_m_ithresh = {0}; + __m128i v_m_hthresh = {0}; + __m128i v_k3 = {0}; + __m128i v_k4 = {0}; + __m128i v_k63 = {0}; + __m128i v_k27 = {0}; + __m128i v_k18 = {0}; + __m128i v_k9 = {0}; + __m128i v_mask = {0}; + __m128i v_not_hev = {0}; + __m128i v_delta = {0}; + __m128i v_v3 = {0}; + __m128i v_v4 = {0}; + __m128i v_a1 = {0}; + __m128i v_a2 = {0}; + __m128i v_a3 = {0}; + __m128i v_t1 = {0}; + __m128i v_t2 = {0}; + __m128i v_t3 = {0}; + __m128i v_lo = {0}; + __m128i v_hi = {0}; + __m128i v_d_lo = {0}; + __m128i v_d_hi = {0}; + __m128i v_p0_adj = {0}; + __m128i v_q0_adj = {0}; + + v_stride = ((uint64_t)(self->private_impl.f_y_stride)); + if (v_stride < 8u) { + return 
wuffs_base__make_empty_struct(); + } + if (a_q0_off < 4u) { + return wuffs_base__make_empty_struct(); + } + v_wb = a_workbuf; + if ((a_q0_off - 4u) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, (a_q0_off - 4u)); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_ra = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_rb = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + v_f0 = _mm_unpacklo_epi8(v_ra, v_rb); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_ra = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_rb = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + v_f1 = _mm_unpacklo_epi8(v_ra, v_rb); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_ra = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_rb = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + v_f2 = _mm_unpacklo_epi8(v_ra, v_rb); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_ra = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + 
} + v_rb = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + v_f3 = _mm_unpacklo_epi8(v_ra, v_rb); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_ra = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_rb = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + v_f4 = _mm_unpacklo_epi8(v_ra, v_rb); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_ra = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_rb = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + v_f5 = _mm_unpacklo_epi8(v_ra, v_rb); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_ra = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_rb = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + v_f6 = _mm_unpacklo_epi8(v_ra, v_rb); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_ra = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_rb = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_f7 = _mm_unpacklo_epi8(v_ra, v_rb); + v_g0 = 
_mm_unpacklo_epi16(v_f0, v_f1); + v_g1 = _mm_unpackhi_epi16(v_f0, v_f1); + v_g2 = _mm_unpacklo_epi16(v_f2, v_f3); + v_g3 = _mm_unpackhi_epi16(v_f2, v_f3); + v_g4 = _mm_unpacklo_epi16(v_f4, v_f5); + v_g5 = _mm_unpackhi_epi16(v_f4, v_f5); + v_g6 = _mm_unpacklo_epi16(v_f6, v_f7); + v_g7 = _mm_unpackhi_epi16(v_f6, v_f7); + v_f0 = _mm_unpacklo_epi32(v_g0, v_g2); + v_f1 = _mm_unpackhi_epi32(v_g0, v_g2); + v_f2 = _mm_unpacklo_epi32(v_g1, v_g3); + v_f3 = _mm_unpackhi_epi32(v_g1, v_g3); + v_f4 = _mm_unpacklo_epi32(v_g4, v_g6); + v_f5 = _mm_unpackhi_epi32(v_g4, v_g6); + v_f6 = _mm_unpacklo_epi32(v_g5, v_g7); + v_f7 = _mm_unpackhi_epi32(v_g5, v_g7); + v_p3 = _mm_unpacklo_epi64(v_f0, v_f4); + v_p2 = _mm_unpackhi_epi64(v_f0, v_f4); + v_p1 = _mm_unpacklo_epi64(v_f1, v_f5); + v_p0 = _mm_unpackhi_epi64(v_f1, v_f5); + v_q0 = _mm_unpacklo_epi64(v_f2, v_f6); + v_q1 = _mm_unpackhi_epi64(v_f2, v_f6); + v_q2 = _mm_unpacklo_epi64(v_f3, v_f7); + v_q3 = _mm_unpackhi_epi64(v_f3, v_f7); + v_zero = _mm_setzero_si128(); + v_sign_bit = _mm_set1_epi8((int8_t)(128u)); + v_kFE = _mm_set1_epi8((int8_t)(254u)); + v_m_thresh = _mm_set1_epi8((int8_t)(((uint8_t)(a_level)))); + v_m_ithresh = _mm_set1_epi8((int8_t)(((uint8_t)(a_ilevel)))); + v_m_hthresh = _mm_set1_epi8((int8_t)(((uint8_t)(a_hlevel)))); + v_k3 = _mm_set1_epi8((int8_t)(3u)); + v_k4 = _mm_set1_epi8((int8_t)(4u)); + v_k63 = _mm_set1_epi16((int16_t)(63u)); + v_k27 = _mm_set1_epi16((int16_t)(27u)); + v_k18 = _mm_set1_epi16((int16_t)(18u)); + v_k9 = _mm_set1_epi16((int16_t)(9u)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_p1, v_q1), _mm_subs_epu8(v_q1, v_p1)); + v_t2 = _mm_srli_epi16(_mm_and_si128(v_t1, v_kFE), (int32_t)(1u)); + v_t3 = _mm_or_si128(_mm_subs_epu8(v_p0, v_q0), _mm_subs_epu8(v_q0, v_p0)); + v_t3 = _mm_adds_epu8(v_t3, v_t3); + v_t3 = _mm_adds_epu8(v_t3, v_t2); + v_mask = _mm_cmpeq_epi8(_mm_subs_epu8(v_t3, v_m_thresh), v_zero); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_p3, v_p2), _mm_subs_epu8(v_p2, v_p3)); + v_mask = _mm_and_si128(v_mask, 
_mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_p2, v_p1), _mm_subs_epu8(v_p1, v_p2)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_p1, v_p0), _mm_subs_epu8(v_p0, v_p1)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_q0, v_q1), _mm_subs_epu8(v_q1, v_q0)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_q1, v_q2), _mm_subs_epu8(v_q2, v_q1)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_q2, v_q3), _mm_subs_epu8(v_q3, v_q2)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_p1, v_p0), _mm_subs_epu8(v_p0, v_p1)); + v_t2 = _mm_or_si128(_mm_subs_epu8(v_q1, v_q0), _mm_subs_epu8(v_q0, v_q1)); + v_t3 = _mm_or_si128(_mm_subs_epu8(v_t1, v_m_hthresh), _mm_subs_epu8(v_t2, v_m_hthresh)); + v_not_hev = _mm_cmpeq_epi8(v_t3, v_zero); + v_p2 = _mm_xor_si128(v_p2, v_sign_bit); + v_p1 = _mm_xor_si128(v_p1, v_sign_bit); + v_p0 = _mm_xor_si128(v_p0, v_sign_bit); + v_q0 = _mm_xor_si128(v_q0, v_sign_bit); + v_q1 = _mm_xor_si128(v_q1, v_sign_bit); + v_q2 = _mm_xor_si128(v_q2, v_sign_bit); + v_t1 = _mm_subs_epi8(v_p1, v_q1); + v_t2 = _mm_subs_epi8(v_q0, v_p0); + v_t1 = _mm_adds_epi8(v_t1, v_t2); + v_t1 = _mm_adds_epi8(v_t1, v_t2); + v_delta = _mm_adds_epi8(v_t1, v_t2); + v_delta = _mm_and_si128(v_delta, v_mask); + v_v4 = _mm_adds_epi8(v_delta, v_k4); + v_lo = _mm_unpacklo_epi8(v_zero, v_v4); + v_hi = _mm_unpackhi_epi8(v_zero, v_v4); + v_lo = _mm_srai_epi16(v_lo, (int32_t)(11u)); + v_hi = _mm_srai_epi16(v_hi, (int32_t)(11u)); + v_v4 = _mm_packs_epi16(v_lo, v_hi); + v_v3 = _mm_adds_epi8(v_delta, v_k3); + v_lo = 
_mm_unpacklo_epi8(v_zero, v_v3); + v_hi = _mm_unpackhi_epi8(v_zero, v_v3); + v_lo = _mm_srai_epi16(v_lo, (int32_t)(11u)); + v_hi = _mm_srai_epi16(v_hi, (int32_t)(11u)); + v_v3 = _mm_packs_epi16(v_lo, v_hi); + v_d_lo = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_delta), (int32_t)(8u)); + v_d_hi = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero, v_delta), (int32_t)(8u)); + v_lo = _mm_srai_epi16(_mm_add_epi16(_mm_mullo_epi16(v_d_lo, v_k27), v_k63), (int32_t)(7u)); + v_hi = _mm_srai_epi16(_mm_add_epi16(_mm_mullo_epi16(v_d_hi, v_k27), v_k63), (int32_t)(7u)); + v_a1 = _mm_packs_epi16(v_lo, v_hi); + v_lo = _mm_srai_epi16(_mm_add_epi16(_mm_mullo_epi16(v_d_lo, v_k18), v_k63), (int32_t)(7u)); + v_hi = _mm_srai_epi16(_mm_add_epi16(_mm_mullo_epi16(v_d_hi, v_k18), v_k63), (int32_t)(7u)); + v_a2 = _mm_packs_epi16(v_lo, v_hi); + v_lo = _mm_srai_epi16(_mm_add_epi16(_mm_mullo_epi16(v_d_lo, v_k9), v_k63), (int32_t)(7u)); + v_hi = _mm_srai_epi16(_mm_add_epi16(_mm_mullo_epi16(v_d_hi, v_k9), v_k63), (int32_t)(7u)); + v_a3 = _mm_packs_epi16(v_lo, v_hi); + v_p0_adj = _mm_or_si128(_mm_andnot_si128(v_not_hev, v_v3), _mm_and_si128(v_a1, v_not_hev)); + v_p0 = _mm_adds_epi8(v_p0, v_p0_adj); + v_q0_adj = _mm_or_si128(_mm_andnot_si128(v_not_hev, v_v4), _mm_and_si128(v_a1, v_not_hev)); + v_q0 = _mm_subs_epi8(v_q0, v_q0_adj); + v_p1 = _mm_adds_epi8(v_p1, _mm_and_si128(v_a2, v_not_hev)); + v_q1 = _mm_subs_epi8(v_q1, _mm_and_si128(v_a2, v_not_hev)); + v_p2 = _mm_adds_epi8(v_p2, _mm_and_si128(v_a3, v_not_hev)); + v_q2 = _mm_subs_epi8(v_q2, _mm_and_si128(v_a3, v_not_hev)); + v_p2 = _mm_xor_si128(v_p2, v_sign_bit); + v_p1 = _mm_xor_si128(v_p1, v_sign_bit); + v_p0 = _mm_xor_si128(v_p0, v_sign_bit); + v_q0 = _mm_xor_si128(v_q0, v_sign_bit); + v_q1 = _mm_xor_si128(v_q1, v_sign_bit); + v_q2 = _mm_xor_si128(v_q2, v_sign_bit); + v_f0 = _mm_unpacklo_epi8(v_p3, v_p2); + v_f1 = _mm_unpackhi_epi8(v_p3, v_p2); + v_f2 = _mm_unpacklo_epi8(v_p1, v_p0); + v_f3 = _mm_unpackhi_epi8(v_p1, v_p0); + v_f4 = _mm_unpacklo_epi8(v_q0, 
v_q1); + v_f5 = _mm_unpackhi_epi8(v_q0, v_q1); + v_f6 = _mm_unpacklo_epi8(v_q2, v_q3); + v_f7 = _mm_unpackhi_epi8(v_q2, v_q3); + v_g0 = _mm_unpacklo_epi16(v_f0, v_f2); + v_g1 = _mm_unpackhi_epi16(v_f0, v_f2); + v_g2 = _mm_unpacklo_epi16(v_f4, v_f6); + v_g3 = _mm_unpackhi_epi16(v_f4, v_f6); + v_g4 = _mm_unpacklo_epi16(v_f1, v_f3); + v_g5 = _mm_unpackhi_epi16(v_f1, v_f3); + v_g6 = _mm_unpacklo_epi16(v_f5, v_f7); + v_g7 = _mm_unpackhi_epi16(v_f5, v_f7); + v_f0 = _mm_unpacklo_epi32(v_g0, v_g2); + v_f1 = _mm_unpackhi_epi32(v_g0, v_g2); + v_f2 = _mm_unpacklo_epi32(v_g1, v_g3); + v_f3 = _mm_unpackhi_epi32(v_g1, v_g3); + v_f4 = _mm_unpacklo_epi32(v_g4, v_g6); + v_f5 = _mm_unpackhi_epi32(v_g4, v_g6); + v_f6 = _mm_unpacklo_epi32(v_g5, v_g7); + v_f7 = _mm_unpackhi_epi32(v_g5, v_g7); + if ((a_q0_off - 4u) > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, (a_q0_off - 4u)); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si64((void*)(a_workbuf.ptr), v_f0); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + v_ra = _mm_unpackhi_epi64(v_f0, v_f0); + _mm_storeu_si64((void*)(a_workbuf.ptr), v_ra); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si64((void*)(a_workbuf.ptr), v_f1); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + v_ra = _mm_unpackhi_epi64(v_f1, v_f1); + _mm_storeu_si64((void*)(a_workbuf.ptr), v_ra); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + 
_mm_storeu_si64((void*)(a_workbuf.ptr), v_f2); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + v_ra = _mm_unpackhi_epi64(v_f2, v_f2); + _mm_storeu_si64((void*)(a_workbuf.ptr), v_ra); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si64((void*)(a_workbuf.ptr), v_f3); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + v_ra = _mm_unpackhi_epi64(v_f3, v_f3); + _mm_storeu_si64((void*)(a_workbuf.ptr), v_ra); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si64((void*)(a_workbuf.ptr), v_f4); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + v_ra = _mm_unpackhi_epi64(v_f4, v_f4); + _mm_storeu_si64((void*)(a_workbuf.ptr), v_ra); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si64((void*)(a_workbuf.ptr), v_f5); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + v_ra = _mm_unpackhi_epi64(v_f5, v_f5); + _mm_storeu_si64((void*)(a_workbuf.ptr), v_ra); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si64((void*)(a_workbuf.ptr), v_f6); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > 
((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + v_ra = _mm_unpackhi_epi64(v_f6, v_f6); + _mm_storeu_si64((void*)(a_workbuf.ptr), v_ra); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si64((void*)(a_workbuf.ptr), v_f7); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + v_ra = _mm_unpackhi_epi64(v_f7, v_f7); + if (8u > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si64((void*)(a_workbuf.ptr), v_ra); + return wuffs_base__make_empty_struct(); +} +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) +// ‼ WUFFS MULTI-FILE SECTION -x86_sse42 + +// ‼ WUFFS MULTI-FILE SECTION +x86_sse42 +// -------- func vp8.decoder.normal_hfilter_mb_8_x86_sse42 + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) +WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2") +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_mb_8_x86_sse42( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + wuffs_base__slice_u8 v_wb = {0}; + uint64_t v_stride = 0; + __m128i v_ra = {0}; + __m128i v_rb = {0}; + __m128i v_f0 = {0}; + __m128i v_f1 = {0}; + __m128i v_f2 = {0}; + __m128i v_f3 = {0}; + __m128i v_g0 = {0}; + __m128i v_g1 = {0}; + __m128i v_g2 = {0}; + __m128i v_g3 = {0}; + __m128i v_p3 = {0}; + __m128i v_p2 = {0}; + __m128i v_p1 = {0}; + __m128i v_p0 = {0}; + __m128i v_q0 = {0}; + __m128i v_q1 = {0}; + __m128i v_q2 = {0}; + __m128i v_q3 = {0}; + __m128i v_zero = {0}; + __m128i v_sign_bit = {0}; + __m128i v_kFE = {0}; + __m128i v_m_thresh = {0}; + __m128i v_m_ithresh = {0}; + __m128i v_m_hthresh = {0}; + __m128i v_k3 = {0}; + __m128i v_k4 = {0}; + __m128i v_k63 = {0}; + __m128i v_k27 = {0}; + __m128i v_k18 = {0}; + __m128i v_k9 
= {0}; + __m128i v_mask = {0}; + __m128i v_not_hev = {0}; + __m128i v_delta = {0}; + __m128i v_v3 = {0}; + __m128i v_v4 = {0}; + __m128i v_a1 = {0}; + __m128i v_a2 = {0}; + __m128i v_a3 = {0}; + __m128i v_t1 = {0}; + __m128i v_t2 = {0}; + __m128i v_t3 = {0}; + __m128i v_lo = {0}; + __m128i v_hi = {0}; + __m128i v_d_lo = {0}; + __m128i v_d_hi = {0}; + __m128i v_p0_adj = {0}; + __m128i v_q0_adj = {0}; + + v_stride = ((uint64_t)(self->private_impl.f_uv_stride)); + if (v_stride < 8u) { + return wuffs_base__make_empty_struct(); + } + if (a_q0_off < 4u) { + return wuffs_base__make_empty_struct(); + } + v_wb = a_workbuf; + if ((a_q0_off - 4u) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, (a_q0_off - 4u)); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_ra = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_rb = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + v_f0 = _mm_unpacklo_epi8(v_ra, v_rb); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_ra = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_rb = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + v_f1 = _mm_unpacklo_epi8(v_ra, v_rb); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_ra = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return 
wuffs_base__make_empty_struct(); + } + v_rb = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + v_f2 = _mm_unpacklo_epi8(v_ra, v_rb); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_ra = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_rb = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_f3 = _mm_unpacklo_epi8(v_ra, v_rb); + v_g0 = _mm_unpacklo_epi16(v_f0, v_f1); + v_g1 = _mm_unpackhi_epi16(v_f0, v_f1); + v_g2 = _mm_unpacklo_epi16(v_f2, v_f3); + v_g3 = _mm_unpackhi_epi16(v_f2, v_f3); + v_f0 = _mm_unpacklo_epi32(v_g0, v_g2); + v_f1 = _mm_unpackhi_epi32(v_g0, v_g2); + v_f2 = _mm_unpacklo_epi32(v_g1, v_g3); + v_f3 = _mm_unpackhi_epi32(v_g1, v_g3); + v_p3 = v_f0; + v_p2 = _mm_unpackhi_epi64(v_f0, v_f0); + v_p1 = v_f1; + v_p0 = _mm_unpackhi_epi64(v_f1, v_f1); + v_q0 = v_f2; + v_q1 = _mm_unpackhi_epi64(v_f2, v_f2); + v_q2 = v_f3; + v_q3 = _mm_unpackhi_epi64(v_f3, v_f3); + v_zero = _mm_setzero_si128(); + v_sign_bit = _mm_set1_epi8((int8_t)(128u)); + v_kFE = _mm_set1_epi8((int8_t)(254u)); + v_m_thresh = _mm_set1_epi8((int8_t)(((uint8_t)(a_level)))); + v_m_ithresh = _mm_set1_epi8((int8_t)(((uint8_t)(a_ilevel)))); + v_m_hthresh = _mm_set1_epi8((int8_t)(((uint8_t)(a_hlevel)))); + v_k3 = _mm_set1_epi8((int8_t)(3u)); + v_k4 = _mm_set1_epi8((int8_t)(4u)); + v_k63 = _mm_set1_epi16((int16_t)(63u)); + v_k27 = _mm_set1_epi16((int16_t)(27u)); + v_k18 = _mm_set1_epi16((int16_t)(18u)); + v_k9 = _mm_set1_epi16((int16_t)(9u)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_p1, v_q1), _mm_subs_epu8(v_q1, v_p1)); + v_t2 = _mm_srli_epi16(_mm_and_si128(v_t1, v_kFE), (int32_t)(1u)); + v_t3 = _mm_or_si128(_mm_subs_epu8(v_p0, v_q0), _mm_subs_epu8(v_q0, v_p0)); + v_t3 = _mm_adds_epu8(v_t3, v_t3); + v_t3 = _mm_adds_epu8(v_t3, 
v_t2); + v_mask = _mm_cmpeq_epi8(_mm_subs_epu8(v_t3, v_m_thresh), v_zero); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_p3, v_p2), _mm_subs_epu8(v_p2, v_p3)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_p2, v_p1), _mm_subs_epu8(v_p1, v_p2)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_p1, v_p0), _mm_subs_epu8(v_p0, v_p1)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_q0, v_q1), _mm_subs_epu8(v_q1, v_q0)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_q1, v_q2), _mm_subs_epu8(v_q2, v_q1)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_q2, v_q3), _mm_subs_epu8(v_q3, v_q2)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_p1, v_p0), _mm_subs_epu8(v_p0, v_p1)); + v_t2 = _mm_or_si128(_mm_subs_epu8(v_q1, v_q0), _mm_subs_epu8(v_q0, v_q1)); + v_t3 = _mm_or_si128(_mm_subs_epu8(v_t1, v_m_hthresh), _mm_subs_epu8(v_t2, v_m_hthresh)); + v_not_hev = _mm_cmpeq_epi8(v_t3, v_zero); + v_p2 = _mm_xor_si128(v_p2, v_sign_bit); + v_p1 = _mm_xor_si128(v_p1, v_sign_bit); + v_p0 = _mm_xor_si128(v_p0, v_sign_bit); + v_q0 = _mm_xor_si128(v_q0, v_sign_bit); + v_q1 = _mm_xor_si128(v_q1, v_sign_bit); + v_q2 = _mm_xor_si128(v_q2, v_sign_bit); + v_t1 = _mm_subs_epi8(v_p1, v_q1); + v_t2 = _mm_subs_epi8(v_q0, v_p0); + v_t1 = _mm_adds_epi8(v_t1, v_t2); + v_t1 = _mm_adds_epi8(v_t1, v_t2); + v_delta = _mm_adds_epi8(v_t1, v_t2); + v_delta = _mm_and_si128(v_delta, v_mask); + v_v4 = _mm_adds_epi8(v_delta, v_k4); + v_lo = _mm_unpacklo_epi8(v_zero, v_v4); + v_hi = _mm_unpackhi_epi8(v_zero, v_v4); + v_lo = 
_mm_srai_epi16(v_lo, (int32_t)(11u)); + v_hi = _mm_srai_epi16(v_hi, (int32_t)(11u)); + v_v4 = _mm_packs_epi16(v_lo, v_hi); + v_v3 = _mm_adds_epi8(v_delta, v_k3); + v_lo = _mm_unpacklo_epi8(v_zero, v_v3); + v_hi = _mm_unpackhi_epi8(v_zero, v_v3); + v_lo = _mm_srai_epi16(v_lo, (int32_t)(11u)); + v_hi = _mm_srai_epi16(v_hi, (int32_t)(11u)); + v_v3 = _mm_packs_epi16(v_lo, v_hi); + v_d_lo = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_delta), (int32_t)(8u)); + v_d_hi = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero, v_delta), (int32_t)(8u)); + v_lo = _mm_srai_epi16(_mm_add_epi16(_mm_mullo_epi16(v_d_lo, v_k27), v_k63), (int32_t)(7u)); + v_hi = _mm_srai_epi16(_mm_add_epi16(_mm_mullo_epi16(v_d_hi, v_k27), v_k63), (int32_t)(7u)); + v_a1 = _mm_packs_epi16(v_lo, v_hi); + v_lo = _mm_srai_epi16(_mm_add_epi16(_mm_mullo_epi16(v_d_lo, v_k18), v_k63), (int32_t)(7u)); + v_hi = _mm_srai_epi16(_mm_add_epi16(_mm_mullo_epi16(v_d_hi, v_k18), v_k63), (int32_t)(7u)); + v_a2 = _mm_packs_epi16(v_lo, v_hi); + v_lo = _mm_srai_epi16(_mm_add_epi16(_mm_mullo_epi16(v_d_lo, v_k9), v_k63), (int32_t)(7u)); + v_hi = _mm_srai_epi16(_mm_add_epi16(_mm_mullo_epi16(v_d_hi, v_k9), v_k63), (int32_t)(7u)); + v_a3 = _mm_packs_epi16(v_lo, v_hi); + v_p0_adj = _mm_or_si128(_mm_andnot_si128(v_not_hev, v_v3), _mm_and_si128(v_a1, v_not_hev)); + v_p0 = _mm_adds_epi8(v_p0, v_p0_adj); + v_q0_adj = _mm_or_si128(_mm_andnot_si128(v_not_hev, v_v4), _mm_and_si128(v_a1, v_not_hev)); + v_q0 = _mm_subs_epi8(v_q0, v_q0_adj); + v_p1 = _mm_adds_epi8(v_p1, _mm_and_si128(v_a2, v_not_hev)); + v_q1 = _mm_subs_epi8(v_q1, _mm_and_si128(v_a2, v_not_hev)); + v_p2 = _mm_adds_epi8(v_p2, _mm_and_si128(v_a3, v_not_hev)); + v_q2 = _mm_subs_epi8(v_q2, _mm_and_si128(v_a3, v_not_hev)); + v_p2 = _mm_xor_si128(v_p2, v_sign_bit); + v_p1 = _mm_xor_si128(v_p1, v_sign_bit); + v_p0 = _mm_xor_si128(v_p0, v_sign_bit); + v_q0 = _mm_xor_si128(v_q0, v_sign_bit); + v_q1 = _mm_xor_si128(v_q1, v_sign_bit); + v_q2 = _mm_xor_si128(v_q2, v_sign_bit); + v_f0 = 
_mm_unpacklo_epi8(v_p3, v_p2); + v_f1 = _mm_unpacklo_epi8(v_p1, v_p0); + v_f2 = _mm_unpacklo_epi8(v_q0, v_q1); + v_f3 = _mm_unpacklo_epi8(v_q2, v_q3); + v_g0 = _mm_unpacklo_epi16(v_f0, v_f1); + v_g1 = _mm_unpackhi_epi16(v_f0, v_f1); + v_g2 = _mm_unpacklo_epi16(v_f2, v_f3); + v_g3 = _mm_unpackhi_epi16(v_f2, v_f3); + v_f0 = _mm_unpacklo_epi32(v_g0, v_g2); + v_f1 = _mm_unpackhi_epi32(v_g0, v_g2); + v_f2 = _mm_unpacklo_epi32(v_g1, v_g3); + v_f3 = _mm_unpackhi_epi32(v_g1, v_g3); + if ((a_q0_off - 4u) > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, (a_q0_off - 4u)); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si64((void*)(a_workbuf.ptr), v_f0); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + v_ra = _mm_unpackhi_epi64(v_f0, v_f0); + _mm_storeu_si64((void*)(a_workbuf.ptr), v_ra); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si64((void*)(a_workbuf.ptr), v_f1); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + v_ra = _mm_unpackhi_epi64(v_f1, v_f1); + _mm_storeu_si64((void*)(a_workbuf.ptr), v_ra); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si64((void*)(a_workbuf.ptr), v_f2); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + v_ra = _mm_unpackhi_epi64(v_f2, v_f2); + _mm_storeu_si64((void*)(a_workbuf.ptr), v_ra); + a_workbuf = 
wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si64((void*)(a_workbuf.ptr), v_f3); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + v_ra = _mm_unpackhi_epi64(v_f3, v_f3); + if (8u > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si64((void*)(a_workbuf.ptr), v_ra); + return wuffs_base__make_empty_struct(); +} +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) +// ‼ WUFFS MULTI-FILE SECTION -x86_sse42

// ‼ WUFFS MULTI-FILE SECTION +x86_sse42
// -------- func vp8.decoder.normal_hfilter_inner_16_x86_sse42

#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2)
// Filters, in place, an 8-byte-wide strip of 16 consecutive rows of
// a_workbuf.
//
// The first row's 8 bytes start at a_workbuf[a_q0_off - 4]; each further row
// starts self->private_impl.f_y_stride bytes after the previous one. Read left
// to right, a row's bytes are the samples p3, p2, p1, p0, q0, q1, q2, q3.
//
// Only the low 8 bits of a_level, a_ilevel and a_hlevel are used:
//  - a_level bounds 2*|p0 - q0| + |p1 - q1|/2 (saturating unsigned math).
//  - a_ilevel bounds |p3 - p2|, |p2 - p1|, |p1 - p0|, |q0 - q1|, |q1 - q2|
//    and |q2 - q3|.
//  - a_hlevel bounds |p1 - p0| and |q1 - q0|. Rows within that bound also get
//    p1 and q1 adjusted; rows above it fold (p1 - q1) into the p0/q0 step.
// Rows failing the a_level or a_ilevel test get a zero step. p3, p2, q2 and
// q3 are never changed, but all 8 bytes of every row are written back.
//
// Every load and store is preceded by a length check; the function returns
// early when the stride is below 8, a_q0_off is below 4, or a_workbuf is too
// short.
WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2")
WUFFS_BASE__GENERATED_C_CODE
static wuffs_base__empty_struct
wuffs_vp8__decoder__normal_hfilter_inner_16_x86_sse42(
    wuffs_vp8__decoder* self,
    wuffs_base__slice_u8 a_workbuf,
    uint64_t a_q0_off,
    uint32_t a_level,
    uint32_t a_ilevel,
    uint32_t a_hlevel) {
  const uint64_t pitch = ((uint64_t)(self->private_impl.f_y_stride));
  wuffs_base__slice_u8 cur = a_workbuf;
  __m128i row[16];  // One 8-byte row per element (low 64 bits).
  __m128i col[8];   // p3, p2, p1, p0, q0, q1, q2, q3; one byte lane per row.
  __m128i x[8];
  __m128i y[8];
  uint32_t i = 0u;

  if ((pitch < 8u) || (a_q0_off < 4u) ||
      ((a_q0_off - 4u) > ((uint64_t)(cur.len)))) {
    return wuffs_base__make_empty_struct();
  }
  cur = wuffs_base__slice_u8__subslice_i(cur, (a_q0_off - 4u));

  // Gather the 16 rows. All but the last need a full stride of remaining
  // bytes (so the cursor can advance); the last needs only its own 8 bytes.
  for (i = 0u; i < 16u; i++) {
    if (((i < 15u) ? pitch : 8u) > ((uint64_t)(cur.len))) {
      return wuffs_base__make_empty_struct();
    }
    row[i] = _mm_loadl_epi64((const __m128i*)(const void*)(cur.ptr));
    if (i < 15u) {
      cur = wuffs_base__slice_u8__subslice_i(cur, pitch);
    }
  }

  // Transpose 16 rows x 8 bytes into 8 columns x 16 bytes.
  for (i = 0u; i < 8u; i++) {
    x[i] = _mm_unpacklo_epi8(row[2u * i], row[(2u * i) + 1u]);
  }
  for (i = 0u; i < 8u; i += 2u) {
    y[i] = _mm_unpacklo_epi16(x[i], x[i + 1u]);
    y[i + 1u] = _mm_unpackhi_epi16(x[i], x[i + 1u]);
  }
  for (i = 0u; i < 8u; i += 4u) {
    x[i + 0u] = _mm_unpacklo_epi32(y[i + 0u], y[i + 2u]);
    x[i + 1u] = _mm_unpackhi_epi32(y[i + 0u], y[i + 2u]);
    x[i + 2u] = _mm_unpacklo_epi32(y[i + 1u], y[i + 3u]);
    x[i + 3u] = _mm_unpackhi_epi32(y[i + 1u], y[i + 3u]);
  }
  for (i = 0u; i < 4u; i++) {
    col[2u * i] = _mm_unpacklo_epi64(x[i], x[i + 4u]);
    col[(2u * i) + 1u] = _mm_unpackhi_epi64(x[i], x[i + 4u]);
  }

  const __m128i zero = _mm_setzero_si128();
  const __m128i flip = _mm_set1_epi8((int8_t)(128u));
  const __m128i even = _mm_set1_epi8((int8_t)(254u));
  const __m128i edge_limit = _mm_set1_epi8((int8_t)(((uint8_t)(a_level))));
  const __m128i inner_limit = _mm_set1_epi8((int8_t)(((uint8_t)(a_ilevel))));
  const __m128i hev_limit = _mm_set1_epi8((int8_t)(((uint8_t)(a_hlevel))));

  // keep: all-ones in lanes that pass the edge test and all six
  // adjacent-sample tests.
  __m128i t = _mm_or_si128(_mm_subs_epu8(col[2], col[5]),
                           _mm_subs_epu8(col[5], col[2]));
  __m128i u = _mm_srli_epi16(_mm_and_si128(t, even), 1);
  t = _mm_or_si128(_mm_subs_epu8(col[3], col[4]),
                   _mm_subs_epu8(col[4], col[3]));
  t = _mm_adds_epu8(_mm_adds_epu8(t, t), u);
  __m128i keep = _mm_cmpeq_epi8(_mm_subs_epu8(t, edge_limit), zero);
  for (i = 0u; i < 7u; i++) {
    if (i == 3u) {
      continue;  // The p0/q0 pair is handled by the edge test above.
    }
    t = _mm_or_si128(_mm_subs_epu8(col[i], col[i + 1u]),
                     _mm_subs_epu8(col[i + 1u], col[i]));
    keep = _mm_and_si128(
        keep, _mm_cmpeq_epi8(_mm_subs_epu8(t, inner_limit), zero));
  }

  // flat: all-ones where neither |p1 - p0| nor |q1 - q0| exceeds hev_limit.
  t = _mm_or_si128(_mm_subs_epu8(col[2], col[3]),
                   _mm_subs_epu8(col[3], col[2]));
  u = _mm_or_si128(_mm_subs_epu8(col[5], col[4]),
                   _mm_subs_epu8(col[4], col[5]));
  const __m128i flat = _mm_cmpeq_epi8(
      _mm_or_si128(_mm_subs_epu8(t, hev_limit), _mm_subs_epu8(u, hev_limit)),
      zero);

  // Treat p1, p0, q0, q1 as signed bytes while adjusting them.
  for (i = 2u; i < 6u; i++) {
    col[i] = _mm_xor_si128(col[i], flip);
  }

  // step = ((p1 - q1) where not flat) + 3 * (q0 - p0), saturating, masked.
  t = _mm_andnot_si128(flat, _mm_subs_epi8(col[2], col[5]));
  u = _mm_subs_epi8(col[4], col[3]);
  t = _mm_adds_epi8(_mm_adds_epi8(_mm_adds_epi8(t, u), u), u);
  const __m128i step = _mm_and_si128(t, keep);

  // Signed byte shifts: put each byte in the high half of a 16-bit lane,
  // shift arithmetically by 8 + n, then pack back down to bytes.
  t = _mm_adds_epi8(step, _mm_set1_epi8((int8_t)(4u)));
  const __m128i dn =
      _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(zero, t), 11),
                      _mm_srai_epi16(_mm_unpackhi_epi8(zero, t), 11));
  t = _mm_adds_epi8(step, _mm_set1_epi8((int8_t)(3u)));
  const __m128i up =
      _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(zero, t), 11),
                      _mm_srai_epi16(_mm_unpackhi_epi8(zero, t), 11));
  col[4] = _mm_subs_epi8(col[4], dn);
  col[3] = _mm_adds_epi8(col[3], up);

  // (dn + 1) >> 1, applied to p1 and q1 in flat lanes only.
  t = _mm_adds_epi8(dn, _mm_set1_epi8((int8_t)(1u)));
  t = _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(zero, t), 9),
                      _mm_srai_epi16(_mm_unpackhi_epi8(zero, t), 9));
  t = _mm_and_si128(t, flat);
  col[5] = _mm_subs_epi8(col[5], t);
  col[2] = _mm_adds_epi8(col[2], t);

  for (i = 2u; i < 6u; i++) {
    col[i] = _mm_xor_si128(col[i], flip);
  }

  // Transpose back to 16 rows of 8 bytes.
  for (i = 0u; i < 4u; i++) {
    x[2u * i] = _mm_unpacklo_epi8(col[2u * i], col[(2u * i) + 1u]);
    x[(2u * i) + 1u] = _mm_unpackhi_epi8(col[2u * i], col[(2u * i) + 1u]);
  }
  y[0] = _mm_unpacklo_epi16(x[0], x[2]);
  y[1] = _mm_unpackhi_epi16(x[0], x[2]);
  y[2] = _mm_unpacklo_epi16(x[4], x[6]);
  y[3] = _mm_unpackhi_epi16(x[4], x[6]);
  y[4] = _mm_unpacklo_epi16(x[1], x[3]);
  y[5] = _mm_unpackhi_epi16(x[1], x[3]);
  y[6] = _mm_unpacklo_epi16(x[5], x[7]);
  y[7] = _mm_unpackhi_epi16(x[5], x[7]);
  for (i = 0u; i < 8u; i += 4u) {
    x[i + 0u] = _mm_unpacklo_epi32(y[i + 0u], y[i + 2u]);
    x[i + 1u] = _mm_unpackhi_epi32(y[i + 0u], y[i + 2u]);
    x[i + 2u] = _mm_unpacklo_epi32(y[i + 1u], y[i + 3u]);
    x[i + 3u] = _mm_unpackhi_epi32(y[i + 1u], y[i + 3u]);
  }
  for (i = 0u; i < 8u; i++) {
    row[2u * i] = x[i];
    row[(2u * i) + 1u] = _mm_unpackhi_epi64(x[i], x[i]);
  }

  // Scatter the rows, repeating the bounds checks made while gathering.
  cur = a_workbuf;
  if ((a_q0_off - 4u) > ((uint64_t)(cur.len))) {
    return wuffs_base__make_empty_struct();
  }
  cur = wuffs_base__slice_u8__subslice_i(cur, (a_q0_off - 4u));
  for (i = 0u; i < 16u; i++) {
    if (((i < 15u) ? pitch : 8u) > ((uint64_t)(cur.len))) {
      return wuffs_base__make_empty_struct();
    }
    _mm_storeu_si64((void*)(cur.ptr), row[i]);
    if (i < 15u) {
      cur = wuffs_base__slice_u8__subslice_i(cur, pitch);
    }
  }
  return wuffs_base__make_empty_struct();
}
#endif  // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2)
// ‼ WUFFS MULTI-FILE SECTION -x86_sse42

// ‼ WUFFS MULTI-FILE SECTION +x86_sse42
// -------- func vp8.decoder.normal_hfilter_inner_8_x86_sse42

#if
defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) +WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2") +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_hfilter_inner_8_x86_sse42( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + wuffs_base__slice_u8 v_wb = {0}; + uint64_t v_stride = 0; + __m128i v_ra = {0}; + __m128i v_rb = {0}; + __m128i v_f0 = {0}; + __m128i v_f1 = {0}; + __m128i v_f2 = {0}; + __m128i v_f3 = {0}; + __m128i v_g0 = {0}; + __m128i v_g1 = {0}; + __m128i v_g2 = {0}; + __m128i v_g3 = {0}; + __m128i v_p3 = {0}; + __m128i v_p2 = {0}; + __m128i v_p1 = {0}; + __m128i v_p0 = {0}; + __m128i v_q0 = {0}; + __m128i v_q1 = {0}; + __m128i v_q2 = {0}; + __m128i v_q3 = {0}; + __m128i v_zero = {0}; + __m128i v_sign_bit = {0}; + __m128i v_kFE = {0}; + __m128i v_m_thresh = {0}; + __m128i v_m_ithresh = {0}; + __m128i v_m_hthresh = {0}; + __m128i v_k1 = {0}; + __m128i v_k3 = {0}; + __m128i v_k4 = {0}; + __m128i v_mask = {0}; + __m128i v_not_hev = {0}; + __m128i v_delta = {0}; + __m128i v_v3 = {0}; + __m128i v_v4 = {0}; + __m128i v_a3 = {0}; + __m128i v_t1 = {0}; + __m128i v_t2 = {0}; + __m128i v_t3 = {0}; + __m128i v_lo = {0}; + __m128i v_hi = {0}; + + v_stride = ((uint64_t)(self->private_impl.f_uv_stride)); + if (v_stride < 8u) { + return wuffs_base__make_empty_struct(); + } + if (a_q0_off < 4u) { + return wuffs_base__make_empty_struct(); + } + v_wb = a_workbuf; + if ((a_q0_off - 4u) > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, (a_q0_off - 4u)); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_ra = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + 
v_rb = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + v_f0 = _mm_unpacklo_epi8(v_ra, v_rb); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_ra = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_rb = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + v_f1 = _mm_unpacklo_epi8(v_ra, v_rb); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_ra = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_rb = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + v_f2 = _mm_unpacklo_epi8(v_ra, v_rb); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_ra = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_rb = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_f3 = _mm_unpacklo_epi8(v_ra, v_rb); + v_g0 = _mm_unpacklo_epi16(v_f0, v_f1); + v_g1 = _mm_unpackhi_epi16(v_f0, v_f1); + v_g2 = _mm_unpacklo_epi16(v_f2, v_f3); + v_g3 = _mm_unpackhi_epi16(v_f2, v_f3); + v_f0 = _mm_unpacklo_epi32(v_g0, v_g2); + v_f1 = _mm_unpackhi_epi32(v_g0, v_g2); + v_f2 = _mm_unpacklo_epi32(v_g1, v_g3); + v_f3 = _mm_unpackhi_epi32(v_g1, v_g3); + v_p3 = v_f0; + v_p2 = _mm_unpackhi_epi64(v_f0, v_f0); + v_p1 = v_f1; + v_p0 = _mm_unpackhi_epi64(v_f1, v_f1); + v_q0 = v_f2; + v_q1 = _mm_unpackhi_epi64(v_f2, 
v_f2); + v_q2 = v_f3; + v_q3 = _mm_unpackhi_epi64(v_f3, v_f3); + v_zero = _mm_setzero_si128(); + v_sign_bit = _mm_set1_epi8((int8_t)(128u)); + v_kFE = _mm_set1_epi8((int8_t)(254u)); + v_m_thresh = _mm_set1_epi8((int8_t)(((uint8_t)(a_level)))); + v_m_ithresh = _mm_set1_epi8((int8_t)(((uint8_t)(a_ilevel)))); + v_m_hthresh = _mm_set1_epi8((int8_t)(((uint8_t)(a_hlevel)))); + v_k1 = _mm_set1_epi8((int8_t)(1u)); + v_k3 = _mm_set1_epi8((int8_t)(3u)); + v_k4 = _mm_set1_epi8((int8_t)(4u)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_p1, v_q1), _mm_subs_epu8(v_q1, v_p1)); + v_t2 = _mm_srli_epi16(_mm_and_si128(v_t1, v_kFE), (int32_t)(1u)); + v_t3 = _mm_or_si128(_mm_subs_epu8(v_p0, v_q0), _mm_subs_epu8(v_q0, v_p0)); + v_t3 = _mm_adds_epu8(v_t3, v_t3); + v_t3 = _mm_adds_epu8(v_t3, v_t2); + v_mask = _mm_cmpeq_epi8(_mm_subs_epu8(v_t3, v_m_thresh), v_zero); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_p3, v_p2), _mm_subs_epu8(v_p2, v_p3)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_p2, v_p1), _mm_subs_epu8(v_p1, v_p2)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_p1, v_p0), _mm_subs_epu8(v_p0, v_p1)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_q0, v_q1), _mm_subs_epu8(v_q1, v_q0)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_q1, v_q2), _mm_subs_epu8(v_q2, v_q1)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_q2, v_q3), _mm_subs_epu8(v_q3, v_q2)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_p1, v_p0), _mm_subs_epu8(v_p0, v_p1)); + v_t2 = _mm_or_si128(_mm_subs_epu8(v_q1, v_q0), 
_mm_subs_epu8(v_q0, v_q1)); + v_t3 = _mm_or_si128(_mm_subs_epu8(v_t1, v_m_hthresh), _mm_subs_epu8(v_t2, v_m_hthresh)); + v_not_hev = _mm_cmpeq_epi8(v_t3, v_zero); + v_p1 = _mm_xor_si128(v_p1, v_sign_bit); + v_p0 = _mm_xor_si128(v_p0, v_sign_bit); + v_q0 = _mm_xor_si128(v_q0, v_sign_bit); + v_q1 = _mm_xor_si128(v_q1, v_sign_bit); + v_t1 = _mm_subs_epi8(v_p1, v_q1); + v_t1 = _mm_andnot_si128(v_not_hev, v_t1); + v_t2 = _mm_subs_epi8(v_q0, v_p0); + v_t1 = _mm_adds_epi8(v_t1, v_t2); + v_t1 = _mm_adds_epi8(v_t1, v_t2); + v_delta = _mm_adds_epi8(v_t1, v_t2); + v_delta = _mm_and_si128(v_delta, v_mask); + v_v4 = _mm_adds_epi8(v_delta, v_k4); + v_lo = _mm_unpacklo_epi8(v_zero, v_v4); + v_hi = _mm_unpackhi_epi8(v_zero, v_v4); + v_lo = _mm_srai_epi16(v_lo, (int32_t)(11u)); + v_hi = _mm_srai_epi16(v_hi, (int32_t)(11u)); + v_v4 = _mm_packs_epi16(v_lo, v_hi); + v_v3 = _mm_adds_epi8(v_delta, v_k3); + v_lo = _mm_unpacklo_epi8(v_zero, v_v3); + v_hi = _mm_unpackhi_epi8(v_zero, v_v3); + v_lo = _mm_srai_epi16(v_lo, (int32_t)(11u)); + v_hi = _mm_srai_epi16(v_hi, (int32_t)(11u)); + v_v3 = _mm_packs_epi16(v_lo, v_hi); + v_q0 = _mm_subs_epi8(v_q0, v_v4); + v_p0 = _mm_adds_epi8(v_p0, v_v3); + v_a3 = _mm_adds_epi8(v_v4, v_k1); + v_lo = _mm_unpacklo_epi8(v_zero, v_a3); + v_hi = _mm_unpackhi_epi8(v_zero, v_a3); + v_lo = _mm_srai_epi16(v_lo, (int32_t)(9u)); + v_hi = _mm_srai_epi16(v_hi, (int32_t)(9u)); + v_a3 = _mm_packs_epi16(v_lo, v_hi); + v_a3 = _mm_and_si128(v_a3, v_not_hev); + v_q1 = _mm_subs_epi8(v_q1, v_a3); + v_p1 = _mm_adds_epi8(v_p1, v_a3); + v_p1 = _mm_xor_si128(v_p1, v_sign_bit); + v_p0 = _mm_xor_si128(v_p0, v_sign_bit); + v_q0 = _mm_xor_si128(v_q0, v_sign_bit); + v_q1 = _mm_xor_si128(v_q1, v_sign_bit); + v_f0 = _mm_unpacklo_epi8(v_p3, v_p2); + v_f1 = _mm_unpacklo_epi8(v_p1, v_p0); + v_f2 = _mm_unpacklo_epi8(v_q0, v_q1); + v_f3 = _mm_unpacklo_epi8(v_q2, v_q3); + v_g0 = _mm_unpacklo_epi16(v_f0, v_f1); + v_g1 = _mm_unpackhi_epi16(v_f0, v_f1); + v_g2 = _mm_unpacklo_epi16(v_f2, v_f3); + 
v_g3 = _mm_unpackhi_epi16(v_f2, v_f3); + v_f0 = _mm_unpacklo_epi32(v_g0, v_g2); + v_f1 = _mm_unpackhi_epi32(v_g0, v_g2); + v_f2 = _mm_unpacklo_epi32(v_g1, v_g3); + v_f3 = _mm_unpackhi_epi32(v_g1, v_g3); + if ((a_q0_off - 4u) > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, (a_q0_off - 4u)); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si64((void*)(a_workbuf.ptr), v_f0); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + v_ra = _mm_unpackhi_epi64(v_f0, v_f0); + _mm_storeu_si64((void*)(a_workbuf.ptr), v_ra); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si64((void*)(a_workbuf.ptr), v_f1); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + v_ra = _mm_unpackhi_epi64(v_f1, v_f1); + _mm_storeu_si64((void*)(a_workbuf.ptr), v_ra); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si64((void*)(a_workbuf.ptr), v_f2); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + v_ra = _mm_unpackhi_epi64(v_f2, v_f2); + _mm_storeu_si64((void*)(a_workbuf.ptr), v_ra); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si64((void*)(a_workbuf.ptr), v_f3); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + v_ra = 
_mm_unpackhi_epi64(v_f3, v_f3); + if (8u > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si64((void*)(a_workbuf.ptr), v_ra); + return wuffs_base__make_empty_struct(); +} +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) +// ‼ WUFFS MULTI-FILE SECTION -x86_sse42 + +// ‼ WUFFS MULTI-FILE SECTION +x86_sse42 +// -------- func vp8.decoder.normal_vfilter_inner_8_x86_sse42 + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) +WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2") +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__normal_vfilter_inner_8_x86_sse42( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_q0_off, + uint32_t a_level, + uint32_t a_ilevel, + uint32_t a_hlevel) { + wuffs_base__slice_u8 v_wb = {0}; + uint64_t v_stride = 0; + __m128i v_p3 = {0}; + __m128i v_p2 = {0}; + __m128i v_p1 = {0}; + __m128i v_p0 = {0}; + __m128i v_q0 = {0}; + __m128i v_q1 = {0}; + __m128i v_q2 = {0}; + __m128i v_q3 = {0}; + __m128i v_zero = {0}; + __m128i v_sign_bit = {0}; + __m128i v_kFE = {0}; + __m128i v_m_thresh = {0}; + __m128i v_m_ithresh = {0}; + __m128i v_m_hthresh = {0}; + __m128i v_k1 = {0}; + __m128i v_k3 = {0}; + __m128i v_k4 = {0}; + __m128i v_mask = {0}; + __m128i v_not_hev = {0}; + __m128i v_delta = {0}; + __m128i v_v3 = {0}; + __m128i v_v4 = {0}; + __m128i v_a3 = {0}; + __m128i v_t1 = {0}; + __m128i v_t2 = {0}; + __m128i v_t3 = {0}; + __m128i v_lo = {0}; + __m128i v_hi = {0}; + + v_stride = ((uint64_t)(self->private_impl.f_uv_stride)); + if (v_stride < 8u) { + return wuffs_base__make_empty_struct(); + } + if (a_q0_off < (4u * v_stride)) { + return wuffs_base__make_empty_struct(); + } + v_wb = a_workbuf; + if ((a_q0_off - (4u * v_stride)) <= ((uint64_t)(v_wb.len))) { + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, (a_q0_off - (4u * v_stride))); + } else { + return wuffs_base__make_empty_struct(); + } + if (v_stride > ((uint64_t)(v_wb.len))) { + 
return wuffs_base__make_empty_struct(); + } + v_p3 = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_p2 = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_p1 = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_p0 = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_q0 = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_q1 = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (v_stride > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_q2 = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_wb = wuffs_base__slice_u8__subslice_i(v_wb, v_stride); + if (8u > ((uint64_t)(v_wb.len))) { + return wuffs_base__make_empty_struct(); + } + v_q3 = _mm_loadl_epi64((const __m128i*)(const void*)(v_wb.ptr)); + v_zero = _mm_setzero_si128(); + v_sign_bit = _mm_set1_epi8((int8_t)(128u)); + v_kFE = _mm_set1_epi8((int8_t)(254u)); + v_m_thresh = _mm_set1_epi8((int8_t)(((uint8_t)(a_level)))); + v_m_ithresh = _mm_set1_epi8((int8_t)(((uint8_t)(a_ilevel)))); + v_m_hthresh = _mm_set1_epi8((int8_t)(((uint8_t)(a_hlevel)))); + v_k1 = _mm_set1_epi8((int8_t)(1u)); + v_k3 = _mm_set1_epi8((int8_t)(3u)); + 
v_k4 = _mm_set1_epi8((int8_t)(4u)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_p1, v_q1), _mm_subs_epu8(v_q1, v_p1)); + v_t2 = _mm_srli_epi16(_mm_and_si128(v_t1, v_kFE), (int32_t)(1u)); + v_t3 = _mm_or_si128(_mm_subs_epu8(v_p0, v_q0), _mm_subs_epu8(v_q0, v_p0)); + v_t3 = _mm_adds_epu8(v_t3, v_t3); + v_t3 = _mm_adds_epu8(v_t3, v_t2); + v_mask = _mm_cmpeq_epi8(_mm_subs_epu8(v_t3, v_m_thresh), v_zero); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_p3, v_p2), _mm_subs_epu8(v_p2, v_p3)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_p2, v_p1), _mm_subs_epu8(v_p1, v_p2)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_p1, v_p0), _mm_subs_epu8(v_p0, v_p1)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_q0, v_q1), _mm_subs_epu8(v_q1, v_q0)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_q1, v_q2), _mm_subs_epu8(v_q2, v_q1)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_q2, v_q3), _mm_subs_epu8(v_q3, v_q2)); + v_mask = _mm_and_si128(v_mask, _mm_cmpeq_epi8(_mm_subs_epu8(v_t1, v_m_ithresh), v_zero)); + v_t1 = _mm_or_si128(_mm_subs_epu8(v_p1, v_p0), _mm_subs_epu8(v_p0, v_p1)); + v_t2 = _mm_or_si128(_mm_subs_epu8(v_q1, v_q0), _mm_subs_epu8(v_q0, v_q1)); + v_t3 = _mm_or_si128(_mm_subs_epu8(v_t1, v_m_hthresh), _mm_subs_epu8(v_t2, v_m_hthresh)); + v_not_hev = _mm_cmpeq_epi8(v_t3, v_zero); + v_p1 = _mm_xor_si128(v_p1, v_sign_bit); + v_p0 = _mm_xor_si128(v_p0, v_sign_bit); + v_q0 = _mm_xor_si128(v_q0, v_sign_bit); + v_q1 = _mm_xor_si128(v_q1, v_sign_bit); + v_t1 = _mm_subs_epi8(v_p1, v_q1); + v_t1 = _mm_andnot_si128(v_not_hev, v_t1); + v_t2 = _mm_subs_epi8(v_q0, v_p0); + v_t1 = 
_mm_adds_epi8(v_t1, v_t2); + v_t1 = _mm_adds_epi8(v_t1, v_t2); + v_delta = _mm_adds_epi8(v_t1, v_t2); + v_delta = _mm_and_si128(v_delta, v_mask); + v_v4 = _mm_adds_epi8(v_delta, v_k4); + v_lo = _mm_unpacklo_epi8(v_zero, v_v4); + v_hi = _mm_unpackhi_epi8(v_zero, v_v4); + v_lo = _mm_srai_epi16(v_lo, (int32_t)(11u)); + v_hi = _mm_srai_epi16(v_hi, (int32_t)(11u)); + v_v4 = _mm_packs_epi16(v_lo, v_hi); + v_v3 = _mm_adds_epi8(v_delta, v_k3); + v_lo = _mm_unpacklo_epi8(v_zero, v_v3); + v_hi = _mm_unpackhi_epi8(v_zero, v_v3); + v_lo = _mm_srai_epi16(v_lo, (int32_t)(11u)); + v_hi = _mm_srai_epi16(v_hi, (int32_t)(11u)); + v_v3 = _mm_packs_epi16(v_lo, v_hi); + v_q0 = _mm_subs_epi8(v_q0, v_v4); + v_p0 = _mm_adds_epi8(v_p0, v_v3); + v_a3 = _mm_adds_epi8(v_v4, v_k1); + v_lo = _mm_unpacklo_epi8(v_zero, v_a3); + v_hi = _mm_unpackhi_epi8(v_zero, v_a3); + v_lo = _mm_srai_epi16(v_lo, (int32_t)(9u)); + v_hi = _mm_srai_epi16(v_hi, (int32_t)(9u)); + v_a3 = _mm_packs_epi16(v_lo, v_hi); + v_a3 = _mm_and_si128(v_a3, v_not_hev); + v_q1 = _mm_subs_epi8(v_q1, v_a3); + v_p1 = _mm_adds_epi8(v_p1, v_a3); + v_p1 = _mm_xor_si128(v_p1, v_sign_bit); + v_p0 = _mm_xor_si128(v_p0, v_sign_bit); + v_q0 = _mm_xor_si128(v_q0, v_sign_bit); + v_q1 = _mm_xor_si128(v_q1, v_sign_bit); + if (a_q0_off < (2u * v_stride)) { + return wuffs_base__make_empty_struct(); + } + if ((a_q0_off - (2u * v_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, (a_q0_off - (2u * v_stride))); + } else { + return wuffs_base__make_empty_struct(); + } + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si64((void*)(a_workbuf.ptr), v_p1); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si64((void*)(a_workbuf.ptr), v_p0); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (v_stride > 
((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si64((void*)(a_workbuf.ptr), v_q0); + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_stride); + if (8u > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_empty_struct(); + } + _mm_storeu_si64((void*)(a_workbuf.ptr), v_q1); + return wuffs_base__make_empty_struct(); +} +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) +// ‼ WUFFS MULTI-FILE SECTION -x86_sse42 + +// -------- func vp8.decoder.decode_partition0 + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__decode_partition0( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf) { + self->private_impl.f_bool_ri = 0u; + self->private_impl.f_bool_wi = 0u; + wuffs_vp8__decoder__bool_fill_from_workbuf(self, a_workbuf); + wuffs_vp8__decoder__bool_init(self); + if (self->private_impl.f_key_frame) { + wuffs_vp8__decoder__bool_read_literal(self, 2u); + } + wuffs_vp8__decoder__decode_segmentation(self); + wuffs_vp8__decoder__decode_loop_filter(self); + wuffs_vp8__decoder__decode_partitions(self); + wuffs_vp8__decoder__decode_quant_indices(self); + if (self->private_impl.f_key_frame) { + wuffs_vp8__decoder__bool_read_literal(self, 1u); + } + wuffs_vp8__decoder__decode_coeff_prob_updates(self); + wuffs_vp8__decoder__decode_mb_skip_coeff(self); + wuffs_vp8__decoder__compute_dequant_values(self); + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.decode_segmentation + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__decode_segmentation( + wuffs_vp8__decoder* self) { + uint32_t v_v = 0; + uint32_t v_i = 0; + uint32_t v_val = 0; + uint32_t v_update_feature_data = 0; + + v_v = wuffs_vp8__decoder__bool_read_bool(self, 128u); + if (v_v == 0u) { + self->private_impl.f_use_segment = false; + return wuffs_base__make_empty_struct(); + } + self->private_impl.f_use_segment = true; + v_v = 
wuffs_vp8__decoder__bool_read_bool(self, 128u); + self->private_impl.f_update_segment_map = (v_v != 0u); + v_update_feature_data = wuffs_vp8__decoder__bool_read_bool(self, 128u); + if (v_update_feature_data != 0u) { + v_v = wuffs_vp8__decoder__bool_read_bool(self, 128u); + self->private_impl.f_segment_is_abs = (v_v != 0u); + v_i = 0u; + while (v_i < 4u) { + self->private_impl.f_segment_quant[v_i] = wuffs_vp8__decoder__bool_read_signed(self, 7u); + v_i += 1u; + } + v_i = 0u; + while (v_i < 4u) { + self->private_impl.f_segment_lf[v_i] = wuffs_vp8__decoder__bool_read_signed(self, 6u); + v_i += 1u; + } + } + if (self->private_impl.f_update_segment_map) { + v_i = 0u; + while (v_i < 3u) { + v_v = wuffs_vp8__decoder__bool_read_bool(self, 128u); + if (v_v != 0u) { + v_val = wuffs_vp8__decoder__bool_read_literal(self, 8u); + self->private_impl.f_segment_prob[v_i] = ((uint8_t)(v_val)); + } else { + self->private_impl.f_segment_prob[v_i] = 255u; + } + v_i += 1u; + } + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.decode_loop_filter + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__decode_loop_filter( + wuffs_vp8__decoder* self) { + uint32_t v_v = 0; + uint32_t v_i = 0; + uint32_t v_val = 0; + + v_val = wuffs_vp8__decoder__bool_read_literal(self, 1u); + self->private_impl.f_filter_type = ((uint8_t)((v_val & 1u))); + v_val = wuffs_vp8__decoder__bool_read_literal(self, 6u); + self->private_impl.f_filter_level = ((uint8_t)((v_val & 63u))); + v_val = wuffs_vp8__decoder__bool_read_literal(self, 3u); + self->private_impl.f_sharpness_level = ((uint8_t)((v_val & 7u))); + v_v = wuffs_vp8__decoder__bool_read_bool(self, 128u); + self->private_impl.f_lf_delta_enabled = (v_v != 0u); + if (self->private_impl.f_lf_delta_enabled) { + v_v = wuffs_vp8__decoder__bool_read_bool(self, 128u); + if (v_v != 0u) { + v_i = 0u; + while (v_i < 4u) { + v_v = wuffs_vp8__decoder__bool_read_bool(self, 128u); + if (v_v != 0u) { + v_val = 
wuffs_vp8__decoder__bool_read_literal(self, 6u); + v_val = (v_val & 63u); + v_v = wuffs_vp8__decoder__bool_read_bool(self, 128u); + if (v_v != 0u) { + self->private_impl.f_lf_ref_delta[v_i] = - ((int32_t)(v_val)); + } else { + self->private_impl.f_lf_ref_delta[v_i] = ((int32_t)(v_val)); + } + } + v_i += 1u; + } + v_i = 0u; + while (v_i < 4u) { + v_v = wuffs_vp8__decoder__bool_read_bool(self, 128u); + if (v_v != 0u) { + v_val = wuffs_vp8__decoder__bool_read_literal(self, 6u); + v_val = (v_val & 63u); + v_v = wuffs_vp8__decoder__bool_read_bool(self, 128u); + if (v_v != 0u) { + self->private_impl.f_lf_mode_delta[v_i] = - ((int32_t)(v_val)); + } else { + self->private_impl.f_lf_mode_delta[v_i] = ((int32_t)(v_val)); + } + } + v_i += 1u; + } + } + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.decode_partitions + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__decode_partitions( + wuffs_vp8__decoder* self) { + uint32_t v_log2_parts = 0; + + v_log2_parts = wuffs_vp8__decoder__bool_read_literal(self, 2u); + if (v_log2_parts == 0u) { + self->private_impl.f_num_partitions = 1u; + } else if (v_log2_parts == 1u) { + self->private_impl.f_num_partitions = 2u; + } else if (v_log2_parts == 2u) { + self->private_impl.f_num_partitions = 4u; + } else { + self->private_impl.f_num_partitions = 8u; + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.decode_quant_indices + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__decode_quant_indices( + wuffs_vp8__decoder* self) { + uint32_t v_val = 0; + + v_val = wuffs_vp8__decoder__bool_read_literal(self, 7u); + self->private_impl.f_quant_y_ac_qi = ((uint8_t)((v_val & 127u))); + self->private_impl.f_quant_y_dc_delta = wuffs_vp8__decoder__bool_read_signed(self, 4u); + self->private_impl.f_quant_y2_dc_delta = wuffs_vp8__decoder__bool_read_signed(self, 4u); + self->private_impl.f_quant_y2_ac_delta = 
wuffs_vp8__decoder__bool_read_signed(self, 4u); + self->private_impl.f_quant_uv_dc_delta = wuffs_vp8__decoder__bool_read_signed(self, 4u); + self->private_impl.f_quant_uv_ac_delta = wuffs_vp8__decoder__bool_read_signed(self, 4u); + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.decode_coeff_prob_updates + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__decode_coeff_prob_updates( + wuffs_vp8__decoder* self) { + uint32_t v_i = 0; + uint32_t v_flag = 0; + uint32_t v_val = 0; + + v_i = 0u; + while (v_i < 1056u) { + v_flag = wuffs_vp8__decoder__bool_read_bool(self, WUFFS_VP8__COEFF_UPDATE_PROBS[v_i]); + if (v_flag != 0u) { + v_val = wuffs_vp8__decoder__bool_read_literal(self, 8u); + self->private_data.f_coeff_probs[v_i] = ((uint8_t)(v_val)); + } + v_i += 1u; + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.decode_mb_skip_coeff + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__decode_mb_skip_coeff( + wuffs_vp8__decoder* self) { + uint32_t v_val = 0; + + v_val = wuffs_vp8__decoder__bool_read_literal(self, 1u); + self->private_impl.f_mb_no_skip_coeff = (v_val != 0u); + if (self->private_impl.f_mb_no_skip_coeff) { + v_val = wuffs_vp8__decoder__bool_read_literal(self, 8u); + self->private_impl.f_prob_skip_false = ((uint8_t)(v_val)); + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.compute_dequant_values + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__compute_dequant_values( + wuffs_vp8__decoder* self) { + uint32_t v_seg = 0; + uint32_t v_base_qi = 0; + uint32_t v_qi = 0; + int32_t v_seg_delta = 0; + uint32_t v_y_dc = 0; + uint32_t v_y2_dc = 0; + uint32_t v_y2_ac = 0; + uint32_t v_uv_dc = 0; + uint32_t v_uv_ac = 0; + uint32_t v_fl = 0; + + v_base_qi = ((uint32_t)(((uint8_t)(self->private_impl.f_quant_y_ac_qi & 127u)))); + v_seg = 0u; + while (v_seg < 4u) { + if 
(self->private_impl.f_use_segment) { + v_seg_delta = self->private_impl.f_segment_quant[v_seg]; + if (self->private_impl.f_segment_is_abs) { + v_qi = wuffs_vp8__decoder__clamp_qi(self, 0u, v_seg_delta); + } else { + v_qi = wuffs_vp8__decoder__clamp_qi(self, v_base_qi, v_seg_delta); + } + } else { + v_qi = v_base_qi; + } + self->private_impl.f_dequant_y_ac[v_seg] = ((uint32_t)(WUFFS_VP8__AC_QUANT[v_qi])); + v_y_dc = wuffs_vp8__decoder__clamp_qi(self, v_qi, self->private_impl.f_quant_y_dc_delta); + self->private_impl.f_dequant_y_dc[v_seg] = ((uint32_t)(WUFFS_VP8__DC_QUANT[v_y_dc])); + v_y2_dc = wuffs_vp8__decoder__clamp_qi(self, v_qi, self->private_impl.f_quant_y2_dc_delta); + self->private_impl.f_dequant_y2_dc[v_seg] = (((uint32_t)(WUFFS_VP8__DC_QUANT[v_y2_dc])) * 2u); + v_y2_ac = wuffs_vp8__decoder__clamp_qi(self, v_qi, self->private_impl.f_quant_y2_ac_delta); + self->private_impl.f_dequant_y2_ac[v_seg] = ((((uint32_t)(WUFFS_VP8__AC_QUANT[v_y2_ac])) * 155u) / 100u); + if (self->private_impl.f_dequant_y2_ac[v_seg] < 8u) { + self->private_impl.f_dequant_y2_ac[v_seg] = 8u; + } + v_uv_dc = wuffs_vp8__decoder__clamp_qi(self, v_qi, self->private_impl.f_quant_uv_dc_delta); + self->private_impl.f_dequant_uv_dc[v_seg] = ((uint32_t)(WUFFS_VP8__DC_QUANT[v_uv_dc])); + if (self->private_impl.f_dequant_uv_dc[v_seg] > 132u) { + self->private_impl.f_dequant_uv_dc[v_seg] = 132u; + } + v_uv_ac = wuffs_vp8__decoder__clamp_qi(self, v_qi, self->private_impl.f_quant_uv_ac_delta); + self->private_impl.f_dequant_uv_ac[v_seg] = ((uint32_t)(WUFFS_VP8__AC_QUANT[v_uv_ac])); + if (self->private_impl.f_use_segment) { + v_seg_delta = self->private_impl.f_segment_lf[v_seg]; + if (self->private_impl.f_segment_is_abs) { + v_fl = wuffs_vp8__decoder__clamp_qi(self, 0u, v_seg_delta); + } else { + v_fl = wuffs_vp8__decoder__clamp_qi(self, ((uint32_t)(((uint8_t)(self->private_impl.f_filter_level & 127u)))), v_seg_delta); + } + self->private_impl.f_seg_filter_level[v_seg] = ((uint32_t)(v_fl)); + } else { 
+ self->private_impl.f_seg_filter_level[v_seg] = ((uint32_t)(self->private_impl.f_filter_level)); + } + v_seg += 1u; + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.precompute_filter_strengths + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__precompute_filter_strengths( + wuffs_vp8__decoder* self) { + uint32_t v_seg = 0; + uint32_t v_i4x4 = 0; + uint32_t v_idx = 0; + uint32_t v_level = 0; + int32_t v_ref_d = 0; + int32_t v_mode_d = 0; + uint32_t v_ilevel = 0; + uint32_t v_hlevel = 0; + + v_seg = 0u; + while (v_seg < 4u) { + v_i4x4 = 0u; + while (v_i4x4 < 2u) { + v_idx = ((v_seg * 2u) + v_i4x4); + if (v_idx >= 8u) { + break; + } + v_level = self->private_impl.f_seg_filter_level[v_seg]; + if (v_level > 63u) { + v_level = 63u; + } + if (self->private_impl.f_lf_delta_enabled) { + v_ref_d = self->private_impl.f_lf_ref_delta[0u]; + if ((v_ref_d <= -1) && (v_ref_d >= -63)) { + v_level -= ((uint32_t)(( - v_ref_d & 63u))); + } else if (v_ref_d > 0u) { + v_level += ((uint32_t)((v_ref_d & 63u))); + } + if (v_i4x4 != 0u) { + v_mode_d = self->private_impl.f_lf_mode_delta[0u]; + if ((v_mode_d <= -1) && (v_mode_d >= -63)) { + v_level -= ((uint32_t)(( - v_mode_d & 63u))); + } else if (v_mode_d > 0u) { + v_level += ((uint32_t)((v_mode_d & 63u))); + } + } + if (v_level > 63u) { + if ((v_level & 2147483648u) != 0u) { + v_level = 0u; + } else { + v_level = 63u; + } + } + } + if ((v_level > 0u) && (v_level <= 63u)) { + v_ilevel = v_level; + if (self->private_impl.f_sharpness_level > 4u) { + v_ilevel >>= 2u; + } else if (self->private_impl.f_sharpness_level > 0u) { + v_ilevel >>= 1u; + } + if (self->private_impl.f_sharpness_level > 0u) { + if (v_ilevel > (9u - ((uint32_t)(self->private_impl.f_sharpness_level)))) { + v_ilevel = (9u - ((uint32_t)(self->private_impl.f_sharpness_level))); + } + } + if (v_ilevel < 1u) { + v_ilevel = 1u; + } + self->private_impl.f_fstrength_ilevel[v_idx] = ((uint8_t)(v_ilevel)); + if 
(v_level < 15u) { + v_hlevel = 0u; + } else if (v_level < 40u) { + v_hlevel = 1u; + } else { + v_hlevel = 2u; + } + self->private_impl.f_fstrength_hlevel[v_idx] = ((uint8_t)(v_hlevel)); + v_level = ((uint32_t)(((uint32_t)(2u * v_level)) + v_ilevel)); + self->private_impl.f_fstrength_level[v_idx] = ((uint8_t)(v_level)); + } + v_i4x4 += 1u; + } + v_seg += 1u; + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.clamp_qi + +WUFFS_BASE__GENERATED_C_CODE +static uint32_t +wuffs_vp8__decoder__clamp_qi( + wuffs_vp8__decoder* self, + uint32_t a_qi, + int32_t a_delta) { + uint32_t v_neg = 0; + uint32_t v_pos = 0; + + if (a_delta <= -1) { + if (a_delta <= -128) { + return 0u; + } + v_neg = ((uint32_t)(( - a_delta & 127u))); + if (a_qi <= v_neg) { + return 0u; + } + return ((uint32_t)((a_qi - v_neg))); + } + v_pos = ((uint32_t)((a_delta & 127u))); + if ((a_qi + v_pos) > 127u) { + return 127u; + } + return ((uint32_t)((a_qi + v_pos))); +} + +// -------- func vp8.decoder.asr16 + +WUFFS_BASE__GENERATED_C_CODE +static uint32_t +wuffs_vp8__decoder__asr16( + wuffs_vp8__decoder* self, + uint32_t a_v) { + return ((a_v >> 16u) | ((uint32_t)(((uint32_t)(0u - (a_v >> 31u))) << 16u))); +} + +// -------- func vp8.decoder.asr3 + +WUFFS_BASE__GENERATED_C_CODE +static uint32_t +wuffs_vp8__decoder__asr3( + wuffs_vp8__decoder* self, + uint32_t a_v) { + return ((a_v >> 3u) | ((uint32_t)(((uint32_t)(0u - (a_v >> 31u))) << 29u))); +} + +// -------- func vp8.decoder.idct_add + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__idct_add( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_dst, + uint32_t a_stride, + uint32_t a_coeff_offset) { + return (*self->private_impl.choosy_idct_add)(self, a_dst, a_stride, a_coeff_offset); +} + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__idct_add__choosy_default( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_dst, + uint32_t a_stride, + uint32_t 
a_coeff_offset) { + uint32_t v_in0 = 0; + uint32_t v_in1 = 0; + uint32_t v_in2 = 0; + uint32_t v_in3 = 0; + uint32_t v_t0 = 0; + uint32_t v_t1 = 0; + uint32_t v_t2 = 0; + uint32_t v_t3 = 0; + uint32_t v_d0 = 0; + uint32_t v_d1 = 0; + uint32_t v_d2 = 0; + uint32_t v_d3 = 0; + uint32_t v_c1 = 0; + uint32_t v_c2 = 0; + uint32_t v_sh = 0; + uint32_t v_temp[16] = {0}; + uint32_t v_i = 0; + uint32_t v_j = 0; + uint32_t v_val = 0; + uint64_t v_idx = 0; + uint32_t v_row = 0; + + v_i = 0u; + while (v_i < 4u) { + v_in0 = self->private_data.f_mb_coeffs[(a_coeff_offset + v_i)]; + v_in1 = self->private_data.f_mb_coeffs[(a_coeff_offset + v_i + 4u)]; + v_in2 = self->private_data.f_mb_coeffs[(a_coeff_offset + v_i + 8u)]; + v_in3 = self->private_data.f_mb_coeffs[(a_coeff_offset + v_i + 12u)]; + v_t0 = ((uint32_t)(v_in0 + v_in2)); + v_t1 = ((uint32_t)(v_in0 - v_in2)); + v_sh = wuffs_vp8__decoder__asr16(self, ((uint32_t)(v_in1 * 20091u))); + v_c1 = ((uint32_t)(v_sh + v_in1)); + v_sh = wuffs_vp8__decoder__asr16(self, ((uint32_t)(v_in3 * 20091u))); + v_c2 = ((uint32_t)(v_sh + v_in3)); + v_sh = wuffs_vp8__decoder__asr16(self, ((uint32_t)(v_in1 * 35468u))); + v_t2 = ((uint32_t)(v_sh - v_c2)); + v_sh = wuffs_vp8__decoder__asr16(self, ((uint32_t)(v_in3 * 35468u))); + v_t3 = ((uint32_t)(v_c1 + v_sh)); + v_temp[v_i] = ((uint32_t)(v_t0 + v_t3)); + v_temp[(v_i + 12u)] = ((uint32_t)(v_t0 - v_t3)); + v_temp[(v_i + 4u)] = ((uint32_t)(v_t1 + v_t2)); + v_temp[(v_i + 8u)] = ((uint32_t)(v_t1 - v_t2)); + v_i += 1u; + } + v_row = 0u; + while (v_row < 4u) { + v_j = (v_row * 4u); + v_in0 = ((uint32_t)(v_temp[v_j] + 4u)); + v_in1 = v_temp[(v_j + 1u)]; + v_in2 = v_temp[(v_j + 2u)]; + v_in3 = v_temp[(v_j + 3u)]; + v_t0 = ((uint32_t)(v_in0 + v_in2)); + v_t1 = ((uint32_t)(v_in0 - v_in2)); + v_sh = wuffs_vp8__decoder__asr16(self, ((uint32_t)(v_in1 * 20091u))); + v_c1 = ((uint32_t)(v_sh + v_in1)); + v_sh = wuffs_vp8__decoder__asr16(self, ((uint32_t)(v_in3 * 20091u))); + v_c2 = ((uint32_t)(v_sh + v_in3)); + v_sh 
= wuffs_vp8__decoder__asr16(self, ((uint32_t)(v_in1 * 35468u))); + v_t2 = ((uint32_t)(v_sh - v_c2)); + v_sh = wuffs_vp8__decoder__asr16(self, ((uint32_t)(v_in3 * 35468u))); + v_t3 = ((uint32_t)(v_c1 + v_sh)); + v_d0 = wuffs_vp8__decoder__asr3(self, ((uint32_t)(v_t0 + v_t3))); + v_d1 = wuffs_vp8__decoder__asr3(self, ((uint32_t)(v_t1 + v_t2))); + v_d2 = wuffs_vp8__decoder__asr3(self, ((uint32_t)(v_t1 - v_t2))); + v_d3 = wuffs_vp8__decoder__asr3(self, ((uint32_t)(v_t0 - v_t3))); + v_idx = (((uint64_t)(v_row)) * ((uint64_t)(a_stride))); + if (v_idx < ((uint64_t)(a_dst.len))) { + v_val = ((uint32_t)(((uint32_t)(a_dst.ptr[v_idx])) + v_d0)); + if (v_val > 255u) { + if ((v_val & 2147483648u) != 0u) { + v_val = 0u; + } else { + v_val = 255u; + } + } + a_dst.ptr[v_idx] = ((uint8_t)(v_val)); + } + v_idx += 1u; + if (v_idx < ((uint64_t)(a_dst.len))) { + v_val = ((uint32_t)(((uint32_t)(a_dst.ptr[v_idx])) + v_d1)); + if (v_val > 255u) { + if ((v_val & 2147483648u) != 0u) { + v_val = 0u; + } else { + v_val = 255u; + } + } + a_dst.ptr[v_idx] = ((uint8_t)(v_val)); + } + v_idx += 1u; + if (v_idx < ((uint64_t)(a_dst.len))) { + v_val = ((uint32_t)(((uint32_t)(a_dst.ptr[v_idx])) + v_d2)); + if (v_val > 255u) { + if ((v_val & 2147483648u) != 0u) { + v_val = 0u; + } else { + v_val = 255u; + } + } + a_dst.ptr[v_idx] = ((uint8_t)(v_val)); + } + v_idx += 1u; + if (v_idx < ((uint64_t)(a_dst.len))) { + v_val = ((uint32_t)(((uint32_t)(a_dst.ptr[v_idx])) + v_d3)); + if (v_val > 255u) { + if ((v_val & 2147483648u) != 0u) { + v_val = 0u; + } else { + v_val = 255u; + } + } + a_dst.ptr[v_idx] = ((uint8_t)(v_val)); + } + v_row += 1u; + } + v_i = 0u; + while (v_i < 16u) { + self->private_data.f_mb_coeffs[(a_coeff_offset + v_i)] = 0u; + v_i += 1u; + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.idct_dc_add + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__idct_dc_add( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_dst, + 
uint32_t a_stride, + uint32_t a_coeff_offset) { + return (*self->private_impl.choosy_idct_dc_add)(self, a_dst, a_stride, a_coeff_offset); +} + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__idct_dc_add__choosy_default( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_dst, + uint32_t a_stride, + uint32_t a_coeff_offset) { + uint32_t v_dc = 0; + uint32_t v_row = 0; + uint64_t v_idx = 0; + uint32_t v_val = 0; + + v_dc = wuffs_vp8__decoder__asr3(self, ((uint32_t)(self->private_data.f_mb_coeffs[a_coeff_offset] + 4u))); + self->private_data.f_mb_coeffs[a_coeff_offset] = 0u; + v_row = 0u; + while (v_row < 4u) { + v_idx = (((uint64_t)(v_row)) * ((uint64_t)(a_stride))); + if (v_idx < ((uint64_t)(a_dst.len))) { + v_val = ((uint32_t)(((uint32_t)(a_dst.ptr[v_idx])) + v_dc)); + if (v_val > 255u) { + if ((v_val & 2147483648u) != 0u) { + v_val = 0u; + } else { + v_val = 255u; + } + } + a_dst.ptr[v_idx] = ((uint8_t)(v_val)); + } + v_idx += 1u; + if (v_idx < ((uint64_t)(a_dst.len))) { + v_val = ((uint32_t)(((uint32_t)(a_dst.ptr[v_idx])) + v_dc)); + if (v_val > 255u) { + if ((v_val & 2147483648u) != 0u) { + v_val = 0u; + } else { + v_val = 255u; + } + } + a_dst.ptr[v_idx] = ((uint8_t)(v_val)); + } + v_idx += 1u; + if (v_idx < ((uint64_t)(a_dst.len))) { + v_val = ((uint32_t)(((uint32_t)(a_dst.ptr[v_idx])) + v_dc)); + if (v_val > 255u) { + if ((v_val & 2147483648u) != 0u) { + v_val = 0u; + } else { + v_val = 255u; + } + } + a_dst.ptr[v_idx] = ((uint8_t)(v_val)); + } + v_idx += 1u; + if (v_idx < ((uint64_t)(a_dst.len))) { + v_val = ((uint32_t)(((uint32_t)(a_dst.ptr[v_idx])) + v_dc)); + if (v_val > 255u) { + if ((v_val & 2147483648u) != 0u) { + v_val = 0u; + } else { + v_val = 255u; + } + } + a_dst.ptr[v_idx] = ((uint8_t)(v_val)); + } + v_row += 1u; + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.idct_add_pair + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__idct_add_pair( + 
wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_dst, + uint32_t a_stride, + uint32_t a_coeff_offset_a, + uint32_t a_coeff_offset_b) { + return (*self->private_impl.choosy_idct_add_pair)(self, a_dst, a_stride, a_coeff_offset_a, a_coeff_offset_b); +} + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__idct_add_pair__choosy_default( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_dst, + uint32_t a_stride, + uint32_t a_coeff_offset_a, + uint32_t a_coeff_offset_b) { + wuffs_vp8__decoder__idct_add(self, a_dst, a_stride, a_coeff_offset_a); + if (4u <= ((uint64_t)(a_dst.len))) { + wuffs_vp8__decoder__idct_add(self, wuffs_base__slice_u8__subslice_i(a_dst, 4u), a_stride, a_coeff_offset_b); + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.idct_dc_add_pair + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__idct_dc_add_pair( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_dst, + uint32_t a_stride, + uint32_t a_coeff_offset_a, + uint32_t a_coeff_offset_b) { + return (*self->private_impl.choosy_idct_dc_add_pair)(self, a_dst, a_stride, a_coeff_offset_a, a_coeff_offset_b); +} + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__idct_dc_add_pair__choosy_default( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_dst, + uint32_t a_stride, + uint32_t a_coeff_offset_a, + uint32_t a_coeff_offset_b) { + wuffs_vp8__decoder__idct_dc_add(self, a_dst, a_stride, a_coeff_offset_a); + if (4u <= ((uint64_t)(a_dst.len))) { + wuffs_vp8__decoder__idct_dc_add(self, wuffs_base__slice_u8__subslice_i(a_dst, 4u), a_stride, a_coeff_offset_b); + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.wht + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__wht( + wuffs_vp8__decoder* self, + uint32_t a_coeff_offset) { + uint32_t v_temp[16] = {0}; + uint32_t v_i = 0; + uint32_t v_j = 0; + uint32_t v_a0 = 
0; + uint32_t v_a1 = 0; + uint32_t v_a2 = 0; + uint32_t v_a3 = 0; + uint32_t v_b0 = 0; + uint32_t v_b1 = 0; + uint32_t v_b2 = 0; + uint32_t v_b3 = 0; + + v_i = 0u; + while (v_i < 4u) { + v_a0 = self->private_data.f_mb_coeffs[(a_coeff_offset + v_i)]; + v_a1 = self->private_data.f_mb_coeffs[(a_coeff_offset + v_i + 4u)]; + v_a2 = self->private_data.f_mb_coeffs[(a_coeff_offset + v_i + 8u)]; + v_a3 = self->private_data.f_mb_coeffs[(a_coeff_offset + v_i + 12u)]; + v_b0 = ((uint32_t)(v_a0 + v_a3)); + v_b1 = ((uint32_t)(v_a1 + v_a2)); + v_b2 = ((uint32_t)(v_a1 - v_a2)); + v_b3 = ((uint32_t)(v_a0 - v_a3)); + v_temp[v_i] = ((uint32_t)(v_b0 + v_b1)); + v_temp[(v_i + 4u)] = ((uint32_t)(v_b3 + v_b2)); + v_temp[(v_i + 8u)] = ((uint32_t)(v_b0 - v_b1)); + v_temp[(v_i + 12u)] = ((uint32_t)(v_b3 - v_b2)); + v_i += 1u; + } + v_i = 0u; + while (v_i < 4u) { + v_j = (v_i * 4u); + v_a0 = v_temp[v_j]; + v_a1 = v_temp[(v_j + 1u)]; + v_a2 = v_temp[(v_j + 2u)]; + v_a3 = v_temp[(v_j + 3u)]; + v_b0 = ((uint32_t)(v_a0 + v_a3)); + v_b1 = ((uint32_t)(v_a1 + v_a2)); + v_b2 = ((uint32_t)(v_a1 - v_a2)); + v_b3 = ((uint32_t)(v_a0 - v_a3)); + v_temp[v_j] = wuffs_vp8__decoder__asr3(self, ((uint32_t)(((uint32_t)(v_b0 + v_b1)) + 3u))); + v_temp[(v_j + 1u)] = wuffs_vp8__decoder__asr3(self, ((uint32_t)(((uint32_t)(v_b3 + v_b2)) + 3u))); + v_temp[(v_j + 2u)] = wuffs_vp8__decoder__asr3(self, ((uint32_t)(((uint32_t)(v_b0 - v_b1)) + 3u))); + v_temp[(v_j + 3u)] = wuffs_vp8__decoder__asr3(self, ((uint32_t)(((uint32_t)(v_b3 - v_b2)) + 3u))); + v_i += 1u; + } + v_i = 0u; + while (v_i < 16u) { + self->private_data.f_mb_coeffs[(v_i * 16u)] = v_temp[v_i]; + v_i += 1u; + } + v_i = 0u; + while (v_i < 16u) { + self->private_data.f_mb_coeffs[(a_coeff_offset + v_i)] = 0u; + v_i += 1u; + } + return wuffs_base__make_empty_struct(); +} + +// ‼ WUFFS MULTI-FILE SECTION +arm_neon +// -------- func vp8.decoder.idct_add_arm_neon + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +WUFFS_BASE__GENERATED_C_CODE +static 
wuffs_base__empty_struct +wuffs_vp8__decoder__idct_add_arm_neon( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_dst, + uint32_t a_stride, + uint32_t a_coeff_offset) { + uint32x4_t v_load0 = {0}; + uint32x4_t v_load1 = {0}; + uint16x4_t v_low = {0}; + uint16x8_t v_r01 = {0}; + uint16x8_t v_r23 = {0}; + uint16x8_t v_b1 = {0}; + uint16x8_t v_mul1 = {0}; + uint16x8_t v_c0 = {0}; + uint16x8_t v_c1 = {0}; + uint16x4_t v_a_val = {0}; + uint16x4_t v_b_val = {0}; + uint16x4_t v_c_val = {0}; + uint16x4_t v_d_val = {0}; + uint16x8_t v_d0 = {0}; + uint16x8_t v_d1 = {0}; + uint16x8_t v_e0 = {0}; + uint16x8_t v_e_tmp = {0}; + uint16x8_t v_e1 = {0}; + uint16x8_t v_t0 = {0}; + uint16x8_t v_t1 = {0}; + uint16x8_t v_k4 = {0}; + uint8x8_t v_pred01 = {0}; + uint8x8_t v_pred23 = {0}; + uint16x8_t v_pred01_w = {0}; + uint16x8_t v_pred23_w = {0}; + uint16x8_t v_out01 = {0}; + uint16x8_t v_out23 = {0}; + uint8x8_t v_out01_u8 = {0}; + uint8x8_t v_out23_u8 = {0}; + uint32_t v_val = 0; + uint32_t v_off = 0; + uint32_t v_i = 0; + + v_off = a_coeff_offset; + v_load0 = vld1q_u32(self->private_data.f_mb_coeffs + v_off); + v_load1 = vld1q_u32(self->private_data.f_mb_coeffs + (v_off + 4u)); + v_low = vmovn_u32(v_load0); + v_r01 = vmovn_high_u32(v_low, v_load1); + v_load0 = vld1q_u32(self->private_data.f_mb_coeffs + (v_off + 8u)); + v_load1 = vld1q_u32(self->private_data.f_mb_coeffs + (v_off + 12u)); + v_low = vmovn_u32(v_load0); + v_r23 = vmovn_high_u32(v_low, v_load1); + v_b1 = vcombine_u16(vget_high_u16(v_r01), vget_high_u16(v_r23)); + v_mul1 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(v_b1), 20091u)); + v_c0 = vaddq_u16(v_b1, vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(v_mul1), 1u))); + v_c1 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(v_b1), 17734u)); + v_a_val = vadd_u16(vget_low_u16(v_r01), vget_low_u16(v_r23)); + v_b_val = vsub_u16(vget_low_u16(v_r01), vget_low_u16(v_r23)); + v_c_val = vsub_u16(vget_low_u16(v_c1), vget_high_u16(v_c0)); + 
v_d_val = vadd_u16(vget_low_u16(v_c0), vget_high_u16(v_c1)); + v_d0 = vcombine_u16(v_a_val, v_b_val); + v_d1 = vcombine_u16(v_d_val, v_c_val); + v_e0 = vaddq_u16(v_d0, v_d1); + v_e_tmp = vsubq_u16(v_d0, v_d1); + v_e1 = vcombine_u16(vget_high_u16(v_e_tmp), vget_low_u16(v_e_tmp)); + v_t0 = vzip1q_u16(v_e0, v_e1); + v_t1 = vzip2q_u16(v_e0, v_e1); + v_r01 = vzip1q_u16(v_t0, v_t1); + v_r23 = vzip2q_u16(v_t0, v_t1); + v_b1 = vcombine_u16(vget_high_u16(v_r01), vget_high_u16(v_r23)); + v_mul1 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(v_b1), 20091u)); + v_c0 = vaddq_u16(v_b1, vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(v_mul1), 1u))); + v_c1 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(v_b1), 17734u)); + v_a_val = vadd_u16(vget_low_u16(v_r01), vget_low_u16(v_r23)); + v_b_val = vsub_u16(vget_low_u16(v_r01), vget_low_u16(v_r23)); + v_c_val = vsub_u16(vget_low_u16(v_c1), vget_high_u16(v_c0)); + v_d_val = vadd_u16(vget_low_u16(v_c0), vget_high_u16(v_c1)); + v_d0 = vcombine_u16(v_a_val, v_b_val); + v_d1 = vcombine_u16(v_d_val, v_c_val); + v_e0 = vaddq_u16(v_d0, v_d1); + v_e_tmp = vsubq_u16(v_d0, v_d1); + v_e1 = vcombine_u16(vget_high_u16(v_e_tmp), vget_low_u16(v_e_tmp)); + v_t0 = vzip1q_u16(v_e0, v_e1); + v_t1 = vzip2q_u16(v_e0, v_e1); + v_r01 = vzip1q_u16(v_t0, v_t1); + v_r23 = vzip2q_u16(v_t0, v_t1); + v_k4 = vdupq_n_u16(4u); + v_r01 = vaddq_u16(v_r01, v_k4); + v_r23 = vaddq_u16(v_r23, v_k4); + v_r01 = vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(v_r01), 3u)); + v_r23 = vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(v_r23), 3u)); + if (4u <= ((uint64_t)(a_dst.len))) { + v_pred01 = ((uint8x8_t){a_dst.ptr[0u], a_dst.ptr[1u], a_dst.ptr[2u], a_dst.ptr[3u], 0u, 0u, 0u, 0u}); + v_pred01_w = vmovl_u8(v_pred01); + v_out01 = vaddq_u16(v_pred01_w, v_r01); + v_out01_u8 = vqmovun_s16(vreinterpretq_s16_u16(v_out01)); + v_val = vget_lane_u32(vreinterpret_u32_u8(v_out01_u8), 0u); + a_dst.ptr[0u] = ((uint8_t)(v_val)); + 
a_dst.ptr[1u] = ((uint8_t)((v_val >> 8u))); + a_dst.ptr[2u] = ((uint8_t)((v_val >> 16u))); + a_dst.ptr[3u] = ((uint8_t)((v_val >> 24u))); + } + if (((uint64_t)(a_stride)) <= ((uint64_t)(a_dst.len))) { + a_dst = wuffs_base__slice_u8__subslice_i(a_dst, ((uint64_t)(a_stride))); + } + if (4u <= ((uint64_t)(a_dst.len))) { + v_pred01 = ((uint8x8_t){0u, 0u, 0u, 0u, a_dst.ptr[0u], a_dst.ptr[1u], a_dst.ptr[2u], a_dst.ptr[3u]}); + v_pred01_w = vmovl_u8(v_pred01); + v_out01 = vaddq_u16(v_pred01_w, v_r01); + v_out01_u8 = vqmovun_s16(vreinterpretq_s16_u16(v_out01)); + v_val = vget_lane_u32(vreinterpret_u32_u8(v_out01_u8), 1u); + a_dst.ptr[0u] = ((uint8_t)(v_val)); + a_dst.ptr[1u] = ((uint8_t)((v_val >> 8u))); + a_dst.ptr[2u] = ((uint8_t)((v_val >> 16u))); + a_dst.ptr[3u] = ((uint8_t)((v_val >> 24u))); + } + if (((uint64_t)(a_stride)) <= ((uint64_t)(a_dst.len))) { + a_dst = wuffs_base__slice_u8__subslice_i(a_dst, ((uint64_t)(a_stride))); + } + if (4u <= ((uint64_t)(a_dst.len))) { + v_pred23 = ((uint8x8_t){a_dst.ptr[0u], a_dst.ptr[1u], a_dst.ptr[2u], a_dst.ptr[3u], 0u, 0u, 0u, 0u}); + v_pred23_w = vmovl_u8(v_pred23); + v_out23 = vaddq_u16(v_pred23_w, v_r23); + v_out23_u8 = vqmovun_s16(vreinterpretq_s16_u16(v_out23)); + v_val = vget_lane_u32(vreinterpret_u32_u8(v_out23_u8), 0u); + a_dst.ptr[0u] = ((uint8_t)(v_val)); + a_dst.ptr[1u] = ((uint8_t)((v_val >> 8u))); + a_dst.ptr[2u] = ((uint8_t)((v_val >> 16u))); + a_dst.ptr[3u] = ((uint8_t)((v_val >> 24u))); + } + if (((uint64_t)(a_stride)) <= ((uint64_t)(a_dst.len))) { + a_dst = wuffs_base__slice_u8__subslice_i(a_dst, ((uint64_t)(a_stride))); + } + if (4u <= ((uint64_t)(a_dst.len))) { + v_pred23 = ((uint8x8_t){0u, 0u, 0u, 0u, a_dst.ptr[0u], a_dst.ptr[1u], a_dst.ptr[2u], a_dst.ptr[3u]}); + v_pred23_w = vmovl_u8(v_pred23); + v_out23 = vaddq_u16(v_pred23_w, v_r23); + v_out23_u8 = vqmovun_s16(vreinterpretq_s16_u16(v_out23)); + v_val = vget_lane_u32(vreinterpret_u32_u8(v_out23_u8), 1u); + a_dst.ptr[0u] = ((uint8_t)(v_val)); + a_dst.ptr[1u] 
= ((uint8_t)((v_val >> 8u))); + a_dst.ptr[2u] = ((uint8_t)((v_val >> 16u))); + a_dst.ptr[3u] = ((uint8_t)((v_val >> 24u))); + } + v_i = 0u; + while (v_i < 16u) { + self->private_data.f_mb_coeffs[(v_off + v_i)] = 0u; + v_i += 1u; + } + return wuffs_base__make_empty_struct(); +} +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +// ‼ WUFFS MULTI-FILE SECTION -arm_neon + +// ‼ WUFFS MULTI-FILE SECTION +arm_neon +// -------- func vp8.decoder.idct_dc_add_arm_neon + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__idct_dc_add_arm_neon( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_dst, + uint32_t a_stride, + uint32_t a_coeff_offset) { + uint16x8_t v_dc_vec = {0}; + uint16x8_t v_k4 = {0}; + uint8x8_t v_pred = {0}; + uint16x8_t v_pred_w = {0}; + uint16x8_t v_out = {0}; + uint8x8_t v_out_u8 = {0}; + uint32_t v_val = 0; + + v_dc_vec = vdupq_n_u16(((uint16_t)(self->private_data.f_mb_coeffs[a_coeff_offset]))); + v_k4 = vdupq_n_u16(4u); + v_dc_vec = vaddq_u16(v_dc_vec, v_k4); + v_dc_vec = vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(v_dc_vec), 3u)); + self->private_data.f_mb_coeffs[a_coeff_offset] = 0u; + if (4u <= ((uint64_t)(a_dst.len))) { + v_pred = ((uint8x8_t){a_dst.ptr[0u], a_dst.ptr[1u], a_dst.ptr[2u], a_dst.ptr[3u], 0u, 0u, 0u, 0u}); + v_pred_w = vmovl_u8(v_pred); + v_out = vaddq_u16(v_pred_w, v_dc_vec); + v_out_u8 = vqmovun_s16(vreinterpretq_s16_u16(v_out)); + v_val = vget_lane_u32(vreinterpret_u32_u8(v_out_u8), 0u); + a_dst.ptr[0u] = ((uint8_t)(v_val)); + a_dst.ptr[1u] = ((uint8_t)((v_val >> 8u))); + a_dst.ptr[2u] = ((uint8_t)((v_val >> 16u))); + a_dst.ptr[3u] = ((uint8_t)((v_val >> 24u))); + } + if (((uint64_t)(a_stride)) <= ((uint64_t)(a_dst.len))) { + a_dst = wuffs_base__slice_u8__subslice_i(a_dst, ((uint64_t)(a_stride))); + } + if (4u <= ((uint64_t)(a_dst.len))) { + v_pred = ((uint8x8_t){a_dst.ptr[0u], a_dst.ptr[1u], a_dst.ptr[2u], a_dst.ptr[3u], 0u, 
0u, 0u, 0u}); + v_pred_w = vmovl_u8(v_pred); + v_out = vaddq_u16(v_pred_w, v_dc_vec); + v_out_u8 = vqmovun_s16(vreinterpretq_s16_u16(v_out)); + v_val = vget_lane_u32(vreinterpret_u32_u8(v_out_u8), 0u); + a_dst.ptr[0u] = ((uint8_t)(v_val)); + a_dst.ptr[1u] = ((uint8_t)((v_val >> 8u))); + a_dst.ptr[2u] = ((uint8_t)((v_val >> 16u))); + a_dst.ptr[3u] = ((uint8_t)((v_val >> 24u))); + } + if (((uint64_t)(a_stride)) <= ((uint64_t)(a_dst.len))) { + a_dst = wuffs_base__slice_u8__subslice_i(a_dst, ((uint64_t)(a_stride))); + } + if (4u <= ((uint64_t)(a_dst.len))) { + v_pred = ((uint8x8_t){a_dst.ptr[0u], a_dst.ptr[1u], a_dst.ptr[2u], a_dst.ptr[3u], 0u, 0u, 0u, 0u}); + v_pred_w = vmovl_u8(v_pred); + v_out = vaddq_u16(v_pred_w, v_dc_vec); + v_out_u8 = vqmovun_s16(vreinterpretq_s16_u16(v_out)); + v_val = vget_lane_u32(vreinterpret_u32_u8(v_out_u8), 0u); + a_dst.ptr[0u] = ((uint8_t)(v_val)); + a_dst.ptr[1u] = ((uint8_t)((v_val >> 8u))); + a_dst.ptr[2u] = ((uint8_t)((v_val >> 16u))); + a_dst.ptr[3u] = ((uint8_t)((v_val >> 24u))); + } + if (((uint64_t)(a_stride)) <= ((uint64_t)(a_dst.len))) { + a_dst = wuffs_base__slice_u8__subslice_i(a_dst, ((uint64_t)(a_stride))); + } + if (4u <= ((uint64_t)(a_dst.len))) { + v_pred = ((uint8x8_t){a_dst.ptr[0u], a_dst.ptr[1u], a_dst.ptr[2u], a_dst.ptr[3u], 0u, 0u, 0u, 0u}); + v_pred_w = vmovl_u8(v_pred); + v_out = vaddq_u16(v_pred_w, v_dc_vec); + v_out_u8 = vqmovun_s16(vreinterpretq_s16_u16(v_out)); + v_val = vget_lane_u32(vreinterpret_u32_u8(v_out_u8), 0u); + a_dst.ptr[0u] = ((uint8_t)(v_val)); + a_dst.ptr[1u] = ((uint8_t)((v_val >> 8u))); + a_dst.ptr[2u] = ((uint8_t)((v_val >> 16u))); + a_dst.ptr[3u] = ((uint8_t)((v_val >> 24u))); + } + return wuffs_base__make_empty_struct(); +} +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +// ‼ WUFFS MULTI-FILE SECTION -arm_neon + +// ‼ WUFFS MULTI-FILE SECTION +x86_avx2 +// -------- func vp8.decoder.idct_add_pair_x86_avx2 + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) 
+WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2,avx2") +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__idct_add_pair_x86_avx2( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_dst, + uint32_t a_stride, + uint32_t a_coeff_offset_a, + uint32_t a_coeff_offset_b) { + __m256i v_k1 = {0}; + __m256i v_k2 = {0}; + __m256i v_k_4 = {0}; + __m128i v_k_0_128 = {0}; + __m256i v_row0 = {0}; + __m256i v_row1 = {0}; + __m256i v_row2 = {0}; + __m256i v_row3 = {0}; + __m128i v_la = {0}; + __m128i v_lb = {0}; + __m256i v_a = {0}; + __m256i v_b = {0}; + __m256i v_c = {0}; + __m256i v_d = {0}; + __m256i v_c1 = {0}; + __m256i v_c2 = {0}; + __m256i v_c3 = {0}; + __m256i v_c4 = {0}; + __m256i v_d1 = {0}; + __m256i v_d2 = {0}; + __m256i v_d3 = {0}; + __m256i v_d4 = {0}; + __m256i v_tr0 = {0}; + __m256i v_tr1 = {0}; + __m256i v_tr2 = {0}; + __m256i v_tr3 = {0}; + __m256i v_ts0 = {0}; + __m256i v_ts1 = {0}; + __m256i v_ts2 = {0}; + __m256i v_ts3 = {0}; + __m128i v_oa = {0}; + __m128i v_ob = {0}; + uint32_t v_off_a = 0; + uint32_t v_off_b = 0; + uint32_t v_i = 0; + + v_off_a = a_coeff_offset_a; + v_off_b = a_coeff_offset_b; + v_k1 = _mm256_set1_epi16((int16_t)(20091u)); + v_k2 = _mm256_set1_epi16((int16_t)(35468u)); + v_k_4 = _mm256_set1_epi16((int16_t)(4u)); + v_k_0_128 = _mm_setzero_si128(); + v_la = _mm_packs_epi32(_mm_lddqu_si128((const __m128i*)(const void*)(self->private_data.f_mb_coeffs + v_off_a)), v_k_0_128); + v_lb = _mm_packs_epi32(_mm_lddqu_si128((const __m128i*)(const void*)(self->private_data.f_mb_coeffs + v_off_b)), v_k_0_128); + v_row0 = _mm256_inserti128_si256(_mm256_castsi128_si256(v_la), v_lb, (int32_t)(1u)); + v_la = _mm_packs_epi32(_mm_lddqu_si128((const __m128i*)(const void*)(self->private_data.f_mb_coeffs + (v_off_a + 4u))), v_k_0_128); + v_lb = _mm_packs_epi32(_mm_lddqu_si128((const __m128i*)(const void*)(self->private_data.f_mb_coeffs + (v_off_b + 4u))), v_k_0_128); + v_row1 = 
_mm256_inserti128_si256(_mm256_castsi128_si256(v_la), v_lb, (int32_t)(1u)); + v_la = _mm_packs_epi32(_mm_lddqu_si128((const __m128i*)(const void*)(self->private_data.f_mb_coeffs + (v_off_a + 8u))), v_k_0_128); + v_lb = _mm_packs_epi32(_mm_lddqu_si128((const __m128i*)(const void*)(self->private_data.f_mb_coeffs + (v_off_b + 8u))), v_k_0_128); + v_row2 = _mm256_inserti128_si256(_mm256_castsi128_si256(v_la), v_lb, (int32_t)(1u)); + v_la = _mm_packs_epi32(_mm_lddqu_si128((const __m128i*)(const void*)(self->private_data.f_mb_coeffs + (v_off_a + 12u))), v_k_0_128); + v_lb = _mm_packs_epi32(_mm_lddqu_si128((const __m128i*)(const void*)(self->private_data.f_mb_coeffs + (v_off_b + 12u))), v_k_0_128); + v_row3 = _mm256_inserti128_si256(_mm256_castsi128_si256(v_la), v_lb, (int32_t)(1u)); + v_a = _mm256_add_epi16(v_row0, v_row2); + v_b = _mm256_sub_epi16(v_row0, v_row2); + v_c1 = _mm256_mulhi_epi16(v_row1, v_k2); + v_c2 = _mm256_mulhi_epi16(v_row3, v_k1); + v_c3 = _mm256_sub_epi16(v_row1, v_row3); + v_c4 = _mm256_sub_epi16(v_c1, v_c2); + v_c = _mm256_add_epi16(v_c3, v_c4); + v_d1 = _mm256_mulhi_epi16(v_row1, v_k1); + v_d2 = _mm256_mulhi_epi16(v_row3, v_k2); + v_d3 = _mm256_add_epi16(v_row1, v_row3); + v_d4 = _mm256_add_epi16(v_d1, v_d2); + v_d = _mm256_add_epi16(v_d3, v_d4); + v_row0 = _mm256_add_epi16(v_a, v_d); + v_row1 = _mm256_add_epi16(v_b, v_c); + v_row2 = _mm256_sub_epi16(v_b, v_c); + v_row3 = _mm256_sub_epi16(v_a, v_d); + v_tr0 = _mm256_unpacklo_epi16(v_row0, v_row1); + v_tr1 = _mm256_unpacklo_epi16(v_row2, v_row3); + v_tr2 = _mm256_unpackhi_epi16(v_row0, v_row1); + v_tr3 = _mm256_unpackhi_epi16(v_row2, v_row3); + v_ts0 = _mm256_unpacklo_epi32(v_tr0, v_tr1); + v_ts1 = _mm256_unpackhi_epi32(v_tr0, v_tr1); + v_ts2 = _mm256_unpacklo_epi32(v_tr2, v_tr3); + v_ts3 = _mm256_unpackhi_epi32(v_tr2, v_tr3); + v_row0 = _mm256_unpacklo_epi64(v_ts0, v_ts2); + v_row1 = _mm256_unpackhi_epi64(v_ts0, v_ts2); + v_row2 = _mm256_unpacklo_epi64(v_ts1, v_ts3); + v_row3 = 
_mm256_unpackhi_epi64(v_ts1, v_ts3); + v_row0 = _mm256_add_epi16(v_row0, v_k_4); + v_a = _mm256_add_epi16(v_row0, v_row2); + v_b = _mm256_sub_epi16(v_row0, v_row2); + v_c1 = _mm256_mulhi_epi16(v_row1, v_k2); + v_c2 = _mm256_mulhi_epi16(v_row3, v_k1); + v_c3 = _mm256_sub_epi16(v_row1, v_row3); + v_c4 = _mm256_sub_epi16(v_c1, v_c2); + v_c = _mm256_add_epi16(v_c3, v_c4); + v_d1 = _mm256_mulhi_epi16(v_row1, v_k1); + v_d2 = _mm256_mulhi_epi16(v_row3, v_k2); + v_d3 = _mm256_add_epi16(v_row1, v_row3); + v_d4 = _mm256_add_epi16(v_d1, v_d2); + v_d = _mm256_add_epi16(v_d3, v_d4); + v_row0 = _mm256_srai_epi16(_mm256_add_epi16(v_a, v_d), (int32_t)(3u)); + v_row1 = _mm256_srai_epi16(_mm256_add_epi16(v_b, v_c), (int32_t)(3u)); + v_row2 = _mm256_srai_epi16(_mm256_sub_epi16(v_b, v_c), (int32_t)(3u)); + v_row3 = _mm256_srai_epi16(_mm256_sub_epi16(v_a, v_d), (int32_t)(3u)); + v_tr0 = _mm256_unpacklo_epi16(v_row0, v_row1); + v_tr1 = _mm256_unpacklo_epi16(v_row2, v_row3); + v_tr2 = _mm256_unpackhi_epi16(v_row0, v_row1); + v_tr3 = _mm256_unpackhi_epi16(v_row2, v_row3); + v_ts0 = _mm256_unpacklo_epi32(v_tr0, v_tr1); + v_ts1 = _mm256_unpackhi_epi32(v_tr0, v_tr1); + v_ts2 = _mm256_unpacklo_epi32(v_tr2, v_tr3); + v_ts3 = _mm256_unpackhi_epi32(v_tr2, v_tr3); + v_row0 = _mm256_unpacklo_epi64(v_ts0, v_ts2); + v_row1 = _mm256_unpackhi_epi64(v_ts0, v_ts2); + v_row2 = _mm256_unpacklo_epi64(v_ts1, v_ts3); + v_row3 = _mm256_unpackhi_epi64(v_ts1, v_ts3); + if (8u <= ((uint64_t)(a_dst.len))) { + v_oa = _mm256_castsi256_si128(v_row0); + v_ob = _mm256_extracti128_si256(v_row0, (int32_t)(1u)); + v_la = _mm_unpacklo_epi64(v_oa, v_ob); + v_lb = _mm_unpacklo_epi8(_mm_cvtsi64_si128((int64_t)(wuffs_base__peek_u64le__no_bounds_check(a_dst.ptr))), v_k_0_128); + v_la = _mm_packus_epi16(_mm_add_epi16(v_lb, v_la), v_la); + wuffs_base__poke_u64le__no_bounds_check(a_dst.ptr, ((uint64_t)(_mm_cvtsi128_si64(v_la)))); + } + if (((uint64_t)(a_stride)) <= ((uint64_t)(a_dst.len))) { + a_dst = 
wuffs_base__slice_u8__subslice_i(a_dst, ((uint64_t)(a_stride))); + } + if (8u <= ((uint64_t)(a_dst.len))) { + v_oa = _mm256_castsi256_si128(v_row1); + v_ob = _mm256_extracti128_si256(v_row1, (int32_t)(1u)); + v_la = _mm_unpacklo_epi64(v_oa, v_ob); + v_lb = _mm_unpacklo_epi8(_mm_cvtsi64_si128((int64_t)(wuffs_base__peek_u64le__no_bounds_check(a_dst.ptr))), v_k_0_128); + v_la = _mm_packus_epi16(_mm_add_epi16(v_lb, v_la), v_la); + wuffs_base__poke_u64le__no_bounds_check(a_dst.ptr, ((uint64_t)(_mm_cvtsi128_si64(v_la)))); + } + if (((uint64_t)(a_stride)) <= ((uint64_t)(a_dst.len))) { + a_dst = wuffs_base__slice_u8__subslice_i(a_dst, ((uint64_t)(a_stride))); + } + if (8u <= ((uint64_t)(a_dst.len))) { + v_oa = _mm256_castsi256_si128(v_row2); + v_ob = _mm256_extracti128_si256(v_row2, (int32_t)(1u)); + v_la = _mm_unpacklo_epi64(v_oa, v_ob); + v_lb = _mm_unpacklo_epi8(_mm_cvtsi64_si128((int64_t)(wuffs_base__peek_u64le__no_bounds_check(a_dst.ptr))), v_k_0_128); + v_la = _mm_packus_epi16(_mm_add_epi16(v_lb, v_la), v_la); + wuffs_base__poke_u64le__no_bounds_check(a_dst.ptr, ((uint64_t)(_mm_cvtsi128_si64(v_la)))); + } + if (((uint64_t)(a_stride)) <= ((uint64_t)(a_dst.len))) { + a_dst = wuffs_base__slice_u8__subslice_i(a_dst, ((uint64_t)(a_stride))); + } + if (8u <= ((uint64_t)(a_dst.len))) { + v_oa = _mm256_castsi256_si128(v_row3); + v_ob = _mm256_extracti128_si256(v_row3, (int32_t)(1u)); + v_la = _mm_unpacklo_epi64(v_oa, v_ob); + v_lb = _mm_unpacklo_epi8(_mm_cvtsi64_si128((int64_t)(wuffs_base__peek_u64le__no_bounds_check(a_dst.ptr))), v_k_0_128); + v_la = _mm_packus_epi16(_mm_add_epi16(v_lb, v_la), v_la); + wuffs_base__poke_u64le__no_bounds_check(a_dst.ptr, ((uint64_t)(_mm_cvtsi128_si64(v_la)))); + } + v_i = 0u; + while (v_i < 16u) { + self->private_data.f_mb_coeffs[(v_off_a + v_i)] = 0u; + self->private_data.f_mb_coeffs[(v_off_b + v_i)] = 0u; + v_i += 1u; + } + return wuffs_base__make_empty_struct(); +} +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) +// ‼ WUFFS 
MULTI-FILE SECTION -x86_avx2 + +// ‼ WUFFS MULTI-FILE SECTION +x86_avx2 +// -------- func vp8.decoder.idct_dc_add_pair_x86_avx2 + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) +WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2,avx2") +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__idct_dc_add_pair_x86_avx2( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_dst, + uint32_t a_stride, + uint32_t a_coeff_offset_a, + uint32_t a_coeff_offset_b) { + __m128i v_k_0 = {0}; + __m128i v_dc = {0}; + __m128i v_la = {0}; + __m128i v_lb = {0}; + uint32_t v_off_a = 0; + uint32_t v_off_b = 0; + uint32_t v_dc_a = 0; + uint32_t v_dc_b = 0; + + v_off_a = a_coeff_offset_a; + v_off_b = a_coeff_offset_b; + v_k_0 = _mm_setzero_si128(); + v_dc_a = ((uint32_t)(self->private_data.f_mb_coeffs[v_off_a] + 4u)); + v_dc_a = ((v_dc_a >> 3u) | ((uint32_t)(((uint32_t)(0u - (v_dc_a >> 31u))) << 29u))); + self->private_data.f_mb_coeffs[v_off_a] = 0u; + v_dc_b = ((uint32_t)(self->private_data.f_mb_coeffs[v_off_b] + 4u)); + v_dc_b = ((v_dc_b >> 3u) | ((uint32_t)(((uint32_t)(0u - (v_dc_b >> 31u))) << 29u))); + self->private_data.f_mb_coeffs[v_off_b] = 0u; + v_la = _mm_set1_epi16((int16_t)(((uint16_t)(v_dc_a)))); + v_lb = _mm_set1_epi16((int16_t)(((uint16_t)(v_dc_b)))); + v_dc = _mm_unpacklo_epi64(v_la, v_lb); + if (8u <= ((uint64_t)(a_dst.len))) { + v_la = _mm_unpacklo_epi8(_mm_cvtsi64_si128((int64_t)(wuffs_base__peek_u64le__no_bounds_check(a_dst.ptr))), v_k_0); + v_la = _mm_packus_epi16(_mm_add_epi16(v_la, v_dc), v_la); + wuffs_base__poke_u64le__no_bounds_check(a_dst.ptr, ((uint64_t)(_mm_cvtsi128_si64(v_la)))); + } + if (((uint64_t)(a_stride)) <= ((uint64_t)(a_dst.len))) { + a_dst = wuffs_base__slice_u8__subslice_i(a_dst, ((uint64_t)(a_stride))); + } + if (8u <= ((uint64_t)(a_dst.len))) { + v_la = _mm_unpacklo_epi8(_mm_cvtsi64_si128((int64_t)(wuffs_base__peek_u64le__no_bounds_check(a_dst.ptr))), v_k_0); + v_la = _mm_packus_epi16(_mm_add_epi16(v_la, 
v_dc), v_la); + wuffs_base__poke_u64le__no_bounds_check(a_dst.ptr, ((uint64_t)(_mm_cvtsi128_si64(v_la)))); + } + if (((uint64_t)(a_stride)) <= ((uint64_t)(a_dst.len))) { + a_dst = wuffs_base__slice_u8__subslice_i(a_dst, ((uint64_t)(a_stride))); + } + if (8u <= ((uint64_t)(a_dst.len))) { + v_la = _mm_unpacklo_epi8(_mm_cvtsi64_si128((int64_t)(wuffs_base__peek_u64le__no_bounds_check(a_dst.ptr))), v_k_0); + v_la = _mm_packus_epi16(_mm_add_epi16(v_la, v_dc), v_la); + wuffs_base__poke_u64le__no_bounds_check(a_dst.ptr, ((uint64_t)(_mm_cvtsi128_si64(v_la)))); + } + if (((uint64_t)(a_stride)) <= ((uint64_t)(a_dst.len))) { + a_dst = wuffs_base__slice_u8__subslice_i(a_dst, ((uint64_t)(a_stride))); + } + if (8u <= ((uint64_t)(a_dst.len))) { + v_la = _mm_unpacklo_epi8(_mm_cvtsi64_si128((int64_t)(wuffs_base__peek_u64le__no_bounds_check(a_dst.ptr))), v_k_0); + v_la = _mm_packus_epi16(_mm_add_epi16(v_la, v_dc), v_la); + wuffs_base__poke_u64le__no_bounds_check(a_dst.ptr, ((uint64_t)(_mm_cvtsi128_si64(v_la)))); + } + return wuffs_base__make_empty_struct(); +} +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) +// ‼ WUFFS MULTI-FILE SECTION -x86_avx2 + +// ‼ WUFFS MULTI-FILE SECTION +x86_sse42 +// -------- func vp8.decoder.idct_add_x86_sse42 + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) +WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2") +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__idct_add_x86_sse42( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_dst, + uint32_t a_stride, + uint32_t a_coeff_offset) { + __m128i v_k1 = {0}; + __m128i v_k2 = {0}; + __m128i v_k_4 = {0}; + __m128i v_k_0 = {0}; + __m128i v_row0 = {0}; + __m128i v_row1 = {0}; + __m128i v_row2 = {0}; + __m128i v_row3 = {0}; + __m128i v_load0 = {0}; + __m128i v_load1 = {0}; + __m128i v_load2 = {0}; + __m128i v_load3 = {0}; + __m128i v_a = {0}; + __m128i v_b = {0}; + __m128i v_c = {0}; + __m128i v_d = {0}; + __m128i v_c1 = {0}; + __m128i v_c2 = {0}; + 
__m128i v_c3 = {0}; + __m128i v_c4 = {0}; + __m128i v_d1 = {0}; + __m128i v_d2 = {0}; + __m128i v_d3 = {0}; + __m128i v_d4 = {0}; + __m128i v_tr0 = {0}; + __m128i v_tr1 = {0}; + __m128i v_tr2 = {0}; + __m128i v_tr3 = {0}; + __m128i v_ts0 = {0}; + __m128i v_ts1 = {0}; + __m128i v_ts2 = {0}; + __m128i v_ts3 = {0}; + __m128i v_pred = {0}; + __m128i v_pred16 = {0}; + __m128i v_sum = {0}; + __m128i v_out = {0}; + uint32_t v_off = 0; + uint32_t v_i = 0; + + v_off = a_coeff_offset; + v_k1 = _mm_set1_epi16((int16_t)(20091u)); + v_k2 = _mm_set1_epi16((int16_t)(35468u)); + v_k_4 = _mm_set1_epi16((int16_t)(4u)); + v_k_0 = _mm_setzero_si128(); + v_load0 = _mm_lddqu_si128((const __m128i*)(const void*)(self->private_data.f_mb_coeffs + v_off)); + v_load1 = _mm_lddqu_si128((const __m128i*)(const void*)(self->private_data.f_mb_coeffs + (v_off + 4u))); + v_load2 = _mm_lddqu_si128((const __m128i*)(const void*)(self->private_data.f_mb_coeffs + (v_off + 8u))); + v_load3 = _mm_lddqu_si128((const __m128i*)(const void*)(self->private_data.f_mb_coeffs + (v_off + 12u))); + v_row0 = _mm_packs_epi32(v_load0, v_k_0); + v_row1 = _mm_packs_epi32(v_load1, v_k_0); + v_row2 = _mm_packs_epi32(v_load2, v_k_0); + v_row3 = _mm_packs_epi32(v_load3, v_k_0); + v_a = _mm_add_epi16(v_row0, v_row2); + v_b = _mm_sub_epi16(v_row0, v_row2); + v_c1 = _mm_mulhi_epi16(v_row1, v_k2); + v_c2 = _mm_mulhi_epi16(v_row3, v_k1); + v_c3 = _mm_sub_epi16(v_row1, v_row3); + v_c4 = _mm_sub_epi16(v_c1, v_c2); + v_c = _mm_add_epi16(v_c3, v_c4); + v_d1 = _mm_mulhi_epi16(v_row1, v_k1); + v_d2 = _mm_mulhi_epi16(v_row3, v_k2); + v_d3 = _mm_add_epi16(v_row1, v_row3); + v_d4 = _mm_add_epi16(v_d1, v_d2); + v_d = _mm_add_epi16(v_d3, v_d4); + v_row0 = _mm_add_epi16(v_a, v_d); + v_row1 = _mm_add_epi16(v_b, v_c); + v_row2 = _mm_sub_epi16(v_b, v_c); + v_row3 = _mm_sub_epi16(v_a, v_d); + v_tr0 = _mm_unpacklo_epi16(v_row0, v_row1); + v_tr1 = _mm_unpacklo_epi16(v_row2, v_row3); + v_tr2 = _mm_unpackhi_epi16(v_row0, v_row1); + v_tr3 = 
_mm_unpackhi_epi16(v_row2, v_row3); + v_ts0 = _mm_unpacklo_epi32(v_tr0, v_tr1); + v_ts1 = _mm_unpackhi_epi32(v_tr0, v_tr1); + v_ts2 = _mm_unpacklo_epi32(v_tr2, v_tr3); + v_ts3 = _mm_unpackhi_epi32(v_tr2, v_tr3); + v_row0 = _mm_unpacklo_epi64(v_ts0, v_ts2); + v_row1 = _mm_unpackhi_epi64(v_ts0, v_ts2); + v_row2 = _mm_unpacklo_epi64(v_ts1, v_ts3); + v_row3 = _mm_unpackhi_epi64(v_ts1, v_ts3); + v_row0 = _mm_add_epi16(v_row0, v_k_4); + v_a = _mm_add_epi16(v_row0, v_row2); + v_b = _mm_sub_epi16(v_row0, v_row2); + v_c1 = _mm_mulhi_epi16(v_row1, v_k2); + v_c2 = _mm_mulhi_epi16(v_row3, v_k1); + v_c3 = _mm_sub_epi16(v_row1, v_row3); + v_c4 = _mm_sub_epi16(v_c1, v_c2); + v_c = _mm_add_epi16(v_c3, v_c4); + v_d1 = _mm_mulhi_epi16(v_row1, v_k1); + v_d2 = _mm_mulhi_epi16(v_row3, v_k2); + v_d3 = _mm_add_epi16(v_row1, v_row3); + v_d4 = _mm_add_epi16(v_d1, v_d2); + v_d = _mm_add_epi16(v_d3, v_d4); + v_row0 = _mm_srai_epi16(_mm_add_epi16(v_a, v_d), (int32_t)(3u)); + v_row1 = _mm_srai_epi16(_mm_add_epi16(v_b, v_c), (int32_t)(3u)); + v_row2 = _mm_srai_epi16(_mm_sub_epi16(v_b, v_c), (int32_t)(3u)); + v_row3 = _mm_srai_epi16(_mm_sub_epi16(v_a, v_d), (int32_t)(3u)); + v_tr0 = _mm_unpacklo_epi16(v_row0, v_row1); + v_tr1 = _mm_unpacklo_epi16(v_row2, v_row3); + v_tr2 = _mm_unpackhi_epi16(v_row0, v_row1); + v_tr3 = _mm_unpackhi_epi16(v_row2, v_row3); + v_ts0 = _mm_unpacklo_epi32(v_tr0, v_tr1); + v_ts1 = _mm_unpackhi_epi32(v_tr0, v_tr1); + v_ts2 = _mm_unpacklo_epi32(v_tr2, v_tr3); + v_ts3 = _mm_unpackhi_epi32(v_tr2, v_tr3); + v_row0 = _mm_unpacklo_epi64(v_ts0, v_ts2); + v_row1 = _mm_unpackhi_epi64(v_ts0, v_ts2); + v_row2 = _mm_unpacklo_epi64(v_ts1, v_ts3); + v_row3 = _mm_unpackhi_epi64(v_ts1, v_ts3); + if (4u <= ((uint64_t)(a_dst.len))) { + v_pred = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(a_dst.ptr))); + v_pred16 = _mm_unpacklo_epi8(v_pred, v_k_0); + v_sum = _mm_add_epi16(v_pred16, v_row0); + v_out = _mm_packus_epi16(v_sum, v_sum); + 
wuffs_base__poke_u32le__no_bounds_check(a_dst.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_out)))); + } + if (((uint64_t)(a_stride)) <= ((uint64_t)(a_dst.len))) { + a_dst = wuffs_base__slice_u8__subslice_i(a_dst, ((uint64_t)(a_stride))); + } + if (4u <= ((uint64_t)(a_dst.len))) { + v_pred = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(a_dst.ptr))); + v_pred16 = _mm_unpacklo_epi8(v_pred, v_k_0); + v_sum = _mm_add_epi16(v_pred16, v_row1); + v_out = _mm_packus_epi16(v_sum, v_sum); + wuffs_base__poke_u32le__no_bounds_check(a_dst.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_out)))); + } + if (((uint64_t)(a_stride)) <= ((uint64_t)(a_dst.len))) { + a_dst = wuffs_base__slice_u8__subslice_i(a_dst, ((uint64_t)(a_stride))); + } + if (4u <= ((uint64_t)(a_dst.len))) { + v_pred = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(a_dst.ptr))); + v_pred16 = _mm_unpacklo_epi8(v_pred, v_k_0); + v_sum = _mm_add_epi16(v_pred16, v_row2); + v_out = _mm_packus_epi16(v_sum, v_sum); + wuffs_base__poke_u32le__no_bounds_check(a_dst.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_out)))); + } + if (((uint64_t)(a_stride)) <= ((uint64_t)(a_dst.len))) { + a_dst = wuffs_base__slice_u8__subslice_i(a_dst, ((uint64_t)(a_stride))); + } + if (4u <= ((uint64_t)(a_dst.len))) { + v_pred = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(a_dst.ptr))); + v_pred16 = _mm_unpacklo_epi8(v_pred, v_k_0); + v_sum = _mm_add_epi16(v_pred16, v_row3); + v_out = _mm_packus_epi16(v_sum, v_sum); + wuffs_base__poke_u32le__no_bounds_check(a_dst.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_out)))); + } + v_i = 0u; + while (v_i < 16u) { + self->private_data.f_mb_coeffs[(v_off + v_i)] = 0u; + v_i += 1u; + } + return wuffs_base__make_empty_struct(); +} +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) +// ‼ WUFFS MULTI-FILE SECTION -x86_sse42 + +// ‼ WUFFS MULTI-FILE SECTION +x86_sse42 +// -------- func vp8.decoder.idct_dc_add_x86_sse42 + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) 
+WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2") +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__idct_dc_add_x86_sse42( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_dst, + uint32_t a_stride, + uint32_t a_coeff_offset) { + __m128i v_k_0 = {0}; + __m128i v_dc16 = {0}; + __m128i v_pred = {0}; + __m128i v_pred16 = {0}; + __m128i v_sum = {0}; + __m128i v_out = {0}; + uint32_t v_off = 0; + uint32_t v_dc = 0; + + v_off = a_coeff_offset; + v_k_0 = _mm_setzero_si128(); + v_dc = ((uint32_t)(self->private_data.f_mb_coeffs[v_off] + 4u)); + v_dc = ((v_dc >> 3u) | ((uint32_t)(((uint32_t)(0u - (v_dc >> 31u))) << 29u))); + self->private_data.f_mb_coeffs[v_off] = 0u; + v_dc16 = _mm_set1_epi16((int16_t)(((uint16_t)(v_dc)))); + if (4u <= ((uint64_t)(a_dst.len))) { + v_pred = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(a_dst.ptr))); + v_pred16 = _mm_unpacklo_epi8(v_pred, v_k_0); + v_sum = _mm_add_epi16(v_pred16, v_dc16); + v_out = _mm_packus_epi16(v_sum, v_sum); + wuffs_base__poke_u32le__no_bounds_check(a_dst.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_out)))); + } + if (((uint64_t)(a_stride)) <= ((uint64_t)(a_dst.len))) { + a_dst = wuffs_base__slice_u8__subslice_i(a_dst, ((uint64_t)(a_stride))); + } + if (4u <= ((uint64_t)(a_dst.len))) { + v_pred = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(a_dst.ptr))); + v_pred16 = _mm_unpacklo_epi8(v_pred, v_k_0); + v_sum = _mm_add_epi16(v_pred16, v_dc16); + v_out = _mm_packus_epi16(v_sum, v_sum); + wuffs_base__poke_u32le__no_bounds_check(a_dst.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_out)))); + } + if (((uint64_t)(a_stride)) <= ((uint64_t)(a_dst.len))) { + a_dst = wuffs_base__slice_u8__subslice_i(a_dst, ((uint64_t)(a_stride))); + } + if (4u <= ((uint64_t)(a_dst.len))) { + v_pred = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(a_dst.ptr))); + v_pred16 = _mm_unpacklo_epi8(v_pred, v_k_0); + v_sum = _mm_add_epi16(v_pred16, v_dc16); + 
v_out = _mm_packus_epi16(v_sum, v_sum); + wuffs_base__poke_u32le__no_bounds_check(a_dst.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_out)))); + } + if (((uint64_t)(a_stride)) <= ((uint64_t)(a_dst.len))) { + a_dst = wuffs_base__slice_u8__subslice_i(a_dst, ((uint64_t)(a_stride))); + } + if (4u <= ((uint64_t)(a_dst.len))) { + v_pred = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(a_dst.ptr))); + v_pred16 = _mm_unpacklo_epi8(v_pred, v_k_0); + v_sum = _mm_add_epi16(v_pred16, v_dc16); + v_out = _mm_packus_epi16(v_sum, v_sum); + wuffs_base__poke_u32le__no_bounds_check(a_dst.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_out)))); + } + return wuffs_base__make_empty_struct(); +} +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) +// ‼ WUFFS MULTI-FILE SECTION -x86_sse42 + +// -------- func vp8.decoder.decode_frame_mb + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__status +wuffs_vp8__decoder__decode_frame_mb( + wuffs_vp8__decoder* self, + wuffs_base__io_buffer* a_src, + wuffs_base__pixel_buffer* a_dst, + wuffs_base__slice_u8 a_workbuf) { + wuffs_base__status status = wuffs_base__make_status(NULL); + + wuffs_base__status v_swizzle_status = wuffs_base__make_status(NULL); + uint32_t v_prev_mby = 0; + uint32_t v_i = 0; + uint32_t v_part_size = 0; + uint32_t v_total_size = 0; + uint32_t v_n_copied = 0; + uint64_t v_coeff_start = 0; + uint64_t v_off = 0; + uint32_t v_new_part = 0; + uint32_t v_p = 0; + uint32_t v_unconsumed = 0; + + const uint8_t* iop_a_src = NULL; + const uint8_t* io0_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL; + const uint8_t* io1_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL; + const uint8_t* io2_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL; + if (a_src && a_src->data.ptr) { + io0_a_src = a_src->data.ptr; + io1_a_src = io0_a_src + a_src->meta.ri; + iop_a_src = io1_a_src; + io2_a_src = io0_a_src + a_src->meta.wi; + } + + if (self->private_impl.f_num_partitions > 1u) { + self->private_impl.f_multi_partition = true; + v_total_size = 0u; + v_i = 0u; 
+ while ((v_i < 7u) && ((v_i + 1u) < self->private_impl.f_num_partitions)) { + if (((uint64_t)(io2_a_src - iop_a_src)) >= 3u) { + v_part_size = ((uint32_t)(wuffs_base__peek_u24le__no_bounds_check(iop_a_src))); + iop_a_src += 3u; + } else { + v_part_size = 0u; + } + if (v_i < 8u) { + self->private_impl.f_part_wbuf_size[v_i] = v_part_size; + } + v_total_size += v_part_size; + v_i += 1u; + } + v_coeff_start = ((uint64_t)(self->private_impl.f_workbuf_offset_v_end + ((uint64_t)(self->private_impl.f_partition0_size)))); + v_off = v_coeff_start; + v_n_copied = 0u; + while (((uint64_t)(io2_a_src - iop_a_src)) > 0u) { + if (v_off < ((uint64_t)(a_workbuf.len))) { + a_workbuf.ptr[v_off] = wuffs_base__peek_u8be__no_bounds_check(iop_a_src); + } + iop_a_src += 1u; + v_off += 1u; + v_n_copied += 1u; + } + if (self->private_impl.f_num_partitions > 0u) { + v_i = (self->private_impl.f_num_partitions - 1u); + } else { + v_i = 0u; + } + if (v_i < 8u) { + if (v_n_copied > v_total_size) { + self->private_impl.f_part_wbuf_size[v_i] = (v_n_copied - v_total_size); + } else { + self->private_impl.f_part_wbuf_size[v_i] = 0u; + } + } + v_off = v_coeff_start; + v_i = 0u; + while ((v_i < self->private_impl.f_num_partitions) && (v_i < 8u)) { + self->private_impl.f_part_wbuf_offset[v_i] = v_off; + v_off += ((uint64_t)(self->private_impl.f_part_wbuf_size[v_i])); + v_i += 1u; + } + v_i = 0u; + while (v_i < 8u) { + self->private_impl.f_part_range[v_i] = 254u; + self->private_impl.f_part_value[v_i] = 0u; + self->private_impl.f_part_bits[v_i] = 0u; + self->private_impl.f_part_wbuf_ri[v_i] = 0u; + v_i += 1u; + } + self->private_impl.f_current_partition = 0u; + self->private_impl.f_current_part_wbuf_ri = 0u; + self->private_impl.f_p1_ri = 0u; + self->private_impl.f_p1_wi = 0u; + wuffs_vp8__decoder__p1_fill_from_workbuf(self, a_workbuf); + wuffs_vp8__decoder__p1_init(self); + } else { + self->private_impl.f_multi_partition = false; + self->private_impl.f_p1_ri = 0u; + self->private_impl.f_p1_wi = 0u; + 
if (a_src) { + a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr)); + } + wuffs_vp8__decoder__p1_fill_buffer(self, a_src, 4096u); + if (a_src) { + iop_a_src = a_src->data.ptr + a_src->meta.ri; + } + wuffs_vp8__decoder__p1_init(self); + } + wuffs_private_impl__bulk_memset(&self->private_data.f_above_nz[0], 8200u, 0u); + wuffs_private_impl__bulk_memset(&self->private_data.f_above_nz_y2[0], 1025u, 0u); + wuffs_private_impl__bulk_memset(&self->private_data.f_above_modes[0], 4096u, 0u); + self->private_impl.f_mb_y = 0u; + while (self->private_impl.f_mb_y < self->private_impl.f_mb_height) { + if (self->private_impl.f_multi_partition && (self->private_impl.f_mb_y > 0u)) { + v_p = self->private_impl.f_current_partition; + self->private_impl.f_part_range[v_p] = self->private_impl.f_p1_range; + self->private_impl.f_part_value[v_p] = self->private_impl.f_p1_value; + self->private_impl.f_part_bits[v_p] = self->private_impl.f_p1_bits; + if (self->private_impl.f_p1_wi >= self->private_impl.f_p1_ri) { + v_unconsumed = (self->private_impl.f_p1_wi - self->private_impl.f_p1_ri); + } else { + v_unconsumed = 0u; + } + self->private_impl.f_part_wbuf_ri[v_p] = wuffs_base__u32__sat_sub(self->private_impl.f_current_part_wbuf_ri, v_unconsumed); + v_new_part = (((uint32_t)(self->private_impl.f_current_partition)) + 1u); + if (v_new_part >= self->private_impl.f_num_partitions) { + v_new_part = 0u; + } + if (v_new_part < 8u) { + self->private_impl.f_p1_range = (self->private_impl.f_part_range[v_new_part] & 255u); + self->private_impl.f_p1_value = self->private_impl.f_part_value[v_new_part]; + self->private_impl.f_p1_bits = self->private_impl.f_part_bits[v_new_part]; + self->private_impl.f_current_part_wbuf_ri = self->private_impl.f_part_wbuf_ri[v_new_part]; + self->private_impl.f_current_partition = ((uint32_t)(v_new_part)); + } + self->private_impl.f_p1_ri = 0u; + self->private_impl.f_p1_wi = 0u; + wuffs_vp8__decoder__p1_fill_from_workbuf(self, a_workbuf); + } + if 
((self->private_impl.f_mb_y & 1u) == 0u) { + wuffs_private_impl__bulk_memset(&self->private_data.f_mb_filter_level[0u], (1024u - 0u), 0u); + wuffs_private_impl__bulk_memset(&self->private_data.f_mb_filter_inner[0u], (1024u - 0u), 0u); + } else { + wuffs_private_impl__bulk_memset(&self->private_data.f_mb_filter_level[1024u], (2048u - 1024u), 0u); + wuffs_private_impl__bulk_memset(&self->private_data.f_mb_filter_inner[1024u], (2048u - 1024u), 0u); + } + wuffs_private_impl__bulk_memset(&self->private_data.f_left_nz[0], 8u, 0u); + self->private_impl.f_left_nz_y2 = 0u; + wuffs_private_impl__bulk_memset(&self->private_data.f_left_modes[0], 4u, 0u); + self->private_impl.f_mb_x = 0u; + while (self->private_impl.f_mb_x < self->private_impl.f_mb_width) { + if (((uint32_t)(self->private_impl.f_bool_ri + 256u)) >= self->private_impl.f_bool_wi) { + wuffs_vp8__decoder__bool_fill_from_workbuf(self, a_workbuf); + } + if (((uint32_t)(self->private_impl.f_p1_ri + 2048u)) >= self->private_impl.f_p1_wi) { + if (self->private_impl.f_multi_partition) { + wuffs_vp8__decoder__p1_fill_from_workbuf(self, a_workbuf); + } else { + if (a_src) { + a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr)); + } + wuffs_vp8__decoder__p1_fill_buffer(self, a_src, 2048u); + if (a_src) { + iop_a_src = a_src->data.ptr + a_src->meta.ri; + } + } + } + wuffs_vp8__decoder__decode_one_mb(self, a_workbuf); + if (self->private_impl.f_mb_x < 1023u) { + self->private_impl.f_mb_x += 1u; + } + } + if (self->private_impl.f_mb_y > 0u) { + v_prev_mby = (self->private_impl.f_mb_y - 1u); + if ((self->private_impl.f_filter_type == 1u) && (self->private_impl.f_filter_level > 0u)) { + wuffs_vp8__decoder__apply_simple_filter_row(self, a_workbuf, v_prev_mby); + } else if (self->private_impl.f_filter_level > 0u) { + wuffs_vp8__decoder__apply_normal_filter_row(self, a_workbuf, v_prev_mby); + } + v_swizzle_status = wuffs_vp8__decoder__swizzle_mb_row(self, + a_dst, + a_workbuf, + v_prev_mby, + false); + } + if 
(self->private_impl.f_mb_y < 1023u) { + self->private_impl.f_mb_y += 1u; + } + } + if (self->private_impl.f_mb_height > 0u) { + v_prev_mby = (self->private_impl.f_mb_height - 1u); + if (v_prev_mby <= 1023u) { + if ((self->private_impl.f_filter_type == 1u) && (self->private_impl.f_filter_level > 0u)) { + wuffs_vp8__decoder__apply_simple_filter_row(self, a_workbuf, v_prev_mby); + } else if (self->private_impl.f_filter_level > 0u) { + wuffs_vp8__decoder__apply_normal_filter_row(self, a_workbuf, v_prev_mby); + } + v_swizzle_status = wuffs_vp8__decoder__swizzle_mb_row(self, + a_dst, + a_workbuf, + v_prev_mby, + true); + } + } + status = v_swizzle_status; + if (wuffs_base__status__is_error(&status)) { + goto exit; + } else if (wuffs_base__status__is_suspension(&status)) { + status = wuffs_base__make_status(wuffs_base__error__cannot_return_a_suspension); + goto exit; + } + goto ok; + + ok: + goto exit; + exit: + if (a_src && a_src->data.ptr) { + a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr)); + } + + return status; +} + +// -------- func vp8.decoder.decode_one_mb + +WUFFS_BASE__GENERATED_C_CODE_NOINLINE +static wuffs_base__empty_struct +wuffs_vp8__decoder__decode_one_mb( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf) { + uint32_t v_i = 0; + uint32_t v_v = 0; + uint32_t v_block_offset = 0; + uint64_t v_y_off = 0; + uint64_t v_uv_off = 0; + wuffs_base__slice_u8 v_dst = {0}; + uint32_t v_mb_idx = 0; + uint32_t v_seg = 0; + uint32_t v_ys = 0; + uint32_t v_uvs = 0; + uint64_t v_y_base = 0; + uint64_t v_uv_base = 0; + + if (self->private_impl.f_use_segment && self->private_impl.f_update_segment_map) { + v_v = wuffs_vp8__decoder__bool_read_bool(self, self->private_impl.f_segment_prob[0u]); + if (v_v == 0u) { + v_v = wuffs_vp8__decoder__bool_read_bool(self, self->private_impl.f_segment_prob[1u]); + if (v_v == 0u) { + self->private_impl.f_segment_id = 0u; + } else { + self->private_impl.f_segment_id = 1u; + } + } else { + v_v = 
wuffs_vp8__decoder__bool_read_bool(self, self->private_impl.f_segment_prob[2u]); + if (v_v == 0u) { + self->private_impl.f_segment_id = 2u; + } else { + self->private_impl.f_segment_id = 3u; + } + } + } else { + self->private_impl.f_segment_id = 0u; + } + if (self->private_impl.f_mb_no_skip_coeff) { + v_v = wuffs_vp8__decoder__bool_read_bool(self, self->private_impl.f_prob_skip_false); + self->private_impl.f_is_skip_coeff = (v_v != 0u); + } else { + self->private_impl.f_is_skip_coeff = false; + } + wuffs_vp8__decoder__decode_luma_mode(self); + wuffs_vp8__decoder__decode_chroma_mode(self); + if ( ! self->private_impl.f_is_skip_coeff) { + wuffs_vp8__decoder__decode_mb_coefficients(self); + } else { + wuffs_vp8__decoder__clear_mb_nz_context(self); + } + v_ys = self->private_impl.f_y_stride; + v_uvs = self->private_impl.f_uv_stride; + v_y_base = ((((uint64_t)(self->private_impl.f_mb_y)) * 16u * ((uint64_t)(v_ys))) + (((uint64_t)(self->private_impl.f_mb_x)) * 16u)); + if (self->private_impl.f_mb_luma_mode < 4u) { + wuffs_vp8__decoder__predict_16x16(self, a_workbuf, ((uint8_t)(self->private_impl.f_mb_luma_mode))); + if ( ! 
self->private_impl.f_is_skip_coeff) { + wuffs_vp8__decoder__wht(self, 384u); + v_i = 0u; + while (v_i < 16u) { + v_block_offset = (v_i * 16u); + v_y_off = ((uint64_t)(((uint64_t)(v_y_base + (((uint64_t)((v_i >> 2u))) * 4u * ((uint64_t)(v_ys))))) + (((uint64_t)((v_i & 3u))) * 4u))); + if (v_y_off < ((uint64_t)(a_workbuf.len))) { + v_dst = wuffs_base__slice_u8__subslice_i(a_workbuf, v_y_off); + if (self->private_data.f_mb_y_ac_nz[v_i] >= 2u) { + wuffs_vp8__decoder__idct_add(self, v_dst, v_ys, v_block_offset); + } else if (self->private_data.f_mb_coeffs[v_block_offset] != 0u) { + wuffs_vp8__decoder__idct_dc_add(self, v_dst, v_ys, v_block_offset); + } + } + v_i += 1u; + } + } + } else { + if (self->private_impl.f_mb_y > 0u) { + v_y_off = ((uint64_t)(((uint64_t)(((uint64_t)(self->private_impl.f_mb_y)) * 16u)) * ((uint64_t)(self->private_impl.f_y_stride)))); + v_y_off = ((uint64_t)(v_y_off - ((uint64_t)(self->private_impl.f_y_stride)))); + v_y_off = ((uint64_t)(v_y_off + ((uint64_t)(((uint64_t)(self->private_impl.f_mb_x)) * 16u)))); + if (((uint32_t)(self->private_impl.f_mb_x)) < ((uint32_t)(self->private_impl.f_mb_width - 1u))) { + v_y_off = ((uint64_t)(v_y_off + 16u)); + if (v_y_off < ((uint64_t)(a_workbuf.len))) { + v_dst = wuffs_base__slice_u8__subslice_i(a_workbuf, v_y_off); + if (((uint64_t)(v_dst.len)) >= 4u) { + v_mb_idx = wuffs_base__peek_u32le__no_bounds_check(v_dst.ptr); + self->private_data.f_mb_upper_right[0u] = ((uint8_t)(v_mb_idx)); + self->private_data.f_mb_upper_right[1u] = ((uint8_t)((v_mb_idx >> 8u))); + self->private_data.f_mb_upper_right[2u] = ((uint8_t)((v_mb_idx >> 16u))); + self->private_data.f_mb_upper_right[3u] = ((uint8_t)((v_mb_idx >> 24u))); + } + } + } else { + v_y_off = ((uint64_t)(v_y_off + 15u)); + if (v_y_off < ((uint64_t)(a_workbuf.len))) { + self->private_data.f_mb_upper_right[0u] = a_workbuf.ptr[v_y_off]; + self->private_data.f_mb_upper_right[1u] = a_workbuf.ptr[v_y_off]; + self->private_data.f_mb_upper_right[2u] = 
a_workbuf.ptr[v_y_off]; + self->private_data.f_mb_upper_right[3u] = a_workbuf.ptr[v_y_off]; + } + } + } else { + self->private_data.f_mb_upper_right[0u] = 127u; + self->private_data.f_mb_upper_right[1u] = 127u; + self->private_data.f_mb_upper_right[2u] = 127u; + self->private_data.f_mb_upper_right[3u] = 127u; + } + v_i = 0u; + while (v_i < 16u) { + v_block_offset = (v_i * 16u); + wuffs_vp8__decoder__predict_4x4(self, a_workbuf, ((uint32_t)(v_i)), self->private_data.f_sub_modes[v_i]); + if ( ! self->private_impl.f_is_skip_coeff && (self->private_data.f_mb_y_ac_nz[v_i] > 0u)) { + v_y_off = ((uint64_t)(((uint64_t)(v_y_base + (((uint64_t)((v_i >> 2u))) * 4u * ((uint64_t)(v_ys))))) + (((uint64_t)((v_i & 3u))) * 4u))); + if (v_y_off < ((uint64_t)(a_workbuf.len))) { + v_dst = wuffs_base__slice_u8__subslice_i(a_workbuf, v_y_off); + if (self->private_data.f_mb_y_ac_nz[v_i] >= 2u) { + wuffs_vp8__decoder__idct_add(self, v_dst, v_ys, v_block_offset); + } else { + wuffs_vp8__decoder__idct_dc_add(self, v_dst, v_ys, v_block_offset); + } + } + } + v_i += 1u; + } + } + wuffs_vp8__decoder__predict_8x8(self, a_workbuf, self->private_impl.f_mb_chroma_mode, self->private_impl.f_workbuf_offset_y_end); + wuffs_vp8__decoder__predict_8x8(self, a_workbuf, self->private_impl.f_mb_chroma_mode, self->private_impl.f_workbuf_offset_u_end); + if ( ! 
self->private_impl.f_is_skip_coeff) { + v_uv_base = ((uint64_t)(((uint64_t)(self->private_impl.f_workbuf_offset_y_end + (((uint64_t)(self->private_impl.f_mb_y)) * 8u * ((uint64_t)(v_uvs))))) + (((uint64_t)(self->private_impl.f_mb_x)) * 8u))); + v_i = 0u; + while (v_i < 4u) { + v_block_offset = ((16u + v_i) * 16u); + if (self->private_data.f_mb_uv_nz[v_i] > 0u) { + v_uv_off = ((uint64_t)(((uint64_t)(v_uv_base + (((uint64_t)((v_i >> 1u))) * 4u * ((uint64_t)(v_uvs))))) + (((uint64_t)((v_i & 1u))) * 4u))); + if (v_uv_off < ((uint64_t)(a_workbuf.len))) { + v_dst = wuffs_base__slice_u8__subslice_i(a_workbuf, v_uv_off); + if (self->private_data.f_mb_uv_nz[v_i] >= 2u) { + wuffs_vp8__decoder__idct_add(self, v_dst, v_uvs, v_block_offset); + } else { + wuffs_vp8__decoder__idct_dc_add(self, v_dst, v_uvs, v_block_offset); + } + } + } + v_i += 1u; + } + v_uv_base = ((uint64_t)(((uint64_t)(self->private_impl.f_workbuf_offset_u_end + (((uint64_t)(self->private_impl.f_mb_y)) * 8u * ((uint64_t)(v_uvs))))) + (((uint64_t)(self->private_impl.f_mb_x)) * 8u))); + v_i = 0u; + while (v_i < 4u) { + v_block_offset = ((20u + v_i) * 16u); + if (self->private_data.f_mb_uv_nz[(v_i + 4u)] > 0u) { + v_uv_off = ((uint64_t)(((uint64_t)(v_uv_base + (((uint64_t)((v_i >> 1u))) * 4u * ((uint64_t)(v_uvs))))) + (((uint64_t)((v_i & 1u))) * 4u))); + if (v_uv_off < ((uint64_t)(a_workbuf.len))) { + v_dst = wuffs_base__slice_u8__subslice_i(a_workbuf, v_uv_off); + if (self->private_data.f_mb_uv_nz[(v_i + 4u)] >= 2u) { + wuffs_vp8__decoder__idct_add(self, v_dst, v_uvs, v_block_offset); + } else { + wuffs_vp8__decoder__idct_dc_add(self, v_dst, v_uvs, v_block_offset); + } + } + } + v_i += 1u; + } + } + v_mb_idx = (((self->private_impl.f_mb_y & 1u) * 1024u) + self->private_impl.f_mb_x); + if (v_mb_idx < 2048u) { + v_seg = (((uint32_t)(((uint8_t)(self->private_impl.f_segment_id & 3u)))) * 2u); + if (self->private_impl.f_mb_luma_mode == 4u) { + v_seg += 1u; + } + if (v_seg < 8u) { + 
self->private_data.f_mb_filter_level[v_mb_idx] = self->private_impl.f_fstrength_level[v_seg]; + self->private_data.f_mb_filter_ilevel[v_mb_idx] = self->private_impl.f_fstrength_ilevel[v_seg]; + self->private_data.f_mb_filter_hlevel[v_mb_idx] = self->private_impl.f_fstrength_hlevel[v_seg]; + } + if ((self->private_impl.f_mb_luma_mode == 4u) || ! self->private_impl.f_is_skip_coeff) { + self->private_data.f_mb_filter_inner[v_mb_idx] = 1u; + } + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.decode_luma_mode + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__decode_luma_mode( + wuffs_vp8__decoder* self) { + uint32_t v_v = 0; + uint32_t v_val = 0; + uint32_t v_mode = 0; + uint32_t v_i = 0; + uint32_t v_above_mode = 0; + uint32_t v_left_mode = 0; + uint32_t v_prob_idx = 0; + uint32_t v_above_idx = 0; + + v_v = wuffs_vp8__decoder__bool_read_bool(self, WUFFS_VP8__KF_Y_MODE_PROBS[0u]); + if (v_v == 0u) { + v_mode = 4u; + } else { + v_v = wuffs_vp8__decoder__bool_read_bool(self, WUFFS_VP8__KF_Y_MODE_PROBS[1u]); + if (v_v == 0u) { + v_v = wuffs_vp8__decoder__bool_read_bool(self, WUFFS_VP8__KF_Y_MODE_PROBS[2u]); + if (v_v == 0u) { + v_mode = 0u; + } else { + v_mode = 1u; + } + } else { + v_v = wuffs_vp8__decoder__bool_read_bool(self, WUFFS_VP8__KF_Y_MODE_PROBS[3u]); + if (v_v == 0u) { + v_mode = 2u; + } else { + v_mode = 3u; + } + } + } + self->private_impl.f_mb_luma_mode = ((uint8_t)(v_mode)); + if (v_mode == 4u) { + v_i = 0u; + while (v_i < 16u) { + if (v_i < 4u) { + v_above_idx = ((self->private_impl.f_mb_x * 4u) + (v_i & 3u)); + if (v_above_idx < 4096u) { + v_above_mode = ((uint32_t)(self->private_data.f_above_modes[v_above_idx])); + } + } else { + v_above_mode = ((uint32_t)(self->private_data.f_sub_modes[(v_i - 4u)])); + } + if ((v_i & 3u) == 0u) { + if ((v_i >> 2u) < 4u) { + v_left_mode = ((uint32_t)(self->private_data.f_left_modes[(v_i >> 2u)])); + } + } else if (v_i > 0u) { + v_left_mode = 
((uint32_t)(self->private_data.f_sub_modes[(v_i - 1u)])); + } + if (v_above_mode > 9u) { + v_above_mode = 0u; + } + if (v_left_mode > 9u) { + v_left_mode = 0u; + } + v_above_mode = (v_above_mode & 15u); + v_left_mode = (v_left_mode & 15u); + v_prob_idx = (((v_above_mode * 10u) + v_left_mode) * 9u); + v_val = wuffs_vp8__decoder__decode_sub_block_mode(self, v_prob_idx); + self->private_data.f_sub_modes[v_i] = ((uint8_t)(v_val)); + v_i += 1u; + } + v_above_idx = (self->private_impl.f_mb_x * 4u); + if (v_above_idx < 4093u) { + self->private_data.f_above_modes[(v_above_idx + 0u)] = self->private_data.f_sub_modes[12u]; + self->private_data.f_above_modes[(v_above_idx + 1u)] = self->private_data.f_sub_modes[13u]; + self->private_data.f_above_modes[(v_above_idx + 2u)] = self->private_data.f_sub_modes[14u]; + self->private_data.f_above_modes[(v_above_idx + 3u)] = self->private_data.f_sub_modes[15u]; + } + self->private_data.f_left_modes[0u] = self->private_data.f_sub_modes[3u]; + self->private_data.f_left_modes[1u] = self->private_data.f_sub_modes[7u]; + self->private_data.f_left_modes[2u] = self->private_data.f_sub_modes[11u]; + self->private_data.f_left_modes[3u] = self->private_data.f_sub_modes[15u]; + } else { + v_val = v_mode; + if (v_mode == 1u) { + v_val = 2u; + } else if (v_mode == 2u) { + v_val = 3u; + } else if (v_mode == 3u) { + v_val = 1u; + } + v_above_idx = (self->private_impl.f_mb_x * 4u); + if (v_above_idx < 4093u) { + self->private_data.f_above_modes[(v_above_idx + 0u)] = ((uint8_t)(v_val)); + self->private_data.f_above_modes[(v_above_idx + 1u)] = ((uint8_t)(v_val)); + self->private_data.f_above_modes[(v_above_idx + 2u)] = ((uint8_t)(v_val)); + self->private_data.f_above_modes[(v_above_idx + 3u)] = ((uint8_t)(v_val)); + } + self->private_data.f_left_modes[0u] = ((uint8_t)(v_val)); + self->private_data.f_left_modes[1u] = ((uint8_t)(v_val)); + self->private_data.f_left_modes[2u] = ((uint8_t)(v_val)); + self->private_data.f_left_modes[3u] = ((uint8_t)(v_val)); 
+ } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.decode_sub_block_mode + +WUFFS_BASE__GENERATED_C_CODE +static uint32_t +wuffs_vp8__decoder__decode_sub_block_mode( + wuffs_vp8__decoder* self, + uint32_t a_prob_offset) { + uint32_t v_v = 0; + uint32_t v_p = 0; + + v_p = a_prob_offset; + if (v_p > 891u) { + return 0u; + } + v_v = wuffs_vp8__decoder__bool_read_bool(self, WUFFS_VP8__KF_B_MODE_PROBS[v_p]); + if (v_v == 0u) { + return 0u; + } + v_v = wuffs_vp8__decoder__bool_read_bool(self, WUFFS_VP8__KF_B_MODE_PROBS[(v_p + 1u)]); + if (v_v == 0u) { + return 1u; + } + v_v = wuffs_vp8__decoder__bool_read_bool(self, WUFFS_VP8__KF_B_MODE_PROBS[(v_p + 2u)]); + if (v_v == 0u) { + return 2u; + } + v_v = wuffs_vp8__decoder__bool_read_bool(self, WUFFS_VP8__KF_B_MODE_PROBS[(v_p + 3u)]); + if (v_v == 0u) { + v_v = wuffs_vp8__decoder__bool_read_bool(self, WUFFS_VP8__KF_B_MODE_PROBS[(v_p + 4u)]); + if (v_v == 0u) { + return 3u; + } + v_v = wuffs_vp8__decoder__bool_read_bool(self, WUFFS_VP8__KF_B_MODE_PROBS[(v_p + 5u)]); + if (v_v == 0u) { + return 5u; + } + return 6u; + } + v_v = wuffs_vp8__decoder__bool_read_bool(self, WUFFS_VP8__KF_B_MODE_PROBS[(v_p + 6u)]); + if (v_v == 0u) { + return 4u; + } + v_v = wuffs_vp8__decoder__bool_read_bool(self, WUFFS_VP8__KF_B_MODE_PROBS[(v_p + 7u)]); + if (v_v == 0u) { + return 7u; + } + v_v = wuffs_vp8__decoder__bool_read_bool(self, WUFFS_VP8__KF_B_MODE_PROBS[(v_p + 8u)]); + if (v_v == 0u) { + return 8u; + } + return 9u; +} + +// -------- func vp8.decoder.decode_chroma_mode + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__decode_chroma_mode( + wuffs_vp8__decoder* self) { + uint32_t v_v = 0; + + v_v = wuffs_vp8__decoder__bool_read_bool(self, WUFFS_VP8__KF_UV_MODE_PROBS[0u]); + if (v_v == 0u) { + self->private_impl.f_mb_chroma_mode = 0u; + } else { + v_v = wuffs_vp8__decoder__bool_read_bool(self, WUFFS_VP8__KF_UV_MODE_PROBS[1u]); + if (v_v == 0u) { + 
self->private_impl.f_mb_chroma_mode = 1u; + } else { + v_v = wuffs_vp8__decoder__bool_read_bool(self, WUFFS_VP8__KF_UV_MODE_PROBS[2u]); + if (v_v == 0u) { + self->private_impl.f_mb_chroma_mode = 2u; + } else { + self->private_impl.f_mb_chroma_mode = 3u; + } + } + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.clear_mb_nz_context + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__clear_mb_nz_context( + wuffs_vp8__decoder* self) { + uint32_t v_i = 0; + uint32_t v_above_idx = 0; + + v_i = 0u; + while (v_i < 4u) { + v_above_idx = ((self->private_impl.f_mb_x * 8u) + v_i); + self->private_data.f_above_nz[v_above_idx] = 0u; + self->private_data.f_left_nz[v_i] = 0u; + v_i += 1u; + } + v_i = 0u; + while (v_i < 2u) { + v_above_idx = ((self->private_impl.f_mb_x * 8u) + 4u + v_i); + self->private_data.f_above_nz[v_above_idx] = 0u; + self->private_data.f_left_nz[(4u + v_i)] = 0u; + v_i += 1u; + } + v_i = 0u; + while (v_i < 2u) { + v_above_idx = ((self->private_impl.f_mb_x * 8u) + 6u + v_i); + self->private_data.f_above_nz[v_above_idx] = 0u; + self->private_data.f_left_nz[(6u + v_i)] = 0u; + v_i += 1u; + } + if (self->private_impl.f_mb_luma_mode < 4u) { + self->private_data.f_above_nz_y2[self->private_impl.f_mb_x] = 0u; + self->private_impl.f_left_nz_y2 = 0u; + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.decode_mb_coefficients + +WUFFS_BASE__GENERATED_C_CODE_NOINLINE +static wuffs_base__empty_struct +wuffs_vp8__decoder__decode_mb_coefficients( + wuffs_vp8__decoder* self) { + uint32_t v_block_idx = 0; + uint32_t v_ctx = 0; + uint32_t v_raw_ctx = 0; + uint32_t v_nz = 0; + uint32_t v_above_idx = 0; + uint32_t v_left_idx = 0; + uint32_t v_any_nz = 0; + uint32_t v_uv_idx = 0; + + if (self->private_impl.f_mb_luma_mode < 4u) { + v_raw_ctx = ((uint32_t)(self->private_data.f_above_nz_y2[self->private_impl.f_mb_x])); + v_raw_ctx += ((uint32_t)(self->private_impl.f_left_nz_y2)); + if 
(v_raw_ctx <= 2u) { + v_ctx = ((uint32_t)(v_raw_ctx)); + } else { + v_ctx = 2u; + } + v_nz = wuffs_vp8__decoder__decode_block_coeffs(self, + 384u, + 1u, + 0u, + v_ctx); + v_any_nz |= v_nz; + self->private_data.f_above_nz_y2[self->private_impl.f_mb_x] = ((uint8_t)(v_nz)); + self->private_impl.f_left_nz_y2 = ((uint8_t)(v_nz)); + v_block_idx = 0u; + while (v_block_idx < 16u) { + v_above_idx = ((self->private_impl.f_mb_x * 8u) + (v_block_idx & 3u)); + v_left_idx = (v_block_idx >> 2u); + v_raw_ctx = ((uint32_t)(((uint32_t)(self->private_data.f_above_nz[v_above_idx])) + ((uint32_t)(self->private_data.f_left_nz[v_left_idx])))); + if (v_raw_ctx <= 2u) { + v_ctx = ((uint32_t)(v_raw_ctx)); + } else { + v_ctx = 2u; + } + v_nz = wuffs_vp8__decoder__decode_block_coeffs(self, + (v_block_idx * 16u), + 0u, + 1u, + v_ctx); + v_any_nz |= v_nz; + if (v_nz == 0u) { + self->private_data.f_mb_y_ac_nz[v_block_idx] = 0u; + } else { + self->private_data.f_mb_y_ac_nz[v_block_idx] = 2u; + } + self->private_data.f_above_nz[v_above_idx] = ((uint8_t)(v_nz)); + self->private_data.f_left_nz[v_left_idx] = ((uint8_t)(v_nz)); + v_block_idx += 1u; + } + } else { + v_block_idx = 0u; + while (v_block_idx < 16u) { + v_above_idx = ((self->private_impl.f_mb_x * 8u) + (v_block_idx & 3u)); + v_left_idx = (v_block_idx >> 2u); + v_raw_ctx = ((uint32_t)(((uint32_t)(self->private_data.f_above_nz[v_above_idx])) + ((uint32_t)(self->private_data.f_left_nz[v_left_idx])))); + if (v_raw_ctx <= 2u) { + v_ctx = ((uint32_t)(v_raw_ctx)); + } else { + v_ctx = 2u; + } + v_nz = wuffs_vp8__decoder__decode_block_coeffs(self, + (v_block_idx * 16u), + 3u, + 0u, + v_ctx); + v_any_nz |= v_nz; + self->private_data.f_mb_y_ac_nz[v_block_idx] = ((uint8_t)((v_nz + (v_nz & (self->private_data.f_block_ac_nz & 1u))))); + self->private_data.f_above_nz[v_above_idx] = ((uint8_t)(v_nz)); + self->private_data.f_left_nz[v_left_idx] = ((uint8_t)(v_nz)); + v_block_idx += 1u; + } + } + v_uv_idx = 0u; + while (v_uv_idx < 8u) { + v_block_idx = (16u 
+ v_uv_idx); + v_above_idx = ((self->private_impl.f_mb_x * 8u) + + 4u + + ((v_uv_idx >> 2u) * 2u) + + (v_uv_idx & 1u)); + v_left_idx = (4u + ((v_uv_idx >> 2u) * 2u) + ((v_uv_idx >> 1u) & 1u)); + v_raw_ctx = ((uint32_t)(((uint32_t)(self->private_data.f_above_nz[v_above_idx])) + ((uint32_t)(self->private_data.f_left_nz[v_left_idx])))); + if (v_raw_ctx <= 2u) { + v_ctx = ((uint32_t)(v_raw_ctx)); + } else { + v_ctx = 2u; + } + v_nz = wuffs_vp8__decoder__decode_block_coeffs(self, + (v_block_idx * 16u), + 2u, + 0u, + v_ctx); + v_any_nz |= v_nz; + self->private_data.f_mb_uv_nz[v_uv_idx] = ((uint8_t)((v_nz + (v_nz & (self->private_data.f_block_ac_nz & 1u))))); + self->private_data.f_above_nz[v_above_idx] = ((uint8_t)(v_nz)); + self->private_data.f_left_nz[v_left_idx] = ((uint8_t)(v_nz)); + v_uv_idx += 1u; + } + if (v_any_nz == 0u) { + self->private_impl.f_is_skip_coeff = true; + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.decode_coeff_category + +WUFFS_BASE__GENERATED_C_CODE +static uint32_t +wuffs_vp8__decoder__decode_coeff_category( + wuffs_vp8__decoder* self, + uint32_t a_prob_idx) { + uint32_t v_v = 0; + uint32_t v_cat = 0; + uint32_t v_extra_val = 0; + uint32_t v_i = 0; + uint32_t v_n_extra = 0; + uint32_t v_cat_off = 0; + uint32_t v_cat_end = 0; + + v_v = wuffs_vp8__decoder__p1_read_bool(self, self->private_data.f_coeff_probs[(a_prob_idx + 6u)]); + if (v_v == 0u) { + v_v = wuffs_vp8__decoder__p1_read_bool(self, self->private_data.f_coeff_probs[(a_prob_idx + 7u)]); + if (v_v == 0u) { + v_cat = 0u; + } else { + v_cat = 1u; + } + } else { + v_v = wuffs_vp8__decoder__p1_read_bool(self, self->private_data.f_coeff_probs[(a_prob_idx + 8u)]); + if (v_v == 0u) { + v_v = wuffs_vp8__decoder__p1_read_bool(self, self->private_data.f_coeff_probs[(a_prob_idx + 9u)]); + if (v_v == 0u) { + v_cat = 2u; + } else { + v_cat = 3u; + } + } else { + v_v = wuffs_vp8__decoder__p1_read_bool(self, self->private_data.f_coeff_probs[(a_prob_idx + 10u)]); + if 
(v_v == 0u) { + v_cat = 4u; + } else { + v_cat = 5u; + } + } + } + v_cat_off = ((uint32_t)(WUFFS_VP8__CAT_PROBS_OFFSET[v_cat])); + v_n_extra = ((uint32_t)(WUFFS_VP8__CAT_EXTRA_BITS[v_cat])); + v_cat_end = (v_cat_off + v_n_extra); + v_extra_val = 0u; + v_i = v_cat_off; + while ((v_i < v_cat_end) && (v_i < 26u)) { + v_v = wuffs_vp8__decoder__p1_read_bool(self, WUFFS_VP8__CAT_PROBS[v_i]); + v_extra_val = (((uint32_t)(v_extra_val << 1u)) | ((uint32_t)(v_v))); + v_i += 1u; + } + return ((uint32_t)(((uint32_t)(WUFFS_VP8__CAT_BASE_VALUE[v_cat])) + v_extra_val)); +} + +// -------- func vp8.decoder.decode_block_coeffs + +WUFFS_BASE__GENERATED_C_CODE_ALWAYS_INLINE +static uint32_t +wuffs_vp8__decoder__decode_block_coeffs( + wuffs_vp8__decoder* self, + uint32_t a_block_offset, + uint32_t a_block_type, + uint32_t a_start_coeff, + uint32_t a_init_ctx) { + uint32_t v_coeff_idx = 0; + uint32_t v_ctx = 0; + uint32_t v_prob_idx = 0; + uint32_t v_bt_base = 0; + uint32_t v_v = 0; + uint32_t v_abs_val = 0; + uint32_t v_sign = 0; + uint32_t v_zi = 0; + uint32_t v_dq = 0; + uint32_t v_seg = 0; + uint32_t v_ci = 0; + uint32_t v_has_nz = 0; + uint32_t v_has_ac = 0; + uint32_t v_dq_dc = 0; + uint32_t v_dq_ac = 0; + uint32_t v_lr = 0; + uint64_t v_lv = 0; + uint32_t v_lb = 0; + uint32_t v_s = 0; + uint32_t v_pos = 0; + uint32_t v_bval = 0; + uint32_t v_nshift = 0; + uint64_t v_bb = 0; + uint32_t v_lri = 0; + uint32_t v_lwi = 0; + uint32_t v_lr_taken = 0; + uint32_t v_neg_mask = 0; + + v_seg = ((uint32_t)(self->private_impl.f_segment_id)); + v_has_nz = 0u; + v_has_ac = 0u; + v_bt_base = (a_block_type * 264u); + if (a_block_type == 1u) { + v_dq_dc = self->private_impl.f_dequant_y2_dc[v_seg]; + v_dq_ac = self->private_impl.f_dequant_y2_ac[v_seg]; + } else if (a_block_type == 2u) { + v_dq_dc = self->private_impl.f_dequant_uv_dc[v_seg]; + v_dq_ac = self->private_impl.f_dequant_uv_ac[v_seg]; + } else { + v_dq_dc = self->private_impl.f_dequant_y_dc[v_seg]; + v_dq_ac = 
self->private_impl.f_dequant_y_ac[v_seg]; + } + v_lr = self->private_impl.f_p1_range; + v_lv = self->private_impl.f_p1_value; + v_lb = self->private_impl.f_p1_bits; + v_lri = self->private_impl.f_p1_ri; + v_lwi = self->private_impl.f_p1_wi; + v_coeff_idx = a_start_coeff; + v_ctx = a_init_ctx; + v_prob_idx = (v_bt_base + ((uint32_t)(WUFFS_VP8__COEFF_BAND_OFFSET[v_coeff_idx])) + (v_ctx * 11u)); + if (v_lb < 16u) { + if ((((uint32_t)(v_lri + 4u)) <= v_lwi) && (v_lri < 4093u)) { + v_lv = (((uint64_t)(v_lv << 32u)) | + ((uint64_t)(((uint64_t)(self->private_data.f_p1_buffer[(v_lri + 0u)])) << 24u)) | + ((uint64_t)(((uint64_t)(self->private_data.f_p1_buffer[(v_lri + 1u)])) << 16u)) | + ((uint64_t)(((uint64_t)(self->private_data.f_p1_buffer[(v_lri + 2u)])) << 8u)) | + ((uint64_t)(self->private_data.f_p1_buffer[(v_lri + 3u)]))); + v_lri += 4u; + v_lb += 32u; + } else { + while ((v_lb <= 48u) && (v_lri < v_lwi)) { + v_bb = ((uint64_t)(self->private_data.f_p1_buffer[v_lri])); + v_lri += 1u; + v_lv = (((uint64_t)(v_lv << 8u)) | v_bb); + v_lb += 8u; + } + } + } + v_s = (((uint32_t)(v_lr * ((uint32_t)(self->private_data.f_coeff_probs[v_prob_idx])))) >> 8u); + v_pos = (((uint32_t)(v_lb - 8u)) & 63u); + v_bval = ((uint32_t)((v_lv >> v_pos))); + if (v_bval > v_s) { + v_lv -= ((uint64_t)(((uint64_t)(((uint32_t)(v_s + 1u)))) << v_pos)); + v_lr = (((uint32_t)(((uint32_t)(v_lr - v_s)) - 1u)) & 255u); + } else { + v_lr = v_s; + v_nshift = ((uint32_t)(WUFFS_VP8__RENORM_SHIFT_256[(v_lr & 255u)])); + v_lr = ((uint32_t)(WUFFS_VP8__RENORM_RANGE_256[(v_lr & 255u)])); + v_lb -= v_nshift; + self->private_impl.f_p1_range = (v_lr & 255u); + self->private_impl.f_p1_value = v_lv; + self->private_impl.f_p1_bits = v_lb; + self->private_impl.f_p1_ri = v_lri; + self->private_data.f_block_ac_nz = 0u; + return 0u; + } + v_nshift = ((uint32_t)(WUFFS_VP8__RENORM_SHIFT_256[(v_lr & 255u)])); + v_lr = ((uint32_t)(WUFFS_VP8__RENORM_RANGE_256[(v_lr & 255u)])); + v_lb -= v_nshift; + while (v_coeff_idx < 16u) { + 
if (v_lb < 28u) { + if ((((uint32_t)(v_lri + 4u)) <= v_lwi) && (v_lri < 4093u)) { + v_lv = (((uint64_t)(v_lv << 32u)) | + ((uint64_t)(((uint64_t)(self->private_data.f_p1_buffer[(v_lri + 0u)])) << 24u)) | + ((uint64_t)(((uint64_t)(self->private_data.f_p1_buffer[(v_lri + 1u)])) << 16u)) | + ((uint64_t)(((uint64_t)(self->private_data.f_p1_buffer[(v_lri + 2u)])) << 8u)) | + ((uint64_t)(self->private_data.f_p1_buffer[(v_lri + 3u)]))); + v_lri += 4u; + v_lb += 32u; + } else { + while ((v_lb <= 48u) && (v_lri < v_lwi)) { + v_bb = ((uint64_t)(self->private_data.f_p1_buffer[v_lri])); + v_lri += 1u; + v_lv = (((uint64_t)(v_lv << 8u)) | v_bb); + v_lb += 8u; + } + } + } + v_s = (((uint32_t)(v_lr * ((uint32_t)(self->private_data.f_coeff_probs[(v_prob_idx + 1u)])))) >> 8u); + v_pos = (((uint32_t)(v_lb - 8u)) & 63u); + v_bval = ((uint32_t)((v_lv >> v_pos))); + if (v_bval > v_s) { + v_v = 1u; + v_lv -= ((uint64_t)(((uint64_t)(((uint32_t)(v_s + 1u)))) << v_pos)); + v_lr = (((uint32_t)(((uint32_t)(v_lr - v_s)) - 1u)) & 255u); + } else { + v_v = 0u; + v_lr = v_s; + } + v_nshift = ((uint32_t)(WUFFS_VP8__RENORM_SHIFT_256[(v_lr & 255u)])); + v_lr = ((uint32_t)(WUFFS_VP8__RENORM_RANGE_256[(v_lr & 255u)])); + v_lb -= v_nshift; + if (v_v == 0u) { + v_coeff_idx += 1u; + if (v_coeff_idx >= 16u) { + break; + } + v_prob_idx = (v_bt_base + ((uint32_t)(WUFFS_VP8__COEFF_BAND_OFFSET[v_coeff_idx]))); + continue; + } + v_s = (((uint32_t)(v_lr * ((uint32_t)(self->private_data.f_coeff_probs[(v_prob_idx + 2u)])))) >> 8u); + v_pos = (((uint32_t)(v_lb - 8u)) & 63u); + v_bval = ((uint32_t)((v_lv >> v_pos))); + if (v_bval > v_s) { + v_v = 1u; + v_lv -= ((uint64_t)(((uint64_t)(((uint32_t)(v_s + 1u)))) << v_pos)); + v_lr = (((uint32_t)(((uint32_t)(v_lr - v_s)) - 1u)) & 255u); + } else { + v_v = 0u; + v_lr = v_s; + } + v_nshift = ((uint32_t)(WUFFS_VP8__RENORM_SHIFT_256[(v_lr & 255u)])); + v_lr = ((uint32_t)(WUFFS_VP8__RENORM_RANGE_256[(v_lr & 255u)])); + v_lb -= v_nshift; + if (v_v == 0u) { + v_abs_val = 1u; 
+ } else { + if (v_lb < 40u) { + if ((((uint32_t)(v_lri + 3u)) <= v_lwi) && (v_lri < 4093u)) { + v_lv = (((uint64_t)(v_lv << 24u)) | + ((uint64_t)(((uint64_t)(self->private_data.f_p1_buffer[(v_lri + 0u)])) << 16u)) | + ((uint64_t)(((uint64_t)(self->private_data.f_p1_buffer[(v_lri + 1u)])) << 8u)) | + ((uint64_t)(self->private_data.f_p1_buffer[(v_lri + 2u)]))); + v_lri += 3u; + v_lb += 24u; + } else { + while ((v_lb <= 48u) && (v_lri < v_lwi)) { + v_bb = ((uint64_t)(self->private_data.f_p1_buffer[v_lri])); + v_lri += 1u; + v_lv = (((uint64_t)(v_lv << 8u)) | v_bb); + v_lb += 8u; + } + } + } + v_s = (((uint32_t)(v_lr * ((uint32_t)(self->private_data.f_coeff_probs[(v_prob_idx + 3u)])))) >> 8u); + v_pos = (((uint32_t)(v_lb - 8u)) & 63u); + v_bval = ((uint32_t)((v_lv >> v_pos))); + if (v_bval > v_s) { + v_v = 1u; + v_lv -= ((uint64_t)(((uint64_t)(((uint32_t)(v_s + 1u)))) << v_pos)); + v_lr = (((uint32_t)(((uint32_t)(v_lr - v_s)) - 1u)) & 255u); + } else { + v_v = 0u; + v_lr = v_s; + } + v_nshift = ((uint32_t)(WUFFS_VP8__RENORM_SHIFT_256[(v_lr & 255u)])); + v_lr = ((uint32_t)(WUFFS_VP8__RENORM_RANGE_256[(v_lr & 255u)])); + v_lb -= v_nshift; + if (v_v == 0u) { + v_s = (((uint32_t)(v_lr * ((uint32_t)(self->private_data.f_coeff_probs[(v_prob_idx + 4u)])))) >> 8u); + v_pos = (((uint32_t)(v_lb - 8u)) & 63u); + v_bval = ((uint32_t)((v_lv >> v_pos))); + if (v_bval > v_s) { + v_v = 1u; + v_lv -= ((uint64_t)(((uint64_t)(((uint32_t)(v_s + 1u)))) << v_pos)); + v_lr = (((uint32_t)(((uint32_t)(v_lr - v_s)) - 1u)) & 255u); + } else { + v_v = 0u; + v_lr = v_s; + } + v_nshift = ((uint32_t)(WUFFS_VP8__RENORM_SHIFT_256[(v_lr & 255u)])); + v_lr = ((uint32_t)(WUFFS_VP8__RENORM_RANGE_256[(v_lr & 255u)])); + v_lb -= v_nshift; + if (v_v == 0u) { + v_abs_val = 2u; + } else { + v_s = (((uint32_t)(v_lr * ((uint32_t)(self->private_data.f_coeff_probs[(v_prob_idx + 5u)])))) >> 8u); + v_pos = (((uint32_t)(v_lb - 8u)) & 63u); + v_bval = ((uint32_t)((v_lv >> v_pos))); + if (v_bval > v_s) { + v_lv -= 
((uint64_t)(((uint64_t)(((uint32_t)(v_s + 1u)))) << v_pos)); + v_lr = (((uint32_t)(((uint32_t)(v_lr - v_s)) - 1u)) & 255u); + v_abs_val = 4u; + } else { + v_lr = v_s; + v_abs_val = 3u; + } + v_nshift = ((uint32_t)(WUFFS_VP8__RENORM_SHIFT_256[(v_lr & 255u)])); + v_lr = ((uint32_t)(WUFFS_VP8__RENORM_RANGE_256[(v_lr & 255u)])); + v_lb -= v_nshift; + } + } else { + self->private_impl.f_p1_range = (v_lr & 255u); + self->private_impl.f_p1_value = v_lv; + self->private_impl.f_p1_bits = v_lb; + self->private_impl.f_p1_ri = v_lri; + v_abs_val = wuffs_vp8__decoder__decode_coeff_category(self, v_prob_idx); + v_lr = self->private_impl.f_p1_range; + v_lv = self->private_impl.f_p1_value; + v_lb = self->private_impl.f_p1_bits; + v_lri = self->private_impl.f_p1_ri; + } + } + v_s = (v_lr >> 1u); + v_pos = (((uint32_t)(v_lb - 8u)) & 63u); + v_bval = ((uint32_t)((v_lv >> v_pos))); + v_sign = (((uint32_t)(v_s - v_bval)) >> 31u); + v_lv -= ((uint64_t)(((uint64_t)(((uint64_t)(((uint32_t)(v_s + 1u)))) << v_pos)) * ((uint64_t)(v_sign)))); + v_lr_taken = (((uint32_t)(((uint32_t)(v_lr - v_s)) - 1u)) & 255u); + v_lr = (v_s ^ ((uint32_t)((v_s ^ v_lr_taken) * v_sign))); + v_nshift = ((uint32_t)(WUFFS_VP8__RENORM_SHIFT_256[(v_lr & 255u)])); + v_lr = ((uint32_t)(WUFFS_VP8__RENORM_RANGE_256[(v_lr & 255u)])); + v_lb -= v_nshift; + v_has_nz = 1u; + if (v_coeff_idx > 0u) { + v_has_ac = 1u; + } + v_ctx = 1u; + if (v_abs_val > 1u) { + v_ctx = 2u; + } + v_zi = ((uint32_t)(WUFFS_VP8__ZIGZAG[v_coeff_idx])); + if (v_coeff_idx == 0u) { + v_dq = v_dq_dc; + } else { + v_dq = v_dq_ac; + } + v_abs_val = ((uint32_t)(v_abs_val * v_dq)); + v_ci = (a_block_offset + ((uint32_t)(v_zi))); + v_neg_mask = ((uint32_t)(0u - v_sign)); + self->private_data.f_mb_coeffs[v_ci] = ((uint32_t)((v_abs_val ^ v_neg_mask) - v_neg_mask)); + v_coeff_idx += 1u; + if (v_coeff_idx >= 16u) { + break; + } + v_prob_idx = (v_bt_base + ((uint32_t)(WUFFS_VP8__COEFF_BAND_OFFSET[v_coeff_idx])) + (v_ctx * 11u)); + v_s = (((uint32_t)(v_lr * 
((uint32_t)(self->private_data.f_coeff_probs[v_prob_idx])))) >> 8u); + v_pos = (((uint32_t)(v_lb - 8u)) & 63u); + v_bval = ((uint32_t)((v_lv >> v_pos))); + if (v_bval > v_s) { + v_v = 1u; + v_lv -= ((uint64_t)(((uint64_t)(((uint32_t)(v_s + 1u)))) << v_pos)); + v_lr = (((uint32_t)(((uint32_t)(v_lr - v_s)) - 1u)) & 255u); + } else { + v_v = 0u; + v_lr = v_s; + } + v_nshift = ((uint32_t)(WUFFS_VP8__RENORM_SHIFT_256[(v_lr & 255u)])); + v_lr = ((uint32_t)(WUFFS_VP8__RENORM_RANGE_256[(v_lr & 255u)])); + v_lb -= v_nshift; + if (v_v == 0u) { + break; + } + } + self->private_impl.f_p1_range = (v_lr & 255u); + self->private_impl.f_p1_value = v_lv; + self->private_impl.f_p1_bits = v_lb; + self->private_impl.f_p1_ri = v_lri; + self->private_data.f_block_ac_nz = v_has_ac; + return v_has_nz; +} + +// -------- func vp8.decoder.predict_16x16 + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__predict_16x16( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint8_t a_mode) { + return (*self->private_impl.choosy_predict_16x16)(self, a_workbuf, a_mode); +} + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__predict_16x16__choosy_default( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint8_t a_mode) { + uint64_t v_y_off = 0; + uint32_t v_r = 0; + uint32_t v_c = 0; + uint64_t v_idx = 0; + uint32_t v_sum = 0; + uint32_t v_count = 0; + uint8_t v_dc = 0; + uint8_t v_tl = 0; + uint32_t v_p = 0; + + v_y_off = (((uint64_t)(self->private_impl.f_mb_y)) * 16u * ((uint64_t)(self->private_impl.f_y_stride))); + v_y_off += (((uint64_t)(self->private_impl.f_mb_x)) * 16u); + if (a_mode == 0u) { + v_sum = 0u; + v_count = 0u; + if ((self->private_impl.f_mb_y > 0u) && (v_y_off >= ((uint64_t)(self->private_impl.f_y_stride)))) { + v_c = 0u; + while (v_c < 16u) { + v_idx = ((uint64_t)(((uint64_t)(v_y_off - ((uint64_t)(self->private_impl.f_y_stride)))) + ((uint64_t)(v_c)))); + if (v_idx < 
((uint64_t)(a_workbuf.len))) { + v_sum += ((uint32_t)(a_workbuf.ptr[v_idx])); + } + v_c += 1u; + v_count += 1u; + } + } + if (self->private_impl.f_mb_x > 0u) { + v_r = 0u; + while (v_r < 16u) { + v_idx = ((uint64_t)(v_y_off + (((uint64_t)(v_r)) * ((uint64_t)(self->private_impl.f_y_stride))))); + if (v_idx > 0u) { + v_idx -= 1u; + if (v_idx < ((uint64_t)(a_workbuf.len))) { + v_sum += ((uint32_t)(a_workbuf.ptr[v_idx])); + } + } + v_r += 1u; + v_count += 1u; + } + } + if (v_count > 0u) { + v_dc = ((uint8_t)((((uint32_t)(v_sum + (v_count >> 1u))) / v_count))); + } else { + v_dc = 128u; + } + v_r = 0u; + while (v_r < 16u) { + v_c = 0u; + while (v_c < 16u) { + v_idx = ((uint64_t)(v_y_off + (((uint64_t)(v_r)) * ((uint64_t)(self->private_impl.f_y_stride))))); + v_idx += ((uint64_t)(v_c)); + if (v_idx < ((uint64_t)(a_workbuf.len))) { + a_workbuf.ptr[v_idx] = v_dc; + } + v_c += 1u; + } + v_r += 1u; + } + } else if (a_mode == 1u) { + v_r = 0u; + while (v_r < 16u) { + v_c = 0u; + while (v_c < 16u) { + v_dc = 127u; + if ((self->private_impl.f_mb_y > 0u) && (v_y_off >= ((uint64_t)(self->private_impl.f_y_stride)))) { + v_idx = ((uint64_t)(((uint64_t)(v_y_off - ((uint64_t)(self->private_impl.f_y_stride)))) + ((uint64_t)(v_c)))); + if (v_idx < ((uint64_t)(a_workbuf.len))) { + v_dc = a_workbuf.ptr[v_idx]; + } + } + v_idx = ((uint64_t)(v_y_off + (((uint64_t)(v_r)) * ((uint64_t)(self->private_impl.f_y_stride))))); + v_idx += ((uint64_t)(v_c)); + if (v_idx < ((uint64_t)(a_workbuf.len))) { + a_workbuf.ptr[v_idx] = v_dc; + } + v_c += 1u; + } + v_r += 1u; + } + } else if (a_mode == 2u) { + v_r = 0u; + while (v_r < 16u) { + v_dc = 129u; + if (self->private_impl.f_mb_x > 0u) { + v_idx = ((uint64_t)(v_y_off + (((uint64_t)(v_r)) * ((uint64_t)(self->private_impl.f_y_stride))))); + if (v_idx > 0u) { + v_idx -= 1u; + if (v_idx < ((uint64_t)(a_workbuf.len))) { + v_dc = a_workbuf.ptr[v_idx]; + } + } + } + v_c = 0u; + while (v_c < 16u) { + v_idx = ((uint64_t)(v_y_off + (((uint64_t)(v_r)) * 
((uint64_t)(self->private_impl.f_y_stride))))); + v_idx += ((uint64_t)(v_c)); + if (v_idx < ((uint64_t)(a_workbuf.len))) { + a_workbuf.ptr[v_idx] = v_dc; + } + v_c += 1u; + } + v_r += 1u; + } + } else { + v_tl = 127u; + if ((self->private_impl.f_mb_x > 0u) && (self->private_impl.f_mb_y > 0u) && (v_y_off > ((uint64_t)(self->private_impl.f_y_stride)))) { + v_idx = ((v_y_off - ((uint64_t)(self->private_impl.f_y_stride))) - 1u); + if (v_idx < ((uint64_t)(a_workbuf.len))) { + v_tl = a_workbuf.ptr[v_idx]; + } + } else if ((self->private_impl.f_mb_x == 0u) && (self->private_impl.f_mb_y > 0u)) { + v_tl = 129u; + } + v_r = 0u; + while (v_r < 16u) { + v_c = 0u; + while (v_c < 16u) { + v_p = 127u; + if ((self->private_impl.f_mb_y > 0u) && (v_y_off >= ((uint64_t)(self->private_impl.f_y_stride)))) { + v_idx = ((uint64_t)(((uint64_t)(v_y_off - ((uint64_t)(self->private_impl.f_y_stride)))) + ((uint64_t)(v_c)))); + if (v_idx < ((uint64_t)(a_workbuf.len))) { + v_p = ((uint32_t)(a_workbuf.ptr[v_idx])); + } + } + if (self->private_impl.f_mb_x > 0u) { + v_idx = ((uint64_t)(v_y_off + (((uint64_t)(v_r)) * ((uint64_t)(self->private_impl.f_y_stride))))); + if (v_idx > 0u) { + v_idx -= 1u; + if (v_idx < ((uint64_t)(a_workbuf.len))) { + v_p = ((uint32_t)(((uint32_t)(v_p + ((uint32_t)(a_workbuf.ptr[v_idx])))) - ((uint32_t)(v_tl)))); + } + } + } + if (v_p > 255u) { + if ((v_p & 2147483648u) != 0u) { + v_p = 0u; + } else { + v_p = 255u; + } + } + v_idx = ((uint64_t)(v_y_off + (((uint64_t)(v_r)) * ((uint64_t)(self->private_impl.f_y_stride))))); + v_idx += ((uint64_t)(v_c)); + if (v_idx < ((uint64_t)(a_workbuf.len))) { + a_workbuf.ptr[v_idx] = ((uint8_t)(v_p)); + } + v_c += 1u; + } + v_r += 1u; + } + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.predict_8x8 + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__predict_8x8( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint8_t a_mode, + uint64_t a_plane_offset) { + 
return (*self->private_impl.choosy_predict_8x8)(self, a_workbuf, a_mode, a_plane_offset); +} + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__predict_8x8__choosy_default( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint8_t a_mode, + uint64_t a_plane_offset) { + uint64_t v_uv_off = 0; + uint32_t v_r = 0; + uint32_t v_c = 0; + uint64_t v_idx = 0; + uint32_t v_sum = 0; + uint32_t v_count = 0; + uint8_t v_dc = 0; + uint8_t v_tl = 0; + uint32_t v_p = 0; + + v_uv_off = ((uint64_t)(a_plane_offset + (((uint64_t)(self->private_impl.f_mb_y)) * 8u * ((uint64_t)(self->private_impl.f_uv_stride))))); + v_uv_off += (((uint64_t)(self->private_impl.f_mb_x)) * 8u); + if (a_mode == 0u) { + v_sum = 0u; + v_count = 0u; + if ((self->private_impl.f_mb_y > 0u) && (v_uv_off >= ((uint64_t)(self->private_impl.f_uv_stride)))) { + v_c = 0u; + while (v_c < 8u) { + v_idx = ((uint64_t)(((uint64_t)(v_uv_off - ((uint64_t)(self->private_impl.f_uv_stride)))) + ((uint64_t)(v_c)))); + if (v_idx < ((uint64_t)(a_workbuf.len))) { + v_sum += ((uint32_t)(a_workbuf.ptr[v_idx])); + } + v_c += 1u; + v_count += 1u; + } + } + if (self->private_impl.f_mb_x > 0u) { + v_r = 0u; + while (v_r < 8u) { + v_idx = ((uint64_t)(v_uv_off + (((uint64_t)(v_r)) * ((uint64_t)(self->private_impl.f_uv_stride))))); + if (v_idx > 0u) { + v_idx -= 1u; + if (v_idx < ((uint64_t)(a_workbuf.len))) { + v_sum += ((uint32_t)(a_workbuf.ptr[v_idx])); + } + } + v_r += 1u; + v_count += 1u; + } + } + if (v_count > 0u) { + v_dc = ((uint8_t)((((uint32_t)(v_sum + (v_count >> 1u))) / v_count))); + } else { + v_dc = 128u; + } + v_r = 0u; + while (v_r < 8u) { + v_c = 0u; + while (v_c < 8u) { + v_idx = ((uint64_t)(v_uv_off + (((uint64_t)(v_r)) * ((uint64_t)(self->private_impl.f_uv_stride))))); + v_idx += ((uint64_t)(v_c)); + if (v_idx < ((uint64_t)(a_workbuf.len))) { + a_workbuf.ptr[v_idx] = v_dc; + } + v_c += 1u; + } + v_r += 1u; + } + } else if (a_mode == 1u) { + v_r = 0u; + while (v_r < 8u) { 
+ v_c = 0u; + while (v_c < 8u) { + v_dc = 127u; + if ((self->private_impl.f_mb_y > 0u) && (v_uv_off >= ((uint64_t)(self->private_impl.f_uv_stride)))) { + v_idx = ((uint64_t)(((uint64_t)(v_uv_off - ((uint64_t)(self->private_impl.f_uv_stride)))) + ((uint64_t)(v_c)))); + if (v_idx < ((uint64_t)(a_workbuf.len))) { + v_dc = a_workbuf.ptr[v_idx]; + } + } + v_idx = ((uint64_t)(v_uv_off + (((uint64_t)(v_r)) * ((uint64_t)(self->private_impl.f_uv_stride))))); + v_idx += ((uint64_t)(v_c)); + if (v_idx < ((uint64_t)(a_workbuf.len))) { + a_workbuf.ptr[v_idx] = v_dc; + } + v_c += 1u; + } + v_r += 1u; + } + } else if (a_mode == 2u) { + v_r = 0u; + while (v_r < 8u) { + v_dc = 129u; + if (self->private_impl.f_mb_x > 0u) { + v_idx = ((uint64_t)(v_uv_off + (((uint64_t)(v_r)) * ((uint64_t)(self->private_impl.f_uv_stride))))); + if (v_idx > 0u) { + v_idx -= 1u; + if (v_idx < ((uint64_t)(a_workbuf.len))) { + v_dc = a_workbuf.ptr[v_idx]; + } + } + } + v_c = 0u; + while (v_c < 8u) { + v_idx = ((uint64_t)(v_uv_off + (((uint64_t)(v_r)) * ((uint64_t)(self->private_impl.f_uv_stride))))); + v_idx += ((uint64_t)(v_c)); + if (v_idx < ((uint64_t)(a_workbuf.len))) { + a_workbuf.ptr[v_idx] = v_dc; + } + v_c += 1u; + } + v_r += 1u; + } + } else { + v_tl = 127u; + if ((self->private_impl.f_mb_x > 0u) && (self->private_impl.f_mb_y > 0u) && (v_uv_off > ((uint64_t)(self->private_impl.f_uv_stride)))) { + v_idx = ((v_uv_off - ((uint64_t)(self->private_impl.f_uv_stride))) - 1u); + if (v_idx < ((uint64_t)(a_workbuf.len))) { + v_tl = a_workbuf.ptr[v_idx]; + } + } else if ((self->private_impl.f_mb_x == 0u) && (self->private_impl.f_mb_y > 0u)) { + v_tl = 129u; + } + v_r = 0u; + while (v_r < 8u) { + v_c = 0u; + while (v_c < 8u) { + v_p = 127u; + if ((self->private_impl.f_mb_y > 0u) && (v_uv_off >= ((uint64_t)(self->private_impl.f_uv_stride)))) { + v_idx = ((uint64_t)(((uint64_t)(v_uv_off - ((uint64_t)(self->private_impl.f_uv_stride)))) + ((uint64_t)(v_c)))); + if (v_idx < ((uint64_t)(a_workbuf.len))) { + v_p = 
((uint32_t)(a_workbuf.ptr[v_idx])); + } + } + if (self->private_impl.f_mb_x > 0u) { + v_idx = ((uint64_t)(v_uv_off + (((uint64_t)(v_r)) * ((uint64_t)(self->private_impl.f_uv_stride))))); + if (v_idx > 0u) { + v_idx -= 1u; + if (v_idx < ((uint64_t)(a_workbuf.len))) { + v_p = ((uint32_t)(((uint32_t)(v_p + ((uint32_t)(a_workbuf.ptr[v_idx])))) - ((uint32_t)(v_tl)))); + } + } + } + if (v_p > 255u) { + if ((v_p & 2147483648u) != 0u) { + v_p = 0u; + } else { + v_p = 255u; + } + } + v_idx = ((uint64_t)(v_uv_off + (((uint64_t)(v_r)) * ((uint64_t)(self->private_impl.f_uv_stride))))); + v_idx += ((uint64_t)(v_c)); + if (v_idx < ((uint64_t)(a_workbuf.len))) { + a_workbuf.ptr[v_idx] = ((uint8_t)(v_p)); + } + v_c += 1u; + } + v_r += 1u; + } + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.predict_4x4 + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__predict_4x4( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint32_t a_block_idx, + uint8_t a_mode) { + uint64_t v_y_off = 0; + uint32_t v_bx = 0; + uint32_t v_by = 0; + uint64_t v_idx = 0; + uint64_t v_stride = 0; + bool v_has_top = false; + bool v_has_left = false; + uint32_t v_tl = 0; + uint32_t v_a0 = 0; + uint32_t v_a1 = 0; + uint32_t v_a2 = 0; + uint32_t v_a3 = 0; + uint32_t v_a4 = 0; + uint32_t v_a5 = 0; + uint32_t v_a6 = 0; + uint32_t v_a7 = 0; + uint32_t v_l0 = 0; + uint32_t v_l1 = 0; + uint32_t v_l2 = 0; + uint32_t v_l3 = 0; + uint32_t v_dc = 0; + wuffs_base__slice_u8 v_s = {0}; + uint32_t v_above4 = 0; + + v_bx = (a_block_idx & 3u); + v_by = (a_block_idx >> 2u); + v_y_off = (((uint64_t)(self->private_impl.f_mb_y)) * 16u * ((uint64_t)(self->private_impl.f_y_stride))); + v_y_off += (((uint64_t)(self->private_impl.f_mb_x)) * 16u); + v_y_off += (((uint64_t)(v_by)) * 4u * ((uint64_t)(self->private_impl.f_y_stride))); + v_y_off += (((uint64_t)(v_bx)) * 4u); + v_stride = ((uint64_t)(self->private_impl.f_y_stride)); + v_has_top = ((v_by > 0u) || 
(self->private_impl.f_mb_y > 0u)); + v_has_left = ((v_bx > 0u) || (self->private_impl.f_mb_x > 0u)); + if (v_has_top && (v_y_off >= v_stride)) { + v_idx = ((uint64_t)(v_y_off - v_stride)); + if (v_idx < ((uint64_t)(a_workbuf.len))) { + v_s = wuffs_base__slice_u8__subslice_i(a_workbuf, v_idx); + if (((uint64_t)(v_s.len)) >= 4u) { + v_above4 = wuffs_base__peek_u32le__no_bounds_check(v_s.ptr); + v_a0 = (v_above4 & 255u); + v_a1 = ((v_above4 >> 8u) & 255u); + v_a2 = ((v_above4 >> 16u) & 255u); + v_a3 = (v_above4 >> 24u); + } + } + } else { + v_a0 = 127u; + v_a1 = 127u; + v_a2 = 127u; + v_a3 = 127u; + } + if (v_has_top && (v_y_off >= v_stride) && (v_bx < 3u)) { + v_idx = ((uint64_t)(((uint64_t)(v_y_off - v_stride)) + 4u)); + if (v_idx < ((uint64_t)(a_workbuf.len))) { + v_s = wuffs_base__slice_u8__subslice_i(a_workbuf, v_idx); + if (((uint64_t)(v_s.len)) >= 4u) { + v_above4 = wuffs_base__peek_u32le__no_bounds_check(v_s.ptr); + v_a4 = (v_above4 & 255u); + v_a5 = ((v_above4 >> 8u) & 255u); + v_a6 = ((v_above4 >> 16u) & 255u); + v_a7 = (v_above4 >> 24u); + } + } + } else if ((v_bx >= 3u) && v_has_top) { + v_a4 = ((uint32_t)(self->private_data.f_mb_upper_right[0u])); + v_a5 = ((uint32_t)(self->private_data.f_mb_upper_right[1u])); + v_a6 = ((uint32_t)(self->private_data.f_mb_upper_right[2u])); + v_a7 = ((uint32_t)(self->private_data.f_mb_upper_right[3u])); + } else { + v_a4 = v_a3; + v_a5 = v_a3; + v_a6 = v_a3; + v_a7 = v_a3; + } + if (v_has_left && (v_y_off > 0u)) { + v_idx = ((uint64_t)(v_y_off - 1u)); + if (v_idx < ((uint64_t)(a_workbuf.len))) { + v_l0 = ((uint32_t)(a_workbuf.ptr[v_idx])); + } + v_idx = ((uint64_t)(((uint64_t)(v_y_off + v_stride)) - 1u)); + if (v_idx < ((uint64_t)(a_workbuf.len))) { + v_l1 = ((uint32_t)(a_workbuf.ptr[v_idx])); + } + v_idx = ((uint64_t)(((uint64_t)(v_y_off + ((uint64_t)(v_stride * 2u)))) - 1u)); + if (v_idx < ((uint64_t)(a_workbuf.len))) { + v_l2 = ((uint32_t)(a_workbuf.ptr[v_idx])); + } + v_idx = ((uint64_t)(((uint64_t)(v_y_off + 
((uint64_t)(v_stride * 3u)))) - 1u)); + if (v_idx < ((uint64_t)(a_workbuf.len))) { + v_l3 = ((uint32_t)(a_workbuf.ptr[v_idx])); + } + } else { + v_l0 = 129u; + v_l1 = 129u; + v_l2 = 129u; + v_l3 = 129u; + } + if (v_has_top && v_has_left && (v_y_off > v_stride)) { + v_idx = ((uint64_t)(((uint64_t)(v_y_off - v_stride)) - 1u)); + if (v_idx < ((uint64_t)(a_workbuf.len))) { + v_tl = ((uint32_t)(a_workbuf.ptr[v_idx])); + } + } else if (v_has_top && ! v_has_left) { + v_tl = 129u; + } else { + v_tl = 127u; + } + if (a_mode == 0u) { + v_dc = ((uint32_t)(((uint32_t)(((uint32_t)(((uint32_t)(((uint32_t)(((uint32_t)(((uint32_t)(((uint32_t)(v_a0 + v_a1)) + v_a2)) + v_a3)) + v_l0)) + v_l1)) + v_l2)) + v_l3)) + 4u)); + v_dc = ((v_dc >> 3u) & 255u); + wuffs_vp8__decoder__pred4x4_store(self, + a_workbuf, + v_y_off, + v_dc, + v_dc, + v_dc, + v_dc, + v_dc, + v_dc, + v_dc, + v_dc, + v_dc, + v_dc, + v_dc, + v_dc, + v_dc, + v_dc, + v_dc, + v_dc); + } else if (a_mode == 1u) { + wuffs_vp8__decoder__pred4x4_store(self, + a_workbuf, + v_y_off, + wuffs_vp8__decoder__clip8(self, ((uint32_t)(((uint32_t)(v_a0 + v_l0)) - v_tl))), + wuffs_vp8__decoder__clip8(self, ((uint32_t)(((uint32_t)(v_a1 + v_l0)) - v_tl))), + wuffs_vp8__decoder__clip8(self, ((uint32_t)(((uint32_t)(v_a2 + v_l0)) - v_tl))), + wuffs_vp8__decoder__clip8(self, ((uint32_t)(((uint32_t)(v_a3 + v_l0)) - v_tl))), + wuffs_vp8__decoder__clip8(self, ((uint32_t)(((uint32_t)(v_a0 + v_l1)) - v_tl))), + wuffs_vp8__decoder__clip8(self, ((uint32_t)(((uint32_t)(v_a1 + v_l1)) - v_tl))), + wuffs_vp8__decoder__clip8(self, ((uint32_t)(((uint32_t)(v_a2 + v_l1)) - v_tl))), + wuffs_vp8__decoder__clip8(self, ((uint32_t)(((uint32_t)(v_a3 + v_l1)) - v_tl))), + wuffs_vp8__decoder__clip8(self, ((uint32_t)(((uint32_t)(v_a0 + v_l2)) - v_tl))), + wuffs_vp8__decoder__clip8(self, ((uint32_t)(((uint32_t)(v_a1 + v_l2)) - v_tl))), + wuffs_vp8__decoder__clip8(self, ((uint32_t)(((uint32_t)(v_a2 + v_l2)) - v_tl))), + wuffs_vp8__decoder__clip8(self, 
((uint32_t)(((uint32_t)(v_a3 + v_l2)) - v_tl))), + wuffs_vp8__decoder__clip8(self, ((uint32_t)(((uint32_t)(v_a0 + v_l3)) - v_tl))), + wuffs_vp8__decoder__clip8(self, ((uint32_t)(((uint32_t)(v_a1 + v_l3)) - v_tl))), + wuffs_vp8__decoder__clip8(self, ((uint32_t)(((uint32_t)(v_a2 + v_l3)) - v_tl))), + wuffs_vp8__decoder__clip8(self, ((uint32_t)(((uint32_t)(v_a3 + v_l3)) - v_tl)))); + } else if (a_mode == 2u) { + wuffs_vp8__decoder__pred4x4_store(self, + a_workbuf, + v_y_off, + wuffs_vp8__decoder__avg3(self, v_tl, v_a0, v_a1), + wuffs_vp8__decoder__avg3(self, v_a0, v_a1, v_a2), + wuffs_vp8__decoder__avg3(self, v_a1, v_a2, v_a3), + wuffs_vp8__decoder__avg3(self, v_a2, v_a3, v_a4), + wuffs_vp8__decoder__avg3(self, v_tl, v_a0, v_a1), + wuffs_vp8__decoder__avg3(self, v_a0, v_a1, v_a2), + wuffs_vp8__decoder__avg3(self, v_a1, v_a2, v_a3), + wuffs_vp8__decoder__avg3(self, v_a2, v_a3, v_a4), + wuffs_vp8__decoder__avg3(self, v_tl, v_a0, v_a1), + wuffs_vp8__decoder__avg3(self, v_a0, v_a1, v_a2), + wuffs_vp8__decoder__avg3(self, v_a1, v_a2, v_a3), + wuffs_vp8__decoder__avg3(self, v_a2, v_a3, v_a4), + wuffs_vp8__decoder__avg3(self, v_tl, v_a0, v_a1), + wuffs_vp8__decoder__avg3(self, v_a0, v_a1, v_a2), + wuffs_vp8__decoder__avg3(self, v_a1, v_a2, v_a3), + wuffs_vp8__decoder__avg3(self, v_a2, v_a3, v_a4)); + } else if (a_mode == 3u) { + wuffs_vp8__decoder__pred4x4_store(self, + a_workbuf, + v_y_off, + wuffs_vp8__decoder__avg3(self, v_tl, v_l0, v_l1), + wuffs_vp8__decoder__avg3(self, v_tl, v_l0, v_l1), + wuffs_vp8__decoder__avg3(self, v_tl, v_l0, v_l1), + wuffs_vp8__decoder__avg3(self, v_tl, v_l0, v_l1), + wuffs_vp8__decoder__avg3(self, v_l0, v_l1, v_l2), + wuffs_vp8__decoder__avg3(self, v_l0, v_l1, v_l2), + wuffs_vp8__decoder__avg3(self, v_l0, v_l1, v_l2), + wuffs_vp8__decoder__avg3(self, v_l0, v_l1, v_l2), + wuffs_vp8__decoder__avg3(self, v_l1, v_l2, v_l3), + wuffs_vp8__decoder__avg3(self, v_l1, v_l2, v_l3), + wuffs_vp8__decoder__avg3(self, v_l1, v_l2, v_l3), + 
wuffs_vp8__decoder__avg3(self, v_l1, v_l2, v_l3), + wuffs_vp8__decoder__avg3(self, v_l2, v_l3, v_l3), + wuffs_vp8__decoder__avg3(self, v_l2, v_l3, v_l3), + wuffs_vp8__decoder__avg3(self, v_l2, v_l3, v_l3), + wuffs_vp8__decoder__avg3(self, v_l2, v_l3, v_l3)); + } else if (a_mode == 4u) { + wuffs_vp8__decoder__pred4x4_store(self, + a_workbuf, + v_y_off, + wuffs_vp8__decoder__avg3(self, v_a0, v_a1, v_a2), + wuffs_vp8__decoder__avg3(self, v_a1, v_a2, v_a3), + wuffs_vp8__decoder__avg3(self, v_a2, v_a3, v_a4), + wuffs_vp8__decoder__avg3(self, v_a3, v_a4, v_a5), + wuffs_vp8__decoder__avg3(self, v_a1, v_a2, v_a3), + wuffs_vp8__decoder__avg3(self, v_a2, v_a3, v_a4), + wuffs_vp8__decoder__avg3(self, v_a3, v_a4, v_a5), + wuffs_vp8__decoder__avg3(self, v_a4, v_a5, v_a6), + wuffs_vp8__decoder__avg3(self, v_a2, v_a3, v_a4), + wuffs_vp8__decoder__avg3(self, v_a3, v_a4, v_a5), + wuffs_vp8__decoder__avg3(self, v_a4, v_a5, v_a6), + wuffs_vp8__decoder__avg3(self, v_a5, v_a6, v_a7), + wuffs_vp8__decoder__avg3(self, v_a3, v_a4, v_a5), + wuffs_vp8__decoder__avg3(self, v_a4, v_a5, v_a6), + wuffs_vp8__decoder__avg3(self, v_a5, v_a6, v_a7), + wuffs_vp8__decoder__avg3(self, v_a6, v_a7, v_a7)); + } else if (a_mode == 5u) { + wuffs_vp8__decoder__pred4x4_store(self, + a_workbuf, + v_y_off, + wuffs_vp8__decoder__avg3(self, v_l0, v_tl, v_a0), + wuffs_vp8__decoder__avg3(self, v_tl, v_a0, v_a1), + wuffs_vp8__decoder__avg3(self, v_a0, v_a1, v_a2), + wuffs_vp8__decoder__avg3(self, v_a1, v_a2, v_a3), + wuffs_vp8__decoder__avg3(self, v_l1, v_l0, v_tl), + wuffs_vp8__decoder__avg3(self, v_l0, v_tl, v_a0), + wuffs_vp8__decoder__avg3(self, v_tl, v_a0, v_a1), + wuffs_vp8__decoder__avg3(self, v_a0, v_a1, v_a2), + wuffs_vp8__decoder__avg3(self, v_l2, v_l1, v_l0), + wuffs_vp8__decoder__avg3(self, v_l1, v_l0, v_tl), + wuffs_vp8__decoder__avg3(self, v_l0, v_tl, v_a0), + wuffs_vp8__decoder__avg3(self, v_tl, v_a0, v_a1), + wuffs_vp8__decoder__avg3(self, v_l3, v_l2, v_l1), + wuffs_vp8__decoder__avg3(self, v_l2, 
v_l1, v_l0), + wuffs_vp8__decoder__avg3(self, v_l1, v_l0, v_tl), + wuffs_vp8__decoder__avg3(self, v_l0, v_tl, v_a0)); + } else if (a_mode == 6u) { + wuffs_vp8__decoder__pred4x4_store(self, + a_workbuf, + v_y_off, + wuffs_vp8__decoder__avg2(self, v_tl, v_a0), + wuffs_vp8__decoder__avg2(self, v_a0, v_a1), + wuffs_vp8__decoder__avg2(self, v_a1, v_a2), + wuffs_vp8__decoder__avg2(self, v_a2, v_a3), + wuffs_vp8__decoder__avg3(self, v_l0, v_tl, v_a0), + wuffs_vp8__decoder__avg3(self, v_tl, v_a0, v_a1), + wuffs_vp8__decoder__avg3(self, v_a0, v_a1, v_a2), + wuffs_vp8__decoder__avg3(self, v_a1, v_a2, v_a3), + wuffs_vp8__decoder__avg3(self, v_l1, v_l0, v_tl), + wuffs_vp8__decoder__avg2(self, v_tl, v_a0), + wuffs_vp8__decoder__avg2(self, v_a0, v_a1), + wuffs_vp8__decoder__avg2(self, v_a1, v_a2), + wuffs_vp8__decoder__avg3(self, v_l2, v_l1, v_l0), + wuffs_vp8__decoder__avg3(self, v_l0, v_tl, v_a0), + wuffs_vp8__decoder__avg3(self, v_tl, v_a0, v_a1), + wuffs_vp8__decoder__avg3(self, v_a0, v_a1, v_a2)); + } else if (a_mode == 7u) { + wuffs_vp8__decoder__pred4x4_store(self, + a_workbuf, + v_y_off, + wuffs_vp8__decoder__avg2(self, v_a0, v_a1), + wuffs_vp8__decoder__avg2(self, v_a1, v_a2), + wuffs_vp8__decoder__avg2(self, v_a2, v_a3), + wuffs_vp8__decoder__avg2(self, v_a3, v_a4), + wuffs_vp8__decoder__avg3(self, v_a0, v_a1, v_a2), + wuffs_vp8__decoder__avg3(self, v_a1, v_a2, v_a3), + wuffs_vp8__decoder__avg3(self, v_a2, v_a3, v_a4), + wuffs_vp8__decoder__avg3(self, v_a3, v_a4, v_a5), + wuffs_vp8__decoder__avg2(self, v_a1, v_a2), + wuffs_vp8__decoder__avg2(self, v_a2, v_a3), + wuffs_vp8__decoder__avg2(self, v_a3, v_a4), + wuffs_vp8__decoder__avg3(self, v_a4, v_a5, v_a6), + wuffs_vp8__decoder__avg3(self, v_a1, v_a2, v_a3), + wuffs_vp8__decoder__avg3(self, v_a2, v_a3, v_a4), + wuffs_vp8__decoder__avg3(self, v_a3, v_a4, v_a5), + wuffs_vp8__decoder__avg3(self, v_a5, v_a6, v_a7)); + } else if (a_mode == 8u) { + wuffs_vp8__decoder__pred4x4_store(self, + a_workbuf, + v_y_off, + 
wuffs_vp8__decoder__avg2(self, v_l0, v_tl), + wuffs_vp8__decoder__avg3(self, v_l0, v_tl, v_a0), + wuffs_vp8__decoder__avg3(self, v_tl, v_a0, v_a1), + wuffs_vp8__decoder__avg3(self, v_a0, v_a1, v_a2), + wuffs_vp8__decoder__avg2(self, v_l1, v_l0), + wuffs_vp8__decoder__avg3(self, v_l1, v_l0, v_tl), + wuffs_vp8__decoder__avg2(self, v_l0, v_tl), + wuffs_vp8__decoder__avg3(self, v_l0, v_tl, v_a0), + wuffs_vp8__decoder__avg2(self, v_l2, v_l1), + wuffs_vp8__decoder__avg3(self, v_l2, v_l1, v_l0), + wuffs_vp8__decoder__avg2(self, v_l1, v_l0), + wuffs_vp8__decoder__avg3(self, v_l1, v_l0, v_tl), + wuffs_vp8__decoder__avg2(self, v_l3, v_l2), + wuffs_vp8__decoder__avg3(self, v_l3, v_l2, v_l1), + wuffs_vp8__decoder__avg2(self, v_l2, v_l1), + wuffs_vp8__decoder__avg3(self, v_l2, v_l1, v_l0)); + } else { + wuffs_vp8__decoder__pred4x4_store(self, + a_workbuf, + v_y_off, + wuffs_vp8__decoder__avg2(self, v_l0, v_l1), + wuffs_vp8__decoder__avg3(self, v_l0, v_l1, v_l2), + wuffs_vp8__decoder__avg2(self, v_l1, v_l2), + wuffs_vp8__decoder__avg3(self, v_l1, v_l2, v_l3), + wuffs_vp8__decoder__avg2(self, v_l1, v_l2), + wuffs_vp8__decoder__avg3(self, v_l1, v_l2, v_l3), + wuffs_vp8__decoder__avg2(self, v_l2, v_l3), + wuffs_vp8__decoder__avg3(self, v_l2, v_l3, v_l3), + wuffs_vp8__decoder__avg2(self, v_l2, v_l3), + wuffs_vp8__decoder__avg3(self, v_l2, v_l3, v_l3), + (v_l3 & 255u), + (v_l3 & 255u), + (v_l3 & 255u), + (v_l3 & 255u), + (v_l3 & 255u), + (v_l3 & 255u)); + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.pred4x4_store + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__pred4x4_store( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_off, + uint32_t a_v00, + uint32_t a_v01, + uint32_t a_v02, + uint32_t a_v03, + uint32_t a_v10, + uint32_t a_v11, + uint32_t a_v12, + uint32_t a_v13, + uint32_t a_v20, + uint32_t a_v21, + uint32_t a_v22, + uint32_t a_v23, + uint32_t a_v30, + uint32_t a_v31, + uint32_t 
a_v32, + uint32_t a_v33) { + uint64_t v_stride = 0; + uint64_t v_row_off = 0; + wuffs_base__slice_u8 v_s = {0}; + + v_stride = ((uint64_t)(self->private_impl.f_y_stride)); + v_row_off = a_off; + if (v_row_off < ((uint64_t)(a_workbuf.len))) { + v_s = wuffs_base__slice_u8__subslice_i(a_workbuf, v_row_off); + if (((uint64_t)(v_s.len)) >= 4u) { + wuffs_base__poke_u32le__no_bounds_check(v_s.ptr, ((a_v00 & 255u) | + ((a_v01 & 255u) << 8u) | + ((a_v02 & 255u) << 16u) | + ((a_v03 & 255u) << 24u))); + } + } + v_row_off = ((uint64_t)(a_off + v_stride)); + if (v_row_off < ((uint64_t)(a_workbuf.len))) { + v_s = wuffs_base__slice_u8__subslice_i(a_workbuf, v_row_off); + if (((uint64_t)(v_s.len)) >= 4u) { + wuffs_base__poke_u32le__no_bounds_check(v_s.ptr, ((a_v10 & 255u) | + ((a_v11 & 255u) << 8u) | + ((a_v12 & 255u) << 16u) | + ((a_v13 & 255u) << 24u))); + } + } + v_row_off = ((uint64_t)(a_off + ((uint64_t)(v_stride * 2u)))); + if (v_row_off < ((uint64_t)(a_workbuf.len))) { + v_s = wuffs_base__slice_u8__subslice_i(a_workbuf, v_row_off); + if (((uint64_t)(v_s.len)) >= 4u) { + wuffs_base__poke_u32le__no_bounds_check(v_s.ptr, ((a_v20 & 255u) | + ((a_v21 & 255u) << 8u) | + ((a_v22 & 255u) << 16u) | + ((a_v23 & 255u) << 24u))); + } + } + v_row_off = ((uint64_t)(a_off + ((uint64_t)(v_stride * 3u)))); + if (v_row_off < ((uint64_t)(a_workbuf.len))) { + v_s = wuffs_base__slice_u8__subslice_i(a_workbuf, v_row_off); + if (((uint64_t)(v_s.len)) >= 4u) { + wuffs_base__poke_u32le__no_bounds_check(v_s.ptr, ((a_v30 & 255u) | + ((a_v31 & 255u) << 8u) | + ((a_v32 & 255u) << 16u) | + ((a_v33 & 255u) << 24u))); + } + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.avg2 + +WUFFS_BASE__GENERATED_C_CODE +static uint32_t +wuffs_vp8__decoder__avg2( + const wuffs_vp8__decoder* self, + uint32_t a_a, + uint32_t a_b) { + return ((((uint32_t)(((uint32_t)(a_a + a_b)) + 1u)) >> 1u) & 255u); +} + +// -------- func vp8.decoder.avg3 + +WUFFS_BASE__GENERATED_C_CODE +static uint32_t 
+wuffs_vp8__decoder__avg3( + const wuffs_vp8__decoder* self, + uint32_t a_a, + uint32_t a_b, + uint32_t a_c) { + return ((((uint32_t)(((uint32_t)(((uint32_t)(a_a + ((uint32_t)(a_b * 2u)))) + a_c)) + 2u)) >> 2u) & 255u); +} + +// -------- func vp8.decoder.clip8 + +WUFFS_BASE__GENERATED_C_CODE +static uint32_t +wuffs_vp8__decoder__clip8( + const wuffs_vp8__decoder* self, + uint32_t a_v) { + if (a_v <= 255u) { + return a_v; + } + if ((a_v & 2147483648u) != 0u) { + return 0u; + } + return 255u; +} + +// ‼ WUFFS MULTI-FILE SECTION +arm_neon +// -------- func vp8.decoder.predict_16x16_arm_neon + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__predict_16x16_arm_neon( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint8_t a_mode) { + uint8_t v_left_arr[16] = {0}; + uint8_t v_tl = 0; + wuffs_base__slice_u8 v_s = {0}; + uint8x16_t v_above = {0}; + uint8x16_t v_diff_u8 = {0}; + uint8x16_t v_result = {0}; + uint64_t v_y_off = 0; + uint64_t v_idx = 0; + uint32_t v_r = 0; + uint32_t v_sum = 0; + uint32_t v_count = 0; + uint8_t v_dc = 0; + uint8_t v_left_val = 0; + uint8_t v_tl_val = 0; + + v_y_off = (((uint64_t)(self->private_impl.f_mb_y)) * 16u * ((uint64_t)(self->private_impl.f_y_stride))); + v_y_off += (((uint64_t)(self->private_impl.f_mb_x)) * 16u); + if ((self->private_impl.f_mb_y > 0u) && (v_y_off >= ((uint64_t)(self->private_impl.f_y_stride)))) { + v_idx = ((uint64_t)(v_y_off - ((uint64_t)(self->private_impl.f_y_stride)))); + if (v_idx < ((uint64_t)(a_workbuf.len))) { + v_s = wuffs_base__slice_u8__subslice_i(a_workbuf, v_idx); + if (((uint64_t)(v_s.len)) >= 16u) { + v_above = vld1q_u8(v_s.ptr); + } + } + } else { + v_above = vdupq_n_u8(127u); + } + v_r = 0u; + while (v_r < 16u) { + v_left_arr[v_r] = 129u; + if (self->private_impl.f_mb_x > 0u) { + v_idx = ((uint64_t)(v_y_off + (((uint64_t)(v_r)) * ((uint64_t)(self->private_impl.f_y_stride))))); + if (v_idx > 0u) { + 
v_idx -= 1u; + if (v_idx < ((uint64_t)(a_workbuf.len))) { + v_left_arr[v_r] = a_workbuf.ptr[v_idx]; + } + } + } + v_r += 1u; + } + v_tl = 127u; + if ((self->private_impl.f_mb_x > 0u) && (self->private_impl.f_mb_y > 0u) && (v_y_off > ((uint64_t)(self->private_impl.f_y_stride)))) { + v_idx = ((v_y_off - ((uint64_t)(self->private_impl.f_y_stride))) - 1u); + if (v_idx < ((uint64_t)(a_workbuf.len))) { + v_tl = a_workbuf.ptr[v_idx]; + } + } else if ((self->private_impl.f_mb_x == 0u) && (self->private_impl.f_mb_y > 0u)) { + v_tl = 129u; + } + if (v_y_off <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_y_off); + } + if (a_mode == 0u) { + v_sum = 0u; + v_count = 0u; + if (self->private_impl.f_mb_y > 0u) { + v_sum = ((uint32_t)(vaddlvq_u8(v_above))); + v_count = 16u; + } + if (self->private_impl.f_mb_x > 0u) { + v_r = 0u; + while (v_r < 16u) { + v_sum += ((uint32_t)(v_left_arr[v_r])); + v_r += 1u; + } + v_count += 16u; + } + if (v_count > 0u) { + v_dc = ((uint8_t)((((uint32_t)(v_sum + (v_count >> 1u))) / v_count))); + } else { + v_dc = 128u; + } + v_result = vdupq_n_u8(v_dc); + v_r = 0u; + while (v_r < 16u) { + if (16u <= ((uint64_t)(a_workbuf.len))) { + vst1q_u8(a_workbuf.ptr, v_result); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + v_r += 1u; + } + } else if (a_mode == 1u) { + v_r = 0u; + while (v_r < 16u) { + if (16u <= ((uint64_t)(a_workbuf.len))) { + vst1q_u8(a_workbuf.ptr, v_above); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + v_r += 1u; + } + } else if (a_mode == 2u) { + v_r = 0u; + while (v_r < 16u) { + v_result = vdupq_n_u8(v_left_arr[v_r]); + if (16u <= ((uint64_t)(a_workbuf.len))) { + vst1q_u8(a_workbuf.ptr, v_result); 
+ } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + v_r += 1u; + } + } else { + v_tl_val = v_tl; + v_r = 0u; + while (v_r < 16u) { + v_left_val = v_left_arr[v_r]; + if (v_left_val >= v_tl_val) { + v_diff_u8 = vdupq_n_u8(((uint8_t)(v_left_val - v_tl_val))); + v_result = vqaddq_u8(v_above, v_diff_u8); + } else { + v_diff_u8 = vdupq_n_u8(((uint8_t)(v_tl_val - v_left_val))); + v_result = vqsubq_u8(v_above, v_diff_u8); + } + if (16u <= ((uint64_t)(a_workbuf.len))) { + vst1q_u8(a_workbuf.ptr, v_result); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + v_r += 1u; + } + } + return wuffs_base__make_empty_struct(); +} +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +// ‼ WUFFS MULTI-FILE SECTION -arm_neon + +// ‼ WUFFS MULTI-FILE SECTION +arm_neon +// -------- func vp8.decoder.predict_8x8_arm_neon + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__predict_8x8_arm_neon( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint8_t a_mode, + uint64_t a_plane_offset) { + uint8_t v_left_arr[8] = {0}; + uint8_t v_tl = 0; + wuffs_base__slice_u8 v_s = {0}; + uint8x8_t v_above = {0}; + uint8x8_t v_diff_u8 = {0}; + uint8x8_t v_result = {0}; + uint64_t v_uv_off = 0; + uint64_t v_idx = 0; + uint32_t v_r = 0; + uint32_t v_sum = 0; + uint32_t v_count = 0; + uint8_t v_dc = 0; + uint8_t v_left_val = 0; + uint8_t v_tl_val = 0; + + v_uv_off = ((uint64_t)(a_plane_offset + (((uint64_t)(self->private_impl.f_mb_y)) * 8u * ((uint64_t)(self->private_impl.f_uv_stride))))); + v_uv_off += (((uint64_t)(self->private_impl.f_mb_x)) * 8u); + if ((self->private_impl.f_mb_y > 0u) && (v_uv_off >= 
((uint64_t)(self->private_impl.f_uv_stride)))) { + v_idx = ((uint64_t)(v_uv_off - ((uint64_t)(self->private_impl.f_uv_stride)))); + if (v_idx < ((uint64_t)(a_workbuf.len))) { + v_s = wuffs_base__slice_u8__subslice_i(a_workbuf, v_idx); + if (((uint64_t)(v_s.len)) >= 8u) { + v_above = vld1_u8(v_s.ptr); + } + } + } else { + v_above = vdup_n_u8(127u); + } + v_r = 0u; + while (v_r < 8u) { + v_left_arr[v_r] = 129u; + if (self->private_impl.f_mb_x > 0u) { + v_idx = ((uint64_t)(v_uv_off + (((uint64_t)(v_r)) * ((uint64_t)(self->private_impl.f_uv_stride))))); + if (v_idx > 0u) { + v_idx -= 1u; + if (v_idx < ((uint64_t)(a_workbuf.len))) { + v_left_arr[v_r] = a_workbuf.ptr[v_idx]; + } + } + } + v_r += 1u; + } + v_tl = 127u; + if ((self->private_impl.f_mb_x > 0u) && (self->private_impl.f_mb_y > 0u) && (v_uv_off > ((uint64_t)(self->private_impl.f_uv_stride)))) { + v_idx = ((v_uv_off - ((uint64_t)(self->private_impl.f_uv_stride))) - 1u); + if (v_idx < ((uint64_t)(a_workbuf.len))) { + v_tl = a_workbuf.ptr[v_idx]; + } + } else if ((self->private_impl.f_mb_x == 0u) && (self->private_impl.f_mb_y > 0u)) { + v_tl = 129u; + } + if (v_uv_off <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_uv_off); + } + if (a_mode == 0u) { + v_sum = 0u; + v_count = 0u; + if (self->private_impl.f_mb_y > 0u) { + v_sum = ((uint32_t)(vaddlv_u8(v_above))); + v_count = 8u; + } + if (self->private_impl.f_mb_x > 0u) { + v_r = 0u; + while (v_r < 8u) { + v_sum += ((uint32_t)(v_left_arr[v_r])); + v_r += 1u; + } + v_count += 8u; + } + if (v_count > 0u) { + v_dc = ((uint8_t)((((uint32_t)(v_sum + (v_count >> 1u))) / v_count))); + } else { + v_dc = 128u; + } + v_result = vdup_n_u8(v_dc); + v_r = 0u; + while (v_r < 8u) { + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_result); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, 
((uint64_t)(self->private_impl.f_uv_stride))); + } + v_r += 1u; + } + } else if (a_mode == 1u) { + v_r = 0u; + while (v_r < 8u) { + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_above); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_uv_stride))); + } + v_r += 1u; + } + } else if (a_mode == 2u) { + v_r = 0u; + while (v_r < 8u) { + v_result = vdup_n_u8(v_left_arr[v_r]); + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_result); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_uv_stride))); + } + v_r += 1u; + } + } else { + v_tl_val = v_tl; + v_r = 0u; + while (v_r < 8u) { + v_left_val = v_left_arr[v_r]; + if (v_left_val >= v_tl_val) { + v_diff_u8 = vdup_n_u8(((uint8_t)(v_left_val - v_tl_val))); + v_result = vqadd_u8(v_above, v_diff_u8); + } else { + v_diff_u8 = vdup_n_u8(((uint8_t)(v_tl_val - v_left_val))); + v_result = vqsub_u8(v_above, v_diff_u8); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + vst1_u8(a_workbuf.ptr, v_result); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_uv_stride))); + } + v_r += 1u; + } + } + return wuffs_base__make_empty_struct(); +} +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) +// ‼ WUFFS MULTI-FILE SECTION -arm_neon + +// ‼ WUFFS MULTI-FILE SECTION +x86_sse42 +// -------- func vp8.decoder.predict_16x16_x86_sse42 + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) +WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2") +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__predict_16x16_x86_sse42( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + 
uint8_t a_mode) { + uint8_t v_left_arr[16] = {0}; + uint8_t v_tl = 0; + wuffs_base__slice_u8 v_s = {0}; + __m128i v_zero = {0}; + __m128i v_above = {0}; + __m128i v_diff = {0}; + __m128i v_result = {0}; + __m128i v_sad = {0}; + __m128i v_tmp = {0}; + uint64_t v_y_off = 0; + uint64_t v_idx = 0; + uint32_t v_r = 0; + uint32_t v_sum = 0; + uint32_t v_count = 0; + uint8_t v_dc = 0; + + v_zero = _mm_setzero_si128(); + v_y_off = (((uint64_t)(self->private_impl.f_mb_y)) * 16u * ((uint64_t)(self->private_impl.f_y_stride))); + v_y_off += (((uint64_t)(self->private_impl.f_mb_x)) * 16u); + if ((self->private_impl.f_mb_y > 0u) && (v_y_off >= ((uint64_t)(self->private_impl.f_y_stride)))) { + v_idx = ((uint64_t)(v_y_off - ((uint64_t)(self->private_impl.f_y_stride)))); + if (v_idx < ((uint64_t)(a_workbuf.len))) { + v_s = wuffs_base__slice_u8__subslice_i(a_workbuf, v_idx); + if (((uint64_t)(v_s.len)) >= 16u) { + v_above = _mm_lddqu_si128((const __m128i*)(const void*)(v_s.ptr)); + } + } + } else { + v_above = _mm_set1_epi8((int8_t)(127u)); + } + v_r = 0u; + while (v_r < 16u) { + v_left_arr[v_r] = 129u; + if (self->private_impl.f_mb_x > 0u) { + v_idx = ((uint64_t)(v_y_off + (((uint64_t)(v_r)) * ((uint64_t)(self->private_impl.f_y_stride))))); + if (v_idx > 0u) { + v_idx -= 1u; + if (v_idx < ((uint64_t)(a_workbuf.len))) { + v_left_arr[v_r] = a_workbuf.ptr[v_idx]; + } + } + } + v_r += 1u; + } + v_tl = 127u; + if ((self->private_impl.f_mb_x > 0u) && (self->private_impl.f_mb_y > 0u) && (v_y_off > ((uint64_t)(self->private_impl.f_y_stride)))) { + v_idx = ((v_y_off - ((uint64_t)(self->private_impl.f_y_stride))) - 1u); + if (v_idx < ((uint64_t)(a_workbuf.len))) { + v_tl = a_workbuf.ptr[v_idx]; + } + } else if ((self->private_impl.f_mb_x == 0u) && (self->private_impl.f_mb_y > 0u)) { + v_tl = 129u; + } + if (v_y_off <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_y_off); + } + if (a_mode == 0u) { + v_sum = 0u; + v_count = 0u; + if 
(self->private_impl.f_mb_y > 0u) { + v_sad = _mm_sad_epu8(v_above, v_zero); + v_tmp = _mm_srli_si128(v_sad, (int32_t)(8u)); + v_sad = _mm_add_epi32(v_sad, v_tmp); + v_sum = ((uint32_t)(_mm_cvtsi128_si32(v_sad))); + v_count = 16u; + } + if (self->private_impl.f_mb_x > 0u) { + v_r = 0u; + while (v_r < 16u) { + v_sum += ((uint32_t)(v_left_arr[v_r])); + v_r += 1u; + } + v_count += 16u; + } + if (v_count > 0u) { + v_dc = ((uint8_t)((((uint32_t)(v_sum + (v_count >> 1u))) / v_count))); + } else { + v_dc = 128u; + } + v_result = _mm_set1_epi8((int8_t)(v_dc)); + v_r = 0u; + while (v_r < 16u) { + if (16u <= ((uint64_t)(a_workbuf.len))) { + _mm_storeu_si128((__m128i*)(void*)(a_workbuf.ptr), v_result); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + v_r += 1u; + } + } else if (a_mode == 1u) { + v_r = 0u; + while (v_r < 16u) { + if (16u <= ((uint64_t)(a_workbuf.len))) { + _mm_storeu_si128((__m128i*)(void*)(a_workbuf.ptr), v_above); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + v_r += 1u; + } + } else if (a_mode == 2u) { + v_r = 0u; + while (v_r < 16u) { + v_result = _mm_set1_epi8((int8_t)(v_left_arr[v_r])); + if (16u <= ((uint64_t)(a_workbuf.len))) { + _mm_storeu_si128((__m128i*)(void*)(a_workbuf.ptr), v_result); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + v_r += 1u; + } + } else { + v_r = 0u; + while (v_r < 16u) { + if (v_left_arr[v_r] >= v_tl) { + v_diff = _mm_set1_epi8((int8_t)(((uint8_t)(v_left_arr[v_r] - v_tl)))); + v_result = _mm_adds_epu8(v_above, v_diff); + } else { + v_diff = _mm_set1_epi8((int8_t)(((uint8_t)(v_tl - 
v_left_arr[v_r])))); + v_result = _mm_subs_epu8(v_above, v_diff); + } + if (16u <= ((uint64_t)(a_workbuf.len))) { + _mm_storeu_si128((__m128i*)(void*)(a_workbuf.ptr), v_result); + } + if (((uint64_t)(self->private_impl.f_y_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_y_stride))); + } + v_r += 1u; + } + } + return wuffs_base__make_empty_struct(); +} +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) +// ‼ WUFFS MULTI-FILE SECTION -x86_sse42 + +// ‼ WUFFS MULTI-FILE SECTION +x86_sse42 +// -------- func vp8.decoder.predict_8x8_x86_sse42 + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) +WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2") +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__predict_8x8_x86_sse42( + wuffs_vp8__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint8_t a_mode, + uint64_t a_plane_offset) { + uint8_t v_left_arr[8] = {0}; + uint8_t v_tl = 0; + wuffs_base__slice_u8 v_s = {0}; + __m128i v_zero = {0}; + __m128i v_above = {0}; + __m128i v_diff = {0}; + __m128i v_result = {0}; + __m128i v_sad = {0}; + uint64_t v_uv_off = 0; + uint64_t v_idx = 0; + uint32_t v_r = 0; + uint32_t v_sum = 0; + uint32_t v_count = 0; + uint8_t v_dc = 0; + + v_zero = _mm_setzero_si128(); + v_uv_off = ((uint64_t)(a_plane_offset + (((uint64_t)(self->private_impl.f_mb_y)) * 8u * ((uint64_t)(self->private_impl.f_uv_stride))))); + v_uv_off += (((uint64_t)(self->private_impl.f_mb_x)) * 8u); + if ((self->private_impl.f_mb_y > 0u) && (v_uv_off >= ((uint64_t)(self->private_impl.f_uv_stride)))) { + v_idx = ((uint64_t)(v_uv_off - ((uint64_t)(self->private_impl.f_uv_stride)))); + if (v_idx < ((uint64_t)(a_workbuf.len))) { + v_s = wuffs_base__slice_u8__subslice_i(a_workbuf, v_idx); + if (((uint64_t)(v_s.len)) >= 8u) { + v_above = _mm_loadl_epi64((const __m128i*)(const void*)(v_s.ptr)); + } + } + } else { + v_above = 
_mm_set1_epi8((int8_t)(127u)); + } + v_r = 0u; + while (v_r < 8u) { + v_left_arr[v_r] = 129u; + if (self->private_impl.f_mb_x > 0u) { + v_idx = ((uint64_t)(v_uv_off + (((uint64_t)(v_r)) * ((uint64_t)(self->private_impl.f_uv_stride))))); + if (v_idx > 0u) { + v_idx -= 1u; + if (v_idx < ((uint64_t)(a_workbuf.len))) { + v_left_arr[v_r] = a_workbuf.ptr[v_idx]; + } + } + } + v_r += 1u; + } + v_tl = 127u; + if ((self->private_impl.f_mb_x > 0u) && (self->private_impl.f_mb_y > 0u) && (v_uv_off > ((uint64_t)(self->private_impl.f_uv_stride)))) { + v_idx = ((v_uv_off - ((uint64_t)(self->private_impl.f_uv_stride))) - 1u); + if (v_idx < ((uint64_t)(a_workbuf.len))) { + v_tl = a_workbuf.ptr[v_idx]; + } + } else if ((self->private_impl.f_mb_x == 0u) && (self->private_impl.f_mb_y > 0u)) { + v_tl = 129u; + } + if (v_uv_off <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, v_uv_off); + } + if (a_mode == 0u) { + v_sum = 0u; + v_count = 0u; + if (self->private_impl.f_mb_y > 0u) { + v_sad = _mm_sad_epu8(v_above, v_zero); + v_sum = ((uint32_t)(_mm_cvtsi128_si32(v_sad))); + v_count = 8u; + } + if (self->private_impl.f_mb_x > 0u) { + v_r = 0u; + while (v_r < 8u) { + v_sum += ((uint32_t)(v_left_arr[v_r])); + v_r += 1u; + } + v_count += 8u; + } + if (v_count > 0u) { + v_dc = ((uint8_t)((((uint32_t)(v_sum + (v_count >> 1u))) / v_count))); + } else { + v_dc = 128u; + } + v_result = _mm_set1_epi8((int8_t)(v_dc)); + v_r = 0u; + while (v_r < 8u) { + if (8u <= ((uint64_t)(a_workbuf.len))) { + _mm_storeu_si64((void*)(a_workbuf.ptr), v_result); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_uv_stride))); + } + v_r += 1u; + } + } else if (a_mode == 1u) { + v_r = 0u; + while (v_r < 8u) { + if (8u <= ((uint64_t)(a_workbuf.len))) { + _mm_storeu_si64((void*)(a_workbuf.ptr), v_above); + } + if 
(((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_uv_stride))); + } + v_r += 1u; + } + } else if (a_mode == 2u) { + v_r = 0u; + while (v_r < 8u) { + v_result = _mm_set1_epi8((int8_t)(v_left_arr[v_r])); + if (8u <= ((uint64_t)(a_workbuf.len))) { + _mm_storeu_si64((void*)(a_workbuf.ptr), v_result); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_uv_stride))); + } + v_r += 1u; + } + } else { + v_r = 0u; + while (v_r < 8u) { + if (v_left_arr[v_r] >= v_tl) { + v_diff = _mm_set1_epi8((int8_t)(((uint8_t)(v_left_arr[v_r] - v_tl)))); + v_result = _mm_adds_epu8(v_above, v_diff); + } else { + v_diff = _mm_set1_epi8((int8_t)(((uint8_t)(v_tl - v_left_arr[v_r])))); + v_result = _mm_subs_epu8(v_above, v_diff); + } + if (8u <= ((uint64_t)(a_workbuf.len))) { + _mm_storeu_si64((void*)(a_workbuf.ptr), v_result); + } + if (((uint64_t)(self->private_impl.f_uv_stride)) <= ((uint64_t)(a_workbuf.len))) { + a_workbuf = wuffs_base__slice_u8__subslice_i(a_workbuf, ((uint64_t)(self->private_impl.f_uv_stride))); + } + v_r += 1u; + } + } + return wuffs_base__make_empty_struct(); +} +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) +// ‼ WUFFS MULTI-FILE SECTION -x86_sse42 + +// -------- func vp8.decoder.get_quirk + +WUFFS_BASE__GENERATED_C_CODE +WUFFS_BASE__MAYBE_STATIC uint64_t +wuffs_vp8__decoder__get_quirk( + const wuffs_vp8__decoder* self, + uint32_t a_key) { + if (!self) { + return 0; + } + if ((self->private_impl.magic != WUFFS_BASE__MAGIC) && + (self->private_impl.magic != WUFFS_BASE__DISABLED)) { + return 0; + } + + return 0u; +} + +// -------- func vp8.decoder.set_quirk + +WUFFS_BASE__GENERATED_C_CODE +WUFFS_BASE__MAYBE_STATIC wuffs_base__status +wuffs_vp8__decoder__set_quirk( + wuffs_vp8__decoder* self, + uint32_t a_key, + 
uint64_t a_value) { + if (!self) { + return wuffs_base__make_status(wuffs_base__error__bad_receiver); + } + if (self->private_impl.magic != WUFFS_BASE__MAGIC) { + return wuffs_base__make_status( + (self->private_impl.magic == WUFFS_BASE__DISABLED) + ? wuffs_base__error__disabled_by_previous_error + : wuffs_base__error__initialize_not_called); + } + + return wuffs_base__make_status(wuffs_base__error__unsupported_option); +} + +// -------- func vp8.decoder.decode_image_config + +WUFFS_BASE__GENERATED_C_CODE +WUFFS_BASE__MAYBE_STATIC wuffs_base__status +wuffs_vp8__decoder__decode_image_config( + wuffs_vp8__decoder* self, + wuffs_base__image_config* a_dst, + wuffs_base__io_buffer* a_src) { + if (!self) { + return wuffs_base__make_status(wuffs_base__error__bad_receiver); + } + if (self->private_impl.magic != WUFFS_BASE__MAGIC) { + return wuffs_base__make_status( + (self->private_impl.magic == WUFFS_BASE__DISABLED) + ? wuffs_base__error__disabled_by_previous_error + : wuffs_base__error__initialize_not_called); + } + if (!a_src) { + self->private_impl.magic = WUFFS_BASE__DISABLED; + return wuffs_base__make_status(wuffs_base__error__bad_argument); + } + if ((self->private_impl.active_coroutine != 0) && + (self->private_impl.active_coroutine != 1)) { + self->private_impl.magic = WUFFS_BASE__DISABLED; + return wuffs_base__make_status(wuffs_base__error__interleaved_coroutine_calls); + } + self->private_impl.active_coroutine = 0; + wuffs_base__status status = wuffs_base__make_status(NULL); + + wuffs_base__status v_status = wuffs_base__make_status(NULL); + + uint32_t coro_susp_point = self->private_impl.p_decode_image_config; + switch (coro_susp_point) { + WUFFS_BASE__COROUTINE_SUSPENSION_POINT_0; + + while (true) { + { + wuffs_base__status t_0 = wuffs_vp8__decoder__do_decode_image_config(self, a_dst, a_src); + v_status = t_0; + } + if ((v_status.repr == wuffs_base__suspension__short_read) && (a_src && a_src->meta.closed)) { status = 
wuffs_base__make_status(wuffs_vp8__error__truncated_input); goto exit; } @@ -80646,10 +95068,12 @@ wuffs_vp8__decoder__do_decode_image_config( } v_c32 = t_0; } - if ((v_c32 & 1u) != 0u) { + self->private_impl.f_key_frame = ((v_c32 & 1u) == 0u); + if ( ! self->private_impl.f_key_frame) { status = wuffs_base__make_status(wuffs_vp8__error__unsupported_vp8_file); goto exit; } + self->private_impl.f_partition0_size = ((v_c32 >> 5u) & 524287u); { WUFFS_BASE__COROUTINE_SUSPENSION_POINT(3); uint32_t t_1; @@ -80714,6 +95138,13 @@ wuffs_vp8__decoder__do_decode_image_config( } self->private_impl.f_width = (16383u & (v_c32 >> 0u)); self->private_impl.f_height = (16383u & (v_c32 >> 16u)); + self->private_impl.f_mb_width = ((self->private_impl.f_width + 15u) / 16u); + self->private_impl.f_mb_height = ((self->private_impl.f_height + 15u) / 16u); + self->private_impl.f_y_stride = (self->private_impl.f_mb_width * 16u); + self->private_impl.f_uv_stride = (self->private_impl.f_mb_width * 8u); + self->private_impl.f_workbuf_offset_y_end = (((uint64_t)(self->private_impl.f_y_stride)) * ((uint64_t)((self->private_impl.f_mb_height * 16u)))); + self->private_impl.f_workbuf_offset_u_end = (self->private_impl.f_workbuf_offset_y_end + (((uint64_t)(self->private_impl.f_uv_stride)) * ((uint64_t)((self->private_impl.f_mb_height * 8u))))); + self->private_impl.f_workbuf_offset_v_end = (self->private_impl.f_workbuf_offset_u_end + (((uint64_t)(self->private_impl.f_uv_stride)) * ((uint64_t)((self->private_impl.f_mb_height * 8u))))); self->private_impl.f_frame_config_io_position = wuffs_base__u64__sat_add((a_src ? 
a_src->meta.pos : 0), ((uint64_t)(iop_a_src - io0_a_src))); if (a_dst != NULL) { wuffs_base__image_config__set( @@ -80987,6 +95418,19 @@ wuffs_vp8__decoder__do_decode_frame( wuffs_base__status status = wuffs_base__make_status(NULL); wuffs_base__status v_status = wuffs_base__make_status(NULL); + uint32_t v_remaining = 0; + uint64_t v_off = 0; + + const uint8_t* iop_a_src = NULL; + const uint8_t* io0_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL; + const uint8_t* io1_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL; + const uint8_t* io2_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL; + if (a_src && a_src->data.ptr) { + io0_a_src = a_src->data.ptr; + io1_a_src = io0_a_src + a_src->meta.ri; + iop_a_src = io1_a_src; + io2_a_src = io0_a_src + a_src->meta.wi; + } uint32_t coro_susp_point = self->private_impl.p_do_decode_frame; switch (coro_susp_point) { @@ -80994,8 +95438,14 @@ wuffs_vp8__decoder__do_decode_frame( if (self->private_impl.f_call_sequence == 64u) { } else if (self->private_impl.f_call_sequence < 64u) { + if (a_src) { + a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr)); + } WUFFS_BASE__COROUTINE_SUSPENSION_POINT(1); status = wuffs_vp8__decoder__do_decode_frame_config(self, NULL, a_src); + if (a_src) { + iop_a_src = a_src->data.ptr + a_src->meta.ri; + } if (status.repr) { goto suspend; } @@ -81003,8 +95453,168 @@ wuffs_vp8__decoder__do_decode_frame( status = wuffs_base__make_status(wuffs_base__note__end_of_data); goto ok; } - self->private_impl.f_dst_x = 0u; - self->private_impl.f_dst_y = 0u; + if (self->private_impl.f_workbuf_offset_v_end > ((uint64_t)(a_workbuf.len))) { + status = wuffs_base__make_status(wuffs_base__error__bad_workbuf_length); + goto exit; + } + self->private_impl.choosy_idct_add = ( +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) + wuffs_base__cpu_arch__have_arm_neon() ? &wuffs_vp8__decoder__idct_add_arm_neon : +#endif +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) + wuffs_base__cpu_arch__have_x86_sse42() ? 
&wuffs_vp8__decoder__idct_add_x86_sse42 : +#endif + self->private_impl.choosy_idct_add); + self->private_impl.choosy_idct_dc_add = ( +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) + wuffs_base__cpu_arch__have_arm_neon() ? &wuffs_vp8__decoder__idct_dc_add_arm_neon : +#endif +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) + wuffs_base__cpu_arch__have_x86_sse42() ? &wuffs_vp8__decoder__idct_dc_add_x86_sse42 : +#endif + self->private_impl.choosy_idct_dc_add); + self->private_impl.choosy_idct_add_pair = ( +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) + wuffs_base__cpu_arch__have_x86_avx2() ? &wuffs_vp8__decoder__idct_add_pair_x86_avx2 : +#endif + self->private_impl.choosy_idct_add_pair); + self->private_impl.choosy_idct_dc_add_pair = ( +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) + wuffs_base__cpu_arch__have_x86_avx2() ? &wuffs_vp8__decoder__idct_dc_add_pair_x86_avx2 : +#endif + self->private_impl.choosy_idct_dc_add_pair); + self->private_impl.choosy_predict_16x16 = ( +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) + wuffs_base__cpu_arch__have_arm_neon() ? &wuffs_vp8__decoder__predict_16x16_arm_neon : +#endif +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) + wuffs_base__cpu_arch__have_x86_sse42() ? &wuffs_vp8__decoder__predict_16x16_x86_sse42 : +#endif + self->private_impl.choosy_predict_16x16); + self->private_impl.choosy_predict_8x8 = ( +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) + wuffs_base__cpu_arch__have_arm_neon() ? &wuffs_vp8__decoder__predict_8x8_arm_neon : +#endif +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) + wuffs_base__cpu_arch__have_x86_sse42() ? &wuffs_vp8__decoder__predict_8x8_x86_sse42 : +#endif + self->private_impl.choosy_predict_8x8); + self->private_impl.choosy_simple_vfilter_16 = ( +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) + wuffs_base__cpu_arch__have_arm_neon() ? 
&wuffs_vp8__decoder__simple_vfilter_16_arm_neon : +#endif +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) + wuffs_base__cpu_arch__have_x86_sse42() ? &wuffs_vp8__decoder__simple_vfilter_16_x86_sse42 : +#endif + self->private_impl.choosy_simple_vfilter_16); + self->private_impl.choosy_normal_vfilter_inner_16 = ( +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) + wuffs_base__cpu_arch__have_arm_neon() ? &wuffs_vp8__decoder__normal_vfilter_inner_16_arm_neon : +#endif +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) + wuffs_base__cpu_arch__have_x86_sse42() ? &wuffs_vp8__decoder__normal_vfilter_inner_16_x86_sse42 : +#endif + self->private_impl.choosy_normal_vfilter_inner_16); + self->private_impl.choosy_normal_vfilter_mb_16 = ( +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) + wuffs_base__cpu_arch__have_arm_neon() ? &wuffs_vp8__decoder__normal_vfilter_mb_16_arm_neon : +#endif +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) + wuffs_base__cpu_arch__have_x86_sse42() ? &wuffs_vp8__decoder__normal_vfilter_mb_16_x86_sse42 : +#endif + self->private_impl.choosy_normal_vfilter_mb_16); + self->private_impl.choosy_normal_vfilter_mb_8 = ( +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) + wuffs_base__cpu_arch__have_arm_neon() ? &wuffs_vp8__decoder__normal_vfilter_mb_8_arm_neon : +#endif +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) + wuffs_base__cpu_arch__have_x86_sse42() ? &wuffs_vp8__decoder__normal_vfilter_mb_8_x86_sse42 : +#endif + self->private_impl.choosy_normal_vfilter_mb_8); + self->private_impl.choosy_normal_hfilter_mb_16 = ( +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) + wuffs_base__cpu_arch__have_arm_neon() ? &wuffs_vp8__decoder__normal_hfilter_mb_16_arm_neon : +#endif +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) + wuffs_base__cpu_arch__have_x86_sse42() ? 
&wuffs_vp8__decoder__normal_hfilter_mb_16_x86_sse42 : +#endif + self->private_impl.choosy_normal_hfilter_mb_16); + self->private_impl.choosy_normal_hfilter_mb_8 = ( +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) + wuffs_base__cpu_arch__have_arm_neon() ? &wuffs_vp8__decoder__normal_hfilter_mb_8_arm_neon : +#endif +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) + wuffs_base__cpu_arch__have_x86_sse42() ? &wuffs_vp8__decoder__normal_hfilter_mb_8_x86_sse42 : +#endif + self->private_impl.choosy_normal_hfilter_mb_8); + self->private_impl.choosy_normal_hfilter_inner_16 = ( +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) + wuffs_base__cpu_arch__have_arm_neon() ? &wuffs_vp8__decoder__normal_hfilter_inner_16_arm_neon : +#endif +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) + wuffs_base__cpu_arch__have_x86_sse42() ? &wuffs_vp8__decoder__normal_hfilter_inner_16_x86_sse42 : +#endif + self->private_impl.choosy_normal_hfilter_inner_16); + self->private_impl.choosy_normal_hfilter_inner_8 = ( +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) + wuffs_base__cpu_arch__have_arm_neon() ? &wuffs_vp8__decoder__normal_hfilter_inner_8_arm_neon : +#endif +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) + wuffs_base__cpu_arch__have_x86_sse42() ? &wuffs_vp8__decoder__normal_hfilter_inner_8_x86_sse42 : +#endif + self->private_impl.choosy_normal_hfilter_inner_8); + self->private_impl.choosy_normal_vfilter_inner_8 = ( +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__ARM_NEON) + wuffs_base__cpu_arch__have_arm_neon() ? &wuffs_vp8__decoder__normal_vfilter_inner_8_arm_neon : +#endif +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V2) + wuffs_base__cpu_arch__have_x86_sse42() ? &wuffs_vp8__decoder__normal_vfilter_inner_8_x86_sse42 : +#endif + self->private_impl.choosy_normal_vfilter_inner_8); + self->private_impl.choosy_normal_vfilter_mb_uv = ( +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) + wuffs_base__cpu_arch__have_x86_avx2() ? 
&wuffs_vp8__decoder__normal_vfilter_mb_uv_x86_avx2 : +#endif + self->private_impl.choosy_normal_vfilter_mb_uv); + self->private_impl.choosy_normal_hfilter_mb_uv = ( +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) + wuffs_base__cpu_arch__have_x86_avx2() ? &wuffs_vp8__decoder__normal_hfilter_mb_uv_x86_avx2 : +#endif + self->private_impl.choosy_normal_hfilter_mb_uv); + self->private_impl.choosy_normal_vfilter_inner_uv = ( +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) + wuffs_base__cpu_arch__have_x86_avx2() ? &wuffs_vp8__decoder__normal_vfilter_inner_uv_x86_avx2 : +#endif + self->private_impl.choosy_normal_vfilter_inner_uv); + self->private_impl.choosy_normal_hfilter_inner_uv = ( +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) + wuffs_base__cpu_arch__have_x86_avx2() ? &wuffs_vp8__decoder__normal_hfilter_inner_uv_x86_avx2 : +#endif + self->private_impl.choosy_normal_hfilter_inner_uv); + wuffs_vp8__decoder__init_mb_coeffs(self); + wuffs_vp8__decoder__init_coeff_probs(self); + self->private_impl.f_p0_wbuf_ri = 0u; + self->private_impl.f_p0_wbuf_count = 0u; + v_off = self->private_impl.f_workbuf_offset_v_end; + v_remaining = self->private_impl.f_partition0_size; + while ((v_remaining > 0u) && (((uint64_t)(io2_a_src - iop_a_src)) > 0u)) { + if (v_off < ((uint64_t)(a_workbuf.len))) { + a_workbuf.ptr[v_off] = wuffs_base__peek_u8be__no_bounds_check(iop_a_src); + } + iop_a_src += 1u; + v_off += 1u; + v_remaining -= 1u; + self->private_impl.f_p0_wbuf_count += 1u; + } + wuffs_vp8__decoder__decode_partition0(self, a_workbuf); + wuffs_vp8__decoder__precompute_filter_strengths(self); + if (self->private_impl.f_filter_level == 0u) { + self->private_impl.f_filter_extra_rows = 0u; + } else if (self->private_impl.f_filter_type == 1u) { + self->private_impl.f_filter_extra_rows = 2u; + } else { + self->private_impl.f_filter_extra_rows = 6u; + } v_status = wuffs_base__pixel_swizzler__prepare(&self->private_impl.f_swizzler, wuffs_base__pixel_buffer__pixel_format(a_dst), 
wuffs_base__pixel_buffer__palette(a_dst), @@ -81021,7 +95631,13 @@ wuffs_vp8__decoder__do_decode_frame( } goto ok; } - v_status = wuffs_vp8__decoder__make_a_placeholder_gradient(self, a_dst); + if (a_src) { + a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr)); + } + v_status = wuffs_vp8__decoder__decode_frame_mb(self, a_src, a_dst, a_workbuf); + if (a_src) { + iop_a_src = a_src->data.ptr + a_src->meta.ri; + } if ( ! wuffs_base__status__is_ok(&v_status)) { status = v_status; if (wuffs_base__status__is_error(&status)) { @@ -81045,52 +95661,146 @@ wuffs_vp8__decoder__do_decode_frame( goto exit; exit: + if (a_src && a_src->data.ptr) { + a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr)); + } + return status; } -// -------- func vp8.decoder.make_a_placeholder_gradient +// -------- func vp8.decoder.init_mb_coeffs + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__init_mb_coeffs( + wuffs_vp8__decoder* self) { + uint32_t v_i = 0; + + v_i = 0u; + while (v_i < 400u) { + self->private_data.f_mb_coeffs[v_i] = 0u; + v_i += 1u; + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.init_coeff_probs + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_vp8__decoder__init_coeff_probs( + wuffs_vp8__decoder* self) { + uint32_t v_i = 0; + + v_i = 0u; + while (v_i < 1056u) { + self->private_data.f_coeff_probs[v_i] = WUFFS_VP8__DEFAULT_COEFF_PROBS[v_i]; + v_i += 1u; + } + return wuffs_base__make_empty_struct(); +} + +// -------- func vp8.decoder.swizzle_mb_row WUFFS_BASE__GENERATED_C_CODE static wuffs_base__status -wuffs_vp8__decoder__make_a_placeholder_gradient( +wuffs_vp8__decoder__swizzle_mb_row( wuffs_vp8__decoder* self, - wuffs_base__pixel_buffer* a_dst) { - wuffs_base__pixel_format v_dst_pixfmt = {0}; - uint32_t v_dst_bits_per_pixel = 0; - uint32_t v_dst_bytes_per_pixel = 0; - uint64_t v_dst_bytes_per_row = 0; - wuffs_base__table_u8 v_tab = {0}; - wuffs_base__slice_u8 v_dst = {0}; - 
uint64_t v_i = 0; - uint8_t v_bgrx[4] = {0}; - - v_dst_pixfmt = wuffs_base__pixel_buffer__pixel_format(a_dst); - v_dst_bits_per_pixel = wuffs_base__pixel_format__bits_per_pixel(&v_dst_pixfmt); - if ((v_dst_bits_per_pixel & 7u) != 0u) { - return wuffs_base__make_status(wuffs_base__error__unsupported_option); + wuffs_base__pixel_buffer* a_dst, + wuffs_base__slice_u8 a_workbuf, + uint32_t a_mby, + bool a_is_last) { + wuffs_base__status v_status = wuffs_base__make_status(NULL); + wuffs_base__slice_u8 v_src0 = {0}; + wuffs_base__slice_u8 v_src1 = {0}; + wuffs_base__slice_u8 v_src2 = {0}; + wuffs_base__slice_u8 v_src3 = {0}; + uint32_t v_y_width = 0; + uint32_t v_uv_width = 0; + uint32_t v_y_min = 0; + uint32_t v_y_max = 0; + uint64_t v_y_off = 0; + uint64_t v_uv_off = 0; + uint64_t v_u_start = 0; + uint64_t v_v_start = 0; + uint32_t v_rem_y_h = 0; + uint32_t v_rem_uv_h = 0; + + if (self->private_impl.f_workbuf_offset_v_end > ((uint64_t)(a_workbuf.len))) { + return wuffs_base__make_status(wuffs_base__error__bad_workbuf_length); } - v_dst_bytes_per_pixel = (v_dst_bits_per_pixel / 8u); - v_dst_bytes_per_row = ((uint64_t)((self->private_impl.f_width * v_dst_bytes_per_pixel))); - v_tab = wuffs_base__pixel_buffer__plane(a_dst, 0u); - v_bgrx[0u] = 128u; - while (self->private_impl.f_dst_y < self->private_impl.f_height) { - v_bgrx[1u] = ((uint8_t)(self->private_impl.f_dst_y)); - self->private_impl.f_dst_x = 0u; - while (self->private_impl.f_dst_x < self->private_impl.f_width) { - v_bgrx[2u] = ((uint8_t)(self->private_impl.f_dst_x)); - v_dst = wuffs_private_impl__table_u8__row_u32(v_tab, self->private_impl.f_dst_y); - if (v_dst_bytes_per_row < ((uint64_t)(v_dst.len))) { - v_dst = wuffs_base__slice_u8__subslice_j(v_dst, v_dst_bytes_per_row); - } - v_i = (((uint64_t)(self->private_impl.f_dst_x)) * ((uint64_t)(v_dst_bytes_per_pixel))); - if (v_i < ((uint64_t)(v_dst.len))) { - wuffs_base__pixel_swizzler__swizzle_interleaved_from_slice(&self->private_impl.f_swizzler, 
wuffs_base__slice_u8__subslice_i(v_dst, v_i), wuffs_base__pixel_buffer__palette(a_dst), wuffs_base__make_slice_u8(v_bgrx, 4)); - } - self->private_impl.f_dst_x += 1u; - } - self->private_impl.f_dst_y += 1u; + if (self->private_impl.f_workbuf_offset_y_end > self->private_impl.f_workbuf_offset_u_end) { + return wuffs_base__make_status(wuffs_base__error__bad_workbuf_length); } - return wuffs_base__make_status(NULL); + if (self->private_impl.f_workbuf_offset_u_end > self->private_impl.f_workbuf_offset_v_end) { + return wuffs_base__make_status(wuffs_base__error__bad_workbuf_length); + } + v_y_width = (self->private_impl.f_mb_width * 16u); + v_uv_width = (self->private_impl.f_mb_width * 8u); + v_y_min = (a_mby * 16u); + if (a_mby > 0u) { + wuffs_private_impl__u32__sat_sub_indirect(&v_y_min, self->private_impl.f_filter_extra_rows); + } + v_y_max = ((((uint32_t)(a_mby)) + 1u) * 16u); + if ( ! a_is_last) { + wuffs_private_impl__u32__sat_sub_indirect(&v_y_max, self->private_impl.f_filter_extra_rows); + } + v_y_max = wuffs_base__u32__min(v_y_max, self->private_impl.f_height); + if (v_y_min >= v_y_max) { + return wuffs_base__make_status(NULL); + } + v_y_off = (((uint64_t)(v_y_min)) * ((uint64_t)(self->private_impl.f_y_stride))); + v_uv_off = (((uint64_t)((v_y_min / 2u))) * ((uint64_t)(self->private_impl.f_uv_stride))); + if (v_y_off <= self->private_impl.f_workbuf_offset_y_end) { + v_src0 = wuffs_base__slice_u8__subslice_ij(a_workbuf, v_y_off, self->private_impl.f_workbuf_offset_y_end); + } + v_u_start = wuffs_base__u64__sat_add(self->private_impl.f_workbuf_offset_y_end, v_uv_off); + if (v_u_start <= self->private_impl.f_workbuf_offset_u_end) { + v_src1 = wuffs_base__slice_u8__subslice_ij(a_workbuf, v_u_start, self->private_impl.f_workbuf_offset_u_end); + } + v_v_start = wuffs_base__u64__sat_add(self->private_impl.f_workbuf_offset_u_end, v_uv_off); + if (v_v_start <= self->private_impl.f_workbuf_offset_v_end) { + v_src2 = wuffs_base__slice_u8__subslice_ij(a_workbuf, v_v_start, 
self->private_impl.f_workbuf_offset_v_end); + } + v_src3 = wuffs_base__utility__empty_slice_u8(); + v_rem_y_h = wuffs_base__u32__sat_sub((self->private_impl.f_mb_height * 16u), v_y_min); + v_rem_uv_h = wuffs_base__u32__sat_sub((self->private_impl.f_mb_height * 8u), (v_y_min / 2u)); + v_status = wuffs_base__pixel_swizzler__swizzle_ycck(&self->private_impl.f_swizzler, + a_dst, + wuffs_base__pixel_buffer__palette(a_dst), + 0u, + self->private_impl.f_width, + v_y_min, + v_y_max, + v_src0, + v_src1, + v_src2, + v_src3, + v_y_width, + v_uv_width, + v_uv_width, + 0u, + v_rem_y_h, + v_rem_uv_h, + v_rem_uv_h, + 0u, + v_y_width, + v_uv_width, + v_uv_width, + 0u, + 2u, + 1u, + 1u, + 0u, + 2u, + 1u, + 1u, + 0u, + false, + false, + true, + wuffs_base__make_slice_u8(self->private_data.f_scratch_buffer_2k, 2048)); + return wuffs_private_impl__status__ensure_not_a_suspension(v_status); } // -------- func vp8.decoder.frame_dirty_rect @@ -81267,7 +95977,27 @@ wuffs_vp8__decoder__workbuf_len( return wuffs_base__utility__empty_range_ii_u64(); } - return wuffs_base__utility__make_range_ii_u64(0u, 0u); + uint64_t v_total = 0; + + v_total = wuffs_base__u64__sat_add(self->private_impl.f_workbuf_offset_v_end, ((uint64_t)(self->private_impl.f_partition0_size))); + return wuffs_base__utility__make_range_ii_u64(v_total, v_total); +} + +// -------- func vp8.decoder.workbuf_len_total + +WUFFS_BASE__GENERATED_C_CODE +WUFFS_BASE__MAYBE_STATIC uint64_t +wuffs_vp8__decoder__workbuf_len_total( + const wuffs_vp8__decoder* self) { + if (!self) { + return 0; + } + if ((self->private_impl.magic != WUFFS_BASE__MAGIC) && + (self->private_impl.magic != WUFFS_BASE__DISABLED)) { + return 0; + } + + return wuffs_base__u64__sat_add(self->private_impl.f_workbuf_offset_v_end, ((uint64_t)(self->private_impl.f_partition0_size))); } #endif // !defined(WUFFS_CONFIG__MODULES) || defined(WUFFS_CONFIG__MODULE__VP8) @@ -82208,10 +96938,11 @@ const char wuffs_webp__error__bad_transform[] = "#webp: bad transform"; const 
char wuffs_webp__error__short_chunk[] = "#webp: short chunk"; const char wuffs_webp__error__truncated_input[] = "#webp: truncated input"; const char wuffs_webp__error__unsupported_number_of_huffman_groups[] = "#webp: unsupported number of Huffman groups"; -const char wuffs_webp__error__unsupported_transform_after_color_indexing_transform[] = "#webp: unsupported transform after color indexing transform"; const char wuffs_webp__error__unsupported_webp_file[] = "#webp: unsupported WebP file"; const char wuffs_webp__error__internal_error_inconsistent_huffman_code[] = "#webp: internal error: inconsistent Huffman code"; +const char wuffs_webp__error__internal_error_inconsistent_huffman_decoder_state[] = "#webp: internal error: inconsistent Huffman decoder state"; const char wuffs_webp__error__internal_error_inconsistent_dst_buffer[] = "#webp: internal error: inconsistent dst buffer"; +const char wuffs_webp__error__internal_error_inconsistent_i_o[] = "#webp: internal error: inconsistent I/O"; const char wuffs_webp__error__internal_error_inconsistent_n_bits[] = "#webp: internal error: inconsistent n_bits"; // ---------------- Private Consts @@ -82233,9 +96964,40 @@ WUFFS_WEBP__REPEAT_COUNTS[4] WUFFS_BASE__POTENTIALLY_UNUSED = { 3u, 3u, 11u, 0u, }; -static const uint16_t -WUFFS_WEBP__HUFFMAN_TABLE_BASE_OFFSETS[5] WUFFS_BASE__POTENTIALLY_UNUSED = { - 1612u, 0u, 511u, 1022u, 1533u, +static const uint8_t +WUFFS_WEBP__REVERSE8[256] WUFFS_BASE__POTENTIALLY_UNUSED = { + 0u, 128u, 64u, 192u, 32u, 160u, 96u, 224u, + 16u, 144u, 80u, 208u, 48u, 176u, 112u, 240u, + 8u, 136u, 72u, 200u, 40u, 168u, 104u, 232u, + 24u, 152u, 88u, 216u, 56u, 184u, 120u, 248u, + 4u, 132u, 68u, 196u, 36u, 164u, 100u, 228u, + 20u, 148u, 84u, 212u, 52u, 180u, 116u, 244u, + 12u, 140u, 76u, 204u, 44u, 172u, 108u, 236u, + 28u, 156u, 92u, 220u, 60u, 188u, 124u, 252u, + 2u, 130u, 66u, 194u, 34u, 162u, 98u, 226u, + 18u, 146u, 82u, 210u, 50u, 178u, 114u, 242u, + 10u, 138u, 74u, 202u, 42u, 170u, 106u, 234u, + 26u, 
154u, 90u, 218u, 58u, 186u, 122u, 250u, + 6u, 134u, 70u, 198u, 38u, 166u, 102u, 230u, + 22u, 150u, 86u, 214u, 54u, 182u, 118u, 246u, + 14u, 142u, 78u, 206u, 46u, 174u, 110u, 238u, + 30u, 158u, 94u, 222u, 62u, 190u, 126u, 254u, + 1u, 129u, 65u, 193u, 33u, 161u, 97u, 225u, + 17u, 145u, 81u, 209u, 49u, 177u, 113u, 241u, + 9u, 137u, 73u, 201u, 41u, 169u, 105u, 233u, + 25u, 153u, 89u, 217u, 57u, 185u, 121u, 249u, + 5u, 133u, 69u, 197u, 37u, 165u, 101u, 229u, + 21u, 149u, 85u, 213u, 53u, 181u, 117u, 245u, + 13u, 141u, 77u, 205u, 45u, 173u, 109u, 237u, + 29u, 157u, 93u, 221u, 61u, 189u, 125u, 253u, + 3u, 131u, 67u, 195u, 35u, 163u, 99u, 227u, + 19u, 147u, 83u, 211u, 51u, 179u, 115u, 243u, + 11u, 139u, 75u, 203u, 43u, 171u, 107u, 235u, + 27u, 155u, 91u, 219u, 59u, 187u, 123u, 251u, + 7u, 135u, 71u, 199u, 39u, 167u, 103u, 231u, + 23u, 151u, 87u, 215u, 55u, 183u, 119u, 247u, + 15u, 143u, 79u, 207u, 47u, 175u, 111u, 239u, + 31u, 159u, 95u, 223u, 63u, 191u, 127u, 255u, }; static const uint8_t @@ -82266,7 +97028,8 @@ static wuffs_base__status wuffs_webp__decoder__decode_huffman_groups( wuffs_webp__decoder* self, wuffs_base__io_buffer* a_src, - uint32_t a_n_huffman_groups); + uint32_t a_n_huffman_groups, + uint32_t a_n_bitstream_groups); WUFFS_BASE__GENERATED_C_CODE static wuffs_base__status @@ -82297,7 +97060,7 @@ wuffs_webp__decoder__build_code_lengths_huffman_nodes( WUFFS_BASE__GENERATED_C_CODE static wuffs_base__status -wuffs_webp__decoder__build_huffman_nodes( +wuffs_webp__decoder__build_huffman_table( wuffs_webp__decoder* self, uint32_t a_hg, uint32_t a_ht); @@ -82308,6 +97071,17 @@ wuffs_webp__decoder__build_code_lengths( wuffs_webp__decoder* self, wuffs_base__io_buffer* a_src); +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__status +wuffs_webp__decoder__decode_pixels_fast( + wuffs_webp__decoder* self, + wuffs_base__slice_u8 a_dst, + wuffs_base__io_buffer* a_src, + uint32_t a_width, + uint32_t a_height, + wuffs_base__slice_u8 a_tile_data, + uint32_t a_tile_size_log2); + 
WUFFS_BASE__GENERATED_C_CODE static wuffs_base__status wuffs_webp__decoder__decode_pixels_slow( @@ -82326,6 +97100,13 @@ wuffs_webp__decoder__apply_transform_predictor( wuffs_base__slice_u8 a_pix, wuffs_base__slice_u8 a_tile_data); +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_webp__decoder__apply_transform_predictor__choosy_default( + wuffs_webp__decoder* self, + wuffs_base__slice_u8 a_pix, + wuffs_base__slice_u8 a_tile_data); + WUFFS_BASE__GENERATED_C_CODE static uint32_t wuffs_webp__decoder__absolute_difference( @@ -82356,18 +97137,57 @@ wuffs_webp__decoder__apply_transform_cross_color( wuffs_base__slice_u8 a_pix, wuffs_base__slice_u8 a_tile_data); +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_webp__decoder__apply_transform_cross_color__choosy_default( + wuffs_webp__decoder* self, + wuffs_base__slice_u8 a_pix, + wuffs_base__slice_u8 a_tile_data); + WUFFS_BASE__GENERATED_C_CODE static wuffs_base__empty_struct wuffs_webp__decoder__apply_transform_subtract_green( wuffs_webp__decoder* self, wuffs_base__slice_u8 a_pix); +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_webp__decoder__apply_transform_subtract_green__choosy_default( + wuffs_webp__decoder* self, + wuffs_base__slice_u8 a_pix); + WUFFS_BASE__GENERATED_C_CODE static wuffs_base__empty_struct wuffs_webp__decoder__apply_transform_color_indexing( wuffs_webp__decoder* self, wuffs_base__slice_u8 a_pix); +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_webp__decoder__apply_transform_subtract_green_x86_avx2( + wuffs_webp__decoder* self, + wuffs_base__slice_u8 a_pix); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_webp__decoder__apply_transform_cross_color_x86_avx2( + wuffs_webp__decoder* self, + wuffs_base__slice_u8 a_pix, + 
wuffs_base__slice_u8 a_tile_data); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_webp__decoder__apply_transform_predictor_x86_avx2( + wuffs_webp__decoder* self, + wuffs_base__slice_u8 a_pix, + wuffs_base__slice_u8 a_tile_data); +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) + WUFFS_BASE__GENERATED_C_CODE static wuffs_base__status wuffs_webp__decoder__do_decode_image_config( @@ -82395,6 +97215,37 @@ wuffs_webp__decoder__do_decode_frame_config( wuffs_base__frame_config* a_dst, wuffs_base__io_buffer* a_src); +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__status +wuffs_webp__decoder__do_decode_frame_vp8x( + wuffs_webp__decoder* self, + wuffs_base__pixel_buffer* a_dst, + wuffs_base__io_buffer* a_src, + wuffs_base__pixel_blend a_blend, + wuffs_base__slice_u8 a_workbuf, + wuffs_base__decode_frame_options* a_opts); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_webp__decoder__apply_alpha_filter_horizontal( + wuffs_webp__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_alpha_offset); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_webp__decoder__apply_alpha_filter_vertical( + wuffs_webp__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_alpha_offset); + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_webp__decoder__apply_alpha_filter_gradient( + wuffs_webp__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_alpha_offset); + WUFFS_BASE__GENERATED_C_CODE static wuffs_base__status wuffs_webp__decoder__do_decode_frame( @@ -82524,6 +97375,10 @@ wuffs_webp__decoder__initialize( } } + self->private_impl.choosy_apply_transform_predictor = &wuffs_webp__decoder__apply_transform_predictor__choosy_default; + self->private_impl.choosy_apply_transform_cross_color = 
&wuffs_webp__decoder__apply_transform_cross_color__choosy_default; + self->private_impl.choosy_apply_transform_subtract_green = &wuffs_webp__decoder__apply_transform_subtract_green__choosy_default; + { wuffs_base__status z = wuffs_vp8__decoder__initialize( &self->private_data.f_vp8, sizeof(self->private_data.f_vp8), WUFFS_VERSION, options); @@ -82568,32 +97423,104 @@ static wuffs_base__status wuffs_webp__decoder__decode_huffman_groups( wuffs_webp__decoder* self, wuffs_base__io_buffer* a_src, - uint32_t a_n_huffman_groups) { + uint32_t a_n_huffman_groups, + uint32_t a_n_bitstream_groups) { wuffs_base__status status = wuffs_base__make_status(NULL); uint32_t v_hg = 0; uint32_t v_ht = 0; + uint32_t v_target = 0; + uint32_t v_sorted_idx = 0; + uint32_t v_raw_hg = 0; + uint32_t v_red_entry = 0; + uint32_t v_blue_entry = 0; + uint32_t v_alpha_entry = 0; + uint32_t v_green_entry = 0; uint32_t coro_susp_point = self->private_impl.p_decode_huffman_groups; if (coro_susp_point) { v_hg = self->private_data.s_decode_huffman_groups.v_hg; v_ht = self->private_data.s_decode_huffman_groups.v_ht; + v_target = self->private_data.s_decode_huffman_groups.v_target; + v_sorted_idx = self->private_data.s_decode_huffman_groups.v_sorted_idx; + v_raw_hg = self->private_data.s_decode_huffman_groups.v_raw_hg; } switch (coro_susp_point) { WUFFS_BASE__COROUTINE_SUSPENSION_POINT_0; - v_hg = 0u; - while (v_hg < a_n_huffman_groups) { - v_ht = 0u; - while (v_ht < 5u) { - WUFFS_BASE__COROUTINE_SUSPENSION_POINT(1); - status = wuffs_webp__decoder__decode_huffman_tree(self, a_src, v_hg, v_ht); - if (status.repr) { - goto suspend; + if (a_n_bitstream_groups <= a_n_huffman_groups) { + v_hg = 0u; + while (v_hg < a_n_huffman_groups) { + self->private_impl.f_ht_next_top = 1280u; + v_ht = 0u; + while (v_ht < 5u) { + WUFFS_BASE__COROUTINE_SUSPENSION_POINT(1); + status = wuffs_webp__decoder__decode_huffman_tree(self, a_src, v_hg, v_ht); + if (status.repr) { + goto suspend; + } + v_ht += 1u; + } + v_red_entry = 
self->private_data.f_huffman_tables[v_hg][256u]; + v_blue_entry = self->private_data.f_huffman_tables[v_hg][512u]; + v_alpha_entry = self->private_data.f_huffman_tables[v_hg][768u]; + if (((v_red_entry & 2147483663u) == 2147483648u) && ((v_blue_entry & 2147483663u) == 2147483648u) && ((v_alpha_entry & 2147483663u) == 2147483648u)) { + self->private_data.f_hg_literal_arb[v_hg] = ((((v_alpha_entry >> 8u) & 255u) << 24u) | (((v_red_entry >> 8u) & 255u) << 16u) | ((v_blue_entry >> 8u) & 255u)); + v_green_entry = self->private_data.f_huffman_tables[v_hg][0u]; + if (((v_green_entry & 2147483663u) == 2147483648u) && (((v_green_entry >> 8u) & 65535u) < 256u)) { + self->private_data.f_hg_trivial[v_hg] = 2u; + self->private_data.f_hg_literal_arb[v_hg] |= (((v_green_entry >> 8u) & 255u) << 8u); + } else { + self->private_data.f_hg_trivial[v_hg] = 1u; + } + } else { + self->private_data.f_hg_trivial[v_hg] = 0u; } - v_ht += 1u; + v_hg += 1u; + } + } else { + v_sorted_idx = 0u; + v_raw_hg = 0u; + while (v_raw_hg < a_n_bitstream_groups) { + if ((v_sorted_idx < self->private_impl.f_hg_n_sorted) && (v_sorted_idx < 1024u)) { + if (((uint32_t)(self->private_data.f_hg_sorted[v_sorted_idx])) == v_raw_hg) { + v_target = v_sorted_idx; + v_sorted_idx += 1u; + } else { + v_target = 1024u; + } + } else { + v_target = 1024u; + } + self->private_impl.f_ht_next_top = 1280u; + v_ht = 0u; + while (v_ht < 5u) { + WUFFS_BASE__COROUTINE_SUSPENSION_POINT(2); + status = wuffs_webp__decoder__decode_huffman_tree(self, a_src, v_target, v_ht); + if (status.repr) { + goto suspend; + } + v_ht += 1u; + } + if (v_target < 1024u) { + v_red_entry = self->private_data.f_huffman_tables[v_target][256u]; + v_blue_entry = self->private_data.f_huffman_tables[v_target][512u]; + v_alpha_entry = self->private_data.f_huffman_tables[v_target][768u]; + if (((v_red_entry & 2147483663u) == 2147483648u) && ((v_blue_entry & 2147483663u) == 2147483648u) && ((v_alpha_entry & 2147483663u) == 2147483648u)) { + 
self->private_data.f_hg_literal_arb[v_target] = ((((v_alpha_entry >> 8u) & 255u) << 24u) | (((v_red_entry >> 8u) & 255u) << 16u) | ((v_blue_entry >> 8u) & 255u)); + v_green_entry = self->private_data.f_huffman_tables[v_target][0u]; + if (((v_green_entry & 2147483663u) == 2147483648u) && (((v_green_entry >> 8u) & 65535u) < 256u)) { + self->private_data.f_hg_trivial[v_target] = 2u; + self->private_data.f_hg_literal_arb[v_target] |= (((v_green_entry >> 8u) & 255u) << 8u); + } else { + self->private_data.f_hg_trivial[v_target] = 1u; + } + } else { + self->private_data.f_hg_trivial[v_target] = 0u; + } + } + v_raw_hg += 1u; } - v_hg += 1u; } goto ok; @@ -82607,6 +97534,9 @@ wuffs_webp__decoder__decode_huffman_groups( self->private_impl.p_decode_huffman_groups = wuffs_base__status__is_suspension(&status) ? coro_susp_point : 0; self->private_data.s_decode_huffman_groups.v_hg = v_hg; self->private_data.s_decode_huffman_groups.v_ht = v_ht; + self->private_data.s_decode_huffman_groups.v_target = v_target; + self->private_data.s_decode_huffman_groups.v_sorted_idx = v_sorted_idx; + self->private_data.s_decode_huffman_groups.v_raw_hg = v_raw_hg; goto exit; exit: @@ -82714,7 +97644,7 @@ wuffs_webp__decoder__decode_huffman_tree( if (status.repr) { goto suspend; } - v_status = wuffs_webp__decoder__build_huffman_nodes(self, a_hg, a_ht); + v_status = wuffs_webp__decoder__build_huffman_table(self, a_hg, a_ht); if ( ! 
wuffs_base__status__is_ok(&v_status)) { status = v_status; if (wuffs_base__status__is_error(&status)) { @@ -82762,6 +97692,7 @@ wuffs_webp__decoder__decode_huffman_tree_simple( uint32_t v_symbol0 = 0; uint32_t v_symbol1 = 0; uint32_t v_base_offset = 0; + uint32_t v_i = 0; const uint8_t* iop_a_src = NULL; const uint8_t* io0_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL; @@ -82825,7 +97756,8 @@ wuffs_webp__decoder__decode_huffman_tree_simple( v_symbol0 = (self->private_impl.f_bits & ((((uint32_t)(1u)) << v_first_symbol_n_bits) - 1u)); self->private_impl.f_bits >>= v_first_symbol_n_bits; self->private_impl.f_n_bits -= v_first_symbol_n_bits; - v_base_offset = ((uint32_t)(WUFFS_WEBP__HUFFMAN_TABLE_BASE_OFFSETS[a_ht])); + v_base_offset = (a_ht * 256u); + self->private_data.f_huffman_table_base_offsets[a_hg][a_ht] = ((uint16_t)(v_base_offset)); if (v_use_second_symbol != 0u) { if (self->private_impl.f_n_bits < 8u) { { @@ -82847,11 +97779,18 @@ wuffs_webp__decoder__decode_huffman_tree_simple( v_symbol1 = (self->private_impl.f_bits & 255u); self->private_impl.f_bits >>= 8u; self->private_impl.f_n_bits -= 8u; - self->private_data.f_huffman_nodes[a_hg][(v_base_offset + 0u)] = ((uint16_t)((v_base_offset + 1u))); - self->private_data.f_huffman_nodes[a_hg][(v_base_offset + 1u)] = ((uint16_t)((v_symbol0 | 32768u))); - self->private_data.f_huffman_nodes[a_hg][(v_base_offset + 2u)] = ((uint16_t)((v_symbol1 | 32768u))); + v_i = 0u; + while (v_i < 256u) { + self->private_data.f_huffman_tables[a_hg][(((uint32_t)(v_base_offset + v_i)) & 4095u)] = (2147483648u | (v_symbol0 << 8u) | 1u); + self->private_data.f_huffman_tables[a_hg][(((uint32_t)(((uint32_t)(v_base_offset + v_i)) + 1u)) & 4095u)] = (2147483648u | (v_symbol1 << 8u) | 1u); + v_i += 2u; + } } else { - self->private_data.f_huffman_nodes[a_hg][v_base_offset] = ((uint16_t)((v_symbol0 | 32768u))); + v_i = 0u; + while (v_i < 256u) { + self->private_data.f_huffman_tables[a_hg][(((uint32_t)(v_base_offset + v_i)) & 4095u)] = 
(2147483648u | (v_symbol0 << 8u) | 0u); + v_i += 1u; + } } goto ok; @@ -83066,94 +98005,237 @@ wuffs_webp__decoder__build_code_lengths_huffman_nodes( return wuffs_base__make_status(NULL); } -// -------- func webp.decoder.build_huffman_nodes +// -------- func webp.decoder.build_huffman_table WUFFS_BASE__GENERATED_C_CODE static wuffs_base__status -wuffs_webp__decoder__build_huffman_nodes( +wuffs_webp__decoder__build_huffman_table( wuffs_webp__decoder* self, uint32_t a_hg, uint32_t a_ht) { uint32_t v_base_offset = 0; - uint32_t v_code_bits = 0; - uint32_t v_code_len = 0; - uint32_t v_symbol = 0; - uint32_t v_histogram[16] = {0}; + uint32_t v_i = 0; + uint32_t v_n_symbols = 0; + uint32_t v_count = 0; uint32_t v_n_used_symbols = 0; uint32_t v_last_used_symbol = 0; - uint32_t v_subscription_weight = 0; - uint32_t v_subscription_total = 0; - uint32_t v_curr_code = 0; - uint32_t v_next_codes[17] = {0}; - uint32_t v_n_branches = 0; - uint32_t v_h = 0; - uint32_t v_children = 0; - uint16_t v_node = 0; + uint32_t v_remaining = 0; + uint32_t v_min_cl = 0; + uint32_t v_max_cl = 0; + uint32_t v_initial_high_bits = 0; + uint32_t v_prev_cl = 0; + uint32_t v_prev_redirect_key = 0; + uint32_t v_top = 0; + uint32_t v_next_top = 0; + uint32_t v_code = 0; + uint32_t v_key = 0; + uint32_t v_value = 0; + uint32_t v_cl = 0; + uint32_t v_redirect_key = 0; + uint32_t v_j = 0; + uint32_t v_reversed_key = 0; + uint32_t v_symbol = 0; + uint32_t v_high_bits = 0; + uint32_t v_delta = 0; + uint16_t v_counts[16] = {0}; + uint16_t v_offsets[16] = {0}; + uint16_t v_symbols[2328] = {0}; - v_base_offset = ((uint32_t)(WUFFS_WEBP__HUFFMAN_TABLE_BASE_OFFSETS[a_ht])); - v_symbol = 0u; - while (v_symbol < self->private_impl.f_ht_n_symbols) { - v_code_len = ((uint32_t)(((uint16_t)(self->private_data.f_code_lengths[v_symbol] & 15u)))); - if (v_code_len != 0u) { - v_histogram[v_code_len] += 1u; + v_base_offset = (a_ht * 256u); + self->private_data.f_huffman_table_base_offsets[a_hg][a_ht] = 
((uint16_t)(v_base_offset)); + v_i = 0u; + while (v_i < self->private_impl.f_ht_n_symbols) { + if (v_counts[((uint16_t)(self->private_data.f_code_lengths[v_i] & 15u))] >= 2328u) { + return wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_huffman_decoder_state); + } +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + v_counts[((uint16_t)(self->private_data.f_code_lengths[v_i] & 15u))] += 1u; +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + if (((uint16_t)(self->private_data.f_code_lengths[v_i] & 15u)) != 0u) { v_n_used_symbols += 1u; - v_last_used_symbol = v_symbol; + v_last_used_symbol = v_i; } - v_symbol += 1u; + v_i += 1u; } if (v_n_used_symbols < 1u) { return wuffs_base__make_status(wuffs_webp__error__bad_huffman_code); } else if (v_n_used_symbols == 1u) { - self->private_data.f_huffman_nodes[a_hg][v_base_offset] = ((uint16_t)((v_last_used_symbol | 32768u))); + v_i = 0u; + while (v_i < 256u) { + self->private_data.f_huffman_tables[a_hg][(((uint32_t)(v_base_offset + v_i)) & 4095u)] = (2147483648u | (v_last_used_symbol << 8u)); + v_i += 1u; + } return wuffs_base__make_status(NULL); } - v_subscription_weight = 16384u; - v_code_len = 1u; + v_remaining = 1u; + v_i = 1u; + while (v_i <= 15u) { + if (v_remaining > 1073741824u) { + return wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_huffman_decoder_state); + } + v_remaining <<= 1u; + if (v_remaining < ((uint32_t)(v_counts[v_i]))) { + return wuffs_base__make_status(wuffs_webp__error__bad_huffman_code_over_subscribed); + } + v_remaining -= ((uint32_t)(v_counts[v_i])); + v_i += 1u; + } + if (v_remaining != 0u) { + return wuffs_base__make_status(wuffs_webp__error__bad_huffman_code_under_subscribed); + } + v_i = 1u; + while (v_i <= 15u) { + v_offsets[v_i] = ((uint16_t)(v_n_symbols)); + v_count = ((uint32_t)(v_counts[v_i])); + if (v_n_symbols > (2328u - v_count)) { + return 
wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_huffman_decoder_state); + } + v_n_symbols = (v_n_symbols + v_count); + v_i += 1u; + } + if (v_n_symbols > 2328u) { + return wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_huffman_decoder_state); + } + v_i = 0u; + while (v_i < self->private_impl.f_ht_n_symbols) { + if (((uint16_t)(self->private_data.f_code_lengths[v_i] & 15u)) != 0u) { + if (v_offsets[((uint16_t)(self->private_data.f_code_lengths[v_i] & 15u))] >= 2328u) { + return wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_huffman_decoder_state); + } + v_symbols[v_offsets[((uint16_t)(self->private_data.f_code_lengths[v_i] & 15u))]] = ((uint16_t)(v_i)); +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + v_offsets[((uint16_t)(self->private_data.f_code_lengths[v_i] & 15u))] += 1u; +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + } + v_i += 1u; + } + v_min_cl = 1u; while (true) { - v_curr_code = ((uint32_t)(((uint32_t)(v_curr_code + v_histogram[v_code_len])) << 1u)); - v_next_codes[(v_code_len + 1u)] = v_curr_code; - v_subscription_total += ((uint32_t)(v_subscription_weight * v_histogram[v_code_len])); - v_subscription_weight >>= 1u; - if (v_code_len >= 15u) { + if (v_counts[v_min_cl] != 0u) { break; } - v_code_len += 1u; + if (v_min_cl >= 9u) { + return wuffs_base__make_status(wuffs_webp__error__bad_huffman_code); + } + v_min_cl += 1u; } - if (v_subscription_total > 32768u) { - return wuffs_base__make_status(wuffs_webp__error__bad_huffman_code_over_subscribed); - } else if (v_subscription_total < 32768u) { - return wuffs_base__make_status(wuffs_webp__error__bad_huffman_code_under_subscribed); + v_max_cl = 15u; + while (true) { + if (v_counts[v_max_cl] != 0u) { + break; + } + if (v_max_cl <= 1u) { + return wuffs_base__make_status(wuffs_webp__error__bad_huffman_code); + } + v_max_cl -= 1u; } - 
self->private_data.f_huffman_nodes[a_hg][v_base_offset] = 0u; - v_symbol = 0u; - while (v_symbol < self->private_impl.f_ht_n_symbols) { - v_code_len = ((uint32_t)(((uint16_t)(self->private_data.f_code_lengths[v_symbol] & 15u)))); - if (v_code_len != 0u) { - v_code_bits = v_next_codes[v_code_len]; - v_next_codes[v_code_len] += 1u; - v_code_bits <<= (32u - v_code_len); - v_h = v_base_offset; - while (v_code_len > 0u) { - v_node = self->private_data.f_huffman_nodes[a_hg][v_h]; - if (v_node == 0u) { - v_children = ((uint32_t)(v_base_offset + ((uint32_t)(1u + ((uint32_t)(2u * v_n_branches)))))); - v_children = wuffs_base__u32__min(v_children, 6265u); - self->private_data.f_huffman_nodes[a_hg][v_h] = ((uint16_t)(v_children)); - self->private_data.f_huffman_nodes[a_hg][(v_children + 0u)] = 0u; - self->private_data.f_huffman_nodes[a_hg][(v_children + 1u)] = 0u; - v_h = (v_children + (v_code_bits >> 31u)); - v_n_branches += 1u; - } else { - v_children = ((uint32_t)(v_node)); - v_h = (wuffs_base__u32__min(v_children, 6265u) + (v_code_bits >> 31u)); + v_initial_high_bits = 256u; + if (((uint32_t)(v_symbols[0u])) >= self->private_impl.f_ht_n_symbols) { + return wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_huffman_decoder_state); + } + v_prev_cl = ((uint32_t)(((uint16_t)(self->private_data.f_code_lengths[((uint32_t)(v_symbols[0u]))] & 15u)))); + v_prev_redirect_key = 4294967295u; + v_top = v_base_offset; + v_next_top = self->private_impl.f_ht_next_top; + v_code = 0u; + v_key = 0u; + v_value = 0u; + v_i = 0u; + while (true) { + if (((uint32_t)(v_symbols[v_i])) >= self->private_impl.f_ht_n_symbols) { + return wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_huffman_decoder_state); + } + v_cl = ((uint32_t)(((uint16_t)(self->private_data.f_code_lengths[((uint32_t)(v_symbols[v_i]))] & 15u)))); + if (v_cl > v_prev_cl) { + v_code <<= (v_cl - v_prev_cl); + if (v_code >= 32768u) { + return 
wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_huffman_decoder_state); + } + } + v_prev_cl = v_cl; + v_key = v_code; + if (v_cl > 8u) { + v_cl -= 8u; + v_redirect_key = ((v_key >> v_cl) & 255u); + v_key = ((v_key) & WUFFS_PRIVATE_IMPL__LOW_BITS_MASK__U32(v_cl)); + if (v_prev_redirect_key != ((uint32_t)(v_redirect_key))) { + v_prev_redirect_key = ((uint32_t)(v_redirect_key)); + v_remaining = (((uint32_t)(1u)) << v_cl); + v_j = v_prev_cl; + while (v_j <= 15u) { + if (v_remaining <= ((uint32_t)(v_counts[v_j]))) { + break; + } + v_remaining -= ((uint32_t)(v_counts[v_j])); + if (v_remaining > 1073741824u) { + return wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_huffman_decoder_state); + } + v_remaining <<= 1u; + v_j += 1u; } - v_code_bits <<= 1u; - v_code_len -= 1u; + if ((v_j <= 8u) || (15u < v_j)) { + return wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_huffman_decoder_state); + } + v_j -= 8u; + v_initial_high_bits = (((uint32_t)(1u)) << v_j); + v_top = v_next_top; + if ((v_top + (((uint32_t)(1u)) << v_j)) > 4096u) { + return wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_huffman_decoder_state); + } + v_next_top = (v_top + (((uint32_t)(1u)) << v_j)); + v_redirect_key = ((uint32_t)(WUFFS_WEBP__REVERSE8[v_redirect_key])); + if ((v_base_offset + v_redirect_key) >= 4096u) { + return wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_huffman_decoder_state); + } + self->private_data.f_huffman_tables[a_hg][(v_base_offset + v_redirect_key)] = (268435464u | (v_top << 8u) | (v_j << 4u)); } - self->private_data.f_huffman_nodes[a_hg][v_h] = ((uint16_t)((v_symbol | 32768u))); } - v_symbol += 1u; + if ((v_cl > 8u) || (v_key >= 256u) || (v_counts[v_prev_cl] <= 0u)) { + return wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_huffman_decoder_state); + } +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored 
"-Wconversion" +#endif + v_counts[v_prev_cl] -= 1u; +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + v_reversed_key = ((((uint32_t)(WUFFS_WEBP__REVERSE8[(v_key & 255u)])) >> (8u - v_cl)) & 255u); + if (((uint32_t)(v_symbols[v_i])) >= 2328u) { + return wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_huffman_decoder_state); + } + v_symbol = ((uint32_t)(v_symbols[v_i])); + v_value = (2147483648u | (v_symbol << 8u) | v_cl); + v_high_bits = v_initial_high_bits; + v_delta = (((uint32_t)(1u)) << v_cl); + while (v_high_bits >= v_delta) { + v_high_bits -= v_delta; + if ((v_top + ((v_high_bits | v_reversed_key) & 255u)) >= 4096u) { + return wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_huffman_decoder_state); + } + self->private_data.f_huffman_tables[a_hg][(v_top + ((v_high_bits | v_reversed_key) & 255u))] = v_value; + } + v_i += 1u; + if (v_i >= v_n_symbols) { + break; + } + v_code += 1u; + if (v_code >= 32768u) { + return wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_huffman_decoder_state); + } } + self->private_impl.f_ht_next_top = v_next_top; return wuffs_base__make_status(NULL); } @@ -83376,6 +98458,340 @@ wuffs_webp__decoder__build_code_lengths( return status; } +// -------- func webp.decoder.decode_pixels_fast + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__status +wuffs_webp__decoder__decode_pixels_fast( + wuffs_webp__decoder* self, + wuffs_base__slice_u8 a_dst, + wuffs_base__io_buffer* a_src, + uint32_t a_width, + uint32_t a_height, + wuffs_base__slice_u8 a_tile_data, + uint32_t a_tile_size_log2) { + wuffs_base__status status = wuffs_base__make_status(NULL); + + uint64_t v_bits = 0; + uint32_t v_n_bits = 0; + uint64_t v_p = 0; + uint64_t v_p_max = 0; + uint32_t v_tile_size_log2 = 0; + uint32_t v_width_in_tiles = 0; + uint32_t v_x = 0; + uint32_t v_y = 0; + uint32_t v_i = 0; + uint32_t v_hg = 0; + uint8_t v_trivial = 0; + uint32_t v_table_entry = 0; + uint32_t 
v_table_entry_n_bits = 0; + uint32_t v_redir_top = 0; + uint32_t v_redir_mask = 0; + uint32_t v_pixel_g = 0; + uint32_t v_color = 0; + uint32_t v_back_ref_len_n_bits = 0; + uint32_t v_back_ref_len_minus_1 = 0; + uint32_t v_back_ref_dist_n_bits = 0; + uint32_t v_back_ref_dist_sym = 0; + uint32_t v_back_ref_dist_premap_minus_1 = 0; + uint32_t v_back_ref_dist_minus_1 = 0; + uint32_t v_dm = 0; + uint32_t v_dx = 0; + uint32_t v_dy = 0; + uint64_t v_p_end = 0; + uint64_t v_dist4 = 0; + uint64_t v_q = 0; + uint32_t v_tmask = 0; + uint32_t v_tile_x_end = 0; + uint32_t v_color_cache_shift = 0; + wuffs_base__slice_u8 v_color_cache_pixels = {0}; + + const uint8_t* iop_a_src = NULL; + const uint8_t* io0_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL; + const uint8_t* io1_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL; + const uint8_t* io2_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL; + if (a_src && a_src->data.ptr) { + io0_a_src = a_src->data.ptr; + io1_a_src = io0_a_src + a_src->meta.ri; + iop_a_src = io1_a_src; + io2_a_src = io0_a_src + a_src->meta.wi; + } + + v_bits = ((uint64_t)(self->private_impl.f_bits)); + v_n_bits = self->private_impl.f_n_bits; + v_p = self->private_impl.f_pix_p; + v_x = self->private_impl.f_pix_x; + v_y = self->private_impl.f_pix_y; + v_p_max = ((uint64_t)((4u * a_width * a_height))); + if (((uint64_t)(a_dst.len)) < v_p_max) { + status = wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_dst_buffer); + goto exit; + } + v_color_cache_shift = ((32u - self->private_impl.f_color_cache_bits) & 31u); + if (a_tile_size_log2 != 0u) { + v_tile_size_log2 = a_tile_size_log2; + v_width_in_tiles = ((a_width + ((((uint32_t)(1u)) << v_tile_size_log2) - 1u)) >> v_tile_size_log2); + } else { + v_tile_size_log2 = 31u; + v_width_in_tiles = 1u; + } + v_tmask = ((((uint32_t)(1u)) << v_tile_size_log2) - 1u); + while ((v_p < v_p_max) && (((uint64_t)(io2_a_src - iop_a_src)) >= 16u)) { + v_i = ((uint32_t)(((uint32_t)(((uint32_t)(((uint32_t)((v_y >> 
v_tile_size_log2) * v_width_in_tiles)) + (v_x >> v_tile_size_log2))) * 4u)) + 1u)); + if (((uint64_t)(v_i)) < ((uint64_t)(a_tile_data.len))) { + v_hg = ((uint32_t)(a_tile_data.ptr[((uint64_t)(v_i))])); + if ((((uint64_t)(v_i)) + 1u) < ((uint64_t)(a_tile_data.len))) { + v_hg = (((((uint32_t)(a_tile_data.ptr[(((uint64_t)(v_i)) + 1u)])) << 8u) | v_hg) & 1023u); + } + } + v_trivial = self->private_data.f_hg_trivial[v_hg]; + v_tile_x_end = ((uint32_t)((v_x | v_tmask) + 1u)); + if (v_tile_x_end > a_width) { + v_tile_x_end = a_width; + } + while ((v_x < v_tile_x_end) && (v_p < v_p_max) && (((uint64_t)(io2_a_src - iop_a_src)) >= 16u)) { + if (v_trivial >= 2u) { + v_color = self->private_data.f_hg_literal_arb[v_hg]; + } else { + v_bits |= ((uint64_t)(wuffs_base__peek_u64le__no_bounds_check(iop_a_src) << (v_n_bits & 63u))); + iop_a_src += ((63u - (v_n_bits & 63u)) >> 3u); + v_n_bits |= 56u; + v_table_entry = self->private_data.f_huffman_tables[v_hg][((uint32_t)((v_bits & 255u)))]; + v_table_entry_n_bits = (v_table_entry & 15u); + v_bits >>= v_table_entry_n_bits; + v_n_bits -= v_table_entry_n_bits; + if ((v_table_entry >> 31u) == 0u) { + v_redir_top = ((v_table_entry >> 8u) & 65535u); + v_redir_mask = ((((uint32_t)(1u)) << ((v_table_entry >> 4u) & 15u)) - 1u); + v_table_entry = self->private_data.f_huffman_tables[v_hg][((v_redir_top + (((uint32_t)(v_bits)) & v_redir_mask)) & 4095u)]; + v_table_entry_n_bits = (v_table_entry & 15u); + v_bits >>= v_table_entry_n_bits; + v_n_bits -= v_table_entry_n_bits; + } + v_pixel_g = ((v_table_entry >> 8u) & 65535u); + if (v_pixel_g < 256u) { + if (v_trivial >= 1u) { + v_color = (self->private_data.f_hg_literal_arb[v_hg] | (v_pixel_g << 8u)); + } else { + v_color = (v_pixel_g << 8u); + v_table_entry = self->private_data.f_huffman_tables[v_hg][(256u + ((uint32_t)((v_bits & 255u))))]; + v_table_entry_n_bits = (v_table_entry & 15u); + v_bits >>= v_table_entry_n_bits; + v_n_bits -= v_table_entry_n_bits; + if ((v_table_entry >> 31u) == 0u) { + 
v_redir_top = ((v_table_entry >> 8u) & 65535u); + v_redir_mask = ((((uint32_t)(1u)) << ((v_table_entry >> 4u) & 15u)) - 1u); + v_table_entry = self->private_data.f_huffman_tables[v_hg][((v_redir_top + (((uint32_t)(v_bits)) & v_redir_mask)) & 4095u)]; + v_table_entry_n_bits = (v_table_entry & 15u); + v_bits >>= v_table_entry_n_bits; + v_n_bits -= v_table_entry_n_bits; + } + v_color |= (((uint32_t)(((v_table_entry >> 8u) & 255u))) << 16u); + if (v_n_bits < 30u) { + v_bits |= ((uint64_t)(wuffs_base__peek_u64le__no_bounds_check(iop_a_src) << (v_n_bits & 63u))); + iop_a_src += ((63u - (v_n_bits & 63u)) >> 3u); + v_n_bits |= 56u; + } + v_table_entry = self->private_data.f_huffman_tables[v_hg][(512u + ((uint32_t)((v_bits & 255u))))]; + v_table_entry_n_bits = (v_table_entry & 15u); + v_bits >>= v_table_entry_n_bits; + v_n_bits -= v_table_entry_n_bits; + if ((v_table_entry >> 31u) == 0u) { + v_redir_top = ((v_table_entry >> 8u) & 65535u); + v_redir_mask = ((((uint32_t)(1u)) << ((v_table_entry >> 4u) & 15u)) - 1u); + v_table_entry = self->private_data.f_huffman_tables[v_hg][((v_redir_top + (((uint32_t)(v_bits)) & v_redir_mask)) & 4095u)]; + v_table_entry_n_bits = (v_table_entry & 15u); + v_bits >>= v_table_entry_n_bits; + v_n_bits -= v_table_entry_n_bits; + } + v_color |= (((uint32_t)(((v_table_entry >> 8u) & 255u))) << 0u); + v_table_entry = self->private_data.f_huffman_tables[v_hg][(768u + ((uint32_t)((v_bits & 255u))))]; + v_table_entry_n_bits = (v_table_entry & 15u); + v_bits >>= v_table_entry_n_bits; + v_n_bits -= v_table_entry_n_bits; + if ((v_table_entry >> 31u) == 0u) { + v_redir_top = ((v_table_entry >> 8u) & 65535u); + v_redir_mask = ((((uint32_t)(1u)) << ((v_table_entry >> 4u) & 15u)) - 1u); + v_table_entry = self->private_data.f_huffman_tables[v_hg][((v_redir_top + (((uint32_t)(v_bits)) & v_redir_mask)) & 4095u)]; + v_table_entry_n_bits = (v_table_entry & 15u); + v_bits >>= v_table_entry_n_bits; + v_n_bits -= v_table_entry_n_bits; + } + v_color |= 
(((uint32_t)(((v_table_entry >> 8u) & 255u))) << 24u); + } + } else if (v_pixel_g < 280u) { + if (v_pixel_g < 260u) { + v_back_ref_len_minus_1 = (v_pixel_g - 256u); + } else { + v_back_ref_len_n_bits = ((v_pixel_g - 258u) >> 1u); + v_back_ref_len_minus_1 = ((((uint32_t)(2u)) + (v_pixel_g & 1u)) << v_back_ref_len_n_bits); + v_back_ref_len_minus_1 += (((uint32_t)(((v_bits) & WUFFS_PRIVATE_IMPL__LOW_BITS_MASK__U64(v_back_ref_len_n_bits)))) & 8191u); + v_bits >>= v_back_ref_len_n_bits; + v_n_bits -= v_back_ref_len_n_bits; + } + if (v_n_bits < 33u) { + v_bits |= ((uint64_t)(wuffs_base__peek_u64le__no_bounds_check(iop_a_src) << (v_n_bits & 63u))); + iop_a_src += ((63u - (v_n_bits & 63u)) >> 3u); + v_n_bits |= 56u; + } + v_table_entry = self->private_data.f_huffman_tables[v_hg][(1024u + ((uint32_t)((v_bits & 255u))))]; + v_table_entry_n_bits = (v_table_entry & 15u); + v_bits >>= v_table_entry_n_bits; + v_n_bits -= v_table_entry_n_bits; + if ((v_table_entry >> 31u) == 0u) { + v_redir_top = ((v_table_entry >> 8u) & 65535u); + v_redir_mask = ((((uint32_t)(1u)) << ((v_table_entry >> 4u) & 15u)) - 1u); + v_table_entry = self->private_data.f_huffman_tables[v_hg][((v_redir_top + (((uint32_t)(v_bits)) & v_redir_mask)) & 4095u)]; + v_table_entry_n_bits = (v_table_entry & 15u); + v_bits >>= v_table_entry_n_bits; + v_n_bits -= v_table_entry_n_bits; + } + v_back_ref_dist_sym = ((v_table_entry >> 8u) & 65535u); + if (v_back_ref_dist_sym < 4u) { + v_back_ref_dist_premap_minus_1 = v_back_ref_dist_sym; + } else if (v_back_ref_dist_sym < 40u) { + v_back_ref_dist_n_bits = ((v_back_ref_dist_sym - 2u) >> 1u); + v_back_ref_dist_premap_minus_1 = ((((uint32_t)(2u)) + (v_back_ref_dist_sym & 1u)) << v_back_ref_dist_n_bits); + v_back_ref_dist_premap_minus_1 += (((uint32_t)(((v_bits) & WUFFS_PRIVATE_IMPL__LOW_BITS_MASK__U64(v_back_ref_dist_n_bits)))) & 1048575u); + v_bits >>= v_back_ref_dist_n_bits; + v_n_bits -= v_back_ref_dist_n_bits; + } + if (v_back_ref_dist_premap_minus_1 >= 120u) { + 
v_back_ref_dist_minus_1 = (v_back_ref_dist_premap_minus_1 - 120u); + } else { + v_dm = ((uint32_t)(WUFFS_WEBP__DISTANCE_MAP[v_back_ref_dist_premap_minus_1])); + v_dy = (v_dm >> 4u); + v_dx = ((uint32_t)(7u - (v_dm & 15u))); + v_back_ref_dist_minus_1 = ((uint32_t)((a_width * v_dy) + v_dx)); + } + v_p_end = (v_p + ((uint64_t)(((v_back_ref_len_minus_1 + 1u) * 4u)))); + v_dist4 = ((((uint64_t)(v_back_ref_dist_minus_1)) * 4u) + 4u); + if ((v_p_end > v_p_max) || (v_p_end > ((uint64_t)(a_dst.len))) || (v_p < v_dist4)) { + status = wuffs_base__make_status(wuffs_webp__error__bad_back_reference); + goto exit; + } + v_q = (v_p - v_dist4); + if (v_p > v_p_end) { + status = wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_dst_buffer); + goto exit; + } + if (v_back_ref_dist_minus_1 >= v_back_ref_len_minus_1) { + if ((v_q > v_p) || (v_p > ((uint64_t)(a_dst.len)))) { + status = wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_dst_buffer); + goto exit; + } + wuffs_private_impl__slice_u8__copy_from_slice(wuffs_base__slice_u8__subslice_ij(a_dst, v_p, v_p_end), wuffs_base__slice_u8__subslice_ij(a_dst, v_q, v_p)); + if (v_color_cache_shift > 0u) { + if (v_p_end > ((uint64_t)(a_dst.len))) { + status = wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_dst_buffer); + goto exit; + } + { + wuffs_base__slice_u8 i_slice_color_cache_pixels = wuffs_base__slice_u8__subslice_ij(a_dst, v_p, v_p_end); + v_color_cache_pixels.ptr = i_slice_color_cache_pixels.ptr; + v_color_cache_pixels.len = 4; + const uint8_t* i_end0_color_cache_pixels = wuffs_private_impl__ptr_u8_plus_len(v_color_cache_pixels.ptr, (((i_slice_color_cache_pixels.len - (size_t)(v_color_cache_pixels.ptr - i_slice_color_cache_pixels.ptr)) / 16) * 16)); + while (v_color_cache_pixels.ptr < i_end0_color_cache_pixels) { + v_color = wuffs_base__peek_u32le__no_bounds_check(v_color_cache_pixels.ptr); + self->private_data.f_color_cache[((((uint32_t)(v_color * 506832829u)) >> 
v_color_cache_shift) & 2047u)] = v_color; + v_color_cache_pixels.ptr += 4; + v_color = wuffs_base__peek_u32le__no_bounds_check(v_color_cache_pixels.ptr); + self->private_data.f_color_cache[((((uint32_t)(v_color * 506832829u)) >> v_color_cache_shift) & 2047u)] = v_color; + v_color_cache_pixels.ptr += 4; + v_color = wuffs_base__peek_u32le__no_bounds_check(v_color_cache_pixels.ptr); + self->private_data.f_color_cache[((((uint32_t)(v_color * 506832829u)) >> v_color_cache_shift) & 2047u)] = v_color; + v_color_cache_pixels.ptr += 4; + v_color = wuffs_base__peek_u32le__no_bounds_check(v_color_cache_pixels.ptr); + self->private_data.f_color_cache[((((uint32_t)(v_color * 506832829u)) >> v_color_cache_shift) & 2047u)] = v_color; + v_color_cache_pixels.ptr += 4; + } + v_color_cache_pixels.len = 4; + const uint8_t* i_end1_color_cache_pixels = wuffs_private_impl__ptr_u8_plus_len(v_color_cache_pixels.ptr, (((i_slice_color_cache_pixels.len - (size_t)(v_color_cache_pixels.ptr - i_slice_color_cache_pixels.ptr)) / 4) * 4)); + while (v_color_cache_pixels.ptr < i_end1_color_cache_pixels) { + v_color = wuffs_base__peek_u32le__no_bounds_check(v_color_cache_pixels.ptr); + self->private_data.f_color_cache[((((uint32_t)(v_color * 506832829u)) >> v_color_cache_shift) & 2047u)] = v_color; + v_color_cache_pixels.ptr += 4; + } + v_color_cache_pixels.len = 0; + } + } + v_p = v_p_end; + } else { + while ((v_q < v_p) && (v_p < v_p_end)) { + if (((v_p + 4u) <= v_p_end) && ((v_q + 4u) <= v_p)) { + v_color = wuffs_base__peek_u32le__no_bounds_check(wuffs_base__slice_u8__subslice_ij(a_dst, v_q, (v_q + 4u)).ptr); + wuffs_base__poke_u32le__no_bounds_check(wuffs_base__slice_u8__subslice_ij(a_dst, v_p, (v_p + 4u)).ptr, v_color); + if (v_color_cache_shift > 0u) { + self->private_data.f_color_cache[((((uint32_t)(v_color * 506832829u)) >> v_color_cache_shift) & 2047u)] = v_color; + } + v_p += 4u; + v_q += 4u; + } else { + a_dst.ptr[v_p] = a_dst.ptr[v_q]; + v_p += 1u; + v_q += 1u; + } + } + } + v_x += 
(v_back_ref_len_minus_1 + 1u); + while (v_x >= a_width) { + v_x -= a_width; + v_y += 1u; + } + break; + } else { + v_color = self->private_data.f_color_cache[((v_pixel_g - 280u) & 2047u)]; + } + } + if ((v_p + 4u) > ((uint64_t)(a_dst.len))) { + status = wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_dst_buffer); + goto exit; + } + wuffs_base__poke_u32le__no_bounds_check(wuffs_base__slice_u8__subslice_ij(a_dst, v_p, (v_p + 4u)).ptr, v_color); + v_p += 4u; + if (v_color_cache_shift > 0u) { + self->private_data.f_color_cache[((((uint32_t)(v_color * 506832829u)) >> v_color_cache_shift) & 2047u)] = v_color; + } + v_x += 1u; + if (v_x == a_width) { + v_x = 0u; + v_y += 1u; + break; + } + } + } + if (v_n_bits > 63u) { + status = wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_n_bits); + goto exit; + } + while (v_n_bits >= 8u) { + v_n_bits -= 8u; + if (iop_a_src > io1_a_src) { + iop_a_src--; + } else { + status = wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_i_o); + goto exit; + } + } + self->private_impl.f_bits = ((uint32_t)((v_bits & ((((uint64_t)(1u)) << v_n_bits) - 1u)))); + self->private_impl.f_n_bits = v_n_bits; + self->private_impl.f_pix_p = v_p; + self->private_impl.f_pix_x = v_x; + self->private_impl.f_pix_y = v_y; + self->private_impl.f_pix_cc_p = v_p; + status = wuffs_base__make_status(NULL); + goto ok; + + ok: + goto exit; + exit: + if (a_src && a_src->data.ptr) { + a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr)); + } + + return status; +} + // -------- func webp.decoder.decode_pixels_slow WUFFS_BASE__GENERATED_C_CODE @@ -83399,8 +98815,11 @@ wuffs_webp__decoder__decode_pixels_slow( uint32_t v_y = 0; uint32_t v_i = 0; uint32_t v_hg = 0; - uint32_t v_h = 0; - uint16_t v_node = 0; + uint32_t v_ht_base = 0; + uint32_t v_table_entry = 0; + uint32_t v_table_entry_n_bits = 0; + uint32_t v_redir_top = 0; + uint32_t v_redir_mask = 0; uint32_t v_pixel_g = 0; uint32_t v_color = 0; 
wuffs_base__slice_u8 v_dst_pixel = {0}; @@ -83440,7 +98859,7 @@ wuffs_webp__decoder__decode_pixels_slow( v_x = self->private_data.s_decode_pixels_slow.v_x; v_y = self->private_data.s_decode_pixels_slow.v_y; v_hg = self->private_data.s_decode_pixels_slow.v_hg; - v_node = self->private_data.s_decode_pixels_slow.v_node; + v_table_entry = self->private_data.s_decode_pixels_slow.v_table_entry; v_color = self->private_data.s_decode_pixels_slow.v_color; v_back_ref_len_n_bits = self->private_data.s_decode_pixels_slow.v_back_ref_len_n_bits; v_back_ref_len_minus_1 = self->private_data.s_decode_pixels_slow.v_back_ref_len_minus_1; @@ -83456,6 +98875,10 @@ wuffs_webp__decoder__decode_pixels_slow( status = wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_dst_buffer); goto exit; } + v_p = self->private_impl.f_pix_p; + v_x = self->private_impl.f_pix_x; + v_y = self->private_impl.f_pix_y; + v_color_cache_p = self->private_impl.f_pix_cc_p; if (a_tile_size_log2 != 0u) { v_tile_size_log2 = a_tile_size_log2; v_width_in_tiles = ((a_width + ((((uint32_t)(1u)) << v_tile_size_log2) - 1u)) >> v_tile_size_log2); @@ -83467,108 +98890,204 @@ wuffs_webp__decoder__decode_pixels_slow( v_i = ((uint32_t)(((uint32_t)(((uint32_t)(((uint32_t)((v_y >> v_tile_size_log2) * v_width_in_tiles)) + (v_x >> v_tile_size_log2))) * 4u)) + 1u)); if (((uint64_t)(v_i)) < ((uint64_t)(a_tile_data.len))) { v_hg = ((uint32_t)(a_tile_data.ptr[((uint64_t)(v_i))])); + if ((((uint64_t)(v_i)) + 1u) < ((uint64_t)(a_tile_data.len))) { + v_hg = (((((uint32_t)(a_tile_data.ptr[(((uint64_t)(v_i)) + 1u)])) << 8u) | v_hg) & 1023u); + } } - v_h = ((uint32_t)(WUFFS_WEBP__HUFFMAN_TABLE_BASE_OFFSETS[0u])); - while (true) { - v_node = self->private_data.f_huffman_nodes[v_hg][v_h]; - if (v_node >= 32768u) { - break; - } else if (v_node > 6265u) { - status = wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_huffman_code); + while ((self->private_impl.f_n_bits < 8u) && (((uint64_t)(io2_a_src - 
iop_a_src)) > 0u)) { + { + WUFFS_BASE__COROUTINE_SUSPENSION_POINT(1); + if (WUFFS_BASE__UNLIKELY(iop_a_src == io2_a_src)) { + status = wuffs_base__make_status(wuffs_base__suspension__short_read); + goto suspend; + } + uint8_t t_0 = *iop_a_src++; + v_c8 = t_0; + } + if (self->private_impl.f_n_bits >= 8u) { + status = wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_n_bits); goto exit; } - if (self->private_impl.f_n_bits < 1u) { + self->private_impl.f_bits |= (((uint32_t)(v_c8)) << self->private_impl.f_n_bits); + self->private_impl.f_n_bits += 8u; + } + v_ht_base = ((uint32_t)(self->private_data.f_huffman_table_base_offsets[v_hg][0u])); + v_table_entry = self->private_data.f_huffman_tables[v_hg][(((uint32_t)(v_ht_base + (self->private_impl.f_bits & 255u))) & 4095u)]; + v_table_entry_n_bits = (v_table_entry & 15u); + self->private_impl.f_bits >>= v_table_entry_n_bits; + self->private_impl.f_n_bits = (((uint32_t)(self->private_impl.f_n_bits - v_table_entry_n_bits)) & 31u); + if ((v_table_entry >> 31u) == 0u) { + while ((self->private_impl.f_n_bits < 7u) && (((uint64_t)(io2_a_src - iop_a_src)) > 0u)) { { - WUFFS_BASE__COROUTINE_SUSPENSION_POINT(1); + WUFFS_BASE__COROUTINE_SUSPENSION_POINT(2); if (WUFFS_BASE__UNLIKELY(iop_a_src == io2_a_src)) { status = wuffs_base__make_status(wuffs_base__suspension__short_read); goto suspend; } - uint8_t t_0 = *iop_a_src++; - v_c8 = t_0; + uint8_t t_1 = *iop_a_src++; + v_c8 = t_1; } - self->private_impl.f_bits = ((uint32_t)(v_c8)); - self->private_impl.f_n_bits = 8u; + if (self->private_impl.f_n_bits >= 7u) { + status = wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_n_bits); + goto exit; + } + self->private_impl.f_bits |= (((uint32_t)(v_c8)) << self->private_impl.f_n_bits); + self->private_impl.f_n_bits += 8u; } - v_h = (((uint32_t)(v_node)) + (self->private_impl.f_bits & 1u)); - self->private_impl.f_bits >>= 1u; - self->private_impl.f_n_bits -= 1u; + v_redir_top = ((v_table_entry >> 8u) & 
65535u); + v_redir_mask = ((((uint32_t)(1u)) << ((v_table_entry >> 4u) & 15u)) - 1u); + v_table_entry = self->private_data.f_huffman_tables[v_hg][((v_redir_top + (self->private_impl.f_bits & v_redir_mask)) & 4095u)]; + v_table_entry_n_bits = (v_table_entry & 15u); + self->private_impl.f_bits >>= v_table_entry_n_bits; + self->private_impl.f_n_bits = (((uint32_t)(self->private_impl.f_n_bits - v_table_entry_n_bits)) & 31u); } - v_pixel_g = ((uint32_t)(((uint16_t)(v_node & 32767u)))); + v_pixel_g = ((v_table_entry >> 8u) & 65535u); if (v_pixel_g < 256u) { v_color = (v_pixel_g << 8u); - v_h = ((uint32_t)(WUFFS_WEBP__HUFFMAN_TABLE_BASE_OFFSETS[1u])); - while (true) { - v_node = self->private_data.f_huffman_nodes[v_hg][v_h]; - if (v_node >= 32768u) { - break; + while ((self->private_impl.f_n_bits < 8u) && (((uint64_t)(io2_a_src - iop_a_src)) > 0u)) { + { + WUFFS_BASE__COROUTINE_SUSPENSION_POINT(3); + if (WUFFS_BASE__UNLIKELY(iop_a_src == io2_a_src)) { + status = wuffs_base__make_status(wuffs_base__suspension__short_read); + goto suspend; + } + uint8_t t_2 = *iop_a_src++; + v_c8 = t_2; + } + if (self->private_impl.f_n_bits >= 8u) { + status = wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_n_bits); + goto exit; } - if (self->private_impl.f_n_bits < 1u) { + self->private_impl.f_bits |= (((uint32_t)(v_c8)) << self->private_impl.f_n_bits); + self->private_impl.f_n_bits += 8u; + } + v_ht_base = ((uint32_t)(self->private_data.f_huffman_table_base_offsets[v_hg][1u])); + v_table_entry = self->private_data.f_huffman_tables[v_hg][(((uint32_t)(v_ht_base + (self->private_impl.f_bits & 255u))) & 4095u)]; + v_table_entry_n_bits = (v_table_entry & 15u); + self->private_impl.f_bits >>= v_table_entry_n_bits; + self->private_impl.f_n_bits = (((uint32_t)(self->private_impl.f_n_bits - v_table_entry_n_bits)) & 31u); + if ((v_table_entry >> 31u) == 0u) { + while ((self->private_impl.f_n_bits < 7u) && (((uint64_t)(io2_a_src - iop_a_src)) > 0u)) { { - 
WUFFS_BASE__COROUTINE_SUSPENSION_POINT(2); + WUFFS_BASE__COROUTINE_SUSPENSION_POINT(4); if (WUFFS_BASE__UNLIKELY(iop_a_src == io2_a_src)) { status = wuffs_base__make_status(wuffs_base__suspension__short_read); goto suspend; } - uint8_t t_1 = *iop_a_src++; - v_c8 = t_1; + uint8_t t_3 = *iop_a_src++; + v_c8 = t_3; + } + if (self->private_impl.f_n_bits >= 7u) { + status = wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_n_bits); + goto exit; } - self->private_impl.f_bits = ((uint32_t)(v_c8)); - self->private_impl.f_n_bits = 8u; + self->private_impl.f_bits |= (((uint32_t)(v_c8)) << self->private_impl.f_n_bits); + self->private_impl.f_n_bits += 8u; } - v_h = ((((uint32_t)(v_node)) & 4095u) + (self->private_impl.f_bits & 1u)); - self->private_impl.f_bits >>= 1u; - self->private_impl.f_n_bits -= 1u; + v_redir_top = ((v_table_entry >> 8u) & 65535u); + v_redir_mask = ((((uint32_t)(1u)) << ((v_table_entry >> 4u) & 15u)) - 1u); + v_table_entry = self->private_data.f_huffman_tables[v_hg][((v_redir_top + (self->private_impl.f_bits & v_redir_mask)) & 4095u)]; + v_table_entry_n_bits = (v_table_entry & 15u); + self->private_impl.f_bits >>= v_table_entry_n_bits; + self->private_impl.f_n_bits = (((uint32_t)(self->private_impl.f_n_bits - v_table_entry_n_bits)) & 31u); } - v_color |= (((uint32_t)(((uint16_t)(v_node & 255u)))) << 16u); - v_h = ((uint32_t)(WUFFS_WEBP__HUFFMAN_TABLE_BASE_OFFSETS[2u])); - while (true) { - v_node = self->private_data.f_huffman_nodes[v_hg][v_h]; - if (v_node >= 32768u) { - break; + v_color |= (((uint32_t)(((v_table_entry >> 8u) & 255u))) << 16u); + while ((self->private_impl.f_n_bits < 8u) && (((uint64_t)(io2_a_src - iop_a_src)) > 0u)) { + { + WUFFS_BASE__COROUTINE_SUSPENSION_POINT(5); + if (WUFFS_BASE__UNLIKELY(iop_a_src == io2_a_src)) { + status = wuffs_base__make_status(wuffs_base__suspension__short_read); + goto suspend; + } + uint8_t t_4 = *iop_a_src++; + v_c8 = t_4; + } + if (self->private_impl.f_n_bits >= 8u) { + status = 
wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_n_bits); + goto exit; } - if (self->private_impl.f_n_bits < 1u) { + self->private_impl.f_bits |= (((uint32_t)(v_c8)) << self->private_impl.f_n_bits); + self->private_impl.f_n_bits += 8u; + } + v_ht_base = ((uint32_t)(self->private_data.f_huffman_table_base_offsets[v_hg][2u])); + v_table_entry = self->private_data.f_huffman_tables[v_hg][(((uint32_t)(v_ht_base + (self->private_impl.f_bits & 255u))) & 4095u)]; + v_table_entry_n_bits = (v_table_entry & 15u); + self->private_impl.f_bits >>= v_table_entry_n_bits; + self->private_impl.f_n_bits = (((uint32_t)(self->private_impl.f_n_bits - v_table_entry_n_bits)) & 31u); + if ((v_table_entry >> 31u) == 0u) { + while ((self->private_impl.f_n_bits < 7u) && (((uint64_t)(io2_a_src - iop_a_src)) > 0u)) { { - WUFFS_BASE__COROUTINE_SUSPENSION_POINT(3); + WUFFS_BASE__COROUTINE_SUSPENSION_POINT(6); if (WUFFS_BASE__UNLIKELY(iop_a_src == io2_a_src)) { status = wuffs_base__make_status(wuffs_base__suspension__short_read); goto suspend; } - uint8_t t_2 = *iop_a_src++; - v_c8 = t_2; + uint8_t t_5 = *iop_a_src++; + v_c8 = t_5; + } + if (self->private_impl.f_n_bits >= 7u) { + status = wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_n_bits); + goto exit; } - self->private_impl.f_bits = ((uint32_t)(v_c8)); - self->private_impl.f_n_bits = 8u; + self->private_impl.f_bits |= (((uint32_t)(v_c8)) << self->private_impl.f_n_bits); + self->private_impl.f_n_bits += 8u; } - v_h = ((((uint32_t)(v_node)) & 4095u) + (self->private_impl.f_bits & 1u)); - self->private_impl.f_bits >>= 1u; - self->private_impl.f_n_bits -= 1u; + v_redir_top = ((v_table_entry >> 8u) & 65535u); + v_redir_mask = ((((uint32_t)(1u)) << ((v_table_entry >> 4u) & 15u)) - 1u); + v_table_entry = self->private_data.f_huffman_tables[v_hg][((v_redir_top + (self->private_impl.f_bits & v_redir_mask)) & 4095u)]; + v_table_entry_n_bits = (v_table_entry & 15u); + self->private_impl.f_bits >>= 
v_table_entry_n_bits; + self->private_impl.f_n_bits = (((uint32_t)(self->private_impl.f_n_bits - v_table_entry_n_bits)) & 31u); } - v_color |= (((uint32_t)(((uint16_t)(v_node & 255u)))) << 0u); - v_h = ((uint32_t)(WUFFS_WEBP__HUFFMAN_TABLE_BASE_OFFSETS[3u])); - while (true) { - v_node = self->private_data.f_huffman_nodes[v_hg][v_h]; - if (v_node >= 32768u) { - break; + v_color |= (((uint32_t)(((v_table_entry >> 8u) & 255u))) << 0u); + while ((self->private_impl.f_n_bits < 8u) && (((uint64_t)(io2_a_src - iop_a_src)) > 0u)) { + { + WUFFS_BASE__COROUTINE_SUSPENSION_POINT(7); + if (WUFFS_BASE__UNLIKELY(iop_a_src == io2_a_src)) { + status = wuffs_base__make_status(wuffs_base__suspension__short_read); + goto suspend; + } + uint8_t t_6 = *iop_a_src++; + v_c8 = t_6; } - if (self->private_impl.f_n_bits < 1u) { + if (self->private_impl.f_n_bits >= 8u) { + status = wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_n_bits); + goto exit; + } + self->private_impl.f_bits |= (((uint32_t)(v_c8)) << self->private_impl.f_n_bits); + self->private_impl.f_n_bits += 8u; + } + v_ht_base = ((uint32_t)(self->private_data.f_huffman_table_base_offsets[v_hg][3u])); + v_table_entry = self->private_data.f_huffman_tables[v_hg][(((uint32_t)(v_ht_base + (self->private_impl.f_bits & 255u))) & 4095u)]; + v_table_entry_n_bits = (v_table_entry & 15u); + self->private_impl.f_bits >>= v_table_entry_n_bits; + self->private_impl.f_n_bits = (((uint32_t)(self->private_impl.f_n_bits - v_table_entry_n_bits)) & 31u); + if ((v_table_entry >> 31u) == 0u) { + while ((self->private_impl.f_n_bits < 7u) && (((uint64_t)(io2_a_src - iop_a_src)) > 0u)) { { - WUFFS_BASE__COROUTINE_SUSPENSION_POINT(4); + WUFFS_BASE__COROUTINE_SUSPENSION_POINT(8); if (WUFFS_BASE__UNLIKELY(iop_a_src == io2_a_src)) { status = wuffs_base__make_status(wuffs_base__suspension__short_read); goto suspend; } - uint8_t t_3 = *iop_a_src++; - v_c8 = t_3; + uint8_t t_7 = *iop_a_src++; + v_c8 = t_7; + } + if 
(self->private_impl.f_n_bits >= 7u) { + status = wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_n_bits); + goto exit; } - self->private_impl.f_bits = ((uint32_t)(v_c8)); - self->private_impl.f_n_bits = 8u; + self->private_impl.f_bits |= (((uint32_t)(v_c8)) << self->private_impl.f_n_bits); + self->private_impl.f_n_bits += 8u; } - v_h = ((((uint32_t)(v_node)) & 4095u) + (self->private_impl.f_bits & 1u)); - self->private_impl.f_bits >>= 1u; - self->private_impl.f_n_bits -= 1u; + v_redir_top = ((v_table_entry >> 8u) & 65535u); + v_redir_mask = ((((uint32_t)(1u)) << ((v_table_entry >> 4u) & 15u)) - 1u); + v_table_entry = self->private_data.f_huffman_tables[v_hg][((v_redir_top + (self->private_impl.f_bits & v_redir_mask)) & 4095u)]; + v_table_entry_n_bits = (v_table_entry & 15u); + self->private_impl.f_bits >>= v_table_entry_n_bits; + self->private_impl.f_n_bits = (((uint32_t)(self->private_impl.f_n_bits - v_table_entry_n_bits)) & 31u); } - v_color |= (((uint32_t)(((uint16_t)(v_node & 255u)))) << 24u); + v_color |= (((uint32_t)(((v_table_entry >> 8u) & 255u))) << 24u); } else if (v_pixel_g < 280u) { if (v_pixel_g < 260u) { v_back_ref_len_minus_1 = (v_pixel_g - 256u); @@ -83577,13 +99096,13 @@ wuffs_webp__decoder__decode_pixels_slow( v_back_ref_len_minus_1 = ((((uint32_t)(2u)) + (v_pixel_g & 1u)) << v_back_ref_len_n_bits); while (self->private_impl.f_n_bits < v_back_ref_len_n_bits) { { - WUFFS_BASE__COROUTINE_SUSPENSION_POINT(5); + WUFFS_BASE__COROUTINE_SUSPENSION_POINT(9); if (WUFFS_BASE__UNLIKELY(iop_a_src == io2_a_src)) { status = wuffs_base__make_status(wuffs_base__suspension__short_read); goto suspend; } - uint8_t t_4 = *iop_a_src++; - v_c8 = t_4; + uint8_t t_8 = *iop_a_src++; + v_c8 = t_8; } if (self->private_impl.f_n_bits >= v_back_ref_len_n_bits) { status = wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_n_bits); @@ -83596,30 +99115,54 @@ wuffs_webp__decoder__decode_pixels_slow( self->private_impl.f_bits >>= 
v_back_ref_len_n_bits; self->private_impl.f_n_bits -= v_back_ref_len_n_bits; } - v_h = ((uint32_t)(WUFFS_WEBP__HUFFMAN_TABLE_BASE_OFFSETS[4u])); - while (true) { - v_node = self->private_data.f_huffman_nodes[v_hg][v_h]; - if (v_node >= 32768u) { - break; + while ((self->private_impl.f_n_bits < 8u) && (((uint64_t)(io2_a_src - iop_a_src)) > 0u)) { + { + WUFFS_BASE__COROUTINE_SUSPENSION_POINT(10); + if (WUFFS_BASE__UNLIKELY(iop_a_src == io2_a_src)) { + status = wuffs_base__make_status(wuffs_base__suspension__short_read); + goto suspend; + } + uint8_t t_9 = *iop_a_src++; + v_c8 = t_9; + } + if (self->private_impl.f_n_bits >= 8u) { + status = wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_n_bits); + goto exit; } - if (self->private_impl.f_n_bits < 1u) { + self->private_impl.f_bits |= (((uint32_t)(v_c8)) << self->private_impl.f_n_bits); + self->private_impl.f_n_bits += 8u; + } + v_ht_base = ((uint32_t)(self->private_data.f_huffman_table_base_offsets[v_hg][4u])); + v_table_entry = self->private_data.f_huffman_tables[v_hg][(((uint32_t)(v_ht_base + (self->private_impl.f_bits & 255u))) & 4095u)]; + v_table_entry_n_bits = (v_table_entry & 15u); + self->private_impl.f_bits >>= v_table_entry_n_bits; + self->private_impl.f_n_bits = (((uint32_t)(self->private_impl.f_n_bits - v_table_entry_n_bits)) & 31u); + if ((v_table_entry >> 31u) == 0u) { + while ((self->private_impl.f_n_bits < 7u) && (((uint64_t)(io2_a_src - iop_a_src)) > 0u)) { { - WUFFS_BASE__COROUTINE_SUSPENSION_POINT(6); + WUFFS_BASE__COROUTINE_SUSPENSION_POINT(11); if (WUFFS_BASE__UNLIKELY(iop_a_src == io2_a_src)) { status = wuffs_base__make_status(wuffs_base__suspension__short_read); goto suspend; } - uint8_t t_5 = *iop_a_src++; - v_c8 = t_5; + uint8_t t_10 = *iop_a_src++; + v_c8 = t_10; + } + if (self->private_impl.f_n_bits >= 7u) { + status = wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_n_bits); + goto exit; } - self->private_impl.f_bits = ((uint32_t)(v_c8)); - 
self->private_impl.f_n_bits = 8u; + self->private_impl.f_bits |= (((uint32_t)(v_c8)) << self->private_impl.f_n_bits); + self->private_impl.f_n_bits += 8u; } - v_h = ((((uint32_t)(v_node)) & 4095u) + (self->private_impl.f_bits & 1u)); - self->private_impl.f_bits >>= 1u; - self->private_impl.f_n_bits -= 1u; + v_redir_top = ((v_table_entry >> 8u) & 65535u); + v_redir_mask = ((((uint32_t)(1u)) << ((v_table_entry >> 4u) & 15u)) - 1u); + v_table_entry = self->private_data.f_huffman_tables[v_hg][((v_redir_top + (self->private_impl.f_bits & v_redir_mask)) & 4095u)]; + v_table_entry_n_bits = (v_table_entry & 15u); + self->private_impl.f_bits >>= v_table_entry_n_bits; + self->private_impl.f_n_bits = (((uint32_t)(self->private_impl.f_n_bits - v_table_entry_n_bits)) & 31u); } - v_back_ref_dist_sym = ((uint32_t)(((uint16_t)(v_node & 32767u)))); + v_back_ref_dist_sym = ((v_table_entry >> 8u) & 65535u); if (v_back_ref_dist_sym < 4u) { v_back_ref_dist_premap_minus_1 = v_back_ref_dist_sym; } else if (v_back_ref_dist_sym < 40u) { @@ -83627,13 +99170,13 @@ wuffs_webp__decoder__decode_pixels_slow( v_back_ref_dist_premap_minus_1 = ((((uint32_t)(2u)) + (v_back_ref_dist_sym & 1u)) << v_back_ref_dist_n_bits); while (self->private_impl.f_n_bits < v_back_ref_dist_n_bits) { { - WUFFS_BASE__COROUTINE_SUSPENSION_POINT(7); + WUFFS_BASE__COROUTINE_SUSPENSION_POINT(12); if (WUFFS_BASE__UNLIKELY(iop_a_src == io2_a_src)) { status = wuffs_base__make_status(wuffs_base__suspension__short_read); goto suspend; } - uint8_t t_6 = *iop_a_src++; - v_c8 = t_6; + uint8_t t_11 = *iop_a_src++; + v_c8 = t_11; } if (self->private_impl.f_n_bits >= v_back_ref_dist_n_bits) { status = wuffs_base__make_status(wuffs_webp__error__internal_error_inconsistent_n_bits); @@ -83704,6 +99247,10 @@ wuffs_webp__decoder__decode_pixels_slow( v_y += 1u; } } + self->private_impl.f_pix_p = v_p; + self->private_impl.f_pix_x = v_x; + self->private_impl.f_pix_y = v_y; + self->private_impl.f_pix_cc_p = v_color_cache_p; goto ok; ok: @@ 
-83721,7 +99268,7 @@ wuffs_webp__decoder__decode_pixels_slow( self->private_data.s_decode_pixels_slow.v_x = v_x; self->private_data.s_decode_pixels_slow.v_y = v_y; self->private_data.s_decode_pixels_slow.v_hg = v_hg; - self->private_data.s_decode_pixels_slow.v_node = v_node; + self->private_data.s_decode_pixels_slow.v_table_entry = v_table_entry; self->private_data.s_decode_pixels_slow.v_color = v_color; self->private_data.s_decode_pixels_slow.v_back_ref_len_n_bits = v_back_ref_len_n_bits; self->private_data.s_decode_pixels_slow.v_back_ref_len_minus_1 = v_back_ref_len_minus_1; @@ -83746,6 +99293,15 @@ wuffs_webp__decoder__apply_transform_predictor( wuffs_webp__decoder* self, wuffs_base__slice_u8 a_pix, wuffs_base__slice_u8 a_tile_data) { + return (*self->private_impl.choosy_apply_transform_predictor)(self, a_pix, a_tile_data); +} + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_webp__decoder__apply_transform_predictor__choosy_default( + wuffs_webp__decoder* self, + wuffs_base__slice_u8 a_pix, + wuffs_base__slice_u8 a_tile_data) { uint64_t v_w4 = 0; wuffs_base__slice_u8 v_prev_row = {0}; wuffs_base__slice_u8 v_curr_row = {0}; @@ -84134,9 +99690,19 @@ wuffs_webp__decoder__apply_transform_cross_color( wuffs_webp__decoder* self, wuffs_base__slice_u8 a_pix, wuffs_base__slice_u8 a_tile_data) { + return (*self->private_impl.choosy_apply_transform_cross_color)(self, a_pix, a_tile_data); +} + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_webp__decoder__apply_transform_cross_color__choosy_default( + wuffs_webp__decoder* self, + wuffs_base__slice_u8 a_pix, + wuffs_base__slice_u8 a_tile_data) { uint32_t v_tile_size_log2 = 0; uint32_t v_tiles_per_row = 0; uint32_t v_mask = 0; + bool v_do_subtract_green = false; uint32_t v_y = 0; uint32_t v_x = 0; uint64_t v_t = 0; @@ -84151,6 +99717,7 @@ wuffs_webp__decoder__apply_transform_cross_color( v_tile_size_log2 = ((uint32_t)(self->private_impl.f_transform_tile_size_log2[1u])); 
v_tiles_per_row = ((self->private_impl.f_width + ((((uint32_t)(1u)) << v_tile_size_log2) - 1u)) >> v_tile_size_log2); v_mask = ((((uint32_t)(1u)) << v_tile_size_log2) - 1u); + v_do_subtract_green = self->private_impl.f_fuse_subtract_green; v_y = 0u; while (v_y < self->private_impl.f_height) { v_t = ((uint64_t)((4u * (v_y >> v_tile_size_log2) * v_tiles_per_row))); @@ -84174,12 +99741,23 @@ wuffs_webp__decoder__apply_transform_cross_color( #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wconversion" #endif - v_r += ((uint8_t)((((uint32_t)(wuffs_base__utility__sign_extend_convert_u8_u32(v_g) * v_g2r)) >> 5u))); - v_b += ((uint8_t)((((uint32_t)(wuffs_base__utility__sign_extend_convert_u8_u32(v_g) * v_g2b)) >> 5u))); - v_b += ((uint8_t)((((uint32_t)(wuffs_base__utility__sign_extend_convert_u8_u32(v_r) * v_r2b)) >> 5u))); + v_r += ((uint8_t)((((uint32_t)(wuffs_base__utility__sign_extend_convert_u8_u32(v_g) * v_g2r)) >> 5u))); + v_b += ((uint8_t)((((uint32_t)(wuffs_base__utility__sign_extend_convert_u8_u32(v_g) * v_g2b)) >> 5u))); + v_b += ((uint8_t)((((uint32_t)(wuffs_base__utility__sign_extend_convert_u8_u32(v_r) * v_r2b)) >> 5u))); +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + if (v_do_subtract_green) { +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + v_r += v_g; + v_b += v_g; #if defined(__GNUC__) #pragma GCC diagnostic pop #endif + } a_pix.ptr[0u] = v_b; a_pix.ptr[2u] = v_r; a_pix = wuffs_base__slice_u8__subslice_i(a_pix, 4u); @@ -84198,6 +99776,14 @@ static wuffs_base__empty_struct wuffs_webp__decoder__apply_transform_subtract_green( wuffs_webp__decoder* self, wuffs_base__slice_u8 a_pix) { + return (*self->private_impl.choosy_apply_transform_subtract_green)(self, a_pix); +} + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_webp__decoder__apply_transform_subtract_green__choosy_default( + wuffs_webp__decoder* self, + wuffs_base__slice_u8 a_pix) { 
wuffs_base__slice_u8 v_p = {0}; uint8_t v_g = 0; @@ -84307,6 +99893,682 @@ wuffs_webp__decoder__apply_transform_color_indexing( return wuffs_base__make_empty_struct(); } +// ‼ WUFFS MULTI-FILE SECTION +x86_avx2 +// -------- func webp.decoder.apply_transform_subtract_green_x86_avx2 + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) +WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2,avx2") +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_webp__decoder__apply_transform_subtract_green_x86_avx2( + wuffs_webp__decoder* self, + wuffs_base__slice_u8 a_pix) { + wuffs_base__slice_u8 v_tail = {0}; + __m256i v_v = {0}; + __m256i v_mask = {0}; + __m256i v_green = {0}; + __m256i v_g_br = {0}; + + v_mask = _mm256_set1_epi32((int32_t)(65280u)); + v_tail = a_pix; + while (((uint64_t)(v_tail.len)) >= 32u) { + v_v = _mm256_lddqu_si256((const __m256i*)(const void*)(v_tail.ptr)); + v_green = _mm256_and_si256(v_v, v_mask); + v_g_br = _mm256_or_si256(_mm256_srli_epi32(v_green, (int32_t)(8u)), _mm256_slli_epi32(v_green, (int32_t)(8u))); + v_v = _mm256_add_epi8(v_v, v_g_br); + _mm256_storeu_si256((__m256i*)(void*)(v_tail.ptr), v_v); + v_tail = wuffs_base__slice_u8__subslice_i(v_tail, 32u); + } + while (((uint64_t)(v_tail.len)) >= 4u) { +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + v_tail.ptr[0u] += v_tail.ptr[1u]; + v_tail.ptr[2u] += v_tail.ptr[1u]; +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + v_tail = wuffs_base__slice_u8__subslice_i(v_tail, 4u); + } + return wuffs_base__make_empty_struct(); +} +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) +// ‼ WUFFS MULTI-FILE SECTION -x86_avx2 + +// ‼ WUFFS MULTI-FILE SECTION +x86_avx2 +// -------- func webp.decoder.apply_transform_cross_color_x86_avx2 + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) +WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2,avx2") +WUFFS_BASE__GENERATED_C_CODE +static 
wuffs_base__empty_struct +wuffs_webp__decoder__apply_transform_cross_color_x86_avx2( + wuffs_webp__decoder* self, + wuffs_base__slice_u8 a_pix, + wuffs_base__slice_u8 a_tile_data) { + uint32_t v_tile_size_log2 = 0; + uint32_t v_tiles_per_row = 0; + uint32_t v_tmask = 0; + bool v_do_subtract_green = false; + uint32_t v_y = 0; + uint32_t v_x = 0; + uint64_t v_t = 0; + wuffs_base__slice_u8 v_tile_data = {0}; + uint32_t v_x_end = 0; + uint32_t v_g2r = 0; + uint32_t v_g2b = 0; + uint32_t v_r2b = 0; + uint8_t v_raw_g2r = 0; + uint8_t v_raw_g2b = 0; + uint8_t v_raw_r2b = 0; + uint64_t v_skip_bytes = 0; + uint8_t v_b = 0; + uint8_t v_g = 0; + uint8_t v_r = 0; + __m256i v_pix = {0}; + __m256i v_green_i16 = {0}; + __m256i v_red_i16 = {0}; + __m256i v_new_r_i16 = {0}; + __m256i v_delta_r_i16 = {0}; + __m256i v_delta_b_i16 = {0}; + __m256i v_g2r_vec = {0}; + __m256i v_g2b_vec = {0}; + __m256i v_r2b_vec = {0}; + __m256i v_delta_r_packed = {0}; + __m256i v_delta_b_packed = {0}; + __m256i v_green_shuf = {0}; + __m256i v_red_shuf = {0}; + __m256i v_r_scatter = {0}; + __m256i v_b_scatter = {0}; + __m256i v_sg_mask = {0}; + __m256i v_sg_green = {0}; + __m256i v_sg_br = {0}; + + v_tile_size_log2 = ((uint32_t)(self->private_impl.f_transform_tile_size_log2[1u])); + v_tiles_per_row = ((self->private_impl.f_width + ((((uint32_t)(1u)) << v_tile_size_log2) - 1u)) >> v_tile_size_log2); + v_tmask = ((((uint32_t)(1u)) << v_tile_size_log2) - 1u); + v_do_subtract_green = self->private_impl.f_fuse_subtract_green; + v_green_shuf = _mm256_set_epi32((int32_t)(2155905152u), (int32_t)(2155905152u), (int32_t)(2148368393u), (int32_t)(2147844097u), (int32_t)(2155905152u), (int32_t)(2155905152u), (int32_t)(2148368393u), (int32_t)(2147844097u)); + v_red_shuf = _mm256_set_epi32((int32_t)(2155905152u), (int32_t)(2155905152u), (int32_t)(2148433930u), (int32_t)(2147909634u), (int32_t)(2155905152u), (int32_t)(2155905152u), (int32_t)(2148433930u), (int32_t)(2147909634u)); + v_r_scatter = 
_mm256_set_epi32((int32_t)(2147909760u), (int32_t)(2147778688u), (int32_t)(2147647616u), (int32_t)(2147516544u), (int32_t)(2147909760u), (int32_t)(2147778688u), (int32_t)(2147647616u), (int32_t)(2147516544u)); + v_b_scatter = _mm256_set_epi32((int32_t)(2155905030u), (int32_t)(2155905028u), (int32_t)(2155905026u), (int32_t)(2155905024u), (int32_t)(2155905030u), (int32_t)(2155905028u), (int32_t)(2155905026u), (int32_t)(2155905024u)); + v_sg_mask = _mm256_set1_epi32((int32_t)(65280u)); + v_y = 0u; + while (v_y < self->private_impl.f_height) { + v_t = ((uint64_t)((4u * (v_y >> v_tile_size_log2) * v_tiles_per_row))); + v_tile_data = wuffs_base__utility__empty_slice_u8(); + if (v_t <= ((uint64_t)(a_tile_data.len))) { + v_tile_data = wuffs_base__slice_u8__subslice_i(a_tile_data, v_t); + } + v_x = 0u; + while (v_x < self->private_impl.f_width) { + if (((v_x & v_tmask) == 0u) && (((uint64_t)(v_tile_data.len)) >= 4u)) { + v_raw_g2r = v_tile_data.ptr[0u]; + v_raw_g2b = v_tile_data.ptr[1u]; + v_raw_r2b = v_tile_data.ptr[2u]; + v_g2r = wuffs_base__utility__sign_extend_convert_u8_u32(v_raw_g2r); + v_g2b = wuffs_base__utility__sign_extend_convert_u8_u32(v_raw_g2b); + v_r2b = wuffs_base__utility__sign_extend_convert_u8_u32(v_raw_r2b); + v_tile_data = wuffs_base__slice_u8__subslice_i(v_tile_data, 4u); + } + v_x_end = ((v_x | v_tmask) + 1u); + if (v_x_end > self->private_impl.f_width) { + v_x_end = self->private_impl.f_width; + } + while ((v_x_end < self->private_impl.f_width) && (((uint64_t)(v_tile_data.len)) >= 4u)) { + if ((v_tile_data.ptr[0u] != v_raw_g2r) || (v_tile_data.ptr[1u] != v_raw_g2b) || (v_tile_data.ptr[2u] != v_raw_r2b)) { + break; + } + v_tile_data = wuffs_base__slice_u8__subslice_i(v_tile_data, 4u); + v_x_end = ((uint32_t)((v_x_end | v_tmask) + 1u)); + if (v_x_end > self->private_impl.f_width) { + v_x_end = self->private_impl.f_width; + } + } + if (v_x_end > self->private_impl.f_width) { + v_x_end = self->private_impl.f_width; + } + if ((v_g2r == 0u) && + (v_g2b == 
0u) && + (v_r2b == 0u) && + ! v_do_subtract_green) { + if ((v_x_end > v_x) && (v_x_end <= self->private_impl.f_width)) { + v_skip_bytes = (((uint64_t)((v_x_end - v_x))) * 4u); + v_x = v_x_end; + if (v_skip_bytes <= ((uint64_t)(a_pix.len))) { + a_pix = wuffs_base__slice_u8__subslice_i(a_pix, v_skip_bytes); + } + } + } else { + v_g2r_vec = _mm256_set1_epi16((int16_t)(((uint16_t)(v_g2r)))); + v_g2b_vec = _mm256_set1_epi16((int16_t)(((uint16_t)(v_g2b)))); + v_r2b_vec = _mm256_set1_epi16((int16_t)(((uint16_t)(v_r2b)))); + if (v_x_end >= 8u) { + while ((v_x < self->private_impl.f_width) && + (v_x <= (v_x_end - 8u)) && + (((uint64_t)(a_pix.len)) >= 32u) && + (v_x_end <= self->private_impl.f_width)) { + v_pix = _mm256_lddqu_si256((const __m256i*)(const void*)(a_pix.ptr)); + v_green_i16 = _mm256_shuffle_epi8(v_pix, v_green_shuf); + v_green_i16 = _mm256_srai_epi16(_mm256_slli_epi16(v_green_i16, (int32_t)(8u)), (int32_t)(8u)); + v_delta_r_i16 = _mm256_srai_epi16(_mm256_mullo_epi16(v_green_i16, v_g2r_vec), (int32_t)(5u)); + v_delta_b_i16 = _mm256_srai_epi16(_mm256_mullo_epi16(v_green_i16, v_g2b_vec), (int32_t)(5u)); + v_red_i16 = _mm256_shuffle_epi8(v_pix, v_red_shuf); + v_red_i16 = _mm256_srai_epi16(_mm256_slli_epi16(v_red_i16, (int32_t)(8u)), (int32_t)(8u)); + v_new_r_i16 = _mm256_add_epi16(v_red_i16, v_delta_r_i16); + v_new_r_i16 = _mm256_srai_epi16(_mm256_slli_epi16(v_new_r_i16, (int32_t)(8u)), (int32_t)(8u)); + v_delta_b_i16 = _mm256_add_epi16(v_delta_b_i16, _mm256_srai_epi16(_mm256_mullo_epi16(v_new_r_i16, v_r2b_vec), (int32_t)(5u))); + v_delta_r_packed = _mm256_shuffle_epi8(v_delta_r_i16, v_r_scatter); + v_delta_b_packed = _mm256_shuffle_epi8(v_delta_b_i16, v_b_scatter); + v_pix = _mm256_add_epi8(v_pix, v_delta_r_packed); + v_pix = _mm256_add_epi8(v_pix, v_delta_b_packed); + if (v_do_subtract_green) { + v_sg_green = _mm256_and_si256(v_pix, v_sg_mask); + v_sg_br = _mm256_or_si256(_mm256_srli_epi32(v_sg_green, (int32_t)(8u)), _mm256_slli_epi32(v_sg_green, (int32_t)(8u))); 
+ v_pix = _mm256_add_epi8(v_pix, v_sg_br); + } + _mm256_storeu_si256((__m256i*)(void*)(a_pix.ptr), v_pix); + a_pix = wuffs_base__slice_u8__subslice_i(a_pix, 32u); + v_x += 8u; + } + } + while ((v_x < v_x_end) && (v_x_end <= self->private_impl.f_width)) { + if (((uint64_t)(a_pix.len)) >= 4u) { + v_b = a_pix.ptr[0u]; + v_g = a_pix.ptr[1u]; + v_r = a_pix.ptr[2u]; +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + v_r += ((uint8_t)((((uint32_t)(wuffs_base__utility__sign_extend_convert_u8_u32(v_g) * v_g2r)) >> 5u))); + v_b += ((uint8_t)((((uint32_t)(wuffs_base__utility__sign_extend_convert_u8_u32(v_g) * v_g2b)) >> 5u))); + v_b += ((uint8_t)((((uint32_t)(wuffs_base__utility__sign_extend_convert_u8_u32(v_r) * v_r2b)) >> 5u))); +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + if (v_do_subtract_green) { +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + v_r += v_g; + v_b += v_g; +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + } + a_pix.ptr[0u] = v_b; + a_pix.ptr[2u] = v_r; + a_pix = wuffs_base__slice_u8__subslice_i(a_pix, 4u); + } + v_x += 1u; + } + } + } + v_y += 1u; + } + return wuffs_base__make_empty_struct(); +} +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) +// ‼ WUFFS MULTI-FILE SECTION -x86_avx2 + +// ‼ WUFFS MULTI-FILE SECTION +x86_avx2 +// -------- func webp.decoder.apply_transform_predictor_x86_avx2 + +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) +WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2,avx2") +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_webp__decoder__apply_transform_predictor_x86_avx2( + wuffs_webp__decoder* self, + wuffs_base__slice_u8 a_pix, + wuffs_base__slice_u8 a_tile_data) { + uint64_t v_w4 = 0; + wuffs_base__slice_u8 v_prev_row = {0}; + wuffs_base__slice_u8 v_curr_row = {0}; + uint32_t v_tile_size_log2 = 0; + uint32_t v_tiles_per_row = 0; + uint32_t 
v_mask = 0; + uint32_t v_y = 0; + uint32_t v_x = 0; + uint64_t v_t = 0; + wuffs_base__slice_u8 v_tile_data = {0}; + uint8_t v_mode = 0; + uint32_t v_x_end = 0; + __m256i v_avx_pix = {0}; + __m256i v_avx_prev = {0}; + __m256i v_avx_opaque = {0}; + __m256i v_avx_carry = {0}; + uint32_t v_l0 = 0; + uint32_t v_l1 = 0; + uint32_t v_l2 = 0; + uint32_t v_l3 = 0; + uint32_t v_c0 = 0; + uint32_t v_c1 = 0; + uint32_t v_c2 = 0; + uint32_t v_c3 = 0; + uint32_t v_t0 = 0; + uint32_t v_t1 = 0; + uint32_t v_t2 = 0; + uint32_t v_t3 = 0; + uint32_t v_sum_l = 0; + uint32_t v_sum_t = 0; + + if ((self->private_impl.f_width <= 0u) || (self->private_impl.f_height <= 0u)) { + return wuffs_base__make_empty_struct(); + } + v_w4 = ((uint64_t)((self->private_impl.f_width * 4u))); + v_curr_row = wuffs_base__utility__empty_slice_u8(); + if (v_w4 <= ((uint64_t)(a_pix.len))) { + v_curr_row = wuffs_base__slice_u8__subslice_j(a_pix, v_w4); + } + if (((uint64_t)(v_curr_row.len)) >= 4u) { +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + v_curr_row.ptr[3u] += 255u; +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + } + if (((uint64_t)(v_curr_row.len)) >= 4u) { + v_avx_carry = _mm256_set1_epi32((int32_t)(wuffs_base__peek_u32le__no_bounds_check(v_curr_row.ptr))); + while (((uint64_t)(v_curr_row.len)) >= 36u) { + v_avx_pix = _mm256_lddqu_si256((const __m256i*)(const void*)(v_curr_row.ptr + 4u)); + v_avx_prev = _mm256_slli_si256(v_avx_pix, (int32_t)(4u)); + v_avx_pix = _mm256_add_epi8(v_avx_pix, v_avx_prev); + v_avx_prev = _mm256_slli_si256(v_avx_pix, (int32_t)(8u)); + v_avx_pix = _mm256_add_epi8(v_avx_pix, v_avx_prev); + v_avx_opaque = _mm256_shuffle_epi32(v_avx_pix, (int32_t)(255u)); + v_avx_opaque = _mm256_permute2x128_si256(v_avx_opaque, v_avx_opaque, (int32_t)(8u)); + v_avx_pix = _mm256_add_epi8(v_avx_pix, v_avx_opaque); + v_avx_pix = _mm256_add_epi8(v_avx_pix, v_avx_carry); + _mm256_storeu_si256((__m256i*)(void*)(v_curr_row.ptr + 
4u), v_avx_pix); + v_avx_carry = _mm256_permute4x64_epi64(v_avx_pix, (int32_t)(255u)); + v_avx_carry = _mm256_shuffle_epi32(v_avx_carry, (int32_t)(255u)); + v_curr_row = wuffs_base__slice_u8__subslice_i(v_curr_row, 32u); + } + } + while (((uint64_t)(v_curr_row.len)) >= 8u) { +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + v_curr_row.ptr[4u] += v_curr_row.ptr[0u]; + v_curr_row.ptr[5u] += v_curr_row.ptr[1u]; + v_curr_row.ptr[6u] += v_curr_row.ptr[2u]; + v_curr_row.ptr[7u] += v_curr_row.ptr[3u]; +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + v_curr_row = wuffs_base__slice_u8__subslice_i(v_curr_row, 4u); + } + v_tile_size_log2 = ((uint32_t)(self->private_impl.f_transform_tile_size_log2[0u])); + v_tiles_per_row = ((self->private_impl.f_width + ((((uint32_t)(1u)) << v_tile_size_log2) - 1u)) >> v_tile_size_log2); + v_mask = ((((uint32_t)(1u)) << v_tile_size_log2) - 1u); + v_y = 1u; + while (v_y < self->private_impl.f_height) { + v_t = ((uint64_t)((4u * (v_y >> v_tile_size_log2) * v_tiles_per_row))); + v_tile_data = wuffs_base__utility__empty_slice_u8(); + if (v_t <= ((uint64_t)(a_tile_data.len))) { + v_tile_data = wuffs_base__slice_u8__subslice_i(a_tile_data, v_t); + if (((uint64_t)(v_tile_data.len)) >= 4u) { + v_mode = ((uint8_t)(v_tile_data.ptr[1u] & 15u)); + v_tile_data = wuffs_base__slice_u8__subslice_i(v_tile_data, 4u); + } + } + if (v_w4 <= ((uint64_t)(a_pix.len))) { + v_prev_row = a_pix; + a_pix = wuffs_base__slice_u8__subslice_i(a_pix, v_w4); + v_curr_row = a_pix; + } + if ((((uint64_t)(v_prev_row.len)) >= 4u) && (((uint64_t)(v_curr_row.len)) >= 4u)) { +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + v_curr_row.ptr[0u] += v_prev_row.ptr[0u]; + v_curr_row.ptr[1u] += v_prev_row.ptr[1u]; + v_curr_row.ptr[2u] += v_prev_row.ptr[2u]; + v_curr_row.ptr[3u] += v_prev_row.ptr[3u]; +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + 
} + v_x = 1u; + while (v_x < self->private_impl.f_width) { + if (((v_x & v_mask) == 0u) && (((uint64_t)(v_tile_data.len)) >= 4u)) { + v_mode = ((uint8_t)(v_tile_data.ptr[1u] & 15u)); + v_tile_data = wuffs_base__slice_u8__subslice_i(v_tile_data, 4u); + } + v_x_end = ((v_x | v_mask) + 1u); + if (v_x_end > self->private_impl.f_width) { + v_x_end = self->private_impl.f_width; + } + while ((v_x_end < self->private_impl.f_width) && (((uint64_t)(v_tile_data.len)) >= 4u)) { + if (((uint8_t)(v_tile_data.ptr[1u] & 15u)) != v_mode) { + break; + } + v_tile_data = wuffs_base__slice_u8__subslice_i(v_tile_data, 4u); + v_x_end = ((uint32_t)((v_x_end | v_mask) + 1u)); + if (v_x_end > self->private_impl.f_width) { + v_x_end = self->private_impl.f_width; + } + } + if (v_x_end > self->private_impl.f_width) { + v_x_end = self->private_impl.f_width; + } + if (v_mode == 0u) { + v_avx_opaque = _mm256_set1_epi32((int32_t)(4278190080u)); + if (v_x_end >= 8u) { + while ((v_x < self->private_impl.f_width) && + (v_x <= (v_x_end - 8u)) && + (((uint64_t)(v_curr_row.len)) >= 36u) && + (((uint64_t)(v_prev_row.len)) >= 32u)) { + v_avx_pix = _mm256_lddqu_si256((const __m256i*)(const void*)(v_curr_row.ptr + 4u)); + v_avx_pix = _mm256_add_epi8(v_avx_pix, v_avx_opaque); + _mm256_storeu_si256((__m256i*)(void*)(v_curr_row.ptr + 4u), v_avx_pix); + v_curr_row = wuffs_base__slice_u8__subslice_i(v_curr_row, 32u); + v_prev_row = wuffs_base__slice_u8__subslice_i(v_prev_row, 32u); + v_x += 8u; + } + } + } else if (v_mode == 1u) { + if ((v_x_end >= 8u) && (((uint64_t)(v_curr_row.len)) >= 4u)) { + v_avx_carry = _mm256_set1_epi32((int32_t)(wuffs_base__peek_u32le__no_bounds_check(v_curr_row.ptr))); + while ((v_x < self->private_impl.f_width) && + (v_x <= (v_x_end - 8u)) && + (((uint64_t)(v_curr_row.len)) >= 36u) && + (((uint64_t)(v_prev_row.len)) >= 32u)) { + v_avx_pix = _mm256_lddqu_si256((const __m256i*)(const void*)(v_curr_row.ptr + 4u)); + v_avx_prev = _mm256_slli_si256(v_avx_pix, (int32_t)(4u)); + v_avx_pix = 
_mm256_add_epi8(v_avx_pix, v_avx_prev); + v_avx_prev = _mm256_slli_si256(v_avx_pix, (int32_t)(8u)); + v_avx_pix = _mm256_add_epi8(v_avx_pix, v_avx_prev); + v_avx_opaque = _mm256_shuffle_epi32(v_avx_pix, (int32_t)(255u)); + v_avx_opaque = _mm256_permute2x128_si256(v_avx_opaque, v_avx_opaque, (int32_t)(8u)); + v_avx_pix = _mm256_add_epi8(v_avx_pix, v_avx_opaque); + v_avx_pix = _mm256_add_epi8(v_avx_pix, v_avx_carry); + _mm256_storeu_si256((__m256i*)(void*)(v_curr_row.ptr + 4u), v_avx_pix); + v_avx_carry = _mm256_permute4x64_epi64(v_avx_pix, (int32_t)(255u)); + v_avx_carry = _mm256_shuffle_epi32(v_avx_carry, (int32_t)(255u)); + v_curr_row = wuffs_base__slice_u8__subslice_i(v_curr_row, 32u); + v_prev_row = wuffs_base__slice_u8__subslice_i(v_prev_row, 32u); + v_x += 8u; + } + } + while ((v_x < v_x_end) && + (v_x_end <= self->private_impl.f_width) && + (((uint64_t)(v_curr_row.len)) >= 8u) && + (((uint64_t)(v_prev_row.len)) >= 4u)) { +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + v_curr_row.ptr[4u] += v_curr_row.ptr[0u]; + v_curr_row.ptr[5u] += v_curr_row.ptr[1u]; + v_curr_row.ptr[6u] += v_curr_row.ptr[2u]; + v_curr_row.ptr[7u] += v_curr_row.ptr[3u]; +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + v_curr_row = wuffs_base__slice_u8__subslice_i(v_curr_row, 4u); + v_prev_row = wuffs_base__slice_u8__subslice_i(v_prev_row, 4u); + v_x += 1u; + } + } else if ((v_mode == 2u) || (v_mode == 3u) || (v_mode == 4u)) { + if (v_x_end >= 8u) { + while ((v_x < self->private_impl.f_width) && + (v_x <= (v_x_end - 8u)) && + (((uint64_t)(v_curr_row.len)) >= 36u) && + (((uint64_t)(v_prev_row.len)) >= 40u)) { + v_avx_pix = _mm256_lddqu_si256((const __m256i*)(const void*)(v_curr_row.ptr + 4u)); + if (v_mode == 2u) { + v_avx_prev = _mm256_lddqu_si256((const __m256i*)(const void*)(v_prev_row.ptr + 4u)); + } else if (v_mode == 3u) { + v_avx_prev = _mm256_lddqu_si256((const __m256i*)(const void*)(v_prev_row.ptr + 8u)); + } 
else { + v_avx_prev = _mm256_lddqu_si256((const __m256i*)(const void*)(v_prev_row.ptr + 0u)); + } + v_avx_pix = _mm256_add_epi8(v_avx_pix, v_avx_prev); + _mm256_storeu_si256((__m256i*)(void*)(v_curr_row.ptr + 4u), v_avx_pix); + v_curr_row = wuffs_base__slice_u8__subslice_i(v_curr_row, 32u); + v_prev_row = wuffs_base__slice_u8__subslice_i(v_prev_row, 32u); + v_x += 8u; + } + } + } + while ((v_x < v_x_end) && (v_x_end <= self->private_impl.f_width)) { + if ((((uint64_t)(v_prev_row.len)) < 12u) || (((uint64_t)(v_curr_row.len)) < 8u)) { + break; + } + if (v_mode == 0u) { +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + v_curr_row.ptr[7u] += 255u; +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + } else if (v_mode == 1u) { +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + v_curr_row.ptr[4u] += v_curr_row.ptr[0u]; + v_curr_row.ptr[5u] += v_curr_row.ptr[1u]; + v_curr_row.ptr[6u] += v_curr_row.ptr[2u]; + v_curr_row.ptr[7u] += v_curr_row.ptr[3u]; +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + } else if (v_mode == 2u) { +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + v_curr_row.ptr[4u] += v_prev_row.ptr[4u]; + v_curr_row.ptr[5u] += v_prev_row.ptr[5u]; + v_curr_row.ptr[6u] += v_prev_row.ptr[6u]; + v_curr_row.ptr[7u] += v_prev_row.ptr[7u]; +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + } else if (v_mode == 3u) { +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + v_curr_row.ptr[4u] += v_prev_row.ptr[8u]; + v_curr_row.ptr[5u] += v_prev_row.ptr[9u]; + v_curr_row.ptr[6u] += v_prev_row.ptr[10u]; + v_curr_row.ptr[7u] += v_prev_row.ptr[11u]; +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + } else if (v_mode == 4u) { +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored 
"-Wconversion" +#endif + v_curr_row.ptr[4u] += v_prev_row.ptr[0u]; + v_curr_row.ptr[5u] += v_prev_row.ptr[1u]; + v_curr_row.ptr[6u] += v_prev_row.ptr[2u]; + v_curr_row.ptr[7u] += v_prev_row.ptr[3u]; +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + } else if (v_mode == 5u) { + v_l0 = ((((uint32_t)(v_curr_row.ptr[0u])) + ((uint32_t)(v_prev_row.ptr[8u]))) / 2u); + v_l1 = ((((uint32_t)(v_curr_row.ptr[1u])) + ((uint32_t)(v_prev_row.ptr[9u]))) / 2u); + v_l2 = ((((uint32_t)(v_curr_row.ptr[2u])) + ((uint32_t)(v_prev_row.ptr[10u]))) / 2u); + v_l3 = ((((uint32_t)(v_curr_row.ptr[3u])) + ((uint32_t)(v_prev_row.ptr[11u]))) / 2u); +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + v_curr_row.ptr[4u] += ((uint8_t)(((v_l0 + ((uint32_t)(v_prev_row.ptr[4u]))) / 2u))); + v_curr_row.ptr[5u] += ((uint8_t)(((v_l1 + ((uint32_t)(v_prev_row.ptr[5u]))) / 2u))); + v_curr_row.ptr[6u] += ((uint8_t)(((v_l2 + ((uint32_t)(v_prev_row.ptr[6u]))) / 2u))); + v_curr_row.ptr[7u] += ((uint8_t)(((v_l3 + ((uint32_t)(v_prev_row.ptr[7u]))) / 2u))); +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + } else if (v_mode == 6u) { +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + v_curr_row.ptr[4u] += ((uint8_t)(((((uint32_t)(v_curr_row.ptr[0u])) + ((uint32_t)(v_prev_row.ptr[0u]))) / 2u))); + v_curr_row.ptr[5u] += ((uint8_t)(((((uint32_t)(v_curr_row.ptr[1u])) + ((uint32_t)(v_prev_row.ptr[1u]))) / 2u))); + v_curr_row.ptr[6u] += ((uint8_t)(((((uint32_t)(v_curr_row.ptr[2u])) + ((uint32_t)(v_prev_row.ptr[2u]))) / 2u))); + v_curr_row.ptr[7u] += ((uint8_t)(((((uint32_t)(v_curr_row.ptr[3u])) + ((uint32_t)(v_prev_row.ptr[3u]))) / 2u))); +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + } else if (v_mode == 7u) { +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + v_curr_row.ptr[4u] += 
((uint8_t)(((((uint32_t)(v_curr_row.ptr[0u])) + ((uint32_t)(v_prev_row.ptr[4u]))) / 2u))); + v_curr_row.ptr[5u] += ((uint8_t)(((((uint32_t)(v_curr_row.ptr[1u])) + ((uint32_t)(v_prev_row.ptr[5u]))) / 2u))); + v_curr_row.ptr[6u] += ((uint8_t)(((((uint32_t)(v_curr_row.ptr[2u])) + ((uint32_t)(v_prev_row.ptr[6u]))) / 2u))); + v_curr_row.ptr[7u] += ((uint8_t)(((((uint32_t)(v_curr_row.ptr[3u])) + ((uint32_t)(v_prev_row.ptr[7u]))) / 2u))); +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + } else if (v_mode == 8u) { +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + v_curr_row.ptr[4u] += ((uint8_t)(((((uint32_t)(v_prev_row.ptr[0u])) + ((uint32_t)(v_prev_row.ptr[4u]))) / 2u))); + v_curr_row.ptr[5u] += ((uint8_t)(((((uint32_t)(v_prev_row.ptr[1u])) + ((uint32_t)(v_prev_row.ptr[5u]))) / 2u))); + v_curr_row.ptr[6u] += ((uint8_t)(((((uint32_t)(v_prev_row.ptr[2u])) + ((uint32_t)(v_prev_row.ptr[6u]))) / 2u))); + v_curr_row.ptr[7u] += ((uint8_t)(((((uint32_t)(v_prev_row.ptr[3u])) + ((uint32_t)(v_prev_row.ptr[7u]))) / 2u))); +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + } else if (v_mode == 9u) { +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + v_curr_row.ptr[4u] += ((uint8_t)(((((uint32_t)(v_prev_row.ptr[4u])) + ((uint32_t)(v_prev_row.ptr[8u]))) / 2u))); + v_curr_row.ptr[5u] += ((uint8_t)(((((uint32_t)(v_prev_row.ptr[5u])) + ((uint32_t)(v_prev_row.ptr[9u]))) / 2u))); + v_curr_row.ptr[6u] += ((uint8_t)(((((uint32_t)(v_prev_row.ptr[6u])) + ((uint32_t)(v_prev_row.ptr[10u]))) / 2u))); + v_curr_row.ptr[7u] += ((uint8_t)(((((uint32_t)(v_prev_row.ptr[7u])) + ((uint32_t)(v_prev_row.ptr[11u]))) / 2u))); +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + } else if (v_mode == 10u) { + v_l0 = ((((uint32_t)(v_curr_row.ptr[0u])) + ((uint32_t)(v_prev_row.ptr[0u]))) / 2u); + v_l1 = ((((uint32_t)(v_curr_row.ptr[1u])) + 
((uint32_t)(v_prev_row.ptr[1u]))) / 2u); + v_l2 = ((((uint32_t)(v_curr_row.ptr[2u])) + ((uint32_t)(v_prev_row.ptr[2u]))) / 2u); + v_l3 = ((((uint32_t)(v_curr_row.ptr[3u])) + ((uint32_t)(v_prev_row.ptr[3u]))) / 2u); + v_t0 = ((((uint32_t)(v_prev_row.ptr[4u])) + ((uint32_t)(v_prev_row.ptr[8u]))) / 2u); + v_t1 = ((((uint32_t)(v_prev_row.ptr[5u])) + ((uint32_t)(v_prev_row.ptr[9u]))) / 2u); + v_t2 = ((((uint32_t)(v_prev_row.ptr[6u])) + ((uint32_t)(v_prev_row.ptr[10u]))) / 2u); + v_t3 = ((((uint32_t)(v_prev_row.ptr[7u])) + ((uint32_t)(v_prev_row.ptr[11u]))) / 2u); +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + v_curr_row.ptr[4u] += ((uint8_t)(((v_l0 + v_t0) / 2u))); + v_curr_row.ptr[5u] += ((uint8_t)(((v_l1 + v_t1) / 2u))); + v_curr_row.ptr[6u] += ((uint8_t)(((v_l2 + v_t2) / 2u))); + v_curr_row.ptr[7u] += ((uint8_t)(((v_l3 + v_t3) / 2u))); +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + } else if (v_mode == 11u) { + v_l0 = ((uint32_t)(v_curr_row.ptr[0u])); + v_l1 = ((uint32_t)(v_curr_row.ptr[1u])); + v_l2 = ((uint32_t)(v_curr_row.ptr[2u])); + v_l3 = ((uint32_t)(v_curr_row.ptr[3u])); + v_c0 = ((uint32_t)(v_prev_row.ptr[0u])); + v_c1 = ((uint32_t)(v_prev_row.ptr[1u])); + v_c2 = ((uint32_t)(v_prev_row.ptr[2u])); + v_c3 = ((uint32_t)(v_prev_row.ptr[3u])); + v_t0 = ((uint32_t)(v_prev_row.ptr[4u])); + v_t1 = ((uint32_t)(v_prev_row.ptr[5u])); + v_t2 = ((uint32_t)(v_prev_row.ptr[6u])); + v_t3 = ((uint32_t)(v_prev_row.ptr[7u])); + v_sum_l = (wuffs_webp__decoder__absolute_difference(self, v_c0, v_t0) + + wuffs_webp__decoder__absolute_difference(self, v_c1, v_t1) + + wuffs_webp__decoder__absolute_difference(self, v_c2, v_t2) + + wuffs_webp__decoder__absolute_difference(self, v_c3, v_t3)); + v_sum_t = (wuffs_webp__decoder__absolute_difference(self, v_c0, v_l0) + + wuffs_webp__decoder__absolute_difference(self, v_c1, v_l1) + + wuffs_webp__decoder__absolute_difference(self, v_c2, v_l2) + + 
wuffs_webp__decoder__absolute_difference(self, v_c3, v_l3)); + if (v_sum_l < v_sum_t) { +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + v_curr_row.ptr[4u] += ((uint8_t)(v_l0)); + v_curr_row.ptr[5u] += ((uint8_t)(v_l1)); + v_curr_row.ptr[6u] += ((uint8_t)(v_l2)); + v_curr_row.ptr[7u] += ((uint8_t)(v_l3)); +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + } else { +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + v_curr_row.ptr[4u] += ((uint8_t)(v_t0)); + v_curr_row.ptr[5u] += ((uint8_t)(v_t1)); + v_curr_row.ptr[6u] += ((uint8_t)(v_t2)); + v_curr_row.ptr[7u] += ((uint8_t)(v_t3)); +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + } + } else if (v_mode == 12u) { +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + v_curr_row.ptr[4u] += wuffs_webp__decoder__mode12(self, v_curr_row.ptr[0u], v_prev_row.ptr[4u], v_prev_row.ptr[0u]); + v_curr_row.ptr[5u] += wuffs_webp__decoder__mode12(self, v_curr_row.ptr[1u], v_prev_row.ptr[5u], v_prev_row.ptr[1u]); + v_curr_row.ptr[6u] += wuffs_webp__decoder__mode12(self, v_curr_row.ptr[2u], v_prev_row.ptr[6u], v_prev_row.ptr[2u]); + v_curr_row.ptr[7u] += wuffs_webp__decoder__mode12(self, v_curr_row.ptr[3u], v_prev_row.ptr[7u], v_prev_row.ptr[3u]); +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + } else if (v_mode == 13u) { +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + v_curr_row.ptr[4u] += wuffs_webp__decoder__mode13(self, v_curr_row.ptr[0u], v_prev_row.ptr[4u], v_prev_row.ptr[0u]); + v_curr_row.ptr[5u] += wuffs_webp__decoder__mode13(self, v_curr_row.ptr[1u], v_prev_row.ptr[5u], v_prev_row.ptr[1u]); + v_curr_row.ptr[6u] += wuffs_webp__decoder__mode13(self, v_curr_row.ptr[2u], v_prev_row.ptr[6u], v_prev_row.ptr[2u]); + v_curr_row.ptr[7u] += 
wuffs_webp__decoder__mode13(self, v_curr_row.ptr[3u], v_prev_row.ptr[7u], v_prev_row.ptr[3u]); +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + } + v_curr_row = wuffs_base__slice_u8__subslice_i(v_curr_row, 4u); + v_prev_row = wuffs_base__slice_u8__subslice_i(v_prev_row, 4u); + v_x += 1u; + } + } + v_y += 1u; + } + return wuffs_base__make_empty_struct(); +} +#endif // defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) +// ‼ WUFFS MULTI-FILE SECTION -x86_avx2 + // -------- func webp.decoder.get_quirk WUFFS_BASE__GENERATED_C_CODE @@ -84559,7 +100821,7 @@ wuffs_webp__decoder__do_decode_image_config( WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(5); } self->private_impl.f_frame_config_io_position = wuffs_base__u64__sat_add((a_src ? a_src->meta.pos : 0), ((uint64_t)(iop_a_src - io0_a_src))); - if ( ! self->private_impl.f_is_vp8_lossy && (a_dst != NULL)) { + if (( ! self->private_impl.f_is_vp8_lossy || self->private_impl.f_is_vp8x) && (a_dst != NULL)) { wuffs_base__image_config__set( a_dst, self->private_impl.f_pixfmt, @@ -84602,6 +100864,8 @@ wuffs_webp__decoder__do_decode_image_config_limited( uint32_t v_c32 = 0; uint64_t v_r_mark = 0; wuffs_base__status v_status = wuffs_base__make_status(NULL); + uint8_t v_flags = 0; + uint32_t v_mb_width = 0; const uint8_t* iop_a_src = NULL; const uint8_t* io0_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL; @@ -84684,8 +100948,7 @@ wuffs_webp__decoder__do_decode_image_config_limited( self->private_impl.f_is_vp8_lossy = true; } else if (v_c32 == 1278758998u) { } else if (v_c32 == 1480085590u) { - status = wuffs_base__make_status(wuffs_webp__error__unsupported_webp_file); - goto exit; + self->private_impl.f_is_vp8x = true; } else { status = wuffs_base__make_status(wuffs_webp__error__bad_header); goto exit; @@ -84724,6 +100987,151 @@ wuffs_webp__decoder__do_decode_image_config_limited( goto exit; } self->private_impl.f_sub_chunk_has_padding = ((self->private_impl.f_sub_chunk_length & 1u) != 0u); + if 
(self->private_impl.f_is_vp8x) { + if (self->private_impl.f_sub_chunk_length < 10u) { + status = wuffs_base__make_status(wuffs_webp__error__bad_header); + goto exit; + } + { + WUFFS_BASE__COROUTINE_SUSPENSION_POINT(7); + if (WUFFS_BASE__UNLIKELY(iop_a_src == io2_a_src)) { + status = wuffs_base__make_status(wuffs_base__suspension__short_read); + goto suspend; + } + uint8_t t_3 = *iop_a_src++; + v_flags = t_3; + } + self->private_impl.f_has_alpha = (((uint8_t)(v_flags & 16u)) != 0u); + { + WUFFS_BASE__COROUTINE_SUSPENSION_POINT(8); + uint32_t t_4; + if (WUFFS_BASE__LIKELY(io2_a_src - iop_a_src >= 3)) { + t_4 = ((uint32_t)(wuffs_base__peek_u24le__no_bounds_check(iop_a_src))); + iop_a_src += 3; + } else { + self->private_data.s_do_decode_image_config_limited.scratch = 0; + WUFFS_BASE__COROUTINE_SUSPENSION_POINT(9); + while (true) { + if (WUFFS_BASE__UNLIKELY(iop_a_src == io2_a_src)) { + status = wuffs_base__make_status(wuffs_base__suspension__short_read); + goto suspend; + } + uint64_t* scratch = &self->private_data.s_do_decode_image_config_limited.scratch; + uint32_t num_bits_4 = ((uint32_t)(*scratch >> 56)); + *scratch <<= 8; + *scratch >>= 8; + *scratch |= ((uint64_t)(*iop_a_src++)) << num_bits_4; + if (num_bits_4 == 16) { + t_4 = ((uint32_t)(*scratch)); + break; + } + num_bits_4 += 8u; + *scratch |= ((uint64_t)(num_bits_4)) << 56; + } + } + v_c32 = t_4; + } + { + WUFFS_BASE__COROUTINE_SUSPENSION_POINT(10); + uint32_t t_5; + if (WUFFS_BASE__LIKELY(io2_a_src - iop_a_src >= 3)) { + t_5 = ((uint32_t)(wuffs_base__peek_u24le__no_bounds_check(iop_a_src))); + iop_a_src += 3; + } else { + self->private_data.s_do_decode_image_config_limited.scratch = 0; + WUFFS_BASE__COROUTINE_SUSPENSION_POINT(11); + while (true) { + if (WUFFS_BASE__UNLIKELY(iop_a_src == io2_a_src)) { + status = wuffs_base__make_status(wuffs_base__suspension__short_read); + goto suspend; + } + uint64_t* scratch = &self->private_data.s_do_decode_image_config_limited.scratch; + uint32_t num_bits_5 = 
((uint32_t)(*scratch >> 56)); + *scratch <<= 8; + *scratch >>= 8; + *scratch |= ((uint64_t)(*iop_a_src++)) << num_bits_5; + if (num_bits_5 == 16) { + t_5 = ((uint32_t)(*scratch)); + break; + } + num_bits_5 += 8u; + *scratch |= ((uint64_t)(num_bits_5)) << 56; + } + } + v_c32 = t_5; + } + self->private_impl.f_width = ((v_c32 + 1u) & 16383u); + { + WUFFS_BASE__COROUTINE_SUSPENSION_POINT(12); + uint32_t t_6; + if (WUFFS_BASE__LIKELY(io2_a_src - iop_a_src >= 3)) { + t_6 = ((uint32_t)(wuffs_base__peek_u24le__no_bounds_check(iop_a_src))); + iop_a_src += 3; + } else { + self->private_data.s_do_decode_image_config_limited.scratch = 0; + WUFFS_BASE__COROUTINE_SUSPENSION_POINT(13); + while (true) { + if (WUFFS_BASE__UNLIKELY(iop_a_src == io2_a_src)) { + status = wuffs_base__make_status(wuffs_base__suspension__short_read); + goto suspend; + } + uint64_t* scratch = &self->private_data.s_do_decode_image_config_limited.scratch; + uint32_t num_bits_6 = ((uint32_t)(*scratch >> 56)); + *scratch <<= 8; + *scratch >>= 8; + *scratch |= ((uint64_t)(*iop_a_src++)) << num_bits_6; + if (num_bits_6 == 16) { + t_6 = ((uint32_t)(*scratch)); + break; + } + num_bits_6 += 8u; + *scratch |= ((uint64_t)(num_bits_6)) << 56; + } + } + v_c32 = t_6; + } + self->private_impl.f_height = ((v_c32 + 1u) & 16383u); + if ((self->private_impl.f_width == 0u) || (self->private_impl.f_height == 0u)) { + status = wuffs_base__make_status(wuffs_webp__error__bad_header); + goto exit; + } + v_mb_width = ((self->private_impl.f_width + 15u) / 16u); + self->private_impl.f_vp8x_workbuf_len = (((uint64_t)(v_mb_width)) * ((uint64_t)((((self->private_impl.f_height + 15u) / 16u) * 384u)))); + if (self->private_impl.f_has_alpha) { + self->private_impl.f_vp8l_alpha_workbuf_len = ((4u * ((uint64_t)(self->private_impl.f_width)) * ((uint64_t)(self->private_impl.f_height))) + (16u * ((uint64_t)((((self->private_impl.f_width + 3u) >> 2u) * ((self->private_impl.f_height + 3u) >> 2u)))))); + if 
(self->private_impl.f_vp8l_alpha_workbuf_len > self->private_impl.f_vp8x_workbuf_len) { + self->private_impl.f_vp8x_workbuf_len = self->private_impl.f_vp8l_alpha_workbuf_len; + } + self->private_impl.f_vp8x_workbuf_len += (((uint64_t)(self->private_impl.f_width)) * ((uint64_t)(self->private_impl.f_height))); + } + if (self->private_impl.f_has_alpha) { + self->private_impl.f_pixfmt = 2164295816u; + } else { + self->private_impl.f_pixfmt = 2415954056u; + } + wuffs_private_impl__u32__sat_sub_indirect(&self->private_impl.f_sub_chunk_length, 10u); + if (self->private_impl.f_sub_chunk_length > 0u) { + self->private_data.s_do_decode_image_config_limited.scratch = self->private_impl.f_sub_chunk_length; + WUFFS_BASE__COROUTINE_SUSPENSION_POINT(14); + if (self->private_data.s_do_decode_image_config_limited.scratch > ((uint64_t)(io2_a_src - iop_a_src))) { + self->private_data.s_do_decode_image_config_limited.scratch -= ((uint64_t)(io2_a_src - iop_a_src)); + iop_a_src = io2_a_src; + status = wuffs_base__make_status(wuffs_base__suspension__short_read); + goto suspend; + } + iop_a_src += self->private_data.s_do_decode_image_config_limited.scratch; + } + if (self->private_impl.f_sub_chunk_has_padding) { + WUFFS_BASE__COROUTINE_SUSPENSION_POINT(15); + if (WUFFS_BASE__UNLIKELY(iop_a_src == io2_a_src)) { + status = wuffs_base__make_status(wuffs_base__suspension__short_read); + goto suspend; + } + iop_a_src++; + } + status = wuffs_base__make_status(NULL); + goto ok; + } while (true) { { const bool o_0_closed_a_src = a_src->meta.closed; @@ -84741,8 +101149,8 @@ wuffs_webp__decoder__do_decode_image_config_limited( if (a_src) { a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr)); } - wuffs_base__status t_3 = wuffs_vp8__decoder__decode_image_config(&self->private_data.f_vp8, a_dst, a_src); - v_status = t_3; + wuffs_base__status t_7 = wuffs_vp8__decoder__decode_image_config(&self->private_data.f_vp8, a_dst, a_src); + v_status = t_7; if (a_src) { iop_a_src = a_src->data.ptr + 
a_src->meta.ri; } @@ -84752,8 +101160,8 @@ wuffs_webp__decoder__do_decode_image_config_limited( if (a_src) { a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr)); } - wuffs_base__status t_4 = wuffs_webp__decoder__do_decode_image_config_limited_vp8l(self, a_src); - v_status = t_4; + wuffs_base__status t_8 = wuffs_webp__decoder__do_decode_image_config_limited_vp8l(self, a_src); + v_status = t_8; if (a_src) { iop_a_src = a_src->data.ptr + a_src->meta.ri; } @@ -84782,7 +101190,7 @@ wuffs_webp__decoder__do_decode_image_config_limited( goto exit; } status = v_status; - WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(7); + WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(16); } ok: @@ -84876,10 +101284,7 @@ wuffs_webp__decoder__do_decode_image_config_limited_vp8l( v_c32 >>= 14u; self->private_impl.f_height = ((v_c32 & 16383u) + 1u); v_c32 >>= 14u; - self->private_impl.f_pixfmt = 2415954056u; - if ((v_c32 & 1u) != 0u) { - self->private_impl.f_pixfmt = 2164295816u; - } + self->private_impl.f_pixfmt = 2164295816u; v_c32 >>= 1u; if (v_c32 != 0u) { status = wuffs_base__make_status(wuffs_webp__error__bad_header); @@ -84946,14 +101351,14 @@ wuffs_webp__decoder__decode_frame_config( WUFFS_BASE__COROUTINE_SUSPENSION_POINT_0; while (true) { - if (self->private_impl.f_is_vp8_lossy) { + if (self->private_impl.f_is_vp8x || ! 
self->private_impl.f_is_vp8_lossy) { { - wuffs_base__status t_0 = wuffs_vp8__decoder__decode_frame_config(&self->private_data.f_vp8, a_dst, a_src); + wuffs_base__status t_0 = wuffs_webp__decoder__do_decode_frame_config(self, a_dst, a_src); v_status = t_0; } } else { { - wuffs_base__status t_1 = wuffs_webp__decoder__do_decode_frame_config(self, a_dst, a_src); + wuffs_base__status t_1 = wuffs_vp8__decoder__decode_frame_config(&self->private_data.f_vp8, a_dst, a_src); v_status = t_1; } } @@ -85106,31 +101511,89 @@ wuffs_webp__decoder__decode_frame( wuffs_base__status status = wuffs_base__make_status(NULL); wuffs_base__status v_status = wuffs_base__make_status(NULL); + uint64_t v_r_mark = 0; + + const uint8_t* iop_a_src = NULL; + const uint8_t* io0_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL; + const uint8_t* io1_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL; + const uint8_t* io2_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL; + if (a_src && a_src->data.ptr) { + io0_a_src = a_src->data.ptr; + io1_a_src = io0_a_src + a_src->meta.ri; + iop_a_src = io1_a_src; + io2_a_src = io0_a_src + a_src->meta.wi; + } uint32_t coro_susp_point = self->private_impl.p_decode_frame; switch (coro_susp_point) { WUFFS_BASE__COROUTINE_SUSPENSION_POINT_0; while (true) { - if (self->private_impl.f_is_vp8_lossy) { + if (self->private_impl.f_is_vp8x) { { - wuffs_base__status t_0 = wuffs_vp8__decoder__decode_frame(&self->private_data.f_vp8, + if (a_src) { + a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr)); + } + wuffs_base__status t_0 = wuffs_webp__decoder__do_decode_frame_vp8x(self, a_dst, a_src, a_blend, a_workbuf, a_opts); v_status = t_0; + if (a_src) { + iop_a_src = a_src->data.ptr + a_src->meta.ri; + } + } + } else if (self->private_impl.f_is_vp8_lossy) { + { + const bool o_0_closed_a_src = a_src->meta.closed; + const uint8_t* o_0_io2_a_src = io2_a_src; + wuffs_private_impl__io_reader__limit(&io2_a_src, iop_a_src, + ((uint64_t)(self->private_impl.f_sub_chunk_length))); + if (a_src) { + 
size_t n = ((size_t)(io2_a_src - a_src->data.ptr)); + a_src->meta.closed = a_src->meta.closed && (a_src->meta.wi <= n); + a_src->meta.wi = n; + } + v_r_mark = ((uint64_t)(iop_a_src - io0_a_src)); + { + if (a_src) { + a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr)); + } + wuffs_base__status t_1 = wuffs_vp8__decoder__decode_frame(&self->private_data.f_vp8, + a_dst, + a_src, + a_blend, + a_workbuf, + a_opts); + v_status = t_1; + if (a_src) { + iop_a_src = a_src->data.ptr + a_src->meta.ri; + } + } + wuffs_private_impl__u32__sat_sub_indirect(&self->private_impl.f_sub_chunk_length, ((uint32_t)(wuffs_private_impl__io__count_since(v_r_mark, ((uint64_t)(iop_a_src - io0_a_src)))))); + io2_a_src = o_0_io2_a_src; + if (a_src) { + a_src->meta.closed = o_0_closed_a_src; + a_src->meta.wi = ((size_t)(io2_a_src - a_src->data.ptr)); + } } } else { { - wuffs_base__status t_1 = wuffs_webp__decoder__do_decode_frame(self, + if (a_src) { + a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr)); + } + wuffs_base__status t_2 = wuffs_webp__decoder__do_decode_frame(self, a_dst, a_src, a_blend, a_workbuf, a_opts); - v_status = t_1; + v_status = t_2; + if (a_src) { + iop_a_src = a_src->data.ptr + a_src->meta.ri; + } } } if ((v_status.repr == wuffs_base__suspension__short_read) && (a_src && a_src->meta.closed)) { @@ -85153,12 +101616,647 @@ wuffs_webp__decoder__decode_frame( goto exit; exit: + if (a_src && a_src->data.ptr) { + a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr)); + } + if (wuffs_base__status__is_error(&status)) { self->private_impl.magic = WUFFS_BASE__DISABLED; } return status; } +// -------- func webp.decoder.do_decode_frame_vp8x + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__status +wuffs_webp__decoder__do_decode_frame_vp8x( + wuffs_webp__decoder* self, + wuffs_base__pixel_buffer* a_dst, + wuffs_base__io_buffer* a_src, + wuffs_base__pixel_blend a_blend, + wuffs_base__slice_u8 a_workbuf, + wuffs_base__decode_frame_options* a_opts) { + wuffs_base__status 
status = wuffs_base__make_status(NULL); + + uint32_t v_c32 = 0; + uint32_t v_chunk_length = 0; + bool v_chunk_padding = false; + wuffs_base__status v_status = wuffs_base__make_status(NULL); + uint64_t v_r_mark = 0; + uint64_t v_alpha_offset = 0; + uint32_t v_alph_length = 0; + uint8_t v_alph_header = 0; + uint8_t v_alph_comp = 0; + uint8_t v_alph_filter = 0; + uint64_t v_alpha_i = 0; + uint64_t v_alpha_n = 0; + uint32_t v_y = 0; + uint32_t v_x = 0; + wuffs_base__table_u8 v_tab = {0}; + wuffs_base__slice_u8 v_row = {0}; + uint64_t v_row_idx = 0; + + const uint8_t* iop_a_src = NULL; + const uint8_t* io0_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL; + const uint8_t* io1_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL; + const uint8_t* io2_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL; + if (a_src && a_src->data.ptr) { + io0_a_src = a_src->data.ptr; + io1_a_src = io0_a_src + a_src->meta.ri; + iop_a_src = io1_a_src; + io2_a_src = io0_a_src + a_src->meta.wi; + } + + uint32_t coro_susp_point = self->private_impl.p_do_decode_frame_vp8x; + if (coro_susp_point) { + v_c32 = self->private_data.s_do_decode_frame_vp8x.v_c32; + v_chunk_length = self->private_data.s_do_decode_frame_vp8x.v_chunk_length; + v_chunk_padding = self->private_data.s_do_decode_frame_vp8x.v_chunk_padding; + v_alpha_offset = self->private_data.s_do_decode_frame_vp8x.v_alpha_offset; + v_alph_length = self->private_data.s_do_decode_frame_vp8x.v_alph_length; + v_alph_filter = self->private_data.s_do_decode_frame_vp8x.v_alph_filter; + v_alpha_i = self->private_data.s_do_decode_frame_vp8x.v_alpha_i; + v_alpha_n = self->private_data.s_do_decode_frame_vp8x.v_alpha_n; + } + switch (coro_susp_point) { + WUFFS_BASE__COROUTINE_SUSPENSION_POINT_0; + + if (self->private_impl.f_call_sequence == 64u) { + } else if (self->private_impl.f_call_sequence < 64u) { + if (a_src) { + a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr)); + } + WUFFS_BASE__COROUTINE_SUSPENSION_POINT(1); + status = 
wuffs_webp__decoder__do_decode_frame_config(self, NULL, a_src); + if (a_src) { + iop_a_src = a_src->data.ptr + a_src->meta.ri; + } + if (status.repr) { + goto suspend; + } + } else { + status = wuffs_base__make_status(wuffs_base__note__end_of_data); + goto ok; + } + v_alpha_offset = self->private_impl.f_vp8x_workbuf_len; + if (self->private_impl.f_has_alpha) { + v_alpha_offset -= (((uint64_t)(self->private_impl.f_width)) * ((uint64_t)(self->private_impl.f_height))); + } + while (true) { + { + WUFFS_BASE__COROUTINE_SUSPENSION_POINT(2); + uint32_t t_0; + if (WUFFS_BASE__LIKELY(io2_a_src - iop_a_src >= 4)) { + t_0 = wuffs_base__peek_u32le__no_bounds_check(iop_a_src); + iop_a_src += 4; + } else { + self->private_data.s_do_decode_frame_vp8x.scratch = 0; + WUFFS_BASE__COROUTINE_SUSPENSION_POINT(3); + while (true) { + if (WUFFS_BASE__UNLIKELY(iop_a_src == io2_a_src)) { + status = wuffs_base__make_status(wuffs_base__suspension__short_read); + goto suspend; + } + uint64_t* scratch = &self->private_data.s_do_decode_frame_vp8x.scratch; + uint32_t num_bits_0 = ((uint32_t)(*scratch >> 56)); + *scratch <<= 8; + *scratch >>= 8; + *scratch |= ((uint64_t)(*iop_a_src++)) << num_bits_0; + if (num_bits_0 == 24) { + t_0 = ((uint32_t)(*scratch)); + break; + } + num_bits_0 += 8u; + *scratch |= ((uint64_t)(num_bits_0)) << 56; + } + } + v_c32 = t_0; + } + { + WUFFS_BASE__COROUTINE_SUSPENSION_POINT(4); + uint32_t t_1; + if (WUFFS_BASE__LIKELY(io2_a_src - iop_a_src >= 4)) { + t_1 = wuffs_base__peek_u32le__no_bounds_check(iop_a_src); + iop_a_src += 4; + } else { + self->private_data.s_do_decode_frame_vp8x.scratch = 0; + WUFFS_BASE__COROUTINE_SUSPENSION_POINT(5); + while (true) { + if (WUFFS_BASE__UNLIKELY(iop_a_src == io2_a_src)) { + status = wuffs_base__make_status(wuffs_base__suspension__short_read); + goto suspend; + } + uint64_t* scratch = &self->private_data.s_do_decode_frame_vp8x.scratch; + uint32_t num_bits_1 = ((uint32_t)(*scratch >> 56)); + *scratch <<= 8; + *scratch >>= 8; + 
*scratch |= ((uint64_t)(*iop_a_src++)) << num_bits_1; + if (num_bits_1 == 24) { + t_1 = ((uint32_t)(*scratch)); + break; + } + num_bits_1 += 8u; + *scratch |= ((uint64_t)(num_bits_1)) << 56; + } + } + v_chunk_length = t_1; + } + v_chunk_padding = ((v_chunk_length & 1u) != 0u); + if (v_c32 == 1213221953u) { + if ((v_chunk_length < 1u) || ! self->private_impl.f_has_alpha) { + self->private_data.s_do_decode_frame_vp8x.scratch = v_chunk_length; + WUFFS_BASE__COROUTINE_SUSPENSION_POINT(6); + if (self->private_data.s_do_decode_frame_vp8x.scratch > ((uint64_t)(io2_a_src - iop_a_src))) { + self->private_data.s_do_decode_frame_vp8x.scratch -= ((uint64_t)(io2_a_src - iop_a_src)); + iop_a_src = io2_a_src; + status = wuffs_base__make_status(wuffs_base__suspension__short_read); + goto suspend; + } + iop_a_src += self->private_data.s_do_decode_frame_vp8x.scratch; + if (v_chunk_padding) { + WUFFS_BASE__COROUTINE_SUSPENSION_POINT(7); + if (WUFFS_BASE__UNLIKELY(iop_a_src == io2_a_src)) { + status = wuffs_base__make_status(wuffs_base__suspension__short_read); + goto suspend; + } + iop_a_src++; + } + continue; + } + { + WUFFS_BASE__COROUTINE_SUSPENSION_POINT(8); + if (WUFFS_BASE__UNLIKELY(iop_a_src == io2_a_src)) { + status = wuffs_base__make_status(wuffs_base__suspension__short_read); + goto suspend; + } + uint8_t t_2 = *iop_a_src++; + v_alph_header = t_2; + } + v_alph_comp = ((uint8_t)(v_alph_header & 3u)); + v_alph_filter = ((uint8_t)(((uint8_t)(v_alph_header >> 2u)) & 3u)); + v_alph_length = wuffs_base__u32__sat_sub(v_chunk_length, 1u); + if (v_alph_comp == 0u) { + v_alpha_n = (((uint64_t)(self->private_impl.f_width)) * ((uint64_t)(self->private_impl.f_height))); + v_alpha_i = 0u; + while (v_alpha_i < v_alpha_n) { + if (v_alph_length == 0u) { + break; + } + if (((uint64_t)(v_alpha_offset + v_alpha_i)) < ((uint64_t)(a_workbuf.len))) { + { + WUFFS_BASE__COROUTINE_SUSPENSION_POINT(9); + if (WUFFS_BASE__UNLIKELY(iop_a_src == io2_a_src)) { + status = 
wuffs_base__make_status(wuffs_base__suspension__short_read); + goto suspend; + } + uint8_t t_3 = *iop_a_src++; + a_workbuf.ptr[((uint64_t)(v_alpha_offset + v_alpha_i))] = t_3; + } + } + v_alpha_i += 1u; + wuffs_private_impl__u32__sat_sub_indirect(&v_alph_length, 1u); + } + } else { + self->private_impl.f_workbuf_offset_for_transform[0u] = (4u * self->private_impl.f_width * self->private_impl.f_height); + self->private_impl.f_workbuf_offset_for_transform[1u] = (self->private_impl.f_workbuf_offset_for_transform[0u] + (4u * ((self->private_impl.f_width + 3u) >> 2u) * ((self->private_impl.f_height + 3u) >> 2u))); + self->private_impl.f_workbuf_offset_for_transform[2u] = (self->private_impl.f_workbuf_offset_for_transform[1u] + (4u * ((self->private_impl.f_width + 3u) >> 2u) * ((self->private_impl.f_height + 3u) >> 2u))); + self->private_impl.f_workbuf_offset_for_transform[3u] = (self->private_impl.f_workbuf_offset_for_transform[2u] + (4u * ((self->private_impl.f_width + 3u) >> 2u) * ((self->private_impl.f_height + 3u) >> 2u))); + self->private_impl.f_call_sequence = 64u; + while (true) { + { + const bool o_0_closed_a_src = a_src->meta.closed; + const uint8_t* o_0_io2_a_src = io2_a_src; + wuffs_private_impl__io_reader__limit(&io2_a_src, iop_a_src, + ((uint64_t)(v_alph_length))); + if (a_src) { + size_t n = ((size_t)(io2_a_src - a_src->data.ptr)); + a_src->meta.closed = a_src->meta.closed && (a_src->meta.wi <= n); + a_src->meta.wi = n; + } + v_r_mark = ((uint64_t)(iop_a_src - io0_a_src)); + { + if (a_src) { + a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr)); + } + wuffs_base__status t_4 = wuffs_webp__decoder__do_decode_frame(self, + a_dst, + a_src, + a_blend, + a_workbuf, + a_opts); + v_status = t_4; + if (a_src) { + iop_a_src = a_src->data.ptr + a_src->meta.ri; + } + } + wuffs_private_impl__u32__sat_sub_indirect(&v_alph_length, ((uint32_t)(wuffs_private_impl__io__count_since(v_r_mark, ((uint64_t)(iop_a_src - io0_a_src)))))); + io2_a_src = o_0_io2_a_src; + if 
(a_src) { + a_src->meta.closed = o_0_closed_a_src; + a_src->meta.wi = ((size_t)(io2_a_src - a_src->data.ptr)); + } + } + if (wuffs_base__status__is_ok(&v_status)) { + break; + } else if ( ! wuffs_base__status__is_suspension(&v_status)) { + status = v_status; + if (wuffs_base__status__is_error(&status)) { + goto exit; + } else if (wuffs_base__status__is_suspension(&status)) { + status = wuffs_base__make_status(wuffs_base__error__cannot_return_a_suspension); + goto exit; + } + goto ok; + } + status = v_status; + WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(10); + } + v_alpha_n = (((uint64_t)(self->private_impl.f_width)) * ((uint64_t)(self->private_impl.f_height))); + v_alpha_i = 0u; + v_row_idx = 1u; + while (v_alpha_i < v_alpha_n) { + if ((((uint64_t)(v_alpha_offset + v_alpha_i)) < ((uint64_t)(a_workbuf.len))) && (v_row_idx < ((uint64_t)(a_workbuf.len)))) { + a_workbuf.ptr[((uint64_t)(v_alpha_offset + v_alpha_i))] = a_workbuf.ptr[v_row_idx]; + } + v_alpha_i += 1u; + v_row_idx += 4u; + } + self->private_impl.f_call_sequence = 64u; + } + if (v_alph_filter == 1u) { + wuffs_webp__decoder__apply_alpha_filter_horizontal(self, a_workbuf, v_alpha_offset); + } else if (v_alph_filter == 2u) { + wuffs_webp__decoder__apply_alpha_filter_vertical(self, a_workbuf, v_alpha_offset); + } else if (v_alph_filter == 3u) { + wuffs_webp__decoder__apply_alpha_filter_gradient(self, a_workbuf, v_alpha_offset); + } + if (v_alph_length > 0u) { + self->private_data.s_do_decode_frame_vp8x.scratch = v_alph_length; + WUFFS_BASE__COROUTINE_SUSPENSION_POINT(11); + if (self->private_data.s_do_decode_frame_vp8x.scratch > ((uint64_t)(io2_a_src - iop_a_src))) { + self->private_data.s_do_decode_frame_vp8x.scratch -= ((uint64_t)(io2_a_src - iop_a_src)); + iop_a_src = io2_a_src; + status = wuffs_base__make_status(wuffs_base__suspension__short_read); + goto suspend; + } + iop_a_src += self->private_data.s_do_decode_frame_vp8x.scratch; + } + if (v_chunk_padding) { + 
WUFFS_BASE__COROUTINE_SUSPENSION_POINT(12); + if (WUFFS_BASE__UNLIKELY(iop_a_src == io2_a_src)) { + status = wuffs_base__make_status(wuffs_base__suspension__short_read); + goto suspend; + } + iop_a_src++; + } + } else if ((v_c32 == 540561494u) || (v_c32 == 1278758998u)) { + self->private_impl.f_is_vp8_lossy = (v_c32 == 540561494u); + self->private_impl.f_sub_chunk_length = v_chunk_length; + self->private_impl.f_sub_chunk_has_padding = v_chunk_padding; + break; + } else { + self->private_data.s_do_decode_frame_vp8x.scratch = v_chunk_length; + WUFFS_BASE__COROUTINE_SUSPENSION_POINT(13); + if (self->private_data.s_do_decode_frame_vp8x.scratch > ((uint64_t)(io2_a_src - iop_a_src))) { + self->private_data.s_do_decode_frame_vp8x.scratch -= ((uint64_t)(io2_a_src - iop_a_src)); + iop_a_src = io2_a_src; + status = wuffs_base__make_status(wuffs_base__suspension__short_read); + goto suspend; + } + iop_a_src += self->private_data.s_do_decode_frame_vp8x.scratch; + if (v_chunk_padding) { + WUFFS_BASE__COROUTINE_SUSPENSION_POINT(14); + if (WUFFS_BASE__UNLIKELY(iop_a_src == io2_a_src)) { + status = wuffs_base__make_status(wuffs_base__suspension__short_read); + goto suspend; + } + iop_a_src++; + } + } + } + if (self->private_impl.f_is_vp8_lossy) { + while (true) { + { + const bool o_1_closed_a_src = a_src->meta.closed; + const uint8_t* o_1_io2_a_src = io2_a_src; + wuffs_private_impl__io_reader__limit(&io2_a_src, iop_a_src, + ((uint64_t)(self->private_impl.f_sub_chunk_length))); + if (a_src) { + size_t n = ((size_t)(io2_a_src - a_src->data.ptr)); + a_src->meta.closed = a_src->meta.closed && (a_src->meta.wi <= n); + a_src->meta.wi = n; + } + v_r_mark = ((uint64_t)(iop_a_src - io0_a_src)); + { + if (a_src) { + a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr)); + } + wuffs_base__status t_5 = wuffs_vp8__decoder__decode_image_config(&self->private_data.f_vp8, NULL, a_src); + v_status = t_5; + if (a_src) { + iop_a_src = a_src->data.ptr + a_src->meta.ri; + } + } + 
wuffs_private_impl__u32__sat_sub_indirect(&self->private_impl.f_sub_chunk_length, ((uint32_t)(wuffs_private_impl__io__count_since(v_r_mark, ((uint64_t)(iop_a_src - io0_a_src)))))); + io2_a_src = o_1_io2_a_src; + if (a_src) { + a_src->meta.closed = o_1_closed_a_src; + a_src->meta.wi = ((size_t)(io2_a_src - a_src->data.ptr)); + } + } + if (wuffs_base__status__is_ok(&v_status)) { + break; + } else if ( ! wuffs_base__status__is_suspension(&v_status)) { + status = v_status; + if (wuffs_base__status__is_error(&status)) { + goto exit; + } else if (wuffs_base__status__is_suspension(&status)) { + status = wuffs_base__make_status(wuffs_base__error__cannot_return_a_suspension); + goto exit; + } + goto ok; + } + status = v_status; + WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(15); + } + while (true) { + { + if (a_src) { + a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr)); + } + wuffs_base__status t_6 = wuffs_vp8__decoder__decode_frame_config(&self->private_data.f_vp8, NULL, a_src); + v_status = t_6; + if (a_src) { + iop_a_src = a_src->data.ptr + a_src->meta.ri; + } + } + if (wuffs_base__status__is_ok(&v_status)) { + break; + } else if ( ! 
wuffs_base__status__is_suspension(&v_status)) { + status = v_status; + if (wuffs_base__status__is_error(&status)) { + goto exit; + } else if (wuffs_base__status__is_suspension(&status)) { + status = wuffs_base__make_status(wuffs_base__error__cannot_return_a_suspension); + goto exit; + } + goto ok; + } + status = v_status; + WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(16); + } + while (true) { + { + const bool o_2_closed_a_src = a_src->meta.closed; + const uint8_t* o_2_io2_a_src = io2_a_src; + wuffs_private_impl__io_reader__limit(&io2_a_src, iop_a_src, + ((uint64_t)(self->private_impl.f_sub_chunk_length))); + if (a_src) { + size_t n = ((size_t)(io2_a_src - a_src->data.ptr)); + a_src->meta.closed = a_src->meta.closed && (a_src->meta.wi <= n); + a_src->meta.wi = n; + } + v_r_mark = ((uint64_t)(iop_a_src - io0_a_src)); + { + if (a_src) { + a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr)); + } + wuffs_base__status t_7 = wuffs_vp8__decoder__decode_frame(&self->private_data.f_vp8, + a_dst, + a_src, + a_blend, + a_workbuf, + a_opts); + v_status = t_7; + if (a_src) { + iop_a_src = a_src->data.ptr + a_src->meta.ri; + } + } + wuffs_private_impl__u32__sat_sub_indirect(&self->private_impl.f_sub_chunk_length, ((uint32_t)(wuffs_private_impl__io__count_since(v_r_mark, ((uint64_t)(iop_a_src - io0_a_src)))))); + io2_a_src = o_2_io2_a_src; + if (a_src) { + a_src->meta.closed = o_2_closed_a_src; + a_src->meta.wi = ((size_t)(io2_a_src - a_src->data.ptr)); + } + } + if (wuffs_base__status__is_ok(&v_status)) { + break; + } else if ( ! 
wuffs_base__status__is_suspension(&v_status)) { + status = v_status; + if (wuffs_base__status__is_error(&status)) { + goto exit; + } else if (wuffs_base__status__is_suspension(&status)) { + status = wuffs_base__make_status(wuffs_base__error__cannot_return_a_suspension); + goto exit; + } + goto ok; + } + status = v_status; + WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(17); + } + } else { + status = wuffs_base__make_status(wuffs_webp__error__unsupported_webp_file); + goto exit; + } + if (self->private_impl.f_has_alpha) { + v_tab = wuffs_base__pixel_buffer__plane(a_dst, 0u); + v_y = 0u; + while (v_y < self->private_impl.f_height) { + v_row = wuffs_private_impl__table_u8__row_u32(v_tab, v_y); + v_x = 0u; + while (v_x < self->private_impl.f_width) { + v_row_idx = ((((uint64_t)(v_x)) * 4u) + 3u); + v_alpha_i = ((uint64_t)(((uint64_t)(v_alpha_offset + (((uint64_t)(v_y)) * ((uint64_t)(self->private_impl.f_width))))) + ((uint64_t)(v_x)))); + if ((v_row_idx < ((uint64_t)(v_row.len))) && (v_alpha_i < ((uint64_t)(a_workbuf.len)))) { + v_row.ptr[v_row_idx] = a_workbuf.ptr[v_alpha_i]; + } + v_x += 1u; + } + v_y += 1u; + } + } + self->private_impl.f_call_sequence = 96u; + + ok: + self->private_impl.p_do_decode_frame_vp8x = 0; + goto exit; + } + + goto suspend; + suspend: + self->private_impl.p_do_decode_frame_vp8x = wuffs_base__status__is_suspension(&status) ? 
coro_susp_point : 0; + self->private_data.s_do_decode_frame_vp8x.v_c32 = v_c32; + self->private_data.s_do_decode_frame_vp8x.v_chunk_length = v_chunk_length; + self->private_data.s_do_decode_frame_vp8x.v_chunk_padding = v_chunk_padding; + self->private_data.s_do_decode_frame_vp8x.v_alpha_offset = v_alpha_offset; + self->private_data.s_do_decode_frame_vp8x.v_alph_length = v_alph_length; + self->private_data.s_do_decode_frame_vp8x.v_alph_filter = v_alph_filter; + self->private_data.s_do_decode_frame_vp8x.v_alpha_i = v_alpha_i; + self->private_data.s_do_decode_frame_vp8x.v_alpha_n = v_alpha_n; + + goto exit; + exit: + if (a_src && a_src->data.ptr) { + a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr)); + } + + return status; +} + +// -------- func webp.decoder.apply_alpha_filter_horizontal + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_webp__decoder__apply_alpha_filter_horizontal( + wuffs_webp__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_alpha_offset) { + uint32_t v_y = 0; + uint32_t v_x = 0; + uint64_t v_i = 0; + uint8_t v_prev = 0; + + v_y = 0u; + while (v_y < self->private_impl.f_height) { + v_prev = 0u; + if (v_y > 0u) { + v_i = ((uint64_t)(a_alpha_offset + (((uint64_t)(((uint32_t)(v_y - 1u)))) * ((uint64_t)(self->private_impl.f_width))))); + if (v_i < ((uint64_t)(a_workbuf.len))) { + v_prev = a_workbuf.ptr[v_i]; + } + } + v_x = 0u; + while (v_x < self->private_impl.f_width) { + v_i = ((uint64_t)(((uint64_t)(a_alpha_offset + (((uint64_t)(v_y)) * ((uint64_t)(self->private_impl.f_width))))) + ((uint64_t)(v_x)))); + if (v_i < ((uint64_t)(a_workbuf.len))) { + a_workbuf.ptr[v_i] = ((uint8_t)(((uint8_t)(a_workbuf.ptr[v_i] + v_prev)))); + v_prev = a_workbuf.ptr[v_i]; + } + v_x += 1u; + } + v_y += 1u; + } + return wuffs_base__make_empty_struct(); +} + +// -------- func webp.decoder.apply_alpha_filter_vertical + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_webp__decoder__apply_alpha_filter_vertical( + 
wuffs_webp__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_alpha_offset) { + uint32_t v_y = 0; + uint32_t v_x = 0; + uint64_t v_i = 0; + uint8_t v_prev = 0; + + v_prev = 0u; + v_x = 0u; + while (v_x < self->private_impl.f_width) { + v_i = ((uint64_t)(a_alpha_offset + ((uint64_t)(v_x)))); + if (v_i < ((uint64_t)(a_workbuf.len))) { + a_workbuf.ptr[v_i] = ((uint8_t)(((uint8_t)(a_workbuf.ptr[v_i] + v_prev)))); + v_prev = a_workbuf.ptr[v_i]; + } + v_x += 1u; + } + v_y = 1u; + while (v_y < self->private_impl.f_height) { + v_x = 0u; + while (v_x < self->private_impl.f_width) { + v_i = ((uint64_t)(((uint64_t)(a_alpha_offset + (((uint64_t)(((uint32_t)(v_y - 1u)))) * ((uint64_t)(self->private_impl.f_width))))) + ((uint64_t)(v_x)))); + v_prev = 0u; + if (v_i < ((uint64_t)(a_workbuf.len))) { + v_prev = a_workbuf.ptr[v_i]; + } + v_i = ((uint64_t)(((uint64_t)(a_alpha_offset + (((uint64_t)(v_y)) * ((uint64_t)(self->private_impl.f_width))))) + ((uint64_t)(v_x)))); + if (v_i < ((uint64_t)(a_workbuf.len))) { + a_workbuf.ptr[v_i] = ((uint8_t)(((uint8_t)(a_workbuf.ptr[v_i] + v_prev)))); + } + v_x += 1u; + } + v_y += 1u; + } + return wuffs_base__make_empty_struct(); +} + +// -------- func webp.decoder.apply_alpha_filter_gradient + +WUFFS_BASE__GENERATED_C_CODE +static wuffs_base__empty_struct +wuffs_webp__decoder__apply_alpha_filter_gradient( + wuffs_webp__decoder* self, + wuffs_base__slice_u8 a_workbuf, + uint64_t a_alpha_offset) { + uint32_t v_y = 0; + uint32_t v_x = 0; + uint64_t v_i = 0; + uint8_t v_prev = 0; + uint32_t v_left = 0; + uint32_t v_above = 0; + uint32_t v_tl = 0; + uint32_t v_pred = 0; + + v_prev = 0u; + v_x = 0u; + while (v_x < self->private_impl.f_width) { + v_i = ((uint64_t)(a_alpha_offset + ((uint64_t)(v_x)))); + if (v_i < ((uint64_t)(a_workbuf.len))) { + a_workbuf.ptr[v_i] = ((uint8_t)(((uint8_t)(a_workbuf.ptr[v_i] + v_prev)))); + v_prev = a_workbuf.ptr[v_i]; + } + v_x += 1u; + } + v_y = 1u; + while (v_y < self->private_impl.f_height) { + v_i = 
((uint64_t)(a_alpha_offset + (((uint64_t)(((uint32_t)(v_y - 1u)))) * ((uint64_t)(self->private_impl.f_width))))); + v_above = 0u; + if (v_i < ((uint64_t)(a_workbuf.len))) { + v_above = ((uint32_t)(a_workbuf.ptr[v_i])); + } + v_i = ((uint64_t)(a_alpha_offset + (((uint64_t)(v_y)) * ((uint64_t)(self->private_impl.f_width))))); + if (v_i < ((uint64_t)(a_workbuf.len))) { + a_workbuf.ptr[v_i] = ((uint8_t)(((uint32_t)(((uint32_t)(a_workbuf.ptr[v_i])) + v_above)))); + } + v_x = 1u; + while (v_x < self->private_impl.f_width) { + v_left = 0u; + v_i = ((uint64_t)(((uint64_t)(a_alpha_offset + (((uint64_t)(v_y)) * ((uint64_t)(self->private_impl.f_width))))) + ((uint64_t)(((uint32_t)(v_x - 1u)))))); + if (v_i < ((uint64_t)(a_workbuf.len))) { + v_left = ((uint32_t)(a_workbuf.ptr[v_i])); + } + v_above = 0u; + v_i = ((uint64_t)(((uint64_t)(a_alpha_offset + (((uint64_t)(((uint32_t)(v_y - 1u)))) * ((uint64_t)(self->private_impl.f_width))))) + ((uint64_t)(v_x)))); + if (v_i < ((uint64_t)(a_workbuf.len))) { + v_above = ((uint32_t)(a_workbuf.ptr[v_i])); + } + v_tl = 0u; + v_i = ((uint64_t)(((uint64_t)(a_alpha_offset + (((uint64_t)(((uint32_t)(v_y - 1u)))) * ((uint64_t)(self->private_impl.f_width))))) + ((uint64_t)(((uint32_t)(v_x - 1u)))))); + if (v_i < ((uint64_t)(a_workbuf.len))) { + v_tl = ((uint32_t)(a_workbuf.ptr[v_i])); + } + v_pred = ((uint32_t)(((uint32_t)(v_left + v_above)) - v_tl)); + if (v_pred > 255u) { + if (((uint32_t)(v_left + v_above)) < v_tl) { + v_pred = 0u; + } else { + v_pred = 255u; + } + } + v_i = ((uint64_t)(((uint64_t)(a_alpha_offset + (((uint64_t)(v_y)) * ((uint64_t)(self->private_impl.f_width))))) + ((uint64_t)(v_x)))); + if (v_i < ((uint64_t)(a_workbuf.len))) { + a_workbuf.ptr[v_i] = ((uint8_t)(((uint32_t)(((uint32_t)(a_workbuf.ptr[v_i])) + v_pred)))); + } + v_x += 1u; + } + v_y += 1u; + } + return wuffs_base__make_empty_struct(); +} + // -------- func webp.decoder.do_decode_frame WUFFS_BASE__GENERATED_C_CODE @@ -85175,6 +102273,7 @@ 
wuffs_webp__decoder__do_decode_frame( uint8_t v_c8 = 0; uint32_t v_has_more = 0; uint32_t v_width = 0; + uint32_t v_saved_width = 0; wuffs_base__slice_u8 v_dst = {0}; wuffs_base__slice_u8 v_tile_data = {0}; wuffs_base__status v_status = wuffs_base__make_status(NULL); @@ -85288,7 +102387,7 @@ wuffs_webp__decoder__do_decode_frame( a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr)); } WUFFS_BASE__COROUTINE_SUSPENSION_POINT(6); - status = wuffs_webp__decoder__decode_huffman_groups(self, a_src, self->private_impl.f_overall_n_huffman_groups); + status = wuffs_webp__decoder__decode_huffman_groups(self, a_src, self->private_impl.f_overall_n_huffman_groups, self->private_impl.f_hg_bitstream_groups); if (a_src) { iop_a_src = a_src->data.ptr + a_src->meta.ri; } @@ -85336,6 +102435,30 @@ wuffs_webp__decoder__do_decode_frame( goto exit; } v_pix = wuffs_base__slice_u8__subslice_j(a_workbuf, ((uint64_t)(self->private_impl.f_workbuf_offset_for_transform[0u]))); + self->private_impl.choosy_apply_transform_predictor = ( +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) + wuffs_base__cpu_arch__have_x86_avx2() ? &wuffs_webp__decoder__apply_transform_predictor_x86_avx2 : +#endif + self->private_impl.choosy_apply_transform_predictor); + self->private_impl.choosy_apply_transform_cross_color = ( +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) + wuffs_base__cpu_arch__have_x86_avx2() ? &wuffs_webp__decoder__apply_transform_cross_color_x86_avx2 : +#endif + self->private_impl.choosy_apply_transform_cross_color); + self->private_impl.choosy_apply_transform_subtract_green = ( +#if defined(WUFFS_PRIVATE_IMPL__CPU_ARCH__X86_64_V3) + wuffs_base__cpu_arch__have_x86_avx2() ? 
&wuffs_webp__decoder__apply_transform_subtract_green_x86_avx2 : +#endif + self->private_impl.choosy_apply_transform_subtract_green); + v_saved_width = self->private_impl.f_width; + if (self->private_impl.f_seen_transform[3u]) { + self->private_impl.f_width = self->private_impl.f_color_indexing_width; + if ((((uint64_t)(self->private_impl.f_workbuf_offset_for_color_indexing)) <= ((uint64_t)(self->private_impl.f_workbuf_offset_for_transform[0u]))) && (((uint64_t)(self->private_impl.f_workbuf_offset_for_transform[0u])) <= ((uint64_t)(a_workbuf.len)))) { + v_pix = wuffs_base__slice_u8__subslice_ij(a_workbuf, + ((uint64_t)(self->private_impl.f_workbuf_offset_for_color_indexing)), + ((uint64_t)(self->private_impl.f_workbuf_offset_for_transform[0u]))); + } + } v_which = self->private_impl.f_n_transforms; while (v_which > 0u) { v_which -= 1u; @@ -85351,14 +102474,26 @@ wuffs_webp__decoder__do_decode_frame( if (v_transform_type == 0u) { wuffs_webp__decoder__apply_transform_predictor(self, v_pix, v_tile_data); } else if (v_transform_type == 1u) { + if (v_which > 0u) { + if (self->private_impl.f_transform_type[(v_which - 1u)] == 2u) { + self->private_impl.f_fuse_subtract_green = true; + v_which -= 1u; + } + } wuffs_webp__decoder__apply_transform_cross_color(self, v_pix, v_tile_data); + self->private_impl.f_fuse_subtract_green = false; } else if (v_transform_type == 2u) { wuffs_webp__decoder__apply_transform_subtract_green(self, v_pix); } else { + self->private_impl.f_width = v_saved_width; + if (((uint64_t)(self->private_impl.f_workbuf_offset_for_transform[0u])) <= ((uint64_t)(a_workbuf.len))) { + v_pix = wuffs_base__slice_u8__subslice_j(a_workbuf, ((uint64_t)(self->private_impl.f_workbuf_offset_for_transform[0u]))); + } wuffs_webp__decoder__apply_transform_color_indexing(self, v_pix); v_width = self->private_impl.f_width; } } + self->private_impl.f_width = v_saved_width; v_status = wuffs_webp__decoder__swizzle(self, a_dst, v_pix, a_blend); if ( ! 
wuffs_base__status__is_ok(&v_status)) { status = v_status; @@ -85405,6 +102540,7 @@ wuffs_webp__decoder__decode_transform( uint8_t v_c8 = 0; uint32_t v_transform_type = 0; uint32_t v_tile_size_log2 = 0; + uint32_t v_effective_width = 0; wuffs_base__slice_u8 v_p = {0}; const uint8_t* iop_a_src = NULL; @@ -85422,6 +102558,7 @@ wuffs_webp__decoder__decode_transform( if (coro_susp_point) { v_transform_type = self->private_data.s_decode_transform.v_transform_type; v_tile_size_log2 = self->private_data.s_decode_transform.v_tile_size_log2; + v_effective_width = self->private_data.s_decode_transform.v_effective_width; } switch (coro_susp_point) { WUFFS_BASE__COROUTINE_SUSPENSION_POINT_0; @@ -85449,9 +102586,6 @@ wuffs_webp__decoder__decode_transform( if (self->private_impl.f_seen_transform[v_transform_type] || (self->private_impl.f_n_transforms >= 4u)) { status = wuffs_base__make_status(wuffs_webp__error__bad_transform); goto exit; - } else if (self->private_impl.f_seen_transform[3u]) { - status = wuffs_base__make_status(wuffs_webp__error__unsupported_transform_after_color_indexing_transform); - goto exit; } self->private_impl.f_seen_transform[v_transform_type] = true; self->private_impl.f_transform_type[self->private_impl.f_n_transforms] = ((uint8_t)(v_transform_type)); @@ -85478,6 +102612,10 @@ wuffs_webp__decoder__decode_transform( self->private_impl.f_transform_tile_size_log2[v_transform_type] = ((uint8_t)(v_tile_size_log2)); self->private_impl.f_bits >>= 3u; self->private_impl.f_n_bits -= 3u; + v_effective_width = self->private_impl.f_width; + if (self->private_impl.f_seen_transform[3u]) { + v_effective_width = self->private_impl.f_color_indexing_width; + } if (a_src) { a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr)); } @@ -85493,7 +102631,7 @@ wuffs_webp__decoder__decode_transform( a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr)); } WUFFS_BASE__COROUTINE_SUSPENSION_POINT(4); - status = wuffs_webp__decoder__decode_huffman_groups(self, a_src, 1u); + 
status = wuffs_webp__decoder__decode_huffman_groups(self, a_src, 1u, 1u); if (a_src) { iop_a_src = a_src->data.ptr + a_src->meta.ri; } @@ -85514,7 +102652,7 @@ wuffs_webp__decoder__decode_transform( ((uint64_t)(self->private_impl.f_workbuf_offset_for_transform[(v_transform_type + 1u)])), ((uint64_t)(self->private_impl.f_workbuf_offset_for_transform[(v_transform_type + 2u)]))), a_src, - ((self->private_impl.f_width + ((((uint32_t)(1u)) << v_tile_size_log2) - 1u)) >> v_tile_size_log2), + ((v_effective_width + ((((uint32_t)(1u)) << v_tile_size_log2) - 1u)) >> v_tile_size_log2), ((self->private_impl.f_height + ((((uint32_t)(1u)) << v_tile_size_log2) - 1u)) >> v_tile_size_log2), wuffs_base__utility__empty_slice_u8(), 0u); @@ -85582,7 +102720,7 @@ wuffs_webp__decoder__decode_transform( a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr)); } WUFFS_BASE__COROUTINE_SUSPENSION_POINT(8); - status = wuffs_webp__decoder__decode_huffman_groups(self, a_src, 1u); + status = wuffs_webp__decoder__decode_huffman_groups(self, a_src, 1u, 1u); if (a_src) { iop_a_src = a_src->data.ptr + a_src->meta.ri; } @@ -85634,6 +102772,7 @@ wuffs_webp__decoder__decode_transform( self->private_impl.p_decode_transform = wuffs_base__status__is_suspension(&status) ? 
coro_susp_point : 0; self->private_data.s_decode_transform.v_transform_type = v_transform_type; self->private_data.s_decode_transform.v_tile_size_log2 = v_tile_size_log2; + self->private_data.s_decode_transform.v_effective_width = v_effective_width; goto exit; exit: @@ -85754,7 +102893,19 @@ wuffs_webp__decoder__decode_hg_table( wuffs_base__slice_u8 v_hg_pixels = {0}; uint64_t v_n = 0; wuffs_base__slice_u8 v_p = {0}; - uint32_t v_hg_plus_1 = 0; + uint32_t v_hg_raw = 0; + uint32_t v_max_hg = 0; + uint32_t v_k = 0; + uint32_t v_j = 0; + bool v_found = false; + uint32_t v_sort_i = 0; + uint32_t v_sort_j = 0; + uint16_t v_sort_val = 0; + wuffs_base__slice_u8 v_q = {0}; + uint32_t v_lo = 0; + uint32_t v_hi = 0; + uint32_t v_mid = 0; + uint32_t v_compact = 0; const uint8_t* iop_a_src = NULL; const uint8_t* io0_a_src WUFFS_BASE__POTENTIALLY_UNUSED = NULL; @@ -85792,6 +102943,8 @@ wuffs_webp__decoder__decode_hg_table( self->private_impl.f_n_bits -= 1u; if (v_use_hg_table == 0u) { self->private_impl.f_overall_n_huffman_groups = 1u; + self->private_impl.f_hg_compacted = false; + self->private_impl.f_hg_bitstream_groups = 1u; self->private_impl.f_overall_tile_size_log2 = 0u; if ((((uint64_t)(self->private_impl.f_workbuf_offset_for_transform[0u])) > ((uint64_t)(self->private_impl.f_workbuf_offset_for_transform[1u]))) || (((uint64_t)(self->private_impl.f_workbuf_offset_for_transform[1u])) > ((uint64_t)(a_workbuf.len)))) { status = wuffs_base__make_status(wuffs_base__error__bad_workbuf_length); @@ -85845,7 +102998,7 @@ wuffs_webp__decoder__decode_hg_table( a_src->meta.ri = ((size_t)(iop_a_src - a_src->data.ptr)); } WUFFS_BASE__COROUTINE_SUSPENSION_POINT(4); - status = wuffs_webp__decoder__decode_huffman_groups(self, a_src, 1u); + status = wuffs_webp__decoder__decode_huffman_groups(self, a_src, 1u, 1u); if (a_src) { iop_a_src = a_src->data.ptr + a_src->meta.ri; } @@ -85881,7 +103034,6 @@ wuffs_webp__decoder__decode_hg_table( status = v_status; 
WUFFS_BASE__COROUTINE_SUSPENSION_POINT_MAYBE_SUSPEND(5); } - self->private_impl.f_overall_n_huffman_groups = 1u; if ((((uint64_t)(self->private_impl.f_workbuf_offset_for_transform[0u])) > ((uint64_t)(self->private_impl.f_workbuf_offset_for_transform[1u]))) || (((uint64_t)(self->private_impl.f_workbuf_offset_for_transform[1u])) > ((uint64_t)(a_workbuf.len)))) { status = wuffs_base__make_status(wuffs_base__error__bad_workbuf_length); goto exit; @@ -85894,18 +103046,97 @@ wuffs_webp__decoder__decode_hg_table( status = wuffs_base__make_status(wuffs_base__error__bad_workbuf_length); goto exit; } + v_max_hg = 0u; v_p = wuffs_base__slice_u8__subslice_j(v_hg_pixels, v_n); while (((uint64_t)(v_p.len)) >= 4u) { - if (v_p.ptr[2u] != 0u) { - status = wuffs_base__make_status(wuffs_webp__error__unsupported_number_of_huffman_groups); - goto exit; + v_hg_raw = ((((uint32_t)(v_p.ptr[2u])) << 8u) | ((uint32_t)(v_p.ptr[1u]))); + if (v_max_hg < v_hg_raw) { + v_max_hg = v_hg_raw; + } + v_p = wuffs_base__slice_u8__subslice_i(v_p, 4u); + } + if (v_max_hg < 1024u) { + self->private_impl.f_hg_compacted = false; + self->private_impl.f_overall_n_huffman_groups = ((v_max_hg & 1023u) + 1u); + self->private_impl.f_hg_bitstream_groups = ((v_max_hg & 1023u) + 1u); + status = wuffs_base__make_status(NULL); + goto ok; + } + v_k = 0u; + if (v_n > ((uint64_t)(v_hg_pixels.len))) { + status = wuffs_base__make_status(wuffs_base__error__bad_workbuf_length); + goto exit; + } + v_p = wuffs_base__slice_u8__subslice_j(v_hg_pixels, v_n); + while (((uint64_t)(v_p.len)) >= 4u) { + v_hg_raw = ((((uint32_t)(v_p.ptr[2u])) << 8u) | ((uint32_t)(v_p.ptr[1u]))); + v_found = false; + v_j = 0u; + while (v_j < v_k) { + if (((uint32_t)(self->private_data.f_hg_sorted[v_j])) == (v_hg_raw & 65535u)) { + v_found = true; + break; + } + v_j += 1u; } - v_hg_plus_1 = (((uint32_t)(v_p.ptr[1u])) + 1u); - if (self->private_impl.f_overall_n_huffman_groups < v_hg_plus_1) { - self->private_impl.f_overall_n_huffman_groups = v_hg_plus_1; 
+ if ( ! v_found) { + if (v_k >= 1024u) { + status = wuffs_base__make_status(wuffs_webp__error__unsupported_number_of_huffman_groups); + goto exit; + } + self->private_data.f_hg_sorted[v_k] = ((uint16_t)(v_hg_raw)); + v_k += 1u; } v_p = wuffs_base__slice_u8__subslice_i(v_p, 4u); } + v_sort_i = 1u; + while (v_sort_i < v_k) { + v_sort_val = self->private_data.f_hg_sorted[v_sort_i]; + v_sort_j = v_sort_i; + while (v_sort_j > 0u) { + if (v_sort_j < 1024u) { + if (self->private_data.f_hg_sorted[(v_sort_j - 1u)] <= v_sort_val) { + break; + } + self->private_data.f_hg_sorted[v_sort_j] = self->private_data.f_hg_sorted[(v_sort_j - 1u)]; + } + v_sort_j -= 1u; + } + if (v_sort_j < 1024u) { + self->private_data.f_hg_sorted[v_sort_j] = v_sort_val; + } + v_sort_i += 1u; + } + if (v_n > ((uint64_t)(v_hg_pixels.len))) { + status = wuffs_base__make_status(wuffs_base__error__bad_workbuf_length); + goto exit; + } + v_q = wuffs_base__slice_u8__subslice_j(v_hg_pixels, v_n); + while (((uint64_t)(v_q.len)) >= 4u) { + v_hg_raw = ((((uint32_t)(v_q.ptr[2u])) << 8u) | ((uint32_t)(v_q.ptr[1u]))); + v_lo = 0u; + v_hi = v_k; + while (v_lo < v_hi) { + v_mid = ((v_lo + v_hi) / 2u); + if (v_mid < 1024u) { + if (((uint32_t)(self->private_data.f_hg_sorted[v_mid])) < v_hg_raw) { + v_lo = (v_mid + 1u); + } else { + v_hi = v_mid; + } + } else { + break; + } + } + v_compact = v_lo; + v_q.ptr[1u] = ((uint8_t)(v_compact)); + v_q.ptr[2u] = ((uint8_t)((v_compact >> 8u))); + v_q = wuffs_base__slice_u8__subslice_i(v_q, 4u); + } + self->private_impl.f_hg_compacted = true; + self->private_impl.f_overall_n_huffman_groups = v_k; + self->private_impl.f_hg_bitstream_groups = ((v_max_hg & 65535u) + 1u); + self->private_impl.f_hg_n_sorted = v_k; ok: self->private_impl.p_decode_hg_table = 0; @@ -85940,10 +103171,15 @@ wuffs_webp__decoder__decode_pixels( uint32_t a_tile_size_log2) { wuffs_base__status status = wuffs_base__make_status(NULL); + wuffs_base__status v_status = wuffs_base__make_status(NULL); uint32_t v_i = 
0; uint32_t v_n = 0; + uint64_t v_p_max = 0; uint32_t coro_susp_point = self->private_impl.p_decode_pixels; + if (coro_susp_point) { + v_p_max = self->private_data.s_decode_pixels.v_p_max; + } switch (coro_susp_point) { WUFFS_BASE__COROUTINE_SUSPENSION_POINT_0; @@ -85953,19 +103189,44 @@ wuffs_webp__decoder__decode_pixels( self->private_data.f_color_cache[v_i] = 0u; v_i += 1u; } - WUFFS_BASE__COROUTINE_SUSPENSION_POINT(1); - status = wuffs_webp__decoder__decode_pixels_slow(self, - a_dst, - a_src, - a_width, - a_height, - a_tile_data, - a_tile_size_log2); - if (status.repr) { - goto suspend; + self->private_impl.f_pix_p = 0u; + self->private_impl.f_pix_x = 0u; + self->private_impl.f_pix_y = 0u; + self->private_impl.f_pix_cc_p = 0u; + v_p_max = ((uint64_t)((4u * a_width * a_height))); + while (true) { + v_status = wuffs_webp__decoder__decode_pixels_fast(self, + a_dst, + a_src, + a_width, + a_height, + a_tile_data, + a_tile_size_log2); + if (wuffs_base__status__is_error(&v_status)) { + status = v_status; + goto exit; + } + if (self->private_impl.f_pix_p >= v_p_max) { + status = wuffs_base__make_status(NULL); + goto ok; + } + WUFFS_BASE__COROUTINE_SUSPENSION_POINT(1); + status = wuffs_webp__decoder__decode_pixels_slow(self, + a_dst, + a_src, + a_width, + a_height, + a_tile_data, + a_tile_size_log2); + if (status.repr) { + goto suspend; + } + if (self->private_impl.f_pix_p >= v_p_max) { + status = wuffs_base__make_status(NULL); + goto ok; + } } - goto ok; ok: self->private_impl.p_decode_pixels = 0; goto exit; @@ -85974,6 +103235,7 @@ wuffs_webp__decoder__decode_pixels( goto suspend; suspend: self->private_impl.p_decode_pixels = wuffs_base__status__is_suspension(&status) ? coro_susp_point : 0; + self->private_data.s_decode_pixels.v_p_max = v_p_max; goto exit; exit: @@ -86045,7 +103307,7 @@ wuffs_webp__decoder__frame_dirty_rect( return wuffs_base__utility__empty_rect_ie_u32(); } - if (self->private_impl.f_is_vp8_lossy) { + if (self->private_impl.f_is_vp8_lossy && ! 
self->private_impl.f_is_vp8x) { return wuffs_vp8__decoder__frame_dirty_rect(&self->private_data.f_vp8); } return wuffs_base__utility__make_rect_ie_u32( @@ -86220,8 +103482,14 @@ wuffs_webp__decoder__workbuf_len( return wuffs_base__utility__empty_range_ii_u64(); } + uint64_t v_total = 0; + + if (self->private_impl.f_is_vp8x) { + return wuffs_base__utility__make_range_ii_u64(self->private_impl.f_vp8x_workbuf_len, self->private_impl.f_vp8x_workbuf_len); + } if (self->private_impl.f_is_vp8_lossy) { - return wuffs_vp8__decoder__workbuf_len(&self->private_data.f_vp8); + v_total = wuffs_base__u64__sat_add(wuffs_vp8__decoder__workbuf_len_total(&self->private_data.f_vp8), ((uint64_t)(self->private_impl.f_sub_chunk_length))); + return wuffs_base__utility__make_range_ii_u64(v_total, v_total); } return wuffs_base__utility__make_range_ii_u64(((uint64_t)(self->private_impl.f_workbuf_offset_for_transform[3u])), ((uint64_t)(self->private_impl.f_workbuf_offset_for_transform[3u]))); } diff --git a/std/jpeg/decode_jpeg.wuffs b/std/jpeg/decode_jpeg.wuffs index 4240ffcaa..62a547fb9 100644 --- a/std/jpeg/decode_jpeg.wuffs +++ b/std/jpeg/decode_jpeg.wuffs @@ -2168,6 +2168,7 @@ pri func decoder.swizzle_colorful!(dst: ptr base.pixel_buffer, workbuf: slice ba v3: this.components_v[3], is_rgb_or_cmyk: this.is_rgb_or_cmyk, triangle_filter_for_2to1: not this.use_lower_quality, + src_is_bt601: false, scratch_buffer_2k: this.swizzle_ycck_scratch_buffer_2k[..]) return status } diff --git a/std/vp8/common_consts.wuffs b/std/vp8/common_consts.wuffs new file mode 100644 index 000000000..5d7f822b2 --- /dev/null +++ b/std/vp8/common_consts.wuffs @@ -0,0 +1,704 @@ +// Copyright 2024 The Wuffs Authors. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
+// +// SPDX-License-Identifier: Apache-2.0 OR MIT + +// -------- + +// DC_QUANT maps quantizer index (0..127) to DC dequantization factor. +pri const DC_QUANT : roarray[128] base.u16 = [ + 4, 5, 6, 7, 8, 9, 10, 10, 11, 12, 13, 14, 15, 16, 17, 17, + 18, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 25, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, + 44, 45, 46, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 91, 93, 95, 96, 98, 100, 101, 102, 104, 106, 108, 110, 112, 114, 116, 118, + 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 143, 145, 148, 151, 154, 157, +] + +// AC_QUANT maps quantizer index (0..127) to AC dequantization factor. +pri const AC_QUANT : roarray[128] base.u16 = [ + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, + 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, + 52, 53, 54, 55, 56, 57, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, + 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, + 110, 112, 114, 116, 119, 122, 125, 128, 131, 134, 137, 140, 143, 146, 149, 152, + 155, 158, 161, 164, 167, 170, 173, 177, 181, 185, 189, 193, 197, 201, 205, 209, + 213, 217, 221, 225, 229, 234, 239, 245, 249, 254, 259, 264, 269, 274, 279, 284, +] + +// COEFF_BANDS maps coefficient index (0..15) to band (0..7) for probability +// table lookup during coefficient decoding. +pri const COEFF_BANDS : roarray[16] base.u8[..= 7] = [ + 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, +] + +// COEFF_BAND_OFFSET is COEFF_BANDS[i] * 33, precomputed for probability index +// calculation. Eliminates the band * 33 multiply in the coefficient decode loop. 
+pri const COEFF_BAND_OFFSET : roarray[16] base.u8[..= 231] = [ + 0, 33, 66, 99, 198, 132, 165, 198, 198, 198, 198, 198, 198, 198, 198, 231, +] + +// ZIGZAG maps zig-zag order index (0..15) to raster scan index for 4x4 blocks. +pri const ZIGZAG : roarray[16] base.u8[..= 15] = [ + 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15, +] + +// DEFAULT_COEFF_PROBS contains the default coefficient probabilities. +// Flattened from [4 types][8 bands][3 contexts][11 tokens] = 1056 values. +// Index = type*264 + band*33 + ctx*11 + token. +pri const DEFAULT_COEFF_PROBS : roarray[1056] base.u8 = [ + // Type 0: Y blocks (after Y2, i.e. 16x16 mode, AC only) + // Band 0 + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + // Band 1 + 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128, + 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128, + 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128, + // Band 2 + 1, 98, 248, 255, 236, 226, 255, 255, 128, 128, 128, + 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128, + 78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128, + // Band 3 + 1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128, + 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128, + 77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128, + // Band 4 + 1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128, + 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128, + 37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128, + // Band 5 + 1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128, + 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128, + 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128, + // Band 6 + 1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128, + 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128, + 80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128, + // Band 7 + 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128, + 246, 1, 
255, 128, 128, 128, 128, 128, 128, 128, 128, + 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + + // Type 1: Y2 block (luma DC) + // Band 0 + 198, 35, 237, 223, 193, 187, 162, 160, 145, 155, 62, + 131, 45, 198, 221, 172, 176, 220, 157, 252, 221, 1, + 68, 47, 146, 208, 149, 167, 221, 162, 255, 223, 128, + // Band 1 + 1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128, + 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128, + 81, 99, 181, 242, 176, 190, 249, 202, 255, 255, 128, + // Band 2 + 1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128, + 99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128, + 23, 91, 163, 242, 170, 187, 247, 210, 255, 255, 128, + // Band 3 + 1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128, + 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128, + 44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128, + // Band 4 + 1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128, + 94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128, + 22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128, + // Band 5 + 1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128, + 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128, + 35, 77, 181, 251, 193, 211, 255, 205, 128, 128, 128, + // Band 6 + 1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128, + 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128, + 45, 99, 188, 251, 195, 217, 255, 224, 128, 128, 128, + // Band 7 + 1, 1, 251, 255, 213, 255, 128, 128, 128, 128, 128, + 203, 1, 248, 255, 255, 128, 128, 128, 128, 128, 128, + 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128, + + // Type 2: UV blocks (chroma) + // Band 0 + 253, 9, 248, 251, 207, 208, 255, 192, 128, 128, 128, + 175, 13, 224, 243, 193, 185, 249, 198, 255, 255, 128, + 73, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128, + // Band 1 + 1, 95, 247, 253, 212, 183, 255, 255, 128, 128, 128, + 239, 90, 244, 250, 211, 209, 255, 255, 128, 128, 128, + 155, 77, 195, 248, 188, 195, 255, 255, 128, 128, 128, + // Band 2 + 1, 24, 239, 251, 218, 219, 255, 
205, 128, 128, 128, + 201, 51, 219, 255, 196, 186, 128, 128, 128, 128, 128, + 69, 46, 190, 239, 201, 218, 255, 228, 128, 128, 128, + // Band 3 + 1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128, + 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128, + 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128, + // Band 4 + 1, 16, 248, 255, 255, 128, 128, 128, 128, 128, 128, + 190, 36, 230, 255, 236, 255, 128, 128, 128, 128, 128, + 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128, + // Band 5 + 1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128, + 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128, + 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128, + // Band 6 + 1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128, + 213, 62, 250, 255, 255, 128, 128, 128, 128, 128, 128, + 55, 93, 255, 128, 128, 128, 128, 128, 128, 128, 128, + // Band 7 + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + + // Type 3: Y blocks (4x4 mode, with DC) + // Band 0 + 202, 24, 213, 235, 186, 191, 220, 160, 240, 175, 255, + 126, 38, 182, 232, 169, 184, 228, 174, 255, 187, 128, + 61, 46, 138, 219, 151, 178, 240, 170, 255, 216, 128, + // Band 1 + 1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128, + 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128, + 39, 77, 162, 232, 172, 180, 245, 178, 255, 255, 128, + // Band 2 + 1, 52, 220, 246, 198, 199, 249, 220, 255, 255, 128, + 124, 74, 191, 243, 183, 193, 250, 221, 255, 255, 128, + 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128, + // Band 3 + 1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128, + 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128, + 28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128, + // Band 4 + 1, 81, 230, 252, 204, 203, 255, 192, 128, 128, 128, + 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128, + 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128, + // Band 5 + 1, 222, 248, 255, 
216, 213, 128, 128, 128, 128, 128, + 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128, + 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128, + // Band 6 + 1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128, + 141, 84, 213, 252, 201, 202, 255, 219, 128, 128, 128, + 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128, + // Band 7 + 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128, + 244, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128, + 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128, +] + +// COEFF_UPDATE_PROBS contains the probabilities used to decide whether each +// coefficient probability is updated in the frame header. Same layout as +// DEFAULT_COEFF_PROBS: [4 types][8 bands][3 contexts][11 tokens] = 1056. +pri const COEFF_UPDATE_PROBS : roarray[1056] base.u8 = [ + // Type 0 + // Band 0 + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + // Band 1 + 176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255, + 249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, + // Band 2 + 255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255, + 234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, + 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + // Band 3 + 255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255, + 239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, + 254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, + // Band 4 + 255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, + 251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + // Band 5 + 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, + 251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, + 254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, + // Band 6 + 255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255, + 250, 255, 254, 255, 254, 
255, 255, 255, 255, 255, 255, + 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + // Band 7 + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + + // Type 1 + // Band 0 + 217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255, + 234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255, + // Band 1 + 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, + 238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255, + // Band 2 + 255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, + 249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + // Band 3 + 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + // Band 4 + 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, + 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + // Band 5 + 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, + 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + // Band 6 + 255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, + 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + // Band 7 + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + + // Type 2 + // Band 0 + 186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255, + 234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255, + 251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255, + // Band 1 + 255, 253, 254, 255, 255, 255, 255, 255, 
255, 255, 255, + 236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, + 251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255, + // Band 2 + 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, + 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + // Band 3 + 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + // Band 4 + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + // Band 5 + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + // Band 6 + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + // Band 7 + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + + // Type 3 + // Band 0 + 248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255, + 248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255, + // Band 1 + 255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, + 246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, + 252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255, + // Band 2 + 255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255, + 248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, + 253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, + // Band 3 + 255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, + 245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, + 253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, + // Band 4 + 255, 251, 
253, 255, 255, 255, 255, 255, 255, 255, 255, + 252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, + // Band 5 + 255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, + // Band 6 + 255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, + 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + // Band 7 + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +] + +// MV_UPDATE_PROBS contains the probabilities for updating motion vector +// probabilities. Not used for intra-only (key frame) decoding. +// [2 components (y, x)][19 probabilities each] = 38 values. +pri const MV_UPDATE_PROBS : roarray[38] base.u8 = [ + // Y component + 237, 246, 253, 253, 254, 254, 254, 254, 254, + 254, 254, 254, 254, 254, 250, 250, 252, 254, 254, + // X component + 231, 243, 245, 253, 254, 254, 254, 254, 254, + 254, 254, 254, 254, 254, 251, 251, 254, 254, 254, +] + +// DEFAULT_MV_PROBS contains the default motion vector probabilities. +// [2 components (y, x)][19 probabilities each] = 38 values. +pri const DEFAULT_MV_PROBS : roarray[38] base.u8 = [ + // Y component + 162, 128, 225, 146, 172, 147, 214, 39, 156, + 128, 129, 132, 75, 145, 178, 206, 239, 254, 254, + // X component + 164, 128, 204, 170, 119, 235, 140, 230, 228, + 128, 130, 130, 74, 148, 180, 203, 236, 254, 254, +] + +// NORM_LUT maps range values to the number of bits to left-shift for +// renormalization in the boolean decoder. For range < 128, we need to shift +// until range >= 128. 
+pri const NORM_LUT : roarray[256] base.u8[..= 7] = [ + 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +] + +// TOKEN_EXTRA_BITS maps token categories to the number of extra bits to read. +// Tokens 0-4 have no extra bits. Token 5 (cat1) has 1, token 6 (cat2) has 2, +// etc. +pri const TOKEN_EXTRA_BITS : roarray[12] base.u8[..= 11] = [ + 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 11, +] + +// TOKEN_EXTRA_BASE maps token categories to the base value. +// dct_val = base + extra_bits_value. +pri const TOKEN_EXTRA_BASE : roarray[12] base.u16 = [ + 0, 1, 2, 3, 4, 5, 7, 11, 19, 35, 67, 2048, +] + +// CAT_PROBS contains the probabilities for decoding extra bits in each +// token category (cat1 through cat6). Indexed by: category offset then bit. +// cat1: 1 prob, cat2: 2, cat3: 3, cat4: 4, cat5: 5, cat6: 11. +// Total = 1+2+3+4+5+11 = 26 probs. +// Offsets: cat1=0, cat2=1, cat3=3, cat4=6, cat5=10, cat6=15. 
+pri const CAT_PROBS : roarray[26] base.u8 = [ + // cat1 (1 extra bit) + 159, + // cat2 (2 extra bits) + 165, 145, + // cat3 (3 extra bits) + 173, 148, 140, + // cat4 (4 extra bits) + 176, 155, 140, 135, + // cat5 (5 extra bits) + 180, 157, 141, 134, 130, + // cat6 (11 extra bits) + 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129, +] + +// CAT_PROBS_OFFSET maps category index (0..5, meaning cat1..cat6) to an +// offset into CAT_PROBS. +pri const CAT_PROBS_OFFSET : roarray[6] base.u8[..= 15] = [ + 0, 1, 3, 6, 10, 15, +] + +// CAT_EXTRA_BITS is the number of extra bits per category (cat1..cat6). +pri const CAT_EXTRA_BITS : roarray[6] base.u8[..= 11] = [ + 1, 2, 3, 4, 5, 11, +] + +// CAT_BASE_VALUE is the base coefficient value per category (cat1..cat6). +pri const CAT_BASE_VALUE : roarray[6] base.u16 = [ + 5, 7, 11, 19, 35, 67, +] + +// KF_Y_MODE_PROBS are the probabilities for decoding the luma prediction +// mode in key frames. Tree: 0→B_PRED(4), 1→(0→(0→DC, 1→V), 1→(0→H, 1→TM)). +// Mode values: DC=0, V=1, H=2, TM=3, B_PRED=4. +pri const KF_Y_MODE_PROBS : roarray[4] base.u8 = [ + 145, 156, 163, 128, +] + +// KF_UV_MODE_PROBS are the probabilities for decoding the chroma +// prediction mode in key frames. DC=0, V=1, H=2, TM=3. +pri const KF_UV_MODE_PROBS : roarray[3] base.u8 = [ + 142, 114, 183, +] + +// KF_B_MODE_PROBS are the probabilities for decoding 4x4 sub-block luma +// prediction modes in key frames. Indexed by [above_mode][left_mode][9 probs]. +// 10 modes × 10 modes × 9 probabilities = 900 values. 
+pri const KF_B_MODE_PROBS : roarray[900] base.u8 = [ + // above=B_DC(0), left=B_DC(0) + 231, 120, 48, 89, 115, 113, 120, 152, 112, + // above=B_DC, left=B_TM(1) + 152, 179, 64, 126, 170, 118, 46, 70, 95, + // above=B_DC, left=B_VE(2) + 175, 69, 143, 80, 85, 82, 72, 155, 103, + // above=B_DC, left=B_HE(3) + 56, 58, 10, 171, 218, 189, 17, 13, 152, + // above=B_DC, left=B_LD(4) + 144, 71, 10, 38, 171, 213, 144, 34, 26, + // above=B_DC, left=B_RD(5) + 114, 26, 17, 163, 44, 195, 21, 10, 173, + // above=B_DC, left=B_VR(6) + 121, 24, 80, 195, 26, 62, 44, 64, 85, + // above=B_DC, left=B_VL(7) + 170, 46, 55, 19, 136, 160, 33, 206, 71, + // above=B_DC, left=B_HD(8) + 63, 20, 8, 114, 114, 208, 12, 9, 226, + // above=B_DC, left=B_HU(9) + 81, 40, 11, 96, 182, 84, 29, 16, 36, + + // above=B_TM(1), left=B_DC(0) + 134, 183, 89, 137, 98, 101, 106, 165, 148, + // above=B_TM, left=B_TM + 72, 187, 100, 130, 157, 111, 32, 75, 80, + // above=B_TM, left=B_VE + 66, 102, 167, 99, 74, 62, 40, 234, 128, + // above=B_TM, left=B_HE + 41, 53, 9, 178, 241, 141, 26, 8, 107, + // above=B_TM, left=B_LD + 104, 79, 12, 27, 217, 255, 87, 17, 7, + // above=B_TM, left=B_RD + 74, 43, 26, 146, 73, 166, 49, 23, 157, + // above=B_TM, left=B_VR + 65, 38, 105, 160, 51, 52, 31, 115, 128, + // above=B_TM, left=B_VL + 87, 68, 71, 44, 114, 51, 15, 186, 23, + // above=B_TM, left=B_HD + 47, 41, 14, 110, 182, 183, 21, 17, 194, + // above=B_TM, left=B_HU + 66, 45, 25, 102, 197, 189, 23, 18, 22, + + // above=B_VE(2), left=B_DC(0) + 88, 88, 147, 150, 42, 46, 45, 196, 205, + // above=B_VE, left=B_TM + 43, 97, 183, 117, 85, 38, 35, 179, 61, + // above=B_VE, left=B_VE + 39, 53, 200, 87, 26, 21, 43, 232, 171, + // above=B_VE, left=B_HE + 56, 34, 51, 104, 114, 102, 29, 93, 77, + // above=B_VE, left=B_LD + 107, 54, 32, 26, 51, 1, 81, 43, 31, + // above=B_VE, left=B_RD + 39, 28, 85, 171, 58, 165, 90, 98, 64, + // above=B_VE, left=B_VR + 34, 22, 116, 206, 23, 34, 43, 166, 73, + // above=B_VE, left=B_VL + 68, 25, 106, 22, 64, 
171, 36, 225, 114, + // above=B_VE, left=B_HD + 34, 19, 21, 102, 132, 188, 16, 76, 124, + // above=B_VE, left=B_HU + 62, 18, 78, 95, 85, 57, 50, 48, 51, + + // above=B_HE(3), left=B_DC(0) + 193, 101, 35, 159, 215, 111, 89, 46, 111, + // above=B_HE, left=B_TM + 60, 148, 31, 172, 219, 228, 21, 18, 111, + // above=B_HE, left=B_VE + 112, 113, 77, 85, 179, 255, 38, 120, 114, + // above=B_HE, left=B_HE + 40, 42, 1, 196, 245, 209, 10, 25, 109, + // above=B_HE, left=B_LD + 100, 80, 8, 43, 154, 1, 51, 26, 71, + // above=B_HE, left=B_RD + 88, 43, 29, 140, 166, 213, 37, 43, 154, + // above=B_HE, left=B_VR + 61, 63, 30, 155, 67, 45, 68, 1, 209, + // above=B_HE, left=B_VL + 142, 78, 78, 16, 255, 128, 34, 197, 171, + // above=B_HE, left=B_HD + 41, 40, 5, 102, 211, 183, 4, 1, 221, + // above=B_HE, left=B_HU + 51, 50, 17, 168, 209, 192, 23, 25, 82, + + // above=B_LD(4), left=B_DC(0) + 125, 98, 42, 88, 104, 85, 117, 175, 82, + // above=B_LD, left=B_TM + 95, 84, 53, 89, 128, 100, 113, 101, 45, + // above=B_LD, left=B_VE + 75, 79, 123, 47, 51, 128, 81, 171, 1, + // above=B_LD, left=B_HE + 57, 17, 5, 71, 102, 57, 53, 41, 49, + // above=B_LD, left=B_LD + 115, 21, 2, 10, 102, 255, 166, 23, 6, + // above=B_LD, left=B_RD + 38, 33, 13, 121, 57, 73, 26, 1, 85, + // above=B_LD, left=B_VR + 41, 10, 67, 138, 77, 110, 90, 47, 114, + // above=B_LD, left=B_VL + 101, 29, 16, 10, 85, 128, 101, 196, 26, + // above=B_LD, left=B_HD + 57, 18, 10, 102, 102, 213, 34, 20, 43, + // above=B_LD, left=B_HU + 117, 20, 15, 36, 163, 128, 68, 1, 26, + + // above=B_RD(5), left=B_DC(0) + 138, 31, 36, 171, 27, 166, 38, 44, 229, + // above=B_RD, left=B_TM + 67, 87, 58, 169, 82, 115, 26, 59, 179, + // above=B_RD, left=B_VE + 63, 59, 90, 180, 59, 166, 93, 73, 154, + // above=B_RD, left=B_HE + 40, 40, 21, 116, 143, 209, 34, 39, 175, + // above=B_RD, left=B_LD + 57, 46, 22, 24, 128, 1, 54, 17, 37, + // above=B_RD, left=B_RD + 47, 15, 16, 183, 34, 223, 49, 45, 183, + // above=B_RD, left=B_VR + 46, 17, 33, 183, 6, 98, 15, 
32, 183, + // above=B_RD, left=B_VL + 65, 32, 73, 115, 28, 128, 23, 128, 205, + // above=B_RD, left=B_HD + 40, 3, 9, 115, 51, 192, 18, 6, 223, + // above=B_RD, left=B_HU + 87, 37, 9, 115, 59, 77, 64, 21, 47, + + // above=B_VR(6), left=B_DC(0) + 104, 55, 44, 218, 9, 54, 53, 130, 226, + // above=B_VR, left=B_TM + 64, 90, 70, 205, 40, 41, 23, 26, 57, + // above=B_VR, left=B_VE + 54, 57, 112, 184, 5, 41, 38, 166, 213, + // above=B_VR, left=B_HE + 30, 34, 26, 133, 152, 116, 10, 32, 134, + // above=B_VR, left=B_LD + 75, 32, 12, 51, 192, 255, 160, 43, 51, + // above=B_VR, left=B_RD + 39, 19, 53, 221, 26, 114, 32, 73, 255, + // above=B_VR, left=B_VR + 31, 9, 65, 234, 2, 15, 1, 118, 73, + // above=B_VR, left=B_VL + 88, 31, 35, 67, 102, 85, 55, 186, 85, + // above=B_VR, left=B_HD + 56, 21, 23, 111, 59, 205, 45, 37, 192, + // above=B_VR, left=B_HU + 55, 38, 70, 124, 73, 102, 1, 34, 98, + + // above=B_VL(7), left=B_DC(0) + 102, 61, 71, 37, 34, 53, 31, 243, 192, + // above=B_VL, left=B_TM + 69, 60, 71, 38, 73, 119, 28, 222, 37, + // above=B_VL, left=B_VE + 68, 45, 128, 34, 1, 47, 11, 245, 171, + // above=B_VL, left=B_HE + 62, 17, 19, 70, 146, 85, 55, 62, 70, + // above=B_VL, left=B_LD + 75, 15, 9, 9, 64, 255, 184, 119, 16, + // above=B_VL, left=B_RD + 37, 43, 37, 154, 100, 163, 85, 160, 1, + // above=B_VL, left=B_VR + 63, 9, 92, 136, 28, 64, 32, 201, 85, + // above=B_VL, left=B_VL + 86, 6, 28, 5, 64, 255, 25, 248, 1, + // above=B_VL, left=B_HD + 56, 8, 17, 132, 137, 255, 55, 116, 128, + // above=B_VL, left=B_HU + 58, 15, 20, 82, 135, 57, 26, 121, 40, + + // above=B_HD(8), left=B_DC(0) + 164, 50, 31, 137, 154, 133, 25, 35, 218, + // above=B_HD, left=B_TM + 51, 103, 44, 131, 131, 123, 31, 6, 158, + // above=B_HD, left=B_VE + 86, 40, 64, 135, 148, 224, 45, 183, 128, + // above=B_HD, left=B_HE + 22, 26, 17, 131, 240, 154, 14, 1, 209, + // above=B_HD, left=B_LD + 83, 12, 13, 54, 192, 255, 68, 47, 28, + // above=B_HD, left=B_RD + 45, 16, 21, 91, 64, 222, 7, 1, 197, + // above=B_HD, 
left=B_VR + 56, 21, 39, 155, 60, 138, 23, 102, 213, + // above=B_HD, left=B_VL + 85, 26, 85, 85, 128, 128, 32, 146, 171, + // above=B_HD, left=B_HD + 18, 11, 7, 63, 144, 171, 4, 4, 246, + // above=B_HD, left=B_HU + 35, 27, 10, 146, 174, 171, 12, 26, 128, + + // above=B_HU(9), left=B_DC(0) + 190, 80, 35, 99, 180, 80, 126, 54, 45, + // above=B_HU, left=B_TM + 85, 126, 47, 87, 176, 51, 41, 20, 32, + // above=B_HU, left=B_VE + 101, 75, 128, 139, 118, 146, 116, 128, 85, + // above=B_HU, left=B_HE + 56, 41, 15, 176, 236, 85, 37, 9, 62, + // above=B_HU, left=B_LD + 146, 36, 19, 30, 171, 255, 97, 27, 20, + // above=B_HU, left=B_RD + 71, 30, 17, 119, 118, 255, 17, 18, 138, + // above=B_HU, left=B_VR + 101, 38, 60, 138, 55, 70, 43, 26, 142, + // above=B_HU, left=B_VL + 138, 45, 61, 62, 219, 1, 81, 188, 64, + // above=B_HU, left=B_HD + 32, 41, 20, 117, 151, 142, 20, 21, 163, + // above=B_HU, left=B_HU + 112, 19, 12, 61, 195, 128, 48, 4, 24, +] + +// -------- + +// RENORM_SHIFT_256 maps a range-1 value (0..254) to the number of left shifts +// needed for renormalization. Indices 0-127: shift to bring range into +// [128, 255] (matching libwebp's kVP8Log2Range). Indices 128-254: 0 (already +// normalized). Enables branchless renorm: shift = table[lr]; lr = +// range_table[lr]; lb -= shift. 
+pri const RENORM_SHIFT_256 : roarray[256] base.u8[..= 7] = [
+    // 16 entries per row, so row k holds indices (16 * k) ..= (16 * k) + 15.
+    // Reading the rows: index 0 -> 7, 1..=2 -> 6, 3..=6 -> 5, 7..=14 -> 4,
+    // 15..=30 -> 3, 31..=62 -> 2, 63..=126 -> 1 and 127..=255 -> 0.
+    7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3,
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+]
+
+// RENORM_RANGE_256 maps a range-1 value (0..254) to the post-shift range-1.
+// Indices 0-127: renormalized values in [127, 253] (matching libwebp's
+// kVP8NewRange). Indices 128-254: identity (already normalized).
+pri const RENORM_RANGE_256 : roarray[256] base.u8 = [
+    // 16 entries per row. The last entry (index 255) holds 254 rather than
+    // 255, so no entry of this table exceeds 254.
+    // NOTE(review): index 255 looks like padding so that an `& 0xFF` index
+    // is always in bounds - confirm it is unreachable for valid ranges.
+    127, 127, 191, 127, 159, 191, 223, 127, 143, 159, 175, 191, 207, 223, 239, 127,
+    135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239, 247, 127,
+    131, 135, 139, 143, 147, 151, 155, 159, 163, 167, 171, 175, 179, 183, 187, 191,
+    195, 199, 203, 207, 211, 215, 219, 223, 227, 231, 235, 239, 243, 247, 251, 127,
+    129, 131, 133, 135, 137, 139, 141, 143, 145, 147, 149, 151, 153, 155, 157, 159,
+    161, 163, 165, 167, 169, 171, 173, 175, 177, 179, 181, 183, 185, 187, 189, 191,
+    193, 195, 197, 199, 201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223,
+    225, 227, 229, 231, 233, 235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 127,
+    128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
+    144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
+    160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
+    176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
+    192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
+    208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
+    224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+    240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 254,
+]
diff --git a/std/vp8/decode_bool.wuffs b/std/vp8/decode_bool.wuffs
new file mode 100644
index 000000000..bdc11bbab
--- /dev/null
+++ b/std/vp8/decode_bool.wuffs
@@ -0,0 +1,341 @@
+// Copyright 2024 The Wuffs Authors.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+//
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+// VP8 boolean (arithmetic) decoder per RFC 6386 Section 7.
+//
+// Partition 0 (mode data) state — wide-accumulator approach:
+//   bool_range: [0, 254], range-1 convention (actual range = bool_range + 1)
+//   bool_value (u64): wide accumulator
+//   bool_bits (u32): position counter
+//
+// The boolean decoder reads from data previously buffered into
+// this.bool_buffer[this.bool_ri .. this.bool_wi].
+
+// bool_init! resets the partition 0 boolean decoder and pre-loads the wide
+// accumulator from the buffer: one byte per iteration while bool_bits <= 48
+// and buffered bytes remain, i.e. up to seven bytes (bool_bits ends at most
+// at 56). Uses the same wide-accumulator approach as partition 1.
+pri func decoder.bool_init!() {
+    var bb : base.u64
+
+    this.bool_range = 254  // range-1 convention: actual range 255
+    this.bool_value = 0
+    this.bool_bits = 0
+
+    // Pre-load bytes into the wide accumulator.
+    while (this.bool_bits <= 48) and (this.bool_ri < this.bool_wi) {
+        assert this.bool_ri < 0x1000 via "a < b: a < c; c <= b"(c: this.bool_wi)
+        bb = this.bool_buffer[this.bool_ri] as base.u64
+        this.bool_ri += 1
+        this.bool_value = (this.bool_value ~mod<< 8) | bb
+        this.bool_bits ~mod+= 8
+    }
+}
+
+// bool_read_bool! reads a single boolean with the given probability (0..255).
+// Returns 0 or 1.
+//
+// Uses range-1 convention and position-based value tracking (matching
+// partition 1): the active bits are bool_value >> ((bool_bits - 8) & 63),
+// and renormalization only decrements bool_bits via the 256-entry tables.
+//
+// When the renormalization shift exceeds the remaining bool_bits, the
+// accumulator is reset to zero (bool_value = 0, bool_bits = 56) instead of
+// letting the unsigned subtraction wrap. This is the same exhausted-input
+// handling that p1_read_bool! and p1_read_sign! apply.
+pri func decoder.bool_read_bool!(prob: base.u8) base.u32[..= 1] {
+    var s : base.u32
+    var retval : base.u32[..= 1]
+    var v : base.u32
+    var shift : base.u32[..= 7]
+    var bb : base.u64
+    var pos : base.u32
+
+    // Load bytes when bit position gets low.
+    if this.bool_bits < 16 {
+        while (this.bool_bits <= 48) and (this.bool_ri < this.bool_wi) {
+            assert this.bool_ri < 0x1000 via "a < b: a < c; c <= b"(c: this.bool_wi)
+            bb = this.bool_buffer[this.bool_ri] as base.u64
+            this.bool_ri += 1
+            this.bool_value = (this.bool_value ~mod<< 8) | bb
+            this.bool_bits ~mod+= 8
+        }
+    }
+
+    // s = floor(range_m1 * prob / 256). split = s + 1.
+    s = (this.bool_range * (args.prob as base.u32)) >> 8
+
+    // Extract active bits.
+    pos = (this.bool_bits ~mod- 8) & 63
+    v = ((this.bool_value >> pos) & 0xFFFF_FFFF) as base.u32
+
+    if v > s {
+        retval = 1
+        this.bool_value ~mod-= ((s ~mod+ 1) as base.u64) ~mod<< pos
+        this.bool_range = ((this.bool_range ~mod- s) ~mod- 1) & 0xFF
+    } else {
+        retval = 0
+        this.bool_range = s
+    }
+
+    // Branchless renormalization via extended 256-entry tables.
+    shift = RENORM_SHIFT_256[this.bool_range & 0xFF] as base.u32
+    this.bool_range = RENORM_RANGE_256[this.bool_range & 0xFF] as base.u32
+    // Exhausted input: if shift > bool_bits the unsigned subtraction would
+    // wrap to a huge position and no further bytes would ever be loaded.
+    // Reset the accumulator so subsequent reads see zero bits.
+    if shift > this.bool_bits {
+        this.bool_value = 0
+        this.bool_bits = 56
+    } else {
+        this.bool_bits ~mod-= shift
+    }
+
+    return retval
+}
+
+// bool_read_literal! reads n bits at probability 128 (uniform), most
+// significant bit first, and returns them packed into a u32.
+pri func decoder.bool_read_literal!(n: base.u32[..= 24]) base.u32 {
+    var result : base.u32
+    var i : base.u32
+    var bit : base.u32[..= 1]
+
+    result = 0
+    i = 0
+    while i < args.n,
+            inv i <= 24,
+    {
+        assert i < 24 via "a < b: a < c; c <= b"(c: args.n)
+        bit = this.bool_read_bool!(prob: 128)
+        result = (result ~mod<< 1) | bit
+        i += 1
+    }
+    return result
+}
+
+// bool_read_signed! reads an optional signed value. First reads a flag bit;
+// if it is 0, returns 0. Otherwise reads n bits of magnitude followed by a
+// sign bit, and returns the negated magnitude when the sign bit is set.
+pri func decoder.bool_read_signed!(n: base.u32[..= 24]) base.i32 {
+    var flag : base.u32[..= 1]
+    var magnitude : base.u32
+    var sign : base.u32[..= 1]
+
+    flag = this.bool_read_bool!(prob: 128)
+    if flag == 0 {
+        return 0
+    }
+
+    magnitude = this.bool_read_literal!(n: args.n)
+    // Mask to 31 bits so the conversion to i32 below is in range.
+    magnitude &= 0x7FFF_FFFF
+    sign = this.bool_read_bool!(prob: 128)
+    if sign <> 0 {
+        return -(magnitude as base.i32)
+    }
+    return magnitude as base.i32
+}
+
+// ---- Partition 1 (coefficient data) boolean decoder ----
+//
+// Range-1 convention: p1_range stores (range - 1) to save one subtract
+// per split computation (matches libwebp's convention).
+//
+// Position-based value tracking (inspired by libwebp's VP8GetBit):
+// p1_value (u64) is a wide accumulator. p1_bits (u32) is the position.
+// Extraction: v = p1_value >> ((p1_bits - 8) & 63), giving the active
+// ~8 bits for comparison against split.
+// Renormalization only decrements p1_bits — no value shifting needed.
+// Byte loading: triggered when p1_bits < 16; shifts value left by 8
+// per byte and ORs in the new byte, incrementing p1_bits by 8.
+//
+// p1_bits uses an implicit +8 offset relative to libwebp's br->bits
+// to keep unsigned: p1_bits=0 corresponds to libwebp's bits=-8.
+
+// p1_init! initializes the partition 1 boolean decoder: range-1 of 254 and
+// an empty accumulator. Unlike bool_init!, it does not pre-load any bytes.
+pri func decoder.p1_init!() {
+    this.p1_range = 254
+    this.p1_value = 0
+    this.p1_bits = 0
+    // Bytes are loaded on the first p1_read_bool call.
+}
+
+// p1_read_bool! reads a boolean from partition 1 with the given probability
+// and returns 0 or 1.
+// Uses range-1 convention and position-based value tracking.
+// Table-based renormalization decrements p1_bits without shifting value.
+pri func decoder.p1_read_bool!(prob: base.u8) base.u32[..= 1] {
+    var s : base.u32
+    var retval : base.u32[..= 1]
+    var v : base.u32
+    var shift : base.u32[..= 7]
+    var bb : base.u64
+    var pos : base.u32
+
+    // Load bytes when bit position gets low. Bytes come from
+    // p1_buffer[p1_ri .. p1_wi]; nothing is loaded if that range is empty.
+    if this.p1_bits < 16 {
+        while (this.p1_bits <= 48) and (this.p1_ri < this.p1_wi) {
+            assert this.p1_ri < 0x1000 via "a < b: a < c; c <= b"(c: this.p1_wi)
+            bb = this.p1_buffer[this.p1_ri] as base.u64
+            this.p1_ri += 1
+            this.p1_value = (this.p1_value ~mod<< 8) | bb
+            this.p1_bits ~mod+= 8
+        }
+    }
+
+    // s = floor(range_m1 * prob / 256). split = s + 1.
+    s = (this.p1_range * (args.prob as base.u32)) >> 8
+
+    // Extract active bits: value >> actual_pos, where actual_pos = p1_bits - 8.
+    pos = (this.p1_bits ~mod- 8) & 63
+    v = ((this.p1_value >> pos) & 0xFFFF_FFFF) as base.u32
+
+    if v > s {
+        // Upper sub-interval: remove split (s + 1) from the accumulator at
+        // the current position and shrink the range accordingly.
+        retval = 1
+        this.p1_value ~mod-= ((s ~mod+ 1) as base.u64) ~mod<< pos
+        this.p1_range = ((this.p1_range ~mod- s) ~mod- 1) & 0xFF
+    } else {
+        retval = 0
+        this.p1_range = s
+    }
+
+    // Branchless renormalization via extended 256-entry tables.
+    shift = RENORM_SHIFT_256[this.p1_range & 0xFF] as base.u32
+    this.p1_range = RENORM_RANGE_256[this.p1_range & 0xFF] as base.u32
+    // Detect EOF: if shift > p1_bits, the unsigned subtraction would wrap,
+    // meaning all valid bits are consumed. Set accumulator to produce 0 for
+    // all subsequent reads, matching libwebp's EOF behavior.
+    if shift > this.p1_bits {
+        this.p1_value = 0
+        this.p1_bits = 56
+    } else {
+        this.p1_bits ~mod-= shift
+    }
+
+    return retval
+}
+
+// p1_read_sign! reads a sign bit (prob=128) from partition 1.
+// Specialized: prob=128 means s = range_m1 / 2, and the new range is in
+// [64, 128] so at most 1 bit of normalization is needed.
+// The refill step has a fast path that loads four bytes at once when at
+// least four buffered bytes are available; otherwise it falls back to the
+// same byte-at-a-time loop as p1_read_bool!.
+pri func decoder.p1_read_sign!() base.u32[..= 1] {
+    var s : base.u32
+    var retval : base.u32[..= 1]
+    var v : base.u32
+    var shift : base.u32[..= 7]
+    var bb : base.u64
+    var pos : base.u32
+
+    // Load bytes when bit position gets low.
+    if this.p1_bits < 16 {
+        // The p1_ri < 0xFFD check keeps p1_ri + 3 below 0x1000.
+        if ((this.p1_ri ~mod+ 4) <= this.p1_wi) and (this.p1_ri < 0xFFD) {
+            this.p1_value = (this.p1_value ~mod<< 32) |
+                    ((this.p1_buffer[this.p1_ri + 0] as base.u64) ~mod<< 24) |
+                    ((this.p1_buffer[this.p1_ri + 1] as base.u64) ~mod<< 16) |
+                    ((this.p1_buffer[this.p1_ri + 2] as base.u64) ~mod<< 8) |
+                    (this.p1_buffer[this.p1_ri + 3] as base.u64)
+            this.p1_ri += 4
+            this.p1_bits ~mod+= 32
+        } else {
+            while (this.p1_bits <= 48) and (this.p1_ri < this.p1_wi) {
+                assert this.p1_ri < 0x1000 via "a < b: a < c; c <= b"(c: this.p1_wi)
+                bb = this.p1_buffer[this.p1_ri] as base.u64
+                this.p1_ri += 1
+                this.p1_value = (this.p1_value ~mod<< 8) | bb
+                this.p1_bits ~mod+= 8
+            }
+        }
+    }
+
+    // prob == 128, so (range_m1 * 128) >> 8 reduces to range_m1 >> 1.
+    s = this.p1_range >> 1
+
+    pos = (this.p1_bits ~mod- 8) & 63
+    v = ((this.p1_value >> pos) & 0xFFFF_FFFF) as base.u32
+
+    if v > s {
+        retval = 1
+        this.p1_value ~mod-= ((s ~mod+ 1) as base.u64) ~mod<< pos
+        this.p1_range = ((this.p1_range ~mod- s) ~mod- 1) & 0xFF
+    } else {
+        retval = 0
+        this.p1_range = s
+    }
+
+    // Branchless renormalization.
+    shift = RENORM_SHIFT_256[this.p1_range & 0xFF] as base.u32
+    this.p1_range = RENORM_RANGE_256[this.p1_range & 0xFF] as base.u32
+    // Same EOF handling as p1_read_bool!: reset instead of wrapping.
+    if shift > this.p1_bits {
+        this.p1_value = 0
+        this.p1_bits = 56
+    } else {
+        this.p1_bits ~mod-= shift
+    }
+
+    return retval
+}
+
+// p1_fill_buffer! reads data from src into the partition 1 buffer.
+// It first compacts the unread bytes p1_buffer[p1_ri .. p1_wi] to the front
+// of the buffer, then copies bytes from src one at a time until n bytes
+// have been copied, the buffer holds 0x1000 bytes, or src is empty.
+pri func decoder.p1_fill_buffer!(src: base.io_reader, n: base.u32) {
+    var remaining : base.u32
+    var c8 : base.u8
+
+    // Compact: move unread bytes to offset 0 and rebase the indices.
+    if (this.p1_ri > 0) and (this.p1_ri <= this.p1_wi) {
+        this.p1_buffer[.. 0x1000].copy_from_slice!(s: this.p1_buffer[this.p1_ri .. this.p1_wi])
+        this.p1_wi ~sat-= this.p1_ri
+        this.p1_ri = 0
+    }
+
+    remaining = args.n
+    while (remaining > 0) and (this.p1_wi < 0x1000) and (args.src.length() > 0) {
+        c8 = args.src.peek_u8()
+        args.src.skip_u32_fast!(actual: 1, worst_case: 1)
+        // Re-checks the bound already present in the loop condition.
+        if this.p1_wi < 0x1000 {
+            this.p1_buffer[this.p1_wi] = c8
+            this.p1_wi += 1
+        }
+        remaining -= 1
+    }
+}
+
+// p1_fill_from_workbuf! refills the partition 1 buffer from the current
+// coefficient partition's data in workbuf (multi-partition mode).
+// The source region starts at part_wbuf_offset[current_partition] and is
+// part_wbuf_size[current_partition] bytes long; current_part_wbuf_ri is the
+// read position within it. Copying also stops at the end of workbuf.
+pri func decoder.p1_fill_from_workbuf!(workbuf: slice base.u8) {
+    var idx : base.u64
+    var p : base.u32[..= 7]
+    var poff : base.u64
+
+    // Compact the buffer first.
+    if (this.p1_ri > 0) and (this.p1_ri <= this.p1_wi) {
+        this.p1_buffer[.. 0x1000].copy_from_slice!(s: this.p1_buffer[this.p1_ri .. this.p1_wi])
+        this.p1_wi ~sat-= this.p1_ri
+        this.p1_ri = 0
+    }
+
+    // Fill from current partition's workbuf region.
+    p = this.current_partition
+    poff = this.part_wbuf_offset[p]
+    while (this.p1_wi < 0x1000) and (this.current_part_wbuf_ri < this.part_wbuf_size[p]) {
+        idx = poff ~mod+ (this.current_part_wbuf_ri as base.u64)
+        if idx >= args.workbuf.length() {
+            break
+        }
+        this.p1_buffer[this.p1_wi] = args.workbuf[idx]
+        this.p1_wi += 1
+        this.current_part_wbuf_ri ~mod+= 1
+    }
+
+}
+
+// ---- Partition 0 (mode data) boolean decoder ----
+
+// bool_fill_from_workbuf!
refills the boolean decoder buffer from the
+// partition 0 data stored in workbuf (starting at workbuf_offset_v_end).
+// p0_wbuf_ri is the read position within that data and p0_wbuf_count its
+// length; copying also stops when bool_buffer holds 0x1000 bytes or the end
+// of workbuf is reached.
+pri func decoder.bool_fill_from_workbuf!(workbuf: slice base.u8) {
+    var idx : base.u64
+
+    // Compact the buffer first: move the unread bytes
+    // bool_buffer[bool_ri .. bool_wi] to offset 0 and rebase the indices.
+    if (this.bool_ri > 0) and (this.bool_ri <= this.bool_wi) {
+        this.bool_buffer[.. 0x1000].copy_from_slice!(s: this.bool_buffer[this.bool_ri .. this.bool_wi])
+        this.bool_wi ~sat-= this.bool_ri
+        this.bool_ri = 0
+    }
+
+    // Fill from workbuf partition 0 data (limited to bytes actually copied).
+    while (this.bool_wi < 0x1000) and (this.p0_wbuf_ri < this.p0_wbuf_count) {
+        idx = this.workbuf_offset_v_end ~mod+ (this.p0_wbuf_ri as base.u64)
+        if idx >= args.workbuf.length() {
+            break
+        }
+        this.bool_buffer[this.bool_wi] = args.workbuf[idx]
+        this.bool_wi += 1
+        this.p0_wbuf_ri ~mod+= 1
+    }
+}
diff --git a/std/vp8/decode_filter.wuffs b/std/vp8/decode_filter.wuffs
new file mode 100644
index 000000000..1c8e5384e
--- /dev/null
+++ b/std/vp8/decode_filter.wuffs
@@ -0,0 +1,800 @@
+// Copyright 2024 The Wuffs Authors.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+//
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+// VP8 post-reconstruction deblocking filter.
+//
+// filter_type=1 is the simple filter: processes macroblock boundary edges and
+// sub-block (4x4) edges on the Y plane only. Each edge is filtered using a
+// 2-pixel kernel: a = 3*(q0-p0) + clamp127(p1-q1), then split into a1/a2
+// adjustments with clamp15.
+//
+// filter_type=0 is the normal filter: processes Y, U, V planes using a
+// 2/4/6-pixel kernel (filter246).
+//
+// The filter is applied AFTER all macroblocks are reconstructed (RFC 6386 §15).
+// Per-MB filter parameters (level, ilevel, hlevel, inner flag) are stored
+// during reconstruction and read back during this pass.
+
+// apply_simple_filter_all!
applies the simple loop filter (filter_type=1).
+// Only processes the Y plane. Calls apply_simple_filter_row! once for each
+// macroblock row in 0 .. mb_height.
+pri func decoder.apply_simple_filter_all!(workbuf: slice base.u8) {
+    var mby : base.u32
+
+    mby = 0
+    while mby < this.mb_height {
+        this.apply_simple_filter_row!(workbuf: args.workbuf, mby: mby)
+        // Always advance. mby < mb_height held at the top of this iteration,
+        // so the increment cannot wrap. A capped increment (stopping at
+        // 0x3FF) would never terminate if mb_height exceeded 0x3FF.
+        mby ~mod+= 1
+    }
+}
+
+// apply_simple_filter_row! applies the simple loop filter to a single MB row.
+//
+// For each macroblock the stored filter level and inner flag are read at
+// index ((mby & 1) * 0x400) + mbx. Macroblocks with level 0 are skipped.
+// Otherwise, in order: the left MB edge (if mbx > 0), the three inner
+// vertical edges at x = 4, 8, 12 (if the inner flag is set), the top MB edge
+// (if mby > 0) and the three inner horizontal edges at y = 4, 8, 12 (if the
+// inner flag is set). MB edges use limit level + 4; inner edges use level.
+pri func decoder.apply_simple_filter_row!(workbuf: slice base.u8, mby: base.u32) {
+    var mbx : base.u32
+    var mb_idx : base.u32
+    var f_level : base.u32
+    var has_inner : base.bool
+    var mb_lim : base.u32
+    var sub_lim : base.u32
+    var y_off : base.u64
+    var r : base.u32
+    var idx : base.u64
+
+    mbx = 0
+    while mbx < this.mb_width {
+        mb_idx = ((args.mby & 1) * 0x400) ~mod+ mbx
+        if mb_idx >= 0x800 {
+            mbx ~mod+= 1
+            continue
+        }
+
+        f_level = this.mb_filter_level[mb_idx] as base.u32
+        if f_level == 0 {
+            mbx ~mod+= 1
+            continue
+        }
+
+        has_inner = this.mb_filter_inner[mb_idx] <> 0
+
+        sub_lim = f_level
+        mb_lim = sub_lim ~mod+ 4
+
+        // Offset of this macroblock's top-left Y pixel within workbuf.
+        y_off = ((args.mby as base.u64) * 16 * (this.y_stride as base.u64)) +
+                ((mbx as base.u64) * 16)
+
+        // Left MB edge: 16 rows, pixels across the edge are 1 byte apart.
+        if mbx > 0 {
+            r = 0
+            while r < 16 {
+                idx = y_off ~mod+ ((r as base.u64) * (this.y_stride as base.u64))
+                this.filter2!(workbuf: args.workbuf,
+                        q0_idx: idx, step: 1, limit: mb_lim)
+                r += 1
+            }
+        }
+
+        // Inner vertical edges at x = 4, 8, 12.
+        if has_inner {
+            r = 0
+            while r < 16 {
+                idx = y_off ~mod+ ((r as base.u64) * (this.y_stride as base.u64))
+                this.filter2!(workbuf: args.workbuf,
+                        q0_idx: idx ~mod+ 4, step: 1, limit: sub_lim)
+                this.filter2!(workbuf: args.workbuf,
+                        q0_idx: idx ~mod+ 8, step: 1, limit: sub_lim)
+                this.filter2!(workbuf: args.workbuf,
+                        q0_idx: idx ~mod+ 12, step: 1, limit: sub_lim)
+                r += 1
+            }
+        }
+
+        // Top MB edge.
+        if args.mby > 0 {
+            this.simple_vfilter_16!(workbuf: args.workbuf,
+                    q0_off: y_off, limit: mb_lim)
+        }
+
+        // Inner horizontal edges at y = 4, 8, 12.
+        if has_inner {
+            this.simple_vfilter_16!(workbuf: args.workbuf,
+                    q0_off: y_off ~mod+ (4 * (this.y_stride as base.u64)), limit: sub_lim)
+            this.simple_vfilter_16!(workbuf: args.workbuf,
+                    q0_off: y_off ~mod+ (8 * (this.y_stride as base.u64)), limit: sub_lim)
+            this.simple_vfilter_16!(workbuf: args.workbuf,
+                    q0_off: y_off ~mod+ (12 * (this.y_stride as base.u64)), limit: sub_lim)
+        }
+
+        // Always advance, as the two `continue` paths above already do. A
+        // capped increment (stopping at 0x3FF) would never terminate if
+        // mb_width exceeded 0x3FF.
+        mbx ~mod+= 1
+    }
+}
+
+// simple_vfilter_16! filters 16 contiguous pixels at a horizontal edge.
+// q0_off is the offset of the first q0 pixel (the first pixel below the edge).
+// Stride is this.y_stride. Processes pixels q0_off[0..16], with p0 at -stride,
+// p1 at -2*stride, q1 at +stride.
+pri func decoder.simple_vfilter_16!(workbuf: slice base.u8, q0_off: base.u64, limit: base.u32),
+        choosy,
+{
+    var r : base.u32
+
+    r = 0
+    while r < 16 {
+        this.filter2!(workbuf: args.workbuf,
+                q0_idx: args.q0_off ~mod+ (r as base.u64),
+                step: this.y_stride as base.u64,
+                limit: args.limit)
+        r += 1
+    }
+}
+
+// normal_vfilter_inner_16! filters 16 contiguous pixels at a horizontal edge
+// using the normal filter4 (inner sub-block edges, four_not_six=true).
+// q0_off is the offset of the first q0 pixel, stride is y_stride.
+pri func decoder.normal_vfilter_inner_16!(workbuf: slice base.u8, q0_off: base.u64,
+        level: base.u32, ilevel: base.u32, hlevel: base.u32),
+        choosy,
+{
+    var r : base.u32
+
+    r = 0
+    while r < 16 {
+        this.filter246!(workbuf: args.workbuf,
+                q0_idx: args.q0_off ~mod+ (r as base.u64),
+                step: this.y_stride as base.u64,
+                level: args.level, ilevel: args.ilevel, hlevel: args.hlevel,
+                four_not_six: true)
+        r += 1
+    }
+}
+
+// normal_vfilter_mb_16! filters 16 contiguous pixels at a horizontal edge
+// using the normal filter6 (MB boundary edges, four_not_six=false).
+pri func decoder.normal_vfilter_mb_16!(workbuf: slice base.u8, q0_off: base.u64,
+        level: base.u32, ilevel: base.u32, hlevel: base.u32),
+        choosy,
+{
+    var col : base.u32
+
+    // One filter246! call per column; pixels across the edge are y_stride
+    // bytes apart.
+    col = 0
+    while col < 16 {
+        this.filter246!(workbuf: args.workbuf,
+                q0_idx: args.q0_off ~mod+ (col as base.u64),
+                step: this.y_stride as base.u64,
+                level: args.level, ilevel: args.ilevel, hlevel: args.hlevel,
+                four_not_six: false)
+        col += 1
+    }
+}
+
+// normal_vfilter_mb_8! is the 8-pixel-wide counterpart of
+// normal_vfilter_mb_16!: a horizontal MB boundary edge filtered with the
+// normal filter6 (four_not_six=false), stepping across the edge by
+// uv_stride. Used for U/V planes where the edge is 8 pixels wide.
+pri func decoder.normal_vfilter_mb_8!(workbuf: slice base.u8, q0_off: base.u64,
+        level: base.u32, ilevel: base.u32, hlevel: base.u32),
+        choosy,
+{
+    var col : base.u32
+
+    col = 0
+    while col < 8 {
+        this.filter246!(workbuf: args.workbuf,
+                q0_idx: args.q0_off ~mod+ (col as base.u64),
+                step: this.uv_stride as base.u64,
+                level: args.level, ilevel: args.ilevel, hlevel: args.hlevel,
+                four_not_six: false)
+        col += 1
+    }
+}
+
+// normal_hfilter_mb_16! applies the normal filter6 to a vertical MB boundary
+// edge on the Y plane. It visits 16 rows, each y_stride bytes below the
+// previous one; within a row the edge pixels (p3..q3) are consecutive bytes,
+// hence step=1.
+pri func decoder.normal_hfilter_mb_16!(workbuf: slice base.u8, q0_off: base.u64,
+        level: base.u32, ilevel: base.u32, hlevel: base.u32),
+        choosy,
+{
+    var row : base.u32
+
+    row = 0
+    while row < 16 {
+        this.filter246!(workbuf: args.workbuf,
+                q0_idx: args.q0_off ~mod+ ((row as base.u64) * (this.y_stride as base.u64)),
+                step: 1,
+                level: args.level, ilevel: args.ilevel, hlevel: args.hlevel,
+                four_not_six: false)
+        row += 1
+    }
+}
+
+// normal_hfilter_mb_8! filters 8 rows at a vertical MB boundary edge
+// using the normal filter6. Used for U/V planes.
+pri func decoder.normal_hfilter_mb_8!(workbuf: slice base.u8, q0_off: base.u64,
+        level: base.u32, ilevel: base.u32, hlevel: base.u32),
+        choosy,
+{
+    var row : base.u32
+
+    // One filter246! call per row; rows are uv_stride bytes apart and the
+    // pixels across the edge are adjacent bytes (step=1).
+    row = 0
+    while row < 8 {
+        this.filter246!(workbuf: args.workbuf,
+                q0_idx: args.q0_off ~mod+ ((row as base.u64) * (this.uv_stride as base.u64)),
+                step: 1,
+                level: args.level, ilevel: args.ilevel, hlevel: args.hlevel,
+                four_not_six: false)
+        row += 1
+    }
+}
+
+// normal_hfilter_inner_16! applies the normal filter4 (four_not_six=true) to
+// a vertical inner sub-block edge on the Y plane, visiting 16 rows that are
+// y_stride bytes apart. The Y plane has 3 such inner edges per MB.
+pri func decoder.normal_hfilter_inner_16!(workbuf: slice base.u8, q0_off: base.u64,
+        level: base.u32, ilevel: base.u32, hlevel: base.u32),
+        choosy,
+{
+    var row : base.u32
+
+    row = 0
+    while row < 16 {
+        this.filter246!(workbuf: args.workbuf,
+                q0_idx: args.q0_off ~mod+ ((row as base.u64) * (this.y_stride as base.u64)),
+                step: 1,
+                level: args.level, ilevel: args.ilevel, hlevel: args.hlevel,
+                four_not_six: true)
+        row += 1
+    }
+}
+
+// normal_hfilter_inner_8! applies the normal filter4 (four_not_six=true) to
+// a vertical inner sub-block edge on a U or V plane, visiting 8 rows that
+// are uv_stride bytes apart. Each chroma plane has 1 such inner edge.
+pri func decoder.normal_hfilter_inner_8!(workbuf: slice base.u8, q0_off: base.u64,
+        level: base.u32, ilevel: base.u32, hlevel: base.u32),
+        choosy,
+{
+    var row : base.u32
+
+    row = 0
+    while row < 8 {
+        this.filter246!(workbuf: args.workbuf,
+                q0_idx: args.q0_off ~mod+ ((row as base.u64) * (this.uv_stride as base.u64)),
+                step: 1,
+                level: args.level, ilevel: args.ilevel, hlevel: args.hlevel,
+                four_not_six: true)
+        row += 1
+    }
+}
+
+// normal_vfilter_inner_8! filters 8 contiguous pixels at a horizontal inner
+// sub-block edge using the normal filter4. Used for U/V planes.
+pri func decoder.normal_vfilter_inner_8!(workbuf: slice base.u8, q0_off: base.u64,
+        level: base.u32, ilevel: base.u32, hlevel: base.u32),
+        choosy,
+{
+    var col : base.u32
+
+    // One filter246! call per column; pixels across the edge are uv_stride
+    // bytes apart.
+    col = 0
+    while col < 8 {
+        this.filter246!(workbuf: args.workbuf,
+                q0_idx: args.q0_off ~mod+ (col as base.u64),
+                step: this.uv_stride as base.u64,
+                level: args.level, ilevel: args.ilevel, hlevel: args.hlevel,
+                four_not_six: true)
+        col += 1
+    }
+}
+
+// ---- Combined U+V filter functions (scalar defaults) ----
+
+// normal_vfilter_mb_uv! handles a horizontal MB boundary edge on both chroma
+// planes with the normal filter6: it runs normal_vfilter_mb_8! at u_off and
+// then again at v_off, with the same level/ilevel/hlevel.
+pri func decoder.normal_vfilter_mb_uv!(workbuf: slice base.u8,
+        u_off: base.u64, v_off: base.u64,
+        level: base.u32, ilevel: base.u32, hlevel: base.u32),
+        choosy,
+{
+    // U plane first, then V plane.
+    this.normal_vfilter_mb_8!(workbuf: args.workbuf,
+            q0_off: args.u_off,
+            level: args.level, ilevel: args.ilevel, hlevel: args.hlevel)
+    this.normal_vfilter_mb_8!(workbuf: args.workbuf,
+            q0_off: args.v_off,
+            level: args.level, ilevel: args.ilevel, hlevel: args.hlevel)
+}
+
+// normal_hfilter_mb_uv! handles a vertical MB boundary edge on both chroma
+// planes with the normal filter6: it runs normal_hfilter_mb_8! at u_off and
+// then again at v_off, with the same level/ilevel/hlevel.
+pri func decoder.normal_hfilter_mb_uv!(workbuf: slice base.u8,
+        u_off: base.u64, v_off: base.u64,
+        level: base.u32, ilevel: base.u32, hlevel: base.u32),
+        choosy,
+{
+    // U plane first, then V plane.
+    this.normal_hfilter_mb_8!(workbuf: args.workbuf,
+            q0_off: args.u_off,
+            level: args.level, ilevel: args.ilevel, hlevel: args.hlevel)
+    this.normal_hfilter_mb_8!(workbuf: args.workbuf,
+            q0_off: args.v_off,
+            level: args.level, ilevel: args.ilevel, hlevel: args.hlevel)
+}
+
+// normal_vfilter_inner_uv! filters both U and V planes at a horizontal inner
+// sub-block edge using the normal filter4. Combines two normal_vfilter_inner_8 calls.
+pri func decoder.normal_vfilter_inner_uv!(workbuf: slice base.u8,
+        u_off: base.u64, v_off: base.u64,
+        level: base.u32, ilevel: base.u32, hlevel: base.u32),
+        choosy,
+{
+    // U plane first, then V plane, with identical filter parameters.
+    this.normal_vfilter_inner_8!(workbuf: args.workbuf,
+            q0_off: args.u_off,
+            level: args.level, ilevel: args.ilevel, hlevel: args.hlevel)
+    this.normal_vfilter_inner_8!(workbuf: args.workbuf,
+            q0_off: args.v_off,
+            level: args.level, ilevel: args.ilevel, hlevel: args.hlevel)
+}
+
+// normal_hfilter_inner_uv! filters both U and V planes at a vertical inner
+// sub-block edge using the normal filter4. Combines two normal_hfilter_inner_8 calls.
+pri func decoder.normal_hfilter_inner_uv!(workbuf: slice base.u8,
+        u_off: base.u64, v_off: base.u64,
+        level: base.u32, ilevel: base.u32, hlevel: base.u32),
+        choosy,
+{
+    // U plane first, then V plane, with identical filter parameters.
+    this.normal_hfilter_inner_8!(workbuf: args.workbuf,
+            q0_off: args.u_off,
+            level: args.level, ilevel: args.ilevel, hlevel: args.hlevel)
+    this.normal_hfilter_inner_8!(workbuf: args.workbuf,
+            q0_off: args.v_off,
+            level: args.level, ilevel: args.ilevel, hlevel: args.hlevel)
+}
+
+// filter2! filters one pixel position along an edge using the simple filter.
+// Implements Go's filter2: adjust p0/q0 based on the 4-pixel neighborhood.
+// q0_idx is the index of q0 (first pixel inside the edge).
+// step is the distance between pixels across the edge (1=vertical, stride=horizontal).
+//
+// The four pixels are p1 = q0_idx - 2*step, p0 = q0_idx - step, q0 = q0_idx
+// and q1 = q0_idx + step. The call is a no-op if any of those indices would
+// fall outside workbuf, or if abs(p0-q0)*2 + abs(p1-q1)/2 exceeds limit.
+// Only p0 and q0 are written back.
+pri func decoder.filter2!(workbuf: slice base.u8, q0_idx: base.u64, step: base.u64, limit: base.u32) {
+    var p1_idx : base.u64
+    var p0_idx : base.u64
+    var q1_idx : base.u64
+    var p1 : base.u32
+    var p0 : base.u32
+    var q0 : base.u32
+    var q1 : base.u32
+    var dp0q0 : base.u32
+    var dp1q1 : base.u32
+    var thresh : base.u32
+    var a : base.u32
+    var a1 : base.u32
+    var a2 : base.u32
+    var pq_diff : base.u32
+    var val : base.u32
+
+    // Compute pixel indices: p1, p0 | q0, q1.
+    // The guards keep the two subtractions from underflowing.
+    if args.q0_idx < args.step {
+        return nothing
+    }
+    p0_idx = args.q0_idx - args.step
+    if p0_idx < args.step {
+        return nothing
+    }
+    p1_idx = p0_idx - args.step
+    q1_idx = args.q0_idx ~mod+ args.step
+
+    // Bounds check.
+    if (q1_idx >= args.workbuf.length()) or
+            (args.q0_idx >= args.workbuf.length()) or
+            (p0_idx >= args.workbuf.length()) or
+            (p1_idx >= args.workbuf.length()) {
+        return nothing
+    }
+
+    p1 = args.workbuf[p1_idx] as base.u32
+    p0 = args.workbuf[p0_idx] as base.u32
+    q0 = args.workbuf[args.q0_idx] as base.u32
+    q1 = args.workbuf[q1_idx] as base.u32
+
+    // Threshold check: abs(p0-q0)*2 + abs(p1-q1)/2 <= limit.
+    // Use signed subtraction via u32 modular arithmetic: a set top bit means
+    // the difference is negative, so negate it to get the absolute value.
+    dp0q0 = p0 ~mod- q0
+    if (dp0q0 & 0x8000_0000) <> 0 {
+        dp0q0 = 0 ~mod- dp0q0
+    }
+    dp0q0 = dp0q0 & 0xFF
+    dp1q1 = p1 ~mod- q1
+    if (dp1q1 & 0x8000_0000) <> 0 {
+        dp1q1 = 0 ~mod- dp1q1
+    }
+    dp1q1 = dp1q1 & 0xFF
+    thresh = (dp0q0 * 2) + (dp1q1 >> 1)
+    if thresh > args.limit {
+        return nothing
+    }
+
+    // a = 3*(q0-p0) + clamp127(p1-q1).
+    // All arithmetic in signed 32-bit via u32 modular ops.
+    pq_diff = p1 ~mod- q1
+    // clamp127: clamp to [-128, 127].
+    if (pq_diff & 0x8000_0000) <> 0 {
+        if pq_diff < 0xFFFF_FF80 {
+            pq_diff = 0xFFFF_FF80
+        }
+    } else {
+        if pq_diff > 127 {
+            pq_diff = 127
+        }
+    }
+
+    a = (3 ~mod* (q0 ~mod- p0)) ~mod+ pq_diff
+
+    // clamp15: a1 = clamp((a+4)>>3, -16, 15), a2 = clamp((a+3)>>3, -16, 15).
+    a1 = this.clamp15_asr3!(v: a ~mod+ 4)
+    a2 = this.clamp15_asr3!(v: a ~mod+ 3)
+
+    // p0 = clamp(p0 + a2, 0, 255).
+    // A set top bit means the sum went negative, so it clamps to 0.
+    val = p0 ~mod+ a2
+    if val > 255 {
+        if (val & 0x8000_0000) <> 0 {
+            val = 0
+        } else {
+            val = 255
+        }
+    }
+    args.workbuf[p0_idx] = (val & 0xFF) as base.u8
+
+    // q0 = clamp(q0 - a1, 0, 255).
+    val = q0 ~mod- a1
+    if val > 255 {
+        if (val & 0x8000_0000) <> 0 {
+            val = 0
+        } else {
+            val = 255
+        }
+    }
+    args.workbuf[args.q0_idx] = (val & 0xFF) as base.u8
+}
+
+// clamp15_asr3! performs arithmetic right shift by 3 and clamps to [-16, 15].
+// Input is in u32 two's complement.
+pri func decoder.clamp15_asr3!(v: base.u32) base.u32 {
+    var result : base.u32
+
+    // Arithmetic right shift by 3: for a negative input, refill the top
+    // three bits that the logical shift cleared.
+    if (args.v & 0x8000_0000) <> 0 {
+        result = (args.v >> 3) | 0xE000_0000
+    } else {
+        result = args.v >> 3
+    }
+
+    // Clamp to [-16, 15].
+    if (result & 0x8000_0000) <> 0 {
+        // Negative.
+        if result < 0xFFFF_FFF0 {
+            result = 0xFFFF_FFF0
+        }
+    } else {
+        // Positive.
+        if result > 15 {
+            result = 15
+        }
+    }
+
+    return result
+}
+
+// clamp127! clamps a signed value (in u32 two's complement) to [-128, 127].
+pri func decoder.clamp127!(v: base.u32) base.u32 {
+    if (args.v & 0x8000_0000) <> 0 {
+        if args.v < 0xFFFF_FF80 {
+            return 0xFFFF_FF80
+        }
+    } else {
+        if args.v > 127 {
+            return 127
+        }
+    }
+    return args.v
+}
+
+// abs_u32! returns the absolute value of a signed value in u32 two's complement.
+// Result is always in [0, 0x8000_0000].
+pri func decoder.abs_u32!(v: base.u32) base.u32 {
+    if (args.v & 0x8000_0000) <> 0 {
+        return 0 ~mod- args.v
+    }
+    return args.v
+}
+
+// clamp255! clamps a signed value (in u32 two's complement) to [0, 255].
+pri func decoder.clamp255!(v: base.u32) base.u32 {
+    if (args.v & 0x8000_0000) <> 0 {
+        return 0
+    }
+    if args.v > 255 {
+        return 255
+    }
+    return args.v
+}
+
+// ---- Normal filter (filter_type=0) ----
+
+// apply_normal_filter_all! applies the normal loop filter (filter_type=0).
+// Processes Y, U, and V planes. Calls apply_normal_filter_row! once for each
+// macroblock row in 0 .. mb_height.
+pri func decoder.apply_normal_filter_all!(workbuf: slice base.u8) {
+    var mby : base.u32
+
+    mby = 0
+    while mby < this.mb_height {
+        this.apply_normal_filter_row!(workbuf: args.workbuf, mby: mby)
+        // Always advance. mby < mb_height held at the top of this iteration,
+        // so the increment cannot wrap. A capped increment (stopping at
+        // 0x3FF) would never terminate if mb_height exceeded 0x3FF.
+        mby ~mod+= 1
+    }
+}
+
+// apply_normal_filter_row! applies the normal loop filter to a single MB row.
+//
+// For each macroblock column mbx in the row it looks up the per-macroblock
+// filter parameters (indexed by (mby & 1) * 0x400 + mbx), skips macroblocks
+// whose filter level is zero, and then filters, in this order: the left MB
+// edge (if mbx > 0), the inner vertical edges (if has_inner), the top MB edge
+// (if mby > 0) and the inner horizontal edges (if has_inner). MB edges use
+// level + 4; inner edges use level unchanged.
+pri func decoder.apply_normal_filter_row!(workbuf: slice base.u8, mby: base.u32) {
+    var mbx : base.u32
+    var mb_idx : base.u32
+    var f_level : base.u32
+    var f_ilevel : base.u32
+    var f_hlevel : base.u32
+    var has_inner : base.bool
+    var y_off : base.u64
+    var u_off : base.u64
+    var v_off : base.u64
+
+    mbx = 0
+    while mbx < this.mb_width {
+        mb_idx = ((args.mby & 1) * 0x400) ~mod+ mbx
+        if mb_idx >= 0x800 {
+            mbx ~mod+= 1
+            continue
+        }
+
+        f_level = this.mb_filter_level[mb_idx] as base.u32
+        if f_level == 0 {
+            mbx ~mod+= 1
+            continue
+        }
+
+        f_ilevel = this.mb_filter_ilevel[mb_idx] as base.u32
+        f_hlevel = this.mb_filter_hlevel[mb_idx] as base.u32
+        has_inner = this.mb_filter_inner[mb_idx] <> 0
+
+        // Offsets of the macroblock's top-left sample in each plane: 16x16
+        // samples for Y, 8x8 for U and V.
+        y_off = ((args.mby as base.u64) * 16 * (this.y_stride as base.u64)) +
+                ((mbx as base.u64) * 16)
+        u_off = this.workbuf_offset_y_end +
+                ((args.mby as base.u64) * 8 * (this.uv_stride as base.u64)) +
+                ((mbx as base.u64) * 8)
+        v_off = this.workbuf_offset_u_end +
+                ((args.mby as base.u64) * 8 * (this.uv_stride as base.u64)) +
+                ((mbx as base.u64) * 8)
+
+        // --- Vertical edges (left-to-right filtering) ---
+
+        if mbx > 0 {
+            this.normal_hfilter_mb_16!(workbuf: args.workbuf,
+                    q0_off: y_off,
+                    level: f_level ~mod+ 4, ilevel: f_ilevel, hlevel: f_hlevel)
+            this.normal_hfilter_mb_uv!(workbuf: args.workbuf,
+                    u_off: u_off, v_off: v_off,
+                    level: f_level ~mod+ 4, ilevel: f_ilevel, hlevel: f_hlevel)
+        }
+
+        if has_inner {
+            this.normal_hfilter_inner_16!(workbuf: args.workbuf,
+                    q0_off: y_off ~mod+ 4,
+                    level: f_level, ilevel: f_ilevel, hlevel: f_hlevel)
+            this.normal_hfilter_inner_16!(workbuf: args.workbuf,
+                    q0_off: y_off ~mod+ 8,
+                    level: f_level, ilevel: f_ilevel, hlevel: f_hlevel)
+            this.normal_hfilter_inner_16!(workbuf: args.workbuf,
+                    q0_off: y_off ~mod+ 12,
+                    level: f_level, ilevel: f_ilevel, hlevel: f_hlevel)
+            this.normal_hfilter_inner_uv!(workbuf: args.workbuf,
+                    u_off: u_off ~mod+ 4, v_off: v_off ~mod+ 4,
+                    level: f_level, ilevel: f_ilevel, hlevel: f_hlevel)
+        }
+
+        // --- Horizontal edges (top-to-bottom filtering) ---
+
+        if args.mby > 0 {
+            this.normal_vfilter_mb_16!(workbuf: args.workbuf,
+                    q0_off: y_off,
+                    level: f_level ~mod+ 4, ilevel: f_ilevel, hlevel: f_hlevel)
+            this.normal_vfilter_mb_uv!(workbuf: args.workbuf,
+                    u_off: u_off, v_off: v_off,
+                    level: f_level ~mod+ 4, ilevel: f_ilevel, hlevel: f_hlevel)
+        }
+
+        if has_inner {
+            this.normal_vfilter_inner_16!(workbuf: args.workbuf,
+                    q0_off: y_off ~mod+ (4 * (this.y_stride as base.u64)),
+                    level: f_level, ilevel: f_ilevel, hlevel: f_hlevel)
+            this.normal_vfilter_inner_16!(workbuf: args.workbuf,
+                    q0_off: y_off ~mod+ (8 * (this.y_stride as base.u64)),
+                    level: f_level, ilevel: f_ilevel, hlevel: f_hlevel)
+            this.normal_vfilter_inner_16!(workbuf: args.workbuf,
+                    q0_off: y_off ~mod+ (12 * (this.y_stride as base.u64)),
+                    level: f_level, ilevel: f_ilevel, hlevel: f_hlevel)
+            this.normal_vfilter_inner_uv!(workbuf: args.workbuf,
+                    u_off: u_off ~mod+ (4 * (this.uv_stride as base.u64)),
+                    v_off: v_off ~mod+ (4 * (this.uv_stride as base.u64)),
+                    level: f_level, ilevel: f_ilevel, hlevel: f_hlevel)
+        }
+
+        // The column counter saturates at 0x3FF. Leave the loop when that
+        // happens: otherwise mbx would stop advancing and, if mb_width
+        // exceeds 0x3FF, the loop condition would stay true forever.
+        if mbx < 0x3FF {
+            mbx += 1
+        } else {
+            break
+        }
+    }
+}
+
+// filter246! implements Go's filter246 — the normal loop filter for one edge position.
+// Reads 8 pixels (p3,p2,p1,p0,q0,q1,q2,q3) and filters 2, 4, or 6 of them.
+//
+// args.q0_idx is the workbuf index of q0 and args.step is the distance between
+// consecutive pixels across the edge. The function returns without writing
+// anything if any of the eight indices would fall outside args.workbuf, or if
+// the level / ilevel thresholds are exceeded. args.four_not_six selects the
+// 4-pixel variant over the 6-pixel variant when the hlevel test does not fire.
+//
+// All signed arithmetic is carried in base.u32 two's complement using the
+// ~mod operators.
+pri func decoder.filter246!(workbuf: slice base.u8, q0_idx: base.u64, step: base.u64,
+        level: base.u32, ilevel: base.u32, hlevel: base.u32, four_not_six: base.bool) {
+    var p3_idx : base.u64
+    var p2_idx : base.u64
+    var p1_idx : base.u64
+    var p0_idx : base.u64
+    var q1_idx : base.u64
+    var q2_idx : base.u64
+    var q3_idx : base.u64
+    var p3 : base.u32
+    var p2 : base.u32
+    var p1 : base.u32
+    var p0 : base.u32
+    var q0 : base.u32
+    var q1 : base.u32
+    var q2 : base.u32
+    var q3 : base.u32
+    var a : base.u32
+    var a1 : base.u32
+    var a2 : base.u32
+    var a3 : base.u32
+    var t1 : base.u32
+    var t2 : base.u32
+
+    // Compute pixel indices: p3, p2, p1, p0 | q0, q1, q2, q3.
+    // Each subtraction is guarded so that it cannot underflow.
+    if args.q0_idx < args.step {
+        return nothing
+    }
+    p0_idx = args.q0_idx - args.step
+    if p0_idx < args.step {
+        return nothing
+    }
+    p1_idx = p0_idx - args.step
+    if p1_idx < args.step {
+        return nothing
+    }
+    p2_idx = p1_idx - args.step
+    if p2_idx < args.step {
+        return nothing
+    }
+    p3_idx = p2_idx - args.step
+    q1_idx = args.q0_idx ~mod+ args.step
+    q2_idx = q1_idx ~mod+ args.step
+    q3_idx = q2_idx ~mod+ args.step
+
+    // Bounds check. This also rejects q indices that wrapped around above.
+    if (q3_idx >= args.workbuf.length()) or
+            (q2_idx >= args.workbuf.length()) or
+            (q1_idx >= args.workbuf.length()) or
+            (args.q0_idx >= args.workbuf.length()) or
+            (p0_idx >= args.workbuf.length()) or
+            (p1_idx >= args.workbuf.length()) or
+            (p2_idx >= args.workbuf.length()) or
+            (p3_idx >= args.workbuf.length()) {
+        return nothing
+    }
+
+    p3 = args.workbuf[p3_idx] as base.u32
+    p2 = args.workbuf[p2_idx] as base.u32
+    p1 = args.workbuf[p1_idx] as base.u32
+    p0 = args.workbuf[p0_idx] as base.u32
+    q0 = args.workbuf[args.q0_idx] as base.u32
+    q1 = args.workbuf[q1_idx] as base.u32
+    q2 = args.workbuf[q2_idx] as base.u32
+    q3 = args.workbuf[q3_idx] as base.u32
+
+    // First threshold: abs(p0-q0)*2 + abs(p1-q1)/2 > level.
+    t1 = this.abs_u32!(v: p0 ~mod- q0)
+    t1 = t1 & 0xFF
+    t2 = this.abs_u32!(v: p1 ~mod- q1)
+    t2 = t2 & 0xFF
+    if ((t1 * 2) + (t2 >> 1)) > args.level {
+        return nothing
+    }
+
+    // Second threshold (ilevel): check smoothness of all 8 pixels.
+    t1 = this.abs_u32!(v: p3 ~mod- p2)
+    if t1 > args.ilevel {
+        return nothing
+    }
+    t1 = this.abs_u32!(v: p2 ~mod- p1)
+    if t1 > args.ilevel {
+        return nothing
+    }
+    t1 = this.abs_u32!(v: q2 ~mod- q1)
+    if t1 > args.ilevel {
+        return nothing
+    }
+    t1 = this.abs_u32!(v: q3 ~mod- q2)
+    if t1 > args.ilevel {
+        return nothing
+    }
+    // abs(p1-p0) and abs(q1-q0) are tested last and kept in t1 and t2 so
+    // that the hlevel test below can reuse them. The order of these
+    // side-effect-free tests does not change the outcome.
+    t1 = this.abs_u32!(v: p1 ~mod- p0)
+    if t1 > args.ilevel {
+        return nothing
+    }
+    t2 = this.abs_u32!(v: q1 ~mod- q0)
+    if t2 > args.ilevel {
+        return nothing
+    }
+
+    // Third threshold (hlevel): high edge variance check.
+    if (t1 > args.hlevel) or (t2 > args.hlevel) {
+        // Filter 2 pixels (same as simple filter core).
+        t1 = this.clamp127!(v: p1 ~mod- q1)
+        a = (3 ~mod* (q0 ~mod- p0)) ~mod+ t1
+        a1 = this.clamp15_asr3!(v: a ~mod+ 4)
+        a2 = this.clamp15_asr3!(v: a ~mod+ 3)
+        t1 = this.clamp255!(v: p0 ~mod+ a2)
+        args.workbuf[p0_idx] = (t1 & 0xFF) as base.u8
+        t1 = this.clamp255!(v: q0 ~mod- a1)
+        args.workbuf[args.q0_idx] = (t1 & 0xFF) as base.u8
+    } else if args.four_not_six {
+        // Filter 4 pixels.
+        a = 3 ~mod* (q0 ~mod- p0)
+        a1 = this.clamp15_asr3!(v: a ~mod+ 4)
+        a2 = this.clamp15_asr3!(v: a ~mod+ 3)
+        // a3 = (a1 + 1) >> 1, signed arithmetic shift.
+        a3 = a1 ~mod+ 1
+        if (a3 & 0x8000_0000) <> 0 {
+            a3 = (a3 >> 1) | 0x8000_0000
+        } else {
+            a3 >>= 1
+        }
+        t1 = this.clamp255!(v: p1 ~mod+ a3)
+        args.workbuf[p1_idx] = (t1 & 0xFF) as base.u8
+        t1 = this.clamp255!(v: p0 ~mod+ a2)
+        args.workbuf[p0_idx] = (t1 & 0xFF) as base.u8
+        t1 = this.clamp255!(v: q0 ~mod- a1)
+        args.workbuf[args.q0_idx] = (t1 & 0xFF) as base.u8
+        t1 = this.clamp255!(v: q1 ~mod- a3)
+        args.workbuf[q1_idx] = (t1 & 0xFF) as base.u8
+    } else {
+        // Filter 6 pixels.
+        t1 = this.clamp127!(v: p1 ~mod- q1)
+        t2 = (3 ~mod* (q0 ~mod- p0)) ~mod+ t1
+        a = this.clamp127!(v: t2)
+        // Arithmetic right shift by 7 for signed values.
+        a1 = this.signed_shift_right_7!(v: (27 ~mod* a) ~mod+ 63)
+        a2 = this.signed_shift_right_7!(v: (18 ~mod* a) ~mod+ 63)
+        a3 = this.signed_shift_right_7!(v: (9 ~mod* a) ~mod+ 63)
+        t1 = this.clamp255!(v: p2 ~mod+ a3)
+        args.workbuf[p2_idx] = (t1 & 0xFF) as base.u8
+        t1 = this.clamp255!(v: p1 ~mod+ a2)
+        args.workbuf[p1_idx] = (t1 & 0xFF) as base.u8
+        t1 = this.clamp255!(v: p0 ~mod+ a1)
+        args.workbuf[p0_idx] = (t1 & 0xFF) as base.u8
+        t1 = this.clamp255!(v: q0 ~mod- a1)
+        args.workbuf[args.q0_idx] = (t1 & 0xFF) as base.u8
+        t1 = this.clamp255!(v: q1 ~mod- a2)
+        args.workbuf[q1_idx] = (t1 & 0xFF) as base.u8
+        t1 = this.clamp255!(v: q2 ~mod- a3)
+        args.workbuf[q2_idx] = (t1 & 0xFF) as base.u8
+    }
+}
+
+// signed_shift_right_7! performs arithmetic right shift by 7 on a u32 two's complement value.
+//
+// For inputs with the sign bit set, the seven vacated high bits are filled
+// with ones (0xFE00_0000).
+pri func decoder.signed_shift_right_7!(v: base.u32) base.u32 {
+    if (args.v & 0x8000_0000) <> 0 {
+        return (args.v >> 7) | 0xFE00_0000
+    }
+    return args.v >> 7
+}
diff --git a/std/vp8/decode_filter_arm_neon.wuffs b/std/vp8/decode_filter_arm_neon.wuffs
new file mode 100644
index 000000000..563014f5b
--- /dev/null
+++ b/std/vp8/decode_filter_arm_neon.wuffs
@@ -0,0 +1,3124 @@
+// Copyright 2024 The Wuffs Authors.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+//
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+// VP8 loop filter, ARM NEON version.
+//
+// Processes 16 contiguous pixels at a horizontal edge in parallel.
+// Algorithm from libwebp (SimpleVFilter16_NEON / VFilter16_NEON):
+//   1. NeedsFilter: abs(p0-q0)*2 + abs(p1-q1)/2 <= limit (per-byte mask)
+//   2. Convert u8 to i8 (XOR with 0x80)
+//   3. GetBaseDelta: delta = sat_i8(3*(q0-p0) + sat_i8(p1-q1))
+//   4. DoSimpleFilter: v4 = asr3_i8(delta+4), v3 = asr3_i8(delta+3)
+//      q0 -= v4, p0 += v3 (saturating i8)
+//   5.
Convert back to u8
+
+// simple_vfilter_16_arm_neon! filters one horizontal edge, 16 bytes wide.
+//
+// args.q0_off is the workbuf offset of the first byte of the q0 row. The p1,
+// p0, q0 and q1 rows are read at q0_off - 2*y_stride, q0_off - y_stride,
+// q0_off and q0_off + y_stride. All four 16-byte loads are bounds checked
+// before anything is stored; if any check fails the function returns with
+// workbuf untouched. Only the p0 and q0 rows are written back.
+//
+// Only the low 8 bits of args.limit are used as the per-byte threshold.
+pri func decoder.simple_vfilter_16_arm_neon!(workbuf: slice base.u8, q0_off: base.u64, limit: base.u32),
+        choose cpu_arch >= arm_neon,
+{
+    var util : base.arm_neon_utility
+
+    // NEON registers.
+    var p1 : base.arm_neon_u8x16
+    var p0 : base.arm_neon_u8x16
+    var q0 : base.arm_neon_u8x16
+    var q1 : base.arm_neon_u8x16
+    var sign_bit : base.arm_neon_u8x16
+    var kFE : base.arm_neon_u8x16
+    var m_thresh : base.arm_neon_u8x16
+    var k3 : base.arm_neon_u8x16
+    var k4 : base.arm_neon_u8x16
+    var mask : base.arm_neon_u8x16
+    var t1 : base.arm_neon_u8x16
+    var t2 : base.arm_neon_u8x16
+    var t3 : base.arm_neon_u8x16
+    var delta : base.arm_neon_u8x16
+    var v3 : base.arm_neon_u8x16
+    var v4 : base.arm_neon_u8x16
+    var zero : base.arm_neon_u8x16
+
+    var wb : slice base.u8
+
+    // Need at least 2*stride bytes before q0_off for p1 and p0 rows.
+    if args.q0_off < (2 * (this.y_stride as base.u64)) {
+        return nothing
+    }
+
+    wb = args.workbuf
+
+    // Reslice wb to p1 position: q0_off - 2*stride.
+    if (args.q0_off - (2 * (this.y_stride as base.u64))) <= wb.length() {
+        wb = wb[args.q0_off - (2 * (this.y_stride as base.u64)) ..]
+    } else {
+        return nothing
+    }
+
+    // Load p1 row (16 contiguous bytes).
+    if 16 > wb.length() {
+        return nothing
+    }
+    p1 = util.make_u8x16_slice128(a: wb[.. 16])
+
+    // Advance by stride to p0 row.
+    if (this.y_stride as base.u64) > wb.length() {
+        return nothing
+    }
+    wb = wb[(this.y_stride as base.u64) ..]
+
+    // Load p0 row.
+    if 16 > wb.length() {
+        return nothing
+    }
+    p0 = util.make_u8x16_slice128(a: wb[.. 16])
+
+    // Advance by stride to q0 row.
+    if (this.y_stride as base.u64) > wb.length() {
+        return nothing
+    }
+    wb = wb[(this.y_stride as base.u64) ..]
+
+    // Load q0 row.
+    if 16 > wb.length() {
+        return nothing
+    }
+    q0 = util.make_u8x16_slice128(a: wb[.. 16])
+
+    // Advance by stride to q1 row.
+    if (this.y_stride as base.u64) > wb.length() {
+        return nothing
+    }
+    wb = wb[(this.y_stride as base.u64) ..]
+
+    // Load q1 row.
+    if 16 > wb.length() {
+        return nothing
+    }
+    q1 = util.make_u8x16_slice128(a: wb[.. 16])
+
+    // Set up constants.
+    zero = util.make_u8x16_repeat(a: 0)
+    sign_bit = util.make_u8x16_repeat(a: 0x80)
+    kFE = util.make_u8x16_repeat(a: 0xFE)
+    m_thresh = util.make_u8x16_repeat(a: (args.limit & 0xFF) as base.u8)
+    k3 = util.make_u8x16_repeat(a: 3)
+    k4 = util.make_u8x16_repeat(a: 4)
+
+    // ---- NeedsFilter: abs(p0-q0)*2 + abs(p1-q1)/2 <= limit ----
+    // abs(a-b) for u8 via NEON: vabdq_u8.
+    t1 = p1.vabdq_u8(b: q1)
+    // abs(p1-q1) / 2: clear LSB then shift right 1.
+    t2 = t1.vandq_u8(b: kFE)
+    t2 = t2.vshrq_n_u8(b: 1)
+    // abs(p0-q0) * 2: abd + saturating double.
+    t3 = p0.vabdq_u8(b: q0)
+    t3 = t3.vqaddq_u8(b: t3)
+    // abs(p0-q0)*2 + abs(p1-q1)/2.
+    t3 = t3.vqaddq_u8(b: t2)
+    // mask: 0xFF where value <= thresh, 0x00 where > thresh.
+    // subs_u8 saturates to 0 when t3 <= m_thresh.
+    mask = t3.vqsubq_u8(b: m_thresh)
+    mask = mask.vceqq_u8(b: zero)
+
+    // ---- GetBaseDelta (i8 domain) ----
+    // Convert to signed by XOR with 0x80.
+    p1 = p1.veorq_u8(b: sign_bit)
+    p0 = p0.veorq_u8(b: sign_bit)
+    q0 = q0.veorq_u8(b: sign_bit)
+    q1 = q1.veorq_u8(b: sign_bit)
+
+    // delta = sat_i8(3*(q0-p0) + sat_i8(p1-q1))
+    // Accumulate via saturating signed adds to avoid overflow.
+    t1 = p1.vqsubq_s8(b: q1) // p1 - q1 (saturating i8)
+    t2 = q0.vqsubq_s8(b: p0) // q0 - p0
+    t1 = t1.vqaddq_s8(b: t2) // (p1-q1) + 1*(q0-p0)
+    t1 = t1.vqaddq_s8(b: t2) // (p1-q1) + 2*(q0-p0)
+    delta = t1.vqaddq_s8(b: t2) // (p1-q1) + 3*(q0-p0)
+
+    // Mask: only apply filter where NeedsFilter passed.
+    // Lanes that failed get delta == 0, which leaves p0 and q0 unchanged
+    // below because (0+4)>>3 and (0+3)>>3 are both 0.
+    delta = delta.vandq_u8(b: mask)
+
+    // ---- DoSimpleFilter ----
+    // v4 = signed_byte_shr3(delta + 4)
+    v4 = delta.vqaddq_s8(b: k4)
+    v4 = v4.vshrq_n_s8(b: 3)
+
+    // v3 = signed_byte_shr3(delta + 3)
+    v3 = delta.vqaddq_s8(b: k3)
+    v3 = v3.vshrq_n_s8(b: 3)
+
+    // Apply: q0 -= v4, p0 += v3 (saturating i8).
+    q0 = q0.vqsubq_s8(b: v4)
+    p0 = p0.vqaddq_s8(b: v3)
+
+    // Convert back to unsigned.
+    p0 = p0.veorq_u8(b: sign_bit)
+    q0 = q0.veorq_u8(b: sign_bit)
+
+    // ---- Store p0 and q0 back ----
+    // From here on args.workbuf itself is resliced to walk the output rows.
+    if args.q0_off < (this.y_stride as base.u64) {
+        return nothing
+    }
+    if (args.q0_off - (this.y_stride as base.u64)) <= args.workbuf.length() {
+        args.workbuf = args.workbuf[args.q0_off - (this.y_stride as base.u64) ..]
+    } else {
+        return nothing
+    }
+
+    // Store p0 row.
+    if 16 <= args.workbuf.length() {
+        p0.store_slice128!(a: args.workbuf[.. 16])
+    }
+
+    // Advance by stride to q0.
+    if (this.y_stride as base.u64) <= args.workbuf.length() {
+        args.workbuf = args.workbuf[(this.y_stride as base.u64) ..]
+    }
+
+    // Store q0 row.
+    if 16 <= args.workbuf.length() {
+        q0.store_slice128!(a: args.workbuf[.. 16])
+    }
+}
+
+// VP8 normal loop filter (filter4), ARM NEON version.
+//
+// Filters 16 contiguous pixels at a horizontal inner sub-block edge.
+// Loads 8 rows (p3..q3), computes NeedsFilter2 + HEV masks, then applies
+// filter2 (HEV) or filter4 (!HEV).
+//
+// args.q0_off is the workbuf offset of the first byte of the q0 row; rows are
+// this.y_stride bytes apart. All eight 16-byte loads are bounds checked before
+// anything is stored. Four rows (p1, p0, q0, q1) are written back. Only the
+// low 8 bits of args.level, args.ilevel and args.hlevel are used.
+
+pri func decoder.normal_vfilter_inner_16_arm_neon!(workbuf: slice base.u8, q0_off: base.u64,
+        level: base.u32, ilevel: base.u32, hlevel: base.u32),
+        choose cpu_arch >= arm_neon,
+{
+    var util : base.arm_neon_utility
+    var wb : slice base.u8
+
+    var p3 : base.arm_neon_u8x16
+    var p2 : base.arm_neon_u8x16
+    var p1 : base.arm_neon_u8x16
+    var p0 : base.arm_neon_u8x16
+    var q0 : base.arm_neon_u8x16
+    var q1 : base.arm_neon_u8x16
+    var q2 : base.arm_neon_u8x16
+    var q3 : base.arm_neon_u8x16
+    var zero : base.arm_neon_u8x16
+    var sign_bit : base.arm_neon_u8x16
+    var kFE : base.arm_neon_u8x16
+    var m_thresh : base.arm_neon_u8x16
+    var m_ithresh : base.arm_neon_u8x16
+    var m_hthresh : base.arm_neon_u8x16
+    var k1 : base.arm_neon_u8x16
+    var k3 : base.arm_neon_u8x16
+    var k4 : base.arm_neon_u8x16
+    var mask : base.arm_neon_u8x16
+    var not_hev : base.arm_neon_u8x16
+    var delta : base.arm_neon_u8x16
+    var v3 : base.arm_neon_u8x16
+    var v4 : base.arm_neon_u8x16
+    var a3 : base.arm_neon_u8x16
+    var t1 : base.arm_neon_u8x16
+    var t2 : base.arm_neon_u8x16
+    var t3 : base.arm_neon_u8x16
+
+    // Need at least 4*stride bytes before q0_off for p3 row.
+    if args.q0_off < (4 * (this.y_stride as base.u64)) {
+        return nothing
+    }
+
+    wb = args.workbuf
+
+    // Reslice to p3 position.
+    if (args.q0_off - (4 * (this.y_stride as base.u64))) <= wb.length() {
+        wb = wb[args.q0_off - (4 * (this.y_stride as base.u64)) ..]
+    } else {
+        return nothing
+    }
+
+    // Load 8 rows: p3, p2, p1, p0, q0, q1, q2, q3.
+    // Each step checks 16 bytes are available, loads them, then checks and
+    // advances by one stride.
+    if 16 > wb.length() {
+        return nothing
+    }
+    p3 = util.make_u8x16_slice128(a: wb[.. 16])
+    if (this.y_stride as base.u64) > wb.length() {
+        return nothing
+    }
+    wb = wb[(this.y_stride as base.u64) ..]
+
+    if 16 > wb.length() {
+        return nothing
+    }
+    p2 = util.make_u8x16_slice128(a: wb[.. 16])
+    if (this.y_stride as base.u64) > wb.length() {
+        return nothing
+    }
+    wb = wb[(this.y_stride as base.u64) ..]
+
+    if 16 > wb.length() {
+        return nothing
+    }
+    p1 = util.make_u8x16_slice128(a: wb[.. 16])
+    if (this.y_stride as base.u64) > wb.length() {
+        return nothing
+    }
+    wb = wb[(this.y_stride as base.u64) ..]
+
+    if 16 > wb.length() {
+        return nothing
+    }
+    p0 = util.make_u8x16_slice128(a: wb[.. 16])
+    if (this.y_stride as base.u64) > wb.length() {
+        return nothing
+    }
+    wb = wb[(this.y_stride as base.u64) ..]
+
+    if 16 > wb.length() {
+        return nothing
+    }
+    q0 = util.make_u8x16_slice128(a: wb[.. 16])
+    if (this.y_stride as base.u64) > wb.length() {
+        return nothing
+    }
+    wb = wb[(this.y_stride as base.u64) ..]
+
+    if 16 > wb.length() {
+        return nothing
+    }
+    q1 = util.make_u8x16_slice128(a: wb[.. 16])
+    if (this.y_stride as base.u64) > wb.length() {
+        return nothing
+    }
+    wb = wb[(this.y_stride as base.u64) ..]
+
+    if 16 > wb.length() {
+        return nothing
+    }
+    q2 = util.make_u8x16_slice128(a: wb[.. 16])
+    if (this.y_stride as base.u64) > wb.length() {
+        return nothing
+    }
+    wb = wb[(this.y_stride as base.u64) ..]
+
+    if 16 > wb.length() {
+        return nothing
+    }
+    q3 = util.make_u8x16_slice128(a: wb[.. 16])
+
+    // Constants.
+    zero = util.make_u8x16_repeat(a: 0)
+    sign_bit = util.make_u8x16_repeat(a: 0x80)
+    kFE = util.make_u8x16_repeat(a: 0xFE)
+    m_thresh = util.make_u8x16_repeat(a: (args.level & 0xFF) as base.u8)
+    m_ithresh = util.make_u8x16_repeat(a: (args.ilevel & 0xFF) as base.u8)
+    m_hthresh = util.make_u8x16_repeat(a: (args.hlevel & 0xFF) as base.u8)
+    k1 = util.make_u8x16_repeat(a: 1)
+    k3 = util.make_u8x16_repeat(a: 3)
+    k4 = util.make_u8x16_repeat(a: 4)
+
+    // ---- NeedsFilter: abs(p0-q0)*2 + abs(p1-q1)/2 <= level ----
+    t1 = p1.vabdq_u8(b: q1)
+    t2 = t1.vandq_u8(b: kFE)
+    t2 = t2.vshrq_n_u8(b: 1)
+    t3 = p0.vabdq_u8(b: q0)
+    t3 = t3.vqaddq_u8(b: t3)
+    t3 = t3.vqaddq_u8(b: t2)
+    mask = t3.vqsubq_u8(b: m_thresh)
+    mask = mask.vceqq_u8(b: zero)
+
+    // ---- NeedsFilter2: ilevel checks on 6 adjacent pairs ----
+    // Each line ANDs in a lane mask that is 0xFF where abs diff <= ilevel.
+    t1 = p3.vabdq_u8(b: p2)
+    mask = mask.vandq_u8(b: t1.vqsubq_u8(b: m_ithresh).vceqq_u8(b: zero))
+    t1 = p2.vabdq_u8(b: p1)
+    mask = mask.vandq_u8(b: t1.vqsubq_u8(b: m_ithresh).vceqq_u8(b: zero))
+    t1 = p1.vabdq_u8(b: p0)
+    mask = mask.vandq_u8(b: t1.vqsubq_u8(b: m_ithresh).vceqq_u8(b: zero))
+    t1 = q0.vabdq_u8(b: q1)
+    mask = mask.vandq_u8(b: t1.vqsubq_u8(b: m_ithresh).vceqq_u8(b: zero))
+    t1 = q1.vabdq_u8(b: q2)
+    mask = mask.vandq_u8(b: t1.vqsubq_u8(b: m_ithresh).vceqq_u8(b: zero))
+    t1 = q2.vabdq_u8(b: q3)
+    mask = mask.vandq_u8(b: t1.vqsubq_u8(b: m_ithresh).vceqq_u8(b: zero))
+
+    // ---- GetNotHEV: 0xFF where abs(p1-p0) <= hlevel AND abs(q1-q0) <= hlevel ----
+    t1 = p1.vabdq_u8(b: p0)
+    t2 = q1.vabdq_u8(b: q0)
+    t3 = t1.vqsubq_u8(b: m_hthresh).vorrq_u8(b: t2.vqsubq_u8(b: m_hthresh))
+    not_hev = t3.vceqq_u8(b: zero)
+
+    // ---- Convert p1, p0, q0, q1 to signed (XOR 0x80) ----
+    p1 = p1.veorq_u8(b: sign_bit)
+    p0 = p0.veorq_u8(b: sign_bit)
+    q0 = q0.veorq_u8(b: sign_bit)
+    q1 = q1.veorq_u8(b: sign_bit)
+
+    // ---- Combined delta ----
+    // HEV: delta = 3*(q0-p0) + (p1-q1) [filter2]
+    // !HEV: delta = 3*(q0-p0) [filter4 uses no p1-q1]
+    t1 = p1.vqsubq_s8(b: q1) // p1-q1 (sat i8)
+    t1 = t1.vbicq_u8(b: not_hev) // zero where !HEV: t1 & ~not_hev
+    t2 = q0.vqsubq_s8(b: p0) // q0-p0
+    t1 = t1.vqaddq_s8(b: t2)
+    t1 = t1.vqaddq_s8(b: t2)
+    delta = t1.vqaddq_s8(b: t2) // 3*(q0-p0) + hev*(p1-q1)
+    delta = delta.vandq_u8(b: mask)
+
+    // ---- v4 = SignedShift(delta+4, 3), v3 = SignedShift(delta+3, 3) ----
+    v4 = delta.vqaddq_s8(b: k4)
+    v4 = v4.vshrq_n_s8(b: 3)
+
+    v3 = delta.vqaddq_s8(b: k3)
+    v3 = v3.vshrq_n_s8(b: 3)
+
+    // Apply to p0, q0 (both HEV and !HEV use these).
+    q0 = q0.vqsubq_s8(b: v4)
+    p0 = p0.vqaddq_s8(b: v3)
+
+    // ---- Filter4 !HEV: a3 = SignedShift(v4+1, 1) & not_hev ----
+    a3 = v4.vqaddq_s8(b: k1)
+    a3 = a3.vshrq_n_s8(b: 1)
+    a3 = a3.vandq_u8(b: not_hev)
+
+    // Apply to p1, q1 (only !HEV).
+    q1 = q1.vqsubq_s8(b: a3)
+    p1 = p1.vqaddq_s8(b: a3)
+
+    // ---- Convert back to unsigned ----
+    p1 = p1.veorq_u8(b: sign_bit)
+    p0 = p0.veorq_u8(b: sign_bit)
+    q0 = q0.veorq_u8(b: sign_bit)
+    q1 = q1.veorq_u8(b: sign_bit)
+
+    // ---- Store p1, p0, q0, q1 ----
+    // From here on args.workbuf itself is resliced to walk the output rows.
+    if args.q0_off < (2 * (this.y_stride as base.u64)) {
+        return nothing
+    }
+    if (args.q0_off - (2 * (this.y_stride as base.u64))) <= args.workbuf.length() {
+        args.workbuf = args.workbuf[args.q0_off - (2 * (this.y_stride as base.u64)) ..]
+    } else {
+        return nothing
+    }
+
+    if 16 <= args.workbuf.length() {
+        p1.store_slice128!(a: args.workbuf[.. 16])
+    }
+    if (this.y_stride as base.u64) <= args.workbuf.length() {
+        args.workbuf = args.workbuf[(this.y_stride as base.u64) ..]
+    }
+
+    if 16 <= args.workbuf.length() {
+        p0.store_slice128!(a: args.workbuf[.. 16])
+    }
+    if (this.y_stride as base.u64) <= args.workbuf.length() {
+        args.workbuf = args.workbuf[(this.y_stride as base.u64) ..]
+    }
+
+    if 16 <= args.workbuf.length() {
+        q0.store_slice128!(a: args.workbuf[.. 16])
+    }
+    if (this.y_stride as base.u64) <= args.workbuf.length() {
+        args.workbuf = args.workbuf[(this.y_stride as base.u64) ..]
+    }
+
+    if 16 <= args.workbuf.length() {
+        q1.store_slice128!(a: args.workbuf[.. 16])
+    }
+}
+
+// VP8 normal loop filter (filter6) for 16-pixel MB horizontal edges, ARM NEON.
+//
+// Stronger filter applied at macroblock boundaries. Uses weighted deltas
+// (27/18/9) for !HEV pixels, modifying 6 rows (p2..q2).
+//
+// args.q0_off is the workbuf offset of the first byte of the q0 row; rows are
+// this.y_stride bytes apart. All eight 16-byte loads are bounds checked before
+// anything is stored. Only the low 8 bits of args.level, args.ilevel and
+// args.hlevel are used.
+
+pri func decoder.normal_vfilter_mb_16_arm_neon!(workbuf: slice base.u8, q0_off: base.u64,
+        level: base.u32, ilevel: base.u32, hlevel: base.u32),
+        choose cpu_arch >= arm_neon,
+{
+    var util : base.arm_neon_utility
+    var wb : slice base.u8
+
+    var p3 : base.arm_neon_u8x16
+    var p2 : base.arm_neon_u8x16
+    var p1 : base.arm_neon_u8x16
+    var p0 : base.arm_neon_u8x16
+    var q0 : base.arm_neon_u8x16
+    var q1 : base.arm_neon_u8x16
+    var q2 : base.arm_neon_u8x16
+    var q3 : base.arm_neon_u8x16
+    var zero : base.arm_neon_u8x16
+    var sign_bit : base.arm_neon_u8x16
+    var kFE : base.arm_neon_u8x16
+    var m_thresh : base.arm_neon_u8x16
+    var m_ithresh : base.arm_neon_u8x16
+    var m_hthresh : base.arm_neon_u8x16
+    var k3 : base.arm_neon_u8x16
+    var k4 : base.arm_neon_u8x16
+    var mask : base.arm_neon_u8x16
+    var not_hev : base.arm_neon_u8x16
+    var delta : base.arm_neon_u8x16
+    var v3 : base.arm_neon_u8x16
+    var v4 : base.arm_neon_u8x16
+    var a1 : base.arm_neon_u8x16
+    var a2 : base.arm_neon_u8x16
+    var a3 : base.arm_neon_u8x16
+    var t1 : base.arm_neon_u8x16
+    var t2 : base.arm_neon_u8x16
+    var t3 : base.arm_neon_u8x16
+    var p0_adj : base.arm_neon_u8x16
+    var q0_adj : base.arm_neon_u8x16
+
+    // Widening variables for filter6 multiply (i8 -> i16).
+    var d_lo : base.arm_neon_u8x8
+    var d_hi : base.arm_neon_u8x8
+    var lo : base.arm_neon_u16x8
+    var hi : base.arm_neon_u16x8
+    var k63_16 : base.arm_neon_u16x8
+    var tmp_lo : base.arm_neon_u16x8
+    var tmp_hi : base.arm_neon_u16x8
+    var narrow_lo : base.arm_neon_u8x8
+    var narrow_hi : base.arm_neon_u8x8
+
+    // Need at least 4*stride bytes before q0_off.
+    if args.q0_off < (4 * (this.y_stride as base.u64)) {
+        return nothing
+    }
+
+    wb = args.workbuf
+    if (args.q0_off - (4 * (this.y_stride as base.u64))) <= wb.length() {
+        wb = wb[args.q0_off - (4 * (this.y_stride as base.u64)) ..]
+    } else {
+        return nothing
+    }
+
+    // Load 8 rows: p3, p2, p1, p0, q0, q1, q2, q3.
+    if 16 > wb.length() {
+        return nothing
+    }
+    p3 = util.make_u8x16_slice128(a: wb[.. 16])
+    if (this.y_stride as base.u64) > wb.length() {
+        return nothing
+    }
+    wb = wb[(this.y_stride as base.u64) ..]
+
+    if 16 > wb.length() {
+        return nothing
+    }
+    p2 = util.make_u8x16_slice128(a: wb[.. 16])
+    if (this.y_stride as base.u64) > wb.length() {
+        return nothing
+    }
+    wb = wb[(this.y_stride as base.u64) ..]
+
+    if 16 > wb.length() {
+        return nothing
+    }
+    p1 = util.make_u8x16_slice128(a: wb[.. 16])
+    if (this.y_stride as base.u64) > wb.length() {
+        return nothing
+    }
+    wb = wb[(this.y_stride as base.u64) ..]
+
+    if 16 > wb.length() {
+        return nothing
+    }
+    p0 = util.make_u8x16_slice128(a: wb[.. 16])
+    if (this.y_stride as base.u64) > wb.length() {
+        return nothing
+    }
+    wb = wb[(this.y_stride as base.u64) ..]
+
+    if 16 > wb.length() {
+        return nothing
+    }
+    q0 = util.make_u8x16_slice128(a: wb[.. 16])
+    if (this.y_stride as base.u64) > wb.length() {
+        return nothing
+    }
+    wb = wb[(this.y_stride as base.u64) ..]
+
+    if 16 > wb.length() {
+        return nothing
+    }
+    q1 = util.make_u8x16_slice128(a: wb[.. 16])
+    if (this.y_stride as base.u64) > wb.length() {
+        return nothing
+    }
+    wb = wb[(this.y_stride as base.u64) ..]
+
+    if 16 > wb.length() {
+        return nothing
+    }
+    q2 = util.make_u8x16_slice128(a: wb[.. 16])
+    if (this.y_stride as base.u64) > wb.length() {
+        return nothing
+    }
+    wb = wb[(this.y_stride as base.u64) ..]
+
+    if 16 > wb.length() {
+        return nothing
+    }
+    q3 = util.make_u8x16_slice128(a: wb[.. 16])
+
+    // Constants.
+    zero = util.make_u8x16_repeat(a: 0)
+    sign_bit = util.make_u8x16_repeat(a: 0x80)
+    kFE = util.make_u8x16_repeat(a: 0xFE)
+    m_thresh = util.make_u8x16_repeat(a: (args.level & 0xFF) as base.u8)
+    m_ithresh = util.make_u8x16_repeat(a: (args.ilevel & 0xFF) as base.u8)
+    m_hthresh = util.make_u8x16_repeat(a: (args.hlevel & 0xFF) as base.u8)
+    k3 = util.make_u8x16_repeat(a: 3)
+    k4 = util.make_u8x16_repeat(a: 4)
+    k63_16 = util.make_u16x8_repeat(a: 63)
+
+    // ---- NeedsFilter + NeedsFilter2 ----
+    // mask is 0xFF in lanes that pass the level test and all six ilevel tests.
+    t1 = p1.vabdq_u8(b: q1)
+    t2 = t1.vandq_u8(b: kFE).vshrq_n_u8(b: 1)
+    t3 = p0.vabdq_u8(b: q0)
+    t3 = t3.vqaddq_u8(b: t3)
+    t3 = t3.vqaddq_u8(b: t2)
+    mask = t3.vqsubq_u8(b: m_thresh).vceqq_u8(b: zero)
+
+    t1 = p3.vabdq_u8(b: p2)
+    mask = mask.vandq_u8(b: t1.vqsubq_u8(b: m_ithresh).vceqq_u8(b: zero))
+    t1 = p2.vabdq_u8(b: p1)
+    mask = mask.vandq_u8(b: t1.vqsubq_u8(b: m_ithresh).vceqq_u8(b: zero))
+    t1 = p1.vabdq_u8(b: p0)
+    mask = mask.vandq_u8(b: t1.vqsubq_u8(b: m_ithresh).vceqq_u8(b: zero))
+    t1 = q0.vabdq_u8(b: q1)
+    mask = mask.vandq_u8(b: t1.vqsubq_u8(b: m_ithresh).vceqq_u8(b: zero))
+    t1 = q1.vabdq_u8(b: q2)
+    mask = mask.vandq_u8(b: t1.vqsubq_u8(b: m_ithresh).vceqq_u8(b: zero))
+    t1 = q2.vabdq_u8(b: q3)
+    mask = mask.vandq_u8(b: t1.vqsubq_u8(b: m_ithresh).vceqq_u8(b: zero))
+
+    // ---- GetNotHEV ----
+    // not_hev is 0xFF where abs(p1-p0) <= hlevel and abs(q1-q0) <= hlevel.
+    t1 = p1.vabdq_u8(b: p0)
+    t2 = q1.vabdq_u8(b: q0)
+    t3 = t1.vqsubq_u8(b: m_hthresh).vorrq_u8(b: t2.vqsubq_u8(b: m_hthresh))
+    not_hev = t3.vceqq_u8(b: zero)
+
+    // ---- Convert p2, p1, p0, q0, q1, q2 to signed ----
+    p2 = p2.veorq_u8(b: sign_bit)
+    p1 = p1.veorq_u8(b: sign_bit)
+    p0 = p0.veorq_u8(b: sign_bit)
+    q0 = q0.veorq_u8(b: sign_bit)
+    q1 = q1.veorq_u8(b: sign_bit)
+    q2 = q2.veorq_u8(b: sign_bit)
+
+    // ---- Delta = sat_i8(3*(q0-p0) + sat_i8(p1-q1)) for ALL positions ----
+    // Filter6 uses p1-q1 in both HEV and !HEV paths.
+    t1 = p1.vqsubq_s8(b: q1)
+    t2 = q0.vqsubq_s8(b: p0)
+    t1 = t1.vqaddq_s8(b: t2)
+    t1 = t1.vqaddq_s8(b: t2)
+    delta = t1.vqaddq_s8(b: t2)
+    delta = delta.vandq_u8(b: mask)
+
+    // ---- Filter2 (HEV path): v4 = asr3(delta+4), v3 = asr3(delta+3) ----
+    v4 = delta.vqaddq_s8(b: k4)
+    v4 = v4.vshrq_n_s8(b: 3)
+
+    v3 = delta.vqaddq_s8(b: k3)
+    v3 = v3.vshrq_n_s8(b: 3)
+
+    // ---- Filter6 (!HEV path): widen delta to i16, multiply by 27/18/9 ----
+    // Sign-extend delta i8 -> i16 (split into lo and hi halves).
+    // The multiply and add below use the u16 operations; on two's complement
+    // lanes they produce the same 16-bit pattern as the signed operations.
+    d_lo = delta.vget_low_u8()
+    d_hi = delta.vget_high_u8()
+    lo = d_lo.vmovl_s8()
+    hi = d_hi.vmovl_s8()
+
+    // a1 = (27 * delta + 63) >> 7
+    tmp_lo = lo.vmulq_n_u16(b: 27)
+    tmp_lo = tmp_lo.vaddq_u16(b: k63_16)
+    tmp_lo = tmp_lo.vshrq_n_s16(b: 7)
+    tmp_hi = hi.vmulq_n_u16(b: 27)
+    tmp_hi = tmp_hi.vaddq_u16(b: k63_16)
+    tmp_hi = tmp_hi.vshrq_n_s16(b: 7)
+    narrow_lo = tmp_lo.vqmovn_s16()
+    narrow_hi = tmp_hi.vqmovn_s16()
+    a1 = narrow_lo.vcombine_u8(b: narrow_hi)
+
+    // a2 = (18 * delta + 63) >> 7
+    tmp_lo = lo.vmulq_n_u16(b: 18)
+    tmp_lo = tmp_lo.vaddq_u16(b: k63_16)
+    tmp_lo = tmp_lo.vshrq_n_s16(b: 7)
+    tmp_hi = hi.vmulq_n_u16(b: 18)
+    tmp_hi = tmp_hi.vaddq_u16(b: k63_16)
+    tmp_hi = tmp_hi.vshrq_n_s16(b: 7)
+    narrow_lo = tmp_lo.vqmovn_s16()
+    narrow_hi = tmp_hi.vqmovn_s16()
+    a2 = narrow_lo.vcombine_u8(b: narrow_hi)
+
+    // a3 = (9 * delta + 63) >> 7
+    tmp_lo = lo.vmulq_n_u16(b: 9)
+    tmp_lo = tmp_lo.vaddq_u16(b: k63_16)
+    tmp_lo = tmp_lo.vshrq_n_s16(b: 7)
+    tmp_hi = hi.vmulq_n_u16(b: 9)
+    tmp_hi = tmp_hi.vaddq_u16(b: k63_16)
+    tmp_hi = tmp_hi.vshrq_n_s16(b: 7)
+    narrow_lo = tmp_lo.vqmovn_s16()
+    narrow_hi = tmp_hi.vqmovn_s16()
+    a3 = narrow_lo.vcombine_u8(b: narrow_hi)
+
+    // ---- Merge HEV (filter2) and !HEV (filter6) results ----
+    // p0 += select(hev: v3, !hev: a1)
+    p0_adj = v3.vbicq_u8(b: not_hev) // v3 where HEV (v3 & ~not_hev)
+    p0_adj = p0_adj.vorrq_u8(b: a1.vandq_u8(b: not_hev)) // a1 where !HEV
+    p0 = p0.vqaddq_s8(b: p0_adj)
+
+    // q0 -= select(hev: v4, !hev: a1)
+    q0_adj = v4.vbicq_u8(b: not_hev) // v4 where HEV
+    q0_adj = q0_adj.vorrq_u8(b: a1.vandq_u8(b: not_hev)) // a1 where !HEV
+    q0 = q0.vqsubq_s8(b: q0_adj)
+
+    // p1 += a2 & not_hev (only !HEV)
+    p1 = p1.vqaddq_s8(b: a2.vandq_u8(b: not_hev))
+    // q1 -= a2 & not_hev
+    q1 = q1.vqsubq_s8(b: a2.vandq_u8(b: not_hev))
+    // p2 += a3 & not_hev
+    p2 = p2.vqaddq_s8(b: a3.vandq_u8(b: not_hev))
+    // q2 -= a3 & not_hev
+    q2 = q2.vqsubq_s8(b: a3.vandq_u8(b: not_hev))
+
+    // ---- Convert back to unsigned ----
+    p2 = p2.veorq_u8(b: sign_bit)
+    p1 = p1.veorq_u8(b: sign_bit)
+    p0 = p0.veorq_u8(b: sign_bit)
+    q0 = q0.veorq_u8(b: sign_bit)
+    q1 = q1.veorq_u8(b: sign_bit)
+    q2 = q2.veorq_u8(b: sign_bit)
+
+    // ---- Store p2, p1, p0, q0, q1, q2 ----
+    // From here on args.workbuf itself is resliced to walk the output rows.
+    if args.q0_off < (3 * (this.y_stride as base.u64)) {
+        return nothing
+    }
+    if (args.q0_off - (3 * (this.y_stride as base.u64))) <= args.workbuf.length() {
+        args.workbuf = args.workbuf[args.q0_off - (3 * (this.y_stride as base.u64)) ..]
+    } else {
+        return nothing
+    }
+
+    if 16 <= args.workbuf.length() {
+        p2.store_slice128!(a: args.workbuf[.. 16])
+    }
+    if (this.y_stride as base.u64) <= args.workbuf.length() {
+        args.workbuf = args.workbuf[(this.y_stride as base.u64) ..]
+    }
+
+    if 16 <= args.workbuf.length() {
+        p1.store_slice128!(a: args.workbuf[.. 16])
+    }
+    if (this.y_stride as base.u64) <= args.workbuf.length() {
+        args.workbuf = args.workbuf[(this.y_stride as base.u64) ..]
+    }
+
+    if 16 <= args.workbuf.length() {
+        p0.store_slice128!(a: args.workbuf[.. 16])
+    }
+    if (this.y_stride as base.u64) <= args.workbuf.length() {
+        args.workbuf = args.workbuf[(this.y_stride as base.u64) ..]
+    }
+
+    if 16 <= args.workbuf.length() {
+        q0.store_slice128!(a: args.workbuf[.. 16])
+    }
+    if (this.y_stride as base.u64) <= args.workbuf.length() {
+        args.workbuf = args.workbuf[(this.y_stride as base.u64) ..]
+    }
+
+    if 16 <= args.workbuf.length() {
+        q1.store_slice128!(a: args.workbuf[.. 16])
+    }
+    if (this.y_stride as base.u64) <= args.workbuf.length() {
+        args.workbuf = args.workbuf[(this.y_stride as base.u64) ..]
+    }
+
+    if 16 <= args.workbuf.length() {
+        q2.store_slice128!(a: args.workbuf[.. 16])
+    }
+}
+
+// VP8 normal loop filter (filter6) for 8-pixel chroma horizontal MB edges, ARM NEON.
+// Same algorithm as the 16-pixel version but uses u8x8 and uv_stride.
+
+pri func decoder.normal_vfilter_mb_8_arm_neon!(workbuf: slice base.u8, q0_off: base.u64,
+        level: base.u32, ilevel: base.u32, hlevel: base.u32),
+        choose cpu_arch >= arm_neon,
+{
+    var util : base.arm_neon_utility
+    var wb : slice base.u8
+
+    var p3 : base.arm_neon_u8x8
+    var p2 : base.arm_neon_u8x8
+    var p1 : base.arm_neon_u8x8
+    var p0 : base.arm_neon_u8x8
+    var q0 : base.arm_neon_u8x8
+    var q1 : base.arm_neon_u8x8
+    var q2 : base.arm_neon_u8x8
+    var q3 : base.arm_neon_u8x8
+    var zero : base.arm_neon_u8x8
+    var sign_bit : base.arm_neon_u8x8
+    var kFE : base.arm_neon_u8x8
+    var m_thresh : base.arm_neon_u8x8
+    var m_ithresh : base.arm_neon_u8x8
+    var m_hthresh : base.arm_neon_u8x8
+    var k3 : base.arm_neon_u8x8
+    var k4 : base.arm_neon_u8x8
+    var mask : base.arm_neon_u8x8
+    var not_hev : base.arm_neon_u8x8
+    var delta : base.arm_neon_u8x8
+    var v3 : base.arm_neon_u8x8
+    var v4 : base.arm_neon_u8x8
+    var a1 : base.arm_neon_u8x8
+    var a2 : base.arm_neon_u8x8
+    var a3 : base.arm_neon_u8x8
+    var t1 : base.arm_neon_u8x8
+    var t2 : base.arm_neon_u8x8
+    var t3 : base.arm_neon_u8x8
+    var p0_adj : base.arm_neon_u8x8
+    var q0_adj : base.arm_neon_u8x8
+
+    // Widening variables for filter6 multiply.
+ var wide : base.arm_neon_u16x8 + var tmp : base.arm_neon_u16x8 + var k63_16 : base.arm_neon_u16x8 + + if args.q0_off < (4 * (this.uv_stride as base.u64)) { + return nothing + } + + wb = args.workbuf + if (args.q0_off - (4 * (this.uv_stride as base.u64))) <= wb.length() { + wb = wb[args.q0_off - (4 * (this.uv_stride as base.u64)) ..] + } else { + return nothing + } + + // Load 8 rows using uv_stride. + if 8 > wb.length() { + return nothing + } + p3 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.uv_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.uv_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + p2 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.uv_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.uv_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + p1 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.uv_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.uv_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + p0 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.uv_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.uv_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + q0 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.uv_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.uv_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + q1 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.uv_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.uv_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + q2 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.uv_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.uv_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + q3 = util.make_u8x8_slice64(a: wb[.. 8]) + + // Constants. 
+ zero = util.make_u8x8_repeat(a: 0) + sign_bit = util.make_u8x8_repeat(a: 0x80) + kFE = util.make_u8x8_repeat(a: 0xFE) + m_thresh = util.make_u8x8_repeat(a: (args.level & 0xFF) as base.u8) + m_ithresh = util.make_u8x8_repeat(a: (args.ilevel & 0xFF) as base.u8) + m_hthresh = util.make_u8x8_repeat(a: (args.hlevel & 0xFF) as base.u8) + k3 = util.make_u8x8_repeat(a: 3) + k4 = util.make_u8x8_repeat(a: 4) + k63_16 = util.make_u16x8_repeat(a: 63) + + // ---- NeedsFilter + NeedsFilter2 ---- + t1 = p1.vabd_u8(b: q1) + t2 = t1.vand_u8(b: kFE).vshr_n_u8(b: 1) + t3 = p0.vabd_u8(b: q0) + t3 = t3.vqadd_u8(b: t3) + t3 = t3.vqadd_u8(b: t2) + mask = t3.vqsub_u8(b: m_thresh).vceq_u8(b: zero) + + t1 = p3.vabd_u8(b: p2) + mask = mask.vand_u8(b: t1.vqsub_u8(b: m_ithresh).vceq_u8(b: zero)) + t1 = p2.vabd_u8(b: p1) + mask = mask.vand_u8(b: t1.vqsub_u8(b: m_ithresh).vceq_u8(b: zero)) + t1 = p1.vabd_u8(b: p0) + mask = mask.vand_u8(b: t1.vqsub_u8(b: m_ithresh).vceq_u8(b: zero)) + t1 = q0.vabd_u8(b: q1) + mask = mask.vand_u8(b: t1.vqsub_u8(b: m_ithresh).vceq_u8(b: zero)) + t1 = q1.vabd_u8(b: q2) + mask = mask.vand_u8(b: t1.vqsub_u8(b: m_ithresh).vceq_u8(b: zero)) + t1 = q2.vabd_u8(b: q3) + mask = mask.vand_u8(b: t1.vqsub_u8(b: m_ithresh).vceq_u8(b: zero)) + + // ---- GetNotHEV ---- + t1 = p1.vabd_u8(b: p0) + t2 = q1.vabd_u8(b: q0) + t3 = t1.vqsub_u8(b: m_hthresh).vorr_u8(b: t2.vqsub_u8(b: m_hthresh)) + not_hev = t3.vceq_u8(b: zero) + + // ---- Convert to signed ---- + p2 = p2.veor_u8(b: sign_bit) + p1 = p1.veor_u8(b: sign_bit) + p0 = p0.veor_u8(b: sign_bit) + q0 = q0.veor_u8(b: sign_bit) + q1 = q1.veor_u8(b: sign_bit) + q2 = q2.veor_u8(b: sign_bit) + + // ---- Delta ---- + t1 = p1.vqsub_s8(b: q1) + t2 = q0.vqsub_s8(b: p0) + t1 = t1.vqadd_s8(b: t2) + t1 = t1.vqadd_s8(b: t2) + delta = t1.vqadd_s8(b: t2) + delta = delta.vand_u8(b: mask) + + // ---- Filter2 (HEV path) ---- + v4 = delta.vqadd_s8(b: k4) + v4 = v4.vshr_n_s8(b: 3) + v3 = delta.vqadd_s8(b: k3) + v3 = v3.vshr_n_s8(b: 3) + + // ---- 
Filter6 (!HEV path): widen to i16, multiply by 27/18/9 ---- + // u8x8 -> u16x8 sign extension (no lo/hi split needed for 8 pixels). + wide = delta.vmovl_s8() + + // a1 = (27 * delta + 63) >> 7 + tmp = wide.vmulq_n_u16(b: 27) + tmp = tmp.vaddq_u16(b: k63_16) + tmp = tmp.vshrq_n_s16(b: 7) + a1 = tmp.vqmovn_s16() + + // a2 = (18 * delta + 63) >> 7 + tmp = wide.vmulq_n_u16(b: 18) + tmp = tmp.vaddq_u16(b: k63_16) + tmp = tmp.vshrq_n_s16(b: 7) + a2 = tmp.vqmovn_s16() + + // a3 = (9 * delta + 63) >> 7 + tmp = wide.vmulq_n_u16(b: 9) + tmp = tmp.vaddq_u16(b: k63_16) + tmp = tmp.vshrq_n_s16(b: 7) + a3 = tmp.vqmovn_s16() + + // ---- Merge HEV and !HEV results ---- + p0_adj = v3.vbic_u8(b: not_hev) + p0_adj = p0_adj.vorr_u8(b: a1.vand_u8(b: not_hev)) + p0 = p0.vqadd_s8(b: p0_adj) + + q0_adj = v4.vbic_u8(b: not_hev) + q0_adj = q0_adj.vorr_u8(b: a1.vand_u8(b: not_hev)) + q0 = q0.vqsub_s8(b: q0_adj) + + p1 = p1.vqadd_s8(b: a2.vand_u8(b: not_hev)) + q1 = q1.vqsub_s8(b: a2.vand_u8(b: not_hev)) + p2 = p2.vqadd_s8(b: a3.vand_u8(b: not_hev)) + q2 = q2.vqsub_s8(b: a3.vand_u8(b: not_hev)) + + // ---- Convert back to unsigned ---- + p2 = p2.veor_u8(b: sign_bit) + p1 = p1.veor_u8(b: sign_bit) + p0 = p0.veor_u8(b: sign_bit) + q0 = q0.veor_u8(b: sign_bit) + q1 = q1.veor_u8(b: sign_bit) + q2 = q2.veor_u8(b: sign_bit) + + // ---- Store p2, p1, p0, q0, q1, q2 ---- + if args.q0_off < (3 * (this.uv_stride as base.u64)) { + return nothing + } + if (args.q0_off - (3 * (this.uv_stride as base.u64))) <= args.workbuf.length() { + args.workbuf = args.workbuf[args.q0_off - (3 * (this.uv_stride as base.u64)) ..] + } else { + return nothing + } + + if 8 <= args.workbuf.length() { + p2.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.uv_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.uv_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + p1.store_slice64!(a: args.workbuf[.. 
8]) + } + if (this.uv_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.uv_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + p0.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.uv_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.uv_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + q0.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.uv_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.uv_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + q1.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.uv_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.uv_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + q2.store_slice64!(a: args.workbuf[.. 8]) + } +} + +// VP8 normal loop filter (filter4) for 8-pixel chroma horizontal inner edges, ARM NEON. It reads the eight rows p3..q3 (8 bytes each, one uv_stride apart) that start 4 rows above args.q0_off in args.workbuf, but only p1, p0, q0 and q1 are modified and stored back; it returns early, before any store, if one of the reads would be out of bounds. + +pri func decoder.normal_vfilter_inner_8_arm_neon!(workbuf: slice base.u8, q0_off: base.u64, + level: base.u32, ilevel: base.u32, hlevel: base.u32), + choose cpu_arch >= arm_neon, +{ + var util : base.arm_neon_utility + var wb : slice base.u8 + + var p3 : base.arm_neon_u8x8 + var p2 : base.arm_neon_u8x8 + var p1 : base.arm_neon_u8x8 + var p0 : base.arm_neon_u8x8 + var q0 : base.arm_neon_u8x8 + var q1 : base.arm_neon_u8x8 + var q2 : base.arm_neon_u8x8 + var q3 : base.arm_neon_u8x8 + var zero : base.arm_neon_u8x8 + var sign_bit : base.arm_neon_u8x8 + var kFE : base.arm_neon_u8x8 + var m_thresh : base.arm_neon_u8x8 + var m_ithresh : base.arm_neon_u8x8 + var m_hthresh : base.arm_neon_u8x8 + var k1 : base.arm_neon_u8x8 + var k3 : base.arm_neon_u8x8 + var k4 : base.arm_neon_u8x8 + var mask : base.arm_neon_u8x8 + var not_hev : base.arm_neon_u8x8 + var delta : base.arm_neon_u8x8 + var v3 : base.arm_neon_u8x8 + var v4 : base.arm_neon_u8x8 + var a3 : base.arm_neon_u8x8 + var t1 : base.arm_neon_u8x8 + var t2 : base.arm_neon_u8x8 + var t3 :
base.arm_neon_u8x8 + + if args.q0_off < (4 * (this.uv_stride as base.u64)) { + return nothing + } + + wb = args.workbuf + if (args.q0_off - (4 * (this.uv_stride as base.u64))) <= wb.length() { + wb = wb[args.q0_off - (4 * (this.uv_stride as base.u64)) ..] + } else { + return nothing + } + + // Load 8 rows (p3, p2, p1, p0, q0, q1, q2, q3), 8 bytes each, stepping by uv_stride; return without filtering if a row is out of bounds. + if 8 > wb.length() { + return nothing + } + p3 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.uv_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.uv_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + p2 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.uv_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.uv_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + p1 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.uv_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.uv_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + p0 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.uv_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.uv_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + q0 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.uv_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.uv_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + q1 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.uv_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.uv_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + q2 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.uv_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.uv_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + q3 = util.make_u8x8_slice64(a: wb[.. 8]) + + // Constants (the three thresholds keep only the low 8 bits of level, ilevel and hlevel).
+ zero = util.make_u8x8_repeat(a: 0) + sign_bit = util.make_u8x8_repeat(a: 0x80) + kFE = util.make_u8x8_repeat(a: 0xFE) + m_thresh = util.make_u8x8_repeat(a: (args.level & 0xFF) as base.u8) + m_ithresh = util.make_u8x8_repeat(a: (args.ilevel & 0xFF) as base.u8) + m_hthresh = util.make_u8x8_repeat(a: (args.hlevel & 0xFF) as base.u8) + k1 = util.make_u8x8_repeat(a: 1) + k3 = util.make_u8x8_repeat(a: 3) + k4 = util.make_u8x8_repeat(a: 4) + + // ---- NeedsFilter and NeedsFilter2: mask selects the lanes where 2*|p0-q0| plus |p1-q1|/2 (saturating) is <= the level threshold and all six neighbour differences (p3-p2, p2-p1, p1-p0, q0-q1, q1-q2, q2-q3) are <= the ilevel threshold ---- + t1 = p1.vabd_u8(b: q1) + t2 = t1.vand_u8(b: kFE).vshr_n_u8(b: 1) + t3 = p0.vabd_u8(b: q0) + t3 = t3.vqadd_u8(b: t3) + t3 = t3.vqadd_u8(b: t2) + mask = t3.vqsub_u8(b: m_thresh).vceq_u8(b: zero) + + t1 = p3.vabd_u8(b: p2) + mask = mask.vand_u8(b: t1.vqsub_u8(b: m_ithresh).vceq_u8(b: zero)) + t1 = p2.vabd_u8(b: p1) + mask = mask.vand_u8(b: t1.vqsub_u8(b: m_ithresh).vceq_u8(b: zero)) + t1 = p1.vabd_u8(b: p0) + mask = mask.vand_u8(b: t1.vqsub_u8(b: m_ithresh).vceq_u8(b: zero)) + t1 = q0.vabd_u8(b: q1) + mask = mask.vand_u8(b: t1.vqsub_u8(b: m_ithresh).vceq_u8(b: zero)) + t1 = q1.vabd_u8(b: q2) + mask = mask.vand_u8(b: t1.vqsub_u8(b: m_ithresh).vceq_u8(b: zero)) + t1 = q2.vabd_u8(b: q3) + mask = mask.vand_u8(b: t1.vqsub_u8(b: m_ithresh).vceq_u8(b: zero)) + + // ---- GetNotHEV: not_hev selects the lanes where both |p1-p0| and |q1-q0| are <= the hlevel threshold ---- + t1 = p1.vabd_u8(b: p0) + t2 = q1.vabd_u8(b: q0) + t3 = t1.vqsub_u8(b: m_hthresh).vorr_u8(b: t2.vqsub_u8(b: m_hthresh)) + not_hev = t3.vceq_u8(b: zero) + + // ---- Convert to signed by XOR with 0x80 (only the four rows that filter4 modifies) ---- + p1 = p1.veor_u8(b: sign_bit) + p0 = p0.veor_u8(b: sign_bit) + q0 = q0.veor_u8(b: sign_bit) + q1 = q1.veor_u8(b: sign_bit) + + // ---- Delta (filter4: !HEV uses no p1-q1 term, so vbic clears that term in the not_hev lanes before three times (q0 - p0) is added) ---- + t1 = p1.vqsub_s8(b: q1) + t1 = t1.vbic_u8(b: not_hev) + t2 = q0.vqsub_s8(b: p0) + t1 = t1.vqadd_s8(b: t2) + t1 = t1.vqadd_s8(b: t2) + delta = t1.vqadd_s8(b: t2) + delta = delta.vand_u8(b: mask) + + // ---- v4 = (delta+4) >> 3 is subtracted from q0, v3 = (delta+3) >> 3 is added to p0 ---- + v4 = delta.vqadd_s8(b: k4) + v4 = v4.vshr_n_s8(b: 3) + v3 = delta.vqadd_s8(b: k3) + v3 = v3.vshr_n_s8(b: 3) + + q0 = q0.vqsub_s8(b: v4)
+ p0 = p0.vqadd_s8(b: v3) + + // ---- Filter4 !HEV: a3 = (v4+1) >> 1, kept only in the not_hev lanes, then applied to q1 and p1 ---- + a3 = v4.vqadd_s8(b: k1) + a3 = a3.vshr_n_s8(b: 1) + a3 = a3.vand_u8(b: not_hev) + + q1 = q1.vqsub_s8(b: a3) + p1 = p1.vqadd_s8(b: a3) + + // ---- Convert back to unsigned (XOR with 0x80 again) ---- + p1 = p1.veor_u8(b: sign_bit) + p0 = p0.veor_u8(b: sign_bit) + q0 = q0.veor_u8(b: sign_bit) + q1 = q1.veor_u8(b: sign_bit) + + // ---- Store p1, p0, q0, q1 as four rows beginning 2 * uv_stride before q0_off; each store and each row step is guarded by a length check ---- + if args.q0_off < (2 * (this.uv_stride as base.u64)) { + return nothing + } + if (args.q0_off - (2 * (this.uv_stride as base.u64))) <= args.workbuf.length() { + args.workbuf = args.workbuf[args.q0_off - (2 * (this.uv_stride as base.u64)) ..] + } else { + return nothing + } + + if 8 <= args.workbuf.length() { + p1.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.uv_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.uv_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + p0.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.uv_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.uv_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + q0.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.uv_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.uv_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + q1.store_slice64!(a: args.workbuf[.. 8]) + } +} + +// VP8 normal loop filter (filter6) for 8-row U/V vertical MB edges, ARM NEON. +// +// Horizontal filter: processes a vertical edge where consecutive pixels are at +// step=1 (same row). Uses transpose-filter-transpose: load 8 rows of 8 bytes, +// 8x8 transpose to get column vectors, apply filter6, inverse transpose, store.
+ +pri func decoder.normal_hfilter_mb_8_arm_neon!(workbuf: slice base.u8, q0_off: base.u64, + level: base.u32, ilevel: base.u32, hlevel: base.u32), + choose cpu_arch >= arm_neon, +{ + var util : base.arm_neon_utility + var wb : slice base.u8 + + // Row/scratch registers (reused across transpose phases): r0..r7 first hold the loaded rows, s0..s7 finally hold the rows to store. + var r0 : base.arm_neon_u8x8 + var r1 : base.arm_neon_u8x8 + var r2 : base.arm_neon_u8x8 + var r3 : base.arm_neon_u8x8 + var r4 : base.arm_neon_u8x8 + var r5 : base.arm_neon_u8x8 + var r6 : base.arm_neon_u8x8 + var r7 : base.arm_neon_u8x8 + var s0 : base.arm_neon_u8x8 + var s1 : base.arm_neon_u8x8 + var s2 : base.arm_neon_u8x8 + var s3 : base.arm_neon_u8x8 + var s4 : base.arm_neon_u8x8 + var s5 : base.arm_neon_u8x8 + var s6 : base.arm_neon_u8x8 + var s7 : base.arm_neon_u8x8 + + // Pixel columns after forward transpose (lane i of each vector comes from loaded row i). + var p3 : base.arm_neon_u8x8 + var p2 : base.arm_neon_u8x8 + var p1 : base.arm_neon_u8x8 + var p0 : base.arm_neon_u8x8 + var q0 : base.arm_neon_u8x8 + var q1 : base.arm_neon_u8x8 + var q2 : base.arm_neon_u8x8 + var q3 : base.arm_neon_u8x8 + + // Filter constants and temporaries. + var zero : base.arm_neon_u8x8 + var sign_bit : base.arm_neon_u8x8 + var kFE : base.arm_neon_u8x8 + var m_thresh : base.arm_neon_u8x8 + var m_ithresh : base.arm_neon_u8x8 + var m_hthresh : base.arm_neon_u8x8 + var k3 : base.arm_neon_u8x8 + var k4 : base.arm_neon_u8x8 + var mask : base.arm_neon_u8x8 + var not_hev : base.arm_neon_u8x8 + var delta : base.arm_neon_u8x8 + var v3 : base.arm_neon_u8x8 + var v4 : base.arm_neon_u8x8 + var a1 : base.arm_neon_u8x8 + var a2 : base.arm_neon_u8x8 + var a3 : base.arm_neon_u8x8 + var t1 : base.arm_neon_u8x8 + var t2 : base.arm_neon_u8x8 + var t3 : base.arm_neon_u8x8 + var p0_adj : base.arm_neon_u8x8 + var q0_adj : base.arm_neon_u8x8 + + // Widening variables for the filter6 multiplies (the 8-bit delta is widened to 16-bit lanes). + var wide : base.arm_neon_u16x8 + var tmp : base.arm_neon_u16x8 + var k63_16 : base.arm_neon_u16x8 + + // Need 4 bytes before q0 for p3..p0 columns.
+ if args.q0_off < 4 { + return nothing + } + + wb = args.workbuf + if (args.q0_off - 4) > wb.length() { + return nothing + } + wb = wb[(args.q0_off - 4) ..] + + // ==== Load 8 rows of 8 bytes each (bytes p3..q3 of a row are contiguous, rows are uv_stride apart); return without filtering if a row is out of bounds ==== + if 8 > wb.length() { + return nothing + } + r0 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.uv_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.uv_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r1 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.uv_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.uv_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r2 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.uv_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.uv_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r3 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.uv_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.uv_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r4 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.uv_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.uv_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r5 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.uv_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.uv_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r6 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.uv_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.uv_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r7 = util.make_u8x8_slice64(a: wb[.. 8]) + + // ==== Forward 8x8 transpose: rows → columns ==== + // Step 1: byte-level transpose (2x2 blocks).
+ s0 = r0.vtrn1_u8(b: r1) + s1 = r0.vtrn2_u8(b: r1) + s2 = r2.vtrn1_u8(b: r3) + s3 = r2.vtrn2_u8(b: r3) + s4 = r4.vtrn1_u8(b: r5) + s5 = r4.vtrn2_u8(b: r5) + s6 = r6.vtrn1_u8(b: r7) + s7 = r6.vtrn2_u8(b: r7) + + // Step 2: halfword-level transpose (4x4 blocks); note the r0, r2, r1, r3 result order. + r0 = s0.as_u16x4().vtrn1_u16(b: s2.as_u16x4()).as_u8x8() + r2 = s0.as_u16x4().vtrn2_u16(b: s2.as_u16x4()).as_u8x8() + r1 = s1.as_u16x4().vtrn1_u16(b: s3.as_u16x4()).as_u8x8() + r3 = s1.as_u16x4().vtrn2_u16(b: s3.as_u16x4()).as_u8x8() + r4 = s4.as_u16x4().vtrn1_u16(b: s6.as_u16x4()).as_u8x8() + r6 = s4.as_u16x4().vtrn2_u16(b: s6.as_u16x4()).as_u8x8() + r5 = s5.as_u16x4().vtrn1_u16(b: s7.as_u16x4()).as_u8x8() + r7 = s5.as_u16x4().vtrn2_u16(b: s7.as_u16x4()).as_u8x8() + + // Step 3: word-level transpose → pixel columns (byte columns 0..7 of the loaded rows become p3, p2, p1, p0, q0, q1, q2, q3). + p3 = r0.as_u32x2().vtrn1_u32(b: r4.as_u32x2()).as_u8x8() + q0 = r0.as_u32x2().vtrn2_u32(b: r4.as_u32x2()).as_u8x8() + p2 = r1.as_u32x2().vtrn1_u32(b: r5.as_u32x2()).as_u8x8() + q1 = r1.as_u32x2().vtrn2_u32(b: r5.as_u32x2()).as_u8x8() + p1 = r2.as_u32x2().vtrn1_u32(b: r6.as_u32x2()).as_u8x8() + q2 = r2.as_u32x2().vtrn2_u32(b: r6.as_u32x2()).as_u8x8() + p0 = r3.as_u32x2().vtrn1_u32(b: r7.as_u32x2()).as_u8x8() + q3 = r3.as_u32x2().vtrn2_u32(b: r7.as_u32x2()).as_u8x8() + + // ==== Filter6 computation (same as normal_vfilter_mb_8_arm_neon) ==== + + // Constants (the three thresholds keep only the low 8 bits of level, ilevel and hlevel).
+ zero = util.make_u8x8_repeat(a: 0) + sign_bit = util.make_u8x8_repeat(a: 0x80) + kFE = util.make_u8x8_repeat(a: 0xFE) + m_thresh = util.make_u8x8_repeat(a: (args.level & 0xFF) as base.u8) + m_ithresh = util.make_u8x8_repeat(a: (args.ilevel & 0xFF) as base.u8) + m_hthresh = util.make_u8x8_repeat(a: (args.hlevel & 0xFF) as base.u8) + k3 = util.make_u8x8_repeat(a: 3) + k4 = util.make_u8x8_repeat(a: 4) + k63_16 = util.make_u16x8_repeat(a: 63) + + // ---- NeedsFilter and NeedsFilter2: mask selects the lanes where 2*|p0-q0| plus |p1-q1|/2 (saturating) is <= the level threshold and all six neighbour differences (p3-p2, p2-p1, p1-p0, q0-q1, q1-q2, q2-q3) are <= the ilevel threshold ---- + t1 = p1.vabd_u8(b: q1) + t2 = t1.vand_u8(b: kFE).vshr_n_u8(b: 1) + t3 = p0.vabd_u8(b: q0) + t3 = t3.vqadd_u8(b: t3) + t3 = t3.vqadd_u8(b: t2) + mask = t3.vqsub_u8(b: m_thresh).vceq_u8(b: zero) + + t1 = p3.vabd_u8(b: p2) + mask = mask.vand_u8(b: t1.vqsub_u8(b: m_ithresh).vceq_u8(b: zero)) + t1 = p2.vabd_u8(b: p1) + mask = mask.vand_u8(b: t1.vqsub_u8(b: m_ithresh).vceq_u8(b: zero)) + t1 = p1.vabd_u8(b: p0) + mask = mask.vand_u8(b: t1.vqsub_u8(b: m_ithresh).vceq_u8(b: zero)) + t1 = q0.vabd_u8(b: q1) + mask = mask.vand_u8(b: t1.vqsub_u8(b: m_ithresh).vceq_u8(b: zero)) + t1 = q1.vabd_u8(b: q2) + mask = mask.vand_u8(b: t1.vqsub_u8(b: m_ithresh).vceq_u8(b: zero)) + t1 = q2.vabd_u8(b: q3) + mask = mask.vand_u8(b: t1.vqsub_u8(b: m_ithresh).vceq_u8(b: zero)) + + // ---- GetNotHEV: not_hev selects the lanes where both |p1-p0| and |q1-q0| are <= the hlevel threshold ---- + t1 = p1.vabd_u8(b: p0) + t2 = q1.vabd_u8(b: q0) + t3 = t1.vqsub_u8(b: m_hthresh).vorr_u8(b: t2.vqsub_u8(b: m_hthresh)) + not_hev = t3.vceq_u8(b: zero) + + // ---- Convert to signed by XOR with 0x80 (p3 and q3 are only needed for the mask, so they stay unsigned) ---- + p2 = p2.veor_u8(b: sign_bit) + p1 = p1.veor_u8(b: sign_bit) + p0 = p0.veor_u8(b: sign_bit) + q0 = q0.veor_u8(b: sign_bit) + q1 = q1.veor_u8(b: sign_bit) + q2 = q2.veor_u8(b: sign_bit) + + // ---- Delta: (p1 - q1) plus three times (q0 - p0), accumulated with saturating ops, then zeroed outside mask ---- + t1 = p1.vqsub_s8(b: q1) + t2 = q0.vqsub_s8(b: p0) + t1 = t1.vqadd_s8(b: t2) + t1 = t1.vqadd_s8(b: t2) + delta = t1.vqadd_s8(b: t2) + delta = delta.vand_u8(b: mask) + + // ---- Filter2 (HEV path): v4 = (delta+4) >> 3 and v3 = (delta+3) >> 3 ---- + v4 = delta.vqadd_s8(b: k4) + v4 = v4.vshr_n_s8(b: 3) + v3 = delta.vqadd_s8(b: k3) + v3 = v3.vshr_n_s8(b: 3) + + // ----
Filter6 (!HEV path): widen to i16, multiply by 27/18/9 ---- + wide = delta.vmovl_s8() + + tmp = wide.vmulq_n_u16(b: 27) + tmp = tmp.vaddq_u16(b: k63_16) + tmp = tmp.vshrq_n_s16(b: 7) + a1 = tmp.vqmovn_s16() + + tmp = wide.vmulq_n_u16(b: 18) + tmp = tmp.vaddq_u16(b: k63_16) + tmp = tmp.vshrq_n_s16(b: 7) + a2 = tmp.vqmovn_s16() + + tmp = wide.vmulq_n_u16(b: 9) + tmp = tmp.vaddq_u16(b: k63_16) + tmp = tmp.vshrq_n_s16(b: 7) + a3 = tmp.vqmovn_s16() + + // ---- Merge HEV and !HEV results: p0 takes v3 and q0 takes v4 where HEV, both take a1 where !HEV; p1/q1 (a2) and p2/q2 (a3) change only where !HEV ---- + p0_adj = v3.vbic_u8(b: not_hev) + p0_adj = p0_adj.vorr_u8(b: a1.vand_u8(b: not_hev)) + p0 = p0.vqadd_s8(b: p0_adj) + + q0_adj = v4.vbic_u8(b: not_hev) + q0_adj = q0_adj.vorr_u8(b: a1.vand_u8(b: not_hev)) + q0 = q0.vqsub_s8(b: q0_adj) + + p1 = p1.vqadd_s8(b: a2.vand_u8(b: not_hev)) + q1 = q1.vqsub_s8(b: a2.vand_u8(b: not_hev)) + p2 = p2.vqadd_s8(b: a3.vand_u8(b: not_hev)) + q2 = q2.vqsub_s8(b: a3.vand_u8(b: not_hev)) + + // ---- Convert back to unsigned (XOR with 0x80 again) ---- + p2 = p2.veor_u8(b: sign_bit) + p1 = p1.veor_u8(b: sign_bit) + p0 = p0.veor_u8(b: sign_bit) + q0 = q0.veor_u8(b: sign_bit) + q1 = q1.veor_u8(b: sign_bit) + q2 = q2.veor_u8(b: sign_bit) + + // ==== Inverse 8x8 transpose: columns → rows (the same three-step network, fed with p3, p2, p1, p0, q0, q1, q2, q3) ==== + // Step 1: byte-level transpose. + s0 = p3.vtrn1_u8(b: p2) + s1 = p3.vtrn2_u8(b: p2) + s2 = p1.vtrn1_u8(b: p0) + s3 = p1.vtrn2_u8(b: p0) + s4 = q0.vtrn1_u8(b: q1) + s5 = q0.vtrn2_u8(b: q1) + s6 = q2.vtrn1_u8(b: q3) + s7 = q2.vtrn2_u8(b: q3) + + // Step 2: halfword-level transpose.
+ r0 = s0.as_u16x4().vtrn1_u16(b: s2.as_u16x4()).as_u8x8() + r2 = s0.as_u16x4().vtrn2_u16(b: s2.as_u16x4()).as_u8x8() + r1 = s1.as_u16x4().vtrn1_u16(b: s3.as_u16x4()).as_u8x8() + r3 = s1.as_u16x4().vtrn2_u16(b: s3.as_u16x4()).as_u8x8() + r4 = s4.as_u16x4().vtrn1_u16(b: s6.as_u16x4()).as_u8x8() + r6 = s4.as_u16x4().vtrn2_u16(b: s6.as_u16x4()).as_u8x8() + r5 = s5.as_u16x4().vtrn1_u16(b: s7.as_u16x4()).as_u8x8() + r7 = s5.as_u16x4().vtrn2_u16(b: s7.as_u16x4()).as_u8x8() + + // Step 3: word-level transpose → output rows (s0..s7 are rows 0..7, in store order). + s0 = r0.as_u32x2().vtrn1_u32(b: r4.as_u32x2()).as_u8x8() + s4 = r0.as_u32x2().vtrn2_u32(b: r4.as_u32x2()).as_u8x8() + s1 = r1.as_u32x2().vtrn1_u32(b: r5.as_u32x2()).as_u8x8() + s5 = r1.as_u32x2().vtrn2_u32(b: r5.as_u32x2()).as_u8x8() + s2 = r2.as_u32x2().vtrn1_u32(b: r6.as_u32x2()).as_u8x8() + s6 = r2.as_u32x2().vtrn2_u32(b: r6.as_u32x2()).as_u8x8() + s3 = r3.as_u32x2().vtrn1_u32(b: r7.as_u32x2()).as_u8x8() + s7 = r3.as_u32x2().vtrn2_u32(b: r7.as_u32x2()).as_u8x8() + + // ==== Store 8 rows of 8 bytes back at q0_off - 4 (the p3 and q3 bytes are rewritten with the values that were loaded, since the filter does not change them) ==== + if (args.q0_off - 4) > args.workbuf.length() { + return nothing + } + args.workbuf = args.workbuf[(args.q0_off - 4) ..] + + if 8 <= args.workbuf.length() { + s0.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.uv_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.uv_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s1.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.uv_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.uv_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s2.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.uv_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.uv_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s3.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.uv_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.uv_stride as base.u64) ..]
+ } + + if 8 <= args.workbuf.length() { + s4.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.uv_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.uv_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s5.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.uv_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.uv_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s6.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.uv_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.uv_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s7.store_slice64!(a: args.workbuf[.. 8]) + } +} + +// VP8 normal loop filter (filter4) for 8-row U/V vertical inner edges, ARM NEON. +// +// Same transpose approach as hfilter_mb_8 but uses filter4 (four_not_six=true). + +pri func decoder.normal_hfilter_inner_8_arm_neon!(workbuf: slice base.u8, q0_off: base.u64, + level: base.u32, ilevel: base.u32, hlevel: base.u32), + choose cpu_arch >= arm_neon, +{ + var util : base.arm_neon_utility + var wb : slice base.u8 + + var r0 : base.arm_neon_u8x8 + var r1 : base.arm_neon_u8x8 + var r2 : base.arm_neon_u8x8 + var r3 : base.arm_neon_u8x8 + var r4 : base.arm_neon_u8x8 + var r5 : base.arm_neon_u8x8 + var r6 : base.arm_neon_u8x8 + var r7 : base.arm_neon_u8x8 + var s0 : base.arm_neon_u8x8 + var s1 : base.arm_neon_u8x8 + var s2 : base.arm_neon_u8x8 + var s3 : base.arm_neon_u8x8 + var s4 : base.arm_neon_u8x8 + var s5 : base.arm_neon_u8x8 + var s6 : base.arm_neon_u8x8 + var s7 : base.arm_neon_u8x8 + + var p3 : base.arm_neon_u8x8 + var p2 : base.arm_neon_u8x8 + var p1 : base.arm_neon_u8x8 + var p0 : base.arm_neon_u8x8 + var q0 : base.arm_neon_u8x8 + var q1 : base.arm_neon_u8x8 + var q2 : base.arm_neon_u8x8 + var q3 : base.arm_neon_u8x8 + + var zero : base.arm_neon_u8x8 + var sign_bit : base.arm_neon_u8x8 + var kFE : base.arm_neon_u8x8 + var m_thresh : base.arm_neon_u8x8 
+ var m_ithresh : base.arm_neon_u8x8 + var m_hthresh : base.arm_neon_u8x8 + var k1 : base.arm_neon_u8x8 + var k3 : base.arm_neon_u8x8 + var k4 : base.arm_neon_u8x8 + var mask : base.arm_neon_u8x8 + var not_hev : base.arm_neon_u8x8 + var delta : base.arm_neon_u8x8 + var v3 : base.arm_neon_u8x8 + var v4 : base.arm_neon_u8x8 + var a3 : base.arm_neon_u8x8 + var t1 : base.arm_neon_u8x8 + var t2 : base.arm_neon_u8x8 + var t3 : base.arm_neon_u8x8 + + if args.q0_off < 4 { + return nothing + } + + wb = args.workbuf + if (args.q0_off - 4) > wb.length() { + return nothing + } + wb = wb[(args.q0_off - 4) ..] + + // ==== Load 8 rows of 8 bytes each ==== + if 8 > wb.length() { + return nothing + } + r0 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.uv_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.uv_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r1 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.uv_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.uv_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r2 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.uv_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.uv_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r3 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.uv_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.uv_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r4 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.uv_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.uv_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r5 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.uv_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.uv_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r6 = util.make_u8x8_slice64(a: wb[.. 
8]) + if (this.uv_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.uv_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r7 = util.make_u8x8_slice64(a: wb[.. 8]) + + // ==== Forward 8x8 transpose ==== + s0 = r0.vtrn1_u8(b: r1) + s1 = r0.vtrn2_u8(b: r1) + s2 = r2.vtrn1_u8(b: r3) + s3 = r2.vtrn2_u8(b: r3) + s4 = r4.vtrn1_u8(b: r5) + s5 = r4.vtrn2_u8(b: r5) + s6 = r6.vtrn1_u8(b: r7) + s7 = r6.vtrn2_u8(b: r7) + + r0 = s0.as_u16x4().vtrn1_u16(b: s2.as_u16x4()).as_u8x8() + r2 = s0.as_u16x4().vtrn2_u16(b: s2.as_u16x4()).as_u8x8() + r1 = s1.as_u16x4().vtrn1_u16(b: s3.as_u16x4()).as_u8x8() + r3 = s1.as_u16x4().vtrn2_u16(b: s3.as_u16x4()).as_u8x8() + r4 = s4.as_u16x4().vtrn1_u16(b: s6.as_u16x4()).as_u8x8() + r6 = s4.as_u16x4().vtrn2_u16(b: s6.as_u16x4()).as_u8x8() + r5 = s5.as_u16x4().vtrn1_u16(b: s7.as_u16x4()).as_u8x8() + r7 = s5.as_u16x4().vtrn2_u16(b: s7.as_u16x4()).as_u8x8() + + p3 = r0.as_u32x2().vtrn1_u32(b: r4.as_u32x2()).as_u8x8() + q0 = r0.as_u32x2().vtrn2_u32(b: r4.as_u32x2()).as_u8x8() + p2 = r1.as_u32x2().vtrn1_u32(b: r5.as_u32x2()).as_u8x8() + q1 = r1.as_u32x2().vtrn2_u32(b: r5.as_u32x2()).as_u8x8() + p1 = r2.as_u32x2().vtrn1_u32(b: r6.as_u32x2()).as_u8x8() + q2 = r2.as_u32x2().vtrn2_u32(b: r6.as_u32x2()).as_u8x8() + p0 = r3.as_u32x2().vtrn1_u32(b: r7.as_u32x2()).as_u8x8() + q3 = r3.as_u32x2().vtrn2_u32(b: r7.as_u32x2()).as_u8x8() + + // ==== Filter4 computation (same as normal_vfilter_inner_8_arm_neon) ==== + + zero = util.make_u8x8_repeat(a: 0) + sign_bit = util.make_u8x8_repeat(a: 0x80) + kFE = util.make_u8x8_repeat(a: 0xFE) + m_thresh = util.make_u8x8_repeat(a: (args.level & 0xFF) as base.u8) + m_ithresh = util.make_u8x8_repeat(a: (args.ilevel & 0xFF) as base.u8) + m_hthresh = util.make_u8x8_repeat(a: (args.hlevel & 0xFF) as base.u8) + k1 = util.make_u8x8_repeat(a: 1) + k3 = util.make_u8x8_repeat(a: 3) + k4 = util.make_u8x8_repeat(a: 4) + + // ---- NeedsFilter + NeedsFilter2 ---- + t1 = p1.vabd_u8(b: q1) + t2 = 
t1.vand_u8(b: kFE).vshr_n_u8(b: 1) + t3 = p0.vabd_u8(b: q0) + t3 = t3.vqadd_u8(b: t3) + t3 = t3.vqadd_u8(b: t2) + mask = t3.vqsub_u8(b: m_thresh).vceq_u8(b: zero) + + t1 = p3.vabd_u8(b: p2) + mask = mask.vand_u8(b: t1.vqsub_u8(b: m_ithresh).vceq_u8(b: zero)) + t1 = p2.vabd_u8(b: p1) + mask = mask.vand_u8(b: t1.vqsub_u8(b: m_ithresh).vceq_u8(b: zero)) + t1 = p1.vabd_u8(b: p0) + mask = mask.vand_u8(b: t1.vqsub_u8(b: m_ithresh).vceq_u8(b: zero)) + t1 = q0.vabd_u8(b: q1) + mask = mask.vand_u8(b: t1.vqsub_u8(b: m_ithresh).vceq_u8(b: zero)) + t1 = q1.vabd_u8(b: q2) + mask = mask.vand_u8(b: t1.vqsub_u8(b: m_ithresh).vceq_u8(b: zero)) + t1 = q2.vabd_u8(b: q3) + mask = mask.vand_u8(b: t1.vqsub_u8(b: m_ithresh).vceq_u8(b: zero)) + + // ---- GetNotHEV ---- + t1 = p1.vabd_u8(b: p0) + t2 = q1.vabd_u8(b: q0) + t3 = t1.vqsub_u8(b: m_hthresh).vorr_u8(b: t2.vqsub_u8(b: m_hthresh)) + not_hev = t3.vceq_u8(b: zero) + + // ---- Convert to signed ---- + p1 = p1.veor_u8(b: sign_bit) + p0 = p0.veor_u8(b: sign_bit) + q0 = q0.veor_u8(b: sign_bit) + q1 = q1.veor_u8(b: sign_bit) + + // ---- Delta (filter4: !HEV uses no p1-q1 term) ---- + t1 = p1.vqsub_s8(b: q1) + t1 = t1.vbic_u8(b: not_hev) + t2 = q0.vqsub_s8(b: p0) + t1 = t1.vqadd_s8(b: t2) + t1 = t1.vqadd_s8(b: t2) + delta = t1.vqadd_s8(b: t2) + delta = delta.vand_u8(b: mask) + + // ---- v4, v3 ---- + v4 = delta.vqadd_s8(b: k4) + v4 = v4.vshr_n_s8(b: 3) + v3 = delta.vqadd_s8(b: k3) + v3 = v3.vshr_n_s8(b: 3) + + q0 = q0.vqsub_s8(b: v4) + p0 = p0.vqadd_s8(b: v3) + + // ---- Filter4 !HEV: a3 ---- + a3 = v4.vqadd_s8(b: k1) + a3 = a3.vshr_n_s8(b: 1) + a3 = a3.vand_u8(b: not_hev) + + q1 = q1.vqsub_s8(b: a3) + p1 = p1.vqadd_s8(b: a3) + + // ---- Convert back ---- + p1 = p1.veor_u8(b: sign_bit) + p0 = p0.veor_u8(b: sign_bit) + q0 = q0.veor_u8(b: sign_bit) + q1 = q1.veor_u8(b: sign_bit) + + // ==== Inverse 8x8 transpose ==== + s0 = p3.vtrn1_u8(b: p2) + s1 = p3.vtrn2_u8(b: p2) + s2 = p1.vtrn1_u8(b: p0) + s3 = p1.vtrn2_u8(b: p0) + s4 = q0.vtrn1_u8(b: 
q1) + s5 = q0.vtrn2_u8(b: q1) + s6 = q2.vtrn1_u8(b: q3) + s7 = q2.vtrn2_u8(b: q3) + + r0 = s0.as_u16x4().vtrn1_u16(b: s2.as_u16x4()).as_u8x8() + r2 = s0.as_u16x4().vtrn2_u16(b: s2.as_u16x4()).as_u8x8() + r1 = s1.as_u16x4().vtrn1_u16(b: s3.as_u16x4()).as_u8x8() + r3 = s1.as_u16x4().vtrn2_u16(b: s3.as_u16x4()).as_u8x8() + r4 = s4.as_u16x4().vtrn1_u16(b: s6.as_u16x4()).as_u8x8() + r6 = s4.as_u16x4().vtrn2_u16(b: s6.as_u16x4()).as_u8x8() + r5 = s5.as_u16x4().vtrn1_u16(b: s7.as_u16x4()).as_u8x8() + r7 = s5.as_u16x4().vtrn2_u16(b: s7.as_u16x4()).as_u8x8() + + s0 = r0.as_u32x2().vtrn1_u32(b: r4.as_u32x2()).as_u8x8() + s4 = r0.as_u32x2().vtrn2_u32(b: r4.as_u32x2()).as_u8x8() + s1 = r1.as_u32x2().vtrn1_u32(b: r5.as_u32x2()).as_u8x8() + s5 = r1.as_u32x2().vtrn2_u32(b: r5.as_u32x2()).as_u8x8() + s2 = r2.as_u32x2().vtrn1_u32(b: r6.as_u32x2()).as_u8x8() + s6 = r2.as_u32x2().vtrn2_u32(b: r6.as_u32x2()).as_u8x8() + s3 = r3.as_u32x2().vtrn1_u32(b: r7.as_u32x2()).as_u8x8() + s7 = r3.as_u32x2().vtrn2_u32(b: r7.as_u32x2()).as_u8x8() + + // ==== Store 8 rows ==== + if (args.q0_off - 4) > args.workbuf.length() { + return nothing + } + args.workbuf = args.workbuf[(args.q0_off - 4) ..] + + if 8 <= args.workbuf.length() { + s0.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.uv_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.uv_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s1.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.uv_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.uv_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s2.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.uv_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.uv_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s3.store_slice64!(a: args.workbuf[.. 
8]) + } + if (this.uv_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.uv_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s4.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.uv_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.uv_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s5.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.uv_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.uv_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s6.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.uv_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.uv_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s7.store_slice64!(a: args.workbuf[.. 8]) + } +} + +// VP8 normal loop filter (filter6) for 16-row Y horizontal MB edges, ARM NEON. +// +// Horizontal filter on 16 rows: two 8x8 transposes → u8x16 columns, filter6, +// two inverse transposes → store. Uses y_stride for row stepping. +// +// Each row is the 8 bytes starting at q0_off - 4: bytes 0..3 become the +// p3..p0 columns and bytes 4..7 become the q0..q3 columns. All 16 rows are +// loaded and bounds-checked before anything is stored, so every early return +// in the load phase leaves workbuf unmodified. +// +// level bounds the combined p0/q0 and p1/q1 edge test, ilevel bounds each +// adjacent-pixel difference and hlevel is the threshold of the HEV test. Only +// the low 8 bits of each are used. +// +// NOTE(review): the comments in this function assume that each arm_neon +// method behaves like the same-named ARM NEON intrinsic - confirm. + +pri func decoder.normal_hfilter_mb_16_arm_neon!(workbuf: slice base.u8, q0_off: base.u64, + level: base.u32, ilevel: base.u32, hlevel: base.u32), + choose cpu_arch >= arm_neon, +{ + var util : base.arm_neon_utility + var wb : slice base.u8 + + // Row/scratch registers for 8x8 transpose. + var r0 : base.arm_neon_u8x8 + var r1 : base.arm_neon_u8x8 + var r2 : base.arm_neon_u8x8 + var r3 : base.arm_neon_u8x8 + var r4 : base.arm_neon_u8x8 + var r5 : base.arm_neon_u8x8 + var r6 : base.arm_neon_u8x8 + var r7 : base.arm_neon_u8x8 + var s0 : base.arm_neon_u8x8 + var s1 : base.arm_neon_u8x8 + var s2 : base.arm_neon_u8x8 + var s3 : base.arm_neon_u8x8 + var s4 : base.arm_neon_u8x8 + var s5 : base.arm_neon_u8x8 + var s6 : base.arm_neon_u8x8 + var s7 : base.arm_neon_u8x8 + + // Low column halves from first 8x8 transpose. 
+ var p3_lo : base.arm_neon_u8x8 + var p2_lo : base.arm_neon_u8x8 + var p1_lo : base.arm_neon_u8x8 + var p0_lo : base.arm_neon_u8x8 + var q0_lo : base.arm_neon_u8x8 + var q1_lo : base.arm_neon_u8x8 + var q2_lo : base.arm_neon_u8x8 + var q3_lo : base.arm_neon_u8x8 + + // u8x16 pixel columns for filter. + var p3 : base.arm_neon_u8x16 + var p2 : base.arm_neon_u8x16 + var p1 : base.arm_neon_u8x16 + var p0 : base.arm_neon_u8x16 + var q0 : base.arm_neon_u8x16 + var q1 : base.arm_neon_u8x16 + var q2 : base.arm_neon_u8x16 + var q3 : base.arm_neon_u8x16 + + // Filter constants and temporaries. + var zero : base.arm_neon_u8x16 + var sign_bit : base.arm_neon_u8x16 + var kFE : base.arm_neon_u8x16 + var m_thresh : base.arm_neon_u8x16 + var m_ithresh : base.arm_neon_u8x16 + var m_hthresh : base.arm_neon_u8x16 + var k3 : base.arm_neon_u8x16 + var k4 : base.arm_neon_u8x16 + var mask : base.arm_neon_u8x16 + var not_hev : base.arm_neon_u8x16 + var delta : base.arm_neon_u8x16 + var v3 : base.arm_neon_u8x16 + var v4 : base.arm_neon_u8x16 + var a1 : base.arm_neon_u8x16 + var a2 : base.arm_neon_u8x16 + var a3 : base.arm_neon_u8x16 + var t1 : base.arm_neon_u8x16 + var t2 : base.arm_neon_u8x16 + var t3 : base.arm_neon_u8x16 + var p0_adj : base.arm_neon_u8x16 + var q0_adj : base.arm_neon_u8x16 + + // Widening variables for filter6 multiply (i8 -> i16). + var d_lo : base.arm_neon_u8x8 + var d_hi : base.arm_neon_u8x8 + var lo : base.arm_neon_u16x8 + var hi : base.arm_neon_u16x8 + var k63_16 : base.arm_neon_u16x8 + var tmp_lo : base.arm_neon_u16x8 + var tmp_hi : base.arm_neon_u16x8 + var narrow_lo : base.arm_neon_u8x8 + var narrow_hi : base.arm_neon_u8x8 + + // Need 4 bytes before q0 for p3..p0 columns. + if args.q0_off < 4 { + return nothing + } + + // wb is the cursor used by the load phase only; args.workbuf itself is not re-sliced until the store phase. + wb = args.workbuf + if (args.q0_off - 4) > wb.length() { + return nothing + } + wb = wb[(args.q0_off - 4) ..] + + // ==== Load rows 0-7 of 8 bytes each ==== + // Each step loads 8 bytes and then advances wb by y_stride; any failed length check returns before a store. + if 8 > wb.length() { + return nothing + } + r0 = util.make_u8x8_slice64(a: wb[.. 
8]) + if (this.y_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.y_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r1 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.y_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.y_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r2 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.y_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.y_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r3 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.y_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.y_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r4 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.y_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.y_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r5 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.y_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.y_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r6 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.y_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.y_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r7 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.y_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.y_stride as base.u64) ..] 
+ + // ==== Forward 8x8 transpose: rows 0-7 → lo column halves ==== + // Three interleave steps at 8-, 16- and 32-bit granularity turn 8 rows of 8 bytes into 8 columns of 8 bytes. + s0 = r0.vtrn1_u8(b: r1) + s1 = r0.vtrn2_u8(b: r1) + s2 = r2.vtrn1_u8(b: r3) + s3 = r2.vtrn2_u8(b: r3) + s4 = r4.vtrn1_u8(b: r5) + s5 = r4.vtrn2_u8(b: r5) + s6 = r6.vtrn1_u8(b: r7) + s7 = r6.vtrn2_u8(b: r7) + + r0 = s0.as_u16x4().vtrn1_u16(b: s2.as_u16x4()).as_u8x8() + r2 = s0.as_u16x4().vtrn2_u16(b: s2.as_u16x4()).as_u8x8() + r1 = s1.as_u16x4().vtrn1_u16(b: s3.as_u16x4()).as_u8x8() + r3 = s1.as_u16x4().vtrn2_u16(b: s3.as_u16x4()).as_u8x8() + r4 = s4.as_u16x4().vtrn1_u16(b: s6.as_u16x4()).as_u8x8() + r6 = s4.as_u16x4().vtrn2_u16(b: s6.as_u16x4()).as_u8x8() + r5 = s5.as_u16x4().vtrn1_u16(b: s7.as_u16x4()).as_u8x8() + r7 = s5.as_u16x4().vtrn2_u16(b: s7.as_u16x4()).as_u8x8() + + // Last (32-bit) step: column k collects byte k of rows 0-7; k = 0..3 is p3..p0 and k = 4..7 is q0..q3. + p3_lo = r0.as_u32x2().vtrn1_u32(b: r4.as_u32x2()).as_u8x8() + q0_lo = r0.as_u32x2().vtrn2_u32(b: r4.as_u32x2()).as_u8x8() + p2_lo = r1.as_u32x2().vtrn1_u32(b: r5.as_u32x2()).as_u8x8() + q1_lo = r1.as_u32x2().vtrn2_u32(b: r5.as_u32x2()).as_u8x8() + p1_lo = r2.as_u32x2().vtrn1_u32(b: r6.as_u32x2()).as_u8x8() + q2_lo = r2.as_u32x2().vtrn2_u32(b: r6.as_u32x2()).as_u8x8() + p0_lo = r3.as_u32x2().vtrn1_u32(b: r7.as_u32x2()).as_u8x8() + q3_lo = r3.as_u32x2().vtrn2_u32(b: r7.as_u32x2()).as_u8x8() + + // ==== Load rows 8-15 of 8 bytes each ==== + // r0..r7 are reused for the second group of rows; wb already points at row 8. + if 8 > wb.length() { + return nothing + } + r0 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.y_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.y_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r1 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.y_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.y_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r2 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.y_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.y_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r3 = util.make_u8x8_slice64(a: wb[.. 
8]) + if (this.y_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.y_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r4 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.y_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.y_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r5 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.y_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.y_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r6 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.y_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.y_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r7 = util.make_u8x8_slice64(a: wb[.. 8]) + + // ==== Forward 8x8 transpose: rows 8-15 → hi halves, then vcombine ==== + s0 = r0.vtrn1_u8(b: r1) + s1 = r0.vtrn2_u8(b: r1) + s2 = r2.vtrn1_u8(b: r3) + s3 = r2.vtrn2_u8(b: r3) + s4 = r4.vtrn1_u8(b: r5) + s5 = r4.vtrn2_u8(b: r5) + s6 = r6.vtrn1_u8(b: r7) + s7 = r6.vtrn2_u8(b: r7) + + r0 = s0.as_u16x4().vtrn1_u16(b: s2.as_u16x4()).as_u8x8() + r2 = s0.as_u16x4().vtrn2_u16(b: s2.as_u16x4()).as_u8x8() + r1 = s1.as_u16x4().vtrn1_u16(b: s3.as_u16x4()).as_u8x8() + r3 = s1.as_u16x4().vtrn2_u16(b: s3.as_u16x4()).as_u8x8() + r4 = s4.as_u16x4().vtrn1_u16(b: s6.as_u16x4()).as_u8x8() + r6 = s4.as_u16x4().vtrn2_u16(b: s6.as_u16x4()).as_u8x8() + r5 = s5.as_u16x4().vtrn1_u16(b: s7.as_u16x4()).as_u8x8() + r7 = s5.as_u16x4().vtrn2_u16(b: s7.as_u16x4()).as_u8x8() + + // Step 3 + vcombine: hi halves combined with lo halves → u8x16 columns. 
+ s0 = r0.as_u32x2().vtrn1_u32(b: r4.as_u32x2()).as_u8x8() + p3 = p3_lo.vcombine_u8(b: s0) + s0 = r0.as_u32x2().vtrn2_u32(b: r4.as_u32x2()).as_u8x8() + q0 = q0_lo.vcombine_u8(b: s0) + + s1 = r1.as_u32x2().vtrn1_u32(b: r5.as_u32x2()).as_u8x8() + p2 = p2_lo.vcombine_u8(b: s1) + s1 = r1.as_u32x2().vtrn2_u32(b: r5.as_u32x2()).as_u8x8() + q1 = q1_lo.vcombine_u8(b: s1) + + s2 = r2.as_u32x2().vtrn1_u32(b: r6.as_u32x2()).as_u8x8() + p1 = p1_lo.vcombine_u8(b: s2) + s2 = r2.as_u32x2().vtrn2_u32(b: r6.as_u32x2()).as_u8x8() + q2 = q2_lo.vcombine_u8(b: s2) + + s3 = r3.as_u32x2().vtrn1_u32(b: r7.as_u32x2()).as_u8x8() + p0 = p0_lo.vcombine_u8(b: s3) + s3 = r3.as_u32x2().vtrn2_u32(b: r7.as_u32x2()).as_u8x8() + q3 = q3_lo.vcombine_u8(b: s3) + + // ==== Filter6 computation (same as normal_vfilter_mb_16_arm_neon) ==== + + // Constants. + // The three thresholds are truncated to their low 8 bits and broadcast to all 16 lanes. + zero = util.make_u8x16_repeat(a: 0) + sign_bit = util.make_u8x16_repeat(a: 0x80) + kFE = util.make_u8x16_repeat(a: 0xFE) + m_thresh = util.make_u8x16_repeat(a: (args.level & 0xFF) as base.u8) + m_ithresh = util.make_u8x16_repeat(a: (args.ilevel & 0xFF) as base.u8) + m_hthresh = util.make_u8x16_repeat(a: (args.hlevel & 0xFF) as base.u8) + k3 = util.make_u8x16_repeat(a: 3) + k4 = util.make_u8x16_repeat(a: 4) + k63_16 = util.make_u16x8_repeat(a: 63) + + // ---- NeedsFilter + NeedsFilter2 ---- + // mask lane = all-ones when the saturating sum 2 * |p0 - q0| + (|p1 - q1| >> 1) is <= level and every adjacent-pixel difference tested below is <= ilevel. + // "x.vqsubq_u8(thresh) == 0" is the unsigned test x <= thresh. + t1 = p1.vabdq_u8(b: q1) + t2 = t1.vandq_u8(b: kFE).vshrq_n_u8(b: 1) + t3 = p0.vabdq_u8(b: q0) + t3 = t3.vqaddq_u8(b: t3) + t3 = t3.vqaddq_u8(b: t2) + mask = t3.vqsubq_u8(b: m_thresh).vceqq_u8(b: zero) + + t1 = p3.vabdq_u8(b: p2) + mask = mask.vandq_u8(b: t1.vqsubq_u8(b: m_ithresh).vceqq_u8(b: zero)) + t1 = p2.vabdq_u8(b: p1) + mask = mask.vandq_u8(b: t1.vqsubq_u8(b: m_ithresh).vceqq_u8(b: zero)) + t1 = p1.vabdq_u8(b: p0) + mask = mask.vandq_u8(b: t1.vqsubq_u8(b: m_ithresh).vceqq_u8(b: zero)) + t1 = q0.vabdq_u8(b: q1) + mask = mask.vandq_u8(b: t1.vqsubq_u8(b: m_ithresh).vceqq_u8(b: zero)) + t1 = q1.vabdq_u8(b: q2) + mask = mask.vandq_u8(b: t1.vqsubq_u8(b: 
m_ithresh).vceqq_u8(b: zero)) + t1 = q2.vabdq_u8(b: q3) + mask = mask.vandq_u8(b: t1.vqsubq_u8(b: m_ithresh).vceqq_u8(b: zero)) + + // ---- GetNotHEV ---- + // not_hev lane = all-ones when both |p1 - p0| and |q1 - q0| are <= hlevel. + t1 = p1.vabdq_u8(b: p0) + t2 = q1.vabdq_u8(b: q0) + t3 = t1.vqsubq_u8(b: m_hthresh).vorrq_u8(b: t2.vqsubq_u8(b: m_hthresh)) + not_hev = t3.vceqq_u8(b: zero) + + // ---- Convert p2, p1, p0, q0, q1, q2 to signed ---- + // XOR with 0x80 re-biases unsigned 0..255 to signed -128..127 for the saturating s8 ops; p3 and q3 only feed the mask and are left as loaded. + p2 = p2.veorq_u8(b: sign_bit) + p1 = p1.veorq_u8(b: sign_bit) + p0 = p0.veorq_u8(b: sign_bit) + q0 = q0.veorq_u8(b: sign_bit) + q1 = q1.veorq_u8(b: sign_bit) + q2 = q2.veorq_u8(b: sign_bit) + + // ---- Delta = sat_i8(3*(q0-p0) + sat_i8(p1-q1)) ---- + t1 = p1.vqsubq_s8(b: q1) + t2 = q0.vqsubq_s8(b: p0) + t1 = t1.vqaddq_s8(b: t2) + t1 = t1.vqaddq_s8(b: t2) + delta = t1.vqaddq_s8(b: t2) + // Lanes with mask clear get delta == 0, which makes every adjustment computed below 0. + delta = delta.vandq_u8(b: mask) + + // ---- Filter2 (HEV path): v4 = asr3(delta+4), v3 = asr3(delta+3) ---- + v4 = delta.vqaddq_s8(b: k4) + v4 = v4.vshrq_n_s8(b: 3) + + v3 = delta.vqaddq_s8(b: k3) + v3 = v3.vshrq_n_s8(b: 3) + + // ---- Filter6 (!HEV path): widen delta to i16, multiply by 27/18/9 ---- + // 16-bit lanes are needed because 27 * delta + 63 does not fit in 8 bits; each result is narrowed back with saturation. + d_lo = delta.vget_low_u8() + d_hi = delta.vget_high_u8() + lo = d_lo.vmovl_s8() + hi = d_hi.vmovl_s8() + + // a1 = (27 * delta + 63) >> 7 + tmp_lo = lo.vmulq_n_u16(b: 27) + tmp_lo = tmp_lo.vaddq_u16(b: k63_16) + tmp_lo = tmp_lo.vshrq_n_s16(b: 7) + tmp_hi = hi.vmulq_n_u16(b: 27) + tmp_hi = tmp_hi.vaddq_u16(b: k63_16) + tmp_hi = tmp_hi.vshrq_n_s16(b: 7) + narrow_lo = tmp_lo.vqmovn_s16() + narrow_hi = tmp_hi.vqmovn_s16() + a1 = narrow_lo.vcombine_u8(b: narrow_hi) + + // a2 = (18 * delta + 63) >> 7 + tmp_lo = lo.vmulq_n_u16(b: 18) + tmp_lo = tmp_lo.vaddq_u16(b: k63_16) + tmp_lo = tmp_lo.vshrq_n_s16(b: 7) + tmp_hi = hi.vmulq_n_u16(b: 18) + tmp_hi = tmp_hi.vaddq_u16(b: k63_16) + tmp_hi = tmp_hi.vshrq_n_s16(b: 7) + narrow_lo = tmp_lo.vqmovn_s16() + narrow_hi = tmp_hi.vqmovn_s16() + a2 = narrow_lo.vcombine_u8(b: narrow_hi) + + // a3 = (9 * delta + 63) >> 7 + tmp_lo = lo.vmulq_n_u16(b: 9) + tmp_lo = tmp_lo.vaddq_u16(b: k63_16) 
+ tmp_lo = tmp_lo.vshrq_n_s16(b: 7) + tmp_hi = hi.vmulq_n_u16(b: 9) + tmp_hi = tmp_hi.vaddq_u16(b: k63_16) + tmp_hi = tmp_hi.vshrq_n_s16(b: 7) + narrow_lo = tmp_lo.vqmovn_s16() + narrow_hi = tmp_hi.vqmovn_s16() + a3 = narrow_lo.vcombine_u8(b: narrow_hi) + + // ---- Merge HEV (filter2) and !HEV (filter6) results ---- + // vbicq keeps the filter2 value in HEV lanes (not_hev clear); vandq keeps the filter6 value in !HEV lanes (not_hev set). + p0_adj = v3.vbicq_u8(b: not_hev) + p0_adj = p0_adj.vorrq_u8(b: a1.vandq_u8(b: not_hev)) + p0 = p0.vqaddq_s8(b: p0_adj) + + q0_adj = v4.vbicq_u8(b: not_hev) + q0_adj = q0_adj.vorrq_u8(b: a1.vandq_u8(b: not_hev)) + q0 = q0.vqsubq_s8(b: q0_adj) + + // p1/q1 and p2/q2 are adjusted only in !HEV lanes; a2 and a3 are zeroed in HEV lanes. + p1 = p1.vqaddq_s8(b: a2.vandq_u8(b: not_hev)) + q1 = q1.vqsubq_s8(b: a2.vandq_u8(b: not_hev)) + p2 = p2.vqaddq_s8(b: a3.vandq_u8(b: not_hev)) + q2 = q2.vqsubq_s8(b: a3.vandq_u8(b: not_hev)) + + // ---- Convert back to unsigned ---- + p2 = p2.veorq_u8(b: sign_bit) + p1 = p1.veorq_u8(b: sign_bit) + p0 = p0.veorq_u8(b: sign_bit) + q0 = q0.veorq_u8(b: sign_bit) + q1 = q1.veorq_u8(b: sign_bit) + q2 = q2.veorq_u8(b: sign_bit) + + // ==== Split columns back to lo halves, inverse transpose → rows 0-7 ==== + // p3 and q3 were never modified, so their bytes are written back unchanged. + p3_lo = p3.vget_low_u8() + p2_lo = p2.vget_low_u8() + p1_lo = p1.vget_low_u8() + p0_lo = p0.vget_low_u8() + q0_lo = q0.vget_low_u8() + q1_lo = q1.vget_low_u8() + q2_lo = q2.vget_low_u8() + q3_lo = q3.vget_low_u8() + + s0 = p3_lo.vtrn1_u8(b: p2_lo) + s1 = p3_lo.vtrn2_u8(b: p2_lo) + s2 = p1_lo.vtrn1_u8(b: p0_lo) + s3 = p1_lo.vtrn2_u8(b: p0_lo) + s4 = q0_lo.vtrn1_u8(b: q1_lo) + s5 = q0_lo.vtrn2_u8(b: q1_lo) + s6 = q2_lo.vtrn1_u8(b: q3_lo) + s7 = q2_lo.vtrn2_u8(b: q3_lo) + + r0 = s0.as_u16x4().vtrn1_u16(b: s2.as_u16x4()).as_u8x8() + r2 = s0.as_u16x4().vtrn2_u16(b: s2.as_u16x4()).as_u8x8() + r1 = s1.as_u16x4().vtrn1_u16(b: s3.as_u16x4()).as_u8x8() + r3 = s1.as_u16x4().vtrn2_u16(b: s3.as_u16x4()).as_u8x8() + r4 = s4.as_u16x4().vtrn1_u16(b: s6.as_u16x4()).as_u8x8() + r6 = s4.as_u16x4().vtrn2_u16(b: s6.as_u16x4()).as_u8x8() + r5 = s5.as_u16x4().vtrn1_u16(b: s7.as_u16x4()).as_u8x8() + r7 = s5.as_u16x4().vtrn2_u16(b: 
s7.as_u16x4()).as_u8x8() + + s0 = r0.as_u32x2().vtrn1_u32(b: r4.as_u32x2()).as_u8x8() + s4 = r0.as_u32x2().vtrn2_u32(b: r4.as_u32x2()).as_u8x8() + s1 = r1.as_u32x2().vtrn1_u32(b: r5.as_u32x2()).as_u8x8() + s5 = r1.as_u32x2().vtrn2_u32(b: r5.as_u32x2()).as_u8x8() + s2 = r2.as_u32x2().vtrn1_u32(b: r6.as_u32x2()).as_u8x8() + s6 = r2.as_u32x2().vtrn2_u32(b: r6.as_u32x2()).as_u8x8() + s3 = r3.as_u32x2().vtrn1_u32(b: r7.as_u32x2()).as_u8x8() + s7 = r3.as_u32x2().vtrn2_u32(b: r7.as_u32x2()).as_u8x8() + + // ==== Store rows 0-7 ==== + // From here on args.workbuf is the write cursor, re-sliced exactly as wb was during loading. + // The same offsets already passed the load-phase checks, so the guards below are expected to hold; presumably they exist to satisfy the bounds checker - confirm. + if (args.q0_off - 4) > args.workbuf.length() { + return nothing + } + args.workbuf = args.workbuf[(args.q0_off - 4) ..] + + if 8 <= args.workbuf.length() { + s0.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s1.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s2.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s3.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s4.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s5.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s6.store_slice64!(a: args.workbuf[.. 
8]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s7.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + + // ==== Extract hi halves, inverse transpose → rows 8-15 ==== + // r0..r7 are reused to hold the high (rows 8-15) halves of the p3..q3 columns. + r0 = p3.vget_high_u8() + r1 = p2.vget_high_u8() + r2 = p1.vget_high_u8() + r3 = p0.vget_high_u8() + r4 = q0.vget_high_u8() + r5 = q1.vget_high_u8() + r6 = q2.vget_high_u8() + r7 = q3.vget_high_u8() + + s0 = r0.vtrn1_u8(b: r1) + s1 = r0.vtrn2_u8(b: r1) + s2 = r2.vtrn1_u8(b: r3) + s3 = r2.vtrn2_u8(b: r3) + s4 = r4.vtrn1_u8(b: r5) + s5 = r4.vtrn2_u8(b: r5) + s6 = r6.vtrn1_u8(b: r7) + s7 = r6.vtrn2_u8(b: r7) + + r0 = s0.as_u16x4().vtrn1_u16(b: s2.as_u16x4()).as_u8x8() + r2 = s0.as_u16x4().vtrn2_u16(b: s2.as_u16x4()).as_u8x8() + r1 = s1.as_u16x4().vtrn1_u16(b: s3.as_u16x4()).as_u8x8() + r3 = s1.as_u16x4().vtrn2_u16(b: s3.as_u16x4()).as_u8x8() + r4 = s4.as_u16x4().vtrn1_u16(b: s6.as_u16x4()).as_u8x8() + r6 = s4.as_u16x4().vtrn2_u16(b: s6.as_u16x4()).as_u8x8() + r5 = s5.as_u16x4().vtrn1_u16(b: s7.as_u16x4()).as_u8x8() + r7 = s5.as_u16x4().vtrn2_u16(b: s7.as_u16x4()).as_u8x8() + + s0 = r0.as_u32x2().vtrn1_u32(b: r4.as_u32x2()).as_u8x8() + s4 = r0.as_u32x2().vtrn2_u32(b: r4.as_u32x2()).as_u8x8() + s1 = r1.as_u32x2().vtrn1_u32(b: r5.as_u32x2()).as_u8x8() + s5 = r1.as_u32x2().vtrn2_u32(b: r5.as_u32x2()).as_u8x8() + s2 = r2.as_u32x2().vtrn1_u32(b: r6.as_u32x2()).as_u8x8() + s6 = r2.as_u32x2().vtrn2_u32(b: r6.as_u32x2()).as_u8x8() + s3 = r3.as_u32x2().vtrn1_u32(b: r7.as_u32x2()).as_u8x8() + s7 = r3.as_u32x2().vtrn2_u32(b: r7.as_u32x2()).as_u8x8() + + // ==== Store rows 8-15 ==== + // args.workbuf already points at row 8 after the stride step that followed the row 7 store. + if 8 <= args.workbuf.length() { + s0.store_slice64!(a: args.workbuf[.. 
8]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s1.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s2.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s3.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s4.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s5.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s6.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s7.store_slice64!(a: args.workbuf[.. 8]) + } +} + +// VP8 normal loop filter (filter4) for 16-row Y horizontal inner edges, ARM NEON. +// +// Same two-8x8-transpose approach as hfilter_mb_16 but uses filter4. 
+ +pri func decoder.normal_hfilter_inner_16_arm_neon!(workbuf: slice base.u8, q0_off: base.u64, + level: base.u32, ilevel: base.u32, hlevel: base.u32), + choose cpu_arch >= arm_neon, +{ + var util : base.arm_neon_utility + var wb : slice base.u8 + + var r0 : base.arm_neon_u8x8 + var r1 : base.arm_neon_u8x8 + var r2 : base.arm_neon_u8x8 + var r3 : base.arm_neon_u8x8 + var r4 : base.arm_neon_u8x8 + var r5 : base.arm_neon_u8x8 + var r6 : base.arm_neon_u8x8 + var r7 : base.arm_neon_u8x8 + var s0 : base.arm_neon_u8x8 + var s1 : base.arm_neon_u8x8 + var s2 : base.arm_neon_u8x8 + var s3 : base.arm_neon_u8x8 + var s4 : base.arm_neon_u8x8 + var s5 : base.arm_neon_u8x8 + var s6 : base.arm_neon_u8x8 + var s7 : base.arm_neon_u8x8 + + var p3_lo : base.arm_neon_u8x8 + var p2_lo : base.arm_neon_u8x8 + var p1_lo : base.arm_neon_u8x8 + var p0_lo : base.arm_neon_u8x8 + var q0_lo : base.arm_neon_u8x8 + var q1_lo : base.arm_neon_u8x8 + var q2_lo : base.arm_neon_u8x8 + var q3_lo : base.arm_neon_u8x8 + + var p3 : base.arm_neon_u8x16 + var p2 : base.arm_neon_u8x16 + var p1 : base.arm_neon_u8x16 + var p0 : base.arm_neon_u8x16 + var q0 : base.arm_neon_u8x16 + var q1 : base.arm_neon_u8x16 + var q2 : base.arm_neon_u8x16 + var q3 : base.arm_neon_u8x16 + + var zero : base.arm_neon_u8x16 + var sign_bit : base.arm_neon_u8x16 + var kFE : base.arm_neon_u8x16 + var m_thresh : base.arm_neon_u8x16 + var m_ithresh : base.arm_neon_u8x16 + var m_hthresh : base.arm_neon_u8x16 + var k1 : base.arm_neon_u8x16 + var k3 : base.arm_neon_u8x16 + var k4 : base.arm_neon_u8x16 + var mask : base.arm_neon_u8x16 + var not_hev : base.arm_neon_u8x16 + var delta : base.arm_neon_u8x16 + var v3 : base.arm_neon_u8x16 + var v4 : base.arm_neon_u8x16 + var a3 : base.arm_neon_u8x16 + var t1 : base.arm_neon_u8x16 + var t2 : base.arm_neon_u8x16 + var t3 : base.arm_neon_u8x16 + + if args.q0_off < 4 { + return nothing + } + + wb = args.workbuf + if (args.q0_off - 4) > wb.length() { + return nothing + } + wb = wb[(args.q0_off - 4) 
..] + + // ==== Load rows 0-7 ==== + if 8 > wb.length() { + return nothing + } + r0 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.y_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.y_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r1 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.y_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.y_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r2 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.y_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.y_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r3 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.y_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.y_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r4 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.y_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.y_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r5 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.y_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.y_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r6 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.y_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.y_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r7 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.y_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.y_stride as base.u64) ..] 
+ + // ==== Forward 8x8 transpose: rows 0-7 → lo column halves ==== + s0 = r0.vtrn1_u8(b: r1) + s1 = r0.vtrn2_u8(b: r1) + s2 = r2.vtrn1_u8(b: r3) + s3 = r2.vtrn2_u8(b: r3) + s4 = r4.vtrn1_u8(b: r5) + s5 = r4.vtrn2_u8(b: r5) + s6 = r6.vtrn1_u8(b: r7) + s7 = r6.vtrn2_u8(b: r7) + + r0 = s0.as_u16x4().vtrn1_u16(b: s2.as_u16x4()).as_u8x8() + r2 = s0.as_u16x4().vtrn2_u16(b: s2.as_u16x4()).as_u8x8() + r1 = s1.as_u16x4().vtrn1_u16(b: s3.as_u16x4()).as_u8x8() + r3 = s1.as_u16x4().vtrn2_u16(b: s3.as_u16x4()).as_u8x8() + r4 = s4.as_u16x4().vtrn1_u16(b: s6.as_u16x4()).as_u8x8() + r6 = s4.as_u16x4().vtrn2_u16(b: s6.as_u16x4()).as_u8x8() + r5 = s5.as_u16x4().vtrn1_u16(b: s7.as_u16x4()).as_u8x8() + r7 = s5.as_u16x4().vtrn2_u16(b: s7.as_u16x4()).as_u8x8() + + p3_lo = r0.as_u32x2().vtrn1_u32(b: r4.as_u32x2()).as_u8x8() + q0_lo = r0.as_u32x2().vtrn2_u32(b: r4.as_u32x2()).as_u8x8() + p2_lo = r1.as_u32x2().vtrn1_u32(b: r5.as_u32x2()).as_u8x8() + q1_lo = r1.as_u32x2().vtrn2_u32(b: r5.as_u32x2()).as_u8x8() + p1_lo = r2.as_u32x2().vtrn1_u32(b: r6.as_u32x2()).as_u8x8() + q2_lo = r2.as_u32x2().vtrn2_u32(b: r6.as_u32x2()).as_u8x8() + p0_lo = r3.as_u32x2().vtrn1_u32(b: r7.as_u32x2()).as_u8x8() + q3_lo = r3.as_u32x2().vtrn2_u32(b: r7.as_u32x2()).as_u8x8() + + // ==== Load rows 8-15 ==== + if 8 > wb.length() { + return nothing + } + r0 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.y_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.y_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r1 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.y_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.y_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r2 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.y_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.y_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r3 = util.make_u8x8_slice64(a: wb[.. 
8]) + if (this.y_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.y_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r4 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.y_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.y_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r5 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.y_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.y_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r6 = util.make_u8x8_slice64(a: wb[.. 8]) + if (this.y_stride as base.u64) > wb.length() { + return nothing + } + wb = wb[(this.y_stride as base.u64) ..] + + if 8 > wb.length() { + return nothing + } + r7 = util.make_u8x8_slice64(a: wb[.. 8]) + + // ==== Forward 8x8 transpose: rows 8-15 → hi halves, then vcombine ==== + s0 = r0.vtrn1_u8(b: r1) + s1 = r0.vtrn2_u8(b: r1) + s2 = r2.vtrn1_u8(b: r3) + s3 = r2.vtrn2_u8(b: r3) + s4 = r4.vtrn1_u8(b: r5) + s5 = r4.vtrn2_u8(b: r5) + s6 = r6.vtrn1_u8(b: r7) + s7 = r6.vtrn2_u8(b: r7) + + r0 = s0.as_u16x4().vtrn1_u16(b: s2.as_u16x4()).as_u8x8() + r2 = s0.as_u16x4().vtrn2_u16(b: s2.as_u16x4()).as_u8x8() + r1 = s1.as_u16x4().vtrn1_u16(b: s3.as_u16x4()).as_u8x8() + r3 = s1.as_u16x4().vtrn2_u16(b: s3.as_u16x4()).as_u8x8() + r4 = s4.as_u16x4().vtrn1_u16(b: s6.as_u16x4()).as_u8x8() + r6 = s4.as_u16x4().vtrn2_u16(b: s6.as_u16x4()).as_u8x8() + r5 = s5.as_u16x4().vtrn1_u16(b: s7.as_u16x4()).as_u8x8() + r7 = s5.as_u16x4().vtrn2_u16(b: s7.as_u16x4()).as_u8x8() + + s0 = r0.as_u32x2().vtrn1_u32(b: r4.as_u32x2()).as_u8x8() + p3 = p3_lo.vcombine_u8(b: s0) + s0 = r0.as_u32x2().vtrn2_u32(b: r4.as_u32x2()).as_u8x8() + q0 = q0_lo.vcombine_u8(b: s0) + + s1 = r1.as_u32x2().vtrn1_u32(b: r5.as_u32x2()).as_u8x8() + p2 = p2_lo.vcombine_u8(b: s1) + s1 = r1.as_u32x2().vtrn2_u32(b: r5.as_u32x2()).as_u8x8() + q1 = q1_lo.vcombine_u8(b: s1) + + s2 = r2.as_u32x2().vtrn1_u32(b: r6.as_u32x2()).as_u8x8() + 
p1 = p1_lo.vcombine_u8(b: s2) + s2 = r2.as_u32x2().vtrn2_u32(b: r6.as_u32x2()).as_u8x8() + q2 = q2_lo.vcombine_u8(b: s2) + + s3 = r3.as_u32x2().vtrn1_u32(b: r7.as_u32x2()).as_u8x8() + p0 = p0_lo.vcombine_u8(b: s3) + s3 = r3.as_u32x2().vtrn2_u32(b: r7.as_u32x2()).as_u8x8() + q3 = q3_lo.vcombine_u8(b: s3) + + // ==== Filter4 computation (same as normal_vfilter_inner_16_arm_neon) ==== + + zero = util.make_u8x16_repeat(a: 0) + sign_bit = util.make_u8x16_repeat(a: 0x80) + kFE = util.make_u8x16_repeat(a: 0xFE) + m_thresh = util.make_u8x16_repeat(a: (args.level & 0xFF) as base.u8) + m_ithresh = util.make_u8x16_repeat(a: (args.ilevel & 0xFF) as base.u8) + m_hthresh = util.make_u8x16_repeat(a: (args.hlevel & 0xFF) as base.u8) + k1 = util.make_u8x16_repeat(a: 1) + k3 = util.make_u8x16_repeat(a: 3) + k4 = util.make_u8x16_repeat(a: 4) + + // ---- NeedsFilter + NeedsFilter2 ---- + t1 = p1.vabdq_u8(b: q1) + t2 = t1.vandq_u8(b: kFE).vshrq_n_u8(b: 1) + t3 = p0.vabdq_u8(b: q0) + t3 = t3.vqaddq_u8(b: t3) + t3 = t3.vqaddq_u8(b: t2) + mask = t3.vqsubq_u8(b: m_thresh).vceqq_u8(b: zero) + + t1 = p3.vabdq_u8(b: p2) + mask = mask.vandq_u8(b: t1.vqsubq_u8(b: m_ithresh).vceqq_u8(b: zero)) + t1 = p2.vabdq_u8(b: p1) + mask = mask.vandq_u8(b: t1.vqsubq_u8(b: m_ithresh).vceqq_u8(b: zero)) + t1 = p1.vabdq_u8(b: p0) + mask = mask.vandq_u8(b: t1.vqsubq_u8(b: m_ithresh).vceqq_u8(b: zero)) + t1 = q0.vabdq_u8(b: q1) + mask = mask.vandq_u8(b: t1.vqsubq_u8(b: m_ithresh).vceqq_u8(b: zero)) + t1 = q1.vabdq_u8(b: q2) + mask = mask.vandq_u8(b: t1.vqsubq_u8(b: m_ithresh).vceqq_u8(b: zero)) + t1 = q2.vabdq_u8(b: q3) + mask = mask.vandq_u8(b: t1.vqsubq_u8(b: m_ithresh).vceqq_u8(b: zero)) + + // ---- GetNotHEV ---- + t1 = p1.vabdq_u8(b: p0) + t2 = q1.vabdq_u8(b: q0) + t3 = t1.vqsubq_u8(b: m_hthresh).vorrq_u8(b: t2.vqsubq_u8(b: m_hthresh)) + not_hev = t3.vceqq_u8(b: zero) + + // ---- Convert to signed ---- + p1 = p1.veorq_u8(b: sign_bit) + p0 = p0.veorq_u8(b: sign_bit) + q0 = q0.veorq_u8(b: sign_bit) + q1 = 
q1.veorq_u8(b: sign_bit) + + // ---- Delta (filter4: !HEV zeroes p1-q1 term) ---- + t1 = p1.vqsubq_s8(b: q1) + t1 = t1.vbicq_u8(b: not_hev) + t2 = q0.vqsubq_s8(b: p0) + t1 = t1.vqaddq_s8(b: t2) + t1 = t1.vqaddq_s8(b: t2) + delta = t1.vqaddq_s8(b: t2) + delta = delta.vandq_u8(b: mask) + + // ---- v4, v3 ---- + v4 = delta.vqaddq_s8(b: k4) + v4 = v4.vshrq_n_s8(b: 3) + v3 = delta.vqaddq_s8(b: k3) + v3 = v3.vshrq_n_s8(b: 3) + + q0 = q0.vqsubq_s8(b: v4) + p0 = p0.vqaddq_s8(b: v3) + + // ---- Filter4 !HEV: a3 ---- + a3 = v4.vqaddq_s8(b: k1) + a3 = a3.vshrq_n_s8(b: 1) + a3 = a3.vandq_u8(b: not_hev) + + q1 = q1.vqsubq_s8(b: a3) + p1 = p1.vqaddq_s8(b: a3) + + // ---- Convert back to unsigned ---- + p1 = p1.veorq_u8(b: sign_bit) + p0 = p0.veorq_u8(b: sign_bit) + q0 = q0.veorq_u8(b: sign_bit) + q1 = q1.veorq_u8(b: sign_bit) + + // ==== Split columns back to lo halves, inverse transpose → rows 0-7 ==== + p3_lo = p3.vget_low_u8() + p2_lo = p2.vget_low_u8() + p1_lo = p1.vget_low_u8() + p0_lo = p0.vget_low_u8() + q0_lo = q0.vget_low_u8() + q1_lo = q1.vget_low_u8() + q2_lo = q2.vget_low_u8() + q3_lo = q3.vget_low_u8() + + s0 = p3_lo.vtrn1_u8(b: p2_lo) + s1 = p3_lo.vtrn2_u8(b: p2_lo) + s2 = p1_lo.vtrn1_u8(b: p0_lo) + s3 = p1_lo.vtrn2_u8(b: p0_lo) + s4 = q0_lo.vtrn1_u8(b: q1_lo) + s5 = q0_lo.vtrn2_u8(b: q1_lo) + s6 = q2_lo.vtrn1_u8(b: q3_lo) + s7 = q2_lo.vtrn2_u8(b: q3_lo) + + r0 = s0.as_u16x4().vtrn1_u16(b: s2.as_u16x4()).as_u8x8() + r2 = s0.as_u16x4().vtrn2_u16(b: s2.as_u16x4()).as_u8x8() + r1 = s1.as_u16x4().vtrn1_u16(b: s3.as_u16x4()).as_u8x8() + r3 = s1.as_u16x4().vtrn2_u16(b: s3.as_u16x4()).as_u8x8() + r4 = s4.as_u16x4().vtrn1_u16(b: s6.as_u16x4()).as_u8x8() + r6 = s4.as_u16x4().vtrn2_u16(b: s6.as_u16x4()).as_u8x8() + r5 = s5.as_u16x4().vtrn1_u16(b: s7.as_u16x4()).as_u8x8() + r7 = s5.as_u16x4().vtrn2_u16(b: s7.as_u16x4()).as_u8x8() + + s0 = r0.as_u32x2().vtrn1_u32(b: r4.as_u32x2()).as_u8x8() + s4 = r0.as_u32x2().vtrn2_u32(b: r4.as_u32x2()).as_u8x8() + s1 = 
r1.as_u32x2().vtrn1_u32(b: r5.as_u32x2()).as_u8x8() + s5 = r1.as_u32x2().vtrn2_u32(b: r5.as_u32x2()).as_u8x8() + s2 = r2.as_u32x2().vtrn1_u32(b: r6.as_u32x2()).as_u8x8() + s6 = r2.as_u32x2().vtrn2_u32(b: r6.as_u32x2()).as_u8x8() + s3 = r3.as_u32x2().vtrn1_u32(b: r7.as_u32x2()).as_u8x8() + s7 = r3.as_u32x2().vtrn2_u32(b: r7.as_u32x2()).as_u8x8() + + // ==== Store rows 0-7 ==== + if (args.q0_off - 4) > args.workbuf.length() { + return nothing + } + args.workbuf = args.workbuf[(args.q0_off - 4) ..] + + if 8 <= args.workbuf.length() { + s0.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s1.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s2.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s3.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s4.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s5.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s6.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] 
+ } + + if 8 <= args.workbuf.length() { + s7.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + + // ==== Extract hi halves, inverse transpose → rows 8-15 ==== + r0 = p3.vget_high_u8() + r1 = p2.vget_high_u8() + r2 = p1.vget_high_u8() + r3 = p0.vget_high_u8() + r4 = q0.vget_high_u8() + r5 = q1.vget_high_u8() + r6 = q2.vget_high_u8() + r7 = q3.vget_high_u8() + + s0 = r0.vtrn1_u8(b: r1) + s1 = r0.vtrn2_u8(b: r1) + s2 = r2.vtrn1_u8(b: r3) + s3 = r2.vtrn2_u8(b: r3) + s4 = r4.vtrn1_u8(b: r5) + s5 = r4.vtrn2_u8(b: r5) + s6 = r6.vtrn1_u8(b: r7) + s7 = r6.vtrn2_u8(b: r7) + + r0 = s0.as_u16x4().vtrn1_u16(b: s2.as_u16x4()).as_u8x8() + r2 = s0.as_u16x4().vtrn2_u16(b: s2.as_u16x4()).as_u8x8() + r1 = s1.as_u16x4().vtrn1_u16(b: s3.as_u16x4()).as_u8x8() + r3 = s1.as_u16x4().vtrn2_u16(b: s3.as_u16x4()).as_u8x8() + r4 = s4.as_u16x4().vtrn1_u16(b: s6.as_u16x4()).as_u8x8() + r6 = s4.as_u16x4().vtrn2_u16(b: s6.as_u16x4()).as_u8x8() + r5 = s5.as_u16x4().vtrn1_u16(b: s7.as_u16x4()).as_u8x8() + r7 = s5.as_u16x4().vtrn2_u16(b: s7.as_u16x4()).as_u8x8() + + s0 = r0.as_u32x2().vtrn1_u32(b: r4.as_u32x2()).as_u8x8() + s4 = r0.as_u32x2().vtrn2_u32(b: r4.as_u32x2()).as_u8x8() + s1 = r1.as_u32x2().vtrn1_u32(b: r5.as_u32x2()).as_u8x8() + s5 = r1.as_u32x2().vtrn2_u32(b: r5.as_u32x2()).as_u8x8() + s2 = r2.as_u32x2().vtrn1_u32(b: r6.as_u32x2()).as_u8x8() + s6 = r2.as_u32x2().vtrn2_u32(b: r6.as_u32x2()).as_u8x8() + s3 = r3.as_u32x2().vtrn1_u32(b: r7.as_u32x2()).as_u8x8() + s7 = r3.as_u32x2().vtrn2_u32(b: r7.as_u32x2()).as_u8x8() + + // ==== Store rows 8-15 ==== + if 8 <= args.workbuf.length() { + s0.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s1.store_slice64!(a: args.workbuf[.. 
8]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s2.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s3.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s4.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s5.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s6.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + + if 8 <= args.workbuf.length() { + s7.store_slice64!(a: args.workbuf[.. 8]) + } +} diff --git a/std/vp8/decode_filter_x86_avx2.wuffs b/std/vp8/decode_filter_x86_avx2.wuffs new file mode 100644 index 000000000..492c44c80 --- /dev/null +++ b/std/vp8/decode_filter_x86_avx2.wuffs @@ -0,0 +1,1675 @@ +// Copyright 2024 The Wuffs Authors. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. +// +// SPDX-License-Identifier: Apache-2.0 OR MIT + +// VP8 normal loop filter, AVX2 version. +// +// Combines U and V plane processing: lower 128-bit lane = U, upper = V. 
+// This halves function call overhead and compute instructions for chroma +// filtering at MB and inner sub-block edges. + +// ============================================================================ +// normal_vfilter_mb_uv: filter6 for horizontal MB boundary edges (U+V) +// ============================================================================ + +// normal_vfilter_mb_uv_x86_avx2 runs the filter6 form of the normal loop +// filter across one horizontal edge of the U and V planes in a single pass. +// +// For each plane it loads eight rows of 8 bytes (p3, p2, p1, p0, q0, q1, q2, +// q3), the first of them 4 * uv_stride bytes before the plane's offset, so +// args.u_off and args.v_off address the q0 rows. Each U row is placed in the +// lower 128-bit lane and the matching V row in the upper lane. Rows p2 to q2 +// are then written back to args.workbuf, 8 bytes per plane per row. +// +// Only the low 8 bits of args.level, args.ilevel and args.hlevel are used. +// +// It returns before storing anything if an offset is smaller than +// 4 * uv_stride or if any row load would run past the end of args.workbuf. +pri func decoder.normal_vfilter_mb_uv_x86_avx2!(workbuf: slice base.u8, + u_off: base.u64, v_off: base.u64, + level: base.u32, ilevel: base.u32, hlevel: base.u32), + choose cpu_arch >= x86_avx2, +{ + var sse_util : base.x86_sse42_utility + var util : base.x86_avx2_utility + var u_wb : slice base.u8 + var v_wb : slice base.u8 + var u_128 : base.x86_m128i + var v_128 : base.x86_m128i + + var p3 : base.x86_m256i + var p2 : base.x86_m256i + var p1 : base.x86_m256i + var p0 : base.x86_m256i + var q0 : base.x86_m256i + var q1 : base.x86_m256i + var q2 : base.x86_m256i + var q3 : base.x86_m256i + + var zero : base.x86_m256i + var sign_bit : base.x86_m256i + var kFE : base.x86_m256i + var m_thresh : base.x86_m256i + var m_ithresh : base.x86_m256i + var m_hthresh : base.x86_m256i + var k3 : base.x86_m256i + var k4 : base.x86_m256i + var k63 : base.x86_m256i + var k27 : base.x86_m256i + var k18 : base.x86_m256i + var k9 : base.x86_m256i + + var mask : base.x86_m256i + var not_hev : base.x86_m256i + var delta : base.x86_m256i + var v3 : base.x86_m256i + var v4 : base.x86_m256i + var a1 : base.x86_m256i + var a2 : base.x86_m256i + var a3 : base.x86_m256i + var t1 : base.x86_m256i + var t2 : base.x86_m256i + var t3 : base.x86_m256i + var lo : base.x86_m256i + var hi : base.x86_m256i + var d_lo : base.x86_m256i + var d_hi : base.x86_m256i + var p0_adj : base.x86_m256i + var q0_adj : base.x86_m256i + + // Bounds: need 4*uv_stride before each offset. 
+ if args.u_off < (4 * (this.uv_stride as base.u64)) { + return nothing + } + if args.v_off < (4 * (this.uv_stride as base.u64)) { + return nothing + } + + // Set up U slice at p3 position. + u_wb = args.workbuf + if (args.u_off - (4 * (this.uv_stride as base.u64))) <= u_wb.length() { + u_wb = u_wb[args.u_off - (4 * (this.uv_stride as base.u64)) ..] + } else { + return nothing + } + + // Set up V slice at p3 position. + v_wb = args.workbuf + if (args.v_off - (4 * (this.uv_stride as base.u64))) <= v_wb.length() { + v_wb = v_wb[args.v_off - (4 * (this.uv_stride as base.u64)) ..] + } else { + return nothing + } + + // ---- Load 8 rows: p3..q3, combining U (lower lane) + V (upper lane) ---- + + // p3 + if (8 > u_wb.length()) or (8 > v_wb.length()) { + return nothing + } + u_128 = sse_util.make_m128i_slice64(a: u_wb[.. 8]) + v_128 = sse_util.make_m128i_slice64(a: v_wb[.. 8]) + p3 = u_128._mm256_castsi128_si256()._mm256_inserti128_si256(b: v_128, imm8: 1) + if ((this.uv_stride as base.u64) > u_wb.length()) or + ((this.uv_stride as base.u64) > v_wb.length()) { + return nothing + } + u_wb = u_wb[(this.uv_stride as base.u64) ..] + v_wb = v_wb[(this.uv_stride as base.u64) ..] + + // p2 + if (8 > u_wb.length()) or (8 > v_wb.length()) { + return nothing + } + u_128 = sse_util.make_m128i_slice64(a: u_wb[.. 8]) + v_128 = sse_util.make_m128i_slice64(a: v_wb[.. 8]) + p2 = u_128._mm256_castsi128_si256()._mm256_inserti128_si256(b: v_128, imm8: 1) + if ((this.uv_stride as base.u64) > u_wb.length()) or + ((this.uv_stride as base.u64) > v_wb.length()) { + return nothing + } + u_wb = u_wb[(this.uv_stride as base.u64) ..] + v_wb = v_wb[(this.uv_stride as base.u64) ..] + + // p1 + if (8 > u_wb.length()) or (8 > v_wb.length()) { + return nothing + } + u_128 = sse_util.make_m128i_slice64(a: u_wb[.. 8]) + v_128 = sse_util.make_m128i_slice64(a: v_wb[.. 
8]) + p1 = u_128._mm256_castsi128_si256()._mm256_inserti128_si256(b: v_128, imm8: 1) + if ((this.uv_stride as base.u64) > u_wb.length()) or + ((this.uv_stride as base.u64) > v_wb.length()) { + return nothing + } + u_wb = u_wb[(this.uv_stride as base.u64) ..] + v_wb = v_wb[(this.uv_stride as base.u64) ..] + + // p0 + if (8 > u_wb.length()) or (8 > v_wb.length()) { + return nothing + } + u_128 = sse_util.make_m128i_slice64(a: u_wb[.. 8]) + v_128 = sse_util.make_m128i_slice64(a: v_wb[.. 8]) + p0 = u_128._mm256_castsi128_si256()._mm256_inserti128_si256(b: v_128, imm8: 1) + if ((this.uv_stride as base.u64) > u_wb.length()) or + ((this.uv_stride as base.u64) > v_wb.length()) { + return nothing + } + u_wb = u_wb[(this.uv_stride as base.u64) ..] + v_wb = v_wb[(this.uv_stride as base.u64) ..] + + // q0 + if (8 > u_wb.length()) or (8 > v_wb.length()) { + return nothing + } + u_128 = sse_util.make_m128i_slice64(a: u_wb[.. 8]) + v_128 = sse_util.make_m128i_slice64(a: v_wb[.. 8]) + q0 = u_128._mm256_castsi128_si256()._mm256_inserti128_si256(b: v_128, imm8: 1) + if ((this.uv_stride as base.u64) > u_wb.length()) or + ((this.uv_stride as base.u64) > v_wb.length()) { + return nothing + } + u_wb = u_wb[(this.uv_stride as base.u64) ..] + v_wb = v_wb[(this.uv_stride as base.u64) ..] + + // q1 + if (8 > u_wb.length()) or (8 > v_wb.length()) { + return nothing + } + u_128 = sse_util.make_m128i_slice64(a: u_wb[.. 8]) + v_128 = sse_util.make_m128i_slice64(a: v_wb[.. 8]) + q1 = u_128._mm256_castsi128_si256()._mm256_inserti128_si256(b: v_128, imm8: 1) + if ((this.uv_stride as base.u64) > u_wb.length()) or + ((this.uv_stride as base.u64) > v_wb.length()) { + return nothing + } + u_wb = u_wb[(this.uv_stride as base.u64) ..] + v_wb = v_wb[(this.uv_stride as base.u64) ..] + + // q2 + if (8 > u_wb.length()) or (8 > v_wb.length()) { + return nothing + } + u_128 = sse_util.make_m128i_slice64(a: u_wb[.. 8]) + v_128 = sse_util.make_m128i_slice64(a: v_wb[.. 
8]) + q2 = u_128._mm256_castsi128_si256()._mm256_inserti128_si256(b: v_128, imm8: 1) + if ((this.uv_stride as base.u64) > u_wb.length()) or + ((this.uv_stride as base.u64) > v_wb.length()) { + return nothing + } + u_wb = u_wb[(this.uv_stride as base.u64) ..] + v_wb = v_wb[(this.uv_stride as base.u64) ..] + + // q3 + if (8 > u_wb.length()) or (8 > v_wb.length()) { + return nothing + } + u_128 = sse_util.make_m128i_slice64(a: u_wb[.. 8]) + v_128 = sse_util.make_m128i_slice64(a: v_wb[.. 8]) + q3 = u_128._mm256_castsi128_si256()._mm256_inserti128_si256(b: v_128, imm8: 1) + + // ---- Constants ---- + zero = util.make_m256i_zeroes() + sign_bit = util.make_m256i_repeat_u8(a: 0x80) + kFE = util.make_m256i_repeat_u8(a: 0xFE) + m_thresh = util.make_m256i_repeat_u8(a: (args.level & 0xFF) as base.u8) + m_ithresh = util.make_m256i_repeat_u8(a: (args.ilevel & 0xFF) as base.u8) + m_hthresh = util.make_m256i_repeat_u8(a: (args.hlevel & 0xFF) as base.u8) + k3 = util.make_m256i_repeat_u8(a: 3) + k4 = util.make_m256i_repeat_u8(a: 4) + k63 = util.make_m256i_repeat_u16(a: 63) + k27 = util.make_m256i_repeat_u16(a: 27) + k18 = util.make_m256i_repeat_u16(a: 18) + k9 = util.make_m256i_repeat_u16(a: 9) + + // ---- NeedsFilter + NeedsFilter2 ---- + // mask ends up 0xFF in each byte where the saturating sum + // 2*|p0-q0| + (|p1-q1| >> 1) is <= level and all six neighbouring-row + // differences are <= ilevel, and 0x00 elsewhere. Each |a-b| is built from + // two saturating subtractions OR'ed together. kFE clears every byte's low + // bit so the 16-bit right shift cannot pull a bit in from the next byte. + t1 = p1._mm256_subs_epu8(b: q1)._mm256_or_si256(b: q1._mm256_subs_epu8(b: p1)) + t2 = t1._mm256_and_si256(b: kFE)._mm256_srli_epi16(imm8: 1) + t3 = p0._mm256_subs_epu8(b: q0)._mm256_or_si256(b: q0._mm256_subs_epu8(b: p0)) + t3 = t3._mm256_adds_epu8(b: t3) + t3 = t3._mm256_adds_epu8(b: t2) + mask = t3._mm256_subs_epu8(b: m_thresh)._mm256_cmpeq_epi8(b: zero) + + t1 = p3._mm256_subs_epu8(b: p2)._mm256_or_si256(b: p2._mm256_subs_epu8(b: p3)) + mask = mask._mm256_and_si256(b: t1._mm256_subs_epu8(b: m_ithresh)._mm256_cmpeq_epi8(b: zero)) + t1 = p2._mm256_subs_epu8(b: p1)._mm256_or_si256(b: p1._mm256_subs_epu8(b: p2)) + mask = mask._mm256_and_si256(b: t1._mm256_subs_epu8(b: m_ithresh)._mm256_cmpeq_epi8(b: zero)) + t1 = p1._mm256_subs_epu8(b: 
p0)._mm256_or_si256(b: p0._mm256_subs_epu8(b: p1)) + mask = mask._mm256_and_si256(b: t1._mm256_subs_epu8(b: m_ithresh)._mm256_cmpeq_epi8(b: zero)) + t1 = q0._mm256_subs_epu8(b: q1)._mm256_or_si256(b: q1._mm256_subs_epu8(b: q0)) + mask = mask._mm256_and_si256(b: t1._mm256_subs_epu8(b: m_ithresh)._mm256_cmpeq_epi8(b: zero)) + t1 = q1._mm256_subs_epu8(b: q2)._mm256_or_si256(b: q2._mm256_subs_epu8(b: q1)) + mask = mask._mm256_and_si256(b: t1._mm256_subs_epu8(b: m_ithresh)._mm256_cmpeq_epi8(b: zero)) + t1 = q2._mm256_subs_epu8(b: q3)._mm256_or_si256(b: q3._mm256_subs_epu8(b: q2)) + mask = mask._mm256_and_si256(b: t1._mm256_subs_epu8(b: m_ithresh)._mm256_cmpeq_epi8(b: zero)) + + // ---- GetNotHEV ---- + // not_hev is 0xFF in each byte where both |p1-p0| and |q1-q0| are + // <= hlevel, and 0x00 elsewhere. + t1 = p1._mm256_subs_epu8(b: p0)._mm256_or_si256(b: p0._mm256_subs_epu8(b: p1)) + t2 = q1._mm256_subs_epu8(b: q0)._mm256_or_si256(b: q0._mm256_subs_epu8(b: q1)) + t3 = t1._mm256_subs_epu8(b: m_hthresh)._mm256_or_si256(b: t2._mm256_subs_epu8(b: m_hthresh)) + not_hev = t3._mm256_cmpeq_epi8(b: zero) + + // ---- Convert to signed ---- + p2 = p2._mm256_xor_si256(b: sign_bit) + p1 = p1._mm256_xor_si256(b: sign_bit) + p0 = p0._mm256_xor_si256(b: sign_bit) + q0 = q0._mm256_xor_si256(b: sign_bit) + q1 = q1._mm256_xor_si256(b: sign_bit) + q2 = q2._mm256_xor_si256(b: sign_bit) + + // ---- Delta = sat_i8(3*(q0-p0) + sat_i8(p1-q1)) ---- + t1 = p1._mm256_subs_epi8(b: q1) + t2 = q0._mm256_subs_epi8(b: p0) + t1 = t1._mm256_adds_epi8(b: t2) + t1 = t1._mm256_adds_epi8(b: t2) + delta = t1._mm256_adds_epi8(b: t2) + delta = delta._mm256_and_si256(b: mask) + + // ---- Filter2 (HEV path): v4, v3 ---- + // Unpacking with zero as the first operand puts each signed byte in the + // high half of a 16-bit lane, so the arithmetic shift by 11 (8 + 3) + // yields the sign-extended (x >> 3); packs then narrows back to bytes. + v4 = delta._mm256_adds_epi8(b: k4) + lo = zero._mm256_unpacklo_epi8(b: v4) + hi = zero._mm256_unpackhi_epi8(b: v4) + lo = lo._mm256_srai_epi16(imm8: 11) + hi = hi._mm256_srai_epi16(imm8: 11) + v4 = lo._mm256_packs_epi16(b: hi) + + v3 = delta._mm256_adds_epi8(b: k3) + lo = zero._mm256_unpacklo_epi8(b: v3) + hi = zero._mm256_unpackhi_epi8(b: v3) + lo = lo._mm256_srai_epi16(imm8: 11) + hi = 
hi._mm256_srai_epi16(imm8: 11) + v3 = lo._mm256_packs_epi16(b: hi) + + // ---- Filter6 (!HEV path): widen delta to i16, multiply by 27/18/9 ---- + d_lo = zero._mm256_unpacklo_epi8(b: delta)._mm256_srai_epi16(imm8: 8) + d_hi = zero._mm256_unpackhi_epi8(b: delta)._mm256_srai_epi16(imm8: 8) + + lo = d_lo._mm256_mullo_epi16(b: k27)._mm256_add_epi16(b: k63)._mm256_srai_epi16(imm8: 7) + hi = d_hi._mm256_mullo_epi16(b: k27)._mm256_add_epi16(b: k63)._mm256_srai_epi16(imm8: 7) + a1 = lo._mm256_packs_epi16(b: hi) + + lo = d_lo._mm256_mullo_epi16(b: k18)._mm256_add_epi16(b: k63)._mm256_srai_epi16(imm8: 7) + hi = d_hi._mm256_mullo_epi16(b: k18)._mm256_add_epi16(b: k63)._mm256_srai_epi16(imm8: 7) + a2 = lo._mm256_packs_epi16(b: hi) + + lo = d_lo._mm256_mullo_epi16(b: k9)._mm256_add_epi16(b: k63)._mm256_srai_epi16(imm8: 7) + hi = d_hi._mm256_mullo_epi16(b: k9)._mm256_add_epi16(b: k63)._mm256_srai_epi16(imm8: 7) + a3 = lo._mm256_packs_epi16(b: hi) + + // ---- Merge HEV/!HEV ---- + // Per byte: where not_hev is clear, p0 and q0 move by v3 and v4; where it + // is set, p0/q0 move by a1, p1/q1 by a2 and p2/q2 by a3. + p0_adj = not_hev._mm256_andnot_si256(b: v3)._mm256_or_si256(b: a1._mm256_and_si256(b: not_hev)) + p0 = p0._mm256_adds_epi8(b: p0_adj) + q0_adj = not_hev._mm256_andnot_si256(b: v4)._mm256_or_si256(b: a1._mm256_and_si256(b: not_hev)) + q0 = q0._mm256_subs_epi8(b: q0_adj) + p1 = p1._mm256_adds_epi8(b: a2._mm256_and_si256(b: not_hev)) + q1 = q1._mm256_subs_epi8(b: a2._mm256_and_si256(b: not_hev)) + p2 = p2._mm256_adds_epi8(b: a3._mm256_and_si256(b: not_hev)) + q2 = q2._mm256_subs_epi8(b: a3._mm256_and_si256(b: not_hev)) + + // ---- Convert back to unsigned ---- + p2 = p2._mm256_xor_si256(b: sign_bit) + p1 = p1._mm256_xor_si256(b: sign_bit) + p0 = p0._mm256_xor_si256(b: sign_bit) + q0 = q0._mm256_xor_si256(b: sign_bit) + q1 = q1._mm256_xor_si256(b: sign_bit) + q2 = q2._mm256_xor_si256(b: sign_bit) + + // ---- Store p2, p1, p0, q0, q1, q2 back to U and V planes ---- + // Every store and stride advance below has its own length guard; a failed + // guard skips that one step rather than returning. + + // U store: reslice to p2 position (u_off - 3*stride). 
+ if args.u_off < (3 * (this.uv_stride as base.u64)) { + return nothing + } + u_wb = args.workbuf + if (args.u_off - (3 * (this.uv_stride as base.u64))) <= u_wb.length() { + u_wb = u_wb[args.u_off - (3 * (this.uv_stride as base.u64)) ..] + } else { + return nothing + } + + // V store: reslice to p2 position (v_off - 3*stride). + if args.v_off < (3 * (this.uv_stride as base.u64)) { + return nothing + } + v_wb = args.workbuf + if (args.v_off - (3 * (this.uv_stride as base.u64))) <= v_wb.length() { + v_wb = v_wb[args.v_off - (3 * (this.uv_stride as base.u64)) ..] + } else { + return nothing + } + + // Store p2. + u_128 = p2._mm256_castsi256_si128() + v_128 = p2._mm256_extracti128_si256(imm8: 1) + if 8 <= u_wb.length() { + u_128.store_slice64!(a: u_wb[.. 8]) + } + if 8 <= v_wb.length() { + v_128.store_slice64!(a: v_wb[.. 8]) + } + if (this.uv_stride as base.u64) <= u_wb.length() { + u_wb = u_wb[(this.uv_stride as base.u64) ..] + } + if (this.uv_stride as base.u64) <= v_wb.length() { + v_wb = v_wb[(this.uv_stride as base.u64) ..] + } + + // Store p1. + u_128 = p1._mm256_castsi256_si128() + v_128 = p1._mm256_extracti128_si256(imm8: 1) + if 8 <= u_wb.length() { + u_128.store_slice64!(a: u_wb[.. 8]) + } + if 8 <= v_wb.length() { + v_128.store_slice64!(a: v_wb[.. 8]) + } + if (this.uv_stride as base.u64) <= u_wb.length() { + u_wb = u_wb[(this.uv_stride as base.u64) ..] + } + if (this.uv_stride as base.u64) <= v_wb.length() { + v_wb = v_wb[(this.uv_stride as base.u64) ..] + } + + // Store p0. + u_128 = p0._mm256_castsi256_si128() + v_128 = p0._mm256_extracti128_si256(imm8: 1) + if 8 <= u_wb.length() { + u_128.store_slice64!(a: u_wb[.. 8]) + } + if 8 <= v_wb.length() { + v_128.store_slice64!(a: v_wb[.. 8]) + } + if (this.uv_stride as base.u64) <= u_wb.length() { + u_wb = u_wb[(this.uv_stride as base.u64) ..] + } + if (this.uv_stride as base.u64) <= v_wb.length() { + v_wb = v_wb[(this.uv_stride as base.u64) ..] + } + + // Store q0. 
+ u_128 = q0._mm256_castsi256_si128() + v_128 = q0._mm256_extracti128_si256(imm8: 1) + if 8 <= u_wb.length() { + u_128.store_slice64!(a: u_wb[.. 8]) + } + if 8 <= v_wb.length() { + v_128.store_slice64!(a: v_wb[.. 8]) + } + if (this.uv_stride as base.u64) <= u_wb.length() { + u_wb = u_wb[(this.uv_stride as base.u64) ..] + } + if (this.uv_stride as base.u64) <= v_wb.length() { + v_wb = v_wb[(this.uv_stride as base.u64) ..] + } + + // Store q1. + u_128 = q1._mm256_castsi256_si128() + v_128 = q1._mm256_extracti128_si256(imm8: 1) + if 8 <= u_wb.length() { + u_128.store_slice64!(a: u_wb[.. 8]) + } + if 8 <= v_wb.length() { + v_128.store_slice64!(a: v_wb[.. 8]) + } + if (this.uv_stride as base.u64) <= u_wb.length() { + u_wb = u_wb[(this.uv_stride as base.u64) ..] + } + if (this.uv_stride as base.u64) <= v_wb.length() { + v_wb = v_wb[(this.uv_stride as base.u64) ..] + } + + // Store q2. + u_128 = q2._mm256_castsi256_si128() + v_128 = q2._mm256_extracti128_si256(imm8: 1) + if 8 <= u_wb.length() { + u_128.store_slice64!(a: u_wb[.. 8]) + } + if 8 <= v_wb.length() { + v_128.store_slice64!(a: v_wb[.. 
8]) + } +} + +// ============================================================================ +// normal_vfilter_inner_uv: filter4 for horizontal inner sub-block edges (U+V) +// ============================================================================ + +// normal_vfilter_inner_uv_x86_avx2 runs the filter4 form of the normal loop +// filter across one horizontal edge of the U and V planes in a single pass. +// +// For each plane it loads eight rows of 8 bytes (p3, p2, p1, p0, q0, q1, q2, +// q3), the first of them 4 * uv_stride bytes before the plane's offset, so +// args.u_off and args.v_off address the q0 rows. Each U row is placed in the +// lower 128-bit lane and the matching V row in the upper lane. Rows p1, p0, +// q0 and q1 are then written back to args.workbuf, 8 bytes per plane per row; +// the outer rows are only read, to build the filter mask. +// +// Only the low 8 bits of args.level, args.ilevel and args.hlevel are used. +// +// It returns before storing anything if an offset is smaller than +// 4 * uv_stride or if any row load would run past the end of args.workbuf. +pri func decoder.normal_vfilter_inner_uv_x86_avx2!(workbuf: slice base.u8, + u_off: base.u64, v_off: base.u64, + level: base.u32, ilevel: base.u32, hlevel: base.u32), + choose cpu_arch >= x86_avx2, +{ + var sse_util : base.x86_sse42_utility + var util : base.x86_avx2_utility + var u_wb : slice base.u8 + var v_wb : slice base.u8 + var u_128 : base.x86_m128i + var v_128 : base.x86_m128i + + var p3 : base.x86_m256i + var p2 : base.x86_m256i + var p1 : base.x86_m256i + var p0 : base.x86_m256i + var q0 : base.x86_m256i + var q1 : base.x86_m256i + var q2 : base.x86_m256i + var q3 : base.x86_m256i + + var zero : base.x86_m256i + var sign_bit : base.x86_m256i + var kFE : base.x86_m256i + var m_thresh : base.x86_m256i + var m_ithresh : base.x86_m256i + var m_hthresh : base.x86_m256i + var k1 : base.x86_m256i + var k3 : base.x86_m256i + var k4 : base.x86_m256i + + var mask : base.x86_m256i + var not_hev : base.x86_m256i + var delta : base.x86_m256i + var v3 : base.x86_m256i + var v4 : base.x86_m256i + var a3 : base.x86_m256i + var t1 : base.x86_m256i + var t2 : base.x86_m256i + var t3 : base.x86_m256i + var lo : base.x86_m256i + var hi : base.x86_m256i + + // Bounds: need 4*uv_stride before each offset. + if args.u_off < (4 * (this.uv_stride as base.u64)) { + return nothing + } + if args.v_off < (4 * (this.uv_stride as base.u64)) { + return nothing + } + + // Set up U slice at p3 position. + u_wb = args.workbuf + if (args.u_off - (4 * (this.uv_stride as base.u64))) <= u_wb.length() { + u_wb = u_wb[args.u_off - (4 * (this.uv_stride as base.u64)) ..] + } else { + return nothing + } + + // Set up V slice at p3 position. 
+ v_wb = args.workbuf + if (args.v_off - (4 * (this.uv_stride as base.u64))) <= v_wb.length() { + v_wb = v_wb[args.v_off - (4 * (this.uv_stride as base.u64)) ..] + } else { + return nothing + } + + // ---- Load 8 rows: p3..q3 ---- + + // p3 + if (8 > u_wb.length()) or (8 > v_wb.length()) { + return nothing + } + u_128 = sse_util.make_m128i_slice64(a: u_wb[.. 8]) + v_128 = sse_util.make_m128i_slice64(a: v_wb[.. 8]) + p3 = u_128._mm256_castsi128_si256()._mm256_inserti128_si256(b: v_128, imm8: 1) + if ((this.uv_stride as base.u64) > u_wb.length()) or + ((this.uv_stride as base.u64) > v_wb.length()) { + return nothing + } + u_wb = u_wb[(this.uv_stride as base.u64) ..] + v_wb = v_wb[(this.uv_stride as base.u64) ..] + + // p2 + if (8 > u_wb.length()) or (8 > v_wb.length()) { + return nothing + } + u_128 = sse_util.make_m128i_slice64(a: u_wb[.. 8]) + v_128 = sse_util.make_m128i_slice64(a: v_wb[.. 8]) + p2 = u_128._mm256_castsi128_si256()._mm256_inserti128_si256(b: v_128, imm8: 1) + if ((this.uv_stride as base.u64) > u_wb.length()) or + ((this.uv_stride as base.u64) > v_wb.length()) { + return nothing + } + u_wb = u_wb[(this.uv_stride as base.u64) ..] + v_wb = v_wb[(this.uv_stride as base.u64) ..] + + // p1 + if (8 > u_wb.length()) or (8 > v_wb.length()) { + return nothing + } + u_128 = sse_util.make_m128i_slice64(a: u_wb[.. 8]) + v_128 = sse_util.make_m128i_slice64(a: v_wb[.. 8]) + p1 = u_128._mm256_castsi128_si256()._mm256_inserti128_si256(b: v_128, imm8: 1) + if ((this.uv_stride as base.u64) > u_wb.length()) or + ((this.uv_stride as base.u64) > v_wb.length()) { + return nothing + } + u_wb = u_wb[(this.uv_stride as base.u64) ..] + v_wb = v_wb[(this.uv_stride as base.u64) ..] + + // p0 + if (8 > u_wb.length()) or (8 > v_wb.length()) { + return nothing + } + u_128 = sse_util.make_m128i_slice64(a: u_wb[.. 8]) + v_128 = sse_util.make_m128i_slice64(a: v_wb[.. 
8]) + p0 = u_128._mm256_castsi128_si256()._mm256_inserti128_si256(b: v_128, imm8: 1) + if ((this.uv_stride as base.u64) > u_wb.length()) or + ((this.uv_stride as base.u64) > v_wb.length()) { + return nothing + } + u_wb = u_wb[(this.uv_stride as base.u64) ..] + v_wb = v_wb[(this.uv_stride as base.u64) ..] + + // q0 + if (8 > u_wb.length()) or (8 > v_wb.length()) { + return nothing + } + u_128 = sse_util.make_m128i_slice64(a: u_wb[.. 8]) + v_128 = sse_util.make_m128i_slice64(a: v_wb[.. 8]) + q0 = u_128._mm256_castsi128_si256()._mm256_inserti128_si256(b: v_128, imm8: 1) + if ((this.uv_stride as base.u64) > u_wb.length()) or + ((this.uv_stride as base.u64) > v_wb.length()) { + return nothing + } + u_wb = u_wb[(this.uv_stride as base.u64) ..] + v_wb = v_wb[(this.uv_stride as base.u64) ..] + + // q1 + if (8 > u_wb.length()) or (8 > v_wb.length()) { + return nothing + } + u_128 = sse_util.make_m128i_slice64(a: u_wb[.. 8]) + v_128 = sse_util.make_m128i_slice64(a: v_wb[.. 8]) + q1 = u_128._mm256_castsi128_si256()._mm256_inserti128_si256(b: v_128, imm8: 1) + if ((this.uv_stride as base.u64) > u_wb.length()) or + ((this.uv_stride as base.u64) > v_wb.length()) { + return nothing + } + u_wb = u_wb[(this.uv_stride as base.u64) ..] + v_wb = v_wb[(this.uv_stride as base.u64) ..] + + // q2 + if (8 > u_wb.length()) or (8 > v_wb.length()) { + return nothing + } + u_128 = sse_util.make_m128i_slice64(a: u_wb[.. 8]) + v_128 = sse_util.make_m128i_slice64(a: v_wb[.. 8]) + q2 = u_128._mm256_castsi128_si256()._mm256_inserti128_si256(b: v_128, imm8: 1) + if ((this.uv_stride as base.u64) > u_wb.length()) or + ((this.uv_stride as base.u64) > v_wb.length()) { + return nothing + } + u_wb = u_wb[(this.uv_stride as base.u64) ..] + v_wb = v_wb[(this.uv_stride as base.u64) ..] + + // q3 + if (8 > u_wb.length()) or (8 > v_wb.length()) { + return nothing + } + u_128 = sse_util.make_m128i_slice64(a: u_wb[.. 8]) + v_128 = sse_util.make_m128i_slice64(a: v_wb[.. 
8]) + q3 = u_128._mm256_castsi128_si256()._mm256_inserti128_si256(b: v_128, imm8: 1) + + // ---- Constants ---- + zero = util.make_m256i_zeroes() + sign_bit = util.make_m256i_repeat_u8(a: 0x80) + kFE = util.make_m256i_repeat_u8(a: 0xFE) + m_thresh = util.make_m256i_repeat_u8(a: (args.level & 0xFF) as base.u8) + m_ithresh = util.make_m256i_repeat_u8(a: (args.ilevel & 0xFF) as base.u8) + m_hthresh = util.make_m256i_repeat_u8(a: (args.hlevel & 0xFF) as base.u8) + k1 = util.make_m256i_repeat_u8(a: 1) + k3 = util.make_m256i_repeat_u8(a: 3) + k4 = util.make_m256i_repeat_u8(a: 4) + + // ---- NeedsFilter + NeedsFilter2 ---- + // mask ends up 0xFF in each byte where the saturating sum + // 2*|p0-q0| + (|p1-q1| >> 1) is <= level and all six neighbouring-row + // differences are <= ilevel, and 0x00 elsewhere. kFE clears every byte's + // low bit so the 16-bit right shift cannot pull a bit in from the next + // byte. + t1 = p1._mm256_subs_epu8(b: q1)._mm256_or_si256(b: q1._mm256_subs_epu8(b: p1)) + t2 = t1._mm256_and_si256(b: kFE)._mm256_srli_epi16(imm8: 1) + t3 = p0._mm256_subs_epu8(b: q0)._mm256_or_si256(b: q0._mm256_subs_epu8(b: p0)) + t3 = t3._mm256_adds_epu8(b: t3) + t3 = t3._mm256_adds_epu8(b: t2) + mask = t3._mm256_subs_epu8(b: m_thresh)._mm256_cmpeq_epi8(b: zero) + + t1 = p3._mm256_subs_epu8(b: p2)._mm256_or_si256(b: p2._mm256_subs_epu8(b: p3)) + mask = mask._mm256_and_si256(b: t1._mm256_subs_epu8(b: m_ithresh)._mm256_cmpeq_epi8(b: zero)) + t1 = p2._mm256_subs_epu8(b: p1)._mm256_or_si256(b: p1._mm256_subs_epu8(b: p2)) + mask = mask._mm256_and_si256(b: t1._mm256_subs_epu8(b: m_ithresh)._mm256_cmpeq_epi8(b: zero)) + t1 = p1._mm256_subs_epu8(b: p0)._mm256_or_si256(b: p0._mm256_subs_epu8(b: p1)) + mask = mask._mm256_and_si256(b: t1._mm256_subs_epu8(b: m_ithresh)._mm256_cmpeq_epi8(b: zero)) + t1 = q0._mm256_subs_epu8(b: q1)._mm256_or_si256(b: q1._mm256_subs_epu8(b: q0)) + mask = mask._mm256_and_si256(b: t1._mm256_subs_epu8(b: m_ithresh)._mm256_cmpeq_epi8(b: zero)) + t1 = q1._mm256_subs_epu8(b: q2)._mm256_or_si256(b: q2._mm256_subs_epu8(b: q1)) + mask = mask._mm256_and_si256(b: t1._mm256_subs_epu8(b: m_ithresh)._mm256_cmpeq_epi8(b: zero)) + t1 = q2._mm256_subs_epu8(b: q3)._mm256_or_si256(b: q3._mm256_subs_epu8(b: q2)) + mask = mask._mm256_and_si256(b: 
t1._mm256_subs_epu8(b: m_ithresh)._mm256_cmpeq_epi8(b: zero)) + + // ---- GetNotHEV ---- + // not_hev is 0xFF in each byte where both |p1-p0| and |q1-q0| are + // <= hlevel, and 0x00 elsewhere. + t1 = p1._mm256_subs_epu8(b: p0)._mm256_or_si256(b: p0._mm256_subs_epu8(b: p1)) + t2 = q1._mm256_subs_epu8(b: q0)._mm256_or_si256(b: q0._mm256_subs_epu8(b: q1)) + t3 = t1._mm256_subs_epu8(b: m_hthresh)._mm256_or_si256(b: t2._mm256_subs_epu8(b: m_hthresh)) + not_hev = t3._mm256_cmpeq_epi8(b: zero) + + // ---- Convert p1, p0, q0, q1 to signed ---- + p1 = p1._mm256_xor_si256(b: sign_bit) + p0 = p0._mm256_xor_si256(b: sign_bit) + q0 = q0._mm256_xor_si256(b: sign_bit) + q1 = q1._mm256_xor_si256(b: sign_bit) + + // ---- Combined delta ---- + // The andnot keeps the sat(p1 - q1) term only in bytes where not_hev is + // clear; 3 * (q0 - p0) is then added with saturation and the result is + // zeroed wherever mask is clear. + t1 = p1._mm256_subs_epi8(b: q1) + t1 = not_hev._mm256_andnot_si256(b: t1) + t2 = q0._mm256_subs_epi8(b: p0) + t1 = t1._mm256_adds_epi8(b: t2) + t1 = t1._mm256_adds_epi8(b: t2) + delta = t1._mm256_adds_epi8(b: t2) + delta = delta._mm256_and_si256(b: mask) + + // ---- v4, v3 ---- + // Unpacking with zero as the first operand puts each signed byte in the + // high half of a 16-bit lane, so the arithmetic shift by 11 (8 + 3) + // yields the sign-extended (x >> 3); packs then narrows back to bytes. + v4 = delta._mm256_adds_epi8(b: k4) + lo = zero._mm256_unpacklo_epi8(b: v4) + hi = zero._mm256_unpackhi_epi8(b: v4) + lo = lo._mm256_srai_epi16(imm8: 11) + hi = hi._mm256_srai_epi16(imm8: 11) + v4 = lo._mm256_packs_epi16(b: hi) + + v3 = delta._mm256_adds_epi8(b: k3) + lo = zero._mm256_unpacklo_epi8(b: v3) + hi = zero._mm256_unpackhi_epi8(b: v3) + lo = lo._mm256_srai_epi16(imm8: 11) + hi = hi._mm256_srai_epi16(imm8: 11) + v3 = lo._mm256_packs_epi16(b: hi) + + // Apply to p0, q0. + q0 = q0._mm256_subs_epi8(b: v4) + p0 = p0._mm256_adds_epi8(b: v3) + + // ---- Filter4 !HEV: a3 = SignedShift(v4+1, 1) & not_hev ---- + a3 = v4._mm256_adds_epi8(b: k1) + lo = zero._mm256_unpacklo_epi8(b: a3) + hi = zero._mm256_unpackhi_epi8(b: a3) + lo = lo._mm256_srai_epi16(imm8: 9) + hi = hi._mm256_srai_epi16(imm8: 9) + a3 = lo._mm256_packs_epi16(b: hi) + a3 = a3._mm256_and_si256(b: not_hev) + + // Apply to p1, q1. 
+ q1 = q1._mm256_subs_epi8(b: a3) + p1 = p1._mm256_adds_epi8(b: a3) + + // ---- Convert back to unsigned ---- + p1 = p1._mm256_xor_si256(b: sign_bit) + p0 = p0._mm256_xor_si256(b: sign_bit) + q0 = q0._mm256_xor_si256(b: sign_bit) + q1 = q1._mm256_xor_si256(b: sign_bit) + + // ---- Store p1, p0, q0, q1 ---- + // Both slices are re-derived from args.workbuf at offset - 2 * uv_stride + // (the p1 row). Every store and stride advance below has its own length + // guard; a failed guard skips that one step rather than returning. + if args.u_off < (2 * (this.uv_stride as base.u64)) { + return nothing + } + u_wb = args.workbuf + if (args.u_off - (2 * (this.uv_stride as base.u64))) <= u_wb.length() { + u_wb = u_wb[args.u_off - (2 * (this.uv_stride as base.u64)) ..] + } else { + return nothing + } + if args.v_off < (2 * (this.uv_stride as base.u64)) { + return nothing + } + v_wb = args.workbuf + if (args.v_off - (2 * (this.uv_stride as base.u64))) <= v_wb.length() { + v_wb = v_wb[args.v_off - (2 * (this.uv_stride as base.u64)) ..] + } else { + return nothing + } + + // Store p1. + u_128 = p1._mm256_castsi256_si128() + v_128 = p1._mm256_extracti128_si256(imm8: 1) + if 8 <= u_wb.length() { + u_128.store_slice64!(a: u_wb[.. 8]) + } + if 8 <= v_wb.length() { + v_128.store_slice64!(a: v_wb[.. 8]) + } + if (this.uv_stride as base.u64) <= u_wb.length() { + u_wb = u_wb[(this.uv_stride as base.u64) ..] + } + if (this.uv_stride as base.u64) <= v_wb.length() { + v_wb = v_wb[(this.uv_stride as base.u64) ..] + } + + // Store p0. + u_128 = p0._mm256_castsi256_si128() + v_128 = p0._mm256_extracti128_si256(imm8: 1) + if 8 <= u_wb.length() { + u_128.store_slice64!(a: u_wb[.. 8]) + } + if 8 <= v_wb.length() { + v_128.store_slice64!(a: v_wb[.. 8]) + } + if (this.uv_stride as base.u64) <= u_wb.length() { + u_wb = u_wb[(this.uv_stride as base.u64) ..] + } + if (this.uv_stride as base.u64) <= v_wb.length() { + v_wb = v_wb[(this.uv_stride as base.u64) ..] + } + + // Store q0. + u_128 = q0._mm256_castsi256_si128() + v_128 = q0._mm256_extracti128_si256(imm8: 1) + if 8 <= u_wb.length() { + u_128.store_slice64!(a: u_wb[.. 8]) + } + if 8 <= v_wb.length() { + v_128.store_slice64!(a: v_wb[.. 
8]) + } + if (this.uv_stride as base.u64) <= u_wb.length() { + u_wb = u_wb[(this.uv_stride as base.u64) ..] + } + if (this.uv_stride as base.u64) <= v_wb.length() { + v_wb = v_wb[(this.uv_stride as base.u64) ..] + } + + // Store q1. + u_128 = q1._mm256_castsi256_si128() + v_128 = q1._mm256_extracti128_si256(imm8: 1) + if 8 <= u_wb.length() { + u_128.store_slice64!(a: u_wb[.. 8]) + } + if 8 <= v_wb.length() { + v_128.store_slice64!(a: v_wb[.. 8]) + } +} + +// ============================================================================ +// normal_hfilter_mb_uv: filter6 for vertical MB boundary edges (U+V) +// 8x8 transpose + filter6 + reverse transpose, both planes simultaneously. +// ============================================================================ + +pri func decoder.normal_hfilter_mb_uv_x86_avx2!(workbuf: slice base.u8, + u_off: base.u64, v_off: base.u64, + level: base.u32, ilevel: base.u32, hlevel: base.u32), + choose cpu_arch >= x86_avx2, +{ + var sse_util : base.x86_sse42_utility + var util : base.x86_avx2_utility + var u_wb : slice base.u8 + var v_wb : slice base.u8 + var stride : base.u64 + var u_ra : base.x86_m128i + var u_rb : base.x86_m128i + var v_ra : base.x86_m128i + var v_rb : base.x86_m128i + var u_128 : base.x86_m128i + var v_128 : base.x86_m128i + + // Transpose scratch (AVX2 — each lane does independent 8x8 transpose). 
+ var f0 : base.x86_m256i + var f1 : base.x86_m256i + var f2 : base.x86_m256i + var f3 : base.x86_m256i + var g0 : base.x86_m256i + var g1 : base.x86_m256i + var g2 : base.x86_m256i + var g3 : base.x86_m256i + + var p3 : base.x86_m256i + var p2 : base.x86_m256i + var p1 : base.x86_m256i + var p0 : base.x86_m256i + var q0 : base.x86_m256i + var q1 : base.x86_m256i + var q2 : base.x86_m256i + var q3 : base.x86_m256i + + var zero : base.x86_m256i + var sign_bit : base.x86_m256i + var kFE : base.x86_m256i + var m_thresh : base.x86_m256i + var m_ithresh : base.x86_m256i + var m_hthresh : base.x86_m256i + var k3 : base.x86_m256i + var k4 : base.x86_m256i + var k63 : base.x86_m256i + var k27 : base.x86_m256i + var k18 : base.x86_m256i + var k9 : base.x86_m256i + + var mask : base.x86_m256i + var not_hev : base.x86_m256i + var delta : base.x86_m256i + var v3 : base.x86_m256i + var v4 : base.x86_m256i + var a1 : base.x86_m256i + var a2 : base.x86_m256i + var a3 : base.x86_m256i + var t1 : base.x86_m256i + var t2 : base.x86_m256i + var t3 : base.x86_m256i + var lo : base.x86_m256i + var hi : base.x86_m256i + var d_lo : base.x86_m256i + var d_hi : base.x86_m256i + var p0_adj : base.x86_m256i + var q0_adj : base.x86_m256i + var ra : base.x86_m256i + + stride = this.uv_stride as base.u64 + + // Bounds: need 4 bytes before q0 for p3 column. + if (args.u_off < 4) or (args.v_off < 4) { + return nothing + } + u_wb = args.workbuf + if (args.u_off - 4) > u_wb.length() { + return nothing + } + u_wb = u_wb[(args.u_off - 4) ..] + + v_wb = args.workbuf + if (args.v_off - 4) > v_wb.length() { + return nothing + } + v_wb = v_wb[(args.v_off - 4) ..] + + // ==== Load 8 rows of 8 bytes from each plane + forward transpose phase 1 ==== + // Each pair: load U rows, load V rows, combine into m256i, interleave. + + // Rows 0,1 + if (8 > u_wb.length()) or (8 > v_wb.length()) { + return nothing + } + u_ra = sse_util.make_m128i_slice64(a: u_wb[.. 8]) + v_ra = sse_util.make_m128i_slice64(a: v_wb[.. 
8]) + if (stride > u_wb.length()) or (stride > v_wb.length()) { + return nothing + } + u_wb = u_wb[stride ..] + v_wb = v_wb[stride ..] + if (8 > u_wb.length()) or (8 > v_wb.length()) { + return nothing + } + u_rb = sse_util.make_m128i_slice64(a: u_wb[.. 8]) + v_rb = sse_util.make_m128i_slice64(a: v_wb[.. 8]) + if (stride > u_wb.length()) or (stride > v_wb.length()) { + return nothing + } + u_wb = u_wb[stride ..] + v_wb = v_wb[stride ..] + ra = u_ra._mm256_castsi128_si256()._mm256_inserti128_si256(b: v_ra, imm8: 1) + t1 = u_rb._mm256_castsi128_si256()._mm256_inserti128_si256(b: v_rb, imm8: 1) + f0 = ra._mm256_unpacklo_epi8(b: t1) + + // Rows 2,3 + if (8 > u_wb.length()) or (8 > v_wb.length()) { + return nothing + } + u_ra = sse_util.make_m128i_slice64(a: u_wb[.. 8]) + v_ra = sse_util.make_m128i_slice64(a: v_wb[.. 8]) + if (stride > u_wb.length()) or (stride > v_wb.length()) { + return nothing + } + u_wb = u_wb[stride ..] + v_wb = v_wb[stride ..] + if (8 > u_wb.length()) or (8 > v_wb.length()) { + return nothing + } + u_rb = sse_util.make_m128i_slice64(a: u_wb[.. 8]) + v_rb = sse_util.make_m128i_slice64(a: v_wb[.. 8]) + if (stride > u_wb.length()) or (stride > v_wb.length()) { + return nothing + } + u_wb = u_wb[stride ..] + v_wb = v_wb[stride ..] + ra = u_ra._mm256_castsi128_si256()._mm256_inserti128_si256(b: v_ra, imm8: 1) + t1 = u_rb._mm256_castsi128_si256()._mm256_inserti128_si256(b: v_rb, imm8: 1) + f1 = ra._mm256_unpacklo_epi8(b: t1) + + // Rows 4,5 + if (8 > u_wb.length()) or (8 > v_wb.length()) { + return nothing + } + u_ra = sse_util.make_m128i_slice64(a: u_wb[.. 8]) + v_ra = sse_util.make_m128i_slice64(a: v_wb[.. 8]) + if (stride > u_wb.length()) or (stride > v_wb.length()) { + return nothing + } + u_wb = u_wb[stride ..] + v_wb = v_wb[stride ..] + if (8 > u_wb.length()) or (8 > v_wb.length()) { + return nothing + } + u_rb = sse_util.make_m128i_slice64(a: u_wb[.. 8]) + v_rb = sse_util.make_m128i_slice64(a: v_wb[.. 
8]) + if (stride > u_wb.length()) or (stride > v_wb.length()) { + return nothing + } + u_wb = u_wb[stride ..] + v_wb = v_wb[stride ..] + ra = u_ra._mm256_castsi128_si256()._mm256_inserti128_si256(b: v_ra, imm8: 1) + t1 = u_rb._mm256_castsi128_si256()._mm256_inserti128_si256(b: v_rb, imm8: 1) + f2 = ra._mm256_unpacklo_epi8(b: t1) + + // Rows 6,7 + if (8 > u_wb.length()) or (8 > v_wb.length()) { + return nothing + } + u_ra = sse_util.make_m128i_slice64(a: u_wb[.. 8]) + v_ra = sse_util.make_m128i_slice64(a: v_wb[.. 8]) + if stride > u_wb.length() { + return nothing + } + u_wb = u_wb[stride ..] + if stride > v_wb.length() { + return nothing + } + v_wb = v_wb[stride ..] + if (8 > u_wb.length()) or (8 > v_wb.length()) { + return nothing + } + u_rb = sse_util.make_m128i_slice64(a: u_wb[.. 8]) + v_rb = sse_util.make_m128i_slice64(a: v_wb[.. 8]) + ra = u_ra._mm256_castsi128_si256()._mm256_inserti128_si256(b: v_ra, imm8: 1) + t1 = u_rb._mm256_castsi128_si256()._mm256_inserti128_si256(b: v_rb, imm8: 1) + f3 = ra._mm256_unpacklo_epi8(b: t1) + + // ==== Forward transpose phase 2 ==== + g0 = f0._mm256_unpacklo_epi16(b: f1) + g1 = f0._mm256_unpackhi_epi16(b: f1) + g2 = f2._mm256_unpacklo_epi16(b: f3) + g3 = f2._mm256_unpackhi_epi16(b: f3) + + // ==== Forward transpose phase 3 ==== + f0 = g0._mm256_unpacklo_epi32(b: g2) + f1 = g0._mm256_unpackhi_epi32(b: g2) + f2 = g1._mm256_unpacklo_epi32(b: g3) + f3 = g1._mm256_unpackhi_epi32(b: g3) + + // ==== Extract columns ==== + p3 = f0 + t1 = f0 // save for unpackhi + p2 = t1._mm256_unpackhi_epi64(b: t1) + p1 = f1 + t1 = f1 + p0 = t1._mm256_unpackhi_epi64(b: t1) + q0 = f2 + t1 = f2 + q1 = t1._mm256_unpackhi_epi64(b: t1) + q2 = f3 + t1 = f3 + q3 = t1._mm256_unpackhi_epi64(b: t1) + + // ==== Filter computation (filter6) ==== + + zero = util.make_m256i_zeroes() + sign_bit = util.make_m256i_repeat_u8(a: 0x80) + kFE = util.make_m256i_repeat_u8(a: 0xFE) + m_thresh = util.make_m256i_repeat_u8(a: (args.level & 0xFF) as base.u8) + m_ithresh = 
util.make_m256i_repeat_u8(a: (args.ilevel & 0xFF) as base.u8) + m_hthresh = util.make_m256i_repeat_u8(a: (args.hlevel & 0xFF) as base.u8) + k3 = util.make_m256i_repeat_u8(a: 3) + k4 = util.make_m256i_repeat_u8(a: 4) + k63 = util.make_m256i_repeat_u16(a: 63) + k27 = util.make_m256i_repeat_u16(a: 27) + k18 = util.make_m256i_repeat_u16(a: 18) + k9 = util.make_m256i_repeat_u16(a: 9) + + // ---- NeedsFilter + NeedsFilter2 ---- + t1 = p1._mm256_subs_epu8(b: q1)._mm256_or_si256(b: q1._mm256_subs_epu8(b: p1)) + t2 = t1._mm256_and_si256(b: kFE)._mm256_srli_epi16(imm8: 1) + t3 = p0._mm256_subs_epu8(b: q0)._mm256_or_si256(b: q0._mm256_subs_epu8(b: p0)) + t3 = t3._mm256_adds_epu8(b: t3) + t3 = t3._mm256_adds_epu8(b: t2) + mask = t3._mm256_subs_epu8(b: m_thresh)._mm256_cmpeq_epi8(b: zero) + + t1 = p3._mm256_subs_epu8(b: p2)._mm256_or_si256(b: p2._mm256_subs_epu8(b: p3)) + mask = mask._mm256_and_si256(b: t1._mm256_subs_epu8(b: m_ithresh)._mm256_cmpeq_epi8(b: zero)) + t1 = p2._mm256_subs_epu8(b: p1)._mm256_or_si256(b: p1._mm256_subs_epu8(b: p2)) + mask = mask._mm256_and_si256(b: t1._mm256_subs_epu8(b: m_ithresh)._mm256_cmpeq_epi8(b: zero)) + t1 = p1._mm256_subs_epu8(b: p0)._mm256_or_si256(b: p0._mm256_subs_epu8(b: p1)) + mask = mask._mm256_and_si256(b: t1._mm256_subs_epu8(b: m_ithresh)._mm256_cmpeq_epi8(b: zero)) + t1 = q0._mm256_subs_epu8(b: q1)._mm256_or_si256(b: q1._mm256_subs_epu8(b: q0)) + mask = mask._mm256_and_si256(b: t1._mm256_subs_epu8(b: m_ithresh)._mm256_cmpeq_epi8(b: zero)) + t1 = q1._mm256_subs_epu8(b: q2)._mm256_or_si256(b: q2._mm256_subs_epu8(b: q1)) + mask = mask._mm256_and_si256(b: t1._mm256_subs_epu8(b: m_ithresh)._mm256_cmpeq_epi8(b: zero)) + t1 = q2._mm256_subs_epu8(b: q3)._mm256_or_si256(b: q3._mm256_subs_epu8(b: q2)) + mask = mask._mm256_and_si256(b: t1._mm256_subs_epu8(b: m_ithresh)._mm256_cmpeq_epi8(b: zero)) + + // ---- GetNotHEV ---- + t1 = p1._mm256_subs_epu8(b: p0)._mm256_or_si256(b: p0._mm256_subs_epu8(b: p1)) + t2 = q1._mm256_subs_epu8(b: 
q0)._mm256_or_si256(b: q0._mm256_subs_epu8(b: q1)) + t3 = t1._mm256_subs_epu8(b: m_hthresh)._mm256_or_si256(b: t2._mm256_subs_epu8(b: m_hthresh)) + not_hev = t3._mm256_cmpeq_epi8(b: zero) + + // ---- Convert to signed ---- + p2 = p2._mm256_xor_si256(b: sign_bit) + p1 = p1._mm256_xor_si256(b: sign_bit) + p0 = p0._mm256_xor_si256(b: sign_bit) + q0 = q0._mm256_xor_si256(b: sign_bit) + q1 = q1._mm256_xor_si256(b: sign_bit) + q2 = q2._mm256_xor_si256(b: sign_bit) + + // ---- Delta ---- + t1 = p1._mm256_subs_epi8(b: q1) + t2 = q0._mm256_subs_epi8(b: p0) + t1 = t1._mm256_adds_epi8(b: t2) + t1 = t1._mm256_adds_epi8(b: t2) + delta = t1._mm256_adds_epi8(b: t2) + delta = delta._mm256_and_si256(b: mask) + + // ---- Filter2 (HEV path) ---- + v4 = delta._mm256_adds_epi8(b: k4) + lo = zero._mm256_unpacklo_epi8(b: v4) + hi = zero._mm256_unpackhi_epi8(b: v4) + lo = lo._mm256_srai_epi16(imm8: 11) + hi = hi._mm256_srai_epi16(imm8: 11) + v4 = lo._mm256_packs_epi16(b: hi) + + v3 = delta._mm256_adds_epi8(b: k3) + lo = zero._mm256_unpacklo_epi8(b: v3) + hi = zero._mm256_unpackhi_epi8(b: v3) + lo = lo._mm256_srai_epi16(imm8: 11) + hi = hi._mm256_srai_epi16(imm8: 11) + v3 = lo._mm256_packs_epi16(b: hi) + + // ---- Filter6 (!HEV path) ---- + d_lo = zero._mm256_unpacklo_epi8(b: delta)._mm256_srai_epi16(imm8: 8) + d_hi = zero._mm256_unpackhi_epi8(b: delta)._mm256_srai_epi16(imm8: 8) + + lo = d_lo._mm256_mullo_epi16(b: k27)._mm256_add_epi16(b: k63)._mm256_srai_epi16(imm8: 7) + hi = d_hi._mm256_mullo_epi16(b: k27)._mm256_add_epi16(b: k63)._mm256_srai_epi16(imm8: 7) + a1 = lo._mm256_packs_epi16(b: hi) + + lo = d_lo._mm256_mullo_epi16(b: k18)._mm256_add_epi16(b: k63)._mm256_srai_epi16(imm8: 7) + hi = d_hi._mm256_mullo_epi16(b: k18)._mm256_add_epi16(b: k63)._mm256_srai_epi16(imm8: 7) + a2 = lo._mm256_packs_epi16(b: hi) + + lo = d_lo._mm256_mullo_epi16(b: k9)._mm256_add_epi16(b: k63)._mm256_srai_epi16(imm8: 7) + hi = d_hi._mm256_mullo_epi16(b: k9)._mm256_add_epi16(b: k63)._mm256_srai_epi16(imm8: 7) 
+ a3 = lo._mm256_packs_epi16(b: hi) + + // ---- Merge HEV/!HEV ---- + p0_adj = not_hev._mm256_andnot_si256(b: v3)._mm256_or_si256(b: a1._mm256_and_si256(b: not_hev)) + p0 = p0._mm256_adds_epi8(b: p0_adj) + q0_adj = not_hev._mm256_andnot_si256(b: v4)._mm256_or_si256(b: a1._mm256_and_si256(b: not_hev)) + q0 = q0._mm256_subs_epi8(b: q0_adj) + p1 = p1._mm256_adds_epi8(b: a2._mm256_and_si256(b: not_hev)) + q1 = q1._mm256_subs_epi8(b: a2._mm256_and_si256(b: not_hev)) + p2 = p2._mm256_adds_epi8(b: a3._mm256_and_si256(b: not_hev)) + q2 = q2._mm256_subs_epi8(b: a3._mm256_and_si256(b: not_hev)) + + // ---- Convert back to unsigned ---- + p2 = p2._mm256_xor_si256(b: sign_bit) + p1 = p1._mm256_xor_si256(b: sign_bit) + p0 = p0._mm256_xor_si256(b: sign_bit) + q0 = q0._mm256_xor_si256(b: sign_bit) + q1 = q1._mm256_xor_si256(b: sign_bit) + q2 = q2._mm256_xor_si256(b: sign_bit) + + // ==== Reverse transpose ==== + f0 = p3._mm256_unpacklo_epi8(b: p2) + f1 = p1._mm256_unpacklo_epi8(b: p0) + f2 = q0._mm256_unpacklo_epi8(b: q1) + f3 = q2._mm256_unpacklo_epi8(b: q3) + + g0 = f0._mm256_unpacklo_epi16(b: f1) + g1 = f0._mm256_unpackhi_epi16(b: f1) + g2 = f2._mm256_unpacklo_epi16(b: f3) + g3 = f2._mm256_unpackhi_epi16(b: f3) + + f0 = g0._mm256_unpacklo_epi32(b: g2) + f1 = g0._mm256_unpackhi_epi32(b: g2) + f2 = g1._mm256_unpacklo_epi32(b: g3) + f3 = g1._mm256_unpackhi_epi32(b: g3) + + // ==== Store 8 rows of 8 bytes to each plane ==== + if (args.u_off - 4) > args.workbuf.length() { + return nothing + } + u_wb = args.workbuf[(args.u_off - 4) ..] + if (args.v_off - 4) > args.workbuf.length() { + return nothing + } + v_wb = args.workbuf[(args.v_off - 4) ..] + + // Rows 0,1 + u_128 = f0._mm256_castsi256_si128() + v_128 = f0._mm256_extracti128_si256(imm8: 1) + if 8 <= u_wb.length() { + u_128.store_slice64!(a: u_wb[.. 8]) + } + if 8 <= v_wb.length() { + v_128.store_slice64!(a: v_wb[.. 8]) + } + if stride <= u_wb.length() { + u_wb = u_wb[stride ..] 
+ } + if stride <= v_wb.length() { + v_wb = v_wb[stride ..] + } + u_128 = f0._mm256_castsi256_si128()._mm_unpackhi_epi64(b: f0._mm256_castsi256_si128()) + v_128 = f0._mm256_extracti128_si256(imm8: 1)._mm_unpackhi_epi64(b: f0._mm256_extracti128_si256(imm8: 1)) + if 8 <= u_wb.length() { + u_128.store_slice64!(a: u_wb[.. 8]) + } + if 8 <= v_wb.length() { + v_128.store_slice64!(a: v_wb[.. 8]) + } + if stride <= u_wb.length() { + u_wb = u_wb[stride ..] + } + if stride <= v_wb.length() { + v_wb = v_wb[stride ..] + } + + // Rows 2,3 + u_128 = f1._mm256_castsi256_si128() + v_128 = f1._mm256_extracti128_si256(imm8: 1) + if 8 <= u_wb.length() { + u_128.store_slice64!(a: u_wb[.. 8]) + } + if 8 <= v_wb.length() { + v_128.store_slice64!(a: v_wb[.. 8]) + } + if stride <= u_wb.length() { + u_wb = u_wb[stride ..] + } + if stride <= v_wb.length() { + v_wb = v_wb[stride ..] + } + u_128 = f1._mm256_castsi256_si128()._mm_unpackhi_epi64(b: f1._mm256_castsi256_si128()) + v_128 = f1._mm256_extracti128_si256(imm8: 1)._mm_unpackhi_epi64(b: f1._mm256_extracti128_si256(imm8: 1)) + if 8 <= u_wb.length() { + u_128.store_slice64!(a: u_wb[.. 8]) + } + if 8 <= v_wb.length() { + v_128.store_slice64!(a: v_wb[.. 8]) + } + if stride <= u_wb.length() { + u_wb = u_wb[stride ..] + } + if stride <= v_wb.length() { + v_wb = v_wb[stride ..] + } + + // Rows 4,5 + u_128 = f2._mm256_castsi256_si128() + v_128 = f2._mm256_extracti128_si256(imm8: 1) + if 8 <= u_wb.length() { + u_128.store_slice64!(a: u_wb[.. 8]) + } + if 8 <= v_wb.length() { + v_128.store_slice64!(a: v_wb[.. 8]) + } + if stride <= u_wb.length() { + u_wb = u_wb[stride ..] + } + if stride <= v_wb.length() { + v_wb = v_wb[stride ..] + } + u_128 = f2._mm256_castsi256_si128()._mm_unpackhi_epi64(b: f2._mm256_castsi256_si128()) + v_128 = f2._mm256_extracti128_si256(imm8: 1)._mm_unpackhi_epi64(b: f2._mm256_extracti128_si256(imm8: 1)) + if 8 <= u_wb.length() { + u_128.store_slice64!(a: u_wb[.. 
8]) + } + if 8 <= v_wb.length() { + v_128.store_slice64!(a: v_wb[.. 8]) + } + if stride <= u_wb.length() { + u_wb = u_wb[stride ..] + } + if stride <= v_wb.length() { + v_wb = v_wb[stride ..] + } + + // Rows 6,7 + u_128 = f3._mm256_castsi256_si128() + v_128 = f3._mm256_extracti128_si256(imm8: 1) + if 8 <= u_wb.length() { + u_128.store_slice64!(a: u_wb[.. 8]) + } + if 8 <= v_wb.length() { + v_128.store_slice64!(a: v_wb[.. 8]) + } + if stride <= u_wb.length() { + u_wb = u_wb[stride ..] + } + if stride <= v_wb.length() { + v_wb = v_wb[stride ..] + } + u_128 = f3._mm256_castsi256_si128()._mm_unpackhi_epi64(b: f3._mm256_castsi256_si128()) + v_128 = f3._mm256_extracti128_si256(imm8: 1)._mm_unpackhi_epi64(b: f3._mm256_extracti128_si256(imm8: 1)) + if 8 <= u_wb.length() { + u_128.store_slice64!(a: u_wb[.. 8]) + } + if 8 <= v_wb.length() { + v_128.store_slice64!(a: v_wb[.. 8]) + } +} + +// ============================================================================ +// normal_hfilter_inner_uv: filter4 for vertical inner sub-block edges (U+V) +// ============================================================================ + +pri func decoder.normal_hfilter_inner_uv_x86_avx2!(workbuf: slice base.u8, + u_off: base.u64, v_off: base.u64, + level: base.u32, ilevel: base.u32, hlevel: base.u32), + choose cpu_arch >= x86_avx2, +{ + var sse_util : base.x86_sse42_utility + var util : base.x86_avx2_utility + var u_wb : slice base.u8 + var v_wb : slice base.u8 + var stride : base.u64 + var u_ra : base.x86_m128i + var u_rb : base.x86_m128i + var v_ra : base.x86_m128i + var v_rb : base.x86_m128i + var u_128 : base.x86_m128i + var v_128 : base.x86_m128i + + var f0 : base.x86_m256i + var f1 : base.x86_m256i + var f2 : base.x86_m256i + var f3 : base.x86_m256i + var g0 : base.x86_m256i + var g1 : base.x86_m256i + var g2 : base.x86_m256i + var g3 : base.x86_m256i + + var p3 : base.x86_m256i + var p2 : base.x86_m256i + var p1 : base.x86_m256i + var p0 : base.x86_m256i + var q0 : base.x86_m256i 
+ var q1 : base.x86_m256i + var q2 : base.x86_m256i + var q3 : base.x86_m256i + + var zero : base.x86_m256i + var sign_bit : base.x86_m256i + var kFE : base.x86_m256i + var m_thresh : base.x86_m256i + var m_ithresh : base.x86_m256i + var m_hthresh : base.x86_m256i + var k1 : base.x86_m256i + var k3 : base.x86_m256i + var k4 : base.x86_m256i + + var mask : base.x86_m256i + var not_hev : base.x86_m256i + var delta : base.x86_m256i + var v3 : base.x86_m256i + var v4 : base.x86_m256i + var a3 : base.x86_m256i + var t1 : base.x86_m256i + var t2 : base.x86_m256i + var t3 : base.x86_m256i + var lo : base.x86_m256i + var hi : base.x86_m256i + var ra : base.x86_m256i + + stride = this.uv_stride as base.u64 + + if (args.u_off < 4) or (args.v_off < 4) { + return nothing + } + u_wb = args.workbuf + if (args.u_off - 4) > u_wb.length() { + return nothing + } + u_wb = u_wb[(args.u_off - 4) ..] + + v_wb = args.workbuf + if (args.v_off - 4) > v_wb.length() { + return nothing + } + v_wb = v_wb[(args.v_off - 4) ..] + + // ==== Load + forward transpose ==== + + // Rows 0,1 + if (8 > u_wb.length()) or (8 > v_wb.length()) { + return nothing + } + u_ra = sse_util.make_m128i_slice64(a: u_wb[.. 8]) + v_ra = sse_util.make_m128i_slice64(a: v_wb[.. 8]) + if (stride > u_wb.length()) or (stride > v_wb.length()) { + return nothing + } + u_wb = u_wb[stride ..] + v_wb = v_wb[stride ..] + if (8 > u_wb.length()) or (8 > v_wb.length()) { + return nothing + } + u_rb = sse_util.make_m128i_slice64(a: u_wb[.. 8]) + v_rb = sse_util.make_m128i_slice64(a: v_wb[.. 8]) + if (stride > u_wb.length()) or (stride > v_wb.length()) { + return nothing + } + u_wb = u_wb[stride ..] + v_wb = v_wb[stride ..] 
+ ra = u_ra._mm256_castsi128_si256()._mm256_inserti128_si256(b: v_ra, imm8: 1) + t1 = u_rb._mm256_castsi128_si256()._mm256_inserti128_si256(b: v_rb, imm8: 1) + f0 = ra._mm256_unpacklo_epi8(b: t1) + + // Rows 2,3 + if (8 > u_wb.length()) or (8 > v_wb.length()) { + return nothing + } + u_ra = sse_util.make_m128i_slice64(a: u_wb[.. 8]) + v_ra = sse_util.make_m128i_slice64(a: v_wb[.. 8]) + if (stride > u_wb.length()) or (stride > v_wb.length()) { + return nothing + } + u_wb = u_wb[stride ..] + v_wb = v_wb[stride ..] + if (8 > u_wb.length()) or (8 > v_wb.length()) { + return nothing + } + u_rb = sse_util.make_m128i_slice64(a: u_wb[.. 8]) + v_rb = sse_util.make_m128i_slice64(a: v_wb[.. 8]) + if (stride > u_wb.length()) or (stride > v_wb.length()) { + return nothing + } + u_wb = u_wb[stride ..] + v_wb = v_wb[stride ..] + ra = u_ra._mm256_castsi128_si256()._mm256_inserti128_si256(b: v_ra, imm8: 1) + t1 = u_rb._mm256_castsi128_si256()._mm256_inserti128_si256(b: v_rb, imm8: 1) + f1 = ra._mm256_unpacklo_epi8(b: t1) + + // Rows 4,5 + if (8 > u_wb.length()) or (8 > v_wb.length()) { + return nothing + } + u_ra = sse_util.make_m128i_slice64(a: u_wb[.. 8]) + v_ra = sse_util.make_m128i_slice64(a: v_wb[.. 8]) + if (stride > u_wb.length()) or (stride > v_wb.length()) { + return nothing + } + u_wb = u_wb[stride ..] + v_wb = v_wb[stride ..] + if (8 > u_wb.length()) or (8 > v_wb.length()) { + return nothing + } + u_rb = sse_util.make_m128i_slice64(a: u_wb[.. 8]) + v_rb = sse_util.make_m128i_slice64(a: v_wb[.. 8]) + if (stride > u_wb.length()) or (stride > v_wb.length()) { + return nothing + } + u_wb = u_wb[stride ..] + v_wb = v_wb[stride ..] + ra = u_ra._mm256_castsi128_si256()._mm256_inserti128_si256(b: v_ra, imm8: 1) + t1 = u_rb._mm256_castsi128_si256()._mm256_inserti128_si256(b: v_rb, imm8: 1) + f2 = ra._mm256_unpacklo_epi8(b: t1) + + // Rows 6,7 + if (8 > u_wb.length()) or (8 > v_wb.length()) { + return nothing + } + u_ra = sse_util.make_m128i_slice64(a: u_wb[.. 
8]) + v_ra = sse_util.make_m128i_slice64(a: v_wb[.. 8]) + if stride > u_wb.length() { + return nothing + } + u_wb = u_wb[stride ..] + if stride > v_wb.length() { + return nothing + } + v_wb = v_wb[stride ..] + if (8 > u_wb.length()) or (8 > v_wb.length()) { + return nothing + } + u_rb = sse_util.make_m128i_slice64(a: u_wb[.. 8]) + v_rb = sse_util.make_m128i_slice64(a: v_wb[.. 8]) + ra = u_ra._mm256_castsi128_si256()._mm256_inserti128_si256(b: v_ra, imm8: 1) + t1 = u_rb._mm256_castsi128_si256()._mm256_inserti128_si256(b: v_rb, imm8: 1) + f3 = ra._mm256_unpacklo_epi8(b: t1) + + // Transpose phases 2,3 + g0 = f0._mm256_unpacklo_epi16(b: f1) + g1 = f0._mm256_unpackhi_epi16(b: f1) + g2 = f2._mm256_unpacklo_epi16(b: f3) + g3 = f2._mm256_unpackhi_epi16(b: f3) + + f0 = g0._mm256_unpacklo_epi32(b: g2) + f1 = g0._mm256_unpackhi_epi32(b: g2) + f2 = g1._mm256_unpacklo_epi32(b: g3) + f3 = g1._mm256_unpackhi_epi32(b: g3) + + // Extract columns + p3 = f0 + t1 = f0 + p2 = t1._mm256_unpackhi_epi64(b: t1) + p1 = f1 + t1 = f1 + p0 = t1._mm256_unpackhi_epi64(b: t1) + q0 = f2 + t1 = f2 + q1 = t1._mm256_unpackhi_epi64(b: t1) + q2 = f3 + t1 = f3 + q3 = t1._mm256_unpackhi_epi64(b: t1) + + // ==== Filter4 computation ==== + + zero = util.make_m256i_zeroes() + sign_bit = util.make_m256i_repeat_u8(a: 0x80) + kFE = util.make_m256i_repeat_u8(a: 0xFE) + m_thresh = util.make_m256i_repeat_u8(a: (args.level & 0xFF) as base.u8) + m_ithresh = util.make_m256i_repeat_u8(a: (args.ilevel & 0xFF) as base.u8) + m_hthresh = util.make_m256i_repeat_u8(a: (args.hlevel & 0xFF) as base.u8) + k1 = util.make_m256i_repeat_u8(a: 1) + k3 = util.make_m256i_repeat_u8(a: 3) + k4 = util.make_m256i_repeat_u8(a: 4) + + // ---- NeedsFilter + NeedsFilter2 ---- + t1 = p1._mm256_subs_epu8(b: q1)._mm256_or_si256(b: q1._mm256_subs_epu8(b: p1)) + t2 = t1._mm256_and_si256(b: kFE)._mm256_srli_epi16(imm8: 1) + t3 = p0._mm256_subs_epu8(b: q0)._mm256_or_si256(b: q0._mm256_subs_epu8(b: p0)) + t3 = t3._mm256_adds_epu8(b: t3) + t3 = 
t3._mm256_adds_epu8(b: t2) + mask = t3._mm256_subs_epu8(b: m_thresh)._mm256_cmpeq_epi8(b: zero) + + t1 = p3._mm256_subs_epu8(b: p2)._mm256_or_si256(b: p2._mm256_subs_epu8(b: p3)) + mask = mask._mm256_and_si256(b: t1._mm256_subs_epu8(b: m_ithresh)._mm256_cmpeq_epi8(b: zero)) + t1 = p2._mm256_subs_epu8(b: p1)._mm256_or_si256(b: p1._mm256_subs_epu8(b: p2)) + mask = mask._mm256_and_si256(b: t1._mm256_subs_epu8(b: m_ithresh)._mm256_cmpeq_epi8(b: zero)) + t1 = p1._mm256_subs_epu8(b: p0)._mm256_or_si256(b: p0._mm256_subs_epu8(b: p1)) + mask = mask._mm256_and_si256(b: t1._mm256_subs_epu8(b: m_ithresh)._mm256_cmpeq_epi8(b: zero)) + t1 = q0._mm256_subs_epu8(b: q1)._mm256_or_si256(b: q1._mm256_subs_epu8(b: q0)) + mask = mask._mm256_and_si256(b: t1._mm256_subs_epu8(b: m_ithresh)._mm256_cmpeq_epi8(b: zero)) + t1 = q1._mm256_subs_epu8(b: q2)._mm256_or_si256(b: q2._mm256_subs_epu8(b: q1)) + mask = mask._mm256_and_si256(b: t1._mm256_subs_epu8(b: m_ithresh)._mm256_cmpeq_epi8(b: zero)) + t1 = q2._mm256_subs_epu8(b: q3)._mm256_or_si256(b: q3._mm256_subs_epu8(b: q2)) + mask = mask._mm256_and_si256(b: t1._mm256_subs_epu8(b: m_ithresh)._mm256_cmpeq_epi8(b: zero)) + + // ---- GetNotHEV ---- + t1 = p1._mm256_subs_epu8(b: p0)._mm256_or_si256(b: p0._mm256_subs_epu8(b: p1)) + t2 = q1._mm256_subs_epu8(b: q0)._mm256_or_si256(b: q0._mm256_subs_epu8(b: q1)) + t3 = t1._mm256_subs_epu8(b: m_hthresh)._mm256_or_si256(b: t2._mm256_subs_epu8(b: m_hthresh)) + not_hev = t3._mm256_cmpeq_epi8(b: zero) + + // ---- Convert to signed ---- + p1 = p1._mm256_xor_si256(b: sign_bit) + p0 = p0._mm256_xor_si256(b: sign_bit) + q0 = q0._mm256_xor_si256(b: sign_bit) + q1 = q1._mm256_xor_si256(b: sign_bit) + + // ---- Combined delta ---- + t1 = p1._mm256_subs_epi8(b: q1) + t1 = not_hev._mm256_andnot_si256(b: t1) + t2 = q0._mm256_subs_epi8(b: p0) + t1 = t1._mm256_adds_epi8(b: t2) + t1 = t1._mm256_adds_epi8(b: t2) + delta = t1._mm256_adds_epi8(b: t2) + delta = delta._mm256_and_si256(b: mask) + + // ---- v4, v3 ---- + v4 
= delta._mm256_adds_epi8(b: k4) + lo = zero._mm256_unpacklo_epi8(b: v4) + hi = zero._mm256_unpackhi_epi8(b: v4) + lo = lo._mm256_srai_epi16(imm8: 11) + hi = hi._mm256_srai_epi16(imm8: 11) + v4 = lo._mm256_packs_epi16(b: hi) + + v3 = delta._mm256_adds_epi8(b: k3) + lo = zero._mm256_unpacklo_epi8(b: v3) + hi = zero._mm256_unpackhi_epi8(b: v3) + lo = lo._mm256_srai_epi16(imm8: 11) + hi = hi._mm256_srai_epi16(imm8: 11) + v3 = lo._mm256_packs_epi16(b: hi) + + q0 = q0._mm256_subs_epi8(b: v4) + p0 = p0._mm256_adds_epi8(b: v3) + + // ---- Filter4 !HEV: a3 ---- + a3 = v4._mm256_adds_epi8(b: k1) + lo = zero._mm256_unpacklo_epi8(b: a3) + hi = zero._mm256_unpackhi_epi8(b: a3) + lo = lo._mm256_srai_epi16(imm8: 9) + hi = hi._mm256_srai_epi16(imm8: 9) + a3 = lo._mm256_packs_epi16(b: hi) + a3 = a3._mm256_and_si256(b: not_hev) + + q1 = q1._mm256_subs_epi8(b: a3) + p1 = p1._mm256_adds_epi8(b: a3) + + // ---- Convert back to unsigned ---- + p1 = p1._mm256_xor_si256(b: sign_bit) + p0 = p0._mm256_xor_si256(b: sign_bit) + q0 = q0._mm256_xor_si256(b: sign_bit) + q1 = q1._mm256_xor_si256(b: sign_bit) + + // ==== Reverse transpose (only p1, p0, q0, q1 modified — but we need to + // write all 8 columns back because the transpose mixes them) ==== + f0 = p3._mm256_unpacklo_epi8(b: p2) + f1 = p1._mm256_unpacklo_epi8(b: p0) + f2 = q0._mm256_unpacklo_epi8(b: q1) + f3 = q2._mm256_unpacklo_epi8(b: q3) + + g0 = f0._mm256_unpacklo_epi16(b: f1) + g1 = f0._mm256_unpackhi_epi16(b: f1) + g2 = f2._mm256_unpacklo_epi16(b: f3) + g3 = f2._mm256_unpackhi_epi16(b: f3) + + f0 = g0._mm256_unpacklo_epi32(b: g2) + f1 = g0._mm256_unpackhi_epi32(b: g2) + f2 = g1._mm256_unpacklo_epi32(b: g3) + f3 = g1._mm256_unpackhi_epi32(b: g3) + + // ==== Store ==== + if (args.u_off - 4) > args.workbuf.length() { + return nothing + } + u_wb = args.workbuf[(args.u_off - 4) ..] + if (args.v_off - 4) > args.workbuf.length() { + return nothing + } + v_wb = args.workbuf[(args.v_off - 4) ..] 
+ + // Rows 0,1 + u_128 = f0._mm256_castsi256_si128() + v_128 = f0._mm256_extracti128_si256(imm8: 1) + if 8 <= u_wb.length() { + u_128.store_slice64!(a: u_wb[.. 8]) + } + if 8 <= v_wb.length() { + v_128.store_slice64!(a: v_wb[.. 8]) + } + if stride <= u_wb.length() { + u_wb = u_wb[stride ..] + } + if stride <= v_wb.length() { + v_wb = v_wb[stride ..] + } + u_128 = f0._mm256_castsi256_si128()._mm_unpackhi_epi64(b: f0._mm256_castsi256_si128()) + v_128 = f0._mm256_extracti128_si256(imm8: 1)._mm_unpackhi_epi64(b: f0._mm256_extracti128_si256(imm8: 1)) + if 8 <= u_wb.length() { + u_128.store_slice64!(a: u_wb[.. 8]) + } + if 8 <= v_wb.length() { + v_128.store_slice64!(a: v_wb[.. 8]) + } + if stride <= u_wb.length() { + u_wb = u_wb[stride ..] + } + if stride <= v_wb.length() { + v_wb = v_wb[stride ..] + } + + // Rows 2,3 + u_128 = f1._mm256_castsi256_si128() + v_128 = f1._mm256_extracti128_si256(imm8: 1) + if 8 <= u_wb.length() { + u_128.store_slice64!(a: u_wb[.. 8]) + } + if 8 <= v_wb.length() { + v_128.store_slice64!(a: v_wb[.. 8]) + } + if stride <= u_wb.length() { + u_wb = u_wb[stride ..] + } + if stride <= v_wb.length() { + v_wb = v_wb[stride ..] + } + u_128 = f1._mm256_castsi256_si128()._mm_unpackhi_epi64(b: f1._mm256_castsi256_si128()) + v_128 = f1._mm256_extracti128_si256(imm8: 1)._mm_unpackhi_epi64(b: f1._mm256_extracti128_si256(imm8: 1)) + if 8 <= u_wb.length() { + u_128.store_slice64!(a: u_wb[.. 8]) + } + if 8 <= v_wb.length() { + v_128.store_slice64!(a: v_wb[.. 8]) + } + if stride <= u_wb.length() { + u_wb = u_wb[stride ..] + } + if stride <= v_wb.length() { + v_wb = v_wb[stride ..] + } + + // Rows 4,5 + u_128 = f2._mm256_castsi256_si128() + v_128 = f2._mm256_extracti128_si256(imm8: 1) + if 8 <= u_wb.length() { + u_128.store_slice64!(a: u_wb[.. 8]) + } + if 8 <= v_wb.length() { + v_128.store_slice64!(a: v_wb[.. 8]) + } + if stride <= u_wb.length() { + u_wb = u_wb[stride ..] + } + if stride <= v_wb.length() { + v_wb = v_wb[stride ..] 
+ } + u_128 = f2._mm256_castsi256_si128()._mm_unpackhi_epi64(b: f2._mm256_castsi256_si128()) + v_128 = f2._mm256_extracti128_si256(imm8: 1)._mm_unpackhi_epi64(b: f2._mm256_extracti128_si256(imm8: 1)) + if 8 <= u_wb.length() { + u_128.store_slice64!(a: u_wb[.. 8]) + } + if 8 <= v_wb.length() { + v_128.store_slice64!(a: v_wb[.. 8]) + } + if stride <= u_wb.length() { + u_wb = u_wb[stride ..] + } + if stride <= v_wb.length() { + v_wb = v_wb[stride ..] + } + + // Rows 6,7 + u_128 = f3._mm256_castsi256_si128() + v_128 = f3._mm256_extracti128_si256(imm8: 1) + if 8 <= u_wb.length() { + u_128.store_slice64!(a: u_wb[.. 8]) + } + if 8 <= v_wb.length() { + v_128.store_slice64!(a: v_wb[.. 8]) + } + if stride <= u_wb.length() { + u_wb = u_wb[stride ..] + } + if stride <= v_wb.length() { + v_wb = v_wb[stride ..] + } + u_128 = f3._mm256_castsi256_si128()._mm_unpackhi_epi64(b: f3._mm256_castsi256_si128()) + v_128 = f3._mm256_extracti128_si256(imm8: 1)._mm_unpackhi_epi64(b: f3._mm256_extracti128_si256(imm8: 1)) + if 8 <= u_wb.length() { + u_128.store_slice64!(a: u_wb[.. 8]) + } + if 8 <= v_wb.length() { + v_128.store_slice64!(a: v_wb[.. 8]) + } +} diff --git a/std/vp8/decode_filter_x86_sse42.wuffs b/std/vp8/decode_filter_x86_sse42.wuffs new file mode 100644 index 000000000..ac8f645c0 --- /dev/null +++ b/std/vp8/decode_filter_x86_sse42.wuffs @@ -0,0 +1,2922 @@ +// Copyright 2024 The Wuffs Authors. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. +// +// SPDX-License-Identifier: Apache-2.0 OR MIT + +// VP8 simple loop filter, SSE4.2 version. +// +// Filters 16 contiguous pixels at a horizontal edge in parallel. +// Algorithm from libwebp (DoFilter2_SSE2): +// 1. NeedsFilter: abs(p0-q0)*2 + abs(p1-q1)/2 <= limit (per-byte mask) +// 2. Convert u8 to i8 (XOR with 0x80) +// 3. GetBaseDelta: delta = sat_i8(3*(q0-p0) + sat_i8(p1-q1)) +// 4. 
Mask delta: delta &= mask
+// 5. DoSimpleFilter: v4 = signed_byte_shr3(delta+4),
+//    v3 = signed_byte_shr3(delta+3), q0 -= v4, p0 += v3 (saturating i8)
+// 6. Convert back to u8
+
+// simple_vfilter_16_x86_sse42 runs the steps listed above on one edge: it
+// reads the p1, p0, q0 and q1 rows (16 bytes each) and writes back p0 and q0.
+//
+//  - workbuf: buffer containing the rows.
+//  - q0_off: offset of the q0 row in workbuf. p1 is read from
+//    q0_off - 2*stride, p0 from q0_off - stride and q1 from q0_off + stride,
+//    where stride is this.y_stride.
+//  - limit: NeedsFilter threshold. Only its low 8 bits are used.
+//
+// The function returns early when stride < 16 or when a row it needs does
+// not fit inside workbuf.
+pri func decoder.simple_vfilter_16_x86_sse42!(workbuf: slice base.u8, q0_off: base.u64, limit: base.u32),
+        choose cpu_arch >= x86_sse42,
+{
+    var util : base.x86_sse42_utility
+    var wb : slice base.u8
+    var stride : base.u64
+
+    // SSE registers: the four input rows, constants and scratch values.
+    var p1 : base.x86_m128i
+    var p0 : base.x86_m128i
+    var q0 : base.x86_m128i
+    var q1 : base.x86_m128i
+    var sign_bit : base.x86_m128i
+    var zero : base.x86_m128i
+    var kFE : base.x86_m128i
+    var m_thresh : base.x86_m128i
+    var k3 : base.x86_m128i
+    var k4 : base.x86_m128i
+    var mask : base.x86_m128i
+    var t1 : base.x86_m128i
+    var t2 : base.x86_m128i
+    var t3 : base.x86_m128i
+    var delta : base.x86_m128i
+    var v3 : base.x86_m128i
+    var v4 : base.x86_m128i
+    var lo : base.x86_m128i
+    var hi : base.x86_m128i
+    var p1s : base.x86_m128i
+    var q1s : base.x86_m128i
+
+    stride = this.y_stride as base.u64
+    // Every row access below is 16 bytes wide, so a smaller stride is rejected.
+    if stride < 16 {
+        return nothing
+    }
+    // Proof step: restate the fact as 16 <= stride so that the later
+    // "a <= c; c <= b" assertions can chain through stride.
+    assert 16 <= stride via "a <= b: b >= a"()
+
+    // Need at least 2*stride bytes before q0_off for p1 and p0 rows.
+    if args.q0_off < (2 * stride) {
+        return nothing
+    }
+
+    // Use wb for loads (progressive reslicing).
+    // Use args.workbuf (unmodified until store phase) for stores.
+    wb = args.workbuf
+
+    // Reslice wb to p1 position: q0_off - 2*stride.
+    if (args.q0_off - (2 * stride)) <= wb.length() {
+        wb = wb[args.q0_off - (2 * stride) ..]
+    } else {
+        return nothing
+    }
+
+    // Load p1 row (16 contiguous bytes).
+    // Checking for a whole stride (not just 16 bytes) also makes the
+    // wb[stride ..] advance below in bounds.
+    if stride > wb.length() {
+        return nothing
+    }
+    assert 16 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride)
+    p1 = util.make_m128i_slice128(a: wb[.. 16])
+
+    // Advance by stride to p0 row.
+    wb = wb[stride ..]
+
+    // Load p0 row.
+    if stride > wb.length() {
+        return nothing
+    }
+    assert 16 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride)
+    p0 = util.make_m128i_slice128(a: wb[.. 16])
+
+    // Advance by stride to q0 row.
+    wb = wb[stride ..]
+
+    // Load q0 row.
+    if stride > wb.length() {
+        return nothing
+    }
+    assert 16 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride)
+    q0 = util.make_m128i_slice128(a: wb[.. 16])
+
+    // Advance by stride to q1 row.
+    wb = wb[stride ..]
+
+    // Load q1 row (last row, no stride advance).
+    if 16 > wb.length() {
+        return nothing
+    }
+    q1 = util.make_m128i_slice128(a: wb[.. 16])
+
+    // Set up constants. m_thresh is built from the low 8 bits of args.limit.
+    zero = util.make_m128i_zeroes()
+    sign_bit = util.make_m128i_repeat_u8(a: 0x80)
+    kFE = util.make_m128i_repeat_u8(a: 0xFE)
+    m_thresh = util.make_m128i_repeat_u8(a: (args.limit & 0xFF) as base.u8)
+    k3 = util.make_m128i_repeat_u8(a: 3)
+    k4 = util.make_m128i_repeat_u8(a: 4)
+
+    // ---- NeedsFilter: abs(p0-q0)*2 + abs(p1-q1)/2 <= limit ----
+    // abs(a-b) for u8: or(subs_epu8(a,b), subs_epu8(b,a))
+    t1 = p1._mm_subs_epu8(b: q1)._mm_or_si128(b: q1._mm_subs_epu8(b: p1))
+    // abs(p1-q1) / 2: clear LSB then shift right 1 (safe across byte boundaries
+    // because the cleared bit prevents cross-byte contamination in 16-bit shift).
+    t2 = t1._mm_and_si128(b: kFE)
+    t2 = t2._mm_srli_epi16(imm8: 1)
+    // abs(p0-q0) * 2
+    t3 = p0._mm_subs_epu8(b: q0)._mm_or_si128(b: q0._mm_subs_epu8(b: p0))
+    t3 = t3._mm_adds_epu8(b: t3)
+    // abs(p0-q0)*2 + abs(p1-q1)/2
+    t3 = t3._mm_adds_epu8(b: t2)
+    // mask: 0xFF where value <= thresh, 0x00 where > thresh.
+    mask = t3._mm_subs_epu8(b: m_thresh)
+    mask = mask._mm_cmpeq_epi8(b: zero)
+
+    // ---- GetBaseDelta (i8 domain) ----
+    // Convert to signed by XOR with 0x80.
+    // p1 and q1 are not written back, so their signed forms go in p1s and q1s;
+    // p0 and q0 are converted in place and flipped back before the store.
+    p1s = p1._mm_xor_si128(b: sign_bit)
+    q1s = q1._mm_xor_si128(b: sign_bit)
+    p0 = p0._mm_xor_si128(b: sign_bit)
+    q0 = q0._mm_xor_si128(b: sign_bit)
+
+    // delta = sat_i8(3*(q0-p0) + sat_i8(p1-q1))
+    // Accumulate via saturating adds to avoid overflow.
+    t1 = p1s._mm_subs_epi8(b: q1s)  // p1 - q1 (saturating i8)
+    t2 = q0._mm_subs_epi8(b: p0)  // q0 - p0
+    t1 = t1._mm_adds_epi8(b: t2)  // (p1-q1) + 1*(q0-p0)
+    t1 = t1._mm_adds_epi8(b: t2)  // (p1-q1) + 2*(q0-p0)
+    delta = t1._mm_adds_epi8(b: t2)  // (p1-q1) + 3*(q0-p0)
+
+    // Mask: only apply filter where NeedsFilter passed.
+    delta = delta._mm_and_si128(b: mask)
+
+    // ---- DoSimpleFilter ----
+    // v4 = signed_byte_shr3(delta + 4)
+    v4 = delta._mm_adds_epi8(b: k4)
+    // Signed byte shift right by 3: place bytes in high byte of i16 words,
+    // arithmetic shift right by 11 (= 8 positioning + 3 actual), pack back.
+    lo = zero._mm_unpacklo_epi8(b: v4)
+    hi = zero._mm_unpackhi_epi8(b: v4)
+    lo = lo._mm_srai_epi16(imm8: 11)
+    hi = hi._mm_srai_epi16(imm8: 11)
+    v4 = lo._mm_packs_epi16(b: hi)
+
+    // v3 = signed_byte_shr3(delta + 3)
+    v3 = delta._mm_adds_epi8(b: k3)
+    lo = zero._mm_unpacklo_epi8(b: v3)
+    hi = zero._mm_unpackhi_epi8(b: v3)
+    lo = lo._mm_srai_epi16(imm8: 11)
+    hi = hi._mm_srai_epi16(imm8: 11)
+    v3 = lo._mm_packs_epi16(b: hi)
+
+    // Apply: q0 -= v4, p0 += v3 (saturating i8).
+    q0 = q0._mm_subs_epi8(b: v4)
+    p0 = p0._mm_adds_epi8(b: v3)
+
+    // Convert back to unsigned.
+    p0 = p0._mm_xor_si128(b: sign_bit)
+    q0 = q0._mm_xor_si128(b: sign_bit)
+
+    // ---- Store p0 and q0 back ----
+    // Reslice args.workbuf to p0 position: q0_off - stride.
+    // We know q0_off >= 2*stride from the guard above, so q0_off >= stride.
+    // NOTE(review): the check is repeated here, presumably so the compiler
+    // can prove the subtraction below cannot underflow - confirm.
+    if args.q0_off < stride {
+        return nothing
+    }
+    if (args.q0_off - stride) <= args.workbuf.length() {
+        args.workbuf = args.workbuf[args.q0_off - stride ..]
+    } else {
+        return nothing
+    }
+
+    // Store p0 row.
+    if stride > args.workbuf.length() {
+        return nothing
+    }
+    assert 16 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride)
+    p0.store_slice128!(a: args.workbuf[.. 16])
+    args.workbuf = args.workbuf[stride ..]
+
+    // Store q0 row (last row, no stride advance).
+    if 16 > args.workbuf.length() {
+        return nothing
+    }
+    q0.store_slice128!(a: args.workbuf[.. 16])
+}
+
+// VP8 normal loop filter (filter4), SSE4.2 version.
+//
+// Filters 16 contiguous pixels at a horizontal inner sub-block edge.
+// Loads 8 rows (p3..q3), computes NeedsFilter2 + HEV masks, then applies
+// filter2 (HEV) or filter4 (!HEV).
+
+pri func decoder.normal_vfilter_inner_16_x86_sse42!(workbuf: slice base.u8, q0_off: base.u64,
+        level: base.u32, ilevel: base.u32, hlevel: base.u32),
+        choose cpu_arch >= x86_sse42,
+{
+    var util : base.x86_sse42_utility
+    var wb : slice base.u8
+    var stride : base.u64
+
+    var p3 : base.x86_m128i
+    var p2 : base.x86_m128i
+    var p1 : base.x86_m128i
+    var p0 : base.x86_m128i
+    var q0 : base.x86_m128i
+    var q1 : base.x86_m128i
+    var q2 : base.x86_m128i
+    var q3 : base.x86_m128i
+    var zero : base.x86_m128i
+    var sign_bit : base.x86_m128i
+    var kFE : base.x86_m128i
+    var m_thresh : base.x86_m128i
+    var m_ithresh : base.x86_m128i
+    var m_hthresh : base.x86_m128i
+    var k1 : base.x86_m128i
+    var k3 : base.x86_m128i
+    var k4 : base.x86_m128i
+    var mask : base.x86_m128i
+    var not_hev : base.x86_m128i
+    var delta : base.x86_m128i
+    var v3 : base.x86_m128i
+    var v4 : base.x86_m128i
+    var a3 : base.x86_m128i
+    var t1 : base.x86_m128i
+    var t2 : base.x86_m128i
+    var t3 : base.x86_m128i
+    var lo : base.x86_m128i
+    var hi : base.x86_m128i
+
+    stride = this.y_stride as base.u64
+    if stride < 16 {
+        return nothing
+    }
+    assert 16 <= stride via "a <= b: b >= a"()
+
+    // Need at least 4*stride bytes before q0_off for p3 row.
+    if args.q0_off < (4 * stride) {
+        return nothing
+    }
+
+    wb = args.workbuf
+
+    // Reslice to p3 position.
+    if (args.q0_off - (4 * stride)) <= wb.length() {
+        wb = wb[args.q0_off - (4 * stride) ..]
+    } else {
+        return nothing
+    }
+
+    // Load 8 rows: p3, p2, p1, p0, q0, q1, q2, q3.
+ if stride > wb.length() { + return nothing + } + assert 16 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + p3 = util.make_m128i_slice128(a: wb[.. 16]) + wb = wb[stride ..] + + if stride > wb.length() { + return nothing + } + assert 16 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + p2 = util.make_m128i_slice128(a: wb[.. 16]) + wb = wb[stride ..] + + if stride > wb.length() { + return nothing + } + assert 16 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + p1 = util.make_m128i_slice128(a: wb[.. 16]) + wb = wb[stride ..] + + if stride > wb.length() { + return nothing + } + assert 16 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + p0 = util.make_m128i_slice128(a: wb[.. 16]) + wb = wb[stride ..] + + if stride > wb.length() { + return nothing + } + assert 16 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + q0 = util.make_m128i_slice128(a: wb[.. 16]) + wb = wb[stride ..] + + if stride > wb.length() { + return nothing + } + assert 16 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + q1 = util.make_m128i_slice128(a: wb[.. 16]) + wb = wb[stride ..] + + if stride > wb.length() { + return nothing + } + assert 16 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + q2 = util.make_m128i_slice128(a: wb[.. 16]) + wb = wb[stride ..] + + if 16 > wb.length() { + return nothing + } + q3 = util.make_m128i_slice128(a: wb[.. 16]) + + // Constants. 
+ zero = util.make_m128i_zeroes() + sign_bit = util.make_m128i_repeat_u8(a: 0x80) + kFE = util.make_m128i_repeat_u8(a: 0xFE) + m_thresh = util.make_m128i_repeat_u8(a: (args.level & 0xFF) as base.u8) + m_ithresh = util.make_m128i_repeat_u8(a: (args.ilevel & 0xFF) as base.u8) + m_hthresh = util.make_m128i_repeat_u8(a: (args.hlevel & 0xFF) as base.u8) + k1 = util.make_m128i_repeat_u8(a: 1) + k3 = util.make_m128i_repeat_u8(a: 3) + k4 = util.make_m128i_repeat_u8(a: 4) + + // ---- NeedsFilter: abs(p0-q0)*2 + abs(p1-q1)/2 <= level ---- + t1 = p1._mm_subs_epu8(b: q1)._mm_or_si128(b: q1._mm_subs_epu8(b: p1)) + t2 = t1._mm_and_si128(b: kFE)._mm_srli_epi16(imm8: 1) + t3 = p0._mm_subs_epu8(b: q0)._mm_or_si128(b: q0._mm_subs_epu8(b: p0)) + t3 = t3._mm_adds_epu8(b: t3) + t3 = t3._mm_adds_epu8(b: t2) + mask = t3._mm_subs_epu8(b: m_thresh)._mm_cmpeq_epi8(b: zero) + + // ---- NeedsFilter2: ilevel checks on 6 adjacent pairs ---- + t1 = p3._mm_subs_epu8(b: p2)._mm_or_si128(b: p2._mm_subs_epu8(b: p3)) + mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero)) + t1 = p2._mm_subs_epu8(b: p1)._mm_or_si128(b: p1._mm_subs_epu8(b: p2)) + mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero)) + t1 = p1._mm_subs_epu8(b: p0)._mm_or_si128(b: p0._mm_subs_epu8(b: p1)) + mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero)) + t1 = q0._mm_subs_epu8(b: q1)._mm_or_si128(b: q1._mm_subs_epu8(b: q0)) + mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero)) + t1 = q1._mm_subs_epu8(b: q2)._mm_or_si128(b: q2._mm_subs_epu8(b: q1)) + mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero)) + t1 = q2._mm_subs_epu8(b: q3)._mm_or_si128(b: q3._mm_subs_epu8(b: q2)) + mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero)) + + // ---- GetNotHEV: 0xFF where abs(p1-p0) <= hlevel AND abs(q1-q0) <= hlevel ---- + t1 = p1._mm_subs_epu8(b: 
p0)._mm_or_si128(b: p0._mm_subs_epu8(b: p1)) + t2 = q1._mm_subs_epu8(b: q0)._mm_or_si128(b: q0._mm_subs_epu8(b: q1)) + t3 = t1._mm_subs_epu8(b: m_hthresh)._mm_or_si128(b: t2._mm_subs_epu8(b: m_hthresh)) + not_hev = t3._mm_cmpeq_epi8(b: zero) + + // ---- Convert p1, p0, q0, q1 to signed (XOR 0x80) ---- + p1 = p1._mm_xor_si128(b: sign_bit) + p0 = p0._mm_xor_si128(b: sign_bit) + q0 = q0._mm_xor_si128(b: sign_bit) + q1 = q1._mm_xor_si128(b: sign_bit) + + // ---- Combined delta ---- + // HEV: delta = 3*(q0-p0) + (p1-q1) [filter2] + // !HEV: delta = 3*(q0-p0) [filter4 uses no p1-q1] + t1 = p1._mm_subs_epi8(b: q1) // p1-q1 (sat i8) + t1 = not_hev._mm_andnot_si128(b: t1) // zero where !HEV + t2 = q0._mm_subs_epi8(b: p0) // q0-p0 + t1 = t1._mm_adds_epi8(b: t2) + t1 = t1._mm_adds_epi8(b: t2) + delta = t1._mm_adds_epi8(b: t2) // 3*(q0-p0) + hev*(p1-q1) + delta = delta._mm_and_si128(b: mask) + + // ---- v4 = SignedShift(delta+4, 3), v3 = SignedShift(delta+3, 3) ---- + v4 = delta._mm_adds_epi8(b: k4) + lo = zero._mm_unpacklo_epi8(b: v4) + hi = zero._mm_unpackhi_epi8(b: v4) + lo = lo._mm_srai_epi16(imm8: 11) + hi = hi._mm_srai_epi16(imm8: 11) + v4 = lo._mm_packs_epi16(b: hi) + + v3 = delta._mm_adds_epi8(b: k3) + lo = zero._mm_unpacklo_epi8(b: v3) + hi = zero._mm_unpackhi_epi8(b: v3) + lo = lo._mm_srai_epi16(imm8: 11) + hi = hi._mm_srai_epi16(imm8: 11) + v3 = lo._mm_packs_epi16(b: hi) + + // Apply to p0, q0 (both HEV and !HEV use these). + q0 = q0._mm_subs_epi8(b: v4) + p0 = p0._mm_adds_epi8(b: v3) + + // ---- Filter4 !HEV: a3 = SignedShift(v4+1, 1) & not_hev ---- + a3 = v4._mm_adds_epi8(b: k1) + lo = zero._mm_unpacklo_epi8(b: a3) + hi = zero._mm_unpackhi_epi8(b: a3) + lo = lo._mm_srai_epi16(imm8: 9) + hi = hi._mm_srai_epi16(imm8: 9) + a3 = lo._mm_packs_epi16(b: hi) + a3 = a3._mm_and_si128(b: not_hev) + + // Apply to p1, q1 (only !HEV). 
+ q1 = q1._mm_subs_epi8(b: a3) + p1 = p1._mm_adds_epi8(b: a3) + + // ---- Convert back to unsigned ---- + p1 = p1._mm_xor_si128(b: sign_bit) + p0 = p0._mm_xor_si128(b: sign_bit) + q0 = q0._mm_xor_si128(b: sign_bit) + q1 = q1._mm_xor_si128(b: sign_bit) + + // ---- Store p1, p0, q0, q1 ---- + if args.q0_off < (2 * stride) { + return nothing + } + if (args.q0_off - (2 * stride)) <= args.workbuf.length() { + args.workbuf = args.workbuf[args.q0_off - (2 * stride) ..] + } else { + return nothing + } + + if stride > args.workbuf.length() { + return nothing + } + assert 16 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + p1.store_slice128!(a: args.workbuf[.. 16]) + args.workbuf = args.workbuf[stride ..] + + if stride > args.workbuf.length() { + return nothing + } + assert 16 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + p0.store_slice128!(a: args.workbuf[.. 16]) + args.workbuf = args.workbuf[stride ..] + + if stride > args.workbuf.length() { + return nothing + } + assert 16 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + q0.store_slice128!(a: args.workbuf[.. 16]) + args.workbuf = args.workbuf[stride ..] + + if 16 > args.workbuf.length() { + return nothing + } + q1.store_slice128!(a: args.workbuf[.. 16]) +} + +// VP8 normal loop filter (filter6), SSE4.2 version. +// +// Filters 16 contiguous pixels at a horizontal MB boundary edge. +// Applies filter2 (HEV) or filter6 (!HEV, weighted: 27/18/9 * delta). 
+
+pri func decoder.normal_vfilter_mb_16_x86_sse42!(workbuf: slice base.u8, q0_off: base.u64,
+        level: base.u32, ilevel: base.u32, hlevel: base.u32),
+        choose cpu_arch >= x86_sse42,
+{
+    var util : base.x86_sse42_utility
+    var wb : slice base.u8
+    var stride : base.u64
+
+    var p3 : base.x86_m128i
+    var p2 : base.x86_m128i
+    var p1 : base.x86_m128i
+    var p0 : base.x86_m128i
+    var q0 : base.x86_m128i
+    var q1 : base.x86_m128i
+    var q2 : base.x86_m128i
+    var q3 : base.x86_m128i
+    var zero : base.x86_m128i
+    var sign_bit : base.x86_m128i
+    var kFE : base.x86_m128i
+    var m_thresh : base.x86_m128i
+    var m_ithresh : base.x86_m128i
+    var m_hthresh : base.x86_m128i
+    var k3 : base.x86_m128i
+    var k4 : base.x86_m128i
+    var k63 : base.x86_m128i
+    var k27 : base.x86_m128i
+    var k18 : base.x86_m128i
+    var k9 : base.x86_m128i
+    var mask : base.x86_m128i
+    var not_hev : base.x86_m128i
+    var delta : base.x86_m128i
+    var v3 : base.x86_m128i
+    var v4 : base.x86_m128i
+    var a1 : base.x86_m128i
+    var a2 : base.x86_m128i
+    var a3 : base.x86_m128i
+    var t1 : base.x86_m128i
+    var t2 : base.x86_m128i
+    var t3 : base.x86_m128i
+    var lo : base.x86_m128i
+    var hi : base.x86_m128i
+    var d_lo : base.x86_m128i
+    var d_hi : base.x86_m128i
+    var p0_adj : base.x86_m128i
+    var q0_adj : base.x86_m128i
+
+    stride = this.y_stride as base.u64 // Distance in bytes between vertically adjacent rows in workbuf.
+    if stride < 16 {
+        return nothing
+    }
+    assert 16 <= stride via "a <= b: b >= a"() // Records 16 <= stride for the row-length proofs below.
+
+    // Need at least 4*stride bytes before q0_off.
+    if args.q0_off < (4 * stride) {
+        return nothing
+    }
+
+    wb = args.workbuf
+    if (args.q0_off - (4 * stride)) <= wb.length() {
+        wb = wb[args.q0_off - (4 * stride) ..] // Now positioned at the p3 row.
+    } else {
+        return nothing
+    }
+
+    // Load 8 rows.
+    if stride > wb.length() {
+        return nothing
+    }
+    assert 16 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride)
+    p3 = util.make_m128i_slice128(a: wb[.. 16])
+    wb = wb[stride ..]
+
+    if stride > wb.length() {
+        return nothing
+    }
+    assert 16 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride)
+    p2 = util.make_m128i_slice128(a: wb[.. 16])
+    wb = wb[stride ..]
+
+    if stride > wb.length() {
+        return nothing
+    }
+    assert 16 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride)
+    p1 = util.make_m128i_slice128(a: wb[.. 16])
+    wb = wb[stride ..]
+
+    if stride > wb.length() {
+        return nothing
+    }
+    assert 16 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride)
+    p0 = util.make_m128i_slice128(a: wb[.. 16])
+    wb = wb[stride ..]
+
+    if stride > wb.length() {
+        return nothing
+    }
+    assert 16 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride)
+    q0 = util.make_m128i_slice128(a: wb[.. 16])
+    wb = wb[stride ..]
+
+    if stride > wb.length() {
+        return nothing
+    }
+    assert 16 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride)
+    q1 = util.make_m128i_slice128(a: wb[.. 16])
+    wb = wb[stride ..]
+
+    if stride > wb.length() {
+        return nothing
+    }
+    assert 16 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride)
+    q2 = util.make_m128i_slice128(a: wb[.. 16])
+    wb = wb[stride ..]
+
+    if 16 > wb.length() {
+        return nothing
+    }
+    q3 = util.make_m128i_slice128(a: wb[.. 16]) // Last row: only 16 bytes are required, not a full stride.
+
+    // Constants.
+    zero = util.make_m128i_zeroes()
+    sign_bit = util.make_m128i_repeat_u8(a: 0x80)
+    kFE = util.make_m128i_repeat_u8(a: 0xFE)
+    m_thresh = util.make_m128i_repeat_u8(a: (args.level & 0xFF) as base.u8) // level, ilevel and hlevel are each truncated to their low 8 bits.
+    m_ithresh = util.make_m128i_repeat_u8(a: (args.ilevel & 0xFF) as base.u8)
+    m_hthresh = util.make_m128i_repeat_u8(a: (args.hlevel & 0xFF) as base.u8)
+    k3 = util.make_m128i_repeat_u8(a: 3)
+    k4 = util.make_m128i_repeat_u8(a: 4)
+    k63 = util.make_m128i_repeat_u16(a: 63) // k63, k27, k18 and k9 are 16-bit lanes: they are applied after delta is widened to i16.
+    k27 = util.make_m128i_repeat_u16(a: 27)
+    k18 = util.make_m128i_repeat_u16(a: 18)
+    k9 = util.make_m128i_repeat_u16(a: 9)
+
+    // ---- NeedsFilter + NeedsFilter2 ----
+    t1 = p1._mm_subs_epu8(b: q1)._mm_or_si128(b: q1._mm_subs_epu8(b: p1))
+    t2 = t1._mm_and_si128(b: kFE)._mm_srli_epi16(imm8: 1)
+    t3 = p0._mm_subs_epu8(b: q0)._mm_or_si128(b: q0._mm_subs_epu8(b: p0))
+    t3 = t3._mm_adds_epu8(b: t3)
+    t3 = t3._mm_adds_epu8(b: t2)
+    mask = t3._mm_subs_epu8(b: m_thresh)._mm_cmpeq_epi8(b: zero)
+
+    t1 = p3._mm_subs_epu8(b: p2)._mm_or_si128(b: p2._mm_subs_epu8(b: p3))
+    mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero))
+    t1 = p2._mm_subs_epu8(b: p1)._mm_or_si128(b: p1._mm_subs_epu8(b: p2))
+    mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero))
+    t1 = p1._mm_subs_epu8(b: p0)._mm_or_si128(b: p0._mm_subs_epu8(b: p1))
+    mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero))
+    t1 = q0._mm_subs_epu8(b: q1)._mm_or_si128(b: q1._mm_subs_epu8(b: q0))
+    mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero))
+    t1 = q1._mm_subs_epu8(b: q2)._mm_or_si128(b: q2._mm_subs_epu8(b: q1))
+    mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero))
+    t1 = q2._mm_subs_epu8(b: q3)._mm_or_si128(b: q3._mm_subs_epu8(b: q2))
+    mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero))
+
+    // ---- GetNotHEV ----
+    t1 = p1._mm_subs_epu8(b: p0)._mm_or_si128(b: p0._mm_subs_epu8(b: p1))
+    t2 = q1._mm_subs_epu8(b: q0)._mm_or_si128(b: q0._mm_subs_epu8(b: q1))
+    t3 = t1._mm_subs_epu8(b: m_hthresh)._mm_or_si128(b: t2._mm_subs_epu8(b: m_hthresh))
+    not_hev = t3._mm_cmpeq_epi8(b: zero)
+
+    // ---- Convert p2, p1, p0, q0, q1, q2 to signed ----
+    p2 = p2._mm_xor_si128(b: sign_bit)
+    p1 = p1._mm_xor_si128(b: sign_bit)
+    p0 = p0._mm_xor_si128(b: sign_bit)
+    q0 = q0._mm_xor_si128(b: sign_bit)
+    q1 = q1._mm_xor_si128(b: sign_bit)
+    q2 = q2._mm_xor_si128(b: sign_bit)
+
+    // ---- Delta = sat_i8(3*(q0-p0) + sat_i8(p1-q1)) for ALL positions ----
+    // Filter6 uses p1-q1 in both HEV and !HEV paths.
+    t1 = p1._mm_subs_epi8(b: q1)
+    t2 = q0._mm_subs_epi8(b: p0)
+    t1 = t1._mm_adds_epi8(b: t2)
+    t1 = t1._mm_adds_epi8(b: t2)
+    delta = t1._mm_adds_epi8(b: t2)
+    delta = delta._mm_and_si128(b: mask)
+
+    // ---- Filter2 (HEV path): v4, v3 ----
+    v4 = delta._mm_adds_epi8(b: k4)
+    lo = zero._mm_unpacklo_epi8(b: v4)
+    hi = zero._mm_unpackhi_epi8(b: v4)
+    lo = lo._mm_srai_epi16(imm8: 11)
+    hi = hi._mm_srai_epi16(imm8: 11)
+    v4 = lo._mm_packs_epi16(b: hi)
+
+    v3 = delta._mm_adds_epi8(b: k3)
+    lo = zero._mm_unpacklo_epi8(b: v3)
+    hi = zero._mm_unpackhi_epi8(b: v3)
+    lo = lo._mm_srai_epi16(imm8: 11)
+    hi = hi._mm_srai_epi16(imm8: 11)
+    v3 = lo._mm_packs_epi16(b: hi)
+
+    // ---- Filter6 (!HEV path): widen delta to i16, multiply by 27/18/9 ----
+    // Sign-extend delta bytes to i16: unpack(zero, delta) puts delta in high
+    // byte of each i16 word, then srai by 8 sign-extends.
+    d_lo = zero._mm_unpacklo_epi8(b: delta)._mm_srai_epi16(imm8: 8)
+    d_hi = zero._mm_unpackhi_epi8(b: delta)._mm_srai_epi16(imm8: 8)
+
+    // a1 = (27*delta + 63) >> 7
+    lo = d_lo._mm_mullo_epi16(b: k27)._mm_add_epi16(b: k63)._mm_srai_epi16(imm8: 7)
+    hi = d_hi._mm_mullo_epi16(b: k27)._mm_add_epi16(b: k63)._mm_srai_epi16(imm8: 7)
+    a1 = lo._mm_packs_epi16(b: hi)
+
+    // a2 = (18*delta + 63) >> 7
+    lo = d_lo._mm_mullo_epi16(b: k18)._mm_add_epi16(b: k63)._mm_srai_epi16(imm8: 7)
+    hi = d_hi._mm_mullo_epi16(b: k18)._mm_add_epi16(b: k63)._mm_srai_epi16(imm8: 7)
+    a2 = lo._mm_packs_epi16(b: hi)
+
+    // a3 = (9*delta + 63) >> 7
+    lo = d_lo._mm_mullo_epi16(b: k9)._mm_add_epi16(b: k63)._mm_srai_epi16(imm8: 7)
+    hi = d_hi._mm_mullo_epi16(b: k9)._mm_add_epi16(b: k63)._mm_srai_epi16(imm8: 7)
+    a3 = lo._mm_packs_epi16(b: hi)
+
+    // ---- Merge HEV (filter2) and !HEV (filter6) results ----
+    // p0 += select(hev, v3, a1): hev gets v3, !hev gets a1
+    p0_adj = not_hev._mm_andnot_si128(b: v3)._mm_or_si128(b: a1._mm_and_si128(b: not_hev))
+    p0 = p0._mm_adds_epi8(b: p0_adj)
+    // q0 -= select(hev, v4, a1)
+    q0_adj = not_hev._mm_andnot_si128(b: v4)._mm_or_si128(b: a1._mm_and_si128(b: not_hev))
+    q0 = q0._mm_subs_epi8(b: q0_adj)
+    // p1 += a2 & not_hev (only !HEV)
+    p1 = p1._mm_adds_epi8(b: a2._mm_and_si128(b: not_hev))
+    // q1 -= a2 & not_hev
+    q1 = q1._mm_subs_epi8(b: a2._mm_and_si128(b: not_hev))
+    // p2 += a3 & not_hev
+    p2 = p2._mm_adds_epi8(b: a3._mm_and_si128(b: not_hev))
+    // q2 -= a3 & not_hev
+    q2 = q2._mm_subs_epi8(b: a3._mm_and_si128(b: not_hev))
+
+    // ---- Convert back to unsigned ----
+    p2 = p2._mm_xor_si128(b: sign_bit)
+    p1 = p1._mm_xor_si128(b: sign_bit)
+    p0 = p0._mm_xor_si128(b: sign_bit)
+    q0 = q0._mm_xor_si128(b: sign_bit)
+    q1 = q1._mm_xor_si128(b: sign_bit)
+    q2 = q2._mm_xor_si128(b: sign_bit)
+
+    // ---- Store p2, p1, p0, q0, q1, q2 ----
+    if args.q0_off < (3 * stride) {
+        return nothing
+    }
+    if (args.q0_off - (3 * stride)) <= args.workbuf.length() {
+        args.workbuf = args.workbuf[args.q0_off - (3 * stride) ..] // Now positioned at the p2 row; p3 is not written.
+    } else {
+        return nothing
+    }
+
+    if stride > args.workbuf.length() {
+        return nothing
+    }
+    assert 16 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride)
+    p2.store_slice128!(a: args.workbuf[.. 16])
+    args.workbuf = args.workbuf[stride ..]
+
+    if stride > args.workbuf.length() {
+        return nothing
+    }
+    assert 16 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride)
+    p1.store_slice128!(a: args.workbuf[.. 16])
+    args.workbuf = args.workbuf[stride ..]
+
+    if stride > args.workbuf.length() {
+        return nothing
+    }
+    assert 16 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride)
+    p0.store_slice128!(a: args.workbuf[.. 16])
+    args.workbuf = args.workbuf[stride ..]
+
+    if stride > args.workbuf.length() {
+        return nothing
+    }
+    assert 16 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride)
+    q0.store_slice128!(a: args.workbuf[.. 16])
+    args.workbuf = args.workbuf[stride ..]
+
+    if stride > args.workbuf.length() {
+        return nothing
+    }
+    assert 16 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride)
+    q1.store_slice128!(a: args.workbuf[.. 16])
+    args.workbuf = args.workbuf[stride ..]
+
+    if 16 > args.workbuf.length() {
+        return nothing
+    }
+    q2.store_slice128!(a: args.workbuf[.. 16])
+}
+
+// VP8 normal loop filter (filter6) for 8-pixel U/V horizontal MB edges, SSE4.2.
+// Same algorithm as normal_vfilter_mb_16 but loads/stores only 8 bytes per row
+// (upper 8 bytes of each register are zero/unused). Uses uv_stride for step.
+
+pri func decoder.normal_vfilter_mb_8_x86_sse42!(workbuf: slice base.u8, q0_off: base.u64,
+        level: base.u32, ilevel: base.u32, hlevel: base.u32),
+        choose cpu_arch >= x86_sse42,
+{
+    var util : base.x86_sse42_utility
+    var wb : slice base.u8
+    var stride : base.u64
+
+    var p3 : base.x86_m128i
+    var p2 : base.x86_m128i
+    var p1 : base.x86_m128i
+    var p0 : base.x86_m128i
+    var q0 : base.x86_m128i
+    var q1 : base.x86_m128i
+    var q2 : base.x86_m128i
+    var q3 : base.x86_m128i
+    var zero : base.x86_m128i
+    var sign_bit : base.x86_m128i
+    var kFE : base.x86_m128i
+    var m_thresh : base.x86_m128i
+    var m_ithresh : base.x86_m128i
+    var m_hthresh : base.x86_m128i
+    var k3 : base.x86_m128i
+    var k4 : base.x86_m128i
+    var k63 : base.x86_m128i
+    var k27 : base.x86_m128i
+    var k18 : base.x86_m128i
+    var k9 : base.x86_m128i
+    var mask : base.x86_m128i
+    var not_hev : base.x86_m128i
+    var delta : base.x86_m128i
+    var v3 : base.x86_m128i
+    var v4 : base.x86_m128i
+    var a1 : base.x86_m128i
+    var a2 : base.x86_m128i
+    var a3 : base.x86_m128i
+    var t1 : base.x86_m128i
+    var t2 : base.x86_m128i
+    var t3 : base.x86_m128i
+    var lo : base.x86_m128i
+    var hi : base.x86_m128i
+    var d_lo : base.x86_m128i
+    var d_hi : base.x86_m128i
+    var p0_adj : base.x86_m128i
+    var q0_adj : base.x86_m128i
+
+    stride = this.uv_stride as base.u64 // Row step is the decoder's uv_stride field (the 16-wide variants use y_stride).
+    if stride < 8 {
+        return nothing
+    }
+    assert 8 <= stride via "a <= b: b >= a"() // Records 8 <= stride for the row-length proofs below.
+
+    if args.q0_off < (4 * stride) {
+        return nothing
+    }
+
+    wb = args.workbuf
+    if (args.q0_off - (4 * stride)) <= wb.length() {
+        wb = wb[args.q0_off - (4 * stride) ..] // Now positioned at the p3 row.
+    } else {
+        return nothing
+    }
+
+    // Load 8 rows using 8-byte loads (upper 8 bytes are zero).
+    if stride > wb.length() {
+        return nothing
+    }
+    assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride)
+    p3 = util.make_m128i_slice64(a: wb[.. 8])
+    wb = wb[stride ..]
+
+    if stride > wb.length() {
+        return nothing
+    }
+    assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride)
+    p2 = util.make_m128i_slice64(a: wb[.. 8])
+    wb = wb[stride ..]
+
+    if stride > wb.length() {
+        return nothing
+    }
+    assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride)
+    p1 = util.make_m128i_slice64(a: wb[.. 8])
+    wb = wb[stride ..]
+
+    if stride > wb.length() {
+        return nothing
+    }
+    assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride)
+    p0 = util.make_m128i_slice64(a: wb[.. 8])
+    wb = wb[stride ..]
+
+    if stride > wb.length() {
+        return nothing
+    }
+    assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride)
+    q0 = util.make_m128i_slice64(a: wb[.. 8])
+    wb = wb[stride ..]
+
+    if stride > wb.length() {
+        return nothing
+    }
+    assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride)
+    q1 = util.make_m128i_slice64(a: wb[.. 8])
+    wb = wb[stride ..]
+
+    if stride > wb.length() {
+        return nothing
+    }
+    assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride)
+    q2 = util.make_m128i_slice64(a: wb[.. 8])
+    wb = wb[stride ..]
+
+    if 8 > wb.length() {
+        return nothing
+    }
+    q3 = util.make_m128i_slice64(a: wb[.. 8]) // Last row: only 8 bytes are required, not a full stride.
+
+    // Constants.
+    zero = util.make_m128i_zeroes()
+    sign_bit = util.make_m128i_repeat_u8(a: 0x80)
+    kFE = util.make_m128i_repeat_u8(a: 0xFE)
+    m_thresh = util.make_m128i_repeat_u8(a: (args.level & 0xFF) as base.u8) // level, ilevel and hlevel are each truncated to their low 8 bits.
+    m_ithresh = util.make_m128i_repeat_u8(a: (args.ilevel & 0xFF) as base.u8)
+    m_hthresh = util.make_m128i_repeat_u8(a: (args.hlevel & 0xFF) as base.u8)
+    k3 = util.make_m128i_repeat_u8(a: 3)
+    k4 = util.make_m128i_repeat_u8(a: 4)
+    k63 = util.make_m128i_repeat_u16(a: 63) // k63, k27, k18 and k9 are 16-bit lanes: they are applied after delta is widened to i16.
+    k27 = util.make_m128i_repeat_u16(a: 27)
+    k18 = util.make_m128i_repeat_u16(a: 18)
+    k9 = util.make_m128i_repeat_u16(a: 9)
+
+    // ---- NeedsFilter + NeedsFilter2 ----
+    t1 = p1._mm_subs_epu8(b: q1)._mm_or_si128(b: q1._mm_subs_epu8(b: p1))
+    t2 = t1._mm_and_si128(b: kFE)._mm_srli_epi16(imm8: 1)
+    t3 = p0._mm_subs_epu8(b: q0)._mm_or_si128(b: q0._mm_subs_epu8(b: p0))
+    t3 = t3._mm_adds_epu8(b: t3)
+    t3 = t3._mm_adds_epu8(b: t2)
+    mask = t3._mm_subs_epu8(b: m_thresh)._mm_cmpeq_epi8(b: zero)
+
+    t1 = p3._mm_subs_epu8(b: p2)._mm_or_si128(b: p2._mm_subs_epu8(b: p3))
+    mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero))
+    t1 = p2._mm_subs_epu8(b: p1)._mm_or_si128(b: p1._mm_subs_epu8(b: p2))
+    mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero))
+    t1 = p1._mm_subs_epu8(b: p0)._mm_or_si128(b: p0._mm_subs_epu8(b: p1))
+    mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero))
+    t1 = q0._mm_subs_epu8(b: q1)._mm_or_si128(b: q1._mm_subs_epu8(b: q0))
+    mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero))
+    t1 = q1._mm_subs_epu8(b: q2)._mm_or_si128(b: q2._mm_subs_epu8(b: q1))
+    mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero))
+    t1 = q2._mm_subs_epu8(b: q3)._mm_or_si128(b: q3._mm_subs_epu8(b: q2))
+    mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero))
+
+    // ---- GetNotHEV ----
+    t1 = p1._mm_subs_epu8(b: p0)._mm_or_si128(b: p0._mm_subs_epu8(b: p1))
+    t2 = q1._mm_subs_epu8(b: q0)._mm_or_si128(b: q0._mm_subs_epu8(b: q1))
+    t3 = t1._mm_subs_epu8(b: m_hthresh)._mm_or_si128(b: t2._mm_subs_epu8(b: m_hthresh))
+    not_hev = t3._mm_cmpeq_epi8(b: zero)
+
+    // ---- Convert to signed ----
+    p2 = p2._mm_xor_si128(b: sign_bit)
+    p1 = p1._mm_xor_si128(b: sign_bit)
+    p0 = p0._mm_xor_si128(b: sign_bit)
+    q0 = q0._mm_xor_si128(b: sign_bit)
+    q1 = q1._mm_xor_si128(b: sign_bit)
+    q2 = q2._mm_xor_si128(b: sign_bit)
+
+    // ---- Delta ----
+    t1 = p1._mm_subs_epi8(b: q1)
+    t2 = q0._mm_subs_epi8(b: p0)
+    t1 = t1._mm_adds_epi8(b: t2)
+    t1 = t1._mm_adds_epi8(b: t2)
+    delta = t1._mm_adds_epi8(b: t2)
+    delta = delta._mm_and_si128(b: mask)
+
+    // ---- Filter2 (HEV path) ----
+    v4 = delta._mm_adds_epi8(b: k4)
+    lo = zero._mm_unpacklo_epi8(b: v4)
+    hi = zero._mm_unpackhi_epi8(b: v4)
+    lo = lo._mm_srai_epi16(imm8: 11)
+    hi = hi._mm_srai_epi16(imm8: 11)
+    v4 = lo._mm_packs_epi16(b: hi)
+
+    v3 = delta._mm_adds_epi8(b: k3)
+    lo = zero._mm_unpacklo_epi8(b: v3)
+    hi = zero._mm_unpackhi_epi8(b: v3)
+    lo = lo._mm_srai_epi16(imm8: 11)
+    hi = hi._mm_srai_epi16(imm8: 11)
+    v3 = lo._mm_packs_epi16(b: hi)
+
+    // ---- Filter6 (!HEV path) ----
+    d_lo = zero._mm_unpacklo_epi8(b: delta)._mm_srai_epi16(imm8: 8)
+    d_hi = zero._mm_unpackhi_epi8(b: delta)._mm_srai_epi16(imm8: 8)
+
+    lo = d_lo._mm_mullo_epi16(b: k27)._mm_add_epi16(b: k63)._mm_srai_epi16(imm8: 7)
+    hi = d_hi._mm_mullo_epi16(b: k27)._mm_add_epi16(b: k63)._mm_srai_epi16(imm8: 7)
+    a1 = lo._mm_packs_epi16(b: hi) // a1 is (27*delta plus 63) >> 7 per lane.
+
+    lo = d_lo._mm_mullo_epi16(b: k18)._mm_add_epi16(b: k63)._mm_srai_epi16(imm8: 7)
+    hi = d_hi._mm_mullo_epi16(b: k18)._mm_add_epi16(b: k63)._mm_srai_epi16(imm8: 7)
+    a2 = lo._mm_packs_epi16(b: hi) // a2 is (18*delta plus 63) >> 7 per lane.
+
+    lo = d_lo._mm_mullo_epi16(b: k9)._mm_add_epi16(b: k63)._mm_srai_epi16(imm8: 7)
+    hi = d_hi._mm_mullo_epi16(b: k9)._mm_add_epi16(b: k63)._mm_srai_epi16(imm8: 7)
+    a3 = lo._mm_packs_epi16(b: hi) // a3 is (9*delta plus 63) >> 7 per lane.
+
+    // ---- Merge HEV/!HEV ----
+    p0_adj = not_hev._mm_andnot_si128(b: v3)._mm_or_si128(b: a1._mm_and_si128(b: not_hev))
+    p0 = p0._mm_adds_epi8(b: p0_adj)
+    q0_adj = not_hev._mm_andnot_si128(b: v4)._mm_or_si128(b: a1._mm_and_si128(b: not_hev))
+    q0 = q0._mm_subs_epi8(b: q0_adj)
+    p1 = p1._mm_adds_epi8(b: a2._mm_and_si128(b: not_hev))
+    q1 = q1._mm_subs_epi8(b: a2._mm_and_si128(b: not_hev))
+    p2 = p2._mm_adds_epi8(b: a3._mm_and_si128(b: not_hev))
+    q2 = q2._mm_subs_epi8(b: a3._mm_and_si128(b: not_hev))
+
+    // ---- Convert back to unsigned ----
+    p2 = p2._mm_xor_si128(b: sign_bit)
+    p1 = p1._mm_xor_si128(b: sign_bit)
+    p0 = p0._mm_xor_si128(b: sign_bit)
+    q0 = q0._mm_xor_si128(b: sign_bit)
+    q1 = q1._mm_xor_si128(b: sign_bit)
+    q2 = q2._mm_xor_si128(b: sign_bit)
+
+    // ---- Store p2, p1, p0, q0, q1, q2 using 8-byte stores ----
+    if args.q0_off < (3 * stride) {
+        return nothing
+    }
+    if (args.q0_off - (3 * stride)) <= args.workbuf.length() {
+        args.workbuf = args.workbuf[args.q0_off - (3 * stride) ..] // Now positioned at the p2 row; p3 is not written.
+    } else {
+        return nothing
+    }
+
+    if stride > args.workbuf.length() {
+        return nothing
+    }
+    assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride)
+    p2.store_slice64!(a: args.workbuf[.. 8])
+    args.workbuf = args.workbuf[stride ..]
+
+    if stride > args.workbuf.length() {
+        return nothing
+    }
+    assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride)
+    p1.store_slice64!(a: args.workbuf[.. 8])
+    args.workbuf = args.workbuf[stride ..]
+
+    if stride > args.workbuf.length() {
+        return nothing
+    }
+    assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride)
+    p0.store_slice64!(a: args.workbuf[.. 8])
+    args.workbuf = args.workbuf[stride ..]
+
+    if stride > args.workbuf.length() {
+        return nothing
+    }
+    assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride)
+    q0.store_slice64!(a: args.workbuf[.. 8])
+    args.workbuf = args.workbuf[stride ..]
+
+    if stride > args.workbuf.length() {
+        return nothing
+    }
+    assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride)
+    q1.store_slice64!(a: args.workbuf[.. 8])
+    args.workbuf = args.workbuf[stride ..]
+
+    if 8 > args.workbuf.length() {
+        return nothing
+    }
+    q2.store_slice64!(a: args.workbuf[.. 8])
+}
+
+// VP8 normal loop filter (filter6) for 16-row Y vertical MB edges, SSE4.2.
+//
+// Filters a vertical MB boundary edge on the Y plane: 16 rows, 8 pixels per row
+// (p3..q3 at step=1). Uses a 16x8 -> 8x16 transpose to convert column-oriented
+// pixel data into row-oriented SSE registers, applies the same filter6 as
+// normal_vfilter_mb_16, then transposes back and stores.
+
+pri func decoder.normal_hfilter_mb_16_x86_sse42!(workbuf: slice base.u8, q0_off: base.u64,
+        level: base.u32, ilevel: base.u32, hlevel: base.u32),
+        choose cpu_arch >= x86_sse42,
+{
+    var util : base.x86_sse42_utility
+    var wb : slice base.u8
+    var stride : base.u64
+
+    // Row load temporaries.
+    var ra : base.x86_m128i
+    var rb : base.x86_m128i
+
+    // Transpose scratch (reused across phases).
+    var f0 : base.x86_m128i
+    var f1 : base.x86_m128i
+    var f2 : base.x86_m128i
+    var f3 : base.x86_m128i
+    var f4 : base.x86_m128i
+    var f5 : base.x86_m128i
+    var f6 : base.x86_m128i
+    var f7 : base.x86_m128i
+    var g0 : base.x86_m128i
+    var g1 : base.x86_m128i
+    var g2 : base.x86_m128i
+    var g3 : base.x86_m128i
+    var g4 : base.x86_m128i
+    var g5 : base.x86_m128i
+    var g6 : base.x86_m128i
+    var g7 : base.x86_m128i
+
+    // Pixel columns after forward transpose.
+    var p3 : base.x86_m128i
+    var p2 : base.x86_m128i
+    var p1 : base.x86_m128i
+    var p0 : base.x86_m128i
+    var q0 : base.x86_m128i
+    var q1 : base.x86_m128i
+    var q2 : base.x86_m128i
+    var q3 : base.x86_m128i
+
+    // Filter constants.
+ var zero : base.x86_m128i + var sign_bit : base.x86_m128i + var kFE : base.x86_m128i + var m_thresh : base.x86_m128i + var m_ithresh : base.x86_m128i + var m_hthresh : base.x86_m128i + var k3 : base.x86_m128i + var k4 : base.x86_m128i + var k63 : base.x86_m128i + var k27 : base.x86_m128i + var k18 : base.x86_m128i + var k9 : base.x86_m128i + + // Filter temporaries. + var mask : base.x86_m128i + var not_hev : base.x86_m128i + var delta : base.x86_m128i + var v3 : base.x86_m128i + var v4 : base.x86_m128i + var a1 : base.x86_m128i + var a2 : base.x86_m128i + var a3 : base.x86_m128i + var t1 : base.x86_m128i + var t2 : base.x86_m128i + var t3 : base.x86_m128i + var lo : base.x86_m128i + var hi : base.x86_m128i + var d_lo : base.x86_m128i + var d_hi : base.x86_m128i + var p0_adj : base.x86_m128i + var q0_adj : base.x86_m128i + + stride = this.y_stride as base.u64 + if stride < 8 { + return nothing + } + assert 8 <= stride via "a <= b: b >= a"() + + // Bounds: need 4 bytes before q0 for p3 column. + if args.q0_off < 4 { + return nothing + } + wb = args.workbuf + if (args.q0_off - 4) > wb.length() { + return nothing + } + wb = wb[(args.q0_off - 4) ..] + + // ==== Load 16 rows of 8 bytes + forward transpose phase 1 ==== + // Each pair of rows is loaded and interleaved via unpacklo_epi8. + + // Rows 0,1 + if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = util.make_m128i_slice64(a: wb[.. 8]) + wb = wb[stride ..] + if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + rb = util.make_m128i_slice64(a: wb[.. 8]) + wb = wb[stride ..] + f0 = ra._mm_unpacklo_epi8(b: rb) + + // Rows 2,3 + if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = util.make_m128i_slice64(a: wb[.. 8]) + wb = wb[stride ..] 
+ if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + rb = util.make_m128i_slice64(a: wb[.. 8]) + wb = wb[stride ..] + f1 = ra._mm_unpacklo_epi8(b: rb) + + // Rows 4,5 + if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = util.make_m128i_slice64(a: wb[.. 8]) + wb = wb[stride ..] + if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + rb = util.make_m128i_slice64(a: wb[.. 8]) + wb = wb[stride ..] + f2 = ra._mm_unpacklo_epi8(b: rb) + + // Rows 6,7 + if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = util.make_m128i_slice64(a: wb[.. 8]) + wb = wb[stride ..] + if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + rb = util.make_m128i_slice64(a: wb[.. 8]) + wb = wb[stride ..] + f3 = ra._mm_unpacklo_epi8(b: rb) + + // Rows 8,9 + if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = util.make_m128i_slice64(a: wb[.. 8]) + wb = wb[stride ..] + if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + rb = util.make_m128i_slice64(a: wb[.. 8]) + wb = wb[stride ..] + f4 = ra._mm_unpacklo_epi8(b: rb) + + // Rows 10,11 + if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = util.make_m128i_slice64(a: wb[.. 8]) + wb = wb[stride ..] + if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + rb = util.make_m128i_slice64(a: wb[.. 8]) + wb = wb[stride ..] 
+ f5 = ra._mm_unpacklo_epi8(b: rb) + + // Rows 12,13 + if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = util.make_m128i_slice64(a: wb[.. 8]) + wb = wb[stride ..] + if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + rb = util.make_m128i_slice64(a: wb[.. 8]) + wb = wb[stride ..] + f6 = ra._mm_unpacklo_epi8(b: rb) + + // Rows 14,15 + if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = util.make_m128i_slice64(a: wb[.. 8]) + wb = wb[stride ..] + if 8 > wb.length() { + return nothing + } + rb = util.make_m128i_slice64(a: wb[.. 8]) + f7 = ra._mm_unpacklo_epi8(b: rb) + + // ==== Forward transpose phase 2: unpacklo/hi_epi16 ==== + g0 = f0._mm_unpacklo_epi16(b: f1) + g1 = f0._mm_unpackhi_epi16(b: f1) + g2 = f2._mm_unpacklo_epi16(b: f3) + g3 = f2._mm_unpackhi_epi16(b: f3) + g4 = f4._mm_unpacklo_epi16(b: f5) + g5 = f4._mm_unpackhi_epi16(b: f5) + g6 = f6._mm_unpacklo_epi16(b: f7) + g7 = f6._mm_unpackhi_epi16(b: f7) + + // ==== Forward transpose phase 3: unpacklo/hi_epi32 ==== + f0 = g0._mm_unpacklo_epi32(b: g2) + f1 = g0._mm_unpackhi_epi32(b: g2) + f2 = g1._mm_unpacklo_epi32(b: g3) + f3 = g1._mm_unpackhi_epi32(b: g3) + f4 = g4._mm_unpacklo_epi32(b: g6) + f5 = g4._mm_unpackhi_epi32(b: g6) + f6 = g5._mm_unpacklo_epi32(b: g7) + f7 = g5._mm_unpackhi_epi32(b: g7) + + // ==== Forward transpose phase 4: unpacklo/hi_epi64 → pixel columns ==== + p3 = f0._mm_unpacklo_epi64(b: f4) + p2 = f0._mm_unpackhi_epi64(b: f4) + p1 = f1._mm_unpacklo_epi64(b: f5) + p0 = f1._mm_unpackhi_epi64(b: f5) + q0 = f2._mm_unpacklo_epi64(b: f6) + q1 = f2._mm_unpackhi_epi64(b: f6) + q2 = f3._mm_unpacklo_epi64(b: f7) + q3 = f3._mm_unpackhi_epi64(b: f7) + + // ==== Filter computation (identical to normal_vfilter_mb_16) ==== + + // Constants. 
+ zero = util.make_m128i_zeroes() + sign_bit = util.make_m128i_repeat_u8(a: 0x80) + kFE = util.make_m128i_repeat_u8(a: 0xFE) + m_thresh = util.make_m128i_repeat_u8(a: (args.level & 0xFF) as base.u8) + m_ithresh = util.make_m128i_repeat_u8(a: (args.ilevel & 0xFF) as base.u8) + m_hthresh = util.make_m128i_repeat_u8(a: (args.hlevel & 0xFF) as base.u8) + k3 = util.make_m128i_repeat_u8(a: 3) + k4 = util.make_m128i_repeat_u8(a: 4) + k63 = util.make_m128i_repeat_u16(a: 63) + k27 = util.make_m128i_repeat_u16(a: 27) + k18 = util.make_m128i_repeat_u16(a: 18) + k9 = util.make_m128i_repeat_u16(a: 9) + + // ---- NeedsFilter + NeedsFilter2 ---- + t1 = p1._mm_subs_epu8(b: q1)._mm_or_si128(b: q1._mm_subs_epu8(b: p1)) + t2 = t1._mm_and_si128(b: kFE)._mm_srli_epi16(imm8: 1) + t3 = p0._mm_subs_epu8(b: q0)._mm_or_si128(b: q0._mm_subs_epu8(b: p0)) + t3 = t3._mm_adds_epu8(b: t3) + t3 = t3._mm_adds_epu8(b: t2) + mask = t3._mm_subs_epu8(b: m_thresh)._mm_cmpeq_epi8(b: zero) + + t1 = p3._mm_subs_epu8(b: p2)._mm_or_si128(b: p2._mm_subs_epu8(b: p3)) + mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero)) + t1 = p2._mm_subs_epu8(b: p1)._mm_or_si128(b: p1._mm_subs_epu8(b: p2)) + mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero)) + t1 = p1._mm_subs_epu8(b: p0)._mm_or_si128(b: p0._mm_subs_epu8(b: p1)) + mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero)) + t1 = q0._mm_subs_epu8(b: q1)._mm_or_si128(b: q1._mm_subs_epu8(b: q0)) + mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero)) + t1 = q1._mm_subs_epu8(b: q2)._mm_or_si128(b: q2._mm_subs_epu8(b: q1)) + mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero)) + t1 = q2._mm_subs_epu8(b: q3)._mm_or_si128(b: q3._mm_subs_epu8(b: q2)) + mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero)) + + // ---- GetNotHEV ---- + t1 = p1._mm_subs_epu8(b: p0)._mm_or_si128(b: 
p0._mm_subs_epu8(b: p1)) + t2 = q1._mm_subs_epu8(b: q0)._mm_or_si128(b: q0._mm_subs_epu8(b: q1)) + t3 = t1._mm_subs_epu8(b: m_hthresh)._mm_or_si128(b: t2._mm_subs_epu8(b: m_hthresh)) + not_hev = t3._mm_cmpeq_epi8(b: zero) + + // ---- Convert p2, p1, p0, q0, q1, q2 to signed ---- + p2 = p2._mm_xor_si128(b: sign_bit) + p1 = p1._mm_xor_si128(b: sign_bit) + p0 = p0._mm_xor_si128(b: sign_bit) + q0 = q0._mm_xor_si128(b: sign_bit) + q1 = q1._mm_xor_si128(b: sign_bit) + q2 = q2._mm_xor_si128(b: sign_bit) + + // ---- Delta = sat_i8(3*(q0-p0) + sat_i8(p1-q1)) ---- + t1 = p1._mm_subs_epi8(b: q1) + t2 = q0._mm_subs_epi8(b: p0) + t1 = t1._mm_adds_epi8(b: t2) + t1 = t1._mm_adds_epi8(b: t2) + delta = t1._mm_adds_epi8(b: t2) + delta = delta._mm_and_si128(b: mask) + + // ---- Filter2 (HEV path): v4, v3 ---- + v4 = delta._mm_adds_epi8(b: k4) + lo = zero._mm_unpacklo_epi8(b: v4) + hi = zero._mm_unpackhi_epi8(b: v4) + lo = lo._mm_srai_epi16(imm8: 11) + hi = hi._mm_srai_epi16(imm8: 11) + v4 = lo._mm_packs_epi16(b: hi) + + v3 = delta._mm_adds_epi8(b: k3) + lo = zero._mm_unpacklo_epi8(b: v3) + hi = zero._mm_unpackhi_epi8(b: v3) + lo = lo._mm_srai_epi16(imm8: 11) + hi = hi._mm_srai_epi16(imm8: 11) + v3 = lo._mm_packs_epi16(b: hi) + + // ---- Filter6 (!HEV path): widen delta to i16, multiply by 27/18/9 ---- + d_lo = zero._mm_unpacklo_epi8(b: delta)._mm_srai_epi16(imm8: 8) + d_hi = zero._mm_unpackhi_epi8(b: delta)._mm_srai_epi16(imm8: 8) + + lo = d_lo._mm_mullo_epi16(b: k27)._mm_add_epi16(b: k63)._mm_srai_epi16(imm8: 7) + hi = d_hi._mm_mullo_epi16(b: k27)._mm_add_epi16(b: k63)._mm_srai_epi16(imm8: 7) + a1 = lo._mm_packs_epi16(b: hi) + + lo = d_lo._mm_mullo_epi16(b: k18)._mm_add_epi16(b: k63)._mm_srai_epi16(imm8: 7) + hi = d_hi._mm_mullo_epi16(b: k18)._mm_add_epi16(b: k63)._mm_srai_epi16(imm8: 7) + a2 = lo._mm_packs_epi16(b: hi) + + lo = d_lo._mm_mullo_epi16(b: k9)._mm_add_epi16(b: k63)._mm_srai_epi16(imm8: 7) + hi = d_hi._mm_mullo_epi16(b: k9)._mm_add_epi16(b: k63)._mm_srai_epi16(imm8: 7) + 
a3 = lo._mm_packs_epi16(b: hi) + + // ---- Merge HEV/!HEV ---- + p0_adj = not_hev._mm_andnot_si128(b: v3)._mm_or_si128(b: a1._mm_and_si128(b: not_hev)) + p0 = p0._mm_adds_epi8(b: p0_adj) + q0_adj = not_hev._mm_andnot_si128(b: v4)._mm_or_si128(b: a1._mm_and_si128(b: not_hev)) + q0 = q0._mm_subs_epi8(b: q0_adj) + p1 = p1._mm_adds_epi8(b: a2._mm_and_si128(b: not_hev)) + q1 = q1._mm_subs_epi8(b: a2._mm_and_si128(b: not_hev)) + p2 = p2._mm_adds_epi8(b: a3._mm_and_si128(b: not_hev)) + q2 = q2._mm_subs_epi8(b: a3._mm_and_si128(b: not_hev)) + + // ---- Convert back to unsigned ---- + p2 = p2._mm_xor_si128(b: sign_bit) + p1 = p1._mm_xor_si128(b: sign_bit) + p0 = p0._mm_xor_si128(b: sign_bit) + q0 = q0._mm_xor_si128(b: sign_bit) + q1 = q1._mm_xor_si128(b: sign_bit) + q2 = q2._mm_xor_si128(b: sign_bit) + + // ==== Reverse transpose: 8 columns (16 values each) → 16 rows (8 bytes each) ==== + + // Phase R1: interleave adjacent column pairs. + f0 = p3._mm_unpacklo_epi8(b: p2) + f1 = p3._mm_unpackhi_epi8(b: p2) + f2 = p1._mm_unpacklo_epi8(b: p0) + f3 = p1._mm_unpackhi_epi8(b: p0) + f4 = q0._mm_unpacklo_epi8(b: q1) + f5 = q0._mm_unpackhi_epi8(b: q1) + f6 = q2._mm_unpacklo_epi8(b: q3) + f7 = q2._mm_unpackhi_epi8(b: q3) + + // Phase R2: interleave 16-bit pairs. + g0 = f0._mm_unpacklo_epi16(b: f2) + g1 = f0._mm_unpackhi_epi16(b: f2) + g2 = f4._mm_unpacklo_epi16(b: f6) + g3 = f4._mm_unpackhi_epi16(b: f6) + g4 = f1._mm_unpacklo_epi16(b: f3) + g5 = f1._mm_unpackhi_epi16(b: f3) + g6 = f5._mm_unpacklo_epi16(b: f7) + g7 = f5._mm_unpackhi_epi16(b: f7) + + // Phase R3: interleave 32-bit pairs → each register has 2 rows. 
+ f0 = g0._mm_unpacklo_epi32(b: g2) + f1 = g0._mm_unpackhi_epi32(b: g2) + f2 = g1._mm_unpacklo_epi32(b: g3) + f3 = g1._mm_unpackhi_epi32(b: g3) + f4 = g4._mm_unpacklo_epi32(b: g6) + f5 = g4._mm_unpackhi_epi32(b: g6) + f6 = g5._mm_unpacklo_epi32(b: g7) + f7 = g5._mm_unpackhi_epi32(b: g7) + + // ==== Store 16 rows of 8 bytes ==== + // Each f register has [row_even, row_odd] in low/high 64-bit halves. + + if (args.q0_off - 4) > args.workbuf.length() { + return nothing + } + args.workbuf = args.workbuf[(args.q0_off - 4) ..] + + // Rows 0,1 + if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + f0.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] + if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = f0._mm_unpackhi_epi64(b: f0) + ra.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] + + // Rows 2,3 + if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + f1.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] + if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = f1._mm_unpackhi_epi64(b: f1) + ra.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] + + // Rows 4,5 + if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + f2.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] + if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = f2._mm_unpackhi_epi64(b: f2) + ra.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] 
+ + // Rows 6,7 + if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + f3.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] + if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = f3._mm_unpackhi_epi64(b: f3) + ra.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] + + // Rows 8,9 + if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + f4.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] + if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = f4._mm_unpackhi_epi64(b: f4) + ra.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] + + // Rows 10,11 + if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + f5.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] + if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = f5._mm_unpackhi_epi64(b: f5) + ra.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] + + // Rows 12,13 + if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + f6.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] + if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = f6._mm_unpackhi_epi64(b: f6) + ra.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] 
+ + // Rows 14,15 + if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + f7.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] + ra = f7._mm_unpackhi_epi64(b: f7) + if 8 > args.workbuf.length() { + return nothing + } + ra.store_slice64!(a: args.workbuf[.. 8]) +} + +// VP8 normal loop filter (filter6) for 8-row U/V vertical MB edges, SSE4.2. +// +// Filters a vertical MB boundary edge on the U/V plane: 8 rows, 8 pixels per +// row (p3..q3 at step=1). Uses an 8x8 transpose to convert column-oriented +// pixel data into row-oriented SSE registers, applies filter6, then transposes +// back and stores. + +pri func decoder.normal_hfilter_mb_8_x86_sse42!(workbuf: slice base.u8, q0_off: base.u64, + level: base.u32, ilevel: base.u32, hlevel: base.u32), + choose cpu_arch >= x86_sse42, +{ + var util : base.x86_sse42_utility + var wb : slice base.u8 + var stride : base.u64 + + // Row load temporaries. + var ra : base.x86_m128i + var rb : base.x86_m128i + + // Transpose scratch. + var f0 : base.x86_m128i + var f1 : base.x86_m128i + var f2 : base.x86_m128i + var f3 : base.x86_m128i + var g0 : base.x86_m128i + var g1 : base.x86_m128i + var g2 : base.x86_m128i + var g3 : base.x86_m128i + + // Pixel columns after forward transpose. + var p3 : base.x86_m128i + var p2 : base.x86_m128i + var p1 : base.x86_m128i + var p0 : base.x86_m128i + var q0 : base.x86_m128i + var q1 : base.x86_m128i + var q2 : base.x86_m128i + var q3 : base.x86_m128i + + // Filter constants. + var zero : base.x86_m128i + var sign_bit : base.x86_m128i + var kFE : base.x86_m128i + var m_thresh : base.x86_m128i + var m_ithresh : base.x86_m128i + var m_hthresh : base.x86_m128i + var k3 : base.x86_m128i + var k4 : base.x86_m128i + var k63 : base.x86_m128i + var k27 : base.x86_m128i + var k18 : base.x86_m128i + var k9 : base.x86_m128i + + // Filter temporaries. 
+ var mask : base.x86_m128i + var not_hev : base.x86_m128i + var delta : base.x86_m128i + var v3 : base.x86_m128i + var v4 : base.x86_m128i + var a1 : base.x86_m128i + var a2 : base.x86_m128i + var a3 : base.x86_m128i + var t1 : base.x86_m128i + var t2 : base.x86_m128i + var t3 : base.x86_m128i + var lo : base.x86_m128i + var hi : base.x86_m128i + var d_lo : base.x86_m128i + var d_hi : base.x86_m128i + var p0_adj : base.x86_m128i + var q0_adj : base.x86_m128i + + stride = this.uv_stride as base.u64 + if stride < 8 { + return nothing + } + assert 8 <= stride via "a <= b: b >= a"() + + // Bounds: need 4 bytes before q0 for p3 column. + if args.q0_off < 4 { + return nothing + } + wb = args.workbuf + if (args.q0_off - 4) > wb.length() { + return nothing + } + wb = wb[(args.q0_off - 4) ..] + + // ==== Load 8 rows of 8 bytes + forward transpose phase 1 ==== + // Each pair of rows is loaded and interleaved via unpacklo_epi8. + + // Rows 0,1 + if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = util.make_m128i_slice64(a: wb[.. 8]) + wb = wb[stride ..] + if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + rb = util.make_m128i_slice64(a: wb[.. 8]) + wb = wb[stride ..] + f0 = ra._mm_unpacklo_epi8(b: rb) + + // Rows 2,3 + if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = util.make_m128i_slice64(a: wb[.. 8]) + wb = wb[stride ..] + if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + rb = util.make_m128i_slice64(a: wb[.. 8]) + wb = wb[stride ..] + f1 = ra._mm_unpacklo_epi8(b: rb) + + // Rows 4,5 + if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = util.make_m128i_slice64(a: wb[.. 8]) + wb = wb[stride ..] 
+ if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + rb = util.make_m128i_slice64(a: wb[.. 8]) + wb = wb[stride ..] + f2 = ra._mm_unpacklo_epi8(b: rb) + + // Rows 6,7 + if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = util.make_m128i_slice64(a: wb[.. 8]) + wb = wb[stride ..] + if 8 > wb.length() { + return nothing + } + rb = util.make_m128i_slice64(a: wb[.. 8]) + f3 = ra._mm_unpacklo_epi8(b: rb) + + // ==== Forward transpose phase 2: unpacklo/hi_epi16 ==== + g0 = f0._mm_unpacklo_epi16(b: f1) + g1 = f0._mm_unpackhi_epi16(b: f1) + g2 = f2._mm_unpacklo_epi16(b: f3) + g3 = f2._mm_unpackhi_epi16(b: f3) + + // ==== Forward transpose phase 3: unpacklo/hi_epi32 ==== + // Each result has two columns packed in low/high 64-bit halves. + f0 = g0._mm_unpacklo_epi32(b: g2) + f1 = g0._mm_unpackhi_epi32(b: g2) + f2 = g1._mm_unpacklo_epi32(b: g3) + f3 = g1._mm_unpackhi_epi32(b: g3) + + // ==== Extract individual columns via unpackhi_epi64 ==== + p3 = f0 + p2 = f0._mm_unpackhi_epi64(b: f0) + p1 = f1 + p0 = f1._mm_unpackhi_epi64(b: f1) + q0 = f2 + q1 = f2._mm_unpackhi_epi64(b: f2) + q2 = f3 + q3 = f3._mm_unpackhi_epi64(b: f3) + + // ==== Filter computation (identical to normal_vfilter_mb_8) ==== + + // Constants. 
+ zero = util.make_m128i_zeroes() + sign_bit = util.make_m128i_repeat_u8(a: 0x80) + kFE = util.make_m128i_repeat_u8(a: 0xFE) + m_thresh = util.make_m128i_repeat_u8(a: (args.level & 0xFF) as base.u8) + m_ithresh = util.make_m128i_repeat_u8(a: (args.ilevel & 0xFF) as base.u8) + m_hthresh = util.make_m128i_repeat_u8(a: (args.hlevel & 0xFF) as base.u8) + k3 = util.make_m128i_repeat_u8(a: 3) + k4 = util.make_m128i_repeat_u8(a: 4) + k63 = util.make_m128i_repeat_u16(a: 63) + k27 = util.make_m128i_repeat_u16(a: 27) + k18 = util.make_m128i_repeat_u16(a: 18) + k9 = util.make_m128i_repeat_u16(a: 9) + + // ---- NeedsFilter + NeedsFilter2 ---- + t1 = p1._mm_subs_epu8(b: q1)._mm_or_si128(b: q1._mm_subs_epu8(b: p1)) + t2 = t1._mm_and_si128(b: kFE)._mm_srli_epi16(imm8: 1) + t3 = p0._mm_subs_epu8(b: q0)._mm_or_si128(b: q0._mm_subs_epu8(b: p0)) + t3 = t3._mm_adds_epu8(b: t3) + t3 = t3._mm_adds_epu8(b: t2) + mask = t3._mm_subs_epu8(b: m_thresh)._mm_cmpeq_epi8(b: zero) + + t1 = p3._mm_subs_epu8(b: p2)._mm_or_si128(b: p2._mm_subs_epu8(b: p3)) + mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero)) + t1 = p2._mm_subs_epu8(b: p1)._mm_or_si128(b: p1._mm_subs_epu8(b: p2)) + mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero)) + t1 = p1._mm_subs_epu8(b: p0)._mm_or_si128(b: p0._mm_subs_epu8(b: p1)) + mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero)) + t1 = q0._mm_subs_epu8(b: q1)._mm_or_si128(b: q1._mm_subs_epu8(b: q0)) + mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero)) + t1 = q1._mm_subs_epu8(b: q2)._mm_or_si128(b: q2._mm_subs_epu8(b: q1)) + mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero)) + t1 = q2._mm_subs_epu8(b: q3)._mm_or_si128(b: q3._mm_subs_epu8(b: q2)) + mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero)) + + // ---- GetNotHEV ---- + t1 = p1._mm_subs_epu8(b: p0)._mm_or_si128(b: 
p0._mm_subs_epu8(b: p1)) + t2 = q1._mm_subs_epu8(b: q0)._mm_or_si128(b: q0._mm_subs_epu8(b: q1)) + t3 = t1._mm_subs_epu8(b: m_hthresh)._mm_or_si128(b: t2._mm_subs_epu8(b: m_hthresh)) + not_hev = t3._mm_cmpeq_epi8(b: zero) + + // ---- Convert to signed ---- + p2 = p2._mm_xor_si128(b: sign_bit) + p1 = p1._mm_xor_si128(b: sign_bit) + p0 = p0._mm_xor_si128(b: sign_bit) + q0 = q0._mm_xor_si128(b: sign_bit) + q1 = q1._mm_xor_si128(b: sign_bit) + q2 = q2._mm_xor_si128(b: sign_bit) + + // ---- Delta = sat_i8(3*(q0-p0) + sat_i8(p1-q1)) ---- + t1 = p1._mm_subs_epi8(b: q1) + t2 = q0._mm_subs_epi8(b: p0) + t1 = t1._mm_adds_epi8(b: t2) + t1 = t1._mm_adds_epi8(b: t2) + delta = t1._mm_adds_epi8(b: t2) + delta = delta._mm_and_si128(b: mask) + + // ---- Filter2 (HEV path): v4, v3 ---- + v4 = delta._mm_adds_epi8(b: k4) + lo = zero._mm_unpacklo_epi8(b: v4) + hi = zero._mm_unpackhi_epi8(b: v4) + lo = lo._mm_srai_epi16(imm8: 11) + hi = hi._mm_srai_epi16(imm8: 11) + v4 = lo._mm_packs_epi16(b: hi) + + v3 = delta._mm_adds_epi8(b: k3) + lo = zero._mm_unpacklo_epi8(b: v3) + hi = zero._mm_unpackhi_epi8(b: v3) + lo = lo._mm_srai_epi16(imm8: 11) + hi = hi._mm_srai_epi16(imm8: 11) + v3 = lo._mm_packs_epi16(b: hi) + + // ---- Filter6 (!HEV path): widen delta to i16, multiply by 27/18/9 ---- + d_lo = zero._mm_unpacklo_epi8(b: delta)._mm_srai_epi16(imm8: 8) + d_hi = zero._mm_unpackhi_epi8(b: delta)._mm_srai_epi16(imm8: 8) + + lo = d_lo._mm_mullo_epi16(b: k27)._mm_add_epi16(b: k63)._mm_srai_epi16(imm8: 7) + hi = d_hi._mm_mullo_epi16(b: k27)._mm_add_epi16(b: k63)._mm_srai_epi16(imm8: 7) + a1 = lo._mm_packs_epi16(b: hi) + + lo = d_lo._mm_mullo_epi16(b: k18)._mm_add_epi16(b: k63)._mm_srai_epi16(imm8: 7) + hi = d_hi._mm_mullo_epi16(b: k18)._mm_add_epi16(b: k63)._mm_srai_epi16(imm8: 7) + a2 = lo._mm_packs_epi16(b: hi) + + lo = d_lo._mm_mullo_epi16(b: k9)._mm_add_epi16(b: k63)._mm_srai_epi16(imm8: 7) + hi = d_hi._mm_mullo_epi16(b: k9)._mm_add_epi16(b: k63)._mm_srai_epi16(imm8: 7) + a3 = 
lo._mm_packs_epi16(b: hi) + + // ---- Merge HEV/!HEV ---- + p0_adj = not_hev._mm_andnot_si128(b: v3)._mm_or_si128(b: a1._mm_and_si128(b: not_hev)) + p0 = p0._mm_adds_epi8(b: p0_adj) + q0_adj = not_hev._mm_andnot_si128(b: v4)._mm_or_si128(b: a1._mm_and_si128(b: not_hev)) + q0 = q0._mm_subs_epi8(b: q0_adj) + p1 = p1._mm_adds_epi8(b: a2._mm_and_si128(b: not_hev)) + q1 = q1._mm_subs_epi8(b: a2._mm_and_si128(b: not_hev)) + p2 = p2._mm_adds_epi8(b: a3._mm_and_si128(b: not_hev)) + q2 = q2._mm_subs_epi8(b: a3._mm_and_si128(b: not_hev)) + + // ---- Convert back to unsigned ---- + p2 = p2._mm_xor_si128(b: sign_bit) + p1 = p1._mm_xor_si128(b: sign_bit) + p0 = p0._mm_xor_si128(b: sign_bit) + q0 = q0._mm_xor_si128(b: sign_bit) + q1 = q1._mm_xor_si128(b: sign_bit) + q2 = q2._mm_xor_si128(b: sign_bit) + + // ==== Reverse transpose: 8 columns → 8 rows (8 bytes each) ==== + + // Phase R1: interleave adjacent column pairs (unpacklo_epi8 reads low 8 bytes only). + f0 = p3._mm_unpacklo_epi8(b: p2) + f1 = p1._mm_unpacklo_epi8(b: p0) + f2 = q0._mm_unpacklo_epi8(b: q1) + f3 = q2._mm_unpacklo_epi8(b: q3) + + // Phase R2: interleave 16-bit pairs. + g0 = f0._mm_unpacklo_epi16(b: f1) + g1 = f0._mm_unpackhi_epi16(b: f1) + g2 = f2._mm_unpacklo_epi16(b: f3) + g3 = f2._mm_unpackhi_epi16(b: f3) + + // Phase R3: interleave 32-bit pairs → each register has 2 rows. + f0 = g0._mm_unpacklo_epi32(b: g2) + f1 = g0._mm_unpackhi_epi32(b: g2) + f2 = g1._mm_unpacklo_epi32(b: g3) + f3 = g1._mm_unpackhi_epi32(b: g3) + + // ==== Store 8 rows of 8 bytes ==== + if (args.q0_off - 4) > args.workbuf.length() { + return nothing + } + args.workbuf = args.workbuf[(args.q0_off - 4) ..] + + // Rows 0,1 + if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + f0.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] 
+ if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = f0._mm_unpackhi_epi64(b: f0) + ra.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] + + // Rows 2,3 + if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + f1.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] + if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = f1._mm_unpackhi_epi64(b: f1) + ra.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] + + // Rows 4,5 + if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + f2.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] + if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = f2._mm_unpackhi_epi64(b: f2) + ra.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] + + // Rows 6,7 + if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + f3.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] + ra = f3._mm_unpackhi_epi64(b: f3) + if 8 > args.workbuf.length() { + return nothing + } + ra.store_slice64!(a: args.workbuf[.. 8]) +} + +// VP8 normal loop filter (filter4) for 16-row Y vertical inner edges, SSE4.2. +// +// Filters a vertical sub-block boundary on the Y plane: 16 rows, 8 pixels per +// row (p3..q3 at step=1). Uses 16x8 → 8x16 transpose, filter4 (not filter6), +// then transposes back. Only p1, p0, q0, q1 are modified. 
+ +pri func decoder.normal_hfilter_inner_16_x86_sse42!(workbuf: slice base.u8, q0_off: base.u64, + level: base.u32, ilevel: base.u32, hlevel: base.u32), + choose cpu_arch >= x86_sse42, +{ + var util : base.x86_sse42_utility + var wb : slice base.u8 + var stride : base.u64 + + var ra : base.x86_m128i + var rb : base.x86_m128i + + var f0 : base.x86_m128i + var f1 : base.x86_m128i + var f2 : base.x86_m128i + var f3 : base.x86_m128i + var f4 : base.x86_m128i + var f5 : base.x86_m128i + var f6 : base.x86_m128i + var f7 : base.x86_m128i + var g0 : base.x86_m128i + var g1 : base.x86_m128i + var g2 : base.x86_m128i + var g3 : base.x86_m128i + var g4 : base.x86_m128i + var g5 : base.x86_m128i + var g6 : base.x86_m128i + var g7 : base.x86_m128i + + var p3 : base.x86_m128i + var p2 : base.x86_m128i + var p1 : base.x86_m128i + var p0 : base.x86_m128i + var q0 : base.x86_m128i + var q1 : base.x86_m128i + var q2 : base.x86_m128i + var q3 : base.x86_m128i + + var zero : base.x86_m128i + var sign_bit : base.x86_m128i + var kFE : base.x86_m128i + var m_thresh : base.x86_m128i + var m_ithresh : base.x86_m128i + var m_hthresh : base.x86_m128i + var k1 : base.x86_m128i + var k3 : base.x86_m128i + var k4 : base.x86_m128i + + var mask : base.x86_m128i + var not_hev : base.x86_m128i + var delta : base.x86_m128i + var v3 : base.x86_m128i + var v4 : base.x86_m128i + var a3 : base.x86_m128i + var t1 : base.x86_m128i + var t2 : base.x86_m128i + var t3 : base.x86_m128i + var lo : base.x86_m128i + var hi : base.x86_m128i + + stride = this.y_stride as base.u64 + if stride < 8 { + return nothing + } + assert 8 <= stride via "a <= b: b >= a"() + + if args.q0_off < 4 { + return nothing + } + wb = args.workbuf + if (args.q0_off - 4) > wb.length() { + return nothing + } + wb = wb[(args.q0_off - 4) ..] + + // ==== Load 16 rows of 8 bytes + forward transpose phase 1 ==== + // With stride >= 8, the stride check subsumes the 8-byte load check. 
+ + // Rows 0,1 + if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = util.make_m128i_slice64(a: wb[.. 8]) + wb = wb[stride ..] + if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + rb = util.make_m128i_slice64(a: wb[.. 8]) + wb = wb[stride ..] + f0 = ra._mm_unpacklo_epi8(b: rb) + + // Rows 2,3 + if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = util.make_m128i_slice64(a: wb[.. 8]) + wb = wb[stride ..] + if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + rb = util.make_m128i_slice64(a: wb[.. 8]) + wb = wb[stride ..] + f1 = ra._mm_unpacklo_epi8(b: rb) + + // Rows 4,5 + if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = util.make_m128i_slice64(a: wb[.. 8]) + wb = wb[stride ..] + if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + rb = util.make_m128i_slice64(a: wb[.. 8]) + wb = wb[stride ..] + f2 = ra._mm_unpacklo_epi8(b: rb) + + // Rows 6,7 + if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = util.make_m128i_slice64(a: wb[.. 8]) + wb = wb[stride ..] + if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + rb = util.make_m128i_slice64(a: wb[.. 8]) + wb = wb[stride ..] + f3 = ra._mm_unpacklo_epi8(b: rb) + + // Rows 8,9 + if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = util.make_m128i_slice64(a: wb[.. 8]) + wb = wb[stride ..] + if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + rb = util.make_m128i_slice64(a: wb[.. 
8]) + wb = wb[stride ..] + f4 = ra._mm_unpacklo_epi8(b: rb) + + // Rows 10,11 + if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = util.make_m128i_slice64(a: wb[.. 8]) + wb = wb[stride ..] + if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + rb = util.make_m128i_slice64(a: wb[.. 8]) + wb = wb[stride ..] + f5 = ra._mm_unpacklo_epi8(b: rb) + + // Rows 12,13 + if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = util.make_m128i_slice64(a: wb[.. 8]) + wb = wb[stride ..] + if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + rb = util.make_m128i_slice64(a: wb[.. 8]) + wb = wb[stride ..] + f6 = ra._mm_unpacklo_epi8(b: rb) + + // Rows 14,15 + if stride > wb.length() { + return nothing + } + assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = util.make_m128i_slice64(a: wb[.. 8]) + wb = wb[stride ..] + if 8 > wb.length() { + return nothing + } + rb = util.make_m128i_slice64(a: wb[.. 
8]) + f7 = ra._mm_unpacklo_epi8(b: rb) + + // ==== Forward transpose phases 2-4 ==== + g0 = f0._mm_unpacklo_epi16(b: f1) + g1 = f0._mm_unpackhi_epi16(b: f1) + g2 = f2._mm_unpacklo_epi16(b: f3) + g3 = f2._mm_unpackhi_epi16(b: f3) + g4 = f4._mm_unpacklo_epi16(b: f5) + g5 = f4._mm_unpackhi_epi16(b: f5) + g6 = f6._mm_unpacklo_epi16(b: f7) + g7 = f6._mm_unpackhi_epi16(b: f7) + + f0 = g0._mm_unpacklo_epi32(b: g2) + f1 = g0._mm_unpackhi_epi32(b: g2) + f2 = g1._mm_unpacklo_epi32(b: g3) + f3 = g1._mm_unpackhi_epi32(b: g3) + f4 = g4._mm_unpacklo_epi32(b: g6) + f5 = g4._mm_unpackhi_epi32(b: g6) + f6 = g5._mm_unpacklo_epi32(b: g7) + f7 = g5._mm_unpackhi_epi32(b: g7) + + p3 = f0._mm_unpacklo_epi64(b: f4) + p2 = f0._mm_unpackhi_epi64(b: f4) + p1 = f1._mm_unpacklo_epi64(b: f5) + p0 = f1._mm_unpackhi_epi64(b: f5) + q0 = f2._mm_unpacklo_epi64(b: f6) + q1 = f2._mm_unpackhi_epi64(b: f6) + q2 = f3._mm_unpacklo_epi64(b: f7) + q3 = f3._mm_unpackhi_epi64(b: f7) + + // ==== Filter4 computation (from normal_vfilter_inner_16) ==== + + zero = util.make_m128i_zeroes() + sign_bit = util.make_m128i_repeat_u8(a: 0x80) + kFE = util.make_m128i_repeat_u8(a: 0xFE) + m_thresh = util.make_m128i_repeat_u8(a: (args.level & 0xFF) as base.u8) + m_ithresh = util.make_m128i_repeat_u8(a: (args.ilevel & 0xFF) as base.u8) + m_hthresh = util.make_m128i_repeat_u8(a: (args.hlevel & 0xFF) as base.u8) + k1 = util.make_m128i_repeat_u8(a: 1) + k3 = util.make_m128i_repeat_u8(a: 3) + k4 = util.make_m128i_repeat_u8(a: 4) + + // ---- NeedsFilter + NeedsFilter2 ---- + t1 = p1._mm_subs_epu8(b: q1)._mm_or_si128(b: q1._mm_subs_epu8(b: p1)) + t2 = t1._mm_and_si128(b: kFE)._mm_srli_epi16(imm8: 1) + t3 = p0._mm_subs_epu8(b: q0)._mm_or_si128(b: q0._mm_subs_epu8(b: p0)) + t3 = t3._mm_adds_epu8(b: t3) + t3 = t3._mm_adds_epu8(b: t2) + mask = t3._mm_subs_epu8(b: m_thresh)._mm_cmpeq_epi8(b: zero) + + t1 = p3._mm_subs_epu8(b: p2)._mm_or_si128(b: p2._mm_subs_epu8(b: p3)) + mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: 
m_ithresh)._mm_cmpeq_epi8(b: zero)) + t1 = p2._mm_subs_epu8(b: p1)._mm_or_si128(b: p1._mm_subs_epu8(b: p2)) + mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero)) + t1 = p1._mm_subs_epu8(b: p0)._mm_or_si128(b: p0._mm_subs_epu8(b: p1)) + mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero)) + t1 = q0._mm_subs_epu8(b: q1)._mm_or_si128(b: q1._mm_subs_epu8(b: q0)) + mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero)) + t1 = q1._mm_subs_epu8(b: q2)._mm_or_si128(b: q2._mm_subs_epu8(b: q1)) + mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero)) + t1 = q2._mm_subs_epu8(b: q3)._mm_or_si128(b: q3._mm_subs_epu8(b: q2)) + mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero)) + + // ---- GetNotHEV ---- + t1 = p1._mm_subs_epu8(b: p0)._mm_or_si128(b: p0._mm_subs_epu8(b: p1)) + t2 = q1._mm_subs_epu8(b: q0)._mm_or_si128(b: q0._mm_subs_epu8(b: q1)) + t3 = t1._mm_subs_epu8(b: m_hthresh)._mm_or_si128(b: t2._mm_subs_epu8(b: m_hthresh)) + not_hev = t3._mm_cmpeq_epi8(b: zero) + + // ---- Convert p1, p0, q0, q1 to signed ---- + p1 = p1._mm_xor_si128(b: sign_bit) + p0 = p0._mm_xor_si128(b: sign_bit) + q0 = q0._mm_xor_si128(b: sign_bit) + q1 = q1._mm_xor_si128(b: sign_bit) + + // ---- Combined delta: HEV uses (p1-q1), !HEV zeroes it ---- + t1 = p1._mm_subs_epi8(b: q1) + t1 = not_hev._mm_andnot_si128(b: t1) + t2 = q0._mm_subs_epi8(b: p0) + t1 = t1._mm_adds_epi8(b: t2) + t1 = t1._mm_adds_epi8(b: t2) + delta = t1._mm_adds_epi8(b: t2) + delta = delta._mm_and_si128(b: mask) + + // ---- v4, v3 ---- + v4 = delta._mm_adds_epi8(b: k4) + lo = zero._mm_unpacklo_epi8(b: v4) + hi = zero._mm_unpackhi_epi8(b: v4) + lo = lo._mm_srai_epi16(imm8: 11) + hi = hi._mm_srai_epi16(imm8: 11) + v4 = lo._mm_packs_epi16(b: hi) + + v3 = delta._mm_adds_epi8(b: k3) + lo = zero._mm_unpacklo_epi8(b: v3) + hi = zero._mm_unpackhi_epi8(b: v3) + lo = 
lo._mm_srai_epi16(imm8: 11) + hi = hi._mm_srai_epi16(imm8: 11) + v3 = lo._mm_packs_epi16(b: hi) + + q0 = q0._mm_subs_epi8(b: v4) + p0 = p0._mm_adds_epi8(b: v3) + + // ---- Filter4 !HEV: a3 = SignedShift(v4+1, 1) & not_hev ---- + a3 = v4._mm_adds_epi8(b: k1) + lo = zero._mm_unpacklo_epi8(b: a3) + hi = zero._mm_unpackhi_epi8(b: a3) + lo = lo._mm_srai_epi16(imm8: 9) + hi = hi._mm_srai_epi16(imm8: 9) + a3 = lo._mm_packs_epi16(b: hi) + a3 = a3._mm_and_si128(b: not_hev) + + q1 = q1._mm_subs_epi8(b: a3) + p1 = p1._mm_adds_epi8(b: a3) + + // ---- Convert back to unsigned ---- + p1 = p1._mm_xor_si128(b: sign_bit) + p0 = p0._mm_xor_si128(b: sign_bit) + q0 = q0._mm_xor_si128(b: sign_bit) + q1 = q1._mm_xor_si128(b: sign_bit) + + // ==== Reverse transpose: columns → 16 rows of 8 bytes ==== + // p3, p2, q2, q3 are unchanged; p1, p0, q0, q1 are modified. + + f0 = p3._mm_unpacklo_epi8(b: p2) + f1 = p3._mm_unpackhi_epi8(b: p2) + f2 = p1._mm_unpacklo_epi8(b: p0) + f3 = p1._mm_unpackhi_epi8(b: p0) + f4 = q0._mm_unpacklo_epi8(b: q1) + f5 = q0._mm_unpackhi_epi8(b: q1) + f6 = q2._mm_unpacklo_epi8(b: q3) + f7 = q2._mm_unpackhi_epi8(b: q3) + + g0 = f0._mm_unpacklo_epi16(b: f2) + g1 = f0._mm_unpackhi_epi16(b: f2) + g2 = f4._mm_unpacklo_epi16(b: f6) + g3 = f4._mm_unpackhi_epi16(b: f6) + g4 = f1._mm_unpacklo_epi16(b: f3) + g5 = f1._mm_unpackhi_epi16(b: f3) + g6 = f5._mm_unpacklo_epi16(b: f7) + g7 = f5._mm_unpackhi_epi16(b: f7) + + f0 = g0._mm_unpacklo_epi32(b: g2) + f1 = g0._mm_unpackhi_epi32(b: g2) + f2 = g1._mm_unpacklo_epi32(b: g3) + f3 = g1._mm_unpackhi_epi32(b: g3) + f4 = g4._mm_unpacklo_epi32(b: g6) + f5 = g4._mm_unpackhi_epi32(b: g6) + f6 = g5._mm_unpacklo_epi32(b: g7) + f7 = g5._mm_unpackhi_epi32(b: g7) + + // ==== Store 16 rows ==== + // Same stride >= 8 optimization as load phase. + if (args.q0_off - 4) > args.workbuf.length() { + return nothing + } + args.workbuf = args.workbuf[(args.q0_off - 4) ..] 
+ + // Rows 0,1 + if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + f0.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] + if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = f0._mm_unpackhi_epi64(b: f0) + ra.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] + + // Rows 2,3 + if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + f1.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] + if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = f1._mm_unpackhi_epi64(b: f1) + ra.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] + + // Rows 4,5 + if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + f2.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] + if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = f2._mm_unpackhi_epi64(b: f2) + ra.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] + + // Rows 6,7 + if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + f3.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] + if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = f3._mm_unpackhi_epi64(b: f3) + ra.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] 
+ + // Rows 8,9 + if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + f4.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] + if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = f4._mm_unpackhi_epi64(b: f4) + ra.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] + + // Rows 10,11 + if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + f5.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] + if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = f5._mm_unpackhi_epi64(b: f5) + ra.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] + + // Rows 12,13 + if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + f6.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] + if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + ra = f6._mm_unpackhi_epi64(b: f6) + ra.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] + + // Rows 14,15 + if stride > args.workbuf.length() { + return nothing + } + assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride) + f7.store_slice64!(a: args.workbuf[.. 8]) + args.workbuf = args.workbuf[stride ..] + ra = f7._mm_unpackhi_epi64(b: f7) + if 8 > args.workbuf.length() { + return nothing + } + ra.store_slice64!(a: args.workbuf[.. 8]) +} + +// VP8 normal loop filter (filter4) for 8-row U/V vertical inner edges, SSE4.2. 
+//
+// Filters a vertical sub-block boundary on the U/V plane: 8 rows, 8 pixels per
+// row (p3..q3 at step=1). Uses 8x8 transpose, filter4, then transposes back.
+//
+// The 8 rows start at workbuf[q0_off - 4] and are this.uv_stride bytes apart.
+// The function returns without modifying workbuf when uv_stride < 8, when
+// q0_off < 4, or when workbuf is too short to hold all 8 rows. All 8 rows are
+// bounds-checked during the load phase, before anything is stored.
+//
+// level, ilevel and hlevel are each truncated to their low 8 bits and
+// broadcast to every byte lane as the edge, interior and high-edge-variance
+// thresholds respectively.
+
+pri func decoder.normal_hfilter_inner_8_x86_sse42!(workbuf: slice base.u8, q0_off: base.u64,
+        level: base.u32, ilevel: base.u32, hlevel: base.u32),
+        choose cpu_arch >= x86_sse42,
+{
+    var util : base.x86_sse42_utility
+    var wb : slice base.u8
+    var stride : base.u64
+
+    var ra : base.x86_m128i
+    var rb : base.x86_m128i
+
+    var f0 : base.x86_m128i
+    var f1 : base.x86_m128i
+    var f2 : base.x86_m128i
+    var f3 : base.x86_m128i
+    var g0 : base.x86_m128i
+    var g1 : base.x86_m128i
+    var g2 : base.x86_m128i
+    var g3 : base.x86_m128i
+
+    var p3 : base.x86_m128i
+    var p2 : base.x86_m128i
+    var p1 : base.x86_m128i
+    var p0 : base.x86_m128i
+    var q0 : base.x86_m128i
+    var q1 : base.x86_m128i
+    var q2 : base.x86_m128i
+    var q3 : base.x86_m128i
+
+    var zero : base.x86_m128i
+    var sign_bit : base.x86_m128i
+    var kFE : base.x86_m128i
+    var m_thresh : base.x86_m128i
+    var m_ithresh : base.x86_m128i
+    var m_hthresh : base.x86_m128i
+    var k1 : base.x86_m128i
+    var k3 : base.x86_m128i
+    var k4 : base.x86_m128i
+
+    var mask : base.x86_m128i
+    var not_hev : base.x86_m128i
+    var delta : base.x86_m128i
+    var v3 : base.x86_m128i
+    var v4 : base.x86_m128i
+    var a3 : base.x86_m128i
+    var t1 : base.x86_m128i
+    var t2 : base.x86_m128i
+    var t3 : base.x86_m128i
+    var lo : base.x86_m128i
+    var hi : base.x86_m128i
+
+    // With stride >= 8, a single "stride <= length" check per row also proves
+    // that the 8-byte load or store at the start of that row is in bounds.
+    stride = this.uv_stride as base.u64
+    if stride < 8 {
+        return nothing
+    }
+    assert 8 <= stride via "a <= b: b >= a"()
+
+    // Each row is read from 4 bytes before q0 (p3 p2 p1 p0 | q0 q1 q2 q3).
+    if args.q0_off < 4 {
+        return nothing
+    }
+    wb = args.workbuf
+    if (args.q0_off - 4) > wb.length() {
+        return nothing
+    }
+    wb = wb[(args.q0_off - 4) ..]
+
+    // ==== Load 8 rows + forward transpose ====
+    // Each fN holds two rows byte-interleaved: r[2N][0], r[2N+1][0], r[2N][1], ...
+
+    // Rows 0,1
+    if stride > wb.length() {
+        return nothing
+    }
+    assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride)
+    ra = util.make_m128i_slice64(a: wb[.. 8])
+    wb = wb[stride ..]
+    if stride > wb.length() {
+        return nothing
+    }
+    assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride)
+    rb = util.make_m128i_slice64(a: wb[.. 8])
+    wb = wb[stride ..]
+    f0 = ra._mm_unpacklo_epi8(b: rb)
+
+    // Rows 2,3
+    if stride > wb.length() {
+        return nothing
+    }
+    assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride)
+    ra = util.make_m128i_slice64(a: wb[.. 8])
+    wb = wb[stride ..]
+    if stride > wb.length() {
+        return nothing
+    }
+    assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride)
+    rb = util.make_m128i_slice64(a: wb[.. 8])
+    wb = wb[stride ..]
+    f1 = ra._mm_unpacklo_epi8(b: rb)
+
+    // Rows 4,5
+    if stride > wb.length() {
+        return nothing
+    }
+    assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride)
+    ra = util.make_m128i_slice64(a: wb[.. 8])
+    wb = wb[stride ..]
+    if stride > wb.length() {
+        return nothing
+    }
+    assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride)
+    rb = util.make_m128i_slice64(a: wb[.. 8])
+    wb = wb[stride ..]
+    f2 = ra._mm_unpacklo_epi8(b: rb)
+
+    // Rows 6,7
+    // The last row only needs 8 readable bytes, not a whole stride.
+    if stride > wb.length() {
+        return nothing
+    }
+    assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride)
+    ra = util.make_m128i_slice64(a: wb[.. 8])
+    wb = wb[stride ..]
+    if 8 > wb.length() {
+        return nothing
+    }
+    rb = util.make_m128i_slice64(a: wb[.. 8])
+    f3 = ra._mm_unpacklo_epi8(b: rb)
+
+    // 16-bit interleave: g0/g1 = columns 0-3/4-7 of rows 0-3,
+    // g2/g3 = columns 0-3/4-7 of rows 4-7.
+    g0 = f0._mm_unpacklo_epi16(b: f1)
+    g1 = f0._mm_unpackhi_epi16(b: f1)
+    g2 = f2._mm_unpacklo_epi16(b: f3)
+    g3 = f2._mm_unpackhi_epi16(b: f3)
+
+    // 32-bit interleave: each fN now holds two full 8-byte columns.
+    f0 = g0._mm_unpacklo_epi32(b: g2)
+    f1 = g0._mm_unpackhi_epi32(b: g2)
+    f2 = g1._mm_unpacklo_epi32(b: g3)
+    f3 = g1._mm_unpackhi_epi32(b: g3)
+
+    // Only the low 8 byte lanes of p3..q3 carry pixels. The high 8 lanes hold
+    // leftovers; they are computed on but never stored.
+    p3 = f0
+    p2 = f0._mm_unpackhi_epi64(b: f0)
+    p1 = f1
+    p0 = f1._mm_unpackhi_epi64(b: f1)
+    q0 = f2
+    q1 = f2._mm_unpackhi_epi64(b: f2)
+    q2 = f3
+    q3 = f3._mm_unpackhi_epi64(b: f3)
+
+    // ==== Filter4 computation ====
+
+    zero = util.make_m128i_zeroes()
+    sign_bit = util.make_m128i_repeat_u8(a: 0x80)
+    kFE = util.make_m128i_repeat_u8(a: 0xFE)
+    m_thresh = util.make_m128i_repeat_u8(a: (args.level & 0xFF) as base.u8)
+    m_ithresh = util.make_m128i_repeat_u8(a: (args.ilevel & 0xFF) as base.u8)
+    m_hthresh = util.make_m128i_repeat_u8(a: (args.hlevel & 0xFF) as base.u8)
+    k1 = util.make_m128i_repeat_u8(a: 1)
+    k3 = util.make_m128i_repeat_u8(a: 3)
+    k4 = util.make_m128i_repeat_u8(a: 4)
+
+    // mask = (2*|p0-q0| + |p1-q1|/2 <= level), in saturating unsigned bytes.
+    // |a-b| is built as subs(a,b) | subs(b,a). The low bit is cleared (kFE)
+    // before the 16-bit shift so that nothing leaks between byte lanes.
+    t1 = p1._mm_subs_epu8(b: q1)._mm_or_si128(b: q1._mm_subs_epu8(b: p1))
+    t2 = t1._mm_and_si128(b: kFE)._mm_srli_epi16(imm8: 1)
+    t3 = p0._mm_subs_epu8(b: q0)._mm_or_si128(b: q0._mm_subs_epu8(b: p0))
+    t3 = t3._mm_adds_epu8(b: t3)
+    t3 = t3._mm_adds_epu8(b: t2)
+    mask = t3._mm_subs_epu8(b: m_thresh)._mm_cmpeq_epi8(b: zero)
+
+    // mask &= every neighboring difference across p3..q3 is <= ilevel.
+    t1 = p3._mm_subs_epu8(b: p2)._mm_or_si128(b: p2._mm_subs_epu8(b: p3))
+    mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero))
+    t1 = p2._mm_subs_epu8(b: p1)._mm_or_si128(b: p1._mm_subs_epu8(b: p2))
+    mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero))
+    t1 = p1._mm_subs_epu8(b: p0)._mm_or_si128(b: p0._mm_subs_epu8(b: p1))
+    mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero))
+    t1 = q0._mm_subs_epu8(b: q1)._mm_or_si128(b: q1._mm_subs_epu8(b: q0))
+    mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero))
+    t1 = q1._mm_subs_epu8(b: q2)._mm_or_si128(b: q2._mm_subs_epu8(b: q1))
+    mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero))
+    t1 = q2._mm_subs_epu8(b: q3)._mm_or_si128(b: q3._mm_subs_epu8(b: q2))
+    mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero))
+
+    // not_hev = (|p1-p0| <= hlevel) and (|q1-q0| <= hlevel).
+    t1 = p1._mm_subs_epu8(b: p0)._mm_or_si128(b: p0._mm_subs_epu8(b: p1))
+    t2 = q1._mm_subs_epu8(b: q0)._mm_or_si128(b: q0._mm_subs_epu8(b: q1))
+    t3 = t1._mm_subs_epu8(b: m_hthresh)._mm_or_si128(b: t2._mm_subs_epu8(b: m_hthresh))
+    not_hev = t3._mm_cmpeq_epi8(b: zero)
+
+    // Flip the top bit to reinterpret the unsigned pixels as signed bytes.
+    p1 = p1._mm_xor_si128(b: sign_bit)
+    p0 = p0._mm_xor_si128(b: sign_bit)
+    q0 = q0._mm_xor_si128(b: sign_bit)
+    q1 = q1._mm_xor_si128(b: sign_bit)
+
+    // delta = sat((p1 - q1 where HEV, else 0) + 3 * (q0 - p0)), masked.
+    t1 = p1._mm_subs_epi8(b: q1)
+    t1 = not_hev._mm_andnot_si128(b: t1)
+    t2 = q0._mm_subs_epi8(b: p0)
+    t1 = t1._mm_adds_epi8(b: t2)
+    t1 = t1._mm_adds_epi8(b: t2)
+    delta = t1._mm_adds_epi8(b: t2)
+    delta = delta._mm_and_si128(b: mask)
+
+    // v4 = (delta + 4) >> 3 as a signed byte shift: each byte is placed in the
+    // high half of a 16-bit lane, shifted arithmetically by 8 + 3, repacked.
+    v4 = delta._mm_adds_epi8(b: k4)
+    lo = zero._mm_unpacklo_epi8(b: v4)
+    hi = zero._mm_unpackhi_epi8(b: v4)
+    lo = lo._mm_srai_epi16(imm8: 11)
+    hi = hi._mm_srai_epi16(imm8: 11)
+    v4 = lo._mm_packs_epi16(b: hi)
+
+    // v3 = (delta + 3) >> 3, same technique.
+    v3 = delta._mm_adds_epi8(b: k3)
+    lo = zero._mm_unpacklo_epi8(b: v3)
+    hi = zero._mm_unpackhi_epi8(b: v3)
+    lo = lo._mm_srai_epi16(imm8: 11)
+    hi = hi._mm_srai_epi16(imm8: 11)
+    v3 = lo._mm_packs_epi16(b: hi)
+
+    q0 = q0._mm_subs_epi8(b: v4)
+    p0 = p0._mm_adds_epi8(b: v3)
+
+    // a3 = ((v4 + 1) >> 1) on non-HEV lanes only; adjusts the outer taps.
+    a3 = v4._mm_adds_epi8(b: k1)
+    lo = zero._mm_unpacklo_epi8(b: a3)
+    hi = zero._mm_unpackhi_epi8(b: a3)
+    lo = lo._mm_srai_epi16(imm8: 9)
+    hi = hi._mm_srai_epi16(imm8: 9)
+    a3 = lo._mm_packs_epi16(b: hi)
+    a3 = a3._mm_and_si128(b: not_hev)
+
+    q1 = q1._mm_subs_epi8(b: a3)
+    p1 = p1._mm_adds_epi8(b: a3)
+
+    // Back to unsigned pixels.
+    p1 = p1._mm_xor_si128(b: sign_bit)
+    p0 = p0._mm_xor_si128(b: sign_bit)
+    q0 = q0._mm_xor_si128(b: sign_bit)
+    q1 = q1._mm_xor_si128(b: sign_bit)
+
+    // ==== Reverse transpose ====
+    // Only the low halves are interleaved: 8 columns of 8 bytes back to 8 rows.
+    f0 = p3._mm_unpacklo_epi8(b: p2)
+    f1 = p1._mm_unpacklo_epi8(b: p0)
+    f2 = q0._mm_unpacklo_epi8(b: q1)
+    f3 = q2._mm_unpacklo_epi8(b: q3)
+
+    g0 = f0._mm_unpacklo_epi16(b: f1)
+    g1 = f0._mm_unpackhi_epi16(b: f1)
+    g2 = f2._mm_unpacklo_epi16(b: f3)
+    g3 = f2._mm_unpackhi_epi16(b: f3)
+
+    // After this step fN = (row 2N in the low 64 bits, row 2N+1 in the high).
+    f0 = g0._mm_unpacklo_epi32(b: g2)
+    f1 = g0._mm_unpackhi_epi32(b: g2)
+    f2 = g1._mm_unpacklo_epi32(b: g3)
+    f3 = g1._mm_unpackhi_epi32(b: g3)
+
+    // ==== Store 8 rows ====
+    // Same bounds-check pattern as the load phase, applied to args.workbuf.
+    if (args.q0_off - 4) > args.workbuf.length() {
+        return nothing
+    }
+    args.workbuf = args.workbuf[(args.q0_off - 4) ..]
+
+    if stride > args.workbuf.length() {
+        return nothing
+    }
+    assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride)
+    f0.store_slice64!(a: args.workbuf[.. 8])
+    args.workbuf = args.workbuf[stride ..]
+    if stride > args.workbuf.length() {
+        return nothing
+    }
+    assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride)
+    ra = f0._mm_unpackhi_epi64(b: f0)
+    ra.store_slice64!(a: args.workbuf[.. 8])
+    args.workbuf = args.workbuf[stride ..]
+
+    if stride > args.workbuf.length() {
+        return nothing
+    }
+    assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride)
+    f1.store_slice64!(a: args.workbuf[.. 8])
+    args.workbuf = args.workbuf[stride ..]
+    if stride > args.workbuf.length() {
+        return nothing
+    }
+    assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride)
+    ra = f1._mm_unpackhi_epi64(b: f1)
+    ra.store_slice64!(a: args.workbuf[.. 8])
+    args.workbuf = args.workbuf[stride ..]
+
+    if stride > args.workbuf.length() {
+        return nothing
+    }
+    assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride)
+    f2.store_slice64!(a: args.workbuf[.. 8])
+    args.workbuf = args.workbuf[stride ..]
+    if stride > args.workbuf.length() {
+        return nothing
+    }
+    assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride)
+    ra = f2._mm_unpackhi_epi64(b: f2)
+    ra.store_slice64!(a: args.workbuf[.. 8])
+    args.workbuf = args.workbuf[stride ..]
+
+    if stride > args.workbuf.length() {
+        return nothing
+    }
+    assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride)
+    f3.store_slice64!(a: args.workbuf[.. 8])
+    args.workbuf = args.workbuf[stride ..]
+    ra = f3._mm_unpackhi_epi64(b: f3)
+    if 8 > args.workbuf.length() {
+        return nothing
+    }
+    ra.store_slice64!(a: args.workbuf[.. 8])
+}
+
+// VP8 normal loop filter (filter4) for 8-pixel U/V horizontal inner edges, SSE4.2.
+//
+// Filters a horizontal sub-block boundary on U/V plane: 8 contiguous pixels.
+// Same as normal_vfilter_inner_16 but loads/stores only 8 bytes per row.
+//
+// Rows p3..q3 are read from workbuf[q0_off - (4 * stride)] onwards, one row
+// every this.uv_stride bytes; only p1, p0, q0 and q1 are written back,
+// starting at workbuf[q0_off - (2 * stride)]. The function returns without
+// modifying workbuf when uv_stride < 8 or when any of the 8 rows would fall
+// outside workbuf.
+
+pri func decoder.normal_vfilter_inner_8_x86_sse42!(workbuf: slice base.u8, q0_off: base.u64,
+        level: base.u32, ilevel: base.u32, hlevel: base.u32),
+        choose cpu_arch >= x86_sse42,
+{
+    var util : base.x86_sse42_utility
+    var wb : slice base.u8
+    var stride : base.u64
+
+    var p3 : base.x86_m128i
+    var p2 : base.x86_m128i
+    var p1 : base.x86_m128i
+    var p0 : base.x86_m128i
+    var q0 : base.x86_m128i
+    var q1 : base.x86_m128i
+    var q2 : base.x86_m128i
+    var q3 : base.x86_m128i
+    var zero : base.x86_m128i
+    var sign_bit : base.x86_m128i
+    var kFE : base.x86_m128i
+    var m_thresh : base.x86_m128i
+    var m_ithresh : base.x86_m128i
+    var m_hthresh : base.x86_m128i
+    var k1 : base.x86_m128i
+    var k3 : base.x86_m128i
+    var k4 : base.x86_m128i
+    var mask : base.x86_m128i
+    var not_hev : base.x86_m128i
+    var delta : base.x86_m128i
+    var v3 : base.x86_m128i
+    var v4 : base.x86_m128i
+    var a3 : base.x86_m128i
+    var t1 : base.x86_m128i
+    var t2 : base.x86_m128i
+    var t3 : base.x86_m128i
+    var lo : base.x86_m128i
+    var hi : base.x86_m128i
+
+    // With stride >= 8, "stride <= length" also proves an 8-byte access fits.
+    stride = this.uv_stride as base.u64
+    if stride < 8 {
+        return nothing
+    }
+    assert 8 <= stride via "a <= b: b >= a"()
+
+    // p3 lives four rows above q0.
+    if args.q0_off < (4 * stride) {
+        return nothing
+    }
+    wb = args.workbuf
+    if (args.q0_off - (4 * stride)) <= wb.length() {
+        wb = wb[args.q0_off - (4 * stride) ..]
+    } else {
+        return nothing
+    }
+
+    // Load 8 rows using 8-byte loads.
+    if stride > wb.length() {
+        return nothing
+    }
+    assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride)
+    p3 = util.make_m128i_slice64(a: wb[.. 8])
+    wb = wb[stride ..]
+
+    if stride > wb.length() {
+        return nothing
+    }
+    assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride)
+    p2 = util.make_m128i_slice64(a: wb[.. 8])
+    wb = wb[stride ..]
+
+    if stride > wb.length() {
+        return nothing
+    }
+    assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride)
+    p1 = util.make_m128i_slice64(a: wb[.. 8])
+    wb = wb[stride ..]
+
+    if stride > wb.length() {
+        return nothing
+    }
+    assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride)
+    p0 = util.make_m128i_slice64(a: wb[.. 8])
+    wb = wb[stride ..]
+
+    if stride > wb.length() {
+        return nothing
+    }
+    assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride)
+    q0 = util.make_m128i_slice64(a: wb[.. 8])
+    wb = wb[stride ..]
+
+    if stride > wb.length() {
+        return nothing
+    }
+    assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride)
+    q1 = util.make_m128i_slice64(a: wb[.. 8])
+    wb = wb[stride ..]
+
+    if stride > wb.length() {
+        return nothing
+    }
+    assert 8 <= wb.length() via "a <= b: a <= c; c <= b"(c: stride)
+    q2 = util.make_m128i_slice64(a: wb[.. 8])
+    wb = wb[stride ..]
+
+    // The last row only needs 8 readable bytes, not a whole stride.
+    if 8 > wb.length() {
+        return nothing
+    }
+    q3 = util.make_m128i_slice64(a: wb[.. 8])
+
+    // Filter4 computation.
+    zero = util.make_m128i_zeroes()
+    sign_bit = util.make_m128i_repeat_u8(a: 0x80)
+    kFE = util.make_m128i_repeat_u8(a: 0xFE)
+    m_thresh = util.make_m128i_repeat_u8(a: (args.level & 0xFF) as base.u8)
+    m_ithresh = util.make_m128i_repeat_u8(a: (args.ilevel & 0xFF) as base.u8)
+    m_hthresh = util.make_m128i_repeat_u8(a: (args.hlevel & 0xFF) as base.u8)
+    k1 = util.make_m128i_repeat_u8(a: 1)
+    k3 = util.make_m128i_repeat_u8(a: 3)
+    k4 = util.make_m128i_repeat_u8(a: 4)
+
+    // mask = (2*|p0-q0| + |p1-q1|/2 <= level), in saturating unsigned bytes.
+    t1 = p1._mm_subs_epu8(b: q1)._mm_or_si128(b: q1._mm_subs_epu8(b: p1))
+    t2 = t1._mm_and_si128(b: kFE)._mm_srli_epi16(imm8: 1)
+    t3 = p0._mm_subs_epu8(b: q0)._mm_or_si128(b: q0._mm_subs_epu8(b: p0))
+    t3 = t3._mm_adds_epu8(b: t3)
+    t3 = t3._mm_adds_epu8(b: t2)
+    mask = t3._mm_subs_epu8(b: m_thresh)._mm_cmpeq_epi8(b: zero)
+
+    // mask &= every neighboring difference across p3..q3 is <= ilevel.
+    t1 = p3._mm_subs_epu8(b: p2)._mm_or_si128(b: p2._mm_subs_epu8(b: p3))
+    mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero))
+    t1 = p2._mm_subs_epu8(b: p1)._mm_or_si128(b: p1._mm_subs_epu8(b: p2))
+    mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero))
+    t1 = p1._mm_subs_epu8(b: p0)._mm_or_si128(b: p0._mm_subs_epu8(b: p1))
+    mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero))
+    t1 = q0._mm_subs_epu8(b: q1)._mm_or_si128(b: q1._mm_subs_epu8(b: q0))
+    mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero))
+    t1 = q1._mm_subs_epu8(b: q2)._mm_or_si128(b: q2._mm_subs_epu8(b: q1))
+    mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero))
+    t1 = q2._mm_subs_epu8(b: q3)._mm_or_si128(b: q3._mm_subs_epu8(b: q2))
+    mask = mask._mm_and_si128(b: t1._mm_subs_epu8(b: m_ithresh)._mm_cmpeq_epi8(b: zero))
+
+    // not_hev = (|p1-p0| <= hlevel) and (|q1-q0| <= hlevel).
+    t1 = p1._mm_subs_epu8(b: p0)._mm_or_si128(b: p0._mm_subs_epu8(b: p1))
+    t2 = q1._mm_subs_epu8(b: q0)._mm_or_si128(b: q0._mm_subs_epu8(b: q1))
+    t3 = t1._mm_subs_epu8(b: m_hthresh)._mm_or_si128(b: t2._mm_subs_epu8(b: m_hthresh))
+    not_hev = t3._mm_cmpeq_epi8(b: zero)
+
+    // Flip the top bit to reinterpret the unsigned pixels as signed bytes.
+    p1 = p1._mm_xor_si128(b: sign_bit)
+    p0 = p0._mm_xor_si128(b: sign_bit)
+    q0 = q0._mm_xor_si128(b: sign_bit)
+    q1 = q1._mm_xor_si128(b: sign_bit)
+
+    // delta = sat((p1 - q1 where HEV, else 0) + 3 * (q0 - p0)), masked.
+    t1 = p1._mm_subs_epi8(b: q1)
+    t1 = not_hev._mm_andnot_si128(b: t1)
+    t2 = q0._mm_subs_epi8(b: p0)
+    t1 = t1._mm_adds_epi8(b: t2)
+    t1 = t1._mm_adds_epi8(b: t2)
+    delta = t1._mm_adds_epi8(b: t2)
+    delta = delta._mm_and_si128(b: mask)
+
+    // v4 = (delta + 4) >> 3 and v3 = (delta + 3) >> 3 as signed byte shifts:
+    // bytes go to the high half of 16-bit lanes, shift by 8 + 3, then repack.
+    v4 = delta._mm_adds_epi8(b: k4)
+    lo = zero._mm_unpacklo_epi8(b: v4)
+    hi = zero._mm_unpackhi_epi8(b: v4)
+    lo = lo._mm_srai_epi16(imm8: 11)
+    hi = hi._mm_srai_epi16(imm8: 11)
+    v4 = lo._mm_packs_epi16(b: hi)
+
+    v3 = delta._mm_adds_epi8(b: k3)
+    lo = zero._mm_unpacklo_epi8(b: v3)
+    hi = zero._mm_unpackhi_epi8(b: v3)
+    lo = lo._mm_srai_epi16(imm8: 11)
+    hi = hi._mm_srai_epi16(imm8: 11)
+    v3 = lo._mm_packs_epi16(b: hi)
+
+    q0 = q0._mm_subs_epi8(b: v4)
+    p0 = p0._mm_adds_epi8(b: v3)
+
+    // a3 = ((v4 + 1) >> 1) on non-HEV lanes only; adjusts the outer taps.
+    a3 = v4._mm_adds_epi8(b: k1)
+    lo = zero._mm_unpacklo_epi8(b: a3)
+    hi = zero._mm_unpackhi_epi8(b: a3)
+    lo = lo._mm_srai_epi16(imm8: 9)
+    hi = hi._mm_srai_epi16(imm8: 9)
+    a3 = lo._mm_packs_epi16(b: hi)
+    a3 = a3._mm_and_si128(b: not_hev)
+
+    q1 = q1._mm_subs_epi8(b: a3)
+    p1 = p1._mm_adds_epi8(b: a3)
+
+    // Back to unsigned pixels.
+    p1 = p1._mm_xor_si128(b: sign_bit)
+    p0 = p0._mm_xor_si128(b: sign_bit)
+    q0 = q0._mm_xor_si128(b: sign_bit)
+    q1 = q1._mm_xor_si128(b: sign_bit)
+
+    // Store p1, p0, q0, q1 using 8-byte stores.
+    if args.q0_off < (2 * stride) {
+        return nothing
+    }
+    if (args.q0_off - (2 * stride)) <= args.workbuf.length() {
+        args.workbuf = args.workbuf[args.q0_off - (2 * stride) ..]
+    } else {
+        return nothing
+    }
+
+    if stride > args.workbuf.length() {
+        return nothing
+    }
+    assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride)
+    p1.store_slice64!(a: args.workbuf[.. 8])
+    args.workbuf = args.workbuf[stride ..]
+
+    if stride > args.workbuf.length() {
+        return nothing
+    }
+    assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride)
+    p0.store_slice64!(a: args.workbuf[.. 8])
+    args.workbuf = args.workbuf[stride ..]
+
+    if stride > args.workbuf.length() {
+        return nothing
+    }
+    assert 8 <= args.workbuf.length() via "a <= b: a <= c; c <= b"(c: stride)
+    q0.store_slice64!(a: args.workbuf[.. 8])
+    args.workbuf = args.workbuf[stride ..]
+
+    if 8 > args.workbuf.length() {
+        return nothing
+    }
+    q1.store_slice64!(a: args.workbuf[.. 8])
+}
diff --git a/std/vp8/decode_header.wuffs b/std/vp8/decode_header.wuffs
new file mode 100644
index 000000000..9c4b1646d
--- /dev/null
+++ b/std/vp8/decode_header.wuffs
@@ -0,0 +1,377 @@
+// Copyright 2024 The Wuffs Authors.
+//
+// Licensed under the Apache License, Version 2.0 or the MIT license
+// , at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+//
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+// decode_partition0! resets the boolean decoder's read/write indexes, refills
+// it from workbuf and then parses the frame header fields in bitstream order:
+// segmentation, loop filter, partition count, quantizer indices, coefficient
+// probability updates and the skip probability. It finishes by deriving the
+// per-segment dequantization factors and filter levels.
+pri func decoder.decode_partition0!(workbuf: slice base.u8) {
+    this.bool_ri = 0
+    this.bool_wi = 0
+    this.bool_fill_from_workbuf!(workbuf: args.workbuf)
+    this.bool_init!()
+
+    // Key frames carry two extra bits here; their values are discarded.
+    // NOTE(review): presumably color_space and clamping_type (RFC 6386) - confirm.
+    if this.key_frame {
+        this.bool_read_literal!(n: 2)
+    }
+
+    this.decode_segmentation!()
+    this.decode_loop_filter!()
+    this.decode_partitions!()
+    this.decode_quant_indices!()
+
+    // Key frames carry one more bit here; its value is discarded.
+    // NOTE(review): presumably refresh_entropy_probs (RFC 6386) - confirm.
+    if this.key_frame {
+        this.bool_read_literal!(n: 1)
+    }
+
+    this.decode_coeff_prob_updates!()
+    this.decode_mb_skip_coeff!()
+    this.compute_dequant_values!()
+}
+
+// decode_segmentation! parses the segmentation part of the frame header.
+//
+// It sets use_segment and update_segment_map and, when the bitstream says so,
+// segment_is_abs, segment_quant[0 .. 4], segment_lf[0 .. 4] and
+// segment_prob[0 .. 3]. Fields that the bitstream does not update keep their
+// previous values.
+pri func decoder.decode_segmentation!() {
+    var v : base.u32[..= 1]
+    var i : base.u32
+    var val : base.u32
+    var update_feature_data : base.u32[..= 1]
+
+    v = this.bool_read_bool!(prob: 128)
+    if v == 0 {
+        this.use_segment = false
+        // The map-update flag is only present in the bitstream when
+        // segmentation is enabled, so it is implicitly off here. Clear it
+        // so that a value left over from earlier state cannot be observed.
+        this.update_segment_map = false
+        return nothing
+    }
+    this.use_segment = true
+
+    v = this.bool_read_bool!(prob: 128)
+    this.update_segment_map = (v <> 0)
+
+    update_feature_data = this.bool_read_bool!(prob: 128)
+    if update_feature_data <> 0 {
+        v = this.bool_read_bool!(prob: 128)
+        this.segment_is_abs = (v <> 0)
+
+        // Per-segment quantizer values, then per-segment loop filter values.
+        i = 0
+        while i < 4 {
+            assert i < 4 via "a < b: a < c; c <= b"(c: 4)
+            this.segment_quant[i] = this.bool_read_signed!(n: 7)
+            i += 1
+        }
+        i = 0
+        while i < 4 {
+            assert i < 4 via "a < b: a < c; c <= b"(c: 4)
+            this.segment_lf[i] = this.bool_read_signed!(n: 6)
+            i += 1
+        }
+    }
+
+    if this.update_segment_map {
+        // Each of the three probabilities is either given as 8 bits or
+        // defaults to 255.
+        i = 0
+        while i < 3 {
+            assert i < 3 via "a < b: a < c; c <= b"(c: 3)
+            v = this.bool_read_bool!(prob: 128)
+            if v <> 0 {
+                val = this.bool_read_literal!(n: 8)
+                this.segment_prob[i] = (val & 0xFF) as base.u8
+            } else {
+                this.segment_prob[i] = 255
+            }
+            i += 1
+        }
+    }
+}
+
+// decode_loop_filter! parses the loop filter part of the frame header:
+// filter_type (1 bit), filter_level (6 bits), sharpness_level (3 bits) and
+// the optional lf_ref_delta[0 .. 4] / lf_mode_delta[0 .. 4] adjustments.
+// Delta entries that the bitstream does not update keep their previous values.
+pri func decoder.decode_loop_filter!() {
+    var v : base.u32[..= 1]
+    var i : base.u32
+    var val : base.u32
+
+    val = this.bool_read_literal!(n: 1)
+    this.filter_type = (val & 1) as base.u8
+    val = this.bool_read_literal!(n: 6)
+    this.filter_level = (val & 63) as base.u8
+    val = this.bool_read_literal!(n: 3)
+    this.sharpness_level = (val & 7) as base.u8
+
+    v = this.bool_read_bool!(prob: 128)
+    this.lf_delta_enabled = (v <> 0)
+    if this.lf_delta_enabled {
+        // A second flag says whether any delta is updated in this frame.
+        v = this.bool_read_bool!(prob: 128)
+        if v <> 0 {
+            // Each entry: update flag, 6-bit magnitude, then a sign bit.
+            i = 0
+            while i < 4 {
+                assert i < 4 via "a < b: a < c; c <= b"(c: 4)
+                v = this.bool_read_bool!(prob: 128)
+                if v <> 0 {
+                    val = this.bool_read_literal!(n: 6)
+                    val = val & 63
+                    v = this.bool_read_bool!(prob: 128)
+                    if v <> 0 {
+                        this.lf_ref_delta[i] = -(val as base.i32)
+                    } else {
+                        this.lf_ref_delta[i] = val as base.i32
+                    }
+                }
+                i += 1
+            }
+            i = 0
+            while i < 4 {
+                assert i < 4 via "a < b: a < c; c <= b"(c: 4)
+                v = this.bool_read_bool!(prob: 128)
+                if v <> 0 {
+                    val = this.bool_read_literal!(n: 6)
+                    val = val & 63
+                    v = this.bool_read_bool!(prob: 128)
+                    if v <> 0 {
+                        this.lf_mode_delta[i] = -(val as base.i32)
+                    } else {
+                        this.lf_mode_delta[i] = val as base.i32
+                    }
+                }
+                i += 1
+            }
+        }
+    }
+}
+
+// decode_partitions! reads a 2-bit value and sets num_partitions to 1, 2, 4
+// or 8 (that is, 1 << value).
+pri func decoder.decode_partitions!() {
+    var log2_parts : base.u32
+
+    log2_parts = this.bool_read_literal!(n: 2)
+    if log2_parts == 0 {
+        this.num_partitions = 1
+    } else if log2_parts == 1 {
+        this.num_partitions = 2
+    } else if log2_parts == 2 {
+        this.num_partitions = 4
+    } else {
+        this.num_partitions = 8
+    }
+}
+
+// decode_quant_indices! reads the 7-bit base quantizer index (quant_y_ac_qi)
+// followed by five deltas, each read with bool_read_signed!(n: 4).
+pri func decoder.decode_quant_indices!() {
+    var val : base.u32
+
+    val = this.bool_read_literal!(n: 7)
+    this.quant_y_ac_qi = (val & 127) as base.u8
+    this.quant_y_dc_delta = this.bool_read_signed!(n: 4)
+    this.quant_y2_dc_delta = this.bool_read_signed!(n: 4)
+    this.quant_y2_ac_delta = this.bool_read_signed!(n: 4)
+    this.quant_uv_dc_delta = this.bool_read_signed!(n: 4)
+    this.quant_uv_ac_delta = this.bool_read_signed!(n: 4)
+}
+
+// decode_coeff_prob_updates! walks all 1056 coefficient probabilities. For
+// each one it reads an update flag with probability COEFF_UPDATE_PROBS[i]
+// and, when the flag is set, replaces coeff_probs[i] with an 8-bit literal.
+pri func decoder.decode_coeff_prob_updates!() {
+    var i : base.u32
+    var flag : base.u32[..= 1]
+    var val : base.u32
+
+    i = 0
+    while i < 1056 {
+        assert i < 1056 via "a < b: a < c; c <= b"(c: 1056)
+        flag = this.bool_read_bool!(prob: COEFF_UPDATE_PROBS[i])
+        if flag <> 0 {
+            val = this.bool_read_literal!(n: 8)
+            this.coeff_probs[i] = (val & 0xFF) as base.u8
+        }
+        i += 1
+    }
+}
+
+// decode_mb_skip_coeff! reads the 1-bit mb_no_skip_coeff flag and, when it is
+// set, the 8-bit prob_skip_false. When the flag is clear, prob_skip_false is
+// left unchanged.
+pri func decoder.decode_mb_skip_coeff!() {
+    var val : base.u32
+
+    val = this.bool_read_literal!(n: 1)
+    this.mb_no_skip_coeff = (val <> 0)
+    if this.mb_no_skip_coeff {
+        val = this.bool_read_literal!(n: 8)
+        this.prob_skip_false = (val & 0xFF) as base.u8
+    }
+}
+
+// compute_dequant_values! derives, for each of the 4 segments, the six
+// dequantization factors (Y, Y2 and UV, DC and AC) and the segment's base
+// loop filter level from the header fields parsed earlier.
+//
+// Every table index is produced by clamp_qi!, so it stays within 0 ..= 127.
+pri func decoder.compute_dequant_values!() {
+    var seg : base.u32
+    var base_qi : base.u32[..= 127]
+    var qi : base.u32[..= 127]
+    var seg_delta : base.i32
+    var y_dc : base.u32[..= 127]
+    var y2_dc : base.u32[..= 127]
+    var y2_ac : base.u32[..= 127]
+    var uv_dc : base.u32[..= 127]
+    var uv_ac : base.u32[..= 127]
+    var fl : base.u32[..= 127]
+
+    base_qi = (this.quant_y_ac_qi & 0x7F) as base.u32[..= 127]
+
+    seg = 0
+    while seg < 4 {
+        assert seg < 4 via "a < b: a < c; c <= b"(c: 4)
+
+        // Segment quantizer: absolute value, or a delta on the frame's base.
+        if this.use_segment {
+            seg_delta = this.segment_quant[seg]
+            if this.segment_is_abs {
+                qi = this.clamp_qi!(qi: 0, delta: seg_delta)
+            } else {
+                qi = this.clamp_qi!(qi: base_qi, delta: seg_delta)
+            }
+        } else {
+            qi = base_qi
+        }
+
+        this.dequant_y_ac[seg] = AC_QUANT[qi] as base.u32
+        y_dc = this.clamp_qi!(qi: qi, delta: this.quant_y_dc_delta)
+        this.dequant_y_dc[seg] = DC_QUANT[y_dc] as base.u32
+
+        // Y2 DC is doubled.
+        y2_dc = this.clamp_qi!(qi: qi, delta: this.quant_y2_dc_delta)
+        this.dequant_y2_dc[seg] = (DC_QUANT[y2_dc] as base.u32) * 2
+
+        // Y2 AC is scaled by 155/100 with a floor of 8.
+        y2_ac = this.clamp_qi!(qi: qi, delta: this.quant_y2_ac_delta)
+        this.dequant_y2_ac[seg] = ((AC_QUANT[y2_ac] as base.u32) * 155) / 100
+        if this.dequant_y2_ac[seg] < 8 {
+            this.dequant_y2_ac[seg] = 8
+        }
+
+        // UV DC is capped at 132.
+        uv_dc = this.clamp_qi!(qi: qi, delta: this.quant_uv_dc_delta)
+        this.dequant_uv_dc[seg] = DC_QUANT[uv_dc] as base.u32
+        if this.dequant_uv_dc[seg] > 132 {
+            this.dequant_uv_dc[seg] = 132
+        }
+
+        uv_ac = this.clamp_qi!(qi: qi, delta: this.quant_uv_ac_delta)
+        this.dequant_uv_ac[seg] = AC_QUANT[uv_ac] as base.u32
+
+        // Compute per-segment filter level.
+        if this.use_segment {
+            seg_delta = this.segment_lf[seg]
+            if this.segment_is_abs {
+                fl = this.clamp_qi!(qi: 0, delta: seg_delta)
+            } else {
+                fl = this.clamp_qi!(qi: (this.filter_level & 0x7F) as base.u32[..= 127], delta: seg_delta)
+            }
+            this.seg_filter_level[seg] = fl as base.u32
+        } else {
+            this.seg_filter_level[seg] = this.filter_level as base.u32
+        }
+
+        seg += 1
+    }
+}
+
+// precompute_filter_strengths! precomputes filter strength parameters for
+// each (segment, is_i4x4) combination, avoiding per-MB recomputation.
+// Must be called after decode_partition0 (which sets seg_filter_level,
+// lf_delta_enabled, lf_ref_delta, lf_mode_delta, sharpness_level).
+pri func decoder.precompute_filter_strengths!() {
+    var seg : base.u32
+    var i4x4 : base.u32
+    var idx : base.u32
+    var level : base.u32
+    var ref_d : base.i32
+    var mode_d : base.i32
+    var ilevel : base.u32
+    var hlevel : base.u32
+
+    // Results are written to fstrength_level / fstrength_ilevel /
+    // fstrength_hlevel at index (seg * 2) + i4x4.
+    seg = 0
+    while seg < 4 {
+        i4x4 = 0
+        while i4x4 < 2,
+                inv seg < 4,
+        {
+            // idx is at most 7 here; the check only serves the bounds prover.
+            idx = (seg * 2) + i4x4
+            if idx >= 8 {
+                break
+            }
+
+            level = this.seg_filter_level[seg]
+            if level > 63 {
+                level = 63
+            }
+
+            if this.lf_delta_enabled {
+                // Only entry 0 of each delta table is applied.
+                // NOTE(review): presumably because only intra (key) frames
+                // are decoded - confirm against the caller.
+                ref_d = this.lf_ref_delta[0]
+                if (ref_d <= -1) and (ref_d >= -63) {
+                    level ~mod-= ((-(ref_d)) & 0x3F) as base.u32
+                } else if ref_d > 0 {
+                    level ~mod+= (ref_d & 0x3F) as base.u32
+                }
+                if i4x4 <> 0 {
+                    mode_d = this.lf_mode_delta[0]
+                    if (mode_d <= -1) and (mode_d >= -63) {
+                        level ~mod-= ((-(mode_d)) & 0x3F) as base.u32
+                    } else if mode_d > 0 {
+                        level ~mod+= (mode_d & 0x3F) as base.u32
+                    }
+                }
+                if level > 63 {
+                    // Distinguish overflow (64..189) from underflow
+                    // (u32 wrap from negative). Level started in [0,63],
+                    // deltas in [-63,63], so overflow max is 189.
+                    if (level & 0x8000_0000) <> 0 {
+                        level = 0
+                    } else {
+                        level = 63
+                    }
+                }
+            }
+
+            // NOTE(review): when level is 0 nothing is written, so the three
+            // fstrength_* entries keep whatever they held before - confirm
+            // that they start out as zero for every frame.
+            if (level > 0) and (level <= 63) {
+                // Interior limit: the level reduced according to sharpness,
+                // capped at 9 - sharpness, and never below 1.
+                ilevel = level
+                if this.sharpness_level > 4 {
+                    ilevel >>= 2
+                } else if this.sharpness_level > 0 {
+                    ilevel >>= 1
+                }
+                if this.sharpness_level > 0 {
+                    if ilevel > (9 - (this.sharpness_level as base.u32)) {
+                        ilevel = 9 - (this.sharpness_level as base.u32)
+                    }
+                }
+                if ilevel < 1 {
+                    ilevel = 1
+                }
+
+                this.fstrength_ilevel[idx] = (ilevel & 0xFF) as base.u8
+
+                // High-edge-variance threshold, chosen from the level before
+                // it is rescaled below.
+                if level < 15 {
+                    hlevel = 0
+                } else if level < 40 {
+                    hlevel = 1
+                } else {
+                    hlevel = 2
+                }
+                this.fstrength_hlevel[idx] = (hlevel & 0xFF) as base.u8
+
+                // Edge limit: 2 * level + ilevel.
+                level = (2 ~mod* level) ~mod+ ilevel
+                this.fstrength_level[idx] = (level & 0xFF) as base.u8
+            }
+
+            i4x4 += 1
+        }
+        seg += 1
+    }
+}
+
+// clamp_qi! returns (qi + delta) saturated to the range 0 ..= 127.
+//
+// Deltas beyond -127 or +127 saturate as well: anything at or below -128
+// yields 0 and anything at or above 128 yields 127.
+pri func decoder.clamp_qi!(qi: base.u32[..= 127], delta: base.i32) base.u32[..= 127] {
+    var neg : base.u32[..= 127]
+    var pos : base.u32[..= 127]
+
+    // Use <= -1 to test for negative delta, avoiding the >= 0 comparison
+    // which generates unsigned C code (a_delta >= 0u) due to Wuffs literals.
+    if args.delta <= -1 {
+        if args.delta <= -128 {
+            return 0
+        }
+        neg = ((-(args.delta)) & 0x7F) as base.u32[..= 127]
+        if args.qi <= neg {
+            return 0
+        }
+        return (args.qi - neg) as base.u32[..= 127]
+    }
+    // Mirror of the -128 guard above. Without it, the 0x7F mask below would
+    // wrap a large positive delta (128 would act like 0) instead of
+    // saturating. delta is known to be non-negative at this point, so the
+    // comparison is also correct if the generated C compares as unsigned.
+    if args.delta >= 128 {
+        return 127
+    }
+    pos = (args.delta & 0x7F) as base.u32[..= 127]
+    if (args.qi + pos) > 127 {
+        return 127
+    }
+    return (args.qi + pos) as base.u32[..= 127]
+}
diff --git a/std/vp8/decode_idct.wuffs b/std/vp8/decode_idct.wuffs
new file mode 100644
index 000000000..0fd81fede
--- /dev/null
+++ b/std/vp8/decode_idct.wuffs
@@ -0,0 +1,357 @@
+// Copyright 2024 The Wuffs Authors.
+//
+// Licensed under the Apache License, Version 2.0 or the MIT license
+// , at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+//
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+// VP8 uses two 4x4 transforms:
+// 1. Inverse DCT for Y/U/V residual blocks
+// 2. 
Walsh-Hadamard Transform (WHT) for Y2 DC block +// +// Coefficients are stored as base.u16 (2's complement). All intermediate +// computation uses base.u32 with modular arithmetic (~mod operators). +// +// IMPORTANT: VP8 arithmetic is signed. Right shifts in the IDCT must be +// arithmetic (sign-preserving), not logical. Wuffs's >> on base.u32 is +// logical, so we use asr16/asr3 helpers for arithmetic right shift. + +// asr16! performs branchless arithmetic right shift by 16 on a u32 value +// representing a signed 32-bit quantity. Preserves the sign bit. +pri func decoder.asr16!(v: base.u32) base.u32 { + // sign = 0 or 1; (0 ~mod- sign) = 0x00000000 or 0xFFFFFFFF + return (args.v >> 16) | ((0 ~mod- (args.v >> 31)) ~mod<< 16) +} + +// asr3! performs branchless arithmetic right shift by 3 on a u32 value +// representing a signed 32-bit quantity. Preserves the sign bit. +pri func decoder.asr3!(v: base.u32) base.u32 { + return (args.v >> 3) | ((0 ~mod- (args.v >> 31)) ~mod<< 29) +} + +// idct_add! performs a 4x4 inverse DCT and adds the result to the prediction +// block in the workbuf. The 16 input coefficients start at mb_coeffs[coeff_offset]. +pri func decoder.idct_add!(dst: slice base.u8, stride: base.u32, coeff_offset: base.u32[..= 384]), + choosy, +{ + var in0 : base.u32 + var in1 : base.u32 + var in2 : base.u32 + var in3 : base.u32 + var t0 : base.u32 + var t1 : base.u32 + var t2 : base.u32 + var t3 : base.u32 + var d0 : base.u32 + var d1 : base.u32 + var d2 : base.u32 + var d3 : base.u32 + var c1 : base.u32 + var c2 : base.u32 + var sh : base.u32 + var temp : array[16] base.u32 + var i : base.u32 + var j : base.u32 + var val : base.u32 + var idx : base.u64 + var row : base.u32 + + // Column pass: transform columns of the 4x4 matrix. 
+ i = 0 + while i < 4, + inv i <= 4, + { + assert i < 4 via "a < b: a < c; c <= b"(c: 4) + in0 = this.mb_coeffs[args.coeff_offset + i] + in1 = this.mb_coeffs[args.coeff_offset + i + 4] + in2 = this.mb_coeffs[args.coeff_offset + i + 8] + in3 = this.mb_coeffs[args.coeff_offset + i + 12] + + t0 = in0 ~mod+ in2 + t1 = in0 ~mod- in2 + + // c1 = asr16(in1 * cospi8sqrt2minus1) + in1 + sh = this.asr16!(v: in1 ~mod* 20091) + c1 = sh ~mod+ in1 + // c2 = asr16(in3 * cospi8sqrt2minus1) + in3 + sh = this.asr16!(v: in3 ~mod* 20091) + c2 = sh ~mod+ in3 + // t2 = asr16(in1 * sinpi8sqrt2) - c2 + sh = this.asr16!(v: in1 ~mod* 35468) + t2 = sh ~mod- c2 + // t3 = c1 + asr16(in3 * sinpi8sqrt2) + sh = this.asr16!(v: in3 ~mod* 35468) + t3 = c1 ~mod+ sh + + temp[i] = t0 ~mod+ t3 + temp[i + 12] = t0 ~mod- t3 + temp[i + 4] = t1 ~mod+ t2 + temp[i + 8] = t1 ~mod- t2 + + i += 1 + } + + // Row pass: transform rows, add to prediction, and clamp. + row = 0 + while row < 4 { + j = row * 4 + in0 = temp[j] ~mod+ 4 + in1 = temp[j + 1] + in2 = temp[j + 2] + in3 = temp[j + 3] + + t0 = in0 ~mod+ in2 + t1 = in0 ~mod- in2 + sh = this.asr16!(v: in1 ~mod* 20091) + c1 = sh ~mod+ in1 + sh = this.asr16!(v: in3 ~mod* 20091) + c2 = sh ~mod+ in3 + sh = this.asr16!(v: in1 ~mod* 35468) + t2 = sh ~mod- c2 + sh = this.asr16!(v: in3 ~mod* 35468) + t3 = c1 ~mod+ sh + + d0 = this.asr3!(v: t0 ~mod+ t3) + d1 = this.asr3!(v: t1 ~mod+ t2) + d2 = this.asr3!(v: t1 ~mod- t2) + d3 = this.asr3!(v: t0 ~mod- t3) + + // Add to prediction and clamp to [0, 255]. 
+ idx = (row as base.u64) * (args.stride as base.u64) + + if idx < args.dst.length() { + val = (args.dst[idx] as base.u32) ~mod+ d0 + if val > 255 { + if (val & 0x8000_0000) <> 0 { + val = 0 + } else { + val = 255 + } + } + args.dst[idx] = (val & 0xFF) as base.u8 + } + idx ~mod+= 1 + if idx < args.dst.length() { + val = (args.dst[idx] as base.u32) ~mod+ d1 + if val > 255 { + if (val & 0x8000_0000) <> 0 { + val = 0 + } else { + val = 255 + } + } + args.dst[idx] = (val & 0xFF) as base.u8 + } + idx ~mod+= 1 + if idx < args.dst.length() { + val = (args.dst[idx] as base.u32) ~mod+ d2 + if val > 255 { + if (val & 0x8000_0000) <> 0 { + val = 0 + } else { + val = 255 + } + } + args.dst[idx] = (val & 0xFF) as base.u8 + } + idx ~mod+= 1 + if idx < args.dst.length() { + val = (args.dst[idx] as base.u32) ~mod+ d3 + if val > 255 { + if (val & 0x8000_0000) <> 0 { + val = 0 + } else { + val = 255 + } + } + args.dst[idx] = (val & 0xFF) as base.u8 + } + + row += 1 + } + + // Clear the coefficients. + i = 0 + while i < 16 { + assert (args.coeff_offset + i) < 400 via "a < b: a < c; c <= b"(c: 400) + this.mb_coeffs[args.coeff_offset + i] = 0 + i += 1 + } +} + +// idct_dc_add! performs a simplified IDCT when only the DC coefficient is +// non-zero. Adds asr3(dc + 4) to each of the 16 prediction pixels. 
+pri func decoder.idct_dc_add!(dst: slice base.u8, stride: base.u32, coeff_offset: base.u32[..= 384]),
+    choosy,
+{
+    var bias : base.u32
+    var y : base.u32
+    var x : base.u32
+    var pos : base.u64
+    var sum : base.u32
+
+    // One rounded, arithmetically shifted offset is shared by all 16 pixels.
+    // The coefficient is consumed here, so it is zeroed straight away.
+    bias = this.asr3!(v: this.mb_coeffs[args.coeff_offset] ~mod+ 4)
+    this.mb_coeffs[args.coeff_offset] = 0
+
+    y = 0
+    while y < 4 {
+        // Row y starts at dst[y * stride].
+        pos = (y as base.u64) * (args.stride as base.u64)
+
+        x = 0
+        while x < 4,
+            inv y < 4,
+        {
+            // Pixels that fall outside dst are skipped.
+            if pos < args.dst.length() {
+                sum = (args.dst[pos] as base.u32) ~mod+ bias
+                // A set top bit means the u32 sum wrapped below zero.
+                if (sum & 0x8000_0000) <> 0 {
+                    sum = 0
+                } else if sum > 255 {
+                    sum = 255
+                }
+                args.dst[pos] = (sum & 0xFF) as base.u8
+            }
+            pos ~mod+= 1
+            x += 1
+        }
+
+        y += 1
+    }
+}
+
+// idct_add_pair! runs the 4x4 inverse DCT for two horizontally adjacent
+// blocks and adds each result to the prediction. Block A is written at
+// dst[0..] and block B at dst[4..]. This scalar default simply delegates to
+// idct_add, once per block; block B is skipped when dst holds fewer than
+// 4 bytes.
+pri func decoder.idct_add_pair!(dst: slice base.u8, stride: base.u32,
+        coeff_offset_a: base.u32[..= 384], coeff_offset_b: base.u32[..= 384]),
+    choosy,
+{
+    // Block A.
+    this.idct_add!(dst: args.dst, stride: args.stride, coeff_offset: args.coeff_offset_a)
+
+    // Block B, 4 pixels to the right.
+    if 4 <= args.dst.length() {
+        this.idct_add!(dst: args.dst[4 ..], stride: args.stride, coeff_offset: args.coeff_offset_b)
+    }
+}
+
+// idct_dc_add_pair! adds DC-only offsets to two adjacent 4x4 blocks.
+// Block A at dst[0..], Block B at dst[4..].
+// Delegates to idct_dc_add, once per block; block B is skipped when dst holds
+// fewer than 4 bytes.
+pri func decoder.idct_dc_add_pair!(dst: slice base.u8, stride: base.u32,
+        coeff_offset_a: base.u32[..= 384], coeff_offset_b: base.u32[..= 384]),
+    choosy,
+{
+    // Block A.
+    this.idct_dc_add!(dst: args.dst, stride: args.stride, coeff_offset: args.coeff_offset_a)
+
+    // Block B, 4 pixels to the right.
+    if 4 <= args.dst.length() {
+        this.idct_dc_add!(dst: args.dst[4 ..], stride: args.stride, coeff_offset: args.coeff_offset_b)
+    }
+}
+
+// wht! applies the 4x4 inverse Walsh-Hadamard Transform to the Y2 (DC) block.
+// Input: the 16 coefficients at mb_coeffs[coeff_offset .. coeff_offset + 16].
+// Output: result k is stored at mb_coeffs[k * 16], for k in 0 .. 16, and the
+// 16 input coefficients are then zeroed.
+// All arithmetic is u32 modular (2's complement).
+pri func decoder.wht!(coeff_offset: base.u32[..= 384]) {
+    var tmp : array[16] base.u32
+    var n : base.u32
+    var k : base.u32
+    var p0 : base.u32
+    var p1 : base.u32
+    var p2 : base.u32
+    var p3 : base.u32
+    var q0 : base.u32
+    var q1 : base.u32
+    var q2 : base.u32
+    var q3 : base.u32
+
+    // Vertical butterflies: column n reads every fourth coefficient.
+    n = 0
+    while n < 4,
+        inv n <= 4,
+    {
+        assert n < 4 via "a < b: a < c; c <= b"(c: 4)
+        p0 = this.mb_coeffs[args.coeff_offset + n]
+        p1 = this.mb_coeffs[args.coeff_offset + n + 4]
+        p2 = this.mb_coeffs[args.coeff_offset + n + 8]
+        p3 = this.mb_coeffs[args.coeff_offset + n + 12]
+
+        q0 = p0 ~mod+ p3
+        q1 = p1 ~mod+ p2
+        q2 = p1 ~mod- p2
+        q3 = p0 ~mod- p3
+
+        tmp[n] = q0 ~mod+ q1
+        tmp[n + 4] = q3 ~mod+ q2
+        tmp[n + 8] = q0 ~mod- q1
+        tmp[n + 12] = q3 ~mod- q2
+
+        n += 1
+    }
+
+    // Horizontal butterflies: row n occupies tmp[4 * n .. (4 * n) + 4].
+    n = 0
+    while n < 4 {
+        k = n * 4
+        p0 = tmp[k]
+        p1 = tmp[k + 1]
+        p2 = tmp[k + 2]
+        p3 = tmp[k + 3]
+
+        q0 = p0 ~mod+ p3
+        q1 = p1 ~mod+ p2
+        q2 = p1 ~mod- p2
+        q3 = p0 ~mod- p3
+
+        // Round and scale each result: asr3(value + 3).
+        tmp[k] = this.asr3!(v: (q0 ~mod+ q1) ~mod+ 3)
+        tmp[k + 1] = this.asr3!(v: (q3 ~mod+ q2) ~mod+ 3)
+        tmp[k + 2] = this.asr3!(v: (q0 ~mod- q1) ~mod+ 3)
+        tmp[k + 3] = this.asr3!(v: (q3 ~mod- q2) ~mod+ 3)
+
+        n += 1
+    }
+
+    // Scatter: result n goes to mb_coeffs[n * 16].
+    n = 0
+    while n < 16 {
+        this.mb_coeffs[n * 16] = tmp[n]
+        n += 1
+    }
+
+    // Zero the 16 input coefficients.
+    n = 0
+    while n < 16 {
+        assert (args.coeff_offset + n) < 400 via "a < b: a < c; c <= b"(c: 400)
+        this.mb_coeffs[args.coeff_offset + n] = 0
+        n += 1
+    }
+}
diff --git a/std/vp8/decode_idct_arm_neon.wuffs b/std/vp8/decode_idct_arm_neon.wuffs
new file mode 100644
index 000000000..e9dfd33a3
--- /dev/null
+++ b/std/vp8/decode_idct_arm_neon.wuffs
@@ -0,0 +1,348 @@
+// Copyright 2024 The Wuffs Authors.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+//
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+// VP8 4x4 inverse DCT, ARM NEON version using 16-bit arithmetic.
+//
+// Follows libwebp's approach: pack 4 rows into two int16x8 registers
+// (r01 = [row0 | row1], r23 = [row2 | row3]) and process both halves
+// simultaneously through the butterfly.
+//
+// Uses the vqdmulh trick: vqdmulh gives (a*b*2)>>16 = 2*mulhi(a,b).
+// For K1 (cospi8sqrt2minus1 = 20091): MUL(x) = x + mulhi(x, K1)
+//                                            = x + (vqdmulh(x, K1) >> 1)
+// For K2 (sinpi8sqrt2 = 35468):        mulhi(x, K2)
+//                                            = vqdmulh(x, K2/2) where K2/2 = 17734
+
+// idct_add_arm_neon! is the NEON implementation of idct_add!: it transforms
+// the 16 coefficients at mb_coeffs[coeff_offset ..], adds the result to the
+// prediction in dst (one row every stride bytes, clamped to [0, 255]) and
+// zeroes the coefficients. A row is written only when at least 4 bytes of dst
+// remain at its position. When dst is too short to step to the next row, the
+// remaining rows are skipped, as the scalar version does for out-of-range
+// pixels.
+pri func decoder.idct_add_arm_neon!(dst: slice base.u8, stride: base.u32, coeff_offset: base.u32[..= 384]),
+    choose cpu_arch >= arm_neon,
+{
+    var util : base.arm_neon_utility
+
+    // Load temporaries for contiguous u32 → u16 narrowing.
+    var load0 : base.arm_neon_u32x4
+    var load1 : base.arm_neon_u32x4
+    var low : base.arm_neon_u16x4
+
+    // Two packed row pairs.
+    var r01 : base.arm_neon_u16x8
+    var r23 : base.arm_neon_u16x8
+
+    // Butterfly temporaries.
+    var b1 : base.arm_neon_u16x8
+    var mul1 : base.arm_neon_u16x8
+    var c0 : base.arm_neon_u16x8
+    var c1 : base.arm_neon_u16x8
+    var a_val : base.arm_neon_u16x4
+    var b_val : base.arm_neon_u16x4
+    var c_val : base.arm_neon_u16x4
+    var d_val : base.arm_neon_u16x4
+    var d0 : base.arm_neon_u16x8
+    var d1 : base.arm_neon_u16x8
+    var e0 : base.arm_neon_u16x8
+    var e_tmp : base.arm_neon_u16x8
+    var e1 : base.arm_neon_u16x8
+
+    // Transpose temporaries.
+    var t0 : base.arm_neon_u16x8
+    var t1 : base.arm_neon_u16x8
+
+    // Output temporaries.
+    var k4 : base.arm_neon_u16x8
+    var pred01 : base.arm_neon_u8x8
+    var pred23 : base.arm_neon_u8x8
+    var pred01_w : base.arm_neon_u16x8
+    var pred23_w : base.arm_neon_u16x8
+    var out01 : base.arm_neon_u16x8
+    var out23 : base.arm_neon_u16x8
+    var out01_u8 : base.arm_neon_u8x8
+    var out23_u8 : base.arm_neon_u8x8
+    var val : base.u32
+
+    var off : base.u32[..= 384]
+    var i : base.u32
+
+    off = args.coeff_offset
+
+    // Load 16 coefficients as two u16x8 (4 coefficients per row).
+    // Load 4 u32 values at a time via contiguous load, narrow to u16x4,
+    // combine into u16x8.
+    // r01 = [row0_c0..c3 | row1_c0..c3]
+    // r23 = [row2_c0..c3 | row3_c0..c3]
+    assert off <= (off + 4) via "a <= (a + b): 0 <= b"(b: 4)
+    assert (off + 4) <= (off + 8) via "a <= (a + b): 0 <= b"(b: 4)
+    assert (off + 8) <= (off + 12) via "a <= (a + b): 0 <= b"(b: 4)
+    assert (off + 12) <= (off + 16) via "a <= (a + b): 0 <= b"(b: 4)
+    load0 = util.make_u32x4_slice_u32lex4(a: this.mb_coeffs[off .. off + 4])
+    load1 = util.make_u32x4_slice_u32lex4(a: this.mb_coeffs[off + 4 .. off + 8])
+    low = load0.vmovn_u32()
+    r01 = low.vmovn_high_u32(b: load1)
+
+    load0 = util.make_u32x4_slice_u32lex4(a: this.mb_coeffs[off + 8 .. off + 12])
+    load1 = util.make_u32x4_slice_u32lex4(a: this.mb_coeffs[off + 12 .. off + 16])
+    low = load0.vmovn_u32()
+    r23 = low.vmovn_high_u32(b: load1)
+
+    // ---- Column butterfly pass ----
+
+    // B1 = [row1 | row3] (odd rows for sin/cos multiply)
+    b1 = r01.vget_high_u16().vcombine_u16(b: r23.vget_high_u16())
+
+    // C0 = MUL(B1, K1) = B1 + (vqdmulh(B1, 20091) >> 1)
+    mul1 = b1.vqdmulhq_n_s16(b: 20091)
+    c0 = b1.vaddq_u16(b: mul1.vshrq_n_s16(b: 1))
+
+    // C1 = mulhi(B1, K2) = vqdmulh(B1, K2/2) = vqdmulh(B1, 17734)
+    c1 = b1.vqdmulhq_n_s16(b: 17734)
+
+    // a = row0 + row2, b = row0 - row2
+    a_val = r01.vget_low_u16().vadd_u16(b: r23.vget_low_u16())
+    b_val = r01.vget_low_u16().vsub_u16(b: r23.vget_low_u16())
+
+    // c = mulhi(row1, K2) - MUL(row3, K1)
+    // d = MUL(row1, K1) + mulhi(row3, K2)
+    c_val = c1.vget_low_u16().vsub_u16(b: c0.vget_high_u16())
+    d_val = c0.vget_low_u16().vadd_u16(b: c1.vget_high_u16())
+
+    // Combine: D0 = [a | b], D1 = [d | c]
+    d0 = a_val.vcombine_u16(b: b_val)
+    d1 = d_val.vcombine_u16(b: c_val)
+
+    // E0 = [a+d | b+c] = [out_row0 | out_row1]
+    e0 = d0.vaddq_u16(b: d1)
+    // E_tmp = [a-d | b-c] = [out_row3 | out_row2]
+    e_tmp = d0.vsubq_u16(b: d1)
+    // E1 = [out_row2 | out_row3] (swap halves)
+    e1 = e_tmp.vget_high_u16().vcombine_u16(b: e_tmp.vget_low_u16())
+
+    // ---- Transpose 4x4 via two rounds of zip ----
+    t0 = e0.vzip1q_u16(b: e1)
+    t1 = e0.vzip2q_u16(b: e1)
+    r01 = t0.vzip1q_u16(b: t1)
+    r23 = t0.vzip2q_u16(b: t1)
+
+    // ---- Row butterfly pass (same structure) ----
+
+    b1 = r01.vget_high_u16().vcombine_u16(b: r23.vget_high_u16())
+
+    mul1 = b1.vqdmulhq_n_s16(b: 20091)
+    c0 = b1.vaddq_u16(b: mul1.vshrq_n_s16(b: 1))
+
+    c1 = b1.vqdmulhq_n_s16(b: 17734)
+
+    a_val = r01.vget_low_u16().vadd_u16(b: r23.vget_low_u16())
+    b_val = r01.vget_low_u16().vsub_u16(b: r23.vget_low_u16())
+
+    c_val = c1.vget_low_u16().vsub_u16(b: c0.vget_high_u16())
+    d_val = c0.vget_low_u16().vadd_u16(b: c1.vget_high_u16())
+
+    d0 = a_val.vcombine_u16(b: b_val)
+    d1 = d_val.vcombine_u16(b: c_val)
+
+    e0 = d0.vaddq_u16(b: d1)
+    e_tmp = d0.vsubq_u16(b: d1)
+    e1 = e_tmp.vget_high_u16().vcombine_u16(b: e_tmp.vget_low_u16())
+
+    // ---- Transpose again ----
+    t0 = e0.vzip1q_u16(b: e1)
+    t1 = e0.vzip2q_u16(b: e1)
+    r01 = t0.vzip1q_u16(b: t1)
+    r23 = t0.vzip2q_u16(b: t1)
+
+    // ---- Add rounding bias (+4) and arithmetic right shift >>3 ----
+    k4 = util.make_u16x8_repeat(a: 4)
+    r01 = r01.vaddq_u16(b: k4)
+    r23 = r23.vaddq_u16(b: k4)
+    r01 = r01.vshrq_n_s16(b: 3)
+    r23 = r23.vshrq_n_s16(b: 3)
+
+    // ---- Add to prediction and clamp to [0, 255] ----
+    // Process row pairs (0+1, 2+3), loading predictions per row via reslicing.
+    // Each later row is nested under a successful advance of dst, so a row is
+    // never added at the position of the row before it.
+
+    // Row 0 (uses low half of r01).
+    if 4 <= args.dst.length() {
+        pred01 = util.make_u8x8_multiple(
+                a00: args.dst[0], a01: args.dst[1],
+                a02: args.dst[2], a03: args.dst[3],
+                a04: 0, a05: 0, a06: 0, a07: 0)
+        pred01_w = pred01.vmovl_u8()
+        out01 = pred01_w.vaddq_u16(b: r01)
+        out01_u8 = out01.vqmovun_s16()
+        val = out01_u8.as_u32x2().vget_lane_u32(b: 0)
+        args.dst[0] = (val & 0xFF) as base.u8
+        args.dst[1] = ((val >> 8) & 0xFF) as base.u8
+        args.dst[2] = ((val >> 16) & 0xFF) as base.u8
+        args.dst[3] = (val >> 24) as base.u8
+    }
+    if (args.stride as base.u64) <= args.dst.length() {
+        args.dst = args.dst[(args.stride as base.u64) ..]
+
+        // Row 1 (uses high half of r01).
+        if 4 <= args.dst.length() {
+            pred01 = util.make_u8x8_multiple(
+                    a00: 0, a01: 0, a02: 0, a03: 0,
+                    a04: args.dst[0], a05: args.dst[1],
+                    a06: args.dst[2], a07: args.dst[3])
+            pred01_w = pred01.vmovl_u8()
+            out01 = pred01_w.vaddq_u16(b: r01)
+            out01_u8 = out01.vqmovun_s16()
+            val = out01_u8.as_u32x2().vget_lane_u32(b: 1)
+            args.dst[0] = (val & 0xFF) as base.u8
+            args.dst[1] = ((val >> 8) & 0xFF) as base.u8
+            args.dst[2] = ((val >> 16) & 0xFF) as base.u8
+            args.dst[3] = (val >> 24) as base.u8
+        }
+        if (args.stride as base.u64) <= args.dst.length() {
+            args.dst = args.dst[(args.stride as base.u64) ..]
+
+            // Row 2 (uses low half of r23).
+            if 4 <= args.dst.length() {
+                pred23 = util.make_u8x8_multiple(
+                        a00: args.dst[0], a01: args.dst[1],
+                        a02: args.dst[2], a03: args.dst[3],
+                        a04: 0, a05: 0, a06: 0, a07: 0)
+                pred23_w = pred23.vmovl_u8()
+                out23 = pred23_w.vaddq_u16(b: r23)
+                out23_u8 = out23.vqmovun_s16()
+                val = out23_u8.as_u32x2().vget_lane_u32(b: 0)
+                args.dst[0] = (val & 0xFF) as base.u8
+                args.dst[1] = ((val >> 8) & 0xFF) as base.u8
+                args.dst[2] = ((val >> 16) & 0xFF) as base.u8
+                args.dst[3] = (val >> 24) as base.u8
+            }
+            if (args.stride as base.u64) <= args.dst.length() {
+                args.dst = args.dst[(args.stride as base.u64) ..]
+
+                // Row 3 (uses high half of r23).
+                if 4 <= args.dst.length() {
+                    pred23 = util.make_u8x8_multiple(
+                            a00: 0, a01: 0, a02: 0, a03: 0,
+                            a04: args.dst[0], a05: args.dst[1],
+                            a06: args.dst[2], a07: args.dst[3])
+                    pred23_w = pred23.vmovl_u8()
+                    out23 = pred23_w.vaddq_u16(b: r23)
+                    out23_u8 = out23.vqmovun_s16()
+                    val = out23_u8.as_u32x2().vget_lane_u32(b: 1)
+                    args.dst[0] = (val & 0xFF) as base.u8
+                    args.dst[1] = ((val >> 8) & 0xFF) as base.u8
+                    args.dst[2] = ((val >> 16) & 0xFF) as base.u8
+                    args.dst[3] = (val >> 24) as base.u8
+                }
+            }
+        }
+    }
+
+    // Clear the coefficients.
+    i = 0
+    while i < 16 {
+        assert (off + i) < 400 via "a < b: a < c; c <= b"(c: 400)
+        this.mb_coeffs[off + i] = 0
+        i += 1
+    }
+}
+
+// idct_dc_add_arm_neon! performs a DC-only IDCT add using NEON.
+// Computes dc = asr3(coeff[0] + 4), broadcasts to all lanes, then for each
+// of 4 rows: load 4 prediction bytes, widen, add dc, saturating narrow, store.
+// Replaces 16 per-pixel bounds checks + branching clamp with vqmovun_s16.
+// A row is written only when at least 4 bytes of dst remain at its position.
+// When dst is too short to step to the next row, the remaining rows are
+// skipped, as the scalar version does for out-of-range pixels.
+pri func decoder.idct_dc_add_arm_neon!(dst: slice base.u8, stride: base.u32, coeff_offset: base.u32[..= 384]),
+    choose cpu_arch >= arm_neon,
+{
+    var util : base.arm_neon_utility
+
+    var dc_vec : base.arm_neon_u16x8
+    var k4 : base.arm_neon_u16x8
+    var pred : base.arm_neon_u8x8
+    var pred_w : base.arm_neon_u16x8
+    var out : base.arm_neon_u16x8
+    var out_u8 : base.arm_neon_u8x8
+    var val : base.u32
+    var row : base.u32
+
+    // Compute DC = asr3(coeff[0] + 4) and broadcast to all 8 lanes. Only the
+    // low 16 bits of the coefficient are used; the add and the arithmetic
+    // shift are done per 16-bit lane.
+    dc_vec = util.make_u16x8_repeat(a: (this.mb_coeffs[args.coeff_offset] & 0xFFFF) as base.u16)
+    k4 = util.make_u16x8_repeat(a: 4)
+    dc_vec = dc_vec.vaddq_u16(b: k4)
+    dc_vec = dc_vec.vshrq_n_s16(b: 3)
+
+    // Clear the DC coefficient.
+    this.mb_coeffs[args.coeff_offset] = 0
+
+    // All four rows get the same treatment, so they share one loop body.
+    row = 0
+    while row < 4 {
+        if 4 <= args.dst.length() {
+            // The 4 prediction bytes go in lanes 0..3; lanes 4..7 are unused.
+            pred = util.make_u8x8_multiple(
+                    a00: args.dst[0], a01: args.dst[1],
+                    a02: args.dst[2], a03: args.dst[3],
+                    a04: 0, a05: 0, a06: 0, a07: 0)
+            pred_w = pred.vmovl_u8()
+            out = pred_w.vaddq_u16(b: dc_vec)
+            out_u8 = out.vqmovun_s16()
+            val = out_u8.as_u32x2().vget_lane_u32(b: 0)
+            args.dst[0] = (val & 0xFF) as base.u8
+            args.dst[1] = ((val >> 8) & 0xFF) as base.u8
+            args.dst[2] = ((val >> 16) & 0xFF) as base.u8
+            args.dst[3] = (val >> 24) as base.u8
+        }
+
+        // Step to the next row. If dst is too short for that, stop: adding
+        // the next row at the current position would hit the wrong pixels.
+        if (args.stride as base.u64) <= args.dst.length() {
+            args.dst = args.dst[(args.stride as base.u64) ..]
+        } else {
+            break
+        }
+        row += 1
+    }
+}
diff --git a/std/vp8/decode_idct_x86_avx2.wuffs b/std/vp8/decode_idct_x86_avx2.wuffs
new file mode 100644
index 000000000..9b36caf52
--- /dev/null
+++ b/std/vp8/decode_idct_x86_avx2.wuffs
@@ -0,0 +1,324 @@
+// Copyright 2024 The Wuffs Authors.
+// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. +// +// SPDX-License-Identifier: Apache-2.0 OR MIT + +// VP8 4x4 inverse DCT pair, AVX2 version processing 2 adjacent blocks +// simultaneously. Lower 128-bit lane = block A, upper 128-bit lane = block B. +// The butterfly operates independently per lane, doubling throughput. +// +// Uses the same mulhi_epi16 trick as the SSE4.2 version: K = k + 65536, so +// (x * K) >> 16 = mulhi(x, k) + x +// with k1 = 20091 (cos) and k2 = 35468 as u16 = -30068 as i16 (sin). + +pri func decoder.idct_add_pair_x86_avx2!(dst: slice base.u8, stride: base.u32, + coeff_offset_a: base.u32[..= 384], coeff_offset_b: base.u32[..= 384]), + choose cpu_arch >= x86_avx2, +{ + var util128 : base.x86_sse42_utility + var util256 : base.x86_avx2_utility + + var k1 : base.x86_m256i + var k2 : base.x86_m256i + var k_4 : base.x86_m256i + var k_0_128 : base.x86_m128i + + var row0 : base.x86_m256i + var row1 : base.x86_m256i + var row2 : base.x86_m256i + var row3 : base.x86_m256i + + var la : base.x86_m128i + var lb : base.x86_m128i + + var a : base.x86_m256i + var b : base.x86_m256i + var c : base.x86_m256i + var d : base.x86_m256i + var c1 : base.x86_m256i + var c2 : base.x86_m256i + var c3 : base.x86_m256i + var c4 : base.x86_m256i + var d1 : base.x86_m256i + var d2 : base.x86_m256i + var d3 : base.x86_m256i + var d4 : base.x86_m256i + + var tr0 : base.x86_m256i + var tr1 : base.x86_m256i + var tr2 : base.x86_m256i + var tr3 : base.x86_m256i + var ts0 : base.x86_m256i + var ts1 : base.x86_m256i + var ts2 : base.x86_m256i + var ts3 : base.x86_m256i + + var oa : base.x86_m128i + var ob : base.x86_m128i + + var off_a : base.u32[..= 384] + var off_b : base.u32[..= 384] + var i : base.u32 + + off_a = args.coeff_offset_a + off_b = args.coeff_offset_b + + k1 = util256.make_m256i_repeat_u16(a: 20091) + k2 = 
util256.make_m256i_repeat_u16(a: 35468) + k_4 = util256.make_m256i_repeat_u16(a: 4) + k_0_128 = util128.make_m128i_zeroes() + + // Load 4 rows from both blocks. Each row: load 4 i32, pack to i16, + // combine lanes [A | B]. + + // Row 0. + assert off_a <= (off_a + 4) via "a <= (a + b): 0 <= b"(b: 4) + assert off_b <= (off_b + 4) via "a <= (a + b): 0 <= b"(b: 4) + la = util128.make_m128i_slice_u32lex4(a: this.mb_coeffs[off_a .. off_a + 4])._mm_packs_epi32(b: k_0_128) + lb = util128.make_m128i_slice_u32lex4(a: this.mb_coeffs[off_b .. off_b + 4])._mm_packs_epi32(b: k_0_128) + row0 = la._mm256_castsi128_si256()._mm256_inserti128_si256(b: lb, imm8: 1) + + // Row 1. + assert (off_a + 4) <= (off_a + 8) via "a <= (a + b): 0 <= b"(b: 4) + assert (off_b + 4) <= (off_b + 8) via "a <= (a + b): 0 <= b"(b: 4) + la = util128.make_m128i_slice_u32lex4(a: this.mb_coeffs[off_a + 4 .. off_a + 8])._mm_packs_epi32(b: k_0_128) + lb = util128.make_m128i_slice_u32lex4(a: this.mb_coeffs[off_b + 4 .. off_b + 8])._mm_packs_epi32(b: k_0_128) + row1 = la._mm256_castsi128_si256()._mm256_inserti128_si256(b: lb, imm8: 1) + + // Row 2. + assert (off_a + 8) <= (off_a + 12) via "a <= (a + b): 0 <= b"(b: 4) + assert (off_b + 8) <= (off_b + 12) via "a <= (a + b): 0 <= b"(b: 4) + la = util128.make_m128i_slice_u32lex4(a: this.mb_coeffs[off_a + 8 .. off_a + 12])._mm_packs_epi32(b: k_0_128) + lb = util128.make_m128i_slice_u32lex4(a: this.mb_coeffs[off_b + 8 .. off_b + 12])._mm_packs_epi32(b: k_0_128) + row2 = la._mm256_castsi128_si256()._mm256_inserti128_si256(b: lb, imm8: 1) + + // Row 3. + assert (off_a + 12) <= (off_a + 16) via "a <= (a + b): 0 <= b"(b: 4) + assert (off_b + 12) <= (off_b + 16) via "a <= (a + b): 0 <= b"(b: 4) + la = util128.make_m128i_slice_u32lex4(a: this.mb_coeffs[off_a + 12 .. off_a + 16])._mm_packs_epi32(b: k_0_128) + lb = util128.make_m128i_slice_u32lex4(a: this.mb_coeffs[off_b + 12 .. 
off_b + 16])._mm_packs_epi32(b: k_0_128) + row3 = la._mm256_castsi128_si256()._mm256_inserti128_si256(b: lb, imm8: 1) + + // ---- Column pass (16-bit butterfly, per-lane independent) ---- + a = row0._mm256_add_epi16(b: row2) + b = row0._mm256_sub_epi16(b: row2) + + c1 = row1._mm256_mulhi_epi16(b: k2) + c2 = row3._mm256_mulhi_epi16(b: k1) + c3 = row1._mm256_sub_epi16(b: row3) + c4 = c1._mm256_sub_epi16(b: c2) + c = c3._mm256_add_epi16(b: c4) + + d1 = row1._mm256_mulhi_epi16(b: k1) + d2 = row3._mm256_mulhi_epi16(b: k2) + d3 = row1._mm256_add_epi16(b: row3) + d4 = d1._mm256_add_epi16(b: d2) + d = d3._mm256_add_epi16(b: d4) + + row0 = a._mm256_add_epi16(b: d) + row1 = b._mm256_add_epi16(b: c) + row2 = b._mm256_sub_epi16(b: c) + row3 = a._mm256_sub_epi16(b: d) + + // ---- Transpose 4x4 i16 per lane (upper halves stay zero) ---- + tr0 = row0._mm256_unpacklo_epi16(b: row1) + tr1 = row2._mm256_unpacklo_epi16(b: row3) + tr2 = row0._mm256_unpackhi_epi16(b: row1) + tr3 = row2._mm256_unpackhi_epi16(b: row3) + ts0 = tr0._mm256_unpacklo_epi32(b: tr1) + ts1 = tr0._mm256_unpackhi_epi32(b: tr1) + ts2 = tr2._mm256_unpacklo_epi32(b: tr3) + ts3 = tr2._mm256_unpackhi_epi32(b: tr3) + row0 = ts0._mm256_unpacklo_epi64(b: ts2) + row1 = ts0._mm256_unpackhi_epi64(b: ts2) + row2 = ts1._mm256_unpacklo_epi64(b: ts3) + row3 = ts1._mm256_unpackhi_epi64(b: ts3) + + // ---- Row pass: butterfly with +4 rounding bias and >>3 ---- + row0 = row0._mm256_add_epi16(b: k_4) + + a = row0._mm256_add_epi16(b: row2) + b = row0._mm256_sub_epi16(b: row2) + + c1 = row1._mm256_mulhi_epi16(b: k2) + c2 = row3._mm256_mulhi_epi16(b: k1) + c3 = row1._mm256_sub_epi16(b: row3) + c4 = c1._mm256_sub_epi16(b: c2) + c = c3._mm256_add_epi16(b: c4) + + d1 = row1._mm256_mulhi_epi16(b: k1) + d2 = row3._mm256_mulhi_epi16(b: k2) + d3 = row1._mm256_add_epi16(b: row3) + d4 = d1._mm256_add_epi16(b: d2) + d = d3._mm256_add_epi16(b: d4) + + row0 = a._mm256_add_epi16(b: d)._mm256_srai_epi16(imm8: 3) + row1 = b._mm256_add_epi16(b: 
c)._mm256_srai_epi16(imm8: 3) + row2 = b._mm256_sub_epi16(b: c)._mm256_srai_epi16(imm8: 3) + row3 = a._mm256_sub_epi16(b: d)._mm256_srai_epi16(imm8: 3) + + // ---- Transpose again ---- + tr0 = row0._mm256_unpacklo_epi16(b: row1) + tr1 = row2._mm256_unpacklo_epi16(b: row3) + tr2 = row0._mm256_unpackhi_epi16(b: row1) + tr3 = row2._mm256_unpackhi_epi16(b: row3) + ts0 = tr0._mm256_unpacklo_epi32(b: tr1) + ts1 = tr0._mm256_unpackhi_epi32(b: tr1) + ts2 = tr2._mm256_unpacklo_epi32(b: tr3) + ts3 = tr2._mm256_unpackhi_epi32(b: tr3) + row0 = ts0._mm256_unpacklo_epi64(b: ts2) + row1 = ts0._mm256_unpackhi_epi64(b: ts2) + row2 = ts1._mm256_unpacklo_epi64(b: ts3) + row3 = ts1._mm256_unpackhi_epi64(b: ts3) + + // ---- Add to prediction and clamp to [0, 255] ---- + // Extract IDCT results from 256-bit lanes to 128-bit, combine A+B in one + // m128i, load 8 prediction bytes, zero-extend, add, clamp, store as u64. + + // Row 0. + if 8 <= args.dst.length() { + assert args.dst.length() >= 8 via "a >= b: b <= a"() + oa = row0._mm256_castsi256_si128() + ob = row0._mm256_extracti128_si256(imm8: 1) + la = oa._mm_unpacklo_epi64(b: ob) + lb = util128.make_m128i_single_u64(a: args.dst.peek_u64le())._mm_unpacklo_epi8(b: k_0_128) + la = lb._mm_add_epi16(b: la)._mm_packus_epi16(b: la) + args.dst.poke_u64le!(a: la.truncate_u64()) + } + if (args.stride as base.u64) <= args.dst.length() { + args.dst = args.dst[(args.stride as base.u64) ..] + } + + // Row 1. + if 8 <= args.dst.length() { + assert args.dst.length() >= 8 via "a >= b: b <= a"() + oa = row1._mm256_castsi256_si128() + ob = row1._mm256_extracti128_si256(imm8: 1) + la = oa._mm_unpacklo_epi64(b: ob) + lb = util128.make_m128i_single_u64(a: args.dst.peek_u64le())._mm_unpacklo_epi8(b: k_0_128) + la = lb._mm_add_epi16(b: la)._mm_packus_epi16(b: la) + args.dst.poke_u64le!(a: la.truncate_u64()) + } + if (args.stride as base.u64) <= args.dst.length() { + args.dst = args.dst[(args.stride as base.u64) ..] + } + + // Row 2. 
// idct_dc_add_pair_x86_avx2! adds DC-only offsets to two adjacent 4x4 blocks.
// Pure SSE42 output: precompute combined DC in one m128i, no 256-bit ops needed.
//
// Arguments:
//  - dst: pixels to update in place. Each row touches dst[0 .. 8]: bytes 0-3
//    receive block A's DC term and bytes 4-7 receive block B's.
//  - stride: number of bytes dst is stepped forward between rows.
//  - coeff_offset_a, coeff_offset_b: indexes into this.mb_coeffs of each
//    block's DC coefficient. Both entries are reset to zero.
//
// A row is skipped when fewer than 8 bytes of dst remain, and dst is only
// advanced when at least stride bytes remain.
// NOTE(review): if dst is too short to advance but still has 8 bytes, the
// next row's add lands on the same bytes again - confirm callers always pass
// a dst that covers all four rows.
pri func decoder.idct_dc_add_pair_x86_avx2!(dst: slice base.u8, stride: base.u32,
        coeff_offset_a: base.u32[..= 384], coeff_offset_b: base.u32[..= 384]),
    choose cpu_arch >= x86_avx2,
{
    var util128 : base.x86_sse42_utility

    var k_0 : base.x86_m128i
    var dc : base.x86_m128i
    var la : base.x86_m128i
    var lb : base.x86_m128i

    var off_a : base.u32[..= 384]
    var off_b : base.u32[..= 384]
    var dc_a : base.u32
    var dc_b : base.u32

    off_a = args.coeff_offset_a
    off_b = args.coeff_offset_b

    k_0 = util128.make_m128i_zeroes()

    // Compute DC for each block: asr3(dc + 4).
    //
    // The u32 holds a two's complement value. (x >> 3) is a logical shift, so
    // the top three bits are refilled by hand: (0 - (x >> 31)) is all-ones
    // for a negative x and zero otherwise, and << 29 keeps just those bits.
    dc_a = this.mb_coeffs[off_a] ~mod+ 4
    dc_a = (dc_a >> 3) | ((0 ~mod- (dc_a >> 31)) ~mod<< 29)
    this.mb_coeffs[off_a] = 0

    dc_b = this.mb_coeffs[off_b] ~mod+ 4
    dc_b = (dc_b >> 3) | ((0 ~mod- (dc_b >> 31)) ~mod<< 29)
    this.mb_coeffs[off_b] = 0

    // Combine: [dc_a×4, dc_b×4] as 8 i16 in one m128i.
    la = util128.make_m128i_repeat_u16(a: (dc_a & 0xFFFF) as base.u16)
    lb = util128.make_m128i_repeat_u16(a: (dc_b & 0xFFFF) as base.u16)
    dc = la._mm_unpacklo_epi64(b: lb)

    // Each row below: load 8 bytes, widen them to 16-bit lanes by
    // interleaving with zero, add dc, then pack back to bytes with unsigned
    // saturation. Only the low 64 bits of the packed result are written, so
    // the second packus operand (the widened input) never reaches dst.

    // Row 0.
    if 8 <= args.dst.length() {
        assert args.dst.length() >= 8 via "a >= b: b <= a"()
        la = util128.make_m128i_single_u64(a: args.dst.peek_u64le())._mm_unpacklo_epi8(b: k_0)
        la = la._mm_add_epi16(b: dc)._mm_packus_epi16(b: la)
        args.dst.poke_u64le!(a: la.truncate_u64())
    }
    if (args.stride as base.u64) <= args.dst.length() {
        args.dst = args.dst[(args.stride as base.u64) ..]
    }

    // Row 1.
    if 8 <= args.dst.length() {
        assert args.dst.length() >= 8 via "a >= b: b <= a"()
        la = util128.make_m128i_single_u64(a: args.dst.peek_u64le())._mm_unpacklo_epi8(b: k_0)
        la = la._mm_add_epi16(b: dc)._mm_packus_epi16(b: la)
        args.dst.poke_u64le!(a: la.truncate_u64())
    }
    if (args.stride as base.u64) <= args.dst.length() {
        args.dst = args.dst[(args.stride as base.u64) ..]
    }

    // Row 2.
    if 8 <= args.dst.length() {
        assert args.dst.length() >= 8 via "a >= b: b <= a"()
        la = util128.make_m128i_single_u64(a: args.dst.peek_u64le())._mm_unpacklo_epi8(b: k_0)
        la = la._mm_add_epi16(b: dc)._mm_packus_epi16(b: la)
        args.dst.poke_u64le!(a: la.truncate_u64())
    }
    if (args.stride as base.u64) <= args.dst.length() {
        args.dst = args.dst[(args.stride as base.u64) ..]
    }

    // Row 3. No stride step follows: this is the last row.
    if 8 <= args.dst.length() {
        assert args.dst.length() >= 8 via "a >= b: b <= a"()
        la = util128.make_m128i_single_u64(a: args.dst.peek_u64le())._mm_unpacklo_epi8(b: k_0)
        la = la._mm_add_epi16(b: dc)._mm_packus_epi16(b: la)
        args.dst.poke_u64le!(a: la.truncate_u64())
    }
}
// VP8 4x4 inverse DCT, SSE4.2 version using 16-bit arithmetic.
//
// Uses the mulhi_epi16 trick from libwebp: K = k + 65536, so
//   (x * K) >> 16 = mulhi(x, k) + x
// with k1 = 20091 (for cos) and k2 = -30068 = 35468 as u16 (for sin).
//
// Coefficients are loaded as i32, packed to i16 (they fit in 12 bits),
// and the entire butterfly operates in 16-bit with zero upper halves.

// idct_add_x86_sse42! runs the full 4x4 inverse transform on the 16
// coefficients at this.mb_coeffs[coeff_offset .. coeff_offset + 16], adds the
// result to the pixels already in dst and zeroes those 16 coefficients.
//
// Arguments:
//  - dst: pixels to update in place; each of the four rows touches dst[0 .. 4].
//  - stride: number of bytes dst is stepped forward between rows.
//  - coeff_offset: index of the block's first coefficient in this.mb_coeffs.
//
// A row is skipped when fewer than 4 bytes of dst remain, and dst is only
// advanced when at least stride bytes remain. The coefficients are cleared
// regardless of how many rows were written.
pri func decoder.idct_add_x86_sse42!(dst: slice base.u8, stride: base.u32, coeff_offset: base.u32[..= 384]),
    choose cpu_arch >= x86_sse42,
{
    var util : base.x86_sse42_utility

    // 16-bit constants.
    var k1  : base.x86_m128i  // 20091 (k for K1 = cos factor)
    var k2  : base.x86_m128i  // 35468 as u16 = -30068 as i16 (k for K2 = sin factor)
    var k_4 : base.x86_m128i  // rounding bias
    var k_0 : base.x86_m128i  // zeroes

    // Rows loaded from mb_coeffs (i32), then packed to i16.
    var row0 : base.x86_m128i
    var row1 : base.x86_m128i
    var row2 : base.x86_m128i
    var row3 : base.x86_m128i

    // i32 loading temporaries.
    var load0 : base.x86_m128i
    var load1 : base.x86_m128i
    var load2 : base.x86_m128i
    var load3 : base.x86_m128i

    // Butterfly temporaries.
    var a : base.x86_m128i
    var b : base.x86_m128i
    var c : base.x86_m128i
    var d : base.x86_m128i
    var c1 : base.x86_m128i
    var c2 : base.x86_m128i
    var c3 : base.x86_m128i
    var c4 : base.x86_m128i
    var d1 : base.x86_m128i
    var d2 : base.x86_m128i
    var d3 : base.x86_m128i
    var d4 : base.x86_m128i

    // Transpose temporaries.
    var tr0 : base.x86_m128i
    var tr1 : base.x86_m128i
    var tr2 : base.x86_m128i
    var tr3 : base.x86_m128i
    var ts0 : base.x86_m128i
    var ts1 : base.x86_m128i
    var ts2 : base.x86_m128i
    var ts3 : base.x86_m128i

    // Output temporaries.
    var pred : base.x86_m128i
    var pred16 : base.x86_m128i
    var sum : base.x86_m128i
    var out : base.x86_m128i

    var off : base.u32[..= 384]
    var i : base.u32

    off = args.coeff_offset

    // Set up 16-bit constants.
    k1 = util.make_m128i_repeat_u16(a: 20091)
    k2 = util.make_m128i_repeat_u16(a: 35468)  // = -30068 as i16
    k_4 = util.make_m128i_repeat_u16(a: 4)
    k_0 = util.make_m128i_zeroes()

    // Load 4 rows of 4 coefficients as contiguous i32x4, pack to i16x8.
    // The asserts establish that each slice's lower bound does not exceed its
    // upper bound, which the slice expressions below depend on.
    assert off <= (off + 4) via "a <= (a + b): 0 <= b"(b: 4)
    assert (off + 4) <= (off + 8) via "a <= (a + b): 0 <= b"(b: 4)
    assert (off + 8) <= (off + 12) via "a <= (a + b): 0 <= b"(b: 4)
    assert (off + 12) <= (off + 16) via "a <= (a + b): 0 <= b"(b: 4)
    load0 = util.make_m128i_slice_u32lex4(a: this.mb_coeffs[off .. off + 4])
    load1 = util.make_m128i_slice_u32lex4(a: this.mb_coeffs[off + 4 .. off + 8])
    load2 = util.make_m128i_slice_u32lex4(a: this.mb_coeffs[off + 8 .. off + 12])
    load3 = util.make_m128i_slice_u32lex4(a: this.mb_coeffs[off + 12 .. off + 16])

    // Pack i32 to i16 with signed saturation. Coefficients are 12-bit so no clipping.
    // Upper half zeroed since we pack with k_0.
    row0 = load0._mm_packs_epi32(b: k_0)  // [c0,c1,c2,c3, 0,0,0,0]
    row1 = load1._mm_packs_epi32(b: k_0)
    row2 = load2._mm_packs_epi32(b: k_0)
    row3 = load3._mm_packs_epi32(b: k_0)

    // ---- Column pass (16-bit butterfly) ----
    // a = row0 + row2, b = row0 - row2
    a = row0._mm_add_epi16(b: row2)
    b = row0._mm_sub_epi16(b: row2)

    // c = MUL(row1, K2) - MUL(row3, K1)
    //   = (mulhi(row1,k2) + row1) - (mulhi(row3,k1) + row3)
    //   = (row1 - row3) + (mulhi(row1,k2) - mulhi(row3,k1))
    c1 = row1._mm_mulhi_epi16(b: k2)
    c2 = row3._mm_mulhi_epi16(b: k1)
    c3 = row1._mm_sub_epi16(b: row3)
    c4 = c1._mm_sub_epi16(b: c2)
    c = c3._mm_add_epi16(b: c4)

    // d = MUL(row1, K1) + MUL(row3, K2)
    //   = (mulhi(row1,k1) + row1) + (mulhi(row3,k2) + row3)
    //   = (row1 + row3) + (mulhi(row1,k1) + mulhi(row3,k2))
    d1 = row1._mm_mulhi_epi16(b: k1)
    d2 = row3._mm_mulhi_epi16(b: k2)
    d3 = row1._mm_add_epi16(b: row3)
    d4 = d1._mm_add_epi16(b: d2)
    d = d3._mm_add_epi16(b: d4)

    row0 = a._mm_add_epi16(b: d)
    row1 = b._mm_add_epi16(b: c)
    row2 = b._mm_sub_epi16(b: c)
    row3 = a._mm_sub_epi16(b: d)

    // ---- Transpose 4x4 i16 matrix (upper half stays zero) ----
    tr0 = row0._mm_unpacklo_epi16(b: row1)
    tr1 = row2._mm_unpacklo_epi16(b: row3)
    tr2 = row0._mm_unpackhi_epi16(b: row1)
    tr3 = row2._mm_unpackhi_epi16(b: row3)
    ts0 = tr0._mm_unpacklo_epi32(b: tr1)
    ts1 = tr0._mm_unpackhi_epi32(b: tr1)
    ts2 = tr2._mm_unpacklo_epi32(b: tr3)
    ts3 = tr2._mm_unpackhi_epi32(b: tr3)
    row0 = ts0._mm_unpacklo_epi64(b: ts2)
    row1 = ts0._mm_unpackhi_epi64(b: ts2)
    row2 = ts1._mm_unpacklo_epi64(b: ts3)
    row3 = ts1._mm_unpackhi_epi64(b: ts3)

    // ---- Row pass: same butterfly, with +4 rounding bias and >>3 ----
    // The bias is added to row0 only; it reaches all four outputs because
    // row0 feeds both a and b.
    row0 = row0._mm_add_epi16(b: k_4)

    a = row0._mm_add_epi16(b: row2)
    b = row0._mm_sub_epi16(b: row2)

    c1 = row1._mm_mulhi_epi16(b: k2)
    c2 = row3._mm_mulhi_epi16(b: k1)
    c3 = row1._mm_sub_epi16(b: row3)
    c4 = c1._mm_sub_epi16(b: c2)
    c = c3._mm_add_epi16(b: c4)

    d1 = row1._mm_mulhi_epi16(b: k1)
    d2 = row3._mm_mulhi_epi16(b: k2)
    d3 = row1._mm_add_epi16(b: row3)
    d4 = d1._mm_add_epi16(b: d2)
    d = d3._mm_add_epi16(b: d4)

    row0 = a._mm_add_epi16(b: d)._mm_srai_epi16(imm8: 3)
    row1 = b._mm_add_epi16(b: c)._mm_srai_epi16(imm8: 3)
    row2 = b._mm_sub_epi16(b: c)._mm_srai_epi16(imm8: 3)
    row3 = a._mm_sub_epi16(b: d)._mm_srai_epi16(imm8: 3)

    // ---- Transpose again to get row-major output ----
    tr0 = row0._mm_unpacklo_epi16(b: row1)
    tr1 = row2._mm_unpacklo_epi16(b: row3)
    tr2 = row0._mm_unpackhi_epi16(b: row1)
    tr3 = row2._mm_unpackhi_epi16(b: row3)
    ts0 = tr0._mm_unpacklo_epi32(b: tr1)
    ts1 = tr0._mm_unpackhi_epi32(b: tr1)
    ts2 = tr2._mm_unpacklo_epi32(b: tr3)
    ts3 = tr2._mm_unpackhi_epi32(b: tr3)
    row0 = ts0._mm_unpacklo_epi64(b: ts2)
    row1 = ts0._mm_unpackhi_epi64(b: ts2)
    row2 = ts1._mm_unpacklo_epi64(b: ts3)
    row3 = ts1._mm_unpackhi_epi64(b: ts3)

    // ---- Add to prediction and clamp to [0, 255] ----
    // Load 4 prediction bytes, zero-extend to i16, add IDCT, packus to u8.

    // Row 0.
    if 4 <= args.dst.length() {
        assert args.dst.length() >= 4 via "a >= b: b <= a"()
        pred = util.make_m128i_single_u32(a: args.dst.peek_u32le())
        pred16 = pred._mm_unpacklo_epi8(b: k_0)
        sum = pred16._mm_add_epi16(b: row0)
        out = sum._mm_packus_epi16(b: sum)
        args.dst.poke_u32le!(a: out.truncate_u32())
    }
    if (args.stride as base.u64) <= args.dst.length() {
        args.dst = args.dst[(args.stride as base.u64) ..]
    }

    // Row 1.
    if 4 <= args.dst.length() {
        assert args.dst.length() >= 4 via "a >= b: b <= a"()
        pred = util.make_m128i_single_u32(a: args.dst.peek_u32le())
        pred16 = pred._mm_unpacklo_epi8(b: k_0)
        sum = pred16._mm_add_epi16(b: row1)
        out = sum._mm_packus_epi16(b: sum)
        args.dst.poke_u32le!(a: out.truncate_u32())
    }
    if (args.stride as base.u64) <= args.dst.length() {
        args.dst = args.dst[(args.stride as base.u64) ..]
    }

    // Row 2.
    if 4 <= args.dst.length() {
        assert args.dst.length() >= 4 via "a >= b: b <= a"()
        pred = util.make_m128i_single_u32(a: args.dst.peek_u32le())
        pred16 = pred._mm_unpacklo_epi8(b: k_0)
        sum = pred16._mm_add_epi16(b: row2)
        out = sum._mm_packus_epi16(b: sum)
        args.dst.poke_u32le!(a: out.truncate_u32())
    }
    if (args.stride as base.u64) <= args.dst.length() {
        args.dst = args.dst[(args.stride as base.u64) ..]
    }

    // Row 3. No stride step follows: this is the last row.
    if 4 <= args.dst.length() {
        assert args.dst.length() >= 4 via "a >= b: b <= a"()
        pred = util.make_m128i_single_u32(a: args.dst.peek_u32le())
        pred16 = pred._mm_unpacklo_epi8(b: k_0)
        sum = pred16._mm_add_epi16(b: row3)
        out = sum._mm_packus_epi16(b: sum)
        args.dst.poke_u32le!(a: out.truncate_u32())
    }

    // Clear the coefficients.
    // off <= 384 and i < 16, so off + i stays below 400; the assert hands
    // that bound to the index expression.
    i = 0
    while i < 16 {
        assert (off + i) < 400 via "a < b: a < c; c <= b"(c: 400)
        this.mb_coeffs[off + i] = 0
        i += 1
    }
}
// idct_dc_add_x86_sse42! adds a constant DC offset to a 4x4 block.
// Only the first coefficient is non-zero; this avoids the full IDCT butterfly.
// Uses SSE to vectorize the add + clamp-to-[0,255] across 4 pixels per row.
//
// Arguments:
//  - dst: pixels to update in place; each of the four rows touches dst[0 .. 4].
//  - stride: number of bytes dst is stepped forward between rows.
//  - coeff_offset: index into this.mb_coeffs of the block's DC coefficient,
//    which is reset to zero. No other coefficient is read or written.
//
// A row is skipped when fewer than 4 bytes of dst remain, and dst is only
// advanced when at least stride bytes remain.
pri func decoder.idct_dc_add_x86_sse42!(dst: slice base.u8, stride: base.u32, coeff_offset: base.u32[..= 384]),
    choose cpu_arch >= x86_sse42,
{
    var util : base.x86_sse42_utility

    var k_0 : base.x86_m128i
    var dc16 : base.x86_m128i
    var pred : base.x86_m128i
    var pred16 : base.x86_m128i
    var sum : base.x86_m128i
    var out : base.x86_m128i

    var off : base.u32[..= 384]
    var dc : base.u32

    off = args.coeff_offset
    k_0 = util.make_m128i_zeroes()

    // Compute DC offset: arithmetic shift right by 3 with rounding.
    // asr3(v) = (v >> 3) | ((0 - (v >> 31)) << 29)
    // (v >> 31) is the sign bit, so (0 - sign) is all-ones for a negative v;
    // shifting that left by 29 restores the three bits the logical >> 3 lost.
    dc = this.mb_coeffs[off] ~mod+ 4
    dc = (dc >> 3) | ((0 ~mod- (dc >> 31)) ~mod<< 29)
    this.mb_coeffs[off] = 0

    // Broadcast DC to all i16 lanes. The u16 bit pattern is interpreted as
    // signed i16 by _mm_add_epi16, giving correct signed addition.
    dc16 = util.make_m128i_repeat_u16(a: (dc & 0xFFFF) as base.u16)

    // Row 0.
    if 4 <= args.dst.length() {
        assert args.dst.length() >= 4 via "a >= b: b <= a"()
        pred = util.make_m128i_single_u32(a: args.dst.peek_u32le())
        pred16 = pred._mm_unpacklo_epi8(b: k_0)
        sum = pred16._mm_add_epi16(b: dc16)
        out = sum._mm_packus_epi16(b: sum)
        args.dst.poke_u32le!(a: out.truncate_u32())
    }
    if (args.stride as base.u64) <= args.dst.length() {
        args.dst = args.dst[(args.stride as base.u64) ..]
    }

    // Row 1.
    if 4 <= args.dst.length() {
        assert args.dst.length() >= 4 via "a >= b: b <= a"()
        pred = util.make_m128i_single_u32(a: args.dst.peek_u32le())
        pred16 = pred._mm_unpacklo_epi8(b: k_0)
        sum = pred16._mm_add_epi16(b: dc16)
        out = sum._mm_packus_epi16(b: sum)
        args.dst.poke_u32le!(a: out.truncate_u32())
    }
    if (args.stride as base.u64) <= args.dst.length() {
        args.dst = args.dst[(args.stride as base.u64) ..]
    }

    // Row 2.
    if 4 <= args.dst.length() {
        assert args.dst.length() >= 4 via "a >= b: b <= a"()
        pred = util.make_m128i_single_u32(a: args.dst.peek_u32le())
        pred16 = pred._mm_unpacklo_epi8(b: k_0)
        sum = pred16._mm_add_epi16(b: dc16)
        out = sum._mm_packus_epi16(b: sum)
        args.dst.poke_u32le!(a: out.truncate_u32())
    }
    if (args.stride as base.u64) <= args.dst.length() {
        args.dst = args.dst[(args.stride as base.u64) ..]
    }

    // Row 3. No stride step follows: this is the last row.
    if 4 <= args.dst.length() {
        assert args.dst.length() >= 4 via "a >= b: b <= a"()
        pred = util.make_m128i_single_u32(a: args.dst.peek_u32le())
        pred16 = pred._mm_unpacklo_epi8(b: k_0)
        sum = pred16._mm_add_epi16(b: dc16)
        out = sum._mm_packus_epi16(b: sum)
        args.dst.poke_u32le!(a: out.truncate_u32())
    }
}
// decode_frame_mb! decodes every macroblock row of the frame.
//
// It first prepares the coefficient reader (the "p1" state): with more than
// one partition, the partition sizes and all remaining src bytes are copied
// into workbuf and per-partition reader state is set up; otherwise
// coefficient bytes are pulled from src as decoding proceeds. It then walks
// the macroblock grid row by row, decoding each macroblock, and after each
// row it loop-filters and swizzles the row before it. The final row is
// filtered and swizzled after the loop.
//
// Arguments:
//  - src: reader positioned at the coefficient data.
//  - dst: pixel buffer handed to swizzle_mb_row!.
//  - workbuf: scratch memory; in multi-partition mode the coefficient bytes
//    are stored starting at workbuf_offset_v_end + partition0_size.
//
// Returns the status of the last swizzle_mb_row! call.
// NOTE(review): each row's status overwrites the previous one, so a failure
// on an earlier row is only reported if the last row fails too - confirm
// that is intended.
pri func decoder.decode_frame_mb!(src: base.io_reader, dst: ptr base.pixel_buffer, workbuf: slice base.u8) base.status {
    var swizzle_status : base.status
    var prev_mby : base.u32[..= 0x3FF]
    var i : base.u32
    var part_size : base.u32
    var total_size : base.u32
    var n_copied : base.u32
    var coeff_start : base.u64
    var off : base.u64
    var new_part : base.u32
    var p : base.u32[..= 7]
    var unconsumed : base.u32

    if this.num_partitions > 1 {
        // Multi-partition: read partition sizes, copy all coefficient data
        // to workbuf, and set up per-partition boolean decoder state.
        this.multi_partition = true

        // Read (num_partitions - 1) 3-byte LE partition sizes.
        // A size that cannot be read (fewer than 3 bytes left) counts as 0.
        total_size = 0
        i = 0
        while (i < 7) and ((i + 1) < this.num_partitions) {
            if args.src.length() >= 3 {
                part_size = args.src.peek_u24le_as_u32()
                args.src.skip_u32_fast!(actual: 3, worst_case: 3)
            } else {
                part_size = 0
            }
            // Always true here (the loop keeps i < 7); presumably present to
            // give the array index an explicit bound.
            if i < 8 {
                this.part_wbuf_size[i] = part_size
            }
            total_size ~mod+= part_size
            i += 1
        }

        // Copy all coefficient data from src to workbuf.
        // Every available src byte is consumed. Bytes that would land past
        // the end of workbuf are dropped but still counted in n_copied.
        coeff_start = this.workbuf_offset_v_end ~mod+ (this.partition0_size as base.u64)
        off = coeff_start
        n_copied = 0
        while args.src.length() > 0 {
            if off < args.workbuf.length() {
                args.workbuf[off] = args.src.peek_u8()
            }
            args.src.skip_u32_fast!(actual: 1, worst_case: 1)
            off ~mod+= 1
            n_copied ~mod+= 1
        }

        // Last partition size = total copied - sum of first (N-1) sizes.
        if this.num_partitions > 0 {
            i = this.num_partitions - 1
        } else {
            i = 0
        }
        if i < 8 {
            if n_copied > total_size {
                this.part_wbuf_size[i] = n_copied - total_size
            } else {
                this.part_wbuf_size[i] = 0
            }
        }

        // Compute per-partition workbuf offsets.
        off = coeff_start
        i = 0
        while (i < this.num_partitions) and (i < 8) {
            this.part_wbuf_offset[i] = off
            off ~mod+= this.part_wbuf_size[i] as base.u64
            i += 1
        }

        // Initialize per-partition boolean decoder state.
        // NOTE(review): only partition 0 goes through p1_init! below; the
        // others start from exactly these values when first switched to -
        // confirm they match what p1_init! would produce.
        i = 0
        while i < 8 {
            this.part_range[i] = 254
            this.part_value[i] = 0
            this.part_bits[i] = 0
            this.part_wbuf_ri[i] = 0
            i += 1
        }

        // Activate partition 0.
        this.current_partition = 0
        this.current_part_wbuf_ri = 0
        this.p1_ri = 0
        this.p1_wi = 0
        this.p1_fill_from_workbuf!(workbuf: args.workbuf)
        this.p1_init!()
    } else {
        // Single partition: stream coefficient data from src.
        this.multi_partition = false
        this.p1_ri = 0
        this.p1_wi = 0
        this.p1_fill_buffer!(src: args.src, n: 0x1000)
        this.p1_init!()
    }

    // Start the frame with all-zero "above" context.
    this.above_nz[.. 8200].bulk_memset!(byte_value: 0)
    this.above_nz_y2[.. 0x401].bulk_memset!(byte_value: 0)
    this.above_modes[.. 4096].bulk_memset!(byte_value: 0)

    // Interleaved decode-filter loop: decode row N, then filter row N-1.
    //
    // The prediction for row N reads the above row's UNFILTERED pixels (per
    // RFC 6386 §15, the loop filter is post-reconstruction). So we must decode
    // row N before filtering row N-1. After decoding row N, row N-1's data is
    // hot in L1 cache (accessed during row N's prediction), making the filter
    // fast. This eliminates the separate full-image filter pass.
    //
    // Filter parameters use a 2-row ring buffer indexed by (mb_y & 1) * 0x400.
    this.mb_y = 0
    while this.mb_y < this.mb_height {
        // Multi-partition: switch to the correct partition for this MB row.
        if this.multi_partition and (this.mb_y > 0) {
            // Save current partition state.
            p = this.current_partition
            this.part_range[p] = this.p1_range
            this.part_value[p] = this.p1_value
            this.part_bits[p] = this.p1_bits
            // Adjust wbuf_ri for unconsumed bytes still in p1_buffer.
            if this.p1_wi >= this.p1_ri {
                unconsumed = this.p1_wi - this.p1_ri
            } else {
                unconsumed = 0
            }
            this.part_wbuf_ri[p] = this.current_part_wbuf_ri ~sat- unconsumed

            // Advance to next partition (round-robin).
            new_part = (this.current_partition as base.u32) + 1
            if new_part >= this.num_partitions {
                new_part = 0
            }

            // Restore new partition state.
            if new_part < 8 {
                this.p1_range = this.part_range[new_part] & 0xFF
                this.p1_value = this.part_value[new_part]
                this.p1_bits = this.part_bits[new_part]
                this.current_part_wbuf_ri = this.part_wbuf_ri[new_part]
                this.current_partition = new_part as base.u32[..= 7]
            }

            // Refill p1_buffer from new partition's workbuf region.
            this.p1_ri = 0
            this.p1_wi = 0
            this.p1_fill_from_workbuf!(workbuf: args.workbuf)
        }

        // Zero the current row's filter params in the ring buffer.
        if (this.mb_y & 1) == 0 {
            this.mb_filter_level[0x000 .. 0x400].bulk_memset!(byte_value: 0)
            this.mb_filter_inner[0x000 .. 0x400].bulk_memset!(byte_value: 0)
        } else {
            this.mb_filter_level[0x400 .. 0x800].bulk_memset!(byte_value: 0)
            this.mb_filter_inner[0x400 .. 0x800].bulk_memset!(byte_value: 0)
        }

        // Each row starts with all-zero "left" context.
        this.left_nz[.. 8].bulk_memset!(byte_value: 0)
        this.left_nz_y2 = 0
        this.left_modes[.. 4].bulk_memset!(byte_value: 0)

        this.mb_x = 0
        while this.mb_x < this.mb_width {
            // Refill partition 0 (mode data) buffer when running low.
            if (this.bool_ri ~mod+ 256) >= this.bool_wi {
                this.bool_fill_from_workbuf!(workbuf: args.workbuf)
            }

            // Refill partition 1 (coefficient data) buffer when running low.
            // Threshold 2048 ensures the buffer has enough data for a full
            // MB's coefficient decode (~1200 bytes worst case), preventing
            // mid-block buffer exhaustion that would corrupt the bool state.
            if (this.p1_ri ~mod+ 2048) >= this.p1_wi {
                if this.multi_partition {
                    this.p1_fill_from_workbuf!(workbuf: args.workbuf)
                } else {
                    this.p1_fill_buffer!(src: args.src, n: 0x800)
                }
            }

            this.decode_one_mb!(workbuf: args.workbuf)

            // The guard caps mb_x at 0x3FF.
            // NOTE(review): the loop only terminates if mb_width <= 0x3FF -
            // confirm where mb_width is set.
            if this.mb_x < 0x3FF {
                this.mb_x += 1
            }
        }

        // Filter the PREVIOUS row (now safe: this row's prediction already
        // used the previous row's unfiltered pixels), then swizzle to output
        // while the data is still cache-hot.
        if this.mb_y > 0 {
            prev_mby = this.mb_y - 1
            if (this.filter_type == 1) and (this.filter_level > 0) {
                this.apply_simple_filter_row!(workbuf: args.workbuf, mby: prev_mby)
            } else if this.filter_level > 0 {
                this.apply_normal_filter_row!(workbuf: args.workbuf, mby: prev_mby)
            }
            swizzle_status = this.swizzle_mb_row!(dst: args.dst, workbuf: args.workbuf, mby: prev_mby, is_last: false)
        }

        // Same cap as for mb_x.
        // NOTE(review): termination relies on mb_height <= 0x3FF - confirm.
        if this.mb_y < 0x3FF {
            this.mb_y += 1
        }
    }

    // Filter and swizzle the last row.
    if this.mb_height > 0 {
        prev_mby = this.mb_height - 1
        if prev_mby <= 0x3FF {
            if (this.filter_type == 1) and (this.filter_level > 0) {
                this.apply_simple_filter_row!(workbuf: args.workbuf, mby: prev_mby)
            } else if this.filter_level > 0 {
                this.apply_normal_filter_row!(workbuf: args.workbuf, mby: prev_mby)
            }
            swizzle_status = this.swizzle_mb_row!(dst: args.dst, workbuf: args.workbuf, mby: prev_mby, is_last: true)
        }
    }

    return swizzle_status
}
// decode_one_mb! decodes and reconstructs the macroblock at (this.mb_x,
// this.mb_y) into args.workbuf.
//
// Steps, in order:
//  1. Read the segment id, the skip flag and the luma and chroma modes.
//  2. Decode the residual coefficients, or clear the non-zero context when
//     the macroblock is flagged as skipped.
//  3. Predict luma (one 16x16 prediction, or sixteen 4x4 predictions) and
//     add the inverse transform of every block that has coefficients.
//  4. Predict both chroma planes and add their inverse transforms.
//  5. Record this macroblock's loop-filter parameters in the two-row ring
//     buffer at index ((mb_y & 1) * 0x400) + mb_x.
//
// Every workbuf offset is checked against args.workbuf.length() before use;
// a block whose offset falls outside workbuf is silently left untouched.
pri func decoder.decode_one_mb!(workbuf: slice base.u8) {
    var i : base.u32
    var v : base.u32[..= 1]
    var block_offset : base.u32[..= 384]
    var y_off : base.u64
    var uv_off : base.u64
    var dst : slice base.u8
    var mb_idx : base.u32
    var seg : base.u32

    // Cached stride and base offset locals. These avoid re-reading struct
    // fields after choosy (indirect) IDCT/prediction calls, since the Wuffs
    // compiler must assume struct fields may have changed.
    var ys : base.u32[..= 0x4000]
    var uvs : base.u32[..= 0x2000]
    var y_base : base.u64
    var uv_base : base.u64

    // mb_coeffs is guaranteed to be zero on entry: the struct is zero-initialized,
    // and idct_add/wht clear their coefficients after use.

    // Parse segment ID.
    // Two bits select one of four ids: the first bit picks the pair (0, 1)
    // or (2, 3), the second picks within the pair.
    if this.use_segment and this.update_segment_map {
        v = this.bool_read_bool!(prob: this.segment_prob[0])
        if v == 0 {
            v = this.bool_read_bool!(prob: this.segment_prob[1])
            if v == 0 {
                this.segment_id = 0
            } else {
                this.segment_id = 1
            }
        } else {
            v = this.bool_read_bool!(prob: this.segment_prob[2])
            if v == 0 {
                this.segment_id = 2
            } else {
                this.segment_id = 3
            }
        }
    } else {
        this.segment_id = 0
    }

    // Parse skip coefficient flag.
    if this.mb_no_skip_coeff {
        v = this.bool_read_bool!(prob: this.prob_skip_false)
        this.is_skip_coeff = (v <> 0)
    } else {
        this.is_skip_coeff = false
    }

    // Parse luma and chroma prediction modes.
    this.decode_luma_mode!()
    this.decode_chroma_mode!()

    // Decode coefficients.
    if not this.is_skip_coeff {
        this.decode_mb_coefficients!()
    } else {
        // Skip: clear non-zero context for this macroblock.
        this.clear_mb_nz_context!()
    }

    // Cache strides and base offsets as locals to avoid re-reading struct
    // fields after choosy calls (IDCT, prediction).
    ys = this.y_stride
    uvs = this.uv_stride
    y_base = ((this.mb_y as base.u64) * 16 * (ys as base.u64)) +
            ((this.mb_x as base.u64) * 16)

    // Apply prediction and reconstruction. For skip blocks (all-zero coefficients),
    // IDCT adds nothing to the prediction, so we skip both WHT and IDCT.
    if this.mb_luma_mode < 4 {
        this.predict_16x16!(workbuf: args.workbuf, mode: this.mb_luma_mode as base.u8[..= 3])

        if not this.is_skip_coeff {
            this.wht!(coeff_offset: 384)

            // Block i sits at row (i >> 2), column (i & 3) of the 4x4 grid
            // of luma blocks; its coefficients start at i * 16.
            i = 0
            while i < 16 {
                block_offset = i * 16
                y_off = (y_base ~mod+ (((i >> 2) as base.u64) * 4 * (ys as base.u64))) ~mod+ (((i & 3) as base.u64) * 4)
                if y_off < args.workbuf.length() {
                    dst = args.workbuf[y_off ..]
                    // Full transform when the block was marked as having
                    // coefficients; otherwise the cheaper DC-only add when
                    // just its first coefficient is non-zero.
                    if this.mb_y_ac_nz[i] >= 2 {
                        this.idct_add!(dst: dst, stride: ys, coeff_offset: block_offset)
                    } else if this.mb_coeffs[block_offset] <> 0 {
                        this.idct_dc_add!(dst: dst, stride: ys, coeff_offset: block_offset)
                    }
                }
                i += 1
            }
        }
    } else {
        // Compute the upper-right pixels for B_PRED rightmost column.
        // These are the same for all sub-block rows and come from the row
        // above the macroblock, matching Go's ybr[0][24..27] duplication.
        // If an offset check below fails, mb_upper_right keeps whatever it
        // held before.
        if this.mb_y > 0 {
            // Compute offset of row (mb_y*16-1) in the Y plane.
            // y_off = mb_y * 16 * y_stride - y_stride + mb_x * 16
            y_off = ((this.mb_y as base.u64) ~mod* 16) ~mod* (this.y_stride as base.u64)
            y_off = y_off ~mod- (this.y_stride as base.u64)
            y_off = y_off ~mod+ ((this.mb_x as base.u64) ~mod* 16)
            if (this.mb_x as base.u32) < (this.mb_width ~mod- 1) {
                // Upper-right from the row above, to the right of this MB.
                y_off = y_off ~mod+ 16
                if y_off < args.workbuf.length() {
                    dst = args.workbuf[y_off ..]
                    if dst.length() >= 4 {
                        // mb_idx is borrowed here as scratch for the four
                        // pixels, which are then split into separate bytes.
                        mb_idx = dst.peek_u32le()
                        this.mb_upper_right[0] = (mb_idx & 0xFF) as base.u8
                        this.mb_upper_right[1] = ((mb_idx >> 8) & 0xFF) as base.u8
                        this.mb_upper_right[2] = ((mb_idx >> 16) & 0xFF) as base.u8
                        this.mb_upper_right[3] = (mb_idx >> 24) as base.u8
                    }
                }
            } else {
                // Rightmost MB column: replicate last pixel of above row.
                y_off = y_off ~mod+ 15
                if y_off < args.workbuf.length() {
                    this.mb_upper_right[0] = args.workbuf[y_off]
                    this.mb_upper_right[1] = args.workbuf[y_off]
                    this.mb_upper_right[2] = args.workbuf[y_off]
                    this.mb_upper_right[3] = args.workbuf[y_off]
                }
            }
        } else {
            // mby=0: above border is 127.
            this.mb_upper_right[0] = 127
            this.mb_upper_right[1] = 127
            this.mb_upper_right[2] = 127
            this.mb_upper_right[3] = 127
        }

        // Predict and reconstruct one sub-block at a time: each block's
        // residual is added before the next block is predicted.
        i = 0
        while i < 16 {
            block_offset = i * 16
            this.predict_4x4!(workbuf: args.workbuf, block_idx: i as base.u32[..= 15], mode: this.sub_modes[i])
            if (not this.is_skip_coeff) and (this.mb_y_ac_nz[i] > 0) {
                y_off = (y_base ~mod+ (((i >> 2) as base.u64) * 4 * (ys as base.u64))) ~mod+ (((i & 3) as base.u64) * 4)
                if y_off < args.workbuf.length() {
                    dst = args.workbuf[y_off ..]
                    if this.mb_y_ac_nz[i] >= 2 {
                        this.idct_add!(dst: dst, stride: ys, coeff_offset: block_offset)
                    } else {
                        this.idct_dc_add!(dst: dst, stride: ys, coeff_offset: block_offset)
                    }
                }
            }
            i += 1
        }
    }

    // Chroma prediction and IDCT.
    // The second plane starts where the first ends: offsets
    // workbuf_offset_y_end and workbuf_offset_u_end respectively.
    this.predict_8x8!(workbuf: args.workbuf, mode: this.mb_chroma_mode, plane_offset: this.workbuf_offset_y_end)
    this.predict_8x8!(workbuf: args.workbuf, mode: this.mb_chroma_mode, plane_offset: this.workbuf_offset_u_end)

    if not this.is_skip_coeff {
        // U blocks (indices 0-3 in mb_uv_nz).
        // Block i sits at row (i >> 1), column (i & 1) of a 2x2 grid; its
        // coefficients start at (16 + i) * 16.
        uv_base = (this.workbuf_offset_y_end ~mod+
                ((this.mb_y as base.u64) * 8 * (uvs as base.u64))) ~mod+
                ((this.mb_x as base.u64) * 8)
        i = 0
        while i < 4 {
            block_offset = (16 + i) * 16
            if this.mb_uv_nz[i] > 0 {
                uv_off = (uv_base ~mod+ (((i >> 1) as base.u64) * 4 * (uvs as base.u64))) ~mod+ (((i & 1) as base.u64) * 4)
                if uv_off < args.workbuf.length() {
                    dst = args.workbuf[uv_off ..]
                    if this.mb_uv_nz[i] >= 2 {
                        this.idct_add!(dst: dst, stride: uvs, coeff_offset: block_offset)
                    } else {
                        this.idct_dc_add!(dst: dst, stride: uvs, coeff_offset: block_offset)
                    }
                }
            }
            i += 1
        }

        // V blocks (indices 4-7 in mb_uv_nz).
        // Same layout as U; coefficients start at (20 + i) * 16.
        uv_base = (this.workbuf_offset_u_end ~mod+
                ((this.mb_y as base.u64) * 8 * (uvs as base.u64))) ~mod+
                ((this.mb_x as base.u64) * 8)
        i = 0
        while i < 4 {
            block_offset = (20 + i) * 16
            if this.mb_uv_nz[i + 4] > 0 {
                uv_off = (uv_base ~mod+ (((i >> 1) as base.u64) * 4 * (uvs as base.u64))) ~mod+ (((i & 1) as base.u64) * 4)
                if uv_off < args.workbuf.length() {
                    dst = args.workbuf[uv_off ..]
                    if this.mb_uv_nz[i + 4] >= 2 {
                        this.idct_add!(dst: dst, stride: uvs, coeff_offset: block_offset)
                    } else {
                        this.idct_dc_add!(dst: dst, stride: uvs, coeff_offset: block_offset)
                    }
                }
            }
            i += 1
        }
    }

    // Store per-MB filter parameters from precomputed table.
    mb_idx = ((this.mb_y & 1) * 0x400) + this.mb_x
    if mb_idx < 0x800 {
        // Lookup precomputed filter strength by (segment, is_i4x4).
        // Table index = segment * 2, plus 1 for a B_PRED macroblock.
        seg = ((this.segment_id & 3) as base.u32) * 2
        if this.mb_luma_mode == 4 {
            seg += 1
        }
        if seg < 8 {
            this.mb_filter_level[mb_idx] = this.fstrength_level[seg]
            this.mb_filter_ilevel[mb_idx] = this.fstrength_ilevel[seg]
            this.mb_filter_hlevel[mb_idx] = this.fstrength_hlevel[seg]
        }
        // Store inner flag: filter sub-block edges if MB uses B_PRED (4x4)
        // or has non-zero coefficients (matches libwebp: f_inner = i4x4 | !skip).
        if (this.mb_luma_mode == 4) or (not this.is_skip_coeff) {
            this.mb_filter_inner[mb_idx] = 1
        }
    }
}
// decode_luma_mode! reads the macroblock's luma prediction mode into
// this.mb_luma_mode (0=DC, 1=V, 2=H, 3=TM, 4=B_PRED) and refreshes the mode
// context that later macroblocks read from.
//
// For B_PRED it also reads sixteen 4x4 sub-modes into this.sub_modes, each
// one coded with probabilities selected by the sub-modes above and to the
// left of it. In every case the bottom row and right column of sub-modes
// (or their 16x16 equivalent) are saved in this.above_modes and
// this.left_modes.
pri func decoder.decode_luma_mode!() {
    var v : base.u32[..= 1]
    var val : base.u32
    var mode : base.u32
    var i : base.u32
    var above_mode : base.u32
    var left_mode : base.u32
    var prob_idx : base.u32
    var above_idx : base.u32

    // Key frame luma mode tree (RFC 6386 section 11.2):
    //   bit=0 → B_PRED(4)
    //   bit=1 → prob[1]: bit=0 → (prob[2]: 0→DC, 1→V)
    //                     bit=1 → (prob[3]: 0→H, 1→TM)
    v = this.bool_read_bool!(prob: KF_Y_MODE_PROBS[0])
    if v == 0 {
        mode = 4
    } else {
        v = this.bool_read_bool!(prob: KF_Y_MODE_PROBS[1])
        if v == 0 {
            v = this.bool_read_bool!(prob: KF_Y_MODE_PROBS[2])
            if v == 0 {
                mode = 0
            } else {
                mode = 1
            }
        } else {
            v = this.bool_read_bool!(prob: KF_Y_MODE_PROBS[3])
            if v == 0 {
                mode = 2
            } else {
                mode = 3
            }
        }
    }

    this.mb_luma_mode = (mode & 0xFF) as base.u8

    if mode == 4 {
        // Sub-blocks are visited in raster order, so the neighbours above
        // (i - 4) and to the left (i - 1) have been decoded already when
        // they lie inside this macroblock.
        i = 0
        while i < 16 {
            assert i < 16 via "a < b: a < c; c <= b"(c: 16)
            // Above neighbour: the saved bottom row of the macroblock above
            // for the top row of sub-blocks, else the sub-block one row up.
            // If the bounds guard fails, above_mode keeps its previous value.
            if i < 4 {
                above_idx = (this.mb_x * 4) + (i & 3)
                if above_idx < 4096 {
                    above_mode = this.above_modes[above_idx] as base.u32
                }
            } else {
                above_mode = this.sub_modes[i - 4] as base.u32
            }
            // Left neighbour: the saved right column of the macroblock to
            // the left for the first column, else the previous sub-block.
            if (i & 3) == 0 {
                if (i >> 2) < 4 {
                    left_mode = this.left_modes[i >> 2] as base.u32
                }
            } else if i > 0 {
                left_mode = this.sub_modes[i - 1] as base.u32
            }

            // Force both contexts into 0 ..= 9. The mask changes nothing
            // after the clamp; presumably it is there to give the bounds
            // checker an explicit range.
            if above_mode > 9 {
                above_mode = 0
            }
            if left_mode > 9 {
                left_mode = 0
            }
            above_mode = above_mode & 0x0F
            left_mode = left_mode & 0x0F

            // Nine probabilities per (above, left) pair; the largest index
            // produced here is ((9 * 10) + 9) * 9 = 891.
            prob_idx = ((above_mode * 10) + left_mode) * 9
            val = this.decode_sub_block_mode!(prob_offset: prob_idx)
            this.sub_modes[i] = val as base.u8

            i += 1
        }

        // Save the bottom row for the macroblock below. The bound 4093
        // keeps above_idx + 3 inside the 4096 entries.
        above_idx = this.mb_x * 4
        if above_idx < 4093 {
            this.above_modes[above_idx + 0] = this.sub_modes[12]
            this.above_modes[above_idx + 1] = this.sub_modes[13]
            this.above_modes[above_idx + 2] = this.sub_modes[14]
            this.above_modes[above_idx + 3] = this.sub_modes[15]
        }

        // Save the right column for the macroblock to the right.
        this.left_modes[0] = this.sub_modes[3]
        this.left_modes[1] = this.sub_modes[7]
        this.left_modes[2] = this.sub_modes[11]
        this.left_modes[3] = this.sub_modes[15]
    } else {
        // 16x16 mode: update above_modes/left_modes with the equivalent
        // sub-block mode value for B_PRED context in neighboring MBs.
        // Map 16x16 mode (DC=0, V=1, H=2, TM=3) to context numbering
        // (DC=0, TM=1, VE=2, HE=3) matching Go/predProb table order.
        val = mode
        if mode == 1 {
            val = 2
        } else if mode == 2 {
            val = 3
        } else if mode == 3 {
            val = 1
        }

        above_idx = this.mb_x * 4
        if above_idx < 4093 {
            this.above_modes[above_idx + 0] = (val & 0xFF) as base.u8
            this.above_modes[above_idx + 1] = (val & 0xFF) as base.u8
            this.above_modes[above_idx + 2] = (val & 0xFF) as base.u8
            this.above_modes[above_idx + 3] = (val & 0xFF) as base.u8
        }

        this.left_modes[0] = (val & 0xFF) as base.u8
        this.left_modes[1] = (val & 0xFF) as base.u8
        this.left_modes[2] = (val & 0xFF) as base.u8
        this.left_modes[3] = (val & 0xFF) as base.u8
    }
}
// decode_sub_block_mode! reads one 4x4 sub-block prediction mode by walking
// a fixed binary tree, one this.bool_read_bool! call per tree node.
//
// args.prob_offset is the index in KF_B_MODE_PROBS of the first of nine
// consecutive probabilities; tree node k uses entry (prob_offset + k). An
// offset above 891 would push node 8 past index 899, so it is rejected up
// front: mode 0 is returned without reading any bits.
//
// Returns the mode number, in 0 ..= 9.
pri func decoder.decode_sub_block_mode!(prob_offset: base.u32) base.u32[..= 9] {
    var bit : base.u32[..= 1]
    var row : base.u32

    row = args.prob_offset
    if row > 891 {
        return 0
    }

    // Nodes 0, 1 and 2 each peel off one mode on a zero bit.
    bit = this.bool_read_bool!(prob: KF_B_MODE_PROBS[row])
    if bit == 0 {
        return 0
    }
    bit = this.bool_read_bool!(prob: KF_B_MODE_PROBS[row + 1])
    if bit == 0 {
        return 1
    }
    bit = this.bool_read_bool!(prob: KF_B_MODE_PROBS[row + 2])
    if bit == 0 {
        return 2
    }

    // Node 3 splits the remaining modes into {3, 5, 6} and {4, 7, 8, 9}.
    bit = this.bool_read_bool!(prob: KF_B_MODE_PROBS[row + 3])
    if bit <> 0 {
        // One side: nodes 6, 7 and 8 separate modes 4, 7, 8 and 9.
        bit = this.bool_read_bool!(prob: KF_B_MODE_PROBS[row + 6])
        if bit == 0 {
            return 4
        }
        bit = this.bool_read_bool!(prob: KF_B_MODE_PROBS[row + 7])
        if bit == 0 {
            return 7
        }
        bit = this.bool_read_bool!(prob: KF_B_MODE_PROBS[row + 8])
        if bit == 0 {
            return 8
        }
        return 9
    }

    // Other side: nodes 4 and 5 separate modes 3, 5 and 6.
    bit = this.bool_read_bool!(prob: KF_B_MODE_PROBS[row + 4])
    if bit == 0 {
        return 3
    }
    bit = this.bool_read_bool!(prob: KF_B_MODE_PROBS[row + 5])
    if bit == 0 {
        return 5
    }
    return 6
}
// decode_chroma_mode! reads the macroblock's chroma prediction mode into
// this.mb_chroma_mode. The code is a run of up to three bits, each read with
// the next KF_UV_MODE_PROBS entry: the mode is the number of one-bits seen
// before the first zero-bit, capped at 3.
pri func decoder.decode_chroma_mode!() {
    var bit : base.u32[..= 1]

    bit = this.bool_read_bool!(prob: KF_UV_MODE_PROBS[0])
    if bit <> 0 {
        bit = this.bool_read_bool!(prob: KF_UV_MODE_PROBS[1])
        if bit <> 0 {
            bit = this.bool_read_bool!(prob: KF_UV_MODE_PROBS[2])
            if bit <> 0 {
                this.mb_chroma_mode = 3
            } else {
                this.mb_chroma_mode = 2
            }
        } else {
            this.mb_chroma_mode = 1
        }
    } else {
        this.mb_chroma_mode = 0
    }
}

// clear_mb_nz_context! resets the non-zero-coefficient context of the
// current macroblock, for use when no coefficients are decoded for it.
//
// Each macroblock owns eight consecutive this.above_nz entries, starting at
// (this.mb_x * 8), and the eight this.left_nz entries: four for Y, then two
// for U, then two for V. All sixteen are zeroed.
pri func decoder.clear_mb_nz_context!() {
    var k : base.u32
    var slot : base.u32

    // Entries 0-3 are Y, 4-5 are U and 6-7 are V; they are contiguous, so a
    // single pass clears them all.
    k = 0
    while k < 8 {
        slot = (this.mb_x * 8) + k
        this.above_nz[slot] = 0
        this.left_nz[k] = 0
        k += 1
    }

    // Clear Y2 context only for Y16 mode. B_PRED (mode >= 4) has no Y2 block,
    // so its skip path must preserve the Y2 NZ state from the previous MB at
    // this position (matching libwebp's VP8DecodeMB which only clears nz_dc
    // when !is_i4x4).
    if this.mb_luma_mode < 4 {
        this.above_nz_y2[this.mb_x] = 0
        this.left_nz_y2 = 0
    }
}
+ raw_ctx = this.above_nz_y2[this.mb_x] as base.u32 + raw_ctx ~mod+= this.left_nz_y2 as base.u32 + if raw_ctx <= 2 { + ctx = raw_ctx as base.u32[..= 2] + } else { + ctx = 2 + } + nz = this.decode_block_coeffs!(block_offset: 384, block_type: 1, start_coeff: 0, init_ctx: ctx) + any_nz |= nz + this.above_nz_y2[this.mb_x] = nz as base.u8 + this.left_nz_y2 = nz as base.u8 + + // Y blocks 0-15 (skip DC, already in Y2). + block_idx = 0 + while block_idx < 16 { + above_idx = (this.mb_x * 8) + (block_idx & 3) + left_idx = block_idx >> 2 + raw_ctx = (this.above_nz[above_idx] as base.u32) ~mod+ + (this.left_nz[left_idx] as base.u32) + if raw_ctx <= 2 { + ctx = raw_ctx as base.u32[..= 2] + } else { + ctx = 2 + } + nz = this.decode_block_coeffs!(block_offset: block_idx * 16, block_type: 0, start_coeff: 1, init_ctx: ctx) + any_nz |= nz + // Store per-block NZ info: 0=zero, 2=has AC (never 1 for start_coeff=1). + if nz == 0 { + this.mb_y_ac_nz[block_idx] = 0 + } else { + this.mb_y_ac_nz[block_idx] = 2 + } + this.above_nz[above_idx] = nz as base.u8 + this.left_nz[left_idx] = nz as base.u8 + block_idx += 1 + } + } else { + // 4x4 mode: Y blocks 0-15 (full decode, no Y2). + block_idx = 0 + while block_idx < 16 { + above_idx = (this.mb_x * 8) + (block_idx & 3) + left_idx = block_idx >> 2 + raw_ctx = (this.above_nz[above_idx] as base.u32) ~mod+ + (this.left_nz[left_idx] as base.u32) + if raw_ctx <= 2 { + ctx = raw_ctx as base.u32[..= 2] + } else { + ctx = 2 + } + nz = this.decode_block_coeffs!(block_offset: block_idx * 16, block_type: 3, start_coeff: 0, init_ctx: ctx) + any_nz |= nz + // Branchless NZ: 0=zero, 1=DC only, 2=has AC. + this.mb_y_ac_nz[block_idx] = (nz + (nz & (this.block_ac_nz & 1))) as base.u8 + this.above_nz[above_idx] = nz as base.u8 + this.left_nz[left_idx] = nz as base.u8 + block_idx += 1 + } + } + + // U blocks 16-19 and V blocks 20-23 (merged: same block_type/start_coeff). 
+ uv_idx = 0 + while uv_idx < 8 { + block_idx = 16 + uv_idx + above_idx = (this.mb_x * 8) + 4 + ((uv_idx >> 2) * 2) + (uv_idx & 1) + left_idx = 4 + ((uv_idx >> 2) * 2) + ((uv_idx >> 1) & 1) + raw_ctx = (this.above_nz[above_idx] as base.u32) ~mod+ + (this.left_nz[left_idx] as base.u32) + if raw_ctx <= 2 { + ctx = raw_ctx as base.u32[..= 2] + } else { + ctx = 2 + } + nz = this.decode_block_coeffs!(block_offset: block_idx * 16, block_type: 2, start_coeff: 0, init_ctx: ctx) + any_nz |= nz + // Branchless NZ: 0=zero, 1=DC only, 2=has AC. + this.mb_uv_nz[uv_idx] = (nz + (nz & (this.block_ac_nz & 1))) as base.u8 + this.above_nz[above_idx] = nz as base.u8 + this.left_nz[left_idx] = nz as base.u8 + uv_idx += 1 + } + + // Update is_skip_coeff: if all coefficients decoded to zero, treat as + // skipped for filter purposes (matches libwebp: skip = ParseResiduals()). + if any_nz == 0 { + this.is_skip_coeff = true + } +} + +// decode_coeff_category! decodes a large coefficient value (5+) from the +// category tables (CAT1-CAT6). Separated from decode_block_coeffs so the +// compiler keeps the cold category path out of the hot loop's register +// pressure, matching libwebp's GetCoeffsFast/GetLargeValue split. +// Uses a loop for extra bits to minimize code size (30 inlined p1_read_bool +// calls → 5 tree reads + 1 looped read). +pri func decoder.decode_coeff_category!(prob_idx: base.u32[..= 1045]) base.u32 { + var v : base.u32[..= 1] + var cat : base.u32[..= 5] + var extra_val : base.u32 + var i : base.u32 + var n_extra : base.u32[..= 11] + var cat_off : base.u32[..= 15] + var cat_end : base.u32 + + // Navigate the category tree using prob[6-10]. 
+ v = this.p1_read_bool!(prob: this.coeff_probs[args.prob_idx + 6]) + if v == 0 { + v = this.p1_read_bool!(prob: this.coeff_probs[args.prob_idx + 7]) + if v == 0 { + cat = 0 // CAT1 + } else { + cat = 1 // CAT2 + } + } else { + v = this.p1_read_bool!(prob: this.coeff_probs[args.prob_idx + 8]) + if v == 0 { + v = this.p1_read_bool!(prob: this.coeff_probs[args.prob_idx + 9]) + if v == 0 { + cat = 2 // CAT3 + } else { + cat = 3 // CAT4 + } + } else { + v = this.p1_read_bool!(prob: this.coeff_probs[args.prob_idx + 10]) + if v == 0 { + cat = 4 // CAT5 + } else { + cat = 5 // CAT6 + } + } + } + + // Read extra bits in a loop. Loop from cat_off to cat_end, with + // explicit i < 26 guard so the checker can prove array bounds. + cat_off = CAT_PROBS_OFFSET[cat] as base.u32[..= 15] + n_extra = CAT_EXTRA_BITS[cat] as base.u32[..= 11] + cat_end = cat_off + n_extra + extra_val = 0 + i = cat_off + while (i < cat_end) and (i < 26) { + v = this.p1_read_bool!(prob: CAT_PROBS[i]) + extra_val = (extra_val ~mod<< 1) | (v as base.u32) + i += 1 + } + return (CAT_BASE_VALUE[cat] as base.u32) ~mod+ extra_val +} + +// decode_block_coeffs! decodes one 4x4 block of DCT coefficients from +// partition 1. Bool decoder state (p1_range/p1_value/p1_bits) is copied to +// local variables to keep them in CPU registers, avoiding the store-reload +// overhead the C compiler emits between each bool read and array access +// (it cannot prove struct fields don't alias with coeff_probs/mb_coeffs). +// +// The 7 most frequent reads are inline with local variables: ZERO, ONE-vs-more, +// prob[3-5] for values 2-4, sign, and EOB. Only category values (5+) sync to +// struct and call decode_coeff_category. +// Byte loading at the top of each coefficient ensures >= 49 bits available. +// A second reload before prob[3-5] ensures enough bits for values >= 2. 
+//
+// Arguments (ranges are enforced by the refinement types in the signature):
+//  - block_offset: index in this.mb_coeffs of the block's first coefficient.
+//  - block_type: selects the coeff_probs bank (block_type * 264) and the
+//    dequantizer pair (1 = Y2, 2 = UV, anything else = Y).
+//  - start_coeff: index of the first coefficient to decode.
+//  - init_ctx: context used for the first probability lookup.
+//
+// Returns has_nz. On return this.block_ac_nz holds has_ac (0 on the
+// immediate end-of-block path) and the p1_range/p1_value/p1_bits/p1_ri
+// fields hold the updated bool decoder state.
+pri func decoder.decode_block_coeffs!(block_offset: base.u32[..= 384], block_type: base.u32[..= 3], start_coeff: base.u32[..= 1], init_ctx: base.u32[..= 2]) base.u32[..= 1] {
+    var coeff_idx : base.u32
+    var ctx : base.u32[..= 2]
+    var prob_idx : base.u32[..= 1045]
+    var bt_base : base.u32[..= 792]
+    var v : base.u32[..= 1]
+    var abs_val : base.u32
+    var sign : base.u32[..= 1]
+    var zi : base.u32[..= 15]
+    var dq : base.u32
+    var seg : base.u32[..= 3]
+    var ci : base.u32
+    var has_nz : base.u32[..= 1]
+    var has_ac : base.u32
+    var dq_dc : base.u32
+    var dq_ac : base.u32
+    var lr : base.u32
+    var lv : base.u64
+    var lb : base.u32
+    var s : base.u32
+    var pos : base.u32
+    var bval : base.u32
+    var nshift : base.u32[..= 7]
+    var bb : base.u64
+    var lri : base.u32[..= 0x1000]
+    var lwi : base.u32[..= 0x1000]
+    var lr_taken : base.u32
+    var neg_mask : base.u32
+
+    seg = this.segment_id as base.u32
+    has_nz = 0
+    has_ac = 0
+
+    bt_base = args.block_type * 264
+
+    if args.block_type == 1 {
+        dq_dc = this.dequant_y2_dc[seg]
+        dq_ac = this.dequant_y2_ac[seg]
+    } else if args.block_type == 2 {
+        dq_dc = this.dequant_uv_dc[seg]
+        dq_ac = this.dequant_uv_ac[seg]
+    } else {
+        dq_dc = this.dequant_y_dc[seg]
+        dq_ac = this.dequant_y_ac[seg]
+    }
+
+    // Local copies of the partition-1 bool decoder state: range, value, bit
+    // count, and the read/write indexes into this.p1_buffer.
+    lr = this.p1_range
+    lv = this.p1_value
+    lb = this.p1_bits
+    lri = this.p1_ri
+    lwi = this.p1_wi
+
+    coeff_idx = args.start_coeff
+    ctx = args.init_ctx
+    prob_idx = bt_base + (COEFF_BAND_OFFSET[coeff_idx] as base.u32) + (ctx * 11)
+
+    // Byte loading for initial EOB check. 4-byte bulk fast path
+    // eliminates the multi-iteration while loop (saves ~8 branches per refill).
+    if lb < 16 {
+        if ((lri ~mod+ 4) <= lwi) and (lri < 0xFFD) {
+            lv = (lv ~mod<< 32) |
+                    ((this.p1_buffer[lri + 0] as base.u64) ~mod<< 24) |
+                    ((this.p1_buffer[lri + 1] as base.u64) ~mod<< 16) |
+                    ((this.p1_buffer[lri + 2] as base.u64) ~mod<< 8) |
+                    (this.p1_buffer[lri + 3] as base.u64)
+            lri += 4
+            lb ~mod+= 32
+        } else {
+            while (lb <= 48) and (lri < lwi) {
+                assert lri < 0x1000 via "a < b: a < c; c <= b"(c: lwi)
+                bb = this.p1_buffer[lri] as base.u64
+                lri += 1
+                lv = (lv ~mod<< 8) | bb
+                lb ~mod+= 8
+            }
+        }
+    }
+
+    // Initial EOB check (prob[0]) using local variables.
+    s = (lr ~mod* (this.coeff_probs[prob_idx] as base.u32)) >> 8
+    pos = (lb ~mod- 8) & 63
+    bval = ((lv >> pos) & 0xFFFF_FFFF) as base.u32
+    if bval > s {
+        lv ~mod-= ((s ~mod+ 1) as base.u64) ~mod<< pos
+        lr = ((lr ~mod- s) ~mod- 1) & 0xFF
+    } else {
+        // End of block before any coefficient: renormalize, write the state
+        // back and report an all-zero block.
+        lr = s
+        nshift = RENORM_SHIFT_256[lr & 0xFF] as base.u32
+        lr = RENORM_RANGE_256[lr & 0xFF] as base.u32
+        lb ~mod-= nshift
+        this.p1_range = lr & 0xFF
+        this.p1_value = lv
+        this.p1_bits = lb
+        this.p1_ri = lri
+        this.block_ac_nz = 0
+        return 0
+    }
+    nshift = RENORM_SHIFT_256[lr & 0xFF] as base.u32
+    lr = RENORM_RANGE_256[lr & 0xFF] as base.u32
+    lb ~mod-= nshift
+
+    while coeff_idx < 16 {
+        // Load bytes. Threshold 28 ensures lb >= 28 at prob[1], enough for
+        // the worst case abs_val=1 path: prob[1](7)+prob[2](7)+sign(1)+EOB(7)
+        // = 22 bits, with lb >= 8 at every read point (28-7-7=14, 14-7=7+1=8).
+        // 4-byte bulk fast path: 1 branch replaces ~9 loop branches.
+        if lb < 28 {
+            if ((lri ~mod+ 4) <= lwi) and (lri < 0xFFD) {
+                lv = (lv ~mod<< 32) |
+                        ((this.p1_buffer[lri + 0] as base.u64) ~mod<< 24) |
+                        ((this.p1_buffer[lri + 1] as base.u64) ~mod<< 16) |
+                        ((this.p1_buffer[lri + 2] as base.u64) ~mod<< 8) |
+                        (this.p1_buffer[lri + 3] as base.u64)
+                lri += 4
+                lb ~mod+= 32
+            } else {
+                while (lb <= 48) and (lri < lwi),
+                        inv coeff_idx < 16,
+                {
+                    assert lri < 0x1000 via "a < b: a < c; c <= b"(c: lwi)
+                    bb = this.p1_buffer[lri] as base.u64
+                    lri += 1
+                    lv = (lv ~mod<< 8) | bb
+                    lb ~mod+= 8
+                }
+            }
+        }
+
+        // Inline: ZERO/non-zero check (prob[1]).
+        s = (lr ~mod* (this.coeff_probs[prob_idx + 1] as base.u32)) >> 8
+        pos = (lb ~mod- 8) & 63
+        bval = ((lv >> pos) & 0xFFFF_FFFF) as base.u32
+        if bval > s {
+            v = 1
+            lv ~mod-= ((s ~mod+ 1) as base.u64) ~mod<< pos
+            lr = ((lr ~mod- s) ~mod- 1) & 0xFF
+        } else {
+            v = 0
+            lr = s
+        }
+        nshift = RENORM_SHIFT_256[lr & 0xFF] as base.u32
+        lr = RENORM_RANGE_256[lr & 0xFF] as base.u32
+        lb ~mod-= nshift
+
+        if v == 0 {
+            // Zero coefficient: move on using context 0 of the next band (no
+            // ctx * 11 term) and skip the EOB check.
+            coeff_idx += 1
+            if coeff_idx >= 16 {
+                break
+            }
+            prob_idx = bt_base + (COEFF_BAND_OFFSET[coeff_idx] as base.u32)
+            continue
+        }
+
+        // Inline: prob[2] — one vs more.
+        s = (lr ~mod* (this.coeff_probs[prob_idx + 2] as base.u32)) >> 8
+        pos = (lb ~mod- 8) & 63
+        bval = ((lv >> pos) & 0xFFFF_FFFF) as base.u32
+        if bval > s {
+            v = 1
+            lv ~mod-= ((s ~mod+ 1) as base.u64) ~mod<< pos
+            lr = ((lr ~mod- s) ~mod- 1) & 0xFF
+        } else {
+            v = 0
+            lr = s
+        }
+        nshift = RENORM_SHIFT_256[lr & 0xFF] as base.u32
+        lr = RENORM_RANGE_256[lr & 0xFF] as base.u32
+        lb ~mod-= nshift
+
+        if v == 0 {
+            abs_val = 1
+        } else {
+            // Reload bytes before prob[3-5] inline reads: after prob[1]+prob[2]
+            // consumed up to 14 bits, lb could be as low as 14. We need enough
+            // bits for prob[3]+prob[4]+prob[5]+sign+EOB (5 reads, max 35 bits).
+            // 3-byte bulk (not 4): lb < 40 means lb can be up to 39, and
+            // 39 + 32 = 71 > 64 would overflow, but 39 + 24 = 63 fits.
+            if lb < 40 {
+                if ((lri ~mod+ 3) <= lwi) and (lri < 0xFFD) {
+                    lv = (lv ~mod<< 24) |
+                            ((this.p1_buffer[lri + 0] as base.u64) ~mod<< 16) |
+                            ((this.p1_buffer[lri + 1] as base.u64) ~mod<< 8) |
+                            (this.p1_buffer[lri + 2] as base.u64)
+                    lri += 3
+                    lb ~mod+= 24
+                } else {
+                    while (lb <= 48) and (lri < lwi),
+                            inv coeff_idx < 16,
+                    {
+                        assert lri < 0x1000 via "a < b: a < c; c <= b"(c: lwi)
+                        bb = this.p1_buffer[lri] as base.u64
+                        lri += 1
+                        lv = (lv ~mod<< 8) | bb
+                        lb ~mod+= 8
+                    }
+                }
+            }
+
+            // Inline: prob[3] — values 2-4 vs categories 5+.
+            s = (lr ~mod* (this.coeff_probs[prob_idx + 3] as base.u32)) >> 8
+            pos = (lb ~mod- 8) & 63
+            bval = ((lv >> pos) & 0xFFFF_FFFF) as base.u32
+            if bval > s {
+                v = 1
+                lv ~mod-= ((s ~mod+ 1) as base.u64) ~mod<< pos
+                lr = ((lr ~mod- s) ~mod- 1) & 0xFF
+            } else {
+                v = 0
+                lr = s
+            }
+            nshift = RENORM_SHIFT_256[lr & 0xFF] as base.u32
+            lr = RENORM_RANGE_256[lr & 0xFF] as base.u32
+            lb ~mod-= nshift
+
+            if v == 0 {
+                // Inline: prob[4] — value 2 vs 3-4.
+                s = (lr ~mod* (this.coeff_probs[prob_idx + 4] as base.u32)) >> 8
+                pos = (lb ~mod- 8) & 63
+                bval = ((lv >> pos) & 0xFFFF_FFFF) as base.u32
+                if bval > s {
+                    v = 1
+                    lv ~mod-= ((s ~mod+ 1) as base.u64) ~mod<< pos
+                    lr = ((lr ~mod- s) ~mod- 1) & 0xFF
+                } else {
+                    v = 0
+                    lr = s
+                }
+                nshift = RENORM_SHIFT_256[lr & 0xFF] as base.u32
+                lr = RENORM_RANGE_256[lr & 0xFF] as base.u32
+                lb ~mod-= nshift
+
+                if v == 0 {
+                    abs_val = 2
+                } else {
+                    // Inline: prob[5] — value 3 vs 4.
+                    s = (lr ~mod* (this.coeff_probs[prob_idx + 5] as base.u32)) >> 8
+                    pos = (lb ~mod- 8) & 63
+                    bval = ((lv >> pos) & 0xFFFF_FFFF) as base.u32
+                    if bval > s {
+                        lv ~mod-= ((s ~mod+ 1) as base.u64) ~mod<< pos
+                        lr = ((lr ~mod- s) ~mod- 1) & 0xFF
+                        abs_val = 4
+                    } else {
+                        lr = s
+                        abs_val = 3
+                    }
+                    nshift = RENORM_SHIFT_256[lr & 0xFF] as base.u32
+                    lr = RENORM_RANGE_256[lr & 0xFF] as base.u32
+                    lb ~mod-= nshift
+                }
+            } else {
+                // Categories 5+: sync to struct, call helper, reload.
+                this.p1_range = lr & 0xFF
+                this.p1_value = lv
+                this.p1_bits = lb
+                this.p1_ri = lri
+                abs_val = this.decode_coeff_category!(prob_idx: prob_idx)
+                lr = this.p1_range
+                lv = this.p1_value
+                lb = this.p1_bits
+                lri = this.p1_ri
+            }
+        }
+
+        // Inline: sign bit (prob=128, branchless).
+        // The sign bit is 50/50, so the branch is maximally unpredictable.
+        // Branchless avoids ~50% mispredict penalty (matching libwebp's VP8GetSigned).
+        s = lr >> 1
+        pos = (lb ~mod- 8) & 63
+        bval = ((lv >> pos) & 0xFFFF_FFFF) as base.u32
+        // sign = 1 if bval > s, 0 otherwise (underflow sets bit 31).
+        sign = (s ~mod- bval) >> 31
+        // NOTE(review): the next line is damaged in this copy of the patch.
+        // Everything between "(s+1)<" and "0 {" is missing (presumably the
+        // rest of this comment, the lv/lr/lb updates for the sign read, the
+        // has_nz assignment and the start of an "if coeff_idx > 0 {"
+        // statement). Restore it from the original file; it is deliberately
+        // not reconstructed here.
+        // Conditional value update: subtract (s+1)< 0 {
+            has_ac = 1
+        }
+        ctx = 1
+        if abs_val > 1 {
+            ctx = 2
+        }
+
+        // Dequantize and store.
+        zi = ZIGZAG[coeff_idx] as base.u32[..= 15]
+        if coeff_idx == 0 {
+            dq = dq_dc
+        } else {
+            dq = dq_ac
+        }
+        abs_val = abs_val ~mod* dq
+        ci = args.block_offset + (zi as base.u32)
+        // Branchless sign application: (x ^ mask) - mask negates when mask=0xFFFFFFFF.
+        neg_mask = 0 ~mod- sign
+        this.mb_coeffs[ci] = (abs_val ^ neg_mask) ~mod- neg_mask
+
+        coeff_idx += 1
+        if coeff_idx >= 16 {
+            break
+        }
+
+        // Inline: EOB check (prob[0]).
+        prob_idx = bt_base + (COEFF_BAND_OFFSET[coeff_idx] as base.u32) + (ctx * 11)
+        s = (lr ~mod* (this.coeff_probs[prob_idx] as base.u32)) >> 8
+        pos = (lb ~mod- 8) & 63
+        bval = ((lv >> pos) & 0xFFFF_FFFF) as base.u32
+        if bval > s {
+            v = 1
+            lv ~mod-= ((s ~mod+ 1) as base.u64) ~mod<< pos
+            lr = ((lr ~mod- s) ~mod- 1) & 0xFF
+        } else {
+            v = 0
+            lr = s
+        }
+        nshift = RENORM_SHIFT_256[lr & 0xFF] as base.u32
+        lr = RENORM_RANGE_256[lr & 0xFF] as base.u32
+        lb ~mod-= nshift
+        if v == 0 {
+            break
+        }
+    }
+
+    // Store bool state back to struct.
+    this.p1_range = lr & 0xFF
+    this.p1_value = lv
+    this.p1_bits = lb
+    this.p1_ri = lri
+
+    this.block_ac_nz = has_ac
+    return has_nz
+}
diff --git a/std/vp8/decode_predict.wuffs b/std/vp8/decode_predict.wuffs
new file mode 100644
index 000000000..d5c9512b8
--- /dev/null
+++ b/std/vp8/decode_predict.wuffs
@@ -0,0 +1,705 @@
+// Copyright 2024 The Wuffs Authors.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+//
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+// VP8 intra prediction modes.
+//
+// 16x16 luma modes: DC (0), V (1), H (2), TM (3)
+// 4x4 sub-block modes: DC (0), TM (1), VE (2), HE (3), LD (4),
+//                      RD (5), VR (6), VL (7), HD (8), HU (9)
+// 8x8 chroma modes: DC (0), V (1), H (2), TM (3)
+//
+// All prediction functions write the predicted pixel values directly into
+// the workbuf at the correct plane position. The IDCT residuals are then
+// added on top.
+
+// predict_16x16! fills a 16x16 luma block in the workbuf with the predicted
+// values based on the mode. The block is at (mb_x*16, mb_y*16) in the Y plane.
+//
+// Out-of-frame neighbors follow the VP8 border convention: the row above the
+// frame reads as 127, the column left of the frame reads as 129, and the
+// top-left corner reads as 127 on the top row and as 129 on the left edge of
+// later rows. DC prediction averages only the neighbors that exist and falls
+// back to 128 when there are none. Every workbuf access is bounds-checked;
+// out-of-range reads and writes are skipped.
+pri func decoder.predict_16x16!(workbuf: slice base.u8, mode: base.u8[..= 3]),
+    choosy,
+{
+    var y_off : base.u64
+    var r : base.u32
+    var c : base.u32
+    var idx : base.u64
+    var sum : base.u32
+    var count : base.u32
+    var dc : base.u8
+    var tl : base.u8
+    var p : base.u32
+
+    y_off = (this.mb_y as base.u64) * 16 * (this.y_stride as base.u64)
+    y_off ~mod+= ((this.mb_x as base.u64) * 16)
+
+    if args.mode == 0 {
+        // DC prediction: average of above and left pixels.
+        sum = 0
+        count = 0
+        if (this.mb_y > 0) and (y_off >= (this.y_stride as base.u64)) {
+            c = 0
+            while c < 16 {
+                idx = (y_off ~mod- (this.y_stride as base.u64)) ~mod+ (c as base.u64)
+                if idx < args.workbuf.length() {
+                    sum ~mod+= args.workbuf[idx] as base.u32
+                }
+                c += 1
+                count ~mod+= 1
+            }
+        }
+        if this.mb_x > 0 {
+            r = 0
+            while r < 16 {
+                idx = y_off ~mod+ ((r as base.u64) * (this.y_stride as base.u64))
+                if idx > 0 {
+                    idx -= 1
+                    if idx < args.workbuf.length() {
+                        sum ~mod+= args.workbuf[idx] as base.u32
+                    }
+                }
+                r += 1
+                count ~mod+= 1
+            }
+        }
+        // count is 0, 16 or 32; adding count >> 1 rounds to nearest.
+        if count > 0 {
+            dc = (((sum ~mod+ (count >> 1)) / count) & 0xFF) as base.u8
+        } else {
+            dc = 128
+        }
+
+        r = 0
+        while r < 16 {
+            c = 0
+            while c < 16,
+                    inv r < 16,
+            {
+                idx = y_off ~mod+ ((r as base.u64) * (this.y_stride as base.u64))
+                idx ~mod+= (c as base.u64)
+                if idx < args.workbuf.length() {
+                    args.workbuf[idx] = dc
+                }
+                c += 1
+            }
+            r += 1
+        }
+
+    } else if args.mode == 1 {
+        // V prediction: replicate the above row.
+        // At top edge (mb_y=0), above is filled with 127 per VP8 spec.
+        r = 0
+        while r < 16 {
+            c = 0
+            while c < 16,
+                    inv r < 16,
+            {
+                dc = 127
+                if (this.mb_y > 0) and (y_off >= (this.y_stride as base.u64)) {
+                    idx = (y_off ~mod- (this.y_stride as base.u64)) ~mod+ (c as base.u64)
+                    if idx < args.workbuf.length() {
+                        dc = args.workbuf[idx]
+                    }
+                }
+                idx = y_off ~mod+ ((r as base.u64) * (this.y_stride as base.u64))
+                idx ~mod+= (c as base.u64)
+                if idx < args.workbuf.length() {
+                    args.workbuf[idx] = dc
+                }
+                c += 1
+            }
+            r += 1
+        }
+
+    } else if args.mode == 2 {
+        // H prediction: replicate the left column.
+        // At left edge (mb_x=0), left is filled with 129 per VP8 spec.
+        r = 0
+        while r < 16 {
+            dc = 129
+            if this.mb_x > 0 {
+                idx = y_off ~mod+ ((r as base.u64) * (this.y_stride as base.u64))
+                if idx > 0 {
+                    idx -= 1
+                    if idx < args.workbuf.length() {
+                        dc = args.workbuf[idx]
+                    }
+                }
+            }
+            c = 0
+            while c < 16,
+                    inv r < 16,
+            {
+                idx = y_off ~mod+ ((r as base.u64) * (this.y_stride as base.u64))
+                idx ~mod+= (c as base.u64)
+                if idx < args.workbuf.length() {
+                    args.workbuf[idx] = dc
+                }
+                c += 1
+            }
+            r += 1
+        }
+
+    } else {
+        // TM prediction: pred[r][c] = clamp(above[c] + left[r] - top_left)
+        tl = 127
+        if (this.mb_x > 0) and (this.mb_y > 0) and (y_off > (this.y_stride as base.u64)) {
+            idx = (y_off - (this.y_stride as base.u64)) - 1
+            if idx < args.workbuf.length() {
+                tl = args.workbuf[idx]
+            }
+        } else if (this.mb_x == 0) and (this.mb_y > 0) {
+            tl = 129
+        }
+
+        r = 0
+        while r < 16 {
+            c = 0
+            while c < 16,
+                    inv r < 16,
+            {
+                p = 127
+                if (this.mb_y > 0) and (y_off >= (this.y_stride as base.u64)) {
+                    idx = (y_off ~mod- (this.y_stride as base.u64)) ~mod+ (c as base.u64)
+                    if idx < args.workbuf.length() {
+                        p = args.workbuf[idx] as base.u32
+                    }
+                }
+                if this.mb_x > 0 {
+                    idx = y_off ~mod+ ((r as base.u64) * (this.y_stride as base.u64))
+                    if idx > 0 {
+                        idx -= 1
+                        if idx < args.workbuf.length() {
+                            p = (p ~mod+ (args.workbuf[idx] as base.u32)) ~mod- (tl as base.u32)
+                        }
+                    }
+                } else {
+                    // Left frame edge: the left border column is 129. On
+                    // later rows tl is also 129, so this leaves p == above;
+                    // at the frame origin (tl == 127) it gives
+                    // 127 + 129 - 127 = 129, as the border convention
+                    // requires.
+                    p = (p ~mod+ 129) ~mod- (tl as base.u32)
+                }
+                // Clamp to [0, 255].
+                if p > 255 {
+                    if (p & 0x8000_0000) <> 0 {
+                        p = 0
+                    } else {
+                        p = 255
+                    }
+                }
+
+                idx = y_off ~mod+ ((r as base.u64) * (this.y_stride as base.u64))
+                idx ~mod+= (c as base.u64)
+                if idx < args.workbuf.length() {
+                    args.workbuf[idx] = (p & 0xFF) as base.u8
+                }
+                c += 1
+            }
+            r += 1
+        }
+    }
+}
+
+// predict_8x8! fills an 8x8 chroma block (U or V) in the workbuf.
+// plane_offset is the start of the U or V plane in workbuf.
+//
+// The block is at (mb_x*8, mb_y*8) in that plane, which uses
+// this.uv_stride. mode is DC (0), V (1), H (2) or TM (3).
+//
+// Out-of-frame neighbors follow the VP8 border convention: the row above the
+// frame reads as 127, the column left of the frame reads as 129, and the
+// top-left corner reads as 127 on the top row and as 129 on the left edge of
+// later rows. DC prediction averages only the neighbors that exist and falls
+// back to 128 when there are none. Every workbuf access is bounds-checked;
+// out-of-range reads and writes are skipped.
+pri func decoder.predict_8x8!(workbuf: slice base.u8, mode: base.u8[..= 3], plane_offset: base.u64),
+    choosy,
+{
+    var uv_off : base.u64
+    var r : base.u32
+    var c : base.u32
+    var idx : base.u64
+    var sum : base.u32
+    var count : base.u32
+    var dc : base.u8
+    var tl : base.u8
+    var p : base.u32
+
+    uv_off = args.plane_offset ~mod+ ((this.mb_y as base.u64) * 8 * (this.uv_stride as base.u64))
+    uv_off ~mod+= ((this.mb_x as base.u64) * 8)
+
+    if args.mode == 0 {
+        // DC prediction.
+        sum = 0
+        count = 0
+        if (this.mb_y > 0) and (uv_off >= (this.uv_stride as base.u64)) {
+            c = 0
+            while c < 8 {
+                idx = (uv_off ~mod- (this.uv_stride as base.u64)) ~mod+ (c as base.u64)
+                if idx < args.workbuf.length() {
+                    sum ~mod+= args.workbuf[idx] as base.u32
+                }
+                c += 1
+                count ~mod+= 1
+            }
+        }
+        if this.mb_x > 0 {
+            r = 0
+            while r < 8 {
+                idx = uv_off ~mod+ ((r as base.u64) * (this.uv_stride as base.u64))
+                if idx > 0 {
+                    idx -= 1
+                    if idx < args.workbuf.length() {
+                        sum ~mod+= args.workbuf[idx] as base.u32
+                    }
+                }
+                r += 1
+                count ~mod+= 1
+            }
+        }
+        // count is 0, 8 or 16; adding count >> 1 rounds to nearest.
+        if count > 0 {
+            dc = (((sum ~mod+ (count >> 1)) / count) & 0xFF) as base.u8
+        } else {
+            dc = 128
+        }
+
+        r = 0
+        while r < 8 {
+            c = 0
+            while c < 8,
+                    inv r < 8,
+            {
+                idx = uv_off ~mod+ ((r as base.u64) * (this.uv_stride as base.u64))
+                idx ~mod+= (c as base.u64)
+                if idx < args.workbuf.length() {
+                    args.workbuf[idx] = dc
+                }
+                c += 1
+            }
+            r += 1
+        }
+
+    } else if args.mode == 1 {
+        // V prediction.
+        // At top edge (mb_y=0), above is filled with 127.
+        r = 0
+        while r < 8 {
+            c = 0
+            while c < 8,
+                    inv r < 8,
+            {
+                dc = 127
+                if (this.mb_y > 0) and (uv_off >= (this.uv_stride as base.u64)) {
+                    idx = (uv_off ~mod- (this.uv_stride as base.u64)) ~mod+ (c as base.u64)
+                    if idx < args.workbuf.length() {
+                        dc = args.workbuf[idx]
+                    }
+                }
+                idx = uv_off ~mod+ ((r as base.u64) * (this.uv_stride as base.u64))
+                idx ~mod+= (c as base.u64)
+                if idx < args.workbuf.length() {
+                    args.workbuf[idx] = dc
+                }
+                c += 1
+            }
+            r += 1
+        }
+
+    } else if args.mode == 2 {
+        // H prediction.
+        // At left edge (mb_x=0), left is filled with 129.
+        r = 0
+        while r < 8 {
+            dc = 129
+            if this.mb_x > 0 {
+                idx = uv_off ~mod+ ((r as base.u64) * (this.uv_stride as base.u64))
+                if idx > 0 {
+                    idx -= 1
+                    if idx < args.workbuf.length() {
+                        dc = args.workbuf[idx]
+                    }
+                }
+            }
+            c = 0
+            while c < 8,
+                    inv r < 8,
+            {
+                idx = uv_off ~mod+ ((r as base.u64) * (this.uv_stride as base.u64))
+                idx ~mod+= (c as base.u64)
+                if idx < args.workbuf.length() {
+                    args.workbuf[idx] = dc
+                }
+                c += 1
+            }
+            r += 1
+        }
+
+    } else {
+        // TM prediction: pred[r][c] = clamp(above[c] + left[r] - top_left)
+        tl = 127
+        if (this.mb_x > 0) and (this.mb_y > 0) and (uv_off > (this.uv_stride as base.u64)) {
+            idx = (uv_off - (this.uv_stride as base.u64)) - 1
+            if idx < args.workbuf.length() {
+                tl = args.workbuf[idx]
+            }
+        } else if (this.mb_x == 0) and (this.mb_y > 0) {
+            tl = 129
+        }
+
+        r = 0
+        while r < 8 {
+            c = 0
+            while c < 8,
+                    inv r < 8,
+            {
+                p = 127
+                if (this.mb_y > 0) and (uv_off >= (this.uv_stride as base.u64)) {
+                    idx = (uv_off ~mod- (this.uv_stride as base.u64)) ~mod+ (c as base.u64)
+                    if idx < args.workbuf.length() {
+                        p = args.workbuf[idx] as base.u32
+                    }
+                }
+                if this.mb_x > 0 {
+                    idx = uv_off ~mod+ ((r as base.u64) * (this.uv_stride as base.u64))
+                    if idx > 0 {
+                        idx -= 1
+                        if idx < args.workbuf.length() {
+                            p = (p ~mod+ (args.workbuf[idx] as base.u32)) ~mod- (tl as base.u32)
+                        }
+                    }
+                } else {
+                    // Left frame edge: the left border column is 129. On
+                    // later rows tl is also 129, so this leaves p == above;
+                    // at the frame origin (tl == 127) it gives
+                    // 127 + 129 - 127 = 129, as the border convention
+                    // requires.
+                    p = (p ~mod+ 129) ~mod- (tl as base.u32)
+                }
+                // Clamp to [0, 255]; bit 31 set means the ~mod- wrapped below 0.
+                if p > 255 {
+                    if (p & 0x8000_0000) <> 0 {
+                        p = 0
+                    } else {
+                        p = 255
+                    }
+                }
+
+                idx = uv_off ~mod+ ((r as base.u64) * (this.uv_stride as base.u64))
+                idx ~mod+= (c as base.u64)
+                if idx < args.workbuf.length() {
+                    args.workbuf[idx] = (p & 0xFF) as base.u8
+                }
+                c += 1
+            }
+            r += 1
+        }
+    }
+}
+
+// predict_4x4! fills a single 4x4 sub-block for B_PRED mode.
+// block_idx is 0..15 identifying which sub-block within the macroblock.
+// +// Reference pixel layout around the 4x4 block: +// tl a0 a1 a2 a3 a4 a5 a6 a7 +// l0 X X X X +// l1 X X X X +// l2 X X X X +// l3 X X X X +// +// Modes (VP8 spec ordering): +// 0=DC, 1=TM, 2=VE, 3=HE, 4=LD, 5=RD, 6=VR, 7=VL, 8=HD, 9=HU +pri func decoder.predict_4x4!(workbuf: slice base.u8, block_idx: base.u32[..= 15], mode: base.u8) { + var y_off : base.u64 + var bx : base.u32 + var by : base.u32 + var idx : base.u64 + var stride : base.u64 + var has_top : base.bool + var has_left : base.bool + var tl : base.u32 + var a0 : base.u32 + var a1 : base.u32 + var a2 : base.u32 + var a3 : base.u32 + var a4 : base.u32 + var a5 : base.u32 + var a6 : base.u32 + var a7 : base.u32 + var l0 : base.u32 + var l1 : base.u32 + var l2 : base.u32 + var l3 : base.u32 + var dc : base.u32 + var s : slice base.u8 + var above4 : base.u32 + + bx = args.block_idx & 3 + by = args.block_idx >> 2 + y_off = (this.mb_y as base.u64) * 16 * (this.y_stride as base.u64) + y_off ~mod+= ((this.mb_x as base.u64) * 16) + y_off ~mod+= ((by as base.u64) * 4 * (this.y_stride as base.u64)) + y_off ~mod+= ((bx as base.u64) * 4) + stride = this.y_stride as base.u64 + + // Determine if "above" and "left" reference pixels are available. + // Within the macroblock, sub-blocks above/left of us are already decoded. + has_top = (by > 0) or (this.mb_y > 0) + has_left = (bx > 0) or (this.mb_x > 0) + + // Load the 4 "above" pixels (a0..a3) via peek_u32le (1 load, 2 checks). + if has_top and (y_off >= stride) { + idx = y_off ~mod- stride + if idx < args.workbuf.length() { + s = args.workbuf[idx ..] + if s.length() >= 4 { + above4 = s.peek_u32le() + a0 = above4 & 0xFF + a1 = (above4 >> 8) & 0xFF + a2 = (above4 >> 16) & 0xFF + a3 = above4 >> 24 + } + } + } else { + a0 = 127 + a1 = 127 + a2 = 127 + a3 = 127 + } + + // Load upper-right pixels (a4..a7) via peek_u32le. + if has_top and (y_off >= stride) and (bx < 3) { + idx = (y_off ~mod- stride) ~mod+ 4 + if idx < args.workbuf.length() { + s = args.workbuf[idx ..] 
+ if s.length() >= 4 { + above4 = s.peek_u32le() + a4 = above4 & 0xFF + a5 = (above4 >> 8) & 0xFF + a6 = (above4 >> 16) & 0xFF + a7 = above4 >> 24 + } + } + } else if (bx >= 3) and has_top { + a4 = this.mb_upper_right[0] as base.u32 + a5 = this.mb_upper_right[1] as base.u32 + a6 = this.mb_upper_right[2] as base.u32 + a7 = this.mb_upper_right[3] as base.u32 + } else { + a4 = a3 + a5 = a3 + a6 = a3 + a7 = a3 + } + + // Load the 4 "left" pixels (l0..l3). + if has_left and (y_off > 0) { + idx = y_off ~mod- 1 + if idx < args.workbuf.length() { + l0 = args.workbuf[idx] as base.u32 + } + idx = (y_off ~mod+ stride) ~mod- 1 + if idx < args.workbuf.length() { + l1 = args.workbuf[idx] as base.u32 + } + idx = (y_off ~mod+ (stride ~mod* 2)) ~mod- 1 + if idx < args.workbuf.length() { + l2 = args.workbuf[idx] as base.u32 + } + idx = (y_off ~mod+ (stride ~mod* 3)) ~mod- 1 + if idx < args.workbuf.length() { + l3 = args.workbuf[idx] as base.u32 + } + } else { + l0 = 129 + l1 = 129 + l2 = 129 + l3 = 129 + } + + // Load top-left pixel. + if has_top and has_left and (y_off > stride) { + idx = (y_off ~mod- stride) ~mod- 1 + if idx < args.workbuf.length() { + tl = args.workbuf[idx] as base.u32 + } + } else if has_top and (not has_left) { + tl = 129 + } else { + // When has_top is false (mby=0, by=0), tl is from the above border + // row, which is all 127 in VP8. This applies whether or not has_left + // is true. Go's prepareYBR sets row 0 to 0x7f after column 7 to 0x81, + // so the above border value (127) wins at the corner. + tl = 127 + } + + if args.mode == 0 { + // DC: average of 4 above + 4 left pixels. 
+ dc = ((((((((a0 ~mod+ a1) ~mod+ a2) ~mod+ a3) ~mod+ l0) ~mod+ l1) ~mod+ l2) ~mod+ l3) ~mod+ 4) + dc = (dc >> 3) & 0xFF + this.pred4x4_store!(workbuf: args.workbuf, off: y_off, + v00: dc, v01: dc, v02: dc, v03: dc, + v10: dc, v11: dc, v12: dc, v13: dc, + v20: dc, v21: dc, v22: dc, v23: dc, + v30: dc, v31: dc, v32: dc, v33: dc) + } else if args.mode == 1 { + // TM: pred[r][c] = clamp(above[c] + left[r] - tl) + this.pred4x4_store!(workbuf: args.workbuf, off: y_off, + v00: this.clip8(v: (a0 ~mod+ l0) ~mod- tl), + v01: this.clip8(v: (a1 ~mod+ l0) ~mod- tl), + v02: this.clip8(v: (a2 ~mod+ l0) ~mod- tl), + v03: this.clip8(v: (a3 ~mod+ l0) ~mod- tl), + v10: this.clip8(v: (a0 ~mod+ l1) ~mod- tl), + v11: this.clip8(v: (a1 ~mod+ l1) ~mod- tl), + v12: this.clip8(v: (a2 ~mod+ l1) ~mod- tl), + v13: this.clip8(v: (a3 ~mod+ l1) ~mod- tl), + v20: this.clip8(v: (a0 ~mod+ l2) ~mod- tl), + v21: this.clip8(v: (a1 ~mod+ l2) ~mod- tl), + v22: this.clip8(v: (a2 ~mod+ l2) ~mod- tl), + v23: this.clip8(v: (a3 ~mod+ l2) ~mod- tl), + v30: this.clip8(v: (a0 ~mod+ l3) ~mod- tl), + v31: this.clip8(v: (a1 ~mod+ l3) ~mod- tl), + v32: this.clip8(v: (a2 ~mod+ l3) ~mod- tl), + v33: this.clip8(v: (a3 ~mod+ l3) ~mod- tl)) + } else if args.mode == 2 { + // VE: smoothed vertical prediction. + // Each column uses a weighted 3-tap filter of the above pixels. 
+ this.pred4x4_store!(workbuf: args.workbuf, off: y_off, + v00: this.avg3(a: tl, b: a0, c: a1), v01: this.avg3(a: a0, b: a1, c: a2), + v02: this.avg3(a: a1, b: a2, c: a3), v03: this.avg3(a: a2, b: a3, c: a4), + v10: this.avg3(a: tl, b: a0, c: a1), v11: this.avg3(a: a0, b: a1, c: a2), + v12: this.avg3(a: a1, b: a2, c: a3), v13: this.avg3(a: a2, b: a3, c: a4), + v20: this.avg3(a: tl, b: a0, c: a1), v21: this.avg3(a: a0, b: a1, c: a2), + v22: this.avg3(a: a1, b: a2, c: a3), v23: this.avg3(a: a2, b: a3, c: a4), + v30: this.avg3(a: tl, b: a0, c: a1), v31: this.avg3(a: a0, b: a1, c: a2), + v32: this.avg3(a: a1, b: a2, c: a3), v33: this.avg3(a: a2, b: a3, c: a4)) + } else if args.mode == 3 { + // HE: smoothed horizontal prediction. + this.pred4x4_store!(workbuf: args.workbuf, off: y_off, + v00: this.avg3(a: tl, b: l0, c: l1), v01: this.avg3(a: tl, b: l0, c: l1), + v02: this.avg3(a: tl, b: l0, c: l1), v03: this.avg3(a: tl, b: l0, c: l1), + v10: this.avg3(a: l0, b: l1, c: l2), v11: this.avg3(a: l0, b: l1, c: l2), + v12: this.avg3(a: l0, b: l1, c: l2), v13: this.avg3(a: l0, b: l1, c: l2), + v20: this.avg3(a: l1, b: l2, c: l3), v21: this.avg3(a: l1, b: l2, c: l3), + v22: this.avg3(a: l1, b: l2, c: l3), v23: this.avg3(a: l1, b: l2, c: l3), + v30: this.avg3(a: l2, b: l3, c: l3), v31: this.avg3(a: l2, b: l3, c: l3), + v32: this.avg3(a: l2, b: l3, c: l3), v33: this.avg3(a: l2, b: l3, c: l3)) + } else if args.mode == 4 { + // LD: Left-Down diagonal. Uses top + upper-right pixels. 
+ this.pred4x4_store!(workbuf: args.workbuf, off: y_off, + v00: this.avg3(a: a0, b: a1, c: a2), v01: this.avg3(a: a1, b: a2, c: a3), + v02: this.avg3(a: a2, b: a3, c: a4), v03: this.avg3(a: a3, b: a4, c: a5), + v10: this.avg3(a: a1, b: a2, c: a3), v11: this.avg3(a: a2, b: a3, c: a4), + v12: this.avg3(a: a3, b: a4, c: a5), v13: this.avg3(a: a4, b: a5, c: a6), + v20: this.avg3(a: a2, b: a3, c: a4), v21: this.avg3(a: a3, b: a4, c: a5), + v22: this.avg3(a: a4, b: a5, c: a6), v23: this.avg3(a: a5, b: a6, c: a7), + v30: this.avg3(a: a3, b: a4, c: a5), v31: this.avg3(a: a4, b: a5, c: a6), + v32: this.avg3(a: a5, b: a6, c: a7), v33: this.avg3(a: a6, b: a7, c: a7)) + } else if args.mode == 5 { + // RD: Right-Down diagonal. + this.pred4x4_store!(workbuf: args.workbuf, off: y_off, + v00: this.avg3(a: l0, b: tl, c: a0), v01: this.avg3(a: tl, b: a0, c: a1), + v02: this.avg3(a: a0, b: a1, c: a2), v03: this.avg3(a: a1, b: a2, c: a3), + v10: this.avg3(a: l1, b: l0, c: tl), v11: this.avg3(a: l0, b: tl, c: a0), + v12: this.avg3(a: tl, b: a0, c: a1), v13: this.avg3(a: a0, b: a1, c: a2), + v20: this.avg3(a: l2, b: l1, c: l0), v21: this.avg3(a: l1, b: l0, c: tl), + v22: this.avg3(a: l0, b: tl, c: a0), v23: this.avg3(a: tl, b: a0, c: a1), + v30: this.avg3(a: l3, b: l2, c: l1), v31: this.avg3(a: l2, b: l1, c: l0), + v32: this.avg3(a: l1, b: l0, c: tl), v33: this.avg3(a: l0, b: tl, c: a0)) + } else if args.mode == 6 { + // VR: Vertical-Right diagonal. 
+ this.pred4x4_store!(workbuf: args.workbuf, off: y_off, + v00: this.avg2(a: tl, b: a0), v01: this.avg2(a: a0, b: a1), + v02: this.avg2(a: a1, b: a2), v03: this.avg2(a: a2, b: a3), + v10: this.avg3(a: l0, b: tl, c: a0), v11: this.avg3(a: tl, b: a0, c: a1), + v12: this.avg3(a: a0, b: a1, c: a2), v13: this.avg3(a: a1, b: a2, c: a3), + v20: this.avg3(a: l1, b: l0, c: tl), v21: this.avg2(a: tl, b: a0), + v22: this.avg2(a: a0, b: a1), v23: this.avg2(a: a1, b: a2), + v30: this.avg3(a: l2, b: l1, c: l0), v31: this.avg3(a: l0, b: tl, c: a0), + v32: this.avg3(a: tl, b: a0, c: a1), v33: this.avg3(a: a0, b: a1, c: a2)) + } else if args.mode == 7 { + // VL: Vertical-Left diagonal. + this.pred4x4_store!(workbuf: args.workbuf, off: y_off, + v00: this.avg2(a: a0, b: a1), v01: this.avg2(a: a1, b: a2), + v02: this.avg2(a: a2, b: a3), v03: this.avg2(a: a3, b: a4), + v10: this.avg3(a: a0, b: a1, c: a2), v11: this.avg3(a: a1, b: a2, c: a3), + v12: this.avg3(a: a2, b: a3, c: a4), v13: this.avg3(a: a3, b: a4, c: a5), + v20: this.avg2(a: a1, b: a2), v21: this.avg2(a: a2, b: a3), + v22: this.avg2(a: a3, b: a4), v23: this.avg3(a: a4, b: a5, c: a6), + v30: this.avg3(a: a1, b: a2, c: a3), v31: this.avg3(a: a2, b: a3, c: a4), + v32: this.avg3(a: a3, b: a4, c: a5), v33: this.avg3(a: a5, b: a6, c: a7)) + } else if args.mode == 8 { + // HD: Horizontal-Down diagonal. 
+ this.pred4x4_store!(workbuf: args.workbuf, off: y_off, + v00: this.avg2(a: l0, b: tl), v01: this.avg3(a: l0, b: tl, c: a0), + v02: this.avg3(a: tl, b: a0, c: a1), v03: this.avg3(a: a0, b: a1, c: a2), + v10: this.avg2(a: l1, b: l0), v11: this.avg3(a: l1, b: l0, c: tl), + v12: this.avg2(a: l0, b: tl), v13: this.avg3(a: l0, b: tl, c: a0), + v20: this.avg2(a: l2, b: l1), v21: this.avg3(a: l2, b: l1, c: l0), + v22: this.avg2(a: l1, b: l0), v23: this.avg3(a: l1, b: l0, c: tl), + v30: this.avg2(a: l3, b: l2), v31: this.avg3(a: l3, b: l2, c: l1), + v32: this.avg2(a: l2, b: l1), v33: this.avg3(a: l2, b: l1, c: l0)) + } else { + // HU: Horizontal-Up diagonal. Uses left pixels only. + this.pred4x4_store!(workbuf: args.workbuf, off: y_off, + v00: this.avg2(a: l0, b: l1), v01: this.avg3(a: l0, b: l1, c: l2), + v02: this.avg2(a: l1, b: l2), v03: this.avg3(a: l1, b: l2, c: l3), + v10: this.avg2(a: l1, b: l2), v11: this.avg3(a: l1, b: l2, c: l3), + v12: this.avg2(a: l2, b: l3), v13: this.avg3(a: l2, b: l3, c: l3), + v20: this.avg2(a: l2, b: l3), v21: this.avg3(a: l2, b: l3, c: l3), + v22: (l3 & 0xFF), v23: (l3 & 0xFF), + v30: (l3 & 0xFF), v31: (l3 & 0xFF), + v32: (l3 & 0xFF), v33: (l3 & 0xFF)) + } +} + +// pred4x4_store! writes 16 pixel values to a 4x4 block at the given offset. +// Uses packed u32 writes (4 bytes at a time) for efficiency. +pri func decoder.pred4x4_store!(workbuf: slice base.u8, off: base.u64, + v00: base.u32, v01: base.u32, v02: base.u32, v03: base.u32, + v10: base.u32, v11: base.u32, v12: base.u32, v13: base.u32, + v20: base.u32, v21: base.u32, v22: base.u32, v23: base.u32, + v30: base.u32, v31: base.u32, v32: base.u32, v33: base.u32) { + var stride : base.u64 + var row_off : base.u64 + var s : slice base.u8 + + stride = this.y_stride as base.u64 + + // Row 0. + row_off = args.off + if row_off < args.workbuf.length() { + s = args.workbuf[row_off ..] 
+ if s.length() >= 4 { + s.poke_u32le!(a: (args.v00 & 0xFF) | ((args.v01 & 0xFF) << 8) | ((args.v02 & 0xFF) << 16) | ((args.v03 & 0xFF) << 24)) + } + } + + // Row 1. + row_off = args.off ~mod+ stride + if row_off < args.workbuf.length() { + s = args.workbuf[row_off ..] + if s.length() >= 4 { + s.poke_u32le!(a: (args.v10 & 0xFF) | ((args.v11 & 0xFF) << 8) | ((args.v12 & 0xFF) << 16) | ((args.v13 & 0xFF) << 24)) + } + } + + // Row 2. + row_off = args.off ~mod+ (stride ~mod* 2) + if row_off < args.workbuf.length() { + s = args.workbuf[row_off ..] + if s.length() >= 4 { + s.poke_u32le!(a: (args.v20 & 0xFF) | ((args.v21 & 0xFF) << 8) | ((args.v22 & 0xFF) << 16) | ((args.v23 & 0xFF) << 24)) + } + } + + // Row 3. + row_off = args.off ~mod+ (stride ~mod* 3) + if row_off < args.workbuf.length() { + s = args.workbuf[row_off ..] + if s.length() >= 4 { + s.poke_u32le!(a: (args.v30 & 0xFF) | ((args.v31 & 0xFF) << 8) | ((args.v32 & 0xFF) << 16) | ((args.v33 & 0xFF) << 24)) + } + } +} + +// avg2 (a pure function) computes ((a + b + 1) / 2) & 0xFF for pixel averaging. +pri func decoder.avg2(a: base.u32, b: base.u32) base.u32 { + return (((args.a ~mod+ args.b) ~mod+ 1) >> 1) & 0xFF +} + +// avg3 (a pure function) computes ((a + 2*b + c + 2) / 4) & 0xFF for weighted pixel averaging. +pri func decoder.avg3(a: base.u32, b: base.u32, c: base.u32) base.u32 { + return (((((args.a ~mod+ (args.b ~mod* 2)) ~mod+ args.c) ~mod+ 2)) >> 2) & 0xFF +} + +// clip8 clamps a u32 value to [0, 255]. Values with bit 31 set (negative +// in two's complement from ~mod- underflow) are clamped to 0. +pri func decoder.clip8(v: base.u32) base.u32 { + if args.v <= 255 { + return args.v + } + if (args.v & 0x8000_0000) <> 0 { + return 0 + } + return 255 +} diff --git a/std/vp8/decode_predict_arm_neon.wuffs b/std/vp8/decode_predict_arm_neon.wuffs new file mode 100644 index 000000000..ffad079f1 --- /dev/null +++ b/std/vp8/decode_predict_arm_neon.wuffs @@ -0,0 +1,340 @@ +// Copyright 2024 The Wuffs Authors.
+// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. +// +// SPDX-License-Identifier: Apache-2.0 OR MIT + +// VP8 16x16 intra prediction, ARM NEON version. +// +// Uses NEON for the above-row load and the fill/compute inner loops. The left +// column and top-left pixel are loaded with scalar code (bytes a stride apart). + +pri func decoder.predict_16x16_arm_neon!(workbuf: slice base.u8, mode: base.u8[..= 3]), + choose cpu_arch >= arm_neon, +{ + var util : base.arm_neon_utility + + // Reference pixel arrays (pre-loaded before reslicing workbuf). + var left_arr : array[16] base.u8 + var tl : base.u8 + var s : slice base.u8 + + // NEON registers. + var above : base.arm_neon_u8x16 + var diff_u8 : base.arm_neon_u8x16 + var result : base.arm_neon_u8x16 + + var y_off : base.u64 + var idx : base.u64 + var r : base.u32 + var sum : base.u32 + var count : base.u32 + var dc : base.u8 + var left_val : base.u8 + var tl_val : base.u8 + + // Calculate block offset. + y_off = (this.mb_y as base.u64) * 16 * (this.y_stride as base.u64) + y_off ~mod+= ((this.mb_x as base.u64) * 16) + + // Load above row directly into NEON register (16 contiguous bytes). + if (this.mb_y > 0) and (y_off >= (this.y_stride as base.u64)) { + idx = y_off ~mod- (this.y_stride as base.u64) + if idx < args.workbuf.length() { + s = args.workbuf[idx ..] + if s.length() >= 16 { + above = util.make_u8x16_slice128(a: s[.. 16]) + } + } + } else { + above = util.make_u8x16_repeat(a: 127) + } + + // Pre-load left column (16 bytes, stride apart). + r = 0 + while r < 16 { + left_arr[r] = 129 + if this.mb_x > 0 { + idx = y_off ~mod+ ((r as base.u64) * (this.y_stride as base.u64)) + if idx > 0 { + idx -= 1 + if idx < args.workbuf.length() { + left_arr[r] = args.workbuf[idx] + } + } + } + r += 1 + } + + // Pre-load top-left pixel.
+ tl = 127 + if (this.mb_x > 0) and (this.mb_y > 0) and (y_off > (this.y_stride as base.u64)) { + idx = (y_off - (this.y_stride as base.u64)) - 1 + if idx < args.workbuf.length() { + tl = args.workbuf[idx] + } + } else if (this.mb_x == 0) and (this.mb_y > 0) { + tl = 129 + } + + // Reslice workbuf to block start for efficient output. + if y_off <= args.workbuf.length() { + args.workbuf = args.workbuf[y_off ..] + } + + if args.mode == 0 { + // DC prediction: average of above + left pixels. + sum = 0 + count = 0 + if this.mb_y > 0 { + // Sum above row using vaddlvq_u8 (add-long-across). + sum = above.vaddlvq_u8() as base.u32 + count = 16 + } + if this.mb_x > 0 { + r = 0 + while r < 16 { + sum ~mod+= left_arr[r] as base.u32 + r += 1 + } + count ~mod+= 16 + } + if count > 0 { + dc = (((sum ~mod+ (count >> 1)) / count) & 0xFF) as base.u8 + } else { + dc = 128 + } + + result = util.make_u8x16_repeat(a: dc) + r = 0 + while r < 16 { + if 16 <= args.workbuf.length() { + result.store_slice128!(a: args.workbuf[.. 16]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + r += 1 + } + + } else if args.mode == 1 { + // V prediction: replicate the above row. + r = 0 + while r < 16 { + if 16 <= args.workbuf.length() { + above.store_slice128!(a: args.workbuf[.. 16]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + r += 1 + } + + } else if args.mode == 2 { + // H prediction: broadcast left[r] per row. + r = 0 + while r < 16 { + result = util.make_u8x16_repeat(a: left_arr[r]) + if 16 <= args.workbuf.length() { + result.store_slice128!(a: args.workbuf[.. 16]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + r += 1 + } + + } else { + // TM prediction: pred[r][c] = clamp(above[c] + left[r] - tl). 
+ // Use unsigned-only arithmetic with per-row branching: + // if left[r] >= tl: result = vqadd(above, left[r] - tl) + // if left[r] < tl: result = vqsub(above, tl - left[r]) + tl_val = tl + r = 0 + while r < 16 { + left_val = left_arr[r] + if left_val >= tl_val { + diff_u8 = util.make_u8x16_repeat(a: left_val ~mod- tl_val) + result = above.vqaddq_u8(b: diff_u8) + } else { + diff_u8 = util.make_u8x16_repeat(a: tl_val ~mod- left_val) + result = above.vqsubq_u8(b: diff_u8) + } + if 16 <= args.workbuf.length() { + result.store_slice128!(a: args.workbuf[.. 16]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + r += 1 + } + } +} + +// VP8 8x8 chroma intra prediction, ARM NEON version. +// +// Same 4 modes as 16x16 (DC, V, H, TM) but operating on 8x8 blocks +// using the UV plane stride. Uses 64-bit NEON loads/stores. + +pri func decoder.predict_8x8_arm_neon!(workbuf: slice base.u8, mode: base.u8[..= 3], plane_offset: base.u64), + choose cpu_arch >= arm_neon, +{ + var util : base.arm_neon_utility + + // Reference pixel arrays (pre-loaded before reslicing workbuf). + var left_arr : array[8] base.u8 + var tl : base.u8 + var s : slice base.u8 + + // NEON registers. + var above : base.arm_neon_u8x8 + var diff_u8 : base.arm_neon_u8x8 + var result : base.arm_neon_u8x8 + + var uv_off : base.u64 + var idx : base.u64 + var r : base.u32 + var sum : base.u32 + var count : base.u32 + var dc : base.u8 + var left_val : base.u8 + var tl_val : base.u8 + + // Calculate block offset within the U or V plane. + uv_off = args.plane_offset ~mod+ ((this.mb_y as base.u64) * 8 * (this.uv_stride as base.u64)) + uv_off ~mod+= ((this.mb_x as base.u64) * 8) + + // Load above row directly into NEON register (8 contiguous bytes). + if (this.mb_y > 0) and (uv_off >= (this.uv_stride as base.u64)) { + idx = uv_off ~mod- (this.uv_stride as base.u64) + if idx < args.workbuf.length() { + s = args.workbuf[idx ..] 
+ if s.length() >= 8 { + above = util.make_u8x8_slice64(a: s[.. 8]) + } + } + } else { + above = util.make_u8x8_repeat(a: 127) + } + + // Pre-load left column (8 bytes, uv_stride apart). + r = 0 + while r < 8 { + left_arr[r] = 129 + if this.mb_x > 0 { + idx = uv_off ~mod+ ((r as base.u64) * (this.uv_stride as base.u64)) + if idx > 0 { + idx -= 1 + if idx < args.workbuf.length() { + left_arr[r] = args.workbuf[idx] + } + } + } + r += 1 + } + + // Pre-load top-left pixel. + tl = 127 + if (this.mb_x > 0) and (this.mb_y > 0) and (uv_off > (this.uv_stride as base.u64)) { + idx = (uv_off - (this.uv_stride as base.u64)) - 1 + if idx < args.workbuf.length() { + tl = args.workbuf[idx] + } + } else if (this.mb_x == 0) and (this.mb_y > 0) { + tl = 129 + } + + // Reslice workbuf to block start for efficient output. + if uv_off <= args.workbuf.length() { + args.workbuf = args.workbuf[uv_off ..] + } + + if args.mode == 0 { + // DC prediction: average of above + left pixels. + sum = 0 + count = 0 + if this.mb_y > 0 { + // Sum above row using vaddlv_u8 (add-long-across, 64-bit). + sum = above.vaddlv_u8() as base.u32 + count = 8 + } + if this.mb_x > 0 { + r = 0 + while r < 8 { + sum ~mod+= left_arr[r] as base.u32 + r += 1 + } + count ~mod+= 8 + } + if count > 0 { + dc = (((sum ~mod+ (count >> 1)) / count) & 0xFF) as base.u8 + } else { + dc = 128 + } + + result = util.make_u8x8_repeat(a: dc) + r = 0 + while r < 8 { + if 8 <= args.workbuf.length() { + result.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.uv_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.uv_stride as base.u64) ..] + } + r += 1 + } + + } else if args.mode == 1 { + // V prediction: replicate the above row. + r = 0 + while r < 8 { + if 8 <= args.workbuf.length() { + above.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.uv_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.uv_stride as base.u64) ..] 
+ } + r += 1 + } + + } else if args.mode == 2 { + // H prediction: broadcast left[r] per row. + r = 0 + while r < 8 { + result = util.make_u8x8_repeat(a: left_arr[r]) + if 8 <= args.workbuf.length() { + result.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.uv_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.uv_stride as base.u64) ..] + } + r += 1 + } + + } else { + // TM prediction: pred[r][c] = clamp(above[c] + left[r] - tl). + tl_val = tl + r = 0 + while r < 8 { + left_val = left_arr[r] + if left_val >= tl_val { + diff_u8 = util.make_u8x8_repeat(a: left_val ~mod- tl_val) + result = above.vqadd_u8(b: diff_u8) + } else { + diff_u8 = util.make_u8x8_repeat(a: tl_val ~mod- left_val) + result = above.vqsub_u8(b: diff_u8) + } + if 8 <= args.workbuf.length() { + result.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.uv_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.uv_stride as base.u64) ..] + } + r += 1 + } + } +} diff --git a/std/vp8/decode_predict_x86_sse42.wuffs b/std/vp8/decode_predict_x86_sse42.wuffs new file mode 100644 index 000000000..9385f9e1d --- /dev/null +++ b/std/vp8/decode_predict_x86_sse42.wuffs @@ -0,0 +1,349 @@ +// Copyright 2024 The Wuffs Authors. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. +// +// SPDX-License-Identifier: Apache-2.0 OR MIT + +// VP8 16x16 intra prediction, SSE4.2 version. +// +// Uses SSE for the above-row load and the fill/compute inner loops. The left +// column and top-left pixel are loaded with scalar code (bytes a stride apart). + +pri func decoder.predict_16x16_x86_sse42!(workbuf: slice base.u8, mode: base.u8[..= 3]), + choose cpu_arch >= x86_sse42, +{ + var util : base.x86_sse42_utility + + // Reference pixel arrays (pre-loaded before reslicing workbuf).
+ var left_arr : array[16] base.u8 + var tl : base.u8 + var s : slice base.u8 + + // SSE registers. + var zero : base.x86_m128i + var above : base.x86_m128i + var diff : base.x86_m128i + var result : base.x86_m128i + var sad : base.x86_m128i + var tmp : base.x86_m128i + + var y_off : base.u64 + var idx : base.u64 + var r : base.u32 + var sum : base.u32 + var count : base.u32 + var dc : base.u8 + + zero = util.make_m128i_zeroes() + + // Calculate block offset. + y_off = (this.mb_y as base.u64) * 16 * (this.y_stride as base.u64) + y_off ~mod+= ((this.mb_x as base.u64) * 16) + + // Load above row directly into SSE register (16 contiguous bytes). + if (this.mb_y > 0) and (y_off >= (this.y_stride as base.u64)) { + idx = y_off ~mod- (this.y_stride as base.u64) + if idx < args.workbuf.length() { + s = args.workbuf[idx ..] + if s.length() >= 16 { + above = util.make_m128i_slice128(a: s[.. 16]) + } + } + } else { + above = util.make_m128i_repeat_u8(a: 127) + } + + // Pre-load left column (16 bytes, stride apart). + r = 0 + while r < 16 { + left_arr[r] = 129 + if this.mb_x > 0 { + idx = y_off ~mod+ ((r as base.u64) * (this.y_stride as base.u64)) + if idx > 0 { + idx -= 1 + if idx < args.workbuf.length() { + left_arr[r] = args.workbuf[idx] + } + } + } + r += 1 + } + + // Pre-load top-left pixel. + tl = 127 + if (this.mb_x > 0) and (this.mb_y > 0) and (y_off > (this.y_stride as base.u64)) { + idx = (y_off - (this.y_stride as base.u64)) - 1 + if idx < args.workbuf.length() { + tl = args.workbuf[idx] + } + } else if (this.mb_x == 0) and (this.mb_y > 0) { + tl = 129 + } + + // Reslice workbuf to block start for efficient output. + if y_off <= args.workbuf.length() { + args.workbuf = args.workbuf[y_off ..] + } + + if args.mode == 0 { + // DC prediction: average of above + left pixels. + sum = 0 + count = 0 + if this.mb_y > 0 { + // Sum above row using SAD trick: sad_epu8(above, zero) gives byte sums. 
+ sad = above._mm_sad_epu8(b: zero) + // sad = [sum_lo, 0, 0, 0, sum_hi, 0, 0, 0] in u16 layout. + tmp = sad._mm_srli_si128(imm8: 8) + sad = sad._mm_add_epi32(b: tmp) + sum = sad.truncate_u32() + count = 16 + } + if this.mb_x > 0 { + r = 0 + while r < 16 { + sum ~mod+= left_arr[r] as base.u32 + r += 1 + } + count ~mod+= 16 + } + if count > 0 { + dc = (((sum ~mod+ (count >> 1)) / count) & 0xFF) as base.u8 + } else { + dc = 128 + } + + result = util.make_m128i_repeat_u8(a: dc) + r = 0 + while r < 16 { + if 16 <= args.workbuf.length() { + result.store_slice128!(a: args.workbuf[.. 16]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + r += 1 + } + + } else if args.mode == 1 { + // V prediction: replicate the above row. + r = 0 + while r < 16 { + if 16 <= args.workbuf.length() { + above.store_slice128!(a: args.workbuf[.. 16]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + r += 1 + } + + } else if args.mode == 2 { + // H prediction: broadcast left[r] per row. + r = 0 + while r < 16 { + result = util.make_m128i_repeat_u8(a: left_arr[r]) + if 16 <= args.workbuf.length() { + result.store_slice128!(a: args.workbuf[.. 16]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + r += 1 + } + + } else { + // TM prediction: pred[r][c] = clamp(above[c] + left[r] - tl). 
+ // Use saturating unsigned byte arithmetic (same approach as ARM NEON): + // if left[r] >= tl: result = adds_epu8(above, left[r] - tl) + // if left[r] < tl: result = subs_epu8(above, tl - left[r]) + r = 0 + while r < 16 { + if left_arr[r] >= tl { + diff = util.make_m128i_repeat_u8(a: left_arr[r] ~mod- tl) + result = above._mm_adds_epu8(b: diff) + } else { + diff = util.make_m128i_repeat_u8(a: tl ~mod- left_arr[r]) + result = above._mm_subs_epu8(b: diff) + } + if 16 <= args.workbuf.length() { + result.store_slice128!(a: args.workbuf[.. 16]) + } + if (this.y_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.y_stride as base.u64) ..] + } + r += 1 + } + } +} + +// VP8 8x8 chroma intra prediction, SSE4.2 version. +// +// Same 4 modes as 16x16 (DC, V, H, TM) but operating on 8x8 blocks +// using the UV plane stride. Uses 64-bit SSE loads/stores. + +pri func decoder.predict_8x8_x86_sse42!(workbuf: slice base.u8, mode: base.u8[..= 3], plane_offset: base.u64), + choose cpu_arch >= x86_sse42, +{ + var util : base.x86_sse42_utility + + // Reference pixel arrays (pre-loaded before reslicing workbuf). + var left_arr : array[8] base.u8 + var tl : base.u8 + var s : slice base.u8 + + // SSE registers. + var zero : base.x86_m128i + var above : base.x86_m128i + var diff : base.x86_m128i + var result : base.x86_m128i + var sad : base.x86_m128i + + var uv_off : base.u64 + var idx : base.u64 + var r : base.u32 + var sum : base.u32 + var count : base.u32 + var dc : base.u8 + + zero = util.make_m128i_zeroes() + + // Calculate block offset within the U or V plane. + uv_off = args.plane_offset ~mod+ ((this.mb_y as base.u64) * 8 * (this.uv_stride as base.u64)) + uv_off ~mod+= ((this.mb_x as base.u64) * 8) + + // Load above row directly into SSE register (8 contiguous bytes). + if (this.mb_y > 0) and (uv_off >= (this.uv_stride as base.u64)) { + idx = uv_off ~mod- (this.uv_stride as base.u64) + if idx < args.workbuf.length() { + s = args.workbuf[idx ..] 
+ if s.length() >= 8 { + above = util.make_m128i_slice64(a: s[.. 8]) + } + } + } else { + above = util.make_m128i_repeat_u8(a: 127) + } + + // Pre-load left column (8 bytes, uv_stride apart). + r = 0 + while r < 8 { + left_arr[r] = 129 + if this.mb_x > 0 { + idx = uv_off ~mod+ ((r as base.u64) * (this.uv_stride as base.u64)) + if idx > 0 { + idx -= 1 + if idx < args.workbuf.length() { + left_arr[r] = args.workbuf[idx] + } + } + } + r += 1 + } + + // Pre-load top-left pixel. + tl = 127 + if (this.mb_x > 0) and (this.mb_y > 0) and (uv_off > (this.uv_stride as base.u64)) { + idx = (uv_off - (this.uv_stride as base.u64)) - 1 + if idx < args.workbuf.length() { + tl = args.workbuf[idx] + } + } else if (this.mb_x == 0) and (this.mb_y > 0) { + tl = 129 + } + + // Reslice workbuf to block start for efficient output. + if uv_off <= args.workbuf.length() { + args.workbuf = args.workbuf[uv_off ..] + } + + if args.mode == 0 { + // DC prediction: average of above + left pixels. + sum = 0 + count = 0 + if this.mb_y > 0 { + // SAD trick: sum 8 above bytes (upper 8 bytes of m128i are zero). + sad = above._mm_sad_epu8(b: zero) + sum = sad.truncate_u32() + count = 8 + } + if this.mb_x > 0 { + r = 0 + while r < 8 { + sum ~mod+= left_arr[r] as base.u32 + r += 1 + } + count ~mod+= 8 + } + if count > 0 { + dc = (((sum ~mod+ (count >> 1)) / count) & 0xFF) as base.u8 + } else { + dc = 128 + } + + result = util.make_m128i_repeat_u8(a: dc) + r = 0 + while r < 8 { + if 8 <= args.workbuf.length() { + result.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.uv_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.uv_stride as base.u64) ..] + } + r += 1 + } + + } else if args.mode == 1 { + // V prediction: replicate the above row. + r = 0 + while r < 8 { + if 8 <= args.workbuf.length() { + above.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.uv_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.uv_stride as base.u64) ..] 
+ } + r += 1 + } + + } else if args.mode == 2 { + // H prediction: broadcast left[r] per row. + r = 0 + while r < 8 { + result = util.make_m128i_repeat_u8(a: left_arr[r]) + if 8 <= args.workbuf.length() { + result.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.uv_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.uv_stride as base.u64) ..] + } + r += 1 + } + + } else { + // TM prediction: pred[r][c] = clamp(above[c] + left[r] - tl). + // Use saturating unsigned byte arithmetic (same approach as ARM NEON): + // if left[r] >= tl: result = adds_epu8(above, left[r] - tl) + // if left[r] < tl: result = subs_epu8(above, tl - left[r]) + r = 0 + while r < 8 { + if left_arr[r] >= tl { + diff = util.make_m128i_repeat_u8(a: left_arr[r] ~mod- tl) + result = above._mm_adds_epu8(b: diff) + } else { + diff = util.make_m128i_repeat_u8(a: tl ~mod- left_arr[r]) + result = above._mm_subs_epu8(b: diff) + } + if 8 <= args.workbuf.length() { + result.store_slice64!(a: args.workbuf[.. 8]) + } + if (this.uv_stride as base.u64) <= args.workbuf.length() { + args.workbuf = args.workbuf[(this.uv_stride as base.u64) ..] + } + r += 1 + } + } +} diff --git a/std/vp8/decode_vp8.wuffs b/std/vp8/decode_vp8.wuffs index e804dc051..957de0ba9 100644 --- a/std/vp8/decode_vp8.wuffs +++ b/std/vp8/decode_vp8.wuffs @@ -9,26 +9,229 @@ // SPDX-License-Identifier: Apache-2.0 OR MIT pub status "#bad header" +pub status "#bad coefficient" pub status "#truncated input" pub status "#unsupported VP8 file" -pub const DECODER_WORKBUF_LEN_MAX_INCL_WORST_CASE : base.u64 = 0 +pri status "#internal error: inconsistent decoder state" + +// Max workbuf: 1024 * 1024 * 384 (planes) + 524288 (partition0) = 0x1808_0000. +// For multi-partition VP8, the WebP container adds coefficient data space. +pub const DECODER_WORKBUF_LEN_MAX_INCL_WORST_CASE : base.u64 = 0x1808_0000 pub struct decoder? 
implements base.image_decoder( width : base.u32[..= 0x3FFF], height : base.u32[..= 0x3FFF], + // Macroblock grid dimensions. + mb_width : base.u32[..= 0x400], + mb_height : base.u32[..= 0x400], + // The call sequence state machine is discussed in // (/doc/std/image-decoders-call-sequence.md). call_sequence : base.u8, frame_config_io_position : base.u64, + // Frame header fields from 3-byte frame tag. + key_frame : base.bool, + partition0_size : base.u32, + + // ---- Boolean decoder state (partition 0: header + mode data) ---- + // Wide-accumulator approach (matching partition 1): + // bool_range stores (range - 1), in [0, 254]. + // bool_value (u64) is a wide accumulator. + // bool_bits (u32) is the position counter. + bool_range : base.u32[..= 255], + bool_value : base.u64, + bool_bits : base.u32, + + // Boolean decoder buffer read/write indices (partition 0). + bool_ri : base.u32[..= 0x1000], + bool_wi : base.u32[..= 0x1000], + + // ---- Boolean decoder state (partition 1: coefficient data) ---- + // p1_range stores (range - 1), in [0, 254]. This saves one subtract + // in the split computation per bool read. + // + // Position-based value tracking (like libwebp's VP8GetBit): + // p1_value is a 64-bit accumulator. p1_bits tracks the bit position. + // Extraction: (p1_value >> ((p1_bits - 8) & 63)). + // Renormalization only decrements p1_bits (no value shifting). + // Byte loading shifts value left by 8 and ORs in new byte. 
+ p1_range : base.u32[..= 255], + p1_value : base.u64, + p1_bits : base.u32, + p1_ri : base.u32[..= 0x1000], + p1_wi : base.u32[..= 0x1000], + + // ---- Segmentation ---- + use_segment : base.bool, + update_segment_map : base.bool, + segment_is_abs : base.bool, + segment_quant : array[4] base.i32, + segment_lf : array[4] base.i32, + segment_prob : array[3] base.u8, + + // ---- Loop filter ---- + filter_type : base.u8[..= 1], + filter_level : base.u8[..= 63], + sharpness_level : base.u8[..= 7], + lf_delta_enabled : base.bool, + lf_ref_delta : array[4] base.i32[..= 63], + lf_mode_delta : array[4] base.i32[..= 63], + // Number of Y rows to defer at each MB row boundary for filter overlap. + // The V-MB filter for row N modifies pixels in row N-1, so we hold + // back these rows and output them with the next MB row's swizzle. + // 0 = no filter, 2 = simple filter, 6 = normal filter. + filter_extra_rows : base.u32[..= 8], + + // ---- Quantization ---- + quant_y_ac_qi : base.u8[..= 127], + quant_y_dc_delta : base.i32, + quant_y2_dc_delta : base.i32, + quant_y2_ac_delta : base.i32, + quant_uv_dc_delta : base.i32, + quant_uv_ac_delta : base.i32, + + // Per-segment dequantization values. + dequant_y_dc : array[4] base.u32, + dequant_y_ac : array[4] base.u32, + dequant_y2_dc : array[4] base.u32, + dequant_y2_ac : array[4] base.u32, + dequant_uv_dc : array[4] base.u32, + dequant_uv_ac : array[4] base.u32, + + // Per-segment filter levels (computed from filter_level, sharpness, segment_lf). + seg_filter_level : array[4] base.u32, + + // Precomputed filter strengths per (segment, is_i4x4) combination. + // Indexed by segment*2 + is_i4x4. Avoids per-MB recomputation. + fstrength_level : array[8] base.u8, + fstrength_ilevel : array[8] base.u8, + fstrength_hlevel : array[8] base.u8, + + // ---- Partitions ---- + num_partitions : base.u32[..= 8], + + // Multi-partition state for coefficient data partitions. 
+ // When num_partitions > 1, all coefficient data is copied to workbuf + // and per-partition boolean decoder state is saved/restored at row + // boundaries (MB rows cycle through partitions round-robin). + multi_partition : base.bool, + current_partition : base.u32[..= 7], + part_range : array[8] base.u32, + part_value : array[8] base.u64, + part_bits : array[8] base.u32, + part_wbuf_ri : array[8] base.u32, + part_wbuf_size : array[8] base.u32, + part_wbuf_offset : array[8] base.u64, + current_part_wbuf_ri : base.u32, + + // ---- Macroblock state ---- + mb_x : base.u32[..= 0x400], + mb_y : base.u32[..= 0x400], + + segment_id : base.u8[..= 3], + is_skip_coeff : base.bool, + mb_no_skip_coeff : base.bool, + prob_skip_false : base.u8, + mb_luma_mode : base.u8, + mb_chroma_mode : base.u8[..= 3], + left_nz_y2 : base.u8, + + // ---- Workbuf layout ---- + // Y plane: [0 .. y_end) + // U plane: [y_end .. u_end) + // V plane: [u_end .. v_end) + // Partition 0: [v_end .. v_end + partition0_size) + // Coeff data: [v_end + partition0_size .. ) (multi-partition only) + y_stride : base.u32[..= 0x4000], + uv_stride : base.u32[..= 0x2000], + workbuf_offset_y_end : base.u64[..= 0x1000_0000], + workbuf_offset_u_end : base.u64[..= 0x1400_0000], + workbuf_offset_v_end : base.u64[..= 0x1800_0000], + + // Partition 0 data stored in workbuf: read index and actual count + // of bytes copied (may be less than partition0_size if input truncated). + p0_wbuf_ri : base.u32, + p0_wbuf_count : base.u32, + + // ---- Output ---- dst_x : base.u32, dst_y : base.u32, swizzler : base.pixel_swizzler, util : base.utility, +) + ( + // Boolean decoder buffer for partition 0 (mode data). 4096 bytes. + bool_buffer : array[0x1000] base.u8, + + // Boolean decoder buffer for partition 1 (coefficient data). 4096 bytes. + p1_buffer : array[0x1000] base.u8, + + // Current macroblock coefficients (u32 two's complement for signed). + // 25 blocks × 16 coefficients = 400 values. 
+ mb_coeffs : array[400] base.u32, + + // Per-block NZ tracking for DC-only IDCT optimization. + // mb_y_ac_nz[i]: 1 if Y block i has non-zero AC coefficients (16x16 mode). + // mb_uv_nz[i]: 0=all zero, 1=DC only, 2=has AC. + mb_y_ac_nz : array[16] base.u8, + mb_uv_nz : array[8] base.u8, + + // Side-effect output from decode_block_coeffs: 1 if any AC coefficient + // (index > 0) was non-zero. + block_ac_nz : base.u32, + + // Coefficient probability table (mutable, updated per frame). + // [4 types][8 bands][3 contexts][11 tokens] = 1056 values. + coeff_probs : array[1056] base.u8, + + // Scratch buffer for swizzle_ycck. + scratch_buffer_2k : array[2048] base.u8, + + // Above-row context: number of non-zero coefficients per 4x4 block. + // Max mb_width * 4 = 4096 for Y, mb_width * 2 for U, mb_width * 2 for V. + // Total = mb_width * 8, max = 8192. Sized to 8200 so that mb_x (up to + // 0x400) * 8 + 7 = 8199 is always in-bounds without runtime guards. + above_nz : array[8200] base.u8, + + // Left-column context for current macroblock row. + // 4 Y + 2 U + 2 V = 8. + left_nz : array[8] base.u8, + + // Above-row 4x4 sub-block modes for B_PRED. + // Max mb_width * 4 = 4096. + above_modes : array[4096] base.u8, + + // Left-column 4x4 sub-block modes for current macroblock. + left_modes : array[4] base.u8, + + // Sub-block modes for current macroblock (4x4 grid = 16 modes). + sub_modes : array[16] base.u8, + + // Upper-right pixels for B_PRED rightmost column sub-blocks. + // Saved from the first row (by=0) and reused for by>0. + mb_upper_right : array[4] base.u8, + + // Above-row Y2 non-zero context, one per macroblock column. + // Sized to 0x401 so that mb_x (up to 0x400) is always in-bounds. + above_nz_y2 : array[0x401] base.u8, + + // Per-macroblock filter parameters, ring-buffered by row (2 rows). + // Indexed by (mb_y & 1) * 0x400 + mb_x. Max index = 0x7FF. 
+ // + // Only the current and previous rows are needed at any time (the + // interleaved decode-filter loop decodes row N, then filters row N-1). + // + // For normal filter: level = 2*base_level + ilevel (0..189). + // For simple filter: level = base_level (0..63). + // ilevel and hlevel are only used by the normal filter. + mb_filter_level : array[0x800] base.u8, + mb_filter_ilevel : array[0x800] base.u8, + mb_filter_hlevel : array[0x800] base.u8, + mb_filter_inner : array[0x800] base.u8, ) pub func decoder.get_quirk(key: base.u32) base.u64 { @@ -58,12 +261,15 @@ pri func decoder.do_decode_image_config?(dst: nptr base.image_config, src: base. return base."#bad call sequence" } + // 3-byte frame tag. c32 = args.src.read_u24le_as_u32?() - if (c32 & 0x01) <> 0 { - // TODO: support non-key frames. + this.key_frame = ((c32 & 0x01) == 0) + if not this.key_frame { return "#unsupported VP8 file" } + this.partition0_size = (c32 >> 5) & 0x7_FFFF + // 7-byte key frame header: 3-byte start code + 4-byte dimensions. c32 = args.src.read_u24le_as_u32?() if c32 <> '\x9D\x01\x2A'le { return "#bad header" @@ -73,6 +279,19 @@ pri func decoder.do_decode_image_config?(dst: nptr base.image_config, src: base. this.width = 0x3FFF & (c32 >> 0) this.height = 0x3FFF & (c32 >> 16) + // Calculate macroblock grid dimensions. + this.mb_width = (this.width + 15) / 16 + this.mb_height = (this.height + 15) / 16 + + // Calculate workbuf layout: Y, U, V planes. 
+	this.y_stride = this.mb_width * 16  // Y plane row stride: 16 bytes per macroblock column.
+	this.uv_stride = this.mb_width * 8  // U and V plane row stride: 8 bytes per macroblock column.
+	this.workbuf_offset_y_end = (this.y_stride as base.u64) * ((this.mb_height * 16) as base.u64)
+	this.workbuf_offset_u_end = this.workbuf_offset_y_end +
+		((this.uv_stride as base.u64) * ((this.mb_height * 8) as base.u64))
+	this.workbuf_offset_v_end = this.workbuf_offset_u_end +
+		((this.uv_stride as base.u64) * ((this.mb_height * 8) as base.u64))  // Planes sit back to back in workbuf: Y, then U, then V.
+
 	this.frame_config_io_position = args.src.position()
 
 	if args.dst <> nullptr {
@@ -147,7 +366,9 @@ pub func decoder.decode_frame?(dst: ptr base.pixel_buffer, src: base.io_reader,
 }
 
 pri func decoder.do_decode_frame?(dst: ptr base.pixel_buffer, src: base.io_reader, blend: base.pixel_blend, workbuf: slice base.u8, opts: nptr base.decode_frame_options) {
-	var status : base.status
+	var status    : base.status
+	var remaining : base.u32  // Partition 0 bytes still to be copied out of src.
+	var off       : base.u64  // Write position in workbuf for the partition 0 copy.
 
 	if this.call_sequence == 0x40 {
 		// No-op.
@@ -157,9 +378,105 @@ pri func decoder.do_decode_frame?(dst: ptr base.pixel_buffer, src: base.io_reade
 		return base."@end of data"
 	}
 
-	this.dst_x = 0
-	this.dst_y = 0
+	if this.workbuf_offset_v_end > args.workbuf.length() {  // workbuf must hold at least the Y, U and V planes.
+		return base."#bad workbuf length"
+	}
 
+	choose idct_add = [  // Each "choose" lists ARM NEON / x86 SSE4.2 / x86 AVX2 candidates; presumably picked by CPU support - confirm against the Wuffs docs.
+		idct_add_arm_neon,
+		idct_add_x86_sse42]
+	choose idct_dc_add = [
+		idct_dc_add_arm_neon,
+		idct_dc_add_x86_sse42]
+	choose idct_add_pair = [
+		idct_add_pair_x86_avx2]
+	choose idct_dc_add_pair = [
+		idct_dc_add_pair_x86_avx2]
+	choose predict_16x16 = [
+		predict_16x16_arm_neon,
+		predict_16x16_x86_sse42]
+	choose predict_8x8 = [
+		predict_8x8_arm_neon,
+		predict_8x8_x86_sse42]
+	choose simple_vfilter_16 = [
+		simple_vfilter_16_arm_neon,
+		simple_vfilter_16_x86_sse42]
+	choose normal_vfilter_inner_16 = [
+		normal_vfilter_inner_16_arm_neon,
+		normal_vfilter_inner_16_x86_sse42]
+	choose normal_vfilter_mb_16 = [
+		normal_vfilter_mb_16_arm_neon,
+		normal_vfilter_mb_16_x86_sse42]
+	choose normal_vfilter_mb_8 = [
+		normal_vfilter_mb_8_arm_neon,
+		normal_vfilter_mb_8_x86_sse42]
+	choose normal_hfilter_mb_16 = [
+		normal_hfilter_mb_16_arm_neon,
+		normal_hfilter_mb_16_x86_sse42]
+	choose normal_hfilter_mb_8 = [
+		normal_hfilter_mb_8_arm_neon,
+		normal_hfilter_mb_8_x86_sse42]
+	choose normal_hfilter_inner_16 = [
+		normal_hfilter_inner_16_arm_neon,
+		normal_hfilter_inner_16_x86_sse42]
+	choose normal_hfilter_inner_8 = [
+		normal_hfilter_inner_8_arm_neon,
+		normal_hfilter_inner_8_x86_sse42]
+	choose normal_vfilter_inner_8 = [
+		normal_vfilter_inner_8_arm_neon,
+		normal_vfilter_inner_8_x86_sse42]
+	choose normal_vfilter_mb_uv = [
+		normal_vfilter_mb_uv_x86_avx2]
+	choose normal_hfilter_mb_uv = [
+		normal_hfilter_mb_uv_x86_avx2]
+	choose normal_vfilter_inner_uv = [
+		normal_vfilter_inner_uv_x86_avx2]
+	choose normal_hfilter_inner_uv = [
+		normal_hfilter_inner_uv_x86_avx2]
+
+	// Ensure mb_coeffs is zeroed. IDCT/WHT rely on unwritten positions being 0.
+	// With LEAVE_INTERNAL_BUFFERS_UNINITIALIZED, mb_coeffs starts as garbage.
+	this.init_mb_coeffs!()
+
+	// Reset coeff_probs to the DEFAULT_COEFF_PROBS table before parsing the frame.
+	this.init_coeff_probs!()
+	// Copy partition 0 data from src into workbuf[v_end..] (just after the V
+	// plane) so that the boolean decoder can be refilled during MB mode decoding.
+	this.p0_wbuf_ri = 0
+	this.p0_wbuf_count = 0
+	off = this.workbuf_offset_v_end
+	remaining = this.partition0_size
+	while (remaining > 0) and (args.src.length() > 0) {  // NOTE(review): ends early, without suspending, if src runs dry before partition0_size bytes are copied - confirm intended.
+		if off < args.workbuf.length() {  // Bytes that would land past the end of workbuf are consumed but not stored.
+			args.workbuf[off] = args.src.peek_u8()
+		}
+		args.src.skip_u32_fast!(actual: 1, worst_case: 1)
+		off ~mod+= 1
+		remaining ~mod-= 1
+		this.p0_wbuf_count ~mod+= 1  // Counts bytes consumed from src, whether or not they were stored.
+	}
+
+	// Parse frame header from workbuf-stored partition 0 data.
+	this.decode_partition0!(workbuf: args.workbuf)
+
+	// Precompute filter strengths per (segment, mode) to avoid per-MB work.
+	this.precompute_filter_strengths!()
+
+	// The V-MB filter for row N modifies pixels in row N-1 (up to 3 rows in
+	// both Y and UV planes). We must hold back those rows from output until
+	// row N's filter has been applied. The number of extra luma rows:
+	//   Simple filter (Y only): filter2 modifies 1 Y row, round up to 2 (even)
+	//   Normal filter (Y+UV): filter246 modifies 3 UV rows = 6 luma rows
+	//   No filter: 0
+	if this.filter_level == 0 {
+		this.filter_extra_rows = 0  // No filter.
+	} else if this.filter_type == 1 {
+		this.filter_extra_rows = 2  // Simple filter case from the list above.
+	} else {
+		this.filter_extra_rows = 6  // Normal filter case from the list above.
+	}
+
+	// Prepare the swizzler once; decode_frame_mb! below emits output row by row.
 	status = this.swizzler.prepare!(
 		dst_pixfmt: args.dst.pixel_format(),
 		dst_palette: args.dst.palette(),
@@ -170,8 +487,8 @@ pri func decoder.do_decode_frame?(dst: ptr base.pixel_buffer, src: base.io_reade
 		return status
 	}
 
-	// TODO: actually decode the pixels.
-	status = this.make_a_placeholder_gradient!(dst: args.dst)
+	// Decode all macroblocks with per-row filtering and output.
+	status = this.decode_frame_mb!(src: args.src, dst: args.dst, workbuf: args.workbuf)
 	if not status.is_ok() {
 		return status
 	}
@@ -179,58 +496,145 @@ pri func decoder.do_decode_frame?(dst: ptr base.pixel_buffer, src: base.io_reade
 	this.call_sequence = 0x60
 }
 
-pri func decoder.make_a_placeholder_gradient!(dst: ptr base.pixel_buffer) base.status {
-	var dst_pixfmt          : base.pixel_format
-	var dst_bits_per_pixel  : base.u32[..= 256]
-	var dst_bytes_per_pixel : base.u32[..= 32]
-	var dst_bytes_per_row   : base.u64
-	var tab                 : table base.u8
-	var dst                 : slice base.u8
-	var i                   : base.u64
-	var bgrx                : array[4] base.u8
-
-	// TODO: the dst_pixfmt variable shouldn't be necessary. We should be able
-	// to chain the two calls: "args.dst.pixel_format().bits_per_pixel()".
-	dst_pixfmt = args.dst.pixel_format()
-	dst_bits_per_pixel = dst_pixfmt.bits_per_pixel()
-	if (dst_bits_per_pixel & 7) <> 0 {
-		return base."#unsupported option"
+pri func decoder.init_mb_coeffs!() {  // Sets all 400 entries of this.mb_coeffs to zero.
+	var i : base.u32
+
+	i = 0
+	while i < 400 {
+		assert i < 400 via "a < b: a < c; c <= b"(c: 400)
+		this.mb_coeffs[i] = 0
+		i += 1
 	}
-	dst_bytes_per_pixel = dst_bits_per_pixel / 8
-	dst_bytes_per_row = (this.width * dst_bytes_per_pixel) as base.u64
-	tab = args.dst.plane(p: 0)
-
-	bgrx[0] = 0x80
-
-	while this.dst_y < this.height {
-		assert this.dst_y < 0x3FFF via "a < b: a < c; c <= b"(c: this.height)
-		bgrx[1] = (this.dst_y & 0xFF) as base.u8
-
-		this.dst_x = 0
-		while this.dst_x < this.width,
-			inv this.dst_y < 0x3FFF,
-		{
-			assert this.dst_x < 0x3FFF via "a < b: a < c; c <= b"(c: this.width)
-			bgrx[2] = (this.dst_x & 0xFF) as base.u8
-
-			dst = tab.row_u32(y: this.dst_y)
-			if dst_bytes_per_row < dst.length() {
-				dst = dst[.. dst_bytes_per_row]
-			}
-			i = (this.dst_x as base.u64) * (dst_bytes_per_pixel as base.u64)
-			if i < dst.length() {
-				this.swizzler.swizzle_interleaved_from_slice!(
-					dst: dst[i ..],
-					dst_palette: args.dst.palette(),
-					src: bgrx[.. 4])
-			}
-
-			this.dst_x += 1
-		}
-		this.dst_y += 1
+}
+
+pri func decoder.init_coeff_probs!() {  // Copies the 1056 DEFAULT_COEFF_PROBS entries into this.coeff_probs.
+	var i : base.u32
+
+	i = 0
+	while i < 1056 {
+		assert i < 1056 via "a < b: a < c; c <= b"(c: 1056)
+		this.coeff_probs[i] = DEFAULT_COEFF_PROBS[i]
+		i += 1
 	}
+}
 
-	return ok
+// swizzle_mb_row! converts one macroblock (MB) row of YCbCr data from workbuf
+// to the output pixel format. The swizzler must already have been prepared.
+//
+// The V-MB filter for row N modifies pixels in row N-1, so we hold back
+// filter_extra_rows from the bottom of each non-last MB row and include them
+// in the next row's output. This matches libwebp's FinishRow behavior.
+pri func decoder.swizzle_mb_row!(dst: ptr base.pixel_buffer, workbuf: slice base.u8, mby: base.u32[..= 0x3FF], is_last: base.bool) base.status {
+	var status   : base.status
+	var src0     : slice base.u8  // Y plane data, starting at row y_min.
+	var src1     : slice base.u8  // U plane data, starting at row y_min / 2.
+	var src2     : slice base.u8  // V plane data, starting at row y_min / 2.
+	var src3     : slice base.u8  // Unused fourth plane; always the empty slice.
+	var y_width  : base.u32
+	var uv_width : base.u32
+	var y_min    : base.u32
+	var y_max    : base.u32
+	var y_off    : base.u64
+	var uv_off   : base.u64
+	var u_start  : base.u64
+	var v_start  : base.u64
+	var rem_y_h  : base.u32
+	var rem_uv_h : base.u32
+
+	if this.workbuf_offset_v_end > args.workbuf.length() {  // The three checks below establish y_end <= u_end <= v_end <= workbuf length.
+		return base."#bad workbuf length"
+	}
+	if this.workbuf_offset_y_end > this.workbuf_offset_u_end {
+		return base."#bad workbuf length"
+	}
+	if this.workbuf_offset_u_end > this.workbuf_offset_v_end {
+		return base."#bad workbuf length"
+	}
+
+	y_width = this.mb_width * 16  // Same expressions that do_decode_image_config? uses for y_stride / uv_stride.
+	uv_width = this.mb_width * 8
+
+	// Compute the half-open output row range [y_min, y_max), shifted to account
+	// for rows held back by the loop filter.
+	y_min = args.mby * 16
+	if args.mby > 0 {
+		// Also emit the rows that the previous MB row deferred.
+		y_min ~sat-= this.filter_extra_rows
+	}
+	y_max = ((args.mby as base.u32) + 1) * 16
+	if not args.is_last {
+		// Defer bottom rows until next MB row's V-MB filter is applied.
+		y_max ~sat-= this.filter_extra_rows
+	}
+	y_max = y_max.min(no_more_than: this.height)  // Never emit rows below the image height.
+
+	if y_min >= y_max {  // Nothing to output for this MB row.
+		return ok
+	}
+
+	// Byte offsets of row y_min within the Y plane and of row y_min / 2 within
+	// each chroma plane; the source sub-slices below start there.
+	y_off = (y_min as base.u64) * (this.y_stride as base.u64)
+	uv_off = ((y_min / 2) as base.u64) * (this.uv_stride as base.u64)
+
+	assert this.workbuf_offset_u_end <= args.workbuf.length() via "a <= b: a <= c; c <= b"(
+		c: this.workbuf_offset_v_end)
+	assert this.workbuf_offset_y_end <= args.workbuf.length() via "a <= b: a <= c; c <= b"(
+		c: this.workbuf_offset_u_end)
+	if y_off <= this.workbuf_offset_y_end {  // Otherwise src0 keeps its initial value (presumably an empty slice - confirm).
+		src0 = args.workbuf[y_off .. this.workbuf_offset_y_end]
+	}
+
+	u_start = this.workbuf_offset_y_end ~sat+ uv_off  // The U plane begins where the Y plane ends.
+	if u_start <= this.workbuf_offset_u_end {
+		src1 = args.workbuf[u_start .. this.workbuf_offset_u_end]
+	}
+
+	v_start = this.workbuf_offset_u_end ~sat+ uv_off  // The V plane begins where the U plane ends.
+	if v_start <= this.workbuf_offset_v_end {
+		src2 = args.workbuf[v_start .. this.workbuf_offset_v_end]
+	}
+
+	src3 = this.util.empty_slice_u8()
+
+	// Plane heights left from y_min (luma) and y_min / 2 (chroma) to the plane ends.
+	rem_y_h = (this.mb_height * 16) ~sat- y_min
+	rem_uv_h = (this.mb_height * 8) ~sat- (y_min / 2)
+
+	status = this.swizzler.swizzle_ycck!(
+		dst: args.dst,
+		dst_palette: args.dst.palette(),
+		x_min_incl: 0,
+		x_max_excl: this.width,
+		y_min_incl: y_min,
+		y_max_excl: y_max,
+		src0: src0,
+		src1: src1,
+		src2: src2,
+		src3: src3,
+		width0: y_width,
+		width1: uv_width,
+		width2: uv_width,
+		width3: 0,
+		height0: rem_y_h,
+		height1: rem_uv_h,
+		height2: rem_uv_h,
+		height3: 0,
+		stride0: y_width,
+		stride1: uv_width,
+		stride2: uv_width,
+		stride3: 0,
+		h0: 2,  // h*/v* are presumably per-plane sampling factors (luma at 2x chroma in both directions) - confirm against swizzle_ycck!.
+		h1: 1,
+		h2: 1,
+		h3: 0,
+		v0: 2,
+		v1: 1,
+		v2: 1,
+		v3: 0,
+		is_rgb_or_cmyk: false,
+		triangle_filter_for_2to1: false,
+		src_is_bt601: true,
+		scratch_buffer_2k: this.scratch_buffer_2k[..])
+	return status
 }
 
 pub func decoder.frame_dirty_rect() base.rect_ie_u32 {
@@ -279,5 +683,14 @@ pub func decoder.tell_me_more?(dst: base.io_writer, minfo: nptr base.more_inform
 }
 
 pub func decoder.workbuf_len() base.range_ii_u64 {
-	return this.util.make_range_ii_u64(min_incl: 0, max_incl: 0)
+	var total : base.u64
+	total = this.workbuf_offset_v_end ~sat+ (this.partition0_size as base.u64)  // Y, U and V planes plus room for the partition 0 copy (saturating add).
+	return this.util.make_range_ii_u64(
+		min_incl: total,
+		max_incl: total)  // min == max: exactly one workbuf length is reported.
+}
+
+// workbuf_len_total returns the VP8 workbuf requirement as a plain u64.
+pub func decoder.workbuf_len_total() base.u64 {
+	return this.workbuf_offset_v_end ~sat+ (this.partition0_size as base.u64)  // Same sum that workbuf_len() reports as both its min and max.
 }
diff --git a/std/webp/decode_huffman.wuffs b/std/webp/decode_huffman.wuffs
index da7a15c4c..680941073 100644
--- a/std/webp/decode_huffman.wuffs
+++ b/std/webp/decode_huffman.wuffs
@@ -8,25 +8,118 @@
 //
 // SPDX-License-Identifier: Apache-2.0 OR MIT
 
-pri func decoder.decode_huffman_groups?(src: base.io_reader, n_huffman_groups: base.u32[..= 256]) {
-	var hg : base.u32
-	var ht : base.u32
-
-	hg = 0
-	while hg < args.n_huffman_groups {
-		assert hg < 256 via "a < b: a < c; c <= b"(c: args.n_huffman_groups)
-		ht = 0
-		while ht < 5,
-			inv hg < 256,
+pri func decoder.decode_huffman_groups?(src: base.io_reader, n_huffman_groups: base.u32[..= 1024], n_bitstream_groups: base.u32[..= 0x1_0000]) {
+	var hg         : base.u32
+	var ht         : base.u32
+	var target     : base.u32[..= 1024]  // Table slot for the current bitstream group; 1024 is the scratch slot.
+	var sorted_idx : base.u32[..= 1024]  // Cursor into this.hg_sorted.
+	var raw_hg     : base.u32  // Index of the current group as it appears in the bitstream.
+
+	var red_entry   : base.u32
+	var blue_entry  : base.u32
+	var alpha_entry : base.u32
+	var green_entry : base.u32
+
+	if args.n_bitstream_groups <= args.n_huffman_groups {
+		// Non-compacted: decode n_huffman_groups groups of 5 trees each, in order.
+		hg = 0
+		while hg < args.n_huffman_groups {
+			assert hg < 1024 via "a < b: a < c; c <= b"(c: args.n_huffman_groups)
+			this.ht_next_top = 1280  // 5 * 256: secondary tables start after the five 256-entry primary tables.
+			ht = 0
+			while ht < 5,
+				inv hg < 1024,
+			{
+				this.decode_huffman_tree?(src: args.src, hg: hg, ht: ht)
+				ht += 1
+			}
+
+			red_entry = this.huffman_tables[hg][256]  // First primary-table entry of tree 1 (tree ht starts at ht * 256).
+			blue_entry = this.huffman_tables[hg][512]  // First primary-table entry of tree 2.
+			alpha_entry = this.huffman_tables[hg][768]  // First primary-table entry of tree 3.
+			if ((red_entry & 0x8000_000F) == 0x8000_0000) and
+				((blue_entry & 0x8000_000F) == 0x8000_0000) and
+				((alpha_entry & 0x8000_000F) == 0x8000_0000) {  // Bit 31 set with a zero low nibble: a leaf that consumes 0 bits, i.e. a single-symbol tree.
+				this.hg_literal_arb[hg] =
+					(((alpha_entry >> 8) & 0xFF) << 24) |
+					(((red_entry >> 8) & 0xFF) << 16) |
+					((blue_entry >> 8) & 0xFF)  // Pre-packed alpha (bits 24..31), red (16..23) and blue (0..7).
+				green_entry = this.huffman_tables[hg][0]
+				if ((green_entry & 0x8000_000F) == 0x8000_0000) and
+					(((green_entry >> 8) & 0xFFFF) < 0x100) {
+					this.hg_trivial[hg] = 2  // Green is also a single literal: the whole pixel is constant.
+					this.hg_literal_arb[hg] |= ((green_entry >> 8) & 0xFF) << 8
+				} else {
+					this.hg_trivial[hg] = 1  // Red, blue and alpha are constant; green still has to be decoded.
+				}
+			} else {
+				this.hg_trivial[hg] = 0  // No shortcut: every channel needs a Huffman decode.
+			}
+
+			hg += 1
+		}
+	} else {
+		// Compacted: the bitstream contains n_bitstream_groups sets of
+		// Huffman trees, but only n_huffman_groups are actually used.
+		// Merge-iterate with the sorted list of used indices (this.hg_sorted);
+		// unused trees are decoded into the scratch slot at index 1024.
+		sorted_idx = 0
+		raw_hg = 0
+		while raw_hg < args.n_bitstream_groups,
+			inv sorted_idx <= 1024,
 		{
-			this.decode_huffman_tree?(src: args.src, hg: hg, ht: ht)
-			ht += 1
+			// Pick the destination slot: the next compacted index if raw_hg is used, else the scratch slot.
+			if (sorted_idx < this.hg_n_sorted) and (sorted_idx < 1024) {
+				if (this.hg_sorted[sorted_idx] as base.u32) == raw_hg {  // Presumably hg_sorted is ascending, so one forward pass suffices - verify at the call site.
+					target = sorted_idx
+					sorted_idx += 1
+				} else {
+					target = 1024
+				}
+			} else {
+				target = 1024
+			}
+
+			this.ht_next_top = 1280  // 5 * 256, as in the non-compacted branch.
+			ht = 0
+			while ht < 5,
+				inv target <= 1024,
+				inv sorted_idx <= 1024,
+			{
+				this.decode_huffman_tree?(src: args.src, hg: target, ht: ht)
+				ht += 1
+			}
+
+			if target < 1024 {  // Same trivial-group detection as the non-compacted branch; skipped for the scratch slot.
+				red_entry = this.huffman_tables[target][256]
+				blue_entry = this.huffman_tables[target][512]
+				alpha_entry = this.huffman_tables[target][768]
+				if ((red_entry & 0x8000_000F) == 0x8000_0000) and
+					((blue_entry & 0x8000_000F) == 0x8000_0000) and
+					((alpha_entry & 0x8000_000F) == 0x8000_0000) {
+					this.hg_literal_arb[target] =
+						(((alpha_entry >> 8) & 0xFF) << 24) |
+						(((red_entry >> 8) & 0xFF) << 16) |
+						((blue_entry >> 8) & 0xFF)
+					green_entry = this.huffman_tables[target][0]
+					if ((green_entry & 0x8000_000F) == 0x8000_0000) and
+						(((green_entry >> 8) & 0xFFFF) < 0x100) {
+						this.hg_trivial[target] = 2
+						this.hg_literal_arb[target] |= ((green_entry >> 8) & 0xFF) << 8
+					} else {
+						this.hg_trivial[target] = 1
+					}
+				} else {
+					this.hg_trivial[target] = 0
+				}
+			}
+
+			raw_hg ~mod+= 1
 		}
-		hg += 1
 	}
 }
 
-pri func decoder.decode_huffman_tree?(src: base.io_reader, hg: base.u32[..= 255], ht: base.u32[..= 4]) {
+pri func decoder.decode_huffman_tree?(src: base.io_reader, hg: base.u32[..= 1024], ht: base.u32[..= 4]) {
 	var c8         : base.u8
 	var use_simple : base.u32[..= 1]
 	var status     : base.status
@@ -64,20 +157,21 @@ pri func decoder.decode_huffman_tree?(src: base.io_reader, hg: base.u32[..= 255]
 
 		this.build_code_lengths?(src: args.src)
 
-		status = this.build_huffman_nodes!(hg: args.hg, ht: args.ht)
+		status = this.build_huffman_table!(hg: args.hg, ht: args.ht)
 		if not status.is_ok() {
 			return status
 		}
 	}
 }
 
-pri func decoder.decode_huffman_tree_simple?(src: base.io_reader, hg: base.u32[..= 255], ht: base.u32[..= 4]) {
+pri func decoder.decode_huffman_tree_simple?(src: base.io_reader, hg: base.u32[..= 1024], ht: base.u32[..= 4]) {
 	var c8                  : base.u8
 	var use_second_symbol   : base.u32[..= 1]
 	var first_symbol_n_bits : base.u32[..= 8]
 	var symbol0             : base.u32[..= 0xFF]
 	var symbol1             : base.u32[..= 0xFF]
-	var base_offset         : base.u32[..= 0x064C]
+	var base_offset         : base.u32[..= 3840]
+	var i                   : base.u32
 
 	if this.n_bits < 2 {
 		c8 = args.src.read_u8?()
@@ -109,7 +203,8 @@ pri func decoder.decode_huffman_tree_simple?(src: base.io_reader, hg: base.u32[.
 	this.bits >>= first_symbol_n_bits
 	this.n_bits -= first_symbol_n_bits
 
-	base_offset = HUFFMAN_TABLE_BASE_OFFSETS[args.ht] as base.u32
+	base_offset = args.ht * 256  // The primary table for tree ht starts at ht * 256.
+	this.huffman_table_base_offsets[args.hg][args.ht] = base_offset as base.u16
 
 	if use_second_symbol <> 0 {
 		if this.n_bits < 8 {
@@ -125,13 +220,27 @@ pri func decoder.decode_huffman_tree_simple?(src: base.io_reader, hg: base.u32[.
 		this.bits >>= 8
 		this.n_bits -= 8
 
-		this.huffman_nodes[args.hg][base_offset + 0] = (base_offset + 1) as base.u16
-		this.huffman_nodes[args.hg][base_offset + 1] = (symbol0 | 0x8000) as base.u16
-		this.huffman_nodes[args.hg][base_offset + 2] = (symbol1 | 0x8000) as base.u16
+		// Two symbols with 1-bit codes, written as leaf entries that consume 1 bit.
+		// symbol0 at code 0 fills even indices, symbol1 at code 1 fills odd.
+		i = 0
+		while i < 256 {
+			this.huffman_tables[args.hg][(base_offset ~mod+ i) & 0xFFF] =
+				0x8000_0000 | (symbol0 << 8) | 1
+			this.huffman_tables[args.hg][((base_offset ~mod+ i) ~mod+ 1) & 0xFFF] =
+				0x8000_0000 | (symbol1 << 8) | 1
+			i += 2
+		}
 	} else {
-		this.huffman_nodes[args.hg][base_offset] = (symbol0 | 0x8000) as base.u16
+		// Single symbol, 0 bits consumed. Fill all 256 primary entries with it.
+		i = 0
+		while i < 256 {
+			this.huffman_tables[args.hg][(base_offset ~mod+ i) & 0xFFF] =
+				0x8000_0000 | (symbol0 << 8) | 0
+			i += 1
+		}
 	}
+
 }
 
 pri func decoder.decode_code_length_code_lengths?(src: base.io_reader) {
@@ -270,102 +379,278 @@ pri func decoder.build_code_lengths_huffman_nodes!() base.status {
 	return ok
 }
 
-pri func decoder.build_huffman_nodes!(hg: base.u32[..= 255], ht: base.u32[..= 4]) base.status {
-	var base_offset : base.u32[..= 0x064C]
-
-	var code_bits : base.u32
-	var code_len  : base.u32[..= 15]
-	var symbol    : base.u32[..= 2328]
-
-	var histogram        : array[16] base.u32
-	var n_used_symbols   : base.u32
-	var last_used_symbol : base.u32[..= 2328]
-
-	var subscription_weight : base.u32[..= 0x8000]
-	var subscription_total  : base.u32
-	var curr_code           : base.u32
-	var next_codes          : array[17] base.u32
-
-	var n_branches : base.u32
-	var h          : base.u32[..= 0x187A]
-	var children   : base.u32
-	var node       : base.u16
-
-	base_offset = HUFFMAN_TABLE_BASE_OFFSETS[args.ht] as base.u32
-
-	symbol = 0
-	while symbol < this.ht_n_symbols {
-		assert symbol < 2328 via "a < b: a < c; c <= b"(c: this.ht_n_symbols)
-		code_len = (this.code_lengths[symbol] & 15) as base.u32
-		if code_len <> 0 {
-			histogram[code_len] ~mod+= 1
+// build_huffman_table! builds a two-level Huffman lookup table, modeled on
+// Deflate's init_huff!. Primary table: 8-bit index (256 entries). Secondary
+// tables handle codes longer than 8 bits via redirect entries.
+pri func decoder.build_huffman_table!(hg: base.u32[..= 1024], ht: base.u32[..= 4]) base.status {
+	var base_offset       : base.u32[..= 3840]
+	var i                 : base.u32[..= 2328]
+	var n_symbols         : base.u32[..= 2328]
+	var count             : base.u32[..= 2328]
+	var n_used_symbols    : base.u32
+	var last_used_symbol  : base.u32[..= 2328]
+	var remaining         : base.u32
+	var min_cl            : base.u32[..= 9]
+	var max_cl            : base.u32[..= 15]
+	var initial_high_bits : base.u32
+	var prev_cl           : base.u32[..= 15]
+	var prev_redirect_key : base.u32
+	var top               : base.u32[..= 4096]
+	var next_top          : base.u32[..= 4096]
+	var code              : base.u32
+	var key               : base.u32
+	var value             : base.u32
+	var cl                : base.u32[..= 15]
+	var redirect_key      : base.u32[..= 255]
+	var j                 : base.u32[..= 16]
+	var reversed_key      : base.u32[..= 255]
+	var symbol            : base.u32[..= 2328]
+	var high_bits         : base.u32
+	var delta             : base.u32
+
+	var counts  : array[16] base.u16[..= 2328]  // Number of symbols per code length; index 0 collects the unused symbols.
+	var offsets : array[16] base.u16[..= 2328]  // Next write position in symbols[] for each code length.
+	var symbols : array[2328] base.u16  // Used symbols, grouped by increasing code length.
+
+	// The primary table for tree args.ht sits at a fixed offset: args.ht * 256.
+	base_offset = args.ht * 256
+	this.huffman_table_base_offsets[args.hg][args.ht] = base_offset as base.u16
+
+	// Count symbols per code length and remember the last used (non-zero length) symbol.
+	i = 0
+	while i < this.ht_n_symbols {
+		assert i < 2328 via "a < b: a < c; c <= b"(c: this.ht_n_symbols)
+		if counts[this.code_lengths[i] & 15] >= 2328 {
+			return "#internal error: inconsistent Huffman decoder state"
+		}
+		counts[this.code_lengths[i] & 15] += 1
+		if (this.code_lengths[i] & 15) <> 0 {
 			n_used_symbols ~mod+= 1
-			last_used_symbol = symbol
+			last_used_symbol = i
 		}
-		symbol += 1
+		i += 1
 	}
 
 	if n_used_symbols < 1 {
 		return "#bad Huffman code"
 	} else if n_used_symbols == 1 {
-		this.huffman_nodes[args.hg][base_offset] = (last_used_symbol | 0x8000) as base.u16
+		// Single symbol: fill all 256 primary entries with a leaf that consumes 0 bits.
+		i = 0
+		while i < 256 {
+			this.huffman_tables[args.hg][(base_offset ~mod+ i) & 0xFFF] =
+				0x8000_0000 | (last_used_symbol << 8)
+			i += 1
+		}
 		return ok
 	}
 
-	subscription_weight = 1 << 14
-	code_len = 1
-	while true {
-		curr_code = (curr_code ~mod+ histogram[code_len]) ~mod<< 1
-		next_codes[code_len + 1] = curr_code
-		subscription_total ~mod+= subscription_weight ~mod* histogram[code_len]
-		subscription_weight >>= 1
-		if code_len >= 15 {
+	// Check that the code is complete: "remaining" counts the unassigned codes at each length and must end at 0.
+	remaining = 1
+	i = 1
+	while i <= 15 {
+		if remaining > (1 << 30) {
+			return "#internal error: inconsistent Huffman decoder state"
+		}
+		remaining <<= 1
+		if remaining < (counts[i] as base.u32) {
+			return "#bad Huffman code (over-subscribed)"
+		}
+		remaining -= counts[i] as base.u32
+		i += 1
+	}
+	if remaining <> 0 {
+		return "#bad Huffman code (under-subscribed)"
+	}
+
+	// Turn counts into starting offsets (a prefix sum); n_symbols ends as the number of used symbols.
+	i = 1
+	while i <= 15 {
+		offsets[i] = n_symbols as base.u16
+		count = counts[i] as base.u32
+		if n_symbols > (2328 - count) {
+			return "#internal error: inconsistent Huffman decoder state"
+		}
+		assert (n_symbols + count) <= 2328 via "(a + b) <= c: a <= (c - b)"()
+		n_symbols = n_symbols + count
+		i += 1
+	}
+	if n_symbols > 2328 {
+		return "#internal error: inconsistent Huffman decoder state"
+	}
+
+	// Place each used symbol into symbols[], grouped by code length, in increasing symbol order.
+	i = 0
+	while i < this.ht_n_symbols,
+		inv n_symbols <= 2328,
+	{
+		assert i < 2328 via "a < b: a < c; c <= b"(c: this.ht_n_symbols)
+		if (this.code_lengths[i] & 15) <> 0 {
+			if offsets[this.code_lengths[i] & 15] >= 2328 {
+				return "#internal error: inconsistent Huffman decoder state"
+			}
+			symbols[offsets[this.code_lengths[i] & 15]] = i as base.u16
+			offsets[this.code_lengths[i] & 15] += 1
+		}
+		i += 1
+	}
+
+	// Calculate min_cl and max_cl. NOTE(review): neither is read again below, so these loops only validate.
+	min_cl = 1
+	while true,
+		inv n_symbols <= 2328,
+	{
+		if counts[min_cl] <> 0 {
 			break
 		}
-		code_len += 1
+		if min_cl >= 9 {
+			return "#bad Huffman code"
+		}
+		min_cl += 1
+	}
+	max_cl = 15
+	while true,
+		inv n_symbols <= 2328,
+	{
+		if counts[max_cl] <> 0 {
+			break
+		}
+		if max_cl <= 1 {
+			return "#bad Huffman code"
+		}
+		max_cl -= 1
 	}
 
-	if subscription_total > (1 << 15) {
-		return "#bad Huffman code (over-subscribed)"
-	} else if subscription_total < (1 << 15) {
-		return "#bad Huffman code (under-subscribed)"
+	// Fill the primary and secondary tables.
+	//
+	// Primary table: 8-bit index (256 entries).
+	// Secondary tables: for codes > 8 bits, redirect entries in the primary
+	// table point to secondary sub-tables.
+	// Always fill all 256 primary table entries, regardless of max_cl.
+	// The decode loop uses a fixed 8-bit mask (bits & 0xFF) so all 256
+	// entries must be valid. For codes shorter than 8 bits, entries are
+	// replicated across all high-bit patterns.
+	initial_high_bits = 1 << 8
+	if (symbols[0] as base.u32) >= this.ht_n_symbols {
+		return "#internal error: inconsistent Huffman decoder state"
 	}
+	assert (symbols[0] as base.u32) < 2328 via "a < b: a < c; c <= b"(c: this.ht_n_symbols)
+	prev_cl = (this.code_lengths[symbols[0] as base.u32] & 15) as base.u32
+	prev_redirect_key = 0xFFFF_FFFF  // Sentinel: never equals an 8-bit redirect key, so the first long code opens a sub-table.
+	top = base_offset  // Short codes are written into the primary table.
+	next_top = this.ht_next_top  // First free slot for secondary sub-tables.
+	code = 0
+	key = 0
+	value = 0
+	i = 0
+	while true,
+		pre code < (1 << 15),
+		pre i < 2328,
+		inv n_symbols <= 2328,
+	{
+		if (symbols[i] as base.u32) >= this.ht_n_symbols {
+			return "#internal error: inconsistent Huffman decoder state"
+		}
+		assert (symbols[i] as base.u32) < 2328 via "a < b: a < c; c <= b"(c: this.ht_n_symbols)
+		cl = (this.code_lengths[symbols[i] as base.u32] & 15) as base.u32
+		if cl > prev_cl {
+			code <<= cl - prev_cl  // Moving to a longer code length: extend the running code with zero bits.
+			if code >= (1 << 15) {
+				return "#internal error: inconsistent Huffman decoder state"
+			}
+		}
+		prev_cl = cl
+
+		key = code
+		if cl > 8 {
+			cl -= 8  // From here on cl is the number of bits resolved by the secondary table.
+			assert cl <= 7
+
+			redirect_key = (key >> cl) & 0xFF  // The leading 8 bits of the code select the primary slot.
+			key = key.low_bits(n: cl)  // The remaining cl bits index into the sub-table.
+			if prev_redirect_key <> (redirect_key as base.u32) {
+				prev_redirect_key = redirect_key as base.u32
+
+				// Calculate the number of bits needed for the 2nd level table.
+				remaining = (1 as base.u32) << cl
+				j = prev_cl
+				while j <= 15,
+					inv cl <= 7,
+					inv code < (1 << 15),
+					inv i < 2328,
+					inv n_symbols <= 2328,
+				{
+					if remaining <= (counts[j] as base.u32) {
+						break
+					}
+					remaining -= counts[j] as base.u32
+					if remaining > (1 << 30) {
+						return "#internal error: inconsistent Huffman decoder state"
+					}
+					remaining <<= 1
+					j += 1
+				}
+				if (j <= 8) or (15 < j) {
+					return "#internal error: inconsistent Huffman decoder state"
+				}
+				j -= 8  // j is now the index width, in bits, of this sub-table.
+				initial_high_bits = (1 as base.u32) << j
 
-	this.huffman_nodes[args.hg][base_offset] = 0
-	symbol = 0
-	while symbol < this.ht_n_symbols {
-		assert symbol < 2328 via "a < b: a < c; c <= b"(c: this.ht_n_symbols)
-		code_len = (this.code_lengths[symbol] & 15) as base.u32
-		if code_len <> 0 {
-			code_bits = next_codes[code_len]
-			next_codes[code_len] ~mod+= 1
+				top = next_top
+				if (top + ((1 as base.u32) << j)) > 4096 {
+					return "#internal error: inconsistent Huffman decoder state"
+				}
+				assert (top + ((1 as base.u32) << j)) <= 4096 via "a <= b: a <= c; c <= b"(c: 4096)
+				next_top = top + ((1 as base.u32) << j)  // Reserve 1 << j entries for this sub-table.
 
-			// Insert {symbol, code_bits, code_len} into the node tree.
-			code_bits ~mod<<= 32 - code_len
-			h = base_offset
-			while code_len > 0,
-				inv symbol < 2328,
-			{
-				node = this.huffman_nodes[args.hg][h]
-				if node == 0 {
-					children = base_offset ~mod+ (1 ~mod+ (2 ~mod* n_branches))
-					children = children.min(no_more_than: 0x1879)
-					this.huffman_nodes[args.hg][h] = children as base.u16
-					this.huffman_nodes[args.hg][children + 0] = 0
-					this.huffman_nodes[args.hg][children + 1] = 0
-					h = children + (code_bits >> 31)
-					n_branches ~mod+= 1
-				} else {
-					children = node as base.u32
-					h = children.min(no_more_than: 0x1879) + (code_bits >> 31)
+				// Write the redirect entry into the primary table, at the bit-reversed key.
+				redirect_key = REVERSE8[redirect_key] as base.u32
+				if (base_offset + redirect_key) >= 4096 {
+					return "#internal error: inconsistent Huffman decoder state"
 				}
-				code_bits ~mod<<= 1
-				code_len -= 1
+				this.huffman_tables[args.hg][base_offset + redirect_key] =
+					0x1000_0008 | (top << 8) | (j << 4)  // Redirect: bit 31 clear, low nibble 8, index width j in bits 4..7, sub-table start from bit 8.
 			}
-			this.huffman_nodes[args.hg][h] = (symbol | 0x8000) as base.u16
 		}
-		symbol += 1
+		if (cl > 8) or (key >= (1 << 8)) or (counts[prev_cl] <= 0) {
+			return "#internal error: inconsistent Huffman decoder state"
+		}
+		counts[prev_cl] -= 1
+
+		reversed_key = ((REVERSE8[key & 0xFF] as base.u32) >> (8 - cl)) & 0xFF  // Bit-reverse the low cl bits of key.
+
+		if (symbols[i] as base.u32) >= 2328 {
+			return "#internal error: inconsistent Huffman decoder state"
+		}
+		symbol = symbols[i] as base.u32
+		value = 0x8000_0000 | (symbol << 8) | cl  // Leaf: bit 31 set, symbol from bit 8, bits to consume in the low nibble.
+
+		// Write the leaf at every index whose low cl bits equal reversed_key.
+		high_bits = initial_high_bits
+		delta = (1 as base.u32) << cl
+		while high_bits >= delta,
+			inv code < (1 << 15),
+			inv i < 2328,
+			inv n_symbols <= 2328,
+		{
+			high_bits -= delta
+			if (top + ((high_bits | reversed_key) & 0xFF)) >= 4096 {
+				return "#internal error: inconsistent Huffman decoder state"
+			}
+			this.huffman_tables[args.hg][top + ((high_bits | reversed_key) & 0xFF)] = value
+		}
+
+		i += 1
+		if i >= n_symbols {
+			break
+		}
+		assert i < 2328 via "a < b: a < c; c <= b"(c: n_symbols)
+		code += 1
+		if code >= (1 << 15) {
+			return "#internal error: inconsistent Huffman decoder state"
+		}
 	}
+
+	this.ht_next_top = next_top  // Later trees in the same group place their sub-tables after this one.
 	return ok
 }
 
@@ -521,11 +806,38 @@ pri const REPEAT_N_BITS : roarray[4] base.u8[..= 7] = [2, 3, 7, 0]
 
 pri const REPEAT_COUNTS : roarray[4] base.u8[..= 11] = [3, 3, 11, 0]
 
-// See the decoder.huffman_nodes comment re five (5) Huffman trees.
-pri const HUFFMAN_TABLE_BASE_OFFSETS : roarray[5] base.u16[..= 0x064C] = [
-	0x064C,
-	0x0000,
-	0x01FF,
-	0x03FE,
-	0x05FD,
+// REVERSE8 reverses the bits in a byte.
+pri const REVERSE8 : roarray[256] base.u8 = [ + 0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, // 0x00 - 0x07 + 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0, // 0x08 - 0x0F + 0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, // 0x10 - 0x17 + 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8, // 0x18 - 0x1F + 0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, // 0x20 - 0x27 + 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, // 0x28 - 0x2F + 0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, // 0x30 - 0x37 + 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC, // 0x38 - 0x3F + 0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, // 0x40 - 0x47 + 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2, // 0x48 - 0x4F + 0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, // 0x50 - 0x57 + 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA, // 0x58 - 0x5F + 0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, // 0x60 - 0x67 + 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6, // 0x68 - 0x6F + 0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, // 0x70 - 0x77 + 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE, // 0x78 - 0x7F + 0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, // 0x80 - 0x87 + 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1, // 0x88 - 0x8F + 0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, // 0x90 - 0x97 + 0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9, // 0x98 - 0x9F + 0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, // 0xA0 - 0xA7 + 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5, // 0xA8 - 0xAF + 0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED, // 0xB0 - 0xB7 + 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD, // 0xB8 - 0xBF + 0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, // 0xC0 - 0xC7 + 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3, // 0xC8 - 0xCF + 0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, // 0xD0 - 0xD7 + 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB, // 0xD8 - 0xDF + 0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, // 0xE0 - 0xE7 + 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7, // 0xE8 - 0xEF + 
0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, // 0xF0 - 0xF7 + 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF, // 0xF8 - 0xFF ] diff --git a/std/webp/decode_pixels_fast.wuffs b/std/webp/decode_pixels_fast.wuffs new file mode 100644 index 000000000..ef30f1d75 --- /dev/null +++ b/std/webp/decode_pixels_fast.wuffs @@ -0,0 +1,371 @@ +// Copyright 2024 The Wuffs Authors. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. +// +// SPDX-License-Identifier: Apache-2.0 OR MIT + +// decode_pixels_fast is the non-coroutine fast path for VP8L pixel decoding. +// It uses a 64-bit bit accumulator with "Variant 4" bulk refill (peek_u64le + +// skip_u32_fast), requiring args.src.length() >= 16 to guarantee two refills +// per pixel. Falls back to decode_pixels_slow for the last few bytes. +// +// This mirrors std/deflate/decode_huffman_fast64.wuffs. + +pri func decoder.decode_pixels_fast!(dst: slice base.u8, src: base.io_reader, width: base.u32[..= 0x4000], height: base.u32[..= 0x4000], tile_data: roslice base.u8, tile_size_log2: base.u32[..= 9]) base.status { + var bits : base.u64 + var n_bits : base.u32 + + var p : base.u64 + var p_max : base.u64[..= 0x4000_0000] + + var tile_size_log2 : base.u32[..= 31] + var width_in_tiles : base.u32[..= 0x20FF] + + var x : base.u32 + var y : base.u32 + var i : base.u32 + + var hg : base.u32[..= 0x3FF] + var trivial : base.u8 + var table_entry : base.u32 + var table_entry_n_bits : base.u32[..= 15] + var redir_top : base.u32[..= 0xFFFF] + var redir_mask : base.u32[..= 0x7FFF] + + var pixel_g : base.u32[..= 0xFFFF] + var color : base.u32 + var back_ref_len_n_bits : base.u32[..= 11] + var back_ref_len_minus_1 : base.u32[..= 0x1FFF] + var back_ref_dist_n_bits : base.u32[..= 18] + var back_ref_dist_sym : base.u32[..= 0xFFFF] + var back_ref_dist_premap_minus_1 : base.u32[..= 0xF_FFFF] + var 
back_ref_dist_minus_1 : base.u32 + + var dm : base.u32[..= 0xFF] + var dx : base.u32 + var dy : base.u32 + + var p_end : base.u64[..= 0x4000_8000] + var dist4 : base.u64 + var q : base.u64 + + var tmask : base.u32 + var tile_x_end : base.u32 + + var color_cache_shift : base.u32[..= 31] + var color_cache_pixels : slice base.u8 + + // Load shared state. + bits = this.bits as base.u64 + n_bits = this.n_bits + p = this.pix_p + x = this.pix_x + y = this.pix_y + p_max = (4 * args.width * args.height) as base.u64 + if args.dst.length() < p_max { + return "#internal error: inconsistent dst buffer" + } + + // Pre-compute color cache shift for eager insertion. + color_cache_shift = (32 - this.color_cache_bits) & 31 + + if args.tile_size_log2 <> 0 { + tile_size_log2 = args.tile_size_log2 + width_in_tiles = (args.width + (((1 as base.u32) << tile_size_log2) - 1)) >> tile_size_log2 + } else { + tile_size_log2 = 31 + width_in_tiles = 1 + } + tmask = ((1 as base.u32) << tile_size_log2) - 1 + + while (p < p_max) and (args.src.length() >= 16) { + // Group index is (Red << 8) | Green in BGRA pixel layout. + i = ((((y >> tile_size_log2) ~mod* width_in_tiles) ~mod+ (x >> tile_size_log2)) ~mod* 4) ~mod+ 1 + if (i as base.u64) < args.tile_data.length() { + hg = args.tile_data[i as base.u64] as base.u32 + if ((i as base.u64) + 1) < args.tile_data.length() { + hg = (((args.tile_data[(i as base.u64) + 1] as base.u32) << 8) | hg) & 0x3FF + } + } + trivial = this.hg_trivial[hg] + + // Compute end of this tile row span. + tile_x_end = (x | tmask) ~mod+ 1 + if tile_x_end > args.width { + tile_x_end = args.width + } + + // Inner loop: process pixels within the same tile. + while (x < tile_x_end) and (p < p_max) and (args.src.length() >= 16) { + // Fast path: trivial code (all channels single-symbol). + // No Huffman decode needed, no bits consumed. + if trivial >= 2 { + color = this.hg_literal_arb[hg] + } else { + // Refill 1: Variant 4 bulk refill to >= 56 bits. 
+ bits |= args.src.peek_u64le() ~mod<< (n_bits & 63) + args.src.skip_u32_fast!(actual: (63 - (n_bits & 63)) >> 3, worst_case: 8) + n_bits |= 56 + + // Decode Green symbol. Primary table at fixed offset 0. + table_entry = this.huffman_tables[hg][((bits & 0xFF) as base.u32)] + table_entry_n_bits = table_entry & 0x0F + bits >>= table_entry_n_bits + n_bits ~mod-= table_entry_n_bits + if (table_entry >> 31) == 0 { + redir_top = (table_entry >> 8) & 0xFFFF + redir_mask = ((1 as base.u32) << ((table_entry >> 4) & 0x0F)) - 1 + table_entry = this.huffman_tables[hg][((redir_top + (((bits & 0xFFFF_FFFF) as base.u32) & redir_mask)) & 0xFFF)] + table_entry_n_bits = table_entry & 0x0F + bits >>= table_entry_n_bits + n_bits ~mod-= table_entry_n_bits + } + pixel_g = (table_entry >> 8) & 0xFFFF + + if pixel_g < 0x100 { + if trivial >= 1 { + // Trivial literal: R/B/A are single-symbol. + // Only green was decoded; skip Red, Blue, Alpha. + color = this.hg_literal_arb[hg] | (pixel_g << 8) + } else { + // Full literal: decode all 4 channels. + color = pixel_g << 8 + + // Decode Red. Primary table at fixed offset 256. + table_entry = this.huffman_tables[hg][(256 + ((bits & 0xFF) as base.u32))] + table_entry_n_bits = table_entry & 0x0F + bits >>= table_entry_n_bits + n_bits ~mod-= table_entry_n_bits + if (table_entry >> 31) == 0 { + redir_top = (table_entry >> 8) & 0xFFFF + redir_mask = ((1 as base.u32) << ((table_entry >> 4) & 0x0F)) - 1 + table_entry = this.huffman_tables[hg][((redir_top + (((bits & 0xFFFF_FFFF) as base.u32) & redir_mask)) & 0xFFF)] + table_entry_n_bits = table_entry & 0x0F + bits >>= table_entry_n_bits + n_bits ~mod-= table_entry_n_bits + } + color |= (((table_entry >> 8) & 0xFF) as base.u32) << 16 + + // Conditional refill 2: only refill if needed for Blue + Alpha. + if n_bits < 30 { + bits |= args.src.peek_u64le() ~mod<< (n_bits & 63) + args.src.skip_u32_fast!(actual: (63 - (n_bits & 63)) >> 3, worst_case: 8) + n_bits |= 56 + } + + // Decode Blue. 
Primary table at fixed offset 512. + table_entry = this.huffman_tables[hg][(512 + ((bits & 0xFF) as base.u32))] + table_entry_n_bits = table_entry & 0x0F + bits >>= table_entry_n_bits + n_bits ~mod-= table_entry_n_bits + if (table_entry >> 31) == 0 { + redir_top = (table_entry >> 8) & 0xFFFF + redir_mask = ((1 as base.u32) << ((table_entry >> 4) & 0x0F)) - 1 + table_entry = this.huffman_tables[hg][((redir_top + (((bits & 0xFFFF_FFFF) as base.u32) & redir_mask)) & 0xFFF)] + table_entry_n_bits = table_entry & 0x0F + bits >>= table_entry_n_bits + n_bits ~mod-= table_entry_n_bits + } + color |= (((table_entry >> 8) & 0xFF) as base.u32) << 0 + + // Decode Alpha. Primary table at fixed offset 768. + table_entry = this.huffman_tables[hg][(768 + ((bits & 0xFF) as base.u32))] + table_entry_n_bits = table_entry & 0x0F + bits >>= table_entry_n_bits + n_bits ~mod-= table_entry_n_bits + if (table_entry >> 31) == 0 { + redir_top = (table_entry >> 8) & 0xFFFF + redir_mask = ((1 as base.u32) << ((table_entry >> 4) & 0x0F)) - 1 + table_entry = this.huffman_tables[hg][((redir_top + (((bits & 0xFFFF_FFFF) as base.u32) & redir_mask)) & 0xFFF)] + table_entry_n_bits = table_entry & 0x0F + bits >>= table_entry_n_bits + n_bits ~mod-= table_entry_n_bits + } + color |= (((table_entry >> 8) & 0xFF) as base.u32) << 24 + } + + } else if pixel_g < 0x118 { + // === Back-reference === + // Decode length. + if pixel_g < 0x104 { + back_ref_len_minus_1 = pixel_g - 0x100 + } else { + back_ref_len_n_bits = (pixel_g - 0x102) >> 1 + back_ref_len_minus_1 = ((2 as base.u32) + (pixel_g & 1)) << back_ref_len_n_bits + assert back_ref_len_minus_1 <= 6144 + back_ref_len_minus_1 += (bits.low_bits(n: back_ref_len_n_bits) as base.u32) & 0x1FFF + bits >>= back_ref_len_n_bits + n_bits ~mod-= back_ref_len_n_bits + } + + // Conditional refill 2: distance symbol (15) + extra bits (18) = 33 max. 
+ if n_bits < 33 { + bits |= args.src.peek_u64le() ~mod<< (n_bits & 63) + args.src.skip_u32_fast!(actual: (63 - (n_bits & 63)) >> 3, worst_case: 8) + n_bits |= 56 + } + + // Decode distance symbol. Primary table at fixed offset 1024. + table_entry = this.huffman_tables[hg][(1024 + ((bits & 0xFF) as base.u32))] + table_entry_n_bits = table_entry & 0x0F + bits >>= table_entry_n_bits + n_bits ~mod-= table_entry_n_bits + if (table_entry >> 31) == 0 { + redir_top = (table_entry >> 8) & 0xFFFF + redir_mask = ((1 as base.u32) << ((table_entry >> 4) & 0x0F)) - 1 + table_entry = this.huffman_tables[hg][((redir_top + (((bits & 0xFFFF_FFFF) as base.u32) & redir_mask)) & 0xFFF)] + table_entry_n_bits = table_entry & 0x0F + bits >>= table_entry_n_bits + n_bits ~mod-= table_entry_n_bits + } + back_ref_dist_sym = (table_entry >> 8) & 0xFFFF + + // Decode distance extra bits. + if back_ref_dist_sym < 4 { + back_ref_dist_premap_minus_1 = back_ref_dist_sym + } else if back_ref_dist_sym < 40 { + back_ref_dist_n_bits = (back_ref_dist_sym - 2) >> 1 + back_ref_dist_premap_minus_1 = ((2 as base.u32) + (back_ref_dist_sym & 1)) << back_ref_dist_n_bits + assert back_ref_dist_premap_minus_1 <= 786432 + back_ref_dist_premap_minus_1 += (bits.low_bits(n: back_ref_dist_n_bits) as base.u32) & 0xF_FFFF + bits >>= back_ref_dist_n_bits + n_bits ~mod-= back_ref_dist_n_bits + } + + // Distance mapping. + if back_ref_dist_premap_minus_1 >= 120 { + back_ref_dist_minus_1 = back_ref_dist_premap_minus_1 - 120 + } else { + dm = DISTANCE_MAP[back_ref_dist_premap_minus_1] as base.u32 + dy = dm >> 4 + dx = 7 ~mod- (dm & 15) + back_ref_dist_minus_1 = (args.width * dy) ~mod+ dx + } + + // Apply back-reference. 
+ assert p < 0x4000_0000 via "a < b: a < c; c <= b"(c: p_max) + p_end = p + (((back_ref_len_minus_1 + 1) * 4) as base.u64) + dist4 = ((back_ref_dist_minus_1 as base.u64) * 4) + 4 + if (p_end > p_max) or (p_end > args.dst.length()) or (p < dist4) { + return "#bad back-reference" + } + q = p - dist4 + if p > p_end { + return "#internal error: inconsistent dst buffer" + } + if back_ref_dist_minus_1 >= back_ref_len_minus_1 { + // Non-overlapping back-ref (dist >= len): bulk copy. + if (q > p) or (p > args.dst.length()) { + return "#internal error: inconsistent dst buffer" + } + args.dst[p .. p_end].copy_from_slice!(s: args.dst[q .. p]) + // Cache insertion for copied pixels. + if color_cache_shift > 0 { + if p_end > args.dst.length() { + return "#internal error: inconsistent dst buffer" + } + iterate (color_cache_pixels = args.dst[p .. p_end])(length: 4, advance: 4, unroll: 4) { + color = color_cache_pixels.peek_u32le() + this.color_cache[((color ~mod* 0x1E35_A7BD) >> color_cache_shift) & 2047] = color + } + } + p = p_end + } else { + // Overlapping back-ref (dist < len): per-pixel forward loop. + while (q < p) and (p < p_end), + inv p_end <= args.dst.length(), + { + assert q < p_end via "a < b: a < c; c < b"(c: p) + assert p < 0x4000_8000 via "a < b: a < c; c <= b"(c: p_end) + assert q < 0x4000_8000 via "a < b: a < c; c <= b"(c: p_end) + if ((p + 4) <= p_end) and ((q + 4) <= p) { + assert p <= (p + 4) via "a <= (a + b): 0 <= b"() + assert (p + 4) <= args.dst.length() via "a <= b: a <= c; c <= b"(c: p_end) + assert q <= (q + 4) via "a <= (a + b): 0 <= b"() + assert (q + 4) <= p_end via "a <= b: a <= c; c <= b"(c: p) + assert (q + 4) <= args.dst.length() via "a <= b: a <= c; c <= b"(c: p_end) + color = args.dst[q .. (q + 4)].peek_u32le() + args.dst[p .. 
(p + 4)].poke_u32le!(a: color) + if color_cache_shift > 0 { + this.color_cache[((color ~mod* 0x1E35_A7BD) >> color_cache_shift) & 2047] = color + } + p += 4 + q += 4 + } else { + assert p < args.dst.length() via "a < b: a < c; c <= b"(c: p_end) + assert q < args.dst.length() via "a < b: a < c; c <= b"(c: p_end) + args.dst[p] = args.dst[q] + p += 1 + q += 1 + } + } + } + + // Update (x, y) for back-reference. + x ~mod+= back_ref_len_minus_1 + 1 + while x >= args.width { + x -= args.width + y ~mod+= 1 + } + break + + } else { + // === Color cache pixel === + color = this.color_cache[(pixel_g - 0x118) & 2047] + } + } + + // Write pixel (literals, trivial code, and color cache). + assert p < 0x4000_0000 via "a < b: a < c; c <= b"(c: p_max) + if (p + 4) > args.dst.length() { + return "#internal error: inconsistent dst buffer" + } + assert p <= (p + 4) via "a <= (a + b): 0 <= b"() + args.dst[p .. (p + 4)].poke_u32le!(a: color) + p += 4 + + // Eager color cache insertion (skip when no cache is used). + if color_cache_shift > 0 { + this.color_cache[((color ~mod* 0x1E35_A7BD) >> color_cache_shift) & 2047] = color + } + + // Update (x, y). + x ~mod+= 1 + if x == args.width { + x = 0 + y ~mod+= 1 + break + } + } + } + + // Unwind excess bytes from the accumulator so that n_bits < 8. + // This is safe because this function never suspends. + if n_bits > 63 { + return "#internal error: inconsistent n_bits" + } + while n_bits >= 8, + post n_bits < 8, + { + n_bits -= 8 + if args.src.can_undo_byte() { + args.src.undo_byte!() + } else { + return "#internal error: inconsistent I/O" + } + } + + // Store shared state back. 
+ this.bits = (bits & (((1 as base.u64) << n_bits) - 1)) as base.u32 + this.n_bits = n_bits + this.pix_p = p + this.pix_x = x + this.pix_y = y + this.pix_cc_p = p + + return ok +} diff --git a/std/webp/decode_pixels_slow.wuffs b/std/webp/decode_pixels_slow.wuffs index 668c30acc..c3cd8a055 100644 --- a/std/webp/decode_pixels_slow.wuffs +++ b/std/webp/decode_pixels_slow.wuffs @@ -21,18 +21,21 @@ pri func decoder.decode_pixels_slow?(dst: slice base.u8, src: base.io_reader, wi var y : base.u32 var i : base.u32 - var hg : base.u32[..= 0xFF] - var h : base.u32[..= 0x187A] - var node : base.u16 + var hg : base.u32[..= 0x3FF] + var ht_base : base.u32 + var table_entry : base.u32 + var table_entry_n_bits : base.u32[..= 15] + var redir_top : base.u32[..= 0xFFFF] + var redir_mask : base.u32[..= 0x7FFF] - var pixel_g : base.u32[..= 0x7FFF] + var pixel_g : base.u32[..= 0xFFFF] var color : base.u32 // u32 0xAARR_GGBB, non-premultiplied alpha. var dst_pixel : slice base.u8 var back_ref_len_n_bits : base.u32[..= 11] var back_ref_len_minus_1 : base.u32[..= 0x1FFF] // 0x1FFF = 8191. var back_ref_dist_n_bits : base.u32[..= 18] - var back_ref_dist_sym : base.u32[..= 0x7FFF] + var back_ref_dist_sym : base.u32[..= 0xFFFF] var back_ref_dist_premap_minus_1 : base.u32[..= 0xF_FFFF] // 0xF_FFFF = 1048575. var back_ref_dist_minus_1 : base.u32 @@ -53,6 +56,12 @@ pri func decoder.decode_pixels_slow?(dst: slice base.u8, src: base.io_reader, wi return "#internal error: inconsistent dst buffer" } + // Resume from shared pixel position (set by fast path or prior call). + p = this.pix_p + x = this.pix_x + y = this.pix_y + color_cache_p = this.pix_cc_p + if args.tile_size_log2 <> 0 { tile_size_log2 = args.tile_size_log2 width_in_tiles = (args.width + (((1 as base.u32) << tile_size_log2) - 1)) >> tile_size_log2 @@ -62,100 +71,165 @@ pri func decoder.decode_pixels_slow?(dst: slice base.u8, src: base.io_reader, wi } while p < p_max { - // The "~mod+ 1" selects the green pixel of the BGRA 4-byte group. 
+ // Group index is (Red << 8) | Green in BGRA pixel layout. + // "~mod* 4 ~mod+ 1" points to the Green byte of the tile pixel. i = ((((y >> tile_size_log2) ~mod* width_in_tiles) ~mod+ (x >> tile_size_log2)) ~mod* 4) ~mod+ 1 if (i as base.u64) < args.tile_data.length() { hg = args.tile_data[i as base.u64] as base.u32 + if ((i as base.u64) + 1) < args.tile_data.length() { + hg = (((args.tile_data[(i as base.u64) + 1] as base.u32) << 8) | hg) & 0x3FF + } } - // Decode the Green+etc symbol. - h = HUFFMAN_TABLE_BASE_OFFSETS[0] as base.u32 - while true, + // Decode the Green+etc symbol via table lookup. + // Best-effort refill for primary table lookup. Read when source + // has data available, but don't block at end of stream. + // NOTE(review): args.src.length() being 0 does not by itself mean end of + // stream. If the buffer is merely drained, this loop is skipped, the lookup + // below uses fewer than 8 valid bits and the "~mod-" update of this.n_bits + // wraps instead of suspending for more input. Confirm callers cannot hit this. + while (this.n_bits < 8) and (args.src.length() > 0), inv p < p_max, { - node = this.huffman_nodes[hg][h] - if node >= 0x8000 { - break - } else if node > 0x1879 { - return "#internal error: inconsistent Huffman code" + c8 = args.src.read_u8?() + if this.n_bits >= 8 { + return "#internal error: inconsistent n_bits" } - if this.n_bits < 1 { + this.bits |= (c8 as base.u32) << this.n_bits + this.n_bits += 8 + } + ht_base = this.huffman_table_base_offsets[hg][0] as base.u32 + table_entry = this.huffman_tables[hg][(ht_base ~mod+ (this.bits & 0xFF)) & 0xFFF] + table_entry_n_bits = table_entry & 0x0F + this.bits >>= table_entry_n_bits + this.n_bits = (this.n_bits ~mod- table_entry_n_bits) & 31 + if (table_entry >> 31) == 0 { + // Redirect to second-level table. Refill for secondary.
+ while (this.n_bits < 7) and (args.src.length() > 0), + inv p < p_max, + { c8 = args.src.read_u8?() - this.bits = (c8 as base.u32) - this.n_bits = 8 - assert this.n_bits >= 1 + if this.n_bits >= 7 { + return "#internal error: inconsistent n_bits" + } + this.bits |= (c8 as base.u32) << this.n_bits + this.n_bits += 8 } - h = (node as base.u32) + (this.bits & 1) - this.bits >>= 1 - this.n_bits -= 1 + redir_top = (table_entry >> 8) & 0xFFFF + redir_mask = ((1 as base.u32) << ((table_entry >> 4) & 0x0F)) - 1 + table_entry = this.huffman_tables[hg][(redir_top + (this.bits & redir_mask)) & 0xFFF] + table_entry_n_bits = table_entry & 0x0F + this.bits >>= table_entry_n_bits + this.n_bits = (this.n_bits ~mod- table_entry_n_bits) & 31 } - pixel_g = (node & 0x7FFF) as base.u32 + pixel_g = (table_entry >> 8) & 0xFFFF if pixel_g < 0x100 { // Literal pixel. color = pixel_g << 8 // Decode the Red symbol. - h = HUFFMAN_TABLE_BASE_OFFSETS[1] as base.u32 - while true, + while (this.n_bits < 8) and (args.src.length() > 0), inv p < p_max, { - node = this.huffman_nodes[hg][h] - if node >= 0x8000 { - break + c8 = args.src.read_u8?() + if this.n_bits >= 8 { + return "#internal error: inconsistent n_bits" } - if this.n_bits < 1 { + this.bits |= (c8 as base.u32) << this.n_bits + this.n_bits += 8 + } + ht_base = this.huffman_table_base_offsets[hg][1] as base.u32 + table_entry = this.huffman_tables[hg][(ht_base ~mod+ (this.bits & 0xFF)) & 0xFFF] + table_entry_n_bits = table_entry & 0x0F + this.bits >>= table_entry_n_bits + this.n_bits = (this.n_bits ~mod- table_entry_n_bits) & 31 + if (table_entry >> 31) == 0 { + while (this.n_bits < 7) and (args.src.length() > 0), + inv p < p_max, + { c8 = args.src.read_u8?() - this.bits = (c8 as base.u32) - this.n_bits = 8 - assert this.n_bits >= 1 + if this.n_bits >= 7 { + return "#internal error: inconsistent n_bits" + } + this.bits |= (c8 as base.u32) << this.n_bits + this.n_bits += 8 } - h = ((node as base.u32) & 0xFFF) + (this.bits & 1) - this.bits 
>>= 1 - this.n_bits -= 1 + redir_top = (table_entry >> 8) & 0xFFFF + redir_mask = ((1 as base.u32) << ((table_entry >> 4) & 0x0F)) - 1 + table_entry = this.huffman_tables[hg][(redir_top + (this.bits & redir_mask)) & 0xFFF] + table_entry_n_bits = table_entry & 0x0F + this.bits >>= table_entry_n_bits + this.n_bits = (this.n_bits ~mod- table_entry_n_bits) & 31 } - color |= ((node & 0xFF) as base.u32) << 16 + color |= (((table_entry >> 8) & 0xFF) as base.u32) << 16 // Decode the Blue symbol. - h = HUFFMAN_TABLE_BASE_OFFSETS[2] as base.u32 - while true, + while (this.n_bits < 8) and (args.src.length() > 0), inv p < p_max, { - node = this.huffman_nodes[hg][h] - if node >= 0x8000 { - break + c8 = args.src.read_u8?() + if this.n_bits >= 8 { + return "#internal error: inconsistent n_bits" } - if this.n_bits < 1 { + this.bits |= (c8 as base.u32) << this.n_bits + this.n_bits += 8 + } + ht_base = this.huffman_table_base_offsets[hg][2] as base.u32 + table_entry = this.huffman_tables[hg][(ht_base ~mod+ (this.bits & 0xFF)) & 0xFFF] + table_entry_n_bits = table_entry & 0x0F + this.bits >>= table_entry_n_bits + this.n_bits = (this.n_bits ~mod- table_entry_n_bits) & 31 + if (table_entry >> 31) == 0 { + while (this.n_bits < 7) and (args.src.length() > 0), + inv p < p_max, + { c8 = args.src.read_u8?() - this.bits = (c8 as base.u32) - this.n_bits = 8 - assert this.n_bits >= 1 + if this.n_bits >= 7 { + return "#internal error: inconsistent n_bits" + } + this.bits |= (c8 as base.u32) << this.n_bits + this.n_bits += 8 } - h = ((node as base.u32) & 0xFFF) + (this.bits & 1) - this.bits >>= 1 - this.n_bits -= 1 + redir_top = (table_entry >> 8) & 0xFFFF + redir_mask = ((1 as base.u32) << ((table_entry >> 4) & 0x0F)) - 1 + table_entry = this.huffman_tables[hg][(redir_top + (this.bits & redir_mask)) & 0xFFF] + table_entry_n_bits = table_entry & 0x0F + this.bits >>= table_entry_n_bits + this.n_bits = (this.n_bits ~mod- table_entry_n_bits) & 31 } - color |= ((node & 0xFF) as base.u32) << 0 + 
color |= (((table_entry >> 8) & 0xFF) as base.u32) << 0 // Decode the Alpha symbol. - h = HUFFMAN_TABLE_BASE_OFFSETS[3] as base.u32 - while true, + while (this.n_bits < 8) and (args.src.length() > 0), inv p < p_max, { - node = this.huffman_nodes[hg][h] - if node >= 0x8000 { - break + c8 = args.src.read_u8?() + if this.n_bits >= 8 { + return "#internal error: inconsistent n_bits" } - if this.n_bits < 1 { + this.bits |= (c8 as base.u32) << this.n_bits + this.n_bits += 8 + } + ht_base = this.huffman_table_base_offsets[hg][3] as base.u32 + table_entry = this.huffman_tables[hg][(ht_base ~mod+ (this.bits & 0xFF)) & 0xFFF] + table_entry_n_bits = table_entry & 0x0F + this.bits >>= table_entry_n_bits + this.n_bits = (this.n_bits ~mod- table_entry_n_bits) & 31 + if (table_entry >> 31) == 0 { + while (this.n_bits < 7) and (args.src.length() > 0), + inv p < p_max, + { c8 = args.src.read_u8?() - this.bits = (c8 as base.u32) - this.n_bits = 8 - assert this.n_bits >= 1 + if this.n_bits >= 7 { + return "#internal error: inconsistent n_bits" + } + this.bits |= (c8 as base.u32) << this.n_bits + this.n_bits += 8 } - h = ((node as base.u32) & 0xFFF) + (this.bits & 1) - this.bits >>= 1 - this.n_bits -= 1 + redir_top = (table_entry >> 8) & 0xFFFF + redir_mask = ((1 as base.u32) << ((table_entry >> 4) & 0x0F)) - 1 + table_entry = this.huffman_tables[hg][(redir_top + (this.bits & redir_mask)) & 0xFFF] + table_entry_n_bits = table_entry & 0x0F + this.bits >>= table_entry_n_bits + this.n_bits = (this.n_bits ~mod- table_entry_n_bits) & 31 } - color |= ((node & 0xFF) as base.u32) << 24 + color |= (((table_entry >> 8) & 0xFF) as base.u32) << 24 } else if pixel_g < 0x118 { // Back-ref pixel. // Decode the back-ref length. @@ -183,27 +257,41 @@ pri func decoder.decode_pixels_slow?(dst: slice base.u8, src: base.io_reader, wi this.n_bits -= back_ref_len_n_bits } - // Decode the back-ref distance. 
- - h = HUFFMAN_TABLE_BASE_OFFSETS[4] as base.u32 - while true, + // Decode the back-ref distance symbol. + while (this.n_bits < 8) and (args.src.length() > 0), inv p < p_max, { - node = this.huffman_nodes[hg][h] - if node >= 0x8000 { - break + c8 = args.src.read_u8?() + if this.n_bits >= 8 { + return "#internal error: inconsistent n_bits" } - if this.n_bits < 1 { + this.bits |= (c8 as base.u32) << this.n_bits + this.n_bits += 8 + } + ht_base = this.huffman_table_base_offsets[hg][4] as base.u32 + table_entry = this.huffman_tables[hg][(ht_base ~mod+ (this.bits & 0xFF)) & 0xFFF] + table_entry_n_bits = table_entry & 0x0F + this.bits >>= table_entry_n_bits + this.n_bits = (this.n_bits ~mod- table_entry_n_bits) & 31 + if (table_entry >> 31) == 0 { + while (this.n_bits < 7) and (args.src.length() > 0), + inv p < p_max, + { c8 = args.src.read_u8?() - this.bits = (c8 as base.u32) - this.n_bits = 8 - assert this.n_bits >= 1 + if this.n_bits >= 7 { + return "#internal error: inconsistent n_bits" + } + this.bits |= (c8 as base.u32) << this.n_bits + this.n_bits += 8 } - h = ((node as base.u32) & 0xFFF) + (this.bits & 1) - this.bits >>= 1 - this.n_bits -= 1 + redir_top = (table_entry >> 8) & 0xFFFF + redir_mask = ((1 as base.u32) << ((table_entry >> 4) & 0x0F)) - 1 + table_entry = this.huffman_tables[hg][(redir_top + (this.bits & redir_mask)) & 0xFFF] + table_entry_n_bits = table_entry & 0x0F + this.bits >>= table_entry_n_bits + this.n_bits = (this.n_bits ~mod- table_entry_n_bits) & 31 } - back_ref_dist_sym = (node & 0x7FFF) as base.u32 + back_ref_dist_sym = (table_entry >> 8) & 0xFFFF if back_ref_dist_sym < 4 { back_ref_dist_premap_minus_1 = back_ref_dist_sym @@ -305,4 +393,10 @@ pri func decoder.decode_pixels_slow?(dst: slice base.u8, src: base.io_reader, wi y ~mod+= 1 } } + + // Store back shared pixel position. 
+ this.pix_p = p + this.pix_x = x + this.pix_y = y + this.pix_cc_p = color_cache_p } diff --git a/std/webp/decode_transform.wuffs b/std/webp/decode_transform.wuffs index c883a708e..00ecf2564 100644 --- a/std/webp/decode_transform.wuffs +++ b/std/webp/decode_transform.wuffs @@ -8,7 +8,9 @@ // // SPDX-License-Identifier: Apache-2.0 OR MIT -pri func decoder.apply_transform_predictor!(pix: slice base.u8, tile_data: roslice base.u8) { +pri func decoder.apply_transform_predictor!(pix: slice base.u8, tile_data: roslice base.u8), + choosy, +{ var w4 : base.u64[..= 0x1_0000] var prev_row : roslice base.u8 var curr_row : slice base.u8 @@ -283,14 +285,17 @@ pri func decoder.mode13(l: base.u8, t: base.u8, tl: base.u8) base.u8 { return 0 } -pri func decoder.apply_transform_cross_color!(pix: slice base.u8, tile_data: roslice base.u8) { - var tile_size_log2 : base.u32[..= 9] - var tiles_per_row : base.u32[..= 16895] - var mask : base.u32 - var y : base.u32[..= 0x4000] - var x : base.u32[..= 0x4000] - var t : base.u64 - var tile_data : roslice base.u8 +pri func decoder.apply_transform_cross_color!(pix: slice base.u8, tile_data: roslice base.u8), + choosy, +{ + var tile_size_log2 : base.u32[..= 9] + var tiles_per_row : base.u32[..= 16895] + var mask : base.u32 + var do_subtract_green : base.bool + var y : base.u32[..= 0x4000] + var x : base.u32[..= 0x4000] + var t : base.u64 + var tile_data : roslice base.u8 var g2r : base.u32 var g2b : base.u32 @@ -303,6 +308,7 @@ pri func decoder.apply_transform_cross_color!(pix: slice base.u8, tile_data: ros tile_size_log2 = this.transform_tile_size_log2[1] as base.u32 tiles_per_row = (this.width + (((1 as base.u32) << tile_size_log2) - 1)) >> tile_size_log2 mask = ((1 as base.u32) << tile_size_log2) - 1 + do_subtract_green = this.fuse_subtract_green y = 0 while y < this.height { @@ -331,9 +337,15 @@ pri func decoder.apply_transform_cross_color!(pix: slice base.u8, tile_data: ros b = args.pix[0] g = args.pix[1] r = args.pix[2] + // Apply 
cross_color first, then subtract_green. The r2b term + // must use red before green is added (subtract_green order). r ~mod+= (((this.util.sign_extend_convert_u8_u32(a: g) ~mod* g2r) >> 5) & 0xFF) as base.u8 b ~mod+= (((this.util.sign_extend_convert_u8_u32(a: g) ~mod* g2b) >> 5) & 0xFF) as base.u8 b ~mod+= (((this.util.sign_extend_convert_u8_u32(a: r) ~mod* r2b) >> 5) & 0xFF) as base.u8 + if do_subtract_green { + r ~mod+= g + b ~mod+= g + } args.pix[0] = b args.pix[2] = r args.pix = args.pix[4 ..] @@ -346,7 +358,9 @@ pri func decoder.apply_transform_cross_color!(pix: slice base.u8, tile_data: ros } } -pri func decoder.apply_transform_subtract_green!(pix: slice base.u8) { +pri func decoder.apply_transform_subtract_green!(pix: slice base.u8), + choosy, +{ var p : slice base.u8 var g : base.u8 diff --git a/std/webp/decode_transform_x86_avx2.wuffs b/std/webp/decode_transform_x86_avx2.wuffs new file mode 100644 index 000000000..64963c2ec --- /dev/null +++ b/std/webp/decode_transform_x86_avx2.wuffs @@ -0,0 +1,674 @@ +// Copyright 2024 The Wuffs Authors. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. +// +// SPDX-License-Identifier: Apache-2.0 OR MIT + +// -------- + +// apply_transform_subtract_green_x86_avx2 applies the subtract_green transform +// using AVX2: for every 4-byte pixel in args.pix, the byte at offset 1 (green) +// is added, modulo 256, to the bytes at offsets 0 and 2 (blue and red in the +// b/g/r byte order used by the cross-color code). Offsets 1 and 3 are left +// as they are. +// +// Whole 32-byte groups (8 pixels) go through the vector loop and the remaining +// pixels are handled 4 bytes at a time. A trailing fragment shorter than 4 +// bytes is not touched. Presumably this is the x86_avx2 choice for the choosy +// apply_transform_subtract_green (TODO confirm where it is chosen). +pri func decoder.apply_transform_subtract_green_x86_avx2!(pix: slice base.u8), + choose cpu_arch >= x86_avx2, +{ + var util : base.x86_avx2_utility + var tail : slice base.u8 // Not-yet-processed suffix of args.pix. + var v : base.x86_m256i + var mask : base.x86_m256i + var green : base.x86_m256i + var g_br : base.x86_m256i // Green copied into the blue and red byte positions. + + mask = util.make_m256i_repeat_u32(a: 0x0000_FF00) // Keeps only byte 1 (green) of each 32-bit pixel. + tail = args.pix + + while tail.length() >= 32 { + v = util.make_m256i_slice256(a: tail) + green = v._mm256_and_si256(b: mask) + // Shift green one byte down (to offset 0) and one byte up (to offset 2). + g_br = green._mm256_srli_epi32(imm8: 8)._mm256_or_si256( + b: green._mm256_slli_epi32(imm8: 8)) + v = v._mm256_add_epi8(b: g_br) // Per-byte wrapping add; offsets 1 and 3 receive zero. + v.store_slice256!(a: tail[.. 32]) + tail = tail[32 ..]
+ } + + while tail.length() >= 4 { + tail[0] ~mod+= tail[1] + tail[2] ~mod+= tail[1] + tail = tail[4 ..] + } +} + +// -------- + +// cross_color inverse transform with AVX2. +// +// Per pixel: new_r = old_r + ((sign_ext(g) * g2r) >> 5) & 0xFF +// new_b = old_b + ((sign_ext(g) * g2b) >> 5) & 0xFF +// + ((sign_ext(new_r) * r2b) >> 5) & 0xFF +// +// Uses i16 multiply (mullo_epi16) on sign-extended byte values. +// shuffle_epi8 extracts green/red bytes to i16 lanes and scatters deltas back. +// +// args.pix holds 4-byte pixels (blue, green, red at offsets 0, 1, 2) and is +// advanced as pixels are processed. args.tile_data holds 4 bytes per tile, of +// which bytes 0, 1 and 2 are the signed g2r, g2b and r2b coefficients. +// Neighboring tiles in a row with identical coefficient bytes are merged into +// one span, spans whose coefficients are all zero are skipped (unless +// subtract_green is fused in), 8 pixels are handled per vector step and the +// rest of a span falls back to the scalar formula. +// +// When this.fuse_subtract_green is set, green is also added to red and blue +// after the cross-color deltas, so a separate subtract_green pass is presumably +// not needed (TODO confirm against the caller). + +pri func decoder.apply_transform_cross_color_x86_avx2!(pix: slice base.u8, tile_data: roslice base.u8), + choose cpu_arch >= x86_avx2, +{ + var tile_size_log2 : base.u32[..= 9] + var tiles_per_row : base.u32[..= 16895] + var tmask : base.u32[..= 0x1FF] // (1 << tile_size_log2) - 1. + var do_subtract_green : base.bool // Copy of this.fuse_subtract_green. + var y : base.u32[..= 0x4000] + var x : base.u32[..= 0x4008] + var t : base.u64 // Byte offset in args.tile_data of the current tile row. + var tile_data : roslice base.u8 + var x_end : base.u32 // One past the last pixel of the current same-coefficient span. + + var g2r : base.u32 + var g2b : base.u32 + var r2b : base.u32 + var raw_g2r : base.u8 // Unextended coefficient bytes, compared against the next tile's. + var raw_g2b : base.u8 + var raw_r2b : base.u8 + var skip_bytes : base.u64 // Byte length of a skipped all-zero span. + + var b : base.u8 + var g : base.u8 + var r : base.u8 + + var util : base.x86_avx2_utility + var pix : base.x86_m256i + var green_i16 : base.x86_m256i + var red_i16 : base.x86_m256i + var new_r_i16 : base.x86_m256i + var delta_r_i16 : base.x86_m256i + var delta_b_i16 : base.x86_m256i + var g2r_vec : base.x86_m256i + var g2b_vec : base.x86_m256i + var r2b_vec : base.x86_m256i + var delta_r_packed : base.x86_m256i + var delta_b_packed : base.x86_m256i + + // Shuffle masks (same for both 128-bit lanes). + var green_shuf : base.x86_m256i + var red_shuf : base.x86_m256i + var r_scatter : base.x86_m256i + var b_scatter : base.x86_m256i + + // Fused subtract_green support.
+ var sg_mask : base.x86_m256i + var sg_green : base.x86_m256i + var sg_br : base.x86_m256i + + tile_size_log2 = this.transform_tile_size_log2[1] as base.u32 + tiles_per_row = (this.width + (((1 as base.u32) << tile_size_log2) - 1)) >> tile_size_log2 + tmask = ((1 as base.u32) << tile_size_log2) - 1 + do_subtract_green = this.fuse_subtract_green + + // Extract green byte (offset 1 in each pixel) to i16 positions. + // Per 128-bit lane: bytes [1,_,5,_,9,_,13,_,0,0,0,0,0,0,0,0]. + green_shuf = util.make_m256i_multiple_u32( + a00: 0x8005_8001, a01: 0x800D_8009, + a02: 0x8080_8080, a03: 0x8080_8080, + a04: 0x8005_8001, a05: 0x800D_8009, + a06: 0x8080_8080, a07: 0x8080_8080) + + // Extract red byte (offset 2 in each pixel) to i16 positions. + red_shuf = util.make_m256i_multiple_u32( + a00: 0x8006_8002, a01: 0x800E_800A, + a02: 0x8080_8080, a03: 0x8080_8080, + a04: 0x8006_8002, a05: 0x800E_800A, + a06: 0x8080_8080, a07: 0x8080_8080) + + // Scatter delta_r low bytes from i16 to red byte position (offset 2) in each pixel. + // i16 values at byte positions [0,_,2,_,4,_,6,_,...] → pixel byte [_,_,R,_] per pixel. + r_scatter = util.make_m256i_multiple_u32( + a00: 0x8000_8080, a01: 0x8002_8080, + a02: 0x8004_8080, a03: 0x8006_8080, + a04: 0x8000_8080, a05: 0x8002_8080, + a06: 0x8004_8080, a07: 0x8006_8080) + + // Scatter delta_b low bytes to blue byte position (offset 0) in each pixel. + b_scatter = util.make_m256i_multiple_u32( + a00: 0x8080_8000, a01: 0x8080_8002, + a02: 0x8080_8004, a03: 0x8080_8006, + a04: 0x8080_8000, a05: 0x8080_8002, + a06: 0x8080_8004, a07: 0x8080_8006) + + // Mask for extracting green bytes (fused subtract_green). + sg_mask = util.make_m256i_repeat_u32(a: 0x0000_FF00) + + y = 0 + while y < this.height { + assert y < 0x4000 via "a < b: a < c; c <= b"(c: this.height) + + t = (4 * (y >> tile_size_log2) * tiles_per_row) as base.u64 + tile_data = this.util.empty_slice_u8() + if t <= args.tile_data.length() { + tile_data = args.tile_data[t ..] 
+ } + + x = 0 + while x < this.width, + inv y < 0x4000, + { + assert x < 0x4000 via "a < b: a < c; c <= b"(c: this.width) + + if ((x & tmask) == 0) and (tile_data.length() >= 4) { + raw_g2r = tile_data[0] + raw_g2b = tile_data[1] + raw_r2b = tile_data[2] + g2r = this.util.sign_extend_convert_u8_u32(a: raw_g2r) + g2b = this.util.sign_extend_convert_u8_u32(a: raw_g2b) + r2b = this.util.sign_extend_convert_u8_u32(a: raw_r2b) + tile_data = tile_data[4 ..] + } + + // Compute end of this tile span. + x_end = (x | tmask) + 1 + if x_end > this.width { + x_end = this.width + } + + // Extend x_end across consecutive tiles with same coefficients. + while (x_end < this.width) and (tile_data.length() >= 4), + inv y < 0x4000, + { + if (tile_data[0] <> raw_g2r) or (tile_data[1] <> raw_g2b) or (tile_data[2] <> raw_r2b) { + break + } + tile_data = tile_data[4 ..] + // Safe: x_end < this.width <= 0x4000, so no actual wrap. + x_end = (x_end | tmask) ~mod+ 1 + if x_end > this.width { + x_end = this.width + } + } + if x_end > this.width { + x_end = this.width + } + + // Skip identity tiles (all coefficients zero, no subtract_green). + if (g2r == 0) and (g2b == 0) and (r2b == 0) and (not do_subtract_green) { + if (x_end > x) and (x_end <= this.width) { + skip_bytes = ((x_end - x) as base.u64) * 4 + assert x_end < 0x4001 via "a < b: a <= c; c < b"(c: this.width) + x = x_end + if skip_bytes <= args.pix.length() { + args.pix = args.pix[skip_bytes ..] + } + } + } else { + // Broadcast coefficients as i16. + g2r_vec = util.make_m256i_repeat_u16(a: (g2r & 0xFFFF) as base.u16) + g2b_vec = util.make_m256i_repeat_u16(a: (g2b & 0xFFFF) as base.u16) + r2b_vec = util.make_m256i_repeat_u16(a: (r2b & 0xFFFF) as base.u16) + + // AVX2: process 8 pixels (32 bytes) per iteration. 
+ if x_end >= 8 { + while (x < this.width) and (x <= (x_end - 8)) and (args.pix.length() >= 32) and (x_end <= this.width), + inv y < 0x4000, + { + pix = util.make_m256i_slice256(a: args.pix) + + // Extract green to i16, sign-extend. + green_i16 = pix._mm256_shuffle_epi8(b: green_shuf) + green_i16 = green_i16._mm256_slli_epi16(imm8: 8)._mm256_srai_epi16(imm8: 8) + + // delta_r = (green * g2r) >> 5 + delta_r_i16 = green_i16._mm256_mullo_epi16(b: g2r_vec)._mm256_srai_epi16(imm8: 5) + + // delta_b_from_green = (green * g2b) >> 5 + delta_b_i16 = green_i16._mm256_mullo_epi16(b: g2b_vec)._mm256_srai_epi16(imm8: 5) + + // Compute new_r (for r2b contribution) from ORIGINAL red + // (before subtract_green), since cross_color uses coded red. + red_i16 = pix._mm256_shuffle_epi8(b: red_shuf) + red_i16 = red_i16._mm256_slli_epi16(imm8: 8)._mm256_srai_epi16(imm8: 8) + new_r_i16 = red_i16._mm256_add_epi16(b: delta_r_i16) + // Byte-truncate and sign-extend for the multiply. + new_r_i16 = new_r_i16._mm256_slli_epi16(imm8: 8)._mm256_srai_epi16(imm8: 8) + + // delta_b_from_red = (new_r * r2b) >> 5 + delta_b_i16 = delta_b_i16._mm256_add_epi16( + b: new_r_i16._mm256_mullo_epi16(b: r2b_vec)._mm256_srai_epi16(imm8: 5)) + + // Scatter deltas back to pixel byte positions and add. + delta_r_packed = delta_r_i16._mm256_shuffle_epi8(b: r_scatter) + delta_b_packed = delta_b_i16._mm256_shuffle_epi8(b: b_scatter) + pix = pix._mm256_add_epi8(b: delta_r_packed) + pix = pix._mm256_add_epi8(b: delta_b_packed) + + // Fused subtract_green: add green to R and B AFTER + // cross_color, so r2b uses the coded red, not red+green. + if do_subtract_green { + sg_green = pix._mm256_and_si256(b: sg_mask) + sg_br = sg_green._mm256_srli_epi32(imm8: 8)._mm256_or_si256( + b: sg_green._mm256_slli_epi32(imm8: 8)) + pix = pix._mm256_add_epi8(b: sg_br) + } + + pix.store_slice256!(a: args.pix[.. 32]) + args.pix = args.pix[32 ..] 
+ assert x < this.width via "a < b: a < c; c <= b"(c: this.width) + assert x < 0x4000 via "a < b: a < c; c <= b"(c: this.width) + x += 8 + } + } + + // Scalar tail for remaining pixels in this tile span. + while (x < x_end) and (x_end <= this.width), + inv y < 0x4000, + { + assert x < this.width via "a < b: a < c; c <= b"(c: x_end) + assert x < 0x4000 via "a < b: a < c; c <= b"(c: this.width) + + if args.pix.length() >= 4 { + b = args.pix[0] + g = args.pix[1] + r = args.pix[2] + r ~mod+= (((this.util.sign_extend_convert_u8_u32(a: g) ~mod* g2r) >> 5) & 0xFF) as base.u8 + b ~mod+= (((this.util.sign_extend_convert_u8_u32(a: g) ~mod* g2b) >> 5) & 0xFF) as base.u8 + b ~mod+= (((this.util.sign_extend_convert_u8_u32(a: r) ~mod* r2b) >> 5) & 0xFF) as base.u8 + if do_subtract_green { + r ~mod+= g + b ~mod+= g + } + args.pix[0] = b + args.pix[2] = r + args.pix = args.pix[4 ..] + } + + x += 1 + } + } + } + + y += 1 + } +} + +// -------- + +// predictor inverse transform with per-tile-span dispatch. +// Restructured to avoid per-pixel mode checks within a tile span. +// For top-only modes (2, 3, 4), uses AVX2 to process 8 pixels at once. 
+
+// apply_transform_predictor_x86_avx2 undoes the predictor transform in place
+// over args.pix (4 bytes per pixel). args.tile_data holds one 4-byte entry
+// per tile; the low nibble of an entry's second byte is the predictor mode.
+//
+// Row 0 is predicted as "opaque black" for its first pixel and "left" for the
+// rest. In later rows, column 0 is predicted from the pixel above and the
+// remaining columns are handled one run of same-mode tiles at a time: modes
+// 0 to 4 get 8-pixel AVX2 loops and whatever is left goes through the scalar
+// loop at the bottom.
+pri func decoder.apply_transform_predictor_x86_avx2!(pix: slice base.u8, tile_data: roslice base.u8),
+        choose cpu_arch >= x86_avx2,
+{
+    var w4       : base.u64[..= 0x1_0000]
+    var prev_row : roslice base.u8
+    var curr_row : slice base.u8
+
+    var tile_size_log2 : base.u32[..= 9]
+    var tiles_per_row  : base.u32[..= 16895]
+    var mask           : base.u32[..= 0x1FF]
+    var y              : base.u32[..= 0x4000]
+    var x              : base.u32[..= 0x4008]
+    var t              : base.u64
+    var tile_data      : roslice base.u8
+    var mode           : base.u8[..= 0x0F]
+    var x_end          : base.u32
+
+    var util       : base.x86_avx2_utility
+    var avx_pix    : base.x86_m256i
+    var avx_prev   : base.x86_m256i
+    var avx_opaque : base.x86_m256i
+    var avx_carry  : base.x86_m256i
+
+    var l0    : base.u32[..= 0xFF]
+    var l1    : base.u32[..= 0xFF]
+    var l2    : base.u32[..= 0xFF]
+    var l3    : base.u32[..= 0xFF]
+    var c0    : base.u32[..= 0xFF]
+    var c1    : base.u32[..= 0xFF]
+    var c2    : base.u32[..= 0xFF]
+    var c3    : base.u32[..= 0xFF]
+    var t0    : base.u32[..= 0xFF]
+    var t1    : base.u32[..= 0xFF]
+    var t2    : base.u32[..= 0xFF]
+    var t3    : base.u32[..= 0xFF]
+    var sum_l : base.u32
+    var sum_t : base.u32
+
+    if (this.width <= 0) or (this.height <= 0) {
+        return nothing
+    }
+
+    w4 = (this.width * 4) as base.u64
+    curr_row = this.util.empty_slice_u8()
+    if w4 <= args.pix.length() {
+        curr_row = args.pix[.. w4]
+    }
+
+    // The first pixel's predictor is mode 0 (opaque black).
+    if curr_row.length() >= 4 {
+        curr_row[3] ~mod+= 0xFF
+    }
+
+    // The rest of the first row's predictor is mode 1 (L).
+    // AVX2 prefix-sum: 8 pixels at a time.
+    if curr_row.length() >= 4 {
+        avx_carry = util.make_m256i_repeat_u32(a: curr_row.peek_u32le())
+        while curr_row.length() >= 36 {
+            avx_pix = util.make_m256i_slice256(a: curr_row[4 .. 36])
+            avx_prev = avx_pix._mm256_slli_si256(imm8: 4)
+            avx_pix = avx_pix._mm256_add_epi8(b: avx_prev)
+            avx_prev = avx_pix._mm256_slli_si256(imm8: 8)
+            avx_pix = avx_pix._mm256_add_epi8(b: avx_prev)
+            avx_opaque = avx_pix._mm256_shuffle_epi32(imm8: 0xFF)
+            avx_opaque = avx_opaque._mm256_permute2x128_si256(b: avx_opaque, imm8: 0x08)
+            avx_pix = avx_pix._mm256_add_epi8(b: avx_opaque)
+            avx_pix = avx_pix._mm256_add_epi8(b: avx_carry)
+            avx_pix.store_slice256!(a: curr_row[4 .. 36])
+            avx_carry = avx_pix._mm256_permute4x64_epi64(imm8: 0xFF)
+            avx_carry = avx_carry._mm256_shuffle_epi32(imm8: 0xFF)
+            curr_row = curr_row[32 ..]
+        }
+    }
+    // Scalar tail.
+    while curr_row.length() >= 8 {
+        curr_row[4] ~mod+= curr_row[0]
+        curr_row[5] ~mod+= curr_row[1]
+        curr_row[6] ~mod+= curr_row[2]
+        curr_row[7] ~mod+= curr_row[3]
+        curr_row = curr_row[4 ..]
+    }
+
+    tile_size_log2 = this.transform_tile_size_log2[0] as base.u32
+    tiles_per_row = (this.width + (((1 as base.u32) << tile_size_log2) - 1)) >> tile_size_log2
+    mask = ((1 as base.u32) << tile_size_log2) - 1
+
+    y = 1
+    while y < this.height {
+        assert y < 0x4000 via "a < b: a < c; c <= b"(c: this.height)
+
+        t = (4 * (y >> tile_size_log2) * tiles_per_row) as base.u64
+        tile_data = this.util.empty_slice_u8()
+        if t <= args.tile_data.length() {
+            tile_data = args.tile_data[t ..]
+            if tile_data.length() >= 4 {
+                mode = tile_data[1] & 0x0F
+                tile_data = tile_data[4 ..]
+            }
+        }
+
+        if w4 <= args.pix.length() {
+            prev_row = args.pix
+            args.pix = args.pix[w4 ..]
+            curr_row = args.pix
+        }
+
+        // The first column's predictor is mode 2 (T).
+        if (prev_row.length() >= 4) and (curr_row.length() >= 4) {
+            curr_row[0] ~mod+= prev_row[0]
+            curr_row[1] ~mod+= prev_row[1]
+            curr_row[2] ~mod+= prev_row[2]
+            curr_row[3] ~mod+= prev_row[3]
+        }
+
+        x = 1
+        while x < this.width,
+                inv y < 0x4000,
+        {
+            assert x < 0x4000 via "a < b: a < c; c <= b"(c: this.width)
+
+            if ((x & mask) == 0) and (tile_data.length() >= 4) {
+                mode = tile_data[1] & 0x0F
+                tile_data = tile_data[4 ..]
+            }
+
+            // Compute end of this tile span.
+            x_end = (x | mask) + 1
+            if x_end > this.width {
+                x_end = this.width
+            }
+
+            // Extend x_end across consecutive tiles with same mode.
+            while (x_end < this.width) and (tile_data.length() >= 4),
+                    inv y < 0x4000,
+            {
+                if (tile_data[1] & 0x0F) <> mode {
+                    break
+                }
+                tile_data = tile_data[4 ..]
+                // Safe: x_end < this.width <= 0x4000, so no actual wrap.
+                x_end = (x_end | mask) ~mod+ 1
+                if x_end > this.width {
+                    x_end = this.width
+                }
+            }
+            if x_end > this.width {
+                x_end = this.width
+            }
+
+            // AVX2 path for mode 0 (opaque black): only alpha (+0xFF) per
+            // pixel. No left-pixel dependency, fully vectorizable.
+            if mode == 0 {
+                avx_opaque = util.make_m256i_repeat_u32(a: 0xFF00_0000)
+                if x_end >= 8 {
+                    while (x < this.width) and (x <= (x_end - 8)) and (curr_row.length() >= 36) and (prev_row.length() >= 32),
+                            inv y < 0x4000,
+                    {
+                        avx_pix = util.make_m256i_slice256(a: curr_row[4 .. 36])
+                        avx_pix = avx_pix._mm256_add_epi8(b: avx_opaque)
+                        avx_pix.store_slice256!(a: curr_row[4 .. 36])
+                        curr_row = curr_row[32 ..]
+                        prev_row = prev_row[32 ..]
+                        assert x < this.width via "a < b: a < c; c <= b"(c: this.width)
+                        assert x < 0x4000 via "a < b: a < c; c <= b"(c: this.width)
+                        x += 8
+                    }
+                }
+
+                // AVX2 prefix-sum for mode 1 (L/left): 8 pixels at a time.
+                // Uses Hillis-Steele parallel scan within 128-bit lanes
+                // (2 shift+add steps) plus a cross-lane fixup.
+            } else if mode == 1 {
+                if (x_end >= 8) and (curr_row.length() >= 4) {
+                    avx_carry = util.make_m256i_repeat_u32(a: curr_row.peek_u32le())
+                    while (x < this.width) and (x <= (x_end - 8)) and (curr_row.length() >= 36) and (prev_row.length() >= 32),
+                            inv y < 0x4000,
+                    {
+                        avx_pix = util.make_m256i_slice256(a: curr_row[4 .. 36])
+
+                        // Within-lane prefix sum (2 shift+add steps).
+                        avx_prev = avx_pix._mm256_slli_si256(imm8: 4)
+                        avx_pix = avx_pix._mm256_add_epi8(b: avx_prev)
+                        avx_prev = avx_pix._mm256_slli_si256(imm8: 8)
+                        avx_pix = avx_pix._mm256_add_epi8(b: avx_prev)
+
+                        // Cross-lane fixup: add lower lane total to upper lane.
+                        avx_opaque = avx_pix._mm256_shuffle_epi32(imm8: 0xFF)
+                        avx_opaque = avx_opaque._mm256_permute2x128_si256(b: avx_opaque, imm8: 0x08)
+                        avx_pix = avx_pix._mm256_add_epi8(b: avx_opaque)
+
+                        // Add carry from previous iteration.
+                        avx_pix = avx_pix._mm256_add_epi8(b: avx_carry)
+
+                        // Store result.
+                        avx_pix.store_slice256!(a: curr_row[4 .. 36])
+
+                        // Update carry: broadcast last pixel to all positions.
+                        avx_carry = avx_pix._mm256_permute4x64_epi64(imm8: 0xFF)
+                        avx_carry = avx_carry._mm256_shuffle_epi32(imm8: 0xFF)
+
+                        curr_row = curr_row[32 ..]
+                        prev_row = prev_row[32 ..]
+                        assert x < this.width via "a < b: a < c; c <= b"(c: this.width)
+                        assert x < 0x4000 via "a < b: a < c; c <= b"(c: this.width)
+                        x += 8
+                    }
+                }
+
+                // Scalar tail for remaining pixels.
+                while (x < x_end) and (x_end <= this.width) and (curr_row.length() >= 8) and (prev_row.length() >= 4),
+                        inv y < 0x4000,
+                {
+                    assert x < this.width via "a < b: a < c; c <= b"(c: x_end)
+                    assert x < 0x4000 via "a < b: a < c; c <= b"(c: this.width)
+                    curr_row[4] ~mod+= curr_row[0]
+                    curr_row[5] ~mod+= curr_row[1]
+                    curr_row[6] ~mod+= curr_row[2]
+                    curr_row[7] ~mod+= curr_row[3]
+                    curr_row = curr_row[4 ..]
+                    prev_row = prev_row[4 ..]
+                    x += 1
+                }
+
+                // AVX2 path for top-only modes without averaging (2, 3, 4).
+            } else if (mode == 2) or (mode == 3) or (mode == 4) {
+                if x_end >= 8 {
+                    while (x < this.width) and (x <= (x_end - 8)) and (curr_row.length() >= 36) and (prev_row.length() >= 40),
+                            inv y < 0x4000,
+                    {
+                        avx_pix = util.make_m256i_slice256(a: curr_row[4 .. 36])
+
+                        if mode == 2 {
+                            avx_prev = util.make_m256i_slice256(a: prev_row[4 .. 36])
+                        } else if mode == 3 {
+                            avx_prev = util.make_m256i_slice256(a: prev_row[8 .. 40])
+                        } else {
+                            avx_prev = util.make_m256i_slice256(a: prev_row[0 .. 32])
+                        }
+
+                        avx_pix = avx_pix._mm256_add_epi8(b: avx_prev)
+                        avx_pix.store_slice256!(a: curr_row[4 .. 36])
+                        curr_row = curr_row[32 ..]
+                        prev_row = prev_row[32 ..]
+                        assert x < this.width via "a < b: a < c; c <= b"(c: this.width)
+                        assert x < 0x4000 via "a < b: a < c; c <= b"(c: this.width)
+                        x += 8
+                    }
+                }
+            }
+
+            // Scalar fallback for remaining pixels and non-vectorizable modes.
+            while (x < x_end) and (x_end <= this.width),
+                    inv y < 0x4000,
+            {
+                assert x < this.width via "a < b: a < c; c <= b"(c: x_end)
+                assert x < 0x4000 via "a < b: a < c; c <= b"(c: this.width)
+
+                if (prev_row.length() < 12) or (curr_row.length() < 8) {
+                    break
+                }
+
+                if mode == 0 {
+                    curr_row[7] ~mod+= 0xFF
+                } else if mode == 1 {
+                    curr_row[4] ~mod+= curr_row[0]
+                    curr_row[5] ~mod+= curr_row[1]
+                    curr_row[6] ~mod+= curr_row[2]
+                    curr_row[7] ~mod+= curr_row[3]
+                } else if mode == 2 {
+                    curr_row[4] ~mod+= prev_row[4]
+                    curr_row[5] ~mod+= prev_row[5]
+                    curr_row[6] ~mod+= prev_row[6]
+                    curr_row[7] ~mod+= prev_row[7]
+                } else if mode == 3 {
+                    curr_row[4] ~mod+= prev_row[8]
+                    curr_row[5] ~mod+= prev_row[9]
+                    curr_row[6] ~mod+= prev_row[10]
+                    curr_row[7] ~mod+= prev_row[11]
+                } else if mode == 4 {
+                    curr_row[4] ~mod+= prev_row[0]
+                    curr_row[5] ~mod+= prev_row[1]
+                    curr_row[6] ~mod+= prev_row[2]
+                    curr_row[7] ~mod+= prev_row[3]
+                } else if mode == 5 {
+                    l0 = ((curr_row[0] as base.u32) + (prev_row[8] as base.u32)) / 2
+                    l1 = ((curr_row[1] as base.u32) + (prev_row[9] as base.u32)) / 2
+                    l2 = ((curr_row[2] as base.u32) + (prev_row[10] as base.u32)) / 2
+                    l3 = ((curr_row[3] as base.u32) + (prev_row[11] as base.u32)) / 2
+                    curr_row[4] ~mod+= ((l0 + (prev_row[4] as base.u32)) / 2) as base.u8
+                    curr_row[5] ~mod+= ((l1 + (prev_row[5] as base.u32)) / 2) as base.u8
+                    curr_row[6] ~mod+= ((l2 + (prev_row[6] as base.u32)) / 2) as base.u8
+                    curr_row[7] ~mod+= ((l3 + (prev_row[7] as base.u32)) / 2) as base.u8
+                } else if mode == 6 {
+                    curr_row[4] ~mod+= (((curr_row[0] as base.u32) + (prev_row[0] as base.u32)) / 2) as base.u8
+                    curr_row[5] ~mod+= (((curr_row[1] as base.u32) + (prev_row[1] as base.u32)) / 2) as base.u8
+                    curr_row[6] ~mod+= (((curr_row[2] as base.u32) + (prev_row[2] as base.u32)) / 2) as base.u8
+                    curr_row[7] ~mod+= (((curr_row[3] as base.u32) + (prev_row[3] as base.u32)) / 2) as base.u8
+                } else if mode == 7 {
+                    curr_row[4] ~mod+= (((curr_row[0] as base.u32) + (prev_row[4] as base.u32)) / 2) as base.u8
+                    curr_row[5] ~mod+= (((curr_row[1] as base.u32) + (prev_row[5] as base.u32)) / 2) as base.u8
+                    curr_row[6] ~mod+= (((curr_row[2] as base.u32) + (prev_row[6] as base.u32)) / 2) as base.u8
+                    curr_row[7] ~mod+= (((curr_row[3] as base.u32) + (prev_row[7] as base.u32)) / 2) as base.u8
+                } else if mode == 8 {
+                    curr_row[4] ~mod+= (((prev_row[0] as base.u32) + (prev_row[4] as base.u32)) / 2) as base.u8
+                    curr_row[5] ~mod+= (((prev_row[1] as base.u32) + (prev_row[5] as base.u32)) / 2) as base.u8
+                    curr_row[6] ~mod+= (((prev_row[2] as base.u32) + (prev_row[6] as base.u32)) / 2) as base.u8
+                    curr_row[7] ~mod+= (((prev_row[3] as base.u32) + (prev_row[7] as base.u32)) / 2) as base.u8
+                } else if mode == 9 {
+                    curr_row[4] ~mod+= (((prev_row[4] as base.u32) + (prev_row[8] as base.u32)) / 2) as base.u8
+                    curr_row[5] ~mod+= (((prev_row[5] as base.u32) + (prev_row[9] as base.u32)) / 2) as base.u8
+                    curr_row[6] ~mod+= (((prev_row[6] as base.u32) + (prev_row[10] as base.u32)) / 2) as base.u8
+                    curr_row[7] ~mod+= (((prev_row[7] as base.u32) + (prev_row[11] as base.u32)) / 2) as base.u8
+                } else if mode == 10 {
+                    l0 = ((curr_row[0] as base.u32) + (prev_row[0] as base.u32)) / 2
+                    l1 = ((curr_row[1] as base.u32) + (prev_row[1] as base.u32)) / 2
+                    l2 = ((curr_row[2] as base.u32) + (prev_row[2] as base.u32)) / 2
+                    l3 = ((curr_row[3] as base.u32) + (prev_row[3] as base.u32)) / 2
+                    t0 = ((prev_row[4] as base.u32) + (prev_row[8] as base.u32)) / 2
+                    t1 = ((prev_row[5] as base.u32) + (prev_row[9] as base.u32)) / 2
+                    t2 = ((prev_row[6] as base.u32) + (prev_row[10] as base.u32)) / 2
+                    t3 = ((prev_row[7] as base.u32) + (prev_row[11] as base.u32)) / 2
+                    curr_row[4] ~mod+= ((l0 + t0) / 2) as base.u8
+                    curr_row[5] ~mod+= ((l1 + t1) / 2) as base.u8
+                    curr_row[6] ~mod+= ((l2 + t2) / 2) as base.u8
+                    curr_row[7] ~mod+= ((l3 + t3) / 2) as base.u8
+                } else if mode == 11 {
+                    l0 = curr_row[0] as base.u32
+                    l1 = curr_row[1] as base.u32
+                    l2 = curr_row[2] as base.u32
+                    l3 = curr_row[3] as base.u32
+                    c0 = prev_row[0] as base.u32
+                    c1 = prev_row[1] as base.u32
+                    c2 = prev_row[2] as base.u32
+                    c3 = prev_row[3] as base.u32
+                    t0 = prev_row[4] as base.u32
+                    t1 = prev_row[5] as base.u32
+                    t2 = prev_row[6] as base.u32
+                    t3 = prev_row[7] as base.u32
+                    sum_l = this.absolute_difference(a: c0, b: t0) +
+                            this.absolute_difference(a: c1, b: t1) +
+                            this.absolute_difference(a: c2, b: t2) +
+                            this.absolute_difference(a: c3, b: t3)
+                    sum_t = this.absolute_difference(a: c0, b: l0) +
+                            this.absolute_difference(a: c1, b: l1) +
+                            this.absolute_difference(a: c2, b: l2) +
+                            this.absolute_difference(a: c3, b: l3)
+                    if sum_l < sum_t {
+                        curr_row[4] ~mod+= l0 as base.u8
+                        curr_row[5] ~mod+= l1 as base.u8
+                        curr_row[6] ~mod+= l2 as base.u8
+                        curr_row[7] ~mod+= l3 as base.u8
+                    } else {
+                        curr_row[4] ~mod+= t0 as base.u8
+                        curr_row[5] ~mod+= t1 as base.u8
+                        curr_row[6] ~mod+= t2 as base.u8
+                        curr_row[7] ~mod+= t3 as base.u8
+                    }
+                } else if mode == 12 {
+                    curr_row[4] ~mod+= this.mode12(l: curr_row[0], t: prev_row[4], tl: prev_row[0])
+                    curr_row[5] ~mod+= this.mode12(l: curr_row[1], t: prev_row[5], tl: prev_row[1])
+                    curr_row[6] ~mod+= this.mode12(l: curr_row[2], t: prev_row[6], tl: prev_row[2])
+                    curr_row[7] ~mod+= this.mode12(l: curr_row[3], t: prev_row[7], tl: prev_row[3])
+                } else if mode == 13 {
+                    curr_row[4] ~mod+= this.mode13(l: curr_row[0], t: prev_row[4], tl: prev_row[0])
+                    curr_row[5] ~mod+= this.mode13(l: curr_row[1], t: prev_row[5], tl: prev_row[1])
+                    curr_row[6] ~mod+= this.mode13(l: curr_row[2], t: prev_row[6], tl: prev_row[2])
+                    curr_row[7] ~mod+= this.mode13(l: curr_row[3], t: prev_row[7], tl: prev_row[3])
+                }
+
+                curr_row = curr_row[4 ..]
+                prev_row = prev_row[4 ..]
+                x += 1
+            }
+
+            // The scalar loop above bails out early when the rows are too
+            // short. It made no progress in that case, so give up on this
+            // row instead of spinning in the enclosing loop forever.
+            if x < x_end {
+                break
+            }
+        }
+
+        y += 1
+    }
+}
diff --git a/std/webp/decode_webp.wuffs b/std/webp/decode_webp.wuffs index 0589aabf8..33d5ca3e8 100644 --- a/std/webp/decode_webp.wuffs +++ b/std/webp/decode_webp.wuffs @@ -20,11 +20,12 @@ pub status "#bad transform" pub status "#short chunk" pub status "#truncated input" pub status "#unsupported number of Huffman groups" -pub status "#unsupported transform after color indexing transform" pub status "#unsupported WebP file" pri status "#internal error: inconsistent Huffman code" +pri status "#internal error: inconsistent Huffman decoder state" pri status "#internal error: inconsistent dst buffer" +pri status "#internal error: inconsistent I/O" pri status "#internal error: inconsistent n_bits" pub const DECODER_WORKBUF_LEN_MAX_INCL_WORST_CASE : base.u64 = 0 @@ -43,6 +44,12 @@ pub struct decoder? implements base.image_decoder( sub_chunk_has_padding : base.bool, is_vp8_lossy : base.bool, + is_vp8x : base.bool, + has_alpha : base.bool, + + // VP8X-calculated workbuf length (VP8 Y/U/V planes + optional alpha). + vp8x_workbuf_len : base.u64, + vp8l_alpha_workbuf_len : base.u64, frame_config_io_position : base.u64, @@ -52,18 +59,33 @@ pub struct decoder? implements base.image_decoder( bits : base.u32, n_bits : base.u32[..= 31], + // Pixel decode position, shared between fast and slow paths. 
+ pix_p : base.u64, + pix_x : base.u32, + pix_y : base.u32, + pix_cc_p : base.u64, + seen_transform : array[4] base.bool, transform_type : array[4] base.u8[..= 3], transform_tile_size_log2 : array[4] base.u8[..= 9], n_transforms : base.u32[..= 4], + fuse_subtract_green : base.bool, color_cache_bits : base.u32[..= 11], overall_color_cache_bits : base.u32[..= 11], overall_tile_size_log2 : base.u32[..= 9], - overall_n_huffman_groups : base.u32[..= 256], + overall_n_huffman_groups : base.u32[..= 1024], + + // Compaction state for sparse Huffman group indices. + // When the entropy image has max_index >= 1024 but <= 1024 distinct + // groups, indices are remapped to dense 0..K-1 in the tile data. + hg_compacted : base.bool, + hg_bitstream_groups : base.u32[..= 0x1_0000], + hg_n_sorted : base.u32[..= 1024], ht_n_symbols : base.u32[..= 2328], ht_code_lengths_remaining : base.u32, + ht_next_top : base.u32[..= 4096], color_indexing_palette_size : base.u32[..= 256], color_indexing_width : base.u32[..= 0x4000], @@ -101,6 +123,9 @@ pub struct decoder? implements base.image_decoder( palette : array[4 * 256] base.u8, color_cache : array[2048] base.u32, + // Sorted distinct original HG indices for compacted decode. + hg_sorted : array[1024] base.u16, + codes : array[2328] base.u16, code_lengths : array[2328] base.u16, @@ -111,34 +136,35 @@ pub struct decoder? implements base.image_decoder( // - 1SSS_SSSS_SSSS_SSSS Leaf node (symbol is SSS, it may be zero). code_lengths_huffman_nodes : array[37] base.u16, - // A Huffman group has five (5) Huffman trees. - // - // Start .. End Size - // 0x064C .. 0x187B 0x122F (1) Green, back-ref length, color cache. - // 0x0000 .. 0x01FF 0x01FF (2) Red. - // 0x01FF .. 0x03FE 0x01FF (3) Blue. - // 0x03FE .. 0x05FD 0x01FF (4) Alpha. - // 0x05FD .. 0x064C 0x004F (5) Back-ref distance. 
- // - // The Green+etc tree is last (in terms of start offset) because it's - // by far the largest (worst case) and its color cache component has - // variable length. + // Two-level Huffman lookup tables. A Huffman group has five (5) + // Huffman trees packed sequentially into one 4096-entry array: + // (0) Green + back-ref length + color cache + // (1) Red + // (2) Blue + // (3) Alpha + // (4) Back-ref distance // - // 0x122F = 4655 = ((2 * 2328) - 1) and the same 2328 turns up in func - // decoder.decode_huffman_tree. + // Each base.u32 entry is either: + // - Leaf: 0x8000_0000 | (symbol << 8) | n_bits + // - Redirect: 0x1000_0008 | (offset << 8) | (extra_bits << 4) // - // 2328 is (256 + 24 + 2048), combining 256 Green values, 24 back-ref - // lengths and up to 2048 = (1 << 11) color cache keys. + // Primary table: 8-bit index (256 entries per tree). + // Secondary tables: variable size, for codes > 8 bits. // - // 0x01FF = 511 = ((2 * 256) - 1) is for 256 Red (Blue, Alpha) values. - // - // 0x004F = 79 = ((2 * 40) - 1) is for 40 back-ref distances. - // - // 0x187B = 6267 and (0x187B * sizeof(u16)) = 12534. Overall, this - // field takes 0x30_F600 = 3_208704 bytes of memory. - // - // The base.u16's bits are the same as for code_lengths_huffman_nodes. - huffman_nodes : array[256] array[0x187B] base.u16, + // huffman_table_base_offsets[hg][ht] is the starting index of + // tree ht's primary table within huffman_tables[hg]. + // Index 1024 is a scratch slot for discarding unused trees + // during compacted Huffman group decode. + huffman_tables : array[1025] array[4096] base.u32, + huffman_table_base_offsets : array[1025] array[5] base.u16, + + // Per Huffman group: 0=normal, 1=trivial_literal (R/B/A are + // single-symbol), 2=trivial_code (all 4 are single-symbol). + hg_trivial : array[1025] base.u8, + // Pre-computed (alpha << 24) | (red << 16) | blue for trivial groups. + // For trivial_code, also includes (green << 8). 
+ hg_literal_arb : array[1025] base.u32, + ) pub func decoder.get_quirk(key: base.u32) base.u64 { @@ -199,7 +225,7 @@ pri func decoder.do_decode_image_config?(dst: nptr base.image_config, src: base. this.frame_config_io_position = args.src.position() - if (not this.is_vp8_lossy) and (args.dst <> nullptr) { + if ((not this.is_vp8_lossy) or this.is_vp8x) and (args.dst <> nullptr) { args.dst.set!( pixfmt: this.pixfmt, pixsub: 0, @@ -213,9 +239,11 @@ pri func decoder.do_decode_image_config?(dst: nptr base.image_config, src: base. } pri func decoder.do_decode_image_config_limited?(dst: nptr base.image_config, src: base.io_reader) { - var c32 : base.u32 - var r_mark : base.u64 - var status : base.status + var c32 : base.u32 + var r_mark : base.u64 + var status : base.status + var flags : base.u8 + var mb_width : base.u32[..= 0x400] c32 = args.src.read_u32le?() if c32 <> 'WEBP'le { @@ -228,7 +256,7 @@ pri func decoder.do_decode_image_config_limited?(dst: nptr base.image_config, sr } else if c32 == 'VP8L'le { // No-op. } else if c32 == 'VP8X'le { - return "#unsupported WebP file" + this.is_vp8x = true } else { return "#bad header" } @@ -239,6 +267,65 @@ pri func decoder.do_decode_image_config_limited?(dst: nptr base.image_config, sr } this.sub_chunk_has_padding = (this.sub_chunk_length & 1) <> 0
+    if this.is_vp8x {
+        // Parse VP8X extended header (10 bytes).
+        if this.sub_chunk_length < 10 {
+            return "#bad header"
+        }
+        flags = args.src.read_u8?()
+        this.has_alpha = ((flags & 0x10) <> 0)
+        // Skip 3 reserved bytes.
+        c32 = args.src.read_u24le_as_u32?()
+        // Canvas width minus one (24-bit LE). Canvases wider than 0x4000 are
+        // rejected rather than masked into an unrelated, smaller size.
+        c32 = args.src.read_u24le_as_u32?()
+        if c32 > 0x3FFF {
+            return "#unsupported WebP file"
+        }
+        this.width = c32 + 1
+        // Canvas height minus one (24-bit LE), with the same limit.
+        c32 = args.src.read_u24le_as_u32?()
+        if c32 > 0x3FFF {
+            return "#unsupported WebP file"
+        }
+        this.height = c32 + 1
+
+        // Compute VP8X workbuf length.
+        // VP8 needs mb_width * mb_height * 384.
+        // VP8L alpha decode needs 4*W*H + 4*transform_size.
+        // workbuf = max(vp8_planes, vp8l_decode) + alpha_plane.
+        mb_width = (this.width + 15) / 16
+        this.vp8x_workbuf_len = (mb_width as base.u64) * ((((this.height + 15) / 16) * 384) as base.u64)
+        if this.has_alpha {
+            // VP8L alpha decode space: 4*W*H + 4*ceil(W/4)*ceil(H/4)*4.
+            this.vp8l_alpha_workbuf_len =
+                    (4 * (this.width as base.u64) * (this.height as base.u64)) +
+                    (16 * ((((this.width + 3) >> 2) * ((this.height + 3) >> 2)) as base.u64))
+            if this.vp8l_alpha_workbuf_len > this.vp8x_workbuf_len {
+                this.vp8x_workbuf_len = this.vp8l_alpha_workbuf_len
+            }
+            this.vp8x_workbuf_len ~mod+= (this.width as base.u64) * (this.height as base.u64)
+        }
+
+        // Set pixel format based on alpha presence.
+        if this.has_alpha {
+            this.pixfmt = base.PIXEL_FORMAT__BGRA_NONPREMUL
+        } else {
+            this.pixfmt = base.PIXEL_FORMAT__BGRX
+        }
+
+        // Skip remaining VP8X chunk data + padding. Sub-chunk iteration
+        // is deferred to decode_frame so we can process ALPH before VP8.
+        this.sub_chunk_length ~sat-= 10
+        if this.sub_chunk_length > 0 {
+            args.src.skip_u32?(n: this.sub_chunk_length)
+        }
+        if this.sub_chunk_has_padding {
+            args.src.skip_u32?(n: 1)
+        }
+        return ok
+    }
+
 while true { io_limit (io: args.src, limit: this.sub_chunk_length as base.u64) { r_mark = args.src.mark() @@ -275,10 +359,10 @@ pri func decoder.do_decode_image_config_limited_vp8l?(src: base.io_reader) { c32 >>= 14 this.height = (c32 & 0x3FFF) + 1 c32 >>= 14 - this.pixfmt = base.PIXEL_FORMAT__BGRX - if (c32 & 1) <> 0 { - this.pixfmt = base.PIXEL_FORMAT__BGRA_NONPREMUL - } + // VP8L always stores full BGRA data. For images without alpha, all valid + // pixels have alpha=0xFF. Use BGRA_NONPREMUL (not BGRX) so that zeroed + // palette entries for out-of-range color indices preserve alpha=0. 
+ this.pixfmt = base.PIXEL_FORMAT__BGRA_NONPREMUL c32 >>= 1 if c32 <> 0 { return "#bad header" @@ -295,10 +379,12 @@ pub func decoder.decode_frame_config?(dst: nptr base.frame_config, src: base.io_ var status : base.status while true { - if this.is_vp8_lossy { - status =? this.vp8.decode_frame_config?(dst: args.dst, src: args.src) - } else { + if this.is_vp8x or (not this.is_vp8_lossy) { + // VP8X and VP8L: WebP decoder handles frame_config. status =? this.do_decode_frame_config?(dst: args.dst, src: args.src) + } else { + // VP8 (non-VP8X): delegate to VP8 decoder. + status =? this.vp8.decode_frame_config?(dst: args.dst, src: args.src) } if (status == base."$short read") and args.src.is_closed() { return "#truncated input" @@ -346,10 +432,18 @@ pri func decoder.do_decode_frame_config?(dst: nptr base.frame_config, src: base. pub func decoder.decode_frame?(dst: ptr base.pixel_buffer, src: base.io_reader, blend: base.pixel_blend, workbuf: slice base.u8, opts: nptr base.decode_frame_options) { var status : base.status + var r_mark : base.u64 while true { - if this.is_vp8_lossy { - status =? this.vp8.decode_frame?(dst: args.dst, src: args.src, blend: args.blend, workbuf: args.workbuf, opts: args.opts) + if this.is_vp8x { + status =? this.do_decode_frame_vp8x?(dst: args.dst, src: args.src, blend: args.blend, workbuf: args.workbuf, opts: args.opts) + } else if this.is_vp8_lossy { + io_limit (io: args.src, limit: this.sub_chunk_length as base.u64) { + r_mark = args.src.mark() + status =? this.vp8.decode_frame?(dst: args.dst, src: args.src, blend: args.blend, workbuf: args.workbuf, opts: args.opts) + this.sub_chunk_length ~sat-= + (args.src.count_since(mark: r_mark) & 0xFFFF_FFFF) as base.u32 + } } else { status =? 
this.do_decode_frame?(dst: args.dst, src: args.src, blend: args.blend, workbuf: args.workbuf, opts: args.opts) } @@ -360,10 +454,408 @@ pub func decoder.decode_frame?(dst: ptr base.pixel_buffer, src: base.io_reader, } }
+// do_decode_frame_vp8x decodes the still image of a VP8X container. It walks
+// the sub-chunks that follow the VP8X header: an ALPH chunk (if the header
+// announced alpha) is decoded into the alpha region of args.workbuf, other
+// chunks are skipped, and the walk stops at the first 'VP8 ' or 'VP8L' chunk.
+// A lossy payload is handed to this.vp8 and the alpha plane is then written
+// into the fourth byte of every destination pixel. 'VP8L' is rejected.
+pri func decoder.do_decode_frame_vp8x?(dst: ptr base.pixel_buffer, src: base.io_reader, blend: base.pixel_blend, workbuf: slice base.u8, opts: nptr base.decode_frame_options) {
+    var c32           : base.u32
+    var chunk_length  : base.u32
+    var chunk_padding : base.bool
+    var status        : base.status
+    var r_mark        : base.u64
+    var alpha_offset  : base.u64
+    var alph_length   : base.u32
+    var alph_header   : base.u8
+    var alph_comp     : base.u8
+    var alph_filter   : base.u8
+    var alph_byte     : base.u8
+    var alpha_i       : base.u64
+    var alpha_n       : base.u64
+    var y             : base.u32
+    var x             : base.u32
+    var tab           : table base.u8
+    var row           : slice base.u8
+    var row_idx       : base.u64
+
+    if this.call_sequence == 0x40 {
+        // No-op.
+    } else if this.call_sequence < 0x40 {
+        this.do_decode_frame_config?(dst: nullptr, src: args.src)
+    } else {
+        return base."@end of data"
+    }
+
+    // Iterate sub-chunks to find ALPH and VP8/VP8L.
+    // The VP8 decoder's workbuf is used for Y/U/V planes. For VP8X with alpha,
+    // the alpha plane is stored after the V plane in the workbuf.
+    alpha_offset = this.vp8x_workbuf_len
+    if this.has_alpha {
+        alpha_offset ~mod-= (this.width as base.u64) * (this.height as base.u64)
+    }
+
+    while true {
+        c32 = args.src.read_u32le?()
+        chunk_length = args.src.read_u32le?()
+        chunk_padding = ((chunk_length & 1) <> 0)
+
+        if c32 == 'ALPH'le {
+            // Decode ALPH chunk.
+            if (chunk_length < 1) or (not this.has_alpha) {
+                args.src.skip_u32?(n: chunk_length)
+                if chunk_padding {
+                    args.src.skip_u32?(n: 1)
+                }
+                continue
+            }
+            // Read ALPH header byte.
+            alph_header = args.src.read_u8?()
+            alph_comp = alph_header & 0x03
+            alph_filter = (alph_header >> 2) & 0x03
+            alph_length = chunk_length ~sat- 1
+
+            if alph_comp == 0 {
+                // Uncompressed alpha: read raw bytes into workbuf alpha region.
+                alpha_n = (this.width as base.u64) * (this.height as base.u64)
+                alpha_i = 0
+                while alpha_i < alpha_n {
+                    if alph_length == 0 {
+                        break
+                    }
+                    // Always consume the byte, even when it has nowhere to go,
+                    // so alph_length keeps matching the bytes left in the chunk.
+                    alph_byte = args.src.read_u8?()
+                    alph_length ~sat-= 1
+                    if (alpha_offset ~mod+ alpha_i) < args.workbuf.length() {
+                        args.workbuf[alpha_offset ~mod+ alpha_i] = alph_byte
+                    }
+                    alpha_i ~mod+= 1
+                }
+            } else {
+                // VP8L-compressed alpha: decode VP8L image, extract green channel.
+                // VP8L alpha within ALPH has no 0x2F signature or image descriptor.
+                // The VP8L bitstream starts directly with transform data.
+                // Set up VP8L transform offsets for workbuf.
+                this.workbuf_offset_for_transform[0] = 4 * this.width * this.height
+                this.workbuf_offset_for_transform[1] = this.workbuf_offset_for_transform[0] +
+                        (4 * ((this.width + 3) >> 2) * ((this.height + 3) >> 2))
+                this.workbuf_offset_for_transform[2] = this.workbuf_offset_for_transform[1] +
+                        (4 * ((this.width + 3) >> 2) * ((this.height + 3) >> 2))
+                this.workbuf_offset_for_transform[3] = this.workbuf_offset_for_transform[2] +
+                        (4 * ((this.width + 3) >> 2) * ((this.height + 3) >> 2))
+
+                // Decode VP8L pixels. Use io_limit to prevent reading
+                // past the ALPH chunk into the following VP8 chunk.
+                this.call_sequence = 0x40
+                while true {
+                    io_limit (io: args.src, limit: alph_length as base.u64) {
+                        r_mark = args.src.mark()
+                        status =? this.do_decode_frame?(dst: args.dst, src: args.src, blend: args.blend, workbuf: args.workbuf, opts: args.opts)
+                        alph_length ~sat-= (args.src.count_since(mark: r_mark) & 0xFFFF_FFFF) as base.u32
+                    }
+                    if status.is_ok() {
+                        break
+                    } else if not status.is_suspension() {
+                        return status
+                    }
+                    yield? status
+                }
+
+                // Extract green channel from decoded VP8L pixels in workbuf.
+                // VP8L pixels are BGRA at workbuf[0 .. 4*width*height].
+                // Green channel is at byte offset 1 in each 4-byte pixel.
+                alpha_n = (this.width as base.u64) * (this.height as base.u64)
+                alpha_i = 0
+                row_idx = 1  // Start at green channel of first pixel.
+                while alpha_i < alpha_n {
+                    if ((alpha_offset ~mod+ alpha_i) < args.workbuf.length()) and
+                            (row_idx < args.workbuf.length()) {
+                        args.workbuf[alpha_offset ~mod+ alpha_i] =
+                                args.workbuf[row_idx]
+                    }
+                    alpha_i ~mod+= 1
+                    row_idx ~mod+= 4
+                }
+
+                // Reset call_sequence for VP8 decode phase.
+                this.call_sequence = 0x40
+            }
+
+            // Apply alpha filter (reverse prediction).
+            if alph_filter == 1 {
+                // Horizontal: alpha[i] += alpha[i-1]
+                this.apply_alpha_filter_horizontal!(workbuf: args.workbuf, alpha_offset: alpha_offset)
+            } else if alph_filter == 2 {
+                // Vertical: alpha[i] += alpha[i-width]
+                this.apply_alpha_filter_vertical!(workbuf: args.workbuf, alpha_offset: alpha_offset)
+            } else if alph_filter == 3 {
+                // Gradient: alpha[i] += clamp(left + above - above_left)
+                this.apply_alpha_filter_gradient!(workbuf: args.workbuf, alpha_offset: alpha_offset)
+            }
+
+            // Skip remaining ALPH data + padding.
+            if alph_length > 0 {
+                args.src.skip_u32?(n: alph_length)
+            }
+            if chunk_padding {
+                args.src.skip_u32?(n: 1)
+            }
+
+        } else if (c32 == 'VP8 'le) or (c32 == 'VP8L'le) {
+            this.is_vp8_lossy = (c32 == 'VP8 'le)
+            this.sub_chunk_length = chunk_length
+            this.sub_chunk_has_padding = chunk_padding
+            break
+
+        } else {
+            // Skip unknown chunks (ICCP, EXIF, XMP, etc.).
+            args.src.skip_u32?(n: chunk_length)
+            if chunk_padding {
+                args.src.skip_u32?(n: 1)
+            }
+        }
+    }
+
+    if this.is_vp8_lossy {
+        // Call VP8 decode pipeline: image_config, frame_config, frame.
+        while true {
+            io_limit (io: args.src, limit: this.sub_chunk_length as base.u64) {
+                r_mark = args.src.mark()
+                status =? this.vp8.decode_image_config?(dst: nullptr, src: args.src)
+                this.sub_chunk_length ~sat-=
+                        (args.src.count_since(mark: r_mark) & 0xFFFF_FFFF) as base.u32
+            }
+            if status.is_ok() {
+                break
+            } else if not status.is_suspension() {
+                return status
+            }
+            yield? status
+        }
+
+        while true {
+            status =? this.vp8.decode_frame_config?(dst: nullptr, src: args.src)
+            if status.is_ok() {
+                break
+            } else if not status.is_suspension() {
+                return status
+            }
+            yield? status
+        }
+
+        while true {
+            io_limit (io: args.src, limit: this.sub_chunk_length as base.u64) {
+                r_mark = args.src.mark()
+                status =? this.vp8.decode_frame?(dst: args.dst, src: args.src, blend: args.blend, workbuf: args.workbuf, opts: args.opts)
+                this.sub_chunk_length ~sat-=
+                        (args.src.count_since(mark: r_mark) & 0xFFFF_FFFF) as base.u32
+            }
+            if status.is_ok() {
+                break
+            } else if not status.is_suspension() {
+                return status
+            }
+            yield? status
+        }
+    } else {
+        // VP8L inside VP8X: use the existing VP8L decode path.
+        // TODO: implement VP8L inside VP8X.
+        return "#unsupported WebP file"
+    }
+
+    // Apply alpha channel to the decoded pixel buffer.
+    if this.has_alpha {
+        tab = args.dst.plane(p: 0)
+        y = 0
+        while y < this.height {
+            row = tab.row_u32(y: y)
+            x = 0
+            while x < this.width,
+                    inv y < this.height,
+            {
+                // BGRA pixel layout: [B, G, R, A] at offset x*4.
+                // Set the alpha byte (offset 3) from the decoded alpha plane.
+                row_idx = ((x as base.u64) * 4) + 3
+                alpha_i = (alpha_offset ~mod+ ((y as base.u64) * (this.width as base.u64))) ~mod+ (x as base.u64)
+                if (row_idx < row.length()) and (alpha_i < args.workbuf.length()) {
+                    row[row_idx] = args.workbuf[alpha_i]
+                }
+                x ~mod+= 1
+            }
+            y ~mod+= 1
+        }
+    }
+
+    this.call_sequence = 0x60
+}
+
+// apply_alpha_filter_horizontal undoes the ALPH horizontal filter in place:
+// every byte of the alpha plane at args.alpha_offset gains the value to its
+// left, and a row's first byte gains the first byte of the row above.
+pri func decoder.apply_alpha_filter_horizontal!(workbuf: slice base.u8, alpha_offset: base.u64) {
+    var y    : base.u32
+    var x    : base.u32
+    var i    : base.u64
+    var prev : base.u8
+
+    y = 0
+    while y < this.height {
+        // For rows after the first, initialize prev with the first pixel of the
+        // previous row's output (matching libwebp's ALPHFilterRestoreH behavior).
+        prev = 0
+        if y > 0 {
+            i = args.alpha_offset ~mod+ (((y ~mod- 1) as base.u64) * (this.width as base.u64))
+            if i < args.workbuf.length() {
+                prev = args.workbuf[i]
+            }
+        }
+        x = 0
+        while x < this.width,
+                inv y < this.height,
+        {
+            i = (args.alpha_offset ~mod+ ((y as base.u64) * (this.width as base.u64))) ~mod+ (x as base.u64)
+            if i < args.workbuf.length() {
+                args.workbuf[i] = (args.workbuf[i] ~mod+ prev) as base.u8
+                prev = args.workbuf[i]
+            }
+            x ~mod+= 1
+        }
+        y ~mod+= 1
+    }
+}
+
+// apply_alpha_filter_vertical undoes the ALPH vertical filter in place: row
+// 0 of the alpha plane is left-accumulated and every later byte gains the
+// value directly above it.
+pri func decoder.apply_alpha_filter_vertical!(workbuf: slice base.u8, alpha_offset: base.u64) {
+    var y    : base.u32
+    var x    : base.u32
+    var i    : base.u64
+    var prev : base.u8
+
+    // Row 0: apply horizontal unfilter (matching libwebp's VerticalUnfilter
+    // which delegates to HorizontalUnfilter when prev_row is NULL).
+    prev = 0
+    x = 0
+    while x < this.width {
+        i = args.alpha_offset ~mod+ (x as base.u64)
+        if i < args.workbuf.length() {
+            args.workbuf[i] = (args.workbuf[i] ~mod+ prev) as base.u8
+            prev = args.workbuf[i]
+        }
+        x ~mod+= 1
+    }
+
+    // Rows 1+: apply vertical unfilter (add pixel from above row).
+    y = 1
+    while y < this.height {
+        x = 0
+        while x < this.width,
+                inv y < this.height,
+        {
+            i = (args.alpha_offset ~mod+ (((y ~mod- 1) as base.u64) * (this.width as base.u64))) ~mod+ (x as base.u64)
+            prev = 0
+            if i < args.workbuf.length() {
+                prev = args.workbuf[i]
+            }
+            i = (args.alpha_offset ~mod+ ((y as base.u64) * (this.width as base.u64))) ~mod+ (x as base.u64)
+            if i < args.workbuf.length() {
+                args.workbuf[i] = (args.workbuf[i] ~mod+ prev) as base.u8
+            }
+            x ~mod+= 1
+        }
+        y ~mod+= 1
+    }
+}
+
+// apply_alpha_filter_gradient undoes the ALPH gradient filter in place: row
+// 0 is left-accumulated, column 0 gains the value above, and every other
+// byte gains clamp(left + above - above_left, 0, 255).
+pri func decoder.apply_alpha_filter_gradient!(workbuf: slice base.u8, alpha_offset: base.u64) {
+    var y     : base.u32
+    var x     : base.u32
+    var i     : base.u64
+    var prev  : base.u8
+    var left  : base.u32
+    var above : base.u32
+    var tl    : base.u32
+    var pred  : base.u32
+
+    // Row 0: apply horizontal unfilter (matching libwebp's GradientUnfilter
+    // which delegates to HorizontalUnfilter when prev_row is NULL).
+    prev = 0
+    x = 0
+    while x < this.width {
+        i = args.alpha_offset ~mod+ (x as base.u64)
+        if i < args.workbuf.length() {
+            args.workbuf[i] = (args.workbuf[i] ~mod+ prev) as base.u8
+            prev = args.workbuf[i]
+        }
+        x ~mod+= 1
+    }
+
+    // Rows 1+: apply gradient unfilter.
+    y = 1
+    while y < this.height {
+        // First pixel of each row: pred = above (left=0, tl=0, so grad=above).
+        i = (args.alpha_offset ~mod+ (((y ~mod- 1) as base.u64) * (this.width as base.u64)))
+        above = 0
+        if i < args.workbuf.length() {
+            above = args.workbuf[i] as base.u32
+        }
+        i = (args.alpha_offset ~mod+ ((y as base.u64) * (this.width as base.u64)))
+        if i < args.workbuf.length() {
+            args.workbuf[i] = (((args.workbuf[i] as base.u32) ~mod+ above) & 0xFF) as base.u8
+        }
+
+        x = 1
+        while x < this.width,
+                inv y < this.height,
+        {
+            // left = output[y][x-1]
+            left = 0
+            i = (args.alpha_offset ~mod+ ((y as base.u64) * (this.width as base.u64))) ~mod+ ((x ~mod- 1) as base.u64)
+            if i < args.workbuf.length() {
+                left = args.workbuf[i] as base.u32
+            }
+            // above = output[y-1][x]
+            above = 0
+            i = (args.alpha_offset ~mod+ (((y ~mod- 1) as base.u64) * (this.width as base.u64))) ~mod+ (x as base.u64)
+            if i < args.workbuf.length() {
+                above = args.workbuf[i] as base.u32
+            }
+            // tl = output[y-1][x-1]
+            tl = 0
+            i = (args.alpha_offset ~mod+ (((y ~mod- 1) as base.u64) * (this.width as base.u64))) ~mod+ ((x ~mod- 1) as base.u64)
+            if i < args.workbuf.length() {
+                tl = args.workbuf[i] as base.u32
+            }
+            // clamp(left + above - tl, 0, 255)
+            pred = (left ~mod+ above) ~mod- tl
+            if pred > 255 {
+                if (left ~mod+ above) < tl {
+                    pred = 0
+                } else {
+                    pred = 255
+                }
+            }
+            i = (args.alpha_offset ~mod+ ((y as base.u64) * (this.width as base.u64))) ~mod+ (x as base.u64)
+            if i < args.workbuf.length() {
+                args.workbuf[i] = (((args.workbuf[i] as base.u32) ~mod+ pred) & 0xFF) as base.u8
+            }
+            x ~mod+= 1
+        }
+        y ~mod+= 1
+    }
+}
+
 pri func decoder.do_decode_frame?(dst: ptr base.pixel_buffer, src: base.io_reader, blend: base.pixel_blend, workbuf: slice base.u8, opts: nptr base.decode_frame_options) { var c8 : base.u8 var has_more : base.u32[..= 1] var width : base.u32[..= 0x4000] + var saved_width : base.u32[..= 0x4000] var dst : slice base.u8 var tile_data : roslice base.u8 var status : base.status @@ -413,7 +886,7 @@ pri func decoder.do_decode_frame?(dst: ptr base.pixel_buffer, src: 
base.io_reade this.overall_color_cache_bits = this.color_cache_bits this.decode_hg_table?(src: args.src, width: width, workbuf: args.workbuf) this.color_cache_bits = this.overall_color_cache_bits - this.decode_huffman_groups?(src: args.src, n_huffman_groups: this.overall_n_huffman_groups) + this.decode_huffman_groups?(src: args.src, n_huffman_groups: this.overall_n_huffman_groups, n_bitstream_groups: this.hg_bitstream_groups) while true { if ((this.workbuf_offset_for_color_indexing as base.u64) > (this.workbuf_offset_for_transform[0] as base.u64)) or @@ -446,6 +919,30 @@ pri func decoder.do_decode_frame?(dst: ptr base.pixel_buffer, src: base.io_reade } pix = args.workbuf[.. (this.workbuf_offset_for_transform[0] as base.u64)] + choose apply_transform_predictor = [ + apply_transform_predictor_x86_avx2] + choose apply_transform_cross_color = [ + apply_transform_cross_color_x86_avx2] + choose apply_transform_subtract_green = [ + apply_transform_subtract_green_x86_avx2] + + // When color indexing is present with packing (palette <= 16), transforms + // applied before color_indexing inverse operate on the packed-width data, + // which lives at workbuf[workbuf_offset_for_color_indexing..]. Temporarily + // set this.width to color_indexing_width and point pix at the packed region. + saved_width = this.width + if this.seen_transform[3] { + this.width = this.color_indexing_width + if ((this.workbuf_offset_for_color_indexing as base.u64) <= + (this.workbuf_offset_for_transform[0] as base.u64)) and + ((this.workbuf_offset_for_transform[0] as base.u64) <= + args.workbuf.length()) { + pix = args.workbuf[ + (this.workbuf_offset_for_color_indexing as base.u64) .. 
+ (this.workbuf_offset_for_transform[0] as base.u64)] + } + } + which = this.n_transforms while which > 0 { which -= 1 @@ -463,14 +960,30 @@ pri func decoder.do_decode_frame?(dst: ptr base.pixel_buffer, src: base.io_reade if transform_type == 0 { this.apply_transform_predictor!(pix: pix, tile_data: tile_data) } else if transform_type == 1 { + // Fuse subtract_green into cross_color if it's the next transform. + if which > 0 { + if this.transform_type[which - 1] == 2 { + this.fuse_subtract_green = true + which -= 1 + } + } this.apply_transform_cross_color!(pix: pix, tile_data: tile_data) + this.fuse_subtract_green = false } else if transform_type == 2 { this.apply_transform_subtract_green!(pix: pix) } else { + // Restore full width and full pix buffer before color_indexing + // inverse (it reads packed data from workbuf_offset_for_color_indexing + // and writes expanded rows using this.width). + this.width = saved_width + if (this.workbuf_offset_for_transform[0] as base.u64) <= args.workbuf.length() { + pix = args.workbuf[.. 
(this.workbuf_offset_for_transform[0] as base.u64)] + } this.apply_transform_color_indexing!(pix: pix) width = this.width } } + this.width = saved_width status = this.swizzle!( dst: args.dst, @@ -484,11 +997,12 @@ pri func decoder.do_decode_frame?(dst: ptr base.pixel_buffer, src: base.io_reade } pri func decoder.decode_transform?(src: base.io_reader, workbuf: slice base.u8) { - var status : base.status - var c8 : base.u8 - var transform_type : base.u32[..= 3] - var tile_size_log2 : base.u32[..= 9] - var p : slice base.u8 + var status : base.status + var c8 : base.u8 + var transform_type : base.u32[..= 3] + var tile_size_log2 : base.u32[..= 9] + var effective_width : base.u32[..= 0x4000] + var p : slice base.u8 if this.n_bits < 2 { c8 = args.src.read_u8?() @@ -505,8 +1019,6 @@ pri func decoder.decode_transform?(src: base.io_reader, workbuf: slice base.u8) if this.seen_transform[transform_type] or (this.n_transforms >= 4) { return "#bad transform" - } else if this.seen_transform[3] { - return "#unsupported transform after color indexing transform" } this.seen_transform[transform_type] = true this.transform_type[this.n_transforms] = transform_type as base.u8 @@ -527,8 +1039,14 @@ pri func decoder.decode_transform?(src: base.io_reader, workbuf: slice base.u8) this.bits >>= 3 this.n_bits -= 3 + // When color indexing was parsed earlier, use the packed width. + effective_width = this.width + if this.seen_transform[3] { + effective_width = this.color_indexing_width + } + this.decode_color_cache_parameters?(src: args.src) - this.decode_huffman_groups?(src: args.src, n_huffman_groups: 1) + this.decode_huffman_groups?(src: args.src, n_huffman_groups: 1, n_bitstream_groups: 1) while true, inv transform_type < 2, @@ -543,7 +1061,7 @@ pri func decoder.decode_transform?(src: base.io_reader, workbuf: slice base.u8) (this.workbuf_offset_for_transform[transform_type + 1] as base.u64) .. 
(this.workbuf_offset_for_transform[transform_type + 2] as base.u64)], src: args.src, - width: (this.width + (((1 as base.u32) << tile_size_log2) - 1)) >> tile_size_log2, + width: (effective_width + (((1 as base.u32) << tile_size_log2) - 1)) >> tile_size_log2, height: (this.height + (((1 as base.u32) << tile_size_log2) - 1)) >> tile_size_log2, tile_data: this.util.empty_slice_u8(), tile_size_log2: 0) @@ -589,7 +1107,7 @@ pri func decoder.decode_transform?(src: base.io_reader, workbuf: slice base.u8) } this.decode_color_cache_parameters?(src: args.src) - this.decode_huffman_groups?(src: args.src, n_huffman_groups: 1) + this.decode_huffman_groups?(src: args.src, n_huffman_groups: 1, n_bitstream_groups: 1) this.decode_pixels?( dst: this.palette[.. 4 * this.color_indexing_palette_size], @@ -656,7 +1174,19 @@ pri func decoder.decode_hg_table?(src: base.io_reader, width: base.u32[..= 0x400 var hg_pixels : slice base.u8 var n : base.u64 var p : roslice base.u8 - var hg_plus_1 : base.u32[..= 256] + var hg_raw : base.u32 + var max_hg : base.u32 + var k : base.u32[..= 1024] + var j : base.u32[..= 1024] + var found : base.bool + var sort_i : base.u32[..= 1024] + var sort_j : base.u32[..= 1024] + var sort_val : base.u16 + var q : slice base.u8 + var lo : base.u32[..= 1024] + var hi : base.u32[..= 1024] + var mid : base.u32[..= 1024] + var compact : base.u32 if this.n_bits < 1 { c8 = args.src.read_u8?() @@ -670,6 +1200,8 @@ pri func decoder.decode_hg_table?(src: base.io_reader, width: base.u32[..= 0x400 if use_hg_table == 0 { this.overall_n_huffman_groups = 1 + this.hg_compacted = false + this.hg_bitstream_groups = 1 this.overall_tile_size_log2 = 0 if ((this.workbuf_offset_for_transform[0] as base.u64) > (this.workbuf_offset_for_transform[1] as base.u64)) or ((this.workbuf_offset_for_transform[1] as base.u64) > args.workbuf.length()) { @@ -703,7 +1235,7 @@ pri func decoder.decode_hg_table?(src: base.io_reader, width: base.u32[..= 0x400 this.overall_tile_size_log2 = 
tile_size_log2 this.decode_color_cache_parameters?(src: args.src) - this.decode_huffman_groups?(src: args.src, n_huffman_groups: 1) + this.decode_huffman_groups?(src: args.src, n_huffman_groups: 1, n_bitstream_groups: 1) while true, inv tile_size_log2 >= 2, { @@ -726,7 +1258,6 @@ pri func decoder.decode_hg_table?(src: base.io_reader, width: base.u32[..= 0x400 yield? status } - this.overall_n_huffman_groups = 1 if ((this.workbuf_offset_for_transform[0] as base.u64) > (this.workbuf_offset_for_transform[1] as base.u64)) or ((this.workbuf_offset_for_transform[1] as base.u64) > args.workbuf.length()) { return base."#bad workbuf length" @@ -740,22 +1271,129 @@ pri func decoder.decode_hg_table?(src: base.io_reader, width: base.u32[..= 0x400 if n > hg_pixels.length() { return base."#bad workbuf length" } + + // Pass 1: find max group index across all entropy image pixels. + max_hg = 0 p = hg_pixels[.. n] while p.length() >= 4 { - if p[2] <> 0 { - return "#unsupported number of Huffman groups" + hg_raw = ((p[2] as base.u32) << 8) | (p[1] as base.u32) + if max_hg < hg_raw { + max_hg = hg_raw } - hg_plus_1 = (p[1] as base.u32) + 1 - if this.overall_n_huffman_groups < hg_plus_1 { - this.overall_n_huffman_groups = hg_plus_1 + p = p[4 ..] + } + + if max_hg < 1024 { + // Fast path: no compaction needed. Direct index mapping. + this.hg_compacted = false + this.overall_n_huffman_groups = (max_hg & 0x3FF) + 1 + this.hg_bitstream_groups = (max_hg & 0x3FF) + 1 + return ok + } + + // Compaction: max_hg >= 1024. Sparse indices need remapping to dense + // 0..K-1 so they fit in our fixed-size arrays. + // + // Pass 2: collect distinct group indices into hg_sorted[]. + k = 0 + if n > hg_pixels.length() { + return base."#bad workbuf length" + } + p = hg_pixels[.. n] + while p.length() >= 4, + inv k <= 1024, + { + hg_raw = ((p[2] as base.u32) << 8) | (p[1] as base.u32) + // Linear scan for duplicates in hg_sorted[0 .. k]. 
+ found = false + j = 0 + while j < k, + inv k <= 1024, + inv p.length() >= 4, + { + assert j < 1024 via "a < b: a < c; c <= b"(c: k) + if (this.hg_sorted[j] as base.u32) == (hg_raw & 0xFFFF) { + found = true + break + } + j += 1 + } + if not found { + if k >= 1024 { + return "#unsupported number of Huffman groups" + } + this.hg_sorted[k] = (hg_raw & 0xFFFF) as base.u16 + k += 1 } p = p[4 ..] } + + // Insertion sort hg_sorted[0 .. k]. + sort_i = 1 + while sort_i < k { + assert sort_i < 1024 via "a < b: a < c; c <= b"(c: k) + sort_val = this.hg_sorted[sort_i] + sort_j = sort_i + while sort_j > 0, + inv sort_i < k, + { + if sort_j < 1024 { + if this.hg_sorted[sort_j - 1] <= sort_val { + break + } + this.hg_sorted[sort_j] = this.hg_sorted[sort_j - 1] + } + sort_j -= 1 + } + if sort_j < 1024 { + this.hg_sorted[sort_j] = sort_val + } + assert sort_i < 1024 via "a < b: a < c; c <= b"(c: k) + sort_i += 1 + } + + // Pass 3: rewrite tile data with compact indices via binary search. + if n > hg_pixels.length() { + return base."#bad workbuf length" + } + q = hg_pixels[.. n] + while q.length() >= 4 { + hg_raw = ((q[2] as base.u32) << 8) | (q[1] as base.u32) + // Binary search in hg_sorted[0 .. k]. + lo = 0 + hi = k + while lo < hi, + inv q.length() >= 4, + { + mid = (lo + hi) / 2 + if mid < 1024 { + if (this.hg_sorted[mid] as base.u32) < hg_raw { + lo = mid + 1 + } else { + hi = mid + } + } else { + break + } + } + // lo is now the compact index (0..K-1). + compact = lo + q[1] = (compact & 0xFF) as base.u8 + q[2] = ((compact >> 8) & 0xFF) as base.u8 + q = q[4 ..] 
+ } + + this.hg_compacted = true + this.overall_n_huffman_groups = k + this.hg_bitstream_groups = (max_hg & 0xFFFF) + 1 + this.hg_n_sorted = k } pri func decoder.decode_pixels?(dst: slice base.u8, src: base.io_reader, width: base.u32[..= 0x4000], height: base.u32[..= 0x4000], tile_data: roslice base.u8, tile_size_log2: base.u32[..= 9]) { - var i : base.u32 - var n : base.u32[..= 2048] + var status : base.status + var i : base.u32 + var n : base.u32[..= 2048] + var p_max : base.u64 i = 0 n = (1 as base.u32) << this.color_cache_bits @@ -765,13 +1403,41 @@ pri func decoder.decode_pixels?(dst: slice base.u8, src: base.io_reader, width: i += 1 } - this.decode_pixels_slow?( - dst: args.dst, - src: args.src, - width: args.width, - height: args.height, - tile_data: args.tile_data, - tile_size_log2: args.tile_size_log2) + // Initialize shared pixel position. + this.pix_p = 0 + this.pix_x = 0 + this.pix_y = 0 + this.pix_cc_p = 0 + + p_max = (4 * args.width * args.height) as base.u64 + + // Fast path: 64-bit bulk refill, table-driven decode. + while true { + status = this.decode_pixels_fast!( + dst: args.dst, + src: args.src, + width: args.width, + height: args.height, + tile_data: args.tile_data, + tile_size_log2: args.tile_size_log2) + if status.is_error() { + return status + } + if this.pix_p >= p_max { + return ok + } + // Slow path: byte-at-a-time refill for remaining pixels. 
+ this.decode_pixels_slow?( + dst: args.dst, + src: args.src, + width: args.width, + height: args.height, + tile_data: args.tile_data, + tile_size_log2: args.tile_size_log2) + if this.pix_p >= p_max { + return ok + } + } } pri func decoder.swizzle!(dst: ptr base.pixel_buffer, src: roslice base.u8, blend: base.pixel_blend) base.status { @@ -828,7 +1494,7 @@ pri func decoder.swizzle!(dst: ptr base.pixel_buffer, src: roslice base.u8, blen } pub func decoder.frame_dirty_rect() base.rect_ie_u32 { - if this.is_vp8_lossy { + if this.is_vp8_lossy and (not this.is_vp8x) { return this.vp8.frame_dirty_rect() } return this.util.make_rect_ie_u32( @@ -888,8 +1554,22 @@ pub func decoder.tell_me_more?(dst: base.io_writer, minfo: nptr base.more_inform } pub func decoder.workbuf_len() base.range_ii_u64 { + var total : base.u64 + + if this.is_vp8x { + return this.util.make_range_ii_u64( + min_incl: this.vp8x_workbuf_len, + max_incl: this.vp8x_workbuf_len) + } if this.is_vp8_lossy { - return this.vp8.workbuf_len() + // VP8 needs planes + partition0. Add remaining VP8 data for + // multi-partition coefficient data storage. sub_chunk_length + // at this point = partition0 data + partition sizes + coefficient data. + total = this.vp8.workbuf_len_total() ~sat+ + (this.sub_chunk_length as base.u64) + return this.util.make_range_ii_u64( + min_incl: total, + max_incl: total) } return this.util.make_range_ii_u64( min_incl: this.workbuf_offset_for_transform[3] as base.u64, diff --git a/test/3pdata/nia-checksums-of-webpsuite.txt b/test/3pdata/nia-checksums-of-webpsuite.txt index a7a1b2aea..428e023c4 100644 --- a/test/3pdata/nia-checksums-of-webpsuite.txt +++ b/test/3pdata/nia-checksums-of-webpsuite.txt @@ -1,10 +1,24 @@ # Generated by script/print-nia-checksums.sh -OK. ca4c0038 test/3pdata/webpsuite/bad_palette_index.webp -OK. 807f5a23 test/3pdata/webpsuite/bryce.webp -OK. 2ee5c270 test/3pdata/webpsuite/bug3.webp +OK. a92322d0 test/3pdata/webpsuite/alpha_color_cache.webp +OK. 
3081803b test/3pdata/webpsuite/alpha_filter_0_method_0.webp +OK. 3081803b test/3pdata/webpsuite/alpha_filter_0_method_1.webp +OK. d955c04f test/3pdata/webpsuite/alpha_filter_1.webp +OK. 3081803b test/3pdata/webpsuite/alpha_filter_1_method_0.webp +OK. 3081803b test/3pdata/webpsuite/alpha_filter_1_method_1.webp +OK. d955c04f test/3pdata/webpsuite/alpha_filter_2.webp +OK. 3081803b test/3pdata/webpsuite/alpha_filter_2_method_0.webp +OK. 3081803b test/3pdata/webpsuite/alpha_filter_2_method_1.webp +OK. d955c04f test/3pdata/webpsuite/alpha_filter_3.webp +OK. 3081803b test/3pdata/webpsuite/alpha_filter_3_method_0.webp +OK. 3081803b test/3pdata/webpsuite/alpha_filter_3_method_1.webp +OK. d955c04f test/3pdata/webpsuite/alpha_no_compression.webp +OK. 0ecfd841 test/3pdata/webpsuite/bad_palette_index.webp +OK. ff0041ac test/3pdata/webpsuite/big_endian_bug_393.webp +OK. c83499ce test/3pdata/webpsuite/bryce.webp +OK. a07d001c test/3pdata/webpsuite/bug3.webp OK. 6fbda804 test/3pdata/webpsuite/color_cache_bits_11.webp -BAD d338aed8 test/3pdata/webpsuite/dual_transform.webp -OK. 1ef61f19 test/3pdata/webpsuite/grid.bmp +OK. 0fecb95b test/3pdata/webpsuite/dual_transform.webp +OK. f79224e7 test/3pdata/webpsuite/grid.bmp OK. 33b7a1f3 test/3pdata/webpsuite/grid.pgm OK. f79224e7 test/3pdata/webpsuite/grid.png OK. 06b07a30 test/3pdata/webpsuite/grid.ppm @@ -12,114 +26,118 @@ OK. fce8fc76 test/3pdata/webpsuite/lossless1.webp OK. fce8fc76 test/3pdata/webpsuite/lossless2.webp OK. fce8fc76 test/3pdata/webpsuite/lossless3.webp OK. 4cbe795c test/3pdata/webpsuite/lossless4.webp -OK. f8e3adf4 test/3pdata/webpsuite/lossless_big_random_alpha.webp +OK. fe7deddb test/3pdata/webpsuite/lossless_big_random_alpha.webp OK. 289af407 test/3pdata/webpsuite/lossless_color_transform.bmp OK. 7b7c1ae6 test/3pdata/webpsuite/lossless_color_transform.pgm OK. 289af407 test/3pdata/webpsuite/lossless_color_transform.ppm -OK. 
289af407 test/3pdata/webpsuite/lossless_color_transform.webp +BAD 73e0b22f test/3pdata/webpsuite/lossless_color_transform.webp OK. f79224e7 test/3pdata/webpsuite/lossless_vec_1_0.webp OK. f79224e7 test/3pdata/webpsuite/lossless_vec_1_1.webp OK. f79224e7 test/3pdata/webpsuite/lossless_vec_1_10.webp -BAD df11692d test/3pdata/webpsuite/lossless_vec_1_11.webp +OK. f79224e7 test/3pdata/webpsuite/lossless_vec_1_11.webp OK. f79224e7 test/3pdata/webpsuite/lossless_vec_1_12.webp -BAD df11692d test/3pdata/webpsuite/lossless_vec_1_13.webp +OK. f79224e7 test/3pdata/webpsuite/lossless_vec_1_13.webp OK. f79224e7 test/3pdata/webpsuite/lossless_vec_1_14.webp -BAD df11692d test/3pdata/webpsuite/lossless_vec_1_15.webp +OK. f79224e7 test/3pdata/webpsuite/lossless_vec_1_15.webp OK. f79224e7 test/3pdata/webpsuite/lossless_vec_1_2.webp -BAD df11692d test/3pdata/webpsuite/lossless_vec_1_3.webp +OK. f79224e7 test/3pdata/webpsuite/lossless_vec_1_3.webp OK. f79224e7 test/3pdata/webpsuite/lossless_vec_1_4.webp OK. f79224e7 test/3pdata/webpsuite/lossless_vec_1_5.webp OK. f79224e7 test/3pdata/webpsuite/lossless_vec_1_6.webp -BAD df11692d test/3pdata/webpsuite/lossless_vec_1_7.webp +OK. f79224e7 test/3pdata/webpsuite/lossless_vec_1_7.webp OK. f79224e7 test/3pdata/webpsuite/lossless_vec_1_8.webp -BAD df11692d test/3pdata/webpsuite/lossless_vec_1_9.webp +OK. f79224e7 test/3pdata/webpsuite/lossless_vec_1_9.webp OK. 6706e719 test/3pdata/webpsuite/lossless_vec_2_0.webp OK. 6706e719 test/3pdata/webpsuite/lossless_vec_2_1.webp OK. 6706e719 test/3pdata/webpsuite/lossless_vec_2_10.webp -BAD 62adeb37 test/3pdata/webpsuite/lossless_vec_2_11.webp +OK. 6706e719 test/3pdata/webpsuite/lossless_vec_2_11.webp OK. 6706e719 test/3pdata/webpsuite/lossless_vec_2_12.webp -BAD 62adeb37 test/3pdata/webpsuite/lossless_vec_2_13.webp +OK. 6706e719 test/3pdata/webpsuite/lossless_vec_2_13.webp OK. 6706e719 test/3pdata/webpsuite/lossless_vec_2_14.webp -BAD 62adeb37 test/3pdata/webpsuite/lossless_vec_2_15.webp +OK. 
6706e719 test/3pdata/webpsuite/lossless_vec_2_15.webp OK. 6706e719 test/3pdata/webpsuite/lossless_vec_2_2.webp -BAD 62adeb37 test/3pdata/webpsuite/lossless_vec_2_3.webp +OK. 6706e719 test/3pdata/webpsuite/lossless_vec_2_3.webp OK. 6706e719 test/3pdata/webpsuite/lossless_vec_2_4.webp OK. 6706e719 test/3pdata/webpsuite/lossless_vec_2_5.webp OK. 6706e719 test/3pdata/webpsuite/lossless_vec_2_6.webp -BAD 62adeb37 test/3pdata/webpsuite/lossless_vec_2_7.webp +OK. 6706e719 test/3pdata/webpsuite/lossless_vec_2_7.webp OK. 6706e719 test/3pdata/webpsuite/lossless_vec_2_8.webp -BAD 62adeb37 test/3pdata/webpsuite/lossless_vec_2_9.webp -OK. d1ed71a2 test/3pdata/webpsuite/lossy_extreme_probabilities.webp -OK. 2e88e913 test/3pdata/webpsuite/lossy_q0_f100.webp +OK. 6706e719 test/3pdata/webpsuite/lossless_vec_2_9.webp +OK. 263b111e test/3pdata/webpsuite/lossy_alpha1.webp +OK. 5f94bc6a test/3pdata/webpsuite/lossy_alpha2.webp +OK. e90a5b5f test/3pdata/webpsuite/lossy_alpha3.webp +OK. 840d516e test/3pdata/webpsuite/lossy_alpha4.webp +OK. 5a8a55a4 test/3pdata/webpsuite/lossy_extreme_probabilities.webp +OK. 92424a24 test/3pdata/webpsuite/lossy_q0_f100.webp OK. 6957defa test/3pdata/webpsuite/near_lossless_75.webp OK. 7c80a001 test/3pdata/webpsuite/one_color_no_palette.webp OK. 6706e719 test/3pdata/webpsuite/peak.bmp OK. 315bbd77 test/3pdata/webpsuite/peak.pgm OK. 6706e719 test/3pdata/webpsuite/peak.png OK. 6706e719 test/3pdata/webpsuite/peak.ppm -OK. 69f5f4ec test/3pdata/webpsuite/segment01.webp -OK. 69f5f4ec test/3pdata/webpsuite/segment02.webp -OK. 69f5f4ec test/3pdata/webpsuite/segment03.webp -OK. 61e94e53 test/3pdata/webpsuite/small_13x1.webp -OK. 2d7e6391 test/3pdata/webpsuite/small_1x1.webp -OK. 7c595111 test/3pdata/webpsuite/small_1x13.webp -OK. 9240e9d0 test/3pdata/webpsuite/small_31x13.webp -OK. 2e88e913 test/3pdata/webpsuite/test-nostrong.webp -OK. 2e88e913 test/3pdata/webpsuite/test.webp -OK. 8b56f1b6 test/3pdata/webpsuite/very_short.webp -OK. 
c42a950e test/3pdata/webpsuite/vp80-00-comprehensive-001.webp -OK. c42a950e test/3pdata/webpsuite/vp80-00-comprehensive-002.webp -OK. c42a950e test/3pdata/webpsuite/vp80-00-comprehensive-003.webp -OK. c42a950e test/3pdata/webpsuite/vp80-00-comprehensive-004.webp -OK. c42a950e test/3pdata/webpsuite/vp80-00-comprehensive-005.webp -OK. e3e955c2 test/3pdata/webpsuite/vp80-00-comprehensive-006.webp -OK. c42a950e test/3pdata/webpsuite/vp80-00-comprehensive-007.webp -OK. 69cf2fa7 test/3pdata/webpsuite/vp80-00-comprehensive-008.webp -OK. c42a950e test/3pdata/webpsuite/vp80-00-comprehensive-009.webp -OK. 3796a852 test/3pdata/webpsuite/vp80-00-comprehensive-010.webp -OK. c42a950e test/3pdata/webpsuite/vp80-00-comprehensive-011.webp -OK. c42a950e test/3pdata/webpsuite/vp80-00-comprehensive-012.webp -OK. c42a950e test/3pdata/webpsuite/vp80-00-comprehensive-013.webp -OK. e3e955c2 test/3pdata/webpsuite/vp80-00-comprehensive-014.webp -OK. 3796a852 test/3pdata/webpsuite/vp80-00-comprehensive-015.webp -OK. c42a950e test/3pdata/webpsuite/vp80-00-comprehensive-016.webp -OK. c42a950e test/3pdata/webpsuite/vp80-00-comprehensive-017.webp -OK. c42a950e test/3pdata/webpsuite/vp80-01-intra-1400.webp -OK. 4937f445 test/3pdata/webpsuite/vp80-01-intra-1411.webp -OK. c42a950e test/3pdata/webpsuite/vp80-01-intra-1416.webp -OK. c42a950e test/3pdata/webpsuite/vp80-01-intra-1417.webp -OK. c42a950e test/3pdata/webpsuite/vp80-02-inter-1402.webp -OK. 4937f445 test/3pdata/webpsuite/vp80-02-inter-1412.webp -OK. 96b4ed67 test/3pdata/webpsuite/vp80-02-inter-1418.webp -OK. c42a950e test/3pdata/webpsuite/vp80-02-inter-1424.webp -OK. c42a950e test/3pdata/webpsuite/vp80-03-segmentation-1401.webp -OK. c42a950e test/3pdata/webpsuite/vp80-03-segmentation-1403.webp -OK. 1187f564 test/3pdata/webpsuite/vp80-03-segmentation-1407.webp -OK. 1187f564 test/3pdata/webpsuite/vp80-03-segmentation-1408.webp -OK. 1187f564 test/3pdata/webpsuite/vp80-03-segmentation-1409.webp -OK. 
1187f564 test/3pdata/webpsuite/vp80-03-segmentation-1410.webp -OK. 4937f445 test/3pdata/webpsuite/vp80-03-segmentation-1413.webp -OK. 3796a852 test/3pdata/webpsuite/vp80-03-segmentation-1414.webp -OK. 3796a852 test/3pdata/webpsuite/vp80-03-segmentation-1415.webp -OK. c42a950e test/3pdata/webpsuite/vp80-03-segmentation-1425.webp -OK. 1187f564 test/3pdata/webpsuite/vp80-03-segmentation-1426.webp -OK. 1187f564 test/3pdata/webpsuite/vp80-03-segmentation-1427.webp -OK. 1187f564 test/3pdata/webpsuite/vp80-03-segmentation-1432.webp -OK. 1187f564 test/3pdata/webpsuite/vp80-03-segmentation-1435.webp -OK. 1187f564 test/3pdata/webpsuite/vp80-03-segmentation-1436.webp -OK. 1187f564 test/3pdata/webpsuite/vp80-03-segmentation-1437.webp -OK. 1187f564 test/3pdata/webpsuite/vp80-03-segmentation-1441.webp -OK. 1187f564 test/3pdata/webpsuite/vp80-03-segmentation-1442.webp -OK. c42a950e test/3pdata/webpsuite/vp80-04-partitions-1404.webp -OK. c42a950e test/3pdata/webpsuite/vp80-04-partitions-1405.webp -OK. c42a950e test/3pdata/webpsuite/vp80-04-partitions-1406.webp -OK. 1187f564 test/3pdata/webpsuite/vp80-05-sharpness-1428.webp -OK. 1187f564 test/3pdata/webpsuite/vp80-05-sharpness-1429.webp -OK. 1187f564 test/3pdata/webpsuite/vp80-05-sharpness-1430.webp -OK. 1187f564 test/3pdata/webpsuite/vp80-05-sharpness-1431.webp -OK. 1187f564 test/3pdata/webpsuite/vp80-05-sharpness-1433.webp -OK. 1187f564 test/3pdata/webpsuite/vp80-05-sharpness-1434.webp -OK. 1187f564 test/3pdata/webpsuite/vp80-05-sharpness-1438.webp -OK. 1187f564 test/3pdata/webpsuite/vp80-05-sharpness-1439.webp -OK. 1187f564 test/3pdata/webpsuite/vp80-05-sharpness-1440.webp -OK. 35ab76d3 test/3pdata/webpsuite/vp80-05-sharpness-1443.webp +OK. 88ca241b test/3pdata/webpsuite/segment01.webp +OK. cf500318 test/3pdata/webpsuite/segment02.webp +OK. a9a3881d test/3pdata/webpsuite/segment03.webp +OK. 9fba8052 test/3pdata/webpsuite/small_13x1.webp +OK. f08db259 test/3pdata/webpsuite/small_1x1.webp +OK. 
ae403f6e test/3pdata/webpsuite/small_1x13.webp +OK. 70c37cb6 test/3pdata/webpsuite/small_31x13.webp +OK. 80c1ff42 test/3pdata/webpsuite/test-nostrong.webp +OK. 631f6a02 test/3pdata/webpsuite/test.webp +OK. b1e2e316 test/3pdata/webpsuite/very_short.webp +OK. ff7d9585 test/3pdata/webpsuite/vp80-00-comprehensive-001.webp +OK. 684b846a test/3pdata/webpsuite/vp80-00-comprehensive-002.webp +OK. d3127a10 test/3pdata/webpsuite/vp80-00-comprehensive-003.webp +OK. ff7d9585 test/3pdata/webpsuite/vp80-00-comprehensive-004.webp +OK. c5600ecc test/3pdata/webpsuite/vp80-00-comprehensive-005.webp +OK. 3836d469 test/3pdata/webpsuite/vp80-00-comprehensive-006.webp +OK. 04f7f4c9 test/3pdata/webpsuite/vp80-00-comprehensive-007.webp +OK. e06af858 test/3pdata/webpsuite/vp80-00-comprehensive-008.webp +OK. 9458b100 test/3pdata/webpsuite/vp80-00-comprehensive-009.webp +OK. 954647a5 test/3pdata/webpsuite/vp80-00-comprehensive-010.webp +OK. ff7d9585 test/3pdata/webpsuite/vp80-00-comprehensive-011.webp +OK. 6732c101 test/3pdata/webpsuite/vp80-00-comprehensive-012.webp +OK. df21b5c8 test/3pdata/webpsuite/vp80-00-comprehensive-013.webp +OK. de3154f1 test/3pdata/webpsuite/vp80-00-comprehensive-014.webp +OK. 6291f264 test/3pdata/webpsuite/vp80-00-comprehensive-015.webp +OK. 3346ef2b test/3pdata/webpsuite/vp80-00-comprehensive-016.webp +OK. 3346ef2b test/3pdata/webpsuite/vp80-00-comprehensive-017.webp +OK. 34a0adbb test/3pdata/webpsuite/vp80-01-intra-1400.webp +OK. 2cda6f62 test/3pdata/webpsuite/vp80-01-intra-1411.webp +OK. a321721b test/3pdata/webpsuite/vp80-01-intra-1416.webp +OK. 754a4ff8 test/3pdata/webpsuite/vp80-01-intra-1417.webp +OK. 34a0adbb test/3pdata/webpsuite/vp80-02-inter-1402.webp +OK. 2cda6f62 test/3pdata/webpsuite/vp80-02-inter-1412.webp +OK. 0bb72bca test/3pdata/webpsuite/vp80-02-inter-1418.webp +OK. 93739a20 test/3pdata/webpsuite/vp80-02-inter-1424.webp +OK. 34a0adbb test/3pdata/webpsuite/vp80-03-segmentation-1401.webp +OK. 
34a0adbb test/3pdata/webpsuite/vp80-03-segmentation-1403.webp +OK. aaea276b test/3pdata/webpsuite/vp80-03-segmentation-1407.webp +OK. aaea276b test/3pdata/webpsuite/vp80-03-segmentation-1408.webp +OK. aaea276b test/3pdata/webpsuite/vp80-03-segmentation-1409.webp +OK. aaea276b test/3pdata/webpsuite/vp80-03-segmentation-1410.webp +OK. 2cda6f62 test/3pdata/webpsuite/vp80-03-segmentation-1413.webp +OK. 65fb3f68 test/3pdata/webpsuite/vp80-03-segmentation-1414.webp +OK. 65fb3f68 test/3pdata/webpsuite/vp80-03-segmentation-1415.webp +OK. 3819b4d7 test/3pdata/webpsuite/vp80-03-segmentation-1425.webp +OK. 981e1d2e test/3pdata/webpsuite/vp80-03-segmentation-1426.webp +OK. 57780055 test/3pdata/webpsuite/vp80-03-segmentation-1427.webp +OK. caf44365 test/3pdata/webpsuite/vp80-03-segmentation-1432.webp +OK. 17d6af96 test/3pdata/webpsuite/vp80-03-segmentation-1435.webp +OK. 792cc4dc test/3pdata/webpsuite/vp80-03-segmentation-1436.webp +OK. a0c109d5 test/3pdata/webpsuite/vp80-03-segmentation-1437.webp +OK. 88f3ed22 test/3pdata/webpsuite/vp80-03-segmentation-1441.webp +OK. ad7b1ced test/3pdata/webpsuite/vp80-03-segmentation-1442.webp +OK. 34a0adbb test/3pdata/webpsuite/vp80-04-partitions-1404.webp +OK. 34a0adbb test/3pdata/webpsuite/vp80-04-partitions-1405.webp +OK. 34a0adbb test/3pdata/webpsuite/vp80-04-partitions-1406.webp +OK. deb943fc test/3pdata/webpsuite/vp80-05-sharpness-1428.webp +OK. eaffae2b test/3pdata/webpsuite/vp80-05-sharpness-1429.webp +OK. 88ca32c7 test/3pdata/webpsuite/vp80-05-sharpness-1430.webp +OK. 386ff4b2 test/3pdata/webpsuite/vp80-05-sharpness-1431.webp +OK. 792cc4dc test/3pdata/webpsuite/vp80-05-sharpness-1433.webp +OK. 8220ff9f test/3pdata/webpsuite/vp80-05-sharpness-1434.webp +OK. 2b2a6b87 test/3pdata/webpsuite/vp80-05-sharpness-1438.webp +OK. 7342903f test/3pdata/webpsuite/vp80-05-sharpness-1439.webp +OK. 792cc4dc test/3pdata/webpsuite/vp80-05-sharpness-1440.webp +OK. 
a5c713d9 test/3pdata/webpsuite/vp80-05-sharpness-1443.webp diff --git a/test/c/mimiclib/webp.c b/test/c/mimiclib/webp.c index 2899bd3fc..378336b38 100644 --- a/test/c/mimiclib/webp.c +++ b/test/c/mimiclib/webp.c @@ -42,7 +42,10 @@ mimic_webp_decode(uint64_t* n_bytes_out, WebPDecoderConfig config; if (!WebPInitDecoderConfig(&config)) { return "mimic_webp_decode: WebPInitDecoderConfig failed"; - } else if (WebPGetFeatures(wuffs_base__io_buffer__reader_pointer(src), + } + // Disable fancy upsampling to match Wuffs' box filter chroma upsampling. + config.options.no_fancy_upsampling = 1; + if (WebPGetFeatures(wuffs_base__io_buffer__reader_pointer(src), wuffs_base__io_buffer__reader_length(src), &config.input) != VP8_STATUS_OK) { return "mimic_webp_decode: WebPGetFeatures failed"; diff --git a/test/c/std/webp.c b/test/c/std/webp.c index 23d3b19f6..388d7879f 100644 --- a/test/c/std/webp.c +++ b/test/c/std/webp.c @@ -33,7 +33,7 @@ the first "./a.out" with "./a.out -bench". Combine these changes with the "wuffs mimic cflags" to run the mimic benchmarks. 
*/ -// ¿ wuffs mimic cflags: -DWUFFS_MIMIC -lwebp +// ¿ wuffs mimic cflags: -DWUFFS_MIMIC -I/opt/homebrew/opt/webp/include -L/opt/homebrew/opt/webp/lib -lwebp // Wuffs ships as a "single file C library" or "header file library" as per // https://github.com/nothings/stb/blob/master/docs/stb_howto.txt @@ -88,30 +88,39 @@ wuffs_webp_decode(uint64_t* n_bytes_out, // -------- const char* // -test_wuffs_webp_decode_interface_lossless() { - CHECK_FOCUS(__func__); +wuffs_webp_decode_interface(const char* filename, + uint32_t width, + uint32_t height, + uint32_t want_hash) { wuffs_webp__decoder* dec = &g_webp_decoder; CHECK_STATUS("initialize", wuffs_webp__decoder__initialize( dec, sizeof *dec, WUFFS_VERSION, WUFFS_INITIALIZE__LEAVE_INTERNAL_BUFFERS_UNINITIALIZED)); return do_test__wuffs_base__image_decoder( - wuffs_webp__decoder__upcast_as__wuffs_base__image_decoder(dec), - "test/data/bricks-color.lossless.webp", 0, SIZE_MAX, 160, 120, - 0xFF022460); + wuffs_webp__decoder__upcast_as__wuffs_base__image_decoder(dec), filename, + 0, SIZE_MAX, width, height, want_hash); +} + +const char* // +test_wuffs_webp_decode_interface_lossless() { + CHECK_FOCUS(__func__); + return wuffs_webp_decode_interface( + "test/data/bricks-color.lossless.webp", 160, 120, 0xFF022460); } const char* // test_wuffs_webp_decode_interface_lossy() { CHECK_FOCUS(__func__); - wuffs_webp__decoder* dec = &g_webp_decoder; - CHECK_STATUS("initialize", - wuffs_webp__decoder__initialize( - dec, sizeof *dec, WUFFS_VERSION, - WUFFS_INITIALIZE__LEAVE_INTERNAL_BUFFERS_UNINITIALIZED)); - return do_test__wuffs_base__image_decoder( - wuffs_webp__decoder__upcast_as__wuffs_base__image_decoder(dec), - "test/data/bricks-color.lossy.webp", 0, SIZE_MAX, 160, 120, 0xFF9F7780); + return wuffs_webp_decode_interface( + "test/data/bricks-color.lossy.webp", 160, 120, 0xFF032665); +} + +const char* // +test_wuffs_webp_decode_interface_lossy_with_alpha() { + CHECK_FOCUS(__func__); + return wuffs_webp_decode_interface( + 
"test/data/bricks-color.lossy-with-alpha.webp", 16, 16, 0x64427FFF); } // ---------------- Mimic Tests @@ -177,6 +186,30 @@ test_mimic_webp_lossless_decode_image_4002k_24bpp() { return do_test_mimic_webp_decode("test/data/harvesters.lossless.webp"); } +const char* // +test_mimic_webp_lossy_decode_image_2k_24bpp() { + CHECK_FOCUS(__func__); + return do_test_mimic_webp_decode("test/data/bricks-gray.lossy.webp"); +} + +const char* // +test_mimic_webp_lossy_decode_image_3k_24bpp() { + CHECK_FOCUS(__func__); + return do_test_mimic_webp_decode("test/data/hat.lossy.webp"); +} + +const char* // +test_mimic_webp_lossy_decode_image_6k_24bpp() { + CHECK_FOCUS(__func__); + return do_test_mimic_webp_decode("test/data/hibiscus.primitive.lossy.webp"); +} + +const char* // +test_mimic_webp_lossy_decode_image_174k_24bpp() { + CHECK_FOCUS(__func__); + return do_test_mimic_webp_decode("test/data/harvesters.lossy.webp"); +} + #endif // WUFFS_MIMIC // ---------------- WebP Benches @@ -232,6 +265,36 @@ bench_wuffs_webp_lossless_decode_image_4002k_24bpp() { NULL, 0, "test/data/harvesters.lossless.webp", 0, SIZE_MAX, 1); } +const char* // +bench_wuffs_webp_lossy_decode_image_40k_24bpp() { + CHECK_FOCUS(__func__); + return do_bench_image_decode( + &wuffs_webp_decode, + WUFFS_INITIALIZE__LEAVE_INTERNAL_BUFFERS_UNINITIALIZED, + wuffs_base__make_pixel_format(WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL), + NULL, 0, "test/data/hat.lossy.webp", 0, SIZE_MAX, 30); +} + +const char* // +bench_wuffs_webp_lossy_decode_image_552k_24bpp() { + CHECK_FOCUS(__func__); + return do_bench_image_decode( + &wuffs_webp_decode, + WUFFS_INITIALIZE__LEAVE_INTERNAL_BUFFERS_UNINITIALIZED, + wuffs_base__make_pixel_format(WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL), + NULL, 0, "test/data/hibiscus.primitive.lossy.webp", 0, SIZE_MAX, 4); +} + +const char* // +bench_wuffs_webp_lossy_decode_image_4002k_24bpp() { + CHECK_FOCUS(__func__); + return do_bench_image_decode( + &wuffs_webp_decode, + 
WUFFS_INITIALIZE__LEAVE_INTERNAL_BUFFERS_UNINITIALIZED, + wuffs_base__make_pixel_format(WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL), + NULL, 0, "test/data/harvesters.lossy.webp", 0, SIZE_MAX, 1); +} + // ---------------- Mimic Benches #ifdef WUFFS_MIMIC @@ -287,6 +350,36 @@ bench_mimic_webp_lossless_decode_image_4002k_24bpp() { NULL, 0, "test/data/harvesters.lossless.webp", 0, SIZE_MAX, 1); } +const char* // +bench_mimic_webp_lossy_decode_image_40k_24bpp() { + CHECK_FOCUS(__func__); + return do_bench_image_decode( + &mimic_webp_decode, + WUFFS_INITIALIZE__LEAVE_INTERNAL_BUFFERS_UNINITIALIZED, + wuffs_base__make_pixel_format(WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL), + NULL, 0, "test/data/hat.lossy.webp", 0, SIZE_MAX, 30); +} + +const char* // +bench_mimic_webp_lossy_decode_image_552k_24bpp() { + CHECK_FOCUS(__func__); + return do_bench_image_decode( + &mimic_webp_decode, + WUFFS_INITIALIZE__LEAVE_INTERNAL_BUFFERS_UNINITIALIZED, + wuffs_base__make_pixel_format(WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL), + NULL, 0, "test/data/hibiscus.primitive.lossy.webp", 0, SIZE_MAX, 4); +} + +const char* // +bench_mimic_webp_lossy_decode_image_4002k_24bpp() { + CHECK_FOCUS(__func__); + return do_bench_image_decode( + &mimic_webp_decode, + WUFFS_INITIALIZE__LEAVE_INTERNAL_BUFFERS_UNINITIALIZED, + wuffs_base__make_pixel_format(WUFFS_BASE__PIXEL_FORMAT__BGRA_NONPREMUL), + NULL, 0, "test/data/harvesters.lossy.webp", 0, SIZE_MAX, 1); +} + #endif // WUFFS_MIMIC // ---------------- Manifest @@ -295,6 +388,7 @@ proc g_tests[] = { test_wuffs_webp_decode_interface_lossless, test_wuffs_webp_decode_interface_lossy, + test_wuffs_webp_decode_interface_lossy_with_alpha, #ifdef WUFFS_MIMIC @@ -304,6 +398,11 @@ proc g_tests[] = { test_mimic_webp_lossless_decode_image_552k_32bpp, test_mimic_webp_lossless_decode_image_4002k_24bpp, + test_mimic_webp_lossy_decode_image_2k_24bpp, + test_mimic_webp_lossy_decode_image_3k_24bpp, + test_mimic_webp_lossy_decode_image_6k_24bpp, + 
test_mimic_webp_lossy_decode_image_174k_24bpp, + #endif // WUFFS_MIMIC NULL, @@ -317,6 +416,10 @@ proc g_benches[] = { bench_wuffs_webp_lossless_decode_image_552k_32bpp, bench_wuffs_webp_lossless_decode_image_4002k_24bpp, + bench_wuffs_webp_lossy_decode_image_40k_24bpp, + bench_wuffs_webp_lossy_decode_image_552k_24bpp, + bench_wuffs_webp_lossy_decode_image_4002k_24bpp, + #ifdef WUFFS_MIMIC bench_mimic_webp_lossless_decode_image_19k_8bpp, @@ -325,6 +428,10 @@ proc g_benches[] = { bench_mimic_webp_lossless_decode_image_552k_32bpp, bench_mimic_webp_lossless_decode_image_4002k_24bpp, + bench_mimic_webp_lossy_decode_image_40k_24bpp, + bench_mimic_webp_lossy_decode_image_552k_24bpp, + bench_mimic_webp_lossy_decode_image_4002k_24bpp, + #endif // WUFFS_MIMIC NULL, diff --git a/test/data/bricks-color.lossy-with-alpha.webp b/test/data/bricks-color.lossy-with-alpha.webp new file mode 100644 index 000000000..18370a804 Binary files /dev/null and b/test/data/bricks-color.lossy-with-alpha.webp differ diff --git a/test/nia-checksums-of-data.txt b/test/nia-checksums-of-data.txt index 12ca826b6..85adaa644 100644 --- a/test/nia-checksums-of-data.txt +++ b/test/nia-checksums-of-data.txt @@ -58,7 +58,8 @@ OK. 5670f263 test/data/bricks-color.etc2.pkm OK. 1fef6814 test/data/bricks-color.handsum OK. 72a1f9cc test/data/bricks-color.jpeg OK. 076cb375 test/data/bricks-color.lossless.webp -OK. 9d451b1c test/data/bricks-color.lossy.webp +OK. 369d25d0 test/data/bricks-color.lossy-with-alpha.webp +OK. 3466b80b test/data/bricks-color.lossy.webp OK. 076cb375 test/data/bricks-color.png OK. 076cb375 test/data/bricks-color.qoi OK. 076cb375 test/data/bricks-color.tga @@ -71,7 +72,7 @@ OK. c2bce675 test/data/bricks-gray.bmp OK. c2bce675 test/data/bricks-gray.gif OK. 3a2478ad test/data/bricks-gray.jpeg OK. c2bce675 test/data/bricks-gray.lossless.webp -OK. 9d451b1c test/data/bricks-gray.lossy.webp +OK. cdba0ec1 test/data/bricks-gray.lossy.webp OK. c2bce675 test/data/bricks-gray.no-ancillary.png OK. 
c2bce675 test/data/bricks-gray.png OK. c2bce675 test/data/bricks-gray.tga @@ -89,27 +90,27 @@ OK. 3014b4c0 test/data/gifplayer-muybridge.gif OK. 030f5a48 test/data/harvesters.bmp OK. c18b3d5a test/data/harvesters.gif OK. f217df74 test/data/harvesters.jpeg -OK. 030f5a48 test/data/harvesters.lossless.webp -OK. a0a736f4 test/data/harvesters.lossy.webp +OK. a29dec22 test/data/harvesters.lossless.webp +OK. 406d5e8a test/data/harvesters.lossy.webp OK. 030f5a48 test/data/harvesters.png OK. e776c90f test/data/hat.bmp OK. 6dcba6a4 test/data/hat.gif OK. 2298f3ca test/data/hat.jpeg OK. e776c90f test/data/hat.lossless.webp -OK. 50993e8b test/data/hat.lossy.webp +OK. be7f0169 test/data/hat.lossy.webp OK. e776c90f test/data/hat.png OK. d30bfe5d test/data/hat.wbmp OK. 33a44f22 test/data/hibiscus.primitive.bmp OK. 25e212b3 test/data/hibiscus.primitive.gif OK. 9624fa44 test/data/hibiscus.primitive.jpeg OK. 33a44f22 test/data/hibiscus.primitive.lossless.webp -OK. 607f05b1 test/data/hibiscus.primitive.lossy.webp +OK. 09bbaab0 test/data/hibiscus.primitive.lossy.webp OK. 33a44f22 test/data/hibiscus.primitive.png OK. 60040742 test/data/hibiscus.regular.bmp OK. b727da8b test/data/hibiscus.regular.gif OK. 41e39405 test/data/hibiscus.regular.jpeg -OK. 60040742 test/data/hibiscus.regular.lossless.webp -OK. 607f05b1 test/data/hibiscus.regular.lossy.webp +OK. 7225a6d4 test/data/hibiscus.regular.lossless.webp +OK. 6bbe1ed2 test/data/hibiscus.regular.lossy.webp OK. 60040742 test/data/hibiscus.regular.png OK. dcbb225a test/data/hippopotamus.bmp OK. ed4b78fc test/data/hippopotamus.interlaced.gif @@ -118,7 +119,7 @@ BAD c3c4bd65 test/data/hippopotamus.interlaced.truncated.gif BAD 3feec847 test/data/hippopotamus.interlaced.truncated.png OK. 96bdbbb3 test/data/hippopotamus.jpeg OK. dcbb225a test/data/hippopotamus.lossless.webp -OK. d5577434 test/data/hippopotamus.lossy.webp +OK. 66ccbffa test/data/hippopotamus.lossy.webp OK. 
2535637e test/data/hippopotamus.masked-with-muybridge.etc2.bgra-binary.pkm OK. 720180ee test/data/hippopotamus.masked-with-muybridge.etc2.bgra-nonpremul.pkm OK. d3bbed27 test/data/hippopotamus.masked-with-muybridge.gif @@ -172,7 +173,7 @@ OK. bf7e8c96 test/data/pjw-thumbnail.bmp OK. bf7e8c96 test/data/pjw-thumbnail.gif OK. 7c67a37f test/data/pjw-thumbnail.jpeg OK. bf7e8c96 test/data/pjw-thumbnail.lossless.webp -OK. 61f9ea55 test/data/pjw-thumbnail.lossy.webp +OK. cf6f520e test/data/pjw-thumbnail.lossy.webp OK. bf7e8c96 test/data/pjw-thumbnail.png OK. 38cb4cbf test/data/red-blue-gradient.dcip3d65-no-chrm-no-gama.png OK. 38cb4cbf test/data/red-blue-gradient.gamma1dot0.png