From ea7a0ceec6041c12af370f6056cdf2939e0f82f5 Mon Sep 17 00:00:00 2001 From: Frank Plowman Date: Thu, 3 Aug 2023 12:07:35 +0100 Subject: [PATCH 01/13] lavc/vvc: Make ITX modular at 2D level Make the ITX DSP context inverse transform functions perform 2D transforms rather than 1D transforms. This will allow for more efficient assembly optimisations in future. --- libavcodec/vvc/vvc_intra.c | 63 +----------- libavcodec/vvc/vvcdsp.c | 38 ++++--- libavcodec/vvc/vvcdsp.h | 3 +- libavcodec/vvc/vvcdsp_template.c | 166 ++++++++++++++++++++++++++++--- 4 files changed, 181 insertions(+), 89 deletions(-) diff --git a/libavcodec/vvc/vvc_intra.c b/libavcodec/vvc/vvc_intra.c index 7af2af439b3..2e7c0b7b988 100644 --- a/libavcodec/vvc/vvc_intra.c +++ b/libavcodec/vvc/vvc_intra.c @@ -272,32 +272,6 @@ static void predict_intra(VVCLocalContext *lc, const TransformUnit *tu, const in } } -static void scale_clip(int *coeff, const int nzw, const int w, const int h, - const int shift, const int log2_transform_range) -{ - const int add = 1 << (shift - 1); - for (int y = 0; y < h; y++) { - int *p = coeff + y * w; - for (int x = 0; x < nzw; x++) { - *p = av_clip_intp2((*p + add) >> shift, log2_transform_range); - p++; - } - memset(p, 0, sizeof(*p) * (w - nzw)); - } -} - -static void scale(int *out, const int *in, const int w, const int h, const int shift) -{ - const int add = 1 << (shift - 1); - for (int y = 0; y < h; y++) { - for (int x = 0; x < w; x++) { - int *o = out + y * w + x; - const int *i = in + y * w + x; - *o = (*i + add) >> shift; - } - } -} - // part of 8.7.3 Scaling process for transform coefficients static void derive_qp(const VVCLocalContext *lc, const TransformUnit *tu, TransformBlock *tb) { @@ -441,35 +415,6 @@ static void dequant(const VVCLocalContext *lc, const TransformUnit *tu, Transfor } } -static void itx_2d(const VVCFrameContext *fc, TransformBlock *tb, const enum TxType trh, const enum TxType trv, int *temp) -{ - const VVCSPS *sps = fc->ps.sps; - const int w = 
tb->tb_width; - const int h = tb->tb_height; - const int nzw = tb->max_scan_x + 1; - - for (int x = 0; x < nzw; x++) - fc->vvcdsp.itx.itx[trv][tb->log2_tb_height - 1](temp + x, w, tb->coeffs + x, w); - scale_clip(temp, nzw, w, h, 7, sps->log2_transform_range); - - for (int y = 0; y < h; y++) - fc->vvcdsp.itx.itx[trh][tb->log2_tb_width - 1](tb->coeffs + y * w, 1, temp + y * w, 1); - scale(tb->coeffs, tb->coeffs, w, h, 5 + sps->log2_transform_range - sps->bit_depth); -} - -static void itx_1d(const VVCFrameContext *fc, TransformBlock *tb, const enum TxType trh, const enum TxType trv, int *temp) -{ - const VVCSPS *sps = fc->ps.sps; - const int w = tb->tb_width; - const int h = tb->tb_height; - - if (w > 1) - fc->vvcdsp.itx.itx[trh][tb->log2_tb_width - 1](temp, 1, tb->coeffs, 1); - else - fc->vvcdsp.itx.itx[trv][tb->log2_tb_height - 1](temp, 1, tb->coeffs, 1); - scale(tb->coeffs, temp, w, h, 6 + sps->log2_transform_range - sps->bit_depth); -} - static void transform_bdpcm(TransformBlock *tb, const VVCLocalContext *lc, const CodingUnit *cu) { const VVCSPS *sps = lc->fc->ps.sps; @@ -511,14 +456,14 @@ static void itransform(VVCLocalContext *lc, TransformUnit *tu, const int tu_idx, dequant(lc, tu, tb); if (!tb->ts) { enum TxType trh, trv; + const int nzw = tb->max_scan_x + 1; if (cu->apply_lfnst_flag[c_idx]) ilfnst_transform(lc, tb); derive_transform_type(fc, lc, tb, &trh, &trv); - if (w > 1 && h > 1) - itx_2d(fc, tb, trh, trv, temp); - else - itx_1d(fc, tb, trh, trv, temp); + + fc->vvcdsp.itx.itx[trh][trv][tb->log2_tb_width][tb->log2_tb_height]( + tb->coeffs, tb->coeffs, nzw, sps->log2_transform_range); } if (chroma_scale) diff --git a/libavcodec/vvc/vvcdsp.c b/libavcodec/vvc/vvcdsp.c index 1056cb8ff9f..baff205f551 100644 --- a/libavcodec/vvc/vvcdsp.c +++ b/libavcodec/vvc/vvcdsp.c @@ -254,23 +254,31 @@ static int vvc_sad(const int16_t *src0, const int16_t *src1, int dx, int dy, return sad; } -#define itx_fn(type, s) \ -static void itx_##type##_##s(int *out, ptrdiff_t 
out_step, const int *in, ptrdiff_t in_step) \ -{ \ - ff_vvc_inv_##type##_##s(out, out_step, in, in_step); \ +static void scale_clip(int *coeff, const int nzw, const int w, const int h, + const int shift, const int log2_transform_range) +{ + const int add = 1 << (shift - 1); + for (int y = 0; y < h; y++) { + int *p = coeff + y * w; + for (int x = 0; x < nzw; x++) { + *p = av_clip_intp2((*p + add) >> shift, log2_transform_range); + p++; + } + memset(p, 0, sizeof(*p) * (w - nzw)); + } } -#define itx_fn_common(type) \ - itx_fn(type, 4); \ - itx_fn(type, 8); \ - itx_fn(type, 16); \ - itx_fn(type, 32); \ - -itx_fn_common(dct2); -itx_fn_common(dst7); -itx_fn_common(dct8); -itx_fn(dct2, 2); -itx_fn(dct2, 64); +static void scale(int *out, const int *in, const int w, const int h, const int shift) +{ + const int add = 1 << (shift - 1); + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + int *o = out + y * w + x; + const int *i = in + y * w + x; + *o = (*i + add) >> shift; + } + } +} typedef struct IntraEdgeParams { uint8_t* top; diff --git a/libavcodec/vvc/vvcdsp.h b/libavcodec/vvc/vvcdsp.h index af5133eca8d..26683b2e915 100644 --- a/libavcodec/vvc/vvcdsp.h +++ b/libavcodec/vvc/vvcdsp.h @@ -35,6 +35,7 @@ enum TxType { }; enum TxSize { + TX_SIZE_1 = 0, TX_SIZE_2, TX_SIZE_4, TX_SIZE_8, @@ -120,7 +121,7 @@ typedef struct VVCItxDSPContext { void (*add_residual_joint)(uint8_t *dst, const int *res, int width, int height, ptrdiff_t stride, int c_sign, int shift); void (*pred_residual_joint)(int *buf, int width, int height, int c_sign, int shift); - void (*itx[N_TX_TYPE][N_TX_SIZE])(int *out, ptrdiff_t out_step, const int *in, ptrdiff_t in_step); + void (*itx[N_TX_TYPE][N_TX_TYPE][N_TX_SIZE][N_TX_SIZE])(int *dst, const int *coeff, int nzw, int log2_transform_range); void (*transform_bdpcm)(int *coeffs, int width, int height, int vertical, int log2_transform_range); } VVCItxDSPContext; diff --git a/libavcodec/vvc/vvcdsp_template.c b/libavcodec/vvc/vvcdsp_template.c index 
d3998e633f6..328863d1d60 100644 --- a/libavcodec/vvc/vvcdsp_template.c +++ b/libavcodec/vvc/vvcdsp_template.c @@ -23,6 +23,7 @@ #include "libavcodec/bit_depth_template.c" #include "vvcdec.h" +#include "vvc_itx_1d.h" #include "vvc_inter_template.c" #include "vvc_intra_template.c" @@ -93,26 +94,163 @@ static void FUNC(transform_bdpcm)(int *coeffs, const int width, const int height } } -static void FUNC(ff_vvc_itx_dsp_init)(VVCItxDSPContext *const itx) -{ -#define VVC_ITX(TYPE, type, s) \ - itx->itx[TYPE][TX_SIZE_##s] = itx_##type##_##s; \ +#define ITX_COMMON_SIZES(TYPE_H, type_h, TYPE_V, type_v) \ + ITX_1D_V(TYPE_H, type_h, TYPE_V, type_v, 1, 4); \ + ITX_1D_V(TYPE_H, type_h, TYPE_V, type_v, 1, 8); \ + ITX_1D_V(TYPE_H, type_h, TYPE_V, type_v, 1, 16); \ + ITX_1D_V(TYPE_H, type_h, TYPE_V, type_v, 1, 32); \ + ITX_1D_H(TYPE_H, type_h, TYPE_V, type_v, 4, 1); \ + ITX_2D(TYPE_H, type_h, TYPE_V, type_v, 4, 4); \ + ITX_2D(TYPE_H, type_h, TYPE_V, type_v, 4, 8); \ + ITX_2D(TYPE_H, type_h, TYPE_V, type_v, 4, 16); \ + ITX_2D(TYPE_H, type_h, TYPE_V, type_v, 4, 32); \ + ITX_1D_H(TYPE_H, type_h, TYPE_V, type_v, 8, 1); \ + ITX_2D(TYPE_H, type_h, TYPE_V, type_v, 8, 4); \ + ITX_2D(TYPE_H, type_h, TYPE_V, type_v, 8, 8); \ + ITX_2D(TYPE_H, type_h, TYPE_V, type_v, 8, 16); \ + ITX_2D(TYPE_H, type_h, TYPE_V, type_v, 8, 32); \ + ITX_1D_H(TYPE_H, type_h, TYPE_V, type_v, 16, 1); \ + ITX_2D(TYPE_H, type_h, TYPE_V, type_v, 16, 4); \ + ITX_2D(TYPE_H, type_h, TYPE_V, type_v, 16, 8); \ + ITX_2D(TYPE_H, type_h, TYPE_V, type_v, 16, 16); \ + ITX_2D(TYPE_H, type_h, TYPE_V, type_v, 16, 32); \ + ITX_1D_H(TYPE_H, type_h, TYPE_V, type_v, 32, 1); \ + ITX_2D(TYPE_H, type_h, TYPE_V, type_v, 32, 4); \ + ITX_2D(TYPE_H, type_h, TYPE_V, type_v, 32, 8); \ + ITX_2D(TYPE_H, type_h, TYPE_V, type_v, 32, 16); \ + ITX_2D(TYPE_H, type_h, TYPE_V, type_v, 32, 32); -#define VVC_ITX_COMMON(TYPE, type) \ - VVC_ITX(TYPE, type, 4); \ - VVC_ITX(TYPE, type, 8); \ - VVC_ITX(TYPE, type, 16); \ - VVC_ITX(TYPE, type, 32); +#define ITX 
\ + ITX_COMMON_SIZES(DCT2, dct2, DCT2, dct2); \ + ITX_COMMON_SIZES(DCT2, dct2, DST7, dst7); \ + ITX_COMMON_SIZES(DCT2, dct2, DCT8, dct8); \ + ITX_COMMON_SIZES(DST7, dst7, DCT2, dct2); \ + ITX_COMMON_SIZES(DST7, dst7, DST7, dst7); \ + ITX_COMMON_SIZES(DST7, dst7, DCT8, dct8); \ + ITX_COMMON_SIZES(DCT8, dct8, DCT2, dct2); \ + ITX_COMMON_SIZES(DCT8, dct8, DST7, dst7); \ + ITX_COMMON_SIZES(DCT8, dct8, DCT8, dct8); \ + ITX_1D_V(DCT2, dct2, DCT2, dct2, 1, 2); \ + ITX_1D_V(DCT2, dct2, DCT2, dct2, 1, 64); \ + ITX_1D_H(DCT2, dct2, DCT2, dct2, 2, 1); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 2, 2); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 2, 4); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 2, 8); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 2, 16); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 2, 32); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 2, 64); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 4, 2); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 4, 64); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 8, 2); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 8, 64); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 16, 2); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 16, 64); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 32, 2); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 32, 64); \ + ITX_1D_H(DCT2, dct2, DCT2, dct2, 64, 1); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 64, 2); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 64, 4); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 64, 8); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 64, 16); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 64, 32); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 64, 64); \ + ITX_1D_H(DCT2, dct2, DST7, dst7, 2, 1); \ + ITX_2D(DCT2, dct2, DST7, dst7, 2, 4); \ + ITX_2D(DCT2, dct2, DST7, dst7, 2, 8); \ + ITX_2D(DCT2, dct2, DST7, dst7, 2, 16); \ + ITX_2D(DCT2, dct2, DST7, dst7, 2, 32); \ + ITX_1D_H(DCT2, dct2, DST7, dst7, 64, 1); \ + ITX_2D(DCT2, dct2, DST7, dst7, 64, 4); \ + ITX_2D(DCT2, dct2, DST7, dst7, 64, 8); \ + ITX_2D(DCT2, dct2, DST7, dst7, 64, 16); \ + ITX_2D(DCT2, dct2, DST7, dst7, 64, 32); \ + ITX_1D_H(DCT2, dct2, DCT8, dct8, 2, 1); \ + ITX_2D(DCT2, dct2, DCT8, dct8, 2, 4); \ + ITX_2D(DCT2, dct2, DCT8, 
dct8, 2, 8); \ + ITX_2D(DCT2, dct2, DCT8, dct8, 2, 16); \ + ITX_2D(DCT2, dct2, DCT8, dct8, 2, 32); \ + ITX_1D_H(DCT2, dct2, DCT8, dct8, 64, 1); \ + ITX_2D(DCT2, dct2, DCT8, dct8, 64, 4); \ + ITX_2D(DCT2, dct2, DCT8, dct8, 64, 8); \ + ITX_2D(DCT2, dct2, DCT8, dct8, 64, 16); \ + ITX_2D(DCT2, dct2, DCT8, dct8, 64, 32); \ + ITX_1D_V(DST7, dst7, DCT2, dct2, 1, 2); \ + ITX_2D(DST7, dst7, DCT2, dct2, 4, 2); \ + ITX_2D(DST7, dst7, DCT2, dct2, 8, 2); \ + ITX_2D(DST7, dst7, DCT2, dct2, 16, 2); \ + ITX_2D(DST7, dst7, DCT2, dct2, 32, 2); \ + ITX_1D_V(DST7, dst7, DCT2, dct2, 1, 64); \ + ITX_2D(DST7, dst7, DCT2, dct2, 4, 64); \ + ITX_2D(DST7, dst7, DCT2, dct2, 8, 64); \ + ITX_2D(DST7, dst7, DCT2, dct2, 16, 64); \ + ITX_2D(DST7, dst7, DCT2, dct2, 32, 64); \ + ITX_1D_V(DCT8, dct8, DCT2, dct2, 1, 2); \ + ITX_2D(DCT8, dct8, DCT2, dct2, 4, 2); \ + ITX_2D(DCT8, dct8, DCT2, dct2, 8, 2); \ + ITX_2D(DCT8, dct8, DCT2, dct2, 16, 2); \ + ITX_2D(DCT8, dct8, DCT2, dct2, 32, 2); \ + ITX_1D_V(DCT8, dct8, DCT2, dct2, 1, 64); \ + ITX_2D(DCT8, dct8, DCT2, dct2, 4, 64); \ + ITX_2D(DCT8, dct8, DCT2, dct2, 8, 64); \ + ITX_2D(DCT8, dct8, DCT2, dct2, 16, 64); \ + ITX_2D(DCT8, dct8, DCT2, dct2, 32, 64); + +// ITX function prototypes +#undef ITX_2D +#define ITX_2D(TYPE_H, type_h, TYPE_V, type_v, width, height) \ +static void FUNC(inv_##type_h##_##type_v##_##width##x##height)(int *dst, \ + const int *coeff, int nzw, int log2_transform_range) \ +{ \ + DECLARE_ALIGNED(32, int, temp)[width * height]; \ + \ + for (int x = 0; x < nzw; x++) \ + ff_vvc_inv_##type_v##_##height(temp + x, width, coeff + x, width); \ + \ + scale_clip(temp, width, width, height, 7, log2_transform_range); \ + \ + for (int y = 0; y < height; y++) \ + ff_vvc_inv_##type_h##_##width(dst + y * width, 1, temp + y * width, 1); \ + \ + scale(dst, dst, width, height, 5 + log2_transform_range - BIT_DEPTH); \ +} +#undef ITX_1D_H +#define ITX_1D_H(TYPE_H, type_h, TYPE_V, type_v, width, height) \ +static void 
FUNC(inv_##type_h##_##type_v##_##width##x##height)(int *dst, \ + const int *coeff, int nzw, int log2_transform_range) \ +{ \ + DECLARE_ALIGNED(32, int, temp)[width * height]; \ + \ + ff_vvc_inv_##type_h##_##width(temp, 1, coeff, 1); \ + scale(dst, temp, width, height, 6 + log2_transform_range - BIT_DEPTH); \ +} +#undef ITX_1D_V +#define ITX_1D_V(TYPE_H, type_h, TYPE_V, type_v, width, height) \ +static void FUNC(inv_##type_h##_##type_v##_##width##x##height)(int *dst, \ + const int *coeff, int nzw, int log2_transform_range) \ +{ \ + DECLARE_ALIGNED(32, int, temp)[width * height]; \ + \ + ff_vvc_inv_##type_v##_##height(temp, 1, coeff, 1); \ + scale(dst, temp, width, height, 6 + log2_transform_range - BIT_DEPTH); \ +} +ITX +static void FUNC(ff_vvc_itx_dsp_init)(VVCItxDSPContext *const itx) +{ itx->add_residual = FUNC(add_residual); itx->add_residual_joint = FUNC(add_residual_joint); itx->pred_residual_joint = FUNC(pred_residual_joint); itx->transform_bdpcm = FUNC(transform_bdpcm); - VVC_ITX(DCT2, dct2, 2) - VVC_ITX(DCT2, dct2, 64) - VVC_ITX_COMMON(DCT2, dct2) - VVC_ITX_COMMON(DCT8, dct8) - VVC_ITX_COMMON(DST7, dst7) +#undef ITX_2D +#define ITX_2D(TYPE_H, type_h, TYPE_V, type_v, width, height) \ + itx->itx[TYPE_H][TYPE_V][TX_SIZE_##width][TX_SIZE_##height] = FUNC(inv_##type_h##_##type_v##_##width##x##height); +#undef ITX_1D_H +#define ITX_1D_H(TYPE_H, type_h, TYPE_V, type_v, width, height) \ + ITX_2D(TYPE_H, type_h, TYPE_V, type_v, width, height) +#undef ITX_1D_V +#define ITX_1D_V(TYPE_H, type_h, TYPE_V, type_v, width, height) \ + ITX_2D(TYPE_H, type_h, TYPE_V, type_v, width, height) + ITX #undef VVC_ITX #undef VVC_ITX_COMMON From 76a20a76b097e8f700e58d910c5d57a252e06bac Mon Sep 17 00:00:00 2001 From: Frank Plowman Date: Thu, 3 Aug 2023 13:51:03 +0100 Subject: [PATCH 02/13] lavc/vvc: Fix ITX non-zero width usage --- libavcodec/vvc/vvc_intra.c | 3 ++- libavcodec/vvc/vvcdsp_template.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git 
a/libavcodec/vvc/vvc_intra.c b/libavcodec/vvc/vvc_intra.c index 2e7c0b7b988..6e2c7620ad2 100644 --- a/libavcodec/vvc/vvc_intra.c +++ b/libavcodec/vvc/vvc_intra.c @@ -456,12 +456,13 @@ static void itransform(VVCLocalContext *lc, TransformUnit *tu, const int tu_idx, dequant(lc, tu, tb); if (!tb->ts) { enum TxType trh, trv; - const int nzw = tb->max_scan_x + 1; + int nzw; if (cu->apply_lfnst_flag[c_idx]) ilfnst_transform(lc, tb); derive_transform_type(fc, lc, tb, &trh, &trv); + nzw = tb->max_scan_x + 1; fc->vvcdsp.itx.itx[trh][trv][tb->log2_tb_width][tb->log2_tb_height]( tb->coeffs, tb->coeffs, nzw, sps->log2_transform_range); } diff --git a/libavcodec/vvc/vvcdsp_template.c b/libavcodec/vvc/vvcdsp_template.c index 328863d1d60..886b6d5f65a 100644 --- a/libavcodec/vvc/vvcdsp_template.c +++ b/libavcodec/vvc/vvcdsp_template.c @@ -206,7 +206,7 @@ static void FUNC(inv_##type_h##_##type_v##_##width##x##height)(int *dst, for (int x = 0; x < nzw; x++) \ ff_vvc_inv_##type_v##_##height(temp + x, width, coeff + x, width); \ \ - scale_clip(temp, width, width, height, 7, log2_transform_range); \ + scale_clip(temp, nzw, width, height, 7, log2_transform_range); \ \ for (int y = 0; y < height; y++) \ ff_vvc_inv_##type_h##_##width(dst + y * width, 1, temp + y * width, 1); \ From 2d8c39991135f45c9c9ccdefb7ef8a697f0aa500 Mon Sep 17 00:00:00 2001 From: Frank Plowman Date: Thu, 3 Aug 2023 13:51:35 +0100 Subject: [PATCH 03/13] libavutil/x86inc.asm: Add REPX from x264 --- libavutil/x86/x86inc.asm | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm index 251ee797dec..e099ee4b10d 100644 --- a/libavutil/x86/x86inc.asm +++ b/libavutil/x86/x86inc.asm @@ -232,6 +232,16 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 %define gprsize 4 %endif +; Repeats an instruction/operation for multiple arguments. 
+; Example usage: "REPX {psrlw x, 8}, m0, m1, m2, m3" +%macro REPX 2-* ; operation, args + %xdefine %%f(x) %1 + %rep %0 - 1 + %rotate 1 + %%f(%1) + %endrep +%endmacro + %macro PUSH 1 push %1 %ifidn rstk, rsp From b1ef0d3c5537ec7033a4ce5a7543c5480a932aaf Mon Sep 17 00:00:00 2001 From: Frank Plowman Date: Mon, 7 Aug 2023 12:46:34 +0100 Subject: [PATCH 04/13] tests/checkasm: Add VVC ITX test --- tests/checkasm/Makefile | 5 +- tests/checkasm/checkasm.c | 1 + tests/checkasm/checkasm.h | 1 + tests/checkasm/vvc_itx.c | 102 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 108 insertions(+), 1 deletion(-) create mode 100644 tests/checkasm/vvc_itx.c diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index 9a2105da3b3..c008072bbe3 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -35,7 +35,10 @@ AVCODECOBJS-$(CONFIG_V210_DECODER) += v210dec.o AVCODECOBJS-$(CONFIG_V210_ENCODER) += v210enc.o AVCODECOBJS-$(CONFIG_VORBIS_DECODER) += vorbisdsp.o AVCODECOBJS-$(CONFIG_VP9_DECODER) += vp9dsp.o -AVCODECOBJS-$(CONFIG_VVC_DECODER) += vvc_alf.o vvc_sao.o vvc_mc.o +AVCODECOBJS-$(CONFIG_VVC_DECODER) += vvc_alf.o \ + vvc_sao.o \ + vvc_mc.o \ + vvc_itx.o CHECKASMOBJS-$(CONFIG_AVCODEC) += $(AVCODECOBJS-yes) diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index c4f80ece513..02f2a56cc57 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -179,6 +179,7 @@ static const struct { { "vvc_alf", checkasm_check_vvc_alf }, { "vvc_sao", checkasm_check_vvc_sao }, { "vvc_mc", checkasm_check_vvc_mc }, + { "vvc_itx", checkasm_check_vvc_itx }, #endif #endif #if CONFIG_AVFILTER diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index a82e157a4ea..408f77608ed 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -97,6 +97,7 @@ void checkasm_check_vorbisdsp(void); void checkasm_check_vvc_alf(void); void checkasm_check_vvc_sao(void); void checkasm_check_vvc_mc(void); +void 
checkasm_check_vvc_itx(void); struct CheckasmPerf; diff --git a/tests/checkasm/vvc_itx.c b/tests/checkasm/vvc_itx.c new file mode 100644 index 00000000000..b86d9fdf96a --- /dev/null +++ b/tests/checkasm/vvc_itx.c @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2023 Frank Plowman + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +#include "libavutil/mem_internal.h" + +#include "libavcodec/avcodec.h" + +#include "libavcodec/vvc/vvcdsp.h" +#include "libavcodec/vvc/vvcdec.h" + +#include "checkasm.h" + +#define SIZEOF_PIXEL ((bit_depth + 7) / 8) +#define BUF_SIZE (MAX_TB_SIZE * MAX_TB_SIZE) + +#define randomize_buffers(buf0, buf1, size, min, max) \ + do { \ + int k; \ + for (k = 0; k < size; ++k) { \ + uint32_t r = rnd(); \ + int32_t a = min + r / (max / (max - min + 1) + 1); \ + AV_WN32A(buf0 + k, a); \ + AV_WN32A(buf1 + k, a); \ + } \ + } while (0) + +const char *itx_str[N_TX_TYPE] = { + "dct2", // DCT2 + "dst7", // DST7 + "dct8", // DCT8 +}; + +const int itx_log2_min_size[N_TX_TYPE] = { + 1, // DCT2 + 2, // DST7 + 2, // DCT8 +}; + +const int itx_log2_max_size[N_TX_TYPE] = { + 6, // DCT2 + 5, // DST7 + 5, // DCT8 +}; + +static void check_itx(VVCDSPContext h, enum TxType trh, enum TxType trv, int bit_depth) +{ + LOCAL_ALIGNED_32(int, ref_dst, [BUF_SIZE]); + LOCAL_ALIGNED_32(int, new_dst, [BUF_SIZE]); + LOCAL_ALIGNED_32(int, ref_src, [BUF_SIZE]); + LOCAL_ALIGNED_32(int, new_src, [BUF_SIZE]); + + for (int log2_width = itx_log2_min_size[trh]; log2_width <= itx_log2_max_size[trh]; ++log2_width) { + for (int log2_height = itx_log2_min_size[trv]; log2_height <= itx_log2_max_size[trv]; ++log2_height) { + const int width = 1 << log2_width; + const int height = 1 << log2_height; + + declare_func_emms(AV_CPU_FLAG_MMX, void, int *dst, const int *src, int nzw, int log2_transform_range); + + randomize_buffers(ref_src, new_src, BUF_SIZE, -(1 << (bit_depth - 1)), 1 << (bit_depth - 1) - 1); + memset(ref_dst, 0, BUF_SIZE); + memset(new_dst, 0, BUF_SIZE); + + // @TODO: test extended precision (l2tr != 15) + // @TODO: test nzw != width + if (check_func(h.itx.itx[trh][trv][log2_width][log2_height], + "inv_%s_%s_%dx%d_%d", + itx_str[trh], itx_str[trv], width, height, bit_depth)) { + call_ref(ref_dst, ref_src, width, 15); + call_new(new_dst, new_src, width, 15); + checkasm_check_int32_t("vvc_itx_1d.asm", 0, 
ref_dst, 1, new_dst, 1, width, height, "dst"); + } + bench_new(new_dst, new_src, width, 15); + } + } +} + +void checkasm_check_vvc_itx(void) +{ + VVCDSPContext h; + ff_vvc_dsp_init(&h, 8); + check_itx(h, DCT2, DCT2, 8); + ff_vvc_dsp_init(&h, 10); + check_itx(h, DCT2, DCT2, 10); + report("idct2"); +} From 36e417619cc69865cf705ca02ff388fce4bf9a94 Mon Sep 17 00:00:00 2001 From: Frank Plowman Date: Tue, 15 Aug 2023 10:41:33 +0100 Subject: [PATCH 05/13] lavc/vvc: Change intra pixel type to `int16_t` Previously the pixels were stored as `int`s. The maximum bit depth supported by VVC is 16, hence an `int16_t` is guaranteed to be sufficient. Smaller sizes allow for more pixels to be packed into an SIMD register and therefore faster assembly optimisations. Further optimisation could be achieved by using `int8_t`s if the bit depth of the current video is only 8. --- libavcodec/vvc/vvc_ctu.c | 5 +++- libavcodec/vvc/vvc_ctu.h | 2 ++ libavcodec/vvc/vvc_intra.c | 22 ++++++++++------ libavcodec/vvc/vvc_intra_template.c | 2 +- libavcodec/vvc/vvcdec.c | 3 +++ libavcodec/vvc/vvcdec.h | 1 + libavcodec/vvc/vvcdsp.c | 4 +-- libavcodec/vvc/vvcdsp.h | 10 +++---- libavcodec/vvc/vvcdsp_template.c | 41 +++++++++++++++-------------- 9 files changed, 53 insertions(+), 37 deletions(-) diff --git a/libavcodec/vvc/vvc_ctu.c b/libavcodec/vvc/vvc_ctu.c index a212d3a44a1..5aaa4f2fdae 100644 --- a/libavcodec/vvc/vvc_ctu.c +++ b/libavcodec/vvc/vvc_ctu.c @@ -268,6 +268,8 @@ static TransformBlock* add_tb(TransformUnit *tu, VVCLocalContext *lc, tb->ts = 0; tb->coeffs = lc->coeffs; lc->coeffs += tb_width * tb_height; + tb->pixels = lc->pixels; + lc->pixels += tb_width * tb_height; return tb; } @@ -2382,6 +2384,7 @@ int ff_vvc_coding_tree_unit(VVCLocalContext *lc, } lc->coeffs = fc->tab.coeffs + rs * ctb_size * VVC_MAX_SAMPLE_ARRAYS; + lc->pixels = fc->tab.pixels + rs * ctb_size * VVC_MAX_SAMPLE_ARRAYS; lc->cu = NULL; ff_vvc_cabac_init(lc, ctu_idx, rx, ry); @@ -2474,4 +2477,4 @@ void 
ff_vvc_ep_init_stat_coeff(EntryPoint *ep, ep->stat_coeff[i] = persistent_rice_adaptation_enabled_flag ? 2 * (av_log2(bit_depth - 10)) : 0; } -} \ No newline at end of file +} diff --git a/libavcodec/vvc/vvc_ctu.h b/libavcodec/vvc/vvc_ctu.h index 50f081a4c47..f8d2fa454e5 100644 --- a/libavcodec/vvc/vvc_ctu.h +++ b/libavcodec/vvc/vvc_ctu.h @@ -96,6 +96,7 @@ typedef struct TransformBlock { int bd_offset; int *coeffs; + int16_t *pixels; } TransformBlock; typedef enum VVCTreeType { @@ -369,6 +370,7 @@ struct VVCLocalContext { VVCFrameContext *fc; EntryPoint *ep; int *coeffs; + int16_t *pixels; } ; typedef struct VVCAllowedSplit { diff --git a/libavcodec/vvc/vvc_intra.c b/libavcodec/vvc/vvc_intra.c index 6e2c7620ad2..90f221bb383 100644 --- a/libavcodec/vvc/vvc_intra.c +++ b/libavcodec/vvc/vvc_intra.c @@ -176,11 +176,11 @@ static void add_residual_for_joint_coding_chroma(VVCLocalContext *lc, uint8_t *dst = &fc->frame->data[c_idx][(tb->y0 >> vs) * stride + ((tb->x0 >> hs) << fc->ps.sps->pixel_shift)]; if (chroma_scale) { - fc->vvcdsp.itx.pred_residual_joint(tb->coeffs, tb->tb_width, tb->tb_height, c_sign, shift); - fc->vvcdsp.intra.lmcs_scale_chroma(lc, tb->coeffs, tb->coeffs, tb->tb_width, tb->tb_height, cu->x0, cu->y0); - fc->vvcdsp.itx.add_residual(dst, tb->coeffs, tb->tb_width, tb->tb_height, stride); + fc->vvcdsp.itx.pred_residual_joint(tb->pixels, tb->tb_width, tb->tb_height, c_sign, shift); + fc->vvcdsp.intra.lmcs_scale_chroma(lc, tb->pixels, tb->pixels, tb->tb_width, tb->tb_height, cu->x0, cu->y0); + fc->vvcdsp.itx.add_residual(dst, tb->pixels, tb->tb_width, tb->tb_height, stride); } else { - fc->vvcdsp.itx.add_residual_joint(dst, tb->coeffs, tb->tb_width, tb->tb_height, stride, c_sign, shift); + fc->vvcdsp.itx.add_residual_joint(dst, tb->pixels, tb->tb_width, tb->tb_height, stride, c_sign, shift); } } @@ -435,7 +435,7 @@ static void itransform(VVCLocalContext *lc, TransformUnit *tu, const int tu_idx, const VVCSH *sh = &lc->sc->sh; const CodingUnit *cu = lc->cu; 
const int ps = fc->ps.sps->pixel_shift; - DECLARE_ALIGNED(32, int, temp)[MAX_TB_SIZE * MAX_TB_SIZE]; + DECLARE_ALIGNED(32, int16_t, temp)[MAX_TB_SIZE * MAX_TB_SIZE]; for (int i = 0; i < tu->nb_tbs; i++) { TransformBlock *tb = &tu->tbs[i]; @@ -464,12 +464,18 @@ static void itransform(VVCLocalContext *lc, TransformUnit *tu, const int tu_idx, nzw = tb->max_scan_x + 1; fc->vvcdsp.itx.itx[trh][trv][tb->log2_tb_width][tb->log2_tb_height]( - tb->coeffs, tb->coeffs, nzw, sps->log2_transform_range); + tb->pixels, tb->coeffs, nzw, sps->log2_transform_range); + } else { + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + tb->pixels[y * w + x] = tb->coeffs[y * w + x]; + } + } } if (chroma_scale) - fc->vvcdsp.intra.lmcs_scale_chroma(lc, temp, tb->coeffs, w, h, cu->x0, cu->y0); - fc->vvcdsp.itx.add_residual(dst, chroma_scale ? temp : tb->coeffs, w, h, stride); + fc->vvcdsp.intra.lmcs_scale_chroma(lc, temp, tb->pixels, w, h, cu->x0, cu->y0); + fc->vvcdsp.itx.add_residual(dst, chroma_scale ? 
temp : tb->pixels, w, h, stride); if (tu->joint_cbcr_residual_flag && tb->c_idx) add_residual_for_joint_coding_chroma(lc, tu, tb, chroma_scale); diff --git a/libavcodec/vvc/vvc_intra_template.c b/libavcodec/vvc/vvc_intra_template.c index 81987a579ec..29847247295 100644 --- a/libavcodec/vvc/vvc_intra_template.c +++ b/libavcodec/vvc/vvc_intra_template.c @@ -429,7 +429,7 @@ static int FUNC(lmcs_derive_chroma_scale)(VVCLocalContext *lc, const int x0, con } // 8.7.5.3 Picture reconstruction with luma dependent chroma residual scaling process for chroma samples -static void FUNC(lmcs_scale_chroma)(VVCLocalContext *lc, int *dst, const int *coeff, +static void FUNC(lmcs_scale_chroma)(VVCLocalContext *lc, int16_t *dst, const int16_t *coeff, const int width, const int height, const int x0_cu, const int y0_cu) { const int chroma_scale = FUNC(lmcs_derive_chroma_scale)(lc, x0_cu, y0_cu); diff --git a/libavcodec/vvc/vvcdec.c b/libavcodec/vvc/vvcdec.c index 1eb2b7724ad..545ad9722e5 100644 --- a/libavcodec/vvc/vvcdec.c +++ b/libavcodec/vvc/vvcdec.c @@ -98,6 +98,9 @@ static int ctb_arrays_init(VVCFrameContext *fc, const int ctu_count, const int c fc->tab.coeffs = av_malloc(ctu_count * sizeof(*fc->tab.coeffs) * ctu_size * VVC_MAX_SAMPLE_ARRAYS); if (!fc->tab.coeffs) return AVERROR(ENOMEM); + fc->tab.pixels = av_malloc(ctu_count * sizeof(*fc->tab.pixels) * ctu_size * VVC_MAX_SAMPLE_ARRAYS); + if (!fc->tab.pixels) + return AVERROR(ENOMEM); fc->rpl_tab_pool = av_buffer_pool_init(ctu_count * sizeof(RefPicListTab), av_buffer_allocz); if (!fc->rpl_tab_pool) return AVERROR(ENOMEM); diff --git a/libavcodec/vvc/vvcdec.h b/libavcodec/vvc/vvcdec.h index 255f374d905..ea8253b8fbb 100644 --- a/libavcodec/vvc/vvcdec.h +++ b/libavcodec/vvc/vvcdec.h @@ -252,6 +252,7 @@ struct VVCFrameContext { uint8_t *alf_pixel_buffer_v[VVC_MAX_SAMPLE_ARRAYS][2]; int *coeffs; + int16_t *pixels; CTU *ctus; //used in arrays_init only diff --git a/libavcodec/vvc/vvcdsp.c b/libavcodec/vvc/vvcdsp.c index 
baff205f551..aea052bd3e8 100644 --- a/libavcodec/vvc/vvcdsp.c +++ b/libavcodec/vvc/vvcdsp.c @@ -268,12 +268,12 @@ static void scale_clip(int *coeff, const int nzw, const int w, const int h, } } -static void scale(int *out, const int *in, const int w, const int h, const int shift) +static void scale(int16_t *out, const int *in, const int w, const int h, const int shift) { const int add = 1 << (shift - 1); for (int y = 0; y < h; y++) { for (int x = 0; x < w; x++) { - int *o = out + y * w + x; + int16_t *o = out + y * w + x; const int *i = in + y * w + x; *o = (*i + add) >> shift; } diff --git a/libavcodec/vvc/vvcdsp.h b/libavcodec/vvc/vvcdsp.h index 26683b2e915..a5e45fc7638 100644 --- a/libavcodec/vvc/vvcdsp.h +++ b/libavcodec/vvc/vvcdsp.h @@ -102,7 +102,7 @@ struct VVCLocalContext; typedef struct VVCIntraDSPContext { void (*intra_cclm_pred)(const struct VVCLocalContext *lc, int x0, int y0, int w, int h); - void (*lmcs_scale_chroma)(struct VVCLocalContext *lc, int *dst, const int *coeff, int w, int h, int x0_cu, int y0_cu); + void (*lmcs_scale_chroma)(struct VVCLocalContext *lc, int16_t *dst, const int16_t *coeff, int w, int h, int x0_cu, int y0_cu); void (*intra_pred)(const struct VVCLocalContext *lc, int x0, int y0, int w, int h, int c_idx); void (*pred_planar)(uint8_t *src, const uint8_t *top, const uint8_t *left, int w, int h, ptrdiff_t stride); void (*pred_mip)(uint8_t *src, const uint8_t *top, const uint8_t *left, int w, int h, ptrdiff_t stride, @@ -117,11 +117,11 @@ typedef struct VVCIntraDSPContext { } VVCIntraDSPContext; typedef struct VVCItxDSPContext { - void (*add_residual)(uint8_t *dst, const int *res, int width, int height, ptrdiff_t stride); - void (*add_residual_joint)(uint8_t *dst, const int *res, int width, int height, ptrdiff_t stride, int c_sign, int shift); - void (*pred_residual_joint)(int *buf, int width, int height, int c_sign, int shift); + void (*add_residual)(uint8_t *dst, const int16_t *res, int width, int height, ptrdiff_t stride); + void 
(*add_residual_joint)(uint8_t *dst, const int16_t *res, int width, int height, ptrdiff_t stride, int c_sign, int shift); + void (*pred_residual_joint)(int16_t *buf, int width, int height, int c_sign, int shift); - void (*itx[N_TX_TYPE][N_TX_TYPE][N_TX_SIZE][N_TX_SIZE])(int *dst, const int *coeff, int nzw, int log2_transform_range); + void (*itx[N_TX_TYPE][N_TX_TYPE][N_TX_SIZE][N_TX_SIZE])(int16_t *dst, const int *coeff, int nzw, int log2_transform_range); void (*transform_bdpcm)(int *coeffs, int width, int height, int vertical, int log2_transform_range); } VVCItxDSPContext; diff --git a/libavcodec/vvc/vvcdsp_template.c b/libavcodec/vvc/vvcdsp_template.c index 886b6d5f65a..4cba3604111 100644 --- a/libavcodec/vvc/vvcdsp_template.c +++ b/libavcodec/vvc/vvcdsp_template.c @@ -29,7 +29,7 @@ #include "vvc_intra_template.c" #include "vvc_filter_template.c" -static void FUNC(add_residual)(uint8_t *_dst, const int *res, +static void FUNC(add_residual)(uint8_t *_dst, const int16_t *res, const int w, const int h, const ptrdiff_t _stride) { pixel *dst = (pixel *)_dst; @@ -45,7 +45,7 @@ static void FUNC(add_residual)(uint8_t *_dst, const int *res, } } -static void FUNC(add_residual_joint)(uint8_t *_dst, const int *res, +static void FUNC(add_residual_joint)(uint8_t *_dst, const int16_t *res, const int w, const int h, const ptrdiff_t _stride, const int c_sign, const int shift) { pixel *dst = (pixel *)_dst; @@ -62,7 +62,7 @@ static void FUNC(add_residual_joint)(uint8_t *_dst, const int *res, } } -static void FUNC(pred_residual_joint)(int *buf, const int w, const int h, +static void FUNC(pred_residual_joint)(int16_t *buf, const int w, const int h, const int c_sign, const int shift) { for (int y = 0; y < h; y++) { @@ -197,25 +197,26 @@ static void FUNC(transform_bdpcm)(int *coeffs, const int width, const int height // ITX function prototypes #undef ITX_2D -#define ITX_2D(TYPE_H, type_h, TYPE_V, type_v, width, height) \ -static void 
FUNC(inv_##type_h##_##type_v##_##width##x##height)(int *dst, \ - const int *coeff, int nzw, int log2_transform_range) \ -{ \ - DECLARE_ALIGNED(32, int, temp)[width * height]; \ - \ - for (int x = 0; x < nzw; x++) \ - ff_vvc_inv_##type_v##_##height(temp + x, width, coeff + x, width); \ - \ - scale_clip(temp, nzw, width, height, 7, log2_transform_range); \ - \ - for (int y = 0; y < height; y++) \ - ff_vvc_inv_##type_h##_##width(dst + y * width, 1, temp + y * width, 1); \ - \ - scale(dst, dst, width, height, 5 + log2_transform_range - BIT_DEPTH); \ +#define ITX_2D(TYPE_H, type_h, TYPE_V, type_v, width, height) \ +static void FUNC(inv_##type_h##_##type_v##_##width##x##height)(int16_t *dst, \ + const int *coeff, int nzw, int log2_transform_range) \ +{ \ + DECLARE_ALIGNED(32, int, temp)[width * height]; \ + DECLARE_ALIGNED(32, int, temp2)[width * height]; \ + \ + for (int x = 0; x < nzw; x++) \ + ff_vvc_inv_##type_v##_##height(temp + x, width, coeff + x, width); \ + \ + scale_clip(temp, nzw, width, height, 7, log2_transform_range); \ + \ + for (int y = 0; y < height; y++) \ + ff_vvc_inv_##type_h##_##width(temp2 + y * width, 1, temp + y * width, 1); \ + \ + scale(dst, temp2, width, height, 5 + log2_transform_range - BIT_DEPTH); \ } #undef ITX_1D_H #define ITX_1D_H(TYPE_H, type_h, TYPE_V, type_v, width, height) \ -static void FUNC(inv_##type_h##_##type_v##_##width##x##height)(int *dst, \ +static void FUNC(inv_##type_h##_##type_v##_##width##x##height)(int16_t *dst, \ const int *coeff, int nzw, int log2_transform_range) \ { \ DECLARE_ALIGNED(32, int, temp)[width * height]; \ @@ -225,7 +226,7 @@ static void FUNC(inv_##type_h##_##type_v##_##width##x##height)(int *dst, } #undef ITX_1D_V #define ITX_1D_V(TYPE_H, type_h, TYPE_V, type_v, width, height) \ -static void FUNC(inv_##type_h##_##type_v##_##width##x##height)(int *dst, \ +static void FUNC(inv_##type_h##_##type_v##_##width##x##height)(int16_t *dst, \ const int *coeff, int nzw, int log2_transform_range) \ { \ 
DECLARE_ALIGNED(32, int, temp)[width * height]; \ From 719fe2adaec65e4e660a4aa98fe402e7b10678d9 Mon Sep 17 00:00:00 2001 From: Frank Plowman Date: Tue, 15 Aug 2023 10:56:32 +0100 Subject: [PATCH 06/13] tests/checkasm/vvc_itx: Use `int16_t` dst arrays --- tests/checkasm/vvc_itx.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/tests/checkasm/vvc_itx.c b/tests/checkasm/vvc_itx.c index b86d9fdf96a..de0c11c0f70 100644 --- a/tests/checkasm/vvc_itx.c +++ b/tests/checkasm/vvc_itx.c @@ -61,30 +61,37 @@ const int itx_log2_max_size[N_TX_TYPE] = { static void check_itx(VVCDSPContext h, enum TxType trh, enum TxType trv, int bit_depth) { - LOCAL_ALIGNED_32(int, ref_dst, [BUF_SIZE]); - LOCAL_ALIGNED_32(int, new_dst, [BUF_SIZE]); + // @TODO: test extended precision (log2_transform_range != 15) + const int log2_transform_range = 15; + + LOCAL_ALIGNED_32(int16_t, ref_dst, [BUF_SIZE]); + LOCAL_ALIGNED_32(int16_t, new_dst, [BUF_SIZE]); LOCAL_ALIGNED_32(int, ref_src, [BUF_SIZE]); LOCAL_ALIGNED_32(int, new_src, [BUF_SIZE]); for (int log2_width = itx_log2_min_size[trh]; log2_width <= itx_log2_max_size[trh]; ++log2_width) { + const int width = 1 << log2_width; for (int log2_height = itx_log2_min_size[trv]; log2_height <= itx_log2_max_size[trv]; ++log2_height) { - const int width = 1 << log2_width; const int height = 1 << log2_height; - declare_func_emms(AV_CPU_FLAG_MMX, void, int *dst, const int *src, int nzw, int log2_transform_range); + declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *dst, const int *src, + int nzw, int log2_transform_range); - randomize_buffers(ref_src, new_src, BUF_SIZE, -(1 << (bit_depth - 1)), 1 << (bit_depth - 1) - 1); + randomize_buffers(ref_src, new_src, BUF_SIZE, + -(1 << log2_transform_range), + (1 << log2_transform_range) - 1); memset(ref_dst, 0, BUF_SIZE); memset(new_dst, 0, BUF_SIZE); - // @TODO: test extended precision (l2tr != 15) // @TODO: test nzw != width if 
(check_func(h.itx.itx[trh][trv][log2_width][log2_height], "inv_%s_%s_%dx%d_%d", itx_str[trh], itx_str[trv], width, height, bit_depth)) { - call_ref(ref_dst, ref_src, width, 15); - call_new(new_dst, new_src, width, 15); - checkasm_check_int32_t("vvc_itx_1d.asm", 0, ref_dst, 1, new_dst, 1, width, height, "dst"); + call_ref(ref_dst, ref_src, width, log2_transform_range); + call_new(new_dst, new_src, width, log2_transform_range); + checkasm_check_int16_t("vvc_itx_1d.asm", 0, ref_dst, + sizeof(int), new_dst, sizeof(int), width, + height, "dst"); } bench_new(new_dst, new_src, width, 15); } From d1fa8b07877e0dd2a0e2203e961d47503213ec48 Mon Sep 17 00:00:00 2001 From: Frank Plowman Date: Tue, 15 Aug 2023 12:02:11 +0100 Subject: [PATCH 07/13] tests/checkasm/vvc_itx: Fix dst stride --- tests/checkasm/vvc_itx.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/checkasm/vvc_itx.c b/tests/checkasm/vvc_itx.c index de0c11c0f70..c7fc312965e 100644 --- a/tests/checkasm/vvc_itx.c +++ b/tests/checkasm/vvc_itx.c @@ -89,9 +89,10 @@ static void check_itx(VVCDSPContext h, enum TxType trh, enum TxType trv, int bit itx_str[trh], itx_str[trv], width, height, bit_depth)) { call_ref(ref_dst, ref_src, width, log2_transform_range); call_new(new_dst, new_src, width, log2_transform_range); - checkasm_check_int16_t("vvc_itx_1d.asm", 0, - ref_dst, sizeof(int), new_dst, sizeof(int), width, - height, "dst"); + checkasm_check_int16_t("vvc_itx_1d.asm", 0, + ref_dst, width * sizeof(*ref_dst), + new_dst, width * sizeof(*new_dst), + width, height, "dst"); } bench_new(new_dst, new_src, width, 15); } From 5d52e923b6aac795537b5cb5d12d9f8987b39945 Mon Sep 17 00:00:00 2001 From: Frank Plowman Date: Tue, 15 Aug 2023 13:57:54 +0100 Subject: [PATCH 08/13] lavc/vvc: Output ITX result in column-major order This is laying groundwork for assembly optimisations. It is easier to perform a column transform using SIMD instructions than it is to perform a row transform.
Typically, a transposition is required. Therefore, by expecting the output of the inverse transforms in column-major order, a transposition can be eliminated. This is also effectively free, with only a minor performance hit due to cache-friendliness. --- libavcodec/vvc/vvc_intra.c | 6 ++--- libavcodec/vvc/vvcdsp_template.c | 38 +++++++++++++++----------------- 2 files changed, 21 insertions(+), 23 deletions(-) diff --git a/libavcodec/vvc/vvc_intra.c b/libavcodec/vvc/vvc_intra.c index 90f221bb383..52302adb12a 100644 --- a/libavcodec/vvc/vvc_intra.c +++ b/libavcodec/vvc/vvc_intra.c @@ -466,9 +466,9 @@ static void itransform(VVCLocalContext *lc, TransformUnit *tu, const int tu_idx, fc->vvcdsp.itx.itx[trh][trv][tb->log2_tb_width][tb->log2_tb_height]( tb->pixels, tb->coeffs, nzw, sps->log2_transform_range); } else { - for (int y = 0; y < h; ++y) { - for (int x = 0; x < w; ++x) { - tb->pixels[y * w + x] = tb->coeffs[y * w + x]; + for (int x = 0; x < w; ++x) { + for (int y = 0; y < h; ++y) { + tb->pixels[x * h + y] = tb->coeffs[y * w + x]; } } } diff --git a/libavcodec/vvc/vvcdsp_template.c b/libavcodec/vvc/vvcdsp_template.c index 4cba3604111..e8d72dfb396 100644 --- a/libavcodec/vvc/vvcdsp_template.c +++ b/libavcodec/vvc/vvcdsp_template.c @@ -38,8 +38,7 @@ static void FUNC(add_residual)(uint8_t *_dst, const int16_t *res, for (int y = 0; y < h; y++) { for (int x = 0; x < w; x++) { - dst[x] = av_clip_pixel(dst[x] + *res); - res++; + dst[x] = av_clip_pixel(dst[x] + res[x * h + y]); } dst += stride; } @@ -54,9 +53,8 @@ static void FUNC(add_residual_joint)(uint8_t *_dst, const int16_t *res, for (int y = 0; y < h; y++) { for (int x = 0; x < w; x++) { - const int r = ((*res) * c_sign) >> shift; + const int r = (res[x * h + y] * c_sign) >> shift; dst[x] = av_clip_pixel(dst[x] + r); - res++; } dst += stride; } @@ -197,22 +195,22 @@ static void FUNC(transform_bdpcm)(int *coeffs, const int width, const int height // ITX function prototypes #undef ITX_2D -#define ITX_2D(TYPE_H, 
type_h, TYPE_V, type_v, width, height) \ -static void FUNC(inv_##type_h##_##type_v##_##width##x##height)(int16_t *dst, \ - const int *coeff, int nzw, int log2_transform_range) \ -{ \ - DECLARE_ALIGNED(32, int, temp)[width * height]; \ - DECLARE_ALIGNED(32, int, temp2)[width * height]; \ - \ - for (int x = 0; x < nzw; x++) \ - ff_vvc_inv_##type_v##_##height(temp + x, width, coeff + x, width); \ - \ - scale_clip(temp, nzw, width, height, 7, log2_transform_range); \ - \ - for (int y = 0; y < height; y++) \ - ff_vvc_inv_##type_h##_##width(temp2 + y * width, 1, temp + y * width, 1); \ - \ - scale(dst, temp2, width, height, 5 + log2_transform_range - BIT_DEPTH); \ +#define ITX_2D(TYPE_H, type_h, TYPE_V, type_v, width, height) \ +static void FUNC(inv_##type_h##_##type_v##_##width##x##height)(int16_t *dst, \ + const int *coeff, int nzw, int log2_transform_range) \ +{ \ + DECLARE_ALIGNED(32, int, temp)[width * height]; \ + DECLARE_ALIGNED(32, int, temp2)[width * height]; \ + \ + for (int x = 0; x < nzw; x++) \ + ff_vvc_inv_##type_v##_##height(temp + x, width, coeff + x, width); \ + \ + scale_clip(temp, nzw, width, height, 7, log2_transform_range); \ + \ + for (int y = 0; y < height; y++) \ + ff_vvc_inv_##type_h##_##width(temp2 + y, height, temp + y * width, 1); \ + \ + scale(dst, temp2, width, height, 5 + log2_transform_range - BIT_DEPTH); \ } #undef ITX_1D_H #define ITX_1D_H(TYPE_H, type_h, TYPE_V, type_v, width, height) \ From df05755795c4f957021d6cd578a5a6d1d32f084b Mon Sep 17 00:00:00 2001 From: Frank Plowman Date: Wed, 16 Aug 2023 16:23:50 +0100 Subject: [PATCH 09/13] lavc/vvc: Port 4x4 DCT2/DCT2 10-bit from dav1d --- libavcodec/x86/Makefile | 4 +- libavcodec/x86/vvc_itx_16bit.asm | 249 +++++++++++++++++++++++++++++++ libavcodec/x86/vvc_itx_8bit.asm | 54 +++++++ libavcodec/x86/vvcdsp_init.c | 40 +++++ 4 files changed, 346 insertions(+), 1 deletion(-) create mode 100644 libavcodec/x86/vvc_itx_16bit.asm create mode 100644 libavcodec/x86/vvc_itx_8bit.asm diff --git 
a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 71a1cdf63e7..00a17f662c0 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -206,5 +206,7 @@ X86ASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9intrapred.o \ X86ASM-OBJS-$(CONFIG_VVC_DECODER) += x86/vvc_alf.o \ x86/vvc_sao.o \ x86/vvc_sao_10bit.o \ - x86/vvc_mc.o + x86/vvc_mc.o \ + x86/vvc_itx_8bit.o \ + x86/vvc_itx_16bit.o X86ASM-OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp.o diff --git a/libavcodec/x86/vvc_itx_16bit.asm b/libavcodec/x86/vvc_itx_16bit.asm new file mode 100644 index 00000000000..54e576927be --- /dev/null +++ b/libavcodec/x86/vvc_itx_16bit.asm @@ -0,0 +1,249 @@ +; Copyright © 2023, Frank Plowman +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; Copyright © 2021, Matthias Dressel +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "libavutil/x86/x86util.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 32 + +idct4_shuf: db 0, 1, 4, 5, 12, 13, 8, 9, 2, 3, 6, 7, 14, 15, 10, 11 + +%macro COEF_PAIR 2-3 0 +vvc_pd_%1_%2: dd %1, %1, %2, %2 +%define vvc_pd_%1 (vvc_pd_%1_%2 + 4*0) +%define vvc_pd_%2 (vvc_pd_%1_%2 + 4*2) +%if %3 +dd -%2, -%2 +%define vvc_pd_%2_m%2 vvc_pd_%2 +%endif +%endmacro + +COEF_PAIR 64, 36, 1 +COEF_PAIR 64, 83, 1 + +coeff_min_15: times 2 dw -0x8000 +coeff_max_15: times 2 dw 0x7fff +dconly_10: times 2 dw 0x7c00 + +cextern vvc_pw_36_83 +cextern vvc_pw_m36_m83 +cextern vvc_pw_m83_36 +cextern vvc_pw_64_64 +cextern vvc_pw_m64_64 +cextern vvc_pw_512 + +cextern vvc_pd_512 + +SECTION .text + +%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) + +%macro WRAP_XMM 1+ + INIT_XMM cpuname + %1 + INIT_YMM cpuname +%endmacro + +INIT_YMM avx2 + +; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 7 +; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 7 +; flags: 1 = packed, 2 = inv_dst2 +; skip round/shift if rnd is not a number +%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags +%if %8 < 32 + pmulld m%4, m%1, m%8 + pmulld m%3, m%2, m%8 +%else +%if %9 & 1 + vbroadcasti128 m%3, [vvc_pd_%8] +%else + vpbroadcastd m%3, [vvc_pd_%8] +%endif + pmulld m%4, m%1, m%3 + pmulld m%3, m%2 +%endif +%if %7 < 32 + pmulld m%1, m%7 + pmulld m%2, m%7 +%else +%if %9 & 1 + vbroadcasti128 m%5, [vvc_pd_%7] +%else + vpbroadcastd m%5, [vvc_pd_%7] +%endif + pmulld m%1, m%5 + 
pmulld m%2, m%5 +%endif +%if %9 & 2 + psubd m%4, m%6, m%4 + psubd m%2, m%4, m%2 +%else +%ifnum %6 + paddd m%4, m%6 +%endif + paddd m%2, m%4 +%endif +%ifnum %6 + paddd m%1, m%6 +%endif + psubd m%1, m%3 +%ifnum %6 + psrad m%2, 7 + psrad m%1, 7 +%endif +%endmacro + +%macro INV_TXFM_FN 4-5 10 ; type1, type2, eob_offset, size, bitdepth +cglobal vvc_inv_%1_%2_%4_%5, 4, 6, 0, dst, c, eob, l2tr, stride, tx2 + %define %%p1 m(i%1_%4_internal_%5) + ; Jump to the 1st txfm function if we're not taking the fast path, which + ; in turn performs an indirect jump to the 2nd txfm function. + lea tx2q, [m(i%2_%4_internal_%5).pass2] +%ifidn %1_%2, dct2_dct2 + test eobd, eobd + jnz %%p1 +%else +%if %3 + add eobd, %3 +%endif + ; jump to the 1st txfm function unless it's located directly after this + times ((%%end - %%p1) >> 31) & 1 jmp %%p1 +ALIGN function_align +%%end: +%endif +%endmacro + +%macro INV_TXFM_4X4_FN 2-3 10 ; type1, type2, bitdepth + INV_TXFM_FN %1, %2, 0, 4x4, %3 +%ifidn %1_%2, dct2_dct2 + vpbroadcastd xm2, [dconly_%3] +%if %3 = 10 +.dconly: + imul r7d, [cq], 181 + mov [cq], eobd ; 0 + or r2d, 4 +.dconly2: + add r7d, 128 + sar r7d, 8 +.dconly3: + imul r7d, 181 + add r7d, 2176 + sar r7d, 12 + movd xm0, r7d + paddsw xm0, xm2 + vpbroadcastw xm0, xm0 +.dconly_loop: + movq xm1, [dstq+strideq*0] + movhps xm1, [dstq+strideq*1] + paddsw xm1, xm0 + psubusw xm1, xm2 + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + lea dstq, [dstq+strideq*2] + sub r2d, 2 + jg .dconly_loop + WRAP_XMM RET +%else + jmp m(vvc_inv_dct2_dct2_4x4_10).dconly +%endif +%endif +%endmacro + +%macro IDCT2_4_1D_PACKED 6 ; dst/src[1-2], tmp[1-3], rnd + ITX_MULSUB_2D %1, %2, %3, %4, %5, nornd, 64_36, 64_83, 1 + punpckhqdq m%3, m%2, m%1 ; t3 t2 + punpcklqdq m%2, m%1 ; t0 t1 + paddd m%1, m%2, m%3 ; out0 out1 + psubd m%2, m%3 ; out3 out2 + paddd m%1, m%6 + paddd m%2, m%6 + psrad m%1, 7 + psrad m%2, 7 + ; @TODO: this should depend on sps_extended_precision_flag + vpbroadcastd m%3, [coeff_min_15] + 
vpbroadcastd m%4, [coeff_max_15] + pmaxsd m%1, m%3 + pmaxsd m%2, m%3 + pminsd m%1, m%4 + pminsd m%2, m%4 +%endmacro + +%macro IDCT2_4_1D_PACKED_WORD 6 ; dst/src[1-2], tmp[1-3], rnd + vpbroadcastd m%5, [vvc_pw_m83_36] + punpckhwd m%3, m%2, m%1 + vpbroadcastd m%4, [vvc_pw_36_83] + punpcklwd m%2, m%1 + vpbroadcastd m%1, [vvc_pw_m64_64] + pmaddwd m%5, m%3 + pmaddwd m%3, m%4 + vpbroadcastd m%4, [vvc_pw_64_64] + pmaddwd m%1, m%2 + pmaddwd m%2, m%4 + paddd m%4, m%1, m%5 + psubd m%5, m%1, m%5 + paddd m%1, m%2, m%3 + psubd m%2, m%3 + ; @TODO: this should depend on l2tr + REPX {paddd x, m%6}, m%4, m%1, m%5, m%2 + REPX {psrad x, 10 }, m%4, m%1, m%5, m%2 + packssdw m%1, m%4 + packssdw m%2, m%5 +%endmacro + +INV_TXFM_4X4_FN dct2, dct2 + +cglobal idct2_4x4_internal_10, 0, 8, 6, dst, c, eob, l2tr, stride, tx2 + mov strideq, 8 + call .main + vbroadcasti128 m2, [idct4_shuf] + packssdw m0, m1 + pshufb m0, m2 + jmp tx2q +.pass2: + vextracti128 xm1, m0, 1 + vpbroadcastd xm5, [vvc_pd_512] + WRAP_XMM IDCT2_4_1D_PACKED_WORD 0, 1, 2, 3, 4, 5 + lea r7, [dstq+strideq*2] + + + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movhps [r7 +strideq*0], xm1 + movq [r7 +strideq*1], xm1 + RET +ALIGN function_align +.main: + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpbroadcastd m5, [vvc_pd_64] +.main2: + IDCT2_4_1D_PACKED 0, 1, 2, 3, 4, 5 + ret + +%endif ; ARCH_X86_64 diff --git a/libavcodec/x86/vvc_itx_8bit.asm b/libavcodec/x86/vvc_itx_8bit.asm new file mode 100644 index 00000000000..3938d42456a --- /dev/null +++ b/libavcodec/x86/vvc_itx_8bit.asm @@ -0,0 +1,54 @@ +; Copyright © 2023, Frank Plowman +; Copyright © 2018-2021, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. 
Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "libavutil/x86/x86util.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 16 + +; Note: The order of (at least some of) those constants matter! 
+ +const deint_shuf, db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 + +%macro COEF_PAIR 2 +vvc_pw_%1_%2: dw %1, %2 +vvc_pw_m%2_%1: dw -%2, %1 +%endmacro + +const vvc_pw_64_64, dw 64, 64 +const vvc_pw_64_m64, dw 64, -64 +const vvc_pw_m64_64, dw -64, 64 +const vvc_pw_36_83, dw 36, 83 +const vvc_pw_m83_36, dw -83, 36 +const vvc_pw_64_64, dw 64, 64 +const vvc_pw_m64_64, dw -64, 64 +const vvc_pw_512, times 2 dw 512 + +const vvc_pd_64, dd 64 +const vvc_pd_512, dd 512 + +%endif ; ARCH_X86_64 diff --git a/libavcodec/x86/vvcdsp_init.c b/libavcodec/x86/vvcdsp_init.c index a5849e3a2ed..797349e5665 100644 --- a/libavcodec/x86/vvcdsp_init.c +++ b/libavcodec/x86/vvcdsp_init.c @@ -241,6 +241,44 @@ PUT_VVC_LUMA_FORWARD_FUNCS(12, avx512icl) c->inter.put[LUMA][1][1] = ff_vvc_put_vvc_luma_hv_##bitd##_##opt; \ } while (0) +#define ITX_COMMON_SIZES(TYPE_H, type_h, TYPE_V, type_v, bitd, opt) \ + ITX(TYPE_H, type_h, TYPE_V, type_v, 4, 4, bitd, opt); \ + /* ITX(TYPE_H, type_h, TYPE_V, type_v, 4, 8, bitd, opt); */ \ + /* ITX(TYPE_H, type_h, TYPE_V, type_v, 4, 16, bitd, opt); */ \ + /* ITX(TYPE_H, type_h, TYPE_V, type_v, 8, 4, bitd, opt); */ \ + /* ITX(TYPE_H, type_h, TYPE_V, type_v, 8, 8, bitd, opt); */ \ + /* ITX(TYPE_H, type_h, TYPE_V, type_v, 8, 16, bitd, opt); */ \ + /* ITX(TYPE_H, type_h, TYPE_V, type_v, 8, 32, bitd, opt); */ \ + /* ITX(TYPE_H, type_h, TYPE_V, type_v, 16, 4, bitd, opt); */ \ + /* ITX(TYPE_H, type_h, TYPE_V, type_v, 16, 8, bitd, opt); */ \ + /* ITX(TYPE_H, type_h, TYPE_V, type_v, 16, 16, bitd, opt); */ \ + /* ITX(TYPE_H, type_h, TYPE_V, type_v, 16, 32, bitd, opt); */ \ + /* ITX(TYPE_H, type_h, TYPE_V, type_v, 32, 8, bitd, opt); */ \ + /* ITX(TYPE_H, type_h, TYPE_V, type_v, 32, 16, bitd, opt); */ \ + /* ITX(TYPE_H, type_h, TYPE_V, type_v, 32, 32, bitd, opt); */ + +#define ITX_SIZES(bitd, opt) \ + ITX_COMMON_SIZES(DCT2, dct2, DCT2, dct2, bitd, opt); \ + /* ITX(DCT2, dct2, DCT2, dct2, 16, 64, bitd, opt); */ \ + /* ITX(DCT2, dct2, DCT2, dct2, 32, 64, bitd, opt); 
*/ \ + /* ITX(DCT2, dct2, DCT2, dct2, 64, 16, bitd, opt); */ \ + /* ITX(DCT2, dct2, DCT2, dct2, 64, 32, bitd, opt); */ \ + /* ITX(DCT2, dct2, DCT2, dct2, 64, 64, bitd, opt); */ + +#define ITX(TYPE_H, type_h, TYPE_V, type_v, width, height, bitd, opt) \ +void ff_vvc_inv_##type_h##_##type_v##_##width##x##height##_##bitd##_##opt( \ + int16_t *dst, const int *coeff, int nzw, int log2_transform_range); +/* ITX_SIZES(8, avx2) */ +ITX_SIZES(10, avx2) + +#undef ITX +#define ITX(TYPE_H, type_h, TYPE_V, type_v, width, height, bitd, opt) \ + c->itx.itx[TYPE_H][TYPE_V][TX_SIZE_##width][TX_SIZE_##height] = ff_vvc_inv_##type_h##_##type_v##_##width##x##height##_##bitd##_##opt; + +#define ITX_INIT(bitd, opt) do { \ + ITX_SIZES(bitd, opt) \ +} while (0) + void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bit_depth) { const int cpu_flags = av_get_cpu_flags(); @@ -250,12 +288,14 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bit_depth) case 8: ALF_DSP(8); PUT_VVC_LUMA_INIT(8, avx2); + /* ITX_INIT(8, avx2); */ c->sao.band_filter[0] = ff_vvc_sao_band_filter_8_8_avx2; c->sao.band_filter[1] = ff_vvc_sao_band_filter_16_8_avx2; break; case 10: ALF_DSP(10); PUT_VVC_LUMA_INIT(10, avx2); + ITX_INIT(10, avx2); c->sao.band_filter[0] = ff_vvc_sao_band_filter_8_10_avx2; break; case 12: From 2a7d709ad771c8cba4b14fb630b378740253ab27 Mon Sep 17 00:00:00 2001 From: Frank Plowman Date: Wed, 16 Aug 2023 16:26:11 +0100 Subject: [PATCH 10/13] lavc/vvc: Remove redundant warnings When bit depth <= 10, sps_range_extension_flag must be 0, therefore sps_extended_precision_flag is not present and assumed to be 0. The derived Log2TransformRange, CoeffMin and CoeffMax are therefore constants for the 10-bit transform. 
--- libavcodec/x86/vvc_itx_16bit.asm | 2 -- 1 file changed, 2 deletions(-) diff --git a/libavcodec/x86/vvc_itx_16bit.asm b/libavcodec/x86/vvc_itx_16bit.asm index 54e576927be..ca323b0c485 100644 --- a/libavcodec/x86/vvc_itx_16bit.asm +++ b/libavcodec/x86/vvc_itx_16bit.asm @@ -185,7 +185,6 @@ ALIGN function_align paddd m%2, m%6 psrad m%1, 7 psrad m%2, 7 - ; @TODO: this should depend on sps_extended_precision_flag vpbroadcastd m%3, [coeff_min_15] vpbroadcastd m%4, [coeff_max_15] pmaxsd m%1, m%3 @@ -209,7 +208,6 @@ ALIGN function_align psubd m%5, m%1, m%5 paddd m%1, m%2, m%3 psubd m%2, m%3 - ; @TODO: this should depend on l2tr REPX {paddd x, m%6}, m%4, m%1, m%5, m%2 REPX {psrad x, 10 }, m%4, m%1, m%5, m%2 packssdw m%1, m%4 From 6a1dfca8d39eb7603794431574ab103c66e533bf Mon Sep 17 00:00:00 2001 From: Frank Plowman Date: Fri, 18 Aug 2023 09:31:11 +0100 Subject: [PATCH 11/13] lavc/x86/vvc_itx: Fix YASM compilation --- libavcodec/x86/vvc_itx_16bit.asm | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/libavcodec/x86/vvc_itx_16bit.asm b/libavcodec/x86/vvc_itx_16bit.asm index ca323b0c485..dcb7bd445e5 100644 --- a/libavcodec/x86/vvc_itx_16bit.asm +++ b/libavcodec/x86/vvc_itx_16bit.asm @@ -37,14 +37,10 @@ idct4_shuf: db 0, 1, 4, 5, 12, 13, 8, 9, 2, 3, 6, 7, 14, 15, 10, 11 vvc_pd_%1_%2: dd %1, %1, %2, %2 %define vvc_pd_%1 (vvc_pd_%1_%2 + 4*0) %define vvc_pd_%2 (vvc_pd_%1_%2 + 4*2) -%if %3 -dd -%2, -%2 -%define vvc_pd_%2_m%2 vvc_pd_%2 -%endif %endmacro -COEF_PAIR 64, 36, 1 -COEF_PAIR 64, 83, 1 +COEF_PAIR 64, 36 +COEF_PAIR 64, 83 coeff_min_15: times 2 dw -0x8000 coeff_max_15: times 2 dw 0x7fff From 56dfa0f8f5cfbbdeb6633f6eb8029eaa5f087468 Mon Sep 17 00:00:00 2001 From: Frank Plowman Date: Fri, 18 Aug 2023 12:07:18 +0100 Subject: [PATCH 12/13] lavc/x86/vvc_itx: Import remaining dav1d ASM --- libavcodec/x86/vvc_itx_16bit.asm | 8380 +++++++++++++++++++++++++++++- libavcodec/x86/vvc_itx_8bit.asm | 5503 +++++++++++++++++++- 2 files changed, 13865 insertions(+), 
18 deletions(-) diff --git a/libavcodec/x86/vvc_itx_16bit.asm b/libavcodec/x86/vvc_itx_16bit.asm index dcb7bd445e5..845c371c7f4 100644 --- a/libavcodec/x86/vvc_itx_16bit.asm +++ b/libavcodec/x86/vvc_itx_16bit.asm @@ -30,30 +30,126 @@ %if ARCH_X86_64 SECTION_RODATA 32 - -idct4_shuf: db 0, 1, 4, 5, 12, 13, 8, 9, 2, 3, 6, 7, 14, 15, 10, 11 +itx4_shuf: dd 0x509600, 0xd0c09284, 0x70603422, 0xf0e0b0a6 + dd 0x508901, 0xd0c09385, 0x70603523, 0xf0e0b1a7 +idct2_4_12_shuf: dd 0, 2, 4, 6, 1, 3, 5, 7 +idct2_4_12_shuf2: dd 2, 0, 6, 4, 3, 1, 7, 5 +iadst8_12_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7 +idct2_16_12_shuf: dd 0, 4, 1, 5, 3, 7, 2, 6 +iadst16_12_shuf: dd 3, 7, 0, 4, 2, 6, 1, 5 +vvc_pw_2048_m2048: dw 2048, 2048, 2048, 2048, -2048, -2048, -2048, -2048 +idct2_4_shuf: db 0, 1, 4, 5, 12, 13, 8, 9, 2, 3, 6, 7, 14, 15, 10, 11 +idct2_32_shuf: db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15 %macro COEF_PAIR 2-3 0 vvc_pd_%1_%2: dd %1, %1, %2, %2 %define vvc_pd_%1 (vvc_pd_%1_%2 + 4*0) %define vvc_pd_%2 (vvc_pd_%1_%2 + 4*2) +%if %3 +dd -%2, -%2 +%define vvc_pd_%2_m%2 vvc_pd_%2 +%endif %endmacro -COEF_PAIR 64, 36 -COEF_PAIR 64, 83 +COEF_PAIR 201, 995 +COEF_PAIR 9, 43 +COEF_PAIR 18, 75 +COEF_PAIR 1380, 601 +COEF_PAIR 1751, 2440 +COEF_PAIR 57, 25 +COEF_PAIR 2751, 2106 +COEF_PAIR 64, 36, 1 +COEF_PAIR 64, 83, 1 +COEF_PAIR 3035, 3513 +COEF_PAIR 70, 87 +COEF_PAIR 3703, 3290 +COEF_PAIR 3857, 4052 +COEF_PAIR 89, 50 +COEF_PAIR 90, 80 +COEF_PAIR 4091, 3973 + +vvc_pd_8: dd 8 +vvc_pd_m601: dd -601 +vvc_pd_m25: dd -25 +vvc_pd_m1380: dd -1380 +vvc_pd_m2106: dd -2106 +vvc_pd_m57: dd -57 +vvc_pd_m2751: dd -2751 +vvc_pd_m3344: dd -3344 +vvc_pd_1024: dd 1024 +vvc_pd_1321: dd 1321 +vvc_pd_1448: dd 1448 +vvc_pd_1697: dd 1697 +vvc_pd_2482: dd 2482 +vvc_pd_3072: dd 3072 ; 1024 + 2048 +vvc_pd_3803: dd 3803 +vvc_pd_5119: dd 5119 ; 1024 + 4096 - 1 +vvc_pd_5120: dd 5120 ; 1024 + 4096 +vvc_pd_5793: dd 5793 +vvc_pd_6144: dd 6144 ; 2048 + 4096 +vvc_pd_17408: dd 17408 ; 1024 + 16384 coeff_min_15: times 2 dw -0x8000 
coeff_max_15: times 2 dw 0x7fff +pixel_10_max: times 2 dw 0x03ff +pixel_12_max: times 2 dw 0x0fff dconly_10: times 2 dw 0x7c00 +dconly_12: times 2 dw 0x7000 +clip_18b_min: dd -0x20000 +clip_18b_max: dd 0x1ffff +clip_20b_min: dd -0x80000 +clip_20b_max: dd 0x7ffff +const idct2_64_mul_16 +dd 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474, 9, 90, 18, 89 +dd -700, 4036, 2359, 3349, -2191, 3461, 897, 3996, -57, -70, -89, -18 +dd 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092, 43, 80, 75, 50 +dd -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889, -25, -87, -50, -75 + +cextern deint_shuf +cextern idct2_64_mul +cextern vvc_pw_1697x8 +cextern vvc_pw_1697x16 cextern vvc_pw_36_83 cextern vvc_pw_m36_m83 cextern vvc_pw_m83_36 cextern vvc_pw_64_64 cextern vvc_pw_m64_64 -cextern vvc_pw_512 - +cextern vvc_pw_5 +cextern vvc_pw_2048 +cextern vvc_pw_4096 +cextern vvc_pw_8192 +cextern vvc_pw_16384 +cextern vvc_pw_64x8 cextern vvc_pd_512 +cextern vvc_pd_2048 + +cextern idct2_4x8_internal_8_avx2.main +cextern idct2_4x16_internal_8_avx2.main +cextern idct2_8x8_internal_8_avx2.main +cextern idct2_8x16_internal_8_avx2.main +cextern idct2_16x4_internal_8_avx2.main +cextern idct2_16x8_internal_8_avx2.main +cextern idct2_16x16_internal_8_avx2.main +cextern vvc_inv_dct2_dct2_8x32_8_avx2.main +cextern vvc_inv_dct2_dct2_8x32_8_avx2.main_fast +cextern vvc_inv_dct2_dct2_16x32_8_avx2.main_oddhalf +cextern vvc_inv_dct2_dct2_16x32_8_avx2.main_oddhalf_fast +cextern vvc_inv_dct2_dct2_16x64_8_avx2.main_part1 +cextern vvc_inv_dct2_dct2_16x64_8_avx2.main_part2_internal + +cextern iadst_4x4_internal_8_avx2.main +cextern iadst_4x8_internal_8_avx2.main_pass2 +cextern iadst_4x16_internal_8_avx2.main2 +cextern iadst_8x4_internal_8_avx2.main +cextern iadst_8x8_internal_8_avx2.main_pass2 +cextern iadst_8x16_internal_8_avx2.main +cextern iadst_8x16_internal_8_avx2.main_pass2_end +cextern iadst_16x4_internal_8_avx2.main +cextern iadst_16x8_internal_8_avx2.main +cextern iadst_16x8_internal_8_avx2.main_pass2_end 
+cextern iadst_16x16_internal_8_avx2.main +cextern iadst_16x16_internal_8_avx2.main_pass2_end SECTION .text @@ -65,14 +161,71 @@ SECTION .text INIT_YMM cpuname %endmacro +%macro IWHT4_1D_PACKED 0 + ; m0 = in0 in2, m1 = in1 in3 + psubd m2, m0, m1 ; t2 + paddd xm0, xm1 ; t0 + vpermq m2, m2, q3322 + vpermq m0, m0, q1100 + vpermq m1, m1, q3120 + psubd m3, m0, m2 + psrad m3, 1 + psubd m3, m1 ; t1 t3 + psubd m0, m3 ; ____ out0 + paddd m2, m3 ; out3 ____ +%endmacro + INIT_YMM avx2 +cglobal vvc_inv_wht_wht_4x4_16, 3, 7, 6, dst, stride, c, eob, bdmax + mova xm0, [cq+16*0] + vinserti128 m0, [cq+16*2], 1 + mova xm1, [cq+16*1] + vinserti128 m1, [cq+16*3], 1 + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + lea r6, [dstq+strideq*2] + psrad m0, 2 + psrad m1, 2 + IWHT4_1D_PACKED + punpckhdq m0, m3 + punpckldq m3, m2 + punpckhqdq m1, m0, m3 + punpcklqdq m0, m3 + IWHT4_1D_PACKED + vpblendd m0, m2, 0x33 + packssdw m0, m3 + vextracti128 xm2, m0, 1 + punpckhdq xm1, xm0, xm2 ; out2 out1 + punpckldq xm0, xm2 ; out3 out0 + movq xm2, [r6 +strideq*1] + movhps xm2, [dstq+strideq*0] + movq xm3, [r6 +strideq*0] + movhps xm3, [dstq+strideq*1] +%ifidn bdmaxd, bdmaxm + movd xm5, bdmaxd + vpbroadcastw xm5, xm5 +%else ; win64: load from stack + vpbroadcastw xm5, bdmaxm +%endif + paddsw xm0, xm2 + paddsw xm1, xm3 + pmaxsw xm0, xm4 + pmaxsw xm1, xm4 + pminsw xm0, xm5 + pminsw xm1, xm5 + movhps [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm1 + movq [r6 +strideq*0], xm1 + movq [r6 +strideq*1], xm0 + RET ; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 7 ; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 7 -; flags: 1 = packed, 2 = inv_dst2 +; flags: 1 = packed, 2 = inv_dst2, 4 = coef1 is reg, 8 = coef2 is reg ; skip round/shift if rnd is not a number %macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags -%if %8 < 32 +%if %9 & 8 pmulld m%4, m%1, m%8 pmulld m%3, m%2, m%8 %else @@ -84,7 +237,7 @@ INIT_YMM avx2 pmulld m%4, m%1, m%3 pmulld m%3, m%2 %endif -%if %7 < 32 +%if %9 
& 4 pmulld m%1, m%7 pmulld m%2, m%7 %else @@ -211,11 +364,14 @@ ALIGN function_align %endmacro INV_TXFM_4X4_FN dct2, dct2 +INV_TXFM_4X4_FN dct2, identity +INV_TXFM_4X4_FN dct2, adst +INV_TXFM_4X4_FN dct2, flipadst cglobal idct2_4x4_internal_10, 0, 8, 6, dst, c, eob, l2tr, stride, tx2 mov strideq, 8 call .main - vbroadcasti128 m2, [idct4_shuf] + vbroadcasti128 m2, [idct2_4_shuf] packssdw m0, m1 pshufb m0, m2 jmp tx2q @@ -224,8 +380,6 @@ cglobal idct2_4x4_internal_10, 0, 8, 6, dst, c, eob, l2tr, stride, tx2 vpbroadcastd xm5, [vvc_pd_512] WRAP_XMM IDCT2_4_1D_PACKED_WORD 0, 1, 2, 3, 4, 5 lea r7, [dstq+strideq*2] - - movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 movhps [r7 +strideq*0], xm1 @@ -240,4 +394,8206 @@ ALIGN function_align IDCT2_4_1D_PACKED 0, 1, 2, 3, 4, 5 ret +INV_TXFM_4X4_FN adst, dct2 +INV_TXFM_4X4_FN adst, adst +INV_TXFM_4X4_FN adst, flipadst +INV_TXFM_4X4_FN adst, identity + +%macro IADST4_1D 0 + vpbroadcastd m5, [vvc_pd_1321] + vpbroadcastd m7, [vvc_pd_2482] + pmulld m4, m0, m5 ; 1321*in0 + pmulld m6, m3, m7 ; 2482*in3 + paddd m4, m6 ; 1321*in0 + 2482*in3 + pmulld m6, m0, m7 ; 2482*in0 + paddd m0, m3 ; in0 + in3 + paddd m7, m5 ; vvc_pd_3803 + pmulld m5, m2 ; 1321*in2 + pmulld m3, m7 ; 3803*in3 + pmulld m7, m2 ; 3803*in2 + psubd m2, m0 ; in2 - in0 - in3 + vpbroadcastd m0, [vvc_pd_m3344] + pmulld m1, m0 ; -t3 + pmulld m2, m0 ; out2 (unrounded) + psubd m6, m5 ; 2482*in0 - 1321*in2 + paddd m4, m7 ; t0 + psubd m6, m3 ; t1 + paddd m3, m4, m6 + psubd m4, m1 ; out0 (unrounded) + psubd m6, m1 ; out1 (unrounded) + paddd m3, m1 ; out3 (unrounded) +%endmacro + +cglobal iadst_4x4_internal_10, 0, 7, 6, dst, stride, c, eob, tx2 + call .main + vinserti128 m0, m4, xm6, 1 + vinserti128 m1, m2, xm3, 1 +.pass1_end: + vpbroadcastd m5, [vvc_pd_2048] + mova m2, [itx4_shuf] + paddd m0, m5 + paddd m1, m5 + psrad m0, 12 + psrad m1, 12 + packssdw m0, m1 + vpermd m0, m2, m0 + psrld m2, 4 + pshufb m0, m2 +%if WIN64 + movaps xmm6, [rsp+ 8] + movaps xmm7, [rsp+24] +%endif + 
jmp tx2q +.pass2: + lea r6, [deint_shuf+128] + vextracti128 xm1, m0, 1 + call m(iadst_4x4_internal_8).main +.end: + vpbroadcastd xm4, [vvc_pw_2048] + movq xm2, [dstq+strideq*0] + movhps xm2, [dstq+strideq*1] + lea r6, [dstq+strideq*2] + movq xm3, [r6 +strideq*0] + movhps xm3, [r6 +strideq*1] + vpbroadcastd xm5, [pixel_10_max] + pmulhrsw xm0, xm4 + pmulhrsw xm1, xm4 + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + paddw xm0, xm2 + paddw xm1, xm3 + pmaxsw xm0, xm4 + pmaxsw xm1, xm4 + pminsw xm0, xm5 + pminsw xm1, xm5 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [r6 +strideq*0], xm1 + movhps [r6 +strideq*1], xm1 + RET +ALIGN function_align +.main: + mova xm0, [cq+16*0] + mova xm1, [cq+16*1] + mova xm2, [cq+16*2] + mova xm3, [cq+16*3] +%if WIN64 + movaps [rsp+16], xmm6 + movaps [rsp+32], xmm7 +%endif +.main2: + WRAP_XMM IADST4_1D + ret + +INV_TXFM_4X4_FN flipadst, dct2 +INV_TXFM_4X4_FN flipadst, adst +INV_TXFM_4X4_FN flipadst, flipadst +INV_TXFM_4X4_FN flipadst, identity + +cglobal iflipadst_4x4_internal_10, 0, 7, 6, dst, stride, c, eob, tx2 + call m(iadst_4x4_internal_10).main + vinserti128 m0, m3, xm2, 1 + vinserti128 m1, m6, xm4, 1 + jmp m(iadst_4x4_internal_10).pass1_end +.pass2: + lea r6, [deint_shuf+128] + vextracti128 xm1, m0, 1 + call m(iadst_4x4_internal_8).main + vpbroadcastd xm4, [vvc_pw_2048] + movq xm3, [dstq+strideq*1] + movhps xm3, [dstq+strideq*0] + lea r6, [dstq+strideq*2] + movq xm2, [r6 +strideq*1] + movhps xm2, [r6 +strideq*0] + vpbroadcastd xm5, [pixel_10_max] + pmulhrsw xm0, xm4 + pmulhrsw xm1, xm4 + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + paddw xm0, xm2 + paddw xm1, xm3 + pmaxsw xm0, xm4 + pmaxsw xm1, xm4 + pminsw xm0, xm5 + pminsw xm1, xm5 + movhps [dstq+strideq*0], xm1 + movq [dstq+strideq*1], xm1 + movhps [r6 +strideq*0], xm0 + movq [r6 +strideq*1], xm0 + RET + +INV_TXFM_4X4_FN identity, dct2 +INV_TXFM_4X4_FN identity, adst +INV_TXFM_4X4_FN identity, flipadst +INV_TXFM_4X4_FN identity, identity + 
+cglobal iidentity_4x4_internal_10, 0, 7, 6, dst, stride, c, eob, tx2 + vpbroadcastd m1, [vvc_pd_5793] + pmulld m0, m1, [cq+32*0] + pmulld m1, [cq+32*1] + vpbroadcastd m5, [vvc_pd_2048] + mova m3, [itx4_shuf] + paddd m0, m5 + paddd m1, m5 + psrad m0, 12 + psrad m1, 12 + packssdw m0, m1 + vpermd m0, m3, m0 + psrld m3, 4 + pshufb m0, m3 + jmp tx2q +.pass2: + vpbroadcastd m1, [vvc_pw_1697x8] + movq xm2, [dstq+strideq*0] + movhps xm2, [dstq+strideq*1] + lea r6, [dstq+strideq*2] + pmulhrsw m1, m0 + paddsw m0, m1 + movq xm3, [r6 +strideq*0] + movhps xm3, [r6 +strideq*1] + vpbroadcastd xm4, [pixel_10_max] + packssdw m5, m5 ; vvc_pw_2048 + pmulhrsw m0, m5 + pxor m5, m5 + mova [cq+32*0], m5 + mova [cq+32*1], m5 + vextracti128 xm1, m0, 1 + paddw xm0, xm2 + paddw xm1, xm3 + pmaxsw xm0, xm5 + pmaxsw xm1, xm5 + pminsw xm0, xm4 + pminsw xm1, xm4 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [r6 +strideq*0], xm1 + movhps [r6 +strideq*1], xm1 + RET + +INV_TXFM_4X4_FN dct2, dct2, 12 +INV_TXFM_4X4_FN dct2, identity, 12 +INV_TXFM_4X4_FN dct2, adst, 12 +INV_TXFM_4X4_FN dct2, flipadst, 12 + +cglobal idct2_4x4_internal_12, 0, 7, 8, dst, stride, c, eob, tx2 + call m(idct2_4x4_internal_10).main + mova m3, [idct2_4_12_shuf] + mova m4, [idct2_4_12_shuf2] + vpermd m2, m4, m1 + vpermd m1, m3, m0 + jmp m(iadst_4x4_internal_12).pass1_end2 +.pass2: + vpbroadcastd m5, [vvc_pd_2048] + vpermq m0, m0, q3120 + vpermq m1, m1, q3120 + call m(idct2_4x4_internal_10).main2 + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + jmp m(iadst_4x4_internal_12).end + +INV_TXFM_4X4_FN adst, dct2, 12 +INV_TXFM_4X4_FN adst, adst, 12 +INV_TXFM_4X4_FN adst, flipadst, 12 +INV_TXFM_4X4_FN adst, identity, 12 + +cglobal iadst_4x4_internal_12, 0, 7, 8, dst, stride, c, eob, tx2 + call m(iadst_4x4_internal_10).main + vinserti128 m1, m4, xm6, 1 + vinserti128 m2, xm3, 1 +.pass1_end: + mova m3, [itx4_shuf] + vpbroadcastd m5, [vvc_pd_1024] + psrad m1, 1 + psrad m2, 1 + vpermd m1, m3, m1 + vpermd m2, m3, m2 + 
paddd m1, m5 + paddd m2, m5 + psrad m1, 11 + psrad m2, 11 +.pass1_end2: + vpbroadcastd m3, [clip_18b_min] + vpbroadcastd m4, [clip_18b_max] + punpcklqdq m0, m1, m2 + punpckhqdq m1, m2 + pmaxsd m0, m3 + pmaxsd m1, m3 + pminsd m0, m4 + pminsd m1, m4 + jmp tx2q +.pass2: + call .main_pass2 + vinserti128 m0, m4, xm6, 1 + vinserti128 m1, m2, xm3, 1 +.pass2_end: + vpbroadcastd m5, [vvc_pd_2048] + paddd m0, m5 + paddd m1, m5 + psrad m0, 12 + psrad m1, 12 +.end: +%if WIN64 + WIN64_RESTORE_XMM_INTERNAL + %assign xmm_regs_used 6 +%endif +.end2: + vpbroadcastd m4, [vvc_pw_16384] + movq xm2, [dstq+strideq*0] + movq xm3, [dstq+strideq*1] + lea r6, [dstq+strideq*2] + movhps xm2, [r6 +strideq*0] ; dst0 dst2 + movhps xm3, [r6 +strideq*1] ; dst1 dst3 + vpbroadcastd m5, [pixel_12_max] + vinserti128 m2, xm3, 1 + psrad m0, 3 + psrad m1, 3 + packssdw m0, m1 ; t0 t2 t1 t3 + pmulhrsw m0, m4 + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + paddw m0, m2 ; out0 out2 out1 out3 + pmaxsw m0, m4 + pminsw m0, m5 + vextracti128 xm1, m0, 1 ; out1 out3 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [r6 +strideq*0], xm0 + movhps [r6 +strideq*1], xm1 + RET +.main_pass2: + vextracti128 xm3, m1, 1 + mova xm2, xm1 + vextracti128 xm1, m0, 1 + jmp m(iadst_4x4_internal_10).main2 + +INV_TXFM_4X4_FN flipadst, dct2, 12 +INV_TXFM_4X4_FN flipadst, adst, 12 +INV_TXFM_4X4_FN flipadst, flipadst, 12 +INV_TXFM_4X4_FN flipadst, identity, 12 + +cglobal iflipadst_4x4_internal_12, 0, 7, 8, dst, stride, c, eob, tx2 + call m(iadst_4x4_internal_10).main + vinserti128 m1, m3, xm2, 1 + vinserti128 m2, m6, xm4, 1 + jmp m(iadst_4x4_internal_12).pass1_end +.pass2: + call m(iadst_4x4_internal_12).main_pass2 + vinserti128 m0, m3, xm2, 1 + vinserti128 m1, m6, xm4, 1 + jmp m(iadst_4x4_internal_12).pass2_end + +INV_TXFM_4X4_FN identity, dct2, 12 +INV_TXFM_4X4_FN identity, adst, 12 +INV_TXFM_4X4_FN identity, flipadst, 12 +INV_TXFM_4X4_FN identity, identity, 12 + +cglobal iidentity_4x4_internal_12, 0, 7, 8, 
dst, stride, c, eob, tx2 + mova m2, [itx4_shuf] + vpbroadcastd m3, [vvc_pd_1697] + vpermd m0, m2, [cq+32*0] + vpermd m2, m2, [cq+32*1] + vpbroadcastd m5, [vvc_pd_2048] + pmulld m1, m3, m0 + pmulld m3, m2 + paddd m1, m5 + paddd m3, m5 + psrad m1, 12 + psrad m3, 12 + paddd m1, m0 + paddd m2, m3 + jmp m(iadst_4x4_internal_12).pass1_end2 +.pass2: + ; m0 = in0 in1 + ; m1 = in2 in3 + vpbroadcastd m3, [vvc_pd_5793] + vpbroadcastd m5, [vvc_pd_2048] + pmulld m0, m3 + pmulld m1, m3 + paddd m0, m5 ; 2048 + paddd m1, m5 + psrad m0, 12 + psrad m1, 12 + jmp m(iadst_4x4_internal_12).end + +%macro INV_TXFM_4X8_FN 2-3 10 ; type1, type2, bitdepth + INV_TXFM_FN %1, %2, 0, 4x8, %3 +%ifidn %1_%2, dct2_dct2 + vpbroadcastd xm2, [dconly_%3] +%if %3 = 10 +.dconly: + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 8 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + jmp m(vvc_inv_dct2_dct2_4x4_10).dconly2 +%else + jmp m(vvc_inv_dct2_dct2_4x8_10).dconly +%endif +%endif +%endmacro + +%macro IDCT2_4_1D 8 ; src[1-4], tmp[1-3], rnd + ITX_MULSUB_2D %2, %4, %5, %6, %7, %8, 36, 83 ; t2, t3 + vpbroadcastd m%5, [vvc_pd_64] + pmulld m%1, m%5 + pmulld m%3, m%5 + paddd m%1, m%8 + paddd m%5, m%1, m%3 + psubd m%1, m%3 + psrad m%5, 12 ; t0 + psrad m%1, 12 ; t1 + psubd m%3, m%1, m%2 + paddd m%2, m%1 + paddd m%1, m%5, m%4 + psubd m%4, m%5, m%4 +%endmacro + +INV_TXFM_4X8_FN dct2, dct2 +INV_TXFM_4X8_FN dct2, identity +INV_TXFM_4X8_FN dct2, adst +INV_TXFM_4X8_FN dct2, flipadst + +cglobal idct2_4x8_internal_10, 0, 7, 8, dst, stride, c, eob, tx2 +.pass1: + vpbroadcastd m3, [vvc_pd_64] + pmulld m0, m3, [cq+32*0] + pmulld m1, m3, [cq+32*1] + pmulld m2, m3, [cq+32*2] + pmulld m3, m3, [cq+32*3] + vpbroadcastd m7, [vvc_pd_2048] + REPX {paddd x, m7}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + IDCT2_4_1D 0, 1, 2, 3, 4, 5, 6, 7 + jmp tx2q +.pass2: + packssdw m0, m2 + packssdw m1, m3 + lea r6, [deint_shuf+128] + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhdq m1, m0, m2 ; 2 3 + punpckldq m0, m2 ; 0 1 + 
vextracti128 xm2, m0, 1 ; 4 5 + vextracti128 xm3, m1, 1 ; 6 7 + call m(idct2_4x8_internal_8).main + vpbroadcastd xm4, [vvc_pw_2048] + REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3 + lea r3, [strideq*3] + lea r6, [dstq+strideq*4] + movq xm4, [dstq+strideq*0] + movhps xm4, [dstq+strideq*1] + movq xm5, [dstq+r3 ] + movhps xm5, [dstq+strideq*2] + movq xm6, [r6 +strideq*0] + movhps xm6, [r6 +strideq*1] + movq xm7, [r6 +r3 ] + movhps xm7, [r6 +strideq*2] + paddw xm0, xm4 ; 0 1 + paddw xm1, xm5 ; 3 2 + paddw xm2, xm6 ; 4 5 + paddw xm3, xm7 ; 7 6 + vpbroadcastd xm5, [pixel_10_max] + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + REPX {pmaxsw x, xm4}, xm0, xm1, xm2, xm3 + REPX {pminsw x, xm5}, xm0, xm1, xm2, xm3 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movhps [dstq+strideq*2], xm1 + movq [dstq+r3 ], xm1 + movq [r6 +strideq*0], xm2 + movhps [r6 +strideq*1], xm2 + movhps [r6 +strideq*2], xm3 + movq [r6 +r3 ], xm3 + RET + +INV_TXFM_4X8_FN adst, dct2 +INV_TXFM_4X8_FN adst, adst +INV_TXFM_4X8_FN adst, flipadst +INV_TXFM_4X8_FN adst, identity + +cglobal iadst_4x8_internal_10, 0, 7, 8, dst, stride, c, eob, tx2 + call m(iadst_8x4_internal_10).main + vpbroadcastd m5, [vvc_pd_2048] + paddd m0, m5, m4 + paddd m1, m5, m6 + paddd m2, m5 + paddd m3, m5 +.pass1_end: + REPX {psrad x, 12}, m0, m1, m2, m3 + jmp tx2q +.pass2: + call .pass2_main + mova xm4, [vvc_pw_2048_m2048] + REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3 +.end: + lea r3, [strideq*3] + lea r6, [dstq+strideq*4] + movq xm4, [dstq+strideq*0] + movhps xm4, [dstq+strideq*1] + movq xm5, [dstq+strideq*2] + movhps xm5, [dstq+r3 ] + movq xm6, [r6 +strideq*0] + movhps xm6, [r6 +strideq*1] + movq xm7, [r6 +strideq*2] + movhps xm7, [r6 +r3 ] + paddw xm0, xm4 ; 0 1 + paddw xm1, xm5 ; 2 3 + paddw xm2, xm6 ; 4 5 + paddw xm3, xm7 ; 6 7 + vpbroadcastd xm5, [pixel_10_max] + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + REPX {pmaxsw x, xm4}, xm0, xm1, xm2, xm3 + REPX {pminsw x, xm5}, xm0, xm1, xm2, xm3 + movq 
[dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r3 ], xm1 + movq [r6 +strideq*0], xm2 + movhps [r6 +strideq*1], xm2 + movq [r6 +strideq*2], xm3 + movhps [r6 +r3 ], xm3 + RET +ALIGN function_align +.pass2_main: + packssdw m0, m2 + packssdw m1, m3 + lea r6, [deint_shuf+128] + punpcklwd m4, m0, m1 + punpckhwd m0, m1 + punpckhdq m5, m4, m0 + punpckldq m4, m0 + vextracti128 xm2, m4, 1 ; 4 5 + vextracti128 xm3, m5, 1 ; 6 7 + pshufd xm4, xm4, q1032 ; 1 0 + pshufd xm5, xm5, q1032 ; 3 2 + jmp m(iadst_4x8_internal_8).main_pass2 +ALIGN function_align +.main: + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] +.main2: + vbroadcasti128 m0, [cq+16*0] + vbroadcasti128 m2, [cq+16*2] + vbroadcasti128 m3, [cq+16*5] + vbroadcasti128 m1, [cq+16*7] + vpbroadcastd m6, [vvc_pd_64] + shufpd m0, m2, 0x0c ; 0 2 + shufpd m1, m3, 0x0c ; 7 5 + vbroadcasti128 m2, [cq+16*4] + vbroadcasti128 m4, [cq+16*6] + vbroadcasti128 m5, [cq+16*1] + vbroadcasti128 m3, [cq+16*3] + vpbroadcastd m7, [vvc_pd_2048] + shufpd m2, m4, 0x0c ; 4 6 + shufpd m3, m5, 0x0c ; 3 1 + REPX {pmulld x, m6}, m0, m1, m2, m3 + REPX {paddd x, m7}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 +.main3: + ITX_MULSUB_2D 1, 0, 4, 5, 6, 7, 9_43, 90_80, 1 + ITX_MULSUB_2D 3, 2, 4, 5, 6, 7, 70_87, 57_25, 1 + psubd m4, m0, m2 ; t4 t6 + paddd m0, m2 ; t0 t2 + psubd m2, m1, m3 ; t5 t7 + paddd m1, m3 ; t1 t3 + REPX {pmaxsd x, m8}, m4, m2, m0, m1 + REPX {pminsd x, m9}, m4, m2, m0, m1 + pxor m5, m5 + psubd m5, m4 + vpblendd m4, m2, 0xcc ; t4 t7 + vpblendd m2, m5, 0xcc ; t5 -t6 + ITX_MULSUB_2D 4, 2, 3, 5, 6, 7, 36, 83 + vpbroadcastd m5, [vvc_pd_64] + vbroadcasti128 m6, [vvc_pw_2048_m2048] ; + + - - + punpckhqdq m3, m0, m1 + punpcklqdq m0, m1 + psubd m1, m0, m3 ; t2 t3 + paddd m0, m3 ; out0 -out7 + punpckhqdq m3, m4, m2 ; t7a t6a + punpcklqdq m4, m2 ; t5a t4a + psubd m2, m4, m3 ; t7 t6 + paddd m4, m3 ; out6 -out1 + REPX {pmaxsd x, m8}, m1, m2 + REPX {pminsd x, m9}, m1, m2 + 
vpblendd m3, m1, m2, 0xcc + shufpd m1, m2, 0x05 + pmulld m3, m5 + pmulld m5, m1 + psignd m0, m6 ; out0 out7 + psignd m4, m6 ; out6 out1 + paddd m3, m7 + psubd m2, m3, m5 + paddd m5, m3 + psrad m2, 12 ; out4 -out5 + psrad m5, 12 ; -out3 out2 + ret + +INV_TXFM_4X8_FN flipadst, dct2 +INV_TXFM_4X8_FN flipadst, adst +INV_TXFM_4X8_FN flipadst, flipadst +INV_TXFM_4X8_FN flipadst, identity + +cglobal iflipadst_4x8_internal_10, 0, 7, 8, dst, stride, c, eob, tx2 + call m(iadst_8x4_internal_10).main + vpbroadcastd m5, [vvc_pd_2048] + paddd m0, m5, m3 + paddd m1, m5, m2 + paddd m2, m5, m6 + paddd m3, m5, m4 + jmp m(iadst_4x8_internal_10).pass1_end +.pass2: + call m(iadst_4x8_internal_10).pass2_main + mova xm4, [vvc_pw_2048_m2048] + REPX {pmulhrsw x, xm4}, xm3, xm2, xm1, xm0 + lea r3, [strideq*3] + lea r6, [dstq+strideq*4] + movq xm4, [dstq+strideq*1] + movhps xm4, [dstq+strideq*0] + movq xm5, [dstq+r3 ] + movhps xm5, [dstq+strideq*2] + movq xm6, [r6 +strideq*1] + movhps xm6, [r6 +strideq*0] + movq xm7, [r6 +r3 ] + movhps xm7, [r6 +strideq*2] + paddw xm3, xm4 ; 1 0 + paddw xm2, xm5 ; 3 2 + paddw xm1, xm6 ; 5 4 + paddw xm0, xm7 ; 7 6 + vpbroadcastd xm5, [pixel_10_max] + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + REPX {pmaxsw x, xm4}, xm3, xm2, xm1, xm0 + REPX {pminsw x, xm5}, xm3, xm2, xm1, xm0 + movhps [dstq+strideq*0], xm3 + movq [dstq+strideq*1], xm3 + movhps [dstq+strideq*2], xm2 + movq [dstq+r3 ], xm2 + movhps [r6 +strideq*0], xm1 + movq [r6 +strideq*1], xm1 + movhps [r6 +strideq*2], xm0 + movq [r6 +r3 ], xm0 + RET + +INV_TXFM_4X8_FN identity, dct2 +INV_TXFM_4X8_FN identity, adst +INV_TXFM_4X8_FN identity, flipadst +INV_TXFM_4X8_FN identity, identity + +cglobal iidentity_4x8_internal_10, 0, 7, 8, dst, stride, c, eob, tx2 +.pass1: + vpbroadcastd m3, [vvc_pd_64] + pmulld m0, m3, [cq+32*0] + pmulld m1, m3, [cq+32*1] + pmulld m2, m3, [cq+32*2] + pmulld m3, [cq+32*3] + vpbroadcastd m5, [vvc_pd_2048] + vpbroadcastd m4, [vvc_pd_5793] + REPX {paddd x, m5}, m0, m1, m2, m3 
+ REPX {psrad x, 12}, m0, m1, m2, m3 + REPX {pmulld x, m4}, m0, m1, m2, m3 + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + jmp tx2q +.pass2: + vpbroadcastd m6, [pixel_10_max] + call .pass2_end + RET +ALIGN function_align +.pass2_end: + vpbroadcastd m4, [vvc_pw_4096] + packssdw m0, m2 + packssdw m1, m3 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + pmulhrsw m2, m4 + pmulhrsw m0, m4 + punpckhdq m1, m0, m2 ; 2 3 6 7 + punpckldq m0, m2 ; 0 1 4 5 + lea r3, [strideq*3] + lea r6, [dstq+strideq*4] + movq xm2, [dstq+strideq*0] + movhps xm2, [dstq+strideq*1] + vpbroadcastq m4, [r6 +strideq*0] + vpbroadcastq m5, [r6 +strideq*1] + movq xm3, [dstq+strideq*2] + movhps xm3, [dstq+r3 ] + vpblendd m2, m4, 0x30 + vpblendd m2, m5, 0xc0 + vpbroadcastq m4, [r6 +strideq*2] + vpbroadcastq m5, [r6 +r3 ] + vpblendd m3, m4, 0x30 + vpblendd m3, m5, 0xc0 + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + paddw m0, m2 ; out0 out1 out4 out5 + paddw m1, m3 ; out2 out3 out6 out7 + pmaxsw m0, m4 + pmaxsw m1, m4 + pminsw m0, m6 + pminsw m1, m6 + vextracti128 xm2, m0, 1 ; out4 out5 + vextracti128 xm3, m1, 1 ; out6 out7 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r3 ], xm1 + movq [r6 +strideq*0], xm2 + movhps [r6 +strideq*1], xm2 + movq [r6 +strideq*2], xm3 + movhps [r6 +r3 ], xm3 + ret + +INV_TXFM_4X8_FN dct2, dct2, 12 +INV_TXFM_4X8_FN dct2, identity, 12 +INV_TXFM_4X8_FN dct2, adst, 12 +INV_TXFM_4X8_FN dct2, flipadst, 12 + +cglobal idct2_4x8_internal_12, 0, 7, 10, dst, stride, c, eob, tx2 + jmp m(idct2_4x8_internal_10).pass1 +.pass2: + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] + REPX {pmaxsd x, m8}, m0, m1, m2, m3 + REPX {pminsd x, m9}, m0, m1, m2, m3 + ; transpose & interleave + pshufd m0, m0, q1320 + pshufd m1, m1, q1320 + pshufd m2, m2, q1320 + pshufd m3, m3, q1320 + punpckldq m4, m0, m1 + punpckhdq m0, m1 + punpckldq m5, m2, m3 + punpckhdq m2, m3 + vpermq m0, m0, q3102 + vpermq m2, 
m2, q3102 + vperm2i128 m1, m0, m2, 0x31 ; 1 5 (interleaved) + vperm2i128 m3, m0, m2, 0x20 ; 7 3 (interleaved) + vperm2i128 m0, m4, m5, 0x20 ; 0 2 (interleaved) + vperm2i128 m2, m4, m5, 0x31 ; 4 6 (interleaved) + vpbroadcastd m7, [vvc_pd_2048] + call m(idct2_8x4_internal_10).main + psubd m3, m0, m4 ; out7 out6 + paddd m0, m4 ; out0 out1 + paddd m1, m2, m5 ; out3 out2 + psubd m2, m5 ; out4 out5 + pshufd m1, m1, q1032 + pshufd m3, m3, q1032 + jmp m(iadst_4x8_internal_12).end + +INV_TXFM_4X8_FN adst, dct2, 12 +INV_TXFM_4X8_FN adst, adst, 12 +INV_TXFM_4X8_FN adst, flipadst, 12 +INV_TXFM_4X8_FN adst, identity, 12 + +cglobal iadst_4x8_internal_12, 0, 7, 10, dst, stride, c, eob, tx2 + call m(iadst_8x4_internal_10).main + psrad m0, m4, 1 + psrad m1, m6, 1 + psrad m2, 1 + psrad m3, 1 +.pass1_end: + vpbroadcastd m5, [vvc_pd_1024] + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 11}, m0, m1, m2, m3 + jmp tx2q +.pass2: + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] + REPX {pmaxsd x, m8}, m0, m1, m2, m3 + REPX {pminsd x, m9}, m0, m1, m2, m3 + call .pass2_main + vpblendd m3, m0, m4, 0x33 ; out6 out7 + vpblendd m0, m4, 0xcc ; out0 out1 + pshufd m1, m5, q1032 + psignd m2, m6 ; out4 out5 + psignd m1, m6 ; out2 out3 +.end: + vpbroadcastd m4, [vvc_pw_16384] + REPX {psrad x, 3}, m0, m1, m2, m3 + packssdw m0, m2 ; 0 1 4 5 (interleaved) + packssdw m1, m3 ; 2 3 6 7 (interleaved) + mova m2, [iadst8_12_shuf] + vpermd m0, m2, m0 ; 0 1 4 5 + vpermd m1, m2, m1 ; 2 3 6 7 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + lea r3, [strideq*3] + lea r6, [dstq+strideq*4] + movq xm4, [dstq+strideq*0] + movhps xm4, [dstq+strideq*1] + movq xm5, [dstq+strideq*2] + movhps xm5, [dstq+r3 ] + movq xm6, [r6 +strideq*0] + movhps xm6, [r6 +strideq*1] + vinserti128 m4, xm6, 1 + movq xm7, [r6 +strideq*2] + movhps xm7, [r6 +r3 ] + vinserti128 m5, xm7, 1 + paddw m0, m4 ; 0 1 4 5 + paddw m1, m5 ; 2 3 6 7 + vpbroadcastd m5, [pixel_12_max] + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + REPX 
{pmaxsw x, m4}, m0, m1 + REPX {pminsw x, m5}, m0, m1 + vextracti128 xm2, m0, 1 ; out4 out5 + vextracti128 xm3, m1, 1 ; out6 out7 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r3 ], xm1 + movq [r6 +strideq*0], xm2 + movhps [r6 +strideq*1], xm2 + movq [r6 +strideq*2], xm3 + movhps [r6 +r3 ], xm3 + RET +ALIGN function_align +.pass2_main: + ; transpose & interleave + pshufd m0, m0, q1320 + pshufd m1, m1, q1320 + pshufd m2, m2, q1320 + pshufd m3, m3, q1320 + punpckldq m4, m0, m1 + punpckhdq m0, m1 + punpckldq m5, m2, m3 + punpckhdq m2, m3 + vperm2i128 m1, m0, m2, 0x31 ; 7 5 (interleaved) + vperm2i128 m3, m0, m2, 0x20 ; 3 1 (interleaved) + vperm2i128 m0, m4, m5, 0x20 ; 0 2 (interleaved) + vperm2i128 m2, m4, m5, 0x31 ; 4 6 (interleaved) + vpbroadcastd m7, [vvc_pd_2048] + jmp m(iadst_4x8_internal_10).main3 + +INV_TXFM_4X8_FN flipadst, dct2, 12 +INV_TXFM_4X8_FN flipadst, adst, 12 +INV_TXFM_4X8_FN flipadst, flipadst, 12 +INV_TXFM_4X8_FN flipadst, identity, 12 + +cglobal iflipadst_4x8_internal_12, 0, 7, 10, dst, stride, c, eob, tx2 + call m(iadst_8x4_internal_10).main + psrad m0, m3, 1 + psrad m1, m2, 1 + psrad m2, m6, 1 + psrad m3, m4, 1 + jmp m(iadst_4x8_internal_12).pass1_end +.pass2: + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] + REPX {pmaxsd x, m8}, m0, m1, m2, m3 + REPX {pminsd x, m9}, m0, m1, m2, m3 + call m(iadst_4x8_internal_12).pass2_main + shufpd m3, m4, m0, 0x05 ; out1 out0 + shufpd m0, m4, 0x05 ; out7 out6 + psignd m2, m6 + pshufd m6, m6, q1032 + pshufd m1, m2, q1032 ; out5 out4 + psignd m2, m5, m6 ; out3 out2 + jmp m(iadst_4x8_internal_12).end + +INV_TXFM_4X8_FN identity, dct2, 12 +INV_TXFM_4X8_FN identity, adst, 12 +INV_TXFM_4X8_FN identity, flipadst, 12 +INV_TXFM_4X8_FN identity, identity, 12 + +cglobal iidentity_4x8_internal_12, 0, 7, 10, dst, stride, c, eob, tx2 + jmp m(iidentity_4x8_internal_10).pass1 +.pass2: + ; m0 = in0 in1 + ; m1 = in2 in3 + ; m2 = in4 in5 + ; m3 = in6 in7 
+ vpbroadcastd m6, [pixel_12_max] + call m(iidentity_4x8_internal_10).pass2_end + RET + +%macro INV_TXFM_4X16_FN 2-3 10 ; type1, type2, bitdepth + INV_TXFM_FN %1, %2, 0, 4x16, %3 +%ifidn %1_%2, dct2_dct2 + imul r6d, [cq], 181 + vpbroadcastd xm2, [dconly_%3] + mov [cq], eobd ; 0 + or r3d, 16 + add r6d, 384 + sar r6d, 9 + jmp m(vvc_inv_dct2_dct2_4x4_10).dconly3 +%endif +%endmacro + +INV_TXFM_4X16_FN dct2, dct2 +INV_TXFM_4X16_FN dct2, identity +INV_TXFM_4X16_FN dct2, adst +INV_TXFM_4X16_FN dct2, flipadst + +cglobal idct2_4x16_internal_10, 0, 7, 11, dst, stride, c, eob, tx2 +.pass1: + vpbroadcastd m10, [vvc_pd_3072] + mova m1, [cq+32*2] + mova m3, [cq+32*6] + mova m5, [cq+32*3] + mova m7, [cq+32*7] + call .pass1_main + pmulld m0, m6, [cq+32*0] + pmulld m2, m6, [cq+32*4] + pmulld m4, m6, [cq+32*1] + pmulld m6, [cq+32*5] + call .pass1_main2 + REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7 + jmp tx2q +.pass2: + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + lea r6, [deint_shuf+128] + punpcklwd m4, m2, m3 + punpckhwd m2, m3 + punpckhwd m5, m0, m1 + punpcklwd m0, m1 + punpckhdq m1, m0, m4 ; 2 3 + punpckldq m0, m4 ; 0 1 + punpckldq m4, m5, m2 ; 8 9 + punpckhdq m5, m2 ; a b + vextracti128 xm2, m0, 1 ; 4 5 + vextracti128 xm3, m1, 1 ; 6 7 + vextracti128 xm6, m4, 1 ; c d + vextracti128 xm7, m5, 1 ; e f + call m(idct2_4x16_internal_8).main + vpbroadcastd m9, [vvc_pw_2048] + vinserti128 m0, m0, xm1, 1 ; 0 1 3 2 + vinserti128 m1, m2, xm3, 1 ; 4 5 7 6 + vinserti128 m2, m4, xm5, 1 ; 8 9 b a + vinserti128 m3, m6, xm7, 1 ; c d f e + vpbroadcastd m8, [pixel_10_max] + call .pass2_end + RET +ALIGN function_align +.pass1_main: + vpbroadcastd m4, [vvc_pd_83] + vpbroadcastd m8, [vvc_pd_36] + vpbroadcastd m9, [vvc_pd_2048] + vpbroadcastd m6, [vvc_pd_1448] + ITX_MULSUB_2D 1, 3, 0, 2, _, 9, 8, 4, 0xc ; t2l, t3l + ITX_MULSUB_2D 5, 7, 4, 2, _, 9, 8, 4, 0xc ; t2h, t3h + ret +ALIGN function_align +.pass1_main2: + paddd m0, m10 + paddd m4, m10 + paddd m8, m0, m2 + psubd 
m0, m2 + paddd m9, m4, m6 + psubd m4, m6 + REPX {psrad x, 11}, m8, m0, m9, m4 ; t0l, t1l, t0h, t1h + psubd m2, m0, m1 + paddd m1, m0 + psubd m6, m4, m5 + paddd m5, m4 + paddd m0, m8, m3 + psubd m3, m8, m3 + paddd m4, m9, m7 + psubd m7, m9, m7 + ret +ALIGN function_align +.pass2_end: + lea r6, [strideq*3] + pxor m7, m7 + pmulhrsw m0, m9 + call .write_4x4 + pmulhrsw m0, m1, m9 + call .write_4x4 + pmulhrsw m0, m2, m9 + call .write_4x4 + pmulhrsw m0, m3, m9 + call .write_4x4 + ret +ALIGN function_align +.write_4x4: + movq xm4, [dstq+strideq*0] + movhps xm4, [dstq+strideq*1] + vpbroadcastq m5, [dstq+strideq*2] + vpbroadcastq m6, [dstq+r6 ] + mova [cq+32*0], m7 + mova [cq+32*1], m7 + add cq, 32*2 + vpblendd m4, m5, 0xc0 + vpblendd m4, m6, 0x30 + paddw m4, m0 + pmaxsw m4, m7 + pminsw m4, m8 + vextracti128 xm5, m4, 1 + movq [dstq+strideq*0], xm4 + movhps [dstq+strideq*1], xm4 + movhps [dstq+strideq*2], xm5 + movq [dstq+r6 ], xm5 + lea dstq, [dstq+strideq*4] + ret + +INV_TXFM_4X16_FN adst, dct2 +INV_TXFM_4X16_FN adst, adst +INV_TXFM_4X16_FN adst, flipadst +INV_TXFM_4X16_FN adst, identity + +cglobal iadst_4x16_internal_10, 0, 7, 11, dst, stride, c, eob, tx2 + call m(iadst_16x4_internal_10).main + vpbroadcastd m6, [vvc_pd_6144] + call m(iadst_16x4_internal_10).main_end + psrad m0, m4, 13 + psrad m1, m5, 13 + psrad m2, 13 + psrad m3, 13 + psrad m4, m8, 13 + psrad m5, m9, 13 + psrad m6, 13 + psrad m7, 13 + jmp tx2q +.pass2: + call .pass2_main + vpbroadcastd m5, [vvc_pw_2048] + vpbroadcastd m8, [pixel_10_max] + lea r6, [strideq*3] + vpblendd m4, m3, m0, 0xcc ; -out3 out0 out2 -out1 + pshufd m2, m2, q1032 ; -out11 out8 out10 -out9 + vpblendd m3, m0, 0x33 ; -out15 out12 out14 -out13 + pxor m7, m7 + psubw m9, m7, m5 + vpblendd m9, m5, 0x3c ; -2048 2048 2048 -2048 + pmulhrsw m0, m4, m9 + call .write_4x4 + pmulhrsw m0, m1, m9 + call .write_4x4 + pmulhrsw m0, m2, m9 + call .write_4x4 + pmulhrsw m0, m3, m9 + call .write_4x4 + RET +ALIGN function_align +.write_4x4: + movq xm4, [dstq+r6 
] + movhps xm4, [dstq+strideq*0] + vpbroadcastq m5, [dstq+strideq*1] + vpbroadcastq m6, [dstq+strideq*2] + mova [cq+32*0], m7 + mova [cq+32*1], m7 + add cq, 32*2 + vpblendd m4, m5, 0xc0 + vpblendd m4, m6, 0x30 + paddw m4, m0 + pmaxsw m4, m7 + pminsw m4, m8 + vextracti128 xm5, m4, 1 + movhps [dstq+strideq*0], xm4 + movhps [dstq+strideq*1], xm5 + movq [dstq+strideq*2], xm5 + movq [dstq+r6 ], xm4 + lea dstq, [dstq+strideq*4] + ret +ALIGN function_align +.pass2_main: + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + lea r6, [deint_shuf+128] + punpcklwd m4, m2, m3 + punpckhwd m2, m3 + punpckhwd m5, m0, m1 + punpcklwd m0, m1 + punpckhdq m1, m0, m4 + punpckldq m0, m4 + punpckldq m4, m5, m2 + punpckhdq m5, m2 + vpblendd m3, m0, m1, 0x33 + vpblendd m0, m1, 0xcc + shufpd m2, m5, m4, 0x05 + shufpd m4, m5, 0x05 + vperm2i128 m1, m0, m3, 0x31 ; 4 7 6 5 + vinserti128 m0, xm3, 1 ; 0 3 2 1 + vperm2i128 m3, m2, m4, 0x31 ; c f e d ; ???? + vinserti128 m2, xm4, 1 ; b 8 9 a + call m(iadst_4x16_internal_8).main2 + vpbroadcastd m5, [vvc_pw_64x8] + paddsw m1, m2, m4 + psubsw m2, m4 + pmulhrsw m1, m5 ; -out7 out4 out6 -out5 + pmulhrsw m2, m5 ; out8 -out11 -out9 out10 + ret +ALIGN function_align +.main: + vbroadcasti128 m0, [cq+16* 0] + vbroadcasti128 m4, [cq+16* 2] + vbroadcasti128 m1, [cq+16*15] + vbroadcasti128 m5, [cq+16*13] + vbroadcasti128 m2, [cq+16* 4] + vbroadcasti128 m6, [cq+16* 6] + vbroadcasti128 m3, [cq+16*11] + vbroadcasti128 m7, [cq+16* 9] + shufpd m0, m4, 0x0c ; 0 2 + shufpd m1, m5, 0x0c ; 15 13 + shufpd m2, m6, 0x0c ; 4 6 + shufpd m3, m7, 0x0c ; 11 9 + vbroadcasti128 m4, [cq+16* 8] + vbroadcasti128 m6, [cq+16*10] + vbroadcasti128 m5, [cq+16* 7] + vbroadcasti128 m7, [cq+16* 5] + shufpd m4, m6, 0x0c ; 8 10 + shufpd m5, m7, 0x0c ; 7 5 + vbroadcasti128 m6, [cq+16*12] + vbroadcasti128 m7, [cq+16*14] + shufpd m6, m7, 0x0c ; 12 14 + vbroadcasti128 m7, [cq+16* 3] + vbroadcasti128 m8, [cq+16* 1] + shufpd m7, m8, 0x0c ; 3 1 +.main2: + ; expects: m12 = clip_min 
m13 = clip_max + vpbroadcastd m11, [vvc_pd_2048] + ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 201_995, 4091_3973, 1 + ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 1751_2440, 3703_3290, 1 + ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3035_3513, 2751_2106, 1 + ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 3857_4052, 1380_601, 1 + psubd m8, m0, m4 ; t8a t10a + paddd m0, m4 ; t0a t2a + psubd m4, m1, m5 ; t9a t11a + paddd m1, m5 ; t1a t3a + psubd m5, m2, m6 ; t12a t14a + paddd m2, m6 ; t4a t6a + psubd m6, m3, m7 ; t13a t15a + paddd m3, m7 ; t5a t7a + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m8 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m8 + ITX_MULSUB_2D 8, 4, 7, 9, 10, 11, 18_75, 89_50, 1 + ITX_MULSUB_2D 6, 5, 7, 9, 10, 11, 89_50, 10, 0x9 + psubd m7, m0, m2 ; t4 t6 + paddd m0, m2 ; t0 t2 + psubd m2, m1, m3 ; t5 t7 + paddd m1, m3 ; t1 t3 + psubd m3, m4, m6 ; t12a t14a + paddd m4, m6 ; t8a t10a + psubd m6, m8, m5 ; t13a t15a + paddd m8, m5 ; t9a t11a + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m6, m7, m8 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m6, m7, m8 + punpcklqdq m5, m3, m7 ; t12a t4 + punpckhqdq m3, m7 ; t14a t6 + punpckhqdq m7, m6, m2 ; t15a t7 + punpcklqdq m6, m2 ; t13a t5 + ITX_MULSUB_2D 7, 3, 2, 9, 10, 11, 83, 36, 0x0 + ITX_MULSUB_2D 5, 6, 2, 9, 10, 11, 36, 10, 0x8 + vpbroadcastd m10, [vvc_pd_64] + vbroadcasti128 m9, [vvc_pw_2048_m2048] ; + + - - + punpckhqdq m2, m4, m0 ; t10a t2 + punpcklqdq m4, m0 ; t8a t0 + punpckhqdq m0, m8, m1 ; t11a t3 + punpcklqdq m8, m1 ; t9a t1 + paddd m1, m6, m7 ; out2 -out3 + psubd m6, m7 ; t14a t6 + paddd m7, m5, m3 ; -out13 out12 + psubd m5, m3 ; t15a t7 + psubd m3, m8, m0 ; t11 t3a + paddd m8, m0 ; out14 -out15 + paddd m0, m4, m2 ; -out1 out0 + psubd m4, m2 ; t10 t2a + REPX {pmaxsd x, m12}, m6, m5, m3, m4 + REPX {pminsd x, m13}, m6, m5, m3, m4 + REPX {pmulld x, m10}, m6, m5, m3, m4 + paddd m6, m11 + paddd m4, m11 + paddd m2, m6, m5 ; -out5 out4 + psubd m6, m5 ; out10 -out11 + psubd m5, m4, m3 ; -out9 out8 + paddd m3, m4 ; out6 -out7 + REPX {psrad x, 
12}, m2, m3, m5, m6 + REPX {psignd x, m9}, m1, m8, m3, m6 + pshufd m9, m9, q1032 + REPX {psignd x, m9}, m0, m7, m2, m5 + ret + +INV_TXFM_4X16_FN flipadst, dct2 +INV_TXFM_4X16_FN flipadst, adst +INV_TXFM_4X16_FN flipadst, flipadst +INV_TXFM_4X16_FN flipadst, identity + +cglobal iflipadst_4x16_internal_10, 0, 7, 11, dst, stride, c, eob, tx2 +.pass1: + call m(iadst_16x4_internal_10).main + vpbroadcastd m6, [vvc_pd_6144] + call m(iadst_16x4_internal_10).main_end + psrad m0, m3, 13 + psrad m1, m2, 13 + psrad m2, m5, 13 + psrad m3, m4, 13 + psrad m4, m7, 13 + psrad m5, m6, 13 + psrad m6, m9, 13 + psrad m7, m8, 13 + jmp tx2q +.pass2: + call m(iadst_4x16_internal_10).pass2_main + vpbroadcastd m5, [vvc_pw_2048] + vpbroadcastd m8, [pixel_10_max] + lea r6, [strideq*3] + vpblendd m4, m3, m0, 0x33 ; -out0 out3 out1 -out2 + pshufd m2, m2, q1032 ; -out11 out8 out10 -out9 + vpblendd m3, m0, 0xcc ; -out12 out15 out13 -out14 + pxor m7, m7 + psubw m9, m7, m5 + vpblendd m9, m5, 0x3c ; -2048 2048 2048 -2048 + pmulhrsw m0, m4, m9 + call .write_4x4 + pmulhrsw m0, m2, m9 + call .write_4x4 + pmulhrsw m0, m1, m9 + call .write_4x4 + pmulhrsw m0, m3, m9 + call .write_4x4 + RET +ALIGN function_align +.write_4x4: + movq xm4, [dstq+strideq*0] + movhps xm4, [dstq+r6 ] + vpbroadcastq m5, [dstq+strideq*1] + vpbroadcastq m6, [dstq+strideq*2] + mova [cq+32*0], m7 + mova [cq+32*1], m7 + add cq, 32*2 + vpblendd m4, m5, 0x30 + vpblendd m4, m6, 0xc0 + paddw m4, m0 + pmaxsw m4, m7 + pminsw m4, m8 + vextracti128 xm5, m4, 1 + movq [dstq+strideq*0], xm4 + movq [dstq+strideq*1], xm5 + movhps [dstq+strideq*2], xm5 + movhps [dstq+r6 ], xm4 + lea dstq, [dstq+strideq*4] + ret + +INV_TXFM_4X16_FN identity, dct2 +INV_TXFM_4X16_FN identity, adst +INV_TXFM_4X16_FN identity, flipadst +INV_TXFM_4X16_FN identity, identity + +cglobal iidentity_4x16_internal_10, 0, 7, 11, dst, stride, c, eob, tx2 + vpbroadcastd m7, [vvc_pd_5793] + pmulld m0, m7, [cq+32*0] + pmulld m4, m7, [cq+32*1] + pmulld m1, m7, [cq+32*2] + pmulld m5, 
m7, [cq+32*3] + pmulld m2, m7, [cq+32*4] + pmulld m6, m7, [cq+32*5] + pmulld m3, m7, [cq+32*6] + pmulld m7, [cq+32*7] + vpbroadcastd m8, [vvc_pd_6144] + REPX {paddd x, m8}, m0, m4, m1, m5, m2, m6, m3, m7 + REPX {psrad x, 13}, m0, m4, m1, m5, m2, m6, m3, m7 + jmp tx2q +.pass2: + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + vpbroadcastd m7, [vvc_pw_1697x16] + vpbroadcastd m8, [vvc_pw_2048] + pmulhrsw m4, m7, m0 + pmulhrsw m5, m7, m1 + pmulhrsw m6, m7, m2 + pmulhrsw m7, m3 + REPX {paddsw x, x}, m0, m1, m2, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + vpbroadcastd m4, [pixel_10_max] + call .pass2_end + RET +ALIGN function_align +.pass2_end: + punpckhwd m7, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + lea r6, [strideq*5] + pxor m3, m3 + punpckhdq m5, m0, m2 ; 2 3 6 7 + punpckldq m0, m2 ; 0 1 4 5 + punpckldq m6, m7, m1 ; 8 9 c d + punpckhdq m7, m1 ; a b e f + pmulhrsw m0, m8 + call .write_2x4x2 + pmulhrsw m0, m5, m8 + call .write_2x4x2 + pmulhrsw m0, m6, m8 + lea dstq, [dstq+strideq*4] + call .write_2x4x2 + pmulhrsw m0, m7, m8 + call .write_2x4x2 + ret +ALIGN function_align +.write_2x4x2: + movq xm1, [dstq+strideq*0] + movhps xm1, [dstq+strideq*1] + vpbroadcastq m2, [dstq+strideq*4] + vpblendd m1, m2, 0x30 + vpbroadcastq m2, [dstq+r6 ] + vpblendd m1, m2, 0xc0 + mova [cq+32*0], m3 + mova [cq+32*1], m3 + add cq, 32*2 + paddw m1, m0 + pmaxsw m1, m3 + pminsw m1, m4 + vextracti128 xm2, m1, 1 + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + movq [dstq+strideq*4], xm2 + movhps [dstq+r6 ], xm2 + lea dstq, [dstq+strideq*2] + ret + +INV_TXFM_4X16_FN dct2, dct2, 12 +INV_TXFM_4X16_FN dct2, identity, 12 +INV_TXFM_4X16_FN dct2, adst, 12 +INV_TXFM_4X16_FN dct2, flipadst, 12 + +cglobal idct2_4x16_internal_12, 0, 7, 14, dst, stride, c, eob, tx2 + jmp m(idct2_4x16_internal_10).pass1 +.pass2: + punpckldq m8, m0, m1 + punpckhdq m0, m1 + punpckldq m9, m2, m3 + punpckhdq m2, m3 + punpckldq m1, m4, m5 + 
punpckhdq m4, m5 + punpckldq m3, m6, m7 + punpckhdq m6, m7 + punpcklqdq m5, m0, m2 ; 2 6 + punpckhqdq m12, m0, m2 ; 3 7 + punpcklqdq m0, m8, m9 ; 0 4 + punpckhqdq m10, m8, m9 ; 1 5 + punpcklqdq m2, m1, m3 ; 8 12 + punpckhqdq m13, m1, m3 ; 9 13 + punpcklqdq m9, m4, m6 ; 10 14 + punpckhqdq m4, m6 ; 11 15 + vperm2i128 m1, m5, m9, 0x20 ; 2 10 + vperm2i128 m3, m9, m5, 0x31 ; 14 6 + vpermq m11, m4, q1302 ; 15 11 + ; interleave + REPX {vpermq x, x, q3120}, m0, m1, m2, m3, m10 + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] + REPX {pmaxsd x, m8}, m0, m1, m2, m3, m10, m11, m12, m13 + REPX {pminsd x, m9}, m0, m1, m2, m3, m10, m11, m12, m13 + call m(idct2_16x4_internal_10).pass1_main + vpermq m6, m12, q1302 ; 7 3 + vpermq m5, m13, q3120 ; 9 13 + call m(idct2_16x4_internal_10).pass1_main2 + call m(idct2_16x4_internal_10).pass1_main3 + REPX {psrad x, 3}, m0, m1, m2, m3, m4, m5, m6, m7 + packssdw m0, m1 + packssdw m1, m2, m3 + packssdw m2, m4, m5 + packssdw m3, m6, m7 + mova m4, [idct2_16_12_shuf] + REPX {vpermd x, m4, x}, m0, m1, m2, m3 + vpbroadcastd m9, [vvc_pw_16384] + vpbroadcastd m8, [pixel_12_max] + call m(idct2_4x16_internal_10).pass2_end + RET + +INV_TXFM_4X16_FN adst, dct2, 12 +INV_TXFM_4X16_FN adst, adst, 12 +INV_TXFM_4X16_FN adst, flipadst, 12 +INV_TXFM_4X16_FN adst, identity, 12 + +cglobal iadst_4x16_internal_12, 0, 7, 14, dst, stride, c, eob, tx2 + call .main_pass1 + psrad m0, m4, 12 + psrad m1, m5, 12 + psrad m2, 12 + psrad m3, 12 + psrad m4, m8, 12 + psrad m5, m9, 12 + psrad m6, 12 + psrad m7, 12 + jmp tx2q +.pass2: + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call .transpose_16x4 + call m(iadst_4x16_internal_10).main2 + pshufd m4, m5, q1032 + psrad m5, m6, 3 + pshufd m6, m7, q1032 + psrad m7, m8, 3 + REPX {pshufd x, x, q1032}, m0, m2 + REPX {psrad x, 3}, m0, m1, m2, m3, m4, m6 +.pass2_end: + packssdw m0, m1 
+ packssdw m1, m2, m3 + packssdw m2, m4, m5 + packssdw m3, m6, m7 + mova m4, [iadst16_12_shuf] + REPX {vpermd x, m4, x}, m0, m1, m2, m3 + vpbroadcastd m9, [vvc_pw_16384] + vpbroadcastd m8, [pixel_12_max] + lea r6, [strideq*3] + pxor m7, m7 + pmulhrsw m0, m9 + call m(iadst_4x16_internal_10).write_4x4 + pmulhrsw m0, m9, m1 + call m(iadst_4x16_internal_10).write_4x4 + pmulhrsw m0, m9, m2 + call m(iadst_4x16_internal_10).write_4x4 + pmulhrsw m0, m9, m3 + call m(iadst_4x16_internal_10).write_4x4 + RET +ALIGN function_align +.transpose_16x4: + ; transpose & interleave + punpckldq m8, m0, m1 + punpckhdq m0, m1 + punpckldq m9, m2, m3 + punpckhdq m2, m3 + punpckldq m1, m4, m5 + punpckhdq m4, m5 + punpckldq m3, m6, m7 + punpckhdq m6, m7 + punpcklqdq m10, m8, m0 + punpckhqdq m0, m8 + punpcklqdq m11, m9, m2 + punpckhqdq m2, m9 + punpcklqdq m8, m1, m4 + punpckhqdq m4, m1 + punpcklqdq m9, m3, m6 + punpckhqdq m6, m3 + vperm2i128 m5, m0, m2, 0x31 ; 7 5 + vperm2i128 m7, m0, m2, 0x20 ; 3 1 + vperm2i128 m0, m10, m11, 0x20 ; 0 2 + vperm2i128 m2, m10, m11, 0x31 ; 4 6 + vperm2i128 m1, m4, m6, 0x31 ; 15 13 + vperm2i128 m3, m4, m6, 0x20 ; 11 9 + vperm2i128 m4, m8, m9, 0x20 ; 8 10 + vperm2i128 m6, m8, m9, 0x31 ; 12 14 + ret +ALIGN function_align +.main_pass1: + call m(iadst_16x4_internal_10).main + vpbroadcastd m6, [vvc_pd_3072] + paddd m10, m4, m5 + psubd m4, m3 + psubd m5, m3 + paddd m3, m10 + psubd m8, m7, m1 + paddd m7, m9 + psubd m9, m1 + paddd m7, m1 + REPX {psrad x, 1 }, m4, m5, m2, m3, m8, m9, m0, m7 + REPX {paddd x, m6}, m4, m5, m2, m3, m8, m9, m7 + paddd m6, m0 + ret + +INV_TXFM_4X16_FN flipadst, dct2, 12 +INV_TXFM_4X16_FN flipadst, adst, 12 +INV_TXFM_4X16_FN flipadst, flipadst, 12 +INV_TXFM_4X16_FN flipadst, identity, 12 + +cglobal iflipadst_4x16_internal_12, 0, 7, 14, dst, stride, c, eob, tx2 + call m(iadst_4x16_internal_12).main_pass1 + psrad m0, m3, 12 + psrad m1, m2, 12 + psrad m2, m5, 12 + psrad m3, m4, 12 + psrad m4, m7, 12 + psrad m5, m6, 12 + psrad m6, m9, 12 + psrad m7, 
m8, 12 + jmp tx2q +.pass2: + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(iadst_4x16_internal_12).transpose_16x4 + call m(iadst_4x16_internal_10).main2 + pshufd m4, m3, q1032 + psrad m3, m5, 3 + psrad m5, m2, 3 + pshufd m2, m6, q1032 + pshufd m6, m1, q1032 + psrad m1, m7, 3 + psrad m7, m0, 3 + pshufd m0, m8, q1032 + REPX {psrad x, 3}, m0, m2, m4, m6 + jmp m(iadst_4x16_internal_12).pass2_end + +INV_TXFM_4X16_FN identity, dct2, 12 +INV_TXFM_4X16_FN identity, adst, 12 +INV_TXFM_4X16_FN identity, flipadst, 12 +INV_TXFM_4X16_FN identity, identity, 12 + +cglobal iidentity_4x16_internal_12, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m8, [vvc_pd_1697] + mova m0, [cq+32*0] + mova m4, [cq+32*1] + mova m1, [cq+32*2] + mova m5, [cq+32*3] + vpbroadcastd m9, [vvc_pd_6144] + pmulld m2, m8, m0 + pmulld m6, m8, m4 + pmulld m3, m8, m1 + pmulld m7, m8, m5 + mova m10, [cq+32*4] + mova m11, [cq+32*5] + mova m12, [cq+32*6] + mova m13, [cq+32*7] + REPX {paddd x, m9}, m2, m6, m3, m7 + REPX {psrad x, 12}, m2, m6, m3, m7 + paddd m0, m2 + pmulld m2, m8, m10 + paddd m4, m6 + pmulld m6, m8, m11 + paddd m1, m3 + pmulld m3, m8, m12 + paddd m5, m7 + pmulld m7, m8, m13 + REPX {psrad x, 1 }, m0, m4, m1, m5 + REPX {paddd x, m9}, m2, m6, m3, m7 + REPX {psrad x, 12}, m2, m6, m3, m7 + paddd m2, m10 + paddd m6, m11 + paddd m3, m12 + paddd m7, m13 + REPX {psrad x, 1 }, m2, m6, m3, m7 + jmp tx2q +.pass2: + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + vpbroadcastd m8, [vvc_pd_5793] + vpbroadcastd m9, [vvc_pd_1024] + REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7 + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + 
packssdw m3, m7 + vpbroadcastd m8, [vvc_pw_16384] + vpbroadcastd m4, [pixel_12_max] + call m(iidentity_4x16_internal_10).pass2_end + RET + +%macro INV_TXFM_8X4_FN 2-3 10 ; type1, type2, bitdepth + INV_TXFM_FN %1, %2, 0, 8x4, %3 +%ifidn %1_%2, dct2_dct2 + vpbroadcastd m2, [dconly_%3] +%if %3 = 10 +.dconly: + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 4 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 128 + sar r6d, 8 + jmp m(vvc_inv_dct2_dct2_8x8_10).dconly3 +%else + jmp m(vvc_inv_dct2_dct2_8x4_10).dconly +%endif +%endif +%endmacro + +INV_TXFM_8X4_FN dct2, dct2 +INV_TXFM_8X4_FN dct2, identity +INV_TXFM_8X4_FN dct2, adst +INV_TXFM_8X4_FN dct2, flipadst + +cglobal idct2_8x4_internal_10, 0, 7, 10, dst, stride, c, eob, tx2 + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] +.pass1: + vbroadcasti128 m1, [cq+16*1] + vbroadcasti128 m0, [cq+16*5] + vbroadcasti128 m2, [cq+16*3] + vbroadcasti128 m3, [cq+16*7] + vpbroadcastd m6, [vvc_pd_64] + shufpd m1, m0, 0x0c ; 1 5 + shufpd m3, m2, 0x0c ; 7 3 + vbroadcasti128 m0, [cq+16*0] + vbroadcasti128 m4, [cq+16*2] + vbroadcasti128 m2, [cq+16*4] + vbroadcasti128 m5, [cq+16*6] + vpbroadcastd m7, [vvc_pd_2048] + shufpd m0, m4, 0x0c ; 0 2 + shufpd m2, m5, 0x0c ; 4 6 + REPX {pmulld x, m6}, m1, m3, m0, m2 + REPX {paddd x, m7}, m1, m3, m0, m2 + REPX {psrad x, 12}, m1, m3, m0, m2 + call .main + psubd m3, m0, m4 ; out7 out6 (interleaved) + paddd m0, m4 ; out0 out1 (interleaved) + paddd m1, m2, m5 ; out3 out2 (interleaved) + psubd m2, m5 ; out4 out5 (interleaved) + pshufd m1, m1, q1032 + pshufd m3, m3, q1032 + jmp tx2q +.pass2: + vbroadcasti128 m4, [deint_shuf] + packssdw m0, m1 + packssdw m2, m3 + vperm2i128 m1, m0, m2, 0x31 + vinserti128 m0, xm2, 1 + pshufb m0, m4 + pshufb m1, m4 + IDCT2_4_1D_PACKED_WORD 0, 1, 2, 3, 4, 7 + vpermq m0, m0, q3120 ; out0 out1 + vpermq m2, m1, q2031 ; out2 out3 + jmp m(iadst_8x4_internal_10).end +ALIGN function_align +.main: + ITX_MULSUB_2D 1, 3, 4, 5, 6, 7, 18_75, 89_50, 1 + 
IDCT2_4_1D_PACKED 0, 2, 4, 5, 6, 7 + vpbroadcastd m6, [vvc_pd_64] + punpcklqdq m4, m1, m3 ; t4a t7a + punpckhqdq m1, m3 ; t5a t6a + psubd m3, m4, m1 ; t5a t6a + paddd m4, m1 ; t4 t7 + REPX {pmaxsd x, m8}, m3, m4, m0, m2 + REPX {pminsd x, m9}, m3, m4, m0, m2 + pmulld m3, m6 + pshufd m1, m3, q1032 + paddd m3, m7 + psubd m5, m3, m1 + paddd m1, m3 + psrad m5, 12 + psrad m1, 12 + vpblendd m5, m4, 0x33 ; t4 t5 + punpckhqdq m4, m1 ; t7 t6 + ret + +INV_TXFM_8X4_FN adst, dct2 +INV_TXFM_8X4_FN adst, adst +INV_TXFM_8X4_FN adst, flipadst +INV_TXFM_8X4_FN adst, identity + +cglobal iadst_8x4_internal_10, 0, 7, 10, dst, stride, c, eob, tx2 + call m(iadst_4x8_internal_10).main + vpblendd m3, m0, m4, 0x33 ; out6 out7 + vpblendd m0, m4, 0xcc ; out0 out1 + pshufd m1, m5, q1032 + psignd m2, m6 ; out4 out5 + psignd m1, m6 ; out2 out3 + jmp tx2q +.pass2: + call .pass2_main + vpermq m0, m0, q3120 ; out0 out1 + vpermq m2, m1, q3120 ; out2 out3 +.end: + vpbroadcastd m1, [vvc_pw_2048] + pmulhrsw m0, m1 + pmulhrsw m1, m2 + vpbroadcastd m5, [pixel_10_max] +.end2: + mova xm2, [dstq+strideq*0] + vinserti128 m2, [dstq+strideq*1], 1 + lea r6, [dstq+strideq*2] + mova xm3, [r6 +strideq*0] + vinserti128 m3, [r6 +strideq*1], 1 + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m4 + pmaxsw m1, m4 + pminsw m0, m5 + pminsw m1, m5 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + mova [r6 +strideq*0], xm1 + vextracti128 [r6 +strideq*1], m1, 1 + RET +ALIGN function_align +.pass2_main: + vbroadcasti128 m4, [deint_shuf] + packssdw m0, m1 + packssdw m2, m3 + lea r6, [deint_shuf+128] + vperm2i128 m1, m0, m2, 0x31 + vinserti128 m0, xm2, 1 + pshufb m0, m4 + pshufb m1, m4 + jmp m(iadst_8x4_internal_8).main +ALIGN function_align +.main: + vpbroadcastd m1, [vvc_pd_64] + pmulld m0, m1, [cq+32*0] + pmulld m3, m1, [cq+32*3] + pmulld m2, m1, [cq+32*2] + pmulld m1, [cq+32*1] + vpbroadcastd m4, [vvc_pd_2048] + REPX {paddd x, m4}, m0, m3, m2, m1 + REPX 
{psrad x, 12}, m0, m3, m2, m1 +.main2: + IADST4_1D + ret + +INV_TXFM_8X4_FN flipadst, dct2 +INV_TXFM_8X4_FN flipadst, adst +INV_TXFM_8X4_FN flipadst, flipadst +INV_TXFM_8X4_FN flipadst, identity + +cglobal iflipadst_8x4_internal_10, 0, 5, 10, dst, stride, c, eob, tx2 + call m(iadst_4x8_internal_10).main + shufpd m3, m4, m0, 0x05 + shufpd m0, m4, 0x05 + psignd m2, m6 + pshufd m6, m6, q1032 + pshufd m1, m2, q1032 + psignd m2, m5, m6 + jmp tx2q +.pass2: + call m(iadst_8x4_internal_10).pass2_main + vpermq m2, m0, q2031 + vpermq m0, m1, q2031 + jmp m(iadst_8x4_internal_10).end + +INV_TXFM_8X4_FN identity, dct2 +INV_TXFM_8X4_FN identity, adst +INV_TXFM_8X4_FN identity, flipadst +INV_TXFM_8X4_FN identity, identity + +cglobal iidentity_8x4_internal_10, 0, 7, 10, dst, stride, c, eob, tx2 +.pass1: + vpbroadcastd m4, [vvc_pd_64] + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpermq m2, [cq+32*2], q3120 + vpermq m3, [cq+32*3], q3120 + vpbroadcastd m7, [vvc_pd_2048] + REPX {pmulld x, m4}, m0, m1, m2, m3 + REPX {paddd x, m7}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + REPX {paddd x, x }, m0, m1, m2, m3 + jmp tx2q +.pass2: + vpbroadcastd m5, [pixel_10_max] + vpbroadcastd m4, [vvc_pw_1697x8] + packssdw m0, m1 + packssdw m2, m3 + pmulhrsw m1, m4, m0 + pmulhrsw m4, m2 + paddsw m0, m1 + paddsw m2, m4 + packssdw m7, m7 ; vvc_pw_2048 +.pass2_end: + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + lea r6, [dstq+strideq*2] + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + pmulhrsw m2, m7 + pmulhrsw m0, m7 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + mova xm2, [dstq+strideq*0] + vinserti128 m2, [r6 +strideq*0], 1 + mova xm3, [dstq+strideq*1] + vinserti128 m3, [r6 +strideq*1], 1 + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m4 + pmaxsw m1, m4 + pminsw m0, m5 + pminsw m1, m5 + mova [dstq+strideq*0], xm0 + mova [dstq+strideq*1], xm1 + vextracti128 [r6 +strideq*0], m0, 1 + vextracti128 [r6 +strideq*1], m1, 1 + RET + 
+INV_TXFM_8X4_FN dct2, dct2, 12 +INV_TXFM_8X4_FN dct2, identity, 12 +INV_TXFM_8X4_FN dct2, adst, 12 +INV_TXFM_8X4_FN dct2, flipadst, 12 + +cglobal idct2_8x4_internal_12, 0, 7, 10, dst, stride, c, eob, tx2 + vpbroadcastd m8, [clip_20b_min] + vpbroadcastd m9, [clip_20b_max] + jmp m(idct2_8x4_internal_10).pass1 +.pass2: + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] + REPX {pmaxsd x, m8}, m0, m1, m2, m3 + REPX {pminsd x, m9}, m0, m1, m2, m3 + call m(iadst_8x4_internal_12).transpose_4x8 + IDCT2_4_1D 0, 1, 2, 3, 4, 5, 6, 7 + jmp m(iadst_8x4_internal_12).end + +INV_TXFM_8X4_FN adst, dct2, 12 +INV_TXFM_8X4_FN adst, adst, 12 +INV_TXFM_8X4_FN adst, flipadst, 12 +INV_TXFM_8X4_FN adst, identity, 12 + +cglobal iadst_8x4_internal_12, 0, 7, 10, dst, stride, c, eob, tx2 + vpbroadcastd m8, [clip_20b_min] + vpbroadcastd m9, [clip_20b_max] + call m(iadst_4x8_internal_10).main2 + vpblendd m3, m0, m4, 0x33 ; out6 out7 + vpblendd m0, m4, 0xcc ; out0 out1 + pshufd m1, m5, q1032 + psignd m2, m6 ; out4 out5 + psignd m1, m6 ; out2 out3 + jmp tx2q +.pass2: + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] + REPX {pmaxsd x, m8}, m0, m1, m2, m3 + REPX {pminsd x, m9}, m0, m1, m2, m3 + call .pass2_main + vpbroadcastd m5, [vvc_pd_2048] + paddd m0, m5, m4 + paddd m1, m5, m6 + paddd m2, m5 + paddd m3, m5 +.pass2_end: + REPX {psrad x, 12}, m0, m1, m2, m3 +.end: + vpbroadcastd m4, [vvc_pw_16384] + REPX {psrad x, 3}, m0, m1, m2, m3 + packssdw m0, m1 + packssdw m2, m3 + pmulhrsw m0, m4 + pmulhrsw m1, m2, m4 + vpermq m0, m0, q3120 ; out0 out1 + vpermq m1, m1, q3120 ; out2 out3 + vpbroadcastd m5, [pixel_12_max] + jmp m(iadst_8x4_internal_10).end2 +ALIGN function_align +.pass2_main: + call .transpose_4x8 + jmp m(iadst_8x4_internal_10).main2 +ALIGN function_align +.transpose_4x8: + ; deinterleave + pshufd m0, m0, q3120 + pshufd m1, m1, q3120 + pshufd m2, m2, q3120 + pshufd m3, m3, q3120 + ; transpose + punpcklqdq m4, m0, m1 + punpckhqdq m0, m1 + punpcklqdq m5, m2, m3 
+ punpckhqdq m2, m3 + vperm2i128 m1, m0, m2, 0x20 ; out1 + vperm2i128 m3, m0, m2, 0x31 ; out3 + vperm2i128 m2, m4, m5, 0x31 ; out2 + vperm2i128 m0, m4, m5, 0x20 ; out0 + ret + +INV_TXFM_8X4_FN flipadst, dct2, 12 +INV_TXFM_8X4_FN flipadst, adst, 12 +INV_TXFM_8X4_FN flipadst, flipadst, 12 +INV_TXFM_8X4_FN flipadst, identity, 12 + +cglobal iflipadst_8x4_internal_12, 0, 5, 10, dst, stride, c, eob, tx2 + vpbroadcastd m8, [clip_20b_min] + vpbroadcastd m9, [clip_20b_max] + call m(iadst_4x8_internal_10).main2 + shufpd m3, m4, m0, 0x05 + shufpd m0, m4, 0x05 + psignd m2, m6 + pshufd m6, m6, q1032 + pshufd m1, m2, q1032 + psignd m2, m5, m6 + jmp tx2q +.pass2: + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] + REPX {pmaxsd x, m8}, m0, m1, m2, m3 + REPX {pminsd x, m9}, m0, m1, m2, m3 + call m(iadst_8x4_internal_12).pass2_main + vpbroadcastd m5, [vvc_pd_2048] + paddd m0, m5, m3 + paddd m1, m5, m2 + paddd m3, m5, m4 + paddd m2, m5, m6 + jmp m(iadst_8x4_internal_12).pass2_end + +INV_TXFM_8X4_FN identity, dct2, 12 +INV_TXFM_8X4_FN identity, adst, 12 +INV_TXFM_8X4_FN identity, flipadst, 12 +INV_TXFM_8X4_FN identity, identity, 12 + +cglobal iidentity_8x4_internal_12, 0, 7, 10, dst, stride, c, eob, tx2 + jmp m(iidentity_8x4_internal_10).pass1 +.pass2: + ; m0 = in0 in1 (interleaved) + ; m1 = in2 in3 (interleaved) + ; m2 = in4 in5 (interleaved) + ; m3 = in6 in7 (interleaved) + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] + REPX {pmaxsd x, m8}, m0, m1, m2, m3 + REPX {pminsd x, m9}, m0, m1, m2, m3 + vpbroadcastd m4, [vvc_pd_5793] + REPX {pmulld x, m4}, m0, m1, m2, m3 + REPX {paddd x, m7}, m0, m1, m2, m3 + REPX {psrad x, 15}, m0, m1, m2, m3 + vpbroadcastd m5, [pixel_12_max] + vpbroadcastd m7, [vvc_pw_16384] + packssdw m0, m1 + packssdw m2, m3 + jmp m(iidentity_8x4_internal_10).pass2_end + +%macro INV_TXFM_8X8_FN 2-3 10 ; type1, type2, bitdepth + INV_TXFM_FN %1, %2, 0, 8x8, %3 +%ifidn %1_%2, dct2_dct2 + vpbroadcastd m2, [dconly_%3] +%if %3 = 10 
+.dconly: + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 8 +.dconly2: + add r6d, 384 + sar r6d, 9 +.dconly3: + imul r6d, 181 + add r6d, 2176 + sar r6d, 12 + movd xm0, r6d + paddsw xm0, xm2 + vpbroadcastw m0, xm0 +.dconly_loop: + mova xm1, [dstq+strideq*0] + vinserti128 m1, [dstq+strideq*1], 1 + paddsw m1, m0 + psubusw m1, m2 + mova [dstq+strideq*0], xm1 + vextracti128 [dstq+strideq*1], m1, 1 + lea dstq, [dstq+strideq*2] + sub r3d, 2 + jg .dconly_loop + RET +%else + jmp m(vvc_inv_dct2_dct2_8x8_10).dconly +%endif +%endif +%endmacro + +%macro IADST8_1D 14 ; src[1-8], tmp[1-3], vvc_pd_2048, clip[1-2] + ITX_MULSUB_2D %8, %1, %9, %10, %11, %12, 9, 90 ; t1a, t0a + ITX_MULSUB_2D %2, %7, %9, %10, %11, %12, 87, 25 ; t7a, t6a + ITX_MULSUB_2D %6, %3, %9, %10, %11, %12, 43, 80 ; t3a, t2a + ITX_MULSUB_2D %4, %5, %9, %10, %11, %12, 70, 57 ; t5a, t4a + psubd m%9, m%3, m%7 ; t6 + paddd m%3, m%7 ; t2 + psubd m%7, m%1, m%5 ; t4 + paddd m%1, m%5 ; t0 + psubd m%5, m%6, m%2 ; t7 + paddd m%6, m%2 ; t3 + psubd m%2, m%8, m%4 ; t5 + paddd m%8, m%4 ; t1 + REPX {pmaxsd x, m%13}, m%7, m%2, m%9, m%5, m%3, m%1, m%6, m%8 + REPX {pminsd x, m%14}, m%7, m%2, m%9, m%5, m%3, m%1, m%6, m%8 + ITX_MULSUB_2D %7, %2, %4, %10, %11, %12, 36, 83 ; t5a, t4a + ITX_MULSUB_2D %5, %9, %4, %10, %11, %12, 83, %11, 0x08 ; t6a, t7a + psubd m%10, m%7, m%9 ; t7 + paddd m%7, m%9 ; out6 + vpbroadcastd m%9, [vvc_pd_1448] + psubd m%4, m%8, m%6 ; t3 + paddd m%8, m%6 ; -out7 + psubd m%6, m%1, m%3 ; t2 + paddd m%1, m%3 ; out0 + psubd m%3, m%2, m%5 ; t6 + paddd m%2, m%5 ; -out1 + REPX {pmaxsd x, m%13}, m%6, m%4, m%3, m%10 + REPX {pminsd x, m%14}, m%6, m%4, m%3, m%10 + REPX {pmulld x, m%9 }, m%6, m%4, m%3, m%10 + psubd m%5, m%6, m%4 ; (t2 - t3) * 1448 + paddd m%4, m%6 ; (t2 + t3) * 1448 + psubd m%6, m%3, m%10 ; (t6 - t7) * 1448 + paddd m%3, m%10 ; (t6 + t7) * 1448 +%endmacro + +INV_TXFM_8X8_FN dct2, dct2 +INV_TXFM_8X8_FN dct2, identity +INV_TXFM_8X8_FN dct2, adst +INV_TXFM_8X8_FN dct2, flipadst + +cglobal 
idct2_8x8_internal_10, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] +.pass1: + mova m0, [cq+32*0] + mova m1, [cq+32*1] + mova m2, [cq+32*2] + mova m3, [cq+32*3] + mova m4, [cq+32*4] + mova m5, [cq+32*5] + mova m6, [cq+32*6] + mova m7, [cq+32*7] + vpbroadcastd m11, [vvc_pd_2048] + call .main + call .round_shift1 + jmp tx2q +.pass2: + call .transpose_8x8_packed + call m(idct2_8x8_internal_8).main + vpbroadcastd m12, [vvc_pw_2048] + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + vpermq m2, m2, q3120 + vpermq m3, m3, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call .write_8x4_start + pmulhrsw m0, m2, m12 + pmulhrsw m1, m3, m12 + call .write_8x4 + RET +ALIGN function_align +.write_8x4_start: + vpbroadcastd m11, [pixel_10_max] + lea r6, [strideq*3] + pxor m10, m10 +.write_8x4: + mova xm8, [dstq+strideq*0] + vinserti128 m8, [dstq+strideq*1], 1 + mova xm9, [dstq+strideq*2] + vinserti128 m9, [dstq+r6 ], 1 + mova [cq+32*0], m10 + mova [cq+32*1], m10 + mova [cq+32*2], m10 + mova [cq+32*3], m10 + add cq, 32*4 + paddw m0, m8 + paddw m1, m9 + pmaxsw m0, m10 + pmaxsw m1, m10 + pminsw m0, m11 + pminsw m1, m11 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], xm1 + vextracti128 [dstq+r6 ], m1, 1 + lea dstq, [dstq+strideq*4] + ret +ALIGN function_align +.transpose_8x8_packed: + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + lea r6, [deint_shuf+128] + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckhdq m2, m4, m1 + punpckldq m4, m1 + vinserti128 m1, m3, xm2, 1 + vperm2i128 m3, m2, 0x31 + vperm2i128 m2, m0, m4, 0x31 + vinserti128 m0, xm4, 1 + ret +ALIGN function_align +.main_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 +.main: + ITX_MULSUB_2D 5, 3, 8, 9, 10, 11, 75, 50 ; t5a t6a + ITX_MULSUB_2D 1, 7, 8, 9, 
10, 11, 18, 89 ; t4a t7a + ITX_MULSUB_2D 2, 6, 8, 9, 10, 11, 36, 83 ; t2 t3 + paddd m8, m1, m5 ; t4 + psubd m1, m5 ; t5a + paddd m9, m7, m3 ; t7 + psubd m7, m3 ; t6a + vpbroadcastd m3, [vvc_pd_64] + REPX {pmaxsd x, m12}, m1, m8, m7, m9 + REPX {pminsd x, m13}, m1, m8, m7, m9 + REPX {pmulld x, m3 }, m0, m4, m7, m1 + paddd m0, m11 + paddd m7, m11 + psubd m5, m0, m4 + paddd m0, m4 + psubd m4, m7, m1 + paddd m7, m1 + REPX {psrad x, 12 }, m5, m0, m4, m7 + psubd m3, m0, m6 ; dct4 out3 + paddd m0, m6 ; dct4 out0 + paddd m6, m5, m2 ; dct4 out1 + psubd m5, m2 ; dct4 out2 + REPX {pmaxsd x, m12}, m0, m6, m5, m3 + REPX {pminsd x, m13}, m0, m6, m5, m3 + ret +ALIGN function_align +.round_shift1: + pcmpeqd m1, m1 + REPX {psubd x, m1}, m0, m6, m5, m3 + paddd m1, m6, m7 ; out1 + psubd m6, m7 ; out6 + psubd m7, m0, m9 ; out7 + paddd m0, m9 ; out0 + paddd m2, m5, m4 ; out2 + psubd m5, m4 ; out5 + psubd m4, m3, m8 ; out4 + paddd m3, m8 ; out3 + REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7 + ret + +INV_TXFM_8X8_FN adst, dct2 +INV_TXFM_8X8_FN adst, adst +INV_TXFM_8X8_FN adst, flipadst +INV_TXFM_8X8_FN adst, identity + +cglobal iadst_8x8_internal_10, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] +.pass1: + call .main + call .main_end + jmp tx2q +.pass2: + call m(idct2_8x8_internal_10).transpose_8x8_packed + pshufd m4, m0, q1032 + pshufd m5, m1, q1032 + call m(iadst_8x8_internal_8).main_pass2 + vpbroadcastd m5, [vvc_pw_2048] + vpbroadcastd xm12, [vvc_pw_4096] + psubw m12, m5 + REPX {vpermq x, x, q3120}, m0, m1, m2, m3 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct2_8x8_internal_10).write_8x4_start + pmulhrsw m0, m2, m12 + pmulhrsw m1, m3, m12 + call m(idct2_8x8_internal_10).write_8x4 + RET +ALIGN function_align +.main: + mova m0, [cq+32*0] + mova m7, [cq+32*7] + mova m1, [cq+32*1] + mova m6, [cq+32*6] + mova m2, [cq+32*2] + mova m5, [cq+32*5] + mova m3, [cq+32*3] + mova m4, [cq+32*4] + vpbroadcastd m11, [vvc_pd_2048] 
+.main2: + IADST8_1D 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 + psrld m8, 10 ; vvc_pd_1 + vpbroadcastd m9, [vvc_pd_3072] + ret +ALIGN function_align +.main_end: + paddd m0, m8 + psubd m1, m8, m1 + paddd m6, m8 + psubd m7, m8, m7 + REPX {psrad x, 1 }, m0, m1, m6, m7 + ; (1 + ((x + 1024) >> 11)) >> 1 = (3072 + x) >> 12 + ; (1 - ((x + 1024) >> 11)) >> 1 = (3071 - x) >> 12 + psubd m8, m9, m8 ; vvc_pd_3071 + paddd m2, m9 + psubd m3, m8, m3 + paddd m4, m9 + psubd m5, m8, m5 + REPX {psrad x, 12}, m2, m3, m4, m5 + ret + +INV_TXFM_8X8_FN flipadst, dct2 +INV_TXFM_8X8_FN flipadst, adst +INV_TXFM_8X8_FN flipadst, flipadst +INV_TXFM_8X8_FN flipadst, identity + +cglobal iflipadst_8x8_internal_10, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] +.pass1: + call m(iadst_8x8_internal_10).main + call .main_end + jmp tx2q +.pass2: + call m(idct2_8x8_internal_10).transpose_8x8_packed + pshufd m4, m0, q1032 + pshufd m5, m1, q1032 + call m(iadst_8x8_internal_8).main_pass2 + vpbroadcastd m12, [vvc_pw_2048] + vpbroadcastd xm5, [vvc_pw_4096] + psubw m12, m5 + vpermq m8, m3, q2031 + vpermq m9, m2, q2031 + vpermq m2, m1, q2031 + vpermq m3, m0, q2031 + pmulhrsw m0, m8, m12 + pmulhrsw m1, m9, m12 + call m(idct2_8x8_internal_10).write_8x4_start + pmulhrsw m0, m2, m12 + pmulhrsw m1, m3, m12 + call m(idct2_8x8_internal_10).write_8x4 + RET +ALIGN function_align +.main_end: + paddd m10, m8, m0 + psubd m0, m8, m7 + psubd m7, m8, m1 + paddd m1, m8, m6 + psrad m0, 1 + psrad m1, 1 + psrad m6, m7, 1 + psrad m7, m10, 1 + psubd m8, m9, m8 ; vvc_pd_3071 (vvc_pd_3072 - vvc_pd_1) + psubd m10, m8, m5 + paddd m5, m9, m2 + psubd m2, m8, m3 + paddd m3, m9, m4 + psrad m4, m2, 12 + psrad m2, m10, 12 + psrad m3, 12 + psrad m5, 12 + ret + +INV_TXFM_8X8_FN identity, dct2 +INV_TXFM_8X8_FN identity, adst +INV_TXFM_8X8_FN identity, flipadst +INV_TXFM_8X8_FN identity, identity + +cglobal iidentity_8x8_internal_10, 0, 7, 14, dst, stride, c, eob, tx2 +.pass1: + mova m0, [cq+32*0] + mova
m1, [cq+32*1] + mova m2, [cq+32*2] + mova m3, [cq+32*3] + mova m4, [cq+32*4] + mova m5, [cq+32*5] + mova m6, [cq+32*6] + mova m7, [cq+32*7] + jmp tx2q +.pass2: + packssdw m3, m7 + vpbroadcastd m7, [pixel_10_max] +.pass2_main: + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + vpbroadcastd m12, [vvc_pw_4096] + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckldq m2, m4, m1 + punpckhdq m4, m1 + punpckhqdq m1, m0, m2 ; 1 5 + punpcklqdq m0, m2 ; 0 4 + punpcklqdq m2, m3, m4 ; 2 6 + punpckhqdq m3, m4 ; 3 7 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call .write_2x8x2_start + pmulhrsw m0, m2, m12 + pmulhrsw m1, m3, m12 + call .write_2x8x2_zero + RET +.write_2x8x2_start: + lea r6, [strideq*5] + pxor m6, m6 +.write_2x8x2_zero: + mova [cq+32*0], m6 + mova [cq+32*1], m6 + mova [cq+32*2], m6 + mova [cq+32*3], m6 + add cq, 32*4 +.write_2x8x2: + mova xm4, [dstq+strideq*0] + vinserti128 m4, [dstq+strideq*4], 1 + mova xm5, [dstq+strideq*1] + vinserti128 m5, [dstq+r6 ], 1 + paddw m0, m4 + paddw m1, m5 + pmaxsw m0, m6 + pmaxsw m1, m6 + pminsw m0, m7 + pminsw m1, m7 + mova [dstq+strideq*0], xm0 + mova [dstq+strideq*1], xm1 + vextracti128 [dstq+strideq*4], m0, 1 + vextracti128 [dstq+r6 ], m1, 1 + lea dstq, [dstq+strideq*2] + ret + +%macro TRANSPOSE_8X8_DWORD 12 ; src/dst[1-8], tmp[1-4] + punpckldq m%9, m%1, m%2 ; aibj emfn + punpckhdq m%1, m%2 ; ckdl gohp + punpckldq m%10, m%3, m%4 ; qyrz uCvD + punpckhdq m%3, m%4 ; sAtB wExF + punpckldq m%11, m%5, m%6 ; GOHP KSLT + punpckhdq m%5, m%6 ; IQJR MUNV + punpckldq m%12, m%7, m%8 ; WeXf aibj + punpckhdq m%7, m%8 ; YgZh ckdl + punpcklqdq m%2, m%9, m%10 ; aiqy emuC + punpckhqdq m%9, m%10 ; bjrz fnvD + punpcklqdq m%4, m%1, m%3 ; cksA gowE + punpckhqdq m%10, m%1, m%3 ; dltB hpxF + punpcklqdq m%6, m%11, m%12 ; GOWe KSai + punpckhqdq m%11, m%12 ; HPXf LTbj + punpcklqdq m%8, m%5, m%7 ; IQYg MUck + punpckhqdq m%12, m%5, m%7 ; JRZh NVdl + vperm2i128 m%1, m%2, m%6, 
0x20 ; out0 + vperm2i128 m%5, m%2, m%6, 0x31 ; out4 + vperm2i128 m%2, m%9, m%11, 0x20 ; out1 + vperm2i128 m%6, m%9, m%11, 0x31 ; out5 + vperm2i128 m%3, m%4, m%8, 0x20 ; out2 + vperm2i128 m%7, m%4, m%8, 0x31 ; out6 + vperm2i128 m%4, m%10, m%12, 0x20 ; out3 + vperm2i128 m%8, m%10, m%12, 0x31 ; out7 +%endmacro + +INV_TXFM_8X8_FN dct2, dct2, 12 +INV_TXFM_8X8_FN dct2, identity, 12 +INV_TXFM_8X8_FN dct2, adst, 12 +INV_TXFM_8X8_FN dct2, flipadst, 12 + +cglobal idct2_8x8_internal_12, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + jmp m(idct2_8x8_internal_10).pass1 +.pass2: + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call .transpose_8x8 + vpbroadcastd m11, [vvc_pd_2048] + call m(idct2_8x8_internal_10).main + call .round_shift4 + jmp m(iadst_8x8_internal_12).pass2_end +ALIGN function_align +.write_8x4_start: + vpbroadcastd m11, [pixel_12_max] + lea r6, [strideq*3] + pxor m10, m10 + ret +ALIGN function_align +.transpose_8x8: + TRANSPOSE_8X8_DWORD 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 + ret +ALIGN function_align +.round_shift4: + vpbroadcastd m1, [vvc_pd_8] + REPX {paddd x, m1}, m0, m6, m5, m3 + paddd m1, m6, m7 ; out1 + psubd m6, m7 ; out6 + psubd m7, m0, m9 ; out7 + paddd m0, m9 ; out0 + paddd m2, m5, m4 ; out2 + psubd m5, m4 ; out5 + psubd m4, m3, m8 ; out4 + paddd m3, m8 ; out3 + REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7 + ret + +INV_TXFM_8X8_FN adst, dct2, 12 +INV_TXFM_8X8_FN adst, adst, 12 +INV_TXFM_8X8_FN adst, flipadst, 12 +INV_TXFM_8X8_FN adst, identity, 12 + +cglobal iadst_8x8_internal_12, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + jmp m(iadst_8x8_internal_10).pass1 +.pass2: + call .pass2_main +.pass2_end: + packssdw m0, m1 + packssdw m1, m2, m3 + REPX {vpermq x, x, q3120}, m0, m1 + call 
m(idct2_8x8_internal_12).write_8x4_start + call m(idct2_8x8_internal_10).write_8x4 + packssdw m0, m4, m5 + packssdw m1, m6, m7 + REPX {vpermq x, x, q3120}, m0, m1 + call m(idct2_8x8_internal_10).write_8x4 + RET +ALIGN function_align +.pass2_main: + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct2_8x8_internal_12).transpose_8x8 + vpbroadcastd m11, [vvc_pd_2048] +.pass2_main2: + call m(iadst_8x8_internal_10).main2 + pslld m9, m8, 3 ; vvc_pd_8 + paddd m0, m9 + psubd m1, m9, m1 ; 8+x + paddd m6, m9 + psubd m7, m9, m7 + REPX {psrad x, 4}, m0, m1, m6, m7 + vpbroadcastd m9, [vvc_pd_17408] + psubd m8, m9, m8 ; 17407 + paddd m2, m9 + psubd m3, m8, m3 + paddd m4, m9 + psubd m5, m8, m5 + REPX {psrad x, 15}, m2, m3, m4, m5 + ret + +INV_TXFM_8X8_FN flipadst, dct2, 12 +INV_TXFM_8X8_FN flipadst, adst, 12 +INV_TXFM_8X8_FN flipadst, flipadst, 12 +INV_TXFM_8X8_FN flipadst, identity, 12 + +cglobal iflipadst_8x8_internal_12, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + jmp m(iflipadst_8x8_internal_10).pass1 +.pass2: + call m(iadst_8x8_internal_12).pass2_main + packssdw m7, m7, m6 + packssdw m6, m1, m0 + packssdw m1, m5, m4 + vpermq m0, m7, q3120 + vpermq m1, m1, q3120 + call m(idct2_8x8_internal_12).write_8x4_start + call m(idct2_8x8_internal_10).write_8x4 + packssdw m0, m3, m2 + vpermq m0, m0, q3120 + vpermq m1, m6, q3120 + call m(idct2_8x8_internal_10).write_8x4 + RET + +INV_TXFM_8X8_FN identity, dct2, 12 +INV_TXFM_8X8_FN identity, adst, 12 +INV_TXFM_8X8_FN identity, flipadst, 12 +INV_TXFM_8X8_FN identity, identity, 12 + +cglobal iidentity_8x8_internal_12, 0, 7, 14, dst, stride, c, eob, tx2 + jmp m(iidentity_8x8_internal_10).pass1 +.pass2: + packssdw m3, m7 + vpbroadcastd m7, [pixel_12_max] + jmp m(iidentity_8x8_internal_10).pass2_main + +%macro INV_TXFM_8X16_FN 2-4 0,10 ; type1, 
type2, eob_offset, bitdepth + INV_TXFM_FN %1, %2, %3, 8x16, %4 +%ifidn %1_%2, dct2_dct2 + imul r6d, [cq], 181 + vpbroadcastd m2, [dconly_%4] + mov [cq], eobd ; 0 + or r3d, 16 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + jmp m(vvc_inv_dct2_dct2_8x8_10).dconly2 +%endif +%endmacro + +INV_TXFM_8X16_FN dct2, dct2 +INV_TXFM_8X16_FN dct2, identity, 35 +INV_TXFM_8X16_FN dct2, adst +INV_TXFM_8X16_FN dct2, flipadst + +cglobal idct2_8x16_internal_10, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] +.pass1: + vpbroadcastd m14, [vvc_pd_64] + vpbroadcastd m11, [vvc_pd_2048] + cmp eobd, 43 + jl .fast + add cq, 32 + call .pass1_main + sub cq, 32 + mova [cq+32* 1], m0 + mova [cq+32* 3], m1 + mova [cq+32* 5], m2 + mova [cq+32* 7], m3 + mova [cq+32* 9], m4 + mova [cq+32*11], m5 + mova [cq+32*13], m6 + mova m15, m7 + call .pass1_main + mova m8, [cq+32* 1] + mova m9, [cq+32* 3] + mova m10, [cq+32* 5] + mova m11, [cq+32* 7] + mova m12, [cq+32* 9] + mova m13, [cq+32*11] + mova m14, [cq+32*13] + jmp tx2q +.fast: + call .pass1_main + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: + call .transpose + call m(idct2_8x16_internal_8).main + vpbroadcastd m12, [vvc_pw_2048] + REPX {vpermq x, x, q3120}, m0, m2, m4, m6 + REPX {vpermq x, x, q2031}, m1, m3, m5, m7 +.end: + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct2_8x8_internal_10).write_8x4_start + pmulhrsw m0, m2, m12 + pmulhrsw m1, m3, m12 + call m(idct2_8x8_internal_10).write_8x4 + pmulhrsw m0, m4, m12 + pmulhrsw m1, m5, m12 + call m(idct2_8x8_internal_10).write_8x4 + pmulhrsw m0, m6, m12 + pmulhrsw m1, m7, m12 + call m(idct2_8x8_internal_10).write_8x4 + RET +ALIGN function_align +.transpose: + packssdw m0, m8 + packssdw m1, m9 + packssdw m2, m10 + packssdw m3, m11 + packssdw m4, m12 + packssdw m5, m13 + packssdw m6, m14 + packssdw m7, m15 + lea r6, [deint_shuf+128] + punpckhwd m8, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + 
punpcklwd m2, m3 + punpcklwd m3, m4, m5 + punpckhwd m4, m5 + punpckhwd m5, m6, m7 + punpcklwd m6, m7 + punpckhdq m7, m3, m6 + punpckldq m3, m6 + punpckhdq m6, m4, m5 + punpckldq m4, m5 + punpckhdq m5, m8, m1 + punpckldq m8, m1 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + vperm2i128 m2, m0, m3, 0x31 + vinserti128 m0, xm3, 1 + vperm2i128 m3, m1, m7, 0x31 + vinserti128 m1, xm7, 1 + vperm2i128 m7, m5, m6, 0x31 + vinserti128 m5, xm6, 1 + vperm2i128 m6, m8, m4, 0x31 + vinserti128 m4, m8, xm4, 1 + ret +ALIGN function_align +.pass1_main: + pmulld m0, m14, [cq+32* 0] + pmulld m1, m14, [cq+32* 2] + pmulld m2, m14, [cq+32* 4] + pmulld m3, m14, [cq+32* 6] + pmulld m4, m14, [cq+32* 8] + pmulld m5, m14, [cq+32*10] + pmulld m6, m14, [cq+32*12] + pmulld m7, m14, [cq+32*14] + call m(idct2_8x8_internal_10).main_rect2 + jmp m(idct2_8x8_internal_10).round_shift1 +ALIGN function_align +.main_evenhalf: + paddd m1, m6, m7 ; idct2_8 out1 + psubd m6, m7 ; idct2_8 out6 + psubd m7, m0, m9 ; idct2_8 out7 + paddd m0, m9 ; idct2_8 out0 + paddd m2, m5, m4 ; idct2_8 out2 + psubd m5, m4 ; idct2_8 out5 + psubd m4, m3, m8 ; idct2_8 out4 + paddd m3, m8 ; idct2_8 out3 + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + ret +.main_oddhalf_fast_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 +.main_oddhalf_fast: ; lower half zero + vpbroadcastd m7, [vvc_pd_90] + vpbroadcastd m8, [vvc_pd_9] + vpbroadcastd m6, [vvc_pd_m25] + vpbroadcastd m9, [vvc_pd_87] + vpbroadcastd m5, [vvc_pd_80] + vpbroadcastd m10, [vvc_pd_43] + vpbroadcastd m4, [vvc_pd_m57] + vpbroadcastd m15, [vvc_pd_70] + pmulld m7, m0 + pmulld m0, m8 + pmulld m6, m1 + pmulld m1, m9 + pmulld m5, m2 + pmulld m2, m10 + pmulld m4, m3 + pmulld m3, m15 + jmp .main_oddhalf_fast2 +.main_oddhalf_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 +.main_oddhalf: + ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 9, 
90 ; t8a, t15a + ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 87, 25 ; t11a, t12a + ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 43, 80 ; t10a, t13a + ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 70, 57 ; t9a, t14a +.main_oddhalf_fast2: + REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3 + REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3 + psubd m8, m0, m4 ; t9 + paddd m0, m4 ; t8 + psubd m4, m6, m2 ; t10 + paddd m2, m6 ; t11 + psubd m6, m1, m5 ; t13 + paddd m5, m1 ; t12 + psubd m1, m7, m3 ; t14 + paddd m7, m3 ; t15 + REPX {pmaxsd x, m12}, m8, m1, m4, m6, m0, m2, m5, m7 + REPX {pminsd x, m13}, m8, m1, m4, m6, m0, m2, m5, m7 + vpbroadcastd m15, [vvc_pd_83] + vpbroadcastd m10, [vvc_pd_36] + ITX_MULSUB_2D 1, 8, 3, 9, _, 11, 10, 15, 0xc + ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 0xe + psubd m3, m1, m4 ; t10 + paddd m1, m4 ; t9 + psubd m4, m0, m2 ; t11a + paddd m0, m2 ; t8a + psubd m2, m8, m6 ; t13 + paddd m6, m8 ; t14 + psubd m8, m7, m5 ; t12a + paddd m7, m5 ; t15a + REPX {pmaxsd x, m12}, m2, m8, m3, m4, m0, m1, m6, m7 + REPX {pminsd x, m13}, m2, m8, m3, m4, m0, m1, m6, m7 + REPX {pmulld x, m14}, m2, m8, m3, m4 + paddd m2, m11 + paddd m8, m11 + paddd m5, m2, m3 ; t13a + psubd m2, m3 ; t10a + psubd m3, m8, m4 ; t11 + paddd m4, m8 ; t12 + REPX {psrad x, 12}, m5, m2, m3, m4 + mova [r6-32*4], m7 + mova [r6-32*3], m6 + mova [r6-32*2], m5 + mova [r6-32*1], m4 + mova [r6+32*0], m3 + mova [r6+32*1], m2 + mova [r6+32*2], m1 + mova [r6+32*3], m0 + ret + +INV_TXFM_8X16_FN adst, dct2 +INV_TXFM_8X16_FN adst, adst +INV_TXFM_8X16_FN adst, flipadst +INV_TXFM_8X16_FN adst, identity, 35 + +cglobal iadst_8x16_internal_10, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] +.pass1: + vpbroadcastd m14, [vvc_pd_64] + vpbroadcastd m11, [vvc_pd_2048] + cmp eobd, 43 + jl .fast + add cq, 32 + call .pass1_main + call m(iadst_8x8_internal_10).main_end + sub cq, 32 + mova [cq+32* 1], m0 + mova [cq+32* 3], m1 + mova [cq+32* 5], m2 + mova [cq+32* 7], m3 + mova 
[cq+32* 9], m4 + mova [cq+32*11], m5 + mova [cq+32*13], m6 + mova m15, m7 + call .pass1_main + call m(iadst_8x8_internal_10).main_end + mova m8, [cq+32* 1] + mova m9, [cq+32* 3] + mova m10, [cq+32* 5] + mova m11, [cq+32* 7] + mova m12, [cq+32* 9] + mova m13, [cq+32*11] + mova m14, [cq+32*13] + jmp tx2q +.fast: + call .pass1_main + call m(iadst_8x8_internal_10).main_end + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: + call m(idct2_8x16_internal_10).transpose + call m(iadst_8x16_internal_8).main + call m(iadst_8x16_internal_8).main_pass2_end + vpbroadcastd m8, [vvc_pw_2048] + vpbroadcastd xm12, [vvc_pw_4096] + REPX {vpermq x, x, q2031}, m0, m1, m2, m3 + REPX {vpermq x, x, q3120}, m4, m5, m6, m7 + psubw m12, m8 + jmp m(idct2_8x16_internal_10).end +ALIGN function_align +.pass1_main: + pmulld m0, m14, [cq+32* 0] + pmulld m7, m14, [cq+32*14] + pmulld m1, m14, [cq+32* 2] + pmulld m6, m14, [cq+32*12] + pmulld m2, m14, [cq+32* 4] + pmulld m5, m14, [cq+32*10] + pmulld m3, m14, [cq+32* 6] + pmulld m4, m14, [cq+32* 8] + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 + jmp m(iadst_8x8_internal_10).main2 + +INV_TXFM_8X16_FN flipadst, dct2 +INV_TXFM_8X16_FN flipadst, adst +INV_TXFM_8X16_FN flipadst, flipadst +INV_TXFM_8X16_FN flipadst, identity, 35 + +cglobal iflipadst_8x16_internal_10, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] +.pass1: + vpbroadcastd m14, [vvc_pd_64] + vpbroadcastd m11, [vvc_pd_2048] + cmp eobd, 43 + jl .fast + add cq, 32 + call m(iadst_8x16_internal_10).pass1_main + call m(iflipadst_8x8_internal_10).main_end + sub cq, 32 + mova [cq+32* 1], m0 + mova [cq+32* 3], m1 + mova [cq+32* 5], m2 + mova [cq+32* 7], m3 + mova [cq+32* 9], m4 + mova [cq+32*11], m5 + mova [cq+32*13], m6 + mova m15, m7 + call m(iadst_8x16_internal_10).pass1_main + call m(iflipadst_8x8_internal_10).main_end + mova m8, 
[cq+32* 1] + mova m9, [cq+32* 3] + mova m10, [cq+32* 5] + mova m11, [cq+32* 7] + mova m12, [cq+32* 9] + mova m13, [cq+32*11] + mova m14, [cq+32*13] + jmp tx2q +.fast: + call m(iadst_8x16_internal_10).pass1_main + call m(iflipadst_8x8_internal_10).main_end + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: + call m(idct2_8x16_internal_10).transpose + call m(iadst_8x16_internal_8).main + call m(iadst_8x16_internal_8).main_pass2_end + vpbroadcastd m12, [vvc_pw_2048] + vpbroadcastd xm13, [vvc_pw_4096] + mova m11, m0 + vpermq m0, m7, q2031 + mova m10, m1 + vpermq m1, m6, q2031 + mova m9, m2 + vpermq m2, m5, q2031 + mova m8, m3 + vpermq m3, m4, q2031 + vpermq m4, m8, q3120 + vpermq m5, m9, q3120 + vpermq m6, m10, q3120 + vpermq m7, m11, q3120 + psubw m12, m13 + jmp m(idct2_8x16_internal_10).end + +INV_TXFM_8X16_FN identity, dct2 +INV_TXFM_8X16_FN identity, adst +INV_TXFM_8X16_FN identity, flipadst +INV_TXFM_8X16_FN identity, identity + +%macro IDTX16 3-4 ; src/dst, tmp, vvc_pw_1697x16, [vvc_pw_16384] + pmulhrsw m%2, m%3, m%1 +%if %0 == 4 ; if downshifting by 1 +%ifnum %4 + pmulhrsw m%2, m%4 +%else ; without rounding + psraw m%2, 1 +%endif +%else + paddsw m%1, m%1 +%endif + paddsw m%1, m%2 +%endmacro + +cglobal iidentity_8x16_internal_10, 0, 7, 16, dst, stride, c, eob, tx2 +.pass1: + vpbroadcastd m15, [vvc_pd_64] + pmulld m0, m15, [cq+32* 0] + pmulld m8, m15, [cq+32* 1] + pmulld m1, m15, [cq+32* 2] + pmulld m9, m15, [cq+32* 3] + pmulld m2, m15, [cq+32* 4] + pmulld m10, m15, [cq+32* 5] + pmulld m3, m15, [cq+32* 6] + pmulld m11, m15, [cq+32* 7] + pmulld m4, m15, [cq+32* 8] + pmulld m12, m15, [cq+32* 9] + pmulld m5, m15, [cq+32*10] + pmulld m13, m15, [cq+32*11] + pmulld m6, m15, [cq+32*12] + pmulld m14, m15, [cq+32*13] + pmulld m7, m15, [cq+32*14] + pmulld m15, [cq+32*15] + mova [cq], m7 + vpbroadcastd m7, [vvc_pd_2048] + REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6, \ + m8, m9, m10, m11, m12, m13, m14, m15 + paddd m7, [cq] + REPX 
{psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: + packssdw m0, m8 + packssdw m1, m9 + packssdw m2, m10 + packssdw m3, m11 + packssdw m4, m12 + packssdw m5, m13 + packssdw m6, m14 + packssdw m13, m7, m15 + vpbroadcastd m8, [vvc_pw_1697x16] + REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 13 + vpbroadcastd m7, [pixel_10_max] + vpbroadcastd m12, [vvc_pw_2048] + call .pass2_end + RET +ALIGN function_align +.pass2_end: + punpckhwd m9, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m6, m13 + punpcklwd m6, m13 + punpckhwd m13, m4, m5 + punpcklwd m4, m5 + punpcklwd m5, m2, m3 + punpckhwd m2, m3 + punpckhdq m3, m0, m5 + punpckldq m0, m5 + punpckhdq m11, m9, m2 + punpckldq m9, m2 + punpckldq m2, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m13, m1 + punpckhdq m13, m1 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + punpcklqdq m8, m9, m6 + punpckhqdq m9, m6 + punpcklqdq m10, m11, m13 + punpckhqdq m11, m13 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(iidentity_8x8_internal_10).write_2x8x2_start + pmulhrsw m0, m12, m2 + pmulhrsw m1, m12, m3 + call m(iidentity_8x8_internal_10).write_2x8x2_zero + pmulhrsw m0, m12, m8 + pmulhrsw m1, m12, m9 + lea dstq, [dstq+strideq*4] + call m(iidentity_8x8_internal_10).write_2x8x2_zero + pmulhrsw m0, m12, m10 + pmulhrsw m1, m12, m11 + call m(iidentity_8x8_internal_10).write_2x8x2_zero + ret + +INV_TXFM_8X16_FN dct2, dct2, 0, 12 +INV_TXFM_8X16_FN dct2, identity, 35, 12 +INV_TXFM_8X16_FN dct2, adst, 0, 12 +INV_TXFM_8X16_FN dct2, flipadst, 0, 12 + +cglobal idct2_8x16_internal_12, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + jmp m(idct2_8x16_internal_10).pass1 +.pass2: + lea r6, [rsp+32*4] + call .transpose + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + mova [cq+32* 8], m0 + mova [cq+32*10], m2 + mova [cq+32*12], m4 + mova [cq+32*14], m6 + pmaxsd m0, m12, [cq+32* 1] + 
pmaxsd m4, m12, m1 + pmaxsd m1, m12, [cq+32* 3] + pmaxsd m2, m12, [cq+32* 5] + pmaxsd m6, m12, m5 + pmaxsd m5, m12, m3 + pmaxsd m3, m12, [cq+32* 7] + pmaxsd m7, m12 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + vpbroadcastd m11, [vvc_pd_2048] + vpbroadcastd m14, [vvc_pd_64] + call m(idct2_8x16_internal_10).main_oddhalf + pmaxsd m0, m12, [cq+32* 0] + pmaxsd m1, m12, [cq+32* 2] + pmaxsd m2, m12, [cq+32* 4] + pmaxsd m3, m12, [cq+32* 6] + pmaxsd m4, m12, [cq+32* 8] + pmaxsd m5, m12, [cq+32*10] + pmaxsd m6, m12, [cq+32*12] + pmaxsd m7, m12, [cq+32*14] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct2_8x8_internal_10).main + call m(idct2_8x16_internal_10).main_evenhalf + vpbroadcastd m11, [vvc_pd_8] + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct2_16x8_internal_10).pass1_rotations + REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 +.end: + packssdw m0, m1 + packssdw m1, m2, m3 + packssdw m2, m4, m5 + packssdw m3, m6, m7 + packssdw m4, m8, m9 + packssdw m5, m10, m11 + packssdw m6, m12, m13 + packssdw m7, m14, m15 + vpermq m0, m0, q3120 + vpermq m1, m1, q3120 + call m(idct2_8x8_internal_12).write_8x4_start + call m(idct2_8x8_internal_10).write_8x4 + vpermq m0, m2, q3120 + vpermq m1, m3, q3120 + call m(idct2_8x8_internal_10).write_8x4 + vpermq m0, m4, q3120 + vpermq m1, m5, q3120 + call m(idct2_8x8_internal_10).write_8x4 + vpermq m0, m6, q3120 + vpermq m1, m7, q3120 + call m(idct2_8x8_internal_10).write_8x4 + RET +ALIGN function_align +.transpose: + mova [cq+32* 8], m8 + mova [cq+32* 9], m9 + mova [cq+32*10], m10 + mova [cq+32*11], m11 + call m(idct2_8x8_internal_12).transpose_8x8 + mova [cq+32* 0], m0 + mova [cq+32* 1], m1 + mova [cq+32* 2], m2 + mova [cq+32* 3], m3 + mova [cq+32* 4], m4 + mova [cq+32* 5], m5 + mova [cq+32* 6], m6 + mova [cq+32* 7], m7 + mova m0, [cq+32* 8] + mova m1, [cq+32* 9] + mova m2, [cq+32*10] + mova m3, [cq+32*11] + mova m4, m12 + mova m5, m13 + mova m6, 
m14 + mova m7, m15 + jmp m(idct2_8x8_internal_12).transpose_8x8 + +INV_TXFM_8X16_FN adst, dct2, 0, 12 +INV_TXFM_8X16_FN adst, adst, 0, 12 +INV_TXFM_8X16_FN adst, flipadst, 0, 12 +INV_TXFM_8X16_FN adst, identity, 35, 12 + +cglobal iadst_8x16_internal_12, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + jmp m(iadst_8x16_internal_10).pass1 +.pass2: + lea r6, [rsp+32*4] + call .pass2_main + call m(iadst_16x8_internal_10).pass1_rotations +.pass2_end: + REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15 + REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11 + jmp m(idct2_8x16_internal_12).end +ALIGN function_align +.pass2_main: + call m(idct2_8x16_internal_12).transpose + vpbroadcastd m13, [clip_18b_min] + vpbroadcastd m14, [clip_18b_max] + mova [cq+32* 8], m0 + mova [cq+32*11], m3 + mova [cq+32*12], m4 + mova [cq+32*15], m7 + pmaxsd m0, m13, [cq+32* 2] ; 2 + pmaxsd m3, m13, m1 ; 9 + pmaxsd m1, m13, m5 ; 13 + pmaxsd m4, m13, m2 ; 10 + pmaxsd m2, m13, [cq+32* 6] ; 6 + pmaxsd m5, m13, [cq+32* 5] ; 5 + pmaxsd m6, m13, m6 ; 14 + pmaxsd m7, m13, [cq+32* 1] ; 1 + REPX {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7 + vpbroadcastd m12, [vvc_pd_2048] + vpbroadcastd m15, [vvc_pd_64] + call m(iadst_16x8_internal_10).main_part1 + pmaxsd m0, m13, [cq+32* 0] ; 0 + pmaxsd m1, m13, [cq+32*15] ; 15 + pmaxsd m2, m13, [cq+32* 4] ; 4 + pmaxsd m3, m13, [cq+32*11] ; 11 + pmaxsd m4, m13, [cq+32* 8] ; 8 + pmaxsd m5, m13, [cq+32* 7] ; 7 + pmaxsd m6, m13, [cq+32*12] ; 12 + pmaxsd m7, m13, [cq+32* 3] ; 3 + REPX {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(iadst_16x8_internal_10).main_part2 + vpbroadcastd m14, [vvc_pd_17408] + psrld m15, 11 ; vvc_pd_1 + psubd m13, m14, m15 ; vvc_pd_17407 + pslld m15, 3 ; vvc_pd_8 + ret + +INV_TXFM_8X16_FN flipadst, dct2, 0, 12 +INV_TXFM_8X16_FN flipadst, adst, 0, 12 +INV_TXFM_8X16_FN flipadst, flipadst, 0, 12 +INV_TXFM_8X16_FN flipadst, identity, 35, 12 + +cglobal 
iflipadst_8x16_internal_12, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + jmp m(iflipadst_8x16_internal_10).pass1 +.pass2: + lea r6, [rsp+32*4] + call m(iadst_8x16_internal_12).pass2_main + call m(iflipadst_16x8_internal_10).pass1_rotations + jmp m(iadst_8x16_internal_12).pass2_end + +INV_TXFM_8X16_FN identity, dct2, 0, 12 +INV_TXFM_8X16_FN identity, adst, 0, 12 +INV_TXFM_8X16_FN identity, flipadst, 0, 12 +INV_TXFM_8X16_FN identity, identity, 0, 12 + +cglobal iidentity_8x16_internal_12, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + jmp m(iidentity_8x16_internal_10).pass1 +.pass2: + call .pass2_main + packssdw m0, m8 + packssdw m1, m9 + packssdw m2, m10 + packssdw m3, m11 + packssdw m4, m12 + packssdw m5, m13 + packssdw m6, m14 + packssdw m13, m7, m15 + vpbroadcastd m7, [pixel_12_max] + vpbroadcastd m12, [vvc_pw_16384] + call m(iidentity_8x16_internal_10).pass2_end + RET +ALIGN function_align +.pass2_main: + mova [cq], m7 + vpbroadcastd m7, [clip_18b_min] + REPX {pmaxsd x, m7}, m0, m1, m2, m3, m4, m5, m6, \ + m8, m9, m10, m11, m12, m13, m14, m15 + pmaxsd m7, [cq] + mova [cq], m15 + vpbroadcastd m15, [clip_18b_max] + REPX {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14 + pminsd m15, [cq] + mova [cq], m7 + vpbroadcastd m7, [vvc_pd_5793] + REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6, \ + m8, m9, m10, m11, m12, m13, m14, m15 + pmulld m7, [cq] + mova [cq], m15 + vpbroadcastd m15, [vvc_pd_1024] + REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14 + paddd m15, [cq] + REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + ret + +%macro INV_TXFM_16X4_FN 2-3 10 ; type1, type2, bitdepth + INV_TXFM_FN %1, %2, 0, 16x4, %3 +%ifidn %1_%2, dct2_dct2 + vpbroadcastd m3, [dconly_%3] +%if %3 = 10 +.dconly: + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 4 +.dconly2: + add r6d, 384 + sar r6d, 9 
+.dconly3: + imul r6d, 181 + add r6d, 2176 + sar r6d, 12 + movd xm0, r6d + paddsw xm0, xm3 + vpbroadcastw m0, xm0 +.dconly_loop: + paddsw m1, m0, [dstq+strideq*0] + paddsw m2, m0, [dstq+strideq*1] + psubusw m1, m3 + psubusw m2, m3 + mova [dstq+strideq*0], m1 + mova [dstq+strideq*1], m2 + lea dstq, [dstq+strideq*2] + sub r3d, 2 + jg .dconly_loop + RET +%else + jmp m(vvc_inv_dct2_dct2_16x4_10).dconly +%endif +%endif +%endmacro + +INV_TXFM_16X4_FN dct2, dct2 +INV_TXFM_16X4_FN dct2, identity +INV_TXFM_16X4_FN dct2, adst +INV_TXFM_16X4_FN dct2, flipadst + +cglobal idct2_16x4_internal_10, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] +.pass1: + vbroadcasti128 m0, [cq+16* 0] + vbroadcasti128 m4, [cq+16* 4] + vbroadcasti128 m1, [cq+16* 2] + vbroadcasti128 m7, [cq+16* 6] + vbroadcasti128 m5, [cq+16*10] + vbroadcasti128 m2, [cq+16* 8] + vbroadcasti128 m6, [cq+16*12] + vbroadcasti128 m3, [cq+16*14] + shufpd m0, m4, 0x0c ; 0 4 + shufpd m1, m5, 0x0c ; 2 10 + shufpd m2, m6, 0x0c ; 8 12 + shufpd m3, m7, 0x0c ; 14 6 + call .pass1_main + vbroadcasti128 m10, [cq+16* 1] + vbroadcasti128 m4, [cq+16* 5] + vbroadcasti128 m11, [cq+16*15] + vbroadcasti128 m5, [cq+16*11] + shufpd m10, m4, 0x0c ; 1 5 + shufpd m11, m5, 0x0c ; 15 11 + vbroadcasti128 m5, [cq+16* 9] + vbroadcasti128 m4, [cq+16*13] + shufpd m5, m4, 0x0c ; 9 13 + vbroadcasti128 m6, [cq+16* 7] + vbroadcasti128 m4, [cq+16* 3] + shufpd m6, m4, 0x0c ; 7 3 + call .pass1_main2 + pcmpeqd m4, m4 + REPX {psubd x, m4}, m0, m1, m2, m3 + call .pass1_main3 + REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7 + jmp tx2q +.pass2: + call .transpose_4x16_packed + lea r6, [deint_shuf+128] + call m(idct2_16x4_internal_8).main +.end: + vpbroadcastd m4, [vvc_pw_2048] + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + vpbroadcastd m5, [pixel_10_max] +.end2: + paddw m0, [dstq+strideq*0] + paddw m1, [dstq+strideq*1] +.end3: + lea r6, [dstq+strideq*2] + paddw m2, [r6 +strideq*0] + paddw m3, [r6 +strideq*1] 
+ pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7 + REPX {pmaxsw x, m4}, m0, m1, m2, m3 + REPX {pminsw x, m5}, m0, m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [r6 +strideq*0], m2 + mova [r6 +strideq*1], m3 + RET +ALIGN function_align +.pass1_main: + vpbroadcastd m7, [vvc_pd_2048] + call m(idct2_8x4_internal_10).main + psubd m3, m0, m4 ; idct2_8 out7 out6 + paddd m0, m4 ; idct2_8 out0 out1 + paddd m1, m2, m5 ; idct2_8 out3 out2 + psubd m2, m5 ; idct2_8 out4 out5 + ret +ALIGN function_align +.pass1_main2: + ITX_MULSUB_2D 10, 11, 4, 12, 13, 7, 9_43, 90_80, 1 + ITX_MULSUB_2D 5, 6, 4, 12, 13, 7, 70_87, 57_25, 1 + vbroadcasti128 m12, [vvc_pd_83_m83] + psubd m4, m10, m5 + paddd m10, m5 ; t8 t11 + psignd m4, m12 ; t9 t10 + psubd m5, m11, m6 + paddd m11, m6 ; t15 t12 + psignd m5, m12 ; t14 t13 + vpbroadcastd m6, [vvc_pd_36] + vpbroadcastd m13, [vvc_pd_83] + REPX {pmaxsd x, m8}, m5, m4 + REPX {pminsd x, m9}, m5, m4 + pmulld m12, m5 + pmulld m5, m6 + vbroadcasti128 m6, [vvc_pd_36_m36] + pmulld m13, m4 + pmulld m4, m6 + REPX {pmaxsd x, m8}, m10, m11, m0, m1 + REPX {pminsd x, m9}, m10, m11, m0, m1 + paddd m12, m7 + paddd m5, m7 + paddd m4, m12 + psubd m5, m13 + psrad m4, 12 ; t14a t10a + psrad m5, 12 ; t9a t13a + vpbroadcastd m12, [vvc_pd_64] + punpckhqdq m6, m11, m5 + punpcklqdq m11, m4 + punpckhqdq m4, m10, m4 + punpcklqdq m10, m5 + psubd m5, m11, m6 ; t12a t13 + paddd m11, m6 ; t15a t14 + psubd m6, m10, m4 ; t11a t10 + paddd m10, m4 ; t8a t9 + REPX {pmaxsd x, m8}, m5, m6 + REPX {pminsd x, m9}, m5, m6 + pmulld m5, m12 + pmulld m6, m12 + REPX {pmaxsd x, m8}, m2, m3, m11, m10 + REPX {pminsd x, m9}, m2, m3, m11, m10 + ret +ALIGN function_align +.pass1_main3: + paddd m5, m7 + psubd m4, m5, m6 + paddd m5, m6 + psrad m4, 12 ; t11 t10a + psrad m5, 12 ; t12 t13a + psubd m7, m0, m11 ; out15 out14 + paddd m0, m11 ; out0 out1 + psubd m6, m1, m5 ; out12 out13 + paddd m1, m5 ; out3 out2 + psubd m5, m2, m4 ; out11 out10 + paddd m2, m4 ; out4 
out5 + psubd m4, m3, m10 ; out8 out9 + paddd m3, m10 ; out7 out6 + REPX {pshufd x, x, q1032}, m1, m3, m5, m7 + ret +ALIGN function_align +.transpose_4x16_packed: + vbroadcasti128 m8, [deint_shuf] + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 + packssdw m6, m7 + REPX {pshufb x, m8}, m0, m2, m4, m6 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpckhqdq m2, m4, m6 + punpcklqdq m4, m6 + vperm2i128 m3, m1, m2, 0x31 + vinserti128 m1, xm2, 1 + vperm2i128 m2, m0, m4, 0x31 + vinserti128 m0, xm4, 1 + ret + +INV_TXFM_16X4_FN adst, dct2 +INV_TXFM_16X4_FN adst, adst +INV_TXFM_16X4_FN adst, flipadst +INV_TXFM_16X4_FN adst, identity + +cglobal iadst_16x4_internal_10, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] +.pass1: + call m(iadst_4x16_internal_10).main + psrad m11, 11 ; vvc_pd_1 + REPX {paddd x, m11}, m0, m1, m2, m3 + paddd m4, m5, m11 + paddd m5, m6, m11 + paddd m6, m7, m11 + paddd m7, m8, m11 +.pass1_end: + REPX {pshufd x, x, q1032}, m0, m2, m4, m6 + REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7 + jmp tx2q +.pass2: + call m(idct2_16x4_internal_10).transpose_4x16_packed + lea r6, [deint_shuf+128] + call m(iadst_16x4_internal_8).main + jmp m(idct2_16x4_internal_10).end +ALIGN function_align +.main: + vpbroadcastd m6, [vvc_pd_1321] + mova m0, [cq+32*0] + mova m1, [cq+32*1] + vpbroadcastd m7, [vvc_pd_2482] + mova m2, [cq+32*6] + mova m3, [cq+32*7] + pmulld m4, m0, m6 + pmulld m5, m1, m6 ; 1321*in0 + pmulld m9, m2, m7 + pmulld m8, m3, m7 ; 2482*in3 + paddd m4, m9 + paddd m8, m5 ; 1321*in0 + 2482*in3 + pmulld m5, m0, m7 + pmulld m9, m1, m7 ; 2482*in0 + paddd m0, m2 + paddd m1, m3 ; in0 + in3 + paddd m7, m6 ; vvc_pd_3803 + pmulld m2, m7 + pmulld m3, m7 ; 3803*in3 + psubd m5, m2 + psubd m9, m3 ; 2482*in0 - 3803*in3 + mova m2, [cq+32*4] + pmulld m10, m7, m2 + pmulld m3, m6, m2 + psubd m2, m0 + mova m0, [cq+32*5] + pmulld m7, m0 ; 3803*in2 + pmulld m6, m0 ; 1321*in2 + psubd m0, m1 ; in2 - in0 - in3 + 
vpbroadcastd m1, [vvc_pd_m3344] + paddd m4, m10 + paddd m7, m8 ; t0 + psubd m5, m3 + psubd m9, m6 ; t1 + pmulld m2, m1 + pmulld m0, m1 ; t2 + pmulld m3, m1, [cq+32*2] + pmulld m1, [cq+32*3] ; -t3 + ret +ALIGN function_align +.main_end: + ; expects: m6 = rnd + paddd m5, m6 + paddd m9, m6 + paddd m10, m4, m5 + paddd m4, m6 + paddd m8, m7, m6 + paddd m7, m9 + psubd m4, m3 ; out0 (unshifted) + psubd m5, m3 ; out1 (unshifted) + paddd m2, m6 ; out2 (unshifted) + paddd m3, m10 ; out3 (unshifted) + psubd m8, m1 ; out4 (unshifted) + psubd m9, m1 ; out5 (unshifted) + paddd m6, m0 ; out6 (unshifted) + paddd m7, m1 ; out7 (unshifted) + ret + +INV_TXFM_16X4_FN flipadst, dct2 +INV_TXFM_16X4_FN flipadst, adst +INV_TXFM_16X4_FN flipadst, flipadst +INV_TXFM_16X4_FN flipadst, identity + +cglobal iflipadst_16x4_internal_10, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] +.pass1: + call m(iadst_4x16_internal_10).main + psrad m11, 11 ; vvc_pd_1 + paddd m4, m3, m11 + paddd m3, m5, m11 + paddd m5, m2, m11 + paddd m2, m6, m11 + paddd m6, m1, m11 + paddd m1, m7, m11 + paddd m7, m0, m11 + paddd m0, m8, m11 + jmp m(iadst_16x4_internal_10).pass1_end +.pass2: + call m(idct2_16x4_internal_10).transpose_4x16_packed + lea r6, [deint_shuf+128] + call m(iadst_16x4_internal_8).main + vpbroadcastd m4, [vvc_pw_2048] + pmulhrsw m5, m3, m4 + pmulhrsw m6, m2, m4 + pmulhrsw m2, m1, m4 + pmulhrsw m3, m0, m4 + paddw m0, m5, [dstq+strideq*0] + paddw m1, m6, [dstq+strideq*1] + vpbroadcastd m5, [pixel_10_max] + jmp m(idct2_16x4_internal_10).end3 + +INV_TXFM_16X4_FN identity, dct2 +INV_TXFM_16X4_FN identity, adst +INV_TXFM_16X4_FN identity, flipadst +INV_TXFM_16X4_FN identity, identity + +cglobal iidentity_16x4_internal_10, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m8, [vvc_pd_5793] + vpermq m0, [cq+32*0], q3120 ; 0 1 + vpermq m1, [cq+32*1], q3120 ; 2 3 + vpermq m2, [cq+32*2], q3120 ; 4 5 + vpermq m3, [cq+32*3], q3120 ; 6 7 + vpermq m4, 
[cq+32*4], q3120 ; 8 9 + vpermq m5, [cq+32*5], q3120 ; a b + vpermq m6, [cq+32*6], q3120 ; c d + vpermq m7, [cq+32*7], q3120 ; e f + vpbroadcastd m9, [vvc_pd_3072] + REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7 + jmp tx2q +.pass2: + call m(idct2_16x4_internal_10).transpose_4x16_packed + vpbroadcastd m7, [vvc_pw_1697x8] + pmulhrsw m4, m7, m0 + pmulhrsw m5, m7, m1 + pmulhrsw m6, m7, m2 + pmulhrsw m7, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + jmp m(idct2_16x4_internal_10).end + +INV_TXFM_16X4_FN dct2, dct2, 12 +INV_TXFM_16X4_FN dct2, identity, 12 +INV_TXFM_16X4_FN dct2, adst, 12 +INV_TXFM_16X4_FN dct2, flipadst, 12 + +cglobal idct2_16x4_internal_12, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m8, [clip_20b_min] + vpbroadcastd m9, [clip_20b_max] + jmp m(idct2_16x4_internal_10).pass1 +.pass2: + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + ; deinterleave + REPX {pshufd x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7 + ; transpose + punpcklqdq m8, m0, m1 + punpckhqdq m0, m1 + punpcklqdq m9, m2, m3 + punpckhqdq m2, m3 + punpcklqdq m10, m4, m5 + punpckhqdq m4, m5 + punpcklqdq m11, m6, m7 + punpckhqdq m6, m7 + vperm2i128 m3, m0, m2, 0x31 ; out6 + vperm2i128 m1, m0, m2, 0x20 ; out2 + vperm2i128 m7, m4, m6, 0x31 ; out7 + vperm2i128 m5, m4, m6, 0x20 ; out3 + vperm2i128 m13, m10, m11, 0x31 ; out5 + vperm2i128 m12, m10, m11, 0x20 ; out1 + vperm2i128 m11, m8, m9, 0x31 ; out4 + vperm2i128 m10, m8, m9, 0x20 ; out0 + call m(idct2_4x16_internal_10).pass1_main + pmulld m0, m6, m10 + pmulld m2, m6, m11 + pmulld m4, m6, m12 + pmulld m6, m13 + vpbroadcastd m10, [vvc_pd_17408] + call m(idct2_4x16_internal_10).pass1_main2 + REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7 + packssdw m0, m4 + packssdw m1, m5 + 
packssdw m2, m6 + packssdw m3, m7 + vpbroadcastd m5, [pixel_12_max] + REPX {vpermq x, x, q3120}, m0, m1, m2, m3 + jmp m(idct2_16x4_internal_10).end2 + +INV_TXFM_16X4_FN adst, dct2, 12 +INV_TXFM_16X4_FN adst, adst, 12 +INV_TXFM_16X4_FN adst, flipadst, 12 +INV_TXFM_16X4_FN adst, identity, 12 + +cglobal iadst_16x4_internal_12, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + jmp m(iadst_16x4_internal_10).pass1 +.pass2: + call .pass2_main + REPX {vpermq x, x, q3120}, m0, m1, m2, m3 + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + jmp m(idct2_16x4_internal_10).end2 +ALIGN function_align +.pass2_main: + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m6, m7 + pmaxsd m8, m4, m12 + pmaxsd m9, m5, m12 + REPX {pminsd x, m13}, m0, m1, m2, m3 + call m(iadst_8x4_internal_12).transpose_4x8 + mova [cq+32*0], m0 + mova [cq+32*2], m1 + mova [cq+32*4], m2 + mova [cq+32*6], m3 + pminsd m0, m8, m13 + pminsd m1, m9, m13 + pminsd m2, m6, m13 + pminsd m3, m7, m13 + call m(iadst_8x4_internal_12).transpose_4x8 + mova [cq+32*1], m0 + mova [cq+32*3], m1 + mova [cq+32*5], m2 + mova [cq+32*7], m3 + call m(iadst_16x4_internal_10).main + vpbroadcastd m6, [vvc_pd_2048] + call m(iadst_16x4_internal_10).main_end + psrad m0, m4, 15 + psrad m1, m5, 15 + psrad m2, 15 + psrad m3, 15 + psrad m4, m8, 15 + psrad m5, m9, 15 + psrad m6, 15 + psrad m7, 15 + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + vpbroadcastd m4, [vvc_pw_16384] + vpbroadcastd m5, [pixel_12_max] + ret + +INV_TXFM_16X4_FN flipadst, dct2, 12 +INV_TXFM_16X4_FN flipadst, adst, 12 +INV_TXFM_16X4_FN flipadst, flipadst, 12 +INV_TXFM_16X4_FN flipadst, identity, 12 + +cglobal iflipadst_16x4_internal_12, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + jmp m(iflipadst_16x4_internal_10).pass1 +.pass2: + call m(iadst_16x4_internal_12).pass2_main + vpermq 
m7, m0, q3120 + vpermq m6, m1, q3120 + vpermq m1, m2, q3120 + vpermq m0, m3, q3120 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + pmulhrsw m2, m6, m4 + pmulhrsw m3, m7, m4 + jmp m(idct2_16x4_internal_10).end2 + +INV_TXFM_16X4_FN identity, dct2, 12 +INV_TXFM_16X4_FN identity, adst, 12 +INV_TXFM_16X4_FN identity, flipadst, 12 +INV_TXFM_16X4_FN identity, identity, 12 + +cglobal iidentity_16x4_internal_12, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m8, [vvc_pd_1697] + vpermq m0, [cq+32*0], q3120 ; 0 1 + vpermq m1, [cq+32*1], q3120 ; 2 3 + vpermq m2, [cq+32*2], q3120 ; 4 5 + vpermq m3, [cq+32*3], q3120 ; 6 7 + vpbroadcastd m9, [vvc_pd_3072] + pmulld m4, m8, m0 + pmulld m5, m8, m1 + pmulld m6, m8, m2 + pmulld m7, m8, m3 + vpermq m10, [cq+32*4], q3120 ; 8 9 + vpermq m11, [cq+32*5], q3120 ; a b + vpermq m12, [cq+32*6], q3120 ; c d + vpermq m13, [cq+32*7], q3120 ; e f + REPX {paddd x, m9}, m4, m5, m6, m7 + REPX {psrad x, 12}, m4, m5, m6, m7 + paddd m0, m4 + pmulld m4, m8, m10 + paddd m1, m5 + pmulld m5, m8, m11 + paddd m2, m6 + pmulld m6, m8, m12 + paddd m3, m7 + pmulld m7, m8, m13 + REPX {paddd x, m9}, m4, m5, m6, m7 + REPX {psrad x, 12}, m4, m5, m6, m7 + paddd m4, m10 + paddd m5, m11 + paddd m6, m12 + paddd m7, m13 + jmp tx2q +.pass2: + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + vpbroadcastd m8, [vvc_pd_5793] + vpbroadcastd m9, [vvc_pd_2048] + REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 15}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct2_16x4_internal_10).transpose_4x16_packed + vpbroadcastd m4, [vvc_pw_16384] + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + vpbroadcastd m5, [pixel_12_max] + jmp m(idct2_16x4_internal_10).end2 + +%macro INV_TXFM_16X8_FN 2-3 10 ; type1, type2, bitdepth + INV_TXFM_FN %1, %2, 0, 16x8, %3 +%ifidn %1_%2, dct2_dct2 + imul r6d, [cq], 181 + 
vpbroadcastd m3, [dconly_%3] + mov [cq], eobd ; 0 + or r3d, 8 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + jmp m(vvc_inv_dct2_dct2_16x4_10).dconly2 +%endif +%endmacro + +INV_TXFM_16X8_FN dct2, dct2 +INV_TXFM_16X8_FN dct2, identity +INV_TXFM_16X8_FN dct2, adst +INV_TXFM_16X8_FN dct2, flipadst + +cglobal idct2_16x8_internal_10, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] +.pass1: + vpbroadcastd m14, [vvc_pd_64] + pmulld m0, m14, [cq+32* 1] + pmulld m1, m14, [cq+32* 3] + pmulld m2, m14, [cq+32* 5] + pmulld m3, m14, [cq+32* 7] + pmulld m4, m14, [cq+32* 9] + pmulld m5, m14, [cq+32*11] + pmulld m6, m14, [cq+32*13] + pmulld m7, m14, [cq+32*15] + vpbroadcastd m11, [vvc_pd_2048] + lea r6, [rsp+32*4] + call m(idct2_8x16_internal_10).main_oddhalf_rect2 + pmulld m0, m14, [cq+32* 0] + pmulld m1, m14, [cq+32* 2] + pmulld m2, m14, [cq+32* 4] + pmulld m3, m14, [cq+32* 6] + pmulld m4, m14, [cq+32* 8] + pmulld m5, m14, [cq+32*10] + pmulld m6, m14, [cq+32*12] + pmulld m7, m14, [cq+32*14] + call m(idct2_8x8_internal_10).main_rect2 + call m(idct2_8x16_internal_10).main_evenhalf + psrld m11, 11 ; vvc_pd_1 + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + call .pass1_rotations + REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: + call .transpose + call m(idct2_16x8_internal_8).main + vpbroadcastd m10, [vvc_pw_2048] +.end: + pmulhrsw m0, m10 + pmulhrsw m1, m10 + pmulhrsw m2, m10 + pmulhrsw m3, m10 + call .write_16x4_start +.end2: + pmulhrsw m0, m4, m10 + pmulhrsw m1, m5, m10 + pmulhrsw m2, m6, m10 + pmulhrsw m3, m7, m10 + call .write_16x4_zero + RET +ALIGN function_align +.pass1_rotations: + mova m14, [r6-32*4] + mova m13, [r6-32*3] + mova m12, [r6-32*2] + mova m11, [r6-32*1] + mova m10, [r6+32*0] + mova m9, [r6+32*1] + mova m8, [r6+32*2] + psubd m15, m0, m14 ; out15 + paddd m0, m14 ; out0 + psubd m14, m1, m13 ; out14 + paddd m1, m13 ; out1 + psubd m13, 
m2, m12 ; out13 + paddd m2, m12 ; out2 + psubd m12, m3, m11 ; out12 + paddd m3, m11 ; out3 + psubd m11, m4, m10 ; out11 + paddd m4, m10 ; out4 + psubd m10, m5, m9 ; out10 + paddd m5, m9 ; out5 + psubd m9, m6, m8 ; out9 + paddd m6, m8 ; out6 + psubd m8, m7, [r6+32*3] ; out8 + paddd m7, [r6+32*3] ; out7 + ret +ALIGN function_align +.transpose: + lea r6, [deint_shuf+128] +.transpose2: + packssdw m0, m8 + packssdw m1, m9 + packssdw m2, m10 + packssdw m3, m11 + packssdw m4, m12 + packssdw m5, m13 + packssdw m6, m14 + packssdw m7, m15 +.transpose3: + punpckhwd m8, m0, m1 + punpcklwd m0, m1 + punpcklwd m1, m2, m3 + punpckhwd m2, m3 + punpckhwd m3, m4, m5 + punpcklwd m4, m5 + punpckhwd m5, m6, m7 + punpcklwd m6, m7 + punpckhdq m7, m4, m6 + punpckldq m4, m6 + punpckldq m6, m8, m2 + punpckhdq m8, m2 + punpckhdq m2, m0, m1 + punpckldq m0, m1 + punpckhdq m1, m3, m5 + punpckldq m3, m5 + punpcklqdq m5, m6, m3 + punpckhqdq m6, m3 + punpckhqdq m3, m2, m7 + punpcklqdq m2, m7 + punpcklqdq m7, m8, m1 + punpckhqdq m8, m1 + punpckhqdq m1, m0, m4 + punpcklqdq m0, m4 + vperm2i128 m4, m0, m5, 0x31 + vinserti128 m0, xm5, 1 + vperm2i128 m5, m1, m6, 0x31 + vinserti128 m1, xm6, 1 + vperm2i128 m6, m2, m7, 0x31 + vinserti128 m2, xm7, 1 + vperm2i128 m7, m3, m8, 0x31 + vinserti128 m3, xm8, 1 + ret +ALIGN function_align +.write_16x4_start: + vpbroadcastd m9, [pixel_10_max] + lea r3, [strideq*3] + pxor m8, m8 +.write_16x4_zero: + REPX {mova [cq+32*x], m8}, 0, 1, 2, 3, 4, 5, 6, 7 + add cq, 32*8 +.write_16x4: + paddw m0, [dstq+strideq*0] + paddw m1, [dstq+strideq*1] + paddw m2, [dstq+strideq*2] + paddw m3, [dstq+r3 ] + REPX {pmaxsw x, m8}, m0, m1, m2, m3 + REPX {pminsw x, m9}, m0, m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+r3 ], m3 + lea dstq, [dstq+strideq*4] + ret + +INV_TXFM_16X8_FN adst, dct2 +INV_TXFM_16X8_FN adst, adst +INV_TXFM_16X8_FN adst, flipadst +INV_TXFM_16X8_FN adst, identity + +cglobal iadst_16x8_internal_10, 0, 7, 16, 
32*8, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_18b_min] + vpbroadcastd m14, [clip_18b_max] +.pass1: + lea r6, [rsp+32*4] + call .main + vpbroadcastd m14, [vvc_pd_3072] + psrld m15, 11 ; vvc_pd_1 + psubd m13, m14, m15 ; vvc_pd_3071 + call .pass1_rotations +.pass1_end: + REPX {psrad x, 1 }, m0, m1, m2, m3, m12, m13, m14, m15 + REPX {psrad x, 12}, m4, m5, m6, m7, m8, m9, m10, m11 + jmp tx2q +.pass2: + call m(idct2_16x8_internal_10).transpose + call m(iadst_16x8_internal_8).main + call m(iadst_16x8_internal_8).main_pass2_end + vpbroadcastd m10, [vvc_pw_2048] + pxor m11, m11 + psubw m11, m10 + pmulhrsw m0, m10 + pmulhrsw m1, m11 + pmulhrsw m2, m10 + pmulhrsw m3, m11 + call m(idct2_16x8_internal_10).write_16x4_start + pmulhrsw m0, m4, m10 + pmulhrsw m1, m5, m11 + pmulhrsw m2, m6, m10 + pmulhrsw m3, m7, m11 + call m(idct2_16x8_internal_10).write_16x4_zero + RET +ALIGN function_align +.pass1_rotations: + paddd m0, m15 + psubd m1, m15, m1 + paddd m2, m15 + psubd m3, m15, m3 + paddd m4, m14 + psubd m5, m13, m5 + paddd m6, m14 + psubd m7, m13, m7 + paddd m8, m14, m9 + psubd m9, m13, m10 + paddd m10, m14, m11 + psubd m11, m13, m12 + paddd m12, m15, [r6-32*1] + psubd m13, m15, [r6-32*2] + paddd m14, m15, [r6-32*3] + psubd m15, [r6-32*4] + ret +ALIGN function_align +.main: + ; expects: m13 = clip_min m14 = clip_max + vpbroadcastd m15, [vvc_pd_64] + pmulld m0, m15, [cq+32* 2] + pmulld m1, m15, [cq+32*13] + pmulld m2, m15, [cq+32* 6] + pmulld m3, m15, [cq+32* 9] + pmulld m4, m15, [cq+32*10] + pmulld m5, m15, [cq+32* 5] + pmulld m6, m15, [cq+32*14] + pmulld m7, m15, [cq+32* 1] + vpbroadcastd m12, [vvc_pd_2048] + REPX {paddd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 + call .main_part1 + pmulld m0, m15, [cq+32* 0] + pmulld m1, m15, [cq+32*15] + pmulld m2, m15, [cq+32* 4] + pmulld m3, m15, [cq+32*11] + pmulld m4, m15, [cq+32* 8] + pmulld m5, m15, [cq+32* 7] + pmulld m6, m15, [cq+32*12] + pmulld m7, m15, [cq+32* 3] + REPX 
{paddd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 +.main_part2: + ITX_MULSUB_2D 1, 0, 8, 9, 10, 12, 201, 4091 + ITX_MULSUB_2D 3, 2, 8, 9, 10, 12, 1751, 3703 + ITX_MULSUB_2D 5, 4, 8, 9, 10, 12, 3035, 2751 + ITX_MULSUB_2D 7, 6, 8, 9, 10, 12, 3857, 1380 + psubd m8, m0, m4 ; t8a + paddd m0, m4 ; t0a + psubd m4, m1, m5 ; t9a + paddd m1, m5 ; t1a + psubd m5, m2, m6 ; t12a + paddd m2, m6 ; t4a + psubd m6, m3, m7 ; t13a + paddd m7, m3 ; t5a + REPX {pmaxsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7 + REPX {pminsd x, m14}, m8, m4, m5, m6, m0, m1, m2, m7 + vpbroadcastd m11, [vvc_pd_89] + vpbroadcastd m10, [vvc_pd_18] + ITX_MULSUB_2D 8, 4, 3, 9, _, 12, 10, 11, 0xc + ITX_MULSUB_2D 6, 5, 3, 9, _, 12, 11, 10, 0xc + psubd m3, m0, m2 ; t4 + paddd m0, m2 ; t0 + psubd m2, m1, m7 ; t5 + paddd m1, m7 ; t1 + psubd m7, m4, m6 ; t12a + paddd m4, m6 ; t8a + psubd m6, m8, m5 ; t13a + paddd m5, m8 ; t9a + REPX {pmaxsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5 + REPX {pminsd x, m14}, m3, m2, m7, m6, m0, m1, m4, m5 + vpbroadcastd m11, [vvc_pd_83] + vpbroadcastd m10, [vvc_pd_36] + ITX_MULSUB_2D 3, 2, 8, 9, _, 12, 10, 11, 0xc + ITX_MULSUB_2D 7, 6, 8, 9, _, 12, 10, 11, 0xc + pminsd m10, m14, [r6-32*4] ; t2 + pminsd m8, m14, [r6-32*3] ; t3 + psubd m9, m0, m10 ; t2a + paddd m0, m10 ; out0 + psubd m10, m1, m8 ; t3a + paddd m1, m8 ; -out15 + pmaxsd m9, m13 + pmaxsd m10, m13 + pminsd m9, m14 + pminsd m10, m14 + mova [r6-32*4], m1 + mova m11, [r6-32*1] ; t7a + mova m1, [r6-32*2] ; t6a + psubd m8, m3, m11 ; t7 + paddd m11, m3 ; out12 + paddd m3, m2, m1 ; -out3 + psubd m2, m1 ; t6 + pmaxsd m8, m13 + pmaxsd m2, m13 + pminsd m8, m14 + pminsd m2, m14 + mova [r6-32*1], m11 + mova [r6-32*3], m2 + mova m1, [r6+32*3] ; t15 + mova m2, [r6+32*2] ; t14 + paddd m12, m7, m1 ; -out13 + psubd m7, m1 ; t15a + psubd m11, m6, m2 ; t14a + paddd m2, m6 ; out2 + pmaxsd m7, m13 + pmaxsd m11, m13 + pminsd m7, m14 + pminsd m11, m14 + mova [r6-32*2], m12 + pminsd m1, m14, [r6+32*0] 
; t10a + pminsd m12, m14, [r6+32*1] ; t11a + psubd m6, m4, m1 ; t10 + paddd m1, m4 ; -out1 + psubd m4, m5, m12 ; t11 + paddd m5, m12 ; out14 + vpbroadcastd m12, [vvc_pd_1448] + pmaxsd m6, m13 + pmaxsd m4, m13 + pminsd m6, m14 + pminsd m4, m14 + REPX {pmulld x, m12}, m9, m10, m8, m7, m11, m6, m4 + pmulld m12, [r6-32*3] ; t6 + mova [r6-32*3], m5 + paddd m5, m11, m7 ; -out5 (unshifted) + psubd m11, m7 ; out10 (unshifted) + paddd m7, m9, m10 ; -out7 (unshifted) + psubd m9, m10 ; out8 (unshifted) + psubd m10, m6, m4 ; -out9 (unshifted) + paddd m6, m4 ; out6 (unshifted) + paddd m4, m12, m8 ; out4 (unshifted) + psubd m12, m8 ; -out11 (unshifted) + ret +.main_part1: + ITX_MULSUB_2D 1, 0, 8, 9, 10, 12, 995, 3973 + ITX_MULSUB_2D 3, 2, 8, 9, 10, 12, 2440, 3290 + ITX_MULSUB_2D 5, 4, 8, 9, 10, 12, 3513, 2106 + ITX_MULSUB_2D 7, 6, 8, 9, 10, 12, 4052, 601 + psubd m8, m0, m4 ; t10a + paddd m0, m4 ; t2a + psubd m4, m1, m5 ; t11a + paddd m1, m5 ; t3a + psubd m5, m2, m6 ; t14a + paddd m2, m6 ; t6a + psubd m6, m3, m7 ; t15a + paddd m7, m3 ; t7a + REPX {pmaxsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7 + REPX {pminsd x, m14}, m8, m4, m5, m6, m0, m1, m2, m7 + vpbroadcastd m11, [vvc_pd_50] + vpbroadcastd m10, [vvc_pd_75] + ITX_MULSUB_2D 8, 4, 3, 9, _, 12, 10, 11, 0xc + ITX_MULSUB_2D 6, 5, 3, 9, _, 12, 11, 10, 0xc + psubd m3, m0, m2 ; t6 + paddd m0, m2 ; t2 + psubd m2, m1, m7 ; t7 + paddd m1, m7 ; t3 + psubd m7, m4, m6 ; t14a + paddd m4, m6 ; t10a + psubd m6, m8, m5 ; t15a + paddd m5, m8 ; t11a + REPX {pmaxsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5 + REPX {pminsd x, m14}, m3, m2, m7, m6 ; clip the rest later + vpbroadcastd m11, [vvc_pd_36] + vpbroadcastd m10, [vvc_pd_83] + ITX_MULSUB_2D 2, 3, 8, 9, _, 12, 10, 11, 0xc + ITX_MULSUB_2D 6, 7, 8, 9, _, 12, 10, 11, 0xc + mova [r6-32*4], m0 + mova [r6-32*3], m1 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova [r6-32*2], m2 + mova [r6-32*1], m3 + mova [r6+32*2], m6 + mova [r6+32*3], m7 + ret + +INV_TXFM_16X8_FN flipadst, dct2 +INV_TXFM_16X8_FN 
flipadst, adst +INV_TXFM_16X8_FN flipadst, flipadst +INV_TXFM_16X8_FN flipadst, identity + +cglobal iflipadst_16x8_internal_10, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_18b_min] + vpbroadcastd m14, [clip_18b_max] +.pass1: + lea r6, [rsp+32*4] + call m(iadst_16x8_internal_10).main + vpbroadcastd m14, [vvc_pd_3072] + psrld m15, 11 + psubd m13, m14, m15 + call .pass1_rotations + jmp m(iadst_16x8_internal_10).pass1_end +.pass2: + call m(idct2_16x8_internal_10).transpose + call m(iadst_16x8_internal_8).main + call m(iadst_16x8_internal_8).main_pass2_end + vpbroadcastd m10, [vvc_pw_2048] + pxor m11, m11 + psubw m11, m10 + mova m12, m0 + pmulhrsw m0, m7, m11 + mova m7, m1 + pmulhrsw m1, m6, m10 + mova m6, m2 + pmulhrsw m2, m5, m11 + mova m5, m3 + pmulhrsw m3, m4, m10 + call m(idct2_16x8_internal_10).write_16x4_start + pmulhrsw m0, m5, m11 + pmulhrsw m1, m6, m10 + pmulhrsw m2, m7, m11 + pmulhrsw m3, m12, m10 + call m(idct2_16x8_internal_10).write_16x4_zero + RET +ALIGN function_align +.pass1_rotations: + psubd m8, m13, m7 + paddd m7, m14, m9 + paddd m9, m14, m6 + psubd m6, m13, m10 + psubd m10, m13, m5 + paddd m5, m14, m11 + paddd m11, m14, m4 + psubd m4, m13, m12 + psubd m12, m15, m3 + paddd m3, m15, [r6-32*1] + paddd m13, m15, m2 + psubd m2, m15, [r6-32*2] + psubd m14, m15, m1 + mova m1, m15 + paddd m15, m0 + psubd m0, m1, [r6-32*4] + paddd m1, [r6-32*3] + ret + +INV_TXFM_16X8_FN identity, dct2 +INV_TXFM_16X8_FN identity, adst +INV_TXFM_16X8_FN identity, flipadst +INV_TXFM_16X8_FN identity, identity + +cglobal iidentity_16x8_internal_10, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 +.pass1: + vpbroadcastd m15, [vvc_pd_64] + pmulld m0, m15, [cq+32* 0] + pmulld m1, m15, [cq+32* 1] + pmulld m2, m15, [cq+32* 2] + pmulld m3, m15, [cq+32* 3] + pmulld m4, m15, [cq+32* 4] + pmulld m5, m15, [cq+32* 5] + pmulld m6, m15, [cq+32* 6] + pmulld m7, m15, [cq+32* 7] + pmulld m8, m15, [cq+32* 8] + pmulld m9, m15, [cq+32* 9] + pmulld m10, m15, [cq+32*10] + pmulld m11, 
m15, [cq+32*11] + pmulld m12, m15, [cq+32*12] + pmulld m13, m15, [cq+32*13] + pmulld m14, m15, [cq+32*14] + pmulld m15, [cq+32*15] + mova [rsp], m7 + vpbroadcastd m7, [vvc_pd_2048] + REPX {paddd x, m7 }, m0, m1, m2, m3, m4, m5, m6, \ + m8, m9, m10, m11, m12, m13, m14, m15 + paddd m7, [rsp] + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + mova [rsp], m15 + vpbroadcastd m15, [vvc_pd_5793] + REPX {pmulld x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14 + pmulld m15, [rsp] + mova [rsp], m7 + vpbroadcastd m7, [vvc_pd_3072] + REPX {paddd x, m7 }, m0, m1, m2, m3, m4, m5, m6, \ + m8, m9, m10, m11, m12, m13, m14, m15 + paddd m7, [rsp] + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: + call m(idct2_16x8_internal_10).transpose + vpbroadcastd m10, [vvc_pw_4096] + jmp m(idct2_16x8_internal_10).end + +INV_TXFM_16X8_FN dct2, dct2, 12 +INV_TXFM_16X8_FN dct2, identity, 12 +INV_TXFM_16X8_FN dct2, adst, 12 +INV_TXFM_16X8_FN dct2, flipadst, 12 + +cglobal idct2_16x8_internal_12, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + jmp m(idct2_16x8_internal_10).pass1 +.pass2: + call .pass2_main + RET +ALIGN function_align +.pass2_main: + call m(idct2_8x16_internal_12).transpose + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + vpbroadcastd m11, [vvc_pd_2048] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct2_8x8_internal_10).main + call m(idct2_8x8_internal_12).round_shift4 + mova [cq+32* 8], m0 + mova [cq+32* 9], m1 + mova [cq+32*10], m2 + mova [cq+32*11], m3 + mova [cq+32*12], m4 + mova [cq+32*13], m5 + mova [cq+32*14], m6 + mova [cq+32*15], m7 + pmaxsd m0, m12, [cq+32*0] + pmaxsd m1, m12, [cq+32*1] + pmaxsd m2, m12, [cq+32*2] + pmaxsd m3, m12, [cq+32*3] + pmaxsd m4, m12, [cq+32*4] + pmaxsd 
m5, m12, [cq+32*5] + pmaxsd m6, m12, [cq+32*6] + pmaxsd m7, m12, [cq+32*7] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct2_8x8_internal_10).main + call m(idct2_8x8_internal_12).round_shift4 +.end: + packssdw m0, [cq+32* 8] + packssdw m1, [cq+32* 9] + packssdw m2, [cq+32*10] + packssdw m3, [cq+32*11] + packssdw m4, [cq+32*12] + packssdw m5, [cq+32*13] + packssdw m6, [cq+32*14] + packssdw m7, [cq+32*15] + REPX {vpermq x, x, q3120}, m0, m1, m2, m3 + call .write_16x4_start + call m(idct2_16x8_internal_10).write_16x4_zero + vpermq m0, m4, q3120 + vpermq m1, m5, q3120 + vpermq m2, m6, q3120 + vpermq m3, m7, q3120 + jmp m(idct2_16x8_internal_10).write_16x4_zero +ALIGN function_align +.write_16x4_start: + vpbroadcastd m9, [pixel_12_max] + lea r3, [strideq*3] + pxor m8, m8 + ret + +INV_TXFM_16X8_FN adst, dct2, 12 +INV_TXFM_16X8_FN adst, adst, 12 +INV_TXFM_16X8_FN adst, flipadst, 12 +INV_TXFM_16X8_FN adst, identity, 12 + +cglobal iadst_16x8_internal_12, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_20b_min] + vpbroadcastd m14, [clip_20b_max] + jmp m(iadst_16x8_internal_10).pass1 +.pass2: + call .pass2_main + call m(idct2_16x8_internal_12).end + RET +ALIGN function_align +.pass2_main: + call m(idct2_8x16_internal_12).transpose + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + vpbroadcastd m11, [vvc_pd_2048] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(iadst_8x8_internal_12).pass2_main2 + mova [cq+32* 8], m0 + mova [cq+32* 9], m1 + mova [cq+32*10], m2 + mova [cq+32*11], m3 + mova [cq+32*12], m4 + mova [cq+32*13], m5 + mova [cq+32*14], m6 + mova [cq+32*15], m7 + pmaxsd m0, m12, [cq+32*0] + pmaxsd m1, m12, [cq+32*1] + pmaxsd m2, m12, [cq+32*2] + pmaxsd m3, m12, [cq+32*3] + pmaxsd m4, m12, [cq+32*4] + pmaxsd m5, m12, [cq+32*5] + pmaxsd m6, m12, [cq+32*6] + pmaxsd m7, m12, [cq+32*7] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call 
m(iadst_8x8_internal_12).pass2_main2 + ret + +INV_TXFM_16X8_FN flipadst, dct2, 12 +INV_TXFM_16X8_FN flipadst, adst, 12 +INV_TXFM_16X8_FN flipadst, flipadst, 12 +INV_TXFM_16X8_FN flipadst, identity, 12 + +cglobal iflipadst_16x8_internal_12, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_20b_min] + vpbroadcastd m14, [clip_20b_max] + jmp m(iflipadst_16x8_internal_10).pass1 +.pass2: + call m(iadst_16x8_internal_12).pass2_main + packssdw m13, m0, [cq+32* 8] + packssdw m12, m1, [cq+32* 9] + packssdw m11, m2, [cq+32*10] + packssdw m10, m3, [cq+32*11] + packssdw m3, m4, [cq+32*12] + packssdw m2, m5, [cq+32*13] + packssdw m1, m6, [cq+32*14] + packssdw m0, m7, [cq+32*15] + REPX {vpermq x, x, q3120}, m0, m1, m2, m3 + call m(idct2_16x8_internal_12).write_16x4_start + call m(idct2_16x8_internal_10).write_16x4_zero + vpermq m0, m10, q3120 + vpermq m1, m11, q3120 + vpermq m2, m12, q3120 + vpermq m3, m13, q3120 + call m(idct2_16x8_internal_10).write_16x4_zero + RET + +INV_TXFM_16X8_FN identity, dct2, 12 +INV_TXFM_16X8_FN identity, adst, 12 +INV_TXFM_16X8_FN identity, flipadst, 12 +INV_TXFM_16X8_FN identity, identity, 12 + +cglobal iidentity_16x8_internal_12, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + jmp m(iidentity_16x8_internal_10).pass1 +.pass2: + call m(idct2_16x8_internal_10).transpose2 + vpbroadcastd m10, [vvc_pw_4096] + pmulhrsw m0, m10 + pmulhrsw m1, m10 + pmulhrsw m2, m10 + pmulhrsw m3, m10 + call m(idct2_16x8_internal_12).write_16x4_start + call m(idct2_16x8_internal_10).write_16x4_zero + jmp m(idct2_16x8_internal_10).end2 + +%macro INV_TXFM_16X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth + INV_TXFM_FN %1, %2, %3, 16x16, %4 +%ifidn %1_%2, dct2_dct2 + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_%4] + mov [cq], eobd ; 0 + or r3d, 16 + add r6d, 640 + sar r6d, 10 + jmp m(vvc_inv_dct2_dct2_16x4_10).dconly3 +%endif +%endmacro + +INV_TXFM_16X16_FN dct2, dct2 +INV_TXFM_16X16_FN dct2, identity, 28 +INV_TXFM_16X16_FN dct2, adst +INV_TXFM_16X16_FN 
dct2, flipadst + +cglobal idct2_16x16_internal_10, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] +.pass1: + vpbroadcastd m11, [vvc_pd_2048] + vpbroadcastd m14, [vvc_pd_64] + lea r6, [rsp+32*4] + sub eobd, 36 + jl .fast + add cq, 32 + call .main + sub cq, 32 + mova m10, [r6-32*4] + mova m9, [r6-32*3] + mova m8, [r6-32*2] + psubd m15, m0, m10 ; out15 + paddd m0, m10 ; out0 + psubd m10, m1, m9 ; out14 + paddd m1, m9 ; out1 + psubd m9, m2, m8 ; out13 + paddd m2, m8 ; out2 + REPX {psrad x, 2}, m0, m1, m2 + mova [r6-32*4], m0 + mova [r6-32*3], m1 + mova [r6-32*2], m2 + mova m2, [r6-32*1] + mova m1, [r6+32*0] + mova m0, [r6+32*1] + REPX {psrad x, 2}, m9, m10, m15 + psubd m8, m3, m2 ; out12 + paddd m3, m2 ; out3 + psubd m2, m4, m1 ; out11 + paddd m4, m1 ; out4 + psubd m1, m5, m0 ; out10 + paddd m5, m0 ; out5 + REPX {psrad x, 2}, m3, m4, m5 + mova [r6-32*1], m3 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova m4, [r6+32*2] + mova m3, [r6+32*3] + REPX {psrad x, 2}, m1, m2, m8 + psubd m5, m6, m4 ; out9 + paddd m6, m4 ; out6 + psubd m4, m7, m3 ; out8 + paddd m7, m3 ; out7 + REPX {psrad x, 2}, m6, m7, m4, m5 + mova [r6+32*2], m6 + mova [r6+32*3], m7 + add r6, 32*8 + mova [r6-32*4], m4 + mova [r6-32*3], m5 + mova [r6-32*2], m1 + mova [r6-32*1], m2 + mova [r6+32*0], m8 + mova [r6+32*1], m9 + mova [r6+32*2], m10 + mova [r6+32*3], m15 +.fast: + add r6, 32*8 + call .main + mova m14, [r6-32*4] + mova m13, [r6-32*3] + mova m12, [r6-32*2] + mova m11, [r6-32*1] + mova m10, [r6+32*0] + mova m9, [r6+32*1] + mova m8, [r6+32*2] + psubd m15, m0, m14 ; out15 + paddd m0, m14 ; out0 + psubd m14, m1, m13 ; out14 + paddd m1, m13 ; out1 + psubd m13, m2, m12 ; out13 + paddd m2, m12 ; out2 + psubd m12, m3, m11 ; out12 + paddd m3, m11 ; out3 + psubd m11, m4, m10 ; out11 + paddd m4, m10 ; out4 + psubd m10, m5, m9 ; out10 + paddd m5, m9 ; out5 + psubd m9, m6, m8 ; out9 + paddd m6, m8 ; out6 + psubd m8, m7, [r6+32*3] ; out8 + paddd m7, 
[r6+32*3] ; out7 + sub r6, 32*8 + REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: + call .transpose + lea r6, [vvc_pw_5+128] + mova [rsp], m15 + call m(idct2_16x16_internal_8).main + mova m1, [rsp+32*1] +.end: + call .write_16x16 + RET +ALIGN function_align +.write_16x16: + mova [rsp+gprsize+32*0], m8 + mova [rsp+gprsize+32*1], m9 + mova [rsp+gprsize+32*2], m12 + vpbroadcastd m12, [vvc_pw_2048] + pmulhrsw m0, m12 + pmulhrsw m1, m12 + pmulhrsw m2, m12 + pmulhrsw m3, m12 + call m(idct2_16x8_internal_10).write_16x4_start +.write_16x16_2: + pmulhrsw m0, m12, m4 + pmulhrsw m1, m12, m5 + pmulhrsw m2, m12, m6 + pmulhrsw m3, m12, m7 + call m(idct2_16x8_internal_10).write_16x4_zero + pmulhrsw m0, m12, [rsp+gprsize+32*0] + pmulhrsw m1, m12, [rsp+gprsize+32*1] + pmulhrsw m2, m12, m10 + pmulhrsw m3, m12, m11 + call m(idct2_16x8_internal_10).write_16x4_zero + pmulhrsw m0, m12, [rsp+gprsize+32*2] + pmulhrsw m1, m12, m13 + pmulhrsw m2, m12, m14 + pmulhrsw m3, m12, m15 + jmp m(idct2_16x8_internal_10).write_16x4_zero +ALIGN function_align +.transpose: + test eobd, eobd + jl .transpose_fast + packssdw m8, [r6-32*4] + packssdw m9, [r6-32*3] + packssdw m10, [r6-32*2] + packssdw m11, [r6-32*1] + packssdw m12, [r6+32*0] + packssdw m13, [r6+32*1] + packssdw m14, [r6+32*2] + packssdw m15, [r6+32*3] + sub r6, 32*8 + packssdw m0, [r6-32*4] + packssdw m1, [r6-32*3] + packssdw m2, [r6-32*2] + packssdw m3, [r6-32*1] + packssdw m4, [r6+32*0] + packssdw m5, [r6+32*1] + packssdw m6, [r6+32*2] + packssdw m7, [r6+32*3] + mova [r6], m8 + punpckhwd m8, m0, m1 + punpcklwd m0, m1 + punpcklwd m1, m2, m3 + punpckhwd m2, m3 + punpckhwd m3, m6, m7 + punpcklwd m6, m7 + punpcklwd m7, m4, m5 + punpckhwd m4, m5 + punpckldq m5, m8, m2 + punpckhdq m8, m2 + punpckhdq m2, m0, m1 + punpckldq m0, m1 + punpckhdq m1, m7, m6 + punpckldq m7, m6 + punpckhdq m6, m4, m3 + punpckldq m4, m3 + punpckhqdq m3, m2, m1 + punpcklqdq m2, m1 + punpckhqdq m1, m0, m7 + 
punpcklqdq m0, m7 + punpcklqdq m7, m8, m6 + punpckhqdq m8, m6 + punpckhqdq m6, m5, m4 + punpcklqdq m5, m4 + mova m4, [r6] + mova [r6], m8 + punpcklwd m8, m4, m9 + punpckhwd m4, m9 + punpcklwd m9, m10, m11 + punpckhwd m10, m11 + punpckhwd m11, m14, m15 + punpcklwd m14, m15 + punpckhwd m15, m12, m13 + punpcklwd m12, m13 + punpckldq m13, m4, m10 + punpckhdq m4, m10 + punpckhdq m10, m8, m9 + punpckldq m8, m9 + punpckhdq m9, m12, m14 + punpckldq m12, m14 + punpckhdq m14, m15, m11 + punpckldq m15, m11 + punpckhqdq m11, m10, m9 + punpcklqdq m10, m9 + punpckhqdq m9, m8, m12 + punpcklqdq m8, m12 + punpcklqdq m12, m13, m15 + punpckhqdq m13, m15 + punpckhqdq m15, m4, m14 + punpcklqdq m14, m4, m14 + vperm2i128 m4, m0, m8, 0x31 + vinserti128 m0, xm8, 1 + vinserti128 m8, m5, xm12, 1 + vperm2i128 m12, m5, 0x13 + vperm2i128 m5, m1, m9, 0x31 + vinserti128 m1, xm9, 1 + vinserti128 m9, m6, xm13, 1 + vperm2i128 m13, m6, 0x13 + vperm2i128 m6, m2, m10, 0x31 + vinserti128 m2, xm10, 1 + vinserti128 m10, m7, xm14, 1 + vperm2i128 m14, m7, 0x13 + vperm2i128 m7, m3, m11, 0x31 + vinserti128 m3, xm11, 1 + mova xm11, [r6] + vinserti128 m11, xm15, 1 + vinserti128 m15, [r6+16], 0 + ret +.transpose_fast: + call m(idct2_16x8_internal_10).transpose2 + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 + ret +ALIGN function_align +.main: + mova m0, [cq+64* 1] + mova m1, [cq+64* 3] + mova m2, [cq+64* 5] + mova m3, [cq+64* 7] + mova m4, [cq+64* 9] + mova m5, [cq+64*11] + mova m6, [cq+64*13] + mova m7, [cq+64*15] + call m(idct2_8x16_internal_10).main_oddhalf + mova m0, [cq+64* 0] + mova m1, [cq+64* 2] + mova m2, [cq+64* 4] + mova m3, [cq+64* 6] + mova m4, [cq+64* 8] + mova m5, [cq+64*10] + mova m6, [cq+64*12] + mova m7, [cq+64*14] + call m(idct2_8x8_internal_10).main + call m(idct2_8x16_internal_10).main_evenhalf + psrld m10, m11, 10 ; vvc_pd_2 + REPX {paddd x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 + ret + +INV_TXFM_16X16_FN adst, dct2 +INV_TXFM_16X16_FN adst, adst +INV_TXFM_16X16_FN adst, 
flipadst + +cglobal iadst_16x16_internal_10, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_18b_min] + vpbroadcastd m14, [clip_18b_max] +.pass1: + vpbroadcastd m15, [vvc_pd_64] + lea r6, [rsp+32*4] + sub eobd, 36 + jl .fast + add cq, 32 + call .main + sub cq, 32 + vpbroadcastd m8, [vvc_pd_5120] + paddd m4, m8 + paddd m6, m8 + paddd m9, m8 + paddd m11, m8 + vpbroadcastd m8, [vvc_pd_5119] + psubd m5, m8, m5 + psubd m7, m8, m7 + psubd m10, m8, m10 + psubd m12, m8, m12 + REPX {psrad x, 13}, m4, m5, m6, m7, m9, m10, m11, m12 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova [r6+32*2], m6 + mova [r6+32*3], m7 + psrld m4, m15, 10 ; vvc_pd_2 + paddd m0, m4 + psubd m1, m4, m1 + paddd m2, m4 + psubd m3, m4, m3 + psubd m7, m4, [r6-32*4] + paddd m6, m4, [r6-32*3] + psubd m5, m4, [r6-32*2] + paddd m4, [r6-32*1] + REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7 + mova [r6-32*4], m0 + mova [r6-32*3], m1 + mova [r6-32*2], m2 + mova [r6-32*1], m3 + add r6, 32*8 + mova [r6-32*4], m9 + mova [r6-32*3], m10 + mova [r6-32*2], m11 + mova [r6-32*1], m12 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova [r6+32*2], m6 + mova [r6+32*3], m7 +.fast: + add r6, 32*8 + call .main + vpbroadcastd m14, [vvc_pd_5120] + vpbroadcastd m13, [vvc_pd_5119] + psrld m15, 10 ; vvc_pd_2 + paddd m0, m15 + psubd m1, m15, m1 + paddd m2, m15 + psubd m3, m15, m3 + paddd m4, m14 + psubd m5, m13, m5 + paddd m6, m14 + psubd m7, m13, m7 + paddd m8, m14, m9 + psubd m9, m13, m10 + paddd m10, m14, m11 + psubd m11, m13, m12 + paddd m12, m15, [r6-32*1] + psubd m13, m15, [r6-32*2] + paddd m14, m15, [r6-32*3] + psubd m15, [r6-32*4] +.pass1_end: + REPX {psrad x, 2 }, m0, m1, m2, m3, m12, m13, m14, m15 + REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11 + sub r6, 32*8 + jmp tx2q +.pass2: + call m(idct2_16x16_internal_10).transpose + lea r6, [vvc_pw_5+128] + mova [rsp], m15 + call m(iadst_16x16_internal_8).main + call m(iadst_16x16_internal_8).main_pass2_end + mova [rsp+32*0], m8 + mova [rsp+32*2], m12 + 
mova [rsp+32*3], m13 + vpbroadcastd m12, [vvc_pw_2048] + pxor m13, m13 + psubw m13, m12 + pmulhrsw m0, m12 + pmulhrsw m1, m13, [rsp+32*1] + mova [rsp+32*1], m9 + pmulhrsw m2, m12 + pmulhrsw m3, m13 + call m(idct2_16x8_internal_10).write_16x4_start + pmulhrsw m0, m12, m4 + pmulhrsw m1, m13, m5 + pmulhrsw m2, m12, m6 + pmulhrsw m3, m13, m7 + call m(idct2_16x8_internal_10).write_16x4_zero + pmulhrsw m0, m12, [rsp+32*0] + pmulhrsw m1, m13, [rsp+32*1] + pmulhrsw m2, m12, m10 + pmulhrsw m3, m13, m11 + call m(idct2_16x8_internal_10).write_16x4_zero + pmulhrsw m0, m12, [rsp+32*2] + pmulhrsw m1, m13, [rsp+32*3] + pmulhrsw m2, m12, m14 + pmulhrsw m3, m13, m15 + call m(idct2_16x8_internal_10).write_16x4_zero + RET +ALIGN function_align +.main: + mova m0, [cq+64* 2] + mova m1, [cq+64*13] + mova m2, [cq+64* 6] + mova m3, [cq+64* 9] + mova m4, [cq+64*10] + mova m5, [cq+64* 5] + mova m6, [cq+64*14] + mova m7, [cq+64* 1] + vpbroadcastd m12, [vvc_pd_2048] + call m(iadst_16x8_internal_10).main_part1 + mova m0, [cq+64* 0] + mova m1, [cq+64*15] + mova m2, [cq+64* 4] + mova m3, [cq+64*11] + mova m4, [cq+64* 8] + mova m5, [cq+64* 7] + mova m6, [cq+64*12] + mova m7, [cq+64* 3] + jmp m(iadst_16x8_internal_10).main_part2 + +INV_TXFM_16X16_FN flipadst, dct2 +INV_TXFM_16X16_FN flipadst, adst +INV_TXFM_16X16_FN flipadst, flipadst + +cglobal iflipadst_16x16_internal_10, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_18b_min] + vpbroadcastd m14, [clip_18b_max] +.pass1: + vpbroadcastd m15, [vvc_pd_64] + lea r6, [rsp+32*4] + sub eobd, 36 + jl .fast + add cq, 32 + call m(iadst_16x16_internal_10).main + sub cq, 32 + vpbroadcastd m8, [vvc_pd_5120] + paddd m11, m8 + paddd m9, m8 + paddd m6, m8 + paddd m4, m8 + vpbroadcastd m8, [vvc_pd_5119] + psubd m12, m8, m12 + psubd m10, m8, m10 + psubd m7, m8, m7 + psubd m5, m8, m5 + REPX {psrad x, 13}, m12, m11, m10, m9, m7, m6, m5, m4 + mova [r6+32*0], m12 + mova [r6+32*1], m11 + mova [r6+32*2], m10 + mova [r6+32*3], m9 + psrld m9, m15, 10 
; vvc_pd_2 + psubd m3, m9, m3 + paddd m2, m9 + psubd m1, m9, m1 + paddd m0, m9 + psubd m12, m9, [r6-32*4] + paddd m11, m9, [r6-32*3] + psubd m10, m9, [r6-32*2] + paddd m9, [r6-32*1] + REPX {psrad x, 2 }, m12, m11, m10, m9, m3, m2, m1, m0 + mova [r6-32*4], m12 + mova [r6-32*3], m11 + mova [r6-32*2], m10 + mova [r6-32*1], m9 + add r6, 32*8 + mova [r6-32*4], m7 + mova [r6-32*3], m6 + mova [r6-32*2], m5 + mova [r6-32*1], m4 + mova [r6+32*0], m3 + mova [r6+32*1], m2 + mova [r6+32*2], m1 + mova [r6+32*3], m0 +.fast: + add r6, 32*8 + call m(iadst_16x16_internal_10).main + vpbroadcastd m14, [vvc_pd_5120] + vpbroadcastd m13, [vvc_pd_5119] + psrld m15, 10 ; vvc_pd_2 + psubd m8, m13, m7 + paddd m7, m14, m9 + paddd m9, m14, m6 + psubd m6, m13, m10 + psubd m10, m13, m5 + paddd m5, m14, m11 + paddd m11, m14, m4 + psubd m4, m13, m12 + psubd m12, m15, m3 + paddd m3, m15, [r6-32*1] + paddd m13, m15, m2 + psubd m2, m15, [r6-32*2] + psubd m14, m15, m1 + mova m1, m15 + paddd m15, m0 + psubd m0, m1, [r6-32*4] + paddd m1, [r6-32*3] + jmp m(iadst_16x16_internal_10).pass1_end +.pass2: + call m(idct2_16x16_internal_10).transpose + lea r6, [vvc_pw_5+128] + mova [rsp], m15 + call m(iadst_16x16_internal_8).main + call m(iadst_16x16_internal_8).main_pass2_end + mova [rsp+32*3], m3 + mova [rsp+32*2], m2 + mova [rsp+32*0], m0 + mova m2, m13 + mova m3, m12 + vpbroadcastd m12, [vvc_pw_2048] + pxor m13, m13 + psubw m13, m12 + pmulhrsw m0, m13, m15 + pmulhrsw m1, m12, m14 + pmulhrsw m2, m13 + pmulhrsw m3, m12 + mova m14, m8 + mova m15, m9 + call m(idct2_16x8_internal_10).write_16x4_start + pmulhrsw m0, m13, m11 + pmulhrsw m1, m12, m10 + pmulhrsw m2, m13, m15 + pmulhrsw m3, m12, m14 + call m(idct2_16x8_internal_10).write_16x4_zero + pmulhrsw m0, m13, m7 + pmulhrsw m1, m12, m6 + pmulhrsw m2, m13, m5 + pmulhrsw m3, m12, m4 + call m(idct2_16x8_internal_10).write_16x4_zero + pmulhrsw m0, m13, [rsp+32*3] + pmulhrsw m1, m12, [rsp+32*2] + pmulhrsw m2, m13, [rsp+32*1] + pmulhrsw m3, m12, [rsp+32*0] + call 
m(idct2_16x8_internal_10).write_16x4_zero + RET + +INV_TXFM_16X16_FN identity, dct2, -92 +INV_TXFM_16X16_FN identity, identity + +cglobal iidentity_16x16_internal_10, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m15, [vvc_pd_5793] + vpbroadcastd m7, [vvc_pd_5120] + lea r6, [rsp+32*4] + sub eobd, 36 + jl .fast + mov r3, -32*8*4 +.righthalf: + pmulld m0, m15, [cq+r3+32*33] + pmulld m1, m15, [cq+r3+32*35] + pmulld m2, m15, [cq+r3+32*37] + pmulld m3, m15, [cq+r3+32*39] + add r6, 32*4 + REPX {paddd x, m7}, m0, m1, m2, m3 + REPX {psrad x, 13}, m0, m1, m2, m3 + mova [r6+32*0], m0 + mova [r6+32*1], m1 + mova [r6+32*2], m2 + mova [r6+32*3], m3 + add r3, 32*8 + jl .righthalf +.fast: + pmulld m0, m15, [cq+64* 0] + pmulld m1, m15, [cq+64* 1] + pmulld m2, m15, [cq+64* 2] + pmulld m3, m15, [cq+64* 3] + pmulld m4, m15, [cq+64* 4] + pmulld m5, m15, [cq+64* 5] + pmulld m6, m15, [cq+64* 6] + pmulld m8, m15, [cq+64* 7] + mova [cq], m8 + pmulld m8, m15, [cq+64* 8] + pmulld m9, m15, [cq+64* 9] + pmulld m10, m15, [cq+64*10] + pmulld m11, m15, [cq+64*11] + pmulld m12, m15, [cq+64*12] + pmulld m13, m15, [cq+64*13] + pmulld m14, m15, [cq+64*14] + pmulld m15, [cq+64*15] + REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6, \ + m8, m9, m10, m11, m12, m13, m14, m15 + paddd m7, [cq] + REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: + call m(idct2_16x16_internal_10).transpose + + mova [cq+32*0], m15 + mova [cq+32*1], m0 + vpbroadcastd m15, [vvc_pw_1697x16] + + REPX {IDTX16 x, 0, 15}, 1, 2, 3, 4, 5, 6, 7, \ + 8, 9, 10, 11, 12, 13, 14 + mova m0, [cq+32*1] + mova [cq+32*1], m1 + IDTX16 0, 1, 15 + mova m1, [cq+32*0] + pmulhrsw m15, m1 + paddsw m1, m1 + paddsw m15, m1 + mova m1, [cq+32*1] + jmp m(idct2_16x16_internal_10).end + +INV_TXFM_16X16_FN dct2, dct2, 0, 12 +INV_TXFM_16X16_FN dct2, identity, 28, 12 +INV_TXFM_16X16_FN dct2, adst, 0, 12 +INV_TXFM_16X16_FN dct2, flipadst, 0, 12 + +cglobal idct2_16x16_internal_12, 0, 7, 16, 
32*24, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + jmp m(idct2_16x16_internal_10).pass1 +.pass2: + mova [cq+32* 8], m8 + mova [cq+32* 9], m9 + mova [cq+32*10], m10 + mova [cq+32*11], m11 + mova [cq+32*12], m12 + mova [cq+32*13], m13 + mova [cq+32*14], m14 + mova [cq+32*15], m15 + call .pass2_main + packssdw m0, m1 + packssdw m1, m2, m3 + packssdw m2, m4, m5 + packssdw m3, m6, m7 + packssdw m4, m8, m9 + packssdw m5, m10, m11 + packssdw m6, m12, m13 + packssdw m7, m14, m15 + mova [r6-32*4], m0 + mova [r6-32*3], m1 + mova [r6-32*2], m2 + mova [r6-32*1], m3 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova [r6+32*2], m6 + mova [r6+32*3], m7 + mova m0, [cq+32* 8] + mova m1, [cq+32* 9] + mova m2, [cq+32*10] + mova m3, [cq+32*11] + mova m4, [cq+32*12] + mova m5, [cq+32*13] + mova m6, [cq+32*14] + mova m7, [cq+32*15] + mov r5, r6 + add r6, 32*16 + call .pass2_main + jmp m(iadst_16x16_internal_12).end +ALIGN function_align +.write_16x16: + mova [rsp+gprsize+32*0], m8 + mova [rsp+gprsize+32*1], m9 + mova [rsp+gprsize+32*2], m12 + vpbroadcastd m12, [vvc_pw_16384] + pmulhrsw m0, m12 + pmulhrsw m1, m12 + pmulhrsw m2, m12 + pmulhrsw m3, m12 + call m(idct2_16x8_internal_12).write_16x4_start + call m(idct2_16x8_internal_10).write_16x4_zero + jmp m(idct2_16x16_internal_10).write_16x16_2 +ALIGN function_align +.pass2_main: + call m(idct2_8x8_internal_12).transpose_8x8 + mova [cq+32* 0], m0 + mova [cq+32* 1], m2 + mova [cq+32* 2], m4 + mova [cq+32* 3], m6 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + pmaxsd m0, m12, m1 + pmaxsd m1, m12, m3 + pmaxsd m2, m12, m5 + pmaxsd m3, m12, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3 + test eobd, eobd + jge .pass2_slow + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + jmp .pass2_fast +.pass2_slow: + sub r6, 32*8 + mova m8, [r6-32*4] + mova m4, [r6-32*3] + mova m10, [r6-32*2] + mova m5, [r6-32*1] + mova m12, [r6+32*0] + mova m6, [r6+32*1] + mova m14, [r6+32*2] + mova m7, 
[r6+32*3] + TRANSPOSE_8X8_DWORD 8, 4, 10, 5, 12, 6, 14, 7, 9, 11, 13, 15 + mova [cq+32* 4], m8 + mova [cq+32* 5], m10 + mova [cq+32* 6], m12 + mova [cq+32* 7], m14 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + REPX {pmaxsd x, m12}, m4, m5, m6, m7 + REPX {pminsd x, m13}, m4, m5, m6, m7 +.pass2_fast: + vpbroadcastd m11, [vvc_pd_2048] + vpbroadcastd m14, [vvc_pd_64] + call m(idct2_8x16_internal_10).main_oddhalf + pmaxsd m0, m12, [cq+32* 0] + pmaxsd m1, m12, [cq+32* 1] + pmaxsd m2, m12, [cq+32* 2] + pmaxsd m3, m12, [cq+32* 3] + REPX {pminsd x, m13}, m0, m1, m2, m3 + test eobd, eobd + jge .pass2_slow2 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + jmp .pass2_fast2 +.pass2_slow2: + pmaxsd m4, m12, [cq+32* 4] + pmaxsd m5, m12, [cq+32* 5] + pmaxsd m6, m12, [cq+32* 6] + pmaxsd m7, m12, [cq+32* 7] + REPX {pminsd x, m13}, m4, m5, m6, m7 +.pass2_fast2: + call m(idct2_8x8_internal_10).main + call m(idct2_8x16_internal_10).main_evenhalf + psrad m11, 8 ; vvc_pd_8 + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct2_16x8_internal_10).pass1_rotations + REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + ret + +INV_TXFM_16X16_FN adst, dct2, 0, 12 +INV_TXFM_16X16_FN adst, adst, 0, 12 +INV_TXFM_16X16_FN adst, flipadst, 0, 12 + +cglobal iadst_16x16_internal_12, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_20b_min] + vpbroadcastd m14, [clip_20b_max] + jmp m(iadst_16x16_internal_10).pass1 +.pass2: + call .pass2_part1 + call m(iadst_16x8_internal_10).pass1_rotations + call .pass2_part2 + call m(iadst_16x8_internal_10).pass1_rotations +.pass2_part3: + REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15 + REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11 +.end: + packssdw m15, m14 + packssdw m14, m13, m12 + packssdw m13, m11, m10 + packssdw m12, m9, m8 + packssdw m11, m7, m6 + packssdw m10, m5, m4 + packssdw m7, m3, m2 + packssdw m6, m1, m0 + vpblendd m0, m6, [r5-32*4], 0x33 + 
vpblendd m1, m6, [r5-32*4], 0xcc + vpblendd m2, m7, [r5-32*3], 0x33 + vpblendd m3, m7, [r5-32*3], 0xcc + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + vpermq m2, m2, q3120 + vpermq m3, m3, q2031 + call m(idct2_16x8_internal_12).write_16x4_start + call m(idct2_16x8_internal_10).write_16x4_zero + vpblendd m0, m10, [r5-32*2], 0x33 + vpblendd m1, m10, [r5-32*2], 0xcc + vpblendd m2, m11, [r5-32*1], 0x33 + vpblendd m3, m11, [r5-32*1], 0xcc + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + vpermq m2, m2, q3120 + vpermq m3, m3, q2031 + call m(idct2_16x8_internal_10).write_16x4_zero + vpblendd m0, m12, [r5+32*0], 0x33 + vpblendd m1, m12, [r5+32*0], 0xcc + vpblendd m2, m13, [r5+32*1], 0x33 + vpblendd m3, m13, [r5+32*1], 0xcc + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + vpermq m2, m2, q3120 + vpermq m3, m3, q2031 + call m(idct2_16x8_internal_10).write_16x4_zero + vpblendd m0, m14, [r5+32*2], 0x33 + vpblendd m1, m14, [r5+32*2], 0xcc + vpblendd m2, m15, [r5+32*3], 0x33 + vpblendd m3, m15, [r5+32*3], 0xcc + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + vpermq m2, m2, q3120 + vpermq m3, m3, q2031 + call m(idct2_16x8_internal_10).write_16x4_zero + RET +ALIGN function_align +.pass2_part1: + mova [cq+32* 8], m8 + mova [cq+32* 9], m9 + mova [cq+32*10], m10 + mova [cq+32*11], m11 + mova [cq+32*12], m12 + mova [cq+32*13], m13 + mova [cq+32*14], m14 + mova [cq+32*15], m15 +.pass2_main: + call m(idct2_8x8_internal_12).transpose_8x8 + mova [cq+32* 0], m0 + mova [cq+32* 1], m3 + mova [cq+32* 2], m4 + mova [cq+32* 3], m7 + vpbroadcastd m13, [clip_18b_min] + vpbroadcastd m14, [clip_18b_max] + pmaxsd m0, m13, m2 + pmaxsd m2, m13, m6 + pmaxsd m5, m13, m5 + pmaxsd m7, m13, m1 + REPX {pminsd x, m14}, m0, m2, m5, m7 + test eobd, eobd + jge .pass2_slow + pxor m1, m1 + REPX {mova x, m1}, m3, m4, m6 + jmp .pass2_fast +.pass2_slow: + sub r6, 32*8 + mova m8, [r6-32*4] + mova m3, [r6-32*3] + mova m4, [r6-32*2] + mova m11, [r6-32*1] + mova m12, [r6+32*0] + mova m1, [r6+32*1] + mova m6, [r6+32*2] + mova 
m15, [r6+32*3] + TRANSPOSE_8X8_DWORD 8, 3, 4, 11, 12, 1, 6, 15, 13, 9, 10, 14 + mova [cq+32* 4], m8 + mova [cq+32* 5], m11 + mova [cq+32* 6], m12 + mova [cq+32* 7], m15 + vpbroadcastd m13, [clip_18b_min] + vpbroadcastd m14, [clip_18b_max] + REPX {pmaxsd x, m13}, m1, m3, m4, m6 + REPX {pminsd x, m14}, m1, m3, m4, m6 +.pass2_fast: + vpbroadcastd m12, [vvc_pd_2048] + vpbroadcastd m15, [vvc_pd_64] + call m(iadst_16x8_internal_10).main_part1 + pmaxsd m0, m13, [cq+32* 0] ; 0 + pmaxsd m7, m13, [cq+32* 1] ; 3 + pmaxsd m2, m13, [cq+32* 2] ; 4 + pmaxsd m5, m13, [cq+32* 3] ; 7 + REPX {pminsd x, m14}, m0, m2, m5, m7 + test eobd, eobd + jge .pass2_slow2 + pxor m1, m1 + REPX {mova x, m1}, m3, m4, m6 + jmp .pass2_fast2 +.pass2_slow2: + pmaxsd m4, m13, [cq+32* 4] ; 8 + pmaxsd m3, m13, [cq+32* 5] ; 11 + pmaxsd m6, m13, [cq+32* 6] ; 12 + pmaxsd m1, m13, [cq+32* 7] ; 15 + REPX {pminsd x, m14}, m1, m3, m4, m6 +.pass2_fast2: + call m(iadst_16x8_internal_10).main_part2 + vpbroadcastd m14, [vvc_pd_17408] + psrld m15, 11 ; vvc_pd_1 + psubd m13, m14, m15 ; vvc_pd_17407 + pslld m15, 3 ; vvc_pd_8 + ret +ALIGN function_align +.pass2_part2: + REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15 + REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11 + packssdw m0, m1 + packssdw m1, m2, m3 + packssdw m2, m4, m5 + packssdw m3, m6, m7 + packssdw m4, m8, m9 + packssdw m5, m10, m11 + packssdw m6, m12, m13 + packssdw m7, m14, m15 + mova [r6-32*4], m0 + mova [r6-32*3], m1 + mova [r6-32*2], m2 + mova [r6-32*1], m3 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova [r6+32*2], m6 + mova [r6+32*3], m7 + mova m0, [cq+32* 8] + mova m1, [cq+32* 9] + mova m2, [cq+32*10] + mova m3, [cq+32*11] + mova m4, [cq+32*12] + mova m5, [cq+32*13] + mova m6, [cq+32*14] + mova m7, [cq+32*15] + mov r5, r6 + add r6, 32*16 + jmp .pass2_main + +INV_TXFM_16X16_FN flipadst, dct2, 0, 12 +INV_TXFM_16X16_FN flipadst, adst, 0, 12 +INV_TXFM_16X16_FN flipadst, flipadst, 0, 12 + +cglobal iflipadst_16x16_internal_12, 0, 7, 16, 32*24, 
dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_20b_min] + vpbroadcastd m14, [clip_20b_max] + jmp m(iflipadst_16x16_internal_10).pass1 +.pass2: + call m(iadst_16x16_internal_12).pass2_part1 + call m(iflipadst_16x8_internal_10).pass1_rotations + call m(iadst_16x16_internal_12).pass2_part2 + call m(iflipadst_16x8_internal_10).pass1_rotations + jmp m(iadst_16x16_internal_12).pass2_part3 + +INV_TXFM_16X16_FN identity, dct2, -92, 12 +INV_TXFM_16X16_FN identity, identity, 0, 12 + +%macro IDTX16_12 1 ; src + pmulld m6, m7, m%1 + paddd m6, m15 + psrad m6, 12 + paddd m6, m%1 + psrad m%1, m6, 1 +%endmacro + +cglobal iidentity_16x16_internal_12, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m7, [vvc_pd_1697] + vpbroadcastd m15, [vvc_pd_5120] + lea r6, [rsp+32*4] + sub eobd, 36 + jl .fast + mov r3, -32*8*4 +.righthalf: + mova m10, [cq+r3+32*33] + mova m11, [cq+r3+32*35] + mova m12, [cq+r3+32*37] + mova m13, [cq+r3+32*39] + add r6, 32*4 + pmulld m0, m7, m10 + pmulld m1, m7, m11 + pmulld m2, m7, m12 + pmulld m3, m7, m13 + REPX {paddd x, m15}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 + paddd m0, m10 + paddd m1, m11 + paddd m2, m12 + paddd m3, m13 + REPX {psrad x, 1 }, m0, m1, m2, m3 + mova [r6+32*0], m0 + mova [r6+32*1], m1 + mova [r6+32*2], m2 + mova [r6+32*3], m3 + add r3, 32*8 + jl .righthalf +.fast: + mova m0, [cq+64* 0] + mova m1, [cq+64* 1] + mova m2, [cq+64* 2] + mova m3, [cq+64* 3] + mova m4, [cq+64* 4] + mova m5, [cq+64* 5] + mova m8, [cq+64* 6] + mova m9, [cq+64* 7] + REPX {IDTX16_12 x}, 0, 1, 2, 3, 4, 5, 8, 9 + mova [cq+64*0], m8 + mova [cq+64*1], m9 + mova m8, [cq+64* 8] + mova m9, [cq+64* 9] + mova m10, [cq+64*10] + mova m11, [cq+64*11] + mova m12, [cq+64*12] + mova m13, [cq+64*13] + mova m14, [cq+64*14] + REPX {IDTX16_12 x}, 8, 9, 10, 11, 12, 13, 14 + mova m6, [cq+64*15] + pmulld m7, m6 + paddd m7, m15 + psrad m7, 12 + paddd m7, m6 + mova m6, [cq+64*0] + psrad m15, m7, 1 + mova m7, [cq+64*1] + jmp tx2q +.pass2: + call 
m(iidentity_8x16_internal_12).pass2_main + call m(idct2_16x16_internal_10).transpose_fast + test eobd, eobd + jl .pass2_fast + mova [cq+32* 8], m0 + mova [cq+32* 9], m1 + mova [cq+32*10], m2 + mova [cq+32*11], m3 + mova [cq+32*12], m4 + mova [cq+32*13], m5 + mova [cq+32*14], m6 + mova [cq+32*15], m7 + mova m8, [r6-32*4] + mova m9, [r6-32*3] + mova m10, [r6-32*2] + mova m11, [r6-32*1] + mova m12, [r6+32*0] + mova m13, [r6+32*1] + mova m14, [r6+32*2] + mova m15, [r6+32*3] + sub r6, 32*8 + mova m0, [r6-32*4] + mova m1, [r6-32*3] + mova m2, [r6-32*2] + mova m3, [r6-32*1] + mova m4, [r6+32*0] + mova m5, [r6+32*1] + mova m6, [r6+32*2] + mova m7, [r6+32*3] + call m(iidentity_8x16_internal_12).pass2_main + call m(idct2_16x8_internal_10).transpose2 + mova m8, m0 + mova m9, m1 + mova m10, m2 + mova m11, m3 + mova m12, m4 + mova m13, m5 + mova m14, m6 + mova m15, m7 + mova m0, [cq+32* 8] + mova m1, [cq+32* 9] + mova m2, [cq+32*10] + mova m3, [cq+32*11] + mova m4, [cq+32*12] + mova m5, [cq+32*13] + mova m6, [cq+32*14] + mova m7, [cq+32*15] +.pass2_fast: + call m(idct2_16x16_internal_12).write_16x16 + RET + +%macro IDCT2_32_END 6-7 1 ; in/out1, out2, tmp[1-3], shift, pack + mova m%4, [r6+32*(%1-4)] + mova m%2, [r5+32*(3-%1)] + mova m%5, [r4+32*(%1-4)] + psubd m%3, m%1, m%4 ; idct2_16 out15 - n + paddd m%1, m%4 ; idct2_16 out0 + n + pmaxsd m%1, m12 + pmaxsd m%3, m12 + pminsd m%1, m13 + pminsd m%3, m13 + paddd m%1, m11 + paddd m%3, m11 + psubd m%4, m%1, m%2 ; out31 - n + paddd m%1, m%2 ; out0 + n + paddd m%2, m%3, m%5 ; out15 - n + psubd m%3, m%5 ; out16 + n + REPX {psrad x, %6}, m%1, m%3, m%2, m%4 +%if %7 & 1 + packssdw m%1, m%3 ; out0 + n, out16 + n + packssdw m%2, m%4 ; out15 - n, out31 - n +%endif +%endmacro + +cglobal vvc_inv_dct2_dct2_8x32_10, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 7, 16, 32*12, dst, stride, c, eob +%undef cmp + vpbroadcastd m11, [vvc_pd_2048] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + 
vbroadcasti128 m14, [idct2_32_shuf] + mov r4, cq + call .pass1_main + mova [rsp+32*0], m2 + mova [rsp+32*1], m3 + cmp eobd, 43 + jge .eob43 + pxor m4, m4 + REPX {mova x, m4}, [rsp+32*2], m2, m3, m11 + jmp .pass1_end_fast +.eob43: + lea r6, [rsp+32*8] + mova [r6-32*4], m0 + mova [r6-32*3], m1 + call .pass1_main + mova [rsp+32*2], m2 + cmp eobd, 107 + jge .eob107 + mova m11, m3 + mova m2, m0 + mova m3, m1 + mova m0, [r6-32*4] + mova m1, [r6-32*3] + pxor m4, m4 +.pass1_end_fast: + vpbroadcastd m10, [vvc_pw_2048] + lea r6, [deint_shuf+128] + REPX {mova x, m4}, m5, m6, m7 + call m(vvc_inv_dct2_dct2_8x32_8).main_fast + jmp .end +.eob107: + mova [rsp+32*3], m3 + mova [r6-32*2], m0 + mova [r6-32*1], m1 + call .pass1_main + cmp eobd, 171 + jge .eob171 + pshufd m12, m2, q1032 + pshufd m13, m3, q1032 + mova m4, m0 + mova m5, m1 + pxor m6, m6 + REPX {mova x, m6}, m7, m14, m15 + jmp .pass1_end +.eob171: + mova [r6+32*0], m0 + mova [r6+32*1], m1 + mova [r6+32*2], m2 + mova [r6+32*3], m3 + call .pass1_main + pshufd m12, [r6+32*2], q1032 ; out19 out17 + pshufd m13, [r6+32*3], q1032 ; out23 out21 + mova m4, [r6+32*0] ; out16 out18 + mova m5, [r6+32*1] ; out20 out22 + pshufd m14, m2, q1032 ; out27 out25 + pshufd m15, m3, q1032 ; out31 out29 + mova m6, m0 ; out24 out26 + mova m7, m1 ; out28 out30 +.pass1_end: + mova m0, [r6-32*4] ; out0 out2 + mova m1, [r6-32*3] ; out4 out6 + mova m2, [r6-32*2] ; out8 out10 + mova m3, [r6-32*1] ; out12 out14 + lea r6, [deint_shuf+128] + mova m11, [rsp+32*3] ; out13 out15 + vpbroadcastd m10, [vvc_pw_2048] + call m(vvc_inv_dct2_dct2_8x32_8).main +.end: ; [rsp+0*32] = m12 + vpbroadcastd m12, [vvc_pw_2048] + mov cq, r4 + mova [rsp+32*1], m8 + mova [rsp+32*2], m9 + mova [rsp+32*3], m10 + mova [rsp+32*4], m11 + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct2_8x8_internal_10).write_8x4_start + vpermq m0, m2, q3120 + vpermq m1, m3, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call 
m(idct2_8x8_internal_10).write_8x4 + vpermq m0, m4, q3120 + vpermq m1, m5, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct2_8x8_internal_10).write_8x4 + vpermq m0, m6, q3120 + vpermq m1, m7, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct2_8x8_internal_10).write_8x4 + vpermq m0, [rsp+32*1], q3120 + vpermq m1, [rsp+32*2], q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct2_8x8_internal_10).write_8x4 + vpermq m0, [rsp+32*3], q3120 + vpermq m1, [rsp+32*4], q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct2_8x8_internal_10).write_8x4 + vpermq m0, [rsp+32*0], q3120 + vpermq m1, m13, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct2_8x8_internal_10).write_8x4 + vpermq m0, m14, q3120 + vpermq m1, m15, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct2_8x8_internal_10).write_8x4 + RET +.dconly: + imul r6d, [cq], 181 + vpbroadcastd m2, [dconly_10] + mov [cq], eobd ; 0 + or r3d, 32 + add r6d, 640 + sar r6d, 10 + jmp m(vvc_inv_dct2_dct2_8x8_10).dconly3 +ALIGN function_align +.pass1_main_part1: + mova m0, [cq+128*0] + mova m1, [cq+128*1] + mova m2, [cq+128*2] + mova m3, [cq+128*3] + mova m4, [cq+128*4] + mova m5, [cq+128*5] + mova m6, [cq+128*6] + mova m7, [cq+128*7] + call m(idct2_8x8_internal_10).main + psrld m1, m11, 10 ; vvc_pd_2 + REPX {paddd x, m1}, m0, m6, m5, m3 + paddd m1, m6, m7 ; out1 + psubd m6, m7 ; out6 + psubd m7, m0, m9 ; out7 + paddd m0, m9 ; out0 + paddd m2, m5, m4 ; out2 + psubd m5, m4 ; out5 + psubd m4, m3, m8 ; out4 + paddd m3, m8 ; out3 + REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7 + ret +ALIGN function_align +.pass1_main: + call .pass1_main_part1 + add cq, 32 + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 + packssdw m6, m7 + pshufb m0, m14 + pshufb m2, m14 + pshufb m4, m14 + pshufb m6, m14 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckldq m2, m4, m6 + punpckhdq m4, m6 + vperm2i128 m1, m0, m2, 0x31 ; 4 6 + vinserti128 m0, xm2, 1 ; 0 2 + vinserti128 m2, m3, xm4, 1 ; 1 3 + vperm2i128 m3, 
m4, 0x31 ; 5 7 + ret +.main_oddhalf_part1_fast_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 +.main_oddhalf_part1_fast: ; lower half zero + vpbroadcastd m7, [vvc_pd_4091] + vpbroadcastd m8, [vvc_pd_201] + vpbroadcastd m6, [vvc_pd_m1380] + vpbroadcastd m9, [vvc_pd_3857] + vpbroadcastd m5, [vvc_pd_3703] + vpbroadcastd m10, [vvc_pd_1751] + vpbroadcastd m4, [vvc_pd_m2751] + vpbroadcastd m15, [vvc_pd_3035] + pmulld m7, m0 + pmulld m0, m8 + pmulld m6, m1 + pmulld m1, m9 + pmulld m5, m2 + pmulld m2, m10 + pmulld m4, m3 + pmulld m3, m15 + jmp .main_oddhalf_part1_fast2 +.main_oddhalf_part1_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 +.main_oddhalf_part1: ; in1, in7, in9, in15, in17, in23, in25, in31 + ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 201, 4091 ; t16a, t31a + ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3857, 1380 ; t19a, t28a + ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1751, 3703 ; t18a, t29a + ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3035, 2751 ; t17a, t30a +.main_oddhalf_part1_fast2: + REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3 + REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3 + psubd m8, m0, m4 ; t17 + paddd m0, m4 ; t16 + psubd m4, m6, m2 ; t18 + paddd m6, m2 ; t19 + psubd m2, m1, m5 ; t29 + paddd m1, m5 ; t28 + psubd m5, m7, m3 ; t30 + paddd m7, m3 ; t31 + REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7 + REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7 + vpbroadcastd m15, [vvc_pd_89] + vpbroadcastd m10, [vvc_pd_18] + ITX_MULSUB_2D 5, 8, 3, 9, _, 11, 10, 15, 0xc ; t17a, t30a + ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 0xe ; t29a, t18a + psubd m3, m0, m6 ; t19a + paddd m0, m6 ; t16a + psubd m6, m7, m1 ; t28a + paddd m7, m1 ; t31a + psubd m1, m5, m4 ; t18 + paddd m5, m4 ; t17 + psubd m4, m8, m2 ; t29 + paddd m8, m2 ; t30 + REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8 + REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8 + vpbroadcastd m15, [vvc_pd_83] + 
vpbroadcastd m10, [vvc_pd_36] + ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 0xc ; t18a, t29a + ITX_MULSUB_2D 6, 3, 2, 9, _, 11, 10, 15, 0xc ; t19, t28 + mova [r6-32*4], m0 + mova [r6-32*3], m5 + mova [r6-32*2], m4 + mova [r6-32*1], m6 + mova [r6+32*0], m3 + mova [r6+32*1], m1 + mova [r6+32*2], m8 + mova [r6+32*3], m7 + ret +.main_oddhalf_part2_fast_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 +.main_oddhalf_part2_fast: ; lower half zero + vpbroadcastd m7, [vvc_pd_m601] + vpbroadcastd m8, [vvc_pd_4052] + vpbroadcastd m6, [vvc_pd_3973] + vpbroadcastd m9, [vvc_pd_995] + vpbroadcastd m5, [vvc_pd_m2106] + vpbroadcastd m10, [vvc_pd_3513] + vpbroadcastd m4, [vvc_pd_3290] + vpbroadcastd m15, [vvc_pd_2440] + pmulld m7, m0 + pmulld m0, m8 + pmulld m6, m1 + pmulld m1, m9 + pmulld m5, m2 + pmulld m2, m10 + pmulld m4, m3 + pmulld m3, m15 + jmp .main_oddhalf_part2_fast2 +.main_oddhalf_part2_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 +.main_oddhalf_part2: ; in3, in5, in11, in13, in19, in21, in27, in29 + ITX_MULSUB_2D 7, 0, 8, 9, 10, _, 4052, 601 ; t23a, t24a + ITX_MULSUB_2D 1, 6, 8, 9, 10, _, 995, 3973 ; t20a, t27a + ITX_MULSUB_2D 5, 2, 8, 9, 10, _, 3513, 2106 ; t21a, t26a + ITX_MULSUB_2D 3, 4, 8, 9, 10, _, 2440, 3290 ; t22a, t25a +.main_oddhalf_part2_fast2: + REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3 + REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3 + psubd m8, m0, m4 ; t25 + paddd m0, m4 ; t24 + psubd m4, m6, m2 ; t26 + paddd m6, m2 ; t27 + psubd m2, m1, m5 ; t21 + paddd m1, m5 ; t20 + psubd m5, m7, m3 ; t22 + paddd m7, m3 ; t23 + REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7 + REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7 + vpbroadcastd m15, [vvc_pd_50] + vpbroadcastd m10, [vvc_pd_75] + ITX_MULSUB_2D 4, 2, 3, 9, _, 11, 10, 15, 0xc ; t21a, t26a + ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 0xe ; t25a, t22a + psubd m3, m0, m6 ; t27a + paddd m0, m6 ; 
t24a + psubd m6, m7, m1 ; t20a + paddd m7, m1 ; t23a + psubd m1, m5, m4 ; t21 + paddd m5, m4 ; t22 + psubd m4, m8, m2 ; t26 + paddd m8, m2 ; t25 + REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8 + REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8 + vpbroadcastd m15, [vvc_pd_83] + vpbroadcastd m10, [vvc_pd_36] + ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 0xe ; t26a, t21a + ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 0xe ; t27, t20 + mova m9, [r6-32*4] ; t16a + mova m10, [r6-32*3] ; t17 + psubd m2, m9, m7 ; t23 + paddd m9, m7 ; t16 + psubd m7, m10, m5 ; t22a + paddd m10, m5 ; t17a + REPX {pmaxsd x, m12}, m9, m10, m2, m7 + REPX {pminsd x, m13}, m9, m10, m2, m7 + mova [r6-32*4], m9 + mova [r6-32*3], m10 + mova m9, [r6-32*2] ; t18a + mova m10, [r6-32*1] ; t19 + psubd m5, m9, m1 ; t21 + paddd m9, m1 ; t18 + psubd m1, m10, m6 ; t20a + paddd m10, m6 ; t19a + REPX {pmaxsd x, m12}, m9, m10, m5, m1 + REPX {pminsd x, m13}, m9, m10, m5, m1 + mova [r6-32*2], m9 + mova [r6-32*1], m10 + mova m9, [r6+32*0] ; t28 + mova m10, [r6+32*1] ; t29a + psubd m6, m9, m3 ; t27a + paddd m9, m3 ; t28a + psubd m3, m10, m4 ; t26 + paddd m10, m4 ; t29 + REPX {pmaxsd x, m12}, m9, m10, m6, m3 + REPX {pminsd x, m13}, m9, m10, m6, m3 + REPX {pmulld x, m14}, m6, m3, m1, m5 + paddd m6, m11 + paddd m3, m11 + psubd m4, m6, m1 ; t20 + paddd m6, m1 ; t27 + psubd m1, m3, m5 ; t21a + paddd m3, m5 ; t26a + REPX {psrad x, 12 }, m4, m1, m3, m6 + mova [r6+32*0], m4 + mova [r6+32*1], m1 + mova m4, [r6+32*2] ; t30 + mova m1, [r6+32*3] ; t31a + psubd m5, m4, m8 ; t25a + paddd m4, m8 ; t30a + psubd m8, m1, m0 ; t24 + paddd m1, m0 ; t31 + REPX {pmaxsd x, m12}, m8, m5, m4, m1 + REPX {pminsd x, m13}, m8, m5, m4, m1 + REPX {pmulld x, m14}, m5, m8, m7, m2 + paddd m5, m11 + paddd m8, m11 + psubd m0, m5, m7 ; t22 + paddd m5, m7 ; t25 + psubd m7, m8, m2 ; t23a + paddd m2, m8 ; t24a + REPX {psrad x, 12 }, m0, m7, m2, m5 + mova [r6+32*2], m0 + mova [r6+32*3], m7 + mov r4, r6 + add r6, 32*8 + mova [r6-32*4], m2 + mova 
[r6-32*3], m5 + mova [r6-32*2], m3 + mova [r6-32*1], m6 + mova [r6+32*0], m9 + mova [r6+32*1], m10 + mova [r6+32*2], m4 + mova [r6+32*3], m1 + mov r5, r6 + add r6, 32*8 + ret +ALIGN function_align +.main_end: + psrld m11, 10 ; vvc_pd_2 + IDCT2_32_END 0, 15, 8, 9, 10, 2 + IDCT2_32_END 1, 14, 8, 9, 10, 2 + punpckhwd m8, m0, m1 ; 16 17 + punpcklwd m0, m1 ; 0 1 + punpcklwd m1, m14, m15 ; 14 15 + punpckhwd m14, m15 ; 30 31 + mova [r5+32*3], m8 + mova [r5+32*2], m14 + IDCT2_32_END 2, 15, 8, 9, 10, 2 + IDCT2_32_END 3, 14, 8, 9, 10, 2 + punpckhwd m8, m2, m3 ; 18 19 + punpcklwd m2, m3 ; 2 3 + punpcklwd m3, m14, m15 ; 12 13 + punpckhwd m14, m15 ; 28 29 + mova [r5+32*1], m8 + mova [r5+32*0], m14 + IDCT2_32_END 4, 15, 8, 9, 10, 2 + IDCT2_32_END 5, 14, 8, 9, 10, 2 + punpckhwd m8, m4, m5 ; 20 21 + punpcklwd m4, m5 ; 4 5 + punpcklwd m5, m14, m15 ; 10 11 + punpckhwd m14, m15 ; 26 27 + mova [r5-32*1], m8 + mova [r5-32*2], m14 + IDCT2_32_END 6, 15, 8, 9, 10, 2 + IDCT2_32_END 7, 14, 8, 9, 10, 2 + punpckhwd m8, m6, m7 ; 22 23 + punpcklwd m6, m7 ; 6 7 + punpcklwd m7, m14, m15 ; 8 9 + punpckhwd m14, m15 ; 24 25 + mova [r5-32*3], m8 + mova [r5-32*4], m14 +.transpose: + punpckhdq m15, m3, m1 + punpckldq m3, m1 + punpckhdq m1, m4, m6 + punpckldq m4, m6 + punpckhdq m6, m0, m2 + punpckldq m0, m2 + punpckhdq m2, m7, m5 + punpckldq m7, m5 + punpcklqdq m5, m2, m15 + punpckhqdq m2, m15 + punpckhqdq m15, m7, m3 + punpcklqdq m7, m3 + punpckhqdq m3, m6, m1 + punpcklqdq m6, m1 + punpckhqdq m1, m0, m4 + punpcklqdq m0, m4 + vperm2i128 m4, m0, m7, 0x31 + vinserti128 m0, xm7, 1 + vperm2i128 m7, m3, m2, 0x31 + vinserti128 m3, xm2, 1 + vinserti128 m2, m6, xm5, 1 + vperm2i128 m6, m5, 0x31 + vperm2i128 m5, m1, m15, 0x31 + vinserti128 m1, xm15, 1 + ret + +cglobal vvc_inv_identity_identity_8x32_10, 4, 7, 8, dst, stride, c, eob + vpbroadcastd m7, [pixel_10_max] +.pass1: + vpbroadcastd m5, [vvc_pw_5] + pxor m6, m6 + mov r6d, eobd + add eobb, 21 + cmovc eobd, r6d ; 43, 107, 171 -> 64, 128, 192 + lea r6, 
[strideq*3] + lea r5, [strideq*5] + lea r4, [strideq+r6*2] ; strideq*7 +.loop: + mova m0, [cq+128*0] + packssdw m0, [cq+128*1] + mova m1, [cq+128*2] + packssdw m1, [cq+128*3] + mova m2, [cq+128*4] + packssdw m2, [cq+128*5] + mova m3, [cq+128*6] + packssdw m3, [cq+128*7] + REPX {paddsw x, m5}, m0, m1, m2, m3 + REPX {psraw x, 3 }, m0, m1, m2, m3 ; saturating add of m5 (vvc_pw_5) followed by an arithmetic shift right by 3 + call .main_zero ; clears the rows just read, then runs .main + add cq, 32 + lea dstq, [dstq+strideq*8] + sub eobd, 64 + jge .loop ; keep looping while eobd minus 64 stays non-negative + RET +ALIGN function_align +.main_zero: + REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 ; m6 was zeroed in .pass1, so this clears eight source rows and falls through +.main: + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m0, m4 + punpcklwd m0, m4 + punpckhwd m4, m2, m1 + punpcklwd m2, m1 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + mova xm4, [dstq+strideq*0] + vinserti128 m4, [dstq+strideq*4], 1 + paddw m0, m4 + mova xm4, [dstq+strideq*1] + vinserti128 m4, [dstq+r5 ], 1 + paddw m1, m4 + mova xm4, [dstq+strideq*2] + vinserti128 m4, [dstq+r6*2 ], 1 + paddw m2, m4 + mova xm4, [dstq+r6 ] + vinserti128 m4, [dstq+r4 ], 1 + paddw m3, m4 + REPX {pmaxsw x, m6}, m0, m1, m2, m3 ; clamp to [0, m7], where m7 is the pixel maximum loaded by the entry point + REPX {pminsw x, m7}, m0, m1, m2, m3 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*4], m0, 1 + mova [dstq+strideq*1], xm1 + vextracti128 [dstq+r5 ], m1, 1 + mova [dstq+strideq*2], xm2 + vextracti128 [dstq+r6*2 ], m2, 1 + mova [dstq+r6 ], xm3 + vextracti128 [dstq+r4 ], m3, 1 + ret + +cglobal vvc_inv_dct2_dct2_8x32_12, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly ; eob == 0: take the DC-only path + PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob +%undef cmp + vpbroadcastd m11, [vvc_pd_2048] + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + mov r4, cq ; keep the original cq, since .pass1_main advances it by 32 per call + lea r6, [rsp+32*4] + call .pass1_main + cmp eobd, 43 + jge .eob43 + jmp .pass2_fast +.eob43: + call .pass1_main + cmp eobd, 107 + jge .eob107 +.pass2_fast: + mov cq, r4 + vpbroadcastd m12, [clip_18b_min] ; pass 2 replaces the 20b clamp constants with the 18b ones + vpbroadcastd m13, [clip_18b_max] + pmaxsd m0, m12, [cq+128*1+ 0] + pmaxsd m1, m12,
[cq+128*7+ 0] + pmaxsd m2, m12, [cq+128*1+32] + pmaxsd m3, m12, [cq+128*7+32] + REPX {pminsd x, m13}, m0, m1, m2, m3 + vpbroadcastd m14, [vvc_pd_64] + call m(vvc_inv_dct2_dct2_8x32_10).main_oddhalf_part1_fast + pmaxsd m0, m12, [cq+128*3+ 0] + pmaxsd m1, m12, [cq+128*5+ 0] + pmaxsd m2, m12, [cq+128*3+32] + pmaxsd m3, m12, [cq+128*5+32] + REPX {pminsd x, m13}, m0, m1, m2, m3 + call m(vvc_inv_dct2_dct2_8x32_10).main_oddhalf_part2_fast + pmaxsd m0, m12, [cq+128*2+ 0] + pmaxsd m1, m12, [cq+128*6+ 0] + pmaxsd m2, m12, [cq+128*2+32] + pmaxsd m3, m12, [cq+128*6+32] + REPX {pminsd x, m13}, m0, m1, m2, m3 + call m(idct2_8x16_internal_10).main_oddhalf_fast + pmaxsd m0, m12, [cq+128*0+ 0] + pmaxsd m1, m12, [cq+128*4+ 0] + pmaxsd m2, m12, [cq+128*0+32] + pmaxsd m3, m12, [cq+128*4+32] + REPX {pminsd x, m13}, m0, m1, m2, m3 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct2_8x8_internal_10).main + call m(idct2_8x16_internal_10).main_evenhalf + jmp .pass2_end +.eob107: + call .pass1_main + cmp eobd, 171 + jge .eob171 + jmp .pass2 +.eob171: + call .pass1_main +.pass2: + mov cq, r4 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + pmaxsd m0, m12, [cq+128*1+ 0] + pmaxsd m1, m12, [cq+128*7+ 0] + pmaxsd m2, m12, [cq+128*1+32] + pmaxsd m3, m12, [cq+128*7+32] + pmaxsd m4, m12, [cq+128*1+64] + pmaxsd m5, m12, [cq+128*7+64] + pmaxsd m6, m12, [cq+128*1+96] + pmaxsd m7, m12, [cq+128*7+96] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + vpbroadcastd m14, [vvc_pd_64] + call m(vvc_inv_dct2_dct2_8x32_10).main_oddhalf_part1 + pmaxsd m0, m12, [cq+128*3+ 0] + pmaxsd m1, m12, [cq+128*5+ 0] + pmaxsd m2, m12, [cq+128*3+32] + pmaxsd m3, m12, [cq+128*5+32] + pmaxsd m4, m12, [cq+128*3+64] + pmaxsd m5, m12, [cq+128*5+64] + pmaxsd m6, m12, [cq+128*3+96] + pmaxsd m7, m12, [cq+128*5+96] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(vvc_inv_dct2_dct2_8x32_10).main_oddhalf_part2 + pmaxsd m0, m12, [cq+128*2+ 0] + pmaxsd m1, m12, [cq+128*6+ 0] + pmaxsd 
m2, m12, [cq+128*2+32] + pmaxsd m3, m12, [cq+128*6+32] + pmaxsd m4, m12, [cq+128*2+64] + pmaxsd m5, m12, [cq+128*6+64] + pmaxsd m6, m12, [cq+128*2+96] + pmaxsd m7, m12, [cq+128*6+96] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct2_8x16_internal_10).main_oddhalf + pmaxsd m0, m12, [cq+128*0+ 0] + pmaxsd m1, m12, [cq+128*4+ 0] + pmaxsd m2, m12, [cq+128*0+32] + pmaxsd m3, m12, [cq+128*4+32] + pmaxsd m4, m12, [cq+128*0+64] + pmaxsd m5, m12, [cq+128*4+64] + pmaxsd m6, m12, [cq+128*0+96] + pmaxsd m7, m12, [cq+128*4+96] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct2_8x8_internal_10).main + call m(idct2_8x16_internal_10).main_evenhalf +.pass2_end: + psrld m11, 8 ; vvc_pd_8 + IDCT2_32_END 0, 15, 8, 9, 10, 4 + IDCT2_32_END 1, 14, 8, 9, 10, 4 + punpckhqdq m8, m0, m1 ; 16 17 (interleaved) + punpcklqdq m0, m1 ; 0 1 (interleaved) + punpcklqdq m1, m14, m15 ; 14 15 (interleaved) + punpckhqdq m14, m15 ; 30 31 (interleaved) + mova [r5+32*3], m8 + mova [r5+32*2], m14 + IDCT2_32_END 2, 15, 8, 9, 10, 4 + IDCT2_32_END 3, 14, 8, 9, 10, 4 + punpckhqdq m8, m2, m3 ; 18 19 (interleaved) + punpcklqdq m2, m3 ; 2 3 (interleaved) + punpcklqdq m3, m14, m15 ; 12 13 (interleaved) + punpckhqdq m14, m15 ; 28 29 (interleaved) + mova [r5+32*1], m8 + mova [r5+32*0], m14 + IDCT2_32_END 4, 15, 8, 9, 10, 4 + IDCT2_32_END 5, 14, 8, 9, 10, 4 + punpckhqdq m8, m4, m5 ; 20 21 (interleaved) + punpcklqdq m4, m5 ; 4 5 (interleaved) + punpcklqdq m5, m14, m15 ; 10 11 (interleaved) + punpckhqdq m14, m15 ; 26 27 (interleaved) + mova [r5-32*1], m8 + mova [r5-32*2], m14 + IDCT2_32_END 6, 15, 8, 9, 10, 4 + IDCT2_32_END 7, 14, 8, 9, 10, 4 + punpckhqdq m8, m6, m7 ; 22 23 (interleaved) + punpcklqdq m6, m7 ; 6 7 (interleaved) + punpcklqdq m7, m14, m15 ; 8 9 (interleaved) + punpckhqdq m14, m15 ; 24 25 (interleaved) + mova [r5-32*3], m8 + mova [r5-32*4], m14 + mova m15, m1 +.end: + vpermq m0, m0, q3120 + vpermq m1, m2, q3120 + call m(idct2_8x8_internal_12).write_8x4_start + call 
m(idct2_8x8_internal_10).write_8x4 + vpermq m0, m4, q3120 + vpermq m1, m6, q3120 + call m(idct2_8x8_internal_10).write_8x4 + vpermq m0, m7, q3120 + vpermq m1, m5, q3120 + call m(idct2_8x8_internal_10).write_8x4 + vpermq m0, m3, q3120 + vpermq m1, m15, q3120 + call m(idct2_8x8_internal_10).write_8x4 + vpermq m0, [r5+32*3], q3120 + vpermq m1, [r5+32*1], q3120 + call m(idct2_8x8_internal_10).write_8x4 + vpermq m0, [r5-32*1], q3120 + vpermq m1, [r5-32*3], q3120 + call m(idct2_8x8_internal_10).write_8x4 + vpermq m0, [r5-32*4], q3120 + vpermq m1, [r5-32*2], q3120 + call m(idct2_8x8_internal_10).write_8x4 + vpermq m0, [r5+32*0], q3120 + vpermq m1, [r5+32*2], q3120 + call m(idct2_8x8_internal_10).write_8x4 + RET +.dconly: + imul r6d, [cq], 181 + vpbroadcastd m2, [dconly_12] + mov [cq], eobd ; 0 + or r3d, 32 + add r6d, 640 + sar r6d, 10 + jmp m(vvc_inv_dct2_dct2_8x8_10).dconly3 +ALIGN function_align +.pass1_main: + call m(vvc_inv_dct2_dct2_8x32_10).pass1_main_part1 + TRANSPOSE_8X8_DWORD 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15 + mova [cq+128*0], m0 + mova [cq+128*1], m1 + mova [cq+128*2], m2 + mova [cq+128*3], m3 + mova [cq+128*4], m4 + mova [cq+128*5], m5 + mova [cq+128*6], m6 + mova [cq+128*7], m7 + add cq, 32 + ret +ALIGN function_align +.main_end: + psrld m11, 10 ; vvc_pd_2 + IDCT2_32_END 0, 15, 8, 9, 10, 2, 0 + mova [cq+32*16], m8 + mova [cq+32*31], m9 + IDCT2_32_END 1, 14, 8, 9, 10, 2, 0 + mova [cq+32*17], m8 + mova [cq+32*30], m9 + mova [cq+32*14], m14 + IDCT2_32_END 2, 14, 8, 9, 10, 2, 0 + mova [cq+32*18], m8 + mova [cq+32*29], m9 + mova [cq+32*13], m14 + IDCT2_32_END 3, 14, 8, 9, 10, 2, 0 + mova [cq+32*19], m8 + mova [cq+32*28], m9 + mova [cq+32*12], m14 + IDCT2_32_END 4, 14, 8, 9, 10, 2, 0 + mova [cq+32*20], m8 + mova [cq+32*27], m9 + mova [cq+32* 0], m0 + mova [cq+32* 1], m1 + mova [cq+32* 2], m2 + IDCT2_32_END 5, 10, 0, 1, 2, 2, 0 + mova [cq+32*21], m0 + mova [cq+32*26], m1 + IDCT2_32_END 6, 9, 0, 1, 2, 2, 0 + mova [cq+32*22], m0 + mova [cq+32*25], m1 + IDCT2_32_END 
7, 8, 0, 1, 2, 2, 0 + mova [cq+32*23], m0 + mova [cq+32*24], m1 + mova m0, [cq+32* 0] + mova m1, [cq+32* 1] + mova m2, [cq+32* 2] + mova m11, m14 + mova m12, [cq+32*12] + mova m13, [cq+32*13] + mova m14, [cq+32*14] + ret + +cglobal vvc_inv_identity_identity_8x32_12, 4, 7, 8, dst, stride, c, eob + vpbroadcastd m7, [pixel_12_max] + jmp m(vvc_inv_identity_identity_8x32_10).pass1 + +cglobal vvc_inv_dct2_dct2_32x8_10, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jnz .full + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10] + mov [cq], eobd ; 0 (eobd is zero on this path, so the DC coefficient is cleared) + or r3d, 8 ; r3d is the counter decremented in .dconly_loop +.dconly: ; also entered from vvc_inv_dct2_dct2_32x8_12 + add r6d, 640 + sar r6d, 10 +.dconly2: ; also entered from vvc_inv_dct2_dct2_32x16_10 + imul r6d, 181 + add r6d, 2176 + sar r6d, 12 + movd xm0, r6d + paddsw xm0, xm3 + vpbroadcastw m0, xm0 +.dconly_loop: + paddsw m1, m0, [dstq+32*0] + paddsw m2, m0, [dstq+32*1] + psubusw m1, m3 ; m3 was added into m0 above and is removed again here with unsigned saturation + psubusw m2, m3 + mova [dstq+32*0], m1 + mova [dstq+32*1], m2 + add dstq, strideq + dec r3d + jg .dconly_loop + RET +.full: + PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob + lea r6, [rsp+32*4] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + call .pass1 + call m(vvc_inv_dct2_dct2_8x32_10).main_end + lea r6, [deint_shuf+128] + vpbroadcastd m11, [vvc_pw_2048] + mov r4, dstq ; remember the original dstq for the second .pass2 call + call .pass2 + mova m0, [r5+32*3] ; 16 17 + mova m1, [r5+32*2] ; 30 31 + mova m2, [r5+32*1] ; 18 19 + mova m3, [r5+32*0] ; 28 29 + mova m4, [r5-32*1] ; 20 21 + mova m5, [r5-32*2] ; 26 27 + mova m6, [r5-32*3] ; 22 23 + mova m7, [r5-32*4] ; 24 25 + call m(vvc_inv_dct2_dct2_8x32_10).transpose + lea dstq, [r4+32] ; second call writes 32 bytes to the right of the saved dstq + call .pass2 + RET +ALIGN function_align +.pass2: + call m(idct2_16x8_internal_8).main + REPX {pmulhrsw x, m11}, m0, m1, m2, m3 + call m(idct2_16x8_internal_10).write_16x4_start + pmulhrsw m0, m11, m4 + pmulhrsw m1, m11, m5 + pmulhrsw m2, m11, m6 + pmulhrsw m3, m11, m7 + jmp m(idct2_16x8_internal_10).write_16x4_zero ; tail call +ALIGN function_align +.pass1: + mova m0, [cq+32* 1] + mova m1, [cq+32* 7] + mova m2, [cq+32* 9] + mova m3, [cq+32*15] + mova m4, [cq+32*17] + mova m5, [cq+32*23] +
mova m6, [cq+32*25] + mova m7, [cq+32*31] + vpbroadcastd m11, [vvc_pd_2048] + vpbroadcastd m14, [vvc_pd_64] + call m(vvc_inv_dct2_dct2_8x32_10).main_oddhalf_part1 + mova m0, [cq+32* 3] + mova m1, [cq+32* 5] + mova m2, [cq+32*11] + mova m3, [cq+32*13] + mova m4, [cq+32*19] + mova m5, [cq+32*21] + mova m6, [cq+32*27] + mova m7, [cq+32*29] + call m(vvc_inv_dct2_dct2_8x32_10).main_oddhalf_part2 + mova m0, [cq+32* 2] + mova m1, [cq+32* 6] + mova m2, [cq+32*10] + mova m3, [cq+32*14] + mova m4, [cq+32*18] + mova m5, [cq+32*22] + mova m6, [cq+32*26] + mova m7, [cq+32*30] + call m(idct2_8x16_internal_10).main_oddhalf + mova m0, [cq+32* 0] + mova m1, [cq+32* 4] + mova m2, [cq+32* 8] + mova m3, [cq+32*12] + mova m4, [cq+32*16] + mova m5, [cq+32*20] + mova m6, [cq+32*24] + mova m7, [cq+32*28] + call m(idct2_8x8_internal_10).main + call m(idct2_8x16_internal_10).main_evenhalf + ret + +cglobal vvc_inv_identity_identity_32x8_10, 4, 7, 8, dst, stride, c, eob + vpbroadcastd m7, [pixel_10_max] +.pass1: + vpbroadcastd m5, [vvc_pw_4096] + pxor m6, m6 + mov r6d, eobd + add eobb, 21 + cmovc eobd, r6d + lea r6, [strideq*3] + lea r5, [strideq*5] + lea r4, [strideq+r6*2] ; strideq*7 +.loop: + mova m0, [cq+32*0] + packssdw m0, [cq+32*1] + mova m1, [cq+32*2] + packssdw m1, [cq+32*3] + REPX {mova [cq+32*x], m6}, 0, 1, 2, 3 ; m6 is zero: clear the four rows just loaded + add cq, 32*8 + mova m2, [cq-32*4] + packssdw m2, [cq-32*3] + mova m3, [cq-32*2] + packssdw m3, [cq-32*1] + REPX {pmulhrsw x, m5}, m0, m1, m2, m3 + REPX {mova [cq+32*x], m6}, -4, -3, -2, -1 + call m(vvc_inv_identity_identity_8x32_10).main ; shared routine: interleave, add to dst, clamp to [m6, m7], store + add dstq, 16 ; the next iteration writes 16 bytes further right + sub eobd, 64 + jge .loop ; keep looping while eobd minus 64 stays non-negative + RET + +cglobal vvc_inv_dct2_dct2_32x8_12, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jnz .full ; eob != 0: run the full transform + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_12] + mov [cq], eobd ; 0 + or r3d, 8 + jmp m(vvc_inv_dct2_dct2_32x8_10).dconly ; shared DC-only tail, entered with m3 holding dconly_12 +.full: + PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob + lea r6, [rsp+32*4] + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + call
m(vvc_inv_dct2_dct2_32x8_10).pass1 + call m(vvc_inv_dct2_dct2_8x32_12).main_end + mov r4, dstq + call m(idct2_16x8_internal_12).pass2_main + mova m0, [cq+32* 0] ; 16 + mova m1, [cq+32* 1] ; 17 + mova m2, [cq+32* 2] ; 18 + mova m3, [cq+32* 3] ; 19 + mova m4, [cq+32* 4] ; 20 + mova m5, [cq+32* 5] ; 21 + mova m6, [cq+32* 6] ; 22 + mova m7, [cq+32* 7] ; 23 + mova m8, [cq+32* 8] ; 24 + mova m9, [cq+32* 9] ; 25 + mova m10, [cq+32*10] ; 26 + mova m11, [cq+32*11] ; 27 + mova m12, [cq+32*12] ; 28 + mova m13, [cq+32*13] ; 29 + mova m14, [cq+32*14] ; 30 + mova m15, [cq+32*15] ; 31 + lea dstq, [r4+32] ; r4 holds the original dstq, so this second call writes 32 bytes to the right + call m(idct2_16x8_internal_12).pass2_main + RET + +cglobal vvc_inv_identity_identity_32x8_12, 4, 7, 8, dst, stride, c, eob + vpbroadcastd m7, [pixel_12_max] + jmp m(vvc_inv_identity_identity_32x8_10).pass1 ; reuse the 10-bit body, entered after its own m7 load so m7 stays pixel_12_max + +%macro IDCT2_32_PASS2_END 6 ; coefs[1-2], tmp[1-2], offset[1-2] + mova m%4, [%2] + paddsw m%3, m%1, m%4 + psubsw m%1, m%4 +%if %1 == 0 ; m6 is zeroed only on the invocation whose first argument is register 0 + pxor m6, m6 +%endif + pmulhrsw m%3, m15 ; sum and difference are both scaled by m15 + pmulhrsw m%1, m15 + paddw m%3, [dstq+%5] + paddw m%1, [r2+%6] + pmaxsw m%3, m6 ; clamp both results to [m6, m7] before they are stored + pmaxsw m%1, m6 + pminsw m%3, m7 + pminsw m%1, m7 + mova [dstq+%5], m%3 + mova [r2+%6], m%1 +%endmacro + +cglobal vvc_inv_dct2_dct2_16x32_10, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly ; eob == 0: take the DC-only path + PROLOGUE 0, 8, 16, 32*36, dst, stride, c, eob +%undef cmp + vpbroadcastd m11, [vvc_pd_2048] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + vpbroadcastd m14, [vvc_pd_64] + lea r6, [rsp+32*16] + lea r4, [r6+32*8] + lea r5, [r6+32*16] + call .main + sub eobd, 44 + jge .eob44 ; eob of 44 or more: .eob44 runs another .main call + vperm2i128 m2, m0, m3, 0x31 ; 5 + vinserti128 m0, xm3, 1 ; 1 + vperm2i128 m3, m1, m4, 0x31 ; 7 + vinserti128 m1, xm4, 1 ; 3 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + REPX {mova [r6+32*x], m4}, 0, 1, 2, 3 + jmp .fast +.dconly: + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10] + mov [cq], eobd ; 0 + or r3d, 32 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + jmp m(vvc_inv_dct2_dct2_16x4_10).dconly2 +.eob44: + mova
[r4+16*0], xm0 + mova [r4+16*1], xm3 + mova [r4+16*2], xm1 + mova [r4+16*3], xm4 + vextracti128 [r4+16*4], m0, 1 + vextracti128 [r4+16*5], m3, 1 + vextracti128 [r4+16*6], m1, 1 + vextracti128 [r4+16*7], m4, 1 + call .main + sub eobd, 107 + jge .eob151 + vperm2i128 m7, m1, m4, 0x31 ; 15 + vinserti128 m5, m1, xm4, 1 ; 11 + vperm2i128 m6, m0, m3, 0x31 ; 13 + vinserti128 m4, m0, xm3, 1 ; 9 + mova m0, [r4+32*0] + mova m1, [r4+32*1] + mova m2, [r4+32*2] + mova m3, [r4+32*3] +.fast: + lea r6, [vvc_pw_5+128] + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf_fast + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 + jmp .idct2_16 +.eob151: + mova [r4-16*8], xm0 + mova [r4-16*7], xm3 + mova [r4-16*6], xm1 + mova [r4-16*5], xm4 + vextracti128 [r4-16*4], m0, 1 + vextracti128 [r4-16*3], m3, 1 + vextracti128 [r4-16*2], m1, 1 + vextracti128 [r4-16*1], m4, 1 + call .main + sub eobd, 128 + jge .eob279 + vperm2i128 m10, m0, m3, 0x31 ; 21 + vinserti128 m8, m0, xm3, 1 ; 17 + vperm2i128 m11, m1, m4, 0x31 ; 23 + vinserti128 m9, m1, xm4, 1 ; 19 + pxor m12, m12 + REPX {mova x, m12}, m13, m14, m15 + REPX {mova [r6+32*x], m12}, 0, 1, 2, 3 + jmp .full +.eob279: + mova [r5+16*0], xm0 + mova [r5+16*1], xm3 + mova [r5+16*2], xm1 + mova [r5+16*3], xm4 + vextracti128 [r5+16*4], m0, 1 + vextracti128 [r5+16*5], m3, 1 + vextracti128 [r5+16*6], m1, 1 + vextracti128 [r5+16*7], m4, 1 + call .main + vperm2i128 m14, m0, m3, 0x31 ; 29 + vinserti128 m12, m0, xm3, 1 ; 25 + vperm2i128 m15, m1, m4, 0x31 ; 31 + vinserti128 m13, m1, xm4, 1 ; 27 + mova m8, [r5+32*0] + mova m9, [r5+32*1] + mova m10, [r5+32*2] + mova m11, [r5+32*3] +.full: + mova m0, [r4+32*0] + mova m1, [r4+32*1] + mova m2, [r4+32*2] + mova m3, [r4+32*3] + mova m4, [r4-32*4] + mova m5, [r4-32*3] + mova m6, [r4-32*2] + mova m7, [r4-32*1] + lea r6, [vvc_pw_5 + 128] + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf + lea r3, [rsp+32*8] + mova m8, [r3+32*0] + mova m9, [r3+32*1] + mova m10, [r3+32*2] + mova m11, [r3+32*3] + mova m12, 
[r3-32*4] + mova m13, [r3-32*3] + mova m14, [r3-32*2] + mova m15, [r3-32*1] +.idct2_16: + lea r3, [rsp+32*16] + mova m0, [r3+32*0] + mova m1, [r3+32*1] + mova m2, [r3+32*2] + mova m3, [r3+32*3] + mova m4, [r3-32*4] + mova m5, [r3-32*3] + mova m6, [r3-32*2] + mova m7, [r3-32*1] + mova [rsp], m15 + call m(idct2_16x16_internal_8).main + imul r2, strideq, 19 + lea r3, [strideq*3] + add r2, dstq + call .pass2_end + RET +ALIGN function_align +.main: + pmulld m0, m14, [cq+128* 1] + pmulld m1, m14, [cq+128* 3] + pmulld m2, m14, [cq+128* 5] + pmulld m3, m14, [cq+128* 7] + pmulld m4, m14, [cq+128* 9] + pmulld m5, m14, [cq+128*11] + pmulld m6, m14, [cq+128*13] + pmulld m7, m14, [cq+128*15] + call m(idct2_8x16_internal_10).main_oddhalf_rect2 + pmulld m0, m14, [cq+128* 0] + pmulld m1, m14, [cq+128* 2] + pmulld m2, m14, [cq+128* 4] + pmulld m3, m14, [cq+128* 6] + pmulld m4, m14, [cq+128* 8] + pmulld m5, m14, [cq+128*10] + pmulld m6, m14, [cq+128*12] + pmulld m7, m14, [cq+128*14] + call m(idct2_8x8_internal_10).main_rect2 + call m(idct2_8x16_internal_10).main_evenhalf + psrld m15, m11, 11 ; vvc_pd_1 + mova m8, [r6-32*4] + mova m9, [r6-32*3] + REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7 + psubd m10, m0, m8 ; out15 + paddd m0, m8 ; out0 + mova m8, [r6-32*2] + paddd m15, m1, m9 ; out1 + psubd m1, m9 ; out14 + mova m9, [r6-32*1] + REPX {psrad x, 1}, m0, m15, m10, m1 + packssdw m0, m15 + packssdw m1, m10 + psubd m10, m2, m8 ; out13 + paddd m2, m8 ; out2 + mova m8, [r6+32*0] + paddd m15, m3, m9 ; out3 + psubd m3, m9 ; out12 + mova m9, [r6+32*1] + REPX {psrad x, 1}, m2, m15, m10, m3 + packssdw m2, m15 + packssdw m3, m10 + psubd m10, m4, m8 ; out11 + paddd m4, m8 ; out4 + mova m8, [r6+32*2] + paddd m15, m5, m9 ; out5 + psubd m5, m9 ; out10 + mova m9, [r6+32*3] + REPX {psrad x, 1}, m4, m10, m15, m5 + packssdw m4, m15 + packssdw m5, m10 + psubd m10, m6, m8 ; out9 + paddd m6, m8 ; out6 + paddd m15, m7, m9 ; out7 + psubd m7, m9 ; out8 + REPX {psrad x, 1}, m6, m10, m15, m7 + packssdw 
m6, m15 + packssdw m7, m10 + punpckhwd m8, m0, m2 + punpcklwd m0, m2 + punpckhwd m2, m3, m1 + punpcklwd m3, m1 + punpckhwd m1, m4, m6 + punpcklwd m4, m6 + punpcklwd m6, m7, m5 + punpckhwd m7, m5 + pxor m5, m5 + mov r7d, 128*13 +.main_zero_loop: + mova [cq+r7-128*1], m5 + mova [cq+r7+128*0], m5 + mova [cq+r7+128*1], m5 + mova [cq+r7+128*2], m5 + sub r7d, 128*4 + jg .main_zero_loop + add cq, 32 + punpcklwd m5, m3, m2 + punpckhwd m3, m2 + punpcklwd m2, m4, m1 + punpckhwd m4, m1 + punpckhwd m1, m0, m8 + punpcklwd m0, m8 + punpckhwd m8, m6, m7 + punpcklwd m6, m7 + punpcklqdq m7, m1, m4 + punpckhqdq m1, m4 + punpckhqdq m4, m8, m3 + punpcklqdq m8, m3 + punpckhqdq m3, m6, m5 + punpcklqdq m6, m5 + punpcklqdq m5, m0, m2 + punpckhqdq m0, m2 + mova [r6+16*0], xm5 + mova [r6+16*1], xm6 + mova [r6+16*2], xm7 + mova [r6+16*3], xm8 + vextracti128 [r6+16*4], m5, 1 + vextracti128 [r6+16*5], m6, 1 + vextracti128 [r6+16*6], m7, 1 + vextracti128 [r6+16*7], m8, 1 + sub r6, 32*4 + ret +ALIGN function_align +.pass2_end: + mova [rsp+gprsize+32*0], m6 + mova [rsp+gprsize+32*2], m7 + mova [rsp+gprsize+32*3], m15 + vpbroadcastd m15, [vvc_pw_2048] + vpbroadcastd m7, [pixel_10_max] + IDCT2_32_PASS2_END 0, r5+32*3, 1, 6, strideq*0, r3*4 + IDCT2_32_PASS2_END 4, r5-32*1, 0, 1, strideq*4, strideq*8 + IDCT2_32_PASS2_END 8, r4+32*3, 0, 4, strideq*8, strideq*4 + IDCT2_32_PASS2_END 12, r4-32*1, 0, 4, r3*4, strideq*0 + add dstq, strideq + sub r2, strideq + mova m1, [rsp+gprsize+32*1] + IDCT2_32_PASS2_END 1, r5+32*2, 0, 4, strideq*0, r3*4 + IDCT2_32_PASS2_END 5, r5-32*2, 0, 4, strideq*4, strideq*8 + IDCT2_32_PASS2_END 9, r4+32*2, 0, 4, strideq*8, strideq*4 + IDCT2_32_PASS2_END 13, r4-32*2, 0, 4, r3*4, strideq*0 + add dstq, strideq + sub r2, strideq + mova m1, [rsp+gprsize+32*0] + IDCT2_32_PASS2_END 2, r5+32*1, 0, 4, strideq*0, r3*4 + IDCT2_32_PASS2_END 1, r5-32*3, 0, 4, strideq*4, strideq*8 + IDCT2_32_PASS2_END 10, r4+32*1, 0, 4, strideq*8, strideq*4 + IDCT2_32_PASS2_END 14, r4-32*3, 0, 4, r3*4, 
strideq*0 + add dstq, strideq + sub r2, strideq + mova m1, [rsp+gprsize+32*2] + mova m2, [rsp+gprsize+32*3] + IDCT2_32_PASS2_END 3, r5+32*0, 0, 4, strideq*0, r3*4 + IDCT2_32_PASS2_END 1, r5-32*4, 0, 4, strideq*4, strideq*8 + IDCT2_32_PASS2_END 11, r4+32*0, 0, 4, strideq*8, strideq*4 + IDCT2_32_PASS2_END 2, r4-32*4, 0, 4, r3*4, strideq*0 + ret + +cglobal vvc_inv_identity_identity_16x32_10, 4, 7, 12, dst, stride, c, eob + vpbroadcastd m7, [pixel_10_max] +.pass1: + vpbroadcastd m8, [vvc_pw_64x8] + vpbroadcastd m9, [vvc_pw_1697x16] + vpbroadcastd m11, [vvc_pw_8192] + lea r6, [strideq*5] + pxor m6, m6 + paddw m10, m11, m11 ; vvc_pw_16384 + mov r5, dstq ; r5 is the base dst pointer for the current pair of .main calls + call .main + sub eobd, 36 + jl .ret ; eob below 36: finished after the first .main call + add cq, 128*8 + lea dstq, [r5+16] ; same rows as the saved base, 16 bytes to the right + call .main + sub cq, 128*8-32 ; net cq movement across the two calls is 32 bytes forward + lea dstq, [r5+strideq*8] ; eight rows below the saved base + mov r5, dstq + call .main + sub eobd, 107 ; eob < 143 + jl .ret + add cq, 128*8 + lea dstq, [r5+16] + call .main + sub cq, 128*8-32 + lea dstq, [r5+strideq*8] + mov r5, dstq + call .main + sub eobd, 128 ; eob < 271 + jl .ret + add cq, 128*8 + lea dstq, [r5+16] + call .main + sub cq, 128*8-32 + lea dstq, [r5+strideq*8] + mov r5, dstq + call .main + sub eobd, 128 ; eob < 399 + jl .ret + add cq, 128*8 + lea dstq, [r5+16] + call .main +.ret: + RET +ALIGN function_align +.main: + mova m0, [cq+128*0] + packssdw m0, [cq+128*1] + mova m1, [cq+128*2] + packssdw m1, [cq+128*3] + mova m2, [cq+128*4] + packssdw m2, [cq+128*5] + mova m3, [cq+128*6] + packssdw m3, [cq+128*7] + REPX {pmulhrsw x, m8 }, m0, m1, m2, m3 + REPX {IDTX16 x, 4, 9, 10}, 0, 1, 2, 3 + REPX {pmulhrsw x, m11}, m0, m1, m2, m3 + REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 ; m6 is zero: clear the eight rows just consumed +.main2: + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m0, m4 + punpcklwd m0, m4 + punpcklwd m4, m2, m1 + punpckhwd m2, m1 + punpckhqdq m1, m0, m4 + punpcklqdq m0, m4 + call m(iidentity_8x8_internal_10).write_2x8x2 + punpcklqdq m0, m3, m2 + punpckhqdq m1, m3, m2 + jmp m(iidentity_8x8_internal_10).write_2x8x2 ; tail call into the same writer used just above
+ +cglobal vvc_inv_identity_identity_16x32_12, 4, 7, 12, dst, stride, c, eob + vpbroadcastd m7, [pixel_12_max] + jmp m(vvc_inv_identity_identity_16x32_10).pass1 + +cglobal vvc_inv_dct2_dct2_32x16_10, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 8, 16, 32*40, dst, stride, c, eob +%undef cmp + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + lea r6, [rsp+32*4] + call .main + cmp eobd, 36 + jge .full + call m(vvc_inv_dct2_dct2_8x32_10).transpose + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp] + lea r6, [vvc_pw_5+128] + mov r7, dstq + call m(idct2_16x16_internal_8).main + call .write_16x16 + mova m0, [r5+32*3] + mova m1, [r5+32*2] + mova m2, [r5+32*1] + mova m3, [r5+32*0] + mova m4, [r5-32*1] + mova m5, [r5-32*2] + mova m6, [r5-32*3] + mova m7, [r5-32*4] + call m(vvc_inv_dct2_dct2_8x32_10).transpose + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp] + jmp .end +.dconly: + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10] + mov [cq], eobd ; 0 + or r3d, 16 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 384 + sar r6d, 9 + jmp m(vvc_inv_dct2_dct2_32x8_10).dconly2 +.full: + add cq, 32 + mova [r4+32*3], m0 + mova [r4+32*2], m1 + mova [r4+32*1], m2 + mova [r4+32*0], m3 + mova [r4-32*1], m4 + mova [r4-32*2], m5 + mova [r4-32*3], m6 + mova [r4-32*4], m7 + call .main + sub r4, 32*16 ; topleft 16x8 + call .transpose_16x16 + lea r6, [vvc_pw_5+128] + mov r7, dstq + call m(idct2_16x16_internal_8).main + call .write_16x16 + mova m0, [r5+32*3] + mova m1, [r5+32*2] + mova m2, [r5+32*1] + mova m3, [r5+32*0] + mova m4, [r5-32*1] + mova m5, [r5-32*2] + mova m6, [r5-32*3] + mova m7, [r5-32*4] + add r4, 32*8 ; bottomleft 16x8 + call .transpose_16x16 +.end: + lea dstq, [r7+32] + call m(idct2_16x16_internal_8).main + call .write_16x16 + RET +ALIGN function_align +.transpose_16x16: + punpckhdq m8, m3, m1 + punpckldq m3, m1 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + punpckhdq m2, m7, m5 + 
punpckldq m7, m5 + punpckhdq m5, m4, m6 + punpckldq m4, m6 + punpckhqdq m6, m0, m4 + punpcklqdq m0, m4 + punpckhqdq m4, m1, m5 + punpcklqdq m1, m5 + punpckhqdq m5, m7, m3 + punpcklqdq m7, m3 + punpckhqdq m3, m2, m8 + punpcklqdq m2, m8 + vinserti128 m8, m0, xm7, 1 + vperm2i128 m12, m0, m7, 0x31 + vinserti128 m9, m6, xm5, 1 + vperm2i128 m13, m6, m5, 0x31 + vinserti128 m10, m1, xm2, 1 + vperm2i128 m14, m1, m2, 0x31 + vinserti128 m11, m4, xm3, 1 + vperm2i128 m15, m4, m3, 0x31 + mova m0, [r4+32*3] + mova m1, [r4+32*2] + mova m2, [r4+32*1] + mova m3, [r4+32*0] + mova m4, [r4-32*1] + mova m5, [r4-32*2] + mova m6, [r4-32*3] + mova m7, [r4-32*4] + mova [rsp+gprsize], m15 + jmp m(vvc_inv_dct2_dct2_8x32_10).transpose +ALIGN function_align +.main: + vpbroadcastd m14, [vvc_pd_64] + vpbroadcastd m11, [vvc_pd_2048] + pmulld m0, m14, [cq+64* 1] + pmulld m1, m14, [cq+64* 7] + pmulld m2, m14, [cq+64* 9] + pmulld m3, m14, [cq+64*15] + pmulld m4, m14, [cq+64*17] + pmulld m5, m14, [cq+64*23] + pmulld m6, m14, [cq+64*25] + pmulld m7, m14, [cq+64*31] + call m(vvc_inv_dct2_dct2_8x32_10).main_oddhalf_part1_rect2 + pmulld m0, m14, [cq+64* 3] + pmulld m1, m14, [cq+64* 5] + pmulld m2, m14, [cq+64*11] + pmulld m3, m14, [cq+64*13] + pmulld m4, m14, [cq+64*19] + pmulld m5, m14, [cq+64*21] + pmulld m6, m14, [cq+64*27] + pmulld m7, m14, [cq+64*29] + call m(vvc_inv_dct2_dct2_8x32_10).main_oddhalf_part2_rect2 + pmulld m0, m14, [cq+64* 2] + pmulld m1, m14, [cq+64* 6] + pmulld m2, m14, [cq+64*10] + pmulld m3, m14, [cq+64*14] + pmulld m4, m14, [cq+64*18] + pmulld m5, m14, [cq+64*22] + pmulld m6, m14, [cq+64*26] + pmulld m7, m14, [cq+64*30] + call m(idct2_8x16_internal_10).main_oddhalf_rect2 + pmulld m0, m14, [cq+64* 0] + pmulld m1, m14, [cq+64* 4] + pmulld m2, m14, [cq+64* 8] + pmulld m3, m14, [cq+64*12] + pmulld m4, m14, [cq+64*16] + pmulld m5, m14, [cq+64*20] + pmulld m6, m14, [cq+64*24] + pmulld m7, m14, [cq+64*28] + call m(idct2_8x8_internal_10).main_rect2 + call 
m(idct2_8x16_internal_10).main_evenhalf + pxor m8, m8 + mov r7d, 64*30 +.main_zero_loop: + mova [cq+r7-64*2], m8 + mova [cq+r7-64*1], m8 + mova [cq+r7+64*0], m8 + mova [cq+r7+64*1], m8 + sub r7d, 64*4 + jg .main_zero_loop +.main_end: + psrld m11, 11 ; vvc_pd_1 + IDCT2_32_END 0, 15, 8, 9, 10, 1 + IDCT2_32_END 1, 14, 8, 9, 10, 1 + punpckhwd m8, m0, m1 ; 16 17 + punpcklwd m0, m1 ; 0 1 + punpcklwd m1, m14, m15 ; 14 15 + punpckhwd m14, m15 ; 30 31 + mova [r5+32*3], m8 + mova [r5+32*2], m14 + IDCT2_32_END 2, 15, 8, 9, 10, 1 + IDCT2_32_END 3, 14, 8, 9, 10, 1 + punpckhwd m8, m2, m3 ; 18 19 + punpcklwd m2, m3 ; 2 3 + punpcklwd m3, m14, m15 ; 12 13 + punpckhwd m14, m15 ; 28 29 + mova [r5+32*1], m8 + mova [r5+32*0], m14 + IDCT2_32_END 4, 15, 8, 9, 10, 1 + IDCT2_32_END 5, 14, 8, 9, 10, 1 + punpckhwd m8, m4, m5 ; 20 21 + punpcklwd m4, m5 ; 4 5 + punpcklwd m5, m14, m15 ; 10 11 + punpckhwd m14, m15 ; 26 27 + mova [r5-32*1], m8 + mova [r5-32*2], m14 + IDCT2_32_END 6, 15, 8, 9, 10, 1 + IDCT2_32_END 7, 14, 8, 9, 10, 1 + punpckhwd m8, m6, m7 ; 22 23 + punpcklwd m6, m7 ; 6 7 + punpcklwd m7, m14, m15 ; 8 9 + punpckhwd m14, m15 ; 24 25 + mova [r5-32*3], m8 + mova [r5-32*4], m14 + ret +ALIGN function_align +.write_16x16: + mova m1, [rsp+gprsize+32*1] + mova [rsp+gprsize+32*0], m8 + mova [rsp+gprsize+32*1], m9 + mova [rsp+gprsize+32*2], m12 + vpbroadcastd m12, [vvc_pw_2048] + vpbroadcastd m9, [pixel_10_max] + lea r3, [strideq*3] + pxor m8, m8 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + pmulhrsw m2, m12 + pmulhrsw m3, m12 + call m(idct2_16x8_internal_10).write_16x4 + pmulhrsw m0, m12, m4 + pmulhrsw m1, m12, m5 + pmulhrsw m2, m12, m6 + pmulhrsw m3, m12, m7 + call m(idct2_16x8_internal_10).write_16x4 + pmulhrsw m0, m12, [rsp+gprsize+32*0] + pmulhrsw m1, m12, [rsp+gprsize+32*1] + pmulhrsw m2, m12, m10 + pmulhrsw m3, m12, m11 + call m(idct2_16x8_internal_10).write_16x4 + pmulhrsw m0, m12, [rsp+gprsize+32*2] + pmulhrsw m1, m12, m13 + pmulhrsw m2, m12, m14 + pmulhrsw m3, m12, m15 + jmp 
m(idct2_16x8_internal_10).write_16x4 + +cglobal vvc_inv_identity_identity_32x16_10, 4, 7, 11, dst, stride, c, eob + vpbroadcastd m7, [pixel_10_max] +.pass1: + vpbroadcastd m8, [vvc_pw_64x8] + vpbroadcastd m9, [vvc_pw_1697x16] + vpbroadcastd m10, [vvc_pw_4096] + lea r6, [strideq*5] + pxor m6, m6 + mov r5, dstq + call .main + sub eobd, 36 + jl .ret + add cq, 32 + lea dstq, [dstq+strideq*4] + call .main + add cq, 64*8-32 + lea dstq, [r5+16*1] + call .main + sub eobd, 107 ; eob < 143 + jl .ret + add cq, 32 + lea dstq, [dstq+strideq*4] + call .main + add cq, 64*8-32 + lea dstq, [r5+16*2] + call .main + sub eobd, 128 ; eob < 271 + jl .ret + add cq, 32 + lea dstq, [dstq+strideq*4] + call .main + add cq, 64*8-32 + lea dstq, [r5+16*3] + call .main + sub eobd, 128 ; eob < 399 + jl .ret + add cq, 32 + lea dstq, [dstq+strideq*4] + call .main +.ret: + RET +ALIGN function_align +.main: + mova m0, [cq+64*0] + packssdw m0, [cq+64*1] + mova m1, [cq+64*2] + packssdw m1, [cq+64*3] + mova m2, [cq+64*4] + packssdw m2, [cq+64*5] + mova m3, [cq+64*6] + packssdw m3, [cq+64*7] + REPX {pmulhrsw x, m8 }, m0, m1, m2, m3 + REPX {paddsw x, x }, m0, m1, m2, m3 + REPX {IDTX16 x, 4, 9, _ }, 0, 1, 2, 3 + REPX {pmulhrsw x, m10}, m0, m1, m2, m3 + REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 + jmp m(vvc_inv_identity_identity_16x32_10).main2 + +cglobal vvc_inv_identity_identity_32x16_12, 4, 7, 11, dst, stride, c, eob + vpbroadcastd m7, [pixel_12_max] + jmp m(vvc_inv_identity_identity_32x16_10).pass1 + +cglobal vvc_inv_dct2_dct2_32x32_10, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 8, 16, 32*38, dst, stride, c, eob +%undef cmp + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + lea r6, [rsp+32*7] + call .main + cmp eobd, 36 + jl .fast + call .main + cmp eobd, 136 + jl .fast + call .main + cmp eobd, 300 + jl .fast + call .main + jmp .pass2 +.dconly: + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10] + mov [cq], eobd ; 0 + or r3d, 32 + jmp 
m(vvc_inv_dct2_dct2_32x8_10).dconly +.fast: + lea r4, [rsp+32*71] + pxor m0, m0 +.fast_loop: + REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 + add r6, 32*8 + cmp r6, r4 + jl .fast_loop +.pass2: + lea r3, [rsp+32*3] + mov r4, r6 + lea r5, [r6+32*8] + lea r6, [vvc_pw_5+128] + call .pass2_oddhalf + call .pass2_evenhalf + imul r2, strideq, 19 + lea r3, [strideq*3] + add r2, dstq + call m(vvc_inv_dct2_dct2_16x32_10).pass2_end + sub dstq, r3 + lea r2, [r2+r3+32] + add dstq, 32 + lea r3, [rsp+32*11] + call .pass2_oddhalf + call .pass2_evenhalf + lea r3, [strideq*3] + call m(vvc_inv_dct2_dct2_16x32_10).pass2_end + RET +ALIGN function_align +.main: + mova m0, [cq+128* 1] + mova m1, [cq+128* 7] + mova m2, [cq+128* 9] + mova m3, [cq+128*15] + mova m4, [cq+128*17] + mova m5, [cq+128*23] + mova m6, [cq+128*25] + mova m7, [cq+128*31] + vpbroadcastd m11, [vvc_pd_2048] + vpbroadcastd m14, [vvc_pd_64] + call m(vvc_inv_dct2_dct2_8x32_10).main_oddhalf_part1 + mova m0, [cq+128* 3] + mova m1, [cq+128* 5] + mova m2, [cq+128*11] + mova m3, [cq+128*13] + mova m4, [cq+128*19] + mova m5, [cq+128*21] + mova m6, [cq+128*27] + mova m7, [cq+128*29] + call m(vvc_inv_dct2_dct2_8x32_10).main_oddhalf_part2 + mova m0, [cq+128* 2] + mova m1, [cq+128* 6] + mova m2, [cq+128*10] + mova m3, [cq+128*14] + mova m4, [cq+128*18] + mova m5, [cq+128*22] + mova m6, [cq+128*26] + mova m7, [cq+128*30] + call m(idct2_8x16_internal_10).main_oddhalf + mova m0, [cq+128* 0] + mova m1, [cq+128* 4] + mova m2, [cq+128* 8] + mova m3, [cq+128*12] + mova m4, [cq+128*16] + mova m5, [cq+128*20] + mova m6, [cq+128*24] + mova m7, [cq+128*28] + call m(idct2_8x8_internal_10).main + call m(idct2_8x16_internal_10).main_evenhalf + call m(vvc_inv_dct2_dct2_8x32_10).main_end + pxor m15, m15 + mov r7d, 128*29 +.main_zero_loop: + mova [cq+r7-128*1], m15 + mova [cq+r7+128*0], m15 + mova [cq+r7+128*1], m15 + mova [cq+r7+128*2], m15 + sub r7d, 128*4 + jg .main_zero_loop + add cq, 32 + mova [r4-32*4], m0 + mova [r4-32*3], m1 + mova 
[r4-32*2], m2 + mova [r4-32*1], m3 + mova [r4+32*0], m4 + mova [r4+32*1], m5 + mova [r4+32*2], m6 + mova [r4+32*3], m7 + mova m0, [r5+32*3] + mova m1, [r5+32*2] + mova m2, [r5+32*1] + mova m3, [r5+32*0] + mova m4, [r5-32*1] + mova m5, [r5-32*2] + mova m6, [r5-32*3] + mova m7, [r5-32*4] + call m(vvc_inv_dct2_dct2_8x32_10).transpose + mova [r5-32*4], m0 + mova [r5-32*3], m1 + mova [r5-32*2], m2 + mova [r5-32*1], m3 + mova [r5+32*0], m4 + mova [r5+32*1], m5 + mova [r5+32*2], m6 + mova [r5+32*3], m7 + ret +ALIGN function_align +.pass2_oddhalf: + mova m0, [r3+32* 1] ; 1 + mova m1, [r3+32* 3] ; 3 + mova m2, [r3+32* 5] ; 5 + mova m3, [r3+32* 7] ; 7 + mova m4, [r3+32*17] ; 9 + mova m5, [r3+32*19] ; 11 + mova m6, [r3+32*21] ; 13 + mova m7, [r3+32*23] ; 15 + mova m8, [r3+32*33] ; 17 + mova m9, [r3+32*35] ; 19 + mova m10, [r3+32*37] ; 21 + mova m11, [r3+32*39] ; 23 + mova m12, [r3+32*49] ; 25 + mova m13, [r3+32*51] ; 27 + mova m14, [r3+32*53] ; 29 + mova m15, [r3+32*55] ; 31 + jmp m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf +ALIGN function_align +.pass2_evenhalf: + mova m0, [r3+32* 0] ; 0 + mova m1, [r3+32* 2] ; 2 + mova m2, [r3+32* 4] ; 4 + mova m3, [r3+32* 6] ; 6 + mova m4, [r3+32*16] ; 8 + mova m5, [r3+32*18] ; 10 + mova m6, [r3+32*20] ; 12 + mova m7, [r3+32*22] ; 14 + mova m8, [r3+32*32] ; 16 + mova m9, [r3+32*34] ; 18 + mova m10, [r3+32*36] ; 20 + mova m11, [r3+32*38] ; 22 + mova m12, [r3+32*48] ; 24 + mova m13, [r3+32*50] ; 26 + mova m14, [r3+32*52] ; 28 + mova m15, [r3+32*54] ; 30 + mova [rsp+gprsize], m15 + jmp m(idct2_16x16_internal_8).main + +cglobal vvc_inv_identity_identity_32x32_10, 4, 8, 8, dst, stride, c, eob +%undef cmp + vpbroadcastd m7, [pixel_10_max] +.pass1: + vpbroadcastd m5, [vvc_pw_8192] + pxor m6, m6 + lea r6, [strideq*3] + lea r5, [strideq*5] + lea r4, [strideq+r6*2] ; strideq*7 + call .main ; 0 + cmp eobd, 36 + jl .ret + add cq, 128*8 ; 0 1 + mov r7, dstq ; 1 + add dstq, 16 + call .main + call .main2 + cmp eobd, 136 + jl .ret + add cq, 128*16-32 ; 0 1 
2 + lea dstq, [r7+16*2] ; 1 2 + call .main ; 2 + call .main2 + call .main2 + cmp eobd, 300 + jl .ret + add cq, 128*24-64 ; 0 1 2 3 + add r7, 16*3 ; 1 2 3 + mov dstq, r7 ; 2 3 + call .main ; 3 + call .main2 + call .main2 + call .main2 + cmp eobd, 535 + jl .ret + add cq, 128*24-64 ; 0 1 2 3 + lea dstq, [r7+strideq*8] ; 1 2 3 4 + mov r7, dstq ; 2 3 4 + call .main ; 3 4 + call .main2 + call .main2 + cmp eobd, 755 + jl .ret + add cq, 128*16-32 ; 0 1 2 3 + lea dstq, [r7+strideq*8] ; 1 2 3 4 + call .main ; 2 3 4 5 + call .main2 ; 3 4 5 + cmp eobd, 911 + jl .ret + add cq, 128*8 ; 0 1 2 3 + add dstq, 16 ; 1 2 3 4 + call .main ; 2 3 4 5 +.ret: ; 3 4 5 6 + RET +ALIGN function_align +.main2: + sub cq, 128*8-32 + lea dstq, [dstq+strideq*8-16] +.main: + mova m0, [cq+128*0] + packssdw m0, [cq+128*1] + mova m1, [cq+128*2] + packssdw m1, [cq+128*3] + mova m2, [cq+128*4] + packssdw m2, [cq+128*5] + mova m3, [cq+128*6] + packssdw m3, [cq+128*7] + REPX {pmulhrsw x, m5}, m0, m1, m2, m3 + jmp m(vvc_inv_identity_identity_8x32_10).main_zero + +cglobal vvc_inv_identity_identity_32x32_12, 4, 8, 8, dst, stride, c, eob + vpbroadcastd m7, [pixel_12_max] + jmp m(vvc_inv_identity_identity_32x32_10).pass1 + +%macro IDCT2_64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4]) +%if %1 & 1 + mova m%5, [r5-32*(51-%1)] ; idct2_16 out 0+n + mova m%4, [r4-32*(14+%1)] ; idct2_32 out31-n +%else + mova m%5, [r4-32*(45-%1)] + mova m%4, [r5-32*(20+%1)] +%endif + paddsw m%6, m%5, m%4 ; idct2_32 out 0+n + psubsw m%5, m%4 ; idct2_32 out31-n + paddsw m%4, m%5, m%3 ; out31-n + psubsw m%5, m%3 ; out32+n + paddsw m%3, m%6, m%2 ; out 0+n + psubsw m%6, m%2 ; out63-n + REPX {pmulhrsw x, m14}, m%5, m%6, m%4, m%3 +%if %1 & 1 + %define %%d0 r2 + %define %%d1 dstq +%else + %define %%d0 dstq + %define %%d1 r2 +%endif + paddw m%3, [%%d0+%7 ] + paddw m%4, [%%d1+%8 ] + paddw m%5, [%%d0+%9 ] + paddw m%6, [%%d1+%10] + pxor m%2, m%2 + REPX {pmaxsw x, m%2}, m%3, m%4, m%5, m%6 + vpbroadcastd m%2, [pixel_10_max] + REPX {pminsw 
x, m%2}, m%3, m%4, m%5, m%6 + mova [%%d0+%7 ], m%3 + mova [%%d1+%8 ], m%4 + mova [%%d0+%9 ], m%5 + mova [%%d1+%10], m%6 +%endmacro + +cglobal vvc_inv_dct2_dct2_16x64_10, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 10, 16, 32*98, dst, stride, c, eob +%undef cmp + vpbroadcastd m11, [vvc_pd_2048] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + vpbroadcastd m14, [vvc_pd_64] + lea r6, [rsp+32*6] + call .main + sub eobd, 44 + jl .fast + call .main + sub eobd, 107 + jl .fast + call .main + sub eobd, 128 + jl .fast + call .main + jmp .pass2 +.dconly: + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10] + mov [cq], eobd ; 0 + or r3d, 64 + add r6d, 640 + sar r6d, 10 + jmp m(vvc_inv_dct2_dct2_16x4_10).dconly3 +.fast: + lea r4, [rsp+32*38] + pxor m0, m0 +.fast_loop: + REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 + add r6, 32*8 + cmp r6, r4 + jl .fast_loop +.pass2: + lea r6, [vvc_pw_5+128] + mova m0, [rsp+32* 2] ; in0 + mova m1, [rsp+32* 6] ; in4 + mova m2, [rsp+32*10] ; in8 + mova m3, [rsp+32*14] ; in12 + mova m4, [rsp+32*18] ; in16 + mova m5, [rsp+32*22] ; in20 + mova m6, [rsp+32*26] ; in24 + mova m7, [rsp+32*30] ; in28 + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 + call m(idct2_16x16_internal_8).main + mova m1, [rsp+32*1] + lea r4, [rsp+32*38] + mova [r4-32*4], m0 + mova [r4-32*3], m1 + mova [r4-32*2], m2 + mova [r4-32*1], m3 + mova [r4+32*0], m4 + mova [r4+32*1], m5 + mova [r4+32*2], m6 + mova [r4+32*3], m7 + add r4, 32*8 + mova [r4-32*4], m8 + mova [r4-32*3], m9 + mova [r4-32*2], m10 + mova [r4-32*1], m11 + mova [r4+32*0], m12 + mova [r4+32*1], m13 + mova [r4+32*2], m14 + mova [r4+32*3], m15 + mova m0, [rsp+32* 4] ; in2 + mova m1, [rsp+32* 8] ; in6 + mova m2, [rsp+32*12] ; in10 + mova m3, [rsp+32*16] ; in14 + mova m4, [rsp+32*20] ; in18 + mova m5, [rsp+32*24] ; in22 + mova m6, [rsp+32*28] ; in26 + mova m7, [rsp+32*32] ; in30 + lea r5, [r4+32*16] + add r4, 32*8 + call 
m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf_fast + mova m0, [rsp+32* 3] ; in1 + mova m1, [rsp+32*33] ; in31 + mova m2, [rsp+32*19] ; in17 + mova m3, [rsp+32*17] ; in15 + mova m4, [rsp+32*11] ; in9 + mova m5, [rsp+32*25] ; in23 + mova m6, [rsp+32*27] ; in25 + mova m7, [rsp+32* 9] ; in7 + lea r6, [idct2_64_mul - 8] + add r4, 32*16 + add r5, 32*32 + call m(vvc_inv_dct2_dct2_16x64_8).main_part1 + mova m0, [rsp+32* 7] ; in5 + mova m1, [rsp+32*29] ; in27 + mova m2, [rsp+32*23] ; in21 + mova m3, [rsp+32*13] ; in11 + mova m4, [rsp+32*15] ; in13 + mova m5, [rsp+32*21] ; in19 + mova m6, [rsp+32*31] ; in29 + mova m7, [rsp+32* 5] ; in3 + add r6, 8 + add r4, 32*8 + sub r5, 32*8 + call m(vvc_inv_dct2_dct2_16x64_8).main_part1 + lea r8, [strideq*4] + lea r9, [strideq*5] + lea r3, [r9+strideq*1] ; stride*6 + lea r7, [r9+strideq*2] ; stride*7 + call .main_part2_pass2 + RET +ALIGN function_align +.main: + mova m0, [cq+128* 1] + mova m1, [cq+128* 3] + mova m2, [cq+128* 5] + mova m3, [cq+128* 7] + mova m4, [cq+128* 9] + mova m5, [cq+128*11] + mova m6, [cq+128*13] + mova m7, [cq+128*15] + call m(idct2_8x16_internal_10).main_oddhalf + mova m0, [cq+128* 0] + mova m1, [cq+128* 2] + mova m2, [cq+128* 4] + mova m3, [cq+128* 6] + mova m4, [cq+128* 8] + mova m5, [cq+128*10] + mova m6, [cq+128*12] + mova m7, [cq+128*14] + call m(idct2_8x8_internal_10).main + call m(idct2_8x16_internal_10).main_evenhalf + pxor m15, m15 + mov r7d, 128*13 +.main_zero_loop: + mova [cq+r7-128*1], m15 + mova [cq+r7+128*0], m15 + mova [cq+r7+128*1], m15 + mova [cq+r7+128*2], m15 + sub r7d, 128*4 + jg .main_zero_loop + add cq, 32 + psrld m15, m11, 10 ; vvc_pd_2 + mova m8, [r6-32*4] + mova m9, [r6+32*3] + REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7 + psubd m10, m0, m8 ; out15 + paddd m0, m8 ; out0 + mova m8, [r6-32*3] + psubd m15, m7, m9 ; out8 + paddd m7, m9 ; out7 + mova m9, [r6+32*2] + REPX {psrad x, 2}, m0, m15, m10, m7 + packssdw m0, m15 + packssdw m7, m10 + psubd m10, m1, m8 ; out14 + paddd m1, m8 ; out1 + 
mova m8, [r6-32*2] + psubd m15, m6, m9 ; out9 + paddd m6, m9 ; out6 + mova m9, [r6+32*1] + REPX {psrad x, 2}, m1, m15, m10, m6 + packssdw m1, m15 + packssdw m6, m10 + psubd m10, m2, m8 ; out13 + paddd m2, m8 ; out2 + mova m8, [r6-32*1] + psubd m15, m5, m9 ; out10 + paddd m5, m9 ; out5 + mova m9, [r6+32*0] + REPX {psrad x, 2}, m2, m15, m10, m5 + packssdw m2, m15 + packssdw m5, m10 + psubd m10, m3, m8 ; out12 + paddd m3, m8 ; out3 + psubd m15, m4, m9 ; out11 + paddd m4, m9 ; out4 + REPX {psrad x, 2}, m3, m15, m10, m4 + packssdw m3, m15 + packssdw m4, m10 + call m(idct2_16x8_internal_10).transpose3 + mova [r6-32*4], m0 + mova [r6-32*3], m1 + mova [r6-32*2], m2 + mova [r6-32*1], m3 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova [r6+32*2], m6 + mova [r6+32*3], m7 + add r6, 32*8 + ret +.main_part2_pass2: + vpbroadcastd m11, [vvc_pw_36_83] + vpbroadcastd m12, [vvc_pw_m83_36] + vpbroadcastd m13, [vvc_pw_64_64] + lea r6, [vvc_pw_5+128] + lea r2, [dstq+r7] +.main_part2_pass2_loop: + vpbroadcastd m14, [vvc_pw_m64_64] + call m(vvc_inv_dct2_dct2_16x64_8).main_part2_internal + vpbroadcastd m14, [vvc_pw_2048] + IDCT2_64_PART2_END 0, 7, 0, 6, 9, 10, strideq*0, r3*4, r8*8, r7*8 + IDCT2_64_PART2_END 7, 8, 5, 0, 6, 7, strideq*0, r3*4, r8*8, r7*8 + IDCT2_64_PART2_END 8, 2, 1, 0, 6, 7, strideq*8, r8*4, r9*8, r3*8 + IDCT2_64_PART2_END 15, 3, 4, 0, 6, 7, strideq*8, r8*4, r9*8, r3*8 + add dstq, strideq + sub r2, strideq + cmp r4, r5 + jne .main_part2_pass2_loop + ret +ALIGN function_align +.main_part1_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 +.main_part1: ; idct2_64 steps 1-5 + ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a + ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a + ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a + ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a + vpbroadcastd m7, [r5+4*0] + vpbroadcastd m8, [r5+4*1] + vpbroadcastd m6, [r5+4*2] + vpbroadcastd m9, [r5+4*3] + vpbroadcastd m5, [r5+4*4] + vpbroadcastd m10, [r5+4*5] +
vpbroadcastd m4, [r5+4*6] + vpbroadcastd m15, [r5+4*7] + pmulld m7, m0 ; t63a + pmulld m0, m8 ; t32a + pmulld m6, m1 ; t62a + pmulld m1, m9 ; t33a + pmulld m5, m2 ; t61a + pmulld m2, m10 ; t34a + pmulld m4, m3 ; t60a + pmulld m3, m15 ; t35a + vpbroadcastd m10, [r5+4*8] + vpbroadcastd m15, [r5+4*9] + REPX {paddd x, m11}, m7, m0, m6, m1, m5, m2, m4, m3 + REPX {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4 + psubd m8, m0, m1 ; t33 + paddd m0, m1 ; t32 + psubd m1, m7, m6 ; t62 + paddd m7, m6 ; t63 + psubd m6, m3, m2 ; t34 + paddd m3, m2 ; t35 + psubd m2, m4, m5 ; t61 + paddd m4, m5 ; t60 + REPX {pmaxsd x, m12}, m8, m1, m6, m2 + REPX {pminsd x, m13}, m8, m1, m6, m2 + ITX_MULSUB_2D 1, 8, 5, 9, _, 11, 10, 15, 0xc ; t33a, t62a + ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 0xe ; t61a, t34a + REPX {pmaxsd x, m12}, m0, m3, m7, m4 + REPX {pminsd x, m13}, m0, m3, m7, m4 + vpbroadcastd m10, [r5+4*10] + vpbroadcastd m15, [r5+4*11] + psubd m5, m0, m3 ; t35a + paddd m0, m3 ; t32a + psubd m3, m7, m4 ; t60a + paddd m7, m4 ; t63a + psubd m4, m1, m6 ; t34 + paddd m1, m6 ; t33 + psubd m6, m8, m2 ; t61 + paddd m8, m2 ; t62 + REPX {pmaxsd x, m12}, m5, m3, m4, m6 + REPX {pminsd x, m13}, m5, m3, m4, m6 + ITX_MULSUB_2D 3, 5, 2, 9, _, 11, 10, 15, 0xc ; t35, t60 + ITX_MULSUB_2D 6, 4, 2, 9, _, 11, 10, 15, 0xc ; t34a, t61a + REPX {pmaxsd x, m12}, m0, m7, m1, m8 + REPX {pminsd x, m13}, m0, m7, m1, m8 + add r5, 4*12 + mova [r6-32*4], m0 + mova [r6+32*3], m7 + mova [r6-32*3], m1 + mova [r6+32*2], m8 + mova [r6-32*2], m6 + mova [r6+32*1], m4 + mova [r6-32*1], m3 + mova [r6+32*0], m5 + add r6, 32*8 + ret +.main_part2: ; idct2_64 steps 6-9 + lea r5, [r6+32*3] + sub r6, 32*4 + vpbroadcastd m10, [vvc_pd_36] + vpbroadcastd m15, [vvc_pd_83] +.main_part2_loop: + mova m0, [r6-32*32] ; t32a + mova m1, [r5-32*24] ; t39a + mova m2, [r5-32*32] ; t63a + mova m3, [r6-32*24] ; t56a + mova m4, [r6-32*16] ; t40a + mova m5, [r5-32* 8] ; t47a + mova m6, [r5-32*16] ; t55a + mova m7, [r6-32* 8] ; t48a + psubd m8, m0, m1 ; 
t39 + paddd m0, m1 ; t32 + psubd m1, m2, m3 ; t56 + paddd m2, m3 ; t63 + psubd m3, m5, m4 ; t40 + paddd m5, m4 ; t47 + psubd m4, m7, m6 ; t55 + paddd m7, m6 ; t48 + REPX {pmaxsd x, m12}, m8, m1, m3, m4 + REPX {pminsd x, m13}, m8, m1, m3, m4 + ITX_MULSUB_2D 1, 8, 6, 9, _, 11, 10, 15, 0xc ; t39a, t56a + ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 0xe ; t55a, t40a + REPX {pmaxsd x, m12}, m0, m2, m5, m7 + REPX {pminsd x, m13}, m0, m5, m2, m7 + psubd m6, m2, m7 ; t48a + paddd m2, m7 ; t63a + psubd m7, m0, m5 ; t47a + paddd m0, m5 ; t32a + psubd m5, m8, m4 ; t55 + paddd m8, m4 ; t56 + psubd m4, m1, m3 ; t40 + paddd m1, m3 ; t39 + REPX {pmaxsd x, m12}, m6, m7, m5, m4 + REPX {pminsd x, m13}, m6, m7, m5, m4 + REPX {pmulld x, m14}, m6, m7, m5, m4 + REPX {pmaxsd x, m12}, m2, m0, m8, m1 + REPX {pminsd x, m13}, m2, m0, m8, m1 + paddd m6, m11 + paddd m5, m11 + psubd m3, m6, m7 ; t47 + paddd m6, m7 ; t48 + psubd m7, m5, m4 ; t40a + paddd m5, m4 ; t55a + REPX {psrad x, 12}, m3, m6, m7, m5 + mova [r5-32* 8], m2 + mova [r6-32*32], m0 + mova [r6-32* 8], m8 + mova [r5-32*32], m1 + mova [r5-32*24], m3 + mova [r6-32*16], m6 + mova [r6-32*24], m7 + mova [r5-32*16], m5 + add r6, 32 + sub r5, 32 + cmp r6, r5 + jl .main_part2_loop + ret + +cglobal vvc_inv_dct2_dct2_32x64_10, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 11, 16, 32*134, dst, stride, c, eob +%undef cmp + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + lea r6, [rsp+32*6] + call .main + cmp eobd, 36 + jl .fast + call .main + cmp eobd, 136 + jl .fast + call .main + cmp eobd, 300 + jl .fast + call .main + jmp .pass2 +.dconly: + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10] + mov [cq], eobd ; 0 + or r3d, 64 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 384 + sar r6d, 9 + jmp m(vvc_inv_dct2_dct2_32x8_10).dconly2 +.fast: + lea r4, [rsp+32*70] + pxor m0, m0 +.fast_loop: + REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 + add r6, 32*8 + cmp r6, r4 + jl .fast_loop 
+.pass2: + lea r6, [vvc_pw_5 + 128] + mov r10, rsp + lea r8, [strideq*4] + lea r9, [strideq*5] + lea r3, [r9+strideq*1] ; stride*6 + lea r7, [r9+strideq*2] ; stride*7 +.pass2_loop: + mova m0, [r10+32* 2] ; in0 + mova m1, [r10+32* 6] ; in4 + mova m2, [r10+32*18] ; in8 + mova m3, [r10+32*22] ; in12 + mova m4, [r10+32*34] ; in16 + mova m5, [r10+32*38] ; in20 + mova m6, [r10+32*50] ; in24 + mova m7, [r10+32*54] ; in28 + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 + call m(idct2_16x16_internal_8).main + mova m1, [rsp+32*1] + lea r4, [rsp+32*70] + mova [r4-32*4], m0 + mova [r4-32*3], m1 + mova [r4-32*2], m2 + mova [r4-32*1], m3 + mova [r4+32*0], m4 + mova [r4+32*1], m5 + mova [r4+32*2], m6 + mova [r4+32*3], m7 + add r4, 32*8 + mova [r4-32*4], m8 + mova [r4-32*3], m9 + mova [r4-32*2], m10 + mova [r4-32*1], m11 + mova [r4+32*0], m12 + mova [r4+32*1], m13 + mova [r4+32*2], m14 + mova [r4+32*3], m15 + mova m0, [r10+32* 4] ; in2 + mova m1, [r10+32* 8] ; in6 + mova m2, [r10+32*20] ; in10 + mova m3, [r10+32*24] ; in14 + mova m4, [r10+32*36] ; in18 + mova m5, [r10+32*40] ; in22 + mova m6, [r10+32*52] ; in26 + mova m7, [r10+32*56] ; in30 + lea r5, [r4+32*16] + add r4, 32*8 + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf_fast + mova m0, [r10+32* 3] ; in1 + mova m1, [r10+32*57] ; in31 + mova m2, [r10+32*35] ; in17 + mova m3, [r10+32*25] ; in15 + mova m4, [r10+32*19] ; in9 + mova m5, [r10+32*41] ; in23 + mova m6, [r10+32*51] ; in25 + mova m7, [r10+32* 9] ; in7 + lea r6, [idct2_64_mul - 8] + add r4, 32*16 + add r5, 32*32 + call m(vvc_inv_dct2_dct2_16x64_8).main_part1 + mova m0, [r10+32* 7] ; in5 + mova m1, [r10+32*53] ; in27 + mova m2, [r10+32*39] ; in21 + mova m3, [r10+32*21] ; in11 + mova m4, [r10+32*23] ; in13 + mova m5, [r10+32*37] ; in19 + mova m6, [r10+32*55] ; in29 + mova m7, [r10+32* 5] ; in3 + add r6, 8 + add r4, 32*8 + sub r5, 32*8 + call m(vvc_inv_dct2_dct2_16x64_8).main_part1 + call m(vvc_inv_dct2_dct2_16x64_10).main_part2_pass2 + add 
r10, 32*8 + sub r4, 32*98 ; rsp+32*16 + sub dstq, r8 + add dstq, 32 + cmp r10, r4 + jl .pass2_loop + RET +ALIGN function_align +.main: + vpbroadcastd m14, [vvc_pd_64] + vpbroadcastd m11, [vvc_pd_2048] + pmulld m0, m14, [cq+128* 1] + pmulld m1, m14, [cq+128* 7] + pmulld m2, m14, [cq+128* 9] + pmulld m3, m14, [cq+128*15] + pmulld m4, m14, [cq+128*17] + pmulld m5, m14, [cq+128*23] + pmulld m6, m14, [cq+128*25] + pmulld m7, m14, [cq+128*31] + call m(vvc_inv_dct2_dct2_8x32_10).main_oddhalf_part1_rect2 + pmulld m0, m14, [cq+128* 3] + pmulld m1, m14, [cq+128* 5] + pmulld m2, m14, [cq+128*11] + pmulld m3, m14, [cq+128*13] + pmulld m4, m14, [cq+128*19] + pmulld m5, m14, [cq+128*21] + pmulld m6, m14, [cq+128*27] + pmulld m7, m14, [cq+128*29] + call m(vvc_inv_dct2_dct2_8x32_10).main_oddhalf_part2_rect2 + pmulld m0, m14, [cq+128* 2] + pmulld m1, m14, [cq+128* 6] + pmulld m2, m14, [cq+128*10] + pmulld m3, m14, [cq+128*14] + pmulld m4, m14, [cq+128*18] + pmulld m5, m14, [cq+128*22] + pmulld m6, m14, [cq+128*26] + pmulld m7, m14, [cq+128*30] + call m(idct2_8x16_internal_10).main_oddhalf_rect2 + pmulld m0, m14, [cq+128* 0] + pmulld m1, m14, [cq+128* 4] + pmulld m2, m14, [cq+128* 8] + pmulld m3, m14, [cq+128*12] + pmulld m4, m14, [cq+128*16] + pmulld m5, m14, [cq+128*20] + pmulld m6, m14, [cq+128*24] + pmulld m7, m14, [cq+128*28] + pxor m15, m15 + mov r7d, 128*29 +.main_zero_loop: + mova [cq+r7-128*1], m15 + mova [cq+r7+128*0], m15 + mova [cq+r7+128*1], m15 + mova [cq+r7+128*2], m15 + sub r7d, 128*4 + jg .main_zero_loop + add cq, 32 + call m(idct2_8x8_internal_10).main_rect2 + call m(idct2_8x16_internal_10).main_evenhalf + call m(vvc_inv_dct2_dct2_32x16_10).main_end + call m(vvc_inv_dct2_dct2_8x32_10).transpose + mova [r4-32*4], m0 + mova [r4-32*3], m1 + mova [r4-32*2], m2 + mova [r4-32*1], m3 + mova [r4+32*0], m4 + mova [r4+32*1], m5 + mova [r4+32*2], m6 + mova [r4+32*3], m7 + mova m0, [r5+32*3] + mova m1, [r5+32*2] + mova m2, [r5+32*1] + mova m3, [r5+32*0] + mova m4, [r5-32*1] + 
mova m5, [r5-32*2] + mova m6, [r5-32*3] + mova m7, [r5-32*4] + call m(vvc_inv_dct2_dct2_8x32_10).transpose + mova [r5-32*4], m0 + mova [r5-32*3], m1 + mova [r5-32*2], m2 + mova [r5-32*1], m3 + mova [r5+32*0], m4 + mova [r5+32*1], m5 + mova [r5+32*2], m6 + mova [r5+32*3], m7 + ret + +cglobal vvc_inv_dct2_dct2_64x16_10, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jnz .normal + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 16 +.dconly: + add r6d, 640 + sar r6d, 10 +.dconly2: + vpbroadcastd m5, [dconly_10] + imul r6d, 181 + add r6d, 2176 + sar r6d, 12 + movd xm0, r6d + paddsw xm0, xm5 + vpbroadcastw m0, xm0 +.dconly_loop: + paddsw m1, m0, [dstq+32*0] + paddsw m2, m0, [dstq+32*1] + paddsw m3, m0, [dstq+32*2] + paddsw m4, m0, [dstq+32*3] + REPX {psubusw x, m5}, m1, m2, m3, m4 + mova [dstq+32*0], m1 + mova [dstq+32*1], m2 + mova [dstq+32*2], m3 + mova [dstq+32*3], m4 + add dstq, strideq + dec r3d + jg .dconly_loop + RET +.normal: + PROLOGUE 0, 8, 16, 32*96, dst, stride, c, eob +%undef cmp + vpbroadcastd m11, [vvc_pd_2048] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + vpbroadcastd m14, [vvc_pd_64] + lea r6, [rsp+32*4] + call .main + call .shift_transpose + cmp eobd, 36 + jl .fast + call .main + call .shift_transpose + jmp .pass2 +.fast: + pxor m0, m0 + mov r3d, 4 +.fast_loop: + REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 + add r6, 32*8 + dec r3d + jg .fast_loop +.pass2: + lea r7, [r6-32*64] + lea r4, [r6-32*32] + lea r6, [vvc_pw_5+128] + mov r5, dstq +.pass2_loop: + mova m0, [r7-32*4] + mova m1, [r7-32*3] + mova m2, [r7-32*2] + mova m3, [r7-32*1] + mova m4, [r7+32*0] + mova m5, [r7+32*1] + mova m6, [r7+32*2] + mova m7, [r7+32*3] + add r7, 32*32 + mova m8, [r7-32*4] + mova m9, [r7-32*3] + mova m10, [r7-32*2] + mova m11, [r7-32*1] + mova m12, [r7+32*0] + mova m13, [r7+32*1] + mova m14, [r7+32*2] + mova m15, [r7+32*3] + sub r7, 32*24 + mova [rsp], m15 + call m(idct2_16x16_internal_8).main + mova m1, [rsp+32*1] + call 
m(vvc_inv_dct2_dct2_32x16_10).write_16x16 + add r5, 32 + mov dstq, r5 + cmp r7, r4 + jl .pass2_loop + RET +ALIGN function_align +.main: + lea r5, [idct2_64_mul_16] + mova m0, [cq+64* 1] + mova m1, [cq+64*31] + mova m2, [cq+64*17] + mova m3, [cq+64*15] + call m(vvc_inv_dct2_dct2_16x64_10).main_part1 + mova m0, [cq+64* 7] + mova m1, [cq+64*25] + mova m2, [cq+64*23] + mova m3, [cq+64* 9] + call m(vvc_inv_dct2_dct2_16x64_10).main_part1 + mova m0, [cq+64* 5] + mova m1, [cq+64*27] + mova m2, [cq+64*21] + mova m3, [cq+64*11] + call m(vvc_inv_dct2_dct2_16x64_10).main_part1 + mova m0, [cq+64* 3] + mova m1, [cq+64*29] + mova m2, [cq+64*19] + mova m3, [cq+64*13] + call m(vvc_inv_dct2_dct2_16x64_10).main_part1 + call m(vvc_inv_dct2_dct2_16x64_10).main_part2 + mova m0, [cq+64* 2] + mova m1, [cq+64*14] + mova m2, [cq+64*18] + mova m3, [cq+64*30] + call m(vvc_inv_dct2_dct2_8x32_10).main_oddhalf_part1_fast + mova m0, [cq+64* 6] + mova m1, [cq+64*10] + mova m2, [cq+64*22] + mova m3, [cq+64*26] + call m(vvc_inv_dct2_dct2_8x32_10).main_oddhalf_part2_fast + mova m0, [cq+64* 4] + mova m1, [cq+64*12] + mova m2, [cq+64*20] + mova m3, [cq+64*28] + call m(idct2_8x16_internal_10).main_oddhalf_fast + mova m0, [cq+64* 0] + mova m1, [cq+64* 8] + mova m2, [cq+64*16] + mova m3, [cq+64*24] + pxor m15, m15 + mov r7d, 64*30 +.main_zero_loop: + mova [cq+r7-64*2], m15 + mova [cq+r7-64*1], m15 + mova [cq+r7+64*0], m15 + mova [cq+r7+64*1], m15 + sub r7d, 64*4 + jg .main_zero_loop +.main_end: + psrld m15, m11, 10 ; vvc_pd_2 +.main_end2: + add cq, 32 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct2_8x8_internal_10).main + add r6, 32*8 + call m(idct2_8x16_internal_10).main_evenhalf + mova [r6+32*2], m1 + mova [r6+32*1], m2 + mova [r6+32*0], m3 + mova [r6-32*1], m4 + mova [r6-32*2], m5 + mova [r6-32*3], m6 + mova [r6-32*4], m7 + jmp .main_end_loop_start +.main_end_loop: + mova m0, [r6+32* 3] ; idct2_8 0 + n +.main_end_loop_start: + mova m1, [r5+32* 4] ; idct2_16 15 - n + mova m2, [r5-32*12] ; 
idct2_32 16 + n + mova m3, [r6-32*13] ; idct2_32 31 - n + mova m4, [r6-32*29] ; idct2_64 63 - n + mova m5, [r5-32*28] ; idct2_64 48 + n + mova m6, [r6-32*45] ; idct2_64 47 - n + mova m7, [r5-32*44] ; idct2_64 32 + n + paddd m8, m0, m1 ; idct2_16 out0 + n + psubd m0, m1 ; idct2_16 out15 - n + REPX {pmaxsd x, m12}, m8, m0 + REPX {pminsd x, m13}, m8, m0 + paddd m1, m8, m3 ; idct2_32 out0 + n + psubd m8, m3 ; idct2_32 out31 - n + paddd m3, m0, m2 ; idct2_32 out15 - n + psubd m0, m2 ; idct2_32 out16 + n + REPX {pmaxsd x, m12}, m1, m8, m3, m0 + REPX {pminsd x, m13}, m1, m3, m8, m0 + REPX {paddd x, m15}, m1, m3, m0, m8 + paddd m2, m1, m4 ; idct2_64 out0 + n (unshifted) + psubd m1, m4 ; idct2_64 out63 - n (unshifted) + paddd m4, m3, m5 ; idct2_64 out15 - n (unshifted) + psubd m3, m5 ; idct2_64 out48 + n (unshifted) + paddd m5, m0, m6 ; idct2_64 out16 + n (unshifted) + psubd m0, m6 ; idct2_64 out47 - n (unshifted) + paddd m6, m8, m7 ; idct2_64 out31 - n (unshifted) + psubd m8, m7 ; idct2_64 out32 + n (unshifted) + mova [r5-32*44], m2 + mova [r6+32* 3], m1 + mova [r6-32*45], m4 + mova [r5+32* 4], m3 + mova [r5-32*28], m5 + mova [r6-32*13], m0 + mova [r6-32*29], m6 + mova [r5-32*12], m8 + add r5, 32 + sub r6, 32 + cmp r5, r6 + jl .main_end_loop + ret +.shift_transpose: +%macro IDCT2_64_SHIFT_TRANSPOSE 1 ; shift + sub r6, 32*48 + mov r5, r6 +%%loop: + mova m0, [r6-32* 4] + mova m4, [r6+32* 4] + mova m1, [r6-32* 3] + mova m5, [r6+32* 5] + mova m2, [r6-32* 2] + mova m6, [r6+32* 6] + mova m3, [r6-32* 1] + mova m7, [r6+32* 7] + REPX {psrad x, %1}, m0, m4, m1, m5, m2, m6, m3, m7 + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + mova m4, [r6+32* 0] + mova m6, [r6+32* 8] + mova m5, [r6+32* 1] + mova m7, [r6+32* 9] + REPX {psrad x, %1}, m4, m6, m5, m7 + packssdw m4, m6 + packssdw m5, m7 + mova m6, [r6+32* 2] + mova m8, [r6+32*10] + mova m7, [r6+32* 3] + mova m9, [r6+32*11] + REPX {psrad x, %1}, m6, m8, m7, m9 + packssdw m6, m8 + packssdw m7, m9 + call 
m(idct2_16x8_internal_10).transpose3 + mova [r5-32*4], m0 + mova [r5-32*3], m1 + mova [r5-32*2], m2 + mova [r5-32*1], m3 + mova [r5+32*0], m4 + mova [r5+32*1], m5 + mova [r5+32*2], m6 + mova [r5+32*3], m7 + add r6, 32*16 + add r5, 32*8 + cmp r5, r4 + jl %%loop + mov r6, r4 +%endmacro + IDCT2_64_SHIFT_TRANSPOSE 2 + ret + +cglobal vvc_inv_dct2_dct2_64x32_10, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 8, 16, 32*163, dst, stride, c, eob +%undef cmp + vpbroadcastd m11, [vvc_pd_2048] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + vpbroadcastd m14, [vvc_pd_64] + lea r6, [rsp+32*7] + call .main + cmp eobd, 36 + jl .fast + call .main + cmp eobd, 136 + jl .fast + call .main + cmp eobd, 300 + jl .fast + call .main + jmp .pass2 +.dconly: + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 32 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 384 + sar r6d, 9 + jmp m(vvc_inv_dct2_dct2_64x16_10).dconly2 +.fast: + pxor m0, m0 + lea r4, [rsp+32*135] +.fast_loop: + REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 + add r6, 32*8 + cmp r6, r4 + jl .fast_loop +.pass2: + lea r7, [r6-32*32] + lea r5, [r6+32*8] + lea r6, [vvc_pw_5+128] + imul r2, strideq, 19 + lea r3, [strideq*3] + add r2, dstq +.pass2_loop: + mova m0, [r7-32*99] + mova m1, [r7-32*97] + mova m2, [r7-32*95] + mova m3, [r7-32*93] + mova m4, [r7-32*67] + mova m5, [r7-32*65] + mova m6, [r7-32*63] + mova m7, [r7-32*61] + mova m8, [r7-32*35] + mova m9, [r7-32*33] + mova m10, [r7-32*31] + mova m11, [r7-32*29] + mova m12, [r7-32* 3] + mova m13, [r7-32* 1] + mova m14, [r7+32* 1] + mova m15, [r7+32* 3] + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf + mova m0, [r7-32*100] + mova m1, [r7-32*98] + mova m2, [r7-32*96] + mova m3, [r7-32*94] + mova m4, [r7-32*68] + mova m5, [r7-32*66] + mova m6, [r7-32*64] + mova m7, [r7-32*62] + mova m8, [r7-32*36] + mova m9, [r7-32*34] + mova m10, [r7-32*32] + mova m11, [r7-32*30] + mova m12, [r7-32* 4] + mova m13, [r7-32* 2] + mova 
m14, [r7+32* 0] + mova m15, [r7+32* 2] + add r7, 32*8 + mova [rsp], m15 + call m(idct2_16x16_internal_8).main + call m(vvc_inv_dct2_dct2_16x32_10).pass2_end + sub dstq, r3 + lea r2, [r2+r3+32] + add dstq, 32 + cmp r7, r4 + jl .pass2_loop + RET +ALIGN function_align +.main: + lea r5, [idct2_64_mul_16] + pmulld m0, m14, [cq+128* 1] + pmulld m1, m14, [cq+128*31] + pmulld m2, m14, [cq+128*17] + pmulld m3, m14, [cq+128*15] + call m(vvc_inv_dct2_dct2_16x64_10).main_part1_rect2 + pmulld m0, m14, [cq+128* 7] + pmulld m1, m14, [cq+128*25] + pmulld m2, m14, [cq+128*23] + pmulld m3, m14, [cq+128* 9] + call m(vvc_inv_dct2_dct2_16x64_10).main_part1_rect2 + pmulld m0, m14, [cq+128* 5] + pmulld m1, m14, [cq+128*27] + pmulld m2, m14, [cq+128*21] + pmulld m3, m14, [cq+128*11] + call m(vvc_inv_dct2_dct2_16x64_10).main_part1_rect2 + pmulld m0, m14, [cq+128* 3] + pmulld m1, m14, [cq+128*29] + pmulld m2, m14, [cq+128*19] + pmulld m3, m14, [cq+128*13] + call m(vvc_inv_dct2_dct2_16x64_10).main_part1_rect2 + call m(vvc_inv_dct2_dct2_16x64_10).main_part2 + pmulld m0, m14, [cq+128* 2] + pmulld m1, m14, [cq+128*14] + pmulld m2, m14, [cq+128*18] + pmulld m3, m14, [cq+128*30] + call m(vvc_inv_dct2_dct2_8x32_10).main_oddhalf_part1_fast_rect2 + pmulld m0, m14, [cq+128* 6] + pmulld m1, m14, [cq+128*10] + pmulld m2, m14, [cq+128*22] + pmulld m3, m14, [cq+128*26] + call m(vvc_inv_dct2_dct2_8x32_10).main_oddhalf_part2_fast_rect2 + pmulld m0, m14, [cq+128* 4] + pmulld m1, m14, [cq+128*12] + pmulld m2, m14, [cq+128*20] + pmulld m3, m14, [cq+128*28] + call m(idct2_8x16_internal_10).main_oddhalf_fast_rect2 + pmulld m0, m14, [cq+128* 0] + pmulld m1, m14, [cq+128* 8] + pmulld m2, m14, [cq+128*16] + pmulld m3, m14, [cq+128*24] + pxor m15, m15 + mov r7d, 128*29 +.main_zero_loop: + mova [cq+r7-128*1], m15 + mova [cq+r7+128*0], m15 + mova [cq+r7+128*1], m15 + mova [cq+r7+128*2], m15 + sub r7d, 128*4 + jg .main_zero_loop + psrld m15, m11, 11 ; vvc_pd_1 + REPX {paddd x, m11}, m0, m1, m2, m3 + REPX {psrad x, 12 
}, m0, m1, m2, m3 + call m(vvc_inv_dct2_dct2_64x16_10).main_end2 + IDCT2_64_SHIFT_TRANSPOSE 1 + ret + +cglobal vvc_inv_dct2_dct2_64x64_10, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 11, 16, 32*195, dst, stride, c, eob +%undef cmp + vpbroadcastd m11, [vvc_pd_2048] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + vpbroadcastd m14, [vvc_pd_64] + lea r6, [rsp+32*7] + call .main + cmp eobd, 36 + jl .fast + call .main + cmp eobd, 136 + jl .fast + call .main + cmp eobd, 300 + jl .fast + call .main + jmp .pass2 +.dconly: + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 64 + jmp m(vvc_inv_dct2_dct2_64x16_10).dconly +.fast: + pxor m0, m0 + lea r4, [rsp+32*135] +.fast_loop: + REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 + add r6, 32*8 + cmp r6, r4 + jl .fast_loop +.pass2: + lea r10, [r6-32*32] + lea r6, [vvc_pw_5+128] + lea r8, [strideq*4] + lea r9, [strideq*5] + lea r3, [r9+strideq*1] ; stride*6 + lea r7, [r9+strideq*2] ; stride*7 +.pass2_loop: + mova m0, [r10-32*100] ; in0 + mova m1, [r10-32*96] ; in4 + mova m2, [r10-32*68] ; in8 + mova m3, [r10-32*64] ; in12 + mova m4, [r10-32*36] ; in16 + mova m5, [r10-32*32] ; in20 + mova m6, [r10-32* 4] ; in24 + mova m7, [r10+32* 0] ; in28 + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 + call m(idct2_16x16_internal_8).main + mova m1, [rsp+32*1] + mova [r4-32*4], m0 + mova [r4-32*3], m1 + mova [r4-32*2], m2 + mova [r4-32*1], m3 + mova [r4+32*0], m4 + mova [r4+32*1], m5 + mova [r4+32*2], m6 + mova [r4+32*3], m7 + add r4, 32*8 + mova [r4-32*4], m8 + mova [r4-32*3], m9 + mova [r4-32*2], m10 + mova [r4-32*1], m11 + mova [r4+32*0], m12 + mova [r4+32*1], m13 + mova [r4+32*2], m14 + mova [r4+32*3], m15 + mova m0, [r10-32*98] ; in2 + mova m1, [r10-32*94] ; in6 + mova m2, [r10-32*66] ; in10 + mova m3, [r10-32*62] ; in14 + mova m4, [r10-32*34] ; in18 + mova m5, [r10-32*30] ; in22 + mova m6, [r10-32* 2] ; in26 + mova m7, [r10+32* 2] ; in30 + lea r5, 
[r4+32*16] + add r4, 32*8 + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf_fast + mova m0, [r10-32*99] ; in1 + mova m1, [r10+32* 3] ; in31 + mova m2, [r10-32*35] ; in17 + mova m3, [r10-32*61] ; in15 + mova m4, [r10-32*67] ; in9 + mova m5, [r10-32*29] ; in23 + mova m6, [r10-32* 3] ; in25 + mova m7, [r10-32*93] ; in7 + lea r6, [idct2_64_mul - 8] + add r4, 32*16 + add r5, 32*32 + call m(vvc_inv_dct2_dct2_16x64_8).main_part1 + mova m0, [r10-32*95] ; in5 + mova m1, [r10-32* 1] ; in27 + mova m2, [r10-32*31] ; in21 + mova m3, [r10-32*65] ; in11 + mova m4, [r10-32*63] ; in13 + mova m5, [r10-32*33] ; in19 + mova m6, [r10+32* 1] ; in29 + mova m7, [r10-32*97] ; in3 + add r6, 8 + add r4, 32*8 + sub r5, 32*8 + call m(vvc_inv_dct2_dct2_16x64_8).main_part1 + call m(vvc_inv_dct2_dct2_16x64_10).main_part2_pass2 + add r10, 32*8 + sub dstq, r8 + sub r4, 32*44 + add dstq, 32 + cmp r10, r4 + jl .pass2_loop + RET +ALIGN function_align +.main: + lea r5, [idct2_64_mul_16] + mova m0, [cq+128* 1] + mova m1, [cq+128*31] + mova m2, [cq+128*17] + mova m3, [cq+128*15] + call m(vvc_inv_dct2_dct2_16x64_10).main_part1 + mova m0, [cq+128* 7] + mova m1, [cq+128*25] + mova m2, [cq+128*23] + mova m3, [cq+128* 9] + call m(vvc_inv_dct2_dct2_16x64_10).main_part1 + mova m0, [cq+128* 5] + mova m1, [cq+128*27] + mova m2, [cq+128*21] + mova m3, [cq+128*11] + call m(vvc_inv_dct2_dct2_16x64_10).main_part1 + mova m0, [cq+128* 3] + mova m1, [cq+128*29] + mova m2, [cq+128*19] + mova m3, [cq+128*13] + call m(vvc_inv_dct2_dct2_16x64_10).main_part1 + call m(vvc_inv_dct2_dct2_16x64_10).main_part2 + mova m0, [cq+128* 2] + mova m1, [cq+128*14] + mova m2, [cq+128*18] + mova m3, [cq+128*30] + call m(vvc_inv_dct2_dct2_8x32_10).main_oddhalf_part1_fast + mova m0, [cq+128* 6] + mova m1, [cq+128*10] + mova m2, [cq+128*22] + mova m3, [cq+128*26] + call m(vvc_inv_dct2_dct2_8x32_10).main_oddhalf_part2_fast + mova m0, [cq+128* 4] + mova m1, [cq+128*12] + mova m2, [cq+128*20] + mova m3, [cq+128*28] + call 
m(idct2_8x16_internal_10).main_oddhalf_fast + mova m0, [cq+128* 0] + mova m1, [cq+128* 8] + mova m2, [cq+128*16] + mova m3, [cq+128*24] + pxor m15, m15 + mov r7d, 128*29 +.main_zero_loop: + mova [cq+r7-128*1], m15 + mova [cq+r7+128*0], m15 + mova [cq+r7+128*1], m15 + mova [cq+r7+128*2], m15 + sub r7d, 128*4 + jg .main_zero_loop + call m(vvc_inv_dct2_dct2_64x16_10).main_end + jmp m(vvc_inv_dct2_dct2_64x16_10).shift_transpose + %endif ; ARCH_X86_64 diff --git a/libavcodec/x86/vvc_itx_8bit.asm b/libavcodec/x86/vvc_itx_8bit.asm index 3938d42456a..60e727dc14e 100644 --- a/libavcodec/x86/vvc_itx_8bit.asm +++ b/libavcodec/x86/vvc_itx_8bit.asm @@ -39,16 +39,5507 @@ vvc_pw_%1_%2: dw %1, %2 vvc_pw_m%2_%1: dw -%2, %1 %endmacro +; ADST-only +vvc_pw_3803_1321: dw 3803, 1321 +vvc_pw_m1321_2482: dw -1321, 2482 +vvc_pw_2482_3344: dw 2482, 3344 +vvc_pw_m3344_3344: dw -3344, 3344 +vvc_pw_m3803_3344: dw -3803, 3344 +vvc_pw_m3803_m6688: dw -3803, -6688 +vvc_pw_64_m64: dw 64, -64 + +const vvc_pw_5, times 2 dw 5 +const vvc_pw_2048, times 2 dw 2048 +const vvc_pw_4096, times 2 dw 4096 +const vvc_pw_8192, times 2 dw 8192 +const vvc_pw_16384, times 2 dw 16384 +const vvc_pw_1697x16, times 2 dw 1697*16 +const vvc_pw_1697x8, times 2 dw 1697*8 +const vvc_pw_64x8, times 2 dw 64*8 +const vvc_pd_64, dd 64 +const vvc_pd_512, dd 512 +const vvc_pd_2048, dd 2048 + const vvc_pw_64_64, dw 64, 64 -const vvc_pw_64_m64, dw 64, -64 const vvc_pw_m64_64, dw -64, 64 const vvc_pw_36_83, dw 36, 83 const vvc_pw_m83_36, dw -83, 36 -const vvc_pw_64_64, dw 64, 64 -const vvc_pw_m64_64, dw -64, 64 -const vvc_pw_512, times 2 dw 512 +COEF_PAIR 83, 36 +COEF_PAIR 201, 4091 +COEF_PAIR 995, 3973 +COEF_PAIR 1751, 3703 +COEF_PAIR 2440, 3290 +COEF_PAIR 3035, 2751 +COEF_PAIR 3513, 2106 +COEF_PAIR 3857, 1380 +COEF_PAIR 4052, 601 +COEF_PAIR 9, 90 +COEF_PAIR 43, 80 +COEF_PAIR 70, 57 +COEF_PAIR 87, 25 +COEF_PAIR 18, 89 +COEF_PAIR 75, 50 +vvc_pw_m18_m89: dw -18, -89 +const vvc_pw_m36_m83, dw -36, -83 +vvc_pw_m75_m50: dw -75, -50 
+vvc_pw_m9_m90: dw -9, -90
+vvc_pw_m70_m57: dw -70, -57
+vvc_pw_m43_m80: dw -43, -80
+vvc_pw_m87_m25: dw -87, -25
+COEF_PAIR 50, 75
+COEF_PAIR 89, 18
+
+%macro COEF_X8 1-* ; emit every argument, multiplied by 8, as a pair of identical words
+%rep %0 ; one pass per macro argument
+    dw %1*8, %1*8
+    %rotate 1 ; shift the argument list so %1 becomes the next value
+%endrep
+%endmacro
+
+vvc_pw_3703x8: COEF_X8 3703 ; each label below names one (value*8, value*8) word pair
+vvc_pw_1751x8: COEF_X8 1751
+vvc_pw_m1380x8: COEF_X8 -1380
+vvc_pw_3857x8: COEF_X8 3857
+vvc_pw_3973x8: COEF_X8 3973
+vvc_pw_995x8: COEF_X8 995
+vvc_pw_m2106x8: COEF_X8 -2106
+vvc_pw_3513x8: COEF_X8 3513
+vvc_pw_3290x8: COEF_X8 3290
+vvc_pw_2440x8: COEF_X8 2440
+vvc_pw_m601x8: COEF_X8 -601
+vvc_pw_4052x8: COEF_X8 4052
+
+const idct2_64_mul ; exported table: 32 x8-scaled word pairs, 8 per COEF_X8 row
+COEF_X8 4095, 101, 4065, 501, 2967, -2824, 3229, -2520
+COEF_X8 3745, 1660, 3564, 2019, 3822, -1474, 3948, -1092
+COEF_X8 3996, 897, 3889, 1285, 3461, -2191, 3659, -1842
+COEF_X8 3349, 2359, 3102, 2675, 4036, -700, 4085, -301
+
+vvc_pw_201_4091x8: dw 201*8, 4091*8 ; mixed pairs: two different coefficients, both scaled by 8
+vvc_pw_m601_4052x8: dw -601*8, 4052*8
+vvc_pw_995_3973x8: dw 995*8, 3973*8
+vvc_pw_m1380_3857x8: dw -1380*8, 3857*8
+vvc_pw_1751_3703x8: dw 1751*8, 3703*8
+vvc_pw_m2106_3513x8: dw -2106*8, 3513*8
+vvc_pw_2440_3290x8: dw 2440*8, 3290*8
+vvc_pw_m2751_3035x8: dw -2751*8, 3035*8
+
+%define o_idct2_64_offset idct2_64_mul - (o_base) - 8
+
+SECTION .text
+
+; Code size reduction trickery: Instead of using rip-relative loads with
+; mandatory 4-byte offsets everywhere, we can set up a base pointer with a
+; single rip-relative lea and then address things relative from that with
+; 1-byte offsets as long as data is within +-128 bytes of the base pointer.
+%define o_base deint_shuf + 128
+%define o(x) (r6 - (o_base) + (x))
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
+
+; flags: 1 = swap, 2 = interleave, 4 = coef_regs
+%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags
+%if %7 & 4 ; coef_regs: %5/%6 are register numbers already holding the coefficient pairs
+    pmaddwd             m%2, m%5, m%1
+    pmaddwd             m%1, m%6
+%else ; otherwise %5/%6 select the vvc_pw_* constant pairs to broadcast
+%if %7 & 1 ; swap: exchange which constant pair is loaded into tmp1 and tmp2
+    vpbroadcastd        m%2, [o(vvc_pw_%5_%6)]
+    vpbroadcastd        m%3, [o(vvc_pw_m%6_%5)]
+%else
+    vpbroadcastd        m%2, [o(vvc_pw_m%6_%5)]
+    vpbroadcastd        m%3, [o(vvc_pw_%5_%6)]
+%endif
+    pmaddwd             m%2, m%1
+    pmaddwd             m%1, m%3
+%endif
+    paddd               m%2, m%4 ; add the rounding term to both dword products
+    paddd               m%1, m%4
+%if %7 & 2 ; interleave: both 16-bit results stay side by side inside each dword
+    pslld               m%2, 4 ; NOTE(review): 4/12 is an effective 12-bit shift, the other path uses 7 - confirm
+    psrld               m%1, 12
+    pblendw             m%1, m%2, 0xaa ; odd words from m%2, even words from m%1
+%else
+    psrad               m%2, 7
+    psrad               m%1, 7
+    packssdw            m%1, m%2 ; signed-saturate both dword results to words in m%1
+%endif
+%endmacro
+
+; flags: 1 = swap, 2 = interleave, 4 = coef_regs
+%macro ITX_MUL4X_PACK 9-10 0 ; dst/src, tmp[1-3], rnd, coef[1-4], flags (coef[1-2]: low 128-bit lane, coef[3-4]: high lane)
+%if %10 & 1
+    vpbroadcastd        m%3, [o(vvc_pw_%8_%9)]
+    vpbroadcastd        m%4, [o(vvc_pw_m%9_%8)]
+    vpbroadcastd       xm%2, [o(vvc_pw_%6_%7)]
+    vpblendd            m%2, m%3, 0xf0 ; upper four dwords come from the coef[3-4] broadcast
+    vpbroadcastd       xm%3, [o(vvc_pw_m%7_%6)]
+%else
+    vpbroadcastd        m%3, [o(vvc_pw_m%9_%8)]
+    vpbroadcastd        m%4, [o(vvc_pw_%8_%9)]
+    vpbroadcastd       xm%2, [o(vvc_pw_m%7_%6)]
+    vpblendd            m%2, m%3, 0xf0
+    vpbroadcastd       xm%3, [o(vvc_pw_%6_%7)]
+%endif
+    vpblendd            m%3, m%4, 0xf0
+    ITX_MUL2X_PACK       %1, %4, _, %5, %2, %3, (4|%10) ; coefficients now live in m%2/m%3, so force coef_regs
+%endmacro
+
+; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 7
+; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 7
+; flags: 1 = coef_regs
+%macro ITX_MULSUB_2W 8-9 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], flags, dst2
+    punpckhwd           m%3, m%2, m%1 ; interleave (src2, src1) words, high halves
+    punpcklwd           m%2, m%1 ; interleave (src2, src1) words, low halves
+%if %8 & 1 ; coef_regs: %6/%7 are register numbers holding the coefficient pairs
+    pmaddwd             m%1, m%7, m%2
+    pmaddwd             m%4, m%7, m%3
+%else
+    vpbroadcastd        m%1, [o(vvc_pw_m%7_%6)]
+    pmaddwd             m%4, m%3, m%1
+    pmaddwd             m%1, m%2
+%endif
+    paddd               m%4, m%5
+    paddd               m%1, m%5
+    psrad               m%4, 7
+    psrad               m%1, 7
+    packssdw            m%1, m%4 ; dst1
+%if %8 & 1
+    pmaddwd             m%3, m%6
+    pmaddwd             m%2, m%6
+%else
+    vpbroadcastd        m%4, [o(vvc_pw_%6_%7)]
+    pmaddwd             m%3, m%4
+    pmaddwd             m%2, m%4
+%endif
+    paddd               m%3, m%5
+    paddd               m%2, m%5
+    psrad               m%3, 7
+    psrad               m%2, 7
+%if %0 == 9 ; optional 9th argument redirects dst2 to another register
+    packssdw            m%9, m%2, m%3 ; %9 is dst2; %8 is the flags value and must not be used as a register number
+%else
+    packssdw            m%2, m%3 ; dst2 replaces src2 in place
+%endif
+%endmacro
+
+%macro IDCT2_4_1D 7 ; src[1-4], tmp[1-2], vvc_pd_64
+    ITX_MULSUB_2W        %2, %4, %5, %6, %7, 36, 83, 0, %5 ; t2, t3
+    ITX_MULSUB_2W        %1, %3, %4, %6, %7, 64, 64, 0, %4 ; t1, t0
+    psubsw              m%3, m%1, m%2
+    paddsw              m%2, m%1
+    paddsw              m%1, m%4, m%5
+    psubsw              m%4, m%5
+%endmacro
+
+%macro IDCT2_8_1D 11 ; src[1-8], tmp[1-2], vvc_pd_64
+    ITX_MULSUB_2W        %6, %4, %9, %10, %11, 75, 50, 0 ; t5a, t6a
+    ITX_MULSUB_2W        %2, %8, %9, %10, %11, 18, 89, 0 ; t4a, t7a
+    ITX_MULSUB_2W        %3, %7, %9, %10, %11, 36, 83, 0 ; t2, t3
+    paddsw              m%9, m%2, m%6 ; t4
+    psubsw              m%2, m%6 ; t5a
+    paddsw              m%10, m%8, m%4 ; t7
+    psubsw              m%8, m%4 ; t6a
+    ITX_MULSUB_2W        %1, %5, %4, %6, %11, 64, 64, 0 ; t1, t0
+    ITX_MULSUB_2W        %8, %2, %4, %6, %11, 64, 64, 0 ; t5, t6
+    psubsw              m%6, m%1, m%3 ; dct4 out2
+    paddsw              m%3, m%1 ; dct4 out1
+    paddsw              m%1, m%5, m%7 ; dct4 out0
+    psubsw              m%5, m%7 ; dct4 out3
+    psubsw              m%7, m%3, m%2 ; out6
+    paddsw              m%2, m%3 ; out1
+    paddsw              m%3, m%6, m%8 ; out2
+    psubsw              m%6, m%8 ; out5
+    psubsw              m%8, m%1, m%10 ; out7
+    paddsw              m%1, m%10 ; out0
+    paddsw              m%4, m%5, m%9 ; out3
+    psubsw              m%5, m%9 ; out4
+%endmacro
+
+; in1 = %1, in3 = %2, in5 = %3, in7 = %4
+; in9 = %5, in11 = %6, in13 = %7, in15 = %8
+%macro IDCT2_16_1D_ODDHALF 11 ; src[1-8], tmp[1-2], vvc_pd_64
+    ITX_MULSUB_2W        %1, %8, %9, %10, %11, 9, 90, 0 ; t8a, t15a
+    ITX_MULSUB_2W        %5, %4, %9, %10, %11, 70, 57, 0 ; t9a, t14a
+    ITX_MULSUB_2W        %3, %6, %9, %10, %11, 43, 80, 0 ; t10a, t13a
+    ITX_MULSUB_2W        %7, %2, %9, %10, %11, 87, 25, 0 ; t11a, t12a
+    psubsw              m%9, m%2, m%6 ; t13
+    paddsw              m%6, m%2 ; t12
+    psubsw              m%2, m%8, m%4 ; t14
+    paddsw              m%8, m%4 ; t15
+    psubsw              m%4, m%7, m%3 ; t10
+    paddsw              m%3, m%7 ; t11
+    psubsw              m%7, m%1, m%5 ; t9
+    paddsw              m%1, m%5 ; t8
+    ITX_MULSUB_2W        %2, %7, %5, %10, %11, 36, 83, 0 ; t9a, t14a
+    ITX_MULSUB_2W        %9, %4, %5, %10, %11, m83, 36, 0 ; t10a, t13a
+    psubsw              m%5, m%1, m%3 ; t11a
+    paddsw              m%1, m%3 ; t8a
+    psubsw              m%3, m%7, m%4 ; t13
+    paddsw 
m%7, m%4 ; t14 + psubsw m%4, m%8, m%6 ; t12a + paddsw m%8, m%6 ; t15a + psubsw m%6, m%2, m%9 ; t10 + paddsw m%2, m%9 ; t9 + ITX_MULSUB_2W %3, %6, %9, %10, %11, 64, 64, 0 ; t10a, t13a + ITX_MULSUB_2W %4, %5, %9, %10, %11, 64, 64, 0 ; t11, t12 +%endmacro + +%macro WRAP_XMM 1+ + INIT_XMM cpuname + %1 + INIT_YMM cpuname +%endmacro + +%macro ITX4_END 4-5 2048 ; row[1-4], rnd +%if %5 + vpbroadcastd m2, [o(vvc_pw_%5)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 +%endif + lea r2, [dstq+strideq*2] +%assign %%i 1 +%rep 4 + %if %1 & 2 + CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1) + %else + CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1) + %endif + %assign %%i %%i + 1 + %rotate 1 +%endrep + movd m2, [%%row_adr1] + pinsrd m2, [%%row_adr2], 1 + movd m3, [%%row_adr3] + pinsrd m3, [%%row_adr4], 1 + pmovzxbw m2, m2 + pmovzxbw m3, m3 + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + movd [%%row_adr1], m0 + pextrd [%%row_adr2], m0, 1 + pextrd [%%row_adr3], m0, 2 + pextrd [%%row_adr4], m0, 3 + ret +%endmacro + +%macro IWHT4_1D_PACKED 0 + punpckhqdq m3, m0, m1 ; in1 in3 + punpcklqdq m0, m1 ; in0 in2 + psubw m2, m0, m3 + paddw m0, m3 + punpckhqdq m2, m2 ; t2 t2 + punpcklqdq m0, m0 ; t0 t0 + psubw m1, m0, m2 + psraw m1, 1 + psubw m1, m3 ; t1 t3 + psubw m0, m1 ; ____ out0 + paddw m2, m1 ; out3 ____ +%endmacro + +INIT_XMM avx2 +cglobal vvc_inv_wht_wht_4x4_8, 3, 3, 4, dst, stride, c + mova m0, [cq+16*0] + mova m1, [cq+16*1] + pxor m2, m2 + mova [cq+16*0], m2 + mova [cq+16*1], m2 + psraw m0, 2 + psraw m1, 2 + IWHT4_1D_PACKED + punpckhwd m0, m1 + punpcklwd m3, m1, m2 + punpckhdq m1, m0, m3 + punpckldq m0, m3 + IWHT4_1D_PACKED + vpblendd m0, m2, 0x03 + ITX4_END 3, 0, 2, 1, 0 + +%macro INV_TXFM_FN 3 ; type1, type2, size +cglobal vvc_inv_%1_%2_%3_8, 4, 5, 0, dst, stride, c, eob, tx2 + %define %%p1 m(i%1_%3_internal_8) + lea r6, [o_base] + ; Jump to the 1st txfm function if we're not taking the fast path, which + ; in turn performs an indirect jump to the 2nd txfm function. 
+ lea tx2q, [m(i%2_%3_internal_8).pass2] +%ifidn %1_%2, dct2_dct2 + test eobd, eobd + jnz %%p1 +%else + ; jump to the 1st txfm function unless it's located directly after this + times ((%%end - %%p1) >> 31) & 1 jmp %%p1 +ALIGN function_align +%%end: +%endif +%endmacro + +%macro INV_TXFM_4X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x4 +%ifidn %1_%2, dct2_dct2 + vpbroadcastw m0, [cq] + vpbroadcastd m1, [o(vvc_pw_64x8)] + pmulhrsw m0, m1 + mov [cq], eobd ; 0 + pmulhrsw m0, m1 + mova m1, m0 + jmp m(iadst_4x4_internal_8).end2 +%endif +%endmacro + +%macro IDCT2_4_1D_PACKED 0 + vpbroadcastd m4, [o(vvc_pd_64)] + punpckhwd m2, m1, m0 + punpcklwd m1, m0 + ITX_MUL2X_PACK 2, 0, 3, 4, 36, 83 + ITX_MUL2X_PACK 1, 0, 3, 4, 64, 64 + paddsw m0, m1, m2 ; out0 out1 + psubsw m1, m2 ; out3 out2 +%endmacro + +%macro IADST4_1D_PACKED 0 + punpcklwd m2, m1, m0 + punpckhwd m3, m1, m0 + vpbroadcastd m5, [o(vvc_pw_m3344_3344)] + vpbroadcastd m0, [o(vvc_pw_3803_1321)] + vpbroadcastd m4, [o(vvc_pw_m1321_2482)] + pmaddwd m1, m5, m2 ; 3344*in3 - 3344*in2 + psrld m5, 16 + pmaddwd m0, m2 + pmaddwd m2, m4 + pmaddwd m5, m3 ; 3344*in0 + paddd m1, m5 ; 3344*in0 - 3344*in2 + 3344*in3 + vpbroadcastd m4, [o(vvc_pw_2482_3344)] + vpbroadcastd m5, [o(vvc_pw_m3803_3344)] + pmaddwd m4, m3 + pmaddwd m5, m3 + paddd m4, m0 ; 1321*in0 + 3344*in1 + 3803*in2 + 2482*in3 + vpbroadcastd m0, [o(vvc_pw_m3803_m6688)] + pmaddwd m3, m0 + vpbroadcastd m0, [o(vvc_pd_64)] + paddd m2, m0 + paddd m1, m0 + paddd m0, m4 + paddd m5, m2 ; 2482*in0 + 3344*in1 - 1321*in2 - 3803*in3 + paddd m2, m4 + paddd m2, m3 + REPX {psrad x, 7}, m1, m2, m0, m5 + packssdw m0, m5 ; out0 out1 + packssdw m1, m2 ; out2 out3 +%endmacro + +INV_TXFM_4X4_FN dct2, dct2 +INV_TXFM_4X4_FN dct2, adst +INV_TXFM_4X4_FN dct2, flipadst +INV_TXFM_4X4_FN dct2, identity + +cglobal idct2_4x4_internal_8, 0, 5, 6, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + IDCT2_4_1D_PACKED + mova m2, [o(deint_shuf)] + shufps m3, m0, m1, q1331 + shufps m0, m1, 
q0220 + pshufb m0, m2 + pshufb m1, m3, m2 + jmp tx2q +.pass2: + IDCT2_4_1D_PACKED + pxor m2, m2 + mova [cq+16*0], m2 + mova [cq+16*1], m2 + ITX4_END 0, 1, 3, 2 + +INV_TXFM_4X4_FN adst, dct2 +INV_TXFM_4X4_FN adst, adst +INV_TXFM_4X4_FN adst, flipadst +INV_TXFM_4X4_FN adst, identity + +cglobal iadst_4x4_internal_8, 0, 5, 6, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + call .main + punpckhwd m3, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + jmp tx2q +.pass2: + call .main +.end: + pxor m2, m2 + mova [cq+16*0], m2 + mova [cq+16*1], m2 +.end2: + ITX4_END 0, 1, 2, 3 +ALIGN function_align +cglobal_label .main + IADST4_1D_PACKED + ret + +INV_TXFM_4X4_FN flipadst, dct2 +INV_TXFM_4X4_FN flipadst, adst +INV_TXFM_4X4_FN flipadst, flipadst +INV_TXFM_4X4_FN flipadst, identity + +cglobal iflipadst_4x4_internal_8, 0, 5, 6, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + call m(iadst_4x4_internal_8).main + punpcklwd m2, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + jmp tx2q +.pass2: + call m(iadst_4x4_internal_8).main +.end: + pxor m2, m2 + mova [cq+16*0], m2 + mova [cq+16*1], m2 +.end2: + ITX4_END 3, 2, 1, 0 + +INV_TXFM_4X4_FN identity, dct2 +INV_TXFM_4X4_FN identity, adst +INV_TXFM_4X4_FN identity, flipadst +INV_TXFM_4X4_FN identity, identity + +cglobal iidentity_4x4_internal_8, 0, 5, 6, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + vpbroadcastd m3, [o(vvc_pw_1697x8)] + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + paddsw m0, m2 + paddsw m1, m3 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + jmp tx2q +.pass2: + vpbroadcastd m3, [o(vvc_pw_1697x8)] + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + paddsw m0, m2 + paddsw m1, m3 + jmp m(iadst_4x4_internal_8).end + +%macro WRITE_4X8 2 ; coefs[1-2] + movd xm4, [dstq+strideq*0] + pinsrd xm4, [dstq+strideq*1], 1 + movd xm5, [dstq+strideq*2] + pinsrd xm5, [dstq+r3 ], 1 + pinsrd xm4, [r2 +strideq*0], 2 
+ pinsrd xm4, [r2 +strideq*1], 3 + pinsrd xm5, [r2 +strideq*2], 2 + pinsrd xm5, [r2 +r3 ], 3 + pmovzxbw m4, xm4 + pmovzxbw m5, xm5 + paddw m4, m%1 + paddw m5, m%2 + packuswb m4, m5 + vextracti128 xm5, m4, 1 + movd [dstq+strideq*0], xm4 + pextrd [dstq+strideq*1], xm4, 1 + pextrd [dstq+strideq*2], xm4, 2 + pextrd [dstq+r3 ], xm4, 3 + movd [r2 +strideq*0], xm5 + pextrd [r2 +strideq*1], xm5, 1 + pextrd [r2 +strideq*2], xm5, 2 + pextrd [r2 +r3 ], xm5, 3 +%endmacro + +%macro INV_TXFM_4X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x8 +%ifidn %1_%2, dct2_dct2 + movd xm1, [o(vvc_pw_64x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(vvc_pw_2048)] + mov [cq], eobd + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm2 + vpbroadcastw m0, xm0 + mova m1, m0 + jmp m(iadst_4x8_internal_8).end3 +%endif +%endmacro + +%macro IDCT2_8_1D_PACKED 0 + vpbroadcastd m6, [o(vvc_pd_64)] + punpckhwd m5, m3, m0 ; in7 in1 + punpckhwd m4, m1, m2 ; in3 in5 + punpcklwd m3, m1 ; in6 in2 + punpcklwd m2, m0 ; in4 in0 + ITX_MUL2X_PACK 5, 0, 1, 6, 18, 89, 3 ; t4a t7a + ITX_MUL2X_PACK 4, 0, 1, 6, 75, 50, 3 ; t5a t6a + ITX_MUL2X_PACK 3, 0, 1, 6, 36, 83 ; t3 t2 + psubsw m0, m5, m4 ; t5a t6a (interleaved) + paddsw m4, m5 ; t4 t7 (interleaved) + ITX_MUL2X_PACK 2, 1, 5, 6, 64, 64 ; t0 t1 + vpbroadcastd m1, [o(vvc_pw_m64_64)] + ITX_MUL2X_PACK 0, 1, _, 6, 1, 5, 4 ; t6 t5 +%if mmsize > 16 + vbroadcasti128 m1, [o(deint_shuf)] + pshufb m4, m1 +%else + pshufb m4, [o(deint_shuf)] +%endif + psubsw m1, m2, m3 ; tmp3 tmp2 + paddsw m3, m2 ; tmp0 tmp1 + shufps m2, m4, m0, q1032 ; t7 t6 + vpblendd m4, m0, 0xcc ; t4 t5 + paddsw m0, m3, m2 ; out0 out1 + psubsw m3, m2 ; out7 out6 + psubsw m2, m1, m4 ; out4 out5 + paddsw m1, m4 ; out3 out2 +%endmacro + +%macro IADST8_1D_PACKED 1 ; pass + vpbroadcastd m6, [o(vvc_pd_64)] + punpckhwd m0, m4, m3 ; 0 7 + punpckhwd m1, m5, m2 ; 2 5 + punpcklwd m2, m5 ; 4 3 + punpcklwd m3, m4 ; 6 1 +%if %1 == 1 + ITX_MUL2X_PACK 0, 4, 5, 6, 9, 90, 3 ; t1a t0a + ITX_MUL2X_PACK 1, 4, 5, 6, 43, 80, 2 
; t2a t3a + ITX_MUL2X_PACK 2, 4, 5, 6, 70, 57, 3 ; t5a t4a + ITX_MUL2X_PACK 3, 4, 5, 6, 87, 25, 2 ; t6a t7a + psubsw m4, m0, m2 ; t5 t4 + paddsw m0, m2 ; t1 t0 + psubsw m5, m1, m3 ; t6 t7 + paddsw m1, m3 ; t2 t3 + ITX_MUL2X_PACK 4, 2, 3, 6, 36, 83, 3 ; t5a t4a + ITX_MUL2X_PACK 5, 2, 3, 6, 83, 36, 2 ; t7a t6a +%if mmsize > 16 + vbroadcasti128 m2, [o(deint_shuf)] +%else + mova m2, [o(deint_shuf)] +%endif + pshuflw m1, m1, q2301 + pshufhw m1, m1, q2301 + psubsw m3, m0, m1 ; t3 t2 + paddsw m0, m1 ; -out7 out0 + psubsw m1, m4, m5 ; t7 t6 + paddsw m4, m5 ; out6 -out1 + pshufb m0, m2 + pshufb m4, m2 + vpbroadcastd m5, [o(vvc_pw_m64_64)] + pmaddwd m2, m5, m3 + pmaddwd m5, m1 + paddd m2, m6 + paddd m5, m6 + psrad m2, 7 + psrad m5, 7 + packssdw m2, m5 ; out4 -out5 + vpbroadcastd m5, [o(vvc_pw_64_64)] + pmaddwd m3, m5 + pmaddwd m1, m5 + paddd m3, m6 + paddd m1, m6 + psrad m3, 7 + psrad m1, 7 + packssdw m1, m3 ; out2 -out3 + punpcklqdq m3, m4, m0 ; out6 -out7 + punpckhqdq m0, m4 ; out0 -out1 +%else + ITX_MUL2X_PACK 0, 4, 5, 6, 9, 90 ; t0a t1a + ITX_MUL2X_PACK 1, 4, 5, 6, 43, 80 ; t2a t3a + ITX_MUL2X_PACK 2, 4, 5, 6, 70, 57 ; t4a t5a + ITX_MUL2X_PACK 3, 4, 5, 6, 87, 25 ; t6a t7a + psubsw m4, m0, m2 ; t4 t5 + paddsw m0, m2 ; t0 t1 + psubsw m5, m1, m3 ; t6 t7 + paddsw m1, m3 ; t2 t3 + shufps m2, m5, m4, q1032 + punpckhwd m4, m2 + punpcklwd m5, m2 + ITX_MUL2X_PACK 4, 2, 3, 6, 36, 83, 1 ; t5a t4a + ITX_MUL2X_PACK 5, 2, 3, 6, 83, 36 ; t7a t6a + psubsw m2, m0, m1 ; t2 t3 + paddsw m0, m1 ; out0 -out7 + psubsw m1, m4, m5 ; t7 t6 + paddsw m4, m5 ; out6 -out1 + vpbroadcastd m5, [o(vvc_pw_64x8)] + vpblendd m3, m0, m4, 0x33 ; out6 -out7 + vpblendd m0, m4, 0xcc ; out0 -out1 + shufps m4, m2, m1, q1032 ; t3 t7 + vpblendd m1, m2, 0x33 ; t2 t6 + psubsw m2, m1, m4 ; t2-t3 t6-t7 + paddsw m1, m4 ; t2+t3 t6+t7 + pmulhrsw m2, m5 ; out4 -out5 + pshufd m1, m1, q1032 + pmulhrsw m1, m5 ; out2 -out3 +%endif +%endmacro + +INIT_YMM avx2 +INV_TXFM_4X8_FN dct2, dct2 +INV_TXFM_4X8_FN dct2, adst 
+INV_TXFM_4X8_FN dct2, flipadst +INV_TXFM_4X8_FN dct2, identity + +cglobal idct2_4x8_internal_8, 0, 5, 7, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpbroadcastd m2, [o(vvc_pw_64x8)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + IDCT2_4_1D_PACKED + vbroadcasti128 m2, [o(deint_shuf)] + shufps m3, m0, m1, q1331 + shufps m0, m1, q0220 + pshufb m0, m2 + pshufb m1, m3, m2 + jmp tx2q +.pass2: + vextracti128 xm2, m0, 1 + vextracti128 xm3, m1, 1 + call .main + vpbroadcastd m4, [o(vvc_pw_2048)] + vinserti128 m0, xm2, 1 + vinserti128 m1, xm3, 1 + pshufd m1, m1, q1032 + jmp m(iadst_4x8_internal_8).end2 +ALIGN function_align +cglobal_label .main + WRAP_XMM IDCT2_8_1D_PACKED + ret + +INV_TXFM_4X8_FN adst, dct2 +INV_TXFM_4X8_FN adst, adst +INV_TXFM_4X8_FN adst, flipadst +INV_TXFM_4X8_FN adst, identity + +cglobal iadst_4x8_internal_8, 0, 5, 7, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpbroadcastd m2, [o(vvc_pw_64x8)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + call m(iadst_8x4_internal_8).main + punpckhwd m3, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + jmp tx2q +.pass2: + vextracti128 xm2, m0, 1 + vextracti128 xm3, m1, 1 + pshufd xm4, xm0, q1032 + pshufd xm5, xm1, q1032 + call .main_pass2 + vpbroadcastd m4, [o(vvc_pw_2048)] + vinserti128 m0, xm2, 1 + vinserti128 m1, xm3, 1 + pxor m5, m5 + psubw m5, m4 +.end: + vpblendd m4, m5, 0xcc +.end2: + pmulhrsw m0, m4 + pmulhrsw m1, m4 + WIN64_RESTORE_XMM + pxor m2, m2 + mova [cq+32*0], m2 + mova [cq+32*1], m2 +.end3: + lea r2, [dstq+strideq*4] + lea r3, [strideq*3] + WRITE_4X8 0, 1 + RET +ALIGN function_align +.main_pass1: + WRAP_XMM IADST8_1D_PACKED 1 + ret +ALIGN function_align +cglobal_label .main_pass2 + WRAP_XMM IADST8_1D_PACKED 2 + ret + +INV_TXFM_4X8_FN flipadst, dct2 +INV_TXFM_4X8_FN flipadst, adst +INV_TXFM_4X8_FN flipadst, flipadst +INV_TXFM_4X8_FN flipadst, identity + +cglobal iflipadst_4x8_internal_8, 0, 5, 7, dst, stride, c, 
eob, tx2 + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpbroadcastd m2, [o(vvc_pw_64x8)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + call m(iadst_8x4_internal_8).main + punpcklwd m3, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m3 + punpckhwd m1, m3 + jmp tx2q +.pass2: + vextracti128 xm2, m0, 1 + vextracti128 xm3, m1, 1 + pshufd xm4, xm0, q1032 + pshufd xm5, xm1, q1032 + call m(iadst_4x8_internal_8).main_pass2 + vpbroadcastd m5, [o(vvc_pw_2048)] + vinserti128 m3, xm1, 1 + vinserti128 m2, xm0, 1 + pxor m4, m4 + psubw m4, m5 + pshufd m0, m3, q1032 + pshufd m1, m2, q1032 + jmp m(iadst_4x8_internal_8).end + +INV_TXFM_4X8_FN identity, dct2 +INV_TXFM_4X8_FN identity, adst +INV_TXFM_4X8_FN identity, flipadst +INV_TXFM_4X8_FN identity, identity + +cglobal iidentity_4x8_internal_8, 0, 5, 7, dst, stride, c, eob, tx2 + vpermq m2, [cq+32*0], q3120 + vpermq m0, [cq+32*1], q3120 + vpbroadcastd m3, [o(vvc_pw_64x8)] + vpbroadcastd m4, [o(vvc_pw_1697x8)] + punpcklwd m1, m2, m0 + punpckhwd m2, m0 + pmulhrsw m1, m3 + pmulhrsw m2, m3 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + pmulhrsw m2, m4, m0 + pmulhrsw m4, m1 + paddsw m0, m2 + paddsw m1, m4 + jmp tx2q +.pass2: + vpbroadcastd m4, [o(vvc_pw_4096)] + jmp m(iadst_4x8_internal_8).end2 + +%macro INV_TXFM_4X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x16 +%ifidn %1_%2, dct2_dct2 + movd xm1, [o(vvc_pw_64x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(vvc_pw_16384)] + movd xm3, [o(vvc_pw_2048)] + mov [cq], eobd + pmulhrsw xm0, xm2 + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm3 + vpbroadcastw m0, xm0 + mova m1, m0 + mova m2, m0 + mova m3, m0 + jmp m(iadst_4x16_internal_8).end3 +%endif +%endmacro + +%macro IDCT2_16_1D_PACKED 0 + vpbroadcastd m10, [o(vvc_pd_64)] +.main2: + punpckhwd m8, m7, m0 ; dct16 in15 in1 + punpcklwd m9, m4, m0 ; dct4 in2 in0 + punpckhwd m0, m3, m4 ; dct16 in7 in9 + punpcklwd m7, m1 ; dct8 in7 in1 + punpckhwd m1, m6 ; dct16 in3 in13 + punpcklwd m3, m5 ; dct8 in3 in5 + punpckhwd m5, m2 ; dct16 in11 in5 + punpcklwd 
m6, m2 ; dct4 in3 in1 + ITX_MUL2X_PACK 8, 2, 4, 10, 9, 90, 3 ; t8a t15a + ITX_MUL2X_PACK 0, 2, 4, 10, 70, 57, 3 ; t9a t14a + ITX_MUL2X_PACK 1, 2, 4, 10, 87, 25, 3 ; t11a t12a + ITX_MUL2X_PACK 5, 2, 4, 10, 43, 80, 3 ; t10a t13a + ITX_MUL2X_PACK 7, 2, 4, 10, 18, 89, 3 ; t4a t7a + ITX_MUL2X_PACK 3, 2, 4, 10, 75, 50, 3 ; t5a t6a + ITX_MUL2X_PACK 6, 2, 4, 10, 36, 83 ; t3 t2 + psubsw m2, m8, m0 ; t9 t14 + paddsw m8, m0 ; t8 t15 + psubsw m0, m1, m5 ; t10 t13 + paddsw m1, m5 ; t11 t12 + vpbroadcastd m5, [o(vvc_pw_m83_36)] ; reuse vvc_pw_36_83 + ITX_MUL2X_PACK 2, 4, _, 10, 4, 5, 6 ; t9a t14a + vpbroadcastd m4, [o(vvc_pw_m36_m83)] ; reuse vvc_pw_m83_36 + ITX_MUL2X_PACK 0, 5, _, 10, 5, 4, 6 ; t10a t13a + psubsw m4, m8, m1 ; t11a t12a + paddsw m8, m1 ; t8a t15a + psubsw m1, m7, m3 ; t5a t6a + paddsw m7, m3 ; t4 t7 + paddsw m3, m2, m0 ; t9 t14 + psubsw m2, m0 ; t10 t13 +%if mmsize > 16 + vbroadcasti128 m0, [o(deint_shuf)] +%else + mova m0, [o(deint_shuf)] +%endif + pshufb m8, m0 + pshufb m7, m0 + pshufb m3, m0 + ITX_MUL2X_PACK 9, 0, 5, 10, 64, 64 ; t0 t1 + vpbroadcastd m0, [o(vvc_pw_m64_64)] + ITX_MUL2X_PACK 4, 5, _, 10, 5, 0, 4 ; t11 t12 + vpbroadcastd m5, [o(vvc_pw_64_64)] + ITX_MUL2X_PACK 1, 0, _, 10, 0, 5, 4 ; t6 t5 + vpbroadcastd m0, [o(vvc_pw_m64_64)] + ITX_MUL2X_PACK 2, 0, _, 10, 0, 5, 4 ; t13a t10a + punpckhqdq m0, m8, m3 ; t15a t14 + punpcklqdq m8, m3 ; t8a t9 + shufps m5, m4, m2, q1032 ; t12 t13a + vpblendd m4, m2, 0xcc ; t11 t10a + shufps m2, m7, m1, q1032 ; t7 t6 + vpblendd m7, m1, 0xcc ; t4 t5 + psubsw m1, m9, m6 ; dct4 out3 out2 + paddsw m9, m6 ; dct4 out0 out1 + psubsw m3, m9, m2 ; dct8 out7 out6 + paddsw m9, m2 ; dct8 out0 out1 + psubsw m2, m1, m7 ; dct8 out4 out5 + paddsw m1, m7 ; dct8 out3 out2 + psubsw m7, m9, m0 ; out15 out14 + paddsw m0, m9 ; out0 out1 + psubsw m6, m1, m5 ; out12 out13 + paddsw m1, m5 ; out3 out2 + psubsw m5, m2, m4 ; out11 out10 + paddsw m2, m4 ; out4 out5 + psubsw m4, m3, m8 ; out8 out9 + paddsw m3, m8 ; out7 out6 +%endmacro + 
+INV_TXFM_4X16_FN dct2, dct2 +INV_TXFM_4X16_FN dct2, adst +INV_TXFM_4X16_FN dct2, flipadst +INV_TXFM_4X16_FN dct2, identity + +cglobal idct2_4x16_internal_8, 0, 5, 11, dst, stride, c, eob, tx2 + mova m0, [cq+32*0] + mova m1, [cq+32*1] + mova m2, [cq+32*2] + mova m3, [cq+32*3] + call m(idct2_16x4_internal_8).main + vpbroadcastd m5, [o(vvc_pw_16384)] + punpckhwd m4, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m0, m1 + punpcklwd m0, m1 + REPX {pmulhrsw x, m5}, m0, m4, m2, m3 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + jmp tx2q +.pass2: + vextracti128 xm4, m0, 1 + vextracti128 xm5, m1, 1 + vextracti128 xm6, m2, 1 + vextracti128 xm7, m3, 1 + call .main + vinserti128 m0, xm4, 1 + vinserti128 m1, xm5, 1 + vpbroadcastd m5, [o(vvc_pw_2048)] + vinserti128 m2, xm6, 1 + vinserti128 m3, xm7, 1 + pshufd m1, m1, q1032 + pshufd m3, m3, q1032 + jmp m(iadst_4x16_internal_8).end2 +ALIGN function_align +cglobal_label .main + WRAP_XMM IDCT2_16_1D_PACKED + ret + +INV_TXFM_4X16_FN adst, dct2 +INV_TXFM_4X16_FN adst, adst +INV_TXFM_4X16_FN adst, flipadst +INV_TXFM_4X16_FN adst, identity + +cglobal iadst_4x16_internal_8, 0, 5, 11, dst, stride, c, eob, tx2 + mova m0, [cq+32*0] + mova m1, [cq+32*1] + mova m2, [cq+32*2] + mova m3, [cq+32*3] + call m(iadst_16x4_internal_8).main + vpbroadcastd m5, [o(vvc_pw_16384)] + punpckhwd m4, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m0, m1 + punpcklwd m0, m1 + REPX {pmulhrsw x, m5}, m4, m2, m3, m0 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + jmp tx2q +.pass2: + call .main + vpbroadcastd m5, [o(vvc_pw_64x8)] + paddsw m1, m2, m4 + psubsw m2, m4 + pmulhrsw m1, m5 ; -out7 out4 out6 -out5 + pmulhrsw m2, m5 ; out8 -out11 -out9 out10 + vpbroadcastd m5, [o(vvc_pw_2048)] + pshufd m1, m1, q1032 + vpblendd m4, m1, m0, 0x33 + vpblendd m0, m2, 0x33 + vpblendd m2, m3, 0x33 + vpblendd m3, m1, 0x33 + vpermq m0, m0, q2031 + vpermq m1, m2, q1302 + vpermq m2, m3, q3120 + vpermq m3, m4, q0213 + 
psubw m6, m7, m5 +.end: + vpblendd m5, m6, 0xcc +.end2: + REPX {pmulhrsw x, m5}, m0, m1, m2, m3 + WIN64_RESTORE_XMM + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + mova [cq+32*2], m4 + mova [cq+32*3], m4 +.end3: + lea r2, [dstq+strideq*8] + lea r3, [strideq*3] + WRITE_4X8 0, 1 + lea dstq, [dstq+strideq*4] + lea r2, [r2 +strideq*4] + WRITE_4X8 2, 3 + RET +ALIGN function_align +.main: + vpblendd m4, m1, m0, 0xcc + vpblendd m1, m0, 0x33 + vpblendd m5, m2, m3, 0xcc + vpblendd m2, m3, 0x33 + vperm2i128 m3, m5, m2, 0x31 + vinserti128 m0, m1, xm4, 1 ; in0 in3 in2 in1 + vperm2i128 m4, m1, m4, 0x31 + vinserti128 m1, m5, xm2, 1 ; in4 in7 in6 in5 + pshufd m3, m3, q1032 ; in15 in12 in13 in14 + pshufd m2, m4, q1032 ; in11 in8 in9 in10 +cglobal_label .main2 + vpbroadcastd m8, [o(vvc_pd_64)] + pxor m7, m7 + punpckhwd m4, m3, m0 ; in12 in3 in14 in1 + punpcklwd m0, m3 ; in0 in15 in2 in13 + punpckhwd m3, m2, m1 ; in8 in7 in10 in5 + punpcklwd m1, m2 ; in4 in11 in6 in9 + ITX_MUL4X_PACK 0, 2, 5, 6, 8, 201, 4091, 995, 3973, 3 + ITX_MUL4X_PACK 1, 2, 5, 6, 8, 1751, 3703, 2440, 3290, 3 + ITX_MUL4X_PACK 3, 2, 5, 6, 8, 3035, 2751, 3513, 2106, 3 + ITX_MUL4X_PACK 4, 2, 5, 6, 8, 3857, 1380, 4052, 601, 3 + psubsw m2, m0, m3 ; t9a t8a t11a t10a + paddsw m0, m3 ; t1a t0a t3a t2a + psubsw m3, m1, m4 ; t13a t12a t15a t14a + paddsw m1, m4 ; t5a t4a t7a t6a + ITX_MUL4X_PACK 2, 4, 5, 6, 8, 18, 89, 75, 50, 3 + psubw m6, m7, m5 + ITX_MUL2X_PACK 3, 5, _, 8, 6, 4, 6 + vpbroadcastd m6, [o(vvc_pw_m83_36)] + vpbroadcastd m5, [o(vvc_pw_36_83)] + psubsw m4, m0, m1 ; t5 t4 t7 t6 + paddsw m0, m1 ; t1 t0 t3 t2 + psubsw m1, m2, m3 ; t13a t12a t15a t14a + paddsw m2, m3 ; t9a t8a t11a t10a + psubw m3, m7, m6 ; vvc_pw_83_m36 + vpblendd m6, m3, 0xf0 + ITX_MUL2X_PACK 4, 3, _, 8, 6, 5, 4 ; t4a t5a t7a t6a + ITX_MUL2X_PACK 1, 3, _, 8, 6, 5, 4 ; t12 t13 t15 t14 + vbroadcasti128 m5, [o(deint_shuf)] + pshufb m0, m5 + pshufb m2, m5 + vperm2i128 m3, m0, m2, 0x31 ; t3 t2 t11a t10a + vinserti128 m0, xm2, 1 ; t1 t0 t9a 
t8a + vperm2i128 m2, m4, m1, 0x31 ; t7a t6a t15 t14 + vinserti128 m4, xm1, 1 ; t4a t5a t12 t13 + pshufd m2, m2, q1032 ; t6a t7a t14 t15 + psubsw m1, m0, m3 ; t3a t2a t11 t10 + paddsw m0, m3 ; -out15 out0 out14 -out1 + paddsw m3, m4, m2 ; -out3 out12 out2 -out13 + psubsw m4, m2 ; t6 t7 t14a t15a + shufps m2, m1, m4, q1032 ; t2a t6 t10 t14a + vpblendd m4, m1, 0x33 ; t3a t7 t11 t15a + ret +ALIGN function_align +.main_pass1_end: + vpbroadcastd m5, [o(vvc_pw_m64_64)] + vpbroadcastd m6, [o(vvc_pw_64_64)] + punpcklwd m1, m4, m2 + punpckhwd m4, m2 + pmaddwd m2, m5, m4 + pmaddwd m4, m6 + pmaddwd m5, m1 + pmaddwd m1, m6 + REPX {paddd x, m8}, m5, m1, m2, m4 + REPX {psrad x, 7}, m5, m2, m1, m4 + packssdw m2, m5 ; -out11 out8 out10 -out9 + packssdw m1, m4 ; -out7 out4 out6 -out5 + ret + +INV_TXFM_4X16_FN flipadst, dct2 +INV_TXFM_4X16_FN flipadst, adst +INV_TXFM_4X16_FN flipadst, flipadst +INV_TXFM_4X16_FN flipadst, identity + +cglobal iflipadst_4x16_internal_8, 0, 5, 11, dst, stride, c, eob, tx2 + mova m0, [cq+32*0] + mova m1, [cq+32*1] + mova m2, [cq+32*2] + mova m3, [cq+32*3] + call m(iadst_16x4_internal_8).main + vpbroadcastd m5, [o(vvc_pw_16384)] + punpcklwd m4, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m3, m2 + punpckhwd m3, m2 + REPX {pmulhrsw x, m5}, m4, m1, m0, m3 + punpckldq m2, m3, m1 + punpckhdq m3, m1 + punpckhdq m1, m0, m4 + punpckldq m0, m4 + jmp tx2q +.pass2: + call m(iadst_4x16_internal_8).main + vpbroadcastd m5, [o(vvc_pw_64x8)] + paddsw m1, m2, m4 + psubsw m2, m4 + pmulhrsw m1, m5 ; -out7 out4 out6 -out5 + pmulhrsw m2, m5 ; out8 -out11 -out9 out10 + vpbroadcastd m6, [o(vvc_pw_2048)] + pshufd m1, m1, q1032 + vpblendd m4, m0, m2, 0x33 + vpblendd m0, m1, 0xcc + vpblendd m1, m3, 0xcc + vpblendd m2, m3, 0x33 + vpermq m0, m0, q3120 + vpermq m1, m1, q0213 + vpermq m2, m2, q2031 + vpermq m3, m4, q1302 + psubw m5, m7, m6 + jmp m(iadst_4x16_internal_8).end + +INV_TXFM_4X16_FN identity, dct2 +INV_TXFM_4X16_FN identity, adst +INV_TXFM_4X16_FN identity, flipadst 
+INV_TXFM_4X16_FN identity, identity + +cglobal iidentity_4x16_internal_8, 0, 5, 11, dst, stride, c, eob, tx2 + mova m3, [cq+32*0] + mova m2, [cq+32*1] + mova m4, [cq+32*2] + mova m5, [cq+32*3] + vpbroadcastd m8, [o(vvc_pw_1697x8)] + pcmpeqw m0, m0 ; -1 + punpcklwd m1, m3, m2 + punpckhwd m3, m2 + punpcklwd m2, m4, m5 + punpckhwd m4, m5 + pmulhrsw m5, m8, m1 + pmulhrsw m6, m8, m2 + pmulhrsw m7, m8, m3 + pmulhrsw m8, m4 + pcmpeqw m9, m0, m1 ; we want to do a signed avg, but pavgw is + pxor m1, m9 ; unsigned. as long as both signs are equal + pcmpeqw m9, m0, m2 ; it still works, but if the input is -1 the + pxor m2, m9 ; pmulhrsw result will become 0 which causes + pcmpeqw m9, m0, m3 ; pavgw to output -32768 instead of 0 unless + pxor m3, m9 ; we explicitly deal with that case here. + pcmpeqw m0, m4 + pxor m4, m0 + pavgw m1, m5 + pavgw m2, m6 + pavgw m3, m7 + pavgw m4, m8 + punpckldq m0, m1, m2 + punpckhdq m1, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + jmp tx2q +.pass2: + vpbroadcastd m8, [o(vvc_pw_1697x16)] + vpbroadcastd m5, [o(vvc_pw_2048)] + pmulhrsw m4, m8, m0 + pmulhrsw m6, m8, m1 + pmulhrsw m7, m8, m2 + pmulhrsw m8, m3 + REPX {paddsw x, x}, m0, m1, m2, m3 + paddsw m0, m4 + paddsw m1, m6 + paddsw m2, m7 + paddsw m3, m8 + jmp m(iadst_4x16_internal_8).end2 + +%macro WRITE_8X4 4-7 strideq*1, strideq*2, r3 ; coefs[1-2], tmp[1-2], off[1-3] + movq xm%3, [dstq ] + movhps xm%3, [dstq+%5] + movq xm%4, [dstq+%6] + movhps xm%4, [dstq+%7] + pmovzxbw m%3, xm%3 + pmovzxbw m%4, xm%4 +%ifnum %1 + paddw m%3, m%1 +%else + paddw m%3, %1 +%endif +%ifnum %2 + paddw m%4, m%2 +%else + paddw m%4, %2 +%endif + packuswb m%3, m%4 + vextracti128 xm%4, m%3, 1 + movq [dstq ], xm%3 + movhps [dstq+%6], xm%3 + movq [dstq+%5], xm%4 + movhps [dstq+%7], xm%4 +%endmacro + +%macro INV_TXFM_8X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x4 +%ifidn %1_%2, dct2_dct2 + movd xm1, [o(vvc_pw_64x8)] + pmulhrsw xm0, xm1, [cq] + mov [cq], eobd + pmulhrsw xm0, xm1 + jmp m(vvc_inv_dct2_dct2_8x8_8).dconly2 
+%endif +%endmacro + +INV_TXFM_8X4_FN dct2, dct2 +INV_TXFM_8X4_FN dct2, adst +INV_TXFM_8X4_FN dct2, flipadst +INV_TXFM_8X4_FN dct2, identity + +cglobal idct2_8x4_internal_8, 0, 5, 7, dst, stride, c, eob, tx2 + vpbroadcastd xm3, [o(vvc_pw_64x8)] + pmulhrsw xm0, xm3, [cq+16*0] + pmulhrsw xm1, xm3, [cq+16*1] + pmulhrsw xm2, xm3, [cq+16*2] + pmulhrsw xm3, [cq+16*3] + call m(idct2_4x8_internal_8).main + vbroadcasti128 m4, [o(deint_shuf)] + vinserti128 m3, m1, xm3, 1 + vinserti128 m1, m0, xm2, 1 + shufps m0, m1, m3, q0220 + shufps m1, m3, q1331 + pshufb m0, m4 + pshufb m1, m4 + jmp tx2q +.pass2: + IDCT2_4_1D_PACKED + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + jmp m(iadst_8x4_internal_8).end2 + +INV_TXFM_8X4_FN adst, dct2 +INV_TXFM_8X4_FN adst, adst +INV_TXFM_8X4_FN adst, flipadst +INV_TXFM_8X4_FN adst, identity + +cglobal iadst_8x4_internal_8, 0, 5, 7, dst, stride, c, eob, tx2 + vpbroadcastd xm0, [o(vvc_pw_64x8)] + pshufd xm4, [cq+16*0], q1032 + pmulhrsw xm3, xm0, [cq+16*3] + pshufd xm5, [cq+16*1], q1032 + pmulhrsw xm2, xm0, [cq+16*2] + pmulhrsw xm4, xm0 + pmulhrsw xm5, xm0 + call m(iadst_4x8_internal_8).main_pass1 + vinserti128 m0, xm2, 1 + vinserti128 m1, xm3, 1 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + pxor m3, m3 + psubsw m3, m2 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + jmp tx2q +.pass2: + call .main +.end: + vpermq m0, m0, q3120 + vpermq m1, m1, q3120 +.end2: + vpbroadcastd m2, [o(vvc_pw_2048)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + WIN64_RESTORE_XMM +.end3: + pxor m2, m2 + mova [cq+32*0], m2 + mova [cq+32*1], m2 + lea r3, [strideq*3] + WRITE_8X4 0, 1, 4, 5 + RET +ALIGN function_align +cglobal_label .main + IADST4_1D_PACKED + ret + +INV_TXFM_8X4_FN flipadst, dct2 +INV_TXFM_8X4_FN flipadst, adst +INV_TXFM_8X4_FN flipadst, flipadst +INV_TXFM_8X4_FN flipadst, identity + +cglobal iflipadst_8x4_internal_8, 0, 5, 7, dst, stride, c, eob, tx2 + vpbroadcastd xm0, [o(vvc_pw_64x8)] + pshufd xm4, [cq+16*0], q1032 + pmulhrsw xm3, xm0, [cq+16*3] + pshufd xm5, [cq+16*1], 
q1032 + pmulhrsw xm2, xm0, [cq+16*2] + pmulhrsw xm4, xm0 + pmulhrsw xm5, xm0 + call m(iadst_4x8_internal_8).main_pass1 + vinserti128 m3, xm1, 1 + vinserti128 m2, xm0, 1 + punpckhwd m1, m3, m2 + punpcklwd m3, m2 + pxor m0, m0 + psubsw m0, m1 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + jmp tx2q +.pass2: + call m(iadst_8x4_internal_8).main + mova m2, m1 + vpermq m1, m0, q2031 + vpermq m0, m2, q2031 + jmp m(iadst_8x4_internal_8).end2 + +INV_TXFM_8X4_FN identity, dct2 +INV_TXFM_8X4_FN identity, adst +INV_TXFM_8X4_FN identity, flipadst +INV_TXFM_8X4_FN identity, identity + +cglobal iidentity_8x4_internal_8, 0, 5, 7, dst, stride, c, eob, tx2 + mova xm2, [cq+16*0] + mova xm0, [cq+16*1] + vinserti128 m2, [cq+16*2], 1 + vinserti128 m0, [cq+16*3], 1 + vpbroadcastd m3, [o(vvc_pw_64x8)] + punpcklwd m1, m2, m0 + punpckhwd m2, m0 + pmulhrsw m1, m3 + pmulhrsw m2, m3 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + paddsw m0, m0 + paddsw m1, m1 + jmp tx2q +.pass2: + vpbroadcastd m3, [o(vvc_pw_1697x8)] + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + paddsw m0, m2 + paddsw m1, m3 + jmp m(iadst_8x4_internal_8).end + +%macro INV_TXFM_8X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x8 +%ifidn %1_%2, dct2_dct2 + movd xm1, [o(vvc_pw_64x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(vvc_pw_16384)] + mov [cq], eobd + or r3d, 8 +.dconly: + pmulhrsw xm0, xm2 +.dconly2: + movd xm2, [vvc_pw_2048] + pmulhrsw xm0, xm1 + lea r2, [strideq*3] + pmulhrsw xm0, xm2 + vpbroadcastw m0, xm0 +.dconly_loop: + WRITE_8X4 0, 0, 1, 2, strideq*1, strideq*2, r2 + lea dstq, [dstq+strideq*4] + sub r3d, 4 + jg .dconly_loop + RET +%endif +%endmacro + +INV_TXFM_8X8_FN dct2, dct2 +INV_TXFM_8X8_FN dct2, adst +INV_TXFM_8X8_FN dct2, flipadst +INV_TXFM_8X8_FN dct2, identity + +cglobal idct2_8x8_internal_8, 0, 5, 7, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q3120 ; 0 1 + vpermq m3, [cq+32*3], q3120 ; 6 7 + vpermq m2, [cq+32*2], q3120 ; 4 5 + vpermq m1, [cq+32*1], q3120 ; 2 3 + call .main + shufps m4, m0, m1, q0220 + shufps m5, m0, 
m1, q1331 + shufps m1, m2, m3, q0220 + shufps m3, m2, m3, q1331 + vbroadcasti128 m0, [o(deint_shuf)] + vpbroadcastd m2, [o(vvc_pw_16384)] + REPX {pshufb x, m0}, m4, m5, m1, m3 + REPX {pmulhrsw x, m2}, m4, m5, m1, m3 + vinserti128 m0, m4, xm1, 1 + vperm2i128 m2, m4, m1, 0x31 + vinserti128 m1, m5, xm3, 1 + vperm2i128 m3, m5, m3, 0x31 + jmp tx2q +.pass2: + call .main + vpbroadcastd m4, [o(vvc_pw_2048)] + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + vpermq m2, m2, q3120 + vpermq m3, m3, q2031 + jmp m(iadst_8x8_internal_8).end2 +ALIGN function_align +cglobal_label .main + IDCT2_8_1D_PACKED + ret + +INV_TXFM_8X8_FN adst, dct2 +INV_TXFM_8X8_FN adst, adst +INV_TXFM_8X8_FN adst, flipadst +INV_TXFM_8X8_FN adst, identity + +cglobal iadst_8x8_internal_8, 0, 5, 7, dst, stride, c, eob, tx2 + vpermq m4, [cq+32*0], q1302 ; 1 0 + vpermq m3, [cq+32*3], q3120 ; 6 7 + vpermq m5, [cq+32*1], q1302 ; 3 2 + vpermq m2, [cq+32*2], q3120 ; 4 5 + call .main_pass1 + vpbroadcastd m5, [o(vvc_pw_16384)] + punpcklwd m4, m0, m1 + punpckhwd m0, m1 + punpcklwd m1, m2, m3 + punpckhwd m2, m3 + pxor m3, m3 + psubw m3, m5 ; negate odd elements during rounding + pmulhrsw m4, m5 + pmulhrsw m0, m3 + pmulhrsw m1, m5 + pmulhrsw m2, m3 + punpcklwd m3, m4, m0 + punpckhwd m4, m0 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + vperm2i128 m2, m3, m0, 0x31 + vinserti128 m0, m3, xm0, 1 + vperm2i128 m3, m4, m1, 0x31 + vinserti128 m1, m4, xm1, 1 + jmp tx2q +.pass2: + pshufd m4, m0, q1032 + pshufd m5, m1, q1032 + call .main_pass2 + vpbroadcastd m5, [o(vvc_pw_2048)] + vpbroadcastd xm4, [o(vvc_pw_4096)] + psubw m4, m5 ; lower half = 2048, upper half = -2048 +.end: + REPX {vpermq x, x, q3120}, m0, m1, m2, m3 +.end2: + pmulhrsw m0, m4 + pmulhrsw m1, m4 +.end3: + pmulhrsw m2, m4 + pmulhrsw m3, m4 + WIN64_RESTORE_XMM +.end4: + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + mova [cq+32*2], m4 + mova [cq+32*3], m4 + lea r3, [strideq*3] + WRITE_8X4 0, 1, 4, 5 + lea dstq, [dstq+strideq*4] + WRITE_8X4 2, 3, 4, 5 + RET 
+ALIGN function_align +.main_pass1: + IADST8_1D_PACKED 1 + ret +ALIGN function_align +cglobal_label .main_pass2 + IADST8_1D_PACKED 2 + ret + +INV_TXFM_8X8_FN flipadst, dct2 +INV_TXFM_8X8_FN flipadst, adst +INV_TXFM_8X8_FN flipadst, flipadst +INV_TXFM_8X8_FN flipadst, identity + +cglobal iflipadst_8x8_internal_8, 0, 5, 7, dst, stride, c, eob, tx2 + vpermq m4, [cq+32*0], q1302 ; 1 0 + vpermq m3, [cq+32*3], q3120 ; 6 7 + vpermq m5, [cq+32*1], q1302 ; 3 2 + vpermq m2, [cq+32*2], q3120 ; 4 5 + call m(iadst_8x8_internal_8).main_pass1 + vpbroadcastd m5, [o(vvc_pw_16384)] + punpckhwd m4, m3, m2 + punpcklwd m3, m2 + punpckhwd m2, m1, m0 + punpcklwd m1, m0 + pxor m0, m0 + psubw m0, m5 + pmulhrsw m4, m0 + pmulhrsw m3, m5 + pmulhrsw m2, m0 + pmulhrsw m1, m5 + punpckhwd m0, m4, m3 + punpcklwd m4, m3 + punpckhwd m3, m2, m1 + punpcklwd m2, m1 + vinserti128 m1, m0, xm3, 1 + vperm2i128 m3, m0, m3, 0x31 + vinserti128 m0, m4, xm2, 1 + vperm2i128 m2, m4, m2, 0x31 + jmp tx2q +.pass2: + pshufd m4, m0, q1032 + pshufd m5, m1, q1032 + call m(iadst_8x8_internal_8).main_pass2 + vpbroadcastd m4, [o(vvc_pw_2048)] + vpbroadcastd xm5, [o(vvc_pw_4096)] + psubw m4, m5 ; lower half = -2048, upper half = 2048 + vpermq m5, m3, q2031 + vpermq m3, m0, q2031 + vpermq m0, m2, q2031 + vpermq m2, m1, q2031 + pmulhrsw m1, m0, m4 + pmulhrsw m0, m5, m4 + jmp m(iadst_8x8_internal_8).end3 + +INV_TXFM_8X8_FN identity, dct2 +INV_TXFM_8X8_FN identity, adst +INV_TXFM_8X8_FN identity, flipadst +INV_TXFM_8X8_FN identity, identity + +cglobal iidentity_8x8_internal_8, 0, 5, 7, dst, stride, c, eob, tx2 + mova xm3, [cq+16*0] + mova xm2, [cq+16*1] + vinserti128 m3, [cq+16*4], 1 + vinserti128 m2, [cq+16*5], 1 + mova xm4, [cq+16*2] + mova xm0, [cq+16*3] + vinserti128 m4, [cq+16*6], 1 + vinserti128 m0, [cq+16*7], 1 + punpcklwd m1, m3, m2 + punpckhwd m3, m2 + punpcklwd m2, m4, m0 + punpckhwd m4, m0 + punpckldq m0, m1, m2 + punpckhdq m1, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + jmp tx2q +.pass2: + vpbroadcastd m4, 
[o(vvc_pw_4096)] + jmp m(iadst_8x8_internal_8).end + +%macro INV_TXFM_8X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x16 +%ifidn %1_%2, dct2_dct2 + movd xm1, [o(vvc_pw_64x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(vvc_pw_16384)] + mov [cq], eobd + pmulhrsw xm0, xm1 + or r3d, 16 + jmp m(vvc_inv_dct2_dct2_8x8_8).dconly +%endif +%endmacro + +%macro ITX_8X16_LOAD_COEFS 0 + vpbroadcastd m4, [o(vvc_pw_64x8)] + pmulhrsw m0, m4, [cq+32*0] + add cq, 32*4 + pmulhrsw m7, m4, [cq+32*3] + pmulhrsw m1, m4, [cq-32*3] + pmulhrsw m6, m4, [cq+32*2] + pmulhrsw m2, m4, [cq-32*2] + pmulhrsw m5, m4, [cq+32*1] + pmulhrsw m3, m4, [cq-32*1] + pmulhrsw m4, [cq+32*0] +%endmacro + +INV_TXFM_8X16_FN dct2, dct2 +INV_TXFM_8X16_FN dct2, adst +INV_TXFM_8X16_FN dct2, flipadst +INV_TXFM_8X16_FN dct2, identity + +cglobal idct2_8x16_internal_8, 0, 5, 13, dst, stride, c, eob, tx2 + ITX_8X16_LOAD_COEFS + call m(idct2_16x8_internal_8).main + vpbroadcastd m10, [o(vvc_pw_16384)] +.pass1_end: + vperm2i128 m9, m3, m7, 0x31 + vinserti128 m3, xm7, 1 + vperm2i128 m8, m2, m6, 0x31 + vinserti128 m2, xm6, 1 + vperm2i128 m6, m1, m5, 0x31 + vinserti128 m1, xm5, 1 + vperm2i128 m5, m0, m4, 0x31 + vinserti128 m0, xm4, 1 + punpckhwd m4, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m0, m1 + punpcklwd m0, m1 +.pass1_end2: + punpckhwd m7, m5, m6 + punpcklwd m5, m6 + punpcklwd m6, m8, m9 + punpckhwd m8, m9 + REPX {pmulhrsw x, m10}, m2, m0, m4, m3, m5, m6, m7, m8 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + punpckldq m4, m5, m6 + punpckhdq m5, m6 + punpckldq m6, m7, m8 + punpckhdq m7, m8 + jmp tx2q +.pass2: + call .main + REPX {vpermq x, x, q3120}, m0, m2, m4, m6 + REPX {vpermq x, x, q2031}, m1, m3, m5, m7 +.end: + vpbroadcastd m8, [o(vvc_pw_2048)] +.end2: + REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 +.end3: + pxor m8, m8 + REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3 + lea r3, [strideq*3] + WRITE_8X4 0, 1, 8, 9 + lea dstq, [dstq+strideq*4] + WRITE_8X4 2, 3, 0, 1 + 
lea dstq, [dstq+strideq*4] + WRITE_8X4 4, 5, 0, 1 + lea dstq, [dstq+strideq*4] + WRITE_8X4 6, 7, 0, 1 + RET +ALIGN function_align +cglobal_label .main + IDCT2_16_1D_PACKED + ret + +INV_TXFM_8X16_FN adst, dct2 +INV_TXFM_8X16_FN adst, adst +INV_TXFM_8X16_FN adst, flipadst +INV_TXFM_8X16_FN adst, identity + +cglobal iadst_8x16_internal_8, 0, 5, 13, dst, stride, c, eob, tx2 + ITX_8X16_LOAD_COEFS + call m(iadst_16x8_internal_8).main + call m(iadst_16x8_internal_8).main_pass1_end + vpbroadcastd m10, [o(vvc_pw_16384)] + pslld m9, m10, 17 + psubw m10, m9 ; 16384, -16384 + jmp m(idct2_8x16_internal_8).pass1_end +ALIGN function_align +.pass2: + call .main + call .main_pass2_end + vpbroadcastd m9, [o(vvc_pw_2048)] + vpbroadcastd xm8, [o(vvc_pw_4096)] + psubw m8, m9 + REPX {vpermq x, x, q2031}, m0, m1, m2, m3 + REPX {vpermq x, x, q3120}, m4, m5, m6, m7 + jmp m(idct2_8x16_internal_8).end2 +ALIGN function_align +cglobal_label .main + REPX {pshufd x, x, q1032}, m7, m1, m5, m3 +.main2: + vpbroadcastd m10, [o(vvc_pd_64)] + punpckhwd m8, m7, m0 ; in14 in1 + punpcklwd m0, m7 ; in0 in15 + punpcklwd m7, m6, m1 ; in12 in3 + punpckhwd m1, m6 ; in2 in13 + punpckhwd m6, m5, m2 ; in10 in5 + punpcklwd m2, m5 ; in4 in11 + punpcklwd m5, m4, m3 ; in8 in7 + punpckhwd m3, m4 ; in6 in9 + ITX_MUL2X_PACK 0, 4, 9, 10, 201, 4091, 3 ; t0 t1 + ITX_MUL2X_PACK 1, 4, 9, 10, 995, 3973, 3 ; t2 t3 + ITX_MUL2X_PACK 2, 4, 9, 10, 1751, 3703, 3 ; t4 t5 + ITX_MUL2X_PACK 3, 4, 9, 10, 2440, 3290, 3 ; t6 t7 + ITX_MUL2X_PACK 5, 4, 9, 10, 3035, 2751, 3 ; t8 t9 + ITX_MUL2X_PACK 6, 4, 9, 10, 3513, 2106, 3 ; t10 t11 + ITX_MUL2X_PACK 7, 4, 9, 10, 3857, 1380, 3 ; t12 t13 + ITX_MUL2X_PACK 8, 4, 9, 10, 4052, 601, 3 ; t14 t15 + psubsw m4, m0, m5 ; t9a t8a + paddsw m0, m5 ; t1a t0a + psubsw m5, m1, m6 ; t11a t10a + paddsw m1, m6 ; t3a t2a + psubsw m6, m2, m7 ; t13a t12a + paddsw m2, m7 ; t5a t4a + psubsw m7, m3, m8 ; t15a t14a + paddsw m3, m8 ; t7a t6a + vpbroadcastd m11, [o(vvc_pw_m89_18)] + vpbroadcastd m12, [o(vvc_pw_18_89)] 
+ pxor m9, m9 + ITX_MUL2X_PACK 4, 8, _, 10, 11, 12, 6 ; t8 t9 + psubw m8, m9, m11 ; vvc_pw_89_m18 + ITX_MUL2X_PACK 6, 12, _, 10, 12, 8, 6 ; t12 t13 + vpbroadcastd m11, [o(vvc_pw_m50_75)] + vpbroadcastd m12, [o(vvc_pw_75_50)] + ITX_MUL2X_PACK 5, 8, _, 10, 11, 12, 6 ; t10 t11 + psubw m8, m9, m11 ; vvc_pw_50_m75 + ITX_MUL2X_PACK 7, 12, _, 10, 12, 8, 6 ; t14 t15 + psubsw m8, m1, m3 ; t7 t6 + paddsw m1, m3 ; t3 t2 + psubsw m3, m0, m2 ; t5 t4 + paddsw m0, m2 ; t1 t0 + psubsw m2, m5, m7 ; t14a t15a + paddsw m7, m5 ; t10a t11a + psubsw m5, m4, m6 ; t12a t13a + paddsw m4, m6 ; t8a t9a + vpbroadcastd m11, [o(vvc_pw_m83_36)] + vpbroadcastd m12, [o(vvc_pw_36_83)] + ITX_MUL2X_PACK 3, 6, _, 10, 12, 11, 6 ; t5a t4a + psubw m6, m9, m11 ; vvc_pw_83_m36 + ITX_MUL2X_PACK 8, 6, _, 10, 6, 12, 6 ; t7a t6a + vpbroadcastd m11, [o(vvc_pw_m36_83)] + vpbroadcastd m12, [o(vvc_pw_83_36)] + ITX_MUL2X_PACK 2, 6, _, 10, 11, 12, 6 ; t15 t14 + psubw m6, m9, m11 ; vvc_pw_36_m83 + ITX_MUL2X_PACK 5, 12, _, 10, 12, 6, 6 ; t13 t12 + vbroadcasti128 m12, [o(deint_shuf)] + paddsw m6, m4, m7 ; -out1 out14 + psubsw m4, m7 ; t10 t11 + psubsw m11, m3, m8 ; t7 t6 + paddsw m8, m3 ; out12 -out3 + psubsw m3, m0, m1 ; t3a t2a + paddsw m0, m1 ; -out15 out0 + paddsw m1, m2, m5 ; -out13 out2 + psubsw m5, m2 ; t15a t14a + pshufb m0, m12 + pshufb m6, m12 + pshufb m8, m12 + pshufb m1, m12 + shufps m7, m6, m0, q1032 ; out14 -out15 + vpblendd m0, m6, 0x33 ; -out1 out0 + punpcklqdq m6, m8, m1 ; out12 -out13 + punpckhqdq m1, m8, m1 ; -out3 out2 + ret +ALIGN function_align +.main_pass1_end: + vpbroadcastd m8, [o(vvc_pw_m64_64)] + vpbroadcastd m12, [o(vvc_pw_64_64)] + pmaddwd m9, m8, m11 ; -out11 + pmaddwd m2, m12, m5 ; -out5 + pmaddwd m5, m8 ; out10 + pmaddwd m11, m12 ; out4 + REPX {paddd x, m10}, m9, m5, m2, m11 + REPX {psrad x, 7 }, m9, m5, m2, m11 + packssdw m5, m9 ; out10 -out11 + packssdw m2, m11 ; -out5 out4 + pmaddwd m11, m8, m3 ; out8 + vpbroadcastd m8, [o(vvc_pw_64_m64)] + pmaddwd m3, m12 ; -out7 + pmaddwd m8, m4 ; 
-out9 + pmaddwd m4, m12 ; out6 + REPX {paddd x, m10}, m11, m3, m8, m4 + REPX {psrad x, 7 }, m11, m3, m8, m4 + packssdw m3, m4 ; -out7 out6 + packssdw m4, m11, m8 ; out8 -out9 + vpbroadcastd m10, [o(vvc_pw_16384)] + pxor m9, m9 + ret +ALIGN function_align +cglobal_label .main_pass2_end + vpbroadcastd m8, [o(vvc_pw_64x8)] + pshufb m2, m11, m12 + pshufb m5, m12 + pshufb m3, m12 + pshufb m4, m12 + punpcklqdq m11, m5, m2 ; t15a t7 + punpckhqdq m5, m2 ; t14a t6 + shufps m2, m3, m4, q1032 ; t2a t10 + vpblendd m3, m4, 0xcc ; t3a t11 + psubsw m4, m2, m3 ; out8 -out9 + paddsw m3, m2 ; -out7 out6 + paddsw m2, m5, m11 ; -out5 out4 + psubsw m5, m11 ; out10 -out11 + REPX {pmulhrsw x, m8}, m2, m3, m4, m5 + ret + +INV_TXFM_8X16_FN flipadst, dct2 +INV_TXFM_8X16_FN flipadst, adst +INV_TXFM_8X16_FN flipadst, flipadst +INV_TXFM_8X16_FN flipadst, identity + +cglobal iflipadst_8x16_internal_8, 0, 5, 13, dst, stride, c, eob, tx2 + ITX_8X16_LOAD_COEFS + call m(iadst_16x8_internal_8).main + call m(iadst_16x8_internal_8).main_pass1_end + vpbroadcastd m9, [o(vvc_pw_16384)] + pslld m10, m9, 17 + psubw m10, m9 ; -16384, 16384 + vperm2i128 m9, m4, m0, 0x31 + vinserti128 m0, m4, xm0, 1 + vperm2i128 m8, m5, m1, 0x31 + vinserti128 m4, m5, xm1, 1 + vperm2i128 m5, m7, m3, 0x31 + vinserti128 m3, m7, xm3, 1 + vinserti128 m1, m6, xm2, 1 + vperm2i128 m6, m6, m2, 0x31 + punpcklwd m2, m4, m0 + punpckhwd m4, m0 + punpcklwd m0, m3, m1 + punpckhwd m3, m1 + jmp m(idct2_8x16_internal_8).pass1_end2 +.pass2: + call m(iadst_8x16_internal_8).main + call m(iadst_8x16_internal_8).main_pass2_end + vpbroadcastd m8, [o(vvc_pw_2048)] + vpbroadcastd xm9, [o(vvc_pw_4096)] + psubw m8, m9 + vpermq m9, m0, q3120 + vpermq m0, m7, q2031 + vpermq m7, m1, q3120 + vpermq m1, m6, q2031 + vpermq m6, m2, q3120 + vpermq m2, m5, q2031 + vpermq m5, m3, q3120 + vpermq m3, m4, q2031 + pmulhrsw m0, m8 + pmulhrsw m1, m8 + pmulhrsw m2, m8 + pmulhrsw m3, m8 + pmulhrsw m4, m5, m8 + pmulhrsw m5, m6, m8 + pmulhrsw m6, m7, m8 + pmulhrsw m7, m9, 
m8
+    jmp             m(idct2_8x16_internal_8).end3
+
+INV_TXFM_8X16_FN identity, dct2
+INV_TXFM_8X16_FN identity, adst
+INV_TXFM_8X16_FN identity, flipadst
+INV_TXFM_8X16_FN identity, identity
+
+; IDTX16: tmp = pmulhrsw(src, const). With three arguments src is doubled
+; (saturating) before tmp is added; with the optional fourth argument tmp is
+; instead multiplied by it with pmulhrsw and src is left undoubled.
+%macro IDTX16 3-4 ; src/dst, tmp, vvc_pw_1697x16, [vvc_pw_16384]
+    pmulhrsw        m%2, m%3, m%1
+%if %0 == 4 ; if downshifting by 1
+    pmulhrsw        m%2, m%4
+%else
+    paddsw          m%1, m%1
+%endif
+    paddsw          m%1, m%2
+%endmacro
+
+; Identity 8x16. Pass 1 gathers sixteen 16-byte groups from cq into eight
+; ymm registers (low lane = group k, high lane = group k+8), interleaves
+; words, scales by vvc_pw_64x8, interleaves dwords and jumps to the
+; second-pass routine in tx2q.
+cglobal iidentity_8x16_internal_8, 0, 5, 13, dst, stride, c, eob, tx2
+    mova            xm3, [cq+16*0]
+    mova            xm2, [cq+16*2]
+    add             cq, 16*8 ; re-base cq so the remaining offsets are +/-16*7
+    vinserti128     m3, [cq+16*0], 1
+    vinserti128     m2, [cq+16*2], 1
+    vpbroadcastd    m9, [o(vvc_pw_64x8)]
+    mova            xm4, [cq-16*4]
+    mova            xm5, [cq-16*2]
+    vinserti128     m4, [cq+16*4], 1
+    vinserti128     m5, [cq+16*6], 1
+    mova            xm7, [cq-16*7]
+    mova            xm6, [cq-16*5]
+    vinserti128     m7, [cq+16*1], 1
+    vinserti128     m6, [cq+16*3], 1
+    mova            xm8, [cq-16*3]
+    mova            xm0, [cq-16*1]
+    vinserti128     m8, [cq+16*5], 1
+    vinserti128     m0, [cq+16*7], 1
+    punpcklwd       m1, m3, m2
+    punpckhwd       m3, m2
+    punpcklwd       m2, m4, m5
+    punpckhwd       m4, m5
+    punpcklwd       m5, m7, m6
+    punpckhwd       m7, m6
+    punpcklwd       m6, m8, m0
+    punpckhwd       m8, m0
+    REPX {pmulhrsw x, m9}, m1, m2, m3, m4, m5, m6, m7, m8
+    punpckldq       m0, m1, m2
+    punpckhdq       m1, m2
+    punpckldq       m2, m3, m4
+    punpckhdq       m3, m4
+    punpckldq       m4, m5, m6
+    punpckhdq       m5, m6
+    punpckldq       m6, m7, m8
+    punpckhdq       m7, m8
+    jmp             tx2q
+; Pass 2: reorder qwords, apply IDTX16 (three-argument form, tmp = m9,
+; constant = m8) to m0-m7, then share the idct2_8x16 store tail.
+.pass2:
+    vpbroadcastd    m8, [o(vvc_pw_1697x16)]
+    REPX {vpermq x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 7
+    jmp             m(idct2_8x16_internal_8).end
+
+; WRITE_16X2: zero-extends 16 bytes from each of [dstq+%5] and [dstq+%6] to
+; words, adds the two coefficient operands (register number or memory
+; operand), packs with unsigned saturation, restores lane order with vpermq
+; and stores the two 16-byte results back.
+%macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2]
+    pmovzxbw        m%3, [dstq+%5]
+%ifnum %1
+    paddw           m%3, m%1
+%else
+    paddw           m%3, %1
+%endif
+    pmovzxbw        m%4, [dstq+%6]
+%ifnum %2
+    paddw           m%4, m%2
+%else
+    paddw           m%4, %2
+%endif
+    packuswb        m%3, m%4
+    vpermq          m%3, m%3, q3120
+    mova            [dstq+%5], xm%3
+    vextracti128    [dstq+%6], m%3, 1
+%endmacro
+
+%macro INV_TXFM_16X4_FN 2 ; type1, type2
+    INV_TXFM_FN     %1, %2, 16x4
+%ifidn %1_%2, dct2_dct2
+    movd            xm1, [o(vvc_pw_64x8)]
+ pmulhrsw xm0, xm1, [cq] + movd xm2, [o(vvc_pw_16384)] + mov [cq], eobd + or r3d, 4 +.dconly: + pmulhrsw xm0, xm2 + movd xm2, [vvc_pw_2048] ; intentionally rip-relative + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm2 + vpbroadcastw m0, xm0 + pxor m3, m3 +.dconly_loop: + mova xm1, [dstq+strideq*0] + vinserti128 m1, [dstq+strideq*1], 1 + punpckhbw m2, m1, m3 + punpcklbw m1, m3 + paddw m2, m0 + paddw m1, m0 + packuswb m1, m2 + mova [dstq+strideq*0], xm1 + vextracti128 [dstq+strideq*1], m1, 1 + lea dstq, [dstq+strideq*2] + sub r3d, 2 + jg .dconly_loop + RET +%endif +%endmacro + +INV_TXFM_16X4_FN dct2, dct2 +INV_TXFM_16X4_FN dct2, adst +INV_TXFM_16X4_FN dct2, flipadst +INV_TXFM_16X4_FN dct2, identity + +cglobal idct2_16x4_internal_8, 0, 5, 11, dst, stride, c, eob, tx2 + mova xm0, [cq+16*0] + mova xm1, [cq+16*1] + mova xm2, [cq+16*2] + mova xm3, [cq+16*3] + mova xm4, [cq+16*4] + mova xm5, [cq+16*5] + mova xm6, [cq+16*6] + mova xm7, [cq+16*7] + call m(idct2_4x16_internal_8).main + vinserti128 m6, m2, xm6, 1 + vinserti128 m2, m0, xm4, 1 + vinserti128 m0, m1, xm5, 1 + vinserti128 m1, m3, xm7, 1 + punpcklwd m3, m2, m6 + punpckhwd m2, m6 + vpbroadcastd m6, [o(vvc_pw_16384)] + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + mova m1, m6 + jmp m(iadst_16x4_internal_8).pass1_end +.pass2: + call .main + jmp m(iadst_16x4_internal_8).end +ALIGN function_align +cglobal_label .main + vpbroadcastd m6, [o(vvc_pd_64)] + IDCT2_4_1D 0, 1, 2, 3, 4, 5, 6 + ret + +INV_TXFM_16X4_FN adst, dct2 +INV_TXFM_16X4_FN adst, adst +INV_TXFM_16X4_FN adst, flipadst +INV_TXFM_16X4_FN adst, identity + +cglobal iadst_16x4_internal_8, 0, 5, 11, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q1230 + vpermq m3, [cq+32*3], q2103 + vpermq m1, [cq+32*1], q1230 + vpermq m2, [cq+32*2], q2103 + call m(iadst_4x16_internal_8).main2 + call m(iadst_4x16_internal_8).main_pass1_end + punpcklwd m4, m3, m1 + punpcklwd m5, m2, m0 + punpckhwd m0, m1 + punpckhwd m2, m3 + vpbroadcastd m1, [o(vvc_pw_16384)] + vinserti128 m3, m0, xm2, 1 + 
vperm2i128 m2, m0, m2, 0x31
+    vinserti128     m0, m4, xm5, 1
+    vperm2i128      m4, m4, m5, 0x31
+    psubw           m6, m7, m1
+; Shared pass-1 tail (also jumped to by idct2_16x4 and iflipadst_16x4):
+; scales m3/m4 by m1 and m2/m0 by m6, interleaves words then dwords, and
+; jumps to the second-pass routine whose address is in tx2q.
+.pass1_end:
+    pmulhrsw        m3, m1
+    pmulhrsw        m2, m6
+    pmulhrsw        m4, m1
+    pmulhrsw        m0, m6
+    punpcklwd       m1, m3, m2
+    punpckhwd       m3, m2
+    punpcklwd       m2, m4, m0
+    punpckhwd       m4, m0
+    punpckldq       m0, m1, m2
+    punpckhdq       m1, m2
+    punpckldq       m2, m3, m4
+    punpckhdq       m3, m4
+    jmp             tx2q
+.pass2:
+    call .main
+; .end scales m0-m3 by vvc_pw_2048, .end2 zeroes the four 32-byte coefficient
+; rows at cq, .end3 adds the results into dst two rows per WRITE_16X2.
+.end:
+    vpbroadcastd    m4, [o(vvc_pw_2048)]
+    REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+    WIN64_RESTORE_XMM
+.end2:
+    pxor            m4, m4
+    mova            [cq+32*0], m4
+    mova            [cq+32*1], m4
+    mova            [cq+32*2], m4
+    mova            [cq+32*3], m4
+.end3:
+    WRITE_16X2      0, 1, 4, 5, strideq*0, strideq*1
+    lea             dstq, [dstq+strideq*2]
+    WRITE_16X2      2, 3, 4, 5, strideq*0, strideq*1
+    RET
+ALIGN function_align
+; 4-point kernel on the word inputs in m0-m3 (in0..in3 per the comments).
+; Products are accumulated as dwords with pmaddwd; "rnd" in the comments is
+; the rounding term broadcast from vvc_pd_64 into m1. The sums are then
+; shifted right by 7 and packed back to words in m0-m3.
+; NOTE(review): the multiplier constants named here (3344/3803/1321/2482)
+; are much larger than the 64-scaled ones used by the DCT paths with the
+; same >>7 shift - confirm the intended scaling.
+cglobal_label .main
+    vpbroadcastd    m6, [o(vvc_pw_m3344_3344)]
+    vpbroadcastd    m7, [o(vvc_pw_3803_1321)]
+    vpbroadcastd    m8, [o(vvc_pw_m1321_2482)]
+    vpbroadcastd    m9, [o(vvc_pw_2482_3344)]
+    punpcklwd       m4, m2, m0 ; in2 in0 l
+    punpckhwd       m2, m0 ; in2 in0 h
+    psrld           m5, m6, 16
+    pmaddwd         m10, m6, m4 ; t2:02 l
+    pmaddwd         m6, m2 ; t2:02 h
+    pmaddwd         m0, m7, m4 ; t0:02 l
+    pmaddwd         m7, m2 ; t0:02 h
+    pmaddwd         m4, m8 ; t1:02 l
+    pmaddwd         m8, m2 ; t1:02 h
+    punpckhwd       m2, m3, m1 ; in3 in1 h
+    punpcklwd       m3, m1 ; in3 in1 l
+    pmaddwd         m1, m5, m2 ; t2:3 h
+    pmaddwd         m5, m3 ; t2:3 l
+    paddd           m6, m1
+    vpbroadcastd    m1, [o(vvc_pd_64)]
+    paddd           m10, m5
+    pmaddwd         m5, m9, m3
+    pmaddwd         m9, m2
+    paddd           m0, m1
+    paddd           m7, m1
+    paddd           m0, m5 ; t0 + t3 + rnd l
+    paddd           m7, m9 ; t0 + t3 + rnd h
+    vpbroadcastd    m9, [o(vvc_pw_m3803_3344)]
+    pmaddwd         m5, m9, m2
+    pmaddwd         m9, m3
+    paddd           m10, m1 ; t2 + rnd l
+    paddd           m6, m1 ; t2 + rnd h
+    paddd           m5, m1 ; t1:13 + rnd h
+    paddd           m1, m9 ; t1:13 + rnd l
+    vpbroadcastd    m9, [o(vvc_pw_m3803_m6688)]
+    pmaddwd         m2, m9
+    pmaddwd         m3, m9
+    paddd           m5, m8 ; t1 + t3 + rnd h
+    paddd           m1, m4 ; t1 + t3 + rnd l
+    paddd           m8, m7
+    paddd           m4, m0
+    paddd           m2, m8 ; t0 + t1 - t3 + rnd h
+    paddd           m3, m4 ; t0 + t1 - t3 + rnd l
+    REPX {psrad x,
7}, m10, m6, m0, m7, m5, m1, m2, m3 + packssdw m0, m7 + packssdw m1, m5 + packssdw m3, m2 + packssdw m2, m10, m6 + ret + +INV_TXFM_16X4_FN flipadst, dct2 +INV_TXFM_16X4_FN flipadst, adst +INV_TXFM_16X4_FN flipadst, flipadst +INV_TXFM_16X4_FN flipadst, identity + +cglobal iflipadst_16x4_internal_8, 0, 5, 11, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q1230 + vpermq m3, [cq+32*3], q2103 + vpermq m1, [cq+32*1], q1230 + vpermq m2, [cq+32*2], q2103 + call m(iadst_4x16_internal_8).main2 + call m(iadst_4x16_internal_8).main_pass1_end + punpckhwd m4, m3, m2 + punpckhwd m5, m1, m0 + punpcklwd m0, m2 + punpcklwd m1, m3 + vpbroadcastd m6, [o(vvc_pw_16384)] + vinserti128 m3, m0, xm1, 1 + vperm2i128 m2, m0, m1, 0x31 + vinserti128 m0, m4, xm5, 1 + vperm2i128 m4, m4, m5, 0x31 + psubw m1, m7, m6 + jmp m(iadst_16x4_internal_8).pass1_end +ALIGN function_align +.pass2: + call m(iadst_16x4_internal_8).main + vpbroadcastd m4, [o(vvc_pw_2048)] + REPX {pmulhrsw x, m4}, m3, m2, m1, m0 + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + mova [cq+32*2], m4 + mova [cq+32*3], m4 + WRITE_16X2 3, 2, 4, 5, strideq*0, strideq*1 + lea dstq, [dstq+strideq*2] + WRITE_16X2 1, 0, 4, 5, strideq*0, strideq*1 + RET + +INV_TXFM_16X4_FN identity, dct2 +INV_TXFM_16X4_FN identity, adst +INV_TXFM_16X4_FN identity, flipadst +INV_TXFM_16X4_FN identity, identity + +cglobal iidentity_16x4_internal_8, 0, 5, 11, dst, stride, c, eob, tx2 + mova xm2, [cq+16*0] + mova xm4, [cq+16*1] + vinserti128 m2, [cq+16*4], 1 + vinserti128 m4, [cq+16*5], 1 + mova xm0, [cq+16*2] + mova xm1, [cq+16*3] + vinserti128 m0, [cq+16*6], 1 + vinserti128 m1, [cq+16*7], 1 + vpbroadcastd m7, [o(vvc_pw_1697x16)] + vpbroadcastd m8, [o(vvc_pw_16384)] + punpcklwd m3, m2, m4 + punpckhwd m2, m4 + punpcklwd m4, m0, m1 + punpckhwd m0, m1 + punpcklwd m1, m3, m2 + punpckhwd m3, m2 + punpcklwd m2, m4, m0 + punpckhwd m4, m0 + pmulhrsw m0, m7, m1 + pmulhrsw m5, m7, m2 + pmulhrsw m6, m7, m3 + pmulhrsw m7, m4 + REPX {pmulhrsw x, m8}, m0, m5, m6, 
m7 + paddsw m1, m0 + paddsw m2, m5 + paddsw m3, m6 + paddsw m4, m7 + punpcklqdq m0, m1, m2 + punpckhqdq m1, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + jmp tx2q +.pass2: + vpbroadcastd m7, [o(vvc_pw_1697x8)] + pmulhrsw m4, m7, m0 + pmulhrsw m5, m7, m1 + pmulhrsw m6, m7, m2 + pmulhrsw m7, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + jmp m(iadst_16x4_internal_8).end + +%macro INV_TXFM_16X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x8 +%ifidn %1_%2, dct2_dct2 + movd xm1, [o(vvc_pw_64x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(vvc_pw_16384)] + mov [cq], eobd + pmulhrsw xm0, xm1 + or r3d, 8 + jmp m(vvc_inv_dct2_dct2_16x4_8).dconly +%endif +%endmacro + +%macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd + vpbroadcastd m8, [o(vvc_pw_64x8)] + vpermq m0, [cq+32*0], q3120 + add cq, 32*4 + vpermq m7, [cq+32*3], q%1 + vpermq m1, [cq-32*3], q%1 + vpermq m6, [cq+32*2], q3120 + vpermq m2, [cq-32*2], q3120 + vpermq m5, [cq+32*1], q%1 + vpermq m3, [cq-32*1], q%1 + vpermq m4, [cq+32*0], q3120 + REPX {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4 +%endmacro + +INV_TXFM_16X8_FN dct2, dct2 +INV_TXFM_16X8_FN dct2, adst +INV_TXFM_16X8_FN dct2, flipadst +INV_TXFM_16X8_FN dct2, identity + +cglobal idct2_16x8_internal_8, 0, 5, 13, dst, stride, c, eob, tx2 + ITX_16X8_LOAD_COEFS 3120 + call m(idct2_8x16_internal_8).main + vpbroadcastd m10, [o(vvc_pw_16384)] + punpckhwd m8, m0, m2 + punpcklwd m0, m2 + punpckhwd m2, m1, m3 + punpcklwd m1, m3 + punpcklwd m9, m4, m6 + punpckhwd m4, m6 + punpcklwd m6, m5, m7 + punpckhwd m5, m7 + REPX {pmulhrsw x, m10}, m8, m1, m4, m6 +.pass1_end: + REPX {pmulhrsw x, m10}, m0, m2, m9, m5 + punpckhwd m3, m0, m8 + punpcklwd m0, m8 + punpckhwd m8, m2, m1 + punpcklwd m2, m1 + punpcklwd m7, m9, m4 + punpckhwd m9, m4 + punpcklwd m4, m5, m6 + punpckhwd m5, m6 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + punpckldq m2, m3, m8 + punpckhdq m3, m8 + punpckldq m6, m7, m4 + punpckhdq m7, m4 + punpckldq m8, m9, m5 + punpckhdq m9, m5 + vperm2i128 m4, m0, m6, 
0x31 + vinserti128 m0, xm6, 1 + vperm2i128 m5, m1, m7, 0x31 + vinserti128 m1, xm7, 1 + vperm2i128 m6, m2, m8, 0x31 + vinserti128 m2, xm8, 1 + vperm2i128 m7, m3, m9, 0x31 + vinserti128 m3, xm9, 1 + jmp tx2q +.pass2: + call .main + vpbroadcastd m8, [o(vvc_pw_2048)] +.end: + REPX {pmulhrsw x, m8}, m0, m2, m4, m6 +.end2: + REPX {pmulhrsw x, m8}, m1, m3, m5, m7 + lea r3, [strideq*3] + WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r3 +.end3: + pxor m0, m0 + REPX {mova [cq+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 +.end4: + lea dstq, [dstq+strideq*4] + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 6, 7, 0, 1, strideq*2, r3 + RET +ALIGN function_align +cglobal_label .main + vpbroadcastd m10, [o(vvc_pd_64)] +.main2: + IDCT2_8_1D 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 + ret + +INV_TXFM_16X8_FN adst, dct2 +INV_TXFM_16X8_FN adst, adst +INV_TXFM_16X8_FN adst, flipadst +INV_TXFM_16X8_FN adst, identity + +cglobal iadst_16x8_internal_8, 0, 5, 13, dst, stride, c, eob, tx2 + ITX_16X8_LOAD_COEFS 1302 + call m(iadst_8x16_internal_8).main2 + call m(iadst_8x16_internal_8).main_pass1_end + psubw m11, m9, m10 + punpcklwd m8, m0, m2 + punpckhwd m0, m2 + punpckhwd m2, m1, m3 + punpcklwd m1, m3 + punpcklwd m9, m4, m6 + punpckhwd m4, m6 + punpckhwd m6, m5, m7 + punpcklwd m5, m7 + REPX {pmulhrsw x, m11}, m8, m1, m4, m6 + jmp m(idct2_16x8_internal_8).pass1_end +ALIGN function_align +.pass2: + call .main + call .main_pass2_end + pxor m8, m8 + psubw m8, m9 + REPX {pmulhrsw x, m9}, m0, m2, m4, m6 + jmp m(idct2_16x8_internal_8).end2 +ALIGN function_align +cglobal_label .main + vpbroadcastd m10, [o(vvc_pd_64)] + ITX_MULSUB_2W 7, 0, 8, 9, 10, 9, 90, 0 ; t1a, t0a + ITX_MULSUB_2W 3, 4, 8, 9, 10, 70, 57, 0 ; t5a, t4a + ITX_MULSUB_2W 1, 6, 8, 9, 10, 87, 25, 0 ; t7a, t6a + ITX_MULSUB_2W 5, 2, 8, 9, 10, 43, 80, 0 ; t3a, t2a + psubsw m8, m2, m6 ; t6 + paddsw m2, m6 ; t2 + psubsw m6, m0, m4 ; t4 + paddsw m0, m4 ; t0 + psubsw m4, m5, m1 ; t7 + paddsw m5, m1 ; t3 + psubsw 
m1, m7, m3 ; t5 + paddsw m7, m3 ; t1 + ITX_MULSUB_2W 6, 1, 3, 9, 10, 36, 83, 0 ; t5a, t4a + ITX_MULSUB_2W 4, 8, 3, 9, 10, 83, 36, 0 ; t6a, t7a + psubsw m9, m6, m8 ; t7 + paddsw m6, m8 ; out6 + psubsw m3, m7, m5 ; t3 + paddsw m7, m5 ; -out7 + psubsw m5, m0, m2 ; t2 + paddsw m0, m2 ; out0 + psubsw m2, m1, m4 ; t6 + paddsw m1, m4 ; -out1 + ret +ALIGN function_align +.main_pass1_end: ; pass-1 tail: builds -out3/out4 from t3 (m3) and t2 (m5), then out2/-out5 from t7 (m9) and t6 (m2), using 32-bit pmaddwd + round + shift by 7; expects m10 to still hold the rounding term added below + vpbroadcastd m11, [o(vvc_pw_m64_64)] + vpbroadcastd m12, [o(vvc_pw_64_64)] + punpckhwd m4, m3, m5 + punpcklwd m3, m5 + pmaddwd m5, m11, m4 + pmaddwd m4, m12 + pmaddwd m8, m11, m3 + pmaddwd m3, m12 + REPX {paddd x, m10}, m5, m4, m8, m3 + REPX {psrad x, 7 }, m5, m8, m4, m3 + packssdw m3, m4 ; -out3 + packssdw m4, m8, m5 ; out4 + punpcklwd m5, m9, m2 + punpckhwd m9, m2 + pmaddwd m2, m12, m5 + pmaddwd m5, m11 + pmaddwd m12, m9 + pmaddwd m11, m9 + REPX {paddd x, m10}, m2, m5, m12, m11 + REPX {psrad x, 7 }, m2, m12, m5, m11 ; shift the same four dword results that were just rounded (m2, m12, m5, m11); m7 already holds -out7 and must not be shifted here + packssdw m2, m12 ; out2 + packssdw m5, m11 ; -out5 + ret +ALIGN function_align +cglobal_label .main_pass2_end + vpbroadcastd m8, [o(vvc_pw_64x8)] + psubsw m4, m5, m3 + paddsw m3, m5 + psubsw m5, m2, m9 + paddsw m2, m9 + pmulhrsw m2, m8 ; out2 + pmulhrsw m3, m8 ; -out3 + pmulhrsw m4, m8 ; out4 + pmulhrsw m5, m8 ; -out5 + vpbroadcastd m9, [o(vvc_pw_2048)] + ret + +INV_TXFM_16X8_FN flipadst, dct2 +INV_TXFM_16X8_FN flipadst, adst +INV_TXFM_16X8_FN flipadst, flipadst +INV_TXFM_16X8_FN flipadst, identity + +cglobal iflipadst_16x8_internal_8, 0, 5, 13, dst, stride, c, eob, tx2 + ITX_16X8_LOAD_COEFS 1302 + call m(iadst_8x16_internal_8).main2 + call m(iadst_8x16_internal_8).main_pass1_end + psubw m9, m10 + punpcklwd m8, m6, m4 + punpckhwd m6, m4 + punpcklwd m4, m7, m5 + punpckhwd m7, m5 + punpckhwd m5, m3, m1 + punpcklwd m3, m1 + punpckhwd m1, m2, m0 + punpcklwd m2, m0 + REPX {pmulhrsw x, m10}, m8, m4, m5, m1 + REPX {pmulhrsw x, m9 }, m6, m7, m3, m2 + punpcklwd m0, m7, m4 + punpckhwd m7, m4 + punpckhwd m4, m6, m8 + punpcklwd m6, m8 + punpckhwd m8, m3, m5 + punpcklwd m3, m5 +
punpcklwd m5, m2, m1 + punpckhwd m2, m1 + punpckhdq m1, m0, m6 + punpckldq m0, m6 + punpckldq m6, m7, m4 + punpckhdq m7, m4 + punpckhdq m4, m3, m5 + punpckldq m3, m5 + punpckldq m5, m8, m2 + punpckhdq m8, m2 + vinserti128 m2, m6, xm5, 1 + vperm2i128 m6, m5, 0x31 + vperm2i128 m5, m1, m4, 0x31 + vinserti128 m1, xm4, 1 + vperm2i128 m4, m0, m3, 0x31 + vinserti128 m0, xm3, 1 + vinserti128 m3, m7, xm8, 1 + vperm2i128 m7, m8, 0x31 + jmp tx2q +.pass2: + call m(iadst_16x8_internal_8).main + call m(iadst_16x8_internal_8).main_pass2_end + pxor m8, m8 + psubw m8, m9 + pmulhrsw m10, m7, m8 + pmulhrsw m7, m0, m9 + pmulhrsw m0, m6, m9 + pmulhrsw m6, m1, m8 + pmulhrsw m1, m5, m8 + pmulhrsw m5, m2, m9 + pmulhrsw m2, m4, m9 + pmulhrsw m4, m3, m8 + lea r3, [strideq*3] + WRITE_16X2 10, 0, 8, 9, strideq*0, strideq*1 + WRITE_16X2 1, 2, 0, 1, strideq*2, r3 + jmp m(idct2_16x8_internal_8).end3 + +INV_TXFM_16X8_FN identity, dct2 +INV_TXFM_16X8_FN identity, adst +INV_TXFM_16X8_FN identity, flipadst +INV_TXFM_16X8_FN identity, identity + +cglobal iidentity_16x8_internal_8, 0, 5, 13, dst, stride, c, eob, tx2 + mova xm7, [cq+16*0] + mova xm2, [cq+16*1] + add cq, 16*8 + vpbroadcastd m3, [o(vvc_pw_64x8)] + vinserti128 m7, [cq+16*0], 1 + vinserti128 m2, [cq+16*1], 1 + mova xm6, [cq-16*6] + mova xm4, [cq-16*5] + vinserti128 m6, [cq+16*2], 1 + vinserti128 m4, [cq+16*3], 1 + mova xm8, [cq-16*4] + mova xm5, [cq-16*3] + vinserti128 m8, [cq+16*4], 1 + vinserti128 m5, [cq+16*5], 1 + mova xm0, [cq-16*2] + mova xm1, [cq-16*1] + vinserti128 m0, [cq+16*6], 1 + vinserti128 m1, [cq+16*7], 1 + vpbroadcastd m10, [o(vvc_pw_1697x16)] + vpbroadcastd m11, [o(vvc_pw_16384)] + REPX {pmulhrsw x, m3}, m7, m2, m6, m4, m8, m5, m0, m1 + punpcklwd m3, m7, m2 + punpckhwd m7, m2 + punpcklwd m2, m6, m4 + punpckhwd m6, m4 + punpcklwd m4, m8, m5 + punpckhwd m8, m5 + punpcklwd m5, m0, m1 + punpckhwd m0, m1 + punpckldq m1, m3, m2 + punpckhdq m3, m2 + punpckldq m2, m4, m5 + punpckhdq m4, m5 + punpckldq m5, m7, m6 + punpckhdq m7, m6 
+ punpckldq m6, m8, m0 + punpckhdq m8, m0 + REPX {IDTX16 x, 0, 10, 11}, 1, 3, 2, 4, 5, 7, 6, 8 + punpcklqdq m0, m1, m2 + punpckhqdq m1, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + punpcklqdq m4, m5, m6 + punpckhqdq m5, m6 + punpcklqdq m6, m7, m8 + punpckhqdq m7, m8 + jmp tx2q +.pass2: + vpbroadcastd m8, [o(vvc_pw_4096)] + jmp m(idct2_16x8_internal_8).end + +%define o_base vvc_pw_5 + 128 + +%macro INV_TXFM_16X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x16 +%ifidn %1_%2, dct2_dct2 + movd xm1, [o(vvc_pw_64x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(vvc_pw_8192)] + mov [cq], eobd + or r3d, 16 + jmp m(vvc_inv_dct2_dct2_16x4_8).dconly +%endif +%endmacro + +%macro ITX_16X16_LOAD_COEFS 0 + mova m0, [cq+32*0] + mova m1, [cq+32*1] + mova m2, [cq+32*2] + mova m3, [cq+32*3] + add cq, 32*8 + mova m4, [cq-32*4] + mova m5, [cq-32*3] + mova m6, [cq-32*2] + mova m7, [cq-32*1] + mova m8, [cq+32*0] + mova m9, [cq+32*1] + mova m10, [cq+32*2] + mova m11, [cq+32*3] + mova m12, [cq+32*4] + mova m13, [cq+32*5] + mova m14, [cq+32*6] + mova m15, [cq+32*7] + mova [rsp], m15 +%endmacro + +INV_TXFM_16X16_FN dct2, dct2 +INV_TXFM_16X16_FN dct2, adst +INV_TXFM_16X16_FN dct2, flipadst +INV_TXFM_16X16_FN dct2, identity + +cglobal idct2_16x16_internal_8, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 + ITX_16X16_LOAD_COEFS + call .main +.pass1_end: + vpbroadcastd m1, [o(vvc_pw_8192)] + REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14 + vextracti128 [rsp+16*5], m8, 1 + mova [rsp+16*1], xm8 +.pass1_end2: + vextracti128 [rsp+16*4], m0, 1 + mova [rsp+16*0], xm0 + REPX {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15 + pmulhrsw m1, [rsp+32*1] + vperm2i128 m8, m1, m9, 0x31 + vinserti128 m1, xm9, 1 + vperm2i128 m9, m2, m10, 0x31 + vinserti128 m2, xm10, 1 + vperm2i128 m10, m3, m11, 0x31 + vinserti128 m3, xm11, 1 + vperm2i128 m11, m4, m12, 0x31 + vinserti128 m4, xm12, 1 + vperm2i128 m12, m5, m13, 0x31 + vinserti128 m5, xm13, 1 + vperm2i128 m13, m6, m14, 0x31 + vinserti128 m6, xm14, 1 + vperm2i128 
m14, m7, m15, 0x31 + vinserti128 m7, xm15, 1 + mova m15, [rsp+32*2] +.pass1_end3: + punpcklwd m0, m9, m10 + punpckhwd m9, m10 + punpcklwd m10, m15, m8 + punpckhwd m15, m8 + punpckhwd m8, m11, m12 + punpcklwd m11, m12 + punpckhwd m12, m13, m14 + punpcklwd m13, m14 + punpckhdq m14, m11, m13 + punpckldq m11, m13 + punpckldq m13, m15, m9 + punpckhdq m15, m9 + punpckldq m9, m10, m0 + punpckhdq m10, m0 + punpckhdq m0, m8, m12 + punpckldq m8, m12 + punpcklqdq m12, m13, m8 + punpckhqdq m13, m8 + punpcklqdq m8, m9, m11 + punpckhqdq m9, m11 + punpckhqdq m11, m10, m14 + punpcklqdq m10, m14 + punpcklqdq m14, m15, m0 + punpckhqdq m15, m0 + mova m0, [rsp] + mova [rsp], m15 + punpckhwd m15, m4, m5 + punpcklwd m4, m5 + punpckhwd m5, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m6, m7 + punpcklwd m6, m7 + punpckhwd m7, m2, m3 + punpcklwd m2, m3 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckldq m2, m4, m6 + punpckhdq m4, m6 + punpckhdq m6, m5, m7 + punpckldq m5, m7 + punpckldq m7, m15, m1 + punpckhdq m15, m1 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + punpcklqdq m4, m5, m7 + punpckhqdq m5, m7 + punpckhqdq m7, m6, m15 + punpcklqdq m6, m15 + jmp tx2q +.pass2: + call .main +.end: + vpbroadcastd m1, [o(vvc_pw_2048)] + REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14 + mova [rsp], m6 +.end2: + REPX {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15 + pmulhrsw m1, [rsp+32*1] + lea r3, [strideq*3] + WRITE_16X2 0, 1, 6, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r3 + lea dstq, [dstq+strideq*4] + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 [rsp], 7, 0, 1, strideq*2, r3 +.end3: + pxor m2, m2 + REPX {mova [cq+32*x], m2}, -8, -7, -6, -5, -4, -3, -2, -1 + lea dstq, [dstq+strideq*4] + WRITE_16X2 8, 9, 0, 1, strideq*0, strideq*1 + WRITE_16X2 10, 11, 0, 1, strideq*2, r3 + REPX {mova [cq+32*x], m2}, 0, 1, 2, 3, 4, 5, 6, 7 + lea dstq, [dstq+strideq*4] + WRITE_16X2 12, 13, 0, 1, strideq*0, strideq*1 + WRITE_16X2 14, 15, 
0, 1, strideq*2, r3 + RET +ALIGN function_align +cglobal_label .main + vpbroadcastd m15, [o(vvc_pd_64)] + mova [rsp+gprsize+32*1], m1 + mova [rsp+gprsize+32*2], m9 + IDCT2_8_1D 0, 2, 4, 6, 8, 10, 12, 14, 1, 9, 15 + mova m1, [rsp+gprsize+32*2] ; in9 + mova [rsp+gprsize+32*2], m14 ; tmp7 + mova m9, [rsp+gprsize+32*1] ; in1 + mova [rsp+gprsize+32*1], m10 ; tmp5 + mova m14, [rsp+gprsize+32*0] ; in15 + mova [rsp+gprsize+32*0], m6 ; tmp3 + IDCT2_16_1D_ODDHALF 9, 3, 5, 7, 1, 11, 13, 14, 6, 10, 15 + mova m6, [rsp+gprsize+32*1] ; tmp5 + psubsw m15, m0, m14 ; out15 + paddsw m0, m14 ; out0 + psubsw m14, m2, m13 ; out14 + paddsw m2, m13 ; out1 + mova [rsp+gprsize+32*1], m2 + psubsw m13, m4, m11 ; out13 + paddsw m2, m4, m11 ; out2 + psubsw m11, m8, m7 ; out11 + paddsw m4, m8, m7 ; out4 + mova m7, [rsp+gprsize+32*2] ; tmp7 + psubsw m10, m6, m5 ; out10 + paddsw m5, m6 ; out5 + psubsw m8, m7, m9 ; out8 + paddsw m7, m9 ; out7 + psubsw m9, m12, m3 ; out9 + paddsw m6, m12, m3 ; out6 + mova m3, [rsp+gprsize+32*0] ; tmp3 + psubsw m12, m3, m1 ; out12 + paddsw m3, m1 ; out3 + ret + +INV_TXFM_16X16_FN adst, dct2 +INV_TXFM_16X16_FN adst, adst +INV_TXFM_16X16_FN adst, flipadst + +cglobal iadst_16x16_internal_8, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 + ITX_16X16_LOAD_COEFS + call .main + call .main_pass1_end + pmulhrsw m0, m1, [cq+32*0] + pmulhrsw m2, m1, [cq+32*1] + REPX {pmulhrsw x, m1}, m4, m6, m8, m10 + pmulhrsw m12, m1, [cq+32*2] + pmulhrsw m14, m1, [cq+32*3] + vextracti128 [rsp+16*5], m8, 1 + mova [rsp+16*1], xm8 + pxor m8, m8 + psubw m1, m8, m1 + jmp m(idct2_16x16_internal_8).pass1_end2 +ALIGN function_align +.pass2: + call .main + call .main_pass2_end + REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14 + mova [rsp+32*0], m6 + pxor m6, m6 + psubw m1, m6, m1 + jmp m(idct2_16x16_internal_8).end2 +ALIGN function_align +cglobal_label .main + vpbroadcastd m15, [o(vvc_pd_64)] + mova [rsp+gprsize+32*1], m0 + mova [rsp+gprsize+32*2], m4 + ITX_MULSUB_2W 13, 2, 0, 4, 15, 995, 3973, 0 ; 
t3, t2 + ITX_MULSUB_2W 9, 6, 0, 4, 15, 2440, 3290, 0 ; t7, t6 + ITX_MULSUB_2W 5, 10, 0, 4, 15, 3513, 2106, 0 ; t11, t10 + ITX_MULSUB_2W 1, 14, 0, 4, 15, 4052, 601, 0 ; t15, t14 + psubsw m0, m2, m10 ; t10a + paddsw m2, m10 ; t2a + psubsw m10, m13, m5 ; t11a + paddsw m13, m5 ; t3a + psubsw m5, m6, m14 ; t14a + paddsw m6, m14 ; t6a + psubsw m14, m9, m1 ; t15a + paddsw m9, m1 ; t7a + ITX_MULSUB_2W 0, 10, 1, 4, 15, 75, 50, 0 ; t11, t10 + ITX_MULSUB_2W 14, 5, 1, 4, 15, 50, 75, 0 ; t14, t15 + psubsw m1, m10, m14 ; t14a + paddsw m10, m14 ; t10a + psubsw m14, m0, m5 ; t15a + paddsw m0, m5 ; t11a + psubsw m5, m2, m6 ; t6 + paddsw m2, m6 ; t2 + psubsw m6, m13, m9 ; t7 + paddsw m13, m9 ; t3 + ITX_MULSUB_2W 6, 5, 4, 9, 15, 83, 36, 0 ; t6a, t7a + ITX_MULSUB_2W 14, 1, 4, 9, 15, 83, 36, 0 ; t14, t15 + mova m9, [rsp+gprsize+32*0] ; in15 + mova [rsp+gprsize+32*0], m10 ; t10a + mova m4, [rsp+gprsize+32*1] ; in0 + mova [rsp+gprsize+32*1], m6 ; t6a + mova m6, [rsp+gprsize+32*2] ; in4 + mova [rsp+gprsize+32*2], m2 ; t2 + ITX_MULSUB_2W 9, 4, 2, 10, 15, 201, 4091, 0 ; t1, t0 + ITX_MULSUB_2W 11, 6, 2, 10, 15, 1751, 3703, 0 ; t5, t4 + ITX_MULSUB_2W 7, 8, 2, 10, 15, 3035, 2751, 0 ; t9, t8 + ITX_MULSUB_2W 3, 12, 2, 10, 15, 3857, 1380, 0 ; t13, t12 + psubsw m10, m4, m8 ; t8a + paddsw m8, m4 ; t0a + psubsw m4, m9, m7 ; t9a + paddsw m9, m7 ; t1a + psubsw m7, m6, m12 ; t12a + paddsw m6, m12 ; t4a + psubsw m12, m11, m3 ; t13a + paddsw m11, m3 ; t5a + ITX_MULSUB_2W 10, 4, 2, 3, 15, 18, 89, 0 ; t9, t8 + ITX_MULSUB_2W 12, 7, 2, 3, 15, 89, 18, 0 ; t12, t13 + psubsw m3, m9, m11 ; t5 + paddsw m9, m11 ; t1 + psubsw m11, m4, m12 ; t12a + paddsw m4, m12 ; t8a + paddsw m12, m8, m6 ; t0 + psubsw m8, m6 ; t4 + paddsw m6, m10, m7 ; t9a + psubsw m10, m7 ; t13a + ITX_MULSUB_2W 8, 3, 2, 7, 15, 36, 83, 0 ; t5a, t4a + ITX_MULSUB_2W 11, 10, 2, 7, 15, 36, 83, 0 ; t13, t12 + mova m7, [rsp+gprsize+32*0] ; t10a + mova m2, [rsp+gprsize+32*1] ; t6a + paddsw m15, m9, m13 ; -out15 + psubsw m9, m13 ; t3a + paddsw m13, m11, 
m1 ; -out13 + psubsw m11, m1 ; t15a + psubsw m1, m4, m7 ; t10 + paddsw m7, m4 ; -out1 + psubsw m4, m3, m2 ; t6 + paddsw m3, m2 ; -out3 + paddsw m2, m10, m14 ; out2 + psubsw m10, m14 ; t14a + paddsw m14, m6, m0 ; out14 + psubsw m6, m0 ; t11 + mova m0, [rsp+gprsize+32*2] ; t2 + mova [rsp+gprsize+32*1], m7 + psubsw m7, m12, m0 ; t2a + paddsw m0, m12 ; out0 + paddsw m12, m8, m5 ; out12 + psubsw m8, m5 ; t7 + ret +ALIGN function_align +.main_pass1_end: ; pass-1 tail: parks out0/out1-slot/out12/out14 registers (m0, m2, m12, m14) in the coefficient buffer to free them, then forms the remaining outputs with 32-bit pmaddwd, rounding (m2) and a shift by 7 + mova [cq+32*0], m0 + mova [cq+32*1], m2 + mova [cq+32*2], m12 + mova [cq+32*3], m14 + vpbroadcastd m14, [vvc_pw_m64_64] + vpbroadcastd m12, [vvc_pw_64_64] + vpbroadcastd m2, [vvc_pd_64] + punpcklwd m5, m11, m10 + punpckhwd m11, m10 + pmaddwd m10, m14, m5 + pmaddwd m0, m14, m11 + pmaddwd m5, m12 + pmaddwd m11, m12 + REPX {paddd x, m2}, m10, m0, m5, m11 + REPX {psrad x, 7}, m10, m0, m5, m11 + packssdw m10, m0 ; out10 + packssdw m5, m11 ; -out5 + punpcklwd m11, m8, m4 + punpckhwd m8, m4 + pmaddwd m4, m12, m11 + pmaddwd m0, m12, m8 + pmaddwd m11, m14 + pmaddwd m8, m14 + REPX {paddd x, m2}, m4, m0, m11, m8 + REPX {psrad x, 7}, m4, m0, m11, m8 + packssdw m4, m0 ; out4 + packssdw m11, m8 ; -out11 + punpcklwd m8, m9, m7 + punpckhwd m9, m7 + pmaddwd m7, m12, m8 + pmaddwd m0, m12, m9 + pmaddwd m8, m14 + pmaddwd m9, m14 + REPX {paddd x, m2}, m7, m0, m8, m9 + REPX {psrad x, 7}, m7, m0, m8, m9 + packssdw m7, m0 ; -out7 + packssdw m8, m9 ; out8 + punpckhwd m0, m6, m1 + punpcklwd m6, m1 + pmaddwd m1, m14, m0 + pmaddwd m9, m14, m6 + pmaddwd m0, m12 + pmaddwd m6, m12 + REPX {paddd x, m2}, m1, m9, m0, m6 + REPX {psrad x, 7}, m1, m9, m0, m6 + packssdw m9, m1 ; -out9 (m1 - m6 term, same pairing as .main_pass2_end) + packssdw m6, m0 ; out6 (m6 + m1 term, same pairing as .main_pass2_end) + vpbroadcastd m1, [o(vvc_pw_8192)] + ret +ALIGN function_align +cglobal_label .main_pass2_end + ; In pass 2 we're going to clip to pixels afterwards anyway, so clipping to + ; 16-bit here will produce the same result as using 32-bit intermediates.
+ paddsw m5, m10, m11 ; -out5 + psubsw m10, m11 ; out10 + psubsw m11, m4, m8 ; -out11 + paddsw m4, m8 ; out4 + psubsw m8, m7, m9 ; out8 + paddsw m7, m9 ; -out7 + psubsw m9, m1, m6 ; -out9 + paddsw m6, m1 ; out6 + vpbroadcastd m1, [o(vvc_pw_64x8)] + REPX {pmulhrsw x, m1}, m4, m5, m6, m7, m8, m9, m10, m11 + vpbroadcastd m1, [o(vvc_pw_2048)] + ret + +INV_TXFM_16X16_FN flipadst, dct2 +INV_TXFM_16X16_FN flipadst, adst +INV_TXFM_16X16_FN flipadst, flipadst + +cglobal iflipadst_16x16_internal_8, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 + ITX_16X16_LOAD_COEFS + call m(iadst_16x16_internal_8).main + call m(iadst_16x16_internal_8).main_pass1_end + pmulhrsw m6, m1 + pmulhrsw m2, m1, m8 + mova [rsp+32*2], m6 + pmulhrsw m6, m1, m4 + pmulhrsw m4, m1, m10 + pmulhrsw m8, m1, [cq+32*3] + pmulhrsw m10, m1, [cq+32*2] + pmulhrsw m12, m1, [cq+32*1] + pmulhrsw m14, m1, [cq+32*0] + pxor m0, m0 + psubw m0, m1 + REPX {pmulhrsw x, m0}, m3, m5, m7, m11, m15 + pmulhrsw m1, m0, m9 + pmulhrsw m9, m0, m13 + pmulhrsw m0, [rsp+32*1] + mova [rsp+16*0], xm15 + mova [rsp+16*1], xm7 + vperm2i128 m15, m15, m7, 0x31 + vinserti128 m7, m2, xm14, 1 + vperm2i128 m14, m2, m14, 0x31 + vinserti128 m2, m9, xm5, 1 + vperm2i128 m9, m9, m5, 0x31 + vinserti128 m5, m4, xm12, 1 + vperm2i128 m12, m4, m12, 0x31 + vinserti128 m4, m11, xm3, 1 + vperm2i128 m11, m11, m3, 0x31 + vinserti128 m3, m10, xm6, 1 + vperm2i128 m10, m10, m6, 0x31 + vinserti128 m6, m1, xm0, 1 + vperm2i128 m13, m1, m0, 0x31 + vinserti128 m1, m8, [rsp+32*2], 1 + vperm2i128 m8, m8, [rsp+32*2], 0x31 + jmp m(idct2_16x16_internal_8).pass1_end3 +.pass2: + call m(iadst_16x16_internal_8).main + call m(iadst_16x16_internal_8).main_pass2_end + pmulhrsw m0, m1 + pmulhrsw m8, m1 + mova [rsp+32*0], m0 + mova [rsp+32*2], m8 + pxor m0, m0 + psubw m0, m1 + pmulhrsw m8, m0, m7 + pmulhrsw m7, m0, m9 + pmulhrsw m9, m1, m6 + pmulhrsw m6, m1, m10 + pmulhrsw m10, m0, m5 + pmulhrsw m5, m0, m11 + pmulhrsw m11, m1, m4 + pmulhrsw m4, m1, m12 + pmulhrsw m12, m0, m3 + pmulhrsw 
m3, m0, m13 + pmulhrsw m13, m1, m2 + pmulhrsw m1, m14 + pmulhrsw m14, m0, [rsp+32*1] + pmulhrsw m0, m15 + lea r3, [strideq*3] + WRITE_16X2 0, 1, 2, 0, strideq*0, strideq*1 + mova m15, [rsp+32*0] + WRITE_16X2 3, 4, 0, 1, strideq*2, r3 + lea dstq, [dstq+strideq*4] + WRITE_16X2 5, 6, 0, 1, strideq*0, strideq*1 + WRITE_16X2 7, [rsp+32*2], 0, 1, strideq*2, r3 + jmp m(idct2_16x16_internal_8).end3 + +%macro IDTX16B 3 ; src/dst, tmp, vvc_pw_1697x16 + pmulhrsw m%2, m%3, m%1 + psraw m%2, 1 + pavgw m%1, m%2 ; signs are guaranteed to be equal +%endmacro + +INV_TXFM_16X16_FN identity, dct2 +INV_TXFM_16X16_FN identity, identity + +cglobal iidentity_16x16_internal_8, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 + vpbroadcastd m7, [o(vvc_pw_1697x16)] + mova xm0, [cq+16* 0] + vinserti128 m0, [cq+16*16], 1 + mova xm15, [cq+16* 1] + vinserti128 m15, [cq+16*17], 1 + mova xm1, [cq+16* 2] + vinserti128 m1, [cq+16*18], 1 + mova xm8, [cq+16* 3] + vinserti128 m8, [cq+16*19], 1 + mova xm2, [cq+16* 4] + vinserti128 m2, [cq+16*20], 1 + mova xm9, [cq+16* 5] + vinserti128 m9, [cq+16*21], 1 + mova xm3, [cq+16* 6] + vinserti128 m3, [cq+16*22], 1 + mova xm10, [cq+16* 7] + add cq, 16*16 + vinserti128 m10, [cq+16* 7], 1 + mova xm4, [cq-16* 8] + vinserti128 m4, [cq+16* 8], 1 + mova xm11, [cq-16* 7] + vinserti128 m11, [cq+16* 9], 1 + mova xm5, [cq-16* 6] + vinserti128 m5, [cq+16*10], 1 + mova xm12, [cq-16* 5] + vinserti128 m12, [cq+16*11], 1 + mova xm13, [cq-16* 3] + vinserti128 m13, [cq+16*13], 1 + mova xm14, [cq-16* 1] + vinserti128 m14, [cq+16*15], 1 + REPX {IDTX16B x, 6, 7}, 0, 15, 1, 8, 2, 9, 3, \ + 10, 4, 11, 5, 12, 13, 14 + mova xm6, [cq-16* 4] + vinserti128 m6, [cq+16*12], 1 + mova [rsp], m0 + IDTX16B 6, 0, 7 + mova xm0, [cq-16* 2] + vinserti128 m0, [cq+16*14], 1 + pmulhrsw m7, m0 + psraw m7, 1 + pavgw m7, m0 + jmp m(idct2_16x16_internal_8).pass1_end3 +ALIGN function_align +.pass2: + vpbroadcastd m15, [o(vvc_pw_1697x16)] + mova [rsp+32*1], m0 + REPX {IDTX16 x, 0, 15}, 1, 2, 3, 4, 5, 6, 7, \ + 8, 
9, 10, 11, 12, 13, 14 + mova m0, [rsp+32*1] + mova [rsp+32*1], m1 + IDTX16 0, 1, 15 + mova m1, [rsp+32*0] + pmulhrsw m15, m1 + paddsw m1, m1 + paddsw m15, m1 + jmp m(idct2_16x16_internal_8).end + +%define o_base deint_shuf + 128 + +%macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2 +%if %3 + vpbroadcastd m15, [o(vvc_pw_64x8)] + pmulhrsw m0, m15, [%1+%2*0] + pmulhrsw m1, m15, [%1+%2*1] + pmulhrsw m2, m15, [%1+%2*2] + pmulhrsw m3, m15, [%1+%2*3] + pmulhrsw m4, m15, [%1+%2*4] + pmulhrsw m5, m15, [%1+%2*5] + pmulhrsw m6, m15, [%1+%2*6] + pmulhrsw m7, m15, [%1+%2*7] +%else + mova m0, [%1+%2*0] + mova m1, [%1+%2*1] + mova m2, [%1+%2*2] + mova m3, [%1+%2*3] + mova m4, [%1+%2*4] + mova m5, [%1+%2*5] + mova m6, [%1+%2*6] + mova m7, [%1+%2*7] +%endif +%endmacro + +%macro LOAD_8ROWS_H 2-3 0 ; src, stride, is_rect2 +%if %3 +%if %3 == 1 + vpbroadcastd m15, [o(vvc_pw_64x8)] +%endif + pmulhrsw m8, m15, [%1+%2*0] + pmulhrsw m9, m15, [%1+%2*1] + pmulhrsw m10, m15, [%1+%2*2] + pmulhrsw m11, m15, [%1+%2*3] + pmulhrsw m12, m15, [%1+%2*4] + pmulhrsw m13, m15, [%1+%2*5] + pmulhrsw m14, m15, [%1+%2*6] + pmulhrsw m15, [%1+%2*7] +%else + mova m8, [%1+%2*0] + mova m9, [%1+%2*1] + mova m10, [%1+%2*2] + mova m11, [%1+%2*3] + mova m12, [%1+%2*4] + mova m13, [%1+%2*5] + mova m14, [%1+%2*6] + mova m15, [%1+%2*7] +%endif +%endmacro + +%macro ITX_UNPACK_MULHRSW 7 ; dst1, dst2/src, tmp, coef[1-4] + vpbroadcastd m%3, [r5-vvc_pw_201_4091x8+vvc_pw_%4_%5x8] + punpcklwd m%1, m%2, m%2 + pmulhrsw m%1, m%3 + vpbroadcastd m%3, [r5-vvc_pw_201_4091x8+vvc_pw_%6_%7x8] + punpckhwd m%2, m%2 + pmulhrsw m%2, m%3 +%endmacro + +cglobal vvc_inv_dct2_dct2_8x32_8, 4, 4, 0, dst, stride, c, eob + lea r6, [o_base] + test eobd, eobd + jz .dconly + PROLOGUE 0, 4, 16, 32*3, dst, stride, c, eob + %undef cmp + cmp eobd, 106 + jle .fast + LOAD_8ROWS cq+32*1, 32*2 + call m(idct2_16x8_internal_8).main + vperm2i128 m11, m0, m4, 0x31 + vinserti128 m0, xm4, 1 + vperm2i128 m4, m1, m5, 0x31 + vinserti128 m1, xm5, 1 + vperm2i128 m5, m2, 
m6, 0x31 + vinserti128 m2, xm6, 1 + vperm2i128 m6, m3, m7, 0x31 + vinserti128 m3, xm7, 1 + pxor m7, m7 + REPX {mova [cq+32*x], m7}, 1, 3, 5, 7, 9, 11, 13, 15 + punpckhwd m7, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpcklwd m3, m11, m4 + punpckhwd m11, m4 + punpckhwd m4, m5, m6 + punpcklwd m5, m6 + punpckhdq m6, m0, m2 + punpckldq m0, m2 + punpckldq m2, m3, m5 + punpckhdq m3, m5 + punpckhdq m5, m11, m4 + punpckldq m11, m4 + punpckldq m4, m7, m1 + punpckhdq m7, m1 + punpckhqdq m12, m6, m0 + punpcklqdq m0, m6 ; out4 + punpckhqdq m13, m7, m4 + punpcklqdq m4, m7 ; out5 + punpckhqdq m14, m3, m2 + punpcklqdq m2, m3 ; out6 + punpckhqdq m15, m5, m11 + punpcklqdq m11, m5 ; out7 + mova [rsp+32*0], m0 + mova [rsp+32*1], m4 + mova [rsp+32*2], m2 +.fast: + LOAD_8ROWS cq+32*0, 32*2 + call m(idct2_16x8_internal_8).main + vperm2i128 m8, m0, m4, 0x31 + vinserti128 m0, xm4, 1 + vperm2i128 m4, m1, m5, 0x31 + vinserti128 m1, xm5, 1 + vperm2i128 m5, m2, m6, 0x31 + vinserti128 m2, xm6, 1 + vperm2i128 m6, m3, m7, 0x31 + vinserti128 m3, xm7, 1 + vpbroadcastd m9, [o(vvc_pw_8192)] + pxor m7, m7 + REPX {mova [cq+32*x], m7}, 0, 2, 4, 6, 8, 10, 12, 14 + punpckhwd m7, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m8, m4 + punpcklwd m8, m4 + punpckhwd m4, m5, m6 + punpcklwd m5, m6 + punpckhdq m6, m0, m2 + punpckldq m0, m2 + punpckldq m2, m8, m5 + punpckhdq m8, m5 + punpckhdq m5, m3, m4 + punpckldq m3, m4 + punpckhdq m4, m7, m1 + punpckldq m7, m1 + punpcklqdq m1, m7, m4 + punpckhqdq m7, m4 ; out9 + punpckhqdq m4, m2, m8 ; out10 + punpcklqdq m2, m8 + punpckhqdq m8, m3, m5 + punpcklqdq m3, m5 + punpckhqdq m5, m0, m6 ; out8 + punpcklqdq m0, m6 + REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m7 + cmp eobd, 106 + jg .full + mova [rsp+32*0], m5 + mova [rsp+32*1], m7 + mova [rsp+32*2], m4 + pmulhrsw m11, m9, m8 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call .main_fast + jmp .pass2 +.dconly: + movd xm1, [o(vvc_pw_64x8)] + pmulhrsw 
xm0, xm1, [cq] + movd xm2, [o(vvc_pw_8192)] + mov [cq], eobd + or r3d, 32 + jmp m(vvc_inv_dct2_dct2_8x8_8).dconly +.full: + REPX {pmulhrsw x, m9}, m12, m13, m14, m15 + pmulhrsw m6, m9, [rsp+32*2] + mova [rsp+32*2], m4 + pmulhrsw m4, m9, [rsp+32*0] + mova [rsp+32*0], m5 + pmulhrsw m5, m9, [rsp+32*1] + mova [rsp+32*1], m7 + pmulhrsw m7, m9, m11 + pmulhrsw m11, m9, m8 + call .main +.pass2: + vpbroadcastd m12, [o(vvc_pw_2048)] + REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m13, m14, m15 + pmulhrsw m12, [rsp] + REPX {vpermq x, x, q3120}, m0, m2, m4, m6, m8, m10, m12, m14 + REPX {vpermq x, x, q2031}, m1, m3, m5, m7, m9, m11, m13, m15 + mova [rsp+32*0], m4 + mova [rsp+32*1], m6 + lea r3, [strideq*3] + WRITE_8X4 0, 1, 4, 6 + lea dstq, [dstq+strideq*4] + WRITE_8X4 2, 3, 4, 6 + lea dstq, [dstq+strideq*4] + WRITE_8X4 [rsp+32*0], 5, 4, 6 + lea dstq, [dstq+strideq*4] + WRITE_8X4 [rsp+32*1], 7, 4, 6 + lea dstq, [dstq+strideq*4] + WRITE_8X4 8, 9, 4, 6 + lea dstq, [dstq+strideq*4] + WRITE_8X4 10, 11, 4, 6 + lea dstq, [dstq+strideq*4] + WRITE_8X4 12, 13, 4, 6 + lea dstq, [dstq+strideq*4] + WRITE_8X4 14, 15, 4, 6 + RET +ALIGN function_align +cglobal_label .main_fast ; bottom half is zero + call m(idct2_8x16_internal_8).main + mova m8, [rsp+gprsize+0*32] + mova [rsp+gprsize+0*32], m0 + mova m9, [rsp+gprsize+1*32] + mova [rsp+gprsize+1*32], m1 + mova m0, [rsp+gprsize+2*32] + mova [rsp+gprsize+2*32], m6 + lea r5, [r6-(o_base)+vvc_pw_201_4091x8] + ITX_UNPACK_MULHRSW 1, 8, 6, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a + ITX_UNPACK_MULHRSW 15, 9, 6, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a + ITX_UNPACK_MULHRSW 14, 0, 6, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a + ITX_UNPACK_MULHRSW 13, 11, 6, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a + jmp .main2 +ALIGN function_align +cglobal_label .main + call m(idct2_8x16_internal_8).main + mova m8, [rsp+gprsize+0*32] + mova [rsp+gprsize+0*32], m0 + mova m9, [rsp+gprsize+1*32] + mova 
[rsp+gprsize+1*32], m1 + mova m0, [rsp+gprsize+2*32] + mova [rsp+gprsize+2*32], m6 + punpcklwd m1, m15, m8 ; in31 in1 + punpckhwd m8, m15 ; in3 in29 + punpcklwd m15, m14, m9 ; in27 in5 + punpckhwd m9, m14 ; in7 in25 + punpcklwd m14, m13, m0 ; in23 in9 + punpckhwd m0, m13 ; in11 in21 + punpcklwd m13, m12, m11 ; in19 in13 + punpckhwd m11, m12 ; in15 in17 + ITX_MUL2X_PACK 1, 6, 12, 10, 201, 4091, 3 ; t16a, t31a + ITX_MUL2X_PACK 8, 6, 12, 10, 4052, 601, 3 ; t23a, t24a + ITX_MUL2X_PACK 15, 6, 12, 10, 995, 3973, 3 ; t20a, t27a + ITX_MUL2X_PACK 9, 6, 12, 10, 3857, 1380, 3 ; t19a, t28a + ITX_MUL2X_PACK 14, 6, 12, 10, 1751, 3703, 3 ; t18a, t29a + ITX_MUL2X_PACK 0, 6, 12, 10, 3513, 2106, 3 ; t21a, t26a + ITX_MUL2X_PACK 13, 6, 12, 10, 2440, 3290, 3 ; t22a, t25a + ITX_MUL2X_PACK 11, 6, 12, 10, 3035, 2751, 3 ; t17a, t30a +.main2: + psubsw m6, m1, m11 ; t17 t30 + paddsw m1, m11 ; t16 t31 + psubsw m11, m9, m14 ; t18 t29 + paddsw m9, m14 ; t19 t28 + psubsw m14, m15, m0 ; t21 t26 + paddsw m15, m0 ; t20 t27 + psubsw m0, m8, m13 ; t22 t25 + paddsw m8, m13 ; t23 t24 + ITX_MUL2X_PACK 6, 12, 13, 10, 18, 89, 3 ; t17a t30a + ITX_MUL2X_PACK 11, 12, 13, 10, m89, 18, 3 ; t18a t29a + ITX_MUL2X_PACK 14, 12, 13, 10, 75, 50, 3 ; t21a t26a + ITX_MUL2X_PACK 0, 12, 13, 10, m50, 75, 3 ; t22a t25a + psubsw m13, m1, m9 ; t19a t28a + paddsw m1, m9 ; t16a t31a + psubsw m9, m8, m15 ; t20a t27a + paddsw m8, m15 ; t23a t24a + psubsw m15, m6, m11 ; t18 t29 + paddsw m6, m11 ; t17 t30 + psubsw m11, m0, m14 ; t21 t26 + paddsw m0, m14 ; t22 t25 + ITX_MUL2X_PACK 15, 12, 14, 10, 36, 83, 3 ; t18a t29a + ITX_MUL2X_PACK 13, 12, 14, 10, 36, 83, 3 ; t19 t28 + ITX_MUL2X_PACK 9, 12, 14, 10, m83, 36, 3 ; t20 t27 + ITX_MUL2X_PACK 11, 12, 14, 10, m83, 36, 3 ; t21a t26a + vbroadcasti128 m12, [o(deint_shuf)] + psubsw m14, m1, m8 ; t23 t24 + paddsw m1, m8 ; t16 t31 + psubsw m8, m6, m0 ; t22a t25a + paddsw m6, m0 ; t17a t30a + psubsw m0, m15, m11 ; t21 t26 + paddsw m15, m11 ; t18 t29 + psubsw m11, m13, m9 ; t20a t27a + paddsw 
m13, m9 ; t19a t28a + REPX {pshufb x, m12}, m1, m6, m15, m13 + ITX_MUL2X_PACK 14, 9, 12, 10, 64, 64 ; t24a t23a + vpbroadcastd m9, [o(vvc_pw_m64_64)] + ITX_MUL2X_PACK 8, 12, _, 10, 12, 9, 4 ; t22 t25 + vpbroadcastd m12, [o(vvc_pw_64_64)] + ITX_MUL2X_PACK 0, 12, _, 10, 12, 9, 4 ; t21a t26a + vpbroadcastd m12, [o(vvc_pw_64_64)] + ITX_MUL2X_PACK 11, 9, _, 10, 9, 12, 4 ; t27 t20 + shufps m9, m14, m8, q1032 ; t23a t22 + vpblendd m14, m8, 0xcc ; t24a t25 + shufps m8, m11, m0, q1032 ; t20 t21a + vpblendd m11, m0, 0xcc ; t27 t26a + punpcklqdq m0, m1, m6 ; t16 t17a + punpckhqdq m1, m6 ; t31 t30a + psubsw m10, m5, m8 ; out20 out21 + paddsw m5, m8 ; out11 out10 + psubsw m6, m3, m14 ; out24 out25 + paddsw m3, m14 ; out7 out6 + psubsw m8, m7, m0 ; out16 out17 + paddsw m7, m0 ; out15 out14 + mova m0, [rsp+gprsize+0*32] + punpcklqdq m12, m13, m15 ; t19a t18 + punpckhqdq m13, m15 ; t28a t29 + psubsw m15, m0, m1 ; out31 out30 + paddsw m0, m1 ; out0 out1 + mova m1, [rsp+gprsize+1*32] + mova [rsp+gprsize+0*32], m6 + mova m6, [rsp+gprsize+2*32] + psubsw m14, m1, m13 ; out28 out29 + paddsw m1, m13 ; out3 out2 + psubsw m13, m2, m11 ; out27 out26 + paddsw m2, m11 ; out4 out5 + psubsw m11, m4, m9 ; out23 out22 + paddsw m4, m9 ; out8 out9 + psubsw m9, m6, m12 ; out19 out18 + paddsw m6, m12 ; out12 out13 + ret + +%macro LOAD_PACKED_16X2 4 ; dst, tmp, row[1-2] + vbroadcasti128 m%1, [cq+16*%3] + vbroadcasti128 m%2, [cq+16*%4] + shufpd m%1, m%2, 0x0c +%endmacro + +cglobal vvc_inv_dct2_dct2_32x8_8, 4, 4, 0, dst, stride, c, eob + lea r6, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(vvc_pw_64x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(vvc_pw_8192)] + mov [cq], eobd + or r3d, 8 +.dconly: + pmulhrsw xm0, xm2 + movd xm2, [vvc_pw_2048] ; intentionally rip-relative + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm2 + vpbroadcastw m0, xm0 + pxor m3, m3 +.dconly_loop: + mova m1, [dstq] + punpckhbw m2, m1, m3 + punpcklbw m1, m3 + paddw m2, m0 + paddw m1, m0 + packuswb m1, m2 + mova [dstq], m1 + add 
dstq, strideq + dec r3d + jg .dconly_loop + RET +.normal: + PROLOGUE 0, 4, 16, 32*3, dst, stride, c, eob + %undef cmp + LOAD_PACKED_16X2 0, 7, 0, 2 ; in0 in2 + LOAD_PACKED_16X2 4, 7, 1, 3 ; in1 in3 + LOAD_PACKED_16X2 1, 7, 4, 6 ; in4 in6 + LOAD_PACKED_16X2 5, 7, 5, 7 ; in5 in7 + pxor m8, m8 + REPX {mova [cq+32*x], m8}, 0, 1, 2, 3 + add cq, 16*16 + LOAD_PACKED_16X2 2, 7, -8, -6 ; in8 in10 + LOAD_PACKED_16X2 6, 7, -7, -5 ; in9 in11 + LOAD_PACKED_16X2 3, 7, -4, -2 ; in12 in14 + LOAD_PACKED_16X2 11, 7, -3, -1 ; in13 in15 + REPX {mova [cq+32*x], m8}, -4, -3, -2, -1 + mova [rsp+32*0], m4 + mova [rsp+32*1], m5 + mova [rsp+32*2], m6 + cmp eobd, 106 + jg .full + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(vvc_inv_dct2_dct2_8x32_8).main_fast + jmp .pass2 +.full: + LOAD_PACKED_16X2 4, 7, 0, 2 ; in16 in18 + LOAD_PACKED_16X2 12, 7, 3, 1 ; in19 in17 + LOAD_PACKED_16X2 5, 7, 4, 6 ; in20 in22 + LOAD_PACKED_16X2 13, 7, 7, 5 ; in23 in21 + REPX {mova [cq+32*x], m8}, 0, 1, 2, 3 + add cq, 16*8 + LOAD_PACKED_16X2 6, 7, 0, 2 ; in24 in26 + LOAD_PACKED_16X2 14, 7, 3, 1 ; in27 in25 + LOAD_PACKED_16X2 7, 8, 4, 6 ; in28 in30 + LOAD_PACKED_16X2 15, 8, 7, 5 ; in31 in29 + pxor m8, m8 + REPX {mova [cq+32*x], m8}, 0, 1, 2, 3 + call m(vvc_inv_dct2_dct2_8x32_8).main +.pass2: + vpbroadcastd m12, [o(vvc_pw_8192)] + REPX {pmulhrsw x, m12}, m8, m9, m10, m11, m13, m14, m15 + mova [rsp+32*1], m9 + mova [rsp+32*2], m10 + punpckhwd m9, m0, m2 + punpcklwd m0, m2 + punpckhwd m2, m1, m3 + punpcklwd m1, m3 + punpcklwd m10, m4, m6 + punpckhwd m4, m6 + punpcklwd m6, m5, m7 + punpckhwd m5, m7 + punpckhwd m3, m0, m9 + punpcklwd m0, m9 + punpckhwd m9, m2, m1 + punpcklwd m2, m1 + punpcklwd m7, m10, m4 + punpckhwd m10, m4 + punpcklwd m4, m5, m6 + punpckhwd m5, m6 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + punpckldq m2, m3, m9 + punpckhdq m3, m9 + punpckldq m6, m7, m4 + punpckhdq m7, m4 + punpckldq m9, m10, m5 + punpckhdq m10, m5 + REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m6, m7, m9, m10 + pmulhrsw m12, 
[rsp+32*0] + mova [rsp+32*0], m8 + vperm2i128 m4, m0, m6, 0x31 + vinserti128 m0, xm6, 1 + vperm2i128 m5, m1, m7, 0x31 + vinserti128 m1, xm7, 1 + vperm2i128 m6, m2, m9, 0x31 + vinserti128 m2, xm9, 1 + vperm2i128 m7, m3, m10, 0x31 + vinserti128 m3, xm10, 1 + call m(idct2_16x8_internal_8).main + vpbroadcastd m8, [o(vvc_pw_2048)] + REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 + lea r2, [strideq*3] + WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r2 + lea r3, [dstq+strideq*4] + %define dstq r3 + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 6, 7, 0, 1, strideq*2, r2 + mova m0, [rsp+32*0] + mova m1, [rsp+32*1] + mova m2, [rsp+32*2] + punpckhwd m7, m0, m2 + punpcklwd m0, m2 + punpckhwd m2, m1, m11 + punpcklwd m1, m11 + punpckhwd m4, m12, m14 + punpcklwd m12, m14 + punpckhwd m5, m13, m15 + punpcklwd m13, m15 + punpckhwd m3, m0, m7 + punpcklwd m0, m7 + punpckhwd m9, m2, m1 + punpcklwd m2, m1 + punpcklwd m7, m12, m4 + punpckhwd m12, m4 + punpcklwd m4, m5, m13 + punpckhwd m5, m13 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + punpckldq m2, m3, m9 + punpckhdq m3, m9 + punpckldq m6, m7, m4 + punpckhdq m7, m4 + punpckldq m9, m12, m5 + punpckhdq m12, m5 + vperm2i128 m4, m0, m6, 0x31 + vinserti128 m0, xm6, 1 + vperm2i128 m5, m1, m7, 0x31 + vinserti128 m1, xm7, 1 + vperm2i128 m6, m2, m9, 0x31 + vinserti128 m2, xm9, 1 + vperm2i128 m7, m3, m12, 0x31 + vinserti128 m3, xm12, 1 + call m(idct2_16x8_internal_8).main2 + vpbroadcastd m8, [o(vvc_pw_2048)] + REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 + add r0, 16 + add r3, 16 + %define dstq r0 + WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r2 + %define dstq r3 + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 6, 7, 0, 1, strideq*2, r2 + RET + +cglobal vvc_inv_identity_identity_8x32_8, 4, 5, 11, dst, stride, c, eob + vpbroadcastd m9, [vvc_pw_5] + lea r4, [strideq*3] + sub eobd, 107 ; loop_iterations = 1 + (eobd >= 107) +.loop: + mova 
xm0,[cq+16* 0] + mova xm1, [cq+16* 4] + vinserti128 m0, [cq+16* 1], 1 + vinserti128 m1, [cq+16* 5], 1 + pxor m8, m8 ; m8 = 0; stored back below to clear the coefficients that were loaded + mova [cq+32*0], m8 + mova [cq+32*2], m8 + add cq, 16*16 + mova xm2, [cq-16* 8] + mova xm3, [cq-16* 4] + vinserti128 m2, [cq-16* 7], 1 + vinserti128 m3, [cq-16* 3], 1 + mova xm4, [cq+16* 0] + mova xm5, [cq+16* 4] + vinserti128 m4, [cq+16* 1], 1 + vinserti128 m5, [cq+16* 5], 1 + mova xm6, [cq+16* 8] + mova xm7, [cq+16*12] + vinserti128 m6, [cq+16* 9], 1 + vinserti128 m7, [cq+16*13], 1 + REPX {mova [cq+32*x], m8}, -4, -2, 0, 2, 4, 6 ; zero the 32-byte groups just loaded into m2-m7 + REPX {paddsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 ; m9 is initialised before this chunk + call .transpose8x8 + REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7 + WRITE_8X4 0, 4, 8, 10, strideq*8, strideq*4, r4*4 + add dstq, strideq + WRITE_8X4 1, 5, 0, 4, strideq*8, strideq*4, r4*4 + add dstq, strideq + WRITE_8X4 2, 6, 0, 4, strideq*8, strideq*4, r4*4 + add dstq, strideq + WRITE_8X4 3, 7, 0, 4, strideq*8, strideq*4, r4*4 + add dstq, strideq + sub cq, 16*16-32 + lea dstq, [dstq+r4*4] + add eobd, 0x80000000 + jnc .loop ; the add sets carry only when bit 31 of eobd was already set + RET +ALIGN function_align +.transpose8x8: ; word, then dword, then qword interleaves across m0-m7; m8 is overwritten as scratch + punpckhwd m8, m4, m5 + punpcklwd m4, m5 + punpckhwd m5, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m6, m7 + punpcklwd m6, m7 + punpckhwd m7, m2, m3 + punpcklwd m2, m3 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckldq m2, m4, m6 + punpckhdq m4, m6 + punpckhdq m6, m5, m7 + punpckldq m5, m7 + punpckldq m7, m8, m1 + punpckhdq m8, m1 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + punpcklqdq m4, m5, m7 + punpckhqdq m5, m7 + punpckhqdq m7, m6, m8 + punpcklqdq m6, m8 + ret + +cglobal vvc_inv_identity_identity_32x8_8, 4, 6, 10, dst, stride, c, eob ; loads rows from c, clears them, transposes, scales by m9 and passes register pairs to WRITE_16X2 + add cq, 16*8 ; bias cq so the loads below use offsets -16*8 .. 16*7 + vpbroadcastd m9, [vvc_pw_4096] + lea r4, [strideq*3] ; r4 = stride times 3 + lea r5, [dstq+strideq*4] ; r5 = dst advanced by four strides + sub eobd, 107 ; for non-negative eob, bit 31 of eobd is now set iff eob < 107 +.loop: + mova xm0, [cq-16*8] + mova xm1, [cq-16*7] + vinserti128 m0, [cq+16*0], 1 + vinserti128 m1, [cq+16*1], 1 + mova xm2, [cq-16*6] + mova xm3, [cq-16*5] + vinserti128 m2, [cq+16*2], 1 + vinserti128 m3, 
[cq+16*3], 1 + mova xm4, [cq-16*4] + mova xm5, [cq-16*3] + vinserti128 m4, [cq+16*4], 1 + vinserti128 m5, [cq+16*5], 1 + mova xm6, [cq-16*2] + mova xm7, [cq-16*1] + vinserti128 m6, [cq+16*6], 1 + vinserti128 m7, [cq+16*7], 1 + pxor m8, m8 + REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3 ; clear the 256 bytes that were just loaded + call m(vvc_inv_identity_identity_8x32_8).transpose8x8 + REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 ; m9 holds the broadcast vvc_pw_4096 constant (value defined outside this chunk) + WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r4 + %define dstq r5 ; presumably makes the following WRITE_16X2 calls address through r5 - macro body not visible here + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 6, 7, 0, 1, strideq*2, r4 + add cq, 16*16 + add r0, 16 ; r0 is the real dst register here because dstq has been redefined to r5 + add r5, 16 + add eobd, 0x80000000 + jnc .loop ; no carry on the first pass when eob >= 107, so the body runs once more; otherwise it runs once + RET + +%define o_base vvc_pw_5 + 128 + +%macro LOAD_16ROWS 2-4 0, 1 ; src, stride, is_rect2 (default 0), zero_coefs (default 1) +%if %3 ; is_rect2: every row is multiplied (pmulhrsw) by the broadcast vvc_pw_64x8 constant while loading + vpbroadcastd m15, [o(vvc_pw_64x8)] + pmulhrsw m0, m15, [%1+%2* 0] + pmulhrsw m1, m15, [%1+%2* 1] + pmulhrsw m2, m15, [%1+%2* 2] + pmulhrsw m3, m15, [%1+%2* 3] + pmulhrsw m4, m15, [%1+%2* 4] + pmulhrsw m5, m15, [%1+%2* 5] + pmulhrsw m6, m15, [%1+%2* 6] + pmulhrsw m7, m15, [%1+%2* 7] + pmulhrsw m8, m15, [%1+%2* 8] + pmulhrsw m9, m15, [%1+%2* 9] + pmulhrsw m10, m15, [%1+%2*10] + pmulhrsw m11, m15, [%1+%2*11] + pmulhrsw m12, m15, [%1+%2*12] + pmulhrsw m13, m15, [%1+%2*13] + pmulhrsw m14, m15, [%1+%2*14] + pmulhrsw m15, [%1+%2*15] +%else + mova m0, [%1+%2* 0] + mova m1, [%1+%2* 1] + mova m2, [%1+%2* 2] + mova m3, [%1+%2* 3] + mova m4, [%1+%2* 4] + mova m5, [%1+%2* 5] + mova m6, [%1+%2* 6] + mova m7, [%1+%2* 7] + mova m8, [%1+%2* 8] + mova m9, [%1+%2* 9] + mova m10, [%1+%2*10] + mova m11, [%1+%2*11] + mova m12, [%1+%2*12] + mova m13, [%1+%2*13] + mova m14, [%1+%2*14] + mova m15, [%1+%2*15] +%endif + mova [rsp], m15 ; row 15 is also kept at [rsp] +%if %4 ; zero_coefs: m15 becomes zero and all 16 source rows are cleared + pxor m15, m15 + REPX {mova [%1+%2*x], m15}, 0, 1, 2, 3, 4, 5, 6, 7, \ + 8, 9, 10, 11, 12, 13, 14, 15 +%endif +%endmacro + +%macro IDCT2_32_PASS2_END 7 ; coefs[1-2], tmp[1-2], rnd, offset[1-2] + mova m%4, [%2] + paddsw m%3, m%1, m%4 + psubsw m%1, m%4 + pmovzxbw m%4, [dstq+%6] + 
pmulhrsw m%3, m%5 + pmulhrsw m%1, m%5 + paddw m%3, m%4 + pmovzxbw m%4, [r2+%7] + paddw m%1, m%4 + packuswb m%3, m%1 + vpermq m%3, m%3, q3120 + mova [dstq+%6], xm%3 + vextracti128 [r2+%7], m%3, 1 +%endmacro + +cglobal vvc_inv_dct2_dct2_16x32_8, 4, 4, 0, dst, stride, c, eob + lea r6, [o_base] + test eobd, eobd + jz .dconly + PROLOGUE 0, 8, 16, 32*35, dst, stride, c, eob, tmp1, tmp2, \ + base, tmp3 + %undef cmp + LOAD_16ROWS cq, 64, 1 + call m(idct2_16x16_internal_8).main + lea tmp1q, [rsp+32*7] + lea tmp2q, [tmp1q+32*8] + lea tmp3q, [tmp1q+32*16] + mova m1, [rsp+32*1] + mova [rsp+32*0], m6 + mova [rsp+32*1], m7 + vpbroadcastd m7, [o(vvc_pw_16384)] + call .transpose_2x8x8_round + mova m15, [rsp+32*0] + mova [tmp3q-32*4+ 0], xm0 + vextracti128 [tmp3q+32*0+ 0], m0, 1 + mova [tmp3q-32*3+ 0], xm2 + vextracti128 [tmp3q+32*1+ 0], m2, 1 + mova [tmp3q-32*2+ 0], xm4 + vextracti128 [tmp3q+32*2+ 0], m4, 1 + mova [tmp3q-32*1+ 0], xm6 + vextracti128 [tmp3q+32*3+ 0], m6, 1 + mova [tmp3q-32*4+16], xm8 + vextracti128 [tmp3q+32*0+16], m8, 1 + mova [tmp3q-32*3+16], xm10 + vextracti128 [tmp3q+32*1+16], m10, 1 + mova [tmp3q-32*2+16], xm12 + vextracti128 [tmp3q+32*2+16], m12, 1 + mova [tmp3q-32*1+16], xm14 + vextracti128 [tmp3q+32*3+16], m14, 1 + cmp eobd, 150 + jg .full + vinserti128 m0, m1, xm9, 1 + vperm2i128 m4, m1, m9, 0x31 + vinserti128 m2, m5, xm13, 1 + vperm2i128 m6, m5, m13, 0x31 + vinserti128 m1, m3, xm11, 1 + vperm2i128 m5, m3, m11, 0x31 + vinserti128 m3, m7, xm15, 1 + vperm2i128 m7, m7, m15, 0x31 + call .main_oddhalf_fast + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 + jmp .idct2_16 +.dconly: + movd xm1, [o(vvc_pw_64x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(vvc_pw_16384)] + mov [cq], eobd + pmulhrsw xm0, xm1 + or r3d, 32 + jmp m(vvc_inv_dct2_dct2_16x4_8).dconly +.full: + mova [tmp1q-32*4], m1 + mova [tmp1q-32*3], m3 + mova [tmp1q-32*2], m5 + mova [tmp1q-32*1], m7 + mova [tmp1q+32*0], m9 + mova [tmp1q+32*1], m11 + mova [tmp1q+32*2], m13 + mova 
[tmp1q+32*3], m15 + LOAD_16ROWS cq+32, 64, 1 + call m(idct2_16x16_internal_8).main + lea r2, [tmp3q+32*8] + mova m1, [rsp+32*1] + mova [rsp+32*0], m6 + mova [rsp+32*1], m7 + vpbroadcastd m7, [o(vvc_pw_16384)] + call .transpose_2x8x8_round + mova m15, [rsp+32*0] + mova [r2-32*4+ 0], xm0 + vextracti128 [r2+32*0+ 0], m0, 1 + mova [r2-32*3+ 0], xm2 + vextracti128 [r2+32*1+ 0], m2, 1 + mova [r2-32*2+ 0], xm4 + vextracti128 [r2+32*2+ 0], m4, 1 + mova [r2-32*1+ 0], xm6 + vextracti128 [r2+32*3+ 0], m6, 1 + mova [r2-32*4+16], xm8 + vextracti128 [r2+32*0+16], m8, 1 + mova [r2-32*3+16], xm10 + vextracti128 [r2+32*1+16], m10, 1 + mova [r2-32*2+16], xm12 + vextracti128 [r2+32*2+16], m12, 1 + mova [r2-32*1+16], xm14 + vextracti128 [r2+32*3+16], m14, 1 + vinserti128 m8, m1, xm9, 1 + vperm2i128 m12, m1, m9, 0x31 + mova xm0, [tmp1q-32*4] + mova xm1, [tmp1q-32*3] + vinserti128 m0, [tmp1q+32*0], 1 + vinserti128 m1, [tmp1q+32*1], 1 + vinserti128 m10, m5, xm13, 1 + vperm2i128 m14, m5, m13, 0x31 + mova xm4, [tmp1q-32*4+16] + mova xm5, [tmp1q-32*3+16] + vinserti128 m4, [tmp1q+32*0+16], 1 + vinserti128 m5, [tmp1q+32*1+16], 1 + vinserti128 m9, m3, xm11, 1 + vperm2i128 m13, m3, m11, 0x31 + mova xm2, [tmp1q-32*2] + mova xm3, [tmp1q-32*1] + vinserti128 m2, [tmp1q+32*2], 1 + vinserti128 m3, [tmp1q+32*3], 1 + vinserti128 m11, m7, xm15, 1 + vperm2i128 m15, m7, m15, 0x31 + mova xm6, [tmp1q-32*2+16] + mova xm7, [tmp1q-32*1+16] + vinserti128 m6, [tmp1q+32*2+16], 1 + vinserti128 m7, [tmp1q+32*3+16], 1 + call .main_oddhalf + LOAD_8ROWS_H r2-32*4, 32 +.idct2_16: + LOAD_8ROWS tmp3q-32*4, 32 + mova [rsp], m15 + call m(idct2_16x16_internal_8).main + imul r2, strideq, 19 + lea r3, [strideq*3] + add r2, dstq + call .pass2_end + RET +ALIGN function_align +cglobal_label .main_oddhalf_fast ; lower half is zero + mova [rsp+gprsize+32*1], m7 + pxor m7, m7 + mova [rsp+gprsize+32*0], m7 + mova [rsp+gprsize+32*2], m7 + vpbroadcastd m11, [o(vvc_pw_3703x8)] + vpbroadcastd m7, [o(vvc_pw_1751x8)] + vpbroadcastd m12, 
[o(vvc_pw_m1380x8)] + vpbroadcastd m8, [o(vvc_pw_3857x8)] + vpbroadcastd m13, [o(vvc_pw_3973x8)] + vpbroadcastd m15, [o(vvc_pw_995x8)] + pmulhrsw m11, m4 ; t29a + pmulhrsw m4, m7 ; t18a + pmulhrsw m12, m3 ; t19a + pmulhrsw m3, m8 ; t28a + pmulhrsw m13, m2 ; t27a + pmulhrsw m2, m15 ; t20a + vpbroadcastd m10, [o(vvc_pw_m2106x8)] + vpbroadcastd m7, [o(vvc_pw_3513x8)] + vpbroadcastd m9, [o(vvc_pw_3290x8)] + vpbroadcastd m8, [o(vvc_pw_2440x8)] + vpbroadcastd m14, [o(vvc_pw_m601x8)] + vpbroadcastd m15, [o(vvc_pw_4052x8)] + pmulhrsw m10, m5 ; t21a + pmulhrsw m5, m7 ; t26a + pmulhrsw m9, m6 ; t25a + pmulhrsw m6, m8 ; t22a + pmulhrsw m14, m1 ; t23a + pmulhrsw m1, m15 ; t24a + vpbroadcastd m15, [o(vvc_pd_64)] + jmp .main2 +ALIGN function_align +cglobal_label .main_oddhalf + mova [rsp+gprsize+32*0], m15 + mova [rsp+gprsize+32*1], m7 + mova [rsp+gprsize+32*2], m8 + vpbroadcastd m15, [o(vvc_pd_64)] + ITX_MULSUB_2W 4, 11, 7, 8, 15, 1751, 3703, 0 ; t18a, t29a + ITX_MULSUB_2W 12, 3, 7, 8, 15, 3857, 1380, 0 ; t19a, t28a + ITX_MULSUB_2W 2, 13, 7, 8, 15, 995, 3973, 0 ; t20a, t27a + ITX_MULSUB_2W 10, 5, 7, 8, 15, 3513, 2106, 0 ; t21a, t26a + ITX_MULSUB_2W 6, 9, 7, 8, 15, 2440, 3290, 0 ; t22a, t25a + ITX_MULSUB_2W 14, 1, 7, 8, 15, 4052, 601, 0 ; t23a, t24a +.main2: + psubsw m7, m12, m4 ; t18 + paddsw m12, m4 ; t19 + psubsw m4, m2, m10 ; t21 + paddsw m2, m10 ; t20 + psubsw m10, m14, m6 ; t22 + paddsw m14, m6 ; t23 + psubsw m6, m1, m9 ; t25 + paddsw m1, m9 ; t24 + psubsw m9, m13, m5 ; t26 + paddsw m13, m5 ; t27 + psubsw m5, m3, m11 ; t29 + paddsw m3, m11 ; t28 + ITX_MULSUB_2W 5, 7, 8, 11, 15, m89, 18, 0 ; t18a, t29a + ITX_MULSUB_2W 9, 4, 8, 11, 15, 75, 50, 0 ; t21a, t26a + ITX_MULSUB_2W 6, 10, 8, 11, 15, m50, 75, 0 ; t22a, t25a + psubsw m8, m14, m2 ; t20a + paddsw m14, m2 ; t23a + psubsw m2, m1, m13 ; t27a + paddsw m1, m13 ; t24a + psubsw m13, m6, m9 ; t21 + paddsw m6, m9 ; t22 + psubsw m9, m10, m4 ; t26 + paddsw m10, m4 ; t25 + ITX_MULSUB_2W 2, 8, 4, 11, 15, m83, 36, 0 ; t20, t27 + 
ITX_MULSUB_2W 9, 13, 4, 11, 15, m83, 36, 0 ; t21a, t26a + mova m4, [rsp+gprsize+32*0] ; in31 + mova [rsp+gprsize+32*0], m6 ; t22 + mova m6, [rsp+gprsize+32*1] ; in15 + mova [rsp+gprsize+32*1], m14 ; t23a + mova m14, [rsp+gprsize+32*2] ; in17 + mova [rsp+gprsize+32*2], m1 ; t24a + ITX_MULSUB_2W 0, 4, 1, 11, 15, 201, 4091, 0 ; t16a, t31a + ITX_MULSUB_2W 14, 6, 1, 11, 15, 3035, 2751, 0 ; t17a, t30a + psubsw m1, m0, m14 ; t17 + paddsw m0, m14 ; t16 + psubsw m14, m4, m6 ; t30 + paddsw m4, m6 ; t31 + ITX_MULSUB_2W 14, 1, 6, 11, 15, 18, 89, 0 ; t17a, t30a + psubsw m6, m0, m12 ; t19a + paddsw m0, m12 ; t16a + psubsw m12, m4, m3 ; t28a + paddsw m4, m3 ; t31a + psubsw m3, m14, m5 ; t18 + paddsw m14, m5 ; t17 + psubsw m5, m1, m7 ; t29 + paddsw m1, m7 ; t30 + ITX_MULSUB_2W 5, 3, 7, 11, 15, 36, 83, 0 ; t18a, t29a + ITX_MULSUB_2W 12, 6, 7, 11, 15, 36, 83, 0 ; t19, t28 + psubsw m7, m1, m10 ; t25a + paddsw m1, m10 ; t30a + psubsw m10, m5, m9 ; t21 + paddsw m5, m9 ; t18 + psubsw m9, m12, m2 ; t20a + paddsw m12, m2 ; t19a + psubsw m2, m3, m13 ; t26 + paddsw m3, m13 ; t29 + psubsw m13, m6, m8 ; t27a + paddsw m6, m8 ; t28a + mova [tmp1q-32*2], m5 + mova [tmp1q-32*1], m12 + mova [tmp2q+32*0], m6 + mova [tmp2q+32*1], m3 + mova [tmp2q+32*2], m1 + mova m5, [rsp+gprsize+32*0] ; t22 + mova m6, [rsp+gprsize+32*1] ; t23 + mova m3, [rsp+gprsize+32*2] ; t24a + psubsw m1, m14, m5 ; t22a + paddsw m14, m5 ; t17a + psubsw m5, m0, m6 ; t23 + paddsw m0, m6 ; t16 + psubsw m6, m4, m3 ; t24 + paddsw m4, m3 ; t31 + vpbroadcastd m8, [o(vvc_pw_m64_64)] + vpbroadcastd m3, [o(vvc_pw_64_64)] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m14 + mova [tmp2q+32*3], m4 + ITX_MULSUB_2W 13, 9, 0, 4, 15, 3, 8, 1 ; t20, t27 + ITX_MULSUB_2W 2, 10, 0, 4, 15, 3, 8, 1 ; t21a, t26a + ITX_MULSUB_2W 7, 1, 0, 4, 15, 3, 8, 1 ; t22, t25 + ITX_MULSUB_2W 6, 5, 0, 4, 15, 3, 8, 1 ; t23a, t24a + mova [tmp1q+32*0], m13 + mova [tmp1q+32*1], m2 + mova [tmp1q+32*2], m7 + mova [tmp1q+32*3], m6 + mova [tmp2q-32*4], m5 + mova [tmp2q-32*3], 
m1 + mova [tmp2q-32*2], m10 + mova [tmp2q-32*1], m9 + ret +ALIGN function_align +.transpose_2x8x8_round: + punpckhwd m6, m12, m13 + punpcklwd m12, m13 + punpckhwd m13, m8, m9 + punpcklwd m8, m9 + punpckhwd m9, m14, m15 + punpcklwd m14, m15 + punpckhwd m15, m10, m11 + punpcklwd m10, m11 + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5 + punpckhdq m11, m8, m10 + punpckldq m8, m10 + punpckldq m10, m12, m14 + punpckhdq m12, m14 + punpckhdq m14, m13, m15 + punpckldq m13, m15 + punpckldq m15, m6, m9 + punpckhdq m6, m9 + punpckhqdq m9, m8, m10 + punpcklqdq m8, m10 + punpcklqdq m10, m11, m12 + punpckhqdq m11, m12 + punpcklqdq m12, m13, m15 + punpckhqdq m13, m15 + punpckhqdq m15, m14, m6 + punpcklqdq m14, m6 + pmulhrsw m6, m7, [rsp+gprsize+32*0] + REPX {pmulhrsw x, m7}, m8, m9, m10, m11, m12, m13, m14, m15 + pmulhrsw m7, [rsp+gprsize+32*1] + mova [rsp+gprsize+32*0], m15 + punpckhwd m15, m4, m5 + punpcklwd m4, m5 + punpckhwd m5, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m6, m7 + punpcklwd m6, m7 + punpckhwd m7, m2, m3 + punpcklwd m2, m3 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckldq m2, m4, m6 + punpckhdq m4, m6 + punpckhdq m6, m5, m7 + punpckldq m5, m7 + punpckldq m7, m15, m1 + punpckhdq m15, m1 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + punpcklqdq m4, m5, m7 + punpckhqdq m5, m7 + punpckhqdq m7, m6, m15 + punpcklqdq m6, m15 + ret +ALIGN function_align +.pass2_end: + mova [rsp+gprsize+32*0], m7 + mova [rsp+gprsize+32*2], m15 + vpbroadcastd m15, [o(vvc_pw_2048)] + IDCT2_32_PASS2_END 0, tmp2q+32*3, 1, 7, 15, strideq*0, r3*4 + IDCT2_32_PASS2_END 4, tmp2q-32*1, 0, 7, 15, strideq*4, strideq*8 + IDCT2_32_PASS2_END 8, tmp1q+32*3, 0, 4, 15, strideq*8, strideq*4 + IDCT2_32_PASS2_END 12, tmp1q-32*1, 0, 4, 15, r3*4, strideq*0 + add dstq, strideq + sub r2, strideq + mova m1, [rsp+gprsize+32*1] + IDCT2_32_PASS2_END 1, tmp2q+32*2, 0, 4, 15, strideq*0, r3*4 + IDCT2_32_PASS2_END 5, tmp2q-32*2, 0, 4, 15, strideq*4, strideq*8 + 
IDCT2_32_PASS2_END 9, tmp1q+32*2, 0, 4, 15, strideq*8, strideq*4 + IDCT2_32_PASS2_END 13, tmp1q-32*2, 0, 4, 15, r3*4, strideq*0 + add dstq, strideq + sub r2, strideq + IDCT2_32_PASS2_END 2, tmp2q+32*1, 0, 4, 15, strideq*0, r3*4 + IDCT2_32_PASS2_END 6, tmp2q-32*3, 0, 4, 15, strideq*4, strideq*8 + IDCT2_32_PASS2_END 10, tmp1q+32*1, 0, 4, 15, strideq*8, strideq*4 + IDCT2_32_PASS2_END 14, tmp1q-32*3, 0, 4, 15, r3*4, strideq*0 + add dstq, strideq + sub r2, strideq + mova m7, [rsp+gprsize+32*0] + mova m1, [rsp+gprsize+32*2] + IDCT2_32_PASS2_END 3, tmp2q+32*0, 0, 4, 15, strideq*0, r3*4 + IDCT2_32_PASS2_END 7, tmp2q-32*4, 0, 4, 15, strideq*4, strideq*8 + IDCT2_32_PASS2_END 11, tmp1q+32*0, 0, 4, 15, strideq*8, strideq*4 + IDCT2_32_PASS2_END 1, tmp1q-32*4, 0, 4, 15, r3*4, strideq*0 + ret + +; Perform the final sumsub step and YMM lane shuffling +%macro IDCT2_32_PASS1_END 4 ; row[1-2], tmp[1-2] + mova m%3, [tmp2q+32*( 3-%1)] + psubsw m%4, m%1, m%3 ; difference, stored below as two 128-bit halves + paddsw m%1, m%3 + mova m%3, [tmp1q+32*(11-%2)] + mova [tmp1q+32*(11-%2)+16], xm%4 + vextracti128 [tmp2q+32*( 3-%1)+16], m%4, 1 + paddsw m%4, m%2, m%3 + psubsw m%2, m%3 ; second difference, stored below as two 128-bit halves + mova [tmp1q+32*(11-%2)], xm%2 + vextracti128 [tmp2q+32*( 3-%1)], m%2, 1 + vperm2i128 m%2, m%1, m%4, 0x31 ; second row register = high lanes of the two sums + vinserti128 m%1, xm%4, 1 ; first row register = low lanes of the two sums +%endmacro + +cglobal vvc_inv_dct2_dct2_32x16_8, 4, 4, 0, dst, stride, c, eob ; eob == 0 takes the DC-only path below, anything else goes to .normal + lea r6, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(vvc_pw_64x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(vvc_pw_16384)] + mov [cq], eobd ; eobd is 0 on this path, so this clears the 4 bytes at cq + pmulhrsw xm0, xm1 + or r3d, 16 ; r3d (eob, 0 here) becomes 16 before entering the shared .dconly tail + jmp m(vvc_inv_dct2_dct2_32x8_8).dconly +.normal: + PROLOGUE 0, 6, 16, 32*19, dst, stride, c, eob, tmp1, tmp2 + vpbroadcastd m15, [o(vvc_pw_64x8)] + pmulhrsw m0, m15, [cq+32* 1] ; odd-numbered 32-byte rows (1, 3, ..., 31), each multiplied by m15 + pmulhrsw m1, m15, [cq+32* 3] + pmulhrsw m2, m15, [cq+32* 5] + pmulhrsw m3, m15, [cq+32* 7] + pmulhrsw m4, m15, [cq+32* 9] + pmulhrsw m5, m15, [cq+32*11] + pmulhrsw m6, m15, [cq+32*13] + pmulhrsw m7, m15, [cq+32*15] + pmulhrsw m8, m15, [cq+32*17] + pmulhrsw m9, m15, [cq+32*19] + pmulhrsw m10, m15, 
[cq+32*21] + pmulhrsw m11, m15, [cq+32*23] + pmulhrsw m12, m15, [cq+32*25] + pmulhrsw m13, m15, [cq+32*27] + pmulhrsw m14, m15, [cq+32*29] + pmulhrsw m15, [cq+32*31] + lea tmp1q, [rsp+32*7] + lea tmp2q, [tmp1q+32*8] + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf + LOAD_16ROWS cq+32*0, 32*2, 1, 0 ; even-numbered rows, with is_rect2 = 1 and zero_coefs = 0 + pxor m15, m15 + mov r3d, 8 ; 8 passes of 128 bytes clear 1024 bytes starting at cq +.zero_loop: + mova [cq+32*0], m15 + mova [cq+32*1], m15 + mova [cq+32*2], m15 + mova [cq+32*3], m15 + add cq, 32*4 + dec r3d + jg .zero_loop + call m(idct2_16x16_internal_8).main + call .pass1_end + lea r2, [strideq*3] + mov r3, dstq ; keeps the original dst; also tested below to decide whether .right_half still has to run +.pass2: + vpbroadcastd m7, [o(vvc_pw_16384)] + call m(vvc_inv_dct2_dct2_16x32_8).transpose_2x8x8_round + call m(idct2_16x16_internal_8).main + mova [rsp+32*2], m15 + vpbroadcastd m15, [o(vvc_pw_2048)] + REPX {pmulhrsw x, m15}, m2, m3, m0 + WRITE_16X2 2, 3, 1, 2, strideq*2, r2 + pmulhrsw m1, m15, [rsp+32*1] + WRITE_16X2 0, 1, 2, 3, strideq*0, strideq*1 + lea dstq, [dstq+strideq*4] + REPX {pmulhrsw x, m15}, m4, m5, m6, m7 + WRITE_16X2 4, 5, 2, 3, strideq*0, strideq*1 + WRITE_16X2 6, 7, 2, 3, strideq*2, r2 + lea dstq, [dstq+strideq*4] + REPX {pmulhrsw x, m15}, m8, m9, m10, m11 + WRITE_16X2 8, 9, 2, 3, strideq*0, strideq*1 + WRITE_16X2 10, 11, 2, 3, strideq*2, r2 + lea dstq, [dstq+strideq*4] + REPX {pmulhrsw x, m15}, m11, m12, m13, m14 ; NOTE(review): m11 was already scaled and passed to WRITE_16X2 above; rescaling it here looks redundant - confirm + pmulhrsw m15, [rsp+32*2] + WRITE_16X2 12, 13, 2, 3, strideq*0, strideq*1 + WRITE_16X2 14, 15, 2, 3, strideq*2, r2 + test r3, r3 + jnz .right_half ; r3 is non-zero only on the first pass through .pass2 + RET +.right_half: + LOAD_8ROWS tmp1q-32*4, 32 + LOAD_8ROWS_H tmp2q-32*4, 32 + lea dstq, [r3+16] ; second pass writes 16 bytes to the right of the original dst + xor r3d, r3d ; clear r3 so the second pass ends with RET + mova [rsp+32*0], m6 + mova [rsp+32*1], m7 + jmp .pass2 +ALIGN function_align +.pass1_end: + mova [rsp+gprsize+32*0], m9 + IDCT2_32_PASS1_END 0, 8, 1, 9 + IDCT2_32_PASS1_END 2, 10, 1, 9 + IDCT2_32_PASS1_END 3, 11, 1, 9 + IDCT2_32_PASS1_END 4, 12, 1, 9 + IDCT2_32_PASS1_END 5, 13, 1, 9 + IDCT2_32_PASS1_END 6, 14, 1, 9 + IDCT2_32_PASS1_END 7, 15, 1, 9 + mova m1, [rsp+gprsize+32*1] + mova m9, [rsp+gprsize+32*0] + mova 
[rsp+gprsize+32*0], m6 + mova [rsp+gprsize+32*1], m7 + IDCT2_32_PASS1_END 1, 9, 6, 7 + ret + +cglobal vvc_inv_identity_identity_16x32_8, 4, 5, 13, dst, stride, c, eob ; each pass takes 16-byte loads from 16 rows spaced 64 bytes apart, scales, transposes and hands register pairs to WRITE_16X2 +%undef cmp + lea r6, [o_base] + vpbroadcastd m9, [o(vvc_pw_64x8)] + vpbroadcastd m10, [o(vvc_pw_1697x16)] + vpbroadcastd m12, [o(vvc_pw_8192)] + cmp eobd, 43 ; if (eob > 43) + setg r4b ; iteration_count++ + cmp eobd, 150 ; if (eob > 150) + setg al ; iteration_count++ + add eobd, -279 ; if (eob > 278) + adc r4b, al ; iteration_count++ + lea r3, [strideq*3] + mov r6, cq ; remember the start of the coefficient buffer for the zeroing loops below + paddw m11, m12, m12 ; vvc_pw_16384 +.loop: + mova xm0, [cq+64* 0] + mova xm1, [cq+64* 1] + vinserti128 m0, [cq+64* 8], 1 + vinserti128 m1, [cq+64* 9], 1 + mova xm2, [cq+64* 2] + mova xm3, [cq+64* 3] + vinserti128 m2, [cq+64*10], 1 + vinserti128 m3, [cq+64*11], 1 + mova xm4, [cq+64* 4] + mova xm5, [cq+64* 5] + vinserti128 m4, [cq+64*12], 1 + vinserti128 m5, [cq+64*13], 1 + mova xm6, [cq+64* 6] + mova xm7, [cq+64* 7] + vinserti128 m6, [cq+64*14], 1 + vinserti128 m7, [cq+64*15], 1 + REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {IDTX16 x, 8, 10, 11}, 0, 1, 2, 3, 4, 5, 6, 7 + call m(vvc_inv_identity_identity_8x32_8).transpose8x8 + REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r3 + lea dstq, [dstq+strideq*4] + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 6, 7, 0, 1, strideq*2, r3 + lea dstq, [dstq+strideq*4] + add cq, 16 + dec r4b + jge .loop ; the loop body runs iteration_count plus one times + sub cq, 32 + pxor m0, m0 + mov r0d, 8 ; dst (r0) is no longer needed and is reused as the zeroing counter + cmp cq, r6 + ja .zero_loop ; taken only when the loop ran more than twice (cq advances 16 bytes per pass) +.zero_loop_half: ; clears the first 32 bytes of each 64-byte row, 16 rows in total + mova [r6+64*0], m0 + mova [r6+64*1], m0 + add r6, 64*4 + mova [r6-64*2], m0 + mova [r6-64*1], m0 + sub r0d, 2 + jg .zero_loop_half + RET +.zero_loop: ; clears 8 x 128 = 1024 contiguous bytes from the buffer start + mova [r6+32*0], m0 + mova [r6+32*1], m0 + mova [r6+32*2], m0 + mova [r6+32*3], m0 + add r6, 32*4 + dec r0d + jg .zero_loop + RET + +cglobal vvc_inv_identity_identity_32x16_8, 4, 6, 12, dst, stride, c, eob +%undef cmp + lea r6, [o_base] + 
vpbroadcastd m9, [o(vvc_pw_64x8)] + vpbroadcastd m10, [o(vvc_pw_1697x16)] + vpbroadcastd m11, [o(vvc_pw_2048)] + cmp eobd, 35 ; if (eob > 35) + setg r4b ; iteration_count++ + cmp eobd, 150 ; if (eob > 150) + setg r3b ; iteration_count += 2 + lea r4d, [r4+r3*2] ; only r4b is used afterwards: (eob > 35) plus twice (eob > 150) + lea r3, [strideq*3] + mov r5, dstq ; saved so the loop can restart at dst plus 16 + mov r6, cq ; start of the coefficient buffer, used by .ret and .zero_loop +.loop: + mova xm0, [cq+32* 0] + mova xm1, [cq+32* 1] + vinserti128 m0, [cq+32* 8], 1 + vinserti128 m1, [cq+32* 9], 1 + mova xm2, [cq+32* 2] + mova xm3, [cq+32* 3] + vinserti128 m2, [cq+32*10], 1 + vinserti128 m3, [cq+32*11], 1 + mova xm4, [cq+32* 4] + mova xm5, [cq+32* 5] + vinserti128 m4, [cq+32*12], 1 + vinserti128 m5, [cq+32*13], 1 + mova xm6, [cq+32* 6] + mova xm7, [cq+32* 7] + vinserti128 m6, [cq+32*14], 1 + vinserti128 m7, [cq+32*15], 1 + REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {paddsw x, x }, m0, m1, m2, m3, m4, m5, m6, m7 + call m(vvc_inv_identity_identity_8x32_8).transpose8x8 + REPX {IDTX16 x, 8, 10}, 0, 1, 2, 3, 4, 5, 6, 7 + REPX {pmulhrsw x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r3 + lea dstq, [dstq+strideq*4] + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 6, 7, 0, 1, strideq*2, r3 + lea dstq, [dstq+strideq*4] + add cq, 16 + dec r4b + jl .ret ; the loop body runs iteration_count plus one times + test r4b, 1 + jz .loop ; an odd remaining count instead skips cq ahead and restarts at the saved dst plus 16 + add cq, 32*15 + lea dstq, [r5+16] + jmp .loop +.ret: + sub cd, eax ; NOTE(review): relies on eax aliasing r6d (x86inc x86-64 mapping), giving the bytes cq advanced - confirm + pxor m0, m0 + add cd, 384 +.zero_loop: + mova [r6+32*0], m0 + mova [r6+32*1], m0 + mova [r6+32*2], m0 + mova [r6+32*3], m0 + add r6, 32*4 + sub cd, 128 + jge .zero_loop ; each pass clears 128 bytes; repeats while the counter stays non-negative + RET + +cglobal vvc_inv_dct2_dct2_32x32_8, 4, 4, 0, dst, stride, c, eob + lea r6, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(vvc_pw_64x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(vvc_pw_8192)] + mov [cq], eobd + or r3d, 32 + jmp m(vvc_inv_dct2_dct2_32x8_8).dconly +.normal: + PROLOGUE 0, 9, 16, 32*67, dst, stride, c, eob, tmp1, tmp2, \ + base, tmp3, tmp4 + %undef cmp + lea tmp1q, [rsp+32*7] + lea tmp2q, [tmp1q+32*8] + sub 
eobd, 136 + mov tmp4d, eobd +.pass1_loop: + LOAD_8ROWS cq+64*1, 64*2 + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15 + test tmp4d, tmp4d + jl .fast + LOAD_8ROWS_H cq+64*17, 64*2 + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf + LOAD_8ROWS_H cq+64*16, 64*2 + pxor m0, m0 + REPX {mova [cq+64*x], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \ + 24, 25, 26, 27, 28, 29, 30, 31 + mova [rsp], m15 + jmp .idct2_16 +.fast: + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf_fast + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 +.idct2_16: + LOAD_8ROWS cq+64*0, 64*2 + pxor m15, m15 + REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14 + call m(idct2_16x16_internal_8).main + call m(vvc_inv_dct2_dct2_32x16_8).pass1_end + vpbroadcastd m7, [o(vvc_pw_8192)] + call m(vvc_inv_dct2_dct2_16x32_8).transpose_2x8x8_round + lea tmp3q, [tmp1q+32*32] + mova m15, [rsp] + mova [tmp3q-32*4], m0 + mova [tmp3q-32*3], m2 + mova [tmp3q-32*2], m4 + mova [tmp3q-32*1], m6 + mova [tmp3q+32*0], m8 + mova [tmp3q+32*1], m10 + mova [tmp3q+32*2], m12 + mova [tmp3q+32*3], m14 + add tmp3q, 32*8 + mova [tmp3q-32*4], m1 + mova [tmp3q-32*3], m3 + mova [tmp3q-32*2], m5 + mova [tmp3q-32*1], m7 + mova [tmp3q+32*0], m9 + mova [tmp3q+32*1], m11 + mova [tmp3q+32*2], m13 + mova [tmp3q+32*3], m15 + vpbroadcastd m9, [o(vvc_pw_8192)] + pmulhrsw m0, m9, [tmp1q-32*4] + pmulhrsw m1, m9, [tmp1q-32*3] + pmulhrsw m2, m9, [tmp1q-32*2] + pmulhrsw m3, m9, [tmp1q-32*1] + pmulhrsw m4, m9, [tmp1q+32*0] + pmulhrsw m5, m9, [tmp1q+32*1] + pmulhrsw m6, m9, [tmp1q+32*2] + pmulhrsw m7, m9, [tmp1q+32*3] + call m(vvc_inv_identity_identity_8x32_8).transpose8x8 + mova [tmp1q-32*4], m0 + pmulhrsw m0, m9, [tmp2q-32*4] + mova [tmp2q-32*4], m1 + pmulhrsw m1, m9, [tmp2q-32*3] + mova [tmp1q-32*3], m2 + pmulhrsw m2, m9, [tmp2q-32*2] + mova [tmp2q-32*3], m3 + pmulhrsw m3, m9, [tmp2q-32*1] + mova [tmp1q-32*2], m4 + pmulhrsw m4, m9, [tmp2q+32*0] + mova [tmp2q-32*2], m5 + pmulhrsw m5, m9, [tmp2q+32*1] + mova 
[tmp1q-32*1], m6 + pmulhrsw m6, m9, [tmp2q+32*2] + mova [tmp2q-32*1], m7 + pmulhrsw m7, m9, [tmp2q+32*3] + call m(vvc_inv_identity_identity_8x32_8).transpose8x8 + mova [tmp1q+32*0], m0 + mova [tmp2q+32*0], m1 + mova [tmp1q+32*1], m2 + mova [tmp2q+32*1], m3 + mova [tmp1q+32*2], m4 + mova [tmp2q+32*2], m5 + mova [tmp1q+32*3], m6 + mova [tmp2q+32*3], m7 + add cq, 32 + add tmp1q, 32*16 + add tmp2q, 32*16 + add eobd, 0x80000000 + jnc .pass1_loop + add tmp1q, 32*24 + imul r2, strideq, 19 + lea r3, [strideq*3] + add r2, dstq + test tmp4d, tmp4d + jge .pass2_loop + add tmp1q, 32*16 + add tmp2q, 32*16 + add tmp3q, 32*16 +.pass2_loop: + LOAD_8ROWS tmp2q-32*4, 32 + test tmp4d, tmp4d + jl .fast2 + LOAD_8ROWS_H tmp3q-32*4, 32 + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf + sub tmp3q, 32*8 + LOAD_8ROWS_H tmp3q-32*4, 32 + sub tmp3q, 32*16 + jmp .pass2_loop_end +.fast2: + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf_fast + sub tmp3q, 32*24 + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 +.pass2_loop_end: + LOAD_8ROWS tmp3q-32*4, 32 + mova [rsp], m15 + call m(idct2_16x16_internal_8).main + call m(vvc_inv_dct2_dct2_16x32_8).pass2_end + lea tmp3q, [tmp1q-32*32] + cmp tmp2q, tmp3q + jb .ret + sub tmp2q, 32*32 + sub dstq, r3 + lea r2, [r2+r3+16] + add dstq, 16 + jmp .pass2_loop +.ret: + RET + +cglobal vvc_inv_identity_identity_32x32_8, 4, 6, 10, dst, stride, c, eob ; each pass takes 16-byte loads from 16 rows spaced 64 bytes apart, transposes, scales by m9 and hands register pairs to WRITE_16X2 + %undef cmp + vpbroadcastd m9, [vvc_pw_8192] + sub eobd, 136 ; if (eob < 136) + shr eobd, 30 ; topleft 16x16 only + lea eobd, [eobq*2-8] ; counter is -2 when eob < 136 and -8 otherwise; .loop counts it up to zero + lea r4, [strideq*3] + mov r5, dstq ; saved so the loop can restart at dst plus 16 + lea r6, [cq+32] ; compared with cq after the loop to pick the zeroing variant +.loop: + mova xm0, [cq+64* 0] + mova xm1, [cq+64* 1] + vinserti128 m0, [cq+64* 8], 1 + vinserti128 m1, [cq+64* 9], 1 + mova xm2, [cq+64* 2] + mova xm3, [cq+64* 3] + vinserti128 m2, [cq+64*10], 1 + vinserti128 m3, [cq+64*11], 1 + mova xm4, [cq+64* 4] + mova xm5, [cq+64* 5] + vinserti128 m4, [cq+64*12], 1 + vinserti128 m5, [cq+64*13], 1 + mova xm6, [cq+64* 6] + mova xm7, [cq+64* 7] + vinserti128 m6, [cq+64*14], 
1 + vinserti128 m7, [cq+64*15], 1 + call m(vvc_inv_identity_identity_8x32_8).transpose8x8 + REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 + WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r4 + lea dstq, [dstq+strideq*4] + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 6, 7, 0, 1, strideq*2, r4 + lea dstq, [dstq+strideq*4] + add cq, 16 + inc eobd + jz .ret ; done once the negative counter reaches zero (2 or 8 passes) + test eobd, 3 + jnz .loop ; when the counter is a multiple of 4, skip cq ahead and restart at the saved dst plus 16 + add cq, 64*15 + lea dstq, [r5+16] + jmp .loop +.ret: + pxor m0, m0 + mov r0d, 16 ; dst (r0) is no longer needed and is reused as the zeroing counter + cmp cq, r6 + jne .zero_loop ; cq equals r6 only after the two-pass (top-left) case +.zero_loop_topleft: ; clears the first 32 bytes of each 64-byte row, 16 rows in total + mova [r6-32*1], m0 + mova [r6+32*1], m0 + mova [r6+32*3], m0 + mova [r6+32*5], m0 + add r6, 64*4 + sub r0d, 4 + jg .zero_loop_topleft + RET +.zero_loop: ; clears 16 x 128 = 2048 contiguous bytes; r6 starts 32 bytes past the buffer start + mova [r6-32*1], m0 + mova [r6+32*0], m0 + mova [r6+32*1], m0 + mova [r6+32*2], m0 + add r6, 32*4 + dec r0d + jg .zero_loop + RET + +%macro IDCT2_64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4]) +%if %1 & 1 + mova m%5, [tmp2q-32*(51-%1)] ; idct2_16 out 0+n + mova m%4, [tmp1q-32*(14+%1)] ; idct2_32 out31-n +%else + mova m%5, [tmp1q-32*(45-%1)] + mova m%4, [tmp2q-32*(20+%1)] +%endif + psubsw m%6, m%5, m%4 ; idct2_32 out31-n + paddsw m%5, m%4 ; idct2_32 out 0+n + psubsw m%4, m%6, m%3 ; out32+n + paddsw m%6, m%3 ; out31-n + psubsw m%3, m%5, m%2 ; out63-n + paddsw m%5, m%2 ; out 0+n +%if %0 == 6 ; pass 1 +%if %1 & 1 + mova [tmp2q-32*(19-%1)], m%4 + mova [tmp1q-32*(14+%1)], m%6 + mova [tmp1q+32*(18-%1)], m%3 + mova [tmp2q-32*(51-%1)], m%5 +%else + mova [tmp1q-32*(13-%1)], m%4 + mova [tmp2q-32*(20+%1)], m%6 + mova [tmp2q+32*(12-%1)], m%3 + mova [tmp1q-32*(45-%1)], m%5 +%endif +%else ; pass 2 + REPX {pmulhrsw x, m14}, m%4, m%6, m%3, m%5 +%if %1 & 1 + %define %%d0 r2 + %define %%d1 dstq +%else + %define %%d0 dstq + %define %%d1 r2 +%endif + pmovzxbw m%2, [%%d0+%9 ] + paddw m%2, m%4 + pmovzxbw m%4, [%%d1+%8 ] + paddw m%4, m%6 + pmovzxbw m%6, [%%d1+%10] + paddw m%3, m%6 + pmovzxbw m%6, [%%d0+%7 ] + paddw m%5, m%6 + packuswb m%2, m%4 + packuswb m%3, m%5 + 
vpermq m%2, m%2, q3120 + vpermq m%3, m%3, q3120 + mova [%%d0+%9 ], xm%2 + vextracti128 [%%d1+%8 ], m%2, 1 + mova [%%d1+%10], xm%3 + vextracti128 [%%d0+%7 ], m%3, 1 +%endif +%endmacro + +cglobal vvc_inv_dct2_dct2_16x64_8, 4, 4, 0, dst, stride, c, eob + lea r6, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(vvc_pw_64x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(vvc_pw_8192)] + mov [cq], eobd + or r3d, 64 + jmp m(vvc_inv_dct2_dct2_16x4_8).dconly +.normal: + PROLOGUE 0, 10, 16, 32*67, dst, stride, c, eob, tmp1, tmp2 + %undef cmp + lea tmp1q, [rsp+32*23] + lea tmp2q, [tmp1q+32*24] + sub eobd, 151 + mov r7d, eobd +.pass1_loop: + LOAD_16ROWS cq, 64 + call m(idct2_16x16_internal_8).main + mova m1, [rsp+32*1] + mova [rsp+32*0], m6 + mova [rsp+32*1], m7 + vpbroadcastd m7, [o(vvc_pw_8192)] + call m(vvc_inv_dct2_dct2_16x32_8).transpose_2x8x8_round + mova m15, [rsp+32*0] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m2 + mova [tmp1q-32*2], m4 + mova [tmp1q-32*1], m6 + mova [tmp1q+32*0], m8 + mova [tmp1q+32*1], m10 + mova [tmp1q+32*2], m12 + mova [tmp1q+32*3], m14 + mova [tmp2q-32*4], m1 + mova [tmp2q-32*3], m3 + mova [tmp2q-32*2], m5 + mova [tmp2q-32*1], m7 + mova [tmp2q+32*0], m9 + mova [tmp2q+32*1], m11 + mova [tmp2q+32*2], m13 + mova [tmp2q+32*3], m15 + add cq, 32 + add tmp1q, 32*8 + add tmp2q, 32*8 + add eobd, 0x80000000 + jnc .pass1_loop + lea r2, [rsp+32*23] + mova xm0, [r2-32*4+ 0] + mova xm1, [r2-32*2+ 0] + vinserti128 m0, [r2+32*0+ 0], 1 + vinserti128 m1, [r2+32*2+ 0], 1 + mova xm2, [r2-32*4+16] + mova xm3, [r2-32*2+16] + vinserti128 m2, [r2+32*0+16], 1 + vinserti128 m3, [r2+32*2+16], 1 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14 + test r7d, r7d + jl .fast + lea r3, [r2+32*8] + mova xm4, [r3-32*4+ 0] + mova xm5, [r3-32*2+ 0] + vinserti128 m4, [r3+32*0+ 0], 1 + vinserti128 m5, [r3+32*2+ 0], 1 + mova xm6, [r3-32*4+16] + mova xm7, [r3-32*2+16] + vinserti128 m6, [r3+32*0+16], 1 + vinserti128 m7, [r3+32*2+16], 1 +.fast: + mova 
[rsp], m8 + lea tmp1q, [rsp+32*7] + call m(idct2_16x16_internal_8).main + mova m1, [rsp+32*1] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m1 + mova [tmp1q-32*2], m2 + mova [tmp1q-32*1], m3 + mova [tmp1q+32*0], m4 + mova [tmp1q+32*1], m5 + mova [tmp1q+32*2], m6 + mova [tmp1q+32*3], m7 + add tmp1q, 32*8 + mova [tmp1q-32*4], m8 + mova [tmp1q-32*3], m9 + mova [tmp1q-32*2], m10 + mova [tmp1q-32*1], m11 + mova [tmp1q+32*0], m12 + mova [tmp1q+32*1], m13 + mova [tmp1q+32*2], m14 + mova [tmp1q+32*3], m15 + mova xm0, [r2-32*3+ 0] + mova xm1, [r2-32*1+ 0] + vinserti128 m0, [r2+32*1+ 0], 1 + vinserti128 m1, [r2+32*3+ 0], 1 + mova xm2, [r2-32*3+16] + mova xm3, [r2-32*1+16] + vinserti128 m2, [r2+32*1+16], 1 + vinserti128 m3, [r2+32*3+16], 1 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + test r7d, r7d + jl .fast2 + mova xm4, [r3-32*3+ 0] + mova xm5, [r3-32*1+ 0] + vinserti128 m4, [r3+32*1+ 0], 1 + vinserti128 m5, [r3+32*3+ 0], 1 + mova xm6, [r3-32*3+16] + mova xm7, [r3-32*1+16] + vinserti128 m6, [r3+32*1+16], 1 + vinserti128 m7, [r3+32*3+16], 1 +.fast2: + add tmp1q, 32*8 + lea tmp2q, [tmp1q+32*8] + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf_fast + add r2, 32*24 + vpbroadcastd m15, [o(vvc_pd_64)] + add tmp1q, 32*16 + add tmp2q, 32*32 + mova xm0, [r2-32*4+ 0] + mova xm3, [r2-32*1+16] + vinserti128 m0, [r2+32*0+ 0], 1 + vinserti128 m3, [r2+32*3+16], 1 + mova xm4, [r2-32*4+16] + mova xm7, [r2-32*1+ 0] + vinserti128 m4, [r2+32*0+16], 1 + vinserti128 m7, [r2+32*3+ 0], 1 + pxor m1, m1 + REPX {mova x, m1}, m2, m5, m6 + test r7d, r7d + jl .fast3 + add r3, 32*24 + mova xm1, [r3-32*1+16] + mova xm2, [r3-32*4+ 0] + vinserti128 m1, [r3+32*3+16], 1 + vinserti128 m2, [r3+32*0+ 0], 1 + mova xm5, [r3-32*1+ 0] + mova xm6, [r3-32*4+16] + vinserti128 m5, [r3+32*3+ 0], 1 + vinserti128 m6, [r3+32*0+16], 1 +.fast3: + add r6, o_idct2_64_offset + call m(vvc_inv_dct2_dct2_16x64_8).main_part1 + add r6, 8 + add tmp1q, 32*8 + sub tmp2q, 32*8 + mova xm0, [r2-32*2+ 0] + mova xm3, [r2-32*3+16] + 
vinserti128 m0, [r2+32*2+ 0], 1 + vinserti128 m3, [r2+32*1+16], 1 + mova xm4, [r2-32*2+16] + mova xm7, [r2-32*3+ 0] + vinserti128 m4, [r2+32*2+16], 1 + vinserti128 m7, [r2+32*1+ 0], 1 + pxor m1, m1 + REPX {mova x, m1}, m2, m5, m6 + test r7d, r7d + jl .fast4 + mova xm1, [r3-32*3+16] + mova xm2, [r3-32*2+ 0] + vinserti128 m1, [r3+32*1+16], 1 + vinserti128 m2, [r3+32*2+ 0], 1 + mova xm5, [r3-32*3+ 0] + mova xm6, [r3-32*2+16] + vinserti128 m5, [r3+32*1+ 0], 1 + vinserti128 m6, [r3+32*2+16], 1 +.fast4: + call m(vvc_inv_dct2_dct2_16x64_8).main_part1 + call m(vvc_inv_dct2_dct2_16x64_8).main_part2_pass2 + RET +ALIGN function_align +%define o_base idct2_64_mul - 8 +cglobal_label .main_part1 + ; idct2_64 steps 1-5: + ; in1/31/17/15/ 9/23/25/ 7 -> + ; t32a/33/34a/35/36/37a/38/39a/56a/57/58a/59/60/61a/62/63a + ; in5/27/21/11/13/19/29/ 3 -> + ; t40a/41/42a/43/44/45a/46/47a/48a/49/50a/51/52/53a/54/55a + vpbroadcastd m11, [o(idct2_64_mul+4* 0)] + vpbroadcastd m13, [o(idct2_64_mul+4* 1)] + vpbroadcastd m10, [o(idct2_64_mul+4* 4)] + vpbroadcastd m12, [o(idct2_64_mul+4* 5)] + pmulhrsw m11, m0 ; t63a + pmulhrsw m0, m13 ; t32a + pmulhrsw m10, m1 ; t62a + pmulhrsw m1, m12 ; t33a + vpbroadcastd m9, [o(idct2_64_mul+4* 8)] + vpbroadcastd m13, [o(idct2_64_mul+4* 9)] + vpbroadcastd m8, [o(idct2_64_mul+4*12)] + vpbroadcastd m12, [o(idct2_64_mul+4*13)] + pmulhrsw m9, m2 ; t61a + pmulhrsw m2, m13 ; t34a + pmulhrsw m8, m3 ; t60a + pmulhrsw m3, m12 ; t35a + psubsw m12, m0, m1 ; t33 + paddsw m0, m1 ; t32 + psubsw m1, m3, m2 ; t34 + paddsw m3, m2 ; t35 + psubsw m2, m8, m9 ; t61 + paddsw m8, m9 ; t60 + psubsw m9, m11, m10 ; t62 + paddsw m11, m10 ; t63 + ITX_MULSUB_2W 2, 1, 10, 13, 15, m90, 9, 0 ; t34a, t61a + vpbroadcastd m14, [o(vvc_pw_9_90)] + ITX_MULSUB_2W 9, 12, 10, 13, 15, 14, 13, 1 ; t33a, t62a + psubsw m10, m0, m3 ; t35a + paddsw m0, m3 ; t32a + psubsw m3, m11, m8 ; t60a + paddsw m11, m8 ; t63a + psubsw m8, m9, m2 ; t34 + paddsw m9, m2 ; t33 + psubsw m2, m12, m1 ; t61 + paddsw m12, m1 ; t62 
+ mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m9 + mova [tmp2q+32*2], m12 + mova [tmp2q+32*3], m11 + vpbroadcastd m13, [o(vvc_pw_m89_18)] + vpbroadcastd m14, [o(vvc_pw_18_89)] + ITX_MULSUB_2W 2, 8, 0, 1, 15, 14, 13, 1 ; t34a, t61a + ITX_MULSUB_2W 3, 10, 0, 1, 15, 14, 13, 1 ; t35, t60 + mova [tmp1q-32*2], m2 + mova [tmp1q-32*1], m3 + mova [tmp2q+32*0], m10 + mova [tmp2q+32*1], m8 + vpbroadcastd m3, [o(idct2_64_mul+4*16)] + vpbroadcastd m11, [o(idct2_64_mul+4*17)] + vpbroadcastd m2, [o(idct2_64_mul+4*20)] + vpbroadcastd m10, [o(idct2_64_mul+4*21)] + vpbroadcastd m1, [o(idct2_64_mul+4*24)] + vpbroadcastd m9, [o(idct2_64_mul+4*25)] + vpbroadcastd m0, [o(idct2_64_mul+4*28)] + vpbroadcastd m8, [o(idct2_64_mul+4*29)] + pmulhrsw m3, m4 ; t59a + pmulhrsw m4, m11 ; t36a + pmulhrsw m2, m5 ; t58a + pmulhrsw m5, m10 ; t37a + pmulhrsw m1, m6 ; t57a + pmulhrsw m6, m9 ; t38a + pmulhrsw m0, m7 ; t56a + pmulhrsw m7, m8 ; t39a + psubsw m8, m4, m5 ; t37 + paddsw m4, m5 ; t36 + psubsw m5, m7, m6 ; t38 + paddsw m7, m6 ; t39 + psubsw m6, m0, m1 ; t57 + paddsw m0, m1 ; t56 + psubsw m1, m3, m2 ; t58 + paddsw m3, m2 ; t59 + ITX_MULSUB_2W 6, 5, 2, 9, 15, m57, 70, 0 ; t38a, t57a + vpbroadcastd m10, [o(vvc_pw_70_57)] + ITX_MULSUB_2W 1, 8, 2, 9, 15, 10, 9, 1 ; t37a, t58a + psubsw m2, m7, m4 ; t36a + paddsw m7, m4 ; t39a + psubsw m4, m0, m3 ; t59a + paddsw m0, m3 ; t56a + psubsw m3, m6, m1 ; t37 + paddsw m6, m1 ; t38 + psubsw m1, m5, m8 ; t58 + paddsw m5, m8 ; t57 + mova [tmp1q+32*2], m6 + mova [tmp1q+32*3], m7 + mova [tmp2q-32*4], m0 + mova [tmp2q-32*3], m5 + vpbroadcastd m6, [o(vvc_pw_m18_m89)] + vpbroadcastd m7, [o(vvc_pw_m89_18)] + ITX_MULSUB_2W 4, 2, 0, 5, 15, 7, 6, 1 ; t36, t59 + ITX_MULSUB_2W 1, 3, 0, 5, 15, 7, 6, 1 ; t37a, t58a + mova [tmp1q+32*0], m4 + mova [tmp1q+32*1], m1 + mova [tmp2q-32*2], m3 + mova [tmp2q-32*1], m2 + ret +%define o_base vvc_pw_5 + 128 +.main_part2_pass1: ; idct2_64 steps 6-9 + idct2_16/32/64 sumsub + sub r6, o_idct2_64_offset + 8 + vpbroadcastd m11, 
[o(vvc_pw_36_83)] + vpbroadcastd m12, [o(vvc_pw_m83_36)] + vpbroadcastd m13, [o(vvc_pw_64_64)] + vpbroadcastd m14, [o(vvc_pw_m64_64)] +.main_part2_pass1_loop: + call .main_part2_internal + IDCT2_64_PART2_END 0, 7, 0, 6, 9, 10 + IDCT2_64_PART2_END 7, 8, 5, 0, 6, 7 + IDCT2_64_PART2_END 8, 2, 1, 0, 6, 7 + IDCT2_64_PART2_END 15, 3, 4, 0, 6, 7 + cmp tmp1q, tmp2q + jne .main_part2_pass1_loop + ret +cglobal_label .main_part2_internal + mova m0, [tmp1q-32*12] ; t32a + mova m6, [tmp2q-32*13] ; t39a + mova m1, [tmp1q-32* 4] ; t40a + mova m5, [tmp2q+32* 3] ; t55a + add tmp1q, 32 + sub tmp2q, 32 + mova m2, [tmp1q+32* 3] ; t48a + mova m4, [tmp2q-32* 4] ; t47a + mova m3, [tmp1q+32*11] ; t56a + mova m7, [tmp2q+32*12] ; t63a + psubsw m8, m0, m6 ; t39 + paddsw m0, m6 ; t32 + psubsw m6, m4, m1 ; t40 + paddsw m4, m1 ; t47 + psubsw m1, m2, m5 ; t55 + paddsw m2, m5 ; t48 + psubsw m5, m7, m3 ; t56 + paddsw m7, m3 ; t63 + ITX_MULSUB_2W 5, 8, 3, 9, 15, 11, 12, 1 ; t39a, t56a + vpbroadcastd m9, [o(vvc_pw_m36_m83)] + ITX_MULSUB_2W 1, 6, 3, 9, 15, 12, 9, 1 ; t40a, t55a + psubsw m3, m0, m4 ; t47a + paddsw m0, m4 ; t32a + psubsw m4, m7, m2 ; t48a + paddsw m7, m2 ; t63a + psubsw m2, m5, m1 ; t40 + paddsw m5, m1 ; t39 + psubsw m1, m8, m6 ; t55 + paddsw m8, m6 ; t56 + ITX_MULSUB_2W 4, 3, 6, 9, 15, 13, 14, 1 ; t47, t48 + ITX_MULSUB_2W 1, 2, 6, 9, 15, 13, 14, 1 ; t40a, t55a + ret +.main_part2_pass2: + sub r6, o_idct2_64_offset + 8 + vpbroadcastd m11, [o(vvc_pw_36_83)] + vpbroadcastd m12, [o(vvc_pw_m83_36)] + vpbroadcastd m13, [o(vvc_pw_64_64)] + lea r9, [strideq*5] ; stride*5 + lea r3, [r9+strideq*1] ; stride*6 + lea r7, [r9+strideq*2] ; stride*7 + lea r8, [r3+strideq*2] ; stride*8 + lea r2, [dstq+r7] +.main_part2_pass2_loop: + vpbroadcastd m14, [o(vvc_pw_m64_64)] + call .main_part2_internal + vpbroadcastd m14, [o(vvc_pw_2048)] + IDCT2_64_PART2_END 0, 7, 0, 6, 9, 10, strideq*0, r3*4, r8*4, r7*8 + IDCT2_64_PART2_END 7, 8, 5, 0, 6, 7, strideq*0, r3*4, r8*4, r7*8 + IDCT2_64_PART2_END 8, 2, 1, 0, 6, 7, 
strideq*8, r8*2, r9*8, r3*8 + IDCT2_64_PART2_END 15, 3, 4, 0, 6, 7, strideq*8, r8*2, r9*8, r3*8 + add dstq, strideq + sub r2, strideq + cmp tmp1q, tmp2q + jne .main_part2_pass2_loop + ret + +cglobal vvc_inv_dct2_dct2_64x16_8, 4, 4, 0, dst, stride, c, eob + lea r6, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(vvc_pw_64x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(vvc_pw_8192)] + mov [cq], eobd + or r3d, 16 +.dconly: + pmulhrsw xm0, xm2 + movd xm2, [o(vvc_pw_2048)] + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm2 + vpbroadcastw m0, xm0 + pxor m1, m1 +.dconly_loop: + mova m2, [dstq+32*0] + mova m3, [dstq+32*1] + punpckhbw m4, m2, m1 + punpcklbw m2, m1 + punpckhbw m5, m3, m1 + punpcklbw m3, m1 + paddw m4, m0 + paddw m2, m0 + paddw m5, m0 + paddw m3, m0 + packuswb m2, m4 + packuswb m3, m5 + mova [dstq+32*0], m2 + mova [dstq+32*1], m3 + add dstq, strideq + dec r3d + jg .dconly_loop + RET +.normal: + PROLOGUE 0, 7, 16, 32*67, dst, stride, c, eob, tmp1, tmp2 + LOAD_8ROWS cq+32*0, 32*4 + pxor m8, m8 + REPX {mova [cq+32*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 + lea tmp1q, [rsp+32*7] + call m(idct2_16x16_internal_8).main + mova m1, [rsp+32*1] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m1 + mova [tmp1q-32*2], m2 + mova [tmp1q-32*1], m3 + mova [tmp1q+32*0], m4 + mova [tmp1q+32*1], m5 + mova [tmp1q+32*2], m6 + mova [tmp1q+32*3], m7 + add tmp1q, 32*8 + mova [tmp1q-32*4], m8 + mova [tmp1q-32*3], m9 + mova [tmp1q-32*2], m10 + mova [tmp1q-32*1], m11 + mova [tmp1q+32*0], m12 + mova [tmp1q+32*1], m13 + mova [tmp1q+32*2], m14 + mova [tmp1q+32*3], m15 + LOAD_8ROWS cq+32*2, 32*4 + pxor m8, m8 + REPX {mova [cq+32*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30 + add tmp1q, 32*8 + lea tmp2q, [tmp1q+32*8] + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf_fast + vpbroadcastd m15, [o(vvc_pd_64)] + add tmp1q, 32*16 + add tmp2q, 32*32 + mova m0, [cq+32* 1] + mova m1, [cq+32*31] + mova m2, [cq+32*17] + mova m3, [cq+32*15] + mova m4, [cq+32* 
9] + mova m5, [cq+32*23] + mova m6, [cq+32*25] + mova m7, [cq+32* 7] + pxor m8, m8 + REPX {mova [cq+32*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7 + add r6, o_idct2_64_offset + call m(vvc_inv_dct2_dct2_16x64_8).main_part1 + add r6, 8 + add tmp1q, 32*8 + sub tmp2q, 32*8 + mova m0, [cq+32* 5] + mova m1, [cq+32*27] + mova m2, [cq+32*21] + mova m3, [cq+32*11] + mova m4, [cq+32*13] + mova m5, [cq+32*19] + mova m6, [cq+32*29] + mova m7, [cq+32* 3] + pxor m8, m8 + REPX {mova [cq+32*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3 + call m(vvc_inv_dct2_dct2_16x64_8).main_part1 + call m(vvc_inv_dct2_dct2_16x64_8).main_part2_pass1 + sub tmp1q, 32*36 + lea r2, [strideq*3] + mov tmp2d, 4 +.pass2_loop: + lea r3, [tmp1q-32*8] + mova xm0, [r3 -32*4] + mova xm1, [r3 -32*3] + vinserti128 m0, [tmp1q-32*4], 1 + vinserti128 m1, [tmp1q-32*3], 1 + mova xm2, [r3 -32*2] + mova xm3, [r3 -32*1] + vinserti128 m2, [tmp1q-32*2], 1 + vinserti128 m3, [tmp1q-32*1], 1 + mova xm4, [r3 +32*0] + mova xm5, [r3 +32*1] + vinserti128 m4, [tmp1q+32*0], 1 + vinserti128 m5, [tmp1q+32*1], 1 + mova xm6, [r3 +32*2] + mova xm7, [r3 +32*3] + vinserti128 m6, [tmp1q+32*2], 1 + vinserti128 m7, [tmp1q+32*3], 1 + mova xm8, [r3 -32*4+16] + mova xm9, [r3 -32*3+16] + vinserti128 m8, [tmp1q-32*4+16], 1 + vinserti128 m9, [tmp1q-32*3+16], 1 + mova xm10, [r3 -32*2+16] + mova xm11, [r3 -32*1+16] + vinserti128 m10, [tmp1q-32*2+16], 1 + vinserti128 m11, [tmp1q-32*1+16], 1 + mova xm12, [r3 +32*0+16] + mova xm13, [r3 +32*1+16] + vinserti128 m12, [tmp1q+32*0+16], 1 + vinserti128 m13, [tmp1q+32*1+16], 1 + mova xm14, [r3 +32*2+16] + mova xm15, [r3 +32*3+16] + vinserti128 m14, [tmp1q+32*2+16], 1 + vinserti128 m15, [tmp1q+32*3+16], 1 + mova [rsp+32*0], m6 + mova [rsp+32*1], m7 + vpbroadcastd m7, [o(vvc_pw_8192)] + call m(vvc_inv_dct2_dct2_16x32_8).transpose_2x8x8_round + call m(idct2_16x16_internal_8).main + mova [rsp+32*0], m15 + vpbroadcastd m15, [o(vvc_pw_2048)] + REPX {pmulhrsw x, m15}, m0, m2, m3, m4, m5, m6, m7 + WRITE_16X2 2, 3, 1, 2, 
strideq*2, r2 + pmulhrsw m1, m15, [rsp+32*1] + WRITE_16X2 0, 1, 2, 3, strideq*0, strideq*1 + lea r3, [dstq+strideq*4] + %define dstq r3 + WRITE_16X2 4, 5, 2, 3, strideq*0, strideq*1 + WRITE_16X2 6, 7, 2, 3, strideq*2, r2 + REPX {pmulhrsw x, m15}, m8, m9, m10, m11, m12, m13, m14 + lea r3, [r3+strideq*4] + WRITE_16X2 8, 9, 2, 3, strideq*0, strideq*1 + WRITE_16X2 10, 11, 2, 3, strideq*2, r2 + pmulhrsw m15, [rsp+32*0] + lea r3, [r3+strideq*4] + WRITE_16X2 12, 13, 2, 3, strideq*0, strideq*1 + WRITE_16X2 14, 15, 2, 3, strideq*2, r2 + add tmp1q, 32*16 + add r0, 16 + dec tmp2d + jg .pass2_loop + RET + +cglobal vvc_inv_dct2_dct2_32x64_8, 4, 4, 0, dst, stride, c, eob + lea r6, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(vvc_pw_64x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(vvc_pw_16384)] + mov [cq], eobd + pmulhrsw xm0, xm1 + or r3d, 64 + jmp m(vvc_inv_dct2_dct2_32x8_8).dconly +.normal: + PROLOGUE 0, 11, 16, 32*99, dst, stride, c, eob, tmp1, tmp2 + lea tmp1q, [rsp+32*7] + lea r10d, [eobq-136] + sar r10d, 31 +.pass1_loop: + lea tmp2q, [tmp1q+32*16] + LOAD_8ROWS cq+64*1, 64*2, 1 + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15 + test r10b, r10b + jnz .fast + LOAD_8ROWS_H cq+64*17, 64*2, 2 + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf + LOAD_8ROWS_H cq+64*16, 64*2, 1 + mova [rsp], m15 + pxor m15, m15 + REPX {mova [cq+64*x], m15}, 16, 17, 18, 19, 20, 21, 22, 23, \ + 24, 25, 26, 27, 28, 29, 30, 31 + jmp .idct2_16 +.fast: + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf_fast + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 +.idct2_16: + LOAD_8ROWS cq+64*0, 64*2, 1 + pxor m15, m15 + REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14 + call m(idct2_16x16_internal_8).main + call m(vvc_inv_dct2_dct2_32x16_8).pass1_end + vpbroadcastd m7, [o(vvc_pw_16384)] + call m(vvc_inv_dct2_dct2_16x32_8).transpose_2x8x8_round + lea r3, [tmp1q+32*48] + mova m15, [rsp] + mova [r3-32*4], m0 + mova [r3-32*3], m2 + mova [r3-32*2], m4 + 
mova [r3-32*1], m6 + mova [r3+32*0], m8 + mova [r3+32*1], m10 + mova [r3+32*2], m12 + mova [r3+32*3], m14 + add r3, 32*24 + mova [r3-32*4], m1 + mova [r3-32*3], m3 + mova [r3-32*2], m5 + mova [r3-32*1], m7 + mova [r3+32*0], m9 + mova [r3+32*1], m11 + mova [r3+32*2], m13 + mova [r3+32*3], m15 + vpbroadcastd m9, [o(vvc_pw_16384)] + pmulhrsw m0, m9, [tmp1q-32*4] + pmulhrsw m1, m9, [tmp1q-32*3] + pmulhrsw m2, m9, [tmp1q-32*2] + pmulhrsw m3, m9, [tmp1q-32*1] + pmulhrsw m4, m9, [tmp1q+32*0] + pmulhrsw m5, m9, [tmp1q+32*1] + pmulhrsw m6, m9, [tmp1q+32*2] + pmulhrsw m7, m9, [tmp1q+32*3] + call m(vvc_inv_identity_identity_8x32_8).transpose8x8 + mova [tmp1q-32*4], m0 + pmulhrsw m0, m9, [tmp2q-32*4] + mova [tmp2q-32*4], m1 + pmulhrsw m1, m9, [tmp2q-32*3] + mova [tmp1q-32*3], m2 + pmulhrsw m2, m9, [tmp2q-32*2] + mova [tmp2q-32*3], m3 + pmulhrsw m3, m9, [tmp2q-32*1] + mova [tmp1q-32*2], m4 + pmulhrsw m4, m9, [tmp2q+32*0] + mova [tmp2q-32*2], m5 + pmulhrsw m5, m9, [tmp2q+32*1] + mova [tmp1q-32*1], m6 + pmulhrsw m6, m9, [tmp2q+32*2] + mova [tmp2q-32*1], m7 + pmulhrsw m7, m9, [tmp2q+32*3] + call m(vvc_inv_identity_identity_8x32_8).transpose8x8 + mova [tmp1q+32*0], m0 + mova [tmp2q+32*0], m1 + mova [tmp1q+32*1], m2 + mova [tmp2q+32*1], m3 + mova [tmp1q+32*2], m4 + mova [tmp2q+32*2], m5 + mova [tmp1q+32*3], m6 + mova [tmp2q+32*3], m7 + add cq, 32 + add tmp1q, 32*8 + add r10d, 0x80000000 + jnc .pass1_loop + lea r2, [rsp+32*55] + lea r7, [r2+32*24] +.pass2_loop: + lea r3, [r2+32*8] + lea r8, [r7+32*8] + mova m0, [r2-32*4] + mova m1, [r2-32*2] + mova m2, [r2+32*0] + mova m3, [r2+32*2] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14 + test r10b, r10b + jnz .fast2 + mova m4, [r3-32*4] + mova m5, [r3-32*2] + mova m6, [r3+32*0] + mova m7, [r3+32*2] +.fast2: + mova [rsp], m8 + lea tmp1q, [rsp+32*39] + call m(idct2_16x16_internal_8).main + mova m1, [rsp+32*1] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m1 + mova [tmp1q-32*2], m2 + mova [tmp1q-32*1], m3 + mova 
[tmp1q+32*0], m4 + mova [tmp1q+32*1], m5 + mova [tmp1q+32*2], m6 + mova [tmp1q+32*3], m7 + add tmp1q, 32*8 + mova [tmp1q-32*4], m8 + mova [tmp1q-32*3], m9 + mova [tmp1q-32*2], m10 + mova [tmp1q-32*1], m11 + mova [tmp1q+32*0], m12 + mova [tmp1q+32*1], m13 + mova [tmp1q+32*2], m14 + mova [tmp1q+32*3], m15 + mova m0, [r2-32*3] + mova m1, [r2-32*1] + mova m2, [r2+32*1] + mova m3, [r2+32*3] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + test r10b, r10b + jnz .fast3 + mova m4, [r3-32*3] + mova m5, [r3-32*1] + mova m6, [r3+32*1] + mova m7, [r3+32*3] +.fast3: + add tmp1q, 32*8 + lea tmp2q, [tmp1q+32*8] + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf_fast + vpbroadcastd m15, [o(vvc_pd_64)] + add tmp1q, 32*16 + add tmp2q, 32*32 + mova m0, [r7-32*4] + mova m3, [r7+32*3] + mova m4, [r7+32*0] + mova m7, [r7-32*1] + pxor m1, m1 + REPX {mova x, m1}, m2, m5, m6 + test r10b, r10b + jnz .fast4 + mova m1, [r8+32*3] + mova m2, [r8-32*4] + mova m5, [r8-32*1] + mova m6, [r8+32*0] +.fast4: + add r6, o_idct2_64_offset + call m(vvc_inv_dct2_dct2_16x64_8).main_part1 + add r6, 8 + add tmp1q, 32*8 + sub tmp2q, 32*8 + mova m0, [r7-32*2] + mova m3, [r7+32*1] + mova m4, [r7+32*2] + mova m7, [r7-32*3] + pxor m1, m1 + REPX {mova x, m1}, m2, m5, m6 + test r10b, r10b + jnz .fast5 + mova m1, [r8+32*1] + mova m2, [r8-32*2] + mova m5, [r8-32*3] + mova m6, [r8+32*2] +.fast5: + call m(vvc_inv_dct2_dct2_16x64_8).main_part1 + call m(vvc_inv_dct2_dct2_16x64_8).main_part2_pass2 + add r10d, 0x80000000 + jc .ret + lea r2, [rsp+32*7] + lea r7, [r2+32*16] + sub dstq, r8 + lea dstq, [dstq+strideq*4+16] + jmp .pass2_loop +.ret: + RET + +cglobal vvc_inv_dct2_dct2_64x32_8, 4, 4, 0, dst, stride, c, eob + lea r6, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(vvc_pw_64x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(vvc_pw_16384)] + mov [cq], eobd + pmulhrsw xm0, xm1 + or r3d, 32 + jmp m(vvc_inv_dct2_dct2_64x16_8).dconly +.normal: + PROLOGUE 0, 9, 16, 32*131, dst, stride, c, eob, tmp1, tmp2, \ + base, tmp3, 
tmp4 + lea tmp1q, [rsp+32*7] + lea tmp4d, [eobq-136] +.pass1_loop: + LOAD_8ROWS cq+64*0, 64*4, 1 + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 + call m(idct2_16x16_internal_8).main + mova m1, [rsp+32*1] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m1 + mova [tmp1q-32*2], m2 + mova [tmp1q-32*1], m3 + mova [tmp1q+32*0], m4 + mova [tmp1q+32*1], m5 + mova [tmp1q+32*2], m6 + mova [tmp1q+32*3], m7 + add tmp1q, 32*8 + mova [tmp1q-32*4], m8 + mova [tmp1q-32*3], m9 + mova [tmp1q-32*2], m10 + mova [tmp1q-32*1], m11 + mova [tmp1q+32*0], m12 + mova [tmp1q+32*1], m13 + mova [tmp1q+32*2], m14 + mova [tmp1q+32*3], m15 + LOAD_8ROWS cq+64*2, 64*4, 1 + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30 + add tmp1q, 32*8 + lea tmp2q, [tmp1q+32*8] + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf_fast + vpbroadcastd m15, [o(vvc_pd_64)] + add tmp1q, 32*16 + add tmp2q, 32*32 + vpbroadcastd m7, [o(vvc_pw_64x8)] + pmulhrsw m0, m7, [cq+64* 1] + pmulhrsw m1, m7, [cq+64*31] + pmulhrsw m2, m7, [cq+64*17] + pmulhrsw m3, m7, [cq+64*15] + pmulhrsw m4, m7, [cq+64* 9] + pmulhrsw m5, m7, [cq+64*23] + pmulhrsw m6, m7, [cq+64*25] + pmulhrsw m7, [cq+64* 7] + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7 + add r6, o_idct2_64_offset + call m(vvc_inv_dct2_dct2_16x64_8).main_part1 + vpbroadcastd m7, [o(vvc_pw_64x8-(o_idct2_64_offset))] + add r6, 8 + add tmp1q, 32*8 + sub tmp2q, 32*8 + pmulhrsw m0, m7, [cq+64* 5] + pmulhrsw m1, m7, [cq+64*27] + pmulhrsw m2, m7, [cq+64*21] + pmulhrsw m3, m7, [cq+64*11] + pmulhrsw m4, m7, [cq+64*13] + pmulhrsw m5, m7, [cq+64*19] + pmulhrsw m6, m7, [cq+64*29] + pmulhrsw m7, [cq+64* 3] + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3 + call m(vvc_inv_dct2_dct2_16x64_8).main_part1 + call m(vvc_inv_dct2_dct2_16x64_8).main_part2_pass1 + sub tmp1q, 32*44 + vpbroadcastd m10, [o(vvc_pw_16384)] + call 
m(vvc_inv_dct2_dct2_64x32_8).transpose_round_interleave + add cq, 32 + add tmp4d, 0x80000000 + jnc .pass1_loop + lea tmp1q, [rsp+32*15] + imul r2, strideq, 19 + lea r3, [strideq*3] + add r2, dstq + mov tmp4b, 4 +.pass2_loop: + lea tmp2q, [tmp1q+32*64] + LOAD_8ROWS tmp1q-32*4, 32 + test tmp4d, 0x40000000 + jnz .fast + LOAD_8ROWS_H tmp2q-32*4, 32 + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf + lea tmp3q, [tmp2q-32*8] + LOAD_8ROWS_H tmp3q-32*4, 32 + mova [rsp], m15 + jmp .idct2_16 +.fast: + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf_fast + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 +.idct2_16: + lea tmp3q, [tmp1q-32*8] + LOAD_8ROWS tmp3q-32*4, 32 + call m(idct2_16x16_internal_8).main + call m(vvc_inv_dct2_dct2_16x32_8).pass2_end + add tmp1q, 32*16 + sub dstq, r3 + lea r2, [r2+r3+16] + add dstq, 16 + dec tmp4b + jg .pass2_loop + RET +ALIGN function_align +.transpose_round_interleave: + mov tmp3d, 4 +.loop: + lea tmp2q, [tmp1q+32*8] + mova xm0, [tmp1q-32*4] + mova xm1, [tmp1q-32*3] + vinserti128 m0, [tmp2q-32*4], 1 + vinserti128 m1, [tmp2q-32*3], 1 + mova xm2, [tmp1q-32*2] + mova xm3, [tmp1q-32*1] + vinserti128 m2, [tmp2q-32*2], 1 + vinserti128 m3, [tmp2q-32*1], 1 + mova xm4, [tmp1q+32*0] + mova xm5, [tmp1q+32*1] + vinserti128 m4, [tmp2q+32*0], 1 + vinserti128 m5, [tmp2q+32*1], 1 + mova xm6, [tmp1q+32*2] + mova xm7, [tmp1q+32*3] + vinserti128 m6, [tmp2q+32*2], 1 + vinserti128 m7, [tmp2q+32*3], 1 + REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(vvc_inv_identity_identity_8x32_8).transpose8x8 + mova xm8, [tmp1q-32*4+16] + mova xm9, [tmp1q-32*3+16] + vinserti128 m8, [tmp2q-32*4+16], 1 + vinserti128 m9, [tmp2q-32*3+16], 1 + mova [tmp1q-32*4], m0 + mova [tmp2q-32*4], m1 + mova [tmp1q-32*3], m2 + mova [tmp2q-32*3], m3 + mova xm2, [tmp1q-32*2+16] + mova xm3, [tmp1q-32*1+16] + vinserti128 m2, [tmp2q-32*2+16], 1 + vinserti128 m3, [tmp2q-32*1+16], 1 + mova [tmp1q-32*2], m4 + mova [tmp2q-32*2], m5 + mova [tmp1q-32*1], m6 + 
mova [tmp2q-32*1], m7 + mova xm4, [tmp1q+32*0+16] + mova xm5, [tmp1q+32*1+16] + vinserti128 m4, [tmp2q+32*0+16], 1 + vinserti128 m5, [tmp2q+32*1+16], 1 + mova xm6, [tmp1q+32*2+16] + mova xm7, [tmp1q+32*3+16] + vinserti128 m6, [tmp2q+32*2+16], 1 + vinserti128 m7, [tmp2q+32*3+16], 1 + pmulhrsw m0, m8, m10 + pmulhrsw m1, m9, m10 + REPX {pmulhrsw x, m10}, m2, m3, m4, m5, m6, m7 + call m(vvc_inv_identity_identity_8x32_8).transpose8x8 + mova [tmp1q+32*0], m0 + mova [tmp2q+32*0], m1 + mova [tmp1q+32*1], m2 + mova [tmp2q+32*1], m3 + mova [tmp1q+32*2], m4 + mova [tmp2q+32*2], m5 + mova [tmp1q+32*3], m6 + mova [tmp2q+32*3], m7 + add tmp1q, 32*16 + dec tmp3d + jg .loop + ret -const vvc_pd_64, dd 64 -const vvc_pd_512, dd 512 +cglobal vvc_inv_dct2_dct2_64x64_8, 4, 4, 0, dst, stride, c, eob + lea r6, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(vvc_pw_64x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(vvc_pw_8192)] + mov [cq], eobd + or r3d, 64 + jmp m(vvc_inv_dct2_dct2_64x16_8).dconly +.normal: + PROLOGUE 0, 11, 16, 32*199, dst, stride, c, eob, tmp1, tmp2 + lea tmp1q, [rsp+32*71] + lea r10d, [eobq-136] +.pass1_loop: + LOAD_8ROWS cq+64*0, 64*4 + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 + call m(idct2_16x16_internal_8).main + mova m1, [rsp+32*1] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m1 + mova [tmp1q-32*2], m2 + mova [tmp1q-32*1], m3 + mova [tmp1q+32*0], m4 + mova [tmp1q+32*1], m5 + mova [tmp1q+32*2], m6 + mova [tmp1q+32*3], m7 + add tmp1q, 32*8 + mova [tmp1q-32*4], m8 + mova [tmp1q-32*3], m9 + mova [tmp1q-32*2], m10 + mova [tmp1q-32*1], m11 + mova [tmp1q+32*0], m12 + mova [tmp1q+32*1], m13 + mova [tmp1q+32*2], m14 + mova [tmp1q+32*3], m15 + LOAD_8ROWS cq+64*2, 64*4 + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30 + add tmp1q, 32*8 + lea tmp2q, [tmp1q+32*8] + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf_fast + vpbroadcastd m15, [o(vvc_pd_64)] + add 
tmp1q, 32*16 + add tmp2q, 32*32 + mova m0, [cq+64* 1] + mova m1, [cq+64*31] + mova m2, [cq+64*17] + mova m3, [cq+64*15] + mova m4, [cq+64* 9] + mova m5, [cq+64*23] + mova m6, [cq+64*25] + mova m7, [cq+64* 7] + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7 + add r6, o_idct2_64_offset + call m(vvc_inv_dct2_dct2_16x64_8).main_part1 + add r6, 8 + add tmp1q, 32*8 + sub tmp2q, 32*8 + mova m0, [cq+64* 5] + mova m1, [cq+64*27] + mova m2, [cq+64*21] + mova m3, [cq+64*11] + mova m4, [cq+64*13] + mova m5, [cq+64*19] + mova m6, [cq+64*29] + mova m7, [cq+64* 3] + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3 + call m(vvc_inv_dct2_dct2_16x64_8).main_part1 + call m(vvc_inv_dct2_dct2_16x64_8).main_part2_pass1 + sub tmp1q, 32*44 + vpbroadcastd m10, [o(vvc_pw_8192)] + call m(vvc_inv_dct2_dct2_64x32_8).transpose_round_interleave + add cq, 32 + add r10d, 0x80000000 + jnc .pass1_loop + lea tmp1q, [rsp+32*7] + mov r10b, 4 +.pass2_loop: + lea r2, [tmp1q+32*64] + mova m0, [r2-32*4] + mova m1, [r2-32*2] + mova m2, [r2+32*0] + mova m3, [r2+32*2] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14 + mova [rsp], m4 + test r10d, 0x40000000 + jnz .fast + lea r3, [r2+32*64] + mova m4, [r3-32*4] + mova m5, [r3-32*2] + mova m6, [r3+32*0] + mova m7, [r3+32*2] +.fast: + call m(idct2_16x16_internal_8).main + mova m1, [rsp+32*1] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m1 + mova [tmp1q-32*2], m2 + mova [tmp1q-32*1], m3 + mova [tmp1q+32*0], m4 + mova [tmp1q+32*1], m5 + mova [tmp1q+32*2], m6 + mova [tmp1q+32*3], m7 + add tmp1q, 32*8 + mova [tmp1q-32*4], m8 + mova [tmp1q-32*3], m9 + mova [tmp1q-32*2], m10 + mova [tmp1q-32*1], m11 + mova [tmp1q+32*0], m12 + mova [tmp1q+32*1], m13 + mova [tmp1q+32*2], m14 + mova [tmp1q+32*3], m15 + mova m0, [r2-32*3] + mova m1, [r2-32*1] + mova m2, [r2+32*1] + mova m3, [r2+32*3] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + test r10d, 0x40000000 + jnz .fast2 + mova m4, [r3-32*3] + mova m5, 
[r3-32*1] + mova m6, [r3+32*1] + mova m7, [r3+32*3] +.fast2: + add tmp1q, 32*8 + lea tmp2q, [tmp1q+32*8] + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf_fast + vpbroadcastd m15, [o(vvc_pd_64)] + add r2, 32*8 + add r3, 32*8 + add tmp1q, 32*16 + add tmp2q, 32*32 + mova m0, [r2-32*4] ; 1 + mova m3, [r2+32*3] ; 15 + mova m4, [r2+32*0] ; 9 + mova m7, [r2-32*1] ; 7 + pxor m1, m1 + REPX {mova x, m1}, m2, m5, m6 + test r10d, 0x40000000 + jnz .fast3 + mova m1, [r3+32*3] ; 31 + mova m2, [r3-32*4] ; 17 + mova m5, [r3-32*1] ; 23 + mova m6, [r3+32*0] ; 25 +.fast3: + add r6, o_idct2_64_offset + call m(vvc_inv_dct2_dct2_16x64_8).main_part1 + add r6, 8 + add tmp1q, 32*8 + sub tmp2q, 32*8 + mova m0, [r2-32*2] ; 5 + mova m3, [r2+32*1] ; 11 + mova m4, [r2+32*2] ; 13 + mova m7, [r2-32*3] ; 3 + pxor m1, m1 + REPX {mova x, m1}, m2, m5, m6 + test r10d, 0x40000000 + jnz .fast4 + mova m1, [r3+32*1] ; 27 + mova m2, [r3-32*2] ; 21 + mova m5, [r3-32*3] ; 19 + mova m6, [r3+32*2] ; 29 +.fast4: + call m(vvc_inv_dct2_dct2_16x64_8).main_part1 + call m(vvc_inv_dct2_dct2_16x64_8).main_part2_pass2 + sub tmp1q, 32*28 + sub dstq, r8 + lea dstq, [dstq+strideq*4+16] + dec r10b + jg .pass2_loop + RET %endif ; ARCH_X86_64 From e5dc61941195d8553d44402d9ca7ba63042a37c5 Mon Sep 17 00:00:00 2001 From: Frank Plowman Date: Fri, 18 Aug 2023 12:43:15 +0100 Subject: [PATCH 13/13] lavc/x86/vvc_itx: Change matrix coefficients to match VVC --- libavcodec/x86/vvc_itx_16bit.asm | 134 ++++++++++----------- libavcodec/x86/vvc_itx_8bit.asm | 198 +++++++++++++++---------------- 2 files changed, 166 insertions(+), 166 deletions(-) diff --git a/libavcodec/x86/vvc_itx_16bit.asm b/libavcodec/x86/vvc_itx_16bit.asm index 845c371c7f4..b60abcb5ffb 100644 --- a/libavcodec/x86/vvc_itx_16bit.asm +++ b/libavcodec/x86/vvc_itx_16bit.asm @@ -51,30 +51,30 @@ dd -%2, -%2 %endif %endmacro -COEF_PAIR 201, 995 +COEF_PAIR 4, 22 COEF_PAIR 9, 43 COEF_PAIR 18, 75 -COEF_PAIR 1380, 601 -COEF_PAIR 1751, 2440 +COEF_PAIR 31, 13 +COEF_PAIR 38, 54 
COEF_PAIR 57, 25 -COEF_PAIR 2751, 2106 +COEF_PAIR 61, 46 COEF_PAIR 64, 36, 1 COEF_PAIR 64, 83, 1 -COEF_PAIR 3035, 3513 +COEF_PAIR 67, 78 COEF_PAIR 70, 87 -COEF_PAIR 3703, 3290 -COEF_PAIR 3857, 4052 +COEF_PAIR 82, 73 +COEF_PAIR 85, 90 COEF_PAIR 89, 50 COEF_PAIR 90, 80 -COEF_PAIR 4091, 3973 +COEF_PAIR 90, 88 vvc_pd_8: dd 8 -vvc_pd_m601: dd -601 +vvc_pd_m13: dd -13 vvc_pd_m25: dd -25 -vvc_pd_m1380: dd -1380 -vvc_pd_m2106: dd -2106 +vvc_pd_m31: dd -31 +vvc_pd_m46: dd -46 vvc_pd_m57: dd -57 -vvc_pd_m2751: dd -2751 +vvc_pd_m61: dd -61 vvc_pd_m3344: dd -3344 vvc_pd_1024: dd 1024 vvc_pd_1321: dd 1321 @@ -83,10 +83,10 @@ vvc_pd_1697: dd 1697 vvc_pd_2482: dd 2482 vvc_pd_3072: dd 3072 ; 1024 + 2048 vvc_pd_3803: dd 3803 -vvc_pd_5119: dd 5119 ; 1024 + 4096 - 1 -vvc_pd_5120: dd 5120 ; 1024 + 4096 +vvc_pd_5119: dd 5119 ; 1024 + 4096 - 1 +vvc_pd_5120: dd 5120 ; 1024 + 4096 vvc_pd_5793: dd 5793 -vvc_pd_6144: dd 6144 ; 2048 + 4096 +vvc_pd_6144: dd 6144 ; 2048 + 4096 vvc_pd_17408: dd 17408 ; 1024 + 16384 coeff_min_15: times 2 dw -0x8000 @@ -94,17 +94,17 @@ coeff_max_15: times 2 dw 0x7fff pixel_10_max: times 2 dw 0x03ff pixel_12_max: times 2 dw 0x0fff dconly_10: times 2 dw 0x7c00 -dconly_12: times 2 dw 0x7000 +dconly_12: times 2 dw 0x7000 clip_18b_min: dd -0x20000 clip_18b_max: dd 0x1ffff clip_20b_min: dd -0x80000 clip_20b_max: dd 0x7ffff const idct2_64_mul_16 -dd 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474, 9, 90, 18, 89 -dd -700, 4036, 2359, 3349, -2191, 3461, 897, 3996, -57, -70, -89, -18 -dd 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092, 43, 80, 75, 50 -dd -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889, -25, -87, -50, -75 +dd 91, 2, 65, -62, 83, 37, 84, -33, 9, 90, 18, 89 +dd -15, 90, 52, 73, -48, 77, 20, 88, -57, -70, -89, -18 +dd 90, 11, 71, -56, 79, 44, 87, -24, 43, 80, 75, 50 +dd -7, 90, 59, 69, -41, 81, 28, 86, -25, -87, -50, -75 cextern deint_shuf cextern idct2_64_mul @@ -117,7 +117,7 @@ cextern vvc_pw_64_64 cextern vvc_pw_m64_64 cextern vvc_pw_5 cextern vvc_pw_2048 -cextern
vvc_pw_4096 +cextern vvc_pw_64 cextern vvc_pw_8192 cextern vvc_pw_16384 cextern vvc_pw_64x8 @@ -1007,7 +1007,7 @@ cglobal iidentity_4x8_internal_10, 0, 7, 8, dst, stride, c, eob, tx2 RET ALIGN function_align .pass2_end: - vpbroadcastd m4, [vvc_pw_4096] + vpbroadcastd m4, [vvc_pw_64] packssdw m0, m2 packssdw m1, m3 punpckhwd m2, m0, m1 @@ -1454,10 +1454,10 @@ ALIGN function_align .main2: ; expects: m12 = clip_min m13 = clip_max vpbroadcastd m11, [vvc_pd_2048] - ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 201_995, 4091_3973, 1 - ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 1751_2440, 3703_3290, 1 - ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3035_3513, 2751_2106, 1 - ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 3857_4052, 1380_601, 1 + ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 4_22, 90_88, 1 + ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 38_54, 82_73, 1 + ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 67_78, 61_46, 1 + ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 85_90, 31_13, 1 psubd m8, m0, m4 ; t8a t10a paddd m0, m4 ; t0a t2a psubd m4, m1, m5 ; t9a t11a @@ -2476,7 +2476,7 @@ cglobal iadst_8x8_internal_10, 0, 7, 14, dst, stride, c, eob, tx2 pshufd m5, m1, q1032 call m(iadst_8x8_internal_8).main_pass2 vpbroadcastd m5, [vvc_pw_2048] - vpbroadcastd xm12, [vvc_pw_4096] + vpbroadcastd xm12, [vvc_pw_64] psubw m12, m5 REPX {vpermq x, x, q3120}, m0, m1, m2, m3 pmulhrsw m0, m12 @@ -2537,7 +2537,7 @@ cglobal iflipadst_8x8_internal_10, 0, 7, 14, dst, stride, c, eob, tx2 pshufd m5, m1, q1032 call m(iadst_8x8_internal_8).main_pass2 vpbroadcastd m12, [vvc_pw_2048] - vpbroadcastd xm5, [vvc_pw_4096] + vpbroadcastd xm5, [vvc_pw_64] psubw m12, m5 vpermq m8, m3, q2031 vpermq m9, m2, q2031 @@ -2594,7 +2594,7 @@ cglobal iidentity_8x8_internal_10, 0, 7, 14, dst, stride, c, eob, tx2 packssdw m0, m4 packssdw m1, m5 packssdw m2, m6 - vpbroadcastd m12, [vvc_pw_4096] + vpbroadcastd m12, [vvc_pw_64] punpckhwd m4, m0, m1 punpcklwd m0, m1 punpckhwd m1, m2, m3 @@ -3050,7 +3050,7 @@ cglobal iadst_8x16_internal_10, 0, 7, 16, dst, stride, c, eob, tx2 call 
m(iadst_8x16_internal_8).main call m(iadst_8x16_internal_8).main_pass2_end vpbroadcastd m8, [vvc_pw_2048] - vpbroadcastd xm12, [vvc_pw_4096] + vpbroadcastd xm12, [vvc_pw_64] REPX {vpermq x, x, q2031}, m0, m1, m2, m3 REPX {vpermq x, x, q3120}, m4, m5, m6, m7 psubw m12, m8 @@ -3116,7 +3116,7 @@ cglobal iflipadst_8x16_internal_10, 0, 7, 16, dst, stride, c, eob, tx2 call m(iadst_8x16_internal_8).main call m(iadst_8x16_internal_8).main_pass2_end vpbroadcastd m12, [vvc_pw_2048] - vpbroadcastd xm13, [vvc_pw_4096] + vpbroadcastd xm13, [vvc_pw_64] mova m11, m0 vpermq m0, m7, q2031 mova m10, m1 @@ -4207,10 +4207,10 @@ ALIGN function_align REPX {paddd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 .main_part2: - ITX_MULSUB_2D 1, 0, 8, 9, 10, 12, 201, 4091 - ITX_MULSUB_2D 3, 2, 8, 9, 10, 12, 1751, 3703 - ITX_MULSUB_2D 5, 4, 8, 9, 10, 12, 3035, 2751 - ITX_MULSUB_2D 7, 6, 8, 9, 10, 12, 3857, 1380 + ITX_MULSUB_2D 1, 0, 8, 9, 10, 12, 4, 90 + ITX_MULSUB_2D 3, 2, 8, 9, 10, 12, 38, 82 + ITX_MULSUB_2D 5, 4, 8, 9, 10, 12, 67, 61 + ITX_MULSUB_2D 7, 6, 8, 9, 10, 12, 85, 31 psubd m8, m0, m4 ; t8a paddd m0, m4 ; t0a psubd m4, m1, m5 ; t9a @@ -4297,10 +4297,10 @@ ALIGN function_align psubd m12, m8 ; -out11 (unshifted) ret .main_part1: - ITX_MULSUB_2D 1, 0, 8, 9, 10, 12, 995, 3973 - ITX_MULSUB_2D 3, 2, 8, 9, 10, 12, 2440, 3290 - ITX_MULSUB_2D 5, 4, 8, 9, 10, 12, 3513, 2106 - ITX_MULSUB_2D 7, 6, 8, 9, 10, 12, 4052, 601 + ITX_MULSUB_2D 1, 0, 8, 9, 10, 12, 22, 88 + ITX_MULSUB_2D 3, 2, 8, 9, 10, 12, 54, 73 + ITX_MULSUB_2D 5, 4, 8, 9, 10, 12, 78, 46 + ITX_MULSUB_2D 7, 6, 8, 9, 10, 12, 90, 13 psubd m8, m0, m4 ; t10a paddd m0, m4 ; t2a psubd m4, m1, m5 ; t11a @@ -4444,7 +4444,7 @@ cglobal iidentity_16x8_internal_10, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 jmp tx2q .pass2: call m(idct2_16x8_internal_10).transpose - vpbroadcastd m10, [vvc_pw_4096] + vpbroadcastd m10, [vvc_pw_64] jmp m(idct2_16x8_internal_10).end INV_TXFM_16X8_FN dct2, dct2, 12 @@ -4592,7 
+4592,7 @@ cglobal iidentity_16x8_internal_12, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 jmp m(iidentity_16x8_internal_10).pass1 .pass2: call m(idct2_16x8_internal_10).transpose2 - vpbroadcastd m10, [vvc_pw_4096] + vpbroadcastd m10, [vvc_pw_64] pmulhrsw m0, m10 pmulhrsw m1, m10 pmulhrsw m2, m10 @@ -5817,14 +5817,14 @@ ALIGN function_align REPX {paddd x, m11}, m0, m1, m2, m3 REPX {psrad x, 12 }, m0, m1, m2, m3 .main_oddhalf_part1_fast: ; lower half zero - vpbroadcastd m7, [vvc_pd_4091] - vpbroadcastd m8, [vvc_pd_201] - vpbroadcastd m6, [vvc_pd_m1380] - vpbroadcastd m9, [vvc_pd_3857] - vpbroadcastd m5, [vvc_pd_3703] - vpbroadcastd m10, [vvc_pd_1751] - vpbroadcastd m4, [vvc_pd_m2751] - vpbroadcastd m15, [vvc_pd_3035] + vpbroadcastd m7, [vvc_pd_90] + vpbroadcastd m8, [vvc_pd_4] + vpbroadcastd m6, [vvc_pd_m31] + vpbroadcastd m9, [vvc_pd_85] + vpbroadcastd m5, [vvc_pd_82] + vpbroadcastd m10, [vvc_pd_38] + vpbroadcastd m4, [vvc_pd_m61] + vpbroadcastd m15, [vvc_pd_67] pmulld m7, m0 pmulld m0, m8 pmulld m6, m1 @@ -5838,10 +5838,10 @@ ALIGN function_align REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 .main_oddhalf_part1: ; in1, in7, in9, in15, in17, in23, in25, in31 - ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 201, 4091 ; t16a, t31a - ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3857, 1380 ; t19a, t28a - ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1751, 3703 ; t18a, t29a - ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3035, 2751 ; t17a, t30a + ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 4, 90 ; t16a, t31a + ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 85, 31 ; t19a, t28a + ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 38, 82 ; t18a, t29a + ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 67, 61 ; t17a, t30a .main_oddhalf_part1_fast2: REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3 REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3 @@ -5886,14 +5886,14 @@ ALIGN function_align REPX {paddd x, m11}, m0, m1, m2, m3 REPX {psrad x, 12 }, m0, m1, m2, m3 .main_oddhalf_part2_fast: ; lower half zero - vpbroadcastd m7, 
[vvc_pd_m601] - vpbroadcastd m8, [vvc_pd_4052] - vpbroadcastd m6, [vvc_pd_3973] - vpbroadcastd m9, [vvc_pd_995] - vpbroadcastd m5, [vvc_pd_m2106] - vpbroadcastd m10, [vvc_pd_3513] - vpbroadcastd m4, [vvc_pd_3290] - vpbroadcastd m15, [vvc_pd_2440] + vpbroadcastd m7, [vvc_pd_m13] + vpbroadcastd m8, [vvc_pd_90] + vpbroadcastd m6, [vvc_pd_88] + vpbroadcastd m9, [vvc_pd_22] + vpbroadcastd m5, [vvc_pd_m46] + vpbroadcastd m10, [vvc_pd_78] + vpbroadcastd m4, [vvc_pd_73] + vpbroadcastd m15, [vvc_pd_54] pmulld m7, m0 pmulld m0, m8 pmulld m6, m1 @@ -5907,10 +5907,10 @@ ALIGN function_align REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 .main_oddhalf_part2: ; in3, in5, in11, in13, in19, in21, in27, in29 - ITX_MULSUB_2D 7, 0, 8, 9, 10, _, 4052, 601 ; t23a, t24a - ITX_MULSUB_2D 1, 6, 8, 9, 10, _, 995, 3973 ; t20a, t27a - ITX_MULSUB_2D 5, 2, 8, 9, 10, _, 3513, 2106 ; t21a, t26a - ITX_MULSUB_2D 3, 4, 8, 9, 10, _, 2440, 3290 ; t22a, t25a + ITX_MULSUB_2D 7, 0, 8, 9, 10, _, 90, 13 ; t23a, t24a + ITX_MULSUB_2D 1, 6, 8, 9, 10, _, 22, 88 ; t20a, t27a + ITX_MULSUB_2D 5, 2, 8, 9, 10, _, 78, 46 ; t21a, t26a + ITX_MULSUB_2D 3, 4, 8, 9, 10, _, 54, 73 ; t22a, t25a .main_oddhalf_part2_fast2: REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3 REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3 @@ -6482,7 +6482,7 @@ ALIGN function_align cglobal vvc_inv_identity_identity_32x8_10, 4, 7, 8, dst, stride, c, eob vpbroadcastd m7, [pixel_10_max] .pass1: - vpbroadcastd m5, [vvc_pw_4096] + vpbroadcastd m5, [vvc_pw_64] pxor m6, m6 mov r6d, eobd add eobb, 21 @@ -7149,7 +7149,7 @@ cglobal vvc_inv_identity_identity_32x16_10, 4, 7, 11, dst, stride, c, eob .pass1: vpbroadcastd m8, [vvc_pw_64x8] vpbroadcastd m9, [vvc_pw_1697x16] - vpbroadcastd m10, [vvc_pw_4096] + vpbroadcastd m10, [vvc_pw_64] lea r6, [strideq*5] pxor m6, m6 mov r5, dstq diff --git a/libavcodec/x86/vvc_itx_8bit.asm b/libavcodec/x86/vvc_itx_8bit.asm index 60e727dc14e..2c9fdb54546 100644 
--- a/libavcodec/x86/vvc_itx_8bit.asm +++ b/libavcodec/x86/vvc_itx_8bit.asm @@ -1,6 +1,6 @@ ; Copyright © 2023, Frank Plowman -; Copyright © 2018-2021, VideoLAN and dav1d authors -; Copyright © 2018, Two Orioles, LLC +; Copyright © 48-2021, VideoLAN and dav1d authors +; Copyright © 48, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without @@ -50,7 +50,7 @@ vvc_pw_64_m64: dw 64, -64 const vvc_pw_5, times 2 dw 5 const vvc_pw_2048, times 2 dw 2048 -const vvc_pw_4096, times 2 dw 4096 +const vvc_pw_64, times 2 dw 64 const vvc_pw_8192, times 2 dw 8192 const vvc_pw_16384, times 2 dw 16384 const vvc_pw_1697x16, times 2 dw 1697*16 @@ -65,14 +65,14 @@ const vvc_pw_m64_64, dw -64, 64 const vvc_pw_36_83, dw 36, 83 const vvc_pw_m83_36, dw -83, 36 COEF_PAIR 83, 36 -COEF_PAIR 201, 4091 -COEF_PAIR 995, 3973 -COEF_PAIR 1751, 3703 -COEF_PAIR 2440, 3290 -COEF_PAIR 3035, 2751 -COEF_PAIR 3513, 2106 -COEF_PAIR 3857, 1380 -COEF_PAIR 4052, 601 +COEF_PAIR 4, 90 +COEF_PAIR 22, 88 +COEF_PAIR 38, 82 +COEF_PAIR 54, 73 +COEF_PAIR 67, 61 +COEF_PAIR 78, 46 +COEF_PAIR 85, 31 +COEF_PAIR 90, 13 COEF_PAIR 9, 90 COEF_PAIR 43, 80 COEF_PAIR 70, 57 @@ -96,33 +96,33 @@ COEF_PAIR 89, 18 %endrep %endmacro -vvc_pw_3703x8: COEF_X8 3703 -vvc_pw_1751x8: COEF_X8 1751 -vvc_pw_m1380x8: COEF_X8 -1380 -vvc_pw_3857x8: COEF_X8 3857 -vvc_pw_3973x8: COEF_X8 3973 -vvc_pw_995x8: COEF_X8 995 -vvc_pw_m2106x8: COEF_X8 -2106 -vvc_pw_3513x8: COEF_X8 3513 -vvc_pw_3290x8: COEF_X8 3290 -vvc_pw_2440x8: COEF_X8 2440 -vvc_pw_m601x8: COEF_X8 -601 -vvc_pw_4052x8: COEF_X8 4052 +vvc_pw_82x8: COEF_X8 82 +vvc_pw_38x8: COEF_X8 38 +vvc_pw_n31x8: COEF_X8 -31 +vvc_pw_85x8: COEF_X8 85 +vvc_pw_88x8: COEF_X8 88 +vvc_pw_22x8: COEF_X8 22 +vvc_pw_m46x8: COEF_X8 -2106 +vvc_pw_78x8: COEF_X8 78 +vvc_pw_73x8: COEF_X8 73 +vvc_pw_54x8: COEF_X8 54 +vvc_pw_n13x8: COEF_X8 -13 +vvc_pw_90x8: COEF_X8 90 const idct2_64_mul -COEF_X8 4095, 101, 4065, 501, 2967, -2824, 3229, -2520 -COEF_X8 3745, 1660, 3564, 
2019, 3822, -1474, 3948, -1092 -COEF_X8 3996, 897, 3889, 1285, 3461, -2191, 3659, -1842 -COEF_X8 3349, 2359, 3102, 2675, 4036, -700, 4085, -301 - -vvc_pw_201_4091x8: dw 201*8, 4091*8 -vvc_pw_m601_4052x8: dw -601*8, 4052*8 -vvc_pw_995_3973x8: dw 995*8, 3973*8 -vvc_pw_m1380_3857x8: dw -1380*8, 3857*8 -vvc_pw_1751_3703x8: dw 1751*8, 3703*8 -vvc_pw_m2106_3513x8: dw -2106*8, 3513*8 -vvc_pw_2440_3290x8: dw 2440*8, 3290*8 -vvc_pw_m2751_3035x8: dw -2751*8, 3035*8 +COEF_X8 91, 2, 90, 11, 65, -62, 71, -56 +COEF_X8 83, 37, 79, 44, 84, -33, 87, -24 +COEF_X8 88, 20, 86, 28, 77, -48, 81, -41 +COEF_X8 73, 52, 69, 59, 90, -15, 90, -7 + +vvc_pw_4_90x8: dw 4*8, 90*8 +vvc_pw_n13_90x8: dw -13*8, 90*8 +vvc_pw_22_88x8: dw 22*8, 88*8 +vvc_pw_n31_85x8: dw -31*8, 85*8 +vvc_pw_38_82x8: dw 38*8, 82*8 +vvc_pw_m46_78x8: dw -46*8, 78*8 +vvc_pw_54_73x8: dw 54*8, 73*8 +vvc_pw_m61_67x8: dw -61*8, 67*8 %define o_idct2_64_offset idct2_64_mul - (o_base) - 8 @@ -813,7 +813,7 @@ cglobal iidentity_4x8_internal_8, 0, 5, 7, dst, stride, c, eob, tx2 paddsw m1, m4 jmp tx2q .pass2: - vpbroadcastd m4, [o(vvc_pw_4096)] + vpbroadcastd m4, [o(vvc_pw_64)] jmp m(iadst_4x8_internal_8).end2 %macro INV_TXFM_4X16_FN 2 ; type1, type2 @@ -1022,10 +1022,10 @@ cglobal_label .main2 punpcklwd m0, m3 ; in0 in15 in2 in13 punpckhwd m3, m2, m1 ; in8 in7 in10 in5 punpcklwd m1, m2 ; in4 in11 in6 in9 - ITX_MUL4X_PACK 0, 2, 5, 6, 8, 201, 4091, 995, 3973, 3 - ITX_MUL4X_PACK 1, 2, 5, 6, 8, 1751, 3703, 2440, 3290, 3 - ITX_MUL4X_PACK 3, 2, 5, 6, 8, 3035, 2751, 3513, 2106, 3 - ITX_MUL4X_PACK 4, 2, 5, 6, 8, 3857, 1380, 4052, 601, 3 + ITX_MUL4X_PACK 0, 2, 5, 6, 8, 4, 90, 22, 88, 3 + ITX_MUL4X_PACK 1, 2, 5, 6, 8, 38, 82, 54, 73, 3 + ITX_MUL4X_PACK 3, 2, 5, 6, 8, 67, 61, 78, 46, 3 + ITX_MUL4X_PACK 4, 2, 5, 6, 8, 85, 31, 90, 13, 3 psubsw m2, m0, m3 ; t9a t8a t11a t10a paddsw m0, m3 ; t1a t0a t3a t2a psubsw m3, m1, m4 ; t13a t12a t15a t14a @@ -1430,7 +1430,7 @@ cglobal iadst_8x8_internal_8, 0, 5, 7, dst, stride, c, eob, tx2 pshufd m5, m1,
q1032 call .main_pass2 vpbroadcastd m5, [o(vvc_pw_2048)] - vpbroadcastd xm4, [o(vvc_pw_4096)] + vpbroadcastd xm4, [o(vvc_pw_64)] psubw m4, m5 ; lower half = 2048, upper half = -2048 .end: REPX {vpermq x, x, q3120}, m0, m1, m2, m3 @@ -1497,7 +1497,7 @@ cglobal iflipadst_8x8_internal_8, 0, 5, 7, dst, stride, c, eob, tx2 pshufd m5, m1, q1032 call m(iadst_8x8_internal_8).main_pass2 vpbroadcastd m4, [o(vvc_pw_2048)] - vpbroadcastd xm5, [o(vvc_pw_4096)] + vpbroadcastd xm5, [o(vvc_pw_64)] psubw m4, m5 ; lower half = -2048, upper half = 2048 vpermq m5, m3, q2031 vpermq m3, m0, q2031 @@ -1531,7 +1531,7 @@ cglobal iidentity_8x8_internal_8, 0, 5, 7, dst, stride, c, eob, tx2 punpckhdq m3, m4 jmp tx2q .pass2: - vpbroadcastd m4, [o(vvc_pw_4096)] + vpbroadcastd m4, [o(vvc_pw_64)] jmp m(iadst_8x8_internal_8).end %macro INV_TXFM_8X16_FN 2 ; type1, type2 @@ -1640,7 +1640,7 @@ ALIGN function_align call .main call .main_pass2_end vpbroadcastd m9, [o(vvc_pw_2048)] - vpbroadcastd xm8, [o(vvc_pw_4096)] + vpbroadcastd xm8, [o(vvc_pw_64)] psubw m8, m9 REPX {vpermq x, x, q2031}, m0, m1, m2, m3 REPX {vpermq x, x, q3120}, m4, m5, m6, m7 @@ -1658,14 +1658,14 @@ cglobal_label .main punpcklwd m2, m5 ; in4 in11 punpcklwd m5, m4, m3 ; in8 in7 punpckhwd m3, m4 ; in6 in9 - ITX_MUL2X_PACK 0, 4, 9, 10, 201, 4091, 3 ; t0 t1 - ITX_MUL2X_PACK 1, 4, 9, 10, 995, 3973, 3 ; t2 t3 - ITX_MUL2X_PACK 2, 4, 9, 10, 1751, 3703, 3 ; t4 t5 - ITX_MUL2X_PACK 3, 4, 9, 10, 2440, 3290, 3 ; t6 t7 - ITX_MUL2X_PACK 5, 4, 9, 10, 3035, 2751, 3 ; t8 t9 - ITX_MUL2X_PACK 6, 4, 9, 10, 3513, 2106, 3 ; t10 t11 - ITX_MUL2X_PACK 7, 4, 9, 10, 3857, 1380, 3 ; t12 t13 - ITX_MUL2X_PACK 8, 4, 9, 10, 4052, 601, 3 ; t14 t15 + ITX_MUL2X_PACK 0, 4, 9, 10, 4, 90, 3 ; t0 t1 + ITX_MUL2X_PACK 1, 4, 9, 10, 22, 88, 3 ; t2 t3 + ITX_MUL2X_PACK 2, 4, 9, 10, 38, 82, 3 ; t4 t5 + ITX_MUL2X_PACK 3, 4, 9, 10, 54, 73, 3 ; t6 t7 + ITX_MUL2X_PACK 5, 4, 9, 10, 67, 61, 3 ; t8 t9 + ITX_MUL2X_PACK 6, 4, 9, 10, 78, 46, 3 ; t10 t11 + ITX_MUL2X_PACK 7, 4, 9, 10, 85, 
31, 3 ; t12 t13 + ITX_MUL2X_PACK 8, 4, 9, 10, 90, 13, 3 ; t14 t15 psubsw m4, m0, m5 ; t9a t8a paddsw m0, m5 ; t1a t0a psubsw m5, m1, m6 ; t11a t10a @@ -1792,7 +1792,7 @@ cglobal iflipadst_8x16_internal_8, 0, 5, 13, dst, stride, c, eob, tx2 call m(iadst_8x16_internal_8).main call m(iadst_8x16_internal_8).main_pass2_end vpbroadcastd m8, [o(vvc_pw_2048)] - vpbroadcastd xm9, [o(vvc_pw_4096)] + vpbroadcastd xm9, [o(vvc_pw_64)] psubw m8, m9 vpermq m9, m0, q3120 vpermq m0, m7, q2031 @@ -2455,7 +2455,7 @@ cglobal iidentity_16x8_internal_8, 0, 5, 13, dst, stride, c, eob, tx2 punpckhqdq m7, m8 jmp tx2q .pass2: - vpbroadcastd m8, [o(vvc_pw_4096)] + vpbroadcastd m8, [o(vvc_pw_64)] jmp m(idct2_16x8_internal_8).end %define o_base vvc_pw_5 + 128 @@ -2671,10 +2671,10 @@ cglobal_label .main vpbroadcastd m15, [o(vvc_pd_64)] mova [rsp+gprsize+32*1], m0 mova [rsp+gprsize+32*2], m4 - ITX_MULSUB_2W 13, 2, 0, 4, 15, 995, 3973, 0 ; t3, t2 - ITX_MULSUB_2W 9, 6, 0, 4, 15, 2440, 3290, 0 ; t7, t6 - ITX_MULSUB_2W 5, 10, 0, 4, 15, 3513, 2106, 0 ; t11, t10 - ITX_MULSUB_2W 1, 14, 0, 4, 15, 4052, 601, 0 ; t15, t14 + ITX_MULSUB_2W 13, 2, 0, 4, 15, 22, 88, 0 ; t3, t2 + ITX_MULSUB_2W 9, 6, 0, 4, 15, 54, 73, 0 ; t7, t6 + ITX_MULSUB_2W 5, 10, 0, 4, 15, 78, 46, 0 ; t11, t10 + ITX_MULSUB_2W 1, 14, 0, 4, 15, 90, 13, 0 ; t15, t14 psubsw m0, m2, m10 ; t10a paddsw m2, m10 ; t2a psubsw m10, m13, m5 ; t11a @@ -2701,10 +2701,10 @@ cglobal_label .main mova [rsp+gprsize+32*1], m6 ; t6a mova m6, [rsp+gprsize+32*2] ; in4 mova [rsp+gprsize+32*2], m2 ; t2 - ITX_MULSUB_2W 9, 4, 2, 10, 15, 201, 4091, 0 ; t1, t0 - ITX_MULSUB_2W 11, 6, 2, 10, 15, 1751, 3703, 0 ; t5, t4 - ITX_MULSUB_2W 7, 8, 2, 10, 15, 3035, 2751, 0 ; t9, t8 - ITX_MULSUB_2W 3, 12, 2, 10, 15, 3857, 1380, 0 ; t13, t12 + ITX_MULSUB_2W 9, 4, 2, 10, 15, 4, 90, 0 ; t1, t0 + ITX_MULSUB_2W 11, 6, 2, 10, 15, 38, 82, 0 ; t5, t4 + ITX_MULSUB_2W 7, 8, 2, 10, 15, 67, 61, 0 ; t9, t8 + ITX_MULSUB_2W 3, 12, 2, 10, 15, 85, 31, 0 ; t13, t12 psubsw m10, m4, m8 ; t8a paddsw 
m8, m4 ; t0a psubsw m4, m9, m7 ; t9a @@ -3005,10 +3005,10 @@ ALIGN function_align %endmacro %macro ITX_UNPACK_MULHRSW 7 ; dst1, dst2/src, tmp, coef[1-4] - vpbroadcastd m%3, [r5-vvc_pw_201_4091x8+vvc_pw_%4_%5x8] + vpbroadcastd m%3, [r5-vvc_pw_4_90x8+vvc_pw_%4_%5x8] punpcklwd m%1, m%2, m%2 pmulhrsw m%1, m%3 - vpbroadcastd m%3, [r5-vvc_pw_201_4091x8+vvc_pw_%6_%7x8] + vpbroadcastd m%3, [r5-vvc_pw_4_90x8+vvc_pw_%6_%7x8] punpckhwd m%2, m%2 pmulhrsw m%2, m%3 %endmacro @@ -3162,11 +3162,11 @@ cglobal_label .main_fast ; bottom half is zero mova [rsp+gprsize+1*32], m1 mova m0, [rsp+gprsize+2*32] mova [rsp+gprsize+2*32], m6 - lea r5, [r6-(o_base)+vvc_pw_201_4091x8] - ITX_UNPACK_MULHRSW 1, 8, 6, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a - ITX_UNPACK_MULHRSW 15, 9, 6, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a - ITX_UNPACK_MULHRSW 14, 0, 6, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a - ITX_UNPACK_MULHRSW 13, 11, 6, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a + lea r5, [r6-(o_base)+vvc_pw_4_90x8] + ITX_UNPACK_MULHRSW 1, 8, 6, 4, 90, n13, 90 ; t16a, t31a, t23a, t24a + ITX_UNPACK_MULHRSW 15, 9, 6, 22, 88, n31, 85 ; t20a, t27a, t19a, t28a + ITX_UNPACK_MULHRSW 14, 0, 6, 38, 82, m46, 78 ; t18a, t29a, t21a, t26a + ITX_UNPACK_MULHRSW 13, 11, 6, 54, 73, m61, 67 ; t22a, t25a, t17a, t30a jmp .main2 ALIGN function_align cglobal_label .main @@ -3185,14 +3185,14 @@ cglobal_label .main punpckhwd m0, m13 ; in11 in21 punpcklwd m13, m12, m11 ; in19 in13 punpckhwd m11, m12 ; in15 in17 - ITX_MUL2X_PACK 1, 6, 12, 10, 201, 4091, 3 ; t16a, t31a - ITX_MUL2X_PACK 8, 6, 12, 10, 4052, 601, 3 ; t23a, t24a - ITX_MUL2X_PACK 15, 6, 12, 10, 995, 3973, 3 ; t20a, t27a - ITX_MUL2X_PACK 9, 6, 12, 10, 3857, 1380, 3 ; t19a, t28a - ITX_MUL2X_PACK 14, 6, 12, 10, 1751, 3703, 3 ; t18a, t29a - ITX_MUL2X_PACK 0, 6, 12, 10, 3513, 2106, 3 ; t21a, t26a - ITX_MUL2X_PACK 13, 6, 12, 10, 2440, 3290, 3 ; t22a, t25a - ITX_MUL2X_PACK 11, 6, 12, 10, 3035, 2751, 3 ; t17a, t30a + ITX_MUL2X_PACK 1, 6, 12, 10, 4, 
90, 3 ; t16a, t31a + ITX_MUL2X_PACK 8, 6, 12, 10, 90, 13, 3 ; t23a, t24a + ITX_MUL2X_PACK 15, 6, 12, 10, 22, 88, 3 ; t20a, t27a + ITX_MUL2X_PACK 9, 6, 12, 10, 85, 31, 3 ; t19a, t28a + ITX_MUL2X_PACK 14, 6, 12, 10, 38, 82, 3 ; t18a, t29a + ITX_MUL2X_PACK 0, 6, 12, 10, 78, 46, 3 ; t21a, t26a + ITX_MUL2X_PACK 13, 6, 12, 10, 54, 73, 3 ; t22a, t25a + ITX_MUL2X_PACK 11, 6, 12, 10, 67, 61, 3 ; t17a, t30a .main2: psubsw m6, m1, m11 ; t17 t30 paddsw m1, m11 ; t16 t31 @@ -3507,7 +3507,7 @@ ALIGN function_align cglobal vvc_inv_identity_identity_32x8_8, 4, 6, 10, dst, stride, c, eob add cq, 16*8 - vpbroadcastd m9, [vvc_pw_4096] + vpbroadcastd m9, [vvc_pw_64] lea r4, [strideq*3] lea r5, [dstq+strideq*4] sub eobd, 107 @@ -3738,24 +3738,24 @@ cglobal_label .main_oddhalf_fast ; lower half is zero pxor m7, m7 mova [rsp+gprsize+32*0], m7 mova [rsp+gprsize+32*2], m7 - vpbroadcastd m11, [o(vvc_pw_3703x8)] - vpbroadcastd m7, [o(vvc_pw_1751x8)] - vpbroadcastd m12, [o(vvc_pw_m1380x8)] - vpbroadcastd m8, [o(vvc_pw_3857x8)] - vpbroadcastd m13, [o(vvc_pw_3973x8)] - vpbroadcastd m15, [o(vvc_pw_995x8)] + vpbroadcastd m11, [o(vvc_pw_82x8)] + vpbroadcastd m7, [o(vvc_pw_38x8)] + vpbroadcastd m12, [o(vvc_pw_n31x8)] + vpbroadcastd m8, [o(vvc_pw_85x8)] + vpbroadcastd m13, [o(vvc_pw_88x8)] + vpbroadcastd m15, [o(vvc_pw_22x8)] pmulhrsw m11, m4 ; t29a pmulhrsw m4, m7 ; t18a pmulhrsw m12, m3 ; t19a pmulhrsw m3, m8 ; t28a pmulhrsw m13, m2 ; t27a pmulhrsw m2, m15 ; t20a - vpbroadcastd m10, [o(vvc_pw_m2106x8)] - vpbroadcastd m7, [o(vvc_pw_3513x8)] - vpbroadcastd m9, [o(vvc_pw_3290x8)] - vpbroadcastd m8, [o(vvc_pw_2440x8)] - vpbroadcastd m14, [o(vvc_pw_m601x8)] - vpbroadcastd m15, [o(vvc_pw_4052x8)] + vpbroadcastd m10, [o(vvc_pw_m46x8)] + vpbroadcastd m7, [o(vvc_pw_78x8)] + vpbroadcastd m9, [o(vvc_pw_73x8)] + vpbroadcastd m8, [o(vvc_pw_54x8)] + vpbroadcastd m14, [o(vvc_pw_n13x8)] + vpbroadcastd m15, [o(vvc_pw_90x8)] pmulhrsw m10, m5 ; t21a pmulhrsw m5, m7 ; t26a pmulhrsw m9, m6 ; t25a @@ -3770,12 +3770,12 
@@ cglobal_label .main_oddhalf mova [rsp+gprsize+32*1], m7 mova [rsp+gprsize+32*2], m8 vpbroadcastd m15, [o(vvc_pd_64)] - ITX_MULSUB_2W 4, 11, 7, 8, 15, 1751, 3703, 0 ; t18a, t29a - ITX_MULSUB_2W 12, 3, 7, 8, 15, 3857, 1380, 0 ; t19a, t28a - ITX_MULSUB_2W 2, 13, 7, 8, 15, 995, 3973, 0 ; t20a, t27a - ITX_MULSUB_2W 10, 5, 7, 8, 15, 3513, 2106, 0 ; t21a, t26a - ITX_MULSUB_2W 6, 9, 7, 8, 15, 2440, 3290, 0 ; t22a, t25a - ITX_MULSUB_2W 14, 1, 7, 8, 15, 4052, 601, 0 ; t23a, t24a + ITX_MULSUB_2W 4, 11, 7, 8, 15, 38, 82, 0 ; t18a, t29a + ITX_MULSUB_2W 12, 3, 7, 8, 15, 85, 31, 0 ; t19a, t28a + ITX_MULSUB_2W 2, 13, 7, 8, 15, 22, 88, 0 ; t20a, t27a + ITX_MULSUB_2W 10, 5, 7, 8, 15, 78, 46, 0 ; t21a, t26a + ITX_MULSUB_2W 6, 9, 7, 8, 15, 54, 73, 0 ; t22a, t25a + ITX_MULSUB_2W 14, 1, 7, 8, 15, 90, 13, 0 ; t23a, t24a .main2: psubsw m7, m12, m4 ; t18 paddsw m12, m4 ; t19 @@ -3808,8 +3808,8 @@ cglobal_label .main_oddhalf mova [rsp+gprsize+32*1], m14 ; t23a mova m14, [rsp+gprsize+32*2] ; in17 mova [rsp+gprsize+32*2], m1 ; t24a - ITX_MULSUB_2W 0, 4, 1, 11, 15, 201, 4091, 0 ; t16a, t31a - ITX_MULSUB_2W 14, 6, 1, 11, 15, 3035, 2751, 0 ; t17a, t30a + ITX_MULSUB_2W 0, 4, 1, 11, 15, 4, 90, 0 ; t16a, t31a + ITX_MULSUB_2W 14, 6, 1, 11, 15, 67, 61, 0 ; t17a, t30a psubsw m1, m0, m14 ; t17 paddsw m0, m14 ; t16 psubsw m14, m4, m6 ; t30