diff --git a/libavcodec/vvc/vvc_ctu.c b/libavcodec/vvc/vvc_ctu.c index a212d3a44a1..5aaa4f2fdae 100644 --- a/libavcodec/vvc/vvc_ctu.c +++ b/libavcodec/vvc/vvc_ctu.c @@ -268,6 +268,8 @@ static TransformBlock* add_tb(TransformUnit *tu, VVCLocalContext *lc, tb->ts = 0; tb->coeffs = lc->coeffs; lc->coeffs += tb_width * tb_height; + tb->pixels = lc->pixels; + lc->pixels += tb_width * tb_height; return tb; } @@ -2382,6 +2384,7 @@ int ff_vvc_coding_tree_unit(VVCLocalContext *lc, } lc->coeffs = fc->tab.coeffs + rs * ctb_size * VVC_MAX_SAMPLE_ARRAYS; + lc->pixels = fc->tab.pixels + rs * ctb_size * VVC_MAX_SAMPLE_ARRAYS; lc->cu = NULL; ff_vvc_cabac_init(lc, ctu_idx, rx, ry); @@ -2474,4 +2477,4 @@ void ff_vvc_ep_init_stat_coeff(EntryPoint *ep, ep->stat_coeff[i] = persistent_rice_adaptation_enabled_flag ? 2 * (av_log2(bit_depth - 10)) : 0; } -} \ No newline at end of file +} diff --git a/libavcodec/vvc/vvc_ctu.h b/libavcodec/vvc/vvc_ctu.h index 50f081a4c47..f8d2fa454e5 100644 --- a/libavcodec/vvc/vvc_ctu.h +++ b/libavcodec/vvc/vvc_ctu.h @@ -96,6 +96,7 @@ typedef struct TransformBlock { int bd_offset; int *coeffs; + int16_t *pixels; } TransformBlock; typedef enum VVCTreeType { @@ -369,6 +370,7 @@ struct VVCLocalContext { VVCFrameContext *fc; EntryPoint *ep; int *coeffs; + int16_t *pixels; } ; typedef struct VVCAllowedSplit { diff --git a/libavcodec/vvc/vvc_intra.c b/libavcodec/vvc/vvc_intra.c index 7af2af439b3..52302adb12a 100644 --- a/libavcodec/vvc/vvc_intra.c +++ b/libavcodec/vvc/vvc_intra.c @@ -176,11 +176,11 @@ static void add_residual_for_joint_coding_chroma(VVCLocalContext *lc, uint8_t *dst = &fc->frame->data[c_idx][(tb->y0 >> vs) * stride + ((tb->x0 >> hs) << fc->ps.sps->pixel_shift)]; if (chroma_scale) { - fc->vvcdsp.itx.pred_residual_joint(tb->coeffs, tb->tb_width, tb->tb_height, c_sign, shift); - fc->vvcdsp.intra.lmcs_scale_chroma(lc, tb->coeffs, tb->coeffs, tb->tb_width, tb->tb_height, cu->x0, cu->y0); - fc->vvcdsp.itx.add_residual(dst, tb->coeffs, tb->tb_width, 
tb->tb_height, stride); + fc->vvcdsp.itx.pred_residual_joint(tb->pixels, tb->tb_width, tb->tb_height, c_sign, shift); + fc->vvcdsp.intra.lmcs_scale_chroma(lc, tb->pixels, tb->pixels, tb->tb_width, tb->tb_height, cu->x0, cu->y0); + fc->vvcdsp.itx.add_residual(dst, tb->pixels, tb->tb_width, tb->tb_height, stride); } else { - fc->vvcdsp.itx.add_residual_joint(dst, tb->coeffs, tb->tb_width, tb->tb_height, stride, c_sign, shift); + fc->vvcdsp.itx.add_residual_joint(dst, tb->pixels, tb->tb_width, tb->tb_height, stride, c_sign, shift); } } @@ -272,32 +272,6 @@ static void predict_intra(VVCLocalContext *lc, const TransformUnit *tu, const in } } -static void scale_clip(int *coeff, const int nzw, const int w, const int h, - const int shift, const int log2_transform_range) -{ - const int add = 1 << (shift - 1); - for (int y = 0; y < h; y++) { - int *p = coeff + y * w; - for (int x = 0; x < nzw; x++) { - *p = av_clip_intp2((*p + add) >> shift, log2_transform_range); - p++; - } - memset(p, 0, sizeof(*p) * (w - nzw)); - } -} - -static void scale(int *out, const int *in, const int w, const int h, const int shift) -{ - const int add = 1 << (shift - 1); - for (int y = 0; y < h; y++) { - for (int x = 0; x < w; x++) { - int *o = out + y * w + x; - const int *i = in + y * w + x; - *o = (*i + add) >> shift; - } - } -} - // part of 8.7.3 Scaling process for transform coefficients static void derive_qp(const VVCLocalContext *lc, const TransformUnit *tu, TransformBlock *tb) { @@ -441,35 +415,6 @@ static void dequant(const VVCLocalContext *lc, const TransformUnit *tu, Transfor } } -static void itx_2d(const VVCFrameContext *fc, TransformBlock *tb, const enum TxType trh, const enum TxType trv, int *temp) -{ - const VVCSPS *sps = fc->ps.sps; - const int w = tb->tb_width; - const int h = tb->tb_height; - const int nzw = tb->max_scan_x + 1; - - for (int x = 0; x < nzw; x++) - fc->vvcdsp.itx.itx[trv][tb->log2_tb_height - 1](temp + x, w, tb->coeffs + x, w); - scale_clip(temp, nzw, w, h, 7, 
sps->log2_transform_range); - - for (int y = 0; y < h; y++) - fc->vvcdsp.itx.itx[trh][tb->log2_tb_width - 1](tb->coeffs + y * w, 1, temp + y * w, 1); - scale(tb->coeffs, tb->coeffs, w, h, 5 + sps->log2_transform_range - sps->bit_depth); -} - -static void itx_1d(const VVCFrameContext *fc, TransformBlock *tb, const enum TxType trh, const enum TxType trv, int *temp) -{ - const VVCSPS *sps = fc->ps.sps; - const int w = tb->tb_width; - const int h = tb->tb_height; - - if (w > 1) - fc->vvcdsp.itx.itx[trh][tb->log2_tb_width - 1](temp, 1, tb->coeffs, 1); - else - fc->vvcdsp.itx.itx[trv][tb->log2_tb_height - 1](temp, 1, tb->coeffs, 1); - scale(tb->coeffs, temp, w, h, 6 + sps->log2_transform_range - sps->bit_depth); -} - static void transform_bdpcm(TransformBlock *tb, const VVCLocalContext *lc, const CodingUnit *cu) { const VVCSPS *sps = lc->fc->ps.sps; @@ -490,7 +435,7 @@ static void itransform(VVCLocalContext *lc, TransformUnit *tu, const int tu_idx, const VVCSH *sh = &lc->sc->sh; const CodingUnit *cu = lc->cu; const int ps = fc->ps.sps->pixel_shift; - DECLARE_ALIGNED(32, int, temp)[MAX_TB_SIZE * MAX_TB_SIZE]; + DECLARE_ALIGNED(32, int16_t, temp)[MAX_TB_SIZE * MAX_TB_SIZE]; for (int i = 0; i < tu->nb_tbs; i++) { TransformBlock *tb = &tu->tbs[i]; @@ -511,19 +456,26 @@ static void itransform(VVCLocalContext *lc, TransformUnit *tu, const int tu_idx, dequant(lc, tu, tb); if (!tb->ts) { enum TxType trh, trv; + int nzw; if (cu->apply_lfnst_flag[c_idx]) ilfnst_transform(lc, tb); derive_transform_type(fc, lc, tb, &trh, &trv); - if (w > 1 && h > 1) - itx_2d(fc, tb, trh, trv, temp); - else - itx_1d(fc, tb, trh, trv, temp); + + nzw = tb->max_scan_x + 1; + fc->vvcdsp.itx.itx[trh][trv][tb->log2_tb_width][tb->log2_tb_height]( + tb->pixels, tb->coeffs, nzw, sps->log2_transform_range); + } else { + for (int x = 0; x < w; ++x) { + for (int y = 0; y < h; ++y) { + tb->pixels[x * h + y] = tb->coeffs[y * w + x]; + } + } } if (chroma_scale) - fc->vvcdsp.intra.lmcs_scale_chroma(lc, temp, 
tb->coeffs, w, h, cu->x0, cu->y0); - fc->vvcdsp.itx.add_residual(dst, chroma_scale ? temp : tb->coeffs, w, h, stride); + fc->vvcdsp.intra.lmcs_scale_chroma(lc, temp, tb->pixels, w, h, cu->x0, cu->y0); + fc->vvcdsp.itx.add_residual(dst, chroma_scale ? temp : tb->pixels, w, h, stride); if (tu->joint_cbcr_residual_flag && tb->c_idx) add_residual_for_joint_coding_chroma(lc, tu, tb, chroma_scale); diff --git a/libavcodec/vvc/vvc_intra_template.c b/libavcodec/vvc/vvc_intra_template.c index 81987a579ec..29847247295 100644 --- a/libavcodec/vvc/vvc_intra_template.c +++ b/libavcodec/vvc/vvc_intra_template.c @@ -429,7 +429,7 @@ static int FUNC(lmcs_derive_chroma_scale)(VVCLocalContext *lc, const int x0, con } // 8.7.5.3 Picture reconstruction with luma dependent chroma residual scaling process for chroma samples -static void FUNC(lmcs_scale_chroma)(VVCLocalContext *lc, int *dst, const int *coeff, +static void FUNC(lmcs_scale_chroma)(VVCLocalContext *lc, int16_t *dst, const int16_t *coeff, const int width, const int height, const int x0_cu, const int y0_cu) { const int chroma_scale = FUNC(lmcs_derive_chroma_scale)(lc, x0_cu, y0_cu); diff --git a/libavcodec/vvc/vvcdec.c b/libavcodec/vvc/vvcdec.c index 1eb2b7724ad..545ad9722e5 100644 --- a/libavcodec/vvc/vvcdec.c +++ b/libavcodec/vvc/vvcdec.c @@ -98,6 +98,9 @@ static int ctb_arrays_init(VVCFrameContext *fc, const int ctu_count, const int c fc->tab.coeffs = av_malloc(ctu_count * sizeof(*fc->tab.coeffs) * ctu_size * VVC_MAX_SAMPLE_ARRAYS); if (!fc->tab.coeffs) return AVERROR(ENOMEM); + fc->tab.pixels = av_malloc(ctu_count * sizeof(*fc->tab.pixels) * ctu_size * VVC_MAX_SAMPLE_ARRAYS); + if (!fc->tab.pixels) + return AVERROR(ENOMEM); fc->rpl_tab_pool = av_buffer_pool_init(ctu_count * sizeof(RefPicListTab), av_buffer_allocz); if (!fc->rpl_tab_pool) return AVERROR(ENOMEM); diff --git a/libavcodec/vvc/vvcdec.h b/libavcodec/vvc/vvcdec.h index 255f374d905..ea8253b8fbb 100644 --- a/libavcodec/vvc/vvcdec.h +++ b/libavcodec/vvc/vvcdec.h @@ 
-252,6 +252,7 @@ struct VVCFrameContext { uint8_t *alf_pixel_buffer_v[VVC_MAX_SAMPLE_ARRAYS][2]; int *coeffs; + int16_t *pixels; CTU *ctus; //used in arrays_init only diff --git a/libavcodec/vvc/vvcdsp.c b/libavcodec/vvc/vvcdsp.c index 1056cb8ff9f..aea052bd3e8 100644 --- a/libavcodec/vvc/vvcdsp.c +++ b/libavcodec/vvc/vvcdsp.c @@ -254,23 +254,31 @@ static int vvc_sad(const int16_t *src0, const int16_t *src1, int dx, int dy, return sad; } -#define itx_fn(type, s) \ -static void itx_##type##_##s(int *out, ptrdiff_t out_step, const int *in, ptrdiff_t in_step) \ -{ \ - ff_vvc_inv_##type##_##s(out, out_step, in, in_step); \ +static void scale_clip(int *coeff, const int nzw, const int w, const int h, + const int shift, const int log2_transform_range) +{ + const int add = 1 << (shift - 1); + for (int y = 0; y < h; y++) { + int *p = coeff + y * w; + for (int x = 0; x < nzw; x++) { + *p = av_clip_intp2((*p + add) >> shift, log2_transform_range); + p++; + } + memset(p, 0, sizeof(*p) * (w - nzw)); + } } -#define itx_fn_common(type) \ - itx_fn(type, 4); \ - itx_fn(type, 8); \ - itx_fn(type, 16); \ - itx_fn(type, 32); \ - -itx_fn_common(dct2); -itx_fn_common(dst7); -itx_fn_common(dct8); -itx_fn(dct2, 2); -itx_fn(dct2, 64); +static void scale(int16_t *out, const int *in, const int w, const int h, const int shift) +{ + const int add = 1 << (shift - 1); + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + int16_t *o = out + y * w + x; + const int *i = in + y * w + x; + *o = (*i + add) >> shift; + } + } +} typedef struct IntraEdgeParams { uint8_t* top; diff --git a/libavcodec/vvc/vvcdsp.h b/libavcodec/vvc/vvcdsp.h index af5133eca8d..a5e45fc7638 100644 --- a/libavcodec/vvc/vvcdsp.h +++ b/libavcodec/vvc/vvcdsp.h @@ -35,6 +35,7 @@ enum TxType { }; enum TxSize { + TX_SIZE_1 = 0, TX_SIZE_2, TX_SIZE_4, TX_SIZE_8, @@ -101,7 +102,7 @@ struct VVCLocalContext; typedef struct VVCIntraDSPContext { void (*intra_cclm_pred)(const struct VVCLocalContext *lc, int x0, int y0, int w, int 
h); - void (*lmcs_scale_chroma)(struct VVCLocalContext *lc, int *dst, const int *coeff, int w, int h, int x0_cu, int y0_cu); + void (*lmcs_scale_chroma)(struct VVCLocalContext *lc, int16_t *dst, const int16_t *coeff, int w, int h, int x0_cu, int y0_cu); void (*intra_pred)(const struct VVCLocalContext *lc, int x0, int y0, int w, int h, int c_idx); void (*pred_planar)(uint8_t *src, const uint8_t *top, const uint8_t *left, int w, int h, ptrdiff_t stride); void (*pred_mip)(uint8_t *src, const uint8_t *top, const uint8_t *left, int w, int h, ptrdiff_t stride, @@ -116,11 +117,11 @@ typedef struct VVCIntraDSPContext { } VVCIntraDSPContext; typedef struct VVCItxDSPContext { - void (*add_residual)(uint8_t *dst, const int *res, int width, int height, ptrdiff_t stride); - void (*add_residual_joint)(uint8_t *dst, const int *res, int width, int height, ptrdiff_t stride, int c_sign, int shift); - void (*pred_residual_joint)(int *buf, int width, int height, int c_sign, int shift); + void (*add_residual)(uint8_t *dst, const int16_t *res, int width, int height, ptrdiff_t stride); + void (*add_residual_joint)(uint8_t *dst, const int16_t *res, int width, int height, ptrdiff_t stride, int c_sign, int shift); + void (*pred_residual_joint)(int16_t *buf, int width, int height, int c_sign, int shift); - void (*itx[N_TX_TYPE][N_TX_SIZE])(int *out, ptrdiff_t out_step, const int *in, ptrdiff_t in_step); + void (*itx[N_TX_TYPE][N_TX_TYPE][N_TX_SIZE][N_TX_SIZE])(int16_t *dst, const int *coeff, int nzw, int log2_transform_range); void (*transform_bdpcm)(int *coeffs, int width, int height, int vertical, int log2_transform_range); } VVCItxDSPContext; diff --git a/libavcodec/vvc/vvcdsp_template.c b/libavcodec/vvc/vvcdsp_template.c index d3998e633f6..e8d72dfb396 100644 --- a/libavcodec/vvc/vvcdsp_template.c +++ b/libavcodec/vvc/vvcdsp_template.c @@ -23,12 +23,13 @@ #include "libavcodec/bit_depth_template.c" #include "vvcdec.h" +#include "vvc_itx_1d.h" #include "vvc_inter_template.c" #include 
"vvc_intra_template.c" #include "vvc_filter_template.c" -static void FUNC(add_residual)(uint8_t *_dst, const int *res, +static void FUNC(add_residual)(uint8_t *_dst, const int16_t *res, const int w, const int h, const ptrdiff_t _stride) { pixel *dst = (pixel *)_dst; @@ -37,14 +38,13 @@ static void FUNC(add_residual)(uint8_t *_dst, const int *res, for (int y = 0; y < h; y++) { for (int x = 0; x < w; x++) { - dst[x] = av_clip_pixel(dst[x] + *res); - res++; + dst[x] = av_clip_pixel(dst[x] + res[x * h + y]); } dst += stride; } } -static void FUNC(add_residual_joint)(uint8_t *_dst, const int *res, +static void FUNC(add_residual_joint)(uint8_t *_dst, const int16_t *res, const int w, const int h, const ptrdiff_t _stride, const int c_sign, const int shift) { pixel *dst = (pixel *)_dst; @@ -53,15 +53,14 @@ static void FUNC(add_residual_joint)(uint8_t *_dst, const int *res, for (int y = 0; y < h; y++) { for (int x = 0; x < w; x++) { - const int r = ((*res) * c_sign) >> shift; + const int r = (res[x * h + y] * c_sign) >> shift; dst[x] = av_clip_pixel(dst[x] + r); - res++; } dst += stride; } } -static void FUNC(pred_residual_joint)(int *buf, const int w, const int h, +static void FUNC(pred_residual_joint)(int16_t *buf, const int w, const int h, const int c_sign, const int shift) { for (int y = 0; y < h; y++) { @@ -93,26 +92,164 @@ static void FUNC(transform_bdpcm)(int *coeffs, const int width, const int height } } -static void FUNC(ff_vvc_itx_dsp_init)(VVCItxDSPContext *const itx) -{ -#define VVC_ITX(TYPE, type, s) \ - itx->itx[TYPE][TX_SIZE_##s] = itx_##type##_##s; \ +#define ITX_COMMON_SIZES(TYPE_H, type_h, TYPE_V, type_v) \ + ITX_1D_V(TYPE_H, type_h, TYPE_V, type_v, 1, 4); \ + ITX_1D_V(TYPE_H, type_h, TYPE_V, type_v, 1, 8); \ + ITX_1D_V(TYPE_H, type_h, TYPE_V, type_v, 1, 16); \ + ITX_1D_V(TYPE_H, type_h, TYPE_V, type_v, 1, 32); \ + ITX_1D_H(TYPE_H, type_h, TYPE_V, type_v, 4, 1); \ + ITX_2D(TYPE_H, type_h, TYPE_V, type_v, 4, 4); \ + ITX_2D(TYPE_H, type_h, TYPE_V, type_v, 4, 
8); \ + ITX_2D(TYPE_H, type_h, TYPE_V, type_v, 4, 16); \ + ITX_2D(TYPE_H, type_h, TYPE_V, type_v, 4, 32); \ + ITX_1D_H(TYPE_H, type_h, TYPE_V, type_v, 8, 1); \ + ITX_2D(TYPE_H, type_h, TYPE_V, type_v, 8, 4); \ + ITX_2D(TYPE_H, type_h, TYPE_V, type_v, 8, 8); \ + ITX_2D(TYPE_H, type_h, TYPE_V, type_v, 8, 16); \ + ITX_2D(TYPE_H, type_h, TYPE_V, type_v, 8, 32); \ + ITX_1D_H(TYPE_H, type_h, TYPE_V, type_v, 16, 1); \ + ITX_2D(TYPE_H, type_h, TYPE_V, type_v, 16, 4); \ + ITX_2D(TYPE_H, type_h, TYPE_V, type_v, 16, 8); \ + ITX_2D(TYPE_H, type_h, TYPE_V, type_v, 16, 16); \ + ITX_2D(TYPE_H, type_h, TYPE_V, type_v, 16, 32); \ + ITX_1D_H(TYPE_H, type_h, TYPE_V, type_v, 32, 1); \ + ITX_2D(TYPE_H, type_h, TYPE_V, type_v, 32, 4); \ + ITX_2D(TYPE_H, type_h, TYPE_V, type_v, 32, 8); \ + ITX_2D(TYPE_H, type_h, TYPE_V, type_v, 32, 16); \ + ITX_2D(TYPE_H, type_h, TYPE_V, type_v, 32, 32); -#define VVC_ITX_COMMON(TYPE, type) \ - VVC_ITX(TYPE, type, 4); \ - VVC_ITX(TYPE, type, 8); \ - VVC_ITX(TYPE, type, 16); \ - VVC_ITX(TYPE, type, 32); +#define ITX \ + ITX_COMMON_SIZES(DCT2, dct2, DCT2, dct2); \ + ITX_COMMON_SIZES(DCT2, dct2, DST7, dst7); \ + ITX_COMMON_SIZES(DCT2, dct2, DCT8, dct8); \ + ITX_COMMON_SIZES(DST7, dst7, DCT2, dct2); \ + ITX_COMMON_SIZES(DST7, dst7, DST7, dst7); \ + ITX_COMMON_SIZES(DST7, dst7, DCT8, dct8); \ + ITX_COMMON_SIZES(DCT8, dct8, DCT2, dct2); \ + ITX_COMMON_SIZES(DCT8, dct8, DST7, dst7); \ + ITX_COMMON_SIZES(DCT8, dct8, DCT8, dct8); \ + ITX_1D_V(DCT2, dct2, DCT2, dct2, 1, 2); \ + ITX_1D_V(DCT2, dct2, DCT2, dct2, 1, 64); \ + ITX_1D_H(DCT2, dct2, DCT2, dct2, 2, 1); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 2, 2); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 2, 4); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 2, 8); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 2, 16); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 2, 32); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 2, 64); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 4, 2); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 4, 64); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 8, 2); \ + ITX_2D(DCT2, dct2, DCT2, 
dct2, 8, 64); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 16, 2); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 16, 64); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 32, 2); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 32, 64); \ + ITX_1D_H(DCT2, dct2, DCT2, dct2, 64, 1); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 64, 2); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 64, 4); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 64, 8); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 64, 16); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 64, 32); \ + ITX_2D(DCT2, dct2, DCT2, dct2, 64, 64); \ + ITX_1D_H(DCT2, dct2, DST7, dst7, 2, 1); \ + ITX_2D(DCT2, dct2, DST7, dst7, 2, 4); \ + ITX_2D(DCT2, dct2, DST7, dst7, 2, 8); \ + ITX_2D(DCT2, dct2, DST7, dst7, 2, 16); \ + ITX_2D(DCT2, dct2, DST7, dst7, 2, 32); \ + ITX_1D_H(DCT2, dct2, DST7, dst7, 64, 1); \ + ITX_2D(DCT2, dct2, DST7, dst7, 64, 4); \ + ITX_2D(DCT2, dct2, DST7, dst7, 64, 8); \ + ITX_2D(DCT2, dct2, DST7, dst7, 64, 16); \ + ITX_2D(DCT2, dct2, DST7, dst7, 64, 32); \ + ITX_1D_H(DCT2, dct2, DCT8, dct8, 2, 1); \ + ITX_2D(DCT2, dct2, DCT8, dct8, 2, 4); \ + ITX_2D(DCT2, dct2, DCT8, dct8, 2, 8); \ + ITX_2D(DCT2, dct2, DCT8, dct8, 2, 16); \ + ITX_2D(DCT2, dct2, DCT8, dct8, 2, 32); \ + ITX_1D_H(DCT2, dct2, DCT8, dct8, 64, 1); \ + ITX_2D(DCT2, dct2, DCT8, dct8, 64, 4); \ + ITX_2D(DCT2, dct2, DCT8, dct8, 64, 8); \ + ITX_2D(DCT2, dct2, DCT8, dct8, 64, 16); \ + ITX_2D(DCT2, dct2, DCT8, dct8, 64, 32); \ + ITX_1D_V(DST7, dst7, DCT2, dct2, 1, 2); \ + ITX_2D(DST7, dst7, DCT2, dct2, 4, 2); \ + ITX_2D(DST7, dst7, DCT2, dct2, 8, 2); \ + ITX_2D(DST7, dst7, DCT2, dct2, 16, 2); \ + ITX_2D(DST7, dst7, DCT2, dct2, 32, 2); \ + ITX_1D_V(DST7, dst7, DCT2, dct2, 1, 64); \ + ITX_2D(DST7, dst7, DCT2, dct2, 4, 64); \ + ITX_2D(DST7, dst7, DCT2, dct2, 8, 64); \ + ITX_2D(DST7, dst7, DCT2, dct2, 16, 64); \ + ITX_2D(DST7, dst7, DCT2, dct2, 32, 64); \ + ITX_1D_V(DCT8, dct8, DCT2, dct2, 1, 2); \ + ITX_2D(DCT8, dct8, DCT2, dct2, 4, 2); \ + ITX_2D(DCT8, dct8, DCT2, dct2, 8, 2); \ + ITX_2D(DCT8, dct8, DCT2, dct2, 16, 2); \ + ITX_2D(DCT8, dct8, DCT2, dct2, 
32, 2); \ + ITX_1D_V(DCT8, dct8, DCT2, dct2, 1, 64); \ + ITX_2D(DCT8, dct8, DCT2, dct2, 4, 64); \ + ITX_2D(DCT8, dct8, DCT2, dct2, 8, 64); \ + ITX_2D(DCT8, dct8, DCT2, dct2, 16, 64); \ + ITX_2D(DCT8, dct8, DCT2, dct2, 32, 64); + +// ITX function prototypes +#undef ITX_2D +#define ITX_2D(TYPE_H, type_h, TYPE_V, type_v, width, height) \ +static void FUNC(inv_##type_h##_##type_v##_##width##x##height)(int16_t *dst, \ + const int *coeff, int nzw, int log2_transform_range) \ +{ \ + DECLARE_ALIGNED(32, int, temp)[width * height]; \ + DECLARE_ALIGNED(32, int, temp2)[width * height]; \ + \ + for (int x = 0; x < nzw; x++) \ + ff_vvc_inv_##type_v##_##height(temp + x, width, coeff + x, width); \ + \ + scale_clip(temp, nzw, width, height, 7, log2_transform_range); \ + \ + for (int y = 0; y < height; y++) \ + ff_vvc_inv_##type_h##_##width(temp2 + y, height, temp + y * width, 1); \ + \ + scale(dst, temp2, width, height, 5 + log2_transform_range - BIT_DEPTH); \ +} +#undef ITX_1D_H +#define ITX_1D_H(TYPE_H, type_h, TYPE_V, type_v, width, height) \ +static void FUNC(inv_##type_h##_##type_v##_##width##x##height)(int16_t *dst, \ + const int *coeff, int nzw, int log2_transform_range) \ +{ \ + DECLARE_ALIGNED(32, int, temp)[width * height]; \ + \ + ff_vvc_inv_##type_h##_##width(temp, 1, coeff, 1); \ + scale(dst, temp, width, height, 6 + log2_transform_range - BIT_DEPTH); \ +} +#undef ITX_1D_V +#define ITX_1D_V(TYPE_H, type_h, TYPE_V, type_v, width, height) \ +static void FUNC(inv_##type_h##_##type_v##_##width##x##height)(int16_t *dst, \ + const int *coeff, int nzw, int log2_transform_range) \ +{ \ + DECLARE_ALIGNED(32, int, temp)[width * height]; \ + \ + ff_vvc_inv_##type_v##_##height(temp, 1, coeff, 1); \ + scale(dst, temp, width, height, 6 + log2_transform_range - BIT_DEPTH); \ +} +ITX +static void FUNC(ff_vvc_itx_dsp_init)(VVCItxDSPContext *const itx) +{ itx->add_residual = FUNC(add_residual); itx->add_residual_joint = FUNC(add_residual_joint); itx->pred_residual_joint = 
FUNC(pred_residual_joint); itx->transform_bdpcm = FUNC(transform_bdpcm); - VVC_ITX(DCT2, dct2, 2) - VVC_ITX(DCT2, dct2, 64) - VVC_ITX_COMMON(DCT2, dct2) - VVC_ITX_COMMON(DCT8, dct8) - VVC_ITX_COMMON(DST7, dst7) +#undef ITX_2D +#define ITX_2D(TYPE_H, type_h, TYPE_V, type_v, width, height) \ + itx->itx[TYPE_H][TYPE_V][TX_SIZE_##width][TX_SIZE_##height] = FUNC(inv_##type_h##_##type_v##_##width##x##height); +#undef ITX_1D_H +#define ITX_1D_H(TYPE_H, type_h, TYPE_V, type_v, width, height) \ + ITX_2D(TYPE_H, type_h, TYPE_V, type_v, width, height) +#undef ITX_1D_V +#define ITX_1D_V(TYPE_H, type_h, TYPE_V, type_v, width, height) \ + ITX_2D(TYPE_H, type_h, TYPE_V, type_v, width, height) + ITX #undef VVC_ITX #undef VVC_ITX_COMMON diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 71a1cdf63e7..00a17f662c0 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -206,5 +206,7 @@ X86ASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9intrapred.o \ X86ASM-OBJS-$(CONFIG_VVC_DECODER) += x86/vvc_alf.o \ x86/vvc_sao.o \ x86/vvc_sao_10bit.o \ - x86/vvc_mc.o + x86/vvc_mc.o \ + x86/vvc_itx_8bit.o \ + x86/vvc_itx_16bit.o X86ASM-OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp.o diff --git a/libavcodec/x86/vvc_itx_16bit.asm b/libavcodec/x86/vvc_itx_16bit.asm new file mode 100644 index 00000000000..b60abcb5ffb --- /dev/null +++ b/libavcodec/x86/vvc_itx_16bit.asm @@ -0,0 +1,8599 @@ +; Copyright © 2023, Frank Plowman +; Copyright © 2021, VideoLAN and dav1d authors +; Copyright © 2021, Two Orioles, LLC +; Copyright © 2021, Matthias Dressel +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. 
Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "libavutil/x86/x86util.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 32 +itx4_shuf: dd 0x509600, 0xd0c09284, 0x70603422, 0xf0e0b0a6 + dd 0x508901, 0xd0c09385, 0x70603523, 0xf0e0b1a7 +idct2_4_12_shuf: dd 0, 2, 4, 6, 1, 3, 5, 7 +idct2_4_12_shuf2: dd 2, 0, 6, 4, 3, 1, 7, 5 +iadst8_12_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7 +idct2_16_12_shuf: dd 0, 4, 1, 5, 3, 7, 2, 6 +iadst16_12_shuf: dd 3, 7, 0, 4, 2, 6, 1, 5 +vvc_pw_2048_m2048: dw 2048, 2048, 2048, 2048, -2048, -2048, -2048, -2048 +idct2_4_shuf: db 0, 1, 4, 5, 12, 13, 8, 9, 2, 3, 6, 7, 14, 15, 10, 11 +idct2_32_shuf: db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15 + +%macro COEF_PAIR 2-3 0 +vvc_pd_%1_%2: dd %1, %1, %2, %2 +%define vvc_pd_%1 (vvc_pd_%1_%2 + 4*0) +%define vvc_pd_%2 (vvc_pd_%1_%2 + 4*2) +%if %3 +dd -%2, -%2 +%define vvc_pd_%2_m%2 vvc_pd_%2 +%endif +%endmacro + +COEF_PAIR 4, 22 +COEF_PAIR 9, 43 +COEF_PAIR 18, 75 +COEF_PAIR 31, 13 +COEF_PAIR 38, 54 +COEF_PAIR 57, 25 +COEF_PAIR 61, 46 +COEF_PAIR 64, 36, 1 +COEF_PAIR 64, 
83, 1 +COEF_PAIR 67, 78 +COEF_PAIR 70, 87 +COEF_PAIR 82, 73 +COEF_PAIR 85, 90 +COEF_PAIR 89, 50 +COEF_PAIR 90, 80 +COEF_PAIR 90, 88 + +vvc_pd_8: dd 8 +vvc_pd_m13: dd -13 +vvc_pd_m25: dd -25 +vvc_pd_m31: dd -31 +vvc_pd_m46: dd -46 +vvc_pd_m57: dd -57 +vvc_pd_m61: dd -61 +vvc_pd_m3344: dd -3344 +vvc_pd_1024: dd 1024 +vvc_pd_1321: dd 1321 +vvc_pd_1448: dd 1448 +vvc_pd_1697: dd 1697 +vvc_pd_2482: dd 2482 +vvc_pd_3072: dd 3072 ; 1024 + 2048 +vvc_pd_3803: dd 3803 +vvc_pd_5119: dd 5119 ; 1024 + 4096 - 1 +vvc_pd_5120: dd 5120 ; 1024 + 4096 +vvc_pd_5793: dd 5793 +vvc_pd_6144: dd 6144 ; 2048 + 4096 +vvc_pd_17408: dd 17408 ; 1024 + 16384 + +coeff_min_15: times 2 dw -0x8000 +coeff_max_15: times 2 dw 0x7fff +pixel_10_max: times 2 dw 0x03ff +pixel_12_max: times 2 dw 0x0fff +dconly_10: times 2 dw 0x7c00 +dconly_12: times 2 dw 0x150 +clip_18b_min: dd -0x20000 +clip_18b_max: dd 0x1ffff +clip_20b_min: dd -0x80000 +clip_20b_max: dd 0x7ffff + +const idct2_64_mul_16 +dd 91, 2, 65, -62, 83, 37, 84, -33, 9, 90, 18, 89 +dd -15, 90, 52, 73, -48, 77, 20, 88, -57, -70, -89, -18 +dd 90, 11, 71, -56, 79, 44, 87, -24, 43, 80, 75, 50 +dd -7, 90, 59, 69, -41, 81, 28, 86, -25, -87, -50, -75 + +cextern deint_shuf +cextern idct2_64_mul +cextern vvc_pw_1697x8 +cextern vvc_pw_1697x16 +cextern vvc_pw_36_83 +cextern vvc_pw_m36_m83 +cextern vvc_pw_m83_36 +cextern vvc_pw_64_64 +cextern vvc_pw_m64_64 +cextern vvc_pw_5 +cextern vvc_pw_2048 +cextern vvc_pw_64 +cextern vvc_pw_8192 +cextern vvc_pw_16384 +cextern vvc_pw_64x8 +cextern vvc_pd_512 +cextern vvc_pd_2048 + +cextern idct2_4x8_internal_8_avx2.main +cextern idct2_4x16_internal_8_avx2.main +cextern idct2_8x8_internal_8_avx2.main +cextern idct2_8x16_internal_8_avx2.main +cextern idct2_16x4_internal_8_avx2.main +cextern idct2_16x8_internal_8_avx2.main +cextern idct2_16x16_internal_8_avx2.main +cextern vvc_inv_dct2_dct2_8x32_8_avx2.main +cextern vvc_inv_dct2_dct2_8x32_8_avx2.main_fast +cextern vvc_inv_dct2_dct2_16x32_8_avx2.main_oddhalf +cextern 
vvc_inv_dct2_dct2_16x32_8_avx2.main_oddhalf_fast +cextern vvc_inv_dct2_dct2_16x64_8_avx2.main_part1 +cextern vvc_inv_dct2_dct2_16x64_8_avx2.main_part2_internal + +cextern iadst_4x4_internal_8_avx2.main +cextern iadst_4x8_internal_8_avx2.main_pass2 +cextern iadst_4x16_internal_8_avx2.main2 +cextern iadst_8x4_internal_8_avx2.main +cextern iadst_8x8_internal_8_avx2.main_pass2 +cextern iadst_8x16_internal_8_avx2.main +cextern iadst_8x16_internal_8_avx2.main_pass2_end +cextern iadst_16x4_internal_8_avx2.main +cextern iadst_16x8_internal_8_avx2.main +cextern iadst_16x8_internal_8_avx2.main_pass2_end +cextern iadst_16x16_internal_8_avx2.main +cextern iadst_16x16_internal_8_avx2.main_pass2_end + +SECTION .text + +%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) + +%macro WRAP_XMM 1+ + INIT_XMM cpuname + %1 + INIT_YMM cpuname +%endmacro + +%macro IWHT4_1D_PACKED 0 + ; m0 = in0 in2, m1 = in1 in3 + psubd m2, m0, m1 ; t2 + paddd xm0, xm1 ; t0 + vpermq m2, m2, q3322 + vpermq m0, m0, q1100 + vpermq m1, m1, q3120 + psubd m3, m0, m2 + psrad m3, 1 + psubd m3, m1 ; t1 t3 + psubd m0, m3 ; ____ out0 + paddd m2, m3 ; out3 ____ +%endmacro + +INIT_YMM avx2 +cglobal vvc_inv_wht_wht_4x4_16, 3, 7, 6, dst, stride, c, eob, bdmax + mova xm0, [cq+16*0] + vinserti128 m0, [cq+16*2], 1 + mova xm1, [cq+16*1] + vinserti128 m1, [cq+16*3], 1 + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + lea r6, [dstq+strideq*2] + psrad m0, 2 + psrad m1, 2 + IWHT4_1D_PACKED + punpckhdq m0, m3 + punpckldq m3, m2 + punpckhqdq m1, m0, m3 + punpcklqdq m0, m3 + IWHT4_1D_PACKED + vpblendd m0, m2, 0x33 + packssdw m0, m3 + vextracti128 xm2, m0, 1 + punpckhdq xm1, xm0, xm2 ; out2 out1 + punpckldq xm0, xm2 ; out3 out0 + movq xm2, [r6 +strideq*1] + movhps xm2, [dstq+strideq*0] + movq xm3, [r6 +strideq*0] + movhps xm3, [dstq+strideq*1] +%ifidn bdmaxd, bdmaxm + movd xm5, bdmaxd + vpbroadcastw xm5, xm5 +%else ; win64: load from stack + vpbroadcastw xm5, bdmaxm +%endif + paddsw xm0, xm2 + paddsw xm1, xm3 + pmaxsw 
xm0, xm4 + pmaxsw xm1, xm4 + pminsw xm0, xm5 + pminsw xm1, xm5 + movhps [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm1 + movq [r6 +strideq*0], xm1 + movq [r6 +strideq*1], xm0 + RET + +; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 7 +; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 7 +; flags: 1 = packed, 2 = inv_dst2, 4 = coef1 is reg, 8 = coef2 is reg +; skip round/shift if rnd is not a number +%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags +%if %9 & 8 + pmulld m%4, m%1, m%8 + pmulld m%3, m%2, m%8 +%else +%if %9 & 1 + vbroadcasti128 m%3, [vvc_pd_%8] +%else + vpbroadcastd m%3, [vvc_pd_%8] +%endif + pmulld m%4, m%1, m%3 + pmulld m%3, m%2 +%endif +%if %9 & 4 + pmulld m%1, m%7 + pmulld m%2, m%7 +%else +%if %9 & 1 + vbroadcasti128 m%5, [vvc_pd_%7] +%else + vpbroadcastd m%5, [vvc_pd_%7] +%endif + pmulld m%1, m%5 + pmulld m%2, m%5 +%endif +%if %9 & 2 + psubd m%4, m%6, m%4 + psubd m%2, m%4, m%2 +%else +%ifnum %6 + paddd m%4, m%6 +%endif + paddd m%2, m%4 +%endif +%ifnum %6 + paddd m%1, m%6 +%endif + psubd m%1, m%3 +%ifnum %6 + psrad m%2, 7 + psrad m%1, 7 +%endif +%endmacro + +%macro INV_TXFM_FN 4-5 10 ; type1, type2, eob_offset, size, bitdepth +cglobal vvc_inv_%1_%2_%4_%5, 4, 6, 0, dst, c, eob, l2tr, stride, tx2 + %define %%p1 m(i%1_%4_internal_%5) + ; Jump to the 1st txfm function if we're not taking the fast path, which + ; in turn performs an indirect jump to the 2nd txfm function. 
+ lea tx2q, [m(i%2_%4_internal_%5).pass2] +%ifidn %1_%2, dct2_dct2 + test eobd, eobd + jnz %%p1 +%else +%if %3 + add eobd, %3 +%endif + ; jump to the 1st txfm function unless it's located directly after this + times ((%%end - %%p1) >> 31) & 1 jmp %%p1 +ALIGN function_align +%%end: +%endif +%endmacro + +%macro INV_TXFM_4X4_FN 2-3 10 ; type1, type2, bitdepth + INV_TXFM_FN %1, %2, 0, 4x4, %3 +%ifidn %1_%2, dct2_dct2 + vpbroadcastd xm2, [dconly_%3] +%if %3 = 10 +.dconly: + imul r7d, [cq], 181 + mov [cq], eobd ; 0 + or r2d, 4 +.dconly2: + add r7d, 128 + sar r7d, 8 +.dconly3: + imul r7d, 181 + add r7d, 2176 + sar r7d, 12 + movd xm0, r7d + paddsw xm0, xm2 + vpbroadcastw xm0, xm0 +.dconly_loop: + movq xm1, [dstq+strideq*0] + movhps xm1, [dstq+strideq*1] + paddsw xm1, xm0 + psubusw xm1, xm2 + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + lea dstq, [dstq+strideq*2] + sub r2d, 2 + jg .dconly_loop + WRAP_XMM RET +%else + jmp m(vvc_inv_dct2_dct2_4x4_10).dconly +%endif +%endif +%endmacro + +%macro IDCT2_4_1D_PACKED 6 ; dst/src[1-2], tmp[1-3], rnd + ITX_MULSUB_2D %1, %2, %3, %4, %5, nornd, 64_36, 64_83, 1 + punpckhqdq m%3, m%2, m%1 ; t3 t2 + punpcklqdq m%2, m%1 ; t0 t1 + paddd m%1, m%2, m%3 ; out0 out1 + psubd m%2, m%3 ; out3 out2 + paddd m%1, m%6 + paddd m%2, m%6 + psrad m%1, 7 + psrad m%2, 7 + vpbroadcastd m%3, [coeff_min_15] + vpbroadcastd m%4, [coeff_max_15] + pmaxsd m%1, m%3 + pmaxsd m%2, m%3 + pminsd m%1, m%4 + pminsd m%2, m%4 +%endmacro + +%macro IDCT2_4_1D_PACKED_WORD 6 ; dst/src[1-2], tmp[1-3], rnd + vpbroadcastd m%5, [vvc_pw_m83_36] + punpckhwd m%3, m%2, m%1 + vpbroadcastd m%4, [vvc_pw_36_83] + punpcklwd m%2, m%1 + vpbroadcastd m%1, [vvc_pw_m64_64] + pmaddwd m%5, m%3 + pmaddwd m%3, m%4 + vpbroadcastd m%4, [vvc_pw_64_64] + pmaddwd m%1, m%2 + pmaddwd m%2, m%4 + paddd m%4, m%1, m%5 + psubd m%5, m%1, m%5 + paddd m%1, m%2, m%3 + psubd m%2, m%3 + REPX {paddd x, m%6}, m%4, m%1, m%5, m%2 + REPX {psrad x, 10 }, m%4, m%1, m%5, m%2 + packssdw m%1, m%4 + packssdw m%2, 
m%5 +%endmacro + +INV_TXFM_4X4_FN dct2, dct2 +INV_TXFM_4X4_FN dct2, identity +INV_TXFM_4X4_FN dct2, adst +INV_TXFM_4X4_FN dct2, flipadst + +cglobal idct2_4x4_internal_10, 0, 8, 6, dst, c, eob, l2tr, stride, tx2 + mov strideq, 8 + call .main + vbroadcasti128 m2, [idct2_4_shuf] + packssdw m0, m1 + pshufb m0, m2 + jmp tx2q +.pass2: + vextracti128 xm1, m0, 1 + vpbroadcastd xm5, [vvc_pd_512] + WRAP_XMM IDCT2_4_1D_PACKED_WORD 0, 1, 2, 3, 4, 5 + lea r7, [dstq+strideq*2] + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movhps [r7 +strideq*0], xm1 + movq [r7 +strideq*1], xm1 + RET +ALIGN function_align +.main: + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpbroadcastd m5, [vvc_pd_64] +.main2: + IDCT2_4_1D_PACKED 0, 1, 2, 3, 4, 5 + ret + +INV_TXFM_4X4_FN adst, dct2 +INV_TXFM_4X4_FN adst, adst +INV_TXFM_4X4_FN adst, flipadst +INV_TXFM_4X4_FN adst, identity + +%macro IADST4_1D 0 + vpbroadcastd m5, [vvc_pd_1321] + vpbroadcastd m7, [vvc_pd_2482] + pmulld m4, m0, m5 ; 1321*in0 + pmulld m6, m3, m7 ; 2482*in3 + paddd m4, m6 ; 1321*in0 + 2482*in3 + pmulld m6, m0, m7 ; 2482*in0 + paddd m0, m3 ; in0 + in3 + paddd m7, m5 ; vvc_pd_3803 + pmulld m5, m2 ; 1321*in2 + pmulld m3, m7 ; 3803*in3 + pmulld m7, m2 ; 3803*in2 + psubd m2, m0 ; in2 - in0 - in3 + vpbroadcastd m0, [vvc_pd_m3344] + pmulld m1, m0 ; -t3 + pmulld m2, m0 ; out2 (unrounded) + psubd m6, m5 ; 2482*in0 - 1321*in2 + paddd m4, m7 ; t0 + psubd m6, m3 ; t1 + paddd m3, m4, m6 + psubd m4, m1 ; out0 (unrounded) + psubd m6, m1 ; out1 (unrounded) + paddd m3, m1 ; out3 (unrounded) +%endmacro + +cglobal iadst_4x4_internal_10, 0, 7, 6, dst, stride, c, eob, tx2 + call .main + vinserti128 m0, m4, xm6, 1 + vinserti128 m1, m2, xm3, 1 +.pass1_end: + vpbroadcastd m5, [vvc_pd_2048] + mova m2, [itx4_shuf] + paddd m0, m5 + paddd m1, m5 + psrad m0, 12 + psrad m1, 12 + packssdw m0, m1 + vpermd m0, m2, m0 + psrld m2, 4 + pshufb m0, m2 +%if WIN64 + movaps xmm6, [rsp+ 8] + movaps xmm7, [rsp+24] +%endif + jmp tx2q +.pass2: + 
lea r6, [deint_shuf+128] + vextracti128 xm1, m0, 1 + call m(iadst_4x4_internal_8).main +.end: + vpbroadcastd xm4, [vvc_pw_2048] + movq xm2, [dstq+strideq*0] + movhps xm2, [dstq+strideq*1] + lea r6, [dstq+strideq*2] + movq xm3, [r6 +strideq*0] + movhps xm3, [r6 +strideq*1] + vpbroadcastd xm5, [pixel_10_max] + pmulhrsw xm0, xm4 + pmulhrsw xm1, xm4 + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + paddw xm0, xm2 + paddw xm1, xm3 + pmaxsw xm0, xm4 + pmaxsw xm1, xm4 + pminsw xm0, xm5 + pminsw xm1, xm5 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [r6 +strideq*0], xm1 + movhps [r6 +strideq*1], xm1 + RET +ALIGN function_align +.main: + mova xm0, [cq+16*0] + mova xm1, [cq+16*1] + mova xm2, [cq+16*2] + mova xm3, [cq+16*3] +%if WIN64 + movaps [rsp+16], xmm6 + movaps [rsp+32], xmm7 +%endif +.main2: + WRAP_XMM IADST4_1D + ret + +INV_TXFM_4X4_FN flipadst, dct2 +INV_TXFM_4X4_FN flipadst, adst +INV_TXFM_4X4_FN flipadst, flipadst +INV_TXFM_4X4_FN flipadst, identity + +cglobal iflipadst_4x4_internal_10, 0, 7, 6, dst, stride, c, eob, tx2 + call m(iadst_4x4_internal_10).main + vinserti128 m0, m3, xm2, 1 + vinserti128 m1, m6, xm4, 1 + jmp m(iadst_4x4_internal_10).pass1_end +.pass2: + lea r6, [deint_shuf+128] + vextracti128 xm1, m0, 1 + call m(iadst_4x4_internal_8).main + vpbroadcastd xm4, [vvc_pw_2048] + movq xm3, [dstq+strideq*1] + movhps xm3, [dstq+strideq*0] + lea r6, [dstq+strideq*2] + movq xm2, [r6 +strideq*1] + movhps xm2, [r6 +strideq*0] + vpbroadcastd xm5, [pixel_10_max] + pmulhrsw xm0, xm4 + pmulhrsw xm1, xm4 + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + paddw xm0, xm2 + paddw xm1, xm3 + pmaxsw xm0, xm4 + pmaxsw xm1, xm4 + pminsw xm0, xm5 + pminsw xm1, xm5 + movhps [dstq+strideq*0], xm1 + movq [dstq+strideq*1], xm1 + movhps [r6 +strideq*0], xm0 + movq [r6 +strideq*1], xm0 + RET + +INV_TXFM_4X4_FN identity, dct2 +INV_TXFM_4X4_FN identity, adst +INV_TXFM_4X4_FN identity, flipadst +INV_TXFM_4X4_FN identity, identity + +cglobal 
iidentity_4x4_internal_10, 0, 7, 6, dst, stride, c, eob, tx2 + vpbroadcastd m1, [vvc_pd_5793] + pmulld m0, m1, [cq+32*0] + pmulld m1, [cq+32*1] + vpbroadcastd m5, [vvc_pd_2048] + mova m3, [itx4_shuf] + paddd m0, m5 + paddd m1, m5 + psrad m0, 12 + psrad m1, 12 + packssdw m0, m1 + vpermd m0, m3, m0 + psrld m3, 4 + pshufb m0, m3 + jmp tx2q +.pass2: + vpbroadcastd m1, [vvc_pw_1697x8] + movq xm2, [dstq+strideq*0] + movhps xm2, [dstq+strideq*1] + lea r6, [dstq+strideq*2] + pmulhrsw m1, m0 + paddsw m0, m1 + movq xm3, [r6 +strideq*0] + movhps xm3, [r6 +strideq*1] + vpbroadcastd xm4, [pixel_10_max] + packssdw m5, m5 ; vvc_pw_2048 + pmulhrsw m0, m5 + pxor m5, m5 + mova [cq+32*0], m5 + mova [cq+32*1], m5 + vextracti128 xm1, m0, 1 + paddw xm0, xm2 + paddw xm1, xm3 + pmaxsw xm0, xm5 + pmaxsw xm1, xm5 + pminsw xm0, xm4 + pminsw xm1, xm4 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [r6 +strideq*0], xm1 + movhps [r6 +strideq*1], xm1 + RET + +INV_TXFM_4X4_FN dct2, dct2, 12 +INV_TXFM_4X4_FN dct2, identity, 12 +INV_TXFM_4X4_FN dct2, adst, 12 +INV_TXFM_4X4_FN dct2, flipadst, 12 + +cglobal idct2_4x4_internal_12, 0, 7, 8, dst, stride, c, eob, tx2 + call m(idct2_4x4_internal_10).main + mova m3, [idct2_4_12_shuf] + mova m4, [idct2_4_12_shuf2] + vpermd m2, m4, m1 + vpermd m1, m3, m0 + jmp m(iadst_4x4_internal_12).pass1_end2 +.pass2: + vpbroadcastd m5, [vvc_pd_2048] + vpermq m0, m0, q3120 + vpermq m1, m1, q3120 + call m(idct2_4x4_internal_10).main2 + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + jmp m(iadst_4x4_internal_12).end + +INV_TXFM_4X4_FN adst, dct2, 12 +INV_TXFM_4X4_FN adst, adst, 12 +INV_TXFM_4X4_FN adst, flipadst, 12 +INV_TXFM_4X4_FN adst, identity, 12 + +cglobal iadst_4x4_internal_12, 0, 7, 8, dst, stride, c, eob, tx2 + call m(iadst_4x4_internal_10).main + vinserti128 m1, m4, xm6, 1 + vinserti128 m2, xm3, 1 +.pass1_end: + mova m3, [itx4_shuf] + vpbroadcastd m5, [vvc_pd_1024] + psrad m1, 1 + psrad m2, 1 + vpermd m1, m3, m1 + vpermd m2, m3, m2 + paddd m1, m5 
+ paddd m2, m5 + psrad m1, 11 + psrad m2, 11 +.pass1_end2: + vpbroadcastd m3, [clip_18b_min] + vpbroadcastd m4, [clip_18b_max] + punpcklqdq m0, m1, m2 + punpckhqdq m1, m2 + pmaxsd m0, m3 + pmaxsd m1, m3 + pminsd m0, m4 + pminsd m1, m4 + jmp tx2q +.pass2: + call .main_pass2 + vinserti128 m0, m4, xm6, 1 + vinserti128 m1, m2, xm3, 1 +.pass2_end: + vpbroadcastd m5, [vvc_pd_2048] + paddd m0, m5 + paddd m1, m5 + psrad m0, 12 + psrad m1, 12 +.end: +%if WIN64 + WIN64_RESTORE_XMM_INTERNAL + %assign xmm_regs_used 6 +%endif +.end2: + vpbroadcastd m4, [vvc_pw_16384] + movq xm2, [dstq+strideq*0] + movq xm3, [dstq+strideq*1] + lea r6, [dstq+strideq*2] + movhps xm2, [r6 +strideq*0] ; dst0 dst2 + movhps xm3, [r6 +strideq*1] ; dst1 dst3 + vpbroadcastd m5, [pixel_12_max] + vinserti128 m2, xm3, 1 + psrad m0, 3 + psrad m1, 3 + packssdw m0, m1 ; t0 t2 t1 t3 + pmulhrsw m0, m4 + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + paddw m0, m2 ; out0 out2 out1 out3 + pmaxsw m0, m4 + pminsw m0, m5 + vextracti128 xm1, m0, 1 ; out1 out3 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [r6 +strideq*0], xm0 + movhps [r6 +strideq*1], xm1 + RET +.main_pass2: + vextracti128 xm3, m1, 1 + mova xm2, xm1 + vextracti128 xm1, m0, 1 + jmp m(iadst_4x4_internal_10).main2 + +INV_TXFM_4X4_FN flipadst, dct2, 12 +INV_TXFM_4X4_FN flipadst, adst, 12 +INV_TXFM_4X4_FN flipadst, flipadst, 12 +INV_TXFM_4X4_FN flipadst, identity, 12 + +cglobal iflipadst_4x4_internal_12, 0, 7, 8, dst, stride, c, eob, tx2 + call m(iadst_4x4_internal_10).main + vinserti128 m1, m3, xm2, 1 + vinserti128 m2, m6, xm4, 1 + jmp m(iadst_4x4_internal_12).pass1_end +.pass2: + call m(iadst_4x4_internal_12).main_pass2 + vinserti128 m0, m3, xm2, 1 + vinserti128 m1, m6, xm4, 1 + jmp m(iadst_4x4_internal_12).pass2_end + +INV_TXFM_4X4_FN identity, dct2, 12 +INV_TXFM_4X4_FN identity, adst, 12 +INV_TXFM_4X4_FN identity, flipadst, 12 +INV_TXFM_4X4_FN identity, identity, 12 + +cglobal iidentity_4x4_internal_12, 0, 7, 8, dst, stride, 
c, eob, tx2 + mova m2, [itx4_shuf] + vpbroadcastd m3, [vvc_pd_1697] + vpermd m0, m2, [cq+32*0] + vpermd m2, m2, [cq+32*1] + vpbroadcastd m5, [vvc_pd_2048] + pmulld m1, m3, m0 + pmulld m3, m2 + paddd m1, m5 + paddd m3, m5 + psrad m1, 12 + psrad m3, 12 + paddd m1, m0 + paddd m2, m3 + jmp m(iadst_4x4_internal_12).pass1_end2 +.pass2: + ; m0 = in0 in1 + ; m1 = in2 in3 + vpbroadcastd m3, [vvc_pd_5793] + vpbroadcastd m5, [vvc_pd_2048] + pmulld m0, m3 + pmulld m1, m3 + paddd m0, m5 ; 2048 + paddd m1, m5 + psrad m0, 12 + psrad m1, 12 + jmp m(iadst_4x4_internal_12).end + +%macro INV_TXFM_4X8_FN 2-3 10 ; type1, type2, bitdepth + INV_TXFM_FN %1, %2, 0, 4x8, %3 +%ifidn %1_%2, dct2_dct2 + vpbroadcastd xm2, [dconly_%3] +%if %3 = 10 +.dconly: + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 8 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + jmp m(vvc_inv_dct2_dct2_4x4_10).dconly2 +%else + jmp m(vvc_inv_dct2_dct2_4x8_10).dconly +%endif +%endif +%endmacro + +%macro IDCT2_4_1D 8 ; src[1-4], tmp[1-3], rnd + ITX_MULSUB_2D %2, %4, %5, %6, %7, %8, 36, 83 ; t2, t3 + vpbroadcastd m%5, [vvc_pd_64] + pmulld m%1, m%5 + pmulld m%3, m%5 + paddd m%1, m%8 + paddd m%5, m%1, m%3 + psubd m%1, m%3 + psrad m%5, 12 ; t0 + psrad m%1, 12 ; t1 + psubd m%3, m%1, m%2 + paddd m%2, m%1 + paddd m%1, m%5, m%4 + psubd m%4, m%5, m%4 +%endmacro + +INV_TXFM_4X8_FN dct2, dct2 +INV_TXFM_4X8_FN dct2, identity +INV_TXFM_4X8_FN dct2, adst +INV_TXFM_4X8_FN dct2, flipadst + +cglobal idct2_4x8_internal_10, 0, 7, 8, dst, stride, c, eob, tx2 +.pass1: + vpbroadcastd m3, [vvc_pd_64] + pmulld m0, m3, [cq+32*0] + pmulld m1, m3, [cq+32*1] + pmulld m2, m3, [cq+32*2] + pmulld m3, m3, [cq+32*3] + vpbroadcastd m7, [vvc_pd_2048] + REPX {paddd x, m7}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + IDCT2_4_1D 0, 1, 2, 3, 4, 5, 6, 7 + jmp tx2q +.pass2: + packssdw m0, m2 + packssdw m1, m3 + lea r6, [deint_shuf+128] + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhdq m1, m0, m2 ; 2 3 + punpckldq m0, m2 ; 0 1 + vextracti128 xm2, 
m0, 1 ; 4 5 + vextracti128 xm3, m1, 1 ; 6 7 + call m(idct2_4x8_internal_8).main + vpbroadcastd xm4, [vvc_pw_2048] + REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3 + lea r3, [strideq*3] + lea r6, [dstq+strideq*4] + movq xm4, [dstq+strideq*0] + movhps xm4, [dstq+strideq*1] + movq xm5, [dstq+r3 ] + movhps xm5, [dstq+strideq*2] + movq xm6, [r6 +strideq*0] + movhps xm6, [r6 +strideq*1] + movq xm7, [r6 +r3 ] + movhps xm7, [r6 +strideq*2] + paddw xm0, xm4 ; 0 1 + paddw xm1, xm5 ; 3 2 + paddw xm2, xm6 ; 4 5 + paddw xm3, xm7 ; 7 6 + vpbroadcastd xm5, [pixel_10_max] + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + REPX {pmaxsw x, xm4}, xm0, xm1, xm2, xm3 + REPX {pminsw x, xm5}, xm0, xm1, xm2, xm3 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movhps [dstq+strideq*2], xm1 + movq [dstq+r3 ], xm1 + movq [r6 +strideq*0], xm2 + movhps [r6 +strideq*1], xm2 + movhps [r6 +strideq*2], xm3 + movq [r6 +r3 ], xm3 + RET + +INV_TXFM_4X8_FN adst, dct2 +INV_TXFM_4X8_FN adst, adst +INV_TXFM_4X8_FN adst, flipadst +INV_TXFM_4X8_FN adst, identity + +cglobal iadst_4x8_internal_10, 0, 7, 8, dst, stride, c, eob, tx2 + call m(iadst_8x4_internal_10).main + vpbroadcastd m5, [vvc_pd_2048] + paddd m0, m5, m4 + paddd m1, m5, m6 + paddd m2, m5 + paddd m3, m5 +.pass1_end: + REPX {psrad x, 12}, m0, m1, m2, m3 + jmp tx2q +.pass2: + call .pass2_main + mova xm4, [vvc_pw_2048_m2048] + REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3 +.end: + lea r3, [strideq*3] + lea r6, [dstq+strideq*4] + movq xm4, [dstq+strideq*0] + movhps xm4, [dstq+strideq*1] + movq xm5, [dstq+strideq*2] + movhps xm5, [dstq+r3 ] + movq xm6, [r6 +strideq*0] + movhps xm6, [r6 +strideq*1] + movq xm7, [r6 +strideq*2] + movhps xm7, [r6 +r3 ] + paddw xm0, xm4 ; 0 1 + paddw xm1, xm5 ; 2 3 + paddw xm2, xm6 ; 4 5 + paddw xm3, xm7 ; 6 7 + vpbroadcastd xm5, [pixel_10_max] + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + REPX {pmaxsw x, xm4}, xm0, xm1, xm2, xm3 + REPX {pminsw x, xm5}, xm0, xm1, xm2, xm3 + movq [dstq+strideq*0], xm0 
+ movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r3 ], xm1 + movq [r6 +strideq*0], xm2 + movhps [r6 +strideq*1], xm2 + movq [r6 +strideq*2], xm3 + movhps [r6 +r3 ], xm3 + RET +ALIGN function_align +.pass2_main: + packssdw m0, m2 + packssdw m1, m3 + lea r6, [deint_shuf+128] + punpcklwd m4, m0, m1 + punpckhwd m0, m1 + punpckhdq m5, m4, m0 + punpckldq m4, m0 + vextracti128 xm2, m4, 1 ; 4 5 + vextracti128 xm3, m5, 1 ; 6 7 + pshufd xm4, xm4, q1032 ; 1 0 + pshufd xm5, xm5, q1032 ; 3 2 + jmp m(iadst_4x8_internal_8).main_pass2 +ALIGN function_align +.main: + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] +.main2: + vbroadcasti128 m0, [cq+16*0] + vbroadcasti128 m2, [cq+16*2] + vbroadcasti128 m3, [cq+16*5] + vbroadcasti128 m1, [cq+16*7] + vpbroadcastd m6, [vvc_pd_64] + shufpd m0, m2, 0x0c ; 0 2 + shufpd m1, m3, 0x0c ; 7 5 + vbroadcasti128 m2, [cq+16*4] + vbroadcasti128 m4, [cq+16*6] + vbroadcasti128 m5, [cq+16*1] + vbroadcasti128 m3, [cq+16*3] + vpbroadcastd m7, [vvc_pd_2048] + shufpd m2, m4, 0x0c ; 4 6 + shufpd m3, m5, 0x0c ; 3 1 + REPX {pmulld x, m6}, m0, m1, m2, m3 + REPX {paddd x, m7}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 +.main3: + ITX_MULSUB_2D 1, 0, 4, 5, 6, 7, 9_43, 90_80, 1 + ITX_MULSUB_2D 3, 2, 4, 5, 6, 7, 70_87, 57_25, 1 + psubd m4, m0, m2 ; t4 t6 + paddd m0, m2 ; t0 t2 + psubd m2, m1, m3 ; t5 t7 + paddd m1, m3 ; t1 t3 + REPX {pmaxsd x, m8}, m4, m2, m0, m1 + REPX {pminsd x, m9}, m4, m2, m0, m1 + pxor m5, m5 + psubd m5, m4 + vpblendd m4, m2, 0xcc ; t4 t7 + vpblendd m2, m5, 0xcc ; t5 -t6 + ITX_MULSUB_2D 4, 2, 3, 5, 6, 7, 36, 83 + vpbroadcastd m5, [vvc_pd_64] + vbroadcasti128 m6, [vvc_pw_2048_m2048] ; + + - - + punpckhqdq m3, m0, m1 + punpcklqdq m0, m1 + psubd m1, m0, m3 ; t2 t3 + paddd m0, m3 ; out0 -out7 + punpckhqdq m3, m4, m2 ; t7a t6a + punpcklqdq m4, m2 ; t5a t4a + psubd m2, m4, m3 ; t7 t6 + paddd m4, m3 ; out6 -out1 + REPX {pmaxsd x, m8}, m1, m2 + REPX {pminsd x, m9}, m1, m2 + vpblendd m3, m1, m2, 
0xcc + shufpd m1, m2, 0x05 + pmulld m3, m5 + pmulld m5, m1 + psignd m0, m6 ; out0 out7 + psignd m4, m6 ; out6 out1 + paddd m3, m7 + psubd m2, m3, m5 + paddd m5, m3 + psrad m2, 12 ; out4 -out5 + psrad m5, 12 ; -out3 out2 + ret + +INV_TXFM_4X8_FN flipadst, dct2 +INV_TXFM_4X8_FN flipadst, adst +INV_TXFM_4X8_FN flipadst, flipadst +INV_TXFM_4X8_FN flipadst, identity + +cglobal iflipadst_4x8_internal_10, 0, 7, 8, dst, stride, c, eob, tx2 + call m(iadst_8x4_internal_10).main + vpbroadcastd m5, [vvc_pd_2048] + paddd m0, m5, m3 + paddd m1, m5, m2 + paddd m2, m5, m6 + paddd m3, m5, m4 + jmp m(iadst_4x8_internal_10).pass1_end +.pass2: + call m(iadst_4x8_internal_10).pass2_main + mova xm4, [vvc_pw_2048_m2048] + REPX {pmulhrsw x, xm4}, xm3, xm2, xm1, xm0 + lea r3, [strideq*3] + lea r6, [dstq+strideq*4] + movq xm4, [dstq+strideq*1] + movhps xm4, [dstq+strideq*0] + movq xm5, [dstq+r3 ] + movhps xm5, [dstq+strideq*2] + movq xm6, [r6 +strideq*1] + movhps xm6, [r6 +strideq*0] + movq xm7, [r6 +r3 ] + movhps xm7, [r6 +strideq*2] + paddw xm3, xm4 ; 1 0 + paddw xm2, xm5 ; 3 2 + paddw xm1, xm6 ; 5 4 + paddw xm0, xm7 ; 7 6 + vpbroadcastd xm5, [pixel_10_max] + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + REPX {pmaxsw x, xm4}, xm3, xm2, xm1, xm0 + REPX {pminsw x, xm5}, xm3, xm2, xm1, xm0 + movhps [dstq+strideq*0], xm3 + movq [dstq+strideq*1], xm3 + movhps [dstq+strideq*2], xm2 + movq [dstq+r3 ], xm2 + movhps [r6 +strideq*0], xm1 + movq [r6 +strideq*1], xm1 + movhps [r6 +strideq*2], xm0 + movq [r6 +r3 ], xm0 + RET + +INV_TXFM_4X8_FN identity, dct2 +INV_TXFM_4X8_FN identity, adst +INV_TXFM_4X8_FN identity, flipadst +INV_TXFM_4X8_FN identity, identity + +cglobal iidentity_4x8_internal_10, 0, 7, 8, dst, stride, c, eob, tx2 +.pass1: + vpbroadcastd m3, [vvc_pd_64] + pmulld m0, m3, [cq+32*0] + pmulld m1, m3, [cq+32*1] + pmulld m2, m3, [cq+32*2] + pmulld m3, [cq+32*3] + vpbroadcastd m5, [vvc_pd_2048] + vpbroadcastd m4, [vvc_pd_5793] + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 
12}, m0, m1, m2, m3 + REPX {pmulld x, m4}, m0, m1, m2, m3 + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + jmp tx2q +.pass2: + vpbroadcastd m6, [pixel_10_max] + call .pass2_end + RET +ALIGN function_align +.pass2_end: + vpbroadcastd m4, [vvc_pw_64] + packssdw m0, m2 + packssdw m1, m3 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + pmulhrsw m2, m4 + pmulhrsw m0, m4 + punpckhdq m1, m0, m2 ; 2 3 6 7 + punpckldq m0, m2 ; 0 1 4 5 + lea r3, [strideq*3] + lea r6, [dstq+strideq*4] + movq xm2, [dstq+strideq*0] + movhps xm2, [dstq+strideq*1] + vpbroadcastq m4, [r6 +strideq*0] + vpbroadcastq m5, [r6 +strideq*1] + movq xm3, [dstq+strideq*2] + movhps xm3, [dstq+r3 ] + vpblendd m2, m4, 0x30 + vpblendd m2, m5, 0xc0 + vpbroadcastq m4, [r6 +strideq*2] + vpbroadcastq m5, [r6 +r3 ] + vpblendd m3, m4, 0x30 + vpblendd m3, m5, 0xc0 + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + paddw m0, m2 ; out0 out1 out4 out5 + paddw m1, m3 ; out2 out3 out6 out7 + pmaxsw m0, m4 + pmaxsw m1, m4 + pminsw m0, m6 + pminsw m1, m6 + vextracti128 xm2, m0, 1 ; out4 out5 + vextracti128 xm3, m1, 1 ; out6 out7 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r3 ], xm1 + movq [r6 +strideq*0], xm2 + movhps [r6 +strideq*1], xm2 + movq [r6 +strideq*2], xm3 + movhps [r6 +r3 ], xm3 + ret + +INV_TXFM_4X8_FN dct2, dct2, 12 +INV_TXFM_4X8_FN dct2, identity, 12 +INV_TXFM_4X8_FN dct2, adst, 12 +INV_TXFM_4X8_FN dct2, flipadst, 12 + +cglobal idct2_4x8_internal_12, 0, 7, 10, dst, stride, c, eob, tx2 + jmp m(idct2_4x8_internal_10).pass1 +.pass2: + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] + REPX {pmaxsd x, m8}, m0, m1, m2, m3 + REPX {pminsd x, m9}, m0, m1, m2, m3 + ; transpose & interleave + pshufd m0, m0, q1320 + pshufd m1, m1, q1320 + pshufd m2, m2, q1320 + pshufd m3, m3, q1320 + punpckldq m4, m0, m1 + punpckhdq m0, m1 + punpckldq m5, m2, m3 + punpckhdq m2, m3 + vpermq m0, m0, q3102 + vpermq m2, m2, q3102 + 
vperm2i128 m1, m0, m2, 0x31 ; 1 5 (interleaved) + vperm2i128 m3, m0, m2, 0x20 ; 7 3 (interleaved) + vperm2i128 m0, m4, m5, 0x20 ; 0 2 (interleaved) + vperm2i128 m2, m4, m5, 0x31 ; 4 6 (interleaved) + vpbroadcastd m7, [vvc_pd_2048] + call m(idct2_8x4_internal_10).main + psubd m3, m0, m4 ; out7 out6 + paddd m0, m4 ; out0 out1 + paddd m1, m2, m5 ; out3 out2 + psubd m2, m5 ; out4 out5 + pshufd m1, m1, q1032 + pshufd m3, m3, q1032 + jmp m(iadst_4x8_internal_12).end + +INV_TXFM_4X8_FN adst, dct2, 12 +INV_TXFM_4X8_FN adst, adst, 12 +INV_TXFM_4X8_FN adst, flipadst, 12 +INV_TXFM_4X8_FN adst, identity, 12 + +cglobal iadst_4x8_internal_12, 0, 7, 10, dst, stride, c, eob, tx2 + call m(iadst_8x4_internal_10).main + psrad m0, m4, 1 + psrad m1, m6, 1 + psrad m2, 1 + psrad m3, 1 +.pass1_end: + vpbroadcastd m5, [vvc_pd_1024] + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 11}, m0, m1, m2, m3 + jmp tx2q +.pass2: + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] + REPX {pmaxsd x, m8}, m0, m1, m2, m3 + REPX {pminsd x, m9}, m0, m1, m2, m3 + call .pass2_main + vpblendd m3, m0, m4, 0x33 ; out6 out7 + vpblendd m0, m4, 0xcc ; out0 out1 + pshufd m1, m5, q1032 + psignd m2, m6 ; out4 out5 + psignd m1, m6 ; out2 out3 +.end: + vpbroadcastd m4, [vvc_pw_16384] + REPX {psrad x, 3}, m0, m1, m2, m3 + packssdw m0, m2 ; 0 1 4 5 (interleaved) + packssdw m1, m3 ; 2 3 6 7 (interleaved) + mova m2, [iadst8_12_shuf] + vpermd m0, m2, m0 ; 0 1 4 5 + vpermd m1, m2, m1 ; 2 3 6 7 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + lea r3, [strideq*3] + lea r6, [dstq+strideq*4] + movq xm4, [dstq+strideq*0] + movhps xm4, [dstq+strideq*1] + movq xm5, [dstq+strideq*2] + movhps xm5, [dstq+r3 ] + movq xm6, [r6 +strideq*0] + movhps xm6, [r6 +strideq*1] + vinserti128 m4, xm6, 1 + movq xm7, [r6 +strideq*2] + movhps xm7, [r6 +r3 ] + vinserti128 m5, xm7, 1 + paddw m0, m4 ; 0 1 4 5 + paddw m1, m5 ; 2 3 6 7 + vpbroadcastd m5, [pixel_12_max] + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + REPX {pmaxsw x, m4}, 
m0, m1 + REPX {pminsw x, m5}, m0, m1 + vextracti128 xm2, m0, 1 ; out4 out5 + vextracti128 xm3, m1, 1 ; out6 out7 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r3 ], xm1 + movq [r6 +strideq*0], xm2 + movhps [r6 +strideq*1], xm2 + movq [r6 +strideq*2], xm3 + movhps [r6 +r3 ], xm3 + RET +ALIGN function_align +.pass2_main: + ; transpose & interleave + pshufd m0, m0, q1320 + pshufd m1, m1, q1320 + pshufd m2, m2, q1320 + pshufd m3, m3, q1320 + punpckldq m4, m0, m1 + punpckhdq m0, m1 + punpckldq m5, m2, m3 + punpckhdq m2, m3 + vperm2i128 m1, m0, m2, 0x31 ; 7 5 (interleaved) + vperm2i128 m3, m0, m2, 0x20 ; 3 1 (interleaved) + vperm2i128 m0, m4, m5, 0x20 ; 0 2 (interleaved) + vperm2i128 m2, m4, m5, 0x31 ; 4 6 (interleaved) + vpbroadcastd m7, [vvc_pd_2048] + jmp m(iadst_4x8_internal_10).main3 + +INV_TXFM_4X8_FN flipadst, dct2, 12 +INV_TXFM_4X8_FN flipadst, adst, 12 +INV_TXFM_4X8_FN flipadst, flipadst, 12 +INV_TXFM_4X8_FN flipadst, identity, 12 + +cglobal iflipadst_4x8_internal_12, 0, 7, 10, dst, stride, c, eob, tx2 + call m(iadst_8x4_internal_10).main + psrad m0, m3, 1 + psrad m1, m2, 1 + psrad m2, m6, 1 + psrad m3, m4, 1 + jmp m(iadst_4x8_internal_12).pass1_end +.pass2: + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] + REPX {pmaxsd x, m8}, m0, m1, m2, m3 + REPX {pminsd x, m9}, m0, m1, m2, m3 + call m(iadst_4x8_internal_12).pass2_main + shufpd m3, m4, m0, 0x05 ; out1 out0 + shufpd m0, m4, 0x05 ; out7 out6 + psignd m2, m6 + pshufd m6, m6, q1032 + pshufd m1, m2, q1032 ; out5 out4 + psignd m2, m5, m6 ; out3 out2 + jmp m(iadst_4x8_internal_12).end + +INV_TXFM_4X8_FN identity, dct2, 12 +INV_TXFM_4X8_FN identity, adst, 12 +INV_TXFM_4X8_FN identity, flipadst, 12 +INV_TXFM_4X8_FN identity, identity, 12 + +cglobal iidentity_4x8_internal_12, 0, 7, 10, dst, stride, c, eob, tx2 + jmp m(iidentity_4x8_internal_10).pass1 +.pass2: + ; m0 = in0 in1 + ; m1 = in2 in3 + ; m2 = in4 in5 + ; m3 = in6 in7 + vpbroadcastd 
m6, [pixel_12_max] + call m(iidentity_4x8_internal_10).pass2_end + RET + +%macro INV_TXFM_4X16_FN 2-3 10 ; type1, type2, bitdepth + INV_TXFM_FN %1, %2, 0, 4x16, %3 +%ifidn %1_%2, dct2_dct2 + imul r6d, [cq], 181 + vpbroadcastd xm2, [dconly_%3] + mov [cq], eobd ; 0 + or r3d, 16 + add r6d, 384 + sar r6d, 9 + jmp m(vvc_inv_dct2_dct2_4x4_10).dconly3 +%endif +%endmacro + +INV_TXFM_4X16_FN dct2, dct2 +INV_TXFM_4X16_FN dct2, identity +INV_TXFM_4X16_FN dct2, adst +INV_TXFM_4X16_FN dct2, flipadst + +cglobal idct2_4x16_internal_10, 0, 7, 11, dst, stride, c, eob, tx2 +.pass1: + vpbroadcastd m10, [vvc_pd_3072] + mova m1, [cq+32*2] + mova m3, [cq+32*6] + mova m5, [cq+32*3] + mova m7, [cq+32*7] + call .pass1_main + pmulld m0, m6, [cq+32*0] + pmulld m2, m6, [cq+32*4] + pmulld m4, m6, [cq+32*1] + pmulld m6, [cq+32*5] + call .pass1_main2 + REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7 + jmp tx2q +.pass2: + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + lea r6, [deint_shuf+128] + punpcklwd m4, m2, m3 + punpckhwd m2, m3 + punpckhwd m5, m0, m1 + punpcklwd m0, m1 + punpckhdq m1, m0, m4 ; 2 3 + punpckldq m0, m4 ; 0 1 + punpckldq m4, m5, m2 ; 8 9 + punpckhdq m5, m2 ; a b + vextracti128 xm2, m0, 1 ; 4 5 + vextracti128 xm3, m1, 1 ; 6 7 + vextracti128 xm6, m4, 1 ; c d + vextracti128 xm7, m5, 1 ; e f + call m(idct2_4x16_internal_8).main + vpbroadcastd m9, [vvc_pw_2048] + vinserti128 m0, m0, xm1, 1 ; 0 1 3 2 + vinserti128 m1, m2, xm3, 1 ; 4 5 7 6 + vinserti128 m2, m4, xm5, 1 ; 8 9 b a + vinserti128 m3, m6, xm7, 1 ; c d f e + vpbroadcastd m8, [pixel_10_max] + call .pass2_end + RET +ALIGN function_align +.pass1_main: + vpbroadcastd m4, [vvc_pd_83] + vpbroadcastd m8, [vvc_pd_36] + vpbroadcastd m9, [vvc_pd_2048] + vpbroadcastd m6, [vvc_pd_1448] + ITX_MULSUB_2D 1, 3, 0, 2, _, 9, 8, 4, 0xc ; t2l, t3l + ITX_MULSUB_2D 5, 7, 4, 2, _, 9, 8, 4, 0xc ; t2h, t3h + ret +ALIGN function_align +.pass1_main2: + paddd m0, m10 + paddd m4, m10 + paddd m8, m0, m2 + psubd m0, m2 + paddd 
m9, m4, m6 + psubd m4, m6 + REPX {psrad x, 11}, m8, m0, m9, m4 ; t0l, t1l, t0h, t1h + psubd m2, m0, m1 + paddd m1, m0 + psubd m6, m4, m5 + paddd m5, m4 + paddd m0, m8, m3 + psubd m3, m8, m3 + paddd m4, m9, m7 + psubd m7, m9, m7 + ret +ALIGN function_align +.pass2_end: + lea r6, [strideq*3] + pxor m7, m7 + pmulhrsw m0, m9 + call .write_4x4 + pmulhrsw m0, m1, m9 + call .write_4x4 + pmulhrsw m0, m2, m9 + call .write_4x4 + pmulhrsw m0, m3, m9 + call .write_4x4 + ret +ALIGN function_align +.write_4x4: + movq xm4, [dstq+strideq*0] + movhps xm4, [dstq+strideq*1] + vpbroadcastq m5, [dstq+strideq*2] + vpbroadcastq m6, [dstq+r6 ] + mova [cq+32*0], m7 + mova [cq+32*1], m7 + add cq, 32*2 + vpblendd m4, m5, 0xc0 + vpblendd m4, m6, 0x30 + paddw m4, m0 + pmaxsw m4, m7 + pminsw m4, m8 + vextracti128 xm5, m4, 1 + movq [dstq+strideq*0], xm4 + movhps [dstq+strideq*1], xm4 + movhps [dstq+strideq*2], xm5 + movq [dstq+r6 ], xm5 + lea dstq, [dstq+strideq*4] + ret + +INV_TXFM_4X16_FN adst, dct2 +INV_TXFM_4X16_FN adst, adst +INV_TXFM_4X16_FN adst, flipadst +INV_TXFM_4X16_FN adst, identity + +cglobal iadst_4x16_internal_10, 0, 7, 11, dst, stride, c, eob, tx2 + call m(iadst_16x4_internal_10).main + vpbroadcastd m6, [vvc_pd_6144] + call m(iadst_16x4_internal_10).main_end + psrad m0, m4, 13 + psrad m1, m5, 13 + psrad m2, 13 + psrad m3, 13 + psrad m4, m8, 13 + psrad m5, m9, 13 + psrad m6, 13 + psrad m7, 13 + jmp tx2q +.pass2: + call .pass2_main + vpbroadcastd m5, [vvc_pw_2048] + vpbroadcastd m8, [pixel_10_max] + lea r6, [strideq*3] + vpblendd m4, m3, m0, 0xcc ; -out3 out0 out2 -out1 + pshufd m2, m2, q1032 ; -out11 out8 out10 -out9 + vpblendd m3, m0, 0x33 ; -out15 out12 out14 -out13 + pxor m7, m7 + psubw m9, m7, m5 + vpblendd m9, m5, 0x3c ; -2048 2048 2048 -2048 + pmulhrsw m0, m4, m9 + call .write_4x4 + pmulhrsw m0, m1, m9 + call .write_4x4 + pmulhrsw m0, m2, m9 + call .write_4x4 + pmulhrsw m0, m3, m9 + call .write_4x4 + RET +ALIGN function_align +.write_4x4: + movq xm4, [dstq+r6 ] + movhps xm4, 
[dstq+strideq*0] + vpbroadcastq m5, [dstq+strideq*1] + vpbroadcastq m6, [dstq+strideq*2] + mova [cq+32*0], m7 + mova [cq+32*1], m7 + add cq, 32*2 + vpblendd m4, m5, 0xc0 + vpblendd m4, m6, 0x30 + paddw m4, m0 + pmaxsw m4, m7 + pminsw m4, m8 + vextracti128 xm5, m4, 1 + movhps [dstq+strideq*0], xm4 + movhps [dstq+strideq*1], xm5 + movq [dstq+strideq*2], xm5 + movq [dstq+r6 ], xm4 + lea dstq, [dstq+strideq*4] + ret +ALIGN function_align +.pass2_main: + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + lea r6, [deint_shuf+128] + punpcklwd m4, m2, m3 + punpckhwd m2, m3 + punpckhwd m5, m0, m1 + punpcklwd m0, m1 + punpckhdq m1, m0, m4 + punpckldq m0, m4 + punpckldq m4, m5, m2 + punpckhdq m5, m2 + vpblendd m3, m0, m1, 0x33 + vpblendd m0, m1, 0xcc + shufpd m2, m5, m4, 0x05 + shufpd m4, m5, 0x05 + vperm2i128 m1, m0, m3, 0x31 ; 4 7 6 5 + vinserti128 m0, xm3, 1 ; 0 3 2 1 + vperm2i128 m3, m2, m4, 0x31 ; c f e d ; ???? + vinserti128 m2, xm4, 1 ; b 8 9 a + call m(iadst_4x16_internal_8).main2 + vpbroadcastd m5, [vvc_pw_64x8] + paddsw m1, m2, m4 + psubsw m2, m4 + pmulhrsw m1, m5 ; -out7 out4 out6 -out5 + pmulhrsw m2, m5 ; out8 -out11 -out9 out10 + ret +ALIGN function_align +.main: + vbroadcasti128 m0, [cq+16* 0] + vbroadcasti128 m4, [cq+16* 2] + vbroadcasti128 m1, [cq+16*15] + vbroadcasti128 m5, [cq+16*13] + vbroadcasti128 m2, [cq+16* 4] + vbroadcasti128 m6, [cq+16* 6] + vbroadcasti128 m3, [cq+16*11] + vbroadcasti128 m7, [cq+16* 9] + shufpd m0, m4, 0x0c ; 0 2 + shufpd m1, m5, 0x0c ; 15 13 + shufpd m2, m6, 0x0c ; 4 6 + shufpd m3, m7, 0x0c ; 11 9 + vbroadcasti128 m4, [cq+16* 8] + vbroadcasti128 m6, [cq+16*10] + vbroadcasti128 m5, [cq+16* 7] + vbroadcasti128 m7, [cq+16* 5] + shufpd m4, m6, 0x0c ; 8 10 + shufpd m5, m7, 0x0c ; 7 5 + vbroadcasti128 m6, [cq+16*12] + vbroadcasti128 m7, [cq+16*14] + shufpd m6, m7, 0x0c ; 12 14 + vbroadcasti128 m7, [cq+16* 3] + vbroadcasti128 m8, [cq+16* 1] + shufpd m7, m8, 0x0c ; 3 1 +.main2: + ; expects: m12 = clip_min m13 = clip_max 
+ vpbroadcastd m11, [vvc_pd_2048] + ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 4_22, 90_88, 1 + ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 38_54, 82_73, 1 + ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 67_78, 61_46, 1 + ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 85_90, 31_13, 1 + psubd m8, m0, m4 ; t8a t10a + paddd m0, m4 ; t0a t2a + psubd m4, m1, m5 ; t9a t11a + paddd m1, m5 ; t1a t3a + psubd m5, m2, m6 ; t12a t14a + paddd m2, m6 ; t4a t6a + psubd m6, m3, m7 ; t13a t15a + paddd m3, m7 ; t5a t7a + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m8 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m8 + ITX_MULSUB_2D 8, 4, 7, 9, 10, 11, 18_75, 89_50, 1 + ITX_MULSUB_2D 6, 5, 7, 9, 10, 11, 89_50, 10, 0x9 + psubd m7, m0, m2 ; t4 t6 + paddd m0, m2 ; t0 t2 + psubd m2, m1, m3 ; t5 t7 + paddd m1, m3 ; t1 t3 + psubd m3, m4, m6 ; t12a t14a + paddd m4, m6 ; t8a t10a + psubd m6, m8, m5 ; t13a t15a + paddd m8, m5 ; t9a t11a + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m6, m7, m8 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m6, m7, m8 + punpcklqdq m5, m3, m7 ; t12a t4 + punpckhqdq m3, m7 ; t14a t6 + punpckhqdq m7, m6, m2 ; t15a t7 + punpcklqdq m6, m2 ; t13a t5 + ITX_MULSUB_2D 7, 3, 2, 9, 10, 11, 83, 36, 0x0 + ITX_MULSUB_2D 5, 6, 2, 9, 10, 11, 36, 10, 0x8 + vpbroadcastd m10, [vvc_pd_64] + vbroadcasti128 m9, [vvc_pw_2048_m2048] ; + + - - + punpckhqdq m2, m4, m0 ; t10a t2 + punpcklqdq m4, m0 ; t8a t0 + punpckhqdq m0, m8, m1 ; t11a t3 + punpcklqdq m8, m1 ; t9a t1 + paddd m1, m6, m7 ; out2 -out3 + psubd m6, m7 ; t14a t6 + paddd m7, m5, m3 ; -out13 out12 + psubd m5, m3 ; t15a t7 + psubd m3, m8, m0 ; t11 t3a + paddd m8, m0 ; out14 -out15 + paddd m0, m4, m2 ; -out1 out0 + psubd m4, m2 ; t10 t2a + REPX {pmaxsd x, m12}, m6, m5, m3, m4 + REPX {pminsd x, m13}, m6, m5, m3, m4 + REPX {pmulld x, m10}, m6, m5, m3, m4 + paddd m6, m11 + paddd m4, m11 + paddd m2, m6, m5 ; -out5 out4 + psubd m6, m5 ; out10 -out11 + psubd m5, m4, m3 ; -out9 out8 + paddd m3, m4 ; out6 -out7 + REPX {psrad x, 12}, m2, m3, m5, m6 + REPX {psignd x, m9}, m1, 
m8, m3, m6 + pshufd m9, m9, q1032 + REPX {psignd x, m9}, m0, m7, m2, m5 + ret + +INV_TXFM_4X16_FN flipadst, dct2 +INV_TXFM_4X16_FN flipadst, adst +INV_TXFM_4X16_FN flipadst, flipadst +INV_TXFM_4X16_FN flipadst, identity + +cglobal iflipadst_4x16_internal_10, 0, 7, 11, dst, stride, c, eob, tx2 +.pass1: + call m(iadst_16x4_internal_10).main + vpbroadcastd m6, [vvc_pd_6144] + call m(iadst_16x4_internal_10).main_end + psrad m0, m3, 13 + psrad m1, m2, 13 + psrad m2, m5, 13 + psrad m3, m4, 13 + psrad m4, m7, 13 + psrad m5, m6, 13 + psrad m6, m9, 13 + psrad m7, m8, 13 + jmp tx2q +.pass2: + call m(iadst_4x16_internal_10).pass2_main + vpbroadcastd m5, [vvc_pw_2048] + vpbroadcastd m8, [pixel_10_max] + lea r6, [strideq*3] + vpblendd m4, m3, m0, 0x33 ; -out0 out3 out1 -out2 + pshufd m2, m2, q1032 ; -out11 out8 out10 -out9 + vpblendd m3, m0, 0xcc ; -out12 out15 out13 -out14 + pxor m7, m7 + psubw m9, m7, m5 + vpblendd m9, m5, 0x3c ; -2048 2048 2048 -2048 + pmulhrsw m0, m4, m9 + call .write_4x4 + pmulhrsw m0, m2, m9 + call .write_4x4 + pmulhrsw m0, m1, m9 + call .write_4x4 + pmulhrsw m0, m3, m9 + call .write_4x4 + RET +ALIGN function_align +.write_4x4: + movq xm4, [dstq+strideq*0] + movhps xm4, [dstq+r6 ] + vpbroadcastq m5, [dstq+strideq*1] + vpbroadcastq m6, [dstq+strideq*2] + mova [cq+32*0], m7 + mova [cq+32*1], m7 + add cq, 32*2 + vpblendd m4, m5, 0x30 + vpblendd m4, m6, 0xc0 + paddw m4, m0 + pmaxsw m4, m7 + pminsw m4, m8 + vextracti128 xm5, m4, 1 + movq [dstq+strideq*0], xm4 + movq [dstq+strideq*1], xm5 + movhps [dstq+strideq*2], xm5 + movhps [dstq+r6 ], xm4 + lea dstq, [dstq+strideq*4] + ret + +INV_TXFM_4X16_FN identity, dct2 +INV_TXFM_4X16_FN identity, adst +INV_TXFM_4X16_FN identity, flipadst +INV_TXFM_4X16_FN identity, identity + +cglobal iidentity_4x16_internal_10, 0, 7, 11, dst, stride, c, eob, tx2 + vpbroadcastd m7, [vvc_pd_5793] + pmulld m0, m7, [cq+32*0] + pmulld m4, m7, [cq+32*1] + pmulld m1, m7, [cq+32*2] + pmulld m5, m7, [cq+32*3] + pmulld m2, m7, [cq+32*4] + 
pmulld m6, m7, [cq+32*5] + pmulld m3, m7, [cq+32*6] + pmulld m7, [cq+32*7] + vpbroadcastd m8, [vvc_pd_6144] + REPX {paddd x, m8}, m0, m4, m1, m5, m2, m6, m3, m7 + REPX {psrad x, 13}, m0, m4, m1, m5, m2, m6, m3, m7 + jmp tx2q +.pass2: + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + vpbroadcastd m7, [vvc_pw_1697x16] + vpbroadcastd m8, [vvc_pw_2048] + pmulhrsw m4, m7, m0 + pmulhrsw m5, m7, m1 + pmulhrsw m6, m7, m2 + pmulhrsw m7, m3 + REPX {paddsw x, x}, m0, m1, m2, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + vpbroadcastd m4, [pixel_10_max] + call .pass2_end + RET +ALIGN function_align +.pass2_end: + punpckhwd m7, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + lea r6, [strideq*5] + pxor m3, m3 + punpckhdq m5, m0, m2 ; 2 3 6 7 + punpckldq m0, m2 ; 0 1 4 5 + punpckldq m6, m7, m1 ; 8 9 c d + punpckhdq m7, m1 ; a b e f + pmulhrsw m0, m8 + call .write_2x4x2 + pmulhrsw m0, m5, m8 + call .write_2x4x2 + pmulhrsw m0, m6, m8 + lea dstq, [dstq+strideq*4] + call .write_2x4x2 + pmulhrsw m0, m7, m8 + call .write_2x4x2 + ret +ALIGN function_align +.write_2x4x2: + movq xm1, [dstq+strideq*0] + movhps xm1, [dstq+strideq*1] + vpbroadcastq m2, [dstq+strideq*4] + vpblendd m1, m2, 0x30 + vpbroadcastq m2, [dstq+r6 ] + vpblendd m1, m2, 0xc0 + mova [cq+32*0], m3 + mova [cq+32*1], m3 + add cq, 32*2 + paddw m1, m0 + pmaxsw m1, m3 + pminsw m1, m4 + vextracti128 xm2, m1, 1 + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + movq [dstq+strideq*4], xm2 + movhps [dstq+r6 ], xm2 + lea dstq, [dstq+strideq*2] + ret + +INV_TXFM_4X16_FN dct2, dct2, 12 +INV_TXFM_4X16_FN dct2, identity, 12 +INV_TXFM_4X16_FN dct2, adst, 12 +INV_TXFM_4X16_FN dct2, flipadst, 12 + +cglobal idct2_4x16_internal_12, 0, 7, 14, dst, stride, c, eob, tx2 + jmp m(idct2_4x16_internal_10).pass1 +.pass2: + punpckldq m8, m0, m1 + punpckhdq m0, m1 + punpckldq m9, m2, m3 + punpckhdq m2, m3 + punpckldq m1, m4, m5 + punpckhdq m4, m5 + punpckldq m3, m6, m7 + 
punpckhdq m6, m7 + punpcklqdq m5, m0, m2 ; 2 6 + punpckhqdq m12, m0, m2 ; 3 7 + punpcklqdq m0, m8, m9 ; 0 4 + punpckhqdq m10, m8, m9 ; 1 5 + punpcklqdq m2, m1, m3 ; 8 12 + punpckhqdq m13, m1, m3 ; 9 13 + punpcklqdq m9, m4, m6 ; 10 14 + punpckhqdq m4, m6 ; 11 15 + vperm2i128 m1, m5, m9, 0x20 ; 2 10 + vperm2i128 m3, m9, m5, 0x31 ; 14 6 + vpermq m11, m4, q1302 ; 15 11 + ; interleave + REPX {vpermq x, x, q3120}, m0, m1, m2, m3, m10 + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] + REPX {pmaxsd x, m8}, m0, m1, m2, m3, m10, m11, m12, m13 + REPX {pminsd x, m9}, m0, m1, m2, m3, m10, m11, m12, m13 + call m(idct2_16x4_internal_10).pass1_main + vpermq m6, m12, q1302 ; 7 3 + vpermq m5, m13, q3120 ; 9 13 + call m(idct2_16x4_internal_10).pass1_main2 + call m(idct2_16x4_internal_10).pass1_main3 + REPX {psrad x, 3}, m0, m1, m2, m3, m4, m5, m6, m7 + packssdw m0, m1 + packssdw m1, m2, m3 + packssdw m2, m4, m5 + packssdw m3, m6, m7 + mova m4, [idct2_16_12_shuf] + REPX {vpermd x, m4, x}, m0, m1, m2, m3 + vpbroadcastd m9, [vvc_pw_16384] + vpbroadcastd m8, [pixel_12_max] + call m(idct2_4x16_internal_10).pass2_end + RET + +INV_TXFM_4X16_FN adst, dct2, 12 +INV_TXFM_4X16_FN adst, adst, 12 +INV_TXFM_4X16_FN adst, flipadst, 12 +INV_TXFM_4X16_FN adst, identity, 12 + +cglobal iadst_4x16_internal_12, 0, 7, 14, dst, stride, c, eob, tx2 + call .main_pass1 + psrad m0, m4, 12 + psrad m1, m5, 12 + psrad m2, 12 + psrad m3, 12 + psrad m4, m8, 12 + psrad m5, m9, 12 + psrad m6, 12 + psrad m7, 12 + jmp tx2q +.pass2: + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call .transpose_16x4 + call m(iadst_4x16_internal_10).main2 + pshufd m4, m5, q1032 + psrad m5, m6, 3 + pshufd m6, m7, q1032 + psrad m7, m8, 3 + REPX {pshufd x, x, q1032}, m0, m2 + REPX {psrad x, 3}, m0, m1, m2, m3, m4, m6 +.pass2_end: + packssdw m0, m1 + packssdw m1, m2, m3 + packssdw m2, m4, 
m5 + packssdw m3, m6, m7 + mova m4, [iadst16_12_shuf] + REPX {vpermd x, m4, x}, m0, m1, m2, m3 + vpbroadcastd m9, [vvc_pw_16384] + vpbroadcastd m8, [pixel_12_max] + lea r6, [strideq*3] + pxor m7, m7 + pmulhrsw m0, m9 + call m(iadst_4x16_internal_10).write_4x4 + pmulhrsw m0, m9, m1 + call m(iadst_4x16_internal_10).write_4x4 + pmulhrsw m0, m9, m2 + call m(iadst_4x16_internal_10).write_4x4 + pmulhrsw m0, m9, m3 + call m(iadst_4x16_internal_10).write_4x4 + RET +ALIGN function_align +.transpose_16x4: + ; transpose & interleave + punpckldq m8, m0, m1 + punpckhdq m0, m1 + punpckldq m9, m2, m3 + punpckhdq m2, m3 + punpckldq m1, m4, m5 + punpckhdq m4, m5 + punpckldq m3, m6, m7 + punpckhdq m6, m7 + punpcklqdq m10, m8, m0 + punpckhqdq m0, m8 + punpcklqdq m11, m9, m2 + punpckhqdq m2, m9 + punpcklqdq m8, m1, m4 + punpckhqdq m4, m1 + punpcklqdq m9, m3, m6 + punpckhqdq m6, m3 + vperm2i128 m5, m0, m2, 0x31 ; 7 5 + vperm2i128 m7, m0, m2, 0x20 ; 3 1 + vperm2i128 m0, m10, m11, 0x20 ; 0 2 + vperm2i128 m2, m10, m11, 0x31 ; 4 6 + vperm2i128 m1, m4, m6, 0x31 ; 15 13 + vperm2i128 m3, m4, m6, 0x20 ; 11 9 + vperm2i128 m4, m8, m9, 0x20 ; 8 10 + vperm2i128 m6, m8, m9, 0x31 ; 12 14 + ret +ALIGN function_align +.main_pass1: + call m(iadst_16x4_internal_10).main + vpbroadcastd m6, [vvc_pd_3072] + paddd m10, m4, m5 + psubd m4, m3 + psubd m5, m3 + paddd m3, m10 + psubd m8, m7, m1 + paddd m7, m9 + psubd m9, m1 + paddd m7, m1 + REPX {psrad x, 1 }, m4, m5, m2, m3, m8, m9, m0, m7 + REPX {paddd x, m6}, m4, m5, m2, m3, m8, m9, m7 + paddd m6, m0 + ret + +INV_TXFM_4X16_FN flipadst, dct2, 12 +INV_TXFM_4X16_FN flipadst, adst, 12 +INV_TXFM_4X16_FN flipadst, flipadst, 12 +INV_TXFM_4X16_FN flipadst, identity, 12 + +cglobal iflipadst_4x16_internal_12, 0, 7, 14, dst, stride, c, eob, tx2 + call m(iadst_4x16_internal_12).main_pass1 + psrad m0, m3, 12 + psrad m1, m2, 12 + psrad m2, m5, 12 + psrad m3, m4, 12 + psrad m4, m7, 12 + psrad m5, m6, 12 + psrad m6, m9, 12 + psrad m7, m8, 12 + jmp tx2q +.pass2: + 
vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(iadst_4x16_internal_12).transpose_16x4 + call m(iadst_4x16_internal_10).main2 + pshufd m4, m3, q1032 + psrad m3, m5, 3 + psrad m5, m2, 3 + pshufd m2, m6, q1032 + pshufd m6, m1, q1032 + psrad m1, m7, 3 + psrad m7, m0, 3 + pshufd m0, m8, q1032 + REPX {psrad x, 3}, m0, m2, m4, m6 + jmp m(iadst_4x16_internal_12).pass2_end + +INV_TXFM_4X16_FN identity, dct2, 12 +INV_TXFM_4X16_FN identity, adst, 12 +INV_TXFM_4X16_FN identity, flipadst, 12 +INV_TXFM_4X16_FN identity, identity, 12 + +cglobal iidentity_4x16_internal_12, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m8, [vvc_pd_1697] + mova m0, [cq+32*0] + mova m4, [cq+32*1] + mova m1, [cq+32*2] + mova m5, [cq+32*3] + vpbroadcastd m9, [vvc_pd_6144] + pmulld m2, m8, m0 + pmulld m6, m8, m4 + pmulld m3, m8, m1 + pmulld m7, m8, m5 + mova m10, [cq+32*4] + mova m11, [cq+32*5] + mova m12, [cq+32*6] + mova m13, [cq+32*7] + REPX {paddd x, m9}, m2, m6, m3, m7 + REPX {psrad x, 12}, m2, m6, m3, m7 + paddd m0, m2 + pmulld m2, m8, m10 + paddd m4, m6 + pmulld m6, m8, m11 + paddd m1, m3 + pmulld m3, m8, m12 + paddd m5, m7 + pmulld m7, m8, m13 + REPX {psrad x, 1 }, m0, m4, m1, m5 + REPX {paddd x, m9}, m2, m6, m3, m7 + REPX {psrad x, 12}, m2, m6, m3, m7 + paddd m2, m10 + paddd m6, m11 + paddd m3, m12 + paddd m7, m13 + REPX {psrad x, 1 }, m2, m6, m3, m7 + jmp tx2q +.pass2: + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + vpbroadcastd m8, [vvc_pd_5793] + vpbroadcastd m9, [vvc_pd_1024] + REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7 + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + vpbroadcastd 
m8, [vvc_pw_16384] + vpbroadcastd m4, [pixel_12_max] + call m(iidentity_4x16_internal_10).pass2_end + RET + +%macro INV_TXFM_8X4_FN 2-3 10 ; type1, type2, bitdepth + INV_TXFM_FN %1, %2, 0, 8x4, %3 +%ifidn %1_%2, dct2_dct2 + vpbroadcastd m2, [dconly_%3] +%if %3 = 10 +.dconly: + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 4 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 128 + sar r6d, 8 + jmp m(vvc_inv_dct2_dct2_8x8_10).dconly3 +%else + jmp m(vvc_inv_dct2_dct2_8x4_10).dconly +%endif +%endif +%endmacro + +INV_TXFM_8X4_FN dct2, dct2 +INV_TXFM_8X4_FN dct2, identity +INV_TXFM_8X4_FN dct2, adst +INV_TXFM_8X4_FN dct2, flipadst + +cglobal idct2_8x4_internal_10, 0, 7, 10, dst, stride, c, eob, tx2 + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] +.pass1: + vbroadcasti128 m1, [cq+16*1] + vbroadcasti128 m0, [cq+16*5] + vbroadcasti128 m2, [cq+16*3] + vbroadcasti128 m3, [cq+16*7] + vpbroadcastd m6, [vvc_pd_64] + shufpd m1, m0, 0x0c ; 1 5 + shufpd m3, m2, 0x0c ; 7 3 + vbroadcasti128 m0, [cq+16*0] + vbroadcasti128 m4, [cq+16*2] + vbroadcasti128 m2, [cq+16*4] + vbroadcasti128 m5, [cq+16*6] + vpbroadcastd m7, [vvc_pd_2048] + shufpd m0, m4, 0x0c ; 0 2 + shufpd m2, m5, 0x0c ; 4 6 + REPX {pmulld x, m6}, m1, m3, m0, m2 + REPX {paddd x, m7}, m1, m3, m0, m2 + REPX {psrad x, 12}, m1, m3, m0, m2 + call .main + psubd m3, m0, m4 ; out7 out6 (interleaved) + paddd m0, m4 ; out0 out1 (interleaved) + paddd m1, m2, m5 ; out3 out2 (interleaved) + psubd m2, m5 ; out4 out5 (interleaved) + pshufd m1, m1, q1032 + pshufd m3, m3, q1032 + jmp tx2q +.pass2: + vbroadcasti128 m4, [deint_shuf] + packssdw m0, m1 + packssdw m2, m3 + vperm2i128 m1, m0, m2, 0x31 + vinserti128 m0, xm2, 1 + pshufb m0, m4 + pshufb m1, m4 + IDCT2_4_1D_PACKED_WORD 0, 1, 2, 3, 4, 7 + vpermq m0, m0, q3120 ; out0 out1 + vpermq m2, m1, q2031 ; out2 out3 + jmp m(iadst_8x4_internal_10).end +ALIGN function_align +.main: + ITX_MULSUB_2D 1, 3, 4, 5, 6, 7, 18_75, 89_50, 1 + IDCT2_4_1D_PACKED 0, 2, 4, 5, 6, 7 + 
vpbroadcastd m6, [vvc_pd_64] + punpcklqdq m4, m1, m3 ; t4a t7a + punpckhqdq m1, m3 ; t5a t6a + psubd m3, m4, m1 ; t5a t6a + paddd m4, m1 ; t4 t7 + REPX {pmaxsd x, m8}, m3, m4, m0, m2 + REPX {pminsd x, m9}, m3, m4, m0, m2 + pmulld m3, m6 + pshufd m1, m3, q1032 + paddd m3, m7 + psubd m5, m3, m1 + paddd m1, m3 + psrad m5, 12 + psrad m1, 12 + vpblendd m5, m4, 0x33 ; t4 t5 + punpckhqdq m4, m1 ; t7 t6 + ret + +INV_TXFM_8X4_FN adst, dct2 +INV_TXFM_8X4_FN adst, adst +INV_TXFM_8X4_FN adst, flipadst +INV_TXFM_8X4_FN adst, identity + +cglobal iadst_8x4_internal_10, 0, 7, 10, dst, stride, c, eob, tx2 + call m(iadst_4x8_internal_10).main + vpblendd m3, m0, m4, 0x33 ; out6 out7 + vpblendd m0, m4, 0xcc ; out0 out1 + pshufd m1, m5, q1032 + psignd m2, m6 ; out4 out5 + psignd m1, m6 ; out2 out3 + jmp tx2q +.pass2: + call .pass2_main + vpermq m0, m0, q3120 ; out0 out1 + vpermq m2, m1, q3120 ; out2 out3 +.end: + vpbroadcastd m1, [vvc_pw_2048] + pmulhrsw m0, m1 + pmulhrsw m1, m2 + vpbroadcastd m5, [pixel_10_max] +.end2: + mova xm2, [dstq+strideq*0] + vinserti128 m2, [dstq+strideq*1], 1 + lea r6, [dstq+strideq*2] + mova xm3, [r6 +strideq*0] + vinserti128 m3, [r6 +strideq*1], 1 + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m4 + pmaxsw m1, m4 + pminsw m0, m5 + pminsw m1, m5 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + mova [r6 +strideq*0], xm1 + vextracti128 [r6 +strideq*1], m1, 1 + RET +ALIGN function_align +.pass2_main: + vbroadcasti128 m4, [deint_shuf] + packssdw m0, m1 + packssdw m2, m3 + lea r6, [deint_shuf+128] + vperm2i128 m1, m0, m2, 0x31 + vinserti128 m0, xm2, 1 + pshufb m0, m4 + pshufb m1, m4 + jmp m(iadst_8x4_internal_8).main +ALIGN function_align +.main: + vpbroadcastd m1, [vvc_pd_64] + pmulld m0, m1, [cq+32*0] + pmulld m3, m1, [cq+32*3] + pmulld m2, m1, [cq+32*2] + pmulld m1, [cq+32*1] + vpbroadcastd m4, [vvc_pd_2048] + REPX {paddd x, m4}, m0, m3, m2, m1 + REPX {psrad x, 12}, m0, m3, m2, m1 +.main2: + 
IADST4_1D + ret + +INV_TXFM_8X4_FN flipadst, dct2 +INV_TXFM_8X4_FN flipadst, adst +INV_TXFM_8X4_FN flipadst, flipadst +INV_TXFM_8X4_FN flipadst, identity + +cglobal iflipadst_8x4_internal_10, 0, 5, 10, dst, stride, c, eob, tx2 + call m(iadst_4x8_internal_10).main + shufpd m3, m4, m0, 0x05 + shufpd m0, m4, 0x05 + psignd m2, m6 + pshufd m6, m6, q1032 + pshufd m1, m2, q1032 + psignd m2, m5, m6 + jmp tx2q +.pass2: + call m(iadst_8x4_internal_10).pass2_main + vpermq m2, m0, q2031 + vpermq m0, m1, q2031 + jmp m(iadst_8x4_internal_10).end + +INV_TXFM_8X4_FN identity, dct2 +INV_TXFM_8X4_FN identity, adst +INV_TXFM_8X4_FN identity, flipadst +INV_TXFM_8X4_FN identity, identity + +cglobal iidentity_8x4_internal_10, 0, 7, 10, dst, stride, c, eob, tx2 +.pass1: + vpbroadcastd m4, [vvc_pd_64] + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpermq m2, [cq+32*2], q3120 + vpermq m3, [cq+32*3], q3120 + vpbroadcastd m7, [vvc_pd_2048] + REPX {pmulld x, m4}, m0, m1, m2, m3 + REPX {paddd x, m7}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + REPX {paddd x, x }, m0, m1, m2, m3 + jmp tx2q +.pass2: + vpbroadcastd m5, [pixel_10_max] + vpbroadcastd m4, [vvc_pw_1697x8] + packssdw m0, m1 + packssdw m2, m3 + pmulhrsw m1, m4, m0 + pmulhrsw m4, m2 + paddsw m0, m1 + paddsw m2, m4 + packssdw m7, m7 ; vvc_pw_2048 +.pass2_end: + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + lea r6, [dstq+strideq*2] + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + pmulhrsw m2, m7 + pmulhrsw m0, m7 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + mova xm2, [dstq+strideq*0] + vinserti128 m2, [r6 +strideq*0], 1 + mova xm3, [dstq+strideq*1] + vinserti128 m3, [r6 +strideq*1], 1 + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m4 + pmaxsw m1, m4 + pminsw m0, m5 + pminsw m1, m5 + mova [dstq+strideq*0], xm0 + mova [dstq+strideq*1], xm1 + vextracti128 [r6 +strideq*0], m0, 1 + vextracti128 [r6 +strideq*1], m1, 1 + RET + +INV_TXFM_8X4_FN dct2, dct2, 12 +INV_TXFM_8X4_FN dct2, 
identity, 12 +INV_TXFM_8X4_FN dct2, adst, 12 +INV_TXFM_8X4_FN dct2, flipadst, 12 + +cglobal idct2_8x4_internal_12, 0, 7, 10, dst, stride, c, eob, tx2 + vpbroadcastd m8, [clip_20b_min] + vpbroadcastd m9, [clip_20b_max] + jmp m(idct2_8x4_internal_10).pass1 +.pass2: + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] + REPX {pmaxsd x, m8}, m0, m1, m2, m3 + REPX {pminsd x, m9}, m0, m1, m2, m3 + call m(iadst_8x4_internal_12).transpose_4x8 + IDCT2_4_1D 0, 1, 2, 3, 4, 5, 6, 7 + jmp m(iadst_8x4_internal_12).end + +INV_TXFM_8X4_FN adst, dct2, 12 +INV_TXFM_8X4_FN adst, adst, 12 +INV_TXFM_8X4_FN adst, flipadst, 12 +INV_TXFM_8X4_FN adst, identity, 12 + +cglobal iadst_8x4_internal_12, 0, 7, 10, dst, stride, c, eob, tx2 + vpbroadcastd m8, [clip_20b_min] + vpbroadcastd m9, [clip_20b_max] + call m(iadst_4x8_internal_10).main2 + vpblendd m3, m0, m4, 0x33 ; out6 out7 + vpblendd m0, m4, 0xcc ; out0 out1 + pshufd m1, m5, q1032 + psignd m2, m6 ; out4 out5 + psignd m1, m6 ; out2 out3 + jmp tx2q +.pass2: + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] + REPX {pmaxsd x, m8}, m0, m1, m2, m3 + REPX {pminsd x, m9}, m0, m1, m2, m3 + call .pass2_main + vpbroadcastd m5, [vvc_pd_2048] + paddd m0, m5, m4 + paddd m1, m5, m6 + paddd m2, m5 + paddd m3, m5 +.pass2_end: + REPX {psrad x, 12}, m0, m1, m2, m3 +.end: + vpbroadcastd m4, [vvc_pw_16384] + REPX {psrad x, 3}, m0, m1, m2, m3 + packssdw m0, m1 + packssdw m2, m3 + pmulhrsw m0, m4 + pmulhrsw m1, m2, m4 + vpermq m0, m0, q3120 ; out0 out1 + vpermq m1, m1, q3120 ; out2 out3 + vpbroadcastd m5, [pixel_12_max] + jmp m(iadst_8x4_internal_10).end2 +ALIGN function_align +.pass2_main: + call .transpose_4x8 + jmp m(iadst_8x4_internal_10).main2 +ALIGN function_align +.transpose_4x8: + ; deinterleave + pshufd m0, m0, q3120 + pshufd m1, m1, q3120 + pshufd m2, m2, q3120 + pshufd m3, m3, q3120 + ; transpose + punpcklqdq m4, m0, m1 + punpckhqdq m0, m1 + punpcklqdq m5, m2, m3 + punpckhqdq m2, m3 + vperm2i128 m1, m0, m2, 0x20 ; 
out1 + vperm2i128 m3, m0, m2, 0x31 ; out3 + vperm2i128 m2, m4, m5, 0x31 ; out2 + vperm2i128 m0, m4, m5, 0x20 ; out0 + ret + +INV_TXFM_8X4_FN flipadst, dct2, 12 +INV_TXFM_8X4_FN flipadst, adst, 12 +INV_TXFM_8X4_FN flipadst, flipadst, 12 +INV_TXFM_8X4_FN flipadst, identity, 12 + +cglobal iflipadst_8x4_internal_12, 0, 5, 10, dst, stride, c, eob, tx2 + vpbroadcastd m8, [clip_20b_min] + vpbroadcastd m9, [clip_20b_max] + call m(iadst_4x8_internal_10).main2 + shufpd m3, m4, m0, 0x05 + shufpd m0, m4, 0x05 + psignd m2, m6 + pshufd m6, m6, q1032 + pshufd m1, m2, q1032 + psignd m2, m5, m6 + jmp tx2q +.pass2: + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] + REPX {pmaxsd x, m8}, m0, m1, m2, m3 + REPX {pminsd x, m9}, m0, m1, m2, m3 + call m(iadst_8x4_internal_12).pass2_main + vpbroadcastd m5, [vvc_pd_2048] + paddd m0, m5, m3 + paddd m1, m5, m2 + paddd m3, m5, m4 + paddd m2, m5, m6 + jmp m(iadst_8x4_internal_12).pass2_end + +INV_TXFM_8X4_FN identity, dct2, 12 +INV_TXFM_8X4_FN identity, adst, 12 +INV_TXFM_8X4_FN identity, flipadst, 12 +INV_TXFM_8X4_FN identity, identity, 12 + +cglobal iidentity_8x4_internal_12, 0, 7, 10, dst, stride, c, eob, tx2 + jmp m(iidentity_8x4_internal_10).pass1 +.pass2: + ; m0 = in0 in1 (interleaved) + ; m1 = in2 in3 (interleaved) + ; m2 = in4 in5 (interleaved) + ; m3 = in6 in7 (interleaved) + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] + REPX {pmaxsd x, m8}, m0, m1, m2, m3 + REPX {pminsd x, m9}, m0, m1, m2, m3 + vpbroadcastd m4, [vvc_pd_5793] + REPX {pmulld x, m4}, m0, m1, m2, m3 + REPX {paddd x, m7}, m0, m1, m2, m3 + REPX {psrad x, 15}, m0, m1, m2, m3 + vpbroadcastd m5, [pixel_12_max] + vpbroadcastd m7, [vvc_pw_16384] + packssdw m0, m1 + packssdw m2, m3 + jmp m(iidentity_8x4_internal_10).pass2_end + +%macro INV_TXFM_8X8_FN 2-3 10 ; type1, type2, bitdepth + INV_TXFM_FN %1, %2, 0, 8x8, %3 +%ifidn %1_%2, dct2_dct2 + vpbroadcastd m2, [dconly_%3] +%if %3 = 10 +.dconly: + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or 
r3d, 8 +.dconly2: + add r6d, 384 + sar r6d, 9 +.dconly3: + imul r6d, 181 + add r6d, 2176 + sar r6d, 12 + movd xm0, r6d + paddsw xm0, xm2 + vpbroadcastw m0, xm0 +.dconly_loop: + mova xm1, [dstq+strideq*0] + vinserti128 m1, [dstq+strideq*1], 1 + paddsw m1, m0 + psubusw m1, m2 + mova [dstq+strideq*0], xm1 + vextracti128 [dstq+strideq*1], m1, 1 + lea dstq, [dstq+strideq*2] + sub r3d, 2 + jg .dconly_loop + RET +%else + jmp m(vvc_inv_dct2_dct2_8x8_10).dconly +%endif +%endif +%endmacro + +%macro IADST8_1D 14 ; src[1-8], tmp[1-3], vvc_pd_2048, clip[1-2] + ITX_MULSUB_2D %8, %1, %9, %10, %11, %12, 9, 90 ; t1a, t0a + ITX_MULSUB_2D %2, %7, %9, %10, %11, %12, 87, 25 ; t7a, t6a + ITX_MULSUB_2D %6, %3, %9, %10, %11, %12, 43, 80 ; t3a, t2a + ITX_MULSUB_2D %4, %5, %9, %10, %11, %12, 70, 57 ; t5a, t4a + psubd m%9, m%3, m%7 ; t6 + paddd m%3, m%7 ; t2 + psubd m%7, m%1, m%5 ; t4 + paddd m%1, m%5 ; t0 + psubd m%5, m%6, m%2 ; t7 + paddd m%6, m%2 ; t3 + psubd m%2, m%8, m%4 ; t5 + paddd m%8, m%4 ; t1 + REPX {pmaxsd x, m%13}, m%7, m%2, m%9, m%5, m%3, m%1, m%6, m%8 + REPX {pminsd x, m%14}, m%7, m%2, m%9, m%5, m%3, m%1, m%6, m%8 + ITX_MULSUB_2D %7, %2, %4, %10, %11, %12, 36, 83 ; t5a, t4a + ITX_MULSUB_2D %5, %9, %4, %10, %11, %12, 83, %11, 0x08 ; t6a, t7a + psubd m%10, m%7, m%9 ; t7 + paddd m%7, m%9 ; out6 + vpbroadcastd m%9, [vvc_pd_1448] + psubd m%4, m%8, m%6 ; t3 + paddd m%8, m%6 ; -out7 + psubd m%6, m%1, m%3 ; t2 + paddd m%1, m%3 ; out0 + psubd m%3, m%2, m%5 ; t6 + paddd m%2, m%5 ; -out1 + REPX {pmaxsd x, m%13}, m%6, m%4, m%3, m%10 + REPX {pminsd x, m%14}, m%6, m%4, m%3, m%10 + REPX {pmulld x, m%9 }, m%6, m%4, m%3, m%10 + psubd m%5, m%6, m%4 ; (t2 - t3) * 1448 + paddd m%4, m%6 ; (t2 + t3) * 1448 + psubd m%6, m%3, m%10 ; (t6 - t7) * 1448 + paddd m%3, m%10 ; (t6 + t7) * 1448 +%endmacro + +INV_TXFM_8X8_FN dct2, dct2 +INV_TXFM_8X8_FN dct2, identity +INV_TXFM_8X8_FN dct2, adst +INV_TXFM_8X8_FN dct2, flipadst + +cglobal idct2_8x8_internal_10, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd 
m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] +.pass1: + mova m0, [cq+32*0] + mova m1, [cq+32*1] + mova m2, [cq+32*2] + mova m3, [cq+32*3] + mova m4, [cq+32*4] + mova m5, [cq+32*5] + mova m6, [cq+32*6] + mova m7, [cq+32*7] + vpbroadcastd m11, [vvc_pd_2048] + call .main + call .round_shift1 + jmp tx2q +.pass2: + call .transpose_8x8_packed + call m(idct2_8x8_internal_8).main + vpbroadcastd m12, [vvc_pw_2048] + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + vpermq m2, m2, q3120 + vpermq m3, m3, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call .write_8x4_start + pmulhrsw m0, m2, m12 + pmulhrsw m1, m3, m12 + call .write_8x4 + RET +ALIGN function_align +.write_8x4_start: + vpbroadcastd m11, [pixel_10_max] + lea r6, [strideq*3] + pxor m10, m10 +.write_8x4: + mova xm8, [dstq+strideq*0] + vinserti128 m8, [dstq+strideq*1], 1 + mova xm9, [dstq+strideq*2] + vinserti128 m9, [dstq+r6 ], 1 + mova [cq+32*0], m10 + mova [cq+32*1], m10 + mova [cq+32*2], m10 + mova [cq+32*3], m10 + add cq, 32*4 + paddw m0, m8 + paddw m1, m9 + pmaxsw m0, m10 + pmaxsw m1, m10 + pminsw m0, m11 + pminsw m1, m11 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], xm1 + vextracti128 [dstq+r6 ], m1, 1 + lea dstq, [dstq+strideq*4] + ret +ALIGN function_align +.transpose_8x8_packed: + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + lea r6, [deint_shuf+128] + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckhdq m2, m4, m1 + punpckldq m4, m1 + vinserti128 m1, m3, xm2, 1 + vperm2i128 m3, m2, 0x31 + vperm2i128 m2, m0, m4, 0x31 + vinserti128 m0, xm4, 1 + ret +ALIGN function_align +.main_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 +.main: + ITX_MULSUB_2D 5, 3, 8, 9, 10, 11, 75, 50 ; t5a t6a + ITX_MULSUB_2D 1, 7, 8, 9, 10, 11, 18, 89 ; t4a t7a + ITX_MULSUB_2D 2, 6, 8, 9, 10, 11, 36, 83 ; t2 
t3 + paddd m8, m1, m5 ; t4 + psubd m1, m5 ; t5a + paddd m9, m7, m3 ; t7 + psubd m7, m3 ; t6a + vpbroadcastd m3, [vvc_pd_64] + REPX {pmaxsd x, m12}, m1, m8, m7, m9 + REPX {pminsd x, m13}, m1, m8, m7, m9 + REPX {pmulld x, m3 }, m0, m4, m7, m1 + paddd m0, m11 + paddd m7, m11 + psubd m5, m0, m4 + paddd m0, m4 + psubd m4, m7, m1 + paddd m7, m1 + REPX {psrad x, 12 }, m5, m0, m4, m7 + psubd m3, m0, m6 ; dct4 out3 + paddd m0, m6 ; dct4 out0 + paddd m6, m5, m2 ; dct4 out1 + psubd m5, m2 ; dct4 out2 + REPX {pmaxsd x, m12}, m0, m6, m5, m3 + REPX {pminsd x, m13}, m0, m6, m5, m3 + ret +ALIGN function_align +.round_shift1: + pcmpeqd m1, m1 + REPX {psubd x, m1}, m0, m6, m5, m3 + paddd m1, m6, m7 ; out1 + psubd m6, m7 ; out6 + psubd m7, m0, m9 ; out7 + paddd m0, m9 ; out0 + paddd m2, m5, m4 ; out2 + psubd m5, m4 ; out5 + psubd m4, m3, m8 ; out4 + paddd m3, m8 ; out3 + REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7 + ret + +INV_TXFM_8X8_FN adst, dct2 +INV_TXFM_8X8_FN adst, adst +INV_TXFM_8X8_FN adst, flipadst +INV_TXFM_8X8_FN adst, identity + +cglobal iadst_8x8_internal_10, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] +.pass1: + call .main + call .main_end + jmp tx2q +.pass2: + call m(idct2_8x8_internal_10).transpose_8x8_packed + pshufd m4, m0, q1032 + pshufd m5, m1, q1032 + call m(iadst_8x8_internal_8).main_pass2 + vpbroadcastd m5, [vvc_pw_2048] + vpbroadcastd xm12, [vvc_pw_64] + psubw m12, m5 + REPX {vpermq x, x, q3120}, m0, m1, m2, m3 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct2_8x8_internal_10).write_8x4_start + pmulhrsw m0, m2, m12 + pmulhrsw m1, m3, m12 + call m(idct2_8x8_internal_10).write_8x4 + RET +ALIGN function_align +.main: + mova m0, [cq+32*0] + mova m7, [cq+32*7] + mova m1, [cq+32*1] + mova m6, [cq+32*6] + mova m2, [cq+32*2] + mova m5, [cq+32*5] + mova m3, [cq+32*3] + mova m4, [cq+32*4] + vpbroadcastd m11, [vvc_pd_2048] +.main2: + IADST8_1D 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 + psrld m8, 
10 ; vvc_pd_1 + vpbroadcastd m9, [vvc_pd_3072] + ret +ALIGN function_align +.main_end: + paddd m0, m8 + psubd m1, m8, m1 + paddd m6, m8 + psubd m7, m8, m7 + REPX {psrad x, 1 }, m0, m1, m6, m7 + ; (1 + ((x + 1024) >> 11)) >> 1 = (3072 + x) >> 12 + ; (1 - ((x + 1024) >> 11)) >> 1 = (3071 - x) >> 12 + psubd m8, m9, m8 ; vvc_pd_3071 + paddd m2, m9 + psubd m3, m8, m3 + paddd m4, m9 + psubd m5, m8, m5 + REPX {psrad x, 12}, m2, m3, m4, m5 + ret + +INV_TXFM_8X8_FN flipadst, dct2 +INV_TXFM_8X8_FN flipadst, adst +INV_TXFM_8X8_FN flipadst, flipadst +INV_TXFM_8X8_FN flipadst, identity + +cglobal iflipadst_8x8_internal_10, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] +.pass1: + call m(iadst_8x8_internal_10).main + call .main_end + jmp tx2q +.pass2: + call m(idct2_8x8_internal_10).transpose_8x8_packed + pshufd m4, m0, q1032 + pshufd m5, m1, q1032 + call m(iadst_8x8_internal_8).main_pass2 + vpbroadcastd m12, [vvc_pw_2048] + vpbroadcastd xm5, [vvc_pw_64] + psubw m12, m5 + vpermq m8, m3, q2031 + vpermq m9, m2, q2031 + vpermq m2, m1, q2031 + vpermq m3, m0, q2031 + pmulhrsw m0, m8, m12 + pmulhrsw m1, m9, m12 + call m(idct2_8x8_internal_10).write_8x4_start + pmulhrsw m0, m2, m12 + pmulhrsw m1, m3, m12 + call m(idct2_8x8_internal_10).write_8x4 + RET +ALIGN function_align +.main_end: + paddd m10, m8, m0 + psubd m0, m8, m7 + psubd m7, m8, m1 + paddd m1, m8, m6 + psrad m0, 1 + psrad m1, 1 + psrad m6, m7, 1 + psrad m7, m10, 1 + psubd m8, m9, m8 ; vvc_pd_3071 + psubd m10, m8, m5 + paddd m5, m9, m2 + psubd m2, m8, m3 + paddd m3, m9, m4 + psrad m4, m2, 12 + psrad m2, m10, 12 + psrad m3, 12 + psrad m5, 12 + ret + +INV_TXFM_8X8_FN identity, dct2 +INV_TXFM_8X8_FN identity, adst +INV_TXFM_8X8_FN identity, flipadst +INV_TXFM_8X8_FN identity, identity + +cglobal iidentity_8x8_internal_10, 0, 7, 14, dst, stride, c, eob, tx2 +.pass1: + mova m0, [cq+32*0] + mova m1, [cq+32*1] + mova m2, [cq+32*2] + mova m3, [cq+32*3] + mova m4, [cq+32*4] +
mova m5, [cq+32*5] + mova m6, [cq+32*6] + mova m7, [cq+32*7] + jmp tx2q +.pass2: + packssdw m3, m7 + vpbroadcastd m7, [pixel_10_max] +.pass2_main: + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + vpbroadcastd m12, [vvc_pw_64] + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckldq m2, m4, m1 + punpckhdq m4, m1 + punpckhqdq m1, m0, m2 ; 1 5 + punpcklqdq m0, m2 ; 0 4 + punpcklqdq m2, m3, m4 ; 2 6 + punpckhqdq m3, m4 ; 3 7 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call .write_2x8x2_start + pmulhrsw m0, m2, m12 + pmulhrsw m1, m3, m12 + call .write_2x8x2_zero + RET +.write_2x8x2_start: + lea r6, [strideq*5] + pxor m6, m6 +.write_2x8x2_zero: + mova [cq+32*0], m6 + mova [cq+32*1], m6 + mova [cq+32*2], m6 + mova [cq+32*3], m6 + add cq, 32*4 +.write_2x8x2: + mova xm4, [dstq+strideq*0] + vinserti128 m4, [dstq+strideq*4], 1 + mova xm5, [dstq+strideq*1] + vinserti128 m5, [dstq+r6 ], 1 + paddw m0, m4 + paddw m1, m5 + pmaxsw m0, m6 + pmaxsw m1, m6 + pminsw m0, m7 + pminsw m1, m7 + mova [dstq+strideq*0], xm0 + mova [dstq+strideq*1], xm1 + vextracti128 [dstq+strideq*4], m0, 1 + vextracti128 [dstq+r6 ], m1, 1 + lea dstq, [dstq+strideq*2] + ret + +%macro TRANSPOSE_8X8_DWORD 12 ; src/dst[1-8], tmp[1-4] + punpckldq m%9, m%1, m%2 ; aibj emfn + punpckhdq m%1, m%2 ; ckdl gohp + punpckldq m%10, m%3, m%4 ; qyrz uCvD + punpckhdq m%3, m%4 ; sAtB wExF + punpckldq m%11, m%5, m%6 ; GOHP KSLT + punpckhdq m%5, m%6 ; IQJR MUNV + punpckldq m%12, m%7, m%8 ; WeXf aibj + punpckhdq m%7, m%8 ; YgZh ckdl + punpcklqdq m%2, m%9, m%10 ; aiqy emuC + punpckhqdq m%9, m%10 ; bjrz fnvD + punpcklqdq m%4, m%1, m%3 ; cksA gowE + punpckhqdq m%10, m%1, m%3 ; dltB hpxF + punpcklqdq m%6, m%11, m%12 ; GOWe KSai + punpckhqdq m%11, m%12 ; HPXf LTbj + punpcklqdq m%8, m%5, m%7 ; IQYg MUck + punpckhqdq m%12, m%5, m%7 ; JRZh NVdl + vperm2i128 m%1, m%2, m%6, 0x20 ; out0 + vperm2i128 m%5, m%2, m%6, 0x31 ; out4 + vperm2i128 m%2, m%9, m%11, 
0x20 ; out1 + vperm2i128 m%6, m%9, m%11, 0x31 ; out5 + vperm2i128 m%3, m%4, m%8, 0x20 ; out2 + vperm2i128 m%7, m%4, m%8, 0x31 ; out6 + vperm2i128 m%4, m%10, m%12, 0x20 ; out3 + vperm2i128 m%8, m%10, m%12, 0x31 ; out7 +%endmacro + +INV_TXFM_8X8_FN dct2, dct2, 12 +INV_TXFM_8X8_FN dct2, identity, 12 +INV_TXFM_8X8_FN dct2, adst, 12 +INV_TXFM_8X8_FN dct2, flipadst, 12 + +cglobal idct2_8x8_internal_12, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + jmp m(idct2_8x8_internal_10).pass1 +.pass2: + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call .transpose_8x8 + vpbroadcastd m11, [vvc_pd_2048] + call m(idct2_8x8_internal_10).main + call .round_shift4 + jmp m(iadst_8x8_internal_12).pass2_end +ALIGN function_align +.write_8x4_start: + vpbroadcastd m11, [pixel_12_max] + lea r6, [strideq*3] + pxor m10, m10 + ret +ALIGN function_align +.transpose_8x8: + TRANSPOSE_8X8_DWORD 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 + ret +ALIGN function_align +.round_shift4: + vpbroadcastd m1, [vvc_pd_8] + REPX {paddd x, m1}, m0, m6, m5, m3 + paddd m1, m6, m7 ; out1 + psubd m6, m7 ; out6 + psubd m7, m0, m9 ; out7 + paddd m0, m9 ; out0 + paddd m2, m5, m4 ; out2 + psubd m5, m4 ; out5 + psubd m4, m3, m8 ; out4 + paddd m3, m8 ; out3 + REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7 + ret + +INV_TXFM_8X8_FN adst, dct2, 12 +INV_TXFM_8X8_FN adst, adst, 12 +INV_TXFM_8X8_FN adst, flipadst, 12 +INV_TXFM_8X8_FN adst, identity, 12 + +cglobal iadst_8x8_internal_12, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + jmp m(iadst_8x8_internal_10).pass1 +.pass2: + call .pass2_main +.pass2_end: + packssdw m0, m1 + packssdw m1, m2, m3 + REPX {vpermq x, x, q3120}, m0, m1 + call m(idct2_8x8_internal_12).write_8x4_start + call m(idct2_8x8_internal_10).write_8x4 + packssdw 
m0, m4, m5 + packssdw m1, m6, m7 + REPX {vpermq x, x, q3120}, m0, m1 + call m(idct2_8x8_internal_10).write_8x4 + RET +ALIGN function_align +.pass2_main: + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct2_8x8_internal_12).transpose_8x8 + vpbroadcastd m11, [vvc_pd_2048] +.pass2_main2: + call m(iadst_8x8_internal_10).main2 + pslld m9, m8, 3 ; vvc_pd_8 + paddd m0, m9 + psubd m1, m9, m1 ; 8+x + paddd m6, m9 + psubd m7, m9, m7 + REPX {psrad x, 4}, m0, m1, m6, m7 + vpbroadcastd m9, [vvc_pd_17408] + psubd m8, m9, m8 ; 17407 + paddd m2, m9 + psubd m3, m8, m3 + paddd m4, m9 + psubd m5, m8, m5 + REPX {psrad x, 15}, m2, m3, m4, m5 + ret + +INV_TXFM_8X8_FN flipadst, dct2, 12 +INV_TXFM_8X8_FN flipadst, adst, 12 +INV_TXFM_8X8_FN flipadst, flipadst, 12 +INV_TXFM_8X8_FN flipadst, identity, 12 + +cglobal iflipadst_8x8_internal_12, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + jmp m(iflipadst_8x8_internal_10).pass1 +.pass2: + call m(iadst_8x8_internal_12).pass2_main + packssdw m7, m7, m6 + packssdw m6, m1, m0 + packssdw m1, m5, m4 + vpermq m0, m7, q3120 + vpermq m1, m1, q3120 + call m(idct2_8x8_internal_12).write_8x4_start + call m(idct2_8x8_internal_10).write_8x4 + packssdw m0, m3, m2 + vpermq m0, m0, q3120 + vpermq m1, m6, q3120 + call m(idct2_8x8_internal_10).write_8x4 + RET + +INV_TXFM_8X8_FN identity, dct2, 12 +INV_TXFM_8X8_FN identity, adst, 12 +INV_TXFM_8X8_FN identity, flipadst, 12 +INV_TXFM_8X8_FN identity, identity, 12 + +cglobal iidentity_8x8_internal_12, 0, 7, 14, dst, stride, c, eob, tx2 + jmp m(iidentity_8x8_internal_10).pass1 +.pass2: + packssdw m3, m7 + vpbroadcastd m7, [pixel_12_max] + jmp m(iidentity_8x8_internal_10).pass2_main + +%macro INV_TXFM_8X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth + INV_TXFM_FN %1, %2, %3, 8x16, %4 +%ifidn %1_%2, dct2_dct2 + imul 
r6d, [cq], 181 + vpbroadcastd m2, [dconly_%4] + mov [cq], eobd ; 0 + or r3d, 16 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + jmp m(vvc_inv_dct2_dct2_8x8_10).dconly2 +%endif +%endmacro + +INV_TXFM_8X16_FN dct2, dct2 +INV_TXFM_8X16_FN dct2, identity, 35 +INV_TXFM_8X16_FN dct2, adst +INV_TXFM_8X16_FN dct2, flipadst + +cglobal idct2_8x16_internal_10, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] +.pass1: + vpbroadcastd m14, [vvc_pd_64] + vpbroadcastd m11, [vvc_pd_2048] + cmp eobd, 43 + jl .fast + add cq, 32 + call .pass1_main + sub cq, 32 + mova [cq+32* 1], m0 + mova [cq+32* 3], m1 + mova [cq+32* 5], m2 + mova [cq+32* 7], m3 + mova [cq+32* 9], m4 + mova [cq+32*11], m5 + mova [cq+32*13], m6 + mova m15, m7 + call .pass1_main + mova m8, [cq+32* 1] + mova m9, [cq+32* 3] + mova m10, [cq+32* 5] + mova m11, [cq+32* 7] + mova m12, [cq+32* 9] + mova m13, [cq+32*11] + mova m14, [cq+32*13] + jmp tx2q +.fast: + call .pass1_main + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: + call .transpose + call m(idct2_8x16_internal_8).main + vpbroadcastd m12, [vvc_pw_2048] + REPX {vpermq x, x, q3120}, m0, m2, m4, m6 + REPX {vpermq x, x, q2031}, m1, m3, m5, m7 +.end: + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct2_8x8_internal_10).write_8x4_start + pmulhrsw m0, m2, m12 + pmulhrsw m1, m3, m12 + call m(idct2_8x8_internal_10).write_8x4 + pmulhrsw m0, m4, m12 + pmulhrsw m1, m5, m12 + call m(idct2_8x8_internal_10).write_8x4 + pmulhrsw m0, m6, m12 + pmulhrsw m1, m7, m12 + call m(idct2_8x8_internal_10).write_8x4 + RET +ALIGN function_align +.transpose: + packssdw m0, m8 + packssdw m1, m9 + packssdw m2, m10 + packssdw m3, m11 + packssdw m4, m12 + packssdw m5, m13 + packssdw m6, m14 + packssdw m7, m15 + lea r6, [deint_shuf+128] + punpckhwd m8, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpcklwd m3, m4, m5 + punpckhwd m4, m5 + punpckhwd m5, m6, m7 + punpcklwd 
m6, m7 + punpckhdq m7, m3, m6 + punpckldq m3, m6 + punpckhdq m6, m4, m5 + punpckldq m4, m5 + punpckhdq m5, m8, m1 + punpckldq m8, m1 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + vperm2i128 m2, m0, m3, 0x31 + vinserti128 m0, xm3, 1 + vperm2i128 m3, m1, m7, 0x31 + vinserti128 m1, xm7, 1 + vperm2i128 m7, m5, m6, 0x31 + vinserti128 m5, xm6, 1 + vperm2i128 m6, m8, m4, 0x31 + vinserti128 m4, m8, xm4, 1 + ret +ALIGN function_align +.pass1_main: + pmulld m0, m14, [cq+32* 0] + pmulld m1, m14, [cq+32* 2] + pmulld m2, m14, [cq+32* 4] + pmulld m3, m14, [cq+32* 6] + pmulld m4, m14, [cq+32* 8] + pmulld m5, m14, [cq+32*10] + pmulld m6, m14, [cq+32*12] + pmulld m7, m14, [cq+32*14] + call m(idct2_8x8_internal_10).main_rect2 + jmp m(idct2_8x8_internal_10).round_shift1 +ALIGN function_align +.main_evenhalf: + paddd m1, m6, m7 ; idct2_8 out1 + psubd m6, m7 ; idct2_8 out6 + psubd m7, m0, m9 ; idct2_8 out7 + paddd m0, m9 ; idct2_8 out0 + paddd m2, m5, m4 ; idct2_8 out2 + psubd m5, m4 ; idct2_8 out5 + psubd m4, m3, m8 ; idct2_8 out4 + paddd m3, m8 ; idct2_8 out3 + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + ret +.main_oddhalf_fast_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 +.main_oddhalf_fast: ; lower half zero + vpbroadcastd m7, [vvc_pd_90] + vpbroadcastd m8, [vvc_pd_9] + vpbroadcastd m6, [vvc_pd_m25] + vpbroadcastd m9, [vvc_pd_87] + vpbroadcastd m5, [vvc_pd_80] + vpbroadcastd m10, [vvc_pd_43] + vpbroadcastd m4, [vvc_pd_m57] + vpbroadcastd m15, [vvc_pd_70] + pmulld m7, m0 + pmulld m0, m8 + pmulld m6, m1 + pmulld m1, m9 + pmulld m5, m2 + pmulld m2, m10 + pmulld m4, m3 + pmulld m3, m15 + jmp .main_oddhalf_fast2 +.main_oddhalf_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 +.main_oddhalf: + ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 9, 90 ; t8a, t15a + ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 87, 25 ; t11a, t12a + ITX_MULSUB_2D 2, 5, 8, 
9, 10, _, 43, 80 ; t10a, t13a + ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 70, 57 ; t9a, t14a +.main_oddhalf_fast2: + REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3 + REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3 + psubd m8, m0, m4 ; t9 + paddd m0, m4 ; t8 + psubd m4, m6, m2 ; t10 + paddd m2, m6 ; t11 + psubd m6, m1, m5 ; t13 + paddd m5, m1 ; t12 + psubd m1, m7, m3 ; t14 + paddd m7, m3 ; t15 + REPX {pmaxsd x, m12}, m8, m1, m4, m6, m0, m2, m5, m7 + REPX {pminsd x, m13}, m8, m1, m4, m6, m0, m2, m5, m7 + vpbroadcastd m15, [vvc_pd_83] + vpbroadcastd m10, [vvc_pd_36] + ITX_MULSUB_2D 1, 8, 3, 9, _, 11, 10, 15, 0xc + ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 0xe + psubd m3, m1, m4 ; t10 + paddd m1, m4 ; t9 + psubd m4, m0, m2 ; t11a + paddd m0, m2 ; t8a + psubd m2, m8, m6 ; t13 + paddd m6, m8 ; t14 + psubd m8, m7, m5 ; t12a + paddd m7, m5 ; t15a + REPX {pmaxsd x, m12}, m2, m8, m3, m4, m0, m1, m6, m7 + REPX {pminsd x, m13}, m2, m8, m3, m4, m0, m1, m6, m7 + REPX {pmulld x, m14}, m2, m8, m3, m4 + paddd m2, m11 + paddd m8, m11 + paddd m5, m2, m3 ; t13a + psubd m2, m3 ; t10a + psubd m3, m8, m4 ; t11 + paddd m4, m8 ; t12 + REPX {psrad x, 12}, m5, m2, m3, m4 + mova [r6-32*4], m7 + mova [r6-32*3], m6 + mova [r6-32*2], m5 + mova [r6-32*1], m4 + mova [r6+32*0], m3 + mova [r6+32*1], m2 + mova [r6+32*2], m1 + mova [r6+32*3], m0 + ret + +INV_TXFM_8X16_FN adst, dct2 +INV_TXFM_8X16_FN adst, adst +INV_TXFM_8X16_FN adst, flipadst +INV_TXFM_8X16_FN adst, identity, 35 + +cglobal iadst_8x16_internal_10, 0, 7, 16, dst, stride, c, eob, tx2 +%undef cmp + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] +.pass1: + vpbroadcastd m14, [vvc_pd_64] + vpbroadcastd m11, [vvc_pd_2048] + cmp eobd, 43 + jl .fast + add cq, 32 + call .pass1_main + call m(iadst_8x8_internal_10).main_end + sub cq, 32 + mova [cq+32* 1], m0 + mova [cq+32* 3], m1 + mova [cq+32* 5], m2 + mova [cq+32* 7], m3 + mova [cq+32* 9], m4 + mova [cq+32*11], m5 + mova [cq+32*13], m6 + mova m15, m7 + call .pass1_main + call 
m(iadst_8x8_internal_10).main_end + mova m8, [cq+32* 1] + mova m9, [cq+32* 3] + mova m10, [cq+32* 5] + mova m11, [cq+32* 7] + mova m12, [cq+32* 9] + mova m13, [cq+32*11] + mova m14, [cq+32*13] + jmp tx2q +.fast: + call .pass1_main + call m(iadst_8x8_internal_10).main_end + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: ; transpose, then reuse the iadst_8x16_internal_8 main and main_pass2_end routines + call m(idct2_8x16_internal_10).transpose + call m(iadst_8x16_internal_8).main + call m(iadst_8x16_internal_8).main_pass2_end + vpbroadcastd m8, [vvc_pw_2048] + vpbroadcastd xm12, [vvc_pw_64] + REPX {vpermq x, x, q2031}, m0, m1, m2, m3 + REPX {vpermq x, x, q3120}, m4, m5, m6, m7 + psubw m12, m8 + jmp m(idct2_8x16_internal_10).end +ALIGN function_align +.pass1_main: ; multiply eight rows at even 32-byte offsets (0 to 14) of cq by m14, add m11, shift right by 12, then tail-jump to iadst_8x8_internal_10.main2 + pmulld m0, m14, [cq+32* 0] + pmulld m7, m14, [cq+32*14] + pmulld m1, m14, [cq+32* 2] + pmulld m6, m14, [cq+32*12] + pmulld m2, m14, [cq+32* 4] + pmulld m5, m14, [cq+32*10] + pmulld m3, m14, [cq+32* 6] + pmulld m4, m14, [cq+32* 8] + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 + jmp m(iadst_8x8_internal_10).main2 + +INV_TXFM_8X16_FN flipadst, dct2 +INV_TXFM_8X16_FN flipadst, adst +INV_TXFM_8X16_FN flipadst, flipadst +INV_TXFM_8X16_FN flipadst, identity, 35 + +cglobal iflipadst_8x16_internal_10, 0, 7, 16, dst, stride, c, eob, tx2 ; pass 1 calls iadst_8x16_internal_10.pass1_main and finishes each half with iflipadst_8x8_internal_10.main_end +%undef cmp + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] +.pass1: + vpbroadcastd m14, [vvc_pd_64] + vpbroadcastd m11, [vvc_pd_2048] + cmp eobd, 43 + jl .fast + add cq, 32 + call m(iadst_8x16_internal_10).pass1_main + call m(iflipadst_8x8_internal_10).main_end + sub cq, 32 + mova [cq+32* 1], m0 + mova [cq+32* 3], m1 + mova [cq+32* 5], m2 + mova [cq+32* 7], m3 + mova [cq+32* 9], m4 + mova [cq+32*11], m5 + mova [cq+32*13], m6 + mova m15, m7 + call m(iadst_8x16_internal_10).pass1_main + call m(iflipadst_8x8_internal_10).main_end + mova m8, [cq+32* 1] + mova m9, [cq+32* 3] + mova m10, [cq+32* 5] + mova m11, [cq+32* 7] + mova m12, [cq+32* 9] +
mova m13, [cq+32*11] + mova m14, [cq+32*13] + jmp tx2q +.fast: + call m(iadst_8x16_internal_10).pass1_main + call m(iflipadst_8x8_internal_10).main_end + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: ; after the shared adst routines, m0-m7 are permuted and reloaded in reverse register order (new m0 from m7 ... new m7 from old m0) + call m(idct2_8x16_internal_10).transpose + call m(iadst_8x16_internal_8).main + call m(iadst_8x16_internal_8).main_pass2_end + vpbroadcastd m12, [vvc_pw_2048] + vpbroadcastd xm13, [vvc_pw_64] + mova m11, m0 + vpermq m0, m7, q2031 + mova m10, m1 + vpermq m1, m6, q2031 + mova m9, m2 + vpermq m2, m5, q2031 + mova m8, m3 + vpermq m3, m4, q2031 + vpermq m4, m8, q3120 + vpermq m5, m9, q3120 + vpermq m6, m10, q3120 + vpermq m7, m11, q3120 + psubw m12, m13 + jmp m(idct2_8x16_internal_10).end + +INV_TXFM_8X16_FN identity, dct2 +INV_TXFM_8X16_FN identity, adst +INV_TXFM_8X16_FN identity, flipadst +INV_TXFM_8X16_FN identity, identity + +%macro IDTX16 3-4 ; src/dst, tmp, vvc_pw_1697x16, [vvc_pw_16384] + pmulhrsw m%2, m%3, m%1 ; tmp = pmulhrsw(constant, src) +%if %0 == 4 ; if downshifting by 1 +%ifnum %4 + pmulhrsw m%2, m%4 +%else ; without rounding + psraw m%2, 1 +%endif +%else + paddsw m%1, m%1 +%endif + paddsw m%1, m%2 ; saturating add of tmp into src/dst +%endmacro + +cglobal iidentity_8x16_internal_10, 0, 7, 16, dst, stride, c, eob, tx2 +.pass1: ; multiply all 16 rows of cq by vvc_pd_64, add vvc_pd_2048, shift right by 12 + vpbroadcastd m15, [vvc_pd_64] + pmulld m0, m15, [cq+32* 0] + pmulld m8, m15, [cq+32* 1] + pmulld m1, m15, [cq+32* 2] + pmulld m9, m15, [cq+32* 3] + pmulld m2, m15, [cq+32* 4] + pmulld m10, m15, [cq+32* 5] + pmulld m3, m15, [cq+32* 6] + pmulld m11, m15, [cq+32* 7] + pmulld m4, m15, [cq+32* 8] + pmulld m12, m15, [cq+32* 9] + pmulld m5, m15, [cq+32*10] + pmulld m13, m15, [cq+32*11] + pmulld m6, m15, [cq+32*12] + pmulld m14, m15, [cq+32*13] + pmulld m7, m15, [cq+32*14] + pmulld m15, [cq+32*15] + mova [cq], m7 ; park m7 at [cq] so the register can hold the rounding constant; it is added back below + vpbroadcastd m7, [vvc_pd_2048] + REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6, \ + m8, m9, m10, m11, m12, m13, m14, m15 + paddd m7, [cq] + REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: +
packssdw m0, m8 + packssdw m1, m9 + packssdw m2, m10 + packssdw m3, m11 + packssdw m4, m12 + packssdw m5, m13 + packssdw m6, m14 + packssdw m13, m7, m15 + vpbroadcastd m8, [vvc_pw_1697x16] + REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 13 + vpbroadcastd m7, [pixel_10_max] + vpbroadcastd m12, [vvc_pw_2048] + call .pass2_end + RET +ALIGN function_align +.pass2_end: ; word, dword and qword unpacks over m0-m6 and m13, then pmulhrsw by m12 and output through the iidentity_8x8_internal_10 write_2x8x2 helpers + punpckhwd m9, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m6, m13 + punpcklwd m6, m13 + punpckhwd m13, m4, m5 + punpcklwd m4, m5 + punpcklwd m5, m2, m3 + punpckhwd m2, m3 + punpckhdq m3, m0, m5 + punpckldq m0, m5 + punpckhdq m11, m9, m2 + punpckldq m9, m2 + punpckldq m2, m4, m6 + punpckhdq m4, m6 + punpckldq m6, m13, m1 + punpckhdq m13, m1 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + punpcklqdq m8, m9, m6 + punpckhqdq m9, m6 + punpcklqdq m10, m11, m13 + punpckhqdq m11, m13 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(iidentity_8x8_internal_10).write_2x8x2_start + pmulhrsw m0, m12, m2 + pmulhrsw m1, m12, m3 + call m(iidentity_8x8_internal_10).write_2x8x2_zero + pmulhrsw m0, m12, m8 + pmulhrsw m1, m12, m9 + lea dstq, [dstq+strideq*4] + call m(iidentity_8x8_internal_10).write_2x8x2_zero + pmulhrsw m0, m12, m10 + pmulhrsw m1, m12, m11 + call m(iidentity_8x8_internal_10).write_2x8x2_zero + ret + +INV_TXFM_8X16_FN dct2, dct2, 0, 12 +INV_TXFM_8X16_FN dct2, identity, 35, 12 +INV_TXFM_8X16_FN dct2, adst, 0, 12 +INV_TXFM_8X16_FN dct2, flipadst, 0, 12 + +cglobal idct2_8x16_internal_12, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 ; 12-bit entry: m12/m13 = clip_20b_min/max, pass 1 is shared with idct2_8x16_internal_10 + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + jmp m(idct2_8x16_internal_10).pass1 +.pass2: ; transpose, then clamp every input to clip_18b_min/max before the shared 10-bit routines + lea r6, [rsp+32*4] + call .transpose + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + mova [cq+32* 8], m0 + mova [cq+32*10], m2 + mova [cq+32*12], m4 + mova [cq+32*14], m6 + pmaxsd m0, m12, [cq+32* 1] + pmaxsd m4, m12, m1 + pmaxsd m1, m12, [cq+32* 3] + pmaxsd m2, m12, [cq+32* 5] + pmaxsd m6, m12, m5 + pmaxsd
m5, m12, m3 + pmaxsd m3, m12, [cq+32* 7] + pmaxsd m7, m12 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + vpbroadcastd m11, [vvc_pd_2048] + vpbroadcastd m14, [vvc_pd_64] + call m(idct2_8x16_internal_10).main_oddhalf + pmaxsd m0, m12, [cq+32* 0] + pmaxsd m1, m12, [cq+32* 2] + pmaxsd m2, m12, [cq+32* 4] + pmaxsd m3, m12, [cq+32* 6] + pmaxsd m4, m12, [cq+32* 8] + pmaxsd m5, m12, [cq+32*10] + pmaxsd m6, m12, [cq+32*12] + pmaxsd m7, m12, [cq+32*14] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct2_8x8_internal_10).main + call m(idct2_8x16_internal_10).main_evenhalf + vpbroadcastd m11, [vvc_pd_8] + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct2_16x8_internal_10).pass1_rotations + REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 +.end: ; pack the 16 dword registers into 8 word registers and emit them through the write_8x4 helpers + packssdw m0, m1 + packssdw m1, m2, m3 + packssdw m2, m4, m5 + packssdw m3, m6, m7 + packssdw m4, m8, m9 + packssdw m5, m10, m11 + packssdw m6, m12, m13 + packssdw m7, m14, m15 + vpermq m0, m0, q3120 + vpermq m1, m1, q3120 + call m(idct2_8x8_internal_12).write_8x4_start + call m(idct2_8x8_internal_10).write_8x4 + vpermq m0, m2, q3120 + vpermq m1, m3, q3120 + call m(idct2_8x8_internal_10).write_8x4 + vpermq m0, m4, q3120 + vpermq m1, m5, q3120 + call m(idct2_8x8_internal_10).write_8x4 + vpermq m0, m6, q3120 + vpermq m1, m7, q3120 + call m(idct2_8x8_internal_10).write_8x4 + RET +ALIGN function_align +.transpose: ; m8-m11 are parked in cq rows 8-11 while m0-m7 go through transpose_8x8; the result is stored to cq rows 0-7, m8-m15 move down to m0-m7, and the second transpose_8x8 is a tail-jump + mova [cq+32* 8], m8 + mova [cq+32* 9], m9 + mova [cq+32*10], m10 + mova [cq+32*11], m11 + call m(idct2_8x8_internal_12).transpose_8x8 + mova [cq+32* 0], m0 + mova [cq+32* 1], m1 + mova [cq+32* 2], m2 + mova [cq+32* 3], m3 + mova [cq+32* 4], m4 + mova [cq+32* 5], m5 + mova [cq+32* 6], m6 + mova [cq+32* 7], m7 + mova m0, [cq+32* 8] + mova m1, [cq+32* 9] + mova m2, [cq+32*10] + mova m3, [cq+32*11] + mova m4, m12 + mova m5, m13 + mova m6, m14 + mova m7, m15 + jmp m(idct2_8x8_internal_12).transpose_8x8 + +INV_TXFM_8X16_FN adst, dct2, 0, 12
+INV_TXFM_8X16_FN adst, adst, 0, 12 +INV_TXFM_8X16_FN adst, flipadst, 0, 12 +INV_TXFM_8X16_FN adst, identity, 35, 12 + +cglobal iadst_8x16_internal_12, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 ; 12-bit entry: m12/m13 = clip_20b_min/max, pass 1 is shared with iadst_8x16_internal_10 + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + jmp m(iadst_8x16_internal_10).pass1 +.pass2: + lea r6, [rsp+32*4] + call .pass2_main + call m(iadst_16x8_internal_10).pass1_rotations +.pass2_end: ; two register groups use different final shifts (4 and 15); iflipadst_8x16_internal_12 jumps here as well + REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15 + REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11 + jmp m(idct2_8x16_internal_12).end +ALIGN function_align +.pass2_main: ; transpose, clamp inputs to clip_18b_min/max (m13/m14), run iadst_16x8_internal_10 main_part1 and main_part2, then set up m13-m15 for the caller + call m(idct2_8x16_internal_12).transpose + vpbroadcastd m13, [clip_18b_min] + vpbroadcastd m14, [clip_18b_max] + mova [cq+32* 8], m0 + mova [cq+32*11], m3 + mova [cq+32*12], m4 + mova [cq+32*15], m7 + pmaxsd m0, m13, [cq+32* 2] ; 2 + pmaxsd m3, m13, m1 ; 9 + pmaxsd m1, m13, m5 ; 13 + pmaxsd m4, m13, m2 ; 10 + pmaxsd m2, m13, [cq+32* 6] ; 6 + pmaxsd m5, m13, [cq+32* 5] ; 5 + pmaxsd m6, m13, m6 ; 14 + pmaxsd m7, m13, [cq+32* 1] ; 1 + REPX {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7 + vpbroadcastd m12, [vvc_pd_2048] + vpbroadcastd m15, [vvc_pd_64] + call m(iadst_16x8_internal_10).main_part1 + pmaxsd m0, m13, [cq+32* 0] ; 0 + pmaxsd m1, m13, [cq+32*15] ; 15 + pmaxsd m2, m13, [cq+32* 4] ; 4 + pmaxsd m3, m13, [cq+32*11] ; 11 + pmaxsd m4, m13, [cq+32* 8] ; 8 + pmaxsd m5, m13, [cq+32* 7] ; 7 + pmaxsd m6, m13, [cq+32*12] ; 12 + pmaxsd m7, m13, [cq+32* 3] ; 3 + REPX {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(iadst_16x8_internal_10).main_part2 + vpbroadcastd m14, [vvc_pd_17408] + psrld m15, 11 ; vvc_pd_1 -- NOTE(review): m15 was loaded from [vvc_pd_64] above; if that constant is the dword 64 and the called routines leave m15 untouched, 64 shifted right by 11 is 0, not 1, which would also change the two constants derived below -- confirm + psubd m13, m14, m15 ; vvc_pd_17407 + pslld m15, 3 ; vvc_pd_8 + ret + +INV_TXFM_8X16_FN flipadst, dct2, 0, 12 +INV_TXFM_8X16_FN flipadst, adst, 0, 12 +INV_TXFM_8X16_FN flipadst, flipadst, 0, 12 +INV_TXFM_8X16_FN flipadst, identity, 35, 12 + +cglobal iflipadst_8x16_internal_12, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13,
[clip_20b_max] + jmp m(iflipadst_8x16_internal_10).pass1 +.pass2: ; shares pass2_main and pass2_end with iadst_8x16_internal_12; only the rotations helper differs + lea r6, [rsp+32*4] + call m(iadst_8x16_internal_12).pass2_main + call m(iflipadst_16x8_internal_10).pass1_rotations + jmp m(iadst_8x16_internal_12).pass2_end + +INV_TXFM_8X16_FN identity, dct2, 0, 12 +INV_TXFM_8X16_FN identity, adst, 0, 12 +INV_TXFM_8X16_FN identity, flipadst, 0, 12 +INV_TXFM_8X16_FN identity, identity, 0, 12 + +cglobal iidentity_8x16_internal_12, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 ; pass 1 is the 10-bit identity pass 1; pass 2 adds a 32-bit clamp and scale step before packing + jmp m(iidentity_8x16_internal_10).pass1 +.pass2: + call .pass2_main + packssdw m0, m8 + packssdw m1, m9 + packssdw m2, m10 + packssdw m3, m11 + packssdw m4, m12 + packssdw m5, m13 + packssdw m6, m14 + packssdw m13, m7, m15 + vpbroadcastd m7, [pixel_12_max] + vpbroadcastd m12, [vvc_pw_16384] + call m(iidentity_8x16_internal_10).pass2_end + RET +ALIGN function_align +.pass2_main: ; clamp m0-m15 to clip_18b_min/max, multiply by vvc_pd_5793, add vvc_pd_1024, shift right by 14; m7 and m15 take turns being parked at [cq] to free a register for each constant + mova [cq], m7 + vpbroadcastd m7, [clip_18b_min] + REPX {pmaxsd x, m7}, m0, m1, m2, m3, m4, m5, m6, \ + m8, m9, m10, m11, m12, m13, m14, m15 + pmaxsd m7, [cq] + mova [cq], m15 + vpbroadcastd m15, [clip_18b_max] + REPX {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14 + pminsd m15, [cq] + mova [cq], m7 + vpbroadcastd m7, [vvc_pd_5793] + REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6, \ + m8, m9, m10, m11, m12, m13, m14, m15 + pmulld m7, [cq] + mova [cq], m15 + vpbroadcastd m15, [vvc_pd_1024] + REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14 + paddd m15, [cq] + REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + ret + +%macro INV_TXFM_16X4_FN 2-3 10 ; type1, type2, bitdepth + INV_TXFM_FN %1, %2, 0, 16x4, %3 +%ifidn %1_%2, dct2_dct2 + vpbroadcastd m3, [dconly_%3] +%if %3 = 10 +.dconly: ; DC-only path, assembled only in the 10-bit dct2_dct2 instance; the other bit depth jumps to it + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 4 +.dconly2: + add r6d, 384 + sar r6d, 9 +.dconly3: + imul r6d, 181 + add r6d, 2176 + sar r6d, 12 + movd xm0, r6d + paddsw xm0, xm3 + vpbroadcastw m0, xm0 +.dconly_loop: +
paddsw m1, m0, [dstq+strideq*0] + paddsw m2, m0, [dstq+strideq*1] + psubusw m1, m3 + psubusw m2, m3 + mova [dstq+strideq*0], m1 + mova [dstq+strideq*1], m2 + lea dstq, [dstq+strideq*2] + sub r3d, 2 ; two destination rows are handled per iteration + jg .dconly_loop + RET +%else + jmp m(vvc_inv_dct2_dct2_16x4_10).dconly +%endif +%endif +%endmacro + +INV_TXFM_16X4_FN dct2, dct2 +INV_TXFM_16X4_FN dct2, identity +INV_TXFM_16X4_FN dct2, adst +INV_TXFM_16X4_FN dct2, flipadst + +cglobal idct2_16x4_internal_10, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] +.pass1: ; each register is built from two 16-byte coefficient groups; the index pair is noted on the shufpd lines + vbroadcasti128 m0, [cq+16* 0] + vbroadcasti128 m4, [cq+16* 4] + vbroadcasti128 m1, [cq+16* 2] + vbroadcasti128 m7, [cq+16* 6] + vbroadcasti128 m5, [cq+16*10] + vbroadcasti128 m2, [cq+16* 8] + vbroadcasti128 m6, [cq+16*12] + vbroadcasti128 m3, [cq+16*14] + shufpd m0, m4, 0x0c ; 0 4 + shufpd m1, m5, 0x0c ; 2 10 + shufpd m2, m6, 0x0c ; 8 12 + shufpd m3, m7, 0x0c ; 14 6 + call .pass1_main + vbroadcasti128 m10, [cq+16* 1] + vbroadcasti128 m4, [cq+16* 5] + vbroadcasti128 m11, [cq+16*15] + vbroadcasti128 m5, [cq+16*11] + shufpd m10, m4, 0x0c ; 1 5 + shufpd m11, m5, 0x0c ; 15 11 + vbroadcasti128 m5, [cq+16* 9] + vbroadcasti128 m4, [cq+16*13] + shufpd m5, m4, 0x0c ; 9 13 + vbroadcasti128 m6, [cq+16* 7] + vbroadcasti128 m4, [cq+16* 3] + shufpd m6, m4, 0x0c ; 7 3 + call .pass1_main2 + pcmpeqd m4, m4 + REPX {psubd x, m4}, m0, m1, m2, m3 + call .pass1_main3 + REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7 + jmp tx2q +.pass2: + call .transpose_4x16_packed + lea r6, [deint_shuf+128] + call m(idct2_16x4_internal_8).main +.end: ; scale m0-m3 by vvc_pw_2048, add the four destination rows, zero eight 32-byte rows at cq, clamp to the range 0 to m5 and store + vpbroadcastd m4, [vvc_pw_2048] + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + vpbroadcastd m5, [pixel_10_max] +.end2: + paddw m0, [dstq+strideq*0] + paddw m1, [dstq+strideq*1] +.end3: + lea r6, [dstq+strideq*2] + paddw m2, [r6 +strideq*0] + paddw m3, [r6 +strideq*1] + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7 + REPX {pmaxsw x, m4}, m0, m1, m2, m3 + REPX {pminsw x, m5}, m0,
m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [r6 +strideq*0], m2 + mova [r6 +strideq*1], m3 + RET +ALIGN function_align +.pass1_main: ; loads vvc_pd_2048 into m7; .pass1_main2 and .pass1_main3 later add m7 as their rounding term + vpbroadcastd m7, [vvc_pd_2048] + call m(idct2_8x4_internal_10).main + psubd m3, m0, m4 ; idct2_8 out7 out6 + paddd m0, m4 ; idct2_8 out0 out1 + paddd m1, m2, m5 ; idct2_8 out3 out2 + psubd m2, m5 ; idct2_8 out4 out5 + ret +ALIGN function_align +.pass1_main2: ; odd-half stages on m10, m11, m5, m6; also clamps m0-m3 left by .pass1_main to m8/m9 + ITX_MULSUB_2D 10, 11, 4, 12, 13, 7, 9_43, 90_80, 1 + ITX_MULSUB_2D 5, 6, 4, 12, 13, 7, 70_87, 57_25, 1 + vbroadcasti128 m12, [vvc_pd_83_m83] + psubd m4, m10, m5 + paddd m10, m5 ; t8 t11 + psignd m4, m12 ; t9 t10 + psubd m5, m11, m6 + paddd m11, m6 ; t15 t12 + psignd m5, m12 ; t14 t13 + vpbroadcastd m6, [vvc_pd_36] + vpbroadcastd m13, [vvc_pd_83] + REPX {pmaxsd x, m8}, m5, m4 + REPX {pminsd x, m9}, m5, m4 + pmulld m12, m5 + pmulld m5, m6 + vbroadcasti128 m6, [vvc_pd_36_m36] + pmulld m13, m4 + pmulld m4, m6 + REPX {pmaxsd x, m8}, m10, m11, m0, m1 + REPX {pminsd x, m9}, m10, m11, m0, m1 + paddd m12, m7 + paddd m5, m7 + paddd m4, m12 + psubd m5, m13 + psrad m4, 12 ; t14a t10a + psrad m5, 12 ; t9a t13a + vpbroadcastd m12, [vvc_pd_64] + punpckhqdq m6, m11, m5 + punpcklqdq m11, m4 + punpckhqdq m4, m10, m4 + punpcklqdq m10, m5 + psubd m5, m11, m6 ; t12a t13 + paddd m11, m6 ; t15a t14 + psubd m6, m10, m4 ; t11a t10 + paddd m10, m4 ; t8a t9 + REPX {pmaxsd x, m8}, m5, m6 + REPX {pminsd x, m9}, m5, m6 + pmulld m5, m12 + pmulld m6, m12 + REPX {pmaxsd x, m8}, m2, m3, m11, m10 + REPX {pminsd x, m9}, m2, m3, m11, m10 + ret +ALIGN function_align +.pass1_main3: ; final butterflies; each line is annotated with the output pair it produces + paddd m5, m7 + psubd m4, m5, m6 + paddd m5, m6 + psrad m4, 12 ; t11 t10a + psrad m5, 12 ; t12 t13a + psubd m7, m0, m11 ; out15 out14 + paddd m0, m11 ; out0 out1 + psubd m6, m1, m5 ; out12 out13 + paddd m1, m5 ; out3 out2 + psubd m5, m2, m4 ; out11 out10 + paddd m2, m4 ; out4 out5 + psubd m4, m3, m10 ; out8 out9 + paddd m3, m10 ; out7 out6 + REPX {pshufd x, x, q1032}, m1, m3, m5, m7 + ret +ALIGN
function_align +.transpose_4x16_packed: ; pack m0-m7 from dwords to words, shuffle with deint_shuf, then regroup the halves into m0-m3 + vbroadcasti128 m8, [deint_shuf] + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 + packssdw m6, m7 + REPX {pshufb x, m8}, m0, m2, m4, m6 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpckhqdq m2, m4, m6 + punpcklqdq m4, m6 + vperm2i128 m3, m1, m2, 0x31 + vinserti128 m1, xm2, 1 + vperm2i128 m2, m0, m4, 0x31 + vinserti128 m0, xm4, 1 + ret + +INV_TXFM_16X4_FN adst, dct2 +INV_TXFM_16X4_FN adst, adst +INV_TXFM_16X4_FN adst, flipadst +INV_TXFM_16X4_FN adst, identity + +cglobal iadst_16x4_internal_10, 0, 7, 14, dst, stride, c, eob, tx2 ; pass 1 reuses iadst_4x16_internal_10.main; pass 2 reuses iadst_16x4_internal_8.main + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] +.pass1: + call m(iadst_4x16_internal_10).main + psrad m11, 11 ; vvc_pd_1 + REPX {paddd x, m11}, m0, m1, m2, m3 + paddd m4, m5, m11 + paddd m5, m6, m11 + paddd m6, m7, m11 + paddd m7, m8, m11 +.pass1_end: + REPX {pshufd x, x, q1032}, m0, m2, m4, m6 + REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7 + jmp tx2q +.pass2: + call m(idct2_16x4_internal_10).transpose_4x16_packed + lea r6, [deint_shuf+128] + call m(iadst_16x4_internal_8).main + jmp m(idct2_16x4_internal_10).end +ALIGN function_align +.main: ; reads eight 32-byte rows of cq (0, 1, 6, 7, 4, 5, 2, 3 in load order) and forms the products annotated on each line + vpbroadcastd m6, [vvc_pd_1321] + mova m0, [cq+32*0] + mova m1, [cq+32*1] + vpbroadcastd m7, [vvc_pd_2482] + mova m2, [cq+32*6] + mova m3, [cq+32*7] + pmulld m4, m0, m6 + pmulld m5, m1, m6 ; 1321*in0 + pmulld m9, m2, m7 + pmulld m8, m3, m7 ; 2482*in3 + paddd m4, m9 + paddd m8, m5 ; 1321*in0 + 2482*in3 + pmulld m5, m0, m7 + pmulld m9, m1, m7 ; 2482*in0 + paddd m0, m2 + paddd m1, m3 ; in0 + in3 + paddd m7, m6 ; vvc_pd_3803 + pmulld m2, m7 + pmulld m3, m7 ; 3803*in3 + psubd m5, m2 + psubd m9, m3 ; 2482*in0 - 3803*in3 + mova m2, [cq+32*4] + pmulld m10, m7, m2 + pmulld m3, m6, m2 + psubd m2, m0 + mova m0, [cq+32*5] + pmulld m7, m0 ; 3803*in2 + pmulld m6, m0 ; 1321*in2 + psubd m0, m1 ; in2 - in0 - in3 + vpbroadcastd m1, [vvc_pd_m3344] + paddd m4, m10 + paddd m7, m8 ; t0 + psubd m5, m3 + psubd m9, m6 ; t1 + pmulld m2, m1 + pmulld m0,
m1 ; t2 + pmulld m3, m1, [cq+32*2] + pmulld m1, [cq+32*3] ; -t3 + ret +ALIGN function_align +.main_end: + ; expects: m6 = rnd + paddd m5, m6 + paddd m9, m6 + paddd m10, m4, m5 + paddd m4, m6 + paddd m8, m7, m6 + paddd m7, m9 + psubd m4, m3 ; out0 (unshifted) + psubd m5, m3 ; out1 (unshifted) + paddd m2, m6 ; out2 (unshifted) + paddd m3, m10 ; out3 (unshifted) + psubd m8, m1 ; out4 (unshifted) + psubd m9, m1 ; out5 (unshifted) + paddd m6, m0 ; out6 (unshifted) + paddd m7, m1 ; out7 (unshifted) + ret + +INV_TXFM_16X4_FN flipadst, dct2 +INV_TXFM_16X4_FN flipadst, adst +INV_TXFM_16X4_FN flipadst, flipadst +INV_TXFM_16X4_FN flipadst, identity + +cglobal iflipadst_16x4_internal_10, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] +.pass1: ; same main routine as iadst_16x4_internal_10; results are placed in a different register order before jumping to its pass1_end + call m(iadst_4x16_internal_10).main + psrad m11, 11 ; vvc_pd_1 + paddd m4, m3, m11 + paddd m3, m5, m11 + paddd m5, m2, m11 + paddd m2, m6, m11 + paddd m6, m1, m11 + paddd m1, m7, m11 + paddd m7, m0, m11 + paddd m0, m8, m11 + jmp m(iadst_16x4_internal_10).pass1_end +.pass2: ; rows come out of the shared adst main in m0-m3 and are scaled into the reverse order before the shared .end3 store + call m(idct2_16x4_internal_10).transpose_4x16_packed + lea r6, [deint_shuf+128] + call m(iadst_16x4_internal_8).main + vpbroadcastd m4, [vvc_pw_2048] + pmulhrsw m5, m3, m4 + pmulhrsw m6, m2, m4 + pmulhrsw m2, m1, m4 + pmulhrsw m3, m0, m4 + paddw m0, m5, [dstq+strideq*0] + paddw m1, m6, [dstq+strideq*1] + vpbroadcastd m5, [pixel_10_max] + jmp m(idct2_16x4_internal_10).end3 + +INV_TXFM_16X4_FN identity, dct2 +INV_TXFM_16X4_FN identity, adst +INV_TXFM_16X4_FN identity, flipadst +INV_TXFM_16X4_FN identity, identity + +cglobal iidentity_16x4_internal_10, 0, 7, 14, dst, stride, c, eob, tx2 ; pass 1: load eight rows with a q3120 permute, multiply by vvc_pd_5793, add vvc_pd_3072, shift right by 12 + vpbroadcastd m8, [vvc_pd_5793] + vpermq m0, [cq+32*0], q3120 ; 0 1 + vpermq m1, [cq+32*1], q3120 ; 2 3 + vpermq m2, [cq+32*2], q3120 ; 4 5 + vpermq m3, [cq+32*3], q3120 ; 6 7 + vpermq m4, [cq+32*4], q3120 ; 8 9 + vpermq m5, [cq+32*5], q3120 ; a b + vpermq m6, [cq+32*6], q3120 ; c d + vpermq m7, [cq+32*7], q3120 ; e f +
vpbroadcastd m9, [vvc_pd_3072] + REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7 + jmp tx2q +.pass2: ; each row becomes x plus pmulhrsw(vvc_pw_1697x8, x) using a saturating add, then the shared idct2 .end + call m(idct2_16x4_internal_10).transpose_4x16_packed + vpbroadcastd m7, [vvc_pw_1697x8] + pmulhrsw m4, m7, m0 + pmulhrsw m5, m7, m1 + pmulhrsw m6, m7, m2 + pmulhrsw m7, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + jmp m(idct2_16x4_internal_10).end + +INV_TXFM_16X4_FN dct2, dct2, 12 +INV_TXFM_16X4_FN dct2, identity, 12 +INV_TXFM_16X4_FN dct2, adst, 12 +INV_TXFM_16X4_FN dct2, flipadst, 12 + +cglobal idct2_16x4_internal_12, 0, 7, 14, dst, stride, c, eob, tx2 ; 12-bit entry: m8/m9 = clip_20b_min/max, pass 1 is shared with idct2_16x4_internal_10 + vpbroadcastd m8, [clip_20b_min] + vpbroadcastd m9, [clip_20b_max] + jmp m(idct2_16x4_internal_10).pass1 +.pass2: ; stays in 32-bit lanes (clamp, in-register transpose, idct2_4x16_internal_10 helpers) until the packssdw near the end + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + ; deinterleave + REPX {pshufd x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7 + ; transpose + punpcklqdq m8, m0, m1 + punpckhqdq m0, m1 + punpcklqdq m9, m2, m3 + punpckhqdq m2, m3 + punpcklqdq m10, m4, m5 + punpckhqdq m4, m5 + punpcklqdq m11, m6, m7 + punpckhqdq m6, m7 + vperm2i128 m3, m0, m2, 0x31 ; out6 + vperm2i128 m1, m0, m2, 0x20 ; out2 + vperm2i128 m7, m4, m6, 0x31 ; out7 + vperm2i128 m5, m4, m6, 0x20 ; out3 + vperm2i128 m13, m10, m11, 0x31 ; out5 + vperm2i128 m12, m10, m11, 0x20 ; out1 + vperm2i128 m11, m8, m9, 0x31 ; out4 + vperm2i128 m10, m8, m9, 0x20 ; out0 + call m(idct2_4x16_internal_10).pass1_main + pmulld m0, m6, m10 + pmulld m2, m6, m11 + pmulld m4, m6, m12 + pmulld m6, m13 + vpbroadcastd m10, [vvc_pd_17408] + call m(idct2_4x16_internal_10).pass1_main2 + REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7 + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + vpbroadcastd m5, [pixel_12_max] + REPX {vpermq x, x, q3120}, m0, m1, m2, m3 + jmp
m(idct2_16x4_internal_10).end2 + +INV_TXFM_16X4_FN adst, dct2, 12 +INV_TXFM_16X4_FN adst, adst, 12 +INV_TXFM_16X4_FN adst, flipadst, 12 +INV_TXFM_16X4_FN adst, identity, 12 + +cglobal iadst_16x4_internal_12, 0, 7, 14, dst, stride, c, eob, tx2 ; 12-bit entry: m12/m13 = clip_20b_min/max, pass 1 is shared with iadst_16x4_internal_10 + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + jmp m(iadst_16x4_internal_10).pass1 +.pass2: + call .pass2_main + REPX {vpermq x, x, q3120}, m0, m1, m2, m3 + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + jmp m(idct2_16x4_internal_10).end2 +ALIGN function_align +.pass2_main: ; clamp to clip_18b_min/max, transpose both halves into cq, run the 10-bit .main and .main_end, shift right by 15 and pack; returns with m4 = vvc_pw_16384 and m5 = pixel_12_max + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m6, m7 + pmaxsd m8, m4, m12 + pmaxsd m9, m5, m12 + REPX {pminsd x, m13}, m0, m1, m2, m3 + call m(iadst_8x4_internal_12).transpose_4x8 + mova [cq+32*0], m0 + mova [cq+32*2], m1 + mova [cq+32*4], m2 + mova [cq+32*6], m3 + pminsd m0, m8, m13 + pminsd m1, m9, m13 + pminsd m2, m6, m13 + pminsd m3, m7, m13 + call m(iadst_8x4_internal_12).transpose_4x8 + mova [cq+32*1], m0 + mova [cq+32*3], m1 + mova [cq+32*5], m2 + mova [cq+32*7], m3 + call m(iadst_16x4_internal_10).main + vpbroadcastd m6, [vvc_pd_2048] + call m(iadst_16x4_internal_10).main_end + psrad m0, m4, 15 + psrad m1, m5, 15 + psrad m2, 15 + psrad m3, 15 + psrad m4, m8, 15 + psrad m5, m9, 15 + psrad m6, 15 + psrad m7, 15 + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + vpbroadcastd m4, [vvc_pw_16384] + vpbroadcastd m5, [pixel_12_max] + ret + +INV_TXFM_16X4_FN flipadst, dct2, 12 +INV_TXFM_16X4_FN flipadst, adst, 12 +INV_TXFM_16X4_FN flipadst, flipadst, 12 +INV_TXFM_16X4_FN flipadst, identity, 12 + +cglobal iflipadst_16x4_internal_12, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + jmp m(iflipadst_16x4_internal_10).pass1 +.pass2: ; uses iadst_16x4_internal_12.pass2_main, then takes its four result rows in reverse order (m3 first) + call m(iadst_16x4_internal_12).pass2_main + vpermq m7, m0, q3120 + vpermq m6, m1, q3120 + vpermq m1, m2, q3120 + vpermq m0, m3, q3120 + pmulhrsw m0, m4 + pmulhrsw m1, m4 +
pmulhrsw m2, m6, m4 + pmulhrsw m3, m7, m4 + jmp m(idct2_16x4_internal_10).end2 + +INV_TXFM_16X4_FN identity, dct2, 12 +INV_TXFM_16X4_FN identity, adst, 12 +INV_TXFM_16X4_FN identity, flipadst, 12 +INV_TXFM_16X4_FN identity, identity, 12 + +cglobal iidentity_16x4_internal_12, 0, 7, 14, dst, stride, c, eob, tx2 ; pass 1: each row x becomes x plus ((x * vvc_pd_1697 with vvc_pd_3072 added) shifted right by 12), done in two groups of four registers + vpbroadcastd m8, [vvc_pd_1697] + vpermq m0, [cq+32*0], q3120 ; 0 1 + vpermq m1, [cq+32*1], q3120 ; 2 3 + vpermq m2, [cq+32*2], q3120 ; 4 5 + vpermq m3, [cq+32*3], q3120 ; 6 7 + vpbroadcastd m9, [vvc_pd_3072] + pmulld m4, m8, m0 + pmulld m5, m8, m1 + pmulld m6, m8, m2 + pmulld m7, m8, m3 + vpermq m10, [cq+32*4], q3120 ; 8 9 + vpermq m11, [cq+32*5], q3120 ; a b + vpermq m12, [cq+32*6], q3120 ; c d + vpermq m13, [cq+32*7], q3120 ; e f + REPX {paddd x, m9}, m4, m5, m6, m7 + REPX {psrad x, 12}, m4, m5, m6, m7 + paddd m0, m4 + pmulld m4, m8, m10 + paddd m1, m5 + pmulld m5, m8, m11 + paddd m2, m6 + pmulld m6, m8, m12 + paddd m3, m7 + pmulld m7, m8, m13 + REPX {paddd x, m9}, m4, m5, m6, m7 + REPX {psrad x, 12}, m4, m5, m6, m7 + paddd m4, m10 + paddd m5, m11 + paddd m6, m12 + paddd m7, m13 + jmp tx2q +.pass2: ; clamp to clip_18b_min/max, multiply by vvc_pd_5793, add vvc_pd_2048, shift right by 15, then transpose, scale by vvc_pw_16384 and use the shared .end2 store + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + vpbroadcastd m8, [vvc_pd_5793] + vpbroadcastd m9, [vvc_pd_2048] + REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 15}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct2_16x4_internal_10).transpose_4x16_packed + vpbroadcastd m4, [vvc_pw_16384] + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + vpbroadcastd m5, [pixel_12_max] + jmp m(idct2_16x4_internal_10).end2 + +%macro INV_TXFM_16X8_FN 2-3 10 ; type1, type2, bitdepth + INV_TXFM_FN %1, %2, 0, 16x8, %3 +%ifidn %1_%2, dct2_dct2 + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_%3] + mov [cq], eobd ; 0 + or r3d, 8 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + jmp
m(vvc_inv_dct2_dct2_16x4_10).dconly2 +%endif +%endmacro + +INV_TXFM_16X8_FN dct2, dct2 +INV_TXFM_16X8_FN dct2, identity +INV_TXFM_16X8_FN dct2, adst +INV_TXFM_16X8_FN dct2, flipadst + +cglobal idct2_16x8_internal_10, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] +.pass1: ; odd rows (1 to 15) scaled by m14 feed main_oddhalf_rect2; even rows (0 to 14) feed main_rect2 and main_evenhalf + vpbroadcastd m14, [vvc_pd_64] + pmulld m0, m14, [cq+32* 1] + pmulld m1, m14, [cq+32* 3] + pmulld m2, m14, [cq+32* 5] + pmulld m3, m14, [cq+32* 7] + pmulld m4, m14, [cq+32* 9] + pmulld m5, m14, [cq+32*11] + pmulld m6, m14, [cq+32*13] + pmulld m7, m14, [cq+32*15] + vpbroadcastd m11, [vvc_pd_2048] + lea r6, [rsp+32*4] + call m(idct2_8x16_internal_10).main_oddhalf_rect2 + pmulld m0, m14, [cq+32* 0] + pmulld m1, m14, [cq+32* 2] + pmulld m2, m14, [cq+32* 4] + pmulld m3, m14, [cq+32* 6] + pmulld m4, m14, [cq+32* 8] + pmulld m5, m14, [cq+32*10] + pmulld m6, m14, [cq+32*12] + pmulld m7, m14, [cq+32*14] + call m(idct2_8x8_internal_10).main_rect2 + call m(idct2_8x16_internal_10).main_evenhalf + psrld m11, 11 ; vvc_pd_1 + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + call .pass1_rotations + REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: + call .transpose + call m(idct2_16x8_internal_8).main + vpbroadcastd m10, [vvc_pw_2048] +.end: ; scale m0-m7 by m10 and write them as two groups of four rows + pmulhrsw m0, m10 + pmulhrsw m1, m10 + pmulhrsw m2, m10 + pmulhrsw m3, m10 + call .write_16x4_start +.end2: + pmulhrsw m0, m4, m10 + pmulhrsw m1, m5, m10 + pmulhrsw m2, m6, m10 + pmulhrsw m3, m7, m10 + call .write_16x4_zero + RET +ALIGN function_align +.pass1_rotations: ; combine m0-m7 with the eight values stored around r6: sums stay in m0-m7, differences land in m15 down to m8 + mova m14, [r6-32*4] + mova m13, [r6-32*3] + mova m12, [r6-32*2] + mova m11, [r6-32*1] + mova m10, [r6+32*0] + mova m9, [r6+32*1] + mova m8, [r6+32*2] + psubd m15, m0, m14 ; out15 + paddd m0, m14 ; out0 + psubd m14, m1, m13 ; out14 + paddd m1, m13 ; out1 + psubd m13, m2, m12 ; out13 + paddd m2, m12 ; out2 + psubd m12, m3, m11 ; out12 + paddd m3, m11 ; out3 + psubd m11, m4, m10 ;
out11 + paddd m4, m10 ; out4 + psubd m10, m5, m9 ; out10 + paddd m5, m9 ; out5 + psubd m9, m6, m8 ; out9 + paddd m6, m8 ; out6 + psubd m8, m7, [r6+32*3] ; out8 + paddd m7, [r6+32*3] ; out7 + ret +ALIGN function_align +.transpose: ; three entry points: .transpose also sets r6, .transpose2 starts at the dword-to-word pack, .transpose3 starts at the unpack network + lea r6, [deint_shuf+128] +.transpose2: + packssdw m0, m8 + packssdw m1, m9 + packssdw m2, m10 + packssdw m3, m11 + packssdw m4, m12 + packssdw m5, m13 + packssdw m6, m14 + packssdw m7, m15 +.transpose3: + punpckhwd m8, m0, m1 + punpcklwd m0, m1 + punpcklwd m1, m2, m3 + punpckhwd m2, m3 + punpckhwd m3, m4, m5 + punpcklwd m4, m5 + punpckhwd m5, m6, m7 + punpcklwd m6, m7 + punpckhdq m7, m4, m6 + punpckldq m4, m6 + punpckldq m6, m8, m2 + punpckhdq m8, m2 + punpckhdq m2, m0, m1 + punpckldq m0, m1 + punpckhdq m1, m3, m5 + punpckldq m3, m5 + punpcklqdq m5, m6, m3 + punpckhqdq m6, m3 + punpckhqdq m3, m2, m7 + punpcklqdq m2, m7 + punpcklqdq m7, m8, m1 + punpckhqdq m8, m1 + punpckhqdq m1, m0, m4 + punpcklqdq m0, m4 + vperm2i128 m4, m0, m5, 0x31 + vinserti128 m0, xm5, 1 + vperm2i128 m5, m1, m6, 0x31 + vinserti128 m1, xm6, 1 + vperm2i128 m6, m2, m7, 0x31 + vinserti128 m2, xm7, 1 + vperm2i128 m7, m3, m8, 0x31 + vinserti128 m3, xm8, 1 + ret +ALIGN function_align +.write_16x4_start: ; sets m9 = pixel_10_max, r3 = 3*stride, m8 = 0, then falls through: zero eight 32-byte rows at cq, advance cq, add m0-m3 to four destination rows, clamp to m8/m9, store, advance dstq by four rows + vpbroadcastd m9, [pixel_10_max] + lea r3, [strideq*3] + pxor m8, m8 +.write_16x4_zero: + REPX {mova [cq+32*x], m8}, 0, 1, 2, 3, 4, 5, 6, 7 + add cq, 32*8 +.write_16x4: + paddw m0, [dstq+strideq*0] + paddw m1, [dstq+strideq*1] + paddw m2, [dstq+strideq*2] + paddw m3, [dstq+r3 ] + REPX {pmaxsw x, m8}, m0, m1, m2, m3 + REPX {pminsw x, m9}, m0, m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+r3 ], m3 + lea dstq, [dstq+strideq*4] + ret + +INV_TXFM_16X8_FN adst, dct2 +INV_TXFM_16X8_FN adst, adst +INV_TXFM_16X8_FN adst, flipadst +INV_TXFM_16X8_FN adst, identity + +cglobal iadst_16x8_internal_10, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 ; m13/m14 = clip_18b_min/max, the registers its .main routine expects + vpbroadcastd m13, [clip_18b_min] + vpbroadcastd m14, [clip_18b_max] +.pass1: + lea
r6, [rsp+32*4] + call .main + vpbroadcastd m14, [vvc_pd_3072] + psrld m15, 11 ; vvc_pd_1 -- NOTE(review): .main loads m15 from [vvc_pd_64]; if that constant is the dword 64 and nothing in .main rewrites m15, 64 shifted right by 11 is 0, not 1, so the value derived on the next line would be 3072 -- confirm + psubd m13, m14, m15 ; vvc_pd_3071 + call .pass1_rotations +.pass1_end: + REPX {psrad x, 1 }, m0, m1, m2, m3, m12, m13, m14, m15 + REPX {psrad x, 12}, m4, m5, m6, m7, m8, m9, m10, m11 + jmp tx2q +.pass2: ; transpose, run the iadst_16x8_internal_8 routines, then scale odd-numbered rows by the negated vvc_pw_2048 (m11) and even-numbered rows by m10 + call m(idct2_16x8_internal_10).transpose + call m(iadst_16x8_internal_8).main + call m(iadst_16x8_internal_8).main_pass2_end + vpbroadcastd m10, [vvc_pw_2048] + pxor m11, m11 + psubw m11, m10 + pmulhrsw m0, m10 + pmulhrsw m1, m11 + pmulhrsw m2, m10 + pmulhrsw m3, m11 + call m(idct2_16x8_internal_10).write_16x4_start + pmulhrsw m0, m4, m10 + pmulhrsw m1, m5, m11 + pmulhrsw m2, m6, m10 + pmulhrsw m3, m7, m11 + call m(idct2_16x8_internal_10).write_16x4_zero + RET +ALIGN function_align +.pass1_rotations: ; even-numbered destination registers get a constant added; odd-numbered ones are subtracted from a constant, which negates them + paddd m0, m15 + psubd m1, m15, m1 + paddd m2, m15 + psubd m3, m15, m3 + paddd m4, m14 + psubd m5, m13, m5 + paddd m6, m14 + psubd m7, m13, m7 + paddd m8, m14, m9 + psubd m9, m13, m10 + paddd m10, m14, m11 + psubd m11, m13, m12 + paddd m12, m15, [r6-32*1] + psubd m13, m15, [r6-32*2] + paddd m14, m15, [r6-32*3] + psubd m15, [r6-32*4] + ret +ALIGN function_align +.main: + ; expects: m13 = clip_min m14 = clip_max + vpbroadcastd m15, [vvc_pd_64] + pmulld m0, m15, [cq+32* 2] + pmulld m1, m15, [cq+32*13] + pmulld m2, m15, [cq+32* 6] + pmulld m3, m15, [cq+32* 9] + pmulld m4, m15, [cq+32*10] + pmulld m5, m15, [cq+32* 5] + pmulld m6, m15, [cq+32*14] + pmulld m7, m15, [cq+32* 1] + vpbroadcastd m12, [vvc_pd_2048] + REPX {paddd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 + call .main_part1 + pmulld m0, m15, [cq+32* 0] + pmulld m1, m15, [cq+32*15] + pmulld m2, m15, [cq+32* 4] + pmulld m3, m15, [cq+32*11] + pmulld m4, m15, [cq+32* 8] + pmulld m5, m15, [cq+32* 7] + pmulld m6, m15, [cq+32*12] + pmulld m7, m15, [cq+32* 3] + REPX {paddd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 +.main_part2: +
ITX_MULSUB_2D 1, 0, 8, 9, 10, 12, 4, 90 + ITX_MULSUB_2D 3, 2, 8, 9, 10, 12, 38, 82 + ITX_MULSUB_2D 5, 4, 8, 9, 10, 12, 67, 61 + ITX_MULSUB_2D 7, 6, 8, 9, 10, 12, 85, 31 + psubd m8, m0, m4 ; t8a + paddd m0, m4 ; t0a + psubd m4, m1, m5 ; t9a + paddd m1, m5 ; t1a + psubd m5, m2, m6 ; t12a + paddd m2, m6 ; t4a + psubd m6, m3, m7 ; t13a + paddd m7, m3 ; t5a + REPX {pmaxsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7 + REPX {pminsd x, m14}, m8, m4, m5, m6, m0, m1, m2, m7 + vpbroadcastd m11, [vvc_pd_89] + vpbroadcastd m10, [vvc_pd_18] + ITX_MULSUB_2D 8, 4, 3, 9, _, 12, 10, 11, 0xc + ITX_MULSUB_2D 6, 5, 3, 9, _, 12, 11, 10, 0xc + psubd m3, m0, m2 ; t4 + paddd m0, m2 ; t0 + psubd m2, m1, m7 ; t5 + paddd m1, m7 ; t1 + psubd m7, m4, m6 ; t12a + paddd m4, m6 ; t8a + psubd m6, m8, m5 ; t13a + paddd m5, m8 ; t9a + REPX {pmaxsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5 + REPX {pminsd x, m14}, m3, m2, m7, m6, m0, m1, m4, m5 + vpbroadcastd m11, [vvc_pd_83] + vpbroadcastd m10, [vvc_pd_36] + ITX_MULSUB_2D 3, 2, 8, 9, _, 12, 10, 11, 0xc + ITX_MULSUB_2D 7, 6, 8, 9, _, 12, 10, 11, 0xc + pminsd m10, m14, [r6-32*4] ; t2 + pminsd m8, m14, [r6-32*3] ; t3 + psubd m9, m0, m10 ; t2a + paddd m0, m10 ; out0 + psubd m10, m1, m8 ; t3a + paddd m1, m8 ; -out15 + pmaxsd m9, m13 + pmaxsd m10, m13 + pminsd m9, m14 + pminsd m10, m14 + mova [r6-32*4], m1 ; the slot that held t2 is reused for -out15 + mova m11, [r6-32*1] ; t7a + mova m1, [r6-32*2] ; t6a + psubd m8, m3, m11 ; t7 + paddd m11, m3 ; out12 + paddd m3, m2, m1 ; -out3 + psubd m2, m1 ; t6 + pmaxsd m8, m13 + pmaxsd m2, m13 + pminsd m8, m14 + pminsd m2, m14 + mova [r6-32*1], m11 + mova [r6-32*3], m2 + mova m1, [r6+32*3] ; t15 + mova m2, [r6+32*2] ; t14 + paddd m12, m7, m1 ; -out13 + psubd m7, m1 ; t15a + psubd m11, m6, m2 ; t14a + paddd m2, m6 ; out2 + pmaxsd m7, m13 + pmaxsd m11, m13 + pminsd m7, m14 + pminsd m11, m14 + mova [r6-32*2], m12 + pminsd m1, m14, [r6+32*0] ; t10a + pminsd m12, m14, [r6+32*1] ; t11a + psubd m6, m4, m1 ; t10 + paddd m1, m4 ; -out1 + psubd m4, m5, m12 ; t11 + paddd m5, m12
; out14 + vpbroadcastd m12, [vvc_pd_1448] + pmaxsd m6, m13 + pmaxsd m4, m13 + pminsd m6, m14 + pminsd m4, m14 + REPX {pmulld x, m12}, m9, m10, m8, m7, m11, m6, m4 + pmulld m12, [r6-32*3] ; t6 + mova [r6-32*3], m5 + paddd m5, m11, m7 ; -out5 (unshifted) + psubd m11, m7 ; out10 (unshifted) + paddd m7, m9, m10 ; -out7 (unshifted) + psubd m9, m10 ; out8 (unshifted) + psubd m10, m6, m4 ; -out9 (unshifted) + paddd m6, m4 ; out6 (unshifted) + paddd m4, m12, m8 ; out4 (unshifted) + psubd m12, m8 ; -out11 (unshifted) + ret +.main_part1: + ITX_MULSUB_2D 1, 0, 8, 9, 10, 12, 22, 88 + ITX_MULSUB_2D 3, 2, 8, 9, 10, 12, 54, 73 + ITX_MULSUB_2D 5, 4, 8, 9, 10, 12, 78, 46 + ITX_MULSUB_2D 7, 6, 8, 9, 10, 12, 90, 13 + psubd m8, m0, m4 ; t10a + paddd m0, m4 ; t2a + psubd m4, m1, m5 ; t11a + paddd m1, m5 ; t3a + psubd m5, m2, m6 ; t14a + paddd m2, m6 ; t6a + psubd m6, m3, m7 ; t15a + paddd m7, m3 ; t7a + REPX {pmaxsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7 + REPX {pminsd x, m14}, m8, m4, m5, m6, m0, m1, m2, m7 + vpbroadcastd m11, [vvc_pd_50] + vpbroadcastd m10, [vvc_pd_75] + ITX_MULSUB_2D 8, 4, 3, 9, _, 12, 10, 11, 0xc + ITX_MULSUB_2D 6, 5, 3, 9, _, 12, 11, 10, 0xc + psubd m3, m0, m2 ; t6 + paddd m0, m2 ; t2 + psubd m2, m1, m7 ; t7 + paddd m1, m7 ; t3 + psubd m7, m4, m6 ; t14a + paddd m4, m6 ; t10a + psubd m6, m8, m5 ; t15a + paddd m5, m8 ; t11a + REPX {pmaxsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5 + REPX {pminsd x, m14}, m3, m2, m7, m6 ; clip the rest later + vpbroadcastd m11, [vvc_pd_36] + vpbroadcastd m10, [vvc_pd_83] + ITX_MULSUB_2D 2, 3, 8, 9, _, 12, 10, 11, 0xc + ITX_MULSUB_2D 6, 7, 8, 9, _, 12, 10, 11, 0xc + mova [r6-32*4], m0 + mova [r6-32*3], m1 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova [r6-32*2], m2 + mova [r6-32*1], m3 + mova [r6+32*2], m6 + mova [r6+32*3], m7 + ret + +INV_TXFM_16X8_FN flipadst, dct2 +INV_TXFM_16X8_FN flipadst, adst +INV_TXFM_16X8_FN flipadst, flipadst +INV_TXFM_16X8_FN flipadst, identity + +cglobal iflipadst_16x8_internal_10, 0, 7, 16, 32*8, dst, 
stride, c, eob, tx2 + vpbroadcastd m13, [clip_18b_min] + vpbroadcastd m14, [clip_18b_max] +.pass1: + lea r6, [rsp+32*4] + call m(iadst_16x8_internal_10).main + vpbroadcastd m14, [vvc_pd_3072] + psrld m15, 11 + psubd m13, m14, m15 + call .pass1_rotations + jmp m(iadst_16x8_internal_10).pass1_end +.pass2: + call m(idct2_16x8_internal_10).transpose + call m(iadst_16x8_internal_8).main + call m(iadst_16x8_internal_8).main_pass2_end + vpbroadcastd m10, [vvc_pw_2048] + pxor m11, m11 + psubw m11, m10 + mova m12, m0 + pmulhrsw m0, m7, m11 + mova m7, m1 + pmulhrsw m1, m6, m10 + mova m6, m2 + pmulhrsw m2, m5, m11 + mova m5, m3 + pmulhrsw m3, m4, m10 + call m(idct2_16x8_internal_10).write_16x4_start + pmulhrsw m0, m5, m11 + pmulhrsw m1, m6, m10 + pmulhrsw m2, m7, m11 + pmulhrsw m3, m12, m10 + call m(idct2_16x8_internal_10).write_16x4_zero + RET +ALIGN function_align +.pass1_rotations: + psubd m8, m13, m7 + paddd m7, m14, m9 + paddd m9, m14, m6 + psubd m6, m13, m10 + psubd m10, m13, m5 + paddd m5, m14, m11 + paddd m11, m14, m4 + psubd m4, m13, m12 + psubd m12, m15, m3 + paddd m3, m15, [r6-32*1] + paddd m13, m15, m2 + psubd m2, m15, [r6-32*2] + psubd m14, m15, m1 + mova m1, m15 + paddd m15, m0 + psubd m0, m1, [r6-32*4] + paddd m1, [r6-32*3] + ret + +INV_TXFM_16X8_FN identity, dct2 +INV_TXFM_16X8_FN identity, adst +INV_TXFM_16X8_FN identity, flipadst +INV_TXFM_16X8_FN identity, identity + +cglobal iidentity_16x8_internal_10, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 +.pass1: + vpbroadcastd m15, [vvc_pd_64] + pmulld m0, m15, [cq+32* 0] + pmulld m1, m15, [cq+32* 1] + pmulld m2, m15, [cq+32* 2] + pmulld m3, m15, [cq+32* 3] + pmulld m4, m15, [cq+32* 4] + pmulld m5, m15, [cq+32* 5] + pmulld m6, m15, [cq+32* 6] + pmulld m7, m15, [cq+32* 7] + pmulld m8, m15, [cq+32* 8] + pmulld m9, m15, [cq+32* 9] + pmulld m10, m15, [cq+32*10] + pmulld m11, m15, [cq+32*11] + pmulld m12, m15, [cq+32*12] + pmulld m13, m15, [cq+32*13] + pmulld m14, m15, [cq+32*14] + pmulld m15, [cq+32*15] + mova [rsp], m7 
+ vpbroadcastd m7, [vvc_pd_2048] + REPX {paddd x, m7 }, m0, m1, m2, m3, m4, m5, m6, \ + m8, m9, m10, m11, m12, m13, m14, m15 + paddd m7, [rsp] + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + mova [rsp], m15 + vpbroadcastd m15, [vvc_pd_5793] + REPX {pmulld x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14 + pmulld m15, [rsp] + mova [rsp], m7 + vpbroadcastd m7, [vvc_pd_3072] + REPX {paddd x, m7 }, m0, m1, m2, m3, m4, m5, m6, \ + m8, m9, m10, m11, m12, m13, m14, m15 + paddd m7, [rsp] + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: + call m(idct2_16x8_internal_10).transpose + vpbroadcastd m10, [vvc_pw_64] + jmp m(idct2_16x8_internal_10).end + +INV_TXFM_16X8_FN dct2, dct2, 12 +INV_TXFM_16X8_FN dct2, identity, 12 +INV_TXFM_16X8_FN dct2, adst, 12 +INV_TXFM_16X8_FN dct2, flipadst, 12 + +cglobal idct2_16x8_internal_12, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + jmp m(idct2_16x8_internal_10).pass1 +.pass2: + call .pass2_main + RET +ALIGN function_align +.pass2_main: + call m(idct2_8x16_internal_12).transpose + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + vpbroadcastd m11, [vvc_pd_2048] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct2_8x8_internal_10).main + call m(idct2_8x8_internal_12).round_shift4 + mova [cq+32* 8], m0 + mova [cq+32* 9], m1 + mova [cq+32*10], m2 + mova [cq+32*11], m3 + mova [cq+32*12], m4 + mova [cq+32*13], m5 + mova [cq+32*14], m6 + mova [cq+32*15], m7 + pmaxsd m0, m12, [cq+32*0] + pmaxsd m1, m12, [cq+32*1] + pmaxsd m2, m12, [cq+32*2] + pmaxsd m3, m12, [cq+32*3] + pmaxsd m4, m12, [cq+32*4] + pmaxsd m5, m12, [cq+32*5] + pmaxsd m6, m12, [cq+32*6] + pmaxsd m7, m12, [cq+32*7] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call 
m(idct2_8x8_internal_10).main + call m(idct2_8x8_internal_12).round_shift4 +.end: + packssdw m0, [cq+32* 8] + packssdw m1, [cq+32* 9] + packssdw m2, [cq+32*10] + packssdw m3, [cq+32*11] + packssdw m4, [cq+32*12] + packssdw m5, [cq+32*13] + packssdw m6, [cq+32*14] + packssdw m7, [cq+32*15] + REPX {vpermq x, x, q3120}, m0, m1, m2, m3 + call .write_16x4_start + call m(idct2_16x8_internal_10).write_16x4_zero + vpermq m0, m4, q3120 + vpermq m1, m5, q3120 + vpermq m2, m6, q3120 + vpermq m3, m7, q3120 + jmp m(idct2_16x8_internal_10).write_16x4_zero +ALIGN function_align +.write_16x4_start: + vpbroadcastd m9, [pixel_12_max] + lea r3, [strideq*3] + pxor m8, m8 + ret + +INV_TXFM_16X8_FN adst, dct2, 12 +INV_TXFM_16X8_FN adst, adst, 12 +INV_TXFM_16X8_FN adst, flipadst, 12 +INV_TXFM_16X8_FN adst, identity, 12 + +cglobal iadst_16x8_internal_12, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_20b_min] + vpbroadcastd m14, [clip_20b_max] + jmp m(iadst_16x8_internal_10).pass1 +.pass2: + call .pass2_main + call m(idct2_16x8_internal_12).end + RET +ALIGN function_align +.pass2_main: + call m(idct2_8x16_internal_12).transpose + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + vpbroadcastd m11, [vvc_pd_2048] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(iadst_8x8_internal_12).pass2_main2 + mova [cq+32* 8], m0 + mova [cq+32* 9], m1 + mova [cq+32*10], m2 + mova [cq+32*11], m3 + mova [cq+32*12], m4 + mova [cq+32*13], m5 + mova [cq+32*14], m6 + mova [cq+32*15], m7 + pmaxsd m0, m12, [cq+32*0] + pmaxsd m1, m12, [cq+32*1] + pmaxsd m2, m12, [cq+32*2] + pmaxsd m3, m12, [cq+32*3] + pmaxsd m4, m12, [cq+32*4] + pmaxsd m5, m12, [cq+32*5] + pmaxsd m6, m12, [cq+32*6] + pmaxsd m7, m12, [cq+32*7] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(iadst_8x8_internal_12).pass2_main2 + ret + +INV_TXFM_16X8_FN flipadst, dct2, 12 +INV_TXFM_16X8_FN flipadst, adst, 12 +INV_TXFM_16X8_FN 
flipadst, flipadst, 12 +INV_TXFM_16X8_FN flipadst, identity, 12 + +cglobal iflipadst_16x8_internal_12, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_20b_min] + vpbroadcastd m14, [clip_20b_max] + jmp m(iflipadst_16x8_internal_10).pass1 +.pass2: + call m(iadst_16x8_internal_12).pass2_main + packssdw m13, m0, [cq+32* 8] + packssdw m12, m1, [cq+32* 9] + packssdw m11, m2, [cq+32*10] + packssdw m10, m3, [cq+32*11] + packssdw m3, m4, [cq+32*12] + packssdw m2, m5, [cq+32*13] + packssdw m1, m6, [cq+32*14] + packssdw m0, m7, [cq+32*15] + REPX {vpermq x, x, q3120}, m0, m1, m2, m3 + call m(idct2_16x8_internal_12).write_16x4_start + call m(idct2_16x8_internal_10).write_16x4_zero + vpermq m0, m10, q3120 + vpermq m1, m11, q3120 + vpermq m2, m12, q3120 + vpermq m3, m13, q3120 + call m(idct2_16x8_internal_10).write_16x4_zero + RET + +INV_TXFM_16X8_FN identity, dct2, 12 +INV_TXFM_16X8_FN identity, adst, 12 +INV_TXFM_16X8_FN identity, flipadst, 12 +INV_TXFM_16X8_FN identity, identity, 12 + +cglobal iidentity_16x8_internal_12, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + jmp m(iidentity_16x8_internal_10).pass1 +.pass2: + call m(idct2_16x8_internal_10).transpose2 + vpbroadcastd m10, [vvc_pw_64] + pmulhrsw m0, m10 + pmulhrsw m1, m10 + pmulhrsw m2, m10 + pmulhrsw m3, m10 + call m(idct2_16x8_internal_12).write_16x4_start + call m(idct2_16x8_internal_10).write_16x4_zero + jmp m(idct2_16x8_internal_10).end2 + +%macro INV_TXFM_16X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth + INV_TXFM_FN %1, %2, %3, 16x16, %4 +%ifidn %1_%2, dct2_dct2 + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_%4] + mov [cq], eobd ; 0 + or r3d, 16 + add r6d, 640 + sar r6d, 10 + jmp m(vvc_inv_dct2_dct2_16x4_10).dconly3 +%endif +%endmacro + +INV_TXFM_16X16_FN dct2, dct2 +INV_TXFM_16X16_FN dct2, identity, 28 +INV_TXFM_16X16_FN dct2, adst +INV_TXFM_16X16_FN dct2, flipadst + +cglobal idct2_16x16_internal_10, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd 
m13, [clip_18b_max] +.pass1: + vpbroadcastd m11, [vvc_pd_2048] + vpbroadcastd m14, [vvc_pd_64] + lea r6, [rsp+32*4] + sub eobd, 36 + jl .fast + add cq, 32 + call .main + sub cq, 32 + mova m10, [r6-32*4] + mova m9, [r6-32*3] + mova m8, [r6-32*2] + psubd m15, m0, m10 ; out15 + paddd m0, m10 ; out0 + psubd m10, m1, m9 ; out14 + paddd m1, m9 ; out1 + psubd m9, m2, m8 ; out13 + paddd m2, m8 ; out2 + REPX {psrad x, 2}, m0, m1, m2 + mova [r6-32*4], m0 + mova [r6-32*3], m1 + mova [r6-32*2], m2 + mova m2, [r6-32*1] + mova m1, [r6+32*0] + mova m0, [r6+32*1] + REPX {psrad x, 2}, m9, m10, m15 + psubd m8, m3, m2 ; out12 + paddd m3, m2 ; out3 + psubd m2, m4, m1 ; out11 + paddd m4, m1 ; out4 + psubd m1, m5, m0 ; out10 + paddd m5, m0 ; out5 + REPX {psrad x, 2}, m3, m4, m5 + mova [r6-32*1], m3 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova m4, [r6+32*2] + mova m3, [r6+32*3] + REPX {psrad x, 2}, m1, m2, m8 + psubd m5, m6, m4 ; out9 + paddd m6, m4 ; out6 + psubd m4, m7, m3 ; out8 + paddd m7, m3 ; out7 + REPX {psrad x, 2}, m6, m7, m4, m5 + mova [r6+32*2], m6 + mova [r6+32*3], m7 + add r6, 32*8 + mova [r6-32*4], m4 + mova [r6-32*3], m5 + mova [r6-32*2], m1 + mova [r6-32*1], m2 + mova [r6+32*0], m8 + mova [r6+32*1], m9 + mova [r6+32*2], m10 + mova [r6+32*3], m15 +.fast: + add r6, 32*8 + call .main + mova m14, [r6-32*4] + mova m13, [r6-32*3] + mova m12, [r6-32*2] + mova m11, [r6-32*1] + mova m10, [r6+32*0] + mova m9, [r6+32*1] + mova m8, [r6+32*2] + psubd m15, m0, m14 ; out15 + paddd m0, m14 ; out0 + psubd m14, m1, m13 ; out14 + paddd m1, m13 ; out1 + psubd m13, m2, m12 ; out13 + paddd m2, m12 ; out2 + psubd m12, m3, m11 ; out12 + paddd m3, m11 ; out3 + psubd m11, m4, m10 ; out11 + paddd m4, m10 ; out4 + psubd m10, m5, m9 ; out10 + paddd m5, m9 ; out5 + psubd m9, m6, m8 ; out9 + paddd m6, m8 ; out6 + psubd m8, m7, [r6+32*3] ; out8 + paddd m7, [r6+32*3] ; out7 + sub r6, 32*8 + REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: 
+ call .transpose + lea r6, [vvc_pw_5+128] + mova [rsp], m15 + call m(idct2_16x16_internal_8).main + mova m1, [rsp+32*1] +.end: + call .write_16x16 + RET +ALIGN function_align +.write_16x16: + mova [rsp+gprsize+32*0], m8 + mova [rsp+gprsize+32*1], m9 + mova [rsp+gprsize+32*2], m12 + vpbroadcastd m12, [vvc_pw_2048] + pmulhrsw m0, m12 + pmulhrsw m1, m12 + pmulhrsw m2, m12 + pmulhrsw m3, m12 + call m(idct2_16x8_internal_10).write_16x4_start +.write_16x16_2: + pmulhrsw m0, m12, m4 + pmulhrsw m1, m12, m5 + pmulhrsw m2, m12, m6 + pmulhrsw m3, m12, m7 + call m(idct2_16x8_internal_10).write_16x4_zero + pmulhrsw m0, m12, [rsp+gprsize+32*0] + pmulhrsw m1, m12, [rsp+gprsize+32*1] + pmulhrsw m2, m12, m10 + pmulhrsw m3, m12, m11 + call m(idct2_16x8_internal_10).write_16x4_zero + pmulhrsw m0, m12, [rsp+gprsize+32*2] + pmulhrsw m1, m12, m13 + pmulhrsw m2, m12, m14 + pmulhrsw m3, m12, m15 + jmp m(idct2_16x8_internal_10).write_16x4_zero +ALIGN function_align +.transpose: + test eobd, eobd + jl .transpose_fast + packssdw m8, [r6-32*4] + packssdw m9, [r6-32*3] + packssdw m10, [r6-32*2] + packssdw m11, [r6-32*1] + packssdw m12, [r6+32*0] + packssdw m13, [r6+32*1] + packssdw m14, [r6+32*2] + packssdw m15, [r6+32*3] + sub r6, 32*8 + packssdw m0, [r6-32*4] + packssdw m1, [r6-32*3] + packssdw m2, [r6-32*2] + packssdw m3, [r6-32*1] + packssdw m4, [r6+32*0] + packssdw m5, [r6+32*1] + packssdw m6, [r6+32*2] + packssdw m7, [r6+32*3] + mova [r6], m8 + punpckhwd m8, m0, m1 + punpcklwd m0, m1 + punpcklwd m1, m2, m3 + punpckhwd m2, m3 + punpckhwd m3, m6, m7 + punpcklwd m6, m7 + punpcklwd m7, m4, m5 + punpckhwd m4, m5 + punpckldq m5, m8, m2 + punpckhdq m8, m2 + punpckhdq m2, m0, m1 + punpckldq m0, m1 + punpckhdq m1, m7, m6 + punpckldq m7, m6 + punpckhdq m6, m4, m3 + punpckldq m4, m3 + punpckhqdq m3, m2, m1 + punpcklqdq m2, m1 + punpckhqdq m1, m0, m7 + punpcklqdq m0, m7 + punpcklqdq m7, m8, m6 + punpckhqdq m8, m6 + punpckhqdq m6, m5, m4 + punpcklqdq m5, m4 + mova m4, [r6] + mova [r6], m8 + 
punpcklwd m8, m4, m9 + punpckhwd m4, m9 + punpcklwd m9, m10, m11 + punpckhwd m10, m11 + punpckhwd m11, m14, m15 + punpcklwd m14, m15 + punpckhwd m15, m12, m13 + punpcklwd m12, m13 + punpckldq m13, m4, m10 + punpckhdq m4, m10 + punpckhdq m10, m8, m9 + punpckldq m8, m9 + punpckhdq m9, m12, m14 + punpckldq m12, m14 + punpckhdq m14, m15, m11 + punpckldq m15, m11 + punpckhqdq m11, m10, m9 + punpcklqdq m10, m9 + punpckhqdq m9, m8, m12 + punpcklqdq m8, m12 + punpcklqdq m12, m13, m15 + punpckhqdq m13, m15 + punpckhqdq m15, m4, m14 + punpcklqdq m14, m4, m14 + vperm2i128 m4, m0, m8, 0x31 + vinserti128 m0, xm8, 1 + vinserti128 m8, m5, xm12, 1 + vperm2i128 m12, m5, 0x13 + vperm2i128 m5, m1, m9, 0x31 + vinserti128 m1, xm9, 1 + vinserti128 m9, m6, xm13, 1 + vperm2i128 m13, m6, 0x13 + vperm2i128 m6, m2, m10, 0x31 + vinserti128 m2, xm10, 1 + vinserti128 m10, m7, xm14, 1 + vperm2i128 m14, m7, 0x13 + vperm2i128 m7, m3, m11, 0x31 + vinserti128 m3, xm11, 1 + mova xm11, [r6] + vinserti128 m11, xm15, 1 + vinserti128 m15, [r6+16], 0 + ret +.transpose_fast: + call m(idct2_16x8_internal_10).transpose2 + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 + ret +ALIGN function_align +.main: + mova m0, [cq+64* 1] + mova m1, [cq+64* 3] + mova m2, [cq+64* 5] + mova m3, [cq+64* 7] + mova m4, [cq+64* 9] + mova m5, [cq+64*11] + mova m6, [cq+64*13] + mova m7, [cq+64*15] + call m(idct2_8x16_internal_10).main_oddhalf + mova m0, [cq+64* 0] + mova m1, [cq+64* 2] + mova m2, [cq+64* 4] + mova m3, [cq+64* 6] + mova m4, [cq+64* 8] + mova m5, [cq+64*10] + mova m6, [cq+64*12] + mova m7, [cq+64*14] + call m(idct2_8x8_internal_10).main + call m(idct2_8x16_internal_10).main_evenhalf + psrld m10, m11, 10 ; vvc_pd_2 + REPX {paddd x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 + ret + +INV_TXFM_16X16_FN adst, dct2 +INV_TXFM_16X16_FN adst, adst +INV_TXFM_16X16_FN adst, flipadst + +cglobal iadst_16x16_internal_10, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_18b_min] + vpbroadcastd 
m14, [clip_18b_max] +.pass1: + vpbroadcastd m15, [vvc_pd_64] + lea r6, [rsp+32*4] + sub eobd, 36 + jl .fast + add cq, 32 + call .main + sub cq, 32 + vpbroadcastd m8, [vvc_pd_5120] + paddd m4, m8 + paddd m6, m8 + paddd m9, m8 + paddd m11, m8 + vpbroadcastd m8, [vvc_pd_5119] + psubd m5, m8, m5 + psubd m7, m8, m7 + psubd m10, m8, m10 + psubd m12, m8, m12 + REPX {psrad x, 13}, m4, m5, m6, m7, m9, m10, m11, m12 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova [r6+32*2], m6 + mova [r6+32*3], m7 + psrld m4, m15, 10 ; vvc_pd_2 + paddd m0, m4 + psubd m1, m4, m1 + paddd m2, m4 + psubd m3, m4, m3 + psubd m7, m4, [r6-32*4] + paddd m6, m4, [r6-32*3] + psubd m5, m4, [r6-32*2] + paddd m4, [r6-32*1] + REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7 + mova [r6-32*4], m0 + mova [r6-32*3], m1 + mova [r6-32*2], m2 + mova [r6-32*1], m3 + add r6, 32*8 + mova [r6-32*4], m9 + mova [r6-32*3], m10 + mova [r6-32*2], m11 + mova [r6-32*1], m12 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova [r6+32*2], m6 + mova [r6+32*3], m7 +.fast: + add r6, 32*8 + call .main + vpbroadcastd m14, [vvc_pd_5120] + vpbroadcastd m13, [vvc_pd_5119] + psrld m15, 10 ; vvc_pd_2 + paddd m0, m15 + psubd m1, m15, m1 + paddd m2, m15 + psubd m3, m15, m3 + paddd m4, m14 + psubd m5, m13, m5 + paddd m6, m14 + psubd m7, m13, m7 + paddd m8, m14, m9 + psubd m9, m13, m10 + paddd m10, m14, m11 + psubd m11, m13, m12 + paddd m12, m15, [r6-32*1] + psubd m13, m15, [r6-32*2] + paddd m14, m15, [r6-32*3] + psubd m15, [r6-32*4] +.pass1_end: + REPX {psrad x, 2 }, m0, m1, m2, m3, m12, m13, m14, m15 + REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11 + sub r6, 32*8 + jmp tx2q +.pass2: + call m(idct2_16x16_internal_10).transpose + lea r6, [vvc_pw_5+128] + mova [rsp], m15 + call m(iadst_16x16_internal_8).main + call m(iadst_16x16_internal_8).main_pass2_end + mova [rsp+32*0], m8 + mova [rsp+32*2], m12 + mova [rsp+32*3], m13 + vpbroadcastd m12, [vvc_pw_2048] + pxor m13, m13 + psubw m13, m12 + pmulhrsw m0, m12 + pmulhrsw m1, m13, 
[rsp+32*1] + mova [rsp+32*1], m9 + pmulhrsw m2, m12 + pmulhrsw m3, m13 + call m(idct2_16x8_internal_10).write_16x4_start + pmulhrsw m0, m12, m4 + pmulhrsw m1, m13, m5 + pmulhrsw m2, m12, m6 + pmulhrsw m3, m13, m7 + call m(idct2_16x8_internal_10).write_16x4_zero + pmulhrsw m0, m12, [rsp+32*0] + pmulhrsw m1, m13, [rsp+32*1] + pmulhrsw m2, m12, m10 + pmulhrsw m3, m13, m11 + call m(idct2_16x8_internal_10).write_16x4_zero + pmulhrsw m0, m12, [rsp+32*2] + pmulhrsw m1, m13, [rsp+32*3] + pmulhrsw m2, m12, m14 + pmulhrsw m3, m13, m15 + call m(idct2_16x8_internal_10).write_16x4_zero + RET +ALIGN function_align +.main: + mova m0, [cq+64* 2] + mova m1, [cq+64*13] + mova m2, [cq+64* 6] + mova m3, [cq+64* 9] + mova m4, [cq+64*10] + mova m5, [cq+64* 5] + mova m6, [cq+64*14] + mova m7, [cq+64* 1] + vpbroadcastd m12, [vvc_pd_2048] + call m(iadst_16x8_internal_10).main_part1 + mova m0, [cq+64* 0] + mova m1, [cq+64*15] + mova m2, [cq+64* 4] + mova m3, [cq+64*11] + mova m4, [cq+64* 8] + mova m5, [cq+64* 7] + mova m6, [cq+64*12] + mova m7, [cq+64* 3] + jmp m(iadst_16x8_internal_10).main_part2 + +INV_TXFM_16X16_FN flipadst, dct2 +INV_TXFM_16X16_FN flipadst, adst +INV_TXFM_16X16_FN flipadst, flipadst + +cglobal iflipadst_16x16_internal_10, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_18b_min] + vpbroadcastd m14, [clip_18b_max] +.pass1: + vpbroadcastd m15, [vvc_pd_64] + lea r6, [rsp+32*4] + sub eobd, 36 + jl .fast + add cq, 32 + call m(iadst_16x16_internal_10).main + sub cq, 32 + vpbroadcastd m8, [vvc_pd_5120] + paddd m11, m8 + paddd m9, m8 + paddd m6, m8 + paddd m4, m8 + vpbroadcastd m8, [vvc_pd_5119] + psubd m12, m8, m12 + psubd m10, m8, m10 + psubd m7, m8, m7 + psubd m5, m8, m5 + REPX {psrad x, 13}, m12, m11, m10, m9, m7, m6, m5, m4 + mova [r6+32*0], m12 + mova [r6+32*1], m11 + mova [r6+32*2], m10 + mova [r6+32*3], m9 + psrld m9, m15, 10 ; vvc_pd_2 + psubd m3, m9, m3 + paddd m2, m9 + psubd m1, m9, m1 + paddd m0, m9 + psubd m12, m9, [r6-32*4] + paddd m11, m9, 
[r6-32*3] + psubd m10, m9, [r6-32*2] + paddd m9, [r6-32*1] + REPX {psrad x, 2 }, m12, m11, m10, m9, m3, m2, m1, m0 + mova [r6-32*4], m12 + mova [r6-32*3], m11 + mova [r6-32*2], m10 + mova [r6-32*1], m9 + add r6, 32*8 + mova [r6-32*4], m7 + mova [r6-32*3], m6 + mova [r6-32*2], m5 + mova [r6-32*1], m4 + mova [r6+32*0], m3 + mova [r6+32*1], m2 + mova [r6+32*2], m1 + mova [r6+32*3], m0 +.fast: + add r6, 32*8 + call m(iadst_16x16_internal_10).main + vpbroadcastd m14, [vvc_pd_5120] + vpbroadcastd m13, [vvc_pd_5119] + psrld m15, 10 ; vvc_pd_2 + psubd m8, m13, m7 + paddd m7, m14, m9 + paddd m9, m14, m6 + psubd m6, m13, m10 + psubd m10, m13, m5 + paddd m5, m14, m11 + paddd m11, m14, m4 + psubd m4, m13, m12 + psubd m12, m15, m3 + paddd m3, m15, [r6-32*1] + paddd m13, m15, m2 + psubd m2, m15, [r6-32*2] + psubd m14, m15, m1 + mova m1, m15 + paddd m15, m0 + psubd m0, m1, [r6-32*4] + paddd m1, [r6-32*3] + jmp m(iadst_16x16_internal_10).pass1_end +.pass2: + call m(idct2_16x16_internal_10).transpose + lea r6, [vvc_pw_5+128] + mova [rsp], m15 + call m(iadst_16x16_internal_8).main + call m(iadst_16x16_internal_8).main_pass2_end + mova [rsp+32*3], m3 + mova [rsp+32*2], m2 + mova [rsp+32*0], m0 + mova m2, m13 + mova m3, m12 + vpbroadcastd m12, [vvc_pw_2048] + pxor m13, m13 + psubw m13, m12 + pmulhrsw m0, m13, m15 + pmulhrsw m1, m12, m14 + pmulhrsw m2, m13 + pmulhrsw m3, m12 + mova m14, m8 + mova m15, m9 + call m(idct2_16x8_internal_10).write_16x4_start + pmulhrsw m0, m13, m11 + pmulhrsw m1, m12, m10 + pmulhrsw m2, m13, m15 + pmulhrsw m3, m12, m14 + call m(idct2_16x8_internal_10).write_16x4_zero + pmulhrsw m0, m13, m7 + pmulhrsw m1, m12, m6 + pmulhrsw m2, m13, m5 + pmulhrsw m3, m12, m4 + call m(idct2_16x8_internal_10).write_16x4_zero + pmulhrsw m0, m13, [rsp+32*3] + pmulhrsw m1, m12, [rsp+32*2] + pmulhrsw m2, m13, [rsp+32*1] + pmulhrsw m3, m12, [rsp+32*0] + call m(idct2_16x8_internal_10).write_16x4_zero + RET + +INV_TXFM_16X16_FN identity, dct2, -92 +INV_TXFM_16X16_FN identity, 
identity + +cglobal iidentity_16x16_internal_10, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m15, [vvc_pd_5793] + vpbroadcastd m7, [vvc_pd_5120] + lea r6, [rsp+32*4] + sub eobd, 36 + jl .fast + mov r3, -32*8*4 +.righthalf: + pmulld m0, m15, [cq+r3+32*33] + pmulld m1, m15, [cq+r3+32*35] + pmulld m2, m15, [cq+r3+32*37] + pmulld m3, m15, [cq+r3+32*39] + add r6, 32*4 + REPX {paddd x, m7}, m0, m1, m2, m3 + REPX {psrad x, 13}, m0, m1, m2, m3 + mova [r6+32*0], m0 + mova [r6+32*1], m1 + mova [r6+32*2], m2 + mova [r6+32*3], m3 + add r3, 32*8 + jl .righthalf +.fast: + pmulld m0, m15, [cq+64* 0] + pmulld m1, m15, [cq+64* 1] + pmulld m2, m15, [cq+64* 2] + pmulld m3, m15, [cq+64* 3] + pmulld m4, m15, [cq+64* 4] + pmulld m5, m15, [cq+64* 5] + pmulld m6, m15, [cq+64* 6] + pmulld m8, m15, [cq+64* 7] + mova [cq], m8 + pmulld m8, m15, [cq+64* 8] + pmulld m9, m15, [cq+64* 9] + pmulld m10, m15, [cq+64*10] + pmulld m11, m15, [cq+64*11] + pmulld m12, m15, [cq+64*12] + pmulld m13, m15, [cq+64*13] + pmulld m14, m15, [cq+64*14] + pmulld m15, [cq+64*15] + REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6, \ + m8, m9, m10, m11, m12, m13, m14, m15 + paddd m7, [cq] + REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: + call m(idct2_16x16_internal_10).transpose + + mova [cq+32*0], m15 + mova [cq+32*1], m0 + vpbroadcastd m15, [vvc_pw_1697x16] + + REPX {IDTX16 x, 0, 15}, 1, 2, 3, 4, 5, 6, 7, \ + 8, 9, 10, 11, 12, 13, 14 + mova m0, [cq+32*1] + mova [cq+32*1], m1 + IDTX16 0, 1, 15 + mova m1, [cq+32*0] + pmulhrsw m15, m1 + paddsw m1, m1 + paddsw m15, m1 + mova m1, [cq+32*1] + jmp m(idct2_16x16_internal_10).end + +INV_TXFM_16X16_FN dct2, dct2, 0, 12 +INV_TXFM_16X16_FN dct2, identity, 28, 12 +INV_TXFM_16X16_FN dct2, adst, 0, 12 +INV_TXFM_16X16_FN dct2, flipadst, 0, 12 + +cglobal idct2_16x16_internal_12, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + jmp 
m(idct2_16x16_internal_10).pass1 +.pass2: + mova [cq+32* 8], m8 + mova [cq+32* 9], m9 + mova [cq+32*10], m10 + mova [cq+32*11], m11 + mova [cq+32*12], m12 + mova [cq+32*13], m13 + mova [cq+32*14], m14 + mova [cq+32*15], m15 + call .pass2_main + packssdw m0, m1 + packssdw m1, m2, m3 + packssdw m2, m4, m5 + packssdw m3, m6, m7 + packssdw m4, m8, m9 + packssdw m5, m10, m11 + packssdw m6, m12, m13 + packssdw m7, m14, m15 + mova [r6-32*4], m0 + mova [r6-32*3], m1 + mova [r6-32*2], m2 + mova [r6-32*1], m3 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova [r6+32*2], m6 + mova [r6+32*3], m7 + mova m0, [cq+32* 8] + mova m1, [cq+32* 9] + mova m2, [cq+32*10] + mova m3, [cq+32*11] + mova m4, [cq+32*12] + mova m5, [cq+32*13] + mova m6, [cq+32*14] + mova m7, [cq+32*15] + mov r5, r6 + add r6, 32*16 + call .pass2_main + jmp m(iadst_16x16_internal_12).end +ALIGN function_align +.write_16x16: + mova [rsp+gprsize+32*0], m8 + mova [rsp+gprsize+32*1], m9 + mova [rsp+gprsize+32*2], m12 + vpbroadcastd m12, [vvc_pw_16384] + pmulhrsw m0, m12 + pmulhrsw m1, m12 + pmulhrsw m2, m12 + pmulhrsw m3, m12 + call m(idct2_16x8_internal_12).write_16x4_start + call m(idct2_16x8_internal_10).write_16x4_zero + jmp m(idct2_16x16_internal_10).write_16x16_2 +ALIGN function_align +.pass2_main: + call m(idct2_8x8_internal_12).transpose_8x8 + mova [cq+32* 0], m0 + mova [cq+32* 1], m2 + mova [cq+32* 2], m4 + mova [cq+32* 3], m6 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + pmaxsd m0, m12, m1 + pmaxsd m1, m12, m3 + pmaxsd m2, m12, m5 + pmaxsd m3, m12, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3 + test eobd, eobd + jge .pass2_slow + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + jmp .pass2_fast +.pass2_slow: + sub r6, 32*8 + mova m8, [r6-32*4] + mova m4, [r6-32*3] + mova m10, [r6-32*2] + mova m5, [r6-32*1] + mova m12, [r6+32*0] + mova m6, [r6+32*1] + mova m14, [r6+32*2] + mova m7, [r6+32*3] + TRANSPOSE_8X8_DWORD 8, 4, 10, 5, 12, 6, 14, 7, 9, 11, 13, 15 + mova [cq+32* 4], m8 + mova [cq+32* 
5], m10 + mova [cq+32* 6], m12 + mova [cq+32* 7], m14 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + REPX {pmaxsd x, m12}, m4, m5, m6, m7 + REPX {pminsd x, m13}, m4, m5, m6, m7 +.pass2_fast: + vpbroadcastd m11, [vvc_pd_2048] + vpbroadcastd m14, [vvc_pd_64] + call m(idct2_8x16_internal_10).main_oddhalf + pmaxsd m0, m12, [cq+32* 0] + pmaxsd m1, m12, [cq+32* 1] + pmaxsd m2, m12, [cq+32* 2] + pmaxsd m3, m12, [cq+32* 3] + REPX {pminsd x, m13}, m0, m1, m2, m3 + test eobd, eobd + jge .pass2_slow2 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + jmp .pass2_fast2 +.pass2_slow2: + pmaxsd m4, m12, [cq+32* 4] + pmaxsd m5, m12, [cq+32* 5] + pmaxsd m6, m12, [cq+32* 6] + pmaxsd m7, m12, [cq+32* 7] + REPX {pminsd x, m13}, m4, m5, m6, m7 +.pass2_fast2: + call m(idct2_8x8_internal_10).main + call m(idct2_8x16_internal_10).main_evenhalf + psrad m11, 8 ; vvc_pd_8 + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct2_16x8_internal_10).pass1_rotations + REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + ret + +INV_TXFM_16X16_FN adst, dct2, 0, 12 +INV_TXFM_16X16_FN adst, adst, 0, 12 +INV_TXFM_16X16_FN adst, flipadst, 0, 12 + +cglobal iadst_16x16_internal_12, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_20b_min] + vpbroadcastd m14, [clip_20b_max] + jmp m(iadst_16x16_internal_10).pass1 +.pass2: + call .pass2_part1 + call m(iadst_16x8_internal_10).pass1_rotations + call .pass2_part2 + call m(iadst_16x8_internal_10).pass1_rotations +.pass2_part3: + REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15 + REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11 +.end: + packssdw m15, m14 + packssdw m14, m13, m12 + packssdw m13, m11, m10 + packssdw m12, m9, m8 + packssdw m11, m7, m6 + packssdw m10, m5, m4 + packssdw m7, m3, m2 + packssdw m6, m1, m0 + vpblendd m0, m6, [r5-32*4], 0x33 + vpblendd m1, m6, [r5-32*4], 0xcc + vpblendd m2, m7, [r5-32*3], 0x33 + vpblendd m3, m7, [r5-32*3], 0xcc + vpermq 
m0, m0, q3120 + vpermq m1, m1, q2031 + vpermq m2, m2, q3120 + vpermq m3, m3, q2031 + call m(idct2_16x8_internal_12).write_16x4_start + call m(idct2_16x8_internal_10).write_16x4_zero + vpblendd m0, m10, [r5-32*2], 0x33 + vpblendd m1, m10, [r5-32*2], 0xcc + vpblendd m2, m11, [r5-32*1], 0x33 + vpblendd m3, m11, [r5-32*1], 0xcc + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + vpermq m2, m2, q3120 + vpermq m3, m3, q2031 + call m(idct2_16x8_internal_10).write_16x4_zero + vpblendd m0, m12, [r5+32*0], 0x33 + vpblendd m1, m12, [r5+32*0], 0xcc + vpblendd m2, m13, [r5+32*1], 0x33 + vpblendd m3, m13, [r5+32*1], 0xcc + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + vpermq m2, m2, q3120 + vpermq m3, m3, q2031 + call m(idct2_16x8_internal_10).write_16x4_zero + vpblendd m0, m14, [r5+32*2], 0x33 + vpblendd m1, m14, [r5+32*2], 0xcc + vpblendd m2, m15, [r5+32*3], 0x33 + vpblendd m3, m15, [r5+32*3], 0xcc + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + vpermq m2, m2, q3120 + vpermq m3, m3, q2031 + call m(idct2_16x8_internal_10).write_16x4_zero + RET +ALIGN function_align +.pass2_part1: + mova [cq+32* 8], m8 + mova [cq+32* 9], m9 + mova [cq+32*10], m10 + mova [cq+32*11], m11 + mova [cq+32*12], m12 + mova [cq+32*13], m13 + mova [cq+32*14], m14 + mova [cq+32*15], m15 +.pass2_main: + call m(idct2_8x8_internal_12).transpose_8x8 + mova [cq+32* 0], m0 + mova [cq+32* 1], m3 + mova [cq+32* 2], m4 + mova [cq+32* 3], m7 + vpbroadcastd m13, [clip_18b_min] + vpbroadcastd m14, [clip_18b_max] + pmaxsd m0, m13, m2 + pmaxsd m2, m13, m6 + pmaxsd m5, m13, m5 + pmaxsd m7, m13, m1 + REPX {pminsd x, m14}, m0, m2, m5, m7 + test eobd, eobd + jge .pass2_slow + pxor m1, m1 + REPX {mova x, m1}, m3, m4, m6 + jmp .pass2_fast +.pass2_slow: + sub r6, 32*8 + mova m8, [r6-32*4] + mova m3, [r6-32*3] + mova m4, [r6-32*2] + mova m11, [r6-32*1] + mova m12, [r6+32*0] + mova m1, [r6+32*1] + mova m6, [r6+32*2] + mova m15, [r6+32*3] + TRANSPOSE_8X8_DWORD 8, 3, 4, 11, 12, 1, 6, 15, 13, 9, 10, 14 + mova [cq+32* 4], m8 + mova 
[cq+32* 5], m11 + mova [cq+32* 6], m12 + mova [cq+32* 7], m15 + vpbroadcastd m13, [clip_18b_min] + vpbroadcastd m14, [clip_18b_max] + REPX {pmaxsd x, m13}, m1, m3, m4, m6 + REPX {pminsd x, m14}, m1, m3, m4, m6 +.pass2_fast: + vpbroadcastd m12, [vvc_pd_2048] + vpbroadcastd m15, [vvc_pd_64] + call m(iadst_16x8_internal_10).main_part1 + pmaxsd m0, m13, [cq+32* 0] ; 0 + pmaxsd m7, m13, [cq+32* 1] ; 3 + pmaxsd m2, m13, [cq+32* 2] ; 4 + pmaxsd m5, m13, [cq+32* 3] ; 7 + REPX {pminsd x, m14}, m0, m2, m5, m7 + test eobd, eobd + jge .pass2_slow2 + pxor m1, m1 + REPX {mova x, m1}, m3, m4, m6 + jmp .pass2_fast2 +.pass2_slow2: + pmaxsd m4, m13, [cq+32* 4] ; 8 + pmaxsd m3, m13, [cq+32* 5] ; 11 + pmaxsd m6, m13, [cq+32* 6] ; 12 + pmaxsd m1, m13, [cq+32* 7] ; 15 + REPX {pminsd x, m14}, m1, m3, m4, m6 +.pass2_fast2: + call m(iadst_16x8_internal_10).main_part2 + vpbroadcastd m14, [vvc_pd_17408] + psrld m15, 11 ; vvc_pd_1 + psubd m13, m14, m15 ; vvc_pd_17407 + pslld m15, 3 ; vvc_pd_8 + ret +ALIGN function_align +.pass2_part2: + REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15 + REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11 + packssdw m0, m1 + packssdw m1, m2, m3 + packssdw m2, m4, m5 + packssdw m3, m6, m7 + packssdw m4, m8, m9 + packssdw m5, m10, m11 + packssdw m6, m12, m13 + packssdw m7, m14, m15 + mova [r6-32*4], m0 + mova [r6-32*3], m1 + mova [r6-32*2], m2 + mova [r6-32*1], m3 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova [r6+32*2], m6 + mova [r6+32*3], m7 + mova m0, [cq+32* 8] + mova m1, [cq+32* 9] + mova m2, [cq+32*10] + mova m3, [cq+32*11] + mova m4, [cq+32*12] + mova m5, [cq+32*13] + mova m6, [cq+32*14] + mova m7, [cq+32*15] + mov r5, r6 + add r6, 32*16 + jmp .pass2_main + +INV_TXFM_16X16_FN flipadst, dct2, 0, 12 +INV_TXFM_16X16_FN flipadst, adst, 0, 12 +INV_TXFM_16X16_FN flipadst, flipadst, 0, 12 + +cglobal iflipadst_16x16_internal_12, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_20b_min] + vpbroadcastd m14, [clip_20b_max] + jmp 
m(iflipadst_16x16_internal_10).pass1 +.pass2: + call m(iadst_16x16_internal_12).pass2_part1 + call m(iflipadst_16x8_internal_10).pass1_rotations + call m(iadst_16x16_internal_12).pass2_part2 + call m(iflipadst_16x8_internal_10).pass1_rotations + jmp m(iadst_16x16_internal_12).pass2_part3 + +INV_TXFM_16X16_FN identity, dct2, -92, 12 +INV_TXFM_16X16_FN identity, identity, 0, 12 + +%macro IDTX16_12 1 ; src + pmulld m6, m7, m%1 + paddd m6, m15 + psrad m6, 12 + paddd m6, m%1 + psrad m%1, m6, 1 +%endmacro + +cglobal iidentity_16x16_internal_12, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m7, [vvc_pd_1697] + vpbroadcastd m15, [vvc_pd_5120] + lea r6, [rsp+32*4] + sub eobd, 36 + jl .fast + mov r3, -32*8*4 +.righthalf: + mova m10, [cq+r3+32*33] + mova m11, [cq+r3+32*35] + mova m12, [cq+r3+32*37] + mova m13, [cq+r3+32*39] + add r6, 32*4 + pmulld m0, m7, m10 + pmulld m1, m7, m11 + pmulld m2, m7, m12 + pmulld m3, m7, m13 + REPX {paddd x, m15}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 + paddd m0, m10 + paddd m1, m11 + paddd m2, m12 + paddd m3, m13 + REPX {psrad x, 1 }, m0, m1, m2, m3 + mova [r6+32*0], m0 + mova [r6+32*1], m1 + mova [r6+32*2], m2 + mova [r6+32*3], m3 + add r3, 32*8 + jl .righthalf +.fast: + mova m0, [cq+64* 0] + mova m1, [cq+64* 1] + mova m2, [cq+64* 2] + mova m3, [cq+64* 3] + mova m4, [cq+64* 4] + mova m5, [cq+64* 5] + mova m8, [cq+64* 6] + mova m9, [cq+64* 7] + REPX {IDTX16_12 x}, 0, 1, 2, 3, 4, 5, 8, 9 + mova [cq+64*0], m8 + mova [cq+64*1], m9 + mova m8, [cq+64* 8] + mova m9, [cq+64* 9] + mova m10, [cq+64*10] + mova m11, [cq+64*11] + mova m12, [cq+64*12] + mova m13, [cq+64*13] + mova m14, [cq+64*14] + REPX {IDTX16_12 x}, 8, 9, 10, 11, 12, 13, 14 + mova m6, [cq+64*15] + pmulld m7, m6 + paddd m7, m15 + psrad m7, 12 + paddd m7, m6 + mova m6, [cq+64*0] + psrad m15, m7, 1 + mova m7, [cq+64*1] + jmp tx2q +.pass2: + call m(iidentity_8x16_internal_12).pass2_main + call m(idct2_16x16_internal_10).transpose_fast + test eobd, eobd + jl 
.pass2_fast ; eobd negative: go straight to the write-out (NOTE(review): presumably eobd still holds eob minus 36 from pass 1 - confirm) + mova [cq+32* 8], m0 + mova [cq+32* 9], m1 + mova [cq+32*10], m2 + mova [cq+32*11], m3 + mova [cq+32*12], m4 + mova [cq+32*13], m5 + mova [cq+32*14], m6 + mova [cq+32*15], m7 ; m0-m7 are parked in the coefficient buffer while the second half is processed + mova m8, [r6-32*4] + mova m9, [r6-32*3] + mova m10, [r6-32*2] + mova m11, [r6-32*1] + mova m12, [r6+32*0] + mova m13, [r6+32*1] + mova m14, [r6+32*2] + mova m15, [r6+32*3] + sub r6, 32*8 + mova m0, [r6-32*4] + mova m1, [r6-32*3] + mova m2, [r6-32*2] + mova m3, [r6-32*1] + mova m4, [r6+32*0] + mova m5, [r6+32*1] + mova m6, [r6+32*2] + mova m7, [r6+32*3] + call m(iidentity_8x16_internal_12).pass2_main + call m(idct2_16x8_internal_10).transpose2 + mova m8, m0 + mova m9, m1 + mova m10, m2 + mova m11, m3 + mova m12, m4 + mova m13, m5 + mova m14, m6 + mova m15, m7 + mova m0, [cq+32* 8] + mova m1, [cq+32* 9] + mova m2, [cq+32*10] + mova m3, [cq+32*11] + mova m4, [cq+32*12] + mova m5, [cq+32*13] + mova m6, [cq+32*14] + mova m7, [cq+32*15] ; parked half reloaded into m0-m7, second half now in m8-m15 +.pass2_fast: ; common write-out for both paths + call m(idct2_16x16_internal_12).write_16x16 + RET + +%macro IDCT2_32_END 6-7 1 ; in/out1, out2, tmp[1-3], shift, pack (optional 7th argument, default 1); reads scratch at r6, r5 and r4 + mova m%4, [r6+32*(%1-4)] + mova m%2, [r5+32*(3-%1)] + mova m%5, [r4+32*(%1-4)] + psubd m%3, m%1, m%4 ; idct2_16 out15 - n + paddd m%1, m%4 ; idct2_16 out0 + n + pmaxsd m%1, m12 + pmaxsd m%3, m12 + pminsd m%1, m13 + pminsd m%3, m13 ; both intermediate values are clamped to [m12, m13] + paddd m%1, m11 + paddd m%3, m11 ; m11 is added before the final shift; the callers in this file set it up with psrld just before invoking the macro + psubd m%4, m%1, m%2 ; out31 - n + paddd m%1, m%2 ; out0 + n + paddd m%2, m%3, m%5 ; out15 - n + psubd m%3, m%5 ; out16 + n + REPX {psrad x, %6}, m%1, m%3, m%2, m%4 +%if %7 & 1 ; pack to words only when bit 0 of the pack argument is set; otherwise the four dword results stay in m%1, m%3, m%2, m%4 + packssdw m%1, m%3 ; out0 + n, out16 + n + packssdw m%2, m%4 ; out15 - n, out31 - n +%endif +%endmacro + +cglobal vvc_inv_dct2_dct2_8x32_10, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly ; eob == 0: DC-only shortcut, taken before any stack is reserved + PROLOGUE 0, 7, 16, 32*12, dst, stride, c, eob +%undef cmp + vpbroadcastd m11, [vvc_pd_2048] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + vbroadcasti128 m14, [idct2_32_shuf] + mov r4, cq ; remember the coefficient pointer; .pass1_main advances cq and .end restores it from r4 + call .pass1_main + mova [rsp+32*0], m2 + mova [rsp+32*1], m3 + cmp
eobd, 43 + jge .eob43 + pxor m4, m4 + REPX {mova x, m4}, [rsp+32*2], m2, m3, m11 + jmp .pass1_end_fast +.eob43: + lea r6, [rsp+32*8] + mova [r6-32*4], m0 + mova [r6-32*3], m1 + call .pass1_main + mova [rsp+32*2], m2 + cmp eobd, 107 + jge .eob107 + mova m11, m3 + mova m2, m0 + mova m3, m1 + mova m0, [r6-32*4] + mova m1, [r6-32*3] + pxor m4, m4 +.pass1_end_fast: + vpbroadcastd m10, [vvc_pw_2048] + lea r6, [deint_shuf+128] + REPX {mova x, m4}, m5, m6, m7 + call m(vvc_inv_dct2_dct2_8x32_8).main_fast + jmp .end +.eob107: + mova [rsp+32*3], m3 + mova [r6-32*2], m0 + mova [r6-32*1], m1 + call .pass1_main + cmp eobd, 171 + jge .eob171 + pshufd m12, m2, q1032 + pshufd m13, m3, q1032 + mova m4, m0 + mova m5, m1 + pxor m6, m6 + REPX {mova x, m6}, m7, m14, m15 + jmp .pass1_end +.eob171: + mova [r6+32*0], m0 + mova [r6+32*1], m1 + mova [r6+32*2], m2 + mova [r6+32*3], m3 + call .pass1_main + pshufd m12, [r6+32*2], q1032 ; out19 out17 + pshufd m13, [r6+32*3], q1032 ; out23 out21 + mova m4, [r6+32*0] ; out16 out18 + mova m5, [r6+32*1] ; out20 out22 + pshufd m14, m2, q1032 ; out27 out25 + pshufd m15, m3, q1032 ; out31 out29 + mova m6, m0 ; out24 out26 + mova m7, m1 ; out28 out30 +.pass1_end: + mova m0, [r6-32*4] ; out0 out2 + mova m1, [r6-32*3] ; out4 out6 + mova m2, [r6-32*2] ; out8 out10 + mova m3, [r6-32*1] ; out12 out14 + lea r6, [deint_shuf+128] + mova m11, [rsp+32*3] ; out13 out15 + vpbroadcastd m10, [vvc_pw_2048] + call m(vvc_inv_dct2_dct2_8x32_8).main +.end: ; [rsp+0*32] = m12 + vpbroadcastd m12, [vvc_pw_2048] + mov cq, r4 + mova [rsp+32*1], m8 + mova [rsp+32*2], m9 + mova [rsp+32*3], m10 + mova [rsp+32*4], m11 + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct2_8x8_internal_10).write_8x4_start + vpermq m0, m2, q3120 + vpermq m1, m3, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct2_8x8_internal_10).write_8x4 + vpermq m0, m4, q3120 + vpermq m1, m5, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call 
m(idct2_8x8_internal_10).write_8x4 + vpermq m0, m6, q3120 + vpermq m1, m7, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct2_8x8_internal_10).write_8x4 + vpermq m0, [rsp+32*1], q3120 + vpermq m1, [rsp+32*2], q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct2_8x8_internal_10).write_8x4 + vpermq m0, [rsp+32*3], q3120 + vpermq m1, [rsp+32*4], q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct2_8x8_internal_10).write_8x4 + vpermq m0, [rsp+32*0], q3120 + vpermq m1, m13, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct2_8x8_internal_10).write_8x4 + vpermq m0, m14, q3120 + vpermq m1, m15, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct2_8x8_internal_10).write_8x4 + RET +.dconly: + imul r6d, [cq], 181 + vpbroadcastd m2, [dconly_10] + mov [cq], eobd ; 0 + or r3d, 32 + add r6d, 640 + sar r6d, 10 + jmp m(vvc_inv_dct2_dct2_8x8_10).dconly3 +ALIGN function_align +.pass1_main_part1: + mova m0, [cq+128*0] + mova m1, [cq+128*1] + mova m2, [cq+128*2] + mova m3, [cq+128*3] + mova m4, [cq+128*4] + mova m5, [cq+128*5] + mova m6, [cq+128*6] + mova m7, [cq+128*7] + call m(idct2_8x8_internal_10).main + psrld m1, m11, 10 ; vvc_pd_2 + REPX {paddd x, m1}, m0, m6, m5, m3 + paddd m1, m6, m7 ; out1 + psubd m6, m7 ; out6 + psubd m7, m0, m9 ; out7 + paddd m0, m9 ; out0 + paddd m2, m5, m4 ; out2 + psubd m5, m4 ; out5 + psubd m4, m3, m8 ; out4 + paddd m3, m8 ; out3 + REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7 + ret +ALIGN function_align +.pass1_main: + call .pass1_main_part1 + add cq, 32 + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 + packssdw m6, m7 + pshufb m0, m14 + pshufb m2, m14 + pshufb m4, m14 + pshufb m6, m14 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckldq m2, m4, m6 + punpckhdq m4, m6 + vperm2i128 m1, m0, m2, 0x31 ; 4 6 + vinserti128 m0, xm2, 1 ; 0 2 + vinserti128 m2, m3, xm4, 1 ; 1 3 + vperm2i128 m3, m4, 0x31 ; 5 7 + ret +.main_oddhalf_part1_fast_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, 
m3 +.main_oddhalf_part1_fast: ; lower half zero + vpbroadcastd m7, [vvc_pd_90] + vpbroadcastd m8, [vvc_pd_4] + vpbroadcastd m6, [vvc_pd_m31] + vpbroadcastd m9, [vvc_pd_85] + vpbroadcastd m5, [vvc_pd_82] + vpbroadcastd m10, [vvc_pd_38] + vpbroadcastd m4, [vvc_pd_m61] + vpbroadcastd m15, [vvc_pd_67] + pmulld m7, m0 + pmulld m0, m8 + pmulld m6, m1 + pmulld m1, m9 + pmulld m5, m2 + pmulld m2, m10 + pmulld m4, m3 + pmulld m3, m15 + jmp .main_oddhalf_part1_fast2 +.main_oddhalf_part1_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 +.main_oddhalf_part1: ; in1, in7, in9, in15, in17, in23, in25, in31 + ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 4, 90 ; t16a, t31a + ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 85, 31 ; t19a, t28a + ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 38, 82 ; t18a, t29a + ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 67, 61 ; t17a, t30a +.main_oddhalf_part1_fast2: + REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3 + REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3 + psubd m8, m0, m4 ; t17 + paddd m0, m4 ; t16 + psubd m4, m6, m2 ; t18 + paddd m6, m2 ; t19 + psubd m2, m1, m5 ; t29 + paddd m1, m5 ; t28 + psubd m5, m7, m3 ; t30 + paddd m7, m3 ; t31 + REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7 + REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7 + vpbroadcastd m15, [vvc_pd_89] + vpbroadcastd m10, [vvc_pd_18] + ITX_MULSUB_2D 5, 8, 3, 9, _, 11, 10, 15, 0xc ; t17a, t30a + ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 0xe ; t29a, t18a + psubd m3, m0, m6 ; t19a + paddd m0, m6 ; t16a + psubd m6, m7, m1 ; t28a + paddd m7, m1 ; t31a + psubd m1, m5, m4 ; t18 + paddd m5, m4 ; t17 + psubd m4, m8, m2 ; t29 + paddd m8, m2 ; t30 + REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8 + REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8 + vpbroadcastd m15, [vvc_pd_83] + vpbroadcastd m10, [vvc_pd_36] + ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 0xc ; t18a, t29a + ITX_MULSUB_2D 6, 3, 2, 9, _, 11, 10, 15, 0xc ; t19, t28 + mova [r6-32*4], m0 
+ mova [r6-32*3], m5 + mova [r6-32*2], m4 + mova [r6-32*1], m6 + mova [r6+32*0], m3 + mova [r6+32*1], m1 + mova [r6+32*2], m8 + mova [r6+32*3], m7 + ret +.main_oddhalf_part2_fast_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 +.main_oddhalf_part2_fast: ; lower half zero + vpbroadcastd m7, [vvc_pd_m13] + vpbroadcastd m8, [vvc_pd_90] + vpbroadcastd m6, [vvc_pd_88] + vpbroadcastd m9, [vvc_pd_22] + vpbroadcastd m5, [vvc_pd_m46] + vpbroadcastd m10, [vvc_pd_78] + vpbroadcastd m4, [vvc_pd_73] + vpbroadcastd m15, [vvc_pd_54] + pmulld m7, m0 + pmulld m0, m8 + pmulld m6, m1 + pmulld m1, m9 + pmulld m5, m2 + pmulld m2, m10 + pmulld m4, m3 + pmulld m3, m15 + jmp .main_oddhalf_part2_fast2 +.main_oddhalf_part2_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 +.main_oddhalf_part2: ; in3, in5, in11, in13, in19, in21, in27, in29 + ITX_MULSUB_2D 7, 0, 8, 9, 10, _, 90, 13 ; t23a, t24a + ITX_MULSUB_2D 1, 6, 8, 9, 10, _, 22, 88 ; t20a, t27a + ITX_MULSUB_2D 5, 2, 8, 9, 10, _, 78, 46 ; t21a, t26a + ITX_MULSUB_2D 3, 4, 8, 9, 10, _, 54, 73 ; t22a, t25a +.main_oddhalf_part2_fast2: + REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3 + REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3 + psubd m8, m0, m4 ; t25 + paddd m0, m4 ; t24 + psubd m4, m6, m2 ; t26 + paddd m6, m2 ; t27 + psubd m2, m1, m5 ; t21 + paddd m1, m5 ; t20 + psubd m5, m7, m3 ; t22 + paddd m7, m3 ; t23 + REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7 + REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7 + vpbroadcastd m15, [vvc_pd_50] + vpbroadcastd m10, [vvc_pd_75] + ITX_MULSUB_2D 4, 2, 3, 9, _, 11, 10, 15, 0xc ; t21a, t26a + ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 0xe ; t25a, t22a + psubd m3, m0, m6 ; t27a + paddd m0, m6 ; t24a + psubd m6, m7, m1 ; t20a + paddd m7, m1 ; t23a + psubd m1, m5, m4 ; t21 + paddd m5, m4 ; t22 + psubd m4, m8, m2 ; t26 + paddd m8, m2 ; t25 + REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, 
m8 + REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8 + vpbroadcastd m15, [vvc_pd_83] + vpbroadcastd m10, [vvc_pd_36] + ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 0xe ; t26a, t21a + ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 0xe ; t27, t20 + mova m9, [r6-32*4] ; t16a + mova m10, [r6-32*3] ; t17 + psubd m2, m9, m7 ; t23 + paddd m9, m7 ; t16 + psubd m7, m10, m5 ; t22a + paddd m10, m5 ; t17a + REPX {pmaxsd x, m12}, m9, m10, m2, m7 + REPX {pminsd x, m13}, m9, m10, m2, m7 + mova [r6-32*4], m9 + mova [r6-32*3], m10 + mova m9, [r6-32*2] ; t18a + mova m10, [r6-32*1] ; t19 + psubd m5, m9, m1 ; t21 + paddd m9, m1 ; t18 + psubd m1, m10, m6 ; t20a + paddd m10, m6 ; t19a + REPX {pmaxsd x, m12}, m9, m10, m5, m1 + REPX {pminsd x, m13}, m9, m10, m5, m1 + mova [r6-32*2], m9 + mova [r6-32*1], m10 + mova m9, [r6+32*0] ; t28 + mova m10, [r6+32*1] ; t29a + psubd m6, m9, m3 ; t27a + paddd m9, m3 ; t28a + psubd m3, m10, m4 ; t26 + paddd m10, m4 ; t29 + REPX {pmaxsd x, m12}, m9, m10, m6, m3 + REPX {pminsd x, m13}, m9, m10, m6, m3 + REPX {pmulld x, m14}, m6, m3, m1, m5 + paddd m6, m11 + paddd m3, m11 + psubd m4, m6, m1 ; t20 + paddd m6, m1 ; t27 + psubd m1, m3, m5 ; t21a + paddd m3, m5 ; t26a + REPX {psrad x, 12 }, m4, m1, m3, m6 + mova [r6+32*0], m4 + mova [r6+32*1], m1 + mova m4, [r6+32*2] ; t30 + mova m1, [r6+32*3] ; t31a + psubd m5, m4, m8 ; t25a + paddd m4, m8 ; t30a + psubd m8, m1, m0 ; t24 + paddd m1, m0 ; t31 + REPX {pmaxsd x, m12}, m8, m5, m4, m1 + REPX {pminsd x, m13}, m8, m5, m4, m1 + REPX {pmulld x, m14}, m5, m8, m7, m2 + paddd m5, m11 + paddd m8, m11 + psubd m0, m5, m7 ; t22 + paddd m5, m7 ; t25 + psubd m7, m8, m2 ; t23a + paddd m2, m8 ; t24a + REPX {psrad x, 12 }, m0, m7, m2, m5 + mova [r6+32*2], m0 + mova [r6+32*3], m7 + mov r4, r6 + add r6, 32*8 + mova [r6-32*4], m2 + mova [r6-32*3], m5 + mova [r6-32*2], m3 + mova [r6-32*1], m6 + mova [r6+32*0], m9 + mova [r6+32*1], m10 + mova [r6+32*2], m4 + mova [r6+32*3], m1 + mov r5, r6 + add r6, 32*8 + ret +ALIGN function_align 
+.main_end: ; final stage via IDCT2_32_END with shift 2: outputs 16-31 are spilled to [r5-32*4 .. r5+32*3], outputs 0-15 stay in m0-m7 and fall through to .transpose + psrld m11, 10 ; vvc_pd_2 + IDCT2_32_END 0, 15, 8, 9, 10, 2 + IDCT2_32_END 1, 14, 8, 9, 10, 2 + punpckhwd m8, m0, m1 ; 16 17 + punpcklwd m0, m1 ; 0 1 + punpcklwd m1, m14, m15 ; 14 15 + punpckhwd m14, m15 ; 30 31 + mova [r5+32*3], m8 + mova [r5+32*2], m14 + IDCT2_32_END 2, 15, 8, 9, 10, 2 + IDCT2_32_END 3, 14, 8, 9, 10, 2 + punpckhwd m8, m2, m3 ; 18 19 + punpcklwd m2, m3 ; 2 3 + punpcklwd m3, m14, m15 ; 12 13 + punpckhwd m14, m15 ; 28 29 + mova [r5+32*1], m8 + mova [r5+32*0], m14 + IDCT2_32_END 4, 15, 8, 9, 10, 2 + IDCT2_32_END 5, 14, 8, 9, 10, 2 + punpckhwd m8, m4, m5 ; 20 21 + punpcklwd m4, m5 ; 4 5 + punpcklwd m5, m14, m15 ; 10 11 + punpckhwd m14, m15 ; 26 27 + mova [r5-32*1], m8 + mova [r5-32*2], m14 + IDCT2_32_END 6, 15, 8, 9, 10, 2 + IDCT2_32_END 7, 14, 8, 9, 10, 2 + punpckhwd m8, m6, m7 ; 22 23 + punpcklwd m6, m7 ; 6 7 + punpcklwd m7, m14, m15 ; 8 9 + punpckhwd m14, m15 ; 24 25 + mova [r5-32*3], m8 + mova [r5-32*4], m14 +.transpose: ; dword and qword unpacks followed by 128-bit lane swaps over m0-m7, with m15 as scratch; also called directly by the 32x8 and 32x16 functions + punpckhdq m15, m3, m1 + punpckldq m3, m1 + punpckhdq m1, m4, m6 + punpckldq m4, m6 + punpckhdq m6, m0, m2 + punpckldq m0, m2 + punpckhdq m2, m7, m5 + punpckldq m7, m5 + punpcklqdq m5, m2, m15 + punpckhqdq m2, m15 + punpckhqdq m15, m7, m3 + punpcklqdq m7, m3 + punpckhqdq m3, m6, m1 + punpcklqdq m6, m1 + punpckhqdq m1, m0, m4 + punpcklqdq m0, m4 + vperm2i128 m4, m0, m7, 0x31 + vinserti128 m0, xm7, 1 + vperm2i128 m7, m3, m2, 0x31 + vinserti128 m3, xm2, 1 + vinserti128 m2, m6, xm5, 1 + vperm2i128 m6, m5, 0x31 + vperm2i128 m5, m1, m15, 0x31 + vinserti128 m1, xm15, 1 + ret + +cglobal vvc_inv_identity_identity_8x32_10, 4, 7, 8, dst, stride, c, eob + vpbroadcastd m7, [pixel_10_max] ; upper clamp used by .main; the 12-bit function loads its own m7 and enters at .pass1 +.pass1: + vpbroadcastd m5, [vvc_pw_5] ; added to every packed coefficient before the >> 3 in .loop + pxor m6, m6 ; zero: lower clamp and the value written back over consumed coefficients + mov r6d, eobd + add eobb, 21 + cmovc eobd, r6d ; 43, 107, 171 -> 64, 128, 192 + lea r6, [strideq*3] + lea r5, [strideq*5] + lea r4, [strideq+r6*2] ; strideq*7 +.loop: ; each iteration loads the eight vectors at cq+128*k and packs them to words in pairs + mova m0, [cq+128*0] + packssdw m0, [cq+128*1] + mova m1, [cq+128*2] + packssdw m1, [cq+128*3] + mova m2, [cq+128*4] +
packssdw m2, [cq+128*5] + mova m3, [cq+128*6] + packssdw m3, [cq+128*7] + REPX {paddsw x, m5}, m0, m1, m2, m3 + REPX {psraw x, 3 }, m0, m1, m2, m3 ; (x plus m5) >> 3, with a saturating add + call .main_zero + add cq, 32 + lea dstq, [dstq+strideq*8] ; .main wrote 8 destination rows + sub eobd, 64 + jge .loop ; repeat until the adjusted eob goes negative + RET +ALIGN function_align +.main_zero: ; overwrite the eight consumed coefficient vectors with m6 (zero), then fall through + REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 +.main: ; word/qword interleave of m0-m3, add to 8 destination rows, clamp to [m6, m7], store; expects r6/r5/r4 = stride*3/5/7; also called by the 32x8 identity function + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m0, m4 + punpcklwd m0, m4 + punpckhwd m4, m2, m1 + punpcklwd m2, m1 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + mova xm4, [dstq+strideq*0] ; each ymm pairs destination row n (low lane) with row n plus 4 (high lane) + vinserti128 m4, [dstq+strideq*4], 1 + paddw m0, m4 + mova xm4, [dstq+strideq*1] + vinserti128 m4, [dstq+r5 ], 1 + paddw m1, m4 + mova xm4, [dstq+strideq*2] + vinserti128 m4, [dstq+r6*2 ], 1 + paddw m2, m4 + mova xm4, [dstq+r6 ] + vinserti128 m4, [dstq+r4 ], 1 + paddw m3, m4 + REPX {pmaxsw x, m6}, m0, m1, m2, m3 + REPX {pminsw x, m7}, m0, m1, m2, m3 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*4], m0, 1 + mova [dstq+strideq*1], xm1 + vextracti128 [dstq+r5 ], m1, 1 + mova [dstq+strideq*2], xm2 + vextracti128 [dstq+r6*2 ], m2, 1 + mova [dstq+r6 ], xm3 + vextracti128 [dstq+r4 ], m3, 1 + ret + +cglobal vvc_inv_dct2_dct2_8x32_12, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly ; eob == 0: DC-only shortcut, taken before any stack is reserved + PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob +%undef cmp + vpbroadcastd m11, [vvc_pd_2048] + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + mov r4, cq ; remember the coefficient pointer; .pass1_main advances cq by 32 per call + lea r6, [rsp+32*4] + call .pass1_main + cmp eobd, 43 + jge .eob43 + jmp .pass2_fast +.eob43: + call .pass1_main + cmp eobd, 107 + jge .eob107 +.pass2_fast: ; reached when eob is below 107 (one or two .pass1_main calls): reload cq, switch to the 18-bit clip range, use the *_fast helpers on the first two 32-byte columns + mov cq, r4 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + pmaxsd m0, m12, [cq+128*1+ 0] + pmaxsd m1, m12, [cq+128*7+ 0] + pmaxsd m2, m12, [cq+128*1+32] + pmaxsd m3, m12, [cq+128*7+32] + REPX {pminsd x, m13}, m0, m1, m2, m3 + vpbroadcastd m14, [vvc_pd_64] + call
m(vvc_inv_dct2_dct2_8x32_10).main_oddhalf_part1_fast + pmaxsd m0, m12, [cq+128*3+ 0] + pmaxsd m1, m12, [cq+128*5+ 0] + pmaxsd m2, m12, [cq+128*3+32] + pmaxsd m3, m12, [cq+128*5+32] + REPX {pminsd x, m13}, m0, m1, m2, m3 + call m(vvc_inv_dct2_dct2_8x32_10).main_oddhalf_part2_fast + pmaxsd m0, m12, [cq+128*2+ 0] + pmaxsd m1, m12, [cq+128*6+ 0] + pmaxsd m2, m12, [cq+128*2+32] + pmaxsd m3, m12, [cq+128*6+32] + REPX {pminsd x, m13}, m0, m1, m2, m3 + call m(idct2_8x16_internal_10).main_oddhalf_fast + pmaxsd m0, m12, [cq+128*0+ 0] + pmaxsd m1, m12, [cq+128*4+ 0] + pmaxsd m2, m12, [cq+128*0+32] + pmaxsd m3, m12, [cq+128*4+32] + REPX {pminsd x, m13}, m0, m1, m2, m3 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct2_8x8_internal_10).main + call m(idct2_8x16_internal_10).main_evenhalf + jmp .pass2_end +.eob107: + call .pass1_main + cmp eobd, 171 + jge .eob171 + jmp .pass2 +.eob171: + call .pass1_main +.pass2: + mov cq, r4 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + pmaxsd m0, m12, [cq+128*1+ 0] + pmaxsd m1, m12, [cq+128*7+ 0] + pmaxsd m2, m12, [cq+128*1+32] + pmaxsd m3, m12, [cq+128*7+32] + pmaxsd m4, m12, [cq+128*1+64] + pmaxsd m5, m12, [cq+128*7+64] + pmaxsd m6, m12, [cq+128*1+96] + pmaxsd m7, m12, [cq+128*7+96] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + vpbroadcastd m14, [vvc_pd_64] + call m(vvc_inv_dct2_dct2_8x32_10).main_oddhalf_part1 + pmaxsd m0, m12, [cq+128*3+ 0] + pmaxsd m1, m12, [cq+128*5+ 0] + pmaxsd m2, m12, [cq+128*3+32] + pmaxsd m3, m12, [cq+128*5+32] + pmaxsd m4, m12, [cq+128*3+64] + pmaxsd m5, m12, [cq+128*5+64] + pmaxsd m6, m12, [cq+128*3+96] + pmaxsd m7, m12, [cq+128*5+96] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(vvc_inv_dct2_dct2_8x32_10).main_oddhalf_part2 + pmaxsd m0, m12, [cq+128*2+ 0] + pmaxsd m1, m12, [cq+128*6+ 0] + pmaxsd m2, m12, [cq+128*2+32] + pmaxsd m3, m12, [cq+128*6+32] + pmaxsd m4, m12, [cq+128*2+64] + pmaxsd m5, m12, [cq+128*6+64] + pmaxsd m6, m12, [cq+128*2+96] + 
pmaxsd m7, m12, [cq+128*6+96] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct2_8x16_internal_10).main_oddhalf + pmaxsd m0, m12, [cq+128*0+ 0] + pmaxsd m1, m12, [cq+128*4+ 0] + pmaxsd m2, m12, [cq+128*0+32] + pmaxsd m3, m12, [cq+128*4+32] + pmaxsd m4, m12, [cq+128*0+64] + pmaxsd m5, m12, [cq+128*4+64] + pmaxsd m6, m12, [cq+128*0+96] + pmaxsd m7, m12, [cq+128*4+96] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct2_8x8_internal_10).main + call m(idct2_8x16_internal_10).main_evenhalf +.pass2_end: + psrld m11, 8 ; vvc_pd_8 + IDCT2_32_END 0, 15, 8, 9, 10, 4 + IDCT2_32_END 1, 14, 8, 9, 10, 4 + punpckhqdq m8, m0, m1 ; 16 17 (interleaved) + punpcklqdq m0, m1 ; 0 1 (interleaved) + punpcklqdq m1, m14, m15 ; 14 15 (interleaved) + punpckhqdq m14, m15 ; 30 31 (interleaved) + mova [r5+32*3], m8 + mova [r5+32*2], m14 + IDCT2_32_END 2, 15, 8, 9, 10, 4 + IDCT2_32_END 3, 14, 8, 9, 10, 4 + punpckhqdq m8, m2, m3 ; 18 19 (interleaved) + punpcklqdq m2, m3 ; 2 3 (interleaved) + punpcklqdq m3, m14, m15 ; 12 13 (interleaved) + punpckhqdq m14, m15 ; 28 29 (interleaved) + mova [r5+32*1], m8 + mova [r5+32*0], m14 + IDCT2_32_END 4, 15, 8, 9, 10, 4 + IDCT2_32_END 5, 14, 8, 9, 10, 4 + punpckhqdq m8, m4, m5 ; 20 21 (interleaved) + punpcklqdq m4, m5 ; 4 5 (interleaved) + punpcklqdq m5, m14, m15 ; 10 11 (interleaved) + punpckhqdq m14, m15 ; 26 27 (interleaved) + mova [r5-32*1], m8 + mova [r5-32*2], m14 + IDCT2_32_END 6, 15, 8, 9, 10, 4 + IDCT2_32_END 7, 14, 8, 9, 10, 4 + punpckhqdq m8, m6, m7 ; 22 23 (interleaved) + punpcklqdq m6, m7 ; 6 7 (interleaved) + punpcklqdq m7, m14, m15 ; 8 9 (interleaved) + punpckhqdq m14, m15 ; 24 25 (interleaved) + mova [r5-32*3], m8 + mova [r5-32*4], m14 + mova m15, m1 +.end: + vpermq m0, m0, q3120 + vpermq m1, m2, q3120 + call m(idct2_8x8_internal_12).write_8x4_start + call m(idct2_8x8_internal_10).write_8x4 + vpermq m0, m4, q3120 + vpermq m1, m6, q3120 + call m(idct2_8x8_internal_10).write_8x4 + vpermq m0, m7, q3120 + vpermq 
m1, m5, q3120 + call m(idct2_8x8_internal_10).write_8x4 + vpermq m0, m3, q3120 + vpermq m1, m15, q3120 + call m(idct2_8x8_internal_10).write_8x4 + vpermq m0, [r5+32*3], q3120 + vpermq m1, [r5+32*1], q3120 + call m(idct2_8x8_internal_10).write_8x4 + vpermq m0, [r5-32*1], q3120 + vpermq m1, [r5-32*3], q3120 + call m(idct2_8x8_internal_10).write_8x4 + vpermq m0, [r5-32*4], q3120 + vpermq m1, [r5-32*2], q3120 + call m(idct2_8x8_internal_10).write_8x4 + vpermq m0, [r5+32*0], q3120 + vpermq m1, [r5+32*2], q3120 + call m(idct2_8x8_internal_10).write_8x4 + RET +.dconly: + imul r6d, [cq], 181 + vpbroadcastd m2, [dconly_12] + mov [cq], eobd ; 0 + or r3d, 32 + add r6d, 640 + sar r6d, 10 + jmp m(vvc_inv_dct2_dct2_8x8_10).dconly3 +ALIGN function_align +.pass1_main: + call m(vvc_inv_dct2_dct2_8x32_10).pass1_main_part1 + TRANSPOSE_8X8_DWORD 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15 + mova [cq+128*0], m0 + mova [cq+128*1], m1 + mova [cq+128*2], m2 + mova [cq+128*3], m3 + mova [cq+128*4], m4 + mova [cq+128*5], m5 + mova [cq+128*6], m6 + mova [cq+128*7], m7 + add cq, 32 + ret +ALIGN function_align +.main_end: + psrld m11, 10 ; vvc_pd_2 + IDCT2_32_END 0, 15, 8, 9, 10, 2, 0 + mova [cq+32*16], m8 + mova [cq+32*31], m9 + IDCT2_32_END 1, 14, 8, 9, 10, 2, 0 + mova [cq+32*17], m8 + mova [cq+32*30], m9 + mova [cq+32*14], m14 + IDCT2_32_END 2, 14, 8, 9, 10, 2, 0 + mova [cq+32*18], m8 + mova [cq+32*29], m9 + mova [cq+32*13], m14 + IDCT2_32_END 3, 14, 8, 9, 10, 2, 0 + mova [cq+32*19], m8 + mova [cq+32*28], m9 + mova [cq+32*12], m14 + IDCT2_32_END 4, 14, 8, 9, 10, 2, 0 + mova [cq+32*20], m8 + mova [cq+32*27], m9 + mova [cq+32* 0], m0 + mova [cq+32* 1], m1 + mova [cq+32* 2], m2 + IDCT2_32_END 5, 10, 0, 1, 2, 2, 0 + mova [cq+32*21], m0 + mova [cq+32*26], m1 + IDCT2_32_END 6, 9, 0, 1, 2, 2, 0 + mova [cq+32*22], m0 + mova [cq+32*25], m1 + IDCT2_32_END 7, 8, 0, 1, 2, 2, 0 + mova [cq+32*23], m0 + mova [cq+32*24], m1 + mova m0, [cq+32* 0] + mova m1, [cq+32* 1] + mova m2, [cq+32* 2] + mova m11, m14 + mova 
m12, [cq+32*12] + mova m13, [cq+32*13] + mova m14, [cq+32*14] + ret + +cglobal vvc_inv_identity_identity_8x32_12, 4, 7, 8, dst, stride, c, eob ; 12-bit entry: only the clamp maximum in m7 differs, the body is shared with the 10-bit function + vpbroadcastd m7, [pixel_12_max] + jmp m(vvc_inv_identity_identity_8x32_10).pass1 + +cglobal vvc_inv_dct2_dct2_32x8_10, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jnz .full ; eob == 0 falls through to the DC-only path below + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10] + mov [cq], eobd ; 0 + or r3d, 8 ; r3d is the register behind eobd (zero on this path), so this sets the .dconly_loop row counter to 8 +.dconly: ; shared entry: the 12-bit 32x8 function jumps here with its own m3 + add r6d, 640 + sar r6d, 10 +.dconly2: ; also entered from vvc_inv_dct2_dct2_32x16_10 + imul r6d, 181 + add r6d, 2176 + sar r6d, 12 + movd xm0, r6d + paddsw xm0, xm3 + vpbroadcastw m0, xm0 ; every word of m0 = scaled DC plus m3 +.dconly_loop: ; per row: saturating add of m0 to 64 bytes of destination, then unsigned saturating subtract of m3 + paddsw m1, m0, [dstq+32*0] + paddsw m2, m0, [dstq+32*1] + psubusw m1, m3 + psubusw m2, m3 + mova [dstq+32*0], m1 + mova [dstq+32*1], m2 + add dstq, strideq + dec r3d + jg .dconly_loop + RET +.full: + PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob + lea r6, [rsp+32*4] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + call .pass1 + call m(vvc_inv_dct2_dct2_8x32_10).main_end + lea r6, [deint_shuf+128] + vpbroadcastd m11, [vvc_pw_2048] + mov r4, dstq ; keep the original destination for the second .pass2 call + call .pass2 + mova m0, [r5+32*3] ; 16 17 + mova m1, [r5+32*2] ; 30 31 + mova m2, [r5+32*1] ; 18 19 + mova m3, [r5+32*0] ; 28 29 + mova m4, [r5-32*1] ; 20 21 + mova m5, [r5-32*2] ; 26 27 + mova m6, [r5-32*3] ; 22 23 + mova m7, [r5-32*4] ; 24 25 + call m(vvc_inv_dct2_dct2_8x32_10).transpose + lea dstq, [r4+32] ; second half starts 32 bytes into each destination row + call .pass2 + RET +ALIGN function_align +.pass2: ; called once per 32-byte-wide half: 8-bit 16x8 helper, pmulhrsw scaling by m11, then the 10-bit write helpers (the last one as a tail jump) + call m(idct2_16x8_internal_8).main + REPX {pmulhrsw x, m11}, m0, m1, m2, m3 + call m(idct2_16x8_internal_10).write_16x4_start + pmulhrsw m0, m11, m4 + pmulhrsw m1, m11, m5 + pmulhrsw m2, m11, m6 + pmulhrsw m3, m11, m7 + jmp m(idct2_16x8_internal_10).write_16x4_zero +ALIGN function_align +.pass1: ; loads the 32 coefficient vectors at cq+32*n in four groups of eight and feeds them to the shared 10-bit helpers; also called by the 12-bit 32x8 function + mova m0, [cq+32* 1] + mova m1, [cq+32* 7] + mova m2, [cq+32* 9] + mova m3, [cq+32*15] + mova m4, [cq+32*17] + mova m5, [cq+32*23] + mova m6, [cq+32*25] + mova m7, [cq+32*31] + vpbroadcastd m11, [vvc_pd_2048] + vpbroadcastd m14, [vvc_pd_64] + call
m(vvc_inv_dct2_dct2_8x32_10).main_oddhalf_part1 ; fed with vectors 1, 7, 9, 15, 17, 23, 25, 31 + mova m0, [cq+32* 3] + mova m1, [cq+32* 5] + mova m2, [cq+32*11] + mova m3, [cq+32*13] + mova m4, [cq+32*19] + mova m5, [cq+32*21] + mova m6, [cq+32*27] + mova m7, [cq+32*29] + call m(vvc_inv_dct2_dct2_8x32_10).main_oddhalf_part2 ; fed with vectors 3, 5, 11, 13, 19, 21, 27, 29 + mova m0, [cq+32* 2] + mova m1, [cq+32* 6] + mova m2, [cq+32*10] + mova m3, [cq+32*14] + mova m4, [cq+32*18] + mova m5, [cq+32*22] + mova m6, [cq+32*26] + mova m7, [cq+32*30] + call m(idct2_8x16_internal_10).main_oddhalf ; fed with vectors 2, 6, 10, ..., 30 + mova m0, [cq+32* 0] + mova m1, [cq+32* 4] + mova m2, [cq+32* 8] + mova m3, [cq+32*12] + mova m4, [cq+32*16] + mova m5, [cq+32*20] + mova m6, [cq+32*24] + mova m7, [cq+32*28] + call m(idct2_8x8_internal_10).main ; fed with vectors 0, 4, 8, ..., 28 + call m(idct2_8x16_internal_10).main_evenhalf + ret + +cglobal vvc_inv_identity_identity_32x8_10, 4, 7, 8, dst, stride, c, eob + vpbroadcastd m7, [pixel_10_max] ; upper clamp used by the shared .main helper; the 12-bit function loads its own m7 and enters at .pass1 +.pass1: + vpbroadcastd m5, [vvc_pw_64] ; pmulhrsw multiplier applied to every packed coefficient + pxor m6, m6 ; zero: lower clamp and the value written back over consumed coefficients + mov r6d, eobd + add eobb, 21 + cmovc eobd, r6d ; if the byte add carried, restore the original eob saved in r6d + lea r6, [strideq*3] + lea r5, [strideq*5] + lea r4, [strideq+r6*2] ; strideq*7 +.loop: ; per iteration: eight vectors (32*8 bytes) are packed to words, zeroed in the buffer, scaled by m5 and written through the 8x32 identity .main helper + mova m0, [cq+32*0] + packssdw m0, [cq+32*1] + mova m1, [cq+32*2] + packssdw m1, [cq+32*3] + REPX {mova [cq+32*x], m6}, 0, 1, 2, 3 + add cq, 32*8 + mova m2, [cq-32*4] + packssdw m2, [cq-32*3] + mova m3, [cq-32*2] + packssdw m3, [cq-32*1] + REPX {pmulhrsw x, m5}, m0, m1, m2, m3 + REPX {mova [cq+32*x], m6}, -4, -3, -2, -1 + call m(vvc_inv_identity_identity_8x32_10).main + add dstq, 16 ; next 16-byte-wide column of the destination + sub eobd, 64 + jge .loop ; repeat until the adjusted eob goes negative + RET + +cglobal vvc_inv_dct2_dct2_32x8_12, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jnz .full ; eob == 0 falls through to the DC-only setup below + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_12] + mov [cq], eobd ; 0 + or r3d, 8 + jmp m(vvc_inv_dct2_dct2_32x8_10).dconly ; the DC tail is shared with the 10-bit function; only m3 differs +.full: + PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob + lea r6, [rsp+32*4] + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + call m(vvc_inv_dct2_dct2_32x8_10).pass1 ; same loads and helpers as the 10-bit function, run with the 20-bit clip range in m12/m13 + call m(vvc_inv_dct2_dct2_8x32_12).main_end + mov r4, dstq ; keep the original destination for the second pass2_main call + call
m(idct2_16x8_internal_12).pass2_main + mova m0, [cq+32* 0] ; 16 + mova m1, [cq+32* 1] ; 17 + mova m2, [cq+32* 2] ; 18 + mova m3, [cq+32* 3] ; 19 + mova m4, [cq+32* 4] ; 20 + mova m5, [cq+32* 5] ; 21 + mova m6, [cq+32* 6] ; 22 + mova m7, [cq+32* 7] ; 23 + mova m8, [cq+32* 8] ; 24 + mova m9, [cq+32* 9] ; 25 + mova m10, [cq+32*10] ; 26 + mova m11, [cq+32*11] ; 27 + mova m12, [cq+32*12] ; 28 + mova m13, [cq+32*13] ; 29 + mova m14, [cq+32*14] ; 30 + mova m15, [cq+32*15] ; 31 + lea dstq, [r4+32] ; second half starts 32 bytes into each destination row + call m(idct2_16x8_internal_12).pass2_main + RET + +cglobal vvc_inv_identity_identity_32x8_12, 4, 7, 8, dst, stride, c, eob ; 12-bit entry: only the clamp maximum in m7 differs, the body is shared with the 10-bit function + vpbroadcastd m7, [pixel_12_max] + jmp m(vvc_inv_identity_identity_32x8_10).pass1 + +%macro IDCT2_32_PASS2_END 6 ; coefs[1-2], tmp[1-2], offset[1-2]; expects m15 = pmulhrsw multiplier, m7 = upper clamp, r2 = second destination base + mova m%4, [%2] + paddsw m%3, m%1, m%4 + psubsw m%1, m%4 ; m%3 = saturating sum, m%1 = saturating difference +%if %1 == 0 + pxor m6, m6 ; only the invocation on coefficient register 0 zeroes m6, which every invocation then uses as the lower clamp +%endif + pmulhrsw m%3, m15 + pmulhrsw m%1, m15 + paddw m%3, [dstq+%5] ; the sum is added to the pixels at dstq plus offset 1 + paddw m%1, [r2+%6] ; the difference is added to the pixels at r2 plus offset 2 + pmaxsw m%3, m6 + pmaxsw m%1, m6 + pminsw m%3, m7 + pminsw m%1, m7 + mova [dstq+%5], m%3 + mova [r2+%6], m%1 +%endmacro + +cglobal vvc_inv_dct2_dct2_16x32_10, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly ; eob == 0: DC-only shortcut, taken before any stack is reserved + PROLOGUE 0, 8, 16, 32*36, dst, stride, c, eob +%undef cmp + vpbroadcastd m11, [vvc_pd_2048] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + vpbroadcastd m14, [vvc_pd_64] + lea r6, [rsp+32*16] + lea r4, [r6+32*8] + lea r5, [r6+32*16] ; r6, r4, r5 point into the stack frame at 32*8-byte intervals + call .main + sub eobd, 44 + jge .eob44 ; eob below 44: a single .main call was enough, continue on the .fast path + vperm2i128 m2, m0, m3, 0x31 ; 5 + vinserti128 m0, xm3, 1 ; 1 + vperm2i128 m3, m1, m4, 0x31 ; 7 + vinserti128 m1, xm4, 1 ; 3 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + REPX {mova [r6+32*x], m4}, 0, 1, 2, 3 + jmp .fast +.dconly: ; DC-only: coefficient * 181, rounded >> 8, * 181 again, then the shared 16x4 tail (NOTE(review): r3d = 32 is presumably that tail's row count - confirm) + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10] + mov [cq], eobd ; 0 + or r3d, 32 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + jmp m(vvc_inv_dct2_dct2_16x4_10).dconly2 +.eob44: ; save the first .main result at r4 as 128-bit halves before calling .main again + mova [r4+16*0], xm0 + mova [r4+16*1], xm3 + mova [r4+16*2], xm1 + mova [r4+16*3], xm4 + vextracti128 [r4+16*4],
m0, 1 + vextracti128 [r4+16*5], m3, 1 + vextracti128 [r4+16*6], m1, 1 + vextracti128 [r4+16*7], m4, 1 + call .main + sub eobd, 107 + jge .eob151 + vperm2i128 m7, m1, m4, 0x31 ; 15 + vinserti128 m5, m1, xm4, 1 ; 11 + vperm2i128 m6, m0, m3, 0x31 ; 13 + vinserti128 m4, m0, xm3, 1 ; 9 + mova m0, [r4+32*0] + mova m1, [r4+32*1] + mova m2, [r4+32*2] + mova m3, [r4+32*3] +.fast: + lea r6, [vvc_pw_5+128] + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf_fast + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 + jmp .idct2_16 +.eob151: + mova [r4-16*8], xm0 + mova [r4-16*7], xm3 + mova [r4-16*6], xm1 + mova [r4-16*5], xm4 + vextracti128 [r4-16*4], m0, 1 + vextracti128 [r4-16*3], m3, 1 + vextracti128 [r4-16*2], m1, 1 + vextracti128 [r4-16*1], m4, 1 + call .main + sub eobd, 128 + jge .eob279 + vperm2i128 m10, m0, m3, 0x31 ; 21 + vinserti128 m8, m0, xm3, 1 ; 17 + vperm2i128 m11, m1, m4, 0x31 ; 23 + vinserti128 m9, m1, xm4, 1 ; 19 + pxor m12, m12 + REPX {mova x, m12}, m13, m14, m15 + REPX {mova [r6+32*x], m12}, 0, 1, 2, 3 + jmp .full +.eob279: + mova [r5+16*0], xm0 + mova [r5+16*1], xm3 + mova [r5+16*2], xm1 + mova [r5+16*3], xm4 + vextracti128 [r5+16*4], m0, 1 + vextracti128 [r5+16*5], m3, 1 + vextracti128 [r5+16*6], m1, 1 + vextracti128 [r5+16*7], m4, 1 + call .main + vperm2i128 m14, m0, m3, 0x31 ; 29 + vinserti128 m12, m0, xm3, 1 ; 25 + vperm2i128 m15, m1, m4, 0x31 ; 31 + vinserti128 m13, m1, xm4, 1 ; 27 + mova m8, [r5+32*0] + mova m9, [r5+32*1] + mova m10, [r5+32*2] + mova m11, [r5+32*3] +.full: + mova m0, [r4+32*0] + mova m1, [r4+32*1] + mova m2, [r4+32*2] + mova m3, [r4+32*3] + mova m4, [r4-32*4] + mova m5, [r4-32*3] + mova m6, [r4-32*2] + mova m7, [r4-32*1] + lea r6, [vvc_pw_5 + 128] + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf + lea r3, [rsp+32*8] + mova m8, [r3+32*0] + mova m9, [r3+32*1] + mova m10, [r3+32*2] + mova m11, [r3+32*3] + mova m12, [r3-32*4] + mova m13, [r3-32*3] + mova m14, [r3-32*2] + mova m15, [r3-32*1] +.idct2_16: + lea r3, [rsp+32*16] 
+ mova m0, [r3+32*0] + mova m1, [r3+32*1] + mova m2, [r3+32*2] + mova m3, [r3+32*3] + mova m4, [r3-32*4] + mova m5, [r3-32*3] + mova m6, [r3-32*2] + mova m7, [r3-32*1] + mova [rsp], m15 + call m(idct2_16x16_internal_8).main + imul r2, strideq, 19 + lea r3, [strideq*3] + add r2, dstq + call .pass2_end + RET +ALIGN function_align +.main: + pmulld m0, m14, [cq+128* 1] + pmulld m1, m14, [cq+128* 3] + pmulld m2, m14, [cq+128* 5] + pmulld m3, m14, [cq+128* 7] + pmulld m4, m14, [cq+128* 9] + pmulld m5, m14, [cq+128*11] + pmulld m6, m14, [cq+128*13] + pmulld m7, m14, [cq+128*15] + call m(idct2_8x16_internal_10).main_oddhalf_rect2 + pmulld m0, m14, [cq+128* 0] + pmulld m1, m14, [cq+128* 2] + pmulld m2, m14, [cq+128* 4] + pmulld m3, m14, [cq+128* 6] + pmulld m4, m14, [cq+128* 8] + pmulld m5, m14, [cq+128*10] + pmulld m6, m14, [cq+128*12] + pmulld m7, m14, [cq+128*14] + call m(idct2_8x8_internal_10).main_rect2 + call m(idct2_8x16_internal_10).main_evenhalf + psrld m15, m11, 11 ; vvc_pd_1 + mova m8, [r6-32*4] + mova m9, [r6-32*3] + REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7 + psubd m10, m0, m8 ; out15 + paddd m0, m8 ; out0 + mova m8, [r6-32*2] + paddd m15, m1, m9 ; out1 + psubd m1, m9 ; out14 + mova m9, [r6-32*1] + REPX {psrad x, 1}, m0, m15, m10, m1 + packssdw m0, m15 + packssdw m1, m10 + psubd m10, m2, m8 ; out13 + paddd m2, m8 ; out2 + mova m8, [r6+32*0] + paddd m15, m3, m9 ; out3 + psubd m3, m9 ; out12 + mova m9, [r6+32*1] + REPX {psrad x, 1}, m2, m15, m10, m3 + packssdw m2, m15 + packssdw m3, m10 + psubd m10, m4, m8 ; out11 + paddd m4, m8 ; out4 + mova m8, [r6+32*2] + paddd m15, m5, m9 ; out5 + psubd m5, m9 ; out10 + mova m9, [r6+32*3] + REPX {psrad x, 1}, m4, m10, m15, m5 + packssdw m4, m15 + packssdw m5, m10 + psubd m10, m6, m8 ; out9 + paddd m6, m8 ; out6 + paddd m15, m7, m9 ; out7 + psubd m7, m9 ; out8 + REPX {psrad x, 1}, m6, m10, m15, m7 + packssdw m6, m15 + packssdw m7, m10 + punpckhwd m8, m0, m2 + punpcklwd m0, m2 + punpckhwd m2, m3, m1 + punpcklwd m3, m1 
+ punpckhwd m1, m4, m6 + punpcklwd m4, m6 + punpcklwd m6, m7, m5 + punpckhwd m7, m5 + pxor m5, m5 + mov r7d, 128*13 +.main_zero_loop: + mova [cq+r7-128*1], m5 + mova [cq+r7+128*0], m5 + mova [cq+r7+128*1], m5 + mova [cq+r7+128*2], m5 + sub r7d, 128*4 + jg .main_zero_loop + add cq, 32 + punpcklwd m5, m3, m2 + punpckhwd m3, m2 + punpcklwd m2, m4, m1 + punpckhwd m4, m1 + punpckhwd m1, m0, m8 + punpcklwd m0, m8 + punpckhwd m8, m6, m7 + punpcklwd m6, m7 + punpcklqdq m7, m1, m4 + punpckhqdq m1, m4 + punpckhqdq m4, m8, m3 + punpcklqdq m8, m3 + punpckhqdq m3, m6, m5 + punpcklqdq m6, m5 + punpcklqdq m5, m0, m2 + punpckhqdq m0, m2 + mova [r6+16*0], xm5 + mova [r6+16*1], xm6 + mova [r6+16*2], xm7 + mova [r6+16*3], xm8 + vextracti128 [r6+16*4], m5, 1 + vextracti128 [r6+16*5], m6, 1 + vextracti128 [r6+16*6], m7, 1 + vextracti128 [r6+16*7], m8, 1 + sub r6, 32*4 + ret +ALIGN function_align +.pass2_end: + mova [rsp+gprsize+32*0], m6 + mova [rsp+gprsize+32*2], m7 + mova [rsp+gprsize+32*3], m15 + vpbroadcastd m15, [vvc_pw_2048] + vpbroadcastd m7, [pixel_10_max] + IDCT2_32_PASS2_END 0, r5+32*3, 1, 6, strideq*0, r3*4 + IDCT2_32_PASS2_END 4, r5-32*1, 0, 1, strideq*4, strideq*8 + IDCT2_32_PASS2_END 8, r4+32*3, 0, 4, strideq*8, strideq*4 + IDCT2_32_PASS2_END 12, r4-32*1, 0, 4, r3*4, strideq*0 + add dstq, strideq + sub r2, strideq + mova m1, [rsp+gprsize+32*1] + IDCT2_32_PASS2_END 1, r5+32*2, 0, 4, strideq*0, r3*4 + IDCT2_32_PASS2_END 5, r5-32*2, 0, 4, strideq*4, strideq*8 + IDCT2_32_PASS2_END 9, r4+32*2, 0, 4, strideq*8, strideq*4 + IDCT2_32_PASS2_END 13, r4-32*2, 0, 4, r3*4, strideq*0 + add dstq, strideq + sub r2, strideq + mova m1, [rsp+gprsize+32*0] + IDCT2_32_PASS2_END 2, r5+32*1, 0, 4, strideq*0, r3*4 + IDCT2_32_PASS2_END 1, r5-32*3, 0, 4, strideq*4, strideq*8 + IDCT2_32_PASS2_END 10, r4+32*1, 0, 4, strideq*8, strideq*4 + IDCT2_32_PASS2_END 14, r4-32*3, 0, 4, r3*4, strideq*0 + add dstq, strideq + sub r2, strideq + mova m1, [rsp+gprsize+32*2] + mova m2, [rsp+gprsize+32*3] + 
IDCT2_32_PASS2_END 3, r5+32*0, 0, 4, strideq*0, r3*4 + IDCT2_32_PASS2_END 1, r5-32*4, 0, 4, strideq*4, strideq*8 + IDCT2_32_PASS2_END 11, r4+32*0, 0, 4, strideq*8, strideq*4 + IDCT2_32_PASS2_END 2, r4-32*4, 0, 4, r3*4, strideq*0 + ret + +cglobal vvc_inv_identity_identity_16x32_10, 4, 7, 12, dst, stride, c, eob + vpbroadcastd m7, [pixel_10_max] +.pass1: + vpbroadcastd m8, [vvc_pw_64x8] + vpbroadcastd m9, [vvc_pw_1697x16] + vpbroadcastd m11, [vvc_pw_8192] + lea r6, [strideq*5] + pxor m6, m6 + paddw m10, m11, m11 ; vvc_pw_16384 + mov r5, dstq + call .main + sub eobd, 36 + jl .ret + add cq, 128*8 + lea dstq, [r5+16] + call .main + sub cq, 128*8-32 + lea dstq, [r5+strideq*8] + mov r5, dstq + call .main + sub eobd, 107 ; eob < 143 + jl .ret + add cq, 128*8 + lea dstq, [r5+16] + call .main + sub cq, 128*8-32 + lea dstq, [r5+strideq*8] + mov r5, dstq + call .main + sub eobd, 128 ; eob < 271 + jl .ret + add cq, 128*8 + lea dstq, [r5+16] + call .main + sub cq, 128*8-32 + lea dstq, [r5+strideq*8] + mov r5, dstq + call .main + sub eobd, 128 ; eob < 399 + jl .ret + add cq, 128*8 + lea dstq, [r5+16] + call .main +.ret: + RET +ALIGN function_align +.main: + mova m0, [cq+128*0] + packssdw m0, [cq+128*1] + mova m1, [cq+128*2] + packssdw m1, [cq+128*3] + mova m2, [cq+128*4] + packssdw m2, [cq+128*5] + mova m3, [cq+128*6] + packssdw m3, [cq+128*7] + REPX {pmulhrsw x, m8 }, m0, m1, m2, m3 + REPX {IDTX16 x, 4, 9, 10}, 0, 1, 2, 3 + REPX {pmulhrsw x, m11}, m0, m1, m2, m3 + REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 +.main2: + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m0, m4 + punpcklwd m0, m4 + punpcklwd m4, m2, m1 + punpckhwd m2, m1 + punpckhqdq m1, m0, m4 + punpcklqdq m0, m4 + call m(iidentity_8x8_internal_10).write_2x8x2 + punpcklqdq m0, m3, m2 + punpckhqdq m1, m3, m2 + jmp m(iidentity_8x8_internal_10).write_2x8x2 + +cglobal vvc_inv_identity_identity_16x32_12, 4, 7, 12, dst, stride, c, eob + vpbroadcastd m7, 
[pixel_12_max] + jmp m(vvc_inv_identity_identity_16x32_10).pass1 + +cglobal vvc_inv_dct2_dct2_32x16_10, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 8, 16, 32*40, dst, stride, c, eob +%undef cmp + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + lea r6, [rsp+32*4] + call .main + cmp eobd, 36 + jge .full + call m(vvc_inv_dct2_dct2_8x32_10).transpose + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp] + lea r6, [vvc_pw_5+128] + mov r7, dstq + call m(idct2_16x16_internal_8).main + call .write_16x16 + mova m0, [r5+32*3] + mova m1, [r5+32*2] + mova m2, [r5+32*1] + mova m3, [r5+32*0] + mova m4, [r5-32*1] + mova m5, [r5-32*2] + mova m6, [r5-32*3] + mova m7, [r5-32*4] + call m(vvc_inv_dct2_dct2_8x32_10).transpose + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp] + jmp .end +.dconly: + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10] + mov [cq], eobd ; 0 + or r3d, 16 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 384 + sar r6d, 9 + jmp m(vvc_inv_dct2_dct2_32x8_10).dconly2 +.full: + add cq, 32 + mova [r4+32*3], m0 + mova [r4+32*2], m1 + mova [r4+32*1], m2 + mova [r4+32*0], m3 + mova [r4-32*1], m4 + mova [r4-32*2], m5 + mova [r4-32*3], m6 + mova [r4-32*4], m7 + call .main + sub r4, 32*16 ; topleft 16x8 + call .transpose_16x16 + lea r6, [vvc_pw_5+128] + mov r7, dstq + call m(idct2_16x16_internal_8).main + call .write_16x16 + mova m0, [r5+32*3] + mova m1, [r5+32*2] + mova m2, [r5+32*1] + mova m3, [r5+32*0] + mova m4, [r5-32*1] + mova m5, [r5-32*2] + mova m6, [r5-32*3] + mova m7, [r5-32*4] + add r4, 32*8 ; bottomleft 16x8 + call .transpose_16x16 +.end: + lea dstq, [r7+32] + call m(idct2_16x16_internal_8).main + call .write_16x16 + RET +ALIGN function_align +.transpose_16x16: + punpckhdq m8, m3, m1 + punpckldq m3, m1 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + punpckhdq m2, m7, m5 + punpckldq m7, m5 + punpckhdq m5, m4, m6 + punpckldq m4, m6 + punpckhqdq m6, m0, m4 + punpcklqdq m0, m4 
+ punpckhqdq m4, m1, m5 + punpcklqdq m1, m5 + punpckhqdq m5, m7, m3 + punpcklqdq m7, m3 + punpckhqdq m3, m2, m8 + punpcklqdq m2, m8 + vinserti128 m8, m0, xm7, 1 + vperm2i128 m12, m0, m7, 0x31 + vinserti128 m9, m6, xm5, 1 + vperm2i128 m13, m6, m5, 0x31 + vinserti128 m10, m1, xm2, 1 + vperm2i128 m14, m1, m2, 0x31 + vinserti128 m11, m4, xm3, 1 + vperm2i128 m15, m4, m3, 0x31 + mova m0, [r4+32*3] + mova m1, [r4+32*2] + mova m2, [r4+32*1] + mova m3, [r4+32*0] + mova m4, [r4-32*1] + mova m5, [r4-32*2] + mova m6, [r4-32*3] + mova m7, [r4-32*4] + mova [rsp+gprsize], m15 + jmp m(vvc_inv_dct2_dct2_8x32_10).transpose +ALIGN function_align +.main: + vpbroadcastd m14, [vvc_pd_64] + vpbroadcastd m11, [vvc_pd_2048] + pmulld m0, m14, [cq+64* 1] + pmulld m1, m14, [cq+64* 7] + pmulld m2, m14, [cq+64* 9] + pmulld m3, m14, [cq+64*15] + pmulld m4, m14, [cq+64*17] + pmulld m5, m14, [cq+64*23] + pmulld m6, m14, [cq+64*25] + pmulld m7, m14, [cq+64*31] + call m(vvc_inv_dct2_dct2_8x32_10).main_oddhalf_part1_rect2 + pmulld m0, m14, [cq+64* 3] + pmulld m1, m14, [cq+64* 5] + pmulld m2, m14, [cq+64*11] + pmulld m3, m14, [cq+64*13] + pmulld m4, m14, [cq+64*19] + pmulld m5, m14, [cq+64*21] + pmulld m6, m14, [cq+64*27] + pmulld m7, m14, [cq+64*29] + call m(vvc_inv_dct2_dct2_8x32_10).main_oddhalf_part2_rect2 + pmulld m0, m14, [cq+64* 2] + pmulld m1, m14, [cq+64* 6] + pmulld m2, m14, [cq+64*10] + pmulld m3, m14, [cq+64*14] + pmulld m4, m14, [cq+64*18] + pmulld m5, m14, [cq+64*22] + pmulld m6, m14, [cq+64*26] + pmulld m7, m14, [cq+64*30] + call m(idct2_8x16_internal_10).main_oddhalf_rect2 + pmulld m0, m14, [cq+64* 0] + pmulld m1, m14, [cq+64* 4] + pmulld m2, m14, [cq+64* 8] + pmulld m3, m14, [cq+64*12] + pmulld m4, m14, [cq+64*16] + pmulld m5, m14, [cq+64*20] + pmulld m6, m14, [cq+64*24] + pmulld m7, m14, [cq+64*28] + call m(idct2_8x8_internal_10).main_rect2 + call m(idct2_8x16_internal_10).main_evenhalf + pxor m8, m8 + mov r7d, 64*30 +.main_zero_loop: + mova [cq+r7-64*2], m8 + mova [cq+r7-64*1], m8 + 
mova [cq+r7+64*0], m8 + mova [cq+r7+64*1], m8 + sub r7d, 64*4 + jg .main_zero_loop +.main_end: + psrld m11, 11 ; vvc_pd_1 + IDCT2_32_END 0, 15, 8, 9, 10, 1 + IDCT2_32_END 1, 14, 8, 9, 10, 1 + punpckhwd m8, m0, m1 ; 16 17 + punpcklwd m0, m1 ; 0 1 + punpcklwd m1, m14, m15 ; 14 15 + punpckhwd m14, m15 ; 30 31 + mova [r5+32*3], m8 + mova [r5+32*2], m14 + IDCT2_32_END 2, 15, 8, 9, 10, 1 + IDCT2_32_END 3, 14, 8, 9, 10, 1 + punpckhwd m8, m2, m3 ; 18 19 + punpcklwd m2, m3 ; 2 3 + punpcklwd m3, m14, m15 ; 12 13 + punpckhwd m14, m15 ; 28 29 + mova [r5+32*1], m8 + mova [r5+32*0], m14 + IDCT2_32_END 4, 15, 8, 9, 10, 1 + IDCT2_32_END 5, 14, 8, 9, 10, 1 + punpckhwd m8, m4, m5 ; 20 21 + punpcklwd m4, m5 ; 4 5 + punpcklwd m5, m14, m15 ; 10 11 + punpckhwd m14, m15 ; 26 27 + mova [r5-32*1], m8 + mova [r5-32*2], m14 + IDCT2_32_END 6, 15, 8, 9, 10, 1 + IDCT2_32_END 7, 14, 8, 9, 10, 1 + punpckhwd m8, m6, m7 ; 22 23 + punpcklwd m6, m7 ; 6 7 + punpcklwd m7, m14, m15 ; 8 9 + punpckhwd m14, m15 ; 24 25 + mova [r5-32*3], m8 + mova [r5-32*4], m14 + ret +ALIGN function_align +.write_16x16: + mova m1, [rsp+gprsize+32*1] + mova [rsp+gprsize+32*0], m8 + mova [rsp+gprsize+32*1], m9 + mova [rsp+gprsize+32*2], m12 + vpbroadcastd m12, [vvc_pw_2048] + vpbroadcastd m9, [pixel_10_max] + lea r3, [strideq*3] + pxor m8, m8 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + pmulhrsw m2, m12 + pmulhrsw m3, m12 + call m(idct2_16x8_internal_10).write_16x4 + pmulhrsw m0, m12, m4 + pmulhrsw m1, m12, m5 + pmulhrsw m2, m12, m6 + pmulhrsw m3, m12, m7 + call m(idct2_16x8_internal_10).write_16x4 + pmulhrsw m0, m12, [rsp+gprsize+32*0] + pmulhrsw m1, m12, [rsp+gprsize+32*1] + pmulhrsw m2, m12, m10 + pmulhrsw m3, m12, m11 + call m(idct2_16x8_internal_10).write_16x4 + pmulhrsw m0, m12, [rsp+gprsize+32*2] + pmulhrsw m1, m12, m13 + pmulhrsw m2, m12, m14 + pmulhrsw m3, m12, m15 + jmp m(idct2_16x8_internal_10).write_16x4 + +cglobal vvc_inv_identity_identity_32x16_10, 4, 7, 11, dst, stride, c, eob + vpbroadcastd m7, [pixel_10_max] 
+.pass1: + vpbroadcastd m8, [vvc_pw_64x8] + vpbroadcastd m9, [vvc_pw_1697x16] + vpbroadcastd m10, [vvc_pw_64] + lea r6, [strideq*5] + pxor m6, m6 + mov r5, dstq + call .main + sub eobd, 36 + jl .ret + add cq, 32 + lea dstq, [dstq+strideq*4] + call .main + add cq, 64*8-32 + lea dstq, [r5+16*1] + call .main + sub eobd, 107 ; eob < 143 + jl .ret + add cq, 32 + lea dstq, [dstq+strideq*4] + call .main + add cq, 64*8-32 + lea dstq, [r5+16*2] + call .main + sub eobd, 128 ; eob < 271 + jl .ret + add cq, 32 + lea dstq, [dstq+strideq*4] + call .main + add cq, 64*8-32 + lea dstq, [r5+16*3] + call .main + sub eobd, 128 ; eob < 399 + jl .ret + add cq, 32 + lea dstq, [dstq+strideq*4] + call .main +.ret: + RET +ALIGN function_align +.main: + mova m0, [cq+64*0] + packssdw m0, [cq+64*1] + mova m1, [cq+64*2] + packssdw m1, [cq+64*3] + mova m2, [cq+64*4] + packssdw m2, [cq+64*5] + mova m3, [cq+64*6] + packssdw m3, [cq+64*7] + REPX {pmulhrsw x, m8 }, m0, m1, m2, m3 + REPX {paddsw x, x }, m0, m1, m2, m3 + REPX {IDTX16 x, 4, 9, _ }, 0, 1, 2, 3 + REPX {pmulhrsw x, m10}, m0, m1, m2, m3 + REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 + jmp m(vvc_inv_identity_identity_16x32_10).main2 + +cglobal vvc_inv_identity_identity_32x16_12, 4, 7, 11, dst, stride, c, eob + vpbroadcastd m7, [pixel_12_max] + jmp m(vvc_inv_identity_identity_32x16_10).pass1 + +cglobal vvc_inv_dct2_dct2_32x32_10, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 8, 16, 32*83, dst, stride, c, eob ; scratch must cover the rsp+32*71 bound used by .fast plus the r5 = r6+32*8 block of pass 2 +%undef cmp + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + lea r6, [rsp+32*7] + call .main + cmp eobd, 36 + jl .fast + call .main + cmp eobd, 136 + jl .fast + call .main + cmp eobd, 300 + jl .fast + call .main + jmp .pass2 +.dconly: + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10] + mov [cq], eobd ; 0 + or r3d, 32 + jmp m(vvc_inv_dct2_dct2_32x8_10).dconly +.fast: + lea r4, [rsp+32*71] + pxor m0, m0 +.fast_loop: + REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 + add r6, 
32*8 + cmp r6, r4 + jl .fast_loop +.pass2: + lea r3, [rsp+32*3] + mov r4, r6 + lea r5, [r6+32*8] + lea r6, [vvc_pw_5+128] + call .pass2_oddhalf + call .pass2_evenhalf + imul r2, strideq, 19 + lea r3, [strideq*3] + add r2, dstq + call m(vvc_inv_dct2_dct2_16x32_10).pass2_end + sub dstq, r3 + lea r2, [r2+r3+32] + add dstq, 32 + lea r3, [rsp+32*11] + call .pass2_oddhalf + call .pass2_evenhalf + lea r3, [strideq*3] + call m(vvc_inv_dct2_dct2_16x32_10).pass2_end + RET +ALIGN function_align +.main: + mova m0, [cq+128* 1] + mova m1, [cq+128* 7] + mova m2, [cq+128* 9] + mova m3, [cq+128*15] + mova m4, [cq+128*17] + mova m5, [cq+128*23] + mova m6, [cq+128*25] + mova m7, [cq+128*31] + vpbroadcastd m11, [vvc_pd_2048] + vpbroadcastd m14, [vvc_pd_64] + call m(vvc_inv_dct2_dct2_8x32_10).main_oddhalf_part1 + mova m0, [cq+128* 3] + mova m1, [cq+128* 5] + mova m2, [cq+128*11] + mova m3, [cq+128*13] + mova m4, [cq+128*19] + mova m5, [cq+128*21] + mova m6, [cq+128*27] + mova m7, [cq+128*29] + call m(vvc_inv_dct2_dct2_8x32_10).main_oddhalf_part2 + mova m0, [cq+128* 2] + mova m1, [cq+128* 6] + mova m2, [cq+128*10] + mova m3, [cq+128*14] + mova m4, [cq+128*18] + mova m5, [cq+128*22] + mova m6, [cq+128*26] + mova m7, [cq+128*30] + call m(idct2_8x16_internal_10).main_oddhalf + mova m0, [cq+128* 0] + mova m1, [cq+128* 4] + mova m2, [cq+128* 8] + mova m3, [cq+128*12] + mova m4, [cq+128*16] + mova m5, [cq+128*20] + mova m6, [cq+128*24] + mova m7, [cq+128*28] + call m(idct2_8x8_internal_10).main + call m(idct2_8x16_internal_10).main_evenhalf + call m(vvc_inv_dct2_dct2_8x32_10).main_end + pxor m15, m15 + mov r7d, 128*29 +.main_zero_loop: + mova [cq+r7-128*1], m15 + mova [cq+r7+128*0], m15 + mova [cq+r7+128*1], m15 + mova [cq+r7+128*2], m15 + sub r7d, 128*4 + jg .main_zero_loop + add cq, 32 + mova [r4-32*4], m0 + mova [r4-32*3], m1 + mova [r4-32*2], m2 + mova [r4-32*1], m3 + mova [r4+32*0], m4 + mova [r4+32*1], m5 + mova [r4+32*2], m6 + mova [r4+32*3], m7 + mova m0, [r5+32*3] + mova m1, 
[r5+32*2] + mova m2, [r5+32*1] + mova m3, [r5+32*0] + mova m4, [r5-32*1] + mova m5, [r5-32*2] + mova m6, [r5-32*3] + mova m7, [r5-32*4] + call m(vvc_inv_dct2_dct2_8x32_10).transpose + mova [r5-32*4], m0 + mova [r5-32*3], m1 + mova [r5-32*2], m2 + mova [r5-32*1], m3 + mova [r5+32*0], m4 + mova [r5+32*1], m5 + mova [r5+32*2], m6 + mova [r5+32*3], m7 + ret +ALIGN function_align +.pass2_oddhalf: + mova m0, [r3+32* 1] ; 1 + mova m1, [r3+32* 3] ; 3 + mova m2, [r3+32* 5] ; 5 + mova m3, [r3+32* 7] ; 7 + mova m4, [r3+32*17] ; 9 + mova m5, [r3+32*19] ; 11 + mova m6, [r3+32*21] ; 13 + mova m7, [r3+32*23] ; 15 + mova m8, [r3+32*33] ; 17 + mova m9, [r3+32*35] ; 19 + mova m10, [r3+32*37] ; 21 + mova m11, [r3+32*39] ; 23 + mova m12, [r3+32*49] ; 25 + mova m13, [r3+32*51] ; 27 + mova m14, [r3+32*53] ; 29 + mova m15, [r3+32*55] ; 31 + jmp m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf +ALIGN function_align +.pass2_evenhalf: + mova m0, [r3+32* 0] ; 0 + mova m1, [r3+32* 2] ; 2 + mova m2, [r3+32* 4] ; 4 + mova m3, [r3+32* 6] ; 6 + mova m4, [r3+32*16] ; 8 + mova m5, [r3+32*18] ; 10 + mova m6, [r3+32*20] ; 12 + mova m7, [r3+32*22] ; 14 + mova m8, [r3+32*32] ; 16 + mova m9, [r3+32*34] ; 18 + mova m10, [r3+32*36] ; 20 + mova m11, [r3+32*38] ; 22 + mova m12, [r3+32*48] ; 24 + mova m13, [r3+32*50] ; 26 + mova m14, [r3+32*52] ; 28 + mova m15, [r3+32*54] ; 30 + mova [rsp+gprsize], m15 + jmp m(idct2_16x16_internal_8).main + +cglobal vvc_inv_identity_identity_32x32_10, 4, 8, 8, dst, stride, c, eob +%undef cmp + vpbroadcastd m7, [pixel_10_max] +.pass1: + vpbroadcastd m5, [vvc_pw_8192] + pxor m6, m6 + lea r6, [strideq*3] + lea r5, [strideq*5] + lea r4, [strideq+r6*2] ; strideq*7 + call .main ; 0 + cmp eobd, 36 + jl .ret + add cq, 128*8 ; 0 1 + mov r7, dstq ; 1 + add dstq, 16 + call .main + call .main2 + cmp eobd, 136 + jl .ret + add cq, 128*16-32 ; 0 1 2 + lea dstq, [r7+16*2] ; 1 2 + call .main ; 2 + call .main2 + call .main2 + cmp eobd, 300 + jl .ret + add cq, 128*24-64 ; 0 1 2 3 + add r7, 16*3 ; 1 2 
3 + mov dstq, r7 ; 2 3 + call .main ; 3 + call .main2 + call .main2 + call .main2 + cmp eobd, 535 + jl .ret + add cq, 128*24-64 ; 0 1 2 3 + lea dstq, [r7+strideq*8] ; 1 2 3 4 + mov r7, dstq ; 2 3 4 + call .main ; 3 4 + call .main2 + call .main2 + cmp eobd, 755 + jl .ret + add cq, 128*16-32 ; 0 1 2 3 + lea dstq, [r7+strideq*8] ; 1 2 3 4 + call .main ; 2 3 4 5 + call .main2 ; 3 4 5 + cmp eobd, 911 + jl .ret + add cq, 128*8 ; 0 1 2 3 + add dstq, 16 ; 1 2 3 4 + call .main ; 2 3 4 5 +.ret: ; 3 4 5 6 + RET +ALIGN function_align +.main2: + sub cq, 128*8-32 + lea dstq, [dstq+strideq*8-16] +.main: + mova m0, [cq+128*0] + packssdw m0, [cq+128*1] + mova m1, [cq+128*2] + packssdw m1, [cq+128*3] + mova m2, [cq+128*4] + packssdw m2, [cq+128*5] + mova m3, [cq+128*6] + packssdw m3, [cq+128*7] + REPX {pmulhrsw x, m5}, m0, m1, m2, m3 + jmp m(vvc_inv_identity_identity_8x32_10).main_zero + +cglobal vvc_inv_identity_identity_32x32_12, 4, 8, 8, dst, stride, c, eob + vpbroadcastd m7, [pixel_12_max] + jmp m(vvc_inv_identity_identity_32x32_10).pass1 + +%macro IDCT2_64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4]) +%if %1 & 1 + mova m%5, [r5-32*(51-%1)] ; idct2_16 out 0+n + mova m%4, [r4-32*(14+%1)] ; idct2_32 out31-n +%else + mova m%5, [r4-32*(45-%1)] + mova m%4, [r5-32*(20+%1)] +%endif + paddsw m%6, m%5, m%4 ; idct2_32 out 0+n + psubsw m%5, m%4 ; idct2_32 out31-n + paddsw m%4, m%5, m%3 ; out31-n + psubsw m%5, m%3 ; out32+n + paddsw m%3, m%6, m%2 ; out 0+n + psubsw m%6, m%2 ; out63-n + REPX {pmulhrsw x, m14}, m%5, m%6, m%4, m%3 +%if %1 & 1 + %define %%d0 r2 + %define %%d1 dstq +%else + %define %%d0 dstq + %define %%d1 r2 +%endif + paddw m%3, [%%d0+%7 ] + paddw m%4, [%%d1+%8 ] + paddw m%5, [%%d0+%9 ] + paddw m%6, [%%d1+%10] + pxor m%2, m%2 + REPX {pmaxsw x, m%2}, m%3, m%4, m%5, m%6 + vpbroadcastd m%2, [pixel_10_max] + REPX {pminsw x, m%2}, m%3, m%4, m%5, m%6 + mova [%%d0+%7 ], m%3 + mova [%%d1+%8 ], m%4 + mova [%%d0+%9 ], m%5 + mova [%%d1+%10], m%6 +%endmacro + +cglobal 
vvc_inv_dct2_dct2_16x64_10, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 10, 16, 32*98, dst, stride, c, eob +%undef cmp + vpbroadcastd m11, [vvc_pd_2048] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + vpbroadcastd m14, [vvc_pd_64] + lea r6, [rsp+32*6] + call .main + sub eobd, 44 + jl .fast + call .main + sub eobd, 107 + jl .fast + call .main + sub eobd, 128 + jl .fast + call .main + jmp .pass2 +.dconly: + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10] + mov [cq], eobd ; 0 + or r3d, 64 + add r6d, 640 + sar r6d, 10 + jmp m(vvc_inv_dct2_dct2_16x4_10).dconly3 +.fast: + lea r4, [rsp+32*38] + pxor m0, m0 +.fast_loop: + REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 + add r6, 32*8 + cmp r6, r4 + jl .fast_loop +.pass2: + lea r6, [vvc_pw_5+128] + mova m0, [rsp+32* 2] ; in0 + mova m1, [rsp+32* 6] ; in4 + mova m2, [rsp+32*10] ; in8 + mova m3, [rsp+32*14] ; in12 + mova m4, [rsp+32*18] ; in16 + mova m5, [rsp+32*22] ; in20 + mova m6, [rsp+32*26] ; in24 + mova m7, [rsp+32*30] ; in28 + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 + call m(idct2_16x16_internal_8).main + mova m1, [rsp+32*1] + lea r4, [rsp+32*38] + mova [r4-32*4], m0 + mova [r4-32*3], m1 + mova [r4-32*2], m2 + mova [r4-32*1], m3 + mova [r4+32*0], m4 + mova [r4+32*1], m5 + mova [r4+32*2], m6 + mova [r4+32*3], m7 + add r4, 32*8 + mova [r4-32*4], m8 + mova [r4-32*3], m9 + mova [r4-32*2], m10 + mova [r4-32*1], m11 + mova [r4+32*0], m12 + mova [r4+32*1], m13 + mova [r4+32*2], m14 + mova [r4+32*3], m15 + mova m0, [rsp+32* 4] ; in2 + mova m1, [rsp+32* 8] ; in6 + mova m2, [rsp+32*12] ; in10 + mova m3, [rsp+32*16] ; in14 + mova m4, [rsp+32*20] ; in18 + mova m5, [rsp+32*24] ; in22 + mova m6, [rsp+32*28] ; in26 + mova m7, [rsp+32*32] ; in30 + lea r5, [r4+32*16] + add r4, 32*8 + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf_fast + mova m0, [rsp+32* 3] ; in1 + mova m1, [rsp+32*33] ; in31 + mova m2, [rsp+32*19] ; in17 + mova m3, 
[rsp+32*17] ; in15 + mova m4, [rsp+32*11] ; in9 + mova m5, [rsp+32*25] ; in23 + mova m6, [rsp+32*27] ; in25 + mova m7, [rsp+32* 9] ; in7 + lea r6, [idct2_64_mul - 8] + add r4, 32*16 + add r5, 32*32 + call m(vvc_inv_dct2_dct2_16x64_8).main_part1 + mova m0, [rsp+32* 7] ; in5 + mova m1, [rsp+32*29] ; in27 + mova m2, [rsp+32*23] ; in21 + mova m3, [rsp+32*13] ; in11 + mova m4, [rsp+32*15] ; in13 + mova m5, [rsp+32*21] ; in19 + mova m6, [rsp+32*31] ; in29 + mova m7, [rsp+32* 5] ; in3 + add r6, 8 + add r4, 32*8 + sub r5, 32*8 + call m(vvc_inv_dct2_dct2_16x64_8).main_part1 + lea r8, [strideq*4] + lea r9, [strideq*5] + lea r3, [r9+strideq*1] ; stride*6 + lea r7, [r9+strideq*2] ; stride*7 + call .main_part2_pass2 + RET +ALIGN function_align +.main: + mova m0, [cq+128* 1] + mova m1, [cq+128* 3] + mova m2, [cq+128* 5] + mova m3, [cq+128* 7] + mova m4, [cq+128* 9] + mova m5, [cq+128*11] + mova m6, [cq+128*13] + mova m7, [cq+128*15] + call m(idct2_8x16_internal_10).main_oddhalf + mova m0, [cq+128* 0] + mova m1, [cq+128* 2] + mova m2, [cq+128* 4] + mova m3, [cq+128* 6] + mova m4, [cq+128* 8] + mova m5, [cq+128*10] + mova m6, [cq+128*12] + mova m7, [cq+128*14] + call m(idct2_8x8_internal_10).main + call m(idct2_8x16_internal_10).main_evenhalf + pxor m15, m15 + mov r7d, 128*13 +.main_zero_loop: + mova [cq+r7-128*1], m15 + mova [cq+r7+128*0], m15 + mova [cq+r7+128*1], m15 + mova [cq+r7+128*2], m15 + sub r7d, 128*4 + jg .main_zero_loop + add cq, 32 + psrld m15, m11, 10 ; vvc_pd_2 + mova m8, [r6-32*4] + mova m9, [r6+32*3] + REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7 + psubd m10, m0, m8 ; out15 + paddd m0, m8 ; out0 + mova m8, [r6-32*3] + psubd m15, m7, m9 ; out8 + paddd m7, m9 ; out7 + mova m9, [r6+32*2] + REPX {psrad x, 2}, m0, m15, m10, m7 + packssdw m0, m15 + packssdw m7, m10 + psubd m10, m1, m8 ; out14 + paddd m1, m8 ; out1 + mova m8, [r6-32*2] + psubd m15, m6, m9 ; out9 + paddd m6, m9 ; out6 + mova m9, [r6+32*1] + REPX {psrad x, 2}, m1, m15, m10, m6 + packssdw m1, m15 + 
packssdw m6, m10 + psubd m10, m2, m8 ; out13 + paddd m2, m8 ; out2 + mova m8, [r6-32*1] + psubd m15, m5, m9 ; out10 + paddd m5, m9 ; out5 + mova m9, [r6+32*0] + REPX {psrad x, 2}, m2, m15, m10, m5 + packssdw m2, m15 + packssdw m5, m10 + psubd m10, m3, m8 ; out12 + paddd m3, m8 ; out3 + psubd m15, m4, m9 ; out11 + paddd m4, m9 ; out4 + REPX {psrad x, 2}, m3, m15, m10, m4 + packssdw m3, m15 + packssdw m4, m10 + call m(idct2_16x8_internal_10).transpose3 + mova [r6-32*4], m0 + mova [r6-32*3], m1 + mova [r6-32*2], m2 + mova [r6-32*1], m3 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova [r6+32*2], m6 + mova [r6+32*3], m7 + add r6, 32*8 + ret +.main_part2_pass2: + vpbroadcastd m11, [vvc_pw_36_83] + vpbroadcastd m12, [vvc_pw_m83_36] + vpbroadcastd m13, [vvc_pw_64_64] + lea r6, [vvc_pw_5+128] + lea r2, [dstq+r7] +.main_part2_pass2_loop: + vpbroadcastd m14, [vvc_pw_m64_64] + call m(vvc_inv_dct2_dct2_16x64_8).main_part2_internal + vpbroadcastd m14, [vvc_pw_2048] + IDCT2_64_PART2_END 0, 7, 0, 6, 9, 10, strideq*0, r3*4, r8*8, r7*8 + IDCT2_64_PART2_END 7, 8, 5, 0, 6, 7, strideq*0, r3*4, r8*8, r7*8 + IDCT2_64_PART2_END 8, 2, 1, 0, 6, 7, strideq*8, r8*4, r9*8, r3*8 + IDCT2_64_PART2_END 15, 3, 4, 0, 6, 7, strideq*8, r8*4, r9*8, r3*8 + add dstq, strideq + sub r2, strideq + cmp r4, r5 + jne .main_part2_pass2_loop + ret +ALIGN function_align +.main_part1_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 +.main_part1: ; idct2_64 steps 1-5 + ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a + ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a + ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a + ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a + vpbroadcastd m7, [r5+4*0] + vpbroadcastd m8, [r5+4*1] + vpbroadcastd m6, [r5+4*2] + vpbroadcastd m9, [r5+4*3] + vpbroadcastd m5, [r5+4*4] + vpbroadcastd m10, [r5+4*5] + vpbroadcastd m4, [r5+4*6] + vpbroadcastd m15, [r5+4*7] + pmulld m7, m0 ; t63a + pmulld m0, m8 ; t32a + pmulld m6, m1 ; t62a + pmulld m1, m9 ; t33a + pmulld 
m5, m2 ; t61a + pmulld m2, m10 ; t34a + pmulld m4, m3 ; t60a + pmulld m3, m15 ; t35a + vpbroadcastd m10, [r5+4*8] + vpbroadcastd m15, [r5+4*9] + REPX {paddd x, m11}, m7, m0, m6, m1, m5, m2, m4, m3 + REPX {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4 + psubd m8, m0, m1 ; t33 + paddd m0, m1 ; t32 + psubd m1, m7, m6 ; t62 + paddd m7, m6 ; t63 + psubd m6, m3, m2 ; t34 + paddd m3, m2 ; t35 + psubd m2, m4, m5 ; t61 + paddd m4, m5 ; t60 + REPX {pmaxsd x, m12}, m8, m1, m6, m2 + REPX {pminsd x, m13}, m8, m1, m6, m2 + ITX_MULSUB_2D 1, 8, 5, 9, _, 11, 10, 15, 0xc ; t33a, t62a + ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 0xe ; t61a, t34a + REPX {pmaxsd x, m12}, m0, m3, m7, m4 + REPX {pminsd x, m13}, m0, m3, m7, m4 + vpbroadcastd m10, [r5+4*10] + vpbroadcastd m15, [r5+4*11] + psubd m5, m0, m3 ; t35a + paddd m0, m3 ; t32a + psubd m3, m7, m4 ; t60a + paddd m7, m4 ; t63a + psubd m4, m1, m6 ; t34 + paddd m1, m6 ; t33 + psubd m6, m8, m2 ; t61 + paddd m8, m2 ; t62 + REPX {pmaxsd x, m12}, m5, m3, m4, m6 + REPX {pminsd x, m13}, m5, m3, m4, m6 + ITX_MULSUB_2D 3, 5, 2, 9, _, 11, 10, 15, 0xc ; t35, t60 + ITX_MULSUB_2D 6, 4, 2, 9, _, 11, 10, 15, 0xc ; t34a, t61a + REPX {pmaxsd x, m12}, m0, m7, m1, m8 + REPX {pminsd x, m13}, m0, m7, m1, m8 + add r5, 4*12 + mova [r6-32*4], m0 + mova [r6+32*3], m7 + mova [r6-32*3], m1 + mova [r6+32*2], m8 + mova [r6-32*2], m6 + mova [r6+32*1], m4 + mova [r6-32*1], m3 + mova [r6+32*0], m5 + add r6, 32*8 + ret +.main_part2: ; idct2_64 steps 6-9 + lea r5, [r6+32*3] + sub r6, 32*4 + vpbroadcastd m10, [vvc_pd_36] + vpbroadcastd m15, [vvc_pd_83] +.main_part2_loop: + mova m0, [r6-32*32] ; t32a + mova m1, [r5-32*24] ; t39a + mova m2, [r5-32*32] ; t63a + mova m3, [r6-32*24] ; t56a + mova m4, [r6-32*16] ; t40a + mova m5, [r5-32* 8] ; t47a + mova m6, [r5-32*16] ; t55a + mova m7, [r6-32* 8] ; t48a + psubd m8, m0, m1 ; t39 + paddd m0, m1 ; t32 + psubd m1, m2, m3 ; t56 + paddd m2, m3 ; t63 + psubd m3, m5, m4 ; t40 + paddd m5, m4 ; t47 + psubd m4, m7, m6 ; t55 + paddd m7, m6 
; t48 + REPX {pmaxsd x, m12}, m8, m1, m3, m4 + REPX {pminsd x, m13}, m8, m1, m3, m4 + ITX_MULSUB_2D 1, 8, 6, 9, _, 11, 10, 15, 0xc ; t39a, t56a + ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 0xe ; t55a, t40a + REPX {pmaxsd x, m12}, m0, m2, m5, m7 + REPX {pminsd x, m13}, m0, m5, m2, m7 + psubd m6, m2, m7 ; t48a + paddd m2, m7 ; t63a + psubd m7, m0, m5 ; t47a + paddd m0, m5 ; t32a + psubd m5, m8, m4 ; t55 + paddd m8, m4 ; t56 + psubd m4, m1, m3 ; t40 + paddd m1, m3 ; t39 + REPX {pmaxsd x, m12}, m6, m7, m5, m4 + REPX {pminsd x, m13}, m6, m7, m5, m4 + REPX {pmulld x, m14}, m6, m7, m5, m4 + REPX {pmaxsd x, m12}, m2, m0, m8, m1 + REPX {pminsd x, m13}, m2, m0, m8, m1 + paddd m6, m11 + paddd m5, m11 + psubd m3, m6, m7 ; t47 + paddd m6, m7 ; t48 + psubd m7, m5, m4 ; t40a + paddd m5, m4 ; t55a + REPX {psrad x, 12}, m3, m6, m7, m5 + mova [r5-32* 8], m2 + mova [r6-32*32], m0 + mova [r6-32* 8], m8 + mova [r5-32*32], m1 + mova [r5-32*24], m3 + mova [r6-32*16], m6 + mova [r6-32*24], m7 + mova [r5-32*16], m5 + add r6, 32 + sub r5, 32 + cmp r6, r5 + jl .main_part2_loop + ret + +cglobal vvc_inv_dct2_dct2_32x64_10, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 11, 16, 32*134, dst, stride, c, eob +%undef cmp + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + lea r6, [rsp+32*6] + call .main + cmp eobd, 36 + jl .fast + call .main + cmp eobd, 136 + jl .fast + call .main + cmp eobd, 300 + jl .fast + call .main + jmp .pass2 +.dconly: + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10] + mov [cq], eobd ; 0 + or r3d, 64 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 384 + sar r6d, 9 + jmp m(vvc_inv_dct2_dct2_32x8_10).dconly2 +.fast: + lea r4, [rsp+32*70] + pxor m0, m0 +.fast_loop: + REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 + add r6, 32*8 + cmp r6, r4 + jl .fast_loop +.pass2: + lea r6, [vvc_pw_5 + 128] + mov r10, rsp + lea r8, [strideq*4] + lea r9, [strideq*5] + lea r3, [r9+strideq*1] ; stride*6 + lea r7, [r9+strideq*2] ; 
stride*7 +.pass2_loop: + mova m0, [r10+32* 2] ; in0 + mova m1, [r10+32* 6] ; in4 + mova m2, [r10+32*18] ; in8 + mova m3, [r10+32*22] ; in12 + mova m4, [r10+32*34] ; in16 + mova m5, [r10+32*38] ; in20 + mova m6, [r10+32*50] ; in24 + mova m7, [r10+32*54] ; in28 + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 + call m(idct2_16x16_internal_8).main + mova m1, [rsp+32*1] + lea r4, [rsp+32*70] + mova [r4-32*4], m0 + mova [r4-32*3], m1 + mova [r4-32*2], m2 + mova [r4-32*1], m3 + mova [r4+32*0], m4 + mova [r4+32*1], m5 + mova [r4+32*2], m6 + mova [r4+32*3], m7 + add r4, 32*8 + mova [r4-32*4], m8 + mova [r4-32*3], m9 + mova [r4-32*2], m10 + mova [r4-32*1], m11 + mova [r4+32*0], m12 + mova [r4+32*1], m13 + mova [r4+32*2], m14 + mova [r4+32*3], m15 + mova m0, [r10+32* 4] ; in2 + mova m1, [r10+32* 8] ; in6 + mova m2, [r10+32*20] ; in10 + mova m3, [r10+32*24] ; in14 + mova m4, [r10+32*36] ; in18 + mova m5, [r10+32*40] ; in22 + mova m6, [r10+32*52] ; in26 + mova m7, [r10+32*56] ; in30 + lea r5, [r4+32*16] + add r4, 32*8 + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf_fast + mova m0, [r10+32* 3] ; in1 + mova m1, [r10+32*57] ; in31 + mova m2, [r10+32*35] ; in17 + mova m3, [r10+32*25] ; in15 + mova m4, [r10+32*19] ; in9 + mova m5, [r10+32*41] ; in23 + mova m6, [r10+32*51] ; in25 + mova m7, [r10+32* 9] ; in7 + lea r6, [idct2_64_mul - 8] + add r4, 32*16 + add r5, 32*32 + call m(vvc_inv_dct2_dct2_16x64_8).main_part1 + mova m0, [r10+32* 7] ; in5 + mova m1, [r10+32*53] ; in27 + mova m2, [r10+32*39] ; in21 + mova m3, [r10+32*21] ; in11 + mova m4, [r10+32*23] ; in13 + mova m5, [r10+32*37] ; in19 + mova m6, [r10+32*55] ; in29 + mova m7, [r10+32* 5] ; in3 + add r6, 8 + add r4, 32*8 + sub r5, 32*8 + call m(vvc_inv_dct2_dct2_16x64_8).main_part1 + call m(vvc_inv_dct2_dct2_16x64_10).main_part2_pass2 + add r10, 32*8 + sub r4, 32*98 ; rsp+32*16 + sub dstq, r8 + add dstq, 32 + cmp r10, r4 + jl .pass2_loop + RET +ALIGN function_align +.main: + vpbroadcastd m14, 
[vvc_pd_64] + vpbroadcastd m11, [vvc_pd_2048] + pmulld m0, m14, [cq+128* 1] + pmulld m1, m14, [cq+128* 7] + pmulld m2, m14, [cq+128* 9] + pmulld m3, m14, [cq+128*15] + pmulld m4, m14, [cq+128*17] + pmulld m5, m14, [cq+128*23] + pmulld m6, m14, [cq+128*25] + pmulld m7, m14, [cq+128*31] + call m(vvc_inv_dct2_dct2_8x32_10).main_oddhalf_part1_rect2 + pmulld m0, m14, [cq+128* 3] + pmulld m1, m14, [cq+128* 5] + pmulld m2, m14, [cq+128*11] + pmulld m3, m14, [cq+128*13] + pmulld m4, m14, [cq+128*19] + pmulld m5, m14, [cq+128*21] + pmulld m6, m14, [cq+128*27] + pmulld m7, m14, [cq+128*29] + call m(vvc_inv_dct2_dct2_8x32_10).main_oddhalf_part2_rect2 + pmulld m0, m14, [cq+128* 2] + pmulld m1, m14, [cq+128* 6] + pmulld m2, m14, [cq+128*10] + pmulld m3, m14, [cq+128*14] + pmulld m4, m14, [cq+128*18] + pmulld m5, m14, [cq+128*22] + pmulld m6, m14, [cq+128*26] + pmulld m7, m14, [cq+128*30] + call m(idct2_8x16_internal_10).main_oddhalf_rect2 + pmulld m0, m14, [cq+128* 0] + pmulld m1, m14, [cq+128* 4] + pmulld m2, m14, [cq+128* 8] + pmulld m3, m14, [cq+128*12] + pmulld m4, m14, [cq+128*16] + pmulld m5, m14, [cq+128*20] + pmulld m6, m14, [cq+128*24] + pmulld m7, m14, [cq+128*28] + pxor m15, m15 + mov r7d, 128*29 +.main_zero_loop: + mova [cq+r7-128*1], m15 + mova [cq+r7+128*0], m15 + mova [cq+r7+128*1], m15 + mova [cq+r7+128*2], m15 + sub r7d, 128*4 + jg .main_zero_loop + add cq, 32 + call m(idct2_8x8_internal_10).main_rect2 + call m(idct2_8x16_internal_10).main_evenhalf + call m(vvc_inv_dct2_dct2_32x16_10).main_end + call m(vvc_inv_dct2_dct2_8x32_10).transpose + mova [r4-32*4], m0 + mova [r4-32*3], m1 + mova [r4-32*2], m2 + mova [r4-32*1], m3 + mova [r4+32*0], m4 + mova [r4+32*1], m5 + mova [r4+32*2], m6 + mova [r4+32*3], m7 + mova m0, [r5+32*3] + mova m1, [r5+32*2] + mova m2, [r5+32*1] + mova m3, [r5+32*0] + mova m4, [r5-32*1] + mova m5, [r5-32*2] + mova m6, [r5-32*3] + mova m7, [r5-32*4] + call m(vvc_inv_dct2_dct2_8x32_10).transpose + mova [r5-32*4], m0 + mova [r5-32*3], m1 + mova 
[r5-32*2], m2 + mova [r5-32*1], m3 + mova [r5+32*0], m4 + mova [r5+32*1], m5 + mova [r5+32*2], m6 + mova [r5+32*3], m7 + ret + +cglobal vvc_inv_dct2_dct2_64x16_10, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jnz .normal + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 16 +.dconly: + add r6d, 640 + sar r6d, 10 +.dconly2: + vpbroadcastd m5, [dconly_10] + imul r6d, 181 + add r6d, 2176 + sar r6d, 12 + movd xm0, r6d + paddsw xm0, xm5 + vpbroadcastw m0, xm0 +.dconly_loop: + paddsw m1, m0, [dstq+32*0] + paddsw m2, m0, [dstq+32*1] + paddsw m3, m0, [dstq+32*2] + paddsw m4, m0, [dstq+32*3] + REPX {psubusw x, m5}, m1, m2, m3, m4 + mova [dstq+32*0], m1 + mova [dstq+32*1], m2 + mova [dstq+32*2], m3 + mova [dstq+32*3], m4 + add dstq, strideq + dec r3d + jg .dconly_loop + RET +.normal: + PROLOGUE 0, 8, 16, 32*96, dst, stride, c, eob +%undef cmp + vpbroadcastd m11, [vvc_pd_2048] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + vpbroadcastd m14, [vvc_pd_64] + lea r6, [rsp+32*4] + call .main + call .shift_transpose + cmp eobd, 36 + jl .fast + call .main + call .shift_transpose + jmp .pass2 +.fast: + pxor m0, m0 + mov r3d, 4 +.fast_loop: + REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 + add r6, 32*8 + dec r3d + jg .fast_loop +.pass2: + lea r7, [r6-32*64] + lea r4, [r6-32*32] + lea r6, [vvc_pw_5+128] + mov r5, dstq +.pass2_loop: + mova m0, [r7-32*4] + mova m1, [r7-32*3] + mova m2, [r7-32*2] + mova m3, [r7-32*1] + mova m4, [r7+32*0] + mova m5, [r7+32*1] + mova m6, [r7+32*2] + mova m7, [r7+32*3] + add r7, 32*32 + mova m8, [r7-32*4] + mova m9, [r7-32*3] + mova m10, [r7-32*2] + mova m11, [r7-32*1] + mova m12, [r7+32*0] + mova m13, [r7+32*1] + mova m14, [r7+32*2] + mova m15, [r7+32*3] + sub r7, 32*24 + mova [rsp], m15 + call m(idct2_16x16_internal_8).main + mova m1, [rsp+32*1] + call m(vvc_inv_dct2_dct2_32x16_10).write_16x16 + add r5, 32 + mov dstq, r5 + cmp r7, r4 + jl .pass2_loop + RET +ALIGN function_align +.main: + lea r5, [idct2_64_mul_16] + 
mova m0, [cq+64* 1] + mova m1, [cq+64*31] + mova m2, [cq+64*17] + mova m3, [cq+64*15] + call m(vvc_inv_dct2_dct2_16x64_10).main_part1 + mova m0, [cq+64* 7] + mova m1, [cq+64*25] + mova m2, [cq+64*23] + mova m3, [cq+64* 9] + call m(vvc_inv_dct2_dct2_16x64_10).main_part1 + mova m0, [cq+64* 5] + mova m1, [cq+64*27] + mova m2, [cq+64*21] + mova m3, [cq+64*11] + call m(vvc_inv_dct2_dct2_16x64_10).main_part1 + mova m0, [cq+64* 3] + mova m1, [cq+64*29] + mova m2, [cq+64*19] + mova m3, [cq+64*13] + call m(vvc_inv_dct2_dct2_16x64_10).main_part1 + call m(vvc_inv_dct2_dct2_16x64_10).main_part2 + mova m0, [cq+64* 2] + mova m1, [cq+64*14] + mova m2, [cq+64*18] + mova m3, [cq+64*30] + call m(vvc_inv_dct2_dct2_8x32_10).main_oddhalf_part1_fast + mova m0, [cq+64* 6] + mova m1, [cq+64*10] + mova m2, [cq+64*22] + mova m3, [cq+64*26] + call m(vvc_inv_dct2_dct2_8x32_10).main_oddhalf_part2_fast + mova m0, [cq+64* 4] + mova m1, [cq+64*12] + mova m2, [cq+64*20] + mova m3, [cq+64*28] + call m(idct2_8x16_internal_10).main_oddhalf_fast + mova m0, [cq+64* 0] + mova m1, [cq+64* 8] + mova m2, [cq+64*16] + mova m3, [cq+64*24] + pxor m15, m15 + mov r7d, 64*30 +.main_zero_loop: + mova [cq+r7-64*2], m15 + mova [cq+r7-64*1], m15 + mova [cq+r7+64*0], m15 + mova [cq+r7+64*1], m15 + sub r7d, 64*4 + jg .main_zero_loop +.main_end: + psrld m15, m11, 10 ; vvc_pd_2 +.main_end2: + add cq, 32 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct2_8x8_internal_10).main + add r6, 32*8 + call m(idct2_8x16_internal_10).main_evenhalf + mova [r6+32*2], m1 + mova [r6+32*1], m2 + mova [r6+32*0], m3 + mova [r6-32*1], m4 + mova [r6-32*2], m5 + mova [r6-32*3], m6 + mova [r6-32*4], m7 + jmp .main_end_loop_start +.main_end_loop: + mova m0, [r6+32* 3] ; idct2_8 0 + n +.main_end_loop_start: + mova m1, [r5+32* 4] ; idct2_16 15 - n + mova m2, [r5-32*12] ; idct2_32 16 + n + mova m3, [r6-32*13] ; idct2_32 31 - n + mova m4, [r6-32*29] ; idct2_64 63 - n + mova m5, [r5-32*28] ; idct2_64 48 + n + mova m6, [r6-32*45] ; 
idct2_64 47 - n + mova m7, [r5-32*44] ; idct2_64 32 + n + paddd m8, m0, m1 ; idct2_16 out0 + n + psubd m0, m1 ; idct2_16 out15 - n + REPX {pmaxsd x, m12}, m8, m0 + REPX {pminsd x, m13}, m8, m0 + paddd m1, m8, m3 ; idct2_32 out0 + n + psubd m8, m3 ; idct2_32 out31 - n + paddd m3, m0, m2 ; idct2_32 out15 - n + psubd m0, m2 ; idct2_32 out16 + n + REPX {pmaxsd x, m12}, m1, m8, m3, m0 + REPX {pminsd x, m13}, m1, m3, m8, m0 + REPX {paddd x, m15}, m1, m3, m0, m8 + paddd m2, m1, m4 ; idct2_64 out0 + n (unshifted) + psubd m1, m4 ; idct2_64 out63 - n (unshifted) + paddd m4, m3, m5 ; idct2_64 out15 - n (unshifted) + psubd m3, m5 ; idct2_64 out48 + n (unshifted) + paddd m5, m0, m6 ; idct2_64 out16 + n (unshifted) + psubd m0, m6 ; idct2_64 out47 - n (unshifted) + paddd m6, m8, m7 ; idct2_64 out31 - n (unshifted) + psubd m8, m7 ; idct2_64 out32 + n (unshifted) + mova [r5-32*44], m2 + mova [r6+32* 3], m1 + mova [r6-32*45], m4 + mova [r5+32* 4], m3 + mova [r5-32*28], m5 + mova [r6-32*13], m0 + mova [r6-32*29], m6 + mova [r5-32*12], m8 + add r5, 32 + sub r6, 32 + cmp r5, r6 + jl .main_end_loop + ret +.shift_transpose: +%macro IDCT2_64_SHIFT_TRANSPOSE 1 ; shift + sub r6, 32*48 + mov r5, r6 +%%loop: + mova m0, [r6-32* 4] + mova m4, [r6+32* 4] + mova m1, [r6-32* 3] + mova m5, [r6+32* 5] + mova m2, [r6-32* 2] + mova m6, [r6+32* 6] + mova m3, [r6-32* 1] + mova m7, [r6+32* 7] + REPX {psrad x, %1}, m0, m4, m1, m5, m2, m6, m3, m7 + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + mova m4, [r6+32* 0] + mova m6, [r6+32* 8] + mova m5, [r6+32* 1] + mova m7, [r6+32* 9] + REPX {psrad x, %1}, m4, m6, m5, m7 + packssdw m4, m6 + packssdw m5, m7 + mova m6, [r6+32* 2] + mova m8, [r6+32*10] + mova m7, [r6+32* 3] + mova m9, [r6+32*11] + REPX {psrad x, %1}, m6, m8, m7, m9 + packssdw m6, m8 + packssdw m7, m9 + call m(idct2_16x8_internal_10).transpose3 + mova [r5-32*4], m0 + mova [r5-32*3], m1 + mova [r5-32*2], m2 + mova [r5-32*1], m3 + mova [r5+32*0], m4 + mova [r5+32*1], m5 + mova 
[r5+32*2], m6 + mova [r5+32*3], m7 + add r6, 32*16 + add r5, 32*8 + cmp r5, r4 + jl %%loop + mov r6, r4 +%endmacro + IDCT2_64_SHIFT_TRANSPOSE 2 + ret + +cglobal vvc_inv_dct2_dct2_64x32_10, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 8, 16, 32*163, dst, stride, c, eob +%undef cmp + vpbroadcastd m11, [vvc_pd_2048] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + vpbroadcastd m14, [vvc_pd_64] + lea r6, [rsp+32*7] + call .main + cmp eobd, 36 + jl .fast + call .main + cmp eobd, 136 + jl .fast + call .main + cmp eobd, 300 + jl .fast + call .main + jmp .pass2 +.dconly: + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 32 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 384 + sar r6d, 9 + jmp m(vvc_inv_dct2_dct2_64x16_10).dconly2 +.fast: + pxor m0, m0 + lea r4, [rsp+32*135] +.fast_loop: + REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 + add r6, 32*8 + cmp r6, r4 + jl .fast_loop +.pass2: + lea r7, [r6-32*32] + lea r5, [r6+32*8] + lea r6, [vvc_pw_5+128] + imul r2, strideq, 19 + lea r3, [strideq*3] + add r2, dstq +.pass2_loop: + mova m0, [r7-32*99] + mova m1, [r7-32*97] + mova m2, [r7-32*95] + mova m3, [r7-32*93] + mova m4, [r7-32*67] + mova m5, [r7-32*65] + mova m6, [r7-32*63] + mova m7, [r7-32*61] + mova m8, [r7-32*35] + mova m9, [r7-32*33] + mova m10, [r7-32*31] + mova m11, [r7-32*29] + mova m12, [r7-32* 3] + mova m13, [r7-32* 1] + mova m14, [r7+32* 1] + mova m15, [r7+32* 3] + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf + mova m0, [r7-32*100] + mova m1, [r7-32*98] + mova m2, [r7-32*96] + mova m3, [r7-32*94] + mova m4, [r7-32*68] + mova m5, [r7-32*66] + mova m6, [r7-32*64] + mova m7, [r7-32*62] + mova m8, [r7-32*36] + mova m9, [r7-32*34] + mova m10, [r7-32*32] + mova m11, [r7-32*30] + mova m12, [r7-32* 4] + mova m13, [r7-32* 2] + mova m14, [r7+32* 0] + mova m15, [r7+32* 2] + add r7, 32*8 + mova [rsp], m15 + call m(idct2_16x16_internal_8).main + call m(vvc_inv_dct2_dct2_16x32_10).pass2_end + sub dstq, r3 
+ lea r2, [r2+r3+32] + add dstq, 32 + cmp r7, r4 + jl .pass2_loop + RET +ALIGN function_align +.main: + lea r5, [idct2_64_mul_16] + pmulld m0, m14, [cq+128* 1] + pmulld m1, m14, [cq+128*31] + pmulld m2, m14, [cq+128*17] + pmulld m3, m14, [cq+128*15] + call m(vvc_inv_dct2_dct2_16x64_10).main_part1_rect2 + pmulld m0, m14, [cq+128* 7] + pmulld m1, m14, [cq+128*25] + pmulld m2, m14, [cq+128*23] + pmulld m3, m14, [cq+128* 9] + call m(vvc_inv_dct2_dct2_16x64_10).main_part1_rect2 + pmulld m0, m14, [cq+128* 5] + pmulld m1, m14, [cq+128*27] + pmulld m2, m14, [cq+128*21] + pmulld m3, m14, [cq+128*11] + call m(vvc_inv_dct2_dct2_16x64_10).main_part1_rect2 + pmulld m0, m14, [cq+128* 3] + pmulld m1, m14, [cq+128*29] + pmulld m2, m14, [cq+128*19] + pmulld m3, m14, [cq+128*13] + call m(vvc_inv_dct2_dct2_16x64_10).main_part1_rect2 + call m(vvc_inv_dct2_dct2_16x64_10).main_part2 + pmulld m0, m14, [cq+128* 2] + pmulld m1, m14, [cq+128*14] + pmulld m2, m14, [cq+128*18] + pmulld m3, m14, [cq+128*30] + call m(vvc_inv_dct2_dct2_8x32_10).main_oddhalf_part1_fast_rect2 + pmulld m0, m14, [cq+128* 6] + pmulld m1, m14, [cq+128*10] + pmulld m2, m14, [cq+128*22] + pmulld m3, m14, [cq+128*26] + call m(vvc_inv_dct2_dct2_8x32_10).main_oddhalf_part2_fast_rect2 + pmulld m0, m14, [cq+128* 4] + pmulld m1, m14, [cq+128*12] + pmulld m2, m14, [cq+128*20] + pmulld m3, m14, [cq+128*28] + call m(idct2_8x16_internal_10).main_oddhalf_fast_rect2 + pmulld m0, m14, [cq+128* 0] + pmulld m1, m14, [cq+128* 8] + pmulld m2, m14, [cq+128*16] + pmulld m3, m14, [cq+128*24] + pxor m15, m15 + mov r7d, 128*29 +.main_zero_loop: + mova [cq+r7-128*1], m15 + mova [cq+r7+128*0], m15 + mova [cq+r7+128*1], m15 + mova [cq+r7+128*2], m15 + sub r7d, 128*4 + jg .main_zero_loop + psrld m15, m11, 11 ; vvc_pd_1 + REPX {paddd x, m11}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 + call m(vvc_inv_dct2_dct2_64x16_10).main_end2 + IDCT2_64_SHIFT_TRANSPOSE 1 + ret + +cglobal vvc_inv_dct2_dct2_64x64_10, 4, 7, 0, dst, stride, c, eob + 
test eobd, eobd + jz .dconly + PROLOGUE 0, 11, 16, 32*195, dst, stride, c, eob +%undef cmp + vpbroadcastd m11, [vvc_pd_2048] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + vpbroadcastd m14, [vvc_pd_64] + lea r6, [rsp+32*7] + call .main + cmp eobd, 36 + jl .fast + call .main + cmp eobd, 136 + jl .fast + call .main + cmp eobd, 300 + jl .fast + call .main + jmp .pass2 +.dconly: + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 64 + jmp m(vvc_inv_dct2_dct2_64x16_10).dconly +.fast: + pxor m0, m0 + lea r4, [rsp+32*135] +.fast_loop: + REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 + add r6, 32*8 + cmp r6, r4 + jl .fast_loop +.pass2: + lea r10, [r6-32*32] + lea r6, [vvc_pw_5+128] + lea r8, [strideq*4] + lea r9, [strideq*5] + lea r3, [r9+strideq*1] ; stride*6 + lea r7, [r9+strideq*2] ; stride*7 +.pass2_loop: + mova m0, [r10-32*100] ; in0 + mova m1, [r10-32*96] ; in4 + mova m2, [r10-32*68] ; in8 + mova m3, [r10-32*64] ; in12 + mova m4, [r10-32*36] ; in16 + mova m5, [r10-32*32] ; in20 + mova m6, [r10-32* 4] ; in24 + mova m7, [r10+32* 0] ; in28 + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 + call m(idct2_16x16_internal_8).main + mova m1, [rsp+32*1] + mova [r4-32*4], m0 + mova [r4-32*3], m1 + mova [r4-32*2], m2 + mova [r4-32*1], m3 + mova [r4+32*0], m4 + mova [r4+32*1], m5 + mova [r4+32*2], m6 + mova [r4+32*3], m7 + add r4, 32*8 + mova [r4-32*4], m8 + mova [r4-32*3], m9 + mova [r4-32*2], m10 + mova [r4-32*1], m11 + mova [r4+32*0], m12 + mova [r4+32*1], m13 + mova [r4+32*2], m14 + mova [r4+32*3], m15 + mova m0, [r10-32*98] ; in2 + mova m1, [r10-32*94] ; in6 + mova m2, [r10-32*66] ; in10 + mova m3, [r10-32*62] ; in14 + mova m4, [r10-32*34] ; in18 + mova m5, [r10-32*30] ; in22 + mova m6, [r10-32* 2] ; in26 + mova m7, [r10+32* 2] ; in30 + lea r5, [r4+32*16] + add r4, 32*8 + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf_fast + mova m0, [r10-32*99] ; in1 + mova m1, [r10+32* 3] ; in31 + mova m2, [r10-32*35] ; in17 + 
mova m3, [r10-32*61] ; in15 + mova m4, [r10-32*67] ; in9 + mova m5, [r10-32*29] ; in23 + mova m6, [r10-32* 3] ; in25 + mova m7, [r10-32*93] ; in7 + lea r6, [idct2_64_mul - 8] + add r4, 32*16 + add r5, 32*32 + call m(vvc_inv_dct2_dct2_16x64_8).main_part1 + mova m0, [r10-32*95] ; in5 + mova m1, [r10-32* 1] ; in27 + mova m2, [r10-32*31] ; in21 + mova m3, [r10-32*65] ; in11 + mova m4, [r10-32*63] ; in13 + mova m5, [r10-32*33] ; in19 + mova m6, [r10+32* 1] ; in29 + mova m7, [r10-32*97] ; in3 + add r6, 8 + add r4, 32*8 + sub r5, 32*8 + call m(vvc_inv_dct2_dct2_16x64_8).main_part1 + call m(vvc_inv_dct2_dct2_16x64_10).main_part2_pass2 + add r10, 32*8 + sub dstq, r8 + sub r4, 32*44 + add dstq, 32 + cmp r10, r4 + jl .pass2_loop + RET +ALIGN function_align +.main: + lea r5, [idct2_64_mul_16] + mova m0, [cq+128* 1] + mova m1, [cq+128*31] + mova m2, [cq+128*17] + mova m3, [cq+128*15] + call m(vvc_inv_dct2_dct2_16x64_10).main_part1 + mova m0, [cq+128* 7] + mova m1, [cq+128*25] + mova m2, [cq+128*23] + mova m3, [cq+128* 9] + call m(vvc_inv_dct2_dct2_16x64_10).main_part1 + mova m0, [cq+128* 5] + mova m1, [cq+128*27] + mova m2, [cq+128*21] + mova m3, [cq+128*11] + call m(vvc_inv_dct2_dct2_16x64_10).main_part1 + mova m0, [cq+128* 3] + mova m1, [cq+128*29] + mova m2, [cq+128*19] + mova m3, [cq+128*13] + call m(vvc_inv_dct2_dct2_16x64_10).main_part1 + call m(vvc_inv_dct2_dct2_16x64_10).main_part2 + mova m0, [cq+128* 2] + mova m1, [cq+128*14] + mova m2, [cq+128*18] + mova m3, [cq+128*30] + call m(vvc_inv_dct2_dct2_8x32_10).main_oddhalf_part1_fast + mova m0, [cq+128* 6] + mova m1, [cq+128*10] + mova m2, [cq+128*22] + mova m3, [cq+128*26] + call m(vvc_inv_dct2_dct2_8x32_10).main_oddhalf_part2_fast + mova m0, [cq+128* 4] + mova m1, [cq+128*12] + mova m2, [cq+128*20] + mova m3, [cq+128*28] + call m(idct2_8x16_internal_10).main_oddhalf_fast + mova m0, [cq+128* 0] + mova m1, [cq+128* 8] + mova m2, [cq+128*16] + mova m3, [cq+128*24] + pxor m15, m15 + mov r7d, 128*29 +.main_zero_loop: + mova 
[cq+r7-128*1], m15 + mova [cq+r7+128*0], m15 + mova [cq+r7+128*1], m15 + mova [cq+r7+128*2], m15 + sub r7d, 128*4 + jg .main_zero_loop + call m(vvc_inv_dct2_dct2_64x16_10).main_end + jmp m(vvc_inv_dct2_dct2_64x16_10).shift_transpose + +%endif ; ARCH_X86_64 diff --git a/libavcodec/x86/vvc_itx_8bit.asm b/libavcodec/x86/vvc_itx_8bit.asm new file mode 100644 index 00000000000..2c9fdb54546 --- /dev/null +++ b/libavcodec/x86/vvc_itx_8bit.asm @@ -0,0 +1,5545 @@ +; Copyright © 2023, Frank Plowman +; Copyright © 2018-2021, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +%include "libavutil/x86/x86util.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 16 + +; Note: The order of (at least some of) those constants matter! + +const deint_shuf, db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 + +%macro COEF_PAIR 2 +vvc_pw_%1_%2: dw %1, %2 +vvc_pw_m%2_%1: dw -%2, %1 +%endmacro + +; ADST-only +vvc_pw_3803_1321: dw 3803, 1321 +vvc_pw_m1321_2482: dw -1321, 2482 +vvc_pw_2482_3344: dw 2482, 3344 +vvc_pw_m3344_3344: dw -3344, 3344 +vvc_pw_m3803_3344: dw -3803, 3344 +vvc_pw_m3803_m6688: dw -3803, -6688 +vvc_pw_64_m64: dw 64, -64 + +const vvc_pw_5, times 2 dw 5 +const vvc_pw_2048, times 2 dw 2048 +const vvc_pw_64, times 2 dw 64 +const vvc_pw_8192, times 2 dw 8192 +const vvc_pw_16384, times 2 dw 16384 +const vvc_pw_1697x16, times 2 dw 1697*16 +const vvc_pw_1697x8, times 2 dw 1697*8 +const vvc_pw_64x8, times 2 dw 64*8 +const vvc_pd_64, dd 64 +const vvc_pd_512, dd 512 +const vvc_pd_2048, dd 2048 + +const vvc_pw_64_64, dw 64, 64 +const vvc_pw_m64_64, dw -64, 64 +const vvc_pw_36_83, dw 36, 83 +const vvc_pw_m83_36, dw -83, 36 +COEF_PAIR 83, 36 +COEF_PAIR 4, 90 +COEF_PAIR 22, 88 +COEF_PAIR 38, 82 +COEF_PAIR 54, 73 +COEF_PAIR 67, 61 +COEF_PAIR 78, 46 +COEF_PAIR 85, 31 +COEF_PAIR 90, 13 +COEF_PAIR 9, 90 +COEF_PAIR 43, 80 +COEF_PAIR 70, 57 +COEF_PAIR 87, 25 +COEF_PAIR 18, 89 +COEF_PAIR 75, 50 +vvc_pw_m18_m89: dw -18, -89 +const vvc_pw_m36_m83, dw -36, -83 +vvc_pw_m75_m50: dw -75, -50 +vvc_pw_m9_m90: dw -9, -90 +vvc_pw_m70_m57: dw -70, -57 +vvc_pw_m43_m80: dw -43, -80 +vvc_pw_m87_m25: dw -87, -25 +COEF_PAIR 50, 75 +COEF_PAIR 89, 18 + +; Emits each argument twice as a word, pre-multiplied by 8. +%macro COEF_X8 1-* +%rep %0 + dw %1*8, %1*8 + %rotate 1 +%endrep +%endmacro + +; Single coefficients x8; the label encodes the value (an n/m prefix marks a +; negative one), so label and value must agree. +vvc_pw_82x8: COEF_X8 82 +vvc_pw_38x8: COEF_X8 38 +vvc_pw_n31x8: COEF_X8 -31 +vvc_pw_85x8: COEF_X8 85 +vvc_pw_88x8: COEF_X8 88 +vvc_pw_22x8: COEF_X8 22 +vvc_pw_m46x8: COEF_X8 -46 +vvc_pw_78x8: COEF_X8 78 +vvc_pw_73x8: COEF_X8 73 +vvc_pw_54x8: COEF_X8 54 +vvc_pw_n13x8: COEF_X8 -13 +vvc_pw_90x8: COEF_X8 90 + +const idct2_64_mul +COEF_X8 91, 2, 
90, 11, 65, -62, 71, -56 +COEF_X8 83, 37, 79, 44, 84, -33, 87, -24 +COEF_X8 88, 20, 86, 28, 77, -48, 81, -41 +COEF_X8 73, 52, 69, 59, 90, -15, 90, -7 + +; Coefficient pairs x8; as elsewhere, the label encodes both values. +vvc_pw_4_90x8: dw 4*8, 90*8 +vvc_pw_n13_90x8: dw -13*8, 90*8 +vvc_pw_22_88x8: dw 22*8, 88*8 +vvc_pw_n31_85x8: dw -31*8, 85*8 +vvc_pw_38_82x8: dw 38*8, 82*8 +vvc_pw_m46_78x8: dw -46*8, 78*8 +vvc_pw_54_73x8: dw 54*8, 73*8 +vvc_pw_m61_67x8: dw -61*8, 67*8 + +%define o_idct2_64_offset idct2_64_mul - (o_base) - 8 + +SECTION .text + +; Code size reduction trickery: Instead of using rip-relative loads with +; mandatory 4-byte offsets everywhere, we can set up a base pointer with a +; single rip-relative lea and then address things relative from that with +; 1-byte offsets as long as data is within +-128 bytes of the base pointer. +%define o_base deint_shuf + 128 +%define o(x) (r6 - (o_base) + (x)) +%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) + +; flags: 1 = swap, 2 = interleave, 4: coef_regs +%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags +%if %7 & 4 + pmaddwd m%2, m%5, m%1 + pmaddwd m%1, m%6 +%else +%if %7 & 1 + vpbroadcastd m%2, [o(vvc_pw_%5_%6)] + vpbroadcastd m%3, [o(vvc_pw_m%6_%5)] +%else + vpbroadcastd m%2, [o(vvc_pw_m%6_%5)] + vpbroadcastd m%3, [o(vvc_pw_%5_%6)] +%endif + pmaddwd m%2, m%1 + pmaddwd m%1, m%3 +%endif + paddd m%2, m%4 + paddd m%1, m%4 +%if %7 & 2 + ; Same 7-bit scaling as the packssdw path below: move bits 7..22 of each + ; dword into the high word of m%2 and the low word of m%1, then merge them. + pslld m%2, 9 + psrld m%1, 7 + pblendw m%1, m%2, 0xaa +%else + psrad m%2, 7 + psrad m%1, 7 + packssdw m%1, m%2 +%endif +%endmacro + +; flags: 1 = swap, 2 = interleave, 4 = coef_regs +%macro ITX_MUL4X_PACK 9-10 0 ; dst/src, tmp[1-3], rnd, coef[1-4], flags +%if %10 & 1 + vpbroadcastd m%3, [o(vvc_pw_%8_%9)] + vpbroadcastd m%4, [o(vvc_pw_m%9_%8)] + vpbroadcastd xm%2, [o(vvc_pw_%6_%7)] + vpblendd m%2, m%3, 0xf0 + vpbroadcastd xm%3, [o(vvc_pw_m%7_%6)] +%else + vpbroadcastd m%3, [o(vvc_pw_m%9_%8)] + vpbroadcastd m%4, [o(vvc_pw_%8_%9)] + vpbroadcastd xm%2, [o(vvc_pw_m%7_%6)] + vpblendd m%2, m%3, 0xf0 + vpbroadcastd xm%3, 
[o(vvc_pw_%6_%7)] +%endif + vpblendd m%3, m%4, 0xf0 + ITX_MUL2X_PACK %1, %4, _, %5, %2, %3, (4|%10) +%endmacro + +; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 7 +; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 7 +; flags: 1 = coef_regs (coef[1-2] name registers instead of vvc_pw_* constants) +; dst2 (optional 9th argument): register that receives dst2; when omitted, +; dst2 overwrites src2 +%macro ITX_MULSUB_2W 8-9 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], flags, dst2 + punpckhwd m%3, m%2, m%1 + punpcklwd m%2, m%1 +%if %8 & 1 + pmaddwd m%1, m%7, m%2 + pmaddwd m%4, m%7, m%3 +%else + vpbroadcastd m%1, [o(vvc_pw_m%7_%6)] + pmaddwd m%4, m%3, m%1 + pmaddwd m%1, m%2 +%endif + paddd m%4, m%5 + paddd m%1, m%5 + psrad m%4, 7 + psrad m%1, 7 + packssdw m%1, m%4 +%if %8 & 1 + pmaddwd m%3, m%6 + pmaddwd m%2, m%6 +%else + vpbroadcastd m%4, [o(vvc_pw_%6_%7)] + pmaddwd m%3, m%4 + pmaddwd m%2, m%4 +%endif + paddd m%3, m%5 + paddd m%2, m%5 + psrad m%3, 7 + psrad m%2, 7 +%if %0 == 9 + packssdw m%9, m%2, m%3 ; %8 is the flags argument; dst2 is %9 +%else + packssdw m%2, m%3 +%endif +%endmacro + +%macro IDCT2_4_1D 7 ; src[1-4], tmp[1-2], vvc_pd_64 + ITX_MULSUB_2W %2, %4, %5, %6, %7, 36, 83, 0, %5 ; t2, t3 + ITX_MULSUB_2W %1, %3, %4, %6, %7, 64, 64, 0, %4 ; t1, t0 + psubsw m%3, m%1, m%2 + paddsw m%2, m%1 + paddsw m%1, m%4, m%5 + psubsw m%4, m%5 +%endmacro + +%macro IDCT2_8_1D 11 ; src[1-8], tmp[1-2], vvc_pd_64 + ITX_MULSUB_2W %6, %4, %9, %10, %11, 75, 50, 0 ; t5a, t6a + ITX_MULSUB_2W %2, %8, %9, %10, %11, 18, 89, 0 ; t4a, t7a + ITX_MULSUB_2W %3, %7, %9, %10, %11, 36, 83, 0 ; t2, t3 + paddsw m%9, m%2, m%6 ; t4 + psubsw m%2, m%6 ; t5a + paddsw m%10, m%8, m%4 ; t7 + psubsw m%8, m%4 ; t6a + ITX_MULSUB_2W %1, %5, %4, %6, %11, 64, 64, 0 ; t1, t0 + ITX_MULSUB_2W %8, %2, %4, %6, %11, 64, 64, 0 ; t5, t6 + psubsw m%6, m%1, m%3 ; dct4 out2 + paddsw m%3, m%1 ; dct4 out1 + paddsw m%1, m%5, m%7 ; dct4 out0 + psubsw m%5, m%7 ; dct4 out3 + psubsw m%7, m%3, m%2 ; out6 + paddsw m%2, m%3 ; out1 + paddsw m%3, m%6, m%8 ; out2 + psubsw m%6, m%8 ; out5 + psubsw m%8, m%1, m%10 ; out7 + paddsw m%1, m%10 ; out0 + paddsw m%4, m%5, m%9 ; out3 + psubsw m%5, m%9 ; out4 +%endmacro + +; in1 = %1, in3 = %2, 
in5 = %3, in7 = %4 +; in9 = %5, in11 = %6, in13 = %7, in15 = %8 +%macro IDCT2_16_1D_ODDHALF 11 ; src[1-8], tmp[1-2], vvc_pd_64 + ITX_MULSUB_2W %1, %8, %9, %10, %11, 9, 90, 0 ; t8a, t15a + ITX_MULSUB_2W %5, %4, %9, %10, %11, 70, 57, 0 ; t9a, t14a + ITX_MULSUB_2W %3, %6, %9, %10, %11, 43, 80, 0 ; t10a, t13a + ITX_MULSUB_2W %7, %2, %9, %10, %11, 87, 25, 0 ; t11a, t12a + psubsw m%9, m%2, m%6 ; t13 + paddsw m%6, m%2 ; t12 + psubsw m%2, m%8, m%4 ; t14 + paddsw m%8, m%4 ; t15 + psubsw m%4, m%7, m%3 ; t10 + paddsw m%3, m%7 ; t11 + psubsw m%7, m%1, m%5 ; t9 + paddsw m%1, m%5 ; t8 + ITX_MULSUB_2W %2, %7, %5, %10, %11, 36, 83, 0 ; t9a, t14a + ITX_MULSUB_2W %9, %4, %5, %10, %11, m83, 36, 0 ; t10a, t13a + psubsw m%5, m%1, m%3 ; t11a + paddsw m%1, m%3 ; t8a + psubsw m%3, m%7, m%4 ; t13 + paddsw m%7, m%4 ; t14 + psubsw m%4, m%8, m%6 ; t12a + paddsw m%8, m%6 ; t15a + psubsw m%6, m%2, m%9 ; t10 + paddsw m%2, m%9 ; t9 + ITX_MULSUB_2W %3, %6, %9, %10, %11, 64, 64, 0 ; t10a, t13a + ITX_MULSUB_2W %4, %5, %9, %10, %11, 64, 64, 0 ; t11, t12 +%endmacro + +%macro WRAP_XMM 1+ + INIT_XMM cpuname + %1 + INIT_YMM cpuname +%endmacro + +%macro ITX4_END 4-5 2048 ; row[1-4], rnd +%if %5 + vpbroadcastd m2, [o(vvc_pw_%5)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 +%endif + lea r2, [dstq+strideq*2] +%assign %%i 1 +%rep 4 + %if %1 & 2 + CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1) + %else + CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1) + %endif + %assign %%i %%i + 1 + %rotate 1 +%endrep + movd m2, [%%row_adr1] + pinsrd m2, [%%row_adr2], 1 + movd m3, [%%row_adr3] + pinsrd m3, [%%row_adr4], 1 + pmovzxbw m2, m2 + pmovzxbw m3, m3 + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + movd [%%row_adr1], m0 + pextrd [%%row_adr2], m0, 1 + pextrd [%%row_adr3], m0, 2 + pextrd [%%row_adr4], m0, 3 + ret +%endmacro + +%macro IWHT4_1D_PACKED 0 + punpckhqdq m3, m0, m1 ; in1 in3 + punpcklqdq m0, m1 ; in0 in2 + psubw m2, m0, m3 + paddw m0, m3 + punpckhqdq m2, m2 ; t2 t2 + punpcklqdq m0, m0 ; t0 t0 + psubw m1, m0, m2 + 
psraw m1, 1 + psubw m1, m3 ; t1 t3 + psubw m0, m1 ; ____ out0 + paddw m2, m1 ; out3 ____ +%endmacro + +INIT_XMM avx2 +cglobal vvc_inv_wht_wht_4x4_8, 3, 3, 4, dst, stride, c + mova m0, [cq+16*0] + mova m1, [cq+16*1] + pxor m2, m2 + mova [cq+16*0], m2 + mova [cq+16*1], m2 + psraw m0, 2 + psraw m1, 2 + IWHT4_1D_PACKED + punpckhwd m0, m1 + punpcklwd m3, m1, m2 + punpckhdq m1, m0, m3 + punpckldq m0, m3 + IWHT4_1D_PACKED + vpblendd m0, m2, 0x03 + ITX4_END 3, 0, 2, 1, 0 + +%macro INV_TXFM_FN 3 ; type1, type2, size +cglobal vvc_inv_%1_%2_%3_8, 4, 5, 0, dst, stride, c, eob, tx2 + %define %%p1 m(i%1_%3_internal_8) + lea r6, [o_base] + ; Jump to the 1st txfm function if we're not taking the fast path, which + ; in turn performs an indirect jump to the 2nd txfm function. + lea tx2q, [m(i%2_%3_internal_8).pass2] +%ifidn %1_%2, dct2_dct2 + test eobd, eobd + jnz %%p1 +%else + ; jump to the 1st txfm function unless it's located directly after this + times ((%%end - %%p1) >> 31) & 1 jmp %%p1 +ALIGN function_align +%%end: +%endif +%endmacro + +%macro INV_TXFM_4X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x4 +%ifidn %1_%2, dct2_dct2 + vpbroadcastw m0, [cq] + vpbroadcastd m1, [o(vvc_pw_64x8)] + pmulhrsw m0, m1 + mov [cq], eobd ; 0 + pmulhrsw m0, m1 + mova m1, m0 + jmp m(iadst_4x4_internal_8).end2 +%endif +%endmacro + +%macro IDCT2_4_1D_PACKED 0 + vpbroadcastd m4, [o(vvc_pd_64)] + punpckhwd m2, m1, m0 + punpcklwd m1, m0 + ITX_MUL2X_PACK 2, 0, 3, 4, 36, 83 + ITX_MUL2X_PACK 1, 0, 3, 4, 64, 64 + paddsw m0, m1, m2 ; out0 out1 + psubsw m1, m2 ; out3 out2 +%endmacro + +%macro IADST4_1D_PACKED 0 + punpcklwd m2, m1, m0 + punpckhwd m3, m1, m0 + vpbroadcastd m5, [o(vvc_pw_m3344_3344)] + vpbroadcastd m0, [o(vvc_pw_3803_1321)] + vpbroadcastd m4, [o(vvc_pw_m1321_2482)] + pmaddwd m1, m5, m2 ; 3344*in3 - 3344*in2 + psrld m5, 16 + pmaddwd m0, m2 + pmaddwd m2, m4 + pmaddwd m5, m3 ; 3344*in0 + paddd m1, m5 ; 3344*in0 - 3344*in2 + 3344*in3 + vpbroadcastd m4, [o(vvc_pw_2482_3344)] + vpbroadcastd m5, 
[o(vvc_pw_m3803_3344)] + pmaddwd m4, m3 + pmaddwd m5, m3 + paddd m4, m0 ; 1321*in0 + 3344*in1 + 3803*in2 + 2482*in3 + vpbroadcastd m0, [o(vvc_pw_m3803_m6688)] + pmaddwd m3, m0 + vpbroadcastd m0, [o(vvc_pd_64)] + paddd m2, m0 + paddd m1, m0 + paddd m0, m4 + paddd m5, m2 ; 2482*in0 + 3344*in1 - 1321*in2 - 3803*in3 + paddd m2, m4 + paddd m2, m3 + REPX {psrad x, 7}, m1, m2, m0, m5 + packssdw m0, m5 ; out0 out1 + packssdw m1, m2 ; out2 out3 +%endmacro + +INV_TXFM_4X4_FN dct2, dct2 +INV_TXFM_4X4_FN dct2, adst +INV_TXFM_4X4_FN dct2, flipadst +INV_TXFM_4X4_FN dct2, identity + +cglobal idct2_4x4_internal_8, 0, 5, 6, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + IDCT2_4_1D_PACKED + mova m2, [o(deint_shuf)] + shufps m3, m0, m1, q1331 + shufps m0, m1, q0220 + pshufb m0, m2 + pshufb m1, m3, m2 + jmp tx2q +.pass2: + IDCT2_4_1D_PACKED + pxor m2, m2 + mova [cq+16*0], m2 + mova [cq+16*1], m2 + ITX4_END 0, 1, 3, 2 + +INV_TXFM_4X4_FN adst, dct2 +INV_TXFM_4X4_FN adst, adst +INV_TXFM_4X4_FN adst, flipadst +INV_TXFM_4X4_FN adst, identity + +cglobal iadst_4x4_internal_8, 0, 5, 6, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + call .main + punpckhwd m3, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + jmp tx2q +.pass2: + call .main +.end: + pxor m2, m2 + mova [cq+16*0], m2 + mova [cq+16*1], m2 +.end2: + ITX4_END 0, 1, 2, 3 +ALIGN function_align +cglobal_label .main + IADST4_1D_PACKED + ret + +INV_TXFM_4X4_FN flipadst, dct2 +INV_TXFM_4X4_FN flipadst, adst +INV_TXFM_4X4_FN flipadst, flipadst +INV_TXFM_4X4_FN flipadst, identity + +cglobal iflipadst_4x4_internal_8, 0, 5, 6, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + call m(iadst_4x4_internal_8).main + punpcklwd m2, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + jmp tx2q +.pass2: + call m(iadst_4x4_internal_8).main +.end: + pxor m2, m2 + mova [cq+16*0], m2 + mova [cq+16*1], m2 +.end2: + ITX4_END 3, 2, 1, 0 + +INV_TXFM_4X4_FN 
identity, dct2 +INV_TXFM_4X4_FN identity, adst +INV_TXFM_4X4_FN identity, flipadst +INV_TXFM_4X4_FN identity, identity + +cglobal iidentity_4x4_internal_8, 0, 5, 6, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + vpbroadcastd m3, [o(vvc_pw_1697x8)] + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + paddsw m0, m2 + paddsw m1, m3 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + jmp tx2q +.pass2: + vpbroadcastd m3, [o(vvc_pw_1697x8)] + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + paddsw m0, m2 + paddsw m1, m3 + jmp m(iadst_4x4_internal_8).end + +%macro WRITE_4X8 2 ; coefs[1-2] + movd xm4, [dstq+strideq*0] + pinsrd xm4, [dstq+strideq*1], 1 + movd xm5, [dstq+strideq*2] + pinsrd xm5, [dstq+r3 ], 1 + pinsrd xm4, [r2 +strideq*0], 2 + pinsrd xm4, [r2 +strideq*1], 3 + pinsrd xm5, [r2 +strideq*2], 2 + pinsrd xm5, [r2 +r3 ], 3 + pmovzxbw m4, xm4 + pmovzxbw m5, xm5 + paddw m4, m%1 + paddw m5, m%2 + packuswb m4, m5 + vextracti128 xm5, m4, 1 + movd [dstq+strideq*0], xm4 + pextrd [dstq+strideq*1], xm4, 1 + pextrd [dstq+strideq*2], xm4, 2 + pextrd [dstq+r3 ], xm4, 3 + movd [r2 +strideq*0], xm5 + pextrd [r2 +strideq*1], xm5, 1 + pextrd [r2 +strideq*2], xm5, 2 + pextrd [r2 +r3 ], xm5, 3 +%endmacro + +%macro INV_TXFM_4X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x8 +%ifidn %1_%2, dct2_dct2 + movd xm1, [o(vvc_pw_64x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(vvc_pw_2048)] + mov [cq], eobd + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm2 + vpbroadcastw m0, xm0 + mova m1, m0 + jmp m(iadst_4x8_internal_8).end3 +%endif +%endmacro + +%macro IDCT2_8_1D_PACKED 0 + vpbroadcastd m6, [o(vvc_pd_64)] + punpckhwd m5, m3, m0 ; in7 in1 + punpckhwd m4, m1, m2 ; in3 in5 + punpcklwd m3, m1 ; in6 in2 + punpcklwd m2, m0 ; in4 in0 + ITX_MUL2X_PACK 5, 0, 1, 6, 18, 89, 3 ; t4a t7a + ITX_MUL2X_PACK 4, 0, 1, 6, 75, 50, 3 ; t5a t6a + ITX_MUL2X_PACK 3, 0, 1, 6, 36, 83 ; t3 t2 + psubsw m0, m5, m4 ; t5a t6a (interleaved) + paddsw m4, m5 ; t4 t7 (interleaved) 
+ ITX_MUL2X_PACK 2, 1, 5, 6, 64, 64 ; t0 t1 + vpbroadcastd m1, [o(vvc_pw_m64_64)] + ITX_MUL2X_PACK 0, 1, _, 6, 1, 5, 4 ; t6 t5 +%if mmsize > 16 + vbroadcasti128 m1, [o(deint_shuf)] + pshufb m4, m1 +%else + pshufb m4, [o(deint_shuf)] +%endif + psubsw m1, m2, m3 ; tmp3 tmp2 + paddsw m3, m2 ; tmp0 tmp1 + shufps m2, m4, m0, q1032 ; t7 t6 + vpblendd m4, m0, 0xcc ; t4 t5 + paddsw m0, m3, m2 ; out0 out1 + psubsw m3, m2 ; out7 out6 + psubsw m2, m1, m4 ; out4 out5 + paddsw m1, m4 ; out3 out2 +%endmacro + +%macro IADST8_1D_PACKED 1 ; pass + vpbroadcastd m6, [o(vvc_pd_64)] + punpckhwd m0, m4, m3 ; 0 7 + punpckhwd m1, m5, m2 ; 2 5 + punpcklwd m2, m5 ; 4 3 + punpcklwd m3, m4 ; 6 1 +%if %1 == 1 + ITX_MUL2X_PACK 0, 4, 5, 6, 9, 90, 3 ; t1a t0a + ITX_MUL2X_PACK 1, 4, 5, 6, 43, 80, 2 ; t2a t3a + ITX_MUL2X_PACK 2, 4, 5, 6, 70, 57, 3 ; t5a t4a + ITX_MUL2X_PACK 3, 4, 5, 6, 87, 25, 2 ; t6a t7a + psubsw m4, m0, m2 ; t5 t4 + paddsw m0, m2 ; t1 t0 + psubsw m5, m1, m3 ; t6 t7 + paddsw m1, m3 ; t2 t3 + ITX_MUL2X_PACK 4, 2, 3, 6, 36, 83, 3 ; t5a t4a + ITX_MUL2X_PACK 5, 2, 3, 6, 83, 36, 2 ; t7a t6a +%if mmsize > 16 + vbroadcasti128 m2, [o(deint_shuf)] +%else + mova m2, [o(deint_shuf)] +%endif + pshuflw m1, m1, q2301 + pshufhw m1, m1, q2301 + psubsw m3, m0, m1 ; t3 t2 + paddsw m0, m1 ; -out7 out0 + psubsw m1, m4, m5 ; t7 t6 + paddsw m4, m5 ; out6 -out1 + pshufb m0, m2 + pshufb m4, m2 + vpbroadcastd m5, [o(vvc_pw_m64_64)] + pmaddwd m2, m5, m3 + pmaddwd m5, m1 + paddd m2, m6 + paddd m5, m6 + psrad m2, 7 + psrad m5, 7 + packssdw m2, m5 ; out4 -out5 + vpbroadcastd m5, [o(vvc_pw_64_64)] + pmaddwd m3, m5 + pmaddwd m1, m5 + paddd m3, m6 + paddd m1, m6 + psrad m3, 7 + psrad m1, 7 + packssdw m1, m3 ; out2 -out3 + punpcklqdq m3, m4, m0 ; out6 -out7 + punpckhqdq m0, m4 ; out0 -out1 +%else + ITX_MUL2X_PACK 0, 4, 5, 6, 9, 90 ; t0a t1a + ITX_MUL2X_PACK 1, 4, 5, 6, 43, 80 ; t2a t3a + ITX_MUL2X_PACK 2, 4, 5, 6, 70, 57 ; t4a t5a + ITX_MUL2X_PACK 3, 4, 5, 6, 87, 25 ; t6a t7a + psubsw m4, m0, m2 ; t4 t5 + paddsw 
m0, m2 ; t0 t1 + psubsw m5, m1, m3 ; t6 t7 + paddsw m1, m3 ; t2 t3 + shufps m2, m5, m4, q1032 + punpckhwd m4, m2 + punpcklwd m5, m2 + ITX_MUL2X_PACK 4, 2, 3, 6, 36, 83, 1 ; t5a t4a + ITX_MUL2X_PACK 5, 2, 3, 6, 83, 36 ; t7a t6a + psubsw m2, m0, m1 ; t2 t3 + paddsw m0, m1 ; out0 -out7 + psubsw m1, m4, m5 ; t7 t6 + paddsw m4, m5 ; out6 -out1 + vpbroadcastd m5, [o(vvc_pw_64x8)] + vpblendd m3, m0, m4, 0x33 ; out6 -out7 + vpblendd m0, m4, 0xcc ; out0 -out1 + shufps m4, m2, m1, q1032 ; t3 t7 + vpblendd m1, m2, 0x33 ; t2 t6 + psubsw m2, m1, m4 ; t2-t3 t6-t7 + paddsw m1, m4 ; t2+t3 t6+t7 + pmulhrsw m2, m5 ; out4 -out5 + pshufd m1, m1, q1032 + pmulhrsw m1, m5 ; out2 -out3 +%endif +%endmacro + +INIT_YMM avx2 +INV_TXFM_4X8_FN dct2, dct2 +INV_TXFM_4X8_FN dct2, adst +INV_TXFM_4X8_FN dct2, flipadst +INV_TXFM_4X8_FN dct2, identity + +cglobal idct2_4x8_internal_8, 0, 5, 7, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpbroadcastd m2, [o(vvc_pw_64x8)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + IDCT2_4_1D_PACKED + vbroadcasti128 m2, [o(deint_shuf)] + shufps m3, m0, m1, q1331 + shufps m0, m1, q0220 + pshufb m0, m2 + pshufb m1, m3, m2 + jmp tx2q +.pass2: + vextracti128 xm2, m0, 1 + vextracti128 xm3, m1, 1 + call .main + vpbroadcastd m4, [o(vvc_pw_2048)] + vinserti128 m0, xm2, 1 + vinserti128 m1, xm3, 1 + pshufd m1, m1, q1032 + jmp m(iadst_4x8_internal_8).end2 +ALIGN function_align +cglobal_label .main + WRAP_XMM IDCT2_8_1D_PACKED + ret + +INV_TXFM_4X8_FN adst, dct2 +INV_TXFM_4X8_FN adst, adst +INV_TXFM_4X8_FN adst, flipadst +INV_TXFM_4X8_FN adst, identity + +cglobal iadst_4x8_internal_8, 0, 5, 7, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpbroadcastd m2, [o(vvc_pw_64x8)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + call m(iadst_8x4_internal_8).main + punpckhwd m3, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + jmp tx2q +.pass2: + vextracti128 xm2, m0, 1 + vextracti128 xm3, m1, 1 + pshufd 
xm4, xm0, q1032 + pshufd xm5, xm1, q1032 + call .main_pass2 + vpbroadcastd m4, [o(vvc_pw_2048)] + vinserti128 m0, xm2, 1 + vinserti128 m1, xm3, 1 + pxor m5, m5 + psubw m5, m4 +.end: + vpblendd m4, m5, 0xcc +.end2: + pmulhrsw m0, m4 + pmulhrsw m1, m4 + WIN64_RESTORE_XMM + pxor m2, m2 + mova [cq+32*0], m2 + mova [cq+32*1], m2 +.end3: + lea r2, [dstq+strideq*4] + lea r3, [strideq*3] + WRITE_4X8 0, 1 + RET +ALIGN function_align +.main_pass1: + WRAP_XMM IADST8_1D_PACKED 1 + ret +ALIGN function_align +cglobal_label .main_pass2 + WRAP_XMM IADST8_1D_PACKED 2 + ret + +INV_TXFM_4X8_FN flipadst, dct2 +INV_TXFM_4X8_FN flipadst, adst +INV_TXFM_4X8_FN flipadst, flipadst +INV_TXFM_4X8_FN flipadst, identity + +cglobal iflipadst_4x8_internal_8, 0, 5, 7, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpbroadcastd m2, [o(vvc_pw_64x8)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + call m(iadst_8x4_internal_8).main + punpcklwd m3, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m3 + punpckhwd m1, m3 + jmp tx2q +.pass2: + vextracti128 xm2, m0, 1 + vextracti128 xm3, m1, 1 + pshufd xm4, xm0, q1032 + pshufd xm5, xm1, q1032 + call m(iadst_4x8_internal_8).main_pass2 + vpbroadcastd m5, [o(vvc_pw_2048)] + vinserti128 m3, xm1, 1 + vinserti128 m2, xm0, 1 + pxor m4, m4 + psubw m4, m5 + pshufd m0, m3, q1032 + pshufd m1, m2, q1032 + jmp m(iadst_4x8_internal_8).end + +INV_TXFM_4X8_FN identity, dct2 +INV_TXFM_4X8_FN identity, adst +INV_TXFM_4X8_FN identity, flipadst +INV_TXFM_4X8_FN identity, identity + +cglobal iidentity_4x8_internal_8, 0, 5, 7, dst, stride, c, eob, tx2 + vpermq m2, [cq+32*0], q3120 + vpermq m0, [cq+32*1], q3120 + vpbroadcastd m3, [o(vvc_pw_64x8)] + vpbroadcastd m4, [o(vvc_pw_1697x8)] + punpcklwd m1, m2, m0 + punpckhwd m2, m0 + pmulhrsw m1, m3 + pmulhrsw m2, m3 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + pmulhrsw m2, m4, m0 + pmulhrsw m4, m1 + paddsw m0, m2 + paddsw m1, m4 + jmp tx2q +.pass2: + vpbroadcastd m4, [o(vvc_pw_64)] + jmp 
m(iadst_4x8_internal_8).end2 + +%macro INV_TXFM_4X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x16 +%ifidn %1_%2, dct2_dct2 + movd xm1, [o(vvc_pw_64x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(vvc_pw_16384)] + movd xm3, [o(vvc_pw_2048)] + mov [cq], eobd + pmulhrsw xm0, xm2 + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm3 + vpbroadcastw m0, xm0 + mova m1, m0 + mova m2, m0 + mova m3, m0 + jmp m(iadst_4x16_internal_8).end3 +%endif +%endmacro + +%macro IDCT2_16_1D_PACKED 0 + vpbroadcastd m10, [o(vvc_pd_64)] +.main2: + punpckhwd m8, m7, m0 ; dct16 in15 in1 + punpcklwd m9, m4, m0 ; dct4 in2 in0 + punpckhwd m0, m3, m4 ; dct16 in7 in9 + punpcklwd m7, m1 ; dct8 in7 in1 + punpckhwd m1, m6 ; dct16 in3 in13 + punpcklwd m3, m5 ; dct8 in3 in5 + punpckhwd m5, m2 ; dct16 in11 in5 + punpcklwd m6, m2 ; dct4 in3 in1 + ITX_MUL2X_PACK 8, 2, 4, 10, 9, 90, 3 ; t8a t15a + ITX_MUL2X_PACK 0, 2, 4, 10, 70, 57, 3 ; t9a t14a + ITX_MUL2X_PACK 1, 2, 4, 10, 87, 25, 3 ; t11a t12a + ITX_MUL2X_PACK 5, 2, 4, 10, 43, 80, 3 ; t10a t13a + ITX_MUL2X_PACK 7, 2, 4, 10, 18, 89, 3 ; t4a t7a + ITX_MUL2X_PACK 3, 2, 4, 10, 75, 50, 3 ; t5a t6a + ITX_MUL2X_PACK 6, 2, 4, 10, 36, 83 ; t3 t2 + psubsw m2, m8, m0 ; t9 t14 + paddsw m8, m0 ; t8 t15 + psubsw m0, m1, m5 ; t10 t13 + paddsw m1, m5 ; t11 t12 + vpbroadcastd m5, [o(vvc_pw_m83_36)] ; reuse vvc_pw_36_83 + ITX_MUL2X_PACK 2, 4, _, 10, 4, 5, 6 ; t9a t14a + vpbroadcastd m4, [o(vvc_pw_m36_m83)] ; reuse vvc_pw_m83_36 + ITX_MUL2X_PACK 0, 5, _, 10, 5, 4, 6 ; t10a t13a + psubsw m4, m8, m1 ; t11a t12a + paddsw m8, m1 ; t8a t15a + psubsw m1, m7, m3 ; t5a t6a + paddsw m7, m3 ; t4 t7 + paddsw m3, m2, m0 ; t9 t14 + psubsw m2, m0 ; t10 t13 +%if mmsize > 16 + vbroadcasti128 m0, [o(deint_shuf)] +%else + mova m0, [o(deint_shuf)] +%endif + pshufb m8, m0 + pshufb m7, m0 + pshufb m3, m0 + ITX_MUL2X_PACK 9, 0, 5, 10, 64, 64 ; t0 t1 + vpbroadcastd m0, [o(vvc_pw_m64_64)] + ITX_MUL2X_PACK 4, 5, _, 10, 5, 0, 4 ; t11 t12 + vpbroadcastd m5, [o(vvc_pw_64_64)] + ITX_MUL2X_PACK 1, 0, _, 10, 0, 5, 4 
; t6 t5 + vpbroadcastd m0, [o(vvc_pw_m64_64)] + ITX_MUL2X_PACK 2, 0, _, 10, 0, 5, 4 ; t13a t10a + punpckhqdq m0, m8, m3 ; t15a t14 + punpcklqdq m8, m3 ; t8a t9 + shufps m5, m4, m2, q1032 ; t12 t13a + vpblendd m4, m2, 0xcc ; t11 t10a + shufps m2, m7, m1, q1032 ; t7 t6 + vpblendd m7, m1, 0xcc ; t4 t5 + psubsw m1, m9, m6 ; dct4 out3 out2 + paddsw m9, m6 ; dct4 out0 out1 + psubsw m3, m9, m2 ; dct8 out7 out6 + paddsw m9, m2 ; dct8 out0 out1 + psubsw m2, m1, m7 ; dct8 out4 out5 + paddsw m1, m7 ; dct8 out3 out2 + psubsw m7, m9, m0 ; out15 out14 + paddsw m0, m9 ; out0 out1 + psubsw m6, m1, m5 ; out12 out13 + paddsw m1, m5 ; out3 out2 + psubsw m5, m2, m4 ; out11 out10 + paddsw m2, m4 ; out4 out5 + psubsw m4, m3, m8 ; out8 out9 + paddsw m3, m8 ; out7 out6 +%endmacro + +INV_TXFM_4X16_FN dct2, dct2 +INV_TXFM_4X16_FN dct2, adst +INV_TXFM_4X16_FN dct2, flipadst +INV_TXFM_4X16_FN dct2, identity + +cglobal idct2_4x16_internal_8, 0, 5, 11, dst, stride, c, eob, tx2 + mova m0, [cq+32*0] + mova m1, [cq+32*1] + mova m2, [cq+32*2] + mova m3, [cq+32*3] + call m(idct2_16x4_internal_8).main + vpbroadcastd m5, [o(vvc_pw_16384)] + punpckhwd m4, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m0, m1 + punpcklwd m0, m1 + REPX {pmulhrsw x, m5}, m0, m4, m2, m3 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + jmp tx2q +.pass2: + vextracti128 xm4, m0, 1 + vextracti128 xm5, m1, 1 + vextracti128 xm6, m2, 1 + vextracti128 xm7, m3, 1 + call .main + vinserti128 m0, xm4, 1 + vinserti128 m1, xm5, 1 + vpbroadcastd m5, [o(vvc_pw_2048)] + vinserti128 m2, xm6, 1 + vinserti128 m3, xm7, 1 + pshufd m1, m1, q1032 + pshufd m3, m3, q1032 + jmp m(iadst_4x16_internal_8).end2 +ALIGN function_align +cglobal_label .main + WRAP_XMM IDCT2_16_1D_PACKED + ret + +INV_TXFM_4X16_FN adst, dct2 +INV_TXFM_4X16_FN adst, adst +INV_TXFM_4X16_FN adst, flipadst +INV_TXFM_4X16_FN adst, identity + +cglobal iadst_4x16_internal_8, 0, 5, 11, dst, stride, c, eob, tx2 + mova m0, [cq+32*0] + mova m1, [cq+32*1] 
+ mova m2, [cq+32*2] + mova m3, [cq+32*3] + call m(iadst_16x4_internal_8).main + vpbroadcastd m5, [o(vvc_pw_16384)] + punpckhwd m4, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m0, m1 + punpcklwd m0, m1 + REPX {pmulhrsw x, m5}, m4, m2, m3, m0 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + jmp tx2q +.pass2: + call .main + vpbroadcastd m5, [o(vvc_pw_64x8)] + paddsw m1, m2, m4 + psubsw m2, m4 + pmulhrsw m1, m5 ; -out7 out4 out6 -out5 + pmulhrsw m2, m5 ; out8 -out11 -out9 out10 + vpbroadcastd m5, [o(vvc_pw_2048)] + pshufd m1, m1, q1032 + vpblendd m4, m1, m0, 0x33 + vpblendd m0, m2, 0x33 + vpblendd m2, m3, 0x33 + vpblendd m3, m1, 0x33 + vpermq m0, m0, q2031 + vpermq m1, m2, q1302 + vpermq m2, m3, q3120 + vpermq m3, m4, q0213 + psubw m6, m7, m5 +.end: + vpblendd m5, m6, 0xcc +.end2: + REPX {pmulhrsw x, m5}, m0, m1, m2, m3 + WIN64_RESTORE_XMM + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + mova [cq+32*2], m4 + mova [cq+32*3], m4 +.end3: + lea r2, [dstq+strideq*8] + lea r3, [strideq*3] + WRITE_4X8 0, 1 + lea dstq, [dstq+strideq*4] + lea r2, [r2 +strideq*4] + WRITE_4X8 2, 3 + RET +ALIGN function_align +.main: + vpblendd m4, m1, m0, 0xcc + vpblendd m1, m0, 0x33 + vpblendd m5, m2, m3, 0xcc + vpblendd m2, m3, 0x33 + vperm2i128 m3, m5, m2, 0x31 + vinserti128 m0, m1, xm4, 1 ; in0 in3 in2 in1 + vperm2i128 m4, m1, m4, 0x31 + vinserti128 m1, m5, xm2, 1 ; in4 in7 in6 in5 + pshufd m3, m3, q1032 ; in15 in12 in13 in14 + pshufd m2, m4, q1032 ; in11 in8 in9 in10 +cglobal_label .main2 + vpbroadcastd m8, [o(vvc_pd_64)] + pxor m7, m7 + punpckhwd m4, m3, m0 ; in12 in3 in14 in1 + punpcklwd m0, m3 ; in0 in15 in2 in13 + punpckhwd m3, m2, m1 ; in8 in7 in10 in5 + punpcklwd m1, m2 ; in4 in11 in6 in9 + ITX_MUL4X_PACK 0, 2, 5, 6, 8, 4, 90, 22, 88, 3 + ITX_MUL4X_PACK 1, 2, 5, 6, 8, 38, 82, 54, 73, 3 + ITX_MUL4X_PACK 3, 2, 5, 6, 8, 67, 61, 78, 46, 3 + ITX_MUL4X_PACK 4, 2, 5, 6, 8, 85, 31, 90, 13, 3 + psubsw m2, m0, m3 ; t9a t8a t11a t10a + paddsw m0, m3 ; 
t1a t0a t3a t2a + psubsw m3, m1, m4 ; t13a t12a t15a t14a + paddsw m1, m4 ; t5a t4a t7a t6a + ITX_MUL4X_PACK 2, 4, 5, 6, 8, 18, 89, 75, 50, 3 + psubw m6, m7, m5 + ITX_MUL2X_PACK 3, 5, _, 8, 6, 4, 6 + vpbroadcastd m6, [o(vvc_pw_m83_36)] + vpbroadcastd m5, [o(vvc_pw_36_83)] + psubsw m4, m0, m1 ; t5 t4 t7 t6 + paddsw m0, m1 ; t1 t0 t3 t2 + psubsw m1, m2, m3 ; t13a t12a t15a t14a + paddsw m2, m3 ; t9a t8a t11a t10a + psubw m3, m7, m6 ; vvc_pw_83_m36 + vpblendd m6, m3, 0xf0 + ITX_MUL2X_PACK 4, 3, _, 8, 6, 5, 4 ; t4a t5a t7a t6a + ITX_MUL2X_PACK 1, 3, _, 8, 6, 5, 4 ; t12 t13 t15 t14 + vbroadcasti128 m5, [o(deint_shuf)] + pshufb m0, m5 + pshufb m2, m5 + vperm2i128 m3, m0, m2, 0x31 ; t3 t2 t11a t10a + vinserti128 m0, xm2, 1 ; t1 t0 t9a t8a + vperm2i128 m2, m4, m1, 0x31 ; t7a t6a t15 t14 + vinserti128 m4, xm1, 1 ; t4a t5a t12 t13 + pshufd m2, m2, q1032 ; t6a t7a t14 t15 + psubsw m1, m0, m3 ; t3a t2a t11 t10 + paddsw m0, m3 ; -out15 out0 out14 -out1 + paddsw m3, m4, m2 ; -out3 out12 out2 -out13 + psubsw m4, m2 ; t6 t7 t14a t15a + shufps m2, m1, m4, q1032 ; t2a t6 t10 t14a + vpblendd m4, m1, 0x33 ; t3a t7 t11 t15a + ret +ALIGN function_align +.main_pass1_end: + vpbroadcastd m5, [o(vvc_pw_m64_64)] + vpbroadcastd m6, [o(vvc_pw_64_64)] + punpcklwd m1, m4, m2 + punpckhwd m4, m2 + pmaddwd m2, m5, m4 + pmaddwd m4, m6 + pmaddwd m5, m1 + pmaddwd m1, m6 + REPX {paddd x, m8}, m5, m1, m2, m4 + REPX {psrad x, 7}, m5, m2, m1, m4 + packssdw m2, m5 ; -out11 out8 out10 -out9 + packssdw m1, m4 ; -out7 out4 out6 -out5 + ret + +INV_TXFM_4X16_FN flipadst, dct2 +INV_TXFM_4X16_FN flipadst, adst +INV_TXFM_4X16_FN flipadst, flipadst +INV_TXFM_4X16_FN flipadst, identity + +cglobal iflipadst_4x16_internal_8, 0, 5, 11, dst, stride, c, eob, tx2 + mova m0, [cq+32*0] + mova m1, [cq+32*1] + mova m2, [cq+32*2] + mova m3, [cq+32*3] + call m(iadst_16x4_internal_8).main + vpbroadcastd m5, [o(vvc_pw_16384)] + punpcklwd m4, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m3, m2 + punpckhwd m3, m2 + REPX {pmulhrsw x, 
m5}, m4, m1, m0, m3 + punpckldq m2, m3, m1 + punpckhdq m3, m1 + punpckhdq m1, m0, m4 + punpckldq m0, m4 + jmp tx2q +.pass2: + call m(iadst_4x16_internal_8).main + vpbroadcastd m5, [o(vvc_pw_64x8)] + paddsw m1, m2, m4 + psubsw m2, m4 + pmulhrsw m1, m5 ; -out7 out4 out6 -out5 + pmulhrsw m2, m5 ; out8 -out11 -out9 out10 + vpbroadcastd m6, [o(vvc_pw_2048)] + pshufd m1, m1, q1032 + vpblendd m4, m0, m2, 0x33 + vpblendd m0, m1, 0xcc + vpblendd m1, m3, 0xcc + vpblendd m2, m3, 0x33 + vpermq m0, m0, q3120 + vpermq m1, m1, q0213 + vpermq m2, m2, q2031 + vpermq m3, m4, q1302 + psubw m5, m7, m6 + jmp m(iadst_4x16_internal_8).end + +INV_TXFM_4X16_FN identity, dct2 +INV_TXFM_4X16_FN identity, adst +INV_TXFM_4X16_FN identity, flipadst +INV_TXFM_4X16_FN identity, identity + +cglobal iidentity_4x16_internal_8, 0, 5, 11, dst, stride, c, eob, tx2 + mova m3, [cq+32*0] + mova m2, [cq+32*1] + mova m4, [cq+32*2] + mova m5, [cq+32*3] + vpbroadcastd m8, [o(vvc_pw_1697x8)] + pcmpeqw m0, m0 ; -1 + punpcklwd m1, m3, m2 + punpckhwd m3, m2 + punpcklwd m2, m4, m5 + punpckhwd m4, m5 + pmulhrsw m5, m8, m1 + pmulhrsw m6, m8, m2 + pmulhrsw m7, m8, m3 + pmulhrsw m8, m4 + pcmpeqw m9, m0, m1 ; we want to do a signed avg, but pavgw is + pxor m1, m9 ; unsigned. as long as both signs are equal + pcmpeqw m9, m0, m2 ; it still works, but if the input is -1 the + pxor m2, m9 ; pmulhrsw result will become 0 which causes + pcmpeqw m9, m0, m3 ; pavgw to output -32768 instead of 0 unless + pxor m3, m9 ; we explicitly deal with that case here. 
+ pcmpeqw m0, m4 + pxor m4, m0 + pavgw m1, m5 + pavgw m2, m6 + pavgw m3, m7 + pavgw m4, m8 + punpckldq m0, m1, m2 + punpckhdq m1, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + jmp tx2q +.pass2: + vpbroadcastd m8, [o(vvc_pw_1697x16)] + vpbroadcastd m5, [o(vvc_pw_2048)] + pmulhrsw m4, m8, m0 + pmulhrsw m6, m8, m1 + pmulhrsw m7, m8, m2 + pmulhrsw m8, m3 + REPX {paddsw x, x}, m0, m1, m2, m3 + paddsw m0, m4 + paddsw m1, m6 + paddsw m2, m7 + paddsw m3, m8 + jmp m(iadst_4x16_internal_8).end2 + +%macro WRITE_8X4 4-7 strideq*1, strideq*2, r3 ; coefs[1-2], tmp[1-2], off[1-3] + movq xm%3, [dstq ] + movhps xm%3, [dstq+%5] + movq xm%4, [dstq+%6] + movhps xm%4, [dstq+%7] + pmovzxbw m%3, xm%3 + pmovzxbw m%4, xm%4 +%ifnum %1 + paddw m%3, m%1 +%else + paddw m%3, %1 +%endif +%ifnum %2 + paddw m%4, m%2 +%else + paddw m%4, %2 +%endif + packuswb m%3, m%4 + vextracti128 xm%4, m%3, 1 + movq [dstq ], xm%3 + movhps [dstq+%6], xm%3 + movq [dstq+%5], xm%4 + movhps [dstq+%7], xm%4 +%endmacro + +%macro INV_TXFM_8X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x4 +%ifidn %1_%2, dct2_dct2 + movd xm1, [o(vvc_pw_64x8)] + pmulhrsw xm0, xm1, [cq] + mov [cq], eobd + pmulhrsw xm0, xm1 + jmp m(vvc_inv_dct2_dct2_8x8_8).dconly2 +%endif +%endmacro + +INV_TXFM_8X4_FN dct2, dct2 +INV_TXFM_8X4_FN dct2, adst +INV_TXFM_8X4_FN dct2, flipadst +INV_TXFM_8X4_FN dct2, identity + +cglobal idct2_8x4_internal_8, 0, 5, 7, dst, stride, c, eob, tx2 + vpbroadcastd xm3, [o(vvc_pw_64x8)] + pmulhrsw xm0, xm3, [cq+16*0] + pmulhrsw xm1, xm3, [cq+16*1] + pmulhrsw xm2, xm3, [cq+16*2] + pmulhrsw xm3, [cq+16*3] + call m(idct2_4x8_internal_8).main + vbroadcasti128 m4, [o(deint_shuf)] + vinserti128 m3, m1, xm3, 1 + vinserti128 m1, m0, xm2, 1 + shufps m0, m1, m3, q0220 + shufps m1, m3, q1331 + pshufb m0, m4 + pshufb m1, m4 + jmp tx2q +.pass2: + IDCT2_4_1D_PACKED + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + jmp m(iadst_8x4_internal_8).end2 + +INV_TXFM_8X4_FN adst, dct2 +INV_TXFM_8X4_FN adst, adst +INV_TXFM_8X4_FN adst, flipadst 
+INV_TXFM_8X4_FN adst, identity
+
+cglobal iadst_8x4_internal_8, 0, 5, 7, dst, stride, c, eob, tx2 ; 8x4 first pass; the second pass starts at .pass2 below
+    vpbroadcastd xm0, [o(vvc_pw_64x8)] ; multiplier applied to every loaded row through pmulhrsw
+    pshufd xm4, [cq+16*0], q1032 ; rows 0 and 1 are loaded with their 64-bit halves swapped
+    pmulhrsw xm3, xm0, [cq+16*3]
+    pshufd xm5, [cq+16*1], q1032
+    pmulhrsw xm2, xm0, [cq+16*2]
+    pmulhrsw xm4, xm0
+    pmulhrsw xm5, xm0
+    call m(iadst_4x8_internal_8).main_pass1
+    vinserti128 m0, xm2, 1
+    vinserti128 m1, xm3, 1
+    punpckhwd m2, m0, m1
+    punpcklwd m0, m1
+    pxor m3, m3
+    psubsw m3, m2 ; m3 = -m2 (saturating)
+    punpckhwd m1, m0, m3
+    punpcklwd m0, m3
+    jmp tx2q ; continue at the address held in tx2q
+.pass2: ; NOTE(review): presumably reached through tx2q; the dispatch is in INV_TXFM_FN, which is not visible here
+    call .main
+.end: ; shared tail, also entered by iidentity_8x4 .pass2
+    vpermq m0, m0, q3120
+    vpermq m1, m1, q3120
+.end2: ; entered by idct2_8x4 and iflipadst_8x4 .pass2, which do their own vpermq first
+    vpbroadcastd m2, [o(vvc_pw_2048)] ; final pmulhrsw multiplier for both result registers
+    pmulhrsw m0, m2
+    pmulhrsw m1, m2
+    WIN64_RESTORE_XMM
+.end3:
+    pxor m2, m2
+    mova [cq+32*0], m2 ; zero the 64 bytes of coefficients at cq
+    mova [cq+32*1], m2
+    lea r3, [strideq*3] ; r3 = 3*stride, the default third row offset of WRITE_8X4
+    WRITE_8X4 0, 1, 4, 5 ; add m0/m1 to four dst rows and store them; m4/m5 are scratch
+    RET
+ALIGN function_align
+cglobal_label .main ; also called by iflipadst_8x4 .pass2
+    IADST4_1D_PACKED
+    ret
+
+INV_TXFM_8X4_FN flipadst, dct2
+INV_TXFM_8X4_FN flipadst, adst
+INV_TXFM_8X4_FN flipadst, flipadst
+INV_TXFM_8X4_FN flipadst, identity
+
+cglobal iflipadst_8x4_internal_8, 0, 5, 7, dst, stride, c, eob, tx2 ; same load/scale sequence as iadst_8x4; only the post-kernel shuffles differ
+    vpbroadcastd xm0, [o(vvc_pw_64x8)]
+    pshufd xm4, [cq+16*0], q1032
+    pmulhrsw xm3, xm0, [cq+16*3]
+    pshufd xm5, [cq+16*1], q1032
+    pmulhrsw xm2, xm0, [cq+16*2]
+    pmulhrsw xm4, xm0
+    pmulhrsw xm5, xm0
+    call m(iadst_4x8_internal_8).main_pass1
+    vinserti128 m3, xm1, 1
+    vinserti128 m2, xm0, 1
+    punpckhwd m1, m3, m2
+    punpcklwd m3, m2
+    pxor m0, m0
+    psubsw m0, m1 ; m0 = -m1 (saturating)
+    punpckhwd m1, m0, m3
+    punpcklwd m0, m3
+    jmp tx2q
+.pass2:
+    call m(iadst_8x4_internal_8).main ; reuses the iadst 8x4 kernel
+    mova m2, m1 ; m0 and m1 trade places while being permuted
+    vpermq m1, m0, q2031
+    vpermq m0, m2, q2031
+    jmp m(iadst_8x4_internal_8).end2 ; shared scale, clear and store tail
+
+INV_TXFM_8X4_FN identity, dct2
+INV_TXFM_8X4_FN identity, adst
+INV_TXFM_8X4_FN identity, flipadst
+INV_TXFM_8X4_FN identity, identity
+
+cglobal iidentity_8x4_internal_8, 0, 5, 7, dst, stride, c, eob, tx2 ; pass 1: interleave, scale by vvc_pw_64x8, double
+    mova xm2, [cq+16*0]
+    mova xm0, [cq+16*1]
+    vinserti128 m2, [cq+16*2], 1
+    vinserti128 m0, [cq+16*3], 1
+    vpbroadcastd m3, [o(vvc_pw_64x8)]
+    punpcklwd m1, m2, m0
+    punpckhwd m2, m0
+    pmulhrsw m1, m3
+    pmulhrsw m2, m3
+    punpcklwd m0, m1, m2
+    punpckhwd m1, m2
+    paddsw m0, m0 ; saturating doubling
+    paddsw m1, m1
+    jmp tx2q
+.pass2:
+    vpbroadcastd m3, [o(vvc_pw_1697x8)]
+    pmulhrsw m2, m3, m0
+    pmulhrsw m3, m1
+    paddsw m0, m2 ; x += pmulhrsw(x, vvc_pw_1697x8), saturating
+    paddsw m1, m3
+    jmp m(iadst_8x4_internal_8).end
+
+%macro INV_TXFM_8X8_FN 2 ; type1, type2
+    INV_TXFM_FN %1, %2, 8x8
+%ifidn %1_%2, dct2_dct2 ; the code below is emitted for the dct2/dct2 pair only
+    movd xm1, [o(vvc_pw_64x8)]
+    pmulhrsw xm0, xm1, [cq]
+    movd xm2, [o(vvc_pw_16384)]
+    mov [cq], eobd ; NOTE(review): presumably eobd is zero on this path, making this a clear of the first coefficient; confirm in INV_TXFM_FN
+    or r3d, 8 ; row count for .dconly_loop; NOTE(review): assumes r3d is zero beforehand, confirm
+.dconly: ; also entered from INV_TXFM_8X16_FN with r3d = 16
+    pmulhrsw xm0, xm2
+.dconly2: ; also entered from INV_TXFM_8X4_FN
+    movd xm2, [vvc_pw_2048] ; no o() wrapper here; the 16x4 dconly code marks the same load as intentionally rip-relative
+    pmulhrsw xm0, xm1
+    lea r2, [strideq*3]
+    pmulhrsw xm0, xm2
+    vpbroadcastw m0, xm0 ; replicate the single result word across m0
+.dconly_loop:
+    WRITE_8X4 0, 0, 1, 2, strideq*1, strideq*2, r2 ; add m0 to four dst rows; m1/m2 are scratch
+    lea dstq, [dstq+strideq*4]
+    sub r3d, 4 ; four rows handled per iteration
+    jg .dconly_loop
+    RET
+%endif
+%endmacro
+
+INV_TXFM_8X8_FN dct2, dct2
+INV_TXFM_8X8_FN dct2, adst
+INV_TXFM_8X8_FN dct2, flipadst
+INV_TXFM_8X8_FN dct2, identity
+
+cglobal idct2_8x8_internal_8, 0, 5, 7, dst, stride, c, eob, tx2 ; both passes run the same .main kernel
+    vpermq m0, [cq+32*0], q3120 ; 0 1
+    vpermq m3, [cq+32*3], q3120 ; 6 7
+    vpermq m2, [cq+32*2], q3120 ; 4 5
+    vpermq m1, [cq+32*1], q3120 ; 2 3
+    call .main
+    shufps m4, m0, m1, q0220
+    shufps m5, m0, m1, q1331
+    shufps m1, m2, m3, q0220
+    shufps m3, m2, m3, q1331
+    vbroadcasti128 m0, [o(deint_shuf)]
+    vpbroadcastd m2, [o(vvc_pw_16384)] ; pass-1 pmulhrsw multiplier
+    REPX {pshufb x, m0}, m4, m5, m1, m3
+    REPX {pmulhrsw x, m2}, m4, m5, m1, m3
+    vinserti128 m0, m4, xm1, 1
+    vperm2i128 m2, m4, m1, 0x31
+    vinserti128 m1, m5, xm3, 1
+    vperm2i128 m3, m5, m3, 0x31
+    jmp tx2q
+.pass2:
+    call .main
+    vpbroadcastd m4, [o(vvc_pw_2048)] ; m4 is the multiplier consumed by the shared tail at iadst_8x8 .end2
+    vpermq m0, m0, q3120
+    vpermq m1, m1, q2031
+    vpermq m2, m2, q3120
+    vpermq m3, m3, q2031
+    jmp m(iadst_8x8_internal_8).end2
+ALIGN function_align
+cglobal_label .main
+    IDCT2_8_1D_PACKED
+    ret
+
+INV_TXFM_8X8_FN adst, dct2
+INV_TXFM_8X8_FN adst, adst
+INV_TXFM_8X8_FN adst, flipadst
+INV_TXFM_8X8_FN adst, identity
+
+cglobal iadst_8x8_internal_8, 0, 5, 7, dst, stride, c, eob, tx2 ; also hosts the scale/clear/store tails shared by the other 8x8 transforms
+    vpermq m4, [cq+32*0], q1302 ; 1 0
+    vpermq m3, [cq+32*3], q3120 ; 6 7
+    vpermq m5, [cq+32*1], q1302 ; 3 2
+    vpermq m2, [cq+32*2], q3120 ; 4 5
+    call .main_pass1
+    vpbroadcastd m5, [o(vvc_pw_16384)]
+    punpcklwd m4, m0, m1
+    punpckhwd m0, m1
+    punpcklwd m1, m2, m3
+    punpckhwd m2, m3
+    pxor m3, m3
+    psubw m3, m5 ; negate odd elements during rounding
+    pmulhrsw m4, m5
+    pmulhrsw m0, m3
+    pmulhrsw m1, m5
+    pmulhrsw m2, m3
+    punpcklwd m3, m4, m0
+    punpckhwd m4, m0
+    punpcklwd m0, m1, m2
+    punpckhwd m1, m2
+    vperm2i128 m2, m3, m0, 0x31
+    vinserti128 m0, m3, xm0, 1
+    vperm2i128 m3, m4, m1, 0x31
+    vinserti128 m1, m4, xm1, 1
+    jmp tx2q
+.pass2:
+    pshufd m4, m0, q1032
+    pshufd m5, m1, q1032
+    call .main_pass2
+    vpbroadcastd m5, [o(vvc_pw_2048)]
+    vpbroadcastd xm4, [o(vvc_pw_64)] ; xmm destination: only the low 128 bits of m4 receive the constant, the high 128 bits are zero
+    psubw m4, m5 ; lower half = 2048, upper half = -2048 -- NOTE(review): the lower half is (constant - 2048), so this holds only if the words loaded from vvc_pw_64 are 4096; confirm the constant
+.end: ; entered by iidentity_8x8 .pass2 with m4 preloaded
+    REPX {vpermq x, x, q3120}, m0, m1, m2, m3
+.end2: ; entered by idct2_8x8 .pass2 with m4 preloaded
+    pmulhrsw m0, m4
+    pmulhrsw m1, m4
+.end3: ; entered by iflipadst_8x8 .pass2, which has already scaled m0 and m1
+    pmulhrsw m2, m4
+    pmulhrsw m3, m4
+    WIN64_RESTORE_XMM
+.end4:
+    pxor m4, m4
+    mova [cq+32*0], m4 ; zero the 128 bytes of coefficients at cq
+    mova [cq+32*1], m4
+    mova [cq+32*2], m4
+    mova [cq+32*3], m4
+    lea r3, [strideq*3]
+    WRITE_8X4 0, 1, 4, 5 ; first four dst rows; m4/m5 are scratch
+    lea dstq, [dstq+strideq*4]
+    WRITE_8X4 2, 3, 4, 5 ; next four dst rows
+    RET
+ALIGN function_align
+.main_pass1: ; also called by iflipadst_8x8 pass 1
+    IADST8_1D_PACKED 1
+    ret
+ALIGN function_align
+cglobal_label .main_pass2 ; also called by iflipadst_8x8 .pass2
+    IADST8_1D_PACKED 2
+    ret
+
+INV_TXFM_8X8_FN flipadst, dct2
+INV_TXFM_8X8_FN flipadst, adst
+INV_TXFM_8X8_FN flipadst, flipadst
+INV_TXFM_8X8_FN flipadst, identity
+
+cglobal iflipadst_8x8_internal_8, 0, 5, 7, dst, stride, c, eob, tx2 ; same loads and kernels as iadst_8x8; the shuffles and sign pattern after the kernel differ
+    vpermq m4, [cq+32*0], q1302 ; 1 0
+    vpermq m3, [cq+32*3], q3120 ; 6 7
+    vpermq m5, [cq+32*1], q1302 ; 3 2
+    vpermq m2, [cq+32*2], q3120 ; 4 5
+    call m(iadst_8x8_internal_8).main_pass1
+    vpbroadcastd m5, [o(vvc_pw_16384)]
+    punpckhwd m4, m3, m2
+    punpcklwd m3, m2
+    punpckhwd m2, m1, m0
+    punpcklwd m1, m0
+    pxor m0, m0
+    psubw m0, m5 ; m0 = -m5, the negated pmulhrsw multiplier
+    pmulhrsw m4, m0
+    pmulhrsw m3, m5
+    pmulhrsw m2, m0
+    pmulhrsw m1, m5
+    punpckhwd m0, m4, m3
+    punpcklwd m4, m3
+    punpckhwd m3, m2, m1
+    punpcklwd m2, m1
+    vinserti128 m1, m0, xm3, 1
+    vperm2i128 m3, m0, m3, 0x31
+    vinserti128 m0, m4, xm2, 1
+    vperm2i128 m2, m4, m2, 0x31
+    jmp tx2q
+.pass2:
+    pshufd m4, m0, q1032
+    pshufd m5, m1, q1032
+    call m(iadst_8x8_internal_8).main_pass2
+    vpbroadcastd m4, [o(vvc_pw_2048)]
+    vpbroadcastd xm5, [o(vvc_pw_64)] ; xmm destination: the high 128 bits of m5 are zero
+    psubw m4, m5 ; lower half = -2048, upper half = 2048 -- NOTE(review): the lower half is (2048 - constant), so this holds only if the words loaded from vvc_pw_64 are 4096; confirm the constant
+    vpermq m5, m3, q2031
+    vpermq m3, m0, q2031
+    vpermq m0, m2, q2031
+    vpermq m2, m1, q2031
+    pmulhrsw m1, m0, m4 ; m0/m1 are scaled here; the tail at .end3 scales m2/m3
+    pmulhrsw m0, m5, m4
+    jmp m(iadst_8x8_internal_8).end3
+
+INV_TXFM_8X8_FN identity, dct2
+INV_TXFM_8X8_FN identity, adst
+INV_TXFM_8X8_FN identity, flipadst
+INV_TXFM_8X8_FN identity, identity
+
+cglobal iidentity_8x8_internal_8, 0, 5, 7, dst, stride, c, eob, tx2 ; pass 1 consists only of loads and word/dword interleaves, no arithmetic
+    mova xm3, [cq+16*0]
+    mova xm2, [cq+16*1]
+    vinserti128 m3, [cq+16*4], 1
+    vinserti128 m2, [cq+16*5], 1
+    mova xm4, [cq+16*2]
+    mova xm0, [cq+16*3]
+    vinserti128 m4, [cq+16*6], 1
+    vinserti128 m0, [cq+16*7], 1
+    punpcklwd m1, m3, m2
+    punpckhwd m3, m2
+    punpcklwd m2, m4, m0
+    punpckhwd m4, m0
+    punpckldq m0, m1, m2
+    punpckhdq m1, m2
+    punpckldq m2, m3, m4
+    punpckhdq m3, m4
+    jmp tx2q
+.pass2:
+    vpbroadcastd m4, [o(vvc_pw_64)] ; m4 is the pmulhrsw multiplier applied by the shared tail at iadst_8x8 .end
+    jmp m(iadst_8x8_internal_8).end
+
+%macro INV_TXFM_8X16_FN 2 ; type1, type2
+    INV_TXFM_FN %1, %2, 8x16
+%ifidn %1_%2, dct2_dct2 ; the code below is emitted for the dct2/dct2 pair only
+    movd xm1, [o(vvc_pw_64x8)]
+    pmulhrsw xm0, xm1, [cq]
+    movd xm2, [o(vvc_pw_16384)]
+    mov [cq], eobd ; NOTE(review): presumably eobd is zero on this path; confirm in INV_TXFM_FN
+    pmulhrsw xm0, xm1
+    or r3d, 16 ; row count for the shared 8x8 .dconly loop
+    jmp m(vvc_inv_dct2_dct2_8x8_8).dconly
+%endif
+%endmacro
+
+%macro ITX_8X16_LOAD_COEFS 0 ; loads 32-byte chunks 0..7 of the coefficient buffer into m0..m7, each multiplied by vvc_pw_64x8 through pmulhrsw
+    vpbroadcastd m4, [o(vvc_pw_64x8)]
+    pmulhrsw m0, m4, [cq+32*0]
+    add cq, 32*4 ; cq now points at chunk 4 and stays there; later offsets are relative to it
+    pmulhrsw m7, m4, [cq+32*3]
+    pmulhrsw m1, m4, [cq-32*3]
+    pmulhrsw m6, m4, [cq+32*2]
+    pmulhrsw m2, m4, [cq-32*2]
+    pmulhrsw m5, m4, [cq+32*1]
+    pmulhrsw m3, m4, [cq-32*1]
+    pmulhrsw m4, [cq+32*0] ; m4 is overwritten last because it held the multiplier
+%endmacro
+
+INV_TXFM_8X16_FN dct2, dct2
+INV_TXFM_8X16_FN dct2, adst
+INV_TXFM_8X16_FN dct2, flipadst
+INV_TXFM_8X16_FN dct2, identity
+
+cglobal idct2_8x16_internal_8, 0, 5, 13, dst, stride, c, eob, tx2 ; also hosts the pass-1 and store tails shared by the other 8x16 transforms
+    ITX_8X16_LOAD_COEFS
+    call m(idct2_16x8_internal_8).main
+    vpbroadcastd m10, [o(vvc_pw_16384)] ; multiplier for the REPX pmulhrsw in .pass1_end2
+.pass1_end: ; also entered from iadst_8x16 pass 1 with its own m10
+    vperm2i128 m9, m3, m7, 0x31
+    vinserti128 m3, xm7, 1
+    vperm2i128 m8, m2, m6, 0x31
+    vinserti128 m2, xm6, 1
+    vperm2i128 m6, m1, m5, 0x31
+    vinserti128 m1, xm5, 1
+    vperm2i128 m5, m0, m4, 0x31
+    vinserti128 m0, xm4, 1
+    punpckhwd m4, m2, m3
+    punpcklwd m2, m3
+    punpckhwd m3, m0, m1
+    punpcklwd m0, m1
+.pass1_end2: ; also entered from iflipadst_8x16 pass 1
+    punpckhwd m7, m5, m6
+    punpcklwd m5, m6
+    punpcklwd m6, m8, m9
+    punpckhwd m8, m9
+    REPX {pmulhrsw x, m10}, m2, m0, m4, m3, m5, m6, m7, m8
+    punpckhdq m1, m0, m2
+    punpckldq m0, m2
+    punpckldq m2, m3, m4
+    punpckhdq m3, m4
+    punpckldq m4, m5, m6
+    punpckhdq m5, m6
+    punpckldq m6, m7, m8
+    punpckhdq m7, m8
+    jmp tx2q
+.pass2:
+    call .main
+    REPX {vpermq x, x, q3120}, m0, m2, m4, m6
+    REPX {vpermq x, x, q2031}, m1, m3, m5, m7
+.end: ; also entered by iidentity_8x16 .pass2
+    vpbroadcastd m8, [o(vvc_pw_2048)]
+.end2: ; also entered by iadst_8x16 .pass2 with m8 preloaded
+    REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+.end3: ; also entered by iflipadst_8x16 .pass2 after it has done its own scaling
+    pxor m8, m8
+    REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3 ; zero all eight chunks; negative offsets because ITX_8X16_LOAD_COEFS advanced cq by 32*4
+    lea r3, [strideq*3]
+    WRITE_8X4 0, 1, 8, 9 ; dst rows 0-3; m8/m9 are scratch
+    lea dstq, [dstq+strideq*4]
+    WRITE_8X4 2, 3, 0, 1 ; dst rows 4-7; m0/m1 are already stored, so they serve as scratch from here on
+    lea dstq, [dstq+strideq*4]
+    WRITE_8X4 4, 5, 0, 1 ; dst rows 8-11
+    lea dstq, [dstq+strideq*4]
+    WRITE_8X4 6, 7, 0, 1 ; dst rows 12-15
+    RET
+ALIGN function_align
+cglobal_label .main
+    IDCT2_16_1D_PACKED
+    ret
+
+INV_TXFM_8X16_FN adst, dct2
+INV_TXFM_8X16_FN adst, adst
+INV_TXFM_8X16_FN adst, flipadst
+INV_TXFM_8X16_FN adst, identity
+
+cglobal iadst_8x16_internal_8, 0, 5, 13, dst, stride, c, eob, tx2 ; pass 1 uses the iadst_16x8 kernel, then the shared idct2_8x16 tail
+    ITX_8X16_LOAD_COEFS
+    call m(iadst_16x8_internal_8).main
+    call m(iadst_16x8_internal_8).main_pass1_end
+    vpbroadcastd m10, [o(vvc_pw_16384)]
+    pslld m9, m10, 17
+    psubw m10, m9 ; 16384, -16384
+    jmp m(idct2_8x16_internal_8).pass1_end
+ALIGN function_align
+.pass2:
+    call .main
+    call .main_pass2_end
+    vpbroadcastd m9, [o(vvc_pw_2048)]
+    vpbroadcastd xm8, [o(vvc_pw_64)] ; xmm destination: the high 128 bits of m8 are zero
+    psubw m8, m9 ; low 128 bits = constant - 2048, high 128 bits = -2048 -- NOTE(review): the 8x8 variants annotate this pattern as +2048/-2048, which needs a 4096 constant; confirm vvc_pw_64
+    REPX {vpermq x, x, q2031}, m0, m1, m2, m3
+    REPX {vpermq x, x, q3120}, m4, m5, m6, m7
+    jmp m(idct2_8x16_internal_8).end2 ; m8 is the multiplier consumed there
+ALIGN function_align
+cglobal_label .main ; also called by iflipadst_8x16 .pass2
+    REPX {pshufd x, x, q1032}, m7, m1, m5, m3
+.main2:
+    vpbroadcastd m10, [o(vvc_pd_64)] ; m10 is passed as the fourth argument of every ITX_MUL2X_PACK below
+    punpckhwd m8, m7, m0 ; in14 in1
+    punpcklwd m0, m7 ; in0 in15
+    punpcklwd m7, m6, m1 ; in12 in3
+    punpckhwd m1, m6 ; in2 in13
+    punpckhwd m6, m5, m2 ; in10 in5
+    punpcklwd m2, m5 ; in4 in11
+    punpcklwd m5, m4, m3 ; in8 in7
+    punpckhwd m3, m4 ; in6 in9
+    ITX_MUL2X_PACK 0, 4, 9, 10, 4, 90, 3 ; t0 t1
+    ITX_MUL2X_PACK 1, 4, 9, 10, 22, 88, 3 ; t2 t3
+    ITX_MUL2X_PACK 2, 4, 9, 10, 38, 82, 3 ; t4 t5
+    ITX_MUL2X_PACK 3, 4, 9, 10, 54, 73, 3 ; t6 t7
+    ITX_MUL2X_PACK 5, 4, 9, 10, 67, 61, 3 ; t8 t9
+    ITX_MUL2X_PACK 6, 4, 9, 10, 78, 46, 3 ; t10 t11
+    ITX_MUL2X_PACK 7, 4, 9, 10, 85, 31, 3 ; t12 t13
+    ITX_MUL2X_PACK 8, 4, 9, 10, 90, 13, 3 ; t14 t15
+    psubsw m4, m0, m5 ; t9a t8a
+    paddsw m0, m5 ; t1a t0a
+    psubsw m5, m1, m6 ; t11a t10a
+    paddsw m1, m6 ; t3a t2a
+    psubsw m6, m2, m7 ; t13a t12a
+    paddsw m2, m7 ; t5a t4a
+    psubsw m7, m3, m8 ; t15a t14a
+    paddsw m3, m8 ; t7a t6a
+    vpbroadcastd m11, [o(vvc_pw_m89_18)]
+    vpbroadcastd m12, [o(vvc_pw_18_89)]
+    pxor m9, m9 ; m9 = 0, used below to negate coefficient vectors with psubw
+    ITX_MUL2X_PACK 4, 8, _, 10, 11, 12, 6 ; t8 t9
+    psubw m8, m9, m11 ; vvc_pw_89_m18
+    ITX_MUL2X_PACK 6, 12, _, 10, 12, 8, 6 ; t12 t13
+    vpbroadcastd m11, [o(vvc_pw_m50_75)]
+    vpbroadcastd m12, [o(vvc_pw_75_50)]
+    ITX_MUL2X_PACK 5, 8, _, 10, 11, 12, 6 ; t10 t11
+    psubw m8, m9, m11 ; vvc_pw_50_m75
+    ITX_MUL2X_PACK 7, 12, _, 10, 12, 8, 6 ; t14 t15
+    psubsw m8, m1, m3 ; t7 t6
+    paddsw m1, m3 ; t3 t2
+    psubsw m3, m0, m2 ; t5 t4
+    paddsw m0, m2 ; t1 t0
+    psubsw m2, m5, m7 ; t14a t15a
+    paddsw m7, m5 ; t10a t11a
+    psubsw m5, m4, m6 ; t12a t13a
+    paddsw m4, m6 ; t8a t9a
+    vpbroadcastd m11, [o(vvc_pw_m83_36)]
+    vpbroadcastd m12, [o(vvc_pw_36_83)]
+    ITX_MUL2X_PACK 3, 6, _, 10, 12, 11, 6 ; t5a t4a
+    psubw m6, m9, m11 ; vvc_pw_83_m36
+    ITX_MUL2X_PACK 8, 6, _, 10, 6, 12, 6 ; t7a t6a
+    vpbroadcastd m11, [o(vvc_pw_m36_83)]
+    vpbroadcastd m12, [o(vvc_pw_83_36)]
+    ITX_MUL2X_PACK 2, 6, _, 10, 11, 12, 6 ; t15 t14
+    psubw m6, m9, m11 ; vvc_pw_36_m83
+    ITX_MUL2X_PACK 5, 12, _, 10, 12, 6, 6 ; t13 t12
+    vbroadcasti128 m12, [o(deint_shuf)] ; m12 keeps the shuffle mask; .main_pass2_end reads it again
+    paddsw m6, m4, m7 ; -out1 out14
+    psubsw m4, m7 ; t10 t11
+    psubsw m11, m3, m8 ; t7 t6
+    paddsw m8, m3 ; out12 -out3
+    psubsw m3, m0, m1 ; t3a t2a
+    paddsw m0, m1 ; -out15 out0
+    paddsw m1, m2, m5 ; -out13 out2
+    psubsw m5, m2 ; t15a t14a
+    pshufb m0, m12
+    pshufb m6, m12
+    pshufb m8, m12
+    pshufb m1, m12
+    shufps m7, m6, m0, q1032 ; out14 -out15
+    vpblendd m0, m6, 0x33 ; -out1 out0
+    punpcklqdq m6, m8, m1 ; out12 -out13
+    punpckhqdq m1, m8, m1 ; -out3 out2
+    ret
+ALIGN function_align
+.main_pass1_end: ; reads m3, m4, m5, m11 and m10; NOTE(review): presumably m10 still holds the vvc_pd_64 value loaded at .main2, confirm at the call sites
+    vpbroadcastd m8, [o(vvc_pw_m64_64)]
+    vpbroadcastd m12, [o(vvc_pw_64_64)]
+    pmaddwd m9, m8, m11 ; -out11
+    pmaddwd m2, m12, m5 ; -out5
+    pmaddwd m5, m8 ; out10
+    pmaddwd m11, m12 ; out4
+    REPX {paddd x, m10}, m9, m5, m2, m11
+    REPX {psrad x, 7 }, m9, m5, m2, m11
+    packssdw m5, m9 ; out10 -out11
+    packssdw m2, m11 ; -out5 out4
+    pmaddwd m11, m8, m3 ; out8
+    vpbroadcastd m8, [o(vvc_pw_64_m64)]
+    pmaddwd m3, m12 ; -out7
+    pmaddwd m8, m4 ; -out9
+    pmaddwd m4, m12 ; out6
+    REPX {paddd x, m10}, m11, m3, m8, m4
+    REPX {psrad x, 7 }, m11, m3, m8, m4
+    packssdw m3, m4 ; -out7 out6
+    packssdw m4, m11, m8 ; out8 -out9
+    vpbroadcastd m10, [o(vvc_pw_16384)] ; on return m10 = vvc_pw_16384 and m9 = 0
+    pxor m9, m9
+    ret
+ALIGN function_align
+cglobal_label .main_pass2_end ; also called by iflipadst_8x16 .pass2; reads the shuffle mask left in m12 by .main
+    vpbroadcastd m8, [o(vvc_pw_64x8)]
+    pshufb m2, m11, m12
+    pshufb m5, m12
+    pshufb m3, m12
+    pshufb m4, m12
+    punpcklqdq m11, m5, m2 ; t15a t7
+    punpckhqdq m5, m2 ; t14a t6
+    shufps m2, m3, m4, q1032 ; t2a t10
+    vpblendd m3, m4, 0xcc ; t3a t11
+    psubsw m4, m2, m3 ; out8 -out9
+    paddsw m3, m2 ; -out7 out6
+    paddsw m2, m5, m11 ; -out5 out4
+    psubsw m5, m11 ; out10 -out11
+    REPX {pmulhrsw x, m8}, m2, m3, m4, m5
+    ret
+
+INV_TXFM_8X16_FN flipadst, dct2
+INV_TXFM_8X16_FN flipadst, adst
+INV_TXFM_8X16_FN flipadst, flipadst
+INV_TXFM_8X16_FN flipadst, identity + +cglobal iflipadst_8x16_internal_8, 0, 5, 13, dst, stride, c, eob, tx2 + ITX_8X16_LOAD_COEFS + call m(iadst_16x8_internal_8).main + call m(iadst_16x8_internal_8).main_pass1_end + vpbroadcastd m9, [o(vvc_pw_16384)] + pslld m10, m9, 17 + psubw m10, m9 ; -16384, 16384 + vperm2i128 m9, m4, m0, 0x31 + vinserti128 m0, m4, xm0, 1 + vperm2i128 m8, m5, m1, 0x31 + vinserti128 m4, m5, xm1, 1 + vperm2i128 m5, m7, m3, 0x31 + vinserti128 m3, m7, xm3, 1 + vinserti128 m1, m6, xm2, 1 + vperm2i128 m6, m6, m2, 0x31 + punpcklwd m2, m4, m0 + punpckhwd m4, m0 + punpcklwd m0, m3, m1 + punpckhwd m3, m1 + jmp m(idct2_8x16_internal_8).pass1_end2 +.pass2: + call m(iadst_8x16_internal_8).main + call m(iadst_8x16_internal_8).main_pass2_end + vpbroadcastd m8, [o(vvc_pw_2048)] + vpbroadcastd xm9, [o(vvc_pw_64)] + psubw m8, m9 + vpermq m9, m0, q3120 + vpermq m0, m7, q2031 + vpermq m7, m1, q3120 + vpermq m1, m6, q2031 + vpermq m6, m2, q3120 + vpermq m2, m5, q2031 + vpermq m5, m3, q3120 + vpermq m3, m4, q2031 + pmulhrsw m0, m8 + pmulhrsw m1, m8 + pmulhrsw m2, m8 + pmulhrsw m3, m8 + pmulhrsw m4, m5, m8 + pmulhrsw m5, m6, m8 + pmulhrsw m6, m7, m8 + pmulhrsw m7, m9, m8 + jmp m(idct2_8x16_internal_8).end3 + +INV_TXFM_8X16_FN identity, dct2 +INV_TXFM_8X16_FN identity, adst +INV_TXFM_8X16_FN identity, flipadst +INV_TXFM_8X16_FN identity, identity + +%macro IDTX16 3-4 ; src/dst, tmp, vvc_pw_1697x16, [vvc_pw_16394] + pmulhrsw m%2, m%3, m%1 +%if %0 == 4 ; if downshifting by 1 + pmulhrsw m%2, m%4 +%else + paddsw m%1, m%1 +%endif + paddsw m%1, m%2 +%endmacro + +cglobal iidentity_8x16_internal_8, 0, 5, 13, dst, stride, c, eob, tx2 + mova xm3, [cq+16*0] + mova xm2, [cq+16*2] + add cq, 16*8 + vinserti128 m3, [cq+16*0], 1 + vinserti128 m2, [cq+16*2], 1 + vpbroadcastd m9, [o(vvc_pw_64x8)] + mova xm4, [cq-16*4] + mova xm5, [cq-16*2] + vinserti128 m4, [cq+16*4], 1 + vinserti128 m5, [cq+16*6], 1 + mova xm7, [cq-16*7] + mova xm6, [cq-16*5] + vinserti128 m7, [cq+16*1], 1 + 
vinserti128 m6, [cq+16*3], 1 + mova xm8, [cq-16*3] + mova xm0, [cq-16*1] + vinserti128 m8, [cq+16*5], 1 + vinserti128 m0, [cq+16*7], 1 + punpcklwd m1, m3, m2 + punpckhwd m3, m2 + punpcklwd m2, m4, m5 + punpckhwd m4, m5 + punpcklwd m5, m7, m6 + punpckhwd m7, m6 + punpcklwd m6, m8, m0 + punpckhwd m8, m0 + REPX {pmulhrsw x, m9}, m1, m2, m3, m4, m5, m6, m7, m8 + punpckldq m0, m1, m2 + punpckhdq m1, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + punpckldq m4, m5, m6 + punpckhdq m5, m6 + punpckldq m6, m7, m8 + punpckhdq m7, m8 + jmp tx2q +.pass2: + vpbroadcastd m8, [o(vvc_pw_1697x16)] + REPX {vpermq x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 7 + jmp m(idct2_8x16_internal_8).end + +%macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2] + pmovzxbw m%3, [dstq+%5] +%ifnum %1 + paddw m%3, m%1 +%else + paddw m%3, %1 +%endif + pmovzxbw m%4, [dstq+%6] +%ifnum %2 + paddw m%4, m%2 +%else + paddw m%4, %2 +%endif + packuswb m%3, m%4 + vpermq m%3, m%3, q3120 + mova [dstq+%5], xm%3 + vextracti128 [dstq+%6], m%3, 1 +%endmacro + +%macro INV_TXFM_16X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x4 +%ifidn %1_%2, dct2_dct2 + movd xm1, [o(vvc_pw_64x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(vvc_pw_16384)] + mov [cq], eobd + or r3d, 4 +.dconly: + pmulhrsw xm0, xm2 + movd xm2, [vvc_pw_2048] ; intentionally rip-relative + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm2 + vpbroadcastw m0, xm0 + pxor m3, m3 +.dconly_loop: + mova xm1, [dstq+strideq*0] + vinserti128 m1, [dstq+strideq*1], 1 + punpckhbw m2, m1, m3 + punpcklbw m1, m3 + paddw m2, m0 + paddw m1, m0 + packuswb m1, m2 + mova [dstq+strideq*0], xm1 + vextracti128 [dstq+strideq*1], m1, 1 + lea dstq, [dstq+strideq*2] + sub r3d, 2 + jg .dconly_loop + RET +%endif +%endmacro + +INV_TXFM_16X4_FN dct2, dct2 +INV_TXFM_16X4_FN dct2, adst +INV_TXFM_16X4_FN dct2, flipadst +INV_TXFM_16X4_FN dct2, identity + +cglobal idct2_16x4_internal_8, 0, 5, 11, dst, stride, c, eob, tx2 + mova xm0, [cq+16*0] + mova xm1, 
[cq+16*1] + mova xm2, [cq+16*2] + mova xm3, [cq+16*3] + mova xm4, [cq+16*4] + mova xm5, [cq+16*5] + mova xm6, [cq+16*6] + mova xm7, [cq+16*7] + call m(idct2_4x16_internal_8).main + vinserti128 m6, m2, xm6, 1 + vinserti128 m2, m0, xm4, 1 + vinserti128 m0, m1, xm5, 1 + vinserti128 m1, m3, xm7, 1 + punpcklwd m3, m2, m6 + punpckhwd m2, m6 + vpbroadcastd m6, [o(vvc_pw_16384)] + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + mova m1, m6 + jmp m(iadst_16x4_internal_8).pass1_end +.pass2: + call .main + jmp m(iadst_16x4_internal_8).end +ALIGN function_align +cglobal_label .main + vpbroadcastd m6, [o(vvc_pd_64)] + IDCT2_4_1D 0, 1, 2, 3, 4, 5, 6 + ret + +INV_TXFM_16X4_FN adst, dct2 +INV_TXFM_16X4_FN adst, adst +INV_TXFM_16X4_FN adst, flipadst +INV_TXFM_16X4_FN adst, identity + +cglobal iadst_16x4_internal_8, 0, 5, 11, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q1230 + vpermq m3, [cq+32*3], q2103 + vpermq m1, [cq+32*1], q1230 + vpermq m2, [cq+32*2], q2103 + call m(iadst_4x16_internal_8).main2 + call m(iadst_4x16_internal_8).main_pass1_end + punpcklwd m4, m3, m1 + punpcklwd m5, m2, m0 + punpckhwd m0, m1 + punpckhwd m2, m3 + vpbroadcastd m1, [o(vvc_pw_16384)] + vinserti128 m3, m0, xm2, 1 + vperm2i128 m2, m0, m2, 0x31 + vinserti128 m0, m4, xm5, 1 + vperm2i128 m4, m4, m5, 0x31 + psubw m6, m7, m1 +.pass1_end: + pmulhrsw m3, m1 + pmulhrsw m2, m6 + pmulhrsw m4, m1 + pmulhrsw m0, m6 + punpcklwd m1, m3, m2 + punpckhwd m3, m2 + punpcklwd m2, m4, m0 + punpckhwd m4, m0 + punpckldq m0, m1, m2 + punpckhdq m1, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + jmp tx2q +.pass2: + call .main +.end: + vpbroadcastd m4, [o(vvc_pw_2048)] + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + WIN64_RESTORE_XMM +.end2: + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + mova [cq+32*2], m4 + mova [cq+32*3], m4 +.end3: + WRITE_16X2 0, 1, 4, 5, strideq*0, strideq*1 + lea dstq, [dstq+strideq*2] + WRITE_16X2 2, 3, 4, 5, strideq*0, strideq*1 + RET +ALIGN function_align +cglobal_label .main + vpbroadcastd m6, 
[o(vvc_pw_m3344_3344)] + vpbroadcastd m7, [o(vvc_pw_3803_1321)] + vpbroadcastd m8, [o(vvc_pw_m1321_2482)] + vpbroadcastd m9, [o(vvc_pw_2482_3344)] + punpcklwd m4, m2, m0 ; in2 in0 l + punpckhwd m2, m0 ; in2 in0 h + psrld m5, m6, 16 + pmaddwd m10, m6, m4 ; t2:02 l + pmaddwd m6, m2 ; t2:02 h + pmaddwd m0, m7, m4 ; t0:02 l + pmaddwd m7, m2 ; t0:02 h + pmaddwd m4, m8 ; t1:02 l + pmaddwd m8, m2 ; t1:02 h + punpckhwd m2, m3, m1 ; in3 in1 h + punpcklwd m3, m1 ; in3 in1 l + pmaddwd m1, m5, m2 ; t2:3 h + pmaddwd m5, m3 ; t2:3 l + paddd m6, m1 + vpbroadcastd m1, [o(vvc_pd_64)] + paddd m10, m5 + pmaddwd m5, m9, m3 + pmaddwd m9, m2 + paddd m0, m1 + paddd m7, m1 + paddd m0, m5 ; t0 + t3 + 2048 l + paddd m7, m9 ; t0 + t3 + 2048 h + vpbroadcastd m9, [o(vvc_pw_m3803_3344)] + pmaddwd m5, m9, m2 + pmaddwd m9, m3 + paddd m10, m1 ; t2 + 2048 l + paddd m6, m1 ; t2 + 2048 h + paddd m5, m1 ; t1:13 + 2048 h + paddd m1, m9 ; t1:13 + 2048 l + vpbroadcastd m9, [o(vvc_pw_m3803_m6688)] + pmaddwd m2, m9 + pmaddwd m3, m9 + paddd m5, m8 ; t1 + t3 + 2048 h + paddd m1, m4 ; t1 + t3 + 2048 l + paddd m8, m7 + paddd m4, m0 + paddd m2, m8 ; t0 + t1 - t3 + 2048 h + paddd m3, m4 ; t0 + t1 - t3 + 2048 l + REPX {psrad x, 7}, m10, m6, m0, m7, m5, m1, m2, m3 + packssdw m0, m7 + packssdw m1, m5 + packssdw m3, m2 + packssdw m2, m10, m6 + ret + +INV_TXFM_16X4_FN flipadst, dct2 +INV_TXFM_16X4_FN flipadst, adst +INV_TXFM_16X4_FN flipadst, flipadst +INV_TXFM_16X4_FN flipadst, identity + +cglobal iflipadst_16x4_internal_8, 0, 5, 11, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q1230 + vpermq m3, [cq+32*3], q2103 + vpermq m1, [cq+32*1], q1230 + vpermq m2, [cq+32*2], q2103 + call m(iadst_4x16_internal_8).main2 + call m(iadst_4x16_internal_8).main_pass1_end + punpckhwd m4, m3, m2 + punpckhwd m5, m1, m0 + punpcklwd m0, m2 + punpcklwd m1, m3 + vpbroadcastd m6, [o(vvc_pw_16384)] + vinserti128 m3, m0, xm1, 1 + vperm2i128 m2, m0, m1, 0x31 + vinserti128 m0, m4, xm5, 1 + vperm2i128 m4, m4, m5, 0x31 + psubw m1, m7, m6 + 
jmp m(iadst_16x4_internal_8).pass1_end +ALIGN function_align +.pass2: + call m(iadst_16x4_internal_8).main + vpbroadcastd m4, [o(vvc_pw_2048)] + REPX {pmulhrsw x, m4}, m3, m2, m1, m0 + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + mova [cq+32*2], m4 + mova [cq+32*3], m4 + WRITE_16X2 3, 2, 4, 5, strideq*0, strideq*1 + lea dstq, [dstq+strideq*2] + WRITE_16X2 1, 0, 4, 5, strideq*0, strideq*1 + RET + +INV_TXFM_16X4_FN identity, dct2 +INV_TXFM_16X4_FN identity, adst +INV_TXFM_16X4_FN identity, flipadst +INV_TXFM_16X4_FN identity, identity + +cglobal iidentity_16x4_internal_8, 0, 5, 11, dst, stride, c, eob, tx2 + mova xm2, [cq+16*0] + mova xm4, [cq+16*1] + vinserti128 m2, [cq+16*4], 1 + vinserti128 m4, [cq+16*5], 1 + mova xm0, [cq+16*2] + mova xm1, [cq+16*3] + vinserti128 m0, [cq+16*6], 1 + vinserti128 m1, [cq+16*7], 1 + vpbroadcastd m7, [o(vvc_pw_1697x16)] + vpbroadcastd m8, [o(vvc_pw_16384)] + punpcklwd m3, m2, m4 + punpckhwd m2, m4 + punpcklwd m4, m0, m1 + punpckhwd m0, m1 + punpcklwd m1, m3, m2 + punpckhwd m3, m2 + punpcklwd m2, m4, m0 + punpckhwd m4, m0 + pmulhrsw m0, m7, m1 + pmulhrsw m5, m7, m2 + pmulhrsw m6, m7, m3 + pmulhrsw m7, m4 + REPX {pmulhrsw x, m8}, m0, m5, m6, m7 + paddsw m1, m0 + paddsw m2, m5 + paddsw m3, m6 + paddsw m4, m7 + punpcklqdq m0, m1, m2 + punpckhqdq m1, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + jmp tx2q +.pass2: + vpbroadcastd m7, [o(vvc_pw_1697x8)] + pmulhrsw m4, m7, m0 + pmulhrsw m5, m7, m1 + pmulhrsw m6, m7, m2 + pmulhrsw m7, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + jmp m(iadst_16x4_internal_8).end + +%macro INV_TXFM_16X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x8 +%ifidn %1_%2, dct2_dct2 + movd xm1, [o(vvc_pw_64x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(vvc_pw_16384)] + mov [cq], eobd + pmulhrsw xm0, xm1 + or r3d, 8 + jmp m(vvc_inv_dct2_dct2_16x4_8).dconly +%endif +%endmacro + +%macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd + vpbroadcastd m8, [o(vvc_pw_64x8)] + vpermq m0, [cq+32*0], q3120 + 
add cq, 32*4 + vpermq m7, [cq+32*3], q%1 + vpermq m1, [cq-32*3], q%1 + vpermq m6, [cq+32*2], q3120 + vpermq m2, [cq-32*2], q3120 + vpermq m5, [cq+32*1], q%1 + vpermq m3, [cq-32*1], q%1 + vpermq m4, [cq+32*0], q3120 + REPX {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4 +%endmacro + +INV_TXFM_16X8_FN dct2, dct2 +INV_TXFM_16X8_FN dct2, adst +INV_TXFM_16X8_FN dct2, flipadst +INV_TXFM_16X8_FN dct2, identity + +cglobal idct2_16x8_internal_8, 0, 5, 13, dst, stride, c, eob, tx2 + ITX_16X8_LOAD_COEFS 3120 + call m(idct2_8x16_internal_8).main + vpbroadcastd m10, [o(vvc_pw_16384)] + punpckhwd m8, m0, m2 + punpcklwd m0, m2 + punpckhwd m2, m1, m3 + punpcklwd m1, m3 + punpcklwd m9, m4, m6 + punpckhwd m4, m6 + punpcklwd m6, m5, m7 + punpckhwd m5, m7 + REPX {pmulhrsw x, m10}, m8, m1, m4, m6 +.pass1_end: + REPX {pmulhrsw x, m10}, m0, m2, m9, m5 + punpckhwd m3, m0, m8 + punpcklwd m0, m8 + punpckhwd m8, m2, m1 + punpcklwd m2, m1 + punpcklwd m7, m9, m4 + punpckhwd m9, m4 + punpcklwd m4, m5, m6 + punpckhwd m5, m6 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + punpckldq m2, m3, m8 + punpckhdq m3, m8 + punpckldq m6, m7, m4 + punpckhdq m7, m4 + punpckldq m8, m9, m5 + punpckhdq m9, m5 + vperm2i128 m4, m0, m6, 0x31 + vinserti128 m0, xm6, 1 + vperm2i128 m5, m1, m7, 0x31 + vinserti128 m1, xm7, 1 + vperm2i128 m6, m2, m8, 0x31 + vinserti128 m2, xm8, 1 + vperm2i128 m7, m3, m9, 0x31 + vinserti128 m3, xm9, 1 + jmp tx2q +.pass2: + call .main + vpbroadcastd m8, [o(vvc_pw_2048)] +.end: + REPX {pmulhrsw x, m8}, m0, m2, m4, m6 +.end2: + REPX {pmulhrsw x, m8}, m1, m3, m5, m7 + lea r3, [strideq*3] + WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r3 +.end3: + pxor m0, m0 + REPX {mova [cq+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 +.end4: + lea dstq, [dstq+strideq*4] + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 6, 7, 0, 1, strideq*2, r3 + RET +ALIGN function_align +cglobal_label .main + vpbroadcastd m10, [o(vvc_pd_64)] +.main2: + IDCT2_8_1D 0, 1, 2, 3, 4, 5, 6, 7, 
8, 9, 10 + ret + +INV_TXFM_16X8_FN adst, dct2 +INV_TXFM_16X8_FN adst, adst +INV_TXFM_16X8_FN adst, flipadst +INV_TXFM_16X8_FN adst, identity + +cglobal iadst_16x8_internal_8, 0, 5, 13, dst, stride, c, eob, tx2 + ITX_16X8_LOAD_COEFS 1302 + call m(iadst_8x16_internal_8).main2 + call m(iadst_8x16_internal_8).main_pass1_end + psubw m11, m9, m10 + punpcklwd m8, m0, m2 + punpckhwd m0, m2 + punpckhwd m2, m1, m3 + punpcklwd m1, m3 + punpcklwd m9, m4, m6 + punpckhwd m4, m6 + punpckhwd m6, m5, m7 + punpcklwd m5, m7 + REPX {pmulhrsw x, m11}, m8, m1, m4, m6 + jmp m(idct2_16x8_internal_8).pass1_end +ALIGN function_align +.pass2: + call .main + call .main_pass2_end + pxor m8, m8 + psubw m8, m9 + REPX {pmulhrsw x, m9}, m0, m2, m4, m6 + jmp m(idct2_16x8_internal_8).end2 +ALIGN function_align +cglobal_label .main + vpbroadcastd m10, [o(vvc_pd_64)] + ITX_MULSUB_2W 7, 0, 8, 9, 10, 9, 90, 0 ; t1a, t0a + ITX_MULSUB_2W 3, 4, 8, 9, 10, 70, 57, 0 ; t5a, t4a + ITX_MULSUB_2W 1, 6, 8, 9, 10, 87, 25, 0 ; t7a, t6a + ITX_MULSUB_2W 5, 2, 8, 9, 10, 43, 80, 0 ; t3a, t2a + psubsw m8, m2, m6 ; t6 + paddsw m2, m6 ; t2 + psubsw m6, m0, m4 ; t4 + paddsw m0, m4 ; t0 + psubsw m4, m5, m1 ; t7 + paddsw m5, m1 ; t3 + psubsw m1, m7, m3 ; t5 + paddsw m7, m3 ; t1 + ITX_MULSUB_2W 6, 1, 3, 9, 10, 36, 83, 0 ; t5a, t4a + ITX_MULSUB_2W 4, 8, 3, 9, 10, 83, 36, 0 ; t6a, t7a + psubsw m9, m6, m8 ; t7 + paddsw m6, m8 ; out6 + psubsw m3, m7, m5 ; t3 + paddsw m7, m5 ; -out7 + psubsw m5, m0, m2 ; t2 + paddsw m0, m2 ; out0 + psubsw m2, m1, m4 ; t6 + paddsw m1, m4 ; -out1 + ret +ALIGN function_align +.main_pass1_end: + vpbroadcastd m11, [o(vvc_pw_m64_64)] + vpbroadcastd m12, [o(vvc_pw_64_64)] + punpckhwd m4, m3, m5 + punpcklwd m3, m5 + pmaddwd m5, m11, m4 + pmaddwd m4, m12 + pmaddwd m8, m11, m3 + pmaddwd m3, m12 + REPX {paddd x, m10}, m5, m4, m8, m3 + REPX {psrad x, 7 }, m5, m8, m4, m3 + packssdw m3, m4 ; -out3 + packssdw m4, m8, m5 ; out4 + punpcklwd m5, m9, m2 + punpckhwd m9, m2 + pmaddwd m2, m12, m5 + pmaddwd m5, m11 + 
pmaddwd m12, m9 + pmaddwd m11, m9 + REPX {paddd x, m10}, m2, m5, m12, m11 + REPX {psrad x, 7 }, m2, m12, m5, m11 ; shift the same four dword vectors that were just rounded; m7 still holds -out7 from .main and must not be touched + packssdw m2, m12 ; out2 + packssdw m5, m11 ; -out5 + ret +ALIGN function_align +cglobal_label .main_pass2_end + vpbroadcastd m8, [o(vvc_pw_64x8)] + psubsw m4, m5, m3 + paddsw m3, m5 + psubsw m5, m2, m9 + paddsw m2, m9 + pmulhrsw m2, m8 ; out2 + pmulhrsw m3, m8 ; -out3 + pmulhrsw m4, m8 ; out4 + pmulhrsw m5, m8 ; -out5 + vpbroadcastd m9, [o(vvc_pw_2048)] + ret + +INV_TXFM_16X8_FN flipadst, dct2 +INV_TXFM_16X8_FN flipadst, adst +INV_TXFM_16X8_FN flipadst, flipadst +INV_TXFM_16X8_FN flipadst, identity + +cglobal iflipadst_16x8_internal_8, 0, 5, 13, dst, stride, c, eob, tx2 + ITX_16X8_LOAD_COEFS 1302 + call m(iadst_8x16_internal_8).main2 + call m(iadst_8x16_internal_8).main_pass1_end + psubw m9, m10 + punpcklwd m8, m6, m4 + punpckhwd m6, m4 + punpcklwd m4, m7, m5 + punpckhwd m7, m5 + punpckhwd m5, m3, m1 + punpcklwd m3, m1 + punpckhwd m1, m2, m0 + punpcklwd m2, m0 + REPX {pmulhrsw x, m10}, m8, m4, m5, m1 + REPX {pmulhrsw x, m9 }, m6, m7, m3, m2 + punpcklwd m0, m7, m4 + punpckhwd m7, m4 + punpckhwd m4, m6, m8 + punpcklwd m6, m8 + punpckhwd m8, m3, m5 + punpcklwd m3, m5 + punpcklwd m5, m2, m1 + punpckhwd m2, m1 + punpckhdq m1, m0, m6 + punpckldq m0, m6 + punpckldq m6, m7, m4 + punpckhdq m7, m4 + punpckhdq m4, m3, m5 + punpckldq m3, m5 + punpckldq m5, m8, m2 + punpckhdq m8, m2 + vinserti128 m2, m6, xm5, 1 + vperm2i128 m6, m5, 0x31 + vperm2i128 m5, m1, m4, 0x31 + vinserti128 m1, xm4, 1 + vperm2i128 m4, m0, m3, 0x31 + vinserti128 m0, xm3, 1 + vinserti128 m3, m7, xm8, 1 + vperm2i128 m7, m8, 0x31 + jmp tx2q +.pass2: + call m(iadst_16x8_internal_8).main + call m(iadst_16x8_internal_8).main_pass2_end + pxor m8, m8 + psubw m8, m9 + pmulhrsw m10, m7, m8 + pmulhrsw m7, m0, m9 + pmulhrsw m0, m6, m9 + pmulhrsw m6, m1, m8 + pmulhrsw m1, m5, m8 + pmulhrsw m5, m2, m9 + pmulhrsw m2, m4, m9 + pmulhrsw m4, m3, m8 + lea r3, [strideq*3] + WRITE_16X2 10, 0, 8, 9, strideq*0, 
strideq*1 + WRITE_16X2 1, 2, 0, 1, strideq*2, r3 + jmp m(idct2_16x8_internal_8).end3 + +INV_TXFM_16X8_FN identity, dct2 +INV_TXFM_16X8_FN identity, adst +INV_TXFM_16X8_FN identity, flipadst +INV_TXFM_16X8_FN identity, identity + +cglobal iidentity_16x8_internal_8, 0, 5, 13, dst, stride, c, eob, tx2 + mova xm7, [cq+16*0] + mova xm2, [cq+16*1] + add cq, 16*8 + vpbroadcastd m3, [o(vvc_pw_64x8)] + vinserti128 m7, [cq+16*0], 1 + vinserti128 m2, [cq+16*1], 1 + mova xm6, [cq-16*6] + mova xm4, [cq-16*5] + vinserti128 m6, [cq+16*2], 1 + vinserti128 m4, [cq+16*3], 1 + mova xm8, [cq-16*4] + mova xm5, [cq-16*3] + vinserti128 m8, [cq+16*4], 1 + vinserti128 m5, [cq+16*5], 1 + mova xm0, [cq-16*2] + mova xm1, [cq-16*1] + vinserti128 m0, [cq+16*6], 1 + vinserti128 m1, [cq+16*7], 1 + vpbroadcastd m10, [o(vvc_pw_1697x16)] + vpbroadcastd m11, [o(vvc_pw_16384)] + REPX {pmulhrsw x, m3}, m7, m2, m6, m4, m8, m5, m0, m1 + punpcklwd m3, m7, m2 + punpckhwd m7, m2 + punpcklwd m2, m6, m4 + punpckhwd m6, m4 + punpcklwd m4, m8, m5 + punpckhwd m8, m5 + punpcklwd m5, m0, m1 + punpckhwd m0, m1 + punpckldq m1, m3, m2 + punpckhdq m3, m2 + punpckldq m2, m4, m5 + punpckhdq m4, m5 + punpckldq m5, m7, m6 + punpckhdq m7, m6 + punpckldq m6, m8, m0 + punpckhdq m8, m0 + REPX {IDTX16 x, 0, 10, 11}, 1, 3, 2, 4, 5, 7, 6, 8 + punpcklqdq m0, m1, m2 + punpckhqdq m1, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + punpcklqdq m4, m5, m6 + punpckhqdq m5, m6 + punpcklqdq m6, m7, m8 + punpckhqdq m7, m8 + jmp tx2q +.pass2: + vpbroadcastd m8, [o(vvc_pw_64)] + jmp m(idct2_16x8_internal_8).end + +%define o_base vvc_pw_5 + 128 + +%macro INV_TXFM_16X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x16 +%ifidn %1_%2, dct2_dct2 + movd xm1, [o(vvc_pw_64x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(vvc_pw_8192)] + mov [cq], eobd + or r3d, 16 + jmp m(vvc_inv_dct2_dct2_16x4_8).dconly +%endif +%endmacro + +%macro ITX_16X16_LOAD_COEFS 0 + mova m0, [cq+32*0] + mova m1, [cq+32*1] + mova m2, [cq+32*2] + mova m3, [cq+32*3] + add cq, 32*8 
+ mova m4, [cq-32*4] + mova m5, [cq-32*3] + mova m6, [cq-32*2] + mova m7, [cq-32*1] + mova m8, [cq+32*0] + mova m9, [cq+32*1] + mova m10, [cq+32*2] + mova m11, [cq+32*3] + mova m12, [cq+32*4] + mova m13, [cq+32*5] + mova m14, [cq+32*6] + mova m15, [cq+32*7] + mova [rsp], m15 +%endmacro + +INV_TXFM_16X16_FN dct2, dct2 +INV_TXFM_16X16_FN dct2, adst +INV_TXFM_16X16_FN dct2, flipadst +INV_TXFM_16X16_FN dct2, identity + +cglobal idct2_16x16_internal_8, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 + ITX_16X16_LOAD_COEFS + call .main +.pass1_end: + vpbroadcastd m1, [o(vvc_pw_8192)] + REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14 + vextracti128 [rsp+16*5], m8, 1 + mova [rsp+16*1], xm8 +.pass1_end2: + vextracti128 [rsp+16*4], m0, 1 + mova [rsp+16*0], xm0 + REPX {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15 + pmulhrsw m1, [rsp+32*1] + vperm2i128 m8, m1, m9, 0x31 + vinserti128 m1, xm9, 1 + vperm2i128 m9, m2, m10, 0x31 + vinserti128 m2, xm10, 1 + vperm2i128 m10, m3, m11, 0x31 + vinserti128 m3, xm11, 1 + vperm2i128 m11, m4, m12, 0x31 + vinserti128 m4, xm12, 1 + vperm2i128 m12, m5, m13, 0x31 + vinserti128 m5, xm13, 1 + vperm2i128 m13, m6, m14, 0x31 + vinserti128 m6, xm14, 1 + vperm2i128 m14, m7, m15, 0x31 + vinserti128 m7, xm15, 1 + mova m15, [rsp+32*2] +.pass1_end3: + punpcklwd m0, m9, m10 + punpckhwd m9, m10 + punpcklwd m10, m15, m8 + punpckhwd m15, m8 + punpckhwd m8, m11, m12 + punpcklwd m11, m12 + punpckhwd m12, m13, m14 + punpcklwd m13, m14 + punpckhdq m14, m11, m13 + punpckldq m11, m13 + punpckldq m13, m15, m9 + punpckhdq m15, m9 + punpckldq m9, m10, m0 + punpckhdq m10, m0 + punpckhdq m0, m8, m12 + punpckldq m8, m12 + punpcklqdq m12, m13, m8 + punpckhqdq m13, m8 + punpcklqdq m8, m9, m11 + punpckhqdq m9, m11 + punpckhqdq m11, m10, m14 + punpcklqdq m10, m14 + punpcklqdq m14, m15, m0 + punpckhqdq m15, m0 + mova m0, [rsp] + mova [rsp], m15 + punpckhwd m15, m4, m5 + punpcklwd m4, m5 + punpckhwd m5, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m6, m7 + punpcklwd m6, m7 + 
punpckhwd m7, m2, m3 + punpcklwd m2, m3 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckldq m2, m4, m6 + punpckhdq m4, m6 + punpckhdq m6, m5, m7 + punpckldq m5, m7 + punpckldq m7, m15, m1 + punpckhdq m15, m1 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + punpcklqdq m4, m5, m7 + punpckhqdq m5, m7 + punpckhqdq m7, m6, m15 + punpcklqdq m6, m15 + jmp tx2q +.pass2: + call .main +.end: + vpbroadcastd m1, [o(vvc_pw_2048)] + REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14 + mova [rsp], m6 +.end2: + REPX {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15 + pmulhrsw m1, [rsp+32*1] + lea r3, [strideq*3] + WRITE_16X2 0, 1, 6, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r3 + lea dstq, [dstq+strideq*4] + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 [rsp], 7, 0, 1, strideq*2, r3 +.end3: + pxor m2, m2 + REPX {mova [cq+32*x], m2}, -8, -7, -6, -5, -4, -3, -2, -1 + lea dstq, [dstq+strideq*4] + WRITE_16X2 8, 9, 0, 1, strideq*0, strideq*1 + WRITE_16X2 10, 11, 0, 1, strideq*2, r3 + REPX {mova [cq+32*x], m2}, 0, 1, 2, 3, 4, 5, 6, 7 + lea dstq, [dstq+strideq*4] + WRITE_16X2 12, 13, 0, 1, strideq*0, strideq*1 + WRITE_16X2 14, 15, 0, 1, strideq*2, r3 + RET +ALIGN function_align +cglobal_label .main + vpbroadcastd m15, [o(vvc_pd_64)] + mova [rsp+gprsize+32*1], m1 + mova [rsp+gprsize+32*2], m9 + IDCT2_8_1D 0, 2, 4, 6, 8, 10, 12, 14, 1, 9, 15 + mova m1, [rsp+gprsize+32*2] ; in9 + mova [rsp+gprsize+32*2], m14 ; tmp7 + mova m9, [rsp+gprsize+32*1] ; in1 + mova [rsp+gprsize+32*1], m10 ; tmp5 + mova m14, [rsp+gprsize+32*0] ; in15 + mova [rsp+gprsize+32*0], m6 ; tmp3 + IDCT2_16_1D_ODDHALF 9, 3, 5, 7, 1, 11, 13, 14, 6, 10, 15 + mova m6, [rsp+gprsize+32*1] ; tmp5 + psubsw m15, m0, m14 ; out15 + paddsw m0, m14 ; out0 + psubsw m14, m2, m13 ; out14 + paddsw m2, m13 ; out1 + mova [rsp+gprsize+32*1], m2 + psubsw m13, m4, m11 ; out13 + paddsw m2, m4, m11 ; out2 + psubsw m11, m8, m7 ; out11 + paddsw m4, m8, m7 ; out4 + mova m7, 
[rsp+gprsize+32*2] ; tmp7 + psubsw m10, m6, m5 ; out10 + paddsw m5, m6 ; out5 + psubsw m8, m7, m9 ; out8 + paddsw m7, m9 ; out7 + psubsw m9, m12, m3 ; out9 + paddsw m6, m12, m3 ; out6 + mova m3, [rsp+gprsize+32*0] ; tmp3 + psubsw m12, m3, m1 ; out12 + paddsw m3, m1 ; out3 + ret + +INV_TXFM_16X16_FN adst, dct2 +INV_TXFM_16X16_FN adst, adst +INV_TXFM_16X16_FN adst, flipadst + +cglobal iadst_16x16_internal_8, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 + ITX_16X16_LOAD_COEFS + call .main + call .main_pass1_end + pmulhrsw m0, m1, [cq+32*0] + pmulhrsw m2, m1, [cq+32*1] + REPX {pmulhrsw x, m1}, m4, m6, m8, m10 + pmulhrsw m12, m1, [cq+32*2] + pmulhrsw m14, m1, [cq+32*3] + vextracti128 [rsp+16*5], m8, 1 + mova [rsp+16*1], xm8 + pxor m8, m8 + psubw m1, m8, m1 + jmp m(idct2_16x16_internal_8).pass1_end2 +ALIGN function_align +.pass2: + call .main + call .main_pass2_end + REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14 + mova [rsp+32*0], m6 + pxor m6, m6 + psubw m1, m6, m1 + jmp m(idct2_16x16_internal_8).end2 +ALIGN function_align +cglobal_label .main + vpbroadcastd m15, [o(vvc_pd_64)] + mova [rsp+gprsize+32*1], m0 + mova [rsp+gprsize+32*2], m4 + ITX_MULSUB_2W 13, 2, 0, 4, 15, 22, 88, 0 ; t3, t2 + ITX_MULSUB_2W 9, 6, 0, 4, 15, 54, 73, 0 ; t7, t6 + ITX_MULSUB_2W 5, 10, 0, 4, 15, 78, 46, 0 ; t11, t10 + ITX_MULSUB_2W 1, 14, 0, 4, 15, 90, 13, 0 ; t15, t14 + psubsw m0, m2, m10 ; t10a + paddsw m2, m10 ; t2a + psubsw m10, m13, m5 ; t11a + paddsw m13, m5 ; t3a + psubsw m5, m6, m14 ; t14a + paddsw m6, m14 ; t6a + psubsw m14, m9, m1 ; t15a + paddsw m9, m1 ; t7a + ITX_MULSUB_2W 0, 10, 1, 4, 15, 75, 50, 0 ; t11, t10 + ITX_MULSUB_2W 14, 5, 1, 4, 15, 50, 75, 0 ; t14, t15 + psubsw m1, m10, m14 ; t14a + paddsw m10, m14 ; t10a + psubsw m14, m0, m5 ; t15a + paddsw m0, m5 ; t11a + psubsw m5, m2, m6 ; t6 + paddsw m2, m6 ; t2 + psubsw m6, m13, m9 ; t7 + paddsw m13, m9 ; t3 + ITX_MULSUB_2W 6, 5, 4, 9, 15, 83, 36, 0 ; t6a, t7a + ITX_MULSUB_2W 14, 1, 4, 9, 15, 83, 36, 0 ; t14, t15 + mova m9, 
[rsp+gprsize+32*0] ; in15 + mova [rsp+gprsize+32*0], m10 ; t10a + mova m4, [rsp+gprsize+32*1] ; in0 + mova [rsp+gprsize+32*1], m6 ; t6a + mova m6, [rsp+gprsize+32*2] ; in4 + mova [rsp+gprsize+32*2], m2 ; t2 + ITX_MULSUB_2W 9, 4, 2, 10, 15, 4, 90, 0 ; t1, t0 + ITX_MULSUB_2W 11, 6, 2, 10, 15, 38, 82, 0 ; t5, t4 + ITX_MULSUB_2W 7, 8, 2, 10, 15, 67, 61, 0 ; t9, t8 + ITX_MULSUB_2W 3, 12, 2, 10, 15, 85, 31, 0 ; t13, t12 + psubsw m10, m4, m8 ; t8a + paddsw m8, m4 ; t0a + psubsw m4, m9, m7 ; t9a + paddsw m9, m7 ; t1a + psubsw m7, m6, m12 ; t12a + paddsw m6, m12 ; t4a + psubsw m12, m11, m3 ; t13a + paddsw m11, m3 ; t5a + ITX_MULSUB_2W 10, 4, 2, 3, 15, 18, 89, 0 ; t9, t8 + ITX_MULSUB_2W 12, 7, 2, 3, 15, 89, 18, 0 ; t12, t13 + psubsw m3, m9, m11 ; t5 + paddsw m9, m11 ; t1 + psubsw m11, m4, m12 ; t12a + paddsw m4, m12 ; t8a + paddsw m12, m8, m6 ; t0 + psubsw m8, m6 ; t4 + paddsw m6, m10, m7 ; t9a + psubsw m10, m7 ; t13a + ITX_MULSUB_2W 8, 3, 2, 7, 15, 36, 83, 0 ; t5a, t4a + ITX_MULSUB_2W 11, 10, 2, 7, 15, 36, 83, 0 ; t13, t12 + mova m7, [rsp+gprsize+32*0] ; t10a + mova m2, [rsp+gprsize+32*1] ; t6a + paddsw m15, m9, m13 ; -out15 + psubsw m9, m13 ; t3a + paddsw m13, m11, m1 ; -out13 + psubsw m11, m1 ; t15a + psubsw m1, m4, m7 ; t10 + paddsw m7, m4 ; -out1 + psubsw m4, m3, m2 ; t6 + paddsw m3, m2 ; -out3 + paddsw m2, m10, m14 ; out2 + psubsw m10, m14 ; t14a + paddsw m14, m6, m0 ; out14 + psubsw m6, m0 ; t11 + mova m0, [rsp+gprsize+32*2] ; t2 + mova [rsp+gprsize+32*1], m7 + psubsw m7, m12, m0 ; t2a + paddsw m0, m12 ; out0 + paddsw m12, m8, m5 ; out12 + psubsw m8, m5 ; t7 + ret +ALIGN function_align +.main_pass1_end: + mova [cq+32*0], m0 + mova [cq+32*1], m2 + mova [cq+32*2], m12 + mova [cq+32*3], m14 + vpbroadcastd m14, [vvc_pw_m64_64] + vpbroadcastd m12, [vvc_pw_64_64] + vpbroadcastd m2, [vvc_pd_64] + punpcklwd m5, m11, m10 + punpckhwd m11, m10 + pmaddwd m10, m14, m5 + pmaddwd m0, m14, m11 + pmaddwd m5, m12 + pmaddwd m11, m12 + REPX {paddd x, m2}, m10, m0, m5, m11 + REPX {psrad 
x, 7}, m10, m0, m5, m11 + packssdw m10, m0 ; out10 + packssdw m5, m11 ; -out5 + punpcklwd m11, m8, m4 + punpckhwd m8, m4 + pmaddwd m4, m12, m11 + pmaddwd m0, m12, m8 + pmaddwd m11, m14 + pmaddwd m8, m14 + REPX {paddd x, m2}, m4, m0, m11, m8 + REPX {psrad x, 7}, m4, m0, m11, m8 + packssdw m4, m0 ; out4 + packssdw m11, m8 ; -out11 + punpcklwd m8, m9, m7 + punpckhwd m9, m7 + pmaddwd m7, m12, m8 + pmaddwd m0, m12, m9 + pmaddwd m8, m14 + pmaddwd m9, m14 + REPX {paddd x, m2}, m7, m0, m8, m9 + REPX {psrad x, 7}, m7, m0, m8, m9 + packssdw m7, m0 ; -out7 + packssdw m8, m9 ; out8 + punpckhwd m0, m6, m1 + punpcklwd m6, m1 + pmaddwd m1, m14, m0 + pmaddwd m9, m14, m6 + pmaddwd m0, m12 + pmaddwd m6, m12 + REPX {paddd x, m2}, m1, m9, m0, m6 + REPX {psrad x, 7}, m1, m9, m0, m6 + packssdw m9, m1 ; -out9 (same m1/m6 pair that .main_pass2_end labels -out9) + packssdw m6, m0 ; out6 + vpbroadcastd m1, [o(vvc_pw_8192)] + ret +ALIGN function_align +cglobal_label .main_pass2_end + ; In pass 2 we're going to clip to pixels afterwards anyway, so clipping to + ; 16-bit here will produce the same result as using 32-bit intermediates. 
+ paddsw m5, m10, m11 ; -out5 + psubsw m10, m11 ; out10 + psubsw m11, m4, m8 ; -out11 + paddsw m4, m8 ; out4 + psubsw m8, m7, m9 ; out8 + paddsw m7, m9 ; -out7 + psubsw m9, m1, m6 ; -out9 + paddsw m6, m1 ; out6 + vpbroadcastd m1, [o(vvc_pw_64x8)] + REPX {pmulhrsw x, m1}, m4, m5, m6, m7, m8, m9, m10, m11 + vpbroadcastd m1, [o(vvc_pw_2048)] + ret + +INV_TXFM_16X16_FN flipadst, dct2 +INV_TXFM_16X16_FN flipadst, adst +INV_TXFM_16X16_FN flipadst, flipadst + +cglobal iflipadst_16x16_internal_8, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 + ITX_16X16_LOAD_COEFS + call m(iadst_16x16_internal_8).main + call m(iadst_16x16_internal_8).main_pass1_end + pmulhrsw m6, m1 + pmulhrsw m2, m1, m8 + mova [rsp+32*2], m6 + pmulhrsw m6, m1, m4 + pmulhrsw m4, m1, m10 + pmulhrsw m8, m1, [cq+32*3] + pmulhrsw m10, m1, [cq+32*2] + pmulhrsw m12, m1, [cq+32*1] + pmulhrsw m14, m1, [cq+32*0] + pxor m0, m0 + psubw m0, m1 + REPX {pmulhrsw x, m0}, m3, m5, m7, m11, m15 + pmulhrsw m1, m0, m9 + pmulhrsw m9, m0, m13 + pmulhrsw m0, [rsp+32*1] + mova [rsp+16*0], xm15 + mova [rsp+16*1], xm7 + vperm2i128 m15, m15, m7, 0x31 + vinserti128 m7, m2, xm14, 1 + vperm2i128 m14, m2, m14, 0x31 + vinserti128 m2, m9, xm5, 1 + vperm2i128 m9, m9, m5, 0x31 + vinserti128 m5, m4, xm12, 1 + vperm2i128 m12, m4, m12, 0x31 + vinserti128 m4, m11, xm3, 1 + vperm2i128 m11, m11, m3, 0x31 + vinserti128 m3, m10, xm6, 1 + vperm2i128 m10, m10, m6, 0x31 + vinserti128 m6, m1, xm0, 1 + vperm2i128 m13, m1, m0, 0x31 + vinserti128 m1, m8, [rsp+32*2], 1 + vperm2i128 m8, m8, [rsp+32*2], 0x31 + jmp m(idct2_16x16_internal_8).pass1_end3 +.pass2: + call m(iadst_16x16_internal_8).main + call m(iadst_16x16_internal_8).main_pass2_end + pmulhrsw m0, m1 + pmulhrsw m8, m1 + mova [rsp+32*0], m0 + mova [rsp+32*2], m8 + pxor m0, m0 + psubw m0, m1 + pmulhrsw m8, m0, m7 + pmulhrsw m7, m0, m9 + pmulhrsw m9, m1, m6 + pmulhrsw m6, m1, m10 + pmulhrsw m10, m0, m5 + pmulhrsw m5, m0, m11 + pmulhrsw m11, m1, m4 + pmulhrsw m4, m1, m12 + pmulhrsw m12, m0, m3 + pmulhrsw 
m3, m0, m13 + pmulhrsw m13, m1, m2 + pmulhrsw m1, m14 + pmulhrsw m14, m0, [rsp+32*1] + pmulhrsw m0, m15 + lea r3, [strideq*3] + WRITE_16X2 0, 1, 2, 0, strideq*0, strideq*1 + mova m15, [rsp+32*0] + WRITE_16X2 3, 4, 0, 1, strideq*2, r3 + lea dstq, [dstq+strideq*4] + WRITE_16X2 5, 6, 0, 1, strideq*0, strideq*1 + WRITE_16X2 7, [rsp+32*2], 0, 1, strideq*2, r3 + jmp m(idct2_16x16_internal_8).end3 + +%macro IDTX16B 3 ; src/dst, tmp, vvc_pw_1697x16 + pmulhrsw m%2, m%3, m%1 + psraw m%2, 1 + pavgw m%1, m%2 ; signs are guaranteed to be equal +%endmacro + +INV_TXFM_16X16_FN identity, dct2 +INV_TXFM_16X16_FN identity, identity + +cglobal iidentity_16x16_internal_8, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 + vpbroadcastd m7, [o(vvc_pw_1697x16)] + mova xm0, [cq+16* 0] + vinserti128 m0, [cq+16*16], 1 + mova xm15, [cq+16* 1] + vinserti128 m15, [cq+16*17], 1 + mova xm1, [cq+16* 2] + vinserti128 m1, [cq+16*18], 1 + mova xm8, [cq+16* 3] + vinserti128 m8, [cq+16*19], 1 + mova xm2, [cq+16* 4] + vinserti128 m2, [cq+16*20], 1 + mova xm9, [cq+16* 5] + vinserti128 m9, [cq+16*21], 1 + mova xm3, [cq+16* 6] + vinserti128 m3, [cq+16*22], 1 + mova xm10, [cq+16* 7] + add cq, 16*16 + vinserti128 m10, [cq+16* 7], 1 + mova xm4, [cq-16* 8] + vinserti128 m4, [cq+16* 8], 1 + mova xm11, [cq-16* 7] + vinserti128 m11, [cq+16* 9], 1 + mova xm5, [cq-16* 6] + vinserti128 m5, [cq+16*10], 1 + mova xm12, [cq-16* 5] + vinserti128 m12, [cq+16*11], 1 + mova xm13, [cq-16* 3] + vinserti128 m13, [cq+16*13], 1 + mova xm14, [cq-16* 1] + vinserti128 m14, [cq+16*15], 1 + REPX {IDTX16B x, 6, 7}, 0, 15, 1, 8, 2, 9, 3, \ + 10, 4, 11, 5, 12, 13, 14 + mova xm6, [cq-16* 4] + vinserti128 m6, [cq+16*12], 1 + mova [rsp], m0 + IDTX16B 6, 0, 7 + mova xm0, [cq-16* 2] + vinserti128 m0, [cq+16*14], 1 + pmulhrsw m7, m0 + psraw m7, 1 + pavgw m7, m0 + jmp m(idct2_16x16_internal_8).pass1_end3 +ALIGN function_align +.pass2: + vpbroadcastd m15, [o(vvc_pw_1697x16)] + mova [rsp+32*1], m0 + REPX {IDTX16 x, 0, 15}, 1, 2, 3, 4, 5, 6, 7, \ + 8, 
9, 10, 11, 12, 13, 14 + mova m0, [rsp+32*1] + mova [rsp+32*1], m1 + IDTX16 0, 1, 15 + mova m1, [rsp+32*0] + pmulhrsw m15, m1 + paddsw m1, m1 + paddsw m15, m1 + jmp m(idct2_16x16_internal_8).end + +%define o_base deint_shuf + 128 + +%macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2 +%if %3 + vpbroadcastd m15, [o(vvc_pw_64x8)] + pmulhrsw m0, m15, [%1+%2*0] + pmulhrsw m1, m15, [%1+%2*1] + pmulhrsw m2, m15, [%1+%2*2] + pmulhrsw m3, m15, [%1+%2*3] + pmulhrsw m4, m15, [%1+%2*4] + pmulhrsw m5, m15, [%1+%2*5] + pmulhrsw m6, m15, [%1+%2*6] + pmulhrsw m7, m15, [%1+%2*7] +%else + mova m0, [%1+%2*0] + mova m1, [%1+%2*1] + mova m2, [%1+%2*2] + mova m3, [%1+%2*3] + mova m4, [%1+%2*4] + mova m5, [%1+%2*5] + mova m6, [%1+%2*6] + mova m7, [%1+%2*7] +%endif +%endmacro + +%macro LOAD_8ROWS_H 2-3 0 ; src, stride, is_rect2 +%if %3 +%if %3 == 1 + vpbroadcastd m15, [o(vvc_pw_64x8)] +%endif + pmulhrsw m8, m15, [%1+%2*0] + pmulhrsw m9, m15, [%1+%2*1] + pmulhrsw m10, m15, [%1+%2*2] + pmulhrsw m11, m15, [%1+%2*3] + pmulhrsw m12, m15, [%1+%2*4] + pmulhrsw m13, m15, [%1+%2*5] + pmulhrsw m14, m15, [%1+%2*6] + pmulhrsw m15, [%1+%2*7] +%else + mova m8, [%1+%2*0] + mova m9, [%1+%2*1] + mova m10, [%1+%2*2] + mova m11, [%1+%2*3] + mova m12, [%1+%2*4] + mova m13, [%1+%2*5] + mova m14, [%1+%2*6] + mova m15, [%1+%2*7] +%endif +%endmacro + +%macro ITX_UNPACK_MULHRSW 7 ; dst1, dst2/src, tmp, coef[1-4] + vpbroadcastd m%3, [r5-vvc_pw_4_90x8+vvc_pw_%4_%5x8] + punpcklwd m%1, m%2, m%2 + pmulhrsw m%1, m%3 + vpbroadcastd m%3, [r5-vvc_pw_4_90x8+vvc_pw_%6_%7x8] + punpckhwd m%2, m%2 + pmulhrsw m%2, m%3 +%endmacro + +cglobal vvc_inv_dct2_dct2_8x32_8, 4, 4, 0, dst, stride, c, eob + lea r6, [o_base] + test eobd, eobd + jz .dconly + PROLOGUE 0, 4, 16, 32*3, dst, stride, c, eob + %undef cmp + cmp eobd, 106 + jle .fast + LOAD_8ROWS cq+32*1, 32*2 + call m(idct2_16x8_internal_8).main + vperm2i128 m11, m0, m4, 0x31 + vinserti128 m0, xm4, 1 + vperm2i128 m4, m1, m5, 0x31 + vinserti128 m1, xm5, 1 + vperm2i128 m5, m2, m6, 0x31 + 
vinserti128 m2, xm6, 1 + vperm2i128 m6, m3, m7, 0x31 + vinserti128 m3, xm7, 1 + pxor m7, m7 + REPX {mova [cq+32*x], m7}, 1, 3, 5, 7, 9, 11, 13, 15 + punpckhwd m7, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpcklwd m3, m11, m4 + punpckhwd m11, m4 + punpckhwd m4, m5, m6 + punpcklwd m5, m6 + punpckhdq m6, m0, m2 + punpckldq m0, m2 + punpckldq m2, m3, m5 + punpckhdq m3, m5 + punpckhdq m5, m11, m4 + punpckldq m11, m4 + punpckldq m4, m7, m1 + punpckhdq m7, m1 + punpckhqdq m12, m6, m0 + punpcklqdq m0, m6 ; out4 + punpckhqdq m13, m7, m4 + punpcklqdq m4, m7 ; out5 + punpckhqdq m14, m3, m2 + punpcklqdq m2, m3 ; out6 + punpckhqdq m15, m5, m11 + punpcklqdq m11, m5 ; out7 + mova [rsp+32*0], m0 + mova [rsp+32*1], m4 + mova [rsp+32*2], m2 +.fast: + LOAD_8ROWS cq+32*0, 32*2 + call m(idct2_16x8_internal_8).main + vperm2i128 m8, m0, m4, 0x31 + vinserti128 m0, xm4, 1 + vperm2i128 m4, m1, m5, 0x31 + vinserti128 m1, xm5, 1 + vperm2i128 m5, m2, m6, 0x31 + vinserti128 m2, xm6, 1 + vperm2i128 m6, m3, m7, 0x31 + vinserti128 m3, xm7, 1 + vpbroadcastd m9, [o(vvc_pw_8192)] + pxor m7, m7 + REPX {mova [cq+32*x], m7}, 0, 2, 4, 6, 8, 10, 12, 14 + punpckhwd m7, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m8, m4 + punpcklwd m8, m4 + punpckhwd m4, m5, m6 + punpcklwd m5, m6 + punpckhdq m6, m0, m2 + punpckldq m0, m2 + punpckldq m2, m8, m5 + punpckhdq m8, m5 + punpckhdq m5, m3, m4 + punpckldq m3, m4 + punpckhdq m4, m7, m1 + punpckldq m7, m1 + punpcklqdq m1, m7, m4 + punpckhqdq m7, m4 ; out9 + punpckhqdq m4, m2, m8 ; out10 + punpcklqdq m2, m8 + punpckhqdq m8, m3, m5 + punpcklqdq m3, m5 + punpckhqdq m5, m0, m6 ; out8 + punpcklqdq m0, m6 + REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m7 + cmp eobd, 106 + jg .full + mova [rsp+32*0], m5 + mova [rsp+32*1], m7 + mova [rsp+32*2], m4 + pmulhrsw m11, m9, m8 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call .main_fast + jmp .pass2 +.dconly: + movd xm1, [o(vvc_pw_64x8)] + pmulhrsw xm0, xm1, 
[cq] + movd xm2, [o(vvc_pw_8192)] + mov [cq], eobd + or r3d, 32 + jmp m(vvc_inv_dct2_dct2_8x8_8).dconly +.full: + REPX {pmulhrsw x, m9}, m12, m13, m14, m15 + pmulhrsw m6, m9, [rsp+32*2] + mova [rsp+32*2], m4 + pmulhrsw m4, m9, [rsp+32*0] + mova [rsp+32*0], m5 + pmulhrsw m5, m9, [rsp+32*1] + mova [rsp+32*1], m7 + pmulhrsw m7, m9, m11 + pmulhrsw m11, m9, m8 + call .main +.pass2: + vpbroadcastd m12, [o(vvc_pw_2048)] + REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m13, m14, m15 + pmulhrsw m12, [rsp] + REPX {vpermq x, x, q3120}, m0, m2, m4, m6, m8, m10, m12, m14 + REPX {vpermq x, x, q2031}, m1, m3, m5, m7, m9, m11, m13, m15 + mova [rsp+32*0], m4 + mova [rsp+32*1], m6 + lea r3, [strideq*3] + WRITE_8X4 0, 1, 4, 6 + lea dstq, [dstq+strideq*4] + WRITE_8X4 2, 3, 4, 6 + lea dstq, [dstq+strideq*4] + WRITE_8X4 [rsp+32*0], 5, 4, 6 + lea dstq, [dstq+strideq*4] + WRITE_8X4 [rsp+32*1], 7, 4, 6 + lea dstq, [dstq+strideq*4] + WRITE_8X4 8, 9, 4, 6 + lea dstq, [dstq+strideq*4] + WRITE_8X4 10, 11, 4, 6 + lea dstq, [dstq+strideq*4] + WRITE_8X4 12, 13, 4, 6 + lea dstq, [dstq+strideq*4] + WRITE_8X4 14, 15, 4, 6 + RET +ALIGN function_align +cglobal_label .main_fast ; bottom half is zero + call m(idct2_8x16_internal_8).main + mova m8, [rsp+gprsize+0*32] + mova [rsp+gprsize+0*32], m0 + mova m9, [rsp+gprsize+1*32] + mova [rsp+gprsize+1*32], m1 + mova m0, [rsp+gprsize+2*32] + mova [rsp+gprsize+2*32], m6 + lea r5, [r6-(o_base)+vvc_pw_4_90x8] + ITX_UNPACK_MULHRSW 1, 8, 6, 4, 90, n13, 90 ; t16a, t31a, t23a, t24a -- NOTE(review): "n13" here and "n31" below break the "m" negative-prefix naming used by m46/m61 and the other vvc_pw_m* constants; confirm vvc_pw_n13_90x8 and vvc_pw_n31_85x8 are really defined + ITX_UNPACK_MULHRSW 15, 9, 6, 22, 88, n31, 85 ; t20a, t27a, t19a, t28a + ITX_UNPACK_MULHRSW 14, 0, 6, 38, 82, m46, 78 ; t18a, t29a, t21a, t26a + ITX_UNPACK_MULHRSW 13, 11, 6, 54, 73, m61, 67 ; t22a, t25a, t17a, t30a + jmp .main2 +ALIGN function_align +cglobal_label .main + call m(idct2_8x16_internal_8).main + mova m8, [rsp+gprsize+0*32] + mova [rsp+gprsize+0*32], m0 + mova m9, [rsp+gprsize+1*32] + mova [rsp+gprsize+1*32], m1 + mova m0, [rsp+gprsize+2*32] + 
mova [rsp+gprsize+2*32], m6 + punpcklwd m1, m15, m8 ; in31 in1 + punpckhwd m8, m15 ; in3 in29 + punpcklwd m15, m14, m9 ; in27 in5 + punpckhwd m9, m14 ; in7 in25 + punpcklwd m14, m13, m0 ; in23 in9 + punpckhwd m0, m13 ; in11 in21 + punpcklwd m13, m12, m11 ; in19 in13 + punpckhwd m11, m12 ; in15 in17 + ITX_MUL2X_PACK 1, 6, 12, 10, 4, 90, 3 ; t16a, t31a + ITX_MUL2X_PACK 8, 6, 12, 10, 90, 13, 3 ; t23a, t24a + ITX_MUL2X_PACK 15, 6, 12, 10, 22, 88, 3 ; t20a, t27a + ITX_MUL2X_PACK 9, 6, 12, 10, 85, 31, 3 ; t19a, t28a + ITX_MUL2X_PACK 14, 6, 12, 10, 38, 82, 3 ; t18a, t29a + ITX_MUL2X_PACK 0, 6, 12, 10, 78, 46, 3 ; t21a, t26a + ITX_MUL2X_PACK 13, 6, 12, 10, 54, 73, 3 ; t22a, t25a + ITX_MUL2X_PACK 11, 6, 12, 10, 67, 61, 3 ; t17a, t30a +.main2: + psubsw m6, m1, m11 ; t17 t30 + paddsw m1, m11 ; t16 t31 + psubsw m11, m9, m14 ; t18 t29 + paddsw m9, m14 ; t19 t28 + psubsw m14, m15, m0 ; t21 t26 + paddsw m15, m0 ; t20 t27 + psubsw m0, m8, m13 ; t22 t25 + paddsw m8, m13 ; t23 t24 + ITX_MUL2X_PACK 6, 12, 13, 10, 18, 89, 3 ; t17a t30a + ITX_MUL2X_PACK 11, 12, 13, 10, m89, 18, 3 ; t18a t29a + ITX_MUL2X_PACK 14, 12, 13, 10, 75, 50, 3 ; t21a t26a + ITX_MUL2X_PACK 0, 12, 13, 10, m50, 75, 3 ; t22a t25a + psubsw m13, m1, m9 ; t19a t28a + paddsw m1, m9 ; t16a t31a + psubsw m9, m8, m15 ; t20a t27a + paddsw m8, m15 ; t23a t24a + psubsw m15, m6, m11 ; t18 t29 + paddsw m6, m11 ; t17 t30 + psubsw m11, m0, m14 ; t21 t26 + paddsw m0, m14 ; t22 t25 + ITX_MUL2X_PACK 15, 12, 14, 10, 36, 83, 3 ; t18a t29a + ITX_MUL2X_PACK 13, 12, 14, 10, 36, 83, 3 ; t19 t28 + ITX_MUL2X_PACK 9, 12, 14, 10, m83, 36, 3 ; t20 t27 + ITX_MUL2X_PACK 11, 12, 14, 10, m83, 36, 3 ; t21a t26a + vbroadcasti128 m12, [o(deint_shuf)] + psubsw m14, m1, m8 ; t23 t24 + paddsw m1, m8 ; t16 t31 + psubsw m8, m6, m0 ; t22a t25a + paddsw m6, m0 ; t17a t30a + psubsw m0, m15, m11 ; t21 t26 + paddsw m15, m11 ; t18 t29 + psubsw m11, m13, m9 ; t20a t27a + paddsw m13, m9 ; t19a t28a + REPX {pshufb x, m12}, m1, m6, m15, m13 + ITX_MUL2X_PACK 14, 9, 
12, 10, 64, 64 ; t24a t23a + vpbroadcastd m9, [o(vvc_pw_m64_64)] + ITX_MUL2X_PACK 8, 12, _, 10, 12, 9, 4 ; t22 t25 + vpbroadcastd m12, [o(vvc_pw_64_64)] + ITX_MUL2X_PACK 0, 12, _, 10, 12, 9, 4 ; t21a t26a + vpbroadcastd m12, [o(vvc_pw_64_64)] + ITX_MUL2X_PACK 11, 9, _, 10, 9, 12, 4 ; t27 t20 + shufps m9, m14, m8, q1032 ; t23a t22 + vpblendd m14, m8, 0xcc ; t24a t25 + shufps m8, m11, m0, q1032 ; t20 t21a + vpblendd m11, m0, 0xcc ; t27 t26a + punpcklqdq m0, m1, m6 ; t16 t17a + punpckhqdq m1, m6 ; t31 t30a + psubsw m10, m5, m8 ; out20 out21 + paddsw m5, m8 ; out11 out10 + psubsw m6, m3, m14 ; out24 out25 + paddsw m3, m14 ; out7 out6 + psubsw m8, m7, m0 ; out16 out17 + paddsw m7, m0 ; out15 out14 + mova m0, [rsp+gprsize+0*32] + punpcklqdq m12, m13, m15 ; t19a t18 + punpckhqdq m13, m15 ; t28a t29 + psubsw m15, m0, m1 ; out31 out30 + paddsw m0, m1 ; out0 out1 + mova m1, [rsp+gprsize+1*32] + mova [rsp+gprsize+0*32], m6 + mova m6, [rsp+gprsize+2*32] + psubsw m14, m1, m13 ; out28 out29 + paddsw m1, m13 ; out3 out2 + psubsw m13, m2, m11 ; out27 out26 + paddsw m2, m11 ; out4 out5 + psubsw m11, m4, m9 ; out23 out22 + paddsw m4, m9 ; out8 out9 + psubsw m9, m6, m12 ; out19 out18 + paddsw m6, m12 ; out12 out13 + ret + +%macro LOAD_PACKED_16X2 4 ; dst, tmp, row[1-2] + vbroadcasti128 m%1, [cq+16*%3] + vbroadcasti128 m%2, [cq+16*%4] + shufpd m%1, m%2, 0x0c +%endmacro + +cglobal vvc_inv_dct2_dct2_32x8_8, 4, 4, 0, dst, stride, c, eob + lea r6, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(vvc_pw_64x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(vvc_pw_8192)] + mov [cq], eobd + or r3d, 8 +.dconly: + pmulhrsw xm0, xm2 + movd xm2, [vvc_pw_2048] ; intentionally rip-relative + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm2 + vpbroadcastw m0, xm0 + pxor m3, m3 +.dconly_loop: + mova m1, [dstq] + punpckhbw m2, m1, m3 + punpcklbw m1, m3 + paddw m2, m0 + paddw m1, m0 + packuswb m1, m2 + mova [dstq], m1 + add dstq, strideq + dec r3d + jg .dconly_loop + RET +.normal: + PROLOGUE 0, 4, 16, 32*3, 
dst, stride, c, eob + %undef cmp + LOAD_PACKED_16X2 0, 7, 0, 2 ; in0 in2 + LOAD_PACKED_16X2 4, 7, 1, 3 ; in1 in3 + LOAD_PACKED_16X2 1, 7, 4, 6 ; in4 in6 + LOAD_PACKED_16X2 5, 7, 5, 7 ; in5 in7 + pxor m8, m8 + REPX {mova [cq+32*x], m8}, 0, 1, 2, 3 + add cq, 16*16 + LOAD_PACKED_16X2 2, 7, -8, -6 ; in8 in10 + LOAD_PACKED_16X2 6, 7, -7, -5 ; in9 in11 + LOAD_PACKED_16X2 3, 7, -4, -2 ; in12 in14 + LOAD_PACKED_16X2 11, 7, -3, -1 ; in13 in15 + REPX {mova [cq+32*x], m8}, -4, -3, -2, -1 + mova [rsp+32*0], m4 + mova [rsp+32*1], m5 + mova [rsp+32*2], m6 + cmp eobd, 106 + jg .full + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(vvc_inv_dct2_dct2_8x32_8).main_fast + jmp .pass2 +.full: + LOAD_PACKED_16X2 4, 7, 0, 2 ; in16 in18 + LOAD_PACKED_16X2 12, 7, 3, 1 ; in19 in17 + LOAD_PACKED_16X2 5, 7, 4, 6 ; in20 in22 + LOAD_PACKED_16X2 13, 7, 7, 5 ; in23 in21 + REPX {mova [cq+32*x], m8}, 0, 1, 2, 3 + add cq, 16*8 + LOAD_PACKED_16X2 6, 7, 0, 2 ; in24 in26 + LOAD_PACKED_16X2 14, 7, 3, 1 ; in27 in25 + LOAD_PACKED_16X2 7, 8, 4, 6 ; in28 in30 + LOAD_PACKED_16X2 15, 8, 7, 5 ; in31 in29 + pxor m8, m8 + REPX {mova [cq+32*x], m8}, 0, 1, 2, 3 + call m(vvc_inv_dct2_dct2_8x32_8).main +.pass2: + vpbroadcastd m12, [o(vvc_pw_8192)] + REPX {pmulhrsw x, m12}, m8, m9, m10, m11, m13, m14, m15 + mova [rsp+32*1], m9 + mova [rsp+32*2], m10 + punpckhwd m9, m0, m2 + punpcklwd m0, m2 + punpckhwd m2, m1, m3 + punpcklwd m1, m3 + punpcklwd m10, m4, m6 + punpckhwd m4, m6 + punpcklwd m6, m5, m7 + punpckhwd m5, m7 + punpckhwd m3, m0, m9 + punpcklwd m0, m9 + punpckhwd m9, m2, m1 + punpcklwd m2, m1 + punpcklwd m7, m10, m4 + punpckhwd m10, m4 + punpcklwd m4, m5, m6 + punpckhwd m5, m6 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + punpckldq m2, m3, m9 + punpckhdq m3, m9 + punpckldq m6, m7, m4 + punpckhdq m7, m4 + punpckldq m9, m10, m5 + punpckhdq m10, m5 + REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m6, m7, m9, m10 + pmulhrsw m12, [rsp+32*0] + mova [rsp+32*0], m8 + vperm2i128 m4, m0, m6, 0x31 + vinserti128 m0, xm6, 1 + 
vperm2i128 m5, m1, m7, 0x31 + vinserti128 m1, xm7, 1 + vperm2i128 m6, m2, m9, 0x31 + vinserti128 m2, xm9, 1 + vperm2i128 m7, m3, m10, 0x31 + vinserti128 m3, xm10, 1 + call m(idct2_16x8_internal_8).main + vpbroadcastd m8, [o(vvc_pw_2048)] + REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 + lea r2, [strideq*3] + WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r2 + lea r3, [dstq+strideq*4] + %define dstq r3 + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 6, 7, 0, 1, strideq*2, r2 + mova m0, [rsp+32*0] + mova m1, [rsp+32*1] + mova m2, [rsp+32*2] + punpckhwd m7, m0, m2 + punpcklwd m0, m2 + punpckhwd m2, m1, m11 + punpcklwd m1, m11 + punpckhwd m4, m12, m14 + punpcklwd m12, m14 + punpckhwd m5, m13, m15 + punpcklwd m13, m15 + punpckhwd m3, m0, m7 + punpcklwd m0, m7 + punpckhwd m9, m2, m1 + punpcklwd m2, m1 + punpcklwd m7, m12, m4 + punpckhwd m12, m4 + punpcklwd m4, m5, m13 + punpckhwd m5, m13 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + punpckldq m2, m3, m9 + punpckhdq m3, m9 + punpckldq m6, m7, m4 + punpckhdq m7, m4 + punpckldq m9, m12, m5 + punpckhdq m12, m5 + vperm2i128 m4, m0, m6, 0x31 + vinserti128 m0, xm6, 1 + vperm2i128 m5, m1, m7, 0x31 + vinserti128 m1, xm7, 1 + vperm2i128 m6, m2, m9, 0x31 + vinserti128 m2, xm9, 1 + vperm2i128 m7, m3, m12, 0x31 + vinserti128 m3, xm12, 1 + call m(idct2_16x8_internal_8).main2 + vpbroadcastd m8, [o(vvc_pw_2048)] + REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 + add r0, 16 + add r3, 16 + %define dstq r0 + WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r2 + %define dstq r3 + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 6, 7, 0, 1, strideq*2, r2 + RET + +; 8x32 identity/identity entry point. Each .loop pass loads coefficient rows from cq, clears them, adds vvc_pw_5, transposes via .transpose8x8, shifts right by 3 and writes through WRITE_8X4. The eob bias below selects one or two passes. +cglobal vvc_inv_identity_identity_8x32_8, 4, 5, 11, dst, stride, c, eob + vpbroadcastd m9, [vvc_pw_5] + lea r4, [strideq*3] + sub eobd, 107 ; loop_iterations = 1 + (eobd >= 107) +.loop: + mova xm0,[cq+16* 0] + mova xm1, [cq+16* 4] + vinserti128 m0, [cq+16* 1], 1 + vinserti128 m1, [cq+16* 
5], 1 + pxor m8, m8 + mova [cq+32*0], m8 + mova [cq+32*2], m8 + add cq, 16*16 + mova xm2, [cq-16* 8] + mova xm3, [cq-16* 4] + vinserti128 m2, [cq-16* 7], 1 + vinserti128 m3, [cq-16* 3], 1 + mova xm4, [cq+16* 0] + mova xm5, [cq+16* 4] + vinserti128 m4, [cq+16* 1], 1 + vinserti128 m5, [cq+16* 5], 1 + mova xm6, [cq+16* 8] + mova xm7, [cq+16*12] + vinserti128 m6, [cq+16* 9], 1 + vinserti128 m7, [cq+16*13], 1 + REPX {mova [cq+32*x], m8}, -4, -2, 0, 2, 4, 6 + REPX {paddsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 + call .transpose8x8 + REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7 + WRITE_8X4 0, 4, 8, 10, strideq*8, strideq*4, r4*4 + add dstq, strideq + WRITE_8X4 1, 5, 0, 4, strideq*8, strideq*4, r4*4 + add dstq, strideq + WRITE_8X4 2, 6, 0, 4, strideq*8, strideq*4, r4*4 + add dstq, strideq + WRITE_8X4 3, 7, 0, 4, strideq*8, strideq*4, r4*4 + add dstq, strideq + sub cq, 16*16-32 + lea dstq, [dstq+r4*4] + add eobd, 0x80000000 + jnc .loop + RET +; Local helper, also called by other functions in this file as m(vvc_inv_identity_identity_8x32_8).transpose8x8. It is a punpck-based word transpose of m0-m7 (per the label) and overwrites m8 as scratch. +ALIGN function_align +.transpose8x8: + punpckhwd m8, m4, m5 + punpcklwd m4, m5 + punpckhwd m5, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m6, m7 + punpcklwd m6, m7 + punpckhwd m7, m2, m3 + punpcklwd m2, m3 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckldq m2, m4, m6 + punpckhdq m4, m6 + punpckhdq m6, m5, m7 + punpckldq m5, m7 + punpckldq m7, m8, m1 + punpckhdq m8, m1 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + punpcklqdq m4, m5, m7 + punpckhqdq m5, m7 + punpckhqdq m7, m6, m8 + punpcklqdq m6, m8 + ret + +; 32x8 identity/identity entry point: loads eight row pairs around cq, clears them, transposes with the shared .transpose8x8 helper, scales with pmulhrsw by vvc_pw_64 and writes through WRITE_16X2. The eob bias selects one or two .loop passes in the same way as the 8x32 variant. +cglobal vvc_inv_identity_identity_32x8_8, 4, 6, 10, dst, stride, c, eob + add cq, 16*8 + vpbroadcastd m9, [vvc_pw_64] + lea r4, [strideq*3] + lea r5, [dstq+strideq*4] + sub eobd, 107 +.loop: + mova xm0, [cq-16*8] + mova xm1, [cq-16*7] + vinserti128 m0, [cq+16*0], 1 + vinserti128 m1, [cq+16*1], 1 + mova xm2, [cq-16*6] + mova xm3, [cq-16*5] + vinserti128 m2, [cq+16*2], 1 + vinserti128 m3, [cq+16*3], 1 + mova xm4, [cq-16*4] + mova xm5, [cq-16*3] + vinserti128 m4, [cq+16*4], 1 + 
vinserti128 m5, [cq+16*5], 1 + mova xm6, [cq-16*2] + mova xm7, [cq-16*1] + vinserti128 m6, [cq+16*6], 1 + vinserti128 m7, [cq+16*7], 1 + pxor m8, m8 + REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3 + call m(vvc_inv_identity_identity_8x32_8).transpose8x8 + REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 + WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r4 + %define dstq r5 + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 6, 7, 0, 1, strideq*2, r4 + add cq, 16*16 + add r0, 16 + add r5, 16 + add eobd, 0x80000000 + jnc .loop + RET + +%define o_base vvc_pw_5 + 128 + +; Load 16 rows from [%1 + %2*n] into m0-m15. When is_rect2 (%3) is non-zero every row is multiplied (pmulhrsw) by vvc_pw_64x8 while loading. m15 is then stored to [rsp], and when zero_coefs (%4) is non-zero the 16 source rows are cleared. Defaults are is_rect2 = 0 and zero_coefs = 1. +%macro LOAD_16ROWS 2-4 0, 1 ; src, stride, is_rect2, zero_coefs +%if %3 + vpbroadcastd m15, [o(vvc_pw_64x8)] + pmulhrsw m0, m15, [%1+%2* 0] + pmulhrsw m1, m15, [%1+%2* 1] + pmulhrsw m2, m15, [%1+%2* 2] + pmulhrsw m3, m15, [%1+%2* 3] + pmulhrsw m4, m15, [%1+%2* 4] + pmulhrsw m5, m15, [%1+%2* 5] + pmulhrsw m6, m15, [%1+%2* 6] + pmulhrsw m7, m15, [%1+%2* 7] + pmulhrsw m8, m15, [%1+%2* 8] + pmulhrsw m9, m15, [%1+%2* 9] + pmulhrsw m10, m15, [%1+%2*10] + pmulhrsw m11, m15, [%1+%2*11] + pmulhrsw m12, m15, [%1+%2*12] + pmulhrsw m13, m15, [%1+%2*13] + pmulhrsw m14, m15, [%1+%2*14] + pmulhrsw m15, [%1+%2*15] +%else + mova m0, [%1+%2* 0] + mova m1, [%1+%2* 1] + mova m2, [%1+%2* 2] + mova m3, [%1+%2* 3] + mova m4, [%1+%2* 4] + mova m5, [%1+%2* 5] + mova m6, [%1+%2* 6] + mova m7, [%1+%2* 7] + mova m8, [%1+%2* 8] + mova m9, [%1+%2* 9] + mova m10, [%1+%2*10] + mova m11, [%1+%2*11] + mova m12, [%1+%2*12] + mova m13, [%1+%2*13] + mova m14, [%1+%2*14] + mova m15, [%1+%2*15] +%endif + mova [rsp], m15 +%if %4 + pxor m15, m15 + REPX {mova [%1+%2*x], m15}, 0, 1, 2, 3, 4, 5, 6, 7, \ + 8, 9, 10, 11, 12, 13, 14, 15 +%endif +%endmacro + +; Forms the sum and difference of m%1 and the row at [%2], rounds both with pmulhrsw by m%5, adds the sum to the 16 pixels at [dstq+%6] and the difference to the 16 pixels at [r2+%7], then stores both with unsigned saturation. m%3 and m%4 are scratch. +%macro IDCT2_32_PASS2_END 7 ; coefs[1-2], tmp[1-2], rnd, offset[1-2] + mova m%4, [%2] + paddsw m%3, m%1, m%4 + psubsw m%1, m%4 + pmovzxbw m%4, [dstq+%6] + pmulhrsw m%3, m%5 + pmulhrsw m%1, m%5 + paddw m%3, m%4 + pmovzxbw m%4, [r2+%7] + paddw 
m%1, m%4 + packuswb m%3, m%1 + vpermq m%3, m%3, q3120 + mova [dstq+%6], xm%3 + vextracti128 [r2+%7], m%3, 1 +%endmacro + +; 16x32 dct2/dct2 entry point. eob == 0 jumps to .dconly, which hands off to the 16x4 .dconly code with r3d = 32. For eob <= 150 only the first 16 rows (loaded from cq with a 64-byte stride) are processed and .main_oddhalf_fast is used with m8-m15 zeroed afterwards; otherwise .full also processes the rows at cq+32 and calls .main_oddhalf. +cglobal vvc_inv_dct2_dct2_16x32_8, 4, 4, 0, dst, stride, c, eob + lea r6, [o_base] + test eobd, eobd + jz .dconly + PROLOGUE 0, 8, 16, 32*35, dst, stride, c, eob, tmp1, tmp2, \ + base, tmp3 + %undef cmp + LOAD_16ROWS cq, 64, 1 + call m(idct2_16x16_internal_8).main + lea tmp1q, [rsp+32*7] + lea tmp2q, [tmp1q+32*8] + lea tmp3q, [tmp1q+32*16] + mova m1, [rsp+32*1] + mova [rsp+32*0], m6 + mova [rsp+32*1], m7 + vpbroadcastd m7, [o(vvc_pw_16384)] + call .transpose_2x8x8_round + mova m15, [rsp+32*0] + mova [tmp3q-32*4+ 0], xm0 + vextracti128 [tmp3q+32*0+ 0], m0, 1 + mova [tmp3q-32*3+ 0], xm2 + vextracti128 [tmp3q+32*1+ 0], m2, 1 + mova [tmp3q-32*2+ 0], xm4 + vextracti128 [tmp3q+32*2+ 0], m4, 1 + mova [tmp3q-32*1+ 0], xm6 + vextracti128 [tmp3q+32*3+ 0], m6, 1 + mova [tmp3q-32*4+16], xm8 + vextracti128 [tmp3q+32*0+16], m8, 1 + mova [tmp3q-32*3+16], xm10 + vextracti128 [tmp3q+32*1+16], m10, 1 + mova [tmp3q-32*2+16], xm12 + vextracti128 [tmp3q+32*2+16], m12, 1 + mova [tmp3q-32*1+16], xm14 + vextracti128 [tmp3q+32*3+16], m14, 1 + cmp eobd, 150 + jg .full + vinserti128 m0, m1, xm9, 1 + vperm2i128 m4, m1, m9, 0x31 + vinserti128 m2, m5, xm13, 1 + vperm2i128 m6, m5, m13, 0x31 + vinserti128 m1, m3, xm11, 1 + vperm2i128 m5, m3, m11, 0x31 + vinserti128 m3, m7, xm15, 1 + vperm2i128 m7, m7, m15, 0x31 + call .main_oddhalf_fast + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 + jmp .idct2_16 +.dconly: + movd xm1, [o(vvc_pw_64x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(vvc_pw_16384)] + mov [cq], eobd + pmulhrsw xm0, xm1 + or r3d, 32 + jmp m(vvc_inv_dct2_dct2_16x4_8).dconly +.full: + mova [tmp1q-32*4], m1 + mova [tmp1q-32*3], m3 + mova [tmp1q-32*2], m5 + mova [tmp1q-32*1], m7 + mova [tmp1q+32*0], m9 + mova [tmp1q+32*1], m11 + mova [tmp1q+32*2], m13 + mova [tmp1q+32*3], m15 + LOAD_16ROWS cq+32, 64, 1 + call m(idct2_16x16_internal_8).main + lea r2, 
[tmp3q+32*8] + mova m1, [rsp+32*1] + mova [rsp+32*0], m6 + mova [rsp+32*1], m7 + vpbroadcastd m7, [o(vvc_pw_16384)] + call .transpose_2x8x8_round + mova m15, [rsp+32*0] + mova [r2-32*4+ 0], xm0 + vextracti128 [r2+32*0+ 0], m0, 1 + mova [r2-32*3+ 0], xm2 + vextracti128 [r2+32*1+ 0], m2, 1 + mova [r2-32*2+ 0], xm4 + vextracti128 [r2+32*2+ 0], m4, 1 + mova [r2-32*1+ 0], xm6 + vextracti128 [r2+32*3+ 0], m6, 1 + mova [r2-32*4+16], xm8 + vextracti128 [r2+32*0+16], m8, 1 + mova [r2-32*3+16], xm10 + vextracti128 [r2+32*1+16], m10, 1 + mova [r2-32*2+16], xm12 + vextracti128 [r2+32*2+16], m12, 1 + mova [r2-32*1+16], xm14 + vextracti128 [r2+32*3+16], m14, 1 + vinserti128 m8, m1, xm9, 1 + vperm2i128 m12, m1, m9, 0x31 + mova xm0, [tmp1q-32*4] + mova xm1, [tmp1q-32*3] + vinserti128 m0, [tmp1q+32*0], 1 + vinserti128 m1, [tmp1q+32*1], 1 + vinserti128 m10, m5, xm13, 1 + vperm2i128 m14, m5, m13, 0x31 + mova xm4, [tmp1q-32*4+16] + mova xm5, [tmp1q-32*3+16] + vinserti128 m4, [tmp1q+32*0+16], 1 + vinserti128 m5, [tmp1q+32*1+16], 1 + vinserti128 m9, m3, xm11, 1 + vperm2i128 m13, m3, m11, 0x31 + mova xm2, [tmp1q-32*2] + mova xm3, [tmp1q-32*1] + vinserti128 m2, [tmp1q+32*2], 1 + vinserti128 m3, [tmp1q+32*3], 1 + vinserti128 m11, m7, xm15, 1 + vperm2i128 m15, m7, m15, 0x31 + mova xm6, [tmp1q-32*2+16] + mova xm7, [tmp1q-32*1+16] + vinserti128 m6, [tmp1q+32*2+16], 1 + vinserti128 m7, [tmp1q+32*3+16], 1 + call .main_oddhalf + LOAD_8ROWS_H r2-32*4, 32 +.idct2_16: + LOAD_8ROWS tmp3q-32*4, 32 + mova [rsp], m15 + call m(idct2_16x16_internal_8).main + imul r2, strideq, 19 + lea r3, [strideq*3] + add r2, dstq + call .pass2_end + RET +; .main_oddhalf_fast and .main_oddhalf are exported labels: the 32x16, 32x32 and 16x64 dct2 functions in this file call one or both of them. Both use tmp1q/tmp2q as output pointers and three 32-byte stack slots at rsp+gprsize. +ALIGN function_align +cglobal_label .main_oddhalf_fast ; lower half is zero + mova [rsp+gprsize+32*1], m7 + pxor m7, m7 + mova [rsp+gprsize+32*0], m7 + mova [rsp+gprsize+32*2], m7 + vpbroadcastd m11, [o(vvc_pw_82x8)] + vpbroadcastd m7, [o(vvc_pw_38x8)] + vpbroadcastd m12, [o(vvc_pw_n31x8)] + vpbroadcastd m8, [o(vvc_pw_85x8)] + vpbroadcastd m13, [o(vvc_pw_88x8)] + 
vpbroadcastd m15, [o(vvc_pw_22x8)] + pmulhrsw m11, m4 ; t29a + pmulhrsw m4, m7 ; t18a + pmulhrsw m12, m3 ; t19a + pmulhrsw m3, m8 ; t28a + pmulhrsw m13, m2 ; t27a + pmulhrsw m2, m15 ; t20a + vpbroadcastd m10, [o(vvc_pw_m46x8)] + vpbroadcastd m7, [o(vvc_pw_78x8)] + vpbroadcastd m9, [o(vvc_pw_73x8)] + vpbroadcastd m8, [o(vvc_pw_54x8)] + vpbroadcastd m14, [o(vvc_pw_n13x8)] + vpbroadcastd m15, [o(vvc_pw_90x8)] + pmulhrsw m10, m5 ; t21a + pmulhrsw m5, m7 ; t26a + pmulhrsw m9, m6 ; t25a + pmulhrsw m6, m8 ; t22a + pmulhrsw m14, m1 ; t23a + pmulhrsw m1, m15 ; t24a + vpbroadcastd m15, [o(vvc_pd_64)] + jmp .main2 +ALIGN function_align +cglobal_label .main_oddhalf + mova [rsp+gprsize+32*0], m15 + mova [rsp+gprsize+32*1], m7 + mova [rsp+gprsize+32*2], m8 + vpbroadcastd m15, [o(vvc_pd_64)] + ITX_MULSUB_2W 4, 11, 7, 8, 15, 38, 82, 0 ; t18a, t29a + ITX_MULSUB_2W 12, 3, 7, 8, 15, 85, 31, 0 ; t19a, t28a + ITX_MULSUB_2W 2, 13, 7, 8, 15, 22, 88, 0 ; t20a, t27a + ITX_MULSUB_2W 10, 5, 7, 8, 15, 78, 46, 0 ; t21a, t26a + ITX_MULSUB_2W 6, 9, 7, 8, 15, 54, 73, 0 ; t22a, t25a + ITX_MULSUB_2W 14, 1, 7, 8, 15, 90, 13, 0 ; t23a, t24a +; Common tail: .main_oddhalf falls through to here and .main_oddhalf_fast jumps here, both with m15 holding vvc_pd_64. +.main2: + psubsw m7, m12, m4 ; t18 + paddsw m12, m4 ; t19 + psubsw m4, m2, m10 ; t21 + paddsw m2, m10 ; t20 + psubsw m10, m14, m6 ; t22 + paddsw m14, m6 ; t23 + psubsw m6, m1, m9 ; t25 + paddsw m1, m9 ; t24 + psubsw m9, m13, m5 ; t26 + paddsw m13, m5 ; t27 + psubsw m5, m3, m11 ; t29 + paddsw m3, m11 ; t28 + ITX_MULSUB_2W 5, 7, 8, 11, 15, m89, 18, 0 ; t18a, t29a + ITX_MULSUB_2W 9, 4, 8, 11, 15, 75, 50, 0 ; t21a, t26a + ITX_MULSUB_2W 6, 10, 8, 11, 15, m50, 75, 0 ; t22a, t25a + psubsw m8, m14, m2 ; t20a + paddsw m14, m2 ; t23a + psubsw m2, m1, m13 ; t27a + paddsw m1, m13 ; t24a + psubsw m13, m6, m9 ; t21 + paddsw m6, m9 ; t22 + psubsw m9, m10, m4 ; t26 + paddsw m10, m4 ; t25 + ITX_MULSUB_2W 2, 8, 4, 11, 15, m83, 36, 0 ; t20, t27 + ITX_MULSUB_2W 9, 13, 4, 11, 15, m83, 36, 0 ; t21a, t26a + mova m4, [rsp+gprsize+32*0] ; in31 + mova [rsp+gprsize+32*0], m6 ; t22 + mova 
m6, [rsp+gprsize+32*1] ; in15 + mova [rsp+gprsize+32*1], m14 ; t23a + mova m14, [rsp+gprsize+32*2] ; in17 + mova [rsp+gprsize+32*2], m1 ; t24a + ITX_MULSUB_2W 0, 4, 1, 11, 15, 4, 90, 0 ; t16a, t31a + ITX_MULSUB_2W 14, 6, 1, 11, 15, 67, 61, 0 ; t17a, t30a + psubsw m1, m0, m14 ; t17 + paddsw m0, m14 ; t16 + psubsw m14, m4, m6 ; t30 + paddsw m4, m6 ; t31 + ITX_MULSUB_2W 14, 1, 6, 11, 15, 18, 89, 0 ; t17a, t30a + psubsw m6, m0, m12 ; t19a + paddsw m0, m12 ; t16a + psubsw m12, m4, m3 ; t28a + paddsw m4, m3 ; t31a + psubsw m3, m14, m5 ; t18 + paddsw m14, m5 ; t17 + psubsw m5, m1, m7 ; t29 + paddsw m1, m7 ; t30 + ITX_MULSUB_2W 5, 3, 7, 11, 15, 36, 83, 0 ; t18a, t29a + ITX_MULSUB_2W 12, 6, 7, 11, 15, 36, 83, 0 ; t19, t28 + psubsw m7, m1, m10 ; t25a + paddsw m1, m10 ; t30a + psubsw m10, m5, m9 ; t21 + paddsw m5, m9 ; t18 + psubsw m9, m12, m2 ; t20a + paddsw m12, m2 ; t19a + psubsw m2, m3, m13 ; t26 + paddsw m3, m13 ; t29 + psubsw m13, m6, m8 ; t27a + paddsw m6, m8 ; t28a + mova [tmp1q-32*2], m5 + mova [tmp1q-32*1], m12 + mova [tmp2q+32*0], m6 + mova [tmp2q+32*1], m3 + mova [tmp2q+32*2], m1 + mova m5, [rsp+gprsize+32*0] ; t22 + mova m6, [rsp+gprsize+32*1] ; t23 + mova m3, [rsp+gprsize+32*2] ; t24a + psubsw m1, m14, m5 ; t22a + paddsw m14, m5 ; t17a + psubsw m5, m0, m6 ; t23 + paddsw m0, m6 ; t16 + psubsw m6, m4, m3 ; t24 + paddsw m4, m3 ; t31 + vpbroadcastd m8, [o(vvc_pw_m64_64)] + vpbroadcastd m3, [o(vvc_pw_64_64)] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m14 + mova [tmp2q+32*3], m4 + ITX_MULSUB_2W 13, 9, 0, 4, 15, 3, 8, 1 ; t20, t27 + ITX_MULSUB_2W 2, 10, 0, 4, 15, 3, 8, 1 ; t21a, t26a + ITX_MULSUB_2W 7, 1, 0, 4, 15, 3, 8, 1 ; t22, t25 + ITX_MULSUB_2W 6, 5, 0, 4, 15, 3, 8, 1 ; t23a, t24a + mova [tmp1q+32*0], m13 + mova [tmp1q+32*1], m2 + mova [tmp1q+32*2], m7 + mova [tmp1q+32*3], m6 + mova [tmp2q-32*4], m5 + mova [tmp2q-32*3], m1 + mova [tmp2q-32*2], m10 + mova [tmp2q-32*1], m9 + ret +; Local helper (also called from the 32x16, 32x32 and 16x64 dct2 functions): interleaves m0-m15 in two groups of eight with punpck* while multiplying by m7 (pmulhrsw). The inputs for m6 and m7 are read from [rsp+gprsize+32*0] and [rsp+gprsize+32*1], and the final m15 of the second group is left in [rsp+gprsize+32*0]. +ALIGN function_align +.transpose_2x8x8_round: + punpckhwd m6, m12, m13 + punpcklwd 
m12, m13 + punpckhwd m13, m8, m9 + punpcklwd m8, m9 + punpckhwd m9, m14, m15 + punpcklwd m14, m15 + punpckhwd m15, m10, m11 + punpcklwd m10, m11 + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5 + punpckhdq m11, m8, m10 + punpckldq m8, m10 + punpckldq m10, m12, m14 + punpckhdq m12, m14 + punpckhdq m14, m13, m15 + punpckldq m13, m15 + punpckldq m15, m6, m9 + punpckhdq m6, m9 + punpckhqdq m9, m8, m10 + punpcklqdq m8, m10 + punpcklqdq m10, m11, m12 + punpckhqdq m11, m12 + punpcklqdq m12, m13, m15 + punpckhqdq m13, m15 + punpckhqdq m15, m14, m6 + punpcklqdq m14, m6 + pmulhrsw m6, m7, [rsp+gprsize+32*0] + REPX {pmulhrsw x, m7}, m8, m9, m10, m11, m12, m13, m14, m15 + pmulhrsw m7, [rsp+gprsize+32*1] + mova [rsp+gprsize+32*0], m15 + punpckhwd m15, m4, m5 + punpcklwd m4, m5 + punpckhwd m5, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m6, m7 + punpcklwd m6, m7 + punpckhwd m7, m2, m3 + punpcklwd m2, m3 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckldq m2, m4, m6 + punpckhdq m4, m6 + punpckhdq m6, m5, m7 + punpckldq m5, m7 + punpckldq m7, m15, m1 + punpckhdq m15, m1 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + punpcklqdq m4, m5, m7 + punpckhqdq m5, m7 + punpckhqdq m7, m6, m15 + punpcklqdq m6, m15 + ret +; Output stage (also called from the 32x32 dct2 function): sixteen IDCT2_32_PASS2_END invocations rounded with vvc_pw_2048, in four groups. Between groups dstq advances by one stride and r2 moves back by one stride. m7 and m15 are spilled to the stack because the macro uses them as scratch and rounding registers. +ALIGN function_align +.pass2_end: + mova [rsp+gprsize+32*0], m7 + mova [rsp+gprsize+32*2], m15 + vpbroadcastd m15, [o(vvc_pw_2048)] + IDCT2_32_PASS2_END 0, tmp2q+32*3, 1, 7, 15, strideq*0, r3*4 + IDCT2_32_PASS2_END 4, tmp2q-32*1, 0, 7, 15, strideq*4, strideq*8 + IDCT2_32_PASS2_END 8, tmp1q+32*3, 0, 4, 15, strideq*8, strideq*4 + IDCT2_32_PASS2_END 12, tmp1q-32*1, 0, 4, 15, r3*4, strideq*0 + add dstq, strideq + sub r2, strideq + mova m1, [rsp+gprsize+32*1] + IDCT2_32_PASS2_END 1, tmp2q+32*2, 0, 4, 15, strideq*0, r3*4 + IDCT2_32_PASS2_END 5, tmp2q-32*2, 0, 4, 15, strideq*4, strideq*8 + IDCT2_32_PASS2_END 9, tmp1q+32*2, 0, 4, 15, strideq*8, strideq*4 + IDCT2_32_PASS2_END 13, tmp1q-32*2, 0, 4, 15, r3*4, strideq*0 + add dstq, strideq + sub 
r2, strideq + IDCT2_32_PASS2_END 2, tmp2q+32*1, 0, 4, 15, strideq*0, r3*4 + IDCT2_32_PASS2_END 6, tmp2q-32*3, 0, 4, 15, strideq*4, strideq*8 + IDCT2_32_PASS2_END 10, tmp1q+32*1, 0, 4, 15, strideq*8, strideq*4 + IDCT2_32_PASS2_END 14, tmp1q-32*3, 0, 4, 15, r3*4, strideq*0 + add dstq, strideq + sub r2, strideq + mova m7, [rsp+gprsize+32*0] + mova m1, [rsp+gprsize+32*2] + IDCT2_32_PASS2_END 3, tmp2q+32*0, 0, 4, 15, strideq*0, r3*4 + IDCT2_32_PASS2_END 7, tmp2q-32*4, 0, 4, 15, strideq*4, strideq*8 + IDCT2_32_PASS2_END 11, tmp1q+32*0, 0, 4, 15, strideq*8, strideq*4 + IDCT2_32_PASS2_END 1, tmp1q-32*4, 0, 4, 15, r3*4, strideq*0 + ret + +; Perform the final sumsub step and YMM lane shuffling +%macro IDCT2_32_PASS1_END 4 ; row[1-2], tmp[1-2] + mova m%3, [tmp2q+32*( 3-%1)] + psubsw m%4, m%1, m%3 + paddsw m%1, m%3 + mova m%3, [tmp1q+32*(11-%2)] + mova [tmp1q+32*(11-%2)+16], xm%4 + vextracti128 [tmp2q+32*( 3-%1)+16], m%4, 1 + paddsw m%4, m%2, m%3 + psubsw m%2, m%3 + mova [tmp1q+32*(11-%2)], xm%2 + vextracti128 [tmp2q+32*( 3-%1)], m%2, 1 + vperm2i128 m%2, m%1, m%4, 0x31 + vinserti128 m%1, xm%4, 1 +%endmacro + +; 32x16 dct2/dct2 entry point. eob == 0 reuses the 32x8 .dconly path with r3d = 16. Otherwise the odd 32-byte rows of cq (1, 3, .., 31) are scaled by vvc_pw_64x8 and passed to the 16x32 .main_oddhalf helper, the even rows are loaded with LOAD_16ROWS and passed to idct2_16x16 .main, the whole coefficient buffer is cleared in .zero_loop, and .pass2 runs once per 16-column half. +cglobal vvc_inv_dct2_dct2_32x16_8, 4, 4, 0, dst, stride, c, eob + lea r6, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(vvc_pw_64x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(vvc_pw_16384)] + mov [cq], eobd + pmulhrsw xm0, xm1 + or r3d, 16 + jmp m(vvc_inv_dct2_dct2_32x8_8).dconly +.normal: + PROLOGUE 0, 6, 16, 32*19, dst, stride, c, eob, tmp1, tmp2 + vpbroadcastd m15, [o(vvc_pw_64x8)] + pmulhrsw m0, m15, [cq+32* 1] + pmulhrsw m1, m15, [cq+32* 3] + pmulhrsw m2, m15, [cq+32* 5] + pmulhrsw m3, m15, [cq+32* 7] + pmulhrsw m4, m15, [cq+32* 9] + pmulhrsw m5, m15, [cq+32*11] + pmulhrsw m6, m15, [cq+32*13] + pmulhrsw m7, m15, [cq+32*15] + pmulhrsw m8, m15, [cq+32*17] + pmulhrsw m9, m15, [cq+32*19] + pmulhrsw m10, m15, [cq+32*21] + pmulhrsw m11, m15, [cq+32*23] + pmulhrsw m12, m15, [cq+32*25] + pmulhrsw m13, m15, [cq+32*27] + pmulhrsw m14, m15, [cq+32*29] + pmulhrsw m15, 
[cq+32*31] + lea tmp1q, [rsp+32*7] + lea tmp2q, [tmp1q+32*8] + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf + LOAD_16ROWS cq+32*0, 32*2, 1, 0 + pxor m15, m15 + mov r3d, 8 +.zero_loop: + mova [cq+32*0], m15 + mova [cq+32*1], m15 + mova [cq+32*2], m15 + mova [cq+32*3], m15 + add cq, 32*4 + dec r3d + jg .zero_loop + call m(idct2_16x16_internal_8).main + call .pass1_end + lea r2, [strideq*3] + mov r3, dstq +.pass2: + vpbroadcastd m7, [o(vvc_pw_16384)] + call m(vvc_inv_dct2_dct2_16x32_8).transpose_2x8x8_round + call m(idct2_16x16_internal_8).main + mova [rsp+32*2], m15 + vpbroadcastd m15, [o(vvc_pw_2048)] + REPX {pmulhrsw x, m15}, m2, m3, m0 + WRITE_16X2 2, 3, 1, 2, strideq*2, r2 + pmulhrsw m1, m15, [rsp+32*1] + WRITE_16X2 0, 1, 2, 3, strideq*0, strideq*1 + lea dstq, [dstq+strideq*4] + REPX {pmulhrsw x, m15}, m4, m5, m6, m7 + WRITE_16X2 4, 5, 2, 3, strideq*0, strideq*1 + WRITE_16X2 6, 7, 2, 3, strideq*2, r2 + lea dstq, [dstq+strideq*4] + REPX {pmulhrsw x, m15}, m8, m9, m10, m11 + WRITE_16X2 8, 9, 2, 3, strideq*0, strideq*1 + WRITE_16X2 10, 11, 2, 3, strideq*2, r2 + lea dstq, [dstq+strideq*4] + REPX {pmulhrsw x, m15}, m11, m12, m13, m14 + pmulhrsw m15, [rsp+32*2] + WRITE_16X2 12, 13, 2, 3, strideq*0, strideq*1 + WRITE_16X2 14, 15, 2, 3, strideq*2, r2 + test r3, r3 + jnz .right_half + RET +; Second trip through .pass2 for the right 16 columns. r3 holds the original dstq on the first trip and is cleared here, so the next test of r3 falls through to RET. +.right_half: + LOAD_8ROWS tmp1q-32*4, 32 + LOAD_8ROWS_H tmp2q-32*4, 32 + lea dstq, [r3+16] + xor r3d, r3d + mova [rsp+32*0], m6 + mova [rsp+32*1], m7 + jmp .pass2 +; Local helper, also called from the 32x32 dct2 function: eight IDCT2_32_PASS1_END invocations. m9 and the m1 value at [rsp+gprsize+32*1] are spilled and reloaded around them because m1 and m9 double as scratch registers. +ALIGN function_align +.pass1_end: + mova [rsp+gprsize+32*0], m9 + IDCT2_32_PASS1_END 0, 8, 1, 9 + IDCT2_32_PASS1_END 2, 10, 1, 9 + IDCT2_32_PASS1_END 3, 11, 1, 9 + IDCT2_32_PASS1_END 4, 12, 1, 9 + IDCT2_32_PASS1_END 5, 13, 1, 9 + IDCT2_32_PASS1_END 6, 14, 1, 9 + IDCT2_32_PASS1_END 7, 15, 1, 9 + mova m1, [rsp+gprsize+32*1] + mova m9, [rsp+gprsize+32*0] + mova [rsp+gprsize+32*0], m6 + mova [rsp+gprsize+32*1], m7 + IDCT2_32_PASS1_END 1, 9, 6, 7 + ret + +; 16x32 identity/identity entry point. The number of extra .loop passes (0 to 3) is derived from eob with the setg/adc sequence that follows. +cglobal vvc_inv_identity_identity_16x32_8, 4, 5, 13, dst, stride, c, eob 
+%undef cmp + lea r6, [o_base] + vpbroadcastd m9, [o(vvc_pw_64x8)] + vpbroadcastd m10, [o(vvc_pw_1697x16)] + vpbroadcastd m12, [o(vvc_pw_8192)] + cmp eobd, 43 ; if (eob > 43) + setg r4b ; iteration_count++ + cmp eobd, 150 ; if (eob > 150) + setg al ; iteration_count++ + add eobd, -279 ; if (eob > 278) + adc r4b, al ; iteration_count++ + lea r3, [strideq*3] + mov r6, cq + paddw m11, m12, m12 ; vvc_pw_16384 +; Each pass handles one 16-byte column group of cq (rows at a 64-byte stride): scale by vvc_pw_64x8, apply IDTX16, transpose, scale by vvc_pw_8192 and write eight output rows. r6 keeps the original cq for the clearing loops after the last pass. +.loop: + mova xm0, [cq+64* 0] + mova xm1, [cq+64* 1] + vinserti128 m0, [cq+64* 8], 1 + vinserti128 m1, [cq+64* 9], 1 + mova xm2, [cq+64* 2] + mova xm3, [cq+64* 3] + vinserti128 m2, [cq+64*10], 1 + vinserti128 m3, [cq+64*11], 1 + mova xm4, [cq+64* 4] + mova xm5, [cq+64* 5] + vinserti128 m4, [cq+64*12], 1 + vinserti128 m5, [cq+64*13], 1 + mova xm6, [cq+64* 6] + mova xm7, [cq+64* 7] + vinserti128 m6, [cq+64*14], 1 + vinserti128 m7, [cq+64*15], 1 + REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {IDTX16 x, 8, 10, 11}, 0, 1, 2, 3, 4, 5, 6, 7 + call m(vvc_inv_identity_identity_8x32_8).transpose8x8 + REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r3 + lea dstq, [dstq+strideq*4] + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 6, 7, 0, 1, strideq*2, r3 + lea dstq, [dstq+strideq*4] + add cq, 16 + dec r4b + jge .loop + sub cq, 32 + pxor m0, m0 + mov r0d, 8 + cmp cq, r6 + ja .zero_loop +; Reached when cq advanced by at most 32 bytes (one or two passes): zero 32 bytes at a 64-byte stride, 16 stores in total. +.zero_loop_half: + mova [r6+64*0], m0 + mova [r6+64*1], m0 + add r6, 64*4 + mova [r6-64*2], m0 + mova [r6-64*1], m0 + sub r0d, 2 + jg .zero_loop_half + RET +; Reached when more than two passes ran: zero 8 * 128 contiguous bytes starting at the original cq. +.zero_loop: + mova [r6+32*0], m0 + mova [r6+32*1], m0 + mova [r6+32*2], m0 + mova [r6+32*3], m0 + add r6, 32*4 + dec r0d + jg .zero_loop + RET + +; 32x16 identity/identity entry point. The setg/lea sequence below computes the extra pass count as (eob > 35) plus 2 * (eob > 150). +cglobal vvc_inv_identity_identity_32x16_8, 4, 6, 12, dst, stride, c, eob +%undef cmp + lea r6, [o_base] + vpbroadcastd m9, [o(vvc_pw_64x8)] + vpbroadcastd m10, [o(vvc_pw_1697x16)] + vpbroadcastd m11, [o(vvc_pw_2048)] + cmp eobd, 35 ; if (eob > 35) + setg r4b ; iteration_count++ + 
cmp eobd, 150 ; if (eob > 150) + setg r3b ; iteration_count += 2 + lea r4d, [r4+r3*2] + lea r3, [strideq*3] + mov r5, dstq + mov r6, cq +; Each pass handles one 16-byte column group of cq (rows at a 32-byte stride): scale by vvc_pw_64x8, double with paddsw, transpose, apply IDTX16, scale by vvc_pw_2048 and write eight output rows. r5 keeps the original dstq and r6 the original cq. +.loop: + mova xm0, [cq+32* 0] + mova xm1, [cq+32* 1] + vinserti128 m0, [cq+32* 8], 1 + vinserti128 m1, [cq+32* 9], 1 + mova xm2, [cq+32* 2] + mova xm3, [cq+32* 3] + vinserti128 m2, [cq+32*10], 1 + vinserti128 m3, [cq+32*11], 1 + mova xm4, [cq+32* 4] + mova xm5, [cq+32* 5] + vinserti128 m4, [cq+32*12], 1 + vinserti128 m5, [cq+32*13], 1 + mova xm6, [cq+32* 6] + mova xm7, [cq+32* 7] + vinserti128 m6, [cq+32*14], 1 + vinserti128 m7, [cq+32*15], 1 + REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {paddsw x, x }, m0, m1, m2, m3, m4, m5, m6, m7 + call m(vvc_inv_identity_identity_8x32_8).transpose8x8 + REPX {IDTX16 x, 8, 10}, 0, 1, 2, 3, 4, 5, 6, 7 + REPX {pmulhrsw x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r3 + lea dstq, [dstq+strideq*4] + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 6, 7, 0, 1, strideq*2, r3 + lea dstq, [dstq+strideq*4] + add cq, 16 + dec r4b + jl .ret + ; when bit 0 of the remaining count is set, skip ahead in cq by 32*15 and restart dstq at r5+16 + test r4b, 1 + jz .loop + add cq, 32*15 + lea dstq, [r5+16] + jmp .loop +; Clears the consumed coefficients starting at r6 in 128-byte steps; the step count comes from how far cq advanced. NOTE(review): this relies on eax aliasing r6d (the saved cq) under the x86inc register mapping - confirm. +.ret: + sub cd, eax + pxor m0, m0 + add cd, 384 +.zero_loop: + mova [r6+32*0], m0 + mova [r6+32*1], m0 + mova [r6+32*2], m0 + mova [r6+32*3], m0 + add r6, 32*4 + sub cd, 128 + jge .zero_loop + RET + +; 32x32 dct2/dct2 entry point. eob == 0 reuses the 32x8 .dconly path with r3d = 32. Otherwise tmp4d keeps eob - 136 for the whole function; a negative value selects the .fast and .fast2 paths, which skip the second half of the inputs. +cglobal vvc_inv_dct2_dct2_32x32_8, 4, 4, 0, dst, stride, c, eob + lea r6, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(vvc_pw_64x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(vvc_pw_8192)] + mov [cq], eobd + or r3d, 32 + jmp m(vvc_inv_dct2_dct2_32x8_8).dconly +.normal: + PROLOGUE 0, 9, 16, 32*67, dst, stride, c, eob, tmp1, tmp2, \ + base, tmp3, tmp4 + %undef cmp + lea tmp1q, [rsp+32*7] + lea tmp2q, [tmp1q+32*8] + sub eobd, 136 + mov tmp4d, eobd +.pass1_loop: + LOAD_8ROWS cq+64*1, 64*2 + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15 + test tmp4d, tmp4d + jl .fast + 
LOAD_8ROWS_H cq+64*17, 64*2 + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf + LOAD_8ROWS_H cq+64*16, 64*2 + pxor m0, m0 + REPX {mova [cq+64*x], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \ + 24, 25, 26, 27, 28, 29, 30, 31 + mova [rsp], m15 + jmp .idct2_16 +.fast: + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf_fast + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 +; Even rows 0, 2, .., 14 are loaded and cleared here, then passed to idct2_16x16 .main together with m8-m15 prepared above. +.idct2_16: + LOAD_8ROWS cq+64*0, 64*2 + pxor m15, m15 + REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14 + call m(idct2_16x16_internal_8).main + call m(vvc_inv_dct2_dct2_32x16_8).pass1_end + vpbroadcastd m7, [o(vvc_pw_8192)] + call m(vvc_inv_dct2_dct2_16x32_8).transpose_2x8x8_round + lea tmp3q, [tmp1q+32*32] + mova m15, [rsp] + mova [tmp3q-32*4], m0 + mova [tmp3q-32*3], m2 + mova [tmp3q-32*2], m4 + mova [tmp3q-32*1], m6 + mova [tmp3q+32*0], m8 + mova [tmp3q+32*1], m10 + mova [tmp3q+32*2], m12 + mova [tmp3q+32*3], m14 + add tmp3q, 32*8 + mova [tmp3q-32*4], m1 + mova [tmp3q-32*3], m3 + mova [tmp3q-32*2], m5 + mova [tmp3q-32*1], m7 + mova [tmp3q+32*0], m9 + mova [tmp3q+32*1], m11 + mova [tmp3q+32*2], m13 + mova [tmp3q+32*3], m15 + vpbroadcastd m9, [o(vvc_pw_8192)] + pmulhrsw m0, m9, [tmp1q-32*4] + pmulhrsw m1, m9, [tmp1q-32*3] + pmulhrsw m2, m9, [tmp1q-32*2] + pmulhrsw m3, m9, [tmp1q-32*1] + pmulhrsw m4, m9, [tmp1q+32*0] + pmulhrsw m5, m9, [tmp1q+32*1] + pmulhrsw m6, m9, [tmp1q+32*2] + pmulhrsw m7, m9, [tmp1q+32*3] + call m(vvc_inv_identity_identity_8x32_8).transpose8x8 + mova [tmp1q-32*4], m0 + pmulhrsw m0, m9, [tmp2q-32*4] + mova [tmp2q-32*4], m1 + pmulhrsw m1, m9, [tmp2q-32*3] + mova [tmp1q-32*3], m2 + pmulhrsw m2, m9, [tmp2q-32*2] + mova [tmp2q-32*3], m3 + pmulhrsw m3, m9, [tmp2q-32*1] + mova [tmp1q-32*2], m4 + pmulhrsw m4, m9, [tmp2q+32*0] + mova [tmp2q-32*2], m5 + pmulhrsw m5, m9, [tmp2q+32*1] + mova [tmp1q-32*1], m6 + pmulhrsw m6, m9, [tmp2q+32*2] + mova [tmp2q-32*1], m7 + pmulhrsw m7, m9, [tmp2q+32*3] + call m(vvc_inv_identity_identity_8x32_8).transpose8x8 + mova 
[tmp1q+32*0], m0 + mova [tmp2q+32*0], m1 + mova [tmp1q+32*1], m2 + mova [tmp2q+32*1], m3 + mova [tmp1q+32*2], m4 + mova [tmp2q+32*2], m5 + mova [tmp1q+32*3], m6 + mova [tmp2q+32*3], m7 + add cq, 32 + add tmp1q, 32*16 + add tmp2q, 32*16 + add eobd, 0x80000000 + jnc .pass1_loop + add tmp1q, 32*24 + imul r2, strideq, 19 + lea r3, [strideq*3] + add r2, dstq + test tmp4d, tmp4d + jge .pass2_loop + add tmp1q, 32*16 + add tmp2q, 32*16 + add tmp3q, 32*16 +; Second pass, once per 16-column half. After .pass2_end the loop exits when tmp2q is below tmp1q-32*32; otherwise tmp2q is moved back by 32*32, dstq is rewound by r3 and advanced 16 bytes, r2 is adjusted to match, and the loop repeats. +.pass2_loop: + LOAD_8ROWS tmp2q-32*4, 32 + test tmp4d, tmp4d + jl .fast2 + LOAD_8ROWS_H tmp3q-32*4, 32 + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf + sub tmp3q, 32*8 + LOAD_8ROWS_H tmp3q-32*4, 32 + sub tmp3q, 32*16 + jmp .pass2_loop_end +.fast2: + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf_fast + sub tmp3q, 32*24 + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 +.pass2_loop_end: + LOAD_8ROWS tmp3q-32*4, 32 + mova [rsp], m15 + call m(idct2_16x16_internal_8).main + call m(vvc_inv_dct2_dct2_16x32_8).pass2_end + lea tmp3q, [tmp1q-32*32] + cmp tmp2q, tmp3q + jb .ret + sub tmp2q, 32*32 + sub dstq, r3 + lea r2, [r2+r3+16] + add dstq, 16 + jmp .pass2_loop +.ret: + RET + +; 32x32 identity/identity entry point: each .loop pass loads eight row pairs (64-byte row stride), transposes them with the shared .transpose8x8 helper, scales with pmulhrsw by vvc_pw_8192 and writes eight output rows. The eob setup limits the work to the top-left 16x16 when eob < 136, per the inline comments. +cglobal vvc_inv_identity_identity_32x32_8, 4, 6, 10, dst, stride, c, eob + %undef cmp + vpbroadcastd m9, [vvc_pw_8192] + sub eobd, 136 ; if (eob < 136) + shr eobd, 30 ; topleft 16x16 only + lea eobd, [eobq*2-8] + lea r4, [strideq*3] + mov r5, dstq + lea r6, [cq+32] +.loop: + mova xm0, [cq+64* 0] + mova xm1, [cq+64* 1] + vinserti128 m0, [cq+64* 8], 1 + vinserti128 m1, [cq+64* 9], 1 + mova xm2, [cq+64* 2] + mova xm3, [cq+64* 3] + vinserti128 m2, [cq+64*10], 1 + vinserti128 m3, [cq+64*11], 1 + mova xm4, [cq+64* 4] + mova xm5, [cq+64* 5] + vinserti128 m4, [cq+64*12], 1 + vinserti128 m5, [cq+64*13], 1 + mova xm6, [cq+64* 6] + mova xm7, [cq+64* 7] + vinserti128 m6, [cq+64*14], 1 + vinserti128 m7, [cq+64*15], 1 + call m(vvc_inv_identity_identity_8x32_8).transpose8x8 + REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 + WRITE_16X2 0, 1, 8, 
0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r4 + lea dstq, [dstq+strideq*4] + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 6, 7, 0, 1, strideq*2, r4 + lea dstq, [dstq+strideq*4] + add cq, 16 + inc eobd + jz .ret + test eobd, 3 + jnz .loop + add cq, 64*15 + lea dstq, [r5+16] + jmp .loop +; Coefficient clearing. r6 was set to the original cq+32; the branch below picks .zero_loop when the final cq differs from r6 and otherwise falls into .zero_loop_topleft. +.ret: + pxor m0, m0 + mov r0d, 16 + cmp cq, r6 + jne .zero_loop +; Zeroes 32 bytes at a 64-byte stride starting at r6-32 (16 stores in total). +.zero_loop_topleft: + mova [r6-32*1], m0 + mova [r6+32*1], m0 + mova [r6+32*3], m0 + mova [r6+32*5], m0 + add r6, 64*4 + sub r0d, 4 + jg .zero_loop_topleft + RET +; Zeroes 16 * 128 contiguous bytes starting at r6-32. +.zero_loop: + mova [r6-32*1], m0 + mova [r6+32*0], m0 + mova [r6+32*1], m0 + mova [r6+32*2], m0 + add r6, 32*4 + dec r0d + jg .zero_loop + RET + +; Final sum/difference stage producing outputs n, 31-n, 32+n and 63-n from m%2, m%3 and two rows read from tmp1q/tmp2q (which row comes from which pointer depends on the parity of %1). With exactly 6 arguments (pass 1) the four results are stored back to tmp1q/tmp2q. With the four extra offset arguments (pass 2) they are scaled by m14 (pmulhrsw), added to pixels at dstq/r2 plus the given offsets and stored with unsigned saturation. +%macro IDCT2_64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4]) +%if %1 & 1 + mova m%5, [tmp2q-32*(51-%1)] ; idct2_16 out 0+n + mova m%4, [tmp1q-32*(14+%1)] ; idct2_32 out31-n +%else + mova m%5, [tmp1q-32*(45-%1)] + mova m%4, [tmp2q-32*(20+%1)] +%endif + psubsw m%6, m%5, m%4 ; idct2_32 out31-n + paddsw m%5, m%4 ; idct2_32 out 0+n + psubsw m%4, m%6, m%3 ; out32+n + paddsw m%6, m%3 ; out31-n + psubsw m%3, m%5, m%2 ; out63-n + paddsw m%5, m%2 ; out 0+n +%if %0 == 6 ; pass 1 +%if %1 & 1 + mova [tmp2q-32*(19-%1)], m%4 + mova [tmp1q-32*(14+%1)], m%6 + mova [tmp1q+32*(18-%1)], m%3 + mova [tmp2q-32*(51-%1)], m%5 +%else + mova [tmp1q-32*(13-%1)], m%4 + mova [tmp2q-32*(20+%1)], m%6 + mova [tmp2q+32*(12-%1)], m%3 + mova [tmp1q-32*(45-%1)], m%5 +%endif +%else ; pass 2 + REPX {pmulhrsw x, m14}, m%4, m%6, m%3, m%5 +%if %1 & 1 + %define %%d0 r2 + %define %%d1 dstq +%else + %define %%d0 dstq + %define %%d1 r2 +%endif + pmovzxbw m%2, [%%d0+%9 ] + paddw m%2, m%4 + pmovzxbw m%4, [%%d1+%8 ] + paddw m%4, m%6 + pmovzxbw m%6, [%%d1+%10] + paddw m%3, m%6 + pmovzxbw m%6, [%%d0+%7 ] + paddw m%5, m%6 + packuswb m%2, m%4 + packuswb m%3, m%5 + vpermq m%2, m%2, q3120 + vpermq m%3, m%3, q3120 + mova [%%d0+%9 ], xm%2 + vextracti128 [%%d1+%8 ], m%2, 1 + mova [%%d1+%10], xm%3 + vextracti128 [%%d0+%7 ], m%3, 1 
+%endif +%endmacro + +cglobal vvc_inv_dct2_dct2_16x64_8, 4, 4, 0, dst, stride, c, eob + lea r6, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(vvc_pw_64x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(vvc_pw_8192)] + mov [cq], eobd + or r3d, 64 + jmp m(vvc_inv_dct2_dct2_16x4_8).dconly +.normal: + PROLOGUE 0, 10, 16, 32*67, dst, stride, c, eob, tmp1, tmp2 + %undef cmp + lea tmp1q, [rsp+32*23] + lea tmp2q, [tmp1q+32*24] + sub eobd, 151 + mov r7d, eobd +.pass1_loop: + LOAD_16ROWS cq, 64 + call m(idct2_16x16_internal_8).main + mova m1, [rsp+32*1] + mova [rsp+32*0], m6 + mova [rsp+32*1], m7 + vpbroadcastd m7, [o(vvc_pw_8192)] + call m(vvc_inv_dct2_dct2_16x32_8).transpose_2x8x8_round + mova m15, [rsp+32*0] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m2 + mova [tmp1q-32*2], m4 + mova [tmp1q-32*1], m6 + mova [tmp1q+32*0], m8 + mova [tmp1q+32*1], m10 + mova [tmp1q+32*2], m12 + mova [tmp1q+32*3], m14 + mova [tmp2q-32*4], m1 + mova [tmp2q-32*3], m3 + mova [tmp2q-32*2], m5 + mova [tmp2q-32*1], m7 + mova [tmp2q+32*0], m9 + mova [tmp2q+32*1], m11 + mova [tmp2q+32*2], m13 + mova [tmp2q+32*3], m15 + add cq, 32 + add tmp1q, 32*8 + add tmp2q, 32*8 + add eobd, 0x80000000 + jnc .pass1_loop + lea r2, [rsp+32*23] + mova xm0, [r2-32*4+ 0] + mova xm1, [r2-32*2+ 0] + vinserti128 m0, [r2+32*0+ 0], 1 + vinserti128 m1, [r2+32*2+ 0], 1 + mova xm2, [r2-32*4+16] + mova xm3, [r2-32*2+16] + vinserti128 m2, [r2+32*0+16], 1 + vinserti128 m3, [r2+32*2+16], 1 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14 + test r7d, r7d + jl .fast + lea r3, [r2+32*8] + mova xm4, [r3-32*4+ 0] + mova xm5, [r3-32*2+ 0] + vinserti128 m4, [r3+32*0+ 0], 1 + vinserti128 m5, [r3+32*2+ 0], 1 + mova xm6, [r3-32*4+16] + mova xm7, [r3-32*2+16] + vinserti128 m6, [r3+32*0+16], 1 + vinserti128 m7, [r3+32*2+16], 1 +.fast: + mova [rsp], m8 + lea tmp1q, [rsp+32*7] + call m(idct2_16x16_internal_8).main + mova m1, [rsp+32*1] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m1 + mova [tmp1q-32*2], m2 + 
mova [tmp1q-32*1], m3 + mova [tmp1q+32*0], m4 + mova [tmp1q+32*1], m5 + mova [tmp1q+32*2], m6 + mova [tmp1q+32*3], m7 + add tmp1q, 32*8 + mova [tmp1q-32*4], m8 + mova [tmp1q-32*3], m9 + mova [tmp1q-32*2], m10 + mova [tmp1q-32*1], m11 + mova [tmp1q+32*0], m12 + mova [tmp1q+32*1], m13 + mova [tmp1q+32*2], m14 + mova [tmp1q+32*3], m15 + mova xm0, [r2-32*3+ 0] + mova xm1, [r2-32*1+ 0] + vinserti128 m0, [r2+32*1+ 0], 1 + vinserti128 m1, [r2+32*3+ 0], 1 + mova xm2, [r2-32*3+16] + mova xm3, [r2-32*1+16] + vinserti128 m2, [r2+32*1+16], 1 + vinserti128 m3, [r2+32*3+16], 1 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + test r7d, r7d + jl .fast2 + mova xm4, [r3-32*3+ 0] + mova xm5, [r3-32*1+ 0] + vinserti128 m4, [r3+32*1+ 0], 1 + vinserti128 m5, [r3+32*3+ 0], 1 + mova xm6, [r3-32*3+16] + mova xm7, [r3-32*1+16] + vinserti128 m6, [r3+32*1+16], 1 + vinserti128 m7, [r3+32*3+16], 1 +.fast2: + add tmp1q, 32*8 + lea tmp2q, [tmp1q+32*8] + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf_fast + add r2, 32*24 + vpbroadcastd m15, [o(vvc_pd_64)] + add tmp1q, 32*16 + add tmp2q, 32*32 + mova xm0, [r2-32*4+ 0] + mova xm3, [r2-32*1+16] + vinserti128 m0, [r2+32*0+ 0], 1 + vinserti128 m3, [r2+32*3+16], 1 + mova xm4, [r2-32*4+16] + mova xm7, [r2-32*1+ 0] + vinserti128 m4, [r2+32*0+16], 1 + vinserti128 m7, [r2+32*3+ 0], 1 + pxor m1, m1 + REPX {mova x, m1}, m2, m5, m6 + test r7d, r7d + jl .fast3 + add r3, 32*24 + mova xm1, [r3-32*1+16] + mova xm2, [r3-32*4+ 0] + vinserti128 m1, [r3+32*3+16], 1 + vinserti128 m2, [r3+32*0+ 0], 1 + mova xm5, [r3-32*1+ 0] + mova xm6, [r3-32*4+16] + vinserti128 m5, [r3+32*3+ 0], 1 + vinserti128 m6, [r3+32*0+16], 1 +.fast3: + add r6, o_idct2_64_offset + call m(vvc_inv_dct2_dct2_16x64_8).main_part1 + add r6, 8 + add tmp1q, 32*8 + sub tmp2q, 32*8 + mova xm0, [r2-32*2+ 0] + mova xm3, [r2-32*3+16] + vinserti128 m0, [r2+32*2+ 0], 1 + vinserti128 m3, [r2+32*1+16], 1 + mova xm4, [r2-32*2+16] + mova xm7, [r2-32*3+ 0] + vinserti128 m4, [r2+32*2+16], 1 + vinserti128 m7, [r2+32*1+ 
0], 1 + pxor m1, m1 + REPX {mova x, m1}, m2, m5, m6 + test r7d, r7d + jl .fast4 + mova xm1, [r3-32*3+16] + mova xm2, [r3-32*2+ 0] + vinserti128 m1, [r3+32*1+16], 1 + vinserti128 m2, [r3+32*2+ 0], 1 + mova xm5, [r3-32*3+ 0] + mova xm6, [r3-32*2+16] + vinserti128 m5, [r3+32*1+ 0], 1 + vinserti128 m6, [r3+32*2+16], 1 +.fast4: + call m(vvc_inv_dct2_dct2_16x64_8).main_part1 + call m(vvc_inv_dct2_dct2_16x64_8).main_part2_pass2 + RET +ALIGN function_align +%define o_base idct2_64_mul - 8 +cglobal_label .main_part1 + ; idct2_64 steps 1-5: + ; in1/31/17/15/ 9/23/25/ 7 -> + ; t32a/33/34a/35/36/37a/38/39a/56a/57/58a/59/60/61a/62/63a + ; in5/27/21/11/13/19/29/ 3 -> + ; t40a/41/42a/43/44/45a/46/47a/48a/49/50a/51/52/53a/54/55a + vpbroadcastd m11, [o(idct2_64_mul+4* 0)] + vpbroadcastd m13, [o(idct2_64_mul+4* 1)] + vpbroadcastd m10, [o(idct2_64_mul+4* 4)] + vpbroadcastd m12, [o(idct2_64_mul+4* 5)] + pmulhrsw m11, m0 ; t63a + pmulhrsw m0, m13 ; t32a + pmulhrsw m10, m1 ; t62a + pmulhrsw m1, m12 ; t33a + vpbroadcastd m9, [o(idct2_64_mul+4* 8)] + vpbroadcastd m13, [o(idct2_64_mul+4* 9)] + vpbroadcastd m8, [o(idct2_64_mul+4*12)] + vpbroadcastd m12, [o(idct2_64_mul+4*13)] + pmulhrsw m9, m2 ; t61a + pmulhrsw m2, m13 ; t34a + pmulhrsw m8, m3 ; t60a + pmulhrsw m3, m12 ; t35a + psubsw m12, m0, m1 ; t33 + paddsw m0, m1 ; t32 + psubsw m1, m3, m2 ; t34 + paddsw m3, m2 ; t35 + psubsw m2, m8, m9 ; t61 + paddsw m8, m9 ; t60 + psubsw m9, m11, m10 ; t62 + paddsw m11, m10 ; t63 + ITX_MULSUB_2W 2, 1, 10, 13, 15, m90, 9, 0 ; t34a, t61a + vpbroadcastd m14, [o(vvc_pw_9_90)] + ITX_MULSUB_2W 9, 12, 10, 13, 15, 14, 13, 1 ; t33a, t62a + psubsw m10, m0, m3 ; t35a + paddsw m0, m3 ; t32a + psubsw m3, m11, m8 ; t60a + paddsw m11, m8 ; t63a + psubsw m8, m9, m2 ; t34 + paddsw m9, m2 ; t33 + psubsw m2, m12, m1 ; t61 + paddsw m12, m1 ; t62 + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m9 + mova [tmp2q+32*2], m12 + mova [tmp2q+32*3], m11 + vpbroadcastd m13, [o(vvc_pw_m89_18)] + vpbroadcastd m14, [o(vvc_pw_18_89)] + 
ITX_MULSUB_2W 2, 8, 0, 1, 15, 14, 13, 1 ; t34a, t61a + ITX_MULSUB_2W 3, 10, 0, 1, 15, 14, 13, 1 ; t35, t60 + mova [tmp1q-32*2], m2 + mova [tmp1q-32*1], m3 + mova [tmp2q+32*0], m10 + mova [tmp2q+32*1], m8 + vpbroadcastd m3, [o(idct2_64_mul+4*16)] + vpbroadcastd m11, [o(idct2_64_mul+4*17)] + vpbroadcastd m2, [o(idct2_64_mul+4*20)] + vpbroadcastd m10, [o(idct2_64_mul+4*21)] + vpbroadcastd m1, [o(idct2_64_mul+4*24)] + vpbroadcastd m9, [o(idct2_64_mul+4*25)] + vpbroadcastd m0, [o(idct2_64_mul+4*28)] + vpbroadcastd m8, [o(idct2_64_mul+4*29)] + pmulhrsw m3, m4 ; t59a + pmulhrsw m4, m11 ; t36a + pmulhrsw m2, m5 ; t58a + pmulhrsw m5, m10 ; t37a + pmulhrsw m1, m6 ; t57a + pmulhrsw m6, m9 ; t38a + pmulhrsw m0, m7 ; t56a + pmulhrsw m7, m8 ; t39a + psubsw m8, m4, m5 ; t37 + paddsw m4, m5 ; t36 + psubsw m5, m7, m6 ; t38 + paddsw m7, m6 ; t39 + psubsw m6, m0, m1 ; t57 + paddsw m0, m1 ; t56 + psubsw m1, m3, m2 ; t58 + paddsw m3, m2 ; t59 + ITX_MULSUB_2W 6, 5, 2, 9, 15, m57, 70, 0 ; t38a, t57a + vpbroadcastd m10, [o(vvc_pw_70_57)] + ITX_MULSUB_2W 1, 8, 2, 9, 15, 10, 9, 1 ; t37a, t58a + psubsw m2, m7, m4 ; t36a + paddsw m7, m4 ; t39a + psubsw m4, m0, m3 ; t59a + paddsw m0, m3 ; t56a + psubsw m3, m6, m1 ; t37 + paddsw m6, m1 ; t38 + psubsw m1, m5, m8 ; t58 + paddsw m5, m8 ; t57 + mova [tmp1q+32*2], m6 + mova [tmp1q+32*3], m7 + mova [tmp2q-32*4], m0 + mova [tmp2q-32*3], m5 + vpbroadcastd m6, [o(vvc_pw_m18_m89)] + vpbroadcastd m7, [o(vvc_pw_m89_18)] + ITX_MULSUB_2W 4, 2, 0, 5, 15, 7, 6, 1 ; t36, t59 + ITX_MULSUB_2W 1, 3, 0, 5, 15, 7, 6, 1 ; t37a, t58a + mova [tmp1q+32*0], m4 + mova [tmp1q+32*1], m1 + mova [tmp2q-32*2], m3 + mova [tmp2q-32*1], m2 + ret +%define o_base vvc_pw_5 + 128 +.main_part2_pass1: ; idct2_64 steps 6-9 + idct2_16/32/64 sumsub + sub r6, o_idct2_64_offset + 8 + vpbroadcastd m11, [o(vvc_pw_36_83)] + vpbroadcastd m12, [o(vvc_pw_m83_36)] + vpbroadcastd m13, [o(vvc_pw_64_64)] + vpbroadcastd m14, [o(vvc_pw_m64_64)] +.main_part2_pass1_loop: + call .main_part2_internal + 
IDCT2_64_PART2_END 0, 7, 0, 6, 9, 10 + IDCT2_64_PART2_END 7, 8, 5, 0, 6, 7 + IDCT2_64_PART2_END 8, 2, 1, 0, 6, 7 + IDCT2_64_PART2_END 15, 3, 4, 0, 6, 7 + cmp tmp1q, tmp2q + jne .main_part2_pass1_loop + ret +cglobal_label .main_part2_internal + mova m0, [tmp1q-32*12] ; t32a + mova m6, [tmp2q-32*13] ; t39a + mova m1, [tmp1q-32* 4] ; t40a + mova m5, [tmp2q+32* 3] ; t55a + add tmp1q, 32 + sub tmp2q, 32 + mova m2, [tmp1q+32* 3] ; t48a + mova m4, [tmp2q-32* 4] ; t47a + mova m3, [tmp1q+32*11] ; t56a + mova m7, [tmp2q+32*12] ; t63a + psubsw m8, m0, m6 ; t39 + paddsw m0, m6 ; t32 + psubsw m6, m4, m1 ; t40 + paddsw m4, m1 ; t47 + psubsw m1, m2, m5 ; t55 + paddsw m2, m5 ; t48 + psubsw m5, m7, m3 ; t56 + paddsw m7, m3 ; t63 + ITX_MULSUB_2W 5, 8, 3, 9, 15, 11, 12, 1 ; t39a, t56a + vpbroadcastd m9, [o(vvc_pw_m36_m83)] + ITX_MULSUB_2W 1, 6, 3, 9, 15, 12, 9, 1 ; t40a, t55a + psubsw m3, m0, m4 ; t47a + paddsw m0, m4 ; t32a + psubsw m4, m7, m2 ; t48a + paddsw m7, m2 ; t63a + psubsw m2, m5, m1 ; t40 + paddsw m5, m1 ; t39 + psubsw m1, m8, m6 ; t55 + paddsw m8, m6 ; t56 + ITX_MULSUB_2W 4, 3, 6, 9, 15, 13, 14, 1 ; t47, t48 + ITX_MULSUB_2W 1, 2, 6, 9, 15, 13, 14, 1 ; t40a, t55a + ret +.main_part2_pass2: + sub r6, o_idct2_64_offset + 8 + vpbroadcastd m11, [o(vvc_pw_36_83)] + vpbroadcastd m12, [o(vvc_pw_m83_36)] + vpbroadcastd m13, [o(vvc_pw_64_64)] + lea r9, [strideq*5] ; stride*5 + lea r3, [r9+strideq*1] ; stride*6 + lea r7, [r9+strideq*2] ; stride*7 + lea r8, [r3+strideq*2] ; stride*8 + lea r2, [dstq+r7] +.main_part2_pass2_loop: + vpbroadcastd m14, [o(vvc_pw_m64_64)] + call .main_part2_internal + vpbroadcastd m14, [o(vvc_pw_2048)] + IDCT2_64_PART2_END 0, 7, 0, 6, 9, 10, strideq*0, r3*4, r8*4, r7*8 + IDCT2_64_PART2_END 7, 8, 5, 0, 6, 7, strideq*0, r3*4, r8*4, r7*8 + IDCT2_64_PART2_END 8, 2, 1, 0, 6, 7, strideq*8, r8*2, r9*8, r3*8 + IDCT2_64_PART2_END 15, 3, 4, 0, 6, 7, strideq*8, r8*2, r9*8, r3*8 + add dstq, strideq + sub r2, strideq + cmp tmp1q, tmp2q + jne .main_part2_pass2_loop + ret 
+ +cglobal vvc_inv_dct2_dct2_64x16_8, 4, 4, 0, dst, stride, c, eob + lea r6, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(vvc_pw_64x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(vvc_pw_8192)] + mov [cq], eobd + or r3d, 16 +.dconly: + pmulhrsw xm0, xm2 + movd xm2, [o(vvc_pw_2048)] + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm2 + vpbroadcastw m0, xm0 + pxor m1, m1 +.dconly_loop: + mova m2, [dstq+32*0] + mova m3, [dstq+32*1] + punpckhbw m4, m2, m1 + punpcklbw m2, m1 + punpckhbw m5, m3, m1 + punpcklbw m3, m1 + paddw m4, m0 + paddw m2, m0 + paddw m5, m0 + paddw m3, m0 + packuswb m2, m4 + packuswb m3, m5 + mova [dstq+32*0], m2 + mova [dstq+32*1], m3 + add dstq, strideq + dec r3d + jg .dconly_loop + RET +.normal: + PROLOGUE 0, 7, 16, 32*67, dst, stride, c, eob, tmp1, tmp2 + LOAD_8ROWS cq+32*0, 32*4 + pxor m8, m8 + REPX {mova [cq+32*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 + lea tmp1q, [rsp+32*7] + call m(idct2_16x16_internal_8).main + mova m1, [rsp+32*1] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m1 + mova [tmp1q-32*2], m2 + mova [tmp1q-32*1], m3 + mova [tmp1q+32*0], m4 + mova [tmp1q+32*1], m5 + mova [tmp1q+32*2], m6 + mova [tmp1q+32*3], m7 + add tmp1q, 32*8 + mova [tmp1q-32*4], m8 + mova [tmp1q-32*3], m9 + mova [tmp1q-32*2], m10 + mova [tmp1q-32*1], m11 + mova [tmp1q+32*0], m12 + mova [tmp1q+32*1], m13 + mova [tmp1q+32*2], m14 + mova [tmp1q+32*3], m15 + LOAD_8ROWS cq+32*2, 32*4 + pxor m8, m8 + REPX {mova [cq+32*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30 + add tmp1q, 32*8 + lea tmp2q, [tmp1q+32*8] + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf_fast + vpbroadcastd m15, [o(vvc_pd_64)] + add tmp1q, 32*16 + add tmp2q, 32*32 + mova m0, [cq+32* 1] + mova m1, [cq+32*31] + mova m2, [cq+32*17] + mova m3, [cq+32*15] + mova m4, [cq+32* 9] + mova m5, [cq+32*23] + mova m6, [cq+32*25] + mova m7, [cq+32* 7] + pxor m8, m8 + REPX {mova [cq+32*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7 + add r6, o_idct2_64_offset + call 
m(vvc_inv_dct2_dct2_16x64_8).main_part1 + add r6, 8 + add tmp1q, 32*8 + sub tmp2q, 32*8 + mova m0, [cq+32* 5] + mova m1, [cq+32*27] + mova m2, [cq+32*21] + mova m3, [cq+32*11] + mova m4, [cq+32*13] + mova m5, [cq+32*19] + mova m6, [cq+32*29] + mova m7, [cq+32* 3] + pxor m8, m8 + REPX {mova [cq+32*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3 + call m(vvc_inv_dct2_dct2_16x64_8).main_part1 + call m(vvc_inv_dct2_dct2_16x64_8).main_part2_pass1 + sub tmp1q, 32*36 + lea r2, [strideq*3] + mov tmp2d, 4 +.pass2_loop: + lea r3, [tmp1q-32*8] + mova xm0, [r3 -32*4] + mova xm1, [r3 -32*3] + vinserti128 m0, [tmp1q-32*4], 1 + vinserti128 m1, [tmp1q-32*3], 1 + mova xm2, [r3 -32*2] + mova xm3, [r3 -32*1] + vinserti128 m2, [tmp1q-32*2], 1 + vinserti128 m3, [tmp1q-32*1], 1 + mova xm4, [r3 +32*0] + mova xm5, [r3 +32*1] + vinserti128 m4, [tmp1q+32*0], 1 + vinserti128 m5, [tmp1q+32*1], 1 + mova xm6, [r3 +32*2] + mova xm7, [r3 +32*3] + vinserti128 m6, [tmp1q+32*2], 1 + vinserti128 m7, [tmp1q+32*3], 1 + mova xm8, [r3 -32*4+16] + mova xm9, [r3 -32*3+16] + vinserti128 m8, [tmp1q-32*4+16], 1 + vinserti128 m9, [tmp1q-32*3+16], 1 + mova xm10, [r3 -32*2+16] + mova xm11, [r3 -32*1+16] + vinserti128 m10, [tmp1q-32*2+16], 1 + vinserti128 m11, [tmp1q-32*1+16], 1 + mova xm12, [r3 +32*0+16] + mova xm13, [r3 +32*1+16] + vinserti128 m12, [tmp1q+32*0+16], 1 + vinserti128 m13, [tmp1q+32*1+16], 1 + mova xm14, [r3 +32*2+16] + mova xm15, [r3 +32*3+16] + vinserti128 m14, [tmp1q+32*2+16], 1 + vinserti128 m15, [tmp1q+32*3+16], 1 + mova [rsp+32*0], m6 + mova [rsp+32*1], m7 + vpbroadcastd m7, [o(vvc_pw_8192)] + call m(vvc_inv_dct2_dct2_16x32_8).transpose_2x8x8_round + call m(idct2_16x16_internal_8).main + mova [rsp+32*0], m15 + vpbroadcastd m15, [o(vvc_pw_2048)] + REPX {pmulhrsw x, m15}, m0, m2, m3, m4, m5, m6, m7 + WRITE_16X2 2, 3, 1, 2, strideq*2, r2 + pmulhrsw m1, m15, [rsp+32*1] + WRITE_16X2 0, 1, 2, 3, strideq*0, strideq*1 + lea r3, [dstq+strideq*4] + %define dstq r3 + WRITE_16X2 4, 5, 2, 3, strideq*0, strideq*1 + 
WRITE_16X2 6, 7, 2, 3, strideq*2, r2 + REPX {pmulhrsw x, m15}, m8, m9, m10, m11, m12, m13, m14 + lea r3, [r3+strideq*4] + WRITE_16X2 8, 9, 2, 3, strideq*0, strideq*1 + WRITE_16X2 10, 11, 2, 3, strideq*2, r2 + pmulhrsw m15, [rsp+32*0] + lea r3, [r3+strideq*4] + WRITE_16X2 12, 13, 2, 3, strideq*0, strideq*1 + WRITE_16X2 14, 15, 2, 3, strideq*2, r2 + add tmp1q, 32*16 + add r0, 16 + dec tmp2d + jg .pass2_loop + RET + +cglobal vvc_inv_dct2_dct2_32x64_8, 4, 4, 0, dst, stride, c, eob + lea r6, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(vvc_pw_64x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(vvc_pw_16384)] + mov [cq], eobd + pmulhrsw xm0, xm1 + or r3d, 64 + jmp m(vvc_inv_dct2_dct2_32x8_8).dconly +.normal: + PROLOGUE 0, 11, 16, 32*99, dst, stride, c, eob, tmp1, tmp2 + lea tmp1q, [rsp+32*7] + lea r10d, [eobq-136] + sar r10d, 31 +.pass1_loop: + lea tmp2q, [tmp1q+32*16] + LOAD_8ROWS cq+64*1, 64*2, 1 + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15 + test r10b, r10b + jnz .fast + LOAD_8ROWS_H cq+64*17, 64*2, 2 + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf + LOAD_8ROWS_H cq+64*16, 64*2, 1 + mova [rsp], m15 + pxor m15, m15 + REPX {mova [cq+64*x], m15}, 16, 17, 18, 19, 20, 21, 22, 23, \ + 24, 25, 26, 27, 28, 29, 30, 31 + jmp .idct2_16 +.fast: + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf_fast + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 +.idct2_16: + LOAD_8ROWS cq+64*0, 64*2, 1 + pxor m15, m15 + REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14 + call m(idct2_16x16_internal_8).main + call m(vvc_inv_dct2_dct2_32x16_8).pass1_end + vpbroadcastd m7, [o(vvc_pw_16384)] + call m(vvc_inv_dct2_dct2_16x32_8).transpose_2x8x8_round + lea r3, [tmp1q+32*48] + mova m15, [rsp] + mova [r3-32*4], m0 + mova [r3-32*3], m2 + mova [r3-32*2], m4 + mova [r3-32*1], m6 + mova [r3+32*0], m8 + mova [r3+32*1], m10 + mova [r3+32*2], m12 + mova [r3+32*3], m14 + add r3, 32*24 + mova [r3-32*4], m1 + mova [r3-32*3], m3 + mova [r3-32*2], 
m5 + mova [r3-32*1], m7 + mova [r3+32*0], m9 + mova [r3+32*1], m11 + mova [r3+32*2], m13 + mova [r3+32*3], m15 + vpbroadcastd m9, [o(vvc_pw_16384)] + pmulhrsw m0, m9, [tmp1q-32*4] + pmulhrsw m1, m9, [tmp1q-32*3] + pmulhrsw m2, m9, [tmp1q-32*2] + pmulhrsw m3, m9, [tmp1q-32*1] + pmulhrsw m4, m9, [tmp1q+32*0] + pmulhrsw m5, m9, [tmp1q+32*1] + pmulhrsw m6, m9, [tmp1q+32*2] + pmulhrsw m7, m9, [tmp1q+32*3] + call m(vvc_inv_identity_identity_8x32_8).transpose8x8 + mova [tmp1q-32*4], m0 + pmulhrsw m0, m9, [tmp2q-32*4] + mova [tmp2q-32*4], m1 + pmulhrsw m1, m9, [tmp2q-32*3] + mova [tmp1q-32*3], m2 + pmulhrsw m2, m9, [tmp2q-32*2] + mova [tmp2q-32*3], m3 + pmulhrsw m3, m9, [tmp2q-32*1] + mova [tmp1q-32*2], m4 + pmulhrsw m4, m9, [tmp2q+32*0] + mova [tmp2q-32*2], m5 + pmulhrsw m5, m9, [tmp2q+32*1] + mova [tmp1q-32*1], m6 + pmulhrsw m6, m9, [tmp2q+32*2] + mova [tmp2q-32*1], m7 + pmulhrsw m7, m9, [tmp2q+32*3] + call m(vvc_inv_identity_identity_8x32_8).transpose8x8 + mova [tmp1q+32*0], m0 + mova [tmp2q+32*0], m1 + mova [tmp1q+32*1], m2 + mova [tmp2q+32*1], m3 + mova [tmp1q+32*2], m4 + mova [tmp2q+32*2], m5 + mova [tmp1q+32*3], m6 + mova [tmp2q+32*3], m7 + add cq, 32 + add tmp1q, 32*8 + add r10d, 0x80000000 + jnc .pass1_loop + lea r2, [rsp+32*55] + lea r7, [r2+32*24] +.pass2_loop: + lea r3, [r2+32*8] + lea r8, [r7+32*8] + mova m0, [r2-32*4] + mova m1, [r2-32*2] + mova m2, [r2+32*0] + mova m3, [r2+32*2] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14 + test r10b, r10b + jnz .fast2 + mova m4, [r3-32*4] + mova m5, [r3-32*2] + mova m6, [r3+32*0] + mova m7, [r3+32*2] +.fast2: + mova [rsp], m8 + lea tmp1q, [rsp+32*39] + call m(idct2_16x16_internal_8).main + mova m1, [rsp+32*1] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m1 + mova [tmp1q-32*2], m2 + mova [tmp1q-32*1], m3 + mova [tmp1q+32*0], m4 + mova [tmp1q+32*1], m5 + mova [tmp1q+32*2], m6 + mova [tmp1q+32*3], m7 + add tmp1q, 32*8 + mova [tmp1q-32*4], m8 + mova [tmp1q-32*3], m9 + mova [tmp1q-32*2], m10 + 
mova [tmp1q-32*1], m11 + mova [tmp1q+32*0], m12 + mova [tmp1q+32*1], m13 + mova [tmp1q+32*2], m14 + mova [tmp1q+32*3], m15 + mova m0, [r2-32*3] + mova m1, [r2-32*1] + mova m2, [r2+32*1] + mova m3, [r2+32*3] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + test r10b, r10b + jnz .fast3 + mova m4, [r3-32*3] + mova m5, [r3-32*1] + mova m6, [r3+32*1] + mova m7, [r3+32*3] +.fast3: + add tmp1q, 32*8 + lea tmp2q, [tmp1q+32*8] + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf_fast + vpbroadcastd m15, [o(vvc_pd_64)] + add tmp1q, 32*16 + add tmp2q, 32*32 + mova m0, [r7-32*4] + mova m3, [r7+32*3] + mova m4, [r7+32*0] + mova m7, [r7-32*1] + pxor m1, m1 + REPX {mova x, m1}, m2, m5, m6 + test r10b, r10b + jnz .fast4 + mova m1, [r8+32*3] + mova m2, [r8-32*4] + mova m5, [r8-32*1] + mova m6, [r8+32*0] +.fast4: + add r6, o_idct2_64_offset + call m(vvc_inv_dct2_dct2_16x64_8).main_part1 + add r6, 8 + add tmp1q, 32*8 + sub tmp2q, 32*8 + mova m0, [r7-32*2] + mova m3, [r7+32*1] + mova m4, [r7+32*2] + mova m7, [r7-32*3] + pxor m1, m1 + REPX {mova x, m1}, m2, m5, m6 + test r10b, r10b + jnz .fast5 + mova m1, [r8+32*1] + mova m2, [r8-32*2] + mova m5, [r8-32*3] + mova m6, [r8+32*2] +.fast5: + call m(vvc_inv_dct2_dct2_16x64_8).main_part1 + call m(vvc_inv_dct2_dct2_16x64_8).main_part2_pass2 + add r10d, 0x80000000 + jc .ret + lea r2, [rsp+32*7] + lea r7, [r2+32*16] + sub dstq, r8 + lea dstq, [dstq+strideq*4+16] + jmp .pass2_loop +.ret: + RET + +cglobal vvc_inv_dct2_dct2_64x32_8, 4, 4, 0, dst, stride, c, eob + lea r6, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(vvc_pw_64x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(vvc_pw_16384)] + mov [cq], eobd + pmulhrsw xm0, xm1 + or r3d, 32 + jmp m(vvc_inv_dct2_dct2_64x16_8).dconly +.normal: + PROLOGUE 0, 9, 16, 32*131, dst, stride, c, eob, tmp1, tmp2, \ + base, tmp3, tmp4 + lea tmp1q, [rsp+32*7] + lea tmp4d, [eobq-136] +.pass1_loop: + LOAD_8ROWS cq+64*0, 64*4, 1 + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28 + REPX {mova x, 
m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 + call m(idct2_16x16_internal_8).main + mova m1, [rsp+32*1] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m1 + mova [tmp1q-32*2], m2 + mova [tmp1q-32*1], m3 + mova [tmp1q+32*0], m4 + mova [tmp1q+32*1], m5 + mova [tmp1q+32*2], m6 + mova [tmp1q+32*3], m7 + add tmp1q, 32*8 + mova [tmp1q-32*4], m8 + mova [tmp1q-32*3], m9 + mova [tmp1q-32*2], m10 + mova [tmp1q-32*1], m11 + mova [tmp1q+32*0], m12 + mova [tmp1q+32*1], m13 + mova [tmp1q+32*2], m14 + mova [tmp1q+32*3], m15 + LOAD_8ROWS cq+64*2, 64*4, 1 + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30 + add tmp1q, 32*8 + lea tmp2q, [tmp1q+32*8] + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf_fast + vpbroadcastd m15, [o(vvc_pd_64)] + add tmp1q, 32*16 + add tmp2q, 32*32 + vpbroadcastd m7, [o(vvc_pw_64x8)] + pmulhrsw m0, m7, [cq+64* 1] + pmulhrsw m1, m7, [cq+64*31] + pmulhrsw m2, m7, [cq+64*17] + pmulhrsw m3, m7, [cq+64*15] + pmulhrsw m4, m7, [cq+64* 9] + pmulhrsw m5, m7, [cq+64*23] + pmulhrsw m6, m7, [cq+64*25] + pmulhrsw m7, [cq+64* 7] + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7 + add r6, o_idct2_64_offset + call m(vvc_inv_dct2_dct2_16x64_8).main_part1 + vpbroadcastd m7, [o(vvc_pw_64x8-(o_idct2_64_offset))] + add r6, 8 + add tmp1q, 32*8 + sub tmp2q, 32*8 + pmulhrsw m0, m7, [cq+64* 5] + pmulhrsw m1, m7, [cq+64*27] + pmulhrsw m2, m7, [cq+64*21] + pmulhrsw m3, m7, [cq+64*11] + pmulhrsw m4, m7, [cq+64*13] + pmulhrsw m5, m7, [cq+64*19] + pmulhrsw m6, m7, [cq+64*29] + pmulhrsw m7, [cq+64* 3] + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3 + call m(vvc_inv_dct2_dct2_16x64_8).main_part1 + call m(vvc_inv_dct2_dct2_16x64_8).main_part2_pass1 + sub tmp1q, 32*44 + vpbroadcastd m10, [o(vvc_pw_16384)] + call m(vvc_inv_dct2_dct2_64x32_8).transpose_round_interleave + add cq, 32 + add tmp4d, 0x80000000 + jnc .pass1_loop + lea tmp1q, [rsp+32*15] + imul r2, strideq, 19 + lea r3, [strideq*3] + add r2, dstq + mov tmp4b, 4 
+.pass2_loop: + lea tmp2q, [tmp1q+32*64] + LOAD_8ROWS tmp1q-32*4, 32 + test tmp4d, 0x40000000 + jnz .fast + LOAD_8ROWS_H tmp2q-32*4, 32 + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf + lea tmp3q, [tmp2q-32*8] + LOAD_8ROWS_H tmp3q-32*4, 32 + mova [rsp], m15 + jmp .idct2_16 +.fast: + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf_fast + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 +.idct2_16: + lea tmp3q, [tmp1q-32*8] + LOAD_8ROWS tmp3q-32*4, 32 + call m(idct2_16x16_internal_8).main + call m(vvc_inv_dct2_dct2_16x32_8).pass2_end + add tmp1q, 32*16 + sub dstq, r3 + lea r2, [r2+r3+16] + add dstq, 16 + dec tmp4b + jg .pass2_loop + RET +ALIGN function_align +.transpose_round_interleave: + mov tmp3d, 4 +.loop: + lea tmp2q, [tmp1q+32*8] + mova xm0, [tmp1q-32*4] + mova xm1, [tmp1q-32*3] + vinserti128 m0, [tmp2q-32*4], 1 + vinserti128 m1, [tmp2q-32*3], 1 + mova xm2, [tmp1q-32*2] + mova xm3, [tmp1q-32*1] + vinserti128 m2, [tmp2q-32*2], 1 + vinserti128 m3, [tmp2q-32*1], 1 + mova xm4, [tmp1q+32*0] + mova xm5, [tmp1q+32*1] + vinserti128 m4, [tmp2q+32*0], 1 + vinserti128 m5, [tmp2q+32*1], 1 + mova xm6, [tmp1q+32*2] + mova xm7, [tmp1q+32*3] + vinserti128 m6, [tmp2q+32*2], 1 + vinserti128 m7, [tmp2q+32*3], 1 + REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(vvc_inv_identity_identity_8x32_8).transpose8x8 + mova xm8, [tmp1q-32*4+16] + mova xm9, [tmp1q-32*3+16] + vinserti128 m8, [tmp2q-32*4+16], 1 + vinserti128 m9, [tmp2q-32*3+16], 1 + mova [tmp1q-32*4], m0 + mova [tmp2q-32*4], m1 + mova [tmp1q-32*3], m2 + mova [tmp2q-32*3], m3 + mova xm2, [tmp1q-32*2+16] + mova xm3, [tmp1q-32*1+16] + vinserti128 m2, [tmp2q-32*2+16], 1 + vinserti128 m3, [tmp2q-32*1+16], 1 + mova [tmp1q-32*2], m4 + mova [tmp2q-32*2], m5 + mova [tmp1q-32*1], m6 + mova [tmp2q-32*1], m7 + mova xm4, [tmp1q+32*0+16] + mova xm5, [tmp1q+32*1+16] + vinserti128 m4, [tmp2q+32*0+16], 1 + vinserti128 m5, [tmp2q+32*1+16], 1 + mova xm6, [tmp1q+32*2+16] + mova xm7, [tmp1q+32*3+16] + 
vinserti128 m6, [tmp2q+32*2+16], 1 + vinserti128 m7, [tmp2q+32*3+16], 1 + pmulhrsw m0, m8, m10 + pmulhrsw m1, m9, m10 + REPX {pmulhrsw x, m10}, m2, m3, m4, m5, m6, m7 + call m(vvc_inv_identity_identity_8x32_8).transpose8x8 + mova [tmp1q+32*0], m0 + mova [tmp2q+32*0], m1 + mova [tmp1q+32*1], m2 + mova [tmp2q+32*1], m3 + mova [tmp1q+32*2], m4 + mova [tmp2q+32*2], m5 + mova [tmp1q+32*3], m6 + mova [tmp2q+32*3], m7 + add tmp1q, 32*16 + dec tmp3d + jg .loop + ret + +cglobal vvc_inv_dct2_dct2_64x64_8, 4, 4, 0, dst, stride, c, eob + lea r6, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(vvc_pw_64x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(vvc_pw_8192)] + mov [cq], eobd + or r3d, 64 + jmp m(vvc_inv_dct2_dct2_64x16_8).dconly +.normal: + PROLOGUE 0, 11, 16, 32*199, dst, stride, c, eob, tmp1, tmp2 + lea tmp1q, [rsp+32*71] + lea r10d, [eobq-136] +.pass1_loop: + LOAD_8ROWS cq+64*0, 64*4 + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 + call m(idct2_16x16_internal_8).main + mova m1, [rsp+32*1] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m1 + mova [tmp1q-32*2], m2 + mova [tmp1q-32*1], m3 + mova [tmp1q+32*0], m4 + mova [tmp1q+32*1], m5 + mova [tmp1q+32*2], m6 + mova [tmp1q+32*3], m7 + add tmp1q, 32*8 + mova [tmp1q-32*4], m8 + mova [tmp1q-32*3], m9 + mova [tmp1q-32*2], m10 + mova [tmp1q-32*1], m11 + mova [tmp1q+32*0], m12 + mova [tmp1q+32*1], m13 + mova [tmp1q+32*2], m14 + mova [tmp1q+32*3], m15 + LOAD_8ROWS cq+64*2, 64*4 + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30 + add tmp1q, 32*8 + lea tmp2q, [tmp1q+32*8] + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf_fast + vpbroadcastd m15, [o(vvc_pd_64)] + add tmp1q, 32*16 + add tmp2q, 32*32 + mova m0, [cq+64* 1] + mova m1, [cq+64*31] + mova m2, [cq+64*17] + mova m3, [cq+64*15] + mova m4, [cq+64* 9] + mova m5, [cq+64*23] + mova m6, [cq+64*25] + mova m7, [cq+64* 7] + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 1, 31, 
17, 15, 9, 23, 25, 7 + add r6, o_idct2_64_offset + call m(vvc_inv_dct2_dct2_16x64_8).main_part1 + add r6, 8 + add tmp1q, 32*8 + sub tmp2q, 32*8 + mova m0, [cq+64* 5] + mova m1, [cq+64*27] + mova m2, [cq+64*21] + mova m3, [cq+64*11] + mova m4, [cq+64*13] + mova m5, [cq+64*19] + mova m6, [cq+64*29] + mova m7, [cq+64* 3] + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3 + call m(vvc_inv_dct2_dct2_16x64_8).main_part1 + call m(vvc_inv_dct2_dct2_16x64_8).main_part2_pass1 + sub tmp1q, 32*44 + vpbroadcastd m10, [o(vvc_pw_8192)] + call m(vvc_inv_dct2_dct2_64x32_8).transpose_round_interleave + add cq, 32 + add r10d, 0x80000000 + jnc .pass1_loop + lea tmp1q, [rsp+32*7] + mov r10b, 4 +.pass2_loop: + lea r2, [tmp1q+32*64] + mova m0, [r2-32*4] + mova m1, [r2-32*2] + mova m2, [r2+32*0] + mova m3, [r2+32*2] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14 + mova [rsp], m4 + test r10d, 0x40000000 + jnz .fast + lea r3, [r2+32*64] + mova m4, [r3-32*4] + mova m5, [r3-32*2] + mova m6, [r3+32*0] + mova m7, [r3+32*2] +.fast: + call m(idct2_16x16_internal_8).main + mova m1, [rsp+32*1] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m1 + mova [tmp1q-32*2], m2 + mova [tmp1q-32*1], m3 + mova [tmp1q+32*0], m4 + mova [tmp1q+32*1], m5 + mova [tmp1q+32*2], m6 + mova [tmp1q+32*3], m7 + add tmp1q, 32*8 + mova [tmp1q-32*4], m8 + mova [tmp1q-32*3], m9 + mova [tmp1q-32*2], m10 + mova [tmp1q-32*1], m11 + mova [tmp1q+32*0], m12 + mova [tmp1q+32*1], m13 + mova [tmp1q+32*2], m14 + mova [tmp1q+32*3], m15 + mova m0, [r2-32*3] + mova m1, [r2-32*1] + mova m2, [r2+32*1] + mova m3, [r2+32*3] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + test r10d, 0x40000000 + jnz .fast2 + mova m4, [r3-32*3] + mova m5, [r3-32*1] + mova m6, [r3+32*1] + mova m7, [r3+32*3] +.fast2: + add tmp1q, 32*8 + lea tmp2q, [tmp1q+32*8] + call m(vvc_inv_dct2_dct2_16x32_8).main_oddhalf_fast + vpbroadcastd m15, [o(vvc_pd_64)] + add r2, 32*8 + add r3, 32*8 + add tmp1q, 32*16 + add tmp2q, 32*32 
+ mova m0, [r2-32*4] ; 1 + mova m3, [r2+32*3] ; 15 + mova m4, [r2+32*0] ; 9 + mova m7, [r2-32*1] ; 7 + pxor m1, m1 + REPX {mova x, m1}, m2, m5, m6 + test r10d, 0x40000000 + jnz .fast3 + mova m1, [r3+32*3] ; 31 + mova m2, [r3-32*4] ; 17 + mova m5, [r3-32*1] ; 23 + mova m6, [r3+32*0] ; 25 +.fast3: + add r6, o_idct2_64_offset + call m(vvc_inv_dct2_dct2_16x64_8).main_part1 + add r6, 8 + add tmp1q, 32*8 + sub tmp2q, 32*8 + mova m0, [r2-32*2] ; 5 + mova m3, [r2+32*1] ; 11 + mova m4, [r2+32*2] ; 13 + mova m7, [r2-32*3] ; 3 + pxor m1, m1 + REPX {mova x, m1}, m2, m5, m6 + test r10d, 0x40000000 + jnz .fast4 + mova m1, [r3+32*1] ; 27 + mova m2, [r3-32*2] ; 21 + mova m5, [r3-32*3] ; 19 + mova m6, [r3+32*2] ; 29 +.fast4: + call m(vvc_inv_dct2_dct2_16x64_8).main_part1 + call m(vvc_inv_dct2_dct2_16x64_8).main_part2_pass2 + sub tmp1q, 32*28 + sub dstq, r8 + lea dstq, [dstq+strideq*4+16] + dec r10b + jg .pass2_loop + RET + +%endif ; ARCH_X86_64 diff --git a/libavcodec/x86/vvcdsp_init.c b/libavcodec/x86/vvcdsp_init.c index a5849e3a2ed..797349e5665 100644 --- a/libavcodec/x86/vvcdsp_init.c +++ b/libavcodec/x86/vvcdsp_init.c @@ -241,6 +241,44 @@ PUT_VVC_LUMA_FORWARD_FUNCS(12, avx512icl) c->inter.put[LUMA][1][1] = ff_vvc_put_vvc_luma_hv_##bitd##_##opt; \ } while (0) +#define ITX_COMMON_SIZES(TYPE_H, type_h, TYPE_V, type_v, bitd, opt) \ + ITX(TYPE_H, type_h, TYPE_V, type_v, 4, 4, bitd, opt); \ + /* ITX(TYPE_H, type_h, TYPE_V, type_v, 4, 8, bitd, opt); */ \ + /* ITX(TYPE_H, type_h, TYPE_V, type_v, 4, 16, bitd, opt); */ \ + /* ITX(TYPE_H, type_h, TYPE_V, type_v, 8, 4, bitd, opt); */ \ + /* ITX(TYPE_H, type_h, TYPE_V, type_v, 8, 8, bitd, opt); */ \ + /* ITX(TYPE_H, type_h, TYPE_V, type_v, 8, 16, bitd, opt); */ \ + /* ITX(TYPE_H, type_h, TYPE_V, type_v, 8, 32, bitd, opt); */ \ + /* ITX(TYPE_H, type_h, TYPE_V, type_v, 16, 4, bitd, opt); */ \ + /* ITX(TYPE_H, type_h, TYPE_V, type_v, 16, 8, bitd, opt); */ \ + /* ITX(TYPE_H, type_h, TYPE_V, type_v, 16, 16, bitd, opt); */ \ + /* ITX(TYPE_H, 
type_h, TYPE_V, type_v, 16, 32, bitd, opt); */ \ + /* ITX(TYPE_H, type_h, TYPE_V, type_v, 32, 8, bitd, opt); */ \ + /* ITX(TYPE_H, type_h, TYPE_V, type_v, 32, 16, bitd, opt); */ \ + /* ITX(TYPE_H, type_h, TYPE_V, type_v, 32, 32, bitd, opt); */ + +#define ITX_SIZES(bitd, opt) \ + ITX_COMMON_SIZES(DCT2, dct2, DCT2, dct2, bitd, opt); \ + /* ITX(DCT2, dct2, DCT2, dct2, 16, 64, bitd, opt); */ \ + /* ITX(DCT2, dct2, DCT2, dct2, 32, 64, bitd, opt); */ \ + /* ITX(DCT2, dct2, DCT2, dct2, 64, 16, bitd, opt); */ \ + /* ITX(DCT2, dct2, DCT2, dct2, 64, 32, bitd, opt); */ \ + /* ITX(DCT2, dct2, DCT2, dct2, 64, 64, bitd, opt); */ + +#define ITX(TYPE_H, type_h, TYPE_V, type_v, width, height, bitd, opt) \ +void ff_vvc_inv_##type_h##_##type_v##_##width##x##height##_##bitd##_##opt( \ + int16_t *dst, const int *coeff, int nzw, int log2_transform_range); +/* ITX_SIZES(8, avx2) */ +ITX_SIZES(10, avx2) + +#undef ITX +#define ITX(TYPE_H, type_h, TYPE_V, type_v, width, height, bitd, opt) \ + c->itx.itx[TYPE_H][TYPE_V][TX_SIZE_##width][TX_SIZE_##height] = ff_vvc_inv_##type_h##_##type_v##_##width##x##height##_##bitd##_##opt; + +#define ITX_INIT(bitd, opt) do { \ + ITX_SIZES(bitd, opt) \ +} while (0) + void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bit_depth) { const int cpu_flags = av_get_cpu_flags(); @@ -250,12 +288,14 @@ void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bit_depth) case 8: ALF_DSP(8); PUT_VVC_LUMA_INIT(8, avx2); + /* ITX_INIT(8, avx2); */ c->sao.band_filter[0] = ff_vvc_sao_band_filter_8_8_avx2; c->sao.band_filter[1] = ff_vvc_sao_band_filter_16_8_avx2; break; case 10: ALF_DSP(10); PUT_VVC_LUMA_INIT(10, avx2); + ITX_INIT(10, avx2); c->sao.band_filter[0] = ff_vvc_sao_band_filter_8_10_avx2; break; case 12: diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm index 251ee797dec..e099ee4b10d 100644 --- a/libavutil/x86/x86inc.asm +++ b/libavutil/x86/x86inc.asm @@ -232,6 +232,16 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 %define 
gprsize 4 %endif +; Repeats an instruction/operation for multiple arguments. +; Example usage: "REPX {psrlw x, 8}, m0, m1, m2, m3" +%macro REPX 2-* ; operation, args + %xdefine %%f(x) %1 + %rep %0 - 1 + %rotate 1 + %%f(%1) + %endrep +%endmacro + %macro PUSH 1 push %1 %ifidn rstk, rsp diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index 9a2105da3b3..c008072bbe3 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -35,7 +35,10 @@ AVCODECOBJS-$(CONFIG_V210_DECODER) += v210dec.o AVCODECOBJS-$(CONFIG_V210_ENCODER) += v210enc.o AVCODECOBJS-$(CONFIG_VORBIS_DECODER) += vorbisdsp.o AVCODECOBJS-$(CONFIG_VP9_DECODER) += vp9dsp.o -AVCODECOBJS-$(CONFIG_VVC_DECODER) += vvc_alf.o vvc_sao.o vvc_mc.o +AVCODECOBJS-$(CONFIG_VVC_DECODER) += vvc_alf.o \ + vvc_sao.o \ + vvc_mc.o \ + vvc_itx.o CHECKASMOBJS-$(CONFIG_AVCODEC) += $(AVCODECOBJS-yes) diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index c4f80ece513..02f2a56cc57 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -179,6 +179,7 @@ static const struct { { "vvc_alf", checkasm_check_vvc_alf }, { "vvc_sao", checkasm_check_vvc_sao }, { "vvc_mc", checkasm_check_vvc_mc }, + { "vvc_itx", checkasm_check_vvc_itx }, #endif #endif #if CONFIG_AVFILTER diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index a82e157a4ea..408f77608ed 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -97,6 +97,7 @@ void checkasm_check_vorbisdsp(void); void checkasm_check_vvc_alf(void); void checkasm_check_vvc_sao(void); void checkasm_check_vvc_mc(void); +void checkasm_check_vvc_itx(void); struct CheckasmPerf; diff --git a/tests/checkasm/vvc_itx.c b/tests/checkasm/vvc_itx.c new file mode 100644 index 00000000000..c7fc312965e --- /dev/null +++ b/tests/checkasm/vvc_itx.c @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2023 Frank Plowman + * + * This file is part of FFmpeg. 
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+
+#include "libavcodec/avcodec.h"
+
+#include "libavcodec/vvc/vvcdsp.h"
+#include "libavcodec/vvc/vvcdec.h"
+
+#include "checkasm.h"
+
+#define BUF_SIZE (MAX_TB_SIZE * MAX_TB_SIZE)
+
+/*
+ * Fill buf0 and buf1 (arrays of 32-bit ints, `size` elements each) with the
+ * same pseudo-random sequence, every value lying in [min, max].
+ *
+ * The 32-bit output of rnd() is scaled down by dividing it by
+ * UINT32_MAX / span + 1, which yields a value in [0, span - 1].
+ * (max) - (min) + 1 must be positive and fit in an int.
+ */
+#define randomize_buffers(buf0, buf1, size, min, max)                         \
+    do {                                                                      \
+        const uint32_t span_ = (uint32_t)((max) - (min) + 1);                 \
+        for (int k_ = 0; k_ < (size); ++k_) {                                 \
+            const uint32_t r_ = rnd();                                        \
+            const int32_t  a_ = (min) + (int32_t)(r_ / (UINT32_MAX / span_ + 1)); \
+            AV_WN32A((buf0) + k_, a_);                                        \
+            AV_WN32A((buf1) + k_, a_);                                        \
+        }                                                                     \
+    } while (0)
+
+/* Transform names used to build the reported function name; indexed by enum TxType. */
+static const char *const itx_str[N_TX_TYPE] = {
+    "dct2", // DCT2
+    "dst7", // DST7
+    "dct8", // DCT8
+};
+
+/* Smallest log2 block dimension exercised for each transform type. */
+static const int itx_log2_min_size[N_TX_TYPE] = {
+    1, // DCT2
+    2, // DST7
+    2, // DCT8
+};
+
+/* Largest log2 block dimension exercised for each transform type. */
+static const int itx_log2_max_size[N_TX_TYPE] = {
+    6, // DCT2
+    5, // DST7
+    5, // DCT8
+};
+
+/*
+ * Compare the reference and optimised 2-D inverse transforms for one
+ * (horizontal, vertical) transform pair over every width/height combination
+ * permitted by the size tables above.
+ *
+ * h          initialised DSP context whose itx table is tested
+ * trh, trv   horizontal / vertical transform types
+ * bit_depth  only used to label the function in the report
+ */
+static void check_itx(const VVCDSPContext *h, enum TxType trh, enum TxType trv, int bit_depth)
+{
+    // @TODO: test extended precision (log2_transform_range != 15)
+    const int log2_transform_range = 15;
+    const int coeff_min = -(1 << log2_transform_range);
+    const int coeff_max =  (1 << log2_transform_range) - 1;
+
+    LOCAL_ALIGNED_32(int16_t, ref_dst, [BUF_SIZE]);
+    LOCAL_ALIGNED_32(int16_t, new_dst, [BUF_SIZE]);
+    LOCAL_ALIGNED_32(int, ref_src, [BUF_SIZE]);
+    LOCAL_ALIGNED_32(int, new_src, [BUF_SIZE]);
+
+    for (int log2_width = itx_log2_min_size[trh]; log2_width <= itx_log2_max_size[trh]; ++log2_width) {
+        const int width = 1 << log2_width;
+        for (int log2_height = itx_log2_min_size[trv]; log2_height <= itx_log2_max_size[trv]; ++log2_height) {
+            const int height = 1 << log2_height;
+
+            declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *dst, const int *src,
+                              int nzw, int log2_transform_range);
+
+            randomize_buffers(ref_src, new_src, BUF_SIZE, coeff_min, coeff_max);
+            // memset() takes a byte count; the buffers hold BUF_SIZE int16_t elements.
+            memset(ref_dst, 0, BUF_SIZE * sizeof(*ref_dst));
+            memset(new_dst, 0, BUF_SIZE * sizeof(*new_dst));
+
+            // NOTE(review): the x86 init fills this table using TX_SIZE_<n>
+            // enumerators while it is indexed with raw log2 sizes here;
+            // confirm the two indexing schemes agree.
+            // @TODO: test nzw != width
+            if (check_func(h->itx.itx[trh][trv][log2_width][log2_height],
+                           "inv_%s_%s_%dx%d_%d",
+                           itx_str[trh], itx_str[trv], width, height, bit_depth)) {
+                call_ref(ref_dst, ref_src, width, log2_transform_range);
+                call_new(new_dst, new_src, width, log2_transform_range);
+                checkasm_check_int16_t("vvc_itx_1d.asm", 0,
+                                       ref_dst, width * sizeof(*ref_dst),
+                                       new_dst, width * sizeof(*new_dst),
+                                       width, height, "dst");
+                // Benchmark only a function that check_func() actually selected.
+                bench_new(new_dst, new_src, width, log2_transform_range);
+            }
+        }
+    }
+}
+
+/* checkasm entry point: test the DCT2xDCT2 inverse transforms at 8 and 10 bits. */
+void checkasm_check_vvc_itx(void)
+{
+    static const int bit_depths[] = { 8, 10 };
+    VVCDSPContext h;
+
+    for (size_t i = 0; i < sizeof(bit_depths) / sizeof(bit_depths[0]); i++) {
+        ff_vvc_dsp_init(&h, bit_depths[i]);
+        check_itx(&h, DCT2, DCT2, bit_depths[i]);
+    }
+    report("idct2");
+}