diff --git a/libavcodec/aarch64/vvcdsp_init_aarch64.c b/libavcodec/aarch64/vvcdsp_init_aarch64.c index a2c20e8d568..e4e2054164b 100644 --- a/libavcodec/aarch64/vvcdsp_init_aarch64.c +++ b/libavcodec/aarch64/vvcdsp_init_aarch64.c @@ -34,7 +34,7 @@ void ff_vvc_sao_edge_filter_16x16_8_neon(uint8_t *dst, const uint8_t *src, ptrdi void ff_vvc_sao_edge_filter_8x8_8_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride_dst, const int16_t *sao_offset_val, int eo, int width, int height); -av_cold void ff_vvc_dsp_init_aarch64(VVCDSPContext *c, const int bit_depth) { +av_cold void ff_vvc_dsp_init_aarch64(VVCDSPContext *c, const int bit_depth, int extended_precision_flag) { if (!have_neon(av_get_cpu_flags())) return; if (bit_depth == 8) { c->sao.band_filter[0] = diff --git a/libavcodec/vvc/vvcdec.c b/libavcodec/vvc/vvcdec.c index c85b7fac8ca..88f4afc8e62 100644 --- a/libavcodec/vvc/vvcdec.c +++ b/libavcodec/vvc/vvcdec.c @@ -717,6 +717,7 @@ static av_cold int frame_context_init(VVCFrameContext *fc, AVCodecContext *avctx static int frame_context_setup(VVCFrameContext *fc, VVCContext *s) { + const VVCSPS *sps = fc->ps.sps; int ret = 0; // copy refs from the last frame @@ -740,7 +741,8 @@ static int frame_context_setup(VVCFrameContext *fc, VVCContext *s) ret = pic_arrays_init(s, fc); if (ret < 0) goto fail; - ff_vvc_dsp_init(&fc->vvcdsp, fc->ps.sps->bit_depth); + ff_vvc_dsp_init(&fc->vvcdsp, sps->bit_depth, + sps->r->sps_extended_precision_flag); ff_videodsp_init(&fc->vdsp, fc->ps.sps->bit_depth); fail: diff --git a/libavcodec/vvc/vvcdsp.c b/libavcodec/vvc/vvcdsp.c index 1056cb8ff9f..a5c2648275b 100644 --- a/libavcodec/vvc/vvcdsp.c +++ b/libavcodec/vvc/vvcdsp.c @@ -295,7 +295,8 @@ typedef struct IntraEdgeParams { #include "vvcdsp_template.c" #undef BIT_DEPTH -void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth) +void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth, + int extended_precision_flag) { #undef FUNC #define FUNC(a, depth) a ## _ ## depth @@ -321,8
+322,8 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth) break; } #if ARCH_X86 - ff_vvc_dsp_init_x86(vvcdsp, bit_depth); + ff_vvc_dsp_init_x86(vvcdsp, bit_depth, extended_precision_flag); #elif ARCH_AARCH64 - ff_vvc_dsp_init_aarch64(vvcdsp, bit_depth); + ff_vvc_dsp_init_aarch64(vvcdsp, bit_depth, extended_precision_flag); #endif } diff --git a/libavcodec/vvc/vvcdsp.h b/libavcodec/vvc/vvcdsp.h index cb1c7c33121..fdcd7a24fb4 100644 --- a/libavcodec/vvc/vvcdsp.h +++ b/libavcodec/vvc/vvcdsp.h @@ -166,13 +166,14 @@ typedef struct VVCDSPContext { VVCALFDSPContext alf; } VVCDSPContext; -void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth); +void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth, + int extended_precision_flag); extern const int8_t ff_vvc_chroma_filters[3][32][4]; extern const int8_t ff_vvc_luma_filters[3][16][8]; extern const int8_t ff_vvc_dmvr_filters[16][2]; -void ff_vvc_dsp_init_x86(VVCDSPContext *c, const int bit_depth); -void ff_vvc_dsp_init_aarch64(VVCDSPContext *c, const int bit_depth); +void ff_vvc_dsp_init_x86(VVCDSPContext *c, const int bit_depth, int extended_precision_flag); +void ff_vvc_dsp_init_aarch64(VVCDSPContext *c, const int bit_depth, int extended_precision_flag); #endif /* AVCODEC_VVCDSP_H */ diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 71a1cdf63e7..06939194c04 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -206,5 +206,6 @@ X86ASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9intrapred.o \ X86ASM-OBJS-$(CONFIG_VVC_DECODER) += x86/vvc_alf.o \ x86/vvc_sao.o \ x86/vvc_sao_10bit.o \ - x86/vvc_mc.o + x86/vvc_mc.o \ + x86/vvc_itx_1d.o X86ASM-OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp.o diff --git a/libavcodec/x86/vvc_itx_1d.asm b/libavcodec/x86/vvc_itx_1d.asm new file mode 100644 index 00000000000..1256de0de62 --- /dev/null +++ b/libavcodec/x86/vvc_itx_1d.asm @@ -0,0 +1,1142 @@ +;****************************************************************************** +;* +;* SIMD-optimized inverse 
transform functions for VVC decoding +;* +;* Copyright (c) 2023 Frank Plowman +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;* +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +const vvc_dct2_4_even_mat, dw 64, 64, 64, -64, 64, -64, 64, 64 + +const vvc_dct2_4_odd_mat, dw 83, 36, 36, -83, -36, 83, -83, -36 + +%define matvec_mul_4_permute(m11, m12, m13, m14, \ + m21, m22, m23, m24, \ + m31, m32, m33, m34, \ + m41, m42, m43, m44) \ + m11, m12, m21, m22, m31, m32, m41, m42, \ + m13, m14, m23, m24, m33, m34, m43, m44 + +%define dct2_8_odd_mat_permute(m11, m12, m13, m14, \ + m21, m22, m23, m24, \ + m31, m32, m33, m34, \ + m41, m42, m43, m44) \ + m41, m42, m43, m44, \ + m31, m32, m33, m34, \ + m21, m22, m23, m24, \ + m11, m12, m13, m14 + +const vvc_dct2_8_odd_mat, dw matvec_mul_4_permute(dct2_8_odd_mat_permute( \ + -18, 50, -75, 89, \ + -50, 89, -18, -75, \ + -75, 18, 89, 50, \ + -89, -75, -50, -18)) + +%define matvec_mul_8_permute(m11, m12, m13, m14, m15, m16, m17, m18, \ + m21, m22, m23, m24, m25, m26, m27, m28, \ + m31, m32, m33, m34, m35, m36, m37, m38, \ + m41, m42, m43, m44, m45, m46, m47, m48, \ + m51, m52, m53, m54, m55, m56, m57, m58, \ + m61, m62, m63, m64, 
m65, m66, m67, m68, \ + m71, m72, m73, m74, m75, m76, m77, m78, \ + m81, m82, m83, m84, m85, m86, m87, m88) \ + m11, m12, m21, m22, m31, m32, m41, m42, \ + m51, m52, m61, m62, m71, m72, m81, m82, \ + m13, m14, m23, m24, m33, m34, m43, m44, \ + m53, m54, m63, m64, m73, m74, m83, m84, \ + m15, m16, m25, m26, m35, m36, m45, m46, \ + m55, m56, m65, m66, m75, m76, m85, m86, \ + m17, m18, m27, m28, m37, m38, m47, m48, \ + m57, m58, m67, m68, m77, m78, m87, m88 + +%define dct2_16_odd_mat_permute(m11, m12, m13, m14, m15, m16, m17, m18, \ + m21, m22, m23, m24, m25, m26, m27, m28, \ + m31, m32, m33, m34, m35, m36, m37, m38, \ + m41, m42, m43, m44, m45, m46, m47, m48, \ + m51, m52, m53, m54, m55, m56, m57, m58, \ + m61, m62, m63, m64, m65, m66, m67, m68, \ + m71, m72, m73, m74, m75, m76, m77, m78, \ + m81, m82, m83, m84, m85, m86, m87, m88) \ + m11, m12, m13, m14, m15, m16, m17, m18, \ + m21, m22, m23, m24, m25, m26, m27, m28, \ + m31, m32, m33, m34, m35, m36, m37, m38, \ + m41, m42, m43, m44, m45, m46, m47, m48, \ + m81, m82, m83, m84, m85, m86, m87, m88, \ + m71, m72, m73, m74, m75, m76, m77, m78, \ + m61, m62, m63, m64, m65, m66, m67, m68, \ + m51, m52, m53, m54, m55, m56, m57, m58 + +const vvc_dct2_16_odd_mat, dw matvec_mul_8_permute(dct2_16_odd_mat_permute( \ + -9, 25, -43, 57, -70, 80, -87, 90, \ + -25, 70, -90, 80, -43, -9, 57, -87, \ + -43, 90, -57, -25, 87, -70, -9, 80, \ + -57, 80, 25, -90, 9, 87, -43, -70, \ + -70, 43, 87, -9, -90, -25, 80, 57, \ + -80, -9, 70, 87, 25, -57, -90, -43, \ + -87, -57, -9, 43, 80, 90, 70, 25, \ + -90, -87, -80, -70, -57, -43, -25, -9)) + +%define matvec_mul_16_permute(m1_1, m1_2, m1_3, m1_4, m1_5, m1_6, m1_7, m1_8, m1_9, m1_10, m1_11, m1_12, m1_13, m1_14, m1_15, m1_16, \ + m2_1, m2_2, m2_3, m2_4, m2_5, m2_6, m2_7, m2_8, m2_9, m2_10, m2_11, m2_12, m2_13, m2_14, m2_15, m2_16, \ + m3_1, m3_2, m3_3, m3_4, m3_5, m3_6, m3_7, m3_8, m3_9, m3_10, m3_11, m3_12, m3_13, m3_14, m3_15, m3_16, \ + m4_1, m4_2, m4_3, m4_4, m4_5, m4_6, m4_7, m4_8, m4_9, 
m4_10, m4_11, m4_12, m4_13, m4_14, m4_15, m4_16, \ + m5_1, m5_2, m5_3, m5_4, m5_5, m5_6, m5_7, m5_8, m5_9, m5_10, m5_11, m5_12, m5_13, m5_14, m5_15, m5_16, \ + m6_1, m6_2, m6_3, m6_4, m6_5, m6_6, m6_7, m6_8, m6_9, m6_10, m6_11, m6_12, m6_13, m6_14, m6_15, m6_16, \ + m7_1, m7_2, m7_3, m7_4, m7_5, m7_6, m7_7, m7_8, m7_9, m7_10, m7_11, m7_12, m7_13, m7_14, m7_15, m7_16, \ + m8_1, m8_2, m8_3, m8_4, m8_5, m8_6, m8_7, m8_8, m8_9, m8_10, m8_11, m8_12, m8_13, m8_14, m8_15, m8_16, \ + m9_1, m9_2, m9_3, m9_4, m9_5, m9_6, m9_7, m9_8, m9_9, m9_10, m9_11, m9_12, m9_13, m9_14, m9_15, m9_16, \ + m10_1, m10_2, m10_3, m10_4, m10_5, m10_6, m10_7, m10_8, m10_9, m10_10, m10_11, m10_12, m10_13, m10_14, m10_15, m10_16, \ + m11_1, m11_2, m11_3, m11_4, m11_5, m11_6, m11_7, m11_8, m11_9, m11_10, m11_11, m11_12, m11_13, m11_14, m11_15, m11_16, \ + m12_1, m12_2, m12_3, m12_4, m12_5, m12_6, m12_7, m12_8, m12_9, m12_10, m12_11, m12_12, m12_13, m12_14, m12_15, m12_16, \ + m13_1, m13_2, m13_3, m13_4, m13_5, m13_6, m13_7, m13_8, m13_9, m13_10, m13_11, m13_12, m13_13, m13_14, m13_15, m13_16, \ + m14_1, m14_2, m14_3, m14_4, m14_5, m14_6, m14_7, m14_8, m14_9, m14_10, m14_11, m14_12, m14_13, m14_14, m14_15, m14_16, \ + m15_1, m15_2, m15_3, m15_4, m15_5, m15_6, m15_7, m15_8, m15_9, m15_10, m15_11, m15_12, m15_13, m15_14, m15_15, m15_16, \ + m16_1, m16_2, m16_3, m16_4, m16_5, m16_6, m16_7, m16_8, m16_9, m16_10, m16_11, m16_12, m16_13, m16_14, m16_15, m16_16) \ + m1_1, m1_2, m2_1, m2_2, m3_1, m3_2, m4_1, m4_2, \ + m5_1, m5_2, m6_1, m6_2, m7_1, m7_2, m8_1, m8_2, \ + m9_1, m9_2, m10_1, m10_2, m11_1, m11_2, m12_1, m12_2, \ + m13_1, m13_2, m14_1, m14_2, m15_1, m15_2, m16_1, m16_2, \ + m1_3, m1_4, m2_3, m2_4, m3_3, m3_4, m4_3, m4_4, \ + m5_3, m5_4, m6_3, m6_4, m7_3, m7_4, m8_3, m8_4, \ + m9_3, m9_4, m10_3, m10_4, m11_3, m11_4, m12_3, m12_4, \ + m13_3, m13_4, m14_3, m14_4, m15_3, m15_4, m16_3, m16_4, \ + m1_5, m1_6, m2_5, m2_6, m3_5, m3_6, m4_5, m4_6, \ + m5_5, m5_6, m6_5, m6_6, m7_5, m7_6, m8_5, m8_6, \ + 
m9_5, m9_6, m10_5, m10_6, m11_5, m11_6, m12_5, m12_6, \ + m13_5, m13_6, m14_5, m14_6, m15_5, m15_6, m16_5, m16_6, \ + m1_7, m1_8, m2_7, m2_8, m3_7, m3_8, m4_7, m4_8, \ + m5_7, m5_8, m6_7, m6_8, m7_7, m7_8, m8_7, m8_8, \ + m9_7, m9_8, m10_7, m10_8, m11_7, m11_8, m12_7, m12_8, \ + m13_7, m13_8, m14_7, m14_8, m15_7, m15_8, m16_7, m16_8, \ + m1_9, m1_10, m2_9, m2_10, m3_9, m3_10, m4_9, m4_10, \ + m5_9, m5_10, m6_9, m6_10, m7_9, m7_10, m8_9, m8_10, \ + m9_9, m9_10, m10_9, m10_10, m11_9, m11_10, m12_9, m12_10, \ + m13_9, m13_10, m14_9, m14_10, m15_9, m15_10, m16_9, m16_10, \ + m1_11, m1_12, m2_11, m2_12, m3_11, m3_12, m4_11, m4_12, \ + m5_11, m5_12, m6_11, m6_12, m7_11, m7_12, m8_11, m8_12, \ + m9_11, m9_12, m10_11, m10_12, m11_11, m11_12, m12_11, m12_12, \ + m13_11, m13_12, m14_11, m14_12, m15_11, m15_12, m16_11, m16_12, \ + m1_13, m1_14, m2_13, m2_14, m3_13, m3_14, m4_13, m4_14, \ + m5_13, m5_14, m6_13, m6_14, m7_13, m7_14, m8_13, m8_14, \ + m9_13, m9_14, m10_13, m10_14, m11_13, m11_14, m12_13, m12_14, \ + m13_13, m13_14, m14_13, m14_14, m15_13, m15_14, m16_13, m16_14, \ + m1_15, m1_16, m2_15, m2_16, m3_15, m3_16, m4_15, m4_16, \ + m5_15, m5_16, m6_15, m6_16, m7_15, m7_16, m8_15, m8_16, \ + m9_15, m9_16, m10_15, m10_16, m11_15, m11_16, m12_15, m12_16, \ + m13_15, m13_16, m14_15, m14_16, m15_15, m15_16, m16_15, m16_16 + +%define dct2_32_odd_mat_permute(m1_1, m1_2, m1_3, m1_4, m1_5, m1_6, m1_7, m1_8, m1_9, m1_10, m1_11, m1_12, m1_13, m1_14, m1_15, m1_16, \ + m2_1, m2_2, m2_3, m2_4, m2_5, m2_6, m2_7, m2_8, m2_9, m2_10, m2_11, m2_12, m2_13, m2_14, m2_15, m2_16, \ + m3_1, m3_2, m3_3, m3_4, m3_5, m3_6, m3_7, m3_8, m3_9, m3_10, m3_11, m3_12, m3_13, m3_14, m3_15, m3_16, \ + m4_1, m4_2, m4_3, m4_4, m4_5, m4_6, m4_7, m4_8, m4_9, m4_10, m4_11, m4_12, m4_13, m4_14, m4_15, m4_16, \ + m5_1, m5_2, m5_3, m5_4, m5_5, m5_6, m5_7, m5_8, m5_9, m5_10, m5_11, m5_12, m5_13, m5_14, m5_15, m5_16, \ + m6_1, m6_2, m6_3, m6_4, m6_5, m6_6, m6_7, m6_8, m6_9, m6_10, m6_11, m6_12, m6_13, m6_14, 
m6_15, m6_16, \ + m7_1, m7_2, m7_3, m7_4, m7_5, m7_6, m7_7, m7_8, m7_9, m7_10, m7_11, m7_12, m7_13, m7_14, m7_15, m7_16, \ + m8_1, m8_2, m8_3, m8_4, m8_5, m8_6, m8_7, m8_8, m8_9, m8_10, m8_11, m8_12, m8_13, m8_14, m8_15, m8_16, \ + m9_1, m9_2, m9_3, m9_4, m9_5, m9_6, m9_7, m9_8, m9_9, m9_10, m9_11, m9_12, m9_13, m9_14, m9_15, m9_16, \ + m10_1, m10_2, m10_3, m10_4, m10_5, m10_6, m10_7, m10_8, m10_9, m10_10, m10_11, m10_12, m10_13, m10_14, m10_15, m10_16, \ + m11_1, m11_2, m11_3, m11_4, m11_5, m11_6, m11_7, m11_8, m11_9, m11_10, m11_11, m11_12, m11_13, m11_14, m11_15, m11_16, \ + m12_1, m12_2, m12_3, m12_4, m12_5, m12_6, m12_7, m12_8, m12_9, m12_10, m12_11, m12_12, m12_13, m12_14, m12_15, m12_16, \ + m13_1, m13_2, m13_3, m13_4, m13_5, m13_6, m13_7, m13_8, m13_9, m13_10, m13_11, m13_12, m13_13, m13_14, m13_15, m13_16, \ + m14_1, m14_2, m14_3, m14_4, m14_5, m14_6, m14_7, m14_8, m14_9, m14_10, m14_11, m14_12, m14_13, m14_14, m14_15, m14_16, \ + m15_1, m15_2, m15_3, m15_4, m15_5, m15_6, m15_7, m15_8, m15_9, m15_10, m15_11, m15_12, m15_13, m15_14, m15_15, m15_16, \ + m16_1, m16_2, m16_3, m16_4, m16_5, m16_6, m16_7, m16_8, m16_9, m16_10, m16_11, m16_12, m16_13, m16_14, m16_15, m16_16) \ + m1_1, m1_2, m1_3, m1_4, m1_5, m1_6, m1_7, m1_8, m1_9, m1_10, m1_11, m1_12, m1_13, m1_14, m1_15, m1_16, \ + m2_1, m2_2, m2_3, m2_4, m2_5, m2_6, m2_7, m2_8, m2_9, m2_10, m2_11, m2_12, m2_13, m2_14, m2_15, m2_16, \ + m3_1, m3_2, m3_3, m3_4, m3_5, m3_6, m3_7, m3_8, m3_9, m3_10, m3_11, m3_12, m3_13, m3_14, m3_15, m3_16, \ + m4_1, m4_2, m4_3, m4_4, m4_5, m4_6, m4_7, m4_8, m4_9, m4_10, m4_11, m4_12, m4_13, m4_14, m4_15, m4_16, \ + m8_1, m8_2, m8_3, m8_4, m8_5, m8_6, m8_7, m8_8, m8_9, m8_10, m8_11, m8_12, m8_13, m8_14, m8_15, m8_16, \ + m7_1, m7_2, m7_3, m7_4, m7_5, m7_6, m7_7, m7_8, m7_9, m7_10, m7_11, m7_12, m7_13, m7_14, m7_15, m7_16, \ + m6_1, m6_2, m6_3, m6_4, m6_5, m6_6, m6_7, m6_8, m6_9, m6_10, m6_11, m6_12, m6_13, m6_14, m6_15, m6_16, \ + m5_1, m5_2, m5_3, m5_4, m5_5, m5_6, m5_7, m5_8, 
m5_9, m5_10, m5_11, m5_12, m5_13, m5_14, m5_15, m5_16, \ + m9_1, m9_2, m9_3, m9_4, m9_5, m9_6, m9_7, m9_8, m9_9, m9_10, m9_11, m9_12, m9_13, m9_14, m9_15, m9_16, \ + m10_1, m10_2, m10_3, m10_4, m10_5, m10_6, m10_7, m10_8, m10_9, m10_10, m10_11, m10_12, m10_13, m10_14, m10_15, m10_16, \ + m11_1, m11_2, m11_3, m11_4, m11_5, m11_6, m11_7, m11_8, m11_9, m11_10, m11_11, m11_12, m11_13, m11_14, m11_15, m11_16, \ + m12_1, m12_2, m12_3, m12_4, m12_5, m12_6, m12_7, m12_8, m12_9, m12_10, m12_11, m12_12, m12_13, m12_14, m12_15, m12_16, \ + m16_1, m16_2, m16_3, m16_4, m16_5, m16_6, m16_7, m16_8, m16_9, m16_10, m16_11, m16_12, m16_13, m16_14, m16_15, m16_16, \ + m15_1, m15_2, m15_3, m15_4, m15_5, m15_6, m15_7, m15_8, m15_9, m15_10, m15_11, m15_12, m15_13, m15_14, m15_15, m15_16, \ + m14_1, m14_2, m14_3, m14_4, m14_5, m14_6, m14_7, m14_8, m14_9, m14_10, m14_11, m14_12, m14_13, m14_14, m14_15, m14_16, \ + m13_1, m13_2, m13_3, m13_4, m13_5, m13_6, m13_7, m13_8, m13_9, m13_10, m13_11, m13_12, m13_13, m13_14, m13_15, m13_16 + +const vvc_dct2_32_odd_mat, dw matvec_mul_16_permute(dct2_32_odd_mat_permute( \ + -4, 13, -22, 31, -38, 46, -54, 61, -67, 73, -78, 82, -85, 88, -90, 90, \ + -13, 38, -61, 78, -88, 90, -85, 73, -54, 31, -4, -22, 46, -67, 82, -90, \ + -22, 61, -85, 90, -73, 38, 4, -46, 78, -90, 82, -54, 13, 31, -67, 88, \ + -31, 78, -90, 61, -4, -54, 88, -82, 38, 22, -73, 90, -67, 13, 46, -85, \ + -38, 88, -73, 4, 67, -90, 46, 31, -85, 78, -13, -61, 90, -54, -22, 82, \ + -46, 90, -38, -54, 90, -31, -61, 88, -22, -67, 85, -13, -73, 82, -4, -78, \ + -54, 85, 4, -88, 46, 61, -82, -13, 90, -38, -67, 78, 22, -90, 31, 73, \ + -61, 73, 46, -82, -31, 88, 13, -90, 4, 90, -22, -85, 38, 78, -54, -67, \ + -67, 54, 78, -38, -85, 22, 90, -4, -90, -13, 88, 31, -82, -46, 73, 61, \ + -73, 31, 90, 22, -78, -67, 38, 90, 13, -82, -61, 46, 88, 4, -85, -54, \ + -78, 4, 82, 73, -13, -85, -67, 22, 88, 61, -31, -90, -54, 38, 90, 46, \ + -82, -22, 54, 90, 61, -13, -78, -85, -31, 46, 90, 67, -4, -73, -88, 
-38, \ + -85, -46, 13, 67, 90, 73, 22, -38, -82, -88, -54, 4, 61, 90, 78, 31, \ + -88, -67, -31, 13, 54, 82, 90, 78, 46, 4, -38, -73, -90, -85, -61, -22, \ + -90, -82, -67, -46, -22, 4, 31, 54, 73, 85, 90, 88, 78, 61, 38, 13, \ + -90, -90, -88, -85, -82, -78, -73, -67, -61, -54, -46, -38, -31, -22, -13, -4)) + +%define matvec_mul_32_permute(m1_1, m1_2, m1_3, m1_4, m1_5, m1_6, m1_7, m1_8, m1_9, m1_10, m1_11, m1_12, m1_13, m1_14, m1_15, m1_16, m1_17, m1_18, m1_19, m1_20, m1_21, m1_22, m1_23, m1_24, m1_25, m1_26, m1_27, m1_28, m1_29, m1_30, m1_31, m1_32, \ + m2_1, m2_2, m2_3, m2_4, m2_5, m2_6, m2_7, m2_8, m2_9, m2_10, m2_11, m2_12, m2_13, m2_14, m2_15, m2_16, m2_17, m2_18, m2_19, m2_20, m2_21, m2_22, m2_23, m2_24, m2_25, m2_26, m2_27, m2_28, m2_29, m2_30, m2_31, m2_32, \ + m3_1, m3_2, m3_3, m3_4, m3_5, m3_6, m3_7, m3_8, m3_9, m3_10, m3_11, m3_12, m3_13, m3_14, m3_15, m3_16, m3_17, m3_18, m3_19, m3_20, m3_21, m3_22, m3_23, m3_24, m3_25, m3_26, m3_27, m3_28, m3_29, m3_30, m3_31, m3_32, \ + m4_1, m4_2, m4_3, m4_4, m4_5, m4_6, m4_7, m4_8, m4_9, m4_10, m4_11, m4_12, m4_13, m4_14, m4_15, m4_16, m4_17, m4_18, m4_19, m4_20, m4_21, m4_22, m4_23, m4_24, m4_25, m4_26, m4_27, m4_28, m4_29, m4_30, m4_31, m4_32, \ + m5_1, m5_2, m5_3, m5_4, m5_5, m5_6, m5_7, m5_8, m5_9, m5_10, m5_11, m5_12, m5_13, m5_14, m5_15, m5_16, m5_17, m5_18, m5_19, m5_20, m5_21, m5_22, m5_23, m5_24, m5_25, m5_26, m5_27, m5_28, m5_29, m5_30, m5_31, m5_32, \ + m6_1, m6_2, m6_3, m6_4, m6_5, m6_6, m6_7, m6_8, m6_9, m6_10, m6_11, m6_12, m6_13, m6_14, m6_15, m6_16, m6_17, m6_18, m6_19, m6_20, m6_21, m6_22, m6_23, m6_24, m6_25, m6_26, m6_27, m6_28, m6_29, m6_30, m6_31, m6_32, \ + m7_1, m7_2, m7_3, m7_4, m7_5, m7_6, m7_7, m7_8, m7_9, m7_10, m7_11, m7_12, m7_13, m7_14, m7_15, m7_16, m7_17, m7_18, m7_19, m7_20, m7_21, m7_22, m7_23, m7_24, m7_25, m7_26, m7_27, m7_28, m7_29, m7_30, m7_31, m7_32, \ + m8_1, m8_2, m8_3, m8_4, m8_5, m8_6, m8_7, m8_8, m8_9, m8_10, m8_11, m8_12, m8_13, m8_14, m8_15, m8_16, m8_17, m8_18, m8_19, 
m8_20, m8_21, m8_22, m8_23, m8_24, m8_25, m8_26, m8_27, m8_28, m8_29, m8_30, m8_31, m8_32, \ + m9_1, m9_2, m9_3, m9_4, m9_5, m9_6, m9_7, m9_8, m9_9, m9_10, m9_11, m9_12, m9_13, m9_14, m9_15, m9_16, m9_17, m9_18, m9_19, m9_20, m9_21, m9_22, m9_23, m9_24, m9_25, m9_26, m9_27, m9_28, m9_29, m9_30, m9_31, m9_32, \ + m10_1, m10_2, m10_3, m10_4, m10_5, m10_6, m10_7, m10_8, m10_9, m10_10, m10_11, m10_12, m10_13, m10_14, m10_15, m10_16, m10_17, m10_18, m10_19, m10_20, m10_21, m10_22, m10_23, m10_24, m10_25, m10_26, m10_27, m10_28, m10_29, m10_30, m10_31, m10_32, \ + m11_1, m11_2, m11_3, m11_4, m11_5, m11_6, m11_7, m11_8, m11_9, m11_10, m11_11, m11_12, m11_13, m11_14, m11_15, m11_16, m11_17, m11_18, m11_19, m11_20, m11_21, m11_22, m11_23, m11_24, m11_25, m11_26, m11_27, m11_28, m11_29, m11_30, m11_31, m11_32, \ + m12_1, m12_2, m12_3, m12_4, m12_5, m12_6, m12_7, m12_8, m12_9, m12_10, m12_11, m12_12, m12_13, m12_14, m12_15, m12_16, m12_17, m12_18, m12_19, m12_20, m12_21, m12_22, m12_23, m12_24, m12_25, m12_26, m12_27, m12_28, m12_29, m12_30, m12_31, m12_32, \ + m13_1, m13_2, m13_3, m13_4, m13_5, m13_6, m13_7, m13_8, m13_9, m13_10, m13_11, m13_12, m13_13, m13_14, m13_15, m13_16, m13_17, m13_18, m13_19, m13_20, m13_21, m13_22, m13_23, m13_24, m13_25, m13_26, m13_27, m13_28, m13_29, m13_30, m13_31, m13_32, \ + m14_1, m14_2, m14_3, m14_4, m14_5, m14_6, m14_7, m14_8, m14_9, m14_10, m14_11, m14_12, m14_13, m14_14, m14_15, m14_16, m14_17, m14_18, m14_19, m14_20, m14_21, m14_22, m14_23, m14_24, m14_25, m14_26, m14_27, m14_28, m14_29, m14_30, m14_31, m14_32, \ + m15_1, m15_2, m15_3, m15_4, m15_5, m15_6, m15_7, m15_8, m15_9, m15_10, m15_11, m15_12, m15_13, m15_14, m15_15, m15_16, m15_17, m15_18, m15_19, m15_20, m15_21, m15_22, m15_23, m15_24, m15_25, m15_26, m15_27, m15_28, m15_29, m15_30, m15_31, m15_32, \ + m16_1, m16_2, m16_3, m16_4, m16_5, m16_6, m16_7, m16_8, m16_9, m16_10, m16_11, m16_12, m16_13, m16_14, m16_15, m16_16, m16_17, m16_18, m16_19, m16_20, m16_21, m16_22, m16_23, 
m16_24, m16_25, m16_26, m16_27, m16_28, m16_29, m16_30, m16_31, m16_32, \ + m17_1, m17_2, m17_3, m17_4, m17_5, m17_6, m17_7, m17_8, m17_9, m17_10, m17_11, m17_12, m17_13, m17_14, m17_15, m17_16, m17_17, m17_18, m17_19, m17_20, m17_21, m17_22, m17_23, m17_24, m17_25, m17_26, m17_27, m17_28, m17_29, m17_30, m17_31, m17_32, \ + m18_1, m18_2, m18_3, m18_4, m18_5, m18_6, m18_7, m18_8, m18_9, m18_10, m18_11, m18_12, m18_13, m18_14, m18_15, m18_16, m18_17, m18_18, m18_19, m18_20, m18_21, m18_22, m18_23, m18_24, m18_25, m18_26, m18_27, m18_28, m18_29, m18_30, m18_31, m18_32, \ + m19_1, m19_2, m19_3, m19_4, m19_5, m19_6, m19_7, m19_8, m19_9, m19_10, m19_11, m19_12, m19_13, m19_14, m19_15, m19_16, m19_17, m19_18, m19_19, m19_20, m19_21, m19_22, m19_23, m19_24, m19_25, m19_26, m19_27, m19_28, m19_29, m19_30, m19_31, m19_32, \ + m20_1, m20_2, m20_3, m20_4, m20_5, m20_6, m20_7, m20_8, m20_9, m20_10, m20_11, m20_12, m20_13, m20_14, m20_15, m20_16, m20_17, m20_18, m20_19, m20_20, m20_21, m20_22, m20_23, m20_24, m20_25, m20_26, m20_27, m20_28, m20_29, m20_30, m20_31, m20_32, \ + m21_1, m21_2, m21_3, m21_4, m21_5, m21_6, m21_7, m21_8, m21_9, m21_10, m21_11, m21_12, m21_13, m21_14, m21_15, m21_16, m21_17, m21_18, m21_19, m21_20, m21_21, m21_22, m21_23, m21_24, m21_25, m21_26, m21_27, m21_28, m21_29, m21_30, m21_31, m21_32, \ + m22_1, m22_2, m22_3, m22_4, m22_5, m22_6, m22_7, m22_8, m22_9, m22_10, m22_11, m22_12, m22_13, m22_14, m22_15, m22_16, m22_17, m22_18, m22_19, m22_20, m22_21, m22_22, m22_23, m22_24, m22_25, m22_26, m22_27, m22_28, m22_29, m22_30, m22_31, m22_32, \ + m23_1, m23_2, m23_3, m23_4, m23_5, m23_6, m23_7, m23_8, m23_9, m23_10, m23_11, m23_12, m23_13, m23_14, m23_15, m23_16, m23_17, m23_18, m23_19, m23_20, m23_21, m23_22, m23_23, m23_24, m23_25, m23_26, m23_27, m23_28, m23_29, m23_30, m23_31, m23_32, \ + m24_1, m24_2, m24_3, m24_4, m24_5, m24_6, m24_7, m24_8, m24_9, m24_10, m24_11, m24_12, m24_13, m24_14, m24_15, m24_16, m24_17, m24_18, m24_19, m24_20, m24_21, m24_22, 
m24_23, m24_24, m24_25, m24_26, m24_27, m24_28, m24_29, m24_30, m24_31, m24_32, \ + m25_1, m25_2, m25_3, m25_4, m25_5, m25_6, m25_7, m25_8, m25_9, m25_10, m25_11, m25_12, m25_13, m25_14, m25_15, m25_16, m25_17, m25_18, m25_19, m25_20, m25_21, m25_22, m25_23, m25_24, m25_25, m25_26, m25_27, m25_28, m25_29, m25_30, m25_31, m25_32, \ + m26_1, m26_2, m26_3, m26_4, m26_5, m26_6, m26_7, m26_8, m26_9, m26_10, m26_11, m26_12, m26_13, m26_14, m26_15, m26_16, m26_17, m26_18, m26_19, m26_20, m26_21, m26_22, m26_23, m26_24, m26_25, m26_26, m26_27, m26_28, m26_29, m26_30, m26_31, m26_32, \ + m27_1, m27_2, m27_3, m27_4, m27_5, m27_6, m27_7, m27_8, m27_9, m27_10, m27_11, m27_12, m27_13, m27_14, m27_15, m27_16, m27_17, m27_18, m27_19, m27_20, m27_21, m27_22, m27_23, m27_24, m27_25, m27_26, m27_27, m27_28, m27_29, m27_30, m27_31, m27_32, \ + m28_1, m28_2, m28_3, m28_4, m28_5, m28_6, m28_7, m28_8, m28_9, m28_10, m28_11, m28_12, m28_13, m28_14, m28_15, m28_16, m28_17, m28_18, m28_19, m28_20, m28_21, m28_22, m28_23, m28_24, m28_25, m28_26, m28_27, m28_28, m28_29, m28_30, m28_31, m28_32, \ + m29_1, m29_2, m29_3, m29_4, m29_5, m29_6, m29_7, m29_8, m29_9, m29_10, m29_11, m29_12, m29_13, m29_14, m29_15, m29_16, m29_17, m29_18, m29_19, m29_20, m29_21, m29_22, m29_23, m29_24, m29_25, m29_26, m29_27, m29_28, m29_29, m29_30, m29_31, m29_32, \ + m30_1, m30_2, m30_3, m30_4, m30_5, m30_6, m30_7, m30_8, m30_9, m30_10, m30_11, m30_12, m30_13, m30_14, m30_15, m30_16, m30_17, m30_18, m30_19, m30_20, m30_21, m30_22, m30_23, m30_24, m30_25, m30_26, m30_27, m30_28, m30_29, m30_30, m30_31, m30_32, \ + m31_1, m31_2, m31_3, m31_4, m31_5, m31_6, m31_7, m31_8, m31_9, m31_10, m31_11, m31_12, m31_13, m31_14, m31_15, m31_16, m31_17, m31_18, m31_19, m31_20, m31_21, m31_22, m31_23, m31_24, m31_25, m31_26, m31_27, m31_28, m31_29, m31_30, m31_31, m31_32, \ + m32_1, m32_2, m32_3, m32_4, m32_5, m32_6, m32_7, m32_8, m32_9, m32_10, m32_11, m32_12, m32_13, m32_14, m32_15, m32_16, m32_17, m32_18, m32_19, m32_20, m32_21, 
m32_22, m32_23, m32_24, m32_25, m32_26, m32_27, m32_28, m32_29, m32_30, m32_31, m32_32) \ + m1_1, m1_2, m2_1, m2_2, m3_1, m3_2, m4_1, m4_2, \ + m5_1, m5_2, m6_1, m6_2, m7_1, m7_2, m8_1, m8_2, \ + m9_1, m9_2, m10_1, m10_2, m11_1, m11_2, m12_1, m12_2, \ + m13_1, m13_2, m14_1, m14_2, m15_1, m15_2, m16_1, m16_2, \ + m17_1, m17_2, m18_1, m18_2, m19_1, m19_2, m20_1, m20_2, \ + m21_1, m21_2, m22_1, m22_2, m23_1, m23_2, m24_1, m24_2, \ + m25_1, m25_2, m26_1, m26_2, m27_1, m27_2, m28_1, m28_2, \ + m29_1, m29_2, m30_1, m30_2, m31_1, m31_2, m32_1, m32_2, \ + m1_3, m1_4, m2_3, m2_4, m3_3, m3_4, m4_3, m4_4, \ + m5_3, m5_4, m6_3, m6_4, m7_3, m7_4, m8_3, m8_4, \ + m9_3, m9_4, m10_3, m10_4, m11_3, m11_4, m12_3, m12_4, \ + m13_3, m13_4, m14_3, m14_4, m15_3, m15_4, m16_3, m16_4, \ + m17_3, m17_4, m18_3, m18_4, m19_3, m19_4, m20_3, m20_4, \ + m21_3, m21_4, m22_3, m22_4, m23_3, m23_4, m24_3, m24_4, \ + m25_3, m25_4, m26_3, m26_4, m27_3, m27_4, m28_3, m28_4, \ + m29_3, m29_4, m30_3, m30_4, m31_3, m31_4, m32_3, m32_4, \ + m1_5, m1_6, m2_5, m2_6, m3_5, m3_6, m4_5, m4_6, \ + m5_5, m5_6, m6_5, m6_6, m7_5, m7_6, m8_5, m8_6, \ + m9_5, m9_6, m10_5, m10_6, m11_5, m11_6, m12_5, m12_6, \ + m13_5, m13_6, m14_5, m14_6, m15_5, m15_6, m16_5, m16_6, \ + m17_5, m17_6, m18_5, m18_6, m19_5, m19_6, m20_5, m20_6, \ + m21_5, m21_6, m22_5, m22_6, m23_5, m23_6, m24_5, m24_6, \ + m25_5, m25_6, m26_5, m26_6, m27_5, m27_6, m28_5, m28_6, \ + m29_5, m29_6, m30_5, m30_6, m31_5, m31_6, m32_5, m32_6, \ + m1_7, m1_8, m2_7, m2_8, m3_7, m3_8, m4_7, m4_8, \ + m5_7, m5_8, m6_7, m6_8, m7_7, m7_8, m8_7, m8_8, \ + m9_7, m9_8, m10_7, m10_8, m11_7, m11_8, m12_7, m12_8, \ + m13_7, m13_8, m14_7, m14_8, m15_7, m15_8, m16_7, m16_8, \ + m17_7, m17_8, m18_7, m18_8, m19_7, m19_8, m20_7, m20_8, \ + m21_7, m21_8, m22_7, m22_8, m23_7, m23_8, m24_7, m24_8, \ + m25_7, m25_8, m26_7, m26_8, m27_7, m27_8, m28_7, m28_8, \ + m29_7, m29_8, m30_7, m30_8, m31_7, m31_8, m32_7, m32_8, \ + m1_9, m1_10, m2_9, m2_10, m3_9, m3_10, m4_9, m4_10, \ + 
m5_9, m5_10, m6_9, m6_10, m7_9, m7_10, m8_9, m8_10, \ + m9_9, m9_10, m10_9, m10_10, m11_9, m11_10, m12_9, m12_10, \ + m13_9, m13_10, m14_9, m14_10, m15_9, m15_10, m16_9, m16_10, \ + m17_9, m17_10, m18_9, m18_10, m19_9, m19_10, m20_9, m20_10, \ + m21_9, m21_10, m22_9, m22_10, m23_9, m23_10, m24_9, m24_10, \ + m25_9, m25_10, m26_9, m26_10, m27_9, m27_10, m28_9, m28_10, \ + m29_9, m29_10, m30_9, m30_10, m31_9, m31_10, m32_9, m32_10, \ + m1_11, m1_12, m2_11, m2_12, m3_11, m3_12, m4_11, m4_12, \ + m5_11, m5_12, m6_11, m6_12, m7_11, m7_12, m8_11, m8_12, \ + m9_11, m9_12, m10_11, m10_12, m11_11, m11_12, m12_11, m12_12, \ + m13_11, m13_12, m14_11, m14_12, m15_11, m15_12, m16_11, m16_12, \ + m17_11, m17_12, m18_11, m18_12, m19_11, m19_12, m20_11, m20_12, \ + m21_11, m21_12, m22_11, m22_12, m23_11, m23_12, m24_11, m24_12, \ + m25_11, m25_12, m26_11, m26_12, m27_11, m27_12, m28_11, m28_12, \ + m29_11, m29_12, m30_11, m30_12, m31_11, m31_12, m32_11, m32_12, \ + m1_13, m1_14, m2_13, m2_14, m3_13, m3_14, m4_13, m4_14, \ + m5_13, m5_14, m6_13, m6_14, m7_13, m7_14, m8_13, m8_14, \ + m9_13, m9_14, m10_13, m10_14, m11_13, m11_14, m12_13, m12_14, \ + m13_13, m13_14, m14_13, m14_14, m15_13, m15_14, m16_13, m16_14, \ + m17_13, m17_14, m18_13, m18_14, m19_13, m19_14, m20_13, m20_14, \ + m21_13, m21_14, m22_13, m22_14, m23_13, m23_14, m24_13, m24_14, \ + m25_13, m25_14, m26_13, m26_14, m27_13, m27_14, m28_13, m28_14, \ + m29_13, m29_14, m30_13, m30_14, m31_13, m31_14, m32_13, m32_14, \ + m1_15, m1_16, m2_15, m2_16, m3_15, m3_16, m4_15, m4_16, \ + m5_15, m5_16, m6_15, m6_16, m7_15, m7_16, m8_15, m8_16, \ + m9_15, m9_16, m10_15, m10_16, m11_15, m11_16, m12_15, m12_16, \ + m13_15, m13_16, m14_15, m14_16, m15_15, m15_16, m16_15, m16_16, \ + m17_15, m17_16, m18_15, m18_16, m19_15, m19_16, m20_15, m20_16, \ + m21_15, m21_16, m22_15, m22_16, m23_15, m23_16, m24_15, m24_16, \ + m25_15, m25_16, m26_15, m26_16, m27_15, m27_16, m28_15, m28_16, \ + m29_15, m29_16, m30_15, m30_16, m31_15, m31_16, 
m32_15, m32_16, \ + m1_17, m1_18, m2_17, m2_18, m3_17, m3_18, m4_17, m4_18, \ + m5_17, m5_18, m6_17, m6_18, m7_17, m7_18, m8_17, m8_18, \ + m9_17, m9_18, m10_17, m10_18, m11_17, m11_18, m12_17, m12_18, \ + m13_17, m13_18, m14_17, m14_18, m15_17, m15_18, m16_17, m16_18, \ + m17_17, m17_18, m18_17, m18_18, m19_17, m19_18, m20_17, m20_18, \ + m21_17, m21_18, m22_17, m22_18, m23_17, m23_18, m24_17, m24_18, \ + m25_17, m25_18, m26_17, m26_18, m27_17, m27_18, m28_17, m28_18, \ + m29_17, m29_18, m30_17, m30_18, m31_17, m31_18, m32_17, m32_18, \ + m1_19, m1_20, m2_19, m2_20, m3_19, m3_20, m4_19, m4_20, \ + m5_19, m5_20, m6_19, m6_20, m7_19, m7_20, m8_19, m8_20, \ + m9_19, m9_20, m10_19, m10_20, m11_19, m11_20, m12_19, m12_20, \ + m13_19, m13_20, m14_19, m14_20, m15_19, m15_20, m16_19, m16_20, \ + m17_19, m17_20, m18_19, m18_20, m19_19, m19_20, m20_19, m20_20, \ + m21_19, m21_20, m22_19, m22_20, m23_19, m23_20, m24_19, m24_20, \ + m25_19, m25_20, m26_19, m26_20, m27_19, m27_20, m28_19, m28_20, \ + m29_19, m29_20, m30_19, m30_20, m31_19, m31_20, m32_19, m32_20, \ + m1_21, m1_22, m2_21, m2_22, m3_21, m3_22, m4_21, m4_22, \ + m5_21, m5_22, m6_21, m6_22, m7_21, m7_22, m8_21, m8_22, \ + m9_21, m9_22, m10_21, m10_22, m11_21, m11_22, m12_21, m12_22, \ + m13_21, m13_22, m14_21, m14_22, m15_21, m15_22, m16_21, m16_22, \ + m17_21, m17_22, m18_21, m18_22, m19_21, m19_22, m20_21, m20_22, \ + m21_21, m21_22, m22_21, m22_22, m23_21, m23_22, m24_21, m24_22, \ + m25_21, m25_22, m26_21, m26_22, m27_21, m27_22, m28_21, m28_22, \ + m29_21, m29_22, m30_21, m30_22, m31_21, m31_22, m32_21, m32_22, \ + m1_23, m1_24, m2_23, m2_24, m3_23, m3_24, m4_23, m4_24, \ + m5_23, m5_24, m6_23, m6_24, m7_23, m7_24, m8_23, m8_24, \ + m9_23, m9_24, m10_23, m10_24, m11_23, m11_24, m12_23, m12_24, \ + m13_23, m13_24, m14_23, m14_24, m15_23, m15_24, m16_23, m16_24, \ + m17_23, m17_24, m18_23, m18_24, m19_23, m19_24, m20_23, m20_24, \ + m21_23, m21_24, m22_23, m22_24, m23_23, m23_24, m24_23, m24_24, \ + m25_23, 
m25_24, m26_23, m26_24, m27_23, m27_24, m28_23, m28_24, \ + m29_23, m29_24, m30_23, m30_24, m31_23, m31_24, m32_23, m32_24, \ + m1_25, m1_26, m2_25, m2_26, m3_25, m3_26, m4_25, m4_26, \ + m5_25, m5_26, m6_25, m6_26, m7_25, m7_26, m8_25, m8_26, \ + m9_25, m9_26, m10_25, m10_26, m11_25, m11_26, m12_25, m12_26, \ + m13_25, m13_26, m14_25, m14_26, m15_25, m15_26, m16_25, m16_26, \ + m17_25, m17_26, m18_25, m18_26, m19_25, m19_26, m20_25, m20_26, \ + m21_25, m21_26, m22_25, m22_26, m23_25, m23_26, m24_25, m24_26, \ + m25_25, m25_26, m26_25, m26_26, m27_25, m27_26, m28_25, m28_26, \ + m29_25, m29_26, m30_25, m30_26, m31_25, m31_26, m32_25, m32_26, \ + m1_27, m1_28, m2_27, m2_28, m3_27, m3_28, m4_27, m4_28, \ + m5_27, m5_28, m6_27, m6_28, m7_27, m7_28, m8_27, m8_28, \ + m9_27, m9_28, m10_27, m10_28, m11_27, m11_28, m12_27, m12_28, \ + m13_27, m13_28, m14_27, m14_28, m15_27, m15_28, m16_27, m16_28, \ + m17_27, m17_28, m18_27, m18_28, m19_27, m19_28, m20_27, m20_28, \ + m21_27, m21_28, m22_27, m22_28, m23_27, m23_28, m24_27, m24_28, \ + m25_27, m25_28, m26_27, m26_28, m27_27, m27_28, m28_27, m28_28, \ + m29_27, m29_28, m30_27, m30_28, m31_27, m31_28, m32_27, m32_28, \ + m1_29, m1_30, m2_29, m2_30, m3_29, m3_30, m4_29, m4_30, \ + m5_29, m5_30, m6_29, m6_30, m7_29, m7_30, m8_29, m8_30, \ + m9_29, m9_30, m10_29, m10_30, m11_29, m11_30, m12_29, m12_30, \ + m13_29, m13_30, m14_29, m14_30, m15_29, m15_30, m16_29, m16_30, \ + m17_29, m17_30, m18_29, m18_30, m19_29, m19_30, m20_29, m20_30, \ + m21_29, m21_30, m22_29, m22_30, m23_29, m23_30, m24_29, m24_30, \ + m25_29, m25_30, m26_29, m26_30, m27_29, m27_30, m28_29, m28_30, \ + m29_29, m29_30, m30_29, m30_30, m31_29, m31_30, m32_29, m32_30, \ + m1_31, m1_32, m2_31, m2_32, m3_31, m3_32, m4_31, m4_32, \ + m5_31, m5_32, m6_31, m6_32, m7_31, m7_32, m8_31, m8_32, \ + m9_31, m9_32, m10_31, m10_32, m11_31, m11_32, m12_31, m12_32, \ + m13_31, m13_32, m14_31, m14_32, m15_31, m15_32, m16_31, m16_32, \ + m17_31, m17_32, m18_31, m18_32, m19_31, 
m19_32, m20_31, m20_32, \ + m21_31, m21_32, m22_31, m22_32, m23_31, m23_32, m24_31, m24_32, \ + m25_31, m25_32, m26_31, m26_32, m27_31, m27_32, m28_31, m28_32, \ + m29_31, m29_32, m30_31, m30_32, m31_31, m31_32, m32_31, m32_32 + +%define dct2_64_odd_mat_permute(m1_1, m1_2, m1_3, m1_4, m1_5, m1_6, m1_7, m1_8, m1_9, m1_10, m1_11, m1_12, m1_13, m1_14, m1_15, m1_16, m1_17, m1_18, m1_19, m1_20, m1_21, m1_22, m1_23, m1_24, m1_25, m1_26, m1_27, m1_28, m1_29, m1_30, m1_31, m1_32, \ + m2_1, m2_2, m2_3, m2_4, m2_5, m2_6, m2_7, m2_8, m2_9, m2_10, m2_11, m2_12, m2_13, m2_14, m2_15, m2_16, m2_17, m2_18, m2_19, m2_20, m2_21, m2_22, m2_23, m2_24, m2_25, m2_26, m2_27, m2_28, m2_29, m2_30, m2_31, m2_32, \ + m3_1, m3_2, m3_3, m3_4, m3_5, m3_6, m3_7, m3_8, m3_9, m3_10, m3_11, m3_12, m3_13, m3_14, m3_15, m3_16, m3_17, m3_18, m3_19, m3_20, m3_21, m3_22, m3_23, m3_24, m3_25, m3_26, m3_27, m3_28, m3_29, m3_30, m3_31, m3_32, \ + m4_1, m4_2, m4_3, m4_4, m4_5, m4_6, m4_7, m4_8, m4_9, m4_10, m4_11, m4_12, m4_13, m4_14, m4_15, m4_16, m4_17, m4_18, m4_19, m4_20, m4_21, m4_22, m4_23, m4_24, m4_25, m4_26, m4_27, m4_28, m4_29, m4_30, m4_31, m4_32, \ + m5_1, m5_2, m5_3, m5_4, m5_5, m5_6, m5_7, m5_8, m5_9, m5_10, m5_11, m5_12, m5_13, m5_14, m5_15, m5_16, m5_17, m5_18, m5_19, m5_20, m5_21, m5_22, m5_23, m5_24, m5_25, m5_26, m5_27, m5_28, m5_29, m5_30, m5_31, m5_32, \ + m6_1, m6_2, m6_3, m6_4, m6_5, m6_6, m6_7, m6_8, m6_9, m6_10, m6_11, m6_12, m6_13, m6_14, m6_15, m6_16, m6_17, m6_18, m6_19, m6_20, m6_21, m6_22, m6_23, m6_24, m6_25, m6_26, m6_27, m6_28, m6_29, m6_30, m6_31, m6_32, \ + m7_1, m7_2, m7_3, m7_4, m7_5, m7_6, m7_7, m7_8, m7_9, m7_10, m7_11, m7_12, m7_13, m7_14, m7_15, m7_16, m7_17, m7_18, m7_19, m7_20, m7_21, m7_22, m7_23, m7_24, m7_25, m7_26, m7_27, m7_28, m7_29, m7_30, m7_31, m7_32, \ + m8_1, m8_2, m8_3, m8_4, m8_5, m8_6, m8_7, m8_8, m8_9, m8_10, m8_11, m8_12, m8_13, m8_14, m8_15, m8_16, m8_17, m8_18, m8_19, m8_20, m8_21, m8_22, m8_23, m8_24, m8_25, m8_26, m8_27, m8_28, m8_29, m8_30, 
m8_31, m8_32, \ + m9_1, m9_2, m9_3, m9_4, m9_5, m9_6, m9_7, m9_8, m9_9, m9_10, m9_11, m9_12, m9_13, m9_14, m9_15, m9_16, m9_17, m9_18, m9_19, m9_20, m9_21, m9_22, m9_23, m9_24, m9_25, m9_26, m9_27, m9_28, m9_29, m9_30, m9_31, m9_32, \ + m10_1, m10_2, m10_3, m10_4, m10_5, m10_6, m10_7, m10_8, m10_9, m10_10, m10_11, m10_12, m10_13, m10_14, m10_15, m10_16, m10_17, m10_18, m10_19, m10_20, m10_21, m10_22, m10_23, m10_24, m10_25, m10_26, m10_27, m10_28, m10_29, m10_30, m10_31, m10_32, \ + m11_1, m11_2, m11_3, m11_4, m11_5, m11_6, m11_7, m11_8, m11_9, m11_10, m11_11, m11_12, m11_13, m11_14, m11_15, m11_16, m11_17, m11_18, m11_19, m11_20, m11_21, m11_22, m11_23, m11_24, m11_25, m11_26, m11_27, m11_28, m11_29, m11_30, m11_31, m11_32, \ + m12_1, m12_2, m12_3, m12_4, m12_5, m12_6, m12_7, m12_8, m12_9, m12_10, m12_11, m12_12, m12_13, m12_14, m12_15, m12_16, m12_17, m12_18, m12_19, m12_20, m12_21, m12_22, m12_23, m12_24, m12_25, m12_26, m12_27, m12_28, m12_29, m12_30, m12_31, m12_32, \ + m13_1, m13_2, m13_3, m13_4, m13_5, m13_6, m13_7, m13_8, m13_9, m13_10, m13_11, m13_12, m13_13, m13_14, m13_15, m13_16, m13_17, m13_18, m13_19, m13_20, m13_21, m13_22, m13_23, m13_24, m13_25, m13_26, m13_27, m13_28, m13_29, m13_30, m13_31, m13_32, \ + m14_1, m14_2, m14_3, m14_4, m14_5, m14_6, m14_7, m14_8, m14_9, m14_10, m14_11, m14_12, m14_13, m14_14, m14_15, m14_16, m14_17, m14_18, m14_19, m14_20, m14_21, m14_22, m14_23, m14_24, m14_25, m14_26, m14_27, m14_28, m14_29, m14_30, m14_31, m14_32, \ + m15_1, m15_2, m15_3, m15_4, m15_5, m15_6, m15_7, m15_8, m15_9, m15_10, m15_11, m15_12, m15_13, m15_14, m15_15, m15_16, m15_17, m15_18, m15_19, m15_20, m15_21, m15_22, m15_23, m15_24, m15_25, m15_26, m15_27, m15_28, m15_29, m15_30, m15_31, m15_32, \ + m16_1, m16_2, m16_3, m16_4, m16_5, m16_6, m16_7, m16_8, m16_9, m16_10, m16_11, m16_12, m16_13, m16_14, m16_15, m16_16, m16_17, m16_18, m16_19, m16_20, m16_21, m16_22, m16_23, m16_24, m16_25, m16_26, m16_27, m16_28, m16_29, m16_30, m16_31, m16_32, \ + 
m17_1, m17_2, m17_3, m17_4, m17_5, m17_6, m17_7, m17_8, m17_9, m17_10, m17_11, m17_12, m17_13, m17_14, m17_15, m17_16, m17_17, m17_18, m17_19, m17_20, m17_21, m17_22, m17_23, m17_24, m17_25, m17_26, m17_27, m17_28, m17_29, m17_30, m17_31, m17_32, \ + m18_1, m18_2, m18_3, m18_4, m18_5, m18_6, m18_7, m18_8, m18_9, m18_10, m18_11, m18_12, m18_13, m18_14, m18_15, m18_16, m18_17, m18_18, m18_19, m18_20, m18_21, m18_22, m18_23, m18_24, m18_25, m18_26, m18_27, m18_28, m18_29, m18_30, m18_31, m18_32, \ + m19_1, m19_2, m19_3, m19_4, m19_5, m19_6, m19_7, m19_8, m19_9, m19_10, m19_11, m19_12, m19_13, m19_14, m19_15, m19_16, m19_17, m19_18, m19_19, m19_20, m19_21, m19_22, m19_23, m19_24, m19_25, m19_26, m19_27, m19_28, m19_29, m19_30, m19_31, m19_32, \ + m20_1, m20_2, m20_3, m20_4, m20_5, m20_6, m20_7, m20_8, m20_9, m20_10, m20_11, m20_12, m20_13, m20_14, m20_15, m20_16, m20_17, m20_18, m20_19, m20_20, m20_21, m20_22, m20_23, m20_24, m20_25, m20_26, m20_27, m20_28, m20_29, m20_30, m20_31, m20_32, \ + m21_1, m21_2, m21_3, m21_4, m21_5, m21_6, m21_7, m21_8, m21_9, m21_10, m21_11, m21_12, m21_13, m21_14, m21_15, m21_16, m21_17, m21_18, m21_19, m21_20, m21_21, m21_22, m21_23, m21_24, m21_25, m21_26, m21_27, m21_28, m21_29, m21_30, m21_31, m21_32, \ + m22_1, m22_2, m22_3, m22_4, m22_5, m22_6, m22_7, m22_8, m22_9, m22_10, m22_11, m22_12, m22_13, m22_14, m22_15, m22_16, m22_17, m22_18, m22_19, m22_20, m22_21, m22_22, m22_23, m22_24, m22_25, m22_26, m22_27, m22_28, m22_29, m22_30, m22_31, m22_32, \ + m23_1, m23_2, m23_3, m23_4, m23_5, m23_6, m23_7, m23_8, m23_9, m23_10, m23_11, m23_12, m23_13, m23_14, m23_15, m23_16, m23_17, m23_18, m23_19, m23_20, m23_21, m23_22, m23_23, m23_24, m23_25, m23_26, m23_27, m23_28, m23_29, m23_30, m23_31, m23_32, \ + m24_1, m24_2, m24_3, m24_4, m24_5, m24_6, m24_7, m24_8, m24_9, m24_10, m24_11, m24_12, m24_13, m24_14, m24_15, m24_16, m24_17, m24_18, m24_19, m24_20, m24_21, m24_22, m24_23, m24_24, m24_25, m24_26, m24_27, m24_28, m24_29, m24_30, m24_31, 
m24_32, \ + m25_1, m25_2, m25_3, m25_4, m25_5, m25_6, m25_7, m25_8, m25_9, m25_10, m25_11, m25_12, m25_13, m25_14, m25_15, m25_16, m25_17, m25_18, m25_19, m25_20, m25_21, m25_22, m25_23, m25_24, m25_25, m25_26, m25_27, m25_28, m25_29, m25_30, m25_31, m25_32, \ + m26_1, m26_2, m26_3, m26_4, m26_5, m26_6, m26_7, m26_8, m26_9, m26_10, m26_11, m26_12, m26_13, m26_14, m26_15, m26_16, m26_17, m26_18, m26_19, m26_20, m26_21, m26_22, m26_23, m26_24, m26_25, m26_26, m26_27, m26_28, m26_29, m26_30, m26_31, m26_32, \ + m27_1, m27_2, m27_3, m27_4, m27_5, m27_6, m27_7, m27_8, m27_9, m27_10, m27_11, m27_12, m27_13, m27_14, m27_15, m27_16, m27_17, m27_18, m27_19, m27_20, m27_21, m27_22, m27_23, m27_24, m27_25, m27_26, m27_27, m27_28, m27_29, m27_30, m27_31, m27_32, \ + m28_1, m28_2, m28_3, m28_4, m28_5, m28_6, m28_7, m28_8, m28_9, m28_10, m28_11, m28_12, m28_13, m28_14, m28_15, m28_16, m28_17, m28_18, m28_19, m28_20, m28_21, m28_22, m28_23, m28_24, m28_25, m28_26, m28_27, m28_28, m28_29, m28_30, m28_31, m28_32, \ + m29_1, m29_2, m29_3, m29_4, m29_5, m29_6, m29_7, m29_8, m29_9, m29_10, m29_11, m29_12, m29_13, m29_14, m29_15, m29_16, m29_17, m29_18, m29_19, m29_20, m29_21, m29_22, m29_23, m29_24, m29_25, m29_26, m29_27, m29_28, m29_29, m29_30, m29_31, m29_32, \ + m30_1, m30_2, m30_3, m30_4, m30_5, m30_6, m30_7, m30_8, m30_9, m30_10, m30_11, m30_12, m30_13, m30_14, m30_15, m30_16, m30_17, m30_18, m30_19, m30_20, m30_21, m30_22, m30_23, m30_24, m30_25, m30_26, m30_27, m30_28, m30_29, m30_30, m30_31, m30_32, \ + m31_1, m31_2, m31_3, m31_4, m31_5, m31_6, m31_7, m31_8, m31_9, m31_10, m31_11, m31_12, m31_13, m31_14, m31_15, m31_16, m31_17, m31_18, m31_19, m31_20, m31_21, m31_22, m31_23, m31_24, m31_25, m31_26, m31_27, m31_28, m31_29, m31_30, m31_31, m31_32, \ + m32_1, m32_2, m32_3, m32_4, m32_5, m32_6, m32_7, m32_8, m32_9, m32_10, m32_11, m32_12, m32_13, m32_14, m32_15, m32_16, m32_17, m32_18, m32_19, m32_20, m32_21, m32_22, m32_23, m32_24, m32_25, m32_26, m32_27, m32_28, m32_29, m32_30, 
m32_31, m32_32) \ + m1_1, m1_2, m1_3, m1_4, m1_5, m1_6, m1_7, m1_8, m1_9, m1_10, m1_11, m1_12, m1_13, m1_14, m1_15, m1_16, m1_17, m1_18, m1_19, m1_20, m1_21, m1_22, m1_23, m1_24, m1_25, m1_26, m1_27, m1_28, m1_29, m1_30, m1_31, m1_32, \ + m2_1, m2_2, m2_3, m2_4, m2_5, m2_6, m2_7, m2_8, m2_9, m2_10, m2_11, m2_12, m2_13, m2_14, m2_15, m2_16, m2_17, m2_18, m2_19, m2_20, m2_21, m2_22, m2_23, m2_24, m2_25, m2_26, m2_27, m2_28, m2_29, m2_30, m2_31, m2_32, \ + m3_1, m3_2, m3_3, m3_4, m3_5, m3_6, m3_7, m3_8, m3_9, m3_10, m3_11, m3_12, m3_13, m3_14, m3_15, m3_16, m3_17, m3_18, m3_19, m3_20, m3_21, m3_22, m3_23, m3_24, m3_25, m3_26, m3_27, m3_28, m3_29, m3_30, m3_31, m3_32, \ + m4_1, m4_2, m4_3, m4_4, m4_5, m4_6, m4_7, m4_8, m4_9, m4_10, m4_11, m4_12, m4_13, m4_14, m4_15, m4_16, m4_17, m4_18, m4_19, m4_20, m4_21, m4_22, m4_23, m4_24, m4_25, m4_26, m4_27, m4_28, m4_29, m4_30, m4_31, m4_32, \ + m8_1, m8_2, m8_3, m8_4, m8_5, m8_6, m8_7, m8_8, m8_9, m8_10, m8_11, m8_12, m8_13, m8_14, m8_15, m8_16, m8_17, m8_18, m8_19, m8_20, m8_21, m8_22, m8_23, m8_24, m8_25, m8_26, m8_27, m8_28, m8_29, m8_30, m8_31, m8_32, \ + m7_1, m7_2, m7_3, m7_4, m7_5, m7_6, m7_7, m7_8, m7_9, m7_10, m7_11, m7_12, m7_13, m7_14, m7_15, m7_16, m7_17, m7_18, m7_19, m7_20, m7_21, m7_22, m7_23, m7_24, m7_25, m7_26, m7_27, m7_28, m7_29, m7_30, m7_31, m7_32, \ + m6_1, m6_2, m6_3, m6_4, m6_5, m6_6, m6_7, m6_8, m6_9, m6_10, m6_11, m6_12, m6_13, m6_14, m6_15, m6_16, m6_17, m6_18, m6_19, m6_20, m6_21, m6_22, m6_23, m6_24, m6_25, m6_26, m6_27, m6_28, m6_29, m6_30, m6_31, m6_32, \ + m5_1, m5_2, m5_3, m5_4, m5_5, m5_6, m5_7, m5_8, m5_9, m5_10, m5_11, m5_12, m5_13, m5_14, m5_15, m5_16, m5_17, m5_18, m5_19, m5_20, m5_21, m5_22, m5_23, m5_24, m5_25, m5_26, m5_27, m5_28, m5_29, m5_30, m5_31, m5_32, \ + m9_1, m9_2, m9_3, m9_4, m9_5, m9_6, m9_7, m9_8, m9_9, m9_10, m9_11, m9_12, m9_13, m9_14, m9_15, m9_16, m9_17, m9_18, m9_19, m9_20, m9_21, m9_22, m9_23, m9_24, m9_25, m9_26, m9_27, m9_28, m9_29, m9_30, m9_31, m9_32, \ + m10_1, 
m10_2, m10_3, m10_4, m10_5, m10_6, m10_7, m10_8, m10_9, m10_10, m10_11, m10_12, m10_13, m10_14, m10_15, m10_16, m10_17, m10_18, m10_19, m10_20, m10_21, m10_22, m10_23, m10_24, m10_25, m10_26, m10_27, m10_28, m10_29, m10_30, m10_31, m10_32, \ + m11_1, m11_2, m11_3, m11_4, m11_5, m11_6, m11_7, m11_8, m11_9, m11_10, m11_11, m11_12, m11_13, m11_14, m11_15, m11_16, m11_17, m11_18, m11_19, m11_20, m11_21, m11_22, m11_23, m11_24, m11_25, m11_26, m11_27, m11_28, m11_29, m11_30, m11_31, m11_32, \ + m12_1, m12_2, m12_3, m12_4, m12_5, m12_6, m12_7, m12_8, m12_9, m12_10, m12_11, m12_12, m12_13, m12_14, m12_15, m12_16, m12_17, m12_18, m12_19, m12_20, m12_21, m12_22, m12_23, m12_24, m12_25, m12_26, m12_27, m12_28, m12_29, m12_30, m12_31, m12_32, \ + m16_1, m16_2, m16_3, m16_4, m16_5, m16_6, m16_7, m16_8, m16_9, m16_10, m16_11, m16_12, m16_13, m16_14, m16_15, m16_16, m16_17, m16_18, m16_19, m16_20, m16_21, m16_22, m16_23, m16_24, m16_25, m16_26, m16_27, m16_28, m16_29, m16_30, m16_31, m16_32, \ + m15_1, m15_2, m15_3, m15_4, m15_5, m15_6, m15_7, m15_8, m15_9, m15_10, m15_11, m15_12, m15_13, m15_14, m15_15, m15_16, m15_17, m15_18, m15_19, m15_20, m15_21, m15_22, m15_23, m15_24, m15_25, m15_26, m15_27, m15_28, m15_29, m15_30, m15_31, m15_32, \ + m14_1, m14_2, m14_3, m14_4, m14_5, m14_6, m14_7, m14_8, m14_9, m14_10, m14_11, m14_12, m14_13, m14_14, m14_15, m14_16, m14_17, m14_18, m14_19, m14_20, m14_21, m14_22, m14_23, m14_24, m14_25, m14_26, m14_27, m14_28, m14_29, m14_30, m14_31, m14_32, \ + m13_1, m13_2, m13_3, m13_4, m13_5, m13_6, m13_7, m13_8, m13_9, m13_10, m13_11, m13_12, m13_13, m13_14, m13_15, m13_16, m13_17, m13_18, m13_19, m13_20, m13_21, m13_22, m13_23, m13_24, m13_25, m13_26, m13_27, m13_28, m13_29, m13_30, m13_31, m13_32, \ + m17_1, m17_2, m17_3, m17_4, m17_5, m17_6, m17_7, m17_8, m17_9, m17_10, m17_11, m17_12, m17_13, m17_14, m17_15, m17_16, m17_17, m17_18, m17_19, m17_20, m17_21, m17_22, m17_23, m17_24, m17_25, m17_26, m17_27, m17_28, m17_29, m17_30, m17_31, m17_32, \ 
+ m18_1, m18_2, m18_3, m18_4, m18_5, m18_6, m18_7, m18_8, m18_9, m18_10, m18_11, m18_12, m18_13, m18_14, m18_15, m18_16, m18_17, m18_18, m18_19, m18_20, m18_21, m18_22, m18_23, m18_24, m18_25, m18_26, m18_27, m18_28, m18_29, m18_30, m18_31, m18_32, \ + m19_1, m19_2, m19_3, m19_4, m19_5, m19_6, m19_7, m19_8, m19_9, m19_10, m19_11, m19_12, m19_13, m19_14, m19_15, m19_16, m19_17, m19_18, m19_19, m19_20, m19_21, m19_22, m19_23, m19_24, m19_25, m19_26, m19_27, m19_28, m19_29, m19_30, m19_31, m19_32, \ + m20_1, m20_2, m20_3, m20_4, m20_5, m20_6, m20_7, m20_8, m20_9, m20_10, m20_11, m20_12, m20_13, m20_14, m20_15, m20_16, m20_17, m20_18, m20_19, m20_20, m20_21, m20_22, m20_23, m20_24, m20_25, m20_26, m20_27, m20_28, m20_29, m20_30, m20_31, m20_32, \ + m24_1, m24_2, m24_3, m24_4, m24_5, m24_6, m24_7, m24_8, m24_9, m24_10, m24_11, m24_12, m24_13, m24_14, m24_15, m24_16, m24_17, m24_18, m24_19, m24_20, m24_21, m24_22, m24_23, m24_24, m24_25, m24_26, m24_27, m24_28, m24_29, m24_30, m24_31, m24_32, \ + m23_1, m23_2, m23_3, m23_4, m23_5, m23_6, m23_7, m23_8, m23_9, m23_10, m23_11, m23_12, m23_13, m23_14, m23_15, m23_16, m23_17, m23_18, m23_19, m23_20, m23_21, m23_22, m23_23, m23_24, m23_25, m23_26, m23_27, m23_28, m23_29, m23_30, m23_31, m23_32, \ + m22_1, m22_2, m22_3, m22_4, m22_5, m22_6, m22_7, m22_8, m22_9, m22_10, m22_11, m22_12, m22_13, m22_14, m22_15, m22_16, m22_17, m22_18, m22_19, m22_20, m22_21, m22_22, m22_23, m22_24, m22_25, m22_26, m22_27, m22_28, m22_29, m22_30, m22_31, m22_32, \ + m21_1, m21_2, m21_3, m21_4, m21_5, m21_6, m21_7, m21_8, m21_9, m21_10, m21_11, m21_12, m21_13, m21_14, m21_15, m21_16, m21_17, m21_18, m21_19, m21_20, m21_21, m21_22, m21_23, m21_24, m21_25, m21_26, m21_27, m21_28, m21_29, m21_30, m21_31, m21_32, \ + m25_1, m25_2, m25_3, m25_4, m25_5, m25_6, m25_7, m25_8, m25_9, m25_10, m25_11, m25_12, m25_13, m25_14, m25_15, m25_16, m25_17, m25_18, m25_19, m25_20, m25_21, m25_22, m25_23, m25_24, m25_25, m25_26, m25_27, m25_28, m25_29, m25_30, m25_31, 
m25_32, \ + m26_1, m26_2, m26_3, m26_4, m26_5, m26_6, m26_7, m26_8, m26_9, m26_10, m26_11, m26_12, m26_13, m26_14, m26_15, m26_16, m26_17, m26_18, m26_19, m26_20, m26_21, m26_22, m26_23, m26_24, m26_25, m26_26, m26_27, m26_28, m26_29, m26_30, m26_31, m26_32, \ + m27_1, m27_2, m27_3, m27_4, m27_5, m27_6, m27_7, m27_8, m27_9, m27_10, m27_11, m27_12, m27_13, m27_14, m27_15, m27_16, m27_17, m27_18, m27_19, m27_20, m27_21, m27_22, m27_23, m27_24, m27_25, m27_26, m27_27, m27_28, m27_29, m27_30, m27_31, m27_32, \ + m28_1, m28_2, m28_3, m28_4, m28_5, m28_6, m28_7, m28_8, m28_9, m28_10, m28_11, m28_12, m28_13, m28_14, m28_15, m28_16, m28_17, m28_18, m28_19, m28_20, m28_21, m28_22, m28_23, m28_24, m28_25, m28_26, m28_27, m28_28, m28_29, m28_30, m28_31, m28_32, \ + m32_1, m32_2, m32_3, m32_4, m32_5, m32_6, m32_7, m32_8, m32_9, m32_10, m32_11, m32_12, m32_13, m32_14, m32_15, m32_16, m32_17, m32_18, m32_19, m32_20, m32_21, m32_22, m32_23, m32_24, m32_25, m32_26, m32_27, m32_28, m32_29, m32_30, m32_31, m32_32, \ + m31_1, m31_2, m31_3, m31_4, m31_5, m31_6, m31_7, m31_8, m31_9, m31_10, m31_11, m31_12, m31_13, m31_14, m31_15, m31_16, m31_17, m31_18, m31_19, m31_20, m31_21, m31_22, m31_23, m31_24, m31_25, m31_26, m31_27, m31_28, m31_29, m31_30, m31_31, m31_32, \ + m30_1, m30_2, m30_3, m30_4, m30_5, m30_6, m30_7, m30_8, m30_9, m30_10, m30_11, m30_12, m30_13, m30_14, m30_15, m30_16, m30_17, m30_18, m30_19, m30_20, m30_21, m30_22, m30_23, m30_24, m30_25, m30_26, m30_27, m30_28, m30_29, m30_30, m30_31, m30_32, \ + m29_1, m29_2, m29_3, m29_4, m29_5, m29_6, m29_7, m29_8, m29_9, m29_10, m29_11, m29_12, m29_13, m29_14, m29_15, m29_16, m29_17, m29_18, m29_19, m29_20, m29_21, m29_22, m29_23, m29_24, m29_25, m29_26, m29_27, m29_28, m29_29, m29_30, m29_31, m29_32 + +const vvc_dct2_64_odd_mat, dw matvec_mul_32_permute(dct2_64_odd_mat_permute( \ + -2, 7, -11, 15, -20, 24, -28, 33, -37, 41, -44, 48, -52, 56, -59, 62, -65, 69, -71, 73, -77, 79, -81, 83, -84, 86, -87, 88, -90, 90, -90, 91, \ + -7, 
20, -33, 44, -56, 65, -73, 81, -86, 90, -91, 90, -87, 83, -77, 69, -59, 48, -37, 24, -11, -2, 15, -28, 41, -52, 62, -71, 79, -84, 88, -90, \ + -11, 33, -52, 69, -81, 88, -91, 87, -79, 65, -48, 28, -7, -15, 37, -56, 71, -83, 90, -90, 86, -77, 62, -44, 24, -2, -20, 41, -59, 73, -84, 90, \ + -15, 44, -69, 84, -91, 86, -71, 48, -20, -11, 41, -65, 83, -90, 87, -73, 52, -24, -7, 37, -62, 81, -90, 88, -77, 56, -28, -2, 33, -59, 79, -90, \ + -20, 56, -81, 91, -83, 59, -24, -15, 52, -79, 90, -84, 62, -28, -11, 48, -77, 90, -86, 65, -33, -7, 44, -73, 90, -87, 69, -37, -2, 41, -71, 88, \ + -24, 65, -88, 86, -59, 15, 33, -71, 90, -83, 52, -7, -41, 77, -91, 79, -44, -2, 48, -81, 90, -73, 37, 11, -56, 84, -90, 69, -28, -20, 62, -87, \ + -28, 73, -91, 71, -24, -33, 77, -90, 69, -20, -37, 79, -90, 65, -15, -41, 81, -90, 62, -11, -44, 83, -88, 59, -7, -48, 84, -87, 56, -2, -52, 86, \ + -33, 81, -87, 48, 15, -71, 90, -62, 2, 59, -90, 73, -20, -44, 86, -83, 37, 28, -79, 88, -52, -11, 69, -91, 65, -7, -56, 90, -77, 24, 41, -84, \ + -37, 86, -79, 20, 52, -90, 69, -2, -65, 90, -56, -15, 77, -87, 41, 33, -84, 81, -24, -48, 90, -71, 7, 62, -91, 59, 11, -73, 88, -44, -28, 83, \ + -41, 90, -65, -11, 79, -83, 20, 59, -90, 48, 33, -87, 71, 2, -73, 86, -28, -52, 91, -56, -24, 84, -77, 7, 69, -88, 37, 44, -90, 62, 15, -81, \ + -44, 91, -48, -41, 90, -52, -37, 90, -56, -33, 90, -59, -28, 88, -62, -24, 87, -65, -20, 86, -69, -15, 84, -71, -11, 83, -73, -7, 81, -77, -2, 79, \ + -48, 90, -28, -65, 84, -7, -79, 73, 15, -87, 59, 37, -91, 41, 56, -88, 20, 71, -81, -2, 83, -69, -24, 90, -52, -44, 90, -33, -62, 86, -11, -77, \ + -52, 87, -7, -83, 62, 41, -90, 20, 77, -71, -28, 91, -33, -69, 79, 15, -90, 44, 59, -84, -2, 86, -56, -48, 88, -11, -81, 65, 37, -90, 24, 73, \ + -56, 83, 15, -90, 28, 77, -65, -44, 87, 2, -88, 41, 69, -73, -33, 90, -11, -84, 52, 59, -81, -20, 91, -24, -79, 62, 48, -86, -7, 90, -37, -71, \ + -59, 77, 37, -87, -11, 91, -15, -86, 41, 73, -62, -56, 79, 33, -88, -7, 90, -20, -84, 
44, 71, -65, -52, 81, 28, -90, -2, 90, -24, -83, 48, 69, \ + -62, 69, 56, -73, -48, 79, 41, -83, -33, 86, 24, -88, -15, 90, 7, -91, 2, 90, -11, -90, 20, 87, -28, -84, 37, 81, -44, -77, 52, 71, -59, -65, \ + -65, 59, 71, -52, -77, 44, 81, -37, -84, 28, 87, -20, -90, 11, 90, -2, -91, -7, 90, 15, -88, -24, 86, 33, -83, -41, 79, 48, -73, -56, 69, 62, \ + -69, 48, 83, -24, -90, -2, 90, 28, -81, -52, 65, 71, -44, -84, 20, 90, 7, -88, -33, 79, 56, -62, -73, 41, 86, -15, -91, -11, 87, 37, -77, -59, \ + -71, 37, 90, 7, -86, -48, 62, 79, -24, -91, -20, 81, 59, -52, -84, 11, 90, 33, -73, -69, 41, 88, 2, -87, -44, 65, 77, -28, -90, -15, 83, 56, \ + -73, 24, 90, 37, -65, -81, 11, 88, 48, -56, -86, -2, 84, 59, -44, -90, -15, 79, 69, -33, -91, -28, 71, 77, -20, -90, -41, 62, 83, -7, -87, -52, \ + -77, 11, 86, 62, -33, -90, -44, 52, 90, 24, -69, -83, -2, 81, 71, -20, -88, -56, 41, 91, 37, -59, -87, -15, 73, 79, -7, -84, -65, 28, 90, 48, \ + -79, -2, 77, 81, 7, -73, -83, -11, 71, 84, 15, -69, -86, -20, 65, 87, 24, -62, -88, -28, 59, 90, 33, -56, -90, -37, 52, 90, 41, -48, -91, -44, \ + -81, -15, 62, 90, 44, -37, -88, -69, 7, 77, 84, 24, -56, -91, -52, 28, 86, 73, 2, -71, -87, -33, 48, 90, 59, -20, -83, -79, -11, 65, 90, 41, \ + -83, -28, 44, 88, 73, 11, -59, -91, -62, 7, 71, 90, 48, -24, -81, -84, -33, 41, 87, 77, 15, -56, -90, -65, 2, 69, 90, 52, -20, -79, -86, -37, \ + -84, -41, 24, 77, 90, 56, -7, -65, -91, -69, -11, 52, 88, 79, 28, -37, -83, -86, -44, 20, 73, 90, 59, -2, -62, -90, -71, -15, 48, 87, 81, 33, \ + -86, -52, 2, 56, 87, 84, 48, -7, -59, -88, -83, -44, 11, 62, 90, 81, 41, -15, -65, -90, -79, -37, 20, 69, 90, 77, 33, -24, -71, -91, -73, -28, \ + -87, -62, -20, 28, 69, 90, 84, 56, 11, -37, -73, -90, -81, -48, -2, 44, 79, 91, 77, 41, -7, -52, -83, -90, -71, -33, 15, 59, 86, 88, 65, 24, \ + -88, -71, -41, -2, 37, 69, 87, 90, 73, 44, 7, -33, -65, -86, -90, -77, -48, -11, 28, 62, 84, 90, 79, 52, 15, -24, -59, -83, -91, -81, -56, -20, \ + -90, -79, -59, -33, -2, 28, 56, 77, 
88, 90, 81, 62, 37, 7, -24, -52, -73, -87, -90, -83, -65, -41, -11, 20, 48, 71, 86, 91, 84, 69, 44, 15, \
+    -90, -84, -73, -59, -41, -20, 2, 24, 44, 62, 77, 86, 90, 90, 83, 71, 56, 37, 15, -7, -28, -48, -65, -79, -87, -91, -88, -81, -69, -52, -33, -11, \
+    -90, -88, -84, -79, -71, -62, -52, -41, -28, -15, -2, 11, 24, 37, 48, 59, 69, 77, 83, 87, 90, 91, 90, 86, 81, 73, 65, 56, 44, 33, 20, 7, \
+    -91, -90, -90, -90, -88, -87, -86, -84, -83, -81, -79, -77, -73, -71, -69, -65, -62, -59, -56, -52, -48, -44, -41, -37, -33, -28, -24, -20, -15, -11, -7, -2))
+
+SECTION .text
+
+INIT_YMM avx2
+
+; Multiply a 2D vector by a 2x2 matrix.
+;
+; %1 Index of the SIMD register that receives the products.
+;
+; %2 Address of the first input element (32-bit integers).
+;
+; %3 Distance between consecutive input elements, counted in 32-bit
+;    elements; the addressing below scales it by 4 to obtain bytes.
+;
+; %4 Address of the matrix of 16-bit coefficients.
+%macro MATVEC_MUL_2 4 ; out, in, stride, mat
+    movd        xm%1, [%2 + 0*%3*4]     ; x0 into the lowest dword
+    punpckldq   m%1, [%2 + 1*%3*4]      ; interleave with the dwords starting at x1
+    packssdw    m%1, m%1                ; saturate the dwords down to words
+    pmaddwd     m%1, [%4]               ; multiply words, sum adjacent pairs
+%endmacro
+
+; Multiply a 4D vector by a 4x4 matrix.
+;
+; %1 Index of the SIMD register that receives the four sums.
+;
+; %2 Address of the first input element (32-bit integers).
+;
+; %3 Distance between consecutive input elements, counted in 32-bit
+;    elements; the addressing below scales it by 4 to obtain bytes.
+;
+; %4 Address of the matrix of 16-bit coefficients.
+;
+; %5 Index of scratch SIMD register.
+;
+; Overwrites the named register stride3q with 3 * stride, so the caller must
+; have declared it.
+%macro MATVEC_MUL_4 5 ; out, in, stride, mat, temp
+    %push matvec_mul_4
+    %define %$out %1
+    %define %$in %2
+    %define %$stride %3
+    %define %$mat %4
+    %define %$temp %5
+
+    lea         stride3q, [3*%$stride]
+
+    ; out gathers inputs 0 and 1, temp gathers inputs 2 and 3
+    movd        xm%$out, [%$in + 0*%$stride*4]
+    punpckldq   m%$out, [%$in + 1*%$stride*4]
+    movd        xm%$temp, [%$in + 2*%$stride*4]
+    punpckldq   m%$temp, [%$in + stride3q*4]
+
+    ; duplicate the input pair across the register, then narrow it to words
+    punpcklqdq  m%$out, m%$out
+    packssdw    m%$out, m%$out
+
+    punpcklqdq  m%$temp, m%$temp
+    packssdw    m%$temp, m%$temp
+
+    ; partial products: inputs 0/1 against the first 16 bytes of the matrix,
+    ; inputs 2/3 against the following 16 bytes
+    pmaddwd     m%$out, [%$mat]
+    pmaddwd     m%$temp, [%$mat + 16]
+
+    paddd       m%$out, m%$temp
+
+    %pop
+%endmacro
+
+; Multiply an 8D vector by an 8x8 matrix.
+%macro MATVEC_MUL_8 7 ; out[2], in, stride, mat, temp[2]
+    ; %1-%2 Indices of the SIMD registers that receive the eight sums.
+    ; %3    Register holding the input address. Advanced by the macro.
+    ; %4    Distance between consecutive input elements, counted in 32-bit
+    ;       elements (scaled by 4 in the addressing below).
+    ; %5    Address of the matrix of 16-bit coefficients.
+    ; %6-%7 Indices of scratch SIMD registers.
+    %push matvec_mul_8
+    %define %$out0 %1
+    %define %$out1 %2
+    %define %$in %3
+    %define %$stride %4
+    %define %$mat %5
+    %define %$temp0 %6
+    %define %$temp1 %7
+
+    ; first input pair initialises the accumulators
+    movd        xm%$out0, [%$in]
+    punpckldq   m%$out0, [%$in + %$stride*4]
+    punpcklqdq  m%$out0, m%$out0
+    packssdw    m%$out0, m%$out0
+    mova        m%$out1, m%$out0
+    pmaddwd     m%$out0, [%$mat]
+    pmaddwd     m%$out1, [%$mat + 16]
+
+    ; remaining three input pairs, 32 bytes of matrix per pair
+    %assign mat_offset 0
+    %rep 3
+    %assign mat_offset mat_offset + 16*2
+    lea         %$in, [%$in + 2*%$stride*4]
+
+    movd        xm%$temp0, [%$in]
+    punpckldq   m%$temp0, [%$in + %$stride*4]
+    punpcklqdq  m%$temp0, m%$temp0
+    packssdw    m%$temp0, m%$temp0
+    mova        m%$temp1, m%$temp0
+    pmaddwd     m%$temp0, [%$mat + mat_offset]
+    pmaddwd     m%$temp1, [%$mat + mat_offset+16]
+    paddd       m%$out0, m%$temp0
+    paddd       m%$out1, m%$temp1
+    %endrep
+
+    %pop
+%endmacro
+
+; Multiply a 16D vector by a 16x16 matrix.
+;
+; %1-%4 Indices of the SIMD registers that receive the sixteen sums.
+;
+; %5 Register holding the input address. Advanced by the macro.
+;
+; %6 Distance between consecutive input elements, counted in 32-bit
+;    elements (scaled by 4 in the addressing below).
+;
+; %7 Address of the matrix of 16-bit coefficients.
+;
+; %8-%9 Indices of scratch SIMD registers.
+%macro MATVEC_MUL_16 9 ; out[4], in, stride, mat, temp[2]
+    %push matvec_mul_16
+    %define %$out0 %1
+    %define %$out1 %2
+    %define %$out2 %3
+    %define %$out3 %4
+    %define %$in %5
+    %define %$stride %6
+    %define %$mat %7
+    %define %$temp0 %8
+    %define %$temp1 %9
+
+    ; first input pair initialises the accumulators
+    movd        xm%$out0, [%$in]
+    punpckldq   m%$out0, [%$in + %$stride*4]
+    punpcklqdq  m%$out0, m%$out0
+    packssdw    m%$out0, m%$out0
+    mova        m%$out1, m%$out0
+    mova        m%$out2, m%$out0
+    mova        m%$out3, m%$out0
+    pmaddwd     m%$out0, [%$mat]
+    pmaddwd     m%$out1, [%$mat + 16]
+    pmaddwd     m%$out2, [%$mat + 32]
+    pmaddwd     m%$out3, [%$mat + 48]
+
+    ; remaining seven input pairs, 64 bytes of matrix per pair
+    %assign mat_offset 0
+    %rep 7
+    %assign mat_offset mat_offset + 16*4
+    lea         %$in, [%$in + 2*%$stride*4]
+    movd        xm%$temp0, [%$in]
+    punpckldq   m%$temp0, [%$in + %$stride*4]
+    punpcklqdq  m%$temp0, m%$temp0
+    packssdw    m%$temp0, m%$temp0
+    mova        m%$temp1, m%$temp0
+    pmaddwd     m%$temp1, [%$mat + mat_offset+16*0]
+    paddd       m%$out0, m%$temp1
+    mova        m%$temp1, m%$temp0
+    pmaddwd     m%$temp1, [%$mat + mat_offset+16*1]
+    paddd       m%$out1, m%$temp1
+    mova        m%$temp1, m%$temp0
+    pmaddwd     m%$temp1, [%$mat + mat_offset+16*2]
+    paddd       m%$out2, m%$temp1
+    ; last product may consume temp0 itself, no copy needed
+    pmaddwd     m%$temp0, [%$mat + mat_offset+16*3]
+    paddd       m%$out3, m%$temp0
+    %endrep
+
+    %pop
+%endmacro
+
+; Multiply a 32D vector by a 32x32 matrix.
+;
+; %1-%8 Indices of the SIMD registers that receive the thirty-two sums.
+;
+; %9 Register holding the input address. Advanced by the macro.
+;
+; %10 Distance between consecutive input elements, counted in 32-bit
+;     elements (scaled by 4 in the addressing below).
+;
+; %11 Address of the matrix of 16-bit coefficients.
+;
+; %12-%13 Indices of scratch SIMD registers.
+%macro MATVEC_MUL_32 13 ; out[8], in, stride, mat, temp[2]
+    %push matvec_mul_32
+    %define %$out0 %1
+    %define %$out1 %2
+    %define %$out2 %3
+    %define %$out3 %4
+    %define %$out4 %5
+    %define %$out5 %6
+    %define %$out6 %7
+    %define %$out7 %8
+    %define %$in %9
+    %define %$stride %10
+    %define %$mat %11
+    %define %$temp0 %12
+    %define %$temp1 %13
+
+    ; first input pair initialises the accumulators
+    movd        xm%$out0, [%$in]
+    punpckldq   m%$out0, [%$in + %$stride*4]
+    punpcklqdq  m%$out0, m%$out0
+    packssdw    m%$out0, m%$out0
+    mova        m%$out1, m%$out0
+    mova        m%$out2, m%$out0
+    mova        m%$out3, m%$out0
+    mova        m%$out4, m%$out0
+    mova        m%$out5, m%$out0
+    mova        m%$out6, m%$out0
+    mova        m%$out7, m%$out0
+    pmaddwd     m%$out0, [%$mat]
+    pmaddwd     m%$out1, [%$mat + 16]
+    pmaddwd     m%$out2, [%$mat + 32]
+    pmaddwd     m%$out3, [%$mat + 48]
+    pmaddwd     m%$out4, [%$mat + 64]
+    pmaddwd     m%$out5, [%$mat + 80]
+    pmaddwd     m%$out6, [%$mat + 96]
+    pmaddwd     m%$out7, [%$mat + 112]
+
+    ; remaining fifteen input pairs, 128 bytes of matrix per pair
+    %assign mat_offset 0
+    %rep 15
+    %assign mat_offset mat_offset + 16*8
+    lea         %$in, [%$in + 2*%$stride*4]
+    movd        xm%$temp0, [%$in]
+    punpckldq   m%$temp0, [%$in + %$stride*4]
+    punpcklqdq  m%$temp0, m%$temp0
+    packssdw    m%$temp0, m%$temp0
+    mova        m%$temp1, m%$temp0
+    pmaddwd     m%$temp1, [%$mat + mat_offset+16*0]
+    paddd       m%$out0, m%$temp1
+    mova        m%$temp1, m%$temp0
+    pmaddwd     m%$temp1, [%$mat + mat_offset+16*1]
+    paddd       m%$out1, m%$temp1
+    mova        m%$temp1, m%$temp0
+    pmaddwd     m%$temp1, [%$mat + mat_offset+16*2]
+    paddd       m%$out2, m%$temp1
+    mova        m%$temp1, m%$temp0
+    pmaddwd     m%$temp1, [%$mat + mat_offset+16*3]
+    paddd       m%$out3, m%$temp1
+    mova        m%$temp1, m%$temp0
+    pmaddwd     m%$temp1, [%$mat + mat_offset+16*4]
+    paddd       m%$out4, m%$temp1
+    mova        m%$temp1, m%$temp0
+    pmaddwd     m%$temp1, [%$mat + mat_offset+16*5]
+    paddd       m%$out5, m%$temp1
+    mova        m%$temp1, m%$temp0
+    pmaddwd     m%$temp1, [%$mat + mat_offset+16*6]
+    paddd       m%$out6, m%$temp1
+    ; last product may consume temp0 itself, no copy needed
+    pmaddwd     m%$temp0, [%$mat + mat_offset+16*7]
+    paddd       m%$out7, m%$temp0
+    %endrep
+
+    %pop
+%endmacro
+
+; Length-2 transform: a single MATVEC_MUL_2 against vvc_dct2_4_even_mat.
+%macro IDCT2_1D_2 3 ; out, in, stride
+    MATVEC_MUL_2 %1, %2, %3, vvc_dct2_4_even_mat
+%endmacro
+
+; Performs a single inverse type-II DCT with length 4
+;
+; %1 Index of a SIMD register in which to store the result.
+;    Result is stored as 4 packed doublewords.
+;
+; %2 Memory address of input data.
+;
+; %3 Distance between consecutive input elements, counted in 32-bit
+;    elements (scaled by 4 in the addressing below).
+;
+; %4 Index of scratch SIMD register.
+;
+; Overwrites the named register stride3q with 3 * stride.
+%macro IDCT2_1D_4 4 ; out, in, stride, temp
+    %push idct_1d_4
+    %define %$out %1
+    %define %$in %2
+    %define %$stride %3
+    %define %$temp %4
+
+    lea         stride3q, [%$stride*3]
+
+    ; even part: inputs 0 and 2
+    movd        xm%$out, [%$in + 0*%$stride*4]
+    punpckldq   m%$out, [%$in + 2*%$stride*4]
+    punpcklqdq  m%$out, m%$out
+    packssdw    m%$out, m%$out
+    pmaddwd     m%$out, [vvc_dct2_4_even_mat]
+
+    ; odd part: inputs 1 and 3
+    movd        xm%$temp, [%$in + 1*%$stride*4]
+    punpckldq   m%$temp, [%$in + stride3q*4]
+    punpcklqdq  m%$temp, m%$temp
+    packssdw    m%$temp, m%$temp
+    pmaddwd     m%$temp, [vvc_dct2_4_odd_mat]
+
+    paddd       m%$out, m%$temp
+
+    %pop
+%endmacro
+
+; Performs a single inverse type-II DCT with length 8
+;
+; %1 Index of SIMD register. Stores first half of output in order as
+;    packed doublewords:
+;    | y0 | y1 | y2 | y3 |
+;
+; %2 Index of SIMD register. Stores second half of output in reverse order as
+;    packed doublewords:
+;    | y7 | y6 | y5 | y4 |
+;
+; %3 Register holding the memory address of input data. Modified by macro.
+;
+; %4 Register holding the distance between consecutive input elements,
+;    counted in 32-bit elements. Modified by macro.
+;
+; %5 Index of scratch SIMD register.
+%macro IDCT2_1D_8 5 ; out[2], in, stride, temp
+    %push idct_1d_8
+    %define %$out0 %1
+    %define %$out1 %2
+    %define %$in %3
+    %define %$stride %4
+    %define %$temp %5
+
+    ; doubled stride: the even-indexed inputs become consecutive for the
+    ; nested length-4 transform
+    lea %$stride, [%$stride*2]
+
+    ; even part
+    IDCT2_1D_4 %$out0, %$in, %$stride, %$temp
+
+    ; odd part
+    ; stride is already doubled and counted in 4-byte elements, so stride*2
+    ; bytes is exactly one original element: in now points at input 1
+    lea %$in, [%$in + %$stride*2]
+    MATVEC_MUL_4 %$out1, %$in, %$stride, vvc_dct2_8_odd_mat, %$temp
+
+    ; butterfly combining the even and odd halves
+    SUMSUB_BA d, %$out1, %$out0, %$temp
+
+    %pop
+%endmacro
+
+; Performs a single inverse type-II DCT with length 16
+;
+; %1 Index of SIMD register. Stores first quarter of output in order as
+;    packed doublewords:
+;    | y0 | y1 | y2 | y3 |
+;
+; %2 Index of SIMD register. Stores second quarter of output in reverse order as
+;    packed doublewords:
+;    | y7 | y6 | y5 | y4 |
+;
+; %3 Index of SIMD register. Stores third quarter of output in order as
+;    packed doublewords:
+;    | y8 | y9 | y10 | y11 |
+;
+; %4 Index of SIMD register. Stores fourth quarter of output in reverse order as
+;    packed doublewords:
+;    | y15 | y14 | y13 | y12 |
+;
+; %5 Register holding the memory address of input data. Modified by macro.
+;
+; %6 Register holding the distance between consecutive input elements,
+;    counted in 32-bit elements. Modified by macro.
+;
+; %7 Index of scratch SIMD register.
+;
+; %8 Index of scratch SIMD register.
+%macro IDCT2_1D_16 8 ; out[4], in, stride, temp[2]
+    %push idct_1d_16
+    %define %$out0 %1
+    %define %$out1 %2
+    %define %$out2 %3
+    %define %$out3 %4
+    %define %$in %5
+    %define %$stride %6
+    %define %$temp0 %7
+    %define %$temp1 %8
+
+    lea %$stride, [%$stride*2]
+
+    ; even part
+    ; the nested transform modifies in and stride, so save and restore them
+    push %$in
+    push %$stride
+    IDCT2_1D_8 %$out0, %$out1, %$in, %$stride, %$temp0
+    pop %$stride
+    pop %$in
+
+    ; odd part
+    ; step to input 1 (stride is doubled and counted in 4-byte elements)
+    lea %$in, [%$in + %$stride*2]
+    MATVEC_MUL_8 %$out2, %$out3, %$in, %$stride, vvc_dct2_16_odd_mat, %$temp0, %$temp1
+
+    ; butterflies pairing out3 with out0 and out2 with out1
+    SUMSUB_BADC d, %$out3, %$out0, %$out2, %$out1, %$temp0
+
+    %pop
+%endmacro
+
+; Performs a single inverse type-II DCT with length 32
+;
+; %1 Index of SIMD register. Stores first eighth of output in order as
+;    packed doublewords:
+;    | y0 | y1 | y2 | y3 |
+;
+; %2 Index of SIMD register. Stores second eighth of output in reverse order as
+;    packed doublewords:
+;    | y7 | y6 | y5 | y4 |
+;
+; %3 Index of SIMD register. Stores third eighth of output in order as
+;    packed doublewords:
+;    | y8 | y9 | y10 | y11 |
+;
+; %4 Index of SIMD register. Stores fourth eighth of output in reverse order as
+;    packed doublewords:
+;    | y15 | y14 | y13 | y12 |
+;
+; %5 Index of SIMD register. Stores fifth eighth of output in order as
+;    packed doublewords:
+;    | y16 | y17 | y18 | y19 |
+;
+; %6 Index of SIMD register. Stores sixth eighth of output in reverse order as
+;    packed doublewords:
+;    | y23 | y22 | y21 | y20 |
+;
+; %7 Index of SIMD register. Stores seventh eighth of output in order as
+;    packed doublewords:
+;    | y24 | y25 | y26 | y27 |
+;
+; %8 Index of SIMD register. Stores eighth eighth of output in reverse order as
+;    packed doublewords:
+;    | y31 | y30 | y29 | y28 |
+;    (each lane is the butterfly partner of the same lane of %1)
+;
+; %9 Register holding the memory address of input data. Modified by macro.
+;
+; %10 Register holding the distance between consecutive input elements,
+;     counted in 32-bit elements. Modified by macro.
+;
+; %11 Index of scratch SIMD register.
+;
+; %12 Index of scratch SIMD register.
+%macro IDCT2_1D_32 12 ; out[8], in, stride, temp[2] + %push idct_1d_32 + %define %$out0 %1 + %define %$out1 %2 + %define %$out2 %3 + %define %$out3 %4 + %define %$out4 %5 + %define %$out5 %6 + %define %$out6 %7 + %define %$out7 %8 + %define %$in %9 + %define %$stride %10 + %define %$temp0 %11 + %define %$temp1 %12 + %define %$temp2 %13 + %define %$temp3 %14 + + lea %$stride, [%$stride*2] + + ; even part + push %$in + push %$stride + IDCT2_1D_16 %$out0, %$out1, %$out2, %$out3, %$in, %$stride, %$temp0, %$temp1 + pop %$stride + pop %$in + + ; odd part + lea %$in, [%$in + %$stride*2] + MATVEC_MUL_16 %$out4, %$out5, %$out6, %$out7, %$in, %$stride, vvc_dct2_32_odd_mat, %$temp0, %$temp1 + + SUMSUB_BADC d, %$out7, %$out0, %$out6, %$out1, %$temp0 + SUMSUB_BADC d, %$out5, %$out2, %$out4, %$out3, %$temp0 + + %pop +%endmacro + +; Performs a single type-II DCT with length 64 +; +; %1 Index of SIMD register. Stores first eighth of output in mixed order as +; packed doublewords: +; | y0 | y1 | y2 | y3 | y7 | y6 | y5 | y4 | +; +; %2 Index of SIMD register. Stores second eighth of output in mixed order as +; packed doublewords: +; | y8 | y9 | y10 | y11 | y15 | y14 | y13 | y12 +; +; %3 Index of SIMD register. Stores third eighth of output in mixed order as +; packed doublewords: +; | y16 | y17 | y18 | y19 | y23 | y22 | y21 | y20 | +; +; %4 Index of SIMD register. Stores fourth eighth of output in mixed order as +; packed doublewords: +; | y24 | y25 | y26 | y27 | y31 | y30 | y29 | y28 | +; +; %5 Index of SIMD register. Stores fifth eighth of output in mixed order as +; packed doublewords: +; | y32 | y33 | y34 | y35 | y39 | y38 | y37 | y36 | +; +; %6 Index of SIMD register. Stores sixth eighth of output in mixed order as +; packed doublewords: +; | y40 | y41 | y42 | y43 | y47 | y46 | y45 | y44 | +; +; %7 Index of SIMD register. Stores seventh eighth of output in mixed order as +; packed doublewords: +; | y48 | y49 | y50 | y51 | y55 | y54 | y53 | y52 | +; +; %8 Index of SIMD register. 
Stores eigth eighth of output in mixed order as +; packed doublewords: +; | y56 | y57 | y58 | y59 | y63 | y62 | y61 | y60 | +; +; %9 Memory address of input data. Modified by macro. +; +; %10 Difference in memory address between input elements in bytes. +; Modified by macro. +; +; %11 Index of scratch SIMD register. + +; %12 Index of scratch SIMD register. +; +; %13 Index of scratch SIMD register. + +; %14 Index of scratch SIMD register. +; +; %15 Index of scratch SIMD register. +; +; %16 Index of scratch SIMD register. +%macro IDCT2_1D_64 16 ; out[8], in, stride, temp[6] + %push idct_1d_64 + %define %$out0 %1 + %define %$out1 %2 + %define %$out2 %3 + %define %$out3 %4 + %define %$out4 %5 + %define %$out5 %6 + %define %$out6 %7 + %define %$out7 %8 + %define %$in %9 + %define %$stride %10 + %define %$temp0 %11 + %define %$temp1 %12 + %define %$temp2 %13 + %define %$temp3 %14 + %define %$temp4 %15 + %define %$temp5 %16 + + lea %$stride, [%$stride*2] + + ; even part + push %$in + push %$stride + IDCT2_1D_32 %$out0, %$out1, %$out2, %$out3, %$out4, %$out5, %$out6, %$out7, %$in, %$stride, %$temp0, %$temp1 + pop %$stride + pop %$in + vinserti128 m%$out0, m%$out0, xm%$out1, 1 + vinserti128 m%$out1, m%$out2, xm%$out3, 1 + vinserti128 m%$out2, m%$out4, xm%$out5, 1 + vinserti128 m%$out3, m%$out6, xm%$out7, 1 + + ; odd part + lea %$in, [%$in + %$stride*2] + MATVEC_MUL_32 %$out4, %$out5, %$out6, %$out7, %$temp0, %$temp1, %$temp2, %$temp3, %$in, %$stride, vvc_dct2_64_odd_mat, %$temp4, %$temp5 + vinserti128 m%$out4, m%$out5, xm%$out4, 1 + vinserti128 m%$out5, m%$out7, xm%$out6, 1 + vinserti128 m%$out6, m%$temp1, xm%$temp0, 1 + vinserti128 m%$out7, m%$temp3, xm%$temp2, 1 + + SUMSUB_BADC d, %$out7, %$out0, %$out6, %$out1, %$temp0 + SUMSUB_BADC d, %$out5, %$out2, %$out4, %$out3, %$temp0 + + %pop +%endmacro + +; void ff_vvc_inv_dct2_2_avx2(int *out, ptrdiff_t out_stride, +; const int *in, ptrdiff_t in_stride); +cglobal vvc_inv_dct2_2, 4, 5, 3, out, out_stride, in, in_stride, \ + 
stride3
+    IDCT2_1D_2 0, inq, in_strideq
+
+    pextrd [outq + 0*out_strideq*4], xm0, 0
+    pextrd [outq + 1*out_strideq*4], xm0, 2
+    RET
+
+; void ff_vvc_inv_dct2_4_avx2(int *out, ptrdiff_t out_stride,
+;                             const int *in, ptrdiff_t in_stride);
+cglobal vvc_inv_dct2_4, 4, 5, 3, out, out_stride, in, in_stride, \
+                                 stride3
+    IDCT2_1D_4 0, inq, in_strideq, 1
+
+    lea stride3q, [out_strideq*3] ; offset of output row 3: must scale the output stride, not in_stride
+    pextrd [outq + 0*out_strideq*4], xm0, 0
+    pextrd [outq + 1*out_strideq*4], xm0, 1
+    pextrd [outq + 2*out_strideq*4], xm0, 2
+    pextrd [outq + stride3q*4], xm0, 3
+    RET
+
+; void ff_vvc_inv_dct2_8_avx2(int *out, ptrdiff_t out_stride,
+;                             const int *in, ptrdiff_t in_stride);
+cglobal vvc_inv_dct2_8, 4, 5, 3, out, out_stride, in, in_stride, \
+                                 stride3
+    IDCT2_1D_8 0, 1, inq, in_strideq, 2
+
+    pextrd [outq + 0*out_strideq*4], xm0, 0
+    pextrd [outq + 1*out_strideq*4], xm0, 1
+    lea outq, [outq + 2*out_strideq*4]
+    pextrd [outq + 0*out_strideq*4], xm0, 2
+    pextrd [outq + 1*out_strideq*4], xm0, 3
+    lea outq, [outq + 2*out_strideq*4]
+    pextrd [outq + 0*out_strideq*4], xm1, 3
+    pextrd [outq + 1*out_strideq*4], xm1, 2
+    lea outq, [outq + 2*out_strideq*4]
+    pextrd [outq + 0*out_strideq*4], xm1, 1
+    pextrd [outq + 1*out_strideq*4], xm1, 0
+    RET
+
+; void ff_vvc_inv_dct2_16_avx2(int *out, ptrdiff_t out_stride,
+;                              const int *in, ptrdiff_t in_stride);
+cglobal vvc_inv_dct2_16, 4, 5, 6, out, out_stride, in, in_stride, \
+                                  stride3
+    IDCT2_1D_16 0, 1, 2, 3, inq, in_strideq, 4, 5
+
+    pextrd [outq + 0*out_strideq*4], xm0, 0
+    pextrd [outq + 1*out_strideq*4], xm0, 1
+    lea outq, [outq + 2*out_strideq*4]
+    pextrd [outq + 0*out_strideq*4], xm0, 2
+    pextrd [outq + 1*out_strideq*4], xm0, 3
+    lea outq, [outq + 2*out_strideq*4]
+    pextrd [outq + 0*out_strideq*4], xm1, 3
+    pextrd [outq + 1*out_strideq*4], xm1, 2
+    lea outq, [outq + 2*out_strideq*4]
+    pextrd [outq + 0*out_strideq*4], xm1, 1
+    pextrd [outq + 1*out_strideq*4], xm1, 0
+    lea outq, [outq + 2*out_strideq*4]
+    pextrd [outq + 0*out_strideq*4], xm2, 0
+    pextrd [outq + 1*out_strideq*4], xm2, 1
+    lea outq, [outq + 2*out_strideq*4]
+    pextrd [outq + 0*out_strideq*4], xm2, 2
+    pextrd [outq + 1*out_strideq*4], xm2, 3
+    lea outq, [outq + 2*out_strideq*4]
+    pextrd [outq + 0*out_strideq*4], xm3, 3
+    pextrd [outq + 1*out_strideq*4], xm3, 2
+    lea outq, [outq + 2*out_strideq*4]
+    pextrd [outq + 0*out_strideq*4], xm3, 1
+    pextrd [outq + 1*out_strideq*4], xm3, 0
+    RET
+
+; void ff_vvc_inv_dct2_32_avx2(int *out, ptrdiff_t out_stride,
+;                              const int *in, ptrdiff_t in_stride);
+cglobal vvc_inv_dct2_32, 4, 5, 12, out, out_stride, in, in_stride, \
+                                   stride3
+    IDCT2_1D_32 0, 1, 2, 3, 4, 5, 6, 7, inq, in_strideq, 8, 9
+
+    pextrd [outq + 0*out_strideq*4], xm0, 0
+    pextrd [outq + 1*out_strideq*4], xm0, 1
+    lea outq, [outq + 2*out_strideq*4]
+    pextrd [outq + 0*out_strideq*4], xm0, 2
+    pextrd [outq + 1*out_strideq*4], xm0, 3
+    lea outq, [outq + 2*out_strideq*4]
+    pextrd [outq + 0*out_strideq*4], xm1, 3
+    pextrd [outq + 1*out_strideq*4], xm1, 2
+    lea outq, [outq + 2*out_strideq*4]
+    pextrd [outq + 0*out_strideq*4], xm1, 1
+    pextrd [outq + 1*out_strideq*4], xm1, 0
+    lea outq, [outq + 2*out_strideq*4]
+    pextrd [outq + 0*out_strideq*4], xm2, 0
+    pextrd [outq + 1*out_strideq*4], xm2, 1
+    lea outq, [outq + 2*out_strideq*4]
+    pextrd [outq + 0*out_strideq*4], xm2, 2
+    pextrd [outq + 1*out_strideq*4], xm2, 3
+    lea outq, [outq + 2*out_strideq*4]
+    pextrd [outq + 0*out_strideq*4], xm3, 3
+    pextrd [outq + 1*out_strideq*4], xm3, 2
+    lea outq, [outq + 2*out_strideq*4]
+    pextrd [outq + 0*out_strideq*4], xm3, 1
+    pextrd [outq + 1*out_strideq*4], xm3, 0
+    lea outq, [outq + 2*out_strideq*4]
+    pextrd [outq + 0*out_strideq*4], xm4, 0
+    pextrd [outq + 1*out_strideq*4], xm4, 1
+    lea outq, [outq + 2*out_strideq*4]
+    pextrd [outq + 0*out_strideq*4], xm4, 2
+    pextrd [outq + 1*out_strideq*4], xm4, 3
+    lea outq, [outq + 2*out_strideq*4]
+    pextrd [outq + 0*out_strideq*4], xm5, 3
+    pextrd [outq + 1*out_strideq*4], xm5, 2
+    lea outq, [outq + 2*out_strideq*4]
+    pextrd [outq + 0*out_strideq*4], xm5, 1
+    pextrd [outq + 1*out_strideq*4], xm5, 0
+    lea outq, [outq + 2*out_strideq*4]
+    pextrd [outq + 0*out_strideq*4], xm6, 0
+    pextrd [outq + 1*out_strideq*4], xm6, 1
+    lea outq, [outq + 2*out_strideq*4]
+    pextrd [outq + 0*out_strideq*4], xm6, 2
+    pextrd [outq + 1*out_strideq*4], xm6, 3
+    lea outq, [outq + 2*out_strideq*4]
+    pextrd [outq + 0*out_strideq*4], xm7, 3
+    pextrd [outq + 1*out_strideq*4], xm7, 2
+    lea outq, [outq + 2*out_strideq*4]
+    pextrd [outq + 0*out_strideq*4], xm7, 1
+    pextrd [outq + 1*out_strideq*4], xm7, 0
+    RET
+
+; void ff_vvc_inv_dct2_64_avx2(int *out, ptrdiff_t out_stride,
+;                              const int *in, ptrdiff_t in_stride);
+cglobal vvc_inv_dct2_64, 4, 5, 14, out, out_stride, in, in_stride, \
+                                   stride3
+    IDCT2_1D_64 0, 1, 2, 3, 4, 5, 6, 7, inq, in_strideq, 8, 9, 10, 11, 12, 13
+
+    %assign i 0
+    %rep 4
+        pextrd [outq + 0*out_strideq*4], xm %+ i, 0
+        pextrd [outq + 1*out_strideq*4], xm %+ i, 1
+        lea outq, [outq + 2*out_strideq*4]
+        pextrd [outq + 0*out_strideq*4], xm %+ i, 2
+        pextrd [outq + 1*out_strideq*4], xm %+ i, 3
+        vperm2i128 m %+ i, m %+ i, m %+ i, 21o
+        lea outq, [outq + 2*out_strideq*4]
+        pextrd [outq + 0*out_strideq*4], xm %+ i, 3
+        pextrd [outq + 1*out_strideq*4], xm %+ i, 2
+        lea outq, [outq + 2*out_strideq*4]
+        pextrd [outq + 0*out_strideq*4], xm %+ i, 1
+        pextrd [outq + 1*out_strideq*4], xm %+ i, 0
+        lea outq, [outq + 2*out_strideq*4]
+        %assign i i+1
+    %endrep
+    %rep 4
+        %assign upper_half i-4
+        vextracti128 xm %+ upper_half, m %+ i, 1
+        pextrd [outq + 0*out_strideq*4], xm %+ upper_half, 0
+        pextrd [outq + 1*out_strideq*4], xm %+ upper_half, 1
+        lea outq, [outq + 2*out_strideq*4]
+        pextrd [outq + 0*out_strideq*4], xm %+ upper_half, 2
+        pextrd [outq + 1*out_strideq*4], xm %+ upper_half, 3
+        lea outq, [outq + 2*out_strideq*4]
+        pextrd [outq + 0*out_strideq*4], xm %+ i, 3
+        pextrd [outq + 1*out_strideq*4], xm %+ i, 2
+        lea outq, [outq + 2*out_strideq*4]
+        pextrd [outq + 0*out_strideq*4], xm %+ i, 1
+        pextrd [outq +
1*out_strideq*4], xm %+ i, 0
+        lea outq, [outq + 2*out_strideq*4]
+        %assign i i+1
+    %endrep
+    RET
diff --git a/libavcodec/x86/vvcdsp_init.c b/libavcodec/x86/vvcdsp_init.c
index a5849e3a2ed..e2ef7787c55 100644
--- a/libavcodec/x86/vvcdsp_init.c
+++ b/libavcodec/x86/vvcdsp_init.c
@@ -241,11 +241,38 @@ PUT_VVC_LUMA_FORWARD_FUNCS(12, avx512icl)
     c->inter.put[LUMA][1][1] = ff_vvc_put_vvc_luma_hv_##bitd##_##opt; \
 } while (0)
 
-void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bit_depth)
+/* Declare the prototype of one assembly inverse-transform function. */
+#define ITX_FUNC(type, size, opt) \
+void ff_vvc_inv_##type##_##size##_##opt(int *out, ptrdiff_t out_stride, \
+                                         const int *in, ptrdiff_t in_stride)
+
+ITX_FUNC(dct2, 2, avx2);
+ITX_FUNC(dct2, 4, avx2);
+ITX_FUNC(dct2, 8, avx2);
+ITX_FUNC(dct2, 16, avx2);
+ITX_FUNC(dct2, 32, avx2);
+ITX_FUNC(dct2, 64, avx2);
+
+/* Install the inverse DCT-II functions for sizes 2 to 64 into c->itx. */
+#define IDCT2_INIT(opt) do { \
+    c->itx.itx[DCT2][0] = ff_vvc_inv_dct2_2_##opt; \
+    c->itx.itx[DCT2][1] = ff_vvc_inv_dct2_4_##opt; \
+    c->itx.itx[DCT2][2] = ff_vvc_inv_dct2_8_##opt; \
+    c->itx.itx[DCT2][3] = ff_vvc_inv_dct2_16_##opt; \
+    c->itx.itx[DCT2][4] = ff_vvc_inv_dct2_32_##opt; \
+    c->itx.itx[DCT2][5] = ff_vvc_inv_dct2_64_##opt; \
+} while (0)
+
+void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bit_depth,
+                         int extended_precision_flag)
 {
     const int cpu_flags = av_get_cpu_flags();
 
     if (EXTERNAL_AVX2(cpu_flags)) {
+        if (!extended_precision_flag) {
+            IDCT2_INIT(avx2);
+        }
+
         switch (bit_depth) {
         case 8:
             ALF_DSP(8);
diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 9a2105da3b3..762b4989cc9 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -36,6 +36,9 @@ AVCODECOBJS-$(CONFIG_V210_ENCODER) += v210enc.o
 AVCODECOBJS-$(CONFIG_VORBIS_DECODER) += vorbisdsp.o
 AVCODECOBJS-$(CONFIG_VP9_DECODER) += vp9dsp.o
-AVCODECOBJS-$(CONFIG_VVC_DECODER) += vvc_alf.o vvc_sao.o vvc_mc.o
+AVCODECOBJS-$(CONFIG_VVC_DECODER) += vvc_alf.o \
+                                     vvc_sao.o \
+                                     vvc_mc.o \
+                                     vvc_itx_1d.o
 
 CHECKASMOBJS-$(CONFIG_AVCODEC) += $(AVCODECOBJS-yes)
 
diff --git
a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index c4f80ece513..73d79b028ee 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -179,6 +179,7 @@ static const struct {
         { "vvc_alf", checkasm_check_vvc_alf },
         { "vvc_sao", checkasm_check_vvc_sao },
         { "vvc_mc", checkasm_check_vvc_mc },
+        { "vvc_itx_1d", checkasm_check_vvc_itx_1d },
     #endif
 #endif
 #if CONFIG_AVFILTER
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index a82e157a4ea..d78ffe73d12 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -97,6 +97,7 @@ void checkasm_check_vorbisdsp(void);
 void checkasm_check_vvc_alf(void);
 void checkasm_check_vvc_sao(void);
 void checkasm_check_vvc_mc(void);
+void checkasm_check_vvc_itx_1d(void);
 
 struct CheckasmPerf;
 
diff --git a/tests/checkasm/vvc_alf.c b/tests/checkasm/vvc_alf.c
index 080f9430521..43201c05571 100644
--- a/tests/checkasm/vvc_alf.c
+++ b/tests/checkasm/vvc_alf.c
@@ -167,12 +167,12 @@ void checkasm_check_vvc_alf(void)
     int bit_depth;
     VVCDSPContext h;
     for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
-        ff_vvc_dsp_init(&h, bit_depth);
+        ff_vvc_dsp_init(&h, bit_depth, 0);
         check_alf_filter(&h, bit_depth);
     }
     report("alf_filter");
     for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
-        ff_vvc_dsp_init(&h, bit_depth);
+        ff_vvc_dsp_init(&h, bit_depth, 0);
         check_alf_classify(&h, bit_depth);
     }
     report("alf_classify");
diff --git a/tests/checkasm/vvc_itx_1d.c b/tests/checkasm/vvc_itx_1d.c
new file mode 100644
index 00000000000..51b917c7214
--- /dev/null
+++ b/tests/checkasm/vvc_itx_1d.c
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2023 Frank Plowman
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "libavutil/mem_internal.h"
+
+#include "libavcodec/avcodec.h"
+
+#include "libavcodec/vvc/vvcdsp.h"
+#include "libavcodec/vvc/vvcdec.h"
+
+#include "checkasm.h"
+
+#define SIZEOF_PIXEL ((bit_depth + 7) / 8)
+#define PIXEL_STRIDE (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE)
+#define BUF_SIZE (PIXEL_STRIDE * MAX_TB_SIZE)
+
+/* Fill buf0 and buf1 with identical pseudo-random values in the int16_t range. */
+#define randomize_buffers(buf0, buf1, size) \
+    do { \
+        int k; \
+        for (k = 0; k < size; ++k) { \
+            uint32_t r = rnd(); \
+            int32_t a = INT16_MIN + r / (UINT32_MAX / (INT16_MAX - INT16_MIN + 1) + 1); \
+            AV_WN32A(buf0 + k, a); \
+            AV_WN32A(buf1 + k, a); \
+        } \
+    } while (0)
+
+/*
+ * Run every inverse DCT-II entry of the DSP context (sizes 2 to 64) through
+ * call_ref/call_new on identical random input, compare the outputs and benchmark.
+ */
+static void check_idct2(VVCDSPContext h, int bit_depth)
+{
+    LOCAL_ALIGNED_32(int, ref_dst, [BUF_SIZE]);
+    LOCAL_ALIGNED_32(int, new_dst, [BUF_SIZE]);
+    LOCAL_ALIGNED_32(int, ref_src, [BUF_SIZE]);
+    LOCAL_ALIGNED_32(int, new_src, [BUF_SIZE]);
+
+    const ptrdiff_t stride = PIXEL_STRIDE * SIZEOF_PIXEL;
+
+    for (int log2_size = 1; log2_size <= 6; log2_size++) {
+        const int size = 1 << log2_size;
+        declare_func_emms(AV_CPU_FLAG_MMX, void, int *dst, ptrdiff_t dst_stride,
+                          const int *src, ptrdiff_t src_stride);
+
+        randomize_buffers(ref_src, new_src, BUF_SIZE);
+        /* memset() counts bytes: clear all BUF_SIZE ints, not just BUF_SIZE bytes */
+        memset(ref_dst, 0, BUF_SIZE * sizeof(*ref_dst));
+        memset(new_dst, 0, BUF_SIZE * sizeof(*new_dst));
+
+        if (check_func(h.itx.itx[DCT2][log2_size - 1], "vvc_inv_dct2_%d", size)) {
+            call_ref(ref_dst, stride, ref_src, stride);
+            call_new(new_dst, stride, new_src, stride);
+            checkasm_check_int32_t(__FILE__, __LINE__,
+                                   ref_dst, stride * sizeof(int),
+                                   new_dst, stride * sizeof(int),
+                                   1, size, "dst");
+            /* benchmark only when check_func() selected a function to test */
+            bench_new(new_dst, stride, new_src, stride);
+        }
+    }
+}
+
+void checkasm_check_vvc_itx_1d(void)
+{
+    VVCDSPContext h;
+    ff_vvc_dsp_init(&h, 8, 0);
+    check_idct2(h, 8);
+    report("idct2");
+}
diff --git a/tests/checkasm/vvc_mc.c b/tests/checkasm/vvc_mc.c
index 4c4225b29a7..1dd96d2744d 100644
--- a/tests/checkasm/vvc_mc.c
+++ b/tests/checkasm/vvc_mc.c
@@ -93,7 +93,7 @@ void checkasm_check_vvc_mc(void)
     int bit_depth;
     VVCDSPContext h;
     for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
-        ff_vvc_dsp_init(&h, bit_depth);
+        ff_vvc_dsp_init(&h, bit_depth, 0);
         check_put_vvc_luma(&h, bit_depth);
     }
 
diff --git a/tests/checkasm/vvc_sao.c b/tests/checkasm/vvc_sao.c
index 68bf4b11025..5bf43d0abbf 100644
--- a/tests/checkasm/vvc_sao.c
+++ b/tests/checkasm/vvc_sao.c
@@ -150,14 +150,14 @@ void checkasm_check_vvc_sao(void)
 
     for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
         VVCDSPContext h;
-        ff_vvc_dsp_init(&h, bit_depth);
+        ff_vvc_dsp_init(&h, bit_depth, 0);
         check_sao_band(h, bit_depth);
     }
     report("sao_band");
 
     for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
         VVCDSPContext h;
-        ff_vvc_dsp_init(&h, bit_depth);
+        ff_vvc_dsp_init(&h, bit_depth, 0);
         check_sao_edge(h, bit_depth);
     }
     report("sao_edge");