; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; Copyright © 2021, Matthias Dressel
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 32
itx4_shuf: dd 0x50401600, 0xd0c09284, 0x70603422, 0xf0e0b0a6
dd 0x50401701, 0xd0c09385, 0x70603523, 0xf0e0b1a7
idct4_12_shuf: dd 0, 2, 4, 6, 1, 3, 5, 7
idct4_12_shuf2: dd 2, 0, 6, 4, 3, 1, 7, 5
iadst8_12_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7
idct16_12_shuf: dd 0, 4, 1, 5, 3, 7, 2, 6
iadst16_12_shuf: dd 3, 7, 0, 4, 2, 6, 1, 5
pw_2048_m2048: dw 2048, 2048, 2048, 2048, -2048, -2048, -2048, -2048
idct4_shuf: db 0, 1, 4, 5, 12, 13, 8, 9, 2, 3, 6, 7, 14, 15, 10, 11
idct32_shuf: db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15

%macro COEF_PAIR 2-3 0
pd_%1_%2: dd %1, %1, %2, %2
%define pd_%1 (pd_%1_%2 + 4*0)
%define pd_%2 (pd_%1_%2 + 4*2)
%if %3
dd -%2, -%2
%define pd_%2_m%2 pd_%2
%endif
%endmacro
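; For illustration, derived from the macro above: "COEF_PAIR 201, 995"
; expands to
;   pd_201_995: dd 201, 201, 995, 995
;   %define pd_201 (pd_201_995 + 4*0)
;   %define pd_995 (pd_201_995 + 4*2)
; so one 16-byte entry serves both scalar vpbroadcastd and packed
; vbroadcasti128 loads. With the optional third argument set, two negated
; copies of the second constant follow, so a vbroadcasti128 from pd_%2_m%2
; yields {%2, %2, -%2, -%2}. The values themselves appear to be the usual
; 12-bit AV1 trig constants, i.e. round(4096*cos(k*pi/64)) and friends
; (2896 ~ 4096/sqrt(2), 3784 ~ 4096*cos(pi/8)).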

COEF_PAIR 201, 995
COEF_PAIR 401, 1931
COEF_PAIR 799, 3406
COEF_PAIR 1380, 601
COEF_PAIR 1751, 2440
COEF_PAIR 2598, 1189
COEF_PAIR 2751, 2106
COEF_PAIR 2896, 1567, 1
COEF_PAIR 2896, 3784, 1
COEF_PAIR 3035, 3513
COEF_PAIR 3166, 3920
COEF_PAIR 3703, 3290
COEF_PAIR 3857, 4052
COEF_PAIR 4017, 2276
COEF_PAIR 4076, 3612
COEF_PAIR 4091, 3973

pd_8: dd 8
pd_m601: dd -601
pd_m1189: dd -1189
pd_m1380: dd -1380
pd_m2106: dd -2106
pd_m2598: dd -2598
pd_m2751: dd -2751
pd_m3344: dd -3344
pd_1024: dd 1024
pd_1321: dd 1321
pd_1448: dd 1448
pd_1697: dd 1697
pd_2482: dd 2482
pd_3072: dd 3072 ; 1024 + 2048
pd_3803: dd 3803
pd_5119: dd 5119 ; 1024 + 4096 - 1
pd_5120: dd 5120 ; 1024 + 4096
pd_5793: dd 5793
pd_6144: dd 6144 ; 2048 + 4096
pd_17408: dd 17408 ; 1024 + 16384

pixel_10bpc_max: times 2 dw 0x03ff
pixel_12bpc_max: times 2 dw 0x0fff
dconly_10bpc: times 2 dw 0x7c00
dconly_12bpc: times 2 dw 0x7000
clip_18b_min: dd -0x20000
clip_18b_max: dd 0x1ffff
clip_20b_min: dd -0x80000
clip_20b_max: dd 0x7ffff
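; The clip_18b/clip_20b bounds clamp intermediate coefficients to the signed
; (8 + bitdepth)-bit range required between transform stages: +-2^17 for
; 10 bpc and +-2^19 for 12 bpc.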

const idct64_mul_16bpc
dd 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474, 401, 4076, 799, 4017
dd -700, 4036, 2359, 3349, -2191, 3461, 897, 3996, -2598, -3166, -4017, -799
dd 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092, 1931, 3612, 3406, 2276
dd -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889, -1189, -3920, -2276, -3406

cextern deint_shuf
cextern idct64_mul
cextern pw_1697x8
cextern pw_1697x16
cextern pw_1567_3784
cextern pw_m1567_m3784
cextern pw_m3784_1567
cextern pw_2896_2896
cextern pw_m2896_2896
cextern pw_5
cextern pw_2048
cextern pw_4096
cextern pw_8192
cextern pw_16384
cextern pw_2896x8
cextern pd_2048

cextern idct_4x8_internal_8bpc_avx2.main
cextern idct_4x16_internal_8bpc_avx2.main
cextern idct_8x8_internal_8bpc_avx2.main
cextern idct_8x16_internal_8bpc_avx2.main
cextern idct_16x4_internal_8bpc_avx2.main
cextern idct_16x8_internal_8bpc_avx2.main
cextern idct_16x16_internal_8bpc_avx2.main
cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main
cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main_fast
cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf
cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf_fast
cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part1
cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part2_internal

cextern iadst_4x4_internal_8bpc_avx2.main
cextern iadst_4x8_internal_8bpc_avx2.main_pass2
cextern iadst_4x16_internal_8bpc_avx2.main2
cextern iadst_8x4_internal_8bpc_avx2.main
cextern iadst_8x8_internal_8bpc_avx2.main_pass2
cextern iadst_8x16_internal_8bpc_avx2.main
cextern iadst_8x16_internal_8bpc_avx2.main_pass2_end
cextern iadst_16x4_internal_8bpc_avx2.main
cextern iadst_16x8_internal_8bpc_avx2.main
cextern iadst_16x8_internal_8bpc_avx2.main_pass2_end
cextern iadst_16x16_internal_8bpc_avx2.main
cextern iadst_16x16_internal_8bpc_avx2.main_pass2_end

SECTION .text

%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)

%macro WRAP_XMM 1+
INIT_XMM cpuname
%1
INIT_YMM cpuname
%endmacro

%macro IWHT4_1D_PACKED 0
; m0 = in0 in2, m1 = in1 in3
psubd m2, m0, m1 ; t2
paddd xm0, xm1 ; t0
vpermq m2, m2, q3322
vpermq m0, m0, q1100
vpermq m1, m1, q3120
psubd m3, m0, m2
psrad m3, 1
psubd m3, m1 ; t1 t3
psubd m0, m3 ; ____ out0
paddd m2, m3 ; out3 ____
%endmacro
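; Scalar sketch of the packed inverse WHT lifting above (reconstructed from
; the vector ops; per-lane placement aside):
;   t0 = in0 + in1
;   t2 = in2 - in3
;   t4 = (t0 - t2) >> 1
;   t1 = t4 - in1, t3 = t4 - in3
;   out0 = t0 - t3, out1 = t3, out2 = t1, out3 = t2 + t1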

INIT_YMM avx2
cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax
mova xm0, [cq+16*0]
vinserti128 m0, [cq+16*2], 1
mova xm1, [cq+16*1]
vinserti128 m1, [cq+16*3], 1
pxor m4, m4
mova [cq+32*0], m4
mova [cq+32*1], m4
lea r6, [dstq+strideq*2]
psrad m0, 2
psrad m1, 2
IWHT4_1D_PACKED
punpckhdq m0, m3
punpckldq m3, m2
punpckhqdq m1, m0, m3
punpcklqdq m0, m3
IWHT4_1D_PACKED
vpblendd m0, m2, 0x33
packssdw m0, m3
vextracti128 xm2, m0, 1
punpckhdq xm1, xm0, xm2 ; out2 out1
punpckldq xm0, xm2 ; out3 out0
movq xm2, [r6 +strideq*1]
movhps xm2, [dstq+strideq*0]
movq xm3, [r6 +strideq*0]
movhps xm3, [dstq+strideq*1]
%ifidn bdmaxd, bdmaxm
movd xm5, bdmaxd
vpbroadcastw xm5, xm5
%else ; win64: load from stack
vpbroadcastw xm5, bdmaxm
%endif
paddsw xm0, xm2
paddsw xm1, xm3
pmaxsw xm0, xm4
pmaxsw xm1, xm4
pminsw xm0, xm5
pminsw xm1, xm5
movhps [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm1
movq [r6 +strideq*0], xm1
movq [r6 +strideq*1], xm0
RET

; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
; flags: 1 = packed, 2 = inv_dst2
; skip round/shift if rnd is not a number
%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags
%if %8 < 32
pmulld m%4, m%1, m%8
pmulld m%3, m%2, m%8
%else
%if %9 & 1
vbroadcasti128 m%3, [pd_%8]
%else
vpbroadcastd m%3, [pd_%8]
%endif
pmulld m%4, m%1, m%3
pmulld m%3, m%2
%endif
%if %7 < 32
pmulld m%1, m%7
pmulld m%2, m%7
%else
%if %9 & 1
vbroadcasti128 m%5, [pd_%7]
%else
vpbroadcastd m%5, [pd_%7]
%endif
pmulld m%1, m%5
pmulld m%2, m%5
%endif
%if %9 & 2
psubd m%4, m%6, m%4
psubd m%2, m%4, m%2
%else
%ifnum %6
paddd m%4, m%6
%endif
paddd m%2, m%4
%endif
%ifnum %6
paddd m%1, m%6
%endif
psubd m%1, m%3
%ifnum %6
psrad m%2, 12
psrad m%1, 12
%endif
%endmacro
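; Usage sketch (hypothetical invocation): with m5 preloaded from [pd_2048],
;   ITX_MULSUB_2D 0, 1, 2, 3, 4, 5, 1567, 3784
; performs the pi/8 rotation
;   m0' = (m0*1567 - m1*3784 + 2048) >> 12
;   m1' = (m0*3784 + m1*1567 + 2048) >> 12
; A coefficient argument < 32 names a register that already holds the
; broadcast constant; anything larger is loaded from the matching pd_* entry
; (vbroadcasti128 when the packed flag is set, vpbroadcastd otherwise).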

%macro INV_TXFM_FN 4-5 10 ; type1, type2, eob_offset, size, bitdepth
cglobal inv_txfm_add_%1_%2_%4_%5bpc, 4, 5, 0, dst, stride, c, eob, tx2
%define %%p1 m(i%1_%4_internal_%5bpc)
; Jump to the 1st txfm function if we're not taking the fast path, which
; in turn performs an indirect jump to the 2nd txfm function.
lea tx2q, [m(i%2_%4_internal_%5bpc).pass2]
%ifidn %1_%2, dct_dct
test eobd, eobd
jnz %%p1
%else
%if %3
add eobd, %3
%endif
; jump to the 1st txfm function unless it's located directly after this
times ((%%end - %%p1) >> 31) & 1 jmp %%p1
ALIGN function_align
%%end:
%endif
%endmacro
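; Illustration: "INV_TXFM_FN dct, adst, 0, 4x4, 10" defines
; inv_txfm_add_dct_adst_4x4_10bpc, which points tx2q at
; iadst_4x4_internal_10bpc.pass2 and then enters idct_4x4_internal_10bpc
; (falling through when the assembler placed it directly after, jumping
; otherwise); pass 1 of the internal function ends with "jmp tx2q" into the
; second transform.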

%macro INV_TXFM_4X4_FN 2-3 10 ; type1, type2, bitdepth
INV_TXFM_FN %1, %2, 0, 4x4, %3
%ifidn %1_%2, dct_dct
vpbroadcastd xm2, [dconly_%3bpc]
%if %3 = 10
.dconly:
imul r6d, [cq], 181
mov [cq], eobd ; 0
or r3d, 4
.dconly2:
add r6d, 128
sar r6d, 8
.dconly3:
imul r6d, 181
add r6d, 2176
sar r6d, 12
movd xm0, r6d
paddsw xm0, xm2
vpbroadcastw xm0, xm0
.dconly_loop:
movq xm1, [dstq+strideq*0]
movhps xm1, [dstq+strideq*1]
paddsw xm1, xm0
psubusw xm1, xm2
movq [dstq+strideq*0], xm1
movhps [dstq+strideq*1], xm1
lea dstq, [dstq+strideq*2]
sub r3d, 2
jg .dconly_loop
WRAP_XMM RET
%else
jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly
%endif
%endif
%endmacro
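; .dconly notes: with eob == 0 only the DC coefficient is present, so both
; 1-D transforms collapse to a scalar scale. 181/256 approximates 1/sqrt(2)
; (181 = 2896 >> 4), applied once per dimension; the "+2176 >> 12" merges the
; second scale's +128 rounding with the final (x + 8) >> 4 down-shift
; (128 + 8*256 = 2176). The paddsw/psubusw pair against dconly_10bpc
; (0x7c00 = 0x7fff - 0x3ff) then clips to [0, 1023] with saturating
; arithmetic alone: overflows stick at 0x7fff before the subtract, negative
; results stick at zero after it.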

%macro IDCT4_1D_PACKED 6 ; dst/src[1-2], tmp[1-3], rnd
ITX_MULSUB_2D %1, %2, %3, %4, %5, %6, 2896_1567, 2896_3784, 1
punpckhqdq m%3, m%2, m%1 ; t3 t2
punpcklqdq m%2, m%1 ; t0 t1
paddd m%1, m%2, m%3 ; out0 out1
psubd m%2, m%3 ; out3 out2
%endmacro
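; Scalar view of the packed 4-point inverse DCT above (rnd = pd_2048):
;   t0 = ((in0 + in2) * 2896 + 2048) >> 12
;   t1 = ((in0 - in2) * 2896 + 2048) >> 12
;   t2 = (in1*1567 - in3*3784 + 2048) >> 12
;   t3 = (in1*3784 + in3*1567 + 2048) >> 12
;   out0 = t0 + t3, out1 = t1 + t2, out2 = t1 - t2, out3 = t0 - t3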

%macro IDCT4_1D_PACKED_WORD 6 ; dst/src[1-2], tmp[1-3], rnd
vpbroadcastd m%5, [pw_m3784_1567]
punpckhwd m%3, m%2, m%1
vpbroadcastd m%4, [pw_1567_3784]
punpcklwd m%2, m%1
vpbroadcastd m%1, [pw_m2896_2896]
pmaddwd m%5, m%3
pmaddwd m%3, m%4
vpbroadcastd m%4, [pw_2896_2896]
pmaddwd m%1, m%2
pmaddwd m%2, m%4
REPX {paddd x, m%6}, m%5, m%3, m%1, m%2
REPX {psrad x, 12 }, m%5, m%3, m%1, m%2
packssdw m%3, m%5 ; t3 t2
packssdw m%2, m%1 ; t0 t1
paddsw m%1, m%2, m%3 ; out0 out1
psubsw m%2, m%3 ; out3 out2
%endmacro
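; The word-domain variant computes the same rotations with fewer multiplies:
; interleaving the two rows lets each pmaddwd against a broadcast
; (coef, coef') word pair produce src1*coef +- src2*coef' per dword in a
; single instruction, at the cost of 16-bit intermediate precision.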

INV_TXFM_4X4_FN dct, dct
INV_TXFM_4X4_FN dct, identity
INV_TXFM_4X4_FN dct, adst
INV_TXFM_4X4_FN dct, flipadst

cglobal idct_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
call .main
vbroadcasti128 m2, [idct4_shuf]
packssdw m0, m1
pshufb m0, m2
jmp tx2q
.pass2:
vextracti128 xm1, m0, 1
WRAP_XMM IDCT4_1D_PACKED_WORD 0, 1, 2, 3, 4, 5
packssdw xm5, xm5 ; pw_2048
pmulhrsw xm0, xm5
pmulhrsw xm1, xm5
movq xm2, [dstq+strideq*0]
movhps xm2, [dstq+strideq*1]
lea r6, [dstq+strideq*2]
movq xm3, [r6 +strideq*1]
movhps xm3, [r6 +strideq*0]
vpbroadcastd xm5, [pixel_10bpc_max]
pxor m4, m4
mova [cq+32*0], m4
mova [cq+32*1], m4
paddw xm0, xm2
paddw xm1, xm3
pmaxsw xm0, xm4
pmaxsw xm1, xm4
pminsw xm0, xm5
pminsw xm1, xm5
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
movhps [r6 +strideq*0], xm1
movq [r6 +strideq*1], xm1
RET
ALIGN function_align
.main:
vpermq m0, [cq+32*0], q3120
vpermq m1, [cq+32*1], q3120
vpbroadcastd m5, [pd_2048]
.main2:
IDCT4_1D_PACKED 0, 1, 2, 3, 4, 5
ret

INV_TXFM_4X4_FN adst, dct
INV_TXFM_4X4_FN adst, adst
INV_TXFM_4X4_FN adst, flipadst
INV_TXFM_4X4_FN adst, identity

%macro IADST4_1D 0
vpbroadcastd m5, [pd_1321]
vpbroadcastd m7, [pd_2482]
pmulld m4, m0, m5 ; 1321*in0
pmulld m6, m3, m7 ; 2482*in3
paddd m4, m6 ; 1321*in0 + 2482*in3
pmulld m6, m0, m7 ; 2482*in0
paddd m0, m3 ; in0 + in3
paddd m7, m5 ; pd_3803
pmulld m5, m2 ; 1321*in2
pmulld m3, m7 ; 3803*in3
pmulld m7, m2 ; 3803*in2
psubd m2, m0 ; in2 - in0 - in3
vpbroadcastd m0, [pd_m3344]
pmulld m1, m0 ; -t3
pmulld m2, m0 ; out2 (unrounded)
psubd m6, m5 ; 2482*in0 - 1321*in2
paddd m4, m7 ; t0
psubd m6, m3 ; t1
paddd m3, m4, m6
psubd m4, m1 ; out0 (unrounded)
psubd m6, m1 ; out1 (unrounded)
paddd m3, m1 ; out3 (unrounded)
%endmacro
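; Scalar sketch of the 4-point inverse ADST above (outputs are left
; unrounded; callers add 2048 and shift right by 12):
;   t0 = 1321*in0 + 3803*in2 + 2482*in3
;   t1 = 2482*in0 - 1321*in2 - 3803*in3
;   t2 = 3344*in1
;   out0 = t0 + t2, out1 = t1 + t2
;   out2 = 3344*(in0 - in2 + in3)
;   out3 = t0 + t1 - t2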

cglobal iadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
call .main
vinserti128 m0, m4, xm6, 1
vinserti128 m1, m2, xm3, 1
.pass1_end:
vpbroadcastd m5, [pd_2048]
mova m2, [itx4_shuf]
paddd m0, m5
paddd m1, m5
psrad m0, 12
psrad m1, 12
packssdw m0, m1
vpermd m0, m2, m0
psrld m2, 4
pshufb m0, m2
%if WIN64
movaps xmm6, [rsp+ 8]
movaps xmm7, [rsp+24]
%endif
jmp tx2q
.pass2:
lea r6, [deint_shuf+128]
vextracti128 xm1, m0, 1
call m(iadst_4x4_internal_8bpc).main
.end:
vpbroadcastd xm4, [pw_2048]
movq xm2, [dstq+strideq*0]
movhps xm2, [dstq+strideq*1]
lea r6, [dstq+strideq*2]
movq xm3, [r6 +strideq*0]
movhps xm3, [r6 +strideq*1]
vpbroadcastd xm5, [pixel_10bpc_max]
pmulhrsw xm0, xm4
pmulhrsw xm1, xm4
pxor m4, m4
mova [cq+32*0], m4
mova [cq+32*1], m4
paddw xm0, xm2
paddw xm1, xm3
pmaxsw xm0, xm4
pmaxsw xm1, xm4
pminsw xm0, xm5
pminsw xm1, xm5
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
movq [r6 +strideq*0], xm1
movhps [r6 +strideq*1], xm1
RET
ALIGN function_align
.main:
mova xm0, [cq+16*0]
mova xm1, [cq+16*1]
mova xm2, [cq+16*2]
mova xm3, [cq+16*3]
%if WIN64
movaps [rsp+16], xmm6
movaps [rsp+32], xmm7
%endif
.main2:
WRAP_XMM IADST4_1D
ret

INV_TXFM_4X4_FN flipadst, dct
INV_TXFM_4X4_FN flipadst, adst
INV_TXFM_4X4_FN flipadst, flipadst
INV_TXFM_4X4_FN flipadst, identity

cglobal iflipadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
call m(iadst_4x4_internal_10bpc).main
vinserti128 m0, m3, xm2, 1
vinserti128 m1, m6, xm4, 1
jmp m(iadst_4x4_internal_10bpc).pass1_end
.pass2:
lea r6, [deint_shuf+128]
vextracti128 xm1, m0, 1
call m(iadst_4x4_internal_8bpc).main
vpbroadcastd xm4, [pw_2048]
movq xm3, [dstq+strideq*1]
movhps xm3, [dstq+strideq*0]
lea r6, [dstq+strideq*2]
movq xm2, [r6 +strideq*1]
movhps xm2, [r6 +strideq*0]
vpbroadcastd xm5, [pixel_10bpc_max]
pmulhrsw xm0, xm4
pmulhrsw xm1, xm4
pxor m4, m4
mova [cq+32*0], m4
mova [cq+32*1], m4
paddw xm0, xm2
paddw xm1, xm3
pmaxsw xm0, xm4
pmaxsw xm1, xm4
pminsw xm0, xm5
pminsw xm1, xm5
movhps [dstq+strideq*0], xm1
movq [dstq+strideq*1], xm1
movhps [r6 +strideq*0], xm0
movq [r6 +strideq*1], xm0
RET

INV_TXFM_4X4_FN identity, dct
INV_TXFM_4X4_FN identity, adst
INV_TXFM_4X4_FN identity, flipadst
INV_TXFM_4X4_FN identity, identity

cglobal iidentity_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
vpbroadcastd m1, [pd_5793]
pmulld m0, m1, [cq+32*0]
pmulld m1, [cq+32*1]
vpbroadcastd m5, [pd_2048]
mova m3, [itx4_shuf]
paddd m0, m5
paddd m1, m5
psrad m0, 12
psrad m1, 12
packssdw m0, m1
vpermd m0, m3, m0
psrld m3, 4
pshufb m0, m3
jmp tx2q
.pass2:
vpbroadcastd m1, [pw_1697x8]
movq xm2, [dstq+strideq*0]
movhps xm2, [dstq+strideq*1]
lea r6, [dstq+strideq*2]
pmulhrsw m1, m0
paddsw m0, m1
movq xm3, [r6 +strideq*0]
movhps xm3, [r6 +strideq*1]
vpbroadcastd xm4, [pixel_10bpc_max]
packssdw m5, m5 ; pw_2048
pmulhrsw m0, m5
pxor m5, m5
mova [cq+32*0], m5
mova [cq+32*1], m5
vextracti128 xm1, m0, 1
paddw xm0, xm2
paddw xm1, xm3
pmaxsw xm0, xm5
pmaxsw xm1, xm5
pminsw xm0, xm4
pminsw xm1, xm4
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
movq [r6 +strideq*0], xm1
movhps [r6 +strideq*1], xm1
RET
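; The identity constants are sqrt(2) in different fixed-point forms:
; 5793 ~ 4096*sqrt(2) for the dword path, while in the word path
; x + pmulhrsw(x, pw_1697x8) ~ x*sqrt(2), since 1697*8/32768 ~ sqrt(2)-1.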

INV_TXFM_4X4_FN dct, dct, 12
INV_TXFM_4X4_FN dct, identity, 12
INV_TXFM_4X4_FN dct, adst, 12
INV_TXFM_4X4_FN dct, flipadst, 12

cglobal idct_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
call m(idct_4x4_internal_10bpc).main
mova m3, [idct4_12_shuf]
mova m4, [idct4_12_shuf2]
vpermd m2, m4, m1
vpermd m1, m3, m0
jmp m(iadst_4x4_internal_12bpc).pass1_end2
.pass2:
vpbroadcastd m5, [pd_2048]
vpermq m0, m0, q3120
vpermq m1, m1, q3120
call m(idct_4x4_internal_10bpc).main2
vpermq m0, m0, q3120
vpermq m1, m1, q2031
jmp m(iadst_4x4_internal_12bpc).end

INV_TXFM_4X4_FN adst, dct, 12
INV_TXFM_4X4_FN adst, adst, 12
INV_TXFM_4X4_FN adst, flipadst, 12
INV_TXFM_4X4_FN adst, identity, 12

cglobal iadst_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
call m(iadst_4x4_internal_10bpc).main
vinserti128 m1, m4, xm6, 1
vinserti128 m2, xm3, 1
.pass1_end:
mova m3, [itx4_shuf]
vpbroadcastd m5, [pd_1024]
psrad m1, 1
psrad m2, 1
vpermd m1, m3, m1
vpermd m2, m3, m2
paddd m1, m5
paddd m2, m5
psrad m1, 11
psrad m2, 11
.pass1_end2:
vpbroadcastd m3, [clip_18b_min]
vpbroadcastd m4, [clip_18b_max]
punpcklqdq m0, m1, m2
punpckhqdq m1, m2
pmaxsd m0, m3
pmaxsd m1, m3
pminsd m0, m4
pminsd m1, m4
jmp tx2q
.pass2:
call .main_pass2
vinserti128 m0, m4, xm6, 1
vinserti128 m1, m2, xm3, 1
.pass2_end:
vpbroadcastd m5, [pd_2048]
paddd m0, m5
paddd m1, m5
psrad m0, 12
psrad m1, 12
.end:
%if WIN64
WIN64_RESTORE_XMM_INTERNAL
%assign xmm_regs_used 6
%endif
.end2:
vpbroadcastd m4, [pw_16384]
movq xm2, [dstq+strideq*0]
movq xm3, [dstq+strideq*1]
lea r6, [dstq+strideq*2]
movhps xm2, [r6 +strideq*0] ; dst0 dst2
movhps xm3, [r6 +strideq*1] ; dst1 dst3
vpbroadcastd m5, [pixel_12bpc_max]
vinserti128 m2, xm3, 1
psrad m0, 3
psrad m1, 3
packssdw m0, m1 ; t0 t2 t1 t3
pmulhrsw m0, m4
pxor m4, m4
mova [cq+32*0], m4
mova [cq+32*1], m4
paddw m0, m2 ; out0 out2 out1 out3
pmaxsw m0, m4
pminsw m0, m5
vextracti128 xm1, m0, 1 ; out1 out3
movq [dstq+strideq*0], xm0
movq [dstq+strideq*1], xm1
movhps [r6 +strideq*0], xm0
movhps [r6 +strideq*1], xm1
RET
.main_pass2:
vextracti128 xm3, m1, 1
mova xm2, xm1
vextracti128 xm1, m0, 1
jmp m(iadst_4x4_internal_10bpc).main2

INV_TXFM_4X4_FN flipadst, dct, 12
INV_TXFM_4X4_FN flipadst, adst, 12
INV_TXFM_4X4_FN flipadst, flipadst, 12
INV_TXFM_4X4_FN flipadst, identity, 12

cglobal iflipadst_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
call m(iadst_4x4_internal_10bpc).main
vinserti128 m1, m3, xm2, 1
vinserti128 m2, m6, xm4, 1
jmp m(iadst_4x4_internal_12bpc).pass1_end
.pass2:
call m(iadst_4x4_internal_12bpc).main_pass2
vinserti128 m0, m3, xm2, 1
vinserti128 m1, m6, xm4, 1
jmp m(iadst_4x4_internal_12bpc).pass2_end

INV_TXFM_4X4_FN identity, dct, 12
INV_TXFM_4X4_FN identity, adst, 12
INV_TXFM_4X4_FN identity, flipadst, 12
INV_TXFM_4X4_FN identity, identity, 12

cglobal iidentity_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
mova m2, [itx4_shuf]
vpbroadcastd m3, [pd_1697]
vpermd m0, m2, [cq+32*0]
vpermd m2, m2, [cq+32*1]
vpbroadcastd m5, [pd_2048]
pmulld m1, m3, m0
pmulld m3, m2
paddd m1, m5
paddd m3, m5
psrad m1, 12
psrad m3, 12
paddd m1, m0
paddd m2, m3
jmp m(iadst_4x4_internal_12bpc).pass1_end2
.pass2:
; m0 = in0 in1
; m1 = in2 in3
vpbroadcastd m3, [pd_5793]
vpbroadcastd m5, [pd_2048]
pmulld m0, m3
pmulld m1, m3
paddd m0, m5 ; 2048
paddd m1, m5
psrad m0, 12
psrad m1, 12
jmp m(iadst_4x4_internal_12bpc).end

%macro INV_TXFM_4X8_FN 2-3 10 ; type1, type2, bitdepth
INV_TXFM_FN %1, %2, 0, 4x8, %3
%ifidn %1_%2, dct_dct
vpbroadcastd xm2, [dconly_%3bpc]
%if %3 = 10
.dconly:
imul r6d, [cq], 181
mov [cq], eobd ; 0
or r3d, 8
add r6d, 128
sar r6d, 8
imul r6d, 181
jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly2
%else
jmp m(inv_txfm_add_dct_dct_4x8_10bpc).dconly
%endif
%endif
%endmacro

%macro IDCT4_1D 8 ; src[1-4], tmp[1-3], rnd
ITX_MULSUB_2D %2, %4, %5, %6, %7, %8, 1567, 3784 ; t2, t3
vpbroadcastd m%5, [pd_2896]
pmulld m%1, m%5
pmulld m%3, m%5
paddd m%1, m%8
paddd m%5, m%1, m%3
psubd m%1, m%3
psrad m%5, 12 ; t0
psrad m%1, 12 ; t1
psubd m%3, m%1, m%2
paddd m%2, m%1
paddd m%1, m%5, m%4
psubd m%4, m%5, m%4
%endmacro
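; IDCT4_1D is the unpacked counterpart of IDCT4_1D_PACKED: same arithmetic,
; but each of src1-4 holds one full row, which is the layout the 4x8 pass 1
; loads below produce.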

INV_TXFM_4X8_FN dct, dct
INV_TXFM_4X8_FN dct, identity
INV_TXFM_4X8_FN dct, adst
INV_TXFM_4X8_FN dct, flipadst

cglobal idct_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2
.pass1:
vpbroadcastd m3, [pd_2896]
pmulld m0, m3, [cq+32*0]
pmulld m1, m3, [cq+32*1]
pmulld m2, m3, [cq+32*2]
pmulld m3, m3, [cq+32*3]
vpbroadcastd m7, [pd_2048]
REPX {paddd x, m7}, m0, m1, m2, m3
REPX {psrad x, 12}, m0, m1, m2, m3
IDCT4_1D 0, 1, 2, 3, 4, 5, 6, 7
jmp tx2q
.pass2:
packssdw m0, m2
packssdw m1, m3
lea r6, [deint_shuf+128]
punpckhwd m2, m0, m1
punpcklwd m0, m1
punpckhdq m1, m0, m2 ; 2 3
punpckldq m0, m2 ; 0 1
vextracti128 xm2, m0, 1 ; 4 5
vextracti128 xm3, m1, 1 ; 6 7
call m(idct_4x8_internal_8bpc).main
vpbroadcastd xm4, [pw_2048]
REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3
lea r3, [strideq*3]
lea r6, [dstq+strideq*4]
movq xm4, [dstq+strideq*0]
movhps xm4, [dstq+strideq*1]
movq xm5, [dstq+r3 ]
movhps xm5, [dstq+strideq*2]
movq xm6, [r6 +strideq*0]
movhps xm6, [r6 +strideq*1]
movq xm7, [r6 +r3 ]
movhps xm7, [r6 +strideq*2]
paddw xm0, xm4 ; 0 1
paddw xm1, xm5 ; 3 2
paddw xm2, xm6 ; 4 5
paddw xm3, xm7 ; 7 6
vpbroadcastd xm5, [pixel_10bpc_max]
pxor m4, m4
REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
REPX {pmaxsw x, xm4}, xm0, xm1, xm2, xm3
REPX {pminsw x, xm5}, xm0, xm1, xm2, xm3
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
movhps [dstq+strideq*2], xm1
movq [dstq+r3 ], xm1
movq [r6 +strideq*0], xm2
movhps [r6 +strideq*1], xm2
movhps [r6 +strideq*2], xm3
movq [r6 +r3 ], xm3
RET

INV_TXFM_4X8_FN adst, dct
INV_TXFM_4X8_FN adst, adst
INV_TXFM_4X8_FN adst, flipadst
INV_TXFM_4X8_FN adst, identity

cglobal iadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2
call m(iadst_8x4_internal_10bpc).main
vpbroadcastd m5, [pd_2048]
paddd m0, m5, m4
paddd m1, m5, m6
paddd m2, m5
paddd m3, m5
.pass1_end:
REPX {psrad x, 12}, m0, m1, m2, m3
jmp tx2q
.pass2:
call .pass2_main
mova xm4, [pw_2048_m2048]
REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3
.end:
lea r3, [strideq*3]
lea r6, [dstq+strideq*4]
movq xm4, [dstq+strideq*0]
movhps xm4, [dstq+strideq*1]
movq xm5, [dstq+strideq*2]
movhps xm5, [dstq+r3 ]
movq xm6, [r6 +strideq*0]
movhps xm6, [r6 +strideq*1]
movq xm7, [r6 +strideq*2]
movhps xm7, [r6 +r3 ]
paddw xm0, xm4 ; 0 1
paddw xm1, xm5 ; 2 3
paddw xm2, xm6 ; 4 5
paddw xm3, xm7 ; 6 7
vpbroadcastd xm5, [pixel_10bpc_max]
pxor m4, m4
REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
REPX {pmaxsw x, xm4}, xm0, xm1, xm2, xm3
REPX {pminsw x, xm5}, xm0, xm1, xm2, xm3
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
movq [dstq+strideq*2], xm1
movhps [dstq+r3 ], xm1
movq [r6 +strideq*0], xm2
movhps [r6 +strideq*1], xm2
movq [r6 +strideq*2], xm3
movhps [r6 +r3 ], xm3
RET
ALIGN function_align
.pass2_main:
packssdw m0, m2
packssdw m1, m3
lea r6, [deint_shuf+128]
punpcklwd m4, m0, m1
punpckhwd m0, m1
punpckhdq m5, m4, m0
punpckldq m4, m0
vextracti128 xm2, m4, 1 ; 4 5
vextracti128 xm3, m5, 1 ; 6 7
pshufd xm4, xm4, q1032 ; 1 0
pshufd xm5, xm5, q1032 ; 3 2
jmp m(iadst_4x8_internal_8bpc).main_pass2
ALIGN function_align
.main:
vpbroadcastd m8, [clip_18b_min]
vpbroadcastd m9, [clip_18b_max]
.main2:
vbroadcasti128 m0, [cq+16*0]
vbroadcasti128 m2, [cq+16*2]
vbroadcasti128 m3, [cq+16*5]
vbroadcasti128 m1, [cq+16*7]
vpbroadcastd m6, [pd_2896]
shufpd m0, m2, 0x0c ; 0 2
shufpd m1, m3, 0x0c ; 7 5
vbroadcasti128 m2, [cq+16*4]
vbroadcasti128 m4, [cq+16*6]
vbroadcasti128 m5, [cq+16*1]
vbroadcasti128 m3, [cq+16*3]
vpbroadcastd m7, [pd_2048]
shufpd m2, m4, 0x0c ; 4 6
shufpd m3, m5, 0x0c ; 3 1
REPX {pmulld x, m6}, m0, m1, m2, m3
REPX {paddd x, m7}, m0, m1, m2, m3
REPX {psrad x, 12}, m0, m1, m2, m3
.main3:
ITX_MULSUB_2D 1, 0, 4, 5, 6, 7, 401_1931, 4076_3612, 1
ITX_MULSUB_2D 3, 2, 4, 5, 6, 7, 3166_3920, 2598_1189, 1
psubd m4, m0, m2 ; t4 t6
paddd m0, m2 ; t0 t2
psubd m2, m1, m3 ; t5 t7
paddd m1, m3 ; t1 t3
REPX {pmaxsd x, m8}, m4, m2, m0, m1
REPX {pminsd x, m9}, m4, m2, m0, m1
pxor m5, m5
psubd m5, m4
vpblendd m4, m2, 0xcc ; t4 t7
vpblendd m2, m5, 0xcc ; t5 -t6
ITX_MULSUB_2D 4, 2, 3, 5, 6, 7, 1567, 3784
vpbroadcastd m5, [pd_2896]
vbroadcasti128 m6, [pw_2048_m2048] ; + + - -
punpckhqdq m3, m0, m1
punpcklqdq m0, m1
psubd m1, m0, m3 ; t2 t3
paddd m0, m3 ; out0 -out7
punpckhqdq m3, m4, m2 ; t7a t6a
punpcklqdq m4, m2 ; t5a t4a
psubd m2, m4, m3 ; t7 t6
paddd m4, m3 ; out6 -out1
REPX {pmaxsd x, m8}, m1, m2
REPX {pminsd x, m9}, m1, m2
vpblendd m3, m1, m2, 0xcc
shufpd m1, m2, 0x05
pmulld m3, m5
pmulld m5, m1
psignd m0, m6 ; out0 out7
psignd m4, m6 ; out6 out1
paddd m3, m7
psubd m2, m3, m5
paddd m5, m3
psrad m2, 12 ; out4 -out5
psrad m5, 12 ; -out3 out2
ret
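; The main/main2/main3 path above is a three-stage 8-point inverse ADST:
; stage 1 rotates the (in7,in0)/(in5,in2)/(in3,in4)/(in1,in6) pairs by odd
; multiples of pi/32 (the packed constants appear to be round(4096*sin) /
; round(4096*cos) of those angles), stage 2 applies the pi/8 rotation via
; 1567/3784, and stage 3 finishes with 2896 (1/sqrt(2)) butterflies, with
; pw_2048_m2048 supplying the alternating output signs via psignd.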

INV_TXFM_4X8_FN flipadst, dct
INV_TXFM_4X8_FN flipadst, adst
INV_TXFM_4X8_FN flipadst, flipadst
INV_TXFM_4X8_FN flipadst, identity

cglobal iflipadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2
call m(iadst_8x4_internal_10bpc).main
vpbroadcastd m5, [pd_2048]
paddd m0, m5, m3
paddd m1, m5, m2
paddd m2, m5, m6
paddd m3, m5, m4
jmp m(iadst_4x8_internal_10bpc).pass1_end
.pass2:
call m(iadst_4x8_internal_10bpc).pass2_main
mova xm4, [pw_2048_m2048]
REPX {pmulhrsw x, xm4}, xm3, xm2, xm1, xm0
lea r3, [strideq*3]
lea r6, [dstq+strideq*4]
movq xm4, [dstq+strideq*1]
movhps xm4, [dstq+strideq*0]
movq xm5, [dstq+r3 ]
movhps xm5, [dstq+strideq*2]
movq xm6, [r6 +strideq*1]
movhps xm6, [r6 +strideq*0]
movq xm7, [r6 +r3 ]
movhps xm7, [r6 +strideq*2]
paddw xm3, xm4 ; 1 0
paddw xm2, xm5 ; 3 2
paddw xm1, xm6 ; 5 4
paddw xm0, xm7 ; 7 6
vpbroadcastd xm5, [pixel_10bpc_max]
pxor m4, m4
REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
REPX {pmaxsw x, xm4}, xm3, xm2, xm1, xm0
REPX {pminsw x, xm5}, xm3, xm2, xm1, xm0
movhps [dstq+strideq*0], xm3
movq [dstq+strideq*1], xm3
movhps [dstq+strideq*2], xm2
movq [dstq+r3 ], xm2
movhps [r6 +strideq*0], xm1
movq [r6 +strideq*1], xm1
movhps [r6 +strideq*2], xm0
movq [r6 +r3 ], xm0
RET

INV_TXFM_4X8_FN identity, dct
INV_TXFM_4X8_FN identity, adst
INV_TXFM_4X8_FN identity, flipadst
INV_TXFM_4X8_FN identity, identity

cglobal iidentity_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2
.pass1:
vpbroadcastd m3, [pd_2896]
pmulld m0, m3, [cq+32*0]
pmulld m1, m3, [cq+32*1]
pmulld m2, m3, [cq+32*2]
pmulld m3, [cq+32*3]
vpbroadcastd m5, [pd_2048]
vpbroadcastd m4, [pd_5793]
REPX {paddd x, m5}, m0, m1, m2, m3
REPX {psrad x, 12}, m0, m1, m2, m3
REPX {pmulld x, m4}, m0, m1, m2, m3
REPX {paddd x, m5}, m0, m1, m2, m3
REPX {psrad x, 12}, m0, m1, m2, m3
jmp tx2q
.pass2:
vpbroadcastd m6, [pixel_10bpc_max]
call .pass2_end
RET
ALIGN function_align
.pass2_end:
vpbroadcastd m4, [pw_4096]
packssdw m0, m2
packssdw m1, m3
punpckhwd m2, m0, m1
punpcklwd m0, m1
pmulhrsw m2, m4
pmulhrsw m0, m4
punpckhdq m1, m0, m2 ; 2 3 6 7
punpckldq m0, m2 ; 0 1 4 5
lea r3, [strideq*3]
lea r6, [dstq+strideq*4]
movq xm2, [dstq+strideq*0]
movhps xm2, [dstq+strideq*1]
vpbroadcastq m4, [r6 +strideq*0]
vpbroadcastq m5, [r6 +strideq*1]
movq xm3, [dstq+strideq*2]
movhps xm3, [dstq+r3 ]
vpblendd m2, m4, 0x30
vpblendd m2, m5, 0xc0
vpbroadcastq m4, [r6 +strideq*2]
vpbroadcastq m5, [r6 +r3 ]
vpblendd m3, m4, 0x30
vpblendd m3, m5, 0xc0
pxor m4, m4
REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
paddw m0, m2 ; out0 out1 out4 out5
paddw m1, m3 ; out2 out3 out6 out7
pmaxsw m0, m4
pmaxsw m1, m4
pminsw m0, m6
pminsw m1, m6
vextracti128 xm2, m0, 1 ; out4 out5
vextracti128 xm3, m1, 1 ; out6 out7
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
movq [dstq+strideq*2], xm1
movhps [dstq+r3 ], xm1
movq [r6 +strideq*0], xm2
movhps [r6 +strideq*1], xm2
movq [r6 +strideq*2], xm3
movhps [r6 +r3 ], xm3
ret

INV_TXFM_4X8_FN dct, dct, 12
INV_TXFM_4X8_FN dct, identity, 12
INV_TXFM_4X8_FN dct, adst, 12
INV_TXFM_4X8_FN dct, flipadst, 12

cglobal idct_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
jmp m(idct_4x8_internal_10bpc).pass1
.pass2:
vpbroadcastd m8, [clip_18b_min]
vpbroadcastd m9, [clip_18b_max]
REPX {pmaxsd x, m8}, m0, m1, m2, m3
REPX {pminsd x, m9}, m0, m1, m2, m3
; transpose & interleave
pshufd m0, m0, q1320
pshufd m1, m1, q1320
pshufd m2, m2, q1320
pshufd m3, m3, q1320
punpckldq m4, m0, m1
punpckhdq m0, m1
punpckldq m5, m2, m3
punpckhdq m2, m3
vpermq m0, m0, q3102
vpermq m2, m2, q3102
vperm2i128 m1, m0, m2, 0x31 ; 1 5 (interleaved)
vperm2i128 m3, m0, m2, 0x20 ; 7 3 (interleaved)
vperm2i128 m0, m4, m5, 0x20 ; 0 2 (interleaved)
vperm2i128 m2, m4, m5, 0x31 ; 4 6 (interleaved)
vpbroadcastd m7, [pd_2048]
call m(idct_8x4_internal_10bpc).main
psubd m3, m0, m4 ; out7 out6
paddd m0, m4 ; out0 out1
paddd m1, m2, m5 ; out3 out2
psubd m2, m5 ; out4 out5
pshufd m1, m1, q1032
pshufd m3, m3, q1032
jmp m(iadst_4x8_internal_12bpc).end

INV_TXFM_4X8_FN adst, dct, 12
INV_TXFM_4X8_FN adst, adst, 12
INV_TXFM_4X8_FN adst, flipadst, 12
INV_TXFM_4X8_FN adst, identity, 12

cglobal iadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
call m(iadst_8x4_internal_10bpc).main
psrad m0, m4, 1
psrad m1, m6, 1
psrad m2, 1
psrad m3, 1
.pass1_end:
vpbroadcastd m5, [pd_1024]
REPX {paddd x, m5}, m0, m1, m2, m3
REPX {psrad x, 11}, m0, m1, m2, m3
jmp tx2q
.pass2:
vpbroadcastd m8, [clip_18b_min]
vpbroadcastd m9, [clip_18b_max]
REPX {pmaxsd x, m8}, m0, m1, m2, m3
REPX {pminsd x, m9}, m0, m1, m2, m3
call .pass2_main
vpblendd m3, m0, m4, 0x33 ; out6 out7
vpblendd m0, m4, 0xcc ; out0 out1
pshufd m1, m5, q1032
psignd m2, m6 ; out4 out5
psignd m1, m6 ; out2 out3
.end:
vpbroadcastd m4, [pw_16384]
REPX {psrad x, 3}, m0, m1, m2, m3
packssdw m0, m2 ; 0 1 4 5 (interleaved)
packssdw m1, m3 ; 2 3 6 7 (interleaved)
mova m2, [iadst8_12_shuf]
vpermd m0, m2, m0 ; 0 1 4 5
vpermd m1, m2, m1 ; 2 3 6 7
pmulhrsw m0, m4
pmulhrsw m1, m4
lea r3, [strideq*3]
lea r6, [dstq+strideq*4]
movq xm4, [dstq+strideq*0]
movhps xm4, [dstq+strideq*1]
movq xm5, [dstq+strideq*2]
movhps xm5, [dstq+r3 ]
movq xm6, [r6 +strideq*0]
movhps xm6, [r6 +strideq*1]
vinserti128 m4, xm6, 1
movq xm7, [r6 +strideq*2]
movhps xm7, [r6 +r3 ]
vinserti128 m5, xm7, 1
paddw m0, m4 ; 0 1 4 5
paddw m1, m5 ; 2 3 6 7
vpbroadcastd m5, [pixel_12bpc_max]
pxor m4, m4
REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
REPX {pmaxsw x, m4}, m0, m1
REPX {pminsw x, m5}, m0, m1
vextracti128 xm2, m0, 1 ; out4 out5
vextracti128 xm3, m1, 1 ; out6 out7
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
movq [dstq+strideq*2], xm1
movhps [dstq+r3 ], xm1
movq [r6 +strideq*0], xm2
movhps [r6 +strideq*1], xm2
movq [r6 +strideq*2], xm3
movhps [r6 +r3 ], xm3
RET
ALIGN function_align
.pass2_main:
; transpose & interleave
pshufd m0, m0, q1320
pshufd m1, m1, q1320
pshufd m2, m2, q1320
pshufd m3, m3, q1320
punpckldq m4, m0, m1
punpckhdq m0, m1
punpckldq m5, m2, m3
punpckhdq m2, m3
vperm2i128 m1, m0, m2, 0x31 ; 7 5 (interleaved)
vperm2i128 m3, m0, m2, 0x20 ; 3 1 (interleaved)
vperm2i128 m0, m4, m5, 0x20 ; 0 2 (interleaved)
vperm2i128 m2, m4, m5, 0x31 ; 4 6 (interleaved)
vpbroadcastd m7, [pd_2048]
jmp m(iadst_4x8_internal_10bpc).main3

INV_TXFM_4X8_FN flipadst, dct, 12
INV_TXFM_4X8_FN flipadst, adst, 12
INV_TXFM_4X8_FN flipadst, flipadst, 12
INV_TXFM_4X8_FN flipadst, identity, 12

cglobal iflipadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
call m(iadst_8x4_internal_10bpc).main
psrad m0, m3, 1
psrad m1, m2, 1
psrad m2, m6, 1
psrad m3, m4, 1
jmp m(iadst_4x8_internal_12bpc).pass1_end
.pass2:
vpbroadcastd m8, [clip_18b_min]
vpbroadcastd m9, [clip_18b_max]
REPX {pmaxsd x, m8}, m0, m1, m2, m3
REPX {pminsd x, m9}, m0, m1, m2, m3
call m(iadst_4x8_internal_12bpc).pass2_main
shufpd m3, m4, m0, 0x05 ; out1 out0
shufpd m0, m4, 0x05 ; out7 out6
psignd m2, m6
pshufd m6, m6, q1032
pshufd m1, m2, q1032 ; out5 out4
psignd m2, m5, m6 ; out3 out2
jmp m(iadst_4x8_internal_12bpc).end

INV_TXFM_4X8_FN identity, dct, 12
INV_TXFM_4X8_FN identity, adst, 12
INV_TXFM_4X8_FN identity, flipadst, 12
INV_TXFM_4X8_FN identity, identity, 12

cglobal iidentity_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
jmp m(iidentity_4x8_internal_10bpc).pass1
.pass2:
; m0 = in0 in1
; m1 = in2 in3
; m2 = in4 in5
; m3 = in6 in7
vpbroadcastd m6, [pixel_12bpc_max]
call m(iidentity_4x8_internal_10bpc).pass2_end
RET

%macro INV_TXFM_4X16_FN 2-3 10 ; type1, type2, bitdepth
INV_TXFM_FN %1, %2, 0, 4x16, %3
%ifidn %1_%2, dct_dct
imul r6d, [cq], 181
vpbroadcastd xm2, [dconly_%3bpc]
mov [cq], eobd ; 0
or r3d, 16
add r6d, 384
sar r6d, 9
jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly3
%endif
%endmacro

INV_TXFM_4X16_FN dct, dct
INV_TXFM_4X16_FN dct, identity
INV_TXFM_4X16_FN dct, adst
INV_TXFM_4X16_FN dct, flipadst

cglobal idct_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
.pass1:
vpbroadcastd m10, [pd_3072]
mova m1, [cq+32*2]
mova m3, [cq+32*6]
mova m5, [cq+32*3]
mova m7, [cq+32*7]
call .pass1_main
pmulld m0, m6, [cq+32*0]
pmulld m2, m6, [cq+32*4]
pmulld m4, m6, [cq+32*1]
pmulld m6, [cq+32*5]
call .pass1_main2
REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7
jmp tx2q
.pass2:
packssdw m0, m4
packssdw m1, m5
packssdw m2, m6
packssdw m3, m7
lea r6, [deint_shuf+128]
punpcklwd m4, m2, m3
punpckhwd m2, m3
punpckhwd m5, m0, m1
punpcklwd m0, m1
punpckhdq m1, m0, m4 ; 2 3
punpckldq m0, m4 ; 0 1
punpckldq m4, m5, m2 ; 8 9
punpckhdq m5, m2 ; a b
vextracti128 xm2, m0, 1 ; 4 5
vextracti128 xm3, m1, 1 ; 6 7
vextracti128 xm6, m4, 1 ; c d
vextracti128 xm7, m5, 1 ; e f
call m(idct_4x16_internal_8bpc).main
vpbroadcastd m9, [pw_2048]
vinserti128 m0, m0, xm1, 1 ; 0 1 3 2
vinserti128 m1, m2, xm3, 1 ; 4 5 7 6
vinserti128 m2, m4, xm5, 1 ; 8 9 b a
vinserti128 m3, m6, xm7, 1 ; c d f e
vpbroadcastd m8, [pixel_10bpc_max]
call .pass2_end
RET
ALIGN function_align
.pass1_main:
vpbroadcastd m4, [pd_3784]
vpbroadcastd m8, [pd_1567]
vpbroadcastd m9, [pd_2048]
vpbroadcastd m6, [pd_1448]
ITX_MULSUB_2D 1, 3, 0, 2, _, 9, 8, 4 ; t2l, t3l
ITX_MULSUB_2D 5, 7, 4, 2, _, 9, 8, 4 ; t2h, t3h
ret
ALIGN function_align
.pass1_main2:
paddd m0, m10
paddd m4, m10
paddd m8, m0, m2
psubd m0, m2
paddd m9, m4, m6
psubd m4, m6
REPX {psrad x, 11}, m8, m0, m9, m4 ; t0l, t1l, t0h, t1h
psubd m2, m0, m1
paddd m1, m0
psubd m6, m4, m5
paddd m5, m4
paddd m0, m8, m3
psubd m3, m8, m3
paddd m4, m9, m7
psubd m7, m9, m7
ret
ALIGN function_align
.pass2_end:
lea r6, [strideq*3]
pxor m7, m7
pmulhrsw m0, m9
call .write_4x4
pmulhrsw m0, m1, m9
call .write_4x4
pmulhrsw m0, m2, m9
call .write_4x4
pmulhrsw m0, m3, m9
call .write_4x4
ret
ALIGN function_align
.write_4x4:
movq xm4, [dstq+strideq*0]
movhps xm4, [dstq+strideq*1]
vpbroadcastq m5, [dstq+strideq*2]
vpbroadcastq m6, [dstq+r6 ]
mova [cq+32*0], m7
mova [cq+32*1], m7
add cq, 32*2
vpblendd m4, m5, 0xc0
vpblendd m4, m6, 0x30
paddw m4, m0
pmaxsw m4, m7
pminsw m4, m8
vextracti128 xm5, m4, 1
movq [dstq+strideq*0], xm4
movhps [dstq+strideq*1], xm4
movhps [dstq+strideq*2], xm5
movq [dstq+r6 ], xm5
lea dstq, [dstq+strideq*4]
ret

INV_TXFM_4X16_FN adst, dct
INV_TXFM_4X16_FN adst, adst
INV_TXFM_4X16_FN adst, flipadst
INV_TXFM_4X16_FN adst, identity

cglobal iadst_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
call m(iadst_16x4_internal_10bpc).main
vpbroadcastd m6, [pd_6144]
call m(iadst_16x4_internal_10bpc).main_end
psrad m0, m4, 13
psrad m1, m5, 13
psrad m2, 13
psrad m3, 13
psrad m4, m8, 13
psrad m5, m9, 13
psrad m6, 13
psrad m7, 13
jmp tx2q
.pass2:
call .pass2_main
vpbroadcastd m5, [pw_2048]
vpbroadcastd m8, [pixel_10bpc_max]
lea r6, [strideq*3]
vpblendd m4, m3, m0, 0xcc ; -out3 out0 out2 -out1
pshufd m2, m2, q1032 ; -out11 out8 out10 -out9
vpblendd m3, m0, 0x33 ; -out15 out12 out14 -out13
pxor m7, m7
psubw m9, m7, m5
vpblendd m9, m5, 0x3c ; -2048 2048 2048 -2048
pmulhrsw m0, m4, m9
call .write_4x4
pmulhrsw m0, m1, m9
call .write_4x4
pmulhrsw m0, m2, m9
call .write_4x4
pmulhrsw m0, m3, m9
call .write_4x4
RET
ALIGN function_align
.write_4x4:
movq xm4, [dstq+r6 ]
movhps xm4, [dstq+strideq*0]
vpbroadcastq m5, [dstq+strideq*1]
vpbroadcastq m6, [dstq+strideq*2]
mova [cq+32*0], m7
mova [cq+32*1], m7
add cq, 32*2
vpblendd m4, m5, 0xc0
vpblendd m4, m6, 0x30
paddw m4, m0
pmaxsw m4, m7
pminsw m4, m8
vextracti128 xm5, m4, 1
movhps [dstq+strideq*0], xm4
movhps [dstq+strideq*1], xm5
movq [dstq+strideq*2], xm5
movq [dstq+r6 ], xm4
lea dstq, [dstq+strideq*4]
ret
ALIGN function_align
.pass2_main:
packssdw m0, m4
packssdw m1, m5
packssdw m2, m6
packssdw m3, m7
lea r6, [deint_shuf+128]
punpcklwd m4, m2, m3
punpckhwd m2, m3
punpckhwd m5, m0, m1
punpcklwd m0, m1
punpckhdq m1, m0, m4
punpckldq m0, m4
punpckldq m4, m5, m2
punpckhdq m5, m2
vpblendd m3, m0, m1, 0x33
vpblendd m0, m1, 0xcc
shufpd m2, m5, m4, 0x05
shufpd m4, m5, 0x05
vperm2i128 m1, m0, m3, 0x31 ; 4 7 6 5
vinserti128 m0, xm3, 1 ; 0 3 2 1
vperm2i128 m3, m2, m4, 0x31 ; c f e d
|
|
vinserti128 m2, xm4, 1 ; b 8 9 a
|
|
call m(iadst_4x16_internal_8bpc).main2
|
|
vpbroadcastd m5, [pw_2896x8]
|
|
paddsw m1, m2, m4
|
|
psubsw m2, m4
|
|
pmulhrsw m1, m5 ; -out7 out4 out6 -out5
|
|
pmulhrsw m2, m5 ; out8 -out11 -out9 out10
|
|
ret
|
|
ALIGN function_align
|
|
.main:
|
|
vbroadcasti128 m0, [cq+16* 0]
|
|
vbroadcasti128 m4, [cq+16* 2]
|
|
vbroadcasti128 m1, [cq+16*15]
|
|
vbroadcasti128 m5, [cq+16*13]
|
|
vbroadcasti128 m2, [cq+16* 4]
|
|
vbroadcasti128 m6, [cq+16* 6]
|
|
vbroadcasti128 m3, [cq+16*11]
|
|
vbroadcasti128 m7, [cq+16* 9]
|
|
shufpd m0, m4, 0x0c ; 0 2
|
|
shufpd m1, m5, 0x0c ; 15 13
|
|
shufpd m2, m6, 0x0c ; 4 6
|
|
shufpd m3, m7, 0x0c ; 11 9
|
|
vbroadcasti128 m4, [cq+16* 8]
|
|
vbroadcasti128 m6, [cq+16*10]
|
|
vbroadcasti128 m5, [cq+16* 7]
|
|
vbroadcasti128 m7, [cq+16* 5]
|
|
shufpd m4, m6, 0x0c ; 8 10
|
|
shufpd m5, m7, 0x0c ; 7 5
|
|
vbroadcasti128 m6, [cq+16*12]
|
|
vbroadcasti128 m7, [cq+16*14]
|
|
shufpd m6, m7, 0x0c ; 12 14
|
|
vbroadcasti128 m7, [cq+16* 3]
|
|
vbroadcasti128 m8, [cq+16* 1]
|
|
shufpd m7, m8, 0x0c ; 3 1
|
|
.main2:
|
|
; expects: m12 = clip_min m13 = clip_max
|
|
vpbroadcastd m11, [pd_2048]
|
|
ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 201_995, 4091_3973, 1
|
|
ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 1751_2440, 3703_3290, 1
|
|
ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3035_3513, 2751_2106, 1
|
|
ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 3857_4052, 1380_601, 1
|
|
psubd m8, m0, m4 ; t8a t10a
|
|
paddd m0, m4 ; t0a t2a
|
|
psubd m4, m1, m5 ; t9a t11a
|
|
paddd m1, m5 ; t1a t3a
|
|
psubd m5, m2, m6 ; t12a t14a
|
|
paddd m2, m6 ; t4a t6a
|
|
psubd m6, m3, m7 ; t13a t15a
|
|
paddd m3, m7 ; t5a t7a
|
|
REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m8
|
|
REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m8
|
|
ITX_MULSUB_2D 8, 4, 7, 9, 10, 11, 799_3406, 4017_2276, 1
|
|
ITX_MULSUB_2D 6, 5, 7, 9, 10, 11, 4017_2276, 10, 1
|
|
psubd m7, m0, m2 ; t4 t6
|
|
paddd m0, m2 ; t0 t2
|
|
psubd m2, m1, m3 ; t5 t7
|
|
paddd m1, m3 ; t1 t3
|
|
psubd m3, m4, m6 ; t12a t14a
|
|
paddd m4, m6 ; t8a t10a
|
|
psubd m6, m8, m5 ; t13a t15a
|
|
paddd m8, m5 ; t9a t11a
|
|
REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m6, m7, m8
|
|
REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m6, m7, m8
|
|
punpcklqdq m5, m3, m7 ; t12a t4
|
|
punpckhqdq m3, m7 ; t14a t6
|
|
punpckhqdq m7, m6, m2 ; t15a t7
|
|
punpcklqdq m6, m2 ; t13a t5
|
|
ITX_MULSUB_2D 7, 3, 2, 9, 10, 11, 3784, 1567
|
|
ITX_MULSUB_2D 5, 6, 2, 9, 10, 11, 1567, 10
|
|
vpbroadcastd m10, [pd_2896]
|
|
vbroadcasti128 m9, [pw_2048_m2048] ; + + - -
|
|
punpckhqdq m2, m4, m0 ; t10a t2
|
|
punpcklqdq m4, m0 ; t8a t0
|
|
punpckhqdq m0, m8, m1 ; t11a t3
|
|
punpcklqdq m8, m1 ; t9a t1
|
|
paddd m1, m6, m7 ; out2 -out3
|
|
psubd m6, m7 ; t14a t6
|
|
paddd m7, m5, m3 ; -out13 out12
|
|
psubd m5, m3 ; t15a t7
|
|
psubd m3, m8, m0 ; t11 t3a
|
|
paddd m8, m0 ; out14 -out15
|
|
paddd m0, m4, m2 ; -out1 out0
|
|
psubd m4, m2 ; t10 t2a
|
|
REPX {pmaxsd x, m12}, m6, m5, m3, m4
|
|
REPX {pminsd x, m13}, m6, m5, m3, m4
|
|
REPX {pmulld x, m10}, m6, m5, m3, m4
|
|
paddd m6, m11
|
|
paddd m4, m11
|
|
paddd m2, m6, m5 ; -out5 out4
|
|
psubd m6, m5 ; out10 -out11
|
|
psubd m5, m4, m3 ; -out9 out8
|
|
paddd m3, m4 ; out6 -out7
|
|
REPX {psrad x, 12}, m2, m3, m5, m6
|
|
REPX {psignd x, m9}, m1, m8, m3, m6
|
|
pshufd m9, m9, q1032
|
|
REPX {psignd x, m9}, m0, m7, m2, m5
|
|
ret
|
|
|
|
INV_TXFM_4X16_FN flipadst, dct
|
|
INV_TXFM_4X16_FN flipadst, adst
|
|
INV_TXFM_4X16_FN flipadst, flipadst
|
|
INV_TXFM_4X16_FN flipadst, identity
|
|
|
|
cglobal iflipadst_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
|
|
.pass1:
|
|
call m(iadst_16x4_internal_10bpc).main
|
|
vpbroadcastd m6, [pd_6144]
|
|
call m(iadst_16x4_internal_10bpc).main_end
|
|
psrad m0, m3, 13
|
|
psrad m1, m2, 13
|
|
psrad m2, m5, 13
|
|
psrad m3, m4, 13
|
|
psrad m4, m7, 13
|
|
psrad m5, m6, 13
|
|
psrad m6, m9, 13
|
|
psrad m7, m8, 13
|
|
jmp tx2q
|
|
.pass2:
|
|
call m(iadst_4x16_internal_10bpc).pass2_main
|
|
vpbroadcastd m5, [pw_2048]
|
|
vpbroadcastd m8, [pixel_10bpc_max]
|
|
lea r6, [strideq*3]
|
|
vpblendd m4, m3, m0, 0x33 ; -out0 out3 out1 -out2
|
|
pshufd m2, m2, q1032 ; -out11 out8 out10 -out9
|
|
vpblendd m3, m0, 0xcc ; -out12 out15 out13 -out14
|
|
pxor m7, m7
|
|
psubw m9, m7, m5
|
|
vpblendd m9, m5, 0x3c ; -2048 2048 2048 -2048
|
|
pmulhrsw m0, m4, m9
|
|
call .write_4x4
|
|
pmulhrsw m0, m2, m9
|
|
call .write_4x4
|
|
pmulhrsw m0, m1, m9
|
|
call .write_4x4
|
|
pmulhrsw m0, m3, m9
|
|
call .write_4x4
|
|
RET
|
|
ALIGN function_align
|
|
.write_4x4:
|
|
movq xm4, [dstq+strideq*0]
|
|
movhps xm4, [dstq+r6 ]
|
|
vpbroadcastq m5, [dstq+strideq*1]
|
|
vpbroadcastq m6, [dstq+strideq*2]
|
|
mova [cq+32*0], m7
|
|
mova [cq+32*1], m7
|
|
add cq, 32*2
|
|
vpblendd m4, m5, 0x30
|
|
vpblendd m4, m6, 0xc0
|
|
paddw m4, m0
|
|
pmaxsw m4, m7
|
|
pminsw m4, m8
|
|
vextracti128 xm5, m4, 1
|
|
movq [dstq+strideq*0], xm4
|
|
movq [dstq+strideq*1], xm5
|
|
movhps [dstq+strideq*2], xm5
|
|
movhps [dstq+r6 ], xm4
|
|
lea dstq, [dstq+strideq*4]
|
|
ret
|
|
|
|
INV_TXFM_4X16_FN identity, dct
|
|
INV_TXFM_4X16_FN identity, adst
|
|
INV_TXFM_4X16_FN identity, flipadst
|
|
INV_TXFM_4X16_FN identity, identity
|
|
|
|
cglobal iidentity_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
|
|
vpbroadcastd m7, [pd_5793]
|
|
pmulld m0, m7, [cq+32*0]
|
|
pmulld m4, m7, [cq+32*1]
|
|
pmulld m1, m7, [cq+32*2]
|
|
pmulld m5, m7, [cq+32*3]
|
|
pmulld m2, m7, [cq+32*4]
|
|
pmulld m6, m7, [cq+32*5]
|
|
pmulld m3, m7, [cq+32*6]
|
|
pmulld m7, [cq+32*7]
|
|
vpbroadcastd m8, [pd_6144]
|
|
REPX {paddd x, m8}, m0, m4, m1, m5, m2, m6, m3, m7
|
|
REPX {psrad x, 13}, m0, m4, m1, m5, m2, m6, m3, m7
|
|
jmp tx2q
|
|
.pass2:
|
|
packssdw m0, m4
|
|
packssdw m1, m5
|
|
packssdw m2, m6
|
|
packssdw m3, m7
|
|
vpbroadcastd m7, [pw_1697x16]
|
|
vpbroadcastd m8, [pw_2048]
|
|
pmulhrsw m4, m7, m0
|
|
pmulhrsw m5, m7, m1
|
|
pmulhrsw m6, m7, m2
|
|
pmulhrsw m7, m3
|
|
REPX {paddsw x, x}, m0, m1, m2, m3
|
|
paddsw m0, m4
|
|
paddsw m1, m5
|
|
paddsw m2, m6
|
|
paddsw m3, m7
|
|
vpbroadcastd m4, [pixel_10bpc_max]
|
|
call .pass2_end
|
|
RET
|
|
ALIGN function_align
|
|
.pass2_end:
|
|
punpckhwd m7, m0, m1
|
|
punpcklwd m0, m1
|
|
punpckhwd m1, m2, m3
|
|
punpcklwd m2, m3
|
|
lea r6, [strideq*5]
|
|
pxor m3, m3
|
|
punpckhdq m5, m0, m2 ; 2 3 6 7
|
|
punpckldq m0, m2 ; 0 1 4 5
|
|
punpckldq m6, m7, m1 ; 8 9 c d
|
|
punpckhdq m7, m1 ; a b e f
|
|
pmulhrsw m0, m8
|
|
call .write_2x4x2
|
|
pmulhrsw m0, m5, m8
|
|
call .write_2x4x2
|
|
pmulhrsw m0, m6, m8
|
|
lea dstq, [dstq+strideq*4]
|
|
call .write_2x4x2
|
|
pmulhrsw m0, m7, m8
|
|
call .write_2x4x2
|
|
ret
|
|
ALIGN function_align
|
|
.write_2x4x2:
|
|
movq xm1, [dstq+strideq*0]
|
|
movhps xm1, [dstq+strideq*1]
|
|
vpbroadcastq m2, [dstq+strideq*4]
|
|
vpblendd m1, m2, 0x30
|
|
vpbroadcastq m2, [dstq+r6 ]
|
|
vpblendd m1, m2, 0xc0
|
|
mova [cq+32*0], m3
|
|
mova [cq+32*1], m3
|
|
add cq, 32*2
|
|
paddw m1, m0
|
|
pmaxsw m1, m3
|
|
pminsw m1, m4
|
|
vextracti128 xm2, m1, 1
|
|
movq [dstq+strideq*0], xm1
|
|
movhps [dstq+strideq*1], xm1
|
|
movq [dstq+strideq*4], xm2
|
|
movhps [dstq+r6 ], xm2
|
|
lea dstq, [dstq+strideq*2]
|
|
ret
|
|
|
|
INV_TXFM_4X16_FN dct, dct, 12
|
|
INV_TXFM_4X16_FN dct, identity, 12
|
|
INV_TXFM_4X16_FN dct, adst, 12
|
|
INV_TXFM_4X16_FN dct, flipadst, 12
|
|
|
|
cglobal idct_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
|
|
jmp m(idct_4x16_internal_10bpc).pass1
|
|
.pass2:
|
|
punpckldq m8, m0, m1
|
|
punpckhdq m0, m1
|
|
punpckldq m9, m2, m3
|
|
punpckhdq m2, m3
|
|
punpckldq m1, m4, m5
|
|
punpckhdq m4, m5
|
|
punpckldq m3, m6, m7
|
|
punpckhdq m6, m7
|
|
punpcklqdq m5, m0, m2 ; 2 6
|
|
punpckhqdq m12, m0, m2 ; 3 7
|
|
punpcklqdq m0, m8, m9 ; 0 4
|
|
punpckhqdq m10, m8, m9 ; 1 5
|
|
punpcklqdq m2, m1, m3 ; 8 12
|
|
punpckhqdq m13, m1, m3 ; 9 13
|
|
punpcklqdq m9, m4, m6 ; 10 14
|
|
punpckhqdq m4, m6 ; 11 15
|
|
vperm2i128 m1, m5, m9, 0x20 ; 2 10
|
|
vperm2i128 m3, m9, m5, 0x31 ; 14 6
|
|
vpermq m11, m4, q1302 ; 15 11
|
|
; interleave
|
|
REPX {vpermq x, x, q3120}, m0, m1, m2, m3, m10
|
|
vpbroadcastd m8, [clip_18b_min]
|
|
vpbroadcastd m9, [clip_18b_max]
|
|
REPX {pmaxsd x, m8}, m0, m1, m2, m3, m10, m11, m12, m13
|
|
REPX {pminsd x, m9}, m0, m1, m2, m3, m10, m11, m12, m13
|
|
call m(idct_16x4_internal_10bpc).pass1_main
|
|
vpermq m6, m12, q1302 ; 7 3
|
|
vpermq m5, m13, q3120 ; 9 13
|
|
call m(idct_16x4_internal_10bpc).pass1_main2
|
|
call m(idct_16x4_internal_10bpc).pass1_main3
|
|
REPX {psrad x, 3}, m0, m1, m2, m3, m4, m5, m6, m7
|
|
packssdw m0, m1
|
|
packssdw m1, m2, m3
|
|
packssdw m2, m4, m5
|
|
packssdw m3, m6, m7
|
|
mova m4, [idct16_12_shuf]
|
|
REPX {vpermd x, m4, x}, m0, m1, m2, m3
|
|
vpbroadcastd m9, [pw_16384]
|
|
vpbroadcastd m8, [pixel_12bpc_max]
|
|
call m(idct_4x16_internal_10bpc).pass2_end
|
|
RET
|
|
|
|
INV_TXFM_4X16_FN adst, dct, 12
|
|
INV_TXFM_4X16_FN adst, adst, 12
|
|
INV_TXFM_4X16_FN adst, flipadst, 12
|
|
INV_TXFM_4X16_FN adst, identity, 12
|
|
|
|
cglobal iadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
|
|
call .main_pass1
|
|
psrad m0, m4, 12
|
|
psrad m1, m5, 12
|
|
psrad m2, 12
|
|
psrad m3, 12
|
|
psrad m4, m8, 12
|
|
psrad m5, m9, 12
|
|
psrad m6, 12
|
|
psrad m7, 12
|
|
jmp tx2q
|
|
.pass2:
|
|
vpbroadcastd m12, [clip_18b_min]
|
|
vpbroadcastd m13, [clip_18b_max]
|
|
REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
|
|
REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
|
|
call .transpose_16x4
|
|
call m(iadst_4x16_internal_10bpc).main2
|
|
pshufd m4, m5, q1032
|
|
psrad m5, m6, 3
|
|
pshufd m6, m7, q1032
|
|
psrad m7, m8, 3
|
|
REPX {pshufd x, x, q1032}, m0, m2
|
|
REPX {psrad x, 3}, m0, m1, m2, m3, m4, m6
|
|
.pass2_end:
|
|
packssdw m0, m1
|
|
packssdw m1, m2, m3
|
|
packssdw m2, m4, m5
|
|
packssdw m3, m6, m7
|
|
mova m4, [iadst16_12_shuf]
|
|
REPX {vpermd x, m4, x}, m0, m1, m2, m3
|
|
vpbroadcastd m9, [pw_16384]
|
|
vpbroadcastd m8, [pixel_12bpc_max]
|
|
lea r6, [strideq*3]
|
|
pxor m7, m7
|
|
pmulhrsw m0, m9
|
|
call m(iadst_4x16_internal_10bpc).write_4x4
|
|
pmulhrsw m0, m9, m1
|
|
call m(iadst_4x16_internal_10bpc).write_4x4
|
|
pmulhrsw m0, m9, m2
|
|
call m(iadst_4x16_internal_10bpc).write_4x4
|
|
pmulhrsw m0, m9, m3
|
|
call m(iadst_4x16_internal_10bpc).write_4x4
|
|
RET
|
|
ALIGN function_align
|
|
.transpose_16x4:
|
|
; transpose & interleave
|
|
punpckldq m8, m0, m1
|
|
punpckhdq m0, m1
|
|
punpckldq m9, m2, m3
|
|
punpckhdq m2, m3
|
|
punpckldq m1, m4, m5
|
|
punpckhdq m4, m5
|
|
punpckldq m3, m6, m7
|
|
punpckhdq m6, m7
|
|
punpcklqdq m10, m8, m0
|
|
punpckhqdq m0, m8
|
|
punpcklqdq m11, m9, m2
|
|
punpckhqdq m2, m9
|
|
punpcklqdq m8, m1, m4
|
|
punpckhqdq m4, m1
|
|
punpcklqdq m9, m3, m6
|
|
punpckhqdq m6, m3
|
|
vperm2i128 m5, m0, m2, 0x31 ; 7 5
|
|
vperm2i128 m7, m0, m2, 0x20 ; 3 1
|
|
vperm2i128 m0, m10, m11, 0x20 ; 0 2
|
|
vperm2i128 m2, m10, m11, 0x31 ; 4 6
|
|
vperm2i128 m1, m4, m6, 0x31 ; 15 13
|
|
vperm2i128 m3, m4, m6, 0x20 ; 11 9
|
|
vperm2i128 m4, m8, m9, 0x20 ; 8 10
|
|
vperm2i128 m6, m8, m9, 0x31 ; 12 14
|
|
ret
|
|
ALIGN function_align
|
|
.main_pass1:
|
|
call m(iadst_16x4_internal_10bpc).main
|
|
vpbroadcastd m6, [pd_3072]
|
|
paddd m10, m4, m5
|
|
psubd m4, m3
|
|
psubd m5, m3
|
|
paddd m3, m10
|
|
psubd m8, m7, m1
|
|
paddd m7, m9
|
|
psubd m9, m1
|
|
paddd m7, m1
|
|
REPX {psrad x, 1 }, m4, m5, m2, m3, m8, m9, m0, m7
|
|
REPX {paddd x, m6}, m4, m5, m2, m3, m8, m9, m7
|
|
paddd m6, m0
|
|
ret
|
|
|
|
INV_TXFM_4X16_FN flipadst, dct, 12
|
|
INV_TXFM_4X16_FN flipadst, adst, 12
|
|
INV_TXFM_4X16_FN flipadst, flipadst, 12
|
|
INV_TXFM_4X16_FN flipadst, identity, 12
|
|
|
|
cglobal iflipadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
|
|
call m(iadst_4x16_internal_12bpc).main_pass1
|
|
psrad m0, m3, 12
|
|
psrad m1, m2, 12
|
|
psrad m2, m5, 12
|
|
psrad m3, m4, 12
|
|
psrad m4, m7, 12
|
|
psrad m5, m6, 12
|
|
psrad m6, m9, 12
|
|
psrad m7, m8, 12
|
|
jmp tx2q
|
|
.pass2:
|
|
vpbroadcastd m12, [clip_18b_min]
|
|
vpbroadcastd m13, [clip_18b_max]
|
|
REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
|
|
REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
|
|
call m(iadst_4x16_internal_12bpc).transpose_16x4
|
|
call m(iadst_4x16_internal_10bpc).main2
|
|
pshufd m4, m3, q1032
|
|
psrad m3, m5, 3
|
|
psrad m5, m2, 3
|
|
pshufd m2, m6, q1032
|
|
pshufd m6, m1, q1032
|
|
psrad m1, m7, 3
|
|
psrad m7, m0, 3
|
|
pshufd m0, m8, q1032
|
|
REPX {psrad x, 3}, m0, m2, m4, m6
|
|
jmp m(iadst_4x16_internal_12bpc).pass2_end
|
|
|
|
INV_TXFM_4X16_FN identity, dct, 12
|
|
INV_TXFM_4X16_FN identity, adst, 12
|
|
INV_TXFM_4X16_FN identity, flipadst, 12
|
|
INV_TXFM_4X16_FN identity, identity, 12
|
|
|
|
cglobal iidentity_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
|
|
vpbroadcastd m8, [pd_1697]
|
|
mova m0, [cq+32*0]
|
|
mova m4, [cq+32*1]
|
|
mova m1, [cq+32*2]
|
|
mova m5, [cq+32*3]
|
|
vpbroadcastd m9, [pd_6144]
|
|
pmulld m2, m8, m0
|
|
pmulld m6, m8, m4
|
|
pmulld m3, m8, m1
|
|
pmulld m7, m8, m5
|
|
mova m10, [cq+32*4]
|
|
mova m11, [cq+32*5]
|
|
mova m12, [cq+32*6]
|
|
mova m13, [cq+32*7]
|
|
REPX {paddd x, m9}, m2, m6, m3, m7
|
|
REPX {psrad x, 12}, m2, m6, m3, m7
|
|
paddd m0, m2
|
|
pmulld m2, m8, m10
|
|
paddd m4, m6
|
|
pmulld m6, m8, m11
|
|
paddd m1, m3
|
|
pmulld m3, m8, m12
|
|
paddd m5, m7
|
|
pmulld m7, m8, m13
|
|
REPX {psrad x, 1 }, m0, m4, m1, m5
|
|
REPX {paddd x, m9}, m2, m6, m3, m7
|
|
REPX {psrad x, 12}, m2, m6, m3, m7
|
|
paddd m2, m10
|
|
paddd m6, m11
|
|
paddd m3, m12
|
|
paddd m7, m13
|
|
REPX {psrad x, 1 }, m2, m6, m3, m7
|
|
jmp tx2q
|
|
.pass2:
|
|
vpbroadcastd m12, [clip_18b_min]
|
|
vpbroadcastd m13, [clip_18b_max]
|
|
REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
|
|
REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
|
|
vpbroadcastd m8, [pd_5793]
|
|
vpbroadcastd m9, [pd_1024]
|
|
REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
|
|
REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
|
|
REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7
|
|
packssdw m0, m4
|
|
packssdw m1, m5
|
|
packssdw m2, m6
|
|
packssdw m3, m7
|
|
vpbroadcastd m8, [pw_16384]
|
|
vpbroadcastd m4, [pixel_12bpc_max]
|
|
call m(iidentity_4x16_internal_10bpc).pass2_end
|
|
RET
|
|
|
|
%macro INV_TXFM_8X4_FN 2-3 10 ; type1, type2, bitdepth
|
|
INV_TXFM_FN %1, %2, 0, 8x4, %3
|
|
%ifidn %1_%2, dct_dct
|
|
vpbroadcastd m2, [dconly_%3bpc]
|
|
%if %3 = 10
|
|
.dconly:
|
|
imul r6d, [cq], 181
|
|
mov [cq], eobd ; 0
|
|
or r3d, 4
|
|
add r6d, 128
|
|
sar r6d, 8
|
|
imul r6d, 181
|
|
add r6d, 128
|
|
sar r6d, 8
|
|
jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3
|
|
%else
|
|
jmp m(inv_txfm_add_dct_dct_8x4_10bpc).dconly
|
|
%endif
|
|
%endif
|
|
%endmacro
|
|
|
|
INV_TXFM_8X4_FN dct, dct
|
|
INV_TXFM_8X4_FN dct, identity
|
|
INV_TXFM_8X4_FN dct, adst
|
|
INV_TXFM_8X4_FN dct, flipadst
|
|
|
|
cglobal idct_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2
|
|
vpbroadcastd m8, [clip_18b_min]
|
|
vpbroadcastd m9, [clip_18b_max]
|
|
.pass1:
|
|
vbroadcasti128 m1, [cq+16*1]
|
|
vbroadcasti128 m0, [cq+16*5]
|
|
vbroadcasti128 m2, [cq+16*3]
|
|
vbroadcasti128 m3, [cq+16*7]
|
|
vpbroadcastd m6, [pd_2896]
|
|
shufpd m1, m0, 0x0c ; 1 5
|
|
shufpd m3, m2, 0x0c ; 7 3
|
|
vbroadcasti128 m0, [cq+16*0]
|
|
vbroadcasti128 m4, [cq+16*2]
|
|
vbroadcasti128 m2, [cq+16*4]
|
|
vbroadcasti128 m5, [cq+16*6]
|
|
vpbroadcastd m7, [pd_2048]
|
|
shufpd m0, m4, 0x0c ; 0 2
|
|
shufpd m2, m5, 0x0c ; 4 6
|
|
REPX {pmulld x, m6}, m1, m3, m0, m2
|
|
REPX {paddd x, m7}, m1, m3, m0, m2
|
|
REPX {psrad x, 12}, m1, m3, m0, m2
|
|
call .main
|
|
psubd m3, m0, m4 ; out7 out6 (interleaved)
|
|
paddd m0, m4 ; out0 out1 (interleaved)
|
|
paddd m1, m2, m5 ; out3 out2 (interleaved)
|
|
psubd m2, m5 ; out4 out5 (interleaved)
|
|
pshufd m1, m1, q1032
|
|
pshufd m3, m3, q1032
|
|
jmp tx2q
|
|
.pass2:
|
|
vbroadcasti128 m4, [deint_shuf]
|
|
packssdw m0, m1
|
|
packssdw m2, m3
|
|
vperm2i128 m1, m0, m2, 0x31
|
|
vinserti128 m0, xm2, 1
|
|
pshufb m0, m4
|
|
pshufb m1, m4
|
|
IDCT4_1D_PACKED_WORD 0, 1, 2, 3, 4, 7
|
|
vpermq m0, m0, q3120 ; out0 out1
|
|
vpermq m2, m1, q2031 ; out2 out3
|
|
jmp m(iadst_8x4_internal_10bpc).end
|
|
ALIGN function_align
|
|
.main:
ITX_MULSUB_2D 1, 3, 4, 5, 6, 7, 799_3406, 4017_2276, 1
IDCT4_1D_PACKED 0, 2, 4, 5, 6, 7
vpbroadcastd m6, [pd_2896]
punpcklqdq m4, m1, m3 ; t4a t7a
punpckhqdq m1, m3 ; t5a t6a
psubd m3, m4, m1 ; t5a t6a
paddd m4, m1 ; t4 t7
REPX {pmaxsd x, m8}, m3, m4, m0, m2
REPX {pminsd x, m9}, m3, m4, m0, m2
pmulld m3, m6
pshufd m1, m3, q1032
paddd m3, m7
psubd m5, m3, m1
paddd m1, m3
psrad m5, 12
psrad m1, 12
vpblendd m5, m4, 0x33 ; t4 t5
punpckhqdq m4, m1 ; t7 t6
ret

INV_TXFM_8X4_FN adst, dct
INV_TXFM_8X4_FN adst, adst
INV_TXFM_8X4_FN adst, flipadst
INV_TXFM_8X4_FN adst, identity

cglobal iadst_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2
call m(iadst_4x8_internal_10bpc).main
vpblendd m3, m0, m4, 0x33 ; out6 out7
vpblendd m0, m4, 0xcc ; out0 out1
pshufd m1, m5, q1032
psignd m2, m6 ; out4 out5
psignd m1, m6 ; out2 out3
jmp tx2q
.pass2:
call .pass2_main
vpermq m0, m0, q3120 ; out0 out1
vpermq m2, m1, q3120 ; out2 out3
.end:
vpbroadcastd m1, [pw_2048]
pmulhrsw m0, m1
pmulhrsw m1, m2
vpbroadcastd m5, [pixel_10bpc_max]
.end2:
mova xm2, [dstq+strideq*0]
vinserti128 m2, [dstq+strideq*1], 1
lea r6, [dstq+strideq*2]
mova xm3, [r6 +strideq*0]
vinserti128 m3, [r6 +strideq*1], 1
pxor m4, m4
REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
paddw m0, m2
paddw m1, m3
pmaxsw m0, m4
pmaxsw m1, m4
pminsw m0, m5
pminsw m1, m5
mova [dstq+strideq*0], xm0
vextracti128 [dstq+strideq*1], m0, 1
mova [r6 +strideq*0], xm1
vextracti128 [r6 +strideq*1], m1, 1
RET
ALIGN function_align
.pass2_main:
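; narrow to 16-bit, interleave/transpose, then reuse the 8bpc iadst4
; kernel for the second pass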
vbroadcasti128 m4, [deint_shuf]
packssdw m0, m1
packssdw m2, m3
lea r6, [deint_shuf+128]
vperm2i128 m1, m0, m2, 0x31
vinserti128 m0, xm2, 1
pshufb m0, m4
pshufb m1, m4
jmp m(iadst_8x4_internal_8bpc).main
ALIGN function_align
.main:
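; rect2 input scaling: multiply all four input rows by 2896/4096
; (~1/sqrt(2)) before the shared IADST4_1D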
vpbroadcastd m1, [pd_2896]
pmulld m0, m1, [cq+32*0]
pmulld m3, m1, [cq+32*3]
pmulld m2, m1, [cq+32*2]
pmulld m1, [cq+32*1]
vpbroadcastd m4, [pd_2048]
REPX {paddd x, m4}, m0, m3, m2, m1
REPX {psrad x, 12}, m0, m3, m2, m1
.main2:
IADST4_1D
ret

INV_TXFM_8X4_FN flipadst, dct
INV_TXFM_8X4_FN flipadst, adst
INV_TXFM_8X4_FN flipadst, flipadst
INV_TXFM_8X4_FN flipadst, identity

cglobal iflipadst_8x4_internal_10bpc, 0, 5, 10, dst, stride, c, eob, tx2
call m(iadst_4x8_internal_10bpc).main
shufpd m3, m4, m0, 0x05
shufpd m0, m4, 0x05
psignd m2, m6
pshufd m6, m6, q1032
pshufd m1, m2, q1032
psignd m2, m5, m6
jmp tx2q
.pass2:
call m(iadst_8x4_internal_10bpc).pass2_main
vpermq m2, m0, q2031
vpermq m0, m1, q2031
jmp m(iadst_8x4_internal_10bpc).end

INV_TXFM_8X4_FN identity, dct
INV_TXFM_8X4_FN identity, adst
INV_TXFM_8X4_FN identity, flipadst
INV_TXFM_8X4_FN identity, identity

cglobal iidentity_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2
.pass1:
vpbroadcastd m4, [pd_2896]
vpermq m0, [cq+32*0], q3120
vpermq m1, [cq+32*1], q3120
vpermq m2, [cq+32*2], q3120
vpermq m3, [cq+32*3], q3120
vpbroadcastd m7, [pd_2048]
REPX {pmulld x, m4}, m0, m1, m2, m3
REPX {paddd x, m7}, m0, m1, m2, m3
REPX {psrad x, 12}, m0, m1, m2, m3
REPX {paddd x, x }, m0, m1, m2, m3
jmp tx2q
.pass2:
vpbroadcastd m5, [pixel_10bpc_max]
vpbroadcastd m4, [pw_1697x8]
packssdw m0, m1
packssdw m2, m3
pmulhrsw m1, m4, m0
pmulhrsw m4, m2
paddsw m0, m1
paddsw m2, m4
packssdw m7, m7 ; pw_2048
.pass2_end:
punpckhwd m1, m0, m2
punpcklwd m0, m2
lea r6, [dstq+strideq*2]
punpckhwd m2, m0, m1
punpcklwd m0, m1
pmulhrsw m2, m7
pmulhrsw m0, m7
punpckhwd m1, m0, m2
punpcklwd m0, m2
mova xm2, [dstq+strideq*0]
vinserti128 m2, [r6 +strideq*0], 1
mova xm3, [dstq+strideq*1]
vinserti128 m3, [r6 +strideq*1], 1
pxor m4, m4
REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
paddw m0, m2
paddw m1, m3
pmaxsw m0, m4
pmaxsw m1, m4
pminsw m0, m5
pminsw m1, m5
mova [dstq+strideq*0], xm0
mova [dstq+strideq*1], xm1
vextracti128 [r6 +strideq*0], m0, 1
vextracti128 [r6 +strideq*1], m1, 1
RET

INV_TXFM_8X4_FN dct, dct, 12
INV_TXFM_8X4_FN dct, identity, 12
INV_TXFM_8X4_FN dct, adst, 12
INV_TXFM_8X4_FN dct, flipadst, 12

cglobal idct_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
vpbroadcastd m8, [clip_20b_min]
vpbroadcastd m9, [clip_20b_max]
jmp m(idct_8x4_internal_10bpc).pass1
.pass2:
vpbroadcastd m8, [clip_18b_min]
vpbroadcastd m9, [clip_18b_max]
REPX {pmaxsd x, m8}, m0, m1, m2, m3
REPX {pminsd x, m9}, m0, m1, m2, m3
call m(iadst_8x4_internal_12bpc).transpose_4x8
IDCT4_1D 0, 1, 2, 3, 4, 5, 6, 7
jmp m(iadst_8x4_internal_12bpc).end

INV_TXFM_8X4_FN adst, dct, 12
INV_TXFM_8X4_FN adst, adst, 12
INV_TXFM_8X4_FN adst, flipadst, 12
INV_TXFM_8X4_FN adst, identity, 12

cglobal iadst_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
vpbroadcastd m8, [clip_20b_min]
vpbroadcastd m9, [clip_20b_max]
call m(iadst_4x8_internal_10bpc).main2
vpblendd m3, m0, m4, 0x33 ; out6 out7
vpblendd m0, m4, 0xcc ; out0 out1
pshufd m1, m5, q1032
psignd m2, m6 ; out4 out5
psignd m1, m6 ; out2 out3
jmp tx2q
.pass2:
vpbroadcastd m8, [clip_18b_min]
vpbroadcastd m9, [clip_18b_max]
REPX {pmaxsd x, m8}, m0, m1, m2, m3
REPX {pminsd x, m9}, m0, m1, m2, m3
call .pass2_main
vpbroadcastd m5, [pd_2048]
paddd m0, m5, m4
paddd m1, m5, m6
paddd m2, m5
paddd m3, m5
.pass2_end:
REPX {psrad x, 12}, m0, m1, m2, m3
.end:
vpbroadcastd m4, [pw_16384]
REPX {psrad x, 3}, m0, m1, m2, m3
packssdw m0, m1
packssdw m2, m3
pmulhrsw m0, m4
pmulhrsw m1, m2, m4
vpermq m0, m0, q3120 ; out0 out1
vpermq m1, m1, q3120 ; out2 out3
vpbroadcastd m5, [pixel_12bpc_max]
jmp m(iadst_8x4_internal_10bpc).end2
ALIGN function_align
.pass2_main:
call .transpose_4x8
jmp m(iadst_8x4_internal_10bpc).main2
ALIGN function_align
.transpose_4x8:
; deinterleave
pshufd m0, m0, q3120
pshufd m1, m1, q3120
pshufd m2, m2, q3120
pshufd m3, m3, q3120
; transpose
punpcklqdq m4, m0, m1
punpckhqdq m0, m1
punpcklqdq m5, m2, m3
punpckhqdq m2, m3
vperm2i128 m1, m0, m2, 0x20 ; out1
vperm2i128 m3, m0, m2, 0x31 ; out3
vperm2i128 m2, m4, m5, 0x31 ; out2
vperm2i128 m0, m4, m5, 0x20 ; out0
ret

INV_TXFM_8X4_FN flipadst, dct, 12
INV_TXFM_8X4_FN flipadst, adst, 12
INV_TXFM_8X4_FN flipadst, flipadst, 12
INV_TXFM_8X4_FN flipadst, identity, 12

cglobal iflipadst_8x4_internal_12bpc, 0, 5, 10, dst, stride, c, eob, tx2
vpbroadcastd m8, [clip_20b_min]
vpbroadcastd m9, [clip_20b_max]
call m(iadst_4x8_internal_10bpc).main2
shufpd m3, m4, m0, 0x05
shufpd m0, m4, 0x05
psignd m2, m6
pshufd m6, m6, q1032
pshufd m1, m2, q1032
psignd m2, m5, m6
jmp tx2q
.pass2:
vpbroadcastd m8, [clip_18b_min]
vpbroadcastd m9, [clip_18b_max]
REPX {pmaxsd x, m8}, m0, m1, m2, m3
REPX {pminsd x, m9}, m0, m1, m2, m3
call m(iadst_8x4_internal_12bpc).pass2_main
vpbroadcastd m5, [pd_2048]
paddd m0, m5, m3
paddd m1, m5, m2
paddd m3, m5, m4
paddd m2, m5, m6
jmp m(iadst_8x4_internal_12bpc).pass2_end

INV_TXFM_8X4_FN identity, dct, 12
INV_TXFM_8X4_FN identity, adst, 12
INV_TXFM_8X4_FN identity, flipadst, 12
INV_TXFM_8X4_FN identity, identity, 12

cglobal iidentity_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
jmp m(iidentity_8x4_internal_10bpc).pass1
.pass2:
; m0 = in0 in1 (interleaved)
; m1 = in2 in3 (interleaved)
; m2 = in4 in5 (interleaved)
; m3 = in6 in7 (interleaved)
vpbroadcastd m8, [clip_18b_min]
vpbroadcastd m9, [clip_18b_max]
REPX {pmaxsd x, m8}, m0, m1, m2, m3
REPX {pminsd x, m9}, m0, m1, m2, m3
vpbroadcastd m4, [pd_5793]
REPX {pmulld x, m4}, m0, m1, m2, m3
REPX {paddd x, m7}, m0, m1, m2, m3
REPX {psrad x, 15}, m0, m1, m2, m3
vpbroadcastd m5, [pixel_12bpc_max]
vpbroadcastd m7, [pw_16384]
packssdw m0, m1
packssdw m2, m3
jmp m(iidentity_8x4_internal_10bpc).pass2_end

%macro INV_TXFM_8X8_FN 2-3 10 ; type1, type2, bitdepth
INV_TXFM_FN %1, %2, 0, 8x8, %3
%ifidn %1_%2, dct_dct
vpbroadcastd m2, [dconly_%3bpc]
%if %3 = 10
.dconly:
imul r6d, [cq], 181
mov [cq], eobd ; 0
or r3d, 8
.dconly2:
add r6d, 384
sar r6d, 9
.dconly3:
imul r6d, 181
add r6d, 2176
sar r6d, 12
movd xm0, r6d
paddsw xm0, xm2
vpbroadcastw m0, xm0
.dconly_loop:
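; m0 = dc + (0x8000 - (pixel_max+1)): paddsw clips the high end via
; signed saturation, psubusw then removes the bias again and clips
; the low end at zero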
mova xm1, [dstq+strideq*0]
vinserti128 m1, [dstq+strideq*1], 1
paddsw m1, m0
psubusw m1, m2
mova [dstq+strideq*0], xm1
vextracti128 [dstq+strideq*1], m1, 1
lea dstq, [dstq+strideq*2]
sub r3d, 2
jg .dconly_loop
RET
%else
jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly
%endif
%endif
%endmacro

%macro IADST8_1D 14 ; src[1-8], tmp[1-3], pd_2048, clip[1-2]
ITX_MULSUB_2D %8, %1, %9, %10, %11, %12, 401, 4076 ; t1a, t0a
ITX_MULSUB_2D %2, %7, %9, %10, %11, %12, 3920, 1189 ; t7a, t6a
ITX_MULSUB_2D %6, %3, %9, %10, %11, %12, 1931, 3612 ; t3a, t2a
ITX_MULSUB_2D %4, %5, %9, %10, %11, %12, 3166, 2598 ; t5a, t4a
psubd m%9, m%3, m%7 ; t6
paddd m%3, m%7 ; t2
psubd m%7, m%1, m%5 ; t4
paddd m%1, m%5 ; t0
psubd m%5, m%6, m%2 ; t7
paddd m%6, m%2 ; t3
psubd m%2, m%8, m%4 ; t5
paddd m%8, m%4 ; t1
REPX {pmaxsd x, m%13}, m%7, m%2, m%9, m%5, m%3, m%1, m%6, m%8
REPX {pminsd x, m%14}, m%7, m%2, m%9, m%5, m%3, m%1, m%6, m%8
ITX_MULSUB_2D %7, %2, %4, %10, %11, %12, 1567, 3784 ; t5a, t4a
ITX_MULSUB_2D %5, %9, %4, %10, %11, %12, 3784, %11 ; t6a, t7a
psubd m%10, m%7, m%9 ; t7
paddd m%7, m%9 ; out6
vpbroadcastd m%9, [pd_1448]
psubd m%4, m%8, m%6 ; t3
paddd m%8, m%6 ; -out7
psubd m%6, m%1, m%3 ; t2
paddd m%1, m%3 ; out0
psubd m%3, m%2, m%5 ; t6
paddd m%2, m%5 ; -out1
REPX {pmaxsd x, m%13}, m%6, m%4, m%3, m%10
REPX {pminsd x, m%14}, m%6, m%4, m%3, m%10
REPX {pmulld x, m%9 }, m%6, m%4, m%3, m%10
psubd m%5, m%6, m%4 ; (t2 - t3) * 1448
paddd m%4, m%6 ; (t2 + t3) * 1448
psubd m%6, m%3, m%10 ; (t6 - t7) * 1448
paddd m%3, m%10 ; (t6 + t7) * 1448
%endmacro

INV_TXFM_8X8_FN dct, dct
INV_TXFM_8X8_FN dct, identity
INV_TXFM_8X8_FN dct, adst
INV_TXFM_8X8_FN dct, flipadst

cglobal idct_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
.pass1:
mova m0, [cq+32*0]
mova m1, [cq+32*1]
mova m2, [cq+32*2]
mova m3, [cq+32*3]
mova m4, [cq+32*4]
mova m5, [cq+32*5]
mova m6, [cq+32*6]
mova m7, [cq+32*7]
vpbroadcastd m11, [pd_2048]
call .main
call .round_shift1
jmp tx2q
.pass2:
call .transpose_8x8_packed
call m(idct_8x8_internal_8bpc).main
vpbroadcastd m12, [pw_2048]
vpermq m0, m0, q3120
vpermq m1, m1, q2031
vpermq m2, m2, q3120
vpermq m3, m3, q2031
pmulhrsw m0, m12
pmulhrsw m1, m12
call .write_8x4_start
pmulhrsw m0, m2, m12
pmulhrsw m1, m3, m12
call .write_8x4
RET
ALIGN function_align
.write_8x4_start:
vpbroadcastd m11, [pixel_10bpc_max]
lea r6, [strideq*3]
pxor m10, m10
.write_8x4:
mova xm8, [dstq+strideq*0]
vinserti128 m8, [dstq+strideq*1], 1
mova xm9, [dstq+strideq*2]
vinserti128 m9, [dstq+r6 ], 1
mova [cq+32*0], m10
mova [cq+32*1], m10
mova [cq+32*2], m10
mova [cq+32*3], m10
add cq, 32*4
paddw m0, m8
paddw m1, m9
pmaxsw m0, m10
pmaxsw m1, m10
pminsw m0, m11
pminsw m1, m11
mova [dstq+strideq*0], xm0
vextracti128 [dstq+strideq*1], m0, 1
mova [dstq+strideq*2], xm1
vextracti128 [dstq+r6 ], m1, 1
lea dstq, [dstq+strideq*4]
ret
ALIGN function_align
.transpose_8x8_packed:
packssdw m0, m4
packssdw m1, m5
packssdw m2, m6
packssdw m3, m7
lea r6, [deint_shuf+128]
punpckhwd m4, m0, m1
punpcklwd m0, m1
punpckhwd m1, m2, m3
punpcklwd m2, m3
punpckhdq m3, m0, m2
punpckldq m0, m2
punpckhdq m2, m4, m1
punpckldq m4, m1
vinserti128 m1, m3, xm2, 1
vperm2i128 m3, m2, 0x31
vperm2i128 m2, m0, m4, 0x31
vinserti128 m0, xm4, 1
ret
ALIGN function_align
.main_rect2:
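; inputs were pre-multiplied by pd_2896; rounding and >>12 here
; complete the ~1/sqrt(2) rect2 downscale before the regular main part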
REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
.main:
ITX_MULSUB_2D 5, 3, 8, 9, 10, 11, 3406, 2276 ; t5a t6a
ITX_MULSUB_2D 1, 7, 8, 9, 10, 11, 799, 4017 ; t4a t7a
ITX_MULSUB_2D 2, 6, 8, 9, 10, 11, 1567, 3784 ; t2 t3
paddd m8, m1, m5 ; t4
psubd m1, m5 ; t5a
paddd m9, m7, m3 ; t7
psubd m7, m3 ; t6a
vpbroadcastd m3, [pd_2896]
REPX {pmaxsd x, m12}, m1, m8, m7, m9
REPX {pminsd x, m13}, m1, m8, m7, m9
REPX {pmulld x, m3 }, m0, m4, m7, m1
paddd m0, m11
paddd m7, m11
psubd m5, m0, m4
paddd m0, m4
psubd m4, m7, m1
paddd m7, m1
REPX {psrad x, 12 }, m5, m0, m4, m7
psubd m3, m0, m6 ; dct4 out3
paddd m0, m6 ; dct4 out0
paddd m6, m5, m2 ; dct4 out1
psubd m5, m2 ; dct4 out2
REPX {pmaxsd x, m12}, m0, m6, m5, m3
REPX {pminsd x, m13}, m0, m6, m5, m3
ret
ALIGN function_align
.round_shift1:
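; pcmpeqd sets all lanes to -1, so the psubd below adds the +1
; rounding bias before the final >>1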
pcmpeqd m1, m1
REPX {psubd x, m1}, m0, m6, m5, m3
paddd m1, m6, m7 ; out1
psubd m6, m7 ; out6
psubd m7, m0, m9 ; out7
paddd m0, m9 ; out0
paddd m2, m5, m4 ; out2
psubd m5, m4 ; out5
psubd m4, m3, m8 ; out4
paddd m3, m8 ; out3
REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7
ret

INV_TXFM_8X8_FN adst, dct
INV_TXFM_8X8_FN adst, adst
INV_TXFM_8X8_FN adst, flipadst
INV_TXFM_8X8_FN adst, identity

cglobal iadst_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
.pass1:
call .main
call .main_end
jmp tx2q
.pass2:
call m(idct_8x8_internal_10bpc).transpose_8x8_packed
pshufd m4, m0, q1032
pshufd m5, m1, q1032
call m(iadst_8x8_internal_8bpc).main_pass2
vpbroadcastd m5, [pw_2048]
vpbroadcastd xm12, [pw_4096]
psubw m12, m5
REPX {vpermq x, x, q3120}, m0, m1, m2, m3
pmulhrsw m0, m12
pmulhrsw m1, m12
call m(idct_8x8_internal_10bpc).write_8x4_start
pmulhrsw m0, m2, m12
pmulhrsw m1, m3, m12
call m(idct_8x8_internal_10bpc).write_8x4
RET
ALIGN function_align
.main:
mova m0, [cq+32*0]
mova m7, [cq+32*7]
mova m1, [cq+32*1]
mova m6, [cq+32*6]
mova m2, [cq+32*2]
mova m5, [cq+32*5]
mova m3, [cq+32*3]
mova m4, [cq+32*4]
vpbroadcastd m11, [pd_2048]
.main2:
IADST8_1D 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
psrld m8, 10 ; pd_1
vpbroadcastd m9, [pd_3072]
ret
ALIGN function_align
.main_end:
paddd m0, m8
psubd m1, m8, m1
paddd m6, m8
psubd m7, m8, m7
REPX {psrad x, 1 }, m0, m1, m6, m7
; (1 + ((x + 1024) >> 11)) >> 1 = (3072 + x) >> 12
; (1 - ((x + 1024) >> 11)) >> 1 = (3071 - x) >> 12
psubd m8, m9, m8 ; pd_3071
paddd m2, m9
psubd m3, m8, m3
paddd m4, m9
psubd m5, m8, m5
REPX {psrad x, 12}, m2, m3, m4, m5
ret

INV_TXFM_8X8_FN flipadst, dct
INV_TXFM_8X8_FN flipadst, adst
INV_TXFM_8X8_FN flipadst, flipadst
INV_TXFM_8X8_FN flipadst, identity

cglobal iflipadst_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
.pass1:
call m(iadst_8x8_internal_10bpc).main
call .main_end
jmp tx2q
.pass2:
call m(idct_8x8_internal_10bpc).transpose_8x8_packed
pshufd m4, m0, q1032
pshufd m5, m1, q1032
call m(iadst_8x8_internal_8bpc).main_pass2
vpbroadcastd m12, [pw_2048]
vpbroadcastd xm5, [pw_4096]
psubw m12, m5
vpermq m8, m3, q2031
vpermq m9, m2, q2031
vpermq m2, m1, q2031
vpermq m3, m0, q2031
pmulhrsw m0, m8, m12
pmulhrsw m1, m9, m12
call m(idct_8x8_internal_10bpc).write_8x4_start
pmulhrsw m0, m2, m12
pmulhrsw m1, m3, m12
call m(idct_8x8_internal_10bpc).write_8x4
RET
ALIGN function_align
.main_end:
paddd m10, m8, m0
psubd m0, m8, m7
psubd m7, m8, m1
paddd m1, m8, m6
psrad m0, 1
psrad m1, 1
psrad m6, m7, 1
psrad m7, m10, 1
psubd m8, m9, m8 ; pd_3071
psubd m10, m8, m5
paddd m5, m9, m2
psubd m2, m8, m3
paddd m3, m9, m4
psrad m4, m2, 12
psrad m2, m10, 12
psrad m3, 12
psrad m5, 12
ret

INV_TXFM_8X8_FN identity, dct
INV_TXFM_8X8_FN identity, adst
INV_TXFM_8X8_FN identity, flipadst
INV_TXFM_8X8_FN identity, identity

cglobal iidentity_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
.pass1:
mova m0, [cq+32*0]
mova m1, [cq+32*1]
mova m2, [cq+32*2]
mova m3, [cq+32*3]
mova m4, [cq+32*4]
mova m5, [cq+32*5]
mova m6, [cq+32*6]
mova m7, [cq+32*7]
jmp tx2q
.pass2:
packssdw m3, m7
vpbroadcastd m7, [pixel_10bpc_max]
.pass2_main:
packssdw m0, m4
packssdw m1, m5
packssdw m2, m6
vpbroadcastd m12, [pw_4096]
punpckhwd m4, m0, m1
punpcklwd m0, m1
punpckhwd m1, m2, m3
punpcklwd m2, m3
punpckhdq m3, m0, m2
punpckldq m0, m2
punpckldq m2, m4, m1
punpckhdq m4, m1
punpckhqdq m1, m0, m2 ; 1 5
punpcklqdq m0, m2 ; 0 4
punpcklqdq m2, m3, m4 ; 2 6
punpckhqdq m3, m4 ; 3 7
pmulhrsw m0, m12
pmulhrsw m1, m12
call .write_2x8x2_start
pmulhrsw m0, m2, m12
pmulhrsw m1, m3, m12
call .write_2x8x2_zero
RET
.write_2x8x2_start:
lea r6, [strideq*5]
pxor m6, m6
.write_2x8x2_zero:
mova [cq+32*0], m6
mova [cq+32*1], m6
mova [cq+32*2], m6
mova [cq+32*3], m6
add cq, 32*4
.write_2x8x2:
mova xm4, [dstq+strideq*0]
vinserti128 m4, [dstq+strideq*4], 1
mova xm5, [dstq+strideq*1]
vinserti128 m5, [dstq+r6 ], 1
paddw m0, m4
paddw m1, m5
pmaxsw m0, m6
pmaxsw m1, m6
pminsw m0, m7
pminsw m1, m7
mova [dstq+strideq*0], xm0
mova [dstq+strideq*1], xm1
vextracti128 [dstq+strideq*4], m0, 1
vextracti128 [dstq+r6 ], m1, 1
lea dstq, [dstq+strideq*2]
ret

%macro TRANSPOSE_8X8_DWORD 12 ; src/dst[1-8], tmp[1-4]
punpckldq m%9, m%1, m%2 ; aibj emfn
punpckhdq m%1, m%2 ; ckdl gohp
punpckldq m%10, m%3, m%4 ; qyrz uCvD
punpckhdq m%3, m%4 ; sAtB wExF
punpckldq m%11, m%5, m%6 ; GOHP KSLT
punpckhdq m%5, m%6 ; IQJR MUNV
punpckldq m%12, m%7, m%8 ; WeXf aibj
punpckhdq m%7, m%8 ; YgZh ckdl
punpcklqdq m%2, m%9, m%10 ; aiqy emuC
punpckhqdq m%9, m%10 ; bjrz fnvD
punpcklqdq m%4, m%1, m%3 ; cksA gowE
punpckhqdq m%10, m%1, m%3 ; dltB hpxF
punpcklqdq m%6, m%11, m%12 ; GOWe KSai
punpckhqdq m%11, m%12 ; HPXf LTbj
punpcklqdq m%8, m%5, m%7 ; IQYg MUck
punpckhqdq m%12, m%5, m%7 ; JRZh NVdl
vperm2i128 m%1, m%2, m%6, 0x20 ; out0
vperm2i128 m%5, m%2, m%6, 0x31 ; out4
vperm2i128 m%2, m%9, m%11, 0x20 ; out1
vperm2i128 m%6, m%9, m%11, 0x31 ; out5
vperm2i128 m%3, m%4, m%8, 0x20 ; out2
vperm2i128 m%7, m%4, m%8, 0x31 ; out6
vperm2i128 m%4, m%10, m%12, 0x20 ; out3
vperm2i128 m%8, m%10, m%12, 0x31 ; out7
%endmacro

INV_TXFM_8X8_FN dct, dct, 12
INV_TXFM_8X8_FN dct, identity, 12
INV_TXFM_8X8_FN dct, adst, 12
INV_TXFM_8X8_FN dct, flipadst, 12

cglobal idct_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
vpbroadcastd m12, [clip_20b_min]
vpbroadcastd m13, [clip_20b_max]
jmp m(idct_8x8_internal_10bpc).pass1
.pass2:
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
call .transpose_8x8
vpbroadcastd m11, [pd_2048]
call m(idct_8x8_internal_10bpc).main
call .round_shift4
jmp m(iadst_8x8_internal_12bpc).pass2_end
ALIGN function_align
.write_8x4_start:
vpbroadcastd m11, [pixel_12bpc_max]
lea r6, [strideq*3]
pxor m10, m10
ret
ALIGN function_align
.transpose_8x8:
TRANSPOSE_8X8_DWORD 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
ret
ALIGN function_align
.round_shift4:
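; 12bpc variant of .round_shift1: +8 rounding bias and >>4 instead
; of +1 and >>1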
vpbroadcastd m1, [pd_8]
REPX {paddd x, m1}, m0, m6, m5, m3
paddd m1, m6, m7 ; out1
psubd m6, m7 ; out6
psubd m7, m0, m9 ; out7
paddd m0, m9 ; out0
paddd m2, m5, m4 ; out2
psubd m5, m4 ; out5
psubd m4, m3, m8 ; out4
paddd m3, m8 ; out3
REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7
ret

INV_TXFM_8X8_FN adst, dct, 12
INV_TXFM_8X8_FN adst, adst, 12
INV_TXFM_8X8_FN adst, flipadst, 12
INV_TXFM_8X8_FN adst, identity, 12

cglobal iadst_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
vpbroadcastd m12, [clip_20b_min]
vpbroadcastd m13, [clip_20b_max]
jmp m(iadst_8x8_internal_10bpc).pass1
.pass2:
call .pass2_main
.pass2_end:
packssdw m0, m1
packssdw m1, m2, m3
REPX {vpermq x, x, q3120}, m0, m1
call m(idct_8x8_internal_12bpc).write_8x4_start
call m(idct_8x8_internal_10bpc).write_8x4
packssdw m0, m4, m5
packssdw m1, m6, m7
REPX {vpermq x, x, q3120}, m0, m1
call m(idct_8x8_internal_10bpc).write_8x4
RET
ALIGN function_align
.pass2_main:
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
call m(idct_8x8_internal_12bpc).transpose_8x8
vpbroadcastd m11, [pd_2048]
.pass2_main2:
call m(iadst_8x8_internal_10bpc).main2
pslld m9, m8, 3 ; pd_8
paddd m0, m9
psubd m1, m9, m1 ; 8+x
paddd m6, m9
psubd m7, m9, m7
REPX {psrad x, 4}, m0, m1, m6, m7
vpbroadcastd m9, [pd_17408]
psubd m8, m9, m8 ; 17407
paddd m2, m9
psubd m3, m8, m3
paddd m4, m9
psubd m5, m8, m5
REPX {psrad x, 15}, m2, m3, m4, m5
ret

INV_TXFM_8X8_FN flipadst, dct, 12
INV_TXFM_8X8_FN flipadst, adst, 12
INV_TXFM_8X8_FN flipadst, flipadst, 12
INV_TXFM_8X8_FN flipadst, identity, 12

cglobal iflipadst_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
vpbroadcastd m12, [clip_20b_min]
vpbroadcastd m13, [clip_20b_max]
jmp m(iflipadst_8x8_internal_10bpc).pass1
.pass2:
call m(iadst_8x8_internal_12bpc).pass2_main
packssdw m7, m7, m6
packssdw m6, m1, m0
packssdw m1, m5, m4
vpermq m0, m7, q3120
vpermq m1, m1, q3120
call m(idct_8x8_internal_12bpc).write_8x4_start
call m(idct_8x8_internal_10bpc).write_8x4
packssdw m0, m3, m2
vpermq m0, m0, q3120
vpermq m1, m6, q3120
call m(idct_8x8_internal_10bpc).write_8x4
RET

INV_TXFM_8X8_FN identity, dct, 12
INV_TXFM_8X8_FN identity, adst, 12
INV_TXFM_8X8_FN identity, flipadst, 12
INV_TXFM_8X8_FN identity, identity, 12

cglobal iidentity_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
jmp m(iidentity_8x8_internal_10bpc).pass1
.pass2:
packssdw m3, m7
vpbroadcastd m7, [pixel_12bpc_max]
jmp m(iidentity_8x8_internal_10bpc).pass2_main

%macro INV_TXFM_8X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth
INV_TXFM_FN %1, %2, %3, 8x16, %4
%ifidn %1_%2, dct_dct
imul r6d, [cq], 181
vpbroadcastd m2, [dconly_%4bpc]
mov [cq], eobd ; 0
or r3d, 16
add r6d, 128
sar r6d, 8
imul r6d, 181
jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2
%endif
%endmacro

INV_TXFM_8X16_FN dct, dct
INV_TXFM_8X16_FN dct, identity, 35
INV_TXFM_8X16_FN dct, adst
INV_TXFM_8X16_FN dct, flipadst

cglobal idct_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
%undef cmp
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
.pass1:
vpbroadcastd m14, [pd_2896]
vpbroadcastd m11, [pd_2048]
cmp eobd, 43
jl .fast
add cq, 32
call .pass1_main
sub cq, 32
mova [cq+32* 1], m0
mova [cq+32* 3], m1
mova [cq+32* 5], m2
mova [cq+32* 7], m3
mova [cq+32* 9], m4
mova [cq+32*11], m5
mova [cq+32*13], m6
mova m15, m7
call .pass1_main
mova m8, [cq+32* 1]
mova m9, [cq+32* 3]
mova m10, [cq+32* 5]
mova m11, [cq+32* 7]
mova m12, [cq+32* 9]
mova m13, [cq+32*11]
mova m14, [cq+32*13]
jmp tx2q
.fast:
call .pass1_main
pxor m8, m8
REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
jmp tx2q
.pass2:
call .transpose
call m(idct_8x16_internal_8bpc).main
vpbroadcastd m12, [pw_2048]
REPX {vpermq x, x, q3120}, m0, m2, m4, m6
REPX {vpermq x, x, q2031}, m1, m3, m5, m7
.end:
pmulhrsw m0, m12
pmulhrsw m1, m12
call m(idct_8x8_internal_10bpc).write_8x4_start
pmulhrsw m0, m2, m12
pmulhrsw m1, m3, m12
call m(idct_8x8_internal_10bpc).write_8x4
pmulhrsw m0, m4, m12
pmulhrsw m1, m5, m12
call m(idct_8x8_internal_10bpc).write_8x4
pmulhrsw m0, m6, m12
pmulhrsw m1, m7, m12
call m(idct_8x8_internal_10bpc).write_8x4
RET
ALIGN function_align
.transpose:
packssdw m0, m8
packssdw m1, m9
packssdw m2, m10
packssdw m3, m11
packssdw m4, m12
packssdw m5, m13
packssdw m6, m14
packssdw m7, m15
lea r6, [deint_shuf+128]
punpckhwd m8, m0, m1
punpcklwd m0, m1
punpckhwd m1, m2, m3
punpcklwd m2, m3
punpcklwd m3, m4, m5
punpckhwd m4, m5
punpckhwd m5, m6, m7
punpcklwd m6, m7
punpckhdq m7, m3, m6
punpckldq m3, m6
punpckhdq m6, m4, m5
punpckldq m4, m5
punpckhdq m5, m8, m1
punpckldq m8, m1
punpckhdq m1, m0, m2
punpckldq m0, m2
vperm2i128 m2, m0, m3, 0x31
vinserti128 m0, xm3, 1
vperm2i128 m3, m1, m7, 0x31
vinserti128 m1, xm7, 1
vperm2i128 m7, m5, m6, 0x31
vinserti128 m5, xm6, 1
vperm2i128 m6, m8, m4, 0x31
vinserti128 m4, m8, xm4, 1
ret
ALIGN function_align
.pass1_main:
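; one 8x8 half of the 8x16 first pass: even 32-byte rows of cq (the
; caller offsets cq by 32 beforehand to process the odd rows)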
pmulld m0, m14, [cq+32* 0]
pmulld m1, m14, [cq+32* 2]
pmulld m2, m14, [cq+32* 4]
pmulld m3, m14, [cq+32* 6]
pmulld m4, m14, [cq+32* 8]
pmulld m5, m14, [cq+32*10]
pmulld m6, m14, [cq+32*12]
pmulld m7, m14, [cq+32*14]
call m(idct_8x8_internal_10bpc).main_rect2
jmp m(idct_8x8_internal_10bpc).round_shift1
ALIGN function_align
.main_evenhalf:
paddd m1, m6, m7 ; idct8 out1
psubd m6, m7 ; idct8 out6
psubd m7, m0, m9 ; idct8 out7
paddd m0, m9 ; idct8 out0
paddd m2, m5, m4 ; idct8 out2
psubd m5, m4 ; idct8 out5
psubd m4, m3, m8 ; idct8 out4
paddd m3, m8 ; idct8 out3
REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
ret
.main_oddhalf_fast_rect2:
REPX {paddd x, m11}, m0, m1, m2, m3
REPX {psrad x, 12 }, m0, m1, m2, m3
.main_oddhalf_fast: ; lower half zero
vpbroadcastd m7, [pd_4076]
vpbroadcastd m8, [pd_401]
vpbroadcastd m6, [pd_m1189]
vpbroadcastd m9, [pd_3920]
vpbroadcastd m5, [pd_3612]
vpbroadcastd m10, [pd_1931]
vpbroadcastd m4, [pd_m2598]
vpbroadcastd m15, [pd_3166]
pmulld m7, m0
pmulld m0, m8
pmulld m6, m1
pmulld m1, m9
pmulld m5, m2
pmulld m2, m10
pmulld m4, m3
pmulld m3, m15
jmp .main_oddhalf_fast2
.main_oddhalf_rect2:
REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
.main_oddhalf:
ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 401, 4076 ; t8a, t15a
ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3920, 1189 ; t11a, t12a
ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1931, 3612 ; t10a, t13a
ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3166, 2598 ; t9a, t14a
.main_oddhalf_fast2:
REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3
REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3
psubd m8, m0, m4 ; t9
paddd m0, m4 ; t8
psubd m4, m6, m2 ; t10
paddd m2, m6 ; t11
psubd m6, m1, m5 ; t13
paddd m5, m1 ; t12
psubd m1, m7, m3 ; t14
paddd m7, m3 ; t15
REPX {pmaxsd x, m12}, m8, m1, m4, m6, m0, m2, m5, m7
REPX {pminsd x, m13}, m8, m1, m4, m6, m0, m2, m5, m7
vpbroadcastd m15, [pd_3784]
vpbroadcastd m10, [pd_1567]
ITX_MULSUB_2D 1, 8, 3, 9, _, 11, 10, 15
ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 2
psubd m3, m1, m4 ; t10
paddd m1, m4 ; t9
psubd m4, m0, m2 ; t11a
paddd m0, m2 ; t8a
psubd m2, m8, m6 ; t13
paddd m6, m8 ; t14
psubd m8, m7, m5 ; t12a
paddd m7, m5 ; t15a
REPX {pmaxsd x, m12}, m2, m8, m3, m4, m0, m1, m6, m7
REPX {pminsd x, m13}, m2, m8, m3, m4, m0, m1, m6, m7
REPX {pmulld x, m14}, m2, m8, m3, m4
paddd m2, m11
paddd m8, m11
paddd m5, m2, m3 ; t13a
psubd m2, m3 ; t10a
psubd m3, m8, m4 ; t11
paddd m4, m8 ; t12
REPX {psrad x, 12}, m5, m2, m3, m4
mova [r6-32*4], m7
mova [r6-32*3], m6
mova [r6-32*2], m5
mova [r6-32*1], m4
mova [r6+32*0], m3
mova [r6+32*1], m2
mova [r6+32*2], m1
mova [r6+32*3], m0
ret

INV_TXFM_8X16_FN adst, dct
INV_TXFM_8X16_FN adst, adst
INV_TXFM_8X16_FN adst, flipadst
INV_TXFM_8X16_FN adst, identity, 35

cglobal iadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
%undef cmp
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
.pass1:
vpbroadcastd m14, [pd_2896]
vpbroadcastd m11, [pd_2048]
cmp eobd, 43
jl .fast
add cq, 32
call .pass1_main
call m(iadst_8x8_internal_10bpc).main_end
sub cq, 32
mova [cq+32* 1], m0
mova [cq+32* 3], m1
mova [cq+32* 5], m2
mova [cq+32* 7], m3
mova [cq+32* 9], m4
mova [cq+32*11], m5
mova [cq+32*13], m6
mova m15, m7
call .pass1_main
call m(iadst_8x8_internal_10bpc).main_end
mova m8, [cq+32* 1]
mova m9, [cq+32* 3]
mova m10, [cq+32* 5]
mova m11, [cq+32* 7]
mova m12, [cq+32* 9]
mova m13, [cq+32*11]
mova m14, [cq+32*13]
jmp tx2q
.fast:
call .pass1_main
call m(iadst_8x8_internal_10bpc).main_end
pxor m8, m8
REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
jmp tx2q
.pass2:
call m(idct_8x16_internal_10bpc).transpose
call m(iadst_8x16_internal_8bpc).main
call m(iadst_8x16_internal_8bpc).main_pass2_end
vpbroadcastd m8, [pw_2048]
vpbroadcastd xm12, [pw_4096]
REPX {vpermq x, x, q2031}, m0, m1, m2, m3
REPX {vpermq x, x, q3120}, m4, m5, m6, m7
psubw m12, m8
jmp m(idct_8x16_internal_10bpc).end
ALIGN function_align
.pass1_main:
pmulld m0, m14, [cq+32* 0]
pmulld m7, m14, [cq+32*14]
pmulld m1, m14, [cq+32* 2]
pmulld m6, m14, [cq+32*12]
pmulld m2, m14, [cq+32* 4]
pmulld m5, m14, [cq+32*10]
pmulld m3, m14, [cq+32* 6]
pmulld m4, m14, [cq+32* 8]
REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
jmp m(iadst_8x8_internal_10bpc).main2

INV_TXFM_8X16_FN flipadst, dct
INV_TXFM_8X16_FN flipadst, adst
INV_TXFM_8X16_FN flipadst, flipadst
INV_TXFM_8X16_FN flipadst, identity, 35

cglobal iflipadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
%undef cmp
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
.pass1:
vpbroadcastd m14, [pd_2896]
vpbroadcastd m11, [pd_2048]
cmp eobd, 43
jl .fast
add cq, 32
call m(iadst_8x16_internal_10bpc).pass1_main
call m(iflipadst_8x8_internal_10bpc).main_end
sub cq, 32
mova [cq+32* 1], m0
mova [cq+32* 3], m1
mova [cq+32* 5], m2
mova [cq+32* 7], m3
mova [cq+32* 9], m4
mova [cq+32*11], m5
mova [cq+32*13], m6
mova m15, m7
call m(iadst_8x16_internal_10bpc).pass1_main
call m(iflipadst_8x8_internal_10bpc).main_end
mova m8, [cq+32* 1]
mova m9, [cq+32* 3]
mova m10, [cq+32* 5]
mova m11, [cq+32* 7]
mova m12, [cq+32* 9]
mova m13, [cq+32*11]
mova m14, [cq+32*13]
jmp tx2q
.fast:
call m(iadst_8x16_internal_10bpc).pass1_main
call m(iflipadst_8x8_internal_10bpc).main_end
pxor m8, m8
REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
jmp tx2q
.pass2:
call m(idct_8x16_internal_10bpc).transpose
call m(iadst_8x16_internal_8bpc).main
call m(iadst_8x16_internal_8bpc).main_pass2_end
vpbroadcastd m12, [pw_2048]
vpbroadcastd xm13, [pw_4096]
mova m11, m0
vpermq m0, m7, q2031
mova m10, m1
vpermq m1, m6, q2031
mova m9, m2
vpermq m2, m5, q2031
mova m8, m3
vpermq m3, m4, q2031
vpermq m4, m8, q3120
vpermq m5, m9, q3120
vpermq m6, m10, q3120
vpermq m7, m11, q3120
psubw m12, m13
jmp m(idct_8x16_internal_10bpc).end

INV_TXFM_8X16_FN identity, dct
INV_TXFM_8X16_FN identity, adst
INV_TXFM_8X16_FN identity, flipadst
INV_TXFM_8X16_FN identity, identity

%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384]
pmulhrsw m%2, m%3, m%1
%if %0 == 4 ; if downshifting by 1
%ifnum %4
pmulhrsw m%2, m%4
%else ; without rounding
psraw m%2, 1
%endif
%else
paddsw m%1, m%1
%endif
paddsw m%1, m%2
%endmacro

cglobal iidentity_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
.pass1:
vpbroadcastd m15, [pd_2896]
pmulld m0, m15, [cq+32* 0]
pmulld m8, m15, [cq+32* 1]
pmulld m1, m15, [cq+32* 2]
pmulld m9, m15, [cq+32* 3]
pmulld m2, m15, [cq+32* 4]
pmulld m10, m15, [cq+32* 5]
pmulld m3, m15, [cq+32* 6]
pmulld m11, m15, [cq+32* 7]
pmulld m4, m15, [cq+32* 8]
pmulld m12, m15, [cq+32* 9]
pmulld m5, m15, [cq+32*10]
pmulld m13, m15, [cq+32*11]
pmulld m6, m15, [cq+32*12]
pmulld m14, m15, [cq+32*13]
pmulld m7, m15, [cq+32*14]
pmulld m15, [cq+32*15]
mova [cq], m7
vpbroadcastd m7, [pd_2048]
REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6, \
m8, m9, m10, m11, m12, m13, m14, m15
paddd m7, [cq]
REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
jmp tx2q
.pass2:
packssdw m0, m8
packssdw m1, m9
packssdw m2, m10
packssdw m3, m11
packssdw m4, m12
packssdw m5, m13
packssdw m6, m14
packssdw m13, m7, m15
vpbroadcastd m8, [pw_1697x16]
REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 13
vpbroadcastd m7, [pixel_10bpc_max]
vpbroadcastd m12, [pw_2048]
call .pass2_end
RET
ALIGN function_align
.pass2_end:
punpckhwd m9, m0, m1
punpcklwd m0, m1
punpckhwd m1, m6, m13
punpcklwd m6, m13
punpckhwd m13, m4, m5
punpcklwd m4, m5
punpcklwd m5, m2, m3
punpckhwd m2, m3
punpckhdq m3, m0, m5
punpckldq m0, m5
punpckhdq m11, m9, m2
punpckldq m9, m2
punpckldq m2, m4, m6
punpckhdq m4, m6
punpckldq m6, m13, m1
punpckhdq m13, m1
punpckhqdq m1, m0, m2
punpcklqdq m0, m2
punpcklqdq m2, m3, m4
punpckhqdq m3, m4
punpcklqdq m8, m9, m6
punpckhqdq m9, m6
punpcklqdq m10, m11, m13
punpckhqdq m11, m13
pmulhrsw m0, m12
pmulhrsw m1, m12
call m(iidentity_8x8_internal_10bpc).write_2x8x2_start
pmulhrsw m0, m12, m2
pmulhrsw m1, m12, m3
call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero
pmulhrsw m0, m12, m8
pmulhrsw m1, m12, m9
lea dstq, [dstq+strideq*4]
call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero
pmulhrsw m0, m12, m10
pmulhrsw m1, m12, m11
call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero
ret

INV_TXFM_8X16_FN dct, dct, 0, 12
INV_TXFM_8X16_FN dct, identity, 35, 12
INV_TXFM_8X16_FN dct, adst, 0, 12
INV_TXFM_8X16_FN dct, flipadst, 0, 12

cglobal idct_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
vpbroadcastd m12, [clip_20b_min]
vpbroadcastd m13, [clip_20b_max]
jmp m(idct_8x16_internal_10bpc).pass1
.pass2:
lea r6, [rsp+32*4]
call .transpose
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
mova [cq+32* 8], m0
mova [cq+32*10], m2
mova [cq+32*12], m4
mova [cq+32*14], m6
pmaxsd m0, m12, [cq+32* 1]
pmaxsd m4, m12, m1
pmaxsd m1, m12, [cq+32* 3]
pmaxsd m2, m12, [cq+32* 5]
pmaxsd m6, m12, m5
pmaxsd m5, m12, m3
pmaxsd m3, m12, [cq+32* 7]
pmaxsd m7, m12
REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
vpbroadcastd m11, [pd_2048]
vpbroadcastd m14, [pd_2896]
call m(idct_8x16_internal_10bpc).main_oddhalf
pmaxsd m0, m12, [cq+32* 0]
pmaxsd m1, m12, [cq+32* 2]
pmaxsd m2, m12, [cq+32* 4]
pmaxsd m3, m12, [cq+32* 6]
pmaxsd m4, m12, [cq+32* 8]
pmaxsd m5, m12, [cq+32*10]
pmaxsd m6, m12, [cq+32*12]
pmaxsd m7, m12, [cq+32*14]
REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
call m(idct_8x8_internal_10bpc).main
call m(idct_8x16_internal_10bpc).main_evenhalf
vpbroadcastd m11, [pd_8]
REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
call m(idct_16x8_internal_10bpc).pass1_rotations
REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
.end:
packssdw m0, m1
packssdw m1, m2, m3
packssdw m2, m4, m5
packssdw m3, m6, m7
packssdw m4, m8, m9
packssdw m5, m10, m11
packssdw m6, m12, m13
packssdw m7, m14, m15
vpermq m0, m0, q3120
vpermq m1, m1, q3120
call m(idct_8x8_internal_12bpc).write_8x4_start
call m(idct_8x8_internal_10bpc).write_8x4
vpermq m0, m2, q3120
vpermq m1, m3, q3120
call m(idct_8x8_internal_10bpc).write_8x4
vpermq m0, m4, q3120
vpermq m1, m5, q3120
call m(idct_8x8_internal_10bpc).write_8x4
vpermq m0, m6, q3120
vpermq m1, m7, q3120
call m(idct_8x8_internal_10bpc).write_8x4
RET
ALIGN function_align
.transpose:
mova [cq+32* 8], m8
mova [cq+32* 9], m9
mova [cq+32*10], m10
mova [cq+32*11], m11
call m(idct_8x8_internal_12bpc).transpose_8x8
mova [cq+32* 0], m0
mova [cq+32* 1], m1
mova [cq+32* 2], m2
mova [cq+32* 3], m3
mova [cq+32* 4], m4
mova [cq+32* 5], m5
mova [cq+32* 6], m6
mova [cq+32* 7], m7
mova m0, [cq+32* 8]
mova m1, [cq+32* 9]
mova m2, [cq+32*10]
mova m3, [cq+32*11]
mova m4, m12
mova m5, m13
mova m6, m14
mova m7, m15
jmp m(idct_8x8_internal_12bpc).transpose_8x8

INV_TXFM_8X16_FN adst, dct, 0, 12
INV_TXFM_8X16_FN adst, adst, 0, 12
INV_TXFM_8X16_FN adst, flipadst, 0, 12
INV_TXFM_8X16_FN adst, identity, 35, 12

cglobal iadst_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
vpbroadcastd m12, [clip_20b_min]
vpbroadcastd m13, [clip_20b_max]
jmp m(iadst_8x16_internal_10bpc).pass1
.pass2:
lea r6, [rsp+32*4]
call .pass2_main
call m(iadst_16x8_internal_10bpc).pass1_rotations
.pass2_end:
REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15
REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11
jmp m(idct_8x16_internal_12bpc).end
ALIGN function_align
.pass2_main:
call m(idct_8x16_internal_12bpc).transpose
vpbroadcastd m13, [clip_18b_min]
vpbroadcastd m14, [clip_18b_max]
mova [cq+32* 8], m0
mova [cq+32*11], m3
mova [cq+32*12], m4
mova [cq+32*15], m7
pmaxsd m0, m13, [cq+32* 2] ; 2
pmaxsd m3, m13, m1 ; 9
pmaxsd m1, m13, m5 ; 13
pmaxsd m4, m13, m2 ; 10
pmaxsd m2, m13, [cq+32* 6] ; 6
pmaxsd m5, m13, [cq+32* 5] ; 5
pmaxsd m6, m13, m6 ; 14
pmaxsd m7, m13, [cq+32* 1] ; 1
REPX {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7
vpbroadcastd m12, [pd_2048]
vpbroadcastd m15, [pd_2896]
call m(iadst_16x8_internal_10bpc).main_part1
pmaxsd m0, m13, [cq+32* 0] ; 0
pmaxsd m1, m13, [cq+32*15] ; 15
pmaxsd m2, m13, [cq+32* 4] ; 4
pmaxsd m3, m13, [cq+32*11] ; 11
pmaxsd m4, m13, [cq+32* 8] ; 8
pmaxsd m5, m13, [cq+32* 7] ; 7
pmaxsd m6, m13, [cq+32*12] ; 12
pmaxsd m7, m13, [cq+32* 3] ; 3
REPX {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7
call m(iadst_16x8_internal_10bpc).main_part2
vpbroadcastd m14, [pd_17408]
psrld m15, 11 ; pd_1
psubd m13, m14, m15 ; pd_17407
pslld m15, 3 ; pd_8
ret

INV_TXFM_8X16_FN flipadst, dct, 0, 12
INV_TXFM_8X16_FN flipadst, adst, 0, 12
INV_TXFM_8X16_FN flipadst, flipadst, 0, 12
INV_TXFM_8X16_FN flipadst, identity, 35, 12

cglobal iflipadst_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
vpbroadcastd m12, [clip_20b_min]
vpbroadcastd m13, [clip_20b_max]
jmp m(iflipadst_8x16_internal_10bpc).pass1
.pass2:
lea r6, [rsp+32*4]
call m(iadst_8x16_internal_12bpc).pass2_main
call m(iflipadst_16x8_internal_10bpc).pass1_rotations
jmp m(iadst_8x16_internal_12bpc).pass2_end

INV_TXFM_8X16_FN identity, dct, 0, 12
INV_TXFM_8X16_FN identity, adst, 0, 12
INV_TXFM_8X16_FN identity, flipadst, 0, 12
INV_TXFM_8X16_FN identity, identity, 0, 12

cglobal iidentity_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
jmp m(iidentity_8x16_internal_10bpc).pass1
.pass2:
call .pass2_main
packssdw m0, m8
packssdw m1, m9
packssdw m2, m10
packssdw m3, m11
packssdw m4, m12
packssdw m5, m13
packssdw m6, m14
packssdw m13, m7, m15
vpbroadcastd m7, [pixel_12bpc_max]
vpbroadcastd m12, [pw_16384]
call m(iidentity_8x16_internal_10bpc).pass2_end
RET
ALIGN function_align
.pass2_main:
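; all 16 ymm registers hold data here, so one row at a time is
; spilled to the coefficient buffer while each constant is
; broadcast, then reloaded and processed separately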
|
|
mova [cq], m7
|
|
vpbroadcastd m7, [clip_18b_min]
|
|
REPX {pmaxsd x, m7}, m0, m1, m2, m3, m4, m5, m6, \
|
|
m8, m9, m10, m11, m12, m13, m14, m15
|
|
pmaxsd m7, [cq]
|
|
mova [cq], m15
|
|
vpbroadcastd m15, [clip_18b_max]
|
|
REPX {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
|
|
m8, m9, m10, m11, m12, m13, m14
|
|
pminsd m15, [cq]
|
|
mova [cq], m7
|
|
vpbroadcastd m7, [pd_5793]
|
|
REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6, \
|
|
m8, m9, m10, m11, m12, m13, m14, m15
|
|
pmulld m7, [cq]
|
|
mova [cq], m15
|
|
vpbroadcastd m15, [pd_1024]
|
|
REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
|
|
m8, m9, m10, m11, m12, m13, m14
|
|
paddd m15, [cq]
|
|
REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7, \
|
|
m8, m9, m10, m11, m12, m13, m14, m15
|
|
ret
|
|
|
|
%macro INV_TXFM_16X4_FN 2-3 10 ; type1, type2, bitdepth
|
|
INV_TXFM_FN %1, %2, 0, 16x4, %3
|
|
%ifidn %1_%2, dct_dct
|
|
vpbroadcastd m3, [dconly_%3bpc]
|
|
%if %3 = 10
|
|
.dconly:
|
|
imul r6d, [cq], 181
|
|
mov [cq], eobd ; 0
|
|
or r3d, 4
|
|
.dconly2:
|
|
add r6d, 384
|
|
sar r6d, 9
|
|
.dconly3:
|
|
imul r6d, 181
|
|
add r6d, 2176
|
|
sar r6d, 12
|
|
movd xm0, r6d
|
|
paddsw xm0, xm3
|
|
vpbroadcastw m0, xm0
|
|
.dconly_loop:
|
|
paddsw m1, m0, [dstq+strideq*0]
|
|
paddsw m2, m0, [dstq+strideq*1]
|
|
psubusw m1, m3
|
|
psubusw m2, m3
|
|
mova [dstq+strideq*0], m1
|
|
mova [dstq+strideq*1], m2
|
|
lea dstq, [dstq+strideq*2]
|
|
sub r3d, 2
|
|
jg .dconly_loop
|
|
RET
|
|
%else
|
|
jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly
|
|
%endif
|
|
%endif
|
|
%endmacro
|
|
|
|
INV_TXFM_16X4_FN dct, dct
|
|
INV_TXFM_16X4_FN dct, identity
|
|
INV_TXFM_16X4_FN dct, adst
|
|
INV_TXFM_16X4_FN dct, flipadst
|
|
|
|
cglobal idct_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
|
|
vpbroadcastd m8, [clip_18b_min]
|
|
vpbroadcastd m9, [clip_18b_max]
|
|
.pass1:
|
|
vbroadcasti128 m0, [cq+16* 0]
|
|
vbroadcasti128 m4, [cq+16* 4]
|
|
vbroadcasti128 m1, [cq+16* 2]
|
|
vbroadcasti128 m7, [cq+16* 6]
|
|
vbroadcasti128 m5, [cq+16*10]
|
|
vbroadcasti128 m2, [cq+16* 8]
|
|
vbroadcasti128 m6, [cq+16*12]
|
|
vbroadcasti128 m3, [cq+16*14]
|
|
shufpd m0, m4, 0x0c ; 0 4
|
|
shufpd m1, m5, 0x0c ; 2 10
|
|
shufpd m2, m6, 0x0c ; 8 12
|
|
shufpd m3, m7, 0x0c ; 14 6
|
|
call .pass1_main
|
|
vbroadcasti128 m10, [cq+16* 1]
|
|
vbroadcasti128 m4, [cq+16* 5]
|
|
vbroadcasti128 m11, [cq+16*15]
|
|
vbroadcasti128 m5, [cq+16*11]
|
|
shufpd m10, m4, 0x0c ; 1 5
|
|
shufpd m11, m5, 0x0c ; 15 11
|
|
vbroadcasti128 m5, [cq+16* 9]
|
|
vbroadcasti128 m4, [cq+16*13]
|
|
shufpd m5, m4, 0x0c ; 9 13
|
|
vbroadcasti128 m6, [cq+16* 7]
|
|
vbroadcasti128 m4, [cq+16* 3]
|
|
shufpd m6, m4, 0x0c ; 7 3
|
|
call .pass1_main2
|
|
pcmpeqd m4, m4
|
|
REPX {psubd x, m4}, m0, m1, m2, m3
|
|
call .pass1_main3
|
|
REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7
|
|
jmp tx2q
|
|
.pass2:
|
|
call .transpose_4x16_packed
|
|
lea r6, [deint_shuf+128]
|
|
call m(idct_16x4_internal_8bpc).main
|
|
.end:
|
|
vpbroadcastd m4, [pw_2048]
|
|
REPX {pmulhrsw x, m4}, m0, m1, m2, m3
|
|
vpbroadcastd m5, [pixel_10bpc_max]
|
|
.end2:
|
|
paddw m0, [dstq+strideq*0]
|
|
paddw m1, [dstq+strideq*1]
|
|
.end3:
|
|
lea r6, [dstq+strideq*2]
|
|
paddw m2, [r6 +strideq*0]
|
|
paddw m3, [r6 +strideq*1]
|
|
pxor m4, m4
|
|
REPX {mova [cq+32*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7
|
|
REPX {pmaxsw x, m4}, m0, m1, m2, m3
|
|
REPX {pminsw x, m5}, m0, m1, m2, m3
|
|
mova [dstq+strideq*0], m0
|
|
mova [dstq+strideq*1], m1
|
|
mova [r6 +strideq*0], m2
|
|
mova [r6 +strideq*1], m3
|
|
RET
|
|
ALIGN function_align
|
|
.pass1_main:
|
|
vpbroadcastd m7, [pd_2048]
|
|
call m(idct_8x4_internal_10bpc).main
|
|
psubd m3, m0, m4 ; idct8 out7 out6
|
|
paddd m0, m4 ; idct8 out0 out1
|
|
paddd m1, m2, m5 ; idct8 out3 out2
|
|
psubd m2, m5 ; idct8 out4 out5
|
|
ret
|
|
ALIGN function_align
|
|
.pass1_main2:
|
|
ITX_MULSUB_2D 10, 11, 4, 12, 13, 7, 401_1931, 4076_3612, 1
|
|
ITX_MULSUB_2D 5, 6, 4, 12, 13, 7, 3166_3920, 2598_1189, 1
|
|
vbroadcasti128 m12, [pd_3784_m3784]
|
|
psubd m4, m10, m5
|
|
paddd m10, m5 ; t8 t11
|
|
psignd m4, m12 ; t9 t10
|
|
psubd m5, m11, m6
|
|
paddd m11, m6 ; t15 t12
|
|
psignd m5, m12 ; t14 t13
|
|
vpbroadcastd m6, [pd_1567]
|
|
vpbroadcastd m13, [pd_3784]
|
|
REPX {pmaxsd x, m8}, m5, m4
|
|
REPX {pminsd x, m9}, m5, m4
|
|
pmulld m12, m5
|
|
pmulld m5, m6
|
|
vbroadcasti128 m6, [pd_1567_m1567]
|
|
pmulld m13, m4
|
|
pmulld m4, m6
|
|
REPX {pmaxsd x, m8}, m10, m11, m0, m1
|
|
REPX {pminsd x, m9}, m10, m11, m0, m1
|
|
paddd m12, m7
|
|
paddd m5, m7
|
|
paddd m4, m12
|
|
psubd m5, m13
|
|
psrad m4, 12 ; t14a t10a
|
|
psrad m5, 12 ; t9a t13a
|
|
vpbroadcastd m12, [pd_2896]
|
|
punpckhqdq m6, m11, m5
|
|
punpcklqdq m11, m4
|
|
punpckhqdq m4, m10, m4
|
|
punpcklqdq m10, m5
|
|
psubd m5, m11, m6 ; t12a t13
|
|
paddd m11, m6 ; t15a t14
|
|
psubd m6, m10, m4 ; t11a t10
|
|
paddd m10, m4 ; t8a t9
|
|
REPX {pmaxsd x, m8}, m5, m6
|
|
REPX {pminsd x, m9}, m5, m6
|
|
pmulld m5, m12
|
|
pmulld m6, m12
|
|
REPX {pmaxsd x, m8}, m2, m3, m11, m10
|
|
REPX {pminsd x, m9}, m2, m3, m11, m10
|
|
ret
|
|
ALIGN function_align
|
|
.pass1_main3:
|
|
paddd m5, m7
|
|
psubd m4, m5, m6
|
|
paddd m5, m6
|
|
psrad m4, 12 ; t11 t10a
|
|
psrad m5, 12 ; t12 t13a
|
|
psubd m7, m0, m11 ; out15 out14
|
|
paddd m0, m11 ; out0 out1
|
|
psubd m6, m1, m5 ; out12 out13
|
|
paddd m1, m5 ; out3 out2
|
|
psubd m5, m2, m4 ; out11 out10
|
|
paddd m2, m4 ; out4 out5
|
|
psubd m4, m3, m10 ; out8 out9
|
|
paddd m3, m10 ; out7 out6
|
|
REPX {pshufd x, x, q1032}, m1, m3, m5, m7
|
|
ret
|
|
ALIGN function_align
|
|
.transpose_4x16_packed:
|
|
vbroadcasti128 m8, [deint_shuf]
|
|
packssdw m0, m1
|
|
packssdw m2, m3
|
|
packssdw m4, m5
|
|
packssdw m6, m7
|
|
REPX {pshufb x, m8}, m0, m2, m4, m6
|
|
punpckhqdq m1, m0, m2
|
|
punpcklqdq m0, m2
|
|
punpckhqdq m2, m4, m6
|
|
punpcklqdq m4, m6
|
|
vperm2i128 m3, m1, m2, 0x31
|
|
vinserti128 m1, xm2, 1
|
|
vperm2i128 m2, m0, m4, 0x31
|
|
vinserti128 m0, xm4, 1
|
|
ret
|
|
|
|
INV_TXFM_16X4_FN adst, dct
|
|
INV_TXFM_16X4_FN adst, adst
|
|
INV_TXFM_16X4_FN adst, flipadst
|
|
INV_TXFM_16X4_FN adst, identity
|
|
|
|
cglobal iadst_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
|
|
vpbroadcastd m12, [clip_18b_min]
|
|
vpbroadcastd m13, [clip_18b_max]
|
|
.pass1:
|
|
call m(iadst_4x16_internal_10bpc).main
|
|
psrad m11, 11 ; pd_1
|
|
REPX {paddd x, m11}, m0, m1, m2, m3
|
|
paddd m4, m5, m11
|
|
paddd m5, m6, m11
|
|
paddd m6, m7, m11
|
|
paddd m7, m8, m11
|
|
.pass1_end:
|
|
REPX {pshufd x, x, q1032}, m0, m2, m4, m6
|
|
REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7
|
|
jmp tx2q
|
|
.pass2:
|
|
call m(idct_16x4_internal_10bpc).transpose_4x16_packed
|
|
lea r6, [deint_shuf+128]
|
|
call m(iadst_16x4_internal_8bpc).main
|
|
jmp m(idct_16x4_internal_10bpc).end
|
|
ALIGN function_align
|
|
.main:
|
|
vpbroadcastd m6, [pd_1321]
|
|
mova m0, [cq+32*0]
|
|
mova m1, [cq+32*1]
|
|
vpbroadcastd m7, [pd_2482]
|
|
mova m2, [cq+32*6]
|
|
mova m3, [cq+32*7]
|
|
pmulld m4, m0, m6
|
|
pmulld m5, m1, m6 ; 1321*in0
|
|
pmulld m9, m2, m7
|
|
pmulld m8, m3, m7 ; 2482*in3
|
|
paddd m4, m9
|
|
paddd m8, m5 ; 1321*in0 + 2482*in3
|
|
pmulld m5, m0, m7
|
|
pmulld m9, m1, m7 ; 2482*in0
|
|
paddd m0, m2
|
|
paddd m1, m3 ; in0 + in3
|
|
paddd m7, m6 ; pd_3803
|
|
pmulld m2, m7
|
|
pmulld m3, m7 ; 3803*in3
|
|
psubd m5, m2
|
|
psubd m9, m3 ; 2482*in0 - 3803*in3
|
|
mova m2, [cq+32*4]
|
|
pmulld m10, m7, m2
|
|
pmulld m3, m6, m2
|
|
psubd m2, m0
|
|
mova m0, [cq+32*5]
|
|
pmulld m7, m0 ; 3803*in2
|
|
pmulld m6, m0 ; 1321*in2
|
|
psubd m0, m1 ; in2 - in0 - in3
|
|
vpbroadcastd m1, [pd_m3344]
|
|
paddd m4, m10
|
|
paddd m7, m8 ; t0
|
|
psubd m5, m3
|
|
psubd m9, m6 ; t1
|
|
pmulld m2, m1
|
|
pmulld m0, m1 ; t2
|
|
pmulld m3, m1, [cq+32*2]
|
|
pmulld m1, [cq+32*3] ; -t3
|
|
ret
|
|
ALIGN function_align
|
|
.main_end:
|
|
; expects: m6 = rnd
|
|
paddd m5, m6
|
|
paddd m9, m6
|
|
paddd m10, m4, m5
|
|
paddd m4, m6
|
|
paddd m8, m7, m6
|
|
paddd m7, m9
|
|
psubd m4, m3 ; out0 (unshifted)
|
|
psubd m5, m3 ; out1 (unshifted)
|
|
paddd m2, m6 ; out2 (unshifted)
|
|
paddd m3, m10 ; out3 (unshifted)
|
|
psubd m8, m1 ; out4 (unshifted)
|
|
psubd m9, m1 ; out5 (unshifted)
|
|
paddd m6, m0 ; out6 (unshifted)
|
|
paddd m7, m1 ; out7 (unshifted)
|
|
ret
|
|
|
|
INV_TXFM_16X4_FN flipadst, dct
|
|
INV_TXFM_16X4_FN flipadst, adst
|
|
INV_TXFM_16X4_FN flipadst, flipadst
|
|
INV_TXFM_16X4_FN flipadst, identity
|
|
|
|
cglobal iflipadst_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
|
|
vpbroadcastd m12, [clip_18b_min]
|
|
vpbroadcastd m13, [clip_18b_max]
|
|
.pass1:
|
|
call m(iadst_4x16_internal_10bpc).main
|
|
psrad m11, 11 ; pd_1
|
|
paddd m4, m3, m11
|
|
paddd m3, m5, m11
|
|
paddd m5, m2, m11
|
|
paddd m2, m6, m11
|
|
paddd m6, m1, m11
|
|
paddd m1, m7, m11
|
|
paddd m7, m0, m11
|
|
paddd m0, m8, m11
|
|
jmp m(iadst_16x4_internal_10bpc).pass1_end
|
|
.pass2:
|
|
call m(idct_16x4_internal_10bpc).transpose_4x16_packed
|
|
lea r6, [deint_shuf+128]
|
|
call m(iadst_16x4_internal_8bpc).main
|
|
vpbroadcastd m4, [pw_2048]
|
|
pmulhrsw m5, m3, m4
|
|
pmulhrsw m6, m2, m4
|
|
pmulhrsw m2, m1, m4
|
|
pmulhrsw m3, m0, m4
|
|
paddw m0, m5, [dstq+strideq*0]
|
|
paddw m1, m6, [dstq+strideq*1]
|
|
vpbroadcastd m5, [pixel_10bpc_max]
|
|
jmp m(idct_16x4_internal_10bpc).end3
|
|
|
|
INV_TXFM_16X4_FN identity, dct
|
|
INV_TXFM_16X4_FN identity, adst
|
|
INV_TXFM_16X4_FN identity, flipadst
|
|
INV_TXFM_16X4_FN identity, identity
|
|
|
|
cglobal iidentity_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
|
|
vpbroadcastd m8, [pd_5793]
|
|
vpermq m0, [cq+32*0], q3120 ; 0 1
|
|
vpermq m1, [cq+32*1], q3120 ; 2 3
|
|
vpermq m2, [cq+32*2], q3120 ; 4 5
|
|
vpermq m3, [cq+32*3], q3120 ; 6 7
|
|
vpermq m4, [cq+32*4], q3120 ; 8 9
|
|
vpermq m5, [cq+32*5], q3120 ; a b
|
|
vpermq m6, [cq+32*6], q3120 ; c d
|
|
vpermq m7, [cq+32*7], q3120 ; e f
|
|
vpbroadcastd m9, [pd_3072]
|
|
REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
|
|
REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
|
|
REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7
|
|
jmp tx2q
|
|
.pass2:
|
|
call m(idct_16x4_internal_10bpc).transpose_4x16_packed
|
|
vpbroadcastd m7, [pw_1697x8]
|
|
pmulhrsw m4, m7, m0
|
|
pmulhrsw m5, m7, m1
|
|
pmulhrsw m6, m7, m2
|
|
pmulhrsw m7, m3
|
|
paddsw m0, m4
|
|
paddsw m1, m5
|
|
paddsw m2, m6
|
|
paddsw m3, m7
|
|
jmp m(idct_16x4_internal_10bpc).end
|
|
|
|
INV_TXFM_16X4_FN dct, dct, 12
|
|
INV_TXFM_16X4_FN dct, identity, 12
|
|
INV_TXFM_16X4_FN dct, adst, 12
|
|
INV_TXFM_16X4_FN dct, flipadst, 12
|
|
|
|
cglobal idct_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
|
|
vpbroadcastd m8, [clip_20b_min]
|
|
vpbroadcastd m9, [clip_20b_max]
|
|
jmp m(idct_16x4_internal_10bpc).pass1
|
|
.pass2:
|
|
vpbroadcastd m12, [clip_18b_min]
|
|
vpbroadcastd m13, [clip_18b_max]
|
|
REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
|
|
REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
|
|
; deinterleave
|
|
REPX {pshufd x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7
|
|
; transpose
|
|
punpcklqdq m8, m0, m1
|
|
punpckhqdq m0, m1
|
|
punpcklqdq m9, m2, m3
|
|
punpckhqdq m2, m3
|
|
punpcklqdq m10, m4, m5
|
|
punpckhqdq m4, m5
|
|
punpcklqdq m11, m6, m7
|
|
punpckhqdq m6, m7
|
|
vperm2i128 m3, m0, m2, 0x31 ; out6
|
|
vperm2i128 m1, m0, m2, 0x20 ; out2
|
|
vperm2i128 m7, m4, m6, 0x31 ; out7
|
|
vperm2i128 m5, m4, m6, 0x20 ; out3
|
|
vperm2i128 m13, m10, m11, 0x31 ; out5
|
|
vperm2i128 m12, m10, m11, 0x20 ; out1
|
|
vperm2i128 m11, m8, m9, 0x31 ; out4
|
|
vperm2i128 m10, m8, m9, 0x20 ; out0
|
|
call m(idct_4x16_internal_10bpc).pass1_main
|
|
pmulld m0, m6, m10
|
|
pmulld m2, m6, m11
|
|
pmulld m4, m6, m12
|
|
pmulld m6, m13
|
|
vpbroadcastd m10, [pd_17408]
|
|
call m(idct_4x16_internal_10bpc).pass1_main2
|
|
REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7
|
|
packssdw m0, m4
|
|
packssdw m1, m5
|
|
packssdw m2, m6
|
|
packssdw m3, m7
|
|
vpbroadcastd m5, [pixel_12bpc_max]
|
|
REPX {vpermq x, x, q3120}, m0, m1, m2, m3
|
|
jmp m(idct_16x4_internal_10bpc).end2
|
|
|
|
INV_TXFM_16X4_FN adst, dct, 12
|
|
INV_TXFM_16X4_FN adst, adst, 12
|
|
INV_TXFM_16X4_FN adst, flipadst, 12
|
|
INV_TXFM_16X4_FN adst, identity, 12
|
|
|
|
cglobal iadst_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
|
|
vpbroadcastd m12, [clip_20b_min]
|
|
vpbroadcastd m13, [clip_20b_max]
|
|
jmp m(iadst_16x4_internal_10bpc).pass1
|
|
.pass2:
|
|
call .pass2_main
|
|
REPX {vpermq x, x, q3120}, m0, m1, m2, m3
|
|
REPX {pmulhrsw x, m4}, m0, m1, m2, m3
|
|
jmp m(idct_16x4_internal_10bpc).end2
|
|
ALIGN function_align
|
|
.pass2_main:
|
|
vpbroadcastd m12, [clip_18b_min]
|
|
vpbroadcastd m13, [clip_18b_max]
|
|
REPX {pmaxsd x, m12}, m0, m1, m2, m3, m6, m7
|
|
pmaxsd m8, m4, m12
|
|
pmaxsd m9, m5, m12
|
|
REPX {pminsd x, m13}, m0, m1, m2, m3
|
|
call m(iadst_8x4_internal_12bpc).transpose_4x8
|
|
mova [cq+32*0], m0
|
|
mova [cq+32*2], m1
|
|
mova [cq+32*4], m2
|
|
mova [cq+32*6], m3
|
|
pminsd m0, m8, m13
|
|
pminsd m1, m9, m13
|
|
pminsd m2, m6, m13
|
|
pminsd m3, m7, m13
|
|
call m(iadst_8x4_internal_12bpc).transpose_4x8
|
|
mova [cq+32*1], m0
|
|
mova [cq+32*3], m1
|
|
mova [cq+32*5], m2
|
|
mova [cq+32*7], m3
|
|
call m(iadst_16x4_internal_10bpc).main
|
|
vpbroadcastd m6, [pd_2048]
|
|
call m(iadst_16x4_internal_10bpc).main_end
|
|
psrad m0, m4, 15
|
|
psrad m1, m5, 15
|
|
psrad m2, 15
|
|
psrad m3, 15
|
|
psrad m4, m8, 15
|
|
psrad m5, m9, 15
|
|
psrad m6, 15
|
|
psrad m7, 15
|
|
packssdw m0, m4
|
|
packssdw m1, m5
|
|
packssdw m2, m6
|
|
packssdw m3, m7
|
|
vpbroadcastd m4, [pw_16384]
|
|
vpbroadcastd m5, [pixel_12bpc_max]
|
|
ret

INV_TXFM_16X4_FN flipadst, dct, 12
INV_TXFM_16X4_FN flipadst, adst, 12
INV_TXFM_16X4_FN flipadst, flipadst, 12
INV_TXFM_16X4_FN flipadst, identity, 12

cglobal iflipadst_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
vpbroadcastd m12, [clip_20b_min]
vpbroadcastd m13, [clip_20b_max]
jmp m(iflipadst_16x4_internal_10bpc).pass1
.pass2:
call m(iadst_16x4_internal_12bpc).pass2_main
vpermq m7, m0, q3120
vpermq m6, m1, q3120
vpermq m1, m2, q3120
vpermq m0, m3, q3120
pmulhrsw m0, m4
pmulhrsw m1, m4
pmulhrsw m2, m6, m4
pmulhrsw m3, m7, m4
jmp m(idct_16x4_internal_10bpc).end2

INV_TXFM_16X4_FN identity, dct, 12
INV_TXFM_16X4_FN identity, adst, 12
INV_TXFM_16X4_FN identity, flipadst, 12
INV_TXFM_16X4_FN identity, identity, 12

cglobal iidentity_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
vpbroadcastd m8, [pd_1697]
vpermq m0, [cq+32*0], q3120 ; 0 1
vpermq m1, [cq+32*1], q3120 ; 2 3
vpermq m2, [cq+32*2], q3120 ; 4 5
vpermq m3, [cq+32*3], q3120 ; 6 7
vpbroadcastd m9, [pd_3072]
pmulld m4, m8, m0
pmulld m5, m8, m1
pmulld m6, m8, m2
pmulld m7, m8, m3
vpermq m10, [cq+32*4], q3120 ; 8 9
vpermq m11, [cq+32*5], q3120 ; a b
vpermq m12, [cq+32*6], q3120 ; c d
vpermq m13, [cq+32*7], q3120 ; e f
REPX {paddd x, m9}, m4, m5, m6, m7
REPX {psrad x, 12}, m4, m5, m6, m7
paddd m0, m4
pmulld m4, m8, m10
paddd m1, m5
pmulld m5, m8, m11
paddd m2, m6
pmulld m6, m8, m12
paddd m3, m7
pmulld m7, m8, m13
REPX {paddd x, m9}, m4, m5, m6, m7
REPX {psrad x, 12}, m4, m5, m6, m7
paddd m4, m10
paddd m5, m11
paddd m6, m12
paddd m7, m13
jmp tx2q
.pass2:
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
vpbroadcastd m8, [pd_5793]
vpbroadcastd m9, [pd_2048]
REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {psrad x, 15}, m0, m1, m2, m3, m4, m5, m6, m7
call m(idct_16x4_internal_10bpc).transpose_4x16_packed
vpbroadcastd m4, [pw_16384]
REPX {pmulhrsw x, m4}, m0, m1, m2, m3
vpbroadcastd m5, [pixel_12bpc_max]
jmp m(idct_16x4_internal_10bpc).end2

%macro INV_TXFM_16X8_FN 2-3 10 ; type1, type2, bitdepth
INV_TXFM_FN %1, %2, 0, 16x8, %3
%ifidn %1_%2, dct_dct
imul r6d, [cq], 181
vpbroadcastd m3, [dconly_%3bpc]
mov [cq], eobd ; 0
or r3d, 8
add r6d, 128
sar r6d, 8
imul r6d, 181
jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2
%endif
%endmacro
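
; A sketch of the dc-only arithmetic in the macro above: 181/256 ~= 1/sqrt(2),
; so "imul 181; add 128; sar 8" computes round(dc/sqrt(2)) for the 16x8
; rectangular scale, and the second "imul 181" pre-scales by sqrt(2)*128 for
; the shared tail (assumption: .dconly2 in the 16x4 code performs the final
; rounding shift and store loop).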

INV_TXFM_16X8_FN dct, dct
INV_TXFM_16X8_FN dct, identity
INV_TXFM_16X8_FN dct, adst
INV_TXFM_16X8_FN dct, flipadst

cglobal idct_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
.pass1:
vpbroadcastd m14, [pd_2896]
pmulld m0, m14, [cq+32* 1]
pmulld m1, m14, [cq+32* 3]
pmulld m2, m14, [cq+32* 5]
pmulld m3, m14, [cq+32* 7]
pmulld m4, m14, [cq+32* 9]
pmulld m5, m14, [cq+32*11]
pmulld m6, m14, [cq+32*13]
pmulld m7, m14, [cq+32*15]
vpbroadcastd m11, [pd_2048]
lea r6, [rsp+32*4]
call m(idct_8x16_internal_10bpc).main_oddhalf_rect2
pmulld m0, m14, [cq+32* 0]
pmulld m1, m14, [cq+32* 2]
pmulld m2, m14, [cq+32* 4]
pmulld m3, m14, [cq+32* 6]
pmulld m4, m14, [cq+32* 8]
pmulld m5, m14, [cq+32*10]
pmulld m6, m14, [cq+32*12]
pmulld m7, m14, [cq+32*14]
call m(idct_8x8_internal_10bpc).main_rect2
call m(idct_8x16_internal_10bpc).main_evenhalf
psrld m11, 11 ; pd_1
REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
call .pass1_rotations
REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
jmp tx2q
.pass2:
call .transpose
call m(idct_16x8_internal_8bpc).main
vpbroadcastd m10, [pw_2048]
.end:
pmulhrsw m0, m10
pmulhrsw m1, m10
pmulhrsw m2, m10
pmulhrsw m3, m10
call .write_16x4_start
.end2:
pmulhrsw m0, m4, m10
pmulhrsw m1, m5, m10
pmulhrsw m2, m6, m10
pmulhrsw m3, m7, m10
call .write_16x4_zero
RET
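
; .pass1_rotations below is the final idct16 butterfly: the even half sits
; in m0-m7 and the odd half was spilled around r6, and each pair combines
; as out(n)/out(15-n) = evenhalf(n) +/- the matching odd-half term, as the
; per-line comments note.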
ALIGN function_align
.pass1_rotations:
mova m14, [r6-32*4]
mova m13, [r6-32*3]
mova m12, [r6-32*2]
mova m11, [r6-32*1]
mova m10, [r6+32*0]
mova m9, [r6+32*1]
mova m8, [r6+32*2]
psubd m15, m0, m14 ; out15
paddd m0, m14 ; out0
psubd m14, m1, m13 ; out14
paddd m1, m13 ; out1
psubd m13, m2, m12 ; out13
paddd m2, m12 ; out2
psubd m12, m3, m11 ; out12
paddd m3, m11 ; out3
psubd m11, m4, m10 ; out11
paddd m4, m10 ; out4
psubd m10, m5, m9 ; out10
paddd m5, m9 ; out5
psubd m9, m6, m8 ; out9
paddd m6, m8 ; out6
psubd m8, m7, [r6+32*3] ; out8
paddd m7, [r6+32*3] ; out7
ret
ALIGN function_align
.transpose:
lea r6, [deint_shuf+128]
.transpose2:
packssdw m0, m8
packssdw m1, m9
packssdw m2, m10
packssdw m3, m11
packssdw m4, m12
packssdw m5, m13
packssdw m6, m14
packssdw m7, m15
.transpose3:
punpckhwd m8, m0, m1
punpcklwd m0, m1
punpcklwd m1, m2, m3
punpckhwd m2, m3
punpckhwd m3, m4, m5
punpcklwd m4, m5
punpckhwd m5, m6, m7
punpcklwd m6, m7
punpckhdq m7, m4, m6
punpckldq m4, m6
punpckldq m6, m8, m2
punpckhdq m8, m2
punpckhdq m2, m0, m1
punpckldq m0, m1
punpckhdq m1, m3, m5
punpckldq m3, m5
punpcklqdq m5, m6, m3
punpckhqdq m6, m3
punpckhqdq m3, m2, m7
punpcklqdq m2, m7
punpcklqdq m7, m8, m1
punpckhqdq m8, m1
punpckhqdq m1, m0, m4
punpcklqdq m0, m4
vperm2i128 m4, m0, m5, 0x31
vinserti128 m0, xm5, 1
vperm2i128 m5, m1, m6, 0x31
vinserti128 m1, xm6, 1
vperm2i128 m6, m2, m7, 0x31
vinserti128 m2, xm7, 1
vperm2i128 m7, m3, m8, 0x31
vinserti128 m3, xm8, 1
ret
ALIGN function_align
.write_16x4_start:
vpbroadcastd m9, [pixel_10bpc_max]
lea r3, [strideq*3]
pxor m8, m8
.write_16x4_zero:
REPX {mova [cq+32*x], m8}, 0, 1, 2, 3, 4, 5, 6, 7
add cq, 32*8
.write_16x4:
paddw m0, [dstq+strideq*0]
paddw m1, [dstq+strideq*1]
paddw m2, [dstq+strideq*2]
paddw m3, [dstq+r3 ]
REPX {pmaxsw x, m8}, m0, m1, m2, m3
REPX {pminsw x, m9}, m0, m1, m2, m3
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
mova [dstq+strideq*2], m2
mova [dstq+r3 ], m3
lea dstq, [dstq+strideq*4]
ret
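
; Shared store tail: .write_16x4_zero clears the eight consumed coefficient
; rows, then .write_16x4 adds the residual to four rows of 10 bpc pixels,
; clamps to [0, pixel_10bpc_max] (m8 = 0, m9 = max) and advances dstq by
; four rows.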

INV_TXFM_16X8_FN adst, dct
INV_TXFM_16X8_FN adst, adst
INV_TXFM_16X8_FN adst, flipadst
INV_TXFM_16X8_FN adst, identity

cglobal iadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
vpbroadcastd m13, [clip_18b_min]
vpbroadcastd m14, [clip_18b_max]
.pass1:
lea r6, [rsp+32*4]
call .main
vpbroadcastd m14, [pd_3072]
psrld m15, 11 ; pd_1
psubd m13, m14, m15 ; pd_3071
call .pass1_rotations
.pass1_end:
REPX {psrad x, 1 }, m0, m1, m2, m3, m12, m13, m14, m15
REPX {psrad x, 12}, m4, m5, m6, m7, m8, m9, m10, m11
jmp tx2q
.pass2:
call m(idct_16x8_internal_10bpc).transpose
call m(iadst_16x8_internal_8bpc).main
call m(iadst_16x8_internal_8bpc).main_pass2_end
vpbroadcastd m10, [pw_2048]
pxor m11, m11
psubw m11, m10
pmulhrsw m0, m10
pmulhrsw m1, m11
pmulhrsw m2, m10
pmulhrsw m3, m11
call m(idct_16x8_internal_10bpc).write_16x4_start
pmulhrsw m0, m4, m10
pmulhrsw m1, m5, m11
pmulhrsw m2, m6, m10
pmulhrsw m3, m7, m11
call m(idct_16x8_internal_10bpc).write_16x4_zero
RET
ALIGN function_align
.pass1_rotations:
paddd m0, m15
psubd m1, m15, m1
paddd m2, m15
psubd m3, m15, m3
paddd m4, m14
psubd m5, m13, m5
paddd m6, m14
psubd m7, m13, m7
paddd m8, m14, m9
psubd m9, m13, m10
paddd m10, m14, m11
psubd m11, m13, m12
paddd m12, m15, [r6-32*1]
psubd m13, m15, [r6-32*2]
paddd m14, m15, [r6-32*3]
psubd m15, [r6-32*4]
ret
ALIGN function_align
.main:
; expects: m13 = clip_min, m14 = clip_max
vpbroadcastd m15, [pd_2896]
pmulld m0, m15, [cq+32* 2]
pmulld m1, m15, [cq+32*13]
pmulld m2, m15, [cq+32* 6]
pmulld m3, m15, [cq+32* 9]
pmulld m4, m15, [cq+32*10]
pmulld m5, m15, [cq+32* 5]
pmulld m6, m15, [cq+32*14]
pmulld m7, m15, [cq+32* 1]
vpbroadcastd m12, [pd_2048]
REPX {paddd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
call .main_part1
pmulld m0, m15, [cq+32* 0]
pmulld m1, m15, [cq+32*15]
pmulld m2, m15, [cq+32* 4]
pmulld m3, m15, [cq+32*11]
pmulld m4, m15, [cq+32* 8]
pmulld m5, m15, [cq+32* 7]
pmulld m6, m15, [cq+32*12]
pmulld m7, m15, [cq+32* 3]
REPX {paddd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
.main_part2:
ITX_MULSUB_2D 1, 0, 8, 9, 10, 12, 201, 4091
ITX_MULSUB_2D 3, 2, 8, 9, 10, 12, 1751, 3703
ITX_MULSUB_2D 5, 4, 8, 9, 10, 12, 3035, 2751
ITX_MULSUB_2D 7, 6, 8, 9, 10, 12, 3857, 1380
psubd m8, m0, m4 ; t8a
paddd m0, m4 ; t0a
psubd m4, m1, m5 ; t9a
paddd m1, m5 ; t1a
psubd m5, m2, m6 ; t12a
paddd m2, m6 ; t4a
psubd m6, m3, m7 ; t13a
paddd m7, m3 ; t5a
REPX {pmaxsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
REPX {pminsd x, m14}, m8, m4, m5, m6, m0, m1, m2, m7
vpbroadcastd m11, [pd_4017]
vpbroadcastd m10, [pd_799]
ITX_MULSUB_2D 8, 4, 3, 9, _, 12, 10, 11
ITX_MULSUB_2D 6, 5, 3, 9, _, 12, 11, 10
psubd m3, m0, m2 ; t4
paddd m0, m2 ; t0
psubd m2, m1, m7 ; t5
paddd m1, m7 ; t1
psubd m7, m4, m6 ; t12a
paddd m4, m6 ; t8a
psubd m6, m8, m5 ; t13a
paddd m5, m8 ; t9a
REPX {pmaxsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
REPX {pminsd x, m14}, m3, m2, m7, m6, m0, m1, m4, m5
vpbroadcastd m11, [pd_3784]
vpbroadcastd m10, [pd_1567]
ITX_MULSUB_2D 3, 2, 8, 9, _, 12, 10, 11
ITX_MULSUB_2D 7, 6, 8, 9, _, 12, 10, 11
pminsd m10, m14, [r6-32*4] ; t2
pminsd m8, m14, [r6-32*3] ; t3
psubd m9, m0, m10 ; t2a
paddd m0, m10 ; out0
psubd m10, m1, m8 ; t3a
paddd m1, m8 ; -out15
pmaxsd m9, m13
pmaxsd m10, m13
pminsd m9, m14
pminsd m10, m14
mova [r6-32*4], m1
mova m11, [r6-32*1] ; t7a
mova m1, [r6-32*2] ; t6a
psubd m8, m3, m11 ; t7
paddd m11, m3 ; out12
paddd m3, m2, m1 ; -out3
psubd m2, m1 ; t6
pmaxsd m8, m13
pmaxsd m2, m13
pminsd m8, m14
pminsd m2, m14
mova [r6-32*1], m11
mova [r6-32*3], m2
mova m1, [r6+32*3] ; t15
mova m2, [r6+32*2] ; t14
paddd m12, m7, m1 ; -out13
psubd m7, m1 ; t15a
psubd m11, m6, m2 ; t14a
paddd m2, m6 ; out2
pmaxsd m7, m13
pmaxsd m11, m13
pminsd m7, m14
pminsd m11, m14
mova [r6-32*2], m12
pminsd m1, m14, [r6+32*0] ; t10a
pminsd m12, m14, [r6+32*1] ; t11a
psubd m6, m4, m1 ; t10
paddd m1, m4 ; -out1
psubd m4, m5, m12 ; t11
paddd m5, m12 ; out14
vpbroadcastd m12, [pd_1448]
pmaxsd m6, m13
pmaxsd m4, m13
pminsd m6, m14
pminsd m4, m14
REPX {pmulld x, m12}, m9, m10, m8, m7, m11, m6, m4
pmulld m12, [r6-32*3] ; t6
mova [r6-32*3], m5
paddd m5, m11, m7 ; -out5 (unshifted)
psubd m11, m7 ; out10 (unshifted)
paddd m7, m9, m10 ; -out7 (unshifted)
psubd m9, m10 ; out8 (unshifted)
psubd m10, m6, m4 ; -out9 (unshifted)
paddd m6, m4 ; out6 (unshifted)
paddd m4, m12, m8 ; out4 (unshifted)
psubd m12, m8 ; -out11 (unshifted)
ret
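
; Outputs 4-11 above are returned "(unshifted)": they are still scaled by
; pd_1448 (1448/2048 ~= sqrt(2)/2) and only receive their psrad 12 in
; .pass1_end, after .pass1_rotations adds the fused rounding constants
; pd_3072 (= 1024 + 2048) and pd_3071 (= 3072 - 1, the matching constant
; for the negated outputs).
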
.main_part1:
ITX_MULSUB_2D 1, 0, 8, 9, 10, 12, 995, 3973
ITX_MULSUB_2D 3, 2, 8, 9, 10, 12, 2440, 3290
ITX_MULSUB_2D 5, 4, 8, 9, 10, 12, 3513, 2106
ITX_MULSUB_2D 7, 6, 8, 9, 10, 12, 4052, 601
psubd m8, m0, m4 ; t10a
paddd m0, m4 ; t2a
psubd m4, m1, m5 ; t11a
paddd m1, m5 ; t3a
psubd m5, m2, m6 ; t14a
paddd m2, m6 ; t6a
psubd m6, m3, m7 ; t15a
paddd m7, m3 ; t7a
REPX {pmaxsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
REPX {pminsd x, m14}, m8, m4, m5, m6, m0, m1, m2, m7
vpbroadcastd m11, [pd_2276]
vpbroadcastd m10, [pd_3406]
ITX_MULSUB_2D 8, 4, 3, 9, _, 12, 10, 11
ITX_MULSUB_2D 6, 5, 3, 9, _, 12, 11, 10
psubd m3, m0, m2 ; t6
paddd m0, m2 ; t2
psubd m2, m1, m7 ; t7
paddd m1, m7 ; t3
psubd m7, m4, m6 ; t14a
paddd m4, m6 ; t10a
psubd m6, m8, m5 ; t15a
paddd m5, m8 ; t11a
REPX {pmaxsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
REPX {pminsd x, m14}, m3, m2, m7, m6 ; clip the rest later
vpbroadcastd m11, [pd_1567]
vpbroadcastd m10, [pd_3784]
ITX_MULSUB_2D 2, 3, 8, 9, _, 12, 10, 11
ITX_MULSUB_2D 6, 7, 8, 9, _, 12, 10, 11
mova [r6-32*4], m0
mova [r6-32*3], m1
mova [r6+32*0], m4
mova [r6+32*1], m5
mova [r6-32*2], m2
mova [r6-32*1], m3
mova [r6+32*2], m6
mova [r6+32*3], m7
ret
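
; A sketch of the ITX_MULSUB_2D calls above: each is one fixed-point
; butterfly rotation with 12-bit coefficients c0, c1 (note e.g.
; 995^2 + 3973^2 ~= 4096^2), roughly
;   dst1 = (src1*c0 - src2*c1 + 2048) >> 12
;   dst2 = (src1*c1 + src2*c0 + 2048) >> 12
; with per-stage sign conventions following the AV1 adst16 reference.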

INV_TXFM_16X8_FN flipadst, dct
INV_TXFM_16X8_FN flipadst, adst
INV_TXFM_16X8_FN flipadst, flipadst
INV_TXFM_16X8_FN flipadst, identity

cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
vpbroadcastd m13, [clip_18b_min]
vpbroadcastd m14, [clip_18b_max]
.pass1:
lea r6, [rsp+32*4]
call m(iadst_16x8_internal_10bpc).main
vpbroadcastd m14, [pd_3072]
psrld m15, 11
psubd m13, m14, m15
call .pass1_rotations
jmp m(iadst_16x8_internal_10bpc).pass1_end
.pass2:
call m(idct_16x8_internal_10bpc).transpose
call m(iadst_16x8_internal_8bpc).main
call m(iadst_16x8_internal_8bpc).main_pass2_end
vpbroadcastd m10, [pw_2048]
pxor m11, m11
psubw m11, m10
mova m12, m0
pmulhrsw m0, m7, m11
mova m7, m1
pmulhrsw m1, m6, m10
mova m6, m2
pmulhrsw m2, m5, m11
mova m5, m3
pmulhrsw m3, m4, m10
call m(idct_16x8_internal_10bpc).write_16x4_start
pmulhrsw m0, m5, m11
pmulhrsw m1, m6, m10
pmulhrsw m2, m7, m11
pmulhrsw m3, m12, m10
call m(idct_16x8_internal_10bpc).write_16x4_zero
RET
ALIGN function_align
.pass1_rotations:
psubd m8, m13, m7
paddd m7, m14, m9
paddd m9, m14, m6
psubd m6, m13, m10
psubd m10, m13, m5
paddd m5, m14, m11
paddd m11, m14, m4
psubd m4, m13, m12
psubd m12, m15, m3
paddd m3, m15, [r6-32*1]
paddd m13, m15, m2
psubd m2, m15, [r6-32*2]
psubd m14, m15, m1
mova m1, m15
paddd m15, m0
psubd m0, m1, [r6-32*4]
paddd m1, [r6-32*3]
ret

INV_TXFM_16X8_FN identity, dct
INV_TXFM_16X8_FN identity, adst
INV_TXFM_16X8_FN identity, flipadst
INV_TXFM_16X8_FN identity, identity

cglobal iidentity_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
.pass1:
vpbroadcastd m15, [pd_2896]
pmulld m0, m15, [cq+32* 0]
pmulld m1, m15, [cq+32* 1]
pmulld m2, m15, [cq+32* 2]
pmulld m3, m15, [cq+32* 3]
pmulld m4, m15, [cq+32* 4]
pmulld m5, m15, [cq+32* 5]
pmulld m6, m15, [cq+32* 6]
pmulld m7, m15, [cq+32* 7]
pmulld m8, m15, [cq+32* 8]
pmulld m9, m15, [cq+32* 9]
pmulld m10, m15, [cq+32*10]
pmulld m11, m15, [cq+32*11]
pmulld m12, m15, [cq+32*12]
pmulld m13, m15, [cq+32*13]
pmulld m14, m15, [cq+32*14]
pmulld m15, [cq+32*15]
mova [rsp], m7
vpbroadcastd m7, [pd_2048]
REPX {paddd x, m7 }, m0, m1, m2, m3, m4, m5, m6, \
m8, m9, m10, m11, m12, m13, m14, m15
paddd m7, [rsp]
REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
mova [rsp], m15
vpbroadcastd m15, [pd_5793]
REPX {pmulld x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14
pmulld m15, [rsp]
mova [rsp], m7
vpbroadcastd m7, [pd_3072]
REPX {paddd x, m7 }, m0, m1, m2, m3, m4, m5, m6, \
m8, m9, m10, m11, m12, m13, m14, m15
paddd m7, [rsp]
REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
jmp tx2q
.pass2:
call m(idct_16x8_internal_10bpc).transpose
vpbroadcastd m10, [pw_4096]
jmp m(idct_16x8_internal_10bpc).end

INV_TXFM_16X8_FN dct, dct, 12
INV_TXFM_16X8_FN dct, identity, 12
INV_TXFM_16X8_FN dct, adst, 12
INV_TXFM_16X8_FN dct, flipadst, 12

cglobal idct_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
vpbroadcastd m12, [clip_20b_min]
vpbroadcastd m13, [clip_20b_max]
jmp m(idct_16x8_internal_10bpc).pass1
.pass2:
call .pass2_main
RET
ALIGN function_align
.pass2_main:
call m(idct_8x16_internal_12bpc).transpose
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
vpbroadcastd m11, [pd_2048]
REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
call m(idct_8x8_internal_10bpc).main
call m(idct_8x8_internal_12bpc).round_shift4
mova [cq+32* 8], m0
mova [cq+32* 9], m1
mova [cq+32*10], m2
mova [cq+32*11], m3
mova [cq+32*12], m4
mova [cq+32*13], m5
mova [cq+32*14], m6
mova [cq+32*15], m7
pmaxsd m0, m12, [cq+32*0]
pmaxsd m1, m12, [cq+32*1]
pmaxsd m2, m12, [cq+32*2]
pmaxsd m3, m12, [cq+32*3]
pmaxsd m4, m12, [cq+32*4]
pmaxsd m5, m12, [cq+32*5]
pmaxsd m6, m12, [cq+32*6]
pmaxsd m7, m12, [cq+32*7]
REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
call m(idct_8x8_internal_10bpc).main
call m(idct_8x8_internal_12bpc).round_shift4
.end:
packssdw m0, [cq+32* 8]
packssdw m1, [cq+32* 9]
packssdw m2, [cq+32*10]
packssdw m3, [cq+32*11]
packssdw m4, [cq+32*12]
packssdw m5, [cq+32*13]
packssdw m6, [cq+32*14]
packssdw m7, [cq+32*15]
REPX {vpermq x, x, q3120}, m0, m1, m2, m3
call .write_16x4_start
call m(idct_16x8_internal_10bpc).write_16x4_zero
vpermq m0, m4, q3120
vpermq m1, m5, q3120
vpermq m2, m6, q3120
vpermq m3, m7, q3120
jmp m(idct_16x8_internal_10bpc).write_16x4_zero
ALIGN function_align
.write_16x4_start:
vpbroadcastd m9, [pixel_12bpc_max]
lea r3, [strideq*3]
pxor m8, m8
ret

INV_TXFM_16X8_FN adst, dct, 12
INV_TXFM_16X8_FN adst, adst, 12
INV_TXFM_16X8_FN adst, flipadst, 12
INV_TXFM_16X8_FN adst, identity, 12

cglobal iadst_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
vpbroadcastd m13, [clip_20b_min]
vpbroadcastd m14, [clip_20b_max]
jmp m(iadst_16x8_internal_10bpc).pass1
.pass2:
call .pass2_main
call m(idct_16x8_internal_12bpc).end
RET
ALIGN function_align
.pass2_main:
call m(idct_8x16_internal_12bpc).transpose
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
vpbroadcastd m11, [pd_2048]
REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
call m(iadst_8x8_internal_12bpc).pass2_main2
mova [cq+32* 8], m0
mova [cq+32* 9], m1
mova [cq+32*10], m2
mova [cq+32*11], m3
mova [cq+32*12], m4
mova [cq+32*13], m5
mova [cq+32*14], m6
mova [cq+32*15], m7
pmaxsd m0, m12, [cq+32*0]
pmaxsd m1, m12, [cq+32*1]
pmaxsd m2, m12, [cq+32*2]
pmaxsd m3, m12, [cq+32*3]
pmaxsd m4, m12, [cq+32*4]
pmaxsd m5, m12, [cq+32*5]
pmaxsd m6, m12, [cq+32*6]
pmaxsd m7, m12, [cq+32*7]
REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
call m(iadst_8x8_internal_12bpc).pass2_main2
ret

INV_TXFM_16X8_FN flipadst, dct, 12
INV_TXFM_16X8_FN flipadst, adst, 12
INV_TXFM_16X8_FN flipadst, flipadst, 12
INV_TXFM_16X8_FN flipadst, identity, 12

cglobal iflipadst_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
vpbroadcastd m13, [clip_20b_min]
vpbroadcastd m14, [clip_20b_max]
jmp m(iflipadst_16x8_internal_10bpc).pass1
.pass2:
call m(iadst_16x8_internal_12bpc).pass2_main
packssdw m13, m0, [cq+32* 8]
packssdw m12, m1, [cq+32* 9]
packssdw m11, m2, [cq+32*10]
packssdw m10, m3, [cq+32*11]
packssdw m3, m4, [cq+32*12]
packssdw m2, m5, [cq+32*13]
packssdw m1, m6, [cq+32*14]
packssdw m0, m7, [cq+32*15]
REPX {vpermq x, x, q3120}, m0, m1, m2, m3
call m(idct_16x8_internal_12bpc).write_16x4_start
call m(idct_16x8_internal_10bpc).write_16x4_zero
vpermq m0, m10, q3120
vpermq m1, m11, q3120
vpermq m2, m12, q3120
vpermq m3, m13, q3120
call m(idct_16x8_internal_10bpc).write_16x4_zero
RET

INV_TXFM_16X8_FN identity, dct, 12
INV_TXFM_16X8_FN identity, adst, 12
INV_TXFM_16X8_FN identity, flipadst, 12
INV_TXFM_16X8_FN identity, identity, 12

cglobal iidentity_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
jmp m(iidentity_16x8_internal_10bpc).pass1
.pass2:
call m(idct_16x8_internal_10bpc).transpose2
vpbroadcastd m10, [pw_4096]
pmulhrsw m0, m10
pmulhrsw m1, m10
pmulhrsw m2, m10
pmulhrsw m3, m10
call m(idct_16x8_internal_12bpc).write_16x4_start
call m(idct_16x8_internal_10bpc).write_16x4_zero
jmp m(idct_16x8_internal_10bpc).end2

%macro INV_TXFM_16X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth
INV_TXFM_FN %1, %2, %3, 16x16, %4
%ifidn %1_%2, dct_dct
imul r6d, [cq], 181
vpbroadcastd m3, [dconly_%4bpc]
mov [cq], eobd ; 0
or r3d, 16
add r6d, 640
sar r6d, 10
jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3
%endif
%endmacro
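
; dc-only sketch for 16x16: the "+640, sar 10" fuses two rounded shifts,
; since ((dc*181 + 128) >> 8 + 2) >> 2 == (dc*181 + 640) >> 10, i.e.
; round(round(dc*181/256)/4) with 181/256 ~= 1/sqrt(2).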

INV_TXFM_16X16_FN dct, dct
INV_TXFM_16X16_FN dct, identity, 28
INV_TXFM_16X16_FN dct, adst
INV_TXFM_16X16_FN dct, flipadst

cglobal idct_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
.pass1:
vpbroadcastd m11, [pd_2048]
vpbroadcastd m14, [pd_2896]
lea r6, [rsp+32*4]
sub eobd, 36
jl .fast
add cq, 32
call .main
sub cq, 32
mova m10, [r6-32*4]
mova m9, [r6-32*3]
mova m8, [r6-32*2]
psubd m15, m0, m10 ; out15
paddd m0, m10 ; out0
psubd m10, m1, m9 ; out14
paddd m1, m9 ; out1
psubd m9, m2, m8 ; out13
paddd m2, m8 ; out2
REPX {psrad x, 2}, m0, m1, m2
mova [r6-32*4], m0
mova [r6-32*3], m1
mova [r6-32*2], m2
mova m2, [r6-32*1]
mova m1, [r6+32*0]
mova m0, [r6+32*1]
REPX {psrad x, 2}, m9, m10, m15
psubd m8, m3, m2 ; out12
paddd m3, m2 ; out3
psubd m2, m4, m1 ; out11
paddd m4, m1 ; out4
psubd m1, m5, m0 ; out10
paddd m5, m0 ; out5
REPX {psrad x, 2}, m3, m4, m5
mova [r6-32*1], m3
mova [r6+32*0], m4
mova [r6+32*1], m5
mova m4, [r6+32*2]
mova m3, [r6+32*3]
REPX {psrad x, 2}, m1, m2, m8
psubd m5, m6, m4 ; out9
paddd m6, m4 ; out6
psubd m4, m7, m3 ; out8
paddd m7, m3 ; out7
REPX {psrad x, 2}, m6, m7, m4, m5
mova [r6+32*2], m6
mova [r6+32*3], m7
add r6, 32*8
mova [r6-32*4], m4
mova [r6-32*3], m5
mova [r6-32*2], m1
mova [r6-32*1], m2
mova [r6+32*0], m8
mova [r6+32*1], m9
mova [r6+32*2], m10
mova [r6+32*3], m15
.fast:
add r6, 32*8
call .main
mova m14, [r6-32*4]
mova m13, [r6-32*3]
mova m12, [r6-32*2]
mova m11, [r6-32*1]
mova m10, [r6+32*0]
mova m9, [r6+32*1]
mova m8, [r6+32*2]
psubd m15, m0, m14 ; out15
paddd m0, m14 ; out0
psubd m14, m1, m13 ; out14
paddd m1, m13 ; out1
psubd m13, m2, m12 ; out13
paddd m2, m12 ; out2
psubd m12, m3, m11 ; out12
paddd m3, m11 ; out3
psubd m11, m4, m10 ; out11
paddd m4, m10 ; out4
psubd m10, m5, m9 ; out10
paddd m5, m9 ; out5
psubd m9, m6, m8 ; out9
paddd m6, m8 ; out6
psubd m8, m7, [r6+32*3] ; out8
paddd m7, [r6+32*3] ; out7
sub r6, 32*8
REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
jmp tx2q
.pass2:
call .transpose
lea r6, [pw_5+128]
mova [rsp], m15
call m(idct_16x16_internal_8bpc).main
mova m1, [rsp+32*1]
.end:
call .write_16x16
RET
ALIGN function_align
.write_16x16:
mova [rsp+gprsize+32*0], m8
mova [rsp+gprsize+32*1], m9
mova [rsp+gprsize+32*2], m12
vpbroadcastd m12, [pw_2048]
pmulhrsw m0, m12
pmulhrsw m1, m12
pmulhrsw m2, m12
pmulhrsw m3, m12
call m(idct_16x8_internal_10bpc).write_16x4_start
.write_16x16_2:
pmulhrsw m0, m12, m4
pmulhrsw m1, m12, m5
pmulhrsw m2, m12, m6
pmulhrsw m3, m12, m7
call m(idct_16x8_internal_10bpc).write_16x4_zero
pmulhrsw m0, m12, [rsp+gprsize+32*0]
pmulhrsw m1, m12, [rsp+gprsize+32*1]
pmulhrsw m2, m12, m10
pmulhrsw m3, m12, m11
call m(idct_16x8_internal_10bpc).write_16x4_zero
pmulhrsw m0, m12, [rsp+gprsize+32*2]
pmulhrsw m1, m12, m13
pmulhrsw m2, m12, m14
pmulhrsw m3, m12, m15
jmp m(idct_16x8_internal_10bpc).write_16x4_zero
ALIGN function_align
.transpose:
test eobd, eobd
jl .transpose_fast
packssdw m8, [r6-32*4]
packssdw m9, [r6-32*3]
packssdw m10, [r6-32*2]
packssdw m11, [r6-32*1]
packssdw m12, [r6+32*0]
packssdw m13, [r6+32*1]
packssdw m14, [r6+32*2]
packssdw m15, [r6+32*3]
sub r6, 32*8
packssdw m0, [r6-32*4]
packssdw m1, [r6-32*3]
packssdw m2, [r6-32*2]
packssdw m3, [r6-32*1]
packssdw m4, [r6+32*0]
packssdw m5, [r6+32*1]
packssdw m6, [r6+32*2]
packssdw m7, [r6+32*3]
mova [r6], m8
punpckhwd m8, m0, m1
punpcklwd m0, m1
punpcklwd m1, m2, m3
punpckhwd m2, m3
punpckhwd m3, m6, m7
punpcklwd m6, m7
punpcklwd m7, m4, m5
punpckhwd m4, m5
punpckldq m5, m8, m2
punpckhdq m8, m2
punpckhdq m2, m0, m1
punpckldq m0, m1
punpckhdq m1, m7, m6
punpckldq m7, m6
punpckhdq m6, m4, m3
punpckldq m4, m3
punpckhqdq m3, m2, m1
punpcklqdq m2, m1
punpckhqdq m1, m0, m7
punpcklqdq m0, m7
punpcklqdq m7, m8, m6
punpckhqdq m8, m6
punpckhqdq m6, m5, m4
punpcklqdq m5, m4
mova m4, [r6]
mova [r6], m8
punpcklwd m8, m4, m9
punpckhwd m4, m9
punpcklwd m9, m10, m11
punpckhwd m10, m11
punpckhwd m11, m14, m15
punpcklwd m14, m15
punpckhwd m15, m12, m13
punpcklwd m12, m13
punpckldq m13, m4, m10
punpckhdq m4, m10
punpckhdq m10, m8, m9
punpckldq m8, m9
punpckhdq m9, m12, m14
punpckldq m12, m14
punpckhdq m14, m15, m11
punpckldq m15, m11
punpckhqdq m11, m10, m9
punpcklqdq m10, m9
punpckhqdq m9, m8, m12
punpcklqdq m8, m12
punpcklqdq m12, m13, m15
punpckhqdq m13, m15
punpckhqdq m15, m4, m14
punpcklqdq m14, m4, m14
vperm2i128 m4, m0, m8, 0x31
vinserti128 m0, xm8, 1
vinserti128 m8, m5, xm12, 1
vperm2i128 m12, m5, 0x13
vperm2i128 m5, m1, m9, 0x31
vinserti128 m1, xm9, 1
vinserti128 m9, m6, xm13, 1
vperm2i128 m13, m6, 0x13
vperm2i128 m6, m2, m10, 0x31
vinserti128 m2, xm10, 1
vinserti128 m10, m7, xm14, 1
vperm2i128 m14, m7, 0x13
vperm2i128 m7, m3, m11, 0x31
vinserti128 m3, xm11, 1
mova xm11, [r6]
vinserti128 m11, xm15, 1
vinserti128 m15, [r6+16], 0
ret
.transpose_fast:
call m(idct_16x8_internal_10bpc).transpose2
pxor m8, m8
REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
ret
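
; .transpose above performs a full 16x16 word transpose: packssdw narrows
; the dword rows, three punpck levels (wd/dq/qdq) transpose within 128-bit
; lanes, and the vperm2i128/vinserti128 pairs swap the cross-lane halves;
; one row is parked at [r6] since all 16 ymm registers are live.
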
ALIGN function_align
.main:
mova m0, [cq+64* 1]
mova m1, [cq+64* 3]
mova m2, [cq+64* 5]
mova m3, [cq+64* 7]
mova m4, [cq+64* 9]
mova m5, [cq+64*11]
mova m6, [cq+64*13]
mova m7, [cq+64*15]
call m(idct_8x16_internal_10bpc).main_oddhalf
mova m0, [cq+64* 0]
mova m1, [cq+64* 2]
mova m2, [cq+64* 4]
mova m3, [cq+64* 6]
mova m4, [cq+64* 8]
mova m5, [cq+64*10]
mova m6, [cq+64*12]
mova m7, [cq+64*14]
call m(idct_8x8_internal_10bpc).main
call m(idct_8x16_internal_10bpc).main_evenhalf
psrld m10, m11, 10 ; pd_2
REPX {paddd x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
ret

INV_TXFM_16X16_FN adst, dct
INV_TXFM_16X16_FN adst, adst
INV_TXFM_16X16_FN adst, flipadst

cglobal iadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
vpbroadcastd m13, [clip_18b_min]
vpbroadcastd m14, [clip_18b_max]
.pass1:
vpbroadcastd m15, [pd_2896]
lea r6, [rsp+32*4]
sub eobd, 36
jl .fast
add cq, 32
call .main
sub cq, 32
vpbroadcastd m8, [pd_5120]
paddd m4, m8
paddd m6, m8
paddd m9, m8
paddd m11, m8
vpbroadcastd m8, [pd_5119]
psubd m5, m8, m5
psubd m7, m8, m7
psubd m10, m8, m10
psubd m12, m8, m12
REPX {psrad x, 13}, m4, m5, m6, m7, m9, m10, m11, m12
mova [r6+32*0], m4
mova [r6+32*1], m5
mova [r6+32*2], m6
mova [r6+32*3], m7
psrld m4, m15, 10 ; pd_2
paddd m0, m4
psubd m1, m4, m1
paddd m2, m4
psubd m3, m4, m3
psubd m7, m4, [r6-32*4]
paddd m6, m4, [r6-32*3]
psubd m5, m4, [r6-32*2]
paddd m4, [r6-32*1]
REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
mova [r6-32*4], m0
mova [r6-32*3], m1
mova [r6-32*2], m2
mova [r6-32*1], m3
add r6, 32*8
mova [r6-32*4], m9
mova [r6-32*3], m10
mova [r6-32*2], m11
mova [r6-32*1], m12
mova [r6+32*0], m4
mova [r6+32*1], m5
mova [r6+32*2], m6
mova [r6+32*3], m7
.fast:
add r6, 32*8
call .main
vpbroadcastd m14, [pd_5120]
vpbroadcastd m13, [pd_5119]
psrld m15, 10 ; pd_2
paddd m0, m15
psubd m1, m15, m1
paddd m2, m15
psubd m3, m15, m3
paddd m4, m14
psubd m5, m13, m5
paddd m6, m14
psubd m7, m13, m7
paddd m8, m14, m9
psubd m9, m13, m10
paddd m10, m14, m11
psubd m11, m13, m12
paddd m12, m15, [r6-32*1]
psubd m13, m15, [r6-32*2]
paddd m14, m15, [r6-32*3]
psubd m15, [r6-32*4]
.pass1_end:
REPX {psrad x, 2 }, m0, m1, m2, m3, m12, m13, m14, m15
REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11
sub r6, 32*8
jmp tx2q
.pass2:
call m(idct_16x16_internal_10bpc).transpose
lea r6, [pw_5+128]
mova [rsp], m15
call m(iadst_16x16_internal_8bpc).main
call m(iadst_16x16_internal_8bpc).main_pass2_end
mova [rsp+32*0], m8
mova [rsp+32*2], m12
mova [rsp+32*3], m13
vpbroadcastd m12, [pw_2048]
pxor m13, m13
psubw m13, m12
pmulhrsw m0, m12
pmulhrsw m1, m13, [rsp+32*1]
mova [rsp+32*1], m9
pmulhrsw m2, m12
pmulhrsw m3, m13
call m(idct_16x8_internal_10bpc).write_16x4_start
pmulhrsw m0, m12, m4
pmulhrsw m1, m13, m5
pmulhrsw m2, m12, m6
pmulhrsw m3, m13, m7
call m(idct_16x8_internal_10bpc).write_16x4_zero
pmulhrsw m0, m12, [rsp+32*0]
pmulhrsw m1, m13, [rsp+32*1]
pmulhrsw m2, m12, m10
pmulhrsw m3, m13, m11
call m(idct_16x8_internal_10bpc).write_16x4_zero
pmulhrsw m0, m12, [rsp+32*2]
pmulhrsw m1, m13, [rsp+32*3]
pmulhrsw m2, m12, m14
pmulhrsw m3, m13, m15
call m(idct_16x8_internal_10bpc).write_16x4_zero
RET
ALIGN function_align
.main:
mova m0, [cq+64* 2]
mova m1, [cq+64*13]
mova m2, [cq+64* 6]
mova m3, [cq+64* 9]
mova m4, [cq+64*10]
mova m5, [cq+64* 5]
mova m6, [cq+64*14]
mova m7, [cq+64* 1]
vpbroadcastd m12, [pd_2048]
call m(iadst_16x8_internal_10bpc).main_part1
mova m0, [cq+64* 0]
mova m1, [cq+64*15]
mova m2, [cq+64* 4]
mova m3, [cq+64*11]
mova m4, [cq+64* 8]
mova m5, [cq+64* 7]
mova m6, [cq+64*12]
mova m7, [cq+64* 3]
jmp m(iadst_16x8_internal_10bpc).main_part2

INV_TXFM_16X16_FN flipadst, dct
INV_TXFM_16X16_FN flipadst, adst
INV_TXFM_16X16_FN flipadst, flipadst

cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
vpbroadcastd m13, [clip_18b_min]
vpbroadcastd m14, [clip_18b_max]
.pass1:
vpbroadcastd m15, [pd_2896]
lea r6, [rsp+32*4]
sub eobd, 36
jl .fast
add cq, 32
call m(iadst_16x16_internal_10bpc).main
sub cq, 32
vpbroadcastd m8, [pd_5120]
paddd m11, m8
paddd m9, m8
paddd m6, m8
paddd m4, m8
vpbroadcastd m8, [pd_5119]
psubd m12, m8, m12
psubd m10, m8, m10
psubd m7, m8, m7
psubd m5, m8, m5
REPX {psrad x, 13}, m12, m11, m10, m9, m7, m6, m5, m4
mova [r6+32*0], m12
mova [r6+32*1], m11
mova [r6+32*2], m10
mova [r6+32*3], m9
psrld m9, m15, 10 ; pd_2
psubd m3, m9, m3
paddd m2, m9
psubd m1, m9, m1
paddd m0, m9
psubd m12, m9, [r6-32*4]
paddd m11, m9, [r6-32*3]
psubd m10, m9, [r6-32*2]
paddd m9, [r6-32*1]
REPX {psrad x, 2 }, m12, m11, m10, m9, m3, m2, m1, m0
mova [r6-32*4], m12
mova [r6-32*3], m11
mova [r6-32*2], m10
mova [r6-32*1], m9
add r6, 32*8
mova [r6-32*4], m7
mova [r6-32*3], m6
mova [r6-32*2], m5
mova [r6-32*1], m4
mova [r6+32*0], m3
mova [r6+32*1], m2
mova [r6+32*2], m1
mova [r6+32*3], m0
.fast:
add r6, 32*8
call m(iadst_16x16_internal_10bpc).main
vpbroadcastd m14, [pd_5120]
vpbroadcastd m13, [pd_5119]
psrld m15, 10 ; pd_2
psubd m8, m13, m7
paddd m7, m14, m9
paddd m9, m14, m6
psubd m6, m13, m10
psubd m10, m13, m5
paddd m5, m14, m11
paddd m11, m14, m4
psubd m4, m13, m12
psubd m12, m15, m3
paddd m3, m15, [r6-32*1]
paddd m13, m15, m2
psubd m2, m15, [r6-32*2]
psubd m14, m15, m1
mova m1, m15
paddd m15, m0
psubd m0, m1, [r6-32*4]
paddd m1, [r6-32*3]
jmp m(iadst_16x16_internal_10bpc).pass1_end
.pass2:
call m(idct_16x16_internal_10bpc).transpose
lea r6, [pw_5+128]
mova [rsp], m15
call m(iadst_16x16_internal_8bpc).main
call m(iadst_16x16_internal_8bpc).main_pass2_end
mova [rsp+32*3], m3
mova [rsp+32*2], m2
mova [rsp+32*0], m0
mova m2, m13
mova m3, m12
vpbroadcastd m12, [pw_2048]
pxor m13, m13
psubw m13, m12
pmulhrsw m0, m13, m15
pmulhrsw m1, m12, m14
pmulhrsw m2, m13
pmulhrsw m3, m12
mova m14, m8
mova m15, m9
call m(idct_16x8_internal_10bpc).write_16x4_start
pmulhrsw m0, m13, m11
pmulhrsw m1, m12, m10
pmulhrsw m2, m13, m15
pmulhrsw m3, m12, m14
call m(idct_16x8_internal_10bpc).write_16x4_zero
pmulhrsw m0, m13, m7
pmulhrsw m1, m12, m6
pmulhrsw m2, m13, m5
pmulhrsw m3, m12, m4
call m(idct_16x8_internal_10bpc).write_16x4_zero
pmulhrsw m0, m13, [rsp+32*3]
pmulhrsw m1, m12, [rsp+32*2]
pmulhrsw m2, m13, [rsp+32*1]
pmulhrsw m3, m12, [rsp+32*0]
call m(idct_16x8_internal_10bpc).write_16x4_zero
RET

INV_TXFM_16X16_FN identity, dct, -92
INV_TXFM_16X16_FN identity, identity

cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
vpbroadcastd m15, [pd_5793]
vpbroadcastd m7, [pd_5120]
lea r6, [rsp+32*4]
sub eobd, 36
jl .fast
mov r3, -32*8*4
.righthalf:
pmulld m0, m15, [cq+r3+32*33]
pmulld m1, m15, [cq+r3+32*35]
pmulld m2, m15, [cq+r3+32*37]
pmulld m3, m15, [cq+r3+32*39]
add r6, 32*4
REPX {paddd x, m7}, m0, m1, m2, m3
REPX {psrad x, 13}, m0, m1, m2, m3
mova [r6+32*0], m0
mova [r6+32*1], m1
mova [r6+32*2], m2
mova [r6+32*3], m3
add r3, 32*8
jl .righthalf
.fast:
pmulld m0, m15, [cq+64* 0]
pmulld m1, m15, [cq+64* 1]
pmulld m2, m15, [cq+64* 2]
pmulld m3, m15, [cq+64* 3]
pmulld m4, m15, [cq+64* 4]
pmulld m5, m15, [cq+64* 5]
pmulld m6, m15, [cq+64* 6]
pmulld m8, m15, [cq+64* 7]
mova [cq], m8
pmulld m8, m15, [cq+64* 8]
pmulld m9, m15, [cq+64* 9]
pmulld m10, m15, [cq+64*10]
pmulld m11, m15, [cq+64*11]
pmulld m12, m15, [cq+64*12]
pmulld m13, m15, [cq+64*13]
pmulld m14, m15, [cq+64*14]
pmulld m15, [cq+64*15]
REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6, \
m8, m9, m10, m11, m12, m13, m14, m15
paddd m7, [cq]
REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
jmp tx2q
.pass2:
call m(idct_16x16_internal_10bpc).transpose

mova [cq+32*0], m15
mova [cq+32*1], m0
vpbroadcastd m15, [pw_1697x16]

REPX {IDTX16 x, 0, 15}, 1, 2, 3, 4, 5, 6, 7, \
8, 9, 10, 11, 12, 13, 14
mova m0, [cq+32*1]
mova [cq+32*1], m1
IDTX16 0, 1, 15
mova m1, [cq+32*0]
pmulhrsw m15, m1
paddsw m1, m1
paddsw m15, m1
mova m1, [cq+32*1]
jmp m(idct_16x16_internal_10bpc).end

INV_TXFM_16X16_FN dct, dct, 0, 12
INV_TXFM_16X16_FN dct, identity, 28, 12
INV_TXFM_16X16_FN dct, adst, 0, 12
INV_TXFM_16X16_FN dct, flipadst, 0, 12

cglobal idct_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
vpbroadcastd m12, [clip_20b_min]
vpbroadcastd m13, [clip_20b_max]
jmp m(idct_16x16_internal_10bpc).pass1
.pass2:
mova [cq+32* 8], m8
mova [cq+32* 9], m9
mova [cq+32*10], m10
mova [cq+32*11], m11
mova [cq+32*12], m12
mova [cq+32*13], m13
mova [cq+32*14], m14
mova [cq+32*15], m15
call .pass2_main
packssdw m0, m1
packssdw m1, m2, m3
packssdw m2, m4, m5
packssdw m3, m6, m7
packssdw m4, m8, m9
packssdw m5, m10, m11
packssdw m6, m12, m13
packssdw m7, m14, m15
mova [r6-32*4], m0
mova [r6-32*3], m1
mova [r6-32*2], m2
mova [r6-32*1], m3
mova [r6+32*0], m4
mova [r6+32*1], m5
mova [r6+32*2], m6
mova [r6+32*3], m7
mova m0, [cq+32* 8]
mova m1, [cq+32* 9]
mova m2, [cq+32*10]
mova m3, [cq+32*11]
mova m4, [cq+32*12]
mova m5, [cq+32*13]
mova m6, [cq+32*14]
mova m7, [cq+32*15]
mov r5, r6
add r6, 32*16
call .pass2_main
jmp m(iadst_16x16_internal_12bpc).end
ALIGN function_align
.write_16x16:
mova [rsp+gprsize+32*0], m8
mova [rsp+gprsize+32*1], m9
mova [rsp+gprsize+32*2], m12
vpbroadcastd m12, [pw_16384]
pmulhrsw m0, m12
pmulhrsw m1, m12
pmulhrsw m2, m12
pmulhrsw m3, m12
call m(idct_16x8_internal_12bpc).write_16x4_start
call m(idct_16x8_internal_10bpc).write_16x4_zero
jmp m(idct_16x16_internal_10bpc).write_16x16_2
ALIGN function_align
.pass2_main:
call m(idct_8x8_internal_12bpc).transpose_8x8
mova [cq+32* 0], m0
mova [cq+32* 1], m2
mova [cq+32* 2], m4
mova [cq+32* 3], m6
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
pmaxsd m0, m12, m1
pmaxsd m1, m12, m3
pmaxsd m2, m12, m5
pmaxsd m3, m12, m7
REPX {pminsd x, m13}, m0, m1, m2, m3
test eobd, eobd
jge .pass2_slow
pxor m4, m4
REPX {mova x, m4}, m5, m6, m7
jmp .pass2_fast
.pass2_slow:
sub r6, 32*8
mova m8, [r6-32*4]
mova m4, [r6-32*3]
mova m10, [r6-32*2]
mova m5, [r6-32*1]
mova m12, [r6+32*0]
mova m6, [r6+32*1]
mova m14, [r6+32*2]
mova m7, [r6+32*3]
TRANSPOSE_8X8_DWORD 8, 4, 10, 5, 12, 6, 14, 7, 9, 11, 13, 15
mova [cq+32* 4], m8
mova [cq+32* 5], m10
mova [cq+32* 6], m12
mova [cq+32* 7], m14
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
REPX {pmaxsd x, m12}, m4, m5, m6, m7
REPX {pminsd x, m13}, m4, m5, m6, m7
.pass2_fast:
vpbroadcastd m11, [pd_2048]
vpbroadcastd m14, [pd_2896]
call m(idct_8x16_internal_10bpc).main_oddhalf
pmaxsd m0, m12, [cq+32* 0]
pmaxsd m1, m12, [cq+32* 1]
pmaxsd m2, m12, [cq+32* 2]
pmaxsd m3, m12, [cq+32* 3]
REPX {pminsd x, m13}, m0, m1, m2, m3
test eobd, eobd
jge .pass2_slow2
pxor m4, m4
REPX {mova x, m4}, m5, m6, m7
jmp .pass2_fast2
.pass2_slow2:
pmaxsd m4, m12, [cq+32* 4]
pmaxsd m5, m12, [cq+32* 5]
pmaxsd m6, m12, [cq+32* 6]
pmaxsd m7, m12, [cq+32* 7]
REPX {pminsd x, m13}, m4, m5, m6, m7
.pass2_fast2:
call m(idct_8x8_internal_10bpc).main
call m(idct_8x16_internal_10bpc).main_evenhalf
psrad m11, 8 ; pd_8
REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
call m(idct_16x8_internal_10bpc).pass1_rotations
REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
ret

INV_TXFM_16X16_FN adst, dct, 0, 12
INV_TXFM_16X16_FN adst, adst, 0, 12
INV_TXFM_16X16_FN adst, flipadst, 0, 12

cglobal iadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
vpbroadcastd m13, [clip_20b_min]
vpbroadcastd m14, [clip_20b_max]
jmp m(iadst_16x16_internal_10bpc).pass1
.pass2:
call .pass2_part1
call m(iadst_16x8_internal_10bpc).pass1_rotations
call .pass2_part2
call m(iadst_16x8_internal_10bpc).pass1_rotations
.pass2_part3:
REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15
REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11
.end:
packssdw m15, m14
packssdw m14, m13, m12
packssdw m13, m11, m10
packssdw m12, m9, m8
packssdw m11, m7, m6
packssdw m10, m5, m4
packssdw m7, m3, m2
packssdw m6, m1, m0
vpblendd m0, m6, [r5-32*4], 0x33
vpblendd m1, m6, [r5-32*4], 0xcc
vpblendd m2, m7, [r5-32*3], 0x33
vpblendd m3, m7, [r5-32*3], 0xcc
vpermq m0, m0, q3120
vpermq m1, m1, q2031
vpermq m2, m2, q3120
vpermq m3, m3, q2031
call m(idct_16x8_internal_12bpc).write_16x4_start
call m(idct_16x8_internal_10bpc).write_16x4_zero
vpblendd m0, m10, [r5-32*2], 0x33
vpblendd m1, m10, [r5-32*2], 0xcc
vpblendd m2, m11, [r5-32*1], 0x33
vpblendd m3, m11, [r5-32*1], 0xcc
vpermq m0, m0, q3120
vpermq m1, m1, q2031
vpermq m2, m2, q3120
vpermq m3, m3, q2031
call m(idct_16x8_internal_10bpc).write_16x4_zero
vpblendd m0, m12, [r5+32*0], 0x33
vpblendd m1, m12, [r5+32*0], 0xcc
vpblendd m2, m13, [r5+32*1], 0x33
vpblendd m3, m13, [r5+32*1], 0xcc
vpermq m0, m0, q3120
vpermq m1, m1, q2031
vpermq m2, m2, q3120
vpermq m3, m3, q2031
call m(idct_16x8_internal_10bpc).write_16x4_zero
vpblendd m0, m14, [r5+32*2], 0x33
vpblendd m1, m14, [r5+32*2], 0xcc
vpblendd m2, m15, [r5+32*3], 0x33
vpblendd m3, m15, [r5+32*3], 0xcc
vpermq m0, m0, q3120
vpermq m1, m1, q2031
vpermq m2, m2, q3120
vpermq m3, m3, q2031
call m(idct_16x8_internal_10bpc).write_16x4_zero
RET
ALIGN function_align
.pass2_part1:
mova [cq+32* 8], m8
mova [cq+32* 9], m9
mova [cq+32*10], m10
mova [cq+32*11], m11
mova [cq+32*12], m12
mova [cq+32*13], m13
mova [cq+32*14], m14
mova [cq+32*15], m15
.pass2_main:
call m(idct_8x8_internal_12bpc).transpose_8x8
mova [cq+32* 0], m0
mova [cq+32* 1], m3
mova [cq+32* 2], m4
mova [cq+32* 3], m7
vpbroadcastd m13, [clip_18b_min]
vpbroadcastd m14, [clip_18b_max]
pmaxsd m0, m13, m2
pmaxsd m2, m13, m6
pmaxsd m5, m13, m5
pmaxsd m7, m13, m1
REPX {pminsd x, m14}, m0, m2, m5, m7
test eobd, eobd
jge .pass2_slow
pxor m1, m1
REPX {mova x, m1}, m3, m4, m6
jmp .pass2_fast
.pass2_slow:
sub r6, 32*8
mova m8, [r6-32*4]
mova m3, [r6-32*3]
mova m4, [r6-32*2]
mova m11, [r6-32*1]
mova m12, [r6+32*0]
mova m1, [r6+32*1]
mova m6, [r6+32*2]
mova m15, [r6+32*3]
TRANSPOSE_8X8_DWORD 8, 3, 4, 11, 12, 1, 6, 15, 13, 9, 10, 14
mova [cq+32* 4], m8
mova [cq+32* 5], m11
mova [cq+32* 6], m12
mova [cq+32* 7], m15
vpbroadcastd m13, [clip_18b_min]
vpbroadcastd m14, [clip_18b_max]
REPX {pmaxsd x, m13}, m1, m3, m4, m6
REPX {pminsd x, m14}, m1, m3, m4, m6
.pass2_fast:
vpbroadcastd m12, [pd_2048]
vpbroadcastd m15, [pd_2896]
call m(iadst_16x8_internal_10bpc).main_part1
pmaxsd m0, m13, [cq+32* 0] ; 0
pmaxsd m7, m13, [cq+32* 1] ; 3
pmaxsd m2, m13, [cq+32* 2] ; 4
pmaxsd m5, m13, [cq+32* 3] ; 7
REPX {pminsd x, m14}, m0, m2, m5, m7
test eobd, eobd
jge .pass2_slow2
pxor m1, m1
REPX {mova x, m1}, m3, m4, m6
jmp .pass2_fast2
.pass2_slow2:
pmaxsd m4, m13, [cq+32* 4] ; 8
pmaxsd m3, m13, [cq+32* 5] ; 11
pmaxsd m6, m13, [cq+32* 6] ; 12
pmaxsd m1, m13, [cq+32* 7] ; 15
REPX {pminsd x, m14}, m1, m3, m4, m6
.pass2_fast2:
call m(iadst_16x8_internal_10bpc).main_part2
vpbroadcastd m14, [pd_17408]
psrld m15, 11 ; pd_1
psubd m13, m14, m15 ; pd_17407
pslld m15, 3 ; pd_8
ret
ALIGN function_align
.pass2_part2:
REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15
REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11
packssdw m0, m1
packssdw m1, m2, m3
packssdw m2, m4, m5
packssdw m3, m6, m7
packssdw m4, m8, m9
packssdw m5, m10, m11
packssdw m6, m12, m13
packssdw m7, m14, m15
mova [r6-32*4], m0
mova [r6-32*3], m1
mova [r6-32*2], m2
mova [r6-32*1], m3
mova [r6+32*0], m4
mova [r6+32*1], m5
mova [r6+32*2], m6
mova [r6+32*3], m7
mova m0, [cq+32* 8]
mova m1, [cq+32* 9]
mova m2, [cq+32*10]
mova m3, [cq+32*11]
mova m4, [cq+32*12]
mova m5, [cq+32*13]
mova m6, [cq+32*14]
mova m7, [cq+32*15]
mov r5, r6
add r6, 32*16
jmp .pass2_main

INV_TXFM_16X16_FN flipadst, dct, 0, 12
INV_TXFM_16X16_FN flipadst, adst, 0, 12
INV_TXFM_16X16_FN flipadst, flipadst, 0, 12

cglobal iflipadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
vpbroadcastd m13, [clip_20b_min]
vpbroadcastd m14, [clip_20b_max]
jmp m(iflipadst_16x16_internal_10bpc).pass1
.pass2:
call m(iadst_16x16_internal_12bpc).pass2_part1
call m(iflipadst_16x8_internal_10bpc).pass1_rotations
call m(iadst_16x16_internal_12bpc).pass2_part2
call m(iflipadst_16x8_internal_10bpc).pass1_rotations
jmp m(iadst_16x16_internal_12bpc).pass2_part3

INV_TXFM_16X16_FN identity, dct, -92, 12
INV_TXFM_16X16_FN identity, identity, 0, 12

%macro IDTX16_12BPC 1 ; src
pmulld m6, m7, m%1
paddd m6, m15
psrad m6, 12
paddd m6, m%1
psrad m%1, m6, 1
%endmacro
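
; IDTX16_12BPC computes x <- (x + ((x*1697 + m15) >> 12)) >> 1 with
; m7 = pd_1697 and m15 a fused rounding constant (pd_5120 = 1024 + 4096 at
; the call sites); since 1697/4096 ~= sqrt(2) - 1, this is approximately
; x/sqrt(2) with rounding.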

cglobal iidentity_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
vpbroadcastd m7, [pd_1697]
vpbroadcastd m15, [pd_5120]
lea r6, [rsp+32*4]
sub eobd, 36
jl .fast
mov r3, -32*8*4
.righthalf:
mova m10, [cq+r3+32*33]
mova m11, [cq+r3+32*35]
mova m12, [cq+r3+32*37]
mova m13, [cq+r3+32*39]
add r6, 32*4
pmulld m0, m7, m10
pmulld m1, m7, m11
pmulld m2, m7, m12
pmulld m3, m7, m13
REPX {paddd x, m15}, m0, m1, m2, m3
REPX {psrad x, 12 }, m0, m1, m2, m3
paddd m0, m10
paddd m1, m11
paddd m2, m12
paddd m3, m13
REPX {psrad x, 1 }, m0, m1, m2, m3
mova [r6+32*0], m0
mova [r6+32*1], m1
mova [r6+32*2], m2
mova [r6+32*3], m3
add r3, 32*8
jl .righthalf
.fast:
mova m0, [cq+64* 0]
mova m1, [cq+64* 1]
mova m2, [cq+64* 2]
mova m3, [cq+64* 3]
mova m4, [cq+64* 4]
mova m5, [cq+64* 5]
mova m8, [cq+64* 6]
mova m9, [cq+64* 7]
REPX {IDTX16_12BPC x}, 0, 1, 2, 3, 4, 5, 8, 9
mova [cq+64*0], m8
mova [cq+64*1], m9
mova m8, [cq+64* 8]
mova m9, [cq+64* 9]
mova m10, [cq+64*10]
mova m11, [cq+64*11]
mova m12, [cq+64*12]
mova m13, [cq+64*13]
mova m14, [cq+64*14]
REPX {IDTX16_12BPC x}, 8, 9, 10, 11, 12, 13, 14
mova m6, [cq+64*15]
pmulld m7, m6
paddd m7, m15
psrad m7, 12
paddd m7, m6
mova m6, [cq+64*0]
psrad m15, m7, 1
mova m7, [cq+64*1]
jmp tx2q
.pass2:
call m(iidentity_8x16_internal_12bpc).pass2_main
call m(idct_16x16_internal_10bpc).transpose_fast
test eobd, eobd
jl .pass2_fast
mova [cq+32* 8], m0
mova [cq+32* 9], m1
mova [cq+32*10], m2
mova [cq+32*11], m3
mova [cq+32*12], m4
mova [cq+32*13], m5
mova [cq+32*14], m6
mova [cq+32*15], m7
mova m8, [r6-32*4]
mova m9, [r6-32*3]
mova m10, [r6-32*2]
mova m11, [r6-32*1]
mova m12, [r6+32*0]
mova m13, [r6+32*1]
mova m14, [r6+32*2]
mova m15, [r6+32*3]
sub r6, 32*8
mova m0, [r6-32*4]
mova m1, [r6-32*3]
mova m2, [r6-32*2]
mova m3, [r6-32*1]
mova m4, [r6+32*0]
mova m5, [r6+32*1]
mova m6, [r6+32*2]
mova m7, [r6+32*3]
call m(iidentity_8x16_internal_12bpc).pass2_main
call m(idct_16x8_internal_10bpc).transpose2
mova m8, m0
mova m9, m1
mova m10, m2
mova m11, m3
mova m12, m4
mova m13, m5
mova m14, m6
mova m15, m7
mova m0, [cq+32* 8]
mova m1, [cq+32* 9]
mova m2, [cq+32*10]
mova m3, [cq+32*11]
mova m4, [cq+32*12]
mova m5, [cq+32*13]
mova m6, [cq+32*14]
mova m7, [cq+32*15]
.pass2_fast:
call m(idct_16x16_internal_12bpc).write_16x16
RET

%macro IDCT32_END 6-7 1 ; in/out1, out2, tmp[1-3], shift, pack
mova m%4, [r6+32*(%1-4)]
mova m%2, [r5+32*(3-%1)]
mova m%5, [r4+32*(%1-4)]
psubd m%3, m%1, m%4 ; idct16 out15 - n
paddd m%1, m%4 ; idct16 out0 + n
pmaxsd m%1, m12
pmaxsd m%3, m12
pminsd m%1, m13
pminsd m%3, m13
paddd m%1, m11
paddd m%3, m11
psubd m%4, m%1, m%2 ; out31 - n
paddd m%1, m%2 ; out0 + n
paddd m%2, m%3, m%5 ; out15 - n
psubd m%3, m%5 ; out16 + n
REPX {psrad x, %6}, m%1, m%3, m%2, m%4
%if %7 & 1
packssdw m%1, m%3 ; out0 + n, out16 + n
packssdw m%2, m%4 ; out15 - n, out31 - n
%endif
%endmacro
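
; IDCT32_END merges one idct16 lane with the t16..t31 terms spilled at
; r4/r5/r6: it first forms the idct16 out(n)/out(15-n) pair, clips, adds
; the rounding term held in m11, then produces out(n)/out(31-n) and
; out(15-n)/out(16+n), optionally packing each dword pair to words.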

cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 0, dst, stride, c, eob
test eobd, eobd
jz .dconly
PROLOGUE 0, 7, 16, 32*12, dst, stride, c, eob
%undef cmp
vpbroadcastd m11, [pd_2048]
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
vbroadcasti128 m14, [idct32_shuf]
mov r4, cq
call .pass1_main
mova [rsp+32*0], m2
mova [rsp+32*1], m3
cmp eobd, 43
jge .eob43
pxor m4, m4
REPX {mova x, m4}, [rsp+32*2], m2, m3, m11
jmp .pass1_end_fast
.eob43:
lea r6, [rsp+32*8]
mova [r6-32*4], m0
mova [r6-32*3], m1
call .pass1_main
mova [rsp+32*2], m2
cmp eobd, 107
jge .eob107
mova m11, m3
mova m2, m0
mova m3, m1
mova m0, [r6-32*4]
mova m1, [r6-32*3]
pxor m4, m4
.pass1_end_fast:
vpbroadcastd m10, [pw_2048]
lea r6, [deint_shuf+128]
REPX {mova x, m4}, m5, m6, m7
call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
jmp .end
.eob107:
mova [rsp+32*3], m3
mova [r6-32*2], m0
mova [r6-32*1], m1
call .pass1_main
cmp eobd, 171
jge .eob171
pshufd m12, m2, q1032
pshufd m13, m3, q1032
mova m4, m0
mova m5, m1
pxor m6, m6
REPX {mova x, m6}, m7, m14, m15
jmp .pass1_end
.eob171:
mova [r6+32*0], m0
mova [r6+32*1], m1
mova [r6+32*2], m2
mova [r6+32*3], m3
call .pass1_main
pshufd m12, [r6+32*2], q1032 ; out19 out17
pshufd m13, [r6+32*3], q1032 ; out23 out21
mova m4, [r6+32*0] ; out16 out18
mova m5, [r6+32*1] ; out20 out22
pshufd m14, m2, q1032 ; out27 out25
pshufd m15, m3, q1032 ; out31 out29
mova m6, m0 ; out24 out26
mova m7, m1 ; out28 out30
.pass1_end:
mova m0, [r6-32*4] ; out0 out2
mova m1, [r6-32*3] ; out4 out6
mova m2, [r6-32*2] ; out8 out10
mova m3, [r6-32*1] ; out12 out14
lea r6, [deint_shuf+128]
mova m11, [rsp+32*3] ; out13 out15
vpbroadcastd m10, [pw_2048]
call m(inv_txfm_add_dct_dct_8x32_8bpc).main
.end: ; [rsp+0*32] = m12
vpbroadcastd m12, [pw_2048]
mov cq, r4
mova [rsp+32*1], m8
mova [rsp+32*2], m9
mova [rsp+32*3], m10
mova [rsp+32*4], m11
vpermq m0, m0, q3120
vpermq m1, m1, q2031
pmulhrsw m0, m12
pmulhrsw m1, m12
call m(idct_8x8_internal_10bpc).write_8x4_start
vpermq m0, m2, q3120
vpermq m1, m3, q2031
pmulhrsw m0, m12
pmulhrsw m1, m12
call m(idct_8x8_internal_10bpc).write_8x4
vpermq m0, m4, q3120
vpermq m1, m5, q2031
pmulhrsw m0, m12
pmulhrsw m1, m12
call m(idct_8x8_internal_10bpc).write_8x4
vpermq m0, m6, q3120
vpermq m1, m7, q2031
pmulhrsw m0, m12
pmulhrsw m1, m12
call m(idct_8x8_internal_10bpc).write_8x4
vpermq m0, [rsp+32*1], q3120
vpermq m1, [rsp+32*2], q2031
pmulhrsw m0, m12
pmulhrsw m1, m12
call m(idct_8x8_internal_10bpc).write_8x4
vpermq m0, [rsp+32*3], q3120
vpermq m1, [rsp+32*4], q2031
pmulhrsw m0, m12
pmulhrsw m1, m12
call m(idct_8x8_internal_10bpc).write_8x4
vpermq m0, [rsp+32*0], q3120
vpermq m1, m13, q2031
pmulhrsw m0, m12
pmulhrsw m1, m12
call m(idct_8x8_internal_10bpc).write_8x4
vpermq m0, m14, q3120
vpermq m1, m15, q2031
pmulhrsw m0, m12
pmulhrsw m1, m12
call m(idct_8x8_internal_10bpc).write_8x4
RET
.dconly:
imul r6d, [cq], 181
vpbroadcastd m2, [dconly_10bpc]
mov [cq], eobd ; 0
or r3d, 32
add r6d, 640
sar r6d, 10
jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3
ALIGN function_align
.pass1_main_part1:
mova m0, [cq+128*0]
mova m1, [cq+128*1]
mova m2, [cq+128*2]
mova m3, [cq+128*3]
mova m4, [cq+128*4]
mova m5, [cq+128*5]
mova m6, [cq+128*6]
mova m7, [cq+128*7]
call m(idct_8x8_internal_10bpc).main
psrld m1, m11, 10 ; pd_2
REPX {paddd x, m1}, m0, m6, m5, m3
paddd m1, m6, m7 ; out1
psubd m6, m7 ; out6
psubd m7, m0, m9 ; out7
paddd m0, m9 ; out0
paddd m2, m5, m4 ; out2
psubd m5, m4 ; out5
psubd m4, m3, m8 ; out4
paddd m3, m8 ; out3
REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
ret
ALIGN function_align
.pass1_main:
call .pass1_main_part1
add cq, 32
packssdw m0, m1
packssdw m2, m3
packssdw m4, m5
packssdw m6, m7
pshufb m0, m14
pshufb m2, m14
pshufb m4, m14
pshufb m6, m14
punpckhdq m3, m0, m2
punpckldq m0, m2
punpckldq m2, m4, m6
punpckhdq m4, m6
vperm2i128 m1, m0, m2, 0x31 ; 4 6
vinserti128 m0, xm2, 1 ; 0 2
vinserti128 m2, m3, xm4, 1 ; 1 3
vperm2i128 m3, m4, 0x31 ; 5 7
ret
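
; .pass1_main packs each 8x8 dword result to words and, via idct32_shuf,
; punpck and vperm2i128, emits the interleaved row pairs noted above
; (0 2 / 1 3 / 4 6 / 5 7) that the 8bpc 8x32 second-pass routines consume.
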
.main_oddhalf_part1_fast_rect2:
REPX {paddd x, m11}, m0, m1, m2, m3
REPX {psrad x, 12 }, m0, m1, m2, m3
.main_oddhalf_part1_fast: ; lower half zero
vpbroadcastd m7, [pd_4091]
vpbroadcastd m8, [pd_201]
vpbroadcastd m6, [pd_m1380]
vpbroadcastd m9, [pd_3857]
vpbroadcastd m5, [pd_3703]
vpbroadcastd m10, [pd_1751]
vpbroadcastd m4, [pd_m2751]
vpbroadcastd m15, [pd_3035]
pmulld m7, m0
pmulld m0, m8
pmulld m6, m1
pmulld m1, m9
pmulld m5, m2
pmulld m2, m10
pmulld m4, m3
pmulld m3, m15
jmp .main_oddhalf_part1_fast2
.main_oddhalf_part1_rect2:
REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
.main_oddhalf_part1: ; in1, in7, in9, in15, in17, in23, in25, in31
ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 201, 4091 ; t16a, t31a
ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3857, 1380 ; t19a, t28a
ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1751, 3703 ; t18a, t29a
ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3035, 2751 ; t17a, t30a
.main_oddhalf_part1_fast2:
REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3
REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3
psubd m8, m0, m4 ; t17
paddd m0, m4 ; t16
psubd m4, m6, m2 ; t18
paddd m6, m2 ; t19
psubd m2, m1, m5 ; t29
paddd m1, m5 ; t28
psubd m5, m7, m3 ; t30
paddd m7, m3 ; t31
REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7
REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7
vpbroadcastd m15, [pd_4017]
vpbroadcastd m10, [pd_799]
ITX_MULSUB_2D 5, 8, 3, 9, _, 11, 10, 15 ; t17a, t30a
ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 2 ; t29a, t18a
psubd m3, m0, m6 ; t19a
paddd m0, m6 ; t16a
psubd m6, m7, m1 ; t28a
paddd m7, m1 ; t31a
psubd m1, m5, m4 ; t18
paddd m5, m4 ; t17
psubd m4, m8, m2 ; t29
paddd m8, m2 ; t30
REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8
REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
vpbroadcastd m15, [pd_3784]
vpbroadcastd m10, [pd_1567]
ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15 ; t18a, t29a
ITX_MULSUB_2D 6, 3, 2, 9, _, 11, 10, 15 ; t19, t28
mova [r6-32*4], m0
mova [r6-32*3], m5
mova [r6-32*2], m4
mova [r6-32*1], m6
mova [r6+32*0], m3
mova [r6+32*1], m1
mova [r6+32*2], m8
mova [r6+32*3], m7
ret
.main_oddhalf_part2_fast_rect2:
REPX {paddd x, m11}, m0, m1, m2, m3
REPX {psrad x, 12 }, m0, m1, m2, m3
.main_oddhalf_part2_fast: ; lower half zero
vpbroadcastd m7, [pd_m601]
vpbroadcastd m8, [pd_4052]
vpbroadcastd m6, [pd_3973]
vpbroadcastd m9, [pd_995]
vpbroadcastd m5, [pd_m2106]
vpbroadcastd m10, [pd_3513]
vpbroadcastd m4, [pd_3290]
vpbroadcastd m15, [pd_2440]
pmulld m7, m0
pmulld m0, m8
pmulld m6, m1
pmulld m1, m9
pmulld m5, m2
pmulld m2, m10
pmulld m4, m3
pmulld m3, m15
jmp .main_oddhalf_part2_fast2
.main_oddhalf_part2_rect2:
REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
.main_oddhalf_part2: ; in3, in5, in11, in13, in19, in21, in27, in29
ITX_MULSUB_2D 7, 0, 8, 9, 10, _, 4052, 601 ; t23a, t24a
ITX_MULSUB_2D 1, 6, 8, 9, 10, _, 995, 3973 ; t20a, t27a
ITX_MULSUB_2D 5, 2, 8, 9, 10, _, 3513, 2106 ; t21a, t26a
ITX_MULSUB_2D 3, 4, 8, 9, 10, _, 2440, 3290 ; t22a, t25a
.main_oddhalf_part2_fast2:
REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3
REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3
psubd m8, m0, m4 ; t25
paddd m0, m4 ; t24
psubd m4, m6, m2 ; t26
paddd m6, m2 ; t27
psubd m2, m1, m5 ; t21
paddd m1, m5 ; t20
psubd m5, m7, m3 ; t22
paddd m7, m3 ; t23
REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7
REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7
vpbroadcastd m15, [pd_2276]
vpbroadcastd m10, [pd_3406]
ITX_MULSUB_2D 4, 2, 3, 9, _, 11, 10, 15 ; t21a, t26a
ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 2 ; t25a, t22a
psubd m3, m0, m6 ; t27a
paddd m0, m6 ; t24a
psubd m6, m7, m1 ; t20a
paddd m7, m1 ; t23a
psubd m1, m5, m4 ; t21
paddd m5, m4 ; t22
psubd m4, m8, m2 ; t26
paddd m8, m2 ; t25
REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8
REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
vpbroadcastd m15, [pd_3784]
vpbroadcastd m10, [pd_1567]
ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 2 ; t26a, t21a
ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 2 ; t27, t20
mova m9, [r6-32*4] ; t16a
mova m10, [r6-32*3] ; t17
psubd m2, m9, m7 ; t23
paddd m9, m7 ; t16
psubd m7, m10, m5 ; t22a
paddd m10, m5 ; t17a
REPX {pmaxsd x, m12}, m9, m10, m2, m7
REPX {pminsd x, m13}, m9, m10, m2, m7
mova [r6-32*4], m9
mova [r6-32*3], m10
mova m9, [r6-32*2] ; t18a
mova m10, [r6-32*1] ; t19
psubd m5, m9, m1 ; t21
paddd m9, m1 ; t18
psubd m1, m10, m6 ; t20a
paddd m10, m6 ; t19a
REPX {pmaxsd x, m12}, m9, m10, m5, m1
REPX {pminsd x, m13}, m9, m10, m5, m1
mova [r6-32*2], m9
mova [r6-32*1], m10
mova m9, [r6+32*0] ; t28
mova m10, [r6+32*1] ; t29a
psubd m6, m9, m3 ; t27a
paddd m9, m3 ; t28a
psubd m3, m10, m4 ; t26
paddd m10, m4 ; t29
REPX {pmaxsd x, m12}, m9, m10, m6, m3
REPX {pminsd x, m13}, m9, m10, m6, m3
REPX {pmulld x, m14}, m6, m3, m1, m5
paddd m6, m11
paddd m3, m11
psubd m4, m6, m1 ; t20
paddd m6, m1 ; t27
psubd m1, m3, m5 ; t21a
paddd m3, m5 ; t26a
REPX {psrad x, 12 }, m4, m1, m3, m6
mova [r6+32*0], m4
mova [r6+32*1], m1
mova m4, [r6+32*2] ; t30
mova m1, [r6+32*3] ; t31a
psubd m5, m4, m8 ; t25a
paddd m4, m8 ; t30a
psubd m8, m1, m0 ; t24
paddd m1, m0 ; t31
REPX {pmaxsd x, m12}, m8, m5, m4, m1
REPX {pminsd x, m13}, m8, m5, m4, m1
REPX {pmulld x, m14}, m5, m8, m7, m2
paddd m5, m11
paddd m8, m11
psubd m0, m5, m7 ; t22
paddd m5, m7 ; t25
psubd m7, m8, m2 ; t23a
paddd m2, m8 ; t24a
REPX {psrad x, 12 }, m0, m7, m2, m5
mova [r6+32*2], m0
mova [r6+32*3], m7
mov r4, r6
add r6, 32*8
mova [r6-32*4], m2
mova [r6-32*3], m5
mova [r6-32*2], m3
mova [r6-32*1], m6
mova [r6+32*0], m9
mova [r6+32*1], m10
mova [r6+32*2], m4
mova [r6+32*3], m1
mov r5, r6
add r6, 32*8
ret
|
|
ALIGN function_align
|
|
.main_end:
|
|
psrld m11, 10 ; pd_2
|
|
IDCT32_END 0, 15, 8, 9, 10, 2
|
|
IDCT32_END 1, 14, 8, 9, 10, 2
|
|
punpckhwd m8, m0, m1 ; 16 17
|
|
punpcklwd m0, m1 ; 0 1
|
|
punpcklwd m1, m14, m15 ; 14 15
|
|
punpckhwd m14, m15 ; 30 31
|
|
mova [r5+32*3], m8
|
|
mova [r5+32*2], m14
|
|
IDCT32_END 2, 15, 8, 9, 10, 2
|
|
IDCT32_END 3, 14, 8, 9, 10, 2
|
|
punpckhwd m8, m2, m3 ; 18 19
|
|
punpcklwd m2, m3 ; 2 3
|
|
punpcklwd m3, m14, m15 ; 12 13
|
|
punpckhwd m14, m15 ; 28 29
|
|
mova [r5+32*1], m8
|
|
mova [r5+32*0], m14
|
|
IDCT32_END 4, 15, 8, 9, 10, 2
|
|
IDCT32_END 5, 14, 8, 9, 10, 2
|
|
punpckhwd m8, m4, m5 ; 20 21
|
|
punpcklwd m4, m5 ; 4 5
|
|
punpcklwd m5, m14, m15 ; 10 11
|
|
punpckhwd m14, m15 ; 26 27
|
|
mova [r5-32*1], m8
|
|
mova [r5-32*2], m14
|
|
IDCT32_END 6, 15, 8, 9, 10, 2
|
|
IDCT32_END 7, 14, 8, 9, 10, 2
|
|
punpckhwd m8, m6, m7 ; 22 23
|
|
punpcklwd m6, m7 ; 6 7
|
|
punpcklwd m7, m14, m15 ; 8 9
|
|
punpckhwd m14, m15 ; 24 25
|
|
mova [r5-32*3], m8
|
|
mova [r5-32*4], m14
|
|
.transpose:
|
|
punpckhdq m15, m3, m1
|
|
punpckldq m3, m1
|
|
punpckhdq m1, m4, m6
|
|
punpckldq m4, m6
|
|
punpckhdq m6, m0, m2
|
|
punpckldq m0, m2
|
|
punpckhdq m2, m7, m5
|
|
punpckldq m7, m5
|
|
punpcklqdq m5, m2, m15
|
|
punpckhqdq m2, m15
|
|
punpckhqdq m15, m7, m3
|
|
punpcklqdq m7, m3
|
|
punpckhqdq m3, m6, m1
|
|
punpcklqdq m6, m1
|
|
punpckhqdq m1, m0, m4
|
|
punpcklqdq m0, m4
|
|
vperm2i128 m4, m0, m7, 0x31
|
|
vinserti128 m0, xm7, 1
|
|
vperm2i128 m7, m3, m2, 0x31
|
|
vinserti128 m3, xm2, 1
|
|
vinserti128 m2, m6, xm5, 1
|
|
vperm2i128 m6, m5, 0x31
|
|
vperm2i128 m5, m1, m15, 0x31
|
|
vinserti128 m1, xm15, 1
|
|
ret
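
; Identity-identity 8x32: no butterflies, so each packed 8x8 quadrant
; just gets an (x + 5) >> 3 rounding, a word transpose, and is added
; straight to dst. The eob adjustment below rounds the 43/107/171
; thresholds up to whole 64-coefficient quadrants.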
cglobal inv_txfm_add_identity_identity_8x32_10bpc, 4, 7, 8, dst, stride, c, eob
    vpbroadcastd m7, [pixel_10bpc_max]
.pass1:
    vpbroadcastd m5, [pw_5]
    pxor m6, m6
    mov r6d, eobd
    add eobb, 21
    cmovc eobd, r6d ; 43, 107, 171 -> 64, 128, 192
    lea r6, [strideq*3]
    lea r5, [strideq*5]
    lea r4, [strideq+r6*2] ; strideq*7
.loop:
    mova m0, [cq+128*0]
    packssdw m0, [cq+128*1]
    mova m1, [cq+128*2]
    packssdw m1, [cq+128*3]
    mova m2, [cq+128*4]
    packssdw m2, [cq+128*5]
    mova m3, [cq+128*6]
    packssdw m3, [cq+128*7]
    REPX {paddsw x, m5}, m0, m1, m2, m3
    REPX {psraw x, 3 }, m0, m1, m2, m3
    call .main_zero
    add cq, 32
    lea dstq, [dstq+strideq*8]
    sub eobd, 64
    jge .loop
    RET
ALIGN function_align
.main_zero:
    REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
.main:
    punpckhwd m4, m0, m1
    punpcklwd m0, m1
    punpckhwd m1, m2, m3
    punpcklwd m2, m3
    punpckhwd m3, m0, m4
    punpcklwd m0, m4
    punpckhwd m4, m2, m1
    punpcklwd m2, m1
    punpckhqdq m1, m0, m2
    punpcklqdq m0, m2
    punpcklqdq m2, m3, m4
    punpckhqdq m3, m4
    mova xm4, [dstq+strideq*0]
    vinserti128 m4, [dstq+strideq*4], 1
    paddw m0, m4
    mova xm4, [dstq+strideq*1]
    vinserti128 m4, [dstq+r5 ], 1
    paddw m1, m4
    mova xm4, [dstq+strideq*2]
    vinserti128 m4, [dstq+r6*2 ], 1
    paddw m2, m4
    mova xm4, [dstq+r6 ]
    vinserti128 m4, [dstq+r4 ], 1
    paddw m3, m4
    REPX {pmaxsw x, m6}, m0, m1, m2, m3
    REPX {pminsw x, m7}, m0, m1, m2, m3
    mova [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*4], m0, 1
    mova [dstq+strideq*1], xm1
    vextracti128 [dstq+r5 ], m1, 1
    mova [dstq+strideq*2], xm2
    vextracti128 [dstq+r6*2 ], m2, 1
    mova [dstq+r6 ], xm3
    vextracti128 [dstq+r4 ], m3, 1
    ret
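
; 8x32 DCT, 12 bpc: pass 1 runs with 20-bit intermediate clipping
; (clip_20b), then pass 2 reloads the 18-bit clip range and reuses the
; 10 bpc odd-half kernels.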
cglobal inv_txfm_add_dct_dct_8x32_12bpc, 4, 7, 0, dst, stride, c, eob
    test eobd, eobd
    jz .dconly
    PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob
%undef cmp
    vpbroadcastd m11, [pd_2048]
    vpbroadcastd m12, [clip_20b_min]
    vpbroadcastd m13, [clip_20b_max]
    mov r4, cq
    lea r6, [rsp+32*4]
    call .pass1_main
    cmp eobd, 43
    jge .eob43
    jmp .pass2_fast
.eob43:
    call .pass1_main
    cmp eobd, 107
    jge .eob107
.pass2_fast:
    mov cq, r4
    vpbroadcastd m12, [clip_18b_min]
    vpbroadcastd m13, [clip_18b_max]
    pmaxsd m0, m12, [cq+128*1+ 0]
    pmaxsd m1, m12, [cq+128*7+ 0]
    pmaxsd m2, m12, [cq+128*1+32]
    pmaxsd m3, m12, [cq+128*7+32]
    REPX {pminsd x, m13}, m0, m1, m2, m3
    vpbroadcastd m14, [pd_2896]
    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast
    pmaxsd m0, m12, [cq+128*3+ 0]
    pmaxsd m1, m12, [cq+128*5+ 0]
    pmaxsd m2, m12, [cq+128*3+32]
    pmaxsd m3, m12, [cq+128*5+32]
    REPX {pminsd x, m13}, m0, m1, m2, m3
    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast
    pmaxsd m0, m12, [cq+128*2+ 0]
    pmaxsd m1, m12, [cq+128*6+ 0]
    pmaxsd m2, m12, [cq+128*2+32]
    pmaxsd m3, m12, [cq+128*6+32]
    REPX {pminsd x, m13}, m0, m1, m2, m3
    call m(idct_8x16_internal_10bpc).main_oddhalf_fast
    pmaxsd m0, m12, [cq+128*0+ 0]
    pmaxsd m1, m12, [cq+128*4+ 0]
    pmaxsd m2, m12, [cq+128*0+32]
    pmaxsd m3, m12, [cq+128*4+32]
    REPX {pminsd x, m13}, m0, m1, m2, m3
    pxor m4, m4
    REPX {mova x, m4}, m5, m6, m7
    call m(idct_8x8_internal_10bpc).main
    call m(idct_8x16_internal_10bpc).main_evenhalf
    jmp .pass2_end
.eob107:
    call .pass1_main
    cmp eobd, 171
    jge .eob171
    jmp .pass2
.eob171:
    call .pass1_main
.pass2:
    mov cq, r4
    vpbroadcastd m12, [clip_18b_min]
    vpbroadcastd m13, [clip_18b_max]
    pmaxsd m0, m12, [cq+128*1+ 0]
    pmaxsd m1, m12, [cq+128*7+ 0]
    pmaxsd m2, m12, [cq+128*1+32]
    pmaxsd m3, m12, [cq+128*7+32]
    pmaxsd m4, m12, [cq+128*1+64]
    pmaxsd m5, m12, [cq+128*7+64]
    pmaxsd m6, m12, [cq+128*1+96]
    pmaxsd m7, m12, [cq+128*7+96]
    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
    vpbroadcastd m14, [pd_2896]
    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1
    pmaxsd m0, m12, [cq+128*3+ 0]
    pmaxsd m1, m12, [cq+128*5+ 0]
    pmaxsd m2, m12, [cq+128*3+32]
    pmaxsd m3, m12, [cq+128*5+32]
    pmaxsd m4, m12, [cq+128*3+64]
    pmaxsd m5, m12, [cq+128*5+64]
    pmaxsd m6, m12, [cq+128*3+96]
    pmaxsd m7, m12, [cq+128*5+96]
    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2
    pmaxsd m0, m12, [cq+128*2+ 0]
    pmaxsd m1, m12, [cq+128*6+ 0]
    pmaxsd m2, m12, [cq+128*2+32]
    pmaxsd m3, m12, [cq+128*6+32]
    pmaxsd m4, m12, [cq+128*2+64]
    pmaxsd m5, m12, [cq+128*6+64]
    pmaxsd m6, m12, [cq+128*2+96]
    pmaxsd m7, m12, [cq+128*6+96]
    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
    call m(idct_8x16_internal_10bpc).main_oddhalf
    pmaxsd m0, m12, [cq+128*0+ 0]
    pmaxsd m1, m12, [cq+128*4+ 0]
    pmaxsd m2, m12, [cq+128*0+32]
    pmaxsd m3, m12, [cq+128*4+32]
    pmaxsd m4, m12, [cq+128*0+64]
    pmaxsd m5, m12, [cq+128*4+64]
    pmaxsd m6, m12, [cq+128*0+96]
    pmaxsd m7, m12, [cq+128*4+96]
    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
    call m(idct_8x8_internal_10bpc).main
    call m(idct_8x16_internal_10bpc).main_evenhalf
.pass2_end:
    psrld m11, 8 ; pd_8
    IDCT32_END 0, 15, 8, 9, 10, 4
    IDCT32_END 1, 14, 8, 9, 10, 4
    punpckhqdq m8, m0, m1 ; 16 17 (interleaved)
    punpcklqdq m0, m1 ; 0 1 (interleaved)
    punpcklqdq m1, m14, m15 ; 14 15 (interleaved)
    punpckhqdq m14, m15 ; 30 31 (interleaved)
    mova [r5+32*3], m8
    mova [r5+32*2], m14
    IDCT32_END 2, 15, 8, 9, 10, 4
    IDCT32_END 3, 14, 8, 9, 10, 4
    punpckhqdq m8, m2, m3 ; 18 19 (interleaved)
    punpcklqdq m2, m3 ; 2 3 (interleaved)
    punpcklqdq m3, m14, m15 ; 12 13 (interleaved)
    punpckhqdq m14, m15 ; 28 29 (interleaved)
    mova [r5+32*1], m8
    mova [r5+32*0], m14
    IDCT32_END 4, 15, 8, 9, 10, 4
    IDCT32_END 5, 14, 8, 9, 10, 4
    punpckhqdq m8, m4, m5 ; 20 21 (interleaved)
    punpcklqdq m4, m5 ; 4 5 (interleaved)
    punpcklqdq m5, m14, m15 ; 10 11 (interleaved)
    punpckhqdq m14, m15 ; 26 27 (interleaved)
    mova [r5-32*1], m8
    mova [r5-32*2], m14
    IDCT32_END 6, 15, 8, 9, 10, 4
    IDCT32_END 7, 14, 8, 9, 10, 4
    punpckhqdq m8, m6, m7 ; 22 23 (interleaved)
    punpcklqdq m6, m7 ; 6 7 (interleaved)
    punpcklqdq m7, m14, m15 ; 8 9 (interleaved)
    punpckhqdq m14, m15 ; 24 25 (interleaved)
    mova [r5-32*3], m8
    mova [r5-32*4], m14
    mova m15, m1
.end:
    vpermq m0, m0, q3120
    vpermq m1, m2, q3120
    call m(idct_8x8_internal_12bpc).write_8x4_start
    call m(idct_8x8_internal_10bpc).write_8x4
    vpermq m0, m4, q3120
    vpermq m1, m6, q3120
    call m(idct_8x8_internal_10bpc).write_8x4
    vpermq m0, m7, q3120
    vpermq m1, m5, q3120
    call m(idct_8x8_internal_10bpc).write_8x4
    vpermq m0, m3, q3120
    vpermq m1, m15, q3120
    call m(idct_8x8_internal_10bpc).write_8x4
    vpermq m0, [r5+32*3], q3120
    vpermq m1, [r5+32*1], q3120
    call m(idct_8x8_internal_10bpc).write_8x4
    vpermq m0, [r5-32*1], q3120
    vpermq m1, [r5-32*3], q3120
    call m(idct_8x8_internal_10bpc).write_8x4
    vpermq m0, [r5-32*4], q3120
    vpermq m1, [r5-32*2], q3120
    call m(idct_8x8_internal_10bpc).write_8x4
    vpermq m0, [r5+32*0], q3120
    vpermq m1, [r5+32*2], q3120
    call m(idct_8x8_internal_10bpc).write_8x4
    RET
.dconly:
    imul r6d, [cq], 181
    vpbroadcastd m2, [dconly_12bpc]
    mov [cq], eobd ; 0
    or r3d, 32
    add r6d, 640
    sar r6d, 10
    jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3
ALIGN function_align
.pass1_main:
    call m(inv_txfm_add_dct_dct_8x32_10bpc).pass1_main_part1
    TRANSPOSE_8X8_DWORD 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15
    mova [cq+128*0], m0
    mova [cq+128*1], m1
    mova [cq+128*2], m2
    mova [cq+128*3], m3
    mova [cq+128*4], m4
    mova [cq+128*5], m5
    mova [cq+128*6], m6
    mova [cq+128*7], m7
    add cq, 32
    ret
ALIGN function_align
.main_end:
    psrld m11, 10 ; pd_2
    IDCT32_END 0, 15, 8, 9, 10, 2, 0
    mova [cq+32*16], m8
    mova [cq+32*31], m9
    IDCT32_END 1, 14, 8, 9, 10, 2, 0
    mova [cq+32*17], m8
    mova [cq+32*30], m9
    mova [cq+32*14], m14
    IDCT32_END 2, 14, 8, 9, 10, 2, 0
    mova [cq+32*18], m8
    mova [cq+32*29], m9
    mova [cq+32*13], m14
    IDCT32_END 3, 14, 8, 9, 10, 2, 0
    mova [cq+32*19], m8
    mova [cq+32*28], m9
    mova [cq+32*12], m14
    IDCT32_END 4, 14, 8, 9, 10, 2, 0
    mova [cq+32*20], m8
    mova [cq+32*27], m9
    mova [cq+32* 0], m0
    mova [cq+32* 1], m1
    mova [cq+32* 2], m2
    IDCT32_END 5, 10, 0, 1, 2, 2, 0
    mova [cq+32*21], m0
    mova [cq+32*26], m1
    IDCT32_END 6, 9, 0, 1, 2, 2, 0
    mova [cq+32*22], m0
    mova [cq+32*25], m1
    IDCT32_END 7, 8, 0, 1, 2, 2, 0
    mova [cq+32*23], m0
    mova [cq+32*24], m1
    mova m0, [cq+32* 0]
    mova m1, [cq+32* 1]
    mova m2, [cq+32* 2]
    mova m11, m14
    mova m12, [cq+32*12]
    mova m13, [cq+32*13]
    mova m14, [cq+32*14]
    ret

cglobal inv_txfm_add_identity_identity_8x32_12bpc, 4, 7, 8, dst, stride, c, eob
    vpbroadcastd m7, [pixel_12bpc_max]
    jmp m(inv_txfm_add_identity_identity_8x32_10bpc).pass1
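
; 32x8 DCT, 10 bpc. The dc-only path chains two x181 (~128*sqrt(2))
; fixed-point stages with rounding before splatting the resulting DC
; value over the 8 rows of 32 pixels.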
cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob
    test eobd, eobd
    jnz .full
    imul r6d, [cq], 181
    vpbroadcastd m3, [dconly_10bpc]
    mov [cq], eobd ; 0
    or r3d, 8
.dconly:
    add r6d, 640
    sar r6d, 10
.dconly2:
    imul r6d, 181
    add r6d, 2176
    sar r6d, 12
    movd xm0, r6d
    paddsw xm0, xm3
    vpbroadcastw m0, xm0
.dconly_loop:
    paddsw m1, m0, [dstq+32*0]
    paddsw m2, m0, [dstq+32*1]
    psubusw m1, m3
    psubusw m2, m3
    mova [dstq+32*0], m1
    mova [dstq+32*1], m2
    add dstq, strideq
    dec r3d
    jg .dconly_loop
    RET
.full:
    PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob
    lea r6, [rsp+32*4]
    vpbroadcastd m12, [clip_18b_min]
    vpbroadcastd m13, [clip_18b_max]
    call .pass1
    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end
    lea r6, [deint_shuf+128]
    vpbroadcastd m11, [pw_2048]
    mov r4, dstq
    call .pass2
    mova m0, [r5+32*3] ; 16 17
    mova m1, [r5+32*2] ; 30 31
    mova m2, [r5+32*1] ; 18 19
    mova m3, [r5+32*0] ; 28 29
    mova m4, [r5-32*1] ; 20 21
    mova m5, [r5-32*2] ; 26 27
    mova m6, [r5-32*3] ; 22 23
    mova m7, [r5-32*4] ; 24 25
    call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
    lea dstq, [r4+32]
    call .pass2
    RET
ALIGN function_align
.pass2:
    call m(idct_16x8_internal_8bpc).main
    REPX {pmulhrsw x, m11}, m0, m1, m2, m3
    call m(idct_16x8_internal_10bpc).write_16x4_start
    pmulhrsw m0, m11, m4
    pmulhrsw m1, m11, m5
    pmulhrsw m2, m11, m6
    pmulhrsw m3, m11, m7
    jmp m(idct_16x8_internal_10bpc).write_16x4_zero
ALIGN function_align
.pass1:
    mova m0, [cq+32* 1]
    mova m1, [cq+32* 7]
    mova m2, [cq+32* 9]
    mova m3, [cq+32*15]
    mova m4, [cq+32*17]
    mova m5, [cq+32*23]
    mova m6, [cq+32*25]
    mova m7, [cq+32*31]
    vpbroadcastd m11, [pd_2048]
    vpbroadcastd m14, [pd_2896]
    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1
    mova m0, [cq+32* 3]
    mova m1, [cq+32* 5]
    mova m2, [cq+32*11]
    mova m3, [cq+32*13]
    mova m4, [cq+32*19]
    mova m5, [cq+32*21]
    mova m6, [cq+32*27]
    mova m7, [cq+32*29]
    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2
    mova m0, [cq+32* 2]
    mova m1, [cq+32* 6]
    mova m2, [cq+32*10]
    mova m3, [cq+32*14]
    mova m4, [cq+32*18]
    mova m5, [cq+32*22]
    mova m6, [cq+32*26]
    mova m7, [cq+32*30]
    call m(idct_8x16_internal_10bpc).main_oddhalf
    mova m0, [cq+32* 0]
    mova m1, [cq+32* 4]
    mova m2, [cq+32* 8]
    mova m3, [cq+32*12]
    mova m4, [cq+32*16]
    mova m5, [cq+32*20]
    mova m6, [cq+32*24]
    mova m7, [cq+32*28]
    call m(idct_8x8_internal_10bpc).main
    call m(idct_8x16_internal_10bpc).main_evenhalf
    ret
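
; Identity-identity 32x8: the whole scale reduces to a single pmulhrsw
; by pw_4096 (round(x/8)) on the packed coefficients before the shared
; transpose-and-add path from the 8x32 variant.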
cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 8, dst, stride, c, eob
    vpbroadcastd m7, [pixel_10bpc_max]
.pass1:
    vpbroadcastd m5, [pw_4096]
    pxor m6, m6
    mov r6d, eobd
    add eobb, 21
    cmovc eobd, r6d
    lea r6, [strideq*3]
    lea r5, [strideq*5]
    lea r4, [strideq+r6*2] ; strideq*7
.loop:
    mova m0, [cq+32*0]
    packssdw m0, [cq+32*1]
    mova m1, [cq+32*2]
    packssdw m1, [cq+32*3]
    REPX {mova [cq+32*x], m6}, 0, 1, 2, 3
    add cq, 32*8
    mova m2, [cq-32*4]
    packssdw m2, [cq-32*3]
    mova m3, [cq-32*2]
    packssdw m3, [cq-32*1]
    REPX {pmulhrsw x, m5}, m0, m1, m2, m3
    REPX {mova [cq+32*x], m6}, -4, -3, -2, -1
    call m(inv_txfm_add_identity_identity_8x32_10bpc).main
    add dstq, 16
    sub eobd, 64
    jge .loop
    RET

cglobal inv_txfm_add_dct_dct_32x8_12bpc, 4, 7, 0, dst, stride, c, eob
    test eobd, eobd
    jnz .full
    imul r6d, [cq], 181
    vpbroadcastd m3, [dconly_12bpc]
    mov [cq], eobd ; 0
    or r3d, 8
    jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly
.full:
    PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob
    lea r6, [rsp+32*4]
    vpbroadcastd m12, [clip_20b_min]
    vpbroadcastd m13, [clip_20b_max]
    call m(inv_txfm_add_dct_dct_32x8_10bpc).pass1
    call m(inv_txfm_add_dct_dct_8x32_12bpc).main_end
    mov r4, dstq
    call m(idct_16x8_internal_12bpc).pass2_main
    mova m0, [cq+32* 0] ; 16
    mova m1, [cq+32* 1] ; 17
    mova m2, [cq+32* 2] ; 18
    mova m3, [cq+32* 3] ; 19
    mova m4, [cq+32* 4] ; 20
    mova m5, [cq+32* 5] ; 21
    mova m6, [cq+32* 6] ; 22
    mova m7, [cq+32* 7] ; 23
    mova m8, [cq+32* 8] ; 24
    mova m9, [cq+32* 9] ; 25
    mova m10, [cq+32*10] ; 26
    mova m11, [cq+32*11] ; 27
    mova m12, [cq+32*12] ; 28
    mova m13, [cq+32*13] ; 29
    mova m14, [cq+32*14] ; 30
    mova m15, [cq+32*15] ; 31
    lea dstq, [r4+32]
    call m(idct_16x8_internal_12bpc).pass2_main
    RET

cglobal inv_txfm_add_identity_identity_32x8_12bpc, 4, 7, 8, dst, stride, c, eob
    vpbroadcastd m7, [pixel_12bpc_max]
    jmp m(inv_txfm_add_identity_identity_32x8_10bpc).pass1
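
; IDCT32_PASS2_END stores one pair of mirrored output rows: it combines
; the stored idct32 halves, scales by pw_2048 (m15) via pmulhrsw, adds
; to dst (top half) and r2 (mirrored bottom half), then clips to the
; pixel range.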
%macro IDCT32_PASS2_END 6 ; coefs[1-2], tmp[1-2], offset[1-2]
    mova m%4, [%2]
    paddsw m%3, m%1, m%4
    psubsw m%1, m%4
%if %1 == 0
    pxor m6, m6
%endif
    pmulhrsw m%3, m15
    pmulhrsw m%1, m15
    paddw m%3, [dstq+%5]
    paddw m%1, [r2+%6]
    pmaxsw m%3, m6
    pmaxsw m%1, m6
    pminsw m%3, m7
    pminsw m%1, m7
    mova [dstq+%5], m%3
    mova [r2+%6], m%1
%endmacro
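
; 16x32 DCT, 10 bpc: pass 1 runs the rect2-scaled 16-point row idct on
; up to four 8-row coefficient tiles (eob tiers 44/151/279), then
; pass 2 does the 32-point column idct via the 8 bpc 16x32 kernels.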
cglobal inv_txfm_add_dct_dct_16x32_10bpc, 4, 7, 0, dst, stride, c, eob
    test eobd, eobd
    jz .dconly
    PROLOGUE 0, 8, 16, 32*36, dst, stride, c, eob
%undef cmp
    vpbroadcastd m11, [pd_2048]
    vpbroadcastd m12, [clip_18b_min]
    vpbroadcastd m13, [clip_18b_max]
    vpbroadcastd m14, [pd_2896]
    lea r6, [rsp+32*16]
    lea r4, [r6+32*8]
    lea r5, [r6+32*16]
    call .main
    sub eobd, 44
    jge .eob44
    vperm2i128 m2, m0, m3, 0x31 ; 5
    vinserti128 m0, xm3, 1 ; 1
    vperm2i128 m3, m1, m4, 0x31 ; 7
    vinserti128 m1, xm4, 1 ; 3
    pxor m4, m4
    REPX {mova x, m4}, m5, m6, m7
    REPX {mova [r6+32*x], m4}, 0, 1, 2, 3
    jmp .fast
.dconly:
    imul r6d, [cq], 181
    vpbroadcastd m3, [dconly_10bpc]
    mov [cq], eobd ; 0
    or r3d, 32
    add r6d, 128
    sar r6d, 8
    imul r6d, 181
    jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2
.eob44:
    mova [r4+16*0], xm0
    mova [r4+16*1], xm3
    mova [r4+16*2], xm1
    mova [r4+16*3], xm4
    vextracti128 [r4+16*4], m0, 1
    vextracti128 [r4+16*5], m3, 1
    vextracti128 [r4+16*6], m1, 1
    vextracti128 [r4+16*7], m4, 1
    call .main
    sub eobd, 107
    jge .eob151
    vperm2i128 m7, m1, m4, 0x31 ; 15
    vinserti128 m5, m1, xm4, 1 ; 11
    vperm2i128 m6, m0, m3, 0x31 ; 13
    vinserti128 m4, m0, xm3, 1 ; 9
    mova m0, [r4+32*0]
    mova m1, [r4+32*1]
    mova m2, [r4+32*2]
    mova m3, [r4+32*3]
.fast:
    lea r6, [pw_5+128]
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
    pxor m8, m8
    REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
    jmp .idct16
.eob151:
    mova [r4-16*8], xm0
    mova [r4-16*7], xm3
    mova [r4-16*6], xm1
    mova [r4-16*5], xm4
    vextracti128 [r4-16*4], m0, 1
    vextracti128 [r4-16*3], m3, 1
    vextracti128 [r4-16*2], m1, 1
    vextracti128 [r4-16*1], m4, 1
    call .main
    sub eobd, 128
    jge .eob279
    vperm2i128 m10, m0, m3, 0x31 ; 21
    vinserti128 m8, m0, xm3, 1 ; 17
    vperm2i128 m11, m1, m4, 0x31 ; 23
    vinserti128 m9, m1, xm4, 1 ; 19
    pxor m12, m12
    REPX {mova x, m12}, m13, m14, m15
    REPX {mova [r6+32*x], m12}, 0, 1, 2, 3
    jmp .full
.eob279:
    mova [r5+16*0], xm0
    mova [r5+16*1], xm3
    mova [r5+16*2], xm1
    mova [r5+16*3], xm4
    vextracti128 [r5+16*4], m0, 1
    vextracti128 [r5+16*5], m3, 1
    vextracti128 [r5+16*6], m1, 1
    vextracti128 [r5+16*7], m4, 1
    call .main
    vperm2i128 m14, m0, m3, 0x31 ; 29
    vinserti128 m12, m0, xm3, 1 ; 25
    vperm2i128 m15, m1, m4, 0x31 ; 31
    vinserti128 m13, m1, xm4, 1 ; 27
    mova m8, [r5+32*0]
    mova m9, [r5+32*1]
    mova m10, [r5+32*2]
    mova m11, [r5+32*3]
.full:
    mova m0, [r4+32*0]
    mova m1, [r4+32*1]
    mova m2, [r4+32*2]
    mova m3, [r4+32*3]
    mova m4, [r4-32*4]
    mova m5, [r4-32*3]
    mova m6, [r4-32*2]
    mova m7, [r4-32*1]
    lea r6, [pw_5+128]
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
    lea r3, [rsp+32*8]
    mova m8, [r3+32*0]
    mova m9, [r3+32*1]
    mova m10, [r3+32*2]
    mova m11, [r3+32*3]
    mova m12, [r3-32*4]
    mova m13, [r3-32*3]
    mova m14, [r3-32*2]
    mova m15, [r3-32*1]
.idct16:
    lea r3, [rsp+32*16]
    mova m0, [r3+32*0]
    mova m1, [r3+32*1]
    mova m2, [r3+32*2]
    mova m3, [r3+32*3]
    mova m4, [r3-32*4]
    mova m5, [r3-32*3]
    mova m6, [r3-32*2]
    mova m7, [r3-32*1]
    mova [rsp], m15
    call m(idct_16x16_internal_8bpc).main
    imul r2, strideq, 19
    lea r3, [strideq*3]
    add r2, dstq
    call .pass2_end
    RET
ALIGN function_align
.main:
    pmulld m0, m14, [cq+128* 1]
    pmulld m1, m14, [cq+128* 3]
    pmulld m2, m14, [cq+128* 5]
    pmulld m3, m14, [cq+128* 7]
    pmulld m4, m14, [cq+128* 9]
    pmulld m5, m14, [cq+128*11]
    pmulld m6, m14, [cq+128*13]
    pmulld m7, m14, [cq+128*15]
    call m(idct_8x16_internal_10bpc).main_oddhalf_rect2
    pmulld m0, m14, [cq+128* 0]
    pmulld m1, m14, [cq+128* 2]
    pmulld m2, m14, [cq+128* 4]
    pmulld m3, m14, [cq+128* 6]
    pmulld m4, m14, [cq+128* 8]
    pmulld m5, m14, [cq+128*10]
    pmulld m6, m14, [cq+128*12]
    pmulld m7, m14, [cq+128*14]
    call m(idct_8x8_internal_10bpc).main_rect2
    call m(idct_8x16_internal_10bpc).main_evenhalf
    psrld m15, m11, 11 ; pd_1
    mova m8, [r6-32*4]
    mova m9, [r6-32*3]
    REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
    psubd m10, m0, m8 ; out15
    paddd m0, m8 ; out0
    mova m8, [r6-32*2]
    paddd m15, m1, m9 ; out1
    psubd m1, m9 ; out14
    mova m9, [r6-32*1]
    REPX {psrad x, 1}, m0, m15, m10, m1
    packssdw m0, m15
    packssdw m1, m10
    psubd m10, m2, m8 ; out13
    paddd m2, m8 ; out2
    mova m8, [r6+32*0]
    paddd m15, m3, m9 ; out3
    psubd m3, m9 ; out12
    mova m9, [r6+32*1]
    REPX {psrad x, 1}, m2, m15, m10, m3
    packssdw m2, m15
    packssdw m3, m10
    psubd m10, m4, m8 ; out11
    paddd m4, m8 ; out4
    mova m8, [r6+32*2]
    paddd m15, m5, m9 ; out5
    psubd m5, m9 ; out10
    mova m9, [r6+32*3]
    REPX {psrad x, 1}, m4, m10, m15, m5
    packssdw m4, m15
    packssdw m5, m10
    psubd m10, m6, m8 ; out9
    paddd m6, m8 ; out6
    paddd m15, m7, m9 ; out7
    psubd m7, m9 ; out8
    REPX {psrad x, 1}, m6, m10, m15, m7
    packssdw m6, m15
    packssdw m7, m10
    punpckhwd m8, m0, m2
    punpcklwd m0, m2
    punpckhwd m2, m3, m1
    punpcklwd m3, m1
    punpckhwd m1, m4, m6
    punpcklwd m4, m6
    punpcklwd m6, m7, m5
    punpckhwd m7, m5
    pxor m5, m5
    mov r7d, 128*13
.main_zero_loop:
    mova [cq+r7-128*1], m5
    mova [cq+r7+128*0], m5
    mova [cq+r7+128*1], m5
    mova [cq+r7+128*2], m5
    sub r7d, 128*4
    jg .main_zero_loop
    add cq, 32
    punpcklwd m5, m3, m2
    punpckhwd m3, m2
    punpcklwd m2, m4, m1
    punpckhwd m4, m1
    punpckhwd m1, m0, m8
    punpcklwd m0, m8
    punpckhwd m8, m6, m7
    punpcklwd m6, m7
    punpcklqdq m7, m1, m4
    punpckhqdq m1, m4
    punpckhqdq m4, m8, m3
    punpcklqdq m8, m3
    punpckhqdq m3, m6, m5
    punpcklqdq m6, m5
    punpcklqdq m5, m0, m2
    punpckhqdq m0, m2
    mova [r6+16*0], xm5
    mova [r6+16*1], xm6
    mova [r6+16*2], xm7
    mova [r6+16*3], xm8
    vextracti128 [r6+16*4], m5, 1
    vextracti128 [r6+16*5], m6, 1
    vextracti128 [r6+16*6], m7, 1
    vextracti128 [r6+16*7], m8, 1
    sub r6, 32*4
    ret
ALIGN function_align
.pass2_end:
    mova [rsp+gprsize+32*0], m6
    mova [rsp+gprsize+32*2], m7
    mova [rsp+gprsize+32*3], m15
    vpbroadcastd m15, [pw_2048]
    vpbroadcastd m7, [pixel_10bpc_max]
    IDCT32_PASS2_END 0, r5+32*3, 1, 6, strideq*0, r3*4
    IDCT32_PASS2_END 4, r5-32*1, 0, 1, strideq*4, strideq*8
    IDCT32_PASS2_END 8, r4+32*3, 0, 4, strideq*8, strideq*4
    IDCT32_PASS2_END 12, r4-32*1, 0, 4, r3*4, strideq*0
    add dstq, strideq
    sub r2, strideq
    mova m1, [rsp+gprsize+32*1]
    IDCT32_PASS2_END 1, r5+32*2, 0, 4, strideq*0, r3*4
    IDCT32_PASS2_END 5, r5-32*2, 0, 4, strideq*4, strideq*8
    IDCT32_PASS2_END 9, r4+32*2, 0, 4, strideq*8, strideq*4
    IDCT32_PASS2_END 13, r4-32*2, 0, 4, r3*4, strideq*0
    add dstq, strideq
    sub r2, strideq
    mova m1, [rsp+gprsize+32*0]
    IDCT32_PASS2_END 2, r5+32*1, 0, 4, strideq*0, r3*4
    IDCT32_PASS2_END 1, r5-32*3, 0, 4, strideq*4, strideq*8
    IDCT32_PASS2_END 10, r4+32*1, 0, 4, strideq*8, strideq*4
    IDCT32_PASS2_END 14, r4-32*3, 0, 4, r3*4, strideq*0
    add dstq, strideq
    sub r2, strideq
    mova m1, [rsp+gprsize+32*2]
    mova m2, [rsp+gprsize+32*3]
    IDCT32_PASS2_END 3, r5+32*0, 0, 4, strideq*0, r3*4
    IDCT32_PASS2_END 1, r5-32*4, 0, 4, strideq*4, strideq*8
    IDCT32_PASS2_END 11, r4+32*0, 0, 4, strideq*8, strideq*4
    IDCT32_PASS2_END 2, r4-32*4, 0, 4, r3*4, strideq*0
    ret
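
; Identity-identity 16x32: each 16x8 tile is rect2-scaled by pw_2896x8,
; run through the 16-point identity (IDTX16 with pw_1697x16), rounded
; with pw_8192 and written via write_2x8x2; the eob ladder
; (36/143/271/399) selects the live tiles.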
cglobal inv_txfm_add_identity_identity_16x32_10bpc, 4, 7, 12, dst, stride, c, eob
    vpbroadcastd m7, [pixel_10bpc_max]
.pass1:
    vpbroadcastd m8, [pw_2896x8]
    vpbroadcastd m9, [pw_1697x16]
    vpbroadcastd m11, [pw_8192]
    lea r6, [strideq*5]
    pxor m6, m6
    paddw m10, m11, m11 ; pw_16384
    mov r5, dstq
    call .main
    sub eobd, 36
    jl .ret
    add cq, 128*8
    lea dstq, [r5+16]
    call .main
    sub cq, 128*8-32
    lea dstq, [r5+strideq*8]
    mov r5, dstq
    call .main
    sub eobd, 107 ; eob < 143
    jl .ret
    add cq, 128*8
    lea dstq, [r5+16]
    call .main
    sub cq, 128*8-32
    lea dstq, [r5+strideq*8]
    mov r5, dstq
    call .main
    sub eobd, 128 ; eob < 271
    jl .ret
    add cq, 128*8
    lea dstq, [r5+16]
    call .main
    sub cq, 128*8-32
    lea dstq, [r5+strideq*8]
    mov r5, dstq
    call .main
    sub eobd, 128 ; eob < 399
    jl .ret
    add cq, 128*8
    lea dstq, [r5+16]
    call .main
.ret:
    RET
ALIGN function_align
.main:
    mova m0, [cq+128*0]
    packssdw m0, [cq+128*1]
    mova m1, [cq+128*2]
    packssdw m1, [cq+128*3]
    mova m2, [cq+128*4]
    packssdw m2, [cq+128*5]
    mova m3, [cq+128*6]
    packssdw m3, [cq+128*7]
    REPX {pmulhrsw x, m8 }, m0, m1, m2, m3
    REPX {IDTX16 x, 4, 9, 10}, 0, 1, 2, 3
    REPX {pmulhrsw x, m11}, m0, m1, m2, m3
    REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
.main2:
    punpckhwd m4, m0, m1
    punpcklwd m0, m1
    punpckhwd m1, m2, m3
    punpcklwd m2, m3
    punpckhwd m3, m0, m4
    punpcklwd m0, m4
    punpcklwd m4, m2, m1
    punpckhwd m2, m1
    punpckhqdq m1, m0, m4
    punpcklqdq m0, m4
    call m(iidentity_8x8_internal_10bpc).write_2x8x2
    punpcklqdq m0, m3, m2
    punpckhqdq m1, m3, m2
    jmp m(iidentity_8x8_internal_10bpc).write_2x8x2

cglobal inv_txfm_add_identity_identity_16x32_12bpc, 4, 7, 12, dst, stride, c, eob
    vpbroadcastd m7, [pixel_12bpc_max]
    jmp m(inv_txfm_add_identity_identity_16x32_10bpc).pass1

cglobal inv_txfm_add_dct_dct_32x16_10bpc, 4, 7, 0, dst, stride, c, eob
    test eobd, eobd
    jz .dconly
    PROLOGUE 0, 8, 16, 32*40, dst, stride, c, eob
%undef cmp
    vpbroadcastd m12, [clip_18b_min]
    vpbroadcastd m13, [clip_18b_max]
    lea r6, [rsp+32*4]
    call .main
    cmp eobd, 36
    jge .full
    call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
    pxor m8, m8
    REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp]
    lea r6, [pw_5+128]
    mov r7, dstq
    call m(idct_16x16_internal_8bpc).main
    call .write_16x16
    mova m0, [r5+32*3]
    mova m1, [r5+32*2]
    mova m2, [r5+32*1]
    mova m3, [r5+32*0]
    mova m4, [r5-32*1]
    mova m5, [r5-32*2]
    mova m6, [r5-32*3]
    mova m7, [r5-32*4]
    call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
    pxor m8, m8
    REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp]
    jmp .end
.dconly:
    imul r6d, [cq], 181
    vpbroadcastd m3, [dconly_10bpc]
    mov [cq], eobd ; 0
    or r3d, 16
    add r6d, 128
    sar r6d, 8
    imul r6d, 181
    add r6d, 384
    sar r6d, 9
    jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2
.full:
    add cq, 32
    mova [r4+32*3], m0
    mova [r4+32*2], m1
    mova [r4+32*1], m2
    mova [r4+32*0], m3
    mova [r4-32*1], m4
    mova [r4-32*2], m5
    mova [r4-32*3], m6
    mova [r4-32*4], m7
    call .main
    sub r4, 32*16 ; topleft 16x8
    call .transpose_16x16
    lea r6, [pw_5+128]
    mov r7, dstq
    call m(idct_16x16_internal_8bpc).main
    call .write_16x16
    mova m0, [r5+32*3]
    mova m1, [r5+32*2]
    mova m2, [r5+32*1]
    mova m3, [r5+32*0]
    mova m4, [r5-32*1]
    mova m5, [r5-32*2]
    mova m6, [r5-32*3]
    mova m7, [r5-32*4]
    add r4, 32*8 ; bottomleft 16x8
    call .transpose_16x16
.end:
    lea dstq, [r7+32]
    call m(idct_16x16_internal_8bpc).main
    call .write_16x16
    RET
ALIGN function_align
.transpose_16x16:
    punpckhdq m8, m3, m1
    punpckldq m3, m1
    punpckhdq m1, m0, m2
    punpckldq m0, m2
    punpckhdq m2, m7, m5
    punpckldq m7, m5
    punpckhdq m5, m4, m6
    punpckldq m4, m6
    punpckhqdq m6, m0, m4
    punpcklqdq m0, m4
    punpckhqdq m4, m1, m5
    punpcklqdq m1, m5
    punpckhqdq m5, m7, m3
    punpcklqdq m7, m3
    punpckhqdq m3, m2, m8
    punpcklqdq m2, m8
    vinserti128 m8, m0, xm7, 1
    vperm2i128 m12, m0, m7, 0x31
    vinserti128 m9, m6, xm5, 1
    vperm2i128 m13, m6, m5, 0x31
    vinserti128 m10, m1, xm2, 1
    vperm2i128 m14, m1, m2, 0x31
    vinserti128 m11, m4, xm3, 1
    vperm2i128 m15, m4, m3, 0x31
    mova m0, [r4+32*3]
    mova m1, [r4+32*2]
    mova m2, [r4+32*1]
    mova m3, [r4+32*0]
    mova m4, [r4-32*1]
    mova m5, [r4-32*2]
    mova m6, [r4-32*3]
    mova m7, [r4-32*4]
    mova [rsp+gprsize], m15
    jmp m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
ALIGN function_align
.main:
    vpbroadcastd m14, [pd_2896]
    vpbroadcastd m11, [pd_2048]
    pmulld m0, m14, [cq+64* 1]
    pmulld m1, m14, [cq+64* 7]
    pmulld m2, m14, [cq+64* 9]
    pmulld m3, m14, [cq+64*15]
    pmulld m4, m14, [cq+64*17]
    pmulld m5, m14, [cq+64*23]
    pmulld m6, m14, [cq+64*25]
    pmulld m7, m14, [cq+64*31]
    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_rect2
    pmulld m0, m14, [cq+64* 3]
    pmulld m1, m14, [cq+64* 5]
    pmulld m2, m14, [cq+64*11]
    pmulld m3, m14, [cq+64*13]
    pmulld m4, m14, [cq+64*19]
    pmulld m5, m14, [cq+64*21]
    pmulld m6, m14, [cq+64*27]
    pmulld m7, m14, [cq+64*29]
    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_rect2
    pmulld m0, m14, [cq+64* 2]
    pmulld m1, m14, [cq+64* 6]
    pmulld m2, m14, [cq+64*10]
    pmulld m3, m14, [cq+64*14]
    pmulld m4, m14, [cq+64*18]
    pmulld m5, m14, [cq+64*22]
    pmulld m6, m14, [cq+64*26]
    pmulld m7, m14, [cq+64*30]
    call m(idct_8x16_internal_10bpc).main_oddhalf_rect2
    pmulld m0, m14, [cq+64* 0]
    pmulld m1, m14, [cq+64* 4]
    pmulld m2, m14, [cq+64* 8]
    pmulld m3, m14, [cq+64*12]
    pmulld m4, m14, [cq+64*16]
    pmulld m5, m14, [cq+64*20]
    pmulld m6, m14, [cq+64*24]
    pmulld m7, m14, [cq+64*28]
    call m(idct_8x8_internal_10bpc).main_rect2
    call m(idct_8x16_internal_10bpc).main_evenhalf
    pxor m8, m8
    mov r7d, 64*30
.main_zero_loop:
    mova [cq+r7-64*2], m8
    mova [cq+r7-64*1], m8
    mova [cq+r7+64*0], m8
    mova [cq+r7+64*1], m8
    sub r7d, 64*4
    jg .main_zero_loop
.main_end:
    psrld m11, 11 ; pd_1
    IDCT32_END 0, 15, 8, 9, 10, 1
    IDCT32_END 1, 14, 8, 9, 10, 1
    punpckhwd m8, m0, m1 ; 16 17
    punpcklwd m0, m1 ; 0 1
    punpcklwd m1, m14, m15 ; 14 15
    punpckhwd m14, m15 ; 30 31
    mova [r5+32*3], m8
    mova [r5+32*2], m14
    IDCT32_END 2, 15, 8, 9, 10, 1
    IDCT32_END 3, 14, 8, 9, 10, 1
    punpckhwd m8, m2, m3 ; 18 19
    punpcklwd m2, m3 ; 2 3
    punpcklwd m3, m14, m15 ; 12 13
    punpckhwd m14, m15 ; 28 29
    mova [r5+32*1], m8
    mova [r5+32*0], m14
    IDCT32_END 4, 15, 8, 9, 10, 1
    IDCT32_END 5, 14, 8, 9, 10, 1
    punpckhwd m8, m4, m5 ; 20 21
    punpcklwd m4, m5 ; 4 5
    punpcklwd m5, m14, m15 ; 10 11
    punpckhwd m14, m15 ; 26 27
    mova [r5-32*1], m8
    mova [r5-32*2], m14
    IDCT32_END 6, 15, 8, 9, 10, 1
    IDCT32_END 7, 14, 8, 9, 10, 1
    punpckhwd m8, m6, m7 ; 22 23
    punpcklwd m6, m7 ; 6 7
    punpcklwd m7, m14, m15 ; 8 9
    punpckhwd m14, m15 ; 24 25
    mova [r5-32*3], m8
    mova [r5-32*4], m14
    ret
ALIGN function_align
.write_16x16:
    mova m1, [rsp+gprsize+32*1]
    mova [rsp+gprsize+32*0], m8
    mova [rsp+gprsize+32*1], m9
    mova [rsp+gprsize+32*2], m12
    vpbroadcastd m12, [pw_2048]
    vpbroadcastd m9, [pixel_10bpc_max]
    lea r3, [strideq*3]
    pxor m8, m8
    pmulhrsw m0, m12
    pmulhrsw m1, m12
    pmulhrsw m2, m12
    pmulhrsw m3, m12
    call m(idct_16x8_internal_10bpc).write_16x4
    pmulhrsw m0, m12, m4
    pmulhrsw m1, m12, m5
    pmulhrsw m2, m12, m6
    pmulhrsw m3, m12, m7
    call m(idct_16x8_internal_10bpc).write_16x4
    pmulhrsw m0, m12, [rsp+gprsize+32*0]
    pmulhrsw m1, m12, [rsp+gprsize+32*1]
    pmulhrsw m2, m12, m10
    pmulhrsw m3, m12, m11
    call m(idct_16x8_internal_10bpc).write_16x4
    pmulhrsw m0, m12, [rsp+gprsize+32*2]
    pmulhrsw m1, m12, m13
    pmulhrsw m2, m12, m14
    pmulhrsw m3, m12, m15
    jmp m(idct_16x8_internal_10bpc).write_16x4
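
; Identity-identity 32x16: same tile walk as 16x32, but with an extra
; paddsw doubling and a pw_4096 (instead of pw_8192) final rounding.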
cglobal inv_txfm_add_identity_identity_32x16_10bpc, 4, 7, 11, dst, stride, c, eob
    vpbroadcastd m7, [pixel_10bpc_max]
.pass1:
    vpbroadcastd m8, [pw_2896x8]
    vpbroadcastd m9, [pw_1697x16]
    vpbroadcastd m10, [pw_4096]
    lea r6, [strideq*5]
    pxor m6, m6
    mov r5, dstq
    call .main
    sub eobd, 36
    jl .ret
    add cq, 32
    lea dstq, [dstq+strideq*4]
    call .main
    add cq, 64*8-32
    lea dstq, [r5+16*1]
    call .main
    sub eobd, 107 ; eob < 143
    jl .ret
    add cq, 32
    lea dstq, [dstq+strideq*4]
    call .main
    add cq, 64*8-32
    lea dstq, [r5+16*2]
    call .main
    sub eobd, 128 ; eob < 271
    jl .ret
    add cq, 32
    lea dstq, [dstq+strideq*4]
    call .main
    add cq, 64*8-32
    lea dstq, [r5+16*3]
    call .main
    sub eobd, 128 ; eob < 399
    jl .ret
    add cq, 32
    lea dstq, [dstq+strideq*4]
    call .main
.ret:
    RET
ALIGN function_align
.main:
    mova m0, [cq+64*0]
    packssdw m0, [cq+64*1]
    mova m1, [cq+64*2]
    packssdw m1, [cq+64*3]
    mova m2, [cq+64*4]
    packssdw m2, [cq+64*5]
    mova m3, [cq+64*6]
    packssdw m3, [cq+64*7]
    REPX {pmulhrsw x, m8 }, m0, m1, m2, m3
    REPX {paddsw x, x }, m0, m1, m2, m3
    REPX {IDTX16 x, 4, 9, _ }, 0, 1, 2, 3
    REPX {pmulhrsw x, m10}, m0, m1, m2, m3
    REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
    jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2

cglobal inv_txfm_add_identity_identity_32x16_12bpc, 4, 7, 11, dst, stride, c, eob
    vpbroadcastd m7, [pixel_12bpc_max]
    jmp m(inv_txfm_add_identity_identity_32x16_10bpc).pass1

cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob
    test eobd, eobd
    jz .dconly
    PROLOGUE 0, 8, 16, 32*83, dst, stride, c, eob
%undef cmp
    vpbroadcastd m12, [clip_18b_min]
    vpbroadcastd m13, [clip_18b_max]
    lea r6, [rsp+32*7]
    call .main
    cmp eobd, 36
    jl .fast
    call .main
    cmp eobd, 136
    jl .fast
    call .main
    cmp eobd, 300
    jl .fast
    call .main
    jmp .pass2
.dconly:
    imul r6d, [cq], 181
    vpbroadcastd m3, [dconly_10bpc]
    mov [cq], eobd ; 0
    or r3d, 32
    jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly
.fast:
    lea r4, [rsp+32*71]
    pxor m0, m0
.fast_loop:
    REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
    add r6, 32*8
    cmp r6, r4
    jl .fast_loop
.pass2:
    lea r3, [rsp+32*3]
    mov r4, r6
    lea r5, [r6+32*8]
    lea r6, [pw_5+128]
    call .pass2_oddhalf
    call .pass2_evenhalf
    imul r2, strideq, 19
    lea r3, [strideq*3]
    add r2, dstq
    call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end
    sub dstq, r3
    lea r2, [r2+r3+32]
    add dstq, 32
    lea r3, [rsp+32*11]
    call .pass2_oddhalf
    call .pass2_evenhalf
    lea r3, [strideq*3]
    call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end
    RET
ALIGN function_align
.main:
    mova m0, [cq+128* 1]
    mova m1, [cq+128* 7]
    mova m2, [cq+128* 9]
    mova m3, [cq+128*15]
    mova m4, [cq+128*17]
    mova m5, [cq+128*23]
    mova m6, [cq+128*25]
    mova m7, [cq+128*31]
    vpbroadcastd m11, [pd_2048]
    vpbroadcastd m14, [pd_2896]
    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1
    mova m0, [cq+128* 3]
    mova m1, [cq+128* 5]
    mova m2, [cq+128*11]
    mova m3, [cq+128*13]
    mova m4, [cq+128*19]
    mova m5, [cq+128*21]
    mova m6, [cq+128*27]
    mova m7, [cq+128*29]
    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2
    mova m0, [cq+128* 2]
    mova m1, [cq+128* 6]
    mova m2, [cq+128*10]
    mova m3, [cq+128*14]
    mova m4, [cq+128*18]
    mova m5, [cq+128*22]
    mova m6, [cq+128*26]
    mova m7, [cq+128*30]
    call m(idct_8x16_internal_10bpc).main_oddhalf
    mova m0, [cq+128* 0]
    mova m1, [cq+128* 4]
    mova m2, [cq+128* 8]
    mova m3, [cq+128*12]
    mova m4, [cq+128*16]
    mova m5, [cq+128*20]
    mova m6, [cq+128*24]
    mova m7, [cq+128*28]
    call m(idct_8x8_internal_10bpc).main
    call m(idct_8x16_internal_10bpc).main_evenhalf
    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end
    pxor m15, m15
    mov r7d, 128*29
.main_zero_loop:
    mova [cq+r7-128*1], m15
    mova [cq+r7+128*0], m15
    mova [cq+r7+128*1], m15
    mova [cq+r7+128*2], m15
    sub r7d, 128*4
    jg .main_zero_loop
    add cq, 32
    mova [r4-32*4], m0
    mova [r4-32*3], m1
    mova [r4-32*2], m2
    mova [r4-32*1], m3
    mova [r4+32*0], m4
    mova [r4+32*1], m5
    mova [r4+32*2], m6
    mova [r4+32*3], m7
    mova m0, [r5+32*3]
    mova m1, [r5+32*2]
    mova m2, [r5+32*1]
    mova m3, [r5+32*0]
    mova m4, [r5-32*1]
    mova m5, [r5-32*2]
    mova m6, [r5-32*3]
    mova m7, [r5-32*4]
    call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
    mova [r5-32*4], m0
    mova [r5-32*3], m1
    mova [r5-32*2], m2
    mova [r5-32*1], m3
    mova [r5+32*0], m4
    mova [r5+32*1], m5
    mova [r5+32*2], m6
    mova [r5+32*3], m7
    ret
ALIGN function_align
.pass2_oddhalf:
    mova m0, [r3+32* 1] ; 1
    mova m1, [r3+32* 3] ; 3
    mova m2, [r3+32* 5] ; 5
    mova m3, [r3+32* 7] ; 7
    mova m4, [r3+32*17] ; 9
    mova m5, [r3+32*19] ; 11
    mova m6, [r3+32*21] ; 13
    mova m7, [r3+32*23] ; 15
    mova m8, [r3+32*33] ; 17
    mova m9, [r3+32*35] ; 19
    mova m10, [r3+32*37] ; 21
    mova m11, [r3+32*39] ; 23
    mova m12, [r3+32*49] ; 25
    mova m13, [r3+32*51] ; 27
    mova m14, [r3+32*53] ; 29
    mova m15, [r3+32*55] ; 31
    jmp m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
ALIGN function_align
.pass2_evenhalf:
    mova m0, [r3+32* 0] ; 0
    mova m1, [r3+32* 2] ; 2
    mova m2, [r3+32* 4] ; 4
    mova m3, [r3+32* 6] ; 6
    mova m4, [r3+32*16] ; 8
    mova m5, [r3+32*18] ; 10
    mova m6, [r3+32*20] ; 12
    mova m7, [r3+32*22] ; 14
    mova m8, [r3+32*32] ; 16
    mova m9, [r3+32*34] ; 18
    mova m10, [r3+32*36] ; 20
    mova m11, [r3+32*38] ; 22
    mova m12, [r3+32*48] ; 24
    mova m13, [r3+32*50] ; 26
    mova m14, [r3+32*52] ; 28
    mova m15, [r3+32*54] ; 30
    mova [rsp+gprsize], m15
    jmp m(idct_16x16_internal_8bpc).main
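
; Identity-identity 32x32: plain pw_8192 rounding per packed 8x8 tile;
; the eob ladder (36/136/300/535/755/911) walks the tiles in the
; diagonal order noted in the trailing comments.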
cglobal inv_txfm_add_identity_identity_32x32_10bpc, 4, 8, 8, dst, stride, c, eob
%undef cmp
    vpbroadcastd m7, [pixel_10bpc_max]
.pass1:
    vpbroadcastd m5, [pw_8192]
    pxor m6, m6
    lea r6, [strideq*3]
    lea r5, [strideq*5]
    lea r4, [strideq+r6*2] ; strideq*7
    call .main ; 0
    cmp eobd, 36
    jl .ret
    add cq, 128*8 ; 0 1
    mov r7, dstq ; 1
    add dstq, 16
    call .main
    call .main2
    cmp eobd, 136
    jl .ret
    add cq, 128*16-32 ; 0 1 2
    lea dstq, [r7+16*2] ; 1 2
    call .main ; 2
    call .main2
    call .main2
    cmp eobd, 300
    jl .ret
    add cq, 128*24-64 ; 0 1 2 3
    add r7, 16*3 ; 1 2 3
    mov dstq, r7 ; 2 3
    call .main ; 3
    call .main2
    call .main2
    call .main2
    cmp eobd, 535
    jl .ret
    add cq, 128*24-64 ; 0 1 2 3
    lea dstq, [r7+strideq*8] ; 1 2 3 4
    mov r7, dstq ; 2 3 4
    call .main ; 3 4
    call .main2
    call .main2
    cmp eobd, 755
    jl .ret
    add cq, 128*16-32 ; 0 1 2 3
    lea dstq, [r7+strideq*8] ; 1 2 3 4
    call .main ; 2 3 4 5
    call .main2 ; 3 4 5
    cmp eobd, 911
    jl .ret
    add cq, 128*8 ; 0 1 2 3
    add dstq, 16 ; 1 2 3 4
    call .main ; 2 3 4 5
.ret: ; 3 4 5 6
    RET
ALIGN function_align
.main2:
    sub cq, 128*8-32
    lea dstq, [dstq+strideq*8-16]
.main:
    mova m0, [cq+128*0]
    packssdw m0, [cq+128*1]
    mova m1, [cq+128*2]
    packssdw m1, [cq+128*3]
    mova m2, [cq+128*4]
    packssdw m2, [cq+128*5]
    mova m3, [cq+128*6]
    packssdw m3, [cq+128*7]
    REPX {pmulhrsw x, m5}, m0, m1, m2, m3
    jmp m(inv_txfm_add_identity_identity_8x32_10bpc).main_zero

cglobal inv_txfm_add_identity_identity_32x32_12bpc, 4, 8, 8, dst, stride, c, eob
    vpbroadcastd m7, [pixel_12bpc_max]
    jmp m(inv_txfm_add_identity_identity_32x32_10bpc).pass1
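
; IDCT64_PART2_END folds the stored idct16 and idct32 halves into four
; idct64 output rows per invocation (0+n, 31-n, 32+n, 63-n), with dstq
; and r2 swapping roles depending on the parity of %1.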
%macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4])
%if %1 & 1
    mova m%5, [r5-32*(51-%1)] ; idct16 out 0+n
    mova m%4, [r4-32*(14+%1)] ; idct32 out31-n
%else
    mova m%5, [r4-32*(45-%1)]
    mova m%4, [r5-32*(20+%1)]
%endif
    paddsw m%6, m%5, m%4 ; idct32 out 0+n
    psubsw m%5, m%4 ; idct32 out31-n
    paddsw m%4, m%5, m%3 ; out31-n
    psubsw m%5, m%3 ; out32+n
    paddsw m%3, m%6, m%2 ; out 0+n
    psubsw m%6, m%2 ; out63-n
    REPX {pmulhrsw x, m14}, m%5, m%6, m%4, m%3
%if %1 & 1
%define %%d0 r2
%define %%d1 dstq
%else
%define %%d0 dstq
%define %%d1 r2
%endif
    paddw m%3, [%%d0+%7 ]
    paddw m%4, [%%d1+%8 ]
    paddw m%5, [%%d0+%9 ]
    paddw m%6, [%%d1+%10]
    pxor m%2, m%2
    REPX {pmaxsw x, m%2}, m%3, m%4, m%5, m%6
    vpbroadcastd m%2, [pixel_10bpc_max]
    REPX {pminsw x, m%2}, m%3, m%4, m%5, m%6
    mova [%%d0+%7 ], m%3
    mova [%%d1+%8 ], m%4
    mova [%%d0+%9 ], m%5
    mova [%%d1+%10], m%6
%endmacro
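
; 16x64 DCT, 10 bpc: pass 1 is a 16-point row idct over up to four
; 8-row tiles (eob tiers 44/151/279); pass 2 assembles the 64-point
; column idct from an idct16, the 16x32 odd half and two idct64
; main_part1 invocations feeding main_part2.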
cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob
    test eobd, eobd
    jz .dconly
    PROLOGUE 0, 10, 16, 32*98, dst, stride, c, eob
%undef cmp
    vpbroadcastd m11, [pd_2048]
    vpbroadcastd m12, [clip_18b_min]
    vpbroadcastd m13, [clip_18b_max]
    vpbroadcastd m14, [pd_2896]
    lea r6, [rsp+32*6]
    call .main
    sub eobd, 44
    jl .fast
    call .main
    sub eobd, 107
    jl .fast
    call .main
    sub eobd, 128
    jl .fast
    call .main
    jmp .pass2
.dconly:
    imul r6d, [cq], 181
    vpbroadcastd m3, [dconly_10bpc]
    mov [cq], eobd ; 0
    or r3d, 64
    add r6d, 640
    sar r6d, 10
    jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3
.fast:
    lea r4, [rsp+32*38]
    pxor m0, m0
.fast_loop:
    REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
    add r6, 32*8
    cmp r6, r4
    jl .fast_loop
.pass2:
    lea r6, [pw_5+128]
    mova m0, [rsp+32* 2] ; in0
    mova m1, [rsp+32* 6] ; in4
    mova m2, [rsp+32*10] ; in8
    mova m3, [rsp+32*14] ; in12
    mova m4, [rsp+32*18] ; in16
    mova m5, [rsp+32*22] ; in20
    mova m6, [rsp+32*26] ; in24
    mova m7, [rsp+32*30] ; in28
    pxor m8, m8
    REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
    mova [rsp], m8
    call m(idct_16x16_internal_8bpc).main
    mova m1, [rsp+32*1]
    lea r4, [rsp+32*38]
    mova [r4-32*4], m0
    mova [r4-32*3], m1
    mova [r4-32*2], m2
    mova [r4-32*1], m3
    mova [r4+32*0], m4
    mova [r4+32*1], m5
    mova [r4+32*2], m6
    mova [r4+32*3], m7
    add r4, 32*8
    mova [r4-32*4], m8
    mova [r4-32*3], m9
    mova [r4-32*2], m10
    mova [r4-32*1], m11
    mova [r4+32*0], m12
    mova [r4+32*1], m13
    mova [r4+32*2], m14
    mova [r4+32*3], m15
    mova m0, [rsp+32* 4] ; in2
    mova m1, [rsp+32* 8] ; in6
    mova m2, [rsp+32*12] ; in10
    mova m3, [rsp+32*16] ; in14
    mova m4, [rsp+32*20] ; in18
    mova m5, [rsp+32*24] ; in22
    mova m6, [rsp+32*28] ; in26
    mova m7, [rsp+32*32] ; in30
    lea r5, [r4+32*16]
    add r4, 32*8
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
    mova m0, [rsp+32* 3] ; in1
    mova m1, [rsp+32*33] ; in31
    mova m2, [rsp+32*19] ; in17
    mova m3, [rsp+32*17] ; in15
    mova m4, [rsp+32*11] ; in9
    mova m5, [rsp+32*25] ; in23
    mova m6, [rsp+32*27] ; in25
    mova m7, [rsp+32* 9] ; in7
    lea r6, [idct64_mul - 8]
    add r4, 32*16
    add r5, 32*32
    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
    mova m0, [rsp+32* 7] ; in5
    mova m1, [rsp+32*29] ; in27
    mova m2, [rsp+32*23] ; in21
    mova m3, [rsp+32*13] ; in11
    mova m4, [rsp+32*15] ; in13
    mova m5, [rsp+32*21] ; in19
    mova m6, [rsp+32*31] ; in29
    mova m7, [rsp+32* 5] ; in3
    add r6, 8
    add r4, 32*8
    sub r5, 32*8
    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
    lea r8, [strideq*4]
    lea r9, [strideq*5]
    lea r3, [r9+strideq*1] ; stride*6
    lea r7, [r9+strideq*2] ; stride*7
    call .main_part2_pass2
    RET
ALIGN function_align
.main:
    mova m0, [cq+128* 1]
    mova m1, [cq+128* 3]
    mova m2, [cq+128* 5]
    mova m3, [cq+128* 7]
    mova m4, [cq+128* 9]
    mova m5, [cq+128*11]
    mova m6, [cq+128*13]
    mova m7, [cq+128*15]
    call m(idct_8x16_internal_10bpc).main_oddhalf
    mova m0, [cq+128* 0]
    mova m1, [cq+128* 2]
    mova m2, [cq+128* 4]
    mova m3, [cq+128* 6]
    mova m4, [cq+128* 8]
    mova m5, [cq+128*10]
    mova m6, [cq+128*12]
    mova m7, [cq+128*14]
    call m(idct_8x8_internal_10bpc).main
    call m(idct_8x16_internal_10bpc).main_evenhalf
    pxor m15, m15
    mov r7d, 128*13
.main_zero_loop:
    mova [cq+r7-128*1], m15
    mova [cq+r7+128*0], m15
    mova [cq+r7+128*1], m15
    mova [cq+r7+128*2], m15
    sub r7d, 128*4
    jg .main_zero_loop
    add cq, 32
    psrld m15, m11, 10 ; pd_2
    mova m8, [r6-32*4]
    mova m9, [r6+32*3]
    REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
    psubd m10, m0, m8 ; out15
    paddd m0, m8 ; out0
    mova m8, [r6-32*3]
    psubd m15, m7, m9 ; out8
    paddd m7, m9 ; out7
    mova m9, [r6+32*2]
    REPX {psrad x, 2}, m0, m15, m10, m7
    packssdw m0, m15
    packssdw m7, m10
    psubd m10, m1, m8 ; out14
    paddd m1, m8 ; out1
    mova m8, [r6-32*2]
    psubd m15, m6, m9 ; out9
    paddd m6, m9 ; out6
    mova m9, [r6+32*1]
    REPX {psrad x, 2}, m1, m15, m10, m6
    packssdw m1, m15
    packssdw m6, m10
    psubd m10, m2, m8 ; out13
    paddd m2, m8 ; out2
    mova m8, [r6-32*1]
    psubd m15, m5, m9 ; out10
    paddd m5, m9 ; out5
    mova m9, [r6+32*0]
    REPX {psrad x, 2}, m2, m15, m10, m5
    packssdw m2, m15
    packssdw m5, m10
    psubd m10, m3, m8 ; out12
    paddd m3, m8 ; out3
    psubd m15, m4, m9 ; out11
    paddd m4, m9 ; out4
    REPX {psrad x, 2}, m3, m15, m10, m4
    packssdw m3, m15
    packssdw m4, m10
    call m(idct_16x8_internal_10bpc).transpose3
    mova [r6-32*4], m0
    mova [r6-32*3], m1
    mova [r6-32*2], m2
    mova [r6-32*1], m3
    mova [r6+32*0], m4
    mova [r6+32*1], m5
    mova [r6+32*2], m6
    mova [r6+32*3], m7
    add r6, 32*8
    ret
.main_part2_pass2:
    vpbroadcastd m11, [pw_1567_3784]
    vpbroadcastd m12, [pw_m3784_1567]
    vpbroadcastd m13, [pw_2896_2896]
    lea r6, [pw_5+128]
    lea r2, [dstq+r7]
.main_part2_pass2_loop:
    vpbroadcastd m14, [pw_m2896_2896]
    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_internal
    vpbroadcastd m14, [pw_2048]
    IDCT64_PART2_END 0, 7, 0, 6, 9, 10, strideq*0, r3*4, r8*8, r7*8
    IDCT64_PART2_END 7, 8, 5, 0, 6, 7, strideq*0, r3*4, r8*8, r7*8
    IDCT64_PART2_END 8, 2, 1, 0, 6, 7, strideq*8, r8*4, r9*8, r3*8
    IDCT64_PART2_END 15, 3, 4, 0, 6, 7, strideq*8, r8*4, r9*8, r3*8
    add dstq, strideq
    sub r2, strideq
    cmp r4, r5
    jne .main_part2_pass2_loop
    ret
ALIGN function_align
.main_part1_rect2:
    REPX {paddd x, m11}, m0, m1, m2, m3
    REPX {psrad x, 12 }, m0, m1, m2, m3
.main_part1: ; idct64 steps 1-5
    ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
    ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
    ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
    ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
    vpbroadcastd m7, [r5+4*0]
    vpbroadcastd m8, [r5+4*1]
    vpbroadcastd m6, [r5+4*2]
    vpbroadcastd m9, [r5+4*3]
    vpbroadcastd m5, [r5+4*4]
    vpbroadcastd m10, [r5+4*5]
    vpbroadcastd m4, [r5+4*6]
    vpbroadcastd m15, [r5+4*7]
    pmulld m7, m0 ; t63a
    pmulld m0, m8 ; t32a
    pmulld m6, m1 ; t62a
    pmulld m1, m9 ; t33a
    pmulld m5, m2 ; t61a
    pmulld m2, m10 ; t34a
    pmulld m4, m3 ; t60a
    pmulld m3, m15 ; t35a
    vpbroadcastd m10, [r5+4*8]
    vpbroadcastd m15, [r5+4*9]
    REPX {paddd x, m11}, m7, m0, m6, m1, m5, m2, m4, m3
    REPX {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4
    psubd m8, m0, m1 ; t33
    paddd m0, m1 ; t32
    psubd m1, m7, m6 ; t62
    paddd m7, m6 ; t63
    psubd m6, m3, m2 ; t34
    paddd m3, m2 ; t35
    psubd m2, m4, m5 ; t61
    paddd m4, m5 ; t60
    REPX {pmaxsd x, m12}, m8, m1, m6, m2
    REPX {pminsd x, m13}, m8, m1, m6, m2
    ITX_MULSUB_2D 1, 8, 5, 9, _, 11, 10, 15 ; t33a, t62a
    ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 2 ; t61a, t34a
    REPX {pmaxsd x, m12}, m0, m3, m7, m4
    REPX {pminsd x, m13}, m0, m3, m7, m4
    vpbroadcastd m10, [r5+4*10]
    vpbroadcastd m15, [r5+4*11]
    psubd m5, m0, m3 ; t35a
    paddd m0, m3 ; t32a
    psubd m3, m7, m4 ; t60a
    paddd m7, m4 ; t63a
    psubd m4, m1, m6 ; t34
    paddd m1, m6 ; t33
    psubd m6, m8, m2 ; t61
    paddd m8, m2 ; t62
    REPX {pmaxsd x, m12}, m5, m3, m4, m6
    REPX {pminsd x, m13}, m5, m3, m4, m6
    ITX_MULSUB_2D 3, 5, 2, 9, _, 11, 10, 15 ; t35, t60
    ITX_MULSUB_2D 6, 4, 2, 9, _, 11, 10, 15 ; t34a, t61a
    REPX {pmaxsd x, m12}, m0, m7, m1, m8
    REPX {pminsd x, m13}, m0, m7, m1, m8
    add r5, 4*12
    mova [r6-32*4], m0
    mova [r6+32*3], m7
    mova [r6-32*3], m1
    mova [r6+32*2], m8
    mova [r6-32*2], m6
    mova [r6+32*1], m4
    mova [r6-32*1], m3
    mova [r6+32*0], m5
    add r6, 32*8
    ret
.main_part2: ; idct64 steps 6-9
    lea r5, [r6+32*3]
    sub r6, 32*4
    vpbroadcastd m10, [pd_1567]
    vpbroadcastd m15, [pd_3784]
.main_part2_loop:
    mova m0, [r6-32*32] ; t32a
    mova m1, [r5-32*24] ; t39a
    mova m2, [r5-32*32] ; t63a
    mova m3, [r6-32*24] ; t56a
    mova m4, [r6-32*16] ; t40a
    mova m5, [r5-32* 8] ; t47a
    mova m6, [r5-32*16] ; t55a
    mova m7, [r6-32* 8] ; t48a
    psubd m8, m0, m1 ; t39
    paddd m0, m1 ; t32
    psubd m1, m2, m3 ; t56
    paddd m2, m3 ; t63
    psubd m3, m5, m4 ; t40
    paddd m5, m4 ; t47
    psubd m4, m7, m6 ; t55
    paddd m7, m6 ; t48
    REPX {pmaxsd x, m12}, m8, m1, m3, m4
    REPX {pminsd x, m13}, m8, m1, m3, m4
    ITX_MULSUB_2D 1, 8, 6, 9, _, 11, 10, 15 ; t39a, t56a
    ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 2 ; t55a, t40a
    REPX {pmaxsd x, m12}, m0, m2, m5, m7
    REPX {pminsd x, m13}, m0, m5, m2, m7
    psubd m6, m2, m7 ; t48a
    paddd m2, m7 ; t63a
    psubd m7, m0, m5 ; t47a
    paddd m0, m5 ; t32a
    psubd m5, m8, m4 ; t55
    paddd m8, m4 ; t56
    psubd m4, m1, m3 ; t40
    paddd m1, m3 ; t39
    REPX {pmaxsd x, m12}, m6, m7, m5, m4
    REPX {pminsd x, m13}, m6, m7, m5, m4
    REPX {pmulld x, m14}, m6, m7, m5, m4
    REPX {pmaxsd x, m12}, m2, m0, m8, m1
    REPX {pminsd x, m13}, m2, m0, m8, m1
    paddd m6, m11
    paddd m5, m11
    psubd m3, m6, m7 ; t47
    paddd m6, m7 ; t48
    psubd m7, m5, m4 ; t40a
    paddd m5, m4 ; t55a
    REPX {psrad x, 12}, m3, m6, m7, m5
    mova [r5-32* 8], m2
    mova [r6-32*32], m0
    mova [r6-32* 8], m8
    mova [r5-32*32], m1
    mova [r5-32*24], m3
    mova [r6-32*16], m6
    mova [r6-32*24], m7
    mova [r5-32*16], m5
    add r6, 32
    sub r5, 32
    cmp r6, r5
    jl .main_part2_loop
    ret
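
; 32x64 DCT, 10 bpc: pass 1 is the rect2-scaled 32-point row idct (up
; to four 8-row tiles, eob tiers 36/136/300); pass 2 reuses the 16x64
; column machinery over both 16-column halves.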
|
|
|
|
cglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst, stride, c, eob
|
|
test eobd, eobd
|
|
jz .dconly
|
|
PROLOGUE 0, 11, 16, 32*134, dst, stride, c, eob
|
|
%undef cmp
|
|
vpbroadcastd m12, [clip_18b_min]
|
|
vpbroadcastd m13, [clip_18b_max]
|
|
lea r6, [rsp+32*6]
|
|
call .main
|
|
cmp eobd, 36
|
|
jl .fast
|
|
call .main
|
|
cmp eobd, 136
|
|
jl .fast
|
|
call .main
|
|
cmp eobd, 300
|
|
jl .fast
|
|
call .main
|
|
jmp .pass2
|
|
.dconly:
|
|
imul r6d, [cq], 181
|
|
vpbroadcastd m3, [dconly_10bpc]
|
|
mov [cq], eobd ; 0
|
|
or r3d, 64
|
|
add r6d, 128
|
|
sar r6d, 8
|
|
imul r6d, 181
|
|
add r6d, 384
|
|
sar r6d, 9
|
|
jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2
|
|
.fast:
|
|
lea r4, [rsp+32*70]
|
|
pxor m0, m0
|
|
.fast_loop:
|
|
REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
|
|
add r6, 32*8
|
|
cmp r6, r4
|
|
jl .fast_loop
|
|
.pass2:
|
|
lea r6, [pw_5 + 128]
|
|
mov r10, rsp
|
|
lea r8, [strideq*4]
|
|
lea r9, [strideq*5]
|
|
lea r3, [r9+strideq*1] ; stride*6
|
|
lea r7, [r9+strideq*2] ; stride*7
|
|
.pass2_loop:
|
|
mova m0, [r10+32* 2] ; in0
|
|
mova m1, [r10+32* 6] ; in4
|
|
mova m2, [r10+32*18] ; in8
|
|
mova m3, [r10+32*22] ; in12
|
|
mova m4, [r10+32*34] ; in16
|
|
mova m5, [r10+32*38] ; in20
|
|
mova m6, [r10+32*50] ; in24
|
|
mova m7, [r10+32*54] ; in28
|
|
pxor m8, m8
|
|
REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
|
|
mova [rsp], m8
|
|
call m(idct_16x16_internal_8bpc).main
|
|
mova m1, [rsp+32*1]
|
|
lea r4, [rsp+32*70]
|
|
mova [r4-32*4], m0
|
|
mova [r4-32*3], m1
|
|
mova [r4-32*2], m2
|
|
mova [r4-32*1], m3
|
|
mova [r4+32*0], m4
|
|
mova [r4+32*1], m5
|
|
mova [r4+32*2], m6
|
|
mova [r4+32*3], m7
|
|
add r4, 32*8
|
|
mova [r4-32*4], m8
|
|
mova [r4-32*3], m9
|
|
mova [r4-32*2], m10
|
|
mova [r4-32*1], m11
|
|
mova [r4+32*0], m12
|
|
mova [r4+32*1], m13
|
|
mova [r4+32*2], m14
|
|
mova [r4+32*3], m15
|
|
mova m0, [r10+32* 4] ; in2
|
|
mova m1, [r10+32* 8] ; in6
|
|
mova m2, [r10+32*20] ; in10
|
|
mova m3, [r10+32*24] ; in14
|
|
mova m4, [r10+32*36] ; in18
|
|
mova m5, [r10+32*40] ; in22
|
|
mova m6, [r10+32*52] ; in26
|
|
mova m7, [r10+32*56] ; in30
|
|
lea r5, [r4+32*16]
|
|
add r4, 32*8
|
|
call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
|
|
mova m0, [r10+32* 3] ; in1
|
|
mova m1, [r10+32*57] ; in31
|
|
mova m2, [r10+32*35] ; in17
|
|
mova m3, [r10+32*25] ; in15
|
|
mova m4, [r10+32*19] ; in9
|
|
mova m5, [r10+32*41] ; in23
|
|
mova m6, [r10+32*51] ; in25
|
|
mova m7, [r10+32* 9] ; in7
|
|
lea r6, [idct64_mul - 8]
|
|
add r4, 32*16
|
|
add r5, 32*32
|
|
call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
|
|
mova m0, [r10+32* 7] ; in5
|
|
mova m1, [r10+32*53] ; in27
|
|
mova m2, [r10+32*39] ; in21
|
|
mova m3, [r10+32*21] ; in11
|
|
mova m4, [r10+32*23] ; in13
|
|
mova m5, [r10+32*37] ; in19
|
|
mova m6, [r10+32*55] ; in29
|
|
mova m7, [r10+32* 5] ; in3
|
|
add r6, 8
|
|
add r4, 32*8
|
|
sub r5, 32*8
|
|
call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
|
|
call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2_pass2
|
|
add r10, 32*8
|
|
sub r4, 32*98 ; rsp+32*16
|
|
sub dstq, r8
|
|
add dstq, 32
|
|
cmp r10, r4
|
|
jl .pass2_loop
|
|
RET
|
|
.main:
    vpbroadcastd m14, [pd_2896]
    vpbroadcastd m11, [pd_2048]
    pmulld m0, m14, [cq+128* 1]
    pmulld m1, m14, [cq+128* 7]
    pmulld m2, m14, [cq+128* 9]
    pmulld m3, m14, [cq+128*15]
    pmulld m4, m14, [cq+128*17]
    pmulld m5, m14, [cq+128*23]
    pmulld m6, m14, [cq+128*25]
    pmulld m7, m14, [cq+128*31]
    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_rect2
    pmulld m0, m14, [cq+128* 3]
    pmulld m1, m14, [cq+128* 5]
    pmulld m2, m14, [cq+128*11]
    pmulld m3, m14, [cq+128*13]
    pmulld m4, m14, [cq+128*19]
    pmulld m5, m14, [cq+128*21]
    pmulld m6, m14, [cq+128*27]
    pmulld m7, m14, [cq+128*29]
    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_rect2
    pmulld m0, m14, [cq+128* 2]
    pmulld m1, m14, [cq+128* 6]
    pmulld m2, m14, [cq+128*10]
    pmulld m3, m14, [cq+128*14]
    pmulld m4, m14, [cq+128*18]
    pmulld m5, m14, [cq+128*22]
    pmulld m6, m14, [cq+128*26]
    pmulld m7, m14, [cq+128*30]
    call m(idct_8x16_internal_10bpc).main_oddhalf_rect2
    pmulld m0, m14, [cq+128* 0]
    pmulld m1, m14, [cq+128* 4]
    pmulld m2, m14, [cq+128* 8]
    pmulld m3, m14, [cq+128*12]
    pmulld m4, m14, [cq+128*16]
    pmulld m5, m14, [cq+128*20]
    pmulld m6, m14, [cq+128*24]
    pmulld m7, m14, [cq+128*28]
    pxor m15, m15
    mov r7d, 128*29
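; clear the 8-column slice just consumed (all 32 rows) so the coefficient
; buffer ends up fully zeroed after the last slice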
.main_zero_loop:
    mova [cq+r7-128*1], m15
    mova [cq+r7+128*0], m15
    mova [cq+r7+128*1], m15
    mova [cq+r7+128*2], m15
    sub r7d, 128*4
    jg .main_zero_loop
    add cq, 32
    call m(idct_8x8_internal_10bpc).main_rect2
    call m(idct_8x16_internal_10bpc).main_evenhalf
    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_end
    call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
    mova [r4-32*4], m0
    mova [r4-32*3], m1
    mova [r4-32*2], m2
    mova [r4-32*1], m3
    mova [r4+32*0], m4
    mova [r4+32*1], m5
    mova [r4+32*2], m6
    mova [r4+32*3], m7
    mova m0, [r5+32*3]
    mova m1, [r5+32*2]
    mova m2, [r5+32*1]
    mova m3, [r5+32*0]
    mova m4, [r5-32*1]
    mova m5, [r5-32*2]
    mova m6, [r5-32*3]
    mova m7, [r5-32*4]
    call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
    mova [r5-32*4], m0
    mova [r5-32*3], m1
    mova [r5-32*2], m2
    mova [r5-32*1], m3
    mova [r5+32*0], m4
    mova [r5+32*1], m5
    mova [r5+32*2], m6
    mova [r5+32*3], m7
    ret

cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob
    test eobd, eobd
    jnz .normal
    imul r6d, [cq], 181
    mov [cq], eobd ; 0
    or r3d, 16
.dconly:
    add r6d, 640
    sar r6d, 10
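; dc is biased by dconly_10bpc (0x8000 - (pixel_max+1)) so that paddsw
; saturates overflow at pixel_max and the following psubusw clamps
; underflow to zero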
.dconly2:
    vpbroadcastd m5, [dconly_10bpc]
    imul r6d, 181
    add r6d, 2176
    sar r6d, 12
    movd xm0, r6d
    paddsw xm0, xm5
    vpbroadcastw m0, xm0
.dconly_loop:
    paddsw m1, m0, [dstq+32*0]
    paddsw m2, m0, [dstq+32*1]
    paddsw m3, m0, [dstq+32*2]
    paddsw m4, m0, [dstq+32*3]
    REPX {psubusw x, m5}, m1, m2, m3, m4
    mova [dstq+32*0], m1
    mova [dstq+32*1], m2
    mova [dstq+32*2], m3
    mova [dstq+32*3], m4
    add dstq, strideq
    dec r3d
    jg .dconly_loop
    RET
.normal:
    PROLOGUE 0, 8, 16, 32*96, dst, stride, c, eob
%undef cmp
    vpbroadcastd m11, [pd_2048]
    vpbroadcastd m12, [clip_18b_min]
    vpbroadcastd m13, [clip_18b_max]
    vpbroadcastd m14, [pd_2896]
    lea r6, [rsp+32*4]
    call .main
    call .shift_transpose
    cmp eobd, 36
    jl .fast
    call .main
    call .shift_transpose
    jmp .pass2
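; eob < 36: only one slice was transformed; zero-fill the remaining
; half of the transposed buffer (4 blocks of 8 rows)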
.fast:
    pxor m0, m0
    mov r3d, 4
.fast_loop:
    REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
    add r6, 32*8
    dec r3d
    jg .fast_loop
.pass2:
    lea r7, [r6-32*64]
    lea r4, [r6-32*32]
    lea r6, [pw_5+128]
    mov r5, dstq
.pass2_loop:
    mova m0, [r7-32*4]
    mova m1, [r7-32*3]
    mova m2, [r7-32*2]
    mova m3, [r7-32*1]
    mova m4, [r7+32*0]
    mova m5, [r7+32*1]
    mova m6, [r7+32*2]
    mova m7, [r7+32*3]
    add r7, 32*32
    mova m8, [r7-32*4]
    mova m9, [r7-32*3]
    mova m10, [r7-32*2]
    mova m11, [r7-32*1]
    mova m12, [r7+32*0]
    mova m13, [r7+32*1]
    mova m14, [r7+32*2]
    mova m15, [r7+32*3]
    sub r7, 32*24
    mova [rsp], m15
    call m(idct_16x16_internal_8bpc).main
    mova m1, [rsp+32*1]
    call m(inv_txfm_add_dct_dct_32x16_10bpc).write_16x16
    add r5, 32
    mov dstq, r5
    cmp r7, r4
    jl .pass2_loop
    RET
ALIGN function_align
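; pass 1 worker: full 64-point row idct on one slice; the odd quarters go
; through the 16x64 main_part1/main_part2 helpers, the even half through
; the shared 8/16/32-point helpers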
.main:
    lea r5, [idct64_mul_16bpc]
    mova m0, [cq+64* 1]
    mova m1, [cq+64*31]
    mova m2, [cq+64*17]
    mova m3, [cq+64*15]
    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
    mova m0, [cq+64* 7]
    mova m1, [cq+64*25]
    mova m2, [cq+64*23]
    mova m3, [cq+64* 9]
    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
    mova m0, [cq+64* 5]
    mova m1, [cq+64*27]
    mova m2, [cq+64*21]
    mova m3, [cq+64*11]
    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
    mova m0, [cq+64* 3]
    mova m1, [cq+64*29]
    mova m2, [cq+64*19]
    mova m3, [cq+64*13]
    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2
    mova m0, [cq+64* 2]
    mova m1, [cq+64*14]
    mova m2, [cq+64*18]
    mova m3, [cq+64*30]
    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast
    mova m0, [cq+64* 6]
    mova m1, [cq+64*10]
    mova m2, [cq+64*22]
    mova m3, [cq+64*26]
    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast
    mova m0, [cq+64* 4]
    mova m1, [cq+64*12]
    mova m2, [cq+64*20]
    mova m3, [cq+64*28]
    call m(idct_8x16_internal_10bpc).main_oddhalf_fast
    mova m0, [cq+64* 0]
    mova m1, [cq+64* 8]
    mova m2, [cq+64*16]
    mova m3, [cq+64*24]
    pxor m15, m15
    mov r7d, 64*30
.main_zero_loop:
    mova [cq+r7-64*2], m15
    mova [cq+r7-64*1], m15
    mova [cq+r7+64*0], m15
    mova [cq+r7+64*1], m15
    sub r7d, 64*4
    jg .main_zero_loop
.main_end:
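; pd_2 is derived from pd_2048 (m11) to save a constant load; it is the
; rounding bias for this function's pass 1 shift of 2 (the 64x32 function
; enters at .main_end2 with pd_1 for its shift of 1)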
    psrld m15, m11, 10 ; pd_2
.main_end2:
    add cq, 32
    pxor m4, m4
    REPX {mova x, m4}, m5, m6, m7
    call m(idct_8x8_internal_10bpc).main
    add r6, 32*8
    call m(idct_8x16_internal_10bpc).main_evenhalf
    mova [r6+32*2], m1
    mova [r6+32*1], m2
    mova [r6+32*0], m3
    mova [r6-32*1], m4
    mova [r6-32*2], m5
    mova [r6-32*3], m6
    mova [r6-32*4], m7
    jmp .main_end_loop_start
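; final butterfly stage: fold the even idct16 results (stored around r6)
; into the idct32/idct64 odd halves (around r5), producing 8 output rows
; per iteration with 18-bit clipping in between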
.main_end_loop:
    mova m0, [r6+32* 3] ; idct8 0 + n
.main_end_loop_start:
    mova m1, [r5+32* 4] ; idct16 15 - n
    mova m2, [r5-32*12] ; idct32 16 + n
    mova m3, [r6-32*13] ; idct32 31 - n
    mova m4, [r6-32*29] ; idct64 63 - n
    mova m5, [r5-32*28] ; idct64 48 + n
    mova m6, [r6-32*45] ; idct64 47 - n
    mova m7, [r5-32*44] ; idct64 32 + n
    paddd m8, m0, m1 ; idct16 out0 + n
    psubd m0, m1 ; idct16 out15 - n
    REPX {pmaxsd x, m12}, m8, m0
    REPX {pminsd x, m13}, m8, m0
    paddd m1, m8, m3 ; idct32 out0 + n
    psubd m8, m3 ; idct32 out31 - n
    paddd m3, m0, m2 ; idct32 out15 - n
    psubd m0, m2 ; idct32 out16 + n
    REPX {pmaxsd x, m12}, m1, m8, m3, m0
    REPX {pminsd x, m13}, m1, m3, m8, m0
    REPX {paddd x, m15}, m1, m3, m0, m8
    paddd m2, m1, m4 ; idct64 out0 + n (unshifted)
    psubd m1, m4 ; idct64 out63 - n (unshifted)
    paddd m4, m3, m5 ; idct64 out15 - n (unshifted)
    psubd m3, m5 ; idct64 out48 + n (unshifted)
    paddd m5, m0, m6 ; idct64 out16 + n (unshifted)
    psubd m0, m6 ; idct64 out47 - n (unshifted)
    paddd m6, m8, m7 ; idct64 out31 - n (unshifted)
    psubd m8, m7 ; idct64 out32 + n (unshifted)
    mova [r5-32*44], m2
    mova [r6+32* 3], m1
    mova [r6-32*45], m4
    mova [r5+32* 4], m3
    mova [r5-32*28], m5
    mova [r6-32*13], m0
    mova [r6-32*29], m6
    mova [r5-32*12], m8
    add r5, 32
    sub r6, 32
    cmp r5, r6
    jl .main_end_loop
    ret
.shift_transpose:
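; shift the 32-bit idct64 output right by %1 (the rounding bias was
; already added in .main_end), pack to 16-bit words and transpose each
; 16x8 tile into the pass 2 layout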
%macro IDCT64_SHIFT_TRANSPOSE 1 ; shift
    sub r6, 32*48
    mov r5, r6
%%loop:
    mova m0, [r6-32* 4]
    mova m4, [r6+32* 4]
    mova m1, [r6-32* 3]
    mova m5, [r6+32* 5]
    mova m2, [r6-32* 2]
    mova m6, [r6+32* 6]
    mova m3, [r6-32* 1]
    mova m7, [r6+32* 7]
    REPX {psrad x, %1}, m0, m4, m1, m5, m2, m6, m3, m7
    packssdw m0, m4
    packssdw m1, m5
    packssdw m2, m6
    packssdw m3, m7
    mova m4, [r6+32* 0]
    mova m6, [r6+32* 8]
    mova m5, [r6+32* 1]
    mova m7, [r6+32* 9]
    REPX {psrad x, %1}, m4, m6, m5, m7
    packssdw m4, m6
    packssdw m5, m7
    mova m6, [r6+32* 2]
    mova m8, [r6+32*10]
    mova m7, [r6+32* 3]
    mova m9, [r6+32*11]
    REPX {psrad x, %1}, m6, m8, m7, m9
    packssdw m6, m8
    packssdw m7, m9
    call m(idct_16x8_internal_10bpc).transpose3
    mova [r5-32*4], m0
    mova [r5-32*3], m1
    mova [r5-32*2], m2
    mova [r5-32*1], m3
    mova [r5+32*0], m4
    mova [r5+32*1], m5
    mova [r5+32*2], m6
    mova [r5+32*3], m7
    add r6, 32*16
    add r5, 32*8
    cmp r5, r4
    jl %%loop
    mov r6, r4
%endmacro
    IDCT64_SHIFT_TRANSPOSE 2
    ret

cglobal inv_txfm_add_dct_dct_64x32_10bpc, 4, 7, 0, dst, stride, c, eob
    test eobd, eobd
    jz .dconly
    PROLOGUE 0, 8, 16, 32*163, dst, stride, c, eob
%undef cmp
    vpbroadcastd m11, [pd_2048]
    vpbroadcastd m12, [clip_18b_min]
    vpbroadcastd m13, [clip_18b_max]
    vpbroadcastd m14, [pd_2896]
    lea r6, [rsp+32*7]
    call .main
    cmp eobd, 36
    jl .fast
    call .main
    cmp eobd, 136
    jl .fast
    call .main
    cmp eobd, 300
    jl .fast
    call .main
    jmp .pass2
.dconly:
    imul r6d, [cq], 181
    mov [cq], eobd ; 0
    or r3d, 32
    add r6d, 128
    sar r6d, 8
    imul r6d, 181
    add r6d, 384
    sar r6d, 9
    jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly2
.fast:
    pxor m0, m0
    lea r4, [rsp+32*135]
.fast_loop:
    REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
    add r6, 32*8
    cmp r6, r4
    jl .fast_loop
.pass2:
    lea r7, [r6-32*32]
    lea r5, [r6+32*8]
    lea r6, [pw_5+128]
    imul r2, strideq, 19
    lea r3, [strideq*3]
    add r2, dstq
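; r2 is a second destination pointer (dstq + 19*strideq) used by
; pass2_end when storing the lower rows of each 32-row column group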
.pass2_loop:
    mova m0, [r7-32*99]
    mova m1, [r7-32*97]
    mova m2, [r7-32*95]
    mova m3, [r7-32*93]
    mova m4, [r7-32*67]
    mova m5, [r7-32*65]
    mova m6, [r7-32*63]
    mova m7, [r7-32*61]
    mova m8, [r7-32*35]
    mova m9, [r7-32*33]
    mova m10, [r7-32*31]
    mova m11, [r7-32*29]
    mova m12, [r7-32* 3]
    mova m13, [r7-32* 1]
    mova m14, [r7+32* 1]
    mova m15, [r7+32* 3]
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
    mova m0, [r7-32*100]
    mova m1, [r7-32*98]
    mova m2, [r7-32*96]
    mova m3, [r7-32*94]
    mova m4, [r7-32*68]
    mova m5, [r7-32*66]
    mova m6, [r7-32*64]
    mova m7, [r7-32*62]
    mova m8, [r7-32*36]
    mova m9, [r7-32*34]
    mova m10, [r7-32*32]
    mova m11, [r7-32*30]
    mova m12, [r7-32* 4]
    mova m13, [r7-32* 2]
    mova m14, [r7+32* 0]
    mova m15, [r7+32* 2]
    add r7, 32*8
    mova [rsp], m15
    call m(idct_16x16_internal_8bpc).main
    call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end
    sub dstq, r3
    lea r2, [r2+r3+32]
    add dstq, 32
    cmp r7, r4
    jl .pass2_loop
    RET
ALIGN function_align
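; pass 1 worker: same layout as the 64x16 .main above, but the inputs are
; rect2-scaled (premultiplied by 2896) and the final pass 1 shift is 1
; instead of 2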
.main:
    lea r5, [idct64_mul_16bpc]
    pmulld m0, m14, [cq+128* 1]
    pmulld m1, m14, [cq+128*31]
    pmulld m2, m14, [cq+128*17]
    pmulld m3, m14, [cq+128*15]
    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2
    pmulld m0, m14, [cq+128* 7]
    pmulld m1, m14, [cq+128*25]
    pmulld m2, m14, [cq+128*23]
    pmulld m3, m14, [cq+128* 9]
    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2
    pmulld m0, m14, [cq+128* 5]
    pmulld m1, m14, [cq+128*27]
    pmulld m2, m14, [cq+128*21]
    pmulld m3, m14, [cq+128*11]
    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2
    pmulld m0, m14, [cq+128* 3]
    pmulld m1, m14, [cq+128*29]
    pmulld m2, m14, [cq+128*19]
    pmulld m3, m14, [cq+128*13]
    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2
    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2
    pmulld m0, m14, [cq+128* 2]
    pmulld m1, m14, [cq+128*14]
    pmulld m2, m14, [cq+128*18]
    pmulld m3, m14, [cq+128*30]
    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast_rect2
    pmulld m0, m14, [cq+128* 6]
    pmulld m1, m14, [cq+128*10]
    pmulld m2, m14, [cq+128*22]
    pmulld m3, m14, [cq+128*26]
    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast_rect2
    pmulld m0, m14, [cq+128* 4]
    pmulld m1, m14, [cq+128*12]
    pmulld m2, m14, [cq+128*20]
    pmulld m3, m14, [cq+128*28]
    call m(idct_8x16_internal_10bpc).main_oddhalf_fast_rect2
    pmulld m0, m14, [cq+128* 0]
    pmulld m1, m14, [cq+128* 8]
    pmulld m2, m14, [cq+128*16]
    pmulld m3, m14, [cq+128*24]
    pxor m15, m15
    mov r7d, 128*29
.main_zero_loop:
    mova [cq+r7-128*1], m15
    mova [cq+r7+128*0], m15
    mova [cq+r7+128*1], m15
    mova [cq+r7+128*2], m15
    sub r7d, 128*4
    jg .main_zero_loop
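; pd_1: rounding bias for the pass 1 shift of 1; the paddd/psrad pair
; below completes the 1/sqrt(2) rect2 rounding of the four even input
; rows that were premultiplied by 2896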
    psrld m15, m11, 11 ; pd_1
    REPX {paddd x, m11}, m0, m1, m2, m3
    REPX {psrad x, 12 }, m0, m1, m2, m3
    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end2
    IDCT64_SHIFT_TRANSPOSE 1
    ret

cglobal inv_txfm_add_dct_dct_64x64_10bpc, 4, 7, 0, dst, stride, c, eob
    test eobd, eobd
    jz .dconly
    PROLOGUE 0, 11, 16, 32*195, dst, stride, c, eob
%undef cmp
    vpbroadcastd m11, [pd_2048]
    vpbroadcastd m12, [clip_18b_min]
    vpbroadcastd m13, [clip_18b_max]
    vpbroadcastd m14, [pd_2896]
    lea r6, [rsp+32*7]
    call .main
    cmp eobd, 36
    jl .fast
    call .main
    cmp eobd, 136
    jl .fast
    call .main
    cmp eobd, 300
    jl .fast
    call .main
    jmp .pass2
.dconly:
    imul r6d, [cq], 181
    mov [cq], eobd ; 0
    or r3d, 64
    jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly
.fast:
    pxor m0, m0
    lea r4, [rsp+32*135]
.fast_loop:
    REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
    add r6, 32*8
    cmp r6, r4
    jl .fast_loop
.pass2:
    lea r10, [r6-32*32]
    lea r6, [pw_5+128]
    lea r8, [strideq*4]
    lea r9, [strideq*5]
    lea r3, [r9+strideq*1] ; stride*6
    lea r7, [r9+strideq*2] ; stride*7
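; pass 2: same 64-point column scheme as 32x64 above, 16 columns per
; iteration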
.pass2_loop:
    mova m0, [r10-32*100] ; in0
    mova m1, [r10-32*96] ; in4
    mova m2, [r10-32*68] ; in8
    mova m3, [r10-32*64] ; in12
    mova m4, [r10-32*36] ; in16
    mova m5, [r10-32*32] ; in20
    mova m6, [r10-32* 4] ; in24
    mova m7, [r10+32* 0] ; in28
    pxor m8, m8
    REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
    mova [rsp], m8
    call m(idct_16x16_internal_8bpc).main
    mova m1, [rsp+32*1]
    mova [r4-32*4], m0
    mova [r4-32*3], m1
    mova [r4-32*2], m2
    mova [r4-32*1], m3
    mova [r4+32*0], m4
    mova [r4+32*1], m5
    mova [r4+32*2], m6
    mova [r4+32*3], m7
    add r4, 32*8
    mova [r4-32*4], m8
    mova [r4-32*3], m9
    mova [r4-32*2], m10
    mova [r4-32*1], m11
    mova [r4+32*0], m12
    mova [r4+32*1], m13
    mova [r4+32*2], m14
    mova [r4+32*3], m15
    mova m0, [r10-32*98] ; in2
    mova m1, [r10-32*94] ; in6
    mova m2, [r10-32*66] ; in10
    mova m3, [r10-32*62] ; in14
    mova m4, [r10-32*34] ; in18
    mova m5, [r10-32*30] ; in22
    mova m6, [r10-32* 2] ; in26
    mova m7, [r10+32* 2] ; in30
    lea r5, [r4+32*16]
    add r4, 32*8
    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
    mova m0, [r10-32*99] ; in1
    mova m1, [r10+32* 3] ; in31
    mova m2, [r10-32*35] ; in17
    mova m3, [r10-32*61] ; in15
    mova m4, [r10-32*67] ; in9
    mova m5, [r10-32*29] ; in23
    mova m6, [r10-32* 3] ; in25
    mova m7, [r10-32*93] ; in7
    lea r6, [idct64_mul - 8]
    add r4, 32*16
    add r5, 32*32
    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
    mova m0, [r10-32*95] ; in5
    mova m1, [r10-32* 1] ; in27
    mova m2, [r10-32*31] ; in21
    mova m3, [r10-32*65] ; in11
    mova m4, [r10-32*63] ; in13
    mova m5, [r10-32*33] ; in19
    mova m6, [r10+32* 1] ; in29
    mova m7, [r10-32*97] ; in3
    add r6, 8
    add r4, 32*8
    sub r5, 32*8
    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2_pass2
    add r10, 32*8
    sub dstq, r8
    sub r4, 32*44
    add dstq, 32
    cmp r10, r4
    jl .pass2_loop
    RET
ALIGN function_align
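; pass 1 worker: 64-point rows without rect2 scaling; the final
; rounding/shift and transpose are shared with the 64x16 tail
; (.main_end + .shift_transpose, shift 2)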
.main:
    lea r5, [idct64_mul_16bpc]
    mova m0, [cq+128* 1]
    mova m1, [cq+128*31]
    mova m2, [cq+128*17]
    mova m3, [cq+128*15]
    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
    mova m0, [cq+128* 7]
    mova m1, [cq+128*25]
    mova m2, [cq+128*23]
    mova m3, [cq+128* 9]
    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
    mova m0, [cq+128* 5]
    mova m1, [cq+128*27]
    mova m2, [cq+128*21]
    mova m3, [cq+128*11]
    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
    mova m0, [cq+128* 3]
    mova m1, [cq+128*29]
    mova m2, [cq+128*19]
    mova m3, [cq+128*13]
    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2
    mova m0, [cq+128* 2]
    mova m1, [cq+128*14]
    mova m2, [cq+128*18]
    mova m3, [cq+128*30]
    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast
    mova m0, [cq+128* 6]
    mova m1, [cq+128*10]
    mova m2, [cq+128*22]
    mova m3, [cq+128*26]
    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast
    mova m0, [cq+128* 4]
    mova m1, [cq+128*12]
    mova m2, [cq+128*20]
    mova m3, [cq+128*28]
    call m(idct_8x16_internal_10bpc).main_oddhalf_fast
    mova m0, [cq+128* 0]
    mova m1, [cq+128* 8]
    mova m2, [cq+128*16]
    mova m3, [cq+128*24]
    pxor m15, m15
    mov r7d, 128*29
.main_zero_loop:
    mova [cq+r7-128*1], m15
    mova [cq+r7+128*0], m15
    mova [cq+r7+128*1], m15
    mova [cq+r7+128*2], m15
    sub r7d, 128*4
    jg .main_zero_loop
    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end
    jmp m(inv_txfm_add_dct_dct_64x16_10bpc).shift_transpose

%endif ; ARCH_X86_64