; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; Copyright (c) 2017-2021, The rav1e contributors
; Copyright (c) 2021, Nathan Egge
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA

%macro DUP8 1-*
    %rep %0
        times 8 dw %1
        %rotate 1
    %endrep
%endmacro

pri_taps:  DUP8 4, 2, 3, 3
dir_table: db  1 * 32 + 0,  2 * 32 + 0
           db  1 * 32 + 0,  2 * 32 - 2
           db -1 * 32 + 2, -2 * 32 + 4
           db  0 * 32 + 2, -1 * 32 + 4
           db  0 * 32 + 2,  0 * 32 + 4
           db  0 * 32 + 2,  1 * 32 + 4
           db  1 * 32 + 2,  2 * 32 + 4
           db  1 * 32 + 0,  2 * 32 + 2
           db  1 * 32 + 0,  2 * 32 + 0
           db  1 * 32 + 0,  2 * 32 - 2
           db -1 * 32 + 2, -2 * 32 + 4
           db  0 * 32 + 2, -1 * 32 + 4
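; Each dir_table entry is a pair of byte offsets into the padded px buffer
; (32-byte row stride, 2 bytes per pixel), i.e. y*32 + x*2. The filter loads
; the primary-direction pair at dirq*2+4/5 and the two secondary pairs at
; dirq*2+0/1 and dirq*2+8/9, which is why the table carries 12 entries for
; 8 directions.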

dir_shift: times 4 dw 0x4000
           times 4 dw 0x1000

pw_128:    times 4 dw 128
pw_2048:   times 8 dw 2048
pw_m16384: times 8 dw -16384
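; dir_shift holds the pmulhuw factors cdef_dir_16bpc uses to reduce 10-bit
; (1<<14, i.e. >>2) and 12-bit (1<<12, i.e. >>4) samples to 8 bits so the
; bitdepth-independent 8bpc .main can be shared. pw_m16384 (0xc000) serves
; both as the fill value for unavailable edge pixels and, in CDEF_FILTER, as
; a pshufb mask for broadcasting a byte to every word.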

cextern cdef_dir_8bpc_ssse3.main
cextern cdef_dir_8bpc_sse4.main
cextern shufw_6543210x

SECTION .text

%if ARCH_X86_32
DECLARE_REG_TMP 5, 3
%elif WIN64
DECLARE_REG_TMP 8, 4
%else
DECLARE_REG_TMP 8, 6
%endif

%macro CDEF_FILTER 2 ; w, h
%if ARCH_X86_64
    DEFINE_ARGS dst, stride, _, tmp, pridmp, pri, sec, dir
    mova        m8, [base+pw_2048]
%else
    DEFINE_ARGS dst, pridmp, tmp, sec, pri, _, dir
    %define     m8  [base+pw_2048]
    %define     m9  [rsp+16*1+gprsize]
    %define     m10 [rsp+16*2+gprsize]
%endif
    movifnidn   prid, r5m
    movifnidn   secd, r6m
    test        prid, prid
    jz .sec_only
    movd        m6, r5m
%if ARCH_X86_32
    mov         [rsp+24], pridmpd
%endif
    bsr         pridmpd, prid
    lea         tmpd, [priq*4]
    cmp         dword r10m, 0x3ff ; if (bpc == 10)
    cmove       prid, tmpd        ;     pri <<= 2
    mov         tmpd, r8m         ; damping
    mov         dird, r7m
    and         prid, 16
    pshufb      m6, m7            ; splat
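    ; m7 still holds pw_m16384 here: its 0x00,0xc0 byte pattern acts as a
    ; pshufb mask that writes the low byte of m6 to every even byte and zeroes
    ; the odd bytes, i.e. a word-broadcast of the (at most 8-bit) strength.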
    lea         dirq, [base+dir_table+dirq*2]
    lea         priq, [base+pri_taps+priq*2]
    test        secd, secd
    jz .pri_only
    mova        [rsp], m6
    movd        m6, secd
    tzcnt       secd, secd
    sub         pridmpd, tmpd
    sub         tmpd, secd
    pshufb      m6, m7
    xor         secd, secd
    neg         pridmpd
    cmovs       pridmpd, secd
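    ; pri_shift = max(0, damping - ulog2(pri)) and sec_shift = damping -
    ; ulog2(sec); both are stored as qwords below px so psrlw can read them
    ; directly as shift counts.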
%if ARCH_X86_32
    mov         [pri_shift+4], secd
    mov         [sec_shift+4], secd
%endif
    mov         [pri_shift+0], pridmpq
    mov         [sec_shift+0], tmpq
    lea         tmpq, [px]
%if WIN64
    movaps      r4m, m9
    movaps      r6m, m10
%elif ARCH_X86_32
    mov         pridmpd, [rsp+24]
%endif
%rep %1*%2/8
    call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri_sec
%endrep
%if WIN64
    movaps      m9, r4m
    movaps      m10, r6m
%endif
    jmp .end
.pri_only:
    sub         tmpd, pridmpd
    cmovs       tmpd, secd
%if ARCH_X86_32
    mov         pridmpd, [rsp+24]
    mov         [pri_shift+4], secd
%endif
    mov         [pri_shift+0], tmpq
    lea         tmpq, [px]
%rep %1*%2/8
    call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri
%endrep
.end:
    RET
.sec_only:
    mov         tmpd, r8m ; damping
    movd        m6, r6m
    tzcnt       secd, secd
    mov         dird, r7m
    pshufb      m6, m7
    sub         tmpd, secd
    lea         dirq, [base+dir_table+dirq*2]
%if ARCH_X86_32
    mov         [sec_shift+4], prid
%endif
    mov         [sec_shift+0], tmpq
    lea         tmpq, [px]
%rep %1*%2/8
    call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).sec
%endrep
    jmp .end
%if %1 == %2
%if ARCH_X86_64
    DEFINE_ARGS dst, stride, _, tmp, off, pri, _, dir
%else
    DEFINE_ARGS dst, stride, tmp, off, pri, _, dir
%endif
ALIGN function_align
.pri:
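    ; Primary-only loop: each call filters one 8-pixel group (two 4-pixel rows
    ; for 4xN, one row for 8x8). constrain(diff, strength, shift) =
    ; apply_sign(min(|diff|, max(0, strength - (|diff| >> shift))), diff) is
    ; implemented with the pabsw/psrlw/psubusw/pminsw/psignw sequence below;
    ; m6 is the splatted primary strength and [pri_shift] the shift count.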
    movsx       offq, byte [dirq+4] ; off_k0
%if %1 == 4
    movq        m1, [dstq+strideq*0]
    movhps      m1, [dstq+strideq*1]
    movq        m2, [tmpq+offq+32*0] ; k0p0
    movhps      m2, [tmpq+offq+32*1]
    neg         offq
    movq        m3, [tmpq+offq+32*0] ; k0p1
    movhps      m3, [tmpq+offq+32*1]
%else
    mova        m1, [dstq]
    movu        m2, [tmpq+offq]
    neg         offq
    movu        m3, [tmpq+offq]
%endif
    movsx       offq, byte [dirq+5] ; off_k1
    psubw       m2, m1 ; diff_k0p0
    psubw       m3, m1 ; diff_k0p1
    pabsw       m4, m2 ; adiff_k0p0
    psrlw       m5, m4, [pri_shift+gprsize]
    psubusw     m0, m6, m5
    pabsw       m5, m3 ; adiff_k0p1
    pminsw      m0, m4
    psrlw       m4, m5, [pri_shift+gprsize]
    psignw      m0, m2 ; constrain(diff_k0p0)
    psubusw     m2, m6, m4
    pminsw      m2, m5
%if %1 == 4
    movq        m4, [tmpq+offq+32*0] ; k1p0
    movhps      m4, [tmpq+offq+32*1]
    neg         offq
    movq        m5, [tmpq+offq+32*0] ; k1p1
    movhps      m5, [tmpq+offq+32*1]
%else
    movu        m4, [tmpq+offq]
    neg         offq
    movu        m5, [tmpq+offq]
%endif
    psubw       m4, m1 ; diff_k1p0
    psubw       m5, m1 ; diff_k1p1
    psignw      m2, m3 ; constrain(diff_k0p1)
    pabsw       m3, m4 ; adiff_k1p0
    paddw       m0, m2 ; constrain(diff_k0)
    psrlw       m2, m3, [pri_shift+gprsize]
    psubusw     m7, m6, m2
    pabsw       m2, m5 ; adiff_k1p1
    pminsw      m7, m3
    psrlw       m3, m2, [pri_shift+gprsize]
    psignw      m7, m4 ; constrain(diff_k1p0)
    psubusw     m4, m6, m3
    pminsw      m4, m2
    psignw      m4, m5 ; constrain(diff_k1p1)
    paddw       m7, m4 ; constrain(diff_k1)
    pmullw      m0, [priq+16*0] ; pri_tap_k0
    pmullw      m7, [priq+16*1] ; pri_tap_k1
    paddw       m0, m7 ; sum
    psraw       m2, m0, 15
    paddw       m0, m2
    pmulhrsw    m0, m8
    paddw       m0, m1
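    ; The psraw/paddw pair above subtracts 1 from negative sums and pmulhrsw
    ; by 2048 gives (sum + 8) >> 4, i.e. CDEF's (sum - (sum < 0) + 8) >> 4;
    ; the final paddw re-adds the unfiltered pixels in m1.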
%if %1 == 4
    add         tmpq, 32*2
    movq        [dstq+strideq*0], m0
    movhps      [dstq+strideq*1], m0
    lea         dstq, [dstq+strideq*2]
%else
    add         tmpq, 32
    mova        [dstq], m0
    add         dstq, strideq
%endif
    ret
ALIGN function_align
.sec:
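    ; Secondary-only loop: four taps per k (the offsets at dirq+8/dirq+0 and
    ; their negations), weighted 2 for k0 (the paddw m0, m0 below) and 1 for
    ; k1, constrained against the splatted secondary strength in m6.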
    movsx       offq, byte [dirq+8] ; off1_k0
%if %1 == 4
    movq        m1, [dstq+strideq*0]
    movhps      m1, [dstq+strideq*1]
    movq        m2, [tmpq+offq+32*0] ; k0s0
    movhps      m2, [tmpq+offq+32*1]
    neg         offq
    movq        m3, [tmpq+offq+32*0] ; k0s1
    movhps      m3, [tmpq+offq+32*1]
%else
    mova        m1, [dstq]
    movu        m2, [tmpq+offq]
    neg         offq
    movu        m3, [tmpq+offq]
%endif
    movsx       offq, byte [dirq+0] ; off2_k0
    psubw       m2, m1 ; diff_k0s0
    psubw       m3, m1 ; diff_k0s1
    pabsw       m4, m2 ; adiff_k0s0
    psrlw       m5, m4, [sec_shift+gprsize]
    psubusw     m0, m6, m5
    pabsw       m5, m3 ; adiff_k0s1
    pminsw      m0, m4
    psrlw       m4, m5, [sec_shift+gprsize]
    psignw      m0, m2 ; constrain(diff_k0s0)
    psubusw     m2, m6, m4
    pminsw      m2, m5
%if %1 == 4
    movq        m4, [tmpq+offq+32*0] ; k0s2
    movhps      m4, [tmpq+offq+32*1]
    neg         offq
    movq        m5, [tmpq+offq+32*0] ; k0s3
    movhps      m5, [tmpq+offq+32*1]
%else
    movu        m4, [tmpq+offq]
    neg         offq
    movu        m5, [tmpq+offq]
%endif
    movsx       offq, byte [dirq+9] ; off1_k1
    psubw       m4, m1 ; diff_k0s2
    psubw       m5, m1 ; diff_k0s3
    psignw      m2, m3 ; constrain(diff_k0s1)
    pabsw       m3, m4 ; adiff_k0s2
    paddw       m0, m2
    psrlw       m2, m3, [sec_shift+gprsize]
    psubusw     m7, m6, m2
    pabsw       m2, m5 ; adiff_k0s3
    pminsw      m7, m3
    psrlw       m3, m2, [sec_shift+gprsize]
    psignw      m7, m4 ; constrain(diff_k0s2)
    psubusw     m4, m6, m3
    pminsw      m4, m2
%if %1 == 4
    movq        m2, [tmpq+offq+32*0] ; k1s0
    movhps      m2, [tmpq+offq+32*1]
    neg         offq
    movq        m3, [tmpq+offq+32*0] ; k1s1
    movhps      m3, [tmpq+offq+32*1]
%else
    movu        m2, [tmpq+offq]
    neg         offq
    movu        m3, [tmpq+offq]
%endif
    movsx       offq, byte [dirq+1] ; off2_k1
    paddw       m0, m7
    psignw      m4, m5 ; constrain(diff_k0s3)
    paddw       m0, m4 ; constrain(diff_k0)
    psubw       m2, m1 ; diff_k1s0
    psubw       m3, m1 ; diff_k1s1
    paddw       m0, m0 ; sec_tap_k0
    pabsw       m4, m2 ; adiff_k1s0
    psrlw       m5, m4, [sec_shift+gprsize]
    psubusw     m7, m6, m5
    pabsw       m5, m3 ; adiff_k1s1
    pminsw      m7, m4
    psrlw       m4, m5, [sec_shift+gprsize]
    psignw      m7, m2 ; constrain(diff_k1s0)
    psubusw     m2, m6, m4
    pminsw      m2, m5
%if %1 == 4
    movq        m4, [tmpq+offq+32*0] ; k1s2
    movhps      m4, [tmpq+offq+32*1]
    neg         offq
    movq        m5, [tmpq+offq+32*0] ; k1s3
    movhps      m5, [tmpq+offq+32*1]
%else
    movu        m4, [tmpq+offq]
    neg         offq
    movu        m5, [tmpq+offq]
%endif
    paddw       m0, m7
    psubw       m4, m1 ; diff_k1s2
    psubw       m5, m1 ; diff_k1s3
    psignw      m2, m3 ; constrain(diff_k1s1)
    pabsw       m3, m4 ; adiff_k1s2
    paddw       m0, m2
    psrlw       m2, m3, [sec_shift+gprsize]
    psubusw     m7, m6, m2
    pabsw       m2, m5 ; adiff_k1s3
    pminsw      m7, m3
    psrlw       m3, m2, [sec_shift+gprsize]
    psignw      m7, m4 ; constrain(diff_k1s2)
    psubusw     m4, m6, m3
    pminsw      m4, m2
    paddw       m0, m7
    psignw      m4, m5 ; constrain(diff_k1s3)
    paddw       m0, m4 ; sum
    psraw       m2, m0, 15
    paddw       m0, m2
    pmulhrsw    m0, m8
    paddw       m0, m1
%if %1 == 4
    add         tmpq, 32*2
    movq        [dstq+strideq*0], m0
    movhps      [dstq+strideq*1], m0
    lea         dstq, [dstq+strideq*2]
%else
    add         tmpq, 32
    mova        [dstq], m0
    add         dstq, strideq
%endif
    ret
ALIGN function_align
.pri_sec:
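    ; Combined primary+secondary path. Alongside the weighted sum this also
    ; tracks the signed maximum of all tap pixels in m9 and the minimum of
    ; their absolute values in m10, and clamps the result to that range at the
    ; end; using |x| for the minimum (and the signed value for the maximum)
    ; appears intended so the -16384 edge sentinel can never win either
    ; reduction and thus never widens the clamp range.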
    movsx       offq, byte [dirq+8] ; off2_k0
%if %1 == 4
    movq        m1, [dstq+strideq*0]
    movhps      m1, [dstq+strideq*1]
    movq        m2, [tmpq+offq+32*0] ; k0s0
    movhps      m2, [tmpq+offq+32*1]
    neg         offq
    movq        m3, [tmpq+offq+32*0] ; k0s1
    movhps      m3, [tmpq+offq+32*1]
%else
    mova        m1, [dstq]
    movu        m2, [tmpq+offq]
    neg         offq
    movu        m3, [tmpq+offq]
%endif
    movsx       offq, byte [dirq+0] ; off3_k0
    pabsw       m4, m2
%if ARCH_X86_64
    pabsw       m10, m3
    pmaxsw      m9, m2, m3
    pminsw      m10, m4
%else
    pabsw       m7, m3
    pmaxsw      m5, m2, m3
    pminsw      m4, m7
    mova        m9, m5
    mova        m10, m4
%endif
    psubw       m2, m1 ; diff_k0s0
    psubw       m3, m1 ; diff_k0s1
    pabsw       m4, m2 ; adiff_k0s0
    psrlw       m5, m4, [sec_shift+gprsize]
    psubusw     m0, m6, m5
    pabsw       m5, m3 ; adiff_k0s1
    pminsw      m0, m4
    psrlw       m4, m5, [sec_shift+gprsize]
    psignw      m0, m2 ; constrain(diff_k0s0)
    psubusw     m2, m6, m4
    pminsw      m2, m5
%if %1 == 4
    movq        m4, [tmpq+offq+32*0] ; k0s2
    movhps      m4, [tmpq+offq+32*1]
    neg         offq
    movq        m5, [tmpq+offq+32*0] ; k0s3
    movhps      m5, [tmpq+offq+32*1]
%else
    movu        m4, [tmpq+offq]
    neg         offq
    movu        m5, [tmpq+offq]
%endif
    movsx       offq, byte [dirq+9] ; off2_k1
    pabsw       m7, m4
    psignw      m2, m3
    pabsw       m3, m5 ; constrain(diff_k0s1)
%if ARCH_X86_64
    pmaxsw      m9, m4
    pminsw      m10, m7
    pmaxsw      m9, m5
    pminsw      m10, m3
%else
    pminsw      m7, m10
    pminsw      m7, m3
    pmaxsw      m3, m9, m4
    pmaxsw      m3, m5
    mova        m10, m7
    mova        m9, m3
%endif
    psubw       m4, m1 ; diff_k0s2
    psubw       m5, m1 ; diff_k0s3
    paddw       m0, m2
    pabsw       m3, m4 ; adiff_k0s2
    psrlw       m2, m3, [sec_shift+gprsize]
    psubusw     m7, m6, m2
    pabsw       m2, m5 ; adiff_k0s3
    pminsw      m7, m3
    psrlw       m3, m2, [sec_shift+gprsize]
    psignw      m7, m4 ; constrain(diff_k0s2)
    psubusw     m4, m6, m3
    pminsw      m4, m2
%if %1 == 4
    movq        m2, [tmpq+offq+32*0] ; k1s0
    movhps      m2, [tmpq+offq+32*1]
    neg         offq
    movq        m3, [tmpq+offq+32*0] ; k1s1
    movhps      m3, [tmpq+offq+32*1]
%else
    movu        m2, [tmpq+offq]
    neg         offq
    movu        m3, [tmpq+offq]
%endif
    movsx       offq, byte [dirq+1] ; off3_k1
    paddw       m0, m7
    pabsw       m7, m2
    psignw      m4, m5 ; constrain(diff_k0s3)
    pabsw       m5, m3
%if ARCH_X86_64
    pmaxsw      m9, m2
    pminsw      m10, m7
    pmaxsw      m9, m3
    pminsw      m10, m5
%else
    pminsw      m7, m10
    pminsw      m7, m5
    pmaxsw      m5, m9, m2
    pmaxsw      m5, m3
    mova        m10, m7
    mova        m9, m5
%endif
    paddw       m0, m4 ; constrain(diff_k0)
    psubw       m2, m1 ; diff_k1s0
    psubw       m3, m1 ; diff_k1s1
    paddw       m0, m0 ; sec_tap_k0
    pabsw       m4, m2 ; adiff_k1s0
    psrlw       m5, m4, [sec_shift+gprsize]
    psubusw     m7, m6, m5
    pabsw       m5, m3 ; adiff_k1s1
    pminsw      m7, m4
    psrlw       m4, m5, [sec_shift+gprsize]
    psignw      m7, m2 ; constrain(diff_k1s0)
    psubusw     m2, m6, m4
    pminsw      m2, m5
%if %1 == 4
    movq        m4, [tmpq+offq+32*0] ; k1s2
    movhps      m4, [tmpq+offq+32*1]
    neg         offq
    movq        m5, [tmpq+offq+32*0] ; k1s3
    movhps      m5, [tmpq+offq+32*1]
%else
    movu        m4, [tmpq+offq]
    neg         offq
    movu        m5, [tmpq+offq]
%endif
    movsx       offq, byte [dirq+4] ; off1_k0
    paddw       m0, m7
    pabsw       m7, m4
    psignw      m2, m3 ; constrain(diff_k1s1)
    pabsw       m3, m5
%if ARCH_X86_64
    pmaxsw      m9, m4
    pminsw      m10, m7
    pmaxsw      m9, m5
    pminsw      m10, m3
%else
    pminsw      m7, m10
    pminsw      m7, m3
    pmaxsw      m3, m9, m4
    pmaxsw      m3, m5
    mova        m10, m7
    mova        m9, m3
%endif
    psubw       m4, m1 ; diff_k1s2
    psubw       m5, m1 ; diff_k1s3
    pabsw       m3, m4 ; adiff_k1s2
    paddw       m0, m2
    psrlw       m2, m3, [sec_shift+gprsize]
    psubusw     m7, m6, m2
    pabsw       m2, m5 ; adiff_k1s3
    pminsw      m7, m3
    psrlw       m3, m2, [sec_shift+gprsize]
    psignw      m7, m4 ; constrain(diff_k1s2)
    psubusw     m4, m6, m3
    pminsw      m4, m2
    paddw       m0, m7
%if %1 == 4
    movq        m2, [tmpq+offq+32*0] ; k0p0
    movhps      m2, [tmpq+offq+32*1]
    neg         offq
    movq        m3, [tmpq+offq+32*0] ; k0p1
    movhps      m3, [tmpq+offq+32*1]
%else
    movu        m2, [tmpq+offq]
    neg         offq
    movu        m3, [tmpq+offq]
%endif
    movsx       offq, byte [dirq+5] ; off1_k1
    pabsw       m7, m2
    psignw      m4, m5 ; constrain(diff_k1s3)
    pabsw       m5, m3
%if ARCH_X86_64
    pmaxsw      m9, m2
    pminsw      m10, m7
    pmaxsw      m9, m3
    pminsw      m10, m5
%else
    pminsw      m7, m10
    pminsw      m7, m5
    pmaxsw      m5, m9, m2
    pmaxsw      m5, m3
    mova        m10, m7
    mova        m9, m5
%endif
    psubw       m2, m1 ; diff_k0p0
    psubw       m3, m1 ; diff_k0p1
    paddw       m0, m4
    pabsw       m4, m2 ; adiff_k0p0
    psrlw       m5, m4, [pri_shift+gprsize]
    psubusw     m7, [rsp+gprsize], m5
    pabsw       m5, m3 ; adiff_k0p1
    pminsw      m7, m4
    psrlw       m4, m5, [pri_shift+gprsize]
    psignw      m7, m2 ; constrain(diff_k0p0)
    psubusw     m2, [rsp+gprsize], m4
    pminsw      m2, m5
%if %1 == 4
    movq        m4, [tmpq+offq+32*0] ; k1p0
    movhps      m4, [tmpq+offq+32*1]
    neg         offq
    movq        m5, [tmpq+offq+32*0] ; k1p1
    movhps      m5, [tmpq+offq+32*1]
%else
    movu        m4, [tmpq+offq]
    neg         offq
    movu        m5, [tmpq+offq]
%endif
    psignw      m2, m3 ; constrain(diff_k0p1)
    pabsw       m3, m4
    paddw       m7, m2 ; constrain(diff_k0)
    pabsw       m2, m5
%if ARCH_X86_64
    pmaxsw      m9, m4
    pminsw      m10, m3
    pmaxsw      m9, m5
    pminsw      m10, m2
%else
    pminsw      m3, m10
    pminsw      m3, m2
    pmaxsw      m2, m9, m4
    pmaxsw      m2, m5
    mova        m10, m3
    mova        m9, m2
%endif
    psubw       m4, m1 ; diff_k1p0
    psubw       m5, m1 ; diff_k1p1
    pabsw       m3, m4 ; adiff_k1p0
    pmullw      m7, [priq+16*0] ; pri_tap_k0
    paddw       m0, m7
    psrlw       m2, m3, [pri_shift+gprsize]
    psubusw     m7, [rsp+16*0+gprsize], m2
    pabsw       m2, m5 ; adiff_k1p1
    pminsw      m7, m3
    psrlw       m3, m2, [pri_shift+gprsize]
    psignw      m7, m4 ; constrain(diff_k1p0)
    psubusw     m4, [rsp+16*0+gprsize], m3
    pminsw      m4, m2
    psignw      m4, m5 ; constrain(diff_k1p1)
    paddw       m7, m4 ; constrain(diff_k1)
    pmullw      m7, [priq+16*1] ; pri_tap_k1
    paddw       m0, m7 ; sum
    psraw       m2, m0, 15
    paddw       m0, m2
    pmulhrsw    m0, m8
    paddw       m0, m1
%if ARCH_X86_64
    pmaxsw      m9, m1
    pminsw      m0, m9
%else
    pmaxsw      m2, m9, m1
    pminsw      m0, m2
%endif
    pminsw      m1, m10
    pmaxsw      m0, m1
%if %1 == 4
    add         tmpq, 32*2
    movq        [dstq+strideq*0], m0
    movhps      [dstq+strideq*1], m0
    lea         dstq, [dstq+strideq*2]
%else
    add         tmpq, 32
    mova        [dstq], m0
    add         dstq, strideq
%endif
    ret
%endif
%endmacro

INIT_XMM ssse3
%if ARCH_X86_64
cglobal cdef_filter_4x4_16bpc, 5, 9, 9, 32*10, dst, stride, left, top, bot, \
                                               pri, sec, edge
%define px rsp+32*4
%else
cglobal cdef_filter_4x4_16bpc, 2, 7, 8, -32*11, dst, stride, edge, top, left
%define botq topq
%define px rsp+32*5
%endif
%define base t0-dir_table
%define pri_shift px-16*6
%define sec_shift px-16*5
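; Stack layout: px is a 16-bit staging buffer with a 32-byte row stride; rows
; -2..-1 hold the two top rows, 0..3 the block, 4..5 the two bottom rows, and
; the 4 bytes at [px+32*y-4] hold the two left-edge pixels of each row.
; pri_shift/sec_shift sit just below px as qword psrlw shift counts. Edges
; that are unavailable are filled with pw_m16384, which the constrain() math
; reduces to a zero contribution.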
    mov         edged, r9m
    LEA         t0, dir_table
    movu        m0, [dstq+strideq*0]
    movu        m1, [dstq+strideq*1]
    lea         t1, [dstq+strideq*2]
    movu        m2, [t1 +strideq*0]
    movu        m3, [t1 +strideq*1]
    movddup     m7, [base+pw_m16384]
    mova        [px+32*0+0], m0
    mova        [px+32*1+0], m1
    mova        [px+32*2+0], m2
    mova        [px+32*3+0], m3
    test        edgeb, 4 ; HAVE_TOP
    jz .no_top
    movifnidn   topq, topmp
    movu        m0, [topq+strideq*0]
    movu        m1, [topq+strideq*1]
    mova        [px-32*2+0], m0
    mova        [px-32*1+0], m1
    test        edgeb, 1 ; HAVE_LEFT
    jz .top_no_left
    movd        m0, [topq+strideq*0-4]
    movd        m1, [topq+strideq*1-4]
    movd        [px-32*2-4], m0
    movd        [px-32*1-4], m1
    jmp .top_done
.no_top:
    mova        [px-32*2+0], m7
    mova        [px-32*1+0], m7
.top_no_left:
    movd        [px-32*2-4], m7
    movd        [px-32*1-4], m7
.top_done:
    test        edgeb, 8 ; HAVE_BOTTOM
    jz .no_bottom
    movifnidn   botq, r4mp
    movu        m0, [botq+strideq*0]
    movu        m1, [botq+strideq*1]
    mova        [px+32*4+0], m0
    mova        [px+32*5+0], m1
    test        edgeb, 1 ; HAVE_LEFT
    jz .bottom_no_left
    movd        m0, [botq+strideq*0-4]
    movd        m1, [botq+strideq*1-4]
    movd        [px+32*4-4], m0
    movd        [px+32*5-4], m1
    jmp .bottom_done
.no_bottom:
    mova        [px+32*4+0], m7
    mova        [px+32*5+0], m7
.bottom_no_left:
    movd        [px+32*4-4], m7
    movd        [px+32*5-4], m7
.bottom_done:
    test        edgeb, 1 ; HAVE_LEFT
    jz .no_left
    movifnidn   leftq, r2mp
    movd        m0, [leftq+4*0]
    movd        m1, [leftq+4*1]
    movd        m2, [leftq+4*2]
    movd        m3, [leftq+4*3]
    movd        [px+32*0-4], m0
    movd        [px+32*1-4], m1
    movd        [px+32*2-4], m2
    movd        [px+32*3-4], m3
    jmp .left_done
.no_left:
    REPX {movd [px+32*x-4], m7}, 0, 1, 2, 3
.left_done:
    test        edgeb, 2 ; HAVE_RIGHT
    jnz .padding_done
    REPX {movd [px+32*x+8], m7}, -2, -1, 0, 1, 2, 3, 4, 5
.padding_done:
    CDEF_FILTER 4, 4

%if ARCH_X86_64
cglobal cdef_filter_4x8_16bpc, 5, 9, 9, 32*14, dst, stride, left, top, bot, \
                                               pri, sec, edge
%else
cglobal cdef_filter_4x8_16bpc, 2, 7, 8, -32*15, dst, stride, edge, top, left
%endif
    mov         edged, r9m
    LEA         t0, dir_table
    movu        m0, [dstq+strideq*0]
    movu        m1, [dstq+strideq*1]
    lea         t1, [dstq+strideq*2]
    movu        m2, [t1 +strideq*0]
    movu        m3, [t1 +strideq*1]
    lea         t1, [t1 +strideq*2]
    movu        m4, [t1 +strideq*0]
    movu        m5, [t1 +strideq*1]
    lea         t1, [t1 +strideq*2]
    movu        m6, [t1 +strideq*0]
    movu        m7, [t1 +strideq*1]
    mova        [px+32*0+0], m0
    mova        [px+32*1+0], m1
    mova        [px+32*2+0], m2
    mova        [px+32*3+0], m3
    mova        [px+32*4+0], m4
    mova        [px+32*5+0], m5
    mova        [px+32*6+0], m6
    mova        [px+32*7+0], m7
    movddup     m7, [base+pw_m16384]
    test        edgeb, 4 ; HAVE_TOP
    jz .no_top
    movifnidn   topq, topmp
    movu        m0, [topq+strideq*0]
    movu        m1, [topq+strideq*1]
    mova        [px-32*2+0], m0
    mova        [px-32*1+0], m1
    test        edgeb, 1 ; HAVE_LEFT
    jz .top_no_left
    movd        m0, [topq+strideq*0-4]
    movd        m1, [topq+strideq*1-4]
    movd        [px-32*2-4], m0
    movd        [px-32*1-4], m1
    jmp .top_done
.no_top:
    mova        [px-32*2+0], m7
    mova        [px-32*1+0], m7
.top_no_left:
    movd        [px-32*2-4], m7
    movd        [px-32*1-4], m7
.top_done:
    test        edgeb, 8 ; HAVE_BOTTOM
    jz .no_bottom
    movifnidn   botq, r4mp
    movu        m0, [botq+strideq*0]
    movu        m1, [botq+strideq*1]
    mova        [px+32*8+0], m0
    mova        [px+32*9+0], m1
    test        edgeb, 1 ; HAVE_LEFT
    jz .bottom_no_left
    movd        m0, [botq+strideq*0-4]
    movd        m1, [botq+strideq*1-4]
    movd        [px+32*8-4], m0
    movd        [px+32*9-4], m1
    jmp .bottom_done
.no_bottom:
    mova        [px+32*8+0], m7
    mova        [px+32*9+0], m7
.bottom_no_left:
    movd        [px+32*8-4], m7
    movd        [px+32*9-4], m7
.bottom_done:
    test        edgeb, 1 ; HAVE_LEFT
    jz .no_left
    movifnidn   leftq, r2mp
    movd        m0, [leftq+4*0]
    movd        m1, [leftq+4*1]
    movd        m2, [leftq+4*2]
    movd        m3, [leftq+4*3]
    movd        [px+32*0-4], m0
    movd        [px+32*1-4], m1
    movd        [px+32*2-4], m2
    movd        [px+32*3-4], m3
    movd        m0, [leftq+4*4]
    movd        m1, [leftq+4*5]
    movd        m2, [leftq+4*6]
    movd        m3, [leftq+4*7]
    movd        [px+32*4-4], m0
    movd        [px+32*5-4], m1
    movd        [px+32*6-4], m2
    movd        [px+32*7-4], m3
    jmp .left_done
.no_left:
    REPX {movd [px+32*x-4], m7}, 0, 1, 2, 3, 4, 5, 6, 7
.left_done:
    test        edgeb, 2 ; HAVE_RIGHT
    jnz .padding_done
    REPX {movd [px+32*x+8], m7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
.padding_done:
    CDEF_FILTER 4, 8

%if ARCH_X86_64
cglobal cdef_filter_8x8_16bpc, 5, 9, 9, 32*14, dst, stride, left, top, bot, \
                                               pri, sec, edge
%else
cglobal cdef_filter_8x8_16bpc, 2, 7, 8, -32*15, dst, stride, edge, top, left
%endif
    mov         edged, r9m
    LEA         t0, dir_table
    mova        m0, [dstq+strideq*0+ 0]
    movd        m1, [dstq+strideq*0+16]
    mova        m2, [dstq+strideq*1+ 0]
    movd        m3, [dstq+strideq*1+16]
    lea         t1, [dstq+strideq*2]
    mova        m4, [t1 +strideq*0+ 0]
    movd        m5, [t1 +strideq*0+16]
    mova        m6, [t1 +strideq*1+ 0]
    movd        m7, [t1 +strideq*1+16]
    lea         t1, [t1 +strideq*2]
    mova        [px+32*0+ 0], m0
    movd        [px+32*0+16], m1
    mova        [px+32*1+ 0], m2
    movd        [px+32*1+16], m3
    mova        [px+32*2+ 0], m4
    movd        [px+32*2+16], m5
    mova        [px+32*3+ 0], m6
    movd        [px+32*3+16], m7
    mova        m0, [t1 +strideq*0+ 0]
    movd        m1, [t1 +strideq*0+16]
    mova        m2, [t1 +strideq*1+ 0]
    movd        m3, [t1 +strideq*1+16]
    lea         t1, [t1 +strideq*2]
    mova        m4, [t1 +strideq*0+ 0]
    movd        m5, [t1 +strideq*0+16]
    mova        m6, [t1 +strideq*1+ 0]
    movd        m7, [t1 +strideq*1+16]
    mova        [px+32*4+ 0], m0
    movd        [px+32*4+16], m1
    mova        [px+32*5+ 0], m2
    movd        [px+32*5+16], m3
    mova        [px+32*6+ 0], m4
    movd        [px+32*6+16], m5
    mova        [px+32*7+ 0], m6
    movd        [px+32*7+16], m7
    movddup     m7, [base+pw_m16384]
    test        edgeb, 4 ; HAVE_TOP
    jz .no_top
    movifnidn   topq, topmp
    mova        m0, [topq+strideq*0+ 0]
    mova        m1, [topq+strideq*0+16]
    mova        m2, [topq+strideq*1+ 0]
    mova        m3, [topq+strideq*1+16]
    mova        [px-32*2+ 0], m0
    movd        [px-32*2+16], m1
    mova        [px-32*1+ 0], m2
    movd        [px-32*1+16], m3
    test        edgeb, 1 ; HAVE_LEFT
    jz .top_no_left
    movd        m0, [topq+strideq*0-4]
    movd        m1, [topq+strideq*1-4]
    movd        [px-32*2-4], m0
    movd        [px-32*1-4], m1
    jmp .top_done
.no_top:
    mova        [px-32*2+ 0], m7
    movd        [px-32*2+16], m7
    mova        [px-32*1+ 0], m7
    movd        [px-32*1+16], m7
.top_no_left:
    movd        [px-32*2- 4], m7
    movd        [px-32*1- 4], m7
.top_done:
    test        edgeb, 8 ; HAVE_BOTTOM
    jz .no_bottom
    movifnidn   botq, r4mp
    mova        m0, [botq+strideq*0+ 0]
    movd        m1, [botq+strideq*0+16]
    mova        m2, [botq+strideq*1+ 0]
    movd        m3, [botq+strideq*1+16]
    mova        [px+32*8+ 0], m0
    movd        [px+32*8+16], m1
    mova        [px+32*9+ 0], m2
    movd        [px+32*9+16], m3
    test        edgeb, 1 ; HAVE_LEFT
    jz .bottom_no_left
    movd        m0, [botq+strideq*0-4]
    movd        m1, [botq+strideq*1-4]
    movd        [px+32*8- 4], m0
    movd        [px+32*9- 4], m1
    jmp .bottom_done
.no_bottom:
    mova        [px+32*8+ 0], m7
    movd        [px+32*8+16], m7
    mova        [px+32*9+ 0], m7
    movd        [px+32*9+16], m7
.bottom_no_left:
    movd        [px+32*8- 4], m7
    movd        [px+32*9- 4], m7
.bottom_done:
    test        edgeb, 1 ; HAVE_LEFT
    jz .no_left
    movifnidn   leftq, r2mp
    movd        m0, [leftq+4*0]
    movd        m1, [leftq+4*1]
    movd        m2, [leftq+4*2]
    movd        m3, [leftq+4*3]
    movd        [px+32*0- 4], m0
    movd        [px+32*1- 4], m1
    movd        [px+32*2- 4], m2
    movd        [px+32*3- 4], m3
    movd        m0, [leftq+4*4]
    movd        m1, [leftq+4*5]
    movd        m2, [leftq+4*6]
    movd        m3, [leftq+4*7]
    movd        [px+32*4- 4], m0
    movd        [px+32*5- 4], m1
    movd        [px+32*6- 4], m2
    movd        [px+32*7- 4], m3
    jmp .left_done
.no_left:
    REPX {movd [px+32*x- 4], m7}, 0, 1, 2, 3, 4, 5, 6, 7
.left_done:
    test        edgeb, 2 ; HAVE_RIGHT
    jnz .padding_done
    REPX {movd [px+32*x+16], m7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
.padding_done:
    CDEF_FILTER 8, 8

%macro CDEF_DIR 0
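; cdef_dir_16bpc scales the 8x8 block to 8-bit precision (pmulhuw with the
; dir_shift entry selected by bdmax >> 11), packs rows to bytes and computes
; the per-row psadbw sums, then tail-calls the matching 8bpc cdef_dir .main
; to perform the actual direction search.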
%if ARCH_X86_64
cglobal cdef_dir_16bpc, 4, 7, 16, src, stride, var, bdmax
    lea         r6, [dir_shift]
    shr         bdmaxd, 11 ; 0 for 10bpc, 1 for 12bpc
    movddup     m7, [r6+bdmaxq*8]
    lea         r6, [strideq*3]
    mova        m0, [srcq+strideq*0]
    mova        m1, [srcq+strideq*1]
    mova        m2, [srcq+strideq*2]
    mova        m3, [srcq+r6 ]
    lea         srcq, [srcq+strideq*4]
    mova        m4, [srcq+strideq*0]
    mova        m5, [srcq+strideq*1]
    mova        m6, [srcq+strideq*2]
    REPX {pmulhuw x, m7}, m0, m1, m2, m3, m4, m5, m6
    pmulhuw     m7, [srcq+r6 ]
    pxor        m8, m8
    packuswb    m9, m0, m1
    packuswb    m10, m2, m3
    packuswb    m11, m4, m5
    packuswb    m12, m6, m7
    REPX {psadbw x, m8}, m9, m10, m11, m12
    packssdw    m9, m10
    packssdw    m11, m12
    packssdw    m9, m11
    jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main
%else
cglobal cdef_dir_16bpc, 2, 4, 8, 96, src, stride, var, bdmax
    mov         bdmaxd, bdmaxm
    LEA         r2, dir_shift
    shr         bdmaxd, 11
    movddup     m7, [r2+bdmaxq*8]
    lea         r3, [strideq*3]
    pmulhuw     m3, m7, [srcq+strideq*0]
    pmulhuw     m4, m7, [srcq+strideq*1]
    pmulhuw     m5, m7, [srcq+strideq*2]
    pmulhuw     m6, m7, [srcq+r3 ]
    movddup     m1, [r2-dir_shift+pw_128]
    lea         srcq, [srcq+strideq*4]
    pxor        m0, m0
    packuswb    m2, m3, m4
    psubw       m3, m1
    psubw       m4, m1
    mova        [esp+0x00], m3
    mova        [esp+0x10], m4
    packuswb    m3, m5, m6
    psadbw      m2, m0
    psadbw      m3, m0
    psubw       m5, m1
    psubw       m6, m1
    packssdw    m2, m3
    mova        [esp+0x20], m5
    mova        [esp+0x50], m6
    pmulhuw     m4, m7, [srcq+strideq*0]
    pmulhuw     m5, m7, [srcq+strideq*1]
    pmulhuw     m6, m7, [srcq+strideq*2]
    pmulhuw     m7, [srcq+r3 ]
    packuswb    m3, m4, m5
    packuswb    m1, m6, m7
    psadbw      m3, m0
    psadbw      m1, m0
    packssdw    m3, m1
    movddup     m1, [r2-dir_shift+pw_128]
    LEA         r2, shufw_6543210x
    jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main
%endif
%endmacro

INIT_XMM ssse3
CDEF_DIR

INIT_XMM sse4
CDEF_DIR