; Copyright © 2022, VideoLAN and dav1d authors
; Copyright © 2022, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"
%include "x86/filmgrain_common.asm"

%if ARCH_X86_64

SECTION_RODATA 16

; vpshufb control used (under a byte mask of 0xee.. per 16 bytes) to merge two
; dword gathers: byte 0 of each dword of the second gather is moved to byte 2,
; bytes 1 and 3 are zeroed (-1), byte 0 is left to the write mask.
scale_mask:     db -1, -1,  0, -1, -1, -1,  4, -1, -1, -1,  8, -1, -1, -1, 12, -1
; Read as a dword pair at [scale_shift+scaling_shift*4-32] and at
; [scale_shift+is_12bpc*8+4].
scale_shift:    dw   7,   7,   6,   6,   5,   5,   4,   4
; Overlap blend weights; the qword at index is_12bpc is selected
; (first four words when is_12bpc == 0, last four when is_12bpc == 1).
pw_27_17_17_27: dw 108,  68,  68, 108,  27,  17,  17,  27
pw_23_22:       dw  92,  88,   0, 128,  23,  22,   0,  32
; Lower clamp, indexed by (clip_to_restricted_range << is_12bpc).
fg_min:         times 2 dw 0
                times 2 dw 64
                times 2 dw 256
; Upper clamp. Luma indexes it with is_12bpc + clip*4; chroma indexes it with
; is_12bpc + (clip << is_id)*2.
fg_max:         times 2 dw 1023
                times 2 dw 4095
                times 2 dw 960
                times 2 dw 3840
                times 2 dw 940
                times 2 dw 3760
; Accumulator seed for the vpdpwssd overlap blends, indexed by is_12bpc.
scale_rnd:      dd 64
                dd 16
; pmaddwd multiplier applied to FGData.uv_offset, indexed by is_12bpc.
uv_offset_mul:  dd 256
                dd 1024
; pshufb control applied to the FGData.uv_mult load to form
; { uv_luma_mult, uv_mult } pairs.
pb_8_9_0_1:     db 8, 9, 0, 1

cextern pb_0to63

SECTION .text

INIT_ZMM avx512icl
;-----------------------------------------------------------------------
; fgy_32x32xn_16bpc
;
; Adds scaled film grain to 16-bit pixels. The image is walked in columns
; of 32 pixels (one zmm register per row); each .add_noise call handles
; two rows.
;
; Named arguments: dst, src, stride, fg_data, w, scaling, grain_lut.
; Also read from the argument area: hm (row count), sbym, r9m (bdmax).
;
; Per pixel:
;   scale = scaling[px] << m10        (byte gathered with px as the index)
;   noise = pmulhrsw(grain, scale)
;   out   = min(max(px + noise, m8), m9)
;
; When FGData.overlap_flag is set, the two leftmost pixels of every column
; after the first are blended with grain taken at the previous column's
; offset, and (when sby != 0) the first two rows are blended with grain
; taken at the top offset.
;
; Long-lived registers:
;   m6      bdmax broadcast per dword (mask for the low pixel of each pair)
;   m7      scale_mask
;   m8/m9   lower/upper clamp
;   m10     per-word left shift applied to the gathered scaling values
;   m11     per-word arithmetic right shift applied after overlap blending
;   m12/m13 vertical overlap weights for the first/second row
;   m19     rounding constant seeding the vpdpwssd accumulators
;   xm20    horizontal overlap weights
;   k1=0xffff, k2=0xf, k3=0xeeeeeeeeeeeeeeee
;-----------------------------------------------------------------------
cglobal fgy_32x32xn_16bpc, 6, 15, 21, dst, src, stride, fg_data, w, scaling, \
                                      grain_lut, offx, sby, see, offy, src_bak
%define base r11-fg_min
    lea             r11, [fg_min]
    mov             r6d, r9m                    ; bdmax
    mov             r9d, [fg_dataq+FGData.clip_to_restricted_range]
    mov             r7d, [fg_dataq+FGData.scaling_shift]
    mov             sbyd, sbym
    vpbroadcastd    m6, r9m                     ; bdmax from the stack slot, not r9d
    shr             r6d, 11                     ; is_12bpc
    vbroadcasti32x4 m7, [base+scale_mask]
    shlx            r10d, r9d, r6d              ; clip << is_12bpc
    vpbroadcastd    m10, [base+scale_shift+r7*4-32]
    lea             r9d, [r6+r9*4]              ; is_12bpc + clip*4
    vpbroadcastd    m8, [base+fg_min+r10*4]
    kxnorw          k1, k1, k1                  ; 0xffff
    vpbroadcastd    m9, [base+fg_max+r9*4]
    mov             r12, 0xeeeeeeeeeeeeeeee
    vpbroadcastd    m19, [base+scale_rnd+r6*4]
    kshiftrb        k2, k1, 4                   ; 0xf
    vpbroadcastq    xm20, [base+pw_27_17_17_27+r6*8]
    kmovq           k3, r12
    vpbroadcastd    m11, [base+scale_shift+r6*8+4]
    test            sbyd, sbyd
    setnz           r7b
    vpbroadcastd    m12, [base+pw_27_17_17_27+r6*8+0]
    vpbroadcastd    m13, [base+pw_27_17_17_27+r6*8+4]
    test            r7b, [fg_dataq+FGData.overlap_flag]
    jnz             .v_overlap                  ; sby != 0 and overlap enabled

    imul            seed, sbyd, (173 << 24) | 37
    add             seed, (105 << 24) | 178
    rorx            seed, seed, 24
    movzx           seed, seew
    xor             seed, [fg_dataq+FGData.seed]
    lea             src_bakq, [srcq+wq*2]       ; end of the first row
    neg             wq                          ; w counts up from -w to 0
    sub             dstq, srcq                  ; dst is addressed as dstq+srcq

.loop_x:
    rorx            r6, seeq, 1
    or              seed, 0xeff4
    test            seeb, seeh
    lea             seed, [r6+0x8000]
    cmovp           seed, r6d                   ; updated seed
    rorx            offyd, seed, 8
    rorx            offxq, seeq, 12
    and             offyd, 0xf
    imul            offyd, 164
    lea             offyd, [offyq+offxq*2+747]  ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
                sby, see, offxy, src_bak

    mov             grain_lutq, grain_lutmp
    mov             hd, hm
.loop_y:
    movu            m4, [grain_lutq+offxyq*2+82*0]
    movu            m5, [grain_lutq+offxyq*2+82*2]
    call            .add_noise
    sub             hb, 2
    jg              .loop_y
    add             wq, 32
    jge             .end
    lea             srcq, [src_bakq+wq*2]
    cmp             byte [fg_dataq+FGData.overlap_flag], 0
    je              .loop_x
    test            sbyd, sbyd
    jnz             .hv_overlap

    ; horizontal overlap (without vertical overlap)
.loop_x_h_overlap:
    rorx            r6, seeq, 1
    or              seed, 0xeff4
    test            seeb, seeh
    lea             seed, [r6+0x8000]
    cmovp           seed, r6d                   ; updated seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \
                sby, see, offy, src_bak, left_offxy

    lea             left_offxyd, [offyq+73]     ; previous column's offy*stride+offx
    rorx            offyd, seed, 8
    rorx            offxq, seeq, 12
    and             offyd, 0xf
    imul            offyd, 164
    lea             offyd, [offyq+offxq*2+747]  ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
                sby, see, offxy, src_bak, left_offxy

    mov             grain_lutq, grain_lutmp
    mov             hd, hm
.loop_y_h_overlap:
    movu            m4, [grain_lutq+offxyq*2+82*0]
    movu            m5, [grain_lutq+offxyq*2+82*2]
    ; Blend the two leftmost grain values of both rows with the values
    ; read at the previous column's offset.
    movd            xm17, [grain_lutq+left_offxyq*2-82*1]
    pinsrd          xm17, [grain_lutq+left_offxyq*2+82*1], 1
    punpckldq       xm16, xm4, xm5
    punpcklwd       xm17, xm16                  ; {left, cur} word pairs
    mova            xm16, xm19
    vpdpwssd        xm16, xm20, xm17
    psrad           xm16, 1
    packssdw        xm16, xm16
    vpsravw         xm16, xm11
    vmovdqu8        m4{k2}, m16                 ; low 4 bytes of row 0
    vpalignr        m5{k2}, m16, m16, 4         ; low 4 bytes of row 1
    call            .add_noise
    sub             hb, 2
    jg              .loop_y_h_overlap
    add             wq, 32
    jge             .end
    lea             srcq, [src_bakq+wq*2]
    test            sbyd, sbyd
    jnz             .hv_overlap
    jmp             .loop_x_h_overlap

.v_overlap:
    movzx           sbyd, sbyb
    imul            seed, [fg_dataq+FGData.seed], 0x00010001
    imul            r7d, sbyd, 173 * 0x00010001
    imul            sbyd, 37 * 0x01000100
    add             r7d, (105 << 16) | 188
    add             sbyd, (178 << 24) | (141 << 8)
    and             r7d, 0x00ff00ff
    and             sbyd, 0xff00ff00
    xor             seed, r7d
    xor             seed, sbyd                  ; (cur_seed << 16) | top_seed
    lea             src_bakq, [srcq+wq*2]
    neg             wq
    sub             dstq, srcq

    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or              seed, 0xeff4eff4
    test            seeb, seeh
    setp            r7b                         ; parity of top_seed
    shr             seed, 16
    shl             r7d, 16
    test            seeb, seeh
    setp            r7b                         ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx            seed, r7d, 1                ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \
                sby, see, offy, src_bak, _, top_offxy

    rorx            offyd, seed, 8
    rorx            offxd, seed, 12
    and             offyd, 0xf000f
    and             offxd, 0xf000f
    imul            offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea             offyd, [offyq+offxq*2+0x10001*747+32*82]

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
                sby, see, offxy, src_bak, _, top_offxy

    mov             grain_lutq, grain_lutmp
    mov             hd, hm
    movzx           top_offxyd, offxyw          ; split the packed pair of offsets
    shr             offxyd, 16

    movu            m16, [grain_lutq+offxyq*2+82*0]
    movu            m0, [grain_lutq+top_offxyq*2+82*0]
    movu            m17, [grain_lutq+offxyq*2+82*2]
    movu            m1, [grain_lutq+top_offxyq*2+82*2]
    punpckhwd       m4, m0, m16                 ; {top, cur} word pairs, row 0
    punpcklwd       m0, m16
    punpckhwd       m5, m1, m17                 ; {top, cur} word pairs, row 1
    punpcklwd       m1, m17
    call            .add_noise_v
    sub             hb, 2
    jg              .loop_y                     ; remaining rows need no vertical blend
    add             wq, 32
    jge             .end
    lea             srcq, [src_bakq+wq*2]

    ; since fg_dataq.overlap is guaranteed to be set, we never jump back
    ; to .v_overlap, and instead always fall-through to .hv_overlap
.hv_overlap:
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or              seed, 0xeff4eff4
    test            seeb, seeh
    setp            r7b                         ; parity of top_seed
    shr             seed, 16
    shl             r7d, 16
    test            seeb, seeh
    setp            r7b                         ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx            seed, r7d, 1                ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \
                sby, see, offy, src_bak, left_offxy, top_offxy, topleft_offxy

    lea             topleft_offxyd, [top_offxyq+73]
    lea             left_offxyd, [offyq+73]
    rorx            offyd, seed, 8
    rorx            offxd, seed, 12
    and             offyd, 0xf000f
    and             offxd, 0xf000f
    imul            offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea             offyd, [offyq+offxq*2+0x10001*747+32*82]

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
                sby, see, offxy, src_bak, left_offxy, top_offxy, topleft_offxy

    mov             grain_lutq, grain_lutmp
    mov             hd, hm
    movzx           top_offxyd, offxyw
    shr             offxyd, 16

    movu            m5, [grain_lutq+offxyq*2+82*0]
    movu            m0, [grain_lutq+top_offxyq*2+82*0]
    movd            xm17, [grain_lutq+left_offxyq*2-82*1]
    pinsrd          xm17, [grain_lutq+topleft_offxyq*2-82*1], 1
    movu            m2, [grain_lutq+offxyq*2+82*2]
    movu            m1, [grain_lutq+top_offxyq*2+82*2]
    movd            xm18, [grain_lutq+left_offxyq*2+82*1]
    pinsrd          xm18, [grain_lutq+topleft_offxyq*2+82*1], 1
    ; Horizontal blend of the two leftmost values, for both the current
    ; and the top grain, row 0 (xm16) and row 1 (xm17).
    punpckldq       xm16, xm5, xm0
    punpcklwd       xm17, xm16
    mova            xm16, xm19
    vpdpwssd        xm16, xm20, xm17
    punpckldq       xm17, xm2, xm1
    punpcklwd       xm18, xm17
    mova            xm17, xm19
    vpdpwssd        xm17, xm20, xm18
    punpckhwd       m4, m0, m5
    punpcklwd       m0, m5
    punpckhwd       m5, m1, m2
    punpcklwd       m1, m2
    psrad           xm16, 1
    psrad           xm17, 1
    packssdw        xm16, xm17
    vpsravw         xm16, xm11
    ; Reorder {cur0, cur1, top0, top1} into {top0, cur0, top1, cur1} and
    ; patch the low four words of the interleaved vertical-blend inputs.
    vpshuflw        m0{k2}, m16, q1302
    punpckhqdq      xm16, xm16
    vpshuflw        m1{k2}, m16, q1302
    call            .add_noise_v
    sub             hb, 2
    jg              .loop_y_h_overlap
    add             wq, 32
    lea             srcq, [src_bakq+wq*2]       ; lea leaves the flags of the add intact
    jl              .hv_overlap
.end:
    RET

ALIGN function_align
; Vertical blend of {top, cur} word pairs (m0/m4 for row 0, m1/m5 for row 1)
; into grain rows m4 and m5, then falls through to .add_noise.
.add_noise_v:
    mova            m2, m19
    vpdpwssd        m2, m12, m4
    mova            m3, m19
    vpdpwssd        m3, m13, m5
    mova            m4, m19
    vpdpwssd        m4, m12, m0
    mova            m5, m19
    vpdpwssd        m5, m13, m1
    REPX {psrad x, 1}, m2, m3, m4, m5
    packssdw        m4, m2
    packssdw        m5, m3
    vpsravw         m4, m11
    vpsravw         m5, m11
; In:  m4/m5 = grain for two rows, srcq = first row.
; Out: two rows stored at dstq+srcq; srcq advanced by two rows and
;      grain_lutq by 82*4 bytes. Clobbers m0-m5, m16, m17, k4.
.add_noise:
    mova            m0, [srcq+strideq*0]
    mova            m1, [srcq+strideq*1]
    kmovw           k4, k1                      ; gathers clear their mask; reload it
    pand            m16, m6, m0                 ; low pixel of each dword
    psrld           m3, m0, 16                  ; high pixel of each dword
    vpgatherdd      m2{k4}, [scalingq+m16]
    vpcmpud         k4, m3, m6, 2               ; px <= bdmax
    vpgatherdd      m16{k4}, [scalingq+m3]
    kmovw           k4, k1
    pand            m17, m6, m1
    vpgatherdd      m3{k4}, [scalingq+m17]
    vpshufb         m2{k3}, m16, m7             ; merge both gathers into words
    psrld           m16, m1, 16
    vpcmpud         k4, m16, m6, 2
    vpgatherdd      m17{k4}, [scalingq+m16]
    vpshufb         m3{k3}, m17, m7
    vpsllvw         m2, m10
    vpsllvw         m3, m10
    pmulhrsw        m4, m2
    pmulhrsw        m5, m3
    add             grain_lutq, 82*4
    paddw           m0, m4
    paddw           m1, m5
    pmaxsw          m0, m8
    pmaxsw          m1, m8
    pminsw          m0, m9
    pminsw          m1, m9
    mova            [dstq+srcq], m0
    add             srcq, strideq
    mova            [dstq+srcq], m1
    add             srcq, strideq
    ret

;-----------------------------------------------------------------------
; FGUV_FN name, ss_hor, ss_ver
;
; Emits fguv_32x32xn_i<name>_16bpc, which adds scaled film grain to 16-bit
; chroma pixels. ss_hor / ss_ver select the horizontally / vertically
; subsampled code paths.
;
; The loop body is the nested macro %%FGUV_32x32xN_LOOP, expanded twice:
; once with not-csfl=1 (the scaling index is computed from luma and chroma
; using FGData.uv_mult / FGData.uv_offset) and once at .csfl with
; not-csfl=0 (the scaling index is taken from luma, masked with bdmax),
; chosen by FGData.chroma_scaling_from_luma.
;
; Long-lived registers:
;   m4      bdmax as words (packssdw of m5)
;   m5      bdmax broadcast per dword
;   m6      scale_mask
;   m7      per-word left shift applied to the gathered scaling values
;   m8/m9   lower/upper clamp
;   m10/m11 overlap blend weights
;   m12/m13 word-permute indices (pw_even/pw_odd), ss_hor only
;   m14/m15 scaled uv_offset and { uv_luma_mult, uv_mult }, not-csfl only
;   m20     rounding constant seeding the vpdpwssd accumulators
;   m21     per-word arithmetic right shift applied after overlap blending
;
; The r9m/r10m/r11m argument slots are overwritten with the end-of-row
; src/dst/luma pointers once their original contents have been read.
;-----------------------------------------------------------------------
%macro FGUV_FN 3 ; name, ss_hor, ss_ver
cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 22, dst, src, stride, fg_data, w, scaling, \
                                           grain_lut, h, sby, luma, lstride, uv_pl, is_id
%define base r12-fg_min
    lea             r12, [fg_min]
    mov             r9d, r13m                   ; bdmax
    mov             r7d, [fg_dataq+FGData.scaling_shift]
    mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
    mov             r11d, is_idm
    kxnorw          k1, k1, k1                  ; 0xffff
    vpbroadcastd    m5, r13m
    mov             r13, 0xeeeeeeeeeeeeeeee
    vbroadcasti32x4 m6, [base+scale_mask]
    shr             r9d, 11                     ; is_12bpc
    vpbroadcastd    m7, [base+scale_shift+r7*4-32]
    shlx            r10d, r6d, r9d              ; clip << is_12bpc
    mov             sbyd, sbym
    shlx            r6d, r6d, r11d              ; clip << is_id
    vpbroadcastd    m8, [base+fg_min+r10*4]
    lea             r6d, [r9+r6*2]
    vpbroadcastd    m9, [base+fg_max+r6*4]
    kmovq           k2, r13
    vpbroadcastd    m20, [base+scale_rnd+r9*4]
    packssdw        m4, m5, m5
    vpbroadcastd    m21, [base+scale_shift+r9*8+4]
%if %2
    mova            m12, [pb_0to63]             ; pw_even
    mov             r13d, 0x0101
    vpbroadcastq    m10, [base+pw_23_22+r9*8]
    kmovw           k3, r13d
%if %3
    pshufd          m11, m10, q0000
%else
    vpbroadcastd    ym16, [base+pw_27_17_17_27+r9*8+0]
    vpbroadcastd    m11, [base+pw_27_17_17_27+r9*8+4]
    vmovdqu16       m11{k1}, m16
%endif
    psrlw           m13, m12, 8                 ; pw_odd
%else
    vpbroadcastq    m10, [base+pw_27_17_17_27+r9*8]
    kshiftrb        k3, k1, 7                   ; 0x01
    kshiftrb        k4, k1, 4                   ; 0x0f
    pshufd          m11, m10, q0000
%endif
    mov             lstrideq, r10mp
    test            sbyd, sbyd
    setnz           r7b
    cmp             byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
    jne             .csfl

%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                _, sby, see, lstride

%if %1
    mov             r6d, r11m                   ; plane index into uv_offset/uv_mult
    vpbroadcastd    m0, [base+uv_offset_mul+r9*4]
    vpbroadcastd    m1, [base+pb_8_9_0_1]
    vpbroadcastd    m14, [fg_dataq+FGData.uv_offset+r6*4]
    vbroadcasti32x4 m15, [fg_dataq+FGData.uv_mult+r6*4]
    pmaddwd         m14, m0
    pshufb          m15, m1                     ; { uv_luma_mult, uv_mult }
%endif
    test            r7b, [fg_dataq+FGData.overlap_flag]
    jnz             %%v_overlap

    imul            seed, sbyd, (173 << 24) | 37
    add             seed, (105 << 24) | 178
    rorx            seed, seed, 24
    movzx           seed, seew
    xor             seed, [fg_dataq+FGData.seed]

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, lstride, luma

    mov             lumaq, r9mp
    lea             r12, [srcq+wq*2]
    lea             r13, [dstq+wq*2]
    lea             r14, [lumaq+wq*(2<<%2)]
    mov             r9mp, r12                   ; spill end-of-row src pointer
    mov             r10mp, r13                  ; spill end-of-row dst pointer
    mov             r11mp, r14                  ; spill end-of-row luma pointer
    neg             wq

%%loop_x:
    rorx            r6, seeq, 1
    or              seed, 0xeff4
    test            seeb, seeh
    lea             seed, [r6+0x8000]
    cmovp           seed, r6d                   ; updated seed
    rorx            offyd, seed, 8
    rorx            offxq, seeq, 12
    and             offyd, 0xf
    imul            offyd, 164>>%3
    lea             offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, lstride, luma

    mov             grain_lutq, grain_lutmp
    mov             hd, hm
%%loop_y:
%if %2
    movu            ym18, [grain_lutq+offxyq*2+82*0]
    vinserti32x8    m18, [grain_lutq+offxyq*2+82*2], 1
    movu            ym19, [grain_lutq+offxyq*2+82*4]
    vinserti32x8    m19, [grain_lutq+offxyq*2+82*6], 1
%else
    movu            m18, [grain_lutq+offxyq*2+82*0]
    movu            m19, [grain_lutq+offxyq*2+82*2]
%endif
    call            %%add_noise
    sub             hb, 2<<%2
    jg              %%loop_y
    add             wq, 32>>%2
    jge             .end
    mov             srcq, r9mp
    mov             dstq, r10mp
    mov             lumaq, r11mp
    lea             srcq, [srcq+wq*2]
    lea             dstq, [dstq+wq*2]
    lea             lumaq, [lumaq+wq*(2<<%2)]
    cmp             byte [fg_dataq+FGData.overlap_flag], 0
    je              %%loop_x
    cmp             dword r8m, 0                ; sby
    jne             %%hv_overlap

    ; horizontal overlap (without vertical overlap)
%%loop_x_h_overlap:
    rorx            r6, seeq, 1
    or              seed, 0xEFF4
    test            seeb, seeh
    lea             seed, [r6+0x8000]
    cmovp           seed, r6d                   ; updated seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, lstride, luma, left_offxy

    lea             left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx
    rorx            offyd, seed, 8
    rorx            offxq, seeq, 12
    and             offyd, 0xf
    imul            offyd, 164>>%3
    lea             offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, lstride, luma, left_offxy

    mov             grain_lutq, grain_lutmp
    mov             hd, hm
%%loop_y_h_overlap:
%if %2
    movu            ym18, [grain_lutq+offxyq*2+82*0]
    vinserti32x8    m18, [grain_lutq+offxyq*2+82*2], 1
    movu            ym19, [grain_lutq+offxyq*2+82*4]
    vinserti32x8    m19, [grain_lutq+offxyq*2+82*6], 1
    movd            xm16, [grain_lutq+left_offxyq*2+82*0]
    vinserti32x4    m16, [grain_lutq+left_offxyq*2+82*2], 2
    movd            xm17, [grain_lutq+left_offxyq*2+82*4]
    vinserti32x4    m17, [grain_lutq+left_offxyq*2+82*6], 2
    punpckldq       m16, m17
    punpckldq       m17, m18, m19
    punpcklwd       m16, m17                    ; {left, cur} word pairs
    mova            m17, m20
    vpdpwssd        m17, m16, m10
    psrad           m17, 1
    packssdw        m17, m17
    vpsravw         m17, m21
%else
    movu            m18, [grain_lutq+offxyq*2+82*0]
    movu            m19, [grain_lutq+offxyq*2+82*2]
    movd            xm16, [grain_lutq+left_offxyq*2+82*0]
    pinsrd          xm16, [grain_lutq+left_offxyq*2+82*2], 1
    punpckldq       xm17, xm18, xm19
    punpcklwd       xm16, xm17                  ; {left, cur} word pairs
    mova            xm17, xm20
    vpdpwssd        xm17, xm16, xm10
    psrad           xm17, 1
    packssdw        xm17, xm17
    vpsravw         xm17, xm21
%endif
    ; Patch the blended values into the dwords selected by k3.
    vmovdqa32       m18{k3}, m17
    vpshufd         m19{k3}, m17, q0321
    call            %%add_noise
    sub             hb, 2<<%2
    jg              %%loop_y_h_overlap
    add             wq, 32>>%2
    jge             .end
    mov             srcq, r9mp
    mov             dstq, r10mp
    mov             lumaq, r11mp
    lea             srcq, [srcq+wq*2]
    lea             dstq, [dstq+wq*2]
    lea             lumaq, [lumaq+wq*(2<<%2)]
    cmp             dword r8m, 0                ; sby
    jne             %%hv_overlap
    jmp             %%loop_x_h_overlap

%%v_overlap:
    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                _, sby, see, lstride

    movzx           sbyd, sbyb
    imul            seed, [fg_dataq+FGData.seed], 0x00010001
    imul            r7d, sbyd, 173 * 0x00010001
    imul            sbyd, 37 * 0x01000100
    add             r7d, (105 << 16) | 188
    add             sbyd, (178 << 24) | (141 << 8)
    and             r7d, 0x00ff00ff
    and             sbyd, 0xff00ff00
    xor             seed, r7d
    xor             seed, sbyd                  ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, lstride, luma, _, top_offxy

    mov             lumaq, r9mp
    lea             r12, [srcq+wq*2]
    lea             r13, [dstq+wq*2]
    lea             r14, [lumaq+wq*(2<<%2)]
    mov             r9mp, r12
    mov             r10mp, r13
    mov             r11mp, r14
    neg             wq

    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or              seed, 0xeff4eff4
    test            seeb, seeh
    setp            r7b                         ; parity of top_seed
    shr             seed, 16
    shl             r7d, 16
    test            seeb, seeh
    setp            r7b                         ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx            seed, r7d, 1                ; updated (cur_seed << 16) | top_seed

    rorx            offyd, seed, 8
    rorx            offxd, seed, 12
    and             offyd, 0xf000f
    and             offxd, 0xf000f
    imul            offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea             offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, lstride, luma, _, top_offxy

    mov             grain_lutq, grain_lutmp
    mov             hd, hm
    movzx           top_offxyd, offxyw          ; split the packed pair of offsets
    shr             offxyd, 16

%if %3
    movu            ym16, [grain_lutq+offxyq*2+82*0]
    movu            ym1, [grain_lutq+top_offxyq*2+82*0]
    vbroadcasti32x8 m18, [grain_lutq+offxyq*2+82*2]
    movu            ym19, [grain_lutq+offxyq*2+82*4]
    vinserti32x8    m19, [grain_lutq+offxyq*2+82*6], 1
    punpcklwd       ym17, ym1, ym16
    punpckhwd       ym1, ym16
%elif %2
    movu            ym18, [grain_lutq+offxyq*2+82*0]
    vinserti32x8    m18, [grain_lutq+offxyq*2+82*2], 1
    movu            ym17, [grain_lutq+top_offxyq*2+82*0]
    vinserti32x8    m17, [grain_lutq+top_offxyq*2+82*2], 1
    movu            ym19, [grain_lutq+offxyq*2+82*4]
    vinserti32x8    m19, [grain_lutq+offxyq*2+82*6], 1
    punpcklwd       m16, m17, m18
    punpckhwd       m17, m18
%else
    movu            m18, [grain_lutq+offxyq*2+82*0]
    movu            m19, [grain_lutq+top_offxyq*2+82*0]
    movu            m2, [grain_lutq+offxyq*2+82*2]
    movu            m16, [grain_lutq+top_offxyq*2+82*2]
    punpckhwd       m1, m19, m18                ; row 0: {top, cur} pairs
    punpcklwd       m19, m18
    punpckhwd       m18, m2, m16                ; row 1: {cur, top} pairs
    punpcklwd       m2, m16
%endif
    call            %%add_noise_v
    sub             hb, 2<<%2
    jg              %%loop_y
    add             wq, 32>>%2
    jge             .end
    mov             srcq, r9mp
    mov             dstq, r10mp
    mov             lumaq, r11mp
    lea             srcq, [srcq+wq*2]
    lea             dstq, [dstq+wq*2]
    lea             lumaq, [lumaq+wq*(2<<%2)]

    ; since fg_dataq.overlap is guaranteed to be set, we never jump back
    ; to %%v_overlap, and instead always fall-through to %%hv_overlap
%%hv_overlap:
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or              seed, 0xeff4eff4
    test            seeb, seeh
    setp            r7b                         ; parity of top_seed
    shr             seed, 16
    shl             r7d, 16
    test            seeb, seeh
    setp            r7b                         ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx            seed, r7d, 1                ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                offx, offy, see, lstride, luma, left_offxy, top_offxy, topleft_offxy

    lea             topleft_offxyq, [top_offxyq+(32>>%2)]
    lea             left_offxyq, [offyq+(32>>%2)]
    rorx            offyd, seed, 8
    rorx            offxd, seed, 12
    and             offyd, 0xf000f
    and             offxd, 0xf000f
    imul            offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea             offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

    DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
                h, offxy, see, lstride, luma, left_offxy, top_offxy, topleft_offxy

    mov             grain_lutq, grain_lutmp
    mov             hd, hm
    movzx           top_offxyd, offxyw
    shr             offxyd, 16

    ; grain = grain_lut[offy+y][offx+x]
%if %2
    movd            xm16, [grain_lutq+left_offxyq*2+82*0]
    vinserti32x4    m16, [grain_lutq+left_offxyq*2+82*2], 2
    movd            xm17, [grain_lutq+left_offxyq*2+82*4]
    vinserti32x4    m17, [grain_lutq+left_offxyq*2+82*6], 2
    movu            ym18, [grain_lutq+offxyq*2+82*0]
    vinserti32x8    m18, [grain_lutq+offxyq*2+82*2], 1
    movu            ym19, [grain_lutq+offxyq*2+82*4]
    vinserti32x8    m19, [grain_lutq+offxyq*2+82*6], 1
    punpckldq       m16, m17
    punpckldq       m17, m18, m19
    punpcklwd       m16, m17
    movu            ym1, [grain_lutq+top_offxyq*2+82*0]
    movd            xm17, [grain_lutq+topleft_offxyq*2+82*0]
    mova            m0, m20
    vpdpwssd        m0, m16, m10
%if %3
    punpcklwd       xm17, xm1
    mova            xm16, xm20
    vpdpwssd        xm16, xm17, xm10
    psrad           xm16, 1
%else
    vinserti32x8    m1, [grain_lutq+top_offxyq*2+82*2], 1
    vinserti32x4    m17, [grain_lutq+topleft_offxyq*2+82*2], 2
    punpcklwd       m17, m1
    mova            m16, m20
    vpdpwssd        m16, m17, m10
    psrad           m16, 1
%endif
    psrad           m0, 1
    packssdw        m0, m16
    vpsravw         m0, m21
    vmovdqa32       m18{k3}, m0
    vpshufd         m19{k3}, m0, q0321
%if %3
    vpunpckhdq      ym1{k3}, ym0, ym0
    punpcklwd       ym17, ym1, ym18
    punpckhwd       ym1, ym18
%else
    vpunpckhdq      m1{k3}, m0, m0
    punpcklwd       m16, m1, m18
    punpckhwd       m17, m1, m18
%endif
%else
    movu            m18, [grain_lutq+offxyq*2+82*0]
    movu            m19, [grain_lutq+top_offxyq*2+82*0]
    movd            xm17, [grain_lutq+left_offxyq*2+82*0]
    pinsrd          xm17, [grain_lutq+topleft_offxyq*2+82*0], 1
    punpckldq       xm16, xm18, xm19
    punpcklwd       xm17, xm16
    movu            m2, [grain_lutq+offxyq*2+82*2]
    movu            m0, [grain_lutq+top_offxyq*2+82*2]
    movd            xm16, [grain_lutq+left_offxyq*2+82*2]
    pinsrd          xm16, [grain_lutq+topleft_offxyq*2+82*2], 1
    punpckldq       xm1, xm2, xm0
    punpcklwd       xm1, xm16, xm1
    mova            xm16, xm20
    vpdpwssd        xm16, xm17, xm10
    mova            xm17, xm20
    vpdpwssd        xm17, xm1, xm10
    punpckhwd       m1, m19, m18                ; row 0: {top, cur} pairs
    punpcklwd       m19, m18
    punpckhwd       m18, m2, m0                 ; row 1: {cur, top} pairs
    punpcklwd       m2, m0
    psrad           xm16, 1
    psrad           xm17, 1
    packssdw        xm16, xm17
    vpsravw         xm16, xm21
    ; The two shuffles differ because row 0 is interleaved {top, cur}
    ; while row 1 is interleaved {cur, top} (see the punpck*wd above).
    vpshuflw        m19{k4}, m16, q1302
    punpckhqdq      xm16, xm16
    vpshuflw        m2{k4}, m16, q3120
%endif
    call            %%add_noise_v
    sub             hb, 2<<%2
    jg              %%loop_y_h_overlap
    add             wq, 32>>%2
    jge             .end
    mov             srcq, r9mp
    mov             dstq, r10mp
    mov             lumaq, r11mp
    lea             srcq, [srcq+wq*2]
    lea             dstq, [dstq+wq*2]
    lea             lumaq, [lumaq+wq*(2<<%2)]
    jmp             %%hv_overlap

ALIGN function_align
; Vertical blend of the interleaved top/current grain prepared by the
; caller; leaves the blended grain in m18 (and m19 when not subsampled
; horizontally), then falls through to %%add_noise.
%%add_noise_v:
%if %3
    mova            ym16, ym20
    vpdpwssd        ym16, ym17, ym11
    mova            ym17, ym20
    vpdpwssd        ym17, ym1, ym11
    psrad           ym16, 1
    psrad           ym17, 1
    packssdw        ym16, ym17
    vpsravw         m18{k1}, m16, m21
%elif %2
    mova            m18, m20
    vpdpwssd        m18, m16, m11
    mova            m16, m20
    vpdpwssd        m16, m17, m11
    psrad           m18, 1
    psrad           m16, 1
    packssdw        m18, m16
    vpsravw         m18, m21
%else
    mova            m16, m20
    vpdpwssd        m16, m1, m11
    mova            m17, m20
    vpdpwssd        m17, m18, m11
    mova            m18, m20
    vpdpwssd        m18, m19, m11
    mova            m19, m20
    vpdpwssd        m19, m2, m11
    REPX {psrad x, 1}, m16, m17, m18, m19
    packssdw        m18, m16
    packssdw        m19, m17
    vpsravw         m18, m21
    vpsravw         m19, m21
%endif
; In:  m18/m19 = grain. Loads luma and src, derives the scaling index,
;      applies the noise and stores to dst; advances srcq, dstq, lumaq
;      and grain_lutq.
%%add_noise:
%if %2
    mova            m2, [lumaq+lstrideq*(0<<%3)]
    mova            m0, [lumaq+lstrideq*(1<<%3)]
    lea             lumaq, [lumaq+lstrideq*(2<<%3)]
    mova            m3, [lumaq+lstrideq*(0<<%3)]
    mova            m1, [lumaq+lstrideq*(1<<%3)]
    ; Split each pair of luma rows through the m12/m13 word permutes and
    ; average the two results.
    mova            m16, m12
    vpermi2w        m16, m2, m0
    vpermt2w        m2, m13, m0
    mova            m17, m12
    vpermi2w        m17, m3, m1
    vpermt2w        m3, m13, m1
    pavgw           m2, m16
    pavgw           m3, m17
%elif %1
    mova            m2, [lumaq+lstrideq*0]
    mova            m3, [lumaq+lstrideq*1]
%endif
%if %2
    mova            ym16, [srcq+strideq*0]
    vinserti32x8    m16, [srcq+strideq*1], 1
    lea             srcq, [srcq+strideq*2]
%else
    mova            m16, [srcq+strideq*0]
%endif
%if %1
    ; index = (luma * uv_luma_mult + src * uv_mult + offset) >> 6
    punpckhwd       m17, m2, m16
    mova            m0, m14
    vpdpwssd        m0, m17, m15
    punpcklwd       m17, m2, m16
    mova            m2, m14
    vpdpwssd        m2, m17, m15
%endif
%if %2
    mova            ym17, [srcq+strideq*0]
    vinserti32x8    m17, [srcq+strideq*1], 1
%else
    mova            m17, [srcq+strideq*1]
%endif
%if %1
    psrad           m0, 6
    psrad           m2, 6
    packusdw        m2, m0
    punpckhwd       m0, m3, m17
    mova            m1, m14
    vpdpwssd        m1, m15, m0
    punpcklwd       m0, m3, m17
    mova            m3, m14
    vpdpwssd        m3, m15, m0
    psrad           m1, 6
    psrad           m3, 6
    packusdw        m3, m1
    pminuw          m2, m4                      ; cap the index at bdmax
    pminuw          m3, m4

; Shared tail: the .csfl expansion jumps here from its own %%add_noise.
.add_noise_main:
    ; scaling[luma_src]
    kmovw           k5, k1                      ; gathers clear their mask; reload it
    pand            m1, m5, m2
    vpgatherdd      m0{k5}, [scalingq+m1]
    kmovw           k5, k1
    psrld           m2, 16
    vpgatherdd      m1{k5}, [scalingq+m2]
    vpshufb         m0{k2}, m1, m6
    kmovw           k5, k1
    psrld           m1, m3, 16
    vpgatherdd      m2{k5}, [scalingq+m1]
    kmovw           k5, k1
    pand            m3, m5
    vpgatherdd      m1{k5}, [scalingq+m3]
    vpshufb         m1{k2}, m2, m6

    ; noise = round2(scaling[luma_src] * grain, scaling_shift)
    vpsllvw         m0, m7
    vpsllvw         m1, m7
    pmulhrsw        m18, m0
    pmulhrsw        m19, m1
    add             grain_lutq, 82*(4<<%2)
    lea             lumaq, [lumaq+lstrideq*(2<<%3)]
    lea             srcq, [srcq+strideq*2]
    paddw           m16, m18
    paddw           m17, m19
    pmaxsw          m16, m8
    pmaxsw          m17, m8
    pminsw          m16, m9
    pminsw          m17, m9
%if %2
    mova            [dstq+strideq*0], ym16
    vextracti32x8   [dstq+strideq*1], m16, 1
    lea             dstq, [dstq+strideq*2]
    mova            [dstq+strideq*0], ym17
    vextracti32x8   [dstq+strideq*1], m17, 1
%else
    mova            [dstq+strideq*0], m16
    mova            [dstq+strideq*1], m17
%endif
    lea             dstq, [dstq+strideq*2]
    ret
%else
    ; csfl: the scaling index is the luma value masked with bdmax
%if %2
    pand            m2, m4
    pand            m3, m4
%else
    pand            m2, m4, [lumaq+lstrideq*0]
    pand            m3, m4, [lumaq+lstrideq*1]
%endif
    jmp             .add_noise_main
%endif
%endmacro

    %%FGUV_32x32xN_LOOP 1, %2, %3
.csfl:
    %%FGUV_32x32xN_LOOP 0, %2, %3
.end:
    RET
%endmacro

; Instantiate the chroma kernels: name, ss_hor, ss_ver.
FGUV_FN 420, 1, 1
FGUV_FN 422, 1, 0
FGUV_FN 444, 0, 0

%endif ; ARCH_X86_64