; File viewer metadata: NASM source, 813 lines, 28 KiB
; Copyright © 2022, VideoLAN and dav1d authors
; Copyright © 2022, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"
%include "x86/filmgrain_common.asm"

%if ARCH_X86_64

SECTION_RODATA 64

; Layout notes (the code below depends on the ORDER of these tables):
;  * pb_even and pb_odd are loaded with mova as full 64-byte vectors, so they
;    must stay first in this 64-byte aligned section.
;  * The 4:2:2 vertical-overlap setup loads 32 bytes starting at pb_27_17 and
;    broadcasts dword 0 of each 16-byte lane, so pb_17_27 must stay exactly
;    16 bytes after pb_27_17 (pb_27_17, pb_23_22, pw_8, pw_1024 = 4 bytes each).
;  * fg_max / fg_min / noise_rnd are indexed with computed byte offsets.

; vpermi2b/vpermt2b indices picking the even / odd bytes out of a pair of
; 64-byte registers (used to gather horizontally adjacent luma bytes)
pb_even:       db   0,   2,   4,   6,   8,  10,  12,  14,  16,  18,  20,  22,  24,  26,  28,  30
               db  32,  34,  36,  38,  40,  42,  44,  46,  48,  50,  52,  54,  56,  58,  60,  62
               db  64,  66,  68,  70,  72,  74,  76,  78,  80,  82,  84,  86,  88,  90,  92,  94
               db  96,  98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126
pb_odd:        db   1,   3,   5,   7,   9,  11,  13,  15,  17,  19,  21,  23,  25,  27,  29,  31
               db  33,  35,  37,  39,  41,  43,  45,  47,  49,  51,  53,  55,  57,  59,  61,  63
               db  65,  67,  69,  71,  73,  75,  77,  79,  81,  83,  85,  87,  89,  91,  93,  95
               db  97,  99, 101, 103, 105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127
; pshufb pattern pairing byte 8+i with byte i of each 16-byte lane, so a single
; shuffled scaling vector feeds both the low-half and high-half pmaddubsw
interleave_hl: db   8,   0,   9,   1,  10,   2,  11,   3,  12,   4,  13,   5,  14,   6,  15,   7
; horizontal overlap weight pairs (left, cur), broadcast as one qword
pb_27_17_17_27:        db  27,  17,  17,  27,   0,  32,   0,  32
pb_23_22_0_32:         db  23,  22,   0,  32,   0,  32,   0,  32
pb_27_17:      times 2 db  27,  17
pb_23_22:      times 2 db  23,  22
pw_8:          times 2 dw   8
pw_1024:       times 2 dw   1024
pb_17_27:      times 2 db  17,  27
; clamp limits; luma indexes fg_max with clip*8, chroma with (clip<<is_id)*4,
; both index fg_min with clip*4
fg_max:        times 4 db 255
               times 4 db 240
               times 4 db 235
fg_min:        times 4 db   0
               times 4 db  16
; pmulhrsw multipliers, indexed with scaling_shift*4-32
noise_rnd:     times 2 dw 128
               times 2 dw  64
               times 2 dw  32
               times 2 dw  16

SECTION .text

;------------------------------------------------------------------------------
; fgy_32x32xn_8bpc (AVX-512 ICL, x86inc calling-convention abstraction)
;
; Adds scaled grain from grain_lut to 8-bit src and stores the clamped result
; to dst. Work is done in 32-byte-wide columns (wq advances by 32) and two rows
; per .add_noise* call (hb decreases by 2).
;
; Named args: dst, src, stride, fg_data, w, scaling, grain_lut, h, sby
;             (grain_lut, h and sby are re-read from their stack/home slots
;             via grain_lutmp / hm / sbym)
;
; Register roles after the prologue:
;   m0-m3  256-byte scaling table (scalingq+0..255), indexed by source byte
;   m4     interleave_hl shuffle
;   m5     zero
;   m6     pmulhrsw multiplier picked from noise_rnd by scaling_shift
;   m7/m8  min / max clamp bytes picked by clip_to_restricted_range
;   m9     pw_1024 (pmulhrsw by 1024 == rounded >>5)
;   m10    horizontal overlap weights 27,17,17,27,0,32,0,32
;   m12    vertical overlap weights: 27,17 in the low 256 bits (first row),
;          17,27 in the high 256 bits (second row)
;   k1     0x0000000f0000000f: byte mask for bytes 0-3 and 32-35 in the
;          masked packsswb merges; its low bits also select the low four
;          qwords in the vmovdqa64 that builds m12
;   after `sub dstq, srcq`, dstq holds dst-src so [dstq+srcq] addresses dst
;------------------------------------------------------------------------------
INIT_ZMM avx512icl
cglobal fgy_32x32xn_8bpc, 6, 13, 22, dst, src, stride, fg_data, w, scaling, \
                                     grain_lut, h, sby, see, overlap
%define base r11-fg_min
    lea             r11, [fg_min]
    mov             r6d, [fg_dataq+FGData.scaling_shift]
    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
    mov            sbyd, sbym
    mov        overlapd, [fg_dataq+FGData.overlap_flag]
    mov             r12, 0x0000000f0000000f ; h_overlap mask
    mova             m0, [scalingq+64*0]
    mova             m1, [scalingq+64*1]
    mova             m2, [scalingq+64*2]
    mova             m3, [scalingq+64*3]
    kmovq            k1, r12
    vbroadcasti32x4  m4, [base+interleave_hl]
    vpbroadcastd   ym16, [base+pb_27_17]
    vpbroadcastd    m12, [base+pb_17_27]
    ; index is (scaling_shift-8)*4; assumes scaling_shift is in 8..11 so the
    ; load stays inside noise_rnd -- TODO confirm against the caller
    vpbroadcastd     m6, [base+noise_rnd+r6*4-32]
    test           sbyd, sbyd
    setnz           r6b                     ; r6b = (sby != 0)
    vpbroadcastd     m7, [base+fg_min+r7*4]
    vpbroadcastd     m8, [base+fg_max+r7*8]
    pxor             m5, m5
    vpbroadcastd     m9, [base+pw_1024]
    vpbroadcastq    m10, [base+pb_27_17_17_27]
    vmovdqa64   m12{k1}, m16                ; low 256 bits <- 27,17
    test            r6b, overlapb           ; (sby != 0) & overlap_flag bit 0
    jnz .v_overlap

    ; derive the per-row seed from sby and FGData.seed
    imul           seed, sbyd, (173 << 24) | 37
    add            seed, (105 << 24) | 178
    rorx           seed, seed, 24
    movzx          seed, seew
    xor            seed, [fg_dataq+FGData.seed]

    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
                h, sby, see, overlap

    lea        src_bakq, [srcq+wq]          ; end-of-row pointer; wq counts up to 0
    neg              wq
    sub            dstq, srcq
.loop_x:
    ; advance the 16-bit seed: shift right by one, feeding back a parity bit
    rorx             r6, seeq, 1
    or             seed, 0xeff4
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d                ; updated seed
    rorx          offyd, seed, 8
    rorx          offxq, seeq, 12
    and           offyd, 0xf
    imul          offyd, 164
    lea           offxd, [offyq+offxq*2+829] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
                h, sby, see, overlap

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
.loop_y:
    ; two consecutive grain rows (82 bytes apart) in the low/high 256 bits
    movu           ym21, [grain_lutq+offxyq-82]
    vinserti32x8    m21, [grain_lutq+offxyq+ 0], 1
    call .add_noise
    sub              hb, 2
    jg .loop_y
    add              wq, 32
    jge .end
    lea            srcq, [src_bakq+wq]
    test       overlapd, overlapd
    jz .loop_x
    test           sbyd, sbyd
    jnz .hv_overlap

.loop_x_h_overlap:
    rorx             r6, seeq, 1
    or             seed, 0xeff4
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d                ; updated seed

    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
                h, sby, see, left_offxy

    rorx          offyd, seed, 8
    mov     left_offxyd, offxd              ; previous column's offy*stride
    rorx          offxq, seeq, 12
    and           offyd, 0xf
    imul          offyd, 164
    lea           offxd, [offyq+offxq*2+829] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
                h, sby, see, left_offxy

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
.loop_y_h_overlap:
    movu           ym20, [grain_lutq+offxyq-82]
    vinserti32x8    m20, [grain_lutq+offxyq+ 0], 1
    ; 4 bytes per row from the previous column, 32 bytes past its offset
    movd           xm19, [grain_lutq+left_offxyq-50]
    vinserti32x4    m19, [grain_lutq+left_offxyq+32], 2
    punpcklbw       m19, m20                ; { left, cur } byte pairs
    pmaddubsw       m19, m10, m19
    pmulhrsw        m19, m9
    punpckhbw       m21, m20, m5            ; high half taken before m20 is patched
    packsswb    m20{k1}, m19, m19           ; replace the first bytes of each row
    punpcklbw       m20, m5, m20
    call .add_noise_h
    sub              hb, 2
    jg .loop_y_h_overlap
    add              wq, 32
    jge .end
    lea            srcq, [src_bakq+wq]
    test           sbyd, sbyd
    jnz .hv_overlap
    jmp .loop_x_h_overlap

.v_overlap:
    DEFINE_ARGS dst, src, stride, fg_data, w, offy, offx, \
                h, sby, see, overlap

    ; compute the seeds of the current and the previous row in parallel,
    ; packed into the two 16-bit halves of one register
    movzx           r6d, sbyb
    imul           seed, [fg_dataq+FGData.seed], 0x00010001
    imul            r7d, r6d, 173 * 0x00010001
    imul            r6d, 37 * 0x01000100
    add             r7d, (105 << 16) | 188
    add             r6d, (178 << 24) | (141 << 8)
    and             r7d, 0x00ff00ff
    and             r6d, 0xff00ff00
    xor            seed, r7d
    xor            seed, r6d                ; (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
                h, sby, see, overlap

    lea        src_bakq, [srcq+wq]
    neg              wq
    sub            dstq, srcq

    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b                     ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offxd, [offyq+offxq*2+0x10001*829+32*82]

    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
                h, sby, see, overlap, top_offxy

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
    movzx    top_offxyd, offxyw
    shr          offxyd, 16
    movu           ym19, [grain_lutq+offxyq-82]
    vinserti32x8    m19, [grain_lutq+offxyq+ 0], 1
    movu           ym21, [grain_lutq+top_offxyq-82]
    vinserti32x8    m21, [grain_lutq+top_offxyq+ 0], 1
    punpckhbw       m20, m21, m19           ; { top, cur } byte pairs
    punpcklbw       m21, m19
    call .add_noise_v
    sub              hb, 2
    jg .loop_y                              ; only the first two rows are blended
    add              wq, 32
    jge .end
    lea            srcq, [src_bakq+wq]

    ; since fg_dataq.overlap is guaranteed to be set, we never jump back
    ; to .v_overlap, and instead always fall-through to h+v overlap
.hv_overlap:
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b                     ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \
                h, sby, see, left_offxy, top_offxy, topleft_offxy

    mov  topleft_offxyd, top_offxyd
    rorx          offyd, seed, 8
    mov     left_offxyd, offxd
    rorx          offxd, seed, 12
    and           offyd, 0xf000f
    and           offxd, 0xf000f
    imul          offyd, 164
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offxd, [offyq+offxq*2+0x10001*829+32*82]

    DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \
                h, sby, see, left_offxy, top_offxy, topleft_offxy

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
    movzx    top_offxyd, offxyw
    shr          offxyd, 16
    movu           ym19, [grain_lutq+offxyq-82]
    vinserti32x8    m19, [grain_lutq+offxyq+ 0], 1
    movd           xm16, [grain_lutq+left_offxyq-50]
    vinserti32x4    m16, [grain_lutq+left_offxyq+32], 2
    movu           ym21, [grain_lutq+top_offxyq-82]
    vinserti32x8    m21, [grain_lutq+top_offxyq+ 0], 1
    movd           xm17, [grain_lutq+topleft_offxyq-50]
    vinserti32x4    m17, [grain_lutq+topleft_offxyq+32], 2
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklbw       m16, m19
    pmaddubsw       m16, m10, m16
    punpcklbw       m17, m21
    pmaddubsw       m17, m10, m17
    punpckhbw       m20, m21, m19
    pmulhrsw        m16, m9
    pmulhrsw        m17, m9
    packsswb    m19{k1}, m16, m16
    packsswb    m21{k1}, m17, m17
    ; followed by v interpolation (top | cur -> cur)
    punpcklbw       m21, m19
    call .add_noise_v
    sub              hb, 2
    jg .loop_y_h_overlap
    add              wq, 32
    lea            srcq, [src_bakq+wq]      ; lea leaves the flags of the add intact
    jl .hv_overlap
.end:
    RET
ALIGN function_align
; Three entry points into one tail:
;   .add_noise_v  m20/m21 = { top, cur } byte pairs (high/low halves);
;                 blends them with the m12 weights, then falls through
;   .add_noise    m21 = grain bytes; expands them to the layout below
;   .add_noise_h  m20 = { 0, grain } byte pairs of the low halves,
;                 m21 = { grain, 0 } byte pairs of the high halves
.add_noise_v:
    pmaddubsw       m20, m12, m20
    pmaddubsw       m21, m12, m21
    pmulhrsw        m20, m9
    pmulhrsw        m21, m9
    packsswb        m21, m20
.add_noise:
    punpcklbw       m20, m5, m21
    punpckhbw       m21, m5
.add_noise_h:
    mova           ym18, [srcq+strideq*0]
    vinserti32x8    m18, [srcq+strideq*1], 1
    mova            m19, m0
    punpcklbw       m16, m18, m5            ; source bytes widened to words
    vpermt2b        m19, m18, m1 ; scaling[  0..127]
    vpmovb2m         k2, m18                ; k2 = bytes with bit 7 set (src >= 128)
    punpckhbw       m17, m18, m5
    vpermi2b        m18, m2, m3  ; scaling[128..255]
    vmovdqu8    m19{k2}, m18     ; scaling[src]
    pshufb          m19, m4                 ; { scaling[8+i], scaling[i] } pairs
    pmaddubsw       m18, m19, m20           ; scaling * grain, low halves
    pmaddubsw       m19, m21                ; scaling * grain, high halves
    add      grain_lutq, 82*2               ; advance two grain rows
    pmulhrsw        m18, m6      ; noise
    pmulhrsw        m19, m6
    paddw           m16, m18
    paddw           m17, m19
    packuswb        m16, m17
    pmaxub          m16, m7
    pminub          m16, m8
    mova    [dstq+srcq], ym16
    add            srcq, strideq
    vextracti32x8 [dstq+srcq], m16, 1
    add            srcq, strideq
    ret

;------------------------------------------------------------------------------
; FGUV_FN name, ss_hor, ss_ver
;
; Emits fguv_32x32xn_i<name>_8bpc, which adds scaled grain to 8-bit chroma.
; A column is 32>>ss_hor bytes wide and each %%add_noise* call handles
; 2<<ss_hor chroma rows. When ss_hor is set, horizontally adjacent luma bytes
; are averaged (pb_even/pb_odd gathers + pavgb) before use.
;
; The body is expanded twice through %%FGUV_32x32xN_LOOP:
;   first with %1=1: the scaling index is built from { luma, chroma } pairs
;     weighted by m14 (uv_luma_mult, uv_mult), >>6, plus m15 (uv_offset);
;   then at .csfl with %1=0 (chroma_scaling_from_luma != 0): the luma value
;     indexes the scaling table directly, and the tail of the first expansion
;     is reused via .add_noise_main.
;
; Relies on the file-level `%define base r11-fg_min` made earlier in this file.
; Register roles m0-m9 match fgy_32x32xn_8bpc; additionally m11/m12 hold
; pb_even/pb_odd, m10 the horizontal and m13 the vertical overlap weights.
;------------------------------------------------------------------------------
%macro FGUV_FN 3 ; name, ss_hor, ss_ver
cglobal fguv_32x32xn_i%1_8bpc, 6, 14+%2, 22, dst, src, stride, fg_data, w, \
                                             scaling, grain_lut, h, sby, luma, \
                                             overlap, uv_pl, is_id, _, stride3
    lea             r11, [fg_min]
    mov             r6d, [fg_dataq+FGData.scaling_shift]
    mov             r7d, [fg_dataq+FGData.clip_to_restricted_range]
    mov             r9d, is_idm             ; r9 is only a temporary here; luma is loaded later
    mov            sbyd, sbym
    mov        overlapd, [fg_dataq+FGData.overlap_flag]
%if %2
    mov             r12, 0x000f000f000f000f ; h_overlap mask
    vpbroadcastq    m10, [base+pb_23_22_0_32]
    lea        stride3q, [strideq*3]
%else
    mov             r12, 0x0000000f0000000f
    vpbroadcastq    m10, [base+pb_27_17_17_27]
%endif
    mova             m0, [scalingq+64*0]
    mova             m1, [scalingq+64*1]
    mova             m2, [scalingq+64*2]
    mova             m3, [scalingq+64*3]
    kmovq            k1, r12
    vbroadcasti32x4  m4, [base+interleave_hl]
    vpbroadcastd     m6, [base+noise_rnd+r6*4-32]
    vpbroadcastd     m7, [base+fg_min+r7*4]
    shlx            r7d, r7d, r9d           ; clip << is_id selects the fg_max entry
    vpbroadcastd     m8, [base+fg_max+r7*4]
    test           sbyd, sbyd
    setnz           r7b                     ; r7b = (sby != 0)
    vpbroadcastd     m9, [base+pw_1024]
    mova            m11, [base+pb_even]
    mova            m12, [base+pb_odd]
    pxor             m5, m5
    mov              r5, r10mp              ; lstride
    cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
    jne .csfl

%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
    DEFINE_ARGS dst, src, stride, fg_data, w, lstride, grain_lut, \
                h, sby, see, overlap, uv_pl, _, _, stride3
%if %1
    mov             r6d, uv_plm
    vpbroadcastd    m16, [base+pw_8]        ; shuffle control bytes 8,0,8,0,...
    vbroadcasti32x4 m14, [fg_dataq+FGData.uv_mult+r6*4]
    vpbroadcastw    m15, [fg_dataq+FGData.uv_offset+r6*4]
    pshufb          m14, m16                ; uv_luma_mult, uv_mult
%endif
    test            r7b, overlapb           ; (sby != 0) & overlap_flag bit 0
    jnz %%v_overlap

    imul           seed, sbyd, (173 << 24) | 37
    add            seed, (105 << 24) | 178
    rorx           seed, seed, 24
    movzx          seed, seew
    xor            seed, [fg_dataq+FGData.seed]

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                offx, offy, see, overlap, _, _, _, stride3

    mov           lumaq, r9mp
    lea             r11, [srcq+wq]
    lea             r12, [dstq+wq]
    ; (1<<%2), written the same way as every other luma pointer computation
    ; in this macro (identical value for ss_hor 0/1)
    lea             r13, [lumaq+wq*(1<<%2)]
    mov           r11mp, r11                ; row-end pointers for src and dst are kept
    mov           r12mp, r12                ; in memory slots and reloaded per column
    neg              wq

%%loop_x:
    rorx             r6, seeq, 1
    or             seed, 0xeff4
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d                ; updated seed
    rorx          offyd, seed, 8
    rorx          offxq, seeq, 12
    and           offyd, 0xf
    imul          offyd, 164>>%3
    lea           offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                h, offxy, see, overlap, _, _, _, stride3

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
%%loop_y:
%if %2
    ; four 16-byte grain rows, one per 128-bit lane
    movu           xm21, [grain_lutq+offxyq+82*0]
    vinserti128    ym21, [grain_lutq+offxyq+82*1], 1
    vinserti32x4    m21, [grain_lutq+offxyq+82*2], 2
    vinserti32x4    m21, [grain_lutq+offxyq+82*3], 3
%else
    movu           ym21, [grain_lutq+offxyq+82*0]
    vinserti32x8    m21, [grain_lutq+offxyq+82*1], 1
%endif
    call %%add_noise
    sub              hb, 2<<%2
    jg %%loop_y
    add              wq, 32>>%2
    jge .end
    mov            srcq, r11mp
    mov            dstq, r12mp
    lea           lumaq, [r13+wq*(1<<%2)]
    add            srcq, wq
    add            dstq, wq
    test       overlapd, overlapd
    jz %%loop_x
    cmp       dword r8m, 0 ; sby
    jne %%hv_overlap

    ; horizontal overlap (without vertical overlap)
%%loop_x_h_overlap:
    rorx             r6, seeq, 1
    or             seed, 0xeff4
    test           seeb, seeh
    lea            seed, [r6+0x8000]
    cmovp          seed, r6d                ; updated seed

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                offx, offy, see, left_offxy, _, _, _, stride3

    lea     left_offxyd, [offyq+(32>>%2)]   ; previous column's offy*stride+offx
    rorx          offyd, seed, 8
    rorx          offxq, seeq, 12
    and           offyd, 0xf
    imul          offyd, 164>>%3
    lea           offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                h, offxy, see, left_offxy, _, _, _, stride3

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
%%loop_y_h_overlap:
%if %2
    movu           xm20, [grain_lutq+offxyq     +82*0]
    movd           xm19, [grain_lutq+left_offxyq+82*0]
    vinserti32x4   ym20, [grain_lutq+offxyq     +82*1], 1
    vinserti32x4   ym19, [grain_lutq+left_offxyq+82*1], 1
    vinserti32x4    m20, [grain_lutq+offxyq     +82*2], 2
    vinserti32x4    m19, [grain_lutq+left_offxyq+82*2], 2
    vinserti32x4    m20, [grain_lutq+offxyq     +82*3], 3
    vinserti32x4    m19, [grain_lutq+left_offxyq+82*3], 3
%else
    movu           ym20, [grain_lutq+offxyq     + 0]
    movd           xm19, [grain_lutq+left_offxyq+ 0]
    vinserti32x8    m20, [grain_lutq+offxyq     +82], 1
    vinserti32x4    m19, [grain_lutq+left_offxyq+82], 2
%endif
    punpcklbw       m19, m20                ; { left, cur } byte pairs
    pmaddubsw       m19, m10, m19
    punpckhbw       m21, m20, m5            ; high half taken before m20 is patched
    pmulhrsw        m19, m9
    vpacksswb   m20{k1}, m19, m19           ; replace the leading bytes of each row
    punpcklbw       m20, m5, m20
    call %%add_noise_h
    sub              hb, 2<<%2
    jg %%loop_y_h_overlap
    add              wq, 32>>%2
    jge .end
    mov            srcq, r11mp
    mov            dstq, r12mp
    lea           lumaq, [r13+wq*(1<<%2)]
    add            srcq, wq
    add            dstq, wq
    cmp       dword r8m, 0 ; sby
    jne %%hv_overlap
    jmp %%loop_x_h_overlap

%%v_overlap:
    DEFINE_ARGS dst, src, stride, fg_data, w, lstride, grain_lut, \
                _, sby, see, overlap, _, _, _, stride3

    movzx          sbyd, sbyb
    imul           seed, [fg_dataq+FGData.seed], 0x00010001
    imul            r7d, sbyd, 173 * 0x00010001
    imul           sbyd, 37 * 0x01000100
    add             r7d, (105 << 16) | 188
    add            sbyd, (178 << 24) | (141 << 8)
    and             r7d, 0x00ff00ff
    and            sbyd, 0xff00ff00
    xor            seed, r7d
    xor            seed, sbyd               ; (cur_seed << 16) | top_seed

    ; m13 = vertical overlap weights, k3 = bytes that receive blended rows
%if %3
    vpbroadcastd    m13, [base+pb_23_22]
    kxnorw           k3, k3, k3             ; v_overlap mask
%elif %2
    ; 32-byte load: lane 0 starts at pb_27_17, lane 1 at pb_17_27 (16 bytes on)
    vbroadcasti32x8 m13, [base+pb_27_17]
    kxnord           k3, k3, k3
    pshufd          m13, m13, q0000         ; 8x27_17, 8x17_27
%else
    vpbroadcastd   ym16, [base+pb_27_17]
    vpbroadcastd    m13, [base+pb_17_27]
    vmovdqa64   m13{k1}, m16
%endif

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                offx, offy, see, overlap, top_offxy, _, _, stride3

    mov           lumaq, r9mp
    lea             r11, [srcq+wq]
    lea             r12, [dstq+wq]
    lea             r13, [lumaq+wq*(1<<%2)]
    mov           r11mp, r11
    mov           r12mp, r12
    neg              wq

    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b                     ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed
    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0x000f000f
    and           offxd, 0x000f000f
    imul          offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                h, offxy, see, overlap, top_offxy, _, _, stride3

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
    movzx    top_offxyd, offxyw
    shr          offxyd, 16

%if %3
    movu           xm18, [grain_lutq+offxyq+82*0]
    movu           xm20, [grain_lutq+top_offxyq+82*0]
    ; only interpolate first line, insert remaining line unmodified
    vbroadcasti128 ym21, [grain_lutq+offxyq+82*1]
    vinserti32x4    m21, [grain_lutq+offxyq+82*2], 2
    vinserti32x4    m21, [grain_lutq+offxyq+82*3], 3
    punpcklbw      xm19, xm20, xm18
    punpckhbw      xm20, xm18
%elif %2
    movu           xm18, [grain_lutq+offxyq+82*0]
    vinserti128    ym18, [grain_lutq+offxyq+82*1], 1
    movu           xm20, [grain_lutq+top_offxyq+82*0]
    vinserti32x4   ym20, [grain_lutq+top_offxyq+82*1], 1
    vbroadcasti32x4 m21, [grain_lutq+offxyq+82*2]
    vinserti32x4    m21, [grain_lutq+offxyq+82*3], 3
    punpcklbw      ym19, ym20, ym18
    punpckhbw      ym20, ym18
%else
    movu           ym21, [grain_lutq+offxyq+82*0]
    vinserti32x8    m21, [grain_lutq+offxyq+82*1], 1
    movu           ym20, [grain_lutq+top_offxyq+82*0]
    vinserti32x8    m20, [grain_lutq+top_offxyq+82*1], 1
%endif
    call %%add_noise_v
    sub              hb, 2<<%2
    jg %%loop_y                             ; remaining rows need no vertical blend
    add              wq, 32>>%2
    jge .end
    mov            srcq, r11mp
    mov            dstq, r12mp
    lea           lumaq, [r13+wq*(1<<%2)]
    add            srcq, wq
    add            dstq, wq

%%hv_overlap:
    ; we assume from the block above that bits 8-15 of r7d are zero'ed
    mov             r6d, seed
    or             seed, 0xeff4eff4
    test           seeb, seeh
    setp            r7b                     ; parity of top_seed
    shr            seed, 16
    shl             r7d, 16
    test           seeb, seeh
    setp            r7b                     ; parity of cur_seed
    or              r6d, 0x00010001
    xor             r7d, r6d
    rorx           seed, r7d, 1             ; updated (cur_seed << 16) | top_seed

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                offx, offy, see, left_offxy, top_offxy, topleft_offxy, _, stride3

    lea  topleft_offxyd, [top_offxyq+(32>>%2)]
    lea     left_offxyd, [offyq+(32>>%2)]
    rorx          offyd, seed, 8
    rorx          offxd, seed, 12
    and           offyd, 0x000f000f
    and           offxd, 0x000f000f
    imul          offyd, 164>>%3
    ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
    lea           offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]

    DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \
                h, offxy, see, left_offxy, top_offxy, topleft_offxy, _, stride3

    mov      grain_lutq, grain_lutmp
    mov              hd, hm
    movzx    top_offxyd, offxyw
    shr          offxyd, 16

%if %2
    movu           xm21, [grain_lutq+offxyq+82*0]
    movd           xm16, [grain_lutq+left_offxyq+82*0]
    vinserti128    ym21, [grain_lutq+offxyq+82*1], 1
    vinserti128    ym16, [grain_lutq+left_offxyq+82*1], 1
    vinserti32x4    m21, [grain_lutq+offxyq+82*2], 2
    vinserti32x4    m16, [grain_lutq+left_offxyq+82*2], 2
    vinserti32x4    m21, [grain_lutq+offxyq+82*3], 3
    vinserti32x4    m16, [grain_lutq+left_offxyq+82*3], 3
    movd           xm18, [grain_lutq+topleft_offxyq+82*0]
    movu           xm20, [grain_lutq+top_offxyq]
    ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
    punpcklbw       m16, m21
%if %3
    punpcklbw      xm18, xm20
%else
    vinserti128    ym18, [grain_lutq+topleft_offxyq+82*1], 1
    vinserti128    ym20, [grain_lutq+top_offxyq+82*1], 1
    punpcklbw      ym18, ym20
%endif
    punpcklqdq      m16, m18                ; cur pairs in the low qword, top pairs above
    pmaddubsw       m16, m10, m16
    pmulhrsw        m16, m9
    packsswb        m16, m16
    vmovdqu8    m21{k1}, m16
%if %3
    vpalignr   xm20{k1}, xm16, xm16, 4
    punpcklbw      xm19, xm20, xm21
    punpckhbw      xm20, xm21
%else
    vpalignr   ym20{k1}, ym16, ym16, 4
    punpcklbw      ym19, ym20, ym21
    punpckhbw      ym20, ym21
%endif
%else
    movu           ym21, [grain_lutq+offxyq+82*0]
    vinserti32x8    m21, [grain_lutq+offxyq+82*1], 1
    movd           xm16, [grain_lutq+left_offxyq+82*0]
    vinserti32x4    m16, [grain_lutq+left_offxyq+82*1], 2
    movu           ym20, [grain_lutq+top_offxyq+82*0]
    vinserti32x8    m20, [grain_lutq+top_offxyq+82*1], 1
    movd           xm18, [grain_lutq+topleft_offxyq+82*0]
    vinserti32x4    m18, [grain_lutq+topleft_offxyq+82*1], 2
    punpcklbw       m16, m21
    punpcklbw       m18, m20
    punpcklqdq      m16, m18
    pmaddubsw       m16, m10, m16
    pmulhrsw        m16, m9
    packsswb        m16, m16
    vpalignr    m20{k1}, m16, m16, 4
    vmovdqu8    m21{k1}, m16
%endif
    call %%add_noise_v
    sub              hb, 2<<%2
    jg %%loop_y_h_overlap
    add              wq, 32>>%2
    jge .end
    mov            srcq, r11mp
    mov            dstq, r12mp
    lea           lumaq, [r13+wq*(1<<%2)]
    add            srcq, wq
    add            dstq, wq
    jmp %%hv_overlap
ALIGN function_align
; %%add_noise_v: blend the { top, cur } pairs with m13 and merge into m21
; %%add_noise:   m21 = grain bytes, expanded to the m20/m21 layout below
; %%add_noise_h: m20 = { 0, grain } low halves, m21 = { grain, 0 } high halves
%%add_noise_v:
%if %3
    pmaddubsw      xm19, xm13, xm19
    pmaddubsw      xm20, xm13, xm20
    pmulhrsw       xm19, xm9
    pmulhrsw       xm20, xm9
    vpacksswb   m21{k3}, m19, m20           ; k3 = low 16 bytes (first row only)
%elif %2
    pmaddubsw      ym19, ym13, ym19
    pmaddubsw      ym20, ym13, ym20
    pmulhrsw       ym19, ym9
    pmulhrsw       ym20, ym9
    vpacksswb   m21{k3}, m19, m20           ; k3 = low 32 bytes (first two rows)
%else
    punpcklbw       m19, m20, m21
    punpckhbw       m20, m21
    pmaddubsw       m19, m13, m19
    pmaddubsw       m20, m13, m20
    pmulhrsw        m19, m9
    pmulhrsw        m20, m9
    packsswb        m21, m19, m20
%endif
%%add_noise:
    punpcklbw       m20, m5, m21
    punpckhbw       m21, m5
%%add_noise_h:
    mova           ym18, [lumaq+lstrideq*(0<<%3)]
    vinserti32x8    m18, [lumaq+lstrideq*(1<<%3)], 1
%if %2
    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
    mova           ym16, [lumaq+lstrideq*(0<<%3)]
    vinserti32x8    m16, [lumaq+lstrideq*(1<<%3)], 1
    mova           xm17, [srcq+strideq*0]
    mova            m19, m11
    vpermi2b        m19, m18, m16           ; even luma bytes
    vinserti128    ym17, [srcq+strideq*1], 1
    vpermt2b        m18, m12, m16           ; odd luma bytes
    vinserti32x4    m17, [srcq+strideq*2], 2
    pavgb           m18, m19                ; average of each horizontal luma pair
    vinserti32x4    m17, [srcq+stride3q ], 3
%else
    mova           ym17, [srcq+strideq*0]
    vinserti32x8    m17, [srcq+strideq*1], 1
%endif
%if %1
    punpckhbw       m19, m18, m17
    punpcklbw       m18, m17         ; { luma, chroma }
    pmaddubsw       m19, m14
    pmaddubsw       m18, m14
    psraw           m19, 6
    psraw           m18, 6
    paddw           m19, m15
    paddw           m18, m15
    packuswb        m18, m19
; shared tail: m18 = scaling index bytes, m17 = chroma source bytes
.add_noise_main:
    mova            m19, m0
    vpermt2b        m19, m18, m1     ; scaling[  0..127]
    vpmovb2m         k2, m18
    vpermi2b        m18, m2, m3      ; scaling[128..255]
    vmovdqu8    m19{k2}, m18         ; scaling[src]
    pshufb          m19, m4
    pmaddubsw       m18, m19, m20
    pmaddubsw       m19, m21
    add      grain_lutq, 82*2<<%2
    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
    lea            srcq, [srcq+strideq*(2<<%2)]
    pmulhrsw        m18, m6          ; noise
    pmulhrsw        m19, m6
    punpcklbw       m16, m17, m5     ; chroma
    punpckhbw       m17, m5
    paddw           m16, m18
    paddw           m17, m19
    packuswb        m16, m17
    pmaxub          m16, m7
    pminub          m16, m8
%if %2
    mova          [dstq+strideq*0], xm16
    vextracti128  [dstq+strideq*1], ym16, 1
    vextracti32x4 [dstq+strideq*2], m16, 2
    vextracti32x4 [dstq+stride3q ], m16, 3
%else
    mova          [dstq+strideq*0], ym16
    vextracti32x8 [dstq+strideq*1], m16, 1
%endif
    lea            dstq, [dstq+strideq*(2<<%2)]
    ret
%else
    ; csfl expansion: luma indexes the scaling table directly, reuse the tail
    jmp .add_noise_main
%endif
%endmacro

    %%FGUV_32x32xN_LOOP 1, %2, %3
.csfl:
    %%FGUV_32x32xN_LOOP 0, %2, %3
.end:
    RET
%endmacro

; instantiate the chroma functions: name, ss_hor, ss_ver
FGUV_FN 420, 1, 1
FGUV_FN 422, 1, 0
FGUV_FN 444, 0, 0

%endif ; ARCH_X86_64