314 lines
7.6 KiB
NASM
314 lines
7.6 KiB
NASM
;******************************************************************************
|
|
;* FLAC DSP SIMD optimizations
|
|
;*
|
|
;* Copyright (C) 2014 Loren Merritt
|
|
;* Copyright (C) 2014 James Almer
|
|
;*
|
|
;* This file is part of FFmpeg.
|
|
;*
|
|
;* FFmpeg is free software; you can redistribute it and/or
|
|
;* modify it under the terms of the GNU Lesser General Public
|
|
;* License as published by the Free Software Foundation; either
|
|
;* version 2.1 of the License, or (at your option) any later version.
|
|
;*
|
|
;* FFmpeg is distributed in the hope that it will be useful,
|
|
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
;* Lesser General Public License for more details.
|
|
;*
|
|
;* You should have received a copy of the GNU Lesser General Public
|
|
;* License along with FFmpeg; if not, write to the Free Software
|
|
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
;******************************************************************************
|
|
|
|
%include "libavutil/x86/x86util.asm"
|
|
|
|
SECTION .text
|
|
|
|
%macro PMACSDQL 5
|
|
%if cpuflag(xop)
|
|
pmacsdql %1, %2, %3, %1
|
|
%else
|
|
pmuldq %2, %3
|
|
paddq %1, %2
|
|
%endif
|
|
%endmacro
|
|
|
|
%macro LPC_32 1
|
|
INIT_XMM %1
|
|
cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j
|
|
sub lend, pred_orderd
|
|
jle .ret
|
|
lea decodedq, [decodedq+pred_orderq*4-8]
|
|
lea coeffsq, [coeffsq+pred_orderq*4]
|
|
neg pred_orderq
|
|
movd m4, qlevelm
|
|
ALIGN 16
|
|
.loop_sample:
|
|
movd m0, [decodedq+pred_orderq*4+8]
|
|
add decodedq, 8
|
|
movd m1, [coeffsq+pred_orderq*4]
|
|
pxor m2, m2
|
|
pxor m3, m3
|
|
lea jq, [pred_orderq+1]
|
|
test jq, jq
|
|
jz .end_order
|
|
.loop_order:
|
|
PMACSDQL m2, m0, m1, m2, m0
|
|
movd m0, [decodedq+jq*4]
|
|
PMACSDQL m3, m1, m0, m3, m1
|
|
movd m1, [coeffsq+jq*4]
|
|
inc jq
|
|
jl .loop_order
|
|
.end_order:
|
|
PMACSDQL m2, m0, m1, m2, m0
|
|
psrlq m2, m4
|
|
movd m0, [decodedq]
|
|
paddd m0, m2
|
|
movd [decodedq], m0
|
|
sub lend, 2
|
|
jl .ret
|
|
PMACSDQL m3, m1, m0, m3, m1
|
|
psrlq m3, m4
|
|
movd m1, [decodedq+4]
|
|
paddd m1, m3
|
|
movd [decodedq+4], m1
|
|
jg .loop_sample
|
|
.ret:
|
|
REP_RET
|
|
%endmacro
|
|
|
|
%if HAVE_XOP_EXTERNAL
|
|
LPC_32 xop
|
|
%endif
|
|
LPC_32 sse4
|
|
|
|
;----------------------------------------------------------------------------------
|
|
;void ff_flac_decorrelate_[lrm]s_16_sse2(uint8_t **out, int32_t **in, int channels,
|
|
; int len, int shift);
|
|
;----------------------------------------------------------------------------------
|
|
%macro FLAC_DECORRELATE_16 3-4
|
|
cglobal flac_decorrelate_%1_16, 2, 4, 4, out, in0, in1, len
|
|
%if ARCH_X86_32
|
|
mov lend, lenm
|
|
%endif
|
|
movd m3, r4m
|
|
shl lend, 2
|
|
mov in1q, [in0q + gprsize]
|
|
mov in0q, [in0q]
|
|
mov outq, [outq]
|
|
add in1q, lenq
|
|
add in0q, lenq
|
|
add outq, lenq
|
|
neg lenq
|
|
|
|
align 16
|
|
.loop:
|
|
mova m0, [in0q + lenq]
|
|
mova m1, [in1q + lenq]
|
|
%ifidn %1, ms
|
|
psrad m2, m1, 1
|
|
psubd m0, m2
|
|
%endif
|
|
%ifnidn %1, indep2
|
|
p%4d m2, m0, m1
|
|
%endif
|
|
packssdw m%2, m%2
|
|
packssdw m%3, m%3
|
|
punpcklwd m%2, m%3
|
|
psllw m%2, m3
|
|
mova [outq + lenq], m%2
|
|
add lenq, 16
|
|
jl .loop
|
|
REP_RET
|
|
%endmacro
|
|
|
|
INIT_XMM sse2
|
|
FLAC_DECORRELATE_16 ls, 0, 2, sub
|
|
FLAC_DECORRELATE_16 rs, 2, 1, add
|
|
FLAC_DECORRELATE_16 ms, 2, 0, add
|
|
|
|
;----------------------------------------------------------------------------------
|
|
;void ff_flac_decorrelate_[lrm]s_32_sse2(uint8_t **out, int32_t **in, int channels,
|
|
; int len, int shift);
|
|
;----------------------------------------------------------------------------------
|
|
%macro FLAC_DECORRELATE_32 5
|
|
cglobal flac_decorrelate_%1_32, 2, 4, 4, out, in0, in1, len
|
|
%if ARCH_X86_32
|
|
mov lend, lenm
|
|
%endif
|
|
movd m3, r4m
|
|
mov in1q, [in0q + gprsize]
|
|
mov in0q, [in0q]
|
|
mov outq, [outq]
|
|
sub in1q, in0q
|
|
|
|
align 16
|
|
.loop:
|
|
mova m0, [in0q]
|
|
mova m1, [in0q + in1q]
|
|
%ifidn %1, ms
|
|
psrad m2, m1, 1
|
|
psubd m0, m2
|
|
%endif
|
|
p%5d m2, m0, m1
|
|
pslld m%2, m3
|
|
pslld m%3, m3
|
|
|
|
SBUTTERFLY dq, %2, %3, %4
|
|
|
|
mova [outq ], m%2
|
|
mova [outq + mmsize], m%3
|
|
|
|
add in0q, mmsize
|
|
add outq, mmsize*2
|
|
sub lend, mmsize/4
|
|
jg .loop
|
|
REP_RET
|
|
%endmacro
|
|
|
|
INIT_XMM sse2
|
|
FLAC_DECORRELATE_32 ls, 0, 2, 1, sub
|
|
FLAC_DECORRELATE_32 rs, 2, 1, 0, add
|
|
FLAC_DECORRELATE_32 ms, 2, 0, 1, add
|
|
|
|
;-----------------------------------------------------------------------------------------
|
|
;void ff_flac_decorrelate_indep<ch>_<bps>_<opt>(uint8_t **out, int32_t **in, int channels,
|
|
; int len, int shift);
|
|
;-----------------------------------------------------------------------------------------
|
|
;%1 = bps
|
|
;%2 = channels
|
|
;%3 = last xmm reg used
|
|
;%4 = word/dword (shift instruction)
|
|
%macro FLAC_DECORRELATE_INDEP 4
|
|
%define REPCOUNT %2/(32/%1) ; 16bits = channels / 2; 32bits = channels
|
|
cglobal flac_decorrelate_indep%2_%1, 2, %2+2, %3+1, out, in0, in1, len, in2, in3, in4, in5, in6, in7
|
|
%if ARCH_X86_32
|
|
%if %2 == 6
|
|
DEFINE_ARGS out, in0, in1, in2, in3, in4, in5
|
|
%define lend dword r3m
|
|
%else
|
|
mov lend, lenm
|
|
%endif
|
|
%endif
|
|
movd m%3, r4m
|
|
|
|
%assign %%i 1
|
|
%rep %2-1
|
|
mov in %+ %%i %+ q, [in0q+%%i*gprsize]
|
|
%assign %%i %%i+1
|
|
%endrep
|
|
|
|
mov in0q, [in0q]
|
|
mov outq, [outq]
|
|
|
|
%assign %%i 1
|
|
%rep %2-1
|
|
sub in %+ %%i %+ q, in0q
|
|
%assign %%i %%i+1
|
|
%endrep
|
|
|
|
align 16
|
|
.loop:
|
|
mova m0, [in0q]
|
|
|
|
%assign %%i 1
|
|
%rep REPCOUNT-1
|
|
mova m %+ %%i, [in0q + in %+ %%i %+ q]
|
|
%assign %%i %%i+1
|
|
%endrep
|
|
|
|
%if %1 == 32
|
|
|
|
%if %2 == 8
|
|
TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, 8
|
|
%elif %2 == 6
|
|
SBUTTERFLY dq, 0, 1, 6
|
|
SBUTTERFLY dq, 2, 3, 6
|
|
SBUTTERFLY dq, 4, 5, 6
|
|
|
|
punpcklqdq m6, m0, m2
|
|
punpckhqdq m2, m4
|
|
shufps m4, m0, 0xe4
|
|
punpcklqdq m0, m1, m3
|
|
punpckhqdq m3, m5
|
|
shufps m5, m1, 0xe4
|
|
SWAP 0,6,1,4,5,3
|
|
%elif %2 == 4
|
|
TRANSPOSE4x4D 0, 1, 2, 3, 4
|
|
%else ; %2 == 2
|
|
SBUTTERFLY dq, 0, 1, 2
|
|
%endif
|
|
|
|
%else ; %1 == 16
|
|
|
|
%if %2 == 8
|
|
packssdw m0, [in0q + in4q]
|
|
packssdw m1, [in0q + in5q]
|
|
packssdw m2, [in0q + in6q]
|
|
packssdw m3, [in0q + in7q]
|
|
TRANSPOSE2x4x4W 0, 1, 2, 3, 4
|
|
%elif %2 == 6
|
|
packssdw m0, [in0q + in3q]
|
|
packssdw m1, [in0q + in4q]
|
|
packssdw m2, [in0q + in5q]
|
|
pshufd m3, m0, q1032
|
|
punpcklwd m0, m1
|
|
punpckhwd m1, m2
|
|
punpcklwd m2, m3
|
|
|
|
shufps m3, m0, m2, q2020
|
|
shufps m0, m1, q2031
|
|
shufps m2, m1, q3131
|
|
shufps m1, m2, m3, q3120
|
|
shufps m3, m0, q0220
|
|
shufps m0, m2, q3113
|
|
SWAP 2, 0, 3
|
|
%else ; %2 == 4
|
|
packssdw m0, [in0q + in2q]
|
|
packssdw m1, [in0q + in3q]
|
|
SBUTTERFLY wd, 0, 1, 2
|
|
SBUTTERFLY dq, 0, 1, 2
|
|
%endif
|
|
|
|
%endif
|
|
|
|
%assign %%i 0
|
|
%rep REPCOUNT
|
|
psll%4 m %+ %%i, m%3
|
|
%assign %%i %%i+1
|
|
%endrep
|
|
|
|
%assign %%i 0
|
|
%rep REPCOUNT
|
|
mova [outq + %%i*mmsize], m %+ %%i
|
|
%assign %%i %%i+1
|
|
%endrep
|
|
|
|
add in0q, mmsize
|
|
add outq, mmsize*REPCOUNT
|
|
sub lend, mmsize/4
|
|
jg .loop
|
|
REP_RET
|
|
%endmacro
|
|
|
|
INIT_XMM sse2
|
|
FLAC_DECORRELATE_16 indep2, 0, 1 ; Reuse stereo 16bits macro
|
|
FLAC_DECORRELATE_INDEP 32, 2, 3, d
|
|
FLAC_DECORRELATE_INDEP 16, 4, 3, w
|
|
FLAC_DECORRELATE_INDEP 32, 4, 5, d
|
|
FLAC_DECORRELATE_INDEP 16, 6, 4, w
|
|
FLAC_DECORRELATE_INDEP 32, 6, 7, d
|
|
%if ARCH_X86_64
|
|
FLAC_DECORRELATE_INDEP 16, 8, 5, w
|
|
FLAC_DECORRELATE_INDEP 32, 8, 9, d
|
|
%endif
|
|
|
|
INIT_XMM avx
|
|
FLAC_DECORRELATE_INDEP 32, 4, 5, d
|
|
FLAC_DECORRELATE_INDEP 32, 6, 7, d
|
|
%if ARCH_X86_64
|
|
FLAC_DECORRELATE_INDEP 16, 8, 5, w
|
|
FLAC_DECORRELATE_INDEP 32, 8, 9, d
|
|
%endif
|