; FFmpeg4/libavcodec/x86/vp9mc_16bpp.asm
;
; 432 lines
; 10 KiB
; NASM

;******************************************************************************
;* VP9 MC SIMD optimizations
;*
;* Copyright (c) 2015 Ronald S. Bultje <rsbultje gmail com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
; Read-only constants; the section is requested with an alignment of 32.
SECTION_RODATA 32
; Eight dwords of 64 (1 << 6): the rounding bias that every filter below adds
; to its 32-bit sums right before the arithmetic shift right by 7.
pd_64: times 8 dd 64
; Externally defined word constants, loaded into m5 and applied with pminsw
; as the upper clamp: pw_1023 by the *_10 entry points, pw_4095 by the *_12
; entry points.
cextern pw_1023
cextern pw_4095
SECTION .text
;------------------------------------------------------------------------------
; filter_h4_fn op[, nregs]
;   op    : put (store the result) or avg (pavgw the result with dst first)
;   nregs : SIMD register count handed to cglobal (default 12)
;
; Emits vp9_<op>_8tap_1d_h_4_10 and vp9_<op>_8tap_1d_h_4_12: an 8-tap
; horizontal filter producing 4 words per row, for h rows.
;
; Arguments: dst, dstride, src, sstride, h, filtery.
;   - dstride/sstride are added directly to dst/src once per row.
;   - filtery is read at +0/+32/+64/+96; each vector is multiplied (pmaddwd)
;     with interleaved pairs of neighbouring source words, so it supplies
;     the coefficients of two adjacent taps.
;
; Register roles: m5 = upper clamp, m6 = rounding bias, m7 (plus m8-m10 on
; x86-64) = coefficients, m11 = zero for the lower clamp (x86-64 without
; SSE4 only).
;
; Every source load uses movh: punpcklwd only consumes the low half of each
; operand, so a full-width load would merely read extra bytes beyond the last
; source word the filter needs (src[7]).
;------------------------------------------------------------------------------
%macro filter_h4_fn 1-2 12
cglobal vp9_%1_8tap_1d_h_4_10, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova        m5, [pw_1023]           ; upper clamp of the _10 entry point
.body:                                  ; shared with the _12 entry point below
%if notcpuflag(sse4) && ARCH_X86_64
    pxor       m11, m11                 ; zero for the pmaxsw lower clamp
%endif
    mova        m6, [pd_64]
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    ; enough registers to keep the remaining coefficients resident
    mova        m8, [filteryq+32]
    mova        m9, [filteryq+64]
    mova       m10, [filteryq+96]
%endif
.loop:
    movh        m0, [srcq-6]            ; src[-3..0]
    movh        m1, [srcq-4]            ; src[-2..1]
    movh        m2, [srcq-2]            ; src[-1..2]
    movh        m3, [srcq+0]            ; src[ 0..3]
    movh        m4, [srcq+2]            ; src[ 1..4]
    punpcklwd   m0, m1                  ; pairs (src[x-3], src[x-2])
    punpcklwd   m2, m3                  ; pairs (src[x-1], src[x+0])
    pmaddwd     m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m2, m8
%else
    pmaddwd     m2, [filteryq+32]
%endif
    movh        m1, [srcq+4]            ; src[ 2..5]
    movh        m3, [srcq+6]            ; src[ 3..6]
    paddd       m0, m2
    movh        m2, [srcq+8]            ; src[ 4..7]
    add       srcq, sstrideq
    punpcklwd   m4, m1                  ; pairs (src[x+1], src[x+2])
    punpcklwd   m3, m2                  ; pairs (src[x+3], src[x+4])
%if ARCH_X86_64 && mmsize > 8
    pmaddwd     m4, m9
    pmaddwd     m3, m10
%else
    pmaddwd     m4, [filteryq+64]
    pmaddwd     m3, [filteryq+96]
%endif
    paddd       m0, m4
    paddd       m0, m3
    paddd       m0, m6                  ; + 64
    psrad       m0, 7                   ; >> 7
%if cpuflag(sse4)
    packusdw    m0, m0                  ; unsigned saturation, no lower clamp needed
%else
    packssdw    m0, m0                  ; signed saturation, clamped at 0 below
%endif
%ifidn %1, avg
    movh        m1, [dstq]              ; existing destination words to average with
%endif
    pminsw      m0, m5
%if notcpuflag(sse4)
%if ARCH_X86_64
    pmaxsw      m0, m11
%else
    pxor        m2, m2                  ; no spare register: rebuild zero each row
    pmaxsw      m0, m2
%endif
%endif
%ifidn %1, avg
    pavgw       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .loop
    RET

; Same argument list; only the upper clamp differs, so load it and jump
; into the body of the _10 function.
cglobal vp9_%1_8tap_1d_h_4_12, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova        m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_4_10 %+ SUFFIX).body
%endmacro
; Instantiate the 4-wide horizontal filters (put and avg variants) for SSE2.
INIT_XMM sse2
filter_h4_fn put
filter_h4_fn avg
;------------------------------------------------------------------------------
; filter_h_fn op[, nregs]
;   op    : put (plain store) or avg (pavgw with the destination before storing)
;   nregs : SIMD register count handed to cglobal (default 12)
;
; Emits vp9_<op>_8tap_1d_h_<px>_10 and ..._12 with px = mmsize/2: an 8-tap
; horizontal filter that writes one full vector of words per row, h rows.
;
; Even and odd output positions are accumulated separately (m0 and m1): each
; unaligned load is multiplied directly with a coefficient-pair vector, and
; the two halves are re-interleaved with punpcklwd after packing.
;
; Register roles: m5 = upper clamp, m6 = rounding bias, m7 = first coefficient
; vector, m11 = zero for the lower clamp (x86-64 without SSE4 only).
;------------------------------------------------------------------------------
%macro filter_h_fn 1-2 12
%assign %%px mmsize/2
cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _10, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova        m5, [pw_1023]
.body:                                  ; entered from the _12 function as well
%if notcpuflag(sse4) && ARCH_X86_64
    pxor       m11, m11
%endif
    mova        m6, [pd_64]
    mova        m7, [filteryq+ 0]
    ; The remaining three coefficient vectors live in m8-m10 when those
    ; registers are available; otherwise they are used as memory operands.
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+32]
    mova        m9, [filteryq+64]
    mova       m10, [filteryq+96]
    %define %%taps23 m8
    %define %%taps45 m9
    %define %%taps67 m10
%else
    %define %%taps23 [filteryq+32]
    %define %%taps45 [filteryq+64]
    %define %%taps67 [filteryq+96]
%endif
.next_row:
    movu        m0, [srcq-6]            ; even outputs, first tap pair
    movu        m1, [srcq-4]            ; odd  outputs, first tap pair
    movu        m2, [srcq-2]            ; even outputs, second tap pair
    movu        m3, [srcq+0]            ; odd  outputs, second tap pair
    movu        m4, [srcq+2]            ; even outputs, third tap pair
    pmaddwd     m0, m7
    pmaddwd     m1, m7
    pmaddwd     m2, %%taps23
    pmaddwd     m3, %%taps23
    pmaddwd     m4, %%taps45
    paddd       m0, m2
    paddd       m1, m3
    paddd       m0, m4
    movu        m2, [srcq+4]            ; odd  outputs, third tap pair
    movu        m3, [srcq+6]            ; even outputs, fourth tap pair
    movu        m4, [srcq+8]            ; odd  outputs, fourth tap pair
    add       srcq, sstrideq
    pmaddwd     m2, %%taps45
    pmaddwd     m3, %%taps67
    pmaddwd     m4, %%taps67
    paddd       m1, m2
    paddd       m0, m3
    paddd       m1, m4
    paddd       m0, m6                  ; round: (sum + 64) >> 7
    paddd       m1, m6
    psrad       m0, 7
    psrad       m1, 7
%if cpuflag(sse4)
    packusdw    m0, m0
    packusdw    m1, m1
%else
    packssdw    m0, m0
    packssdw    m1, m1
%endif
    punpcklwd   m0, m1                  ; interleave even/odd results
    pminsw      m0, m5
%if notcpuflag(sse4)
%if ARCH_X86_64
    pmaxsw      m0, m11
%else
    pxor        m2, m2
    pmaxsw      m0, m2
%endif
%endif
%ifidn %1, avg
    pavgw       m0, [dstq]
%endif
    mova    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .next_row
    RET

; Second entry point: load the other upper clamp, then reuse the code above.
cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _12, 6, 6, %2, dst, dstride, src, sstride, h, filtery
    mova        m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_h_ %+ %%px %+ _10 %+ SUFFIX).body
%endmacro
; Instantiate the full-vector horizontal filters (put and avg) for SSE2, and
; for AVX2 when the build enables it. The width in each function name is
; mmsize/2, as computed inside the macro.
INIT_XMM sse2
filter_h_fn put
filter_h_fn avg
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
filter_h_fn put
filter_h_fn avg
%endif
;------------------------------------------------------------------------------
; filter_v4_fn op[, nregs]
;   op    : put (plain store) or avg (pavgw with the destination before storing)
;   nregs : SIMD register count handed to cglobal (default 12)
;
; Emits vp9_<op>_8tap_1d_v_4_10 and vp9_<op>_8tap_1d_v_4_12: an 8-tap
; vertical filter producing 4 words per row, for h rows.
;
; Arguments: dst, dstride, src, sstride, h, filtery. On x86-32 only four
; arguments are loaded into registers; filtery is fetched by hand and h is
; decremented in place through r4mp.
;
; Pointers inside the loop: srcq is three rows above the current row, src4q
; one row below it, so the two cover the eight rows the filter reads.
;
; Register roles: m5 = upper clamp, m6 = rounding bias, m7 = first coefficient
; vector, m11 = zero for the lower clamp (x86-64 without SSE4 only).
;------------------------------------------------------------------------------
%macro filter_v4_fn 1-2 12
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_4_10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_4_10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%define hd r4mp
%endif
    mova        m5, [pw_1023]
.body:                                  ; entered from the _12 function as well
%if notcpuflag(sse4) && ARCH_X86_64
    pxor       m11, m11
%endif
    mova        m6, [pd_64]
    lea  sstride3q, [sstrideq*3]
    lea      src4q, [srcq+sstrideq]     ; row +1
    sub       srcq, sstride3q           ; row -3
    mova        m7, [filteryq+ 0]
    ; The remaining three coefficient vectors live in m8-m10 when those
    ; registers are available; otherwise they are used as memory operands.
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+ 32]
    mova        m9, [filteryq+ 64]
    mova       m10, [filteryq+ 96]
    %define %%taps23 m8
    %define %%taps45 m9
    %define %%taps67 m10
%else
    %define %%taps23 [filteryq+ 32]
    %define %%taps45 [filteryq+ 64]
    %define %%taps67 [filteryq+ 96]
%endif
.next_row:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
    movh        m0, [srcq]              ; row -3
    movh        m1, [srcq+sstrideq]     ; row -2
    movh        m2, [srcq+sstrideq*2]   ; row -1
    movh        m3, [srcq+sstride3q]    ; row  0
    add       srcq, sstrideq
    movh        m4, [src4q]             ; row +1
    punpcklwd   m0, m1                  ; pairs (row -3, row -2)
    punpcklwd   m2, m3                  ; pairs (row -1, row  0)
    pmaddwd     m0, m7
    pmaddwd     m2, %%taps23
    movh        m1, [src4q+sstrideq]    ; row +2
    movh        m3, [src4q+sstrideq*2]  ; row +3
    paddd       m0, m2
    movh        m2, [src4q+sstride3q]   ; row +4
    add      src4q, sstrideq
    punpcklwd   m4, m1                  ; pairs (row +1, row +2)
    punpcklwd   m3, m2                  ; pairs (row +3, row +4)
    pmaddwd     m4, %%taps45
    pmaddwd     m3, %%taps67
    paddd       m0, m4
    paddd       m0, m3
    paddd       m0, m6                  ; round: (sum + 64) >> 7
    psrad       m0, 7
%if cpuflag(sse4)
    packusdw    m0, m0
%else
    packssdw    m0, m0
%endif
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    pminsw      m0, m5
%if notcpuflag(sse4)
%if ARCH_X86_64
    pmaxsw      m0, m11
%else
    pxor        m2, m2
    pmaxsw      m0, m2
%endif
%endif
%ifidn %1, avg
    pavgw       m0, m1
%endif
    movh    [dstq], m0
    add       dstq, dstrideq
    dec         hd
    jg .next_row
    RET

; Second entry point: same register setup, other upper clamp, shared body.
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_4_12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_4_12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
    mov   filteryq, r5mp
%endif
    mova        m5, [pw_4095]
    jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_4_10 %+ SUFFIX).body
%endmacro
; Instantiate the 4-wide vertical filters (put and avg variants) for SSE2.
INIT_XMM sse2
filter_v4_fn put
filter_v4_fn avg
;------------------------------------------------------------------------------
; filter_v_fn op[, nregs]
;   op    : put (plain store) or avg (pavgw with the destination before storing)
;   nregs : SIMD register count handed to cglobal (default 13)
;
; Emits vp9_<op>_8tap_1d_v_<px>_10 and ..._12 with px = mmsize/2: an 8-tap
; vertical filter that writes one full vector of words per row, h rows.
;
; Arguments: dst, dstride, src, sstride, h, filtery. On x86-32 only four
; arguments are loaded into registers; filtery is fetched by hand and h is
; decremented in place through r4mp.
;
; Register roles: m5 = upper clamp, m7 = first coefficient vector, m8-m10 =
; remaining coefficient vectors (x86-64), m11 = rounding bias (x86-64),
; m12 = zero for the lower clamp (x86-64 without SSE4). m6 is passed to
; SBUTTERFLY as its last argument, presumably as a scratch register, which
; would explain why the bias is not kept in m6 here (confirm in x86util.asm).
;------------------------------------------------------------------------------
%macro filter_v_fn 1-2 13
%assign %%px mmsize/2
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _10, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
mov filteryq, r5mp
%define hd r4mp
%endif
; upper clamp of the _10 entry point
mova m5, [pw_1023]
; shared body, also entered from the _12 function below
.body:
%if notcpuflag(sse4) && ARCH_X86_64
pxor m12, m12
%endif
%if ARCH_X86_64
mova m11, [pd_64]
%endif
; srcq -> three rows above the current row, src4q -> one row below it
lea sstride3q, [sstrideq*3]
lea src4q, [srcq+sstrideq]
sub srcq, sstride3q
mova m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
mova m8, [filteryq+ 32]
mova m9, [filteryq+ 64]
mova m10, [filteryq+ 96]
%endif
.loop:
; FIXME maybe reuse loads from previous rows, or just
; more generally unroll this to prevent multiple loads of
; the same data?
; rows -3 .. 0 relative to the current row, then row +1
movu m0, [srcq]
movu m1, [srcq+sstrideq]
movu m2, [srcq+sstrideq*2]
movu m3, [srcq+sstride3q]
add srcq, sstrideq
movu m4, [src4q]
; pair up rows (-3,-2) and (-1,0); SBUTTERFLY wd presumably interleaves the
; words of its two registers into low/high halves (see x86util.asm)
SBUTTERFLY wd, 0, 1, 6
SBUTTERFLY wd, 2, 3, 6
pmaddwd m0, m7
pmaddwd m1, m7
%if ARCH_X86_64 && mmsize > 8
pmaddwd m2, m8
pmaddwd m3, m8
%else
pmaddwd m2, [filteryq+ 32]
pmaddwd m3, [filteryq+ 32]
%endif
paddd m0, m2
paddd m1, m3
; rows +2 and +3; rows (+1,+2) are paired next
movu m2, [src4q+sstrideq]
movu m3, [src4q+sstrideq*2]
SBUTTERFLY wd, 4, 2, 6
%if ARCH_X86_64 && mmsize > 8
pmaddwd m4, m9
pmaddwd m2, m9
%else
pmaddwd m4, [filteryq+ 64]
pmaddwd m2, [filteryq+ 64]
%endif
paddd m0, m4
paddd m1, m2
; row +4, paired with row +3
movu m4, [src4q+sstride3q]
add src4q, sstrideq
SBUTTERFLY wd, 3, 4, 6
%if ARCH_X86_64 && mmsize > 8
pmaddwd m3, m10
pmaddwd m4, m10
%else
pmaddwd m3, [filteryq+ 96]
pmaddwd m4, [filteryq+ 96]
%endif
paddd m0, m3
paddd m1, m4
; round: (sum + 64) >> 7
%if ARCH_X86_64
paddd m0, m11
paddd m1, m11
%else
paddd m0, [pd_64]
paddd m1, [pd_64]
%endif
psrad m0, 7
psrad m1, 7
; pack both dword accumulators into one vector of words
%if cpuflag(sse4)
packusdw m0, m1
%else
packssdw m0, m1
%endif
; clamp to the upper bound; without SSE4 the pack is signed, so also clamp
; at zero
pminsw m0, m5
%if notcpuflag(sse4)
%if ARCH_X86_64
pmaxsw m0, m12
%else
pxor m2, m2
pmaxsw m0, m2
%endif
%endif
%ifidn %1, avg
pavgw m0, [dstq]
%endif
mova [dstq], m0
add dstq, dstrideq
dec hd
jg .loop
RET
; Second entry point: same register setup, other upper clamp, then jump into
; the body of the _10 function.
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 6, 8, %2, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _12, 4, 7, %2, dst, dstride, src, sstride, filtery, src4, sstride3
mov filteryq, r5mp
%endif
mova m5, [pw_4095]
jmp mangle(private_prefix %+ _ %+ vp9_%1_8tap_1d_v_ %+ %%px %+ _10 %+ SUFFIX).body
%endmacro
; Instantiate the full-vector vertical filters (put and avg) for SSE2, and
; for AVX2 when the build enables it. The width in each function name is
; mmsize/2, as computed inside the macro.
INIT_XMM sse2
filter_v_fn put
filter_v_fn avg
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
filter_v_fn put
filter_v_fn avg
%endif