FFmpeg4/libswscale/arm/rgb2yuv_neon_common.S

292 lines
6.8 KiB
ArmAsm
Raw Permalink Normal View History

2023-07-02 12:20:28 +00:00
/*
* Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
/*
 * alias name, tgt, set=1
 *
 * Bind or release a register alias.
 *   name - alias identifier
 *   tgt  - register (or an existing alias) to bind to; ignored when releasing
 *   set  - zero releases the alias with .unreq, any other value binds it
 *          with .req
 */
.macro alias name, tgt, set=1
.if \set == 0
        .unreq  \name
.else
        \name   .req    \tgt
.endif
.endm
/*
 * Define aliases for the two halves of every NEON quad register:
 *   qN_l -> d(2N) and qN_h -> d(2N+1), for N = 0 .. 15.
 *
 * alias_dw_all qw, dw_l, dw_h
 *   qw   - index of the quad register being aliased
 *   dw_l - index of the double register forming its low half
 *   dw_h - index of the double register forming its high half
 *
 * The macro binds the pair for qw and then invokes itself with qw + 1 and
 * both double register indices + 2, stopping once qw reaches 15.
 * Alternate macro mode is switched on around the definition and the single
 * invocation so that the %(...) arguments are evaluated to plain numbers
 * before substitution; it is switched off again immediately afterwards.
 */
.altmacro
.macro alias_dw_all qw, dw_l, dw_h
alias q\qw\()_l, d\dw_l
alias q\qw\()_h, d\dw_h
.if \qw < 15
alias_dw_all %(\qw + 1), %(\dw_l + 2), %(\dw_h + 2)
.endif
.endm
alias_dw_all 0, 0, 1
.noaltmacro
/*
 * alias_qw name, qw, set=1
 *
 * Bind (set != 0) or release (set == 0) three aliases in one go:
 *   <name>_l -> <qw>_l, <name>_h -> <qw>_h and <name> -> <qw>.
 * The _l / _h targets are the half-register aliases created by alias_dw_all.
 */
.macro alias_qw name, qw, set=1
        alias   \name\()_l, \qw\()_l, \set
        alias   \name\()_h, \qw\()_h, \set
        alias   \name,      \qw,      \set
.endm
/*
 * prologue
 *
 * Function entry sequence: saves r4-r12 and lr (10 words = 40 bytes) followed
 * by d8-d15, i.e. q4-q7 (64 bytes). load_arg relies on exactly these sizes to
 * locate the stack-passed arguments, and epilogue must mirror this sequence.
 */
.macro prologue
        push    {r4-r12, lr}
        vpush   {d8-d15}
.endm
/*
 * epilogue
 *
 * Function exit sequence: restores d8-d15 (q4-q7) and then r4-r12, loading
 * the saved lr value straight into pc, which returns to the caller.
 * Must mirror the push order used by prologue.
 */
.macro epilogue
        vpop    {d8-d15}
        pop     {r4-r12, pc}
.endm
/*
 * load_arg reg, ix
 *
 * Load the stack-passed argument number ix (zero-based, ix >= 4) into reg.
 * The constant part of the offset skips what prologue pushed: 10 core
 * registers of 4 bytes and 4 quad registers of 16 bytes. Only valid while sp
 * still has the value prologue left it with.
 */
.macro load_arg reg, ix
        ldr     \reg, [sp, #((\ix - 4) * 4 + (10 * 4 + 4 * 16))]
.endm
/* ()_to_()_neon_()(const uint8_t *src, uint8_t *y, uint8_t *chroma,
* int width, int height,
* int y_stride, int c_stride, int src_stride,
* int32_t coeff_table[9]);
*/
/*
 * alias_loop_420sp set=1
 *
 * Bind (set != 0) or release (set == 0) the core register names used by
 * loop_420sp. Several names deliberately share one register:
 *   src0 / src           r0
 *   y0 / y               r1
 *   header / width       r3
 *   c_padding / c_stride r6
 *   y1 / coeff_table     r12
 */
.macro alias_loop_420sp set=1
        /* arguments passed in registers */
        alias   src,         r0,       \set
        alias   y,           r1,       \set
        alias   chroma,      r2,       \set
        alias   width,       r3,       \set

        /* arguments fetched from the stack with load_arg */
        alias   height,      r4,       \set
        alias   y_stride,    r5,       \set
        alias   c_stride,    r6,       \set
        alias   src_stride,  r7,       \set
        alias   coeff_table, r12,      \set

        /* loop state; the first four reuse argument registers */
        alias   src0,        src,      \set
        alias   y0,          y,        \set
        alias   header,      width,    \set
        alias   c_padding,   c_stride, \set
        alias   y0_end,      r8,       \set
        alias   src_padding, r9,       \set
        alias   y_padding,   r10,      \set
        alias   src1,        r11,      \set
        alias   y1,          r12,      \set
.endm
/*
 * loop_420sp s_fmt, d_fmt, init, kernel, precision
 *
 * Emit the exported function <s_fmt>_to_<d_fmt>_neon_<precision>. The
 * function walks the image two rows at a time and invokes the kernel macro on
 * each chunk of columns.
 *
 *   s_fmt, d_fmt - format names; used in the symbol and forwarded to kernel
 *   init         - macro invoked once as "init coeff_table" before the loops
 *   kernel       - macro invoked as
 *                  "kernel s_fmt, d_fmt, src0, src1, y0, y1, chroma [, count]"
 *   precision    - suffix appended to the symbol name
 *
 * Register names come from alias_loop_420sp and are released again at the end
 * of the function.
 *
 * NOTE(review): the unconditional kernel call at label 1 runs at least once
 * per row pair, so callers presumably guarantee a minimum width - confirm
 * against the C wrapper.
 */
.macro loop_420sp s_fmt, d_fmt, init, kernel, precision
function \s_fmt\()_to_\d_fmt\()_neon_\precision, export=1
        prologue
        alias_loop_420sp

        /* Arguments 4..8 are passed on the stack. */
        load_arg    height, 4
        load_arg    y_stride, 5
        load_arg    c_stride, 6
        load_arg    src_stride, 7
        load_arg    coeff_table, 8

        \init       coeff_table

        /* Distance from the end of one row to the start of the next.
         * The source padding subtracts 4 bytes per pixel. c_padding
         * overwrites c_stride (same register). */
        sub         y_padding, y_stride, width
        sub         c_padding, c_stride, width
        sub         src_padding, src_stride, width, LSL #2

        add         y0_end, y0, width
        /* header overwrites width (same register): width modulo 16. */
        and         header, width, #15

        /* y1 overwrites coeff_table (same register); init has already
         * consumed it. */
        add         y1, y0, y_stride
        add         src1, src0, src_stride

0:
        /* Non-zero header: run one kernel pass that advances the pointers
         * by header pixels only, then continue with full passes. */
        cmp         header, #0
        beq         1f
        \kernel     \s_fmt, \d_fmt, src0, src1, y0, y1, chroma, header

1:
        \kernel     \s_fmt, \d_fmt, src0, src1, y0, y1, chroma
        /* y0 and y0_end are addresses: compare them unsigned so that a row
         * straddling 0x80000000 does not terminate the loop early. */
        cmp         y0, y0_end
        blo         1b

2:
        /* Step both row pointers to the next pair of rows. */
        add         y0, y1, y_padding
        add         y0_end, y1, y_stride
        add         chroma, chroma, c_padding
        add         src0, src1, src_padding
        add         y1, y0, y_stride
        add         src1, src0, src_stride

        /* height is a signed row count; two rows are consumed per pass. */
        subs        height, height, #2
        bgt         0b

        epilogue
        alias_loop_420sp 0
endfunc
.endm
/*
 * downsample
 *
 * Horizontal half of the 2x2 chroma downsample: adds adjacent pairs of 8-bit
 * samples of each colour channel and writes the widened 16-bit sums to the
 * corresponding 16x8 register (overwriting its previous contents).
 * The r/g/b 8x16 and 16x8 names must be bound by the including file or the
 * calling kernel.
 */
.macro downsample
        vpaddl.u8   r16x8, r8x16
        vpaddl.u8   g16x8, g8x16
        vpaddl.u8   b16x8, b8x16
.endm
/* accumulate and right shift by 2 (with rounding) */
.macro downsample_ars2
        /* Add the pairwise sums of the current row onto the sums already
         * held in the 16x8 registers (left there by downsample). */
        vpadal.u8   r16x8, r8x16
        vpadal.u8   g16x8, g8x16
        vpadal.u8   b16x8, b8x16

        /* Each lane now holds the sum of four samples; a rounding shift
         * right by 2 turns it into their average. */
        vrshr.u16   r16x8, r16x8, #2
        vrshr.u16   g16x8, g16x8, #2
        vrshr.u16   b16x8, b16x8, #2
.endm
/*
 * store_y8_16x1 dst, count
 *
 * Store the 16 luma bytes held in y8x16 at the address in dst.
 *   dst   - core register holding the destination address; updated
 *   count - optional core register; when given, dst advances by count bytes
 *           instead of by the 16 bytes written
 *
 * vst1.8 is used instead of vstmia because vstm faults on an address that is
 * not word-aligned, and the count form can leave dst at any byte offset for
 * the following store. vst1.8 without an alignment qualifier has no such
 * requirement.
 */
.macro store_y8_16x1 dst, count
.ifc "\count",""
        vst1.8      {y8x16}, [\dst]!
.else
        vst1.8      {y8x16}, [\dst], \count
.endif
.endm
/*
 * store_chroma_nv12_8x1 dst, count
 *
 * Store 8 chroma pairs interleaved as U,V,U,V,... (16 bytes) at dst.
 *   dst   - core register holding the destination address; updated
 *   count - optional core register; when given, dst advances by count bytes,
 *           otherwise by the number of bytes written
 */
.macro store_chroma_nv12_8x1 dst, count
.ifnc "\count",""
        vst2.8      {u8x8, v8x8}, [\dst], \count
.else
        vst2.8      {u8x8, v8x8}, [\dst]!
.endif
.endm
/*
 * store_chroma_nv21_8x1 dst, count
 *
 * Store 8 chroma pairs interleaved as V,U,V,U,... (16 bytes) at dst.
 *   dst   - core register holding the destination address; updated
 *   count - optional core register; when given, dst advances by count bytes,
 *           otherwise by the number of bytes written
 */
.macro store_chroma_nv21_8x1 dst, count
.ifnc "\count",""
        vst2.8      {v8x8, u8x8}, [\dst], \count
.else
        vst2.8      {v8x8, u8x8}, [\dst]!
.endif
.endm
/*
 * load_8888_16x1 a, b, c, d, src, count
 *
 * Load 16 pixels of 4 bytes each (64 bytes) from src and de-interleave them
 * into the four channel registers <a>8x16 .. <d>8x16, 8 pixels into the low
 * halves and 8 into the high halves.
 *   a, b, c, d - channel name prefixes in memory order
 *   src        - core register holding the source address; updated
 *   count      - optional core register holding a pixel count; when given,
 *                src ends up advanced by count * 4 bytes instead of 64
 */
.macro load_8888_16x1 a, b, c, d, src, count
        /* First 8 pixels: identical in both variants. */
        vld4.8      {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]!
.ifc "\count",""
        vld4.8      {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src]!
.else
        vld4.8      {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src]
        /* Undo the 32-byte step of the first load, then advance by
         * count pixels. */
        sub         \src, \src, #32
        add         \src, \src, \count, LSL #2
.endif
.endm
/*
 * load_rgbx_16x1 src, count
 *
 * load_8888_16x1 with the channels in memory order r, g, b, x.
 * src and the optional count are forwarded unchanged.
 */
.macro load_rgbx_16x1 src, count
        load_8888_16x1  r, g, b, x, \src, \count
.endm
/*
 * load_bgrx_16x1 src, count
 *
 * load_8888_16x1 with the channels in memory order b, g, r, x.
 * src and the optional count are forwarded unchanged.
 */
.macro load_bgrx_16x1 src, count
        load_8888_16x1  b, g, r, x, \src, \count
.endm
/*
 * alias_src_rgbx set=1
 *
 * Bind (set != 0) or release (set == 0) the source channel registers for the
 * r, g, b, x byte order; see alias_src_8888 for the register assignment.
 */
.macro alias_src_rgbx set=1
        alias_src_8888  r, g, b, x, \set
.endm
/*
 * alias_src_bgrx set=1
 *
 * Bind (set != 0) or release (set == 0) the source channel registers for the
 * b, g, r, x byte order; see alias_src_8888 for the register assignment.
 */
.macro alias_src_bgrx set=1
        alias_src_8888  b, g, r, x, \set
.endm
/*
 * alias_dst_nv12 set=1
 *
 * Bind (set != 0) or release (set == 0) the chroma output names for NV12:
 * u8x8 is the low half and v8x8 the high half of c8x8x2, so that the list
 * {u8x8, v8x8} used by store_chroma_nv12_8x1 is {c8x8x2_l, c8x8x2_h}.
 * c8x8x2 and its halves must be bound by the including file.
 */
.macro alias_dst_nv12 set=1
        alias   v8x8, c8x8x2_h, \set
        alias   u8x8, c8x8x2_l, \set
.endm
/*
 * alias_dst_nv21 set=1
 *
 * Bind (set != 0) or release (set == 0) the chroma output names for NV21:
 * v8x8 is the low half and u8x8 the high half of c8x8x2, so that the list
 * {v8x8, u8x8} used by store_chroma_nv21_8x1 is {c8x8x2_l, c8x8x2_h}.
 * c8x8x2 and its halves must be bound by the including file.
 */
.macro alias_dst_nv21 set=1
        alias   u8x8, c8x8x2_h, \set
        alias   v8x8, c8x8x2_l, \set
.endm
/*
 * Common aliases, bound once for the whole file.
 *
 * CO_R, CO_G and CO_B name d0, d1 and d2. Within each of them the signed
 * 16-bit lanes 0, 1 and 2 additionally get scalar names ending in Y, U and V.
 */
alias   CO_R,   d0
CO_RY   .dn     d0.s16[0]
CO_RU   .dn     d0.s16[1]
CO_RV   .dn     d0.s16[2]

alias   CO_G,   d1
CO_GY   .dn     d1.s16[0]
CO_GU   .dn     d1.s16[1]
CO_GV   .dn     d1.s16[2]

alias   CO_B,   d2
CO_BY   .dn     d2.s16[0]
CO_BU   .dn     d2.s16[1]
CO_BV   .dn     d2.s16[2]

/* BIAS_U and BIAS_V are two names for the same register (d3); BIAS_Y is q2. */
alias   BIAS_U, d3
alias   BIAS_V, BIAS_U
alias   BIAS_Y, q2
/*
 * alias_src_8888 a, b, c, d, set
 *
 * Source pixels, 16 at a time with one 8-bit channel per register, live in
 * q3-q6. This macro binds (set != 0) or releases (set == 0) the names
 * <a>8x16, <b>8x16, <c>8x16 and <d>8x16, together with their _l / _h halves,
 * for q3, q4, q5 and q6 in that order.
 * set has no default here; the callers in this file always pass it.
 */
.macro alias_src_8888 a, b, c, d, set
        alias_qw    \a\()8x16, q3, \set
        alias_qw    \b\()8x16, q4, \set
        alias_qw    \c\()8x16, q5, \set
        alias_qw    \d\()8x16, q6, \set
.endm
/*
 * kernel_420_16x2 rgb_fmt, yuv_fmt, rgb0, rgb1, y0, y1, chroma, count
 *
 * Convert a block of 16 pixels from each of two adjacent source rows.
 *   rgb_fmt    - source format; selects alias_src_<rgb_fmt> and
 *                load_<rgb_fmt>_16x1
 *   yuv_fmt    - destination format; selects alias_dst_<yuv_fmt> and
 *                store_chroma_<yuv_fmt>_8x1
 *   rgb0, rgb1 - core registers with the source address of each row
 *   y0, y1     - core registers with the luma destination of each row
 *   chroma     - core register with the interleaved chroma destination
 *   count      - optional core register, forwarded to every load and store
 *                to control how far the pointers advance
 *
 * compute_y_16x1 and compute_chroma_8x1 are not defined in this file; the
 * including file has to provide them.
 */
.macro kernel_420_16x2 rgb_fmt, yuv_fmt, rgb0, rgb1, y0, y1, chroma, count
        alias_src_\rgb_fmt
        alias_dst_\yuv_fmt

        /* first row: luma, and start of the 2x2 colour sums */
        load_\rgb_fmt\()_16x1   \rgb0, \count
        downsample
        compute_y_16x1
        store_y8_16x1           \y0, \count

        /* second row: luma, and completion of the 2x2 colour averages */
        load_\rgb_fmt\()_16x1   \rgb1, \count
        downsample_ars2
        compute_y_16x1
        store_y8_16x1           \y1, \count

        /* chroma from the averaged colours */
        compute_chroma_8x1      u, U
        compute_chroma_8x1      v, V
        store_chroma_\yuv_fmt\()_8x1 \chroma, \count

        alias_dst_\yuv_fmt 0
        alias_src_\rgb_fmt 0
.endm