/* FFmpeg4/libswscale/arm/rgb2yuv_neon_common.S */

/*
 * Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

/*
 * alias name, tgt, set=1
 *
 * Bind or release a symbolic register name.
 *   set != 0 : emit "name .req tgt", making name an alias of tgt
 *   set == 0 : emit ".unreq name", dropping the alias (tgt is ignored)
 */
.macro alias name, tgt, set=1
.ifeq \set
    .unreq  \name
.else
    \name   .req    \tgt
.endif
.endm

/*
 * Alternate macro mode is switched on only for this helper and its single
 * invocation, because the helper uses the %(expr) operator to pass
 * evaluated numbers to its recursive call.
 */
.altmacro

/*
 * alias_dw_all qw, dw_l, dw_h
 *
 * Define q<qw>_l as an alias of d<dw_l> and q<qw>_h as an alias of
 * d<dw_h>, then recurse with qw + 1, dw_l + 2 and dw_h + 2 for as long
 * as qw is below 15.
 */
.macro alias_dw_all qw, dw_l, dw_h
    alias   q\qw\()_l, d\dw_l
    alias   q\qw\()_h, d\dw_h
    .if \qw < 15
        alias_dw_all  %(\qw + 1), %(\dw_l + 2), %(\dw_h + 2)
    .endif
.endm

/* Start with q0 = (d0, d1); this defines q0_l/q0_h up to q15_l/q15_h. */
alias_dw_all    0, 0, 1

.noaltmacro

/*
 * alias_qw name, qw, set=1
 *
 * Alias (set != 0) or release (set == 0) a quad register together with
 * its two halves:
 *   name   -> qw
 *   name_l -> qw_l
 *   name_h -> qw_h
 * The qw_l / qw_h names are the ones created by alias_dw_all.
 */
.macro alias_qw     name, qw, set=1
    alias   \name\(), \qw, \set
    .irp    half, _l, _h
    alias   \name\()\half, \qw\()\half, \set
    .endr
.endm

/*
 * prologue
 *
 * Save r4-r12 and lr (10 words, 40 bytes), then d8-d15 (the registers
 * that make up q4-q7, 64 bytes). load_arg depends on this exact
 * 104-byte frame.
 */
.macro prologue
    push            {r4-r12, lr}
    vpush           {d8-d15}
.endm

/*
 * epilogue
 *
 * Restore what prologue saved, in reverse order, and return: the saved
 * lr value is popped directly into pc.
 */
.macro epilogue
    vpop            {d8-d15}
    pop             {r4-r12, pc}
.endm

/*
 * load_arg reg, ix
 *
 * Load the stack-passed function argument number ix (0-based, ix >= 4)
 * into reg. Valid only while sp still has the value left by the prologue
 * macro: the slot of argument 4 then lies above the 4 saved q registers
 * (16 bytes each) and the 10 saved core registers (4 bytes each).
 */
.macro  load_arg    reg, ix
    ldr     \reg,   [sp, #(4 * (\ix - 4) + (4 * 16 + 10 * 4))]
.endm


/* ()_to_()_neon(const uint8_t *src, uint8_t *y, uint8_t *chroma,
 *                  int width, int height,
 *                  int y_stride, int c_stride, int src_stride,
 *                  int32_t coeff_table[9]);
 */
/*
 * alias_loop_420sp set=1
 *
 * Create (set != 0) or remove (set == 0) the core register names used by
 * loop_420sp. Several names share one register on purpose; see the notes
 * on the individual groups.
 */
.macro  alias_loop_420sp set=1
    /* r0-r3: the four register-passed arguments. src0 and y0 are second
     * names for src and y; header reuses the width register. */
    alias   src,        r0, \set
    alias   src0,       src, \set
    alias   y,          r1, \set
    alias   y0,         y, \set
    alias   chroma,     r2, \set
    alias   width,      r3, \set
    alias   header,     width, \set

    /* r4-r7: arguments fetched from the stack with load_arg.
     * c_padding reuses the c_stride register. */
    alias   height,     r4, \set
    alias   y_stride,   r5, \set
    alias   c_stride,   r6, \set
    alias   c_padding,  c_stride, \set
    alias   src_stride, r7, \set

    /* r8: address that y0 is compared against to end a row. */
    alias   y0_end,     r8, \set

    /* r9-r10: per-row pointer adjustments computed in loop_420sp. */
    alias   src_padding,r9, \set
    alias   y_padding,  r10, \set

    /* r11-r12: pointers into the second row of each row pair. */
    alias   src1,       r11, \set
    alias   y1,         r12, \set

    /* r12 again: loop_420sp passes coeff_table to its init macro before
     * it writes y1, so the two names never hold live values together. */
    alias   coeff_table,r12, \set
.endm


/*
 * loop_420sp s_fmt, d_fmt, init, kernel, precision
 *
 * Emit the exported function <s_fmt>_to_<d_fmt>_neon_<precision>.
 *
 *   s_fmt, d_fmt : format names; used in the function name and forwarded
 *                  unchanged to the kernel macro
 *   init         : macro invoked once as "init coeff_table"
 *   kernel       : macro invoked as
 *                  "kernel s_fmt, d_fmt, src0, src1, y0, y1, chroma[, count]"
 *   precision    : suffix of the function name
 *
 * The generated function walks the image two rows at a time (height is
 * decremented by 2 per outer iteration). Within a row pair the kernel is
 * called repeatedly until y0 reaches y0_end. If width is not a multiple
 * of 16, the first call of each row pair passes header = width & 15 as
 * the count argument.
 *
 * NOTE(review): the kernel call at label 1 always runs at least once per
 * row pair, so this looks like it expects width >= 16 - confirm against
 * the callers.
 */
.macro  loop_420sp s_fmt, d_fmt, init, kernel, precision

function \s_fmt\()_to_\d_fmt\()_neon_\precision, export=1
    prologue

    alias_loop_420sp

    /* Arguments 0-3 are already in r0-r3; fetch the rest from the stack. */
    load_arg    height,         4
    load_arg    y_stride,       5
    load_arg    c_stride,       6
    load_arg    src_stride,     7
    load_arg    coeff_table,    8

    /* coeff_table shares r12 with y1, so use it before y1 is written. */
    \init       coeff_table

    /* Distance from the end of the processed part of a row to the start
     * of the next row. The source uses 4 bytes per pixel (width << 2).
     * c_padding overwrites c_stride (same register). */
    sub         y_padding,      y_stride,       width
    sub         c_padding,      c_stride,       width
    sub         src_padding,    src_stride,     width, LSL #2

    add         y0_end,         y0,             width
    /* header overwrites width (same register); width is not read again. */
    and         header,         width,          #15

    add         y1,             y0,             y_stride
    add         src1,           src0,           src_stride

0:
    /* Start of a row pair: handle the partial block first, if any. */
    cmp         header,     #0
    beq         1f

    \kernel     \s_fmt, \d_fmt, src0, src1, y0, y1, chroma, header

1:
    \kernel     \s_fmt, \d_fmt, src0, src1, y0, y1, chroma

    /* y0 and y0_end are addresses, so the comparison must be unsigned;
     * a signed test ends the row early when the row crosses 0x80000000. */
    cmp         y0,         y0_end
    blo         1b
2:
    /* y1/src1 now point at the end of the second row: step both row
     * pointers to the next row pair and set the new row end for y0. */
    add         y0,         y1,         y_padding
    add         y0_end,     y1,         y_stride
    add         chroma,     chroma,     c_padding
    add         src0,       src1,       src_padding

    add         y1,         y0,         y_stride
    add         src1,       src0,       src_stride

    subs        height,     height,     #2

    bgt         0b

    epilogue

    alias_loop_420sp 0

endfunc
.endm

/*
 * downsample
 *
 * Pairwise-add adjacent unsigned bytes of r8x16, g8x16 and b8x16 and
 * write the widened 16-bit sums to r16x8, g16x8 and b16x8.
 * The 16x8 register names are not defined in this file; the 8x16 names
 * are the ones set up by alias_src_8888.
 */
.macro downsample
    vpaddl.u8   r16x8,  r8x16
    vpaddl.u8   g16x8,  g8x16
    vpaddl.u8   b16x8,  b8x16
.endm


/* accumulate and right shift by 2 */
/*
 * downsample_ars2
 *
 * Pairwise-add adjacent unsigned bytes of r8x16, g8x16 and b8x16 and
 * accumulate the sums into the 16-bit lanes of r16x8, g16x8 and b16x8,
 * then shift every lane right by 2 with rounding.
 * NOTE(review): in kernel_420_16x2 this follows downsample on the other
 * row, so each lane presumably ends up as the rounded mean of a 2x2
 * pixel block - this holds only if compute_y_16x1 (defined elsewhere)
 * leaves the 16x8 registers untouched.
 */
.macro downsample_ars2
    /* add this row to the sums already held in the 16x8 registers */
    vpadal.u8   r16x8,  r8x16
    vpadal.u8   g16x8,  g8x16
    vpadal.u8   b16x8,  b8x16

    /* rounding divide by 4 */
    vrshr.u16   r16x8,  r16x8,  #2
    vrshr.u16   g16x8,  g16x8,  #2
    vrshr.u16   b16x8,  b16x8,  #2
.endm

/*
 * store_y8_16x1 dst, count
 *
 * Store the register named y8x16 (defined outside this file) at [dst].
 *   count omitted : dst is written back past the stored data
 *   count given   : the same data is stored, but dst is advanced by the
 *                   value of the register count instead
 */
.macro store_y8_16x1            dst, count
.ifnb \count
    vstmia      \dst,   {y8x16}
    add         \dst,   \dst,           \count
.else
    vstmia      \dst!,  {y8x16}
.endif
.endm

/*
 * store_chroma_nv12_8x1 dst, count
 *
 * Store u8x8 and v8x8 interleaved, u first (u0 v0 u1 v1 ...), at [dst].
 *   count omitted : dst is written back past the stored bytes
 *   count given   : dst is post-incremented by the register count
 */
.macro store_chroma_nv12_8x1    dst, count
.ifnb \count
    vst2.i8     {u8x8, v8x8},   [\dst], \count
.else
    vst2.i8     {u8x8, v8x8},   [\dst]!
.endif
.endm

/*
 * store_chroma_nv21_8x1 dst, count
 *
 * Store v8x8 and u8x8 interleaved, v first (v0 u0 v1 u1 ...), at [dst].
 *   count omitted : dst is written back past the stored bytes
 *   count given   : dst is post-incremented by the register count
 */
.macro store_chroma_nv21_8x1    dst, count
.ifnb \count
    vst2.i8     {v8x8, u8x8},   [\dst], \count
.else
    vst2.i8     {v8x8, u8x8},   [\dst]!
.endif
.endm

/*
 * load_8888_16x1 a, b, c, d, src, count
 *
 * Load 16 four-byte pixels (64 bytes) from [src] and de-interleave them:
 * byte 0 of every pixel goes to <a>8x16, byte 1 to <b>8x16, byte 2 to
 * <c>8x16 and byte 3 to <d>8x16. Pixels 0-7 fill the _l halves and
 * pixels 8-15 the _h halves.
 *   count omitted : src ends up 64 bytes further
 *   count given   : the same 64 bytes are read, but src ends up
 *                   4 * count bytes past its initial value
 */
.macro load_8888_16x1   a, b, c, d, src, count
    /* first 8 pixels; identical in both variants, src += 32 */
    vld4.8      {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l},  [\src]!
.ifb \count
    vld4.8      {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h},  [\src]!
.else
    vld4.8      {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h},  [\src]
    /* undo the write-back of the first load, then step by count pixels */
    sub         \src,   \src,   #32
    add         \src,   \src,   \count, LSL #2
.endif
.endm

/*
 * load_rgbx_16x1 src, count
 *
 * load_8888_16x1 with the pixel bytes named r, g, b, x in memory order;
 * src and the optional count are passed through unchanged.
 */
.macro load_rgbx_16x1   src, count
    load_8888_16x1  r, g, b, x, \src, \count
.endm

/*
 * load_bgrx_16x1 src, count
 *
 * load_8888_16x1 with the pixel bytes named b, g, r, x in memory order;
 * src and the optional count are passed through unchanged.
 */
.macro load_bgrx_16x1   src, count
    load_8888_16x1  b, g, r, x, \src, \count
.endm

/*
 * alias_src_rgbx set=1
 *
 * Create (set != 0) or remove (set == 0) the source channel register
 * names for r, g, b, x byte order by forwarding to alias_src_8888.
 */
.macro alias_src_rgbx   set=1
    alias_src_8888  r, g, b, x, \set
.endm

/*
 * alias_src_bgrx set=1
 *
 * Create (set != 0) or remove (set == 0) the source channel register
 * names for b, g, r, x byte order by forwarding to alias_src_8888.
 */
.macro alias_src_bgrx   set=1
    alias_src_8888  b, g, r, x, \set
.endm

/*
 * alias_dst_nv12 set=1
 *
 * Create (set != 0) or remove (set == 0) the chroma output names with
 * u8x8 on c8x8x2_l and v8x8 on c8x8x2_h. The c8x8x2 names are not
 * defined in this file.
 */
.macro alias_dst_nv12   set=1
    alias   u8x8, c8x8x2_l, \set
    alias   v8x8, c8x8x2_h, \set
.endm

/*
 * alias_dst_nv21 set=1
 *
 * Create (set != 0) or remove (set == 0) the chroma output names with
 * v8x8 on c8x8x2_l and u8x8 on c8x8x2_h, i.e. swapped relative to
 * alias_dst_nv12. The c8x8x2 names are not defined in this file.
 */
.macro alias_dst_nv21   set=1
    alias   v8x8, c8x8x2_l, \set
    alias   u8x8, c8x8x2_h, \set
.endm


// common aliases

/*
 * Fixed NEON register names shared by the code built on this file.
 * d0, d1 and d2 each get a whole-register name plus names for lanes 0-2
 * viewed as signed 16-bit elements.
 * NOTE(review): the names suggest the R, G and B coefficients for the
 * Y, U and V outputs; the code that fills these registers is not in this
 * file - confirm against the init macros.
 */
alias   CO_R    d0
CO_RY   .dn     d0.s16[0]
CO_RU   .dn     d0.s16[1]
CO_RV   .dn     d0.s16[2]

alias   CO_G    d1
CO_GY   .dn     d1.s16[0]
CO_GU   .dn     d1.s16[1]
CO_GV   .dn     d1.s16[2]

alias   CO_B    d2
CO_BY   .dn     d2.s16[0]
CO_BU   .dn     d2.s16[1]
CO_BV   .dn     d2.s16[2]

/* BIAS_U and BIAS_V are two names for the same register, d3. */
alias   BIAS_U, d3
alias   BIAS_V, BIAS_U

/* q2 is the d4/d5 pair, so it does not overlap d0-d3 above. */
alias   BIAS_Y, q2


/* q3-q6 R8G8B8X8 x16 */

/*
 * alias_src_8888 a, b, c, d, set
 *
 * Create (set != 0) or remove (set == 0) the names <a>8x16, <b>8x16,
 * <c>8x16 and <d>8x16 (each with its _l and _h halves) for q3, q4, q5
 * and q6 in that order. set has no default here; the alias_src_rgbx and
 * alias_src_bgrx wrappers always supply it.
 */
.macro alias_src_8888   a, b, c, d, set
    alias_qw  \a\()8x16, q3, \set
    alias_qw  \b\()8x16, q4, \set
    alias_qw  \c\()8x16, q5, \set
    alias_qw  \d\()8x16, q6, \set
.endm

/*
 * kernel_420_16x2 rgb_fmt, yuv_fmt, rgb0, rgb1, y0, y1, chroma, count
 *
 * Process one block of 16 source pixels from each of two rows.
 *   rgb_fmt    : selects alias_src_<rgb_fmt> and load_<rgb_fmt>_16x1
 *   yuv_fmt    : selects alias_dst_<yuv_fmt> and
 *                store_chroma_<yuv_fmt>_8x1
 *   rgb0, rgb1 : source pointer registers of the first and second row
 *   y0, y1     : luma destination pointer registers of the two rows
 *   chroma     : chroma destination pointer register
 *   count      : optional register forwarded to every load/store helper;
 *                when given, the helpers advance their pointer by an
 *                amount derived from count instead of a full block
 *
 * compute_y_16x1 and compute_chroma_8x1 are not defined in this file and
 * have to be supplied by the file that uses this macro.
 */
.macro kernel_420_16x2  rgb_fmt, yuv_fmt, rgb0, rgb1, y0, y1, chroma, count
    /* register names are live only for the duration of this expansion */
    alias_src_\rgb_fmt
    alias_dst_\yuv_fmt

    /* first row: load, start the pairwise channel sums, emit luma */
    load_\rgb_fmt\()_16x1   \rgb0, \count

    downsample
    compute_y_16x1
    store_y8_16x1   \y0, \count


    /* second row: load, finish the channel sums, emit luma */
    load_\rgb_fmt\()_16x1   \rgb1, \count
    downsample_ars2
    compute_y_16x1
    store_y8_16x1   \y1, \count

    /* chroma is computed once per block, after both rows were read */
    compute_chroma_8x1  u, U
    compute_chroma_8x1  v, V

    store_chroma_\yuv_fmt\()_8x1 \chroma, \count

    alias_dst_\yuv_fmt 0
    alias_src_\rgb_fmt 0
.endm
init commit 2023-07-02 12:20:28 +00:00			`/*`
			`* Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>`
			`*`
			`* This file is part of FFmpeg.`
			`*`
			`* FFmpeg is free software; you can redistribute it and/or`
			`* modify it under the terms of the GNU Lesser General Public`
			`* License as published by the Free Software Foundation; either`
			`* version 2.1 of the License, or (at your option) any later version.`
			`*`
			`* FFmpeg is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`* Lesser General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU Lesser General Public`
			`* License along with FFmpeg; if not, write to the Free Software`
			`* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
			`*/`

			`#include "libavutil/arm/asm.S"`

			`.macro alias name, tgt, set=1`
			`.if \set != 0`
			`\name .req \tgt`
			`.else`
			`.unreq \name`
			`.endif`
			`.endm`

			`.altmacro`

			`.macro alias_dw_all qw, dw_l, dw_h`
			`alias q\qw\()_l, d\dw_l`
			`alias q\qw\()_h, d\dw_h`
			`.if \qw < 15`
			`alias_dw_all %(\qw + 1), %(\dw_l + 2), %(\dw_h + 2)`
			`.endif`
			`.endm`

			`alias_dw_all 0, 0, 1`

			`.noaltmacro`

			`.macro alias_qw name, qw, set=1`
			`alias \name\(), \qw, \set`
			`alias \name\()_l, \qw\()_l, \set`
			`alias \name\()_h, \qw\()_h, \set`
			`.endm`

			`.macro prologue`
			`push {r4-r12, lr}`
			`vpush {q4-q7}`
			`.endm`

			`.macro epilogue`
			`vpop {q4-q7}`
			`pop {r4-r12, pc}`
			`.endm`

			`.macro load_arg reg, ix`
			`ldr \reg, [sp, #((10 * 4 + 4 * 16) + (\ix - 4) * 4)]`
			`.endm`


			`/* ()_to_()_neon(const uint8_t src, uint8_t y, uint8_t *chroma`
			`* int width, int height,`
			`* int y_stride, int c_stride, int src_stride,`
			`* int32_t coeff_table[9]);`
			`*/`
			`.macro alias_loop_420sp set=1`
			`alias src, r0, \set`
			`alias src0, src, \set`
			`alias y, r1, \set`
			`alias y0, y, \set`
			`alias chroma, r2, \set`
			`alias width, r3, \set`
			`alias header, width, \set`

			`alias height, r4, \set`
			`alias y_stride, r5, \set`
			`alias c_stride, r6, \set`
			`alias c_padding, c_stride, \set`
			`alias src_stride, r7, \set`

			`alias y0_end, r8, \set`

			`alias src_padding,r9, \set`
			`alias y_padding, r10, \set`

			`alias src1, r11, \set`
			`alias y1, r12, \set`

			`alias coeff_table,r12, \set`
			`.endm`


			`.macro loop_420sp s_fmt, d_fmt, init, kernel, precision`

			`function \s_fmt\()_to_\d_fmt\()_neon_\precision, export=1`
			`prologue`

			`alias_loop_420sp`

			`load_arg height, 4`
			`load_arg y_stride, 5`
			`load_arg c_stride, 6`
			`load_arg src_stride, 7`
			`load_arg coeff_table, 8`

			`\init coeff_table`

			`sub y_padding, y_stride, width`
			`sub c_padding, c_stride, width`
			`sub src_padding, src_stride, width, LSL #2`

			`add y0_end, y0, width`
			`and header, width, #15`

			`add y1, y0, y_stride`
			`add src1, src0, src_stride`

			`0:`
			`cmp header, #0`
			`beq 1f`

			`\kernel \s_fmt, \d_fmt, src0, src1, y0, y1, chroma, header`

			`1:`
			`\kernel \s_fmt, \d_fmt, src0, src1, y0, y1, chroma`

			`cmp y0, y0_end`
			`blt 1b`
			`2:`
			`add y0, y1, y_padding`
			`add y0_end, y1, y_stride`
			`add chroma, chroma, c_padding`
			`add src0, src1, src_padding`

			`add y1, y0, y_stride`
			`add src1, src0, src_stride`

			`subs height, height, #2`

			`bgt 0b`

			`epilogue`

			`alias_loop_420sp 0`

			`endfunc`
			`.endm`

			`.macro downsample`
			`vpaddl.u8 r16x8, r8x16`
			`vpaddl.u8 g16x8, g8x16`
			`vpaddl.u8 b16x8, b8x16`
			`.endm`


			`/* acculumate and right shift by 2 */`
			`.macro downsample_ars2`
			`vpadal.u8 r16x8, r8x16`
			`vpadal.u8 g16x8, g8x16`
			`vpadal.u8 b16x8, b8x16`

			`vrshr.u16 r16x8, r16x8, #2`
			`vrshr.u16 g16x8, g16x8, #2`
			`vrshr.u16 b16x8, b16x8, #2`
			`.endm`

			`.macro store_y8_16x1 dst, count`
			`.ifc "\count",""`
			`vstmia \dst!, {y8x16}`
			`.else`
			`vstmia \dst, {y8x16}`
			`add \dst, \dst, \count`
			`.endif`
			`.endm`

			`.macro store_chroma_nv12_8x1 dst, count`
			`.ifc "\count",""`
			`vst2.i8 {u8x8, v8x8}, [\dst]!`
			`.else`
			`vst2.i8 {u8x8, v8x8}, [\dst], \count`
			`.endif`
			`.endm`

			`.macro store_chroma_nv21_8x1 dst, count`
			`.ifc "\count",""`
			`vst2.i8 {v8x8, u8x8}, [\dst]!`
			`.else`
			`vst2.i8 {v8x8, u8x8}, [\dst], \count`
			`.endif`
			`.endm`

			`.macro load_8888_16x1 a, b, c, d, src, count`
			`.ifc "\count",""`
			`vld4.8 {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]!`
			`vld4.8 {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src]!`
			`.else`
			`vld4.8 {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]!`
			`vld4.8 {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src]`
			`sub \src, \src, #32`
			`add \src, \src, \count, LSL #2`
			`.endif`
			`.endm`

			`.macro load_rgbx_16x1 src, count`
			`load_8888_16x1 r, g, b, x, \src, \count`
			`.endm`

			`.macro load_bgrx_16x1 src, count`
			`load_8888_16x1 b, g, r, x, \src, \count`
			`.endm`

			`.macro alias_src_rgbx set=1`
			`alias_src_8888 r, g, b, x, \set`
			`.endm`

			`.macro alias_src_bgrx set=1`
			`alias_src_8888 b, g, r, x, \set`
			`.endm`

			`.macro alias_dst_nv12 set=1`
			`alias u8x8, c8x8x2_l, \set`
			`alias v8x8, c8x8x2_h, \set`
			`.endm`

			`.macro alias_dst_nv21 set=1`
			`alias v8x8, c8x8x2_l, \set`
			`alias u8x8, c8x8x2_h, \set`
			`.endm`


			`// common aliases`

			`alias CO_R d0`
			`CO_RY .dn d0.s16[0]`
			`CO_RU .dn d0.s16[1]`
			`CO_RV .dn d0.s16[2]`

			`alias CO_G d1`
			`CO_GY .dn d1.s16[0]`
			`CO_GU .dn d1.s16[1]`
			`CO_GV .dn d1.s16[2]`

			`alias CO_B d2`
			`CO_BY .dn d2.s16[0]`
			`CO_BU .dn d2.s16[1]`
			`CO_BV .dn d2.s16[2]`

			`alias BIAS_U, d3`
			`alias BIAS_V, BIAS_U`

			`alias BIAS_Y, q2`


			`/* q3-q6 R8G8B8X8 x16 */`

			`.macro alias_src_8888 a, b, c, d, set`
			`alias_qw \a\()8x16, q3, \set`
			`alias_qw \b\()8x16, q4, \set`
			`alias_qw \c\()8x16, q5, \set`
			`alias_qw \d\()8x16, q6, \set`
			`.endm`

			`.macro kernel_420_16x2 rgb_fmt, yuv_fmt, rgb0, rgb1, y0, y1, chroma, count`
			`alias_src_\rgb_fmt`
			`alias_dst_\yuv_fmt`

			`load_\rgb_fmt\()_16x1 \rgb0, \count`

			`downsample`
			`compute_y_16x1`
			`store_y8_16x1 \y0, \count`


			`load_\rgb_fmt\()_16x1 \rgb1, \count`
			`downsample_ars2`
			`compute_y_16x1`
			`store_y8_16x1 \y1, \count`

			`compute_chroma_8x1 u, U`
			`compute_chroma_8x1 v, V`

			`store_chroma_\yuv_fmt\()_8x1 \chroma, \count`

			`alias_dst_\yuv_fmt 0`
			`alias_src_\rgb_fmt 0`
			`.endm`