From f6077cc666b7fdea536c1461c5582ed6ef04d1ff Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Thu, 25 May 2023 15:24:29 +0800 Subject: [PATCH] avcodec/la: Add LSX optimization for h264 qpel. ./configure --disable-lasx ffmpeg -i 1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -y /dev/null -an before: 214fps after: 274fps Reviewed-by: Shiyou Yin Signed-off-by: Michael Niedermayer --- libavcodec/loongarch/Makefile | 2 + libavcodec/loongarch/h264qpel.S | 1686 +++++++++++++++++ .../loongarch/h264qpel_init_loongarch.c | 74 +- libavcodec/loongarch/h264qpel_lasx.c | 401 +--- libavcodec/loongarch/h264qpel_lasx.h | 158 -- libavcodec/loongarch/h264qpel_loongarch.h | 312 +++ libavcodec/loongarch/h264qpel_lsx.c | 487 +++++ 7 files changed, 2561 insertions(+), 559 deletions(-) create mode 100644 libavcodec/loongarch/h264qpel.S delete mode 100644 libavcodec/loongarch/h264qpel_lasx.h create mode 100644 libavcodec/loongarch/h264qpel_loongarch.h create mode 100644 libavcodec/loongarch/h264qpel_lsx.c diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile index a563055161..06cfab5c20 100644 --- a/libavcodec/loongarch/Makefile +++ b/libavcodec/loongarch/Makefile @@ -31,5 +31,7 @@ LSX-OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_lsx.o \ LSX-OBJS-$(CONFIG_H264DSP) += loongarch/h264idct.o \ loongarch/h264idct_loongarch.o \ loongarch/h264dsp.o +LSX-OBJS-$(CONFIG_H264QPEL) += loongarch/h264qpel.o \ + loongarch/h264qpel_lsx.o LSX-OBJS-$(CONFIG_H264CHROMA) += loongarch/h264chroma.o LSX-OBJS-$(CONFIG_H264PRED) += loongarch/h264intrapred.o diff --git a/libavcodec/loongarch/h264qpel.S b/libavcodec/loongarch/h264qpel.S new file mode 100644 index 0000000000..3f885b6ce2 --- /dev/null +++ b/libavcodec/loongarch/h264qpel.S @@ -0,0 +1,1686 @@ +/* + * Loongson LSX optimized h264qpel + * + * Copyright (c) 2023 Loongson Technology Corporation Limited + * Contributed by Hecai Yuan + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "loongson_asm.S" + +.macro VLD_QPEL8_H_SSRANI_LSX in0, in1, in2, in3, in4 + vld vr0, \in4, 0 + vldx vr1, \in4, a2 + QPEL8_H_LSX \in0, \in1 + vssrani.bu.h \in0, \in2, 5 + vssrani.bu.h \in1, \in3, 5 +.endm + +.macro VLDX_QPEL8_H_SSRANI_LSX in0, in1, in2, in3, in4 + vldx vr0, \in4, t1 + vldx vr1, \in4, t2 + QPEL8_H_LSX \in0, \in1 + vssrani.bu.h \in0, \in2, 5 + vssrani.bu.h \in1, \in3, 5 +.endm + +.macro VLD_DOUBLE_QPEL8_H_SSRANI_LSX in0, in1, in2, in3, in4, in5, in6, in7, in8 + vld vr0, \in8, 0 + vldx vr1, \in8, a2 + QPEL8_H_LSX \in0, \in1 + vssrani.bu.h \in0, \in4, 5 + vssrani.bu.h \in1, \in5, 5 + vldx vr0, \in8, t1 + vldx vr1, \in8, t2 + QPEL8_H_LSX \in2, \in3 + vssrani.bu.h \in2, \in6, 5 + vssrani.bu.h \in3, \in7, 5 +.endm + +function ff_put_h264_qpel16_mc00_lsx + slli.d t0, a2, 1 + add.d t1, t0, a2 + slli.d t2, t0, 1 +.rept 4 + vld vr0, a1, 0 + vldx vr1, a1, a2 + vldx vr2, a1, t0 + vldx vr3, a1, t1 + add.d a1, a1, t2 + vst vr0, a0, 0 + vstx vr1, a0, a2 + vstx vr2, a0, t0 + vstx vr3, a0, t1 + add.d a0, a0, t2 +.endr +endfunc + +.macro QPEL8_H_LSX out0, out1 + vbsrl.v vr2, vr0, 1 + vbsrl.v vr3, vr1, 1 + vbsrl.v vr4, vr0, 2 + vbsrl.v vr5, vr1, 2 + vbsrl.v vr6, vr0, 3 + vbsrl.v vr7, vr1, 3 + vbsrl.v vr8, vr0, 4 + vbsrl.v vr9, 
vr1, 4 + vbsrl.v vr10, vr0, 5 + vbsrl.v vr11, vr1, 5 + + vilvl.b vr6, vr4, vr6 + vilvl.b vr7, vr5, vr7 + vilvl.b vr8, vr2, vr8 + vilvl.b vr9, vr3, vr9 + vilvl.b vr10, vr0, vr10 + vilvl.b vr11, vr1, vr11 + vhaddw.hu.bu vr6, vr6, vr6 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr11, vr11, vr11 + vmul.h vr2, vr6, vr20 + vmul.h vr3, vr7, vr20 + vmul.h vr4, vr8, vr21 + vmul.h vr5, vr9, vr21 + vssub.h vr2, vr2, vr4 + vssub.h vr3, vr3, vr5 + vsadd.h vr2, vr2, vr10 + vsadd.h vr3, vr3, vr11 + vsadd.h \out0, vr2, vr22 + vsadd.h \out1, vr3, vr22 +.endm + +.macro VLD_DOUBLE_QPEL8_H_LSX in0, in1, in2, in3, in4 + vld vr0, \in4, 0 + vldx vr1, \in4, a2 + QPEL8_H_LSX \in0, \in1 + vldx vr0, \in4, t1 + vldx vr1, \in4, t2 + QPEL8_H_LSX \in2, \in3 +.endm + +.macro put_h264_qpel16 in0 +function ff_put_h264_qpel16_mc\in0\()_lsx +.ifc \in0, 10 + addi.d t8, a1, 0 +.else + addi.d t8, a1, 1 +.endif + vldi vr20, 0x414 + vldi vr21, 0x405 + vldi vr22, 0x410 + slli.d t1, a2, 1 + add.d t2, t1, a2 + addi.d t0, a1, -2 // t0 = src - 2 + addi.d a1, t0, 8 // a1 = t0 + 8 +.rept 4 + VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t0 + VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr12, vr13, a1 + vld vr10, t8, 0 + vldx vr11, t8, a2 + vavgr.bu vr0, vr2, vr10 + vavgr.bu vr1, vr3, vr11 + vst vr0, a0, 0 + vstx vr1, a0, a2 + VLDX_QPEL8_H_SSRANI_LSX vr4, vr5, vr14, vr15, a1 + vldx vr12, t8, t1 + vldx vr13, t8, t2 + vavgr.bu vr2, vr4, vr12 + vavgr.bu vr3, vr5, vr13 + vstx vr2, a0, t1 + vstx vr3, a0, t2 + alsl.d a0, a2, a0, 2 + alsl.d t8, a2, t8, 2 + alsl.d a1, a2, a1, 2 + alsl.d t0, a2, t0, 2 +.endr +endfunc +.endm + +put_h264_qpel16 10 +put_h264_qpel16 30 + +function ff_put_h264_qpel16_mc20_lsx + vldi vr20, 0x414 + vldi vr21, 0x405 + vldi vr22, 0x410 + slli.d t1, a2, 1 + add.d t2, t1, a2 + addi.d t0, a1, -2 // t0 = src - 2 + addi.d a1, t0, 8 // a1 = t0 + 8 +.rept 4 + VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t0 + 
VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr12, vr13, a1 + vst vr2, a0, 0 + vstx vr3, a0, a2 + VLDX_QPEL8_H_SSRANI_LSX vr4, vr5, vr14, vr15, a1 + vstx vr4, a0, t1 + vstx vr5, a0, t2 + alsl.d a0, a2, a0, 2 + alsl.d a1, a2, a1, 2 + alsl.d t0, a2, t0, 2 +.endr +endfunc + +.macro QPEL8_V_LSX in0, in1, in2, in3, in4, in5, in6 + vilvl.b vr7, \in3, \in2 + vilvl.b vr8, \in4, \in3 + vilvl.b vr9, \in4, \in1 + vilvl.b vr10, \in5, \in2 + vilvl.b vr11, \in5, \in0 + vilvl.b vr12, \in6, \in1 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr11, vr11, vr11 + vhaddw.hu.bu vr12, vr12, vr12 + vmul.h vr7, vr7, vr20 + vmul.h vr8, vr8, vr20 + vmul.h vr9, vr9, vr21 + vmul.h vr10, vr10, vr21 + vssub.h vr7, vr7, vr9 + vssub.h vr8, vr8, vr10 + vsadd.h vr7, vr7, vr11 + vsadd.h vr8, vr8, vr12 + vsadd.h vr7, vr7, vr22 + vsadd.h vr8, vr8, vr22 + + vilvh.b vr13, \in3, \in2 + vilvh.b vr14, \in4, \in3 + vilvh.b vr15, \in4, \in1 + vilvh.b vr16, \in5, \in2 + vilvh.b vr17, \in5, \in0 + vilvh.b vr18, \in6, \in1 + vhaddw.hu.bu vr13, vr13, vr13 + vhaddw.hu.bu vr14, vr14, vr14 + vhaddw.hu.bu vr15, vr15, vr15 + vhaddw.hu.bu vr16, vr16, vr16 + vhaddw.hu.bu vr17, vr17, vr17 + vhaddw.hu.bu vr18, vr18, vr18 + vmul.h vr13, vr13, vr20 + vmul.h vr14, vr14, vr20 + vmul.h vr15, vr15, vr21 + vmul.h vr16, vr16, vr21 + vssub.h vr13, vr13, vr15 + vssub.h vr14, vr14, vr16 + vsadd.h vr13, vr13, vr17 + vsadd.h vr14, vr14, vr18 + vsadd.h vr13, vr13, vr22 + vsadd.h vr14, vr14, vr22 + vssrani.bu.h vr13, vr7, 5 + vssrani.bu.h vr14, vr8, 5 +.endm + +.macro put_h264_qpel16_mc1 in0 +function ff_put_h264_qpel16_mc\in0\()_lsx + slli.d t0, a2, 1 + add.d t1, t0, a2 + sub.d t2, a1, t0 // t2 = src - 2 * stride + vldi vr20, 0x414 + vldi vr21, 0x405 + vldi vr22, 0x410 + + vld vr0, t2, 0 + vldx vr1, t2, a2 + vldx vr2, t2, t0 + vldx vr3, t2, t1 + alsl.d t2, a2, t2, 2 // t2 = t2 + 4 * stride + vld vr4, t2, 0 + vldx vr5, t2, a2 + vldx vr6, t2, t0 + QPEL8_V_LSX 
vr0, vr1, vr2, vr3, vr4, vr5, vr6 +.ifc \in0, 01 + vavgr.bu vr13, vr2, vr13 + vavgr.bu vr14, vr3, vr14 +.else + vavgr.bu vr13, vr3, vr13 + vavgr.bu vr14, vr4, vr14 +.endif + vst vr13, a0, 0 + vstx vr14, a0, a2 + + vldx vr0, t2, t1 + alsl.d t2, a2, t2, 2 // t2 = t2 + 4 *stride + vld vr1, t2, 0 + QPEL8_V_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1 +.ifc \in0, 01 + vavgr.bu vr13, vr4, vr13 + vavgr.bu vr14, vr5, vr14 +.else + vavgr.bu vr13, vr5, vr13 + vavgr.bu vr14, vr6, vr14 +.endif + vstx vr13, a0, t0 + vstx vr14, a0, t1 + + alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride + + vldx vr2, t2, a2 + vldx vr3, t2, t0 + QPEL8_V_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3 +.ifc \in0, 01 + vavgr.bu vr13, vr6, vr13 + vavgr.bu vr14, vr0, vr14 +.else + vavgr.bu vr13, vr0, vr13 + vavgr.bu vr14, vr1, vr14 +.endif + vst vr13, a0, 0 + vstx vr14, a0, a2 + + vldx vr4, t2, t1 + alsl.d t2, a2, t2, 2 // t2 = t2 + 4 * stride + vld vr5, t2, 0 + QPEL8_V_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5 +.ifc \in0, 01 + vavgr.bu vr13, vr1, vr13 + vavgr.bu vr14, vr2, vr14 +.else + vavgr.bu vr13, vr2, vr13 + vavgr.bu vr14, vr3, vr14 +.endif + vstx vr13, a0, t0 + vstx vr14, a0, t1 + + alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride + + vldx vr6, t2, a2 + vldx vr0, t2, t0 + QPEL8_V_LSX vr1, vr2, vr3, vr4, vr5, vr6, vr0 +.ifc \in0, 01 + vavgr.bu vr13, vr3, vr13 + vavgr.bu vr14, vr4, vr14 +.else + vavgr.bu vr13, vr4, vr13 + vavgr.bu vr14, vr5, vr14 +.endif + vst vr13, a0, 0 + vstx vr14, a0, a2 + + vldx vr1, t2, t1 + alsl.d t2, a2, t2, 2 // t2 = t2 + 4 * stride + vld vr2, t2, 0 + QPEL8_V_LSX vr3, vr4, vr5, vr6, vr0, vr1, vr2 +.ifc \in0, 01 + vavgr.bu vr13, vr5, vr13 + vavgr.bu vr14, vr6, vr14 +.else + vavgr.bu vr13, vr6, vr13 + vavgr.bu vr14, vr0, vr14 +.endif + vstx vr13, a0, t0 + vstx vr14, a0, t1 + + alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride + + vldx vr3, t2, a2 + vldx vr4, t2, t0 + QPEL8_V_LSX vr5, vr6, vr0, vr1, vr2, vr3, vr4 +.ifc \in0, 01 + vavgr.bu vr13, vr0, vr13 + vavgr.bu vr14, vr1, vr14 +.else + vavgr.bu 
vr13, vr1, vr13 + vavgr.bu vr14, vr2, vr14 +.endif + vst vr13, a0, 0 + vstx vr14, a0, a2 + + vldx vr5, t2, t1 + alsl.d t2, a2, t2, 2 // t2 = t2 + 4 * stride + vld vr6, t2, 0 + QPEL8_V_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6 +.ifc \in0, 01 + vavgr.bu vr13, vr2, vr13 + vavgr.bu vr14, vr3, vr14 +.else + vavgr.bu vr13, vr3, vr13 + vavgr.bu vr14, vr4, vr14 +.endif + vstx vr13, a0, t0 + vstx vr14, a0, t1 +endfunc +.endm + +put_h264_qpel16_mc1 01 +put_h264_qpel16_mc1 03 + +.macro VST_QPEL8_V_LOWPASS_LSX in0, in1, in2, in3, in4, in5, in6, in7, in8 + QPEL8_V_LSX \in0, \in1, \in2, \in3, \in4, \in5, \in6 + vavgr.bu vr13, \in7, vr13 + vavgr.bu vr14, \in8, vr14 + vst vr13, a0, 0 + vstx vr14, a0, a2 +.endm + +.macro VSTX_QPEL8_V_LOWPASS_LSX in0, in1, in2, in3, in4, in5, in6, in7, in8 + QPEL8_V_LSX \in0, \in1, \in2, \in3, \in4, \in5, \in6 + vavgr.bu vr13, \in7, vr13 + vavgr.bu vr14, \in8, vr14 + vstx vr13, a0, t1 + vstx vr14, a0, t2 +.endm + +function ff_put_h264_qpel16_mc11_lsx + addi.d sp, sp, -64 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + fst.d f30, sp, 48 + fst.d f31, sp, 56 + slli.d t1, a2, 1 + add.d t2, t1, a2 + slli.d t6, t1, 1 + vldi vr20, 0x414 + vldi vr21, 0x405 + vldi vr22, 0x410 + sub.d t4, a1, t1 // t4 = src - 2 * stride + addi.d t0, a1, -2 // t0 = src - 2 + addi.d a1, t0, 8 // a1 = t0 + 8 +.rept 2 + VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t0 + alsl.d t0, a2, t0, 2 + VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, t0 + VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr23, vr24, vr25, vr26, vr12, vr13, \ + vr14, vr15, a1 + alsl.d a1, a2, a1, 2 + VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr27, vr28, vr29, vr30, vr16, vr17, \ + vr18, vr19, a1 + + vld vr0, t4, 0 // t4 = src - 2 * stride + vldx vr1, t4, a2 + vldx vr2, t4, t1 + vldx vr3, t4, t2 + alsl.d t4, a2, t4, 2 // src + 2 *stride + vld vr4, t4, 0 + vldx vr5, t4, a2 + vldx vr6, t4, t1 + VST_QPEL8_V_LOWPASS_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr23, vr24 + vldx vr0, 
t4, t2 + alsl.d t4, a2, t4, 2 // src + 6 *stride + vld vr1, t4, 0 + VSTX_QPEL8_V_LOWPASS_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1, vr25, vr26 + alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride + vldx vr2, t4, a2 + vldx vr3, t4, t1 + VST_QPEL8_V_LOWPASS_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3, vr27, vr28 + vldx vr4, t4, t2 + alsl.d t4, a2, t4, 2 // src + 10 *stride + vld vr5, t4, 0 + VSTX_QPEL8_V_LOWPASS_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5, vr29, vr30 + alsl.d t0, a2, t0, 2 + alsl.d a1, a2, a1, 2 // a1 = src + 8 * stride + alsl.d a0, a2, a0, 2 // dst = dst + 8 * stride + sub.d t4, t4, t6 +.endr + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + fld.d f30, sp, 48 + fld.d f31, sp, 56 + addi.d sp, sp, 64 +endfunc + +function ff_avg_h264_qpel16_mc00_lsx + slli.d t0, a2, 1 + add.d t1, t0, a2 + slli.d t2, t0, 1 + addi.d t3, a0, 0 +.rept 4 + vld vr0, a1, 0 + vldx vr1, a1, a2 + vldx vr2, a1, t0 + vldx vr3, a1, t1 + add.d a1, a1, t2 + vld vr8, t3, 0 + vldx vr9, t3, a2 + vldx vr10, t3, t0 + vldx vr11, t3, t1 + add.d t3, t3, t2 + vavgr.bu vr0, vr8, vr0 + vavgr.bu vr1, vr9, vr1 + vavgr.bu vr2, vr10, vr2 + vavgr.bu vr3, vr11, vr3 + vst vr0, a0, 0 + vstx vr1, a0, a2 + vstx vr2, a0, t0 + vstx vr3, a0, t1 + add.d a0, a0, t2 +.endr +endfunc + +.macro put_h264_qpel16_mc in0 +function ff_put_h264_qpel16_mc\in0\()_lsx + addi.d sp, sp, -64 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + fst.d f30, sp, 48 + fst.d f31, sp, 56 + slli.d t1, a2, 1 + add.d t2, t1, a2 + vldi vr20, 0x414 + vldi vr21, 0x405 + vldi vr22, 0x410 + addi.d t0, a1, -2 // t0 = src - 2 + +.ifc \in0, 33 + add.d t0, t0, a2 +.endif + add.d t3, a1, zero // t3 = src + sub.d t4, a1, t1 // t4 = src - 2 * stride + addi.d t4, t4, 1 + + VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t0 + alsl.d a1, a2, t0, 2 + VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, a1 + addi.d a1, t0, 8 + 
VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr23, vr24, vr25, vr26, vr12, vr13, \ + vr14, vr15, a1 + alsl.d a1, a2, a1, 2 + VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr27, vr28, vr29, vr30, vr16, vr17, \ + vr18, vr19, a1 + vld vr0, t4, 0 // t4 = src - 2 * stride + 1 + vldx vr1, t4, a2 + vldx vr2, t4, t1 + vldx vr3, t4, t2 + alsl.d t4, a2, t4, 2 + vld vr4, t4, 0 + vldx vr5, t4, a2 + vldx vr6, t4, t1 + VST_QPEL8_V_LOWPASS_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr23, vr24 + vldx vr0, t4, t2 + alsl.d t4, a2, t4, 2 + vld vr1, t4, 0 + VSTX_QPEL8_V_LOWPASS_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1, vr25, vr26 + add.d t6, t4, zero // t6 = src + 6 * stride + alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride + vldx vr2, t4, a2 + vldx vr3, t4, t1 + VST_QPEL8_V_LOWPASS_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3, vr27, vr28 + vldx vr4, t4, t2 + alsl.d t4, a2, t4, 2 + vld vr5, t4, 0 + VSTX_QPEL8_V_LOWPASS_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5, vr29, vr30 + alsl.d a1, a2, t0, 3 // a1 = src + 8 * stride + addi.d t5, a1, 8 // a1 = src + 8 * stride + 8 + VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, a1 + alsl.d a1, a2, a1, 2 + VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, a1 + VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr23, vr24, vr25, vr26, vr12, vr13, \ + vr14, vr15, t5 + alsl.d t5, a2, t5, 2 + VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr27, vr28, vr29, vr30, vr16, vr17, \ + vr18, vr19, t5 + alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride + + // t6 = src + 6 * stride + 1 + vld vr0, t6, 0 + vldx vr1, t6, a2 + vldx vr2, t6, t1 + vldx vr3, t6, t2 + alsl.d t6, a2, t6, 2 + vld vr4, t6, 0 + vldx vr5, t6, a2 + vldx vr6, t6, t1 + VST_QPEL8_V_LOWPASS_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr23, vr24 + vldx vr0, t6, t2 + alsl.d t6, a2, t6, 2 + vld vr1, t6, 0 + VSTX_QPEL8_V_LOWPASS_LSX vr2, vr3, vr4, vr5 ,vr6, vr0, vr1, vr25, vr26 + alsl.d a0, a2, a0, 2 // dst = dst + 4 *stride + vldx vr2, t6, a2 + vldx vr3, t6, t1 + VST_QPEL8_V_LOWPASS_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3, vr27, vr28 + vldx vr4, t6, t2 + alsl.d t6, a2, t6, 2 + vld vr5, t6, 0 + 
VSTX_QPEL8_V_LOWPASS_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5, vr29, vr30 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + fld.d f30, sp, 48 + fld.d f31, sp, 56 + addi.d sp, sp, 64 +endfunc +.endm + +put_h264_qpel16_mc 33 +put_h264_qpel16_mc 31 + +function ff_put_h264_qpel16_mc13_lsx /* put 16x16 (a0 = dst, a1 = src, a2 = stride): each output row is the rounded average (vavgr.bu) of the horizontal 6-tap result taken from rows starting at src + stride and the vertical 6-tap result taken from rows starting at src - 2 * stride */ + slli.d t1, a2, 1 /* t1 = 2 * stride */ + add.d t2, t1, a2 /* t2 = 3 * stride */ + vldi vr20, 0x414 + vldi vr21, 0x405 + vldi vr22, 0x410 + addi.d sp, sp, -64 /* room for f24-f31, saved here and restored before endfunc */ + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + fst.d f30, sp, 48 + fst.d f31, sp, 56 + addi.d t0, a1, -2 // t0 = src - 2 + add.d t0, t0, a2 // t0 = src + stride - 2 + add.d t3, a1, zero // t3 = src (t3 is not read again in this function) + sub.d t4, a1, t1 // t4 = src - 2 * stride + + VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t0 + alsl.d a1, a2, t0, 2 + VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, a1 + addi.d a1, t0, 8 + VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr23, vr24, vr25, vr26, vr12, vr13, \ + vr14, vr15, a1 + alsl.d a1, a2, a1, 2 + VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr27, vr28, vr29, vr30, vr16, vr17, \ + vr18, vr19, a1 + vld vr0, t4, 0 // t4 = src - 2 * stride + vldx vr1, t4, a2 + vldx vr2, t4, t1 + vldx vr3, t4, t2 + alsl.d t4, a2, t4, 2 + vld vr4, t4, 0 + vldx vr5, t4, a2 + vldx vr6, t4, t1 + VST_QPEL8_V_LOWPASS_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr23, vr24 + vldx vr0, t4, t2 + alsl.d t4, a2, t4, 2 + vld vr1, t4, 0 + VSTX_QPEL8_V_LOWPASS_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1, vr25, vr26 + add.d t6, t4, zero // t6 = src + 6 * stride, reused for the lower 8 rows + alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride + vldx vr2, t4, a2 + vldx vr3, t4, t1 + VST_QPEL8_V_LOWPASS_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3, vr27, vr28 + vldx vr4, t4, t2 + alsl.d t4, a2, t4, 2 + vld vr5, t4, 0 + VSTX_QPEL8_V_LOWPASS_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5, vr29, vr30 + alsl.d a1, a2, t0, 3 // a1 = t0 + 8 * stride + addi.d t5, a1, 8 // t5 = a1 + 8 + VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, a1 + alsl.d a1, a2, a1, 2 + VLD_DOUBLE_QPEL8_H_LSX 
vr16, vr17, vr18, vr19, a1 + VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr23, vr24, vr25, vr26, vr12, vr13, \ + vr14, vr15, t5 + alsl.d t5, a2, t5, 2 + VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr27, vr28, vr29, vr30, vr16, vr17, \ + vr18, vr19, t5 + alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride + + vld vr0, t6, 0 // t6 = src + 6 * stride + vldx vr1, t6, a2 + vldx vr2, t6, t1 + vldx vr3, t6, t2 + alsl.d t6, a2, t6, 2 + vld vr4, t6, 0 + vldx vr5, t6, a2 + vldx vr6, t6, t1 + VST_QPEL8_V_LOWPASS_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr23, vr24 + vldx vr0, t6, t2 + alsl.d t6, a2, t6, 2 + vld vr1, t6, 0 + VSTX_QPEL8_V_LOWPASS_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1, vr25, vr26 + alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride + vldx vr2, t6, a2 + vldx vr3, t6, t1 + VST_QPEL8_V_LOWPASS_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3, vr27, vr28 + vldx vr4, t6, t2 + alsl.d t6, a2, t6, 2 + vld vr5, t6, 0 + VSTX_QPEL8_V_LOWPASS_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5, vr29, vr30 + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + fld.d f30, sp, 48 + fld.d f31, sp, 56 + addi.d sp, sp, 64 +endfunc + +function ff_avg_h264_qpel16_mc10_lsx + addi.d t0, a0, 0 // t0 = dst + addi.d t4, a1, -2 // t4 = src - 2 + addi.d t5, t4, 8 // t5 = src + 6 + slli.d t1, a2, 1 + add.d t2, a2, t1 + vldi vr20, 0x414 + vldi vr21, 0x405 + vldi vr22, 0x410 +.rept 2 + VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t4 + alsl.d t4, a2, t4, 2 + VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, t4 + VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr12, vr13, t5 + vld vr0, a1, 0 + vldx vr1, a1, a2 + vld vr12, t0, 0 + vldx vr13, t0, a2 + vavgr.bu vr0, vr0, vr2 + vavgr.bu vr1, vr1, vr3 + vavgr.bu vr0, vr0, vr12 + vavgr.bu vr1, vr1, vr13 + vst vr0, a0, 0 + vstx vr1, a0, a2 + VLDX_QPEL8_H_SSRANI_LSX vr2, vr3, vr14, vr15, t5 + vldx vr0, a1, t1 + vldx vr1, a1, t2 + vldx vr12, t0, t1 + vldx vr13, t0, t2 + vavgr.bu vr0, vr0, vr2 + vavgr.bu vr1, vr1, vr3 + vavgr.bu vr0, vr0, vr12 + vavgr.bu vr1, vr1, vr13 + vstx 
vr0, a0, t1 + vstx vr1, a0, t2 + alsl.d t5, a2, t5, 2 + alsl.d a1, a2, a1, 2 + alsl.d t0, a2, t0, 2 + alsl.d a0, a2, a0, 2 + VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr16, vr17, t5 + vld vr0, a1, 0 + vldx vr1, a1, a2 + vld vr12, t0, 0 + vldx vr13, t0, a2 + vavgr.bu vr0, vr0, vr2 + vavgr.bu vr1, vr1, vr3 + vavgr.bu vr0, vr0, vr12 + vavgr.bu vr1, vr1, vr13 + vst vr0, a0, 0 + vstx vr1, a0, a2 + VLDX_QPEL8_H_SSRANI_LSX vr2, vr3, vr18, vr19, t5 + vldx vr0, a1, t1 + vldx vr1, a1, t2 + vldx vr12, t0, t1 + vldx vr13, t0, t2 + vavgr.bu vr0, vr0, vr2 + vavgr.bu vr1, vr1, vr3 + vavgr.bu vr0, vr0, vr12 + vavgr.bu vr1, vr1, vr13 + vstx vr0, a0, t1 + vstx vr1, a0, t2 + alsl.d t5, a2, t5, 2 + alsl.d a1, a2, a1, 2 + alsl.d t0, a2, t0, 2 + alsl.d a0, a2, a0, 2 + alsl.d t4, a2, t4, 2 // t4 = src + 8 * stride - 2 after the first .rept pass +.endr +endfunc + +function ff_avg_h264_qpel16_mc30_lsx /* avg 16x16 (a0 = dst, a1 = src, a2 = stride): horizontal 6-tap result, averaged with the pixels loaded from src + 1, then averaged with the bytes already in dst */ + addi.d t0, a0, 0 // t0 = dst (read pointer; a0 is the write pointer) + addi.d t4, a1, -2 // t4 = src - 2 + addi.d t5, t4, 8 // t5 = src + 6 + addi.d a1, a1, 1 // a1 = src + 1 + slli.d t1, a2, 1 // t1 = 2 * stride + add.d t2, a2, t1 // t2 = 3 * stride + vldi vr20, 0x414 + vldi vr21, 0x405 + vldi vr22, 0x410 +.rept 2 + VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t4 + alsl.d t4, a2, t4, 2 + VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, t4 + VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr12, vr13, t5 + vld vr0, a1, 0 + vldx vr1, a1, a2 + vld vr12, t0, 0 + vldx vr13, t0, a2 + vavgr.bu vr0, vr0, vr2 + vavgr.bu vr1, vr1, vr3 + vavgr.bu vr0, vr0, vr12 + vavgr.bu vr1, vr1, vr13 + vst vr0, a0, 0 + vstx vr1, a0, a2 + VLDX_QPEL8_H_SSRANI_LSX vr2, vr3, vr14, vr15, t5 + vldx vr0, a1, t1 + vldx vr1, a1, t2 + vldx vr12, t0, t1 + vldx vr13, t0, t2 + vavgr.bu vr0, vr0, vr2 + vavgr.bu vr1, vr1, vr3 + vavgr.bu vr0, vr0, vr12 + vavgr.bu vr1, vr1, vr13 + vstx vr0, a0, t1 + vstx vr1, a0, t2 + alsl.d t5, a2, t5, 2 + alsl.d a1, a2, a1, 2 + alsl.d t0, a2, t0, 2 + alsl.d a0, a2, a0, 2 + VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr16, vr17, t5 + vld vr0, a1, 0 + vldx vr1, a1, a2 + vld vr12, t0, 0 + vldx vr13, t0, a2 + vavgr.bu vr0, vr0, vr2 + vavgr.bu vr1, vr1, vr3 + 
vavgr.bu vr0, vr0, vr12 + vavgr.bu vr1, vr1, vr13 + vst vr0, a0, 0 + vstx vr1, a0, a2 + VLDX_QPEL8_H_SSRANI_LSX vr2, vr3, vr18, vr19, t5 + vldx vr0, a1, t1 + vldx vr1, a1, t2 + vldx vr12, t0, t1 + vldx vr13, t0, t2 + vavgr.bu vr0, vr0, vr2 + vavgr.bu vr1, vr1, vr3 + vavgr.bu vr0, vr0, vr12 + vavgr.bu vr1, vr1, vr13 + vstx vr0, a0, t1 + vstx vr1, a0, t2 + alsl.d t5, a2, t5, 2 + alsl.d a1, a2, a1, 2 + alsl.d t0, a2, t0, 2 + alsl.d a0, a2, a0, 2 + alsl.d t4, a2, t4, 2 // t4 = src + 8 * stride - 2 after the first .rept pass +.endr +endfunc + +function ff_put_h264_qpel16_mc02_lsx + slli.d t0, a2, 1 + add.d t1, t0, a2 + sub.d t2, a1, t0 // t2 = src - 2 * stride + vldi vr20, 0x414 + vldi vr21, 0x405 + vldi vr22, 0x410 + + vld vr0, t2, 0 + vldx vr1, t2, a2 + vldx vr2, t2, t0 + vldx vr3, t2, t1 + alsl.d t2, a2, t2, 2 // t2 = t2 + 4 * stride + vld vr4, t2, 0 + vldx vr5, t2, a2 + vldx vr6, t2, t0 + QPEL8_V_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6 + vst vr13, a0, 0 + vstx vr14, a0, a2 + vldx vr0, t2, t1 + alsl.d t2, a2, t2, 2 // t2 = t2 + 4 *stride + vld vr1, t2, 0 + QPEL8_V_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1 + vstx vr13, a0, t0 + vstx vr14, a0, t1 + alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride + vldx vr2, t2, a2 + vldx vr3, t2, t0 + QPEL8_V_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3 + vst vr13, a0, 0 + vstx vr14, a0, a2 + vldx vr4, t2, t1 + alsl.d t2, a2, t2, 2 // t2 = t2 + 4 * stride + vld vr5, t2, 0 + QPEL8_V_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5 + vstx vr13, a0, t0 + vstx vr14, a0, t1 + + alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride + + vldx vr6, t2, a2 + vldx vr0, t2, t0 + QPEL8_V_LSX vr1, vr2, vr3, vr4, vr5, vr6, vr0 + vst vr13, a0, 0 + vstx vr14, a0, a2 + vldx vr1, t2, t1 + alsl.d t2, a2, t2, 2 // t2 = t2 + 4 * stride + vld vr2, t2, 0 + QPEL8_V_LSX vr3, vr4, vr5, vr6, vr0, vr1, vr2 + vstx vr13, a0, t0 + vstx vr14, a0, t1 + alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride + vldx vr3, t2, a2 + vldx vr4, t2, t0 + QPEL8_V_LSX vr5, vr6, vr0, vr1, vr2, vr3, vr4 + vst vr13, a0, 0 + vstx vr14, a0, a2 + vldx 
vr5, t2, t1 + alsl.d t2, a2, t2, 2 // t2 = t2 + 4 * stride + vld vr6, t2, 0 + QPEL8_V_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6 + vstx vr13, a0, t0 + vstx vr14, a0, t1 +endfunc + +.macro avc_luma_hv_qrt_and_aver_dst_16x16_lsx /* Shared body of ff_avg_h264_qpel16_mc11/mc13/mc31/mc33_lsx. The caller must have set t1 = 2 * stride, t2 = 3 * stride, t0 = first row for the horizontal filter (src - 2, plus stride for mc13/mc33), t4 = first row for the vertical filter (src - 2 * stride, plus 1 for mc31/mc33) and t8 = dst; a0 = dst, a2 = stride. Each stored row is vavgr.bu(vavgr.bu(horizontal result, vertical result), dst row). Clobbers a1, t5 and t6. */ + addi.d sp, sp, -64 // save f24-f31, restored at the end of the macro + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + fst.d f30, sp, 48 + fst.d f31, sp, 56 + vldi vr20, 0x414 // 20: weight of the two center taps + vldi vr21, 0x405 // 5: weight of the taps next to the center (subtracted) + vldi vr22, 0x410 // 16: rounding term added before the >> 5 + + VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t0 + alsl.d a1, a2, t0, 2 + VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, a1 + addi.d a1, t0, 8 + VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr23, vr24, vr25, vr26, vr12, vr13, \ + vr14, vr15, a1 + alsl.d a1, a2, a1, 2 + VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr27, vr28, vr29, vr30, vr16, vr17, \ + vr18, vr19, a1 + vld vr0, t4, 0 // t4 = src - 2 * stride (+ 1 for mc31/mc33), set by the caller + vldx vr1, t4, a2 + vldx vr2, t4, t1 + vldx vr3, t4, t2 + alsl.d t4, a2, t4, 2 + vld vr4, t4, 0 + vldx vr5, t4, a2 + vldx vr6, t4, t1 + QPEL8_V_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6 + vld vr0, t8, 0 + vldx vr1, t8, a2 + vavgr.bu vr13, vr23, vr13 + vavgr.bu vr14, vr24, vr14 + vavgr.bu vr13, vr13, vr0 + vavgr.bu vr14, vr14, vr1 + vst vr13, a0, 0 + vstx vr14, a0, a2 + vldx vr0, t4, t2 + alsl.d t4, a2, t4, 2 + vld vr1, t4, 0 + QPEL8_V_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1 + vldx vr2, t8, t1 + vldx vr3, t8, t2 + vavgr.bu vr13, vr25, vr13 + vavgr.bu vr14, vr26, vr14 + vavgr.bu vr13, vr13, vr2 + vavgr.bu vr14, vr14, vr3 + add.d t6, t4, zero // t6 = src + 6 * stride (+ 1 for mc31/mc33), reused for the lower 8 rows + vstx vr13, a0, t1 + vstx vr14, a0, t2 + alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride + alsl.d t8, a2, t8, 2 + vldx vr2, t4, a2 + vldx vr3, t4, t1 + QPEL8_V_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3 + vld vr4, t8, 0 + vldx vr5, t8, a2 + vavgr.bu vr13, vr27, vr13 + vavgr.bu vr14, vr28, vr14 + vavgr.bu vr13, vr13, vr4 + vavgr.bu vr14, vr14, vr5 + vst vr13, a0, 0 + vstx vr14, a0, a2 + vldx vr4, t4, t2 + alsl.d t4, a2, t4, 2 + vld vr5, t4, 0 + QPEL8_V_LSX vr6, vr0, vr1, vr2, vr3, 
vr4, vr5 + vldx vr6, t8, t1 + vldx vr0, t8, t2 + vavgr.bu vr13, vr29, vr13 + vavgr.bu vr14, vr30, vr14 + vavgr.bu vr13, vr13, vr6 + vavgr.bu vr14, vr14, vr0 + vstx vr13, a0, t1 + vstx vr14, a0, t2 + alsl.d a1, a2, t0, 3 // a1 = t0 + 8 * stride + addi.d t5, a1, 8 // t5 = a1 + 8 + VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, a1 + alsl.d a1, a2, a1, 2 + VLD_DOUBLE_QPEL8_H_LSX vr16, vr17, vr18, vr19, a1 + VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr23, vr24, vr25, vr26, vr12, vr13, \ + vr14, vr15, t5 + alsl.d t5, a2, t5, 2 + VLD_DOUBLE_QPEL8_H_SSRANI_LSX vr27, vr28, vr29, vr30, vr16, vr17, \ + vr18, vr19, t5 + alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride + alsl.d t8, a2, t8, 2 + // t6 = src + 6 * stride (+ 1 for mc31/mc33) + vld vr0, t6, 0 + vldx vr1, t6, a2 + vldx vr2, t6, t1 + vldx vr3, t6, t2 + alsl.d t6, a2, t6, 2 + vld vr4, t6, 0 + vldx vr5, t6, a2 + vldx vr6, t6, t1 + QPEL8_V_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6 + vld vr0, t8, 0 + vldx vr1, t8, a2 + vavgr.bu vr13, vr23, vr13 + vavgr.bu vr14, vr24, vr14 + vavgr.bu vr13, vr13, vr0 + vavgr.bu vr14, vr14, vr1 + vst vr13, a0, 0 + vstx vr14, a0, a2 + vldx vr0, t6, t2 + alsl.d t6, a2, t6, 2 + vld vr1, t6, 0 + QPEL8_V_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1 + vldx vr2, t8, t1 + vldx vr3, t8, t2 + vavgr.bu vr13, vr25, vr13 + vavgr.bu vr14, vr26, vr14 + vavgr.bu vr13, vr13, vr2 + vavgr.bu vr14, vr14, vr3 + vstx vr13, a0, t1 + vstx vr14, a0, t2 + alsl.d a0, a2, a0, 2 // dst = dst + 4 * stride + alsl.d t8, a2, t8, 2 + vldx vr2, t6, a2 + vldx vr3, t6, t1 + QPEL8_V_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3 + vld vr4, t8, 0 + vldx vr5, t8, a2 + vavgr.bu vr13, vr27, vr13 + vavgr.bu vr14, vr28, vr14 + vavgr.bu vr13, vr13, vr4 + vavgr.bu vr14, vr14, vr5 + vst vr13, a0, 0 + vstx vr14, a0, a2 + vldx vr4, t6, t2 + alsl.d t6, a2, t6, 2 + vld vr5, t6, 0 + QPEL8_V_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5 + vldx vr6, t8, t1 + vldx vr0, t8, t2 + vavgr.bu vr13, vr29, vr13 + vavgr.bu vr14, vr30, vr14 + vavgr.bu vr13, vr13, vr6 + vavgr.bu vr14, vr14, vr0 + 
vstx vr13, a0, t1 + vstx vr14, a0, t2 + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + fld.d f30, sp, 48 + fld.d f31, sp, 56 + addi.d sp, sp, 64 +.endm + +function ff_avg_h264_qpel16_mc33_lsx + slli.d t1, a2, 1 + add.d t2, t1, a2 + addi.d t0, a1, -2 // t0 = src - 2 + add.d t0, t0, a2 // t0 = src + stride - 2 + add.d t3, a1, zero // t3 = src (not read by the macro below) + sub.d t4, a1, t1 // t4 = src - 2 * stride + addi.d t4, t4, 1 // t4 = src - 2 * stride + 1 + addi.d t8, a0, 0 // t8 = dst + avc_luma_hv_qrt_and_aver_dst_16x16_lsx +endfunc + +function ff_avg_h264_qpel16_mc11_lsx + slli.d t1, a2, 1 + add.d t2, t1, a2 + addi.d t0, a1, -2 // t0 = src - 2 + add.d t3, a1, zero // t3 = src (not read by the macro below) + sub.d t4, a1, t1 // t4 = src - 2 * stride + addi.d t8, a0, 0 // t8 = dst + avc_luma_hv_qrt_and_aver_dst_16x16_lsx +endfunc + +function ff_avg_h264_qpel16_mc31_lsx + slli.d t1, a2, 1 + add.d t2, t1, a2 + addi.d t0, a1, -2 // t0 = src - 2 + add.d t3, a1, zero // t3 = src (not read by the macro below) + sub.d t4, a1, t1 // t4 = src - 2 * stride + addi.d t4, t4, 1 // t4 = src - 2 * stride + 1 + addi.d t8, a0, 0 // t8 = dst + avc_luma_hv_qrt_and_aver_dst_16x16_lsx +endfunc + +function ff_avg_h264_qpel16_mc13_lsx + slli.d t1, a2, 1 + add.d t2, t1, a2 + addi.d t0, a1, -2 // t0 = src - 2 + add.d t0, t0, a2 // t0 = src + stride - 2 + add.d t3, a1, zero // t3 = src (not read by the macro below) + sub.d t4, a1, t1 // t4 = src - 2 * stride + addi.d t8, a0, 0 // t8 = dst + avc_luma_hv_qrt_and_aver_dst_16x16_lsx +endfunc + +function ff_avg_h264_qpel16_mc20_lsx + slli.d t1, a2, 1 + add.d t2, t1, a2 + vldi vr20, 0x414 + vldi vr21, 0x405 + vldi vr22, 0x410 + addi.d t0, a1, -2 // t0 = src - 2 + addi.d t5, a0, 0 + addi.d a1, t0, 8 +.rept 4 + VLD_DOUBLE_QPEL8_H_LSX vr12, vr13, vr14, vr15, t0 + VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr12, vr13, a1 + vld vr0, t5, 0 + vldx vr1, t5, a2 + vavgr.bu vr0, vr0, vr2 + vavgr.bu vr1, vr1, vr3 + vst vr0, a0, 0 + vstx vr1, a0, a2 + add.d a1, a1, t1 + VLD_QPEL8_H_SSRANI_LSX vr2, vr3, vr14, vr15, a1 + vldx vr0, t5, t1 + vldx vr1, t5, t2 + vavgr.bu vr0, vr0, vr2 + vavgr.bu vr1, vr1, vr3 + vstx vr0, a0, t1 + vstx vr1, a0, t2 + 
t2 + alsl.d t0, a2, t0, 2 + alsl.d t5, a2, t5, 2 + alsl.d a0, a2, a0, 2 + alsl.d a1, a2, a1, 1 +.endr +endfunc + +.macro QPEL8_HV_H_LSX out0, out1 + vbsrl.v vr2, vr0, 1 + vbsrl.v vr3, vr1, 1 + vbsrl.v vr4, vr0, 2 + vbsrl.v vr5, vr1, 2 + vbsrl.v vr6, vr0, 3 + vbsrl.v vr7, vr1, 3 + vbsrl.v vr8, vr0, 4 + vbsrl.v vr9, vr1, 4 + vbsrl.v vr10, vr0, 5 + vbsrl.v vr11, vr1, 5 + vilvl.b vr6, vr4, vr6 + vilvl.b vr7, vr5, vr7 + vilvl.b vr8, vr2, vr8 + vilvl.b vr9, vr3, vr9 + vilvl.b vr10, vr0, vr10 + vilvl.b vr11, vr1, vr11 + vhaddw.hu.bu vr6, vr6, vr6 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr11, vr11, vr11 + vmul.h vr2, vr6, vr20 + vmul.h vr3, vr7, vr20 + vmul.h vr4, vr8, vr21 + vmul.h vr5, vr9, vr21 + vssub.h vr2, vr2, vr4 + vssub.h vr3, vr3, vr5 + vsadd.h \out0, vr2, vr10 + vsadd.h \out1, vr3, vr11 +.endm + +.macro QPEL8_HV_V_LSX in0, in1, in2, in3, in4, in5, in6, out0, out1, out2, out3 + vilvl.h vr0, \in2, \in3 + vilvl.h vr1, \in3, \in4 // tmp0 + vilvl.h vr2, \in1, \in4 + vilvl.h vr3, \in2, \in5 // tmp2 + vilvl.h vr4, \in0, \in5 + vilvl.h vr5, \in1, \in6 // tmp4 + vhaddw.w.h vr0, vr0, vr0 + vhaddw.w.h vr1, vr1, vr1 + vhaddw.w.h vr2, vr2, vr2 + vhaddw.w.h vr3, vr3, vr3 + vhaddw.w.h vr4, vr4, vr4 + vhaddw.w.h vr5, vr5, vr5 + vmul.w vr0, vr0, vr22 + vmul.w vr1, vr1, vr22 + vmul.w vr2, vr2, vr23 + vmul.w vr3, vr3, vr23 + vssub.w vr0, vr0, vr2 + vssub.w vr1, vr1, vr3 + vsadd.w vr0, vr0, vr4 + vsadd.w vr1, vr1, vr5 + vsadd.w \out0, vr0, vr24 + vsadd.w \out1, vr1, vr24 + vilvh.h vr0, \in2, \in3 + vilvh.h vr1, \in3, \in4 // tmp0 + vilvh.h vr2, \in1, \in4 + vilvh.h vr3, \in2, \in5 // tmp2 + vilvh.h vr4, \in0, \in5 + vilvh.h vr5, \in1, \in6 // tmp4 + vhaddw.w.h vr0, vr0, vr0 + vhaddw.w.h vr1, vr1, vr1 + vhaddw.w.h vr2, vr2, vr2 + vhaddw.w.h vr3, vr3, vr3 + vhaddw.w.h vr4, vr4, vr4 + vhaddw.w.h vr5, vr5, vr5 + vmul.w vr0, vr0, vr22 + vmul.w vr1, vr1, vr22 + vmul.w vr2, vr2, vr23 + vmul.w 
vr3, vr3, vr23 + vssub.w vr0, vr0, vr2 + vssub.w vr1, vr1, vr3 + vsadd.w vr0, vr0, vr4 + vsadd.w vr1, vr1, vr5 + vsadd.w \out2, vr0, vr24 + vsadd.w \out3, vr1, vr24 + vssrani.hu.w \out2, \out0, 10 + vssrani.hu.w \out3, \out1, 10 + vssrani.bu.h \out3, \out2, 0 +.endm + +.macro h264_qpel8_hv_lowpass_core_lsx in0, in1, type + vld vr0, \in0, 0 + vldx vr1, \in0, a3 + QPEL8_HV_H_LSX vr12, vr13 // a b$ + vldx vr0, \in0, t1 + vldx vr1, \in0, t2 + QPEL8_HV_H_LSX vr14, vr15 // c d$ + + alsl.d \in0, a3, \in0, 2 + + vld vr0, \in0, 0 + vldx vr1, \in0, a3 + QPEL8_HV_H_LSX vr16, vr17 // e f$ + vldx vr0, \in0, t1 + vldx vr1, \in0, t2 + QPEL8_HV_H_LSX vr18, vr19 // g h$ + QPEL8_HV_V_LSX vr12, vr13, vr14, vr15, vr16, vr17, vr18, vr6, vr7, vr0, vr1 +.ifc \type, avg + fld.d f2, t3, 0 + fldx.d f3, t3, a2 + vilvl.d vr2, vr3, vr2 + vavgr.bu vr1, vr2, vr1 +.endif + vstelm.d vr1, \in1, 0, 0 + add.d \in1, \in1, a2 + vstelm.d vr1, \in1, 0, 1 + + alsl.d \in0, a3, \in0, 2 + + // tmp8 + vld vr0, \in0, 0 + vldx vr1, \in0, a3 + QPEL8_HV_H_LSX vr12, vr13 + QPEL8_HV_V_LSX vr14, vr15, vr16, vr17, vr18, vr19, vr12, vr6, vr7, vr0, vr1 +.ifc \type, avg + fldx.d f2, t3, t5 + fldx.d f3, t3, t6 + vilvl.d vr2, vr3, vr2 + vavgr.bu vr1, vr2, vr1 +.endif + add.d \in1, \in1, a2 + vstelm.d vr1, \in1, 0, 0 + add.d \in1, \in1, a2 + vstelm.d vr1, \in1, 0, 1 + + // tmp10 + vldx vr0, \in0, t1 + vldx vr1, \in0, t2 + QPEL8_HV_H_LSX vr14, vr15 + QPEL8_HV_V_LSX vr16, vr17, vr18, vr19, vr12, vr13, vr14, vr6, vr7, vr0, vr1 +.ifc \type, avg + alsl.d t3, a2, t3, 2 + fld.d f2, t3, 0 + fldx.d f3, t3, a2 + vilvl.d vr2, vr3, vr2 + vavgr.bu vr1, vr2, vr1 +.endif + add.d \in1, \in1, a2 + vstelm.d vr1, \in1, 0, 0 + add.d \in1, \in1, a2 + vstelm.d vr1, \in1, 0, 1 + + // tmp12 + alsl.d \in0, a3, \in0, 2 + + vld vr0, \in0, 0 + vldx vr1, \in0, a3 + QPEL8_HV_H_LSX vr16, vr17 + QPEL8_HV_V_LSX vr18, vr19, vr12, vr13, vr14, vr15, vr16, vr6, vr7, vr0, vr1 +.ifc \type, avg + fldx.d f2, t3, t5 + fldx.d f3, t3, t6 + vilvl.d vr2, vr3, vr2 + 
vavgr.bu vr1, vr2, vr1 +.endif + add.d \in1, \in1, a2 + vstelm.d vr1, \in1, 0, 0 + add.d \in1, \in1, a2 + vstelm.d vr1, \in1, 0, 1 +.endm + +function put_h264_qpel8_hv_lowpass_lsx + slli.d t1, a3, 1 + add.d t2, t1, a3 + addi.d sp, sp, -8 + fst.d f24, sp, 0 + addi.d t0, a1, -2 // t0 = src - 2 + sub.d t0, t0, t1 // t0 = t0 - 2 * stride + vldi vr20, 0x414 // h_20 + vldi vr21, 0x405 // h_5 + vldi vr22, 0x814 // w_20 + vldi vr23, 0x805 // w_5 + addi.d t4, zero, 512 + vreplgr2vr.w vr24, t4 // w_512 + h264_qpel8_hv_lowpass_core_lsx t0, a0, put + fld.d f24, sp, 0 + addi.d sp, sp, 8 +endfunc + +function put_h264_qpel8_h_lowpass_lsx + slli.d t1, a3, 1 + add.d t2, t1, a3 + vldi vr20, 0x414 + vldi vr21, 0x405 + vldi vr22, 0x410 + addi.d t0, a1, -2 // t0 = src - 2 + add.d t3, a1, zero // t3 = src +.rept 2 + vld vr0, t0, 0 + vldx vr1, t0, a3 + QPEL8_H_LSX vr12, vr13 + vssrani.bu.h vr13, vr12, 5 + vstelm.d vr13, a0, 0, 0 + add.d a0, a0, a2 + vstelm.d vr13, a0, 0, 1 + add.d a0, a0, a2 + vldx vr0, t0, t1 + vldx vr1, t0, t2 + QPEL8_H_LSX vr12, vr13 + vssrani.bu.h vr13, vr12, 5 + vstelm.d vr13, a0, 0, 0 + add.d a0, a0, a2 + vstelm.d vr13, a0, 0, 1 + add.d a0, a0, a2 + alsl.d t0, a3, t0, 2 +.endr +endfunc + +function put_pixels16_l2_8_lsx + slli.d t0, a4, 1 + add.d t1, t0, a4 + slli.d t2, t0, 1 + slli.d t3, a3, 1 + add.d t4, t3, a3 + slli.d t5, t3, 1 +.rept 4 + vld vr0, a1, 0 + vldx vr1, a1, a4 + vldx vr2, a1, t0 + vldx vr3, a1, t1 + add.d a1, a1, t2 + vld vr8, a2, 0x00 + vld vr9, a2, 0x10 + vld vr10, a2, 0x20 + vld vr11, a2, 0x30 + addi.d a2, a2, 0x40 + vavgr.bu vr0, vr8, vr0 + vavgr.bu vr1, vr9, vr1 + vavgr.bu vr2, vr10, vr2 + vavgr.bu vr3, vr11, vr3 + vst vr0, a0, 0 + vstx vr1, a0, a3 + vstx vr2, a0, t3 + vstx vr3, a0, t4 + add.d a0, a0, t5 +.endr +endfunc + +.macro QPEL8_V1_LSX in0, in1, in2, in3, in4, in5, in6 + vilvl.b vr7, \in3, \in2 + vilvl.b vr8, \in4, \in3 + vilvl.b vr9, \in4, \in1 + vilvl.b vr10, \in5, \in2 + vilvl.b vr11, \in5, \in0 + vilvl.b vr12, \in6, \in1 + 
vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr11, vr11, vr11 + vhaddw.hu.bu vr12, vr12, vr12 + vmul.h vr7, vr7, vr20 + vmul.h vr8, vr8, vr20 + vmul.h vr9, vr9, vr21 + vmul.h vr10, vr10, vr21 + vssub.h vr7, vr7, vr9 + vssub.h vr8, vr8, vr10 + vsadd.h vr7, vr7, vr11 + vsadd.h vr8, vr8, vr12 + vsadd.h vr7, vr7, vr22 + vsadd.h vr8, vr8, vr22 + vssrani.bu.h vr8, vr7, 5 +.endm + +.macro h264_qpel8_v_lowpass_lsx type +function \type\()_h264_qpel8_v_lowpass_lsx + slli.d t0, a3, 1 + add.d t1, t0, a3 + sub.d t2, a1, t0 // t2 = src - 2 * stride +.ifc \type, avg + addi.d t3, a0, 0 + slli.d t4, a2, 1 + add.d t5, t4, a2 +.endif + vldi vr20, 0x414 + vldi vr21, 0x405 + vldi vr22, 0x410 + + fld.d f0, t2, 0 + fldx.d f1, t2, a3 + fldx.d f2, t2, t0 + fldx.d f3, t2, t1 + alsl.d t2, a3, t2, 2 // t2 = t2 + 4 * stride + fld.d f4, t2, 0 + fldx.d f5, t2, a3 + fldx.d f6, t2, t0 + QPEL8_V1_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6 +.ifc \type, avg + fld.d f0, t3, 0 + fldx.d f1, t3, a2 + vilvl.d vr0, vr1, vr0 + vavgr.bu vr8, vr8, vr0 +.endif + vstelm.d vr8, a0, 0, 0 + add.d a0, a0, a2 + vstelm.d vr8, a0, 0, 1 + add.d a0, a0, a2 + + fldx.d f0, t2, t1 + alsl.d t2, a3, t2, 2 // t2 = t2 + 4 *stride + fld.d f1, t2, 0 + QPEL8_V1_LSX vr2, vr3, vr4, vr5, vr6, vr0, vr1 +.ifc \type, avg + fldx.d f2, t3, t4 + fldx.d f3, t3, t5 + vilvl.d vr2, vr3, vr2 + vavgr.bu vr8, vr8, vr2 +.endif + vstelm.d vr8, a0, 0, 0 + add.d a0, a0, a2 + vstelm.d vr8, a0, 0, 1 + add.d a0, a0, a2 + + alsl.d t3, a2, t3, 2 + + fldx.d f2, t2, a3 + fldx.d f3, t2, t0 + QPEL8_V1_LSX vr4, vr5, vr6, vr0, vr1, vr2, vr3 +.ifc \type, avg + fld.d f4, t3, 0 + fldx.d f5, t3, a2 + vilvl.d vr4, vr5, vr4 + vavgr.bu vr8, vr8, vr4 +.endif + vstelm.d vr8, a0, 0, 0 + add.d a0, a0, a2 + vstelm.d vr8, a0, 0, 1 + add.d a0, a0, a2 + + fldx.d f4, t2, t1 + alsl.d t2, a3, t2, 2 // t2 = t2 + 4 * stride + fld.d f5, t2, 0 + QPEL8_V1_LSX vr6, vr0, vr1, vr2, vr3, vr4, vr5 +.ifc 
\type, avg + fldx.d f6, t3, t4 + fldx.d f0, t3, t5 + vilvl.d vr6, vr0, vr6 + vavgr.bu vr8, vr8, vr6 +.endif + vstelm.d vr8, a0, 0, 0 + add.d a0, a0, a2 + vstelm.d vr8, a0, 0, 1 +endfunc +.endm + +h264_qpel8_v_lowpass_lsx put +h264_qpel8_v_lowpass_lsx avg + +function avg_pixels16_l2_8_lsx + slli.d t0, a4, 1 + add.d t1, t0, a4 + slli.d t2, t0, 1 + slli.d t3, a3, 1 + add.d t4, t3, a3 + slli.d t5, t3, 1 + addi.d t6, a0, 0 +.rept 4 + vld vr0, a1, 0 + vldx vr1, a1, a4 + vldx vr2, a1, t0 + vldx vr3, a1, t1 + add.d a1, a1, t2 + vld vr8, a2, 0x00 + vld vr9, a2, 0x10 + vld vr10, a2, 0x20 + vld vr11, a2, 0x30 + addi.d a2, a2, 0x40 + vavgr.bu vr0, vr8, vr0 + vavgr.bu vr1, vr9, vr1 + vavgr.bu vr2, vr10, vr2 + vavgr.bu vr3, vr11, vr3 + vld vr8, t6, 0 + vldx vr9, t6, a3 + vldx vr10, t6, t3 + vldx vr11, t6, t4 + add.d t6, t6, t5 + vavgr.bu vr0, vr8, vr0 + vavgr.bu vr1, vr9, vr1 + vavgr.bu vr2, vr10, vr2 + vavgr.bu vr3, vr11, vr3 + vst vr0, a0, 0 + vstx vr1, a0, a3 + vstx vr2, a0, t3 + vstx vr3, a0, t4 + add.d a0, a0, t5 +.endr +endfunc + +function avg_h264_qpel8_hv_lowpass_lsx + slli.d t1, a3, 1 + add.d t2, t1, a3 + slli.d t5, a2, 1 + add.d t6, a2, t5 + addi.d sp, sp, -8 + fst.d f24, sp, 0 + vldi vr20, 0x414 // h_20 + vldi vr21, 0x405 // h_5 + vldi vr22, 0x814 // w_20 + vldi vr23, 0x805 // w_5 + addi.d t4, zero, 512 + vreplgr2vr.w vr24, t4 // w_512 + addi.d t0, a1, -2 // t0 = src - 2 + sub.d t0, t0, t1 // t0 = t0 - 2 * stride + addi.d t3, a0, 0 // t3 = dst + h264_qpel8_hv_lowpass_core_lsx t0, a0, avg + fld.d f24, sp, 0 + addi.d sp, sp, 8 +endfunc + +function put_pixels8_l2_8_lsx + slli.d t0, a4, 1 + add.d t1, t0, a4 + slli.d t2, t0, 1 +.rept 2 + vld vr0, a1, 0 + vldx vr1, a1, a4 + vldx vr2, a1, t0 + vldx vr3, a1, t1 + add.d a1, a1, t2 + vilvl.d vr0, vr1, vr0 + vilvl.d vr2, vr3, vr2 + vld vr8, a2, 0x00 + vld vr9, a2, 0x08 + vld vr10, a2, 0x10 + vld vr11, a2, 0x18 + vilvl.d vr8, vr9, vr8 + vilvl.d vr10, vr11, vr10 + addi.d a2, a2, 32 + vavgr.bu vr0, vr8, vr0 + vavgr.bu vr2, vr10, 
vr2 + vstelm.d vr0, a0, 0, 0 + add.d a0, a0, a3 + vstelm.d vr0, a0, 0, 1 + add.d a0, a0, a3 + vstelm.d vr2, a0, 0, 0 + add.d a0, a0, a3 + vstelm.d vr2, a0, 0, 1 + add.d a0, a0, a3 +.endr +endfunc + +function ff_put_h264_qpel8_mc00_lsx + slli.d t0, a2, 1 + add.d t1, t0, a2 + slli.d t2, t0, 1 + ld.d t3, a1, 0x0 + ldx.d t4, a1, a2 + ldx.d t5, a1, t0 + ldx.d t6, a1, t1 + st.d t3, a0, 0x0 + stx.d t4, a0, a2 + stx.d t5, a0, t0 + stx.d t6, a0, t1 + add.d a1, a1, t2 + add.d a0, a0, t2 + ld.d t3, a1, 0x0 + ldx.d t4, a1, a2 + ldx.d t5, a1, t0 + ldx.d t6, a1, t1 + st.d t3, a0, 0x0 + stx.d t4, a0, a2 + stx.d t5, a0, t0 + stx.d t6, a0, t1 +endfunc + +function ff_avg_h264_qpel8_mc00_lsx + slli.d t0, a2, 1 + add.d t1, t0, a2 + slli.d t2, t0, 1 + addi.d t3, a0, 0 +.rept 2 + vld vr0, a1, 0 + vldx vr1, a1, a2 + vldx vr2, a1, t0 + vldx vr3, a1, t1 + add.d a1, a1, t2 + vilvl.d vr0, vr1, vr0 + vilvl.d vr2, vr3, vr2 + vld vr8, t3, 0 + vldx vr9, t3, a2 + vldx vr10, t3, t0 + vldx vr11, t3, t1 + add.d t3, t3, t2 + vilvl.d vr8, vr9, vr8 + vilvl.d vr10, vr11, vr10 + vavgr.bu vr0, vr8, vr0 + vavgr.bu vr2, vr10, vr2 + vstelm.d vr0, a0, 0, 0 + add.d a0, a0, a2 + vstelm.d vr0, a0, 0, 1 + add.d a0, a0, a2 + vstelm.d vr2, a0, 0, 0 + add.d a0, a0, a2 + vstelm.d vr2, a0, 0, 1 + add.d a0, a0, a2 +.endr +endfunc + +function avg_pixels8_l2_8_lsx + slli.d t0, a4, 1 + add.d t1, t0, a4 + slli.d t2, t0, 1 + addi.d t3, a0, 0 + slli.d t4, a3, 1 + add.d t5, t4, a3 + slli.d t6, t4, 1 +.rept 2 + vld vr0, a1, 0 + vldx vr1, a1, a4 + vldx vr2, a1, t0 + vldx vr3, a1, t1 + add.d a1, a1, t2 + vilvl.d vr0, vr1, vr0 + vilvl.d vr2, vr3, vr2 + vld vr8, a2, 0x00 + vld vr9, a2, 0x08 + vld vr10, a2, 0x10 + vld vr11, a2, 0x18 + addi.d a2, a2, 0x20 + vilvl.d vr8, vr9, vr8 + vilvl.d vr10, vr11, vr10 + vavgr.bu vr0, vr8, vr0 + vavgr.bu vr2, vr10, vr2 + vld vr8, t3, 0 + vldx vr9, t3, a3 + vldx vr10, t3, t4 + vldx vr11, t3, t5 + add.d t3, t3, t6 + vilvl.d vr8, vr9, vr8 + vilvl.d vr10, vr11, vr10 + vavgr.bu vr0, vr8, vr0 + 
vavgr.bu vr2, vr10, vr2 + vstelm.d vr0, a0, 0, 0 + add.d a0, a0, a3 + vstelm.d vr0, a0, 0, 1 + add.d a0, a0, a3 + vstelm.d vr2, a0, 0, 0 + add.d a0, a0, a3 + vstelm.d vr2, a0, 0, 1 + add.d a0, a0, a3 +.endr +endfunc + +function avg_h264_qpel8_h_lowpass_lsx + slli.d t1, a3, 1 + add.d t2, t1, a3 + slli.d t5, a2, 1 + add.d t6, t5, a2 + vldi vr20, 0x414 + vldi vr21, 0x405 + vldi vr22, 0x410 + addi.d t0, a1, -2 // t0 = src - 2 + add.d t3, a1, zero // t3 = src + addi.d t4, a0, 0 // t4 = dst +.rept 4 + vld vr0, t0, 0 + vldx vr1, t0, a3 + QPEL8_H_LSX vr12, vr13 + vssrani.bu.h vr13, vr12, 5 + fld.d f0, t4, 0 + fldx.d f1, t4, a2 + vilvl.d vr0, vr1, vr0 + vavgr.bu vr13, vr13, vr0 + vstelm.d vr13, a0, 0, 0 + add.d a0, a0, a2 + vstelm.d vr13, a0, 0, 1 + add.d a0, a0, a2 + add.d t0, t0, t1 // t0 += 2 * a3 (next two source rows) + add.d t4, t4, t5 // t4 += 2 * a2, keeping the dst read pointer in step with a0 +.endr +endfunc diff --git a/libavcodec/loongarch/h264qpel_init_loongarch.c b/libavcodec/loongarch/h264qpel_init_loongarch.c index 969c9c376c..9d3a5cb164 100644 --- a/libavcodec/loongarch/h264qpel_init_loongarch.c +++ b/libavcodec/loongarch/h264qpel_init_loongarch.c @@ -19,7 +19,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#include "h264qpel_lasx.h" +#include "h264qpel_loongarch.h" #include "libavutil/attributes.h" #include "libavutil/loongarch/cpu.h" #include "libavcodec/h264qpel.h" @@ -27,6 +27,77 @@ av_cold void ff_h264qpel_init_loongarch(H264QpelContext *c, int bit_depth) { int cpu_flags = av_get_cpu_flags(); + + if (have_lsx(cpu_flags)) { + if (8 == bit_depth) { + c->put_h264_qpel_pixels_tab[0][0] = ff_put_h264_qpel16_mc00_lsx; + c->put_h264_qpel_pixels_tab[0][1] = ff_put_h264_qpel16_mc10_lsx; + c->put_h264_qpel_pixels_tab[0][2] = ff_put_h264_qpel16_mc20_lsx; + c->put_h264_qpel_pixels_tab[0][3] = ff_put_h264_qpel16_mc30_lsx; + c->put_h264_qpel_pixels_tab[0][4] = ff_put_h264_qpel16_mc01_lsx; + c->put_h264_qpel_pixels_tab[0][5] = ff_put_h264_qpel16_mc11_lsx; + c->put_h264_qpel_pixels_tab[0][6] = ff_put_h264_qpel16_mc21_lsx; +
c->put_h264_qpel_pixels_tab[0][7] = ff_put_h264_qpel16_mc31_lsx; + c->put_h264_qpel_pixels_tab[0][8] = ff_put_h264_qpel16_mc02_lsx; + c->put_h264_qpel_pixels_tab[0][9] = ff_put_h264_qpel16_mc12_lsx; + c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_lsx; + c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_lsx; + c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_lsx; + c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_lsx; + c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_lsx; + c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_lsx; + + c->avg_h264_qpel_pixels_tab[0][0] = ff_avg_h264_qpel16_mc00_lsx; + c->avg_h264_qpel_pixels_tab[0][1] = ff_avg_h264_qpel16_mc10_lsx; + c->avg_h264_qpel_pixels_tab[0][2] = ff_avg_h264_qpel16_mc20_lsx; + c->avg_h264_qpel_pixels_tab[0][3] = ff_avg_h264_qpel16_mc30_lsx; + c->avg_h264_qpel_pixels_tab[0][4] = ff_avg_h264_qpel16_mc01_lsx; + c->avg_h264_qpel_pixels_tab[0][5] = ff_avg_h264_qpel16_mc11_lsx; + c->avg_h264_qpel_pixels_tab[0][6] = ff_avg_h264_qpel16_mc21_lsx; + c->avg_h264_qpel_pixels_tab[0][7] = ff_avg_h264_qpel16_mc31_lsx; + c->avg_h264_qpel_pixels_tab[0][8] = ff_avg_h264_qpel16_mc02_lsx; + c->avg_h264_qpel_pixels_tab[0][9] = ff_avg_h264_qpel16_mc12_lsx; + c->avg_h264_qpel_pixels_tab[0][10] = ff_avg_h264_qpel16_mc22_lsx; + c->avg_h264_qpel_pixels_tab[0][11] = ff_avg_h264_qpel16_mc32_lsx; + c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_lsx; + c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_lsx; + c->avg_h264_qpel_pixels_tab[0][14] = ff_avg_h264_qpel16_mc23_lsx; + c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_lsx; + + c->put_h264_qpel_pixels_tab[1][0] = ff_put_h264_qpel8_mc00_lsx; + c->put_h264_qpel_pixels_tab[1][1] = ff_put_h264_qpel8_mc10_lsx; + c->put_h264_qpel_pixels_tab[1][2] = ff_put_h264_qpel8_mc20_lsx; + c->put_h264_qpel_pixels_tab[1][3] = ff_put_h264_qpel8_mc30_lsx; + c->put_h264_qpel_pixels_tab[1][4] = 
ff_put_h264_qpel8_mc01_lsx; + c->put_h264_qpel_pixels_tab[1][5] = ff_put_h264_qpel8_mc11_lsx; + c->put_h264_qpel_pixels_tab[1][6] = ff_put_h264_qpel8_mc21_lsx; + c->put_h264_qpel_pixels_tab[1][7] = ff_put_h264_qpel8_mc31_lsx; + c->put_h264_qpel_pixels_tab[1][8] = ff_put_h264_qpel8_mc02_lsx; + c->put_h264_qpel_pixels_tab[1][9] = ff_put_h264_qpel8_mc12_lsx; + c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_lsx; + c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_lsx; + c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_lsx; + c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_lsx; + c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_lsx; + c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_lsx; + + c->avg_h264_qpel_pixels_tab[1][0] = ff_avg_h264_qpel8_mc00_lsx; + c->avg_h264_qpel_pixels_tab[1][1] = ff_avg_h264_qpel8_mc10_lsx; + c->avg_h264_qpel_pixels_tab[1][2] = ff_avg_h264_qpel8_mc20_lsx; + c->avg_h264_qpel_pixels_tab[1][3] = ff_avg_h264_qpel8_mc30_lsx; + c->avg_h264_qpel_pixels_tab[1][5] = ff_avg_h264_qpel8_mc11_lsx; + c->avg_h264_qpel_pixels_tab[1][6] = ff_avg_h264_qpel8_mc21_lsx; + c->avg_h264_qpel_pixels_tab[1][7] = ff_avg_h264_qpel8_mc31_lsx; + c->avg_h264_qpel_pixels_tab[1][8] = ff_avg_h264_qpel8_mc02_lsx; + c->avg_h264_qpel_pixels_tab[1][9] = ff_avg_h264_qpel8_mc12_lsx; + c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_lsx; + c->avg_h264_qpel_pixels_tab[1][11] = ff_avg_h264_qpel8_mc32_lsx; + c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_lsx; + c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_lsx; + c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_lsx; + } + } +#if HAVE_LASX if (have_lasx(cpu_flags)) { if (8 == bit_depth) { c->put_h264_qpel_pixels_tab[0][0] = ff_put_h264_qpel16_mc00_lasx; @@ -95,4 +166,5 @@ av_cold void ff_h264qpel_init_loongarch(H264QpelContext *c, int bit_depth) c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_lasx; } } 
+#endif } diff --git a/libavcodec/loongarch/h264qpel_lasx.c b/libavcodec/loongarch/h264qpel_lasx.c index 1c142e510e..519bb03fe6 100644 --- a/libavcodec/loongarch/h264qpel_lasx.c +++ b/libavcodec/loongarch/h264qpel_lasx.c @@ -21,7 +21,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#include "h264qpel_lasx.h" +#include "h264qpel_loongarch.h" #include "libavutil/loongarch/loongson_intrinsics.h" #include "libavutil/attributes.h" @@ -418,157 +418,6 @@ avg_pixels8_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) ); } -/* avg_pixels8_8_lsx : dst = avg(src, dst) - * put_pixels8_l2_8_lsx: dst = avg(src, half) , half stride is 8. - * avg_pixels8_l2_8_lsx: dst = avg(avg(src, half), dst) , half stride is 8.*/ -static av_always_inline void -put_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src, const uint8_t *half, - ptrdiff_t dstStride, ptrdiff_t srcStride) -{ - ptrdiff_t stride_2, stride_3, stride_4; - __asm__ volatile ( - /* h0~h7 */ - "slli.d %[stride_2], %[srcStride], 1 \n\t" - "add.d %[stride_3], %[stride_2], %[srcStride] \n\t" - "slli.d %[stride_4], %[stride_2], 1 \n\t" - "vld $vr0, %[src], 0 \n\t" - "vldx $vr1, %[src], %[srcStride] \n\t" - "vldx $vr2, %[src], %[stride_2] \n\t" - "vldx $vr3, %[src], %[stride_3] \n\t" - "add.d %[src], %[src], %[stride_4] \n\t" - "vld $vr4, %[src], 0 \n\t" - "vldx $vr5, %[src], %[srcStride] \n\t" - "vldx $vr6, %[src], %[stride_2] \n\t" - "vldx $vr7, %[src], %[stride_3] \n\t" - - "vld $vr8, %[half], 0x00 \n\t" - "vld $vr9, %[half], 0x08 \n\t" - "vld $vr10, %[half], 0x10 \n\t" - "vld $vr11, %[half], 0x18 \n\t" - "vld $vr12, %[half], 0x20 \n\t" - "vld $vr13, %[half], 0x28 \n\t" - "vld $vr14, %[half], 0x30 \n\t" - "vld $vr15, %[half], 0x38 \n\t" - - "vavgr.bu $vr0, $vr8, $vr0 \n\t" - "vavgr.bu $vr1, $vr9, $vr1 \n\t" - "vavgr.bu $vr2, $vr10, $vr2 \n\t" - "vavgr.bu $vr3, $vr11, $vr3 \n\t" - "vavgr.bu $vr4, $vr12, $vr4 \n\t" - "vavgr.bu $vr5, $vr13, $vr5 \n\t" - "vavgr.bu $vr6, $vr14, $vr6 \n\t" - 
"vavgr.bu $vr7, $vr15, $vr7 \n\t" - - "vstelm.d $vr0, %[dst], 0, 0 \n\t" - "add.d %[dst], %[dst], %[dstStride] \n\t" - "vstelm.d $vr1, %[dst], 0, 0 \n\t" - "add.d %[dst], %[dst], %[dstStride] \n\t" - "vstelm.d $vr2, %[dst], 0, 0 \n\t" - "add.d %[dst], %[dst], %[dstStride] \n\t" - "vstelm.d $vr3, %[dst], 0, 0 \n\t" - "add.d %[dst], %[dst], %[dstStride] \n\t" - "vstelm.d $vr4, %[dst], 0, 0 \n\t" - "add.d %[dst], %[dst], %[dstStride] \n\t" - "vstelm.d $vr5, %[dst], 0, 0 \n\t" - "add.d %[dst], %[dst], %[dstStride] \n\t" - "vstelm.d $vr6, %[dst], 0, 0 \n\t" - "add.d %[dst], %[dst], %[dstStride] \n\t" - "vstelm.d $vr7, %[dst], 0, 0 \n\t" - : [dst]"+&r"(dst), [half]"+&r"(half), [src]"+&r"(src), - [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3), - [stride_4]"=&r"(stride_4) - : [srcStride]"r"(srcStride), [dstStride]"r"(dstStride) - : "memory" - ); -} - -/* avg_pixels8_8_lsx : dst = avg(src, dst) - * put_pixels8_l2_8_lsx: dst = avg(src, half) , half stride is 8. - * avg_pixels8_l2_8_lsx: dst = avg(avg(src, half), dst) , half stride is 8.*/ -static av_always_inline void -avg_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src, const uint8_t *half, - ptrdiff_t dstStride, ptrdiff_t srcStride) -{ - uint8_t *tmp = dst; - ptrdiff_t stride_2, stride_3, stride_4; - __asm__ volatile ( - /* h0~h7 */ - "slli.d %[stride_2], %[srcStride], 1 \n\t" - "add.d %[stride_3], %[stride_2], %[srcStride] \n\t" - "slli.d %[stride_4], %[stride_2], 1 \n\t" - "vld $vr0, %[src], 0 \n\t" - "vldx $vr1, %[src], %[srcStride] \n\t" - "vldx $vr2, %[src], %[stride_2] \n\t" - "vldx $vr3, %[src], %[stride_3] \n\t" - "add.d %[src], %[src], %[stride_4] \n\t" - "vld $vr4, %[src], 0 \n\t" - "vldx $vr5, %[src], %[srcStride] \n\t" - "vldx $vr6, %[src], %[stride_2] \n\t" - "vldx $vr7, %[src], %[stride_3] \n\t" - - "vld $vr8, %[half], 0x00 \n\t" - "vld $vr9, %[half], 0x08 \n\t" - "vld $vr10, %[half], 0x10 \n\t" - "vld $vr11, %[half], 0x18 \n\t" - "vld $vr12, %[half], 0x20 \n\t" - "vld $vr13, %[half], 0x28 \n\t" - 
"vld $vr14, %[half], 0x30 \n\t" - "vld $vr15, %[half], 0x38 \n\t" - - "vavgr.bu $vr0, $vr8, $vr0 \n\t" - "vavgr.bu $vr1, $vr9, $vr1 \n\t" - "vavgr.bu $vr2, $vr10, $vr2 \n\t" - "vavgr.bu $vr3, $vr11, $vr3 \n\t" - "vavgr.bu $vr4, $vr12, $vr4 \n\t" - "vavgr.bu $vr5, $vr13, $vr5 \n\t" - "vavgr.bu $vr6, $vr14, $vr6 \n\t" - "vavgr.bu $vr7, $vr15, $vr7 \n\t" - - "slli.d %[stride_2], %[dstStride], 1 \n\t" - "add.d %[stride_3], %[stride_2], %[dstStride] \n\t" - "slli.d %[stride_4], %[stride_2], 1 \n\t" - "vld $vr8, %[tmp], 0 \n\t" - "vldx $vr9, %[tmp], %[dstStride] \n\t" - "vldx $vr10, %[tmp], %[stride_2] \n\t" - "vldx $vr11, %[tmp], %[stride_3] \n\t" - "add.d %[tmp], %[tmp], %[stride_4] \n\t" - "vld $vr12, %[tmp], 0 \n\t" - "vldx $vr13, %[tmp], %[dstStride] \n\t" - "vldx $vr14, %[tmp], %[stride_2] \n\t" - "vldx $vr15, %[tmp], %[stride_3] \n\t" - - "vavgr.bu $vr0, $vr8, $vr0 \n\t" - "vavgr.bu $vr1, $vr9, $vr1 \n\t" - "vavgr.bu $vr2, $vr10, $vr2 \n\t" - "vavgr.bu $vr3, $vr11, $vr3 \n\t" - "vavgr.bu $vr4, $vr12, $vr4 \n\t" - "vavgr.bu $vr5, $vr13, $vr5 \n\t" - "vavgr.bu $vr6, $vr14, $vr6 \n\t" - "vavgr.bu $vr7, $vr15, $vr7 \n\t" - - "vstelm.d $vr0, %[dst], 0, 0 \n\t" - "add.d %[dst], %[dst], %[dstStride] \n\t" - "vstelm.d $vr1, %[dst], 0, 0 \n\t" - "add.d %[dst], %[dst], %[dstStride] \n\t" - "vstelm.d $vr2, %[dst], 0, 0 \n\t" - "add.d %[dst], %[dst], %[dstStride] \n\t" - "vstelm.d $vr3, %[dst], 0, 0 \n\t" - "add.d %[dst], %[dst], %[dstStride] \n\t" - "vstelm.d $vr4, %[dst], 0, 0 \n\t" - "add.d %[dst], %[dst], %[dstStride] \n\t" - "vstelm.d $vr5, %[dst], 0, 0 \n\t" - "add.d %[dst], %[dst], %[dstStride] \n\t" - "vstelm.d $vr6, %[dst], 0, 0 \n\t" - "add.d %[dst], %[dst], %[dstStride] \n\t" - "vstelm.d $vr7, %[dst], 0, 0 \n\t" - : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [half]"+&r"(half), - [src]"+&r"(src), [stride_2]"=&r"(stride_2), - [stride_3]"=&r"(stride_3), [stride_4]"=&r"(stride_4) - : [dstStride]"r"(dstStride), [srcStride]"r"(srcStride) - : "memory" - ); -} - /* 
put_pixels16_8_lsx: dst = src */ static av_always_inline void put_pixels16_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) @@ -729,254 +578,6 @@ avg_pixels16_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) ); } -/* avg_pixels16_8_lsx : dst = avg(src, dst) - * put_pixels16_l2_8_lsx: dst = avg(src, half) , half stride is 8. - * avg_pixels16_l2_8_lsx: dst = avg(avg(src, half), dst) , half stride is 8.*/ -static av_always_inline void -put_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half, - ptrdiff_t dstStride, ptrdiff_t srcStride) -{ - ptrdiff_t stride_2, stride_3, stride_4; - ptrdiff_t dstride_2, dstride_3, dstride_4; - __asm__ volatile ( - "slli.d %[stride_2], %[srcStride], 1 \n\t" - "add.d %[stride_3], %[stride_2], %[srcStride] \n\t" - "slli.d %[stride_4], %[stride_2], 1 \n\t" - "slli.d %[dstride_2], %[dstStride], 1 \n\t" - "add.d %[dstride_3], %[dstride_2], %[dstStride] \n\t" - "slli.d %[dstride_4], %[dstride_2], 1 \n\t" - /* h0~h7 */ - "vld $vr0, %[src], 0 \n\t" - "vldx $vr1, %[src], %[srcStride] \n\t" - "vldx $vr2, %[src], %[stride_2] \n\t" - "vldx $vr3, %[src], %[stride_3] \n\t" - "add.d %[src], %[src], %[stride_4] \n\t" - "vld $vr4, %[src], 0 \n\t" - "vldx $vr5, %[src], %[srcStride] \n\t" - "vldx $vr6, %[src], %[stride_2] \n\t" - "vldx $vr7, %[src], %[stride_3] \n\t" - "add.d %[src], %[src], %[stride_4] \n\t" - - "vld $vr8, %[half], 0x00 \n\t" - "vld $vr9, %[half], 0x10 \n\t" - "vld $vr10, %[half], 0x20 \n\t" - "vld $vr11, %[half], 0x30 \n\t" - "vld $vr12, %[half], 0x40 \n\t" - "vld $vr13, %[half], 0x50 \n\t" - "vld $vr14, %[half], 0x60 \n\t" - "vld $vr15, %[half], 0x70 \n\t" - - "vavgr.bu $vr0, $vr8, $vr0 \n\t" - "vavgr.bu $vr1, $vr9, $vr1 \n\t" - "vavgr.bu $vr2, $vr10, $vr2 \n\t" - "vavgr.bu $vr3, $vr11, $vr3 \n\t" - "vavgr.bu $vr4, $vr12, $vr4 \n\t" - "vavgr.bu $vr5, $vr13, $vr5 \n\t" - "vavgr.bu $vr6, $vr14, $vr6 \n\t" - "vavgr.bu $vr7, $vr15, $vr7 \n\t" - - "vst $vr0, %[dst], 0 \n\t" - "vstx $vr1, %[dst], %[dstStride] 
\n\t" - "vstx $vr2, %[dst], %[dstride_2] \n\t" - "vstx $vr3, %[dst], %[dstride_3] \n\t" - "add.d %[dst], %[dst], %[dstride_4] \n\t" - "vst $vr4, %[dst], 0 \n\t" - "vstx $vr5, %[dst], %[dstStride] \n\t" - "vstx $vr6, %[dst], %[dstride_2] \n\t" - "vstx $vr7, %[dst], %[dstride_3] \n\t" - "add.d %[dst], %[dst], %[dstride_4] \n\t" - - /* h8~h15 */ - "vld $vr0, %[src], 0 \n\t" - "vldx $vr1, %[src], %[srcStride] \n\t" - "vldx $vr2, %[src], %[stride_2] \n\t" - "vldx $vr3, %[src], %[stride_3] \n\t" - "add.d %[src], %[src], %[stride_4] \n\t" - "vld $vr4, %[src], 0 \n\t" - "vldx $vr5, %[src], %[srcStride] \n\t" - "vldx $vr6, %[src], %[stride_2] \n\t" - "vldx $vr7, %[src], %[stride_3] \n\t" - - "vld $vr8, %[half], 0x80 \n\t" - "vld $vr9, %[half], 0x90 \n\t" - "vld $vr10, %[half], 0xa0 \n\t" - "vld $vr11, %[half], 0xb0 \n\t" - "vld $vr12, %[half], 0xc0 \n\t" - "vld $vr13, %[half], 0xd0 \n\t" - "vld $vr14, %[half], 0xe0 \n\t" - "vld $vr15, %[half], 0xf0 \n\t" - - "vavgr.bu $vr0, $vr8, $vr0 \n\t" - "vavgr.bu $vr1, $vr9, $vr1 \n\t" - "vavgr.bu $vr2, $vr10, $vr2 \n\t" - "vavgr.bu $vr3, $vr11, $vr3 \n\t" - "vavgr.bu $vr4, $vr12, $vr4 \n\t" - "vavgr.bu $vr5, $vr13, $vr5 \n\t" - "vavgr.bu $vr6, $vr14, $vr6 \n\t" - "vavgr.bu $vr7, $vr15, $vr7 \n\t" - - "vst $vr0, %[dst], 0 \n\t" - "vstx $vr1, %[dst], %[dstStride] \n\t" - "vstx $vr2, %[dst], %[dstride_2] \n\t" - "vstx $vr3, %[dst], %[dstride_3] \n\t" - "add.d %[dst], %[dst], %[dstride_4] \n\t" - "vst $vr4, %[dst], 0 \n\t" - "vstx $vr5, %[dst], %[dstStride] \n\t" - "vstx $vr6, %[dst], %[dstride_2] \n\t" - "vstx $vr7, %[dst], %[dstride_3] \n\t" - : [dst]"+&r"(dst), [half]"+&r"(half), [src]"+&r"(src), - [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3), - [stride_4]"=&r"(stride_4), [dstride_2]"=&r"(dstride_2), - [dstride_3]"=&r"(dstride_3), [dstride_4]"=&r"(dstride_4) - : [dstStride]"r"(dstStride), [srcStride]"r"(srcStride) - : "memory" - ); -} - -/* avg_pixels16_8_lsx : dst = avg(src, dst) - * put_pixels16_l2_8_lsx: dst = avg(src, 
half) , half stride is 8. - * avg_pixels16_l2_8_lsx: dst = avg(avg(src, half), dst) , half stride is 8.*/ -static av_always_inline void -avg_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half, - ptrdiff_t dstStride, ptrdiff_t srcStride) -{ - uint8_t *tmp = dst; - ptrdiff_t stride_2, stride_3, stride_4; - ptrdiff_t dstride_2, dstride_3, dstride_4; - __asm__ volatile ( - "slli.d %[stride_2], %[srcStride], 1 \n\t" - "add.d %[stride_3], %[stride_2], %[srcStride] \n\t" - "slli.d %[stride_4], %[stride_2], 1 \n\t" - "slli.d %[dstride_2], %[dstStride], 1 \n\t" - "add.d %[dstride_3], %[dstride_2], %[dstStride] \n\t" - "slli.d %[dstride_4], %[dstride_2], 1 \n\t" - /* h0~h7 */ - "vld $vr0, %[src], 0 \n\t" - "vldx $vr1, %[src], %[srcStride] \n\t" - "vldx $vr2, %[src], %[stride_2] \n\t" - "vldx $vr3, %[src], %[stride_3] \n\t" - "add.d %[src], %[src], %[stride_4] \n\t" - "vld $vr4, %[src], 0 \n\t" - "vldx $vr5, %[src], %[srcStride] \n\t" - "vldx $vr6, %[src], %[stride_2] \n\t" - "vldx $vr7, %[src], %[stride_3] \n\t" - "add.d %[src], %[src], %[stride_4] \n\t" - - "vld $vr8, %[half], 0x00 \n\t" - "vld $vr9, %[half], 0x10 \n\t" - "vld $vr10, %[half], 0x20 \n\t" - "vld $vr11, %[half], 0x30 \n\t" - "vld $vr12, %[half], 0x40 \n\t" - "vld $vr13, %[half], 0x50 \n\t" - "vld $vr14, %[half], 0x60 \n\t" - "vld $vr15, %[half], 0x70 \n\t" - - "vavgr.bu $vr0, $vr8, $vr0 \n\t" - "vavgr.bu $vr1, $vr9, $vr1 \n\t" - "vavgr.bu $vr2, $vr10, $vr2 \n\t" - "vavgr.bu $vr3, $vr11, $vr3 \n\t" - "vavgr.bu $vr4, $vr12, $vr4 \n\t" - "vavgr.bu $vr5, $vr13, $vr5 \n\t" - "vavgr.bu $vr6, $vr14, $vr6 \n\t" - "vavgr.bu $vr7, $vr15, $vr7 \n\t" - - "vld $vr8, %[tmp], 0 \n\t" - "vldx $vr9, %[tmp], %[dstStride] \n\t" - "vldx $vr10, %[tmp], %[dstride_2] \n\t" - "vldx $vr11, %[tmp], %[dstride_3] \n\t" - "add.d %[tmp], %[tmp], %[dstride_4] \n\t" - "vld $vr12, %[tmp], 0 \n\t" - "vldx $vr13, %[tmp], %[dstStride] \n\t" - "vldx $vr14, %[tmp], %[dstride_2] \n\t" - "vldx $vr15, %[tmp], %[dstride_3] \n\t" - 
"add.d %[tmp], %[tmp], %[dstride_4] \n\t" - - "vavgr.bu $vr0, $vr8, $vr0 \n\t" - "vavgr.bu $vr1, $vr9, $vr1 \n\t" - "vavgr.bu $vr2, $vr10, $vr2 \n\t" - "vavgr.bu $vr3, $vr11, $vr3 \n\t" - "vavgr.bu $vr4, $vr12, $vr4 \n\t" - "vavgr.bu $vr5, $vr13, $vr5 \n\t" - "vavgr.bu $vr6, $vr14, $vr6 \n\t" - "vavgr.bu $vr7, $vr15, $vr7 \n\t" - - "vst $vr0, %[dst], 0 \n\t" - "vstx $vr1, %[dst], %[dstStride] \n\t" - "vstx $vr2, %[dst], %[dstride_2] \n\t" - "vstx $vr3, %[dst], %[dstride_3] \n\t" - "add.d %[dst], %[dst], %[dstride_4] \n\t" - "vst $vr4, %[dst], 0 \n\t" - "vstx $vr5, %[dst], %[dstStride] \n\t" - "vstx $vr6, %[dst], %[dstride_2] \n\t" - "vstx $vr7, %[dst], %[dstride_3] \n\t" - "add.d %[dst], %[dst], %[dstride_4] \n\t" - - /* h8~h15 */ - "vld $vr0, %[src], 0 \n\t" - "vldx $vr1, %[src], %[srcStride] \n\t" - "vldx $vr2, %[src], %[stride_2] \n\t" - "vldx $vr3, %[src], %[stride_3] \n\t" - "add.d %[src], %[src], %[stride_4] \n\t" - "vld $vr4, %[src], 0 \n\t" - "vldx $vr5, %[src], %[srcStride] \n\t" - "vldx $vr6, %[src], %[stride_2] \n\t" - "vldx $vr7, %[src], %[stride_3] \n\t" - - "vld $vr8, %[half], 0x80 \n\t" - "vld $vr9, %[half], 0x90 \n\t" - "vld $vr10, %[half], 0xa0 \n\t" - "vld $vr11, %[half], 0xb0 \n\t" - "vld $vr12, %[half], 0xc0 \n\t" - "vld $vr13, %[half], 0xd0 \n\t" - "vld $vr14, %[half], 0xe0 \n\t" - "vld $vr15, %[half], 0xf0 \n\t" - - "vavgr.bu $vr0, $vr8, $vr0 \n\t" - "vavgr.bu $vr1, $vr9, $vr1 \n\t" - "vavgr.bu $vr2, $vr10, $vr2 \n\t" - "vavgr.bu $vr3, $vr11, $vr3 \n\t" - "vavgr.bu $vr4, $vr12, $vr4 \n\t" - "vavgr.bu $vr5, $vr13, $vr5 \n\t" - "vavgr.bu $vr6, $vr14, $vr6 \n\t" - "vavgr.bu $vr7, $vr15, $vr7 \n\t" - - "vld $vr8, %[tmp], 0 \n\t" - "vldx $vr9, %[tmp], %[dstStride] \n\t" - "vldx $vr10, %[tmp], %[dstride_2] \n\t" - "vldx $vr11, %[tmp], %[dstride_3] \n\t" - "add.d %[tmp], %[tmp], %[dstride_4] \n\t" - "vld $vr12, %[tmp], 0 \n\t" - "vldx $vr13, %[tmp], %[dstStride] \n\t" - "vldx $vr14, %[tmp], %[dstride_2] \n\t" - "vldx $vr15, %[tmp], %[dstride_3] \n\t" 
- - "vavgr.bu $vr0, $vr8, $vr0 \n\t" - "vavgr.bu $vr1, $vr9, $vr1 \n\t" - "vavgr.bu $vr2, $vr10, $vr2 \n\t" - "vavgr.bu $vr3, $vr11, $vr3 \n\t" - "vavgr.bu $vr4, $vr12, $vr4 \n\t" - "vavgr.bu $vr5, $vr13, $vr5 \n\t" - "vavgr.bu $vr6, $vr14, $vr6 \n\t" - "vavgr.bu $vr7, $vr15, $vr7 \n\t" - - "vst $vr0, %[dst], 0 \n\t" - "vstx $vr1, %[dst], %[dstStride] \n\t" - "vstx $vr2, %[dst], %[dstride_2] \n\t" - "vstx $vr3, %[dst], %[dstride_3] \n\t" - "add.d %[dst], %[dst], %[dstride_4] \n\t" - "vst $vr4, %[dst], 0 \n\t" - "vstx $vr5, %[dst], %[dstStride] \n\t" - "vstx $vr6, %[dst], %[dstride_2] \n\t" - "vstx $vr7, %[dst], %[dstride_3] \n\t" - : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [half]"+&r"(half), [src]"+&r"(src), - [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3), - [stride_4]"=&r"(stride_4), [dstride_2]"=&r"(dstride_2), - [dstride_3]"=&r"(dstride_3), [dstride_4]"=&r"(dstride_4) - : [dstStride]"r"(dstStride), [srcStride]"r"(srcStride) - : "memory" - ); -} - #define QPEL8_H_LOWPASS(out_v) \ src00 = __lasx_xvld(src, - 2); \ src += srcStride; \ diff --git a/libavcodec/loongarch/h264qpel_lasx.h b/libavcodec/loongarch/h264qpel_lasx.h deleted file mode 100644 index 32b6b50917..0000000000 --- a/libavcodec/loongarch/h264qpel_lasx.h +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Copyright (c) 2020 Loongson Technology Corporation Limited - * Contributed by Shiyou Yin - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_LOONGARCH_H264QPEL_LASX_H -#define AVCODEC_LOONGARCH_H264QPEL_LASX_H - -#include -#include -#include "libavcodec/h264.h" - -void ff_h264_h_lpf_luma_inter_lasx(uint8_t *src, int stride, - int alpha, int beta, int8_t *tc0); -void ff_h264_v_lpf_luma_inter_lasx(uint8_t *src, int stride, - int alpha, int beta, int8_t *tc0); -void ff_put_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_put_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_put_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_put_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_put_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_put_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_put_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_put_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_put_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_put_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_put_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_put_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_put_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_put_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_put_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void 
ff_put_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_avg_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_avg_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_avg_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_avg_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_avg_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_avg_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_avg_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_avg_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_avg_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_avg_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_avg_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_avg_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_avg_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_avg_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_avg_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_avg_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); - -void ff_put_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride); -void ff_put_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride); -void ff_put_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride); -void ff_put_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride); -void ff_put_h264_qpel8_mc01_lasx(uint8_t *dst, 
const uint8_t *src, - ptrdiff_t stride); -void ff_put_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride); -void ff_put_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride); -void ff_put_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride); -void ff_put_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride); -void ff_put_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride); -void ff_put_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride); -void ff_put_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride); -void ff_put_h264_qpel8_mc03_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride); -void ff_put_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride); -void ff_put_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride); -void ff_put_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride); -void ff_avg_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_avg_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_avg_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_avg_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_avg_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_avg_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_avg_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_avg_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_avg_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_avg_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_avg_h264_qpel8_mc32_lasx(uint8_t *dst, 
const uint8_t *src, - ptrdiff_t dst_stride); -void ff_avg_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_avg_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -void ff_avg_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src, - ptrdiff_t dst_stride); -#endif // #ifndef AVCODEC_LOONGARCH_H264QPEL_LASX_H diff --git a/libavcodec/loongarch/h264qpel_loongarch.h b/libavcodec/loongarch/h264qpel_loongarch.h new file mode 100644 index 0000000000..68232730da --- /dev/null +++ b/libavcodec/loongarch/h264qpel_loongarch.h @@ -0,0 +1,312 @@ +/* + * Copyright (c) 2023 Loongson Technology Corporation Limited + * Contributed by Shiyou Yin + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_LOONGARCH_H264QPEL_LOONGARCH_H +#define AVCODEC_LOONGARCH_H264QPEL_LOONGARCH_H + +#include +#include +#include "libavcodec/h264.h" +#include "config.h" + +void put_h264_qpel8_hv_lowpass_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dstStride, ptrdiff_t srcStride); +void put_h264_qpel8_h_lowpass_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dstStride, ptrdiff_t srcStride); +void put_h264_qpel8_v_lowpass_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dstStride, ptrdiff_t srcStride); +void put_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half, + ptrdiff_t dstStride, ptrdiff_t srcStride); +void put_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src, const uint8_t *half, + ptrdiff_t dstStride, ptrdiff_t srcStride); + +void avg_h264_qpel8_h_lowpass_lsx(uint8_t *dst, const uint8_t *src, int dstStride, + int srcStride); +void avg_h264_qpel8_v_lowpass_lsx(uint8_t *dst, uint8_t *src, int dstStride, + int srcStride); +void avg_pixels16_l2_8_lsx(uint8_t *dst, const uint8_t *src, uint8_t *half, + ptrdiff_t dstStride, ptrdiff_t srcStride); +void avg_h264_qpel8_hv_lowpass_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dstStride, ptrdiff_t srcStride); +void avg_pixels8_l2_8_lsx(uint8_t *dst, const uint8_t *src, const uint8_t *half, + ptrdiff_t dstStride, ptrdiff_t srcStride); + +void ff_put_h264_qpel16_mc00_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc10_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc20_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc30_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc01_lsx(uint8_t *dst, const uint8_t *src, + 
ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc11_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc13_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc31_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc33_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc03_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc02_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc22_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc21_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_put_h264_qpel16_mc12_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_put_h264_qpel16_mc32_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_put_h264_qpel16_mc23_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); + +void ff_avg_h264_qpel16_mc00_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc10_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc30_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc33_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc11_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc31_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc13_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc20_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc02_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_avg_h264_qpel16_mc03_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_avg_h264_qpel16_mc23_lsx(uint8_t 
*dst, const uint8_t *src, + ptrdiff_t stride); +void ff_avg_h264_qpel16_mc21_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_avg_h264_qpel16_mc01_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_avg_h264_qpel16_mc32_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_avg_h264_qpel16_mc12_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_avg_h264_qpel16_mc22_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); + +void ff_put_h264_qpel8_mc03_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_put_h264_qpel8_mc00_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_put_h264_qpel8_mc01_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_put_h264_qpel8_mc30_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_put_h264_qpel8_mc10_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_put_h264_qpel8_mc33_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_put_h264_qpel8_mc13_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_put_h264_qpel8_mc31_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_put_h264_qpel8_mc11_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_put_h264_qpel8_mc32_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_put_h264_qpel8_mc21_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_put_h264_qpel8_mc23_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_put_h264_qpel8_mc12_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_put_h264_qpel8_mc02_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_put_h264_qpel8_mc22_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_put_h264_qpel8_mc20_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); + +void ff_avg_h264_qpel8_mc00_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void 
ff_avg_h264_qpel8_mc10_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_avg_h264_qpel8_mc20_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_avg_h264_qpel8_mc30_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_avg_h264_qpel8_mc11_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_avg_h264_qpel8_mc21_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_avg_h264_qpel8_mc31_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_avg_h264_qpel8_mc02_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_avg_h264_qpel8_mc12_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_avg_h264_qpel8_mc22_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_avg_h264_qpel8_mc32_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_avg_h264_qpel8_mc13_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_avg_h264_qpel8_mc23_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_avg_h264_qpel8_mc33_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); + +#if HAVE_LASX +void ff_h264_h_lpf_luma_inter_lasx(uint8_t *src, int stride, + int alpha, int beta, int8_t *tc0); +void ff_h264_v_lpf_luma_inter_lasx(uint8_t *src, int stride, + int alpha, int beta, int8_t *tc0); +void ff_put_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t 
dst_stride); +void ff_put_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_put_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void 
ff_avg_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); + +void ff_put_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_put_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_put_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_put_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_put_h264_qpel8_mc01_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_put_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_put_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_put_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_put_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_put_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_put_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_put_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_put_h264_qpel8_mc03_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_put_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_put_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_put_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride); +void ff_avg_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void 
ff_avg_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +void ff_avg_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dst_stride); +#endif + +#endif // #ifndef AVCODEC_LOONGARCH_H264QPEL_LOONGARCH_H diff --git a/libavcodec/loongarch/h264qpel_lsx.c b/libavcodec/loongarch/h264qpel_lsx.c new file mode 100644 index 0000000000..12b3bae6d1 --- /dev/null +++ b/libavcodec/loongarch/h264qpel_lsx.c @@ -0,0 +1,487 @@ +/* + * Loongson LSX optimized h264qpel + * + * Copyright (c) 2023 Loongson Technology Corporation Limited + * Contributed by Hecai Yuan + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "h264qpel_loongarch.h" +#include "libavutil/loongarch/loongson_intrinsics.h" +#include "libavutil/attributes.h" + +static void put_h264_qpel16_hv_lowpass_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dstStride, ptrdiff_t srcStride) +{ + put_h264_qpel8_hv_lowpass_lsx(dst, src, dstStride, srcStride); + put_h264_qpel8_hv_lowpass_lsx(dst + 8, src + 8, dstStride, srcStride); + src += srcStride << 3; + dst += dstStride << 3; + put_h264_qpel8_hv_lowpass_lsx(dst, src, dstStride, srcStride); + put_h264_qpel8_hv_lowpass_lsx(dst + 8, src + 8, dstStride, srcStride); +} + +void ff_put_h264_qpel16_mc22_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + put_h264_qpel16_hv_lowpass_lsx(dst, src, stride, stride); +} + +static void put_h264_qpel16_h_lowpass_lsx(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride) +{ + put_h264_qpel8_h_lowpass_lsx(dst, src, dstStride, srcStride); + put_h264_qpel8_h_lowpass_lsx(dst+8, src+8, dstStride, srcStride); + src += srcStride << 3; + dst += dstStride << 3; + put_h264_qpel8_h_lowpass_lsx(dst, src, dstStride, srcStride); + put_h264_qpel8_h_lowpass_lsx(dst+8, src+8, dstStride, srcStride); +} + +static void put_h264_qpel16_v_lowpass_lsx(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride) +{ + put_h264_qpel8_v_lowpass_lsx(dst, (uint8_t*)src, dstStride, srcStride); + put_h264_qpel8_v_lowpass_lsx(dst+8, (uint8_t*)src+8, dstStride, srcStride); + src += 8*srcStride; + dst += 8*dstStride; + put_h264_qpel8_v_lowpass_lsx(dst, 
(uint8_t*)src, dstStride, srcStride); + put_h264_qpel8_v_lowpass_lsx(dst+8, (uint8_t*)src+8, dstStride, srcStride); +} + +void ff_put_h264_qpel16_mc21_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t temp[512]; + uint8_t *const halfH = temp; + uint8_t *const halfHV = temp + 256; + + put_h264_qpel16_h_lowpass_lsx(halfH, src, 16, stride); + put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride); + put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16); +} + +void ff_put_h264_qpel16_mc12_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t temp[512]; + uint8_t *const halfHV = temp; + uint8_t *const halfH = temp + 256; + + put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride); + put_h264_qpel16_v_lowpass_lsx(halfH, src, 16, stride); + put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16); +} + +void ff_put_h264_qpel16_mc32_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t temp[512]; + uint8_t *const halfHV = temp; + uint8_t *const halfH = temp + 256; + + put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride); + put_h264_qpel16_v_lowpass_lsx(halfH, src + 1, 16, stride); + put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16); +} + +void ff_put_h264_qpel16_mc23_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t temp[512]; + uint8_t *const halfH = temp; + uint8_t *const halfHV = temp + 256; + + put_h264_qpel16_h_lowpass_lsx(halfH, src + stride, 16, stride); + put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride); + put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16); +} + +static void avg_h264_qpel16_v_lowpass_lsx(uint8_t *dst, const uint8_t *src, + int dstStride, int srcStride) +{ + avg_h264_qpel8_v_lowpass_lsx(dst, (uint8_t*)src, dstStride, srcStride); + avg_h264_qpel8_v_lowpass_lsx(dst+8, (uint8_t*)src+8, dstStride, srcStride); + src += 8*srcStride; + dst += 8*dstStride; + avg_h264_qpel8_v_lowpass_lsx(dst, (uint8_t*)src, dstStride, srcStride); + avg_h264_qpel8_v_lowpass_lsx(dst+8, 
(uint8_t*)src+8, dstStride, srcStride); +} + +void ff_avg_h264_qpel16_mc02_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avg_h264_qpel16_v_lowpass_lsx(dst, src, stride, stride); +} + +void ff_avg_h264_qpel16_mc03_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t half[256]; + + put_h264_qpel16_v_lowpass_lsx(half, src, 16, stride); + avg_pixels16_l2_8_lsx(dst, src + stride, half, stride, stride); +} + +void ff_avg_h264_qpel16_mc23_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t temp[512]; + uint8_t *const halfH = temp; + uint8_t *const halfHV = temp + 256; + + put_h264_qpel16_h_lowpass_lsx(halfH, src + stride, 16, stride); + put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride); + avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16); +} + +void ff_avg_h264_qpel16_mc21_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t temp[512]; + uint8_t *const halfH = temp; + uint8_t *const halfHV = temp + 256; + + put_h264_qpel16_h_lowpass_lsx(halfH, src, 16, stride); + put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride); + avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16); +} + +void ff_avg_h264_qpel16_mc01_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t half[256]; + + put_h264_qpel16_v_lowpass_lsx(half, src, 16, stride); + avg_pixels16_l2_8_lsx(dst, src, half, stride, stride); +} + +void ff_avg_h264_qpel16_mc32_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t temp[512]; + uint8_t *const halfHV = temp; + uint8_t *const halfH = temp + 256; + + put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 16, stride); + put_h264_qpel16_v_lowpass_lsx(halfH, src + 1, 16, stride); + avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16); +} + +void ff_avg_h264_qpel16_mc12_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t temp[512]; + uint8_t *const halfHV = temp; + uint8_t *const halfH = temp + 256; + + put_h264_qpel16_hv_lowpass_lsx(halfHV, src, 
16, stride); + put_h264_qpel16_v_lowpass_lsx(halfH, src, 16, stride); + avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16); +} + +static void avg_h264_qpel16_hv_lowpass_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t dstStride, ptrdiff_t srcStride) +{ + avg_h264_qpel8_hv_lowpass_lsx(dst, src, dstStride, srcStride); + avg_h264_qpel8_hv_lowpass_lsx(dst + 8, src + 8, dstStride, srcStride); + src += srcStride << 3; + dst += dstStride << 3; + avg_h264_qpel8_hv_lowpass_lsx(dst, src, dstStride, srcStride); + avg_h264_qpel8_hv_lowpass_lsx(dst + 8, src + 8, dstStride, srcStride); +} + +void ff_avg_h264_qpel16_mc22_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avg_h264_qpel16_hv_lowpass_lsx(dst, src, stride, stride); +} + +void ff_put_h264_qpel8_mc03_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t half[64]; + + put_h264_qpel8_v_lowpass_lsx(half, (uint8_t*)src, 8, stride); + put_pixels8_l2_8_lsx(dst, src + stride, half, stride, stride); +} + +void ff_put_h264_qpel8_mc01_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t half[64]; + + put_h264_qpel8_v_lowpass_lsx(half, (uint8_t*)src, 8, stride); + put_pixels8_l2_8_lsx(dst, src, half, stride, stride); +} + +void ff_put_h264_qpel8_mc30_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t half[64]; + + put_h264_qpel8_h_lowpass_lsx(half, src, 8, stride); + put_pixels8_l2_8_lsx(dst, src+1, half, stride, stride); +} + +void ff_put_h264_qpel8_mc10_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t half[64]; + + put_h264_qpel8_h_lowpass_lsx(half, src, 8, stride); + put_pixels8_l2_8_lsx(dst, src, half, stride, stride); +} + +void ff_put_h264_qpel8_mc33_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t halfH[64]; + uint8_t halfV[64]; + + put_h264_qpel8_h_lowpass_lsx(halfH, src + stride, 8, stride); + put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src + 1, 8, stride); + put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 
8); +} + +void ff_put_h264_qpel8_mc13_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t halfH[64]; + uint8_t halfV[64]; + + put_h264_qpel8_h_lowpass_lsx(halfH, src + stride, 8, stride); + put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src, 8, stride); + put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8); +} + +void ff_put_h264_qpel8_mc31_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t halfH[64]; + uint8_t halfV[64]; + + put_h264_qpel8_h_lowpass_lsx(halfH, src, 8, stride); + put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src + 1, 8, stride); + put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8); +} + +void ff_put_h264_qpel8_mc11_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t halfH[64]; + uint8_t halfV[64]; + + put_h264_qpel8_h_lowpass_lsx(halfH, src, 8, stride); + put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src, 8, stride); + put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8); +} + +void ff_put_h264_qpel8_mc32_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t temp[128]; + uint8_t *const halfHV = temp; + uint8_t *const halfH = temp + 64; + + put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride); + put_h264_qpel8_v_lowpass_lsx(halfH, (uint8_t*)src + 1, 8, stride); + put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8); +} + +void ff_put_h264_qpel8_mc21_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t temp[128]; + uint8_t *const halfH = temp; + uint8_t *const halfHV = temp + 64; + + put_h264_qpel8_h_lowpass_lsx(halfH, src, 8, stride); + put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride); + put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8); +} + +void ff_put_h264_qpel8_mc23_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t temp[128]; + uint8_t *const halfH = temp; + uint8_t *const halfHV = temp + 64; + + put_h264_qpel8_h_lowpass_lsx(halfH, src + stride, 8, stride); + put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride); + 
put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8); +} + +void ff_put_h264_qpel8_mc12_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t temp[128]; + uint8_t *const halfHV = temp; + uint8_t *const halfH = temp + 64; + + put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride); + put_h264_qpel8_v_lowpass_lsx(halfH, (uint8_t*)src, 8, stride); + put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8); +} + +void ff_put_h264_qpel8_mc02_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + put_h264_qpel8_v_lowpass_lsx(dst, (uint8_t*)src, stride, stride); +} + +void ff_put_h264_qpel8_mc22_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + put_h264_qpel8_hv_lowpass_lsx(dst, src, stride, stride); +} + +void ff_put_h264_qpel8_mc20_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + put_h264_qpel8_h_lowpass_lsx(dst, src, stride, stride); +} + +void ff_avg_h264_qpel8_mc10_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t half[64]; + + put_h264_qpel8_h_lowpass_lsx(half, src, 8, stride); + avg_pixels8_l2_8_lsx(dst, src, half, stride, stride); +} + +void ff_avg_h264_qpel8_mc20_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avg_h264_qpel8_h_lowpass_lsx(dst, src, stride, stride); +} + +void ff_avg_h264_qpel8_mc30_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t half[64]; + + put_h264_qpel8_h_lowpass_lsx(half, src, 8, stride); + avg_pixels8_l2_8_lsx(dst, src+1, half, stride, stride); +} + +void ff_avg_h264_qpel8_mc11_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t halfH[64]; + uint8_t halfV[64]; + + put_h264_qpel8_h_lowpass_lsx(halfH, src, 8, stride); + put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src, 8, stride); + avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8); +} + +void ff_avg_h264_qpel8_mc21_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t temp[128]; + uint8_t *const halfH = temp; + uint8_t *const halfHV = temp + 64; + + 
put_h264_qpel8_h_lowpass_lsx(halfH, src, 8, stride); + put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride); + avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8); +} + +void ff_avg_h264_qpel8_mc31_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t halfH[64]; + uint8_t halfV[64]; + + put_h264_qpel8_h_lowpass_lsx(halfH, src, 8, stride); + put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src + 1, 8, stride); + avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8); +} + +void ff_avg_h264_qpel8_mc02_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avg_h264_qpel8_v_lowpass_lsx(dst, (uint8_t*)src, stride, stride); +} + +void ff_avg_h264_qpel8_mc12_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t temp[128]; + uint8_t *const halfHV = temp; + uint8_t *const halfH = temp + 64; + + put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride); + put_h264_qpel8_v_lowpass_lsx(halfH, (uint8_t*)src, 8, stride); + avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8); +} + +void ff_avg_h264_qpel8_mc22_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + avg_h264_qpel8_hv_lowpass_lsx(dst, src, stride, stride); +} + +void ff_avg_h264_qpel8_mc32_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t temp[128]; + uint8_t *const halfHV = temp; + uint8_t *const halfH = temp + 64; + + put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride); + put_h264_qpel8_v_lowpass_lsx(halfH, (uint8_t*)src + 1, 8, stride); + avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8); +} + +void ff_avg_h264_qpel8_mc13_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t halfH[64]; + uint8_t halfV[64]; + + put_h264_qpel8_h_lowpass_lsx(halfH, src + stride, 8, stride); + put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src, 8, stride); + avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8); +} + +void ff_avg_h264_qpel8_mc23_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t temp[128]; + uint8_t *const halfH = temp; + 
uint8_t *const halfHV = temp + 64; + + put_h264_qpel8_h_lowpass_lsx(halfH, src + stride, 8, stride); + put_h264_qpel8_hv_lowpass_lsx(halfHV, src, 8, stride); + avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8); +} + +void ff_avg_h264_qpel8_mc33_lsx(uint8_t *dst, const uint8_t *src, + ptrdiff_t stride) +{ + uint8_t halfH[64]; + uint8_t halfV[64]; + + put_h264_qpel8_h_lowpass_lsx(halfH, src + stride, 8, stride); + put_h264_qpel8_v_lowpass_lsx(halfV, (uint8_t*)src + 1, 8, stride); + avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8); +}