lavc/h263dsp: R-V V {h,v}_loop_filter

Since the horizontal and vertical filters are identical except for a transposition, this uses a common subprocedure with an ad-hoc ABI. To preserve return-address stack prediction, a link register has to be used (cf. the "Control Transfer Instructions" from the RISC-V ISA Manual). The alternate/temporary link register T0 is used here, so that the normal RA is preserved (something Arm cannot do!). To load the strength value based on `qscale`, the shortest possible and PIC-compatible sequence is used: AUIPC; ADD; LBU. The classic LLA; ADD; LBU sequence would add one more instruction since LLA is a convenience alias for AUIPC; ADDI. To ensure that this trick works, relocation relaxation is disabled. To implement the two signed divisions by a power of two toward zero: (x / (1 << SHIFT)), the code relies on the small range of integers involved, computing: (x + (x >> (16 - SHIFT))) >> SHIFT rather than the more general: (x + ((x >> (16 - 1)) & ((1 << SHIFT) - 1))) >> SHIFT. Thus one ANDI instruction is avoided. T-Head C908: h263dsp.h_loop_filter_c: 228.2 h263dsp.h_loop_filter_rvv_i32: 144.0 h263dsp.v_loop_filter_c: 242.7 h263dsp.v_loop_filter_rvv_i32: 114.0 (C is probably worse in real use due to less predictable branches.)
2024-12-29 06:45:47 +00:00 · 2024-05-19 10:03:29 +03:00 · 2024-05-19 10:03:29 +03:00 · 910d281b21
commit 910d281b21
parent 3d1597d3e2
5 changed files with 147 additions and 1 deletions
--- a/libavcodec/h263dsp.c
+++ b/libavcodec/h263dsp.c
@ -119,7 +119,9 @@ av_cold void ff_h263dsp_init(H263DSPContext *ctx)
    ctx->h263_h_loop_filter = h263_h_loop_filter_c;
    ctx->h263_v_loop_filter = h263_v_loop_filter_c;

-#if ARCH_X86
+#if ARCH_RISCV
+    ff_h263dsp_init_riscv(ctx);
+#elif ARCH_X86
    ff_h263dsp_init_x86(ctx);
 #elif ARCH_MIPS
    ff_h263dsp_init_mips(ctx);
--- a/libavcodec/h263dsp.h
+++ b/libavcodec/h263dsp.h
@ -29,6 +29,7 @@ typedef struct H263DSPContext {
 } H263DSPContext;

 void ff_h263dsp_init(H263DSPContext *ctx);
+void ff_h263dsp_init_riscv(H263DSPContext *ctx);
 void ff_h263dsp_init_x86(H263DSPContext *ctx);
 void ff_h263dsp_init_mips(H263DSPContext *ctx);

--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@ -26,6 +26,8 @@ OBJS-$(CONFIG_G722DSP) += riscv/g722dsp_init.o
 RVV-OBJS-$(CONFIG_G722DSP) += riscv/g722dsp_rvv.o
 OBJS-$(CONFIG_JPEG2000_DECODER) += riscv/jpeg2000dsp_init.o
 RVV-OBJS-$(CONFIG_JPEG2000_DECODER) += riscv/jpeg2000dsp_rvv.o
+OBJS-$(CONFIG_H263DSP) += riscv/h263dsp_init.o
+RVV-OBJS-$(CONFIG_H263DSP) += riscv/h263dsp_rvv.o
 OBJS-$(CONFIG_H264CHROMA) += riscv/h264_chroma_init_riscv.o
 RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o
 OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_init.o
--- a/libavcodec/riscv/h263dsp_init.c
+++ b/libavcodec/riscv/h263dsp_init.c
@ -0,0 +1,41 @@
+/*
+ * Copyright © 2022 Rémi Denis-Courmont.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/riscv/cpu.h"
+#include "libavcodec/h263dsp.h"
+
+void ff_h263_h_loop_filter_rvv(uint8_t *src, int stride, int q);
+void ff_h263_v_loop_filter_rvv(uint8_t *src, int stride, int q);
+
+av_cold void ff_h263dsp_init_riscv(H263DSPContext *c)
+{
+#if HAVE_RVV
+    /* The RVV filters run 8 elements at e8/mf2, hence the 128-bit VLEN floor. */
+    const int cpu_flags = av_get_cpu_flags();
+    if (!(cpu_flags & AV_CPU_FLAG_RVV_I32) || !ff_rv_vlen_least(128))
+        return;
+    c->h263_h_loop_filter = ff_h263_h_loop_filter_rvv;
+    c->h263_v_loop_filter = ff_h263_v_loop_filter_rvv;
+#endif
+}
--- a/libavcodec/riscv/h263dsp_rvv.S
+++ b/libavcodec/riscv/h263dsp_rvv.S
@ -0,0 +1,100 @@
+/*
+ * Copyright © 2024 Rémi Denis-Courmont.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+        .option push
+        .option norelax                # keep the AUIPC at 2: and its %pcrel_lo user exactly as written
+func ff_h263_h_loop_filter_rvv, zve32x
+        addi        a0, a0, -2         # a0 = src - 2: address of p0 in the first row
+        vsetivli    zero, 8, e8, mf2, ta, ma   # 8 rows, one byte per element
+        vlsseg4e8.v v8, (a0), a1       # v8..v11 = p0..p3 of each row, rows are a1 (stride) bytes apart
+        jal         t0, 1f             # call the shared filter core, linking in t0 so that ra is untouched
+        vssseg4e8.v v8, (a0), a1       # write the filtered p0..p3 back to the same positions
+        ret
+1:                                     # shared core: v8..v11 = p0..p3 in and out, a2 = q, return address in t0
+        csrwi       vxrm, 0            # set vxrm to 0 ahead of the vnclipu.wi below
+2:      auipc       t1, %pcrel_hi(ff_h263_loop_filter_strength) # high part of the PC-relative table address
+        vwsubu.vv   v14, v10, v9       # p2 - p1
+        add         t1, t1, a2         # add the index q before the low part of the address is applied
+        vwsubu.vv   v12, v8, v11       # p0 - p3
+        vsetvli     zero, zero, e16, m1, ta, mu # same vl, 16-bit lanes; mu is needed by the masked vneg.v below
+        vsll.vi     v14, v14, 2        # 4 * (p2 - p1)
+        lbu         t1, %pcrel_lo(2b)(t1) # strength
+        vadd.vv     v16, v12, v14      # p0 - p3 + 4 * (p2 - p1)
+        # Divide by 8 toward 0. v16 lies in [-1275, 1275] (signed 12-bit) at this point.
+        vsrl.vi     v18, v16, 16 - 3   # v18 = (v16 < 0) ? 7 : 0
+        slli        t2, t1, 1          # 2 * strength
+        vadd.vv     v16, v16, v18
+        # v16 (d) lies in [-159, 159], i.e. it fits in a signed 9-bit value.
+        vsra.vi     v16, v16, 3        # d
+        vmv.v.x     v20, t2            # 2 * strength in every lane
+        vmslt.vi    v0, v16, 0         # mask: d < 0
+        vneg.v      v18, v16           # -d
+        vneg.v      v20, v20, v0.t     # sign(d) * 2 * strength
+        vmax.vv     v18, v16, v18      # |d|
+        vsub.vv     v20, v20, v16      # d1 if strength <= |d| <= 2 * strength
+        vmsge.vx    v0, v18, t2        # mask: |d| >= 2 * strength
+        vsrl.vi     v14, v12, 16 - 2   # v14 = (v12 < 0) ? 3 : 0
+        vmerge.vxm  v20, v20, zero, v0 # d1 if strength <= |d|
+        vadd.vv     v12, v12, v14      # bias negative values so that the shift rounds toward 0
+        vmsge.vx    v0, v18, t1        # mask: |d| >= strength
+        vsra.vi     v12, v12, 2        # (p0 - p3) / 4
+        vmerge.vvm  v16, v16, v20, v0  # d1
+        vzext.vf2   v24, v8     # p0 as u16 (because vwrsubu.wv does not exist)
+        vneg.v      v14, v16           # -d1
+        vzext.vf2   v26, v9            # p1 as u16
+        vmax.vv     v14, v16, v14      # |d1|
+        vzext.vf2   v28, v10           # p2 as u16
+        vsra.vi     v14, v14, 1        # ad1
+        vadd.vv     v26, v26, v16      # p1 + d1
+        vneg.v      v18, v14           # -ad1
+        vmin.vv     v12, v12, v14      # min((p0 - p3) / 4, ad1)
+        vsub.vv     v28, v28, v16      # p2 - d1
+        vmax.vv     v12, v12, v18      # d2
+        vmax.vx     v26, v26, zero     # clamp p1 + d1 at 0 from below
+        vsub.vv     v24, v24, v12      # p0 - d2
+        vmax.vx     v28, v28, zero     # clamp p2 - d1 at 0 from below
+        vsetvli     zero, zero, e8, mf2, ta, ma # back to 8-bit lanes for the narrowing steps
+        vwaddu.wv   v30, v12, v11      # p3 + d2
+        vncvt.x.x.w v8, v24            # new p0: low 8 bits of p0 - d2
+        vnclipu.wi  v9, v26, 0         # new p1: unsigned saturation to 8 bits
+        vnclipu.wi  v10, v28, 0        # new p2: unsigned saturation to 8 bits
+        vncvt.x.x.w v11, v30           # new p3: low 8 bits of p3 + d2
+        jr          t0                 # return through the alternate link register
+endfunc
+        .option pop
+
+func ff_h263_v_loop_filter_rvv, zve32x
+        sub         a4, a0, a1         # a4 = src - stride (row of p1)
+        vsetivli    zero, 8, e8, mf2, ta, ma   # 8 columns, one byte per element
+        vle8.v      v10, (a0)          # p2 = row at src
+        sub         a3, a4, a1         # a3 = src - 2 * stride (row of p0)
+        vle8.v      v9, (a4)           # p1
+        add         a5, a0, a1         # a5 = src + stride (row of p3)
+        vle8.v      v8, (a3)           # p0
+        vle8.v      v11, (a5)          # p3
+        jal         t0, 1b             # call the shared filter core at the preceding 1: label, linking in t0
+        vse8.v      v8, (a3)           # store the filtered p0
+        vse8.v      v9, (a4)           # store the filtered p1
+        vse8.v      v10, (a0)          # store the filtered p2
+        vse8.v      v11, (a5)          # store the filtered p3
+        ret
+endfunc