mirror of
https://github.com/FFmpeg/FFmpeg.git
synced 2025-02-12 13:06:36 +00:00
mmx implementation of 3-point GMC. (5x faster than C)
Originally committed as revision 5265 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
841f65f25a
commit
703c8195a8
@ -1144,7 +1144,7 @@ static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y
|
||||
}
|
||||
}
|
||||
|
||||
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
|
||||
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
|
||||
int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
|
||||
{
|
||||
int y, vx, vy;
|
||||
@ -3865,7 +3865,7 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
|
||||
c->add_pixels8 = add_pixels8_c;
|
||||
c->add_pixels4 = add_pixels4_c;
|
||||
c->gmc1 = gmc1_c;
|
||||
c->gmc = gmc_c;
|
||||
c->gmc = ff_gmc_c;
|
||||
c->clear_blocks = clear_blocks_c;
|
||||
c->pix_sum = pix_sum_c;
|
||||
c->pix_norm1 = pix_norm1_c;
|
||||
|
@ -82,6 +82,9 @@ void ff_shrink22(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, i
|
||||
void ff_shrink44(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
|
||||
void ff_shrink88(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
|
||||
|
||||
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
|
||||
int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height);
|
||||
|
||||
/* minimum alignment rules ;)
|
||||
if u notice errors in the align stuff, need more alignment for some asm code for some cpu
|
||||
or need to use a function with less aligned data then send a mail to the ffmpeg-dev list, ...
|
||||
|
@ -2403,6 +2403,126 @@ static void just_return() { return; }
|
||||
c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
|
||||
c->avg_ ## postfix1 = avg_ ## postfix2;
|
||||
|
||||
/**
 * MMX implementation of global motion compensation for one block that is
 * 8 pixels wide and h rows high.
 *
 * Each output pixel is a bilinear blend of the four source pixels around a
 * position that advances by (dxx, dyx) per column and (dxy, dyy) per row,
 * starting from (ox, oy). The positions carry 16+shift fractional bits.
 * The blended sum gets r added and is shifted right by 2*shift before being
 * packed to bytes with unsigned saturation.
 *
 * The MMX path assumes the integer part of the source position is the same
 * for every pixel of the block, so only the sub-pel fraction changes. Blocks
 * that do not satisfy this, or whose deltas do not fit after dropping their
 * low 4 bits, are handed to ff_gmc_c().
 *
 * @param dst     destination block
 * @param src     source plane (origin of the plane, not of the block)
 * @param stride  line size of both dst and src
 * @param h       number of rows to produce
 * @param ox,oy   position of the top-left sample, 16+shift fractional bits
 * @param dxx,dyx position increment per output column
 * @param dxy,dyy position increment per output row
 * @param shift   number of sub-pel bits of the interpolation weights
 * @param r       rounding constant added before the final shift
 * @param width   width limit used for the edge test / edge emulation
 * @param height  height limit used for the edge test / edge emulation
 */
static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
    const int w    = 8;
    const int s    = 1<<shift;
    const int ix   = ox>>(16+shift);
    const int iy   = oy>>(16+shift);
    /* drop 4 low bits so that the fractional position fits 16-bit MMX words */
    const int oxs  = ox>>4;
    const int oys  = oy>>4;
    const int dxxs = dxx>>4;
    const int dxys = dxy>>4;
    const int dyxs = dyx>>4;
    const int dyys = dyy>>4;
    const uint16_t r4[4]   = {r,r,r,r};
    const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
    const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
    const uint64_t shift2  = 2*shift;
    uint8_t edge_buf[(h+1)*stride];
    int x, y;

    /* drift of the position, relative to a plain 1:1 step, at the far corners */
    const int dxw = (dxx-(1<<(16+shift)))*(w-1);
    const int dyh = (dyy-(1<<(16+shift)))*(h-1);
    const int dxh = dxy*(h-1);
    const int dyw = dyx*(w-1);

    if( // non-constant fullpel offset (3% of blocks)
        (((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) |
          (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift))
        // uses more than 16 bits of subpel mv (only at huge resolution)
        || ((dxx|dxy|dyx|dyy)&15) )
    {
        //FIXME could still use mmx for some of the rows
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
        return;
    }

    /* The blend reads (w+1) x (h+1) source samples starting at (ix, iy).
     * width < w and height < h are tested explicitly: in those cases
     * width-w / height-h is negative and would turn into a huge bound in the
     * unsigned comparison, silently skipping the edge emulation. */
    if( width  < w || (unsigned)ix >= width-w ||
        height < h || (unsigned)iy >= height-h )
    {
        ff_emulated_edge_mc(edge_buf, src+ix+iy*stride, stride, w+1, h+1, ix, iy, width, height);
        src = edge_buf;
    }
    else
        src += ix + iy*stride;

    /* the block is processed as two columns of 4 pixels */
    for(x=0; x<w; x+=4){
        /* per-pixel fractional positions, one row above the first row; the
         * row increment is added at the top of every row iteration */
        uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
                            oxs - dxys + dxxs*(x+1),
                            oxs - dxys + dxxs*(x+2),
                            oxs - dxys + dxxs*(x+3) };
        uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
                            oys - dyys + dyxs*(x+1),
                            oys - dyys + dyxs*(x+2),
                            oys - dyys + dyxs*(x+3) };

        /* mm6 = s in all four words, mm7 = 0; both are used by the row loop.
         * The operand must be a register: "g" would also permit an
         * immediate, for which movd has no encoding. */
        __asm__ volatile(
            "movd         %0, %%mm6 \n\t"
            "pxor      %%mm7, %%mm7 \n\t"
            "punpcklwd %%mm6, %%mm6 \n\t"
            "punpcklwd %%mm6, %%mm6 \n\t"
            :: "r"(s)
        );

        for(y=0; y<h; y++){
            /* advance the positions by one row and extract the top 4 bits of
             * each word as dx (mm4) and dy (mm5).
             * NOTE(review): the fixed shift by 12 yields a 4-bit fraction,
             * which appears to match s only for shift == 4 - confirm against
             * the callers. */
            __asm__ volatile(
                "movq      %0, %%mm4 \n\t"
                "movq      %1, %%mm5 \n\t"
                "paddw     %2, %%mm4 \n\t"
                "paddw     %3, %%mm5 \n\t"
                "movq   %%mm4, %0    \n\t"
                "movq   %%mm5, %1    \n\t"
                "psrlw    $12, %%mm4 \n\t"
                "psrlw    $12, %%mm5 \n\t"
                : "+m"(*dx4), "+m"(*dy4)
                : "m"(*dxy4), "m"(*dyy4)
            );

            /* relies on mm4..mm7 left behind by the statements above */
            __asm__ volatile(
                "movq   %%mm6, %%mm2 \n\t"
                "movq   %%mm6, %%mm1 \n\t"
                "psubw  %%mm4, %%mm2 \n\t"
                "psubw  %%mm5, %%mm1 \n\t"
                "movq   %%mm2, %%mm0 \n\t"
                "movq   %%mm4, %%mm3 \n\t"
                "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
                "pmullw %%mm5, %%mm3 \n\t" // dx*dy
                "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
                "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)

                "movd      %4, %%mm5 \n\t"
                "movd      %3, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
                "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy

                "movd      %2, %%mm5 \n\t"
                "movd      %1, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
                "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
                "paddw  %%mm3, %%mm2 \n\t"
                "paddw  %%mm1, %%mm0 \n\t"
                "paddw  %%mm2, %%mm0 \n\t"
                "paddw     %5, %%mm0 \n\t" // + rounding constant

                "psrlw     %6, %%mm0 \n\t" // >> 2*shift
                "packuswb %%mm0, %%mm0 \n\t"
                "movd   %%mm0, %0    \n\t"

                : "=m"(dst[x+y*stride])
                : "m"(src[0]), "m"(src[1]),
                  "m"(src[stride]), "m"(src[stride+1]),
                  "m"(*r4), "m"(shift2)
            );
            src += stride;
        }
        /* back to the first row, 4 pixels to the right */
        src += 4-h*stride;
    }
}
|
||||
|
||||
static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
|
||||
long i=0;
|
||||
|
||||
@ -2725,6 +2845,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
|
||||
c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
|
||||
c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
|
||||
|
||||
c->gmc= gmc_mmx;
|
||||
|
||||
c->add_bytes= add_bytes_mmx;
|
||||
#ifdef CONFIG_ENCODERS
|
||||
c->diff_bytes= diff_bytes_mmx;
|
||||
|
Loading…
Reference in New Issue
Block a user