ffmpeg-2.1.1: move directory

git-svn-id: svn://kolibrios.org@6148 a494cfbc-eb01-0410-851d-a64ba20cac60
Sergey Semyonov (Serge)
2016-02-05 22:14:10 +00:00
parent a4b787f4b8
commit ecf3e862ea
4011 changed files with 1868 additions and 4 deletions


@@ -0,0 +1,19 @@
include $(SUBDIR)../config.mak

NAME = swscale
FFLIBS = avutil

HEADERS = swscale.h \
          version.h \

OBJS = input.o \
       options.o \
       output.o \
       rgb2rgb.o \
       swscale.o \
       swscale_unscaled.o \
       utils.o \
       yuv2rgb.o \

TESTPROGS = colorspace \
            swscale \


@@ -0,0 +1,3 @@
OBJS += bfin/internal_bfin.o \
        bfin/swscale_bfin.o  \
        bfin/yuv2rgb_bfin.o  \


@@ -0,0 +1,613 @@
/*
* Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
* April 20, 2007
*
* Blackfin video color space converter operations
* convert I420 YV12 to RGB in various formats
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
  YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock
  and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits, packed into shorts.

  The following calculation is used for the conversion:

    r = clipz((y - oy) * cy + crv * (v - 128))
    g = clipz((y - oy) * cy + cgv * (v - 128) + cgu * (u - 128))
    b = clipz((y - oy) * cy + cbu * (u - 128))

  y, u, v are prescaled by a factor of 4, i.e. left-shifted to gain precision.

  New factorization to eliminate the truncation error which was
  occurring due to the byteop3p:

  1) Use byteop16m to subtract quad bytes; since this operates on U8 data,
     the offsets need to be renormalized to 8 bits.
  2) Scale operands up by a factor of 4, not 8, because Blackfin
     multiplies include a shift.
  3) Compute into the accumulators cy * yx0, cy * yx1.
  4) Compute each of the linear equations:

       r = clipz((y - oy) * cy + crv * (v - 128))
       g = clipz((y - oy) * cy + cgv * (v - 128) + cgu * (u - 128))
       b = clipz((y - oy) * cy + cbu * (u - 128))

     Reuse of the accumulators requires that we actually multiply
     twice, once with an addition and the second time with a subtraction.
     Because of this we need to compute the equations in the order R, B,
     then G, saving the writes for B in the case of 24/32-bit color
     formats.

  API: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
                     int dW, uint32_t *coeffs);

         A            B
        ---          ---
    i2 = cb      i3 = cr
    i1 = coeff   i0 = y

  where coeffs has the following layout in memory (the gmask word follows
  cgv, giving the 11 words of COEFF_LEN):

    uint32_t oy, oc, zero, cy, crv, rmask, cbu, bmask, cgu, cgv, gmask;

  coeffs is a pointer to oy.

  The {rgb} masks are only utilized by the 565 packing algorithm. Note that
  data replication is used to simplify the internal algorithms for the
  dual-MAC architecture of Blackfin.

  All routines are exported with an _ff_bfin_ symbol prefix.

  Rough performance gain compared against -O3:

    2779809/1484290 = 187.28%

  which translates to ~33 c/pel to ~57 c/pel for the reference vs ~17.5 c/pel
  for the optimized implementations. Not sure why there is such a huge
  variation in the reference code on Blackfin; I guess it must have to do
  with the memory system.
*/
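/* For reference, a minimal scalar model of the conversion described above
 * (a sketch using the usual BT.601 integer coefficients; the real code below
 * instead keeps y/u/v prescaled by 4 in fract16 registers and takes its
 * channel placement from the rmask/gmask/bmask words it is handed):
 *
 *   static inline int clipz(int x) { return x < 0 ? 0 : x > 255 ? 255 : x; }
 *
 *   static uint16_t yuv_to_rgb565(int y, int u, int v)
 *   {
 *       int c = y - 16, d = u - 128, e = v - 128;
 *       int r = clipz((298 * c + 409 * e + 128) >> 8);
 *       int g = clipz((298 * c - 100 * d - 208 * e + 128) >> 8);
 *       int b = clipz((298 * c + 516 * d + 128) >> 8);
 *       return (uint16_t)((r >> 3) << 11 | (g >> 2) << 5 | (b >> 3));
 *   }
 */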
#define mL3 .text
#if defined(__FDPIC__) && CONFIG_SRAM
#define mL1 .l1.text
#else
#define mL1 mL3
#endif
#define MEM mL1
#define DEFUN(fname,where,interface) \
.section where; \
.global _ff_bfin_ ## fname; \
.type _ff_bfin_ ## fname, STT_FUNC; \
.align 8; \
_ff_bfin_ ## fname
#define DEFUN_END(fname) \
.size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname
.text
#define COEFF_LEN 11*4
#define COEFF_REL_CY_OFF 4*4
#define ARG_OUT 20
#define ARG_W 24
#define ARG_COEFF 28
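/* The coefficient block walked by i1 below can be pictured as the following
 * C struct (order inferred from the loads in the loops; COEFF_LEN = 11 words,
 * and l1 = COEFF_LEN turns i1 into a circular pointer over it):
 *
 *   struct yuv2rgb_coeffs {
 *       uint32_t oy, oc, zero, cy, crv, rmask, cbu, bmask, cgu, cgv, gmask;
 *   };
 */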
DEFUN(yuv2rgb565_line,MEM,
(uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
link 0;
[--sp] = (r7:4);
p1 = [fp+ARG_OUT];
r3 = [fp+ARG_W];
i0 = r0;
i2 = r1;
i3 = r2;
r0 = [fp+ARG_COEFF];
i1 = r0;
b1 = i1;
l1 = COEFF_LEN;
m0 = COEFF_REL_CY_OFF;
p0 = r3;
r0 = [i0++]; // 4Y
r1.l = w[i2++]; // 2u
r1.h = w[i3++]; // 2v
p0 = p0>>2;
lsetup (.L0565, .L1565) lc0 = p0;
/*
   uint32_t oy, oc, zero, cy, crv, rmask, cbu, bmask, cgu, cgv (+ gmask)

   r0 -- used to load 4 ys
   r1 -- used to load 2 us, 2 vs
   r4 -- y3, y2
   r5 -- y1, y0
   r6 -- u1, u0
   r7 -- v1, v0
*/
r2=[i1++]; // oy
.L0565:
/*
   rrrrrrrr gggggggg bbbbbbbb
   5432109876543210

   bbbbb    >> 3
   gggggggg << 3
   rrrrrrrr << 8

   rrrrrggggggbbbbb
*/
(r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc
(r7,r6) = byteop16m (r1:0, r3:2) (r);
r5 = r5 << 2 (v); // y1,y0
r4 = r4 << 2 (v); // y3,y2
r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
/* Y' = y*cy */
a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv
/* R = Y+ crv*(Cr-128) */
r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
r2 = r2 >> 3 (v);
r3 = r2 & r5;
/* B = Y+ cbu*(Cb-128) */
r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
r2 = r2 << 8 (v);
r2 = r2 & r5;
r3 = r3 | r2;
/* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask
r2 = r2 << 3 (v);
r2 = r2 & r5;
r3 = r3 | r2;
[p1++]=r3 || r1=[i1++]; // cy
/* Y' = y*cy */
a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv
/* R = Y+ crv*(Cr-128) */
r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
r2 = r2 >> 3 (v);
r3 = r2 & r5;
/* B = Y+ cbu*(Cb-128) */
r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
r2 = r2 << 8 (v);
r2 = r2 & r5;
r3 = r3 | r2;
/* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
r2 = byteop3p(r3:2, r1:0)(LO) || r0 = [i0++]; // 4Y
r2 = r2 << 3 (v) || r1.l = w[i2++]; // 2u
r2 = r2 & r5;
r3 = r3 | r2;
[p1++]=r3 || r1.h = w[i3++]; // 2v
.L1565: r2=[i1++]; // oy
l1 = 0;
(r7:4) = [sp++];
unlink;
rts;
DEFUN_END(yuv2rgb565_line)
DEFUN(yuv2rgb555_line,MEM,
(uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
link 0;
[--sp] = (r7:4);
p1 = [fp+ARG_OUT];
r3 = [fp+ARG_W];
i0 = r0;
i2 = r1;
i3 = r2;
r0 = [fp+ARG_COEFF];
i1 = r0;
b1 = i1;
l1 = COEFF_LEN;
m0 = COEFF_REL_CY_OFF;
p0 = r3;
r0 = [i0++]; // 4Y
r1.l = w[i2++]; // 2u
r1.h = w[i3++]; // 2v
p0 = p0>>2;
lsetup (.L0555, .L1555) lc0 = p0;
/*
   uint32_t oy, oc, zero, cy, crv, rmask, cbu, bmask, cgu, cgv (+ gmask)

   r0 -- used to load 4 ys
   r1 -- used to load 2 us, 2 vs
   r4 -- y3, y2
   r5 -- y1, y0
   r6 -- u1, u0
   r7 -- v1, v0
*/
r2=[i1++]; // oy
.L0555:
/*
   rrrrrrrr gggggggg bbbbbbbb
   5432109876543210

   bbbbb    >> 3
   gggggggg << 2
   rrrrrrrr << 7

   xrrrrrgggggbbbbb
*/
(r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc
(r7,r6) = byteop16m (r1:0, r3:2) (r);
r5 = r5 << 2 (v); // y1,y0
r4 = r4 << 2 (v); // y3,y2
r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
/* Y' = y*cy */
a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv
/* R = Y+ crv*(Cr-128) */
r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
r2 = r2 >> 3 (v);
r3 = r2 & r5;
/* B = Y+ cbu*(Cb-128) */
r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
r2 = r2 << 7 (v);
r2 = r2 & r5;
r3 = r3 | r2;
/* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask
r2 = r2 << 2 (v);
r2 = r2 & r5;
r3 = r3 | r2;
[p1++]=r3 || r1=[i1++]; // cy
/* Y' = y*cy */
a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv
/* R = Y+ crv*(Cr-128) */
r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
r2 = r2 >> 3 (v);
r3 = r2 & r5;
/* B = Y+ cbu*(Cb-128) */
r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
r2 = r2 << 7 (v);
r2 = r2 & r5;
r3 = r3 | r2;
/* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
r2 = byteop3p(r3:2, r1:0)(LO) || r0=[i0++]; // 4Y
r2 = r2 << 2 (v) || r1.l=w[i2++]; // 2u
r2 = r2 & r5;
r3 = r3 | r2;
[p1++]=r3 || r1.h=w[i3++]; // 2v
.L1555: r2=[i1++]; // oy
l1 = 0;
(r7:4) = [sp++];
unlink;
rts;
DEFUN_END(yuv2rgb555_line)
DEFUN(yuv2rgb24_line,MEM,
(uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
link 0;
[--sp] = (r7:4);
p1 = [fp+ARG_OUT];
r3 = [fp+ARG_W];
p2 = p1;
p2 += 3;
i0 = r0;
i2 = r1;
i3 = r2;
r0 = [fp+ARG_COEFF]; // coeff buffer
i1 = r0;
b1 = i1;
l1 = COEFF_LEN;
m0 = COEFF_REL_CY_OFF;
p0 = r3;
r0 = [i0++]; // 4Y
r1.l = w[i2++]; // 2u
r1.h = w[i3++]; // 2v
p0 = p0>>2;
lsetup (.L0888, .L1888) lc0 = p0;
/*
   uint32_t oy, oc, zero, cy, crv, rmask, cbu, bmask, cgu, cgv (+ gmask)

   r0 -- used to load 4 ys
   r1 -- used to load 2 us, 2 vs
   r4 -- y3, y2
   r5 -- y1, y0
   r6 -- u1, u0
   r7 -- v1, v0
*/
r2=[i1++]; // oy
.L0888:
(r4,r5) = byteop16m (r1:0, r3:2) || r3=[i1++]; // oc
(r7,r6) = byteop16m (r1:0, r3:2) (r);
r5 = r5 << 2 (v); // y1,y0
r4 = r4 << 2 (v); // y3,y2
r6 = r6 << 2 (v) || r0=[i1++]; // u1,u0, r0=zero
r7 = r7 << 2 (v) || r1=[i1++]; // v1,v0 r1=cy
/* Y' = y*cy */
a1 = r1.h*r5.h, a0 = r1.l*r5.l || r1=[i1++]; // crv
/* R = Y+ crv*(Cr-128) */
r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
a1 -= r1.h*r7.l, a0 -= r1.l*r7.l || r5=[i1++]; // rmask
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
r2=r2>>16 || B[p1++]=r2;
B[p2++]=r2;
/* B = Y+ cbu*(Cb-128) */
r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
a1 -= r1.h*r6.l, a0 -= r1.l*r6.l || r5=[i1++]; // bmask
r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
/* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
a1 += r1.h*r6.l, a0 += r1.l*r6.l || r1=[i1++]; // cgv
r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++m0]; // gmask, oy,cy,zero
r2=r2>>16 || B[p1++]=r2;
B[p2++]=r2;
r3=r3>>16 || B[p1++]=r3;
B[p2++]=r3 || r1=[i1++]; // cy
p1+=3;
p2+=3;
/* Y' = y*cy */
a1 = r1.h*r4.h, a0 = r1.l*r4.l || r1=[i1++]; // crv
/* R = Y+ crv*(Cr-128) */
r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
a1 -= r1.h*r7.h, a0 -= r1.l*r7.h || r5=[i1++]; // rmask
r2 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cbu
r2=r2>>16 || B[p1++]=r2;
B[p2++]=r2;
/* B = Y+ cbu*(Cb-128) */
r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
a1 -= r1.h*r6.h, a0 -= r1.l*r6.h || r5=[i1++]; // bmask
r3 = byteop3p(r3:2, r1:0)(LO) || r1=[i1++]; // cgu
/* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
a1 += r1.h*r6.h, a0 += r1.l*r6.h || r1=[i1++]; // cgv
r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
r2 = byteop3p(r3:2, r1:0)(LO) || r5=[i1++]; // gmask
r2=r2>>16 || B[p1++]=r2 || r0 = [i0++]; // 4y
B[p2++]=r2 || r1.l = w[i2++]; // 2u
r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++]; // 2v
B[p2++]=r3 || r2=[i1++]; // oy
p1+=3;
.L1888: p2+=3;
l1 = 0;
(r7:4) = [sp++];
unlink;
rts;
DEFUN_END(yuv2rgb24_line)
#define ARG_vdst 20
#define ARG_width 24
#define ARG_height 28
#define ARG_lumStride 32
#define ARG_chromStride 36
#define ARG_srcStride 40
DEFUN(uyvytoyv12, mL3, (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
int width, int height,
int lumStride, int chromStride, int srcStride)):
link 0;
[--sp] = (r7:4,p5:4);
p0 = r1; // Y top even
i2 = r2; // *u
r2 = [fp + ARG_vdst];
i3 = r2; // *v
r1 = [fp + ARG_srcStride];
r2 = r0 + r1;
i0 = r0; // uyvy_T even
i1 = r2; // uyvy_B odd
p2 = [fp + ARG_lumStride];
p1 = p0 + p2; // Y bot odd
p5 = [fp + ARG_width];
p4 = [fp + ARG_height];
r0 = p5;
p4 = p4 >> 1;
p5 = p5 >> 2;
r2 = r0 << 1;
r1 = r1 << 1;
r1 = r1 - r2; // srcStride + (srcStride - 2*width)
r1 += -8; // i0,i1 is pre read need to correct
m0 = r1;
r2 = [fp + ARG_chromStride];
r0 = r0 >> 1;
r2 = r2 - r0;
m1 = r2;
/* I0,I1 - src input line pointers
* p0,p1 - luma output line pointers
* I2 - dstU
* I3 - dstV
*/
lsetup (0f, 1f) lc1 = p4; // H/2
0: r0 = [i0++] || r2 = [i1++];
r1 = [i0++] || r3 = [i1++];
r4 = byteop1p(r1:0, r3:2);
r5 = byteop1p(r1:0, r3:2) (r);
lsetup (2f, 3f) lc0 = p5; // W/4
2: r0 = r0 >> 8(v);
r1 = r1 >> 8(v);
r2 = r2 >> 8(v);
r3 = r3 >> 8(v);
r0 = bytepack(r0, r1);
r2 = bytepack(r2, r3) || [p0++] = r0; // yyyy
r6 = pack(r5.l, r4.l) || [p1++] = r2; // yyyy
r7 = pack(r5.h, r4.h) || r0 = [i0++] || r2 = [i1++];
r6 = bytepack(r6, r7) || r1 = [i0++] || r3 = [i1++];
r4 = byteop1p(r1:0, r3:2) || w[i2++] = r6.l; // uu
3: r5 = byteop1p(r1:0, r3:2) (r) || w[i3++] = r6.h; // vv
i0 += m0;
i1 += m0;
i2 += m1;
i3 += m1;
p0 = p0 + p2;
1: p1 = p1 + p2;
(r7:4,p5:4) = [sp++];
unlink;
rts;
DEFUN_END(uyvytoyv12)
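/* Plain-C model of the routine above (a sketch; luma is copied straight
 * through, while chroma is averaged across the two source rows as byteop1p
 * does, whose exact rounding the +1 below only approximates). For
 * yuyvtoyv12 further down, only the byte positions change
 * (YUYV: Y0 U0 Y1 V0 instead of UYVY: U0 Y0 V0 Y1):
 *
 *   for (int y = 0; y < height; y += 2)
 *       for (int x = 0; x < width; x += 2) {
 *           const uint8_t *t = src + y * srcStride + 2 * x;   // top row
 *           const uint8_t *b = t + srcStride;                 // bottom row
 *           ydst[y * lumStride + x]           = t[1];
 *           ydst[y * lumStride + x + 1]       = t[3];
 *           ydst[(y + 1) * lumStride + x]     = b[1];
 *           ydst[(y + 1) * lumStride + x + 1] = b[3];
 *           udst[(y / 2) * chromStride + x / 2] = (t[0] + b[0] + 1) >> 1;
 *           vdst[(y / 2) * chromStride + x / 2] = (t[2] + b[2] + 1) >> 1;
 *       }
 */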
DEFUN(yuyvtoyv12, mL3, (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
int width, int height,
int lumStride, int chromStride, int srcStride)):
link 0;
[--sp] = (r7:4,p5:4);
p0 = r1; // Y top even
i2 = r2; // *u
r2 = [fp + ARG_vdst];
i3 = r2; // *v
r1 = [fp + ARG_srcStride];
r2 = r0 + r1;
i0 = r0; // uyvy_T even
i1 = r2; // uyvy_B odd
p2 = [fp + ARG_lumStride];
p1 = p0 + p2; // Y bot odd
p5 = [fp + ARG_width];
p4 = [fp + ARG_height];
r0 = p5;
p4 = p4 >> 1;
p5 = p5 >> 2;
r2 = r0 << 1;
r1 = r1 << 1;
r1 = r1 - r2; // srcStride + (srcStride - 2*width)
r1 += -8; // i0,i1 is pre read need to correct
m0 = r1;
r2 = [fp + ARG_chromStride];
r0 = r0 >> 1;
r2 = r2 - r0;
m1 = r2;
/* I0,I1 - src input line pointers
* p0,p1 - luma output line pointers
* I2 - dstU
* I3 - dstV
*/
lsetup (0f, 1f) lc1 = p4; // H/2
0: r0 = [i0++] || r2 = [i1++];
r1 = [i0++] || r3 = [i1++];
r4 = bytepack(r0, r1);
r5 = bytepack(r2, r3);
lsetup (2f, 3f) lc0 = p5; // W/4
2: r0 = r0 >> 8(v) || [p0++] = r4; // yyyy-even
r1 = r1 >> 8(v) || [p1++] = r5; // yyyy-odd
r2 = r2 >> 8(v);
r3 = r3 >> 8(v);
r4 = byteop1p(r1:0, r3:2);
r5 = byteop1p(r1:0, r3:2) (r);
r6 = pack(r5.l, r4.l);
r7 = pack(r5.h, r4.h) || r0 = [i0++] || r2 = [i1++];
r6 = bytepack(r6, r7) || r1 = [i0++] || r3 = [i1++];
r4 = bytepack(r0, r1) || w[i2++] = r6.l; // uu
3: r5 = bytepack(r2, r3) || w[i3++] = r6.h; // vv
i0 += m0;
i1 += m0;
i2 += m1;
i3 += m1;
p0 = p0 + p2;
1: p1 = p1 + p2;
(r7:4,p5:4) = [sp++];
unlink;
rts;
DEFUN_END(yuyvtoyv12)


@@ -0,0 +1,87 @@
/*
* Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
*
* Blackfin software video scaler operations
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libswscale/swscale_internal.h"
#if defined (__FDPIC__) && CONFIG_SRAM
#define L1CODE __attribute__((l1_text))
#else
#define L1CODE
#endif
int ff_bfin_uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
uint8_t *vdst, int width, int height,
int lumStride, int chromStride, int srcStride) L1CODE;
int ff_bfin_yuyvtoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
uint8_t *vdst, int width, int height,
int lumStride, int chromStride, int srcStride) L1CODE;
static int uyvytoyv12_unscaled(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
uint8_t *dsty = dst[0] + dstStride[0] * srcSliceY;
uint8_t *dstu = dst[1] + dstStride[1] * srcSliceY / 2;
uint8_t *dstv = dst[2] + dstStride[2] * srcSliceY / 2;
const uint8_t *ip = src[0] + srcStride[0] * srcSliceY;
int w = dstStride[0];
ff_bfin_uyvytoyv12(ip, dsty, dstu, dstv, w, srcSliceH,
dstStride[0], dstStride[1], srcStride[0]);
return srcSliceH;
}
static int yuyvtoyv12_unscaled(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
uint8_t *dsty = dst[0] + dstStride[0] * srcSliceY;
uint8_t *dstu = dst[1] + dstStride[1] * srcSliceY / 2;
uint8_t *dstv = dst[2] + dstStride[2] * srcSliceY / 2;
const uint8_t *ip = src[0] + srcStride[0] * srcSliceY;
int w = dstStride[0];
ff_bfin_yuyvtoyv12(ip, dsty, dstu, dstv, w, srcSliceH,
dstStride[0], dstStride[1], srcStride[0]);
return srcSliceH;
}
av_cold void ff_get_unscaled_swscale_bfin(SwsContext *c)
{
if (c->dstFormat == AV_PIX_FMT_YUV420P && c->srcFormat == AV_PIX_FMT_UYVY422) {
av_log(NULL, AV_LOG_VERBOSE,
"selecting Blackfin optimized uyvytoyv12_unscaled\n");
c->swscale = uyvytoyv12_unscaled;
}
if (c->dstFormat == AV_PIX_FMT_YUV420P && c->srcFormat == AV_PIX_FMT_YUYV422) {
av_log(NULL, AV_LOG_VERBOSE,
"selecting Blackfin optimized yuyvtoyv12_unscaled\n");
c->swscale = yuyvtoyv12_unscaled;
}
}
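/* Usage sketch (hypothetical caller, not part of this file): once
 * ff_get_unscaled_swscale_bfin() has installed a fast path, it is reached
 * through the normal public API, e.g. (buffer setup omitted):
 *
 *   struct SwsContext *sws = sws_getContext(w, h, AV_PIX_FMT_UYVY422,
 *                                           w, h, AV_PIX_FMT_YUV420P,
 *                                           SWS_BILINEAR, NULL, NULL, NULL);
 *   sws_scale(sws, src, srcStride, 0, h, dst, dstStride);
 *   sws_freeContext(sws);
 */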


@@ -0,0 +1,203 @@
/*
* Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
*
* Blackfin video color space converter operations
* convert I420 YV12 to RGB in various formats
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/pixdesc.h"
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libswscale/swscale_internal.h"
#if defined(__FDPIC__) && CONFIG_SRAM
#define L1CODE __attribute__((l1_text))
#else
#define L1CODE
#endif
void ff_bfin_yuv2rgb555_line(const uint8_t *Y, const uint8_t *U,
const uint8_t *V, uint8_t *out,
int w, uint32_t *coeffs) L1CODE;
void ff_bfin_yuv2rgb565_line(const uint8_t *Y, const uint8_t *U,
const uint8_t *V, uint8_t *out,
int w, uint32_t *coeffs) L1CODE;
void ff_bfin_yuv2rgb24_line(const uint8_t *Y, const uint8_t *U,
const uint8_t *V, uint8_t *out,
int w, uint32_t *coeffs) L1CODE;
typedef void (*ltransform)(const uint8_t *Y, const uint8_t *U, const uint8_t *V,
uint8_t *out, int w, uint32_t *coeffs);
static void bfin_prepare_coefficients(SwsContext *c, int rgb, int masks)
{
int oy;
oy = c->yOffset & 0xffff;
oy = oy >> 3; // keep everything U8.0 for offset calculation
c->oc = 128 * 0x01010101U;
c->oy = oy * 0x01010101U;
/* copy 64bit vector coeffs down to 32bit vector coeffs */
c->cy = c->yCoeff;
c->zero = 0;
if (rgb) {
c->crv = c->vrCoeff;
c->cbu = c->ubCoeff;
c->cgu = c->ugCoeff;
c->cgv = c->vgCoeff;
} else {
c->crv = c->ubCoeff;
c->cbu = c->vrCoeff;
c->cgu = c->vgCoeff;
c->cgv = c->ugCoeff;
}
if (masks == 555) {
c->rmask = 0x001f * 0x00010001U;
c->gmask = 0x03e0 * 0x00010001U;
c->bmask = 0x7c00 * 0x00010001U;
} else if (masks == 565) {
c->rmask = 0x001f * 0x00010001U;
c->gmask = 0x07e0 * 0x00010001U;
c->bmask = 0xf800 * 0x00010001U;
}
}
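/* The multiplications above are plain lane replication, so that both
 * Blackfin MACs see the same constant, e.g.:
 *
 *   128    * 0x01010101U == 0x80808080   // 128 in every byte lane
 *   0x07e0 * 0x00010001U == 0x07e007e0   // green mask in both halfword lanes
 */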
static int core_yuv420_rgb(SwsContext *c, const uint8_t **in, int *instrides,
int srcSliceY, int srcSliceH, uint8_t **oplanes,
int *outstrides, ltransform lcscf,
int rgb, int masks)
{
const uint8_t *py, *pu, *pv;
uint8_t *op;
int w = instrides[0];
int h2 = srcSliceH >> 1;
int i;
bfin_prepare_coefficients(c, rgb, masks);
py = in[0];
pu = in[1 + (1 ^ rgb)];
pv = in[1 + (0 ^ rgb)];
op = oplanes[0] + srcSliceY * outstrides[0];
for (i = 0; i < h2; i++) {
lcscf(py, pu, pv, op, w, &c->oy);
py += instrides[0];
op += outstrides[0];
lcscf(py, pu, pv, op, w, &c->oy);
py += instrides[0];
pu += instrides[1];
pv += instrides[2];
op += outstrides[0];
}
return srcSliceH;
}
static int bfin_yuv420_rgb555(SwsContext *c, const uint8_t **in, int *instrides,
int srcSliceY, int srcSliceH,
uint8_t **oplanes, int *outstrides)
{
return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH, oplanes,
outstrides, ff_bfin_yuv2rgb555_line, 1, 555);
}
static int bfin_yuv420_bgr555(SwsContext *c, const uint8_t **in, int *instrides,
int srcSliceY, int srcSliceH,
uint8_t **oplanes, int *outstrides)
{
return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH, oplanes,
outstrides, ff_bfin_yuv2rgb555_line, 0, 555);
}
static int bfin_yuv420_rgb24(SwsContext *c, const uint8_t **in, int *instrides,
int srcSliceY, int srcSliceH,
uint8_t **oplanes, int *outstrides)
{
return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH, oplanes,
outstrides, ff_bfin_yuv2rgb24_line, 1, 888);
}
static int bfin_yuv420_bgr24(SwsContext *c, const uint8_t **in, int *instrides,
int srcSliceY, int srcSliceH,
uint8_t **oplanes, int *outstrides)
{
return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH, oplanes,
outstrides, ff_bfin_yuv2rgb24_line, 0, 888);
}
static int bfin_yuv420_rgb565(SwsContext *c, const uint8_t **in, int *instrides,
int srcSliceY, int srcSliceH,
uint8_t **oplanes, int *outstrides)
{
return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH, oplanes,
outstrides, ff_bfin_yuv2rgb565_line, 1, 565);
}
static int bfin_yuv420_bgr565(SwsContext *c, const uint8_t **in, int *instrides,
int srcSliceY, int srcSliceH,
uint8_t **oplanes, int *outstrides)
{
return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH, oplanes,
outstrides, ff_bfin_yuv2rgb565_line, 0, 565);
}
av_cold SwsFunc ff_yuv2rgb_init_bfin(SwsContext *c)
{
SwsFunc f;
switch (c->dstFormat) {
case AV_PIX_FMT_RGB555:
f = bfin_yuv420_rgb555;
break;
case AV_PIX_FMT_BGR555:
f = bfin_yuv420_bgr555;
break;
case AV_PIX_FMT_RGB565:
f = bfin_yuv420_rgb565;
break;
case AV_PIX_FMT_BGR565:
f = bfin_yuv420_bgr565;
break;
case AV_PIX_FMT_RGB24:
f = bfin_yuv420_rgb24;
break;
case AV_PIX_FMT_BGR24:
f = bfin_yuv420_bgr24;
break;
default:
return 0;
}
av_log(c, AV_LOG_INFO, "BlackFin accelerated color space converter %s\n",
av_get_pix_fmt_name(c->dstFormat));
return f;
}


@@ -0,0 +1,170 @@
/*
* Copyright (C) 2002 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdio.h>
#include <string.h> /* for memset() */
#include <stdlib.h>
#include <inttypes.h>
#include "swscale.h"
#include "rgb2rgb.h"
#include "libavutil/mem.h"
#define SIZE 1000
#define srcByte 0x55
#define dstByte 0xBB
#define FUNC(s, d, n) { s, d, #n, n }
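/* For example, FUNC(2, 3, rgb15to24) expands to the initializer
 *
 *   { 2, 3, "rgb15to24", rgb15to24 }
 *
 * so every table entry carries its source/destination bytes-per-pixel and
 * its own name for the failure messages below. */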
int main(int argc, char **argv)
{
int i, funcNum;
uint8_t *srcBuffer = av_malloc(SIZE);
uint8_t *dstBuffer = av_malloc(SIZE);
int failedNum = 0;
int passedNum = 0;
if (!srcBuffer || !dstBuffer)
return -1;
av_log(NULL, AV_LOG_INFO, "memory corruption test ...\n");
sws_rgb2rgb_init();
for (funcNum = 0; ; funcNum++) {
struct func_info_s {
int src_bpp;
int dst_bpp;
const char *name;
void (*func)(const uint8_t *src, uint8_t *dst, int src_size);
} func_info[] = {
FUNC(2, 2, rgb12to15),
FUNC(2, 2, rgb15to16),
FUNC(2, 3, rgb15to24),
FUNC(2, 4, rgb15to32),
FUNC(2, 3, rgb16to24),
FUNC(2, 4, rgb16to32),
FUNC(3, 2, rgb24to15),
FUNC(3, 2, rgb24to16),
FUNC(3, 4, rgb24to32),
FUNC(4, 2, rgb32to15),
FUNC(4, 2, rgb32to16),
FUNC(4, 3, rgb32to24),
FUNC(2, 2, rgb16to15),
FUNC(2, 2, rgb12tobgr12),
FUNC(2, 2, rgb15tobgr15),
FUNC(2, 2, rgb15tobgr16),
FUNC(2, 3, rgb15tobgr24),
FUNC(2, 4, rgb15tobgr32),
FUNC(2, 2, rgb16tobgr15),
FUNC(2, 2, rgb16tobgr16),
FUNC(2, 3, rgb16tobgr24),
FUNC(2, 4, rgb16tobgr32),
FUNC(3, 2, rgb24tobgr15),
FUNC(3, 2, rgb24tobgr16),
FUNC(3, 3, rgb24tobgr24),
FUNC(3, 4, rgb24tobgr32),
FUNC(4, 2, rgb32tobgr15),
FUNC(4, 2, rgb32tobgr16),
FUNC(4, 3, rgb32tobgr24),
FUNC(4, 4, shuffle_bytes_2103), /* rgb32tobgr32 */
FUNC(6, 6, rgb48tobgr48_nobswap),
FUNC(6, 6, rgb48tobgr48_bswap),
FUNC(8, 6, rgb64to48_nobswap),
FUNC(8, 6, rgb64to48_bswap),
FUNC(8, 6, rgb64tobgr48_nobswap),
FUNC(8, 6, rgb64tobgr48_bswap),
FUNC(0, 0, NULL)
};
int width;
int failed = 0;
int srcBpp = 0;
int dstBpp = 0;
if (!func_info[funcNum].func)
break;
av_log(NULL, AV_LOG_INFO, ".");
memset(srcBuffer, srcByte, SIZE);
for (width = 63; width > 0; width--) {
int dstOffset;
for (dstOffset = 128; dstOffset < 196; dstOffset += 4) {
int srcOffset;
memset(dstBuffer, dstByte, SIZE);
for (srcOffset = 128; srcOffset < 196; srcOffset += 4) {
uint8_t *src = srcBuffer + srcOffset;
uint8_t *dst = dstBuffer + dstOffset;
const char *name = NULL;
// don't fill the screen with shit ...
if (failed)
break;
srcBpp = func_info[funcNum].src_bpp;
dstBpp = func_info[funcNum].dst_bpp;
name = func_info[funcNum].name;
func_info[funcNum].func(src, dst, width * srcBpp);
if (!srcBpp)
break;
for (i = 0; i < SIZE; i++) {
if (srcBuffer[i] != srcByte) {
av_log(NULL, AV_LOG_INFO,
"src damaged at %d w:%d src:%d dst:%d %s\n",
i, width, srcOffset, dstOffset, name);
failed = 1;
break;
}
}
for (i = 0; i < dstOffset; i++) {
if (dstBuffer[i] != dstByte) {
av_log(NULL, AV_LOG_INFO,
"dst damaged at %d w:%d src:%d dst:%d %s\n",
i, width, srcOffset, dstOffset, name);
failed = 1;
break;
}
}
for (i = dstOffset + width * dstBpp; i < SIZE; i++) {
if (dstBuffer[i] != dstByte) {
av_log(NULL, AV_LOG_INFO,
"dst damaged at %d w:%d src:%d dst:%d %s\n",
i, width, srcOffset, dstOffset, name);
failed = 1;
break;
}
}
}
}
}
if (failed)
failedNum++;
else if (srcBpp)
passedNum++;
}
av_log(NULL, AV_LOG_INFO,
"\n%d converters passed, %d converters randomly overwrote memory\n",
passedNum, failedNum);
return failedNum;
}

File diff suppressed because it is too large


@@ -0,0 +1,14 @@
prefix=/usr/local
exec_prefix=${prefix}
libdir=${prefix}/lib
includedir=${prefix}/include
Name: libswscale
Description: FFmpeg image rescaling library
Version: 2.5.101
Requires:
Requires.private: libavutil = 52.48.101
Conflicts:
Libs: -L${libdir} -lswscale
Libs.private: -lm
Cflags: -I${includedir}


@@ -0,0 +1,4 @@
LIBSWSCALE_$MAJOR {
    global: swscale_*; sws_*;
    local: *;
};


@@ -0,0 +1,4 @@
LIBSWSCALE_2 {
    global: swscale_*; sws_*;
    local: *;
};


@@ -0,0 +1,91 @@
/*
* Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/avutil.h"
#include "libavutil/opt.h"
#include "libavutil/pixfmt.h"
#include "swscale.h"
#include "swscale_internal.h"
static const char *sws_context_to_name(void *ptr)
{
return "swscaler";
}
#define OFFSET(x) offsetof(SwsContext, x)
#define DEFAULT 0
#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
static const AVOption swscale_options[] = {
{ "sws_flags", "scaler flags", OFFSET(flags), AV_OPT_TYPE_FLAGS, { .i64 = SWS_BICUBIC }, 0, UINT_MAX, VE, "sws_flags" },
{ "fast_bilinear", "fast bilinear", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_FAST_BILINEAR }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "bilinear", "bilinear", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_BILINEAR }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "bicubic", "bicubic", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_BICUBIC }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "experimental", "experimental", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_X }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "neighbor", "nearest neighbor", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_POINT }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "area", "averaging area", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_AREA }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "bicublin", "luma bicubic, chroma bilinear", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_BICUBLIN }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "gauss", "gaussian", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_GAUSS }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "sinc", "sinc", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_SINC }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "lanczos", "lanczos", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_LANCZOS }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "spline", "natural bicubic spline", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_SPLINE }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "print_info", "print info", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_PRINT_INFO }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "accurate_rnd", "accurate rounding", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_ACCURATE_RND }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "full_chroma_int", "full chroma interpolation", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_FULL_CHR_H_INT }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "full_chroma_inp", "full chroma input", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_FULL_CHR_H_INP }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "bitexact", "", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_BITEXACT }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "error_diffusion", "error diffusion dither", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_ERROR_DIFFUSION}, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "srcw", "source width", OFFSET(srcW), AV_OPT_TYPE_INT, { .i64 = 16 }, 1, INT_MAX, VE },
{ "srch", "source height", OFFSET(srcH), AV_OPT_TYPE_INT, { .i64 = 16 }, 1, INT_MAX, VE },
{ "dstw", "destination width", OFFSET(dstW), AV_OPT_TYPE_INT, { .i64 = 16 }, 1, INT_MAX, VE },
{ "dsth", "destination height", OFFSET(dstH), AV_OPT_TYPE_INT, { .i64 = 16 }, 1, INT_MAX, VE },
{ "src_format", "source format", OFFSET(srcFormat), AV_OPT_TYPE_INT, { .i64 = DEFAULT }, 0, AV_PIX_FMT_NB - 1, VE },
{ "dst_format", "destination format", OFFSET(dstFormat), AV_OPT_TYPE_INT, { .i64 = DEFAULT }, 0, AV_PIX_FMT_NB - 1, VE },
{ "src_range", "source range", OFFSET(srcRange), AV_OPT_TYPE_INT, { .i64 = DEFAULT }, 0, 1, VE },
{ "dst_range", "destination range", OFFSET(dstRange), AV_OPT_TYPE_INT, { .i64 = DEFAULT }, 0, 1, VE },
{ "param0", "scaler param 0", OFFSET(param[0]), AV_OPT_TYPE_DOUBLE, { .dbl = SWS_PARAM_DEFAULT }, INT_MIN, INT_MAX, VE },
{ "param1", "scaler param 1", OFFSET(param[1]), AV_OPT_TYPE_DOUBLE, { .dbl = SWS_PARAM_DEFAULT }, INT_MIN, INT_MAX, VE },
{ "src_v_chr_pos", "source vertical chroma position in luma grid/256" , OFFSET(src_v_chr_pos), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 512, VE },
{ "src_h_chr_pos", "source horizontal chroma position in luma grid/256", OFFSET(src_h_chr_pos), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 512, VE },
{ "dst_v_chr_pos", "destination vertical chroma position in luma grid/256" , OFFSET(dst_v_chr_pos), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 512, VE },
{ "dst_h_chr_pos", "destination horizontal chroma position in luma grid/256", OFFSET(dst_h_chr_pos), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, 512, VE },
{ "sws_dither", "set dithering algorithm", OFFSET(dither), AV_OPT_TYPE_INT, { .i64 = SWS_DITHER_AUTO }, 0, NB_SWS_DITHER, VE, "sws_dither" },
{ "auto", "leave choice to sws", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_DITHER_AUTO }, INT_MIN, INT_MAX, VE, "sws_dither" },
{ "bayer", "bayer dither", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_DITHER_BAYER }, INT_MIN, INT_MAX, VE, "sws_dither" },
{ "ed", "error diffusion", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_DITHER_ED }, INT_MIN, INT_MAX, VE, "sws_dither" },
{ NULL }
};
const AVClass sws_context_class = {
.class_name = "SWScaler",
.item_name = sws_context_to_name,
.option = swscale_options,
.category = AV_CLASS_CATEGORY_SWSCALER,
.version = LIBAVUTIL_VERSION_INT,
};
const AVClass *sws_get_class(void)
{
return &sws_context_class;
}
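/* Usage sketch (hypothetical caller, not part of this file): the options
 * above are set through the generic AVOption API on a context obtained
 * from sws_alloc_context(), e.g.:
 *
 *   struct SwsContext *sws = sws_alloc_context();
 *   av_opt_set_int(sws, "srcw", 640, 0);
 *   av_opt_set_int(sws, "srch", 480, 0);
 *   av_opt_set_int(sws, "src_format", AV_PIX_FMT_YUV420P, 0);
 *   av_opt_set_int(sws, "dstw", 320, 0);
 *   av_opt_set_int(sws, "dsth", 240, 0);
 *   av_opt_set_int(sws, "dst_format", AV_PIX_FMT_RGB24, 0);
 *   av_opt_set(sws, "sws_flags", "bicubic", 0);
 *   if (sws_init_context(sws, NULL, NULL) < 0)
 *       sws_freeContext(sws);   // init failed
 */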

File diff suppressed because it is too large


@@ -0,0 +1,3 @@
OBJS += ppc/swscale_altivec.o \
        ppc/yuv2rgb_altivec.o \
        ppc/yuv2yuv_altivec.o \


@@ -0,0 +1,332 @@
/*
* AltiVec-enhanced yuv2yuvX
*
* Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
* based on the equivalent C code in swscale.c
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <inttypes.h>
#include "config.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "yuv2rgb_altivec.h"
#if HAVE_ALTIVEC
#define vzero vec_splat_s32(0)
#define yuv2planeX_8(d1, d2, l1, src, x, perm, filter) do { \
vector signed short l2 = vec_ld(((x) << 1) + 16, src); \
vector signed short ls = vec_perm(l1, l2, perm); \
vector signed int i1 = vec_mule(filter, ls); \
vector signed int i2 = vec_mulo(filter, ls); \
vector signed int vf1 = vec_mergeh(i1, i2); \
vector signed int vf2 = vec_mergel(i1, i2); \
d1 = vec_add(d1, vf1); \
d2 = vec_add(d2, vf2); \
l1 = l2; \
} while (0)
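/* Scalar picture of yuv2planeX_8: for eight adjacent output pixels it adds
 * one filter tap's contribution into 32-bit accumulators, i.e.
 *
 *   for (k = 0; k < 8; k++)
 *       sum[x + k] += filter[j] * src[j][x + k];
 *
 * vec_mule/vec_mulo form the even/odd 16x16->32 products and the two
 * vec_merge calls restore pixel order before the adds. */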
static void yuv2planeX_16_altivec(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest,
const uint8_t *dither, int offset, int x)
{
register int i, j;
DECLARE_ALIGNED(16, int, val)[16];
vector signed int vo1, vo2, vo3, vo4;
vector unsigned short vs1, vs2;
vector unsigned char vf;
vector unsigned int altivec_vectorShiftInt19 =
vec_add(vec_splat_u32(10), vec_splat_u32(9));
for (i = 0; i < 16; i++)
val[i] = dither[(x + i + offset) & 7] << 12;
vo1 = vec_ld(0, val);
vo2 = vec_ld(16, val);
vo3 = vec_ld(32, val);
vo4 = vec_ld(48, val);
for (j = 0; j < filterSize; j++) {
vector signed short l1, vLumFilter = vec_ld(j << 1, filter);
vector unsigned char perm, perm0 = vec_lvsl(j << 1, filter);
vLumFilter = vec_perm(vLumFilter, vLumFilter, perm0);
vLumFilter = vec_splat(vLumFilter, 0); // lumFilter[j] is loaded 8 times in vLumFilter
perm = vec_lvsl(x << 1, src[j]);
l1 = vec_ld(x << 1, src[j]);
yuv2planeX_8(vo1, vo2, l1, src[j], x, perm, vLumFilter);
yuv2planeX_8(vo3, vo4, l1, src[j], x + 8, perm, vLumFilter);
}
vo1 = vec_sra(vo1, altivec_vectorShiftInt19);
vo2 = vec_sra(vo2, altivec_vectorShiftInt19);
vo3 = vec_sra(vo3, altivec_vectorShiftInt19);
vo4 = vec_sra(vo4, altivec_vectorShiftInt19);
vs1 = vec_packsu(vo1, vo2);
vs2 = vec_packsu(vo3, vo4);
vf = vec_packsu(vs1, vs2);
vec_st(vf, 0, dest);
}
static inline void yuv2planeX_u(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset, int x)
{
int i, j;
for (i = x; i < dstW; i++) {
int t = dither[(i + offset) & 7] << 12;
for (j = 0; j < filterSize; j++)
t += src[j][i] * filter[j];
dest[i] = av_clip_uint8(t >> 19);
}
}
static void yuv2planeX_altivec(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset)
{
int dst_u = -(uintptr_t)dest & 15;
int i;
yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0);
for (i = dst_u; i < dstW - 15; i += 16)
yuv2planeX_16_altivec(filter, filterSize, src, dest + i, dither,
offset, i);
yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i);
}
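/* The wrapper above uses the standard AltiVec head/body/tail split: scalar
 * code up to the first 16-byte-aligned destination byte, whole vectors
 * through the middle, scalar code for the remainder. Generically
 * (do_scalar/do_vector are hypothetical helpers):
 *
 *   int head = -(uintptr_t)dest & 15;  // bytes until dest is 16-aligned
 *   int i;
 *   do_scalar(0, head);
 *   for (i = head; i < n - 15; i += 16)
 *       do_vector(i);                  // 16 output pixels per iteration
 *   do_scalar(i, n);
 */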
static void hScale_altivec_real(SwsContext *c, int16_t *dst, int dstW,
const uint8_t *src, const int16_t *filter,
const int32_t *filterPos, int filterSize)
{
register int i;
DECLARE_ALIGNED(16, int, tempo)[4];
if (filterSize % 4) {
for (i = 0; i < dstW; i++) {
register int j;
register int srcPos = filterPos[i];
register int val = 0;
for (j = 0; j < filterSize; j++)
val += ((int)src[srcPos + j]) * filter[filterSize * i + j];
dst[i] = FFMIN(val >> 7, (1 << 15) - 1);
}
} else
switch (filterSize) {
case 4:
for (i = 0; i < dstW; i++) {
register int srcPos = filterPos[i];
vector unsigned char src_v0 = vec_ld(srcPos, src);
vector unsigned char src_v1, src_vF;
vector signed short src_v, filter_v;
vector signed int val_vEven, val_s;
if ((((uintptr_t)src + srcPos) % 16) > 12) {
src_v1 = vec_ld(srcPos + 16, src);
}
src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src));
src_v = // vec_unpackh sign-extends...
(vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
// now put our elements in the even slots
src_v = vec_mergeh(src_v, (vector signed short)vzero);
filter_v = vec_ld(i << 3, filter);
// The 3 above is 2 (filterSize == 4) + 1 (sizeof(short) == 2).
// The neat trick: We only care for half the elements,
// high or low depending on (i<<3)%16 (it's 0 or 8 here),
// and we're going to use vec_mule, so we choose
// carefully how to "unpack" the elements into the even slots.
if ((i << 3) % 16)
filter_v = vec_mergel(filter_v, (vector signed short)vzero);
else
filter_v = vec_mergeh(filter_v, (vector signed short)vzero);
val_vEven = vec_mule(src_v, filter_v);
val_s = vec_sums(val_vEven, vzero);
vec_st(val_s, 0, tempo);
dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
}
break;
case 8:
for (i = 0; i < dstW; i++) {
register int srcPos = filterPos[i];
vector unsigned char src_v0 = vec_ld(srcPos, src);
vector unsigned char src_v1, src_vF;
vector signed short src_v, filter_v;
vector signed int val_v, val_s;
if ((((uintptr_t)src + srcPos) % 16) > 8) {
src_v1 = vec_ld(srcPos + 16, src);
}
src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src));
src_v = // vec_unpackh sign-extends...
(vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
filter_v = vec_ld(i << 4, filter);
// the 4 above is 3 (filterSize == 8) + 1 (sizeof(short) == 2)
val_v = vec_msums(src_v, filter_v, (vector signed int)vzero);
val_s = vec_sums(val_v, vzero);
vec_st(val_s, 0, tempo);
dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
}
break;
case 16:
for (i = 0; i < dstW; i++) {
register int srcPos = filterPos[i];
vector unsigned char src_v0 = vec_ld(srcPos, src);
vector unsigned char src_v1 = vec_ld(srcPos + 16, src);
vector unsigned char src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src));
vector signed short src_vA = // vec_unpackh sign-extends...
(vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
vector signed short src_vB = // vec_unpackh sign-extends...
(vector signed short)(vec_mergel((vector unsigned char)vzero, src_vF));
vector signed short filter_v0 = vec_ld(i << 5, filter);
vector signed short filter_v1 = vec_ld((i << 5) + 16, filter);
// the 5 above are 4 (filterSize == 16) + 1 (sizeof(short) == 2)
vector signed int val_acc = vec_msums(src_vA, filter_v0, (vector signed int)vzero);
vector signed int val_v = vec_msums(src_vB, filter_v1, val_acc);
vector signed int val_s = vec_sums(val_v, vzero);
vec_st(val_s, 0, tempo);
dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
}
break;
default:
for (i = 0; i < dstW; i++) {
register int j;
register int srcPos = filterPos[i];
vector signed int val_s, val_v = (vector signed int)vzero;
vector signed short filter_v0R = vec_ld(i * 2 * filterSize, filter);
vector unsigned char permF = vec_lvsl((i * 2 * filterSize), filter);
vector unsigned char src_v0 = vec_ld(srcPos, src);
vector unsigned char permS = vec_lvsl(srcPos, src);
for (j = 0; j < filterSize - 15; j += 16) {
vector unsigned char src_v1 = vec_ld(srcPos + j + 16, src);
vector unsigned char src_vF = vec_perm(src_v0, src_v1, permS);
vector signed short src_vA = // vec_unpackh sign-extends...
(vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
vector signed short src_vB = // vec_unpackh sign-extends...
(vector signed short)(vec_mergel((vector unsigned char)vzero, src_vF));
vector signed short filter_v1R = vec_ld((i * 2 * filterSize) + (j * 2) + 16, filter);
vector signed short filter_v2R = vec_ld((i * 2 * filterSize) + (j * 2) + 32, filter);
vector signed short filter_v0 = vec_perm(filter_v0R, filter_v1R, permF);
vector signed short filter_v1 = vec_perm(filter_v1R, filter_v2R, permF);
vector signed int val_acc = vec_msums(src_vA, filter_v0, val_v);
val_v = vec_msums(src_vB, filter_v1, val_acc);
filter_v0R = filter_v2R;
src_v0 = src_v1;
}
if (j < filterSize - 7) {
// loading src_v0 is useless, it's already done above
// vector unsigned char src_v0 = vec_ld(srcPos + j, src);
vector unsigned char src_v1, src_vF;
vector signed short src_v, filter_v1R, filter_v;
if ((((uintptr_t)src + srcPos) % 16) > 8) {
src_v1 = vec_ld(srcPos + j + 16, src);
}
src_vF = vec_perm(src_v0, src_v1, permS);
src_v = // vec_unpackh sign-extends...
(vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
// loading filter_v0R is useless, it's already done above
// vector signed short filter_v0R = vec_ld((i * 2 * filterSize) + j, filter);
filter_v1R = vec_ld((i * 2 * filterSize) + (j * 2) + 16, filter);
filter_v = vec_perm(filter_v0R, filter_v1R, permF);
val_v = vec_msums(src_v, filter_v, val_v);
}
val_s = vec_sums(val_v, vzero);
vec_st(val_s, 0, tempo);
dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
}
}
}
#endif /* HAVE_ALTIVEC */
av_cold void ff_sws_init_swscale_ppc(SwsContext *c)
{
#if HAVE_ALTIVEC
enum AVPixelFormat dstFormat = c->dstFormat;
if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
return;
if (c->srcBpc == 8 && c->dstBpc <= 14) {
c->hyScale = c->hcScale = hScale_altivec_real;
}
if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) &&
dstFormat != AV_PIX_FMT_NV12 && dstFormat != AV_PIX_FMT_NV21 &&
!c->alpPixBuf) {
c->yuv2planeX = yuv2planeX_altivec;
}
/* The following list of supported dstFormat values should
* match what's found in the body of ff_yuv2packedX_altivec() */
if (!(c->flags & (SWS_BITEXACT | SWS_FULL_CHR_H_INT)) && !c->alpPixBuf) {
switch (c->dstFormat) {
case AV_PIX_FMT_ABGR:
c->yuv2packedX = ff_yuv2abgr_X_altivec;
break;
case AV_PIX_FMT_BGRA:
c->yuv2packedX = ff_yuv2bgra_X_altivec;
break;
case AV_PIX_FMT_ARGB:
c->yuv2packedX = ff_yuv2argb_X_altivec;
break;
case AV_PIX_FMT_RGBA:
c->yuv2packedX = ff_yuv2rgba_X_altivec;
break;
case AV_PIX_FMT_BGR24:
c->yuv2packedX = ff_yuv2bgr24_X_altivec;
break;
case AV_PIX_FMT_RGB24:
c->yuv2packedX = ff_yuv2rgb24_X_altivec;
break;
}
}
#endif /* HAVE_ALTIVEC */
}


@@ -0,0 +1,868 @@
/*
* AltiVec acceleration for colorspace conversion
*
* copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
 * Convert I420 YV12 to RGB in various formats. It rejects images that are
 * not in 420 formats, images whose widths are not multiples of 16, and
 * images whose heights are not multiples of 2. A reject defers to the C
 * simulation code.
 *
 * Lots of optimizations to be done here.
 *
 * 1. Need to fix saturation code. I just couldn't get it to fly with
 *    packs and adds, so we currently use max/min to clip.
 *
 * 2. The inefficient use of chroma loading needs a bit of brushing up.
 *
 * 3. Analysis of pipeline stalls needs to be done. Use Shark to identify
 *    pipeline stalls.
 *
 * MODIFIED to calculate coeffs from currently selected color space.
 * MODIFIED core to be a macro where you specify the output format.
 * ADDED UYVY conversion which is never called due to something in swscale.
 * CORRECTED algorithm selection to be strict on input formats.
 * ADDED runtime detection of AltiVec.
 *
 * ADDED altivec_yuv2packedX vertical scale + RGB converter
 *
 * March 27, 2004
 * PERFORMANCE ANALYSIS
 *
 * The C version uses 25% of the processor, or ~250 Mips, for D1 video
 * (rawvideo used as the test).
 * The AltiVec version uses 10% of the processor, or ~100 Mips, for the
 * same sequence.
 *
 * 720 * 480 * 30 ~ 10 Mpixels/s
 *
 * so we have roughly 10 clocks per pixel. This is too high; something
 * has to be wrong.
 *
 * OPTIMIZED clip codes to utilize vec_max and vec_packs, removing the
 * need for vec_min.
 *
 * OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed
 * to have the input video frame; it was just decompressed, so it probably
 * resides in L1 caches. However, we are creating the output video stream.
 * This needs to use the DSTST instruction to optimize for the cache. We
 * couple this with the fact that we are not going to be visiting the input
 * buffer again, so we mark it Least Recently Used. This shaves 25% of the
 * processor cycles off.
 *
 * Now memcpy is the largest mips consumer in the system, probably due
 * to the inefficient X11 stuff.
 *
 * GL libraries seem to be very slow on this machine, a 1.33 GHz PB running
 * Jaguar; this is not the case for my 1 GHz PB. I thought it might be a
 * versioning issue, however I have libGL.1.2.dylib for both machines.
 * (We need to figure this out now.)
 *
 * GL2 libraries work now with the patch for RGB32.
 *
 * NOTE: the quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
 *
 * Integrated luma prescaling for saturation/contrast/brightness
 * adjustment.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <assert.h>
#include "config.h"
#include "libswscale/rgb2rgb.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/pixdesc.h"
#include "yuv2rgb_altivec.h"
#if HAVE_ALTIVEC
#undef PROFILE_THE_BEAST
#undef INC_SCALING
typedef unsigned char ubyte;
typedef signed char sbyte;
/* RGB interleaver: 16 planar pels, 8-bit samples per channel, held in
 * homogeneous vector registers x0, x1, x2, are interleaved with the
 * following technique:
*
* o0 = vec_mergeh(x0, x1);
* o1 = vec_perm(o0, x2, perm_rgb_0);
* o2 = vec_perm(o0, x2, perm_rgb_1);
* o3 = vec_mergel(x0, x1);
* o4 = vec_perm(o3, o2, perm_rgb_2);
* o5 = vec_perm(o3, o2, perm_rgb_3);
*
* perm_rgb_0: o0(RG).h v1(B) --> o1*
* 0 1 2 3 4
* rgbr|gbrg|brgb|rgbr
* 0010 0100 1001 0010
* 0102 3145 2673 894A
*
* perm_rgb_1: o0(RG).h v1(B) --> o2
* 0 1 2 3 4
* gbrg|brgb|bbbb|bbbb
* 0100 1001 1111 1111
* B5CD 6EF7 89AB CDEF
*
* perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4*
* 0 1 2 3 4
* gbrg|brgb|rgbr|gbrg
* 1111 1111 0010 0100
* 89AB CDEF 0182 3945
*
* perm_rgb_2: o3(RG).l o2(rgbB.l) ---> o5*
* 0 1 2 3 4
* brgb|rgbr|gbrg|brgb
* 1001 0010 0100 1001
* a67b 89cA BdCD eEFf
*
*/
static const vector unsigned char
perm_rgb_0 = { 0x00, 0x01, 0x10, 0x02, 0x03, 0x11, 0x04, 0x05,
0x12, 0x06, 0x07, 0x13, 0x08, 0x09, 0x14, 0x0a },
perm_rgb_1 = { 0x0b, 0x15, 0x0c, 0x0d, 0x16, 0x0e, 0x0f, 0x17,
0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f },
perm_rgb_2 = { 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
0x00, 0x01, 0x18, 0x02, 0x03, 0x19, 0x04, 0x05 },
perm_rgb_3 = { 0x1a, 0x06, 0x07, 0x1b, 0x08, 0x09, 0x1c, 0x0a,
0x0b, 0x1d, 0x0c, 0x0d, 0x1e, 0x0e, 0x0f, 0x1f };
#define vec_merge3(x2, x1, x0, y0, y1, y2) \
do { \
__typeof__(x0) o0, o2, o3; \
o0 = vec_mergeh(x0, x1); \
y0 = vec_perm(o0, x2, perm_rgb_0); \
o2 = vec_perm(o0, x2, perm_rgb_1); \
o3 = vec_mergel(x0, x1); \
y1 = vec_perm(o3, o2, perm_rgb_2); \
y2 = vec_perm(o3, o2, perm_rgb_3); \
} while (0)
#define vec_mstbgr24(x0, x1, x2, ptr) \
do { \
__typeof__(x0) _0, _1, _2; \
vec_merge3(x0, x1, x2, _0, _1, _2); \
vec_st(_0, 0, ptr++); \
vec_st(_1, 0, ptr++); \
vec_st(_2, 0, ptr++); \
} while (0)
#define vec_mstrgb24(x0, x1, x2, ptr) \
do { \
__typeof__(x0) _0, _1, _2; \
vec_merge3(x2, x1, x0, _0, _1, _2); \
vec_st(_0, 0, ptr++); \
vec_st(_1, 0, ptr++); \
vec_st(_2, 0, ptr++); \
} while (0)
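/* Net effect of the two store macros above, in scalar C: three 16-byte
 * planar vectors become 48 interleaved bytes. vec_mstbgr24(r, g, b, p)
 * behaves like
 *
 *   for (int i = 0; i < 16; i++) {
 *       p8[3 * i + 0] = b[i];
 *       p8[3 * i + 1] = g[i];
 *       p8[3 * i + 2] = r[i];
 *   }
 *   p8 += 48;   // p advances by three vectors
 *
 * while vec_mstrgb24() emits the same bytes with r and b swapped. */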
/* pack the pixels in rgb0 format
* msb R
* lsb 0
*/
#define vec_mstrgb32(T, x0, x1, x2, x3, ptr) \
do { \
T _0, _1, _2, _3; \
_0 = vec_mergeh(x0, x1); \
_1 = vec_mergeh(x2, x3); \
_2 = (T) vec_mergeh((vector unsigned short) _0, \
(vector unsigned short) _1); \
_3 = (T) vec_mergel((vector unsigned short) _0, \
(vector unsigned short) _1); \
vec_st(_2, 0 * 16, (T *) ptr); \
vec_st(_3, 1 * 16, (T *) ptr); \
_0 = vec_mergel(x0, x1); \
_1 = vec_mergel(x2, x3); \
_2 = (T) vec_mergeh((vector unsigned short) _0, \
(vector unsigned short) _1); \
_3 = (T) vec_mergel((vector unsigned short) _0, \
(vector unsigned short) _1); \
vec_st(_2, 2 * 16, (T *) ptr); \
vec_st(_3, 3 * 16, (T *) ptr); \
ptr += 4; \
} while (0)
/*
 * [R]   [ 1    0       1.4021 ]   [ Y ]
 * [G] = [ 1   -0.3441 -0.7142 ] x [ Cb]
 * [B]   [ 1    1.7718  0      ]   [ Cr]
 *
 * Y:     [-128, 127]
 * Cb/Cr: [-128, 127]
 *
 * Typical YUV conversion works on Y: 0-255; this version has been
 * optimized for JPEG decoding.
 */
#define vec_unh(x) \
(vector signed short) \
vec_perm(x, (__typeof__(x)) { 0 }, \
((vector unsigned char) { \
0x10, 0x00, 0x10, 0x01, 0x10, 0x02, 0x10, 0x03, \
0x10, 0x04, 0x10, 0x05, 0x10, 0x06, 0x10, 0x07 }))
#define vec_unl(x) \
(vector signed short) \
vec_perm(x, (__typeof__(x)) { 0 }, \
((vector unsigned char) { \
0x10, 0x08, 0x10, 0x09, 0x10, 0x0A, 0x10, 0x0B, \
0x10, 0x0C, 0x10, 0x0D, 0x10, 0x0E, 0x10, 0x0F }))
#define vec_clip_s16(x) \
vec_max(vec_min(x, ((vector signed short) { \
235, 235, 235, 235, 235, 235, 235, 235 })), \
((vector signed short) { 16, 16, 16, 16, 16, 16, 16, 16 }))
#define vec_packclp(x, y) \
(vector unsigned char) \
vec_packs((vector unsigned short) \
vec_max(x, ((vector signed short) { 0 })), \
(vector unsigned short) \
vec_max(y, ((vector signed short) { 0 })))
static inline void cvtyuvtoRGB(SwsContext *c, vector signed short Y,
vector signed short U, vector signed short V,
vector signed short *R, vector signed short *G,
vector signed short *B)
{
vector signed short vx, ux, uvx;
Y = vec_mradds(Y, c->CY, c->OY);
U = vec_sub(U, (vector signed short)
vec_splat((vector signed short) { 128 }, 0));
V = vec_sub(V, (vector signed short)
vec_splat((vector signed short) { 128 }, 0));
// ux = (CBU * (u << c->CSHIFT) + 0x4000) >> 15;
ux = vec_sl(U, c->CSHIFT);
*B = vec_mradds(ux, c->CBU, Y);
// vx = (CRV * (v << c->CSHIFT) + 0x4000) >> 15;
vx = vec_sl(V, c->CSHIFT);
*R = vec_mradds(vx, c->CRV, Y);
// uvx = ((CGU * u) + (CGV * v)) >> 15;
uvx = vec_mradds(U, c->CGU, Y);
*G = vec_mradds(V, c->CGV, uvx);
}
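/* Scalar model of cvtyuvtoRGB() above (a sketch; vec_mradds(a, b, c)
 * computes ((a * b + 0x4000) >> 15) + c with saturation, which the shifts
 * below stand in for; saturation and clipping omitted):
 *
 *   y  = ((y * cy + 0x4000) >> 15) + oy;
 *   u -= 128;
 *   v -= 128;
 *   r  = y + (((v << cshift) * crv + 0x4000) >> 15);
 *   b  = y + (((u << cshift) * cbu + 0x4000) >> 15);
 *   g  = y + ((u * cgu + 0x4000) >> 15);
 *   g  = g + ((v * cgv + 0x4000) >> 15);
 */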
/*
* ------------------------------------------------------------------------------
* CS converters
* ------------------------------------------------------------------------------
*/
#define DEFCSP420_CVT(name, out_pixels) \
static int altivec_ ## name(SwsContext *c, const unsigned char **in, \
int *instrides, int srcSliceY, int srcSliceH, \
unsigned char **oplanes, int *outstrides) \
{ \
int w = c->srcW; \
int h = srcSliceH; \
int i, j; \
int instrides_scl[3]; \
vector unsigned char y0, y1; \
\
vector signed char u, v; \
\
vector signed short Y0, Y1, Y2, Y3; \
vector signed short U, V; \
vector signed short vx, ux, uvx; \
vector signed short vx0, ux0, uvx0; \
vector signed short vx1, ux1, uvx1; \
vector signed short R0, G0, B0; \
vector signed short R1, G1, B1; \
vector unsigned char R, G, B; \
\
const vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP; \
vector unsigned char align_perm; \
\
vector signed short lCY = c->CY; \
vector signed short lOY = c->OY; \
vector signed short lCRV = c->CRV; \
vector signed short lCBU = c->CBU; \
vector signed short lCGU = c->CGU; \
vector signed short lCGV = c->CGV; \
vector unsigned short lCSHIFT = c->CSHIFT; \
\
const ubyte *y1i = in[0]; \
const ubyte *y2i = in[0] + instrides[0]; \
const ubyte *ui = in[1]; \
const ubyte *vi = in[2]; \
\
vector unsigned char *oute, *outo; \
\
/* loop moves y{1, 2}i by w */ \
instrides_scl[0] = instrides[0] * 2 - w; \
/* loop moves ui by w / 2 */ \
instrides_scl[1] = instrides[1] - w / 2; \
/* loop moves vi by w / 2 */ \
instrides_scl[2] = instrides[2] - w / 2; \
\
for (i = 0; i < h / 2; i++) { \
oute = (vector unsigned char *)(oplanes[0] + outstrides[0] * \
(srcSliceY + i * 2)); \
outo = oute + (outstrides[0] >> 4); \
vec_dstst(outo, (0x02000002 | (((w * 3 + 32) / 32) << 16)), 0); \
vec_dstst(oute, (0x02000002 | (((w * 3 + 32) / 32) << 16)), 1); \
\
for (j = 0; j < w / 16; j++) { \
y1ivP = (const vector unsigned char *) y1i; \
y2ivP = (const vector unsigned char *) y2i; \
uivP = (const vector unsigned char *) ui; \
vivP = (const vector unsigned char *) vi; \
\
align_perm = vec_lvsl(0, y1i); \
y0 = (vector unsigned char) \
vec_perm(y1ivP[0], y1ivP[1], align_perm); \
\
align_perm = vec_lvsl(0, y2i); \
y1 = (vector unsigned char) \
vec_perm(y2ivP[0], y2ivP[1], align_perm); \
\
align_perm = vec_lvsl(0, ui); \
u = (vector signed char) \
vec_perm(uivP[0], uivP[1], align_perm); \
\
align_perm = vec_lvsl(0, vi); \
v = (vector signed char) \
vec_perm(vivP[0], vivP[1], align_perm); \
\
u = (vector signed char) \
vec_sub(u, \
(vector signed char) \
vec_splat((vector signed char) { 128 }, 0)); \
v = (vector signed char) \
vec_sub(v, \
(vector signed char) \
vec_splat((vector signed char) { 128 }, 0)); \
\
U = vec_unpackh(u); \
V = vec_unpackh(v); \
\
Y0 = vec_unh(y0); \
Y1 = vec_unl(y0); \
Y2 = vec_unh(y1); \
Y3 = vec_unl(y1); \
\
Y0 = vec_mradds(Y0, lCY, lOY); \
Y1 = vec_mradds(Y1, lCY, lOY); \
Y2 = vec_mradds(Y2, lCY, lOY); \
Y3 = vec_mradds(Y3, lCY, lOY); \
\
/* ux = (CBU * (u << CSHIFT) + 0x4000) >> 15 */ \
ux = vec_sl(U, lCSHIFT); \
ux = vec_mradds(ux, lCBU, (vector signed short) { 0 }); \
ux0 = vec_mergeh(ux, ux); \
ux1 = vec_mergel(ux, ux); \
\
/* vx = (CRV * (v << CSHIFT) + 0x4000) >> 15; */ \
vx = vec_sl(V, lCSHIFT); \
vx = vec_mradds(vx, lCRV, (vector signed short) { 0 }); \
vx0 = vec_mergeh(vx, vx); \
vx1 = vec_mergel(vx, vx); \
\
/* uvx = ((CGU * u) + (CGV * v)) >> 15 */ \
uvx = vec_mradds(U, lCGU, (vector signed short) { 0 }); \
uvx = vec_mradds(V, lCGV, uvx); \
uvx0 = vec_mergeh(uvx, uvx); \
uvx1 = vec_mergel(uvx, uvx); \
\
R0 = vec_add(Y0, vx0); \
G0 = vec_add(Y0, uvx0); \
B0 = vec_add(Y0, ux0); \
R1 = vec_add(Y1, vx1); \
G1 = vec_add(Y1, uvx1); \
B1 = vec_add(Y1, ux1); \
\
R = vec_packclp(R0, R1); \
G = vec_packclp(G0, G1); \
B = vec_packclp(B0, B1); \
\
out_pixels(R, G, B, oute); \
\
R0 = vec_add(Y2, vx0); \
G0 = vec_add(Y2, uvx0); \
B0 = vec_add(Y2, ux0); \
R1 = vec_add(Y3, vx1); \
G1 = vec_add(Y3, uvx1); \
B1 = vec_add(Y3, ux1); \
R = vec_packclp(R0, R1); \
G = vec_packclp(G0, G1); \
B = vec_packclp(B0, B1); \
\
\
out_pixels(R, G, B, outo); \
\
y1i += 16; \
y2i += 16; \
ui += 8; \
vi += 8; \
} \
\
ui += instrides_scl[1]; \
vi += instrides_scl[2]; \
y1i += instrides_scl[0]; \
y2i += instrides_scl[0]; \
} \
return srcSliceH; \
}
#define out_abgr(a, b, c, ptr) \
vec_mstrgb32(__typeof__(a), ((__typeof__(a)) { 255 }), c, b, a, ptr)
#define out_bgra(a, b, c, ptr) \
vec_mstrgb32(__typeof__(a), c, b, a, ((__typeof__(a)) { 255 }), ptr)
#define out_rgba(a, b, c, ptr) \
vec_mstrgb32(__typeof__(a), a, b, c, ((__typeof__(a)) { 255 }), ptr)
#define out_argb(a, b, c, ptr) \
vec_mstrgb32(__typeof__(a), ((__typeof__(a)) { 255 }), a, b, c, ptr)
#define out_rgb24(a, b, c, ptr) vec_mstrgb24(a, b, c, ptr)
#define out_bgr24(a, b, c, ptr) vec_mstbgr24(a, b, c, ptr)
DEFCSP420_CVT(yuv2_abgr, out_abgr)
DEFCSP420_CVT(yuv2_bgra, out_bgra)
DEFCSP420_CVT(yuv2_rgba, out_rgba)
DEFCSP420_CVT(yuv2_argb, out_argb)
DEFCSP420_CVT(yuv2_rgb24, out_rgb24)
DEFCSP420_CVT(yuv2_bgr24, out_bgr24)
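/* A sketch of what the out_* selectors above amount to per pixel, assuming
 * vec_mstrgb32() interleaves its four vector operands in argument order
 * (alpha is forced to opaque 0xFF, since YUV carries no alpha):
 *
 *     dst[0] = R;    dst[1] = G; dst[2] = B; dst[3] = 0xFF;   // out_rgba
 *     dst[0] = 0xFF; dst[1] = B; dst[2] = G; dst[3] = R;      // out_abgr
 *
 * The 24-bit variants store the same three channels without the alpha byte.
 */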
// uyvy|uyvy|uyvy|uyvy
// 0123 4567 89ab cdef
static const vector unsigned char
demux_u = { 0x10, 0x00, 0x10, 0x00,
0x10, 0x04, 0x10, 0x04,
0x10, 0x08, 0x10, 0x08,
0x10, 0x0c, 0x10, 0x0c },
demux_v = { 0x10, 0x02, 0x10, 0x02,
0x10, 0x06, 0x10, 0x06,
0x10, 0x0A, 0x10, 0x0A,
0x10, 0x0E, 0x10, 0x0E },
demux_y = { 0x10, 0x01, 0x10, 0x03,
0x10, 0x05, 0x10, 0x07,
0x10, 0x09, 0x10, 0x0B,
0x10, 0x0D, 0x10, 0x0F };
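/* A scalar model of the three permutations above (a sketch; the 0x10
 * entries select byte 0 of the zero vector passed as the second vec_perm
 * operand, so each 8-bit sample lands in the low byte of a big-endian
 * 16-bit lane). For one 16-byte UYVY block:
 *
 *     for (k = 0; k < 8; k++) {
 *         U[k] = uyvy[4 * (k / 2) + 0];   // u0 u0 u1 u1 u2 u2 u3 u3
 *         V[k] = uyvy[4 * (k / 2) + 2];   // v0 v0 v1 v1 v2 v2 v3 v3
 *         Y[k] = uyvy[2 * k + 1];         // y0 y1 y2 y3 y4 y5 y6 y7
 *     }
 *
 * i.e. each chroma sample is duplicated for the two luma samples it covers.
 */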
/*
 * This is here so we can play live CCIR raw video.
 */
static int altivec_uyvy_rgb32(SwsContext *c, const unsigned char **in,
int *instrides, int srcSliceY, int srcSliceH,
unsigned char **oplanes, int *outstrides)
{
int w = c->srcW;
int h = srcSliceH;
int i, j;
vector unsigned char uyvy;
vector signed short Y, U, V;
vector signed short R0, G0, B0, R1, G1, B1;
vector unsigned char R, G, B;
vector unsigned char *out;
const ubyte *img;
img = in[0];
out = (vector unsigned char *) (oplanes[0] + srcSliceY * outstrides[0]);
for (i = 0; i < h; i++)
for (j = 0; j < w / 16; j++) {
uyvy = vec_ld(0, img);
U = (vector signed short)
vec_perm(uyvy, (vector unsigned char) { 0 }, demux_u);
V = (vector signed short)
vec_perm(uyvy, (vector unsigned char) { 0 }, demux_v);
Y = (vector signed short)
vec_perm(uyvy, (vector unsigned char) { 0 }, demux_y);
cvtyuvtoRGB(c, Y, U, V, &R0, &G0, &B0);
uyvy = vec_ld(16, img);
U = (vector signed short)
vec_perm(uyvy, (vector unsigned char) { 0 }, demux_u);
V = (vector signed short)
vec_perm(uyvy, (vector unsigned char) { 0 }, demux_v);
Y = (vector signed short)
vec_perm(uyvy, (vector unsigned char) { 0 }, demux_y);
cvtyuvtoRGB(c, Y, U, V, &R1, &G1, &B1);
R = vec_packclp(R0, R1);
G = vec_packclp(G0, G1);
B = vec_packclp(B0, B1);
// vec_mstbgr24 (R,G,B, out);
out_rgba(R, G, B, out);
img += 32;
}
return srcSliceH;
}
#endif /* HAVE_ALTIVEC */
/* Currently the acceleration routines only support
 * inputs whose width is a multiple of 16
 * and whose height is a multiple of 2.
 *
 * For everything else we just fall back to the C code.
 */
av_cold SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c)
{
#if HAVE_ALTIVEC
if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
return NULL;
/*
 * The width restriction does not seem to matter much in practice: I tried
 * a bunch of videos with abnormal widths and MPlayer crashes elsewhere
 * anyway, e.g.
 * mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
 * dies with an X11 "bad match" error.
 */
if ((c->srcW & 0xf) != 0)
return NULL;
switch (c->srcFormat) {
case AV_PIX_FMT_YUV410P:
case AV_PIX_FMT_YUV420P:
/*case IMGFMT_CLPL: ??? */
case AV_PIX_FMT_GRAY8:
case AV_PIX_FMT_NV12:
case AV_PIX_FMT_NV21:
if ((c->srcH & 0x1) != 0)
return NULL;
switch (c->dstFormat) {
case AV_PIX_FMT_RGB24:
av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
return altivec_yuv2_rgb24;
case AV_PIX_FMT_BGR24:
av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
return altivec_yuv2_bgr24;
case AV_PIX_FMT_ARGB:
av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
return altivec_yuv2_argb;
case AV_PIX_FMT_ABGR:
av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
return altivec_yuv2_abgr;
case AV_PIX_FMT_RGBA:
av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
return altivec_yuv2_rgba;
case AV_PIX_FMT_BGRA:
av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
return altivec_yuv2_bgra;
default: return NULL;
}
break;
case AV_PIX_FMT_UYVY422:
switch (c->dstFormat) {
case AV_PIX_FMT_BGR32:
av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
return altivec_uyvy_rgb32;
default: return NULL;
}
break;
}
#endif /* HAVE_ALTIVEC */
return NULL;
}
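/* A minimal usage sketch for the initializer above (the fallback name is
 * hypothetical; the real dispatch lives in libswscale's generic yuv2rgb
 * init code). A NULL return means "format or geometry not accelerated
 * here, use the C path":
 *
 *     SwsFunc f = ff_yuv2rgb_init_ppc(c);
 *     if (!f)
 *         f = generic_c_yuv2rgb(c);   // hypothetical C fallback
 */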
av_cold void ff_yuv2rgb_init_tables_ppc(SwsContext *c,
const int inv_table[4],
int brightness,
int contrast,
int saturation)
{
#if HAVE_ALTIVEC
union {
DECLARE_ALIGNED(16, signed short, tmp)[8];
vector signed short vec;
} buf;
if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
return;
buf.tmp[0] = ((0xffffLL) * contrast >> 8) >> 9; // cy
buf.tmp[1] = -256 * brightness; // oy
buf.tmp[2] = (inv_table[0] >> 3) * (contrast >> 16) * (saturation >> 16); // crv
buf.tmp[3] = (inv_table[1] >> 3) * (contrast >> 16) * (saturation >> 16); // cbu
buf.tmp[4] = -((inv_table[2] >> 1) * (contrast >> 16) * (saturation >> 16)); // cgu
buf.tmp[5] = -((inv_table[3] >> 1) * (contrast >> 16) * (saturation >> 16)); // cgv
c->CSHIFT = (vector unsigned short) vec_splat_u16(2);
c->CY = vec_splat((vector signed short) buf.vec, 0);
c->OY = vec_splat((vector signed short) buf.vec, 1);
c->CRV = vec_splat((vector signed short) buf.vec, 2);
c->CBU = vec_splat((vector signed short) buf.vec, 3);
c->CGU = vec_splat((vector signed short) buf.vec, 4);
c->CGV = vec_splat((vector signed short) buf.vec, 5);
return;
#endif /* HAVE_ALTIVEC */
}
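/* A worked example of the fixed-point setup above, assuming the neutral
 * settings contrast = saturation = 1 << 16 and brightness = 0:
 *
 *     cy  = ((0xffff * 65536) >> 8) >> 9 = 0xffff00 >> 9 = 0x7fff
 *     oy  = -256 * 0 = 0
 *     crv = inv_table[0] >> 3    (contrast >> 16 and saturation >> 16 are 1)
 *
 * so CY is ~1.0 in the 1.15 fixed-point format consumed by vec_mradds(),
 * and the chroma coefficients reduce to the raw inv_table entries scaled
 * down by 8.
 */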
#if HAVE_ALTIVEC
static av_always_inline void yuv2packedX_altivec(SwsContext *c,
const int16_t *lumFilter,
const int16_t **lumSrc,
int lumFilterSize,
const int16_t *chrFilter,
const int16_t **chrUSrc,
const int16_t **chrVSrc,
int chrFilterSize,
const int16_t **alpSrc,
uint8_t *dest,
int dstW, int dstY,
enum AVPixelFormat target)
{
int i, j;
vector signed short X, X0, X1, Y0, U0, V0, Y1, U1, V1, U, V;
vector signed short R0, G0, B0, R1, G1, B1;
vector unsigned char R, G, B;
vector unsigned char *out, *nout;
vector signed short RND = vec_splat_s16(1 << 3);
vector unsigned short SCL = vec_splat_u16(4);
DECLARE_ALIGNED(16, unsigned int, scratch)[16];
vector signed short *YCoeffs, *CCoeffs;
YCoeffs = c->vYCoeffsBank + dstY * lumFilterSize;
CCoeffs = c->vCCoeffsBank + dstY * chrFilterSize;
out = (vector unsigned char *) dest;
for (i = 0; i < dstW; i += 16) {
Y0 = RND;
Y1 = RND;
/* extract 16 coeffs from lumSrc */
for (j = 0; j < lumFilterSize; j++) {
X0 = vec_ld(0, &lumSrc[j][i]);
X1 = vec_ld(16, &lumSrc[j][i]);
Y0 = vec_mradds(X0, YCoeffs[j], Y0);
Y1 = vec_mradds(X1, YCoeffs[j], Y1);
}
U = RND;
V = RND;
/* extract 8 coeffs from U,V */
for (j = 0; j < chrFilterSize; j++) {
X = vec_ld(0, &chrUSrc[j][i / 2]);
U = vec_mradds(X, CCoeffs[j], U);
X = vec_ld(0, &chrVSrc[j][i / 2]);
V = vec_mradds(X, CCoeffs[j], V);
}
/* scale and clip signals */
Y0 = vec_sra(Y0, SCL);
Y1 = vec_sra(Y1, SCL);
U = vec_sra(U, SCL);
V = vec_sra(V, SCL);
Y0 = vec_clip_s16(Y0);
Y1 = vec_clip_s16(Y1);
U = vec_clip_s16(U);
V = vec_clip_s16(V);
/* now we have
* Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
* U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7
*
* Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
* U0 = u0 u0 u1 u1 u2 u2 u3 u3 U1 = u4 u4 u5 u5 u6 u6 u7 u7
* V0 = v0 v0 v1 v1 v2 v2 v3 v3 V1 = v4 v4 v5 v5 v6 v6 v7 v7
*/
U0 = vec_mergeh(U, U);
V0 = vec_mergeh(V, V);
U1 = vec_mergel(U, U);
V1 = vec_mergel(V, V);
cvtyuvtoRGB(c, Y0, U0, V0, &R0, &G0, &B0);
cvtyuvtoRGB(c, Y1, U1, V1, &R1, &G1, &B1);
R = vec_packclp(R0, R1);
G = vec_packclp(G0, G1);
B = vec_packclp(B0, B1);
switch (target) {
case AV_PIX_FMT_ABGR:
out_abgr(R, G, B, out);
break;
case AV_PIX_FMT_BGRA:
out_bgra(R, G, B, out);
break;
case AV_PIX_FMT_RGBA:
out_rgba(R, G, B, out);
break;
case AV_PIX_FMT_ARGB:
out_argb(R, G, B, out);
break;
case AV_PIX_FMT_RGB24:
out_rgb24(R, G, B, out);
break;
case AV_PIX_FMT_BGR24:
out_bgr24(R, G, B, out);
break;
default:
{
/* If this is reached, the caller should have called yuv2packedXinC
* instead. */
static int printed_error_message;
if (!printed_error_message) {
av_log(c, AV_LOG_ERROR,
"altivec_yuv2packedX doesn't support %s output\n",
av_get_pix_fmt_name(c->dstFormat));
printed_error_message = 1;
}
return;
}
}
}
if (i < dstW) {
i -= 16;
Y0 = RND;
Y1 = RND;
/* extract 16 coeffs from lumSrc */
for (j = 0; j < lumFilterSize; j++) {
X0 = vec_ld(0, &lumSrc[j][i]);
X1 = vec_ld(16, &lumSrc[j][i]);
Y0 = vec_mradds(X0, YCoeffs[j], Y0);
Y1 = vec_mradds(X1, YCoeffs[j], Y1);
}
U = RND;
V = RND;
/* extract 8 coeffs from U,V */
for (j = 0; j < chrFilterSize; j++) {
X = vec_ld(0, &chrUSrc[j][i / 2]);
U = vec_mradds(X, CCoeffs[j], U);
X = vec_ld(0, &chrVSrc[j][i / 2]);
V = vec_mradds(X, CCoeffs[j], V);
}
/* scale and clip signals */
Y0 = vec_sra(Y0, SCL);
Y1 = vec_sra(Y1, SCL);
U = vec_sra(U, SCL);
V = vec_sra(V, SCL);
Y0 = vec_clip_s16(Y0);
Y1 = vec_clip_s16(Y1);
U = vec_clip_s16(U);
V = vec_clip_s16(V);
/* now we have
* Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
* U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7
*
* Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
* U0 = u0 u0 u1 u1 u2 u2 u3 u3 U1 = u4 u4 u5 u5 u6 u6 u7 u7
* V0 = v0 v0 v1 v1 v2 v2 v3 v3 V1 = v4 v4 v5 v5 v6 v6 v7 v7
*/
U0 = vec_mergeh(U, U);
V0 = vec_mergeh(V, V);
U1 = vec_mergel(U, U);
V1 = vec_mergel(V, V);
cvtyuvtoRGB(c, Y0, U0, V0, &R0, &G0, &B0);
cvtyuvtoRGB(c, Y1, U1, V1, &R1, &G1, &B1);
R = vec_packclp(R0, R1);
G = vec_packclp(G0, G1);
B = vec_packclp(B0, B1);
nout = (vector unsigned char *) scratch;
switch (target) {
case AV_PIX_FMT_ABGR:
out_abgr(R, G, B, nout);
break;
case AV_PIX_FMT_BGRA:
out_bgra(R, G, B, nout);
break;
case AV_PIX_FMT_RGBA:
out_rgba(R, G, B, nout);
break;
case AV_PIX_FMT_ARGB:
out_argb(R, G, B, nout);
break;
case AV_PIX_FMT_RGB24:
out_rgb24(R, G, B, nout);
break;
case AV_PIX_FMT_BGR24:
out_bgr24(R, G, B, nout);
break;
default:
/* Unreachable, I think. */
av_log(c, AV_LOG_ERROR,
"altivec_yuv2packedX doesn't support %s output\n",
av_get_pix_fmt_name(c->dstFormat));
return;
}
memcpy(&((uint32_t *) dest)[i], scratch, (dstW - i) * 4); /* 4 bytes per remaining pixel */
}
}
#define YUV2PACKEDX_WRAPPER(suffix, pixfmt) \
void ff_yuv2 ## suffix ## _X_altivec(SwsContext *c, \
const int16_t *lumFilter, \
const int16_t **lumSrc, \
int lumFilterSize, \
const int16_t *chrFilter, \
const int16_t **chrUSrc, \
const int16_t **chrVSrc, \
int chrFilterSize, \
const int16_t **alpSrc, \
uint8_t *dest, int dstW, int dstY) \
{ \
yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize, \
chrFilter, chrUSrc, chrVSrc, \
chrFilterSize, alpSrc, \
dest, dstW, dstY, pixfmt); \
}
YUV2PACKEDX_WRAPPER(abgr, AV_PIX_FMT_ABGR);
YUV2PACKEDX_WRAPPER(bgra, AV_PIX_FMT_BGRA);
YUV2PACKEDX_WRAPPER(argb, AV_PIX_FMT_ARGB);
YUV2PACKEDX_WRAPPER(rgba, AV_PIX_FMT_RGBA);
YUV2PACKEDX_WRAPPER(rgb24, AV_PIX_FMT_RGB24);
YUV2PACKEDX_WRAPPER(bgr24, AV_PIX_FMT_BGR24);
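/* Each wrapper above is a thin monomorphization of yuv2packedX_altivec;
 * e.g. YUV2PACKEDX_WRAPPER(rgba, AV_PIX_FMT_RGBA) expands to (sketch):
 *
 *     void ff_yuv2rgba_X_altivec(SwsContext *c, ...)
 *     {
 *         yuv2packedX_altivec(c, ..., AV_PIX_FMT_RGBA);
 *     }
 *
 * Since yuv2packedX_altivec is av_always_inline and target is a
 * compile-time constant in each wrapper, the switch (target) inside it
 * folds down to a single out_* store per wrapper.
 */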
#endif /* HAVE_ALTIVEC */

View File

@@ -0,0 +1,51 @@
/*
* AltiVec-enhanced yuv2packedX
*
* Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
* based on the equivalent C code in swscale.c
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef SWSCALE_PPC_YUV2RGB_ALTIVEC_H
#define SWSCALE_PPC_YUV2RGB_ALTIVEC_H
#include <stdint.h>
#include "libswscale/swscale_internal.h"
#define YUV2PACKEDX_HEADER(suffix) \
void ff_yuv2 ## suffix ## _X_altivec(SwsContext *c, \
const int16_t *lumFilter, \
const int16_t **lumSrc, \
int lumFilterSize, \
const int16_t *chrFilter, \
const int16_t **chrUSrc, \
const int16_t **chrVSrc, \
int chrFilterSize, \
const int16_t **alpSrc, \
uint8_t *dest, \
int dstW, int dstY);
YUV2PACKEDX_HEADER(abgr);
YUV2PACKEDX_HEADER(bgra);
YUV2PACKEDX_HEADER(argb);
YUV2PACKEDX_HEADER(rgba);
YUV2PACKEDX_HEADER(rgb24);
YUV2PACKEDX_HEADER(bgr24);
#endif /* SWSCALE_PPC_YUV2RGB_ALTIVEC_H */

View File

@@ -0,0 +1,204 @@
/*
* AltiVec-enhanced yuv-to-yuv conversion routines.
*
* Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
* based on the equivalent C code in swscale.c
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <inttypes.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#if HAVE_ALTIVEC
static int yv12toyuy2_unscaled_altivec(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY,
int srcSliceH, uint8_t *dstParam[],
int dstStride_a[])
{
uint8_t *dst = dstParam[0] + dstStride_a[0] * srcSliceY;
// yv12toyuy2(src[0], src[1], src[2], dst, c->srcW, srcSliceH,
// srcStride[0], srcStride[1], dstStride[0]);
const uint8_t *ysrc = src[0];
const uint8_t *usrc = src[1];
const uint8_t *vsrc = src[2];
const int width = c->srcW;
const int height = srcSliceH;
const int lumStride = srcStride[0];
const int chromStride = srcStride[1];
const int dstStride = dstStride_a[0];
const vector unsigned char yperm = vec_lvsl(0, ysrc);
const int vertLumPerChroma = 2;
register unsigned int y;
/* This code assumes:
*
* 1) dst is 16-byte aligned
* 2) dstStride is a multiple of 16
* 3) width is a multiple of 16
* 4) luma & chroma strides are multiples of 8
*/
for (y = 0; y < height; y++) {
int i;
for (i = 0; i < width - 31; i += 32) {
const unsigned int j = i >> 1;
vector unsigned char v_yA = vec_ld(i, ysrc);
vector unsigned char v_yB = vec_ld(i + 16, ysrc);
vector unsigned char v_yC = vec_ld(i + 32, ysrc);
vector unsigned char v_y1 = vec_perm(v_yA, v_yB, yperm);
vector unsigned char v_y2 = vec_perm(v_yB, v_yC, yperm);
vector unsigned char v_uA = vec_ld(j, usrc);
vector unsigned char v_uB = vec_ld(j + 16, usrc);
vector unsigned char v_u = vec_perm(v_uA, v_uB, vec_lvsl(j, usrc));
vector unsigned char v_vA = vec_ld(j, vsrc);
vector unsigned char v_vB = vec_ld(j + 16, vsrc);
vector unsigned char v_v = vec_perm(v_vA, v_vB, vec_lvsl(j, vsrc));
vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
vector unsigned char v_uv_b = vec_mergel(v_u, v_v);
vector unsigned char v_yuy2_0 = vec_mergeh(v_y1, v_uv_a);
vector unsigned char v_yuy2_1 = vec_mergel(v_y1, v_uv_a);
vector unsigned char v_yuy2_2 = vec_mergeh(v_y2, v_uv_b);
vector unsigned char v_yuy2_3 = vec_mergel(v_y2, v_uv_b);
vec_st(v_yuy2_0, (i << 1), dst);
vec_st(v_yuy2_1, (i << 1) + 16, dst);
vec_st(v_yuy2_2, (i << 1) + 32, dst);
vec_st(v_yuy2_3, (i << 1) + 48, dst);
}
if (i < width) {
const unsigned int j = i >> 1;
vector unsigned char v_y1 = vec_ld(i, ysrc);
vector unsigned char v_u = vec_ld(j, usrc);
vector unsigned char v_v = vec_ld(j, vsrc);
vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
vector unsigned char v_yuy2_0 = vec_mergeh(v_y1, v_uv_a);
vector unsigned char v_yuy2_1 = vec_mergel(v_y1, v_uv_a);
vec_st(v_yuy2_0, (i << 1), dst);
vec_st(v_yuy2_1, (i << 1) + 16, dst);
}
if ((y & (vertLumPerChroma - 1)) == vertLumPerChroma - 1) {
usrc += chromStride;
vsrc += chromStride;
}
ysrc += lumStride;
dst += dstStride;
}
return srcSliceH;
}
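/* A scalar model of one output line of the routine above (a sketch): two
 * luma bytes and one U/V pair are woven into the YUY2 pattern Y0 U0 Y1 V0.
 * The UYVY variant below differs only in byte order (U0 Y0 V0 Y1), i.e.
 * vec_mergeh(uv, y) instead of vec_mergeh(y, uv):
 *
 *     for (x = 0; x < width / 2; x++) {
 *         dst[4 * x + 0] = ysrc[2 * x + 0];
 *         dst[4 * x + 1] = usrc[x];
 *         dst[4 * x + 2] = ysrc[2 * x + 1];
 *         dst[4 * x + 3] = vsrc[x];
 *     }
 */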
static int yv12touyvy_unscaled_altivec(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY,
int srcSliceH, uint8_t *dstParam[],
int dstStride_a[])
{
uint8_t *dst = dstParam[0] + dstStride_a[0] * srcSliceY;
// yv12toyuy2(src[0], src[1], src[2], dst, c->srcW, srcSliceH,
// srcStride[0], srcStride[1], dstStride[0]);
const uint8_t *ysrc = src[0];
const uint8_t *usrc = src[1];
const uint8_t *vsrc = src[2];
const int width = c->srcW;
const int height = srcSliceH;
const int lumStride = srcStride[0];
const int chromStride = srcStride[1];
const int dstStride = dstStride_a[0];
const int vertLumPerChroma = 2;
const vector unsigned char yperm = vec_lvsl(0, ysrc);
register unsigned int y;
/* This code assumes:
*
* 1) dst is 16-byte aligned
* 2) dstStride is a multiple of 16
* 3) width is a multiple of 16
* 4) luma & chroma strides are multiples of 8
*/
for (y = 0; y < height; y++) {
int i;
for (i = 0; i < width - 31; i += 32) {
const unsigned int j = i >> 1;
vector unsigned char v_yA = vec_ld(i, ysrc);
vector unsigned char v_yB = vec_ld(i + 16, ysrc);
vector unsigned char v_yC = vec_ld(i + 32, ysrc);
vector unsigned char v_y1 = vec_perm(v_yA, v_yB, yperm);
vector unsigned char v_y2 = vec_perm(v_yB, v_yC, yperm);
vector unsigned char v_uA = vec_ld(j, usrc);
vector unsigned char v_uB = vec_ld(j + 16, usrc);
vector unsigned char v_u = vec_perm(v_uA, v_uB, vec_lvsl(j, usrc));
vector unsigned char v_vA = vec_ld(j, vsrc);
vector unsigned char v_vB = vec_ld(j + 16, vsrc);
vector unsigned char v_v = vec_perm(v_vA, v_vB, vec_lvsl(j, vsrc));
vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
vector unsigned char v_uv_b = vec_mergel(v_u, v_v);
vector unsigned char v_uyvy_0 = vec_mergeh(v_uv_a, v_y1);
vector unsigned char v_uyvy_1 = vec_mergel(v_uv_a, v_y1);
vector unsigned char v_uyvy_2 = vec_mergeh(v_uv_b, v_y2);
vector unsigned char v_uyvy_3 = vec_mergel(v_uv_b, v_y2);
vec_st(v_uyvy_0, (i << 1), dst);
vec_st(v_uyvy_1, (i << 1) + 16, dst);
vec_st(v_uyvy_2, (i << 1) + 32, dst);
vec_st(v_uyvy_3, (i << 1) + 48, dst);
}
if (i < width) {
const unsigned int j = i >> 1;
vector unsigned char v_y1 = vec_ld(i, ysrc);
vector unsigned char v_u = vec_ld(j, usrc);
vector unsigned char v_v = vec_ld(j, vsrc);
vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
vector unsigned char v_uyvy_0 = vec_mergeh(v_uv_a, v_y1);
vector unsigned char v_uyvy_1 = vec_mergel(v_uv_a, v_y1);
vec_st(v_uyvy_0, (i << 1), dst);
vec_st(v_uyvy_1, (i << 1) + 16, dst);
}
if ((y & (vertLumPerChroma - 1)) == vertLumPerChroma - 1) {
usrc += chromStride;
vsrc += chromStride;
}
ysrc += lumStride;
dst += dstStride;
}
return srcSliceH;
}
#endif /* HAVE_ALTIVEC */
av_cold void ff_get_unscaled_swscale_ppc(SwsContext *c)
{
#if HAVE_ALTIVEC
if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
return;
if (!(c->srcW & 15) && !(c->flags & SWS_BITEXACT) &&
c->srcFormat == AV_PIX_FMT_YUV420P) {
enum AVPixelFormat dstFormat = c->dstFormat;
// unscaled YV12 -> packed YUV, we want speed
if (dstFormat == AV_PIX_FMT_YUYV422)
c->swscale = yv12toyuy2_unscaled_altivec;
else if (dstFormat == AV_PIX_FMT_UYVY422)
c->swscale = yv12touyvy_unscaled_altivec;
}
#endif /* HAVE_ALTIVEC */
}

View File

@@ -0,0 +1,390 @@
/*
* software RGB to RGB converter
* plus software PAL8 to RGB converters
* software YUV to YUV converter
* software YUV to RGB converter
* Written by Nick Kurshev.
* palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <inttypes.h>
#include "libavutil/attributes.h"
#include "libavutil/bswap.h"
#include "config.h"
#include "rgb2rgb.h"
#include "swscale.h"
#include "swscale_internal.h"
void (*rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size);
void (*shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size);
void (*yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
void (*yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
void (*yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
void (*yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
void (*yuy2toyv12)(const uint8_t *src, uint8_t *ydst,
uint8_t *udst, uint8_t *vdst,
int width, int height,
int lumStride, int chromStride, int srcStride);
void (*ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst,
uint8_t *udst, uint8_t *vdst,
int width, int height,
int lumStride, int chromStride, int srcStride,
int32_t *rgb2yuv);
void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height,
int srcStride, int dstStride);
void (*interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dst,
int width, int height, int src1Stride,
int src2Stride, int dstStride);
void (*vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
uint8_t *dst1, uint8_t *dst2,
int width, int height,
int srcStride1, int srcStride2,
int dstStride1, int dstStride2);
void (*yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2,
const uint8_t *src3, uint8_t *dst,
int width, int height,
int srcStride1, int srcStride2,
int srcStride3, int dstStride);
void (*uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride);
void (*uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride);
void (*yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride);
void (*yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride);
#define BY ((int)( 0.098 * (1 << RGB2YUV_SHIFT) + 0.5))
#define BV ((int)(-0.071 * (1 << RGB2YUV_SHIFT) + 0.5))
#define BU ((int)( 0.439 * (1 << RGB2YUV_SHIFT) + 0.5))
#define GY ((int)( 0.504 * (1 << RGB2YUV_SHIFT) + 0.5))
#define GV ((int)(-0.368 * (1 << RGB2YUV_SHIFT) + 0.5))
#define GU ((int)(-0.291 * (1 << RGB2YUV_SHIFT) + 0.5))
#define RY ((int)( 0.257 * (1 << RGB2YUV_SHIFT) + 0.5))
#define RV ((int)( 0.439 * (1 << RGB2YUV_SHIFT) + 0.5))
#define RU ((int)(-0.148 * (1 << RGB2YUV_SHIFT) + 0.5))
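/* A worked example for the coefficients above, assuming RGB2YUV_SHIFT is
 * 15 (its value in swscale_internal.h): RY = (int)(0.257 * 32768 + 0.5)
 * = 8421, GY = 16515, BY = 3211. Feeding full-range white r = g = b = 255
 * through the luma equation used by ff_rgb24toyv12_c() gives
 *
 *     Y = ((8421 + 16515 + 3211) * 255 >> 15) + 16 = 219 + 16 = 235
 *
 * i.e. the top of the studio-range [16..235] luma scale, as intended.
 */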
//plain C versions
#include "rgb2rgb_template.c"
/*
* RGB15->RGB16: original by Strepto/Astral,
* ported to gcc & bugfixed: A'rpi,
* MMXEXT and 3DNOW optimizations by Nick Kurshev,
* 32-bit C version and the and&add trick by Michael Niedermayer.
*/
av_cold void sws_rgb2rgb_init(void)
{
rgb2rgb_init_c();
if (ARCH_X86)
rgb2rgb_init_x86();
}
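/* A minimal usage sketch: the converters declared as function pointers in
 * rgb2rgb.h are unset until the dispatch table is initialized, so callers
 * do (assuming src holds src_size bytes of RGB32):
 *
 *     sws_rgb2rgb_init();               // installs C or x86 implementations
 *     rgb32to24(src, dst, src_size);    // now safe to call
 */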
void rgb32to24(const uint8_t *src, uint8_t *dst, int src_size)
{
int i, num_pixels = src_size >> 2;
for (i = 0; i < num_pixels; i++) {
#if HAVE_BIGENDIAN
/* RGB32 (= A,B,G,R) -> BGR24 (= B,G,R) */
dst[3 * i + 0] = src[4 * i + 1];
dst[3 * i + 1] = src[4 * i + 2];
dst[3 * i + 2] = src[4 * i + 3];
#else
dst[3 * i + 0] = src[4 * i + 2];
dst[3 * i + 1] = src[4 * i + 1];
dst[3 * i + 2] = src[4 * i + 0];
#endif
}
}
void rgb24to32(const uint8_t *src, uint8_t *dst, int src_size)
{
int i;
for (i = 0; 3 * i < src_size; i++) {
#if HAVE_BIGENDIAN
/* RGB24 (= R, G, B) -> BGR32 (= A, R, G, B) */
dst[4 * i + 0] = 255;
dst[4 * i + 1] = src[3 * i + 0];
dst[4 * i + 2] = src[3 * i + 1];
dst[4 * i + 3] = src[3 * i + 2];
#else
dst[4 * i + 0] = src[3 * i + 2];
dst[4 * i + 1] = src[3 * i + 1];
dst[4 * i + 2] = src[3 * i + 0];
dst[4 * i + 3] = 255;
#endif
}
}
void rgb16tobgr32(const uint8_t *src, uint8_t *dst, int src_size)
{
uint8_t *d = dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
while (s < end) {
register uint16_t bgr = *s++;
#if HAVE_BIGENDIAN
*d++ = 255;
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
*d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
#else
*d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
*d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = 255;
#endif
}
}
void rgb12to15(const uint8_t *src, uint8_t *dst, int src_size)
{
uint16_t rgb, r, g, b;
uint16_t *d = (uint16_t *)dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
while (s < end) {
rgb = *s++;
r = rgb & 0xF00;
g = rgb & 0x0F0;
b = rgb & 0x00F;
r = (r << 3) | ((r & 0x800) >> 1);
g = (g << 2) | ((g & 0x080) >> 2);
b = (b << 1) | ( b >> 3);
*d++ = r | g | b;
}
}
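/* A worked example for the channel widening above: each 4-bit channel
 * gains one bit by replicating its MSB into the new LSB, e.g.
 * r = 0xF00 -> (0xF00 << 3) | (0x800 >> 1) = 0x7800 | 0x0400 = 0x7C00,
 * and likewise g = 0x0F0 -> 0x3E0, b = 0x00F -> 0x1F, so 12-bit white
 * 0xFFF maps exactly to 15-bit white 0x7FFF. */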
void rgb16to24(const uint8_t *src, uint8_t *dst, int src_size)
{
uint8_t *d = dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
while (s < end) {
register uint16_t bgr = *s++;
*d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
*d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
}
}
void rgb16tobgr16(const uint8_t *src, uint8_t *dst, int src_size)
{
int i, num_pixels = src_size >> 1;
for (i = 0; i < num_pixels; i++) {
unsigned rgb = ((const uint16_t *)src)[i];
((uint16_t *)dst)[i] = (rgb >> 11) | (rgb & 0x7E0) | (rgb << 11);
}
}
void rgb16tobgr15(const uint8_t *src, uint8_t *dst, int src_size)
{
int i, num_pixels = src_size >> 1;
for (i = 0; i < num_pixels; i++) {
unsigned rgb = ((const uint16_t *)src)[i];
((uint16_t *)dst)[i] = (rgb >> 11) | ((rgb & 0x7C0) >> 1) | ((rgb & 0x1F) << 10);
}
}
void rgb15tobgr32(const uint8_t *src, uint8_t *dst, int src_size)
{
uint8_t *d = dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
while (s < end) {
register uint16_t bgr = *s++;
#if HAVE_BIGENDIAN
*d++ = 255;
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
*d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
#else
*d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
*d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = 255;
#endif
}
}
void rgb15to24(const uint8_t *src, uint8_t *dst, int src_size)
{
uint8_t *d = dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
while (s < end) {
register uint16_t bgr = *s++;
*d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
*d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
}
}
void rgb15tobgr16(const uint8_t *src, uint8_t *dst, int src_size)
{
int i, num_pixels = src_size >> 1;
for (i = 0; i < num_pixels; i++) {
unsigned rgb = ((const uint16_t *)src)[i];
((uint16_t *)dst)[i] = ((rgb & 0x7C00) >> 10) | ((rgb & 0x3E0) << 1) | (rgb << 11);
}
}
void rgb15tobgr15(const uint8_t *src, uint8_t *dst, int src_size)
{
int i, num_pixels = src_size >> 1;
for (i = 0; i < num_pixels; i++) {
unsigned rgb = ((const uint16_t *)src)[i];
unsigned br = rgb & 0x7C1F;
((uint16_t *)dst)[i] = (br >> 10) | (rgb & 0x3E0) | (br << 10);
}
}
void rgb12tobgr12(const uint8_t *src, uint8_t *dst, int src_size)
{
uint16_t *d = (uint16_t *)dst;
uint16_t *s = (uint16_t *)src;
int i, num_pixels = src_size >> 1;
for (i = 0; i < num_pixels; i++) {
unsigned rgb = s[i];
d[i] = (rgb << 8 | rgb & 0xF0 | rgb >> 8) & 0xFFF;
}
}
#define DEFINE_SHUFFLE_BYTES(a, b, c, d) \
void shuffle_bytes_ ## a ## b ## c ## d(const uint8_t *src, \
uint8_t *dst, int src_size) \
{ \
int i; \
\
for (i = 0; i < src_size; i += 4) { \
dst[i + 0] = src[i + a]; \
dst[i + 1] = src[i + b]; \
dst[i + 2] = src[i + c]; \
dst[i + 3] = src[i + d]; \
} \
}
DEFINE_SHUFFLE_BYTES(0, 3, 2, 1)
DEFINE_SHUFFLE_BYTES(1, 2, 3, 0)
DEFINE_SHUFFLE_BYTES(3, 0, 1, 2)
DEFINE_SHUFFLE_BYTES(3, 2, 1, 0)
#define DEFINE_RGB48TOBGR48(need_bswap, swap) \
void rgb48tobgr48_ ## need_bswap(const uint8_t *src, \
uint8_t *dst, int src_size) \
{ \
uint16_t *d = (uint16_t *)dst; \
uint16_t *s = (uint16_t *)src; \
int i, num_pixels = src_size >> 1; \
\
for (i = 0; i < num_pixels; i += 3) { \
d[i ] = swap ? av_bswap16(s[i + 2]) : s[i + 2]; \
d[i + 1] = swap ? av_bswap16(s[i + 1]) : s[i + 1]; \
d[i + 2] = swap ? av_bswap16(s[i ]) : s[i ]; \
} \
}
DEFINE_RGB48TOBGR48(nobswap, 0)
DEFINE_RGB48TOBGR48(bswap, 1)
#define DEFINE_RGB64TOBGR48(need_bswap, swap) \
void rgb64tobgr48_ ## need_bswap(const uint8_t *src, \
uint8_t *dst, int src_size) \
{ \
uint16_t *d = (uint16_t *)dst; \
uint16_t *s = (uint16_t *)src; \
int i, num_pixels = src_size >> 3; \
\
for (i = 0; i < num_pixels; i++) { \
d[3 * i ] = swap ? av_bswap16(s[4 * i + 2]) : s[4 * i + 2]; \
d[3 * i + 1] = swap ? av_bswap16(s[4 * i + 1]) : s[4 * i + 1]; \
d[3 * i + 2] = swap ? av_bswap16(s[4 * i ]) : s[4 * i ]; \
} \
}
DEFINE_RGB64TOBGR48(nobswap, 0)
DEFINE_RGB64TOBGR48(bswap, 1)
#define DEFINE_RGB64TO48(need_bswap, swap) \
void rgb64to48_ ## need_bswap(const uint8_t *src, \
uint8_t *dst, int src_size) \
{ \
uint16_t *d = (uint16_t *)dst; \
uint16_t *s = (uint16_t *)src; \
int i, num_pixels = src_size >> 3; \
\
for (i = 0; i < num_pixels; i++) { \
d[3 * i ] = swap ? av_bswap16(s[4 * i ]) : s[4 * i ]; \
d[3 * i + 1] = swap ? av_bswap16(s[4 * i + 1]) : s[4 * i + 1]; \
d[3 * i + 2] = swap ? av_bswap16(s[4 * i + 2]) : s[4 * i + 2]; \
} \
}
DEFINE_RGB64TO48(nobswap, 0)
DEFINE_RGB64TO48(bswap, 1)

View File

@@ -0,0 +1,167 @@
/*
* software RGB to RGB converter
* plus software PAL8 to RGB converters
* Software YUV to YUV converter
* Software YUV to RGB converter
* Written by Nick Kurshev.
* YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef SWSCALE_RGB2RGB_H
#define SWSCALE_RGB2RGB_H
#include <inttypes.h>
#include "libavutil/avutil.h"
#include "swscale.h"
/* A full collection of RGB to RGB(BGR) converters */
extern void (*rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size);
void rgb64tobgr48_nobswap(const uint8_t *src, uint8_t *dst, int src_size);
void rgb64tobgr48_bswap(const uint8_t *src, uint8_t *dst, int src_size);
void rgb48tobgr48_nobswap(const uint8_t *src, uint8_t *dst, int src_size);
void rgb48tobgr48_bswap(const uint8_t *src, uint8_t *dst, int src_size);
void rgb64to48_nobswap(const uint8_t *src, uint8_t *dst, int src_size);
void rgb64to48_bswap(const uint8_t *src, uint8_t *dst, int src_size);
void rgb24to32(const uint8_t *src, uint8_t *dst, int src_size);
void rgb32to24(const uint8_t *src, uint8_t *dst, int src_size);
void rgb16tobgr32(const uint8_t *src, uint8_t *dst, int src_size);
void rgb16to24(const uint8_t *src, uint8_t *dst, int src_size);
void rgb16tobgr16(const uint8_t *src, uint8_t *dst, int src_size);
void rgb16tobgr15(const uint8_t *src, uint8_t *dst, int src_size);
void rgb15tobgr32(const uint8_t *src, uint8_t *dst, int src_size);
void rgb15to24(const uint8_t *src, uint8_t *dst, int src_size);
void rgb15tobgr16(const uint8_t *src, uint8_t *dst, int src_size);
void rgb15tobgr15(const uint8_t *src, uint8_t *dst, int src_size);
void rgb12tobgr12(const uint8_t *src, uint8_t *dst, int src_size);
void rgb12to15(const uint8_t *src, uint8_t *dst, int src_size);
void shuffle_bytes_0321(const uint8_t *src, uint8_t *dst, int src_size);
void shuffle_bytes_1230(const uint8_t *src, uint8_t *dst, int src_size);
void shuffle_bytes_3012(const uint8_t *src, uint8_t *dst, int src_size);
void shuffle_bytes_3210(const uint8_t *src, uint8_t *dst, int src_size);
void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
uint8_t *vdst, int width, int height, int lumStride,
int chromStride, int srcStride, int32_t *rgb2yuv);
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
extern void (*yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
/**
* Width should be a multiple of 16.
*/
extern void (*yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
extern void (*yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
int width, int height,
int lumStride, int chromStride, int srcStride);
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
extern void (*yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
/**
* Width should be a multiple of 16.
*/
extern void (*yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
/**
* Height should be a multiple of 2 and width should be a multiple of 2.
* (If this is a problem for anyone then tell me, and I will fix it.)
* Chrominance data is only taken from every second line, others are ignored.
* FIXME: Write high quality version.
*/
extern void (*ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
int width, int height,
int lumStride, int chromStride, int srcStride,
int32_t *rgb2yuv);
extern void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height,
int srcStride, int dstStride);
extern void (*interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dst,
int width, int height, int src1Stride,
int src2Stride, int dstStride);
extern void (*vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
uint8_t *dst1, uint8_t *dst2,
int width, int height,
int srcStride1, int srcStride2,
int dstStride1, int dstStride2);
extern void (*yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
uint8_t *dst,
int width, int height,
int srcStride1, int srcStride2,
int srcStride3, int dstStride);
extern void (*uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
int width, int height,
int lumStride, int chromStride, int srcStride);
extern void (*uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
int width, int height,
int lumStride, int chromStride, int srcStride);
extern void (*yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
int width, int height,
int lumStride, int chromStride, int srcStride);
extern void (*yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
int width, int height,
int lumStride, int chromStride, int srcStride);
void sws_rgb2rgb_init(void);
void rgb2rgb_init_x86(void);
#endif /* SWSCALE_RGB2RGB_H */

View File

@@ -0,0 +1,932 @@
/*
* software RGB to RGB converter
* plus software PAL8 to RGB converters
* software YUV to YUV converter
* software YUV to RGB converter
* Written by Nick Kurshev.
* palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
* lot of big-endian byte order fixes by Alex Beregszaszi
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stddef.h>
#include "libavutil/attributes.h"
static inline void rgb24tobgr32_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint8_t *dest = dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
#if HAVE_BIGENDIAN
/* RGB24 (= R, G, B) -> RGB32 (= A, B, G, R) */
*dest++ = 255;
*dest++ = s[2];
*dest++ = s[1];
*dest++ = s[0];
s += 3;
#else
*dest++ = *s++;
*dest++ = *s++;
*dest++ = *s++;
*dest++ = 255;
#endif
}
}
static inline void rgb32tobgr24_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint8_t *dest = dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
#if HAVE_BIGENDIAN
/* RGB32 (= A, B, G, R) -> RGB24 (= R, G, B) */
s++;
dest[2] = *s++;
dest[1] = *s++;
dest[0] = *s++;
dest += 3;
#else
*dest++ = *s++;
*dest++ = *s++;
*dest++ = *s++;
s++;
#endif
}
}
/*
* Original by Strepto/Astral,
* ported to gcc & bugfixed: A'rpi,
* MMXEXT and 3DNOW optimizations by Nick Kurshev,
* 32-bit C version and the and&add trick by Michael Niedermayer.
*/
static inline void rgb15to16_c(const uint8_t *src, uint8_t *dst, int src_size)
{
register uint8_t *d = dst;
register const uint8_t *s = src;
register const uint8_t *end = s + src_size;
const uint8_t *mm_end = end - 3;
while (s < mm_end) {
register unsigned x = *((const uint32_t *)s);
*((uint32_t *)d) = (x & 0x7FFF7FFF) + (x & 0x7FE07FE0);
d += 4;
s += 4;
}
if (s < end) {
register unsigned short x = *((const uint16_t *)s);
*((uint16_t *)d) = (x & 0x7FFF) + (x & 0x7FE0);
}
}
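/* How the and&add trick above works: for a 15-bit pixel
 * x = 0rrrrrgggggbbbbb, (x & 0x7FFF) + (x & 0x7FE0) adds the R/G field to
 * itself, shifting bits 5..14 up by one while leaving B in place:
 *
 *     0rrrrrgggggbbbbb  ->  rrrrrggggg0bbbbb   (RGB565, new green LSB = 0)
 *
 * Two pixels per 32-bit word can be processed at once because each 16-bit
 * half sums to at most 0x7FFF + 0x7FE0 = 0xFFDF, so no carry crosses into
 * the neighbouring pixel.
 */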
static inline void rgb16to15_c(const uint8_t *src, uint8_t *dst, int src_size)
{
register uint8_t *d = dst;
register const uint8_t *s = src;
register const uint8_t *end = s + src_size;
const uint8_t *mm_end = end - 3;
while (s < mm_end) {
register uint32_t x = *((const uint32_t *)s);
*((uint32_t *)d) = ((x >> 1) & 0x7FE07FE0) | (x & 0x001F001F);
s += 4;
d += 4;
}
if (s < end) {
register uint16_t x = *((const uint16_t *)s);
*((uint16_t *)d) = ((x >> 1) & 0x7FE0) | (x & 0x001F);
}
}
static inline void rgb32to16_c(const uint8_t *src, uint8_t *dst, int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
register int rgb = *(const uint32_t *)s;
s += 4;
*d++ = ((rgb & 0xFF) >> 3) +
((rgb & 0xFC00) >> 5) +
((rgb & 0xF80000) >> 8);
}
}
static inline void rgb32tobgr16_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
register int rgb = *(const uint32_t *)s;
s += 4;
*d++ = ((rgb & 0xF8) << 8) +
((rgb & 0xFC00) >> 5) +
((rgb & 0xF80000) >> 19);
}
}
static inline void rgb32to15_c(const uint8_t *src, uint8_t *dst, int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
register int rgb = *(const uint32_t *)s;
s += 4;
*d++ = ((rgb & 0xFF) >> 3) +
((rgb & 0xF800) >> 6) +
((rgb & 0xF80000) >> 9);
}
}
static inline void rgb32tobgr15_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
register int rgb = *(const uint32_t *)s;
s += 4;
*d++ = ((rgb & 0xF8) << 7) +
((rgb & 0xF800) >> 6) +
((rgb & 0xF80000) >> 19);
}
}
static inline void rgb24tobgr16_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
const int b = *s++;
const int g = *s++;
const int r = *s++;
*d++ = (b >> 3) | ((g & 0xFC) << 3) | ((r & 0xF8) << 8);
}
}
static inline void rgb24to16_c(const uint8_t *src, uint8_t *dst, int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
const int r = *s++;
const int g = *s++;
const int b = *s++;
*d++ = (b >> 3) | ((g & 0xFC) << 3) | ((r & 0xF8) << 8);
}
}
static inline void rgb24tobgr15_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
const int b = *s++;
const int g = *s++;
const int r = *s++;
*d++ = (b >> 3) | ((g & 0xF8) << 2) | ((r & 0xF8) << 7);
}
}
static inline void rgb24to15_c(const uint8_t *src, uint8_t *dst, int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
const int r = *s++;
const int g = *s++;
const int b = *s++;
*d++ = (b >> 3) | ((g & 0xF8) << 2) | ((r & 0xF8) << 7);
}
}
static inline void rgb15tobgr24_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint8_t *d = dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
while (s < end) {
register uint16_t bgr = *s++;
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
*d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
}
}
static inline void rgb16tobgr24_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint8_t *d = (uint8_t *)dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
while (s < end) {
register uint16_t bgr = *s++;
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
*d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
}
}
static inline void rgb15to32_c(const uint8_t *src, uint8_t *dst, int src_size)
{
uint8_t *d = dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
while (s < end) {
register uint16_t bgr = *s++;
#if HAVE_BIGENDIAN
*d++ = 255;
*d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
*d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
#else
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
*d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
*d++ = 255;
#endif
}
}
static inline void rgb16to32_c(const uint8_t *src, uint8_t *dst, int src_size)
{
uint8_t *d = dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
while (s < end) {
register uint16_t bgr = *s++;
#if HAVE_BIGENDIAN
*d++ = 255;
*d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
*d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
#else
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
*d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
*d++ = 255;
#endif
}
}
static inline void shuffle_bytes_2103_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
int idx = 15 - src_size;
const uint8_t *s = src - idx;
uint8_t *d = dst - idx;
for (; idx < 15; idx += 4) {
register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
v &= 0xff00ff;
*(uint32_t *)&d[idx] = (v >> 16) + g + (v << 16);
}
}
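/* The idx bias above is just a compact word loop; on a little-endian host
 * the function is equivalent to this sketch, which swaps bytes 0 and 2 of
 * every 32-bit pixel (BGRA <-> RGBA) and copies the other two:
 *
 *     for (k = 0; k + 3 < src_size; k += 4) {
 *         dst[k + 0] = src[k + 2];
 *         dst[k + 1] = src[k + 1];
 *         dst[k + 2] = src[k + 0];
 *         dst[k + 3] = src[k + 3];
 *     }
 */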
static inline void rgb24tobgr24_c(const uint8_t *src, uint8_t *dst, int src_size)
{
unsigned i;
for (i = 0; i < src_size; i += 3) {
register uint8_t x = src[i + 2];
dst[i + 1] = src[i + 1];
dst[i + 2] = src[i + 0];
dst[i + 0] = x;
}
}
static inline void yuvPlanartoyuy2_c(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride,
int dstStride, int vertLumPerChroma)
{
int y, i;
const int chromWidth = width >> 1;
for (y = 0; y < height; y++) {
#if HAVE_FAST_64BIT
uint64_t *ldst = (uint64_t *)dst;
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
for (i = 0; i < chromWidth; i += 2) {
uint64_t k = yc[0] + (uc[0] << 8) +
(yc[1] << 16) + (unsigned)(vc[0] << 24);
uint64_t l = yc[2] + (uc[1] << 8) +
(yc[3] << 16) + (unsigned)(vc[1] << 24);
*ldst++ = k + (l << 32);
yc += 4;
uc += 2;
vc += 2;
}
#else
int *idst = (int32_t *)dst;
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
for (i = 0; i < chromWidth; i++) {
#if HAVE_BIGENDIAN
*idst++ = (yc[0] << 24) + (uc[0] << 16) +
(yc[1] << 8) + (vc[0] << 0);
#else
*idst++ = yc[0] + (uc[0] << 8) +
(yc[1] << 16) + (vc[0] << 24);
#endif
yc += 2;
uc++;
vc++;
}
#endif
if ((y & (vertLumPerChroma - 1)) == vertLumPerChroma - 1) {
usrc += chromStride;
vsrc += chromStride;
}
ysrc += lumStride;
dst += dstStride;
}
}
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
static inline void yv12toyuy2_c(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height, int lumStride,
int chromStride, int dstStride)
{
//FIXME interpolate chroma
yuvPlanartoyuy2_c(ysrc, usrc, vsrc, dst, width, height, lumStride,
chromStride, dstStride, 2);
}
static inline void yuvPlanartouyvy_c(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride,
int dstStride, int vertLumPerChroma)
{
int y, i;
const int chromWidth = width >> 1;
for (y = 0; y < height; y++) {
#if HAVE_FAST_64BIT
uint64_t *ldst = (uint64_t *)dst;
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
for (i = 0; i < chromWidth; i += 2) {
uint64_t k = uc[0] + (yc[0] << 8) +
(vc[0] << 16) + (unsigned)(yc[1] << 24);
uint64_t l = uc[1] + (yc[2] << 8) +
(vc[1] << 16) + (unsigned)(yc[3] << 24);
*ldst++ = k + (l << 32);
yc += 4;
uc += 2;
vc += 2;
}
#else
int *idst = (int32_t *)dst;
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
for (i = 0; i < chromWidth; i++) {
#if HAVE_BIGENDIAN
*idst++ = (uc[0] << 24) + (yc[0] << 16) +
(vc[0] << 8) + (yc[1] << 0);
#else
*idst++ = uc[0] + (yc[0] << 8) +
(vc[0] << 16) + (yc[1] << 24);
#endif
yc += 2;
uc++;
vc++;
}
#endif
if ((y & (vertLumPerChroma - 1)) == vertLumPerChroma - 1) {
usrc += chromStride;
vsrc += chromStride;
}
ysrc += lumStride;
dst += dstStride;
}
}
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
static inline void yv12touyvy_c(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height, int lumStride,
int chromStride, int dstStride)
{
//FIXME interpolate chroma
yuvPlanartouyvy_c(ysrc, usrc, vsrc, dst, width, height, lumStride,
chromStride, dstStride, 2);
}
/**
* Width should be a multiple of 16.
*/
static inline void yuv422ptouyvy_c(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height, int lumStride,
int chromStride, int dstStride)
{
yuvPlanartouyvy_c(ysrc, usrc, vsrc, dst, width, height, lumStride,
chromStride, dstStride, 1);
}
/**
* Width should be a multiple of 16.
*/
static inline void yuv422ptoyuy2_c(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height, int lumStride,
int chromStride, int dstStride)
{
yuvPlanartoyuy2_c(ysrc, usrc, vsrc, dst, width, height, lumStride,
chromStride, dstStride, 1);
}
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
static inline void yuy2toyv12_c(const uint8_t *src, uint8_t *ydst,
uint8_t *udst, uint8_t *vdst,
int width, int height, int lumStride,
int chromStride, int srcStride)
{
int y;
const int chromWidth = width >> 1;
for (y = 0; y < height; y += 2) {
int i;
for (i = 0; i < chromWidth; i++) {
ydst[2 * i + 0] = src[4 * i + 0];
udst[i] = src[4 * i + 1];
ydst[2 * i + 1] = src[4 * i + 2];
vdst[i] = src[4 * i + 3];
}
ydst += lumStride;
src += srcStride;
for (i = 0; i < chromWidth; i++) {
ydst[2 * i + 0] = src[4 * i + 0];
ydst[2 * i + 1] = src[4 * i + 2];
}
udst += chromStride;
vdst += chromStride;
ydst += lumStride;
src += srcStride;
}
}
static inline void planar2x_c(const uint8_t *src, uint8_t *dst, int srcWidth,
int srcHeight, int srcStride, int dstStride)
{
int x, y;
dst[0] = src[0];
// first line
for (x = 0; x < srcWidth - 1; x++) {
dst[2 * x + 1] = (3 * src[x] + src[x + 1]) >> 2;
dst[2 * x + 2] = (src[x] + 3 * src[x + 1]) >> 2;
}
dst[2 * srcWidth - 1] = src[srcWidth - 1];
dst += dstStride;
for (y = 1; y < srcHeight; y++) {
const int mmxSize = 1;
dst[0] = (src[0] * 3 + src[srcStride]) >> 2;
dst[dstStride] = (src[0] + 3 * src[srcStride]) >> 2;
for (x = mmxSize - 1; x < srcWidth - 1; x++) {
dst[2 * x + 1] = (src[x + 0] * 3 + src[x + srcStride + 1]) >> 2;
dst[2 * x + dstStride + 2] = (src[x + 0] + 3 * src[x + srcStride + 1]) >> 2;
dst[2 * x + dstStride + 1] = (src[x + 1] + 3 * src[x + srcStride]) >> 2;
dst[2 * x + 2] = (src[x + 1] * 3 + src[x + srcStride]) >> 2;
}
dst[srcWidth * 2 - 1] = (src[srcWidth - 1] * 3 + src[srcWidth - 1 + srcStride]) >> 2;
dst[srcWidth * 2 - 1 + dstStride] = (src[srcWidth - 1] + 3 * src[srcWidth - 1 + srcStride]) >> 2;
dst += dstStride * 2;
src += srcStride;
}
// last line
dst[0] = src[0];
for (x = 0; x < srcWidth - 1; x++) {
dst[2 * x + 1] = (src[x] * 3 + src[x + 1]) >> 2;
dst[2 * x + 2] = (src[x] + 3 * src[x + 1]) >> 2;
}
dst[2 * srcWidth - 1] = src[srcWidth - 1];
}
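/* planar2x_c above is a fixed-kernel 2x bilinear upscaler using [3 1]/4
 * and [1 3]/4 taps (mmxSize is apparently a leftover from a removed MMX
 * path); e.g. neighbouring source samples 100 and 200 expand to
 * (3*100 + 200) >> 2 = 125 and (100 + 3*200) >> 2 = 175, i.e. new samples
 * at 1/4 and 3/4 of the way between the originals. */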
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
* Chrominance data is only taken from every second line, others are ignored.
* FIXME: Write HQ version.
*/
static inline void uyvytoyv12_c(const uint8_t *src, uint8_t *ydst,
uint8_t *udst, uint8_t *vdst,
int width, int height, int lumStride,
int chromStride, int srcStride)
{
int y;
const int chromWidth = width >> 1;
for (y = 0; y < height; y += 2) {
int i;
for (i = 0; i < chromWidth; i++) {
udst[i] = src[4 * i + 0];
ydst[2 * i + 0] = src[4 * i + 1];
vdst[i] = src[4 * i + 2];
ydst[2 * i + 1] = src[4 * i + 3];
}
ydst += lumStride;
src += srcStride;
for (i = 0; i < chromWidth; i++) {
ydst[2 * i + 0] = src[4 * i + 1];
ydst[2 * i + 1] = src[4 * i + 3];
}
udst += chromStride;
vdst += chromStride;
ydst += lumStride;
src += srcStride;
}
}
/**
* Height should be a multiple of 2 and width should be a multiple of 2.
* (If this is a problem for anyone then tell me, and I will fix it.)
* Chrominance data is only taken from every second line,
* others are ignored in the C version.
* FIXME: Write HQ version.
*/
void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
uint8_t *vdst, int width, int height, int lumStride,
int chromStride, int srcStride, int32_t *rgb2yuv)
{
int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
int y;
const int chromWidth = width >> 1;
for (y = 0; y < height; y += 2) {
int i;
for (i = 0; i < chromWidth; i++) {
unsigned int b = src[6 * i + 0];
unsigned int g = src[6 * i + 1];
unsigned int r = src[6 * i + 2];
unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128;
unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128;
udst[i] = U;
vdst[i] = V;
ydst[2 * i] = Y;
b = src[6 * i + 3];
g = src[6 * i + 4];
r = src[6 * i + 5];
Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
ydst[2 * i + 1] = Y;
}
ydst += lumStride;
src += srcStride;
if (y+1 == height)
break;
for (i = 0; i < chromWidth; i++) {
unsigned int b = src[6 * i + 0];
unsigned int g = src[6 * i + 1];
unsigned int r = src[6 * i + 2];
unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
ydst[2 * i] = Y;
b = src[6 * i + 3];
g = src[6 * i + 4];
r = src[6 * i + 5];
Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
ydst[2 * i + 1] = Y;
}
udst += chromStride;
vdst += chromStride;
ydst += lumStride;
src += srcStride;
}
}
static void interleaveBytes_c(const uint8_t *src1, const uint8_t *src2,
uint8_t *dest, int width, int height,
int src1Stride, int src2Stride, int dstStride)
{
int h;
for (h = 0; h < height; h++) {
int w;
for (w = 0; w < width; w++) {
dest[2 * w + 0] = src1[w];
dest[2 * w + 1] = src2[w];
}
dest += dstStride;
src1 += src1Stride;
src2 += src2Stride;
}
}
static inline void vu9_to_vu12_c(const uint8_t *src1, const uint8_t *src2,
uint8_t *dst1, uint8_t *dst2,
int width, int height,
int srcStride1, int srcStride2,
int dstStride1, int dstStride2)
{
int x, y;
int w = width / 2;
int h = height / 2;
for (y = 0; y < h; y++) {
const uint8_t *s1 = src1 + srcStride1 * (y >> 1);
uint8_t *d = dst1 + dstStride1 * y;
for (x = 0; x < w; x++)
d[2 * x] = d[2 * x + 1] = s1[x];
}
for (y = 0; y < h; y++) {
const uint8_t *s2 = src2 + srcStride2 * (y >> 1);
uint8_t *d = dst2 + dstStride2 * y;
for (x = 0; x < w; x++)
d[2 * x] = d[2 * x + 1] = s2[x];
}
}
static inline void yvu9_to_yuy2_c(const uint8_t *src1, const uint8_t *src2,
const uint8_t *src3, uint8_t *dst,
int width, int height,
int srcStride1, int srcStride2,
int srcStride3, int dstStride)
{
int x, y;
int w = width / 2;
int h = height;
for (y = 0; y < h; y++) {
const uint8_t *yp = src1 + srcStride1 * y;
const uint8_t *up = src2 + srcStride2 * (y >> 2);
const uint8_t *vp = src3 + srcStride3 * (y >> 2);
uint8_t *d = dst + dstStride * y;
for (x = 0; x < w; x++) {
const int x2 = x << 2;
d[8 * x + 0] = yp[x2];
d[8 * x + 1] = up[x];
d[8 * x + 2] = yp[x2 + 1];
d[8 * x + 3] = vp[x];
d[8 * x + 4] = yp[x2 + 2];
d[8 * x + 5] = up[x];
d[8 * x + 6] = yp[x2 + 3];
d[8 * x + 7] = vp[x];
}
}
}
static void extract_even_c(const uint8_t *src, uint8_t *dst, int count)
{
dst += count;
src += count * 2;
count = -count;
while (count < 0) {
dst[count] = src[2 * count];
count++;
}
}
static void extract_even2_c(const uint8_t *src, uint8_t *dst0, uint8_t *dst1,
int count)
{
dst0 += count;
dst1 += count;
src += count * 4;
count = -count;
while (count < 0) {
dst0[count] = src[4 * count + 0];
dst1[count] = src[4 * count + 2];
count++;
}
}
static void extract_even2avg_c(const uint8_t *src0, const uint8_t *src1,
uint8_t *dst0, uint8_t *dst1, int count)
{
dst0 += count;
dst1 += count;
src0 += count * 4;
src1 += count * 4;
count = -count;
while (count < 0) {
dst0[count] = (src0[4 * count + 0] + src1[4 * count + 0]) >> 1;
dst1[count] = (src0[4 * count + 2] + src1[4 * count + 2]) >> 1;
count++;
}
}
static void extract_odd2_c(const uint8_t *src, uint8_t *dst0, uint8_t *dst1,
int count)
{
dst0 += count;
dst1 += count;
src += count * 4;
count = -count;
src++;
while (count < 0) {
dst0[count] = src[4 * count + 0];
dst1[count] = src[4 * count + 2];
count++;
}
}
static void extract_odd2avg_c(const uint8_t *src0, const uint8_t *src1,
uint8_t *dst0, uint8_t *dst1, int count)
{
dst0 += count;
dst1 += count;
src0 += count * 4;
src1 += count * 4;
count = -count;
src0++;
src1++;
while (count < 0) {
dst0[count] = (src0[4 * count + 0] + src1[4 * count + 0]) >> 1;
dst1[count] = (src0[4 * count + 2] + src1[4 * count + 2]) >> 1;
count++;
}
}
static void yuyvtoyuv420_c(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride)
{
int y;
const int chromWidth = FF_CEIL_RSHIFT(width, 1);
for (y = 0; y < height; y++) {
extract_even_c(src, ydst, width);
if (y & 1) {
extract_odd2avg_c(src - srcStride, src, udst, vdst, chromWidth);
udst += chromStride;
vdst += chromStride;
}
src += srcStride;
ydst += lumStride;
}
}
static void yuyvtoyuv422_c(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride)
{
int y;
const int chromWidth = FF_CEIL_RSHIFT(width, 1);
for (y = 0; y < height; y++) {
extract_even_c(src, ydst, width);
extract_odd2_c(src, udst, vdst, chromWidth);
src += srcStride;
ydst += lumStride;
udst += chromStride;
vdst += chromStride;
}
}
static void uyvytoyuv420_c(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride)
{
int y;
const int chromWidth = FF_CEIL_RSHIFT(width, 1);
for (y = 0; y < height; y++) {
extract_even_c(src + 1, ydst, width);
if (y & 1) {
extract_even2avg_c(src - srcStride, src, udst, vdst, chromWidth);
udst += chromStride;
vdst += chromStride;
}
src += srcStride;
ydst += lumStride;
}
}
static void uyvytoyuv422_c(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride)
{
int y;
const int chromWidth = FF_CEIL_RSHIFT(width, 1);
for (y = 0; y < height; y++) {
extract_even_c(src + 1, ydst, width);
extract_even2_c(src, udst, vdst, chromWidth);
src += srcStride;
ydst += lumStride;
udst += chromStride;
vdst += chromStride;
}
}
static av_cold void rgb2rgb_init_c(void)
{
rgb15to16 = rgb15to16_c;
rgb15tobgr24 = rgb15tobgr24_c;
rgb15to32 = rgb15to32_c;
rgb16tobgr24 = rgb16tobgr24_c;
rgb16to32 = rgb16to32_c;
rgb16to15 = rgb16to15_c;
rgb24tobgr16 = rgb24tobgr16_c;
rgb24tobgr15 = rgb24tobgr15_c;
rgb24tobgr32 = rgb24tobgr32_c;
rgb32to16 = rgb32to16_c;
rgb32to15 = rgb32to15_c;
rgb32tobgr24 = rgb32tobgr24_c;
rgb24to15 = rgb24to15_c;
rgb24to16 = rgb24to16_c;
rgb24tobgr24 = rgb24tobgr24_c;
shuffle_bytes_2103 = shuffle_bytes_2103_c;
rgb32tobgr16 = rgb32tobgr16_c;
rgb32tobgr15 = rgb32tobgr15_c;
yv12toyuy2 = yv12toyuy2_c;
yv12touyvy = yv12touyvy_c;
yuv422ptoyuy2 = yuv422ptoyuy2_c;
yuv422ptouyvy = yuv422ptouyvy_c;
yuy2toyv12 = yuy2toyv12_c;
planar2x = planar2x_c;
ff_rgb24toyv12 = ff_rgb24toyv12_c;
interleaveBytes = interleaveBytes_c;
vu9_to_vu12 = vu9_to_vu12_c;
yvu9_to_yuy2 = yvu9_to_yuy2_c;
uyvytoyuv420 = uyvytoyuv420_c;
uyvytoyuv422 = uyvytoyuv422_c;
yuyvtoyuv420 = yuyvtoyuv420_c;
yuyvtoyuv422 = yuyvtoyuv422_c;
}

View File

@@ -0,0 +1 @@
VIS-OBJS += sparc/yuv2rgb_vis.o \

View File

@@ -0,0 +1,212 @@
/*
* VIS optimized software YUV to RGB converter
* Copyright (c) 2007 Denes Balatoni <dbalatoni@programozo.hu>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <inttypes.h>
#include <stdlib.h>
#include "libavutil/attributes.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#define YUV2RGB_INIT \
"wr %%g0, 0x10, %%gsr \n\t" \
"ldd [%5], %%f32 \n\t" \
"ldd [%5 + 8], %%f34 \n\t" \
"ldd [%5 + 16], %%f36 \n\t" \
"ldd [%5 + 24], %%f38 \n\t" \
"ldd [%5 + 32], %%f40 \n\t" \
"ldd [%5 + 40], %%f42 \n\t" \
"ldd [%5 + 48], %%f44 \n\t" \
"ldd [%5 + 56], %%f46 \n\t" \
"ldd [%5 + 64], %%f48 \n\t" \
"ldd [%5 + 72], %%f50 \n\t"
#define YUV2RGB_KERNEL \
/* ^^^^ f0=Y f3=u f5=v */ \
"fmul8x16 %%f3, %%f48, %%f6 \n\t" \
"fmul8x16 %%f19, %%f48, %%f22 \n\t" \
"fmul8x16 %%f5, %%f44, %%f8 \n\t" \
"fmul8x16 %%f21, %%f44, %%f24 \n\t" \
"fmul8x16 %%f0, %%f42, %%f0 \n\t" \
"fmul8x16 %%f16, %%f42, %%f16 \n\t" \
"fmul8x16 %%f3, %%f50, %%f2 \n\t" \
"fmul8x16 %%f19, %%f50, %%f18 \n\t" \
"fmul8x16 %%f5, %%f46, %%f4 \n\t" \
"fmul8x16 %%f21, %%f46, %%f20 \n\t" \
\
"fpsub16 %%f6, %%f34, %%f6 \n\t" /* 1 */ \
"fpsub16 %%f22, %%f34, %%f22 \n\t" /* 1 */ \
"fpsub16 %%f8, %%f38, %%f8 \n\t" /* 3 */ \
"fpsub16 %%f24, %%f38, %%f24 \n\t" /* 3 */ \
"fpsub16 %%f0, %%f32, %%f0 \n\t" /* 0 */ \
"fpsub16 %%f16, %%f32, %%f16 \n\t" /* 0 */ \
"fpsub16 %%f2, %%f36, %%f2 \n\t" /* 2 */ \
"fpsub16 %%f18, %%f36, %%f18 \n\t" /* 2 */ \
"fpsub16 %%f4, %%f40, %%f4 \n\t" /* 4 */ \
"fpsub16 %%f20, %%f40, %%f20 \n\t" /* 4 */ \
\
"fpadd16 %%f0, %%f8, %%f8 \n\t" /* Gt */ \
"fpadd16 %%f16, %%f24, %%f24 \n\t" /* Gt */ \
"fpadd16 %%f0, %%f4, %%f4 \n\t" /* R */ \
"fpadd16 %%f16, %%f20, %%f20 \n\t" /* R */ \
"fpadd16 %%f0, %%f6, %%f6 \n\t" /* B */ \
"fpadd16 %%f16, %%f22, %%f22 \n\t" /* B */ \
"fpadd16 %%f8, %%f2, %%f2 \n\t" /* G */ \
"fpadd16 %%f24, %%f18, %%f18 \n\t" /* G */ \
\
"fpack16 %%f4, %%f4 \n\t" \
"fpack16 %%f20, %%f20 \n\t" \
"fpack16 %%f6, %%f6 \n\t" \
"fpack16 %%f22, %%f22 \n\t" \
"fpack16 %%f2, %%f2 \n\t" \
"fpack16 %%f18, %%f18 \n\t"
// FIXME: must be changed to set alpha to 255 instead of 0
static int vis_420P_ARGB32(SwsContext *c, uint8_t *src[], int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, out1, out2, out3, out4, out5, out6;
for (y = 0; y < srcSliceH; ++y)
__asm__ volatile (
YUV2RGB_INIT
"wr %%g0, 0xd2, %%asi \n\t" /* ASI_FL16_P */
"1: \n\t"
"ldda [%1] %%asi, %%f2 \n\t"
"ldda [%1 + 2] %%asi, %%f18 \n\t"
"ldda [%2] %%asi, %%f4 \n\t"
"ldda [%2 + 2] %%asi, %%f20 \n\t"
"ld [%0], %%f0 \n\t"
"ld [%0+4], %%f16 \n\t"
"fpmerge %%f3, %%f3, %%f2 \n\t"
"fpmerge %%f19, %%f19, %%f18 \n\t"
"fpmerge %%f5, %%f5, %%f4 \n\t"
"fpmerge %%f21, %%f21, %%f20 \n\t"
YUV2RGB_KERNEL
"fzero %%f0 \n\t"
"fpmerge %%f4, %%f6, %%f8 \n\t" // r, b, t1
"fpmerge %%f20, %%f22, %%f24 \n\t" // r, b, t1
"fpmerge %%f0, %%f2, %%f10 \n\t" // 0, g, t2
"fpmerge %%f0, %%f18, %%f26 \n\t" // 0, g, t2
"fpmerge %%f10, %%f8, %%f4 \n\t" // t2, t1, msb
"fpmerge %%f26, %%f24, %%f20 \n\t" // t2, t1, msb
"fpmerge %%f11, %%f9, %%f6 \n\t" // t2, t1, lsb
"fpmerge %%f27, %%f25, %%f22 \n\t" // t2, t1, lsb
"std %%f4, [%3] \n\t"
"std %%f20, [%3 + 16] \n\t"
"std %%f6, [%3 + 8] \n\t"
"std %%f22, [%3 + 24] \n\t"
"add %0, 8, %0 \n\t"
"add %1, 4, %1 \n\t"
"add %2, 4, %2 \n\t"
"subcc %4, 8, %4 \n\t"
"bne 1b \n\t"
"add %3, 32, %3 \n\t" // delay slot
: "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4), "=r" (out5), "=r" (out6)
: "0" (src[0] + (y + srcSliceY) * srcStride[0]), "1" (src[1] + ((y + srcSliceY) >> 1) * srcStride[1]),
"2" (src[2] + ((y + srcSliceY) >> 1) * srcStride[2]), "3" (dst[0] + (y + srcSliceY) * dstStride[0]),
"4" (c->dstW),
"5" (c->sparc_coeffs)
);
return srcSliceH;
}
// FIXME: must be changed to set alpha to 255 instead of 0
static int vis_422P_ARGB32(SwsContext *c, uint8_t *src[], int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, out1, out2, out3, out4, out5, out6;
for (y = 0; y < srcSliceH; ++y)
__asm__ volatile (
YUV2RGB_INIT
"wr %%g0, 0xd2, %%asi \n\t" /* ASI_FL16_P */
"1: \n\t"
"ldda [%1] %%asi, %%f2 \n\t"
"ldda [%1 + 2] %%asi, %%f18 \n\t"
"ldda [%2] %%asi, %%f4 \n\t"
"ldda [%2 + 2] %%asi, %%f20 \n\t"
"ld [%0], %%f0 \n\t"
"ld [%0 + 4], %%f16 \n\t"
"fpmerge %%f3, %%f3, %%f2 \n\t"
"fpmerge %%f19, %%f19, %%f18 \n\t"
"fpmerge %%f5, %%f5, %%f4 \n\t"
"fpmerge %%f21, %%f21, %%f20 \n\t"
YUV2RGB_KERNEL
"fzero %%f0 \n\t"
"fpmerge %%f4, %%f6, %%f8 \n\t" // r,b,t1
"fpmerge %%f20, %%f22, %%f24 \n\t" // r,b,t1
"fpmerge %%f0, %%f2, %%f10 \n\t" // 0,g,t2
"fpmerge %%f0, %%f18, %%f26 \n\t" // 0,g,t2
"fpmerge %%f10, %%f8, %%f4 \n\t" // t2,t1,msb
"fpmerge %%f26, %%f24, %%f20 \n\t" // t2,t1,msb
"fpmerge %%f11, %%f9, %%f6 \n\t" // t2,t1,lsb
"fpmerge %%f27, %%f25, %%f22 \n\t" // t2,t1,lsb
"std %%f4, [%3] \n\t"
"std %%f20, [%3 + 16] \n\t"
"std %%f6, [%3 + 8] \n\t"
"std %%f22, [%3 + 24] \n\t"
"add %0, 8, %0 \n\t"
"add %1, 4, %1 \n\t"
"add %2, 4, %2 \n\t"
"subcc %4, 8, %4 \n\t"
"bne 1b \n\t"
"add %3, 32, %3 \n\t" //delay slot
: "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4), "=r" (out5), "=r" (out6)
: "0" (src[0] + (y + srcSliceY) * srcStride[0]), "1" (src[1] + (y + srcSliceY) * srcStride[1]),
"2" (src[2] + (y + srcSliceY) * srcStride[2]), "3" (dst[0] + (y + srcSliceY) * dstStride[0]),
"4" (c->dstW),
"5" (c->sparc_coeffs)
);
return srcSliceH;
}
av_cold SwsFunc ff_yuv2rgb_init_vis(SwsContext *c)
{
c->sparc_coeffs[5] = c->yCoeff;
c->sparc_coeffs[6] = c->vgCoeff;
c->sparc_coeffs[7] = c->vrCoeff;
c->sparc_coeffs[8] = c->ubCoeff;
c->sparc_coeffs[9] = c->ugCoeff;
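    /* Each offset is folded into its coefficient ((offset * coeff) >> 11)
     * and the resulting 16-bit value is replicated into all four 16-bit
     * lanes of a 64-bit word (multiplying by 0x0001000100010001ULL
     * duplicates the low 16 bits), so the VIS kernel can apply it with a
     * single fpsub16. */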
c->sparc_coeffs[0] = (((int16_t)c->yOffset * (int16_t)c->yCoeff >> 11) & 0xffff) * 0x0001000100010001ULL;
c->sparc_coeffs[1] = (((int16_t)c->uOffset * (int16_t)c->ubCoeff >> 11) & 0xffff) * 0x0001000100010001ULL;
c->sparc_coeffs[2] = (((int16_t)c->uOffset * (int16_t)c->ugCoeff >> 11) & 0xffff) * 0x0001000100010001ULL;
c->sparc_coeffs[3] = (((int16_t)c->vOffset * (int16_t)c->vgCoeff >> 11) & 0xffff) * 0x0001000100010001ULL;
c->sparc_coeffs[4] = (((int16_t)c->vOffset * (int16_t)c->vrCoeff >> 11) & 0xffff) * 0x0001000100010001ULL;
if (c->dstFormat == AV_PIX_FMT_RGB32 && c->srcFormat == AV_PIX_FMT_YUV422P && (c->dstW & 7) == 0) {
av_log(c, AV_LOG_INFO,
"SPARC VIS accelerated YUV422P -> RGB32 (WARNING: alpha value is wrong)\n");
return vis_422P_ARGB32;
} else if (c->dstFormat == AV_PIX_FMT_RGB32 && c->srcFormat == AV_PIX_FMT_YUV420P && (c->dstW & 7) == 0) {
av_log(c, AV_LOG_INFO,
"SPARC VIS accelerated YUV420P -> RGB32 (WARNING: alpha value is wrong)\n");
return vis_420P_ARGB32;
}
return NULL;
}

View File

@@ -0,0 +1,37 @@
EXPORTS
sws_addVec
sws_allocVec
sws_alloc_context
sws_cloneVec
sws_context_class DATA
sws_convVec
sws_convertPalette8ToPacked24
sws_convertPalette8ToPacked32
sws_format_name
sws_freeContext
sws_freeFilter
sws_freeVec
sws_getCachedContext
sws_getCoefficients
sws_getColorspaceDetails
sws_getConstVec
sws_getContext
sws_getDefaultFilter
sws_getGaussianVec
sws_getIdentityVec
sws_get_class
sws_init_context
sws_isSupportedEndiannessConversion
sws_isSupportedInput
sws_isSupportedOutput
sws_normalizeVec
sws_printVec2
sws_rgb2rgb_init
sws_scale
sws_scaleVec
sws_setColorspaceDetails
sws_shiftVec
sws_subVec
swscale_configuration
swscale_license
swscale_version

View File

@@ -0,0 +1,37 @@
EXPORTS
sws_addVec @1
sws_allocVec @2
sws_alloc_context @3
sws_cloneVec @4
sws_context_class @5 DATA
sws_convVec @6
sws_convertPalette8ToPacked24 @7
sws_convertPalette8ToPacked32 @8
sws_format_name @9
sws_freeContext @10
sws_freeFilter @11
sws_freeVec @12
sws_getCachedContext @13
sws_getCoefficients @14
sws_getColorspaceDetails @15
sws_getConstVec @16
sws_getContext @17
sws_getDefaultFilter @18
sws_getGaussianVec @19
sws_getIdentityVec @20
sws_get_class @21
sws_init_context @22
sws_isSupportedEndiannessConversion @23
sws_isSupportedInput @24
sws_isSupportedOutput @25
sws_normalizeVec @26
sws_printVec2 @27
sws_rgb2rgb_init @28
sws_scale @29
sws_scaleVec @30
sws_setColorspaceDetails @31
sws_shiftVec @32
sws_subVec @33
swscale_configuration @34
swscale_license @35
swscale_version @36

View File

@@ -0,0 +1,415 @@
/*
* Copyright (C) 2003-2011 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <stdarg.h>
#undef HAVE_AV_CONFIG_H
#include "libavutil/imgutils.h"
#include "libavutil/mem.h"
#include "libavutil/avutil.h"
#include "libavutil/crc.h"
#include "libavutil/pixdesc.h"
#include "libavutil/lfg.h"
#include "swscale.h"
/* HACK Duplicated from swscale_internal.h.
* Should be removed when a cleaner pixel format system exists. */
#define isGray(x) \
((x) == AV_PIX_FMT_GRAY8 || \
(x) == AV_PIX_FMT_Y400A || \
(x) == AV_PIX_FMT_GRAY16BE || \
(x) == AV_PIX_FMT_GRAY16LE)
#define hasChroma(x) \
(!(isGray(x) || \
(x) == AV_PIX_FMT_MONOBLACK || \
(x) == AV_PIX_FMT_MONOWHITE))
#define isALPHA(x) \
((x) == AV_PIX_FMT_BGR32 || \
(x) == AV_PIX_FMT_BGR32_1 || \
(x) == AV_PIX_FMT_RGB32 || \
(x) == AV_PIX_FMT_RGB32_1 || \
(x) == AV_PIX_FMT_YUVA420P)
static uint64_t getSSD(const uint8_t *src1, const uint8_t *src2, int stride1,
int stride2, int w, int h)
{
int x, y;
uint64_t ssd = 0;
for (y = 0; y < h; y++) {
for (x = 0; x < w; x++) {
int d = src1[x + y * stride1] - src2[x + y * stride2];
ssd += d * d;
}
}
return ssd;
}
struct Results {
uint64_t ssdY;
uint64_t ssdU;
uint64_t ssdV;
uint64_t ssdA;
uint32_t crc;
};
// test by ref -> src -> dst -> out & compare out against ref
// ref & out are YV12
static int doTest(uint8_t *ref[4], int refStride[4], int w, int h,
enum AVPixelFormat srcFormat, enum AVPixelFormat dstFormat,
int srcW, int srcH, int dstW, int dstH, int flags,
struct Results *r)
{
const AVPixFmtDescriptor *desc_yuva420p = av_pix_fmt_desc_get(AV_PIX_FMT_YUVA420P);
const AVPixFmtDescriptor *desc_src = av_pix_fmt_desc_get(srcFormat);
const AVPixFmtDescriptor *desc_dst = av_pix_fmt_desc_get(dstFormat);
static enum AVPixelFormat cur_srcFormat;
static int cur_srcW, cur_srcH;
static uint8_t *src[4];
static int srcStride[4];
uint8_t *dst[4] = { 0 };
uint8_t *out[4] = { 0 };
int dstStride[4] = {0};
int i;
uint64_t ssdY, ssdU = 0, ssdV = 0, ssdA = 0;
struct SwsContext *dstContext = NULL, *outContext = NULL;
uint32_t crc = 0;
int res = 0;
if (cur_srcFormat != srcFormat || cur_srcW != srcW || cur_srcH != srcH) {
struct SwsContext *srcContext = NULL;
int p;
for (p = 0; p < 4; p++)
av_freep(&src[p]);
av_image_fill_linesizes(srcStride, srcFormat, srcW);
for (p = 0; p < 4; p++) {
srcStride[p] = FFALIGN(srcStride[p], 16);
if (srcStride[p])
src[p] = av_mallocz(srcStride[p] * srcH + 16);
if (srcStride[p] && !src[p]) {
perror("Malloc");
res = -1;
goto end;
}
}
srcContext = sws_getContext(w, h, AV_PIX_FMT_YUVA420P, srcW, srcH,
srcFormat, SWS_BILINEAR, NULL, NULL, NULL);
if (!srcContext) {
fprintf(stderr, "Failed to get %s ---> %s\n",
desc_yuva420p->name,
desc_src->name);
res = -1;
goto end;
}
sws_scale(srcContext, (const uint8_t * const*)ref, refStride, 0, h, src, srcStride);
sws_freeContext(srcContext);
cur_srcFormat = srcFormat;
cur_srcW = srcW;
cur_srcH = srcH;
}
av_image_fill_linesizes(dstStride, dstFormat, dstW);
for (i = 0; i < 4; i++) {
/* Image buffers passed into libswscale can be allocated any way you
* prefer, as long as they're aligned enough for the architecture, and
* they're freed appropriately (such as using av_free for buffers
* allocated with av_malloc). */
/* An extra 16 bytes is being allocated because some scalers may write
* out of bounds. */
dstStride[i] = FFALIGN(dstStride[i], 16);
if (dstStride[i])
dst[i] = av_mallocz(dstStride[i] * dstH + 16);
if (dstStride[i] && !dst[i]) {
perror("Malloc");
res = -1;
goto end;
}
}
dstContext = sws_getContext(srcW, srcH, srcFormat, dstW, dstH, dstFormat,
flags, NULL, NULL, NULL);
if (!dstContext) {
fprintf(stderr, "Failed to get %s ---> %s\n",
desc_src->name, desc_dst->name);
res = -1;
goto end;
}
printf(" %s %dx%d -> %s %3dx%3d flags=%2d",
desc_src->name, srcW, srcH,
desc_dst->name, dstW, dstH,
flags);
fflush(stdout);
sws_scale(dstContext, (const uint8_t * const*)src, srcStride, 0, srcH, dst, dstStride);
for (i = 0; i < 4 && dstStride[i]; i++)
crc = av_crc(av_crc_get_table(AV_CRC_32_IEEE), crc, dst[i],
dstStride[i] * dstH);
if (r && crc == r->crc) {
ssdY = r->ssdY;
ssdU = r->ssdU;
ssdV = r->ssdV;
ssdA = r->ssdA;
} else {
for (i = 0; i < 4; i++) {
refStride[i] = FFALIGN(refStride[i], 16);
if (refStride[i])
out[i] = av_mallocz(refStride[i] * h);
if (refStride[i] && !out[i]) {
perror("Malloc");
res = -1;
goto end;
}
}
outContext = sws_getContext(dstW, dstH, dstFormat, w, h,
AV_PIX_FMT_YUVA420P, SWS_BILINEAR,
NULL, NULL, NULL);
if (!outContext) {
fprintf(stderr, "Failed to get %s ---> %s\n",
desc_dst->name,
desc_yuva420p->name);
res = -1;
goto end;
}
sws_scale(outContext, (const uint8_t * const*)dst, dstStride, 0, dstH, out, refStride);
ssdY = getSSD(ref[0], out[0], refStride[0], refStride[0], w, h);
if (hasChroma(srcFormat) && hasChroma(dstFormat)) {
//FIXME check that output is really gray
ssdU = getSSD(ref[1], out[1], refStride[1], refStride[1],
(w + 1) >> 1, (h + 1) >> 1);
ssdV = getSSD(ref[2], out[2], refStride[2], refStride[2],
(w + 1) >> 1, (h + 1) >> 1);
}
if (isALPHA(srcFormat) && isALPHA(dstFormat))
ssdA = getSSD(ref[3], out[3], refStride[3], refStride[3], w, h);
ssdY /= w * h;
ssdU /= w * h / 4;
ssdV /= w * h / 4;
ssdA /= w * h;
sws_freeContext(outContext);
for (i = 0; i < 4; i++)
if (refStride[i])
av_free(out[i]);
}
printf(" CRC=%08x SSD=%5"PRId64 ",%5"PRId64 ",%5"PRId64 ",%5"PRId64 "\n",
crc, ssdY, ssdU, ssdV, ssdA);
end:
sws_freeContext(dstContext);
for (i = 0; i < 4; i++)
if (dstStride[i])
av_free(dst[i]);
return res;
}
static void selfTest(uint8_t *ref[4], int refStride[4], int w, int h,
enum AVPixelFormat srcFormat_in,
enum AVPixelFormat dstFormat_in)
{
const int flags[] = { SWS_FAST_BILINEAR, SWS_BILINEAR, SWS_BICUBIC,
SWS_X, SWS_POINT, SWS_AREA, 0 };
const int srcW = w;
const int srcH = h;
const int dstW[] = { srcW - srcW / 3, srcW, srcW + srcW / 3, 0 };
const int dstH[] = { srcH - srcH / 3, srcH, srcH + srcH / 3, 0 };
enum AVPixelFormat srcFormat, dstFormat;
const AVPixFmtDescriptor *desc_src, *desc_dst;
for (srcFormat = srcFormat_in != AV_PIX_FMT_NONE ? srcFormat_in : 0;
srcFormat < AV_PIX_FMT_NB; srcFormat++) {
if (!sws_isSupportedInput(srcFormat) ||
!sws_isSupportedOutput(srcFormat))
continue;
desc_src = av_pix_fmt_desc_get(srcFormat);
for (dstFormat = dstFormat_in != AV_PIX_FMT_NONE ? dstFormat_in : 0;
dstFormat < AV_PIX_FMT_NB; dstFormat++) {
int i, j, k;
int res = 0;
if (!sws_isSupportedInput(dstFormat) ||
!sws_isSupportedOutput(dstFormat))
continue;
desc_dst = av_pix_fmt_desc_get(dstFormat);
printf("%s -> %s\n", desc_src->name, desc_dst->name);
fflush(stdout);
for (k = 0; flags[k] && !res; k++)
for (i = 0; dstW[i] && !res; i++)
for (j = 0; dstH[j] && !res; j++)
res = doTest(ref, refStride, w, h,
srcFormat, dstFormat,
srcW, srcH, dstW[i], dstH[j], flags[k],
NULL);
if (dstFormat_in != AV_PIX_FMT_NONE)
break;
}
if (srcFormat_in != AV_PIX_FMT_NONE)
break;
}
}
static int fileTest(uint8_t *ref[4], int refStride[4], int w, int h, FILE *fp,
enum AVPixelFormat srcFormat_in,
enum AVPixelFormat dstFormat_in)
{
char buf[256];
while (fgets(buf, sizeof(buf), fp)) {
struct Results r;
enum AVPixelFormat srcFormat;
char srcStr[12];
int srcW, srcH;
enum AVPixelFormat dstFormat;
char dstStr[12];
int dstW, dstH;
int flags;
int ret;
ret = sscanf(buf,
" %12s %dx%d -> %12s %dx%d flags=%d CRC=%x"
" SSD=%"SCNd64 ", %"SCNd64 ", %"SCNd64 ", %"SCNd64 "\n",
srcStr, &srcW, &srcH, dstStr, &dstW, &dstH,
&flags, &r.crc, &r.ssdY, &r.ssdU, &r.ssdV, &r.ssdA);
if (ret != 12) {
srcStr[0] = dstStr[0] = 0;
ret = sscanf(buf, "%12s -> %12s\n", srcStr, dstStr);
}
srcFormat = av_get_pix_fmt(srcStr);
dstFormat = av_get_pix_fmt(dstStr);
if (srcFormat == AV_PIX_FMT_NONE || dstFormat == AV_PIX_FMT_NONE ||
srcW > 8192U || srcH > 8192U || dstW > 8192U || dstH > 8192U) {
fprintf(stderr, "malformed input file\n");
return -1;
}
if ((srcFormat_in != AV_PIX_FMT_NONE && srcFormat_in != srcFormat) ||
(dstFormat_in != AV_PIX_FMT_NONE && dstFormat_in != dstFormat))
continue;
if (ret != 12) {
printf("%s", buf);
continue;
}
doTest(ref, refStride, w, h,
srcFormat, dstFormat,
srcW, srcH, dstW, dstH, flags,
&r);
}
return 0;
}
#define W 96
#define H 96
int main(int argc, char **argv)
{
enum AVPixelFormat srcFormat = AV_PIX_FMT_NONE;
enum AVPixelFormat dstFormat = AV_PIX_FMT_NONE;
uint8_t *rgb_data = av_malloc(W * H * 4);
const uint8_t * const rgb_src[4] = { rgb_data, NULL, NULL, NULL };
int rgb_stride[4] = { 4 * W, 0, 0, 0 };
uint8_t *data = av_malloc(4 * W * H);
uint8_t *src[4] = { data, data + W * H, data + W * H * 2, data + W * H * 3 };
int stride[4] = { W, W, W, W };
int x, y;
struct SwsContext *sws;
AVLFG rand;
int res = -1;
int i;
FILE *fp = NULL;
if (!rgb_data || !data)
return -1;
for (i = 1; i < argc; i += 2) {
if (argv[i][0] != '-' || i + 1 == argc)
goto bad_option;
if (!strcmp(argv[i], "-ref")) {
fp = fopen(argv[i + 1], "r");
if (!fp) {
fprintf(stderr, "could not open '%s'\n", argv[i + 1]);
goto error;
}
} else if (!strcmp(argv[i], "-src")) {
srcFormat = av_get_pix_fmt(argv[i + 1]);
if (srcFormat == AV_PIX_FMT_NONE) {
fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]);
return -1;
}
} else if (!strcmp(argv[i], "-dst")) {
dstFormat = av_get_pix_fmt(argv[i + 1]);
if (dstFormat == AV_PIX_FMT_NONE) {
fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]);
return -1;
}
} else {
bad_option:
fprintf(stderr, "bad option or argument missing (%s)\n", argv[i]);
goto error;
}
}
sws = sws_getContext(W / 12, H / 12, AV_PIX_FMT_RGB32, W, H,
AV_PIX_FMT_YUVA420P, SWS_BILINEAR, NULL, NULL, NULL);
av_lfg_init(&rand, 1);
for (y = 0; y < H; y++)
for (x = 0; x < W * 4; x++)
            rgb_data[x + y * 4 * W] = av_lfg_get(&rand);
sws_scale(sws, rgb_src, rgb_stride, 0, H, src, stride);
sws_freeContext(sws);
av_free(rgb_data);
    if (fp) {
res = fileTest(src, stride, W, H, fp, srcFormat, dstFormat);
fclose(fp);
} else {
selfTest(src, stride, W, H, srcFormat, dstFormat);
res = 0;
}
error:
av_free(data);
return res;
}

File diff suppressed because it is too large

View File

@@ -0,0 +1,362 @@
/*
* Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef SWSCALE_SWSCALE_H
#define SWSCALE_SWSCALE_H
/**
* @file
* @ingroup lsws
* external API header
*/
/**
* @defgroup lsws Libswscale
* @{
*/
#include <stdint.h>
#include "libavutil/avutil.h"
#include "libavutil/log.h"
#include "libavutil/pixfmt.h"
#include "version.h"
/**
* Return the LIBSWSCALE_VERSION_INT constant.
*/
unsigned swscale_version(void);
/**
* Return the libswscale build-time configuration.
*/
const char *swscale_configuration(void);
/**
* Return the libswscale license.
*/
const char *swscale_license(void);
/* values for the flags, the stuff on the command line is different */
#define SWS_FAST_BILINEAR 1
#define SWS_BILINEAR 2
#define SWS_BICUBIC 4
#define SWS_X 8
#define SWS_POINT 0x10
#define SWS_AREA 0x20
#define SWS_BICUBLIN 0x40
#define SWS_GAUSS 0x80
#define SWS_SINC 0x100
#define SWS_LANCZOS 0x200
#define SWS_SPLINE 0x400
#define SWS_SRC_V_CHR_DROP_MASK 0x30000
#define SWS_SRC_V_CHR_DROP_SHIFT 16
#define SWS_PARAM_DEFAULT 123456
#define SWS_PRINT_INFO 0x1000
//the following 3 flags are not completely implemented
//internal chrominance subsampling info
#define SWS_FULL_CHR_H_INT 0x2000
//input subsampling info
#define SWS_FULL_CHR_H_INP 0x4000
#define SWS_DIRECT_BGR 0x8000
#define SWS_ACCURATE_RND 0x40000
#define SWS_BITEXACT 0x80000
#define SWS_ERROR_DIFFUSION 0x800000
#if FF_API_SWS_CPU_CAPS
/**
* CPU caps are autodetected now, those flags
* are only provided for API compatibility.
*/
#define SWS_CPU_CAPS_MMX 0x80000000
#define SWS_CPU_CAPS_MMXEXT 0x20000000
#define SWS_CPU_CAPS_MMX2 0x20000000
#define SWS_CPU_CAPS_3DNOW 0x40000000
#define SWS_CPU_CAPS_ALTIVEC 0x10000000
#define SWS_CPU_CAPS_BFIN 0x01000000
#define SWS_CPU_CAPS_SSE2 0x02000000
#endif
#define SWS_MAX_REDUCE_CUTOFF 0.002
#define SWS_CS_ITU709 1
#define SWS_CS_FCC 4
#define SWS_CS_ITU601 5
#define SWS_CS_ITU624 5
#define SWS_CS_SMPTE170M 5
#define SWS_CS_SMPTE240M 7
#define SWS_CS_DEFAULT 5
/**
* Return a pointer to yuv<->rgb coefficients for the given colorspace
* suitable for sws_setColorspaceDetails().
*
* @param colorspace One of the SWS_CS_* macros. If invalid,
* SWS_CS_DEFAULT is used.
*/
const int *sws_getCoefficients(int colorspace);
// when used for filters they must have an odd number of elements
// coeffs cannot be shared between vectors
typedef struct SwsVector {
double *coeff; ///< pointer to the list of coefficients
int length; ///< number of coefficients in the vector
} SwsVector;
// vectors can be shared
typedef struct SwsFilter {
SwsVector *lumH;
SwsVector *lumV;
SwsVector *chrH;
SwsVector *chrV;
} SwsFilter;
struct SwsContext;
/**
* Return a positive value if pix_fmt is a supported input format, 0
* otherwise.
*/
int sws_isSupportedInput(enum AVPixelFormat pix_fmt);
/**
* Return a positive value if pix_fmt is a supported output format, 0
* otherwise.
*/
int sws_isSupportedOutput(enum AVPixelFormat pix_fmt);
/**
* @param[in] pix_fmt the pixel format
* @return a positive value if an endianness conversion for pix_fmt is
* supported, 0 otherwise.
*/
int sws_isSupportedEndiannessConversion(enum AVPixelFormat pix_fmt);
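/**
 * Example (an illustrative sketch only): probing format support before
 * requesting a conversion.
 * @code
 * if (sws_isSupportedInput(AV_PIX_FMT_YUV420P) &&
 *     sws_isSupportedOutput(AV_PIX_FMT_RGB24)) {
 *     // both ends of a YUV420P -> RGB24 conversion are available
 * }
 * @endcode
 */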
/**
* Allocate an empty SwsContext. This must be filled and passed to
* sws_init_context(). For filling see AVOptions, options.c and
* sws_setColorspaceDetails().
*/
struct SwsContext *sws_alloc_context(void);
/**
* Initialize the swscaler context sws_context.
*
* @return zero or positive value on success, a negative value on
* error
*/
int sws_init_context(struct SwsContext *sws_context, SwsFilter *srcFilter, SwsFilter *dstFilter);
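/**
 * Example (a minimal sketch of the AVOptions path; av_opt_set_int()
 * requires libavutil/opt.h, and the option names ("srcw", "srch",
 * "src_format", etc.) are the ones defined in options.c, so verify them
 * against the build in use):
 * @code
 * struct SwsContext *ctx = sws_alloc_context();
 * av_opt_set_int(ctx, "srcw", 640, 0);
 * av_opt_set_int(ctx, "srch", 480, 0);
 * av_opt_set_int(ctx, "src_format", AV_PIX_FMT_YUV420P, 0);
 * av_opt_set_int(ctx, "dstw", 320, 0);
 * av_opt_set_int(ctx, "dsth", 240, 0);
 * av_opt_set_int(ctx, "dst_format", AV_PIX_FMT_RGB24, 0);
 * av_opt_set_int(ctx, "sws_flags", SWS_BILINEAR, 0);
 * if (sws_init_context(ctx, NULL, NULL) < 0)
 *     sws_freeContext(ctx); // init failed; context is unusable
 * @endcode
 */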
/**
* Free the swscaler context swsContext.
* If swsContext is NULL, then does nothing.
*/
void sws_freeContext(struct SwsContext *swsContext);
#if FF_API_SWS_GETCONTEXT
/**
* Allocate and return an SwsContext. You need it to perform
* scaling/conversion operations using sws_scale().
*
* @param srcW the width of the source image
* @param srcH the height of the source image
* @param srcFormat the source image format
* @param dstW the width of the destination image
* @param dstH the height of the destination image
* @param dstFormat the destination image format
* @param flags specify which algorithm and options to use for rescaling
* @return a pointer to an allocated context, or NULL in case of error
* @note this function is to be removed after a saner alternative is
* written
* @deprecated Use sws_getCachedContext() instead.
*/
struct SwsContext *sws_getContext(int srcW, int srcH, enum AVPixelFormat srcFormat,
int dstW, int dstH, enum AVPixelFormat dstFormat,
int flags, SwsFilter *srcFilter,
SwsFilter *dstFilter, const double *param);
#endif
/**
* Scale the image slice in srcSlice and put the resulting scaled
* slice in the image in dst. A slice is a sequence of consecutive
* rows in an image.
*
* Slices have to be provided in sequential order, either in
* top-bottom or bottom-top order. If slices are provided in
* non-sequential order the behavior of the function is undefined.
*
* @param c the scaling context previously created with
* sws_getContext()
* @param srcSlice the array containing the pointers to the planes of
* the source slice
* @param srcStride the array containing the strides for each plane of
* the source image
* @param srcSliceY the position in the source image of the slice to
* process, that is the number (counted starting from
* zero) in the image of the first row of the slice
* @param srcSliceH the height of the source slice, that is the number
* of rows in the slice
* @param dst the array containing the pointers to the planes of
* the destination image
* @param dstStride the array containing the strides for each plane of
* the destination image
* @return the height of the output slice
*/
int sws_scale(struct SwsContext *c, const uint8_t *const srcSlice[],
const int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *const dst[], const int dstStride[]);
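/**
 * Example (an illustrative sketch, not normative: the sizes, the formats
 * and the use of av_image_alloc() from libavutil/imgutils.h are
 * assumptions; error checks are omitted for brevity):
 * @code
 * uint8_t *src_data[4], *dst_data[4];
 * int src_linesize[4], dst_linesize[4];
 * struct SwsContext *ctx;
 *
 * av_image_alloc(src_data, src_linesize, 640, 480, AV_PIX_FMT_BGR24, 16);
 * av_image_alloc(dst_data, dst_linesize, 320, 240, AV_PIX_FMT_YUV420P, 16);
 * ctx = sws_getContext(640, 480, AV_PIX_FMT_BGR24,
 *                      320, 240, AV_PIX_FMT_YUV420P,
 *                      SWS_BILINEAR, NULL, NULL, NULL);
 * if (ctx) {
 *     // fill src_data[0] with 640x480 BGR24 pixels, then:
 *     sws_scale(ctx, (const uint8_t * const *)src_data, src_linesize,
 *               0, 480, dst_data, dst_linesize);
 *     sws_freeContext(ctx);
 * }
 * av_freep(&src_data[0]);
 * av_freep(&dst_data[0]);
 * @endcode
 */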
/**
 * @param dstRange flag indicating the white-black range of the output (1=jpeg / 0=mpeg)
 * @param srcRange flag indicating the white-black range of the input (1=jpeg / 0=mpeg)
* @param table the yuv2rgb coefficients describing the output yuv space, normally ff_yuv2rgb_coeffs[x]
* @param inv_table the yuv2rgb coefficients describing the input yuv space, normally ff_yuv2rgb_coeffs[x]
* @param brightness 16.16 fixed point brightness correction
* @param contrast 16.16 fixed point contrast correction
* @param saturation 16.16 fixed point saturation correction
* @return -1 if not supported
*/
int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
int srcRange, const int table[4], int dstRange,
int brightness, int contrast, int saturation);
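/**
 * Example (sketch; assumes "ctx" is an already created context): switch to
 * BT.709 coefficients with limited-range input and full-range output. The
 * 1 << 16 values are the 16.16 fixed-point identity for contrast and
 * saturation.
 * @code
 * const int *coefs = sws_getCoefficients(SWS_CS_ITU709);
 * if (sws_setColorspaceDetails(ctx, coefs, 0, // limited-range input
 *                              coefs, 1,      // full-range output
 *                              0, 1 << 16, 1 << 16) < 0) {
 *     // conversion not supported for this context
 * }
 * @endcode
 */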
/**
* @return -1 if not supported
*/
int sws_getColorspaceDetails(struct SwsContext *c, int **inv_table,
int *srcRange, int **table, int *dstRange,
int *brightness, int *contrast, int *saturation);
/**
* Allocate and return an uninitialized vector with length coefficients.
*/
SwsVector *sws_allocVec(int length);
/**
 * Return a normalized Gaussian curve used for filtering;
 * quality = 3 is high quality, lower is lower quality.
*/
SwsVector *sws_getGaussianVec(double variance, double quality);
/**
* Allocate and return a vector with length coefficients, all
* with the same value c.
*/
SwsVector *sws_getConstVec(double c, int length);
/**
* Allocate and return a vector with just one coefficient, with
* value 1.0.
*/
SwsVector *sws_getIdentityVec(void);
/**
* Scale all the coefficients of a by the scalar value.
*/
void sws_scaleVec(SwsVector *a, double scalar);
/**
* Scale all the coefficients of a so that their sum equals height.
*/
void sws_normalizeVec(SwsVector *a, double height);
void sws_convVec(SwsVector *a, SwsVector *b);
void sws_addVec(SwsVector *a, SwsVector *b);
void sws_subVec(SwsVector *a, SwsVector *b);
void sws_shiftVec(SwsVector *a, int shift);
/**
* Allocate and return a clone of the vector a, that is a vector
* with the same coefficients as a.
*/
SwsVector *sws_cloneVec(SwsVector *a);
/**
* Print with av_log() a textual representation of the vector a
* if log_level <= av_log_level.
*/
void sws_printVec2(SwsVector *a, AVClass *log_ctx, int log_level);
void sws_freeVec(SwsVector *a);
SwsFilter *sws_getDefaultFilter(float lumaGBlur, float chromaGBlur,
float lumaSharpen, float chromaSharpen,
float chromaHShift, float chromaVShift,
int verbose);
void sws_freeFilter(SwsFilter *filter);
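/**
 * Example (sketch; "w", "h" and "fmt" are placeholders): Gaussian blur on
 * the luma plane only. One vector may be shared between filter slots
 * (vectors can be shared, coefficient arrays cannot).
 * @code
 * SwsVector *g = sws_getGaussianVec(2.0, 3.0); // variance 2.0, quality 3
 * SwsFilter f = { g, g, NULL, NULL };          // lumH, lumV, chrH, chrV
 * struct SwsContext *ctx = sws_getContext(w, h, fmt, w, h, fmt,
 *                                         SWS_BICUBIC, &f, NULL, NULL);
 * // ... scale with ctx ...
 * sws_freeContext(ctx);
 * sws_freeVec(g);
 * @endcode
 */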
/**
* Check if context can be reused, otherwise reallocate a new one.
*
* If context is NULL, just calls sws_getContext() to get a new
* context. Otherwise, checks if the parameters are the ones already
* saved in context. If that is the case, returns the current
* context. Otherwise, frees context and gets a new context with
* the new parameters.
*
* Be warned that srcFilter and dstFilter are not checked, they
* are assumed to remain the same.
*/
struct SwsContext *sws_getCachedContext(struct SwsContext *context,
int srcW, int srcH, enum AVPixelFormat srcFormat,
int dstW, int dstH, enum AVPixelFormat dstFormat,
int flags, SwsFilter *srcFilter,
SwsFilter *dstFilter, const double *param);
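/**
 * Example (sketch; "frames", its fields and the out_* variables are
 * hypothetical): reusing one context across frames whose dimensions may
 * change mid-stream.
 * @code
 * struct SwsContext *ctx = NULL;
 * int i;
 * for (i = 0; i < nb_frames; i++) {
 *     ctx = sws_getCachedContext(ctx,
 *                                frames[i].w, frames[i].h, frames[i].fmt,
 *                                out_w, out_h, AV_PIX_FMT_YUV420P,
 *                                SWS_BILINEAR, NULL, NULL, NULL);
 *     if (!ctx)
 *         break;
 *     sws_scale(ctx, frames[i].data, frames[i].linesize,
 *               0, frames[i].h, out_data, out_linesize);
 * }
 * sws_freeContext(ctx);
 * @endcode
 */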
/**
* Convert an 8-bit paletted frame into a frame with a color depth of 32 bits.
*
* The output frame will have the same packed format as the palette.
*
* @param src source frame buffer
* @param dst destination frame buffer
* @param num_pixels number of pixels to convert
* @param palette array with [256] entries, which must match color arrangement (RGB or BGR) of src
*/
void sws_convertPalette8ToPacked32(const uint8_t *src, uint8_t *dst, int num_pixels, const uint8_t *palette);
/**
* Convert an 8-bit paletted frame into a frame with a color depth of 24 bits.
*
* With the palette format "ABCD", the destination frame ends up with the format "ABC".
*
* @param src source frame buffer
* @param dst destination frame buffer
* @param num_pixels number of pixels to convert
* @param palette array with [256] entries, which must match color arrangement (RGB or BGR) of src
*/
void sws_convertPalette8ToPacked24(const uint8_t *src, uint8_t *dst, int num_pixels, const uint8_t *palette);
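/**
 * Example (sketch; "src", "dst32" and "num_pixels" are placeholders, and
 * taking the palette from AVFrame.data[1] is an assumption about the
 * decoder's PAL8 output):
 * @code
 * uint8_t pal[256 * 4];
 * memcpy(pal, frame->data[1], sizeof(pal)); // 256 entries, 4 bytes each
 * sws_convertPalette8ToPacked32(src, dst32, num_pixels, pal);
 * @endcode
 */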
/**
* Get the AVClass for swsContext. It can be used in combination with
* AV_OPT_SEARCH_FAKE_OBJ for examining options.
*
* @see av_opt_find().
*/
const AVClass *sws_get_class(void);
/**
* @}
*/
#endif /* SWSCALE_SWSCALE_H */

View File

@@ -0,0 +1,875 @@
/*
* Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef SWSCALE_SWSCALE_INTERNAL_H
#define SWSCALE_SWSCALE_INTERNAL_H
#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif
#include "libavutil/avassert.h"
#include "libavutil/avutil.h"
#include "libavutil/common.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/log.h"
#include "libavutil/pixfmt.h"
#include "libavutil/pixdesc.h"
#define STR(s) AV_TOSTRING(s) // AV_STRINGIFY is too long
#define YUVRGB_TABLE_HEADROOM 128
#define MAX_FILTER_SIZE 256
#define DITHER1XBPP
#if HAVE_BIGENDIAN
#define ALT32_CORR (-1)
#else
#define ALT32_CORR 1
#endif
#if ARCH_X86_64
# define APCK_PTR2 8
# define APCK_COEF 16
# define APCK_SIZE 24
#else
# define APCK_PTR2 4
# define APCK_COEF 8
# define APCK_SIZE 16
#endif
struct SwsContext;
typedef enum SwsDither {
SWS_DITHER_NONE = 0,
SWS_DITHER_AUTO,
SWS_DITHER_BAYER,
SWS_DITHER_ED,
NB_SWS_DITHER,
} SwsDither;
typedef int (*SwsFunc)(struct SwsContext *context, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[]);
/**
* Write one line of horizontally scaled data to planar output
* without any additional vertical scaling (or point-scaling).
*
* @param src scaled source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param dest pointer to the output plane. For >8bit
* output, this is in uint16_t
* @param dstW width of destination in pixels
 * @param dither ordered dither array of type uint8_t and size 8
* @param offset Dither offset
*/
typedef void (*yuv2planar1_fn)(const int16_t *src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset);
/**
* Write one line of horizontally scaled data to planar output
* with multi-point vertical scaling between input pixels.
*
* @param filter vertical luma/alpha scaling coefficients, 12bit [0,4096]
* @param src scaled luma (Y) or alpha (A) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param filterSize number of vertical input lines to scale
* @param dest pointer to output plane. For >8bit
* output, this is in uint16_t
* @param dstW width of destination pixels
* @param offset Dither offset
*/
typedef void (*yuv2planarX_fn)(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset);
/**
* Write one line of horizontally scaled chroma to interleaved output
* with multi-point vertical scaling between input pixels.
*
* @param c SWS scaling context
* @param chrFilter vertical chroma scaling coefficients, 12bit [0,4096]
* @param chrUSrc scaled chroma (U) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param chrVSrc scaled chroma (V) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param chrFilterSize number of vertical chroma input lines to scale
* @param dest pointer to the output plane. For >8bit
* output, this is in uint16_t
* @param dstW width of chroma planes
*/
typedef void (*yuv2interleavedX_fn)(struct SwsContext *c,
const int16_t *chrFilter,
int chrFilterSize,
const int16_t **chrUSrc,
const int16_t **chrVSrc,
uint8_t *dest, int dstW);
/**
* Write one line of horizontally scaled Y/U/V/A to packed-pixel YUV/RGB
* output without any additional vertical scaling (or point-scaling). Note
* that this function may do chroma scaling, see the "uvalpha" argument.
*
* @param c SWS scaling context
* @param lumSrc scaled luma (Y) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param chrUSrc scaled chroma (U) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param chrVSrc scaled chroma (V) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param alpSrc scaled alpha (A) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param dest pointer to the output plane. For 16bit output, this is
* uint16_t
* @param dstW width of lumSrc and alpSrc in pixels, number of pixels
* to write into dest[]
* @param uvalpha chroma scaling coefficient for the second line of chroma
* pixels, either 2048 or 0. If 0, one chroma input is used
 *                for 2 output pixels (or if the SWS_FULL_CHR_H_INT flag
 *                is set, it generates 1 output pixel). If 2048, two chroma
 *                input pixels should be averaged for 2 output pixels (this
 *                only happens if SWS_FULL_CHR_H_INT is not set)
* @param y vertical line number for this output. This does not need
* to be used to calculate the offset in the destination,
* but can be used to generate comfort noise using dithering
* for some output formats.
*/
typedef void (*yuv2packed1_fn)(struct SwsContext *c, const int16_t *lumSrc,
const int16_t *chrUSrc[2],
const int16_t *chrVSrc[2],
const int16_t *alpSrc, uint8_t *dest,
int dstW, int uvalpha, int y);
/**
* Write one line of horizontally scaled Y/U/V/A to packed-pixel YUV/RGB
* output by doing bilinear scaling between two input lines.
*
* @param c SWS scaling context
* @param lumSrc scaled luma (Y) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param chrUSrc scaled chroma (U) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param chrVSrc scaled chroma (V) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param alpSrc scaled alpha (A) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param dest pointer to the output plane. For 16bit output, this is
* uint16_t
* @param dstW width of lumSrc and alpSrc in pixels, number of pixels
* to write into dest[]
* @param yalpha luma/alpha scaling coefficients for the second input line.
* The first line's coefficients can be calculated by using
* 4096 - yalpha
* @param uvalpha chroma scaling coefficient for the second input line. The
* first line's coefficients can be calculated by using
* 4096 - uvalpha
* @param y vertical line number for this output. This does not need
* to be used to calculate the offset in the destination,
* but can be used to generate comfort noise using dithering
* for some output formats.
*/
typedef void (*yuv2packed2_fn)(struct SwsContext *c, const int16_t *lumSrc[2],
const int16_t *chrUSrc[2],
const int16_t *chrVSrc[2],
const int16_t *alpSrc[2],
uint8_t *dest,
int dstW, int yalpha, int uvalpha, int y);
/**
* Write one line of horizontally scaled Y/U/V/A to packed-pixel YUV/RGB
* output by doing multi-point vertical scaling between input pixels.
*
* @param c SWS scaling context
* @param lumFilter vertical luma/alpha scaling coefficients, 12bit [0,4096]
* @param lumSrc scaled luma (Y) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param lumFilterSize number of vertical luma/alpha input lines to scale
* @param chrFilter vertical chroma scaling coefficients, 12bit [0,4096]
* @param chrUSrc scaled chroma (U) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param chrVSrc scaled chroma (V) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param chrFilterSize number of vertical chroma input lines to scale
* @param alpSrc scaled alpha (A) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param dest pointer to the output plane. For 16bit output, this is
* uint16_t
* @param dstW width of lumSrc and alpSrc in pixels, number of pixels
* to write into dest[]
* @param y vertical line number for this output. This does not need
* to be used to calculate the offset in the destination,
* but can be used to generate comfort noise using dithering
 *                  for some output formats.
*/
typedef void (*yuv2packedX_fn)(struct SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter,
const int16_t **chrUSrc,
const int16_t **chrVSrc, int chrFilterSize,
const int16_t **alpSrc, uint8_t *dest,
int dstW, int y);
/**
* Write one line of horizontally scaled Y/U/V/A to YUV/RGB
* output by doing multi-point vertical scaling between input pixels.
*
* @param c SWS scaling context
* @param lumFilter vertical luma/alpha scaling coefficients, 12bit [0,4096]
* @param lumSrc scaled luma (Y) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param lumFilterSize number of vertical luma/alpha input lines to scale
* @param chrFilter vertical chroma scaling coefficients, 12bit [0,4096]
* @param chrUSrc scaled chroma (U) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param chrVSrc scaled chroma (V) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param chrFilterSize number of vertical chroma input lines to scale
* @param alpSrc scaled alpha (A) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param dest pointer to the output planes. For 16bit output, this is
* uint16_t
* @param dstW width of lumSrc and alpSrc in pixels, number of pixels
* to write into dest[]
* @param y vertical line number for this output. This does not need
* to be used to calculate the offset in the destination,
* but can be used to generate comfort noise using dithering
 *                  for some output formats.
*/
typedef void (*yuv2anyX_fn)(struct SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter,
const int16_t **chrUSrc,
const int16_t **chrVSrc, int chrFilterSize,
const int16_t **alpSrc, uint8_t **dest,
int dstW, int y);
/* This struct should be aligned on at least a 32-byte boundary. */
typedef struct SwsContext {
/**
* info on struct for av_log
*/
const AVClass *av_class;
/**
* Note that src, dst, srcStride, dstStride will be copied in the
* sws_scale() wrapper so they can be freely modified here.
*/
SwsFunc swscale;
int srcW; ///< Width of source luma/alpha planes.
int srcH; ///< Height of source luma/alpha planes.
int dstH; ///< Height of destination luma/alpha planes.
int chrSrcW; ///< Width of source chroma planes.
int chrSrcH; ///< Height of source chroma planes.
int chrDstW; ///< Width of destination chroma planes.
int chrDstH; ///< Height of destination chroma planes.
int lumXInc, chrXInc;
int lumYInc, chrYInc;
enum AVPixelFormat dstFormat; ///< Destination pixel format.
enum AVPixelFormat srcFormat; ///< Source pixel format.
int dstFormatBpp; ///< Number of bits per pixel of the destination pixel format.
int srcFormatBpp; ///< Number of bits per pixel of the source pixel format.
int dstBpc, srcBpc;
int chrSrcHSubSample; ///< Binary logarithm of horizontal subsampling factor between luma/alpha and chroma planes in source image.
int chrSrcVSubSample; ///< Binary logarithm of vertical subsampling factor between luma/alpha and chroma planes in source image.
int chrDstHSubSample; ///< Binary logarithm of horizontal subsampling factor between luma/alpha and chroma planes in destination image.
int chrDstVSubSample; ///< Binary logarithm of vertical subsampling factor between luma/alpha and chroma planes in destination image.
int vChrDrop; ///< Binary logarithm of extra vertical subsampling factor in source image chroma planes specified by user.
int sliceDir; ///< Direction that slices are fed to the scaler (1 = top-to-bottom, -1 = bottom-to-top).
double param[2]; ///< Input parameters for scaling algorithms that need them.
uint32_t pal_yuv[256];
uint32_t pal_rgb[256];
/**
* @name Scaled horizontal lines ring buffer.
* The horizontal scaler keeps just enough scaled lines in a ring buffer
* so they may be passed to the vertical scaler. The pointers to the
* allocated buffers for each line are duplicated in sequence in the ring
* buffer to simplify indexing and avoid wrapping around between lines
* inside the vertical scaler code. The wrapping is done before the
* vertical scaler is called.
*/
//@{
int16_t **lumPixBuf; ///< Ring buffer for scaled horizontal luma plane lines to be fed to the vertical scaler.
int16_t **chrUPixBuf; ///< Ring buffer for scaled horizontal chroma plane lines to be fed to the vertical scaler.
int16_t **chrVPixBuf; ///< Ring buffer for scaled horizontal chroma plane lines to be fed to the vertical scaler.
int16_t **alpPixBuf; ///< Ring buffer for scaled horizontal alpha plane lines to be fed to the vertical scaler.
int vLumBufSize; ///< Number of vertical luma/alpha lines allocated in the ring buffer.
int vChrBufSize; ///< Number of vertical chroma lines allocated in the ring buffer.
int lastInLumBuf; ///< Last scaled horizontal luma/alpha line from source in the ring buffer.
int lastInChrBuf; ///< Last scaled horizontal chroma line from source in the ring buffer.
int lumBufIndex; ///< Index in ring buffer of the last scaled horizontal luma/alpha line from source.
int chrBufIndex; ///< Index in ring buffer of the last scaled horizontal chroma line from source.
//@}
uint8_t *formatConvBuffer;
/**
* @name Horizontal and vertical filters.
* To better understand the following fields, here is a pseudo-code of
* their usage in filtering a horizontal line:
* @code
* for (i = 0; i < width; i++) {
* dst[i] = 0;
* for (j = 0; j < filterSize; j++)
* dst[i] += src[ filterPos[i] + j ] * filter[ filterSize * i + j ];
* dst[i] >>= FRAC_BITS; // The actual implementation is fixed-point.
* }
* @endcode
*/
//@{
int16_t *hLumFilter; ///< Array of horizontal filter coefficients for luma/alpha planes.
int16_t *hChrFilter; ///< Array of horizontal filter coefficients for chroma planes.
int16_t *vLumFilter; ///< Array of vertical filter coefficients for luma/alpha planes.
int16_t *vChrFilter; ///< Array of vertical filter coefficients for chroma planes.
int32_t *hLumFilterPos; ///< Array of horizontal filter starting positions for each dst[i] for luma/alpha planes.
int32_t *hChrFilterPos; ///< Array of horizontal filter starting positions for each dst[i] for chroma planes.
int32_t *vLumFilterPos; ///< Array of vertical filter starting positions for each dst[i] for luma/alpha planes.
int32_t *vChrFilterPos; ///< Array of vertical filter starting positions for each dst[i] for chroma planes.
int hLumFilterSize; ///< Horizontal filter size for luma/alpha pixels.
int hChrFilterSize; ///< Horizontal filter size for chroma pixels.
int vLumFilterSize; ///< Vertical filter size for luma/alpha pixels.
int vChrFilterSize; ///< Vertical filter size for chroma pixels.
//@}
int lumMmxextFilterCodeSize; ///< Runtime-generated MMXEXT horizontal fast bilinear scaler code size for luma/alpha planes.
int chrMmxextFilterCodeSize; ///< Runtime-generated MMXEXT horizontal fast bilinear scaler code size for chroma planes.
uint8_t *lumMmxextFilterCode; ///< Runtime-generated MMXEXT horizontal fast bilinear scaler code for luma/alpha planes.
uint8_t *chrMmxextFilterCode; ///< Runtime-generated MMXEXT horizontal fast bilinear scaler code for chroma planes.
int canMMXEXTBeUsed;
int dstY; ///< Last destination vertical line output from last slice.
int flags; ///< Flags passed by the user to select scaler algorithm, optimizations, subsampling, etc...
    void *yuvTable; // pointer to the yuv->rgb table start so that it can be freed
uint8_t *table_rV[256 + 2*YUVRGB_TABLE_HEADROOM];
uint8_t *table_gU[256 + 2*YUVRGB_TABLE_HEADROOM];
int table_gV[256 + 2*YUVRGB_TABLE_HEADROOM];
uint8_t *table_bU[256 + 2*YUVRGB_TABLE_HEADROOM];
    DECLARE_ALIGNED(16, int32_t, input_rgb2yuv_table)[16+40*4]; // This table can contain both C and SIMD formatted values, the C values are always at the XY_IDX points
#define RY_IDX 0
#define GY_IDX 1
#define BY_IDX 2
#define RU_IDX 3
#define GU_IDX 4
#define BU_IDX 5
#define RV_IDX 6
#define GV_IDX 7
#define BV_IDX 8
#define RGB2YUV_SHIFT 15
int *dither_error[4];
//Colorspace stuff
int contrast, brightness, saturation; // for sws_getColorspaceDetails
int srcColorspaceTable[4];
int dstColorspaceTable[4];
int srcRange; ///< 0 = MPG YUV range, 1 = JPG YUV range (source image).
int dstRange; ///< 0 = MPG YUV range, 1 = JPG YUV range (destination image).
int src0Alpha;
int dst0Alpha;
int srcXYZ;
int dstXYZ;
int src_h_chr_pos;
int dst_h_chr_pos;
int src_v_chr_pos;
int dst_v_chr_pos;
int yuv2rgb_y_offset;
int yuv2rgb_y_coeff;
int yuv2rgb_v2r_coeff;
int yuv2rgb_v2g_coeff;
int yuv2rgb_u2g_coeff;
int yuv2rgb_u2b_coeff;
#define RED_DITHER "0*8"
#define GREEN_DITHER "1*8"
#define BLUE_DITHER "2*8"
#define Y_COEFF "3*8"
#define VR_COEFF "4*8"
#define UB_COEFF "5*8"
#define VG_COEFF "6*8"
#define UG_COEFF "7*8"
#define Y_OFFSET "8*8"
#define U_OFFSET "9*8"
#define V_OFFSET "10*8"
#define LUM_MMX_FILTER_OFFSET "11*8"
#define CHR_MMX_FILTER_OFFSET "11*8+4*4*256"
#define DSTW_OFFSET "11*8+4*4*256*2" //do not change, it is hardcoded in the ASM
#define ESP_OFFSET "11*8+4*4*256*2+8"
#define VROUNDER_OFFSET "11*8+4*4*256*2+16"
#define U_TEMP "11*8+4*4*256*2+24"
#define V_TEMP "11*8+4*4*256*2+32"
#define Y_TEMP "11*8+4*4*256*2+40"
#define ALP_MMX_FILTER_OFFSET "11*8+4*4*256*2+48"
#define UV_OFF_PX "11*8+4*4*256*3+48"
#define UV_OFF_BYTE "11*8+4*4*256*3+56"
#define DITHER16 "11*8+4*4*256*3+64"
#define DITHER32 "11*8+4*4*256*3+80"
DECLARE_ALIGNED(8, uint64_t, redDither);
DECLARE_ALIGNED(8, uint64_t, greenDither);
DECLARE_ALIGNED(8, uint64_t, blueDither);
DECLARE_ALIGNED(8, uint64_t, yCoeff);
DECLARE_ALIGNED(8, uint64_t, vrCoeff);
DECLARE_ALIGNED(8, uint64_t, ubCoeff);
DECLARE_ALIGNED(8, uint64_t, vgCoeff);
DECLARE_ALIGNED(8, uint64_t, ugCoeff);
DECLARE_ALIGNED(8, uint64_t, yOffset);
DECLARE_ALIGNED(8, uint64_t, uOffset);
DECLARE_ALIGNED(8, uint64_t, vOffset);
int32_t lumMmxFilter[4 * MAX_FILTER_SIZE];
int32_t chrMmxFilter[4 * MAX_FILTER_SIZE];
int dstW; ///< Width of destination luma/alpha planes.
DECLARE_ALIGNED(8, uint64_t, esp);
DECLARE_ALIGNED(8, uint64_t, vRounder);
DECLARE_ALIGNED(8, uint64_t, u_temp);
DECLARE_ALIGNED(8, uint64_t, v_temp);
DECLARE_ALIGNED(8, uint64_t, y_temp);
int32_t alpMmxFilter[4 * MAX_FILTER_SIZE];
    // alignment of these values is not necessary, but merely here
    // to maintain the same offset across x86-32 and x86-64. Once we
    // use proper offset macros in the asm, they can be removed.
DECLARE_ALIGNED(8, ptrdiff_t, uv_off); ///< offset (in pixels) between u and v planes
DECLARE_ALIGNED(8, ptrdiff_t, uv_offx2); ///< offset (in bytes) between u and v planes
DECLARE_ALIGNED(8, uint16_t, dither16)[8];
DECLARE_ALIGNED(8, uint32_t, dither32)[8];
const uint8_t *chrDither8, *lumDither8;
#if HAVE_ALTIVEC
vector signed short CY;
vector signed short CRV;
vector signed short CBU;
vector signed short CGU;
vector signed short CGV;
vector signed short OY;
vector unsigned short CSHIFT;
vector signed short *vYCoeffsBank, *vCCoeffsBank;
#endif
#if ARCH_BFIN
DECLARE_ALIGNED(4, uint32_t, oy);
DECLARE_ALIGNED(4, uint32_t, oc);
DECLARE_ALIGNED(4, uint32_t, zero);
DECLARE_ALIGNED(4, uint32_t, cy);
DECLARE_ALIGNED(4, uint32_t, crv);
DECLARE_ALIGNED(4, uint32_t, rmask);
DECLARE_ALIGNED(4, uint32_t, cbu);
DECLARE_ALIGNED(4, uint32_t, bmask);
DECLARE_ALIGNED(4, uint32_t, cgu);
DECLARE_ALIGNED(4, uint32_t, cgv);
DECLARE_ALIGNED(4, uint32_t, gmask);
#endif
#if HAVE_VIS
DECLARE_ALIGNED(8, uint64_t, sparc_coeffs)[10];
#endif
int use_mmx_vfilter;
/* pre defined color-spaces gamma */
#define XYZ_GAMMA (2.6f)
#define RGB_GAMMA (2.2f)
int16_t *xyzgamma;
int16_t *rgbgamma;
int16_t *xyzgammainv;
int16_t *rgbgammainv;
int16_t xyz2rgb_matrix[3][4];
int16_t rgb2xyz_matrix[3][4];
/* function pointers for swscale() */
yuv2planar1_fn yuv2plane1;
yuv2planarX_fn yuv2planeX;
yuv2interleavedX_fn yuv2nv12cX;
yuv2packed1_fn yuv2packed1;
yuv2packed2_fn yuv2packed2;
yuv2packedX_fn yuv2packedX;
yuv2anyX_fn yuv2anyX;
/// Unscaled conversion of luma plane to YV12 for horizontal scaler.
void (*lumToYV12)(uint8_t *dst, const uint8_t *src, const uint8_t *src2, const uint8_t *src3,
int width, uint32_t *pal);
/// Unscaled conversion of alpha plane to YV12 for horizontal scaler.
void (*alpToYV12)(uint8_t *dst, const uint8_t *src, const uint8_t *src2, const uint8_t *src3,
int width, uint32_t *pal);
/// Unscaled conversion of chroma planes to YV12 for horizontal scaler.
void (*chrToYV12)(uint8_t *dstU, uint8_t *dstV,
const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
int width, uint32_t *pal);
/**
* Functions to read planar input, such as planar RGB, and convert
* internally to Y/UV/A.
*/
/** @{ */
void (*readLumPlanar)(uint8_t *dst, const uint8_t *src[4], int width, int32_t *rgb2yuv);
void (*readChrPlanar)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src[4],
int width, int32_t *rgb2yuv);
void (*readAlpPlanar)(uint8_t *dst, const uint8_t *src[4], int width, int32_t *rgb2yuv);
/** @} */
/**
* Scale one horizontal line of input data using a bilinear filter
 * to produce one line of output data. Compared to SwsContext->hyScale()/hcScale(),
* please take note of the following caveats when using these:
* - Scaling is done using only 7bit instead of 14bit coefficients.
* - You can use no more than 5 input pixels to produce 4 output
* pixels. Therefore, this filter should not be used for downscaling
* by more than ~20% in width (because that equals more than 5/4th
* downscaling and thus more than 5 pixels input per 4 pixels output).
* - In general, bilinear filters create artifacts during downscaling
* (even when <20%), because one output pixel will span more than one
* input pixel, and thus some pixels will need edges of both neighbor
* pixels to interpolate the output pixel. Since you can use at most
* two input pixels per output pixel in bilinear scaling, this is
* impossible and thus downscaling by any size will create artifacts.
* To enable this type of scaling, set SWS_FAST_BILINEAR
* in SwsContext->flags.
*/
/** @{ */
void (*hyscale_fast)(struct SwsContext *c,
int16_t *dst, int dstWidth,
const uint8_t *src, int srcW, int xInc);
void (*hcscale_fast)(struct SwsContext *c,
int16_t *dst1, int16_t *dst2, int dstWidth,
const uint8_t *src1, const uint8_t *src2,
int srcW, int xInc);
/** @} */
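/* For orientation: the C fallback for hyscale_fast() is essentially the
 * following 16.16 fixed-point loop with 7-bit interpolation weights
 * (a sketch of the reference behaviour, not part of this header's API):
 *
 *     for (i = 0; i < dstWidth; i++) {
 *         unsigned xx     = xpos >> 16;           // integer source index
 *         unsigned xalpha = (xpos & 0xFFFF) >> 9; // 7-bit fractional weight
 *         dst[i] = (src[xx] << 7) + (src[xx + 1] - src[xx]) * xalpha;
 *         xpos  += xInc;                          // 16.16 step ~ srcW/dstW
 *     }
 */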
/**
* Scale one horizontal line of input data using a filter over the input
* lines, to produce one (differently sized) line of output data.
*
* @param dst pointer to destination buffer for horizontally scaled
* data. If the number of bits per component of one
* destination pixel (SwsContext->dstBpc) is <= 10, data
* will be 15bpc in 16bits (int16_t) width. Else (i.e.
* SwsContext->dstBpc == 16), data will be 19bpc in
* 32bits (int32_t) width.
* @param dstW width of destination image
* @param src pointer to source data to be scaled. If the number of
* bits per component of a source pixel (SwsContext->srcBpc)
* is 8, this is 8bpc in 8bits (uint8_t) width. Else
* (i.e. SwsContext->srcBpc > 8), this is native depth
* in 16bits (uint16_t) width. In other words, for 9-bit
* YUV input, this is 9bpc, for 10-bit YUV input, this is
* 10bpc, and for 16-bit RGB or YUV, this is 16bpc.
* @param filter filter coefficients to be used per output pixel for
* scaling. This contains 14bpp filtering coefficients.
* Guaranteed to contain dstW * filterSize entries.
* @param filterPos position of the first input pixel to be used for
* each output pixel during scaling. Guaranteed to
* contain dstW entries.
* @param filterSize the number of input coefficients to be used (and
* thus the number of input pixels to be used) for
* creating a single output pixel. Is aligned to 4
* (and input coefficients thus padded with zeroes)
* to simplify creating SIMD code.
*/
/** @{ */
void (*hyScale)(struct SwsContext *c, int16_t *dst, int dstW,
const uint8_t *src, const int16_t *filter,
const int32_t *filterPos, int filterSize);
void (*hcScale)(struct SwsContext *c, int16_t *dst, int dstW,
const uint8_t *src, const int16_t *filter,
const int32_t *filterPos, int filterSize);
/** @} */
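/* Reference semantics in scalar C for the 8bpc-in, 15bpc-out case described
 * above (a sketch of the C fallback; per-arch implementations override it):
 *
 *     for (i = 0; i < dstW; i++) {
 *         int j, val = 0;
 *         for (j = 0; j < filterSize; j++)
 *             val += (int)src[filterPos[i] + j] * filter[filterSize * i + j];
 *         dst[i] = FFMIN(val >> 7, (1 << 15) - 1); // 14-bit coeffs -> 15bpc
 *     }
 */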
/// Color range conversion function for luma plane if needed.
void (*lumConvertRange)(int16_t *dst, int width);
/// Color range conversion function for chroma planes if needed.
void (*chrConvertRange)(int16_t *dst1, int16_t *dst2, int width);
int needs_hcscale; ///< Set if there are chroma planes to be converted.
SwsDither dither;
} SwsContext;
//FIXME check init (where 0)
SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c);
int ff_yuv2rgb_c_init_tables(SwsContext *c, const int inv_table[4],
int fullRange, int brightness,
int contrast, int saturation);
void ff_yuv2rgb_init_tables_ppc(SwsContext *c, const int inv_table[4],
int brightness, int contrast, int saturation);
void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufIndex,
int lastInLumBuf, int lastInChrBuf);
SwsFunc ff_yuv2rgb_init_x86(SwsContext *c);
SwsFunc ff_yuv2rgb_init_vis(SwsContext *c);
SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c);
SwsFunc ff_yuv2rgb_init_bfin(SwsContext *c);
#if FF_API_SWS_FORMAT_NAME
/**
* @deprecated Use av_get_pix_fmt_name() instead.
*/
attribute_deprecated
const char *sws_format_name(enum AVPixelFormat format);
#endif
static av_always_inline int is16BPS(enum AVPixelFormat pix_fmt)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
av_assert0(desc);
return desc->comp[0].depth_minus1 == 15;
}
static av_always_inline int is9_OR_10BPS(enum AVPixelFormat pix_fmt)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
av_assert0(desc);
return desc->comp[0].depth_minus1 >= 8 && desc->comp[0].depth_minus1 <= 13;
}
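/* Note: despite the name, this matches any component depth from 9 to 14
 * bits, hence the isNBPS() alias below. */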
#define isNBPS(x) is9_OR_10BPS(x)
static av_always_inline int isBE(enum AVPixelFormat pix_fmt)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
av_assert0(desc);
return desc->flags & AV_PIX_FMT_FLAG_BE;
}
static av_always_inline int isYUV(enum AVPixelFormat pix_fmt)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
av_assert0(desc);
return !(desc->flags & AV_PIX_FMT_FLAG_RGB) && desc->nb_components >= 2;
}
static av_always_inline int isPlanarYUV(enum AVPixelFormat pix_fmt)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
av_assert0(desc);
return ((desc->flags & AV_PIX_FMT_FLAG_PLANAR) && isYUV(pix_fmt));
}
static av_always_inline int isRGB(enum AVPixelFormat pix_fmt)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
av_assert0(desc);
return (desc->flags & AV_PIX_FMT_FLAG_RGB);
}
#if 0 // FIXME
#define isGray(x) \
(!(av_pix_fmt_desc_get(x)->flags & AV_PIX_FMT_FLAG_PAL) && \
av_pix_fmt_desc_get(x)->nb_components <= 2)
#else
#define isGray(x) \
((x) == AV_PIX_FMT_GRAY8 || \
(x) == AV_PIX_FMT_Y400A || \
(x) == AV_PIX_FMT_GRAY16BE || \
(x) == AV_PIX_FMT_GRAY16LE)
#endif
#define isRGBinInt(x) \
( \
(x) == AV_PIX_FMT_RGB48BE || \
(x) == AV_PIX_FMT_RGB48LE || \
(x) == AV_PIX_FMT_RGBA64BE || \
(x) == AV_PIX_FMT_RGBA64LE || \
(x) == AV_PIX_FMT_RGB32 || \
(x) == AV_PIX_FMT_RGB32_1 || \
(x) == AV_PIX_FMT_RGB24 || \
(x) == AV_PIX_FMT_RGB565BE || \
(x) == AV_PIX_FMT_RGB565LE || \
(x) == AV_PIX_FMT_RGB555BE || \
(x) == AV_PIX_FMT_RGB555LE || \
(x) == AV_PIX_FMT_RGB444BE || \
(x) == AV_PIX_FMT_RGB444LE || \
(x) == AV_PIX_FMT_RGB8 || \
(x) == AV_PIX_FMT_RGB4 || \
(x) == AV_PIX_FMT_RGB4_BYTE || \
(x) == AV_PIX_FMT_MONOBLACK || \
(x) == AV_PIX_FMT_MONOWHITE \
)
#define isBGRinInt(x) \
( \
(x) == AV_PIX_FMT_BGR48BE || \
(x) == AV_PIX_FMT_BGR48LE || \
(x) == AV_PIX_FMT_BGRA64BE || \
(x) == AV_PIX_FMT_BGRA64LE || \
(x) == AV_PIX_FMT_BGR32 || \
(x) == AV_PIX_FMT_BGR32_1 || \
(x) == AV_PIX_FMT_BGR24 || \
(x) == AV_PIX_FMT_BGR565BE || \
(x) == AV_PIX_FMT_BGR565LE || \
(x) == AV_PIX_FMT_BGR555BE || \
(x) == AV_PIX_FMT_BGR555LE || \
(x) == AV_PIX_FMT_BGR444BE || \
(x) == AV_PIX_FMT_BGR444LE || \
(x) == AV_PIX_FMT_BGR8 || \
(x) == AV_PIX_FMT_BGR4 || \
(x) == AV_PIX_FMT_BGR4_BYTE || \
(x) == AV_PIX_FMT_MONOBLACK || \
(x) == AV_PIX_FMT_MONOWHITE \
)
#define isRGBinBytes(x) ( \
(x) == AV_PIX_FMT_RGB48BE \
|| (x) == AV_PIX_FMT_RGB48LE \
|| (x) == AV_PIX_FMT_RGBA64BE \
|| (x) == AV_PIX_FMT_RGBA64LE \
|| (x) == AV_PIX_FMT_RGBA \
|| (x) == AV_PIX_FMT_ARGB \
|| (x) == AV_PIX_FMT_RGB24 \
)
#define isBGRinBytes(x) ( \
(x) == AV_PIX_FMT_BGR48BE \
|| (x) == AV_PIX_FMT_BGR48LE \
|| (x) == AV_PIX_FMT_BGRA64BE \
|| (x) == AV_PIX_FMT_BGRA64LE \
|| (x) == AV_PIX_FMT_BGRA \
|| (x) == AV_PIX_FMT_ABGR \
|| (x) == AV_PIX_FMT_BGR24 \
)
#define isAnyRGB(x) \
( \
isRGBinInt(x) || \
isBGRinInt(x) || \
isRGB(x) \
)
static av_always_inline int isALPHA(enum AVPixelFormat pix_fmt)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
av_assert0(desc);
if (pix_fmt == AV_PIX_FMT_PAL8)
return 1;
return desc->flags & AV_PIX_FMT_FLAG_ALPHA;
}
#if 1
#define isPacked(x) ( \
(x)==AV_PIX_FMT_PAL8 \
|| (x)==AV_PIX_FMT_YUYV422 \
|| (x)==AV_PIX_FMT_UYVY422 \
|| (x)==AV_PIX_FMT_Y400A \
|| isRGBinInt(x) \
|| isBGRinInt(x) \
)
#else
static av_always_inline int isPacked(enum AVPixelFormat pix_fmt)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
av_assert0(desc);
return ((desc->nb_components >= 2 && !(desc->flags & AV_PIX_FMT_FLAG_PLANAR)) ||
pix_fmt == AV_PIX_FMT_PAL8);
}
#endif
static av_always_inline int isPlanar(enum AVPixelFormat pix_fmt)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
av_assert0(desc);
return (desc->nb_components >= 2 && (desc->flags & AV_PIX_FMT_FLAG_PLANAR));
}
static av_always_inline int isPackedRGB(enum AVPixelFormat pix_fmt)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
av_assert0(desc);
return ((desc->flags & (AV_PIX_FMT_FLAG_PLANAR | AV_PIX_FMT_FLAG_RGB)) == AV_PIX_FMT_FLAG_RGB);
}
static av_always_inline int isPlanarRGB(enum AVPixelFormat pix_fmt)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
av_assert0(desc);
return ((desc->flags & (AV_PIX_FMT_FLAG_PLANAR | AV_PIX_FMT_FLAG_RGB)) ==
(AV_PIX_FMT_FLAG_PLANAR | AV_PIX_FMT_FLAG_RGB));
}
static av_always_inline int usePal(enum AVPixelFormat pix_fmt)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
av_assert0(desc);
return (desc->flags & AV_PIX_FMT_FLAG_PAL) || (desc->flags & AV_PIX_FMT_FLAG_PSEUDOPAL);
}
extern const uint64_t ff_dither4[2];
extern const uint64_t ff_dither8[2];
extern const uint8_t ff_dither_2x2_4[3][8];
extern const uint8_t ff_dither_2x2_8[3][8];
extern const uint8_t ff_dither_4x4_16[5][8];
extern const uint8_t ff_dither_8x8_32[9][8];
extern const uint8_t ff_dither_8x8_73[9][8];
extern const uint8_t ff_dither_8x8_128[9][8];
extern const uint8_t ff_dither_8x8_220[9][8];
extern const int32_t ff_yuv2rgb_coeffs[8][4];
extern const AVClass sws_context_class;
/**
* Set c->swscale to an unscaled converter if one exists for the specific
* source and destination formats, bit depths, flags, etc.
*/
void ff_get_unscaled_swscale(SwsContext *c);
void ff_get_unscaled_swscale_bfin(SwsContext *c);
void ff_get_unscaled_swscale_ppc(SwsContext *c);
/**
* Return function pointer to fastest main scaler path function depending
* on architecture and available optimizations.
*/
SwsFunc ff_getSwsFunc(SwsContext *c);
void ff_sws_init_input_funcs(SwsContext *c);
void ff_sws_init_output_funcs(SwsContext *c,
yuv2planar1_fn *yuv2plane1,
yuv2planarX_fn *yuv2planeX,
yuv2interleavedX_fn *yuv2nv12cX,
yuv2packed1_fn *yuv2packed1,
yuv2packed2_fn *yuv2packed2,
yuv2packedX_fn *yuv2packedX,
yuv2anyX_fn *yuv2anyX);
void ff_sws_init_swscale_ppc(SwsContext *c);
void ff_sws_init_swscale_x86(SwsContext *c);
static inline void fillPlane16(uint8_t *plane, int stride, int width, int height, int y,
int alpha, int bits, const int big_endian)
{
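/* 'bits' is the component depth minus one: alpha planes are filled with
 * the per-depth maximum ((1 << (bits + 1)) - 1), other planes with the
 * neutral mid-range value (1 << bits). */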
int i, j;
uint8_t *ptr = plane + stride * y;
int v = alpha ? 0xFFFF>>(15-bits) : (1<<bits);
for (i = 0; i < height; i++) {
#define FILL(wfunc) \
for (j = 0; j < width; j++) {\
wfunc(ptr+2*j, v);\
}
if (big_endian) {
FILL(AV_WB16);
} else {
FILL(AV_WL16);
}
ptr += stride;
}
}
#endif /* SWSCALE_SWSCALE_INTERNAL_H */

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,59 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef SWSCALE_VERSION_H
#define SWSCALE_VERSION_H
/**
* @file
* swscale version macros
*/
#include "libavutil/avutil.h"
#define LIBSWSCALE_VERSION_MAJOR 2
#define LIBSWSCALE_VERSION_MINOR 5
#define LIBSWSCALE_VERSION_MICRO 101
#define LIBSWSCALE_VERSION_INT AV_VERSION_INT(LIBSWSCALE_VERSION_MAJOR, \
LIBSWSCALE_VERSION_MINOR, \
LIBSWSCALE_VERSION_MICRO)
#define LIBSWSCALE_VERSION AV_VERSION(LIBSWSCALE_VERSION_MAJOR, \
LIBSWSCALE_VERSION_MINOR, \
LIBSWSCALE_VERSION_MICRO)
#define LIBSWSCALE_BUILD LIBSWSCALE_VERSION_INT
#define LIBSWSCALE_IDENT "SwS" AV_STRINGIFY(LIBSWSCALE_VERSION)
/**
* FF_API_* defines may be placed below to indicate public API that will be
* dropped at a future version bump. The defines themselves are not part of
* the public API and may change, break or disappear at any time.
*/
#ifndef FF_API_SWS_GETCONTEXT
#define FF_API_SWS_GETCONTEXT (LIBSWSCALE_VERSION_MAJOR < 3)
#endif
#ifndef FF_API_SWS_CPU_CAPS
#define FF_API_SWS_CPU_CAPS (LIBSWSCALE_VERSION_MAJOR < 3)
#endif
#ifndef FF_API_SWS_FORMAT_NAME
#define FF_API_SWS_FORMAT_NAME (LIBSWSCALE_VERSION_MAJOR < 3)
#endif
#endif /* SWSCALE_VERSION_H */
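/* AV_VERSION_INT packs the three components into one comparable integer,
 * (major << 16) | (minor << 8) | micro, so for this header:
 *
 *     (2 << 16) | (5 << 8) | 101 == 0x020565
 *
 * which lets version checks reduce to plain integer comparisons, e.g.
 * #if LIBSWSCALE_VERSION_INT >= AV_VERSION_INT(2, 5, 0). */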

View File

@@ -0,0 +1,11 @@
$(SUBDIR)x86/swscale_mmx.o: CFLAGS += $(NOREDZONE_FLAGS)
OBJS += x86/rgb2rgb.o \
x86/swscale.o \
x86/yuv2rgb.o \
OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
YASM-OBJS += x86/input.o \
x86/output.o \
x86/scale.o \

View File

@@ -0,0 +1,696 @@
;******************************************************************************
;* x86-optimized input routines: shuffle packed YUV formats into
;* individual planes, and convert RGB into YUV planes as well.
;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
%define RY 0x20DE
%define GY 0x4087
%define BY 0x0C88
%define RU 0xECFF
%define GU 0xDAC8
%define BU 0x3838
%define RV 0x3838
%define GV 0xD0E3
%define BV 0xF6E4
rgb_Yrnd: times 4 dd 0x80100 ; 16.5 << 15
rgb_UVrnd: times 4 dd 0x400100 ; 128.5 << 15
%define bgr_Ycoeff_12x4 16*4 + 16* 0 + tableq
%define bgr_Ycoeff_3x56 16*4 + 16* 1 + tableq
%define rgb_Ycoeff_12x4 16*4 + 16* 2 + tableq
%define rgb_Ycoeff_3x56 16*4 + 16* 3 + tableq
%define bgr_Ucoeff_12x4 16*4 + 16* 4 + tableq
%define bgr_Ucoeff_3x56 16*4 + 16* 5 + tableq
%define rgb_Ucoeff_12x4 16*4 + 16* 6 + tableq
%define rgb_Ucoeff_3x56 16*4 + 16* 7 + tableq
%define bgr_Vcoeff_12x4 16*4 + 16* 8 + tableq
%define bgr_Vcoeff_3x56 16*4 + 16* 9 + tableq
%define rgb_Vcoeff_12x4 16*4 + 16*10 + tableq
%define rgb_Vcoeff_3x56 16*4 + 16*11 + tableq
%define rgba_Ycoeff_rb 16*4 + 16*12 + tableq
%define rgba_Ycoeff_br 16*4 + 16*13 + tableq
%define rgba_Ycoeff_ga 16*4 + 16*14 + tableq
%define rgba_Ycoeff_ag 16*4 + 16*15 + tableq
%define rgba_Ucoeff_rb 16*4 + 16*16 + tableq
%define rgba_Ucoeff_br 16*4 + 16*17 + tableq
%define rgba_Ucoeff_ga 16*4 + 16*18 + tableq
%define rgba_Ucoeff_ag 16*4 + 16*19 + tableq
%define rgba_Vcoeff_rb 16*4 + 16*20 + tableq
%define rgba_Vcoeff_br 16*4 + 16*21 + tableq
%define rgba_Vcoeff_ga 16*4 + 16*22 + tableq
%define rgba_Vcoeff_ag 16*4 + 16*23 + tableq
; bgr_Ycoeff_12x4: times 2 dw BY, GY, 0, BY
; bgr_Ycoeff_3x56: times 2 dw RY, 0, GY, RY
; rgb_Ycoeff_12x4: times 2 dw RY, GY, 0, RY
; rgb_Ycoeff_3x56: times 2 dw BY, 0, GY, BY
; bgr_Ucoeff_12x4: times 2 dw BU, GU, 0, BU
; bgr_Ucoeff_3x56: times 2 dw RU, 0, GU, RU
; rgb_Ucoeff_12x4: times 2 dw RU, GU, 0, RU
; rgb_Ucoeff_3x56: times 2 dw BU, 0, GU, BU
; bgr_Vcoeff_12x4: times 2 dw BV, GV, 0, BV
; bgr_Vcoeff_3x56: times 2 dw RV, 0, GV, RV
; rgb_Vcoeff_12x4: times 2 dw RV, GV, 0, RV
; rgb_Vcoeff_3x56: times 2 dw BV, 0, GV, BV
; rgba_Ycoeff_rb: times 4 dw RY, BY
; rgba_Ycoeff_br: times 4 dw BY, RY
; rgba_Ycoeff_ga: times 4 dw GY, 0
; rgba_Ycoeff_ag: times 4 dw 0, GY
; rgba_Ucoeff_rb: times 4 dw RU, BU
; rgba_Ucoeff_br: times 4 dw BU, RU
; rgba_Ucoeff_ga: times 4 dw GU, 0
; rgba_Ucoeff_ag: times 4 dw 0, GU
; rgba_Vcoeff_rb: times 4 dw RV, BV
; rgba_Vcoeff_br: times 4 dw BV, RV
; rgba_Vcoeff_ga: times 4 dw GV, 0
; rgba_Vcoeff_ag: times 4 dw 0, GV
shuf_rgb_12x4: db 0, 0x80, 1, 0x80, 2, 0x80, 3, 0x80, \
6, 0x80, 7, 0x80, 8, 0x80, 9, 0x80
shuf_rgb_3x56: db 2, 0x80, 3, 0x80, 4, 0x80, 5, 0x80, \
8, 0x80, 9, 0x80, 10, 0x80, 11, 0x80
SECTION .text
;-----------------------------------------------------------------------------
; RGB to Y/UV.
;
; void <fmt>ToY_<opt>(uint8_t *dst, const uint8_t *src, int w);
; and
; void <fmt>toUV_<opt>(uint8_t *dstU, uint8_t *dstV, const uint8_t *src,
; const uint8_t *unused, int w);
;-----------------------------------------------------------------------------
; %1 = nr. of XMM registers
; %2 = rgb or bgr
%macro RGB24_TO_Y_FN 2-3
cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, table
%if mmsize == 8
mova m5, [%2_Ycoeff_12x4]
mova m6, [%2_Ycoeff_3x56]
%define coeff1 m5
%define coeff2 m6
%elif ARCH_X86_64
mova m8, [%2_Ycoeff_12x4]
mova m9, [%2_Ycoeff_3x56]
%define coeff1 m8
%define coeff2 m9
%else ; x86-32 && mmsize == 16
%define coeff1 [%2_Ycoeff_12x4]
%define coeff2 [%2_Ycoeff_3x56]
%endif ; x86-32/64 && mmsize == 8/16
%if (ARCH_X86_64 || mmsize == 8) && %0 == 3
jmp mangle(private_prefix %+ _ %+ %3 %+ 24ToY %+ SUFFIX).body
%else ; (ARCH_X86_64 && %0 == 3) || mmsize == 8
.body:
%if cpuflag(ssse3)
mova m7, [shuf_rgb_12x4]
%define shuf_rgb1 m7
%if ARCH_X86_64
mova m10, [shuf_rgb_3x56]
%define shuf_rgb2 m10
%else ; x86-32
%define shuf_rgb2 [shuf_rgb_3x56]
%endif ; x86-32/64
%endif ; cpuflag(ssse3)
%if ARCH_X86_64
movsxd wq, wd
%endif
add wq, wq
add dstq, wq
neg wq
%if notcpuflag(ssse3)
pxor m7, m7
%endif ; !cpuflag(ssse3)
mova m4, [rgb_Yrnd]
.loop:
%if cpuflag(ssse3)
movu m0, [srcq+0] ; (byte) { Bx, Gx, Rx }[0-3]
movu m2, [srcq+12] ; (byte) { Bx, Gx, Rx }[4-7]
pshufb m1, m0, shuf_rgb2 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
pshufb m0, shuf_rgb1 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
pshufb m3, m2, shuf_rgb2 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
pshufb m2, shuf_rgb1 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
%else ; !cpuflag(ssse3)
movd m0, [srcq+0] ; (byte) { B0, G0, R0, B1 }
movd m1, [srcq+2] ; (byte) { R0, B1, G1, R1 }
movd m2, [srcq+6] ; (byte) { B2, G2, R2, B3 }
movd m3, [srcq+8] ; (byte) { R2, B3, G3, R3 }
%if mmsize == 16 ; i.e. sse2
punpckldq m0, m2 ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 }
punpckldq m1, m3 ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 }
movd m2, [srcq+12] ; (byte) { B4, G4, R4, B5 }
movd m3, [srcq+14] ; (byte) { R4, B5, G5, R5 }
movd m5, [srcq+18] ; (byte) { B6, G6, R6, B7 }
movd m6, [srcq+20] ; (byte) { R6, B7, G7, R7 }
punpckldq m2, m5 ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 }
punpckldq m3, m6 ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 }
%endif ; mmsize == 16
punpcklbw m0, m7 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
punpcklbw m1, m7 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
punpcklbw m2, m7 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
punpcklbw m3, m7 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
%endif ; cpuflag(ssse3)
add srcq, 3 * mmsize / 2
pmaddwd m0, coeff1 ; (dword) { B0*BY + G0*GY, B1*BY, B2*BY + G2*GY, B3*BY }
pmaddwd m1, coeff2 ; (dword) { R0*RY, G1*GY + R1*RY, R2*RY, G3*GY + R3*RY }
pmaddwd m2, coeff1 ; (dword) { B4*BY + G4*GY, B5*BY, B6*BY + G6*GY, B7*BY }
pmaddwd m3, coeff2 ; (dword) { R4*RY, G5*GY + R5*RY, R6*RY, G7*GY + R7*RY }
paddd m0, m1 ; (dword) { Bx*BY + Gx*GY + Rx*RY }[0-3]
paddd m2, m3 ; (dword) { Bx*BY + Gx*GY + Rx*RY }[4-7]
paddd m0, m4 ; += rgb_Yrnd, i.e. (dword) { Y[0-3] }
paddd m2, m4 ; += rgb_Yrnd, i.e. (dword) { Y[4-7] }
psrad m0, 9
psrad m2, 9
packssdw m0, m2 ; (word) { Y[0-7] }
mova [dstq+wq], m0
add wq, mmsize
jl .loop
REP_RET
%endif ; (ARCH_X86_64 && %0 == 3) || mmsize == 8
%endmacro
; %1 = nr. of XMM registers
; %2 = rgb or bgr
%macro RGB24_TO_UV_FN 2-3
cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
%if ARCH_X86_64
mova m8, [%2_Ucoeff_12x4]
mova m9, [%2_Ucoeff_3x56]
mova m10, [%2_Vcoeff_12x4]
mova m11, [%2_Vcoeff_3x56]
%define coeffU1 m8
%define coeffU2 m9
%define coeffV1 m10
%define coeffV2 m11
%else ; x86-32
%define coeffU1 [%2_Ucoeff_12x4]
%define coeffU2 [%2_Ucoeff_3x56]
%define coeffV1 [%2_Vcoeff_12x4]
%define coeffV2 [%2_Vcoeff_3x56]
%endif ; x86-32/64
%if ARCH_X86_64 && %0 == 3
jmp mangle(private_prefix %+ _ %+ %3 %+ 24ToUV %+ SUFFIX).body
%else ; ARCH_X86_64 && %0 == 3
.body:
%if cpuflag(ssse3)
mova m7, [shuf_rgb_12x4]
%define shuf_rgb1 m7
%if ARCH_X86_64
mova m12, [shuf_rgb_3x56]
%define shuf_rgb2 m12
%else ; x86-32
%define shuf_rgb2 [shuf_rgb_3x56]
%endif ; x86-32/64
%endif ; cpuflag(ssse3)
%if ARCH_X86_64
movsxd wq, dword r5m
%else ; x86-32
mov wq, r5m
%endif
add wq, wq
add dstUq, wq
add dstVq, wq
neg wq
mova m6, [rgb_UVrnd]
%if notcpuflag(ssse3)
pxor m7, m7
%endif
.loop:
%if cpuflag(ssse3)
movu m0, [srcq+0] ; (byte) { Bx, Gx, Rx }[0-3]
movu m4, [srcq+12] ; (byte) { Bx, Gx, Rx }[4-7]
pshufb m1, m0, shuf_rgb2 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
pshufb m0, shuf_rgb1 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
%else ; !cpuflag(ssse3)
movd m0, [srcq+0] ; (byte) { B0, G0, R0, B1 }
movd m1, [srcq+2] ; (byte) { R0, B1, G1, R1 }
movd m4, [srcq+6] ; (byte) { B2, G2, R2, B3 }
movd m5, [srcq+8] ; (byte) { R2, B3, G3, R3 }
%if mmsize == 16
punpckldq m0, m4 ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 }
punpckldq m1, m5 ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 }
movd m4, [srcq+12] ; (byte) { B4, G4, R4, B5 }
movd m5, [srcq+14] ; (byte) { R4, B5, G5, R5 }
%endif ; mmsize == 16
punpcklbw m0, m7 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
punpcklbw m1, m7 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
%endif ; cpuflag(ssse3)
pmaddwd m2, m0, coeffV1 ; (dword) { B0*BV + G0*GV, B1*BV, B2*BV + G2*GV, B3*BV }
pmaddwd m3, m1, coeffV2 ; (dword) { R0*RV, G1*GV + R1*RV, R2*RV, G3*GV + R3*RV }
pmaddwd m0, coeffU1 ; (dword) { B0*BU + G0*GU, B1*BU, B2*BU + G2*GU, B3*BU }
pmaddwd m1, coeffU2 ; (dword) { R0*RU, G1*GU + R1*RU, R2*RU, G3*GU + R3*RU }
paddd m0, m1 ; (dword) { Bx*BU + Gx*GU + Rx*RU }[0-3]
paddd m2, m3 ; (dword) { Bx*BV + Gx*GV + Rx*RV }[0-3]
%if cpuflag(ssse3)
pshufb m5, m4, shuf_rgb2 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
pshufb m4, shuf_rgb1 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
%else ; !cpuflag(ssse3)
%if mmsize == 16
movd m1, [srcq+18] ; (byte) { B6, G6, R6, B7 }
movd m3, [srcq+20] ; (byte) { R6, B7, G7, R7 }
punpckldq m4, m1 ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 }
punpckldq m5, m3 ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 }
%endif ; mmsize == 16 && !cpuflag(ssse3)
punpcklbw m4, m7 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
punpcklbw m5, m7 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
%endif ; cpuflag(ssse3)
add srcq, 3 * mmsize / 2
pmaddwd m1, m4, coeffU1 ; (dword) { B4*BU + G4*GU, B5*BU, B6*BU + G6*GU, B7*BU }
pmaddwd m3, m5, coeffU2 ; (dword) { R4*RU, G5*GU + R5*RU, R6*RU, G7*GU + R7*RU }
pmaddwd m4, coeffV1 ; (dword) { B4*BV + G4*GV, B5*BV, B6*BV + G6*GV, B7*BV }
pmaddwd m5, coeffV2 ; (dword) { R4*RV, G5*GV + R5*RV, R6*RV, G7*GV + R7*RV }
paddd m1, m3 ; (dword) { Bx*BU + Gx*GU + Rx*RU }[4-7]
paddd m4, m5 ; (dword) { Bx*BV + Gx*GV + Rx*RV }[4-7]
paddd m0, m6 ; += rgb_UVrnd, i.e. (dword) { U[0-3] }
paddd m2, m6 ; += rgb_UVrnd, i.e. (dword) { V[0-3] }
paddd m1, m6 ; += rgb_UVrnd, i.e. (dword) { U[4-7] }
paddd m4, m6 ; += rgb_UVrnd, i.e. (dword) { V[4-7] }
psrad m0, 9
psrad m2, 9
psrad m1, 9
psrad m4, 9
packssdw m0, m1 ; (word) { U[0-7] }
packssdw m2, m4 ; (word) { V[0-7] }
%if mmsize == 8
mova [dstUq+wq], m0
mova [dstVq+wq], m2
%else ; mmsize == 16
mova [dstUq+wq], m0
mova [dstVq+wq], m2
%endif ; mmsize == 8/16
add wq, mmsize
jl .loop
REP_RET
%endif ; ARCH_X86_64 && %0 == 3
%endmacro
; %1 = nr. of XMM registers for rgb-to-Y func
; %2 = nr. of XMM registers for rgb-to-UV func
%macro RGB24_FUNCS 2
RGB24_TO_Y_FN %1, rgb
RGB24_TO_Y_FN %1, bgr, rgb
RGB24_TO_UV_FN %2, rgb
RGB24_TO_UV_FN %2, bgr, rgb
%endmacro
%if ARCH_X86_32
INIT_MMX mmx
RGB24_FUNCS 0, 0
%endif
INIT_XMM sse2
RGB24_FUNCS 10, 12
INIT_XMM ssse3
RGB24_FUNCS 11, 13
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
RGB24_FUNCS 11, 13
%endif
; %1 = nr. of XMM registers
; %2-5 = rgba, bgra, argb or abgr (in individual characters)
%macro RGB32_TO_Y_FN 5-6
cglobal %2%3%4%5 %+ ToY, 6, 6, %1, dst, src, u1, u2, w, table
mova m5, [rgba_Ycoeff_%2%4]
mova m6, [rgba_Ycoeff_%3%5]
%if %0 == 6
jmp mangle(private_prefix %+ _ %+ %6 %+ ToY %+ SUFFIX).body
%else ; %0 == 6
.body:
%if ARCH_X86_64
movsxd wq, wd
%endif
lea srcq, [srcq+wq*4]
add wq, wq
add dstq, wq
neg wq
mova m4, [rgb_Yrnd]
pcmpeqb m7, m7
psrlw m7, 8 ; (word) { 0x00ff } x4
.loop:
; FIXME check alignment and use mova
movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
movu m2, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
DEINTB 1, 0, 3, 2, 7 ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7]
pmaddwd m1, m5 ; (dword) { Bx*BY + Rx*RY }[0-3]
pmaddwd m0, m6 ; (dword) { Gx*GY }[0-3]
pmaddwd m3, m5 ; (dword) { Bx*BY + Rx*RY }[4-7]
pmaddwd m2, m6 ; (dword) { Gx*GY }[4-7]
paddd m0, m4 ; += rgb_Yrnd
paddd m2, m4 ; += rgb_Yrnd
paddd m0, m1 ; (dword) { Y[0-3] }
paddd m2, m3 ; (dword) { Y[4-7] }
psrad m0, 9
psrad m2, 9
packssdw m0, m2 ; (word) { Y[0-7] }
mova [dstq+wq], m0
add wq, mmsize
jl .loop
REP_RET
%endif ; %0 == 6
%endmacro
; %1 = nr. of XMM registers
; %2-5 = rgba, bgra, argb or abgr (in individual characters)
%macro RGB32_TO_UV_FN 5-6
cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
%if ARCH_X86_64
mova m8, [rgba_Ucoeff_%2%4]
mova m9, [rgba_Ucoeff_%3%5]
mova m10, [rgba_Vcoeff_%2%4]
mova m11, [rgba_Vcoeff_%3%5]
%define coeffU1 m8
%define coeffU2 m9
%define coeffV1 m10
%define coeffV2 m11
%else ; x86-32
%define coeffU1 [rgba_Ucoeff_%2%4]
%define coeffU2 [rgba_Ucoeff_%3%5]
%define coeffV1 [rgba_Vcoeff_%2%4]
%define coeffV2 [rgba_Vcoeff_%3%5]
%endif ; x86-64/32
%if ARCH_X86_64 && %0 == 6
jmp mangle(private_prefix %+ _ %+ %6 %+ ToUV %+ SUFFIX).body
%else ; ARCH_X86_64 && %0 == 6
.body:
%if ARCH_X86_64
movsxd wq, dword r5m
%else ; x86-32
mov wq, r5m
%endif
add wq, wq
add dstUq, wq
add dstVq, wq
lea srcq, [srcq+wq*2]
neg wq
pcmpeqb m7, m7
psrlw m7, 8 ; (word) { 0x00ff } x4
mova m6, [rgb_UVrnd]
.loop:
; FIXME check alignment and use mova
movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
movu m4, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
DEINTB 1, 0, 5, 4, 7 ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7]
pmaddwd m3, m1, coeffV1 ; (dword) { Bx*BV + Rx*RV }[0-3]
pmaddwd m2, m0, coeffV2 ; (dword) { Gx*GV }[0-3]
pmaddwd m1, coeffU1 ; (dword) { Bx*BU + Rx*RU }[0-3]
pmaddwd m0, coeffU2 ; (dword) { Gx*GU }[0-3]
paddd m3, m6 ; += rgb_UVrnd
paddd m1, m6 ; += rgb_UVrnd
paddd m2, m3 ; (dword) { V[0-3] }
paddd m0, m1 ; (dword) { U[0-3] }
pmaddwd m3, m5, coeffV1 ; (dword) { Bx*BV + Rx*RV }[4-7]
pmaddwd m1, m4, coeffV2 ; (dword) { Gx*GV }[4-7]
pmaddwd m5, coeffU1 ; (dword) { Bx*BU + Rx*RU }[4-7]
pmaddwd m4, coeffU2 ; (dword) { Gx*GU }[4-7]
paddd m3, m6 ; += rgb_UVrnd
paddd m5, m6 ; += rgb_UVrnd
psrad m0, 9
paddd m1, m3 ; (dword) { V[4-7] }
paddd m4, m5 ; (dword) { U[4-7] }
psrad m2, 9
psrad m4, 9
psrad m1, 9
packssdw m0, m4 ; (word) { U[0-7] }
packssdw m2, m1 ; (word) { V[0-7] }
%if mmsize == 8
mova [dstUq+wq], m0
mova [dstVq+wq], m2
%else ; mmsize == 16
mova [dstUq+wq], m0
mova [dstVq+wq], m2
%endif ; mmsize == 8/16
add wq, mmsize
jl .loop
REP_RET
%endif ; ARCH_X86_64 && %0 == 6
%endmacro
; %1 = nr. of XMM registers for rgb-to-Y func
; %2 = nr. of XMM registers for rgb-to-UV func
%macro RGB32_FUNCS 2
RGB32_TO_Y_FN %1, r, g, b, a
RGB32_TO_Y_FN %1, b, g, r, a, rgba
RGB32_TO_Y_FN %1, a, r, g, b, rgba
RGB32_TO_Y_FN %1, a, b, g, r, rgba
RGB32_TO_UV_FN %2, r, g, b, a
RGB32_TO_UV_FN %2, b, g, r, a, rgba
RGB32_TO_UV_FN %2, a, r, g, b, rgba
RGB32_TO_UV_FN %2, a, b, g, r, rgba
%endmacro
%if ARCH_X86_32
INIT_MMX mmx
RGB32_FUNCS 0, 0
%endif
INIT_XMM sse2
RGB32_FUNCS 8, 12
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
RGB32_FUNCS 8, 12
%endif
;-----------------------------------------------------------------------------
; YUYV/UYVY/NV12/NV21 packed pixel shuffling.
;
; void <fmt>ToY_<opt>(uint8_t *dst, const uint8_t *src, int w);
; and
; void <fmt>toUV_<opt>(uint8_t *dstU, uint8_t *dstV, const uint8_t *src,
; const uint8_t *unused, int w);
;-----------------------------------------------------------------------------
; %1 = a (aligned) or u (unaligned)
; %2 = yuyv or uyvy
%macro LOOP_YUYV_TO_Y 2
.loop_%1:
mov%1 m0, [srcq+wq*2] ; (byte) { Y0, U0, Y1, V0, ... }
mov%1 m1, [srcq+wq*2+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
%ifidn %2, yuyv
pand m0, m2 ; (word) { Y0, Y1, ..., Y7 }
pand m1, m2 ; (word) { Y8, Y9, ..., Y15 }
%else ; uyvy
psrlw m0, 8 ; (word) { Y0, Y1, ..., Y7 }
psrlw m1, 8 ; (word) { Y8, Y9, ..., Y15 }
%endif ; yuyv/uyvy
packuswb m0, m1 ; (byte) { Y0, ..., Y15 }
mova [dstq+wq], m0
add wq, mmsize
jl .loop_%1
REP_RET
%endmacro
; %1 = nr. of XMM registers
; %2 = yuyv or uyvy
; %3 = if specified, it means that unaligned and aligned code in loop
; will be the same (i.e. YUYV+AVX), and thus we don't need to
; split the loop in an aligned and unaligned case
%macro YUYV_TO_Y_FN 2-3
cglobal %2ToY, 5, 5, %1, dst, unused0, unused1, src, w
%if ARCH_X86_64
movsxd wq, wd
%endif
add dstq, wq
%if mmsize == 16
test srcq, 15
%endif
lea srcq, [srcq+wq*2]
%ifidn %2, yuyv
pcmpeqb m2, m2 ; (byte) { 0xff } x 16
psrlw m2, 8 ; (word) { 0x00ff } x 8
%endif ; yuyv
%if mmsize == 16
jnz .loop_u_start
neg wq
LOOP_YUYV_TO_Y a, %2
.loop_u_start:
neg wq
LOOP_YUYV_TO_Y u, %2
%else ; mmsize == 8
neg wq
LOOP_YUYV_TO_Y a, %2
%endif ; mmsize == 8/16
%endmacro
; %1 = a (aligned) or u (unaligned)
; %2 = yuyv or uyvy
%macro LOOP_YUYV_TO_UV 2
.loop_%1:
%ifidn %2, yuyv
mov%1 m0, [srcq+wq*4] ; (byte) { Y0, U0, Y1, V0, ... }
mov%1 m1, [srcq+wq*4+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
psrlw m0, 8 ; (word) { U0, V0, ..., U3, V3 }
psrlw m1, 8 ; (word) { U4, V4, ..., U7, V7 }
%else ; uyvy
%if cpuflag(avx)
vpand m0, m2, [srcq+wq*4] ; (word) { U0, V0, ..., U3, V3 }
vpand m1, m2, [srcq+wq*4+mmsize] ; (word) { U4, V4, ..., U7, V7 }
%else
mov%1 m0, [srcq+wq*4] ; (byte) { Y0, U0, Y1, V0, ... }
mov%1 m1, [srcq+wq*4+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
pand m0, m2 ; (word) { U0, V0, ..., U3, V3 }
pand m1, m2 ; (word) { U4, V4, ..., U7, V7 }
%endif
%endif ; yuyv/uyvy
packuswb m0, m1 ; (byte) { U0, V0, ..., U7, V7 }
pand m1, m0, m2 ; (word) { U0, U1, ..., U7 }
psrlw m0, 8 ; (word) { V0, V1, ..., V7 }
%if mmsize == 16
packuswb m1, m0 ; (byte) { U0, ... U7, V0, ... V7 }
movh [dstUq+wq], m1
movhps [dstVq+wq], m1
%else ; mmsize == 8
packuswb m1, m1 ; (byte) { U0, ... U3 }
packuswb m0, m0 ; (byte) { V0, ... V3 }
movh [dstUq+wq], m1
movh [dstVq+wq], m0
%endif ; mmsize == 8/16
add wq, mmsize / 2
jl .loop_%1
REP_RET
%endmacro
; %1 = nr. of XMM registers
; %2 = yuyv or uyvy
; %3 = if specified, it means that unaligned and aligned code in loop
; will be the same (i.e. UYVY+AVX), and thus we don't need to
; split the loop in an aligned and unaligned case
%macro YUYV_TO_UV_FN 2-3
cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
%if ARCH_X86_64
movsxd wq, dword r5m
%else ; x86-32
mov wq, r5m
%endif
add dstUq, wq
add dstVq, wq
%if mmsize == 16 && %0 == 2
test srcq, 15
%endif
lea srcq, [srcq+wq*4]
pcmpeqb m2, m2 ; (byte) { 0xff } x 16
psrlw m2, 8 ; (word) { 0x00ff } x 8
; NOTE: if uyvy+avx, u/a are identical
%if mmsize == 16 && %0 == 2
jnz .loop_u_start
neg wq
LOOP_YUYV_TO_UV a, %2
.loop_u_start:
neg wq
LOOP_YUYV_TO_UV u, %2
%else ; mmsize == 8
neg wq
LOOP_YUYV_TO_UV a, %2
%endif ; mmsize == 8/16
%endmacro
; %1 = a (aligned) or u (unaligned)
; %2 = nv12 or nv21
%macro LOOP_NVXX_TO_UV 2
.loop_%1:
mov%1 m0, [srcq+wq*2] ; (byte) { U0, V0, U1, V1, ... }
mov%1 m1, [srcq+wq*2+mmsize] ; (byte) { U8, V8, U9, V9, ... }
pand m2, m0, m5 ; (word) { U0, U1, ..., U7 }
pand m3, m1, m5 ; (word) { U8, U9, ..., U15 }
psrlw m0, 8 ; (word) { V0, V1, ..., V7 }
psrlw m1, 8 ; (word) { V8, V9, ..., V15 }
packuswb m2, m3 ; (byte) { U0, ..., U15 }
packuswb m0, m1 ; (byte) { V0, ..., V15 }
%ifidn %2, nv12
mova [dstUq+wq], m2
mova [dstVq+wq], m0
%else ; nv21
mova [dstVq+wq], m2
mova [dstUq+wq], m0
%endif ; nv12/21
add wq, mmsize
jl .loop_%1
REP_RET
%endmacro
; %1 = nr. of XMM registers
; %2 = nv12 or nv21
%macro NVXX_TO_UV_FN 2
cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
%if ARCH_X86_64
movsxd wq, dword r5m
%else ; x86-32
mov wq, r5m
%endif
add dstUq, wq
add dstVq, wq
%if mmsize == 16
test srcq, 15
%endif
lea srcq, [srcq+wq*2]
pcmpeqb m5, m5 ; (byte) { 0xff } x 16
psrlw m5, 8 ; (word) { 0x00ff } x 8
%if mmsize == 16
jnz .loop_u_start
neg wq
LOOP_NVXX_TO_UV a, %2
.loop_u_start:
neg wq
LOOP_NVXX_TO_UV u, %2
%else ; mmsize == 8
neg wq
LOOP_NVXX_TO_UV a, %2
%endif ; mmsize == 8/16
%endmacro
%if ARCH_X86_32
INIT_MMX mmx
YUYV_TO_Y_FN 0, yuyv
YUYV_TO_Y_FN 0, uyvy
YUYV_TO_UV_FN 0, yuyv
YUYV_TO_UV_FN 0, uyvy
NVXX_TO_UV_FN 0, nv12
NVXX_TO_UV_FN 0, nv21
%endif
INIT_XMM sse2
YUYV_TO_Y_FN 3, yuyv
YUYV_TO_Y_FN 2, uyvy
YUYV_TO_UV_FN 3, yuyv
YUYV_TO_UV_FN 3, uyvy
NVXX_TO_UV_FN 5, nv12
NVXX_TO_UV_FN 5, nv21
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
; in theory, we could write a yuy2-to-y using vpand (i.e. AVX), but
; that's not faster in practice
YUYV_TO_UV_FN 3, yuyv
YUYV_TO_UV_FN 3, uyvy, 1
NVXX_TO_UV_FN 5, nv12
NVXX_TO_UV_FN 5, nv21
%endif
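; For reference, the 24ToY kernels above mirror this scalar C loop (a sketch;
; the function name is illustrative, but the constants are the RY/GY/BY words
; and the rgb_Yrnd bias from SECTION_RODATA, with the same >>9 at the end):
;
;     static void rgb24_to_y_ref(int16_t *dst, const uint8_t *src, int w)
;     {
;         for (int i = 0; i < w; i++) {
;             int r = src[3 * i + 0], g = src[3 * i + 1], b = src[3 * i + 2];
;             dst[i] = (0x20DE * r + 0x4087 * g + 0x0C88 * b + 0x80100) >> 9;
;         }
;     }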

View File

@@ -0,0 +1,413 @@
;******************************************************************************
;* x86-optimized vertical line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;* Kieran Kunhya <kieran@kunhya.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
minshort: times 8 dw 0x8000
yuv2yuvX_16_start: times 4 dd 0x4000 - 0x40000000
yuv2yuvX_10_start: times 4 dd 0x10000
yuv2yuvX_9_start: times 4 dd 0x20000
yuv2yuvX_10_upper: times 8 dw 0x3ff
yuv2yuvX_9_upper: times 8 dw 0x1ff
pd_4: times 4 dd 4
pd_4min0x40000:times 4 dd 4 - (0x40000)
pw_16: times 8 dw 16
pw_32: times 8 dw 32
pw_512: times 8 dw 512
pw_1024: times 8 dw 1024
SECTION .text
;-----------------------------------------------------------------------------
; vertical line scaling
;
; void yuv2plane1_<output_size>_<opt>(const int16_t *src, uint8_t *dst, int dstW,
; const uint8_t *dither, int offset)
; and
; void yuv2planeX_<output_size>_<opt>(const int16_t *filter, int filterSize,
; const int16_t **src, uint8_t *dst, int dstW,
; const uint8_t *dither, int offset)
;
; Scale one or $filterSize lines of source data to generate one line of output
; data. The input is 15-bit in int16_t if $output_size is [8,10] and 19-bit in
; int32_t if $output_size is 16. $filter is 12-bits. $filterSize is a multiple
; of 2. $offset is either 0 or 3. $dither holds 8 values.
;-----------------------------------------------------------------------------
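; Scalar reference for the 8-bit yuv2planeX case (a sketch of the C
; fallback; av_clip_uint8() is the usual libavutil helper):
;
;     for (i = 0; i < dstW; i++) {
;         int val = dither[(i + offset) & 7] << 12;
;         for (j = 0; j < filterSize; j++)
;             val += src[j][i] * filter[j];
;         dest[i] = av_clip_uint8(val >> 19);
;     }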
%macro yuv2planeX_fn 3
%if ARCH_X86_32
%define cntr_reg fltsizeq
%define movsx mov
%else
%define cntr_reg r7
%define movsx movsxd
%endif
cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
%if %1 == 8 || %1 == 9 || %1 == 10
pxor m6, m6
%endif ; %1 == 8/9/10
%if %1 == 8
%if ARCH_X86_32
%assign pad 0x2c - (stack_offset & 15)
SUB rsp, pad
%define m_dith m7
%else ; x86-64
%define m_dith m9
%endif ; x86-32
; create registers holding dither
movq m_dith, [ditherq] ; dither
test offsetd, offsetd
jz .no_rot
%if mmsize == 16
punpcklqdq m_dith, m_dith
%endif ; mmsize == 16
PALIGNR m_dith, m_dith, 3, m0
.no_rot:
%if mmsize == 16
punpcklbw m_dith, m6
%if ARCH_X86_64
punpcklwd m8, m_dith, m6
pslld m8, 12
%else ; x86-32
punpcklwd m5, m_dith, m6
pslld m5, 12
%endif ; x86-32/64
punpckhwd m_dith, m6
pslld m_dith, 12
%if ARCH_X86_32
mova [rsp+ 0], m5
mova [rsp+16], m_dith
%endif
%else ; mmsize == 8
punpcklbw m5, m_dith, m6
punpckhbw m_dith, m6
punpcklwd m4, m5, m6
punpckhwd m5, m6
punpcklwd m3, m_dith, m6
punpckhwd m_dith, m6
pslld m4, 12
pslld m5, 12
pslld m3, 12
pslld m_dith, 12
mova [rsp+ 0], m4
mova [rsp+ 8], m5
mova [rsp+16], m3
mova [rsp+24], m_dith
%endif ; mmsize == 8/16
%endif ; %1 == 8
xor r5, r5
.pixelloop:
%assign %%i 0
; the rep here is for the 8bit output mmx case, where dither covers
; 8 pixels but we can only handle 2 pixels per register, and thus 4
; pixels per iteration. In order to not have to keep track of where
; we are w.r.t. dithering, we unroll the mmx/8bit loop x2.
%if %1 == 8
%assign %%repcnt 16/mmsize
%else
%assign %%repcnt 1
%endif
%rep %%repcnt
%if %1 == 8
%if ARCH_X86_32
mova m2, [rsp+mmsize*(0+%%i)]
mova m1, [rsp+mmsize*(1+%%i)]
%else ; x86-64
mova m2, m8
mova m1, m_dith
%endif ; x86-32/64
%else ; %1 == 9/10/16
mova m1, [yuv2yuvX_%1_start]
mova m2, m1
%endif ; %1 == 8/9/10/16
movsx cntr_reg, fltsizem
.filterloop_ %+ %%i:
; input pixels
mov r6, [srcq+gprsize*cntr_reg-2*gprsize]
%if %1 == 16
mova m3, [r6+r5*4]
mova m5, [r6+r5*4+mmsize]
%else ; %1 == 8/9/10
mova m3, [r6+r5*2]
%endif ; %1 == 8/9/10/16
mov r6, [srcq+gprsize*cntr_reg-gprsize]
%if %1 == 16
mova m4, [r6+r5*4]
mova m6, [r6+r5*4+mmsize]
%else ; %1 == 8/9/10
mova m4, [r6+r5*2]
%endif ; %1 == 8/9/10/16
; coefficients
movd m0, [filterq+2*cntr_reg-4] ; coeff[0], coeff[1]
%if %1 == 16
pshuflw m7, m0, 0 ; coeff[0]
pshuflw m0, m0, 0x55 ; coeff[1]
pmovsxwd m7, m7 ; word -> dword
pmovsxwd m0, m0 ; word -> dword
pmulld m3, m7
pmulld m5, m7
pmulld m4, m0
pmulld m6, m0
paddd m2, m3
paddd m1, m5
paddd m2, m4
paddd m1, m6
%else ; %1 == 10/9/8
punpcklwd m5, m3, m4
punpckhwd m3, m4
SPLATD m0
pmaddwd m5, m0
pmaddwd m3, m0
paddd m2, m5
paddd m1, m3
%endif ; %1 == 8/9/10/16
sub cntr_reg, 2
jg .filterloop_ %+ %%i
%if %1 == 16
psrad m2, 31 - %1
psrad m1, 31 - %1
%else ; %1 == 10/9/8
psrad m2, 27 - %1
psrad m1, 27 - %1
%endif ; %1 == 8/9/10/16
%if %1 == 8
packssdw m2, m1
packuswb m2, m2
movh [dstq+r5*1], m2
%else ; %1 == 9/10/16
%if %1 == 16
packssdw m2, m1
paddw m2, [minshort]
%else ; %1 == 9/10
%if cpuflag(sse4)
packusdw m2, m1
%else ; mmxext/sse2
packssdw m2, m1
pmaxsw m2, m6
%endif ; mmxext/sse2/sse4/avx
pminsw m2, [yuv2yuvX_%1_upper]
%endif ; %1 == 9/10/16
mova [dstq+r5*2], m2
%endif ; %1 == 8/9/10/16
add r5, mmsize/2
sub wd, mmsize/2
%assign %%i %%i+2
%endrep
jg .pixelloop
%if %1 == 8
%if ARCH_X86_32
ADD rsp, pad
RET
%else ; x86-64
REP_RET
%endif ; x86-32/64
%else ; %1 == 9/10/16
REP_RET
%endif ; %1 == 8/9/10/16
%endmacro
%if ARCH_X86_32
INIT_MMX mmxext
yuv2planeX_fn 8, 0, 7
yuv2planeX_fn 9, 0, 5
yuv2planeX_fn 10, 0, 5
%endif
INIT_XMM sse2
yuv2planeX_fn 8, 10, 7
yuv2planeX_fn 9, 7, 5
yuv2planeX_fn 10, 7, 5
INIT_XMM sse4
yuv2planeX_fn 8, 10, 7
yuv2planeX_fn 9, 7, 5
yuv2planeX_fn 10, 7, 5
yuv2planeX_fn 16, 8, 5
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2planeX_fn 8, 10, 7
yuv2planeX_fn 9, 7, 5
yuv2planeX_fn 10, 7, 5
%endif
; %1=output-bpc, %2=alignment (u/a)
%macro yuv2plane1_mainloop 2
.loop_%2:
%if %1 == 8
paddsw m0, m2, [srcq+wq*2+mmsize*0]
paddsw m1, m3, [srcq+wq*2+mmsize*1]
psraw m0, 7
psraw m1, 7
packuswb m0, m1
mov%2 [dstq+wq], m0
%elif %1 == 16
paddd m0, m4, [srcq+wq*4+mmsize*0]
paddd m1, m4, [srcq+wq*4+mmsize*1]
paddd m2, m4, [srcq+wq*4+mmsize*2]
paddd m3, m4, [srcq+wq*4+mmsize*3]
psrad m0, 3
psrad m1, 3
psrad m2, 3
psrad m3, 3
%if cpuflag(sse4) ; avx/sse4
packusdw m0, m1
packusdw m2, m3
%else ; mmx/sse2
packssdw m0, m1
packssdw m2, m3
paddw m0, m5
paddw m2, m5
%endif ; mmx/sse2/sse4/avx
mov%2 [dstq+wq*2+mmsize*0], m0
mov%2 [dstq+wq*2+mmsize*1], m2
%else ; %1 == 9/10
paddsw m0, m2, [srcq+wq*2+mmsize*0]
paddsw m1, m2, [srcq+wq*2+mmsize*1]
psraw m0, 15 - %1
psraw m1, 15 - %1
pmaxsw m0, m4
pmaxsw m1, m4
pminsw m0, m3
pminsw m1, m3
mov%2 [dstq+wq*2+mmsize*0], m0
mov%2 [dstq+wq*2+mmsize*1], m1
%endif
add wq, mmsize
jl .loop_%2
%endmacro
%macro yuv2plane1_fn 3
cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset
movsxdifnidn wq, wd
add wq, mmsize - 1
and wq, ~(mmsize - 1)
%if %1 == 8
add dstq, wq
%else ; %1 != 8
lea dstq, [dstq+wq*2]
%endif ; %1 == 8
%if %1 == 16
lea srcq, [srcq+wq*4]
%else ; %1 != 16
lea srcq, [srcq+wq*2]
%endif ; %1 == 16
neg wq
%if %1 == 8
pxor m4, m4 ; zero
; create registers holding dither
movq m3, [ditherq] ; dither
test offsetd, offsetd
jz .no_rot
%if mmsize == 16
punpcklqdq m3, m3
%endif ; mmsize == 16
PALIGNR m3, m3, 3, m2
.no_rot:
%if mmsize == 8
mova m2, m3
punpckhbw m3, m4 ; byte->word
punpcklbw m2, m4 ; byte->word
%else
punpcklbw m3, m4
mova m2, m3
%endif
%elif %1 == 9
pxor m4, m4
mova m3, [pw_512]
mova m2, [pw_32]
%elif %1 == 10
pxor m4, m4
mova m3, [pw_1024]
mova m2, [pw_16]
%else ; %1 == 16
%if cpuflag(sse4) ; sse4/avx
mova m4, [pd_4]
%else ; mmx/sse2
mova m4, [pd_4min0x40000]
mova m5, [minshort]
%endif ; mmx/sse2/sse4/avx
%endif ; %1 == ..
; actual pixel scaling
%if mmsize == 8
yuv2plane1_mainloop %1, a
%else ; mmsize == 16
test dstq, 15
jnz .unaligned
yuv2plane1_mainloop %1, a
REP_RET
.unaligned:
yuv2plane1_mainloop %1, u
%endif ; mmsize == 8/16
REP_RET
%endmacro
%if ARCH_X86_32
INIT_MMX mmx
yuv2plane1_fn 8, 0, 5
yuv2plane1_fn 16, 0, 3
INIT_MMX mmxext
yuv2plane1_fn 9, 0, 3
yuv2plane1_fn 10, 0, 3
%endif
INIT_XMM sse2
yuv2plane1_fn 8, 5, 5
yuv2plane1_fn 9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 6, 3
INIT_XMM sse4
yuv2plane1_fn 16, 5, 3
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2plane1_fn 8, 5, 5
yuv2plane1_fn 9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 5, 3
%endif
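; For reference, the 8-bit yuv2plane1 kernels above correspond to this
; scalar C loop (a sketch of the C fallback: add the 8-entry dither,
; shift the 15-bit intermediate down by 7, clip to a byte):
;
;     for (i = 0; i < dstW; i++) {
;         int val = src[i] + dither[(i + offset) & 7];
;         dest[i] = av_clip_uint8(val >> 7);
;     }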

View File

@@ -0,0 +1,148 @@
/*
* software RGB to RGB converter
* pluralize by software PAL8 to RGB converter
* software YUV to YUV converter
* software YUV to RGB converter
* Written by Nick Kurshev.
* palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/cpu.h"
#include "libavutil/bswap.h"
#include "libswscale/rgb2rgb.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#if HAVE_INLINE_ASM
DECLARE_ASM_CONST(8, uint64_t, mmx_ff) = 0x00000000000000FFULL;
DECLARE_ASM_CONST(8, uint64_t, mmx_null) = 0x0000000000000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mmx_one) = 0xFFFFFFFFFFFFFFFFULL;
DECLARE_ASM_CONST(8, uint64_t, mask32b) = 0x000000FF000000FFULL;
DECLARE_ASM_CONST(8, uint64_t, mask32g) = 0x0000FF000000FF00ULL;
DECLARE_ASM_CONST(8, uint64_t, mask32r) = 0x00FF000000FF0000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask32a) = 0xFF000000FF000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask32) = 0x00FFFFFF00FFFFFFULL;
DECLARE_ASM_CONST(8, uint64_t, mask3216br) = 0x00F800F800F800F8ULL;
DECLARE_ASM_CONST(8, uint64_t, mask3216g) = 0x0000FC000000FC00ULL;
DECLARE_ASM_CONST(8, uint64_t, mask3215g) = 0x0000F8000000F800ULL;
DECLARE_ASM_CONST(8, uint64_t, mul3216) = 0x2000000420000004ULL;
DECLARE_ASM_CONST(8, uint64_t, mul3215) = 0x2000000820000008ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24b) = 0x00FF0000FF0000FFULL;
DECLARE_ASM_CONST(8, uint64_t, mask24g) = 0xFF0000FF0000FF00ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24r) = 0x0000FF0000FF0000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24l) = 0x0000000000FFFFFFULL;
DECLARE_ASM_CONST(8, uint64_t, mask24h) = 0x0000FFFFFF000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24hh) = 0xffff000000000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24hhh) = 0xffffffff00000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24hhhh) = 0xffffffffffff0000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask15b) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */
DECLARE_ASM_CONST(8, uint64_t, mask15rg) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */
DECLARE_ASM_CONST(8, uint64_t, mask15s) = 0xFFE0FFE0FFE0FFE0ULL;
DECLARE_ASM_CONST(8, uint64_t, mask15g) = 0x03E003E003E003E0ULL;
DECLARE_ASM_CONST(8, uint64_t, mask15r) = 0x7C007C007C007C00ULL;
#define mask16b mask15b
DECLARE_ASM_CONST(8, uint64_t, mask16g) = 0x07E007E007E007E0ULL;
DECLARE_ASM_CONST(8, uint64_t, mask16r) = 0xF800F800F800F800ULL;
DECLARE_ASM_CONST(8, uint64_t, red_16mask) = 0x0000f8000000f800ULL;
DECLARE_ASM_CONST(8, uint64_t, green_16mask) = 0x000007e0000007e0ULL;
DECLARE_ASM_CONST(8, uint64_t, blue_16mask) = 0x0000001f0000001fULL;
DECLARE_ASM_CONST(8, uint64_t, red_15mask) = 0x00007c0000007c00ULL;
DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL;
DECLARE_ASM_CONST(8, uint64_t, blue_15mask) = 0x0000001f0000001fULL;
DECLARE_ASM_CONST(8, uint64_t, mul15_mid) = 0x4200420042004200ULL;
DECLARE_ASM_CONST(8, uint64_t, mul15_hi) = 0x0210021002100210ULL;
DECLARE_ASM_CONST(8, uint64_t, mul16_mid) = 0x2080208020802080ULL;
#define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
#define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
#define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
#define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
#define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
#define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
#define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
#define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
#define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
// Note: We have C, MMX, MMXEXT, 3DNOW versions, there is no 3DNOW + MMXEXT one.
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_AMD3DNOW 0
#define COMPILE_TEMPLATE_SSE2 0
//MMX versions
#undef RENAME
#define RENAME(a) a ## _MMX
#include "rgb2rgb_template.c"
// MMXEXT versions
#undef RENAME
#undef COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 1
#define RENAME(a) a ## _MMXEXT
#include "rgb2rgb_template.c"
//SSE2 versions
#undef RENAME
#undef COMPILE_TEMPLATE_SSE2
#define COMPILE_TEMPLATE_SSE2 1
#define RENAME(a) a ## _SSE2
#include "rgb2rgb_template.c"
//3DNOW versions
#undef RENAME
#undef COMPILE_TEMPLATE_MMXEXT
#undef COMPILE_TEMPLATE_SSE2
#undef COMPILE_TEMPLATE_AMD3DNOW
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_SSE2 0
#define COMPILE_TEMPLATE_AMD3DNOW 1
#define RENAME(a) a ## _3DNOW
#include "rgb2rgb_template.c"
/*
RGB15->RGB16 original by Strepto/Astral
ported to gcc & bugfixed : A'rpi
MMXEXT, 3DNOW optimization by Nick Kurshev
32-bit C version, and the and&add trick by Michael Niedermayer
*/
#endif /* HAVE_INLINE_ASM */
av_cold void rgb2rgb_init_x86(void)
{
#if HAVE_INLINE_ASM
int cpu_flags = av_get_cpu_flags();
if (INLINE_MMX(cpu_flags))
rgb2rgb_init_MMX();
if (INLINE_AMD3DNOW(cpu_flags))
rgb2rgb_init_3DNOW();
if (INLINE_MMXEXT(cpu_flags))
rgb2rgb_init_MMXEXT();
if (INLINE_SSE2(cpu_flags))
rgb2rgb_init_SSE2();
#endif /* HAVE_INLINE_ASM */
}
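/* The repeated #includes of rgb2rgb_template.c above rely on token pasting:
 * each pass redefines RENAME() so that every function in the template gets
 * an instruction-set suffix. A minimal sketch of the pattern (the function
 * name here is hypothetical):
 *
 *     #define RENAME(a) a ## _SSE2
 *     static void RENAME(shuffle)(void);   // expands to shuffle_SSE2
 *
 * rgb2rgb_init_x86() then installs the best variants at runtime based on
 * av_get_cpu_flags(). */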

File diff suppressed because it is too large

View File

@@ -0,0 +1,431 @@
;******************************************************************************
;* x86-optimized horizontal line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
max_19bit_int: times 4 dd 0x7ffff
max_19bit_flt: times 4 dd 524287.0
minshort: times 8 dw 0x8000
unicoeff: times 4 dd 0x20000000
SECTION .text
;-----------------------------------------------------------------------------
; horizontal line scaling
;
; void hscale<source_width>to<intermediate_nbits>_<filterSize>_<opt>
; (SwsContext *c, int{16,32}_t *dst,
; int dstW, const uint{8,16}_t *src,
; const int16_t *filter,
; const int32_t *filterPos, int filterSize);
;
; Scale one horizontal line. Input is either 8-bits width or 16-bits width
; ($source_width can be 8, 9, 10, 12, 14 or 16; the difference is whether we have to
; downscale before multiplying). Filter is 14-bits. Output is either 15bits
; (in int16_t) or 19bits (in int32_t), as given in $intermediate_nbits. Each
; output pixel is generated from $filterSize input pixels, the position of
; the first pixel is given in filterPos[nOutputPixel].
;-----------------------------------------------------------------------------
; SCALE_FUNC source_width, intermediate_nbits, filtersize, filtersuffix, n_args, n_xmm
%macro SCALE_FUNC 6
%ifnidn %3, X
cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, src, filter, fltpos, pos1
%else
cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize
%endif
%if ARCH_X86_64
movsxd wq, wd
%define mov32 movsxd
%else ; x86-32
%define mov32 mov
%endif ; x86-64
%if %2 == 19
%if mmsize == 8 ; mmx
mova m2, [max_19bit_int]
%elif cpuflag(sse4)
mova m2, [max_19bit_int]
%else ; ssse3/sse2
mova m2, [max_19bit_flt]
%endif ; mmx/sse2/ssse3/sse4
%endif ; %2 == 19
%if %1 == 16
mova m6, [minshort]
mova m7, [unicoeff]
%elif %1 == 8
pxor m3, m3
%endif ; %1 == 8/16
%if %1 == 8
%define movlh movd
%define movbh movh
%define srcmul 1
%else ; %1 == 9-16
%define movlh movq
%define movbh movu
%define srcmul 2
%endif ; %1 == 8/9-16
%ifnidn %3, X
; setup loop
%if %3 == 8
shl wq, 1 ; this allows *16 (i.e. now *8) in lea instructions for the 8-tap filter
%define wshr 1
%else ; %3 == 4
%define wshr 0
%endif ; %3 == 8
lea filterq, [filterq+wq*8]
%if %2 == 15
lea dstq, [dstq+wq*(2>>wshr)]
%else ; %2 == 19
lea dstq, [dstq+wq*(4>>wshr)]
%endif ; %2 == 15/19
lea fltposq, [fltposq+wq*(4>>wshr)]
neg wq
.loop:
%if %3 == 4 ; filterSize == 4 scaling
; load 2x4 or 4x4 source pixels into m0/m1
mov32 pos0q, dword [fltposq+wq*4+ 0] ; filterPos[0]
mov32 pos1q, dword [fltposq+wq*4+ 4] ; filterPos[1]
movlh m0, [srcq+pos0q*srcmul] ; src[filterPos[0] + {0,1,2,3}]
%if mmsize == 8
movlh m1, [srcq+pos1q*srcmul] ; src[filterPos[1] + {0,1,2,3}]
%else ; mmsize == 16
%if %1 > 8
movhps m0, [srcq+pos1q*srcmul] ; src[filterPos[1] + {0,1,2,3}]
%else ; %1 == 8
movd m4, [srcq+pos1q*srcmul] ; src[filterPos[1] + {0,1,2,3}]
%endif
mov32 pos0q, dword [fltposq+wq*4+ 8] ; filterPos[2]
mov32 pos1q, dword [fltposq+wq*4+12] ; filterPos[3]
movlh m1, [srcq+pos0q*srcmul] ; src[filterPos[2] + {0,1,2,3}]
%if %1 > 8
movhps m1, [srcq+pos1q*srcmul] ; src[filterPos[3] + {0,1,2,3}]
%else ; %1 == 8
movd m5, [srcq+pos1q*srcmul] ; src[filterPos[3] + {0,1,2,3}]
punpckldq m0, m4
punpckldq m1, m5
%endif ; %1 == 8
%endif ; mmsize == 8/16
%if %1 == 8
punpcklbw m0, m3 ; byte -> word
punpcklbw m1, m3 ; byte -> word
%endif ; %1 == 8
; multiply with filter coefficients
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
; add back 0x8000 * sum(coeffs) after the horizontal add
psubw m0, m6
psubw m1, m6
%endif ; %1 == 16
pmaddwd m0, [filterq+wq*8+mmsize*0] ; *= filter[{0,1,..,6,7}]
pmaddwd m1, [filterq+wq*8+mmsize*1] ; *= filter[{8,9,..,14,15}]
; add up horizontally (4 srcpix * 4 coefficients -> 1 dstpix)
%if mmsize == 8 ; mmx
movq m4, m0
punpckldq m0, m1
punpckhdq m4, m1
paddd m0, m4
%elif notcpuflag(ssse3) ; sse2
mova m4, m0
shufps m0, m1, 10001000b
shufps m4, m1, 11011101b
paddd m0, m4
%else ; ssse3/sse4
phaddd m0, m1 ; filter[{ 0, 1, 2, 3}]*src[filterPos[0]+{0,1,2,3}],
; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}],
; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}],
; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}]
%endif ; mmx/sse2/ssse3/sse4
%else ; %3 == 8, i.e. filterSize == 8 scaling
; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5
mov32 pos0q, dword [fltposq+wq*2+0] ; filterPos[0]
mov32 pos1q, dword [fltposq+wq*2+4] ; filterPos[1]
movbh m0, [srcq+ pos0q *srcmul] ; src[filterPos[0] + {0,1,2,3,4,5,6,7}]
%if mmsize == 8
movbh m1, [srcq+(pos0q+4)*srcmul] ; src[filterPos[0] + {4,5,6,7}]
movbh m4, [srcq+ pos1q *srcmul] ; src[filterPos[1] + {0,1,2,3}]
movbh m5, [srcq+(pos1q+4)*srcmul] ; src[filterPos[1] + {4,5,6,7}]
%else ; mmsize == 16
movbh m1, [srcq+ pos1q *srcmul] ; src[filterPos[1] + {0,1,2,3,4,5,6,7}]
mov32 pos0q, dword [fltposq+wq*2+8] ; filterPos[2]
mov32 pos1q, dword [fltposq+wq*2+12] ; filterPos[3]
movbh m4, [srcq+ pos0q *srcmul] ; src[filterPos[2] + {0,1,2,3,4,5,6,7}]
movbh m5, [srcq+ pos1q *srcmul] ; src[filterPos[3] + {0,1,2,3,4,5,6,7}]
%endif ; mmsize == 8/16
%if %1 == 8
punpcklbw m0, m3 ; byte -> word
punpcklbw m1, m3 ; byte -> word
punpcklbw m4, m3 ; byte -> word
punpcklbw m5, m3 ; byte -> word
%endif ; %1 == 8
; multiply
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
; add back 0x8000 * sum(coeffs) after the horizontal add
psubw m0, m6
psubw m1, m6
psubw m4, m6
psubw m5, m6
%endif ; %1 == 16
pmaddwd m0, [filterq+wq*8+mmsize*0] ; *= filter[{0,1,..,6,7}]
pmaddwd m1, [filterq+wq*8+mmsize*1] ; *= filter[{8,9,..,14,15}]
pmaddwd m4, [filterq+wq*8+mmsize*2] ; *= filter[{16,17,..,22,23}]
pmaddwd m5, [filterq+wq*8+mmsize*3] ; *= filter[{24,25,..,30,31}]
; add up horizontally (8 srcpix * 8 coefficients -> 1 dstpix)
%if mmsize == 8
paddd m0, m1
paddd m4, m5
movq m1, m0
punpckldq m0, m4
punpckhdq m1, m4
paddd m0, m1
%elif notcpuflag(ssse3) ; sse2
%if %1 == 8
%define mex m6
%else
%define mex m3
%endif
; emulate horizontal add as transpose + vertical add
mova mex, m0
punpckldq m0, m1
punpckhdq mex, m1
paddd m0, mex
mova m1, m4
punpckldq m4, m5
punpckhdq m1, m5
paddd m4, m1
mova m1, m0
punpcklqdq m0, m4
punpckhqdq m1, m4
paddd m0, m1
%else ; ssse3/sse4
; FIXME if we rearrange the filter in pairs of 4, we can
; load pixels likewise and use 2 x paddd + phaddd instead
; of 3 x phaddd here, faster on older cpus
phaddd m0, m1
phaddd m4, m5
phaddd m0, m4 ; filter[{ 0, 1,..., 6, 7}]*src[filterPos[0]+{0,1,...,6,7}],
; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}],
; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}],
; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}]
%endif ; mmx/sse2/ssse3/sse4
%endif ; %3 == 4/8
%else ; %3 == X, i.e. any filterSize scaling
%ifidn %4, X4
%define dlt 4
%else ; %4 == X || %4 == X8
%define dlt 0
%endif ; %4 ==/!= X4
%if ARCH_X86_64
%define srcq r8
%define pos1q r7
%define srcendq r9
movsxd fltsizeq, fltsized ; filterSize
lea srcendq, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
%else ; x86-32
%define srcq srcmemq
%define pos1q dstq
%define srcendq r6m
lea pos0q, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
mov srcendq, pos0q
%endif ; x86-32/64
lea fltposq, [fltposq+wq*4]
%if %2 == 15
lea dstq, [dstq+wq*2]
%else ; %2 == 19
lea dstq, [dstq+wq*4]
%endif ; %2 == 15/19
movifnidn dstmp, dstq
neg wq
.loop:
mov32 pos0q, dword [fltposq+wq*4+0] ; filterPos[0]
mov32 pos1q, dword [fltposq+wq*4+4] ; filterPos[1]
; FIXME maybe do 4px/iteration on x86-64 (x86-32 wouldn't have enough regs)?
pxor m4, m4
pxor m5, m5
mov srcq, srcmemmp
.innerloop:
; load 2x4 (mmx) or 2x8 (sse) source pixels into m0/m1 -> m4/m5
movbh m0, [srcq+ pos0q *srcmul] ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}]
movbh m1, [srcq+(pos1q+dlt)*srcmul] ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}]
%if %1 == 8
punpcklbw m0, m3
punpcklbw m1, m3
%endif ; %1 == 8
; multiply
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
; add back 0x8000 * sum(coeffs) after the horizontal add
psubw m0, m6
psubw m1, m6
%endif ; %1 == 16
pmaddwd m0, [filterq] ; filter[{0,1,2,3(,4,5,6,7)}]
pmaddwd m1, [filterq+(fltsizeq+dlt)*2]; filter[filtersize+{0,1,2,3(,4,5,6,7)}]
paddd m4, m0
paddd m5, m1
add filterq, mmsize
add srcq, srcmul*mmsize/2
cmp srcq, srcendq ; while (src += 4) < &src[filterSize]
jl .innerloop
%ifidn %4, X4
mov32 pos1q, dword [fltposq+wq*4+4] ; filterPos[1]
movlh m0, [srcq+ pos0q *srcmul] ; split last 4 srcpx of dstpx[0]
sub pos1q, fltsizeq ; and first 4 srcpx of dstpx[1]
%if %1 > 8
movhps m0, [srcq+(pos1q+dlt)*srcmul]
%else ; %1 == 8
movd m1, [srcq+(pos1q+dlt)*srcmul]
punpckldq m0, m1
%endif ; %1 == 8
%if %1 == 8
punpcklbw m0, m3
%endif ; %1 == 8
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
; add back 0x8000 * sum(coeffs) after the horizontal add
psubw m0, m6
%endif ; %1 == 16
pmaddwd m0, [filterq]
%endif ; %4 == X4
lea filterq, [filterq+(fltsizeq+dlt)*2]
%if mmsize == 8 ; mmx
movq m0, m4
punpckldq m4, m5
punpckhdq m0, m5
paddd m0, m4
%else ; mmsize == 16
%if notcpuflag(ssse3) ; sse2
mova m1, m4
punpcklqdq m4, m5
punpckhqdq m1, m5
paddd m4, m1
%else ; ssse3/sse4
phaddd m4, m5
%endif ; sse2/ssse3/sse4
%ifidn %4, X4
paddd m4, m0
%endif ; %4 == X4
%if notcpuflag(ssse3) ; sse2
pshufd m4, m4, 11011000b
movhlps m0, m4
paddd m0, m4
%else ; ssse3/sse4
phaddd m4, m4
SWAP 0, 4
%endif ; sse2/ssse3/sse4
%endif ; mmsize == 8/16
%endif ; %3 ==/!= X
%if %1 == 16 ; add 0x8000 * sum(coeffs), i.e. back from signed -> unsigned
paddd m0, m7
%endif ; %1 == 16
; clip, store
psrad m0, 14 + %1 - %2
%ifidn %3, X
movifnidn dstq, dstmp
%endif ; %3 == X
%if %2 == 15
packssdw m0, m0
%ifnidn %3, X
movh [dstq+wq*(2>>wshr)], m0
%else ; %3 == X
movd [dstq+wq*2], m0
%endif ; %3 ==/!= X
%else ; %2 == 19
%if mmsize == 8
PMINSD_MMX m0, m2, m4
%elif cpuflag(sse4)
pminsd m0, m2
%else ; sse2/ssse3
cvtdq2ps m0, m0
minps m0, m2
cvtps2dq m0, m0
%endif ; mmx/sse2/ssse3/sse4
%ifnidn %3, X
mova [dstq+wq*(4>>wshr)], m0
%else ; %3 == X
movq [dstq+wq*4], m0
%endif ; %3 ==/!= X
%endif ; %2 == 15/19
%ifnidn %3, X
add wq, (mmsize<<wshr)/4 ; both 8tap and 4tap really only do 4 pixels (or for mmx: 2 pixels)
; per iteration. see "shl wq,1" above as for why we do this
%else ; %3 == X
add wq, 2
%endif ; %3 ==/!= X
jl .loop
REP_RET
%endmacro
; SCALE_FUNCS source_width, intermediate_nbits, n_xmm
%macro SCALE_FUNCS 3
SCALE_FUNC %1, %2, 4, 4, 6, %3
SCALE_FUNC %1, %2, 8, 8, 6, %3
%if mmsize == 8
SCALE_FUNC %1, %2, X, X, 7, %3
%else
SCALE_FUNC %1, %2, X, X4, 7, %3
SCALE_FUNC %1, %2, X, X8, 7, %3
%endif
%endmacro
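; For example, under INIT_XMM sse2 below, "SCALE_FUNCS 8, 15, 6" emits the
; 4-tap, 8-tap and X4/X8 any-filter-size variants of the 8 bpc -> 15-bit
; scaler, declared on the C side as ff_hscale8to15_{4,8,X4,X8}_sse2.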
; SCALE_FUNCS2 8_xmm_args, 9to10_xmm_args, 16_xmm_args
%macro SCALE_FUNCS2 3
%if notcpuflag(sse4)
SCALE_FUNCS 8, 15, %1
SCALE_FUNCS 9, 15, %2
SCALE_FUNCS 10, 15, %2
SCALE_FUNCS 12, 15, %2
SCALE_FUNCS 14, 15, %2
SCALE_FUNCS 16, 15, %3
%endif ; !sse4
SCALE_FUNCS 8, 19, %1
SCALE_FUNCS 9, 19, %2
SCALE_FUNCS 10, 19, %2
SCALE_FUNCS 12, 19, %2
SCALE_FUNCS 14, 19, %2
SCALE_FUNCS 16, 19, %3
%endmacro
%if ARCH_X86_32
INIT_MMX mmx
SCALE_FUNCS2 0, 0, 0
%endif
INIT_XMM sse2
SCALE_FUNCS2 6, 7, 8
INIT_XMM ssse3
SCALE_FUNCS2 6, 6, 8
INIT_XMM sse4
SCALE_FUNCS2 6, 6, 8

View File

@@ -0,0 +1,580 @@
/*
* Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <inttypes.h>
#include "config.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/attributes.h"
#include "libavutil/avassert.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/cpu.h"
#include "libavutil/pixdesc.h"
#if HAVE_INLINE_ASM
#define DITHER1XBPP
DECLARE_ASM_CONST(8, uint64_t, bF8)= 0xF8F8F8F8F8F8F8F8LL;
DECLARE_ASM_CONST(8, uint64_t, bFC)= 0xFCFCFCFCFCFCFCFCLL;
DECLARE_ASM_CONST(8, uint64_t, w10)= 0x0010001000100010LL;
DECLARE_ASM_CONST(8, uint64_t, w02)= 0x0002000200020002LL;
const DECLARE_ALIGNED(8, uint64_t, ff_dither4)[2] = {
0x0103010301030103LL,
0x0200020002000200LL,};
const DECLARE_ALIGNED(8, uint64_t, ff_dither8)[2] = {
0x0602060206020602LL,
0x0004000400040004LL,};
DECLARE_ASM_CONST(8, uint64_t, b16Mask)= 0x001F001F001F001FLL;
DECLARE_ASM_CONST(8, uint64_t, g16Mask)= 0x07E007E007E007E0LL;
DECLARE_ASM_CONST(8, uint64_t, r16Mask)= 0xF800F800F800F800LL;
DECLARE_ASM_CONST(8, uint64_t, b15Mask)= 0x001F001F001F001FLL;
DECLARE_ASM_CONST(8, uint64_t, g15Mask)= 0x03E003E003E003E0LL;
DECLARE_ASM_CONST(8, uint64_t, r15Mask)= 0x7C007C007C007C00LL;
DECLARE_ALIGNED(8, const uint64_t, ff_M24A) = 0x00FF0000FF0000FFLL;
DECLARE_ALIGNED(8, const uint64_t, ff_M24B) = 0xFF0000FF0000FF00LL;
DECLARE_ALIGNED(8, const uint64_t, ff_M24C) = 0x0000FF0000FF0000LL;
DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YCoeff) = 0x000020E540830C8BULL;
DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UCoeff) = 0x0000ED0FDAC23831ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_bgr2VCoeff) = 0x00003831D0E6F6EAULL;
DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YOffset) = 0x1010101010101010ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_w1111) = 0x0001000100010001ULL;
//MMX versions
#if HAVE_MMX_INLINE
#undef RENAME
#define COMPILE_TEMPLATE_MMXEXT 0
#define RENAME(a) a ## _MMX
#include "swscale_template.c"
#endif
// MMXEXT versions
#if HAVE_MMXEXT_INLINE
#undef RENAME
#undef COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 1
#define RENAME(a) a ## _MMXEXT
#include "swscale_template.c"
#endif
void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufIndex,
int lastInLumBuf, int lastInChrBuf)
{
const int dstH= c->dstH;
const int flags= c->flags;
int16_t **lumPixBuf= c->lumPixBuf;
int16_t **chrUPixBuf= c->chrUPixBuf;
int16_t **alpPixBuf= c->alpPixBuf;
const int vLumBufSize= c->vLumBufSize;
const int vChrBufSize= c->vChrBufSize;
int32_t *vLumFilterPos= c->vLumFilterPos;
int32_t *vChrFilterPos= c->vChrFilterPos;
int16_t *vLumFilter= c->vLumFilter;
int16_t *vChrFilter= c->vChrFilter;
int32_t *lumMmxFilter= c->lumMmxFilter;
int32_t *chrMmxFilter= c->chrMmxFilter;
int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
const int vLumFilterSize= c->vLumFilterSize;
const int vChrFilterSize= c->vChrFilterSize;
const int chrDstY= dstY>>c->chrDstVSubSample;
const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
c->blueDither= ff_dither8[dstY&1];
if (c->dstFormat == AV_PIX_FMT_RGB555 || c->dstFormat == AV_PIX_FMT_BGR555)
c->greenDither= ff_dither8[dstY&1];
else
c->greenDither= ff_dither4[dstY&1];
c->redDither= ff_dither8[(dstY+1)&1];
if (dstY < dstH - 2) {
const int16_t **lumSrcPtr= (const int16_t **)(void*) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
const int16_t **chrUSrcPtr= (const int16_t **)(void*) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)(void*) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
int i;
if (firstLumSrcY < 0 || firstLumSrcY + vLumFilterSize > c->srcH) {
const int16_t **tmpY = (const int16_t **) lumPixBuf + 2 * vLumBufSize;
int neg = -firstLumSrcY, i, end = FFMIN(c->srcH - firstLumSrcY, vLumFilterSize);
for (i = 0; i < neg; i++)
tmpY[i] = lumSrcPtr[neg];
for ( ; i < end; i++)
tmpY[i] = lumSrcPtr[i];
for ( ; i < vLumFilterSize; i++)
tmpY[i] = tmpY[i-1];
lumSrcPtr = tmpY;
if (alpSrcPtr) {
const int16_t **tmpA = (const int16_t **) alpPixBuf + 2 * vLumBufSize;
for (i = 0; i < neg; i++)
tmpA[i] = alpSrcPtr[neg];
for ( ; i < end; i++)
tmpA[i] = alpSrcPtr[i];
for ( ; i < vLumFilterSize; i++)
tmpA[i] = tmpA[i - 1];
alpSrcPtr = tmpA;
}
}
if (firstChrSrcY < 0 || firstChrSrcY + vChrFilterSize > c->chrSrcH) {
const int16_t **tmpU = (const int16_t **) chrUPixBuf + 2 * vChrBufSize;
int neg = -firstChrSrcY, i, end = FFMIN(c->chrSrcH - firstChrSrcY, vChrFilterSize);
for (i = 0; i < neg; i++) {
tmpU[i] = chrUSrcPtr[neg];
}
for ( ; i < end; i++) {
tmpU[i] = chrUSrcPtr[i];
}
for ( ; i < vChrFilterSize; i++) {
tmpU[i] = tmpU[i - 1];
}
chrUSrcPtr = tmpU;
}
if (flags & SWS_ACCURATE_RND) {
int s= APCK_SIZE / 8;
for (i=0; i<vLumFilterSize; i+=2) {
*(const void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
*(const void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
lumMmxFilter[s*i+APCK_COEF/4 ]=
lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
+ (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
*(const void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ];
*(const void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)];
alpMmxFilter[s*i+APCK_COEF/4 ]=
alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4 ];
}
}
for (i=0; i<vChrFilterSize; i+=2) {
*(const void**)&chrMmxFilter[s*i ]= chrUSrcPtr[i ];
*(const void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrUSrcPtr[i+(vChrFilterSize>1)];
chrMmxFilter[s*i+APCK_COEF/4 ]=
chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
+ (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
}
} else {
for (i=0; i<vLumFilterSize; i++) {
*(const void**)&lumMmxFilter[4*i+0]= lumSrcPtr[i];
lumMmxFilter[4*i+2]=
lumMmxFilter[4*i+3]=
((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001U;
if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
*(const void**)&alpMmxFilter[4*i+0]= alpSrcPtr[i];
alpMmxFilter[4*i+2]=
alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
}
}
for (i=0; i<vChrFilterSize; i++) {
*(const void**)&chrMmxFilter[4*i+0]= chrUSrcPtr[i];
chrMmxFilter[4*i+2]=
chrMmxFilter[4*i+3]=
((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001U;
}
}
}
}
#if HAVE_MMXEXT
static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset)
{
if (((uintptr_t)dest) & 15) {
/* movntdq below needs 16-byte-aligned stores, so fall back to the MMXEXT path */
return yuv2yuvX_MMXEXT(filter, filterSize, src, dest, dstW, dither, offset);
}
if (offset) {
__asm__ volatile("movq (%0), %%xmm3\n\t"
"movdqa %%xmm3, %%xmm4\n\t"
"psrlq $24, %%xmm3\n\t"
"psllq $40, %%xmm4\n\t"
"por %%xmm4, %%xmm3\n\t"
:: "r"(dither)
);
} else {
__asm__ volatile("movq (%0), %%xmm3\n\t"
:: "r"(dither)
);
}
filterSize--;
__asm__ volatile(
"pxor %%xmm0, %%xmm0\n\t"
"punpcklbw %%xmm0, %%xmm3\n\t"
"movd %0, %%xmm1\n\t"
"punpcklwd %%xmm1, %%xmm1\n\t"
"punpckldq %%xmm1, %%xmm1\n\t"
"punpcklqdq %%xmm1, %%xmm1\n\t"
"psllw $3, %%xmm1\n\t"
"paddw %%xmm1, %%xmm3\n\t"
"psraw $4, %%xmm3\n\t"
::"m"(filterSize)
);
__asm__ volatile(
"movdqa %%xmm3, %%xmm4\n\t"
"movdqa %%xmm3, %%xmm7\n\t"
"movl %3, %%ecx\n\t"
"mov %0, %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
".p2align 4 \n\t" /* FIXME Unroll? */\
"1: \n\t"\
"movddup 8(%%"REG_d"), %%xmm0 \n\t" /* filterCoeff */\
"movdqa (%%"REG_S", %%"REG_c", 2), %%xmm2 \n\t" /* srcData */\
"movdqa 16(%%"REG_S", %%"REG_c", 2), %%xmm5 \n\t" /* srcData */\
"add $16, %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"test %%"REG_S", %%"REG_S" \n\t"\
"pmulhw %%xmm0, %%xmm2 \n\t"\
"pmulhw %%xmm0, %%xmm5 \n\t"\
"paddw %%xmm2, %%xmm3 \n\t"\
"paddw %%xmm5, %%xmm4 \n\t"\
" jnz 1b \n\t"\
"psraw $3, %%xmm3 \n\t"\
"psraw $3, %%xmm4 \n\t"\
"packuswb %%xmm4, %%xmm3 \n\t"
"movntdq %%xmm3, (%1, %%"REG_c")\n\t"
"add $16, %%"REG_c" \n\t"\
"cmp %2, %%"REG_c" \n\t"\
"movdqa %%xmm7, %%xmm3\n\t"
"movdqa %%xmm7, %%xmm4\n\t"
"mov %0, %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"jb 1b \n\t"\
:: "g" (filter),
"r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
: "%"REG_d, "%"REG_S, "%"REG_c
);
}
#endif
#endif /* HAVE_INLINE_ASM */
#define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \
void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \
SwsContext *c, int16_t *data, \
int dstW, const uint8_t *src, \
const int16_t *filter, \
const int32_t *filterPos, int filterSize)
#define SCALE_FUNCS(filter_n, opt) \
SCALE_FUNC(filter_n, 8, 15, opt); \
SCALE_FUNC(filter_n, 9, 15, opt); \
SCALE_FUNC(filter_n, 10, 15, opt); \
SCALE_FUNC(filter_n, 12, 15, opt); \
SCALE_FUNC(filter_n, 14, 15, opt); \
SCALE_FUNC(filter_n, 16, 15, opt); \
SCALE_FUNC(filter_n, 8, 19, opt); \
SCALE_FUNC(filter_n, 9, 19, opt); \
SCALE_FUNC(filter_n, 10, 19, opt); \
SCALE_FUNC(filter_n, 12, 19, opt); \
SCALE_FUNC(filter_n, 14, 19, opt); \
SCALE_FUNC(filter_n, 16, 19, opt)
#define SCALE_FUNCS_MMX(opt) \
SCALE_FUNCS(4, opt); \
SCALE_FUNCS(8, opt); \
SCALE_FUNCS(X, opt)
#define SCALE_FUNCS_SSE(opt) \
SCALE_FUNCS(4, opt); \
SCALE_FUNCS(8, opt); \
SCALE_FUNCS(X4, opt); \
SCALE_FUNCS(X8, opt)
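/* Illustration: one SCALE_FUNC instance above, e.g. SCALE_FUNC(4, 8, 15, sse2),
 * expands to the prototype
 *   void ff_hscale8to15_4_sse2(SwsContext *c, int16_t *data, int dstW,
 *                              const uint8_t *src, const int16_t *filter,
 *                              const int32_t *filterPos, int filterSize);
 */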
#if ARCH_X86_32
SCALE_FUNCS_MMX(mmx);
#endif
SCALE_FUNCS_SSE(sse2);
SCALE_FUNCS_SSE(ssse3);
SCALE_FUNCS_SSE(sse4);
#define VSCALEX_FUNC(size, opt) \
void ff_yuv2planeX_ ## size ## _ ## opt(const int16_t *filter, int filterSize, \
const int16_t **src, uint8_t *dest, int dstW, \
const uint8_t *dither, int offset)
#define VSCALEX_FUNCS(opt) \
VSCALEX_FUNC(8, opt); \
VSCALEX_FUNC(9, opt); \
VSCALEX_FUNC(10, opt)
#if ARCH_X86_32
VSCALEX_FUNCS(mmxext);
#endif
VSCALEX_FUNCS(sse2);
VSCALEX_FUNCS(sse4);
VSCALEX_FUNC(16, sse4);
VSCALEX_FUNCS(avx);
#define VSCALE_FUNC(size, opt) \
void ff_yuv2plane1_ ## size ## _ ## opt(const int16_t *src, uint8_t *dst, int dstW, \
const uint8_t *dither, int offset)
#define VSCALE_FUNCS(opt1, opt2) \
VSCALE_FUNC(8, opt1); \
VSCALE_FUNC(9, opt2); \
VSCALE_FUNC(10, opt2); \
VSCALE_FUNC(16, opt1)
#if ARCH_X86_32
VSCALE_FUNCS(mmx, mmxext);
#endif
VSCALE_FUNCS(sse2, sse2);
VSCALE_FUNC(16, sse4);
VSCALE_FUNCS(avx, avx);
#define INPUT_Y_FUNC(fmt, opt) \
void ff_ ## fmt ## ToY_ ## opt(uint8_t *dst, const uint8_t *src, \
const uint8_t *unused1, const uint8_t *unused2, \
int w, uint32_t *unused)
#define INPUT_UV_FUNC(fmt, opt) \
void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \
const uint8_t *unused0, \
const uint8_t *src1, \
const uint8_t *src2, \
int w, uint32_t *unused)
#define INPUT_FUNC(fmt, opt) \
INPUT_Y_FUNC(fmt, opt); \
INPUT_UV_FUNC(fmt, opt)
#define INPUT_FUNCS(opt) \
INPUT_FUNC(uyvy, opt); \
INPUT_FUNC(yuyv, opt); \
INPUT_UV_FUNC(nv12, opt); \
INPUT_UV_FUNC(nv21, opt); \
INPUT_FUNC(rgba, opt); \
INPUT_FUNC(bgra, opt); \
INPUT_FUNC(argb, opt); \
INPUT_FUNC(abgr, opt); \
INPUT_FUNC(rgb24, opt); \
INPUT_FUNC(bgr24, opt)
#if ARCH_X86_32
INPUT_FUNCS(mmx);
#endif
INPUT_FUNCS(sse2);
INPUT_FUNCS(ssse3);
INPUT_FUNCS(avx);
av_cold void ff_sws_init_swscale_x86(SwsContext *c)
{
int cpu_flags = av_get_cpu_flags();
#if HAVE_MMX_INLINE
if (cpu_flags & AV_CPU_FLAG_MMX)
sws_init_swscale_MMX(c);
#endif
#if HAVE_MMXEXT_INLINE
if (cpu_flags & AV_CPU_FLAG_MMXEXT)
sws_init_swscale_MMXEXT(c);
if (cpu_flags & AV_CPU_FLAG_SSE3){
if(c->use_mmx_vfilter && !(c->flags & SWS_ACCURATE_RND))
c->yuv2planeX = yuv2yuvX_sse3;
}
#endif
#define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt1, opt2) do { \
if (c->srcBpc == 8) { \
hscalefn = c->dstBpc <= 14 ? ff_hscale8to15_ ## filtersize ## _ ## opt2 : \
ff_hscale8to19_ ## filtersize ## _ ## opt1; \
} else if (c->srcBpc == 9) { \
hscalefn = c->dstBpc <= 14 ? ff_hscale9to15_ ## filtersize ## _ ## opt2 : \
ff_hscale9to19_ ## filtersize ## _ ## opt1; \
} else if (c->srcBpc == 10) { \
hscalefn = c->dstBpc <= 14 ? ff_hscale10to15_ ## filtersize ## _ ## opt2 : \
ff_hscale10to19_ ## filtersize ## _ ## opt1; \
} else if (c->srcBpc == 12) { \
hscalefn = c->dstBpc <= 14 ? ff_hscale12to15_ ## filtersize ## _ ## opt2 : \
ff_hscale12to19_ ## filtersize ## _ ## opt1; \
} else if (c->srcBpc == 14 || ((c->srcFormat==AV_PIX_FMT_PAL8||isAnyRGB(c->srcFormat)) && av_pix_fmt_desc_get(c->srcFormat)->comp[0].depth_minus1<15)) { \
hscalefn = c->dstBpc <= 14 ? ff_hscale14to15_ ## filtersize ## _ ## opt2 : \
ff_hscale14to19_ ## filtersize ## _ ## opt1; \
} else { /* c->srcBpc == 16 */ \
av_assert0(c->srcBpc == 16);\
hscalefn = c->dstBpc <= 14 ? ff_hscale16to15_ ## filtersize ## _ ## opt2 : \
ff_hscale16to19_ ## filtersize ## _ ## opt1; \
} \
} while (0)
#define ASSIGN_MMX_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \
switch (filtersize) { \
case 4: ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \
case 8: ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \
default: ASSIGN_SCALE_FUNC2(hscalefn, X, opt1, opt2); break; \
}
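/* e.g. with c->srcBpc == 8, c->dstBpc <= 14 and a 4-tap luma filter,
 * ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx) resolves
 * to c->hyScale = ff_hscale8to15_4_mmx. */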
#define ASSIGN_VSCALEX_FUNC(vscalefn, opt, do_16_case, condition_8bit) \
switch(c->dstBpc){ \
case 16: do_16_case; break; \
case 10: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_10_ ## opt; break; \
case 9: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_9_ ## opt; break; \
default: if (condition_8bit) /*vscalefn = ff_yuv2planeX_8_ ## opt;*/ break; \
}
#define ASSIGN_VSCALE_FUNC(vscalefn, opt1, opt2, opt2chk) \
switch(c->dstBpc){ \
case 16: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2plane1_16_ ## opt1; break; \
case 10: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_10_ ## opt2; break; \
case 9: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_9_ ## opt2; break; \
case 8: vscalefn = ff_yuv2plane1_8_ ## opt1; break; \
default: av_assert0(c->dstBpc>8); \
}
#define case_rgb(x, X, opt) \
case AV_PIX_FMT_ ## X: \
c->lumToYV12 = ff_ ## x ## ToY_ ## opt; \
if (!c->chrSrcHSubSample) \
c->chrToYV12 = ff_ ## x ## ToUV_ ## opt; \
break
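/* e.g. case_rgb(rgb24, RGB24, mmx) expands to
 *   case AV_PIX_FMT_RGB24:
 *       c->lumToYV12 = ff_rgb24ToY_mmx;
 *       if (!c->chrSrcHSubSample)
 *           c->chrToYV12 = ff_rgb24ToUV_mmx;
 *       break;
 */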
#if ARCH_X86_32
if (EXTERNAL_MMX(cpu_flags)) {
ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx);
ASSIGN_MMX_SCALE_FUNC(c->hcScale, c->hChrFilterSize, mmx, mmx);
ASSIGN_VSCALE_FUNC(c->yuv2plane1, mmx, mmxext, cpu_flags & AV_CPU_FLAG_MMXEXT);
switch (c->srcFormat) {
case AV_PIX_FMT_Y400A:
c->lumToYV12 = ff_yuyvToY_mmx;
if (c->alpPixBuf)
c->alpToYV12 = ff_uyvyToY_mmx;
break;
case AV_PIX_FMT_YUYV422:
c->lumToYV12 = ff_yuyvToY_mmx;
c->chrToYV12 = ff_yuyvToUV_mmx;
break;
case AV_PIX_FMT_UYVY422:
c->lumToYV12 = ff_uyvyToY_mmx;
c->chrToYV12 = ff_uyvyToUV_mmx;
break;
case AV_PIX_FMT_NV12:
c->chrToYV12 = ff_nv12ToUV_mmx;
break;
case AV_PIX_FMT_NV21:
c->chrToYV12 = ff_nv21ToUV_mmx;
break;
case_rgb(rgb24, RGB24, mmx);
case_rgb(bgr24, BGR24, mmx);
case_rgb(bgra, BGRA, mmx);
case_rgb(rgba, RGBA, mmx);
case_rgb(abgr, ABGR, mmx);
case_rgb(argb, ARGB, mmx);
default:
break;
}
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
ASSIGN_VSCALEX_FUNC(c->yuv2planeX, mmxext, , 1);
}
#endif /* ARCH_X86_32 */
#define ASSIGN_SSE_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \
switch (filtersize) { \
case 4: ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \
case 8: ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \
default: if (filtersize & 4) ASSIGN_SCALE_FUNC2(hscalefn, X4, opt1, opt2); \
else ASSIGN_SCALE_FUNC2(hscalefn, X8, opt1, opt2); \
break; \
}
if (EXTERNAL_SSE2(cpu_flags)) {
ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse2, sse2);
ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse2, sse2);
ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse2, ,
HAVE_ALIGNED_STACK || ARCH_X86_64);
ASSIGN_VSCALE_FUNC(c->yuv2plane1, sse2, sse2, 1);
switch (c->srcFormat) {
case AV_PIX_FMT_Y400A:
c->lumToYV12 = ff_yuyvToY_sse2;
if (c->alpPixBuf)
c->alpToYV12 = ff_uyvyToY_sse2;
break;
case AV_PIX_FMT_YUYV422:
c->lumToYV12 = ff_yuyvToY_sse2;
c->chrToYV12 = ff_yuyvToUV_sse2;
break;
case AV_PIX_FMT_UYVY422:
c->lumToYV12 = ff_uyvyToY_sse2;
c->chrToYV12 = ff_uyvyToUV_sse2;
break;
case AV_PIX_FMT_NV12:
c->chrToYV12 = ff_nv12ToUV_sse2;
break;
case AV_PIX_FMT_NV21:
c->chrToYV12 = ff_nv21ToUV_sse2;
break;
case_rgb(rgb24, RGB24, sse2);
case_rgb(bgr24, BGR24, sse2);
case_rgb(bgra, BGRA, sse2);
case_rgb(rgba, RGBA, sse2);
case_rgb(abgr, ABGR, sse2);
case_rgb(argb, ARGB, sse2);
default:
break;
}
}
if (EXTERNAL_SSSE3(cpu_flags)) {
ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, ssse3, ssse3);
ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, ssse3, ssse3);
switch (c->srcFormat) {
case_rgb(rgb24, RGB24, ssse3);
case_rgb(bgr24, BGR24, ssse3);
default:
break;
}
}
if (EXTERNAL_SSE4(cpu_flags)) {
/* the Xto15 scalers don't need special sse4 functions */
ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse4, ssse3);
ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse4, ssse3);
ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse4,
if (!isBE(c->dstFormat)) c->yuv2planeX = ff_yuv2planeX_16_sse4,
HAVE_ALIGNED_STACK || ARCH_X86_64);
if (c->dstBpc == 16 && !isBE(c->dstFormat))
c->yuv2plane1 = ff_yuv2plane1_16_sse4;
}
if (EXTERNAL_AVX(cpu_flags)) {
ASSIGN_VSCALEX_FUNC(c->yuv2planeX, avx, ,
HAVE_ALIGNED_STACK || ARCH_X86_64);
ASSIGN_VSCALE_FUNC(c->yuv2plane1, avx, avx, 1);
switch (c->srcFormat) {
case AV_PIX_FMT_YUYV422:
c->chrToYV12 = ff_yuyvToUV_avx;
break;
case AV_PIX_FMT_UYVY422:
c->chrToYV12 = ff_uyvyToUV_avx;
break;
case AV_PIX_FMT_NV12:
c->chrToYV12 = ff_nv12ToUV_avx;
break;
case AV_PIX_FMT_NV21:
c->chrToYV12 = ff_nv21ToUV_avx;
break;
case_rgb(rgb24, RGB24, avx);
case_rgb(bgr24, BGR24, avx);
case_rgb(bgra, BGRA, avx);
case_rgb(rgba, RGBA, avx);
case_rgb(abgr, ABGR, avx);
case_rgb(argb, ARGB, avx);
default:
break;
}
}
}

File diff suppressed because it is too large

View File

@@ -0,0 +1,31 @@
/*
* check XMM registers for clobbers on Win64
* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/x86/w64xmmtest.h"
#include "libswscale/swscale.h"
wrap(sws_scale(struct SwsContext *c, const uint8_t *const srcSlice[],
const int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *const dst[], const int dstStride[]))
{
testxmmclobbers(sws_scale, c, srcSlice, srcStride, srcSliceY,
srcSliceH, dst, dstStride);
}

View File

@@ -0,0 +1,112 @@
/*
* software YUV to RGB converter
*
* Copyright (C) 2009 Konstantin Shishkov
*
* MMX/MMXEXT template stuff (needed for fast movntq support),
* 1,4,8bpp support and context / deglobalize stuff
* by Michael Niedermayer (michaelni@gmx.at)
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
#include "config.h"
#include "libswscale/rgb2rgb.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/attributes.h"
#include "libavutil/x86/asm.h"
#include "libavutil/cpu.h"
#if HAVE_INLINE_ASM
#define DITHER1XBPP // only for MMX
/* hope these constant values are cache line aligned */
DECLARE_ASM_CONST(8, uint64_t, mmx_00ffw) = 0x00ff00ff00ff00ffULL;
DECLARE_ASM_CONST(8, uint64_t, mmx_redmask) = 0xf8f8f8f8f8f8f8f8ULL;
DECLARE_ASM_CONST(8, uint64_t, mmx_grnmask) = 0xfcfcfcfcfcfcfcfcULL;
DECLARE_ASM_CONST(8, uint64_t, pb_e0) = 0xe0e0e0e0e0e0e0e0ULL;
DECLARE_ASM_CONST(8, uint64_t, pb_03) = 0x0303030303030303ULL;
DECLARE_ASM_CONST(8, uint64_t, pb_07) = 0x0707070707070707ULL;
//MMX versions
#if HAVE_MMX_INLINE
#undef RENAME
#undef COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 0
#define RENAME(a) a ## _MMX
#include "yuv2rgb_template.c"
#endif /* HAVE_MMX_INLINE */
// MMXEXT versions
#if HAVE_MMXEXT_INLINE
#undef RENAME
#undef COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 1
#define RENAME(a) a ## _MMXEXT
#include "yuv2rgb_template.c"
#endif /* HAVE_MMXEXT_INLINE */
#endif /* HAVE_INLINE_ASM */
av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c)
{
#if HAVE_MMX_INLINE
int cpu_flags = av_get_cpu_flags();
#if HAVE_MMXEXT_INLINE
if (cpu_flags & AV_CPU_FLAG_MMXEXT) {
switch (c->dstFormat) {
case AV_PIX_FMT_RGB24:
return yuv420_rgb24_MMXEXT;
case AV_PIX_FMT_BGR24:
return yuv420_bgr24_MMXEXT;
}
}
#endif
if (cpu_flags & AV_CPU_FLAG_MMX) {
switch (c->dstFormat) {
case AV_PIX_FMT_RGB32:
if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
return yuva420_rgb32_MMX;
#endif
break;
} else return yuv420_rgb32_MMX;
case AV_PIX_FMT_BGR32:
if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
return yuva420_bgr32_MMX;
#endif
break;
} else return yuv420_bgr32_MMX;
case AV_PIX_FMT_RGB24: return yuv420_rgb24_MMX;
case AV_PIX_FMT_BGR24: return yuv420_bgr24_MMX;
case AV_PIX_FMT_RGB565: return yuv420_rgb16_MMX;
case AV_PIX_FMT_RGB555: return yuv420_rgb15_MMX;
}
}
#endif /* HAVE_MMX_INLINE */
return NULL;
}

View File

@@ -0,0 +1,451 @@
/*
* software YUV to RGB converter
*
* Copyright (C) 2001-2007 Michael Niedermayer
* (c) 2010 Konstantin Shishkov
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#if COMPILE_TEMPLATE_MMXEXT
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif
#define REG_BLUE "0"
#define REG_RED "1"
#define REG_GREEN "2"
#define REG_ALPHA "3"
#define YUV2RGB_LOOP(depth) \
h_size = (c->dstW + 7) & ~7; \
if (h_size * depth > FFABS(dstStride[0])) \
h_size -= 8; \
\
vshift = c->srcFormat != AV_PIX_FMT_YUV422P; \
\
__asm__ volatile ("pxor %mm4, %mm4\n\t"); \
for (y = 0; y < srcSliceH; y++) { \
uint8_t *image = dst[0] + (y + srcSliceY) * dstStride[0]; \
const uint8_t *py = src[0] + y * srcStride[0]; \
const uint8_t *pu = src[1] + (y >> vshift) * srcStride[1]; \
const uint8_t *pv = src[2] + (y >> vshift) * srcStride[2]; \
x86_reg index = -h_size / 2; \
#define YUV2RGB_INITIAL_LOAD \
__asm__ volatile ( \
"movq (%5, %0, 2), %%mm6\n\t" \
"movd (%2, %0), %%mm0\n\t" \
"movd (%3, %0), %%mm1\n\t" \
"1: \n\t" \
/* YUV2RGB core
* Conversion is performed in the usual way:
* R = Y' * Ycoef + Vred * V'
* G = Y' * Ycoef + Vgreen * V' + Ugreen * U'
* B = Y' * Ycoef + Ublue * U'
*
* where X' = X * 8 - Xoffset (multiplication is performed to increase
* precision a bit).
* Since it operates in YUV420 colorspace, Y component is additionally
* split into Y1 and Y2 for even and odd pixels.
*
* Input:
* mm0 - U (4 elems), mm1 - V (4 elems), mm6 - Y (8 elems), mm4 - zero register
* Output:
* mm1 - R, mm2 - G, mm0 - B
*/
#define YUV2RGB \
/* convert Y, U, V into Y1', Y2', U', V' */ \
"movq %%mm6, %%mm7\n\t" \
"punpcklbw %%mm4, %%mm0\n\t" \
"punpcklbw %%mm4, %%mm1\n\t" \
"pand "MANGLE(mmx_00ffw)", %%mm6\n\t" \
"psrlw $8, %%mm7\n\t" \
"psllw $3, %%mm0\n\t" \
"psllw $3, %%mm1\n\t" \
"psllw $3, %%mm6\n\t" \
"psllw $3, %%mm7\n\t" \
"psubsw "U_OFFSET"(%4), %%mm0\n\t" \
"psubsw "V_OFFSET"(%4), %%mm1\n\t" \
"psubw "Y_OFFSET"(%4), %%mm6\n\t" \
"psubw "Y_OFFSET"(%4), %%mm7\n\t" \
\
/* multiply by coefficients */ \
"movq %%mm0, %%mm2\n\t" \
"movq %%mm1, %%mm3\n\t" \
"pmulhw "UG_COEFF"(%4), %%mm2\n\t" \
"pmulhw "VG_COEFF"(%4), %%mm3\n\t" \
"pmulhw "Y_COEFF" (%4), %%mm6\n\t" \
"pmulhw "Y_COEFF" (%4), %%mm7\n\t" \
"pmulhw "UB_COEFF"(%4), %%mm0\n\t" \
"pmulhw "VR_COEFF"(%4), %%mm1\n\t" \
"paddsw %%mm3, %%mm2\n\t" \
/* now: mm0 = UB, mm1 = VR, mm2 = CG */ \
/* mm6 = Y1, mm7 = Y2 */ \
\
/* produce RGB */ \
"movq %%mm7, %%mm3\n\t" \
"movq %%mm7, %%mm5\n\t" \
"paddsw %%mm0, %%mm3\n\t" \
"paddsw %%mm1, %%mm5\n\t" \
"paddsw %%mm2, %%mm7\n\t" \
"paddsw %%mm6, %%mm0\n\t" \
"paddsw %%mm6, %%mm1\n\t" \
"paddsw %%mm6, %%mm2\n\t" \
#define RGB_PACK_INTERLEAVE \
/* pack and interleave even/odd pixels */ \
"packuswb %%mm1, %%mm0\n\t" \
"packuswb %%mm5, %%mm3\n\t" \
"packuswb %%mm2, %%mm2\n\t" \
"movq %%mm0, %%mm1\n\n" \
"packuswb %%mm7, %%mm7\n\t" \
"punpcklbw %%mm3, %%mm0\n\t" \
"punpckhbw %%mm3, %%mm1\n\t" \
"punpcklbw %%mm7, %%mm2\n\t" \
#define YUV2RGB_ENDLOOP(depth) \
"movq 8 (%5, %0, 2), %%mm6\n\t" \
"movd 4 (%3, %0), %%mm1\n\t" \
"movd 4 (%2, %0), %%mm0\n\t" \
"add $"AV_STRINGIFY(depth * 8)", %1\n\t" \
"add $4, %0\n\t" \
"js 1b\n\t" \
#define YUV2RGB_OPERANDS \
: "+r" (index), "+r" (image) \
: "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \
"r" (py - 2*index) \
: "memory" \
); \
} \
#define YUV2RGB_OPERANDS_ALPHA \
: "+r" (index), "+r" (image) \
: "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \
"r" (py - 2*index), "r" (pa - 2*index) \
: "memory" \
); \
} \
#define YUV2RGB_ENDFUNC \
__asm__ volatile (SFENCE"\n\t" \
"emms \n\t"); \
return srcSliceH; \
#define IF0(x)
#define IF1(x) x
#define RGB_PACK16(gmask, is15) \
"pand "MANGLE(mmx_redmask)", %%mm0\n\t" \
"pand "MANGLE(mmx_redmask)", %%mm1\n\t" \
"movq %%mm2, %%mm3\n\t" \
"psllw $"AV_STRINGIFY(3-is15)", %%mm2\n\t" \
"psrlw $"AV_STRINGIFY(5+is15)", %%mm3\n\t" \
"psrlw $3, %%mm0\n\t" \
IF##is15("psrlw $1, %%mm1\n\t") \
"pand "MANGLE(pb_e0)", %%mm2\n\t" \
"pand "MANGLE(gmask)", %%mm3\n\t" \
"por %%mm2, %%mm0\n\t" \
"por %%mm3, %%mm1\n\t" \
"movq %%mm0, %%mm2\n\t" \
"punpcklbw %%mm1, %%mm0\n\t" \
"punpckhbw %%mm1, %%mm2\n\t" \
MOVNTQ " %%mm0, (%1)\n\t" \
MOVNTQ " %%mm2, 8(%1)\n\t" \
#define DITHER_RGB \
"paddusb "BLUE_DITHER"(%4), %%mm0\n\t" \
"paddusb "GREEN_DITHER"(%4), %%mm2\n\t" \
"paddusb "RED_DITHER"(%4), %%mm1\n\t" \
#if !COMPILE_TEMPLATE_MMXEXT
static inline int RENAME(yuv420_rgb15)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(2)
#ifdef DITHER1XBPP
c->blueDither = ff_dither8[y & 1];
c->greenDither = ff_dither8[y & 1];
c->redDither = ff_dither8[(y + 1) & 1];
#endif
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK_INTERLEAVE
#ifdef DITHER1XBPP
DITHER_RGB
#endif
RGB_PACK16(pb_03, 1)
YUV2RGB_ENDLOOP(2)
YUV2RGB_OPERANDS
YUV2RGB_ENDFUNC
}
static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(2)
#ifdef DITHER1XBPP
c->blueDither = ff_dither8[y & 1];
c->greenDither = ff_dither4[y & 1];
c->redDither = ff_dither8[(y + 1) & 1];
#endif
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK_INTERLEAVE
#ifdef DITHER1XBPP
DITHER_RGB
#endif
RGB_PACK16(pb_07, 0)
YUV2RGB_ENDLOOP(2)
YUV2RGB_OPERANDS
YUV2RGB_ENDFUNC
}
#endif /* !COMPILE_TEMPLATE_MMXEXT */
#define RGB_PACK24(blue, red)\
"packuswb %%mm3, %%mm0 \n" /* R0 R2 R4 R6 R1 R3 R5 R7 */\
"packuswb %%mm5, %%mm1 \n" /* B0 B2 B4 B6 B1 B3 B5 B7 */\
"packuswb %%mm7, %%mm2 \n" /* G0 G2 G4 G6 G1 G3 G5 G7 */\
"movq %%mm"red", %%mm3 \n"\
"movq %%mm"blue", %%mm6 \n"\
"psrlq $32, %%mm"red" \n" /* R1 R3 R5 R7 */\
"punpcklbw %%mm2, %%mm3 \n" /* R0 G0 R2 G2 R4 G4 R6 G6 */\
"punpcklbw %%mm"red", %%mm6 \n" /* B0 R1 B2 R3 B4 R5 B6 R7 */\
"movq %%mm3, %%mm5 \n"\
"punpckhbw %%mm"blue", %%mm2 \n" /* G1 B1 G3 B3 G5 B5 G7 B7 */\
"punpcklwd %%mm6, %%mm3 \n" /* R0 G0 B0 R1 R2 G2 B2 R3 */\
"punpckhwd %%mm6, %%mm5 \n" /* R4 G4 B4 R5 R6 G6 B6 R7 */\
RGB_PACK24_B
#if COMPILE_TEMPLATE_MMXEXT
DECLARE_ASM_CONST(8, int16_t, mask1101[4]) = {-1,-1, 0,-1};
DECLARE_ASM_CONST(8, int16_t, mask0010[4]) = { 0, 0,-1, 0};
DECLARE_ASM_CONST(8, int16_t, mask0110[4]) = { 0,-1,-1, 0};
DECLARE_ASM_CONST(8, int16_t, mask1001[4]) = {-1, 0, 0,-1};
DECLARE_ASM_CONST(8, int16_t, mask0100[4]) = { 0,-1, 0, 0};
#undef RGB_PACK24_B
#define RGB_PACK24_B\
"pshufw $0xc6, %%mm2, %%mm1 \n"\
"pshufw $0x84, %%mm3, %%mm6 \n"\
"pshufw $0x38, %%mm5, %%mm7 \n"\
"pand "MANGLE(mask1101)", %%mm6 \n" /* R0 G0 B0 R1 -- -- R2 G2 */\
"movq %%mm1, %%mm0 \n"\
"pand "MANGLE(mask0110)", %%mm7 \n" /* -- -- R6 G6 B6 R7 -- -- */\
"movq %%mm1, %%mm2 \n"\
"pand "MANGLE(mask0100)", %%mm1 \n" /* -- -- G3 B3 -- -- -- -- */\
"psrlq $48, %%mm3 \n" /* B2 R3 -- -- -- -- -- -- */\
"pand "MANGLE(mask0010)", %%mm0 \n" /* -- -- -- -- G1 B1 -- -- */\
"psllq $32, %%mm5 \n" /* -- -- -- -- R4 G4 B4 R5 */\
"pand "MANGLE(mask1001)", %%mm2 \n" /* G5 B5 -- -- -- -- G7 B7 */\
"por %%mm3, %%mm1 \n"\
"por %%mm6, %%mm0 \n"\
"por %%mm5, %%mm1 \n"\
"por %%mm7, %%mm2 \n"\
MOVNTQ" %%mm0, (%1) \n"\
MOVNTQ" %%mm1, 8(%1) \n"\
MOVNTQ" %%mm2, 16(%1) \n"\
#else
#undef RGB_PACK24_B
#define RGB_PACK24_B\
"movd %%mm3, (%1) \n" /* R0 G0 B0 R1 */\
"movd %%mm2, 4(%1) \n" /* G1 B1 */\
"psrlq $32, %%mm3 \n"\
"psrlq $16, %%mm2 \n"\
"movd %%mm3, 6(%1) \n" /* R2 G2 B2 R3 */\
"movd %%mm2, 10(%1) \n" /* G3 B3 */\
"psrlq $16, %%mm2 \n"\
"movd %%mm5, 12(%1) \n" /* R4 G4 B4 R5 */\
"movd %%mm2, 16(%1) \n" /* G5 B5 */\
"psrlq $32, %%mm5 \n"\
"movd %%mm2, 20(%1) \n" /* -- -- G7 B7 */\
"movd %%mm5, 18(%1) \n" /* R6 G6 B6 R7 */\
#endif
static inline int RENAME(yuv420_rgb24)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(3)
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK24(REG_BLUE, REG_RED)
YUV2RGB_ENDLOOP(3)
YUV2RGB_OPERANDS
YUV2RGB_ENDFUNC
}
static inline int RENAME(yuv420_bgr24)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(3)
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK24(REG_RED, REG_BLUE)
YUV2RGB_ENDLOOP(3)
YUV2RGB_OPERANDS
YUV2RGB_ENDFUNC
}
#define SET_EMPTY_ALPHA \
"pcmpeqd %%mm"REG_ALPHA", %%mm"REG_ALPHA"\n\t" /* set alpha to 0xFF */ \
#define LOAD_ALPHA \
"movq (%6, %0, 2), %%mm"REG_ALPHA"\n\t" \
#define RGB_PACK32(red, green, blue, alpha) \
"movq %%mm"blue", %%mm5\n\t" \
"movq %%mm"red", %%mm6\n\t" \
"punpckhbw %%mm"green", %%mm5\n\t" \
"punpcklbw %%mm"green", %%mm"blue"\n\t" \
"punpckhbw %%mm"alpha", %%mm6\n\t" \
"punpcklbw %%mm"alpha", %%mm"red"\n\t" \
"movq %%mm"blue", %%mm"green"\n\t" \
"movq %%mm5, %%mm"alpha"\n\t" \
"punpcklwd %%mm"red", %%mm"blue"\n\t" \
"punpckhwd %%mm"red", %%mm"green"\n\t" \
"punpcklwd %%mm6, %%mm5\n\t" \
"punpckhwd %%mm6, %%mm"alpha"\n\t" \
MOVNTQ " %%mm"blue", 0(%1)\n\t" \
MOVNTQ " %%mm"green", 8(%1)\n\t" \
MOVNTQ " %%mm5, 16(%1)\n\t" \
MOVNTQ " %%mm"alpha", 24(%1)\n\t" \
#if !COMPILE_TEMPLATE_MMXEXT
static inline int RENAME(yuv420_rgb32)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(4)
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK_INTERLEAVE
SET_EMPTY_ALPHA
RGB_PACK32(REG_RED, REG_GREEN, REG_BLUE, REG_ALPHA)
YUV2RGB_ENDLOOP(4)
YUV2RGB_OPERANDS
YUV2RGB_ENDFUNC
}
#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
static inline int RENAME(yuva420_rgb32)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(4)
const uint8_t *pa = src[3] + y * srcStride[3];
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK_INTERLEAVE
LOAD_ALPHA
RGB_PACK32(REG_RED, REG_GREEN, REG_BLUE, REG_ALPHA)
YUV2RGB_ENDLOOP(4)
YUV2RGB_OPERANDS_ALPHA
YUV2RGB_ENDFUNC
}
#endif
static inline int RENAME(yuv420_bgr32)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(4)
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK_INTERLEAVE
SET_EMPTY_ALPHA
RGB_PACK32(REG_BLUE, REG_GREEN, REG_RED, REG_ALPHA)
YUV2RGB_ENDLOOP(4)
YUV2RGB_OPERANDS
YUV2RGB_ENDFUNC
}
#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
static inline int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(4)
const uint8_t *pa = src[3] + y * srcStride[3];
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK_INTERLEAVE
LOAD_ALPHA
RGB_PACK32(REG_BLUE, REG_GREEN, REG_RED, REG_ALPHA)
YUV2RGB_ENDLOOP(4)
YUV2RGB_OPERANDS_ALPHA
YUV2RGB_ENDFUNC
}
#endif
#endif /* !COMPILE_TEMPLATE_MMXEXT */

View File

@@ -0,0 +1,920 @@
/*
* software YUV to RGB converter
*
* Copyright (C) 2009 Konstantin Shishkov
*
* 1,4,8bpp support and context / deglobalize stuff
* by Michael Niedermayer (michaelni@gmx.at)
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
#include "libavutil/cpu.h"
#include "libavutil/bswap.h"
#include "config.h"
#include "rgb2rgb.h"
#include "swscale.h"
#include "swscale_internal.h"
#include "libavutil/pixdesc.h"
const int32_t ff_yuv2rgb_coeffs[8][4] = {
{ 117504, 138453, 13954, 34903 }, /* no sequence_display_extension */
{ 117504, 138453, 13954, 34903 }, /* ITU-R Rec. 709 (1990) */
{ 104597, 132201, 25675, 53279 }, /* unspecified */
{ 104597, 132201, 25675, 53279 }, /* reserved */
{ 104448, 132798, 24759, 53109 }, /* FCC */
{ 104597, 132201, 25675, 53279 }, /* ITU-R Rec. 624-4 System B, G */
{ 104597, 132201, 25675, 53279 }, /* SMPTE 170M */
{ 117579, 136230, 16907, 35559 } /* SMPTE 240M (1987) */
};
const int *sws_getCoefficients(int colorspace)
{
if (colorspace > 7 || colorspace < 0)
colorspace = SWS_CS_DEFAULT;
return ff_yuv2rgb_coeffs[colorspace];
}
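/* Minimal caller-side usage sketch (illustrative; sws_ctx and the neutral
 * brightness/contrast/saturation values are example inputs, not from this
 * file):
 *
 *     const int *coeffs = sws_getCoefficients(SWS_CS_ITU709);
 *     sws_setColorspaceDetails(sws_ctx, coeffs, 0, coeffs, 0,
 *                              0, 1 << 16, 1 << 16);
 */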
#define LOADCHROMA(i) \
U = pu[i]; \
V = pv[i]; \
r = (void *)c->table_rV[V+YUVRGB_TABLE_HEADROOM]; \
g = (void *)(c->table_gU[U+YUVRGB_TABLE_HEADROOM] + c->table_gV[V+YUVRGB_TABLE_HEADROOM]); \
b = (void *)c->table_bU[U+YUVRGB_TABLE_HEADROOM];
#define PUTRGB(dst, src, i) \
Y = src[2 * i]; \
dst[2 * i] = r[Y] + g[Y] + b[Y]; \
Y = src[2 * i + 1]; \
dst[2 * i + 1] = r[Y] + g[Y] + b[Y];
#define PUTRGB24(dst, src, i) \
Y = src[2 * i]; \
dst[6 * i + 0] = r[Y]; \
dst[6 * i + 1] = g[Y]; \
dst[6 * i + 2] = b[Y]; \
Y = src[2 * i + 1]; \
dst[6 * i + 3] = r[Y]; \
dst[6 * i + 4] = g[Y]; \
dst[6 * i + 5] = b[Y];
#define PUTBGR24(dst, src, i) \
Y = src[2 * i]; \
dst[6 * i + 0] = b[Y]; \
dst[6 * i + 1] = g[Y]; \
dst[6 * i + 2] = r[Y]; \
Y = src[2 * i + 1]; \
dst[6 * i + 3] = b[Y]; \
dst[6 * i + 4] = g[Y]; \
dst[6 * i + 5] = r[Y];
#define PUTRGBA(dst, ysrc, asrc, i, s) \
Y = ysrc[2 * i]; \
dst[2 * i] = r[Y] + g[Y] + b[Y] + (asrc[2 * i] << s); \
Y = ysrc[2 * i + 1]; \
dst[2 * i + 1] = r[Y] + g[Y] + b[Y] + (asrc[2 * i + 1] << s);
#define PUTRGB48(dst, src, i) \
Y = src[ 2 * i]; \
dst[12 * i + 0] = dst[12 * i + 1] = r[Y]; \
dst[12 * i + 2] = dst[12 * i + 3] = g[Y]; \
dst[12 * i + 4] = dst[12 * i + 5] = b[Y]; \
Y = src[ 2 * i + 1]; \
dst[12 * i + 6] = dst[12 * i + 7] = r[Y]; \
dst[12 * i + 8] = dst[12 * i + 9] = g[Y]; \
dst[12 * i + 10] = dst[12 * i + 11] = b[Y];
#define PUTBGR48(dst, src, i) \
Y = src[2 * i]; \
dst[12 * i + 0] = dst[12 * i + 1] = b[Y]; \
dst[12 * i + 2] = dst[12 * i + 3] = g[Y]; \
dst[12 * i + 4] = dst[12 * i + 5] = r[Y]; \
Y = src[2 * i + 1]; \
dst[12 * i + 6] = dst[12 * i + 7] = b[Y]; \
dst[12 * i + 8] = dst[12 * i + 9] = g[Y]; \
dst[12 * i + 10] = dst[12 * i + 11] = r[Y];
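/* How the PUT* macros above fit together: LOADCHROMA turns U/V into base
 * pointers into precomputed ramps, so each channel contribution is a single
 * lookup on Y. For packed destinations the tables already hold the channel
 * value shifted into place; e.g. for RGB565 (the 15/16 bpp case in
 * ff_yuv2rgb_c_init_tables below) r[Y] is red << 11, g[Y] is green << 5 and
 * b[Y] is blue, so r[Y] + g[Y] + b[Y] assembles the whole pixel with plain
 * additions. */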
#define YUV2RGBFUNC(func_name, dst_type, alpha) \
static int func_name(SwsContext *c, const uint8_t *src[], \
int srcStride[], int srcSliceY, int srcSliceH, \
uint8_t *dst[], int dstStride[]) \
{ \
int y; \
\
if (!alpha && c->srcFormat == AV_PIX_FMT_YUV422P) { \
srcStride[1] *= 2; \
srcStride[2] *= 2; \
} \
for (y = 0; y < srcSliceH; y += 2) { \
dst_type *dst_1 = \
(dst_type *)(dst[0] + (y + srcSliceY) * dstStride[0]); \
dst_type *dst_2 = \
(dst_type *)(dst[0] + (y + srcSliceY + 1) * dstStride[0]); \
dst_type av_unused *r, *g, *b; \
const uint8_t *py_1 = src[0] + y * srcStride[0]; \
const uint8_t *py_2 = py_1 + srcStride[0]; \
const uint8_t *pu = src[1] + (y >> 1) * srcStride[1]; \
const uint8_t *pv = src[2] + (y >> 1) * srcStride[2]; \
const uint8_t av_unused *pa_1, *pa_2; \
unsigned int h_size = c->dstW >> 3; \
if (alpha) { \
pa_1 = src[3] + y * srcStride[3]; \
pa_2 = pa_1 + srcStride[3]; \
} \
while (h_size--) { \
int av_unused U, V, Y; \
#define ENDYUV2RGBLINE(dst_delta, ss) \
pu += 4 >> ss; \
pv += 4 >> ss; \
py_1 += 8 >> ss; \
py_2 += 8 >> ss; \
dst_1 += dst_delta >> ss; \
dst_2 += dst_delta >> ss; \
} \
if (c->dstW & (4 >> ss)) { \
int av_unused Y, U, V; \
#define ENDYUV2RGBFUNC() \
} \
} \
return srcSliceH; \
}
#define CLOSEYUV2RGBFUNC(dst_delta) \
ENDYUV2RGBLINE(dst_delta, 0) \
ENDYUV2RGBFUNC()
YUV2RGBFUNC(yuv2rgb_c_48, uint8_t, 0)
LOADCHROMA(0);
PUTRGB48(dst_1, py_1, 0);
PUTRGB48(dst_2, py_2, 0);
LOADCHROMA(1);
PUTRGB48(dst_2, py_2, 1);
PUTRGB48(dst_1, py_1, 1);
LOADCHROMA(2);
PUTRGB48(dst_1, py_1, 2);
PUTRGB48(dst_2, py_2, 2);
LOADCHROMA(3);
PUTRGB48(dst_2, py_2, 3);
PUTRGB48(dst_1, py_1, 3);
ENDYUV2RGBLINE(48, 0)
LOADCHROMA(0);
PUTRGB48(dst_1, py_1, 0);
PUTRGB48(dst_2, py_2, 0);
LOADCHROMA(1);
PUTRGB48(dst_2, py_2, 1);
PUTRGB48(dst_1, py_1, 1);
ENDYUV2RGBLINE(48, 1)
LOADCHROMA(0);
PUTRGB48(dst_1, py_1, 0);
PUTRGB48(dst_2, py_2, 0);
ENDYUV2RGBFUNC()
YUV2RGBFUNC(yuv2rgb_c_bgr48, uint8_t, 0)
LOADCHROMA(0);
PUTBGR48(dst_1, py_1, 0);
PUTBGR48(dst_2, py_2, 0);
LOADCHROMA(1);
PUTBGR48(dst_2, py_2, 1);
PUTBGR48(dst_1, py_1, 1);
LOADCHROMA(2);
PUTBGR48(dst_1, py_1, 2);
PUTBGR48(dst_2, py_2, 2);
LOADCHROMA(3);
PUTBGR48(dst_2, py_2, 3);
PUTBGR48(dst_1, py_1, 3);
ENDYUV2RGBLINE(48, 0)
LOADCHROMA(0);
PUTBGR48(dst_1, py_1, 0);
PUTBGR48(dst_2, py_2, 0);
LOADCHROMA(1);
PUTBGR48(dst_2, py_2, 1);
PUTBGR48(dst_1, py_1, 1);
ENDYUV2RGBLINE(48, 1)
LOADCHROMA(0);
PUTBGR48(dst_1, py_1, 0);
PUTBGR48(dst_2, py_2, 0);
ENDYUV2RGBFUNC()
YUV2RGBFUNC(yuv2rgb_c_32, uint32_t, 0)
LOADCHROMA(0);
PUTRGB(dst_1, py_1, 0);
PUTRGB(dst_2, py_2, 0);
LOADCHROMA(1);
PUTRGB(dst_2, py_2, 1);
PUTRGB(dst_1, py_1, 1);
LOADCHROMA(2);
PUTRGB(dst_1, py_1, 2);
PUTRGB(dst_2, py_2, 2);
LOADCHROMA(3);
PUTRGB(dst_2, py_2, 3);
PUTRGB(dst_1, py_1, 3);
ENDYUV2RGBLINE(8, 0)
LOADCHROMA(0);
PUTRGB(dst_1, py_1, 0);
PUTRGB(dst_2, py_2, 0);
LOADCHROMA(1);
PUTRGB(dst_2, py_2, 1);
PUTRGB(dst_1, py_1, 1);
ENDYUV2RGBLINE(8, 1)
LOADCHROMA(0);
PUTRGB(dst_1, py_1, 0);
PUTRGB(dst_2, py_2, 0);
ENDYUV2RGBFUNC()
YUV2RGBFUNC(yuva2rgba_c, uint32_t, 1)
LOADCHROMA(0);
PUTRGBA(dst_1, py_1, pa_1, 0, 24);
PUTRGBA(dst_2, py_2, pa_2, 0, 24);
LOADCHROMA(1);
PUTRGBA(dst_2, py_2, pa_2, 1, 24);
PUTRGBA(dst_1, py_1, pa_1, 1, 24);
LOADCHROMA(2);
PUTRGBA(dst_1, py_1, pa_1, 2, 24);
PUTRGBA(dst_2, py_2, pa_2, 2, 24);
LOADCHROMA(3);
PUTRGBA(dst_2, py_2, pa_2, 3, 24);
PUTRGBA(dst_1, py_1, pa_1, 3, 24);
pa_1 += 8;
pa_2 += 8;
ENDYUV2RGBLINE(8, 0)
LOADCHROMA(0);
PUTRGBA(dst_1, py_1, pa_1, 0, 24);
PUTRGBA(dst_2, py_2, pa_2, 0, 24);
LOADCHROMA(1);
PUTRGBA(dst_2, py_2, pa_2, 1, 24);
PUTRGBA(dst_1, py_1, pa_1, 1, 24);
pa_1 += 4;
pa_2 += 4;
ENDYUV2RGBLINE(8, 1)
LOADCHROMA(0);
PUTRGBA(dst_1, py_1, pa_1, 0, 24);
PUTRGBA(dst_2, py_2, pa_2, 0, 24);
ENDYUV2RGBFUNC()
YUV2RGBFUNC(yuva2argb_c, uint32_t, 1)
LOADCHROMA(0);
PUTRGBA(dst_1, py_1, pa_1, 0, 0);
PUTRGBA(dst_2, py_2, pa_2, 0, 0);
LOADCHROMA(1);
PUTRGBA(dst_2, py_2, pa_2, 1, 0);
PUTRGBA(dst_1, py_1, pa_1, 1, 0);
LOADCHROMA(2);
PUTRGBA(dst_1, py_1, pa_1, 2, 0);
PUTRGBA(dst_2, py_2, pa_2, 2, 0);
LOADCHROMA(3);
PUTRGBA(dst_2, py_2, pa_2, 3, 0);
PUTRGBA(dst_1, py_1, pa_1, 3, 0);
pa_1 += 8;
pa_2 += 8;
ENDYUV2RGBLINE(8, 0)
LOADCHROMA(0);
PUTRGBA(dst_1, py_1, pa_1, 0, 0);
PUTRGBA(dst_2, py_2, pa_2, 0, 0);
LOADCHROMA(1);
PUTRGBA(dst_2, py_2, pa_2, 1, 0);
PUTRGBA(dst_1, py_1, pa_1, 1, 0);
pa_1 += 4;
pa_2 += 4;
ENDYUV2RGBLINE(8, 1)
LOADCHROMA(0);
PUTRGBA(dst_1, py_1, pa_1, 0, 0);
PUTRGBA(dst_2, py_2, pa_2, 0, 0);
ENDYUV2RGBFUNC()
YUV2RGBFUNC(yuv2rgb_c_24_rgb, uint8_t, 0)
LOADCHROMA(0);
PUTRGB24(dst_1, py_1, 0);
PUTRGB24(dst_2, py_2, 0);
LOADCHROMA(1);
PUTRGB24(dst_2, py_2, 1);
PUTRGB24(dst_1, py_1, 1);
LOADCHROMA(2);
PUTRGB24(dst_1, py_1, 2);
PUTRGB24(dst_2, py_2, 2);
LOADCHROMA(3);
PUTRGB24(dst_2, py_2, 3);
PUTRGB24(dst_1, py_1, 3);
ENDYUV2RGBLINE(24, 0)
LOADCHROMA(0);
PUTRGB24(dst_1, py_1, 0);
PUTRGB24(dst_2, py_2, 0);
LOADCHROMA(1);
PUTRGB24(dst_2, py_2, 1);
PUTRGB24(dst_1, py_1, 1);
ENDYUV2RGBLINE(24, 1)
LOADCHROMA(0);
PUTRGB24(dst_1, py_1, 0);
PUTRGB24(dst_2, py_2, 0);
ENDYUV2RGBFUNC()
// only trivial mods from yuv2rgb_c_24_rgb
YUV2RGBFUNC(yuv2rgb_c_24_bgr, uint8_t, 0)
LOADCHROMA(0);
PUTBGR24(dst_1, py_1, 0);
PUTBGR24(dst_2, py_2, 0);
LOADCHROMA(1);
PUTBGR24(dst_2, py_2, 1);
PUTBGR24(dst_1, py_1, 1);
LOADCHROMA(2);
PUTBGR24(dst_1, py_1, 2);
PUTBGR24(dst_2, py_2, 2);
LOADCHROMA(3);
PUTBGR24(dst_2, py_2, 3);
PUTBGR24(dst_1, py_1, 3);
ENDYUV2RGBLINE(24, 0)
LOADCHROMA(0);
PUTBGR24(dst_1, py_1, 0);
PUTBGR24(dst_2, py_2, 0);
LOADCHROMA(1);
PUTBGR24(dst_2, py_2, 1);
PUTBGR24(dst_1, py_1, 1);
ENDYUV2RGBLINE(24, 1)
LOADCHROMA(0);
PUTBGR24(dst_1, py_1, 0);
PUTBGR24(dst_2, py_2, 0);
ENDYUV2RGBFUNC()
YUV2RGBFUNC(yuv2rgb_c_16_ordered_dither, uint16_t, 0)
const uint8_t *d16 = ff_dither_2x2_8[y & 1];
const uint8_t *e16 = ff_dither_2x2_4[y & 1];
const uint8_t *f16 = ff_dither_2x2_8[(y & 1)^1];
#define PUTRGB16(dst, src, i, o) \
Y = src[2 * i]; \
dst[2 * i] = r[Y + d16[0 + o]] + \
g[Y + e16[0 + o]] + \
b[Y + f16[0 + o]]; \
Y = src[2 * i + 1]; \
dst[2 * i + 1] = r[Y + d16[1 + o]] + \
g[Y + e16[1 + o]] + \
b[Y + f16[1 + o]];
LOADCHROMA(0);
PUTRGB16(dst_1, py_1, 0, 0);
PUTRGB16(dst_2, py_2, 0, 0 + 8);
LOADCHROMA(1);
PUTRGB16(dst_2, py_2, 1, 2 + 8);
PUTRGB16(dst_1, py_1, 1, 2);
LOADCHROMA(2);
PUTRGB16(dst_1, py_1, 2, 4);
PUTRGB16(dst_2, py_2, 2, 4 + 8);
LOADCHROMA(3);
PUTRGB16(dst_2, py_2, 3, 6 + 8);
PUTRGB16(dst_1, py_1, 3, 6);
CLOSEYUV2RGBFUNC(8)
YUV2RGBFUNC(yuv2rgb_c_15_ordered_dither, uint16_t, 0)
const uint8_t *d16 = ff_dither_2x2_8[y & 1];
const uint8_t *e16 = ff_dither_2x2_8[(y & 1)^1];
#define PUTRGB15(dst, src, i, o) \
Y = src[2 * i]; \
dst[2 * i] = r[Y + d16[0 + o]] + \
g[Y + d16[1 + o]] + \
b[Y + e16[0 + o]]; \
Y = src[2 * i + 1]; \
dst[2 * i + 1] = r[Y + d16[1 + o]] + \
g[Y + d16[0 + o]] + \
b[Y + e16[1 + o]];
LOADCHROMA(0);
PUTRGB15(dst_1, py_1, 0, 0);
PUTRGB15(dst_2, py_2, 0, 0 + 8);
LOADCHROMA(1);
PUTRGB15(dst_2, py_2, 1, 2 + 8);
PUTRGB15(dst_1, py_1, 1, 2);
LOADCHROMA(2);
PUTRGB15(dst_1, py_1, 2, 4);
PUTRGB15(dst_2, py_2, 2, 4 + 8);
LOADCHROMA(3);
PUTRGB15(dst_2, py_2, 3, 6 + 8);
PUTRGB15(dst_1, py_1, 3, 6);
CLOSEYUV2RGBFUNC(8)
// r, g, b, dst_1, dst_2
YUV2RGBFUNC(yuv2rgb_c_12_ordered_dither, uint16_t, 0)
const uint8_t *d16 = ff_dither_4x4_16[y & 3];
#define PUTRGB12(dst, src, i, o) \
Y = src[2 * i]; \
dst[2 * i] = r[Y + d16[0 + o]] + \
g[Y + d16[0 + o]] + \
b[Y + d16[0 + o]]; \
Y = src[2 * i + 1]; \
dst[2 * i + 1] = r[Y + d16[1 + o]] + \
g[Y + d16[1 + o]] + \
b[Y + d16[1 + o]];
LOADCHROMA(0);
PUTRGB12(dst_1, py_1, 0, 0);
PUTRGB12(dst_2, py_2, 0, 0 + 8);
LOADCHROMA(1);
PUTRGB12(dst_2, py_2, 1, 2 + 8);
PUTRGB12(dst_1, py_1, 1, 2);
LOADCHROMA(2);
PUTRGB12(dst_1, py_1, 2, 4);
PUTRGB12(dst_2, py_2, 2, 4 + 8);
LOADCHROMA(3);
PUTRGB12(dst_2, py_2, 3, 6 + 8);
PUTRGB12(dst_1, py_1, 3, 6);
CLOSEYUV2RGBFUNC(8)
// r, g, b, dst_1, dst_2
YUV2RGBFUNC(yuv2rgb_c_8_ordered_dither, uint8_t, 0)
const uint8_t *d32 = ff_dither_8x8_32[y & 7];
const uint8_t *d64 = ff_dither_8x8_73[y & 7];
#define PUTRGB8(dst, src, i, o) \
Y = src[2 * i]; \
dst[2 * i] = r[Y + d32[0 + o]] + \
g[Y + d32[0 + o]] + \
b[Y + d64[0 + o]]; \
Y = src[2 * i + 1]; \
dst[2 * i + 1] = r[Y + d32[1 + o]] + \
g[Y + d32[1 + o]] + \
b[Y + d64[1 + o]];
LOADCHROMA(0);
PUTRGB8(dst_1, py_1, 0, 0);
PUTRGB8(dst_2, py_2, 0, 0 + 8);
LOADCHROMA(1);
PUTRGB8(dst_2, py_2, 1, 2 + 8);
PUTRGB8(dst_1, py_1, 1, 2);
LOADCHROMA(2);
PUTRGB8(dst_1, py_1, 2, 4);
PUTRGB8(dst_2, py_2, 2, 4 + 8);
LOADCHROMA(3);
PUTRGB8(dst_2, py_2, 3, 6 + 8);
PUTRGB8(dst_1, py_1, 3, 6);
CLOSEYUV2RGBFUNC(8)
YUV2RGBFUNC(yuv2rgb_c_4_ordered_dither, uint8_t, 0)
const uint8_t * d64 = ff_dither_8x8_73[y & 7];
const uint8_t *d128 = ff_dither_8x8_220[y & 7];
int acc;
#define PUTRGB4D(dst, src, i, o) \
Y = src[2 * i]; \
acc = r[Y + d128[0 + o]] + \
g[Y + d64[0 + o]] + \
b[Y + d128[0 + o]]; \
Y = src[2 * i + 1]; \
acc |= (r[Y + d128[1 + o]] + \
g[Y + d64[1 + o]] + \
b[Y + d128[1 + o]]) << 4; \
dst[i] = acc;
LOADCHROMA(0);
PUTRGB4D(dst_1, py_1, 0, 0);
PUTRGB4D(dst_2, py_2, 0, 0 + 8);
LOADCHROMA(1);
PUTRGB4D(dst_2, py_2, 1, 2 + 8);
PUTRGB4D(dst_1, py_1, 1, 2);
LOADCHROMA(2);
PUTRGB4D(dst_1, py_1, 2, 4);
PUTRGB4D(dst_2, py_2, 2, 4 + 8);
LOADCHROMA(3);
PUTRGB4D(dst_2, py_2, 3, 6 + 8);
PUTRGB4D(dst_1, py_1, 3, 6);
CLOSEYUV2RGBFUNC(4)
YUV2RGBFUNC(yuv2rgb_c_4b_ordered_dither, uint8_t, 0)
const uint8_t *d64 = ff_dither_8x8_73[y & 7];
const uint8_t *d128 = ff_dither_8x8_220[y & 7];
#define PUTRGB4DB(dst, src, i, o) \
Y = src[2 * i]; \
dst[2 * i] = r[Y + d128[0 + o]] + \
g[Y + d64[0 + o]] + \
b[Y + d128[0 + o]]; \
Y = src[2 * i + 1]; \
dst[2 * i + 1] = r[Y + d128[1 + o]] + \
g[Y + d64[1 + o]] + \
b[Y + d128[1 + o]];
LOADCHROMA(0);
PUTRGB4DB(dst_1, py_1, 0, 0);
PUTRGB4DB(dst_2, py_2, 0, 0 + 8);
LOADCHROMA(1);
PUTRGB4DB(dst_2, py_2, 1, 2 + 8);
PUTRGB4DB(dst_1, py_1, 1, 2);
LOADCHROMA(2);
PUTRGB4DB(dst_1, py_1, 2, 4);
PUTRGB4DB(dst_2, py_2, 2, 4 + 8);
LOADCHROMA(3);
PUTRGB4DB(dst_2, py_2, 3, 6 + 8);
PUTRGB4DB(dst_1, py_1, 3, 6);
CLOSEYUV2RGBFUNC(8)
YUV2RGBFUNC(yuv2rgb_c_1_ordered_dither, uint8_t, 0)
const uint8_t *d128 = ff_dither_8x8_220[y & 7];
char out_1 = 0, out_2 = 0;
g = c->table_gU[128 + YUVRGB_TABLE_HEADROOM] + c->table_gV[128 + YUVRGB_TABLE_HEADROOM];
#define PUTRGB1(out, src, i, o) \
Y = src[2 * i]; \
out += out + g[Y + d128[0 + o]]; \
Y = src[2 * i + 1]; \
out += out + g[Y + d128[1 + o]];
PUTRGB1(out_1, py_1, 0, 0);
PUTRGB1(out_2, py_2, 0, 0 + 8);
PUTRGB1(out_2, py_2, 1, 2 + 8);
PUTRGB1(out_1, py_1, 1, 2);
PUTRGB1(out_1, py_1, 2, 4);
PUTRGB1(out_2, py_2, 2, 4 + 8);
PUTRGB1(out_2, py_2, 3, 6 + 8);
PUTRGB1(out_1, py_1, 3, 6);
dst_1[0] = out_1;
dst_2[0] = out_2;
CLOSEYUV2RGBFUNC(1)
SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c)
{
SwsFunc t = NULL;
if (ARCH_BFIN)
t = ff_yuv2rgb_init_bfin(c);
if (ARCH_PPC)
t = ff_yuv2rgb_init_ppc(c);
if (HAVE_VIS)
t = ff_yuv2rgb_init_vis(c);
if (ARCH_X86)
t = ff_yuv2rgb_init_x86(c);
if (t)
return t;
av_log(c, AV_LOG_WARNING,
"No accelerated colorspace conversion found from %s to %s.\n",
av_get_pix_fmt_name(c->srcFormat), av_get_pix_fmt_name(c->dstFormat));
switch (c->dstFormat) {
case AV_PIX_FMT_BGR48BE:
case AV_PIX_FMT_BGR48LE:
return yuv2rgb_c_bgr48;
case AV_PIX_FMT_RGB48BE:
case AV_PIX_FMT_RGB48LE:
return yuv2rgb_c_48;
case AV_PIX_FMT_ARGB:
case AV_PIX_FMT_ABGR:
if (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat))
return yuva2argb_c;
case AV_PIX_FMT_RGBA:
case AV_PIX_FMT_BGRA:
return (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat)) ? yuva2rgba_c : yuv2rgb_c_32;
case AV_PIX_FMT_RGB24:
return yuv2rgb_c_24_rgb;
case AV_PIX_FMT_BGR24:
return yuv2rgb_c_24_bgr;
case AV_PIX_FMT_RGB565:
case AV_PIX_FMT_BGR565:
return yuv2rgb_c_16_ordered_dither;
case AV_PIX_FMT_RGB555:
case AV_PIX_FMT_BGR555:
return yuv2rgb_c_15_ordered_dither;
case AV_PIX_FMT_RGB444:
case AV_PIX_FMT_BGR444:
return yuv2rgb_c_12_ordered_dither;
case AV_PIX_FMT_RGB8:
case AV_PIX_FMT_BGR8:
return yuv2rgb_c_8_ordered_dither;
case AV_PIX_FMT_RGB4:
case AV_PIX_FMT_BGR4:
return yuv2rgb_c_4_ordered_dither;
case AV_PIX_FMT_RGB4_BYTE:
case AV_PIX_FMT_BGR4_BYTE:
return yuv2rgb_c_4b_ordered_dither;
case AV_PIX_FMT_MONOBLACK:
return yuv2rgb_c_1_ordered_dither;
}
return NULL;
}
static void fill_table(uint8_t* table[256 + 2*YUVRGB_TABLE_HEADROOM], const int elemsize,
const int64_t inc, void *y_tab)
{
int i;
uint8_t *y_table = y_tab;
y_table -= elemsize * (inc >> 9);
for (i = 0; i < 256 + 2*YUVRGB_TABLE_HEADROOM; i++) {
int64_t cb = av_clip(i-YUVRGB_TABLE_HEADROOM, 0, 255)*inc;
table[i] = y_table + elemsize * (cb >> 16);
}
}
static void fill_gv_table(int table[256 + 2*YUVRGB_TABLE_HEADROOM], const int elemsize, const int64_t inc)
{
int i;
int off = -(inc >> 9);
for (i = 0; i < 256 + 2*YUVRGB_TABLE_HEADROOM; i++) {
int64_t cb = av_clip(i-YUVRGB_TABLE_HEADROOM, 0, 255)*inc;
table[i] = elemsize * (off + (cb >> 16));
}
}
static uint16_t roundToInt16(int64_t f)
{
int r = (f + (1 << 15)) >> 16;
if (r < -0x7FFF)
return 0x8000;
else if (r > 0x7FFF)
return 0x7FFF;
else
return r;
}
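/* e.g. roundToInt16(0x18000) = (0x18000 + 0x8000) >> 16 = 2; results outside
 * the signed 16-bit range saturate to 0x7FFF / 0x8000. */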
av_cold int ff_yuv2rgb_c_init_tables(SwsContext *c, const int inv_table[4],
int fullRange, int brightness,
int contrast, int saturation)
{
const int isRgb = c->dstFormat == AV_PIX_FMT_RGB32 ||
c->dstFormat == AV_PIX_FMT_RGB32_1 ||
c->dstFormat == AV_PIX_FMT_BGR24 ||
c->dstFormat == AV_PIX_FMT_RGB565BE ||
c->dstFormat == AV_PIX_FMT_RGB565LE ||
c->dstFormat == AV_PIX_FMT_RGB555BE ||
c->dstFormat == AV_PIX_FMT_RGB555LE ||
c->dstFormat == AV_PIX_FMT_RGB444BE ||
c->dstFormat == AV_PIX_FMT_RGB444LE ||
c->dstFormat == AV_PIX_FMT_RGB8 ||
c->dstFormat == AV_PIX_FMT_RGB4 ||
c->dstFormat == AV_PIX_FMT_RGB4_BYTE ||
c->dstFormat == AV_PIX_FMT_MONOBLACK;
const int isNotNe = c->dstFormat == AV_PIX_FMT_NE(RGB565LE, RGB565BE) ||
c->dstFormat == AV_PIX_FMT_NE(RGB555LE, RGB555BE) ||
c->dstFormat == AV_PIX_FMT_NE(RGB444LE, RGB444BE) ||
c->dstFormat == AV_PIX_FMT_NE(BGR565LE, BGR565BE) ||
c->dstFormat == AV_PIX_FMT_NE(BGR555LE, BGR555BE) ||
c->dstFormat == AV_PIX_FMT_NE(BGR444LE, BGR444BE);
const int bpp = c->dstFormatBpp;
uint8_t *y_table;
uint16_t *y_table16;
uint32_t *y_table32;
int i, base, rbase, gbase, bbase, av_uninit(abase), needAlpha;
const int yoffs = fullRange ? 384 : 326;
int64_t crv = inv_table[0];
int64_t cbu = inv_table[1];
int64_t cgu = -inv_table[2];
int64_t cgv = -inv_table[3];
int64_t cy = 1 << 16;
int64_t oy = 0;
int64_t yb = 0;
if (!fullRange) {
cy = (cy * 255) / 219;
oy = 16 << 16;
} else {
crv = (crv * 224) / 255;
cbu = (cbu * 224) / 255;
cgu = (cgu * 224) / 255;
cgv = (cgv * 224) / 255;
}
cy = (cy * contrast) >> 16;
crv = (crv * contrast * saturation) >> 32;
cbu = (cbu * contrast * saturation) >> 32;
cgu = (cgu * contrast * saturation) >> 32;
cgv = (cgv * contrast * saturation) >> 32;
oy -= 256 * brightness;
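/* Example with neutral settings (brightness 0, contrast and saturation both
 * 1 << 16) on limited-range input: cy = ((1 << 16) * 255) / 219 = 76309 and
 * oy = 16 << 16, i.e. the usual 16..235 luma expansion with no extra
 * adjustment applied. */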
c->uOffset = 0x0400040004000400LL;
c->vOffset = 0x0400040004000400LL;
c->yCoeff = roundToInt16(cy * 8192) * 0x0001000100010001ULL;
c->vrCoeff = roundToInt16(crv * 8192) * 0x0001000100010001ULL;
c->ubCoeff = roundToInt16(cbu * 8192) * 0x0001000100010001ULL;
c->vgCoeff = roundToInt16(cgv * 8192) * 0x0001000100010001ULL;
c->ugCoeff = roundToInt16(cgu * 8192) * 0x0001000100010001ULL;
c->yOffset = roundToInt16(oy * 8) * 0x0001000100010001ULL;
c->yuv2rgb_y_coeff = (int16_t)roundToInt16(cy << 13);
c->yuv2rgb_y_offset = (int16_t)roundToInt16(oy << 9);
c->yuv2rgb_v2r_coeff = (int16_t)roundToInt16(crv << 13);
c->yuv2rgb_v2g_coeff = (int16_t)roundToInt16(cgv << 13);
c->yuv2rgb_u2g_coeff = (int16_t)roundToInt16(cgu << 13);
c->yuv2rgb_u2b_coeff = (int16_t)roundToInt16(cbu << 13);
//scale coefficients by cy
crv = ((crv << 16) + 0x8000) / FFMAX(cy, 1);
cbu = ((cbu << 16) + 0x8000) / FFMAX(cy, 1);
cgu = ((cgu << 16) + 0x8000) / FFMAX(cy, 1);
cgv = ((cgv << 16) + 0x8000) / FFMAX(cy, 1);
av_freep(&c->yuvTable);
switch (bpp) {
case 1:
c->yuvTable = av_malloc(1024);
y_table = c->yuvTable;
yb = -(384 << 16) - oy;
for (i = 0; i < 1024 - 110; i++) {
y_table[i + 110] = av_clip_uint8((yb + 0x8000) >> 16) >> 7;
yb += cy;
}
fill_table(c->table_gU, 1, cgu, y_table + yoffs);
fill_gv_table(c->table_gV, 1, cgv);
break;
case 4:
case 4 | 128:
rbase = isRgb ? 3 : 0;
gbase = 1;
bbase = isRgb ? 0 : 3;
c->yuvTable = av_malloc(1024 * 3);
y_table = c->yuvTable;
yb = -(384 << 16) - oy;
for (i = 0; i < 1024 - 110; i++) {
int yval = av_clip_uint8((yb + 0x8000) >> 16);
y_table[i + 110] = (yval >> 7) << rbase;
y_table[i + 37 + 1024] = ((yval + 43) / 85) << gbase;
y_table[i + 110 + 2048] = (yval >> 7) << bbase;
yb += cy;
}
fill_table(c->table_rV, 1, crv, y_table + yoffs);
fill_table(c->table_gU, 1, cgu, y_table + yoffs + 1024);
fill_table(c->table_bU, 1, cbu, y_table + yoffs + 2048);
fill_gv_table(c->table_gV, 1, cgv);
break;
case 8:
rbase = isRgb ? 5 : 0;
gbase = isRgb ? 2 : 3;
bbase = isRgb ? 0 : 6;
c->yuvTable = av_malloc(1024 * 3);
y_table = c->yuvTable;
yb = -(384 << 16) - oy;
for (i = 0; i < 1024 - 38; i++) {
int yval = av_clip_uint8((yb + 0x8000) >> 16);
y_table[i + 16] = ((yval + 18) / 36) << rbase;
y_table[i + 16 + 1024] = ((yval + 18) / 36) << gbase;
y_table[i + 37 + 2048] = ((yval + 43) / 85) << bbase;
yb += cy;
}
fill_table(c->table_rV, 1, crv, y_table + yoffs);
fill_table(c->table_gU, 1, cgu, y_table + yoffs + 1024);
fill_table(c->table_bU, 1, cbu, y_table + yoffs + 2048);
fill_gv_table(c->table_gV, 1, cgv);
break;
case 12:
rbase = isRgb ? 8 : 0;
gbase = 4;
bbase = isRgb ? 0 : 8;
c->yuvTable = av_malloc(1024 * 3 * 2);
y_table16 = c->yuvTable;
yb = -(384 << 16) - oy;
for (i = 0; i < 1024; i++) {
uint8_t yval = av_clip_uint8((yb + 0x8000) >> 16);
y_table16[i] = (yval >> 4) << rbase;
y_table16[i + 1024] = (yval >> 4) << gbase;
y_table16[i + 2048] = (yval >> 4) << bbase;
yb += cy;
}
if (isNotNe)
for (i = 0; i < 1024 * 3; i++)
y_table16[i] = av_bswap16(y_table16[i]);
fill_table(c->table_rV, 2, crv, y_table16 + yoffs);
fill_table(c->table_gU, 2, cgu, y_table16 + yoffs + 1024);
fill_table(c->table_bU, 2, cbu, y_table16 + yoffs + 2048);
fill_gv_table(c->table_gV, 2, cgv);
break;
case 15:
case 16:
rbase = isRgb ? bpp - 5 : 0;
gbase = 5;
bbase = isRgb ? 0 : (bpp - 5);
c->yuvTable = av_malloc(1024 * 3 * 2);
y_table16 = c->yuvTable;
yb = -(384 << 16) - oy;
for (i = 0; i < 1024; i++) {
uint8_t yval = av_clip_uint8((yb + 0x8000) >> 16);
y_table16[i] = (yval >> 3) << rbase;
y_table16[i + 1024] = (yval >> (18 - bpp)) << gbase;
y_table16[i + 2048] = (yval >> 3) << bbase;
yb += cy;
}
if (isNotNe)
for (i = 0; i < 1024 * 3; i++)
y_table16[i] = av_bswap16(y_table16[i]);
fill_table(c->table_rV, 2, crv, y_table16 + yoffs);
fill_table(c->table_gU, 2, cgu, y_table16 + yoffs + 1024);
fill_table(c->table_bU, 2, cbu, y_table16 + yoffs + 2048);
fill_gv_table(c->table_gV, 2, cgv);
break;
case 24:
case 48:
c->yuvTable = av_malloc(1024);
y_table = c->yuvTable;
yb = -(384 << 16) - oy;
for (i = 0; i < 1024; i++) {
y_table[i] = av_clip_uint8((yb + 0x8000) >> 16);
yb += cy;
}
fill_table(c->table_rV, 1, crv, y_table + yoffs);
fill_table(c->table_gU, 1, cgu, y_table + yoffs);
fill_table(c->table_bU, 1, cbu, y_table + yoffs);
fill_gv_table(c->table_gV, 1, cgv);
break;
case 32:
case 64:
base = (c->dstFormat == AV_PIX_FMT_RGB32_1 ||
c->dstFormat == AV_PIX_FMT_BGR32_1) ? 8 : 0;
rbase = base + (isRgb ? 16 : 0);
gbase = base + 8;
bbase = base + (isRgb ? 0 : 16);
needAlpha = CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat);
if (!needAlpha)
abase = (base + 24) & 31;
c->yuvTable = av_malloc(1024 * 3 * 4);
y_table32 = c->yuvTable;
yb = -(384 << 16) - oy;
for (i = 0; i < 1024; i++) {
unsigned yval = av_clip_uint8((yb + 0x8000) >> 16);
y_table32[i] = (yval << rbase) +
(needAlpha ? 0 : (255u << abase));
y_table32[i + 1024] = yval << gbase;
y_table32[i + 2048] = yval << bbase;
yb += cy;
}
fill_table(c->table_rV, 4, crv, y_table32 + yoffs);
fill_table(c->table_gU, 4, cgu, y_table32 + yoffs + 1024);
fill_table(c->table_bU, 4, cbu, y_table32 + yoffs + 2048);
fill_gv_table(c->table_gV, 4, cgv);
break;
default:
if(!isPlanar(c->dstFormat) || bpp <= 24)
av_log(c, AV_LOG_ERROR, "%ibpp not supported by yuv2rgb\n", bpp);
return -1;
}
return 0;
}