ffmpeg-2.8.5

git-svn-id: svn://kolibrios.org@6147 a494cfbc-eb01-0410-851d-a64ba20cac60
Author: Sergey Semyonov (Serge)
Date:   2016-02-05 22:08:02 +00:00
parent a08f61ddb9
commit a4b787f4b8
5429 changed files with 1356786 additions and 0 deletions

libswscale/Makefile

@@ -0,0 +1,30 @@
include $(SUBDIR)../config.mak
NAME = swscale
FFLIBS = avutil
HEADERS = swscale.h \
version.h \
OBJS = alphablend.o \
hscale_fast_bilinear.o \
input.o \
options.o \
output.o \
rgb2rgb.o \
swscale.o \
swscale_unscaled.o \
utils.o \
yuv2rgb.o \
slice.o \
hscale.o \
vscale.o \
gamma.o \
OBJS-$(CONFIG_SHARED) += log2_tab.o
# Windows resource file
SLIBOBJS-$(HAVE_GNU_WINDRES) += swscaleres.o
TESTPROGS = colorspace \
swscale \

libswscale/alphablend.c

@@ -0,0 +1,169 @@
/*
* Copyright (C) 2015 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "swscale_internal.h"
int ff_sws_alphablendaway(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
int nb_components = desc->nb_components;
int plane, x, y;
int plane_count = isGray(c->srcFormat) ? 1 : 3;
int sixteen_bits = desc->comp[0].depth_minus1 >= 8;
unsigned off = 1<<desc->comp[0].depth_minus1;
unsigned shift = desc->comp[0].depth_minus1 + 1;
unsigned max = (1<<shift) - 1;
int target_table[2][3];
for (plane = 0; plane < plane_count; plane++) {
int a = 0, b = 0;
if (c->alphablend == SWS_ALPHA_BLEND_CHECKERBOARD) {
a = (1<<desc->comp[0].depth_minus1)/2;
b = 3*(1<<desc->comp[0].depth_minus1)/2;
}
target_table[0][plane] = plane && !(desc->flags & AV_PIX_FMT_FLAG_RGB) ? 1<<desc->comp[0].depth_minus1 : a;
target_table[1][plane] = plane && !(desc->flags & AV_PIX_FMT_FLAG_RGB) ? 1<<desc->comp[0].depth_minus1 : b;
}
av_assert0(plane_count == nb_components - 1);
if (desc->flags & AV_PIX_FMT_FLAG_PLANAR) {
for (plane = 0; plane < plane_count; plane++) {
int w = plane ? c->chrSrcW : c->srcW;
int x_subsample = plane ? desc->log2_chroma_w: 0;
int y_subsample = plane ? desc->log2_chroma_h: 0;
for (y = srcSliceY >> y_subsample; y < FF_CEIL_RSHIFT(srcSliceH, y_subsample); y++) {
if (x_subsample || y_subsample) {
int alpha;
unsigned u;
if (sixteen_bits) {
ptrdiff_t alpha_step = srcStride[plane_count] >> 1;
const uint16_t *s = (const uint16_t *)(src[plane ] + srcStride[plane ] * y);
const uint16_t *a = (const uint16_t *)(src[plane_count] + (srcStride[plane_count] * y << y_subsample));
uint16_t *d = ( uint16_t *)(dst[plane ] + dstStride[plane ] * y);
if ((!isBE(c->srcFormat)) == !HAVE_BIGENDIAN) {
for (x = 0; x < w; x++) {
if (y_subsample) {
alpha = (a[2*x] + a[2*x + 1] + 2 +
a[2*x + alpha_step] + a[2*x + alpha_step + 1]) >> 2;
} else
alpha = (a[2*x] + a[2*x + 1]) >> 1;
u = s[x]*alpha + target_table[((x^y)>>5)&1][plane]*(max-alpha) + off;
d[x] = av_clip((u + (u >> shift)) >> shift, 0, max);
}
} else {
for (x = 0; x < w; x++) {
if (y_subsample) {
alpha = (av_bswap16(a[2*x]) + av_bswap16(a[2*x + 1]) + 2 +
av_bswap16(a[2*x + alpha_step]) + av_bswap16(a[2*x + alpha_step + 1])) >> 2;
} else
alpha = (av_bswap16(a[2*x]) + av_bswap16(a[2*x + 1])) >> 1;
u = av_bswap16(s[x])*alpha + target_table[((x^y)>>5)&1][plane]*(max-alpha) + off;
d[x] = av_clip((u + (u >> shift)) >> shift, 0, max);
}
}
} else {
ptrdiff_t alpha_step = srcStride[plane_count];
const uint8_t *s = src[plane ] + srcStride[plane] * y;
const uint8_t *a = src[plane_count] + (srcStride[plane_count] * y << y_subsample);
uint8_t *d = dst[plane ] + dstStride[plane] * y;
for (x = 0; x < w; x++) {
if (y_subsample) {
alpha = (a[2*x] + a[2*x + 1] + 2 +
a[2*x + alpha_step] + a[2*x + alpha_step + 1]) >> 2;
} else
alpha = (a[2*x] + a[2*x + 1]) >> 1;
u = s[x]*alpha + target_table[((x^y)>>5)&1][plane]*(255-alpha) + 128;
d[x] = (257*u) >> 16;
}
}
} else {
if (sixteen_bits) {
const uint16_t *s = (const uint16_t *)(src[plane ] + srcStride[plane ] * y);
const uint16_t *a = (const uint16_t *)(src[plane_count] + srcStride[plane_count] * y);
uint16_t *d = ( uint16_t *)(dst[plane ] + dstStride[plane ] * y);
if ((!isBE(c->srcFormat)) == !HAVE_BIGENDIAN) {
for (x = 0; x < w; x++) {
unsigned u = s[x]*a[x] + target_table[((x^y)>>5)&1][plane]*(max-a[x]) + off;
d[x] = av_clip((u + (u >> shift)) >> shift, 0, max);
}
} else {
for (x = 0; x < w; x++) {
unsigned aswap = av_bswap16(a[x]);
unsigned u = av_bswap16(s[x])*aswap + target_table[((x^y)>>5)&1][plane]*(max-aswap) + off;
d[x] = av_clip((u + (u >> shift)) >> shift, 0, max);
}
}
} else {
const uint8_t *s = src[plane ] + srcStride[plane] * y;
const uint8_t *a = src[plane_count] + srcStride[plane_count] * y;
uint8_t *d = dst[plane ] + dstStride[plane] * y;
for (x = 0; x < w; x++) {
unsigned u = s[x]*a[x] + target_table[((x^y)>>5)&1][plane]*(255-a[x]) + 128;
d[x] = (257*u) >> 16;
}
}
}
}
}
} else {
int alpha_pos = desc->comp[plane_count].offset_plus1 - 1;
int w = c->srcW;
for (y = srcSliceY; y < srcSliceH; y++) {
if (sixteen_bits) {
const uint16_t *s = (const uint16_t *)(src[0] + srcStride[0] * y + 2*!alpha_pos);
const uint16_t *a = (const uint16_t *)(src[0] + srcStride[0] * y + alpha_pos);
uint16_t *d = ( uint16_t *)(dst[0] + dstStride[0] * y);
if ((!isBE(c->srcFormat)) == !HAVE_BIGENDIAN) {
for (x = 0; x < w; x++) {
for (plane = 0; plane < plane_count; plane++) {
int x_index = (plane_count + 1) * x;
unsigned u = s[x_index + plane]*a[x_index] + target_table[((x^y)>>5)&1][plane]*(max-a[x_index]) + off;
d[plane_count*x + plane] = av_clip((u + (u >> shift)) >> shift, 0, max);
}
}
} else {
for (x = 0; x < w; x++) {
for (plane = 0; plane < plane_count; plane++) {
int x_index = (plane_count + 1) * x;
unsigned aswap = av_bswap16(a[x_index]);
unsigned u = av_bswap16(s[x_index + plane])*aswap + target_table[((x^y)>>5)&1][plane]*(max-aswap) + off;
d[plane_count*x + plane] = av_clip((u + (u >> shift)) >> shift, 0, max);
}
}
}
} else {
const uint8_t *s = src[0] + srcStride[0] * y + !alpha_pos;
const uint8_t *a = src[0] + srcStride[0] * y + alpha_pos;
uint8_t *d = dst[0] + dstStride[0] * y;
for (x = 0; x < w; x++) {
for (plane = 0; plane < plane_count; plane++) {
int x_index = (plane_count + 1) * x;
unsigned u = s[x_index + plane]*a[x_index] + target_table[((x^y)>>5)&1][plane]*(255-a[x_index]) + 128;
d[plane_count*x + plane] = (257*u) >> 16;
}
}
}
}
}
return 0;
}
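
The 8-bit branches above never divide by 255: with u = s*a + t*(255-a) + 128, the expression (257*u) >> 16 is an exact, correctly rounded substitute for dividing the blend by 255 (the 16-bit branches use the analogous (u + (u >> shift)) >> shift form). A minimal standalone check of that identity, as an editorial sketch rather than part of the commit:

#include <assert.h>
#include <stdio.h>

int main(void)
{
    /* exhaustively verify (257*(x+128))>>16 == round(x/255) for every
     * blend x = s*a + t*(255-a) the 8-bit paths can produce; for
     * non-negative integers, round(x/255) equals (x+127)/255 */
    for (int s = 0; s < 256; s++)
        for (int a = 0; a < 256; a++)
            for (int t = 0; t < 256; t++) {
                unsigned x = s * a + t * (255 - a);
                assert(((257 * (x + 128)) >> 16) == (x + 127) / 255);
            }
    printf("divide-by-255 identity holds for all 8-bit blends\n");
    return 0;
}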

libswscale/arm/Makefile

@@ -0,0 +1,4 @@
# OBJS += arm/swscale_unscaled.o
# NEON-OBJS += arm/rgb2yuv_neon_32.o
# NEON-OBJS += arm/rgb2yuv_neon_16.o

libswscale/arm/rgb2yuv_neon_16.S

@@ -0,0 +1,80 @@
/*
* Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "rgb2yuv_neon_common.S"
/* downsampled R16G16B16 x8 */
alias_qw r16x8, q7
alias_qw g16x8, q8
alias_qw b16x8, q9
alias n16x16_l, q11
alias n16x16_h, q12
alias y16x16_l, q13
alias y16x16_h, q14
alias_qw y8x16, q15
.macro init src
vld3.i32 {q13_l, q14_l, q15_l}, [\src]!
vld3.i32 {q13_h[0], q14_h[0], q15_h[0]}, [\src]
vrshrn.i32 CO_R, q13, #7
vrshrn.i32 CO_G, q14, #7
vrshrn.i32 CO_B, q15, #7
vmov.u8 BIAS_Y, #16
vmov.u8 BIAS_U, #128
.endm
.macro compute_y_16x1_step action, s8x16, coeff
vmovl.u8 n16x16_l, \s8x16\()_l
vmovl.u8 n16x16_h, \s8x16\()_h
\action y16x16_l, n16x16_l, \coeff
\action y16x16_h, n16x16_h, \coeff
.endm
.macro compute_y_16x1
compute_y_16x1_step vmul, r8x16, CO_RY
compute_y_16x1_step vmla, g8x16, CO_GY
compute_y_16x1_step vmla, b8x16, CO_BY
vrshrn.i16 y8x16_l, y16x16_l, #8
vrshrn.i16 y8x16_h, y16x16_h, #8
vadd.u8 y8x16, y8x16, BIAS_Y
.endm
alias c16x8, q15
alias_qw c8x8x2, q10
.macro compute_chroma_8x1 c, C
vmul c16x8, r16x8, CO_R\C
vmla c16x8, g16x8, CO_G\C
vmla c16x8, b16x8, CO_B\C
vrshrn.i16 \c\()8x8, c16x8, #8
vadd.u8 \c\()8x8, \c\()8x8, BIAS_\C
.endm
loop_420sp rgbx, nv12, init, kernel_420_16x2, 16

libswscale/arm/rgb2yuv_neon_32.S

@@ -0,0 +1,119 @@
/*
* Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "rgb2yuv_neon_common.S"
/* downsampled R16G16B16 x8 */
alias_qw r16x8, q7
alias_qw g16x8, q8
alias_qw b16x8, q9
alias n16x16_o, q11
alias n16x16_ol, q11_l
alias n16x16_oh, q11_h
alias y32x16_el, q12
alias y32x16_eh, q13
alias y32x16_ol, q14
alias y32x16_oh, q15
alias y16x16_e, q12
alias y16x16_el, q12_l
alias y16x16_eh, q12_h
alias y16x16_o, q13
alias y16x16_ol, q13_l
alias y16x16_oh, q13_h
alias y8x16, y16x16_e
.macro init src
// load s32x3x3, narrow to s16x3x3
vld3.i32 {q13_l, q14_l, q15_l}, [\src]!
vld3.i32 {q13_h[0], q14_h[0], q15_h[0]}, [\src]
vmovn.i32 CO_R, q13
vmovn.i32 CO_G, q14
vmovn.i32 CO_B, q15
vmov.u8 BIAS_Y, #16
vmov.u8 BIAS_U, #128
.endm
.macro compute_y_16x1_step action, s8x16, coeff
vmov.u8 n16x16_o, #0
vtrn.u8 \s8x16, n16x16_o
\action y32x16_el, \s8x16\()_l, \coeff
\action y32x16_eh, \s8x16\()_h, \coeff
\action y32x16_ol, n16x16_ol, \coeff
\action y32x16_oh, n16x16_oh, \coeff
.endm
/*
* in: r8x16, g8x16, b8x16
* out: y8x16
* clobber: q11-q15, r8x16, g8x16, b8x16
*/
.macro compute_y_16x1
compute_y_16x1_step vmull, r8x16, CO_RY
compute_y_16x1_step vmlal, g8x16, CO_GY
compute_y_16x1_step vmlal, b8x16, CO_BY
vrshrn.i32 y16x16_el, y32x16_el, #15
vrshrn.i32 y16x16_eh, y32x16_eh, #15
vrshrn.i32 y16x16_ol, y32x16_ol, #15
vrshrn.i32 y16x16_oh, y32x16_oh, #15
vtrn.8 y16x16_e, y16x16_o
vadd.u8 y8x16, y8x16, BIAS_Y
.endm
alias c32x8_l, q14
alias c32x8_h, q15
alias_qw c16x8, q13
alias_qw c8x8x2, q10
.macro compute_chroma_8x1_step action, s16x8, coeff
\action c32x8_l, \s16x8\()_l, \coeff
\action c32x8_h, \s16x8\()_h, \coeff
.endm
/*
* in: r16x8, g16x8, b16x8
* out: c8x8
* clobber: q14-q15
*/
.macro compute_chroma_8x1 c, C
compute_chroma_8x1_step vmull, r16x8, CO_R\C
compute_chroma_8x1_step vmlal, g16x8, CO_G\C
compute_chroma_8x1_step vmlal, b16x8, CO_B\C
vrshrn.i32 c16x8_l, c32x8_l, #15
vrshrn.i32 c16x8_h, c32x8_h, #15
vmovn.i16 \c\()8x8, c16x8
vadd.u8 \c\()8x8, \c\()8x8, BIAS_\C
.endm
loop_420sp rgbx, nv12, init, kernel_420_16x2, 32
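
Both kernels compute the same matrix transform: the 16-bit variant above pre-rounds the coefficients down to 8 fractional bits (vrshrn.i32 #7 at init, vrshrn.i16 #8 on the products), while this 32-bit variant keeps the full 16-bit coefficients, widens the products to 32 bits, and renormalizes with vrshrn.i32 #15, which is why it backs the SWS_ACCURATE_RND path. A scalar C model of the 32-bit luma computation follows; it is a sketch, and the row-major {ry,gy,by, ru,gu,bu, rv,gv,bv} layout of coeff_table[9] and its Q15 scaling are assumptions read off the deinterleaving vld3.i32 in init:

/* sketch of the 32-bit NEON luma path: Q15 multiply, rounding shift, bias */
static unsigned char rgbx_to_y_q15(int r, int g, int b, const int coeff[9])
{
    int y = (coeff[0] * r + coeff[1] * g + coeff[2] * b + (1 << 14)) >> 15;
    return (unsigned char)(y + 16); /* BIAS_Y; chroma adds 128 instead */
}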

libswscale/arm/rgb2yuv_neon_common.S

@@ -0,0 +1,291 @@
/*
* Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
.macro alias name, tgt, set=1
.if \set != 0
\name .req \tgt
.else
.unreq \name
.endif
.endm
.altmacro
.macro alias_dw_all qw, dw_l, dw_h
alias q\qw\()_l, d\dw_l
alias q\qw\()_h, d\dw_h
.if \qw < 15
alias_dw_all %(\qw + 1), %(\dw_l + 2), %(\dw_h + 2)
.endif
.endm
alias_dw_all 0, 0, 1
.noaltmacro
.macro alias_qw name, qw, set=1
alias \name\(), \qw, \set
alias \name\()_l, \qw\()_l, \set
alias \name\()_h, \qw\()_h, \set
.endm
.macro prologue
push {r4-r12, lr}
vpush {q4-q7}
.endm
.macro epilogue
vpop {q4-q7}
pop {r4-r12, pc}
.endm
.macro load_arg reg, ix
ldr \reg, [sp, #((10 * 4 + 4 * 16) + (\ix - 4) * 4)]
.endm
/* ()_to_()_neon(const uint8_t *src, uint8_t *y, uint8_t *chroma,
* int width, int height,
* int y_stride, int c_stride, int src_stride,
* int32_t coeff_table[9]);
*/
.macro alias_loop_420sp set=1
alias src, r0, \set
alias src0, src, \set
alias y, r1, \set
alias y0, y, \set
alias chroma, r2, \set
alias width, r3, \set
alias header, width, \set
alias height, r4, \set
alias y_stride, r5, \set
alias c_stride, r6, \set
alias c_padding, c_stride, \set
alias src_stride, r7, \set
alias y0_end, r8, \set
alias src_padding,r9, \set
alias y_padding, r10, \set
alias src1, r11, \set
alias y1, r12, \set
alias coeff_table,r12, \set
.endm
.macro loop_420sp s_fmt, d_fmt, init, kernel, precision
function \s_fmt\()_to_\d_fmt\()_neon_\precision, export=1
prologue
alias_loop_420sp
load_arg height, 4
load_arg y_stride, 5
load_arg c_stride, 6
load_arg src_stride, 7
load_arg coeff_table, 8
\init coeff_table
sub y_padding, y_stride, width
sub c_padding, c_stride, width
sub src_padding, src_stride, width, LSL #2
add y0_end, y0, width
and header, width, #15
add y1, y0, y_stride
add src1, src0, src_stride
0:
cmp header, #0
beq 1f
\kernel \s_fmt, \d_fmt, src0, src1, y0, y1, chroma, header
1:
\kernel \s_fmt, \d_fmt, src0, src1, y0, y1, chroma
cmp y0, y0_end
blt 1b
2:
add y0, y1, y_padding
add y0_end, y1, y_stride
add chroma, chroma, c_padding
add src0, src1, src_padding
add y1, y0, y_stride
add src1, src0, src_stride
subs height, height, #2
bgt 0b
epilogue
alias_loop_420sp 0
endfunc
.endm
.macro downsample
vpaddl.u8 r16x8, r8x16
vpaddl.u8 g16x8, g8x16
vpaddl.u8 b16x8, b8x16
.endm
/* accumulate and right-shift by 2 */
.macro downsample_ars2
vpadal.u8 r16x8, r8x16
vpadal.u8 g16x8, g8x16
vpadal.u8 b16x8, b8x16
vrshr.u16 r16x8, r16x8, #2
vrshr.u16 g16x8, g16x8, #2
vrshr.u16 b16x8, b16x8, #2
.endm
.macro store_y8_16x1 dst, count
.ifc "\count",""
vstmia \dst!, {y8x16}
.else
vstmia \dst, {y8x16}
add \dst, \dst, \count
.endif
.endm
.macro store_chroma_nv12_8x1 dst, count
.ifc "\count",""
vst2.i8 {u8x8, v8x8}, [\dst]!
.else
vst2.i8 {u8x8, v8x8}, [\dst], \count
.endif
.endm
.macro store_chroma_nv21_8x1 dst, count
.ifc "\count",""
vst2.i8 {v8x8, u8x8}, [\dst]!
.else
vst2.i8 {v8x8, u8x8}, [\dst], \count
.endif
.endm
.macro load_8888_16x1 a, b, c, d, src, count
.ifc "\count",""
vld4.8 {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]!
vld4.8 {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src]!
.else
vld4.8 {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]!
vld4.8 {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src]
sub \src, \src, #32
add \src, \src, \count, LSL #2
.endif
.endm
.macro load_rgbx_16x1 src, count
load_8888_16x1 r, g, b, x, \src, \count
.endm
.macro load_bgrx_16x1 src, count
load_8888_16x1 b, g, r, x, \src, \count
.endm
.macro alias_src_rgbx set=1
alias_src_8888 r, g, b, x, \set
.endm
.macro alias_src_bgrx set=1
alias_src_8888 b, g, r, x, \set
.endm
.macro alias_dst_nv12 set=1
alias u8x8, c8x8x2_l, \set
alias v8x8, c8x8x2_h, \set
.endm
.macro alias_dst_nv21 set=1
alias v8x8, c8x8x2_l, \set
alias u8x8, c8x8x2_h, \set
.endm
// common aliases
alias CO_R d0
CO_RY .dn d0.s16[0]
CO_RU .dn d0.s16[1]
CO_RV .dn d0.s16[2]
alias CO_G d1
CO_GY .dn d1.s16[0]
CO_GU .dn d1.s16[1]
CO_GV .dn d1.s16[2]
alias CO_B d2
CO_BY .dn d2.s16[0]
CO_BU .dn d2.s16[1]
CO_BV .dn d2.s16[2]
alias BIAS_U, d3
alias BIAS_V, BIAS_U
alias BIAS_Y, q2
/* q3-q6 R8G8B8X8 x16 */
.macro alias_src_8888 a, b, c, d, set
alias_qw \a\()8x16, q3, \set
alias_qw \b\()8x16, q4, \set
alias_qw \c\()8x16, q5, \set
alias_qw \d\()8x16, q6, \set
.endm
.macro kernel_420_16x2 rgb_fmt, yuv_fmt, rgb0, rgb1, y0, y1, chroma, count
alias_src_\rgb_fmt
alias_dst_\yuv_fmt
load_\rgb_fmt\()_16x1 \rgb0, \count
downsample
compute_y_16x1
store_y8_16x1 \y0, \count
load_\rgb_fmt\()_16x1 \rgb1, \count
downsample_ars2
compute_y_16x1
store_y8_16x1 \y1, \count
compute_chroma_8x1 u, U
compute_chroma_8x1 v, V
store_chroma_\yuv_fmt\()_8x1 \chroma, \count
alias_dst_\yuv_fmt 0
alias_src_\rgb_fmt 0
.endm

libswscale/arm/swscale_unscaled.c

@@ -0,0 +1,79 @@
/*
* Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/arm/cpu.h"
extern void rgbx_to_nv12_neon_32(const uint8_t *src, uint8_t *y, uint8_t *chroma,
int width, int height,
int y_stride, int c_stride, int src_stride,
int32_t coeff_tbl[9]);
extern void rgbx_to_nv12_neon_16(const uint8_t *src, uint8_t *y, uint8_t *chroma,
int width, int height,
int y_stride, int c_stride, int src_stride,
int32_t coeff_tbl[9]);
static int rgbx_to_nv12_neon_32_wrapper(SwsContext *context, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[]) {
rgbx_to_nv12_neon_32(src[0] + srcSliceY * srcStride[0],
dst[0] + srcSliceY * dstStride[0],
dst[1] + (srcSliceY / 2) * dstStride[1],
context->srcW, srcSliceH,
dstStride[0], dstStride[1], srcStride[0],
context->input_rgb2yuv_table);
return 0;
}
static int rgbx_to_nv12_neon_16_wrapper(SwsContext *context, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[]) {
rgbx_to_nv12_neon_16(src[0] + srcSliceY * srcStride[0],
dst[0] + srcSliceY * dstStride[0],
dst[1] + (srcSliceY / 2) * dstStride[1],
context->srcW, srcSliceH,
dstStride[0], dstStride[1], srcStride[0],
context->input_rgb2yuv_table);
return 0;
}
static void get_unscaled_swscale_neon(SwsContext *c) {
int accurate_rnd = c->flags & SWS_ACCURATE_RND;
if (c->srcFormat == AV_PIX_FMT_RGBA
&& c->dstFormat == AV_PIX_FMT_NV12
&& (c->srcW >= 16)) {
c->swscale = accurate_rnd ? rgbx_to_nv12_neon_32_wrapper
: rgbx_to_nv12_neon_16_wrapper;
}
}
void ff_get_unscaled_swscale_arm(SwsContext *c)
{
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags))
get_unscaled_swscale_neon(c);
}
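
Nothing calls these wrappers directly: get_unscaled_swscale_neon() installs them whenever the context describes an RGBA-to-NV12 conversion at least 16 pixels wide, so an ordinary sws_getContext()/sws_scale() pair reaches the fast path on NEON-capable ARM builds. A sketch of a context that would qualify (the sizes are placeholders; source and destination dimensions must match, since this is the unscaled path):

#include <libswscale/swscale.h>

/* same-size RGBA -> NV12; SWS_ACCURATE_RND selects the 32-bit kernel,
 * dropping it would select the faster 16-bit one */
struct SwsContext *rgba_to_nv12_ctx(int w, int h)
{
    return sws_getContext(w, h, AV_PIX_FMT_RGBA,
                          w, h, AV_PIX_FMT_NV12,
                          SWS_POINT | SWS_ACCURATE_RND,
                          NULL, NULL, NULL);
}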

libswscale/bayer_template.c

@@ -0,0 +1,236 @@
/*
* Bayer-to-RGB/YV12 template
* Copyright (c) 2011-2014 Peter Ross <pross@xvid.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#if defined(BAYER_BGGR) || defined(BAYER_GBRG)
#define BAYER_R 0
#define BAYER_G 1
#define BAYER_B 2
#endif
#if defined(BAYER_RGGB) || defined(BAYER_GRBG)
#define BAYER_R 2
#define BAYER_G 1
#define BAYER_B 0
#endif
#if defined(BAYER_8)
#define BAYER_READ(x) (x)
#define BAYER_SIZEOF 1
#define BAYER_SHIFT 0
#endif
#if defined(BAYER_16LE)
#define BAYER_READ(x) AV_RL16(&(x))
#define BAYER_SIZEOF 2
#define BAYER_SHIFT 8
#endif
#if defined(BAYER_16BE)
#define BAYER_READ(x) AV_RB16(&(x))
#define BAYER_SIZEOF 2
#define BAYER_SHIFT 8
#endif
#define S(y, x) BAYER_READ(src[(y)*src_stride + BAYER_SIZEOF*(x)])
#define T(y, x) (unsigned int)S(y, x)
#define R(y, x) dst[(y)*dst_stride + (x)*3 + BAYER_R]
#define G(y, x) dst[(y)*dst_stride + (x)*3 + BAYER_G]
#define B(y, x) dst[(y)*dst_stride + (x)*3 + BAYER_B]
#if defined(BAYER_BGGR) || defined(BAYER_RGGB)
#define BAYER_TO_RGB24_COPY \
R(0, 0) = \
R(0, 1) = \
R(1, 1) = \
R(1, 0) = S(1, 1) >> BAYER_SHIFT; \
\
G(0, 1) = S(0, 1) >> BAYER_SHIFT; \
G(0, 0) = \
G(1, 1) = (T(0, 1) + T(1, 0)) >> (1 + BAYER_SHIFT); \
G(1, 0) = S(1, 0) >> BAYER_SHIFT; \
\
B(1, 1) = \
B(0, 0) = \
B(0, 1) = \
B(1, 0) = S(0, 0) >> BAYER_SHIFT;
#define BAYER_TO_RGB24_INTERPOLATE \
R(0, 0) = (T(-1, -1) + T(-1, 1) + T(1, -1) + T(1, 1)) >> (2 + BAYER_SHIFT); \
G(0, 0) = (T(-1, 0) + T( 0, -1) + T(0, 1) + T(1, 0)) >> (2 + BAYER_SHIFT); \
B(0, 0) = S(0, 0) >> BAYER_SHIFT; \
\
R(0, 1) = (T(-1, 1) + T(1, 1)) >> (1 + BAYER_SHIFT); \
G(0, 1) = S(0, 1) >> BAYER_SHIFT; \
B(0, 1) = (T(0, 0) + T(0, 2)) >> (1 + BAYER_SHIFT); \
\
R(1, 0) = (T(1, -1) + T(1, 1)) >> (1 + BAYER_SHIFT); \
G(1, 0) = S(1, 0) >> BAYER_SHIFT; \
B(1, 0) = (T(0, 0) + T(2, 0)) >> (1 + BAYER_SHIFT); \
\
R(1, 1) = S(1, 1) >> BAYER_SHIFT; \
G(1, 1) = (T(0, 1) + T(1, 0) + T(1, 2) + T(2, 1)) >> (2 + BAYER_SHIFT); \
B(1, 1) = (T(0, 0) + T(0, 2) + T(2, 0) + T(2, 2)) >> (2 + BAYER_SHIFT);
#else
#define BAYER_TO_RGB24_COPY \
R(0, 0) = \
R(0, 1) = \
R(1, 1) = \
R(1, 0) = S(1, 0) >> BAYER_SHIFT; \
\
G(0, 0) = S(0, 0) >> BAYER_SHIFT; \
G(1, 1) = S(1, 1) >> BAYER_SHIFT; \
G(0, 1) = \
G(1, 0) = (T(0, 0) + T(1, 1)) >> (1 + BAYER_SHIFT); \
\
B(1, 1) = \
B(0, 0) = \
B(0, 1) = \
B(1, 0) = S(0, 1) >> BAYER_SHIFT;
#define BAYER_TO_RGB24_INTERPOLATE \
R(0, 0) = (T(-1, 0) + T(1, 0)) >> (1 + BAYER_SHIFT); \
G(0, 0) = S(0, 0) >> BAYER_SHIFT; \
B(0, 0) = (T(0, -1) + T(0, 1)) >> (1 + BAYER_SHIFT); \
\
R(0, 1) = (T(-1, 0) + T(-1, 2) + T(1, 0) + T(1, 2)) >> (2 + BAYER_SHIFT); \
G(0, 1) = (T(-1, 1) + T(0, 0) + T(0, 2) + T(1, 1)) >> (2 + BAYER_SHIFT); \
B(0, 1) = S(0, 1) >> BAYER_SHIFT; \
\
R(1, 0) = S(1, 0) >> BAYER_SHIFT; \
G(1, 0) = (T(0, 0) + T(1, -1) + T(1, 1) + T(2, 0)) >> (2 + BAYER_SHIFT); \
B(1, 0) = (T(0, -1) + T(0, 1) + T(2, -1) + T(2, 1)) >> (2 + BAYER_SHIFT); \
\
R(1, 1) = (T(1, 0) + T(1, 2)) >> (1 + BAYER_SHIFT); \
G(1, 1) = S(1, 1) >> BAYER_SHIFT; \
B(1, 1) = (T(0, 1) + T(2, 1)) >> (1 + BAYER_SHIFT);
#endif
/**
* invoke ff_rgb24toyv12 for 2x2 pixels
*/
#define rgb24toyv12_2x2(src, dstY, dstU, dstV, luma_stride, src_stride, rgb2yuv) \
ff_rgb24toyv12(src, dstY, dstV, dstU, 2, 2, luma_stride, 0, src_stride, rgb2yuv)
static void BAYER_RENAME(rgb24_copy)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int width)
{
int i;
for (i = 0 ; i < width; i+= 2) {
BAYER_TO_RGB24_COPY
src += 2 * BAYER_SIZEOF;
dst += 6;
}
}
static void BAYER_RENAME(rgb24_interpolate)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int width)
{
int i;
BAYER_TO_RGB24_COPY
src += 2 * BAYER_SIZEOF;
dst += 6;
for (i = 2 ; i < width - 2; i+= 2) {
BAYER_TO_RGB24_INTERPOLATE
src += 2 * BAYER_SIZEOF;
dst += 6;
}
if (width > 2) {
BAYER_TO_RGB24_COPY
}
}
static void BAYER_RENAME(yv12_copy)(const uint8_t *src, int src_stride, uint8_t *dstY, uint8_t *dstU, uint8_t *dstV, int luma_stride, int width, int32_t *rgb2yuv)
{
uint8_t dst[12];
const int dst_stride = 6;
int i;
for (i = 0 ; i < width; i+= 2) {
BAYER_TO_RGB24_COPY
rgb24toyv12_2x2(dst, dstY, dstU, dstV, luma_stride, dst_stride, rgb2yuv);
src += 2 * BAYER_SIZEOF;
dstY += 2;
dstU++;
dstV++;
}
}
static void BAYER_RENAME(yv12_interpolate)(const uint8_t *src, int src_stride, uint8_t *dstY, uint8_t *dstU, uint8_t *dstV, int luma_stride, int width, int32_t *rgb2yuv)
{
uint8_t dst[12];
const int dst_stride = 6;
int i;
BAYER_TO_RGB24_COPY
rgb24toyv12_2x2(dst, dstY, dstU, dstV, luma_stride, dst_stride, rgb2yuv);
src += 2 * BAYER_SIZEOF;
dstY += 2;
dstU++;
dstV++;
for (i = 2 ; i < width - 2; i+= 2) {
BAYER_TO_RGB24_INTERPOLATE
rgb24toyv12_2x2(dst, dstY, dstU, dstV, luma_stride, dst_stride, rgb2yuv);
src += 2 * BAYER_SIZEOF;
dstY += 2;
dstU++;
dstV++;
}
if (width > 2) {
BAYER_TO_RGB24_COPY
rgb24toyv12_2x2(dst, dstY, dstU, dstV, luma_stride, dst_stride, rgb2yuv);
}
}
#undef S
#undef T
#undef R
#undef G
#undef B
#undef BAYER_TO_RGB24_COPY
#undef BAYER_TO_RGB24_INTERPOLATE
#undef BAYER_RENAME
#undef BAYER_R
#undef BAYER_G
#undef BAYER_B
#undef BAYER_READ
#undef BAYER_SIZEOF
#undef BAYER_SHIFT
#if defined(BAYER_BGGR)
#undef BAYER_BGGR
#endif
#if defined(BAYER_RGGB)
#undef BAYER_RGGB
#endif
#if defined(BAYER_GBRG)
#undef BAYER_GBRG
#endif
#if defined(BAYER_GRBG)
#undef BAYER_GRBG
#endif
#if defined(BAYER_8)
#undef BAYER_8
#endif
#if defined(BAYER_16LE)
#undef BAYER_16LE
#endif
#if defined(BAYER_16BE)
#undef BAYER_16BE
#endif
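
The file undefines all of its own macros because it is designed to be #included repeatedly, once per Bayer pattern and bit depth. An illustrative instantiation follows; the exact BAYER_RENAME spelling is an assumption, and the real includes live in swscale_unscaled.c:

#define BAYER_BGGR
#define BAYER_8
#define BAYER_RENAME(x) bayer_bggr8_to_ ## x
#include "bayer_template.c"
/* yields bayer_bggr8_to_rgb24_copy(), bayer_bggr8_to_rgb24_interpolate(),
 * bayer_bggr8_to_yv12_copy() and bayer_bggr8_to_yv12_interpolate() */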

libswscale/colorspace-test.c

@@ -0,0 +1,170 @@
/*
* Copyright (C) 2002 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdio.h>
#include <string.h> /* for memset() */
#include <stdlib.h>
#include <inttypes.h>
#include "swscale.h"
#include "rgb2rgb.h"
#include "libavutil/mem.h"
#define SIZE 1000
#define srcByte 0x55
#define dstByte 0xBB
#define FUNC(s, d, n) { s, d, #n, n }
int main(int argc, char **argv)
{
int i, funcNum;
uint8_t *srcBuffer = av_malloc(SIZE);
uint8_t *dstBuffer = av_malloc(SIZE);
int failedNum = 0;
int passedNum = 0;
if (!srcBuffer || !dstBuffer)
return -1;
av_log(NULL, AV_LOG_INFO, "memory corruption test ...\n");
sws_rgb2rgb_init();
for (funcNum = 0; ; funcNum++) {
struct func_info_s {
int src_bpp;
int dst_bpp;
const char *name;
void (*func)(const uint8_t *src, uint8_t *dst, int src_size);
} func_info[] = {
FUNC(2, 2, rgb12to15),
FUNC(2, 2, rgb15to16),
FUNC(2, 3, rgb15to24),
FUNC(2, 4, rgb15to32),
FUNC(2, 3, rgb16to24),
FUNC(2, 4, rgb16to32),
FUNC(3, 2, rgb24to15),
FUNC(3, 2, rgb24to16),
FUNC(3, 4, rgb24to32),
FUNC(4, 2, rgb32to15),
FUNC(4, 2, rgb32to16),
FUNC(4, 3, rgb32to24),
FUNC(2, 2, rgb16to15),
FUNC(2, 2, rgb12tobgr12),
FUNC(2, 2, rgb15tobgr15),
FUNC(2, 2, rgb15tobgr16),
FUNC(2, 3, rgb15tobgr24),
FUNC(2, 4, rgb15tobgr32),
FUNC(2, 2, rgb16tobgr15),
FUNC(2, 2, rgb16tobgr16),
FUNC(2, 3, rgb16tobgr24),
FUNC(2, 4, rgb16tobgr32),
FUNC(3, 2, rgb24tobgr15),
FUNC(3, 2, rgb24tobgr16),
FUNC(3, 3, rgb24tobgr24),
FUNC(3, 4, rgb24tobgr32),
FUNC(4, 2, rgb32tobgr15),
FUNC(4, 2, rgb32tobgr16),
FUNC(4, 3, rgb32tobgr24),
FUNC(4, 4, shuffle_bytes_2103), /* rgb32tobgr32 */
FUNC(6, 6, rgb48tobgr48_nobswap),
FUNC(6, 6, rgb48tobgr48_bswap),
FUNC(8, 6, rgb64to48_nobswap),
FUNC(8, 6, rgb64to48_bswap),
FUNC(8, 6, rgb64tobgr48_nobswap),
FUNC(8, 6, rgb64tobgr48_bswap),
FUNC(0, 0, NULL)
};
int width;
int failed = 0;
int srcBpp = 0;
int dstBpp = 0;
if (!func_info[funcNum].func)
break;
av_log(NULL, AV_LOG_INFO, ".");
memset(srcBuffer, srcByte, SIZE);
for (width = 63; width > 0; width--) {
int dstOffset;
for (dstOffset = 128; dstOffset < 196; dstOffset += 4) {
int srcOffset;
memset(dstBuffer, dstByte, SIZE);
for (srcOffset = 128; srcOffset < 196; srcOffset += 4) {
uint8_t *src = srcBuffer + srcOffset;
uint8_t *dst = dstBuffer + dstOffset;
const char *name = NULL;
// don't fill the screen with shit ...
if (failed)
break;
srcBpp = func_info[funcNum].src_bpp;
dstBpp = func_info[funcNum].dst_bpp;
name = func_info[funcNum].name;
func_info[funcNum].func(src, dst, width * srcBpp);
if (!srcBpp)
break;
for (i = 0; i < SIZE; i++) {
if (srcBuffer[i] != srcByte) {
av_log(NULL, AV_LOG_INFO,
"src damaged at %d w:%d src:%d dst:%d %s\n",
i, width, srcOffset, dstOffset, name);
failed = 1;
break;
}
}
for (i = 0; i < dstOffset; i++) {
if (dstBuffer[i] != dstByte) {
av_log(NULL, AV_LOG_INFO,
"dst damaged at %d w:%d src:%d dst:%d %s\n",
i, width, srcOffset, dstOffset, name);
failed = 1;
break;
}
}
for (i = dstOffset + width * dstBpp; i < SIZE; i++) {
if (dstBuffer[i] != dstByte) {
av_log(NULL, AV_LOG_INFO,
"dst damaged at %d w:%d src:%d dst:%d %s\n",
i, width, srcOffset, dstOffset, name);
failed = 1;
break;
}
}
}
}
}
if (failed)
failedNum++;
else if (srcBpp)
passedNum++;
}
av_log(NULL, AV_LOG_INFO,
"\n%d converters passed, %d converters randomly overwrote memory\n",
passedNum, failedNum);
return failedNum;
}

libswscale/gamma.c

@@ -0,0 +1,72 @@
/*
* Copyright (C) 2015 Pedro Arthur <bygrandao@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "swscale_internal.h"
typedef struct GammaContext
{
uint16_t *table;
} GammaContext;
// gamma_convert expects a 16-bit RGB format.
// It writes directly into the source slice, so the slice must be modifiable
// (this is guaranteed by running it through the cascade context).
static int gamma_convert(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
{
GammaContext *instance = desc->instance;
uint16_t *table = instance->table;
int srcW = desc->src->width;
int i;
for (i = 0; i < sliceH; ++i) {
uint8_t ** src = desc->src->plane[0].line;
int src_pos = sliceY+i - desc->src->plane[0].sliceY;
uint16_t *src1 = (uint16_t*)*(src+src_pos);
int j;
for (j = 0; j < srcW; ++j) {
uint16_t r = AV_RL16(src1 + j*4 + 0);
uint16_t g = AV_RL16(src1 + j*4 + 1);
uint16_t b = AV_RL16(src1 + j*4 + 2);
AV_WL16(src1 + j*4 + 0, table[r]);
AV_WL16(src1 + j*4 + 1, table[g]);
AV_WL16(src1 + j*4 + 2, table[b]);
}
}
return sliceH;
}
int ff_init_gamma_convert(SwsFilterDescriptor *desc, SwsSlice * src, uint16_t *table)
{
GammaContext *li = av_malloc(sizeof(GammaContext));
if (!li)
return AVERROR(ENOMEM);
li->table = table;
desc->instance = li;
desc->src = src;
desc->dst = NULL;
desc->process = &gamma_convert;
return 0;
}
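
ff_init_gamma_convert() only borrows the caller's lookup table; building the table happens elsewhere in libswscale. A hypothetical helper showing the shape of a table gamma_convert() can index, as an editorial sketch rather than this commit's code:

#include <math.h>
#include <stdint.h>
#include "libavutil/mem.h"

/* 64K-entry LUT mapping each 16-bit component through a power curve */
static uint16_t *build_gamma_lut(double gamma)
{
    uint16_t *t = av_malloc(65536 * sizeof(*t));
    int i;
    if (!t)
        return NULL;
    for (i = 0; i < 65536; i++)
        t[i] = (uint16_t)lrint(pow(i / 65535.0, gamma) * 65535.0);
    return t;
}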

libswscale/hscale.c

@@ -0,0 +1,274 @@
/*
* Copyright (C) 2015 Pedro Arthur <bygrandao@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "swscale_internal.h"
static int lum_h_scale(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
{
FilterContext *instance = desc->instance;
int srcW = desc->src->width;
int dstW = desc->dst->width;
int xInc = instance->xInc;
int i;
for (i = 0; i < sliceH; ++i) {
uint8_t ** src = desc->src->plane[0].line;
uint8_t ** dst = desc->dst->plane[0].line;
int src_pos = sliceY+i - desc->src->plane[0].sliceY;
int dst_pos = sliceY+i - desc->dst->plane[0].sliceY;
if (c->hyscale_fast) {
c->hyscale_fast(c, (int16_t*)dst[dst_pos], dstW, src[src_pos], srcW, xInc);
} else {
c->hyScale(c, (int16_t*)dst[dst_pos], dstW, (const uint8_t *)src[src_pos], instance->filter,
instance->filter_pos, instance->filter_size);
}
if (c->lumConvertRange)
c->lumConvertRange((int16_t*)dst[dst_pos], dstW);
desc->dst->plane[0].sliceH += 1;
if (desc->alpha) {
src = desc->src->plane[3].line;
dst = desc->dst->plane[3].line;
src_pos = sliceY+i - desc->src->plane[3].sliceY;
dst_pos = sliceY+i - desc->dst->plane[3].sliceY;
desc->dst->plane[3].sliceH += 1;
if (c->hyscale_fast) {
c->hyscale_fast(c, (int16_t*)dst[dst_pos], dstW, src[src_pos], srcW, xInc);
} else {
c->hyScale(c, (int16_t*)dst[dst_pos], dstW, (const uint8_t *)src[src_pos], instance->filter,
instance->filter_pos, instance->filter_size);
}
}
}
return sliceH;
}
static int lum_convert(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
{
int srcW = desc->src->width;
ColorContext * instance = desc->instance;
uint32_t * pal = instance->pal;
int i;
desc->dst->plane[0].sliceY = sliceY;
desc->dst->plane[0].sliceH = sliceH;
desc->dst->plane[3].sliceY = sliceY;
desc->dst->plane[3].sliceH = sliceH;
for (i = 0; i < sliceH; ++i) {
int sp0 = sliceY+i - desc->src->plane[0].sliceY;
int sp1 = ((sliceY+i) >> desc->src->v_chr_sub_sample) - desc->src->plane[1].sliceY;
const uint8_t * src[4] = { desc->src->plane[0].line[sp0],
desc->src->plane[1].line[sp1],
desc->src->plane[2].line[sp1],
desc->src->plane[3].line[sp0]};
uint8_t * dst = desc->dst->plane[0].line[i];
if (c->lumToYV12) {
c->lumToYV12(dst, src[0], src[1], src[2], srcW, pal);
} else if (c->readLumPlanar) {
c->readLumPlanar(dst, src, srcW, c->input_rgb2yuv_table);
}
if (desc->alpha) {
dst = desc->dst->plane[3].line[i];
if (c->alpToYV12) {
c->alpToYV12(dst, src[3], src[1], src[2], srcW, pal);
} else if (c->readAlpPlanar) {
c->readAlpPlanar(dst, src, srcW, NULL);
}
}
}
return sliceH;
}
int ff_init_desc_fmt_convert(SwsFilterDescriptor *desc, SwsSlice * src, SwsSlice *dst, uint32_t *pal)
{
ColorContext * li = av_malloc(sizeof(ColorContext));
if (!li)
return AVERROR(ENOMEM);
li->pal = pal;
desc->instance = li;
desc->alpha = isALPHA(src->fmt) && isALPHA(dst->fmt);
desc->src = src;
desc->dst = dst;
desc->process = &lum_convert;
return 0;
}
int ff_init_desc_hscale(SwsFilterDescriptor *desc, SwsSlice *src, SwsSlice *dst, uint16_t *filter, int * filter_pos, int filter_size, int xInc)
{
FilterContext *li = av_malloc(sizeof(FilterContext));
if (!li)
return AVERROR(ENOMEM);
li->filter = filter;
li->filter_pos = filter_pos;
li->filter_size = filter_size;
li->xInc = xInc;
desc->instance = li;
desc->alpha = isALPHA(src->fmt) && isALPHA(dst->fmt);
desc->src = src;
desc->dst = dst;
desc->process = &lum_h_scale;
return 0;
}
static int chr_h_scale(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
{
FilterContext *instance = desc->instance;
int srcW = FF_CEIL_RSHIFT(desc->src->width, desc->src->h_chr_sub_sample);
int dstW = FF_CEIL_RSHIFT(desc->dst->width, desc->dst->h_chr_sub_sample);
int xInc = instance->xInc;
uint8_t ** src1 = desc->src->plane[1].line;
uint8_t ** dst1 = desc->dst->plane[1].line;
uint8_t ** src2 = desc->src->plane[2].line;
uint8_t ** dst2 = desc->dst->plane[2].line;
int src_pos1 = sliceY - desc->src->plane[1].sliceY;
int dst_pos1 = sliceY - desc->dst->plane[1].sliceY;
int src_pos2 = sliceY - desc->src->plane[2].sliceY;
int dst_pos2 = sliceY - desc->dst->plane[2].sliceY;
int i;
for (i = 0; i < sliceH; ++i) {
if (c->hcscale_fast) {
c->hcscale_fast(c, (uint16_t*)dst1[dst_pos1+i], (uint16_t*)dst2[dst_pos2+i], dstW, src1[src_pos1+i], src2[src_pos2+i], srcW, xInc);
} else {
c->hcScale(c, (uint16_t*)dst1[dst_pos1+i], dstW, src1[src_pos1+i], instance->filter, instance->filter_pos, instance->filter_size);
c->hcScale(c, (uint16_t*)dst2[dst_pos2+i], dstW, src2[src_pos2+i], instance->filter, instance->filter_pos, instance->filter_size);
}
if (c->chrConvertRange)
c->chrConvertRange((uint16_t*)dst1[dst_pos1+i], (uint16_t*)dst2[dst_pos2+i], dstW);
desc->dst->plane[1].sliceH += 1;
desc->dst->plane[2].sliceH += 1;
}
return sliceH;
}
static int chr_convert(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
{
int srcW = FF_CEIL_RSHIFT(desc->src->width, desc->src->h_chr_sub_sample);
ColorContext * instance = desc->instance;
uint32_t * pal = instance->pal;
int sp0 = (sliceY - (desc->src->plane[0].sliceY >> desc->src->v_chr_sub_sample)) << desc->src->v_chr_sub_sample;
int sp1 = sliceY - desc->src->plane[1].sliceY;
int i;
desc->dst->plane[1].sliceY = sliceY;
desc->dst->plane[1].sliceH = sliceH;
desc->dst->plane[2].sliceY = sliceY;
desc->dst->plane[2].sliceH = sliceH;
for (i = 0; i < sliceH; ++i) {
const uint8_t * src[4] = { desc->src->plane[0].line[sp0+i],
desc->src->plane[1].line[sp1+i],
desc->src->plane[2].line[sp1+i],
desc->src->plane[3].line[sp0+i]};
uint8_t * dst1 = desc->dst->plane[1].line[i];
uint8_t * dst2 = desc->dst->plane[2].line[i];
if (c->chrToYV12) {
c->chrToYV12(dst1, dst2, src[0], src[1], src[2], srcW, pal);
} else if (c->readChrPlanar) {
c->readChrPlanar(dst1, dst2, src, srcW, c->input_rgb2yuv_table);
}
}
return sliceH;
}
int ff_init_desc_cfmt_convert(SwsFilterDescriptor *desc, SwsSlice * src, SwsSlice *dst, uint32_t *pal)
{
ColorContext * li = av_malloc(sizeof(ColorContext));
if (!li)
return AVERROR(ENOMEM);
li->pal = pal;
desc->instance = li;
desc->src = src;
desc->dst = dst;
desc->process = &chr_convert;
return 0;
}
int ff_init_desc_chscale(SwsFilterDescriptor *desc, SwsSlice *src, SwsSlice *dst, uint16_t *filter, int * filter_pos, int filter_size, int xInc)
{
FilterContext *li = av_malloc(sizeof(FilterContext));
if (!li)
return AVERROR(ENOMEM);
li->filter = filter;
li->filter_pos = filter_pos;
li->filter_size = filter_size;
li->xInc = xInc;
desc->instance = li;
desc->alpha = isALPHA(src->fmt) && isALPHA(dst->fmt);
desc->src = src;
desc->dst = dst;
desc->process = &chr_h_scale;
return 0;
}
static int no_chr_scale(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
{
desc->dst->plane[1].sliceY = sliceY + sliceH - desc->dst->plane[1].available_lines;
desc->dst->plane[1].sliceH = desc->dst->plane[1].available_lines;
desc->dst->plane[2].sliceY = sliceY + sliceH - desc->dst->plane[2].available_lines;
desc->dst->plane[2].sliceH = desc->dst->plane[2].available_lines;
return 0;
}
int ff_init_desc_no_chr(SwsFilterDescriptor *desc, SwsSlice * src, SwsSlice *dst)
{
desc->src = src;
desc->dst = dst;
desc->alpha = 0;
desc->instance = NULL;
desc->process = &no_chr_scale;
return 0;
}
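
All of these descriptors eventually funnel into c->hyScale/c->hcScale (or their *_fast variants). For reference, the generic scalar form of that horizontal convolution, mirroring the fallback that also appears in ppc/swscale_altivec.c later in this commit:

/* reference scalar hScale: one FIR filter per output pixel, 7-bit downshift,
 * clipped to the 15-bit intermediate range used by the vertical scaler */
static void hscale_ref(int16_t *dst, int dstW, const uint8_t *src,
                       const int16_t *filter, const int *filterPos,
                       int filterSize)
{
    int i, j;
    for (i = 0; i < dstW; i++) {
        int val = 0;
        for (j = 0; j < filterSize; j++)
            val += (int)src[filterPos[i] + j] * filter[filterSize * i + j];
        val >>= 7;
        dst[i] = val > 32767 ? 32767 : val;
    }
}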

libswscale/hscale_fast_bilinear.c

@@ -0,0 +1,55 @@
/*
* Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "swscale_internal.h"
void ff_hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
const uint8_t *src, int srcW, int xInc)
{
int i;
unsigned int xpos = 0;
for (i = 0; i < dstWidth; i++) {
register unsigned int xx = xpos >> 16;
register unsigned int xalpha = (xpos & 0xFFFF) >> 9;
dst[i] = (src[xx] << 7) + (src[xx + 1] - src[xx]) * xalpha;
xpos += xInc;
}
for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
dst[i] = src[srcW-1]*128;
}
void ff_hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
int dstWidth, const uint8_t *src1,
const uint8_t *src2, int srcW, int xInc)
{
int i;
unsigned int xpos = 0;
for (i = 0; i < dstWidth; i++) {
register unsigned int xx = xpos >> 16;
register unsigned int xalpha = (xpos & 0xFFFF) >> 9;
dst1[i] = (src1[xx] * (xalpha ^ 127) + src1[xx + 1] * xalpha);
dst2[i] = (src2[xx] * (xalpha ^ 127) + src2[xx + 1] * xalpha);
xpos += xInc;
}
for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
dst1[i] = src1[srcW-1]*128;
dst2[i] = src2[srcW-1]*128;
}
}
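
Both functions walk the source in 16.16 fixed point: xx is the integer source index and xalpha keeps the top 7 bits of the fraction, so every output carries 7 fractional bits (hence the << 7 and *128 above). A worked example of the arithmetic; the increment formula here is simplified, since libswscale derives xInc with extra centering and rounding terms:

#include <stdio.h>

int main(void)
{
    int srcW = 4, dstW = 8;
    unsigned xInc = ((unsigned)srcW << 16) / dstW; /* 0x8000: half a source pixel per step */
    unsigned xpos = 3 * xInc;                      /* output pixel 3 -> 0x18000 */
    printf("xx=%u xalpha=%u\n", xpos >> 16, (xpos & 0xFFFFu) >> 9);
    /* prints xx=1 xalpha=64: blend src[1] and src[2] exactly halfway */
    return 0;
}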

File diff suppressed because it is too large.

libswscale/libswscale.pc

@@ -0,0 +1,14 @@
prefix=/usr/local
exec_prefix=${prefix}
libdir=${prefix}/lib
includedir=${prefix}/include
Name: libswscale
Description: FFmpeg image rescaling library
Version: 3.1.101
Requires:
Requires.private: libavutil >= 54.31.100
Conflicts:
Libs: -L${libdir} -lswscale
Libs.private: -lm
Cflags: -I${includedir}
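
Consumers pick these flags up through pkg-config, e.g. "pkg-config --cflags --libs libswscale", which expands to the Cflags and Libs lines above (plus Libs.private when linking statically).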

libswscale/libswscale.v

@@ -0,0 +1,4 @@
LIBSWSCALE_$MAJOR {
global: swscale_*; sws_*;
local: *;
};


@@ -0,0 +1,5 @@
LIBSWSCALE_3 {
global: DllStartup;
swscale_*; sws_*;
local: *;
};

libswscale/log2_tab.c

@@ -0,0 +1 @@
#include "libavutil/log2_tab.c"

libswscale/options.c

@@ -0,0 +1,100 @@
/*
* Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/avutil.h"
#include "libavutil/opt.h"
#include "libavutil/pixfmt.h"
#include "swscale.h"
#include "swscale_internal.h"
static const char *sws_context_to_name(void *ptr)
{
return "swscaler";
}
#define OFFSET(x) offsetof(SwsContext, x)
#define DEFAULT 0
#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
static const AVOption swscale_options[] = {
{ "sws_flags", "scaler flags", OFFSET(flags), AV_OPT_TYPE_FLAGS, { .i64 = SWS_BICUBIC }, 0, UINT_MAX, VE, "sws_flags" },
{ "fast_bilinear", "fast bilinear", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_FAST_BILINEAR }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "bilinear", "bilinear", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_BILINEAR }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "bicubic", "bicubic", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_BICUBIC }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "experimental", "experimental", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_X }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "neighbor", "nearest neighbor", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_POINT }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "area", "averaging area", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_AREA }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "bicublin", "luma bicubic, chroma bilinear", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_BICUBLIN }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "gauss", "gaussian", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_GAUSS }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "sinc", "sinc", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_SINC }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "lanczos", "lanczos", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_LANCZOS }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "spline", "natural bicubic spline", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_SPLINE }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "print_info", "print info", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_PRINT_INFO }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "accurate_rnd", "accurate rounding", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_ACCURATE_RND }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "full_chroma_int", "full chroma interpolation", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_FULL_CHR_H_INT }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "full_chroma_inp", "full chroma input", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_FULL_CHR_H_INP }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "bitexact", "", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_BITEXACT }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "error_diffusion", "error diffusion dither", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_ERROR_DIFFUSION}, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "srcw", "source width", OFFSET(srcW), AV_OPT_TYPE_INT, { .i64 = 16 }, 1, INT_MAX, VE },
{ "srch", "source height", OFFSET(srcH), AV_OPT_TYPE_INT, { .i64 = 16 }, 1, INT_MAX, VE },
{ "dstw", "destination width", OFFSET(dstW), AV_OPT_TYPE_INT, { .i64 = 16 }, 1, INT_MAX, VE },
{ "dsth", "destination height", OFFSET(dstH), AV_OPT_TYPE_INT, { .i64 = 16 }, 1, INT_MAX, VE },
{ "src_format", "source format", OFFSET(srcFormat), AV_OPT_TYPE_INT, { .i64 = DEFAULT }, 0, AV_PIX_FMT_NB - 1, VE },
{ "dst_format", "destination format", OFFSET(dstFormat), AV_OPT_TYPE_INT, { .i64 = DEFAULT }, 0, AV_PIX_FMT_NB - 1, VE },
{ "src_range", "source range", OFFSET(srcRange), AV_OPT_TYPE_INT, { .i64 = DEFAULT }, 0, 1, VE },
{ "dst_range", "destination range", OFFSET(dstRange), AV_OPT_TYPE_INT, { .i64 = DEFAULT }, 0, 1, VE },
{ "param0", "scaler param 0", OFFSET(param[0]), AV_OPT_TYPE_DOUBLE, { .dbl = SWS_PARAM_DEFAULT }, INT_MIN, INT_MAX, VE },
{ "param1", "scaler param 1", OFFSET(param[1]), AV_OPT_TYPE_DOUBLE, { .dbl = SWS_PARAM_DEFAULT }, INT_MIN, INT_MAX, VE },
{ "src_v_chr_pos", "source vertical chroma position in luma grid/256" , OFFSET(src_v_chr_pos), AV_OPT_TYPE_INT, { .i64 = -513 }, -513, 512, VE },
{ "src_h_chr_pos", "source horizontal chroma position in luma grid/256", OFFSET(src_h_chr_pos), AV_OPT_TYPE_INT, { .i64 = -513 }, -513, 512, VE },
{ "dst_v_chr_pos", "destination vertical chroma position in luma grid/256" , OFFSET(dst_v_chr_pos), AV_OPT_TYPE_INT, { .i64 = -513 }, -513, 512, VE },
{ "dst_h_chr_pos", "destination horizontal chroma position in luma grid/256", OFFSET(dst_h_chr_pos), AV_OPT_TYPE_INT, { .i64 = -513 }, -513, 512, VE },
{ "sws_dither", "set dithering algorithm", OFFSET(dither), AV_OPT_TYPE_INT, { .i64 = SWS_DITHER_AUTO }, 0, NB_SWS_DITHER, VE, "sws_dither" },
{ "auto", "leave choice to sws", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_DITHER_AUTO }, INT_MIN, INT_MAX, VE, "sws_dither" },
{ "bayer", "bayer dither", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_DITHER_BAYER }, INT_MIN, INT_MAX, VE, "sws_dither" },
{ "ed", "error diffusion", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_DITHER_ED }, INT_MIN, INT_MAX, VE, "sws_dither" },
{ "a_dither", "arithmetic addition dither", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_DITHER_A_DITHER}, INT_MIN, INT_MAX, VE, "sws_dither" },
{ "x_dither", "arithmetic xor dither", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_DITHER_X_DITHER}, INT_MIN, INT_MAX, VE, "sws_dither" },
{ "gamma", "gamma correct scaling", OFFSET(gamma_flag), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE, "gamma" },
{ "true", "enable", 0, AV_OPT_TYPE_CONST, { .i64 = 1 }, INT_MIN, INT_MAX, VE, "gamma" },
{ "false", "disable", 0, AV_OPT_TYPE_CONST, { .i64 = 0 }, INT_MIN, INT_MAX, VE, "gamma" },
{ "alphablend", "mode for alpha -> non alpha", OFFSET(alphablend),AV_OPT_TYPE_INT, { .i64 = SWS_ALPHA_BLEND_NONE}, 0, SWS_ALPHA_BLEND_NB-1, VE, "alphablend" },
{ "none", "ignore alpha", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_ALPHA_BLEND_NONE}, INT_MIN, INT_MAX, VE, "alphablend" },
{ "uniform_color", "blend onto a uniform color", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_ALPHA_BLEND_UNIFORM},INT_MIN, INT_MAX, VE, "alphablend" },
{ "checkerboard", "blend onto a checkerboard", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_ALPHA_BLEND_CHECKERBOARD},INT_MIN, INT_MAX, VE, "alphablend" },
{ NULL }
};
const AVClass sws_context_class = {
.class_name = "SWScaler",
.item_name = sws_context_to_name,
.option = swscale_options,
.category = AV_CLASS_CATEGORY_SWSCALER,
.version = LIBAVUTIL_VERSION_INT,
};
const AVClass *sws_get_class(void)
{
return &sws_context_class;
}
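
sws_getContext() fills these fields itself, but the option table is also the public configuration surface when a context is built by hand. A sketch using the option names defined above (public API; the sizes and formats are placeholders):

#include <libavutil/opt.h>
#include <libswscale/swscale.h>

struct SwsContext *configured_scaler(void)
{
    struct SwsContext *c = sws_alloc_context();
    if (!c)
        return NULL;
    av_opt_set_int(c, "srcw", 1920, 0);
    av_opt_set_int(c, "srch", 1080, 0);
    av_opt_set_int(c, "src_format", AV_PIX_FMT_YUV420P, 0);
    av_opt_set_int(c, "dstw", 1280, 0);
    av_opt_set_int(c, "dsth", 720, 0);
    av_opt_set_int(c, "dst_format", AV_PIX_FMT_YUV420P, 0);
    av_opt_set(c, "sws_flags", "lanczos+accurate_rnd", 0);
    if (sws_init_context(c, NULL, NULL) < 0) {
        sws_freeContext(c);
        return NULL;
    }
    return c;
}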

File diff suppressed because it is too large.

libswscale/ppc/Makefile

@@ -0,0 +1,3 @@
OBJS += ppc/swscale_altivec.o \
ppc/yuv2rgb_altivec.o \
ppc/yuv2yuv_altivec.o \

libswscale/ppc/swscale_altivec.c

@@ -0,0 +1,371 @@
/*
* AltiVec-enhanced yuv2yuvX
*
* Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
* based on the equivalent C code in swscale.c
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <inttypes.h>
#include "config.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "yuv2rgb_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#if HAVE_ALTIVEC
#define vzero vec_splat_s32(0)
#if HAVE_BIGENDIAN
#define GET_LS(a,b,c,s) {\
vector signed short l2 = vec_ld(((b) << 1) + 16, s);\
ls = vec_perm(a, l2, c);\
a = l2;\
}
#else
#define GET_LS(a,b,c,s) {\
ls = a;\
a = vec_vsx_ld(((b) << 1) + 16, s);\
}
#endif
#define yuv2planeX_8(d1, d2, l1, src, x, perm, filter) do {\
vector signed short ls;\
GET_LS(l1, x, perm, src);\
vector signed int i1 = vec_mule(filter, ls);\
vector signed int i2 = vec_mulo(filter, ls);\
vector signed int vf1, vf2;\
vf1 = vec_mergeh(i1, i2);\
vf2 = vec_mergel(i1, i2);\
d1 = vec_add(d1, vf1);\
d2 = vec_add(d2, vf2);\
} while (0)
#if HAVE_BIGENDIAN
#define LOAD_FILTER(vf,f) {\
vector unsigned char perm0 = vec_lvsl(joffset, f);\
vf = vec_ld(joffset, f);\
vf = vec_perm(vf, vf, perm0);\
}
#define LOAD_L1(ll1,s,p){\
p = vec_lvsl(xoffset, s);\
ll1 = vec_ld(xoffset, s);\
}
#else
#define LOAD_FILTER(vf,f) {\
vf = vec_vsx_ld(joffset, f);\
}
#define LOAD_L1(ll1,s,p){\
ll1 = vec_vsx_ld(xoffset, s);\
}
#endif
static void yuv2planeX_16_altivec(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest,
const uint8_t *dither, int offset, int x)
{
register int i, j;
LOCAL_ALIGNED(16, int, val, [16]);
vector signed int vo1, vo2, vo3, vo4;
vector unsigned short vs1, vs2;
vector unsigned char vf;
vector unsigned int altivec_vectorShiftInt19 =
vec_add(vec_splat_u32(10), vec_splat_u32(9));
for (i = 0; i < 16; i++)
val[i] = dither[(x + i + offset) & 7] << 12;
vo1 = vec_ld(0, val);
vo2 = vec_ld(16, val);
vo3 = vec_ld(32, val);
vo4 = vec_ld(48, val);
for (j = 0; j < filterSize; j++) {
unsigned int joffset=j<<1;
unsigned int xoffset=x<<1;
vector unsigned char perm;
vector signed short l1,vLumFilter;
LOAD_FILTER(vLumFilter,filter);
vLumFilter = vec_splat(vLumFilter, 0);
LOAD_L1(l1,src[j],perm);
yuv2planeX_8(vo1, vo2, l1, src[j], x, perm, vLumFilter);
yuv2planeX_8(vo3, vo4, l1, src[j], x + 8, perm, vLumFilter);
}
vo1 = vec_sra(vo1, altivec_vectorShiftInt19);
vo2 = vec_sra(vo2, altivec_vectorShiftInt19);
vo3 = vec_sra(vo3, altivec_vectorShiftInt19);
vo4 = vec_sra(vo4, altivec_vectorShiftInt19);
vs1 = vec_packsu(vo1, vo2);
vs2 = vec_packsu(vo3, vo4);
vf = vec_packsu(vs1, vs2);
VEC_ST(vf, 0, dest);
}
static inline void yuv2planeX_u(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset, int x)
{
int i, j;
for (i = x; i < dstW; i++) {
int t = dither[(i + offset) & 7] << 12;
for (j = 0; j < filterSize; j++)
t += src[j][i] * filter[j];
dest[i] = av_clip_uint8(t >> 19);
}
}
static void yuv2planeX_altivec(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset)
{
int dst_u = -(uintptr_t)dest & 15;
int i;
yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0);
for (i = dst_u; i < dstW - 15; i += 16)
yuv2planeX_16_altivec(filter, filterSize, src, dest + i, dither,
offset, i);
yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i);
}
#if HAVE_BIGENDIAN
// The 3 above is 2 (filterSize == 4) + 1 (sizeof(short) == 2).
// The neat trick: We only care for half the elements,
// high or low depending on (i<<3)%16 (it's 0 or 8 here),
// and we're going to use vec_mule, so we choose
// carefully how to "unpack" the elements into the even slots.
#define GET_VF4(a, vf, f) {\
vf = vec_ld(a<< 3, f);\
if ((a << 3) % 16)\
vf = vec_mergel(vf, (vector signed short)vzero);\
else\
vf = vec_mergeh(vf, (vector signed short)vzero);\
}
#define FIRST_LOAD(sv, pos, s, per) {\
sv = vec_ld(pos, s);\
per = vec_lvsl(pos, s);\
}
#define UPDATE_PTR(s0, d0, s1, d1) {\
d0 = s0;\
d1 = s1;\
}
#define LOAD_SRCV(pos, a, s, per, v0, v1, vf) {\
v1 = vec_ld(pos + a + 16, s);\
vf = vec_perm(v0, v1, per);\
}
#define LOAD_SRCV8(pos, a, s, per, v0, v1, vf) {\
if ((((uintptr_t)s + pos) % 16) > 8) {\
v1 = vec_ld(pos + a + 16, s);\
}\
vf = vec_perm(v0, v1, per);\
}
#define GET_VFD(a, b, f, vf0, vf1, per, vf, off) {\
vf1 = vec_ld((a * 2 * filterSize) + (b * 2) + 16 + off, f);\
vf = vec_perm(vf0, vf1, per);\
}
#else /* else of #if HAVE_BIGENDIAN */
#define GET_VF4(a, vf, f) {\
vf = (vector signed short)vec_vsx_ld(a << 3, f);\
vf = vec_mergeh(vf, (vector signed short)vzero);\
}
#define FIRST_LOAD(sv, pos, s, per) {}
#define UPDATE_PTR(s0, d0, s1, d1) {}
#define LOAD_SRCV(pos, a, s, per, v0, v1, vf) {\
vf = vec_vsx_ld(pos + a, s);\
}
#define LOAD_SRCV8(pos, a, s, per, v0, v1, vf) LOAD_SRCV(pos, a, s, per, v0, v1, vf)
#define GET_VFD(a, b, f, vf0, vf1, per, vf, off) {\
vf = vec_vsx_ld((a * 2 * filterSize) + (b * 2) + off, f);\
}
#endif /* end of #if HAVE_BIGENDIAN */
static void hScale_altivec_real(SwsContext *c, int16_t *dst, int dstW,
const uint8_t *src, const int16_t *filter,
const int32_t *filterPos, int filterSize)
{
register int i;
LOCAL_ALIGNED(16, int, tempo, [4]);
if (filterSize % 4) {
for (i = 0; i < dstW; i++) {
register int j;
register int srcPos = filterPos[i];
register int val = 0;
for (j = 0; j < filterSize; j++)
val += ((int)src[srcPos + j]) * filter[filterSize * i + j];
dst[i] = FFMIN(val >> 7, (1 << 15) - 1);
}
} else
switch (filterSize) {
case 4:
for (i = 0; i < dstW; i++) {
register int srcPos = filterPos[i];
vector unsigned char src_vF = unaligned_load(srcPos, src);
vector signed short src_v, filter_v;
vector signed int val_vEven, val_s;
src_v = // vec_unpackh sign-extends...
(vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
// now put our elements in the even slots
src_v = vec_mergeh(src_v, (vector signed short)vzero);
GET_VF4(i, filter_v, filter);
val_vEven = vec_mule(src_v, filter_v);
val_s = vec_sums(val_vEven, vzero);
vec_st(val_s, 0, tempo);
dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
}
break;
case 8:
for (i = 0; i < dstW; i++) {
register int srcPos = filterPos[i];
vector unsigned char src_vF, src_v0, src_v1;
vector unsigned char permS;
vector signed short src_v, filter_v;
vector signed int val_v, val_s;
FIRST_LOAD(src_v0, srcPos, src, permS);
LOAD_SRCV8(srcPos, 0, src, permS, src_v0, src_v1, src_vF);
src_v = // vec_unpackh sign-extends...
(vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
filter_v = vec_ld(i << 4, filter);
val_v = vec_msums(src_v, filter_v, (vector signed int)vzero);
val_s = vec_sums(val_v, vzero);
vec_st(val_s, 0, tempo);
dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
}
break;
case 16:
for (i = 0; i < dstW; i++) {
register int srcPos = filterPos[i];
vector unsigned char src_vF = unaligned_load(srcPos, src);
vector signed short src_vA = // vec_unpackh sign-extends...
(vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
vector signed short src_vB = // vec_unpackh sign-extends...
(vector signed short)(VEC_MERGEL((vector unsigned char)vzero, src_vF));
vector signed short filter_v0 = vec_ld(i << 5, filter);
vector signed short filter_v1 = vec_ld((i << 5) + 16, filter);
vector signed int val_acc = vec_msums(src_vA, filter_v0, (vector signed int)vzero);
vector signed int val_v = vec_msums(src_vB, filter_v1, val_acc);
vector signed int val_s = vec_sums(val_v, vzero);
VEC_ST(val_s, 0, tempo);
dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
}
break;
default:
for (i = 0; i < dstW; i++) {
register int j, offset = i * 2 * filterSize;
register int srcPos = filterPos[i];
vector signed int val_s, val_v = (vector signed int)vzero;
vector signed short filter_v0R;
vector unsigned char permF, src_v0, permS;
FIRST_LOAD(filter_v0R, offset, filter, permF);
FIRST_LOAD(src_v0, srcPos, src, permS);
for (j = 0; j < filterSize - 15; j += 16) {
vector unsigned char src_v1, src_vF;
vector signed short filter_v1R, filter_v2R, filter_v0, filter_v1;
LOAD_SRCV(srcPos, j, src, permS, src_v0, src_v1, src_vF);
vector signed short src_vA = // vec_unpackh sign-extends...
(vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
vector signed short src_vB = // vec_unpackh sign-extends...
(vector signed short)(VEC_MERGEL((vector unsigned char)vzero, src_vF));
GET_VFD(i, j, filter, filter_v0R, filter_v1R, permF, filter_v0, 0);
GET_VFD(i, j, filter, filter_v1R, filter_v2R, permF, filter_v1, 16);
vector signed int val_acc = vec_msums(src_vA, filter_v0, val_v);
val_v = vec_msums(src_vB, filter_v1, val_acc);
UPDATE_PTR(filter_v2R, filter_v0R, src_v1, src_v0);
}
if (j < filterSize - 7) {
// loading src_v0 is useless, it's already done above
vector unsigned char src_v1, src_vF;
vector signed short src_v, filter_v1R, filter_v;
LOAD_SRCV8(srcPos, j, src, permS, src_v0, src_v1, src_vF);
src_v = // vec_unpackh sign-extends...
(vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
GET_VFD(i, j, filter, filter_v0R, filter_v1R, permF, filter_v, 0);
val_v = vec_msums(src_v, filter_v, val_v);
}
val_s = vec_sums(val_v, vzero);
VEC_ST(val_s, 0, tempo);
dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
}
}
}
#endif /* HAVE_ALTIVEC */
av_cold void ff_sws_init_swscale_ppc(SwsContext *c)
{
#if HAVE_ALTIVEC
enum AVPixelFormat dstFormat = c->dstFormat;
if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
return;
if (c->srcBpc == 8 && c->dstBpc <= 14) {
c->hyScale = c->hcScale = hScale_altivec_real;
}
if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) &&
dstFormat != AV_PIX_FMT_NV12 && dstFormat != AV_PIX_FMT_NV21 &&
!c->alpPixBuf) {
c->yuv2planeX = yuv2planeX_altivec;
}
/* The following list of supported dstFormat values should
* match what's found in the body of ff_yuv2packedX_altivec() */
if (!(c->flags & (SWS_BITEXACT | SWS_FULL_CHR_H_INT)) && !c->alpPixBuf) {
switch (c->dstFormat) {
case AV_PIX_FMT_ABGR:
c->yuv2packedX = ff_yuv2abgr_X_altivec;
break;
case AV_PIX_FMT_BGRA:
c->yuv2packedX = ff_yuv2bgra_X_altivec;
break;
case AV_PIX_FMT_ARGB:
c->yuv2packedX = ff_yuv2argb_X_altivec;
break;
case AV_PIX_FMT_RGBA:
c->yuv2packedX = ff_yuv2rgba_X_altivec;
break;
case AV_PIX_FMT_BGR24:
c->yuv2packedX = ff_yuv2bgr24_X_altivec;
break;
case AV_PIX_FMT_RGB24:
c->yuv2packedX = ff_yuv2rgb24_X_altivec;
break;
}
}
#endif /* HAVE_ALTIVEC */
}


@@ -0,0 +1,873 @@
/*
* AltiVec acceleration for colorspace conversion
*
* copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
 * Convert I420/YV12 to RGB in various formats. It rejects images that
 * are not in a 420 format, images whose width is not a multiple of 16,
 * and images whose height is not a multiple of 2; rejected images are
 * deferred to the C simulation code.
*
* Lots of optimizations to be done here.
*
* 1. Need to fix saturation code. I just couldn't get it to fly with packs
* and adds, so we currently use max/min to clip.
*
* 2. The inefficient use of chroma loading needs a bit of brushing up.
*
* 3. Analysis of pipeline stalls needs to be done. Use shark to identify
* pipeline stalls.
*
*
* MODIFIED to calculate coeffs from currently selected color space.
* MODIFIED core to be a macro where you specify the output format.
 * ADDED UYVY conversion which is never called due to something in swscale.
 * CORRECTED algorithm selection to be strict on input formats.
* ADDED runtime detection of AltiVec.
*
* ADDED altivec_yuv2packedX vertical scl + RGB converter
*
 * March 27, 2004
* PERFORMANCE ANALYSIS
*
 * The C version uses 25% of the processor, or ~250 MIPS, for D1 rawvideo
 * used as the test.
 * The AltiVec version uses 10% of the processor, or ~100 MIPS, for the
 * same D1 sequence.
*
 * 720 * 480 * 30 is ~10 Mpixel/s,
 *
 * so we have roughly 10 clocks per pixel. This is too high; something
 * has to be wrong.
*
 * OPTIMIZED clip code to utilize vec_max and vec_packs, removing the
 * need for vec_min.
*
 * OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to
 * have the input video frame; it was just decompressed, so it probably
 * resides in the L1 cache. However, we are creating the output video stream,
 * which needs to use the DSTST instruction to optimize for the cache. We
 * couple this with the fact that we are not going to visit the input buffer
 * again, so we mark it Least Recently Used. This shaves 25% off the
 * processor cycles.
*
* Now memcpy is the largest mips consumer in the system, probably due
* to the inefficient X11 stuff.
*
 * GL libraries seem to be very slow on this machine, a 1.33 GHz PB running
 * Jaguar; this is not the case for my 1 GHz PB. I thought it might be
 * a versioning issue, however I have libGL.1.2.dylib on both
 * machines. (We need to figure this out now.)
*
* GL2 libraries work now with patch for RGB32.
*
* NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
*
 * Integrated luma prescaling for saturation/contrast/brightness
 * adjustment.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <assert.h>
#include "config.h"
#include "libswscale/rgb2rgb.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/pixdesc.h"
#include "yuv2rgb_altivec.h"
#if HAVE_ALTIVEC
#undef PROFILE_THE_BEAST
#undef INC_SCALING
typedef unsigned char ubyte;
typedef signed char sbyte;
/* RGB interleaver: 16 planar pels with 8-bit samples per channel in
 * homogeneous vector registers; x0, x1, x2 are interleaved with the
 * following technique:
*
* o0 = vec_mergeh(x0, x1);
* o1 = vec_perm(o0, x2, perm_rgb_0);
* o2 = vec_perm(o0, x2, perm_rgb_1);
* o3 = vec_mergel(x0, x1);
* o4 = vec_perm(o3, o2, perm_rgb_2);
* o5 = vec_perm(o3, o2, perm_rgb_3);
*
* perm_rgb_0: o0(RG).h v1(B) --> o1*
* 0 1 2 3 4
* rgbr|gbrg|brgb|rgbr
* 0010 0100 1001 0010
* 0102 3145 2673 894A
*
* perm_rgb_1: o0(RG).h v1(B) --> o2
* 0 1 2 3 4
* gbrg|brgb|bbbb|bbbb
* 0100 1001 1111 1111
* B5CD 6EF7 89AB CDEF
*
* perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4*
* 0 1 2 3 4
* gbrg|brgb|rgbr|gbrg
* 1111 1111 0010 0100
* 89AB CDEF 0182 3945
*
* perm_rgb_2: o3(RG).l o2(rgbB.l) ---> o5*
* 0 1 2 3 4
* brgb|rgbr|gbrg|brgb
* 1001 0010 0100 1001
* a67b 89cA BdCD eEFf
*
*/
static const vector unsigned char
perm_rgb_0 = { 0x00, 0x01, 0x10, 0x02, 0x03, 0x11, 0x04, 0x05,
0x12, 0x06, 0x07, 0x13, 0x08, 0x09, 0x14, 0x0a },
perm_rgb_1 = { 0x0b, 0x15, 0x0c, 0x0d, 0x16, 0x0e, 0x0f, 0x17,
0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f },
perm_rgb_2 = { 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
0x00, 0x01, 0x18, 0x02, 0x03, 0x19, 0x04, 0x05 },
perm_rgb_3 = { 0x1a, 0x06, 0x07, 0x1b, 0x08, 0x09, 0x1c, 0x0a,
0x0b, 0x1d, 0x0c, 0x0d, 0x1e, 0x0e, 0x0f, 0x1f };
#define vec_merge3(x2, x1, x0, y0, y1, y2) \
do { \
__typeof__(x0) o0, o2, o3; \
o0 = vec_mergeh(x0, x1); \
y0 = vec_perm(o0, x2, perm_rgb_0); \
o2 = vec_perm(o0, x2, perm_rgb_1); \
o3 = vec_mergel(x0, x1); \
y1 = vec_perm(o3, o2, perm_rgb_2); \
y2 = vec_perm(o3, o2, perm_rgb_3); \
} while (0)
#define vec_mstbgr24(x0, x1, x2, ptr) \
do { \
__typeof__(x0) _0, _1, _2; \
vec_merge3(x0, x1, x2, _0, _1, _2); \
vec_st(_0, 0, ptr++); \
vec_st(_1, 0, ptr++); \
vec_st(_2, 0, ptr++); \
} while (0)
#define vec_mstrgb24(x0, x1, x2, ptr) \
do { \
__typeof__(x0) _0, _1, _2; \
vec_merge3(x2, x1, x0, _0, _1, _2); \
vec_st(_0, 0, ptr++); \
vec_st(_1, 0, ptr++); \
vec_st(_2, 0, ptr++); \
} while (0)
/* pack the pixels in rgb0 format
* msb R
* lsb 0
*/
#define vec_mstrgb32(T, x0, x1, x2, x3, ptr) \
do { \
T _0, _1, _2, _3; \
_0 = vec_mergeh(x0, x1); \
_1 = vec_mergeh(x2, x3); \
_2 = (T) vec_mergeh((vector unsigned short) _0, \
(vector unsigned short) _1); \
_3 = (T) vec_mergel((vector unsigned short) _0, \
(vector unsigned short) _1); \
vec_st(_2, 0 * 16, (T *) ptr); \
vec_st(_3, 1 * 16, (T *) ptr); \
_0 = vec_mergel(x0, x1); \
_1 = vec_mergel(x2, x3); \
_2 = (T) vec_mergeh((vector unsigned short) _0, \
(vector unsigned short) _1); \
_3 = (T) vec_mergel((vector unsigned short) _0, \
(vector unsigned short) _1); \
vec_st(_2, 2 * 16, (T *) ptr); \
vec_st(_3, 3 * 16, (T *) ptr); \
ptr += 4; \
} while (0)
/*
 *   | 1    0       1.4021 |   | Y  |
 *   | 1   -0.3441 -0.7142 | x | Cb |
 *   | 1    1.7718  0      |   | Cr |
 *
 *
 * Y:     [-128, 127]
 * Cb/Cr: [-128, 127]
 *
 * Typical YUV conversion works on Y in [0, 255]; this version has been
 * optimized for JPEG decoding.
*/
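/*
 * A scalar sketch of the fixed-point pipeline in cvtyuvtoRGB() below,
 * assuming the Q15 coefficient layout built by ff_yuv2rgb_init_tables_ppc();
 * per element, vec_mradds(a, b, c) behaves like
 * sat16(((a * b + 0x4000) >> 15) + c):
 *
 *     y = sat16(((Y * CY + 0x4000) >> 15) + OY);
 *     B = sat16(((((U - 128) << CSHIFT) * CBU + 0x4000) >> 15) + y);
 *     R = sat16(((((V - 128) << CSHIFT) * CRV + 0x4000) >> 15) + y);
 *     g = sat16((((U - 128) * CGU + 0x4000) >> 15) + y);
 *     G = sat16((((V - 128) * CGV + 0x4000) >> 15) + g);
 */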
#if HAVE_BIGENDIAN
#define vec_unh(x) \
(vector signed short) \
vec_perm(x, (__typeof__(x)) { 0 }, \
((vector unsigned char) { \
0x10, 0x00, 0x10, 0x01, 0x10, 0x02, 0x10, 0x03, \
0x10, 0x04, 0x10, 0x05, 0x10, 0x06, 0x10, 0x07 }))
#define vec_unl(x) \
(vector signed short) \
vec_perm(x, (__typeof__(x)) { 0 }, \
((vector unsigned char) { \
0x10, 0x08, 0x10, 0x09, 0x10, 0x0A, 0x10, 0x0B, \
0x10, 0x0C, 0x10, 0x0D, 0x10, 0x0E, 0x10, 0x0F }))
#else
#define vec_unh(x) (vector signed short) vec_mergeh(x, (__typeof__(x)) { 0 })
#define vec_unl(x) (vector signed short) vec_mergel(x, (__typeof__(x)) { 0 })
#endif
#define vec_clip_s16(x) \
vec_max(vec_min(x, ((vector signed short) { \
235, 235, 235, 235, 235, 235, 235, 235 })), \
((vector signed short) { 16, 16, 16, 16, 16, 16, 16, 16 }))
#define vec_packclp(x, y) \
(vector unsigned char) \
vec_packs((vector unsigned short) \
vec_max(x, ((vector signed short) { 0 })), \
(vector unsigned short) \
vec_max(y, ((vector signed short) { 0 })))
static inline void cvtyuvtoRGB(SwsContext *c, vector signed short Y,
vector signed short U, vector signed short V,
vector signed short *R, vector signed short *G,
vector signed short *B)
{
vector signed short vx, ux, uvx;
Y = vec_mradds(Y, c->CY, c->OY);
U = vec_sub(U, (vector signed short)
vec_splat((vector signed short) { 128 }, 0));
V = vec_sub(V, (vector signed short)
vec_splat((vector signed short) { 128 }, 0));
// ux = (CBU * (u << c->CSHIFT) + 0x4000) >> 15;
ux = vec_sl(U, c->CSHIFT);
*B = vec_mradds(ux, c->CBU, Y);
// vx = (CRV * (v << c->CSHIFT) + 0x4000) >> 15;
vx = vec_sl(V, c->CSHIFT);
*R = vec_mradds(vx, c->CRV, Y);
// uvx = ((CGU * u) + (CGV * v)) >> 15;
uvx = vec_mradds(U, c->CGU, Y);
*G = vec_mradds(V, c->CGV, uvx);
}
/*
* ------------------------------------------------------------------------------
* CS converters
* ------------------------------------------------------------------------------
*/
#define DEFCSP420_CVT(name, out_pixels) \
static int altivec_ ## name(SwsContext *c, const unsigned char **in, \
int *instrides, int srcSliceY, int srcSliceH, \
unsigned char **oplanes, int *outstrides) \
{ \
int w = c->srcW; \
int h = srcSliceH; \
int i, j; \
int instrides_scl[3]; \
vector unsigned char y0, y1; \
\
vector signed char u, v; \
\
vector signed short Y0, Y1, Y2, Y3; \
vector signed short U, V; \
vector signed short vx, ux, uvx; \
vector signed short vx0, ux0, uvx0; \
vector signed short vx1, ux1, uvx1; \
vector signed short R0, G0, B0; \
vector signed short R1, G1, B1; \
vector unsigned char R, G, B; \
\
const vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP; \
vector unsigned char align_perm; \
\
vector signed short lCY = c->CY; \
vector signed short lOY = c->OY; \
vector signed short lCRV = c->CRV; \
vector signed short lCBU = c->CBU; \
vector signed short lCGU = c->CGU; \
vector signed short lCGV = c->CGV; \
vector unsigned short lCSHIFT = c->CSHIFT; \
\
const ubyte *y1i = in[0]; \
const ubyte *y2i = in[0] + instrides[0]; \
const ubyte *ui = in[1]; \
const ubyte *vi = in[2]; \
\
vector unsigned char *oute, *outo; \
\
/* loop moves y{1, 2}i by w */ \
instrides_scl[0] = instrides[0] * 2 - w; \
/* loop moves ui by w / 2 */ \
instrides_scl[1] = instrides[1] - w / 2; \
/* loop moves vi by w / 2 */ \
instrides_scl[2] = instrides[2] - w / 2; \
\
for (i = 0; i < h / 2; i++) { \
oute = (vector unsigned char *)(oplanes[0] + outstrides[0] * \
(srcSliceY + i * 2)); \
outo = oute + (outstrides[0] >> 4); \
vec_dstst(outo, (0x02000002 | (((w * 3 + 32) / 32) << 16)), 0); \
vec_dstst(oute, (0x02000002 | (((w * 3 + 32) / 32) << 16)), 1); \
\
for (j = 0; j < w / 16; j++) { \
y1ivP = (const vector unsigned char *) y1i; \
y2ivP = (const vector unsigned char *) y2i; \
uivP = (const vector unsigned char *) ui; \
vivP = (const vector unsigned char *) vi; \
\
align_perm = vec_lvsl(0, y1i); \
y0 = (vector unsigned char) \
vec_perm(y1ivP[0], y1ivP[1], align_perm); \
\
align_perm = vec_lvsl(0, y2i); \
y1 = (vector unsigned char) \
vec_perm(y2ivP[0], y2ivP[1], align_perm); \
\
align_perm = vec_lvsl(0, ui); \
u = (vector signed char) \
vec_perm(uivP[0], uivP[1], align_perm); \
\
align_perm = vec_lvsl(0, vi); \
v = (vector signed char) \
vec_perm(vivP[0], vivP[1], align_perm); \
\
u = (vector signed char) \
vec_sub(u, \
(vector signed char) \
vec_splat((vector signed char) { 128 }, 0)); \
v = (vector signed char) \
vec_sub(v, \
(vector signed char) \
vec_splat((vector signed char) { 128 }, 0)); \
\
U = vec_unpackh(u); \
V = vec_unpackh(v); \
\
Y0 = vec_unh(y0); \
Y1 = vec_unl(y0); \
Y2 = vec_unh(y1); \
Y3 = vec_unl(y1); \
\
Y0 = vec_mradds(Y0, lCY, lOY); \
Y1 = vec_mradds(Y1, lCY, lOY); \
Y2 = vec_mradds(Y2, lCY, lOY); \
Y3 = vec_mradds(Y3, lCY, lOY); \
\
/* ux = (CBU * (u << CSHIFT) + 0x4000) >> 15 */ \
ux = vec_sl(U, lCSHIFT); \
ux = vec_mradds(ux, lCBU, (vector signed short) { 0 }); \
ux0 = vec_mergeh(ux, ux); \
ux1 = vec_mergel(ux, ux); \
\
/* vx = (CRV * (v << CSHIFT) + 0x4000) >> 15; */ \
vx = vec_sl(V, lCSHIFT); \
vx = vec_mradds(vx, lCRV, (vector signed short) { 0 }); \
vx0 = vec_mergeh(vx, vx); \
vx1 = vec_mergel(vx, vx); \
\
/* uvx = ((CGU * u) + (CGV * v)) >> 15 */ \
uvx = vec_mradds(U, lCGU, (vector signed short) { 0 }); \
uvx = vec_mradds(V, lCGV, uvx); \
uvx0 = vec_mergeh(uvx, uvx); \
uvx1 = vec_mergel(uvx, uvx); \
\
R0 = vec_add(Y0, vx0); \
G0 = vec_add(Y0, uvx0); \
B0 = vec_add(Y0, ux0); \
R1 = vec_add(Y1, vx1); \
G1 = vec_add(Y1, uvx1); \
B1 = vec_add(Y1, ux1); \
\
R = vec_packclp(R0, R1); \
G = vec_packclp(G0, G1); \
B = vec_packclp(B0, B1); \
\
out_pixels(R, G, B, oute); \
\
R0 = vec_add(Y2, vx0); \
G0 = vec_add(Y2, uvx0); \
B0 = vec_add(Y2, ux0); \
R1 = vec_add(Y3, vx1); \
G1 = vec_add(Y3, uvx1); \
B1 = vec_add(Y3, ux1); \
R = vec_packclp(R0, R1); \
G = vec_packclp(G0, G1); \
B = vec_packclp(B0, B1); \
\
\
out_pixels(R, G, B, outo); \
\
y1i += 16; \
y2i += 16; \
ui += 8; \
vi += 8; \
} \
\
ui += instrides_scl[1]; \
vi += instrides_scl[2]; \
y1i += instrides_scl[0]; \
y2i += instrides_scl[0]; \
} \
return srcSliceH; \
}
#define out_abgr(a, b, c, ptr) \
vec_mstrgb32(__typeof__(a), ((__typeof__(a)) { 255 }), c, b, a, ptr)
#define out_bgra(a, b, c, ptr) \
vec_mstrgb32(__typeof__(a), c, b, a, ((__typeof__(a)) { 255 }), ptr)
#define out_rgba(a, b, c, ptr) \
vec_mstrgb32(__typeof__(a), a, b, c, ((__typeof__(a)) { 255 }), ptr)
#define out_argb(a, b, c, ptr) \
vec_mstrgb32(__typeof__(a), ((__typeof__(a)) { 255 }), a, b, c, ptr)
#define out_rgb24(a, b, c, ptr) vec_mstrgb24(a, b, c, ptr)
#define out_bgr24(a, b, c, ptr) vec_mstbgr24(a, b, c, ptr)
DEFCSP420_CVT(yuv2_abgr, out_abgr)
DEFCSP420_CVT(yuv2_bgra, out_bgra)
DEFCSP420_CVT(yuv2_rgba, out_rgba)
DEFCSP420_CVT(yuv2_argb, out_argb)
DEFCSP420_CVT(yuv2_rgb24, out_rgb24)
DEFCSP420_CVT(yuv2_bgr24, out_bgr24)
// uyvy|uyvy|uyvy|uyvy
// 0123 4567 89ab cdef
static const vector unsigned char
demux_u = { 0x10, 0x00, 0x10, 0x00,
0x10, 0x04, 0x10, 0x04,
0x10, 0x08, 0x10, 0x08,
0x10, 0x0c, 0x10, 0x0c },
demux_v = { 0x10, 0x02, 0x10, 0x02,
0x10, 0x06, 0x10, 0x06,
0x10, 0x0A, 0x10, 0x0A,
0x10, 0x0E, 0x10, 0x0E },
demux_y = { 0x10, 0x01, 0x10, 0x03,
0x10, 0x05, 0x10, 0x07,
0x10, 0x09, 0x10, 0x0B,
0x10, 0x0D, 0x10, 0x0F };
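/* Worked example (big-endian host): one 16-byte block holds eight UYVY
 * pixels as u0 y0 v0 y1 u1 y2 v1 y3 ...; each permute pairs the zero byte
 * at index 0x10 (the first byte of the second, all-zero operand) with one
 * payload byte, yielding zero-extended 16-bit samples:
 *
 *     vec_perm(uyvy, zero, demux_u) -> (u0 u0 u1 u1 u2 u2 u3 u3)
 *     vec_perm(uyvy, zero, demux_v) -> (v0 v0 v1 v1 v2 v2 v3 v3)
 *     vec_perm(uyvy, zero, demux_y) -> (y0 y1 y2 y3 y4 y5 y6 y7)
 *
 * with each chroma sample doubled to cover its two luma samples. */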
/*
* this is so I can play live CCIR raw video
*/
static int altivec_uyvy_rgb32(SwsContext *c, const unsigned char **in,
int *instrides, int srcSliceY, int srcSliceH,
unsigned char **oplanes, int *outstrides)
{
int w = c->srcW;
int h = srcSliceH;
int i, j;
vector unsigned char uyvy;
vector signed short Y, U, V;
vector signed short R0, G0, B0, R1, G1, B1;
vector unsigned char R, G, B;
vector unsigned char *out;
const ubyte *img;
img = in[0];
out = (vector unsigned char *) (oplanes[0] + srcSliceY * outstrides[0]);
for (i = 0; i < h; i++)
for (j = 0; j < w / 16; j++) {
uyvy = vec_ld(0, img);
U = (vector signed short)
vec_perm(uyvy, (vector unsigned char) { 0 }, demux_u);
V = (vector signed short)
vec_perm(uyvy, (vector unsigned char) { 0 }, demux_v);
Y = (vector signed short)
vec_perm(uyvy, (vector unsigned char) { 0 }, demux_y);
cvtyuvtoRGB(c, Y, U, V, &R0, &G0, &B0);
uyvy = vec_ld(16, img);
U = (vector signed short)
vec_perm(uyvy, (vector unsigned char) { 0 }, demux_u);
V = (vector signed short)
vec_perm(uyvy, (vector unsigned char) { 0 }, demux_v);
Y = (vector signed short)
vec_perm(uyvy, (vector unsigned char) { 0 }, demux_y);
cvtyuvtoRGB(c, Y, U, V, &R1, &G1, &B1);
R = vec_packclp(R0, R1);
G = vec_packclp(G0, G1);
B = vec_packclp(B0, B1);
// vec_mstbgr24 (R,G,B, out);
out_rgba(R, G, B, out);
img += 32;
}
return srcSliceH;
}
#endif /* HAVE_ALTIVEC */
/* OK, currently the acceleration routine only supports
 * inputs whose width is a multiple of 16
 * and whose height is a multiple of 2.
 *
 * For everything else we just fall back to the C code.
 */
av_cold SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c)
{
#if HAVE_ALTIVEC
if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
return NULL;
    /*
     * This seems not to matter too much; I tried a bunch of
     * videos with abnormal widths and MPlayer crashes elsewhere:
     *
     *   mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
     *
     * boom with an X11 bad match.
     */
if ((c->srcW & 0xf) != 0)
return NULL;
switch (c->srcFormat) {
case AV_PIX_FMT_YUV410P:
case AV_PIX_FMT_YUV420P:
/*case IMGFMT_CLPL: ??? */
case AV_PIX_FMT_GRAY8:
case AV_PIX_FMT_NV12:
case AV_PIX_FMT_NV21:
if ((c->srcH & 0x1) != 0)
return NULL;
switch (c->dstFormat) {
case AV_PIX_FMT_RGB24:
av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
return altivec_yuv2_rgb24;
case AV_PIX_FMT_BGR24:
av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
return altivec_yuv2_bgr24;
case AV_PIX_FMT_ARGB:
av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
return altivec_yuv2_argb;
case AV_PIX_FMT_ABGR:
av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
return altivec_yuv2_abgr;
case AV_PIX_FMT_RGBA:
av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
return altivec_yuv2_rgba;
case AV_PIX_FMT_BGRA:
av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
return altivec_yuv2_bgra;
default: return NULL;
}
break;
case AV_PIX_FMT_UYVY422:
switch (c->dstFormat) {
case AV_PIX_FMT_BGR32:
av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
return altivec_uyvy_rgb32;
default: return NULL;
}
break;
}
#endif /* HAVE_ALTIVEC */
return NULL;
}
av_cold void ff_yuv2rgb_init_tables_ppc(SwsContext *c,
const int inv_table[4],
int brightness,
int contrast,
int saturation)
{
#if HAVE_ALTIVEC
union {
DECLARE_ALIGNED(16, signed short, tmp)[8];
vector signed short vec;
} buf;
if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
return;
buf.tmp[0] = ((0xffffLL) * contrast >> 8) >> 9; // cy
buf.tmp[1] = -256 * brightness; // oy
buf.tmp[2] = (inv_table[0] >> 3) * (contrast >> 16) * (saturation >> 16); // crv
buf.tmp[3] = (inv_table[1] >> 3) * (contrast >> 16) * (saturation >> 16); // cbu
buf.tmp[4] = -((inv_table[2] >> 1) * (contrast >> 16) * (saturation >> 16)); // cgu
buf.tmp[5] = -((inv_table[3] >> 1) * (contrast >> 16) * (saturation >> 16)); // cgv
c->CSHIFT = (vector unsigned short) vec_splat_u16(2);
c->CY = vec_splat((vector signed short) buf.vec, 0);
c->OY = vec_splat((vector signed short) buf.vec, 1);
c->CRV = vec_splat((vector signed short) buf.vec, 2);
c->CBU = vec_splat((vector signed short) buf.vec, 3);
c->CGU = vec_splat((vector signed short) buf.vec, 4);
c->CGV = vec_splat((vector signed short) buf.vec, 5);
return;
#endif /* HAVE_ALTIVEC */
}
#if HAVE_ALTIVEC
static av_always_inline void yuv2packedX_altivec(SwsContext *c,
const int16_t *lumFilter,
const int16_t **lumSrc,
int lumFilterSize,
const int16_t *chrFilter,
const int16_t **chrUSrc,
const int16_t **chrVSrc,
int chrFilterSize,
const int16_t **alpSrc,
uint8_t *dest,
int dstW, int dstY,
enum AVPixelFormat target)
{
int i, j;
vector signed short X, X0, X1, Y0, U0, V0, Y1, U1, V1, U, V;
vector signed short R0, G0, B0, R1, G1, B1;
vector unsigned char R, G, B;
vector unsigned char *out, *nout;
vector signed short RND = vec_splat_s16(1 << 3);
vector unsigned short SCL = vec_splat_u16(4);
DECLARE_ALIGNED(16, unsigned int, scratch)[16];
vector signed short *YCoeffs, *CCoeffs;
YCoeffs = c->vYCoeffsBank + dstY * lumFilterSize;
CCoeffs = c->vCCoeffsBank + dstY * chrFilterSize;
out = (vector unsigned char *) dest;
for (i = 0; i < dstW; i += 16) {
Y0 = RND;
Y1 = RND;
/* extract 16 coeffs from lumSrc */
for (j = 0; j < lumFilterSize; j++) {
X0 = vec_ld(0, &lumSrc[j][i]);
X1 = vec_ld(16, &lumSrc[j][i]);
Y0 = vec_mradds(X0, YCoeffs[j], Y0);
Y1 = vec_mradds(X1, YCoeffs[j], Y1);
}
U = RND;
V = RND;
/* extract 8 coeffs from U,V */
for (j = 0; j < chrFilterSize; j++) {
X = vec_ld(0, &chrUSrc[j][i / 2]);
U = vec_mradds(X, CCoeffs[j], U);
X = vec_ld(0, &chrVSrc[j][i / 2]);
V = vec_mradds(X, CCoeffs[j], V);
}
/* scale and clip signals */
Y0 = vec_sra(Y0, SCL);
Y1 = vec_sra(Y1, SCL);
U = vec_sra(U, SCL);
V = vec_sra(V, SCL);
Y0 = vec_clip_s16(Y0);
Y1 = vec_clip_s16(Y1);
U = vec_clip_s16(U);
V = vec_clip_s16(V);
/* now we have
* Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
* U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7
*
* Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
* U0 = u0 u0 u1 u1 u2 u2 u3 u3 U1 = u4 u4 u5 u5 u6 u6 u7 u7
* V0 = v0 v0 v1 v1 v2 v2 v3 v3 V1 = v4 v4 v5 v5 v6 v6 v7 v7
*/
U0 = vec_mergeh(U, U);
V0 = vec_mergeh(V, V);
U1 = vec_mergel(U, U);
V1 = vec_mergel(V, V);
cvtyuvtoRGB(c, Y0, U0, V0, &R0, &G0, &B0);
cvtyuvtoRGB(c, Y1, U1, V1, &R1, &G1, &B1);
R = vec_packclp(R0, R1);
G = vec_packclp(G0, G1);
B = vec_packclp(B0, B1);
switch (target) {
case AV_PIX_FMT_ABGR:
out_abgr(R, G, B, out);
break;
case AV_PIX_FMT_BGRA:
out_bgra(R, G, B, out);
break;
case AV_PIX_FMT_RGBA:
out_rgba(R, G, B, out);
break;
case AV_PIX_FMT_ARGB:
out_argb(R, G, B, out);
break;
case AV_PIX_FMT_RGB24:
out_rgb24(R, G, B, out);
break;
case AV_PIX_FMT_BGR24:
out_bgr24(R, G, B, out);
break;
default:
{
/* If this is reached, the caller should have called yuv2packedXinC
* instead. */
static int printed_error_message;
if (!printed_error_message) {
av_log(c, AV_LOG_ERROR,
"altivec_yuv2packedX doesn't support %s output\n",
av_get_pix_fmt_name(c->dstFormat));
printed_error_message = 1;
}
return;
}
}
}
if (i < dstW) {
i -= 16;
Y0 = RND;
Y1 = RND;
/* extract 16 coeffs from lumSrc */
for (j = 0; j < lumFilterSize; j++) {
X0 = vec_ld(0, &lumSrc[j][i]);
X1 = vec_ld(16, &lumSrc[j][i]);
Y0 = vec_mradds(X0, YCoeffs[j], Y0);
Y1 = vec_mradds(X1, YCoeffs[j], Y1);
}
U = RND;
V = RND;
/* extract 8 coeffs from U,V */
for (j = 0; j < chrFilterSize; j++) {
X = vec_ld(0, &chrUSrc[j][i / 2]);
U = vec_mradds(X, CCoeffs[j], U);
X = vec_ld(0, &chrVSrc[j][i / 2]);
V = vec_mradds(X, CCoeffs[j], V);
}
/* scale and clip signals */
Y0 = vec_sra(Y0, SCL);
Y1 = vec_sra(Y1, SCL);
U = vec_sra(U, SCL);
V = vec_sra(V, SCL);
Y0 = vec_clip_s16(Y0);
Y1 = vec_clip_s16(Y1);
U = vec_clip_s16(U);
V = vec_clip_s16(V);
/* now we have
* Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
* U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7
*
* Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
* U0 = u0 u0 u1 u1 u2 u2 u3 u3 U1 = u4 u4 u5 u5 u6 u6 u7 u7
* V0 = v0 v0 v1 v1 v2 v2 v3 v3 V1 = v4 v4 v5 v5 v6 v6 v7 v7
*/
U0 = vec_mergeh(U, U);
V0 = vec_mergeh(V, V);
U1 = vec_mergel(U, U);
V1 = vec_mergel(V, V);
cvtyuvtoRGB(c, Y0, U0, V0, &R0, &G0, &B0);
cvtyuvtoRGB(c, Y1, U1, V1, &R1, &G1, &B1);
R = vec_packclp(R0, R1);
G = vec_packclp(G0, G1);
B = vec_packclp(B0, B1);
nout = (vector unsigned char *) scratch;
switch (target) {
case AV_PIX_FMT_ABGR:
out_abgr(R, G, B, nout);
break;
case AV_PIX_FMT_BGRA:
out_bgra(R, G, B, nout);
break;
case AV_PIX_FMT_RGBA:
out_rgba(R, G, B, nout);
break;
case AV_PIX_FMT_ARGB:
out_argb(R, G, B, nout);
break;
case AV_PIX_FMT_RGB24:
out_rgb24(R, G, B, nout);
break;
case AV_PIX_FMT_BGR24:
out_bgr24(R, G, B, nout);
break;
default:
/* Unreachable, I think. */
av_log(c, AV_LOG_ERROR,
"altivec_yuv2packedX doesn't support %s output\n",
av_get_pix_fmt_name(c->dstFormat));
return;
}
memcpy(&((uint32_t *) dest)[i], scratch, (dstW - i) / 4);
}
}
#define YUV2PACKEDX_WRAPPER(suffix, pixfmt) \
void ff_yuv2 ## suffix ## _X_altivec(SwsContext *c, \
const int16_t *lumFilter, \
const int16_t **lumSrc, \
int lumFilterSize, \
const int16_t *chrFilter, \
const int16_t **chrUSrc, \
const int16_t **chrVSrc, \
int chrFilterSize, \
const int16_t **alpSrc, \
uint8_t *dest, int dstW, int dstY) \
{ \
yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize, \
chrFilter, chrUSrc, chrVSrc, \
chrFilterSize, alpSrc, \
dest, dstW, dstY, pixfmt); \
}
YUV2PACKEDX_WRAPPER(abgr, AV_PIX_FMT_ABGR);
YUV2PACKEDX_WRAPPER(bgra, AV_PIX_FMT_BGRA);
YUV2PACKEDX_WRAPPER(argb, AV_PIX_FMT_ARGB);
YUV2PACKEDX_WRAPPER(rgba, AV_PIX_FMT_RGBA);
YUV2PACKEDX_WRAPPER(rgb24, AV_PIX_FMT_RGB24);
YUV2PACKEDX_WRAPPER(bgr24, AV_PIX_FMT_BGR24);
#endif /* HAVE_ALTIVEC */


@@ -0,0 +1,51 @@
/*
 * AltiVec-enhanced yuv2packedX
*
* Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
* based on the equivalent C code in swscale.c
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef SWSCALE_PPC_YUV2RGB_ALTIVEC_H
#define SWSCALE_PPC_YUV2RGB_ALTIVEC_H
#include <stdint.h>
#include "libswscale/swscale_internal.h"
#define YUV2PACKEDX_HEADER(suffix) \
void ff_yuv2 ## suffix ## _X_altivec(SwsContext *c, \
const int16_t *lumFilter, \
const int16_t **lumSrc, \
int lumFilterSize, \
const int16_t *chrFilter, \
const int16_t **chrUSrc, \
const int16_t **chrVSrc, \
int chrFilterSize, \
const int16_t **alpSrc, \
uint8_t *dest, \
int dstW, int dstY);
YUV2PACKEDX_HEADER(abgr);
YUV2PACKEDX_HEADER(bgra);
YUV2PACKEDX_HEADER(argb);
YUV2PACKEDX_HEADER(rgba);
YUV2PACKEDX_HEADER(rgb24);
YUV2PACKEDX_HEADER(bgr24);
#endif /* SWSCALE_PPC_YUV2RGB_ALTIVEC_H */


@@ -0,0 +1,204 @@
/*
* AltiVec-enhanced yuv-to-yuv conversion routines.
*
* Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
* based on the equivalent C code in swscale.c
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <inttypes.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#if HAVE_ALTIVEC
static int yv12toyuy2_unscaled_altivec(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY,
int srcSliceH, uint8_t *dstParam[],
int dstStride_a[])
{
uint8_t *dst = dstParam[0] + dstStride_a[0] * srcSliceY;
// yv12toyuy2(src[0], src[1], src[2], dst, c->srcW, srcSliceH,
// srcStride[0], srcStride[1], dstStride[0]);
const uint8_t *ysrc = src[0];
const uint8_t *usrc = src[1];
const uint8_t *vsrc = src[2];
const int width = c->srcW;
const int height = srcSliceH;
const int lumStride = srcStride[0];
const int chromStride = srcStride[1];
const int dstStride = dstStride_a[0];
const vector unsigned char yperm = vec_lvsl(0, ysrc);
const int vertLumPerChroma = 2;
register unsigned int y;
    /* This code assumes:
     *
     * 1) dst is 16-byte aligned
     * 2) dstStride is a multiple of 16
     * 3) width is a multiple of 16
     * 4) lum & chrom strides are multiples of 8
     */
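    /* Byte layout produced by the merges below, for the first pixels:
     *
     *     v_uv_a   = U0 V0 U1 V1 ...
     *     v_yuy2_0 = Y0 U0 Y1 V0 Y2 U1 Y3 V1 ...
     *
     * which is exactly the YUYV/YUY2 ordering. */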
for (y = 0; y < height; y++) {
int i;
for (i = 0; i < width - 31; i += 32) {
const unsigned int j = i >> 1;
vector unsigned char v_yA = vec_ld(i, ysrc);
vector unsigned char v_yB = vec_ld(i + 16, ysrc);
vector unsigned char v_yC = vec_ld(i + 32, ysrc);
vector unsigned char v_y1 = vec_perm(v_yA, v_yB, yperm);
vector unsigned char v_y2 = vec_perm(v_yB, v_yC, yperm);
vector unsigned char v_uA = vec_ld(j, usrc);
vector unsigned char v_uB = vec_ld(j + 16, usrc);
vector unsigned char v_u = vec_perm(v_uA, v_uB, vec_lvsl(j, usrc));
vector unsigned char v_vA = vec_ld(j, vsrc);
vector unsigned char v_vB = vec_ld(j + 16, vsrc);
vector unsigned char v_v = vec_perm(v_vA, v_vB, vec_lvsl(j, vsrc));
vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
vector unsigned char v_uv_b = vec_mergel(v_u, v_v);
vector unsigned char v_yuy2_0 = vec_mergeh(v_y1, v_uv_a);
vector unsigned char v_yuy2_1 = vec_mergel(v_y1, v_uv_a);
vector unsigned char v_yuy2_2 = vec_mergeh(v_y2, v_uv_b);
vector unsigned char v_yuy2_3 = vec_mergel(v_y2, v_uv_b);
vec_st(v_yuy2_0, (i << 1), dst);
vec_st(v_yuy2_1, (i << 1) + 16, dst);
vec_st(v_yuy2_2, (i << 1) + 32, dst);
vec_st(v_yuy2_3, (i << 1) + 48, dst);
}
if (i < width) {
const unsigned int j = i >> 1;
vector unsigned char v_y1 = vec_ld(i, ysrc);
vector unsigned char v_u = vec_ld(j, usrc);
vector unsigned char v_v = vec_ld(j, vsrc);
vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
vector unsigned char v_yuy2_0 = vec_mergeh(v_y1, v_uv_a);
vector unsigned char v_yuy2_1 = vec_mergel(v_y1, v_uv_a);
vec_st(v_yuy2_0, (i << 1), dst);
vec_st(v_yuy2_1, (i << 1) + 16, dst);
}
if ((y & (vertLumPerChroma - 1)) == vertLumPerChroma - 1) {
usrc += chromStride;
vsrc += chromStride;
}
ysrc += lumStride;
dst += dstStride;
}
return srcSliceH;
}
static int yv12touyvy_unscaled_altivec(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY,
int srcSliceH, uint8_t *dstParam[],
int dstStride_a[])
{
uint8_t *dst = dstParam[0] + dstStride_a[0] * srcSliceY;
// yv12toyuy2(src[0], src[1], src[2], dst, c->srcW, srcSliceH,
// srcStride[0], srcStride[1], dstStride[0]);
const uint8_t *ysrc = src[0];
const uint8_t *usrc = src[1];
const uint8_t *vsrc = src[2];
const int width = c->srcW;
const int height = srcSliceH;
const int lumStride = srcStride[0];
const int chromStride = srcStride[1];
const int dstStride = dstStride_a[0];
const int vertLumPerChroma = 2;
const vector unsigned char yperm = vec_lvsl(0, ysrc);
register unsigned int y;
    /* This code assumes:
     *
     * 1) dst is 16-byte aligned
     * 2) dstStride is a multiple of 16
     * 3) width is a multiple of 16
     * 4) lum & chrom strides are multiples of 8
     */
for (y = 0; y < height; y++) {
int i;
for (i = 0; i < width - 31; i += 32) {
const unsigned int j = i >> 1;
vector unsigned char v_yA = vec_ld(i, ysrc);
vector unsigned char v_yB = vec_ld(i + 16, ysrc);
vector unsigned char v_yC = vec_ld(i + 32, ysrc);
vector unsigned char v_y1 = vec_perm(v_yA, v_yB, yperm);
vector unsigned char v_y2 = vec_perm(v_yB, v_yC, yperm);
vector unsigned char v_uA = vec_ld(j, usrc);
vector unsigned char v_uB = vec_ld(j + 16, usrc);
vector unsigned char v_u = vec_perm(v_uA, v_uB, vec_lvsl(j, usrc));
vector unsigned char v_vA = vec_ld(j, vsrc);
vector unsigned char v_vB = vec_ld(j + 16, vsrc);
vector unsigned char v_v = vec_perm(v_vA, v_vB, vec_lvsl(j, vsrc));
vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
vector unsigned char v_uv_b = vec_mergel(v_u, v_v);
vector unsigned char v_uyvy_0 = vec_mergeh(v_uv_a, v_y1);
vector unsigned char v_uyvy_1 = vec_mergel(v_uv_a, v_y1);
vector unsigned char v_uyvy_2 = vec_mergeh(v_uv_b, v_y2);
vector unsigned char v_uyvy_3 = vec_mergel(v_uv_b, v_y2);
vec_st(v_uyvy_0, (i << 1), dst);
vec_st(v_uyvy_1, (i << 1) + 16, dst);
vec_st(v_uyvy_2, (i << 1) + 32, dst);
vec_st(v_uyvy_3, (i << 1) + 48, dst);
}
if (i < width) {
const unsigned int j = i >> 1;
vector unsigned char v_y1 = vec_ld(i, ysrc);
vector unsigned char v_u = vec_ld(j, usrc);
vector unsigned char v_v = vec_ld(j, vsrc);
vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
vector unsigned char v_uyvy_0 = vec_mergeh(v_uv_a, v_y1);
vector unsigned char v_uyvy_1 = vec_mergel(v_uv_a, v_y1);
vec_st(v_uyvy_0, (i << 1), dst);
vec_st(v_uyvy_1, (i << 1) + 16, dst);
}
if ((y & (vertLumPerChroma - 1)) == vertLumPerChroma - 1) {
usrc += chromStride;
vsrc += chromStride;
}
ysrc += lumStride;
dst += dstStride;
}
return srcSliceH;
}
#endif /* HAVE_ALTIVEC */
av_cold void ff_get_unscaled_swscale_ppc(SwsContext *c)
{
#if HAVE_ALTIVEC
if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
return;
if (!(c->srcW & 15) && !(c->flags & SWS_BITEXACT) &&
c->srcFormat == AV_PIX_FMT_YUV420P) {
enum AVPixelFormat dstFormat = c->dstFormat;
// unscaled YV12 -> packed YUV, we want speed
if (dstFormat == AV_PIX_FMT_YUYV422)
c->swscale = yv12toyuy2_unscaled_altivec;
else if (dstFormat == AV_PIX_FMT_UYVY422)
c->swscale = yv12touyvy_unscaled_altivec;
}
#endif /* HAVE_ALTIVEC */
}

View File

@@ -0,0 +1,393 @@
/*
* software RGB to RGB converter
 * plus a software PAL8 to RGB converter
* software YUV to YUV converter
* software YUV to RGB converter
* Written by Nick Kurshev.
* palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <inttypes.h>
#include "libavutil/attributes.h"
#include "libavutil/bswap.h"
#include "config.h"
#include "rgb2rgb.h"
#include "swscale.h"
#include "swscale_internal.h"
void (*rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size);
void (*shuffle_bytes_0321)(const uint8_t *src, uint8_t *dst, int src_size);
void (*shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size);
void (*yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
void (*yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
void (*yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
void (*yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
void (*yuy2toyv12)(const uint8_t *src, uint8_t *ydst,
uint8_t *udst, uint8_t *vdst,
int width, int height,
int lumStride, int chromStride, int srcStride);
void (*ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst,
uint8_t *udst, uint8_t *vdst,
int width, int height,
int lumStride, int chromStride, int srcStride,
int32_t *rgb2yuv);
void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height,
int srcStride, int dstStride);
void (*interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dst,
int width, int height, int src1Stride,
int src2Stride, int dstStride);
void (*deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t *dst2,
int width, int height, int srcStride,
int dst1Stride, int dst2Stride);
void (*vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
uint8_t *dst1, uint8_t *dst2,
int width, int height,
int srcStride1, int srcStride2,
int dstStride1, int dstStride2);
void (*yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2,
const uint8_t *src3, uint8_t *dst,
int width, int height,
int srcStride1, int srcStride2,
int srcStride3, int dstStride);
void (*uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride);
void (*uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride);
void (*yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride);
void (*yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride);
#define BY ((int)( 0.098 * (1 << RGB2YUV_SHIFT) + 0.5))
#define BV ((int)(-0.071 * (1 << RGB2YUV_SHIFT) + 0.5))
#define BU ((int)( 0.439 * (1 << RGB2YUV_SHIFT) + 0.5))
#define GY ((int)( 0.504 * (1 << RGB2YUV_SHIFT) + 0.5))
#define GV ((int)(-0.368 * (1 << RGB2YUV_SHIFT) + 0.5))
#define GU ((int)(-0.291 * (1 << RGB2YUV_SHIFT) + 0.5))
#define RY ((int)( 0.257 * (1 << RGB2YUV_SHIFT) + 0.5))
#define RV ((int)( 0.439 * (1 << RGB2YUV_SHIFT) + 0.5))
#define RU ((int)(-0.148 * (1 << RGB2YUV_SHIFT) + 0.5))
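/* These are the ITU-R BT.601 "studio swing" coefficients in fixed point;
 * a converter built on them forms luma along the lines of (a sketch, not
 * a verbatim copy of any routine here):
 *
 *     y = ((RY * r + GY * g + BY * b) >> RGB2YUV_SHIFT) + 16;
 *
 * so r = g = b = 255 yields y = 235, the top of the video range. */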
//plain C versions
#include "rgb2rgb_template.c"
/*
 * RGB15->RGB16 original by Strepto/Astral,
 * ported to gcc & bugfixed: A'rpi;
 * MMXEXT, 3DNOW optimization by Nick Kurshev;
 * 32-bit C version and the and&add trick by Michael Niedermayer
*/
av_cold void sws_rgb2rgb_init(void)
{
rgb2rgb_init_c();
if (ARCH_X86)
rgb2rgb_init_x86();
}
void rgb32to24(const uint8_t *src, uint8_t *dst, int src_size)
{
int i, num_pixels = src_size >> 2;
for (i = 0; i < num_pixels; i++) {
#if HAVE_BIGENDIAN
/* RGB32 (= A,B,G,R) -> BGR24 (= B,G,R) */
dst[3 * i + 0] = src[4 * i + 1];
dst[3 * i + 1] = src[4 * i + 2];
dst[3 * i + 2] = src[4 * i + 3];
#else
dst[3 * i + 0] = src[4 * i + 2];
dst[3 * i + 1] = src[4 * i + 1];
dst[3 * i + 2] = src[4 * i + 0];
#endif
}
}
void rgb24to32(const uint8_t *src, uint8_t *dst, int src_size)
{
int i;
for (i = 0; 3 * i < src_size; i++) {
#if HAVE_BIGENDIAN
/* RGB24 (= R, G, B) -> BGR32 (= A, R, G, B) */
dst[4 * i + 0] = 255;
dst[4 * i + 1] = src[3 * i + 0];
dst[4 * i + 2] = src[3 * i + 1];
dst[4 * i + 3] = src[3 * i + 2];
#else
dst[4 * i + 0] = src[3 * i + 2];
dst[4 * i + 1] = src[3 * i + 1];
dst[4 * i + 2] = src[3 * i + 0];
dst[4 * i + 3] = 255;
#endif
}
}
void rgb16tobgr32(const uint8_t *src, uint8_t *dst, int src_size)
{
uint8_t *d = dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
while (s < end) {
register uint16_t bgr = *s++;
#if HAVE_BIGENDIAN
*d++ = 255;
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
*d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
#else
*d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
*d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = 255;
#endif
}
}
void rgb12to15(const uint8_t *src, uint8_t *dst, int src_size)
{
uint16_t rgb, r, g, b;
uint16_t *d = (uint16_t *)dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
while (s < end) {
rgb = *s++;
r = rgb & 0xF00;
g = rgb & 0x0F0;
b = rgb & 0x00F;
r = (r << 3) | ((r & 0x800) >> 1);
g = (g << 2) | ((g & 0x080) >> 2);
b = (b << 1) | ( b >> 3);
*d++ = r | g | b;
}
}
void rgb16to24(const uint8_t *src, uint8_t *dst, int src_size)
{
uint8_t *d = dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
while (s < end) {
register uint16_t bgr = *s++;
*d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
*d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
}
}
void rgb16tobgr16(const uint8_t *src, uint8_t *dst, int src_size)
{
int i, num_pixels = src_size >> 1;
for (i = 0; i < num_pixels; i++) {
unsigned rgb = ((const uint16_t *)src)[i];
((uint16_t *)dst)[i] = (rgb >> 11) | (rgb & 0x7E0) | (rgb << 11);
}
}
void rgb16tobgr15(const uint8_t *src, uint8_t *dst, int src_size)
{
int i, num_pixels = src_size >> 1;
for (i = 0; i < num_pixels; i++) {
unsigned rgb = ((const uint16_t *)src)[i];
((uint16_t *)dst)[i] = (rgb >> 11) | ((rgb & 0x7C0) >> 1) | ((rgb & 0x1F) << 10);
}
}
void rgb15tobgr32(const uint8_t *src, uint8_t *dst, int src_size)
{
uint8_t *d = dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
while (s < end) {
register uint16_t bgr = *s++;
#if HAVE_BIGENDIAN
*d++ = 255;
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
*d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
#else
*d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
*d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = 255;
#endif
}
}
void rgb15to24(const uint8_t *src, uint8_t *dst, int src_size)
{
uint8_t *d = dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
while (s < end) {
register uint16_t bgr = *s++;
*d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
*d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
}
}
void rgb15tobgr16(const uint8_t *src, uint8_t *dst, int src_size)
{
int i, num_pixels = src_size >> 1;
for (i = 0; i < num_pixels; i++) {
unsigned rgb = ((const uint16_t *)src)[i];
((uint16_t *)dst)[i] = ((rgb & 0x7C00) >> 10) | ((rgb & 0x3E0) << 1) | (rgb << 11);
}
}
void rgb15tobgr15(const uint8_t *src, uint8_t *dst, int src_size)
{
int i, num_pixels = src_size >> 1;
for (i = 0; i < num_pixels; i++) {
unsigned rgb = ((const uint16_t *)src)[i];
unsigned br = rgb & 0x7C1F;
((uint16_t *)dst)[i] = (br >> 10) | (rgb & 0x3E0) | (br << 10);
}
}
void rgb12tobgr12(const uint8_t *src, uint8_t *dst, int src_size)
{
uint16_t *d = (uint16_t *)dst;
uint16_t *s = (uint16_t *)src;
int i, num_pixels = src_size >> 1;
for (i = 0; i < num_pixels; i++) {
unsigned rgb = s[i];
d[i] = (rgb << 8 | rgb & 0xF0 | rgb >> 8) & 0xFFF;
}
}
#define DEFINE_SHUFFLE_BYTES(a, b, c, d) \
void shuffle_bytes_ ## a ## b ## c ## d(const uint8_t *src, \
uint8_t *dst, int src_size) \
{ \
int i; \
\
for (i = 0; i < src_size; i += 4) { \
dst[i + 0] = src[i + a]; \
dst[i + 1] = src[i + b]; \
dst[i + 2] = src[i + c]; \
dst[i + 3] = src[i + d]; \
} \
}
DEFINE_SHUFFLE_BYTES(1, 2, 3, 0)
DEFINE_SHUFFLE_BYTES(3, 0, 1, 2)
DEFINE_SHUFFLE_BYTES(3, 2, 1, 0)
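/* Usage sketch: each generated shuffle maps 4-byte groups by index, e.g.
 * shuffle_bytes_3210() reverses the bytes of every pixel (RGBA <-> ABGR):
 *
 *     uint8_t in[4] = { 'R', 'G', 'B', 'A' }, out[4];
 *     shuffle_bytes_3210(in, out, 4);    // out = { 'A', 'B', 'G', 'R' }
 */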
#define DEFINE_RGB48TOBGR48(need_bswap, swap) \
void rgb48tobgr48_ ## need_bswap(const uint8_t *src, \
uint8_t *dst, int src_size) \
{ \
uint16_t *d = (uint16_t *)dst; \
uint16_t *s = (uint16_t *)src; \
int i, num_pixels = src_size >> 1; \
\
for (i = 0; i < num_pixels; i += 3) { \
d[i ] = swap ? av_bswap16(s[i + 2]) : s[i + 2]; \
d[i + 1] = swap ? av_bswap16(s[i + 1]) : s[i + 1]; \
d[i + 2] = swap ? av_bswap16(s[i ]) : s[i ]; \
} \
}
DEFINE_RGB48TOBGR48(nobswap, 0)
DEFINE_RGB48TOBGR48(bswap, 1)
#define DEFINE_RGB64TOBGR48(need_bswap, swap) \
void rgb64tobgr48_ ## need_bswap(const uint8_t *src, \
uint8_t *dst, int src_size) \
{ \
uint16_t *d = (uint16_t *)dst; \
uint16_t *s = (uint16_t *)src; \
int i, num_pixels = src_size >> 3; \
\
for (i = 0; i < num_pixels; i++) { \
d[3 * i ] = swap ? av_bswap16(s[4 * i + 2]) : s[4 * i + 2]; \
d[3 * i + 1] = swap ? av_bswap16(s[4 * i + 1]) : s[4 * i + 1]; \
d[3 * i + 2] = swap ? av_bswap16(s[4 * i ]) : s[4 * i ]; \
} \
}
DEFINE_RGB64TOBGR48(nobswap, 0)
DEFINE_RGB64TOBGR48(bswap, 1)
#define DEFINE_RGB64TO48(need_bswap, swap) \
void rgb64to48_ ## need_bswap(const uint8_t *src, \
uint8_t *dst, int src_size) \
{ \
uint16_t *d = (uint16_t *)dst; \
uint16_t *s = (uint16_t *)src; \
int i, num_pixels = src_size >> 3; \
\
for (i = 0; i < num_pixels; i++) { \
d[3 * i ] = swap ? av_bswap16(s[4 * i ]) : s[4 * i ]; \
d[3 * i + 1] = swap ? av_bswap16(s[4 * i + 1]) : s[4 * i + 1]; \
d[3 * i + 2] = swap ? av_bswap16(s[4 * i + 2]) : s[4 * i + 2]; \
} \
}
DEFINE_RGB64TO48(nobswap, 0)
DEFINE_RGB64TO48(bswap, 1)


@@ -0,0 +1,171 @@
/*
* software RGB to RGB converter
 * plus a software PAL8 to RGB converter
* Software YUV to YUV converter
* Software YUV to RGB converter
* Written by Nick Kurshev.
* YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef SWSCALE_RGB2RGB_H
#define SWSCALE_RGB2RGB_H
#include <inttypes.h>
#include "libavutil/avutil.h"
#include "swscale.h"
/* A full collection of RGB to RGB(BGR) converters */
extern void (*rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*shuffle_bytes_0321)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size);
void rgb64tobgr48_nobswap(const uint8_t *src, uint8_t *dst, int src_size);
void rgb64tobgr48_bswap(const uint8_t *src, uint8_t *dst, int src_size);
void rgb48tobgr48_nobswap(const uint8_t *src, uint8_t *dst, int src_size);
void rgb48tobgr48_bswap(const uint8_t *src, uint8_t *dst, int src_size);
void rgb64to48_nobswap(const uint8_t *src, uint8_t *dst, int src_size);
void rgb64to48_bswap(const uint8_t *src, uint8_t *dst, int src_size);
void rgb24to32(const uint8_t *src, uint8_t *dst, int src_size);
void rgb32to24(const uint8_t *src, uint8_t *dst, int src_size);
void rgb16tobgr32(const uint8_t *src, uint8_t *dst, int src_size);
void rgb16to24(const uint8_t *src, uint8_t *dst, int src_size);
void rgb16tobgr16(const uint8_t *src, uint8_t *dst, int src_size);
void rgb16tobgr15(const uint8_t *src, uint8_t *dst, int src_size);
void rgb15tobgr32(const uint8_t *src, uint8_t *dst, int src_size);
void rgb15to24(const uint8_t *src, uint8_t *dst, int src_size);
void rgb15tobgr16(const uint8_t *src, uint8_t *dst, int src_size);
void rgb15tobgr15(const uint8_t *src, uint8_t *dst, int src_size);
void rgb12tobgr12(const uint8_t *src, uint8_t *dst, int src_size);
void rgb12to15(const uint8_t *src, uint8_t *dst, int src_size);
void shuffle_bytes_1230(const uint8_t *src, uint8_t *dst, int src_size);
void shuffle_bytes_3012(const uint8_t *src, uint8_t *dst, int src_size);
void shuffle_bytes_3210(const uint8_t *src, uint8_t *dst, int src_size);
void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
uint8_t *vdst, int width, int height, int lumStride,
int chromStride, int srcStride, int32_t *rgb2yuv);
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
extern void (*yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
/**
* Width should be a multiple of 16.
*/
extern void (*yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
extern void (*yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
int width, int height,
int lumStride, int chromStride, int srcStride);
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
extern void (*yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
/**
* Width should be a multiple of 16.
*/
extern void (*yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
/**
* Height should be a multiple of 2 and width should be a multiple of 2.
* (If this is a problem for anyone then tell me, and I will fix it.)
* Chrominance data is only taken from every second line, others are ignored.
* FIXME: Write high quality version.
*/
extern void (*ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
int width, int height,
int lumStride, int chromStride, int srcStride,
int32_t *rgb2yuv);
extern void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height,
int srcStride, int dstStride);
extern void (*interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dst,
int width, int height, int src1Stride,
int src2Stride, int dstStride);
extern void (*deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t *dst2,
int width, int height, int srcStride,
int dst1Stride, int dst2Stride);
extern void (*vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
uint8_t *dst1, uint8_t *dst2,
int width, int height,
int srcStride1, int srcStride2,
int dstStride1, int dstStride2);
extern void (*yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
uint8_t *dst,
int width, int height,
int srcStride1, int srcStride2,
int srcStride3, int dstStride);
extern void (*uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
int width, int height,
int lumStride, int chromStride, int srcStride);
extern void (*uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
int width, int height,
int lumStride, int chromStride, int srcStride);
extern void (*yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
int width, int height,
int lumStride, int chromStride, int srcStride);
extern void (*yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
int width, int height,
int lumStride, int chromStride, int srcStride);
void sws_rgb2rgb_init(void);
void rgb2rgb_init_x86(void);
#endif /* SWSCALE_RGB2RGB_H */


@@ -0,0 +1,971 @@
/*
* software RGB to RGB converter
 * plus a software PAL8 to RGB converter
* software YUV to YUV converter
* software YUV to RGB converter
* Written by Nick Kurshev.
* palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
* lot of big-endian byte order fixes by Alex Beregszaszi
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stddef.h>
#include "libavutil/attributes.h"
static inline void rgb24tobgr32_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint8_t *dest = dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
#if HAVE_BIGENDIAN
/* RGB24 (= R, G, B) -> RGB32 (= A, B, G, R) */
*dest++ = 255;
*dest++ = s[2];
*dest++ = s[1];
*dest++ = s[0];
s += 3;
#else
*dest++ = *s++;
*dest++ = *s++;
*dest++ = *s++;
*dest++ = 255;
#endif
}
}
static inline void rgb32tobgr24_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint8_t *dest = dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
#if HAVE_BIGENDIAN
/* RGB32 (= A, B, G, R) -> RGB24 (= R, G, B) */
s++;
dest[2] = *s++;
dest[1] = *s++;
dest[0] = *s++;
dest += 3;
#else
*dest++ = *s++;
*dest++ = *s++;
*dest++ = *s++;
s++;
#endif
}
}
/*
* original by Strepto/Astral
* ported to gcc & bugfixed: A'rpi
* MMXEXT, 3DNOW optimization by Nick Kurshev
* 32-bit C version and the and&add trick by Michael Niedermayer
*/
static inline void rgb15to16_c(const uint8_t *src, uint8_t *dst, int src_size)
{
register uint8_t *d = dst;
register const uint8_t *s = src;
register const uint8_t *end = s + src_size;
const uint8_t *mm_end = end - 3;
while (s < mm_end) {
register unsigned x = *((const uint32_t *)s);
*((uint32_t *)d) = (x & 0x7FFF7FFF) + (x & 0x7FE07FE0);
d += 4;
s += 4;
}
if (s < end) {
register unsigned short x = *((const uint16_t *)s);
*((uint16_t *)d) = (x & 0x7FFF) + (x & 0x7FE0);
}
}
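/*
* Worked example of the and&add trick above (illustrative only): RGB15
* holds 0RRRRRGGGGGBBBBB and RGB16 wants RRRRRGGGGGGBBBBB. Adding
* (x & 0x7FE0) to x doubles the R and G fields in place, shifting them up
* one bit while leaving B alone; e.g. white, 0x7FFF + 0x7FE0 = 0xFFDF
* (the new green LSB is 0). In the 32-bit loop the masks clear bit 15 of
* each 16-bit half, so the per-pixel sums cannot carry into the
* neighboring pixel.
*/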
static inline void rgb16to15_c(const uint8_t *src, uint8_t *dst, int src_size)
{
register uint8_t *d = dst;
register const uint8_t *s = src;
register const uint8_t *end = s + src_size;
const uint8_t *mm_end = end - 3;
while (s < mm_end) {
register uint32_t x = *((const uint32_t *)s);
*((uint32_t *)d) = ((x >> 1) & 0x7FE07FE0) | (x & 0x001F001F);
s += 4;
d += 4;
}
if (s < end) {
register uint16_t x = *((const uint16_t *)s);
*((uint16_t *)d) = ((x >> 1) & 0x7FE0) | (x & 0x001F);
}
}
static inline void rgb32to16_c(const uint8_t *src, uint8_t *dst, int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
register int rgb = *(const uint32_t *)s;
s += 4;
*d++ = ((rgb & 0xFF) >> 3) +
((rgb & 0xFC00) >> 5) +
((rgb & 0xF80000) >> 8);
}
}
static inline void rgb32tobgr16_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
register int rgb = *(const uint32_t *)s;
s += 4;
*d++ = ((rgb & 0xF8) << 8) +
((rgb & 0xFC00) >> 5) +
((rgb & 0xF80000) >> 19);
}
}
static inline void rgb32to15_c(const uint8_t *src, uint8_t *dst, int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
register int rgb = *(const uint32_t *)s;
s += 4;
*d++ = ((rgb & 0xFF) >> 3) +
((rgb & 0xF800) >> 6) +
((rgb & 0xF80000) >> 9);
}
}
static inline void rgb32tobgr15_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
register int rgb = *(const uint32_t *)s;
s += 4;
*d++ = ((rgb & 0xF8) << 7) +
((rgb & 0xF800) >> 6) +
((rgb & 0xF80000) >> 19);
}
}
static inline void rgb24tobgr16_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
const int b = *s++;
const int g = *s++;
const int r = *s++;
*d++ = (b >> 3) | ((g & 0xFC) << 3) | ((r & 0xF8) << 8);
}
}
static inline void rgb24to16_c(const uint8_t *src, uint8_t *dst, int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
const int r = *s++;
const int g = *s++;
const int b = *s++;
*d++ = (b >> 3) | ((g & 0xFC) << 3) | ((r & 0xF8) << 8);
}
}
static inline void rgb24tobgr15_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
const int b = *s++;
const int g = *s++;
const int r = *s++;
*d++ = (b >> 3) | ((g & 0xF8) << 2) | ((r & 0xF8) << 7);
}
}
static inline void rgb24to15_c(const uint8_t *src, uint8_t *dst, int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
while (s < end) {
const int r = *s++;
const int g = *s++;
const int b = *s++;
*d++ = (b >> 3) | ((g & 0xF8) << 2) | ((r & 0xF8) << 7);
}
}
static inline void rgb15tobgr24_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint8_t *d = dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
while (s < end) {
register uint16_t bgr = *s++;
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
*d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
}
}
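/*
* The 5-to-8-bit expansion above ORs two shifted copies of the same field
* so that full scale maps to full scale: 0x1F -> (0x1F << 3) | (0x1F >> 2)
* = 0xF8 | 0x07 = 0xFF. The 6-bit green field in the RGB16 variants below
* is widened the same way with >>3 and >>9.
*/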
static inline void rgb16tobgr24_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint8_t *d = (uint8_t *)dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
while (s < end) {
register uint16_t bgr = *s++;
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
*d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
}
}
static inline void rgb15to32_c(const uint8_t *src, uint8_t *dst, int src_size)
{
uint8_t *d = dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
while (s < end) {
register uint16_t bgr = *s++;
#if HAVE_BIGENDIAN
*d++ = 255;
*d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
*d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
#else
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
*d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
*d++ = 255;
#endif
}
}
static inline void rgb16to32_c(const uint8_t *src, uint8_t *dst, int src_size)
{
uint8_t *d = dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
while (s < end) {
register uint16_t bgr = *s++;
#if HAVE_BIGENDIAN
*d++ = 255;
*d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
*d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
#else
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
*d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
*d++ = 255;
#endif
}
}
static inline void shuffle_bytes_2103_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
int idx = 15 - src_size;
const uint8_t *s = src - idx;
uint8_t *d = dst - idx;
for (; idx < 15; idx += 4) {
register unsigned v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
v &= 0xff00ff;
*(uint32_t *)&d[idx] = (v >> 16) + g + (v << 16);
}
}
static inline void shuffle_bytes_0321_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
int idx = 15 - src_size;
const uint8_t *s = src - idx;
uint8_t *d = dst - idx;
for (; idx < 15; idx += 4) {
register unsigned v = *(const uint32_t *)&s[idx], g = v & 0x00ff00ff;
v &= 0xff00ff00;
*(uint32_t *)&d[idx] = (v >> 16) + g + (v << 16);
}
}
static inline void rgb24tobgr24_c(const uint8_t *src, uint8_t *dst, int src_size)
{
unsigned i;
for (i = 0; i < src_size; i += 3) {
register uint8_t x = src[i + 2];
dst[i + 1] = src[i + 1];
dst[i + 2] = src[i + 0];
dst[i + 0] = x;
}
}
static inline void yuvPlanartoyuy2_c(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride,
int dstStride, int vertLumPerChroma)
{
int y, i;
const int chromWidth = width >> 1;
for (y = 0; y < height; y++) {
#if HAVE_FAST_64BIT
uint64_t *ldst = (uint64_t *)dst;
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
for (i = 0; i < chromWidth; i += 2) {
uint64_t k = yc[0] + (uc[0] << 8) +
(yc[1] << 16) + ((unsigned) vc[0] << 24);
uint64_t l = yc[2] + (uc[1] << 8) +
(yc[3] << 16) + ((unsigned) vc[1] << 24);
*ldst++ = k + (l << 32);
yc += 4;
uc += 2;
vc += 2;
}
#else
int *idst = (int32_t *)dst;
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
for (i = 0; i < chromWidth; i++) {
#if HAVE_BIGENDIAN
*idst++ = (yc[0] << 24) + (uc[0] << 16) +
(yc[1] << 8) + (vc[0] << 0);
#else
*idst++ = yc[0] + (uc[0] << 8) +
(yc[1] << 16) + (vc[0] << 24);
#endif
yc += 2;
uc++;
vc++;
}
#endif
if ((y & (vertLumPerChroma - 1)) == vertLumPerChroma - 1) {
usrc += chromStride;
vsrc += chromStride;
}
ysrc += lumStride;
dst += dstStride;
}
}
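/*
* The packer above emits YUYV byte order (Y0 U Y1 V); yuvPlanartouyvy_c
* below emits UYVY (U Y0 V Y1). vertLumPerChroma is the number of luma
* lines sharing one chroma line: 2 for 4:2:0 input, 1 for 4:2:2.
*/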
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
static inline void yv12toyuy2_c(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height, int lumStride,
int chromStride, int dstStride)
{
//FIXME interpolate chroma
yuvPlanartoyuy2_c(ysrc, usrc, vsrc, dst, width, height, lumStride,
chromStride, dstStride, 2);
}
static inline void yuvPlanartouyvy_c(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride,
int dstStride, int vertLumPerChroma)
{
int y, i;
const int chromWidth = width >> 1;
for (y = 0; y < height; y++) {
#if HAVE_FAST_64BIT
uint64_t *ldst = (uint64_t *)dst;
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
for (i = 0; i < chromWidth; i += 2) {
uint64_t k = uc[0] + (yc[0] << 8) +
(vc[0] << 16) + ((unsigned) yc[1] << 24);
uint64_t l = uc[1] + (yc[2] << 8) +
(vc[1] << 16) + ((unsigned) yc[3] << 24);
*ldst++ = k + (l << 32);
yc += 4;
uc += 2;
vc += 2;
}
#else
int *idst = (int32_t *)dst;
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
for (i = 0; i < chromWidth; i++) {
#if HAVE_BIGENDIAN
*idst++ = (uc[0] << 24) + (yc[0] << 16) +
(vc[0] << 8) + (yc[1] << 0);
#else
*idst++ = uc[0] + (yc[0] << 8) +
(vc[0] << 16) + (yc[1] << 24);
#endif
yc += 2;
uc++;
vc++;
}
#endif
if ((y & (vertLumPerChroma - 1)) == vertLumPerChroma - 1) {
usrc += chromStride;
vsrc += chromStride;
}
ysrc += lumStride;
dst += dstStride;
}
}
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
static inline void yv12touyvy_c(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height, int lumStride,
int chromStride, int dstStride)
{
//FIXME interpolate chroma
yuvPlanartouyvy_c(ysrc, usrc, vsrc, dst, width, height, lumStride,
chromStride, dstStride, 2);
}
/**
* Width should be a multiple of 16.
*/
static inline void yuv422ptouyvy_c(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height, int lumStride,
int chromStride, int dstStride)
{
yuvPlanartouyvy_c(ysrc, usrc, vsrc, dst, width, height, lumStride,
chromStride, dstStride, 1);
}
/**
* Width should be a multiple of 16.
*/
static inline void yuv422ptoyuy2_c(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height, int lumStride,
int chromStride, int dstStride)
{
yuvPlanartoyuy2_c(ysrc, usrc, vsrc, dst, width, height, lumStride,
chromStride, dstStride, 1);
}
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
static inline void yuy2toyv12_c(const uint8_t *src, uint8_t *ydst,
uint8_t *udst, uint8_t *vdst,
int width, int height, int lumStride,
int chromStride, int srcStride)
{
int y;
const int chromWidth = width >> 1;
for (y = 0; y < height; y += 2) {
int i;
for (i = 0; i < chromWidth; i++) {
ydst[2 * i + 0] = src[4 * i + 0];
udst[i] = src[4 * i + 1];
ydst[2 * i + 1] = src[4 * i + 2];
vdst[i] = src[4 * i + 3];
}
ydst += lumStride;
src += srcStride;
for (i = 0; i < chromWidth; i++) {
ydst[2 * i + 0] = src[4 * i + 0];
ydst[2 * i + 1] = src[4 * i + 2];
}
udst += chromStride;
vdst += chromStride;
ydst += lumStride;
src += srcStride;
}
}
static inline void planar2x_c(const uint8_t *src, uint8_t *dst, int srcWidth,
int srcHeight, int srcStride, int dstStride)
{
int x, y;
dst[0] = src[0];
// first line
for (x = 0; x < srcWidth - 1; x++) {
dst[2 * x + 1] = (3 * src[x] + src[x + 1]) >> 2;
dst[2 * x + 2] = (src[x] + 3 * src[x + 1]) >> 2;
}
dst[2 * srcWidth - 1] = src[srcWidth - 1];
dst += dstStride;
for (y = 1; y < srcHeight; y++) {
const int mmxSize = 1;
dst[0] = (src[0] * 3 + src[srcStride]) >> 2;
dst[dstStride] = (src[0] + 3 * src[srcStride]) >> 2;
for (x = mmxSize - 1; x < srcWidth - 1; x++) {
dst[2 * x + 1] = (src[x + 0] * 3 + src[x + srcStride + 1]) >> 2;
dst[2 * x + dstStride + 2] = (src[x + 0] + 3 * src[x + srcStride + 1]) >> 2;
dst[2 * x + dstStride + 1] = (src[x + 1] + 3 * src[x + srcStride]) >> 2;
dst[2 * x + 2] = (src[x + 1] * 3 + src[x + srcStride]) >> 2;
}
dst[srcWidth * 2 - 1] = (src[srcWidth - 1] * 3 + src[srcWidth - 1 + srcStride]) >> 2;
dst[srcWidth * 2 - 1 + dstStride] = (src[srcWidth - 1] + 3 * src[srcWidth - 1 + srcStride]) >> 2;
dst += dstStride * 2;
src += srcStride;
}
// last line
dst[0] = src[0];
for (x = 0; x < srcWidth - 1; x++) {
dst[2 * x + 1] = (src[x] * 3 + src[x + 1]) >> 2;
dst[2 * x + 2] = (src[x] + 3 * src[x + 1]) >> 2;
}
dst[2 * srcWidth - 1] = src[srcWidth - 1];
}
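/*
* planar2x_c upsamples a plane 2x in both directions with fixed 3:1
* bilinear weights: every output sample is (3*a + b) >> 2 of its two
* nearest source samples, so e.g. neighbors 100 and 200 yield 125 and 175.
*/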
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
* Chrominance data is only taken from every second line, others are ignored.
* FIXME: Write HQ version.
*/
static inline void uyvytoyv12_c(const uint8_t *src, uint8_t *ydst,
uint8_t *udst, uint8_t *vdst,
int width, int height, int lumStride,
int chromStride, int srcStride)
{
int y;
const int chromWidth = width >> 1;
for (y = 0; y < height; y += 2) {
int i;
for (i = 0; i < chromWidth; i++) {
udst[i] = src[4 * i + 0];
ydst[2 * i + 0] = src[4 * i + 1];
vdst[i] = src[4 * i + 2];
ydst[2 * i + 1] = src[4 * i + 3];
}
ydst += lumStride;
src += srcStride;
for (i = 0; i < chromWidth; i++) {
ydst[2 * i + 0] = src[4 * i + 1];
ydst[2 * i + 1] = src[4 * i + 3];
}
udst += chromStride;
vdst += chromStride;
ydst += lumStride;
src += srcStride;
}
}
/**
* Height should be a multiple of 2 and width should be a multiple of 2.
* (If this is a problem for anyone then tell me, and I will fix it.)
* Chrominance data is only taken from every second line,
* others are ignored in the C version.
* FIXME: Write HQ version.
*/
void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
uint8_t *vdst, int width, int height, int lumStride,
int chromStride, int srcStride, int32_t *rgb2yuv)
{
int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
int y;
const int chromWidth = width >> 1;
for (y = 0; y < height; y += 2) {
int i;
for (i = 0; i < chromWidth; i++) {
unsigned int b = src[6 * i + 0];
unsigned int g = src[6 * i + 1];
unsigned int r = src[6 * i + 2];
unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128;
unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128;
udst[i] = U;
vdst[i] = V;
ydst[2 * i] = Y;
b = src[6 * i + 3];
g = src[6 * i + 4];
r = src[6 * i + 5];
Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
ydst[2 * i + 1] = Y;
}
ydst += lumStride;
src += srcStride;
if (y+1 == height)
break;
for (i = 0; i < chromWidth; i++) {
unsigned int b = src[6 * i + 0];
unsigned int g = src[6 * i + 1];
unsigned int r = src[6 * i + 2];
unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
ydst[2 * i] = Y;
b = src[6 * i + 3];
g = src[6 * i + 4];
r = src[6 * i + 5];
Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
ydst[2 * i + 1] = Y;
}
udst += chromStride;
vdst += chromStride;
ydst += lumStride;
src += srcStride;
}
}
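/*
* Sketch of the fixed-point math above, assuming BT.601 coefficients and
* RGB2YUV_SHIFT == 15: ry + gy + by is about 219/255 * (1 << 15), so a
* white pixel (r = g = b = 255) gives Y = (sum * 255 >> 15) + 16 ~= 235,
* the top of the studio-swing luma range.
*/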
static void interleaveBytes_c(const uint8_t *src1, const uint8_t *src2,
uint8_t *dest, int width, int height,
int src1Stride, int src2Stride, int dstStride)
{
int h;
for (h = 0; h < height; h++) {
int w;
for (w = 0; w < width; w++) {
dest[2 * w + 0] = src1[w];
dest[2 * w + 1] = src2[w];
}
dest += dstStride;
src1 += src1Stride;
src2 += src2Stride;
}
}
static void deinterleaveBytes_c(const uint8_t *src, uint8_t *dst1, uint8_t *dst2,
int width, int height, int srcStride,
int dst1Stride, int dst2Stride)
{
int h;
for (h = 0; h < height; h++) {
int w;
for (w = 0; w < width; w++) {
dst1[w] = src[2 * w + 0];
dst2[w] = src[2 * w + 1];
}
src += srcStride;
dst1 += dst1Stride;
dst2 += dst2Stride;
}
}
static inline void vu9_to_vu12_c(const uint8_t *src1, const uint8_t *src2,
uint8_t *dst1, uint8_t *dst2,
int width, int height,
int srcStride1, int srcStride2,
int dstStride1, int dstStride2)
{
int x, y;
int w = width / 2;
int h = height / 2;
for (y = 0; y < h; y++) {
const uint8_t *s1 = src1 + srcStride1 * (y >> 1);
uint8_t *d = dst1 + dstStride1 * y;
for (x = 0; x < w; x++)
d[2 * x] = d[2 * x + 1] = s1[x];
}
for (y = 0; y < h; y++) {
const uint8_t *s2 = src2 + srcStride2 * (y >> 1);
uint8_t *d = dst2 + dstStride2 * y;
for (x = 0; x < w; x++)
d[2 * x] = d[2 * x + 1] = s2[x];
}
}
static inline void yvu9_to_yuy2_c(const uint8_t *src1, const uint8_t *src2,
const uint8_t *src3, uint8_t *dst,
int width, int height,
int srcStride1, int srcStride2,
int srcStride3, int dstStride)
{
int x, y;
int w = width / 2;
int h = height;
for (y = 0; y < h; y++) {
const uint8_t *yp = src1 + srcStride1 * y;
const uint8_t *up = src2 + srcStride2 * (y >> 2);
const uint8_t *vp = src3 + srcStride3 * (y >> 2);
uint8_t *d = dst + dstStride * y;
for (x = 0; x < w; x++) {
const int x2 = x << 2;
d[8 * x + 0] = yp[x2];
d[8 * x + 1] = up[x];
d[8 * x + 2] = yp[x2 + 1];
d[8 * x + 3] = vp[x];
d[8 * x + 4] = yp[x2 + 2];
d[8 * x + 5] = up[x];
d[8 * x + 6] = yp[x2 + 3];
d[8 * x + 7] = vp[x];
}
}
}
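/*
* The extract_* helpers below walk the buffers with a negative index that
* counts up to zero, so a single induction variable serves as both offset
* and loop condition; presumably this helps older compilers keep the loop
* tight.
*/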
static void extract_even_c(const uint8_t *src, uint8_t *dst, int count)
{
dst += count;
src += count * 2;
count = -count;
while (count < 0) {
dst[count] = src[2 * count];
count++;
}
}
static void extract_even2_c(const uint8_t *src, uint8_t *dst0, uint8_t *dst1,
int count)
{
dst0 += count;
dst1 += count;
src += count * 4;
count = -count;
while (count < 0) {
dst0[count] = src[4 * count + 0];
dst1[count] = src[4 * count + 2];
count++;
}
}
static void extract_even2avg_c(const uint8_t *src0, const uint8_t *src1,
uint8_t *dst0, uint8_t *dst1, int count)
{
dst0 += count;
dst1 += count;
src0 += count * 4;
src1 += count * 4;
count = -count;
while (count < 0) {
dst0[count] = (src0[4 * count + 0] + src1[4 * count + 0]) >> 1;
dst1[count] = (src0[4 * count + 2] + src1[4 * count + 2]) >> 1;
count++;
}
}
static void extract_odd2_c(const uint8_t *src, uint8_t *dst0, uint8_t *dst1,
int count)
{
dst0 += count;
dst1 += count;
src += count * 4;
count = -count;
src++;
while (count < 0) {
dst0[count] = src[4 * count + 0];
dst1[count] = src[4 * count + 2];
count++;
}
}
static void extract_odd2avg_c(const uint8_t *src0, const uint8_t *src1,
uint8_t *dst0, uint8_t *dst1, int count)
{
dst0 += count;
dst1 += count;
src0 += count * 4;
src1 += count * 4;
count = -count;
src0++;
src1++;
while (count < 0) {
dst0[count] = (src0[4 * count + 0] + src1[4 * count + 0]) >> 1;
dst1[count] = (src0[4 * count + 2] + src1[4 * count + 2]) >> 1;
count++;
}
}
static void yuyvtoyuv420_c(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride)
{
int y;
const int chromWidth = FF_CEIL_RSHIFT(width, 1);
for (y = 0; y < height; y++) {
extract_even_c(src, ydst, width);
if (y & 1) {
extract_odd2avg_c(src - srcStride, src, udst, vdst, chromWidth);
udst += chromStride;
vdst += chromStride;
}
src += srcStride;
ydst += lumStride;
}
}
static void yuyvtoyuv422_c(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride)
{
int y;
const int chromWidth = FF_CEIL_RSHIFT(width, 1);
for (y = 0; y < height; y++) {
extract_even_c(src, ydst, width);
extract_odd2_c(src, udst, vdst, chromWidth);
src += srcStride;
ydst += lumStride;
udst += chromStride;
vdst += chromStride;
}
}
static void uyvytoyuv420_c(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride)
{
int y;
const int chromWidth = FF_CEIL_RSHIFT(width, 1);
for (y = 0; y < height; y++) {
extract_even_c(src + 1, ydst, width);
if (y & 1) {
extract_even2avg_c(src - srcStride, src, udst, vdst, chromWidth);
udst += chromStride;
vdst += chromStride;
}
src += srcStride;
ydst += lumStride;
}
}
static void uyvytoyuv422_c(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride)
{
int y;
const int chromWidth = FF_CEIL_RSHIFT(width, 1);
for (y = 0; y < height; y++) {
extract_even_c(src + 1, ydst, width);
extract_even2_c(src, udst, vdst, chromWidth);
src += srcStride;
ydst += lumStride;
udst += chromStride;
vdst += chromStride;
}
}
static av_cold void rgb2rgb_init_c(void)
{
rgb15to16 = rgb15to16_c;
rgb15tobgr24 = rgb15tobgr24_c;
rgb15to32 = rgb15to32_c;
rgb16tobgr24 = rgb16tobgr24_c;
rgb16to32 = rgb16to32_c;
rgb16to15 = rgb16to15_c;
rgb24tobgr16 = rgb24tobgr16_c;
rgb24tobgr15 = rgb24tobgr15_c;
rgb24tobgr32 = rgb24tobgr32_c;
rgb32to16 = rgb32to16_c;
rgb32to15 = rgb32to15_c;
rgb32tobgr24 = rgb32tobgr24_c;
rgb24to15 = rgb24to15_c;
rgb24to16 = rgb24to16_c;
rgb24tobgr24 = rgb24tobgr24_c;
#if HAVE_BIGENDIAN
shuffle_bytes_0321 = shuffle_bytes_2103_c;
shuffle_bytes_2103 = shuffle_bytes_0321_c;
#else
shuffle_bytes_0321 = shuffle_bytes_0321_c;
shuffle_bytes_2103 = shuffle_bytes_2103_c;
#endif
rgb32tobgr16 = rgb32tobgr16_c;
rgb32tobgr15 = rgb32tobgr15_c;
yv12toyuy2 = yv12toyuy2_c;
yv12touyvy = yv12touyvy_c;
yuv422ptoyuy2 = yuv422ptoyuy2_c;
yuv422ptouyvy = yuv422ptouyvy_c;
yuy2toyv12 = yuy2toyv12_c;
planar2x = planar2x_c;
ff_rgb24toyv12 = ff_rgb24toyv12_c;
interleaveBytes = interleaveBytes_c;
deinterleaveBytes = deinterleaveBytes_c;
vu9_to_vu12 = vu9_to_vu12_c;
yvu9_to_yuy2 = yvu9_to_yuy2_c;
uyvytoyuv420 = uyvytoyuv420_c;
uyvytoyuv422 = uyvytoyuv422_c;
yuyvtoyuv420 = yuyvtoyuv420_c;
yuyvtoyuv422 = yuyvtoyuv422_c;
}

View File

@@ -0,0 +1,349 @@
/*
* Copyright (C) 2015 Pedro Arthur <bygrandao@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "swscale_internal.h"
static void free_lines(SwsSlice *s)
{
int i;
for (i = 0; i < 2; ++i) {
int n = s->plane[i].available_lines;
int j;
for (j = 0; j < n; ++j) {
av_freep(&s->plane[i].line[j]);
if (s->is_ring)
s->plane[i].line[j+n] = NULL;
}
}
for (i = 0; i < 4; ++i)
memset(s->plane[i].line, 0, sizeof(uint8_t*) * s->plane[i].available_lines * (s->is_ring ? 3 : 1));
s->should_free_lines = 0;
}
/*
slice lines contain extra bytes for vectorial code; thus @size
is the allocated memory size and @width is the number of pixels
*/
static int alloc_lines(SwsSlice *s, int size, int width)
{
int i;
int idx[2] = {3, 2};
s->should_free_lines = 1;
s->width = width;
for (i = 0; i < 2; ++i) {
int n = s->plane[i].available_lines;
int j;
int ii = idx[i];
av_assert0(n == s->plane[ii].available_lines);
for (j = 0; j < n; ++j) {
// chroma plane lines U and V are expected to be contiguous in memory
// by the MMX vertical scaler code
s->plane[i].line[j] = av_malloc(size * 2 + 32);
if (!s->plane[i].line[j]) {
free_lines(s);
return AVERROR(ENOMEM);
}
s->plane[ii].line[j] = s->plane[i].line[j] + size + 16;
if (s->is_ring) {
s->plane[i].line[j+n] = s->plane[i].line[j];
s->plane[ii].line[j+n] = s->plane[ii].line[j];
}
}
}
return 0;
}
static int alloc_slice(SwsSlice *s, enum AVPixelFormat fmt, int lumLines, int chrLines, int h_sub_sample, int v_sub_sample, int ring)
{
int i;
int size[4] = { lumLines,
chrLines,
chrLines,
lumLines };
s->h_chr_sub_sample = h_sub_sample;
s->v_chr_sub_sample = v_sub_sample;
s->fmt = fmt;
s->is_ring = ring;
s->should_free_lines = 0;
for (i = 0; i < 4; ++i) {
int n = size[i] * ( ring == 0 ? 1 : 3);
s->plane[i].line = av_mallocz_array(sizeof(uint8_t*), n);
if (!s->plane[i].line)
return AVERROR(ENOMEM);
s->plane[i].tmp = ring ? s->plane[i].line + size[i] * 2 : NULL;
s->plane[i].available_lines = size[i];
s->plane[i].sliceY = 0;
s->plane[i].sliceH = 0;
}
return 0;
}
static void free_slice(SwsSlice *s)
{
int i;
if (s) {
if (s->should_free_lines)
free_lines(s);
for (i = 0; i < 4; ++i) {
av_freep(&s->plane[i].line);
s->plane[i].tmp = NULL;
}
}
}
int ff_rotate_slice(SwsSlice *s, int lum, int chr)
{
int i;
if (lum) {
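// rotate the luma (0) and alpha (3) planes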
for (i = 0; i < 4; i+=3) {
int n = s->plane[i].available_lines;
int l = lum - s->plane[i].sliceY;
if (l >= n * 2) {
s->plane[i].sliceY += n;
s->plane[i].sliceH -= n;
}
}
}
if (chr) {
for (i = 1; i < 3; ++i) {
int n = s->plane[i].available_lines;
int l = chr - s->plane[i].sliceY;
if (l >= n * 2) {
s->plane[i].sliceY += n;
s->plane[i].sliceH -= n;
}
}
}
return 0;
}
int ff_init_slice_from_src(SwsSlice * s, uint8_t *src[4], int stride[4], int srcW, int lumY, int lumH, int chrY, int chrH, int relative)
{
int i = 0;
const int start[4] = {lumY,
chrY,
chrY,
lumY};
const int end[4] = {lumY +lumH,
chrY + chrH,
chrY + chrH,
lumY + lumH};
const uint8_t *src_[4] = {src[0] + (relative ? 0 : start[0]) * stride[0],
src[1] + (relative ? 0 : start[1]) * stride[1],
src[2] + (relative ? 0 : start[2]) * stride[2],
src[3] + (relative ? 0 : start[3]) * stride[3]};
s->width = srcW;
for (i = 0; i < 4; ++i) {
int j;
int first = s->plane[i].sliceY;
int n = s->plane[i].available_lines;
int lines = end[i] - start[i];
int tot_lines = end[i] - first;
if (start[i] >= first && n >= tot_lines) {
s->plane[i].sliceH = FFMAX(tot_lines, s->plane[i].sliceH);
for (j = 0; j < lines; j+= 1)
s->plane[i].line[start[i] - first + j] = src_[i] + j * stride[i];
} else {
s->plane[i].sliceY = start[i];
lines = lines > n ? n : lines;
s->plane[i].sliceH = lines;
for (j = 0; j < lines; j+= 1)
s->plane[i].line[j] = src_[i] + j * stride[i];
}
}
return 0;
}
static void fill_ones(SwsSlice *s, int n, int is16bit)
{
int i;
for (i = 0; i < 4; ++i) {
int j;
int size = s->plane[i].available_lines;
for (j = 0; j < size; ++j) {
int k;
int end = is16bit ? n>>1: n;
// also fill one extra element
end += 1;
if (is16bit)
for (k = 0; k < end; ++k)
((int32_t*)(s->plane[i].line[j]))[k] = 1<<18;
else
for (k = 0; k < end; ++k)
((int16_t*)(s->plane[i].line[j]))[k] = 1<<14;
}
}
}
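/*
* ff_init_filters() below assembles the per-frame chain as a list of
* descriptors (a sketch read off the code): optional gamma linearization,
* packed-to-planar luma/chroma input conversion, horizontal scaling into
* a ring slice, vertical scaling into the destination slice, and an
* optional re-gamma pass on the output.
*/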
int ff_init_filters(SwsContext * c)
{
int i;
int index;
int num_ydesc;
int num_cdesc;
int num_vdesc = isPlanarYUV(c->dstFormat) && !isGray(c->dstFormat) ? 2 : 1;
int need_lum_conv = c->lumToYV12 || c->readLumPlanar || c->alpToYV12 || c->readAlpPlanar;
int need_chr_conv = c->chrToYV12 || c->readChrPlanar;
int need_gamma = c->is_internal_gamma;
int srcIdx, dstIdx;
int dst_stride = FFALIGN(c->dstW * sizeof(int16_t) + 66, 16);
uint32_t * pal = usePal(c->srcFormat) ? c->pal_yuv : (uint32_t*)c->input_rgb2yuv_table;
int res = 0;
if (c->dstBpc == 16)
dst_stride <<= 1;
num_ydesc = need_lum_conv ? 2 : 1;
num_cdesc = need_chr_conv ? 2 : 1;
c->numSlice = FFMAX(num_ydesc, num_cdesc) + 2;
c->numDesc = num_ydesc + num_cdesc + num_vdesc + (need_gamma ? 2 : 0);
c->descIndex[0] = num_ydesc + (need_gamma ? 1 : 0);
c->descIndex[1] = num_ydesc + num_cdesc + (need_gamma ? 1 : 0);
c->desc = av_mallocz_array(sizeof(SwsFilterDescriptor), c->numDesc);
if (!c->desc)
return AVERROR(ENOMEM);
c->slice = av_mallocz_array(sizeof(SwsSlice), c->numSlice);
if (!c->slice) {
res = AVERROR(ENOMEM);
goto cleanup;
}
res = alloc_slice(&c->slice[0], c->srcFormat, c->srcH, c->chrSrcH, c->chrSrcHSubSample, c->chrSrcVSubSample, 0);
if (res < 0) goto cleanup;
for (i = 1; i < c->numSlice-2; ++i) {
res = alloc_slice(&c->slice[i], c->srcFormat, c->vLumFilterSize + MAX_LINES_AHEAD, c->vChrFilterSize + MAX_LINES_AHEAD, c->chrSrcHSubSample, c->chrSrcVSubSample, 0);
if (res < 0) goto cleanup;
res = alloc_lines(&c->slice[i], FFALIGN(c->srcW*2+78, 16), c->srcW);
if (res < 0) goto cleanup;
}
// horizontal scaler output
res = alloc_slice(&c->slice[i], c->srcFormat, c->vLumFilterSize + MAX_LINES_AHEAD, c->vChrFilterSize + MAX_LINES_AHEAD, c->chrDstHSubSample, c->chrDstVSubSample, 1);
if (res < 0) goto cleanup;
res = alloc_lines(&c->slice[i], dst_stride, c->dstW);
if (res < 0) goto cleanup;
fill_ones(&c->slice[i], dst_stride>>1, c->dstBpc == 16);
// vertical scaler output
++i;
res = alloc_slice(&c->slice[i], c->dstFormat, c->dstH, c->chrDstH, c->chrDstHSubSample, c->chrDstVSubSample, 0);
if (res < 0) goto cleanup;
index = 0;
srcIdx = 0;
dstIdx = 1;
if (need_gamma) {
res = ff_init_gamma_convert(c->desc + index, c->slice + srcIdx, c->inv_gamma);
if (res < 0) goto cleanup;
++index;
}
if (need_lum_conv) {
res = ff_init_desc_fmt_convert(&c->desc[index], &c->slice[srcIdx], &c->slice[dstIdx], pal);
if (res < 0) goto cleanup;
c->desc[index].alpha = c->alpPixBuf != 0;
++index;
srcIdx = dstIdx;
}
dstIdx = FFMAX(num_ydesc, num_cdesc);
res = ff_init_desc_hscale(&c->desc[index], &c->slice[srcIdx], &c->slice[dstIdx], c->hLumFilter, c->hLumFilterPos, c->hLumFilterSize, c->lumXInc);
if (res < 0) goto cleanup;
c->desc[index].alpha = c->alpPixBuf != 0;
++index;
{
srcIdx = 0;
dstIdx = 1;
if (need_chr_conv) {
res = ff_init_desc_cfmt_convert(&c->desc[index], &c->slice[srcIdx], &c->slice[dstIdx], pal);
if (res < 0) goto cleanup;
++index;
srcIdx = dstIdx;
}
dstIdx = FFMAX(num_ydesc, num_cdesc);
if (c->needs_hcscale)
res = ff_init_desc_chscale(&c->desc[index], &c->slice[srcIdx], &c->slice[dstIdx], c->hChrFilter, c->hChrFilterPos, c->hChrFilterSize, c->chrXInc);
else
res = ff_init_desc_no_chr(&c->desc[index], &c->slice[srcIdx], &c->slice[dstIdx]);
if (res < 0) goto cleanup;
}
++index;
{
srcIdx = c->numSlice - 2;
dstIdx = c->numSlice - 1;
res = ff_init_vscale(c, c->desc + index, c->slice + srcIdx, c->slice + dstIdx);
if (res < 0) goto cleanup;
}
++index;
if (need_gamma) {
res = ff_init_gamma_convert(c->desc + index, c->slice + dstIdx, c->gamma);
if (res < 0) goto cleanup;
}
return 0;
cleanup:
ff_free_filters(c);
return res;
}
int ff_free_filters(SwsContext *c)
{
int i;
if (c->desc) {
for (i = 0; i < c->numDesc; ++i)
av_freep(&c->desc[i].instance);
av_freep(&c->desc);
}
if (c->slice) {
for (i = 0; i < c->numSlice; ++i)
free_slice(&c->slice[i]);
av_freep(&c->slice);
}
return 0;
}

View File

@@ -0,0 +1,38 @@
EXPORTS
DllStartup
sws_addVec
sws_allocVec
sws_alloc_context
sws_alloc_set_opts
sws_cloneVec
sws_context_class DATA
sws_convVec
sws_convertPalette8ToPacked24
sws_convertPalette8ToPacked32
sws_freeContext
sws_freeFilter
sws_freeVec
sws_getCachedContext
sws_getCoefficients
sws_getColorspaceDetails
sws_getConstVec
sws_getContext
sws_getDefaultFilter
sws_getGaussianVec
sws_getIdentityVec
sws_get_class
sws_init_context
sws_isSupportedEndiannessConversion
sws_isSupportedInput
sws_isSupportedOutput
sws_normalizeVec
sws_printVec2
sws_rgb2rgb_init
sws_scale
sws_scaleVec
sws_setColorspaceDetails
sws_shiftVec
sws_subVec
swscale_configuration
swscale_license
swscale_version

View File

@@ -0,0 +1,38 @@
EXPORTS
DllStartup @1
sws_addVec @2
sws_allocVec @3
sws_alloc_context @4
sws_alloc_set_opts @5
sws_cloneVec @6
sws_context_class @7 DATA
sws_convVec @8
sws_convertPalette8ToPacked24 @9
sws_convertPalette8ToPacked32 @10
sws_freeContext @11
sws_freeFilter @12
sws_freeVec @13
sws_getCachedContext @14
sws_getCoefficients @15
sws_getColorspaceDetails @16
sws_getConstVec @17
sws_getContext @18
sws_getDefaultFilter @19
sws_getGaussianVec @20
sws_getIdentityVec @21
sws_get_class @22
sws_init_context @23
sws_isSupportedEndiannessConversion @24
sws_isSupportedInput @25
sws_isSupportedOutput @26
sws_normalizeVec @27
sws_printVec2 @28
sws_rgb2rgb_init @29
sws_scale @30
sws_scaleVec @31
sws_setColorspaceDetails @32
sws_shiftVec @33
sws_subVec @34
swscale_configuration @35
swscale_license @36
swscale_version @37

View File

@@ -0,0 +1,417 @@
/*
* Copyright (C) 2003-2011 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <stdarg.h>
#undef HAVE_AV_CONFIG_H
#include "libavutil/imgutils.h"
#include "libavutil/mem.h"
#include "libavutil/avutil.h"
#include "libavutil/crc.h"
#include "libavutil/pixdesc.h"
#include "libavutil/lfg.h"
#include "swscale.h"
/* HACK Duplicated from swscale_internal.h.
* Should be removed when a cleaner pixel format system exists. */
#define isGray(x) \
((x) == AV_PIX_FMT_GRAY8 || \
(x) == AV_PIX_FMT_YA8 || \
(x) == AV_PIX_FMT_GRAY16BE || \
(x) == AV_PIX_FMT_GRAY16LE || \
(x) == AV_PIX_FMT_YA16BE || \
(x) == AV_PIX_FMT_YA16LE)
#define hasChroma(x) \
(!(isGray(x) || \
(x) == AV_PIX_FMT_MONOBLACK || \
(x) == AV_PIX_FMT_MONOWHITE))
#define isALPHA(x) \
((x) == AV_PIX_FMT_BGR32 || \
(x) == AV_PIX_FMT_BGR32_1 || \
(x) == AV_PIX_FMT_RGB32 || \
(x) == AV_PIX_FMT_RGB32_1 || \
(x) == AV_PIX_FMT_YUVA420P)
static uint64_t getSSD(const uint8_t *src1, const uint8_t *src2, int stride1,
int stride2, int w, int h)
{
int x, y;
uint64_t ssd = 0;
for (y = 0; y < h; y++) {
for (x = 0; x < w; x++) {
int d = src1[x + y * stride1] - src2[x + y * stride2];
ssd += d * d;
}
}
return ssd;
}
struct Results {
uint64_t ssdY;
uint64_t ssdU;
uint64_t ssdV;
uint64_t ssdA;
uint32_t crc;
};
// test by ref -> src -> dst -> out & compare out against ref
// ref & out are YV12
static int doTest(uint8_t *ref[4], int refStride[4], int w, int h,
enum AVPixelFormat srcFormat, enum AVPixelFormat dstFormat,
int srcW, int srcH, int dstW, int dstH, int flags,
struct Results *r)
{
const AVPixFmtDescriptor *desc_yuva420p = av_pix_fmt_desc_get(AV_PIX_FMT_YUVA420P);
const AVPixFmtDescriptor *desc_src = av_pix_fmt_desc_get(srcFormat);
const AVPixFmtDescriptor *desc_dst = av_pix_fmt_desc_get(dstFormat);
static enum AVPixelFormat cur_srcFormat;
static int cur_srcW, cur_srcH;
static uint8_t *src[4];
static int srcStride[4];
uint8_t *dst[4] = { 0 };
uint8_t *out[4] = { 0 };
int dstStride[4] = {0};
int i;
uint64_t ssdY, ssdU = 0, ssdV = 0, ssdA = 0;
struct SwsContext *dstContext = NULL, *outContext = NULL;
uint32_t crc = 0;
int res = 0;
if (cur_srcFormat != srcFormat || cur_srcW != srcW || cur_srcH != srcH) {
struct SwsContext *srcContext = NULL;
int p;
for (p = 0; p < 4; p++)
av_freep(&src[p]);
av_image_fill_linesizes(srcStride, srcFormat, srcW);
for (p = 0; p < 4; p++) {
srcStride[p] = FFALIGN(srcStride[p], 16);
if (srcStride[p])
src[p] = av_mallocz(srcStride[p] * srcH + 16);
if (srcStride[p] && !src[p]) {
perror("Malloc");
res = -1;
goto end;
}
}
srcContext = sws_getContext(w, h, AV_PIX_FMT_YUVA420P, srcW, srcH,
srcFormat, SWS_BILINEAR, NULL, NULL, NULL);
if (!srcContext) {
fprintf(stderr, "Failed to get %s ---> %s\n",
desc_yuva420p->name,
desc_src->name);
res = -1;
goto end;
}
sws_scale(srcContext, (const uint8_t * const*)ref, refStride, 0, h, src, srcStride);
sws_freeContext(srcContext);
cur_srcFormat = srcFormat;
cur_srcW = srcW;
cur_srcH = srcH;
}
av_image_fill_linesizes(dstStride, dstFormat, dstW);
for (i = 0; i < 4; i++) {
/* Image buffers passed into libswscale can be allocated any way you
* prefer, as long as they're aligned enough for the architecture, and
* they're freed appropriately (such as using av_free for buffers
* allocated with av_malloc). */
/* An extra 16 bytes is being allocated because some scalers may write
* out of bounds. */
dstStride[i] = FFALIGN(dstStride[i], 16);
if (dstStride[i])
dst[i] = av_mallocz(dstStride[i] * dstH + 16);
if (dstStride[i] && !dst[i]) {
perror("Malloc");
res = -1;
goto end;
}
}
dstContext = sws_getContext(srcW, srcH, srcFormat, dstW, dstH, dstFormat,
flags, NULL, NULL, NULL);
if (!dstContext) {
fprintf(stderr, "Failed to get %s ---> %s\n",
desc_src->name, desc_dst->name);
res = -1;
goto end;
}
printf(" %s %dx%d -> %s %3dx%3d flags=%2d",
desc_src->name, srcW, srcH,
desc_dst->name, dstW, dstH,
flags);
fflush(stdout);
sws_scale(dstContext, (const uint8_t * const*)src, srcStride, 0, srcH, dst, dstStride);
for (i = 0; i < 4 && dstStride[i]; i++)
crc = av_crc(av_crc_get_table(AV_CRC_32_IEEE), crc, dst[i],
dstStride[i] * dstH);
if (r && crc == r->crc) {
ssdY = r->ssdY;
ssdU = r->ssdU;
ssdV = r->ssdV;
ssdA = r->ssdA;
} else {
for (i = 0; i < 4; i++) {
refStride[i] = FFALIGN(refStride[i], 16);
if (refStride[i])
out[i] = av_mallocz(refStride[i] * h);
if (refStride[i] && !out[i]) {
perror("Malloc");
res = -1;
goto end;
}
}
outContext = sws_getContext(dstW, dstH, dstFormat, w, h,
AV_PIX_FMT_YUVA420P, SWS_BILINEAR,
NULL, NULL, NULL);
if (!outContext) {
fprintf(stderr, "Failed to get %s ---> %s\n",
desc_dst->name,
desc_yuva420p->name);
res = -1;
goto end;
}
sws_scale(outContext, (const uint8_t * const*)dst, dstStride, 0, dstH, out, refStride);
ssdY = getSSD(ref[0], out[0], refStride[0], refStride[0], w, h);
if (hasChroma(srcFormat) && hasChroma(dstFormat)) {
//FIXME check that output is really gray
ssdU = getSSD(ref[1], out[1], refStride[1], refStride[1],
(w + 1) >> 1, (h + 1) >> 1);
ssdV = getSSD(ref[2], out[2], refStride[2], refStride[2],
(w + 1) >> 1, (h + 1) >> 1);
}
if (isALPHA(srcFormat) && isALPHA(dstFormat))
ssdA = getSSD(ref[3], out[3], refStride[3], refStride[3], w, h);
ssdY /= w * h;
ssdU /= w * h / 4;
ssdV /= w * h / 4;
ssdA /= w * h;
sws_freeContext(outContext);
for (i = 0; i < 4; i++)
if (refStride[i])
av_free(out[i]);
}
printf(" CRC=%08x SSD=%5"PRId64 ",%5"PRId64 ",%5"PRId64 ",%5"PRId64 "\n",
crc, ssdY, ssdU, ssdV, ssdA);
end:
sws_freeContext(dstContext);
for (i = 0; i < 4; i++)
if (dstStride[i])
av_free(dst[i]);
return res;
}
static void selfTest(uint8_t *ref[4], int refStride[4], int w, int h,
enum AVPixelFormat srcFormat_in,
enum AVPixelFormat dstFormat_in)
{
const int flags[] = { SWS_FAST_BILINEAR, SWS_BILINEAR, SWS_BICUBIC,
SWS_X, SWS_POINT, SWS_AREA, 0 };
const int srcW = w;
const int srcH = h;
const int dstW[] = { srcW - srcW / 3, srcW, srcW + srcW / 3, 0 };
const int dstH[] = { srcH - srcH / 3, srcH, srcH + srcH / 3, 0 };
enum AVPixelFormat srcFormat, dstFormat;
const AVPixFmtDescriptor *desc_src, *desc_dst;
for (srcFormat = srcFormat_in != AV_PIX_FMT_NONE ? srcFormat_in : 0;
srcFormat < AV_PIX_FMT_NB; srcFormat++) {
if (!sws_isSupportedInput(srcFormat) ||
!sws_isSupportedOutput(srcFormat))
continue;
desc_src = av_pix_fmt_desc_get(srcFormat);
for (dstFormat = dstFormat_in != AV_PIX_FMT_NONE ? dstFormat_in : 0;
dstFormat < AV_PIX_FMT_NB; dstFormat++) {
int i, j, k;
int res = 0;
if (!sws_isSupportedInput(dstFormat) ||
!sws_isSupportedOutput(dstFormat))
continue;
desc_dst = av_pix_fmt_desc_get(dstFormat);
printf("%s -> %s\n", desc_src->name, desc_dst->name);
fflush(stdout);
for (k = 0; flags[k] && !res; k++)
for (i = 0; dstW[i] && !res; i++)
for (j = 0; dstH[j] && !res; j++)
res = doTest(ref, refStride, w, h,
srcFormat, dstFormat,
srcW, srcH, dstW[i], dstH[j], flags[k],
NULL);
if (dstFormat_in != AV_PIX_FMT_NONE)
break;
}
if (srcFormat_in != AV_PIX_FMT_NONE)
break;
}
}
static int fileTest(uint8_t *ref[4], int refStride[4], int w, int h, FILE *fp,
enum AVPixelFormat srcFormat_in,
enum AVPixelFormat dstFormat_in)
{
char buf[256];
while (fgets(buf, sizeof(buf), fp)) {
struct Results r;
enum AVPixelFormat srcFormat;
char srcStr[12];
int srcW, srcH;
enum AVPixelFormat dstFormat;
char dstStr[12];
int dstW, dstH;
int flags;
int ret;
ret = sscanf(buf,
" %12s %dx%d -> %12s %dx%d flags=%d CRC=%x"
" SSD=%"SCNd64 ", %"SCNd64 ", %"SCNd64 ", %"SCNd64 "\n",
srcStr, &srcW, &srcH, dstStr, &dstW, &dstH,
&flags, &r.crc, &r.ssdY, &r.ssdU, &r.ssdV, &r.ssdA);
if (ret != 12) {
srcStr[0] = dstStr[0] = 0;
ret = sscanf(buf, "%12s -> %12s\n", srcStr, dstStr);
}
srcFormat = av_get_pix_fmt(srcStr);
dstFormat = av_get_pix_fmt(dstStr);
if (srcFormat == AV_PIX_FMT_NONE || dstFormat == AV_PIX_FMT_NONE ||
srcW > 8192U || srcH > 8192U || dstW > 8192U || dstH > 8192U) {
fprintf(stderr, "malformed input file\n");
return -1;
}
if ((srcFormat_in != AV_PIX_FMT_NONE && srcFormat_in != srcFormat) ||
(dstFormat_in != AV_PIX_FMT_NONE && dstFormat_in != dstFormat))
continue;
if (ret != 12) {
printf("%s", buf);
continue;
}
doTest(ref, refStride, w, h,
srcFormat, dstFormat,
srcW, srcH, dstW, dstH, flags,
&r);
}
return 0;
}
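/*
* A reference-file line, as printed by doTest() above, looks like
* (values illustrative only):
* rgb24 96x96 -> yuv420p  64x 64 flags= 2 CRC=0123abcd SSD=   12,   34,   56,    0
*/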
#define W 96
#define H 96
int main(int argc, char **argv)
{
enum AVPixelFormat srcFormat = AV_PIX_FMT_NONE;
enum AVPixelFormat dstFormat = AV_PIX_FMT_NONE;
uint8_t *rgb_data = av_malloc(W * H * 4);
const uint8_t * const rgb_src[4] = { rgb_data, NULL, NULL, NULL };
int rgb_stride[4] = { 4 * W, 0, 0, 0 };
uint8_t *data = av_malloc(4 * W * H);
uint8_t *src[4] = { data, data + W * H, data + W * H * 2, data + W * H * 3 };
int stride[4] = { W, W, W, W };
int x, y;
struct SwsContext *sws;
AVLFG rand;
int res = -1;
int i;
FILE *fp = NULL;
if (!rgb_data || !data)
return -1;
for (i = 1; i < argc; i += 2) {
if (argv[i][0] != '-' || i + 1 == argc)
goto bad_option;
if (!strcmp(argv[i], "-ref")) {
fp = fopen(argv[i + 1], "r");
if (!fp) {
fprintf(stderr, "could not open '%s'\n", argv[i + 1]);
goto error;
}
} else if (!strcmp(argv[i], "-src")) {
srcFormat = av_get_pix_fmt(argv[i + 1]);
if (srcFormat == AV_PIX_FMT_NONE) {
fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]);
return -1;
}
} else if (!strcmp(argv[i], "-dst")) {
dstFormat = av_get_pix_fmt(argv[i + 1]);
if (dstFormat == AV_PIX_FMT_NONE) {
fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]);
return -1;
}
} else {
bad_option:
fprintf(stderr, "bad option or argument missing (%s)\n", argv[i]);
goto error;
}
}
sws = sws_getContext(W / 12, H / 12, AV_PIX_FMT_RGB32, W, H,
AV_PIX_FMT_YUVA420P, SWS_BILINEAR, NULL, NULL, NULL);
if (!sws) {
fprintf(stderr, "Failed to get swscale context\n");
goto error;
}
av_lfg_init(&rand, 1);
for (y = 0; y < H; y++)
for (x = 0; x < W * 4; x++)
rgb_data[ x + y * 4 * W] = av_lfg_get(&rand);
sws_scale(sws, rgb_src, rgb_stride, 0, H / 12, src, stride);
sws_freeContext(sws);
av_free(rgb_data);
if(fp) {
res = fileTest(src, stride, W, H, fp, srcFormat, dstFormat);
fclose(fp);
} else {
selfTest(src, stride, W, H, srcFormat, dstFormat);
res = 0;
}
error:
av_free(data);
return res;
}

File diff suppressed because it is too large

View File

@@ -0,0 +1,365 @@
/*
* Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef SWSCALE_SWSCALE_H
#define SWSCALE_SWSCALE_H
/**
* @file
* @ingroup libsws
* external API header
*/
#include <stdint.h>
#include "libavutil/avutil.h"
#include "libavutil/log.h"
#include "libavutil/pixfmt.h"
#include "version.h"
/**
* @defgroup libsws Color conversion and scaling
* @{
*
* Return the LIBSWSCALE_VERSION_INT constant.
*/
unsigned swscale_version(void);
/**
* Return the libswscale build-time configuration.
*/
const char *swscale_configuration(void);
/**
* Return the libswscale license.
*/
const char *swscale_license(void);
/* values for the flags, the stuff on the command line is different */
#define SWS_FAST_BILINEAR 1
#define SWS_BILINEAR 2
#define SWS_BICUBIC 4
#define SWS_X 8
#define SWS_POINT 0x10
#define SWS_AREA 0x20
#define SWS_BICUBLIN 0x40
#define SWS_GAUSS 0x80
#define SWS_SINC 0x100
#define SWS_LANCZOS 0x200
#define SWS_SPLINE 0x400
#define SWS_SRC_V_CHR_DROP_MASK 0x30000
#define SWS_SRC_V_CHR_DROP_SHIFT 16
#define SWS_PARAM_DEFAULT 123456
#define SWS_PRINT_INFO 0x1000
//the following 3 flags are not completely implemented
//internal chrominance subsampling info
#define SWS_FULL_CHR_H_INT 0x2000
//input subsampling info
#define SWS_FULL_CHR_H_INP 0x4000
#define SWS_DIRECT_BGR 0x8000
#define SWS_ACCURATE_RND 0x40000
#define SWS_BITEXACT 0x80000
#define SWS_ERROR_DIFFUSION 0x800000
#if FF_API_SWS_CPU_CAPS
/**
* CPU caps are autodetected now, those flags
* are only provided for API compatibility.
*/
#define SWS_CPU_CAPS_MMX 0x80000000
#define SWS_CPU_CAPS_MMXEXT 0x20000000
#define SWS_CPU_CAPS_MMX2 0x20000000
#define SWS_CPU_CAPS_3DNOW 0x40000000
#define SWS_CPU_CAPS_ALTIVEC 0x10000000
#if FF_API_ARCH_BFIN
#define SWS_CPU_CAPS_BFIN 0x01000000
#endif
#define SWS_CPU_CAPS_SSE2 0x02000000
#endif
#define SWS_MAX_REDUCE_CUTOFF 0.002
#define SWS_CS_ITU709 1
#define SWS_CS_FCC 4
#define SWS_CS_ITU601 5
#define SWS_CS_ITU624 5
#define SWS_CS_SMPTE170M 5
#define SWS_CS_SMPTE240M 7
#define SWS_CS_DEFAULT 5
/**
* Return a pointer to yuv<->rgb coefficients for the given colorspace
* suitable for sws_setColorspaceDetails().
*
* @param colorspace One of the SWS_CS_* macros. If invalid,
* SWS_CS_DEFAULT is used.
*/
const int *sws_getCoefficients(int colorspace);
// when used for filters they must have an odd number of elements
// coeffs cannot be shared between vectors
typedef struct SwsVector {
double *coeff; ///< pointer to the list of coefficients
int length; ///< number of coefficients in the vector
} SwsVector;
// vectors can be shared
typedef struct SwsFilter {
SwsVector *lumH;
SwsVector *lumV;
SwsVector *chrH;
SwsVector *chrV;
} SwsFilter;
struct SwsContext;
/**
* Return a positive value if pix_fmt is a supported input format, 0
* otherwise.
*/
int sws_isSupportedInput(enum AVPixelFormat pix_fmt);
/**
* Return a positive value if pix_fmt is a supported output format, 0
* otherwise.
*/
int sws_isSupportedOutput(enum AVPixelFormat pix_fmt);
/**
* @param[in] pix_fmt the pixel format
* @return a positive value if an endianness conversion for pix_fmt is
* supported, 0 otherwise.
*/
int sws_isSupportedEndiannessConversion(enum AVPixelFormat pix_fmt);
/**
* Allocate an empty SwsContext. This must be filled and passed to
* sws_init_context(). For filling see AVOptions, options.c and
* sws_setColorspaceDetails().
*/
struct SwsContext *sws_alloc_context(void);
/**
* Initialize the swscaler context sws_context.
*
* @return zero or positive value on success, a negative value on
* error
*/
int sws_init_context(struct SwsContext *sws_context, SwsFilter *srcFilter, SwsFilter *dstFilter);
/**
* Free the swscaler context swsContext.
* If swsContext is NULL, then does nothing.
*/
void sws_freeContext(struct SwsContext *swsContext);
/**
* Allocate and return an SwsContext. You need it to perform
* scaling/conversion operations using sws_scale().
*
* @param srcW the width of the source image
* @param srcH the height of the source image
* @param srcFormat the source image format
* @param dstW the width of the destination image
* @param dstH the height of the destination image
* @param dstFormat the destination image format
* @param flags specify which algorithm and options to use for rescaling
* @param param extra parameters to tune the used scaler
* For SWS_BICUBIC param[0] and [1] tune the shape of the basis
* function, param[0] tunes f(1) and param[1] tunes f'(1)
* For SWS_GAUSS param[0] tunes the exponent and thus cutoff
* frequency
* For SWS_LANCZOS param[0] tunes the width of the window function
* @return a pointer to an allocated context, or NULL in case of error
* @note this function is to be removed after a saner alternative is
* written
*/
struct SwsContext *sws_getContext(int srcW, int srcH, enum AVPixelFormat srcFormat,
int dstW, int dstH, enum AVPixelFormat dstFormat,
int flags, SwsFilter *srcFilter,
SwsFilter *dstFilter, const double *param);
/**
* Scale the image slice in srcSlice and put the resulting scaled
* slice in the image in dst. A slice is a sequence of consecutive
* rows in an image.
*
* Slices have to be provided in sequential order, either in
* top-bottom or bottom-top order. If slices are provided in
* non-sequential order the behavior of the function is undefined.
*
* @param c the scaling context previously created with
* sws_getContext()
* @param srcSlice the array containing the pointers to the planes of
* the source slice
* @param srcStride the array containing the strides for each plane of
* the source image
* @param srcSliceY the position in the source image of the slice to
* process, that is the number (counted starting from
* zero) in the image of the first row of the slice
* @param srcSliceH the height of the source slice, that is the number
* of rows in the slice
* @param dst the array containing the pointers to the planes of
* the destination image
* @param dstStride the array containing the strides for each plane of
* the destination image
* @return the height of the output slice
*/
int sws_scale(struct SwsContext *c, const uint8_t *const srcSlice[],
const int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *const dst[], const int dstStride[]);
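/*
* Minimal usage sketch (illustrative, not part of the API): convert and
* scale one BGR24 frame to YUV420P. Plane allocation and error handling
* are elided; av_image_alloc() from libavutil is one way to obtain the
* data pointers and linesizes assumed here.
*
*     struct SwsContext *ctx = sws_getContext(in_w, in_h, AV_PIX_FMT_BGR24,
*                                             out_w, out_h, AV_PIX_FMT_YUV420P,
*                                             SWS_BILINEAR, NULL, NULL, NULL);
*     if (ctx) {
*         sws_scale(ctx, src_data, src_linesize, 0, in_h,
*                   dst_data, dst_linesize);
*         sws_freeContext(ctx);
*     }
*/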
/**
* @param dstRange flag indicating the white-black range of the output (1=jpeg / 0=mpeg)
* @param srcRange flag indicating the white-black range of the input (1=jpeg / 0=mpeg)
* @param table the yuv2rgb coefficients describing the output yuv space, normally ff_yuv2rgb_coeffs[x]
* @param inv_table the yuv2rgb coefficients describing the input yuv space, normally ff_yuv2rgb_coeffs[x]
* @param brightness 16.16 fixed point brightness correction
* @param contrast 16.16 fixed point contrast correction
* @param saturation 16.16 fixed point saturation correction
* @return -1 if not supported
*/
int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
int srcRange, const int table[4], int dstRange,
int brightness, int contrast, int saturation);
/**
* @return -1 if not supported
*/
int sws_getColorspaceDetails(struct SwsContext *c, int **inv_table,
int *srcRange, int **table, int *dstRange,
int *brightness, int *contrast, int *saturation);
/**
* Allocate and return an uninitialized vector with length coefficients.
*/
SwsVector *sws_allocVec(int length);
/**
* Return a normalized Gaussian curve used for filtering.
* quality = 3 is high quality; lower values give lower quality.
*/
SwsVector *sws_getGaussianVec(double variance, double quality);
/**
* Allocate and return a vector with length coefficients, all
* with the same value c.
*/
SwsVector *sws_getConstVec(double c, int length);
/**
* Allocate and return a vector with just one coefficient, with
* value 1.0.
*/
SwsVector *sws_getIdentityVec(void);
/**
* Scale all the coefficients of a by the scalar value.
*/
void sws_scaleVec(SwsVector *a, double scalar);
/**
* Scale all the coefficients of a so that their sum equals height.
*/
void sws_normalizeVec(SwsVector *a, double height);
void sws_convVec(SwsVector *a, SwsVector *b);
void sws_addVec(SwsVector *a, SwsVector *b);
void sws_subVec(SwsVector *a, SwsVector *b);
void sws_shiftVec(SwsVector *a, int shift);
/**
* Allocate and return a clone of the vector a, that is a vector
* with the same coefficients as a.
*/
SwsVector *sws_cloneVec(SwsVector *a);
/**
* Print with av_log() a textual representation of the vector a
* if log_level <= av_log_level.
*/
void sws_printVec2(SwsVector *a, AVClass *log_ctx, int log_level);
void sws_freeVec(SwsVector *a);
SwsFilter *sws_getDefaultFilter(float lumaGBlur, float chromaGBlur,
float lumaSharpen, float chromaSharpen,
float chromaHShift, float chromaVShift,
int verbose);
void sws_freeFilter(SwsFilter *filter);
/**
* Check if context can be reused, otherwise reallocate a new one.
*
* If context is NULL, just calls sws_getContext() to get a new
* context. Otherwise, checks if the parameters are the ones already
* saved in context. If that is the case, returns the current
* context. Otherwise, frees context and gets a new context with
* the new parameters.
*
* Be warned that srcFilter and dstFilter are not checked, they
* are assumed to remain the same.
*/
struct SwsContext *sws_getCachedContext(struct SwsContext *context,
int srcW, int srcH, enum AVPixelFormat srcFormat,
int dstW, int dstH, enum AVPixelFormat dstFormat,
int flags, SwsFilter *srcFilter,
SwsFilter *dstFilter, const double *param);
/**
* Convert an 8-bit paletted frame into a frame with a color depth of 32 bits.
*
* The output frame will have the same packed format as the palette.
*
* @param src source frame buffer
* @param dst destination frame buffer
* @param num_pixels number of pixels to convert
* @param palette array with [256] entries, which must match color arrangement (RGB or BGR) of src
*/
void sws_convertPalette8ToPacked32(const uint8_t *src, uint8_t *dst, int num_pixels, const uint8_t *palette);
/**
* Convert an 8-bit paletted frame into a frame with a color depth of 24 bits.
*
* With the palette format "ABCD", the destination frame ends up with the format "ABC".
*
* @param src source frame buffer
* @param dst destination frame buffer
* @param num_pixels number of pixels to convert
* @param palette array with [256] entries, which must match color arrangement (RGB or BGR) of src
*/
void sws_convertPalette8ToPacked24(const uint8_t *src, uint8_t *dst, int num_pixels, const uint8_t *palette);
/**
* Get the AVClass for swsContext. It can be used in combination with
* AV_OPT_SEARCH_FAKE_OBJ for examining options.
*
* @see av_opt_find().
*/
const AVClass *sws_get_class(void);
/**
* @}
*/
#endif /* SWSCALE_SWSCALE_H */

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,55 @@
/*
* Windows resource file for libswscale
*
* Copyright (C) 2012 James Almer
* Copyright (C) 2013 Tiancheng "Timothy" Gu
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <windows.h>
#include "libswscale/version.h"
#include "libavutil/ffversion.h"
#include "config.h"
1 VERSIONINFO
FILEVERSION LIBSWSCALE_VERSION_MAJOR, LIBSWSCALE_VERSION_MINOR, LIBSWSCALE_VERSION_MICRO, 0
PRODUCTVERSION LIBSWSCALE_VERSION_MAJOR, LIBSWSCALE_VERSION_MINOR, LIBSWSCALE_VERSION_MICRO, 0
FILEFLAGSMASK VS_FFI_FILEFLAGSMASK
FILEOS VOS_NT_WINDOWS32
FILETYPE VFT_DLL
{
BLOCK "StringFileInfo"
{
BLOCK "040904B0"
{
VALUE "CompanyName", "FFmpeg Project"
VALUE "FileDescription", "FFmpeg image rescaling library"
VALUE "FileVersion", AV_STRINGIFY(LIBSWSCALE_VERSION)
VALUE "InternalName", "libswscale"
VALUE "LegalCopyright", "Copyright (C) 2000-" AV_STRINGIFY(CONFIG_THIS_YEAR) " FFmpeg Project"
VALUE "OriginalFilename", "swscale" BUILDSUF "-" AV_STRINGIFY(LIBSWSCALE_VERSION_MAJOR) SLIBSUF
VALUE "ProductName", "FFmpeg"
VALUE "ProductVersion", FFMPEG_VERSION
}
}
BLOCK "VarFileInfo"
{
VALUE "Translation", 0x0409, 0x04B0
}
}

File diff suppressed because it is too large

View File

@@ -0,0 +1,56 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef SWSCALE_VERSION_H
#define SWSCALE_VERSION_H
/**
* @file
* swscale version macros
*/
#include "libavutil/version.h"
#define LIBSWSCALE_VERSION_MAJOR 3
#define LIBSWSCALE_VERSION_MINOR 1
#define LIBSWSCALE_VERSION_MICRO 101
#define LIBSWSCALE_VERSION_INT AV_VERSION_INT(LIBSWSCALE_VERSION_MAJOR, \
LIBSWSCALE_VERSION_MINOR, \
LIBSWSCALE_VERSION_MICRO)
#define LIBSWSCALE_VERSION AV_VERSION(LIBSWSCALE_VERSION_MAJOR, \
LIBSWSCALE_VERSION_MINOR, \
LIBSWSCALE_VERSION_MICRO)
#define LIBSWSCALE_BUILD LIBSWSCALE_VERSION_INT
#define LIBSWSCALE_IDENT "SwS" AV_STRINGIFY(LIBSWSCALE_VERSION)
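/* For this header, LIBSWSCALE_VERSION_INT evaluates to
 * AV_VERSION_INT(3, 1, 101) == (3 << 16) | (1 << 8) | 101 == 0x030165,
 * the value a runtime swscale_version() check compares against. */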
/**
* FF_API_* defines may be placed below to indicate public API that will be
* dropped at a future version bump. The defines themselves are not part of
* the public API and may change, break or disappear at any time.
*/
#ifndef FF_API_SWS_CPU_CAPS
#define FF_API_SWS_CPU_CAPS (LIBSWSCALE_VERSION_MAJOR < 4)
#endif
#ifndef FF_API_ARCH_BFIN
#define FF_API_ARCH_BFIN (LIBSWSCALE_VERSION_MAJOR < 4)
#endif
#endif /* SWSCALE_VERSION_H */


@@ -0,0 +1,315 @@
/*
* Copyright (C) 2015 Pedro Arthur <bygrandao@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "swscale_internal.h"
typedef struct VScalerContext
{
uint16_t *filter[2];
int32_t *filter_pos;
int filter_size;
int isMMX;
void *pfn;
yuv2packedX_fn yuv2packedX;
} VScalerContext;
static int lum_planar_vscale(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
{
VScalerContext *inst = desc->instance;
int dstW = desc->dst->width;
int first = FFMAX(1-inst->filter_size, inst->filter_pos[sliceY]);
int sp = first - desc->src->plane[0].sliceY;
int dp = sliceY - desc->dst->plane[0].sliceY;
uint8_t **src = desc->src->plane[0].line + sp;
uint8_t **dst = desc->dst->plane[0].line + dp;
uint16_t *filter = inst->filter[0] + (inst->isMMX ? 0 : sliceY * inst->filter_size);
if (inst->filter_size == 1)
((yuv2planar1_fn)inst->pfn)((const int16_t*)src[0], dst[0], dstW, c->lumDither8, 0);
else
((yuv2planarX_fn)inst->pfn)(filter, inst->filter_size, (const int16_t**)src, dst[0], dstW, c->lumDither8, 0);
if (desc->alpha) {
int sp = first - desc->src->plane[3].sliceY;
int dp = sliceY - desc->dst->plane[3].sliceY;
uint8_t **src = desc->src->plane[3].line + sp;
uint8_t **dst = desc->dst->plane[3].line + dp;
uint16_t *filter = inst->filter[1] + (inst->isMMX ? 0 : sliceY * inst->filter_size);
if (inst->filter_size == 1)
((yuv2planar1_fn)inst->pfn)((const int16_t*)src[0], dst[0], dstW, c->lumDither8, 0);
else
((yuv2planarX_fn)inst->pfn)(filter, inst->filter_size, (const int16_t**)src, dst[0], dstW, c->lumDither8, 0);
}
return 1;
}
static int chr_planar_vscale(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
{
const int chrSkipMask = (1 << desc->dst->v_chr_sub_sample) - 1;
if (sliceY & chrSkipMask)
return 0;
else {
VScalerContext *inst = desc->instance;
int dstW = FF_CEIL_RSHIFT(desc->dst->width, desc->dst->h_chr_sub_sample);
int chrSliceY = sliceY >> desc->dst->v_chr_sub_sample;
int first = FFMAX(1-inst->filter_size, inst->filter_pos[chrSliceY]);
int sp1 = first - desc->src->plane[1].sliceY;
int sp2 = first - desc->src->plane[2].sliceY;
int dp1 = chrSliceY - desc->dst->plane[1].sliceY;
int dp2 = chrSliceY - desc->dst->plane[2].sliceY;
uint8_t **src1 = desc->src->plane[1].line + sp1;
uint8_t **src2 = desc->src->plane[2].line + sp2;
uint8_t **dst1 = desc->dst->plane[1].line + dp1;
uint8_t **dst2 = desc->dst->plane[2].line + dp2;
uint16_t *filter = inst->filter[0] + (inst->isMMX ? 0 : chrSliceY * inst->filter_size);
if (c->yuv2nv12cX) {
((yuv2interleavedX_fn)inst->pfn)(c, filter, inst->filter_size, (const int16_t**)src1, (const int16_t**)src2, dst1[0], dstW);
} else if (inst->filter_size == 1) {
((yuv2planar1_fn)inst->pfn)((const int16_t*)src1[0], dst1[0], dstW, c->chrDither8, 0);
((yuv2planar1_fn)inst->pfn)((const int16_t*)src2[0], dst2[0], dstW, c->chrDither8, 3);
} else {
((yuv2planarX_fn)inst->pfn)(filter, inst->filter_size, (const int16_t**)src1, dst1[0], dstW, c->chrDither8, 0);
((yuv2planarX_fn)inst->pfn)(filter, inst->filter_size, (const int16_t**)src2, dst2[0], dstW, c->chrDither8, inst->isMMX ? (c->uv_offx2 >> 1) : 3);
}
}
return 1;
}
static int packed_vscale(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
{
VScalerContext *inst = desc->instance;
int dstW = desc->dst->width;
int chrSliceY = sliceY >> desc->dst->v_chr_sub_sample;
int lum_fsize = inst[0].filter_size;
int chr_fsize = inst[1].filter_size;
uint16_t *lum_filter = inst[0].filter[0];
uint16_t *chr_filter = inst[1].filter[0];
int firstLum = FFMAX(1-lum_fsize, inst[0].filter_pos[chrSliceY]);
int firstChr = FFMAX(1-chr_fsize, inst[1].filter_pos[chrSliceY]);
int sp0 = firstLum - desc->src->plane[0].sliceY;
int sp1 = firstChr - desc->src->plane[1].sliceY;
int sp2 = firstChr - desc->src->plane[2].sliceY;
int sp3 = firstLum - desc->src->plane[3].sliceY;
int dp = sliceY - desc->dst->plane[0].sliceY;
uint8_t **src0 = desc->src->plane[0].line + sp0;
uint8_t **src1 = desc->src->plane[1].line + sp1;
uint8_t **src2 = desc->src->plane[2].line + sp2;
uint8_t **src3 = desc->alpha ? desc->src->plane[3].line + sp3 : NULL;
uint8_t **dst = desc->dst->plane[0].line + dp;
if (c->yuv2packed1 && lum_fsize == 1 && chr_fsize == 1) { // unscaled RGB
((yuv2packed1_fn)inst->pfn)(c, (const int16_t*)*src0, (const int16_t**)src1, (const int16_t**)src2,
(const int16_t*)(desc->alpha ? *src3 : NULL), *dst, dstW, 0, sliceY);
} else if (c->yuv2packed1 && lum_fsize == 1 && chr_fsize == 2 &&
chr_filter[2 * sliceY + 1] + chr_filter[2 * chrSliceY] == 4096 &&
chr_filter[2 * sliceY + 1] <= 4096U) { // unscaled RGB
int chrAlpha = chr_filter[2 * sliceY + 1];
((yuv2packed1_fn)inst->pfn)(c, (const int16_t*)*src0, (const int16_t**)src1, (const int16_t**)src2,
(const int16_t*)(desc->alpha ? *src3 : NULL), *dst, dstW, chrAlpha, sliceY);
} else if (c->yuv2packed2 && lum_fsize == 2 && chr_fsize == 2 &&
lum_filter[2 * sliceY + 1] + lum_filter[2 * sliceY] == 4096 &&
lum_filter[2 * sliceY + 1] <= 4096U &&
chr_filter[2 * chrSliceY + 1] + chr_filter[2 * chrSliceY] == 4096 &&
chr_filter[2 * chrSliceY + 1] <= 4096U
) { // bilinear upscale RGB
int lumAlpha = lum_filter[2 * sliceY + 1];
int chrAlpha = chr_filter[2 * sliceY + 1];
c->lumMmxFilter[2] =
c->lumMmxFilter[3] = lum_filter[2 * sliceY] * 0x10001;
c->chrMmxFilter[2] =
c->chrMmxFilter[3] = chr_filter[2 * chrSliceY] * 0x10001;
((yuv2packed2_fn)inst->pfn)(c, (const int16_t**)src0, (const int16_t**)src1, (const int16_t**)src2, (const int16_t**)src3,
*dst, dstW, lumAlpha, chrAlpha, sliceY);
} else { // general RGB
if ((c->yuv2packed1 && lum_fsize == 1 && chr_fsize == 2) ||
(c->yuv2packed2 && lum_fsize == 2 && chr_fsize == 2)) {
if (!c->warned_unuseable_bilinear)
av_log(c, AV_LOG_INFO, "Optimized 2 tap filter code cannot be used\n");
c->warned_unuseable_bilinear = 1;
}
inst->yuv2packedX(c, lum_filter + sliceY * lum_fsize,
(const int16_t**)src0, lum_fsize, chr_filter + sliceY * chr_fsize,
(const int16_t**)src1, (const int16_t**)src2, chr_fsize, (const int16_t**)src3, *dst, dstW, sliceY);
}
return 1;
}
static int any_vscale(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
{
VScalerContext *inst = desc->instance;
int dstW = desc->dst->width;
int chrSliceY = sliceY >> desc->dst->v_chr_sub_sample;
int lum_fsize = inst[0].filter_size;
int chr_fsize = inst[1].filter_size;
uint16_t *lum_filter = inst[0].filter[0];
uint16_t *chr_filter = inst[1].filter[0];
int firstLum = FFMAX(1-lum_fsize, inst[0].filter_pos[chrSliceY]);
int firstChr = FFMAX(1-chr_fsize, inst[1].filter_pos[chrSliceY]);
int sp0 = firstLum - desc->src->plane[0].sliceY;
int sp1 = firstChr - desc->src->plane[1].sliceY;
int sp2 = firstChr - desc->src->plane[2].sliceY;
int sp3 = firstLum - desc->src->plane[3].sliceY;
int dp0 = sliceY - desc->dst->plane[0].sliceY;
int dp1 = chrSliceY - desc->dst->plane[1].sliceY;
int dp2 = chrSliceY - desc->dst->plane[2].sliceY;
int dp3 = sliceY - desc->dst->plane[3].sliceY;
uint8_t **src0 = desc->src->plane[0].line + sp0;
uint8_t **src1 = desc->src->plane[1].line + sp1;
uint8_t **src2 = desc->src->plane[2].line + sp2;
uint8_t **src3 = desc->alpha ? desc->src->plane[3].line + sp3 : NULL;
uint8_t *dst[4] = { desc->dst->plane[0].line[dp0],
desc->dst->plane[1].line[dp1],
desc->dst->plane[2].line[dp2],
desc->alpha ? desc->dst->plane[3].line[dp3] : NULL };
av_assert1(!c->yuv2packed1 && !c->yuv2packed2);
((yuv2anyX_fn)inst->pfn)(c, lum_filter + sliceY * lum_fsize,
(const int16_t**)src0, lum_fsize, chr_filter + sliceY * chr_fsize,
(const int16_t**)src1, (const int16_t**)src2, chr_fsize, (const int16_t**)src3, dst, dstW, sliceY);
return 1;
}
int ff_init_vscale(SwsContext *c, SwsFilterDescriptor *desc, SwsSlice *src, SwsSlice *dst)
{
VScalerContext *lumCtx = NULL;
VScalerContext *chrCtx = NULL;
if (isPlanarYUV(c->dstFormat) || (isGray(c->dstFormat) && !isALPHA(c->dstFormat))) {
lumCtx = av_mallocz(sizeof(VScalerContext));
if (!lumCtx)
return AVERROR(ENOMEM);
desc[0].process = lum_planar_vscale;
desc[0].instance = lumCtx;
desc[0].src = src;
desc[0].dst = dst;
desc[0].alpha = c->alpPixBuf != 0;
if (!isGray(c->dstFormat)) {
chrCtx = av_mallocz(sizeof(VScalerContext));
if (!chrCtx)
return AVERROR(ENOMEM);
desc[1].process = chr_planar_vscale;
desc[1].instance = chrCtx;
desc[1].src = src;
desc[1].dst = dst;
}
} else {
lumCtx = av_mallocz_array(sizeof(VScalerContext), 2);
if (!lumCtx)
return AVERROR(ENOMEM);
chrCtx = &lumCtx[1];
desc[0].process = c->yuv2packedX ? packed_vscale : any_vscale;
desc[0].instance = lumCtx;
desc[0].src = src;
desc[0].dst = dst;
desc[0].alpha = c->alpPixBuf != 0;
}
ff_init_vscale_pfn(c, c->yuv2plane1, c->yuv2planeX, c->yuv2nv12cX,
c->yuv2packed1, c->yuv2packed2, c->yuv2packedX, c->yuv2anyX, c->use_mmx_vfilter);
return 0;
}
void ff_init_vscale_pfn(SwsContext *c,
yuv2planar1_fn yuv2plane1,
yuv2planarX_fn yuv2planeX,
yuv2interleavedX_fn yuv2nv12cX,
yuv2packed1_fn yuv2packed1,
yuv2packed2_fn yuv2packed2,
yuv2packedX_fn yuv2packedX,
yuv2anyX_fn yuv2anyX, int use_mmx)
{
VScalerContext *lumCtx = NULL;
VScalerContext *chrCtx = NULL;
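/* The vertical scaler is the last descriptor in the chain, or the second
 * to last when an internal gamma step was appended after it. */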
int idx = c->numDesc - (c->is_internal_gamma ? 2 : 1);
if (isPlanarYUV(c->dstFormat) || (isGray(c->dstFormat) && !isALPHA(c->dstFormat))) {
if (!isGray(c->dstFormat)) {
chrCtx = c->desc[idx].instance;
chrCtx->filter[0] = use_mmx ? (int16_t*)c->chrMmxFilter : c->vChrFilter;
chrCtx->filter_size = c->vChrFilterSize;
chrCtx->filter_pos = c->vChrFilterPos;
chrCtx->isMMX = use_mmx;
--idx;
if (yuv2nv12cX) chrCtx->pfn = yuv2nv12cX;
else if (c->vChrFilterSize == 1) chrCtx->pfn = yuv2plane1;
else chrCtx->pfn = yuv2planeX;
}
lumCtx = c->desc[idx].instance;
lumCtx->filter[0] = use_mmx ? (int16_t*)c->lumMmxFilter : c->vLumFilter;
lumCtx->filter[1] = use_mmx ? (int16_t*)c->alpMmxFilter : c->vLumFilter;
lumCtx->filter_size = c->vLumFilterSize;
lumCtx->filter_pos = c->vLumFilterPos;
lumCtx->isMMX = use_mmx;
if (c->vLumFilterSize == 1) lumCtx->pfn = yuv2plane1;
else lumCtx->pfn = yuv2planeX;
} else {
lumCtx = c->desc[idx].instance;
chrCtx = &lumCtx[1];
lumCtx->filter[0] = c->vLumFilter;
lumCtx->filter_size = c->vLumFilterSize;
lumCtx->filter_pos = c->vLumFilterPos;
chrCtx->filter[0] = c->vChrFilter;
chrCtx->filter_size = c->vChrFilterSize;
chrCtx->filter_pos = c->vChrFilterPos;
lumCtx->isMMX = use_mmx;
chrCtx->isMMX = use_mmx;
if (yuv2packedX) {
if (c->yuv2packed1 && c->vLumFilterSize == 1 && c->vChrFilterSize <= 2)
lumCtx->pfn = yuv2packed1;
else if (c->yuv2packed2 && c->vLumFilterSize == 2 && c->vChrFilterSize == 2)
lumCtx->pfn = yuv2packed2;
lumCtx->yuv2packedX = yuv2packedX;
} else
lumCtx->pfn = yuv2anyX;
}
}


@@ -0,0 +1,13 @@
$(SUBDIR)x86/swscale_mmx.o: CFLAGS += $(NOREDZONE_FLAGS)
OBJS += x86/rgb2rgb.o \
x86/swscale.o \
x86/yuv2rgb.o \
MMX-OBJS += x86/hscale_fast_bilinear_simd.o \
OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
YASM-OBJS += x86/input.o \
x86/output.o \
x86/scale.o \


@@ -0,0 +1,359 @@
/*
* Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "../swscale_internal.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#define RET 0xC3 // near return opcode for x86
#define PREFETCH "prefetchnta"
#if HAVE_INLINE_ASM
av_cold int ff_init_hscaler_mmxext(int dstW, int xInc, uint8_t *filterCode,
int16_t *filter, int32_t *filterPos,
int numSplits)
{
uint8_t *fragmentA;
x86_reg imm8OfPShufW1A;
x86_reg imm8OfPShufW2A;
x86_reg fragmentLengthA;
uint8_t *fragmentB;
x86_reg imm8OfPShufW1B;
x86_reg imm8OfPShufW2B;
x86_reg fragmentLengthB;
int fragmentPos;
int xpos, i;
// create an optimized horizontal scaling routine
/* This scaler is made of runtime-generated MMXEXT code using specially tuned
* pshufw instructions. For every four output pixels, if four input pixels
* are enough for the fast bilinear scaling, then a chunk of fragmentB is
* used. If five input pixels are needed, then a chunk of fragmentA is used.
*/
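/* Per output pixel each fragment computes, roughly,
 *     dst[i] = src[xx] * f + src[xx + 1] * (128 - f)
 * with the 7-bit weight f = ((xpos & 0xFFFF) ^ 0xFFFF) >> 9 taken from
 * filter[] below, i.e. fixed-point bilinear interpolation yielding
 * 15-bit intermediates; the pshufw immediates patched at runtime choose
 * which of the loaded input pixels feed each of the four outputs. */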
// code fragment
__asm__ volatile (
"jmp 9f \n\t"
// Begin
"0: \n\t"
"movq (%%"REG_d", %%"REG_a"), %%mm3 \n\t"
"movd (%%"REG_c", %%"REG_S"), %%mm0 \n\t"
"movd 1(%%"REG_c", %%"REG_S"), %%mm1 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"pshufw $0xFF, %%mm1, %%mm1 \n\t"
"1: \n\t"
"pshufw $0xFF, %%mm0, %%mm0 \n\t"
"2: \n\t"
"psubw %%mm1, %%mm0 \n\t"
"movl 8(%%"REG_b", %%"REG_a"), %%esi \n\t"
"pmullw %%mm3, %%mm0 \n\t"
"psllw $7, %%mm1 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"movq %%mm0, (%%"REG_D", %%"REG_a") \n\t"
"add $8, %%"REG_a" \n\t"
// End
"9: \n\t"
"lea " LOCAL_MANGLE(0b) ", %0 \n\t"
"lea " LOCAL_MANGLE(1b) ", %1 \n\t"
"lea " LOCAL_MANGLE(2b) ", %2 \n\t"
"dec %1 \n\t"
"dec %2 \n\t"
"sub %0, %1 \n\t"
"sub %0, %2 \n\t"
"lea " LOCAL_MANGLE(9b) ", %3 \n\t"
"sub %0, %3 \n\t"
: "=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
"=r" (fragmentLengthA)
);
__asm__ volatile (
"jmp 9f \n\t"
// Begin
"0: \n\t"
"movq (%%"REG_d", %%"REG_a"), %%mm3 \n\t"
"movd (%%"REG_c", %%"REG_S"), %%mm0 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"pshufw $0xFF, %%mm0, %%mm1 \n\t"
"1: \n\t"
"pshufw $0xFF, %%mm0, %%mm0 \n\t"
"2: \n\t"
"psubw %%mm1, %%mm0 \n\t"
"movl 8(%%"REG_b", %%"REG_a"), %%esi \n\t"
"pmullw %%mm3, %%mm0 \n\t"
"psllw $7, %%mm1 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"movq %%mm0, (%%"REG_D", %%"REG_a") \n\t"
"add $8, %%"REG_a" \n\t"
// End
"9: \n\t"
"lea " LOCAL_MANGLE(0b) ", %0 \n\t"
"lea " LOCAL_MANGLE(1b) ", %1 \n\t"
"lea " LOCAL_MANGLE(2b) ", %2 \n\t"
"dec %1 \n\t"
"dec %2 \n\t"
"sub %0, %1 \n\t"
"sub %0, %2 \n\t"
"lea " LOCAL_MANGLE(9b) ", %3 \n\t"
"sub %0, %3 \n\t"
: "=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
"=r" (fragmentLengthB)
);
xpos = 0; // lumXInc/2 - 0x8000; // difference between pixel centers
fragmentPos = 0;
for (i = 0; i < dstW / numSplits; i++) {
int xx = xpos >> 16;
if ((i & 3) == 0) {
int a = 0;
int b = ((xpos + xInc) >> 16) - xx;
int c = ((xpos + xInc * 2) >> 16) - xx;
int d = ((xpos + xInc * 3) >> 16) - xx;
int inc = (d + 1 < 4);
uint8_t *fragment = inc ? fragmentB : fragmentA;
x86_reg imm8OfPShufW1 = inc ? imm8OfPShufW1B : imm8OfPShufW1A;
x86_reg imm8OfPShufW2 = inc ? imm8OfPShufW2B : imm8OfPShufW2A;
x86_reg fragmentLength = inc ? fragmentLengthB : fragmentLengthA;
int maxShift = 3 - (d + inc);
int shift = 0;
if (filterCode) {
filter[i] = ((xpos & 0xFFFF) ^ 0xFFFF) >> 9;
filter[i + 1] = (((xpos + xInc) & 0xFFFF) ^ 0xFFFF) >> 9;
filter[i + 2] = (((xpos + xInc * 2) & 0xFFFF) ^ 0xFFFF) >> 9;
filter[i + 3] = (((xpos + xInc * 3) & 0xFFFF) ^ 0xFFFF) >> 9;
filterPos[i / 2] = xx;
memcpy(filterCode + fragmentPos, fragment, fragmentLength);
filterCode[fragmentPos + imm8OfPShufW1] = (a + inc) |
((b + inc) << 2) |
((c + inc) << 4) |
((d + inc) << 6);
filterCode[fragmentPos + imm8OfPShufW2] = a | (b << 2) |
(c << 4) |
(d << 6);
if (i + 4 - inc >= dstW)
shift = maxShift; // avoid overread
else if ((filterPos[i / 2] & 3) <= maxShift)
shift = filterPos[i / 2] & 3; // align
if (shift && i >= shift) {
filterCode[fragmentPos + imm8OfPShufW1] += 0x55 * shift;
filterCode[fragmentPos + imm8OfPShufW2] += 0x55 * shift;
filterPos[i / 2] -= shift;
}
}
fragmentPos += fragmentLength;
if (filterCode)
filterCode[fragmentPos] = RET;
}
xpos += xInc;
}
if (filterCode)
filterPos[((i / 2) + 1) & (~1)] = xpos >> 16; // needed to jump to the next part
return fragmentPos + 1;
}
void ff_hyscale_fast_mmxext(SwsContext *c, int16_t *dst,
int dstWidth, const uint8_t *src,
int srcW, int xInc)
{
int32_t *filterPos = c->hLumFilterPos;
int16_t *filter = c->hLumFilter;
void *mmxextFilterCode = c->lumMmxextFilterCode;
int i;
#if ARCH_X86_64
uint64_t retsave;
#else
#if defined(PIC)
uint64_t ebxsave;
#endif
#endif
__asm__ volatile(
#if ARCH_X86_64
"mov -8(%%rsp), %%"REG_a" \n\t"
"mov %%"REG_a", %5 \n\t" // retsave
#else
#if defined(PIC)
"mov %%"REG_b", %5 \n\t" // ebxsave
#endif
#endif
"pxor %%mm7, %%mm7 \n\t"
"mov %0, %%"REG_c" \n\t"
"mov %1, %%"REG_D" \n\t"
"mov %2, %%"REG_d" \n\t"
"mov %3, %%"REG_b" \n\t"
"xor %%"REG_a", %%"REG_a" \n\t" // i
PREFETCH" (%%"REG_c") \n\t"
PREFETCH" 32(%%"REG_c") \n\t"
PREFETCH" 64(%%"REG_c") \n\t"
#if ARCH_X86_64
#define CALL_MMXEXT_FILTER_CODE \
"movl (%%"REG_b"), %%esi \n\t"\
"call *%4 \n\t"\
"movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
"add %%"REG_S", %%"REG_c" \n\t"\
"add %%"REG_a", %%"REG_D" \n\t"\
"xor %%"REG_a", %%"REG_a" \n\t"\
#else
#define CALL_MMXEXT_FILTER_CODE \
"movl (%%"REG_b"), %%esi \n\t"\
"call *%4 \n\t"\
"addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
"add %%"REG_a", %%"REG_D" \n\t"\
"xor %%"REG_a", %%"REG_a" \n\t"\
#endif /* ARCH_X86_64 */
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
#if ARCH_X86_64
"mov %5, %%"REG_a" \n\t"
"mov %%"REG_a", -8(%%rsp) \n\t"
#else
#if defined(PIC)
"mov %5, %%"REG_b" \n\t"
#endif
#endif
:: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
"m" (mmxextFilterCode)
#if ARCH_X86_64
,"m"(retsave)
#else
#if defined(PIC)
,"m" (ebxsave)
#endif
#endif
: "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if ARCH_X86_64 || !defined(PIC)
,"%"REG_b
#endif
);
for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
dst[i] = src[srcW-1]*128;
}
void ff_hcscale_fast_mmxext(SwsContext *c, int16_t *dst1, int16_t *dst2,
int dstWidth, const uint8_t *src1,
const uint8_t *src2, int srcW, int xInc)
{
int32_t *filterPos = c->hChrFilterPos;
int16_t *filter = c->hChrFilter;
void *mmxextFilterCode = c->chrMmxextFilterCode;
int i;
#if ARCH_X86_64
DECLARE_ALIGNED(8, uint64_t, retsave);
#else
#if defined(PIC)
DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
#endif
__asm__ volatile(
#if ARCH_X86_64
"mov -8(%%rsp), %%"REG_a" \n\t"
"mov %%"REG_a", %7 \n\t" // retsave
#else
#if defined(PIC)
"mov %%"REG_b", %7 \n\t" // ebxsave
#endif
#endif
"pxor %%mm7, %%mm7 \n\t"
"mov %0, %%"REG_c" \n\t"
"mov %1, %%"REG_D" \n\t"
"mov %2, %%"REG_d" \n\t"
"mov %3, %%"REG_b" \n\t"
"xor %%"REG_a", %%"REG_a" \n\t" // i
PREFETCH" (%%"REG_c") \n\t"
PREFETCH" 32(%%"REG_c") \n\t"
PREFETCH" 64(%%"REG_c") \n\t"
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
"xor %%"REG_a", %%"REG_a" \n\t" // i
"mov %5, %%"REG_c" \n\t" // src2
"mov %6, %%"REG_D" \n\t" // dst2
PREFETCH" (%%"REG_c") \n\t"
PREFETCH" 32(%%"REG_c") \n\t"
PREFETCH" 64(%%"REG_c") \n\t"
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
#if ARCH_X86_64
"mov %7, %%"REG_a" \n\t"
"mov %%"REG_a", -8(%%rsp) \n\t"
#else
#if defined(PIC)
"mov %7, %%"REG_b" \n\t"
#endif
#endif
:: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
"m" (mmxextFilterCode), "m" (src2), "m"(dst2)
#if ARCH_X86_64
,"m"(retsave)
#else
#if defined(PIC)
,"m" (ebxsave)
#endif
#endif
: "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if ARCH_X86_64 || !defined(PIC)
,"%"REG_b
#endif
);
for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
dst1[i] = src1[srcW-1]*128;
dst2[i] = src2[srcW-1]*128;
}
}
#endif //HAVE_INLINE_ASM


@@ -0,0 +1,740 @@
;******************************************************************************
;* x86-optimized input routines; these shuffle packed
;* YUV formats into individual planes and also convert
;* RGB into YUV planes.
;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
%define RY 0x20DE
%define GY 0x4087
%define BY 0x0C88
%define RU 0xECFF
%define GU 0xDAC8
%define BU 0x3838
%define RV 0x3838
%define GV 0xD0E3
%define BV 0xF6E4
rgb_Yrnd: times 4 dd 0x80100 ; (16 << 15) + (1 << 8): +16 luma offset plus rounding for the final >> 9
rgb_UVrnd: times 4 dd 0x400100 ; (128 << 15) + (1 << 8): +128 chroma offset plus rounding for the final >> 9
%define bgr_Ycoeff_12x4 16*4 + 16* 0 + tableq
%define bgr_Ycoeff_3x56 16*4 + 16* 1 + tableq
%define rgb_Ycoeff_12x4 16*4 + 16* 2 + tableq
%define rgb_Ycoeff_3x56 16*4 + 16* 3 + tableq
%define bgr_Ucoeff_12x4 16*4 + 16* 4 + tableq
%define bgr_Ucoeff_3x56 16*4 + 16* 5 + tableq
%define rgb_Ucoeff_12x4 16*4 + 16* 6 + tableq
%define rgb_Ucoeff_3x56 16*4 + 16* 7 + tableq
%define bgr_Vcoeff_12x4 16*4 + 16* 8 + tableq
%define bgr_Vcoeff_3x56 16*4 + 16* 9 + tableq
%define rgb_Vcoeff_12x4 16*4 + 16*10 + tableq
%define rgb_Vcoeff_3x56 16*4 + 16*11 + tableq
%define rgba_Ycoeff_rb 16*4 + 16*12 + tableq
%define rgba_Ycoeff_br 16*4 + 16*13 + tableq
%define rgba_Ycoeff_ga 16*4 + 16*14 + tableq
%define rgba_Ycoeff_ag 16*4 + 16*15 + tableq
%define rgba_Ucoeff_rb 16*4 + 16*16 + tableq
%define rgba_Ucoeff_br 16*4 + 16*17 + tableq
%define rgba_Ucoeff_ga 16*4 + 16*18 + tableq
%define rgba_Ucoeff_ag 16*4 + 16*19 + tableq
%define rgba_Vcoeff_rb 16*4 + 16*20 + tableq
%define rgba_Vcoeff_br 16*4 + 16*21 + tableq
%define rgba_Vcoeff_ga 16*4 + 16*22 + tableq
%define rgba_Vcoeff_ag 16*4 + 16*23 + tableq
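; The coefficient blocks defined above are offsets relative to tableq,
; a per-context coefficient table passed as the last argument of each
; function below; the commented-out rows that follow document the
; table's expected layout (the RY..BV values are, approximately, the
; BT.601 limited-range coefficients in 15-bit fixed point).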
; bgr_Ycoeff_12x4: times 2 dw BY, GY, 0, BY
; bgr_Ycoeff_3x56: times 2 dw RY, 0, GY, RY
; rgb_Ycoeff_12x4: times 2 dw RY, GY, 0, RY
; rgb_Ycoeff_3x56: times 2 dw BY, 0, GY, BY
; bgr_Ucoeff_12x4: times 2 dw BU, GU, 0, BU
; bgr_Ucoeff_3x56: times 2 dw RU, 0, GU, RU
; rgb_Ucoeff_12x4: times 2 dw RU, GU, 0, RU
; rgb_Ucoeff_3x56: times 2 dw BU, 0, GU, BU
; bgr_Vcoeff_12x4: times 2 dw BV, GV, 0, BV
; bgr_Vcoeff_3x56: times 2 dw RV, 0, GV, RV
; rgb_Vcoeff_12x4: times 2 dw RV, GV, 0, RV
; rgb_Vcoeff_3x56: times 2 dw BV, 0, GV, BV
; rgba_Ycoeff_rb: times 4 dw RY, BY
; rgba_Ycoeff_br: times 4 dw BY, RY
; rgba_Ycoeff_ga: times 4 dw GY, 0
; rgba_Ycoeff_ag: times 4 dw 0, GY
; rgba_Ucoeff_rb: times 4 dw RU, BU
; rgba_Ucoeff_br: times 4 dw BU, RU
; rgba_Ucoeff_ga: times 4 dw GU, 0
; rgba_Ucoeff_ag: times 4 dw 0, GU
; rgba_Vcoeff_rb: times 4 dw RV, BV
; rgba_Vcoeff_br: times 4 dw BV, RV
; rgba_Vcoeff_ga: times 4 dw GV, 0
; rgba_Vcoeff_ag: times 4 dw 0, GV
shuf_rgb_12x4: db 0, 0x80, 1, 0x80, 2, 0x80, 3, 0x80, \
6, 0x80, 7, 0x80, 8, 0x80, 9, 0x80
shuf_rgb_3x56: db 2, 0x80, 3, 0x80, 4, 0x80, 5, 0x80, \
8, 0x80, 9, 0x80, 10, 0x80, 11, 0x80
SECTION .text
;-----------------------------------------------------------------------------
; RGB to Y/UV.
;
; void <fmt>ToY_<opt>(uint8_t *dst, const uint8_t *src, int w);
; and
; void <fmt>toUV_<opt>(uint8_t *dstU, uint8_t *dstV, const uint8_t *src,
; const uint8_t *unused, int w);
;-----------------------------------------------------------------------------
; %1 = nr. of XMM registers
; %2 = rgb or bgr
%macro RGB24_TO_Y_FN 2-3
cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, table
%if mmsize == 8
mova m5, [%2_Ycoeff_12x4]
mova m6, [%2_Ycoeff_3x56]
%define coeff1 m5
%define coeff2 m6
%elif ARCH_X86_64
mova m8, [%2_Ycoeff_12x4]
mova m9, [%2_Ycoeff_3x56]
%define coeff1 m8
%define coeff2 m9
%else ; x86-32 && mmsize == 16
%define coeff1 [%2_Ycoeff_12x4]
%define coeff2 [%2_Ycoeff_3x56]
%endif ; x86-32/64 && mmsize == 8/16
%if (ARCH_X86_64 || mmsize == 8) && %0 == 3
jmp mangle(private_prefix %+ _ %+ %3 %+ 24ToY %+ SUFFIX).body
%else ; !((ARCH_X86_64 || mmsize == 8) && %0 == 3)
.body:
%if cpuflag(ssse3)
mova m7, [shuf_rgb_12x4]
%define shuf_rgb1 m7
%if ARCH_X86_64
mova m10, [shuf_rgb_3x56]
%define shuf_rgb2 m10
%else ; x86-32
%define shuf_rgb2 [shuf_rgb_3x56]
%endif ; x86-32/64
%endif ; cpuflag(ssse3)
%if ARCH_X86_64
movsxd wq, wd
%endif
add wq, wq
add dstq, wq
neg wq
%if notcpuflag(ssse3)
pxor m7, m7
%endif ; !cpuflag(ssse3)
mova m4, [rgb_Yrnd]
.loop:
%if cpuflag(ssse3)
movu m0, [srcq+0] ; (byte) { Bx, Gx, Rx }[0-3]
movu m2, [srcq+12] ; (byte) { Bx, Gx, Rx }[4-7]
pshufb m1, m0, shuf_rgb2 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
pshufb m0, shuf_rgb1 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
pshufb m3, m2, shuf_rgb2 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
pshufb m2, shuf_rgb1 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
%else ; !cpuflag(ssse3)
movd m0, [srcq+0] ; (byte) { B0, G0, R0, B1 }
movd m1, [srcq+2] ; (byte) { R0, B1, G1, R1 }
movd m2, [srcq+6] ; (byte) { B2, G2, R2, B3 }
movd m3, [srcq+8] ; (byte) { R2, B3, G3, R3 }
%if mmsize == 16 ; i.e. sse2
punpckldq m0, m2 ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 }
punpckldq m1, m3 ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 }
movd m2, [srcq+12] ; (byte) { B4, G4, R4, B5 }
movd m3, [srcq+14] ; (byte) { R4, B5, G5, R5 }
movd m5, [srcq+18] ; (byte) { B6, G6, R6, B7 }
movd m6, [srcq+20] ; (byte) { R6, B7, G7, R7 }
punpckldq m2, m5 ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 }
punpckldq m3, m6 ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 }
%endif ; mmsize == 16
punpcklbw m0, m7 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
punpcklbw m1, m7 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
punpcklbw m2, m7 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
punpcklbw m3, m7 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
%endif ; cpuflag(ssse3)
add srcq, 3 * mmsize / 2
pmaddwd m0, coeff1 ; (dword) { B0*BY + G0*GY, B1*BY, B2*BY + G2*GY, B3*BY }
pmaddwd m1, coeff2 ; (dword) { R0*RY, G1*GY + R1*RY, R2*RY, G3*GY + R3*RY }
pmaddwd m2, coeff1 ; (dword) { B4*BY + G4*GY, B5*BY, B6*BY + G6*GY, B7*BY }
pmaddwd m3, coeff2 ; (dword) { R4*RY, G5*GY + R5*RY, R6*RY, G7*GY + R7*RY }
paddd m0, m1 ; (dword) { Bx*BY + Gx*GY + Rx*RY }[0-3]
paddd m2, m3 ; (dword) { Bx*BY + Gx*GY + Rx*RY }[4-7]
paddd m0, m4 ; += rgb_Yrnd, i.e. (dword) { Y[0-3] }
paddd m2, m4 ; += rgb_Yrnd, i.e. (dword) { Y[4-7] }
psrad m0, 9
psrad m2, 9
packssdw m0, m2 ; (word) { Y[0-7] }
mova [dstq+wq], m0
add wq, mmsize
jl .loop
REP_RET
%endif ; (ARCH_X86_64 || mmsize == 8) && %0 == 3
%endmacro
; %1 = nr. of XMM registers
; %2 = rgb or bgr
%macro RGB24_TO_UV_FN 2-3
cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
%if ARCH_X86_64
mova m8, [%2_Ucoeff_12x4]
mova m9, [%2_Ucoeff_3x56]
mova m10, [%2_Vcoeff_12x4]
mova m11, [%2_Vcoeff_3x56]
%define coeffU1 m8
%define coeffU2 m9
%define coeffV1 m10
%define coeffV2 m11
%else ; x86-32
%define coeffU1 [%2_Ucoeff_12x4]
%define coeffU2 [%2_Ucoeff_3x56]
%define coeffV1 [%2_Vcoeff_12x4]
%define coeffV2 [%2_Vcoeff_3x56]
%endif ; x86-32/64
%if ARCH_X86_64 && %0 == 3
jmp mangle(private_prefix %+ _ %+ %3 %+ 24ToUV %+ SUFFIX).body
%else ; ARCH_X86_64 && %0 == 3
.body:
%if cpuflag(ssse3)
mova m7, [shuf_rgb_12x4]
%define shuf_rgb1 m7
%if ARCH_X86_64
mova m12, [shuf_rgb_3x56]
%define shuf_rgb2 m12
%else ; x86-32
%define shuf_rgb2 [shuf_rgb_3x56]
%endif ; x86-32/64
%endif ; cpuflag(ssse3)
%if ARCH_X86_64
movsxd wq, dword r5m
%else ; x86-32
mov wq, r5m
%endif
add wq, wq
add dstUq, wq
add dstVq, wq
neg wq
mova m6, [rgb_UVrnd]
%if notcpuflag(ssse3)
pxor m7, m7
%endif
.loop:
%if cpuflag(ssse3)
movu m0, [srcq+0] ; (byte) { Bx, Gx, Rx }[0-3]
movu m4, [srcq+12] ; (byte) { Bx, Gx, Rx }[4-7]
pshufb m1, m0, shuf_rgb2 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
pshufb m0, shuf_rgb1 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
%else ; !cpuflag(ssse3)
movd m0, [srcq+0] ; (byte) { B0, G0, R0, B1 }
movd m1, [srcq+2] ; (byte) { R0, B1, G1, R1 }
movd m4, [srcq+6] ; (byte) { B2, G2, R2, B3 }
movd m5, [srcq+8] ; (byte) { R2, B3, G3, R3 }
%if mmsize == 16
punpckldq m0, m4 ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 }
punpckldq m1, m5 ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 }
movd m4, [srcq+12] ; (byte) { B4, G4, R4, B5 }
movd m5, [srcq+14] ; (byte) { R4, B5, G5, R5 }
%endif ; mmsize == 16
punpcklbw m0, m7 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
punpcklbw m1, m7 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
%endif ; cpuflag(ssse3)
pmaddwd m2, m0, coeffV1 ; (dword) { B0*BV + G0*GV, B1*BV, B2*BV + G2*GV, B3*BV }
pmaddwd m3, m1, coeffV2 ; (dword) { R0*RV, G1*GV + R1*RV, R2*RV, G3*GV + R3*RV }
pmaddwd m0, coeffU1 ; (dword) { B0*BU + G0*GU, B1*BU, B2*BU + G2*GU, B3*BU }
pmaddwd m1, coeffU2 ; (dword) { R0*RU, G1*GU + R1*RU, R2*RU, G3*GU + R3*RU }
paddd m0, m1 ; (dword) { Bx*BU + Gx*GU + Rx*RU }[0-3]
paddd m2, m3 ; (dword) { Bx*BV + Gx*GV + Rx*RV }[0-3]
%if cpuflag(ssse3)
pshufb m5, m4, shuf_rgb2 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
pshufb m4, shuf_rgb1 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
%else ; !cpuflag(ssse3)
%if mmsize == 16
movd m1, [srcq+18] ; (byte) { B6, G6, R6, B7 }
movd m3, [srcq+20] ; (byte) { R6, B7, G7, R7 }
punpckldq m4, m1 ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 }
punpckldq m5, m3 ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 }
%endif ; mmsize == 16 && !cpuflag(ssse3)
punpcklbw m4, m7 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
punpcklbw m5, m7 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
%endif ; cpuflag(ssse3)
add srcq, 3 * mmsize / 2
pmaddwd m1, m4, coeffU1 ; (dword) { B4*BU + G4*GU, B5*BU, B6*BU + G6*GU, B7*BU }
pmaddwd m3, m5, coeffU2 ; (dword) { R4*RU, G5*GU + R5*RU, R6*RU, G7*GU + R7*RU }
pmaddwd m4, coeffV1 ; (dword) { B4*BV + G4*GV, B5*BV, B6*BV + G6*GV, B7*BV }
pmaddwd m5, coeffV2 ; (dword) { R4*RV, G5*GV + R5*RV, R6*RV, G7*GV + R7*RV }
paddd m1, m3 ; (dword) { Bx*BU + Gx*GU + Rx*RU }[4-7]
paddd m4, m5 ; (dword) { Bx*BV + Gx*GV + Rx*RV }[4-7]
paddd m0, m6 ; += rgb_UVrnd, i.e. (dword) { U[0-3] }
paddd m2, m6 ; += rgb_UVrnd, i.e. (dword) { V[0-3] }
paddd m1, m6 ; += rgb_UVrnd, i.e. (dword) { U[4-7] }
paddd m4, m6 ; += rgb_UVrnd, i.e. (dword) { V[4-7] }
psrad m0, 9
psrad m2, 9
psrad m1, 9
psrad m4, 9
packssdw m0, m1 ; (word) { U[0-7] }
packssdw m2, m4 ; (word) { V[0-7] }
%if mmsize == 8
mova [dstUq+wq], m0
mova [dstVq+wq], m2
%else ; mmsize == 16
mova [dstUq+wq], m0
mova [dstVq+wq], m2
%endif ; mmsize == 8/16
add wq, mmsize
jl .loop
REP_RET
%endif ; ARCH_X86_64 && %0 == 3
%endmacro
; %1 = nr. of XMM registers for rgb-to-Y func
; %2 = nr. of XMM registers for rgb-to-UV func
%macro RGB24_FUNCS 2
RGB24_TO_Y_FN %1, rgb
RGB24_TO_Y_FN %1, bgr, rgb
RGB24_TO_UV_FN %2, rgb
RGB24_TO_UV_FN %2, bgr, rgb
%endmacro
%if ARCH_X86_32
INIT_MMX mmx
RGB24_FUNCS 0, 0
%endif
INIT_XMM sse2
RGB24_FUNCS 10, 12
INIT_XMM ssse3
RGB24_FUNCS 11, 13
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
RGB24_FUNCS 11, 13
%endif
; %1 = nr. of XMM registers
; %2-5 = rgba, bgra, argb or abgr (in individual characters)
%macro RGB32_TO_Y_FN 5-6
cglobal %2%3%4%5 %+ ToY, 6, 6, %1, dst, src, u1, u2, w, table
mova m5, [rgba_Ycoeff_%2%4]
mova m6, [rgba_Ycoeff_%3%5]
%if %0 == 6
jmp mangle(private_prefix %+ _ %+ %6 %+ ToY %+ SUFFIX).body
%else ; %0 == 6
.body:
%if ARCH_X86_64
movsxd wq, wd
%endif
add wq, wq
sub wq, mmsize - 1
lea srcq, [srcq+wq*2]
add dstq, wq
neg wq
mova m4, [rgb_Yrnd]
pcmpeqb m7, m7
psrlw m7, 8 ; (word) { 0x00ff } x4
.loop:
; FIXME check alignment and use mova
movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
movu m2, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
DEINTB 1, 0, 3, 2, 7 ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7]
pmaddwd m1, m5 ; (dword) { Bx*BY + Rx*RY }[0-3]
pmaddwd m0, m6 ; (dword) { Gx*GY }[0-3]
pmaddwd m3, m5 ; (dword) { Bx*BY + Rx*RY }[4-7]
pmaddwd m2, m6 ; (dword) { Gx*GY }[4-7]
paddd m0, m4 ; += rgb_Yrnd
paddd m2, m4 ; += rgb_Yrnd
paddd m0, m1 ; (dword) { Y[0-3] }
paddd m2, m3 ; (dword) { Y[4-7] }
psrad m0, 9
psrad m2, 9
packssdw m0, m2 ; (word) { Y[0-7] }
mova [dstq+wq], m0
add wq, mmsize
jl .loop
sub wq, mmsize - 1
jz .end
add srcq, 2*mmsize - 2
add dstq, mmsize - 1
.loop2:
movd m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
DEINTB 1, 0, 3, 2, 7 ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7]
pmaddwd m1, m5 ; (dword) { Bx*BY + Rx*RY }[0-3]
pmaddwd m0, m6 ; (dword) { Gx*GY }[0-3]
paddd m0, m4 ; += rgb_Yrnd
paddd m0, m1 ; (dword) { Y[0-3] }
psrad m0, 9
packssdw m0, m0 ; (word) { Y[0-7] }
movd [dstq+wq], m0
add wq, 2
jl .loop2
.end:
REP_RET
%endif ; %0 == 6
%endmacro
; %1 = nr. of XMM registers
; %2-5 = rgba, bgra, argb or abgr (in individual characters)
%macro RGB32_TO_UV_FN 5-6
cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
%if ARCH_X86_64
mova m8, [rgba_Ucoeff_%2%4]
mova m9, [rgba_Ucoeff_%3%5]
mova m10, [rgba_Vcoeff_%2%4]
mova m11, [rgba_Vcoeff_%3%5]
%define coeffU1 m8
%define coeffU2 m9
%define coeffV1 m10
%define coeffV2 m11
%else ; x86-32
%define coeffU1 [rgba_Ucoeff_%2%4]
%define coeffU2 [rgba_Ucoeff_%3%5]
%define coeffV1 [rgba_Vcoeff_%2%4]
%define coeffV2 [rgba_Vcoeff_%3%5]
%endif ; x86-64/32
%if ARCH_X86_64 && %0 == 6
jmp mangle(private_prefix %+ _ %+ %6 %+ ToUV %+ SUFFIX).body
%else ; ARCH_X86_64 && %0 == 6
.body:
%if ARCH_X86_64
movsxd wq, dword r5m
%else ; x86-32
mov wq, r5m
%endif
add wq, wq
sub wq, mmsize - 1
add dstUq, wq
add dstVq, wq
lea srcq, [srcq+wq*2]
neg wq
pcmpeqb m7, m7
psrlw m7, 8 ; (word) { 0x00ff } x4
mova m6, [rgb_UVrnd]
.loop:
; FIXME check alignment and use mova
movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
movu m4, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
DEINTB 1, 0, 5, 4, 7 ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7]
pmaddwd m3, m1, coeffV1 ; (dword) { Bx*BV + Rx*RV }[0-3]
pmaddwd m2, m0, coeffV2 ; (dword) { Gx*GV }[0-3]
pmaddwd m1, coeffU1 ; (dword) { Bx*BU + Rx*RU }[0-3]
pmaddwd m0, coeffU2 ; (dword) { Gx*GU }[0-3]
paddd m3, m6 ; += rgb_UVrnd
paddd m1, m6 ; += rgb_UVrnd
paddd m2, m3 ; (dword) { V[0-3] }
paddd m0, m1 ; (dword) { U[0-3] }
pmaddwd m3, m5, coeffV1 ; (dword) { Bx*BV + Rx*RV }[4-7]
pmaddwd m1, m4, coeffV2 ; (dword) { Gx*GV }[4-7]
pmaddwd m5, coeffU1 ; (dword) { Bx*BU + Rx*RU }[4-7]
pmaddwd m4, coeffU2 ; (dword) { Gx*GU }[4-7]
paddd m3, m6 ; += rgb_UVrnd
paddd m5, m6 ; += rgb_UVrnd
psrad m0, 9
paddd m1, m3 ; (dword) { V[4-7] }
paddd m4, m5 ; (dword) { U[4-7] }
psrad m2, 9
psrad m4, 9
psrad m1, 9
packssdw m0, m4 ; (word) { U[0-7] }
packssdw m2, m1 ; (word) { V[0-7] }
%if mmsize == 8
mova [dstUq+wq], m0
mova [dstVq+wq], m2
%else ; mmsize == 16
mova [dstUq+wq], m0
mova [dstVq+wq], m2
%endif ; mmsize == 8/16
add wq, mmsize
jl .loop
sub wq, mmsize - 1
jz .end
add srcq , 2*mmsize - 2
add dstUq, mmsize - 1
add dstVq, mmsize - 1
.loop2:
movd m0, [srcq+wq*2] ; (byte) { Bx, Gx, Rx, xx }[0-3]
DEINTB 1, 0, 5, 4, 7 ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7]
pmaddwd m3, m1, coeffV1 ; (dword) { Bx*BV + Rx*RV }[0-3]
pmaddwd m2, m0, coeffV2 ; (dword) { Gx*GV }[0-3]
pmaddwd m1, coeffU1 ; (dword) { Bx*BU + Rx*RU }[0-3]
pmaddwd m0, coeffU2 ; (dword) { Gx*GU }[0-3]
paddd m3, m6 ; += rgb_UVrnd
paddd m1, m6 ; += rgb_UVrnd
paddd m2, m3 ; (dword) { V[0-3] }
paddd m0, m1 ; (dword) { U[0-3] }
psrad m0, 9
psrad m2, 9
packssdw m0, m0 ; (word) { U[0-7] }
packssdw m2, m2 ; (word) { V[0-7] }
movd [dstUq+wq], m0
movd [dstVq+wq], m2
add wq, 2
jl .loop2
.end:
REP_RET
%endif ; ARCH_X86_64 && %0 == 6
%endmacro
; %1 = nr. of XMM registers for rgb-to-Y func
; %2 = nr. of XMM registers for rgb-to-UV func
%macro RGB32_FUNCS 2
RGB32_TO_Y_FN %1, r, g, b, a
RGB32_TO_Y_FN %1, b, g, r, a, rgba
RGB32_TO_Y_FN %1, a, r, g, b, rgba
RGB32_TO_Y_FN %1, a, b, g, r, rgba
RGB32_TO_UV_FN %2, r, g, b, a
RGB32_TO_UV_FN %2, b, g, r, a, rgba
RGB32_TO_UV_FN %2, a, r, g, b, rgba
RGB32_TO_UV_FN %2, a, b, g, r, rgba
%endmacro
%if ARCH_X86_32
INIT_MMX mmx
RGB32_FUNCS 0, 0
%endif
INIT_XMM sse2
RGB32_FUNCS 8, 12
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
RGB32_FUNCS 8, 12
%endif
;-----------------------------------------------------------------------------
; YUYV/UYVY/NV12/NV21 packed pixel shuffling.
;
; void <fmt>ToY_<opt>(uint8_t *dst, const uint8_t *src, int w);
; and
; void <fmt>toUV_<opt>(uint8_t *dstU, uint8_t *dstV, const uint8_t *src,
; const uint8_t *unused, int w);
;-----------------------------------------------------------------------------
; %1 = a (aligned) or u (unaligned)
; %2 = yuyv or uyvy
%macro LOOP_YUYV_TO_Y 2
.loop_%1:
mov%1 m0, [srcq+wq*2] ; (byte) { Y0, U0, Y1, V0, ... }
mov%1 m1, [srcq+wq*2+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
%ifidn %2, yuyv
pand m0, m2 ; (word) { Y0, Y1, ..., Y7 }
pand m1, m2 ; (word) { Y8, Y9, ..., Y15 }
%else ; uyvy
psrlw m0, 8 ; (word) { Y0, Y1, ..., Y7 }
psrlw m1, 8 ; (word) { Y8, Y9, ..., Y15 }
%endif ; yuyv/uyvy
packuswb m0, m1 ; (byte) { Y0, ..., Y15 }
mova [dstq+wq], m0
add wq, mmsize
jl .loop_%1
REP_RET
%endmacro
; %1 = nr. of XMM registers
; %2 = yuyv or uyvy
; %3 = if specified, it means that unaligned and aligned code in loop
; will be the same (i.e. YUYV+AVX), and thus we don't need to
; split the loop in an aligned and unaligned case
%macro YUYV_TO_Y_FN 2-3
cglobal %2ToY, 5, 5, %1, dst, unused0, unused1, src, w
%if ARCH_X86_64
movsxd wq, wd
%endif
add dstq, wq
%if mmsize == 16
test srcq, 15
%endif
lea srcq, [srcq+wq*2]
%ifidn %2, yuyv
pcmpeqb m2, m2 ; (byte) { 0xff } x 16
psrlw m2, 8 ; (word) { 0x00ff } x 8
%endif ; yuyv
%if mmsize == 16
jnz .loop_u_start
neg wq
LOOP_YUYV_TO_Y a, %2
.loop_u_start:
neg wq
LOOP_YUYV_TO_Y u, %2
%else ; mmsize == 8
neg wq
LOOP_YUYV_TO_Y a, %2
%endif ; mmsize == 8/16
%endmacro
; %1 = a (aligned) or u (unaligned)
; %2 = yuyv or uyvy
%macro LOOP_YUYV_TO_UV 2
.loop_%1:
%ifidn %2, yuyv
mov%1 m0, [srcq+wq*4] ; (byte) { Y0, U0, Y1, V0, ... }
mov%1 m1, [srcq+wq*4+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
psrlw m0, 8 ; (word) { U0, V0, ..., U3, V3 }
psrlw m1, 8 ; (word) { U4, V4, ..., U7, V7 }
%else ; uyvy
%if cpuflag(avx)
vpand m0, m2, [srcq+wq*4] ; (word) { U0, V0, ..., U3, V3 }
vpand m1, m2, [srcq+wq*4+mmsize] ; (word) { U4, V4, ..., U7, V7 }
%else
mov%1 m0, [srcq+wq*4] ; (byte) { Y0, U0, Y1, V0, ... }
mov%1 m1, [srcq+wq*4+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
pand m0, m2 ; (word) { U0, V0, ..., U3, V3 }
pand m1, m2 ; (word) { U4, V4, ..., U7, V7 }
%endif
%endif ; yuyv/uyvy
packuswb m0, m1 ; (byte) { U0, V0, ..., U7, V7 }
pand m1, m0, m2 ; (word) { U0, U1, ..., U7 }
psrlw m0, 8 ; (word) { V0, V1, ..., V7 }
%if mmsize == 16
packuswb m1, m0 ; (byte) { U0, ... U7, V1, ... V7 }
movh [dstUq+wq], m1
movhps [dstVq+wq], m1
%else ; mmsize == 8
packuswb m1, m1 ; (byte) { U0, ... U3 }
packuswb m0, m0 ; (byte) { V0, ... V3 }
movh [dstUq+wq], m1
movh [dstVq+wq], m0
%endif ; mmsize == 8/16
add wq, mmsize / 2
jl .loop_%1
REP_RET
%endmacro
; %1 = nr. of XMM registers
; %2 = yuyv or uyvy
; %3 = if specified, it means that unaligned and aligned code in loop
; will be the same (i.e. UYVY+AVX), and thus we don't need to
; split the loop in an aligned and unaligned case
%macro YUYV_TO_UV_FN 2-3
cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
%if ARCH_X86_64
movsxd wq, dword r5m
%else ; x86-32
mov wq, r5m
%endif
add dstUq, wq
add dstVq, wq
%if mmsize == 16 && %0 == 2
test srcq, 15
%endif
lea srcq, [srcq+wq*4]
pcmpeqb m2, m2 ; (byte) { 0xff } x 16
psrlw m2, 8 ; (word) { 0x00ff } x 8
; NOTE: if uyvy+avx, u/a are identical
%if mmsize == 16 && %0 == 2
jnz .loop_u_start
neg wq
LOOP_YUYV_TO_UV a, %2
.loop_u_start:
neg wq
LOOP_YUYV_TO_UV u, %2
%else ; mmsize == 8
neg wq
LOOP_YUYV_TO_UV a, %2
%endif ; mmsize == 8/16
%endmacro
; %1 = a (aligned) or u (unaligned)
; %2 = nv12 or nv21
%macro LOOP_NVXX_TO_UV 2
.loop_%1:
mov%1 m0, [srcq+wq*2] ; (byte) { U0, V0, U1, V1, ... }
mov%1 m1, [srcq+wq*2+mmsize] ; (byte) { U8, V8, U9, V9, ... }
pand m2, m0, m5 ; (word) { U0, U1, ..., U7 }
pand m3, m1, m5 ; (word) { U8, U9, ..., U15 }
psrlw m0, 8 ; (word) { V0, V1, ..., V7 }
psrlw m1, 8 ; (word) { V8, V9, ..., V15 }
packuswb m2, m3 ; (byte) { U0, ..., U15 }
packuswb m0, m1 ; (byte) { V0, ..., V15 }
%ifidn %2, nv12
mova [dstUq+wq], m2
mova [dstVq+wq], m0
%else ; nv21
mova [dstVq+wq], m2
mova [dstUq+wq], m0
%endif ; nv12/21
add wq, mmsize
jl .loop_%1
REP_RET
%endmacro
; %1 = nr. of XMM registers
; %2 = nv12 or nv21
%macro NVXX_TO_UV_FN 2
cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
%if ARCH_X86_64
movsxd wq, dword r5m
%else ; x86-32
mov wq, r5m
%endif
add dstUq, wq
add dstVq, wq
%if mmsize == 16
test srcq, 15
%endif
lea srcq, [srcq+wq*2]
pcmpeqb m5, m5 ; (byte) { 0xff } x 16
psrlw m5, 8 ; (word) { 0x00ff } x 8
%if mmsize == 16
jnz .loop_u_start
neg wq
LOOP_NVXX_TO_UV a, %2
.loop_u_start:
neg wq
LOOP_NVXX_TO_UV u, %2
%else ; mmsize == 8
neg wq
LOOP_NVXX_TO_UV a, %2
%endif ; mmsize == 8/16
%endmacro
%if ARCH_X86_32
INIT_MMX mmx
YUYV_TO_Y_FN 0, yuyv
YUYV_TO_Y_FN 0, uyvy
YUYV_TO_UV_FN 0, yuyv
YUYV_TO_UV_FN 0, uyvy
NVXX_TO_UV_FN 0, nv12
NVXX_TO_UV_FN 0, nv21
%endif
INIT_XMM sse2
YUYV_TO_Y_FN 3, yuyv
YUYV_TO_Y_FN 2, uyvy
YUYV_TO_UV_FN 3, yuyv
YUYV_TO_UV_FN 3, uyvy
NVXX_TO_UV_FN 5, nv12
NVXX_TO_UV_FN 5, nv21
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
; in theory, we could write a yuy2-to-y using vpand (i.e. AVX), but
; that's not faster in practice
YUYV_TO_UV_FN 3, yuyv
YUYV_TO_UV_FN 3, uyvy, 1
NVXX_TO_UV_FN 5, nv12
NVXX_TO_UV_FN 5, nv21
%endif


@@ -0,0 +1,413 @@
;******************************************************************************
;* x86-optimized vertical line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;* Kieran Kunhya <kieran@kunhya.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
minshort: times 8 dw 0x8000
yuv2yuvX_16_start: times 4 dd 0x4000 - 0x40000000
yuv2yuvX_10_start: times 4 dd 0x10000
yuv2yuvX_9_start: times 4 dd 0x20000
yuv2yuvX_10_upper: times 8 dw 0x3ff
yuv2yuvX_9_upper: times 8 dw 0x1ff
pd_4: times 4 dd 4
pd_4min0x40000:times 4 dd 4 - (0x40000)
pw_16: times 8 dw 16
pw_32: times 8 dw 32
pw_512: times 8 dw 512
pw_1024: times 8 dw 1024
SECTION .text
;-----------------------------------------------------------------------------
; vertical line scaling
;
; void yuv2plane1_<output_size>_<opt>(const int16_t *src, uint8_t *dst, int dstW,
; const uint8_t *dither, int offset)
; and
; void yuv2planeX_<output_size>_<opt>(const int16_t *filter, int filterSize,
; const int16_t **src, uint8_t *dst, int dstW,
; const uint8_t *dither, int offset)
;
; Scale one or $filterSize lines of source data to generate one line of output
; data. The input is 15-bit int16_t if $output_size is in [8,10] and 19-bit
; int32_t if $output_size is 16. The filter coefficients are 12 bits.
; $filterSize is a multiple of 2. $offset is either 0 or 3. $dither holds 8 values.
;-----------------------------------------------------------------------------
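; For reference, the generic C version of yuv2planeX for 8-bit output
; (cf. yuv2planeX_8_c in libswscale/output.c) does, roughly:
;
;     for (i = 0; i < dstW; i++) {
;         int j, val = dither[(i + offset) & 7] << 12;
;         for (j = 0; j < filterSize; j++)
;             val += src[j][i] * filter[j];
;         dest[i] = av_clip_uint8(val >> 19);
;     }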
%macro yuv2planeX_fn 3
%if ARCH_X86_32
%define cntr_reg fltsizeq
%define movsx mov
%else
%define cntr_reg r7
%define movsx movsxd
%endif
cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
%if %1 == 8 || %1 == 9 || %1 == 10
pxor m6, m6
%endif ; %1 == 8/9/10
%if %1 == 8
%if ARCH_X86_32
%assign pad 0x2c - (stack_offset & 15)
SUB rsp, pad
%define m_dith m7
%else ; x86-64
%define m_dith m9
%endif ; x86-32
; create registers holding dither
movq m_dith, [ditherq] ; dither
test offsetd, offsetd
jz .no_rot
%if mmsize == 16
punpcklqdq m_dith, m_dith
%endif ; mmsize == 16
PALIGNR m_dith, m_dith, 3, m0
.no_rot:
%if mmsize == 16
punpcklbw m_dith, m6
%if ARCH_X86_64
punpcklwd m8, m_dith, m6
pslld m8, 12
%else ; x86-32
punpcklwd m5, m_dith, m6
pslld m5, 12
%endif ; x86-32/64
punpckhwd m_dith, m6
pslld m_dith, 12
%if ARCH_X86_32
mova [rsp+ 0], m5
mova [rsp+16], m_dith
%endif
%else ; mmsize == 8
punpcklbw m5, m_dith, m6
punpckhbw m_dith, m6
punpcklwd m4, m5, m6
punpckhwd m5, m6
punpcklwd m3, m_dith, m6
punpckhwd m_dith, m6
pslld m4, 12
pslld m5, 12
pslld m3, 12
pslld m_dith, 12
mova [rsp+ 0], m4
mova [rsp+ 8], m5
mova [rsp+16], m3
mova [rsp+24], m_dith
%endif ; mmsize == 8/16
%endif ; %1 == 8
xor r5, r5
.pixelloop:
%assign %%i 0
; the rep here is for the 8bit output mmx case, where dither covers
; 8 pixels but we can only handle 2 pixels per register, and thus 4
; pixels per iteration. In order to not have to keep track of where
; we are w.r.t. dithering, we unroll the mmx/8bit loop x2.
%if %1 == 8
%assign %%repcnt 16/mmsize
%else
%assign %%repcnt 1
%endif
%rep %%repcnt
%if %1 == 8
%if ARCH_X86_32
mova m2, [rsp+mmsize*(0+%%i)]
mova m1, [rsp+mmsize*(1+%%i)]
%else ; x86-64
mova m2, m8
mova m1, m_dith
%endif ; x86-32/64
%else ; %1 == 9/10/16
mova m1, [yuv2yuvX_%1_start]
mova m2, m1
%endif ; %1 == 8/9/10/16
movsx cntr_reg, fltsizem
.filterloop_ %+ %%i:
; input pixels
mov r6, [srcq+gprsize*cntr_reg-2*gprsize]
%if %1 == 16
mova m3, [r6+r5*4]
mova m5, [r6+r5*4+mmsize]
%else ; %1 == 8/9/10
mova m3, [r6+r5*2]
%endif ; %1 == 8/9/10/16
mov r6, [srcq+gprsize*cntr_reg-gprsize]
%if %1 == 16
mova m4, [r6+r5*4]
mova m6, [r6+r5*4+mmsize]
%else ; %1 == 8/9/10
mova m4, [r6+r5*2]
%endif ; %1 == 8/9/10/16
; coefficients
movd m0, [filterq+2*cntr_reg-4] ; coeff[0], coeff[1]
%if %1 == 16
pshuflw m7, m0, 0 ; coeff[0]
pshuflw m0, m0, 0x55 ; coeff[1]
pmovsxwd m7, m7 ; word -> dword
pmovsxwd m0, m0 ; word -> dword
pmulld m3, m7
pmulld m5, m7
pmulld m4, m0
pmulld m6, m0
paddd m2, m3
paddd m1, m5
paddd m2, m4
paddd m1, m6
%else ; %1 == 10/9/8
punpcklwd m5, m3, m4
punpckhwd m3, m4
SPLATD m0
pmaddwd m5, m0
pmaddwd m3, m0
paddd m2, m5
paddd m1, m3
%endif ; %1 == 8/9/10/16
sub cntr_reg, 2
jg .filterloop_ %+ %%i
%if %1 == 16
psrad m2, 31 - %1
psrad m1, 31 - %1
%else ; %1 == 10/9/8
psrad m2, 27 - %1
psrad m1, 27 - %1
%endif ; %1 == 8/9/10/16
%if %1 == 8
packssdw m2, m1
packuswb m2, m2
movh [dstq+r5*1], m2
%else ; %1 == 9/10/16
%if %1 == 16
packssdw m2, m1
paddw m2, [minshort]
%else ; %1 == 9/10
%if cpuflag(sse4)
packusdw m2, m1
%else ; mmxext/sse2
packssdw m2, m1
pmaxsw m2, m6
%endif ; mmxext/sse2/sse4/avx
pminsw m2, [yuv2yuvX_%1_upper]
%endif ; %1 == 9/10/16
mova [dstq+r5*2], m2
%endif ; %1 == 8/9/10/16
add r5, mmsize/2
sub wd, mmsize/2
%assign %%i %%i+2
%endrep
jg .pixelloop
%if %1 == 8
%if ARCH_X86_32
ADD rsp, pad
RET
%else ; x86-64
REP_RET
%endif ; x86-32/64
%else ; %1 == 9/10/16
REP_RET
%endif ; %1 == 8/9/10/16
%endmacro
%if ARCH_X86_32
INIT_MMX mmxext
yuv2planeX_fn 8, 0, 7
yuv2planeX_fn 9, 0, 5
yuv2planeX_fn 10, 0, 5
%endif
INIT_XMM sse2
yuv2planeX_fn 8, 10, 7
yuv2planeX_fn 9, 7, 5
yuv2planeX_fn 10, 7, 5
INIT_XMM sse4
yuv2planeX_fn 8, 10, 7
yuv2planeX_fn 9, 7, 5
yuv2planeX_fn 10, 7, 5
yuv2planeX_fn 16, 8, 5
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2planeX_fn 8, 10, 7
yuv2planeX_fn 9, 7, 5
yuv2planeX_fn 10, 7, 5
%endif
; %1=output-bpc, %2=alignment (u/a)
%macro yuv2plane1_mainloop 2
.loop_%2:
%if %1 == 8
paddsw m0, m2, [srcq+wq*2+mmsize*0]
paddsw m1, m3, [srcq+wq*2+mmsize*1]
psraw m0, 7
psraw m1, 7
packuswb m0, m1
mov%2 [dstq+wq], m0
%elif %1 == 16
paddd m0, m4, [srcq+wq*4+mmsize*0]
paddd m1, m4, [srcq+wq*4+mmsize*1]
paddd m2, m4, [srcq+wq*4+mmsize*2]
paddd m3, m4, [srcq+wq*4+mmsize*3]
psrad m0, 3
psrad m1, 3
psrad m2, 3
psrad m3, 3
%if cpuflag(sse4) ; avx/sse4
packusdw m0, m1
packusdw m2, m3
%else ; mmx/sse2
packssdw m0, m1
packssdw m2, m3
paddw m0, m5
paddw m2, m5
%endif ; mmx/sse2/sse4/avx
mov%2 [dstq+wq*2+mmsize*0], m0
mov%2 [dstq+wq*2+mmsize*1], m2
%else ; %1 == 9/10
paddsw m0, m2, [srcq+wq*2+mmsize*0]
paddsw m1, m2, [srcq+wq*2+mmsize*1]
psraw m0, 15 - %1
psraw m1, 15 - %1
pmaxsw m0, m4
pmaxsw m1, m4
pminsw m0, m3
pminsw m1, m3
mov%2 [dstq+wq*2+mmsize*0], m0
mov%2 [dstq+wq*2+mmsize*1], m1
%endif
add wq, mmsize
jl .loop_%2
%endmacro
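; For the 8-bit case, yuv2plane1 above corresponds to the C reference
;     dest[i] = av_clip_uint8((src[i] + dither[(i + offset) & 7]) >> 7);
; with the saturating paddsw/packuswb pair supplying the clip.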
%macro yuv2plane1_fn 3
cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset
movsxdifnidn wq, wd
add wq, mmsize - 1
and wq, ~(mmsize - 1)
%if %1 == 8
add dstq, wq
%else ; %1 != 8
lea dstq, [dstq+wq*2]
%endif ; %1 == 8
%if %1 == 16
lea srcq, [srcq+wq*4]
%else ; %1 != 16
lea srcq, [srcq+wq*2]
%endif ; %1 == 16
neg wq
%if %1 == 8
pxor m4, m4 ; zero
; create registers holding dither
movq m3, [ditherq] ; dither
test offsetd, offsetd
jz .no_rot
%if mmsize == 16
punpcklqdq m3, m3
%endif ; mmsize == 16
PALIGNR m3, m3, 3, m2
.no_rot:
%if mmsize == 8
mova m2, m3
punpckhbw m3, m4 ; byte->word
punpcklbw m2, m4 ; byte->word
%else
punpcklbw m3, m4
mova m2, m3
%endif
%elif %1 == 9
pxor m4, m4
mova m3, [pw_512]
mova m2, [pw_32]
%elif %1 == 10
pxor m4, m4
mova m3, [pw_1024]
mova m2, [pw_16]
%else ; %1 == 16
%if cpuflag(sse4) ; sse4/avx
mova m4, [pd_4]
%else ; mmx/sse2
mova m4, [pd_4min0x40000]
mova m5, [minshort]
%endif ; mmx/sse2/sse4/avx
%endif ; %1 == ..
; actual pixel scaling
%if mmsize == 8
yuv2plane1_mainloop %1, a
%else ; mmsize == 16
test dstq, 15
jnz .unaligned
yuv2plane1_mainloop %1, a
REP_RET
.unaligned:
yuv2plane1_mainloop %1, u
%endif ; mmsize == 8/16
REP_RET
%endmacro
%if ARCH_X86_32
INIT_MMX mmx
yuv2plane1_fn 8, 0, 5
yuv2plane1_fn 16, 0, 3
INIT_MMX mmxext
yuv2plane1_fn 9, 0, 3
yuv2plane1_fn 10, 0, 3
%endif
INIT_XMM sse2
yuv2plane1_fn 8, 5, 5
yuv2plane1_fn 9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 6, 3
INIT_XMM sse4
yuv2plane1_fn 16, 5, 3
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2plane1_fn 8, 5, 5
yuv2plane1_fn 9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 5, 3
%endif


@@ -0,0 +1,164 @@
/*
* software RGB to RGB converter
* plus software PAL8 to RGB converter
* software YUV to YUV converter
* software YUV to RGB converter
* Written by Nick Kurshev.
* palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/cpu.h"
#include "libavutil/bswap.h"
#include "libswscale/rgb2rgb.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#if HAVE_INLINE_ASM
DECLARE_ASM_CONST(8, uint64_t, mmx_ff) = 0x00000000000000FFULL;
DECLARE_ASM_CONST(8, uint64_t, mmx_null) = 0x0000000000000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mmx_one) = 0xFFFFFFFFFFFFFFFFULL;
DECLARE_ASM_CONST(8, uint64_t, mask32b) = 0x000000FF000000FFULL;
DECLARE_ASM_CONST(8, uint64_t, mask32g) = 0x0000FF000000FF00ULL;
DECLARE_ASM_CONST(8, uint64_t, mask32r) = 0x00FF000000FF0000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask32a) = 0xFF000000FF000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask32) = 0x00FFFFFF00FFFFFFULL;
DECLARE_ASM_CONST(8, uint64_t, mask3216br) = 0x00F800F800F800F8ULL;
DECLARE_ASM_CONST(8, uint64_t, mask3216g) = 0x0000FC000000FC00ULL;
DECLARE_ASM_CONST(8, uint64_t, mask3215g) = 0x0000F8000000F800ULL;
DECLARE_ASM_CONST(8, uint64_t, mul3216) = 0x2000000420000004ULL;
DECLARE_ASM_CONST(8, uint64_t, mul3215) = 0x2000000820000008ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24b) = 0x00FF0000FF0000FFULL;
DECLARE_ASM_CONST(8, uint64_t, mask24g) = 0xFF0000FF0000FF00ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24r) = 0x0000FF0000FF0000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24l) = 0x0000000000FFFFFFULL;
DECLARE_ASM_CONST(8, uint64_t, mask24h) = 0x0000FFFFFF000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24hh) = 0xffff000000000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24hhh) = 0xffffffff00000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24hhhh) = 0xffffffffffff0000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask15b) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */
DECLARE_ASM_CONST(8, uint64_t, mask15rg) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */
DECLARE_ASM_CONST(8, uint64_t, mask15s) = 0xFFE0FFE0FFE0FFE0ULL;
DECLARE_ASM_CONST(8, uint64_t, mask15g) = 0x03E003E003E003E0ULL;
DECLARE_ASM_CONST(8, uint64_t, mask15r) = 0x7C007C007C007C00ULL;
#define mask16b mask15b
DECLARE_ASM_CONST(8, uint64_t, mask16g) = 0x07E007E007E007E0ULL;
DECLARE_ASM_CONST(8, uint64_t, mask16r) = 0xF800F800F800F800ULL;
DECLARE_ASM_CONST(8, uint64_t, red_16mask) = 0x0000f8000000f800ULL;
DECLARE_ASM_CONST(8, uint64_t, green_16mask) = 0x000007e0000007e0ULL;
DECLARE_ASM_CONST(8, uint64_t, blue_16mask) = 0x0000001f0000001fULL;
DECLARE_ASM_CONST(8, uint64_t, red_15mask) = 0x00007c0000007c00ULL;
DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL;
DECLARE_ASM_CONST(8, uint64_t, blue_15mask) = 0x0000001f0000001fULL;
DECLARE_ASM_CONST(8, uint64_t, mul15_mid) = 0x4200420042004200ULL;
DECLARE_ASM_CONST(8, uint64_t, mul15_hi) = 0x0210021002100210ULL;
DECLARE_ASM_CONST(8, uint64_t, mul16_mid) = 0x2080208020802080ULL;
DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2YOffset);
DECLARE_ALIGNED(8, extern const uint64_t, ff_w1111);
DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2UVOffset);
#define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
#define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
#define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
#define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
#define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
#define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
#define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
#define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
#define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
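/* BT.601 limited-range RGB->YUV coefficients in RGB2YUV_SHIFT-bit fixed
 * point, e.g. Y = 16 + 0.257*R + 0.504*G + 0.098*B. */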
// Note: We have C, MMX, MMXEXT, 3DNOW versions, there is no 3DNOW + MMXEXT one.
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_AMD3DNOW 0
#define COMPILE_TEMPLATE_SSE2 0
#define COMPILE_TEMPLATE_AVX 0
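/* rgb2rgb_template.c is compiled once per instruction set below: each
 * pass toggles the COMPILE_TEMPLATE_* flags and redefines RENAME() so
 * that the same source expands to _mmx, _mmxext, _sse2, _avx and
 * _3dnow variants of every converter. */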
//MMX versions
#undef RENAME
#define RENAME(a) a ## _mmx
#include "rgb2rgb_template.c"
// MMXEXT versions
#undef RENAME
#undef COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 1
#define RENAME(a) a ## _mmxext
#include "rgb2rgb_template.c"
//SSE2 versions
#undef RENAME
#undef COMPILE_TEMPLATE_SSE2
#define COMPILE_TEMPLATE_SSE2 1
#define RENAME(a) a ## _sse2
#include "rgb2rgb_template.c"
//AVX versions
#undef RENAME
#undef COMPILE_TEMPLATE_AVX
#define COMPILE_TEMPLATE_AVX 1
#define RENAME(a) a ## _avx
#include "rgb2rgb_template.c"
//3DNOW versions
#undef RENAME
#undef COMPILE_TEMPLATE_MMXEXT
#undef COMPILE_TEMPLATE_SSE2
#undef COMPILE_TEMPLATE_AVX
#undef COMPILE_TEMPLATE_AMD3DNOW
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_SSE2 0
#define COMPILE_TEMPLATE_AVX 0
#define COMPILE_TEMPLATE_AMD3DNOW 1
#define RENAME(a) a ## _3dnow
#include "rgb2rgb_template.c"
/*
  RGB15->RGB16 original by Strepto/Astral,
  ported to gcc & bug-fixed by A'rpi;
  MMXEXT and 3DNOW optimizations by Nick Kurshev;
  32-bit C version and the and&add trick by Michael Niedermayer.
*/
#endif /* HAVE_INLINE_ASM */
av_cold void rgb2rgb_init_x86(void)
{
#if HAVE_INLINE_ASM
int cpu_flags = av_get_cpu_flags();
if (INLINE_MMX(cpu_flags))
rgb2rgb_init_mmx();
if (INLINE_AMD3DNOW(cpu_flags))
rgb2rgb_init_3dnow();
if (INLINE_MMXEXT(cpu_flags))
rgb2rgb_init_mmxext();
if (INLINE_SSE2(cpu_flags))
rgb2rgb_init_sse2();
if (INLINE_AVX(cpu_flags))
rgb2rgb_init_avx();
#endif /* HAVE_INLINE_ASM */
}

File diff suppressed because it is too large


@@ -0,0 +1,431 @@
;******************************************************************************
;* x86-optimized horizontal line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
max_19bit_int: times 4 dd 0x7ffff
max_19bit_flt: times 4 dd 524287.0
minshort: times 8 dw 0x8000
unicoeff: times 4 dd 0x20000000
SECTION .text
;-----------------------------------------------------------------------------
; horizontal line scaling
;
; void hscale<source_width>to<intermediate_nbits>_<filterSize>_<opt>
; (SwsContext *c, int{16,32}_t *dst,
; int dstW, const uint{8,16}_t *src,
; const int16_t *filter,
; const int32_t *filterPos, int filterSize);
;
; Scale one horizontal line. The input is either 8-bit or 9-16-bit wide
; ($source_width can be 8, 9, 10, 12, 14 or 16; the difference is whether
; we have to downscale before multiplying). The filter is 14-bit. The
; output is either 15-bit (in int16_t) or 19-bit (in int32_t), as given by
; $intermediate_nbits. Each output pixel is generated from $filterSize
; input pixels; the position of the first one is given in
; filterPos[nOutputPixel].
;-----------------------------------------------------------------------------
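; Rough C-level reference of what each variant computes (an illustrative
; sketch, not part of the original file; clip() stands for the saturation
; done with packssdw/pminsd below):
;     for (i = 0; i < dstW; i++) {
;         int val = 0, pos = filterPos[i];
;         for (j = 0; j < filterSize; j++)
;             val += src[pos + j] * filter[i * filterSize + j];
;         dst[i] = clip(val >> (14 + source_width - intermediate_nbits));
;     }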
; SCALE_FUNC source_width, intermediate_nbits, filtersize, filtersuffix, n_args, n_xmm
%macro SCALE_FUNC 6
%ifnidn %3, X
cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, src, filter, fltpos, pos1
%else
cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize
%endif
%if ARCH_X86_64
movsxd wq, wd
%define mov32 movsxd
%else ; x86-32
%define mov32 mov
%endif ; x86-64
%if %2 == 19
%if mmsize == 8 ; mmx
mova m2, [max_19bit_int]
%elif cpuflag(sse4)
mova m2, [max_19bit_int]
%else ; ssse3/sse2
mova m2, [max_19bit_flt]
%endif ; mmx/sse2/ssse3/sse4
%endif ; %2 == 19
%if %1 == 16
mova m6, [minshort]
mova m7, [unicoeff]
%elif %1 == 8
pxor m3, m3
%endif ; %1 == 8/16
%if %1 == 8
%define movlh movd
%define movbh movh
%define srcmul 1
%else ; %1 == 9-16
%define movlh movq
%define movbh movu
%define srcmul 2
%endif ; %1 == 8/9-16
%ifnidn %3, X
; setup loop
%if %3 == 8
shl wq, 1 ; this allows *16 (i.e. now *8) in lea instructions for the 8-tap filter
%define wshr 1
%else ; %3 == 4
%define wshr 0
%endif ; %3 == 8
lea filterq, [filterq+wq*8]
%if %2 == 15
lea dstq, [dstq+wq*(2>>wshr)]
%else ; %2 == 19
lea dstq, [dstq+wq*(4>>wshr)]
%endif ; %2 == 15/19
lea fltposq, [fltposq+wq*(4>>wshr)]
neg wq
.loop:
%if %3 == 4 ; filterSize == 4 scaling
; load 2x4 or 4x4 source pixels into m0/m1
mov32 pos0q, dword [fltposq+wq*4+ 0] ; filterPos[0]
mov32 pos1q, dword [fltposq+wq*4+ 4] ; filterPos[1]
movlh m0, [srcq+pos0q*srcmul] ; src[filterPos[0] + {0,1,2,3}]
%if mmsize == 8
movlh m1, [srcq+pos1q*srcmul] ; src[filterPos[1] + {0,1,2,3}]
%else ; mmsize == 16
%if %1 > 8
movhps m0, [srcq+pos1q*srcmul] ; src[filterPos[1] + {0,1,2,3}]
%else ; %1 == 8
movd m4, [srcq+pos1q*srcmul] ; src[filterPos[1] + {0,1,2,3}]
%endif
mov32 pos0q, dword [fltposq+wq*4+ 8] ; filterPos[2]
mov32 pos1q, dword [fltposq+wq*4+12] ; filterPos[3]
movlh m1, [srcq+pos0q*srcmul] ; src[filterPos[2] + {0,1,2,3}]
%if %1 > 8
movhps m1, [srcq+pos1q*srcmul] ; src[filterPos[3] + {0,1,2,3}]
%else ; %1 == 8
movd m5, [srcq+pos1q*srcmul] ; src[filterPos[3] + {0,1,2,3}]
punpckldq m0, m4
punpckldq m1, m5
%endif ; %1 == 8
%endif ; mmsize == 8/16
%if %1 == 8
punpcklbw m0, m3 ; byte -> word
punpcklbw m1, m3 ; byte -> word
%endif ; %1 == 8
; multiply with filter coefficients
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
; add back 0x8000 * sum(coeffs) after the horizontal add
psubw m0, m6
psubw m1, m6
%endif ; %1 == 16
pmaddwd m0, [filterq+wq*8+mmsize*0] ; *= filter[{0,1,..,6,7}]
pmaddwd m1, [filterq+wq*8+mmsize*1] ; *= filter[{8,9,..,14,15}]
; add up horizontally (4 srcpix * 4 coefficients -> 1 dstpix)
%if mmsize == 8 ; mmx
movq m4, m0
punpckldq m0, m1
punpckhdq m4, m1
paddd m0, m4
%elif notcpuflag(ssse3) ; sse2
mova m4, m0
shufps m0, m1, 10001000b
shufps m4, m1, 11011101b
paddd m0, m4
%else ; ssse3/sse4
phaddd m0, m1 ; filter[{ 0, 1, 2, 3}]*src[filterPos[0]+{0,1,2,3}],
; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}],
; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}],
; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}]
%endif ; mmx/sse2/ssse3/sse4
%else ; %3 == 8, i.e. filterSize == 8 scaling
; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5
mov32 pos0q, dword [fltposq+wq*2+0] ; filterPos[0]
mov32 pos1q, dword [fltposq+wq*2+4] ; filterPos[1]
movbh m0, [srcq+ pos0q *srcmul] ; src[filterPos[0] + {0,1,2,3,4,5,6,7}]
%if mmsize == 8
movbh m1, [srcq+(pos0q+4)*srcmul] ; src[filterPos[0] + {4,5,6,7}]
movbh m4, [srcq+ pos1q *srcmul] ; src[filterPos[1] + {0,1,2,3}]
movbh m5, [srcq+(pos1q+4)*srcmul] ; src[filterPos[1] + {4,5,6,7}]
%else ; mmsize == 16
movbh m1, [srcq+ pos1q *srcmul] ; src[filterPos[1] + {0,1,2,3,4,5,6,7}]
mov32 pos0q, dword [fltposq+wq*2+8] ; filterPos[2]
mov32 pos1q, dword [fltposq+wq*2+12] ; filterPos[3]
movbh m4, [srcq+ pos0q *srcmul] ; src[filterPos[2] + {0,1,2,3,4,5,6,7}]
movbh m5, [srcq+ pos1q *srcmul] ; src[filterPos[3] + {0,1,2,3,4,5,6,7}]
%endif ; mmsize == 8/16
%if %1 == 8
punpcklbw m0, m3 ; byte -> word
punpcklbw m1, m3 ; byte -> word
punpcklbw m4, m3 ; byte -> word
punpcklbw m5, m3 ; byte -> word
%endif ; %1 == 8
; multiply
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
; add back 0x8000 * sum(coeffs) after the horizontal add
psubw m0, m6
psubw m1, m6
psubw m4, m6
psubw m5, m6
%endif ; %1 == 16
pmaddwd m0, [filterq+wq*8+mmsize*0] ; *= filter[{0,1,..,6,7}]
pmaddwd m1, [filterq+wq*8+mmsize*1] ; *= filter[{8,9,..,14,15}]
pmaddwd m4, [filterq+wq*8+mmsize*2] ; *= filter[{16,17,..,22,23}]
pmaddwd m5, [filterq+wq*8+mmsize*3] ; *= filter[{24,25,..,30,31}]
; add up horizontally (8 srcpix * 8 coefficients -> 1 dstpix)
%if mmsize == 8
paddd m0, m1
paddd m4, m5
movq m1, m0
punpckldq m0, m4
punpckhdq m1, m4
paddd m0, m1
%elif notcpuflag(ssse3) ; sse2
%if %1 == 8
%define mex m6
%else
%define mex m3
%endif
; emulate horizontal add as transpose + vertical add
mova mex, m0
punpckldq m0, m1
punpckhdq mex, m1
paddd m0, mex
mova m1, m4
punpckldq m4, m5
punpckhdq m1, m5
paddd m4, m1
mova m1, m0
punpcklqdq m0, m4
punpckhqdq m1, m4
paddd m0, m1
%else ; ssse3/sse4
; FIXME if we rearrange the filter in pairs of 4, we can
; load pixels likewise and use 2 x paddd + phaddd instead
; of 3 x phaddd here, faster on older cpus
phaddd m0, m1
phaddd m4, m5
phaddd m0, m4 ; filter[{ 0, 1,..., 6, 7}]*src[filterPos[0]+{0,1,...,6,7}],
; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}],
; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}],
; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}]
%endif ; mmx/sse2/ssse3/sse4
%endif ; %3 == 4/8
%else ; %3 == X, i.e. any filterSize scaling
%ifidn %4, X4
%define dlt 4
%else ; %4 == X || %4 == X8
%define dlt 0
%endif ; %4 ==/!= X4
%if ARCH_X86_64
%define srcq r8
%define pos1q r7
%define srcendq r9
movsxd fltsizeq, fltsized ; filterSize
lea srcendq, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
%else ; x86-32
%define srcq srcmemq
%define pos1q dstq
%define srcendq r6m
lea pos0q, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
mov srcendq, pos0q
%endif ; x86-32/64
lea fltposq, [fltposq+wq*4]
%if %2 == 15
lea dstq, [dstq+wq*2]
%else ; %2 == 19
lea dstq, [dstq+wq*4]
%endif ; %2 == 15/19
movifnidn dstmp, dstq
neg wq
.loop:
mov32 pos0q, dword [fltposq+wq*4+0] ; filterPos[0]
mov32 pos1q, dword [fltposq+wq*4+4] ; filterPos[1]
; FIXME maybe do 4px/iteration on x86-64 (x86-32 wouldn't have enough regs)?
pxor m4, m4
pxor m5, m5
mov srcq, srcmemmp
.innerloop:
; load 2x4 (mmx) or 2x8 (sse) source pixels into m0/m1 -> m4/m5
movbh m0, [srcq+ pos0q *srcmul] ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}]
movbh m1, [srcq+(pos1q+dlt)*srcmul] ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}]
%if %1 == 8
punpcklbw m0, m3
punpcklbw m1, m3
%endif ; %1 == 8
; multiply
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
; add back 0x8000 * sum(coeffs) after the horizontal add
psubw m0, m6
psubw m1, m6
%endif ; %1 == 16
pmaddwd m0, [filterq] ; filter[{0,1,2,3(,4,5,6,7)}]
pmaddwd m1, [filterq+(fltsizeq+dlt)*2]; filter[filtersize+{0,1,2,3(,4,5,6,7)}]
paddd m4, m0
paddd m5, m1
add filterq, mmsize
add srcq, srcmul*mmsize/2
cmp srcq, srcendq ; while (src += 4) < &src[filterSize]
jl .innerloop
%ifidn %4, X4
mov32 pos1q, dword [fltposq+wq*4+4] ; filterPos[1]
movlh m0, [srcq+ pos0q *srcmul] ; split last 4 srcpx of dstpx[0]
sub pos1q, fltsizeq ; and first 4 srcpx of dstpx[1]
%if %1 > 8
movhps m0, [srcq+(pos1q+dlt)*srcmul]
%else ; %1 == 8
movd m1, [srcq+(pos1q+dlt)*srcmul]
punpckldq m0, m1
%endif ; %1 == 8
%if %1 == 8
punpcklbw m0, m3
%endif ; %1 == 8
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
; add back 0x8000 * sum(coeffs) after the horizontal add
psubw m0, m6
%endif ; %1 == 16
pmaddwd m0, [filterq]
%endif ; %4 == X4
lea filterq, [filterq+(fltsizeq+dlt)*2]
%if mmsize == 8 ; mmx
movq m0, m4
punpckldq m4, m5
punpckhdq m0, m5
paddd m0, m4
%else ; mmsize == 16
%if notcpuflag(ssse3) ; sse2
mova m1, m4
punpcklqdq m4, m5
punpckhqdq m1, m5
paddd m4, m1
%else ; ssse3/sse4
phaddd m4, m5
%endif ; sse2/ssse3/sse4
%ifidn %4, X4
paddd m4, m0
%endif ; %4 == X4
%if notcpuflag(ssse3) ; sse2
pshufd m4, m4, 11011000b
movhlps m0, m4
paddd m0, m4
%else ; ssse3/sse4
phaddd m4, m4
SWAP 0, 4
%endif ; sse2/ssse3/sse4
%endif ; mmsize == 8/16
%endif ; %3 ==/!= X
%if %1 == 16 ; add 0x8000 * sum(coeffs), i.e. back from signed -> unsigned
paddd m0, m7
%endif ; %1 == 16
; clip, store
psrad m0, 14 + %1 - %2
%ifidn %3, X
movifnidn dstq, dstmp
%endif ; %3 == X
%if %2 == 15
packssdw m0, m0
%ifnidn %3, X
movh [dstq+wq*(2>>wshr)], m0
%else ; %3 == X
movd [dstq+wq*2], m0
%endif ; %3 ==/!= X
%else ; %2 == 19
%if mmsize == 8
PMINSD_MMX m0, m2, m4
%elif cpuflag(sse4)
pminsd m0, m2
%else ; sse2/ssse3
cvtdq2ps m0, m0
minps m0, m2
cvtps2dq m0, m0
%endif ; mmx/sse2/ssse3/sse4
%ifnidn %3, X
mova [dstq+wq*(4>>wshr)], m0
%else ; %3 == X
movq [dstq+wq*4], m0
%endif ; %3 ==/!= X
%endif ; %2 == 15/19
%ifnidn %3, X
add wq, (mmsize<<wshr)/4 ; both 8tap and 4tap really only do 4 pixels (or for mmx: 2 pixels)
; per iteration. see "shl wq,1" above as for why we do this
%else ; %3 == X
add wq, 2
%endif ; %3 ==/!= X
jl .loop
REP_RET
%endmacro
; SCALE_FUNCS source_width, intermediate_nbits, n_xmm
%macro SCALE_FUNCS 3
SCALE_FUNC %1, %2, 4, 4, 6, %3
SCALE_FUNC %1, %2, 8, 8, 6, %3
%if mmsize == 8
SCALE_FUNC %1, %2, X, X, 7, %3
%else
SCALE_FUNC %1, %2, X, X4, 7, %3
SCALE_FUNC %1, %2, X, X8, 7, %3
%endif
%endmacro
; SCALE_FUNCS2 8_xmm_args, 9to10_xmm_args, 16_xmm_args
%macro SCALE_FUNCS2 3
%if notcpuflag(sse4)
SCALE_FUNCS 8, 15, %1
SCALE_FUNCS 9, 15, %2
SCALE_FUNCS 10, 15, %2
SCALE_FUNCS 12, 15, %2
SCALE_FUNCS 14, 15, %2
SCALE_FUNCS 16, 15, %3
%endif ; !sse4
SCALE_FUNCS 8, 19, %1
SCALE_FUNCS 9, 19, %2
SCALE_FUNCS 10, 19, %2
SCALE_FUNCS 12, 19, %2
SCALE_FUNCS 14, 19, %2
SCALE_FUNCS 16, 19, %3
%endmacro
%if ARCH_X86_32
INIT_MMX mmx
SCALE_FUNCS2 0, 0, 0
%endif
INIT_XMM sse2
SCALE_FUNCS2 7, 6, 8
INIT_XMM ssse3
SCALE_FUNCS2 6, 6, 8
INIT_XMM sse4
SCALE_FUNCS2 6, 6, 8
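; Added note (an assumption about the build plumbing, not original text):
; each INIT_MMX/INIT_XMM + SCALE_FUNCS2 pair above emits the whole family
; of entry points for that instruction set, e.g. the sse2 block yields
; ff_hscale8to15_4_sse2 ... ff_hscale16to19_X8_sse2, matching the
; prototypes declared on the C side.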


@@ -0,0 +1,607 @@
/*
* Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <inttypes.h>
#include "config.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/attributes.h"
#include "libavutil/avassert.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/cpu.h"
#include "libavutil/pixdesc.h"
#if HAVE_INLINE_ASM
#define DITHER1XBPP
DECLARE_ASM_CONST(8, uint64_t, bF8)= 0xF8F8F8F8F8F8F8F8LL;
DECLARE_ASM_CONST(8, uint64_t, bFC)= 0xFCFCFCFCFCFCFCFCLL;
DECLARE_ASM_CONST(8, uint64_t, w10)= 0x0010001000100010LL;
DECLARE_ASM_CONST(8, uint64_t, w02)= 0x0002000200020002LL;
const DECLARE_ALIGNED(8, uint64_t, ff_dither4)[2] = {
0x0103010301030103LL,
0x0200020002000200LL,};
const DECLARE_ALIGNED(8, uint64_t, ff_dither8)[2] = {
0x0602060206020602LL,
0x0004000400040004LL,};
DECLARE_ASM_CONST(8, uint64_t, b16Mask)= 0x001F001F001F001FLL;
DECLARE_ASM_CONST(8, uint64_t, g16Mask)= 0x07E007E007E007E0LL;
DECLARE_ASM_CONST(8, uint64_t, r16Mask)= 0xF800F800F800F800LL;
DECLARE_ASM_CONST(8, uint64_t, b15Mask)= 0x001F001F001F001FLL;
DECLARE_ASM_CONST(8, uint64_t, g15Mask)= 0x03E003E003E003E0LL;
DECLARE_ASM_CONST(8, uint64_t, r15Mask)= 0x7C007C007C007C00LL;
DECLARE_ALIGNED(8, const uint64_t, ff_M24A) = 0x00FF0000FF0000FFLL;
DECLARE_ALIGNED(8, const uint64_t, ff_M24B) = 0xFF0000FF0000FF00LL;
DECLARE_ALIGNED(8, const uint64_t, ff_M24C) = 0x0000FF0000FF0000LL;
DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YOffset) = 0x1010101010101010ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_w1111) = 0x0001000100010001ULL;
//MMX versions
#if HAVE_MMX_INLINE
#undef RENAME
#define COMPILE_TEMPLATE_MMXEXT 0
#define RENAME(a) a ## _mmx
#include "swscale_template.c"
#endif
// MMXEXT versions
#if HAVE_MMXEXT_INLINE
#undef RENAME
#undef COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 1
#define RENAME(a) a ## _mmxext
#include "swscale_template.c"
#endif
void ff_updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufIndex,
int lastInLumBuf, int lastInChrBuf)
{
const int dstH= c->dstH;
const int flags= c->flags;
#ifdef NEW_FILTER
SwsPlane *lumPlane = &c->slice[c->numSlice-2].plane[0];
SwsPlane *chrUPlane = &c->slice[c->numSlice-2].plane[1];
SwsPlane *alpPlane = &c->slice[c->numSlice-2].plane[3];
#else
int16_t **lumPixBuf= c->lumPixBuf;
int16_t **chrUPixBuf= c->chrUPixBuf;
int16_t **alpPixBuf= c->alpPixBuf;
const int vLumBufSize= c->vLumBufSize;
const int vChrBufSize= c->vChrBufSize;
#endif
int hasAlpha = c->alpPixBuf != NULL;
int32_t *vLumFilterPos= c->vLumFilterPos;
int32_t *vChrFilterPos= c->vChrFilterPos;
int16_t *vLumFilter= c->vLumFilter;
int16_t *vChrFilter= c->vChrFilter;
int32_t *lumMmxFilter= c->lumMmxFilter;
int32_t *chrMmxFilter= c->chrMmxFilter;
int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
const int vLumFilterSize= c->vLumFilterSize;
const int vChrFilterSize= c->vChrFilterSize;
const int chrDstY= dstY>>c->chrDstVSubSample;
const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
c->blueDither= ff_dither8[dstY&1];
if (c->dstFormat == AV_PIX_FMT_RGB555 || c->dstFormat == AV_PIX_FMT_BGR555)
c->greenDither= ff_dither8[dstY&1];
else
c->greenDither= ff_dither4[dstY&1];
c->redDither= ff_dither8[(dstY+1)&1];
if (dstY < dstH - 2) {
#ifdef NEW_FILTER
const int16_t **lumSrcPtr = (const int16_t **)(void*) lumPlane->line + firstLumSrcY - lumPlane->sliceY;
const int16_t **chrUSrcPtr = (const int16_t **)(void*) chrUPlane->line + firstChrSrcY - chrUPlane->sliceY;
const int16_t **alpSrcPtr = (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) ? (const int16_t **)(void*) alpPlane->line + firstLumSrcY - alpPlane->sliceY : NULL;
#else
const int16_t **lumSrcPtr= (const int16_t **)(void*) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
const int16_t **chrUSrcPtr= (const int16_t **)(void*) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)(void*) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
#endif
int i;
if (firstLumSrcY < 0 || firstLumSrcY + vLumFilterSize > c->srcH) {
#ifdef NEW_FILTER
const int16_t **tmpY = (const int16_t **) lumPlane->tmp;
#else
const int16_t **tmpY = (const int16_t **) lumPixBuf + 2 * vLumBufSize;
#endif
int neg = -firstLumSrcY, i, end = FFMIN(c->srcH - firstLumSrcY, vLumFilterSize);
for (i = 0; i < neg; i++)
tmpY[i] = lumSrcPtr[neg];
for ( ; i < end; i++)
tmpY[i] = lumSrcPtr[i];
for ( ; i < vLumFilterSize; i++)
tmpY[i] = tmpY[i-1];
lumSrcPtr = tmpY;
if (alpSrcPtr) {
#ifdef NEW_FILTER
const int16_t **tmpA = (const int16_t **) alpPlane->tmp;
#else
const int16_t **tmpA = (const int16_t **) alpPixBuf + 2 * vLumBufSize;
#endif
for (i = 0; i < neg; i++)
tmpA[i] = alpSrcPtr[neg];
for ( ; i < end; i++)
tmpA[i] = alpSrcPtr[i];
for ( ; i < vLumFilterSize; i++)
tmpA[i] = tmpA[i - 1];
alpSrcPtr = tmpA;
}
}
if (firstChrSrcY < 0 || firstChrSrcY + vChrFilterSize > c->chrSrcH) {
#ifdef NEW_FILTER
const int16_t **tmpU = (const int16_t **) chrUPlane->tmp;
#else
const int16_t **tmpU = (const int16_t **) chrUPixBuf + 2 * vChrBufSize;
#endif
int neg = -firstChrSrcY, i, end = FFMIN(c->chrSrcH - firstChrSrcY, vChrFilterSize);
for (i = 0; i < neg; i++) {
tmpU[i] = chrUSrcPtr[neg];
}
for ( ; i < end; i++) {
tmpU[i] = chrUSrcPtr[i];
}
for ( ; i < vChrFilterSize; i++) {
tmpU[i] = tmpU[i - 1];
}
chrUSrcPtr = tmpU;
}
if (flags & SWS_ACCURATE_RND) {
int s= APCK_SIZE / 8;
for (i=0; i<vLumFilterSize; i+=2) {
*(const void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
*(const void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
lumMmxFilter[s*i+APCK_COEF/4 ]=
lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
+ (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
if (CONFIG_SWSCALE_ALPHA && hasAlpha) {
*(const void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ];
*(const void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)];
alpMmxFilter[s*i+APCK_COEF/4 ]=
alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4 ];
}
}
for (i=0; i<vChrFilterSize; i+=2) {
*(const void**)&chrMmxFilter[s*i ]= chrUSrcPtr[i ];
*(const void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrUSrcPtr[i+(vChrFilterSize>1)];
chrMmxFilter[s*i+APCK_COEF/4 ]=
chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
+ (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
}
} else {
for (i=0; i<vLumFilterSize; i++) {
*(const void**)&lumMmxFilter[4*i+0]= lumSrcPtr[i];
lumMmxFilter[4*i+2]=
lumMmxFilter[4*i+3]=
((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001U;
if (CONFIG_SWSCALE_ALPHA && hasAlpha) {
*(const void**)&alpMmxFilter[4*i+0]= alpSrcPtr[i];
alpMmxFilter[4*i+2]=
alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
}
}
for (i=0; i<vChrFilterSize; i++) {
*(const void**)&chrMmxFilter[4*i+0]= chrUSrcPtr[i];
chrMmxFilter[4*i+2]=
chrMmxFilter[4*i+3]=
((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001U;
}
}
}
}
#if HAVE_MMXEXT
static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset)
{
    if (((uintptr_t)dest) & 15) {
yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset);
return;
}
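    /* Added note: the MOVNTDQ non-temporal store in MAIN_FUNCTION requires
     * 16-byte-aligned addresses, hence the fallback above for unaligned
     * destinations. */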
filterSize--;
#define MAIN_FUNCTION \
"pxor %%xmm0, %%xmm0 \n\t" \
"punpcklbw %%xmm0, %%xmm3 \n\t" \
"movd %4, %%xmm1 \n\t" \
"punpcklwd %%xmm1, %%xmm1 \n\t" \
"punpckldq %%xmm1, %%xmm1 \n\t" \
"punpcklqdq %%xmm1, %%xmm1 \n\t" \
"psllw $3, %%xmm1 \n\t" \
"paddw %%xmm1, %%xmm3 \n\t" \
"psraw $4, %%xmm3 \n\t" \
"movdqa %%xmm3, %%xmm4 \n\t" \
"movdqa %%xmm3, %%xmm7 \n\t" \
"movl %3, %%ecx \n\t" \
"mov %0, %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
".p2align 4 \n\t" /* FIXME Unroll? */\
"1: \n\t"\
"movddup 8(%%"REG_d"), %%xmm0 \n\t" /* filterCoeff */\
"movdqa (%%"REG_S", %%"REG_c", 2), %%xmm2 \n\t" /* srcData */\
"movdqa 16(%%"REG_S", %%"REG_c", 2), %%xmm5 \n\t" /* srcData */\
"add $16, %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"test %%"REG_S", %%"REG_S" \n\t"\
"pmulhw %%xmm0, %%xmm2 \n\t"\
"pmulhw %%xmm0, %%xmm5 \n\t"\
"paddw %%xmm2, %%xmm3 \n\t"\
"paddw %%xmm5, %%xmm4 \n\t"\
" jnz 1b \n\t"\
"psraw $3, %%xmm3 \n\t"\
"psraw $3, %%xmm4 \n\t"\
"packuswb %%xmm4, %%xmm3 \n\t"\
"movntdq %%xmm3, (%1, %%"REG_c")\n\t"\
"add $16, %%"REG_c" \n\t"\
"cmp %2, %%"REG_c" \n\t"\
"movdqa %%xmm7, %%xmm3 \n\t" \
"movdqa %%xmm7, %%xmm4 \n\t" \
"mov %0, %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"jb 1b \n\t"
if (offset) {
__asm__ volatile(
"movq %5, %%xmm3 \n\t"
"movdqa %%xmm3, %%xmm4 \n\t"
"psrlq $24, %%xmm3 \n\t"
"psllq $40, %%xmm4 \n\t"
"por %%xmm4, %%xmm3 \n\t"
MAIN_FUNCTION
:: "g" (filter),
"r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
"m"(filterSize), "m"(((uint64_t *) dither)[0])
: XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,)
"%"REG_d, "%"REG_S, "%"REG_c
);
} else {
__asm__ volatile(
"movq %5, %%xmm3 \n\t"
MAIN_FUNCTION
:: "g" (filter),
"r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
"m"(filterSize), "m"(((uint64_t *) dither)[0])
: XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,)
"%"REG_d, "%"REG_S, "%"REG_c
);
}
}
#endif
#endif /* HAVE_INLINE_ASM */
#define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \
void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \
SwsContext *c, int16_t *data, \
int dstW, const uint8_t *src, \
const int16_t *filter, \
const int32_t *filterPos, int filterSize)
#define SCALE_FUNCS(filter_n, opt) \
SCALE_FUNC(filter_n, 8, 15, opt); \
SCALE_FUNC(filter_n, 9, 15, opt); \
SCALE_FUNC(filter_n, 10, 15, opt); \
SCALE_FUNC(filter_n, 12, 15, opt); \
SCALE_FUNC(filter_n, 14, 15, opt); \
SCALE_FUNC(filter_n, 16, 15, opt); \
SCALE_FUNC(filter_n, 8, 19, opt); \
SCALE_FUNC(filter_n, 9, 19, opt); \
SCALE_FUNC(filter_n, 10, 19, opt); \
SCALE_FUNC(filter_n, 12, 19, opt); \
SCALE_FUNC(filter_n, 14, 19, opt); \
SCALE_FUNC(filter_n, 16, 19, opt)
#define SCALE_FUNCS_MMX(opt) \
SCALE_FUNCS(4, opt); \
SCALE_FUNCS(8, opt); \
SCALE_FUNCS(X, opt)
#define SCALE_FUNCS_SSE(opt) \
SCALE_FUNCS(4, opt); \
SCALE_FUNCS(8, opt); \
SCALE_FUNCS(X4, opt); \
SCALE_FUNCS(X8, opt)
#if ARCH_X86_32
SCALE_FUNCS_MMX(mmx);
#endif
SCALE_FUNCS_SSE(sse2);
SCALE_FUNCS_SSE(ssse3);
SCALE_FUNCS_SSE(sse4);
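/* Example expansion (for illustration, not original text):
 * SCALE_FUNCS_SSE(sse2) declares, among others,
 *     void ff_hscale8to15_4_sse2(SwsContext *c, int16_t *data, int dstW,
 *                                const uint8_t *src, const int16_t *filter,
 *                                const int32_t *filterPos, int filterSize);
 * one prototype per (bit depth, filter size) pair, matching the asm entry
 * points emitted above. */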
#define VSCALEX_FUNC(size, opt) \
void ff_yuv2planeX_ ## size ## _ ## opt(const int16_t *filter, int filterSize, \
const int16_t **src, uint8_t *dest, int dstW, \
const uint8_t *dither, int offset)
#define VSCALEX_FUNCS(opt) \
VSCALEX_FUNC(8, opt); \
VSCALEX_FUNC(9, opt); \
VSCALEX_FUNC(10, opt)
#if ARCH_X86_32
VSCALEX_FUNCS(mmxext);
#endif
VSCALEX_FUNCS(sse2);
VSCALEX_FUNCS(sse4);
VSCALEX_FUNC(16, sse4);
VSCALEX_FUNCS(avx);
#define VSCALE_FUNC(size, opt) \
void ff_yuv2plane1_ ## size ## _ ## opt(const int16_t *src, uint8_t *dst, int dstW, \
const uint8_t *dither, int offset)
#define VSCALE_FUNCS(opt1, opt2) \
VSCALE_FUNC(8, opt1); \
VSCALE_FUNC(9, opt2); \
VSCALE_FUNC(10, opt2); \
VSCALE_FUNC(16, opt1)
#if ARCH_X86_32
VSCALE_FUNCS(mmx, mmxext);
#endif
VSCALE_FUNCS(sse2, sse2);
VSCALE_FUNC(16, sse4);
VSCALE_FUNCS(avx, avx);
#define INPUT_Y_FUNC(fmt, opt) \
void ff_ ## fmt ## ToY_ ## opt(uint8_t *dst, const uint8_t *src, \
const uint8_t *unused1, const uint8_t *unused2, \
int w, uint32_t *unused)
#define INPUT_UV_FUNC(fmt, opt) \
void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \
const uint8_t *unused0, \
const uint8_t *src1, \
const uint8_t *src2, \
int w, uint32_t *unused)
#define INPUT_FUNC(fmt, opt) \
INPUT_Y_FUNC(fmt, opt); \
INPUT_UV_FUNC(fmt, opt)
#define INPUT_FUNCS(opt) \
INPUT_FUNC(uyvy, opt); \
INPUT_FUNC(yuyv, opt); \
INPUT_UV_FUNC(nv12, opt); \
INPUT_UV_FUNC(nv21, opt); \
INPUT_FUNC(rgba, opt); \
INPUT_FUNC(bgra, opt); \
INPUT_FUNC(argb, opt); \
INPUT_FUNC(abgr, opt); \
INPUT_FUNC(rgb24, opt); \
INPUT_FUNC(bgr24, opt)
#if ARCH_X86_32
INPUT_FUNCS(mmx);
#endif
INPUT_FUNCS(sse2);
INPUT_FUNCS(ssse3);
INPUT_FUNCS(avx);
av_cold void ff_sws_init_swscale_x86(SwsContext *c)
{
int cpu_flags = av_get_cpu_flags();
#if HAVE_MMX_INLINE
if (INLINE_MMX(cpu_flags))
sws_init_swscale_mmx(c);
#endif
#if HAVE_MMXEXT_INLINE
if (INLINE_MMXEXT(cpu_flags))
sws_init_swscale_mmxext(c);
if (cpu_flags & AV_CPU_FLAG_SSE3){
if(c->use_mmx_vfilter && !(c->flags & SWS_ACCURATE_RND))
c->yuv2planeX = yuv2yuvX_sse3;
}
#endif
#define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt1, opt2) do { \
if (c->srcBpc == 8) { \
hscalefn = c->dstBpc <= 14 ? ff_hscale8to15_ ## filtersize ## _ ## opt2 : \
ff_hscale8to19_ ## filtersize ## _ ## opt1; \
} else if (c->srcBpc == 9) { \
hscalefn = c->dstBpc <= 14 ? ff_hscale9to15_ ## filtersize ## _ ## opt2 : \
ff_hscale9to19_ ## filtersize ## _ ## opt1; \
} else if (c->srcBpc == 10) { \
hscalefn = c->dstBpc <= 14 ? ff_hscale10to15_ ## filtersize ## _ ## opt2 : \
ff_hscale10to19_ ## filtersize ## _ ## opt1; \
} else if (c->srcBpc == 12) { \
hscalefn = c->dstBpc <= 14 ? ff_hscale12to15_ ## filtersize ## _ ## opt2 : \
ff_hscale12to19_ ## filtersize ## _ ## opt1; \
} else if (c->srcBpc == 14 || ((c->srcFormat==AV_PIX_FMT_PAL8||isAnyRGB(c->srcFormat)) && av_pix_fmt_desc_get(c->srcFormat)->comp[0].depth_minus1<15)) { \
hscalefn = c->dstBpc <= 14 ? ff_hscale14to15_ ## filtersize ## _ ## opt2 : \
ff_hscale14to19_ ## filtersize ## _ ## opt1; \
} else { /* c->srcBpc == 16 */ \
av_assert0(c->srcBpc == 16);\
hscalefn = c->dstBpc <= 14 ? ff_hscale16to15_ ## filtersize ## _ ## opt2 : \
ff_hscale16to19_ ## filtersize ## _ ## opt1; \
} \
} while (0)
#define ASSIGN_MMX_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \
switch (filtersize) { \
case 4: ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \
case 8: ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \
default: ASSIGN_SCALE_FUNC2(hscalefn, X, opt1, opt2); break; \
}
#define ASSIGN_VSCALEX_FUNC(vscalefn, opt, do_16_case, condition_8bit) \
switch(c->dstBpc){ \
case 16: do_16_case; break; \
case 10: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_10_ ## opt; break; \
case 9: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_9_ ## opt; break; \
case 8: if ((condition_8bit) && !c->use_mmx_vfilter) vscalefn = ff_yuv2planeX_8_ ## opt; break; \
}
#define ASSIGN_VSCALE_FUNC(vscalefn, opt1, opt2, opt2chk) \
switch(c->dstBpc){ \
case 16: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2plane1_16_ ## opt1; break; \
case 10: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_10_ ## opt2; break; \
case 9: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_9_ ## opt2; break; \
case 8: vscalefn = ff_yuv2plane1_8_ ## opt1; break; \
default: av_assert0(c->dstBpc>8); \
}
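/* Example (illustrative): ASSIGN_VSCALE_FUNC(c->yuv2plane1, sse2, sse2, 1)
 * with dstBpc == 8 resolves to c->yuv2plane1 = ff_yuv2plane1_8_sse2. */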
#define case_rgb(x, X, opt) \
case AV_PIX_FMT_ ## X: \
c->lumToYV12 = ff_ ## x ## ToY_ ## opt; \
if (!c->chrSrcHSubSample) \
c->chrToYV12 = ff_ ## x ## ToUV_ ## opt; \
break
#if ARCH_X86_32
if (EXTERNAL_MMX(cpu_flags)) {
ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx);
ASSIGN_MMX_SCALE_FUNC(c->hcScale, c->hChrFilterSize, mmx, mmx);
ASSIGN_VSCALE_FUNC(c->yuv2plane1, mmx, mmxext, cpu_flags & AV_CPU_FLAG_MMXEXT);
switch (c->srcFormat) {
case AV_PIX_FMT_YA8:
c->lumToYV12 = ff_yuyvToY_mmx;
if (c->alpPixBuf)
c->alpToYV12 = ff_uyvyToY_mmx;
break;
case AV_PIX_FMT_YUYV422:
c->lumToYV12 = ff_yuyvToY_mmx;
c->chrToYV12 = ff_yuyvToUV_mmx;
break;
case AV_PIX_FMT_UYVY422:
c->lumToYV12 = ff_uyvyToY_mmx;
c->chrToYV12 = ff_uyvyToUV_mmx;
break;
case AV_PIX_FMT_NV12:
c->chrToYV12 = ff_nv12ToUV_mmx;
break;
case AV_PIX_FMT_NV21:
c->chrToYV12 = ff_nv21ToUV_mmx;
break;
case_rgb(rgb24, RGB24, mmx);
case_rgb(bgr24, BGR24, mmx);
case_rgb(bgra, BGRA, mmx);
case_rgb(rgba, RGBA, mmx);
case_rgb(abgr, ABGR, mmx);
case_rgb(argb, ARGB, mmx);
default:
break;
}
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
ASSIGN_VSCALEX_FUNC(c->yuv2planeX, mmxext, , 1);
}
#endif /* ARCH_X86_32 */
#define ASSIGN_SSE_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \
switch (filtersize) { \
case 4: ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \
case 8: ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \
default: if (filtersize & 4) ASSIGN_SCALE_FUNC2(hscalefn, X4, opt1, opt2); \
else ASSIGN_SCALE_FUNC2(hscalefn, X8, opt1, opt2); \
break; \
}
if (EXTERNAL_SSE2(cpu_flags)) {
ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse2, sse2);
ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse2, sse2);
ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse2, ,
HAVE_ALIGNED_STACK || ARCH_X86_64);
ASSIGN_VSCALE_FUNC(c->yuv2plane1, sse2, sse2, 1);
switch (c->srcFormat) {
case AV_PIX_FMT_YA8:
c->lumToYV12 = ff_yuyvToY_sse2;
if (c->alpPixBuf)
c->alpToYV12 = ff_uyvyToY_sse2;
break;
case AV_PIX_FMT_YUYV422:
c->lumToYV12 = ff_yuyvToY_sse2;
c->chrToYV12 = ff_yuyvToUV_sse2;
break;
case AV_PIX_FMT_UYVY422:
c->lumToYV12 = ff_uyvyToY_sse2;
c->chrToYV12 = ff_uyvyToUV_sse2;
break;
case AV_PIX_FMT_NV12:
c->chrToYV12 = ff_nv12ToUV_sse2;
break;
case AV_PIX_FMT_NV21:
c->chrToYV12 = ff_nv21ToUV_sse2;
break;
case_rgb(rgb24, RGB24, sse2);
case_rgb(bgr24, BGR24, sse2);
case_rgb(bgra, BGRA, sse2);
case_rgb(rgba, RGBA, sse2);
case_rgb(abgr, ABGR, sse2);
case_rgb(argb, ARGB, sse2);
default:
break;
}
}
if (EXTERNAL_SSSE3(cpu_flags)) {
ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, ssse3, ssse3);
ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, ssse3, ssse3);
switch (c->srcFormat) {
case_rgb(rgb24, RGB24, ssse3);
case_rgb(bgr24, BGR24, ssse3);
default:
break;
}
}
if (EXTERNAL_SSE4(cpu_flags)) {
        /* the Xto15 variants don't need special sse4 functions */
ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse4, ssse3);
ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse4, ssse3);
ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse4,
if (!isBE(c->dstFormat)) c->yuv2planeX = ff_yuv2planeX_16_sse4,
HAVE_ALIGNED_STACK || ARCH_X86_64);
if (c->dstBpc == 16 && !isBE(c->dstFormat))
c->yuv2plane1 = ff_yuv2plane1_16_sse4;
}
if (EXTERNAL_AVX(cpu_flags)) {
ASSIGN_VSCALEX_FUNC(c->yuv2planeX, avx, ,
HAVE_ALIGNED_STACK || ARCH_X86_64);
ASSIGN_VSCALE_FUNC(c->yuv2plane1, avx, avx, 1);
switch (c->srcFormat) {
case AV_PIX_FMT_YUYV422:
c->chrToYV12 = ff_yuyvToUV_avx;
break;
case AV_PIX_FMT_UYVY422:
c->chrToYV12 = ff_uyvyToUV_avx;
break;
case AV_PIX_FMT_NV12:
c->chrToYV12 = ff_nv12ToUV_avx;
break;
case AV_PIX_FMT_NV21:
c->chrToYV12 = ff_nv21ToUV_avx;
break;
case_rgb(rgb24, RGB24, avx);
case_rgb(bgr24, BGR24, avx);
case_rgb(bgra, BGRA, avx);
case_rgb(rgba, RGBA, avx);
case_rgb(abgr, ABGR, avx);
case_rgb(argb, ARGB, avx);
default:
break;
}
}
}

File diff suppressed because it is too large


@@ -0,0 +1,31 @@
/*
* check XMM registers for clobbers on Win64
* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/x86/w64xmmtest.h"
#include "libswscale/swscale.h"
wrap(sws_scale(struct SwsContext *c, const uint8_t *const srcSlice[],
const int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *const dst[], const int dstStride[]))
{
testxmmclobbers(sws_scale, c, srcSlice, srcStride, srcSliceY,
srcSliceH, dst, dstStride);
}
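/* Added note: on Win64, xmm6-xmm15 are callee-saved; the wrap()/
 * testxmmclobbers() pair from w64xmmtest.h calls the real sws_scale()
 * and reports if any of those registers came back modified. */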


@@ -0,0 +1,119 @@
/*
* software YUV to RGB converter
*
* Copyright (C) 2009 Konstantin Shishkov
*
* MMX/MMXEXT template stuff (needed for fast movntq support),
* 1,4,8bpp support and context / deglobalize stuff
* by Michael Niedermayer (michaelni@gmx.at)
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
#include "config.h"
#include "libswscale/rgb2rgb.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/attributes.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/cpu.h"
#if HAVE_INLINE_ASM
#define DITHER1XBPP // only for MMX
/* hope these constant values are cache line aligned */
DECLARE_ASM_CONST(8, uint64_t, mmx_00ffw) = 0x00ff00ff00ff00ffULL;
DECLARE_ASM_CONST(8, uint64_t, mmx_redmask) = 0xf8f8f8f8f8f8f8f8ULL;
DECLARE_ASM_CONST(8, uint64_t, mmx_grnmask) = 0xfcfcfcfcfcfcfcfcULL;
DECLARE_ASM_CONST(8, uint64_t, pb_e0) = 0xe0e0e0e0e0e0e0e0ULL;
DECLARE_ASM_CONST(8, uint64_t, pb_03) = 0x0303030303030303ULL;
DECLARE_ASM_CONST(8, uint64_t, pb_07) = 0x0707070707070707ULL;
//MMX versions
#if HAVE_MMX_INLINE && HAVE_6REGS
#undef RENAME
#undef COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 0
#define RENAME(a) a ## _mmx
#include "yuv2rgb_template.c"
#endif /* HAVE_MMX_INLINE && HAVE_6REGS */
// MMXEXT versions
#if HAVE_MMXEXT_INLINE && HAVE_6REGS
#undef RENAME
#undef COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 1
#define RENAME(a) a ## _mmxext
#include "yuv2rgb_template.c"
#endif /* HAVE_MMXEXT_INLINE && HAVE_6REGS */
#endif /* HAVE_INLINE_ASM */
av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c)
{
#if HAVE_MMX_INLINE && HAVE_6REGS
int cpu_flags = av_get_cpu_flags();
#if HAVE_MMXEXT_INLINE
if (INLINE_MMXEXT(cpu_flags)) {
switch (c->dstFormat) {
case AV_PIX_FMT_RGB24:
return yuv420_rgb24_mmxext;
case AV_PIX_FMT_BGR24:
return yuv420_bgr24_mmxext;
}
}
#endif
if (INLINE_MMX(cpu_flags)) {
switch (c->dstFormat) {
case AV_PIX_FMT_RGB32:
if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
return yuva420_rgb32_mmx;
#endif
break;
} else
return yuv420_rgb32_mmx;
case AV_PIX_FMT_BGR32:
if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
return yuva420_bgr32_mmx;
#endif
break;
} else
return yuv420_bgr32_mmx;
case AV_PIX_FMT_RGB24:
return yuv420_rgb24_mmx;
case AV_PIX_FMT_BGR24:
return yuv420_bgr24_mmx;
case AV_PIX_FMT_RGB565:
return yuv420_rgb16_mmx;
case AV_PIX_FMT_RGB555:
return yuv420_rgb15_mmx;
}
}
#endif /* HAVE_MMX_INLINE && HAVE_6REGS */
return NULL;
}


@@ -0,0 +1,467 @@
/*
* software YUV to RGB converter
*
* Copyright (C) 2001-2007 Michael Niedermayer
* (c) 2010 Konstantin Shishkov
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/x86/asm.h"
#include "libswscale/swscale_internal.h"
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#if COMPILE_TEMPLATE_MMXEXT
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif
#define REG_BLUE "0"
#define REG_RED "1"
#define REG_GREEN "2"
#define REG_ALPHA "3"
#define YUV2RGB_LOOP(depth) \
h_size = (c->dstW + 7) & ~7; \
if (h_size * depth > FFABS(dstStride[0])) \
h_size -= 8; \
\
vshift = c->srcFormat != AV_PIX_FMT_YUV422P; \
\
__asm__ volatile ("pxor %mm4, %mm4\n\t"); \
for (y = 0; y < srcSliceH; y++) { \
uint8_t *image = dst[0] + (y + srcSliceY) * dstStride[0]; \
const uint8_t *py = src[0] + y * srcStride[0]; \
const uint8_t *pu = src[1] + (y >> vshift) * srcStride[1]; \
const uint8_t *pv = src[2] + (y >> vshift) * srcStride[2]; \
x86_reg index = -h_size / 2; \
#define YUV2RGB_INITIAL_LOAD \
__asm__ volatile ( \
"movq (%5, %0, 2), %%mm6\n\t" \
"movd (%2, %0), %%mm0\n\t" \
"movd (%3, %0), %%mm1\n\t" \
"1: \n\t" \
/* YUV2RGB core
 * Conversion is performed in the usual way:
 * R = Y' * Ycoef + Vred * V'
 * G = Y' * Ycoef + Vgreen * V' + Ugreen * U'
 * B = Y' * Ycoef + Ublue * U'
 *
 * where X' = X * 8 - Xoffset (the multiplication is done to gain a bit
 * of extra precision).
 * Since this operates in the YUV420 colorspace, the Y component is
 * additionally split into Y1 and Y2 for even and odd pixels.
*
* Input:
* mm0 - U (4 elems), mm1 - V (4 elems), mm6 - Y (8 elems), mm4 - zero register
* Output:
* mm1 - R, mm2 - G, mm0 - B
*/
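/* Added note (an assumption about the fixed-point layout, not original
 * text): "psllw $3" gives each component 3 extra fraction bits, and pmulhw
 * keeps the high 16 bits of the 16x16-bit product, so each multiply
 * effectively computes (X * 8 - offset) * coeff / 65536; the coefficient
 * tables in the context are pre-scaled to compensate. */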
#define YUV2RGB \
/* convert Y, U, V into Y1', Y2', U', V' */ \
"movq %%mm6, %%mm7\n\t" \
"punpcklbw %%mm4, %%mm0\n\t" \
"punpcklbw %%mm4, %%mm1\n\t" \
"pand "MANGLE(mmx_00ffw)", %%mm6\n\t" \
"psrlw $8, %%mm7\n\t" \
"psllw $3, %%mm0\n\t" \
"psllw $3, %%mm1\n\t" \
"psllw $3, %%mm6\n\t" \
"psllw $3, %%mm7\n\t" \
"psubsw "U_OFFSET"(%4), %%mm0\n\t" \
"psubsw "V_OFFSET"(%4), %%mm1\n\t" \
"psubw "Y_OFFSET"(%4), %%mm6\n\t" \
"psubw "Y_OFFSET"(%4), %%mm7\n\t" \
\
/* multiply by coefficients */ \
"movq %%mm0, %%mm2\n\t" \
"movq %%mm1, %%mm3\n\t" \
"pmulhw "UG_COEFF"(%4), %%mm2\n\t" \
"pmulhw "VG_COEFF"(%4), %%mm3\n\t" \
"pmulhw "Y_COEFF" (%4), %%mm6\n\t" \
"pmulhw "Y_COEFF" (%4), %%mm7\n\t" \
"pmulhw "UB_COEFF"(%4), %%mm0\n\t" \
"pmulhw "VR_COEFF"(%4), %%mm1\n\t" \
"paddsw %%mm3, %%mm2\n\t" \
/* now: mm0 = UB, mm1 = VR, mm2 = CG */ \
/* mm6 = Y1, mm7 = Y2 */ \
\
/* produce RGB */ \
"movq %%mm7, %%mm3\n\t" \
"movq %%mm7, %%mm5\n\t" \
"paddsw %%mm0, %%mm3\n\t" \
"paddsw %%mm1, %%mm5\n\t" \
"paddsw %%mm2, %%mm7\n\t" \
"paddsw %%mm6, %%mm0\n\t" \
"paddsw %%mm6, %%mm1\n\t" \
"paddsw %%mm6, %%mm2\n\t" \
#define RGB_PACK_INTERLEAVE \
/* pack and interleave even/odd pixels */ \
"packuswb %%mm1, %%mm0\n\t" \
"packuswb %%mm5, %%mm3\n\t" \
"packuswb %%mm2, %%mm2\n\t" \
"movq %%mm0, %%mm1\n\n" \
"packuswb %%mm7, %%mm7\n\t" \
"punpcklbw %%mm3, %%mm0\n\t" \
"punpckhbw %%mm3, %%mm1\n\t" \
"punpcklbw %%mm7, %%mm2\n\t" \
#define YUV2RGB_ENDLOOP(depth) \
"movq 8 (%5, %0, 2), %%mm6\n\t" \
"movd 4 (%3, %0), %%mm1\n\t" \
"movd 4 (%2, %0), %%mm0\n\t" \
"add $"AV_STRINGIFY(depth * 8)", %1\n\t" \
"add $4, %0\n\t" \
"js 1b\n\t" \
#if COMPILE_TEMPLATE_MMXEXT
#undef RGB_PACK24_B_OPERANDS
#define RGB_PACK24_B_OPERANDS NAMED_CONSTRAINTS_ARRAY_ADD(mask1101,mask0110,mask0100,mask0010,mask1001)
#else
#undef RGB_PACK24_B_OPERANDS
#define RGB_PACK24_B_OPERANDS
#endif
#define YUV2RGB_OPERANDS \
: "+r" (index), "+r" (image) \
: "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \
"r" (py - 2*index) \
NAMED_CONSTRAINTS_ADD(mmx_00ffw,pb_03,pb_07,mmx_redmask,pb_e0) \
RGB_PACK24_B_OPERANDS \
: "memory" \
); \
} \
#define YUV2RGB_OPERANDS_ALPHA \
: "+r" (index), "+r" (image) \
: "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \
"r" (py - 2*index), "r" (pa - 2*index) \
NAMED_CONSTRAINTS_ADD(mmx_00ffw) \
: "memory" \
); \
} \
#define YUV2RGB_ENDFUNC \
__asm__ volatile (SFENCE"\n\t" \
"emms \n\t"); \
return srcSliceH; \
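/* Added note: SFENCE makes the weakly-ordered MOVNTQ stores globally
 * visible before returning, and EMMS clears the MMX state so that
 * subsequent x87 floating-point code works correctly. */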
#define IF0(x)
#define IF1(x) x
#define RGB_PACK16(gmask, is15) \
"pand "MANGLE(mmx_redmask)", %%mm0\n\t" \
"pand "MANGLE(mmx_redmask)", %%mm1\n\t" \
"movq %%mm2, %%mm3\n\t" \
"psllw $"AV_STRINGIFY(3-is15)", %%mm2\n\t" \
"psrlw $"AV_STRINGIFY(5+is15)", %%mm3\n\t" \
"psrlw $3, %%mm0\n\t" \
IF##is15("psrlw $1, %%mm1\n\t") \
"pand "MANGLE(pb_e0)", %%mm2\n\t" \
"pand "MANGLE(gmask)", %%mm3\n\t" \
"por %%mm2, %%mm0\n\t" \
"por %%mm3, %%mm1\n\t" \
"movq %%mm0, %%mm2\n\t" \
"punpcklbw %%mm1, %%mm0\n\t" \
"punpckhbw %%mm1, %%mm2\n\t" \
MOVNTQ " %%mm0, (%1)\n\t" \
MOVNTQ " %%mm2, 8(%1)\n\t" \
#define DITHER_RGB \
"paddusb "BLUE_DITHER"(%4), %%mm0\n\t" \
"paddusb "GREEN_DITHER"(%4), %%mm2\n\t" \
"paddusb "RED_DITHER"(%4), %%mm1\n\t" \
#if !COMPILE_TEMPLATE_MMXEXT
static inline int RENAME(yuv420_rgb15)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(2)
#ifdef DITHER1XBPP
c->blueDither = ff_dither8[y & 1];
c->greenDither = ff_dither8[y & 1];
c->redDither = ff_dither8[(y + 1) & 1];
#endif
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK_INTERLEAVE
#ifdef DITHER1XBPP
DITHER_RGB
#endif
RGB_PACK16(pb_03, 1)
YUV2RGB_ENDLOOP(2)
YUV2RGB_OPERANDS
YUV2RGB_ENDFUNC
}
static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(2)
#ifdef DITHER1XBPP
c->blueDither = ff_dither8[y & 1];
c->greenDither = ff_dither4[y & 1];
c->redDither = ff_dither8[(y + 1) & 1];
#endif
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK_INTERLEAVE
#ifdef DITHER1XBPP
DITHER_RGB
#endif
RGB_PACK16(pb_07, 0)
YUV2RGB_ENDLOOP(2)
YUV2RGB_OPERANDS
YUV2RGB_ENDFUNC
}
#endif /* !COMPILE_TEMPLATE_MMXEXT */
#define RGB_PACK24(blue, red)\
"packuswb %%mm3, %%mm0 \n" /* R0 R2 R4 R6 R1 R3 R5 R7 */\
"packuswb %%mm5, %%mm1 \n" /* B0 B2 B4 B6 B1 B3 B5 B7 */\
"packuswb %%mm7, %%mm2 \n" /* G0 G2 G4 G6 G1 G3 G5 G7 */\
"movq %%mm"red", %%mm3 \n"\
"movq %%mm"blue", %%mm6 \n"\
"psrlq $32, %%mm"red" \n" /* R1 R3 R5 R7 */\
"punpcklbw %%mm2, %%mm3 \n" /* R0 G0 R2 G2 R4 G4 R6 G6 */\
"punpcklbw %%mm"red", %%mm6 \n" /* B0 R1 B2 R3 B4 R5 B6 R7 */\
"movq %%mm3, %%mm5 \n"\
"punpckhbw %%mm"blue", %%mm2 \n" /* G1 B1 G3 B3 G5 B5 G7 B7 */\
"punpcklwd %%mm6, %%mm3 \n" /* R0 G0 B0 R1 R2 G2 B2 R3 */\
"punpckhwd %%mm6, %%mm5 \n" /* R4 G4 B4 R5 R6 G6 B6 R7 */\
RGB_PACK24_B
#if COMPILE_TEMPLATE_MMXEXT
DECLARE_ASM_CONST(8, int16_t, mask1101[4]) = {-1,-1, 0,-1};
DECLARE_ASM_CONST(8, int16_t, mask0010[4]) = { 0, 0,-1, 0};
DECLARE_ASM_CONST(8, int16_t, mask0110[4]) = { 0,-1,-1, 0};
DECLARE_ASM_CONST(8, int16_t, mask1001[4]) = {-1, 0, 0,-1};
DECLARE_ASM_CONST(8, int16_t, mask0100[4]) = { 0,-1, 0, 0};
#undef RGB_PACK24_B
#define RGB_PACK24_B\
"pshufw $0xc6, %%mm2, %%mm1 \n"\
"pshufw $0x84, %%mm3, %%mm6 \n"\
"pshufw $0x38, %%mm5, %%mm7 \n"\
"pand "MANGLE(mask1101)", %%mm6 \n" /* R0 G0 B0 R1 -- -- R2 G2 */\
"movq %%mm1, %%mm0 \n"\
"pand "MANGLE(mask0110)", %%mm7 \n" /* -- -- R6 G6 B6 R7 -- -- */\
"movq %%mm1, %%mm2 \n"\
"pand "MANGLE(mask0100)", %%mm1 \n" /* -- -- G3 B3 -- -- -- -- */\
"psrlq $48, %%mm3 \n" /* B2 R3 -- -- -- -- -- -- */\
"pand "MANGLE(mask0010)", %%mm0 \n" /* -- -- -- -- G1 B1 -- -- */\
"psllq $32, %%mm5 \n" /* -- -- -- -- R4 G4 B4 R5 */\
"pand "MANGLE(mask1001)", %%mm2 \n" /* G5 B5 -- -- -- -- G7 B7 */\
"por %%mm3, %%mm1 \n"\
"por %%mm6, %%mm0 \n"\
"por %%mm5, %%mm1 \n"\
"por %%mm7, %%mm2 \n"\
MOVNTQ" %%mm0, (%1) \n"\
MOVNTQ" %%mm1, 8(%1) \n"\
MOVNTQ" %%mm2, 16(%1) \n"\
#else
#undef RGB_PACK24_B
#define RGB_PACK24_B\
"movd %%mm3, (%1) \n" /* R0 G0 B0 R1 */\
"movd %%mm2, 4(%1) \n" /* G1 B1 */\
"psrlq $32, %%mm3 \n"\
"psrlq $16, %%mm2 \n"\
"movd %%mm3, 6(%1) \n" /* R2 G2 B2 R3 */\
"movd %%mm2, 10(%1) \n" /* G3 B3 */\
"psrlq $16, %%mm2 \n"\
"movd %%mm5, 12(%1) \n" /* R4 G4 B4 R5 */\
"movd %%mm2, 16(%1) \n" /* G5 B5 */\
"psrlq $32, %%mm5 \n"\
"movd %%mm2, 20(%1) \n" /* -- -- G7 B7 */\
"movd %%mm5, 18(%1) \n" /* R6 G6 B6 R7 */\
#endif
static inline int RENAME(yuv420_rgb24)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(3)
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK24(REG_BLUE, REG_RED)
YUV2RGB_ENDLOOP(3)
YUV2RGB_OPERANDS
YUV2RGB_ENDFUNC
}
static inline int RENAME(yuv420_bgr24)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(3)
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK24(REG_RED, REG_BLUE)
YUV2RGB_ENDLOOP(3)
YUV2RGB_OPERANDS
YUV2RGB_ENDFUNC
}
#define SET_EMPTY_ALPHA \
"pcmpeqd %%mm"REG_ALPHA", %%mm"REG_ALPHA"\n\t" /* set alpha to 0xFF */ \
#define LOAD_ALPHA \
"movq (%6, %0, 2), %%mm"REG_ALPHA"\n\t" \
#define RGB_PACK32(red, green, blue, alpha) \
"movq %%mm"blue", %%mm5\n\t" \
"movq %%mm"red", %%mm6\n\t" \
"punpckhbw %%mm"green", %%mm5\n\t" \
"punpcklbw %%mm"green", %%mm"blue"\n\t" \
"punpckhbw %%mm"alpha", %%mm6\n\t" \
"punpcklbw %%mm"alpha", %%mm"red"\n\t" \
"movq %%mm"blue", %%mm"green"\n\t" \
"movq %%mm5, %%mm"alpha"\n\t" \
"punpcklwd %%mm"red", %%mm"blue"\n\t" \
"punpckhwd %%mm"red", %%mm"green"\n\t" \
"punpcklwd %%mm6, %%mm5\n\t" \
"punpckhwd %%mm6, %%mm"alpha"\n\t" \
MOVNTQ " %%mm"blue", 0(%1)\n\t" \
MOVNTQ " %%mm"green", 8(%1)\n\t" \
MOVNTQ " %%mm5, 16(%1)\n\t" \
MOVNTQ " %%mm"alpha", 24(%1)\n\t" \
#if !COMPILE_TEMPLATE_MMXEXT
static inline int RENAME(yuv420_rgb32)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(4)
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK_INTERLEAVE
SET_EMPTY_ALPHA
RGB_PACK32(REG_RED, REG_GREEN, REG_BLUE, REG_ALPHA)
YUV2RGB_ENDLOOP(4)
YUV2RGB_OPERANDS
YUV2RGB_ENDFUNC
}
#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
static inline int RENAME(yuva420_rgb32)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(4)
const uint8_t *pa = src[3] + y * srcStride[3];
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK_INTERLEAVE
LOAD_ALPHA
RGB_PACK32(REG_RED, REG_GREEN, REG_BLUE, REG_ALPHA)
YUV2RGB_ENDLOOP(4)
YUV2RGB_OPERANDS_ALPHA
YUV2RGB_ENDFUNC
}
#endif
static inline int RENAME(yuv420_bgr32)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(4)
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK_INTERLEAVE
SET_EMPTY_ALPHA
RGB_PACK32(REG_BLUE, REG_GREEN, REG_RED, REG_ALPHA)
YUV2RGB_ENDLOOP(4)
YUV2RGB_OPERANDS
YUV2RGB_ENDFUNC
}
#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
static inline int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(4)
const uint8_t *pa = src[3] + y * srcStride[3];
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK_INTERLEAVE
LOAD_ALPHA
RGB_PACK32(REG_BLUE, REG_GREEN, REG_RED, REG_ALPHA)
YUV2RGB_ENDLOOP(4)
YUV2RGB_OPERANDS_ALPHA
YUV2RGB_ENDFUNC
}
#endif
#endif /* !COMPILE_TEMPLATE_MMXEXT */


@@ -0,0 +1,977 @@
/*
* software YUV to RGB converter
*
* Copyright (C) 2009 Konstantin Shishkov
*
* 1,4,8bpp support and context / deglobalize stuff
* by Michael Niedermayer (michaelni@gmx.at)
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
#include "libavutil/cpu.h"
#include "libavutil/bswap.h"
#include "config.h"
#include "rgb2rgb.h"
#include "swscale.h"
#include "swscale_internal.h"
#include "libavutil/pixdesc.h"
const int32_t ff_yuv2rgb_coeffs[8][4] = {
{ 117504, 138453, 13954, 34903 }, /* no sequence_display_extension */
{ 117504, 138453, 13954, 34903 }, /* ITU-R Rec. 709 (1990) */
{ 104597, 132201, 25675, 53279 }, /* unspecified */
{ 104597, 132201, 25675, 53279 }, /* reserved */
{ 104448, 132798, 24759, 53109 }, /* FCC */
{ 104597, 132201, 25675, 53279 }, /* ITU-R Rec. 624-4 System B, G */
{ 104597, 132201, 25675, 53279 }, /* SMPTE 170M */
{ 117579, 136230, 16907, 35559 } /* SMPTE 240M (1987) */
};
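/* Added note (an assumption about the layout): each row holds the
 * { V->R, U->B, U->G, V->G } gains scaled by 65536; e.g. the BT.601 rows
 * encode 1.596, 2.017, 0.392 and 0.813 respectively. */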
const int *sws_getCoefficients(int colorspace)
{
if (colorspace > 7 || colorspace < 0)
colorspace = SWS_CS_DEFAULT;
return ff_yuv2rgb_coeffs[colorspace];
}
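/* Usage sketch (illustrative): the returned table is typically passed
 * straight to sws_setColorspaceDetails(), e.g.
 *     const int *tbl = sws_getCoefficients(SWS_CS_ITU601);
 *     sws_setColorspaceDetails(ctx, tbl, srcRange, tbl, dstRange,
 *                              brightness, contrast, saturation);
 */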
#define LOADCHROMA(i) \
U = pu[i]; \
V = pv[i]; \
r = (void *)c->table_rV[V+YUVRGB_TABLE_HEADROOM]; \
g = (void *)(c->table_gU[U+YUVRGB_TABLE_HEADROOM] + c->table_gV[V+YUVRGB_TABLE_HEADROOM]); \
b = (void *)c->table_bU[U+YUVRGB_TABLE_HEADROOM];
#define PUTRGB(dst, src, i) \
Y = src[2 * i]; \
dst[2 * i] = r[Y] + g[Y] + b[Y]; \
Y = src[2 * i + 1]; \
dst[2 * i + 1] = r[Y] + g[Y] + b[Y];
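/* Added note: the r/g/b tables each map a luma value to a packed pixel
 * with only that channel's bits set, so summing the three lookups
 * assembles a complete RGB word with no per-pixel multiplies. */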
#define PUTRGB24(dst, src, i) \
Y = src[2 * i]; \
dst[6 * i + 0] = r[Y]; \
dst[6 * i + 1] = g[Y]; \
dst[6 * i + 2] = b[Y]; \
Y = src[2 * i + 1]; \
dst[6 * i + 3] = r[Y]; \
dst[6 * i + 4] = g[Y]; \
dst[6 * i + 5] = b[Y];
#define PUTBGR24(dst, src, i) \
Y = src[2 * i]; \
dst[6 * i + 0] = b[Y]; \
dst[6 * i + 1] = g[Y]; \
dst[6 * i + 2] = r[Y]; \
Y = src[2 * i + 1]; \
dst[6 * i + 3] = b[Y]; \
dst[6 * i + 4] = g[Y]; \
dst[6 * i + 5] = r[Y];
#define PUTRGBA(dst, ysrc, asrc, i, s) \
Y = ysrc[2 * i]; \
dst[2 * i] = r[Y] + g[Y] + b[Y] + (asrc[2 * i] << s); \
Y = ysrc[2 * i + 1]; \
dst[2 * i + 1] = r[Y] + g[Y] + b[Y] + (asrc[2 * i + 1] << s);
#define PUTRGB48(dst, src, i) \
Y = src[ 2 * i]; \
dst[12 * i + 0] = dst[12 * i + 1] = r[Y]; \
dst[12 * i + 2] = dst[12 * i + 3] = g[Y]; \
dst[12 * i + 4] = dst[12 * i + 5] = b[Y]; \
Y = src[ 2 * i + 1]; \
dst[12 * i + 6] = dst[12 * i + 7] = r[Y]; \
dst[12 * i + 8] = dst[12 * i + 9] = g[Y]; \
dst[12 * i + 10] = dst[12 * i + 11] = b[Y];
#define PUTBGR48(dst, src, i) \
Y = src[2 * i]; \
dst[12 * i + 0] = dst[12 * i + 1] = b[Y]; \
dst[12 * i + 2] = dst[12 * i + 3] = g[Y]; \
dst[12 * i + 4] = dst[12 * i + 5] = r[Y]; \
Y = src[2 * i + 1]; \
dst[12 * i + 6] = dst[12 * i + 7] = b[Y]; \
dst[12 * i + 8] = dst[12 * i + 9] = g[Y]; \
dst[12 * i + 10] = dst[12 * i + 11] = r[Y];
#define YUV2RGBFUNC(func_name, dst_type, alpha) \
static int func_name(SwsContext *c, const uint8_t *src[], \
int srcStride[], int srcSliceY, int srcSliceH, \
uint8_t *dst[], int dstStride[]) \
{ \
int y; \
\
if (!alpha && c->srcFormat == AV_PIX_FMT_YUV422P) { \
srcStride[1] *= 2; \
srcStride[2] *= 2; \
} \
for (y = 0; y < srcSliceH; y += 2) { \
dst_type *dst_1 = \
(dst_type *)(dst[0] + (y + srcSliceY) * dstStride[0]); \
dst_type *dst_2 = \
(dst_type *)(dst[0] + (y + srcSliceY + 1) * dstStride[0]); \
dst_type av_unused *r, *g, *b; \
const uint8_t *py_1 = src[0] + y * srcStride[0]; \
const uint8_t *py_2 = py_1 + srcStride[0]; \
const uint8_t *pu = src[1] + (y >> 1) * srcStride[1]; \
const uint8_t *pv = src[2] + (y >> 1) * srcStride[2]; \
const uint8_t av_unused *pa_1, *pa_2; \
unsigned int h_size = c->dstW >> 3; \
if (alpha) { \
pa_1 = src[3] + y * srcStride[3]; \
pa_2 = pa_1 + srcStride[3]; \
} \
while (h_size--) { \
int av_unused U, V, Y; \
#define ENDYUV2RGBLINE(dst_delta, ss) \
pu += 4 >> ss; \
pv += 4 >> ss; \
py_1 += 8 >> ss; \
py_2 += 8 >> ss; \
dst_1 += dst_delta >> ss; \
dst_2 += dst_delta >> ss; \
} \
if (c->dstW & (4 >> ss)) { \
int av_unused Y, U, V; \
#define ENDYUV2RGBFUNC() \
} \
} \
return srcSliceH; \
}
#define CLOSEYUV2RGBFUNC(dst_delta) \
ENDYUV2RGBLINE(dst_delta, 0) \
ENDYUV2RGBFUNC()
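/* Added note: YUV2RGBFUNC opens the function and the per-line loop, the
 * PUTRGB* statements between it and ENDYUV2RGBLINE form the 8-pixel inner
 * loop body, the lines after ENDYUV2RGBLINE handle the leftover pixels,
 * and ENDYUV2RGBFUNC closes the loops and the function. */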
YUV2RGBFUNC(yuv2rgb_c_48, uint8_t, 0)
LOADCHROMA(0);
PUTRGB48(dst_1, py_1, 0);
PUTRGB48(dst_2, py_2, 0);
LOADCHROMA(1);
PUTRGB48(dst_2, py_2, 1);
PUTRGB48(dst_1, py_1, 1);
LOADCHROMA(2);
PUTRGB48(dst_1, py_1, 2);
PUTRGB48(dst_2, py_2, 2);
LOADCHROMA(3);
PUTRGB48(dst_2, py_2, 3);
PUTRGB48(dst_1, py_1, 3);
ENDYUV2RGBLINE(48, 0)
LOADCHROMA(0);
PUTRGB48(dst_1, py_1, 0);
PUTRGB48(dst_2, py_2, 0);
LOADCHROMA(1);
PUTRGB48(dst_2, py_2, 1);
PUTRGB48(dst_1, py_1, 1);
ENDYUV2RGBLINE(48, 1)
LOADCHROMA(0);
PUTRGB48(dst_1, py_1, 0);
PUTRGB48(dst_2, py_2, 0);
ENDYUV2RGBFUNC()
YUV2RGBFUNC(yuv2rgb_c_bgr48, uint8_t, 0)
LOADCHROMA(0);
PUTBGR48(dst_1, py_1, 0);
PUTBGR48(dst_2, py_2, 0);
LOADCHROMA(1);
PUTBGR48(dst_2, py_2, 1);
PUTBGR48(dst_1, py_1, 1);
LOADCHROMA(2);
PUTBGR48(dst_1, py_1, 2);
PUTBGR48(dst_2, py_2, 2);
LOADCHROMA(3);
PUTBGR48(dst_2, py_2, 3);
PUTBGR48(dst_1, py_1, 3);
ENDYUV2RGBLINE(48, 0)
LOADCHROMA(0);
PUTBGR48(dst_1, py_1, 0);
PUTBGR48(dst_2, py_2, 0);
LOADCHROMA(1);
PUTBGR48(dst_2, py_2, 1);
PUTBGR48(dst_1, py_1, 1);
ENDYUV2RGBLINE(48, 1)
LOADCHROMA(0);
PUTBGR48(dst_1, py_1, 0);
PUTBGR48(dst_2, py_2, 0);
ENDYUV2RGBFUNC()
YUV2RGBFUNC(yuv2rgb_c_32, uint32_t, 0)
LOADCHROMA(0);
PUTRGB(dst_1, py_1, 0);
PUTRGB(dst_2, py_2, 0);
LOADCHROMA(1);
PUTRGB(dst_2, py_2, 1);
PUTRGB(dst_1, py_1, 1);
LOADCHROMA(2);
PUTRGB(dst_1, py_1, 2);
PUTRGB(dst_2, py_2, 2);
LOADCHROMA(3);
PUTRGB(dst_2, py_2, 3);
PUTRGB(dst_1, py_1, 3);
ENDYUV2RGBLINE(8, 0)
LOADCHROMA(0);
PUTRGB(dst_1, py_1, 0);
PUTRGB(dst_2, py_2, 0);
LOADCHROMA(1);
PUTRGB(dst_2, py_2, 1);
PUTRGB(dst_1, py_1, 1);
ENDYUV2RGBLINE(8, 1)
LOADCHROMA(0);
PUTRGB(dst_1, py_1, 0);
PUTRGB(dst_2, py_2, 0);
ENDYUV2RGBFUNC()
YUV2RGBFUNC(yuva2rgba_c, uint32_t, 1)
LOADCHROMA(0);
PUTRGBA(dst_1, py_1, pa_1, 0, 24);
PUTRGBA(dst_2, py_2, pa_2, 0, 24);
LOADCHROMA(1);
PUTRGBA(dst_2, py_2, pa_2, 1, 24);
PUTRGBA(dst_1, py_1, pa_1, 1, 24);
LOADCHROMA(2);
PUTRGBA(dst_1, py_1, pa_1, 2, 24);
PUTRGBA(dst_2, py_2, pa_2, 2, 24);
LOADCHROMA(3);
PUTRGBA(dst_2, py_2, pa_2, 3, 24);
PUTRGBA(dst_1, py_1, pa_1, 3, 24);
pa_1 += 8;
pa_2 += 8;
ENDYUV2RGBLINE(8, 0)
LOADCHROMA(0);
PUTRGBA(dst_1, py_1, pa_1, 0, 24);
PUTRGBA(dst_2, py_2, pa_2, 0, 24);
LOADCHROMA(1);
PUTRGBA(dst_2, py_2, pa_2, 1, 24);
PUTRGBA(dst_1, py_1, pa_1, 1, 24);
pa_1 += 4;
pa_2 += 4;
ENDYUV2RGBLINE(8, 1)
LOADCHROMA(0);
PUTRGBA(dst_1, py_1, pa_1, 0, 24);
PUTRGBA(dst_2, py_2, pa_2, 0, 24);
ENDYUV2RGBFUNC()
YUV2RGBFUNC(yuva2argb_c, uint32_t, 1)
LOADCHROMA(0);
PUTRGBA(dst_1, py_1, pa_1, 0, 0);
PUTRGBA(dst_2, py_2, pa_2, 0, 0);
LOADCHROMA(1);
PUTRGBA(dst_2, py_2, pa_2, 1, 0);
PUTRGBA(dst_1, py_1, pa_1, 1, 0);
LOADCHROMA(2);
PUTRGBA(dst_1, py_1, pa_1, 2, 0);
PUTRGBA(dst_2, py_2, pa_2, 2, 0);
LOADCHROMA(3);
PUTRGBA(dst_2, py_2, pa_2, 3, 0);
PUTRGBA(dst_1, py_1, pa_1, 3, 0);
pa_1 += 8;
pa_2 += 8;
ENDYUV2RGBLINE(8, 0)
LOADCHROMA(0);
PUTRGBA(dst_1, py_1, pa_1, 0, 0);
PUTRGBA(dst_2, py_2, pa_2, 0, 0);
LOADCHROMA(1);
PUTRGBA(dst_2, py_2, pa_2, 1, 0);
PUTRGBA(dst_1, py_1, pa_1, 1, 0);
pa_1 += 4;
pa_2 += 4;
ENDYUV2RGBLINE(8, 1)
LOADCHROMA(0);
PUTRGBA(dst_1, py_1, pa_1, 0, 0);
PUTRGBA(dst_2, py_2, pa_2, 0, 0);
ENDYUV2RGBFUNC()
YUV2RGBFUNC(yuv2rgb_c_24_rgb, uint8_t, 0)
LOADCHROMA(0);
PUTRGB24(dst_1, py_1, 0);
PUTRGB24(dst_2, py_2, 0);
LOADCHROMA(1);
PUTRGB24(dst_2, py_2, 1);
PUTRGB24(dst_1, py_1, 1);
LOADCHROMA(2);
PUTRGB24(dst_1, py_1, 2);
PUTRGB24(dst_2, py_2, 2);
LOADCHROMA(3);
PUTRGB24(dst_2, py_2, 3);
PUTRGB24(dst_1, py_1, 3);
ENDYUV2RGBLINE(24, 0)
LOADCHROMA(0);
PUTRGB24(dst_1, py_1, 0);
PUTRGB24(dst_2, py_2, 0);
LOADCHROMA(1);
PUTRGB24(dst_2, py_2, 1);
PUTRGB24(dst_1, py_1, 1);
ENDYUV2RGBLINE(24, 1)
LOADCHROMA(0);
PUTRGB24(dst_1, py_1, 0);
PUTRGB24(dst_2, py_2, 0);
ENDYUV2RGBFUNC()
// identical to yuv2rgb_c_24_rgb except that the B and R components swap store order
YUV2RGBFUNC(yuv2rgb_c_24_bgr, uint8_t, 0)
LOADCHROMA(0);
PUTBGR24(dst_1, py_1, 0);
PUTBGR24(dst_2, py_2, 0);
LOADCHROMA(1);
PUTBGR24(dst_2, py_2, 1);
PUTBGR24(dst_1, py_1, 1);
LOADCHROMA(2);
PUTBGR24(dst_1, py_1, 2);
PUTBGR24(dst_2, py_2, 2);
LOADCHROMA(3);
PUTBGR24(dst_2, py_2, 3);
PUTBGR24(dst_1, py_1, 3);
ENDYUV2RGBLINE(24, 0)
LOADCHROMA(0);
PUTBGR24(dst_1, py_1, 0);
PUTBGR24(dst_2, py_2, 0);
LOADCHROMA(1);
PUTBGR24(dst_2, py_2, 1);
PUTBGR24(dst_1, py_1, 1);
ENDYUV2RGBLINE(24, 1)
LOADCHROMA(0);
PUTBGR24(dst_1, py_1, 0);
PUTBGR24(dst_2, py_2, 0);
ENDYUV2RGBFUNC()
YUV2RGBFUNC(yuv2rgb_c_16_ordered_dither, uint16_t, 0)
const uint8_t *d16 = ff_dither_2x2_8[y & 1];
const uint8_t *e16 = ff_dither_2x2_4[y & 1];
const uint8_t *f16 = ff_dither_2x2_8[(y & 1)^1];
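/*
 * PUTRGB16 assembles one RGB565 pixel per table add: LOADCHROMA left r, g
 * and b pointing into per-channel tables that already hold the shifted
 * 5/6/5 components, so r[..] + g[..] + b[..] packs the pixel directly.
 * d16/e16/f16 select rows of the 2x2 ordered-dither matrices: the
 * smaller-amplitude table (e16) feeds the 6-bit green channel, and f16 takes
 * the opposite row parity so the red and blue patterns do not coincide.
 */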
#define PUTRGB16(dst, src, i, o) \
Y = src[2 * i]; \
dst[2 * i] = r[Y + d16[0 + o]] + \
g[Y + e16[0 + o]] + \
b[Y + f16[0 + o]]; \
Y = src[2 * i + 1]; \
dst[2 * i + 1] = r[Y + d16[1 + o]] + \
g[Y + e16[1 + o]] + \
b[Y + f16[1 + o]];
LOADCHROMA(0);
PUTRGB16(dst_1, py_1, 0, 0);
PUTRGB16(dst_2, py_2, 0, 0 + 8);
LOADCHROMA(1);
PUTRGB16(dst_2, py_2, 1, 2 + 8);
PUTRGB16(dst_1, py_1, 1, 2);
LOADCHROMA(2);
PUTRGB16(dst_1, py_1, 2, 4);
PUTRGB16(dst_2, py_2, 2, 4 + 8);
LOADCHROMA(3);
PUTRGB16(dst_2, py_2, 3, 6 + 8);
PUTRGB16(dst_1, py_1, 3, 6);
CLOSEYUV2RGBFUNC(8)
YUV2RGBFUNC(yuv2rgb_c_15_ordered_dither, uint16_t, 0)
const uint8_t *d16 = ff_dither_2x2_8[y & 1];
const uint8_t *e16 = ff_dither_2x2_8[(y & 1)^1];
#define PUTRGB15(dst, src, i, o) \
Y = src[2 * i]; \
dst[2 * i] = r[Y + d16[0 + o]] + \
g[Y + d16[1 + o]] + \
b[Y + e16[0 + o]]; \
Y = src[2 * i + 1]; \
dst[2 * i + 1] = r[Y + d16[1 + o]] + \
g[Y + d16[0 + o]] + \
b[Y + e16[1 + o]];
LOADCHROMA(0);
PUTRGB15(dst_1, py_1, 0, 0);
PUTRGB15(dst_2, py_2, 0, 0 + 8);
LOADCHROMA(1);
PUTRGB15(dst_2, py_2, 1, 2 + 8);
PUTRGB15(dst_1, py_1, 1, 2);
LOADCHROMA(2);
PUTRGB15(dst_1, py_1, 2, 4);
PUTRGB15(dst_2, py_2, 2, 4 + 8);
LOADCHROMA(3);
PUTRGB15(dst_2, py_2, 3, 6 + 8);
PUTRGB15(dst_1, py_1, 3, 6);
CLOSEYUV2RGBFUNC(8)
// PUTRGB12 below relies on the r, g, b tables and the dst_1/dst_2 row pointers set up by YUV2RGBFUNC and LOADCHROMA
YUV2RGBFUNC(yuv2rgb_c_12_ordered_dither, uint16_t, 0)
const uint8_t *d16 = ff_dither_4x4_16[y & 3];
#define PUTRGB12(dst, src, i, o) \
Y = src[2 * i]; \
dst[2 * i] = r[Y + d16[0 + o]] + \
g[Y + d16[0 + o]] + \
b[Y + d16[0 + o]]; \
Y = src[2 * i + 1]; \
dst[2 * i + 1] = r[Y + d16[1 + o]] + \
g[Y + d16[1 + o]] + \
b[Y + d16[1 + o]];
LOADCHROMA(0);
PUTRGB12(dst_1, py_1, 0, 0);
PUTRGB12(dst_2, py_2, 0, 0 + 8);
LOADCHROMA(1);
PUTRGB12(dst_2, py_2, 1, 2 + 8);
PUTRGB12(dst_1, py_1, 1, 2);
LOADCHROMA(2);
PUTRGB12(dst_1, py_1, 2, 4);
PUTRGB12(dst_2, py_2, 2, 4 + 8);
LOADCHROMA(3);
PUTRGB12(dst_2, py_2, 3, 6 + 8);
PUTRGB12(dst_1, py_1, 3, 6);
CLOSEYUV2RGBFUNC(8)
// as above: PUTRGB8 relies on the r, g, b tables and the dst_1/dst_2 row pointers
YUV2RGBFUNC(yuv2rgb_c_8_ordered_dither, uint8_t, 0)
const uint8_t *d32 = ff_dither_8x8_32[y & 7];
const uint8_t *d64 = ff_dither_8x8_73[y & 7];
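/*
 * RGB8 output is 3-3-2: the two 3-bit channels share the lower-amplitude
 * 8x8 dither matrix (d32) while the 2-bit blue channel takes the stronger
 * one (d64); the table adds work exactly as in the 16 bpp case.
 */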
#define PUTRGB8(dst, src, i, o) \
Y = src[2 * i]; \
dst[2 * i] = r[Y + d32[0 + o]] + \
g[Y + d32[0 + o]] + \
b[Y + d64[0 + o]]; \
Y = src[2 * i + 1]; \
dst[2 * i + 1] = r[Y + d32[1 + o]] + \
g[Y + d32[1 + o]] + \
b[Y + d64[1 + o]];
LOADCHROMA(0);
PUTRGB8(dst_1, py_1, 0, 0);
PUTRGB8(dst_2, py_2, 0, 0 + 8);
LOADCHROMA(1);
PUTRGB8(dst_2, py_2, 1, 2 + 8);
PUTRGB8(dst_1, py_1, 1, 2);
LOADCHROMA(2);
PUTRGB8(dst_1, py_1, 2, 4);
PUTRGB8(dst_2, py_2, 2, 4 + 8);
LOADCHROMA(3);
PUTRGB8(dst_2, py_2, 3, 6 + 8);
PUTRGB8(dst_1, py_1, 3, 6);
ENDYUV2RGBLINE(8, 0)
const uint8_t *d32 = ff_dither_8x8_32[y & 7];
const uint8_t *d64 = ff_dither_8x8_73[y & 7];
LOADCHROMA(0);
PUTRGB8(dst_1, py_1, 0, 0);
PUTRGB8(dst_2, py_2, 0, 0 + 8);
LOADCHROMA(1);
PUTRGB8(dst_2, py_2, 1, 2 + 8);
PUTRGB8(dst_1, py_1, 1, 2);
ENDYUV2RGBLINE(8, 1)
const uint8_t *d32 = ff_dither_8x8_32[y & 7];
const uint8_t *d64 = ff_dither_8x8_73[y & 7];
LOADCHROMA(0);
PUTRGB8(dst_1, py_1, 0, 0);
PUTRGB8(dst_2, py_2, 0, 0 + 8);
ENDYUV2RGBFUNC()
YUV2RGBFUNC(yuv2rgb_c_4_ordered_dither, uint8_t, 0)
const uint8_t * d64 = ff_dither_8x8_73[y & 7];
const uint8_t *d128 = ff_dither_8x8_220[y & 7];
int acc;
#define PUTRGB4D(dst, src, i, o) \
Y = src[2 * i]; \
acc = r[Y + d128[0 + o]] + \
g[Y + d64[0 + o]] + \
b[Y + d128[0 + o]]; \
Y = src[2 * i + 1]; \
acc |= (r[Y + d128[1 + o]] + \
g[Y + d64[1 + o]] + \
b[Y + d128[1 + o]]) << 4; \
dst[i] = acc;
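/* PUTRGB4D packs two 4-bit pixels into one output byte: the even pixel
 * fills the low nibble and the odd pixel is shifted into the high nibble. */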
LOADCHROMA(0);
PUTRGB4D(dst_1, py_1, 0, 0);
PUTRGB4D(dst_2, py_2, 0, 0 + 8);
LOADCHROMA(1);
PUTRGB4D(dst_2, py_2, 1, 2 + 8);
PUTRGB4D(dst_1, py_1, 1, 2);
LOADCHROMA(2);
PUTRGB4D(dst_1, py_1, 2, 4);
PUTRGB4D(dst_2, py_2, 2, 4 + 8);
LOADCHROMA(3);
PUTRGB4D(dst_2, py_2, 3, 6 + 8);
PUTRGB4D(dst_1, py_1, 3, 6);
ENDYUV2RGBLINE(4, 0)
const uint8_t * d64 = ff_dither_8x8_73[y & 7];
const uint8_t *d128 = ff_dither_8x8_220[y & 7];
int acc;
LOADCHROMA(0);
PUTRGB4D(dst_1, py_1, 0, 0);
PUTRGB4D(dst_2, py_2, 0, 0 + 8);
LOADCHROMA(1);
PUTRGB4D(dst_2, py_2, 1, 2 + 8);
PUTRGB4D(dst_1, py_1, 1, 2);
ENDYUV2RGBLINE(4, 1)
const uint8_t * d64 = ff_dither_8x8_73[y & 7];
const uint8_t *d128 = ff_dither_8x8_220[y & 7];
int acc;
LOADCHROMA(0);
PUTRGB4D(dst_1, py_1, 0, 0);
PUTRGB4D(dst_2, py_2, 0, 0 + 8);
ENDYUV2RGBFUNC()
YUV2RGBFUNC(yuv2rgb_c_4b_ordered_dither, uint8_t, 0)
const uint8_t *d64 = ff_dither_8x8_73[y & 7];
const uint8_t *d128 = ff_dither_8x8_220[y & 7];
#define PUTRGB4DB(dst, src, i, o) \
Y = src[2 * i]; \
dst[2 * i] = r[Y + d128[0 + o]] + \
g[Y + d64[0 + o]] + \
b[Y + d128[0 + o]]; \
Y = src[2 * i + 1]; \
dst[2 * i + 1] = r[Y + d128[1 + o]] + \
g[Y + d64[1 + o]] + \
b[Y + d128[1 + o]];
LOADCHROMA(0);
PUTRGB4DB(dst_1, py_1, 0, 0);
PUTRGB4DB(dst_2, py_2, 0, 0 + 8);
LOADCHROMA(1);
PUTRGB4DB(dst_2, py_2, 1, 2 + 8);
PUTRGB4DB(dst_1, py_1, 1, 2);
LOADCHROMA(2);
PUTRGB4DB(dst_1, py_1, 2, 4);
PUTRGB4DB(dst_2, py_2, 2, 4 + 8);
LOADCHROMA(3);
PUTRGB4DB(dst_2, py_2, 3, 6 + 8);
PUTRGB4DB(dst_1, py_1, 3, 6);
ENDYUV2RGBLINE(8, 0)
const uint8_t *d64 = ff_dither_8x8_73[y & 7];
const uint8_t *d128 = ff_dither_8x8_220[y & 7];
LOADCHROMA(0);
PUTRGB4DB(dst_1, py_1, 0, 0);
PUTRGB4DB(dst_2, py_2, 0, 0 + 8);
LOADCHROMA(1);
PUTRGB4DB(dst_2, py_2, 1, 2 + 8);
PUTRGB4DB(dst_1, py_1, 1, 2);
ENDYUV2RGBLINE(8, 1)
const uint8_t *d64 = ff_dither_8x8_73[y & 7];
const uint8_t *d128 = ff_dither_8x8_220[y & 7];
LOADCHROMA(0);
PUTRGB4DB(dst_1, py_1, 0, 0);
PUTRGB4DB(dst_2, py_2, 0, 0 + 8);
ENDYUV2RGBFUNC()
YUV2RGBFUNC(yuv2rgb_c_1_ordered_dither, uint8_t, 0)
const uint8_t *d128 = ff_dither_8x8_220[y & 7];
char out_1 = 0, out_2 = 0;
g = c->table_gU[128 + YUVRGB_TABLE_HEADROOM] + c->table_gV[128 + YUVRGB_TABLE_HEADROOM];
#define PUTRGB1(out, src, i, o) \
Y = src[2 * i]; \
out += out + g[Y + d128[0 + o]]; \
Y = src[2 * i + 1]; \
out += out + g[Y + d128[1 + o]];
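/* PUTRGB1 gathers one dithered bit per pixel: "out += out + bit" shifts the
 * byte left and appends the new bit. Chroma was pinned to neutral above, so
 * g[] degenerates to a plain luma threshold ramp. */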
PUTRGB1(out_1, py_1, 0, 0);
PUTRGB1(out_2, py_2, 0, 0 + 8);
PUTRGB1(out_2, py_2, 1, 2 + 8);
PUTRGB1(out_1, py_1, 1, 2);
PUTRGB1(out_1, py_1, 2, 4);
PUTRGB1(out_2, py_2, 2, 4 + 8);
PUTRGB1(out_2, py_2, 3, 6 + 8);
PUTRGB1(out_1, py_1, 3, 6);
dst_1[0] = out_1;
dst_2[0] = out_2;
CLOSEYUV2RGBFUNC(1)
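/*
 * Runtime dispatch: offer the conversion to the platform initializers first
 * and fall back to the C routines above only when nothing accelerated fits
 * the context. A minimal caller sketch (internal setup path, error handling
 * omitted):
 *
 *     SwsFunc convert = ff_yuv2rgb_get_func_ptr(c);
 *     if (convert)
 *         convert(c, src, srcStride, srcSliceY, srcSliceH, dst, dstStride);
 */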
SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c)
{
SwsFunc t = NULL;
if (ARCH_PPC)
t = ff_yuv2rgb_init_ppc(c);
if (ARCH_X86)
t = ff_yuv2rgb_init_x86(c);
if (t)
return t;
av_log(c, AV_LOG_WARNING,
"No accelerated colorspace conversion found from %s to %s.\n",
av_get_pix_fmt_name(c->srcFormat), av_get_pix_fmt_name(c->dstFormat));
switch (c->dstFormat) {
case AV_PIX_FMT_BGR48BE:
case AV_PIX_FMT_BGR48LE:
return yuv2rgb_c_bgr48;
case AV_PIX_FMT_RGB48BE:
case AV_PIX_FMT_RGB48LE:
return yuv2rgb_c_48;
case AV_PIX_FMT_ARGB:
case AV_PIX_FMT_ABGR:
if (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat))
return yuva2argb_c;
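/* fall through: without a source alpha plane, ARGB/ABGR are produced by
 * yuv2rgb_c_32, whose tables already encode the component order */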
case AV_PIX_FMT_RGBA:
case AV_PIX_FMT_BGRA:
return (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat)) ? yuva2rgba_c : yuv2rgb_c_32;
case AV_PIX_FMT_RGB24:
return yuv2rgb_c_24_rgb;
case AV_PIX_FMT_BGR24:
return yuv2rgb_c_24_bgr;
case AV_PIX_FMT_RGB565:
case AV_PIX_FMT_BGR565:
return yuv2rgb_c_16_ordered_dither;
case AV_PIX_FMT_RGB555:
case AV_PIX_FMT_BGR555:
return yuv2rgb_c_15_ordered_dither;
case AV_PIX_FMT_RGB444:
case AV_PIX_FMT_BGR444:
return yuv2rgb_c_12_ordered_dither;
case AV_PIX_FMT_RGB8:
case AV_PIX_FMT_BGR8:
return yuv2rgb_c_8_ordered_dither;
case AV_PIX_FMT_RGB4:
case AV_PIX_FMT_BGR4:
return yuv2rgb_c_4_ordered_dither;
case AV_PIX_FMT_RGB4_BYTE:
case AV_PIX_FMT_BGR4_BYTE:
return yuv2rgb_c_4b_ordered_dither;
case AV_PIX_FMT_MONOBLACK:
return yuv2rgb_c_1_ordered_dither;
}
return NULL;
}
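/*
 * fill_table() turns a chroma code into a pre-offset pointer into the luma
 * ramp: inc is the 16.16 contribution of one chroma step, and the initial
 * "y_table -= elemsize * (inc >> 9)" subtracts the offset of the neutral
 * value 128 ((128 * inc) >> 16 == inc >> 9), so chroma 128 indexes the
 * unshifted ramp and a converter needs only a single load per channel.
 */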
static void fill_table(uint8_t* table[256 + 2*YUVRGB_TABLE_HEADROOM], const int elemsize,
const int64_t inc, void *y_tab)
{
int i;
uint8_t *y_table = y_tab;
y_table -= elemsize * (inc >> 9);
for (i = 0; i < 256 + 2*YUVRGB_TABLE_HEADROOM; i++) {
int64_t cb = av_clip_uint8(i-YUVRGB_TABLE_HEADROOM)*inc;
table[i] = y_table + elemsize * (cb >> 16);
}
}
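/*
 * Green is driven by both chroma channels, so it cannot be one pre-offset
 * pointer. fill_gv_table() stores plain element offsets instead; the
 * converters combine them as g = c->table_gU[U] + c->table_gV[V]
 * (pointer plus integer) before indexing with luma.
 */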
static void fill_gv_table(int table[256 + 2*YUVRGB_TABLE_HEADROOM], const int elemsize, const int64_t inc)
{
int i;
int off = -(inc >> 9);
for (i = 0; i < 256 + 2*YUVRGB_TABLE_HEADROOM; i++) {
int64_t cb = av_clip_uint8(i-YUVRGB_TABLE_HEADROOM)*inc;
table[i] = elemsize * (off + (cb >> 16));
}
}
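/* Round a 16.16 fixed-point value to the nearest integer and saturate it to
 * the int16_t range; the result is the raw uint16_t bit pattern, ready to be
 * splatted into the packed SIMD coefficient words below. */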
static uint16_t roundToInt16(int64_t f)
{
int r = (f + (1 << 15)) >> 16;
if (r < -0x7FFF)
return 0x8000;
else if (r > 0x7FFF)
return 0x7FFF;
else
return r;
}
av_cold int ff_yuv2rgb_c_init_tables(SwsContext *c, const int inv_table[4],
int fullRange, int brightness,
int contrast, int saturation)
{
const int isRgb = c->dstFormat == AV_PIX_FMT_RGB32 ||
c->dstFormat == AV_PIX_FMT_RGB32_1 ||
c->dstFormat == AV_PIX_FMT_BGR24 ||
c->dstFormat == AV_PIX_FMT_RGB565BE ||
c->dstFormat == AV_PIX_FMT_RGB565LE ||
c->dstFormat == AV_PIX_FMT_RGB555BE ||
c->dstFormat == AV_PIX_FMT_RGB555LE ||
c->dstFormat == AV_PIX_FMT_RGB444BE ||
c->dstFormat == AV_PIX_FMT_RGB444LE ||
c->dstFormat == AV_PIX_FMT_RGB8 ||
c->dstFormat == AV_PIX_FMT_RGB4 ||
c->dstFormat == AV_PIX_FMT_RGB4_BYTE ||
c->dstFormat == AV_PIX_FMT_MONOBLACK;
const int isNotNe = c->dstFormat == AV_PIX_FMT_NE(RGB565LE, RGB565BE) ||
c->dstFormat == AV_PIX_FMT_NE(RGB555LE, RGB555BE) ||
c->dstFormat == AV_PIX_FMT_NE(RGB444LE, RGB444BE) ||
c->dstFormat == AV_PIX_FMT_NE(BGR565LE, BGR565BE) ||
c->dstFormat == AV_PIX_FMT_NE(BGR555LE, BGR555BE) ||
c->dstFormat == AV_PIX_FMT_NE(BGR444LE, BGR444BE);
const int bpp = c->dstFormatBpp;
uint8_t *y_table;
uint16_t *y_table16;
uint32_t *y_table32;
int i, base, rbase, gbase, bbase, av_uninit(abase), needAlpha;
const int yoffs = (fullRange ? 384 : 326) + YUVRGB_TABLE_LUMA_HEADROOM;
const int table_plane_size = 1024 + 2*YUVRGB_TABLE_LUMA_HEADROOM;
int64_t crv = inv_table[0];
int64_t cbu = inv_table[1];
int64_t cgu = -inv_table[2];
int64_t cgv = -inv_table[3];
int64_t cy = 1 << 16;
int64_t oy = 0;
int64_t yb = 0;
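/*
 * Range adaptation: the inv_table coefficients assume MPEG (limited) range.
 * For limited-range input the luma scale is stretched by 255/219 and the
 * black level 16 is folded into oy; for full-range input the chroma
 * coefficients are shrunk by 224/255 instead, since U/V then span 255 steps
 * rather than 224.
 */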
if (!fullRange) {
cy = (cy * 255) / 219;
oy = 16 << 16;
} else {
crv = (crv * 224) / 255;
cbu = (cbu * 224) / 255;
cgu = (cgu * 224) / 255;
cgv = (cgv * 224) / 255;
}
cy = (cy * contrast) >> 16;
crv = (crv * contrast * saturation) >> 32;
cbu = (cbu * contrast * saturation) >> 32;
cgu = (cgu * contrast * saturation) >> 32;
cgv = (cgv * contrast * saturation) >> 32;
oy -= 256 * brightness;
c->uOffset = 0x0400040004000400LL;
c->vOffset = 0x0400040004000400LL;
c->yCoeff = roundToInt16(cy * 8192) * 0x0001000100010001ULL;
c->vrCoeff = roundToInt16(crv * 8192) * 0x0001000100010001ULL;
c->ubCoeff = roundToInt16(cbu * 8192) * 0x0001000100010001ULL;
c->vgCoeff = roundToInt16(cgv * 8192) * 0x0001000100010001ULL;
c->ugCoeff = roundToInt16(cgu * 8192) * 0x0001000100010001ULL;
c->yOffset = roundToInt16(oy * 8) * 0x0001000100010001ULL;
c->yuv2rgb_y_coeff = (int16_t)roundToInt16(cy << 13);
c->yuv2rgb_y_offset = (int16_t)roundToInt16(oy << 9);
c->yuv2rgb_v2r_coeff = (int16_t)roundToInt16(crv << 13);
c->yuv2rgb_v2g_coeff = (int16_t)roundToInt16(cgv << 13);
c->yuv2rgb_u2g_coeff = (int16_t)roundToInt16(cgu << 13);
c->yuv2rgb_u2b_coeff = (int16_t)roundToInt16(cbu << 13);
// rescale the chroma coefficients by 1/cy: fill_table() offsets are measured in luma-table steps
crv = ((crv << 16) + 0x8000) / FFMAX(cy, 1);
cbu = ((cbu << 16) + 0x8000) / FFMAX(cy, 1);
cgu = ((cgu << 16) + 0x8000) / FFMAX(cy, 1);
cgv = ((cgv << 16) + 0x8000) / FFMAX(cy, 1);
av_freep(&c->yuvTable);
#define ALLOC_YUV_TABLE(x) \
c->yuvTable = av_malloc(x); \
if (!c->yuvTable) \
return AVERROR(ENOMEM);
switch (bpp) {
case 1:
ALLOC_YUV_TABLE(table_plane_size);
y_table = c->yuvTable;
yb = -(384 << 16) - YUVRGB_TABLE_LUMA_HEADROOM*cy - oy;
for (i = 0; i < table_plane_size - 110; i++) {
y_table[i + 110] = av_clip_uint8((yb + 0x8000) >> 16) >> 7;
yb += cy;
}
fill_table(c->table_gU, 1, cgu, y_table + yoffs);
fill_gv_table(c->table_gV, 1, cgv);
break;
case 4:
case 4 | 128:
rbase = isRgb ? 3 : 0;
gbase = 1;
bbase = isRgb ? 0 : 3;
ALLOC_YUV_TABLE(table_plane_size * 3);
y_table = c->yuvTable;
yb = -(384 << 16) - YUVRGB_TABLE_LUMA_HEADROOM*cy - oy;
for (i = 0; i < table_plane_size - 110; i++) {
int yval = av_clip_uint8((yb + 0x8000) >> 16);
y_table[i + 110] = (yval >> 7) << rbase;
y_table[i + 37 + table_plane_size] = ((yval + 43) / 85) << gbase;
y_table[i + 110 + 2*table_plane_size] = (yval >> 7) << bbase;
yb += cy;
}
fill_table(c->table_rV, 1, crv, y_table + yoffs);
fill_table(c->table_gU, 1, cgu, y_table + yoffs + table_plane_size);
fill_table(c->table_bU, 1, cbu, y_table + yoffs + 2*table_plane_size);
fill_gv_table(c->table_gV, 1, cgv);
break;
case 8:
rbase = isRgb ? 5 : 0;
gbase = isRgb ? 2 : 3;
bbase = isRgb ? 0 : 6;
ALLOC_YUV_TABLE(table_plane_size * 3);
y_table = c->yuvTable;
yb = -(384 << 16) - YUVRGB_TABLE_LUMA_HEADROOM*cy - oy;
for (i = 0; i < table_plane_size - 38; i++) {
int yval = av_clip_uint8((yb + 0x8000) >> 16);
y_table[i + 16] = ((yval + 18) / 36) << rbase;
y_table[i + 16 + table_plane_size] = ((yval + 18) / 36) << gbase;
y_table[i + 37 + 2*table_plane_size] = ((yval + 43) / 85) << bbase;
yb += cy;
}
fill_table(c->table_rV, 1, crv, y_table + yoffs);
fill_table(c->table_gU, 1, cgu, y_table + yoffs + table_plane_size);
fill_table(c->table_bU, 1, cbu, y_table + yoffs + 2*table_plane_size);
fill_gv_table(c->table_gV, 1, cgv);
break;
case 12:
rbase = isRgb ? 8 : 0;
gbase = 4;
bbase = isRgb ? 0 : 8;
ALLOC_YUV_TABLE(table_plane_size * 3 * 2);
y_table16 = c->yuvTable;
yb = -(384 << 16) - YUVRGB_TABLE_LUMA_HEADROOM*cy - oy;
for (i = 0; i < table_plane_size; i++) {
uint8_t yval = av_clip_uint8((yb + 0x8000) >> 16);
y_table16[i] = (yval >> 4) << rbase;
y_table16[i + table_plane_size] = (yval >> 4) << gbase;
y_table16[i + 2*table_plane_size] = (yval >> 4) << bbase;
yb += cy;
}
if (isNotNe)
for (i = 0; i < table_plane_size * 3; i++)
y_table16[i] = av_bswap16(y_table16[i]);
fill_table(c->table_rV, 2, crv, y_table16 + yoffs);
fill_table(c->table_gU, 2, cgu, y_table16 + yoffs + table_plane_size);
fill_table(c->table_bU, 2, cbu, y_table16 + yoffs + 2*table_plane_size);
fill_gv_table(c->table_gV, 2, cgv);
break;
case 15:
case 16:
rbase = isRgb ? bpp - 5 : 0;
gbase = 5;
bbase = isRgb ? 0 : (bpp - 5);
ALLOC_YUV_TABLE(table_plane_size * 3 * 2);
y_table16 = c->yuvTable;
yb = -(384 << 16) - YUVRGB_TABLE_LUMA_HEADROOM*cy - oy;
for (i = 0; i < table_plane_size; i++) {
uint8_t yval = av_clip_uint8((yb + 0x8000) >> 16);
y_table16[i] = (yval >> 3) << rbase;
y_table16[i + table_plane_size] = (yval >> (18 - bpp)) << gbase;
y_table16[i + 2*table_plane_size] = (yval >> 3) << bbase;
yb += cy;
}
if (isNotNe)
for (i = 0; i < table_plane_size * 3; i++)
y_table16[i] = av_bswap16(y_table16[i]);
fill_table(c->table_rV, 2, crv, y_table16 + yoffs);
fill_table(c->table_gU, 2, cgu, y_table16 + yoffs + table_plane_size);
fill_table(c->table_bU, 2, cbu, y_table16 + yoffs + 2*table_plane_size);
fill_gv_table(c->table_gV, 2, cgv);
break;
case 24:
case 48:
ALLOC_YUV_TABLE(table_plane_size);
y_table = c->yuvTable;
yb = -(384 << 16) - YUVRGB_TABLE_LUMA_HEADROOM*cy - oy;
for (i = 0; i < table_plane_size; i++) {
y_table[i] = av_clip_uint8((yb + 0x8000) >> 16);
yb += cy;
}
fill_table(c->table_rV, 1, crv, y_table + yoffs);
fill_table(c->table_gU, 1, cgu, y_table + yoffs);
fill_table(c->table_bU, 1, cbu, y_table + yoffs);
fill_gv_table(c->table_gV, 1, cgv);
break;
case 32:
case 64:
base = (c->dstFormat == AV_PIX_FMT_RGB32_1 ||
c->dstFormat == AV_PIX_FMT_BGR32_1) ? 8 : 0;
rbase = base + (isRgb ? 16 : 0);
gbase = base + 8;
bbase = base + (isRgb ? 0 : 16);
needAlpha = CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat);
if (!needAlpha)
abase = (base + 24) & 31;
ALLOC_YUV_TABLE(table_plane_size * 3 * 4);
y_table32 = c->yuvTable;
yb = -(384 << 16) - YUVRGB_TABLE_LUMA_HEADROOM*cy - oy;
for (i = 0; i < table_plane_size; i++) {
unsigned yval = av_clip_uint8((yb + 0x8000) >> 16);
y_table32[i] = (yval << rbase) +
(needAlpha ? 0 : (255u << abase));
y_table32[i + table_plane_size] = yval << gbase;
y_table32[i + 2*table_plane_size] = yval << bbase;
yb += cy;
}
fill_table(c->table_rV, 4, crv, y_table32 + yoffs);
fill_table(c->table_gU, 4, cgu, y_table32 + yoffs + table_plane_size);
fill_table(c->table_bU, 4, cbu, y_table32 + yoffs + 2*table_plane_size);
fill_gv_table(c->table_gV, 4, cgv);
break;
default:
if (!isPlanar(c->dstFormat) || bpp <= 24)
av_log(c, AV_LOG_ERROR, "%ibpp not supported by yuv2rgb\n", bpp);
return -1;
}
return 0;
}