ffmpeg-2.8.5

git-svn-id: svn://kolibrios.org@6147 a494cfbc-eb01-0410-851d-a64ba20cac60
This commit is contained in:
Sergey Semyonov (Serge)
2016-02-05 22:08:02 +00:00
parent a08f61ddb9
commit a4b787f4b8
5429 changed files with 1356786 additions and 0 deletions

View File

@@ -0,0 +1,30 @@
# subsystems
# Each assignment appends PowerPC object files to the OBJS-<value> variable
# selected by the value of the named CONFIG_* symbol.
OBJS-$(CONFIG_AUDIODSP) += ppc/audiodsp.o
OBJS-$(CONFIG_BLOCKDSP) += ppc/blockdsp.o
OBJS-$(CONFIG_FFT) += ppc/fft_init.o \
ppc/fft_altivec.o \
ppc/fft_vsx.o
OBJS-$(CONFIG_FMTCONVERT) += ppc/fmtconvert_altivec.o
OBJS-$(CONFIG_H264CHROMA) += ppc/h264chroma_init.o
# Note: ppc/hpeldsp_altivec.o is listed here as well as under CONFIG_HPELDSP.
OBJS-$(CONFIG_H264DSP) += ppc/h264dsp.o ppc/hpeldsp_altivec.o
OBJS-$(CONFIG_H264QPEL) += ppc/h264qpel.o
OBJS-$(CONFIG_HPELDSP) += ppc/hpeldsp_altivec.o
OBJS-$(CONFIG_HUFFYUVDSP) += ppc/huffyuvdsp_altivec.o
OBJS-$(CONFIG_FDCTDSP) += ppc/fdctdsp.o
OBJS-$(CONFIG_IDCTDSP) += ppc/idctdsp.o
OBJS-$(CONFIG_ME_CMP) += ppc/me_cmp.o
OBJS-$(CONFIG_MPEGAUDIODSP) += ppc/mpegaudiodsp_altivec.o
OBJS-$(CONFIG_MPEGVIDEO) += ppc/mpegvideo_altivec.o \
ppc/mpegvideodsp.o
OBJS-$(CONFIG_MPEGVIDEOENC) += ppc/mpegvideoencdsp.o
OBJS-$(CONFIG_PIXBLOCKDSP) += ppc/pixblockdsp.o
OBJS-$(CONFIG_VIDEODSP) += ppc/videodsp_ppc.o
OBJS-$(CONFIG_VP3DSP) += ppc/vp3dsp_altivec.o
OBJS-$(CONFIG_VP8DSP) += ppc/vp8dsp_altivec.o
# decoders/encoders
OBJS-$(CONFIG_LLAUDDSP) += ppc/lossless_audiodsp_altivec.o
OBJS-$(CONFIG_SVQ1_ENCODER) += ppc/svq1enc_altivec.o
OBJS-$(CONFIG_VC1_DECODER) += ppc/vc1dsp_altivec.o
OBJS-$(CONFIG_VORBIS_DECODER) += ppc/vorbisdsp_altivec.o
# The VP7 decoder pulls in the same object as CONFIG_VP8DSP above.
OBJS-$(CONFIG_VP7_DECODER) += ppc/vp8dsp_altivec.o

View File

@@ -0,0 +1,141 @@
/*
* Copyright (c) 2009 Loren Merritt
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#define GLUE(a, b) a ## b
#define JOIN(a, b) GLUE(a, b)
#define X(s) JOIN(EXTERN_ASM, s)
#if ARCH_PPC64
#define PTR .quad
#define lp ld
#define lpx ldx
#define stp std
#define stpu stdu
#define PS 8
#define L(s) JOIN(., s)
/*
 * extfunc name (64-bit variant): open the definition of the global symbol
 * X(name), i.e. the name prefixed with EXTERN_ASM.
 *
 * When _CALL_ELF is 2 the label is placed in .text, r2 is computed from r12
 * and the .TOC. offset, and .localentry marks the point after those two
 * instructions. Otherwise a three-quad descriptor (code address L(name),
 * .TOC.@tocbase, 0) is emitted into the .opd section and the code itself
 * starts at the local label L(name).
 */
.macro extfunc name
.global X(\name)
#if _CALL_ELF == 2
.text
X(\name):
addis %r2, %r12, .TOC.-X(\name)@ha
addi %r2, %r2, .TOC.-X(\name)@l
.localentry X(\name), .-X(\name)
#else
.section .opd, "aw"
X(\name):
.quad L(\name), .TOC.@tocbase, 0
.previous
.type X(\name), STT_FUNC
L(\name):
#endif
.endm
/*
 * movrel rd, sym, gp (64-bit variant): load the GOT entry of sym, addressed
 * relative to r2, into rd. The gp argument is accepted but not used here.
 */
.macro movrel rd, sym, gp
ld \rd, \sym@got(r2)
.endm
/*
 * get_got rd (64-bit variant): expands to nothing; the 64-bit movrel above
 * addresses the GOT through r2 and does not need a separate GOT register.
 */
.macro get_got rd
.endm
#else /* ARCH_PPC64 */
#define PTR .int
#define lp lwz
#define lpx lwzx
#define stp stw
#define stpu stwu
#define PS 4
#define L(s) s
/*
 * extfunc name (32-bit variant): declare X(name) as a global function symbol
 * and define both X(name) and the unprefixed name as labels at the current
 * location.
 */
.macro extfunc name
.global X(\name)
.type X(\name), STT_FUNC
X(\name):
\name:
.endm
/*
 * movrel rd, sym, gp (32-bit variant): put the address of sym into rd.
 * With CONFIG_PIC the address is loaded from sym's GOT slot relative to the
 * register gp (see get_got); otherwise it is built directly from the @ha/@l
 * halves of the symbol address and gp is not used.
 */
.macro movrel rd, sym, gp
#if CONFIG_PIC
lwz \rd, \sym@got(\gp)
#else
lis \rd, \sym@ha
la \rd, \sym@l(\rd)
#endif
.endm
/*
 * get_got rd (32-bit variant): with CONFIG_PIC, compute the address of
 * _GLOBAL_OFFSET_TABLE_ into rd. The bcl to the immediately following label
 * puts that label's address in the link register, which is then copied to rd
 * and adjusted by the label-to-GOT distance. This overwrites the link
 * register. Without CONFIG_PIC the macro expands to nothing.
 */
.macro get_got rd
#if CONFIG_PIC
bcl 20, 31, .Lgot\@
.Lgot\@:
mflr \rd
addis \rd, \rd, _GLOBAL_OFFSET_TABLE_ - .Lgot\@@ha
addi \rd, \rd, _GLOBAL_OFFSET_TABLE_ - .Lgot\@@l
#endif
.endm
#endif /* ARCH_PPC64 */
#if HAVE_IBM_ASM
/*
 * DEFINE_REG n: define the symbols rN, fN and vN as the plain number N, so
 * that register names written as r3, f3 or v3 assemble as the bare register
 * number when HAVE_IBM_ASM is set.
 */
.macro DEFINE_REG n
.equiv r\n, \n
.equiv f\n, \n
.equiv v\n, \n
.endm
DEFINE_REG 0
DEFINE_REG 1
DEFINE_REG 2
DEFINE_REG 3
DEFINE_REG 4
DEFINE_REG 5
DEFINE_REG 6
DEFINE_REG 7
DEFINE_REG 8
DEFINE_REG 9
DEFINE_REG 10
DEFINE_REG 11
DEFINE_REG 12
DEFINE_REG 13
DEFINE_REG 14
DEFINE_REG 15
DEFINE_REG 16
DEFINE_REG 17
DEFINE_REG 18
DEFINE_REG 19
DEFINE_REG 20
DEFINE_REG 21
DEFINE_REG 22
DEFINE_REG 23
DEFINE_REG 24
DEFINE_REG 25
DEFINE_REG 26
DEFINE_REG 27
DEFINE_REG 28
DEFINE_REG 29
DEFINE_REG 30
DEFINE_REG 31
#endif /* HAVE_IBM_ASM */

View File

@@ -0,0 +1,72 @@
/*
* Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/**
* @file
* miscellaneous audio operations
*/
#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/audiodsp.h"
#if HAVE_ALTIVEC
/**
 * AltiVec dot product of two int16_t arrays.
 *
 * Elements are consumed eight at a time until at least @p order elements
 * have been processed, so an @p order that is not a multiple of 8 reads up
 * to the next multiple of 8.
 *
 * @param v1    first operand, read with vec_unaligned_load()
 * @param v2    second operand, read with vec_ld() -- presumably expected to
 *              be 16-byte aligned; confirm against callers
 * @param order number of elements to multiply and accumulate
 * @return the accumulated sum taken from element 3 of the accumulator
 */
static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
                                           int order)
{
    LOAD_ZERO;
    register vec_s16 lhs;
    register vec_s32 acc = vec_splat_s32(0), products;
    int32_t result;
    int done = 0;

    while (done < order) {
        lhs      = vec_unaligned_load(v1);
        /* multiply eight int16 pairs and add adjacent products */
        products = vec_msum(lhs, vec_ld(0, v2), zero_s32v);
        /* fold the four partial sums into element 3 of the accumulator */
        acc      = vec_sums(products, acc);
        v1   += 8;
        v2   += 8;
        done += 8;
    }

    /* broadcast element 3 so vec_ste stores the total */
    acc = vec_splat(acc, 3);
    vec_ste(acc, 0, &result);

    return result;
}
#endif /* HAVE_ALTIVEC */
/**
 * Install the PowerPC implementations into an AudioDSPContext.
 *
 * When built with AltiVec support and the running CPU reports AltiVec,
 * scalarproduct_int16 is pointed at the AltiVec version; otherwise the
 * context is left untouched.
 *
 * @param c context whose function pointers may be overridden
 */
av_cold void ff_audiodsp_init_ppc(AudioDSPContext *c)
{
#if HAVE_ALTIVEC
    if (PPC_ALTIVEC(av_get_cpu_flags()))
        c->scalarproduct_int16 = scalarproduct_int16_altivec;
#endif /* HAVE_ALTIVEC */
}

View File

@@ -0,0 +1,169 @@
/*
* Copyright (c) 2002 Brian Foley
* Copyright (c) 2002 Dieter Shirley
* Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif
#include <string.h>
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavcodec/blockdsp.h"
/* ***** WARNING ***** WARNING ***** WARNING ***** */
/*
* clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with
* a cache line size not equal to 32 bytes. Fortunately all processors used
* by Apple up to at least the 7450 (AKA second generation G4) use 32-byte
* cache lines. This is due to the use of the 'dcbz' instruction. It simply
* clears a single cache line to zero, so you need to know the cache line
* size to use it! It's absurd, but it's fast...
*
* update 24/06/2003: Apple released the G5 yesterday, with a PPC970.
* cache line size: 128 bytes. Oops.
* The semantics of dcbz was changed, it always clears 32 bytes. So the function
* below will work, but will be slow. So I fixed check_dcbz_effect to use dcbzl,
* which is defined to clear a cache line (as dcbz before). So we can still
* distinguish, and use dcbz (32 bytes) or dcbzl (one cache line) as required.
*
* see <http://developer.apple.com/technotes/tn/tn2087.html>
* and <http://developer.apple.com/technotes/tn/tn2086.html>
*/
/**
 * Zero six 64-coefficient int16_t blocks (768 bytes) with 'dcbz', assuming
 * the instruction clears 32 bytes per invocation.
 *
 * If bit 4 of the address is set (the buffer starts 16 bytes into a 32-byte
 * line), the first and last 16 bytes are cleared with memset() and dcbz
 * handles the 32-byte lines in between.
 *
 * The edge bytes used to be cleared through an 'unsigned long *' with fixed
 * indices 0..3 and 188..191. That is only correct when unsigned long is
 * 4 bytes wide; with an 8-byte unsigned long the tail indices address bytes
 * 1504..1535, far beyond the 768-byte buffer. Byte-sized memset() calls are
 * independent of the type width and avoid the aliasing cast.
 *
 * @param blocks buffer of 6 * 64 int16_t coefficients to clear
 */
static void clear_blocks_dcbz32_ppc(int16_t *blocks)
{
    enum {
        TOTAL_BYTES = sizeof(int16_t) * 6 * 64, /* 768 */
        EDGE_BYTES  = 16
    };
    register int misal = (unsigned long) blocks & 0x00000010, i = 0;

    if (misal) {
        /* leading half line that dcbz must not touch */
        memset(blocks, 0, EDGE_BYTES);
        i += EDGE_BYTES;
    }

    for (; i < TOTAL_BYTES - 31; i += 32)
        __asm__ volatile ("dcbz %0,%1" :: "b" (blocks), "r" (i) : "memory");

    if (misal) {
        /* trailing half line left over after the last full dcbz */
        memset((char *) blocks + TOTAL_BYTES - EDGE_BYTES, 0, EDGE_BYTES);
    }
}
/* Same as above, when dcbzl clears a whole 128 bytes cache line
* i.e. the PPC970 AKA G5. */
/**
 * Zero six 64-coefficient int16_t blocks (768 bytes), using 'dcbzl' in
 * 128-byte steps when it is available and the buffer is 128-byte aligned.
 *
 * In every other case (no dcbzl support at build time, or a buffer that is
 * not 128-byte aligned) the buffer is cleared with memset().
 *
 * @param blocks buffer of 6 * 64 int16_t coefficients to clear
 */
static void clear_blocks_dcbz128_ppc(int16_t *blocks)
{
#if HAVE_DCBZL
    register int misal = (unsigned long) blocks & 0x0000007f, i;

    if (!misal) {
        /* aligned: one dcbzl per 128-byte step */
        for (i = 0; i < sizeof(int16_t) * 6 * 64; i += 128)
            __asm__ volatile ("dcbzl %0,%1" :: "b" (blocks), "r" (i) : "memory");
        return;
    }
#endif
    /* generic path: misaligned buffer or no dcbzl support */
    memset(blocks, 0, sizeof(int16_t) * 6 * 64);
}
/* Check dcbz: report how many bytes are set to 0 by dcbz. */
/* update 24/06/2003: Replace dcbz by dcbzl to get the intended effect
* (Apple "fixed" dcbz). Unfortunately this cannot be used unless the
* assembler knows about dcbzl ... */
/**
 * Measure how many bytes a single 'dcbzl' clears.
 *
 * A 1024-byte scratch buffer is filled with 0xFF, dcbzl is issued on its
 * midpoint, and the zero bytes are counted afterwards.
 *
 * @return the number of zeroed bytes; 0 when built without dcbzl support or
 *         when the scratch buffer cannot be allocated
 */
static long check_dcbzl_effect(void)
{
    long count = 0;
#if HAVE_DCBZL
    register char *fakedata = av_malloc(1024);
    register char *fakedata_middle;
    register long zero = 0, i = 0;

    if (!fakedata)
        return 0L;

    fakedata_middle = fakedata + 512;
    memset(fakedata, 0xFF, 1024);

    /* Below the constraint "b" seems to mean "address base register"
     * in gcc-3.3 / RS/6000 speaks. Seems to avoid using r0, so....
     *
     * The "memory" clobber is required: the instruction writes to the buffer
     * behind the compiler's back, and without the clobber the compiler may
     * assume the 0xFF pattern from memset() is still intact and fold the
     * counting loop below to a constant. */
    __asm__ volatile ("dcbzl %0, %1"
                      :: "b" (fakedata_middle), "r" (zero)
                      : "memory");

    for (i = 0; i < 1024; i++)
        if (fakedata[i] == (char) 0)
            count++;

    av_free(fakedata);
#endif
    return count;
}
#if HAVE_ALTIVEC
/**
 * Zero one block of 64 int16_t coefficients (128 bytes) with eight 16-byte
 * AltiVec stores at byte offsets 0, 16, ..., 112.
 *
 * @param block coefficients to clear; written with vec_st, so presumably
 *              expected to be 16-byte aligned -- confirm against callers
 */
static void clear_block_altivec(int16_t *block)
{
    LOAD_ZERO;
    int offset;

    for (offset = 0; offset < 128; offset += 16)
        vec_st(zero_s16v, offset, block);
}
#endif /* HAVE_ALTIVEC */
/**
 * Install the PowerPC implementations into a BlockDSPContext.
 *
 * For non-high-bit-depth content, clear_blocks is chosen from the number of
 * bytes check_dcbzl_effect() reports (32 or 128; any other value leaves the
 * pointer unchanged). When built with AltiVec and the CPU reports AltiVec,
 * clear_block is additionally replaced for non-high-bit-depth content.
 *
 * @param c              context whose function pointers may be overridden
 * @param high_bit_depth non-zero disables all overrides done here
 */
av_cold void ff_blockdsp_init_ppc(BlockDSPContext *c, unsigned high_bit_depth)
{
    /* cache-line based clearing is independent of AltiVec availability */
    if (!high_bit_depth) {
        const long cleared = check_dcbzl_effect();

        if (cleared == 32)
            c->clear_blocks = clear_blocks_dcbz32_ppc;
        else if (cleared == 128)
            c->clear_blocks = clear_blocks_dcbz128_ppc;
    }

#if HAVE_ALTIVEC
    if (PPC_ALTIVEC(av_get_cpu_flags()) && !high_bit_depth)
        c->clear_block = clear_block_altivec;
#endif /* HAVE_ALTIVEC */
}

View File

@@ -0,0 +1,32 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "fdct.h"
/* Architecture-specific forward-DCT entries: the AltiVec fdct is listed only
 * when built with HAVE_ALTIVEC. The all-zero entry ends the table. */
static const struct algo fdct_tab_arch[] = {
#if HAVE_ALTIVEC
{ "altivecfdct", ff_fdct_altivec, FF_IDCT_PERM_NONE, AV_CPU_FLAG_ALTIVEC },
#endif
{ 0 }
};
/* Architecture-specific inverse-DCT entries: none are listed here, so the
 * table holds only the all-zero terminating entry. */
static const struct algo idct_tab_arch[] = {
{ 0 }
};

View File

@@ -0,0 +1,26 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_PPC_FDCT_H
#define AVCODEC_PPC_FDCT_H
#include <stdint.h>
/**
 * AltiVec two-dimensional forward discrete cosine transform.
 *
 * The implementation reads eight 16-byte vectors (64 int16_t coefficients)
 * from @p block and stores the transformed coefficients back to the same
 * memory.
 *
 * @param block 64 coefficients, transformed in place; accessed with
 *              vec_ld/vec_st, so presumably expected to be 16-byte aligned
 */
void ff_fdct_altivec(int16_t *block);
#endif /* AVCODEC_PPC_FDCT_H */

View File

@@ -0,0 +1,479 @@
/*
* Copyright (C) 2003 James Klicman <james@klicman.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
#include "libavcodec/fdctdsp.h"
#include "fdct.h"
#if HAVE_ALTIVEC
/* Reinterpreting casts between AltiVec vector types. */
#define vs16(v) ((vector signed short) (v))
#define vs32(v) ((vector signed int) (v))
#define vu8(v) ((vector unsigned char) (v))
#define vu16(v) ((vector unsigned short) (v))
#define vu32(v) ((vector unsigned int) (v))
/* Cosine constants the W* weights below are built from. */
#define C1 0.98078525066375732421875000 /* cos(1 * PI / 16) */
#define C2 0.92387950420379638671875000 /* cos(2 * PI / 16) */
#define C3 0.83146959543228149414062500 /* cos(3 * PI / 16) */
#define C4 0.70710676908493041992187500 /* cos(4 * PI / 16) */
#define C5 0.55557024478912353515625000 /* cos(5 * PI / 16) */
#define C6 0.38268342614173889160156250 /* cos(6 * PI / 16) */
#define C7 0.19509032368659973144531250 /* cos(7 * PI / 16) */
#define SQRT_2 1.41421353816986083984375000 /* sqrt(2) */
/* Multiplier weights used by the row/column passes; W0..WB are stored in
 * fdctconsts in this order. */
#define W0 -(2 * C2)
#define W1 (2 * C6)
#define W2 (SQRT_2 * C6)
#define W3 (SQRT_2 * C3)
#define W4 (SQRT_2 * (-C1 + C3 + C5 - C7))
#define W5 (SQRT_2 * (C1 + C3 - C5 + C7))
#define W6 (SQRT_2 * (C1 + C3 + C5 - C7))
#define W7 (SQRT_2 * (C1 + C3 - C5 - C7))
#define W8 (SQRT_2 * (C7 - C3))
#define W9 (SQRT_2 * (-C1 - C3))
#define WA (SQRT_2 * (-C3 - C5))
#define WB (SQRT_2 * (C5 - C3))
/* The twelve weights packed four per vector; ff_fdct_altivec loads the three
 * vectors into cnsts0, cnsts1 and cnsts2. */
static const vector float fdctconsts[3] = {
{ W0, W1, W2, W3 },
{ W4, W5, W6, W7 },
{ W8, W9, WA, WB }
};
/* LD_Wn broadcasts weight Wn to all four lanes; each expects the local
 * variables cnsts0..cnsts2 to hold the corresponding fdctconsts vector. */
#define LD_W0 vec_splat(cnsts0, 0)
#define LD_W1 vec_splat(cnsts0, 1)
#define LD_W2 vec_splat(cnsts0, 2)
#define LD_W3 vec_splat(cnsts0, 3)
#define LD_W4 vec_splat(cnsts1, 0)
#define LD_W5 vec_splat(cnsts1, 1)
#define LD_W6 vec_splat(cnsts1, 2)
#define LD_W7 vec_splat(cnsts1, 3)
#define LD_W8 vec_splat(cnsts2, 0)
#define LD_W9 vec_splat(cnsts2, 1)
#define LD_WA vec_splat(cnsts2, 2)
#define LD_WB vec_splat(cnsts2, 3)
/*
 * FDCTROW(b0..b7): one 8-point forward DCT pass over eight float vectors,
 * four lanes at a time. The arguments are both inputs and outputs.
 *
 * The expansion uses the caller's locals x0..x8 and cnst as scratch and reads
 * mzero and cnsts0..cnsts2 (through the LD_W* macros). It ends without a
 * trailing semicolon. The body is the same sequence of operations as FDCTCOL;
 * within the code visible here only FDCTCOL is invoked.
 */
#define FDCTROW(b0, b1, b2, b3, b4, b5, b6, b7) /* {{{ */ \
x0 = vec_add(b0, b7); /* x0 = b0 + b7; */ \
x7 = vec_sub(b0, b7); /* x7 = b0 - b7; */ \
x1 = vec_add(b1, b6); /* x1 = b1 + b6; */ \
x6 = vec_sub(b1, b6); /* x6 = b1 - b6; */ \
x2 = vec_add(b2, b5); /* x2 = b2 + b5; */ \
x5 = vec_sub(b2, b5); /* x5 = b2 - b5; */ \
x3 = vec_add(b3, b4); /* x3 = b3 + b4; */ \
x4 = vec_sub(b3, b4); /* x4 = b3 - b4; */ \
\
b7 = vec_add(x0, x3); /* b7 = x0 + x3; */ \
b1 = vec_add(x1, x2); /* b1 = x1 + x2; */ \
b0 = vec_add(b7, b1); /* b0 = b7 + b1; */ \
b4 = vec_sub(b7, b1); /* b4 = b7 - b1; */ \
\
b2 = vec_sub(x0, x3); /* b2 = x0 - x3; */ \
b6 = vec_sub(x1, x2); /* b6 = x1 - x2; */ \
b5 = vec_add(b6, b2); /* b5 = b6 + b2; */ \
cnst = LD_W2; \
b5 = vec_madd(cnst, b5, mzero); /* b5 = b5 * W2; */ \
cnst = LD_W1; \
b2 = vec_madd(cnst, b2, b5); /* b2 = b5 + b2 * W1; */ \
cnst = LD_W0; \
b6 = vec_madd(cnst, b6, b5); /* b6 = b5 + b6 * W0; */ \
\
x0 = vec_add(x4, x7); /* x0 = x4 + x7; */ \
x1 = vec_add(x5, x6); /* x1 = x5 + x6; */ \
x2 = vec_add(x4, x6); /* x2 = x4 + x6; */ \
x3 = vec_add(x5, x7); /* x3 = x5 + x7; */ \
x8 = vec_add(x2, x3); /* x8 = x2 + x3; */ \
cnst = LD_W3; \
x8 = vec_madd(cnst, x8, mzero); /* x8 = x8 * W3; */ \
\
cnst = LD_W8; \
x0 = vec_madd(cnst, x0, mzero); /* x0 *= W8; */ \
cnst = LD_W9; \
x1 = vec_madd(cnst, x1, mzero); /* x1 *= W9; */ \
cnst = LD_WA; \
x2 = vec_madd(cnst, x2, x8); /* x2 = x2 * WA + x8; */ \
cnst = LD_WB; \
x3 = vec_madd(cnst, x3, x8); /* x3 = x3 * WB + x8; */ \
\
cnst = LD_W4; \
b7 = vec_madd(cnst, x4, x0); /* b7 = x4 * W4 + x0; */ \
cnst = LD_W5; \
b5 = vec_madd(cnst, x5, x1); /* b5 = x5 * W5 + x1; */ \
cnst = LD_W6; \
b3 = vec_madd(cnst, x6, x1); /* b3 = x6 * W6 + x1; */ \
cnst = LD_W7; \
b1 = vec_madd(cnst, x7, x0); /* b1 = x7 * W7 + x0; */ \
\
b7 = vec_add(b7, x2); /* b7 = b7 + x2; */ \
b5 = vec_add(b5, x3); /* b5 = b5 + x3; */ \
b3 = vec_add(b3, x2); /* b3 = b3 + x2; */ \
b1 = vec_add(b1, x3) /* b1 = b1 + x3; */ \
/* }}} */
/*
 * FDCTCOL(b0..b7): one 8-point forward DCT pass over eight float vectors,
 * four lanes at a time. The arguments are both inputs and outputs.
 *
 * The expansion uses the caller's locals x0..x8 and cnst as scratch and reads
 * mzero and cnsts0..cnsts2 (through the LD_W* macros). It ends without a
 * trailing semicolon, so the invocation supplies it. ff_fdct_altivec invokes
 * it twice, once for each half of the transposed float data.
 */
#define FDCTCOL(b0, b1, b2, b3, b4, b5, b6, b7) /* {{{ */ \
x0 = vec_add(b0, b7); /* x0 = b0 + b7; */ \
x7 = vec_sub(b0, b7); /* x7 = b0 - b7; */ \
x1 = vec_add(b1, b6); /* x1 = b1 + b6; */ \
x6 = vec_sub(b1, b6); /* x6 = b1 - b6; */ \
x2 = vec_add(b2, b5); /* x2 = b2 + b5; */ \
x5 = vec_sub(b2, b5); /* x5 = b2 - b5; */ \
x3 = vec_add(b3, b4); /* x3 = b3 + b4; */ \
x4 = vec_sub(b3, b4); /* x4 = b3 - b4; */ \
\
b7 = vec_add(x0, x3); /* b7 = x0 + x3; */ \
b1 = vec_add(x1, x2); /* b1 = x1 + x2; */ \
b0 = vec_add(b7, b1); /* b0 = b7 + b1; */ \
b4 = vec_sub(b7, b1); /* b4 = b7 - b1; */ \
\
b2 = vec_sub(x0, x3); /* b2 = x0 - x3; */ \
b6 = vec_sub(x1, x2); /* b6 = x1 - x2; */ \
b5 = vec_add(b6, b2); /* b5 = b6 + b2; */ \
cnst = LD_W2; \
b5 = vec_madd(cnst, b5, mzero); /* b5 = b5 * W2; */ \
cnst = LD_W1; \
b2 = vec_madd(cnst, b2, b5); /* b2 = b5 + b2 * W1; */ \
cnst = LD_W0; \
b6 = vec_madd(cnst, b6, b5); /* b6 = b5 + b6 * W0; */ \
\
x0 = vec_add(x4, x7); /* x0 = x4 + x7; */ \
x1 = vec_add(x5, x6); /* x1 = x5 + x6; */ \
x2 = vec_add(x4, x6); /* x2 = x4 + x6; */ \
x3 = vec_add(x5, x7); /* x3 = x5 + x7; */ \
x8 = vec_add(x2, x3); /* x8 = x2 + x3; */ \
cnst = LD_W3; \
x8 = vec_madd(cnst, x8, mzero); /* x8 = x8 * W3; */ \
\
cnst = LD_W8; \
x0 = vec_madd(cnst, x0, mzero); /* x0 *= W8; */ \
cnst = LD_W9; \
x1 = vec_madd(cnst, x1, mzero); /* x1 *= W9; */ \
cnst = LD_WA; \
x2 = vec_madd(cnst, x2, x8); /* x2 = x2 * WA + x8; */ \
cnst = LD_WB; \
x3 = vec_madd(cnst, x3, x8); /* x3 = x3 * WB + x8; */ \
\
cnst = LD_W4; \
b7 = vec_madd(cnst, x4, x0); /* b7 = x4 * W4 + x0; */ \
cnst = LD_W5; \
b5 = vec_madd(cnst, x5, x1); /* b5 = x5 * W5 + x1; */ \
cnst = LD_W6; \
b3 = vec_madd(cnst, x6, x1); /* b3 = x6 * W6 + x1; */ \
cnst = LD_W7; \
b1 = vec_madd(cnst, x7, x0); /* b1 = x7 * W7 + x0; */ \
\
b7 = vec_add(b7, x2); /* b7 += x2; */ \
b5 = vec_add(b5, x3); /* b5 += x3; */ \
b3 = vec_add(b3, x2); /* b3 += x2; */ \
b1 = vec_add(b1, x3) /* b1 += x3; */ \
/* }}} */
/**
 * Two dimensional discrete cosine transform, AltiVec implementation.
 *
 * Outline of the steps below:
 *  1. load the 64 int16_t coefficients and transpose the 8x8 matrix;
 *  2. run the first 1-D pass, doing the initial additions/subtractions in
 *     int16 and converting to float for the multiplications;
 *  3. transpose the float data (two vectors per row, suffixes 0 and 1);
 *  4. run the second 1-D pass with FDCTCOL on each half;
 *  5. round, convert to integers, pack to int16 and store back to block.
 *
 * @param block 64 coefficients, transformed in place; accessed with
 *              vec_ld/vec_st, so presumably expected to be 16-byte aligned
 */
void ff_fdct_altivec(int16_t *block)
{
vector signed short *bp;
const vector float *cp = fdctconsts;
/* bN0 / bN1: the two float vectors holding the halves of row N; before the
 * float conversion the bN0 variables carry int16 data reinterpreted as
 * vector float. */
vector float b00, b10, b20, b30, b40, b50, b60, b70;
vector float b01, b11, b21, b31, b41, b51, b61, b71;
vector float mzero, cnst, cnsts0, cnsts1, cnsts2;
vector float x0, x1, x2, x3, x4, x5, x6, x7, x8;
/* setup constants {{{ */
/* mzero = -0.0 */
/* built without a memory load: all-ones words shifted left by themselves */
mzero = ((vector float) vec_splat_u32(-1));
mzero = ((vector float) vec_sl(vu32(mzero), vu32(mzero)));
/* the three weight vectors read by the LD_W* macros */
cnsts0 = vec_ld(0, cp);
cp++;
cnsts1 = vec_ld(0, cp);
cp++;
cnsts2 = vec_ld(0, cp);
/* }}} */
/* 8x8 matrix transpose (vector short[8]) {{{ */
#define MERGE_S16(hl, a, b) vec_merge ## hl(vs16(a), vs16(b))
bp = (vector signed short *) block;
/* load rows n and n + 4 (16 * 4 bytes apart) and start merging them */
b00 = ((vector float) vec_ld(0, bp));
b40 = ((vector float) vec_ld(16 * 4, bp));
b01 = ((vector float) MERGE_S16(h, b00, b40));
b11 = ((vector float) MERGE_S16(l, b00, b40));
bp++;
b10 = ((vector float) vec_ld(0, bp));
b50 = ((vector float) vec_ld(16 * 4, bp));
b21 = ((vector float) MERGE_S16(h, b10, b50));
b31 = ((vector float) MERGE_S16(l, b10, b50));
bp++;
b20 = ((vector float) vec_ld(0, bp));
b60 = ((vector float) vec_ld(16 * 4, bp));
b41 = ((vector float) MERGE_S16(h, b20, b60));
b51 = ((vector float) MERGE_S16(l, b20, b60));
bp++;
b30 = ((vector float) vec_ld(0, bp));
b70 = ((vector float) vec_ld(16 * 4, bp));
b61 = ((vector float) MERGE_S16(h, b30, b70));
b71 = ((vector float) MERGE_S16(l, b30, b70));
/* second merge stage */
x0 = ((vector float) MERGE_S16(h, b01, b41));
x1 = ((vector float) MERGE_S16(l, b01, b41));
x2 = ((vector float) MERGE_S16(h, b11, b51));
x3 = ((vector float) MERGE_S16(l, b11, b51));
x4 = ((vector float) MERGE_S16(h, b21, b61));
x5 = ((vector float) MERGE_S16(l, b21, b61));
x6 = ((vector float) MERGE_S16(h, b31, b71));
x7 = ((vector float) MERGE_S16(l, b31, b71));
/* third merge stage; b00..b70 now hold the transposed int16 rows */
b00 = ((vector float) MERGE_S16(h, x0, x4));
b10 = ((vector float) MERGE_S16(l, x0, x4));
b20 = ((vector float) MERGE_S16(h, x1, x5));
b30 = ((vector float) MERGE_S16(l, x1, x5));
b40 = ((vector float) MERGE_S16(h, x2, x6));
b50 = ((vector float) MERGE_S16(l, x2, x6));
b60 = ((vector float) MERGE_S16(h, x3, x7));
b70 = ((vector float) MERGE_S16(l, x3, x7));
#undef MERGE_S16
/* }}} */
/* Some of the initial calculations can be done as vector short
 * before conversion to vector float. The following code section
 * takes advantage of this. */
/* fdct rows {{{ */
x0 = ((vector float) vec_add(vs16(b00), vs16(b70)));
x7 = ((vector float) vec_sub(vs16(b00), vs16(b70)));
x1 = ((vector float) vec_add(vs16(b10), vs16(b60)));
x6 = ((vector float) vec_sub(vs16(b10), vs16(b60)));
x2 = ((vector float) vec_add(vs16(b20), vs16(b50)));
x5 = ((vector float) vec_sub(vs16(b20), vs16(b50)));
x3 = ((vector float) vec_add(vs16(b30), vs16(b40)));
x4 = ((vector float) vec_sub(vs16(b30), vs16(b40)));
b70 = ((vector float) vec_add(vs16(x0), vs16(x3)));
b10 = ((vector float) vec_add(vs16(x1), vs16(x2)));
b00 = ((vector float) vec_add(vs16(b70), vs16(b10)));
b40 = ((vector float) vec_sub(vs16(b70), vs16(b10)));
/* CTF0(n): widen the int16 vector in bn0 to two int32 vectors (low half to
 * bn1, high half to bn0) and convert both to float. */
#define CTF0(n) \
b ## n ## 1 = ((vector float) vec_unpackl(vs16(b ## n ## 0))); \
b ## n ## 0 = ((vector float) vec_unpackh(vs16(b ## n ## 0))); \
b ## n ## 1 = vec_ctf(vs32(b ## n ## 1), 0); \
b ## n ## 0 = vec_ctf(vs32(b ## n ## 0), 0)
CTF0(0);
CTF0(4);
b20 = ((vector float) vec_sub(vs16(x0), vs16(x3)));
b60 = ((vector float) vec_sub(vs16(x1), vs16(x2)));
CTF0(2);
CTF0(6);
#undef CTF0
/* even part in float, for both halves (suffix 0 and suffix 1) */
x0 = vec_add(b60, b20);
x1 = vec_add(b61, b21);
cnst = LD_W2;
x0 = vec_madd(cnst, x0, mzero);
x1 = vec_madd(cnst, x1, mzero);
cnst = LD_W1;
b20 = vec_madd(cnst, b20, x0);
b21 = vec_madd(cnst, b21, x1);
cnst = LD_W0;
b60 = vec_madd(cnst, b60, x0);
b61 = vec_madd(cnst, b61, x1);
/* CTFX(x, b): widen the int16 vector x and convert it to float, high half
 * into b0 and low half into b1. */
#define CTFX(x, b) \
b ## 0 = ((vector float) vec_unpackh(vs16(x))); \
b ## 1 = ((vector float) vec_unpackl(vs16(x))); \
b ## 0 = vec_ctf(vs32(b ## 0), 0); \
b ## 1 = vec_ctf(vs32(b ## 1), 0)
CTFX(x4, b7);
CTFX(x5, b5);
CTFX(x6, b3);
CTFX(x7, b1);
#undef CTFX
/* odd part, suffix-0 half */
x0 = vec_add(b70, b10);
x1 = vec_add(b50, b30);
x2 = vec_add(b70, b30);
x3 = vec_add(b50, b10);
x8 = vec_add(x2, x3);
cnst = LD_W3;
x8 = vec_madd(cnst, x8, mzero);
cnst = LD_W8;
x0 = vec_madd(cnst, x0, mzero);
cnst = LD_W9;
x1 = vec_madd(cnst, x1, mzero);
cnst = LD_WA;
x2 = vec_madd(cnst, x2, x8);
cnst = LD_WB;
x3 = vec_madd(cnst, x3, x8);
cnst = LD_W4;
b70 = vec_madd(cnst, b70, x0);
cnst = LD_W5;
b50 = vec_madd(cnst, b50, x1);
cnst = LD_W6;
b30 = vec_madd(cnst, b30, x1);
cnst = LD_W7;
b10 = vec_madd(cnst, b10, x0);
b70 = vec_add(b70, x2);
b50 = vec_add(b50, x3);
b30 = vec_add(b30, x2);
b10 = vec_add(b10, x3);
/* odd part, suffix-1 half (same operations as above) */
x0 = vec_add(b71, b11);
x1 = vec_add(b51, b31);
x2 = vec_add(b71, b31);
x3 = vec_add(b51, b11);
x8 = vec_add(x2, x3);
cnst = LD_W3;
x8 = vec_madd(cnst, x8, mzero);
cnst = LD_W8;
x0 = vec_madd(cnst, x0, mzero);
cnst = LD_W9;
x1 = vec_madd(cnst, x1, mzero);
cnst = LD_WA;
x2 = vec_madd(cnst, x2, x8);
cnst = LD_WB;
x3 = vec_madd(cnst, x3, x8);
cnst = LD_W4;
b71 = vec_madd(cnst, b71, x0);
cnst = LD_W5;
b51 = vec_madd(cnst, b51, x1);
cnst = LD_W6;
b31 = vec_madd(cnst, b31, x1);
cnst = LD_W7;
b11 = vec_madd(cnst, b11, x0);
b71 = vec_add(b71, x2);
b51 = vec_add(b51, x3);
b31 = vec_add(b31, x2);
b11 = vec_add(b11, x3);
/* }}} */
/* 8x8 matrix transpose (vector float[8][2]) {{{ */
x0 = vec_mergel(b00, b20);
x1 = vec_mergeh(b00, b20);
x2 = vec_mergel(b10, b30);
x3 = vec_mergeh(b10, b30);
b00 = vec_mergeh(x1, x3);
b10 = vec_mergel(x1, x3);
b20 = vec_mergeh(x0, x2);
b30 = vec_mergel(x0, x2);
x4 = vec_mergel(b41, b61);
x5 = vec_mergeh(b41, b61);
x6 = vec_mergel(b51, b71);
x7 = vec_mergeh(b51, b71);
b41 = vec_mergeh(x5, x7);
b51 = vec_mergel(x5, x7);
b61 = vec_mergeh(x4, x6);
b71 = vec_mergel(x4, x6);
/* the remaining two 4x4 groups are read into x0..x7 first because their
 * destinations (b4x..b7x suffix 0 and b0x..b3x suffix 1) are each other's
 * sources */
x0 = vec_mergel(b01, b21);
x1 = vec_mergeh(b01, b21);
x2 = vec_mergel(b11, b31);
x3 = vec_mergeh(b11, b31);
x4 = vec_mergel(b40, b60);
x5 = vec_mergeh(b40, b60);
x6 = vec_mergel(b50, b70);
x7 = vec_mergeh(b50, b70);
b40 = vec_mergeh(x1, x3);
b50 = vec_mergel(x1, x3);
b60 = vec_mergeh(x0, x2);
b70 = vec_mergel(x0, x2);
b01 = vec_mergeh(x5, x7);
b11 = vec_mergel(x5, x7);
b21 = vec_mergeh(x4, x6);
b31 = vec_mergel(x4, x6);
/* }}} */
/* second 1-D pass, once per half */
FDCTCOL(b00, b10, b20, b30, b40, b50, b60, b70);
FDCTCOL(b01, b11, b21, b31, b41, b51, b61, b71);
/* round, convert back to short {{{ */
/* CTS(n): round both halves of row n, convert to int32, pack the two
 * halves into one int16 vector and store it at bp. */
#define CTS(n) \
b ## n ## 0 = vec_round(b ## n ## 0); \
b ## n ## 1 = vec_round(b ## n ## 1); \
b ## n ## 0 = ((vector float) vec_cts(b ## n ## 0, 0)); \
b ## n ## 1 = ((vector float) vec_cts(b ## n ## 1, 0)); \
b ## n ## 0 = ((vector float) vec_pack(vs32(b ## n ## 0), \
vs32(b ## n ## 1))); \
vec_st(vs16(b ## n ## 0), 0, bp)
/* rewind to the start of block; bp advances one 16-byte row per CTS */
bp = (vector signed short *) block;
CTS(0);
bp++;
CTS(1);
bp++;
CTS(2);
bp++;
CTS(3);
bp++;
CTS(4);
bp++;
CTS(5);
bp++;
CTS(6);
bp++;
CTS(7);
#undef CTS
/* }}} */
}
#endif /* HAVE_ALTIVEC */
/**
 * Install the PowerPC forward DCT into an FDCTDSPContext.
 *
 * The AltiVec fdct is selected only when the build has AltiVec support, the
 * CPU reports AltiVec, the content is not high bit depth, and the requested
 * dct_algo is FF_DCT_AUTO or FF_DCT_ALTIVEC. Otherwise c is left untouched.
 *
 * @param c              context whose fdct pointer may be overridden
 * @param avctx          codec context supplying dct_algo
 * @param high_bit_depth non-zero disables the override
 */
av_cold void ff_fdctdsp_init_ppc(FDCTDSPContext *c, AVCodecContext *avctx,
                                 unsigned high_bit_depth)
{
#if HAVE_ALTIVEC
    if (!PPC_ALTIVEC(av_get_cpu_flags()) || high_bit_depth)
        return;

    switch (avctx->dct_algo) {
    case FF_DCT_AUTO:
    case FF_DCT_ALTIVEC:
        c->fdct = ff_fdct_altivec;
        break;
    default:
        break;
    }
#endif /* HAVE_ALTIVEC */
}

View File

@@ -0,0 +1,454 @@
/*
* FFT transform with Altivec optimizations
* Copyright (c) 2009 Loren Merritt
*
* This algorithm (though not any of the implementation details) is
* based on libdjbfft by D. J. Bernstein.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* These functions are not individually interchangeable with the C versions.
* While C takes arrays of FFTComplex, Altivec leaves intermediate results
* in blocks as convenient to the vector size.
* i.e. {4x real, 4x imaginary, 4x real, ...}
*
* I ignore standard calling convention.
* Instead, the following registers are treated as global constants:
* v14: zero
* v15..v18: cosines
* v19..v29: permutations
* r9: 16
* r12: ff_cos_tabs
* and the rest are free for local use.
*/
#include "config.h"
#if HAVE_GNU_AS && HAVE_ALTIVEC
#include "asm.S"
.text
// addi2 ra, imm: add the immediate imm to register ra in up to two steps.
// The addi for the low 16 bits (imm@l) is emitted only when those bits are
// non-zero, and the addis for the adjusted high part (imm@ha) only when
// (imm + 0x8000) >> 16 is non-zero.
.macro addi2 ra, imm // add 32-bit immediate
.if \imm & 0xffff
addi \ra, \ra, \imm@l
.endif
.if (\imm+0x8000)>>16
addis \ra, \ra, \imm@ha
.endif
.endm
// FFT4 a0, a1, a2, a3: 4-point FFT on the two input vectors a0/a1.
// a0 and a1 are overwritten as temporaries; the result is left split as
// four reals in a2 and four imaginaries in a3. Uses the permutation
// registers v20..v24.
.macro FFT4 a0, a1, a2, a3 // in:0-1 out:2-3
vperm \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2}
vperm \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3}
vaddfp \a0,\a2,\a3 // {t1,t2,t6,t5}
vsubfp \a1,\a2,\a3 // {t3,t4,t8,t7}
vmrghw \a2,\a0,\a1 // vcprm(0,s0,1,s1) // {t1,t3,t2,t4}
vperm \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8}
vaddfp \a0,\a2,\a3 // {r0,r1,i0,i1}
vsubfp \a1,\a2,\a3 // {r2,r3,i2,i3}
vperm \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3}
vperm \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3}
.endm
// FFT4x2: two independent 4-point FFTs with their instructions interleaved.
// The a-set (a0,a1 -> a2,a3) and the b-set (b0,b1 -> b2,b3) each go through
// the same instruction sequence as FFT4; inputs are overwritten as
// temporaries. Uses the permutation registers v20..v24.
.macro FFT4x2 a0, a1, b0, b1, a2, a3, b2, b3
vperm \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2}
vperm \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3}
vperm \b2,\b0,\b1,v20
vperm \b3,\b0,\b1,v21
vaddfp \a0,\a2,\a3 // {t1,t2,t6,t5}
vsubfp \a1,\a2,\a3 // {t3,t4,t8,t7}
vaddfp \b0,\b2,\b3
vsubfp \b1,\b2,\b3
vmrghw \a2,\a0,\a1 // vcprm(0,s0,1,s1) // {t1,t3,t2,t4}
vperm \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8}
vmrghw \b2,\b0,\b1
vperm \b3,\b0,\b1,v22
vaddfp \a0,\a2,\a3 // {r0,r1,i0,i1}
vsubfp \a1,\a2,\a3 // {r2,r3,i2,i3}
vaddfp \b0,\b2,\b3
vsubfp \b1,\b2,\b3
vperm \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3}
vperm \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3}
vperm \b2,\b0,\b1,v23
vperm \b3,\b0,\b1,v24
.endm
// FFT8: 8-point FFT. a0,a1,b0,b1 are both inputs and outputs; a2,a3,b2,b3,b4
// are scratch registers. The a-half goes through the FFT4 sequence while the
// b-half is processed alongside it. On exit a0/a1 hold reals/imaginaries
// 0..3 and b0/b1 hold reals/imaginaries 4..7. Uses v14 (zero), v17/v18
// (cosine constants) and the permutation registers v20..v29.
.macro FFT8 a0, a1, b0, b1, a2, a3, b2, b3, b4 // in,out:a0-b1
vmrghw \b2,\b0,\b1 // vcprm(0,s0,1,s1) // {r4,r6,i4,i6}
vmrglw \b3,\b0,\b1 // vcprm(2,s2,3,s3) // {r5,r7,i5,i7}
vperm \a2,\a0,\a1,v20 // FFT4 ...
vperm \a3,\a0,\a1,v21
vaddfp \b0,\b2,\b3 // {t1,t3,t2,t4}
vsubfp \b1,\b2,\b3 // {r5,r7,i5,i7}
vperm \b4,\b1,\b1,v25 // vcprm(2,3,0,1) // {i5,i7,r5,r7}
vaddfp \a0,\a2,\a3
vsubfp \a1,\a2,\a3
vmaddfp \b1,\b1,v17,v14 // * {-1,1,1,-1}/sqrt(2)
vmaddfp \b1,\b4,v18,\b1 // * { 1,1,1,1 }/sqrt(2) // {t8,ta,t7,t9}
vmrghw \a2,\a0,\a1
vperm \a3,\a0,\a1,v22
vperm \b2,\b0,\b1,v26 // vcprm(1,2,s3,s0) // {t3,t2,t9,t8}
vperm \b3,\b0,\b1,v27 // vcprm(0,3,s2,s1) // {t1,t4,t7,ta}
vaddfp \a0,\a2,\a3
vsubfp \a1,\a2,\a3
vaddfp \b0,\b2,\b3 // {t1,t2,t9,ta}
vsubfp \b1,\b2,\b3 // {t6,t5,tc,tb}
vperm \a2,\a0,\a1,v23
vperm \a3,\a0,\a1,v24
vperm \b2,\b0,\b1,v28 // vcprm(0,2,s1,s3) // {t1,t9,t5,tb}
vperm \b3,\b0,\b1,v29 // vcprm(1,3,s0,s2) // {t2,ta,t6,tc}
vsubfp \b0,\a2,\b2 // {r4,r5,r6,r7}
vsubfp \b1,\a3,\b3 // {i4,i5,i6,i7}
vaddfp \a0,\a2,\b2 // {r0,r1,r2,r3}
vaddfp \a1,\a3,\b3 // {i0,i1,i2,i3}
.endm
// BF d0, d1, s0, s1: butterfly. d1 = s0 - s1 is computed first, then
// d0 = s0 + s1, so d0 may be the same register as s0 or s1 but d1 must not.
.macro BF d0,d1,s0,s1
vsubfp \d1,\s0,\s1
vaddfp \d0,\s0,\s1
.endm
// zip d0, d1, s0, s1: interleave the words of s0 and s1; d0 receives the
// merged high words and d1 the merged low words. d0 is written before d1 is
// computed, so d0 must differ from s0 and s1.
.macro zip d0,d1,s0,s1
vmrghw \d0,\s0,\s1
vmrglw \d1,\s0,\s1
.endm
// def_fft4 interleave: emit the routine fft4<interleave>_altivec.
// It loads 32 bytes from r3 (r9 is the constant 16), runs FFT4 and stores
// the result back to the same location. With a non-blank argument the
// real/imaginary result vectors are zipped before storing; otherwise they
// are stored as a block of reals followed by a block of imaginaries.
.macro def_fft4 interleave
fft4\interleave\()_altivec:
lvx v0, 0,r3
lvx v1,r9,r3
FFT4 v0,v1,v2,v3
.ifnb \interleave
zip v0,v1,v2,v3
stvx v0, 0,r3
stvx v1,r9,r3
.else
stvx v2, 0,r3
stvx v3,r9,r3
.endif
blr
.endm
// def_fft8 interleave: emit the routine fft8<interleave>_altivec.
// It loads 64 bytes from r3 (r4 = r3 + 32, r9 is the constant 16), runs FFT8
// and stores the result in place. With a non-blank argument the
// real/imaginary vectors are zipped before storing. Overwrites r4 and v0..v8.
.macro def_fft8 interleave
fft8\interleave\()_altivec:
addi r4,r3,32
lvx v0, 0,r3
lvx v1,r9,r3
lvx v2, 0,r4
lvx v3,r9,r4
FFT8 v0,v1,v2,v3,v4,v5,v6,v7,v8
.ifnb \interleave
zip v4,v5,v0,v1
zip v6,v7,v2,v3
stvx v4, 0,r3
stvx v5,r9,r3
stvx v6, 0,r4
stvx v7,r9,r4
.else
stvx v0, 0,r3
stvx v1,r9,r3
stvx v2, 0,r4
stvx v3,r9,r4
.endif
blr
.endm
// def_fft16 interleave: emit the routine fft16<interleave>_altivec.
// The upper 64 bytes (r5 = r3 + 64, r6 = r3 + 96) go through two 4-point
// FFTs, the lower 64 bytes (r3, r4 = r3 + 32) through an 8-point FFT; the
// halves are then combined using the twiddle factors in v15 (wre) and
// v16 (wim) and stored in place. Overwrites r4..r6 and v0..v13.
//
// NOTE(review): the interleave branch also writes v14 and v15, which the
// file header lists as global constants (zero / cosines). Presumably that
// is acceptable because interleaving only happens as the last step -- confirm.
.macro def_fft16 interleave
fft16\interleave\()_altivec:
addi r5,r3,64
addi r6,r3,96
addi r4,r3,32
lvx v0, 0,r5
lvx v1,r9,r5
lvx v2, 0,r6
lvx v3,r9,r6
FFT4x2 v0,v1,v2,v3,v4,v5,v6,v7
lvx v0, 0,r3
lvx v1,r9,r3
lvx v2, 0,r4
lvx v3,r9,r4
FFT8 v0,v1,v2,v3,v8,v9,v10,v11,v12
vmaddfp v8,v4,v15,v14 // r2*wre
vmaddfp v9,v5,v15,v14 // i2*wre
vmaddfp v10,v6,v15,v14 // r3*wre
vmaddfp v11,v7,v15,v14 // i3*wre
vmaddfp v8,v5,v16,v8 // i2*wim
vnmsubfp v9,v4,v16,v9 // r2*wim
vnmsubfp v10,v7,v16,v10 // i3*wim
vmaddfp v11,v6,v16,v11 // r3*wim
BF v10,v12,v10,v8
BF v11,v13,v9,v11
BF v0,v4,v0,v10
BF v3,v7,v3,v12
BF v1,v5,v1,v11
BF v2,v6,v2,v13
.ifnb \interleave
zip v8, v9,v0,v1
zip v10,v11,v2,v3
zip v12,v13,v4,v5
zip v14,v15,v6,v7
stvx v8, 0,r3
stvx v9,r9,r3
stvx v10, 0,r4
stvx v11,r9,r4
stvx v12, 0,r5
stvx v13,r9,r5
stvx v14, 0,r6
stvx v15,r9,r6
.else
stvx v0, 0,r3
stvx v4, 0,r5
stvx v3,r9,r4
stvx v7,r9,r6
stvx v1,r9,r3
stvx v5,r9,r5
stvx v2, 0,r4
stvx v6, 0,r6
.endif
blr
.endm
// void pass(float *z, float *wre, int n)
//
// PASS interleave, suffix: emits fft_pass<suffix>_altivec, the combining pass
// that merges the sub-transforms produced by a DECL_FFT function.
//
// Arguments:  r3 = z, r4 = wre, r5 = n (loop iteration count).
// Expected on entry: r9 = 16, v14 = zero vector, v19 = vcprm(s0,3,2,1).
// Derived byte offsets: o1 = 32*n, o2 = 64*n, o3 = o1+o2; wim = wre + 16*n.
// Clobbers r0, r4..r8, r10, r11, ctr and v0..v13.
//
// Each iteration handles one vector pair at z, z+o1, z+o2 and z+o3, reading
// wre forwards and wim backwards, then advances z by 32 bytes.
// When the interleave argument is 0 the results are stored as computed;
// otherwise each pair of result vectors is merged (vmrghw/vmrglw) first.
// On exit r3 is restored to its entry value (the loop advanced it by
// 32*n = o1 bytes, which the final sub undoes).
.macro PASS interleave, suffix
fft_pass\suffix\()_altivec:
mtctr r5
slwi r0,r5,4
slwi r7,r5,6 // o2
slwi r5,r5,5 // o1
add r10,r5,r7 // o3
add r0,r4,r0 // wim
addi r6,r5,16 // o1+16
addi r8,r7,16 // o2+16
addi r11,r10,16 // o3+16
1:
lvx v8, 0,r4 // wre
lvx v10, 0,r0 // wim
sub r0,r0,r9
lvx v9, 0,r0
vperm v9,v9,v10,v19 // vcprm(s0,3,2,1) => wim[0 .. -3]
lvx v4,r3,r7 // r2 = z[o2]
lvx v5,r3,r8 // i2 = z[o2+16]
lvx v6,r3,r10 // r3 = z[o3]
lvx v7,r3,r11 // i3 = z[o3+16]
vmaddfp v10,v4,v8,v14 // r2*wre
vmaddfp v11,v5,v8,v14 // i2*wre
vmaddfp v12,v6,v8,v14 // r3*wre
vmaddfp v13,v7,v8,v14 // i3*wre
lvx v0, 0,r3 // r0 = z[0]
lvx v3,r3,r6 // i1 = z[o1+16]
vmaddfp v10,v5,v9,v10 // i2*wim
vnmsubfp v11,v4,v9,v11 // r2*wim
vnmsubfp v12,v7,v9,v12 // i3*wim
vmaddfp v13,v6,v9,v13 // r3*wim
lvx v1,r3,r9 // i0 = z[16]
lvx v2,r3,r5 // r1 = z[o1]
BF v12,v8,v12,v10
BF v13,v9,v11,v13
BF v0,v4,v0,v12
BF v3,v7,v3,v8
.if !\interleave
stvx v0, 0,r3
stvx v4,r3,r7
stvx v3,r3,r6
stvx v7,r3,r11
.endif
BF v1,v5,v1,v13
BF v2,v6,v2,v9
.if !\interleave
stvx v1,r3,r9
stvx v2,r3,r5
stvx v5,r3,r8
stvx v6,r3,r10
.else
// interleaved output: merge each pair of result vectors word by word
vmrghw v8,v0,v1
vmrglw v9,v0,v1
stvx v8, 0,r3
stvx v9,r3,r9
vmrghw v8,v2,v3
vmrglw v9,v2,v3
stvx v8,r3,r5
stvx v9,r3,r6
vmrghw v8,v4,v5
vmrglw v9,v4,v5
stvx v8,r3,r7
stvx v9,r3,r8
vmrghw v8,v6,v7
vmrglw v9,v6,v7
stvx v8,r3,r10
stvx v9,r3,r11
.endif
addi r3,r3,32
addi r4,r4,16
bdnz 1b
sub r3,r3,r5 // undo the 32*n bytes added by the loop
blr
.endm
#define M_SQRT1_2 0.70710678118654752440 /* 1/sqrt(2) */
/* Byte indices of each 32-bit word for a vperm control vector:
 * WORD_0..WORD_3 pick words 0..3 of the first source operand,
 * WORD_s0..WORD_s3 pick words 0..3 of the second source operand. */
#define WORD_0 0x00,0x01,0x02,0x03
#define WORD_1 0x04,0x05,0x06,0x07
#define WORD_2 0x08,0x09,0x0a,0x0b
#define WORD_3 0x0c,0x0d,0x0e,0x0f
#define WORD_s0 0x10,0x11,0x12,0x13
#define WORD_s1 0x14,0x15,0x16,0x17
#define WORD_s2 0x18,0x19,0x1a,0x1b
#define WORD_s3 0x1c,0x1d,0x1e,0x1f
/* Emits one 16-byte permute control vector selecting words a,b,c,d. */
#define vcprm(a, b, c, d) .byte WORD_##a, WORD_##b, WORD_##c, WORD_##d
// Constant table loaded by fft_calc into v14..v29, one 16-byte row per
// register and in this order; the order of the rows must therefore match the
// register list of the two lvm invocations in fft_calc.
.rodata
.align 4
fft_data:
.float 0, 0, 0, 0 // v14: zero addend
.float 1, 0.92387953, M_SQRT1_2, 0.38268343 // v15: wre for the 16-point transform
.float 0, 0.38268343, M_SQRT1_2, 0.92387953 // v16: wim for the 16-point transform
.float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2,-M_SQRT1_2 // v17
.float M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2 // v18
vcprm(s0,3,2,1) // v19
vcprm(0,1,s2,s1) // v20
vcprm(2,3,s0,s3) // v21
vcprm(2,s3,3,s2) // v22
vcprm(0,1,s0,s1) // v23
vcprm(2,3,s2,s3) // v24
vcprm(2,3,0,1) // v25
vcprm(1,2,s3,s0) // v26
vcprm(0,3,s2,s1) // v27
vcprm(0,2,s1,s3) // v28
vcprm(1,3,s0,s2) // v29
// lvm b, r [, more regs]: load consecutive 16-byte vectors starting at the
// address in b into the listed vector registers, in order. Expands
// recursively, one register per level; b is advanced by 16 per register and
// is left pointing just past the last vector loaded.
.macro lvm b, r, regs:vararg
lvx \r, 0, \b
addi \b, \b, 16
.ifnb \regs
lvm \b, \regs
.endif
.endm
// stvm b, r [, more regs]: store the listed vector registers, in order, to
// consecutive 16-byte slots starting at the address in b. Expands
// recursively, one register per level; b is advanced by 16 per register and
// is left pointing just past the last vector stored.
.macro stvm b, r, regs:vararg
stvx \r, 0, \b
addi \b, \b, 16
.ifnb \regs
stvm \b, \regs
.endif
.endm
// fft_calc: emits the exported entry point ff_fft_calc<interleave>_altivec.
// Arguments: r3 = context pointer, r4 = data pointer (z).
//
// Sequence:
//   1. save LR, open a stack frame of 160+16*PS bytes, save v20..v29 and
//      VRSAVE in it;
//   2. load the 16 rows of fft_data into v14..v29 and set r9 = 16,
//      r12 = address of ff_cos_tabs (both are relied on by the generated
//      fftN/fft_pass code);
//   3. read the 32-bit word at offset 0 of the context (presumably nbits --
//      confirm against the FFTContext layout), subtract 2, scale to a
//      pointer-sized index and call the matching dispatch table entry with
//      r3 = z;
//   4. restore v20..v29, VRSAVE, the stack pointer and LR.
//
// stp/stpu/lp/lpx/get_got/movrel/extfunc/PS/X are helpers defined outside
// this chunk; they are assumed to be the pointer-size-aware wrappers their
// names suggest.
.macro fft_calc interleave
extfunc ff_fft_calc\interleave\()_altivec
mflr r0
stp r0, 2*PS(r1)
stpu r1, -(160+16*PS)(r1)
get_got r11
addi r6, r1, 16*PS
stvm r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
mfvrsave r0
stw r0, 15*PS(r1)
li r6, 0xfffffffc
mtvrsave r6
movrel r6, fft_data, r11
lvm r6, v14, v15, v16, v17, v18, v19, v20, v21
lvm r6, v22, v23, v24, v25, v26, v27, v28, v29
li r9, 16
movrel r12, X(ff_cos_tabs), r11
movrel r6, fft_dispatch_tab\interleave\()_altivec, r11
lwz r3, 0(r3)
subi r3, r3, 2
slwi r3, r3, 2+ARCH_PPC64
lpx r3, r3, r6
mtctr r3
mr r3, r4
bctrl
addi r6, r1, 16*PS
lvm r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
lwz r6, 15*PS(r1)
mtvrsave r6
lp r1, 0(r1)
lp r0, 2*PS(r1)
mtlr r0
blr
.endm
// DECL_FFT suffix, bits, n, n2, n4: emits fft<n><suffix>_altivec, an n-point
// transform built from one n2-point and two n4-point sub-transforms followed
// by a combining pass.
//
//   - The sub-transforms called are always the plain (suffix-less) ones; only
//     the final tail call to fft_pass<suffix>_altivec honours the suffix.
//   - r3 (z) is advanced by n*4 bytes, then n*2 bytes, then moved back by
//     n*6 bytes, so it is unchanged when fft_pass is entered.
//   - No stack frame is opened here: LR is parked at PS*(bits-3)(r1), a slot
//     whose index depends on bits, so nested levels do not overwrite each
//     other. This relies on the frame opened by fft_calc.
//   - r4 is loaded from entry bits of the table addressed by r12
//     (ff_cos_tabs, set up by fft_calc) and r5 = n/16 is the pass count.
// addi2 is defined outside this chunk; presumably it adds an immediate that
// may not fit a single addi.
.macro DECL_FFT suffix, bits, n, n2, n4
fft\n\suffix\()_altivec:
mflr r0
stp r0,PS*(\bits-3)(r1)
bl fft\n2\()_altivec
addi2 r3,\n*4
bl fft\n4\()_altivec
addi2 r3,\n*2
bl fft\n4\()_altivec
addi2 r3,\n*-6
lp r0,PS*(\bits-3)(r1)
lp r4,\bits*PS(r12)
mtlr r0
li r5,\n/16
b fft_pass\suffix\()_altivec
.endm
// DECL_FFTS interleave, suffix: instantiates one complete family of
// transforms -- the 4/8/16-point leaves, the combining pass, the recursive
// 32..65536-point functions, the exported ff_fft_calc<suffix>_altivec entry
// point -- and the dispatch table that entry point indexes.
//
// interleave is the numeric flag tested by PASS; suffix is appended to every
// generated symbol and, when non-blank, also selects the interleaving branch
// of the def_fftN leaves.
// Table entry i is the 2^(i+2)-point transform, matching the "minus 2" index
// computed in fft_calc.
.macro DECL_FFTS interleave, suffix
.text
def_fft4 \suffix
def_fft8 \suffix
def_fft16 \suffix
PASS \interleave, \suffix
DECL_FFT \suffix, 5, 32, 16, 8
DECL_FFT \suffix, 6, 64, 32, 16
DECL_FFT \suffix, 7, 128, 64, 32
DECL_FFT \suffix, 8, 256, 128, 64
DECL_FFT \suffix, 9, 512, 256, 128
DECL_FFT \suffix,10, 1024, 512, 256
DECL_FFT \suffix,11, 2048, 1024, 512
DECL_FFT \suffix,12, 4096, 2048, 1024
DECL_FFT \suffix,13, 8192, 4096, 2048
DECL_FFT \suffix,14,16384, 8192, 4096
DECL_FFT \suffix,15,32768,16384, 8192
DECL_FFT \suffix,16,65536,32768,16384
fft_calc \suffix
.rodata
.align 3
fft_dispatch_tab\suffix\()_altivec:
PTR fft4\suffix\()_altivec
PTR fft8\suffix\()_altivec
PTR fft16\suffix\()_altivec
PTR fft32\suffix\()_altivec
PTR fft64\suffix\()_altivec
PTR fft128\suffix\()_altivec
PTR fft256\suffix\()_altivec
PTR fft512\suffix\()_altivec
PTR fft1024\suffix\()_altivec
PTR fft2048\suffix\()_altivec
PTR fft4096\suffix\()_altivec
PTR fft8192\suffix\()_altivec
PTR fft16384\suffix\()_altivec
PTR fft32768\suffix\()_altivec
PTR fft65536\suffix\()_altivec
.endm
// Plain family: ff_fft_calc_altivec and the suffix-less fftN_altivec
// functions that every DECL_FFT recursion calls.
DECL_FFTS 0
// Interleaving family: ff_fft_calc_interleave_altivec.
DECL_FFTS 1, _interleave
#endif /* HAVE_GNU_AS && HAVE_ALTIVEC */

View File

@@ -0,0 +1,168 @@
/*
* FFT/IFFT transforms
* AltiVec-enabled
* Copyright (c) 2009 Loren Merritt
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/fft.h"
/**
* Do a complex FFT with the parameters defined in ff_fft_init().
* The input data must be permuted beforehand using the s->revtab table.
* No 1.0 / sqrt(n) normalization is done.
* AltiVec-enabled:
* This code assumes that the 'z' pointer is 16 bytes-aligned.
* It also assumes all FFTComplex are 8 bytes-aligned pairs of floats.
*/
#if HAVE_VSX
#include "fft_vsx.h"
#else
void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z);
void ff_fft_calc_interleave_altivec(FFTContext *s, FFTComplex *z);
#endif
#if HAVE_GNU_AS && HAVE_ALTIVEC
/**
 * AltiVec half-length inverse MDCT.
 *
 * With n = 1 << s->mdct_bits, the function works in three stages:
 *   1. pre-rotation: reads the input through pin (vectors centred on
 *      input + n/4), multiplies by the tcos/tsin tables and scatters the
 *      complex results into output at the positions given by s->revtab;
 *   2. an in-place FFT over output (VSX or AltiVec assembly version);
 *   3. post-rotation and reordering, done in place through pout (vectors
 *      centred on output + n/4), working from both ends towards the middle.
 *
 * All table, input and output pointers are accessed as vec_f, so they are
 * assumed to be 16-byte aligned -- TODO confirm against the allocation sites.
 * The init code only installs this function when s->mdct_bits >= 5, which
 * keeps n32 >= 1 so that both do/while loops run at least once.
 */
static void imdct_half_altivec(FFTContext *s, FFTSample *output, const FFTSample *input)
{
int j, k;
int n = 1 << s->mdct_bits;
int n4 = n >> 2;
int n8 = n >> 3;
int n32 = n >> 5;
/* revtabj walks forwards from the start of revtab, revtabk backwards from
 * revtab + n4; both move 4 entries per iteration. */
const uint16_t *revtabj = s->revtab;
const uint16_t *revtabk = s->revtab+n4;
/* The table and data pointers are centred so they can be indexed with both
 * positive (k) and negative (-k-1) vector indices. */
const vec_f *tcos = (const vec_f*)(s->tcos+n8);
const vec_f *tsin = (const vec_f*)(s->tsin+n8);
const vec_f *pin = (const vec_f*)(input+n4);
vec_f *pout = (vec_f*)(output+n4);
/* pre rotation */
k = n32-1;
do {
vec_f cos,sin,cos0,sin0,cos1,sin1,re,im,r0,i0,r1,i1,a,b,c,d;
/* CMULA(p, o0..o3): gather four (re, im) pairs from the two mirrored input
 * vectors, select the matching cos/sin words and leave the rotated results
 * in r<p> / i<p>. */
#define CMULA(p,o0,o1,o2,o3)\
a = pin[ k*2+p]; /* { z[k].re, z[k].im, z[k+1].re, z[k+1].im } */\
b = pin[-k*2-p-1]; /* { z[-k-2].re, z[-k-2].im, z[-k-1].re, z[-k-1].im } */\
re = vec_perm(a, b, vcprm(0,2,s0,s2)); /* { z[k].re, z[k+1].re, z[-k-2].re, z[-k-1].re } */\
im = vec_perm(a, b, vcprm(s3,s1,3,1)); /* { z[-k-1].im, z[-k-2].im, z[k+1].im, z[k].im } */\
cos = vec_perm(cos0, cos1, vcprm(o0,o1,s##o2,s##o3)); /* { cos[k], cos[k+1], cos[-k-2], cos[-k-1] } */\
sin = vec_perm(sin0, sin1, vcprm(o0,o1,s##o2,s##o3));\
r##p = im*cos - re*sin;\
i##p = re*cos + im*sin;
/* STORE2(v, dst): write the two floats at byte offsets 0 and 4 of
 * output + 2*dst using element stores; v is pre-permuted by STORE8 so the
 * wanted (re, im) pair is present in the lanes vec_ste picks. */
#define STORE2(v,dst)\
j = dst;\
vec_ste(v, 0, output+j*2);\
vec_ste(v, 4, output+j*2);
/* STORE8(p): scatter the four complex results of CMULA(p) to the
 * bit-reversed positions read from revtabk / revtabj. */
#define STORE8(p)\
a = vec_perm(r##p, i##p, vcprm(0,s0,0,s0));\
b = vec_perm(r##p, i##p, vcprm(1,s1,1,s1));\
c = vec_perm(r##p, i##p, vcprm(2,s2,2,s2));\
d = vec_perm(r##p, i##p, vcprm(3,s3,3,s3));\
STORE2(a, revtabk[ p*2-4]);\
STORE2(b, revtabk[ p*2-3]);\
STORE2(c, revtabj[-p*2+2]);\
STORE2(d, revtabj[-p*2+3]);
cos0 = tcos[k];
sin0 = tsin[k];
cos1 = tcos[-k-1];
sin1 = tsin[-k-1];
CMULA(0, 0,1,2,3);
CMULA(1, 2,3,0,1);
STORE8(0);
STORE8(1);
revtabj += 4;
revtabk -= 4;
k--;
} while(k >= 0);
/* in-place FFT over the scattered data */
#if HAVE_VSX
ff_fft_calc_vsx(s, (FFTComplex*)output);
#else
ff_fft_calc_altivec(s, (FFTComplex*)output);
#endif
/* post rotation + reordering */
j = -n32;
k = n32-1;
do {
vec_f cos,sin,re,im,a,b,c,d;
/* CMULB(d0, d1, o): rotate the vector pair at index o by tcos[o]/tsin[o]. */
#define CMULB(d0,d1,o)\
re = pout[o*2];\
im = pout[o*2+1];\
cos = tcos[o];\
sin = tsin[o];\
d0 = im*sin - re*cos;\
d1 = re*sin + im*cos;
CMULB(a,b,j);
CMULB(c,d,k);
/* Both ends are computed before either is stored, because each output pair
 * mixes values from index j with values from index k. */
pout[2*j] = vec_perm(a, d, vcprm(0,s3,1,s2));
pout[2*j+1] = vec_perm(a, d, vcprm(2,s1,3,s0));
pout[2*k] = vec_perm(c, b, vcprm(0,s3,1,s2));
pout[2*k+1] = vec_perm(c, b, vcprm(2,s1,3,s0));
j++;
k--;
} while(k >= 0);
}
/**
 * Full-length inverse MDCT built on imdct_half_altivec().
 *
 * The half transform is written to the middle of the buffer
 * (output[n/4 .. 3n/4)); the two outer quarters are then filled in from it:
 * the first quarter is the word-reversed, sign-flipped copy of the second,
 * the last quarter the word-reversed copy of the third.
 */
static void imdct_calc_altivec(FFTContext *s, FFTSample *output, const FFTSample *input)
{
    int len     = 1 << s->mdct_bits;
    int quarter = len >> 2;
    int nvec    = len >> 4;   /* 4-float vectors per quarter */
    vec_u32 sign_mask = {1U<<31,1U<<31,1U<<31,1U<<31};
    vec_u32 *mid  = (vec_u32*)(output + quarter);
    vec_u32 *tail = (vec_u32*)(output + quarter*3);
    int i = 0;

    imdct_half_altivec(s, output + quarter, input);

    while (i < nvec) {
        /* read both sources before writing either mirrored destination */
        vec_u32 flipped = mid[i] ^ sign_mask;
        vec_u32 plain   = tail[-i-1];

        mid[-i-1] = vec_perm(flipped, flipped, vcprm(3,2,1,0));
        tail[i]   = vec_perm(plain, plain, vcprm(3,2,1,0));
        i++;
    }
}
#endif /* HAVE_GNU_AS && HAVE_ALTIVEC */
/**
 * Install the PowerPC-optimised FFT/IMDCT callbacks into @p s.
 *
 * Does nothing unless the build has GNU as and AltiVec support and the
 * running CPU reports AltiVec. The IMDCT routines are only installed for
 * mdct_bits >= 5.
 */
av_cold void ff_fft_init_ppc(FFTContext *s)
{
#if HAVE_GNU_AS && HAVE_ALTIVEC
    if (PPC_ALTIVEC(av_get_cpu_flags())) {
#if HAVE_VSX
        s->fft_calc = ff_fft_calc_interleave_vsx;
#else
        s->fft_calc = ff_fft_calc_interleave_altivec;
#endif
        if (s->mdct_bits >= 5) {
            s->imdct_calc = imdct_calc_altivec;
            s->imdct_half = imdct_half_altivec;
        }
    }
#endif /* HAVE_GNU_AS && HAVE_ALTIVEC */
}

View File

@@ -0,0 +1,227 @@
/*
* FFT transform, optimized with VSX built-in functions
* Copyright (c) 2014 Rong Yan
*
* This algorithm (though not any of the implementation details) is
* based on libdjbfft by D. J. Bernstein.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/fft.h"
#include "libavcodec/fft-internal.h"
#include "fft_vsx.h"
#if HAVE_VSX
/* 32-point interleaved FFT: 16-point on the first half, 8-point on each of
 * the last two quarters, then one combining pass. */
static void fft32_vsx_interleave(FFTComplex *z)
{
    FFTComplex *third  = z + 16;
    FFTComplex *fourth = z + 24;

    fft16_vsx_interleave(z);
    fft8_vsx_interleave(third);
    fft8_vsx_interleave(fourth);
    pass_vsx_interleave(z, ff_cos_32, 4);
}
/* 64-point interleaved FFT: 32-point on the first half, 16-point on each of
 * the last two quarters, then one combining pass. */
static void fft64_vsx_interleave(FFTComplex *z)
{
    FFTComplex *third  = z + 32;
    FFTComplex *fourth = z + 48;

    fft32_vsx_interleave(z);
    fft16_vsx_interleave(third);
    fft16_vsx_interleave(fourth);
    pass_vsx_interleave(z, ff_cos_64, 8);
}
/* 128-point interleaved FFT: 64-point on the first half, 32-point on each of
 * the last two quarters, then one combining pass. */
static void fft128_vsx_interleave(FFTComplex *z)
{
    FFTComplex *third  = z + 64;
    FFTComplex *fourth = z + 96;

    fft64_vsx_interleave(z);
    fft32_vsx_interleave(third);
    fft32_vsx_interleave(fourth);
    pass_vsx_interleave(z, ff_cos_128, 16);
}
/* 256-point interleaved FFT: 128-point on the first half, 64-point on each of
 * the last two quarters, then one combining pass. */
static void fft256_vsx_interleave(FFTComplex *z)
{
    FFTComplex *third  = z + 128;
    FFTComplex *fourth = z + 192;

    fft128_vsx_interleave(z);
    fft64_vsx_interleave(third);
    fft64_vsx_interleave(fourth);
    pass_vsx_interleave(z, ff_cos_256, 32);
}
/* 512-point interleaved FFT: 256-point on the first half, 128-point on each
 * of the last two quarters, then one combining pass. */
static void fft512_vsx_interleave(FFTComplex *z)
{
    FFTComplex *third  = z + 256;
    FFTComplex *fourth = z + 384;

    fft256_vsx_interleave(z);
    fft128_vsx_interleave(third);
    fft128_vsx_interleave(fourth);
    pass_vsx_interleave(z, ff_cos_512, 64);
}
/* 1024-point interleaved FFT: 512-point on the first half, 256-point on each
 * of the last two quarters, then one combining pass. */
static void fft1024_vsx_interleave(FFTComplex *z)
{
    FFTComplex *third  = z + 512;
    FFTComplex *fourth = z + 768;

    fft512_vsx_interleave(z);
    fft256_vsx_interleave(third);
    fft256_vsx_interleave(fourth);
    pass_vsx_interleave(z, ff_cos_1024, 128);
}
/* 2048-point interleaved FFT: 1024-point on the first half, 512-point on each
 * of the last two quarters, then one combining pass. */
static void fft2048_vsx_interleave(FFTComplex *z)
{
    FFTComplex *third  = z + 1024;
    FFTComplex *fourth = z + 1536;

    fft1024_vsx_interleave(z);
    fft512_vsx_interleave(third);
    fft512_vsx_interleave(fourth);
    pass_vsx_interleave(z, ff_cos_2048, 256);
}
/* 4096-point interleaved FFT: 2048-point on the first half, 1024-point on
 * each of the last two quarters, then one combining pass. */
static void fft4096_vsx_interleave(FFTComplex *z)
{
    FFTComplex *third  = z + 2048;
    FFTComplex *fourth = z + 3072;

    fft2048_vsx_interleave(z);
    fft1024_vsx_interleave(third);
    fft1024_vsx_interleave(fourth);
    pass_vsx_interleave(z, ff_cos_4096, 512);
}
/* 8192-point interleaved FFT: 4096-point on the first half, 2048-point on
 * each of the last two quarters, then one combining pass. */
static void fft8192_vsx_interleave(FFTComplex *z)
{
    FFTComplex *third  = z + 4096;
    FFTComplex *fourth = z + 6144;

    fft4096_vsx_interleave(z);
    fft2048_vsx_interleave(third);
    fft2048_vsx_interleave(fourth);
    pass_vsx_interleave(z, ff_cos_8192, 1024);
}
/* 16384-point interleaved FFT: 8192-point on the first half, 4096-point on
 * each of the last two quarters, then one combining pass. */
static void fft16384_vsx_interleave(FFTComplex *z)
{
    FFTComplex *third  = z + 8192;
    FFTComplex *fourth = z + 12288;

    fft8192_vsx_interleave(z);
    fft4096_vsx_interleave(third);
    fft4096_vsx_interleave(fourth);
    pass_vsx_interleave(z, ff_cos_16384, 2048);
}
/* 32768-point interleaved FFT: 16384-point on the first half, 8192-point on
 * each of the last two quarters, then one combining pass. */
static void fft32768_vsx_interleave(FFTComplex *z)
{
    FFTComplex *third  = z + 16384;
    FFTComplex *fourth = z + 24576;

    fft16384_vsx_interleave(z);
    fft8192_vsx_interleave(third);
    fft8192_vsx_interleave(fourth);
    pass_vsx_interleave(z, ff_cos_32768, 4096);
}
/* 65536-point interleaved FFT: 32768-point on the first half, 16384-point on
 * each of the last two quarters, then one combining pass. */
static void fft65536_vsx_interleave(FFTComplex *z)
{
    FFTComplex *third  = z + 32768;
    FFTComplex *fourth = z + 49152;

    fft32768_vsx_interleave(z);
    fft16384_vsx_interleave(third);
    fft16384_vsx_interleave(fourth);
    pass_vsx_interleave(z, ff_cos_65536, 8192);
}
/* 32-point FFT: 16-point on the first half, 8-point on each of the last two
 * quarters, then one combining pass. */
static void fft32_vsx(FFTComplex *z)
{
    FFTComplex *third  = z + 16;
    FFTComplex *fourth = z + 24;

    fft16_vsx(z);
    fft8_vsx(third);
    fft8_vsx(fourth);
    pass_vsx(z, ff_cos_32, 4);
}
/* 64-point FFT: 32-point on the first half, 16-point on each of the last two
 * quarters, then one combining pass. */
static void fft64_vsx(FFTComplex *z)
{
    FFTComplex *third  = z + 32;
    FFTComplex *fourth = z + 48;

    fft32_vsx(z);
    fft16_vsx(third);
    fft16_vsx(fourth);
    pass_vsx(z, ff_cos_64, 8);
}
/* 128-point FFT: 64-point on the first half, 32-point on each of the last two
 * quarters, then one combining pass. */
static void fft128_vsx(FFTComplex *z)
{
    FFTComplex *third  = z + 64;
    FFTComplex *fourth = z + 96;

    fft64_vsx(z);
    fft32_vsx(third);
    fft32_vsx(fourth);
    pass_vsx(z, ff_cos_128, 16);
}
/* 256-point FFT: 128-point on the first half, 64-point on each of the last
 * two quarters, then one combining pass. */
static void fft256_vsx(FFTComplex *z)
{
    FFTComplex *third  = z + 128;
    FFTComplex *fourth = z + 192;

    fft128_vsx(z);
    fft64_vsx(third);
    fft64_vsx(fourth);
    pass_vsx(z, ff_cos_256, 32);
}
/* 512-point FFT: 256-point on the first half, 128-point on each of the last
 * two quarters, then one combining pass. */
static void fft512_vsx(FFTComplex *z)
{
    FFTComplex *third  = z + 256;
    FFTComplex *fourth = z + 384;

    fft256_vsx(z);
    fft128_vsx(third);
    fft128_vsx(fourth);
    pass_vsx(z, ff_cos_512, 64);
}
/* 1024-point FFT: 512-point on the first half, 256-point on each of the last
 * two quarters, then one combining pass. */
static void fft1024_vsx(FFTComplex *z)
{
    FFTComplex *third  = z + 512;
    FFTComplex *fourth = z + 768;

    fft512_vsx(z);
    fft256_vsx(third);
    fft256_vsx(fourth);
    pass_vsx(z, ff_cos_1024, 128);
}
/* 2048-point FFT: 1024-point on the first half, 512-point on each of the last
 * two quarters, then one combining pass. */
static void fft2048_vsx(FFTComplex *z)
{
    FFTComplex *third  = z + 1024;
    FFTComplex *fourth = z + 1536;

    fft1024_vsx(z);
    fft512_vsx(third);
    fft512_vsx(fourth);
    pass_vsx(z, ff_cos_2048, 256);
}
/* 4096-point FFT: 2048-point on the first half, 1024-point on each of the
 * last two quarters, then one combining pass. */
static void fft4096_vsx(FFTComplex *z)
{
    FFTComplex *third  = z + 2048;
    FFTComplex *fourth = z + 3072;

    fft2048_vsx(z);
    fft1024_vsx(third);
    fft1024_vsx(fourth);
    pass_vsx(z, ff_cos_4096, 512);
}
/* 8192-point FFT: 4096-point on the first half, 2048-point on each of the
 * last two quarters, then one combining pass. */
static void fft8192_vsx(FFTComplex *z)
{
    FFTComplex *third  = z + 4096;
    FFTComplex *fourth = z + 6144;

    fft4096_vsx(z);
    fft2048_vsx(third);
    fft2048_vsx(fourth);
    pass_vsx(z, ff_cos_8192, 1024);
}
/* 16384-point FFT: 8192-point on the first half, 4096-point on each of the
 * last two quarters, then one combining pass. */
static void fft16384_vsx(FFTComplex *z)
{
    FFTComplex *third  = z + 8192;
    FFTComplex *fourth = z + 12288;

    fft8192_vsx(z);
    fft4096_vsx(third);
    fft4096_vsx(fourth);
    pass_vsx(z, ff_cos_16384, 2048);
}
/* 32768-point FFT: 16384-point on the first half, 8192-point on each of the
 * last two quarters, then one combining pass. */
static void fft32768_vsx(FFTComplex *z)
{
    FFTComplex *third  = z + 16384;
    FFTComplex *fourth = z + 24576;

    fft16384_vsx(z);
    fft8192_vsx(third);
    fft8192_vsx(fourth);
    pass_vsx(z, ff_cos_32768, 4096);
}
/* 65536-point FFT: 32768-point on the first half, 16384-point on each of the
 * last two quarters, then one combining pass. */
static void fft65536_vsx(FFTComplex *z)
{
    FFTComplex *third  = z + 32768;
    FFTComplex *fourth = z + 49152;

    fft32768_vsx(z);
    fft16384_vsx(third);
    fft16384_vsx(fourth);
    pass_vsx(z, ff_cos_65536, 8192);
}
/* Non-interleaved transforms; entry i handles 2^(i+2) points, so callers
 * index it with nbits - 2. */
static void (* const fft_dispatch_vsx[])(FFTComplex*) = {
    fft4_vsx,
    fft8_vsx,
    fft16_vsx,
    fft32_vsx,
    fft64_vsx,
    fft128_vsx,
    fft256_vsx,
    fft512_vsx,
    fft1024_vsx,
    fft2048_vsx,
    fft4096_vsx,
    fft8192_vsx,
    fft16384_vsx,
    fft32768_vsx,
    fft65536_vsx,
};
/* Interleaved transforms; entry i handles 2^(i+2) points, so callers index
 * it with nbits - 2. */
static void (* const fft_dispatch_vsx_interleave[])(FFTComplex*) = {
    fft4_vsx_interleave,
    fft8_vsx_interleave,
    fft16_vsx_interleave,
    fft32_vsx_interleave,
    fft64_vsx_interleave,
    fft128_vsx_interleave,
    fft256_vsx_interleave,
    fft512_vsx_interleave,
    fft1024_vsx_interleave,
    fft2048_vsx_interleave,
    fft4096_vsx_interleave,
    fft8192_vsx_interleave,
    fft16384_vsx_interleave,
    fft32768_vsx_interleave,
    fft65536_vsx_interleave,
};
/**
 * Run the interleaved VSX FFT of size 1 << s->nbits on z, in place.
 * s->nbits must lie within the range covered by the dispatch table.
 */
void ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z)
{
    void (*const transform)(FFTComplex*) = fft_dispatch_vsx_interleave[s->nbits-2];

    transform(z);
}
/**
 * Run the non-interleaved VSX FFT of size 1 << s->nbits on z, in place.
 * s->nbits must lie within the range covered by the dispatch table.
 */
void ff_fft_calc_vsx(FFTContext *s, FFTComplex *z)
{
    void (*const transform)(FFTComplex*) = fft_dispatch_vsx[s->nbits-2];

    transform(z);
}
#endif /* HAVE_VSX */

View File

@@ -0,0 +1,830 @@
#ifndef AVCODEC_PPC_FFT_VSX_H
#define AVCODEC_PPC_FFT_VSX_H
/*
* FFT transform, optimized with VSX built-in functions
* Copyright (c) 2014 Rong Yan Copyright (c) 2009 Loren Merritt
*
* This algorithm (though not any of the implementation details) is
* based on libdjbfft by D. J. Bernstein, and fft_altivec_s.S.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/fft.h"
#include "libavcodec/fft-internal.h"
#if HAVE_VSX
void ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z);
void ff_fft_calc_vsx(FFTContext *s, FFTComplex *z);
/* Byte offset of the N-th FFTComplex from the start of an array; used below
 * as the displacement argument of vec_ld() / vec_st(). */
#define byte_2complex (2*sizeof(FFTComplex))
#define byte_4complex (4*sizeof(FFTComplex))
#define byte_6complex (6*sizeof(FFTComplex))
#define byte_8complex (8*sizeof(FFTComplex))
#define byte_10complex (10*sizeof(FFTComplex))
#define byte_12complex (12*sizeof(FFTComplex))
#define byte_14complex (14*sizeof(FFTComplex))
/**
 * Combining pass for the interleaved (re, im, re, im, ...) VSX transforms.
 *
 * Works in place on four regions of z starting at element offsets 0, o1 = 2n,
 * o2 = 4n and o3 = 6n, using the twiddle table wre read forwards and
 * wim = wre + 2n read backwards.
 *
 * Every step handles 4 complex values (two vectors) from each region. The
 * first step is peeled out of the loop: it never reads wre[0] / wim[0] and
 * passes the first element of x0/x1 through without a multiply. The loop
 * then repeats the general step, advancing out by 8 floats and the twiddle
 * pointers by 4 entries each time.
 *
 * NOTE(review): n is unsigned and the loop ends when n - 2 has been counted
 * down to zero in steps of 2, so n appears to need to be even and >= 4; all
 * callers in fft_vsx.c pass powers of two >= 4.
 *
 * @param z   data, accessed with vec_ld/vec_st (16-byte aligned accesses)
 * @param wre twiddle table for this transform size
 * @param n   one eighth of the transform size in complex values, as passed by
 *            the fftN_vsx_interleave callers
 */
inline static void pass_vsx_interleave(FFTComplex *z, const FFTSample *wre, unsigned int n)
{
int o1 = n<<1;
int o2 = n<<2;
int o3 = o1+o2;
int i1, i2, i3;
FFTSample* out = (FFTSample*)z;
const FFTSample *wim = wre+o1;
vec_f vz0, vzo1, vzo2, vzo3;
vec_f x0, x1, x2, x3;
vec_f x4, x5, x6, x7;
vec_f x8, x9, x10, x11;
vec_f x12, x13, x14, x15;
vec_f x16, x17, x18, x19;
vec_f x20, x21, x22, x23;
vec_f vz0plus1, vzo1plus1, vzo2plus1, vzo3plus1;
vec_f y0, y1, y2, y3;
vec_f y4, y5, y8, y9;
vec_f y10, y13, y14, y15;
vec_f y16, y17, y18, y19;
vec_f y20, y21, y22, y23;
vec_f wr1, wi1, wr0, wi0;
vec_f wr2, wi2, wr3, wi3;
vec_f xmulwi0, xmulwi1, ymulwi2, ymulwi3;
/* the peeled first step accounts for two of the n counts */
n = n-2;
/* byte offsets of the three upper regions */
i1 = o1*sizeof(FFTComplex);
i2 = o2*sizeof(FFTComplex);
i3 = o3*sizeof(FFTComplex);
/* ---- peeled first step ---- */
vzo2 = vec_ld(i2, &(out[0])); // zo2.r zo2.i z(o2+1).r z(o2+1).i
vzo2plus1 = vec_ld(i2+16, &(out[0]));
vzo3 = vec_ld(i3, &(out[0])); // zo3.r zo3.i z(o3+1).r z(o3+1).i
vzo3plus1 = vec_ld(i3+16, &(out[0]));
vz0 = vec_ld(0, &(out[0])); // z0.r z0.i z1.r z1.i
vz0plus1 = vec_ld(16, &(out[0]));
vzo1 = vec_ld(i1, &(out[0])); // zo1.r zo1.i z(o1+1).r z(o1+1).i
vzo1plus1 = vec_ld(i1+16, &(out[0]));
x0 = vec_add(vzo2, vzo3);
x1 = vec_sub(vzo2, vzo3);
y0 = vec_add(vzo2plus1, vzo3plus1);
y1 = vec_sub(vzo2plus1, vzo3plus1);
/* twiddles for elements 1..3 only; element 0 is not multiplied here */
wr1 = vec_splats(wre[1]);
wi1 = vec_splats(wim[-1]);
wi2 = vec_splats(wim[-2]);
wi3 = vec_splats(wim[-3]);
wr2 = vec_splats(wre[2]);
wr3 = vec_splats(wre[3]);
x2 = vec_perm(x0, x1, vcprm(2,s2,3,s3));
x3 = vec_perm(x0, x1, vcprm(s3,3,s2,2));
y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0));
y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2));
y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1));
y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3));
ymulwi2 = vec_mul(y4, wi2);
ymulwi3 = vec_mul(y5, wi3);
x4 = vec_mul(x2, wr1);
x5 = vec_mul(x3, wi1);
y8 = vec_madd(y2, wr2, ymulwi2);
y9 = vec_msub(y2, wr2, ymulwi2);
x6 = vec_add(x4, x5);
x7 = vec_sub(x4, x5);
y13 = vec_madd(y3, wr3, ymulwi3);
y14 = vec_msub(y3, wr3, ymulwi3);
x8 = vec_perm(x6, x7, vcprm(0,1,s2,s3));
y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3));
y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3));
x9 = vec_perm(x0, x8, vcprm(0,1,s0,s2));
x10 = vec_perm(x1, x8, vcprm(1,0,s3,s1));
y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2));
y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1));
/* final butterflies against the two lower regions */
x11 = vec_add(vz0, x9);
x12 = vec_sub(vz0, x9);
x13 = vec_add(vzo1, x10);
x14 = vec_sub(vzo1, x10);
y18 = vec_add(vz0plus1, y16);
y19 = vec_sub(vz0plus1, y16);
y20 = vec_add(vzo1plus1, y17);
y21 = vec_sub(vzo1plus1, y17);
x15 = vec_perm(x13, x14, vcprm(0,s1,2,s3));
x16 = vec_perm(x13, x14, vcprm(s0,1,s2,3));
y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3));
y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3));
vec_st(x11, 0, &(out[0]));
vec_st(y18, 16, &(out[0]));
vec_st(x15, i1, &(out[0]));
vec_st(y22, i1+16, &(out[0]));
vec_st(x12, i2, &(out[0]));
vec_st(y19, i2+16, &(out[0]));
vec_st(x16, i3, &(out[0]));
vec_st(y23, i3+16, &(out[0]));
/* ---- general step: same network, with twiddles for all four elements ---- */
do {
out += 8;
wre += 4;
wim -= 4;
wr0 = vec_splats(wre[0]);
wr1 = vec_splats(wre[1]);
wi0 = vec_splats(wim[0]);
wi1 = vec_splats(wim[-1]);
wr2 = vec_splats(wre[2]);
wr3 = vec_splats(wre[3]);
wi2 = vec_splats(wim[-2]);
wi3 = vec_splats(wim[-3]);
vzo2 = vec_ld(i2, &(out[0])); // zo2.r zo2.i z(o2+1).r z(o2+1).i
vzo2plus1 = vec_ld(i2+16, &(out[0]));
vzo3 = vec_ld(i3, &(out[0])); // zo3.r zo3.i z(o3+1).r z(o3+1).i
vzo3plus1 = vec_ld(i3+16, &(out[0]));
vz0 = vec_ld(0, &(out[0])); // z0.r z0.i z1.r z1.i
vz0plus1 = vec_ld(16, &(out[0]));
vzo1 = vec_ld(i1, &(out[0])); // zo1.r zo1.i z(o1+1).r z(o1+1).i
vzo1plus1 = vec_ld(i1+16, &(out[0]));
x0 = vec_add(vzo2, vzo3);
x1 = vec_sub(vzo2, vzo3);
y0 = vec_add(vzo2plus1, vzo3plus1);
y1 = vec_sub(vzo2plus1, vzo3plus1);
x4 = vec_perm(x0, x1, vcprm(s1,1,s0,0));
x5 = vec_perm(x0, x1, vcprm(s3,3,s2,2));
x2 = vec_perm(x0, x1, vcprm(0,s0,1,s1));
x3 = vec_perm(x0, x1, vcprm(2,s2,3,s3));
y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1));
y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3));
xmulwi0 = vec_mul(x4, wi0);
xmulwi1 = vec_mul(x5, wi1);
y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0));
y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2));
x8 = vec_madd(x2, wr0, xmulwi0);
x9 = vec_msub(x2, wr0, xmulwi0);
ymulwi2 = vec_mul(y4, wi2);
ymulwi3 = vec_mul(y5, wi3);
x13 = vec_madd(x3, wr1, xmulwi1);
x14 = vec_msub(x3, wr1, xmulwi1);
y8 = vec_madd(y2, wr2, ymulwi2);
y9 = vec_msub(y2, wr2, ymulwi2);
y13 = vec_madd(y3, wr3, ymulwi3);
y14 = vec_msub(y3, wr3, ymulwi3);
x10 = vec_perm(x8, x9, vcprm(0,1,s2,s3));
x15 = vec_perm(x13, x14, vcprm(0,1,s2,s3));
y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3));
y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3));
x16 = vec_perm(x10, x15, vcprm(0,2,s0,s2));
x17 = vec_perm(x10, x15, vcprm(3,1,s3,s1));
y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2));
y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1));
x18 = vec_add(vz0, x16);
x19 = vec_sub(vz0, x16);
x20 = vec_add(vzo1, x17);
x21 = vec_sub(vzo1, x17);
y18 = vec_add(vz0plus1, y16);
y19 = vec_sub(vz0plus1, y16);
y20 = vec_add(vzo1plus1, y17);
y21 = vec_sub(vzo1plus1, y17);
x22 = vec_perm(x20, x21, vcprm(0,s1,2,s3));
x23 = vec_perm(x20, x21, vcprm(s0,1,s2,3));
y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3));
y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3));
vec_st(x18, 0, &(out[0]));
vec_st(y18, 16, &(out[0]));
vec_st(x22, i1, &(out[0]));
vec_st(y22, i1+16, &(out[0]));
vec_st(x19, i2, &(out[0]));
vec_st(y19, i2+16, &(out[0]));
vec_st(x23, i3, &(out[0]));
vec_st(y23, i3+16, &(out[0]));
} while (n-=2);
}
/* Scalar 2-point butterfly, in place: z[0] becomes the sum and z[1] the
 * difference of the two inputs, separately for the real and imaginary parts. */
inline static void fft2_vsx_interleave(FFTComplex *z)
{
    FFTSample re_diff = z[0].re - z[1].re;
    FFTSample im_diff = z[0].im - z[1].im;

    z[0].re += z[1].re;
    z[0].im += z[1].im;
    z[1].re  = re_diff;
    z[1].im  = im_diff;
}
/* 4-point FFT on interleaved (re, im) data, in place: two permute +
 * add/subtract stages over the two vectors that hold z[0..3]. */
inline static void fft4_vsx_interleave(FFTComplex *z)
{
    float *data = (float*)z;
    vec_f lo, hi, even, odd;

    lo = vec_ld(0, data);
    hi = vec_ld(byte_2complex, data);

    /* stage 1 */
    even = vec_perm(lo, hi, vcprm(0,1,s2,s1));
    odd  = vec_perm(lo, hi, vcprm(2,3,s0,s3));
    lo   = vec_add(even, odd);
    hi   = vec_sub(even, odd);

    /* stage 2 */
    even = vec_perm(lo, hi, vcprm(0,1,s0,s1));
    odd  = vec_perm(lo, hi, vcprm(2,3,s3,s2));
    lo   = vec_add(even, odd);
    hi   = vec_sub(even, odd);

    vec_st(lo, 0, data);
    vec_st(hi, byte_2complex, data);
}
/**
 * 8-point FFT on interleaved (re, im) data, in place.
 *
 * Loads z[0..7] as four vectors, runs two permute + add/subtract stages on
 * each half, combines the halves (the odd outputs are scaled by sqrthalf)
 * and finally permutes the results back into z[0..7] order before storing.
 * The lane-content comments below are the original author's.
 */
inline static void fft8_vsx_interleave(FFTComplex *z)
{
vec_f vz0, vz1, vz2, vz3;
vec_f x0, x1, x2, x3;
vec_f x4, x5, x6, x7;
vec_f x8, x9, x10, x11;
vec_f x12, x13, x14, x15;
vec_f x16, x17, x18, x19;
vec_f x20, x21, x22, x23;
vec_f x24, x25, x26, x27;
vec_f x28, x29, x30, x31;
vec_f x32, x33, x34;
float* out= (float*)z;
vec_f vc1 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
vz0 = vec_ld(0, &(out[0]));
vz1 = vec_ld(byte_2complex, &(out[0]));
vz2 = vec_ld(byte_4complex, &(out[0]));
vz3 = vec_ld(byte_6complex, &(out[0]));
/* first permute + add/sub stage on each half */
x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
x2 = vec_perm(vz2, vz3, vcprm(2,1,s0,s1));
x3 = vec_perm(vz2, vz3, vcprm(0,3,s2,s3));
x4 = vec_add(x0, x1);
x5 = vec_sub(x0, x1);
x6 = vec_add(x2, x3);
x7 = vec_sub(x2, x3);
/* second permute + add/sub stage on each half */
x8 = vec_perm(x4, x5, vcprm(0,1,s0,s1));
x9 = vec_perm(x4, x5, vcprm(2,3,s3,s2));
x10 = vec_perm(x6, x7, vcprm(2,1,s2,s1));
x11 = vec_perm(x6, x7, vcprm(0,3,s0,s3));
x12 = vec_add(x8, x9);
x13 = vec_sub(x8, x9);
x14 = vec_add(x10, x11);
x15 = vec_sub(x10, x11);
/* combine the two halves: even outputs */
x16 = vec_perm(x12, x13, vcprm(0,s0,1,s1));
x17 = vec_perm(x14, x15, vcprm(0,s0,1,s1));
x18 = vec_perm(x16, x17, vcprm(s0,s3,s2,s1));
x19 = vec_add(x16, x18); // z0.r z2.r z0.i z2.i
x20 = vec_sub(x16, x18); // z4.r z6.r z4.i z6.i
/* combine the two halves: odd outputs, scaled by sqrthalf */
x21 = vec_perm(x12, x13, vcprm(2,s2,3,s3));
x22 = vec_perm(x14, x15, vcprm(2,3,s2,s3));
x23 = vec_perm(x14, x15, vcprm(3,2,s3,s2));
x24 = vec_add(x22, x23);
x25 = vec_sub(x22, x23);
x26 = vec_mul( vec_perm(x24, x25, vcprm(2,s2,0,s0)), vc1);
x27 = vec_add(x21, x26); // z1.r z7.r z1.i z3.i
x28 = vec_sub(x21, x26); //z5.r z3.r z5.i z7.i
/* reorder into (re, im) pairs in natural z[] order */
x29 = vec_perm(x19, x27, vcprm(0,2,s0,s2)); // z0.r z0.i z1.r z1.i
x30 = vec_perm(x19, x27, vcprm(1,3,s1,s3)); // z2.r z2.i z7.r z3.i
x31 = vec_perm(x20, x28, vcprm(0,2,s0,s2)); // z4.r z4.i z5.r z5.i
x32 = vec_perm(x20, x28, vcprm(1,3,s1,s3)); // z6.r z6.i z3.r z7.i
x33 = vec_perm(x30, x32, vcprm(0,1,s2,3)); // z2.r z2.i z3.r z3.i
x34 = vec_perm(x30, x32, vcprm(s0,s1,2,s3)); // z6.r z6.i z7.r z7.i
vec_st(x29, 0, &(out[0]));
vec_st(x33, byte_2complex, &(out[0]));
vec_st(x31, byte_4complex, &(out[0]));
vec_st(x34, byte_6complex, &(out[0]));
}
/**
 * 16-point FFT on interleaved (re, im) data, in place.
 *
 * Loads z[0..15] as eight vectors and evaluates the whole transform as a
 * straight-line network of permutes, adds/subtracts and multiplies by three
 * constants: sqrthalf (vc0), ff_cos_16[1] (vc1) and ff_cos_16[3] (vc2).
 * Results are permuted back into natural order and stored as z[0..7] and
 * then z[8..15]. The lane-content comments are the original author's.
 */
inline static void fft16_vsx_interleave(FFTComplex *z)
{
float* out= (float*)z;
vec_f vc0 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
vec_f vc1 = {ff_cos_16[1], ff_cos_16[1], ff_cos_16[1], ff_cos_16[1]};
vec_f vc2 = {ff_cos_16[3], ff_cos_16[3], ff_cos_16[3], ff_cos_16[3]};
vec_f vz0, vz1, vz2, vz3;
vec_f vz4, vz5, vz6, vz7;
vec_f x0, x1, x2, x3;
vec_f x4, x5, x6, x7;
vec_f x8, x9, x10, x11;
vec_f x12, x13, x14, x15;
vec_f x16, x17, x18, x19;
vec_f x20, x21, x22, x23;
vec_f x24, x25, x26, x27;
vec_f x28, x29, x30, x31;
vec_f x32, x33, x34, x35;
vec_f x36, x37, x38, x39;
vec_f x40, x41, x42, x43;
vec_f x44, x45, x46, x47;
vec_f x48, x49, x50, x51;
vec_f x52, x53, x54, x55;
vec_f x56, x57, x58, x59;
vec_f x60, x61, x62, x63;
vec_f x64, x65, x66, x67;
vec_f x68, x69, x70, x71;
vec_f x72, x73, x74, x75;
vec_f x76, x77, x78, x79;
vec_f x80, x81, x82, x83;
vec_f x84, x85, x86;
/* load all 16 complex inputs */
vz0 = vec_ld(0, &(out[0]));
vz1 = vec_ld(byte_2complex, &(out[0]));
vz2 = vec_ld(byte_4complex, &(out[0]));
vz3 = vec_ld(byte_6complex, &(out[0]));
vz4 = vec_ld(byte_8complex, &(out[0]));
vz5 = vec_ld(byte_10complex, &(out[0]));
vz6 = vec_ld(byte_12complex, &(out[0]));
vz7 = vec_ld(byte_14complex, &(out[0]));
/* first permute + add/sub stage on each vector pair */
x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
x2 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1));
x3 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3));
x4 = vec_perm(vz4, vz5, vcprm(0,1,s2,s1));
x5 = vec_perm(vz4, vz5, vcprm(2,3,s0,s3));
x6 = vec_perm(vz6, vz7, vcprm(0,1,s2,s1));
x7 = vec_perm(vz6, vz7, vcprm(2,3,s0,s3));
x8 = vec_add(x0, x1);
x9 = vec_sub(x0, x1);
x10 = vec_add(x2, x3);
x11 = vec_sub(x2, x3);
x12 = vec_add(x4, x5);
x13 = vec_sub(x4, x5);
x14 = vec_add(x6, x7);
x15 = vec_sub(x6, x7);
/* second permute + add/sub stage */
x16 = vec_perm(x8, x9, vcprm(0,1,s0,s1));
x17 = vec_perm(x8, x9, vcprm(2,3,s3,s2));
x18 = vec_perm(x10, x11, vcprm(2,1,s1,s2));
x19 = vec_perm(x10, x11, vcprm(0,3,s0,s3));
x20 = vec_perm(x12, x14, vcprm(0,1,s0, s1));
x21 = vec_perm(x12, x14, vcprm(2,3,s2,s3));
x22 = vec_perm(x13, x15, vcprm(0,1,s0,s1));
x23 = vec_perm(x13, x15, vcprm(3,2,s3,s2));
x24 = vec_add(x16, x17);
x25 = vec_sub(x16, x17);
x26 = vec_add(x18, x19);
x27 = vec_sub(x18, x19);
x28 = vec_add(x20, x21);
x29 = vec_sub(x20, x21);
x30 = vec_add(x22, x23);
x31 = vec_sub(x22, x23);
/* outputs 0, 4, 8, 12 */
x32 = vec_add(x24, x26);
x33 = vec_sub(x24, x26);
x34 = vec_perm(x32, x33, vcprm(0,1,s0,s1));
x35 = vec_perm(x28, x29, vcprm(2,1,s1,s2));
x36 = vec_perm(x28, x29, vcprm(0,3,s0,s3));
x37 = vec_add(x35, x36);
x38 = vec_sub(x35, x36);
x39 = vec_perm(x37, x38, vcprm(0,1,s1,s0));
/* terms scaled by sqrthalf */
x40 = vec_perm(x27, x38, vcprm(3,2,s2,s3));
x41 = vec_perm(x26, x37, vcprm(2,3,s3,s2));
x42 = vec_add(x40, x41);
x43 = vec_sub(x40, x41);
x44 = vec_mul(x42, vc0);
x45 = vec_mul(x43, vc0);
x46 = vec_add(x34, x39); // z0.r z0.i z4.r z4.i
x47 = vec_sub(x34, x39); // z8.r z8.i z12.r z12.i
/* terms scaled by ff_cos_16[1] and ff_cos_16[3] */
x48 = vec_perm(x30, x31, vcprm(2,1,s1,s2));
x49 = vec_perm(x30, x31, vcprm(0,3,s3,s0));
x50 = vec_add(x48, x49);
x51 = vec_sub(x48, x49);
x52 = vec_mul(x50, vc1);
x53 = vec_mul(x50, vc2);
x54 = vec_mul(x51, vc1);
x55 = vec_mul(x51, vc2);
/* odd outputs */
x56 = vec_perm(x24, x25, vcprm(2,3,s2,s3));
x57 = vec_perm(x44, x45, vcprm(0,1,s1,s0));
x58 = vec_add(x56, x57);
x59 = vec_sub(x56, x57);
x60 = vec_perm(x54, x55, vcprm(1,0,3,2));
x61 = vec_perm(x54, x55, vcprm(s1,s0,s3,s2));
x62 = vec_add(x52, x61);
x63 = vec_sub(x52, x61);
x64 = vec_add(x60, x53);
x65 = vec_sub(x60, x53);
x66 = vec_perm(x62, x64, vcprm(0,1,s3,s2));
x67 = vec_perm(x63, x65, vcprm(s0,s1,3,2));
x68 = vec_add(x58, x66); // z1.r z1.i z3.r z3.i
x69 = vec_sub(x58, x66); // z9.r z9.i z11.r z11.i
x70 = vec_add(x59, x67); // z5.r z5.i z15.r z15.i
x71 = vec_sub(x59, x67); // z13.r z13.i z7.r z7.i
/* outputs 2, 6, 10, 14 */
x72 = vec_perm(x25, x27, vcprm(s1,s0,s2,s3));
x73 = vec_add(x25, x72);
x74 = vec_sub(x25, x72);
x75 = vec_perm(x73, x74, vcprm(0,1,s0,s1));
x76 = vec_perm(x44, x45, vcprm(3,2,s2,s3));
x77 = vec_add(x75, x76); // z2.r z2.i z6.r z6.i
x78 = vec_sub(x75, x76); // z10.r z10.i z14.r z14.i
/* reassemble and store z[0..7] */
x79 = vec_perm(x46, x68, vcprm(0,1,s0,s1)); // z0.r z0.i z1.r z1.i
x80 = vec_perm(x77, x68, vcprm(0,1,s2,s3)); // z2.r z2.i z3.r z3.i
x81 = vec_perm(x46, x70, vcprm(2,3,s0,s1)); // z4.r z4.i z5.r z5.i
x82 = vec_perm(x71, x77, vcprm(s2,s3,2,3)); // z6.r z6.i z7.r z7.i
vec_st(x79, 0, &(out[0]));
vec_st(x80, byte_2complex, &(out[0]));
vec_st(x81, byte_4complex, &(out[0]));
vec_st(x82, byte_6complex, &(out[0]));
/* reassemble and store z[8..15] */
x83 = vec_perm(x47, x69, vcprm(0,1,s0,s1)); // z8.r z8.i z9.r z9.i
x84 = vec_perm(x78, x69, vcprm(0,1,s2,s3)); // z10.r z10.i z11.r z11.i
x85 = vec_perm(x47, x71, vcprm(2,3,s0,s1)); // z12.r z12.i z13.r z13.i
x86 = vec_perm(x70, x78, vcprm(s2,s3,2,3)); // z14.r z14.i z15.r z15.i
vec_st(x83, byte_8complex, &(out[0]));
vec_st(x84, byte_10complex, &(out[0]));
vec_st(x85, byte_12complex, &(out[0]));
vec_st(x86, byte_14complex, &(out[0]));
}
/* 4-point FFT, non-interleaved variant, in place: two permute + add/subtract
 * stages followed by a final permute that arranges the two output vectors. */
inline static void fft4_vsx(FFTComplex *z)
{
    float *data = (float*)z;
    vec_f lo, hi, p, q;

    lo = vec_ld(0, data);
    hi = vec_ld(byte_2complex, data);

    /* stage 1 */
    p  = vec_perm(lo, hi, vcprm(0,1,s2,s1));
    q  = vec_perm(lo, hi, vcprm(2,3,s0,s3));
    lo = vec_add(p, q);
    hi = vec_sub(p, q);

    /* stage 2 */
    p  = vec_perm(lo, hi, vcprm(0,s0,1,s1));
    q  = vec_perm(lo, hi, vcprm(2,s3,3,s2));
    lo = vec_add(p, q);
    hi = vec_sub(p, q);

    /* output arrangement */
    p = vec_perm(lo, hi, vcprm(0,1,s0,s1));
    q = vec_perm(lo, hi, vcprm(2,3,s2,s3));

    vec_st(p, 0, data);
    vec_st(q, byte_2complex, data);
}
/**
 * 8-point FFT, non-interleaved variant, in place.
 *
 * Loads z[0..7] as four vectors and applies three permute + add/subtract
 * stages. The rotation of the odd terms is done with two vec_madd calls:
 * the first multiplies by the signed sqrthalf pattern vc1 (vc0 is the zero
 * addend), the second adds the word-swapped copy scaled by vc2.
 * NOTE(review): the permute constants match rows of the fft_data table used
 * by the AltiVec assembly, so this looks like a C port of its FFT8 macro --
 * confirm before relying on that.
 */
inline static void fft8_vsx(FFTComplex *z)
{
vec_f vz0, vz1, vz2, vz3;
vec_f vz4, vz5, vz6, vz7, vz8;
float* out= (float*)z;
vec_f vc0 = {0.0, 0.0, 0.0, 0.0};
vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf};
vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
vz0 = vec_ld(0, &(out[0]));
vz1 = vec_ld(byte_2complex, &(out[0]));
vz2 = vec_ld(byte_4complex, &(out[0]));
vz3 = vec_ld(byte_6complex, &(out[0]));
/* stage 1 */
vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
vz7 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3));
vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
vz2 = vec_add(vz6, vz7);
vz3 = vec_sub(vz6, vz7);
vz8 = vec_perm(vz3, vz3, vcprm(2,3,0,1));
vz0 = vec_add(vz4, vz5);
vz1 = vec_sub(vz4, vz5);
/* vz3 = vz3 * vc1 + swapped(vz3) * vc2 */
vz3 = vec_madd(vz3, vc1, vc0);
vz3 = vec_madd(vz8, vc2, vz3);
/* stage 2 */
vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
vz6 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0));
vz7 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1));
vz0 = vec_add(vz4, vz5);
vz1 = vec_sub(vz4, vz5);
vz2 = vec_add(vz6, vz7);
vz3 = vec_sub(vz6, vz7);
/* stage 3 */
vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
vz6 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3));
vz7 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2));
vz2 = vec_sub(vz4, vz6);
vz3 = vec_sub(vz5, vz7);
vz0 = vec_add(vz4, vz6);
vz1 = vec_add(vz5, vz7);
vec_st(vz0, 0, &(out[0]));
vec_st(vz1, byte_2complex, &(out[0]));
vec_st(vz2, byte_4complex, &(out[0]));
vec_st(vz3, byte_6complex, &(out[0]));
return;
}
/**
 * 16-point FFT, non-interleaved variant, in place.
 *
 * Structure:
 *   1. two 4-point transforms on z[8..15], results kept in vz4..vz7;
 *   2. one 8-point transform on z[0..7], results in vz0..vz3 (same network
 *      as fft8_vsx);
 *   3. complex multiply of vz4..vz7 by the 16-point twiddles: vc3 holds the
 *      cosine terms, vc4 the sine terms and vc5 the negated sine terms, so
 *      the subtracting products are expressed as vec_madd with vc5;
 *   4. final add/subtract butterflies and stores.
 * vc0 is the zero addend that turns vec_madd into a plain multiply.
 */
inline static void fft16_vsx(FFTComplex *z)
{
float* out= (float*)z;
vec_f vc0 = {0.0, 0.0, 0.0, 0.0};
vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf};
vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
vec_f vc3 = {1.0, 0.92387953, sqrthalf, 0.38268343};
vec_f vc4 = {0.0, 0.38268343, sqrthalf, 0.92387953};
vec_f vc5 = {-0.0, -0.38268343, -sqrthalf, -0.92387953};
vec_f vz0, vz1, vz2, vz3;
vec_f vz4, vz5, vz6, vz7;
vec_f vz8, vz9, vz10, vz11;
vec_f vz12, vz13;
/* ---- step 1: two 4-point transforms on z[8..15] ---- */
vz0 = vec_ld(byte_8complex, &(out[0]));
vz1 = vec_ld(byte_10complex, &(out[0]));
vz2 = vec_ld(byte_12complex, &(out[0]));
vz3 = vec_ld(byte_14complex, &(out[0]));
vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
vz6 = vec_perm(vz2, vz3, vcprm(0,1,s2,s1));
vz7 = vec_perm(vz2, vz3, vcprm(2,3,s0,s3));
vz0 = vec_add(vz4, vz5);
vz1= vec_sub(vz4, vz5);
vz2 = vec_add(vz6, vz7);
vz3 = vec_sub(vz6, vz7);
vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
vz7 = vec_perm(vz2, vz3, vcprm(2,s3,3,s2));
vz0 = vec_add(vz4, vz5);
vz1 = vec_sub(vz4, vz5);
vz2 = vec_add(vz6, vz7);
vz3 = vec_sub(vz6, vz7);
vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
vz6 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1));
vz7 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3));
/* ---- step 2: 8-point transform on z[0..7] ---- */
vz0 = vec_ld(0, &(out[0]));
vz1 = vec_ld(byte_2complex, &(out[0]));
vz2 = vec_ld(byte_4complex, &(out[0]));
vz3 = vec_ld(byte_6complex, &(out[0]));
vz10 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
vz11 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3));
vz8 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
vz9 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
vz2 = vec_add(vz10, vz11);
vz3 = vec_sub(vz10, vz11);
vz12 = vec_perm(vz3, vz3, vcprm(2,3,0,1));
vz0 = vec_add(vz8, vz9);
vz1 = vec_sub(vz8, vz9);
vz3 = vec_madd(vz3, vc1, vc0);
vz3 = vec_madd(vz12, vc2, vz3);
vz8 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
vz9 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
vz10 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0));
vz11 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1));
vz0 = vec_add(vz8, vz9);
vz1 = vec_sub(vz8, vz9);
vz2 = vec_add(vz10, vz11);
vz3 = vec_sub(vz10, vz11);
vz8 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
vz9 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
vz10 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3));
vz11 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2));
vz2 = vec_sub(vz8, vz10);
vz3 = vec_sub(vz9, vz11);
vz0 = vec_add(vz8, vz10);
vz1 = vec_add(vz9, vz11);
/* ---- step 3: twiddle multiply of the upper half ---- */
vz8 = vec_madd(vz4, vc3, vc0);
vz9 = vec_madd(vz5, vc3, vc0);
vz10 = vec_madd(vz6, vc3, vc0);
vz11 = vec_madd(vz7, vc3, vc0);
vz8 = vec_madd(vz5, vc4, vz8);
vz9 = vec_madd(vz4, vc5, vz9);
vz10 = vec_madd(vz7, vc5, vz10);
vz11 = vec_madd(vz6, vc4, vz11);
/* ---- step 4: combining butterflies ---- */
vz12 = vec_sub(vz10, vz8);
vz10 = vec_add(vz10, vz8);
vz13 = vec_sub(vz9, vz11);
vz11 = vec_add(vz9, vz11);
vz4 = vec_sub(vz0, vz10);
vz0 = vec_add(vz0, vz10);
vz7= vec_sub(vz3, vz12);
vz3= vec_add(vz3, vz12);
vz5 = vec_sub(vz1, vz11);
vz1 = vec_add(vz1, vz11);
vz6 = vec_sub(vz2, vz13);
vz2 = vec_add(vz2, vz13);
vec_st(vz0, 0, &(out[0]));
vec_st(vz1, byte_2complex, &(out[0]));
vec_st(vz2, byte_4complex, &(out[0]));
vec_st(vz3, byte_6complex, &(out[0]));
vec_st(vz4, byte_8complex, &(out[0]));
vec_st(vz5, byte_10complex, &(out[0]));
vec_st(vz6, byte_12complex, &(out[0]));
vec_st(vz7, byte_14complex, &(out[0]));
return;
}
/*
 * One FFT combination pass over z, VSX version.
 *
 * z   : buffer updated in place; it is accessed as a flat FFTSample array.
 *       Four sub-blocks are combined, located at byte offsets 0, i1, i2 and
 *       i3, i.e. 2*n, 4*n and 6*n FFTComplex elements apart.
 * wre : coefficient table read forwards, four floats per step; the second
 *       table wim starts at wre + 2*n and is read backwards.
 * n   : pass size. The first step is peeled out of the loop and the do/while
 *       only stops when the pre-decremented counter reaches exactly zero,
 *       so n must be even and at least 4.
 *
 * The r<k>/i<k> labels in the trailing comments follow the original author's
 * naming for the two vectors loaded from sub-block k (presumably real and
 * imaginary parts -- confirm against the data layout the caller provides).
 */
inline static void pass_vsx(FFTComplex * z, const FFTSample * wre, unsigned int n)
{
int o1 = n<<1;
int o2 = n<<2;
int o3 = o1+o2;
int i1, i2, i3;
FFTSample* out = (FFTSample*)z;
const FFTSample *wim = wre+o1;
vec_f v0, v1, v2, v3;
vec_f v4, v5, v6, v7;
vec_f v8, v9, v10, v11;
vec_f v12, v13;
n = n-2; // the first step below is done outside the loop
// byte offsets of sub-blocks 1..3 relative to out
i1 = o1*sizeof(FFTComplex);
i2 = o2*sizeof(FFTComplex);
i3 = o3*sizeof(FFTComplex);
v8 = vec_ld(0, &(wre[0]));
v10 = vec_ld(0, &(wim[0]));
v9 = vec_ld(0, &(wim[-4]));
// presumably yields { wim[0], wim[-1], wim[-2], wim[-3] } -- depends on vcprm()
v9 = vec_perm(v9, v10, vcprm(s0,3,2,1));
v4 = vec_ld(i2, &(out[0])); // r2
v5 = vec_ld(i2+16, &(out[0])); // i2
v6 = vec_ld(i3, &(out[0])); // r3
v7 = vec_ld(i3+16, &(out[0])); // i3
v10 = vec_mul(v4, v8); // r2*wre
v11 = vec_mul(v5, v8); // i2*wre
v12 = vec_mul(v6, v8); // r3*wre
v13 = vec_mul(v7, v8); // i3*wre
v0 = vec_ld(0, &(out[0])); // r0
v3 = vec_ld(i1+16, &(out[0])); // i1
v10 = vec_madd(v5, v9, v10); // r2*wre + i2*wim
v11 = vec_nmsub(v4, v9, v11); // i2*wre - r2*wim
v12 = vec_nmsub(v7, v9, v12); // r3*wre - i3*wim
v13 = vec_madd(v6, v9, v13); // i3*wre + r3*wim
v1 = vec_ld(16, &(out[0])); // i0
v2 = vec_ld(i1, &(out[0])); // r1
// sums and differences of the two rotated inputs
v8 = vec_sub(v12, v10);
v12 = vec_add(v12, v10);
v9 = vec_sub(v11, v13);
v13 = vec_add(v11, v13);
v4 = vec_sub(v0, v12);
v0 = vec_add(v0, v12);
v7 = vec_sub(v3, v8);
v3 = vec_add(v3, v8);
vec_st(v0, 0, &(out[0])); // r0
vec_st(v3, i1+16, &(out[0])); // i1
vec_st(v4, i2, &(out[0])); // r2
vec_st(v7, i3+16, &(out[0]));// i3
v5 = vec_sub(v1, v13);
v1 = vec_add(v1, v13);
v6 = vec_sub(v2, v9);
v2 = vec_add(v2, v9);
vec_st(v1, 16, &(out[0])); // i0
vec_st(v2, i1, &(out[0])); // r1
vec_st(v5, i2+16, &(out[0])); // i2
vec_st(v6, i3, &(out[0])); // r3
// remaining steps: same computation as above on the next 8 floats of each sub-block
do {
out += 8;
wre += 4;
wim -= 4;
v8 = vec_ld(0, &(wre[0]));
v10 = vec_ld(0, &(wim[0]));
v9 = vec_ld(0, &(wim[-4]));
v9 = vec_perm(v9, v10, vcprm(s0,3,2,1));
v4 = vec_ld(i2, &(out[0])); // r2
v5 = vec_ld(i2+16, &(out[0])); // i2
v6 = vec_ld(i3, &(out[0])); // r3
v7 = vec_ld(i3+16, &(out[0]));// i3
v10 = vec_mul(v4, v8); // r2*wre
v11 = vec_mul(v5, v8); // i2*wre
v12 = vec_mul(v6, v8); // r3*wre
v13 = vec_mul(v7, v8); // i3*wre
v0 = vec_ld(0, &(out[0])); // r0
v3 = vec_ld(i1+16, &(out[0])); // i1
v10 = vec_madd(v5, v9, v10); // r2*wre + i2*wim
v11 = vec_nmsub(v4, v9, v11); // i2*wre - r2*wim
v12 = vec_nmsub(v7, v9, v12); // r3*wre - i3*wim
v13 = vec_madd(v6, v9, v13); // i3*wre + r3*wim
v1 = vec_ld(16, &(out[0])); // i0
v2 = vec_ld(i1, &(out[0])); // r1
v8 = vec_sub(v12, v10);
v12 = vec_add(v12, v10);
v9 = vec_sub(v11, v13);
v13 = vec_add(v11, v13);
v4 = vec_sub(v0, v12);
v0 = vec_add(v0, v12);
v7 = vec_sub(v3, v8);
v3 = vec_add(v3, v8);
vec_st(v0, 0, &(out[0])); // r0
vec_st(v3, i1+16, &(out[0])); // i1
vec_st(v4, i2, &(out[0])); // r2
vec_st(v7, i3+16, &(out[0])); // i3
v5 = vec_sub(v1, v13);
v1 = vec_add(v1, v13);
v6 = vec_sub(v2, v9);
v2 = vec_add(v2, v9);
vec_st(v1, 16, &(out[0])); // i0
vec_st(v2, i1, &(out[0])); // r1
vec_st(v5, i2+16, &(out[0])); // i2
vec_st(v6, i3, &(out[0])); // r3
} while (n-=2);
}
#endif
#endif /* AVCODEC_PPC_FFT_VSX_H */

View File

@@ -0,0 +1,66 @@
/*
* Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/fmtconvert.h"
#if HAVE_ALTIVEC
static void int32_to_float_fmul_scalar_altivec(float *dst, const int32_t *src,
float mul, int len)
{
union {
vector float v;
float s[4];
} mul_u;
int i;
vector float src1, src2, dst1, dst2, mul_v, zero;
zero = (vector float)vec_splat_u32(0);
mul_u.s[0] = mul;
mul_v = vec_splat(mul_u.v, 0);
for (i = 0; i < len; i += 8) {
src1 = vec_ctf(vec_ld(0, src+i), 0);
src2 = vec_ctf(vec_ld(16, src+i), 0);
dst1 = vec_madd(src1, mul_v, zero);
dst2 = vec_madd(src2, mul_v, zero);
vec_st(dst1, 0, dst+i);
vec_st(dst2, 16, dst+i);
}
}
#endif /* HAVE_ALTIVEC */
/**
 * Install the PPC implementation of the format-conversion routine.
 *
 * The function pointer in c is replaced only when the build has AltiVec
 * support and the CPU flags report AltiVec at run time; otherwise c is left
 * untouched. avctx is not used here.
 */
av_cold void ff_fmt_convert_init_ppc(FmtConvertContext *c,
                                     AVCodecContext *avctx)
{
#if HAVE_ALTIVEC
    if (PPC_ALTIVEC(av_get_cpu_flags()))
        c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec;
#endif /* HAVE_ALTIVEC */
}

View File

@@ -0,0 +1,64 @@
/*
* Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/h264chroma.h"
#if HAVE_ALTIVEC
/* Store operators plugged into the template through OP_U8_ALTIVEC:
 * "put" assigns the new value, "avg" averages it with the dst operand. */
#define PUT_OP_U8_ALTIVEC(d, s, dst) d = s
#define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s)
/* First inclusion of the template: generates put_h264_chroma_mc8_altivec(). */
#define OP_U8_ALTIVEC PUT_OP_U8_ALTIVEC
#define PREFIX_h264_chroma_mc8_altivec put_h264_chroma_mc8_altivec
#define PREFIX_h264_chroma_mc8_num altivec_put_h264_chroma_mc8_num
#include "h264chroma_template.c"
#undef OP_U8_ALTIVEC
#undef PREFIX_h264_chroma_mc8_altivec
#undef PREFIX_h264_chroma_mc8_num
/* Second inclusion of the template: generates avg_h264_chroma_mc8_altivec(). */
#define OP_U8_ALTIVEC AVG_OP_U8_ALTIVEC
#define PREFIX_h264_chroma_mc8_altivec avg_h264_chroma_mc8_altivec
#define PREFIX_h264_chroma_mc8_num altivec_avg_h264_chroma_mc8_num
#include "h264chroma_template.c"
#undef OP_U8_ALTIVEC
#undef PREFIX_h264_chroma_mc8_altivec
#undef PREFIX_h264_chroma_mc8_num
#endif /* HAVE_ALTIVEC */
/**
 * Install the AltiVec 8-pixel-wide chroma motion-compensation functions.
 *
 * Slot 0 of the put/avg tables is replaced only when the build has AltiVec
 * support, the CPU reports AltiVec at run time and bit_depth is at most 8.
 * In every other case c is left unchanged.
 */
av_cold void ff_h264chroma_init_ppc(H264ChromaContext *c, int bit_depth)
{
#if HAVE_ALTIVEC
    if (!PPC_ALTIVEC(av_get_cpu_flags()))
        return;

    /* The vector code below only handles 8-bit samples. */
    if (bit_depth > 8)
        return;

    c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_altivec;
    c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec;
#endif /* HAVE_ALTIVEC */
}

View File

@@ -0,0 +1,242 @@
/*
* Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/mem.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
/* This code assumes that stride % 16 == 0. */
/*
 * One output row of the four-tap chroma filter.
 *
 * Expands in a scope that provides vsrc0ssH/vsrc1ssH (current row, already
 * widened to 16 bit), vsrc2uc/vsrc3uc (next row, still bytes), the weights
 * vA..vD, v6us, fperm, dst, src and stride.
 * It computes BIAS2(vA*s0 + vB*s1 + vC*s2 + vD*s3 + BIAS1) >> 6, packs the
 * result to bytes, merges it into the 16-byte vector loaded from dst using
 * fperm, applies OP_U8_ALTIVEC and stores the vector back. The widened next
 * row then becomes the current row and dst/src advance by stride.
 */
#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
vsrc2ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc2uc);\
vsrc3ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc3uc);\
\
psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
psum = vec_mladd(vB, vsrc1ssH, psum);\
psum = vec_mladd(vC, vsrc2ssH, psum);\
psum = vec_mladd(vD, vsrc3ssH, psum);\
psum = BIAS2(psum);\
psum = vec_sr(psum, v6us);\
\
vdst = vec_ld(0, dst);\
ppsum = (vec_u8)vec_pack(psum, psum);\
vfdst = vec_perm(vdst, ppsum, fperm);\
\
OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
vec_st(fsum, 0, dst);\
\
vsrc0ssH = vsrc2ssH;\
vsrc1ssH = vsrc3ssH;\
\
dst += stride;\
src += stride;
/*
 * One output row of the two-tap variant of the chroma filter.
 *
 * Expands in a scope that provides vsrc0uc/vsrc1uc (the two byte vectors to
 * blend), the weights vA and vE, v32ss, v6us, fperm, dst, src and stride.
 * It computes (vA*s0 + vE*s1 + 32) >> 6, packs the result to bytes, merges
 * it into the vector loaded from dst using fperm, applies OP_U8_ALTIVEC,
 * stores the vector back and advances dst/src by stride.
 */
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
vsrc0ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc0uc);\
vsrc1ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc1uc);\
\
psum = vec_mladd(vA, vsrc0ssH, v32ss);\
psum = vec_mladd(vE, vsrc1ssH, psum);\
psum = vec_sr(psum, v6us);\
\
vdst = vec_ld(0, dst);\
ppsum = (vec_u8)vec_pack(psum, psum);\
vfdst = vec_perm(vdst, ppsum, fperm);\
\
OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
vec_st(fsum, 0, dst);\
\
dst += stride;\
src += stride;
/* BIAS2 arguments for CHROMA_MC8_ALTIVEC_CORE: leave the sum unchanged, or
 * add v28ss (which must exist in the expanding scope) to it. */
#define noop(a) a
#define add28(a) vec_add(v28ss, a)
/*
 * Source loaders.
 *
 * GET_VSRC1 loads the 16 bytes at s + off into vs0; GET_VSRC additionally
 * loads the 16 bytes at s + off + 1 into vs1.
 *
 * Big-endian builds use aligned vec_ld loads combined with the permute
 * vectors perm0/perm1, and rely on loadSecond and reallyBadAlign being
 * defined in the expanding scope: the second aligned load at off + b is
 * issued only when loadSecond is set, and when reallyBadAlign is set vs1 is
 * taken directly from the second vector.
 * Other builds use vec_vsx_ld on the unaligned address and ignore b, perm0
 * and perm1.
 */
#if HAVE_BIGENDIAN
#define GET_VSRC1(vs0, off, b, perm0, s){ \
vec_u8 vsrcCuc, vsrcDuc; \
vsrcCuc = vec_ld(off, s); \
if (loadSecond){ \
vsrcDuc = vec_ld(off + b, s); \
} else \
vsrcDuc = vsrcCuc; \
\
vs0 = vec_perm(vsrcCuc, vsrcDuc, perm0); \
}
#define GET_VSRC(vs0, vs1, off, b, perm0, perm1, s){ \
vec_u8 vsrcCuc, vsrcDuc; \
vsrcCuc = vec_ld(off, s); \
if (loadSecond){ \
vsrcDuc = vec_ld(off + b, s); \
} else \
vsrcDuc = vsrcCuc; \
\
vs0 = vec_perm(vsrcCuc, vsrcDuc, perm0); \
if (reallyBadAlign){ \
vs1 = vsrcDuc; \
} else \
vs1 = vec_perm(vsrcCuc, vsrcDuc, perm1); \
}
#else
#define GET_VSRC1(vs0, off, b, perm0, s){ \
vs0 = vec_vsx_ld(off, s); \
}
#define GET_VSRC(vs0, vs1, off, b, perm0, perm1, s){ \
vs0 = vec_vsx_ld(off, s); \
vs1 = vec_vsx_ld(off + 1, s); \
}
#endif /* HAVE_BIGENDIAN */
#ifdef PREFIX_h264_chroma_mc8_altivec
/*
 * 8-pixel-wide chroma motion compensation with weights derived from x and y.
 *
 * dst    : destination; results are written with 16-byte aligned vector
 *          stores, replacing either the first or the second 8 bytes of the
 *          line depending on whether dst is 16-byte aligned
 * src    : source pixels
 * stride : distance in bytes between rows of both dst and src
 * h      : number of rows to produce
 * x, y   : values from which the four weights
 *          A=(8-x)(8-y), B=x(8-y), C=(8-x)y, D=xy are computed
 *
 * The store operation (put or average) is selected by OP_U8_ALTIVEC at the
 * point where this template is included.
 */
static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
int stride, int h, int x, int y) {
DECLARE_ALIGNED(16, signed int, ABCD)[4] =
{((8 - x) * (8 - y)),
(( x) * (8 - y)),
((8 - x) * ( y)),
(( x) * ( y))};
register int i;
vec_u8 fperm;
LOAD_ZERO;
/* broadcast each weight into its own 16-bit vector */
const vec_s32 vABCD = vec_ld(0, ABCD);
const vec_s16 vA = VEC_SPLAT16(vABCD, 1);
const vec_s16 vB = VEC_SPLAT16(vABCD, 3);
const vec_s16 vC = VEC_SPLAT16(vABCD, 5);
const vec_s16 vD = VEC_SPLAT16(vABCD, 7);
const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5)); /* 1 << 5 = 32, rounding term */
const vec_u16 v6us = vec_splat_u16(6);
vec_u8 vsrcperm0, vsrcperm1;
vec_u8 vsrc0uc, vsrc1uc;
vec_s16 vsrc0ssH, vsrc1ssH;
vec_u8 vsrc2uc, vsrc3uc;
vec_s16 vsrc2ssH, vsrc3ssH, psum;
vec_u8 vdst, ppsum, vfdst, fsum;
#if HAVE_BIGENDIAN
/* a second aligned load is only requested when src lies in the upper half of its 16-byte line */
register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
/* at offset 15 the vector starting at src + 1 is exactly the second aligned load */
register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
vsrcperm0 = vec_lvsl(0, src);
vsrcperm1 = vec_lvsl(1, src);
#endif
/* fperm picks where the 8 result bytes go inside the 16-byte dst line:
 * bytes 0-7 when dst is 16-byte aligned, bytes 8-15 otherwise */
if (((unsigned long)dst) % 16 == 0) {
fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
0x14, 0x15, 0x16, 0x17,
0x08, 0x09, 0x0A, 0x0B,
0x0C, 0x0D, 0x0E, 0x0F};
} else {
fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
0x04, 0x05, 0x06, 0x07,
0x18, 0x19, 0x1A, 0x1B,
0x1C, 0x1D, 0x1E, 0x1F};
}
/* first row: src and src + 1, widened to 16 bit */
GET_VSRC(vsrc0uc, vsrc1uc, 0, 16, vsrcperm0, vsrcperm1, src);
vsrc0ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc0uc);
vsrc1ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc1uc);
if (ABCD[3]) {
/* D != 0: all four taps are used */
for (i = 0 ; i < h ; i++) {
GET_VSRC(vsrc2uc, vsrc3uc, stride, 16, vsrcperm0, vsrcperm1, src);
CHROMA_MC8_ALTIVEC_CORE(v32ss, noop);
}
} else {
/* D == 0: the second weight is B + C */
const vec_s16 vE = vec_add(vB, vC);
if (ABCD[2]) { // x == 0 B == 0
/* blend each row with the row below it */
for (i = 0 ; i < h ; i++) {
GET_VSRC1(vsrc1uc, stride, 15, vsrcperm0, src);
CHROMA_MC8_ALTIVEC_CORE_SIMPLE;
vsrc0uc = vsrc1uc;
}
} else { // y == 0 C == 0
/* blend each pixel with its right-hand neighbour */
for (i = 0 ; i < h ; i++) {
GET_VSRC(vsrc0uc, vsrc1uc, 0, 15, vsrcperm0, vsrcperm1, src);
CHROMA_MC8_ALTIVEC_CORE_SIMPLE;
}
}
}
}
#endif
/* This code assumes that stride % 16 == 0. */
#ifdef PREFIX_no_rnd_vc1_chroma_mc8_altivec
/*
 * 8-pixel-wide chroma motion compensation that adds 28 instead of 32 before
 * the final shift by 6.
 *
 * dst    : destination; results are written with 16-byte aligned vector
 *          stores, replacing either the first or the second 8 bytes of the
 *          line depending on whether dst is 16-byte aligned
 * src    : source pixels
 * stride : distance in bytes between rows of both dst and src
 * h      : number of rows to produce
 * x, y   : values from which the four weights
 *          A=(8-x)(8-y), B=x(8-y), C=(8-x)y, D=xy are computed
 *
 * Unlike the H.264 variant there are no reduced paths for zero weights: the
 * four-tap core is always used.
 */
static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
DECLARE_ALIGNED(16, signed int, ABCD)[4] =
{((8 - x) * (8 - y)),
(( x) * (8 - y)),
((8 - x) * ( y)),
(( x) * ( y))};
register int i;
vec_u8 fperm;
LOAD_ZERO;
/* broadcast each weight into its own 16-bit vector */
const vec_s32 vABCD = vec_ld(0, ABCD);
const vec_s16 vA = VEC_SPLAT16(vABCD, 1);
const vec_s16 vB = VEC_SPLAT16(vABCD, 3);
const vec_s16 vC = VEC_SPLAT16(vABCD, 5);
const vec_s16 vD = VEC_SPLAT16(vABCD, 7);
const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4)); /* (1 << 5) - 4 = 28 */
const vec_u16 v6us = vec_splat_u16(6);
vec_u8 vsrcperm0, vsrcperm1;
vec_u8 vsrc0uc, vsrc1uc;
vec_s16 vsrc0ssH, vsrc1ssH;
vec_u8 vsrc2uc, vsrc3uc;
vec_s16 vsrc2ssH, vsrc3ssH, psum;
vec_u8 vdst, ppsum, vfdst, fsum;
#if HAVE_BIGENDIAN
/* a second aligned load is only requested when src lies in the upper half of its 16-byte line */
register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
/* at offset 15 the vector starting at src + 1 is exactly the second aligned load */
register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
vsrcperm0 = vec_lvsl(0, src);
vsrcperm1 = vec_lvsl(1, src);
#endif
/* fperm picks where the 8 result bytes go inside the 16-byte dst line:
 * bytes 0-7 when dst is 16-byte aligned, bytes 8-15 otherwise */
if (((unsigned long)dst) % 16 == 0) {
fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
0x14, 0x15, 0x16, 0x17,
0x08, 0x09, 0x0A, 0x0B,
0x0C, 0x0D, 0x0E, 0x0F};
} else {
fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
0x04, 0x05, 0x06, 0x07,
0x18, 0x19, 0x1A, 0x1B,
0x1C, 0x1D, 0x1E, 0x1F};
}
/* first row: src and src + 1, widened to 16 bit */
GET_VSRC(vsrc0uc, vsrc1uc, 0, 16, vsrcperm0, vsrcperm1, src);
vsrc0ssH = (vec_s16)VEC_MERGEH(zero_u8v, (vec_u8)vsrc0uc);
vsrc1ssH = (vec_s16)VEC_MERGEH(zero_u8v, (vec_u8)vsrc1uc);
for (i = 0 ; i < h ; i++) {
GET_VSRC(vsrc2uc, vsrc3uc, stride, 16, vsrcperm0, vsrcperm1, src);
/* the weighted sum starts from 0 and 28 is added afterwards through add28 */
CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28);
}
}
#endif
/* Drop these helper macros again so that the template can be included more
 * than once in the same translation unit. */
#undef noop
#undef add28
#undef CHROMA_MC8_ALTIVEC_CORE

View File

@@ -0,0 +1,810 @@
/*
* Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/h264data.h"
#include "libavcodec/h264dsp.h"
#if HAVE_ALTIVEC
/****************************************************************************
* IDCT transform:
****************************************************************************/
/*
 * One 1-D pass of the 4-point inverse transform.
 * vb0..vb3 are the inputs and va0..va3 receive the outputs; the temporaries
 * vz0..vz3 must be declared in the expanding scope. The expansion has no
 * trailing semicolon.
 */
#define VEC_1D_DCT(vb0,vb1,vb2,vb3,va0,va1,va2,va3) \
/* 1st stage */ \
vz0 = vec_add(vb0,vb2); /* temp[0] = Y[0] + Y[2] */ \
vz1 = vec_sub(vb0,vb2); /* temp[1] = Y[0] - Y[2] */ \
vz2 = vec_sra(vb1,vec_splat_u16(1)); \
vz2 = vec_sub(vz2,vb3); /* temp[2] = Y[1].1/2 - Y[3] */ \
vz3 = vec_sra(vb3,vec_splat_u16(1)); \
vz3 = vec_add(vb1,vz3); /* temp[3] = Y[1] + Y[3].1/2 */ \
/* 2nd stage: output */ \
va0 = vec_add(vz0,vz3); /* x[0] = temp[0] + temp[3] */ \
va1 = vec_add(vz1,vz2); /* x[1] = temp[1] + temp[2] */ \
va2 = vec_sub(vz1,vz2); /* x[2] = temp[1] - temp[2] */ \
va3 = vec_sub(vz0,vz3) /* x[3] = temp[0] - temp[3] */
/*
 * Transpose the 4x4 matrix held in the first four elements of a0..a3 using
 * three rounds of merges. The transposed rows end up in the first four
 * elements of b0..b3; a0..a3 are overwritten with intermediate values.
 * The expansion has no trailing semicolon.
 */
#define VEC_TRANSPOSE_4(a0,a1,a2,a3,b0,b1,b2,b3) \
b0 = vec_mergeh( a0, a0 ); \
b1 = vec_mergeh( a1, a0 ); \
b2 = vec_mergeh( a2, a0 ); \
b3 = vec_mergeh( a3, a0 ); \
a0 = vec_mergeh( b0, b2 ); \
a1 = vec_mergel( b0, b2 ); \
a2 = vec_mergeh( b1, b3 ); \
a3 = vec_mergel( b1, b3 ); \
b0 = vec_mergeh( a0, a2 ); \
b1 = vec_mergel( a0, a2 ); \
b2 = vec_mergeh( a1, a3 ); \
b3 = vec_mergel( a1, a3 )
/*
 * Load the destination pixels starting at dst into vdst.
 * The macro argument d is not used: the expansion refers directly to dst,
 * vdst and (big-endian only) vdst_orig, vdst_mask and zero_u8v of the
 * expanding scope. Big-endian builds do an aligned load followed by a
 * permute; other builds use an unaligned VSX load.
 */
#if HAVE_BIGENDIAN
#define vdst_load(d) \
vdst_orig = vec_ld(0, dst); \
vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask);
#else
#define vdst_load(d) vdst = vec_vsx_ld(0, dst)
#endif
/*
 * Add the 16-bit values in va to the pixels at dst and write four result
 * bytes back: the pixels are loaded and widened to 16 bit, va is added, the
 * sum is packed to unsigned bytes with saturation, and the first 32-bit word
 * of the packed vector is splatted and stored with a single-element store.
 * Relies on dst, element, vdst, vdst_ss, va_u8, va_u32 and the zero vectors
 * being available in the expanding scope.
 */
#define VEC_LOAD_U8_ADD_S16_STORE_U8(va) \
vdst_load(); \
vdst_ss = (vec_s16) VEC_MERGEH(zero_u8v, vdst); \
va = vec_add(va, vdst_ss); \
va_u8 = vec_packsu(va, zero_s16v); \
va_u32 = vec_splat((vec_u32)va_u8, 0); \
vec_ste(va_u32, element, (uint32_t*)dst);
/*
 * 4x4 inverse transform of block, added to four rows of pixels at dst.
 *
 * dst    : destination pixels; four bytes are written per row with a 32-bit
 *          element store
 * block  : 16 coefficients, read with aligned vector loads and then cleared
 * stride : distance in bytes between destination rows
 */
static void h264_idct_add_altivec(uint8_t *dst, int16_t *block, int stride)
{
vec_s16 va0, va1, va2, va3;
vec_s16 vz0, vz1, vz2, vz3;
vec_s16 vtmp0, vtmp1, vtmp2, vtmp3;
vec_u8 va_u8;
vec_u32 va_u32;
vec_s16 vdst_ss;
const vec_u16 v6us = vec_splat_u16(6);
vec_u8 vdst, vdst_orig;
vec_u8 vdst_mask = vec_lvsl(0, dst);
/* index of the 32-bit word of dst inside its 16-byte line; used as the
 * offset argument of vec_ste in the store macro */
int element = ((unsigned long)dst & 0xf) >> 2;
LOAD_ZERO;
block[0] += 32; /* add 32 as a DC-level for rounding */
/* each load fetches eight coefficients; the 8-byte rotation brings the
 * second group of four to the front of its own vector */
vtmp0 = vec_ld(0,block);
vtmp1 = vec_sld(vtmp0, vtmp0, 8);
vtmp2 = vec_ld(16,block);
vtmp3 = vec_sld(vtmp2, vtmp2, 8);
memset(block, 0, 16 * sizeof(int16_t));
/* 1-D pass, transpose, second 1-D pass */
VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);
VEC_TRANSPOSE_4(va0,va1,va2,va3,vtmp0,vtmp1,vtmp2,vtmp3);
VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);
va0 = vec_sra(va0,v6us);
va1 = vec_sra(va1,v6us);
va2 = vec_sra(va2,v6us);
va3 = vec_sra(va3,v6us);
/* add each result row to the destination, one row per stride */
VEC_LOAD_U8_ADD_S16_STORE_U8(va0);
dst += stride;
VEC_LOAD_U8_ADD_S16_STORE_U8(va1);
dst += stride;
VEC_LOAD_U8_ADD_S16_STORE_U8(va2);
dst += stride;
VEC_LOAD_U8_ADD_S16_STORE_U8(va3);
}
/*
 * One 1-D pass of the 8-point inverse transform.
 * s0..s7 are the input vectors and d0..d7 receive the outputs; the shift
 * counts onev and twov must be defined in the expanding scope. The comments
 * inside give the scalar formula that each vector statement implements.
 */
#define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7) {\
/* a0 = SRC(0) + SRC(4); */ \
vec_s16 a0v = vec_add(s0, s4); \
/* a2 = SRC(0) - SRC(4); */ \
vec_s16 a2v = vec_sub(s0, s4); \
/* a4 = (SRC(2)>>1) - SRC(6); */ \
vec_s16 a4v = vec_sub(vec_sra(s2, onev), s6); \
/* a6 = (SRC(6)>>1) + SRC(2); */ \
vec_s16 a6v = vec_add(vec_sra(s6, onev), s2); \
/* b0 = a0 + a6; */ \
vec_s16 b0v = vec_add(a0v, a6v); \
/* b2 = a2 + a4; */ \
vec_s16 b2v = vec_add(a2v, a4v); \
/* b4 = a2 - a4; */ \
vec_s16 b4v = vec_sub(a2v, a4v); \
/* b6 = a0 - a6; */ \
vec_s16 b6v = vec_sub(a0v, a6v); \
/* a1 = SRC(5) - SRC(3) - SRC(7) - (SRC(7)>>1); */ \
/* a1 = (SRC(5)-SRC(3)) - (SRC(7) + (SRC(7)>>1)); */ \
vec_s16 a1v = vec_sub( vec_sub(s5, s3), vec_add(s7, vec_sra(s7, onev)) ); \
/* a3 = SRC(7) + SRC(1) - SRC(3) - (SRC(3)>>1); */ \
/* a3 = (SRC(7)+SRC(1)) - (SRC(3) + (SRC(3)>>1)); */ \
vec_s16 a3v = vec_sub( vec_add(s7, s1), vec_add(s3, vec_sra(s3, onev)) );\
/* a5 = SRC(7) - SRC(1) + SRC(5) + (SRC(5)>>1); */ \
/* a5 = (SRC(7)-SRC(1)) + SRC(5) + (SRC(5)>>1); */ \
vec_s16 a5v = vec_add( vec_sub(s7, s1), vec_add(s5, vec_sra(s5, onev)) );\
/* a7 = SRC(5)+SRC(3) + SRC(1) + (SRC(1)>>1); */ \
vec_s16 a7v = vec_add( vec_add(s5, s3), vec_add(s1, vec_sra(s1, onev)) );\
/* b1 = (a7>>2) + a1; */ \
vec_s16 b1v = vec_add( vec_sra(a7v, twov), a1v); \
/* b3 = a3 + (a5>>2); */ \
vec_s16 b3v = vec_add(a3v, vec_sra(a5v, twov)); \
/* b5 = (a3>>2) - a5; */ \
vec_s16 b5v = vec_sub( vec_sra(a3v, twov), a5v); \
/* b7 = a7 - (a1>>2); */ \
vec_s16 b7v = vec_sub( a7v, vec_sra(a1v, twov)); \
/* DST(0, b0 + b7); */ \
d0 = vec_add(b0v, b7v); \
/* DST(1, b2 + b5); */ \
d1 = vec_add(b2v, b5v); \
/* DST(2, b4 + b3); */ \
d2 = vec_add(b4v, b3v); \
/* DST(3, b6 + b1); */ \
d3 = vec_add(b6v, b1v); \
/* DST(4, b6 - b1); */ \
d4 = vec_sub(b6v, b1v); \
/* DST(5, b4 - b3); */ \
d5 = vec_sub(b4v, b3v); \
/* DST(6, b2 - b5); */ \
d6 = vec_sub(b2v, b5v); \
/* DST(7, b0 - b7); */ \
d7 = vec_sub(b0v, b7v); \
}
/*
 * Endian-specific pieces used by ALTIVEC_STORE_SUM_CLIP.
 *
 * GET_2PERM           computes the load/store permute vectors for d
 *                     (expands to an empty block on non-big-endian builds).
 * dstv_load           declares dstv and fills it with the bytes at d; the
 *                     big-endian form also declares hv and lv, the two
 *                     aligned vectors covering d.
 * dest_unligned_store writes the result bytes held in idstsum8 back to d.
 *                     The big-endian form merges them into lv and hv with
 *                     vec_sel under the mask sel and stores both vectors;
 *                     the other form combines idstsum8 with dstv and issues
 *                     one 16-byte unaligned store.
 *
 * The macros declare variables and read perm_ldv, perm_stv, sel, idstsum8
 * and zero_u8v from the expanding scope, so they are only usable inside a
 * block such as the one ALTIVEC_STORE_SUM_CLIP opens.
 */
#if HAVE_BIGENDIAN
#define GET_2PERM(ldv, stv, d) \
ldv = vec_lvsl(0, d); \
stv = vec_lvsr(8, d);
#define dstv_load(d) \
vec_u8 hv = vec_ld( 0, d ); \
vec_u8 lv = vec_ld( 7, d); \
vec_u8 dstv = vec_perm( hv, lv, (vec_u8)perm_ldv );
#define dest_unligned_store(d) \
vec_u8 edgehv; \
vec_u8 bodyv = vec_perm( idstsum8, idstsum8, perm_stv ); \
vec_u8 edgelv = vec_perm( sel, zero_u8v, perm_stv ); \
lv = vec_sel( lv, bodyv, edgelv ); \
vec_st( lv, 7, d ); \
hv = vec_ld( 0, d ); \
edgehv = vec_perm( zero_u8v, sel, perm_stv ); \
hv = vec_sel( hv, bodyv, edgehv ); \
vec_st( hv, 0, d );
#else
#define GET_2PERM(ldv, stv, d) {}
#define dstv_load(d) vec_u8 dstv = vec_vsx_ld(0, d)
#define dest_unligned_store(d)\
vec_u8 dst8 = vec_perm((vec_u8)idstsum8, dstv, vcprm(2,3,s2,s3));\
vec_vsx_st(dst8, 0, d)
#endif /* HAVE_BIGENDIAN */
/*
 * Add one row of transform output to the pixels at dest and store it back.
 * idctv is shifted right by 6 (sixv), added with signed saturation to the
 * destination pixels widened to 16 bit, packed to unsigned bytes with
 * saturation and written through dest_unligned_store. perm_ldv, perm_stv
 * and sel are only consulted by the big-endian load/store helpers.
 */
#define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel) { \
/* unaligned load */ \
dstv_load(dest); \
vec_s16 idct_sh6 = vec_sra(idctv, sixv); \
vec_u16 dst16 = (vec_u16)VEC_MERGEH(zero_u8v, dstv); \
vec_s16 idstsum = vec_adds(idct_sh6, (vec_s16)dst16); \
vec_u8 idstsum8 = vec_packsu(zero_s16v, idstsum); \
/* unaligned store */ \
dest_unligned_store(dest);\
}
/*
 * 8x8 inverse transform of dct, added to eight rows of pixels at dst.
 *
 * dst    : destination pixels, eight bytes per row
 * dct    : 64 coefficients, read with aligned vector loads and then cleared
 * stride : distance in bytes between destination rows
 */
static void h264_idct8_add_altivec(uint8_t *dst, int16_t *dct, int stride)
{
vec_s16 s0, s1, s2, s3, s4, s5, s6, s7;
vec_s16 d0, d1, d2, d3, d4, d5, d6, d7;
vec_s16 idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;
vec_u8 perm_ldv, perm_stv;
/* only big-endian builds fill the permute vectors; elsewhere they stay unused */
GET_2PERM(perm_ldv, perm_stv, dst);
const vec_u16 onev = vec_splat_u16(1);
const vec_u16 twov = vec_splat_u16(2);
const vec_u16 sixv = vec_splat_u16(6);
/* byte mask: zero for the first eight bytes, all ones for the last eight */
const vec_u8 sel = (vec_u8) {0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1};
LOAD_ZERO;
dct[0] += 32; // rounding for the >>6 at the end
/* one vector per row of eight coefficients */
s0 = vec_ld(0x00, (int16_t*)dct);
s1 = vec_ld(0x10, (int16_t*)dct);
s2 = vec_ld(0x20, (int16_t*)dct);
s3 = vec_ld(0x30, (int16_t*)dct);
s4 = vec_ld(0x40, (int16_t*)dct);
s5 = vec_ld(0x50, (int16_t*)dct);
s6 = vec_ld(0x60, (int16_t*)dct);
s7 = vec_ld(0x70, (int16_t*)dct);
memset(dct, 0, 64 * sizeof(int16_t));
/* 1-D pass, transpose, second 1-D pass */
IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7,
d0, d1, d2, d3, d4, d5, d6, d7);
TRANSPOSE8( d0, d1, d2, d3, d4, d5, d6, d7 );
IDCT8_1D_ALTIVEC(d0, d1, d2, d3, d4, d5, d6, d7,
idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7);
/* shift, add to the destination and store, one row at a time */
ALTIVEC_STORE_SUM_CLIP(&dst[0*stride], idct0, perm_ldv, perm_stv, sel);
ALTIVEC_STORE_SUM_CLIP(&dst[1*stride], idct1, perm_ldv, perm_stv, sel);
ALTIVEC_STORE_SUM_CLIP(&dst[2*stride], idct2, perm_ldv, perm_stv, sel);
ALTIVEC_STORE_SUM_CLIP(&dst[3*stride], idct3, perm_ldv, perm_stv, sel);
ALTIVEC_STORE_SUM_CLIP(&dst[4*stride], idct4, perm_ldv, perm_stv, sel);
ALTIVEC_STORE_SUM_CLIP(&dst[5*stride], idct5, perm_ldv, perm_stv, sel);
ALTIVEC_STORE_SUM_CLIP(&dst[6*stride], idct6, perm_ldv, perm_stv, sel);
ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel);
}
/* Load used for destination rows by the DC-only functions: the aligned
 * vec_ld on big-endian builds, the unaligned vec_vsx_ld otherwise. */
#if HAVE_BIGENDIAN
#define DST_LD vec_ld
#else
#define DST_LD vec_vsx_ld
#endif
/*
 * Add the DC-only value (block[0] + 32) >> 6 to a square block of pixels.
 *
 * dst    : destination pixels
 * block  : coefficient block; only block[0] is read, and it is reset to 0
 * stride : distance in bytes between destination rows
 * size   : number of rows to process; the loop handles four rows per
 *          iteration, and size == 4 additionally shifts the DC vector
 *          against zeros before packing (presumably so that only four
 *          pixels per row are changed -- confirm against VEC_SLD16)
 */
static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, int16_t *block, int stride, int size)
{
vec_s16 dc16;
vec_u8 dcplus, dcminus, v0, v1, v2, v3, aligner;
vec_s32 v_dc32;
LOAD_ZERO;
DECLARE_ALIGNED(16, int, dc);
int i;
dc = (block[0] + 32) >> 6;
block[0] = 0;
v_dc32 = vec_lde(0, &dc);
dc16 = VEC_SPLAT16((vec_s16)v_dc32, 1);
if (size == 4)
dc16 = VEC_SLD16(dc16, zero_s16v, 8);
/* The signed DC is split into two unsigned byte vectors: dcplus holds dc
 * saturated to 0..255 and dcminus holds -dc saturated to 0..255, so one of
 * them is zero. Adding the first and subtracting the second, both with
 * unsigned saturation, yields the clipped sum. */
dcplus = vec_packsu(dc16, zero_s16v);
dcminus = vec_packsu(vec_sub(zero_s16v, dc16), zero_s16v);
#if HAVE_BIGENDIAN
/* rotate the DC bytes to line up with dst inside its aligned 16-byte line */
aligner = vec_lvsr(0, dst);
dcplus = vec_perm(dcplus, dcplus, aligner);
dcminus = vec_perm(dcminus, dcminus, aligner);
#endif
for (i = 0; i < size; i += 4) {
v0 = DST_LD(0, dst+0*stride);
v1 = DST_LD(0, dst+1*stride);
v2 = DST_LD(0, dst+2*stride);
v3 = DST_LD(0, dst+3*stride);
v0 = vec_adds(v0, dcplus);
v1 = vec_adds(v1, dcplus);
v2 = vec_adds(v2, dcplus);
v3 = vec_adds(v3, dcplus);
v0 = vec_subs(v0, dcminus);
v1 = vec_subs(v1, dcminus);
v2 = vec_subs(v2, dcminus);
v3 = vec_subs(v3, dcminus);
VEC_ST(v0, 0, dst+0*stride);
VEC_ST(v1, 0, dst+1*stride);
VEC_ST(v2, 0, dst+2*stride);
VEC_ST(v3, 0, dst+3*stride);
dst += 4*stride;
}
}
/* DC-only add for a block of size 4: forwards to the shared helper with a
 * constant size so the always-inline body can be specialised. */
static void h264_idct_dc_add_altivec(uint8_t *dst, int16_t *block, int stride)
{
h264_idct_dc_add_internal(dst, block, stride, 4);
}
/* DC-only add for a block of size 8: forwards to the shared helper with a
 * constant size so the always-inline body can be specialised. */
static void h264_idct8_dc_add_altivec(uint8_t *dst, int16_t *block, int stride)
{
h264_idct_dc_add_internal(dst, block, stride, 8);
}
/**
 * Run the 4x4 inverse transform on each of 16 blocks that has coefficients.
 *
 * For block i the count nnzc[scan8[i]] decides what happens: zero skips the
 * block; a count of one together with a non-zero first coefficient takes
 * the DC-only path; anything else takes the full transform.
 *
 * @param dst          base of the destination picture area
 * @param block_offset per-block offset added to dst
 * @param block        coefficients, 16 per block
 * @param stride       distance in bytes between destination rows
 * @param nnzc         per-block coefficient counts, indexed through scan8
 */
static void h264_idct_add16_altivec(uint8_t *dst, const int *block_offset,
                                    int16_t *block, int stride,
                                    const uint8_t nnzc[15 * 8])
{
    int idx;

    for (idx = 0; idx < 16; idx++) {
        const int coeffs = nnzc[scan8[idx]];
        int16_t *blk;
        uint8_t *out;

        if (!coeffs)
            continue;

        blk = block + idx * 16;
        out = dst + block_offset[idx];

        if (coeffs == 1 && blk[0])
            h264_idct_dc_add_altivec(out, blk, stride);
        else
            h264_idct_add_altivec(out, blk, stride);
    }
}
/**
 * Variant of the 16-block 4x4 inverse transform with a different selection
 * rule.
 *
 * A block with a non-zero nnzc[scan8[i]] always gets the full transform.
 * Otherwise the DC-only path runs when the block's first coefficient is
 * non-zero, and the block is skipped when it is zero.
 *
 * @param dst          base of the destination picture area
 * @param block_offset per-block offset added to dst
 * @param block        coefficients, 16 per block
 * @param stride       distance in bytes between destination rows
 * @param nnzc         per-block coefficient counts, indexed through scan8
 */
static void h264_idct_add16intra_altivec(uint8_t *dst, const int *block_offset,
                                         int16_t *block, int stride,
                                         const uint8_t nnzc[15 * 8])
{
    int idx;

    for (idx = 0; idx < 16; idx++) {
        int16_t *blk = block + idx * 16;

        if (nnzc[scan8[idx]]) {
            h264_idct_add_altivec(dst + block_offset[idx], blk, stride);
            continue;
        }
        if (blk[0])
            h264_idct_dc_add_altivec(dst + block_offset[idx], blk, stride);
    }
}
/**
 * Run the 8x8 inverse transform on four blocks.
 *
 * The blocks are addressed with indices 0, 4, 8 and 12. For each one,
 * nnzc[scan8[i]] == 0 skips the block; a count of one together with a
 * non-zero first coefficient takes the DC-only path; anything else takes
 * the full 8x8 transform.
 *
 * @param dst          base of the destination picture area
 * @param block_offset per-block offset added to dst
 * @param block        coefficients; block i starts at block + i*16
 * @param stride       distance in bytes between destination rows
 * @param nnzc         per-block coefficient counts, indexed through scan8
 */
static void h264_idct8_add4_altivec(uint8_t *dst, const int *block_offset,
                                    int16_t *block, int stride,
                                    const uint8_t nnzc[15 * 8])
{
    int idx = 0;

    while (idx < 16) {
        const int coeffs = nnzc[scan8[idx]];

        if (coeffs) {
            int16_t *blk = block + idx * 16;
            uint8_t *out = dst + block_offset[idx];

            if (coeffs == 1 && blk[0])
                h264_idct8_dc_add_altivec(out, blk, stride);
            else
                h264_idct8_add_altivec(out, blk, stride);
        }
        idx += 4;
    }
}
/**
 * Run the 4x4 inverse transform on two groups of four blocks, each group
 * written to its own destination plane.
 *
 * Group 0 covers block indices 16..19 and writes to dest[0]; group 1 covers
 * indices 32..35 and writes to dest[1]. A block with a non-zero
 * nnzc[scan8[i]] gets the full transform; otherwise the DC-only path runs
 * if its first coefficient is non-zero.
 *
 * @param dest         two destination plane pointers
 * @param block_offset per-block offset added to the plane pointer
 * @param block        coefficients, 16 per block
 * @param stride       distance in bytes between destination rows
 * @param nnzc         per-block coefficient counts, indexed through scan8
 */
static void h264_idct_add8_altivec(uint8_t **dest, const int *block_offset,
                                   int16_t *block, int stride,
                                   const uint8_t nnzc[15 * 8])
{
    int plane, idx;

    for (plane = 0; plane < 2; plane++) {
        const int first = (plane + 1) * 16;

        for (idx = first; idx < first + 4; idx++) {
            int16_t *blk = block + idx * 16;

            if (nnzc[scan8[idx]]) {
                h264_idct_add_altivec(dest[plane] + block_offset[idx],
                                      blk, stride);
            } else if (blk[0]) {
                h264_idct_dc_add_altivec(dest[plane] + block_offset[idx],
                                         blk, stride);
            }
        }
    }
}
/*
 * In-place byte transpose of four 16-byte vectors using two rounds of
 * merges: after the macro, r0..r3 hold the input bytes regrouped so that
 * each run of four consecutive bytes contains one byte from each of the
 * original r0, r1, r2 and r3, in that order.
 */
#define transpose4x16(r0, r1, r2, r3) { \
register vec_u8 r4; \
register vec_u8 r5; \
register vec_u8 r6; \
register vec_u8 r7; \
\
r4 = vec_mergeh(r0, r2); /*0, 2 set 0*/ \
r5 = vec_mergel(r0, r2); /*0, 2 set 1*/ \
r6 = vec_mergeh(r1, r3); /*1, 3 set 0*/ \
r7 = vec_mergel(r1, r3); /*1, 3 set 1*/ \
\
r0 = vec_mergeh(r4, r6); /*all set 0*/ \
r1 = vec_mergel(r4, r6); /*all set 1*/ \
r2 = vec_mergeh(r5, r7); /*all set 2*/ \
r3 = vec_mergel(r5, r7); /*all set 3*/ \
}
/**
 * Write four vectors out as sixteen 4-byte rows.
 *
 * The vectors are spilled to an aligned 64-byte scratch buffer, then each
 * consecutive 32-bit word of that buffer is copied to the start of the
 * next destination row.
 *
 * @param dst        first destination row, accessed as uint32_t
 * @param dst_stride distance in bytes between rows; it is divided by four
 *                   to obtain the row step in 32-bit words
 * @param r0,r1,r2,r3 vectors supplying rows 0-3, 4-7, 8-11 and 12-15
 */
static inline void write16x4(uint8_t *dst, int dst_stride,
                             register vec_u8 r0, register vec_u8 r1,
                             register vec_u8 r2, register vec_u8 r3) {
    DECLARE_ALIGNED(16, unsigned char, result)[64];
    uint32_t *words = (uint32_t *)result;
    uint32_t *out   = (uint32_t *)dst;
    const int word_stride = dst_stride/4;
    int row;

    vec_st(r0, 0,  result);
    vec_st(r1, 16, result);
    vec_st(r2, 32, result);
    vec_st(r3, 48, result);

    /* FIXME: there has to be a better way!!!! */
    for (row = 0; row < 16; row++)
        out[row * word_stride] = words[row];
}
/** @brief Reads 16 rows starting at src (src_stride bytes apart) and
transposes them: on exit r8..r13 hold the first six columns, each as a
16-byte vector with one byte per input row. The remaining columns are not
computed.
@todo FIXME: see if we can spare some vec_lvsl() by factoring them
out of unaligned_load() */
#define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\
register vec_u8 r0 = unaligned_load(0, src); \
register vec_u8 r1 = unaligned_load( src_stride, src); \
register vec_u8 r2 = unaligned_load(2* src_stride, src); \
register vec_u8 r3 = unaligned_load(3* src_stride, src); \
register vec_u8 r4 = unaligned_load(4* src_stride, src); \
register vec_u8 r5 = unaligned_load(5* src_stride, src); \
register vec_u8 r6 = unaligned_load(6* src_stride, src); \
register vec_u8 r7 = unaligned_load(7* src_stride, src); \
register vec_u8 r14 = unaligned_load(14*src_stride, src); \
register vec_u8 r15 = unaligned_load(15*src_stride, src); \
\
r8 = unaligned_load( 8*src_stride, src); \
r9 = unaligned_load( 9*src_stride, src); \
r10 = unaligned_load(10*src_stride, src); \
r11 = unaligned_load(11*src_stride, src); \
r12 = unaligned_load(12*src_stride, src); \
r13 = unaligned_load(13*src_stride, src); \
\
/*Merge first pairs*/ \
r0 = vec_mergeh(r0, r8); /*0, 8*/ \
r1 = vec_mergeh(r1, r9); /*1, 9*/ \
r2 = vec_mergeh(r2, r10); /*2,10*/ \
r3 = vec_mergeh(r3, r11); /*3,11*/ \
r4 = vec_mergeh(r4, r12); /*4,12*/ \
r5 = vec_mergeh(r5, r13); /*5,13*/ \
r6 = vec_mergeh(r6, r14); /*6,14*/ \
r7 = vec_mergeh(r7, r15); /*7,15*/ \
\
/*Merge second pairs*/ \
r8 = vec_mergeh(r0, r4); /*0,4, 8,12 set 0*/ \
r9 = vec_mergel(r0, r4); /*0,4, 8,12 set 1*/ \
r10 = vec_mergeh(r1, r5); /*1,5, 9,13 set 0*/ \
r11 = vec_mergel(r1, r5); /*1,5, 9,13 set 1*/ \
r12 = vec_mergeh(r2, r6); /*2,6,10,14 set 0*/ \
r13 = vec_mergel(r2, r6); /*2,6,10,14 set 1*/ \
r14 = vec_mergeh(r3, r7); /*3,7,11,15 set 0*/ \
r15 = vec_mergel(r3, r7); /*3,7,11,15 set 1*/ \
\
/*Third merge*/ \
r0 = vec_mergeh(r8, r12); /*0,2,4,6,8,10,12,14 set 0*/ \
r1 = vec_mergel(r8, r12); /*0,2,4,6,8,10,12,14 set 1*/ \
r2 = vec_mergeh(r9, r13); /*0,2,4,6,8,10,12,14 set 2*/ \
r4 = vec_mergeh(r10, r14); /*1,3,5,7,9,11,13,15 set 0*/ \
r5 = vec_mergel(r10, r14); /*1,3,5,7,9,11,13,15 set 1*/ \
r6 = vec_mergeh(r11, r15); /*1,3,5,7,9,11,13,15 set 2*/ \
/* Don't need to compute 3 and 7*/ \
\
/*Final merge*/ \
r8 = vec_mergeh(r0, r4); /*all set 0*/ \
r9 = vec_mergel(r0, r4); /*all set 1*/ \
r10 = vec_mergeh(r1, r5); /*all set 2*/ \
r11 = vec_mergel(r1, r5); /*all set 3*/ \
r12 = vec_mergeh(r2, r6); /*all set 4*/ \
r13 = vec_mergel(r2, r6); /*all set 5*/ \
/* Don't need to compute 14 and 15*/ \
\
}
/* Per-byte test |x - y| < a; each result byte is all ones where it holds
 * and zero elsewhere. */
static inline vec_u8 diff_lt_altivec ( register vec_u8 x,
                                       register vec_u8 y,
                                       register vec_u8 a) {
    /* Of the two saturating differences one is always zero, so OR-ing them
     * gives the absolute difference. */
    const vec_u8 abs_diff = vec_or(vec_subs(x, y), vec_subs(y, x));

    return (vec_u8)vec_cmplt(abs_diff, a);
}
/* Per-byte filter mask: all ones where |p0 - q0| < alpha, |p1 - p0| < beta
 * and |q1 - q0| < beta all hold, zero elsewhere. */
static inline vec_u8 h264_deblock_mask ( register vec_u8 p0,
                                         register vec_u8 p1,
                                         register vec_u8 q0,
                                         register vec_u8 q1,
                                         register vec_u8 alpha,
                                         register vec_u8 beta) {
    const vec_u8 edge_ok = diff_lt_altivec(p0, q0, alpha);
    const vec_u8 p_ok    = diff_lt_altivec(p1, p0, beta);
    const vec_u8 q_ok    = diff_lt_altivec(q1, q0, beta);

    return vec_and(vec_and(edge_ok, p_ok), q_ok);
}
/* Returns clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1 - tc0, p1 + tc0),
 * computed per byte; the clip bounds use saturating arithmetic. */
static inline vec_u8 h264_deblock_q1(register vec_u8 p0,
                                     register vec_u8 p1,
                                     register vec_u8 p2,
                                     register vec_u8 q0,
                                     register vec_u8 tc0) {
    /* (p0 + q0 + 1) >> 1 */
    const vec_u8 avg_pq = vec_avg(p0, q0);
    /* vec_avg rounds up; (p2 ^ avg_pq) & 1 is exactly the amount by which
     * the second average overshoots the truncating one. */
    const vec_u8 round_fix = vec_and(vec_xor(avg_pq, p2), vec_splat_u8(1));
    /* (p2 + ((p0 + q0 + 1) >> 1)) >> 1 */
    const vec_u8 unclipped = vec_subs(vec_avg(avg_pq, p2), round_fix);
    const vec_u8 lower     = vec_subs(p1, tc0);
    const vec_u8 upper     = vec_adds(p1, tc0);

    return vec_min(upper, vec_max(lower, unclipped));
}
/*
 * Update p0 and q0 in place with the edge-filter delta.
 *
 * The signed delta is computed with unsigned byte operations around a bias
 * of 160 (see the inline comments), split into a positive part (delta) and
 * a negative part (deltaneg), each limited to tc0masked, and then applied
 * to p0 and q0 with saturating adds and subtracts. p1 and q1 are only read.
 */
#define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) { \
\
const vec_u8 A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4)); \
\
register vec_u8 pq0bit = vec_xor(p0,q0); \
register vec_u8 q1minus; \
register vec_u8 p0minus; \
register vec_u8 stage1; \
register vec_u8 stage2; \
register vec_u8 vec160; \
register vec_u8 delta; \
register vec_u8 deltaneg; \
\
q1minus = vec_nor(q1, q1); /* 255 - q1 */ \
stage1 = vec_avg(p1, q1minus); /* (p1 - q1 + 256)>>1 */ \
stage2 = vec_sr(stage1, vec_splat_u8(1)); /* (p1 - q1 + 256)>>2 = 64 + (p1 - q1) >> 2 */ \
p0minus = vec_nor(p0, p0); /* 255 - p0 */ \
stage1 = vec_avg(q0, p0minus); /* (q0 - p0 + 256)>>1 */ \
pq0bit = vec_and(pq0bit, vec_splat_u8(1)); \
stage2 = vec_avg(stage2, pq0bit); /* 32 + ((q0 - p0)&1 + (p1 - q1) >> 2 + 1) >> 1 */ \
stage2 = vec_adds(stage2, stage1); /* 160 + ((p0 - q0) + (p1 - q1) >> 2 + 1) >> 1 */ \
vec160 = vec_ld(0, &A0v); \
deltaneg = vec_subs(vec160, stage2); /* -d */ \
delta = vec_subs(stage2, vec160); /* d */ \
deltaneg = vec_min(tc0masked, deltaneg); \
delta = vec_min(tc0masked, delta); \
p0 = vec_subs(p0, deltaneg); \
q0 = vec_subs(q0, delta); \
p0 = vec_adds(p0, delta); \
q0 = vec_adds(q0, deltaneg); \
}
#define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) { \
DECLARE_ALIGNED(16, unsigned char, temp)[16]; \
register vec_u8 alphavec; \
register vec_u8 betavec; \
register vec_u8 mask; \
register vec_u8 p1mask; \
register vec_u8 q1mask; \
register vector signed char tc0vec; \
register vec_u8 finaltc0; \
register vec_u8 tc0masked; \
register vec_u8 newp1; \
register vec_u8 newq1; \
\
temp[0] = alpha; \
temp[1] = beta; \
alphavec = vec_ld(0, temp); \
betavec = vec_splat(alphavec, 0x1); \
alphavec = vec_splat(alphavec, 0x0); \
mask = h264_deblock_mask(p0, p1, q0, q1, alphavec, betavec); /*if in block */ \
\
AV_COPY32(temp, tc0); \
tc0vec = vec_ld(0, (signed char*)temp); \
tc0vec = vec_mergeh(tc0vec, tc0vec); \
tc0vec = vec_mergeh(tc0vec, tc0vec); \
mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1))); /* if tc0[i] >= 0 */ \
finaltc0 = vec_and((vec_u8)tc0vec, mask); /* tc = tc0 */ \
\
p1mask = diff_lt_altivec(p2, p0, betavec); \
p1mask = vec_and(p1mask, mask); /* if ( |p2 - p0| < beta) */ \
tc0masked = vec_and(p1mask, (vec_u8)tc0vec); \
finaltc0 = vec_sub(finaltc0, p1mask); /* tc++ */ \
newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked); \
/*end if*/ \
\
q1mask = diff_lt_altivec(q2, q0, betavec); \
q1mask = vec_and(q1mask, mask); /* if ( |q2 - q0| < beta ) */\
tc0masked = vec_and(q1mask, (vec_u8)tc0vec); \
finaltc0 = vec_sub(finaltc0, q1mask); /* tc++ */ \
newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked); \
/*end if*/ \
\
h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0); \
p1 = newp1; \
q1 = newq1; \
}
/*
 * Filter a horizontal luma edge (the filter runs vertically across it).
 *
 * pix points at the first q0 sample; the rows at -3*stride .. +2*stride are
 * read with aligned vector loads and rows -2*stride .. +stride are written
 * back with aligned vector stores.
 */
static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {
    register vec_u8 p2, p1, p0, q0, q1, q2;

    /* The AND of the four values is negative only when all four tc0 entries
     * are negative, i.e. when no part of the edge is to be filtered. */
    if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0)
        return;

    p2 = vec_ld(-3*stride, pix);
    p1 = vec_ld(-2*stride, pix);
    p0 = vec_ld(-1*stride, pix);
    q0 = vec_ld(0, pix);
    q1 = vec_ld(stride, pix);
    q2 = vec_ld(2*stride, pix);

    h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0);

    vec_st(p1, -2*stride, pix);
    vec_st(p0, -1*stride, pix);
    vec_st(q0, 0, pix);
    vec_st(q1, stride, pix);
}
/*
 * Filter a vertical luma edge (the filter runs horizontally across it).
 *
 * Six columns starting at pix-3 are read and transposed into six vectors,
 * filtered with the same macro as the vertical case, and the four columns
 * that the filter can modify are transposed back and written at pix-2.
 */
static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {
    /* Non-negative AND means at least one tc0 entry is >= 0, so there is
     * something to filter; otherwise the function does nothing. */
    if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) {
        register vec_u8 line0, line1, line2, line3, line4, line5;

        readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5);
        h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0);
        transpose4x16(line1, line2, line3, line4);
        write16x4(pix-2, stride, line1, line2, line3, line4);
    }
}
/*
 * Explicit weighted prediction on a block, in place:
 *
 *   block[x] = clip_uint8((block[x] * weight + rounded_offset) >> log2_denom)
 *
 * block      first row of the block; every row is accessed through one
 *            aligned 16-byte vector load/store
 * stride     distance in bytes between rows
 * height     number of rows
 * log2_denom right shift applied after weighting
 * weight     multiplicative weight
 * offset     additive offset (may be negative)
 * w          block width, 16 or 8
 *
 * For w == 8 only one half of the aligned vector is weighted: the first
 * half when block is 16-byte aligned, the second half otherwise. The other
 * half is zero-extended and re-packed unchanged.
 * NOTE(review): this presumes an 8-wide block is at least 8-byte aligned;
 * confirm against the callers.
 */
static av_always_inline
void weight_h264_W_altivec(uint8_t *block, int stride, int height,
                           int log2_denom, int weight, int offset, int w)
{
    int y, aligned;
    vec_u8 vblock;
    vec_s16 vtemp, vweight, voffset, v0, v1;
    vec_u16 vlog2_denom;
    DECLARE_ALIGNED(16, int32_t, temp)[4];
    LOAD_ZERO;

    /* Shift in unsigned arithmetic: offset can be negative and left-shifting
     * a negative signed value is undefined behaviour. */
    offset = (unsigned)offset << log2_denom;
    if (log2_denom)
        offset += 1 << (log2_denom - 1);   /* rounding term for the final shift */

    temp[0] = log2_denom;
    temp[1] = weight;
    temp[2] = offset;
    temp[3] = 0;   /* lane is never used, but the whole vector is loaded below */
    vtemp = (vec_s16)vec_ld(0, temp);
#if !HAVE_BIGENDIAN
    vtemp =(vec_s16)vec_perm(vtemp, vtemp, vcswapi2s(0,1,2,3));
#endif
    /* odd 16-bit lanes carry the low halves of the 32-bit parameters */
    vlog2_denom = (vec_u16)vec_splat(vtemp, 1);
    vweight     = vec_splat(vtemp, 3);
    voffset     = vec_splat(vtemp, 5);
    aligned = !((unsigned long)block & 0xf);

    for (y = 0; y < height; y++) {
        vblock = vec_ld(0, block);

        /* zero-extend the 16 bytes to two vectors of eight 16-bit values */
        v0 = (vec_s16)VEC_MERGEH(zero_u8v, vblock);
        v1 = (vec_s16)VEC_MERGEL(zero_u8v, vblock);

        if (w == 16 || aligned) {
            v0 = vec_mladd(v0, vweight, zero_s16v);
            v0 = vec_adds(v0, voffset);
            v0 = vec_sra(v0, vlog2_denom);
        }
        if (w == 16 || !aligned) {
            v1 = vec_mladd(v1, vweight, zero_s16v);
            v1 = vec_adds(v1, voffset);
            v1 = vec_sra(v1, vlog2_denom);
        }

        /* pack back to bytes with unsigned saturation */
        vblock = vec_packsu(v0, v1);
        vec_st(vblock, 0, block);

        block += stride;
    }
}
/*
 * Bidirectional weighted prediction, result written to dst:
 *
 *   dst[x] = clip_uint8((dst[x] * weightd + src[x] * weights + o)
 *                       >> (log2_denom + 1))
 *   with o = ((offset + 1) | 1) << log2_denom
 *
 * dst, src   first rows of the two blocks; every row is accessed through
 *            one aligned 16-byte vector load (and store, for dst)
 * stride     distance in bytes between rows, shared by dst and src
 * height     number of rows
 * log2_denom base right shift (one more bit is shifted out for the sum of
 *            the two predictions)
 * weightd    weight applied to dst
 * weights    weight applied to src
 * offset     additive offset (may be negative)
 * w          block width, 16 or 8
 *
 * For w == 8 the half of the aligned src vector that holds the block
 * (first half if src is 16-byte aligned, second half otherwise) is used
 * for whichever half of dst is being processed; the other half of dst is
 * written back unchanged.
 * NOTE(review): this presumes 8-wide blocks are at least 8-byte aligned;
 * confirm against the callers.
 */
static av_always_inline
void biweight_h264_W_altivec(uint8_t *dst, uint8_t *src, int stride, int height,
                             int log2_denom, int weightd, int weights, int offset, int w)
{
    int y, dst_aligned, src_aligned;
    vec_u8 vsrc, vdst;
    vec_s16 vtemp, vweights, vweightd, voffset, v0, v1, v2, v3;
    vec_u16 vlog2_denom;
    DECLARE_ALIGNED(16, int32_t, temp)[4];
    LOAD_ZERO;

    /* Shift in unsigned arithmetic: the value can be negative and
     * left-shifting a negative signed value is undefined behaviour. */
    offset = (unsigned)((offset + 1) | 1) << log2_denom;

    temp[0] = log2_denom+1;
    temp[1] = weights;
    temp[2] = weightd;
    temp[3] = offset;
    vtemp = (vec_s16)vec_ld(0, temp);
#if !HAVE_BIGENDIAN
    vtemp =(vec_s16)vec_perm(vtemp, vtemp, vcswapi2s(0,1,2,3));
#endif
    /* odd 16-bit lanes carry the low halves of the 32-bit parameters */
    vlog2_denom = (vec_u16)vec_splat(vtemp, 1);
    vweights    = vec_splat(vtemp, 3);
    vweightd    = vec_splat(vtemp, 5);
    voffset     = vec_splat(vtemp, 7);
    dst_aligned = !((unsigned long)dst & 0xf);
    src_aligned = !((unsigned long)src & 0xf);

    for (y = 0; y < height; y++) {
        vdst = vec_ld(0, dst);
        vsrc = vec_ld(0, src);

        /* zero-extend both inputs to 16-bit lanes */
        v0 = (vec_s16)VEC_MERGEH(zero_u8v, vdst);
        v1 = (vec_s16)VEC_MERGEL(zero_u8v, vdst);
        v2 = (vec_s16)VEC_MERGEH(zero_u8v, vsrc);
        v3 = (vec_s16)VEC_MERGEL(zero_u8v, vsrc);

        if (w == 8) {
            /* make the valid src half available to either dst half */
            if (src_aligned)
                v3 = v2;
            else
                v2 = v3;
        }

        if (w == 16 || dst_aligned) {
            v0 = vec_mladd(v0, vweightd, zero_s16v);
            v2 = vec_mladd(v2, vweights, zero_s16v);

            v0 = vec_adds(v0, voffset);
            v0 = vec_adds(v0, v2);
            v0 = vec_sra(v0, vlog2_denom);
        }
        if (w == 16 || !dst_aligned) {
            v1 = vec_mladd(v1, vweightd, zero_s16v);
            v3 = vec_mladd(v3, vweights, zero_s16v);

            v1 = vec_adds(v1, voffset);
            v1 = vec_adds(v1, v3);
            v1 = vec_sra(v1, vlog2_denom);
        }

        /* pack back to bytes with unsigned saturation */
        vdst = vec_packsu(v0, v1);
        vec_st(vdst, 0, dst);

        dst += stride;
        src += stride;
    }
}
/*
 * Emit the fixed-width entry points
 *   weight_h264_pixels<WIDTH>_altivec()   and
 *   biweight_h264_pixels<WIDTH>_altivec()
 * as thin wrappers that forward to the always-inline generic
 * implementations with the width passed as a constant.
 */
#define H264_WEIGHT(WIDTH)                                                              \
static void weight_h264_pixels ## WIDTH ## _altivec(uint8_t *block, int stride,         \
                                                    int height, int log2_denom,         \
                                                    int weight, int offset)             \
{                                                                                       \
    weight_h264_W_altivec(block, stride, height,                                        \
                          log2_denom, weight, offset, WIDTH);                           \
}                                                                                       \
static void biweight_h264_pixels ## WIDTH ## _altivec(uint8_t *dst, uint8_t *src,       \
                                                      int stride, int height,           \
                                                      int log2_denom, int weightd,      \
                                                      int weights, int offset)          \
{                                                                                       \
    biweight_h264_W_altivec(dst, src, stride, height,                                   \
                            log2_denom, weightd, weights, offset, WIDTH);               \
}

/* 16-wide and 8-wide variants */
H264_WEIGHT(16)
H264_WEIGHT( 8)
#endif /* HAVE_ALTIVEC */
/*
 * Install the AltiVec H.264 DSP routines into the context.
 *
 * Nothing is changed unless the build has AltiVec support, the running CPU
 * reports AltiVec, and bit_depth is 8. The chroma IDCT (h264_idct_add8) is
 * installed only when chroma_format_idc <= 1.
 */
av_cold void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth,
                                 const int chroma_format_idc)
{
#if HAVE_ALTIVEC
    if (!PPC_ALTIVEC(av_get_cpu_flags()))
        return;
    if (bit_depth != 8)
        return;

    /* inverse transforms */
    c->h264_idct_add        = h264_idct_add_altivec;
    c->h264_idct_add16      = h264_idct_add16_altivec;
    c->h264_idct_add16intra = h264_idct_add16intra_altivec;
    c->h264_idct_dc_add     = h264_idct_dc_add_altivec;
    c->h264_idct8_add       = h264_idct8_add_altivec;
    c->h264_idct8_add4      = h264_idct8_add4_altivec;
    c->h264_idct8_dc_add    = h264_idct8_dc_add_altivec;
    if (chroma_format_idc <= 1)
        c->h264_idct_add8   = h264_idct_add8_altivec;

    /* luma deblocking */
    c->h264_v_loop_filter_luma = h264_v_loop_filter_luma_altivec;
    c->h264_h_loop_filter_luma = h264_h_loop_filter_luma_altivec;

    /* weighted prediction: index 0 is 16 wide, index 1 is 8 wide */
    c->weight_h264_pixels_tab[0]   = weight_h264_pixels16_altivec;
    c->weight_h264_pixels_tab[1]   = weight_h264_pixels8_altivec;
    c->biweight_h264_pixels_tab[0] = biweight_h264_pixels16_altivec;
    c->biweight_h264_pixels_tab[1] = biweight_h264_pixels8_altivec;
#endif /* HAVE_ALTIVEC */
}

View File

@@ -0,0 +1,312 @@
/*
* Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/h264qpel.h"
#include "hpeldsp_altivec.h"
#if HAVE_ALTIVEC
/*
 * h264qpel_template.c is included twice to generate the "put" and the "avg"
 * flavour of the 16x16 lowpass filters.
 *
 * OP_U8_ALTIVEC(d, s, dst) selects how a filtered vector s is combined with
 * the existing destination vector dst before it is stored:
 *   PUT: d = s                  (overwrite)
 *   AVG: d = vec_avg(dst, s)    (rounded average with the destination)
 *
 * Each PREFIX_*_altivec macro names the function the template defines (the
 * template bodies are guarded by #ifdef on these names). The PREFIX_*_num
 * macros are defined here as well but are not referenced by the template
 * code in this file set.
 */
#define PUT_OP_U8_ALTIVEC(d, s, dst) d = s
#define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s)
/* first instantiation: put_h264_qpel16_{h,v,hv}_lowpass_altivec */
#define OP_U8_ALTIVEC PUT_OP_U8_ALTIVEC
#define PREFIX_h264_qpel16_h_lowpass_altivec put_h264_qpel16_h_lowpass_altivec
#define PREFIX_h264_qpel16_h_lowpass_num altivec_put_h264_qpel16_h_lowpass_num
#define PREFIX_h264_qpel16_v_lowpass_altivec put_h264_qpel16_v_lowpass_altivec
#define PREFIX_h264_qpel16_v_lowpass_num altivec_put_h264_qpel16_v_lowpass_num
#define PREFIX_h264_qpel16_hv_lowpass_altivec put_h264_qpel16_hv_lowpass_altivec
#define PREFIX_h264_qpel16_hv_lowpass_num altivec_put_h264_qpel16_hv_lowpass_num
#include "h264qpel_template.c"
#undef OP_U8_ALTIVEC
#undef PREFIX_h264_qpel16_h_lowpass_altivec
#undef PREFIX_h264_qpel16_h_lowpass_num
#undef PREFIX_h264_qpel16_v_lowpass_altivec
#undef PREFIX_h264_qpel16_v_lowpass_num
#undef PREFIX_h264_qpel16_hv_lowpass_altivec
#undef PREFIX_h264_qpel16_hv_lowpass_num
/* second instantiation: avg_h264_qpel16_{h,v,hv}_lowpass_altivec */
#define OP_U8_ALTIVEC AVG_OP_U8_ALTIVEC
#define PREFIX_h264_qpel16_h_lowpass_altivec avg_h264_qpel16_h_lowpass_altivec
#define PREFIX_h264_qpel16_h_lowpass_num altivec_avg_h264_qpel16_h_lowpass_num
#define PREFIX_h264_qpel16_v_lowpass_altivec avg_h264_qpel16_v_lowpass_altivec
#define PREFIX_h264_qpel16_v_lowpass_num altivec_avg_h264_qpel16_v_lowpass_num
#define PREFIX_h264_qpel16_hv_lowpass_altivec avg_h264_qpel16_hv_lowpass_altivec
#define PREFIX_h264_qpel16_hv_lowpass_num altivec_avg_h264_qpel16_hv_lowpass_num
#include "h264qpel_template.c"
#undef OP_U8_ALTIVEC
#undef PREFIX_h264_qpel16_h_lowpass_altivec
#undef PREFIX_h264_qpel16_h_lowpass_num
#undef PREFIX_h264_qpel16_v_lowpass_altivec
#undef PREFIX_h264_qpel16_v_lowpass_num
#undef PREFIX_h264_qpel16_hv_lowpass_altivec
#undef PREFIX_h264_qpel16_hv_lowpass_num
/*
 * Generate the sixteen quarter-pel motion compensation entry points
 *   OPNAME h264_qpel SIZE _mcXY_ CODETYPE (dst, src, stride)
 * where X is the horizontal and Y the vertical quarter-sample position
 * (0..3).
 *
 *   mc00                 plain copy/average of the full-pel block
 *   mc20 / mc02 / mc22   horizontal / vertical / 2-D half-pel lowpass
 *   all other positions  rounded average (the _l2_ helper) of two of:
 *                        the full-pel block, a horizontal half-pel plane,
 *                        a vertical half-pel plane, the 2-D half-pel plane
 *
 * Intermediate planes are always produced with the put_ lowpass functions
 * into 16-byte aligned stack buffers with a row pitch of SIZE; OPNAME
 * (put_ or avg_) is applied only in the final step that writes dst.
 *
 * The last line of the macro deliberately has no line continuation, so the
 * definition ends here regardless of what follows it in the file.
 */
#define H264_MC(OPNAME, SIZE, CODETYPE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## CODETYPE (uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    ff_ ## OPNAME ## pixels ## SIZE ## _ ## CODETYPE(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{ \
    DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+1, half, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+stride, half, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\
    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\
    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\
    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\
    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\
    DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\
    DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\
    DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\
    DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
    DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\
    DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## CODETYPE(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
{\
    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
    DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\
    DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\
}
/*
 * put_unligned_store(s, dest): store the 16-byte vector s at the possibly
 * unaligned address dest.
 *
 * Big-endian (classic AltiVec) variant: read-modify-write of the two
 * aligned vectors that straddle dest. It uses the caller's scratch
 * variables tmp1, tmp2, mask, edges and align (all vec_u8), which must be
 * declared in the enclosing scope. The vector at offset 15 is stored first
 * and the one at offset 0 second; when dest is already aligned both
 * stores address the same 16 bytes and the second one must win.
 *
 * Little-endian variant: a single VSX unaligned store.
 *
 * Both variants expand to exactly one statement without a trailing
 * semicolon, so the macro can be used like a function call in any
 * statement context.
 */
#if HAVE_BIGENDIAN
#define put_unligned_store(s, dest) do {    \
    tmp1 = vec_ld(0, dest);                 \
    mask = vec_lvsl(0, dest);               \
    tmp2 = vec_ld(15, dest);                \
    edges = vec_perm(tmp2, tmp1, mask);     \
    align = vec_lvsr(0, dest);              \
    tmp2 = vec_perm(s, edges, align);       \
    tmp1 = vec_perm(edges, s, align);       \
    vec_st(tmp2, 15, dest);                 \
    vec_st(tmp1, 0 , dest);                 \
} while (0)
#else
#define put_unligned_store(s, dest) vec_vsx_st(s, 0, dest)
#endif /* HAVE_BIGENDIAN */
/*
 * dst = rounded average of two 16-pixel-wide sources, for h rows.
 *
 * src1 is read with a row pitch of src_stride1; src2 is read as a packed
 * plane with a fixed row pitch of 16 bytes. Rows are written to dst with
 * a pitch of dst_stride through put_unligned_store().
 */
static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
                                    const uint8_t * src2, int dst_stride,
                                    int src_stride1, int h)
{
    int row;
    vec_u8 src2_perm;
#if HAVE_BIGENDIAN
    /* scratch variables required by the big-endian put_unligned_store() */
    vec_u8 tmp1, tmp2, mask, edges, align;

    /* src2 advances by 16 per row, so one permute vector serves all rows */
    src2_perm = vec_lvsl(0, src2);
#endif

    for (row = 0; row < h; row++, dst += dst_stride) {
        const vec_u8 px1   = unaligned_load(row * src_stride1, src1);
        const vec_u8 px2   = load_with_perm_vec(row * 16, src2, src2_perm);
        const vec_u8 blend = vec_avg(px1, px2);

        put_unligned_store(blend, dst);
    }
}
/*
 * avg_unligned_store(s, dest): replace the 16 bytes at the possibly
 * unaligned address dest with the rounded average of their current value
 * and the vector s.
 *
 * Both variants overwrite the caller's vec_u8 variable 'a' with the
 * averaged vector. The big-endian variant additionally uses the caller's
 * scratch variables tmp1, tmp2, mask, edges and align (all vec_u8), and
 * stores the vector at offset 15 before the one at offset 0 so that the
 * latter wins when dest is already aligned.
 *
 * The little-endian variant addresses memory through its 'dest' parameter
 * (not a variable that merely happens to be named 'dst' at the call site).
 * Both variants expand to exactly one statement without a trailing
 * semicolon.
 */
#if HAVE_BIGENDIAN
#define avg_unligned_store(s, dest) do {            \
    tmp1 = vec_ld(0, dest);                         \
    mask = vec_lvsl(0, dest);                       \
    tmp2 = vec_ld(15, dest);                        \
    a = vec_avg(vec_perm(tmp1, tmp2, mask), s);     \
    edges = vec_perm(tmp2, tmp1, mask);             \
    align = vec_lvsr(0, dest);                      \
    tmp2 = vec_perm(a, edges, align);               \
    tmp1 = vec_perm(edges, a, align);               \
    vec_st(tmp2, 15, dest);                         \
    vec_st(tmp1, 0 , dest);                         \
} while (0)
#else
#define avg_unligned_store(s, dest) do {            \
    a = vec_avg(vec_vsx_ld(0, dest), s);            \
    vec_vsx_st(a, 0, dest);                         \
} while (0)
#endif /* HAVE_BIGENDIAN */
/*
 * dst = rounded average of dst and the rounded average of two
 * 16-pixel-wide sources, for h rows.
 *
 * src1 is read with a row pitch of src_stride1; src2 is read as a packed
 * plane with a fixed row pitch of 16 bytes. Rows are merged into dst with
 * a pitch of dst_stride through avg_unligned_store().
 */
static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
                                    const uint8_t * src2, int dst_stride,
                                    int src_stride1, int h)
{
    int row;
    vec_u8 a;           /* written by avg_unligned_store() */
    vec_u8 src2_perm;
#if HAVE_BIGENDIAN
    /* scratch variables required by the big-endian avg_unligned_store() */
    vec_u8 tmp1, tmp2, mask, edges, align;

    /* src2 advances by 16 per row, so one permute vector serves all rows */
    src2_perm = vec_lvsl(0, src2);
#endif

    for (row = 0; row < h; row++, dst += dst_stride) {
        const vec_u8 px1   = unaligned_load(row * src_stride1, src1);
        const vec_u8 px2   = load_with_perm_vec(row * 16, src2, src2_perm);
        const vec_u8 blend = vec_avg(px1, px2);

        avg_unligned_store(blend, dst);
    }
}
/* Implemented but could be faster
#define put_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) put_pixels16_l2(d,s1,s2,ds,s1s,16,h)
#define avg_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) avg_pixels16_l2(d,s1,s2,ds,s1s,16,h)
*/
/* Instantiate the sixteen put_ and the sixteen avg_ quarter-pel functions
 * for 16x16 blocks (put_h264_qpel16_mcXY_altivec / avg_h264_qpel16_mcXY_altivec). */
H264_MC(put_, 16, altivec)
H264_MC(avg_, 16, altivec)
#endif /* HAVE_ALTIVEC */
/*
 * Install the AltiVec 16x16 quarter-pel functions into the context.
 *
 * Nothing is changed unless the build has AltiVec support, the running CPU
 * reports AltiVec, and bit_depth is at most 8. Only table row 0 (the 16x16
 * block size) of the put and avg tables is filled; the entry index is
 * x + 4 * y for quarter-sample position mcXY.
 */
av_cold void ff_h264qpel_init_ppc(H264QpelContext *c, int bit_depth)
{
#if HAVE_ALTIVEC
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_altivec; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_altivec; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_altivec; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_altivec; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_altivec; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_altivec; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_altivec; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_altivec; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_altivec; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_altivec; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_altivec; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_altivec; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_altivec; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_altivec; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_altivec; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_altivec

    if (!PPC_ALTIVEC(av_get_cpu_flags()))
        return;
    /* the vector code handles 8-bit samples only */
    if (bit_depth > 8)
        return;

    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 0, 16);
#undef dspfunc
#endif /* HAVE_ALTIVEC */
}

View File

@@ -0,0 +1,484 @@
/*
* Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#if HAVE_UNISTD_H
#include <unistd.h>
#endif
#include "libavutil/avassert.h"
#include "libavutil/mem.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
/*
 * Assert (at av_assert2 level) that ptr is 16-byte aligned.
 * The argument is parenthesized so that an expression such as "p + n" is
 * converted as a whole, and the macro carries no trailing semicolon so it
 * behaves like an ordinary expression statement at the call site.
 */
#define ASSERT_ALIGNED(ptr) av_assert2(!((uintptr_t)(ptr) & 0x0000000F))
/*
 * load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3)
 *
 * Load the six horizontally adjacent 16-byte vectors starting at s-2, s-1,
 * s, s+1, s+2 and s+3 into the caller's variables srcM2, srcM1, srcP0,
 * srcP1, srcP2 and srcP3 (these are NOT macro parameters; they must exist
 * in the enclosing scope).
 *
 * Big-endian variant: the data is assembled from aligned loads and
 * vec_perm. 'ali' is the alignment of s-2 within a 16-byte line as computed
 * by the callers, and pm2..pp3 are the vec_lvsl permute vectors for the six
 * addresses. For ali <= 10 all six vectors lie inside the two aligned
 * vectors srcR1/srcR2. For ali 11..15 one of the six addresses is itself
 * 16-byte aligned: that vector is taken directly from srcR2 (the vec_lvsl
 * permute for an aligned address would select srcR1 only), and every
 * vector after it is built from srcR2 and a third aligned load srcR3.
 */
#if HAVE_BIGENDIAN
#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
vec_u8 srcR1 = vec_ld(-2, s);\
vec_u8 srcR2 = vec_ld(14, s);\
switch (ali) {\
default: {\
srcM2 = vec_perm(srcR1, srcR2, pm2);\
srcM1 = vec_perm(srcR1, srcR2, pm1);\
srcP0 = vec_perm(srcR1, srcR2, pp0);\
srcP1 = vec_perm(srcR1, srcR2, pp1);\
srcP2 = vec_perm(srcR1, srcR2, pp2);\
srcP3 = vec_perm(srcR1, srcR2, pp3);\
} break;\
case 11: {\
srcM2 = vec_perm(srcR1, srcR2, pm2);\
srcM1 = vec_perm(srcR1, srcR2, pm1);\
srcP0 = vec_perm(srcR1, srcR2, pp0);\
srcP1 = vec_perm(srcR1, srcR2, pp1);\
srcP2 = vec_perm(srcR1, srcR2, pp2);\
srcP3 = srcR2;\
} break;\
case 12: {\
vec_u8 srcR3 = vec_ld(30, s);\
srcM2 = vec_perm(srcR1, srcR2, pm2);\
srcM1 = vec_perm(srcR1, srcR2, pm1);\
srcP0 = vec_perm(srcR1, srcR2, pp0);\
srcP1 = vec_perm(srcR1, srcR2, pp1);\
srcP2 = srcR2;\
srcP3 = vec_perm(srcR2, srcR3, pp3);\
} break;\
case 13: {\
vec_u8 srcR3 = vec_ld(30, s);\
srcM2 = vec_perm(srcR1, srcR2, pm2);\
srcM1 = vec_perm(srcR1, srcR2, pm1);\
srcP0 = vec_perm(srcR1, srcR2, pp0);\
srcP1 = srcR2;\
srcP2 = vec_perm(srcR2, srcR3, pp2);\
srcP3 = vec_perm(srcR2, srcR3, pp3);\
} break;\
case 14: {\
vec_u8 srcR3 = vec_ld(30, s);\
srcM2 = vec_perm(srcR1, srcR2, pm2);\
srcM1 = vec_perm(srcR1, srcR2, pm1);\
srcP0 = srcR2;\
srcP1 = vec_perm(srcR2, srcR3, pp1);\
srcP2 = vec_perm(srcR2, srcR3, pp2);\
srcP3 = vec_perm(srcR2, srcR3, pp3);\
} break;\
case 15: {\
vec_u8 srcR3 = vec_ld(30, s);\
srcM2 = vec_perm(srcR1, srcR2, pm2);\
srcM1 = srcR2;\
srcP0 = vec_perm(srcR2, srcR3, pp0);\
srcP1 = vec_perm(srcR2, srcR3, pp1);\
srcP2 = vec_perm(srcR2, srcR3, pp2);\
srcP3 = vec_perm(srcR2, srcR3, pp3);\
} break;\
}\
}
#else
/*
 * Little-endian variant: six VSX unaligned loads; the alignment value and
 * the permute vectors are accepted for interface compatibility but unused.
 */
#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
srcM2 = vec_vsx_ld(-2, s);\
srcM1 = vec_vsx_ld(-1, s);\
srcP0 = vec_vsx_ld(0, s);\
srcP1 = vec_vsx_ld(1, s);\
srcP2 = vec_vsx_ld(2, s);\
srcP3 = vec_vsx_ld(3, s);\
}
#endif /* HAVE_BIGENDIAN */
/*
 * Horizontal 6-tap half-pel lowpass filter for a 16x16 block.
 *
 * For every output sample:
 *   out = clip_uint8((20*(P0 + P1) - 5*(M1 + P2) + (M2 + P3) + 16) >> 5)
 * where M2..P3 are the source samples at horizontal offsets -2..+3.
 * The result is combined with the destination through OP_U8_ALTIVEC (put or
 * avg, chosen by the including file) and stored with an aligned store.
 *
 * This code assumes stride % 16 == 0; dst must be 16-byte aligned (checked
 * by ASSERT_ALIGNED and required by vec_ld/vec_st on dst).
 */
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t *dst,
const uint8_t *src,
int dstStride, int srcStride)
{
register int i;
LOAD_ZERO;
vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
/* filter constants: 5, shift count 5, 20 = 5 << 2, rounding 16 = 1 << 4 */
const vec_s16 v5ss = vec_splat_s16(5);
const vec_u16 v5us = vec_splat_u16(5);
const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
/* alignment of the left-most tap (src - 2); selects the load_alignment case */
register int align = ((((unsigned long)src) - 2) % 16);
vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
srcP2A, srcP2B, srcP3A, srcP3B,
srcM1A, srcM1B, srcM2A, srcM2B,
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
psumA, psumB, sumA, sumB;
vec_u8 sum, fsum;
#if HAVE_BIGENDIAN
/* permute vectors for the six tap positions; computed once, which is valid
 * for every row only because the row pitch is a multiple of 16 */
permM2 = vec_lvsl(-2, src);
permM1 = vec_lvsl(-1, src);
permP0 = vec_lvsl(+0, src);
permP1 = vec_lvsl(+1, src);
permP2 = vec_lvsl(+2, src);
permP3 = vec_lvsl(+3, src);
#endif /* HAVE_BIGENDIAN */
for (i = 0 ; i < 16 ; i ++) {
load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);
/* widen each tap vector to two vectors of eight 16-bit values (A/B halves) */
srcP0A = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
srcP0B = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
srcP1A = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
srcP1B = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);
srcP2A = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
srcP2B = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
srcP3A = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
srcP3B = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);
srcM1A = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
srcM1B = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
srcM2A = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
srcM2B = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);
/* symmetric tap pairs: inner (x20), middle (x5), outer (x1) */
sum1A = vec_adds(srcP0A, srcP1A);
sum1B = vec_adds(srcP0B, srcP1B);
sum2A = vec_adds(srcM1A, srcP2A);
sum2B = vec_adds(srcM1B, srcP2B);
sum3A = vec_adds(srcM2A, srcP3A);
sum3B = vec_adds(srcM2B, srcP3B);
/* pp1 = 20*sum1 + 16, pp2 = 5*sum2, psum = sum3 + pp1 - pp2 */
pp1A = vec_mladd(sum1A, v20ss, v16ss);
pp1B = vec_mladd(sum1B, v20ss, v16ss);
pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
pp3A = vec_add(sum3A, pp1A);
pp3B = vec_add(sum3B, pp1B);
psumA = vec_sub(pp3A, pp2A);
psumB = vec_sub(pp3B, pp2B);
/* >> 5, then pack to bytes with unsigned saturation */
sumA = vec_sra(psumA, v5us);
sumB = vec_sra(psumB, v5us);
sum = vec_packsu(sumA, sumB);
ASSERT_ALIGNED(dst);
OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));
vec_st(fsum, 0, dst);
src += srcStride;
dst += dstStride;
}
}
#endif
/*
 * Vertical 6-tap half-pel lowpass filter for a 16x16 block.
 *
 * For every output sample:
 *   out = clip_uint8((20*(P0 + P1) - 5*(M1 + P2) + (M2 + P3) + 16) >> 5)
 * where M2..P3 are the source samples in rows -2..+3 relative to the
 * output row. Five rows are loaded up front; each loop iteration loads one
 * new row (P3) and slides the six-row window down by one.
 * The result is combined with the destination through OP_U8_ALTIVEC (put or
 * avg, chosen by the including file) and stored with an aligned store.
 *
 * This code assumes stride % 16 == 0; dst must be 16-byte aligned (checked
 * by ASSERT_ALIGNED and required by vec_ld/vec_st on dst).
 */
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t *dst,
const uint8_t *src,
int dstStride, int srcStride)
{
register int i;
LOAD_ZERO;
vec_u8 perm;
#if HAVE_BIGENDIAN
/* one permute vector for all rows; valid because every row has the same
 * alignment when the row pitch is a multiple of 16 */
perm = vec_lvsl(0, src);
#endif
/* filter constants: 20 = 5 << 2, shift count 5, 5, rounding 16 = 1 << 4 */
const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
const vec_u16 v5us = vec_splat_u16(5);
const vec_s16 v5ss = vec_splat_s16(5);
const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
/* prime the window with rows -2 .. +2 */
const uint8_t *srcbis = src - (srcStride * 2);
const vec_u8 srcM2 = load_with_perm_vec(0, srcbis, perm);
srcbis += srcStride;
const vec_u8 srcM1 = load_with_perm_vec(0, srcbis, perm);
srcbis += srcStride;
const vec_u8 srcP0 = load_with_perm_vec(0, srcbis, perm);
srcbis += srcStride;
const vec_u8 srcP1 = load_with_perm_vec(0, srcbis, perm);
srcbis += srcStride;
const vec_u8 srcP2 = load_with_perm_vec(0, srcbis, perm);
srcbis += srcStride;
/* widen each row to two vectors of eight 16-bit values (A/B halves) */
vec_s16 srcM2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
vec_s16 srcM2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);
vec_s16 srcM1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
vec_s16 srcM1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
vec_s16 srcP0ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
vec_s16 srcP0ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
vec_s16 srcP1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
vec_s16 srcP1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);
vec_s16 srcP2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
vec_s16 srcP2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
psumA, psumB, sumA, sumB,
srcP3ssA, srcP3ssB,
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;
vec_u8 sum, fsum, srcP3;
for (i = 0 ; i < 16 ; i++) {
/* fetch the new bottom row of the window */
srcP3 = load_with_perm_vec(0, srcbis, perm);
srcbis += srcStride;
srcP3ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
srcP3ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);
/* symmetric tap pairs: inner (x20), middle (x5), outer (x1) */
sum1A = vec_adds(srcP0ssA, srcP1ssA);
sum1B = vec_adds(srcP0ssB, srcP1ssB);
sum2A = vec_adds(srcM1ssA, srcP2ssA);
sum2B = vec_adds(srcM1ssB, srcP2ssB);
sum3A = vec_adds(srcM2ssA, srcP3ssA);
sum3B = vec_adds(srcM2ssB, srcP3ssB);
/* slide the window down one row for the next iteration */
srcM2ssA = srcM1ssA;
srcM2ssB = srcM1ssB;
srcM1ssA = srcP0ssA;
srcM1ssB = srcP0ssB;
srcP0ssA = srcP1ssA;
srcP0ssB = srcP1ssB;
srcP1ssA = srcP2ssA;
srcP1ssB = srcP2ssB;
srcP2ssA = srcP3ssA;
srcP2ssB = srcP3ssB;
/* pp1 = 20*sum1 + 16, pp2 = 5*sum2, psum = sum3 + pp1 - pp2 */
pp1A = vec_mladd(sum1A, v20ss, v16ss);
pp1B = vec_mladd(sum1B, v20ss, v16ss);
pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
pp3A = vec_add(sum3A, pp1A);
pp3B = vec_add(sum3B, pp1B);
psumA = vec_sub(pp3A, pp2A);
psumB = vec_sub(pp3B, pp2B);
/* >> 5, then pack to bytes with unsigned saturation */
sumA = vec_sra(psumA, v5us);
sumB = vec_sra(psumB, v5us);
sum = vec_packsu(sumA, sumB);
ASSERT_ALIGNED(dst);
OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));
vec_st(fsum, 0, dst);
dst += dstStride;
}
}
#endif
/*
 * Two-dimensional (horizontal then vertical) 6-tap half-pel lowpass filter
 * for a 16x16 block.
 *
 * Pass 1 filters 21 source rows (rows -2 .. +18) horizontally and stores
 * the unrounded, unshifted 16-bit sums
 *   20*(P0 + P1) - 5*(M1 + P2) + (M2 + P3)
 * into tmp, 16 values per row with a row pitch of tmpStride elements.
 * Pass 2 applies the same 6-tap kernel vertically to tmp in 32-bit
 * precision, adds the rounding term 512, shifts right by 10 and packs to
 * bytes with saturation. The result is combined with the destination
 * through OP_U8_ALTIVEC (put or avg, chosen by the including file).
 *
 * This code assumes stride % 16 == 0 *and* that tmp is 16-byte aligned;
 * dst must be 16-byte aligned as well (checked by ASSERT_ALIGNED).
 */
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *dst, int16_t *tmp,
const uint8_t *src,
int dstStride, int tmpStride,
int srcStride)
{
register int i;
LOAD_ZERO;
vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
/* constants: 20 = 5 << 2, shift 10, 5, 1, rounding 512 = 1 << 9, shift 16 */
const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
const vec_u32 v10ui = vec_splat_u32(10);
const vec_s16 v5ss = vec_splat_s16(5);
const vec_s16 v1ss = vec_splat_s16(1);
const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
/* alignment of the left-most tap (src - 2); selects the load_alignment case */
register int align = ((((unsigned long)src) - 2) % 16);
vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
srcP2A, srcP2B, srcP3A, srcP3B,
srcM1A, srcM1B, srcM2A, srcM2B,
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
pp1A, pp1B, pp2A, pp2B, psumA, psumB;
/* interleaves the first eight bytes (results for even lanes) with the last
 * eight bytes (results for odd lanes) to restore pixel order */
const vec_u8 mperm = (const vec_u8)
{0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
int16_t *tmpbis = tmp;
vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
tmpP2ssA, tmpP2ssB;
vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
ssumAe, ssumAo, ssumBe, ssumBo;
vec_u8 fsum, sumv, sum;
vec_s16 ssume, ssumo;
#if HAVE_BIGENDIAN
/* permute vectors for the six tap positions; computed once, which is valid
 * for every row only because the row pitch is a multiple of 16 */
permM2 = vec_lvsl(-2, src);
permM1 = vec_lvsl(-1, src);
permP0 = vec_lvsl(+0, src);
permP1 = vec_lvsl(+1, src);
permP2 = vec_lvsl(+2, src);
permP3 = vec_lvsl(+3, src);
#endif /* HAVE_BIGENDIAN */
/* pass 1: horizontal filter over 16 + 5 rows, starting two rows above */
src -= (2 * srcStride);
for (i = 0 ; i < 21 ; i ++) {
vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);
srcP0A = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
srcP0B = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
srcP1A = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
srcP1B = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);
srcP2A = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
srcP2B = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
srcP3A = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
srcP3B = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);
srcM1A = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
srcM1B = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
srcM2A = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
srcM2B = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);
sum1A = vec_adds(srcP0A, srcP1A);
sum1B = vec_adds(srcP0B, srcP1B);
sum2A = vec_adds(srcM1A, srcP2A);
sum2B = vec_adds(srcM1B, srcP2B);
sum3A = vec_adds(srcM2A, srcP3A);
sum3B = vec_adds(srcM2B, srcP3B);
/* pp1 = 20*sum1 + sum3 (no rounding here), pp2 = 5*sum2 */
pp1A = vec_mladd(sum1A, v20ss, sum3A);
pp1B = vec_mladd(sum1B, v20ss, sum3B);
pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
psumA = vec_sub(pp1A, pp2A);
psumB = vec_sub(pp1B, pp2B);
/* one intermediate row = two vectors of eight 16-bit values */
vec_st(psumA, 0, tmp);
vec_st(psumB, 16, tmp);
src += srcStride;
tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
}
/* pass 2: prime the vertical window with intermediate rows -2 .. +2 */
tmpM2ssA = vec_ld(0, tmpbis);
tmpM2ssB = vec_ld(16, tmpbis);
tmpbis += tmpStride;
tmpM1ssA = vec_ld(0, tmpbis);
tmpM1ssB = vec_ld(16, tmpbis);
tmpbis += tmpStride;
tmpP0ssA = vec_ld(0, tmpbis);
tmpP0ssB = vec_ld(16, tmpbis);
tmpbis += tmpStride;
tmpP1ssA = vec_ld(0, tmpbis);
tmpP1ssB = vec_ld(16, tmpbis);
tmpbis += tmpStride;
tmpP2ssA = vec_ld(0, tmpbis);
tmpP2ssB = vec_ld(16, tmpbis);
tmpbis += tmpStride;
for (i = 0 ; i < 16 ; i++) {
const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);
/* these block-scope sums intentionally shadow the pass-1 variables */
const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
tmpbis += tmpStride;
/* slide the window down one row for the next iteration */
tmpM2ssA = tmpM1ssA;
tmpM2ssB = tmpM1ssB;
tmpM1ssA = tmpP0ssA;
tmpM1ssB = tmpP0ssB;
tmpP0ssA = tmpP1ssA;
tmpP0ssB = tmpP1ssB;
tmpP1ssA = tmpP2ssA;
tmpP1ssB = tmpP2ssB;
tmpP2ssA = tmpP3ssA;
tmpP2ssB = tmpP3ssB;
/* widen to 32 bits, split into even (e) and odd (o) 16-bit lanes */
pp1Ae = vec_mule(sum1A, v20ss);
pp1Ao = vec_mulo(sum1A, v20ss);
pp1Be = vec_mule(sum1B, v20ss);
pp1Bo = vec_mulo(sum1B, v20ss);
pp2Ae = vec_mule(sum2A, v5ss);
pp2Ao = vec_mulo(sum2A, v5ss);
pp2Be = vec_mule(sum2B, v5ss);
pp2Bo = vec_mulo(sum2B, v5ss);
/* odd lanes of sum3: multiply by 1 just to sign-extend to 32 bits */
pp3Ao = vec_mulo(sum3A, v1ss);
pp3Bo = vec_mulo(sum3B, v1ss);
#if !HAVE_BIGENDIAN
sum3A = (vec_s16)vec_perm(sum3A, sum3A,vcswapi2s(0,1,2,3));
sum3B = (vec_s16)vec_perm(sum3B, sum3B,vcswapi2s(0,1,2,3));
#endif
/* even lanes of sum3: arithmetic shift of each 32-bit word by 16 keeps the
 * sign-extended upper 16-bit half */
pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
pp3Be = vec_sra((vec_s32)sum3B, v16ui);
/* sum = 20*sum1 + 512 + (sum3 - 5*sum2) */
pp1cAe = vec_add(pp1Ae, v512si);
pp1cAo = vec_add(pp1Ao, v512si);
pp1cBe = vec_add(pp1Be, v512si);
pp1cBo = vec_add(pp1Bo, v512si);
pp32Ae = vec_sub(pp3Ae, pp2Ae);
pp32Ao = vec_sub(pp3Ao, pp2Ao);
pp32Be = vec_sub(pp3Be, pp2Be);
pp32Bo = vec_sub(pp3Bo, pp2Bo);
sumAe = vec_add(pp1cAe, pp32Ae);
sumAo = vec_add(pp1cAo, pp32Ao);
sumBe = vec_add(pp1cBe, pp32Be);
sumBo = vec_add(pp1cBo, pp32Bo);
/* >> 10 */
ssumAe = vec_sra(sumAe, v10ui);
ssumAo = vec_sra(sumAo, v10ui);
ssumBe = vec_sra(sumBe, v10ui);
ssumBo = vec_sra(sumBo, v10ui);
/* saturating packs: 32 -> 16 bit (signed), then 16 -> 8 bit (unsigned);
 * the bytes come out as all even lanes followed by all odd lanes */
ssume = vec_packs(ssumAe, ssumBe);
ssumo = vec_packs(ssumAo, ssumBo);
sumv = vec_packsu(ssume, ssumo);
/* re-interleave even/odd results into pixel order */
sum = vec_perm(sumv, sumv, mperm);
ASSERT_ALIGNED(dst);
OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));
vec_st(fsum, 0, dst);
dst += dstStride;
}
}
#endif

View File

@@ -0,0 +1,389 @@
/*
* Copyright (c) 2002 Brian Foley
* Copyright (c) 2002 Dieter Shirley
* Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/hpeldsp.h"
#include "hpeldsp_altivec.h"
#if HAVE_ALTIVEC
/**
 * Copy a 16-byte-wide block of pixels, four rows per loop iteration.
 *
 * Every source row is fetched with unaligned_load() and written to the
 * destination with VEC_ST(). As stated by the original author, line_size is
 * assumed to be a multiple of 16.
 *
 * NOTE(review): the loop advances four rows at a time, so an h that is not a
 * multiple of 4 makes the last iteration touch rows beyond h -- presumably
 * callers only pass multiples of 4; confirm.
 *
 * @param block     destination pixels
 * @param pixels    source pixels
 * @param line_size byte offset between consecutive rows, applied to both
 *                  block and pixels
 * @param h         number of rows to copy
 */
void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    /* Byte offsets of the 3rd and 4th row of a group, and the step that
     * moves both pointers to the next group of four rows. */
    register ptrdiff_t stride_x2 = line_size << 1;
    register ptrdiff_t stride_x3 = line_size + stride_x2;
    register ptrdiff_t stride_x4 = line_size << 2;
    register vector unsigned char row0;
    register vector unsigned char row1;
    register vector unsigned char row2;
    register vector unsigned char row3;
    int row = 0;

    /* Author's tuning notes (all measured on, and tuned for, a 7450):
     * hand-unrolling the loop by 4 gains about 15%; minimum execution time
     * goes from 74 to 60 cycles. It is faster than -funroll-loops, but
     * combining -funroll-loops with this is bad - 74 cycles again. */
    while (row < h) {
        /* All four loads are issued before the four stores. */
        row0 = unaligned_load(0,         pixels);
        row1 = unaligned_load(line_size, pixels);
        row2 = unaligned_load(stride_x2, pixels);
        row3 = unaligned_load(stride_x3, pixels);

        VEC_ST(row0, 0,         (unsigned char*)block);
        VEC_ST(row1, line_size, (unsigned char*)block);
        VEC_ST(row2, stride_x2, (unsigned char*)block);
        VEC_ST(row3, stride_x3, (unsigned char*)block);

        pixels += stride_x4;
        block  += stride_x4;
        row    += 4;
    }
}
/* Integer helper: stores into a the expression (a|b) - (((a^b) & 0xFEFEFEFE) >> 1).
 * It is not referenced by ff_avg_pixels16_altivec() below. */
#define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
/**
 * Average a 16-byte-wide block of source pixels into the destination.
 *
 * For each of the h rows, the 16 destination bytes are loaded with vec_ld(),
 * the 16 source bytes with VEC_LD(), both are combined with vec_avg() and
 * the result is stored back to block with vec_st(). As stated by the
 * original author, line_size is assumed to be a multiple of 16.
 *
 * @param block     destination pixels, read and then overwritten
 * @param pixels    source pixels
 * @param line_size byte offset between consecutive rows, applied to both
 *                  block and pixels
 * @param h         number of rows
 */
void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register vector unsigned char srcv, dstv;
    int rows_left;

    for (rows_left = h; rows_left > 0; rows_left--) {
        dstv = vec_ld(0, block);
        srcv = VEC_LD( 0, pixels);
        dstv = vec_avg(dstv, srcv);
        vec_st(dstv, 0, (unsigned char*)block);

        pixels += line_size;
        block  += line_size;
    }
}
/**
 * Average an 8-byte-wide block of source pixels into the destination.
 *
 * The destination is always read and written as a whole 16-byte vector
 * (vec_ld()/vec_st()), so only one half of that vector belongs to the 8-pixel
 * block. The source bytes are permuted into that half while the other half
 * is filled with the destination's own bytes, so vec_avg() leaves the other
 * half unchanged. As stated by the original author, line_size is assumed to
 * be a multiple of 8.
 *
 * @param block     destination pixels, read and then overwritten
 * @param pixels    source pixels
 * @param line_size byte offset between consecutive rows, applied to both
 *                  block and pixels
 * @param h         number of rows
 */
static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
{
    register vector unsigned char pixelsv, blockv;
    int i;

    for (i = 0; i < h; i++) {
        /* block is 8 bytes-aligned, so we're either in the
           left block (16 bytes-aligned) or in the right block (not) */
        int rightside = ((unsigned long)block & 0x0000000F);

        blockv  = vec_ld(0, block);
        pixelsv = VEC_LD( 0, pixels);

        /* Presumably vcprm() selects 32-bit words, with sN naming word N of
         * the second operand (confirm in util_altivec.h): the first two
         * source words go into the half being processed, the rest is taken
         * from blockv itself. */
        if (rightside) {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
        } else {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
        }

        blockv = vec_avg(blockv, pixelsv);
        vec_st(blockv, 0, block);

        pixels += line_size;
        block  += line_size;
    }
}
/* next one assumes that ((line_size % 8) == 0) */
static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
register int i;
register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
register vector unsigned char blockv;
register vector unsigned short pixelssum1, pixelssum2, temp3;
register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
pixelsv1 = VEC_LD(0, pixels);
pixelsv2 = VEC_LD(1, pixels);
pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
pixelssum1 = vec_add((vector unsigned short)pixelsv1,
(vector unsigned short)pixelsv2);
pixelssum1 = vec_add(pixelssum1, vctwo);
for (i = 0; i < h ; i++) {
int rightside = ((unsigned long)block & 0x0000000F);
blockv = vec_ld(0, block);
pixelsv1 = unaligned_load(line_size, pixels);
pixelsv2 = unaligned_load(line_size+1, pixels);
pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
pixelssum2 = vec_add((vector unsigned short)pixelsv1,
(vector unsigned short)pixelsv2);
temp3 = vec_add(pixelssum1, pixelssum2);
temp3 = vec_sra(temp3, vctwo);
pixelssum1 = vec_add(pixelssum2, vctwo);
pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
if (rightside) {
blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
} else {
blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
}
vec_st(blockv, 0, block);
block += line_size;
pixels += line_size;
}
}
/* next one assumes that ((line_size % 8) == 0) */
static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
register int i;
register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
register vector unsigned char blockv;
register vector unsigned short pixelssum1, pixelssum2, temp3;
register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
pixelsv1 = VEC_LD(0, pixels);
pixelsv2 = VEC_LD(1, pixels);
pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
pixelssum1 = vec_add((vector unsigned short)pixelsv1,
(vector unsigned short)pixelsv2);
pixelssum1 = vec_add(pixelssum1, vcone);
for (i = 0; i < h ; i++) {
int rightside = ((unsigned long)block & 0x0000000F);
blockv = vec_ld(0, block);
pixelsv1 = unaligned_load(line_size, pixels);
pixelsv2 = unaligned_load(line_size+1, pixels);
pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
pixelssum2 = vec_add((vector unsigned short)pixelsv1,
(vector unsigned short)pixelsv2);
temp3 = vec_add(pixelssum1, pixelssum2);
temp3 = vec_sra(temp3, vctwo);
pixelssum1 = vec_add(pixelssum2, vcone);
pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
if (rightside) {
blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
} else {
blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
}
vec_st(blockv, 0, block);
block += line_size;
pixels += line_size;
}
}
/**
 * 16-pixel-wide 2x2 interpolation with rounding, written to block.
 *
 * Every output pixel is (a + b + c + d + 2) >> 2, where a and b are two
 * horizontally adjacent source pixels and c and d are the two pixels directly
 * below them. The 16 pixels of a row are processed as two groups of eight
 * 16-bit sums (VEC_MERGEH / VEC_MERGEL halves); the pair sums of a row are
 * carried over to the next iteration so each source row is loaded once.
 * As stated by the original author, line_size is assumed to be a multiple
 * of 16.
 *
 * The whole 16-byte destination row is overwritten, so it is not loaded
 * first.
 *
 * @param block     destination pixels
 * @param pixels    source pixels (rows 0..h and columns 0..16 are read)
 * @param line_size byte offset between consecutive rows, applied to both
 *                  block and pixels
 * @param h         number of output rows
 */
static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv;
    register vector unsigned short temp3, temp4,
        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    /* Pair sums (plus rounding bias) of the first source row:
     * pixelssum1 covers the MERGEH half, pixelssum3 the MERGEL half. */
    pixelsv1 = VEC_LD(0, pixels);
    pixelsv2 = VEC_LD(1, pixels);
    pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
    pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
    pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
    pixelsv2 = VEC_MERGEH(vczero, pixelsv2);

    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vctwo);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        pixelsv1 = unaligned_load(line_size, pixels);
        pixelsv2 = unaligned_load(line_size+1, pixels);

        pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
        pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
        pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
        pixelsv2 = VEC_MERGEH(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        /* The row just read becomes the upper row of the next iteration. */
        pixelssum3 = vec_add(pixelssum4, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
/* next one assumes that ((line_size % 16) == 0) */
static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
{
register int i;
register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
register vector unsigned char blockv;
register vector unsigned short temp3, temp4,
pixelssum1, pixelssum2, pixelssum3, pixelssum4;
register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
pixelsv1 = VEC_LD(0, pixels);
pixelsv2 = VEC_LD(1, pixels);
pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
pixelssum3 = vec_add((vector unsigned short)pixelsv3,
(vector unsigned short)pixelsv4);
pixelssum3 = vec_add(pixelssum3, vcone);
pixelssum1 = vec_add((vector unsigned short)pixelsv1,
(vector unsigned short)pixelsv2);
pixelssum1 = vec_add(pixelssum1, vcone);
for (i = 0; i < h ; i++) {
pixelsv1 = unaligned_load(line_size, pixels);
pixelsv2 = unaligned_load(line_size+1, pixels);
pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
pixelssum4 = vec_add((vector unsigned short)pixelsv3,
(vector unsigned short)pixelsv4);
pixelssum2 = vec_add((vector unsigned short)pixelsv1,
(vector unsigned short)pixelsv2);
temp4 = vec_add(pixelssum3, pixelssum4);
temp4 = vec_sra(temp4, vctwo);
temp3 = vec_add(pixelssum1, pixelssum2);
temp3 = vec_sra(temp3, vctwo);
pixelssum3 = vec_add(pixelssum4, vcone);
pixelssum1 = vec_add(pixelssum2, vcone);
blockv = vec_packsu(temp3, temp4);
VEC_ST(blockv, 0, block);
block += line_size;
pixels += line_size;
}
}
/* next one assumes that ((line_size % 8) == 0) */
static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
register int i;
register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
register vector unsigned char blockv, blocktemp;
register vector unsigned short pixelssum1, pixelssum2, temp3;
register const vector unsigned char vczero = (const vector unsigned char)
vec_splat_u8(0);
register const vector unsigned short vctwo = (const vector unsigned short)
vec_splat_u16(2);
pixelsv1 = VEC_LD(0, pixels);
pixelsv2 = VEC_LD(1, pixels);
pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
pixelssum1 = vec_add((vector unsigned short)pixelsv1,
(vector unsigned short)pixelsv2);
pixelssum1 = vec_add(pixelssum1, vctwo);
for (i = 0; i < h ; i++) {
int rightside = ((unsigned long)block & 0x0000000F);
blockv = vec_ld(0, block);
pixelsv1 = unaligned_load(line_size, pixels);
pixelsv2 = unaligned_load(line_size+1, pixels);
pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
pixelssum2 = vec_add((vector unsigned short)pixelsv1,
(vector unsigned short)pixelsv2);
temp3 = vec_add(pixelssum1, pixelssum2);
temp3 = vec_sra(temp3, vctwo);
pixelssum1 = vec_add(pixelssum2, vctwo);
pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
if (rightside) {
blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
} else {
blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
}
blockv = vec_avg(blocktemp, blockv);
vec_st(blockv, 0, block);
block += line_size;
pixels += line_size;
}
}
#endif /* HAVE_ALTIVEC */
/**
 * Install the AltiVec half-pel routines into an HpelDSPContext.
 *
 * Nothing is changed when the file is built without HAVE_ALTIVEC or when
 * PPC_ALTIVEC(av_get_cpu_flags()) is false. Otherwise the table entries
 * listed below are overwritten; all other entries are left untouched.
 *
 * @param c     context whose function tables are updated
 * @param flags not used by this function
 */
av_cold void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags)
{
#if HAVE_ALTIVEC
    if (PPC_ALTIVEC(av_get_cpu_flags())) {
        /* 16-pixel-wide routines go into row [0] of each table. */
        c->put_pixels_tab[0][0]        = ff_put_pixels16_altivec;
        c->put_pixels_tab[0][3]        = put_pixels16_xy2_altivec;
        c->avg_pixels_tab[0][0]        = ff_avg_pixels16_altivec;
        /* The plain 16-pixel copy is shared by the no_rnd table. */
        c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_altivec;
        c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;

        /* 8-pixel-wide routines go into row [1] of each table. */
        c->put_pixels_tab[1][3]        = put_pixels8_xy2_altivec;
        c->avg_pixels_tab[1][0]        = avg_pixels8_altivec;
        c->avg_pixels_tab[1][3]        = avg_pixels8_xy2_altivec;
        c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
    }
#endif /* HAVE_ALTIVEC */
}

View File

@@ -0,0 +1,34 @@
/*
* Copyright (c) 2002 Brian Foley
* Copyright (c) 2002 Dieter Shirley
* Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_PPC_HPELDSP_ALTIVEC_H
#define AVCODEC_PPC_HPELDSP_ALTIVEC_H
#include <stddef.h>
#include <stdint.h>
/**
 * Average h rows of 16 source bytes into the destination (AltiVec).
 *
 * The implementation notes that line_size is assumed to be a multiple of 16.
 *
 * @param block     destination pixels, read and then overwritten
 * @param pixels    source pixels
 * @param line_size byte offset between consecutive rows, applied to both
 *                  block and pixels
 * @param h         number of rows
 */
void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
/**
 * Copy rows of 16 source bytes to the destination (AltiVec).
 *
 * The implementation processes four rows per loop iteration and notes that
 * line_size is assumed to be a multiple of 16.
 *
 * @param block     destination pixels
 * @param pixels    source pixels
 * @param line_size byte offset between consecutive rows, applied to both
 *                  block and pixels
 * @param h         number of rows
 */
void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
#endif /* AVCODEC_PPC_HPELDSP_ALTIVEC_H */

View File

@@ -0,0 +1,62 @@
/*
* Copyright (c) 2002 Brian Foley
* Copyright (c) 2002 Dieter Shirley
* Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/huffyuvdsp.h"
#if HAVE_ALTIVEC
/**
 * Add src to dst byte by byte: dst[i] += src[i] for 0 <= i < w.
 *
 * Whole groups of 16 bytes are handled with vec_ld()/vec_add()/vec_st();
 * any remaining bytes (w not a multiple of 16) are handled by a scalar loop.
 * Both loops wrap around modulo 256.
 *
 * @param dst destination, also the first addend; the original author states
 *            it is guaranteed to be 16-byte aligned
 * @param src second addend; stated to be 16-byte aligned as well
 * @param w   number of bytes to process
 */
static void add_bytes_altivec(uint8_t *dst, uint8_t *src, intptr_t w)
{
    register int i;
    register vector unsigned char vdst, vsrc;

    /* dst and src are 16 bytes-aligned (guaranteed). */
    for (i = 0; i + 15 < w; i += 16) {
        vdst = vec_ld(i, (unsigned char *) dst);
        vsrc = vec_ld(i, (unsigned char *) src);
        vdst = vec_add(vsrc, vdst);
        vec_st(vdst, i, (unsigned char *) dst);
    }
    /* If w is not a multiple of 16, finish the tail one byte at a time.
     * The tail has to accumulate exactly like the vector loop above; a plain
     * assignment would replace the last bytes of dst instead of adding. */
    for (; i < w; i++)
        dst[i] += src[i];
}
#endif /* HAVE_ALTIVEC */
/**
 * Install the AltiVec add_bytes routine into a HuffYUVDSPContext.
 *
 * Nothing is changed when the file is built without HAVE_ALTIVEC or when
 * PPC_ALTIVEC(av_get_cpu_flags()) is false.
 *
 * @param c context whose add_bytes pointer is updated
 */
av_cold void ff_huffyuvdsp_init_ppc(HuffYUVDSPContext *c)
{
#if HAVE_ALTIVEC
    if (PPC_ALTIVEC(av_get_cpu_flags())) {
        c->add_bytes = add_bytes_altivec;
    }
#endif /* HAVE_ALTIVEC */
}

View File

@@ -0,0 +1,273 @@
/*
* Copyright (c) 2001 Michel Lespinasse
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/* NOTE: This code is based on GPL code from the libmpeg2 project. The
* author, Michel Lespinasse, has given explicit permission to release
* under LGPL as part of FFmpeg.
*
* FFmpeg integration by Dieter Shirley
*
* This file is a direct copy of the AltiVec IDCT module from the libmpeg2
* project. I've deleted all of the libmpeg2-specific code, renamed the
* functions and reordered the function parameters. The only change to the
* IDCT function itself was to factor out the partial transposition, and to
* perform a full transpose at the end of the function. */
#include <stdlib.h>
#include <string.h>
#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavcodec/idctdsp.h"
#if HAVE_ALTIVEC
/* IDCT_HALF: one pass of the transform over eight vectors.
 *
 * Inputs (must be in scope at the expansion site): vx0..vx7, the splatted
 * coefficients a0, a1, a2, ma2, c4, mc4 and the all-zero vector `zero`.
 * Outputs: vy0..vy7. Scratch: t0..t8, which are reused between stages, so
 * the statement order below must not be changed.
 *
 * All additions and subtractions use the saturating vec_adds()/vec_subs();
 * the coefficient multiplications use vec_mradds(). The macro expands to a
 * sequence of statements without a trailing semicolon on the last one. */
#define IDCT_HALF \
/* 1st stage */ \
t1 = vec_mradds(a1, vx7, vx1); \
t8 = vec_mradds(a1, vx1, vec_subs(zero, vx7)); \
t7 = vec_mradds(a2, vx5, vx3); \
t3 = vec_mradds(ma2, vx3, vx5); \
\
/* 2nd stage */ \
t5 = vec_adds(vx0, vx4); \
t0 = vec_subs(vx0, vx4); \
t2 = vec_mradds(a0, vx6, vx2); \
t4 = vec_mradds(a0, vx2, vec_subs(zero, vx6)); \
t6 = vec_adds(t8, t3); \
t3 = vec_subs(t8, t3); \
t8 = vec_subs(t1, t7); \
t1 = vec_adds(t1, t7); \
\
/* 3rd stage */ \
t7 = vec_adds(t5, t2); \
t2 = vec_subs(t5, t2); \
t5 = vec_adds(t0, t4); \
t0 = vec_subs(t0, t4); \
t4 = vec_subs(t8, t3); \
t3 = vec_adds(t8, t3); \
\
/* 4th stage */ \
vy0 = vec_adds(t7, t1); \
vy7 = vec_subs(t7, t1); \
vy1 = vec_mradds(c4, t3, t5); \
vy6 = vec_mradds(mc4, t3, t5); \
vy2 = vec_mradds(c4, t4, t0); \
vy5 = vec_mradds(mc4, t4, t0); \
vy3 = vec_adds(t2, t6); \
vy4 = vec_subs(t2, t6)
/* IDCT: full 8x8 transform, expanded inside a function body.
 *
 * Requires in scope: `block` (pointer to eight vec_s16 rows) and the
 * `constants` table. Declares every local it needs (vy0..vy7, t0..t8, the
 * splatted coefficients, bias, zero, shift, vx0..vx7), so it can be expanded
 * only once per scope and must come after the declaration of `block`.
 *
 * Steps, in order:
 *  1. splat the coefficients and the bias out of constants[0];
 *  2. load each row as (block[k] << 4) scaled by a constants[] row through
 *     vec_mradds();
 *  3. IDCT_HALF (first pass);
 *  4. three rounds of vec_mergeh()/vec_mergel(), i.e. an 8x8 transpose of
 *     the 16-bit elements; the bias is added to vx0 only;
 *  5. IDCT_HALF (second pass);
 *  6. arithmetic shift right by 6 of vy0..vy7 into vx0..vx7.
 *
 * The result is left in vx0..vx7 for the caller to store. */
#define IDCT \
vec_s16 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; \
vec_s16 t0, t1, t2, t3, t4, t5, t6, t7, t8; \
\
vec_s16 c4 = vec_splat(constants[0], 0); \
vec_s16 a0 = vec_splat(constants[0], 1); \
vec_s16 a1 = vec_splat(constants[0], 2); \
vec_s16 a2 = vec_splat(constants[0], 3); \
vec_s16 mc4 = vec_splat(constants[0], 4); \
vec_s16 ma2 = vec_splat(constants[0], 5); \
vec_s16 bias = (vec_s16) vec_splat((vec_s32) constants[0], 3); \
\
vec_s16 zero = vec_splat_s16(0); \
vec_u16 shift = vec_splat_u16(4); \
\
vec_s16 vx0 = vec_mradds(vec_sl(block[0], shift), constants[1], zero); \
vec_s16 vx1 = vec_mradds(vec_sl(block[1], shift), constants[2], zero); \
vec_s16 vx2 = vec_mradds(vec_sl(block[2], shift), constants[3], zero); \
vec_s16 vx3 = vec_mradds(vec_sl(block[3], shift), constants[4], zero); \
vec_s16 vx4 = vec_mradds(vec_sl(block[4], shift), constants[1], zero); \
vec_s16 vx5 = vec_mradds(vec_sl(block[5], shift), constants[4], zero); \
vec_s16 vx6 = vec_mradds(vec_sl(block[6], shift), constants[3], zero); \
vec_s16 vx7 = vec_mradds(vec_sl(block[7], shift), constants[2], zero); \
\
IDCT_HALF; \
\
vx0 = vec_mergeh(vy0, vy4); \
vx1 = vec_mergel(vy0, vy4); \
vx2 = vec_mergeh(vy1, vy5); \
vx3 = vec_mergel(vy1, vy5); \
vx4 = vec_mergeh(vy2, vy6); \
vx5 = vec_mergel(vy2, vy6); \
vx6 = vec_mergeh(vy3, vy7); \
vx7 = vec_mergel(vy3, vy7); \
\
vy0 = vec_mergeh(vx0, vx4); \
vy1 = vec_mergel(vx0, vx4); \
vy2 = vec_mergeh(vx1, vx5); \
vy3 = vec_mergel(vx1, vx5); \
vy4 = vec_mergeh(vx2, vx6); \
vy5 = vec_mergel(vx2, vx6); \
vy6 = vec_mergeh(vx3, vx7); \
vy7 = vec_mergel(vx3, vx7); \
\
vx0 = vec_adds(vec_mergeh(vy0, vy4), bias); \
vx1 = vec_mergel(vy0, vy4); \
vx2 = vec_mergeh(vy1, vy5); \
vx3 = vec_mergel(vy1, vy5); \
vx4 = vec_mergeh(vy2, vy6); \
vx5 = vec_mergel(vy2, vy6); \
vx6 = vec_mergeh(vy3, vy7); \
vx7 = vec_mergel(vy3, vy7); \
\
IDCT_HALF; \
\
shift = vec_splat_u16(6); \
vx0 = vec_sra(vy0, shift); \
vx1 = vec_sra(vy1, shift); \
vx2 = vec_sra(vy2, shift); \
vx3 = vec_sra(vy3, shift); \
vx4 = vec_sra(vy4, shift); \
vx5 = vec_sra(vy5, shift); \
vx6 = vec_sra(vy6, shift); \
vx7 = vec_sra(vy7, shift)
/* Coefficient table read by the IDCT macro.
 *
 * constants[0]: 16-bit elements 0..5 are splatted into c4, a0, a1, a2, mc4
 *               and ma2 respectively; 32-bit element 3 (the halfword pair
 *               32, 31) is splatted into the bias vector.
 * constants[1]: input scale for rows 0 and 4.
 * constants[2]: input scale for rows 1 and 7.
 * constants[3]: input scale for rows 2 and 6.
 * constants[4]: input scale for rows 3 and 5. */
static const vec_s16 constants[5] = {
{ 23170, 13573, 6518, 21895, -23170, -21895, 32, 31 },
{ 16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725 },
{ 22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521 },
{ 21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692 },
{ 19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722 }
};
/**
 * In-place 8x8 inverse DCT of a coefficient block.
 *
 * @param blk 64 coefficients, accessed as eight vec_s16 rows; it is read by
 *            the IDCT macro and overwritten with the transformed rows.
 *            NOTE(review): the vec_s16 cast implies blk must be suitably
 *            aligned for vector access -- presumably 16 bytes; confirm.
 */
static void idct_altivec(int16_t *blk)
{
/* The IDCT macro refers to this local by the exact name `block`. */
vec_s16 *block = (vec_s16 *) blk;
/* Declares vx0..vx7 and leaves the transformed rows in them. */
IDCT;
block[0] = vx0;
block[1] = vx1;
block[2] = vx2;
block[3] = vx3;
block[4] = vx4;
block[5] = vx5;
block[6] = vx6;
block[7] = vx7;
}
/**
 * 8x8 inverse DCT whose result is written to dest as unsigned bytes.
 *
 * @param dest   first of eight output rows of 8 bytes each
 * @param stride byte offset between consecutive output rows
 * @param blk    64 input coefficients, accessed as eight vec_s16 rows
 */
static void idct_put_altivec(uint8_t *dest, int stride, int16_t *blk)
{
/* The IDCT macro refers to this local by the exact name `block`. */
vec_s16 *block = (vec_s16 *) blk;
vec_u8 tmp;
/* Declares vx0..vx7 and leaves the transformed rows in them. */
IDCT;
/* COPY: pack one row to bytes with unsigned saturation (the row is packed
 * twice, so both halves of tmp hold the same 8 bytes) and store it as two
 * 32-bit elements at byte offsets 0 and 4 of dest. */
#define COPY(dest, src) \
tmp = vec_packsu(src, src); \
vec_ste((vec_u32) tmp, 0, (unsigned int *) dest); \
vec_ste((vec_u32) tmp, 4, (unsigned int *) dest)
COPY(dest, vx0);
dest += stride;
COPY(dest, vx1);
dest += stride;
COPY(dest, vx2);
dest += stride;
COPY(dest, vx3);
dest += stride;
COPY(dest, vx4);
dest += stride;
COPY(dest, vx5);
dest += stride;
COPY(dest, vx6);
dest += stride;
COPY(dest, vx7);
}
/**
 * 8x8 inverse DCT whose result is added to the bytes already in dest.
 *
 * For each row the existing 8 destination bytes are widened to 16 bits,
 * added to the transformed row with saturation (vec_adds), packed back to
 * bytes with unsigned saturation (vec_packsu) and stored.
 *
 * @param dest   first of eight rows of 8 bytes each, read and overwritten
 * @param stride byte offset between consecutive rows
 * @param blk    64 input coefficients, accessed as eight vec_s16 rows
 */
static void idct_add_altivec(uint8_t *dest, int stride, int16_t *blk)
{
/* The IDCT macro refers to this local by the exact name `block`. */
vec_s16 *block = (vec_s16 *) blk;
vec_u8 tmp;
vec_s16 tmp2, tmp3;
vec_u8 perm0;
vec_u8 perm1;
vec_u8 p0, p1, p;
/* Declares vx0..vx7 (and `zero`) and leaves the transformed rows in vx0..vx7. */
IDCT;
/* Big-endian only: build the permute vectors that widen the destination
 * bytes. perm0 is derived from the alignment of dest, perm1 from the
 * alignment of dest + stride; they are then used for even and odd rows.
 * NOTE(review): this presumably relies on dest + 2 * stride having the same
 * alignment as dest -- confirm. On little-endian these five vectors are
 * never assigned and the GET_TMP2 variant below ignores its prm argument. */
#if HAVE_BIGENDIAN
p0 = vec_lvsl(0, dest);
p1 = vec_lvsl(stride, dest);
p = vec_splat_u8(-1);
perm0 = vec_mergeh(p, p0);
perm1 = vec_mergeh(p, p1);
#endif
/* GET_TMP2: load the destination row into tmp and widen it into tmp2.
 * The big-endian variant uses vec_ld() plus the permute vector; the other
 * variant uses vec_vsx_ld() plus vec_mergeh() with zero. Note that the
 * big-endian variant ends with a semicolon of its own. */
#if HAVE_BIGENDIAN
#define GET_TMP2(dest, prm) \
tmp = vec_ld(0, dest); \
tmp2 = (vec_s16) vec_perm(tmp, (vec_u8) zero, prm);
#else
#define GET_TMP2(dest, prm) \
tmp = vec_vsx_ld(0, dest); \
tmp2 = (vec_s16) vec_mergeh(tmp, (vec_u8) zero)
#endif
/* ADD: dest row + src with saturation, stored as two 32-bit elements at
 * byte offsets 0 and 4 of dest. */
#define ADD(dest, src, perm) \
GET_TMP2(dest, perm); \
tmp3 = vec_adds(tmp2, src); \
tmp = vec_packsu(tmp3, tmp3); \
vec_ste((vec_u32) tmp, 0, (unsigned int *) dest); \
vec_ste((vec_u32) tmp, 4, (unsigned int *) dest)
ADD(dest, vx0, perm0);
dest += stride;
ADD(dest, vx1, perm1);
dest += stride;
ADD(dest, vx2, perm0);
dest += stride;
ADD(dest, vx3, perm1);
dest += stride;
ADD(dest, vx4, perm0);
dest += stride;
ADD(dest, vx5, perm1);
dest += stride;
ADD(dest, vx6, perm0);
dest += stride;
ADD(dest, vx7, perm1);
}
#endif /* HAVE_ALTIVEC */
/**
 * Install the AltiVec IDCT routines into an IDCTDSPContext.
 *
 * The routines are installed only when all of the following hold:
 *  - the file is built with HAVE_ALTIVEC and PPC_ALTIVEC(av_get_cpu_flags())
 *    is true;
 *  - high_bit_depth is zero and avctx->lowres is 0;
 *  - avctx->idct_algo is FF_IDCT_ALTIVEC, or it is FF_IDCT_AUTO and
 *    AV_CODEC_FLAG_BITEXACT is not set in avctx->flags.
 * In every other case the context is left untouched.
 *
 * @param c              context to update (idct, idct_add, idct_put and
 *                       perm_type are written)
 * @param avctx          codec context supplying lowres, idct_algo and flags
 * @param high_bit_depth non-zero disables the AltiVec routines
 */
av_cold void ff_idctdsp_init_ppc(IDCTDSPContext *c, AVCodecContext *avctx,
                                 unsigned high_bit_depth)
{
#if HAVE_ALTIVEC
    if (!PPC_ALTIVEC(av_get_cpu_flags()))
        return;

    if (high_bit_depth || avctx->lowres != 0)
        return;

    if (avctx->idct_algo == FF_IDCT_ALTIVEC ||
        (avctx->idct_algo == FF_IDCT_AUTO &&
         !(avctx->flags & AV_CODEC_FLAG_BITEXACT))) {
        c->idct      = idct_altivec;
        c->idct_put  = idct_put_altivec;
        c->idct_add  = idct_add_altivec;
        c->perm_type = FF_IDCT_PERM_TRANSPOSE;
    }
#endif /* HAVE_ALTIVEC */
}

View File

@@ -0,0 +1,93 @@
/*
* Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavcodec/lossless_audiodsp.h"
/* GET_T(tt0, tt1, src, a, b): fetch 32 bytes starting at src into tt0 and tt1.
 *
 * Big-endian variant: built from aligned vec_ld() loads combined with
 * vec_perm() and a permute vector named `align`, which must be in scope.
 * On entry b must already hold vec_ld(0, src); on exit b holds
 * vec_ld(32, src), so it can be reused once src has advanced by 32 bytes.
 * a is used as scratch.
 *
 * Other variant: two vec_vsx_ld() loads at byte offsets 0 and 16; the a and
 * b arguments are not used. */
#if HAVE_BIGENDIAN
#define GET_T(tt0,tt1,src,a,b){ \
a = vec_ld(16, src); \
tt0 = vec_perm(b, a, align); \
b = vec_ld(32, src); \
tt1 = vec_perm(a, b, align); \
}
#else
#define GET_T(tt0,tt1,src,a,b){ \
tt0 = vec_vsx_ld(0, src); \
tt1 = vec_vsx_ld(16, src); \
}
#endif
#if HAVE_ALTIVEC
/**
 * Combined dot product and multiply-add over int16 vectors.
 *
 * Accumulates the sum of v1[i] * v2[i] using the values of v1 from before
 * the update, and replaces v1[i] with v3[i] * mul + v1[i] (vec_mladd).
 * Sixteen elements are processed per loop iteration.
 *
 * @param v1    updated in place; accessed as vec_s16, so it must be suitably
 *              aligned for vector access
 * @param v2    multiplied with v1 for the returned sum
 * @param v3    scaled by mul and added to v1
 * @param order number of elements; it is divided by 16 to get the iteration
 *              count
 * @param mul   multiplier applied to v3, replicated into all eight 16-bit
 *              lanes
 * @return the accumulated sum, as produced by vec_msum() and vec_sums()
 *
 * NOTE(review): the loop is do/while with a pre-decrement, so order < 16
 * (zero iterations requested) does not stop after the first pass --
 * presumably callers always pass a positive multiple of 16; confirm.
 */
static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1,
const int16_t *v2,
const int16_t *v3,
int order, int mul)
{
LOAD_ZERO;
vec_s16 *pv1 = (vec_s16 *) v1;
register vec_s16 muls = { mul, mul, mul, mul, mul, mul, mul, mul };
register vec_s16 t0, t1, i0, i1, i4, i2, i3;
register vec_s32 res = zero_s32v;
/* Big-endian only: preload the first aligned vector of v2 and v3 for GET_T.
 * NOTE(review): `align` is derived from v2 alone but GET_T also uses it for
 * v3 -- this presumably assumes v2 and v3 share the same misalignment;
 * confirm. */
#if HAVE_BIGENDIAN
register vec_u8 align = vec_lvsl(0, v2);
i2 = vec_ld(0, v2);
i3 = vec_ld(0, v3);
#endif
int32_t ires;
order >>= 4;
do {
/* t0/t1 = 16 elements of v2; i1 is only scratch here and is reloaded below. */
GET_T(t0,t1,v2,i1,i2);
i0 = pv1[0];
i1 = pv1[1];
res = vec_msum(t0, i0, res);
res = vec_msum(t1, i1, res);
/* t0/t1 = 16 elements of v3. */
GET_T(t0,t1,v3,i4,i3);
pv1[0] = vec_mladd(t0, muls, i0);
pv1[1] = vec_mladd(t1, muls, i1);
pv1 += 2;
v2 += 16;
v3 += 16;
} while (--order);
/* Reduce the four partial sums to one value and copy it out to memory. */
res = vec_splat(vec_sums(res, zero_s32v), 3);
vec_ste(res, 0, &ires);
return ires;
}
#endif /* HAVE_ALTIVEC */
/**
 * Install the AltiVec scalarproduct_and_madd_int16 routine into an
 * LLAudDSPContext.
 *
 * Nothing is changed when the file is built without HAVE_ALTIVEC or when
 * PPC_ALTIVEC(av_get_cpu_flags()) is false.
 *
 * @param c context whose scalarproduct_and_madd_int16 pointer is updated
 */
av_cold void ff_llauddsp_init_ppc(LLAudDSPContext *c)
{
#if HAVE_ALTIVEC
    if (PPC_ALTIVEC(av_get_cpu_flags())) {
        c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec;
    }
#endif /* HAVE_ALTIVEC */
}

View File

@@ -0,0 +1,79 @@
/*
* simple math operations
* Copyright (c) 2001, 2002 Fabrice Bellard
* Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_PPC_MATHOPS_H
#define AVCODEC_PPC_MATHOPS_H
#include <stdint.h>
#include "config.h"
#include "libavutil/common.h"
#if HAVE_PPC4XX
/* signed 16x16 -> 32 multiply add accumulate, using the maclhw instruction.
 * rt is both an input and the output (tied through the "0" constraint).
 * Note that the expansion already ends with a semicolon. */
#define MAC16(rt, ra, rb) \
__asm__ ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb));
/* signed 16x16 -> 32 multiply, using the mullhw instruction.
 * Written as a statement expression whose value is the int result. */
#define MUL16(ra, rb) \
({ int __rt; \
__asm__ ("mullhw %0, %1, %2" : "=r" (__rt) : "r" (ra), "r" (rb)); \
__rt; })
#endif
/* Self-referential define: lets other code detect with #ifdef that an
 * architecture-specific MULH is provided here. */
#define MULH MULH
/**
 * Return the result of the PowerPC mulhw instruction applied to a and b.
 *
 * @param a first factor
 * @param b second factor
 * @return the value mulhw leaves in its target register
 */
static inline av_const int MULH(int a, int b){
    int hi;

    __asm__ ("mulhw %0, %1, %2" : "=r"(hi) : "r"(a), "r"(b));

    return hi;
}
#if !ARCH_PPC64
/**
 * 64-bit multiply-accumulate for builds where ARCH_PPC64 is not set:
 * returns d plus the product of a and b.
 *
 * The 64-bit accumulator is split into two 32-bit words through a union.
 * mullw/mulhw compute the low and high words of the product; addc adds the
 * low words and adde adds the high words plus the carry from addc.
 *
 * NOTE(review): hl[0] is treated as the high word and hl[1] as the low word,
 * which matches a big-endian layout of the uint64_t -- presumably this path
 * is only built big-endian; confirm.
 *
 * @param d accumulator value
 * @param a first factor
 * @param b second factor
 * @return the updated accumulator
 */
static inline av_const int64_t MAC64(int64_t d, int a, int b)
{
union { uint64_t x; unsigned hl[2]; } x = { d };
int h, l;
/* Operands: %0 = hl[0], %1 = hl[1], %2 = h, %3 = l, %4 = a, %5 = b. */
__asm__ ("mullw %3, %4, %5 \n\t"
"mulhw %2, %4, %5 \n\t"
"addc %1, %1, %3 \n\t"
"adde %0, %0, %2 \n\t"
: "+r"(x.hl[0]), "+r"(x.hl[1]), "=&r"(h), "=&r"(l)
: "r"(a), "r"(b));
return x.x;
}
/* Macro form: stores the result of the function above back into d. */
#define MAC64(d, a, b) ((d) = MAC64(d, a, b))
/**
 * 64-bit multiply-subtract for builds where ARCH_PPC64 is not set:
 * returns d minus the product of a and b.
 *
 * The 64-bit accumulator is split into two 32-bit words through a union.
 * mullw/mulhw compute the low and high words of the product; subfc subtracts
 * the low words and subfe subtracts the high words using the carry left by
 * subfc.
 *
 * NOTE(review): hl[0] is treated as the high word and hl[1] as the low word,
 * which matches a big-endian layout of the uint64_t -- presumably this path
 * is only built big-endian; confirm.
 *
 * @param d accumulator value
 * @param a first factor
 * @param b second factor
 * @return the updated accumulator
 */
static inline av_const int64_t MLS64(int64_t d, int a, int b)
{
union { uint64_t x; unsigned hl[2]; } x = { d };
int h, l;
/* Operands: %0 = hl[0], %1 = hl[1], %2 = h, %3 = l, %4 = a, %5 = b. */
__asm__ ("mullw %3, %4, %5 \n\t"
"mulhw %2, %4, %5 \n\t"
"subfc %1, %3, %1 \n\t"
"subfe %0, %2, %0 \n\t"
: "+r"(x.hl[0]), "+r"(x.hl[1]), "=&r"(h), "=&r"(l)
: "r"(a), "r"(b));
return x.x;
}
/* Macro form: stores the result of the function above back into d. */
#define MLS64(d, a, b) ((d) = MLS64(d, a, b))
#endif
#endif /* AVCODEC_PPC_MATHOPS_H */

View File

@@ -0,0 +1,749 @@
/*
* Copyright (c) 2002 Brian Foley
* Copyright (c) 2002 Dieter Shirley
* Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/me_cmp.h"
#if HAVE_ALTIVEC
/* Helpers for reading 16 bytes at pix and the 16 bytes at pix + 1.
 *
 * GET_PERM(per1, per2, pix): prepares the permute vectors. Big-endian: per1
 * is vec_lvsl(0, pix) and per2 is per1 with 1 added to every byte. The other
 * variant expands to an empty block and leaves per1/per2 untouched.
 *
 * LOAD_PIX(v, iv, pix, per1, per2): v receives the bytes at pix, iv the bytes
 * at pix + 1. Big-endian: two aligned loads (offsets 0 and 16) are combined
 * with vec_perm() through per1 and per2; the block declares locals named
 * pix2l and pix2r, so arguments must not use those names. The other variant
 * uses vec_vsx_ld() at offsets 0 and 1 and ignores per1/per2. */
#if HAVE_BIGENDIAN
#define GET_PERM(per1, per2, pix) {\
per1 = vec_lvsl(0, pix);\
per2 = vec_add(per1, vec_splat_u8(1));\
}
#define LOAD_PIX(v, iv, pix, per1, per2) {\
vector unsigned char pix2l = vec_ld(0, pix);\
vector unsigned char pix2r = vec_ld(16, pix);\
v = vec_perm(pix2l, pix2r, per1);\
iv = vec_perm(pix2l, pix2r, per2);\
}
#else
#define GET_PERM(per1, per2, pix) {}
#define LOAD_PIX(v, iv, pix, per1, per2) {\
v = vec_vsx_ld(0, pix);\
iv = vec_vsx_ld(1, pix);\
}
#endif
/**
 * Sum of absolute differences between a 16-pixel-wide block and the
 * horizontal half-pel interpolation of a reference block.
 *
 * For each row, the reference value of a pixel is vec_avg() of pix2[x] and
 * pix2[x + 1]; the absolute difference to pix1[x] is accumulated over all
 * 16 columns and h rows.
 *
 * @param v      not used by this function
 * @param pix1   current block; loaded with the aligned vec_ld()
 * @param pix2   reference block; bytes 0..16 of each row are read
 * @param stride byte offset between consecutive rows, applied to both blocks
 * @param h      number of rows
 * @return the accumulated sum
 */
static int sad16_x2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                            ptrdiff_t stride, int h)
{
    int __attribute__((aligned(16))) s = 0;
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);
    vector unsigned int acc = (vector unsigned int) vec_splat_u32(0);
    vector unsigned char perm_a, perm_b, refv, ref_nextv;
    vector signed int total;
    int row;

    /* The permute vectors are derived once, from the initial pix2. */
    GET_PERM(perm_a, perm_b, pix2);

    for (row = 0; row < h; row++) {
        /* curv:      pix1[0] - pix1[15]
         * refv:      pix2[0] - pix2[15]
         * ref_nextv: pix2[1] - pix2[16] */
        vector unsigned char curv = vec_ld(0, pix1);
        vector unsigned char avgv, absdiff;

        LOAD_PIX(refv, ref_nextv, pix2, perm_a, perm_b);

        avgv = vec_avg(refv, ref_nextv);

        /* |cur - avg| computed as max - min on unsigned bytes. */
        absdiff = vec_sub(vec_max(curv, avgv), vec_min(curv, avgv));

        /* Fold each group of 4 byte differences into the 4 partial sums. */
        acc = vec_sum4s(absdiff, acc);

        pix1 += stride;
        pix2 += stride;
    }

    /* Reduce the four partial sums and copy the result out to s. */
    total = vec_sums((vector signed int) acc, (vector signed int) zero);
    total = vec_splat(total, 3);
    vec_ste(total, 0, &s);

    return s;
}
/**
 * Sum of absolute differences between a 16-pixel-wide block and the
 * vertical half-pel interpolation of a reference block.
 *
 * For each of the h rows, the reference value is vec_avg() of a reference
 * row and the row one stride below it. The lower row of one iteration is
 * carried over as the upper row of the next, so only one reference row is
 * loaded per iteration.
 *
 * @param v      unused
 * @param pix1   current block, loaded with vec_ld, advanced by stride per row
 * @param pix2   reference block (rows pix2 .. pix2 + h * stride are read)
 * @param stride distance in bytes between rows
 * @param h      number of rows
 * @return the accumulated sum of absolute differences
 */
static int sad16_y2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                            ptrdiff_t stride, int h)
{
    int __attribute__((aligned(16))) s = 0;
    const vector signed int vzero = (const vector signed int) vec_splat_s32(0);
    vector unsigned int acc = (vector unsigned int) vec_splat_u32(0);
    vector unsigned char cur, below, avg, absdiff;
    vector signed int total;
    uint8_t *next_row = pix2 + stride;
    /* First reference row: pix2[0..15]. */
    vector unsigned char above = VEC_LD(0, pix2);
    int row;

    for (row = 0; row < h; row++, pix1 += stride, next_row += stride) {
        /* cur: pix1[0..15], below: next_row[0..15] */
        cur   = vec_ld(0, pix1);
        below = VEC_LD(0, next_row);

        /* Half-pel sample between vertically adjacent pixels. */
        avg = vec_avg(above, below);

        /* |cur - avg| on unsigned bytes: max - min never wraps. */
        absdiff = vec_sub(vec_max(cur, avg), vec_min(cur, avg));

        /* Fold each group of 4 bytes into one of the 4 word accumulators. */
        acc = vec_sum4s(absdiff, acc);

        /* This iteration's lower row is the next iteration's upper row. */
        above = below;
    }

    /* Reduce the four partial sums into element 3 and store it to s. */
    total = vec_sums((vector signed int) acc, vzero);
    total = vec_splat(total, 3);
    vec_ste(total, 0, &s);

    return s;
}
/**
 * Sum of absolute differences between a 16-pixel-wide block and the
 * diagonal (x and y) half-pel interpolation of a reference block.
 *
 * Each reference sample is (a + b + c + d + 2) >> 2 over a 2x2 pixel
 * neighbourhood. vec_avg() is not used because it works on pairs and rounds
 * up, so avg(avg(a, b), avg(c, d)) would give e.g. 2 for (3, 0, 0, 1) where
 * 1 is wanted; the bytes are widened to 16-bit lanes and averaged by hand.
 *
 * The horizontal pair sums of the lower row are carried into the next
 * iteration as the upper row's sums, so each reference row is loaded and
 * widened only once.
 *
 * @param v      unused
 * @param pix1   current block, loaded with vec_ld, advanced by stride per row
 * @param pix2   reference block (rows pix2 .. pix2 + h * stride are read)
 * @param stride distance in bytes between rows
 * @param h      number of rows
 * @return the accumulated sum of absolute differences
 */
static int sad16_xy2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                             ptrdiff_t stride, int h)
{
    int __attribute__((aligned(16))) s = 0;
    uint8_t *next_row = pix2 + stride;
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);
    const vector unsigned short two =
        (const vector unsigned short) vec_splat_u16(2);
    vector unsigned int acc = (vector unsigned int) vec_splat_u32(0);
    vector unsigned char perm1, perm2;
    vector unsigned char cur, row0, row0_next, row1, row1_next, avg, absdiff;
    vector unsigned short top_hi, top_lo, bot_hi, bot_lo, avg_hi, avg_lo;
    vector signed int total;
    int row;

    /* The permute vectors are derived once from pix2 and reused for all rows. */
    GET_PERM(perm1, perm2, pix2);

    /* Prime the pipeline with the first reference row:
     * row0: pix2[0..15], row0_next: pix2[1..16], widened to shorts and
     * summed pairwise (pix2[x] + pix2[x + 1]). */
    LOAD_PIX(row0, row0_next, pix2, perm1, perm2);
    top_hi = vec_add((vector unsigned short) VEC_MERGEH(zero, row0),
                     (vector unsigned short) VEC_MERGEH(zero, row0_next));
    top_lo = vec_add((vector unsigned short) VEC_MERGEL(zero, row0),
                     (vector unsigned short) VEC_MERGEL(zero, row0_next));

    for (row = 0; row < h; row++) {
        /* cur: pix1[0..15]; row1/row1_next: next_row[0..15] / [1..16] */
        cur = vec_ld(0, pix1);
        LOAD_PIX(row1, row1_next, next_row, perm1, perm2);

        /* Horizontal pair sums of the lower row, in 16-bit lanes. */
        bot_hi = vec_add((vector unsigned short) VEC_MERGEH(zero, row1),
                         (vector unsigned short) VEC_MERGEH(zero, row1_next));
        bot_lo = vec_add((vector unsigned short) VEC_MERGEL(zero, row1),
                         (vector unsigned short) VEC_MERGEL(zero, row1_next));

        /* (upper pair + lower pair + 2) >> 2, then narrow back to bytes. */
        avg_hi = vec_sr(vec_add(vec_add(top_hi, bot_hi), two), two);
        avg_lo = vec_sr(vec_add(vec_add(top_lo, bot_lo), two), two);
        avg    = vec_pack(avg_hi, avg_lo);

        /* |cur - avg| on unsigned bytes: max - min never wraps. */
        absdiff = vec_sub(vec_max(cur, avg), vec_min(cur, avg));

        /* Fold each group of 4 bytes into one of the 4 word accumulators. */
        acc = vec_sum4s(absdiff, acc);

        pix1     += stride;
        next_row += stride;

        /* The lower row's sums become the upper row's for the next pass. */
        top_hi = bot_hi;
        top_lo = bot_lo;
    }

    /* Reduce the four partial sums into element 3 and store it to s. */
    total = vec_sums((vector signed int) acc, (vector signed int) zero);
    total = vec_splat(total, 3);
    vec_ste(total, 0, &s);

    return s;
}
/**
 * Sum of absolute differences between two 16-pixel-wide blocks.
 *
 * @param v      unused
 * @param pix1   first block, loaded with vec_ld, advanced by stride per row
 * @param pix2   second block, loaded with VEC_LD, advanced by stride per row
 * @param stride distance in bytes between rows
 * @param h      number of rows
 * @return the accumulated sum of absolute differences
 */
static int sad16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h)
{
    /* Written by vec_ste() below before it is read. */
    int __attribute__((aligned(16))) s;
    const vector signed int vzero = (const vector signed int) vec_splat_s32(0);
    vector unsigned int acc = (vector unsigned int) vec_splat_u32(0);
    vector signed int total;
    int row;

    for (row = 0; row < h; row++, pix1 += stride, pix2 += stride) {
        const vector unsigned char a = vec_ld(0, pix1);
        const vector unsigned char b = VEC_LD(0, pix2);

        /* |a - b| on unsigned bytes: max - min never wraps. */
        const vector unsigned char absdiff =
            vec_sub(vec_max(a, b), vec_min(a, b));

        /* Fold each group of 4 bytes into one of the 4 word accumulators. */
        acc = vec_sum4s(absdiff, acc);
    }

    /* Reduce the four partial sums into element 3 and store it to s. */
    total = vec_sums((vector signed int) acc, vzero);
    total = vec_splat(total, 3);
    vec_ste(total, 0, &s);

    return s;
}
/**
 * Sum of absolute differences between two 8-pixel-wide blocks.
 *
 * Full 16-byte vectors are loaded for every row; the upper 8 bytes are
 * cleared with a mask on both operands so that they contribute 0 to the sum.
 *
 * @param v      unused
 * @param pix1   first block, advanced by stride per row
 * @param pix2   second block, advanced by stride per row
 * @param stride distance in bytes between rows
 * @param h      number of rows
 * @return the accumulated sum of absolute differences
 */
static int sad8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                        ptrdiff_t stride, int h)
{
    /* Written by vec_ste() below before it is read. */
    int __attribute__((aligned(16))) s;
    const vector signed int vzero = (const vector signed int) vec_splat_s32(0);
    /* Keeps bytes 0..7, zeroes bytes 8..15. */
    const vector unsigned char keep_low8 =
        (vector unsigned char)
        { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };
    vector unsigned int acc = (vector unsigned int) vec_splat_u32(0);
    vector signed int total;
    int row;

    for (row = 0; row < h; row++, pix1 += stride, pix2 += stride) {
        /* Load 16 bytes from each block and discard the last 8. */
        const vector unsigned char a = vec_and(VEC_LD(0, pix1), keep_low8);
        const vector unsigned char b = vec_and(VEC_LD(0, pix2), keep_low8);

        /* |a - b| on unsigned bytes: max - min never wraps. */
        const vector unsigned char absdiff =
            vec_sub(vec_max(a, b), vec_min(a, b));

        /* Fold each group of 4 bytes into one of the 4 word accumulators. */
        acc = vec_sum4s(absdiff, acc);
    }

    /* Reduce the four partial sums into element 3 and store it to s. */
    total = vec_sums((vector signed int) acc, vzero);
    total = vec_splat(total, 3);
    vec_ste(total, 0, &s);

    return s;
}
/**
 * Sum of squared errors between two 8-pixel-wide blocks, AltiVec version.
 * Same structure as sad8_altivec(), with the differences squared.
 *
 * Full 16-byte vectors are loaded for every row; the upper 8 bytes are
 * cleared with a mask on both operands so that they contribute 0 to the sum.
 *
 * @param v      unused
 * @param pix1   first block, advanced by stride per row
 * @param pix2   second block, advanced by stride per row
 * @param stride distance in bytes between rows
 * @param h      number of rows
 * @return the accumulated sum of squared differences
 */
static int sse8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                        ptrdiff_t stride, int h)
{
    /* Written by vec_ste() below before it is read. */
    int __attribute__((aligned(16))) s;
    const vector signed int vzero = (const vector signed int) vec_splat_s32(0);
    /* Keeps bytes 0..7, zeroes bytes 8..15. */
    const vector unsigned char keep_low8 =
        (vector unsigned char)
        { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };
    vector unsigned int acc = (vector unsigned int) vec_splat_u32(0);
    vector signed int total;
    int row;

    for (row = 0; row < h; row++, pix1 += stride, pix2 += stride) {
        /* Load 16 bytes from each block and discard the last 8. */
        const vector unsigned char a = vec_and(VEC_LD(0, pix1), keep_low8);
        const vector unsigned char b = vec_and(VEC_LD(0, pix2), keep_low8);

        /* abs(a - b)^2 == (a - b)^2, so the unsigned absolute difference
         * can be squared instead of a signed difference. */
        const vector unsigned char absdiff =
            vec_sub(vec_max(a, b), vec_min(a, b));

        /* Multiply the differences by themselves and accumulate. */
        acc = vec_msum(absdiff, absdiff, acc);
    }

    /* Reduce the four partial sums into element 3 and store it to s. */
    total = vec_sums((vector signed int) acc, vzero);
    total = vec_splat(total, 3);
    vec_ste(total, 0, &s);

    return s;
}
/**
 * Sum of squared errors between two 16-pixel-wide blocks, AltiVec version.
 * Same structure as sad16_altivec(), with the differences squared.
 *
 * @param v      unused
 * @param pix1   first block, loaded with vec_ld, advanced by stride per row
 * @param pix2   second block, loaded with VEC_LD, advanced by stride per row
 * @param stride distance in bytes between rows
 * @param h      number of rows
 * @return the accumulated sum of squared differences
 */
static int sse16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h)
{
    /* Written by vec_ste() below before it is read. */
    int __attribute__((aligned(16))) s;
    const vector signed int vzero = (const vector signed int) vec_splat_s32(0);
    vector unsigned int acc = (vector unsigned int) vec_splat_u32(0);
    vector signed int total;
    int row;

    for (row = 0; row < h; row++, pix1 += stride, pix2 += stride) {
        const vector unsigned char a = vec_ld(0, pix1);
        const vector unsigned char b = VEC_LD(0, pix2);

        /* abs(a - b)^2 == (a - b)^2, so the unsigned absolute difference
         * can be squared instead of a signed difference. */
        const vector unsigned char absdiff =
            vec_sub(vec_max(a, b), vec_min(a, b));

        /* Multiply the differences by themselves and accumulate. */
        acc = vec_msum(absdiff, absdiff, acc);
    }

    /* Reduce the four partial sums into element 3 and store it to s. */
    total = vec_sums((vector signed int) acc, vzero);
    total = vec_splat(total, 3);
    vec_ste(total, 0, &s);

    return s;
}
/*
 * Sum of the absolute values of an 8x8 butterfly (Hadamard-style, per the
 * function name) transform of the difference between two 8x8 blocks.
 *
 * Step 1 (ONEITERBUTTERFLY, once per row 0..7): load one row of src and dst
 * at byte offset stride * i, widen the first 8 bytes of each to 16-bit lanes,
 * subtract (src - dst) and run three butterfly stages inside the row. Each
 * stage pairs a lane with a partner lane fetched by vec_perm (partner
 * distance 1, 2, then 4 lanes) and uses vec_mladd with a +1/-1 vector so one
 * lane of the pair receives the sum and the other the difference.
 *
 * Step 2: three add/sub butterfly stages across the eight row vectors.
 *
 * Step 3: accumulate vec_abs() of all eight result vectors with vec_sum4s,
 * reduce with vec_sums and store the total into sum.
 *
 * The parameters s and h are not used by this function.
 * NOTE(review): unaligned_load() is defined outside this chunk; it is assumed
 * to return the 16 bytes at the given byte offset without alignment
 * requirements -- confirm.
 */
static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst,
uint8_t *src, ptrdiff_t stride, int h)
{
int __attribute__((aligned(16))) sum;
register const vector unsigned char vzero =
(const vector unsigned char) vec_splat_u8(0);
register vector signed short temp0, temp1, temp2, temp3, temp4,
temp5, temp6, temp7;
{
/* +1/-1 patterns selecting "sum" or "difference" per lane for the
 * three in-row butterfly stages. */
register const vector signed short vprod1 =
(const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 };
register const vector signed short vprod2 =
(const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 };
register const vector signed short vprod3 =
(const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 };
/* Byte permutes that swap 16-bit lanes with their butterfly partner:
 * perm1 swaps adjacent lanes, perm2 swaps adjacent lane pairs,
 * perm3 swaps the two halves of the vector. */
register const vector unsigned char perm1 =
(const vector unsigned char)
{ 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D };
register const vector unsigned char perm2 =
(const vector unsigned char)
{ 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B };
register const vector unsigned char perm3 =
(const vector unsigned char)
{ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
#define ONEITERBUTTERFLY(i, res) \
{ \
register vector unsigned char srcO = unaligned_load(stride * i, src); \
register vector unsigned char dstO = unaligned_load(stride * i, dst);\
\
/* Promote the unsigned chars to signed shorts. */ \
/* We're in the 8x8 function, we only care for the first 8. */ \
register vector signed short srcV = \
(vector signed short) VEC_MERGEH((vector signed char) vzero, \
(vector signed char) srcO); \
register vector signed short dstV = \
(vector signed short) VEC_MERGEH((vector signed char) vzero, \
(vector signed char) dstO); \
\
/* subtractions inside the first butterfly */ \
register vector signed short but0 = vec_sub(srcV, dstV); \
register vector signed short op1 = vec_perm(but0, but0, perm1); \
register vector signed short but1 = vec_mladd(but0, vprod1, op1); \
register vector signed short op2 = vec_perm(but1, but1, perm2); \
register vector signed short but2 = vec_mladd(but1, vprod2, op2); \
register vector signed short op3 = vec_perm(but2, but2, perm3); \
res = vec_mladd(but2, vprod3, op3); \
}
/* One transformed row per temp vector. */
ONEITERBUTTERFLY(0, temp0);
ONEITERBUTTERFLY(1, temp1);
ONEITERBUTTERFLY(2, temp2);
ONEITERBUTTERFLY(3, temp3);
ONEITERBUTTERFLY(4, temp4);
ONEITERBUTTERFLY(5, temp5);
ONEITERBUTTERFLY(6, temp6);
ONEITERBUTTERFLY(7, temp7);
}
#undef ONEITERBUTTERFLY
{
register vector signed int vsum;
/* Butterfly stage across rows, distance 1. */
register vector signed short line0 = vec_add(temp0, temp1);
register vector signed short line1 = vec_sub(temp0, temp1);
register vector signed short line2 = vec_add(temp2, temp3);
register vector signed short line3 = vec_sub(temp2, temp3);
register vector signed short line4 = vec_add(temp4, temp5);
register vector signed short line5 = vec_sub(temp4, temp5);
register vector signed short line6 = vec_add(temp6, temp7);
register vector signed short line7 = vec_sub(temp6, temp7);
/* Butterfly stage across rows, distance 2. */
register vector signed short line0B = vec_add(line0, line2);
register vector signed short line2B = vec_sub(line0, line2);
register vector signed short line1B = vec_add(line1, line3);
register vector signed short line3B = vec_sub(line1, line3);
register vector signed short line4B = vec_add(line4, line6);
register vector signed short line6B = vec_sub(line4, line6);
register vector signed short line5B = vec_add(line5, line7);
register vector signed short line7B = vec_sub(line5, line7);
/* Butterfly stage across rows, distance 4. */
register vector signed short line0C = vec_add(line0B, line4B);
register vector signed short line4C = vec_sub(line0B, line4B);
register vector signed short line1C = vec_add(line1B, line5B);
register vector signed short line5C = vec_sub(line1B, line5B);
register vector signed short line2C = vec_add(line2B, line6B);
register vector signed short line6C = vec_sub(line2B, line6B);
register vector signed short line3C = vec_add(line3B, line7B);
register vector signed short line7C = vec_sub(line3B, line7B);
/* Accumulate the absolute values of all coefficients. */
vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
vsum = vec_sum4s(vec_abs(line1C), vsum);
vsum = vec_sum4s(vec_abs(line2C), vsum);
vsum = vec_sum4s(vec_abs(line3C), vsum);
vsum = vec_sum4s(vec_abs(line4C), vsum);
vsum = vec_sum4s(vec_abs(line5C), vsum);
vsum = vec_sum4s(vec_abs(line6C), vsum);
vsum = vec_sum4s(vec_abs(line7C), vsum);
/* Reduce the partial sums into element 3 and store it to sum. */
vsum = vec_sums(vsum, (vector signed int) vzero);
vsum = vec_splat(vsum, 3);
vec_ste(vsum, 0, &sum);
}
return sum;
}
/*
 * Same computation as hadamard8_diff8x8_altivec, applied to a block that is
 * 16 pixels wide and 8 rows high: each 16-byte row is split into a first
 * half (VEC_MERGEH, results in temp0..temp7) and a second half (VEC_MERGEL,
 * results in temp0S..temp7S), both halves go through the same butterfly
 * network, and the absolute values of all results are summed into one total.
 * The parameters s and h are not used by this function.
 *
 * 16x8 works with 16 elements; it can avoid replicating loads, and
 * gives the compiler more room for scheduling. It's only used from
 * inside hadamard8_diff16_altivec.
 *
 * Unfortunately, it seems gcc-3.3 is a bit dumb, and the compiled code has
 * a LOT of spill code, it seems gcc (unlike xlc) cannot keep everything in
 * registers by itself. The following code includes hand-made register
 * allocation. It's not clean, but on a 7450 the resulting code is much faster
 * (best case falls from 700+ cycles to 550).
 *
 * xlc doesn't add spill code, but it doesn't know how to schedule for the
 * 7450, and its code isn't much faster than gcc-3.3 on the 7450 (but uses
 * 25% fewer instructions...)
 *
 * On the 970, the hand-made RA is still a win (around 690 vs. around 780),
 * but xlc goes to around 660 on the regular C code...
 *
 * Note on the register pinning below: inside ONEITERBUTTERFLY several
 * locals are deliberately bound to the same vector register (e.g. srcO and
 * but1 both name v22, dstO and op1S both name v23). Each reuse starts only
 * after the last read of the previous occupant, so the statement order
 * inside the macro must not be changed.
 */
static int hadamard8_diff16x8_altivec(MpegEncContext *s, uint8_t *dst,
uint8_t *src, ptrdiff_t stride, int h)
{
int __attribute__((aligned(16))) sum;
/* Per-row results for the first 8 columns. */
register vector signed short
temp0 __asm__ ("v0"),
temp1 __asm__ ("v1"),
temp2 __asm__ ("v2"),
temp3 __asm__ ("v3"),
temp4 __asm__ ("v4"),
temp5 __asm__ ("v5"),
temp6 __asm__ ("v6"),
temp7 __asm__ ("v7");
/* Per-row results for the second 8 columns. */
register vector signed short
temp0S __asm__ ("v8"),
temp1S __asm__ ("v9"),
temp2S __asm__ ("v10"),
temp3S __asm__ ("v11"),
temp4S __asm__ ("v12"),
temp5S __asm__ ("v13"),
temp6S __asm__ ("v14"),
temp7S __asm__ ("v15");
register const vector unsigned char vzero __asm__ ("v31") =
(const vector unsigned char) vec_splat_u8(0);
{
/* +1/-1 patterns selecting "sum" or "difference" per lane for the
 * three in-row butterfly stages. */
register const vector signed short vprod1 __asm__ ("v16") =
(const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 };
register const vector signed short vprod2 __asm__ ("v17") =
(const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 };
register const vector signed short vprod3 __asm__ ("v18") =
(const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 };
/* Byte permutes that swap 16-bit lanes with their butterfly partner:
 * perm1 swaps adjacent lanes, perm2 swaps adjacent lane pairs,
 * perm3 swaps the two halves of the vector. */
register const vector unsigned char perm1 __asm__ ("v19") =
(const vector unsigned char)
{ 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D };
register const vector unsigned char perm2 __asm__ ("v20") =
(const vector unsigned char)
{ 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B };
register const vector unsigned char perm3 __asm__ ("v21") =
(const vector unsigned char)
{ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
#define ONEITERBUTTERFLY(i, res1, res2) \
{ \
register vector unsigned char srcO __asm__ ("v22") = \
unaligned_load(stride * i, src); \
register vector unsigned char dstO __asm__ ("v23") = \
unaligned_load(stride * i, dst);\
\
/* Promote the unsigned chars to signed shorts. */ \
register vector signed short srcV __asm__ ("v24") = \
(vector signed short) VEC_MERGEH((vector signed char) vzero, \
(vector signed char) srcO); \
register vector signed short dstV __asm__ ("v25") = \
(vector signed short) VEC_MERGEH((vector signed char) vzero, \
(vector signed char) dstO); \
register vector signed short srcW __asm__ ("v26") = \
(vector signed short) VEC_MERGEL((vector signed char) vzero, \
(vector signed char) srcO); \
register vector signed short dstW __asm__ ("v27") = \
(vector signed short) VEC_MERGEL((vector signed char) vzero, \
(vector signed char) dstO); \
\
/* subtractions inside the first butterfly */ \
register vector signed short but0 __asm__ ("v28") = \
vec_sub(srcV, dstV); \
register vector signed short but0S __asm__ ("v29") = \
vec_sub(srcW, dstW); \
register vector signed short op1 __asm__ ("v30") = \
vec_perm(but0, but0, perm1); \
register vector signed short but1 __asm__ ("v22") = \
vec_mladd(but0, vprod1, op1); \
register vector signed short op1S __asm__ ("v23") = \
vec_perm(but0S, but0S, perm1); \
register vector signed short but1S __asm__ ("v24") = \
vec_mladd(but0S, vprod1, op1S); \
register vector signed short op2 __asm__ ("v25") = \
vec_perm(but1, but1, perm2); \
register vector signed short but2 __asm__ ("v26") = \
vec_mladd(but1, vprod2, op2); \
register vector signed short op2S __asm__ ("v27") = \
vec_perm(but1S, but1S, perm2); \
register vector signed short but2S __asm__ ("v28") = \
vec_mladd(but1S, vprod2, op2S); \
register vector signed short op3 __asm__ ("v29") = \
vec_perm(but2, but2, perm3); \
register vector signed short op3S __asm__ ("v30") = \
vec_perm(but2S, but2S, perm3); \
res1 = vec_mladd(but2, vprod3, op3); \
res2 = vec_mladd(but2S, vprod3, op3S); \
}
/* One transformed row per temp/tempS pair. */
ONEITERBUTTERFLY(0, temp0, temp0S);
ONEITERBUTTERFLY(1, temp1, temp1S);
ONEITERBUTTERFLY(2, temp2, temp2S);
ONEITERBUTTERFLY(3, temp3, temp3S);
ONEITERBUTTERFLY(4, temp4, temp4S);
ONEITERBUTTERFLY(5, temp5, temp5S);
ONEITERBUTTERFLY(6, temp6, temp6S);
ONEITERBUTTERFLY(7, temp7, temp7S);
}
#undef ONEITERBUTTERFLY
{
register vector signed int vsum;
/* First 8 columns: three butterfly stages across rows
 * (distance 1, 2, then 4). */
register vector signed short line0 = vec_add(temp0, temp1);
register vector signed short line1 = vec_sub(temp0, temp1);
register vector signed short line2 = vec_add(temp2, temp3);
register vector signed short line3 = vec_sub(temp2, temp3);
register vector signed short line4 = vec_add(temp4, temp5);
register vector signed short line5 = vec_sub(temp4, temp5);
register vector signed short line6 = vec_add(temp6, temp7);
register vector signed short line7 = vec_sub(temp6, temp7);
register vector signed short line0B = vec_add(line0, line2);
register vector signed short line2B = vec_sub(line0, line2);
register vector signed short line1B = vec_add(line1, line3);
register vector signed short line3B = vec_sub(line1, line3);
register vector signed short line4B = vec_add(line4, line6);
register vector signed short line6B = vec_sub(line4, line6);
register vector signed short line5B = vec_add(line5, line7);
register vector signed short line7B = vec_sub(line5, line7);
register vector signed short line0C = vec_add(line0B, line4B);
register vector signed short line4C = vec_sub(line0B, line4B);
register vector signed short line1C = vec_add(line1B, line5B);
register vector signed short line5C = vec_sub(line1B, line5B);
register vector signed short line2C = vec_add(line2B, line6B);
register vector signed short line6C = vec_sub(line2B, line6B);
register vector signed short line3C = vec_add(line3B, line7B);
register vector signed short line7C = vec_sub(line3B, line7B);
/* Second 8 columns: the same three stages. */
register vector signed short line0S = vec_add(temp0S, temp1S);
register vector signed short line1S = vec_sub(temp0S, temp1S);
register vector signed short line2S = vec_add(temp2S, temp3S);
register vector signed short line3S = vec_sub(temp2S, temp3S);
register vector signed short line4S = vec_add(temp4S, temp5S);
register vector signed short line5S = vec_sub(temp4S, temp5S);
register vector signed short line6S = vec_add(temp6S, temp7S);
register vector signed short line7S = vec_sub(temp6S, temp7S);
register vector signed short line0BS = vec_add(line0S, line2S);
register vector signed short line2BS = vec_sub(line0S, line2S);
register vector signed short line1BS = vec_add(line1S, line3S);
register vector signed short line3BS = vec_sub(line1S, line3S);
register vector signed short line4BS = vec_add(line4S, line6S);
register vector signed short line6BS = vec_sub(line4S, line6S);
register vector signed short line5BS = vec_add(line5S, line7S);
register vector signed short line7BS = vec_sub(line5S, line7S);
register vector signed short line0CS = vec_add(line0BS, line4BS);
register vector signed short line4CS = vec_sub(line0BS, line4BS);
register vector signed short line1CS = vec_add(line1BS, line5BS);
register vector signed short line5CS = vec_sub(line1BS, line5BS);
register vector signed short line2CS = vec_add(line2BS, line6BS);
register vector signed short line6CS = vec_sub(line2BS, line6BS);
register vector signed short line3CS = vec_add(line3BS, line7BS);
register vector signed short line7CS = vec_sub(line3BS, line7BS);
/* Accumulate the absolute values of all coefficients of both halves. */
vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
vsum = vec_sum4s(vec_abs(line1C), vsum);
vsum = vec_sum4s(vec_abs(line2C), vsum);
vsum = vec_sum4s(vec_abs(line3C), vsum);
vsum = vec_sum4s(vec_abs(line4C), vsum);
vsum = vec_sum4s(vec_abs(line5C), vsum);
vsum = vec_sum4s(vec_abs(line6C), vsum);
vsum = vec_sum4s(vec_abs(line7C), vsum);
vsum = vec_sum4s(vec_abs(line0CS), vsum);
vsum = vec_sum4s(vec_abs(line1CS), vsum);
vsum = vec_sum4s(vec_abs(line2CS), vsum);
vsum = vec_sum4s(vec_abs(line3CS), vsum);
vsum = vec_sum4s(vec_abs(line4CS), vsum);
vsum = vec_sum4s(vec_abs(line5CS), vsum);
vsum = vec_sum4s(vec_abs(line6CS), vsum);
vsum = vec_sum4s(vec_abs(line7CS), vsum);
/* Reduce the partial sums into element 3 and store it to sum. */
vsum = vec_sums(vsum, (vector signed int) vzero);
vsum = vec_splat(vsum, 3);
vec_ste(vsum, 0, &sum);
}
return sum;
}
/**
 * Hadamard difference score for a 16-pixel-wide block.
 *
 * Always scores the top 16x8 half with hadamard8_diff16x8_altivec(); when
 * h is exactly 16, the 16x8 half starting 8 rows further down is scored as
 * well and added. Any other h yields the score of the top half only.
 *
 * @param s      passed through to hadamard8_diff16x8_altivec()
 * @param dst    first block
 * @param src    second block
 * @param stride distance in bytes between rows
 * @param h      block height; only compared against 16
 * @return the summed score
 */
static int hadamard8_diff16_altivec(MpegEncContext *s, uint8_t *dst,
                                    uint8_t *src, ptrdiff_t stride, int h)
{
    const int top = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);

    if (h != 16)
        return top;

    /* Lower half: same call, both pointers moved down by 8 rows. */
    return top + hadamard8_diff16x8_altivec(s, dst + 8 * stride,
                                            src + 8 * stride, stride, 8);
}
#endif /* HAVE_ALTIVEC */
/**
 * Install the AltiVec motion-estimation comparison functions into c.
 *
 * Does nothing when the file is built without HAVE_ALTIVEC or when the
 * run-time CPU flags do not report AltiVec support.
 *
 * @param c     comparison context whose function pointers are overridden
 * @param avctx unused
 */
av_cold void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx)
{
#if HAVE_ALTIVEC
    if (PPC_ALTIVEC(av_get_cpu_flags())) {
        /* 16-wide SAD: full-pel and the x / y / xy half-pel variants. */
        c->pix_abs[0][0] = sad16_altivec;
        c->pix_abs[0][1] = sad16_x2_altivec;
        c->pix_abs[0][2] = sad16_y2_altivec;
        c->pix_abs[0][3] = sad16_xy2_altivec;

        /* 8-wide SAD: full-pel only. */
        c->pix_abs[1][0] = sad8_altivec;

        c->sad[0] = sad16_altivec;
        c->sad[1] = sad8_altivec;

        c->sse[0] = sse16_altivec;
        c->sse[1] = sse8_altivec;

        c->hadamard8_diff[0] = hadamard8_diff16_altivec;
        c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
    }
#endif /* HAVE_ALTIVEC */
}

View File

@@ -0,0 +1,141 @@
/*
* Altivec optimized MP3 decoding functions
* Copyright (c) 2010 Vitor Sessak
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/internal.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/mpegaudiodsp.h"
#if HAVE_ALTIVEC
/* Multiply-accumulate: rt += ra * rb. */
#define MACS(rt, ra, rb) rt+=(ra)*(rb)
/* Multiply-subtract: rt -= ra * rb. */
#define MLSS(rt, ra, rb) rt-=(ra)*(rb)
/*
 * Apply op (MACS or MLSS) to sum eight times, using the elements of w and p
 * at indices 0, 64, 128, ..., 448. The eight operations are issued in that
 * order, one statement each.
 */
#define SUM8(op, sum, w, p) \
{ \
op(sum, (w)[0 * 64], (p)[0 * 64]); \
op(sum, (w)[1 * 64], (p)[1 * 64]); \
op(sum, (w)[2 * 64], (p)[2 * 64]); \
op(sum, (w)[3 * 64], (p)[3 * 64]); \
op(sum, (w)[4 * 64], (p)[4 * 64]); \
op(sum, (w)[5 * 64], (p)[5 * 64]); \
op(sum, (w)[6 * 64], (p)[6 * 64]); \
op(sum, (w)[7 * 64], (p)[7 * 64]); \
}
static void apply_window(const float *buf, const float *win1,
const float *win2, float *sum1, float *sum2, int len)
{
const vector float *win1a = (const vector float *) win1;
const vector float *win2a = (const vector float *) win2;
const vector float *bufa = (const vector float *) buf;
vector float *sum1a = (vector float *) sum1;
vector float *sum2a = (vector float *) sum2;
vector float av_uninit(v0), av_uninit(v4);
vector float v1, v2, v3;
len = len >> 2;
#define MULT(a, b) \
{ \
v1 = vec_ld(a, win1a); \
v2 = vec_ld(b, win2a); \
v3 = vec_ld(a, bufa); \
v0 = vec_madd(v3, v1, v0); \
v4 = vec_madd(v2, v3, v4); \
}
while (len--) {
v0 = vec_xor(v0, v0);
v4 = vec_xor(v4, v4);
MULT( 0, 0);
MULT( 256, 64);
MULT( 512, 128);
MULT( 768, 192);
MULT(1024, 256);
MULT(1280, 320);
MULT(1536, 384);
MULT(1792, 448);
vec_st(v0, 0, sum1a);
vec_st(v4, 0, sum2a);
sum1a++;
sum2a++;
win1a++;
win2a++;
bufa++;
}
}
static void apply_window_mp3(float *in, float *win, int *unused, float *out,
int incr)
{
LOCAL_ALIGNED_16(float, suma, [17]);
LOCAL_ALIGNED_16(float, sumb, [17]);
LOCAL_ALIGNED_16(float, sumc, [17]);
LOCAL_ALIGNED_16(float, sumd, [17]);
float sum;
int j;
float *out2 = out + 32 * incr;
/* copy to avoid wrap */
memcpy(in + 512, in, 32 * sizeof(*in));
apply_window(in + 16, win , win + 512, suma, sumc, 16);
apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
SUM8(MLSS, suma[0], win + 32, in + 48);
sumc[ 0] = 0;
sumb[16] = 0;
sumd[16] = 0;
out[0 ] = suma[ 0];
out += incr;
out2 -= incr;
for(j=1;j<16;j++) {
*out = suma[ j] - sumd[16-j];
*out2 = -sumb[16-j] - sumc[ j];
out += incr;
out2 -= incr;
}
sum = 0;
SUM8(MLSS, sum, win + 16 + 32, in + 32);
*out = sum;
}
#endif /* HAVE_ALTIVEC */
/**
 * Install the AltiVec float windowing function into s.
 *
 * Does nothing when the file is built without HAVE_ALTIVEC or when the
 * run-time CPU flags do not report AltiVec support.
 *
 * @param s DSP context whose apply_window_float pointer is overridden
 */
av_cold void ff_mpadsp_init_ppc(MPADSPContext *s)
{
#if HAVE_ALTIVEC
    if (PPC_ALTIVEC(av_get_cpu_flags()))
        s->apply_window_float = apply_window_mp3;
#endif /* HAVE_ALTIVEC */
}

View File

@@ -0,0 +1,129 @@
/*
* Copyright (c) 2002 Dieter Shirley
*
* dct_unquantize_h263_altivec:
* Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdlib.h>
#include <stdio.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/mpegvideo.h"
#if HAVE_ALTIVEC
/**
 * AltiVec version of dct_unquantize_h263.
 * This code assumes `block' is 16-byte aligned.
 *
 * Every non-zero coefficient c up to the last index becomes
 * c * qmul + qadd for c > 0 and c * qmul - qadd for c < 0, with
 * qmul = qscale << 1 and qadd = (qscale - 1) | 1; zero coefficients stay
 * zero.
 *
 * Intra blocks (s->mb_intra): all 64 coefficients are processed. Unless
 * s->h263_aic is set, block[0] is first multiplied by the luma (n < 4) or
 * chroma DC scale; with h263_aic, qadd is 0 instead. In both cases block[0]
 * is put back to its value from before the dequantisation loops.
 *
 * Inter blocks: the last processed index is taken from
 * s->intra_scantable.raster_end[] for the block's last index.
 *
 * @param s      codec context (mb_intra, h263_aic, DC scales, scan table)
 * @param block  coefficients, modified in place
 * @param n      block number within the macroblock
 * @param qscale quantiser scale
 */
static void dct_unquantize_h263_altivec(MpegEncContext *s,
                                        int16_t *block, int n, int qscale)
{
    const int qmul = qscale << 1;
    int qadd = (qscale - 1) | 1;
    int last_coeff;
    int restore_dc;

    if (s->mb_intra) {
        if (s->h263_aic) {
            qadd = 0;
        } else {
            if (n < 4)
                block[0] = block[0] * s->y_dc_scale;
            else
                block[0] = block[0] * s->c_dc_scale;
        }
        restore_dc = 1;
        last_coeff = 63; //does not always use zigzag table
    } else {
        restore_dc = 0;
        av_assert2(s->block_last_index[n]>=0);
        last_coeff = s->intra_scantable.raster_end[ s->block_last_index[n] ];
    }

    {
        const vector signed short vzero =
            (const vector signed short) vec_splat_s16(0);
        DECLARE_ALIGNED(16, short, qmul8) = qmul;
        DECLARE_ALIGNED(16, short, qadd8) = qadd;
        vector signed short coeffs, qmulv, qaddv, nqaddv, bias, scaled;
        vector bool short is_zero, is_neg;
        /* DC value after the optional scaling above; the loops below also
         * run over index 0, so it is put back afterwards for intra blocks. */
        const short saved_dc = block[0];
        int j = 0;

        /* Broadcast qmul, +qadd and -qadd to all eight lanes. */
        qmulv  = vec_splat((vec_s16)vec_lde(0, &qmul8), 0);
        qaddv  = vec_splat((vec_s16)vec_lde(0, &qadd8), 0);
        nqaddv = vec_sub(vzero, qaddv);

        /* Whole, 16-byte aligned groups of 8 coefficients. */
        while (j + 7 <= last_coeff) {
            coeffs  = vec_ld(j << 1, block);
            is_neg  = vec_cmplt(coeffs, vzero);
            is_zero = vec_cmpeq(coeffs, vzero);
            /* -qadd for negative coefficients, +qadd otherwise */
            bias    = vec_sel(qaddv, nqaddv, is_neg);
            /* coefficient * qmul +/- qadd */
            scaled  = vec_mladd(coeffs, qmulv, bias);
            /* lanes that held 0 keep their original (zero) value */
            vec_st(vec_sel(scaled, coeffs, is_zero), j << 1, block);
            j += 8;
        }

        /* Remaining coefficients, when last_coeff + 1 is not a multiple
         * of 8, are handled one by one. */
        for (; j <= last_coeff; j++) {
            const int level = block[j];

            if (!level)
                continue;

            block[j] = level < 0 ? level * qmul - qadd
                                 : level * qmul + qadd;
        }

        /* Cheat: this avoids special-casing the first iteration. */
        if (restore_dc)
            block[0] = saved_dc;
    }
}
#endif /* HAVE_ALTIVEC */
/**
 * Install the AltiVec H.263 dequantisers into s.
 *
 * The intra and inter H.263 unquantize pointers are both set to
 * dct_unquantize_h263_altivec when AltiVec is available at run time and the
 * codec context's dct_algo is FF_DCT_AUTO or FF_DCT_ALTIVEC. Otherwise, or
 * when built without HAVE_ALTIVEC, s is left untouched.
 *
 * @param s codec context to update
 */
av_cold void ff_mpv_common_init_ppc(MpegEncContext *s)
{
#if HAVE_ALTIVEC
    if (!PPC_ALTIVEC(av_get_cpu_flags()))
        return;

    /* Respect an explicit request for a different DCT implementation. */
    if (s->avctx->dct_algo != FF_DCT_AUTO &&
        s->avctx->dct_algo != FF_DCT_ALTIVEC)
        return;

    s->dct_unquantize_h263_intra = dct_unquantize_h263_altivec;
    s->dct_unquantize_h263_inter = dct_unquantize_h263_altivec;
#endif /* HAVE_ALTIVEC */
}

View File

@@ -0,0 +1,133 @@
/*
* GMC (Global Motion Compensation), AltiVec-enabled
*
* Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/mem.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/mpegvideodsp.h"
#if HAVE_ALTIVEC
/* AltiVec-enhanced gmc1. ATM this code assumes stride is a multiple of 8
* to preserve proper dst alignment.
*
* For each of the h rows, 8 output pixels are computed as
*     (A * src[x] + B * src[x + 1] +
*      C * src[x + stride] + D * src[x + stride + 1] + rounder) >> 8
* with A = (16 - x16) * (16 - y16), B = x16 * (16 - y16),
*      C = (16 - x16) * y16         and D = x16 * y16.
*
* dst      destination row; the 16-byte aligned vector containing dst is
*          loaded, one 8-byte half of it is replaced, and it is stored back
* src      source row, any alignment
* stride   byte distance between rows, applied to both dst and src
* h        number of rows to process
* x16, y16 values the four weights above are built from (not range-checked
*          here)
* rounder  value added to the weighted sum before the shift by 8 */
static void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */,
int stride, int h, int x16, int y16, int rounder)
{
int i;
const DECLARE_ALIGNED(16, unsigned short, rounder_a) = rounder;
const DECLARE_ALIGNED(16, unsigned short, ABCD)[8] = {
(16 - x16) * (16 - y16), /* A */
(x16) * (16 - y16), /* B */
(16 - x16) * (y16), /* C */
(x16) * (y16), /* D */
0, 0, 0, 0 /* padding */
};
register const vector unsigned char vczero =
(const vector unsigned char) vec_splat_u8(0);
/* shift count for the final ">> 8" */
register const vector unsigned short vcsr8 =
(const vector unsigned short) vec_splat_u16(8);
register vector unsigned char dstv, dstv2, srcvB, srcvC, srcvD;
register vector unsigned short tempB, tempC, tempD;
/* Both values are recomputed at the top of every loop iteration. */
unsigned long dst_odd = (unsigned long) dst & 0x0000000F;
unsigned long src_really_odd = (unsigned long) src & 0x0000000F;
register vector unsigned short tempA =
vec_ld(0, (const unsigned short *) ABCD);
/* Broadcast each weight into all eight 16-bit lanes. */
register vector unsigned short Av = vec_splat(tempA, 0);
register vector unsigned short Bv = vec_splat(tempA, 1);
register vector unsigned short Cv = vec_splat(tempA, 2);
register vector unsigned short Dv = vec_splat(tempA, 3);
register vector unsigned short rounderV =
vec_splat((vec_u16) vec_lde(0, &rounder_a), 0);
/* we'll be able to pick-up our 9 char elements at src from those
* 32 bytes we load the first batch here, as inside the loop we can
* reuse 'src + stride' from one iteration as the 'src' of the next. */
register vector unsigned char src_0 = vec_ld(0, src);
register vector unsigned char src_1 = vec_ld(16, src);
register vector unsigned char srcvA = vec_perm(src_0, src_1,
vec_lvsl(0, src));
if (src_really_odd != 0x0000000F)
/* If (src & 0xF) == 0xF, then (src + 1) is properly aligned
* on the second vector. */
srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src));
else
srcvB = src_1;
/* Interleave with zero bytes so the source bytes become 16-bit lanes. */
srcvA = vec_mergeh(vczero, srcvA);
srcvB = vec_mergeh(vczero, srcvB);
for (i = 0; i < h; i++) {
dst_odd = (unsigned long) dst & 0x0000000F;
src_really_odd = (((unsigned long) src) + stride) & 0x0000000F;
dstv = vec_ld(0, dst);
/* We'll be able to pick-up our 9 char elements at src + stride from
* those 32 bytes then reuse the resulting 2 vectors srcvC and srcvD
* as the next srcvA and srcvB. */
src_0 = vec_ld(stride + 0, src);
src_1 = vec_ld(stride + 16, src);
srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src));
if (src_really_odd != 0x0000000F)
/* If (src & 0xF) == 0xF, then (src + 1) is properly aligned
* on the second vector. */
srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src));
else
srcvD = src_1;
srcvC = vec_mergeh(vczero, srcvC);
srcvD = vec_mergeh(vczero, srcvD);
/* OK, now we (finally) do the math :-)
* Those four instructions replace 32 int muls & 32 int adds.
* Isn't AltiVec nice? */
tempA = vec_mladd((vector unsigned short) srcvA, Av, rounderV);
tempB = vec_mladd((vector unsigned short) srcvB, Bv, tempA);
tempC = vec_mladd((vector unsigned short) srcvC, Cv, tempB);
tempD = vec_mladd((vector unsigned short) srcvD, Dv, tempC);
/* The lower source row of this iteration is the upper row of the next. */
srcvA = srcvC;
srcvB = srcvD;
tempD = vec_sr(tempD, vcsr8);
dstv2 = vec_pack(tempD, (vector unsigned short) vczero);
/* Merge the 8 result bytes into the half of the aligned vector that dst
* points into and keep the other half as loaded. vcprm() comes from
* util_altivec.h; presumably sN selects word N of the second operand --
* TODO confirm. */
if (dst_odd)
dstv2 = vec_perm(dstv, dstv2, vcprm(0, 1, s0, s1));
else
dstv2 = vec_perm(dstv, dstv2, vcprm(s0, s1, 2, 3));
vec_st(dstv2, 0, dst);
dst += stride;
src += stride;
}
}
#endif /* HAVE_ALTIVEC */
/**
 * Install the PowerPC implementations into an MpegVideoDSPContext.
 *
 * The AltiVec gmc1 routine is only installed when the build has AltiVec
 * support AND the running CPU reports AltiVec; otherwise the context is
 * left untouched. The runtime check matters because a binary built with
 * AltiVec enabled may still run on a CPU without it (or with the feature
 * masked through the CPU-flags API).
 *
 * @param c DSP context whose gmc1 pointer may be replaced
 */
av_cold void ff_mpegvideodsp_init_ppc(MpegVideoDSPContext *c)
{
#if HAVE_ALTIVEC
    if (!PPC_ALTIVEC(av_get_cpu_flags()))
        return;

    c->gmc1 = gmc1_altivec;
#endif /* HAVE_ALTIVEC */
}

View File

@@ -0,0 +1,163 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include <stdint.h>
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/mpegvideoencdsp.h"
#if HAVE_ALTIVEC
#if HAVE_VSX
static int pix_norm1_altivec(uint8_t *pix, int line_size)
{
int i, s = 0;
const vector unsigned int zero =
(const vector unsigned int) vec_splat_u32(0);
vector unsigned int sv = (vector unsigned int) vec_splat_u32(0);
vector signed int sum;
for (i = 0; i < 16; i++) {
/* Read the potentially unaligned pixels. */
//vector unsigned char pixl = vec_ld(0, pix);
//vector unsigned char pixr = vec_ld(15, pix);
//vector unsigned char pixv = vec_perm(pixl, pixr, perm);
vector unsigned char pixv = vec_vsx_ld(0, pix);
/* Square the values, and add them to our sum. */
sv = vec_msum(pixv, pixv, sv);
pix += line_size;
}
/* Sum up the four partial sums, and put the result into s. */
sum = vec_sums((vector signed int) sv, (vector signed int) zero);
sum = vec_splat(sum, 3);
vec_ste(sum, 0, &s);
return s;
}
#else
static int pix_norm1_altivec(uint8_t *pix, int line_size)
{
int i, s = 0;
const vector unsigned int zero =
(const vector unsigned int) vec_splat_u32(0);
vector unsigned char perm = vec_lvsl(0, pix);
vector unsigned int sv = (vector unsigned int) vec_splat_u32(0);
vector signed int sum;
for (i = 0; i < 16; i++) {
/* Read the potentially unaligned pixels. */
vector unsigned char pixl = vec_ld(0, pix);
vector unsigned char pixr = vec_ld(15, pix);
vector unsigned char pixv = vec_perm(pixl, pixr, perm);
/* Square the values, and add them to our sum. */
sv = vec_msum(pixv, pixv, sv);
pix += line_size;
}
/* Sum up the four partial sums, and put the result into s. */
sum = vec_sums((vector signed int) sv, (vector signed int) zero);
sum = vec_splat(sum, 3);
vec_ste(sum, 0, &s);
return s;
}
#endif /* HAVE_VSX */
#if HAVE_VSX
static int pix_sum_altivec(uint8_t *pix, int line_size)
{
int i, s;
const vector unsigned int zero =
(const vector unsigned int) vec_splat_u32(0);
vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
vector signed int sumdiffs;
for (i = 0; i < 16; i++) {
/* Read the potentially unaligned 16 pixels into t1. */
//vector unsigned char pixl = vec_ld(0, pix);
//vector unsigned char pixr = vec_ld(15, pix);
//vector unsigned char t1 = vec_perm(pixl, pixr, perm);
vector unsigned char t1 = vec_vsx_ld(0, pix);
/* Add each 4 pixel group together and put 4 results into sad. */
sad = vec_sum4s(t1, sad);
pix += line_size;
}
/* Sum up the four partial sums, and put the result into s. */
sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
sumdiffs = vec_splat(sumdiffs, 3);
vec_ste(sumdiffs, 0, &s);
return s;
}
#else
static int pix_sum_altivec(uint8_t *pix, int line_size)
{
int i, s;
const vector unsigned int zero =
(const vector unsigned int) vec_splat_u32(0);
vector unsigned char perm = vec_lvsl(0, pix);
vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
vector signed int sumdiffs;
for (i = 0; i < 16; i++) {
/* Read the potentially unaligned 16 pixels into t1. */
vector unsigned char pixl = vec_ld(0, pix);
vector unsigned char pixr = vec_ld(15, pix);
vector unsigned char t1 = vec_perm(pixl, pixr, perm);
/* Add each 4 pixel group together and put 4 results into sad. */
sad = vec_sum4s(t1, sad);
pix += line_size;
}
/* Sum up the four partial sums, and put the result into s. */
sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
sumdiffs = vec_splat(sumdiffs, 3);
vec_ste(sumdiffs, 0, &s);
return s;
}
#endif /* HAVE_VSX */
#endif /* HAVE_ALTIVEC */
/**
 * Install the AltiVec pixel statistics routines into an
 * MpegvideoEncDSPContext.
 *
 * The pix_norm1 and pix_sum pointers are replaced only when the build has
 * AltiVec support and the running CPU reports AltiVec.
 *
 * @param c     DSP context to update
 * @param avctx codec context; not read by this function
 */
av_cold void ff_mpegvideoencdsp_init_ppc(MpegvideoEncDSPContext *c,
                                         AVCodecContext *avctx)
{
#if HAVE_ALTIVEC
    if (PPC_ALTIVEC(av_get_cpu_flags())) {
        c->pix_sum   = pix_sum_altivec;
        c->pix_norm1 = pix_norm1_altivec;
    }
#endif /* HAVE_ALTIVEC */
}

View File

@@ -0,0 +1,289 @@
/*
* Copyright (c) 2002 Brian Foley
* Copyright (c) 2002 Dieter Shirley
* Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/pixblockdsp.h"
#if HAVE_ALTIVEC
#if HAVE_VSX
static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels,
ptrdiff_t line_size)
{
int i;
vector unsigned char perm =
(vector unsigned char) {0x00,0x10, 0x01,0x11,0x02,0x12,0x03,0x13,\
0x04,0x14,0x05,0x15,0x06,0x16,0x07,0x17};
const vector unsigned char zero =
(const vector unsigned char) vec_splat_u8(0);
for (i = 0; i < 8; i++) {
/* Read potentially unaligned pixels.
* We're reading 16 pixels, and actually only want 8,
* but we simply ignore the extras. */
vector unsigned char bytes = vec_vsx_ld(0, pixels);
// Convert the bytes into shorts.
//vector signed short shorts = (vector signed short) vec_perm(zero, bytes, perm);
vector signed short shorts = (vector signed short) vec_perm(bytes, zero, perm);
// Save the data to the block, we assume the block is 16-byte aligned.
vec_vsx_st(shorts, i * 16, (vector signed short *) block);
pixels += line_size;
}
}
#else
/* Copy an 8x8 block of 8-bit pixels into 64 consecutive 16-bit
 * coefficients (plain AltiVec build).
 *
 * block      destination, 8 rows of 8 int16_t; written with aligned stores
 * pixels     top-left source pixel, any alignment
 * line_size  byte distance between two consecutive source rows; the
 *            alignment permute is computed once from the first row and
 *            reused for all rows */
static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels,
                               ptrdiff_t line_size)
{
    const vec_u8 zero = (const vec_u8)vec_splat_u8(0);
    vec_u8 align = vec_lvsl(0, pixels);
    int row;

    for (row = 0; row < 8; row++, pixels += line_size) {
        /* Only 8 pixels are needed, so the second aligned load is taken at
         * offset 7 rather than 15. */
        vec_u8 lo    = vec_ld(0, pixels);
        vec_u8 hi    = vec_ld(7, pixels);
        vec_u8 bytes = vec_perm(lo, hi, align);
        /* Widen to 16 bit by interleaving zero bytes with the pixel bytes. */
        vec_s16 shorts = (vec_s16)vec_mergeh(zero, bytes);

        /* Each output row occupies 16 bytes of the block. */
        vec_st(shorts, row * 16, (vec_s16 *)block);
    }
}
#endif /* HAVE_VSX */
#if HAVE_VSX
static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1,
const uint8_t *s2, int stride)
{
int i;
const vector unsigned char zero =
(const vector unsigned char) vec_splat_u8(0);
vector signed short shorts1, shorts2;
for (i = 0; i < 4; i++) {
/* Read potentially unaligned pixels.
* We're reading 16 pixels, and actually only want 8,
* but we simply ignore the extras. */
vector unsigned char bytes = vec_vsx_ld(0, s1);
// Convert the bytes into shorts.
shorts1 = (vector signed short) vec_mergeh(bytes, zero);
// Do the same for the second block of pixels.
bytes =vec_vsx_ld(0, s2);
// Convert the bytes into shorts.
shorts2 = (vector signed short) vec_mergeh(bytes, zero);
// Do the subtraction.
shorts1 = vec_sub(shorts1, shorts2);
// Save the data to the block, we assume the block is 16-byte aligned.
vec_vsx_st(shorts1, 0, (vector signed short *) block);
s1 += stride;
s2 += stride;
block += 8;
/* The code below is a copy of the code above...
* This is a manual unroll. */
/* Read potentially unaligned pixels.
* We're reading 16 pixels, and actually only want 8,
* but we simply ignore the extras. */
bytes = vec_vsx_ld(0, s1);
// Convert the bytes into shorts.
shorts1 = (vector signed short) vec_mergeh(bytes, zero);
// Do the same for the second block of pixels.
bytes = vec_vsx_ld(0, s2);
// Convert the bytes into shorts.
shorts2 = (vector signed short) vec_mergeh(bytes, zero);
// Do the subtraction.
shorts1 = vec_sub(shorts1, shorts2);
// Save the data to the block, we assume the block is 16-byte aligned.
vec_vsx_st(shorts1, 0, (vector signed short *) block);
s1 += stride;
s2 += stride;
block += 8;
}
}
#else
/* Store the per-pixel difference s1 - s2 of two 8x8 blocks of 8-bit pixels
 * as 64 consecutive 16-bit values (plain AltiVec build).
 *
 * block   destination, 8 rows of 8 int16_t; written with aligned stores
 * s1, s2  top-left pixels of the two source blocks, any alignment
 * stride  byte distance between rows, shared by both sources; the alignment
 *         permutes are computed once from the first rows and reused */
static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1,
                                const uint8_t *s2, int stride)
{
    const vec_u8 zero = (const vec_u8)vec_splat_u8(0);
    vec_u8 align1 = vec_lvsl(0, s1);
    vec_u8 align2 = vec_lvsl(0, s2);
    int row;

    /* Eight identical row steps (the hand-unrolled pairs written as one loop). */
    for (row = 0; row < 8; row++) {
        /* Assemble the unaligned row of s1 from two aligned loads. */
        vec_u8 lo     = vec_ld(0, s1);
        vec_u8 hi     = vec_ld(15, s1);
        vec_u8 bytes1 = vec_perm(lo, hi, align1);
        vec_u8 bytes2;
        vec_s16 wide1, wide2;

        /* Same for s2. */
        lo     = vec_ld(0, s2);
        hi     = vec_ld(15, s2);
        bytes2 = vec_perm(lo, hi, align2);

        /* Widen both rows to 16 bit, subtract, and store 8 results. */
        wide1 = (vec_s16)vec_mergeh(zero, bytes1);
        wide2 = (vec_s16)vec_mergeh(zero, bytes2);
        vec_st(vec_sub(wide1, wide2), 0, (vec_s16 *)block);

        s1    += stride;
        s2    += stride;
        block += 8;
    }
}
#endif /* HAVE_VSX */
#endif /* HAVE_ALTIVEC */
#if HAVE_VSX
/* Copy an 8x8 block of 8-bit pixels into 64 consecutive 16-bit values,
 * using the vsx_ld_u8_s16() load-and-widen helper for every row.
 *
 * block      destination, 8 rows of 8 int16_t
 * pixels     top-left source pixel
 * line_size  byte distance between two consecutive source rows */
static void get_pixels_vsx(int16_t *restrict block, const uint8_t *pixels,
                           ptrdiff_t line_size)
{
    int row = 0;

    while (row < 8) {
        vec_s16 widened = vsx_ld_u8_s16(0, pixels);

        /* Each output row occupies 16 bytes of the block. */
        vec_vsx_st(widened, row * 16, block);
        pixels += line_size;
        row++;
    }
}
/* Store the per-pixel difference s1 - s2 of two 8x8 blocks of 8-bit pixels
 * as 64 consecutive 16-bit values, using the vsx_ld_u8_s16() helper.
 *
 * block   destination, 8 rows of 8 int16_t
 * s1, s2  top-left pixels of the two source blocks
 * stride  byte distance between rows, shared by both sources */
static void diff_pixels_vsx(int16_t *restrict block, const uint8_t *s1,
                            const uint8_t *s2, int stride)
{
    int row;

    for (row = 0; row < 8; row++, s1 += stride, s2 += stride, block += 8) {
        vec_s16 minuend    = vsx_ld_u8_s16(0, s1);
        vec_s16 subtrahend = vsx_ld_u8_s16(0, s2);
        vec_s16 delta      = vec_sub(minuend, subtrahend);

        vec_vsx_st(delta, 0, block);
    }
}
#endif /* HAVE_VSX */
/**
 * Install the PowerPC pixel-block routines into a PixblockDSPContext.
 *
 * With AltiVec compiled in and reported by the CPU, diff_pixels is always
 * replaced and get_pixels is replaced only when high_bit_depth is zero.
 * If VSX is additionally compiled in and reported by the CPU, the VSX
 * variants then override the AltiVec ones under the same rule. When the
 * AltiVec check fails the function returns before looking at VSX.
 *
 * @param c              DSP context to update
 * @param avctx          codec context; not read by this function
 * @param high_bit_depth non-zero keeps the existing get_pixels
 */
av_cold void ff_pixblockdsp_init_ppc(PixblockDSPContext *c,
                                     AVCodecContext *avctx,
                                     unsigned high_bit_depth)
{
#if HAVE_ALTIVEC
    if (!PPC_ALTIVEC(av_get_cpu_flags()))
        return;

    if (high_bit_depth == 0)
        c->get_pixels = get_pixels_altivec;
    c->diff_pixels = diff_pixels_altivec;
#endif /* HAVE_ALTIVEC */

#if HAVE_VSX
    if (PPC_VSX(av_get_cpu_flags())) {
        if (high_bit_depth == 0)
            c->get_pixels = get_pixels_vsx;
        c->diff_pixels = diff_pixels_vsx;
    }
#endif /* HAVE_VSX */
}

View File

@@ -0,0 +1,85 @@
/*
* Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/svq1enc.h"
#if HAVE_ALTIVEC
static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
int size)
{
int i, size16 = size >> 4;
vector signed char vpix1;
vector signed short vpix2, vdiff, vpix1l, vpix1h;
union {
vector signed int vscore;
int32_t score[4];
} u = { .vscore = vec_splat_s32(0) };
while (size16) {
// score += (pix1[i] - pix2[i]) * (pix1[i] - pix2[i]);
// load pix1 and the first batch of pix2
vpix1 = vec_unaligned_load(pix1);
vpix2 = vec_unaligned_load(pix2);
pix2 += 8;
// unpack
vpix1h = vec_unpackh(vpix1);
vdiff = vec_sub(vpix1h, vpix2);
vpix1l = vec_unpackl(vpix1);
// load another batch from pix2
vpix2 = vec_unaligned_load(pix2);
u.vscore = vec_msum(vdiff, vdiff, u.vscore);
vdiff = vec_sub(vpix1l, vpix2);
u.vscore = vec_msum(vdiff, vdiff, u.vscore);
pix1 += 16;
pix2 += 8;
size16--;
}
u.vscore = vec_sums(u.vscore, vec_splat_s32(0));
size %= 16;
for (i = 0; i < size; i++)
u.score[3] += (pix1[i] - pix2[i]) * (pix1[i] - pix2[i]);
return u.score[3];
}
#endif /* HAVE_ALTIVEC */
/**
 * Install the AltiVec ssd_int8_vs_int16 routine into an SVQ1 encoder
 * context, provided AltiVec is compiled in and reported by the running CPU.
 *
 * @param c encoder context whose ssd_int8_vs_int16 pointer may be replaced
 */
av_cold void ff_svq1enc_init_ppc(SVQ1EncContext *c)
{
#if HAVE_ALTIVEC
    if (PPC_ALTIVEC(av_get_cpu_flags()))
        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_altivec;
#endif /* HAVE_ALTIVEC */
}

View File

@@ -0,0 +1,363 @@
/*
* VC-1 and WMV3 decoder - DSP functions AltiVec-optimized
* Copyright (c) 2006 Konstantin Shishkov
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/vc1dsp.h"
#if HAVE_ALTIVEC
// main steps of 8x8 transform
//
// One 8-point pass over the vectors s0..s7, updated in place. The macro
// relies on names from the enclosing scope: temporaries t0..t7 and the shift
// counts vec_1, vec_2, vec_3, vec_4. vec_rnd is added to the even part only.
//
// Even part:  t4 = 12*(s0+s4) + rnd + (16*s2 + 6*s6)
//             t5 = 12*(s0-s4) + rnd + (6*s2 - 16*s6)
//             t6 = 12*(s0-s4) + rnd - (6*s2 - 16*s6)
//             t7 = 12*(s0+s4) + rnd - (16*s2 + 6*s6)
// Odd part:   t0 = 16*s1 + 15*s3 +  9*s5 +  4*s7
//             t1 = 15*s1 -  4*s3 - 16*s5 -  9*s7
//             t2 =  9*s1 - 16*s3 +  4*s5 + 15*s7
//             t3 =  4*s1 -  9*s3 + 15*s5 - 16*s7
// Outputs:    s0..s3 = t4+t0, t5+t1, t6+t2, t7+t3
//             s4..s7 = t7-t3, t6-t2, t5-t1, t4-t0
#define STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_rnd) \
do { \
t0 = vec_sl(vec_add(s0, s4), vec_2); \
t0 = vec_add(vec_sl(t0, vec_1), t0); \
t0 = vec_add(t0, vec_rnd); \
t1 = vec_sl(vec_sub(s0, s4), vec_2); \
t1 = vec_add(vec_sl(t1, vec_1), t1); \
t1 = vec_add(t1, vec_rnd); \
t2 = vec_add(vec_sl(s6, vec_2), vec_sl(s6, vec_1)); \
t2 = vec_add(t2, vec_sl(s2, vec_4)); \
t3 = vec_add(vec_sl(s2, vec_2), vec_sl(s2, vec_1)); \
t3 = vec_sub(t3, vec_sl(s6, vec_4)); \
t4 = vec_add(t0, t2); \
t5 = vec_add(t1, t3); \
t6 = vec_sub(t1, t3); \
t7 = vec_sub(t0, t2); \
\
t0 = vec_sl(vec_add(s1, s3), vec_4); \
t0 = vec_add(t0, vec_sl(s5, vec_3)); \
t0 = vec_add(t0, vec_sl(s7, vec_2)); \
t0 = vec_add(t0, vec_sub(s5, s3)); \
\
t1 = vec_sl(vec_sub(s1, s5), vec_4); \
t1 = vec_sub(t1, vec_sl(s7, vec_3)); \
t1 = vec_sub(t1, vec_sl(s3, vec_2)); \
t1 = vec_sub(t1, vec_add(s1, s7)); \
\
t2 = vec_sl(vec_sub(s7, s3), vec_4); \
t2 = vec_add(t2, vec_sl(s1, vec_3)); \
t2 = vec_add(t2, vec_sl(s5, vec_2)); \
t2 = vec_add(t2, vec_sub(s1, s7)); \
\
t3 = vec_sl(vec_sub(s5, s7), vec_4); \
t3 = vec_sub(t3, vec_sl(s3, vec_3)); \
t3 = vec_add(t3, vec_sl(s1, vec_2)); \
t3 = vec_sub(t3, vec_add(s3, s5)); \
\
s0 = vec_add(t4, t0); \
s1 = vec_add(t5, t1); \
s2 = vec_add(t6, t2); \
s3 = vec_add(t7, t3); \
s4 = vec_sub(t7, t3); \
s5 = vec_sub(t6, t2); \
s6 = vec_sub(t5, t1); \
s7 = vec_sub(t4, t0); \
}while(0)
// Arithmetic right shift by 3 of all eight vectors; uses vec_3 from the
// enclosing scope. Used after the first STEP8 pass.
#define SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7) \
do { \
s0 = vec_sra(s0, vec_3); \
s1 = vec_sra(s1, vec_3); \
s2 = vec_sra(s2, vec_3); \
s3 = vec_sra(s3, vec_3); \
s4 = vec_sra(s4, vec_3); \
s5 = vec_sra(s5, vec_3); \
s6 = vec_sra(s6, vec_3); \
s7 = vec_sra(s7, vec_3); \
}while(0)
// Arithmetic right shift by 7 of all eight vectors; s4..s7 additionally get
// vec_1s added before the shift. Uses vec_7 and vec_1s from the enclosing
// scope. Used after the second STEP8 pass.
#define SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7) \
do { \
s0 = vec_sra(s0, vec_7); \
s1 = vec_sra(s1, vec_7); \
s2 = vec_sra(s2, vec_7); \
s3 = vec_sra(s3, vec_7); \
s4 = vec_sra(vec_add(s4, vec_1s), vec_7); \
s5 = vec_sra(vec_add(s5, vec_1s), vec_7); \
s6 = vec_sra(vec_add(s6, vec_1s), vec_7); \
s7 = vec_sra(vec_add(s7, vec_1s), vec_7); \
}while(0)
/* main steps of 4x4 transform
 *
 * One 4-point pass over s0..s3, updated in place. Relies on t0..t3 and the
 * shift counts vec_1, vec_2, vec_3, vec_4, vec_5 from the enclosing scope.
 *
 *   t0 = 17*s0 + rnd + 17*s2        t1 = 17*s0 + rnd - 17*s2
 *   t2 = 22*s1 + 10*s3              t3 = 22*s3 - 10*s1
 *   s0 = t0 + t2,  s1 = t1 - t3,  s2 = t1 + t3,  s3 = t0 - t2 */
#define STEP4(s0, s1, s2, s3, vec_rnd) \
do { \
t1 = vec_add(vec_sl(s0, vec_4), s0); \
t1 = vec_add(t1, vec_rnd); \
t2 = vec_add(vec_sl(s2, vec_4), s2); \
t0 = vec_add(t1, t2); \
t1 = vec_sub(t1, t2); \
t3 = vec_sl(vec_sub(s3, s1), vec_1); \
t3 = vec_add(t3, vec_sl(t3, vec_2)); \
t2 = vec_add(t3, vec_sl(s1, vec_5)); \
t3 = vec_add(t3, vec_sl(s3, vec_3)); \
t3 = vec_add(t3, vec_sl(s3, vec_2)); \
s0 = vec_add(t0, t2); \
s1 = vec_sub(t1, t3); \
s2 = vec_add(t1, t3); \
s3 = vec_sub(t0, t2); \
}while (0)
/* Arithmetic right shift by 3 of four vectors, in place; uses vec_3 from the
 * enclosing scope. Wrapped in do { } while (0) like SHIFT_HOR8 so that the
 * macro expands to exactly one statement and stays correct under an
 * unbraced if/else. */
#define SHIFT_HOR4(s0, s1, s2, s3) \
do { \
    s0 = vec_sra(s0, vec_3); \
    s1 = vec_sra(s1, vec_3); \
    s2 = vec_sra(s2, vec_3); \
    s3 = vec_sra(s3, vec_3); \
} while (0)
/* Arithmetic right shift by 7 of four vectors, in place; uses vec_7 from the
 * enclosing scope. Single-statement form, see SHIFT_HOR4. */
#define SHIFT_VERT4(s0, s1, s2, s3) \
do { \
    s0 = vec_sra(s0, vec_7); \
    s1 = vec_sra(s1, vec_7); \
    s2 = vec_sra(s2, vec_7); \
    s3 = vec_sra(s3, vec_7); \
} while (0)
/** Do inverse transform on 8x8 block
*
* The 64 coefficients are transformed in place in two STEP8 passes:
*  - pass 1: rounding constant 4, results shifted right by 3 (SHIFT_HOR8);
*  - the 8x8 matrix is transposed (TRANSPOSE8);
*  - pass 2: rounding constant 64, results shifted right by 7, with an
*    extra +1 on outputs 4..7 (SHIFT_VERT8).
* Each pass widens the 16-bit rows to two sets of 32-bit vectors, runs
* STEP8 on both sets and packs back to 16 bit.
*
* block is accessed with vec_ld/vec_st, i.e. as 16-byte aligned vectors.
* NOTE(review): the result is stored without a second transpose --
* presumably the caller's block layout accounts for that; confirm.
*/
static void vc1_inv_trans_8x8_altivec(int16_t block[64])
{
vector signed short src0, src1, src2, src3, src4, src5, src6, src7;
vector signed int s0, s1, s2, s3, s4, s5, s6, s7;
vector signed int s8, s9, sA, sB, sC, sD, sE, sF;
vector signed int t0, t1, t2, t3, t4, t5, t6, t7;
/* 4 << 4 = 64: rounding constant of the second pass */
const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4));
const vector unsigned int vec_7 = vec_splat_u32(7);
const vector unsigned int vec_4 = vec_splat_u32(4);
/* rounding constant of the first pass */
const vector signed int vec_4s = vec_splat_s32(4);
const vector unsigned int vec_3 = vec_splat_u32(3);
const vector unsigned int vec_2 = vec_splat_u32(2);
const vector signed int vec_1s = vec_splat_s32(1);
const vector unsigned int vec_1 = vec_splat_u32(1);
/* Load the eight rows (16 bytes = 8 coefficients each). */
src0 = vec_ld( 0, block);
src1 = vec_ld( 16, block);
src2 = vec_ld( 32, block);
src3 = vec_ld( 48, block);
src4 = vec_ld( 64, block);
src5 = vec_ld( 80, block);
src6 = vec_ld( 96, block);
src7 = vec_ld(112, block);
/* Sign-extend to 32 bit: s0..s7 from vec_unpackl, s8..sF from vec_unpackh. */
s0 = vec_unpackl(src0);
s1 = vec_unpackl(src1);
s2 = vec_unpackl(src2);
s3 = vec_unpackl(src3);
s4 = vec_unpackl(src4);
s5 = vec_unpackl(src5);
s6 = vec_unpackl(src6);
s7 = vec_unpackl(src7);
s8 = vec_unpackh(src0);
s9 = vec_unpackh(src1);
sA = vec_unpackh(src2);
sB = vec_unpackh(src3);
sC = vec_unpackh(src4);
sD = vec_unpackh(src5);
sE = vec_unpackh(src6);
sF = vec_unpackh(src7);
/* First pass on both halves. */
STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4s);
SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7);
STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4s);
SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF);
/* Back to 16 bit; the unpackh half goes first, mirroring the unpack. */
src0 = vec_pack(s8, s0);
src1 = vec_pack(s9, s1);
src2 = vec_pack(sA, s2);
src3 = vec_pack(sB, s3);
src4 = vec_pack(sC, s4);
src5 = vec_pack(sD, s5);
src6 = vec_pack(sE, s6);
src7 = vec_pack(sF, s7);
TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);
/* Widen again for the second pass. */
s0 = vec_unpackl(src0);
s1 = vec_unpackl(src1);
s2 = vec_unpackl(src2);
s3 = vec_unpackl(src3);
s4 = vec_unpackl(src4);
s5 = vec_unpackl(src5);
s6 = vec_unpackl(src6);
s7 = vec_unpackl(src7);
s8 = vec_unpackh(src0);
s9 = vec_unpackh(src1);
sA = vec_unpackh(src2);
sB = vec_unpackh(src3);
sC = vec_unpackh(src4);
sD = vec_unpackh(src5);
sE = vec_unpackh(src6);
sF = vec_unpackh(src7);
/* Second pass on both halves. */
STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_64);
SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7);
STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_64);
SHIFT_VERT8(s8, s9, sA, sB, sC, sD, sE, sF);
src0 = vec_pack(s8, s0);
src1 = vec_pack(s9, s1);
src2 = vec_pack(sA, s2);
src3 = vec_pack(sB, s3);
src4 = vec_pack(sC, s4);
src5 = vec_pack(sD, s5);
src6 = vec_pack(sE, s6);
src7 = vec_pack(sF, s7);
/* Write the eight result rows back over the input. */
vec_st(src0, 0, block);
vec_st(src1, 16, block);
vec_st(src2, 32, block);
vec_st(src3, 48, block);
vec_st(src4, 64, block);
vec_st(src5, 80, block);
vec_st(src6, 96, block);
vec_st(src7,112, block);
}
/** Do inverse transform on 8x4 part of block
*
* Steps, as performed below:
*  - load the 8 coefficient rows and transpose them (TRANSPOSE8);
*  - 8-point pass: STEP8 with rounding constant 4, shift right by 3;
*  - pack to 16 bit and transpose again;
*  - 4-point pass on the first four vectors only: STEP4 with rounding
*    constant 64, shift right by 7;
*  - add each of the four result rows to 8 pixels of dest with saturation
*    (vec_adds + vec_packsu) and store them back, advancing dest by stride.
*
* @param dest   destination pixels; 8 bytes are rewritten on each of 4 rows
* @param stride byte distance between destination rows
* @param block  coefficients, read with vec_ld as 16-byte aligned vectors;
*               not modified
*/
static void vc1_inv_trans_8x4_altivec(uint8_t *dest, int stride, int16_t *block)
{
vector signed short src0, src1, src2, src3, src4, src5, src6, src7;
vector signed int s0, s1, s2, s3, s4, s5, s6, s7;
vector signed int s8, s9, sA, sB, sC, sD, sE, sF;
vector signed int t0, t1, t2, t3, t4, t5, t6, t7;
/* 4 << 4 = 64: rounding constant of the 4-point pass */
const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4));
const vector unsigned int vec_7 = vec_splat_u32(7);
const vector unsigned int vec_5 = vec_splat_u32(5);
const vector unsigned int vec_4 = vec_splat_u32(4);
/* rounding constant of the 8-point pass */
const vector signed int vec_4s = vec_splat_s32(4);
const vector unsigned int vec_3 = vec_splat_u32(3);
const vector unsigned int vec_2 = vec_splat_u32(2);
const vector unsigned int vec_1 = vec_splat_u32(1);
vector unsigned char tmp;
vector signed short tmp2, tmp3;
/* Only assigned in the HAVE_BIGENDIAN branch below. */
vector unsigned char perm0, perm1, p0, p1, p;
src0 = vec_ld( 0, block);
src1 = vec_ld( 16, block);
src2 = vec_ld( 32, block);
src3 = vec_ld( 48, block);
src4 = vec_ld( 64, block);
src5 = vec_ld( 80, block);
src6 = vec_ld( 96, block);
src7 = vec_ld(112, block);
TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);
/* Sign-extend to 32 bit: s0..s7 from vec_unpackl, s8..sF from vec_unpackh. */
s0 = vec_unpackl(src0);
s1 = vec_unpackl(src1);
s2 = vec_unpackl(src2);
s3 = vec_unpackl(src3);
s4 = vec_unpackl(src4);
s5 = vec_unpackl(src5);
s6 = vec_unpackl(src6);
s7 = vec_unpackl(src7);
s8 = vec_unpackh(src0);
s9 = vec_unpackh(src1);
sA = vec_unpackh(src2);
sB = vec_unpackh(src3);
sC = vec_unpackh(src4);
sD = vec_unpackh(src5);
sE = vec_unpackh(src6);
sF = vec_unpackh(src7);
/* 8-point pass on both halves. */
STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4s);
SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7);
STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4s);
SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF);
src0 = vec_pack(s8, s0);
src1 = vec_pack(s9, s1);
src2 = vec_pack(sA, s2);
src3 = vec_pack(sB, s3);
src4 = vec_pack(sC, s4);
src5 = vec_pack(sD, s5);
src6 = vec_pack(sE, s6);
src7 = vec_pack(sF, s7);
TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);
/* Only the first four vectors take part in the 4-point pass. */
s0 = vec_unpackh(src0);
s1 = vec_unpackh(src1);
s2 = vec_unpackh(src2);
s3 = vec_unpackh(src3);
s8 = vec_unpackl(src0);
s9 = vec_unpackl(src1);
sA = vec_unpackl(src2);
sB = vec_unpackl(src3);
STEP4(s0, s1, s2, s3, vec_64);
SHIFT_VERT4(s0, s1, s2, s3);
STEP4(s8, s9, sA, sB, vec_64);
SHIFT_VERT4(s8, s9, sA, sB);
src0 = vec_pack(s0, s8);
src1 = vec_pack(s1, s9);
src2 = vec_pack(s2, sA);
src3 = vec_pack(s3, sB);
#if HAVE_BIGENDIAN
/* Permute masks that pick 8 destination bytes starting at dest (perm0) or
* dest + stride (perm1) and pair each with a byte taken from the zero
* operand (index 0xFF), producing 16-bit lanes. */
p0 = vec_lvsl (0, dest);
p1 = vec_lvsl (stride, dest);
p = vec_splat_u8 (-1);
perm0 = vec_mergeh (p, p0);
perm1 = vec_mergeh (p, p1);
/* NOTE(review): this variant reads `dest` directly instead of its `dst`
* parameter; it works because every use below passes `dest`. */
#define GET_TMP2(dst, p) \
tmp = vec_ld (0, dest); \
tmp2 = (vector signed short)vec_perm (tmp, vec_splat_u8(0), p);
#else
/* Little-endian variant: unaligned VSX load, then interleave with zero
* bytes; the permute argument is ignored. */
#define GET_TMP2(dst,p) \
tmp = vec_vsx_ld (0, dst); \
tmp2 = (vector signed short)vec_mergeh (tmp, vec_splat_u8(0));
#endif
/* Add one row of residuals to 8 destination pixels: load and widen the
* pixels, saturating add, pack to unsigned bytes with saturation, then
* store 8 bytes as two 32-bit element stores at offsets 0 and 4. The macro
* expands to several statements ending in ';'. */
#define ADD(dest,src,perm) \
GET_TMP2(dest, perm); \
tmp3 = vec_adds (tmp2, src); \
tmp = vec_packsu (tmp3, tmp3); \
vec_ste ((vector unsigned int)tmp, 0, (unsigned int *)dest); \
vec_ste ((vector unsigned int)tmp, 4, (unsigned int *)dest);
ADD (dest, src0, perm0) dest += stride;
ADD (dest, src1, perm1) dest += stride;
ADD (dest, src2, perm0) dest += stride;
ADD (dest, src3, perm1)
}
/* Store operators selected through OP_U8_ALTIVEC below: PUT assigns the new
 * value, AVG assigns vec_avg() of the current destination value and the new
 * value. Presumably both macros are expanded inside h264chroma_template.c --
 * verify against the template. */
#define PUT_OP_U8_ALTIVEC(d, s, dst) d = s
#define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s)
/* First inclusion of the template: "put" flavour, emitted under the name
 * put_no_rnd_vc1_chroma_mc8_altivec. */
#define OP_U8_ALTIVEC PUT_OP_U8_ALTIVEC
#define PREFIX_no_rnd_vc1_chroma_mc8_altivec put_no_rnd_vc1_chroma_mc8_altivec
#include "h264chroma_template.c"
#undef OP_U8_ALTIVEC
#undef PREFIX_no_rnd_vc1_chroma_mc8_altivec
/* Second inclusion of the template: "avg" flavour, emitted under the name
 * avg_no_rnd_vc1_chroma_mc8_altivec. */
#define OP_U8_ALTIVEC AVG_OP_U8_ALTIVEC
#define PREFIX_no_rnd_vc1_chroma_mc8_altivec avg_no_rnd_vc1_chroma_mc8_altivec
#include "h264chroma_template.c"
#undef OP_U8_ALTIVEC
#undef PREFIX_no_rnd_vc1_chroma_mc8_altivec
#endif /* HAVE_ALTIVEC */
/**
 * Install the AltiVec VC-1 routines into a VC1DSPContext.
 *
 * When AltiVec is compiled in and reported by the running CPU, the 8x8 and
 * 8x4 inverse transforms and entry 0 of the put/avg no-rounding chroma
 * tables are replaced; otherwise the context is left untouched.
 *
 * @param dsp DSP context to update
 */
av_cold void ff_vc1dsp_init_ppc(VC1DSPContext *dsp)
{
#if HAVE_ALTIVEC
    if (PPC_ALTIVEC(av_get_cpu_flags())) {
        /* inverse transforms */
        dsp->vc1_inv_trans_8x8 = vc1_inv_trans_8x8_altivec;
        dsp->vc1_inv_trans_8x4 = vc1_inv_trans_8x4_altivec;

        /* chroma motion compensation generated from h264chroma_template.c */
        dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = put_no_rnd_vc1_chroma_mc8_altivec;
        dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = avg_no_rnd_vc1_chroma_mc8_altivec;
    }
#endif /* HAVE_ALTIVEC */
}

View File

@@ -0,0 +1,36 @@
/*
* Copyright (c) 2003-2004 Romain Dolbeau
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavcodec/videodsp.h"
/**
 * Issue a data-cache touch hint (dcbt) for the start of each of h rows.
 *
 * @param mem    address of the first row
 * @param stride byte distance between consecutive rows
 * @param h      number of rows to touch; values <= 0 touch nothing
 */
static void prefetch_ppc(uint8_t *mem, ptrdiff_t stride, int h)
{
    register const uint8_t *p = mem;

    /* A pre-tested loop: the former do { } while (--h) form executed the
     * body once even for h == 0 and then kept decrementing h through the
     * whole negative range before it could reach zero. */
    for (; h > 0; h--) {
        __asm__ volatile ("dcbt 0,%0" : : "r" (p));
        p += stride;
    }
}
/**
 * Install the PowerPC prefetch routine into a VideoDSPContext.
 *
 * @param ctx DSP context whose prefetch pointer is replaced
 * @param bpc not used by this function
 */
av_cold void ff_videodsp_init_ppc(VideoDSPContext *ctx, int bpc)
{
    (void) bpc;

    ctx->prefetch = prefetch_ppc;
}

View File

@@ -0,0 +1,63 @@
/*
* Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
#include "libavcodec/vorbisdsp.h"
#if HAVE_ALTIVEC
/* In-place inverse channel coupling on mag[] / ang[], four floats at a time.
 *
 * Per lane, with m = mag[i] and a = ang[i] as loaded:
 *     a' = (m <= 0) ? -a : a          (sign bit flipped with XOR)
 *     if (a <= 0) { mag[i] = m + a';  ang[i] = m;      }
 *     else        { mag[i] = m;       ang[i] = m - a'; }
 * The branches are realised with comparison masks and AND/ANDC instead of
 * control flow.
 *
 * mag, ang   arrays read with vec_ld and written with vec_stl, one whole
 *            vector per iteration -- assumes 16-byte aligned buffers and a
 *            blocksize that is a multiple of 4; TODO confirm with callers
 * blocksize  number of floats to process */
static void vorbis_inverse_coupling_altivec(float *mag, float *ang,
intptr_t blocksize)
{
int i;
vector float m, a;
vector bool int t0, t1;
/* 15 + 15 + 1 = 31: shift count that moves a mask bit into the sign bit */
const vector unsigned int v_31 = //XXX
vec_add(vec_add(vec_splat_u32(15),vec_splat_u32(15)),vec_splat_u32(1));
for (i = 0; i < blocksize; i += 4) {
m = vec_ld(0, mag+i);
a = vec_ld(0, ang+i);
/* All-zero bits reinterpreted as float give the 0.0 to compare against. */
t0 = vec_cmple(m, (vector float)vec_splat_u32(0));
t1 = vec_cmple(a, (vector float)vec_splat_u32(0));
/* Flip the sign of a in the lanes where m <= 0. */
a = vec_xor(a, (vector float) vec_sl((vector unsigned int)t0, v_31));
/* t0: a' where the original a <= 0, else 0; t1: a' where a > 0, else 0. */
t0 = (vector bool int)vec_and(a, t1);
t1 = (vector bool int)vec_andc(a, t1);
a = vec_sub(m, (vector float)t1);
m = vec_add(m, (vector float)t0);
vec_stl(a, 0, ang+i);
vec_stl(m, 0, mag+i);
}
}
#endif /* HAVE_ALTIVEC */
/**
 * Install the AltiVec inverse-coupling routine into a VorbisDSPContext,
 * provided AltiVec is compiled in and reported by the running CPU.
 *
 * @param c DSP context whose vorbis_inverse_coupling pointer may be replaced
 */
av_cold void ff_vorbisdsp_init_ppc(VorbisDSPContext *c)
{
#if HAVE_ALTIVEC
    if (PPC_ALTIVEC(av_get_cpu_flags()))
        c->vorbis_inverse_coupling = vorbis_inverse_coupling_altivec;
#endif /* HAVE_ALTIVEC */
}

View File

@@ -0,0 +1,203 @@
/*
* Copyright (C) 2009 David Conrad
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <string.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/vp3dsp.h"
#if HAVE_ALTIVEC
/*
 * Fixed-point IDCT multipliers: entry k (k = 1..7) is cos(k * pi / 16)
 * scaled by 65536 and rounded. Entry 0 is never splatted by IDCT_START.
 * Entries 1..5 exceed 32767 although the lanes are signed 16-bit; M16()
 * exists to compensate for that (see the comment above M15/M16).
 */
static const vec_s16 constants =
{0, 64277, 60547, 54491, 46341, 36410, 25080, 12785};
/*
 * vec_perm control vector used by M15(). From the pair of 32-bit product
 * vectors (byte indices 0-15 address the first operand, 16-31 the second)
 * it takes two bytes out of every 32-bit word, alternating between the
 * two operands. The big-endian table selects bytes 0-1 of each word and
 * the little-endian table bytes 2-3 -- presumably the most significant
 * half in both cases, which implements the ">> 16" of the product.
 */
#if HAVE_BIGENDIAN
static const vec_u8 interleave_high =
{0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29};
#else
static const vec_u8 interleave_high =
{2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
#endif
/*
 * Common prologue of the two IDCT functions. Expands to declarations only:
 *   - the temporaries written by IDCT_1D (A..H, Ad..Hd, Add, Bdd);
 *   - eight / four, the operands used by ADD8 and SHIFT4;
 *   - C1..C7, each one lane of `constants` replicated across a vector;
 *   - b0..b7, loaded from byte offsets 0x00..0x70 of `block`, i.e. eight
 *     vectors of eight 16-bit coefficients.
 * The expansion site must have an identifier named `block` in scope.
 */
#define IDCT_START \
vec_s16 A, B, C, D, Ad, Bd, Cd, Dd, E, F, G, H;\
vec_s16 Ed, Gd, Add, Bdd, Fd, Hd;\
vec_s16 eight = vec_splat_s16(8);\
vec_u16 four = vec_splat_u16(4);\
\
vec_s16 C1 = vec_splat(constants, 1);\
vec_s16 C2 = vec_splat(constants, 2);\
vec_s16 C3 = vec_splat(constants, 3);\
vec_s16 C4 = vec_splat(constants, 4);\
vec_s16 C5 = vec_splat(constants, 5);\
vec_s16 C6 = vec_splat(constants, 6);\
vec_s16 C7 = vec_splat(constants, 7);\
\
vec_s16 b0 = vec_ld(0x00, block);\
vec_s16 b1 = vec_ld(0x10, block);\
vec_s16 b2 = vec_ld(0x20, block);\
vec_s16 b3 = vec_ld(0x30, block);\
vec_s16 b4 = vec_ld(0x40, block);\
vec_s16 b5 = vec_ld(0x50, block);\
vec_s16 b6 = vec_ld(0x60, block);\
vec_s16 b7 = vec_ld(0x70, block);
/*
 * Fixed-point multiply helpers: both compute (a * C) >> 16 for every
 * 16-bit lane. The difficulty is that a is signed whereas the constant C
 * is meant as an unsigned value:
 *   M15 is for constants that fit in 15 bits (C6, C7);
 *   M16 is for constants that need the full 16 bits.
 */
static inline vec_s16 M15(vec_s16 a, vec_s16 C)
{
    /* full 32-bit products of the even and of the odd lanes */
    vec_s32 even_products = vec_mule(a, C);
    vec_s32 odd_products  = vec_mulo(a, C);

    /* keep one 16-bit half of every product, back in lane order */
    return (vec_s16)vec_perm(even_products, odd_products, interleave_high);
}
/*
 * (a * C) >> 16 for a constant C that needs all 16 bits as an unsigned
 * value. The signed multiply in M15() sees such a constant as C - 65536,
 * which yields ((a * C) >> 16) - a, so a is added back here.
 */
static inline vec_s16 M16(vec_s16 a, vec_s16 C)
{
    vec_s16 signed_product = M15(a, C);

    return vec_add(a, signed_product);
}
/*
 * One 1-D pass of the 8-point inverse DCT over the vectors b0..b7
 * declared by IDCT_START; the results overwrite b0..b7.
 *
 *   ADD   is applied to the two even-part terms E and F only (the
 *         callers pass NOP for the first pass and ADD8 for the second);
 *   SHIFT is applied to each of the eight outputs (NOP, then SHIFT4).
 *
 * Odd inputs (b1, b3, b5, b7) produce A..D, even inputs (b0, b2, b4, b6)
 * produce E..H; the final adds/subs combine both halves.
 */
#define IDCT_1D(ADD, SHIFT)\
A = vec_add(M16(b1, C1), M15(b7, C7));\
B = vec_sub(M15(b1, C7), M16(b7, C1));\
C = vec_add(M16(b3, C3), M16(b5, C5));\
D = vec_sub(M16(b5, C3), M16(b3, C5));\
\
Ad = M16(vec_sub(A, C), C4);\
Bd = M16(vec_sub(B, D), C4);\
\
Cd = vec_add(A, C);\
Dd = vec_add(B, D);\
\
E = ADD(M16(vec_add(b0, b4), C4));\
F = ADD(M16(vec_sub(b0, b4), C4));\
\
G = vec_add(M16(b2, C2), M15(b6, C6));\
H = vec_sub(M15(b2, C6), M16(b6, C2));\
\
Ed = vec_sub(E, G);\
Gd = vec_add(E, G);\
\
Add = vec_add(F, Ad);\
Bdd = vec_sub(Bd, H);\
\
Fd = vec_sub(F, Ad);\
Hd = vec_add(Bd, H);\
\
b0 = SHIFT(vec_add(Gd, Cd));\
b7 = SHIFT(vec_sub(Gd, Cd));\
\
b1 = SHIFT(vec_add(Add, Hd));\
b2 = SHIFT(vec_sub(Add, Hd));\
\
b3 = SHIFT(vec_add(Ed, Dd));\
b4 = SHIFT(vec_sub(Ed, Dd));\
\
b5 = SHIFT(vec_add(Fd, Bdd));\
b6 = SHIFT(vec_sub(Fd, Bdd));
/*
 * Operators plugged into IDCT_1D(ADD, SHIFT):
 *   NOP    passes its argument through unchanged;
 *   ADD8   adds the `eight` vector declared by IDCT_START (rounding term);
 *   SHIFT4 arithmetic-shifts right by the `four` vector from IDCT_START.
 */
#define NOP(a) a
#define ADD8(a) vec_add(a, eight)
#define SHIFT4(a) vec_sra(a, four)
static void vp3_idct_put_altivec(uint8_t *dst, int stride, int16_t block[64])
{
vec_u8 t;
IDCT_START
// pixels are signed; so add 128*16 in addition to the normal 8
vec_s16 v2048 = vec_sl(vec_splat_s16(1), vec_splat_u16(11));
eight = vec_add(eight, v2048);
IDCT_1D(NOP, NOP)
TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7);
IDCT_1D(ADD8, SHIFT4)
#define PUT(a)\
t = vec_packsu(a, a);\
vec_ste((vec_u32)t, 0, (unsigned int *)dst);\
vec_ste((vec_u32)t, 4, (unsigned int *)dst);
PUT(b0) dst += stride;
PUT(b1) dst += stride;
PUT(b2) dst += stride;
PUT(b3) dst += stride;
PUT(b4) dst += stride;
PUT(b5) dst += stride;
PUT(b6) dst += stride;
PUT(b7)
memset(block, 0, sizeof(*block) * 64);
}
static void vp3_idct_add_altivec(uint8_t *dst, int stride, int16_t block[64])
{
LOAD_ZERO;
vec_u8 t, vdst;
vec_s16 vdst_16;
vec_u8 vdst_mask = vec_mergeh(vec_splat_u8(-1), vec_lvsl(0, dst));
IDCT_START
IDCT_1D(NOP, NOP)
TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7);
IDCT_1D(ADD8, SHIFT4)
#if HAVE_BIGENDIAN
#define GET_VDST16\
vdst = vec_ld(0, dst);\
vdst_16 = (vec_s16)vec_perm(vdst, zero_u8v, vdst_mask);
#else
#define GET_VDST16\
vdst = vec_vsx_ld(0,dst);\
vdst_16 = (vec_s16)vec_mergeh(vdst, zero_u8v);
#endif
#define ADD(a)\
GET_VDST16;\
vdst_16 = vec_adds(a, vdst_16);\
t = vec_packsu(vdst_16, vdst_16);\
vec_ste((vec_u32)t, 0, (unsigned int *)dst);\
vec_ste((vec_u32)t, 4, (unsigned int *)dst);
ADD(b0) dst += stride;
ADD(b1) dst += stride;
ADD(b2) dst += stride;
ADD(b3) dst += stride;
ADD(b4) dst += stride;
ADD(b5) dst += stride;
ADD(b6) dst += stride;
ADD(b7)
memset(block, 0, sizeof(*block) * 64);
}
#endif /* HAVE_ALTIVEC */
/**
 * Install the PowerPC implementations into a VP3DSPContext.
 *
 * When built with AltiVec support and the running CPU reports it, both
 * idct_put and idct_add are replaced; otherwise the context is left
 * untouched.
 *
 * @param c     context whose function pointers may be overridden
 * @param flags not used by this function
 */
av_cold void ff_vp3dsp_init_ppc(VP3DSPContext *c, int flags)
{
#if HAVE_ALTIVEC
    if (PPC_ALTIVEC(av_get_cpu_flags())) {
        c->idct_put = vp3_idct_put_altivec;
        c->idct_add = vp3_idct_add_altivec;
    }
#endif
}

View File

@@ -0,0 +1,359 @@
/*
* VP8 compatible video decoder
*
* Copyright (C) 2010 David Conrad
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/vp8dsp.h"
#include "hpeldsp_altivec.h"
#if HAVE_ALTIVEC
// Expands to a brace initializer holding its argument list four times in
// a row, e.g. four values become the sixteen bytes of a vector initializer.
#define REPT4(...) { __VA_ARGS__, __VA_ARGS__, __VA_ARGS__, __VA_ARGS__ }
// h subpel filter uses msum to multiply+add 4 pixel taps at once
// One row per fractional position; LOAD_H_SUBPEL_FILTER indexes it with
// mx-1. Each row holds the four inner (signed) taps, repeated four times
// so that one vector filters four output pixels.
static const vec_s8 h_subpel_filters_inner[7] =
{
REPT4( -6, 123, 12, -1),
REPT4(-11, 108, 36, -8),
REPT4( -9, 93, 50, -6),
REPT4(-16, 77, 77, -16),
REPT4( -6, 50, 93, -9),
REPT4( -8, 36, 108, -11),
REPT4( -1, 12, 123, -6),
};
// for 6tap filters, these are the outer two taps
// The zeros mask off pixels 4-7 when filtering 0-3
// and vice-versa
// LOAD_H_SUBPEL_FILTER indexes this table with (mx-1)>>1 and derives the
// variant for pixels 4-7 by rotating the selected row by two bytes.
static const vec_s8 h_subpel_filters_outer[3] =
{
REPT4(0, 0, 2, 1),
REPT4(0, 0, 3, 3),
REPT4(0, 0, 1, 2),
};
// Declares the three filter vectors consumed by FILTER_H for filter index i:
//   filter_inner  - the four inner taps (row i of h_subpel_filters_inner)
//   filter_outerh - outer taps laid out for pixels 0-3 (row i>>1 of
//                   h_subpel_filters_outer)
//   filter_outerl - the same row rotated by two bytes, for pixels 4-7
#define LOAD_H_SUBPEL_FILTER(i) \
vec_s8 filter_inner = h_subpel_filters_inner[i]; \
vec_s8 filter_outerh = h_subpel_filters_outer[(i)>>1]; \
vec_s8 filter_outerl = vec_sld(filter_outerh, filter_outerh, 2)
// GET_PIXHL(offset): load the source bytes starting at
// src + offset - is6tap - 1 and gather them into pixh / pixl, the
// tap-ordered inputs for output pixels 0-3 and 4-7 respectively.
// GET_OUTER(offset): gather the bytes for the two outer taps into `outer`
// from the data GET_PIXHL loaded.
//
// Big endian: two vec_ld loads (a, b) are combined through the
// precomputed permh<offset> / perml<offset> / perm_6tap<offset> vectors,
// so `offset` must be a literal that can be token-pasted (0 or 8 here).
// Little endian: a single vec_vsx_ld load is permuted with perm_inner
// (plus 4 for the low half) and perm_outer.
//
// Both macros rely on locals of the enclosing function: a, b, pixh, pixl,
// outer, src, is6tap and the perm vectors named above.
#if HAVE_BIGENDIAN
#define GET_PIXHL(offset) \
a = vec_ld((offset)-is6tap-1, src); \
b = vec_ld((offset)-is6tap-1+15, src); \
pixh = vec_perm(a, b, permh##offset); \
pixl = vec_perm(a, b, perml##offset)
#define GET_OUTER(offset) outer = vec_perm(a, b, perm_6tap##offset)
#else
#define GET_PIXHL(offset) \
a = vec_vsx_ld((offset)-is6tap-1, src); \
pixh = vec_perm(a, a, perm_inner); \
pixl = vec_perm(a, a, vec_add(perm_inner, vec_splat_u8(4)))
#define GET_OUTER(offset) outer = vec_perm(a, a, perm_outer)
#endif
// Filter eight horizontally adjacent pixels starting at column `off` and
// leave the eight 16-bit results in dstv.
//   1. GET_PIXHL gathers the inputs for pixels 0-3 (pixh) and 4-7 (pixl).
//   2. vec_msum applies the four inner taps, accumulating onto c64
//      (the rounding constant 64).
//   3. For 6-tap filters the two outer taps are accumulated on top.
//   4. When w == 4 the low half is replaced by a copy of the high half.
//   5. The 32-bit sums are packed to 16 bits and shifted right by c7 (7).
// Uses the locals filth, filtl, filter_inner, filter_outerh,
// filter_outerl, outer, c64, c7, w and is6tap of the enclosing function.
#define FILTER_H(dstv, off) \
GET_PIXHL(off); \
filth = vec_msum(filter_inner, pixh, c64); \
filtl = vec_msum(filter_inner, pixl, c64); \
\
if (is6tap) { \
GET_OUTER(off); \
filth = vec_msum(filter_outerh, outer, filth); \
filtl = vec_msum(filter_outerl, outer, filtl); \
} \
if (w == 4) \
filtl = filth; /* discard pixels 4-7 */ \
dstv = vec_packs(filth, filtl); \
dstv = vec_sra(dstv, c7)
/**
 * Horizontal VP8 subpel filter shared by all the _h wrappers.
 *
 * Filters h rows of w pixels from src into dst. The wrappers pass w and
 * is6tap as compile-time constants (see EPEL_FUNCS).
 *
 * @param dst        destination; written with vec_st when w == 16 and
 *                   with 4-byte vec_ste stores otherwise
 * @param dst_stride byte distance between destination rows
 * @param src        source pixels; reads start is6tap + 1 bytes to the left
 * @param src_stride byte distance between source rows
 * @param h          number of rows
 * @param mx         horizontal fractional position; mx-1 selects the filter
 * @param w          block width: 4, 8 or 16
 * @param is6tap     nonzero to add the two outer taps to the 4 inner ones
 */
static av_always_inline
void put_vp8_epel_h_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
uint8_t *src, ptrdiff_t src_stride,
int h, int mx, int w, int is6tap)
{
LOAD_H_SUBPEL_FILTER(mx-1);
#if HAVE_BIGENDIAN
vec_u8 align_vec0, align_vec8, permh0, permh8;
vec_u8 perm_6tap0, perm_6tap8, perml0, perml8;
vec_u8 b;
#endif
vec_u8 filt, a, pixh, pixl, outer;
vec_s16 f16h, f16l;
vec_s32 filth, filtl;
// Gather patterns for the inner taps of output pixels 0-3: four
// consecutive source bytes per pixel. The 6-tap pattern starts at 1
// because the 6-tap load begins one byte further left.
vec_u8 perm_inner6 = { 1,2,3,4, 2,3,4,5, 3,4,5,6, 4,5,6,7 };
vec_u8 perm_inner4 = { 0,1,2,3, 1,2,3,4, 2,3,4,5, 3,4,5,6 };
vec_u8 perm_inner = is6tap ? perm_inner6 : perm_inner4;
// Gather pattern for the outer taps. In each group of four bytes the
// last two (k, k+5) line up with the nonzero entries of filter_outerh
// and the first two (k+4, k+9) with those of filter_outerl.
vec_u8 perm_outer = { 4,9, 0,5, 5,10, 1,6, 6,11, 2,7, 7,12, 3,8 };
// rounding constant 64 (1 << 6) and the matching shift of 7
vec_s32 c64 = vec_sl(vec_splat_s32(1), vec_splat_u32(6));
vec_u16 c7 = vec_splat_u16(7);
#if HAVE_BIGENDIAN
// Fold the alignment permutes for column offsets 0 and 8 into the gather
// patterns, giving the permh*/perml*/perm_6tap* vectors used by
// GET_PIXHL and GET_OUTER.
align_vec0 = vec_lvsl( -is6tap-1, src);
align_vec8 = vec_lvsl(8-is6tap-1, src);
permh0 = vec_perm(align_vec0, align_vec0, perm_inner);
permh8 = vec_perm(align_vec8, align_vec8, perm_inner);
perm_inner = vec_add(perm_inner, vec_splat_u8(4));
perml0 = vec_perm(align_vec0, align_vec0, perm_inner);
perml8 = vec_perm(align_vec8, align_vec8, perm_inner);
perm_6tap0 = vec_perm(align_vec0, align_vec0, perm_outer);
perm_6tap8 = vec_perm(align_vec8, align_vec8, perm_outer);
#endif
while (h --> 0) {
// columns 0-7 of the current row
FILTER_H(f16h, 0);
if (w == 16) {
// columns 8-15, then one full 16-byte store
FILTER_H(f16l, 8);
filt = vec_packsu(f16h, f16l);
vec_st(filt, 0, dst);
} else {
// w == 4 or 8: the packed bytes are duplicated in both halves and
// written as one or two 32-bit elements
filt = vec_packsu(f16h, f16h);
vec_ste((vec_u32)filt, 0, (uint32_t*)dst);
if (w == 8)
vec_ste((vec_u32)filt, 4, (uint32_t*)dst);
}
src += src_stride;
dst += dst_stride;
}
}
// v subpel filter does a simple vertical multiply + add
// One row per fractional position, indexed by my-1 through
// LOAD_V_SUBPEL_FILTER. Only the first six bytes of each vector are given
// (the rest are zero-initialized) and only those six are splatted.
// The taps are stored as magnitudes; FILTER_V subtracts the products of
// taps 1 and 4 and adds the others.
static const vec_u8 v_subpel_filters[7] =
{
{ 0, 6, 123, 12, 1, 0 },
{ 2, 11, 108, 36, 8, 1 },
{ 0, 9, 93, 50, 6, 0 },
{ 3, 16, 77, 77, 16, 3 },
{ 0, 6, 50, 93, 9, 0 },
{ 1, 8, 36, 108, 11, 2 },
{ 0, 1, 12, 123, 6, 0 },
};
// Declares f0..f5: each is one of the six taps of v_subpel_filters[i]
// replicated across all sixteen bytes of a vector, ready for FILTER_V.
#define LOAD_V_SUBPEL_FILTER(i) \
vec_u8 subpel_filter = v_subpel_filters[i]; \
vec_u8 f0 = vec_splat(subpel_filter, 0); \
vec_u8 f1 = vec_splat(subpel_filter, 1); \
vec_u8 f2 = vec_splat(subpel_filter, 2); \
vec_u8 f3 = vec_splat(subpel_filter, 3); \
vec_u8 f4 = vec_splat(subpel_filter, 4); \
vec_u8 f5 = vec_splat(subpel_filter, 5)
// Vertical filter for eight pixels at once; the result goes to dstv.
// vec_mul is the widening byte multiply to use: the caller passes
// vec_mule for one half of the interleaved input rows and vec_mulo for
// the other. Computes, with saturating 16-bit arithmetic,
//   (s2*f2 - s1*f1) + (s3*f3 - s4*f4) [+ s0*f0 + s5*f5 if is6tap]
// then adds c64 and shifts right by c7.
// Uses the locals s0..s5, s0f..s5f, f0..f5, c64, c7 and is6tap of the
// enclosing function.
#define FILTER_V(dstv, vec_mul) \
s1f = (vec_s16)vec_mul(s1, f1); \
s2f = (vec_s16)vec_mul(s2, f2); \
s3f = (vec_s16)vec_mul(s3, f3); \
s4f = (vec_s16)vec_mul(s4, f4); \
s2f = vec_subs(s2f, s1f); \
s3f = vec_subs(s3f, s4f); \
if (is6tap) { \
s0f = (vec_s16)vec_mul(s0, f0); \
s5f = (vec_s16)vec_mul(s5, f5); \
s2f = vec_adds(s2f, s0f); \
s3f = vec_adds(s3f, s5f); \
} \
dstv = vec_adds(s2f, s3f); \
dstv = vec_adds(dstv, c64); \
dstv = vec_sra(dstv, c7)
// LOAD_HL(off, s, perm): load one source row at byte offset `off` from
// `s` with its two 8-byte halves interleaved byte by byte.
// Big endian: delegates to load_with_perm_vec, where `perm` carries both
// the alignment and the interleave.
// Little endian: interleaves two unaligned loads taken 8 bytes apart;
// `perm` is not used.
#if HAVE_BIGENDIAN
#define LOAD_HL(off, s, perm) load_with_perm_vec(off, s, perm)
#else
#define LOAD_HL(off, s, perm) vec_mergeh(vec_vsx_ld(off,s), vec_vsx_ld(off+8,s))
#endif
/**
 * Vertical VP8 subpel filter shared by all the _v wrappers.
 *
 * Keeps a sliding window of source rows (s0..s5 for 6-tap, s1..s4 for
 * 4-tap), loads one new row per output row and shifts the window down.
 * The wrappers pass w and is6tap as compile-time constants (see
 * EPEL_FUNCS).
 *
 * @param dst        destination; written with vec_st when w == 16 and
 *                   with 4-byte vec_ste stores otherwise
 * @param dst_stride byte distance between destination rows
 * @param src        source pixels; rows from -1 (4-tap) or -2 (6-tap)
 *                   relative to src are read
 * @param src_stride byte distance between source rows
 * @param h          number of rows
 * @param my         vertical fractional position; my-1 selects the filter
 * @param w          block width: 4, 8 or 16
 * @param is6tap     nonzero to include the two outer taps
 */
static av_always_inline
void put_vp8_epel_v_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
uint8_t *src, ptrdiff_t src_stride,
int h, int my, int w, int is6tap)
{
LOAD_V_SUBPEL_FILTER(my-1);
vec_u8 s0, s1, s2, s3, s4, s5, filt, align_vech, perm_vec, align_vecl;
vec_s16 s0f, s1f, s2f, s3f, s4f, s5f, f16h, f16l;
// rounding constant 64 (1 << 6) and the matching shift of 7
vec_s16 c64 = vec_sl(vec_splat_s16(1), vec_splat_u16(6));
vec_u16 c7 = vec_splat_u16(7);
#if HAVE_BIGENDIAN
// we want pixels 0-7 to be in the even positions and 8-15 in the odd,
// so combine this permute with the alignment permute vector
align_vech = vec_lvsl(0, src);
align_vecl = vec_sld(align_vech, align_vech, 8);
if (w ==16)
perm_vec = vec_mergeh(align_vech, align_vecl);
else
perm_vec = vec_mergeh(align_vech, align_vech);
#endif
// Prime the window with the rows above and below the first output row;
// s0 is only needed (and only loaded) for the 6-tap filter.
if (is6tap)
s0 = LOAD_HL(-2*src_stride, src, perm_vec);
s1 = LOAD_HL(-1*src_stride, src, perm_vec);
s2 = LOAD_HL( 0*src_stride, src, perm_vec);
s3 = LOAD_HL( 1*src_stride, src, perm_vec);
if (is6tap)
s4 = LOAD_HL( 2*src_stride, src, perm_vec);
// src now points at the next row that has not been loaded yet
src += (2+is6tap)*src_stride;
while (h --> 0) {
// fetch the newest (bottom) row of the window
if (is6tap)
s5 = LOAD_HL(0, src, perm_vec);
else
s4 = LOAD_HL(0, src, perm_vec);
// vec_mule covers one half of the interleaved bytes ...
FILTER_V(f16h, vec_mule);
if (w == 16) {
// ... and vec_mulo the other half when the block is 16 wide
FILTER_V(f16l, vec_mulo);
filt = vec_packsu(f16h, f16l);
vec_st(filt, 0, dst);
} else {
filt = vec_packsu(f16h, f16h);
// w == 4: replicate the first 32-bit element so the single store below
// writes it; w == 8: additionally store the second 4 bytes
if (w == 4)
filt = (vec_u8)vec_splat((vec_u32)filt, 0);
else
vec_ste((vec_u32)filt, 4, (uint32_t*)dst);
vec_ste((vec_u32)filt, 0, (uint32_t*)dst);
}
// slide the window down by one row
if (is6tap)
s0 = s1;
s1 = s2;
s2 = s3;
s3 = s4;
if (is6tap)
s4 = s5;
dst += dst_stride;
src += src_stride;
}
}
// Generates the two single-direction entry points for one block width and
// tap count:
//   put_vp8_epel<WIDTH>_h<TAPS>_altivec - horizontal only (my is ignored)
//   put_vp8_epel<WIDTH>_v<TAPS>_altivec - vertical only (mx is ignored)
// Each forwards to the matching core with WIDTH and TAPS == 6 as constant
// arguments.
#define EPEL_FUNCS(WIDTH, TAPS) \
static av_noinline \
void put_vp8_epel ## WIDTH ## _h ## TAPS ## _altivec(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my) \
{ \
put_vp8_epel_h_altivec_core(dst, dst_stride, src, src_stride, h, mx, WIDTH, TAPS == 6); \
} \
\
static av_noinline \
void put_vp8_epel ## WIDTH ## _v ## TAPS ## _altivec(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my) \
{ \
put_vp8_epel_v_altivec_core(dst, dst_stride, src, src_stride, h, my, WIDTH, TAPS == 6); \
}
// Generates put_vp8_epel<WIDTH>_h<HTAPS>v<VTAPS>_altivec, the two-pass
// (horizontal then vertical) filter. The horizontal pass writes into a
// 16-byte aligned scratch buffer with a fixed row pitch of 16 bytes,
// starting 2 rows (6-tap vertical) or 1 row (4-tap vertical) above the
// block; the vertical pass then reads the scratch buffer from the row
// that corresponds to the block's first row.
// NOTE(review): the vertical 4-tap core appears to read only h+3 scratch
// rows, so the h+4 rows produced for that case look like one more than
// needed -- confirm before changing.
#define EPEL_HV(WIDTH, HTAPS, VTAPS) \
static void put_vp8_epel ## WIDTH ## _h ## HTAPS ## v ## VTAPS ## _altivec(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my) \
{ \
DECLARE_ALIGNED(16, uint8_t, tmp)[(2*WIDTH+5)*16]; \
if (VTAPS == 6) { \
put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16, src-2*sstride, sstride, h+5, mx, my); \
put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, dstride, tmp+2*16, 16, h, mx, my); \
} else { \
put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16, src-sstride, sstride, h+4, mx, my); \
put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, dstride, tmp+16, 16, h, mx, my); \
} \
}
// Instantiate the single-direction filters. For 16-pixel-wide blocks only
// the 6-tap variants are generated; widths 8 and 4 get both 6- and 4-tap.
EPEL_FUNCS(16,6)
EPEL_FUNCS(8, 6)
EPEL_FUNCS(8, 4)
EPEL_FUNCS(4, 6)
EPEL_FUNCS(4, 4)
// Instantiate the combined horizontal+vertical filters built on top of
// the functions above (arguments: width, horizontal taps, vertical taps).
EPEL_HV(16, 6,6)
EPEL_HV(8, 6,6)
EPEL_HV(8, 4,6)
EPEL_HV(8, 6,4)
EPEL_HV(8, 4,4)
EPEL_HV(4, 6,6)
EPEL_HV(4, 4,6)
EPEL_HV(4, 6,4)
EPEL_HV(4, 4,4)
/**
 * Copy a block of 16-pixel-wide rows from src to dst (no filtering).
 *
 * Rows are copied four at a time, so the loop covers h rounded up to a
 * multiple of 4.
 *
 * NOTE(review): each row is written with a single vec_st, so dst is
 * presumably 16-byte aligned -- confirm against the callers.
 *
 * @param dst     destination block
 * @param dstride byte distance between destination rows
 * @param src     source block
 * @param sstride byte distance between source rows
 * @param h       number of rows
 * @param mx      unused
 * @param my      unused
 */
static void put_vp8_pixels16_altivec(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my)
{
    register vector unsigned char perm;
    int i;
    /* Row offsets are formed by multiplication instead of '<<': strides
     * are signed, and left-shifting a negative value is undefined. */
    register ptrdiff_t dstride2 = dstride * 2, sstride2 = sstride * 2;
    register ptrdiff_t dstride3 = dstride2 + dstride, sstride3 = sstride + sstride2;
    register ptrdiff_t dstride4 = dstride * 4, sstride4 = sstride * 4;

#if HAVE_BIGENDIAN
    /* alignment permute for src; only set up on big-endian builds */
    perm = vec_lvsl(0, src);
#endif
    // hand-unrolling the loop by 4 gains about 15%
    // minimum execution time goes from 74 to 60 cycles
    // it's faster than -funroll-loops, but using
    // -funroll-loops w/ this is bad - 74 cycles again.
    // all this is on a 7450, tuning for the 7450
    for (i = 0; i < h; i += 4) {
        vec_st(load_with_perm_vec(0, src, perm), 0, dst);
        vec_st(load_with_perm_vec(sstride, src, perm), dstride, dst);
        vec_st(load_with_perm_vec(sstride2, src, perm), dstride2, dst);
        vec_st(load_with_perm_vec(sstride3, src, perm), dstride3, dst);
        src += sstride4;
        dst += dstride4;
    }
}
#endif /* HAVE_ALTIVEC */
/**
 * Install the AltiVec motion-compensation functions into a VP8DSPContext.
 *
 * Nothing is changed unless the build has AltiVec support and the running
 * CPU reports it.
 *
 * As the assignments below show, put_vp8_epel_pixels_tab is indexed as
 * [size][vertical][horizontal]:
 *   size:                   0 = 16 wide, 1 = 8 wide, 2 = 4 wide
 *   vertical / horizontal:  0 = no filtering, 1 = 4-tap, 2 = 6-tap
 * Entries that are not assigned here keep whatever they held before.
 *
 * @param c context whose table entries may be overridden
 */
av_cold void ff_vp78dsp_init_ppc(VP8DSPContext *c)
{
#if HAVE_ALTIVEC
    const int cpu_flags = av_get_cpu_flags();

    if (!PPC_ALTIVEC(cpu_flags))
        return;

    /* 16 pixels wide: plain copy and the 6-tap filters only */
    c->put_vp8_epel_pixels_tab[0][0][0] = put_vp8_pixels16_altivec;
    c->put_vp8_epel_pixels_tab[0][0][2] = put_vp8_epel16_h6_altivec;
    c->put_vp8_epel_pixels_tab[0][2][0] = put_vp8_epel16_v6_altivec;
    c->put_vp8_epel_pixels_tab[0][2][2] = put_vp8_epel16_h6v6_altivec;

    /* 8 pixels wide */
    c->put_vp8_epel_pixels_tab[1][0][1] = put_vp8_epel8_h4_altivec;
    c->put_vp8_epel_pixels_tab[1][0][2] = put_vp8_epel8_h6_altivec;
    c->put_vp8_epel_pixels_tab[1][1][0] = put_vp8_epel8_v4_altivec;
    c->put_vp8_epel_pixels_tab[1][1][1] = put_vp8_epel8_h4v4_altivec;
    c->put_vp8_epel_pixels_tab[1][1][2] = put_vp8_epel8_h6v4_altivec;
    c->put_vp8_epel_pixels_tab[1][2][0] = put_vp8_epel8_v6_altivec;
    c->put_vp8_epel_pixels_tab[1][2][1] = put_vp8_epel8_h4v6_altivec;
    c->put_vp8_epel_pixels_tab[1][2][2] = put_vp8_epel8_h6v6_altivec;

    /* 4 pixels wide */
    c->put_vp8_epel_pixels_tab[2][0][1] = put_vp8_epel4_h4_altivec;
    c->put_vp8_epel_pixels_tab[2][0][2] = put_vp8_epel4_h6_altivec;
    c->put_vp8_epel_pixels_tab[2][1][0] = put_vp8_epel4_v4_altivec;
    c->put_vp8_epel_pixels_tab[2][1][1] = put_vp8_epel4_h4v4_altivec;
    c->put_vp8_epel_pixels_tab[2][1][2] = put_vp8_epel4_h6v4_altivec;
    c->put_vp8_epel_pixels_tab[2][2][0] = put_vp8_epel4_v6_altivec;
    c->put_vp8_epel_pixels_tab[2][2][1] = put_vp8_epel4_h4v6_altivec;
    c->put_vp8_epel_pixels_tab[2][2][2] = put_vp8_epel4_h6v6_altivec;
#endif /* HAVE_ALTIVEC */
}