ffmpeg-2.1.1: move directory

git-svn-id: svn://kolibrios.org@6148 a494cfbc-eb01-0410-851d-a64ba20cac60
This commit is contained in:
Sergey Semyonov (Serge)
2016-02-05 22:14:10 +00:00
parent a4b787f4b8
commit ecf3e862ea
4011 changed files with 1868 additions and 4 deletions

View File

@@ -0,0 +1,104 @@
OBJS += x86/constants.o \
x86/fmtconvert_init.o \
OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp_init.o
OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp_init.o
OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o
OBJS-$(CONFIG_DCT) += x86/dct_init.o
OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o
OBJS-$(CONFIG_DSPUTIL) += x86/dsputil_init.o \
x86/dsputil_x86.o
OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_mmx.o \
x86/fdct.o \
x86/motion_est.o
OBJS-$(CONFIG_FFT) += x86/fft_init.o
OBJS-$(CONFIG_H264CHROMA) += x86/h264chroma_init.o
OBJS-$(CONFIG_H264DSP) += x86/h264dsp_init.o
OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred_init.o
OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel.o
OBJS-$(CONFIG_HPELDSP) += x86/hpeldsp_init.o
OBJS-$(CONFIG_LPC) += x86/lpc.o
OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o
OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodsp.o
OBJS-$(CONFIG_MPEGVIDEO) += x86/mpegvideo.o
OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoenc.o
OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp_init.o
OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp_init.o
OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp_init.o
OBJS-$(CONFIG_RV30_DECODER) += x86/rv34dsp_init.o
OBJS-$(CONFIG_RV40_DECODER) += x86/rv34dsp_init.o \
x86/rv40dsp_init.o
OBJS-$(CONFIG_V210_DECODER) += x86/v210-init.o
OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o
OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_init.o
OBJS-$(CONFIG_VIDEODSP) += x86/videodsp_init.o
OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp_init.o
OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp_init.o
OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp_init.o
OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp_init.o
OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp_init.o
OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp_init.o
OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
MMX-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil_mmx.o \
x86/fpel_mmx.o \
x86/idct_mmx_xvid.o \
x86/idct_sse2_xvid.o \
x86/rnd_mmx.o \
x86/simple_idct.o
MMX-OBJS-$(CONFIG_DIRAC_DECODER) += x86/dirac_dwt.o
MMX-OBJS-$(CONFIG_HPELDSP) += x86/fpel_mmx.o \
x86/hpeldsp_mmx.o \
x86/rnd_mmx.o
MMX-OBJS-$(CONFIG_SNOW_DECODER) += x86/snowdsp.o
MMX-OBJS-$(CONFIG_SNOW_ENCODER) += x86/snowdsp.o
MMX-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_mmx.o
YASM-OBJS += x86/deinterlace.o \
x86/fmtconvert.o \
YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o
YASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o
YASM-OBJS-$(CONFIG_DCT) += x86/dct32.o
YASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_mmx.o x86/diracdsp_yasm.o\
x86/dwt_yasm.o
YASM-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil.o \
x86/fpel.o \
x86/mpeg4qpel.o \
x86/qpel.o
YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc.o
YASM-OBJS-$(CONFIG_FFT) += x86/fft.o
YASM-OBJS-$(CONFIG_H263_DECODER) += x86/h263_loopfilter.o
YASM-OBJS-$(CONFIG_H263_ENCODER) += x86/h263_loopfilter.o
YASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \
x86/h264_chromamc_10bit.o
YASM-OBJS-$(CONFIG_H264DSP) += x86/h264_deblock.o \
x86/h264_deblock_10bit.o \
x86/h264_idct.o \
x86/h264_idct_10bit.o \
x86/h264_weight.o \
x86/h264_weight_10bit.o
YASM-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred.o \
x86/h264_intrapred_10bit.o
YASM-OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel_8bit.o \
x86/h264_qpel_10bit.o \
x86/fpel.o \
x86/qpel.o
YASM-OBJS-$(CONFIG_HPELDSP) += x86/fpel.o \
x86/hpeldsp.o
YASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o
YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
YASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
YASM-OBJS-$(CONFIG_RV30_DECODER) += x86/rv34dsp.o
YASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv34dsp.o \
x86/rv40dsp.o
YASM-OBJS-$(CONFIG_V210_DECODER) += x86/v210.o
YASM-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp.o
YASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o
YASM-OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp.o
YASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o
YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp.o
YASM-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp.o
YASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp.o
YASM-OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp.o

View File

@@ -0,0 +1,421 @@
;*****************************************************************************
;* x86-optimized AC-3 DSP utils
;* Copyright (c) 2011 Justin Ruggles
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
; 16777216.0f - used in ff_float_to_fixed24()
pf_1_24: times 4 dd 0x4B800000
; used in ff_ac3_compute_mantissa_size()
cextern ac3_bap_bits
pw_bap_mul1: dw 21846, 21846, 0, 32768, 21846, 21846, 0, 32768
pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7
; used in ff_ac3_extract_exponents()
pd_1: times 4 dd 1
pd_151: times 4 dd 151
SECTION .text
;-----------------------------------------------------------------------------
; void ff_ac3_exponent_min(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
;-----------------------------------------------------------------------------
%macro AC3_EXPONENT_MIN 0
cglobal ac3_exponent_min, 3, 4, 2, exp, reuse_blks, expn, offset
shl reuse_blksq, 8
jz .end
LOOP_ALIGN
.nextexp:
mov offsetq, reuse_blksq
mova m0, [expq+offsetq]
sub offsetq, 256
LOOP_ALIGN
.nextblk:
PMINUB m0, [expq+offsetq], m1
sub offsetq, 256
jae .nextblk
mova [expq], m0
add expq, mmsize
sub expnq, mmsize
jg .nextexp
.end:
REP_RET
%endmacro
%define LOOP_ALIGN
INIT_MMX mmx
AC3_EXPONENT_MIN
%if HAVE_MMXEXT_EXTERNAL
%define LOOP_ALIGN ALIGN 16
INIT_MMX mmxext
AC3_EXPONENT_MIN
%endif
%if HAVE_SSE2_EXTERNAL
INIT_XMM sse2
AC3_EXPONENT_MIN
%endif
%undef LOOP_ALIGN
;-----------------------------------------------------------------------------
; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len)
;
; This function uses 2 different methods to calculate a valid result.
; 1) logical 'or' of abs of each element
; This is used for ssse3 because of the pabsw instruction.
; It is also used for mmx because of the lack of min/max instructions.
; 2) calculate min/max for the array, then or(abs(min),abs(max))
; This is used for mmxext and sse2 because they have pminsw/pmaxsw.
;-----------------------------------------------------------------------------
; logical 'or' of 4 or 8 words in an mmx or xmm register into the low word
%macro OR_WORDS_HORIZ 2 ; src, tmp
%if cpuflag(sse2)
movhlps %2, %1
por %1, %2
pshuflw %2, %1, q0032
por %1, %2
pshuflw %2, %1, q0001
por %1, %2
%elif cpuflag(mmxext)
pshufw %2, %1, q0032
por %1, %2
pshufw %2, %1, q0001
por %1, %2
%else ; mmx
movq %2, %1
psrlq %2, 32
por %1, %2
movq %2, %1
psrlq %2, 16
por %1, %2
%endif
%endmacro
%macro AC3_MAX_MSB_ABS_INT16 1
cglobal ac3_max_msb_abs_int16, 2,2,5, src, len
pxor m2, m2
pxor m3, m3
.loop:
%ifidn %1, min_max
mova m0, [srcq]
mova m1, [srcq+mmsize]
pminsw m2, m0
pminsw m2, m1
pmaxsw m3, m0
pmaxsw m3, m1
%else ; or_abs
%if notcpuflag(ssse3)
mova m0, [srcq]
mova m1, [srcq+mmsize]
ABS2 m0, m1, m3, m4
%else ; ssse3
; using memory args is faster for ssse3
pabsw m0, [srcq]
pabsw m1, [srcq+mmsize]
%endif
por m2, m0
por m2, m1
%endif
add srcq, mmsize*2
sub lend, mmsize
ja .loop
%ifidn %1, min_max
ABS2 m2, m3, m0, m1
por m2, m3
%endif
OR_WORDS_HORIZ m2, m0
movd eax, m2
and eax, 0xFFFF
RET
%endmacro
INIT_MMX mmx
AC3_MAX_MSB_ABS_INT16 or_abs
INIT_MMX mmxext
AC3_MAX_MSB_ABS_INT16 min_max
INIT_XMM sse2
AC3_MAX_MSB_ABS_INT16 min_max
INIT_XMM ssse3
AC3_MAX_MSB_ABS_INT16 or_abs
;-----------------------------------------------------------------------------
; macro used for ff_ac3_lshift_int16() and ff_ac3_rshift_int32()
;-----------------------------------------------------------------------------
%macro AC3_SHIFT 3 ; l/r, 16/32, shift instruction, instruction set
cglobal ac3_%1shift_int%2, 3, 3, 5, src, len, shift
movd m0, shiftd
.loop:
mova m1, [srcq ]
mova m2, [srcq+mmsize ]
mova m3, [srcq+mmsize*2]
mova m4, [srcq+mmsize*3]
%3 m1, m0
%3 m2, m0
%3 m3, m0
%3 m4, m0
mova [srcq ], m1
mova [srcq+mmsize ], m2
mova [srcq+mmsize*2], m3
mova [srcq+mmsize*3], m4
add srcq, mmsize*4
sub lend, mmsize*32/%2
ja .loop
.end:
REP_RET
%endmacro
;-----------------------------------------------------------------------------
; void ff_ac3_lshift_int16(int16_t *src, unsigned int len, unsigned int shift)
;-----------------------------------------------------------------------------
INIT_MMX mmx
AC3_SHIFT l, 16, psllw
INIT_XMM sse2
AC3_SHIFT l, 16, psllw
;-----------------------------------------------------------------------------
; void ff_ac3_rshift_int32(int32_t *src, unsigned int len, unsigned int shift)
;-----------------------------------------------------------------------------
INIT_MMX mmx
AC3_SHIFT r, 32, psrad
INIT_XMM sse2
AC3_SHIFT r, 32, psrad
;-----------------------------------------------------------------------------
; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
;-----------------------------------------------------------------------------
; The 3DNow! version is not bit-identical because pf2id uses truncation rather
; than round-to-nearest.
INIT_MMX 3dnow
cglobal float_to_fixed24, 3, 3, 0, dst, src, len
movq m0, [pf_1_24]
.loop:
movq m1, [srcq ]
movq m2, [srcq+8 ]
movq m3, [srcq+16]
movq m4, [srcq+24]
pfmul m1, m0
pfmul m2, m0
pfmul m3, m0
pfmul m4, m0
pf2id m1, m1
pf2id m2, m2
pf2id m3, m3
pf2id m4, m4
movq [dstq ], m1
movq [dstq+8 ], m2
movq [dstq+16], m3
movq [dstq+24], m4
add srcq, 32
add dstq, 32
sub lend, 8
ja .loop
femms
RET
INIT_XMM sse
cglobal float_to_fixed24, 3, 3, 3, dst, src, len
movaps m0, [pf_1_24]
.loop:
movaps m1, [srcq ]
movaps m2, [srcq+16]
mulps m1, m0
mulps m2, m0
cvtps2pi mm0, m1
movhlps m1, m1
cvtps2pi mm1, m1
cvtps2pi mm2, m2
movhlps m2, m2
cvtps2pi mm3, m2
movq [dstq ], mm0
movq [dstq+ 8], mm1
movq [dstq+16], mm2
movq [dstq+24], mm3
add srcq, 32
add dstq, 32
sub lend, 8
ja .loop
emms
RET
INIT_XMM sse2
cglobal float_to_fixed24, 3, 3, 9, dst, src, len
movaps m0, [pf_1_24]
.loop:
movaps m1, [srcq ]
movaps m2, [srcq+16 ]
movaps m3, [srcq+32 ]
movaps m4, [srcq+48 ]
%ifdef m8
movaps m5, [srcq+64 ]
movaps m6, [srcq+80 ]
movaps m7, [srcq+96 ]
movaps m8, [srcq+112]
%endif
mulps m1, m0
mulps m2, m0
mulps m3, m0
mulps m4, m0
%ifdef m8
mulps m5, m0
mulps m6, m0
mulps m7, m0
mulps m8, m0
%endif
cvtps2dq m1, m1
cvtps2dq m2, m2
cvtps2dq m3, m3
cvtps2dq m4, m4
%ifdef m8
cvtps2dq m5, m5
cvtps2dq m6, m6
cvtps2dq m7, m7
cvtps2dq m8, m8
%endif
movdqa [dstq ], m1
movdqa [dstq+16 ], m2
movdqa [dstq+32 ], m3
movdqa [dstq+48 ], m4
%ifdef m8
movdqa [dstq+64 ], m5
movdqa [dstq+80 ], m6
movdqa [dstq+96 ], m7
movdqa [dstq+112], m8
add srcq, 128
add dstq, 128
sub lenq, 32
%else
add srcq, 64
add dstq, 64
sub lenq, 16
%endif
ja .loop
REP_RET
;------------------------------------------------------------------------------
; int ff_ac3_compute_mantissa_size(uint16_t mant_cnt[6][16])
;------------------------------------------------------------------------------
%macro PHADDD4 2 ; xmm src, xmm tmp
movhlps %2, %1
paddd %1, %2
pshufd %2, %1, 0x1
paddd %1, %2
%endmacro
INIT_XMM sse2
cglobal ac3_compute_mantissa_size, 1, 2, 4, mant_cnt, sum
movdqa m0, [mant_cntq ]
movdqa m1, [mant_cntq+ 1*16]
paddw m0, [mant_cntq+ 2*16]
paddw m1, [mant_cntq+ 3*16]
paddw m0, [mant_cntq+ 4*16]
paddw m1, [mant_cntq+ 5*16]
paddw m0, [mant_cntq+ 6*16]
paddw m1, [mant_cntq+ 7*16]
paddw m0, [mant_cntq+ 8*16]
paddw m1, [mant_cntq+ 9*16]
paddw m0, [mant_cntq+10*16]
paddw m1, [mant_cntq+11*16]
pmaddwd m0, [ac3_bap_bits ]
pmaddwd m1, [ac3_bap_bits+16]
paddd m0, m1
PHADDD4 m0, m1
movd sumd, m0
movdqa m3, [pw_bap_mul1]
movhpd m0, [mant_cntq +2]
movlpd m0, [mant_cntq+1*32+2]
movhpd m1, [mant_cntq+2*32+2]
movlpd m1, [mant_cntq+3*32+2]
movhpd m2, [mant_cntq+4*32+2]
movlpd m2, [mant_cntq+5*32+2]
pmulhuw m0, m3
pmulhuw m1, m3
pmulhuw m2, m3
paddusw m0, m1
paddusw m0, m2
pmaddwd m0, [pw_bap_mul2]
PHADDD4 m0, m1
movd eax, m0
add eax, sumd
RET
;------------------------------------------------------------------------------
; void ff_ac3_extract_exponents(uint8_t *exp, int32_t *coef, int nb_coefs)
;------------------------------------------------------------------------------
%macro PABSD 1-2 ; src/dst, unused
%if cpuflag(ssse3)
pabsd %1, %1
%else ; src/dst, tmp
pxor %2, %2
pcmpgtd %2, %1
pxor %1, %2
psubd %1, %2
%endif
%endmacro
%macro AC3_EXTRACT_EXPONENTS 0
cglobal ac3_extract_exponents, 3, 3, 4, exp, coef, len
add expq, lenq
lea coefq, [coefq+4*lenq]
neg lenq
mova m2, [pd_1]
mova m3, [pd_151]
.loop:
; move 4 32-bit coefs to xmm0
mova m0, [coefq+4*lenq]
; absolute value
PABSD m0, m1
; convert to float and extract exponents
pslld m0, 1
por m0, m2
cvtdq2ps m1, m0
psrld m1, 23
mova m0, m3
psubd m0, m1
; move the lowest byte in each of 4 dwords to the low dword
; NOTE: We cannot just extract the low bytes with pshufb because the dword
; result for 16777215 is -1 due to float inaccuracy. Using packuswb
; clips this to 0, which is the correct exponent.
packssdw m0, m0
packuswb m0, m0
movd [expq+lenq], m0
add lenq, 4
jl .loop
REP_RET
%endmacro
%if HAVE_SSE2_EXTERNAL
INIT_XMM sse2
AC3_EXTRACT_EXPONENTS
%endif
%if HAVE_SSSE3_EXTERNAL
INIT_XMM ssse3
AC3_EXTRACT_EXPONENTS
%endif

View File

@@ -0,0 +1,231 @@
/*
* x86-optimized AC-3 DSP utils
* Copyright (c) 2011 Justin Ruggles
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "dsputil_x86.h"
#include "libavcodec/ac3.h"
#include "libavcodec/ac3dsp.h"
void ff_ac3_exponent_min_mmx (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
void ff_ac3_exponent_min_sse2 (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
int ff_ac3_max_msb_abs_int16_mmx (const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_mmxext(const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_sse2 (const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_ssse3(const int16_t *src, int len);
void ff_ac3_lshift_int16_mmx (int16_t *src, unsigned int len, unsigned int shift);
void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int len, unsigned int shift);
void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift);
void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift);
void ff_float_to_fixed24_3dnow(int32_t *dst, const float *src, unsigned int len);
void ff_float_to_fixed24_sse (int32_t *dst, const float *src, unsigned int len);
void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len);
int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]);
void ff_ac3_extract_exponents_3dnow(uint8_t *exp, int32_t *coef, int nb_coefs);
void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs);
void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs);
#if ARCH_X86_32 && defined(__INTEL_COMPILER)
# undef HAVE_7REGS
# define HAVE_7REGS 0
#endif
#if HAVE_SSE_INLINE && HAVE_7REGS
#define IF1(x) x
#define IF0(x)
#define MIX5(mono, stereo) \
__asm__ volatile ( \
"movss 0(%1), %%xmm5 \n" \
"movss 8(%1), %%xmm6 \n" \
"movss 24(%1), %%xmm7 \n" \
"shufps $0, %%xmm5, %%xmm5 \n" \
"shufps $0, %%xmm6, %%xmm6 \n" \
"shufps $0, %%xmm7, %%xmm7 \n" \
"1: \n" \
"movaps (%0, %2), %%xmm0 \n" \
"movaps (%0, %3), %%xmm1 \n" \
"movaps (%0, %4), %%xmm2 \n" \
"movaps (%0, %5), %%xmm3 \n" \
"movaps (%0, %6), %%xmm4 \n" \
"mulps %%xmm5, %%xmm0 \n" \
"mulps %%xmm6, %%xmm1 \n" \
"mulps %%xmm5, %%xmm2 \n" \
"mulps %%xmm7, %%xmm3 \n" \
"mulps %%xmm7, %%xmm4 \n" \
stereo("addps %%xmm1, %%xmm0 \n") \
"addps %%xmm1, %%xmm2 \n" \
"addps %%xmm3, %%xmm0 \n" \
"addps %%xmm4, %%xmm2 \n" \
mono("addps %%xmm2, %%xmm0 \n") \
"movaps %%xmm0, (%0, %2) \n" \
stereo("movaps %%xmm2, (%0, %3) \n") \
"add $16, %0 \n" \
"jl 1b \n" \
: "+&r"(i) \
: "r"(matrix), \
"r"(samples[0] + len), \
"r"(samples[1] + len), \
"r"(samples[2] + len), \
"r"(samples[3] + len), \
"r"(samples[4] + len) \
: XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
"%xmm4", "%xmm5", "%xmm6", "%xmm7",) \
"memory" \
);
#define MIX_MISC(stereo) \
__asm__ volatile ( \
"mov %5, %2 \n" \
"1: \n" \
"mov -%c7(%6, %2, %c8), %3 \n" \
"movaps (%3, %0), %%xmm0 \n" \
stereo("movaps %%xmm0, %%xmm1 \n") \
"mulps %%xmm4, %%xmm0 \n" \
stereo("mulps %%xmm5, %%xmm1 \n") \
"2: \n" \
"mov (%6, %2, %c8), %1 \n" \
"movaps (%1, %0), %%xmm2 \n" \
stereo("movaps %%xmm2, %%xmm3 \n") \
"mulps (%4, %2, 8), %%xmm2 \n" \
stereo("mulps 16(%4, %2, 8), %%xmm3 \n") \
"addps %%xmm2, %%xmm0 \n" \
stereo("addps %%xmm3, %%xmm1 \n") \
"add $4, %2 \n" \
"jl 2b \n" \
"mov %5, %2 \n" \
stereo("mov (%6, %2, %c8), %1 \n") \
"movaps %%xmm0, (%3, %0) \n" \
stereo("movaps %%xmm1, (%1, %0) \n") \
"add $16, %0 \n" \
"jl 1b \n" \
: "+&r"(i), "=&r"(j), "=&r"(k), "=&r"(m) \
: "r"(matrix_simd + in_ch), \
"g"((intptr_t) - 4 * (in_ch - 1)), \
"r"(samp + in_ch), \
"i"(sizeof(float *)), "i"(sizeof(float *)/4) \
: "memory" \
);
static void ac3_downmix_sse(float **samples, float (*matrix)[2],
int out_ch, int in_ch, int len)
{
int (*matrix_cmp)[2] = (int(*)[2])matrix;
intptr_t i, j, k, m;
i = -len * sizeof(float);
if (in_ch == 5 && out_ch == 2 &&
!(matrix_cmp[0][1] | matrix_cmp[2][0] |
matrix_cmp[3][1] | matrix_cmp[4][0] |
(matrix_cmp[1][0] ^ matrix_cmp[1][1]) |
(matrix_cmp[0][0] ^ matrix_cmp[2][1]))) {
MIX5(IF0, IF1);
} else if (in_ch == 5 && out_ch == 1 &&
matrix_cmp[0][0] == matrix_cmp[2][0] &&
matrix_cmp[3][0] == matrix_cmp[4][0]) {
MIX5(IF1, IF0);
} else {
DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4];
float *samp[AC3_MAX_CHANNELS];
for (j = 0; j < in_ch; j++)
samp[j] = samples[j] + len;
j = 2 * in_ch * sizeof(float);
__asm__ volatile (
"1: \n"
"sub $8, %0 \n"
"movss (%2, %0), %%xmm4 \n"
"movss 4(%2, %0), %%xmm5 \n"
"shufps $0, %%xmm4, %%xmm4 \n"
"shufps $0, %%xmm5, %%xmm5 \n"
"movaps %%xmm4, (%1, %0, 4) \n"
"movaps %%xmm5, 16(%1, %0, 4) \n"
"jg 1b \n"
: "+&r"(j)
: "r"(matrix_simd), "r"(matrix)
: "memory"
);
if (out_ch == 2) {
MIX_MISC(IF1);
} else {
MIX_MISC(IF0);
}
}
}
#endif /* HAVE_SSE_INLINE && HAVE_7REGS */
av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
c->ac3_exponent_min = ff_ac3_exponent_min_mmx;
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx;
c->ac3_lshift_int16 = ff_ac3_lshift_int16_mmx;
c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx;
}
if (EXTERNAL_AMD3DNOW(cpu_flags)) {
if (!bit_exact) {
c->float_to_fixed24 = ff_float_to_fixed24_3dnow;
}
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->ac3_exponent_min = ff_ac3_exponent_min_mmxext;
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext;
}
if (EXTERNAL_SSE(cpu_flags)) {
c->float_to_fixed24 = ff_float_to_fixed24_sse;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->ac3_exponent_min = ff_ac3_exponent_min_sse2;
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
c->float_to_fixed24 = ff_float_to_fixed24_sse2;
c->compute_mantissa_size = ff_ac3_compute_mantissa_size_sse2;
c->extract_exponents = ff_ac3_extract_exponents_sse2;
if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW)) {
c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2;
c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2;
}
}
if (EXTERNAL_SSSE3(cpu_flags)) {
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3;
if (!(cpu_flags & AV_CPU_FLAG_ATOM)) {
c->extract_exponents = ff_ac3_extract_exponents_ssse3;
}
}
#if HAVE_SSE_INLINE && HAVE_7REGS
if (INLINE_SSE(cpu_flags)) {
c->downmix = ac3_downmix_sse;
}
#endif
}

View File

@@ -0,0 +1,299 @@
/*
* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_CABAC_H
#define AVCODEC_X86_CABAC_H
#include "libavcodec/cabac.h"
#include "libavutil/attributes.h"
#include "libavutil/x86/asm.h"
#include "libavutil/internal.h"
#include "config.h"
#if (defined(__i386) && defined(__clang__) && (__clang_major__<2 || (__clang_major__==2 && __clang_minor__<10)))\
|| ( !defined(__clang__) && defined(__llvm__) && __GNUC__==4 && __GNUC_MINOR__==2 && __GNUC_PATCHLEVEL__<=1)
# define BROKEN_COMPILER 1
#else
# define BROKEN_COMPILER 0
#endif
#if HAVE_INLINE_ASM
#ifndef UNCHECKED_BITSTREAM_READER
#define UNCHECKED_BITSTREAM_READER !CONFIG_SAFE_BITSTREAM_READER
#endif
#if UNCHECKED_BITSTREAM_READER
#define END_CHECK(end) ""
#else
#define END_CHECK(end) \
"cmp "end" , %%"REG_c" \n\t"\
"jge 1f \n\t"
#endif
#ifdef BROKEN_RELOCATIONS
#define TABLES_ARG , "r"(tables)
#if HAVE_FAST_CMOV
#define BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \
"cmp "low" , "tmp" \n\t"\
"cmova %%ecx , "range" \n\t"\
"sbb %%rcx , %%rcx \n\t"\
"and %%ecx , "tmp" \n\t"\
"xor %%rcx , "retq" \n\t"\
"sub "tmp" , "low" \n\t"
#else /* HAVE_FAST_CMOV */
#define BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \
/* P4 Prescott has crappy cmov,sbb,64bit shift so avoid them */ \
"sub "low" , "tmp" \n\t"\
"sar $31 , "tmp" \n\t"\
"sub %%ecx , "range" \n\t"\
"and "tmp" , "range" \n\t"\
"add %%ecx , "range" \n\t"\
"shl $17 , %%ecx \n\t"\
"and "tmp" , %%ecx \n\t"\
"sub %%ecx , "low" \n\t"\
"xor "tmp" , "ret" \n\t"\
"movslq "ret" , "retq" \n\t"
#endif /* HAVE_FAST_CMOV */
#define BRANCHLESS_GET_CABAC(ret, retq, statep, low, lowword, range, rangeq, tmp, tmpbyte, byte, end, norm_off, lps_off, mlps_off, tables) \
"movzbl "statep" , "ret" \n\t"\
"mov "range" , "tmp" \n\t"\
"and $0xC0 , "range" \n\t"\
"lea ("ret", "range", 2), %%ecx \n\t"\
"movzbl "lps_off"("tables", %%rcx), "range" \n\t"\
"sub "range" , "tmp" \n\t"\
"mov "tmp" , %%ecx \n\t"\
"shl $17 , "tmp" \n\t"\
BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \
"movzbl "norm_off"("tables", "rangeq"), %%ecx \n\t"\
"shl %%cl , "range" \n\t"\
"movzbl "mlps_off"+128("tables", "retq"), "tmp" \n\t"\
"shl %%cl , "low" \n\t"\
"mov "tmpbyte" , "statep" \n\t"\
"test "lowword" , "lowword" \n\t"\
"jnz 2f \n\t"\
"mov "byte" , %%"REG_c" \n\t"\
END_CHECK(end)\
"add"OPSIZE" $2 , "byte" \n\t"\
"1: \n\t"\
"movzwl (%%"REG_c") , "tmp" \n\t"\
"lea -1("low") , %%ecx \n\t"\
"xor "low" , %%ecx \n\t"\
"shr $15 , %%ecx \n\t"\
"bswap "tmp" \n\t"\
"shr $15 , "tmp" \n\t"\
"movzbl "norm_off"("tables", %%rcx), %%ecx \n\t"\
"sub $0xFFFF , "tmp" \n\t"\
"neg %%ecx \n\t"\
"add $7 , %%ecx \n\t"\
"shl %%cl , "tmp" \n\t"\
"add "tmp" , "low" \n\t"\
"2: \n\t"
#else /* BROKEN_RELOCATIONS */
#define TABLES_ARG
#define RIP_ARG
#if HAVE_FAST_CMOV
#define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\
"mov "tmp" , %%ecx \n\t"\
"shl $17 , "tmp" \n\t"\
"cmp "low" , "tmp" \n\t"\
"cmova %%ecx , "range" \n\t"\
"sbb %%ecx , %%ecx \n\t"\
"and %%ecx , "tmp" \n\t"\
"xor %%ecx , "ret" \n\t"\
"sub "tmp" , "low" \n\t"
#else /* HAVE_FAST_CMOV */
#define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\
"mov "tmp" , %%ecx \n\t"\
"shl $17 , "tmp" \n\t"\
"sub "low" , "tmp" \n\t"\
"sar $31 , "tmp" \n\t" /*lps_mask*/\
"sub %%ecx , "range" \n\t" /*RangeLPS - range*/\
"and "tmp" , "range" \n\t" /*(RangeLPS - range)&lps_mask*/\
"add %%ecx , "range" \n\t" /*new range*/\
"shl $17 , %%ecx \n\t"\
"and "tmp" , %%ecx \n\t"\
"sub %%ecx , "low" \n\t"\
"xor "tmp" , "ret" \n\t"
#endif /* HAVE_FAST_CMOV */
#define BRANCHLESS_GET_CABAC(ret, retq, statep, low, lowword, range, rangeq, tmp, tmpbyte, byte, end, norm_off, lps_off, mlps_off, tables) \
"movzbl "statep" , "ret" \n\t"\
"mov "range" , "tmp" \n\t"\
"and $0xC0 , "range" \n\t"\
"movzbl "MANGLE(ff_h264_cabac_tables)"+"lps_off"("ret", "range", 2), "range" \n\t"\
"sub "range" , "tmp" \n\t"\
BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp) \
"movzbl "MANGLE(ff_h264_cabac_tables)"+"norm_off"("range"), %%ecx \n\t"\
"shl %%cl , "range" \n\t"\
"movzbl "MANGLE(ff_h264_cabac_tables)"+"mlps_off"+128("ret"), "tmp" \n\t"\
"shl %%cl , "low" \n\t"\
"mov "tmpbyte" , "statep" \n\t"\
"test "lowword" , "lowword" \n\t"\
" jnz 2f \n\t"\
"mov "byte" , %%"REG_c" \n\t"\
END_CHECK(end)\
"add"OPSIZE" $2 , "byte" \n\t"\
"1: \n\t"\
"movzwl (%%"REG_c") , "tmp" \n\t"\
"lea -1("low") , %%ecx \n\t"\
"xor "low" , %%ecx \n\t"\
"shr $15 , %%ecx \n\t"\
"bswap "tmp" \n\t"\
"shr $15 , "tmp" \n\t"\
"movzbl "MANGLE(ff_h264_cabac_tables)"+"norm_off"(%%ecx), %%ecx \n\t"\
"sub $0xFFFF , "tmp" \n\t"\
"neg %%ecx \n\t"\
"add $7 , %%ecx \n\t"\
"shl %%cl , "tmp" \n\t"\
"add "tmp" , "low" \n\t"\
"2: \n\t"
#endif /* BROKEN_RELOCATIONS */
#if HAVE_7REGS && !BROKEN_COMPILER
#define get_cabac_inline get_cabac_inline_x86
static av_always_inline int get_cabac_inline_x86(CABACContext *c,
uint8_t *const state)
{
int bit, tmp;
#ifdef BROKEN_RELOCATIONS
void *tables;
__asm__ volatile(
"lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
: "=&r"(tables)
);
#endif
__asm__ volatile(
BRANCHLESS_GET_CABAC("%0", "%q0", "(%4)", "%1", "%w1",
"%2", "%q2", "%3", "%b3",
"%c6(%5)", "%c7(%5)",
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
"%8")
: "=&r"(bit), "=&r"(c->low), "=&r"(c->range), "=&q"(tmp)
: "r"(state), "r"(c),
"i"(offsetof(CABACContext, bytestream)),
"i"(offsetof(CABACContext, bytestream_end))
TABLES_ARG
,"1"(c->low), "2"(c->range)
: "%"REG_c, "memory"
);
return bit & 1;
}
#endif /* HAVE_7REGS */
#if !BROKEN_COMPILER
#define get_cabac_bypass_sign get_cabac_bypass_sign_x86
static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
{
x86_reg tmp;
__asm__ volatile(
"movl %c6(%2), %k1 \n\t"
"movl %c3(%2), %%eax \n\t"
"shl $17, %k1 \n\t"
"add %%eax, %%eax \n\t"
"sub %k1, %%eax \n\t"
"cltd \n\t"
"and %%edx, %k1 \n\t"
"add %k1, %%eax \n\t"
"xor %%edx, %%ecx \n\t"
"sub %%edx, %%ecx \n\t"
"test %%ax, %%ax \n\t"
"jnz 1f \n\t"
"mov %c4(%2), %1 \n\t"
"subl $0xFFFF, %%eax \n\t"
"movzwl (%1), %%edx \n\t"
"bswap %%edx \n\t"
"shrl $15, %%edx \n\t"
#if UNCHECKED_BITSTREAM_READER
"add $2, %1 \n\t"
"addl %%edx, %%eax \n\t"
"mov %1, %c4(%2) \n\t"
#else
"addl %%edx, %%eax \n\t"
"cmp %c5(%2), %1 \n\t"
"jge 1f \n\t"
"add"OPSIZE" $2, %c4(%2) \n\t"
#endif
"1: \n\t"
"movl %%eax, %c3(%2) \n\t"
: "+c"(val), "=&r"(tmp)
: "r"(c),
"i"(offsetof(CABACContext, low)),
"i"(offsetof(CABACContext, bytestream)),
"i"(offsetof(CABACContext, bytestream_end)),
"i"(offsetof(CABACContext, range))
: "%eax", "%edx", "memory"
);
return val;
}
#define get_cabac_bypass get_cabac_bypass_x86
static av_always_inline int get_cabac_bypass_x86(CABACContext *c)
{
x86_reg tmp;
int res;
__asm__ volatile(
"movl %c6(%2), %k1 \n\t"
"movl %c3(%2), %%eax \n\t"
"shl $17, %k1 \n\t"
"add %%eax, %%eax \n\t"
"sub %k1, %%eax \n\t"
"cltd \n\t"
"and %%edx, %k1 \n\t"
"add %k1, %%eax \n\t"
"inc %%edx \n\t"
"test %%ax, %%ax \n\t"
"jnz 1f \n\t"
"mov %c4(%2), %1 \n\t"
"subl $0xFFFF, %%eax \n\t"
"movzwl (%1), %%ecx \n\t"
"bswap %%ecx \n\t"
"shrl $15, %%ecx \n\t"
"addl %%ecx, %%eax \n\t"
"cmp %c5(%2), %1 \n\t"
"jge 1f \n\t"
"add"OPSIZE" $2, %c4(%2) \n\t"
"1: \n\t"
"movl %%eax, %c3(%2) \n\t"
: "=&d"(res), "=&r"(tmp)
: "r"(c),
"i"(offsetof(CABACContext, low)),
"i"(offsetof(CABACContext, bytestream)),
"i"(offsetof(CABACContext, bytestream_end)),
"i"(offsetof(CABACContext, range))
: "%eax", "%ecx", "memory"
);
return res;
}
#endif /* !BROKEN_COMPILER */
#endif /* HAVE_INLINE_ASM */
#endif /* AVCODEC_X86_CABAC_H */

View File

@@ -0,0 +1,558 @@
/*
* Chinese AVS video (AVS1-P2, JiZhun profile) decoder.
* Copyright (c) 2006 Stefan Gehrer <stefan.gehrer@gmx.de>
*
* MMX-optimized DSP functions, based on H.264 optimizations by
* Michael Niedermayer and Loren Merritt
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/common.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/cavsdsp.h"
#include "constants.h"
#include "dsputil_x86.h"
#include "config.h"
#if HAVE_MMX_INLINE
/* in/out: mma=mma+mmb, mmb=mmb-mma */
#define SUMSUB_BA( a, b ) \
"paddw "#b", "#a" \n\t"\
"paddw "#b", "#b" \n\t"\
"psubw "#a", "#b" \n\t"
/*****************************************************************************
*
* inverse transform
*
****************************************************************************/
static inline void cavs_idct8_1d(int16_t *block, uint64_t bias)
{
__asm__ volatile(
"movq 112(%0), %%mm4 \n\t" /* mm4 = src7 */
"movq 16(%0), %%mm5 \n\t" /* mm5 = src1 */
"movq 80(%0), %%mm2 \n\t" /* mm2 = src5 */
"movq 48(%0), %%mm7 \n\t" /* mm7 = src3 */
"movq %%mm4, %%mm0 \n\t"
"movq %%mm5, %%mm3 \n\t"
"movq %%mm2, %%mm6 \n\t"
"movq %%mm7, %%mm1 \n\t"
"paddw %%mm4, %%mm4 \n\t" /* mm4 = 2*src7 */
"paddw %%mm3, %%mm3 \n\t" /* mm3 = 2*src1 */
"paddw %%mm6, %%mm6 \n\t" /* mm6 = 2*src5 */
"paddw %%mm1, %%mm1 \n\t" /* mm1 = 2*src3 */
"paddw %%mm4, %%mm0 \n\t" /* mm0 = 3*src7 */
"paddw %%mm3, %%mm5 \n\t" /* mm5 = 3*src1 */
"paddw %%mm6, %%mm2 \n\t" /* mm2 = 3*src5 */
"paddw %%mm1, %%mm7 \n\t" /* mm7 = 3*src3 */
"psubw %%mm4, %%mm5 \n\t" /* mm5 = 3*src1 - 2*src7 = a0 */
"paddw %%mm6, %%mm7 \n\t" /* mm7 = 3*src3 + 2*src5 = a1 */
"psubw %%mm2, %%mm1 \n\t" /* mm1 = 2*src3 - 3*src5 = a2 */
"paddw %%mm0, %%mm3 \n\t" /* mm3 = 2*src1 + 3*src7 = a3 */
"movq %%mm5, %%mm4 \n\t"
"movq %%mm7, %%mm6 \n\t"
"movq %%mm3, %%mm0 \n\t"
"movq %%mm1, %%mm2 \n\t"
SUMSUB_BA( %%mm7, %%mm5 ) /* mm7 = a0 + a1 mm5 = a0 - a1 */
"paddw %%mm3, %%mm7 \n\t" /* mm7 = a0 + a1 + a3 */
"paddw %%mm1, %%mm5 \n\t" /* mm5 = a0 - a1 + a2 */
"paddw %%mm7, %%mm7 \n\t"
"paddw %%mm5, %%mm5 \n\t"
"paddw %%mm6, %%mm7 \n\t" /* mm7 = b4 */
"paddw %%mm4, %%mm5 \n\t" /* mm5 = b5 */
SUMSUB_BA( %%mm1, %%mm3 ) /* mm1 = a3 + a2 mm3 = a3 - a2 */
"psubw %%mm1, %%mm4 \n\t" /* mm4 = a0 - a2 - a3 */
"movq %%mm4, %%mm1 \n\t" /* mm1 = a0 - a2 - a3 */
"psubw %%mm6, %%mm3 \n\t" /* mm3 = a3 - a2 - a1 */
"paddw %%mm1, %%mm1 \n\t"
"paddw %%mm3, %%mm3 \n\t"
"psubw %%mm2, %%mm1 \n\t" /* mm1 = b7 */
"paddw %%mm0, %%mm3 \n\t" /* mm3 = b6 */
"movq 32(%0), %%mm2 \n\t" /* mm2 = src2 */
"movq 96(%0), %%mm6 \n\t" /* mm6 = src6 */
"movq %%mm2, %%mm4 \n\t"
"movq %%mm6, %%mm0 \n\t"
"psllw $2, %%mm4 \n\t" /* mm4 = 4*src2 */
"psllw $2, %%mm6 \n\t" /* mm6 = 4*src6 */
"paddw %%mm4, %%mm2 \n\t" /* mm2 = 5*src2 */
"paddw %%mm6, %%mm0 \n\t" /* mm0 = 5*src6 */
"paddw %%mm2, %%mm2 \n\t"
"paddw %%mm0, %%mm0 \n\t"
"psubw %%mm0, %%mm4 \n\t" /* mm4 = 4*src2 - 10*src6 = a7 */
"paddw %%mm2, %%mm6 \n\t" /* mm6 = 4*src6 + 10*src2 = a6 */
"movq (%0), %%mm2 \n\t" /* mm2 = src0 */
"movq 64(%0), %%mm0 \n\t" /* mm0 = src4 */
SUMSUB_BA( %%mm0, %%mm2 ) /* mm0 = src0+src4 mm2 = src0-src4 */
"psllw $3, %%mm0 \n\t"
"psllw $3, %%mm2 \n\t"
"paddw %1, %%mm0 \n\t" /* add rounding bias */
"paddw %1, %%mm2 \n\t" /* add rounding bias */
SUMSUB_BA( %%mm6, %%mm0 ) /* mm6 = a4 + a6 mm0 = a4 - a6 */
SUMSUB_BA( %%mm4, %%mm2 ) /* mm4 = a5 + a7 mm2 = a5 - a7 */
SUMSUB_BA( %%mm7, %%mm6 ) /* mm7 = dst0 mm6 = dst7 */
SUMSUB_BA( %%mm5, %%mm4 ) /* mm5 = dst1 mm4 = dst6 */
SUMSUB_BA( %%mm3, %%mm2 ) /* mm3 = dst2 mm2 = dst5 */
SUMSUB_BA( %%mm1, %%mm0 ) /* mm1 = dst3 mm0 = dst4 */
:: "r"(block), "m"(bias)
);
}
#define SBUTTERFLY(a,b,t,n,m)\
"mov" #m " " #a ", " #t " \n\t" /* abcd */\
"punpckl" #n " " #b ", " #a " \n\t" /* aebf */\
"punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\
#define TRANSPOSE4(a,b,c,d,t)\
SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\
SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\
SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\
SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */
static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
{
int i;
DECLARE_ALIGNED(8, int16_t, b2)[64];
for(i=0; i<2; i++){
DECLARE_ALIGNED(8, uint64_t, tmp);
cavs_idct8_1d(block+4*i, ff_pw_4.a);
__asm__ volatile(
"psraw $3, %%mm7 \n\t"
"psraw $3, %%mm6 \n\t"
"psraw $3, %%mm5 \n\t"
"psraw $3, %%mm4 \n\t"
"psraw $3, %%mm3 \n\t"
"psraw $3, %%mm2 \n\t"
"psraw $3, %%mm1 \n\t"
"psraw $3, %%mm0 \n\t"
"movq %%mm7, %0 \n\t"
TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
"movq %%mm0, 8(%1) \n\t"
"movq %%mm6, 24(%1) \n\t"
"movq %%mm7, 40(%1) \n\t"
"movq %%mm4, 56(%1) \n\t"
"movq %0, %%mm7 \n\t"
TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
"movq %%mm7, (%1) \n\t"
"movq %%mm1, 16(%1) \n\t"
"movq %%mm0, 32(%1) \n\t"
"movq %%mm3, 48(%1) \n\t"
: "=m"(tmp)
: "r"(b2+32*i)
: "memory"
);
}
for(i=0; i<2; i++){
cavs_idct8_1d(b2+4*i, ff_pw_64.a);
__asm__ volatile(
"psraw $7, %%mm7 \n\t"
"psraw $7, %%mm6 \n\t"
"psraw $7, %%mm5 \n\t"
"psraw $7, %%mm4 \n\t"
"psraw $7, %%mm3 \n\t"
"psraw $7, %%mm2 \n\t"
"psraw $7, %%mm1 \n\t"
"psraw $7, %%mm0 \n\t"
"movq %%mm7, (%0) \n\t"
"movq %%mm5, 16(%0) \n\t"
"movq %%mm3, 32(%0) \n\t"
"movq %%mm1, 48(%0) \n\t"
"movq %%mm0, 64(%0) \n\t"
"movq %%mm2, 80(%0) \n\t"
"movq %%mm4, 96(%0) \n\t"
"movq %%mm6, 112(%0) \n\t"
:: "r"(b2+4*i)
: "memory"
);
}
ff_add_pixels_clamped_mmx(b2, dst, stride);
}
#endif /* HAVE_MMX_INLINE */
#if (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE)
/*****************************************************************************
*
* motion compensation
*
****************************************************************************/
/* vertical filter [-1 -2 96 42 -7 0] */
#define QPEL_CAVSV1(A,B,C,D,E,F,OP,MUL2) \
"movd (%0), "#F" \n\t"\
"movq "#C", %%mm6 \n\t"\
"pmullw %5, %%mm6 \n\t"\
"movq "#D", %%mm7 \n\t"\
"pmullw "MANGLE(MUL2)", %%mm7\n\t"\
"psllw $3, "#E" \n\t"\
"psubw "#E", %%mm6 \n\t"\
"psraw $3, "#E" \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw "#E", %%mm6 \n\t"\
"paddw "#B", "#B" \n\t"\
"pxor %%mm7, %%mm7 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, "#F" \n\t"\
"psubw "#B", %%mm6 \n\t"\
"psraw $1, "#B" \n\t"\
"psubw "#A", %%mm6 \n\t"\
"paddw %4, %%mm6 \n\t"\
"psraw $7, %%mm6 \n\t"\
"packuswb %%mm6, %%mm6 \n\t"\
OP(%%mm6, (%1), A, d) \
"add %3, %1 \n\t"
/* vertical filter [ 0 -1 5 5 -1 0] */
#define QPEL_CAVSV2(A,B,C,D,E,F,OP,MUL2) \
"movd (%0), "#F" \n\t"\
"movq "#C", %%mm6 \n\t"\
"paddw "#D", %%mm6 \n\t"\
"pmullw %5, %%mm6 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, "#F" \n\t"\
"psubw "#B", %%mm6 \n\t"\
"psubw "#E", %%mm6 \n\t"\
"paddw %4, %%mm6 \n\t"\
"psraw $3, %%mm6 \n\t"\
"packuswb %%mm6, %%mm6 \n\t"\
OP(%%mm6, (%1), A, d) \
"add %3, %1 \n\t"
/* vertical filter [ 0 -7 42 96 -2 -1] */
#define QPEL_CAVSV3(A,B,C,D,E,F,OP,MUL2) \
"movd (%0), "#F" \n\t"\
"movq "#C", %%mm6 \n\t"\
"pmullw "MANGLE(MUL2)", %%mm6\n\t"\
"movq "#D", %%mm7 \n\t"\
"pmullw %5, %%mm7 \n\t"\
"psllw $3, "#B" \n\t"\
"psubw "#B", %%mm6 \n\t"\
"psraw $3, "#B" \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw "#B", %%mm6 \n\t"\
"paddw "#E", "#E" \n\t"\
"pxor %%mm7, %%mm7 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, "#F" \n\t"\
"psubw "#E", %%mm6 \n\t"\
"psraw $1, "#E" \n\t"\
"psubw "#F", %%mm6 \n\t"\
"paddw %4, %%mm6 \n\t"\
"psraw $7, %%mm6 \n\t"\
"packuswb %%mm6, %%mm6 \n\t"\
OP(%%mm6, (%1), A, d) \
"add %3, %1 \n\t"
#define QPEL_CAVSVNUM(VOP,OP,ADD,MUL1,MUL2)\
int w= 2;\
src -= 2*srcStride;\
\
while(w--){\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movd (%0), %%mm0 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\
VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\
VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
\
: "+a"(src), "+c"(dst)\
: "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1)\
: "memory"\
);\
if(h==16){\
__asm__ volatile(\
VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\
VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\
VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
\
: "+a"(src), "+c"(dst)\
: "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1)\
: "memory"\
);\
}\
src += 4-(h+5)*srcStride;\
dst += 4-h*dstStride;\
}
#define QPEL_CAVS(OPNAME, OP, MMX)\
static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
int h=8;\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movq %5, %%mm6 \n\t"\
"1: \n\t"\
"movq (%0), %%mm0 \n\t"\
"movq 1(%0), %%mm2 \n\t"\
"movq %%mm0, %%mm1 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpckhbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm3, %%mm1 \n\t"\
"pmullw %%mm6, %%mm0 \n\t"\
"pmullw %%mm6, %%mm1 \n\t"\
"movq -1(%0), %%mm2 \n\t"\
"movq 2(%0), %%mm4 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"movq %%mm4, %%mm5 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
"punpckhbw %%mm7, %%mm5 \n\t"\
"paddw %%mm4, %%mm2 \n\t"\
"paddw %%mm3, %%mm5 \n\t"\
"psubw %%mm2, %%mm0 \n\t"\
"psubw %%mm5, %%mm1 \n\t"\
"movq %6, %%mm5 \n\t"\
"paddw %%mm5, %%mm0 \n\t"\
"paddw %%mm5, %%mm1 \n\t"\
"psraw $3, %%mm0 \n\t"\
"psraw $3, %%mm1 \n\t"\
"packuswb %%mm1, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm5, q) \
"add %3, %0 \n\t"\
"add %4, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(src), "+c"(dst), "+m"(h)\
: "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_4)\
: "memory"\
);\
}\
\
static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
QPEL_CAVSVNUM(QPEL_CAVSV1,OP,ff_pw_64,ff_pw_96,ff_pw_42) \
}\
\
static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_5) \
}\
\
static inline void OPNAME ## cavs_qpel8or16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
QPEL_CAVSVNUM(QPEL_CAVSV3,OP,ff_pw_64,ff_pw_96,ff_pw_42) \
}\
\
static void OPNAME ## cavs_qpel8_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static void OPNAME ## cavs_qpel16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 16);\
OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## cavs_qpel8_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static void OPNAME ## cavs_qpel16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 16);\
OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## cavs_qpel8_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static void OPNAME ## cavs_qpel16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 16);\
OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## cavs_qpel16_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\
OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
src += 8*srcStride;\
dst += 8*dstStride;\
OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\
OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\
#define CAVS_MC(OPNAME, SIZE, MMX) \
static void OPNAME ## cavs_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
OPNAME ## cavs_qpel ## SIZE ## _h_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## cavs_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
OPNAME ## cavs_qpel ## SIZE ## _v1_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## cavs_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
OPNAME ## cavs_qpel ## SIZE ## _v2_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
OPNAME ## cavs_qpel ## SIZE ## _v3_ ## MMX(dst, src, stride, stride);\
}\
#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgusb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
#define AVG_MMXEXT_OP(a, b, temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
#endif /* (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE) */
#if HAVE_MMX_INLINE
static void put_cavs_qpel8_mc00_mmx(uint8_t *dst, uint8_t *src,
ptrdiff_t stride)
{
ff_put_pixels8_mmx(dst, src, stride, 8);
}
static void avg_cavs_qpel8_mc00_mmx(uint8_t *dst, uint8_t *src,
ptrdiff_t stride)
{
ff_avg_pixels8_mmx(dst, src, stride, 8);
}
static void put_cavs_qpel16_mc00_mmx(uint8_t *dst, uint8_t *src,
ptrdiff_t stride)
{
ff_put_pixels16_mmx(dst, src, stride, 16);
}
static void avg_cavs_qpel16_mc00_mmx(uint8_t *dst, uint8_t *src,
ptrdiff_t stride)
{
ff_avg_pixels16_mmx(dst, src, stride, 16);
}
static av_cold void cavsdsp_init_mmx(CAVSDSPContext *c,
AVCodecContext *avctx)
{
c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_mmx;
c->put_cavs_qpel_pixels_tab[1][0] = put_cavs_qpel8_mc00_mmx;
c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmx;
c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmx;
c->cavs_idct8_add = cavs_idct8_add_mmx;
c->idct_perm = FF_TRANSPOSE_IDCT_PERM;
}
#endif /* HAVE_MMX_INLINE */
#define DSPFUNC(PFX, IDX, NUM, EXT) \
c->PFX ## _cavs_qpel_pixels_tab[IDX][ 2] = PFX ## _cavs_qpel ## NUM ## _mc20_ ## EXT; \
c->PFX ## _cavs_qpel_pixels_tab[IDX][ 4] = PFX ## _cavs_qpel ## NUM ## _mc01_ ## EXT; \
c->PFX ## _cavs_qpel_pixels_tab[IDX][ 8] = PFX ## _cavs_qpel ## NUM ## _mc02_ ## EXT; \
c->PFX ## _cavs_qpel_pixels_tab[IDX][12] = PFX ## _cavs_qpel ## NUM ## _mc03_ ## EXT; \
#if HAVE_MMXEXT_INLINE
QPEL_CAVS(put_, PUT_OP, mmxext)
QPEL_CAVS(avg_, AVG_MMXEXT_OP, mmxext)
CAVS_MC(put_, 8, mmxext)
CAVS_MC(put_, 16, mmxext)
CAVS_MC(avg_, 8, mmxext)
CAVS_MC(avg_, 16, mmxext)
static av_cold void cavsdsp_init_mmxext(CAVSDSPContext *c,
AVCodecContext *avctx)
{
DSPFUNC(put, 0, 16, mmxext);
DSPFUNC(put, 1, 8, mmxext);
DSPFUNC(avg, 0, 16, mmxext);
DSPFUNC(avg, 1, 8, mmxext);
}
#endif /* HAVE_MMXEXT_INLINE */
#if HAVE_AMD3DNOW_INLINE
QPEL_CAVS(put_, PUT_OP, 3dnow)
QPEL_CAVS(avg_, AVG_3DNOW_OP, 3dnow)
CAVS_MC(put_, 8, 3dnow)
CAVS_MC(put_, 16,3dnow)
CAVS_MC(avg_, 8, 3dnow)
CAVS_MC(avg_, 16,3dnow)
static av_cold void cavsdsp_init_3dnow(CAVSDSPContext *c,
AVCodecContext *avctx)
{
DSPFUNC(put, 0, 16, 3dnow);
DSPFUNC(put, 1, 8, 3dnow);
DSPFUNC(avg, 0, 16, 3dnow);
DSPFUNC(avg, 1, 8, 3dnow);
}
#endif /* HAVE_AMD3DNOW_INLINE */
av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c, AVCodecContext *avctx)
{
#if HAVE_MMX_INLINE
int cpu_flags = av_get_cpu_flags();
if (INLINE_MMX(cpu_flags))
cavsdsp_init_mmx(c, avctx);
#endif /* HAVE_MMX_INLINE */
#if HAVE_MMXEXT_INLINE
if (INLINE_MMXEXT(cpu_flags))
cavsdsp_init_mmxext(c, avctx);
#endif /* HAVE_MMXEXT_INLINE */
#if HAVE_AMD3DNOW_INLINE
if (INLINE_AMD3DNOW(cpu_flags))
cavsdsp_init_3dnow(c, avctx);
#endif /* HAVE_AMD3DNOW_INLINE */
}

View File

@@ -0,0 +1,53 @@
/*
* MMX/SSE constants used across x86 dsp optimizations.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h" // for xmm_reg
#include "constants.h"
DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = 0x0014001400140014ULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;

View File

@@ -0,0 +1,51 @@
/*
* MMX/SSE constants used across x86 dsp optimizations.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_CONSTANTS_H
#define AVCODEC_X86_CONSTANTS_H
#include <stdint.h>
#include "libavutil/x86/asm.h"
extern const uint64_t ff_wtwo;
extern const xmm_reg ff_pw_3;
extern const xmm_reg ff_pw_4;
extern const xmm_reg ff_pw_5;
extern const xmm_reg ff_pw_8;
extern const uint64_t ff_pw_15;
extern const xmm_reg ff_pw_16;
extern const xmm_reg ff_pw_18;
extern const uint64_t ff_pw_20;
extern const xmm_reg ff_pw_32;
extern const uint64_t ff_pw_42;
extern const uint64_t ff_pw_53;
extern const xmm_reg ff_pw_64;
extern const uint64_t ff_pw_96;
extern const uint64_t ff_pw_128;
extern const uint64_t ff_pw_255;
extern const xmm_reg ff_pb_1;
extern const xmm_reg ff_pb_3;
extern const xmm_reg ff_pb_F8;
extern const uint64_t ff_pb_FC;
#endif /* AVCODEC_X86_CONSTANTS_H */

View File

@@ -0,0 +1,490 @@
;******************************************************************************
;* 32 point SSE-optimized DCT transform
;* Copyright (c) 2010 Vitor Sessak
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA 32
align 32
ps_cos_vec: dd 0.500603, 0.505471, 0.515447, 0.531043
dd 0.553104, 0.582935, 0.622504, 0.674808
dd -10.190008, -3.407609, -2.057781, -1.484165
dd -1.169440, -0.972568, -0.839350, -0.744536
dd 0.502419, 0.522499, 0.566944, 0.646822
dd 0.788155, 1.060678, 1.722447, 5.101149
dd 0.509796, 0.601345, 0.899976, 2.562916
dd 0.509796, 0.601345, 0.899976, 2.562916
dd 1.000000, 1.000000, 1.306563, 0.541196
dd 1.000000, 1.000000, 1.306563, 0.541196
dd 1.000000, 0.707107, 1.000000, -0.707107
dd 1.000000, 0.707107, 1.000000, -0.707107
dd 0.707107, 0.707107, 0.707107, 0.707107
align 32
ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
%macro BUTTERFLY 4
subps %4, %1, %2
addps %2, %2, %1
mulps %1, %4, %3
%endmacro
%macro BUTTERFLY0 5
%if cpuflag(sse2) && notcpuflag(avx)
pshufd %4, %1, %5
xorps %1, %2
addps %1, %4
mulps %1, %3
%else
shufps %4, %1, %1, %5
xorps %1, %1, %2
addps %4, %4, %1
mulps %1, %4, %3
%endif
%endmacro
%macro BUTTERFLY2 4
BUTTERFLY0 %1, %2, %3, %4, 0x1b
%endmacro
%macro BUTTERFLY3 4
BUTTERFLY0 %1, %2, %3, %4, 0xb1
%endmacro
%macro BUTTERFLY3V 5
movaps m%5, m%1
addps m%1, m%2
subps m%5, m%2
SWAP %2, %5
mulps m%2, [ps_cos_vec+192]
movaps m%5, m%3
addps m%3, m%4
subps m%4, m%5
mulps m%4, [ps_cos_vec+192]
%endmacro
%macro PASS6_AND_PERMUTE 0
mov tmpd, [outq+4]
movss m7, [outq+72]
addss m7, [outq+76]
movss m3, [outq+56]
addss m3, [outq+60]
addss m4, m3
movss m2, [outq+52]
addss m2, m3
movss m3, [outq+104]
addss m3, [outq+108]
addss m1, m3
addss m5, m4
movss [outq+ 16], m1
movss m1, [outq+100]
addss m1, m3
movss m3, [outq+40]
movss [outq+ 48], m1
addss m3, [outq+44]
movss m1, [outq+100]
addss m4, m3
addss m3, m2
addss m1, [outq+108]
movss [outq+ 40], m3
addss m2, [outq+36]
movss m3, [outq+8]
movss [outq+ 56], m2
addss m3, [outq+12]
movss [outq+ 32], m3
movss m3, [outq+80]
movss [outq+ 8], m5
movss [outq+ 80], m1
movss m2, [outq+52]
movss m5, [outq+120]
addss m5, [outq+124]
movss m1, [outq+64]
addss m2, [outq+60]
addss m0, m5
addss m5, [outq+116]
mov [outq+64], tmpd
addss m6, m0
addss m1, m6
mov tmpd, [outq+12]
mov [outq+ 96], tmpd
movss [outq+ 4], m1
movss m1, [outq+24]
movss [outq+ 24], m4
movss m4, [outq+88]
addss m4, [outq+92]
addss m3, m4
addss m4, [outq+84]
mov tmpd, [outq+108]
addss m1, [outq+28]
addss m0, m1
addss m1, m5
addss m6, m3
addss m3, m0
addss m0, m7
addss m5, [outq+20]
addss m7, m1
movss [outq+ 12], m6
mov [outq+112], tmpd
movss m6, [outq+28]
movss [outq+ 28], m0
movss m0, [outq+36]
movss [outq+ 36], m7
addss m1, m4
movss m7, [outq+116]
addss m0, m2
addss m7, [outq+124]
movss [outq+ 72], m0
movss m0, [outq+44]
addss m2, m0
movss [outq+ 44], m1
movss [outq+ 88], m2
addss m0, [outq+60]
mov tmpd, [outq+60]
mov [outq+120], tmpd
movss [outq+104], m0
addss m4, m5
addss m5, [outq+68]
movss [outq+52], m4
movss [outq+60], m5
movss m4, [outq+68]
movss m5, [outq+20]
movss [outq+ 20], m3
addss m5, m7
addss m7, m6
addss m4, m5
movss m2, [outq+84]
addss m2, [outq+92]
addss m5, m2
movss [outq+ 68], m4
addss m2, m7
movss m4, [outq+76]
movss [outq+ 84], m2
movss [outq+ 76], m5
addss m7, m4
addss m6, [outq+124]
addss m4, m6
addss m6, [outq+92]
movss [outq+100], m4
movss [outq+108], m6
movss m6, [outq+92]
movss [outq+92], m7
addss m6, [outq+124]
movss [outq+116], m6
%endmacro
INIT_YMM avx
SECTION_TEXT
%if HAVE_AVX_EXTERNAL
; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
cglobal dct32_float, 2,3,8, out, in, tmp
; pass 1
vmovaps m4, [inq+0]
vinsertf128 m5, m5, [inq+96], 1
vinsertf128 m5, m5, [inq+112], 0
vshufps m5, m5, m5, 0x1b
BUTTERFLY m4, m5, [ps_cos_vec], m6
vmovaps m2, [inq+64]
vinsertf128 m6, m6, [inq+32], 1
vinsertf128 m6, m6, [inq+48], 0
vshufps m6, m6, m6, 0x1b
BUTTERFLY m2, m6, [ps_cos_vec+32], m0
; pass 2
BUTTERFLY m5, m6, [ps_cos_vec+64], m0
BUTTERFLY m4, m2, [ps_cos_vec+64], m7
; pass 3
vperm2f128 m3, m6, m4, 0x31
vperm2f128 m1, m6, m4, 0x20
vshufps m3, m3, m3, 0x1b
BUTTERFLY m1, m3, [ps_cos_vec+96], m6
vperm2f128 m4, m5, m2, 0x20
vperm2f128 m5, m5, m2, 0x31
vshufps m5, m5, m5, 0x1b
BUTTERFLY m4, m5, [ps_cos_vec+96], m6
; pass 4
vmovaps m6, [ps_p1p1m1m1+0]
vmovaps m2, [ps_cos_vec+128]
BUTTERFLY2 m5, m6, m2, m7
BUTTERFLY2 m4, m6, m2, m7
BUTTERFLY2 m1, m6, m2, m7
BUTTERFLY2 m3, m6, m2, m7
; pass 5
vshufps m6, m6, m6, 0xcc
vmovaps m2, [ps_cos_vec+160]
BUTTERFLY3 m5, m6, m2, m7
BUTTERFLY3 m4, m6, m2, m7
BUTTERFLY3 m1, m6, m2, m7
BUTTERFLY3 m3, m6, m2, m7
vperm2f128 m6, m3, m3, 0x31
vmovaps [outq], m3
vextractf128 [outq+64], m5, 1
vextractf128 [outq+32], m5, 0
vextractf128 [outq+80], m4, 1
vextractf128 [outq+48], m4, 0
vperm2f128 m0, m1, m1, 0x31
vmovaps [outq+96], m1
vzeroupper
; pass 6, no SIMD...
INIT_XMM
PASS6_AND_PERMUTE
RET
%endif
%if ARCH_X86_64
%define SPILL SWAP
%define UNSPILL SWAP
%macro PASS5 0
nop ; FIXME code alignment
SWAP 5, 8
SWAP 4, 12
SWAP 6, 14
SWAP 7, 13
SWAP 0, 15
PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13
TRANSPOSE4x4PS 8, 9, 10, 11, 0
BUTTERFLY3V 8, 9, 10, 11, 0
addps m10, m11
TRANSPOSE4x4PS 12, 13, 14, 15, 0
BUTTERFLY3V 12, 13, 14, 15, 0
addps m14, m15
addps m12, m14
addps m14, m13
addps m13, m15
%endmacro
%macro PASS6 0
SWAP 9, 12
SWAP 11, 14
movss [outq+0x00], m8
pshuflw m0, m8, 0xe
movss [outq+0x10], m9
pshuflw m1, m9, 0xe
movss [outq+0x20], m10
pshuflw m2, m10, 0xe
movss [outq+0x30], m11
pshuflw m3, m11, 0xe
movss [outq+0x40], m12
pshuflw m4, m12, 0xe
movss [outq+0x50], m13
pshuflw m5, m13, 0xe
movss [outq+0x60], m14
pshuflw m6, m14, 0xe
movaps [outq+0x70], m15
pshuflw m7, m15, 0xe
addss m0, m1
addss m1, m2
movss [outq+0x08], m0
addss m2, m3
movss [outq+0x18], m1
addss m3, m4
movss [outq+0x28], m2
addss m4, m5
movss [outq+0x38], m3
addss m5, m6
movss [outq+0x48], m4
addss m6, m7
movss [outq+0x58], m5
movss [outq+0x68], m6
movss [outq+0x78], m7
PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7
movhlps m0, m1
pshufd m1, m1, 3
SWAP 0, 2, 4, 6, 8, 10, 12, 14
SWAP 1, 3, 5, 7, 9, 11, 13, 15
%rep 7
movhlps m0, m1
pshufd m1, m1, 3
addss m15, m1
SWAP 0, 2, 4, 6, 8, 10, 12, 14
SWAP 1, 3, 5, 7, 9, 11, 13, 15
%endrep
%assign i 4
%rep 15
addss m0, m1
movss [outq+i], m0
SWAP 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
%assign i i+8
%endrep
%endmacro
%else ; ARCH_X86_32
%macro SPILL 2 ; xmm#, mempos
movaps [outq+(%2-8)*16], m%1
%endmacro
%macro UNSPILL 2
movaps m%1, [outq+(%2-8)*16]
%endmacro
%define PASS6 PASS6_AND_PERMUTE
%macro PASS5 0
movaps m2, [ps_cos_vec+160]
shufps m3, m3, 0xcc
BUTTERFLY3 m5, m3, m2, m1
SPILL 5, 8
UNSPILL 1, 9
BUTTERFLY3 m1, m3, m2, m5
SPILL 1, 14
BUTTERFLY3 m4, m3, m2, m5
SPILL 4, 12
BUTTERFLY3 m7, m3, m2, m5
SPILL 7, 13
UNSPILL 5, 10
BUTTERFLY3 m5, m3, m2, m7
SPILL 5, 10
UNSPILL 4, 11
BUTTERFLY3 m4, m3, m2, m7
SPILL 4, 11
BUTTERFLY3 m6, m3, m2, m7
SPILL 6, 9
BUTTERFLY3 m0, m3, m2, m7
SPILL 0, 15
%endmacro
%endif
; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
%macro DCT32_FUNC 0
cglobal dct32_float, 2, 3, 16, out, in, tmp
; pass 1
movaps m0, [inq+0]
LOAD_INV m1, [inq+112]
BUTTERFLY m0, m1, [ps_cos_vec], m3
movaps m7, [inq+64]
LOAD_INV m4, [inq+48]
BUTTERFLY m7, m4, [ps_cos_vec+32], m3
; pass 2
movaps m2, [ps_cos_vec+64]
BUTTERFLY m1, m4, m2, m3
SPILL 1, 11
SPILL 4, 8
; pass 1
movaps m1, [inq+16]
LOAD_INV m6, [inq+96]
BUTTERFLY m1, m6, [ps_cos_vec+16], m3
movaps m4, [inq+80]
LOAD_INV m5, [inq+32]
BUTTERFLY m4, m5, [ps_cos_vec+48], m3
; pass 2
BUTTERFLY m0, m7, m2, m3
movaps m2, [ps_cos_vec+80]
BUTTERFLY m6, m5, m2, m3
BUTTERFLY m1, m4, m2, m3
; pass 3
movaps m2, [ps_cos_vec+96]
shufps m1, m1, 0x1b
BUTTERFLY m0, m1, m2, m3
SPILL 0, 15
SPILL 1, 14
UNSPILL 0, 8
shufps m5, m5, 0x1b
BUTTERFLY m0, m5, m2, m3
UNSPILL 1, 11
shufps m6, m6, 0x1b
BUTTERFLY m1, m6, m2, m3
SPILL 1, 11
shufps m4, m4, 0x1b
BUTTERFLY m7, m4, m2, m3
; pass 4
movaps m3, [ps_p1p1m1m1+0]
movaps m2, [ps_cos_vec+128]
BUTTERFLY2 m5, m3, m2, m1
BUTTERFLY2 m0, m3, m2, m1
SPILL 0, 9
BUTTERFLY2 m6, m3, m2, m1
SPILL 6, 10
UNSPILL 0, 11
BUTTERFLY2 m0, m3, m2, m1
SPILL 0, 11
BUTTERFLY2 m4, m3, m2, m1
BUTTERFLY2 m7, m3, m2, m1
UNSPILL 6, 14
BUTTERFLY2 m6, m3, m2, m1
UNSPILL 0, 15
BUTTERFLY2 m0, m3, m2, m1
PASS5
PASS6
RET
%endmacro
%macro LOAD_INV 2
%if cpuflag(sse2)
pshufd %1, %2, 0x1b
%elif cpuflag(sse)
movaps %1, %2
shufps %1, %1, 0x1b
%endif
%endmacro
INIT_XMM sse
DCT32_FUNC
INIT_XMM sse2
DCT32_FUNC

View File

@@ -0,0 +1,39 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dct.h"
void ff_dct32_float_sse(FFTSample *out, const FFTSample *in);
void ff_dct32_float_sse2(FFTSample *out, const FFTSample *in);
void ff_dct32_float_avx(FFTSample *out, const FFTSample *in);
av_cold void ff_dct_init_x86(DCTContext *s)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE(cpu_flags))
s->dct32 = ff_dct32_float_sse;
if (EXTERNAL_SSE2(cpu_flags))
s->dct32 = ff_dct32_float_sse2;
if (EXTERNAL_AVX(cpu_flags))
s->dct32 = ff_dct32_float_avx;
}

View File

@@ -0,0 +1,82 @@
;******************************************************************************
;* MMX optimized deinterlacing functions
;* Copyright (c) 2010 Vitor Sessak
;* Copyright (c) 2002 Michael Niedermayer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
cextern pw_4
SECTION .text
%macro DEINTERLACE 1
%ifidn %1, inplace
;void ff_deinterlace_line_inplace_mmx(const uint8_t *lum_m4, const uint8_t *lum_m3, const uint8_t *lum_m2, const uint8_t *lum_m1, const uint8_t *lum, int size)
cglobal deinterlace_line_inplace_mmx, 6,6,7, lum_m4, lum_m3, lum_m2, lum_m1, lum, size
%else
;void ff_deinterlace_line_mmx(uint8_t *dst, const uint8_t *lum_m4, const uint8_t *lum_m3, const uint8_t *lum_m2, const uint8_t *lum_m1, const uint8_t *lum, int size)
cglobal deinterlace_line_mmx, 7,7,7, dst, lum_m4, lum_m3, lum_m2, lum_m1, lum, size
%endif
pxor mm7, mm7
movq mm6, [pw_4]
.nextrow:
movd mm0, [lum_m4q]
movd mm1, [lum_m3q]
movd mm2, [lum_m2q]
%ifidn %1, inplace
movd [lum_m4q], mm2
%endif
movd mm3, [lum_m1q]
movd mm4, [lumq]
punpcklbw mm0, mm7
punpcklbw mm1, mm7
punpcklbw mm2, mm7
punpcklbw mm3, mm7
punpcklbw mm4, mm7
paddw mm1, mm3
psllw mm2, 1
paddw mm0, mm4
psllw mm1, 2
paddw mm2, mm6
paddw mm1, mm2
psubusw mm1, mm0
psrlw mm1, 3
packuswb mm1, mm7
%ifidn %1, inplace
movd [lum_m2q], mm1
%else
movd [dstq], mm1
add dstq, 4
%endif
add lum_m4q, 4
add lum_m3q, 4
add lum_m2q, 4
add lum_m1q, 4
add lumq, 4
sub sized, 4
jg .nextrow
REP_RET
%endmacro
DEINTERLACE ""
DEINTERLACE inplace

View File

@@ -0,0 +1,202 @@
/*
* MMX optimized discrete wavelet transform
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
* Copyright (c) 2010 David Conrad
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/x86/asm.h"
#include "dsputil_x86.h"
#include "dirac_dwt.h"
#define COMPOSE_VERTICAL(ext, align) \
void ff_vertical_compose53iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width); \
void ff_vertical_compose_dirac53iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width); \
void ff_vertical_compose_dd137iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width); \
void ff_vertical_compose_dd97iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width); \
void ff_vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width); \
void ff_horizontal_compose_haar0i##ext(IDWTELEM *b, IDWTELEM *tmp, int w);\
void ff_horizontal_compose_haar1i##ext(IDWTELEM *b, IDWTELEM *tmp, int w);\
\
static void vertical_compose53iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width) \
{ \
int i, width_align = width&~(align-1); \
\
for(i=width_align; i<width; i++) \
b1[i] = COMPOSE_53iL0(b0[i], b1[i], b2[i]); \
\
ff_vertical_compose53iL0##ext(b0, b1, b2, width_align); \
} \
\
static void vertical_compose_dirac53iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width) \
{ \
int i, width_align = width&~(align-1); \
\
for(i=width_align; i<width; i++) \
b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]); \
\
ff_vertical_compose_dirac53iH0##ext(b0, b1, b2, width_align); \
} \
\
static void vertical_compose_dd137iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, \
IDWTELEM *b3, IDWTELEM *b4, int width) \
{ \
int i, width_align = width&~(align-1); \
\
for(i=width_align; i<width; i++) \
b2[i] = COMPOSE_DD137iL0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
\
ff_vertical_compose_dd137iL0##ext(b0, b1, b2, b3, b4, width_align); \
} \
\
static void vertical_compose_dd97iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, \
IDWTELEM *b3, IDWTELEM *b4, int width) \
{ \
int i, width_align = width&~(align-1); \
\
for(i=width_align; i<width; i++) \
b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
\
ff_vertical_compose_dd97iH0##ext(b0, b1, b2, b3, b4, width_align); \
} \
static void vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width) \
{ \
int i, width_align = width&~(align-1); \
\
for(i=width_align; i<width; i++) { \
b0[i] = COMPOSE_HAARiL0(b0[i], b1[i]); \
b1[i] = COMPOSE_HAARiH0(b1[i], b0[i]); \
} \
\
ff_vertical_compose_haar##ext(b0, b1, width_align); \
} \
static void horizontal_compose_haar0i##ext(IDWTELEM *b, IDWTELEM *tmp, int w)\
{\
int w2= w>>1;\
int x= w2 - (w2&(align-1));\
ff_horizontal_compose_haar0i##ext(b, tmp, w);\
\
for (; x < w2; x++) {\
b[2*x ] = tmp[x];\
b[2*x+1] = COMPOSE_HAARiH0(b[x+w2], tmp[x]);\
}\
}\
static void horizontal_compose_haar1i##ext(IDWTELEM *b, IDWTELEM *tmp, int w)\
{\
int w2= w>>1;\
int x= w2 - (w2&(align-1));\
ff_horizontal_compose_haar1i##ext(b, tmp, w);\
\
for (; x < w2; x++) {\
b[2*x ] = (tmp[x] + 1)>>1;\
b[2*x+1] = (COMPOSE_HAARiH0(b[x+w2], tmp[x]) + 1)>>1;\
}\
}\
\
#if HAVE_YASM
#if !ARCH_X86_64
COMPOSE_VERTICAL(_mmx, 4)
#endif
COMPOSE_VERTICAL(_sse2, 8)
void ff_horizontal_compose_dd97i_ssse3(IDWTELEM *b, IDWTELEM *tmp, int w);
static void horizontal_compose_dd97i_ssse3(IDWTELEM *b, IDWTELEM *tmp, int w)
{
int w2= w>>1;
int x= w2 - (w2&7);
ff_horizontal_compose_dd97i_ssse3(b, tmp, w);
for (; x < w2; x++) {
b[2*x ] = (tmp[x] + 1)>>1;
b[2*x+1] = (COMPOSE_DD97iH0(tmp[x-1], tmp[x], b[x+w2], tmp[x+1], tmp[x+2]) + 1)>>1;
}
}
#endif
void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type)
{
#if HAVE_YASM
int mm_flags = av_get_cpu_flags();
#if !ARCH_X86_64
if (!(mm_flags & AV_CPU_FLAG_MMX))
return;
switch (type) {
case DWT_DIRAC_DD9_7:
d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
break;
case DWT_DIRAC_LEGALL5_3:
d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_mmx;
break;
case DWT_DIRAC_DD13_7:
d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_mmx;
d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
break;
case DWT_DIRAC_HAAR0:
d->vertical_compose = (void*)vertical_compose_haar_mmx;
d->horizontal_compose = horizontal_compose_haar0i_mmx;
break;
case DWT_DIRAC_HAAR1:
d->vertical_compose = (void*)vertical_compose_haar_mmx;
d->horizontal_compose = horizontal_compose_haar1i_mmx;
break;
}
#endif
if (!(mm_flags & AV_CPU_FLAG_SSE2))
return;
switch (type) {
case DWT_DIRAC_DD9_7:
d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
break;
case DWT_DIRAC_LEGALL5_3:
d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_sse2;
break;
case DWT_DIRAC_DD13_7:
d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_sse2;
d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
break;
case DWT_DIRAC_HAAR0:
d->vertical_compose = (void*)vertical_compose_haar_sse2;
d->horizontal_compose = horizontal_compose_haar0i_sse2;
break;
case DWT_DIRAC_HAAR1:
d->vertical_compose = (void*)vertical_compose_haar_sse2;
d->horizontal_compose = horizontal_compose_haar1i_sse2;
break;
}
if (!(mm_flags & AV_CPU_FLAG_SSSE3))
return;
switch (type) {
case DWT_DIRAC_DD9_7:
d->horizontal_compose = horizontal_compose_dd97i_ssse3;
break;
}
#endif // HAVE_YASM
}

View File

@@ -0,0 +1,30 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_DIRAC_DWT_H
#define AVCODEC_X86_DIRAC_DWT_H
#include "libavcodec/dirac_dwt.h"
void ff_horizontal_compose_dd97i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x);
void ff_horizontal_compose_haar1i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x);
void ff_horizontal_compose_haar0i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x);
void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type);
#endif

View File

@@ -0,0 +1,104 @@
/*
* Copyright (C) 2010 David Conrad
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "dsputil_x86.h"
#include "diracdsp_mmx.h"
void ff_put_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
#define HPEL_FILTER(MMSIZE, EXT) \
void ff_dirac_hpel_filter_v_ ## EXT(uint8_t *, const uint8_t *, int, int); \
void ff_dirac_hpel_filter_h_ ## EXT(uint8_t *, const uint8_t *, int); \
\
static void dirac_hpel_filter_ ## EXT(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, \
const uint8_t *src, int stride, int width, int height) \
{ \
while( height-- ) \
{ \
ff_dirac_hpel_filter_v_ ## EXT(dstv-MMSIZE, src-MMSIZE, stride, width+MMSIZE+5); \
ff_dirac_hpel_filter_h_ ## EXT(dsth, src, width); \
ff_dirac_hpel_filter_h_ ## EXT(dstc, dstv, width); \
\
dsth += stride; \
dstv += stride; \
dstc += stride; \
src += stride; \
} \
}
#if !ARCH_X86_64
HPEL_FILTER(8, mmx)
#endif
HPEL_FILTER(16, sse2)
#define PIXFUNC(PFX, IDX, EXT) \
/*MMXDISABLEDc->PFX ## _dirac_pixels_tab[0][IDX] = ff_ ## PFX ## _dirac_pixels8_ ## EXT;*/ \
c->PFX ## _dirac_pixels_tab[1][IDX] = ff_ ## PFX ## _dirac_pixels16_ ## EXT; \
c->PFX ## _dirac_pixels_tab[2][IDX] = ff_ ## PFX ## _dirac_pixels32_ ## EXT
void ff_diracdsp_init_mmx(DiracDSPContext* c)
{
int mm_flags = av_get_cpu_flags();
if (!(mm_flags & AV_CPU_FLAG_MMX))
return;
#if HAVE_YASM
c->add_dirac_obmc[0] = ff_add_dirac_obmc8_mmx;
#if !ARCH_X86_64
c->add_dirac_obmc[1] = ff_add_dirac_obmc16_mmx;
c->add_dirac_obmc[2] = ff_add_dirac_obmc32_mmx;
c->dirac_hpel_filter = dirac_hpel_filter_mmx;
c->add_rect_clamped = ff_add_rect_clamped_mmx;
c->put_signed_rect_clamped = ff_put_signed_rect_clamped_mmx;
#endif
#endif
#if HAVE_MMX_INLINE
PIXFUNC(put, 0, mmx);
PIXFUNC(avg, 0, mmx);
#endif
#if HAVE_MMXEXT_INLINE
if (mm_flags & AV_CPU_FLAG_MMX2) {
PIXFUNC(avg, 0, mmxext);
}
#endif
if (mm_flags & AV_CPU_FLAG_SSE2) {
#if HAVE_YASM
c->dirac_hpel_filter = dirac_hpel_filter_sse2;
c->add_rect_clamped = ff_add_rect_clamped_sse2;
c->put_signed_rect_clamped = ff_put_signed_rect_clamped_sse2;
c->add_dirac_obmc[1] = ff_add_dirac_obmc16_sse2;
c->add_dirac_obmc[2] = ff_add_dirac_obmc32_sse2;
#endif
#if HAVE_SSE2_INLINE
c->put_dirac_pixels_tab[1][0] = ff_put_dirac_pixels16_sse2;
c->avg_dirac_pixels_tab[1][0] = ff_avg_dirac_pixels16_sse2;
c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2;
c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2;
#endif
}
}

View File

@@ -0,0 +1,47 @@
/*
* Copyright (c) 2010 David Conrad
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_DIRACDSP_H
#define AVCODEC_X86_DIRACDSP_H
#include "libavcodec/diracdsp.h"
void ff_diracdsp_init_mmx(DiracDSPContext* c);
DECL_DIRAC_PIXOP(put, mmx);
DECL_DIRAC_PIXOP(avg, mmx);
DECL_DIRAC_PIXOP(avg, mmxext);
void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
void ff_add_rect_clamped_mmx(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
void ff_add_rect_clamped_sse2(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
void ff_add_dirac_obmc8_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_add_dirac_obmc16_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_add_dirac_obmc32_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_add_dirac_obmc16_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_add_dirac_obmc32_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
#endif

View File

@@ -0,0 +1,264 @@
;******************************************************************************
;* Copyright (c) 2010 David Conrad
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
pw_3: times 8 dw 3
pw_7: times 8 dw 7
pw_16: times 8 dw 16
pw_32: times 8 dw 32
pb_128: times 16 db 128
section .text
%macro UNPACK_ADD 6
mov%5 %1, %3
mov%6 m5, %4
mova m4, %1
mova %2, m5
punpcklbw %1, m7
punpcklbw m5, m7
punpckhbw m4, m7
punpckhbw %2, m7
paddw %1, m5
paddw %2, m4
%endmacro
%macro HPEL_FILTER 1
; dirac_hpel_filter_v_sse2(uint8_t *dst, uint8_t *src, int stride, int width);
cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3
mov src0q, srcq
lea stridex3q, [3*strideq]
sub src0q, stridex3q
pxor m7, m7
.loop:
; 7*(src[0] + src[1])
UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a
pmullw m0, [pw_7]
pmullw m1, [pw_7]
; 3*( ... + src[-2] + src[3])
UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a
paddw m0, m2
paddw m1, m3
pmullw m0, [pw_3]
pmullw m1, [pw_3]
; ... - 7*(src[-1] + src[2])
UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a
pmullw m2, [pw_7]
pmullw m3, [pw_7]
psubw m0, m2
psubw m1, m3
; ... - (src[-3] + src[4])
UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a
psubw m0, m2
psubw m1, m3
paddw m0, [pw_16]
paddw m1, [pw_16]
psraw m0, 5
psraw m1, 5
packuswb m0, m1
mova [dstq], m0
add dstq, mmsize
add srcq, mmsize
add src0q, mmsize
sub widthd, mmsize
jg .loop
RET
; dirac_hpel_filter_h_sse2(uint8_t *dst, uint8_t *src, int width);
cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width
dec widthd
pxor m7, m7
and widthd, ~(mmsize-1)
.loop:
; 7*(src[0] + src[1])
UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u
pmullw m0, [pw_7]
pmullw m1, [pw_7]
; 3*( ... + src[-2] + src[3])
UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u
paddw m0, m2
paddw m1, m3
pmullw m0, [pw_3]
pmullw m1, [pw_3]
; ... - 7*(src[-1] + src[2])
UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u
pmullw m2, [pw_7]
pmullw m3, [pw_7]
psubw m0, m2
psubw m1, m3
; ... - (src[-3] + src[4])
UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u
psubw m0, m2
psubw m1, m3
paddw m0, [pw_16]
paddw m1, [pw_16]
psraw m0, 5
psraw m1, 5
packuswb m0, m1
mova [dstq + widthq], m0
sub widthd, mmsize
jge .loop
RET
%endmacro
%macro PUT_RECT 1
; void put_rect_clamped(uint8_t *dst, int dst_stride, int16_t *src, int src_stride, int width, int height)
cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w, dst2, src2
mova m0, [pb_128]
add wd, (mmsize-1)
and wd, ~(mmsize-1)
%if ARCH_X86_64
movsxd dst_strideq, dst_strided
movsxd src_strideq, src_strided
mov r7d, r5m
mov r8d, wd
%define wspill r8d
%define hd r7d
%else
mov r4m, wd
%define wspill r4m
%define hd r5mp
%endif
.loopy
lea src2q, [srcq+src_strideq*2]
lea dst2q, [dstq+dst_strideq]
.loopx:
sub wd, mmsize
mova m1, [srcq +2*wq]
mova m2, [src2q+2*wq]
packsswb m1, [srcq +2*wq+mmsize]
packsswb m2, [src2q+2*wq+mmsize]
paddb m1, m0
paddb m2, m0
mova [dstq +wq], m1
mova [dst2q+wq], m2
jg .loopx
lea srcq, [srcq+src_strideq*4]
lea dstq, [dstq+dst_strideq*2]
sub hd, 2
mov wd, wspill
jg .loopy
RET
%endm
%macro ADD_RECT 1
; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height)
cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h
mova m0, [pw_32]
add wd, (mmsize-1)
and wd, ~(mmsize-1)
%if ARCH_X86_64
movsxd strideq, strided
movsxd idwt_strideq, idwt_strided
mov r8d, wd
%define wspill r8d
%else
mov r5m, wd
%define wspill r5m
%endif
.loop:
sub wd, mmsize
movu m1, [srcq +2*wq] ; FIXME: ensure alignment
paddw m1, m0
psraw m1, 6
movu m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment
paddw m2, m0
psraw m2, 6
paddw m1, [idwtq+2*wq]
paddw m2, [idwtq+2*wq+mmsize]
packuswb m1, m2
mova [dstq +wq], m1
jg .loop
lea srcq, [srcq + 2*strideq]
add dstq, strideq
lea idwtq, [idwtq+ 2*idwt_strideq]
sub hd, 1
mov wd, wspill
jg .loop
RET
%endm
%macro ADD_OBMC 2
; void add_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen)
cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
pxor m4, m4
.loop:
%assign i 0
%rep %1 / mmsize
mova m0, [srcq+i]
mova m1, m0
punpcklbw m0, m4
punpckhbw m1, m4
mova m2, [obmcq+i]
mova m3, m2
punpcklbw m2, m4
punpckhbw m3, m4
pmullw m0, m2
pmullw m1, m3
movu m2, [dstq+2*i]
movu m3, [dstq+2*i+mmsize]
paddw m0, m2
paddw m1, m3
movu [dstq+2*i], m0
movu [dstq+2*i+mmsize], m1
%assign i i+mmsize
%endrep
lea srcq, [srcq+strideq]
lea dstq, [dstq+2*strideq]
add obmcq, 32
sub yblend, 1
jg .loop
RET
%endm
INIT_MMX
%if ARCH_X86_64 == 0
PUT_RECT mmx
ADD_RECT mmx
HPEL_FILTER mmx
ADD_OBMC 32, mmx
ADD_OBMC 16, mmx
%endif
ADD_OBMC 8, mmx
INIT_XMM
PUT_RECT sse2
ADD_RECT sse2
HPEL_FILTER sse2
ADD_OBMC 32, sse2
ADD_OBMC 16, sse2

View File

@@ -0,0 +1,67 @@
/*
* VC3/DNxHD SIMD functions
* Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
*
* VC-3 encoder funded by the British Broadcasting Corporation
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dnxhdenc.h"
#if HAVE_SSE2_INLINE
static void get_pixels_8x4_sym_sse2(int16_t *block, const uint8_t *pixels, int line_size)
{
__asm__ volatile(
"pxor %%xmm5, %%xmm5 \n\t"
"movq (%0), %%xmm0 \n\t"
"add %2, %0 \n\t"
"movq (%0), %%xmm1 \n\t"
"movq (%0, %2), %%xmm2 \n\t"
"movq (%0, %2,2), %%xmm3 \n\t"
"punpcklbw %%xmm5, %%xmm0 \n\t"
"punpcklbw %%xmm5, %%xmm1 \n\t"
"punpcklbw %%xmm5, %%xmm2 \n\t"
"punpcklbw %%xmm5, %%xmm3 \n\t"
"movdqa %%xmm0, (%1) \n\t"
"movdqa %%xmm1, 16(%1) \n\t"
"movdqa %%xmm2, 32(%1) \n\t"
"movdqa %%xmm3, 48(%1) \n\t"
"movdqa %%xmm3 , 64(%1) \n\t"
"movdqa %%xmm2 , 80(%1) \n\t"
"movdqa %%xmm1 , 96(%1) \n\t"
"movdqa %%xmm0, 112(%1) \n\t"
: "+r" (pixels)
: "r" (block), "r" ((x86_reg)line_size)
);
}
#endif /* HAVE_SSE2_INLINE */
av_cold void ff_dnxhdenc_init_x86(DNXHDEncContext *ctx)
{
#if HAVE_SSE2_INLINE
if (INLINE_SSE2(av_get_cpu_flags())) {
if (ctx->cid_table->bit_depth == 8)
ctx->get_pixels_8x4_sym = get_pixels_8x4_sym_sse2;
}
#endif /* HAVE_SSE2_INLINE */
}

View File

@@ -0,0 +1,653 @@
;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
pb_f: times 16 db 15
pb_zzzzzzzz77777777: times 8 db -1
pb_7: times 8 db 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
pb_revwords: SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0
pd_16384: times 4 dd 16384
pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
SECTION_TEXT
%macro SCALARPRODUCT 0
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
cglobal scalarproduct_int16, 3,3,3, v1, v2, order
shl orderq, 1
add v1q, orderq
add v2q, orderq
neg orderq
pxor m2, m2
.loop:
movu m0, [v1q + orderq]
movu m1, [v1q + orderq + mmsize]
pmaddwd m0, [v2q + orderq]
pmaddwd m1, [v2q + orderq + mmsize]
paddd m2, m0
paddd m2, m1
add orderq, mmsize*2
jl .loop
%if mmsize == 16
movhlps m0, m2
paddd m2, m0
pshuflw m0, m2, 0x4e
%else
pshufw m0, m2, 0x4e
%endif
paddd m2, m0
movd eax, m2
RET
; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
shl orderq, 1
movd m7, mulm
%if mmsize == 16
pshuflw m7, m7, 0
punpcklqdq m7, m7
%else
pshufw m7, m7, 0
%endif
pxor m6, m6
add v1q, orderq
add v2q, orderq
add v3q, orderq
neg orderq
.loop:
movu m0, [v2q + orderq]
movu m1, [v2q + orderq + mmsize]
mova m4, [v1q + orderq]
mova m5, [v1q + orderq + mmsize]
movu m2, [v3q + orderq]
movu m3, [v3q + orderq + mmsize]
pmaddwd m0, m4
pmaddwd m1, m5
pmullw m2, m7
pmullw m3, m7
paddd m6, m0
paddd m6, m1
paddw m2, m4
paddw m3, m5
mova [v1q + orderq], m2
mova [v1q + orderq + mmsize], m3
add orderq, mmsize*2
jl .loop
%if mmsize == 16
movhlps m0, m6
paddd m6, m0
pshuflw m0, m6, 0x4e
%else
pshufw m0, m6, 0x4e
%endif
paddd m6, m0
movd eax, m6
RET
%endmacro
INIT_MMX mmxext
SCALARPRODUCT
INIT_XMM sse2
SCALARPRODUCT
%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
sub orderq, mmsize*2
%if %1
mova m1, m4
mova m4, [v2q + orderq]
mova m0, [v2q + orderq + mmsize]
palignr m1, m0, %1
palignr m0, m4, %1
mova m3, m5
mova m5, [v3q + orderq]
mova m2, [v3q + orderq + mmsize]
palignr m3, m2, %1
palignr m2, m5, %1
%else
mova m0, [v2q + orderq]
mova m1, [v2q + orderq + mmsize]
mova m2, [v3q + orderq]
mova m3, [v3q + orderq + mmsize]
%endif
%define t0 [v1q + orderq]
%define t1 [v1q + orderq + mmsize]
%if ARCH_X86_64
mova m8, t0
mova m9, t1
%define t0 m8
%define t1 m9
%endif
pmaddwd m0, t0
pmaddwd m1, t1
pmullw m2, m7
pmullw m3, m7
paddw m2, t0
paddw m3, t1
paddd m6, m0
paddd m6, m1
mova [v1q + orderq], m2
mova [v1q + orderq + mmsize], m3
jg .loop%1
%if %1
jmp .end
%endif
%endmacro
; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
INIT_XMM ssse3
cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
shl orderq, 1
movd m7, mulm
pshuflw m7, m7, 0
punpcklqdq m7, m7
pxor m6, m6
mov r4d, v2d
and r4d, 15
and v2q, ~15
and v3q, ~15
mova m4, [v2q + orderq]
mova m5, [v3q + orderq]
; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
cmp r4d, 0
je .loop0
cmp r4d, 2
je .loop2
cmp r4d, 4
je .loop4
cmp r4d, 6
je .loop6
cmp r4d, 8
je .loop8
cmp r4d, 10
je .loop10
cmp r4d, 12
je .loop12
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
movhlps m0, m6
paddd m6, m0
pshuflw m0, m6, 0x4e
paddd m6, m0
movd eax, m6
RET
;-----------------------------------------------------------------------------
; void ff_apply_window_int16(int16_t *output, const int16_t *input,
; const int16_t *window, unsigned int len)
;-----------------------------------------------------------------------------
%macro REVERSE_WORDS 1-2
%if cpuflag(ssse3) && notcpuflag(atom)
pshufb %1, %2
%elif cpuflag(sse2)
pshuflw %1, %1, 0x1B
pshufhw %1, %1, 0x1B
pshufd %1, %1, 0x4E
%elif cpuflag(mmxext)
pshufw %1, %1, 0x1B
%endif
%endmacro
%macro MUL16FIXED 3
%if cpuflag(ssse3) ; dst, src, unused
; dst = ((dst * src) + (1<<14)) >> 15
pmulhrsw %1, %2
%elif cpuflag(mmxext) ; dst, src, temp
; dst = (dst * src) >> 15
; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
; in from the pmullw result.
mova %3, %1
pmulhw %1, %2
pmullw %3, %2
psrlw %3, 15
psllw %1, 1
por %1, %3
%endif
%endmacro
%macro APPLY_WINDOW_INT16 1 ; %1 bitexact version
%if %1
cglobal apply_window_int16, 4,5,6, output, input, window, offset, offset2
%else
cglobal apply_window_int16_round, 4,5,6, output, input, window, offset, offset2
%endif
lea offset2q, [offsetq-mmsize]
%if cpuflag(ssse3) && notcpuflag(atom)
mova m5, [pb_revwords]
ALIGN 16
%elif %1
mova m5, [pd_16384]
%endif
.loop:
%if cpuflag(ssse3)
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The ssse3 version is bit-identical.
mova m0, [windowq+offset2q]
mova m1, [ inputq+offset2q]
pmulhrsw m1, m0
REVERSE_WORDS m0, m5
pmulhrsw m0, [ inputq+offsetq ]
mova [outputq+offset2q], m1
mova [outputq+offsetq ], m0
%elif %1
; This version expands 16-bit to 32-bit, multiplies by the window,
; adds 16384 for rounding, right shifts 15, then repacks back to words to
; save to the output. The window is reversed for the second half.
mova m3, [windowq+offset2q]
mova m4, [ inputq+offset2q]
pxor m0, m0
punpcklwd m0, m3
punpcklwd m1, m4
pmaddwd m0, m1
paddd m0, m5
psrad m0, 15
pxor m2, m2
punpckhwd m2, m3
punpckhwd m1, m4
pmaddwd m2, m1
paddd m2, m5
psrad m2, 15
packssdw m0, m2
mova [outputq+offset2q], m0
REVERSE_WORDS m3
mova m4, [ inputq+offsetq]
pxor m0, m0
punpcklwd m0, m3
punpcklwd m1, m4
pmaddwd m0, m1
paddd m0, m5
psrad m0, 15
pxor m2, m2
punpckhwd m2, m3
punpckhwd m1, m4
pmaddwd m2, m1
paddd m2, m5
psrad m2, 15
packssdw m0, m2
mova [outputq+offsetq], m0
%else
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The mmxext and sse2 versions do not use rounding, and
; therefore are not bit-identical to the C version.
mova m0, [windowq+offset2q]
mova m1, [ inputq+offset2q]
mova m2, [ inputq+offsetq ]
MUL16FIXED m1, m0, m3
REVERSE_WORDS m0
MUL16FIXED m2, m0, m3
mova [outputq+offset2q], m1
mova [outputq+offsetq ], m2
%endif
add offsetd, mmsize
sub offset2d, mmsize
jae .loop
REP_RET
%endmacro
INIT_MMX mmxext
APPLY_WINDOW_INT16 0
INIT_XMM sse2
APPLY_WINDOW_INT16 0
INIT_MMX mmxext
APPLY_WINDOW_INT16 1
INIT_XMM sse2
APPLY_WINDOW_INT16 1
INIT_XMM ssse3
APPLY_WINDOW_INT16 1
INIT_XMM ssse3, atom
APPLY_WINDOW_INT16 1
; void add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
INIT_MMX mmxext
cglobal add_hfyu_median_prediction, 6,6,0, dst, top, diff, w, left, left_top
movq mm0, [topq]
movq mm2, mm0
movd mm4, [left_topq]
psllq mm2, 8
movq mm1, mm0
por mm4, mm2
movd mm3, [leftq]
psubb mm0, mm4 ; t-tl
add dstq, wq
add topq, wq
add diffq, wq
neg wq
jmp .skip
.loop:
movq mm4, [topq+wq]
movq mm0, mm4
psllq mm4, 8
por mm4, mm1
movq mm1, mm0 ; t
psubb mm0, mm4 ; t-tl
.skip:
movq mm2, [diffq+wq]
%assign i 0
%rep 8
movq mm4, mm0
paddb mm4, mm3 ; t-tl+l
movq mm5, mm3
pmaxub mm3, mm1
pminub mm5, mm1
pminub mm3, mm4
pmaxub mm3, mm5 ; median
paddb mm3, mm2 ; +residual
%if i==0
movq mm7, mm3
psllq mm7, 56
%else
movq mm6, mm3
psrlq mm7, 8
psllq mm6, 56
por mm7, mm6
%endif
%if i<7
psrlq mm0, 8
psrlq mm1, 8
psrlq mm2, 8
%endif
%assign i i+1
%endrep
movq [dstq+wq], mm7
add wq, 8
jl .loop
movzx r2d, byte [dstq-1]
mov [leftq], r2d
movzx r2d, byte [topq-1]
mov [left_topq], r2d
RET
%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
add srcq, wq
add dstq, wq
neg wq
%%.loop:
%if %2
mova m1, [srcq+wq]
%else
movu m1, [srcq+wq]
%endif
mova m2, m1
psllw m1, 8
paddb m1, m2
mova m2, m1
pshufb m1, m3
paddb m1, m2
pshufb m0, m5
mova m2, m1
pshufb m1, m4
paddb m1, m2
%if mmsize == 16
mova m2, m1
pshufb m1, m6
paddb m1, m2
%endif
paddb m0, m1
%if %1
mova [dstq+wq], m0
%else
movq [dstq+wq], m0
movhps [dstq+wq+8], m0
%endif
add wq, mmsize
jl %%.loop
mov eax, mmsize-1
sub eax, wd
movd m1, eax
pshufb m0, m1
movd eax, m0
RET
%endmacro
; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
INIT_MMX ssse3
cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
.skip_prologue:
mova m5, [pb_7]
mova m4, [pb_zzzz3333zzzzbbbb]
mova m3, [pb_zz11zz55zz99zzdd]
movd m0, leftm
psllq m0, 56
ADD_HFYU_LEFT_LOOP 1, 1
INIT_XMM sse4
cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
mova m5, [pb_f]
mova m6, [pb_zzzzzzzz77777777]
mova m4, [pb_zzzz3333zzzzbbbb]
mova m3, [pb_zz11zz55zz99zzdd]
movd m0, leftm
pslldq m0, 15
test srcq, 15
jnz .src_unaligned
test dstq, 15
jnz .dst_unaligned
ADD_HFYU_LEFT_LOOP 1, 1
.dst_unaligned:
ADD_HFYU_LEFT_LOOP 0, 1
.src_unaligned:
ADD_HFYU_LEFT_LOOP 0, 0
;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
; int32_t max, unsigned int len)
;-----------------------------------------------------------------------------
; %1 = number of xmm registers used
; %2 = number of inline load/process/store loops per asm loop
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
; %5 = suffix
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
%if %4
cvtsi2ss m4, minm
cvtsi2ss m5, maxm
%else
movd m4, minm
movd m5, maxm
%endif
SPLATD m4
SPLATD m5
.loop:
%assign %%i 1
%rep %2
mova m0, [srcq+mmsize*0*%%i]
mova m1, [srcq+mmsize*1*%%i]
mova m2, [srcq+mmsize*2*%%i]
mova m3, [srcq+mmsize*3*%%i]
%if %3
mova m7, [srcq+mmsize*4*%%i]
mova m8, [srcq+mmsize*5*%%i]
mova m9, [srcq+mmsize*6*%%i]
mova m10, [srcq+mmsize*7*%%i]
%endif
CLIPD m0, m4, m5, m6
CLIPD m1, m4, m5, m6
CLIPD m2, m4, m5, m6
CLIPD m3, m4, m5, m6
%if %3
CLIPD m7, m4, m5, m6
CLIPD m8, m4, m5, m6
CLIPD m9, m4, m5, m6
CLIPD m10, m4, m5, m6
%endif
mova [dstq+mmsize*0*%%i], m0
mova [dstq+mmsize*1*%%i], m1
mova [dstq+mmsize*2*%%i], m2
mova [dstq+mmsize*3*%%i], m3
%if %3
mova [dstq+mmsize*4*%%i], m7
mova [dstq+mmsize*5*%%i], m8
mova [dstq+mmsize*6*%%i], m9
mova [dstq+mmsize*7*%%i], m10
%endif
%assign %%i %%i+1
%endrep
add srcq, mmsize*4*(%2+%3)
add dstq, mmsize*4*(%2+%3)
sub lend, mmsize*(%2+%3)
jg .loop
REP_RET
%endmacro
INIT_MMX mmx
%define CLIPD CLIPD_MMX
VECTOR_CLIP_INT32 0, 1, 0, 0
INIT_XMM sse2
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
%define CLIPD CLIPD_SSE2
VECTOR_CLIP_INT32 6, 2, 0, 1
INIT_XMM sse4
%define CLIPD CLIPD_SSE41
%ifdef m8
VECTOR_CLIP_INT32 11, 1, 1, 0
%else
VECTOR_CLIP_INT32 6, 1, 0, 0
%endif
; %1 = aligned/unaligned
%macro BSWAP_LOOPS 1
mov r3, r2
sar r2, 3
jz .left4_%1
.loop8_%1:
mov%1 m0, [r1 + 0]
mov%1 m1, [r1 + 16]
%if cpuflag(ssse3)
pshufb m0, m2
pshufb m1, m2
mov%1 [r0 + 0], m0
mov%1 [r0 + 16], m1
%else
pshuflw m0, m0, 10110001b
pshuflw m1, m1, 10110001b
pshufhw m0, m0, 10110001b
pshufhw m1, m1, 10110001b
mova m2, m0
mova m3, m1
psllw m0, 8
psllw m1, 8
psrlw m2, 8
psrlw m3, 8
por m2, m0
por m3, m1
mov%1 [r0 + 0], m2
mov%1 [r0 + 16], m3
%endif
add r0, 32
add r1, 32
dec r2
jnz .loop8_%1
.left4_%1:
mov r2, r3
and r3, 4
jz .left
mov%1 m0, [r1]
%if cpuflag(ssse3)
pshufb m0, m2
mov%1 [r0], m0
%else
pshuflw m0, m0, 10110001b
pshufhw m0, m0, 10110001b
mova m2, m0
psllw m0, 8
psrlw m2, 8
por m2, m0
mov%1 [r0], m2
%endif
add r1, 16
add r0, 16
%endmacro
; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
%macro BSWAP32_BUF 0
%if cpuflag(ssse3)
cglobal bswap32_buf, 3,4,3
mov r3, r1
mova m2, [pb_bswap32]
%else
cglobal bswap32_buf, 3,4,5
mov r3, r1
%endif
or r3, r0
and r3, 15
jz .start_align
BSWAP_LOOPS u
jmp .left
.start_align:
BSWAP_LOOPS a
.left:
%if cpuflag(ssse3)
mov r3, r2
and r2, 2
jz .left1
movq m0, [r1]
pshufb m0, m2
movq [r0], m0
add r1, 8
add r0, 8
.left1:
and r3, 1
jz .end
mov r2d, [r1]
bswap r2d
mov [r0], r2d
%else
and r2, 3
jz .end
.loop2:
mov r3d, [r1]
bswap r3d
mov [r0], r3d
add r1, 4
add r0, 4
dec r2
jnz .loop2
%endif
.end:
RET
%endmacro
INIT_XMM sse2
BSWAP32_BUF
INIT_XMM ssse3
BSWAP32_BUF

View File

@@ -0,0 +1,733 @@
/*
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/simple_idct.h"
#include "dsputil_x86.h"
#include "idct_xvid.h"
void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
uint8_t *src2, int dstStride,
int src1Stride, int h);
void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride, int h);
void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride, int h);
void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride,
int h);
void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride, int h);
void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride, int h);
void ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride,
int h);
void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride);
void ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride);
void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride);
void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride);
void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride);
void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride);
#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext
void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale);
void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale);
int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
int order);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
int order);
int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
const int16_t *v3,
int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
const int16_t *v3,
int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
const int16_t *v3,
int order, int mul);
void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
const uint8_t *diff, int w,
int *left, int *left_top);
int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
int w, int left);
int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
int w, int left);
void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src,
int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src,
int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src,
int32_t min, int32_t max, unsigned int len);
#if HAVE_YASM
PIXELS16(static, ff_avg, , , _mmxext)
PIXELS16(static, ff_put, , , _mmxext)
#define QPEL_OP(OPNAME, RND, MMX) \
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
} \
\
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[8]; \
uint8_t * const half = (uint8_t*)temp; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
stride, 8); \
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
stride, stride, 8); \
} \
\
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
stride, 8); \
} \
\
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[8]; \
uint8_t * const half = (uint8_t*)temp; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
stride, 8); \
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
stride, 8); \
} \
\
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[8]; \
uint8_t * const half = (uint8_t*)temp; \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
8, stride); \
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
stride, stride, 8); \
} \
\
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, \
stride, stride); \
} \
\
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[8]; \
uint8_t * const half = (uint8_t*)temp; \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
8, stride); \
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,\
stride, 8); \
} \
\
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half) + 64; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half) + 64; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half) + 64; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half) + 64; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half) + 64; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half) + 64; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, \
8, stride, 9); \
ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
stride, 8); \
} \
\
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
stride, 9); \
ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
stride, 8); \
} \
\
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[9]; \
uint8_t * const halfH = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
stride, 8); \
} \
\
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
} \
\
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[32]; \
uint8_t * const half = (uint8_t*)temp; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
stride, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
stride, 16); \
} \
\
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
stride, stride, 16);\
} \
\
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[32]; \
uint8_t * const half = (uint8_t*)temp; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
stride, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
stride, stride, 16); \
} \
\
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[32]; \
uint8_t * const half = (uint8_t*)temp; \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
stride); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
stride, 16); \
} \
\
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, \
stride, stride); \
} \
\
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[32]; \
uint8_t * const half = (uint8_t*)temp; \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
stride); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \
stride, stride, 16); \
} \
\
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half) + 256; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half) + 256; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half) + 256; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half) + 256; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half) + 256; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half) + 256; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
stride, 17); \
ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
stride, 16); \
} \
\
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
stride, 17); \
ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
stride, 16); \
} \
\
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
stride, 16); \
}
QPEL_OP(put_, _, mmxext)
QPEL_OP(avg_, _, mmxext)
QPEL_OP(put_no_rnd_, _no_rnd_, mmxext)
#endif /* HAVE_YASM */
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
do { \
c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
} while (0)
static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
int cpu_flags)
{
#if HAVE_MMX_INLINE
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
if (!high_bit_depth) {
c->clear_block = ff_clear_block_mmx;
c->clear_blocks = ff_clear_blocks_mmx;
c->draw_edges = ff_draw_edges_mmx;
}
#if CONFIG_VIDEODSP && (ARCH_X86_32 || !HAVE_YASM)
c->gmc = ff_gmc_mmx;
#endif
c->add_bytes = ff_add_bytes_mmx;
#endif /* HAVE_MMX_INLINE */
#if HAVE_MMX_EXTERNAL
if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx;
c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx;
}
c->vector_clip_int32 = ff_vector_clip_int32_mmx;
#endif /* HAVE_MMX_EXTERNAL */
}
static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
int cpu_flags)
{
#if HAVE_MMXEXT_INLINE
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX && avctx->lowres == 0) {
c->idct_put = ff_idct_xvid_mmxext_put;
c->idct_add = ff_idct_xvid_mmxext_add;
c->idct = ff_idct_xvid_mmxext;
}
#endif /* HAVE_MMXEXT_INLINE */
#if HAVE_MMXEXT_EXTERNAL
SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
/* slower than cmov version on AMD */
if (!(cpu_flags & AV_CPU_FLAG_3DNOW))
c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
if (avctx->flags & CODEC_FLAG_BITEXACT) {
c->apply_window_int16 = ff_apply_window_int16_mmxext;
} else {
c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
}
#endif /* HAVE_MMXEXT_EXTERNAL */
}
static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
int cpu_flags)
{
#if HAVE_SSE_INLINE
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
if (!high_bit_depth) {
if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
/* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
c->clear_block = ff_clear_block_sse;
c->clear_blocks = ff_clear_blocks_sse;
}
}
c->vector_clipf = ff_vector_clipf_sse;
#endif /* HAVE_SSE_INLINE */
#if HAVE_YASM
#if HAVE_INLINE_ASM && CONFIG_VIDEODSP
c->gmc = ff_gmc_sse;
#endif
#endif /* HAVE_YASM */
}
static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
int cpu_flags)
{
#if HAVE_SSE2_INLINE
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX && avctx->lowres == 0) {
c->idct_put = ff_idct_xvid_sse2_put;
c->idct_add = ff_idct_xvid_sse2_add;
c->idct = ff_idct_xvid_sse2;
c->idct_permutation_type = FF_SSE2_IDCT_PERM;
}
#endif /* HAVE_SSE2_INLINE */
#if HAVE_SSE2_EXTERNAL
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
if (cpu_flags & AV_CPU_FLAG_ATOM) {
c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
} else {
c->vector_clip_int32 = ff_vector_clip_int32_sse2;
}
if (avctx->flags & CODEC_FLAG_BITEXACT) {
c->apply_window_int16 = ff_apply_window_int16_sse2;
} else if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW)) {
c->apply_window_int16 = ff_apply_window_int16_round_sse2;
}
c->bswap_buf = ff_bswap32_buf_sse2;
#endif /* HAVE_SSE2_EXTERNAL */
}
static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
int cpu_flags)
{
#if HAVE_SSSE3_EXTERNAL
c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
if (cpu_flags & AV_CPU_FLAG_SSE4) // not really SSE4, just slow on Conroe
c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
if (cpu_flags & AV_CPU_FLAG_ATOM)
c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
else
c->apply_window_int16 = ff_apply_window_int16_ssse3;
if (!(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
c->bswap_buf = ff_bswap32_buf_ssse3;
#endif /* HAVE_SSSE3_EXTERNAL */
}
static av_cold void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
int cpu_flags)
{
#if HAVE_SSE4_EXTERNAL
c->vector_clip_int32 = ff_vector_clip_int32_sse4;
#endif /* HAVE_SSE4_EXTERNAL */
}
av_cold void ff_dsputil_init_x86(DSPContext *c, AVCodecContext *avctx)
{
int cpu_flags = av_get_cpu_flags();
#if HAVE_7REGS && HAVE_INLINE_ASM
if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_CMOV)
c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_cmov;
#endif
if (X86_MMX(cpu_flags)) {
#if HAVE_INLINE_ASM
const int idct_algo = avctx->idct_algo;
if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
c->idct_put = ff_simple_idct_put_mmx;
c->idct_add = ff_simple_idct_add_mmx;
c->idct = ff_simple_idct_mmx;
c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
} else if (idct_algo == FF_IDCT_XVIDMMX) {
c->idct_put = ff_idct_xvid_mmx_put;
c->idct_add = ff_idct_xvid_mmx_add;
c->idct = ff_idct_xvid_mmx;
}
}
#endif /* HAVE_INLINE_ASM */
dsputil_init_mmx(c, avctx, cpu_flags);
}
if (X86_MMXEXT(cpu_flags))
dsputil_init_mmxext(c, avctx, cpu_flags);
if (X86_SSE(cpu_flags))
dsputil_init_sse(c, avctx, cpu_flags);
if (X86_SSE2(cpu_flags))
dsputil_init_sse2(c, avctx, cpu_flags);
if (EXTERNAL_SSSE3(cpu_flags))
dsputil_init_ssse3(c, avctx, cpu_flags);
if (EXTERNAL_SSE4(cpu_flags))
dsputil_init_sse4(c, avctx, cpu_flags);
if (CONFIG_ENCODERS)
ff_dsputilenc_init_mmx(c, avctx);
}

View File

@@ -0,0 +1,637 @@
/*
* MMX optimized DSP utils
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
*/
#include "config.h"
#include "libavutil/avassert.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/videodsp.h"
#include "constants.h"
#include "dsputil_x86.h"
#include "diracdsp_mmx.h"
#if HAVE_INLINE_ASM
void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
int line_size)
{
const int16_t *p;
uint8_t *pix;
/* read the pixels */
p = block;
pix = pixels;
/* unrolled loop */
__asm__ volatile (
"movq (%3), %%mm0 \n\t"
"movq 8(%3), %%mm1 \n\t"
"movq 16(%3), %%mm2 \n\t"
"movq 24(%3), %%mm3 \n\t"
"movq 32(%3), %%mm4 \n\t"
"movq 40(%3), %%mm5 \n\t"
"movq 48(%3), %%mm6 \n\t"
"movq 56(%3), %%mm7 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"packuswb %%mm3, %%mm2 \n\t"
"packuswb %%mm5, %%mm4 \n\t"
"packuswb %%mm7, %%mm6 \n\t"
"movq %%mm0, (%0) \n\t"
"movq %%mm2, (%0, %1) \n\t"
"movq %%mm4, (%0, %1, 2) \n\t"
"movq %%mm6, (%0, %2) \n\t"
:: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
"r"(p)
: "memory");
pix += line_size * 4;
p += 32;
// if here would be an exact copy of the code above
// compiler would generate some very strange code
// thus using "r"
__asm__ volatile (
"movq (%3), %%mm0 \n\t"
"movq 8(%3), %%mm1 \n\t"
"movq 16(%3), %%mm2 \n\t"
"movq 24(%3), %%mm3 \n\t"
"movq 32(%3), %%mm4 \n\t"
"movq 40(%3), %%mm5 \n\t"
"movq 48(%3), %%mm6 \n\t"
"movq 56(%3), %%mm7 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"packuswb %%mm3, %%mm2 \n\t"
"packuswb %%mm5, %%mm4 \n\t"
"packuswb %%mm7, %%mm6 \n\t"
"movq %%mm0, (%0) \n\t"
"movq %%mm2, (%0, %1) \n\t"
"movq %%mm4, (%0, %1, 2) \n\t"
"movq %%mm6, (%0, %2) \n\t"
:: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
: "memory");
}
#define put_signed_pixels_clamped_mmx_half(off) \
"movq "#off"(%2), %%mm1 \n\t" \
"movq 16 + "#off"(%2), %%mm2 \n\t" \
"movq 32 + "#off"(%2), %%mm3 \n\t" \
"movq 48 + "#off"(%2), %%mm4 \n\t" \
"packsswb 8 + "#off"(%2), %%mm1 \n\t" \
"packsswb 24 + "#off"(%2), %%mm2 \n\t" \
"packsswb 40 + "#off"(%2), %%mm3 \n\t" \
"packsswb 56 + "#off"(%2), %%mm4 \n\t" \
"paddb %%mm0, %%mm1 \n\t" \
"paddb %%mm0, %%mm2 \n\t" \
"paddb %%mm0, %%mm3 \n\t" \
"paddb %%mm0, %%mm4 \n\t" \
"movq %%mm1, (%0) \n\t" \
"movq %%mm2, (%0, %3) \n\t" \
"movq %%mm3, (%0, %3, 2) \n\t" \
"movq %%mm4, (%0, %1) \n\t"
void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
int line_size)
{
x86_reg line_skip = line_size;
x86_reg line_skip3;
__asm__ volatile (
"movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
"lea (%3, %3, 2), %1 \n\t"
put_signed_pixels_clamped_mmx_half(0)
"lea (%0, %3, 4), %0 \n\t"
put_signed_pixels_clamped_mmx_half(64)
: "+&r"(pixels), "=&r"(line_skip3)
: "r"(block), "r"(line_skip)
: "memory");
}
void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
int line_size)
{
const int16_t *p;
uint8_t *pix;
int i;
/* read the pixels */
p = block;
pix = pixels;
MOVQ_ZERO(mm7);
i = 4;
do {
__asm__ volatile (
"movq (%2), %%mm0 \n\t"
"movq 8(%2), %%mm1 \n\t"
"movq 16(%2), %%mm2 \n\t"
"movq 24(%2), %%mm3 \n\t"
"movq %0, %%mm4 \n\t"
"movq %1, %%mm6 \n\t"
"movq %%mm4, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpckhbw %%mm7, %%mm5 \n\t"
"paddsw %%mm4, %%mm0 \n\t"
"paddsw %%mm5, %%mm1 \n\t"
"movq %%mm6, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm6 \n\t"
"punpckhbw %%mm7, %%mm5 \n\t"
"paddsw %%mm6, %%mm2 \n\t"
"paddsw %%mm5, %%mm3 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"packuswb %%mm3, %%mm2 \n\t"
"movq %%mm0, %0 \n\t"
"movq %%mm2, %1 \n\t"
: "+m"(*pix), "+m"(*(pix + line_size))
: "r"(p)
: "memory");
pix += line_size * 2;
p += 16;
} while (--i);
}
#define CLEAR_BLOCKS(name, n) \
void name(int16_t *blocks) \
{ \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"mov %1, %%"REG_a" \n\t" \
"1: \n\t" \
"movq %%mm7, (%0, %%"REG_a") \n\t" \
"movq %%mm7, 8(%0, %%"REG_a") \n\t" \
"movq %%mm7, 16(%0, %%"REG_a") \n\t" \
"movq %%mm7, 24(%0, %%"REG_a") \n\t" \
"add $32, %%"REG_a" \n\t" \
"js 1b \n\t" \
:: "r"(((uint8_t *)blocks) + 128 * n), \
"i"(-128 * n) \
: "%"REG_a \
); \
}
CLEAR_BLOCKS(ff_clear_blocks_mmx, 6)
CLEAR_BLOCKS(ff_clear_block_mmx, 1)
void ff_clear_block_sse(int16_t *block)
{
__asm__ volatile (
"xorps %%xmm0, %%xmm0 \n"
"movaps %%xmm0, (%0) \n"
"movaps %%xmm0, 16(%0) \n"
"movaps %%xmm0, 32(%0) \n"
"movaps %%xmm0, 48(%0) \n"
"movaps %%xmm0, 64(%0) \n"
"movaps %%xmm0, 80(%0) \n"
"movaps %%xmm0, 96(%0) \n"
"movaps %%xmm0, 112(%0) \n"
:: "r"(block)
: "memory"
);
}
void ff_clear_blocks_sse(int16_t *blocks)
{
__asm__ volatile (
"xorps %%xmm0, %%xmm0 \n"
"mov %1, %%"REG_a" \n"
"1: \n"
"movaps %%xmm0, (%0, %%"REG_a") \n"
"movaps %%xmm0, 16(%0, %%"REG_a") \n"
"movaps %%xmm0, 32(%0, %%"REG_a") \n"
"movaps %%xmm0, 48(%0, %%"REG_a") \n"
"movaps %%xmm0, 64(%0, %%"REG_a") \n"
"movaps %%xmm0, 80(%0, %%"REG_a") \n"
"movaps %%xmm0, 96(%0, %%"REG_a") \n"
"movaps %%xmm0, 112(%0, %%"REG_a") \n"
"add $128, %%"REG_a" \n"
"js 1b \n"
:: "r"(((uint8_t *)blocks) + 128 * 6),
"i"(-128 * 6)
: "%"REG_a
);
}
void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
{
x86_reg i = 0;
__asm__ volatile (
"jmp 2f \n\t"
"1: \n\t"
"movq (%1, %0), %%mm0 \n\t"
"movq (%2, %0), %%mm1 \n\t"
"paddb %%mm0, %%mm1 \n\t"
"movq %%mm1, (%2, %0) \n\t"
"movq 8(%1, %0), %%mm0 \n\t"
"movq 8(%2, %0), %%mm1 \n\t"
"paddb %%mm0, %%mm1 \n\t"
"movq %%mm1, 8(%2, %0) \n\t"
"add $16, %0 \n\t"
"2: \n\t"
"cmp %3, %0 \n\t"
"js 1b \n\t"
: "+r"(i)
: "r"(src), "r"(dst), "r"((x86_reg)w - 15)
);
for ( ; i < w; i++)
dst[i + 0] += src[i + 0];
}
/* Draw the edges of width 'w' of an image of size width, height
* this MMX version can only handle w == 8 || w == 16. */
void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
int w, int h, int sides)
{
uint8_t *ptr, *last_line;
int i;
last_line = buf + (height - 1) * wrap;
/* left and right */
ptr = buf;
if (w == 8) {
__asm__ volatile (
"1: \n\t"
"movd (%0), %%mm0 \n\t"
"punpcklbw %%mm0, %%mm0 \n\t"
"punpcklwd %%mm0, %%mm0 \n\t"
"punpckldq %%mm0, %%mm0 \n\t"
"movq %%mm0, -8(%0) \n\t"
"movq -8(%0, %2), %%mm1 \n\t"
"punpckhbw %%mm1, %%mm1 \n\t"
"punpckhwd %%mm1, %%mm1 \n\t"
"punpckhdq %%mm1, %%mm1 \n\t"
"movq %%mm1, (%0, %2) \n\t"
"add %1, %0 \n\t"
"cmp %3, %0 \n\t"
"jb 1b \n\t"
: "+r"(ptr)
: "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
);
} else if(w==16){
__asm__ volatile (
"1: \n\t"
"movd (%0), %%mm0 \n\t"
"punpcklbw %%mm0, %%mm0 \n\t"
"punpcklwd %%mm0, %%mm0 \n\t"
"punpckldq %%mm0, %%mm0 \n\t"
"movq %%mm0, -8(%0) \n\t"
"movq %%mm0, -16(%0) \n\t"
"movq -8(%0, %2), %%mm1 \n\t"
"punpckhbw %%mm1, %%mm1 \n\t"
"punpckhwd %%mm1, %%mm1 \n\t"
"punpckhdq %%mm1, %%mm1 \n\t"
"movq %%mm1, (%0, %2) \n\t"
"movq %%mm1, 8(%0, %2) \n\t"
"add %1, %0 \n\t"
"cmp %3, %0 \n\t"
"jb 1b \n\t"
: "+r"(ptr)
: "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
);
} else {
av_assert1(w == 4);
__asm__ volatile (
"1: \n\t"
"movd (%0), %%mm0 \n\t"
"punpcklbw %%mm0, %%mm0 \n\t"
"punpcklwd %%mm0, %%mm0 \n\t"
"movd %%mm0, -4(%0) \n\t"
"movd -4(%0, %2), %%mm1 \n\t"
"punpcklbw %%mm1, %%mm1 \n\t"
"punpckhwd %%mm1, %%mm1 \n\t"
"punpckhdq %%mm1, %%mm1 \n\t"
"movd %%mm1, (%0, %2) \n\t"
"add %1, %0 \n\t"
"cmp %3, %0 \n\t"
"jb 1b \n\t"
: "+r"(ptr)
: "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
);
}
/* top and bottom (and hopefully also the corners) */
if (sides & EDGE_TOP) {
for (i = 0; i < h; i += 4) {
ptr = buf - (i + 1) * wrap - w;
__asm__ volatile (
"1: \n\t"
"movq (%1, %0), %%mm0 \n\t"
"movq %%mm0, (%0) \n\t"
"movq %%mm0, (%0, %2) \n\t"
"movq %%mm0, (%0, %2, 2) \n\t"
"movq %%mm0, (%0, %3) \n\t"
"add $8, %0 \n\t"
"cmp %4, %0 \n\t"
"jb 1b \n\t"
: "+r"(ptr)
: "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
"r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
);
}
}
if (sides & EDGE_BOTTOM) {
for (i = 0; i < h; i += 4) {
ptr = last_line + (i + 1) * wrap - w;
__asm__ volatile (
"1: \n\t"
"movq (%1, %0), %%mm0 \n\t"
"movq %%mm0, (%0) \n\t"
"movq %%mm0, (%0, %2) \n\t"
"movq %%mm0, (%0, %2, 2) \n\t"
"movq %%mm0, (%0, %3) \n\t"
"add $8, %0 \n\t"
"cmp %4, %0 \n\t"
"jb 1b \n\t"
: "+r"(ptr)
: "r"((x86_reg)last_line - (x86_reg)ptr - w),
"r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
"r"(ptr + width + 2 * w)
);
}
}
}
typedef void emulated_edge_mc_func(uint8_t *dst, ptrdiff_t dst_stride,
const uint8_t *src, ptrdiff_t src_linesize,
int block_w, int block_h,
int src_x, int src_y, int w, int h);
static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
int stride, int h, int ox, int oy,
int dxx, int dxy, int dyx, int dyy,
int shift, int r, int width, int height,
emulated_edge_mc_func *emu_edge_fn)
{
const int w = 8;
const int ix = ox >> (16 + shift);
const int iy = oy >> (16 + shift);
const int oxs = ox >> 4;
const int oys = oy >> 4;
const int dxxs = dxx >> 4;
const int dxys = dxy >> 4;
const int dyxs = dyx >> 4;
const int dyys = dyy >> 4;
const uint16_t r4[4] = { r, r, r, r };
const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
const uint64_t shift2 = 2 * shift;
#define MAX_STRIDE 4096U
#define MAX_H 8U
uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
int x, y;
const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
const int dxh = dxy * (h - 1);
const int dyw = dyx * (w - 1);
int need_emu = (unsigned)ix >= width - w ||
(unsigned)iy >= height - h;
if ( // non-constant fullpel offset (3% of blocks)
((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
(oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
// uses more than 16 bits of subpel mv (only at huge resolution)
|| (dxx | dxy | dyx | dyy) & 15
|| (need_emu && (h > MAX_H || stride > MAX_STRIDE))) {
// FIXME could still use mmx for some of the rows
ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
shift, r, width, height);
return;
}
src += ix + iy * stride;
if (need_emu) {
emu_edge_fn(edge_buf, stride, src, stride, w + 1, h + 1, ix, iy, width, height);
src = edge_buf;
}
__asm__ volatile (
"movd %0, %%mm6 \n\t"
"pxor %%mm7, %%mm7 \n\t"
"punpcklwd %%mm6, %%mm6 \n\t"
"punpcklwd %%mm6, %%mm6 \n\t"
:: "r"(1<<shift)
);
for (x = 0; x < w; x += 4) {
uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
oxs - dxys + dxxs * (x + 1),
oxs - dxys + dxxs * (x + 2),
oxs - dxys + dxxs * (x + 3) };
uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
oys - dyys + dyxs * (x + 1),
oys - dyys + dyxs * (x + 2),
oys - dyys + dyxs * (x + 3) };
for (y = 0; y < h; y++) {
__asm__ volatile (
"movq %0, %%mm4 \n\t"
"movq %1, %%mm5 \n\t"
"paddw %2, %%mm4 \n\t"
"paddw %3, %%mm5 \n\t"
"movq %%mm4, %0 \n\t"
"movq %%mm5, %1 \n\t"
"psrlw $12, %%mm4 \n\t"
"psrlw $12, %%mm5 \n\t"
: "+m"(*dx4), "+m"(*dy4)
: "m"(*dxy4), "m"(*dyy4)
);
__asm__ volatile (
"movq %%mm6, %%mm2 \n\t"
"movq %%mm6, %%mm1 \n\t"
"psubw %%mm4, %%mm2 \n\t"
"psubw %%mm5, %%mm1 \n\t"
"movq %%mm2, %%mm0 \n\t"
"movq %%mm4, %%mm3 \n\t"
"pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
"pmullw %%mm5, %%mm3 \n\t" // dx * dy
"pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
"pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)
"movd %4, %%mm5 \n\t"
"movd %3, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
"pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
"movd %2, %%mm5 \n\t"
"movd %1, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
"pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
"paddw %5, %%mm1 \n\t"
"paddw %%mm3, %%mm2 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"psrlw %6, %%mm0 \n\t"
"packuswb %%mm0, %%mm0 \n\t"
"movd %%mm0, %0 \n\t"
: "=m"(dst[x + y * stride])
: "m"(src[0]), "m"(src[1]),
"m"(src[stride]), "m"(src[stride + 1]),
"m"(*r4), "m"(shift2)
);
src += stride;
}
src += 4 - h * stride;
}
}
#if CONFIG_VIDEODSP
#if HAVE_YASM
#if ARCH_X86_32
void ff_gmc_mmx(uint8_t *dst, uint8_t *src,
int stride, int h, int ox, int oy,
int dxx, int dxy, int dyx, int dyy,
int shift, int r, int width, int height)
{
gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
width, height, &ff_emulated_edge_mc_8);
}
#endif
void ff_gmc_sse(uint8_t *dst, uint8_t *src,
int stride, int h, int ox, int oy,
int dxx, int dxy, int dyx, int dyy,
int shift, int r, int width, int height)
{
gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
width, height, &ff_emulated_edge_mc_8);
}
#else
void ff_gmc_mmx(uint8_t *dst, uint8_t *src,
int stride, int h, int ox, int oy,
int dxx, int dxy, int dyx, int dyy,
int shift, int r, int width, int height)
{
gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
width, height, &ff_emulated_edge_mc_8);
}
#endif
#endif
#if CONFIG_DIRAC_DECODER
#define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)\
void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
if (h&3)\
ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);\
else\
OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
if (h&3)\
ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);\
else\
OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
if (h&3) {\
ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);\
} else {\
OPNAME ## _pixels16_ ## EXT(dst , src[0] , stride, h);\
OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
}\
}
#if HAVE_MMX_INLINE
PIXELS16(static, ff_avg, , , _mmxext)
DIRAC_PIXOP(put, ff_put, mmx)
DIRAC_PIXOP(avg, ff_avg, mmx)
#endif
#if HAVE_YASM
DIRAC_PIXOP(avg, ff_avg, mmxext)
void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
if (h&3)
ff_put_dirac_pixels16_c(dst, src, stride, h);
else
ff_put_pixels16_sse2(dst, src[0], stride, h);
}
void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
if (h&3)
ff_avg_dirac_pixels16_c(dst, src, stride, h);
else
ff_avg_pixels16_sse2(dst, src[0], stride, h);
}
void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
if (h&3) {
ff_put_dirac_pixels32_c(dst, src, stride, h);
} else {
ff_put_pixels16_sse2(dst , src[0] , stride, h);
ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h);
}
}
void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
if (h&3) {
ff_avg_dirac_pixels32_c(dst, src, stride, h);
} else {
ff_avg_pixels16_sse2(dst , src[0] , stride, h);
ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
}
}
#endif
#endif
void ff_vector_clipf_sse(float *dst, const float *src,
float min, float max, int len)
{
x86_reg i = (len - 16) * 4;
__asm__ volatile (
"movss %3, %%xmm4 \n\t"
"movss %4, %%xmm5 \n\t"
"shufps $0, %%xmm4, %%xmm4 \n\t"
"shufps $0, %%xmm5, %%xmm5 \n\t"
"1: \n\t"
"movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel
"movaps 16(%2, %0), %%xmm1 \n\t"
"movaps 32(%2, %0), %%xmm2 \n\t"
"movaps 48(%2, %0), %%xmm3 \n\t"
"maxps %%xmm4, %%xmm0 \n\t"
"maxps %%xmm4, %%xmm1 \n\t"
"maxps %%xmm4, %%xmm2 \n\t"
"maxps %%xmm4, %%xmm3 \n\t"
"minps %%xmm5, %%xmm0 \n\t"
"minps %%xmm5, %%xmm1 \n\t"
"minps %%xmm5, %%xmm2 \n\t"
"minps %%xmm5, %%xmm3 \n\t"
"movaps %%xmm0, (%1, %0) \n\t"
"movaps %%xmm1, 16(%1, %0) \n\t"
"movaps %%xmm2, 32(%1, %0) \n\t"
"movaps %%xmm3, 48(%1, %0) \n\t"
"sub $64, %0 \n\t"
"jge 1b \n\t"
: "+&r"(i)
: "r"(dst), "r"(src), "m"(min), "m"(max)
: "memory"
);
}
#endif /* HAVE_INLINE_ASM */

View File

@@ -0,0 +1,101 @@
/*
* DSP utils : QNS functions are compiled 3 times for mmx/3dnow/ssse3
* Copyright (c) 2004 Michael Niedermayer
*
* MMX optimization by Michael Niedermayer <michaelni@gmx.at>
* 3DNow! and SSSE3 optimization by Zuxy Meng <zuxy.meng@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#define MAX_ABS (512 >> (SCALE_OFFSET>0 ? SCALE_OFFSET : 0))
static int DEF(try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale)
{
x86_reg i=0;
av_assert2(FFABS(scale) < MAX_ABS);
scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
SET_RND(mm6);
__asm__ volatile(
"pxor %%mm7, %%mm7 \n\t"
"movd %4, %%mm5 \n\t"
"punpcklwd %%mm5, %%mm5 \n\t"
"punpcklwd %%mm5, %%mm5 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%1, %0), %%mm0 \n\t"
"movq 8(%1, %0), %%mm1 \n\t"
PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
"paddw (%2, %0), %%mm0 \n\t"
"paddw 8(%2, %0), %%mm1 \n\t"
"psraw $6, %%mm0 \n\t"
"psraw $6, %%mm1 \n\t"
"pmullw (%3, %0), %%mm0 \n\t"
"pmullw 8(%3, %0), %%mm1 \n\t"
"pmaddwd %%mm0, %%mm0 \n\t"
"pmaddwd %%mm1, %%mm1 \n\t"
"paddd %%mm1, %%mm0 \n\t"
"psrld $4, %%mm0 \n\t"
"paddd %%mm0, %%mm7 \n\t"
"add $16, %0 \n\t"
"cmp $128, %0 \n\t" //FIXME optimize & bench
" jb 1b \n\t"
PHADDD(%%mm7, %%mm6)
"psrld $2, %%mm7 \n\t"
"movd %%mm7, %0 \n\t"
: "+r" (i)
: "r"(basis), "r"(rem), "r"(weight), "g"(scale)
);
return i;
}
static void DEF(add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale)
{
x86_reg i=0;
if(FFABS(scale) < MAX_ABS){
scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
SET_RND(mm6);
__asm__ volatile(
"movd %3, %%mm5 \n\t"
"punpcklwd %%mm5, %%mm5 \n\t"
"punpcklwd %%mm5, %%mm5 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%1, %0), %%mm0 \n\t"
"movq 8(%1, %0), %%mm1 \n\t"
PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
"paddw (%2, %0), %%mm0 \n\t"
"paddw 8(%2, %0), %%mm1 \n\t"
"movq %%mm0, (%2, %0) \n\t"
"movq %%mm1, 8(%2, %0) \n\t"
"add $16, %0 \n\t"
"cmp $128, %0 \n\t" // FIXME optimize & bench
" jb 1b \n\t"
: "+r" (i)
: "r"(basis), "r"(rem), "g"(scale)
);
}else{
for(i=0; i<8*8; i++){
rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
}
}
}

View File

@@ -0,0 +1,65 @@
/*
* Copyright (c) 2009 Loren Merritt <lorenm@u.washington.edu>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/x86/asm.h"
#include "dsputil_x86.h"
#if HAVE_INLINE_ASM
#if HAVE_7REGS
void ff_add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
const uint8_t *diff, int w,
int *left, int *left_top)
{
x86_reg w2 = -w;
x86_reg x;
int l = *left & 0xff;
int tl = *left_top & 0xff;
int t;
__asm__ volatile (
"mov %7, %3 \n"
"1: \n"
"movzbl (%3, %4), %2 \n"
"mov %2, %k3 \n"
"sub %b1, %b3 \n"
"add %b0, %b3 \n"
"mov %2, %1 \n"
"cmp %0, %2 \n"
"cmovg %0, %2 \n"
"cmovg %1, %0 \n"
"cmp %k3, %0 \n"
"cmovg %k3, %0 \n"
"mov %7, %3 \n"
"cmp %2, %0 \n"
"cmovl %2, %0 \n"
"add (%6, %4), %b0 \n"
"mov %b0, (%5, %4) \n"
"inc %4 \n"
"jl 1b \n"
: "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
: "r"(dst + w), "r"(diff + w), "rm"(top + w)
);
*left = l;
*left_top = tl;
}
#endif
#endif /* HAVE_INLINE_ASM */

View File

@@ -0,0 +1,198 @@
/*
* MMX optimized DSP utils
* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_DSPUTIL_MMX_H
#define AVCODEC_X86_DSPUTIL_MMX_H
#include <stddef.h>
#include <stdint.h>
#include "libavcodec/dsputil.h"
#include "libavutil/x86/asm.h"
#include "constants.h"
#define MOVQ_WONE(regd) \
__asm__ volatile ( \
"pcmpeqd %%" #regd ", %%" #regd " \n\t" \
"psrlw $15, %%" #regd ::)
#define JUMPALIGN() __asm__ volatile (".p2align 3"::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
#define MOVQ_BFE(regd) \
__asm__ volatile ( \
"pcmpeqd %%"#regd", %%"#regd" \n\t" \
"paddb %%"#regd", %%"#regd" \n\t" ::)
#ifndef PIC
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
#else
// for shared library it's better to use this way for accessing constants
// pcmpeqd -> -1
#define MOVQ_WTWO(regd) \
__asm__ volatile ( \
"pcmpeqd %%"#regd", %%"#regd" \n\t" \
"psrlw $15, %%"#regd" \n\t" \
"psllw $1, %%"#regd" \n\t"::)
#endif
// using regr as temporary and for the output result
// first argument is unmodifed and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
"movq "#rega", "#regr" \n\t" \
"pand "#regb", "#regr" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pand "#regfe", "#regb" \n\t" \
"psrlq $1, "#regb" \n\t" \
"paddb "#regb", "#regr" \n\t"
#define PAVGB_MMX(rega, regb, regr, regfe) \
"movq "#rega", "#regr" \n\t" \
"por "#regb", "#regr" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pand "#regfe", "#regb" \n\t" \
"psrlq $1, "#regb" \n\t" \
"psubb "#regb", "#regr" \n\t"
// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
"movq "#rega", "#regr" \n\t" \
"movq "#regc", "#regp" \n\t" \
"pand "#regb", "#regr" \n\t" \
"pand "#regd", "#regp" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pxor "#regc", "#regd" \n\t" \
"pand %%mm6, "#regb" \n\t" \
"pand %%mm6, "#regd" \n\t" \
"psrlq $1, "#regb" \n\t" \
"psrlq $1, "#regd" \n\t" \
"paddb "#regb", "#regr" \n\t" \
"paddb "#regd", "#regp" \n\t"
#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
"movq "#rega", "#regr" \n\t" \
"movq "#regc", "#regp" \n\t" \
"por "#regb", "#regr" \n\t" \
"por "#regd", "#regp" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pxor "#regc", "#regd" \n\t" \
"pand %%mm6, "#regb" \n\t" \
"pand %%mm6, "#regd" \n\t" \
"psrlq $1, "#regd" \n\t" \
"psrlq $1, "#regb" \n\t" \
"psubb "#regb", "#regr" \n\t" \
"psubb "#regd", "#regp" \n\t"
void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx);
void ff_dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx);
void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size);
void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size);
void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size);
void ff_clear_block_mmx(int16_t *block);
void ff_clear_block_sse(int16_t *block);
void ff_clear_blocks_mmx(int16_t *blocks);
void ff_clear_blocks_sse(int16_t *blocks);
void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w);
void ff_add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
const uint8_t *diff, int w,
int *left, int *left_top);
void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
int w, int h, int sides);
void ff_gmc_mmx(uint8_t *dst, uint8_t *src,
int stride, int h, int ox, int oy,
int dxx, int dxy, int dyx, int dyy,
int shift, int r, int width, int height);
void ff_gmc_sse(uint8_t *dst, uint8_t *src,
int stride, int h, int ox, int oy,
int dxx, int dxy, int dyx, int dyy,
int shift, int r, int width, int height);
void ff_vector_clipf_sse(float *dst, const float *src,
float min, float max, int len);
void ff_avg_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_mmx_idct(int16_t *block);
void ff_mmxext_idct(int16_t *block);
void ff_deinterlace_line_mmx(uint8_t *dst,
const uint8_t *lum_m4, const uint8_t *lum_m3,
const uint8_t *lum_m2, const uint8_t *lum_m1,
const uint8_t *lum,
int size);
void ff_deinterlace_line_inplace_mmx(const uint8_t *lum_m4,
const uint8_t *lum_m3,
const uint8_t *lum_m2,
const uint8_t *lum_m1,
const uint8_t *lum, int size);
#define PIXELS16(STATIC, PFX1, PFX2, TYPE, CPUEXT) \
STATIC void PFX1 ## _pixels16 ## TYPE ## CPUEXT(uint8_t *block, \
const uint8_t *pixels, \
ptrdiff_t line_size, \
int h) \
{ \
PFX2 ## PFX1 ## _pixels8 ## TYPE ## CPUEXT(block, pixels, \
line_size, h); \
PFX2 ## PFX1 ## _pixels8 ## TYPE ## CPUEXT(block + 8, pixels + 8, \
line_size, h); \
}
#endif /* AVCODEC_X86_DSPUTIL_MMX_H */

View File

@@ -0,0 +1,487 @@
;*****************************************************************************
;* MMX optimized DSP utils
;*****************************************************************************
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION .text
%macro DIFF_PIXELS_1 4
movh %1, %3
movh %2, %4
punpcklbw %2, %1
punpcklbw %1, %1
psubw %1, %2
%endmacro
; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3
; %6=temporary storage location
; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64)
%macro DIFF_PIXELS_8 6
DIFF_PIXELS_1 m0, m7, [%1 +%3], [%2 +%3]
DIFF_PIXELS_1 m1, m7, [%1+%4 +%3], [%2+%4 +%3]
DIFF_PIXELS_1 m2, m7, [%1+%4*2+%3], [%2+%4*2+%3]
add %1, %5
add %2, %5
DIFF_PIXELS_1 m3, m7, [%1 +%3], [%2 +%3]
DIFF_PIXELS_1 m4, m7, [%1+%4 +%3], [%2+%4 +%3]
DIFF_PIXELS_1 m5, m7, [%1+%4*2+%3], [%2+%4*2+%3]
DIFF_PIXELS_1 m6, m7, [%1+%5 +%3], [%2+%5 +%3]
%ifdef m8
DIFF_PIXELS_1 m7, m8, [%1+%4*4+%3], [%2+%4*4+%3]
%else
mova [%6], m0
DIFF_PIXELS_1 m7, m0, [%1+%4*4+%3], [%2+%4*4+%3]
mova m0, [%6]
%endif
sub %1, %5
sub %2, %5
%endmacro
%macro HADAMARD8 0
SUMSUB_BADC w, 0, 1, 2, 3
SUMSUB_BADC w, 4, 5, 6, 7
SUMSUB_BADC w, 0, 2, 1, 3
SUMSUB_BADC w, 4, 6, 5, 7
SUMSUB_BADC w, 0, 4, 1, 5
SUMSUB_BADC w, 2, 6, 3, 7
%endmacro
%macro ABS1_SUM 3
ABS1 %1, %2
paddusw %3, %1
%endmacro
%macro ABS2_SUM 6
ABS2 %1, %2, %3, %4
paddusw %5, %1
paddusw %6, %2
%endmacro
%macro ABS_SUM_8x8_64 1
ABS2 m0, m1, m8, m9
ABS2_SUM m2, m3, m8, m9, m0, m1
ABS2_SUM m4, m5, m8, m9, m0, m1
ABS2_SUM m6, m7, m8, m9, m0, m1
paddusw m0, m1
%endmacro
%macro ABS_SUM_8x8_32 1
mova [%1], m7
ABS1 m0, m7
ABS1 m1, m7
ABS1_SUM m2, m7, m0
ABS1_SUM m3, m7, m1
ABS1_SUM m4, m7, m0
ABS1_SUM m5, m7, m1
ABS1_SUM m6, m7, m0
mova m2, [%1]
ABS1_SUM m2, m7, m1
paddusw m0, m1
%endmacro
; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can get up to
; about 100k on extreme inputs. But that's very unlikely to occur in natural video,
; and it's even more unlikely to not have any alternative mvs/modes with lower cost.
%macro HSUM 3
%if cpuflag(sse2)
movhlps %2, %1
paddusw %1, %2
pshuflw %2, %1, 0xE
paddusw %1, %2
pshuflw %2, %1, 0x1
paddusw %1, %2
movd %3, %1
%elif cpuflag(mmxext)
pshufw %2, %1, 0xE
paddusw %1, %2
pshufw %2, %1, 0x1
paddusw %1, %2
movd %3, %1
%elif cpuflag(mmx)
mova %2, %1
psrlq %1, 32
paddusw %1, %2
mova %2, %1
psrlq %1, 16
paddusw %1, %2
movd %3, %1
%endif
%endmacro
%macro STORE4 5
mova [%1+mmsize*0], %2
mova [%1+mmsize*1], %3
mova [%1+mmsize*2], %4
mova [%1+mmsize*3], %5
%endmacro
%macro LOAD4 5
mova %2, [%1+mmsize*0]
mova %3, [%1+mmsize*1]
mova %4, [%1+mmsize*2]
mova %5, [%1+mmsize*3]
%endmacro
%macro hadamard8_16_wrapper 2
cglobal hadamard8_diff, 4, 4, %1
%ifndef m8
%assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
SUB rsp, pad
%endif
call hadamard8x8_diff %+ SUFFIX
%ifndef m8
ADD rsp, pad
%endif
RET
cglobal hadamard8_diff16, 5, 6, %1
%ifndef m8
%assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
SUB rsp, pad
%endif
call hadamard8x8_diff %+ SUFFIX
mov r5d, eax
add r1, 8
add r2, 8
call hadamard8x8_diff %+ SUFFIX
add r5d, eax
cmp r4d, 16
jne .done
lea r1, [r1+r3*8-8]
lea r2, [r2+r3*8-8]
call hadamard8x8_diff %+ SUFFIX
add r5d, eax
add r1, 8
add r2, 8
call hadamard8x8_diff %+ SUFFIX
add r5d, eax
.done:
mov eax, r5d
%ifndef m8
ADD rsp, pad
%endif
RET
%endmacro
%macro HADAMARD8_DIFF 0-1
%if cpuflag(sse2)
hadamard8x8_diff %+ SUFFIX:
lea r0, [r3*3]
DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize
HADAMARD8
%if ARCH_X86_64
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [rsp+gprsize], [rsp+mmsize+gprsize]
%endif
HADAMARD8
ABS_SUM_8x8 rsp+gprsize
HSUM m0, m1, eax
and eax, 0xFFFF
ret
hadamard8_16_wrapper %1, 3
%elif cpuflag(mmx)
ALIGN 16
; int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2,
; int stride, int h)
; r0 = void *s = unused, int h = unused (always 8)
; note how r1, r2 and r3 are not clobbered in this function, so 16x16
; can simply call this 2x2x (and that's why we access rsp+gprsize
; everywhere, which is rsp of calling func
hadamard8x8_diff %+ SUFFIX:
lea r0, [r3*3]
; first 4x8 pixels
DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize+0x60
HADAMARD8
mova [rsp+gprsize+0x60], m7
TRANSPOSE4x4W 0, 1, 2, 3, 7
STORE4 rsp+gprsize, m0, m1, m2, m3
mova m7, [rsp+gprsize+0x60]
TRANSPOSE4x4W 4, 5, 6, 7, 0
STORE4 rsp+gprsize+0x40, m4, m5, m6, m7
; second 4x8 pixels
DIFF_PIXELS_8 r1, r2, 4, r3, r0, rsp+gprsize+0x60
HADAMARD8
mova [rsp+gprsize+0x60], m7
TRANSPOSE4x4W 0, 1, 2, 3, 7
STORE4 rsp+gprsize+0x20, m0, m1, m2, m3
mova m7, [rsp+gprsize+0x60]
TRANSPOSE4x4W 4, 5, 6, 7, 0
LOAD4 rsp+gprsize+0x40, m0, m1, m2, m3
HADAMARD8
ABS_SUM_8x8_32 rsp+gprsize+0x60
mova [rsp+gprsize+0x60], m0
LOAD4 rsp+gprsize , m0, m1, m2, m3
LOAD4 rsp+gprsize+0x20, m4, m5, m6, m7
HADAMARD8
ABS_SUM_8x8_32 rsp+gprsize
paddusw m0, [rsp+gprsize+0x60]
HSUM m0, m1, eax
and rax, 0xFFFF
ret
hadamard8_16_wrapper 0, 14
%endif
%endmacro
INIT_MMX mmx
HADAMARD8_DIFF
INIT_MMX mmxext
HADAMARD8_DIFF
INIT_XMM sse2
%if ARCH_X86_64
%define ABS_SUM_8x8 ABS_SUM_8x8_64
%else
%define ABS_SUM_8x8 ABS_SUM_8x8_32
%endif
HADAMARD8_DIFF 10
INIT_XMM ssse3
%define ABS_SUM_8x8 ABS_SUM_8x8_64
HADAMARD8_DIFF 9
INIT_XMM sse2
; sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
cglobal sse16, 5, 5, 8
shr r4d, 1
pxor m0, m0 ; mm0 = 0
pxor m7, m7 ; mm7 holds the sum
.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
movu m1, [r1 ] ; mm1 = pix1[0][0-15]
movu m2, [r2 ] ; mm2 = pix2[0][0-15]
movu m3, [r1+r3] ; mm3 = pix1[1][0-15]
movu m4, [r2+r3] ; mm4 = pix2[1][0-15]
; todo: mm1-mm2, mm3-mm4
; algo: subtract mm1 from mm2 with saturation and vice versa
; OR the result to get the absolute difference
mova m5, m1
mova m6, m3
psubusb m1, m2
psubusb m3, m4
psubusb m2, m5
psubusb m4, m6
por m2, m1
por m4, m3
; now convert to 16-bit vectors so we can square them
mova m1, m2
mova m3, m4
punpckhbw m2, m0
punpckhbw m4, m0
punpcklbw m1, m0 ; mm1 not spread over (mm1,mm2)
punpcklbw m3, m0 ; mm4 not spread over (mm3,mm4)
pmaddwd m2, m2
pmaddwd m4, m4
pmaddwd m1, m1
pmaddwd m3, m3
lea r1, [r1+r3*2] ; pix1 += 2*line_size
lea r2, [r2+r3*2] ; pix2 += 2*line_size
paddd m1, m2
paddd m3, m4
paddd m7, m1
paddd m7, m3
dec r4
jnz .next2lines
mova m1, m7
psrldq m7, 8 ; shift hi qword to lo
paddd m7, m1
mova m1, m7
psrldq m7, 4 ; shift hi dword to lo
paddd m7, m1
movd eax, m7 ; return value
RET
INIT_MMX mmx
; get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size)
cglobal get_pixels, 3,4
movsxdifnidn r2, r2d
add r0, 128
mov r3, -128
pxor m7, m7
.loop:
mova m0, [r1]
mova m2, [r1+r2]
mova m1, m0
mova m3, m2
punpcklbw m0, m7
punpckhbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
mova [r0+r3+ 0], m0
mova [r0+r3+ 8], m1
mova [r0+r3+16], m2
mova [r0+r3+24], m3
lea r1, [r1+r2*2]
add r3, 32
js .loop
REP_RET
INIT_XMM sse2
cglobal get_pixels, 3, 4
movsxdifnidn r2, r2d
lea r3, [r2*3]
pxor m4, m4
movh m0, [r1]
movh m1, [r1+r2]
movh m2, [r1+r2*2]
movh m3, [r1+r3]
lea r1, [r1+r2*4]
punpcklbw m0, m4
punpcklbw m1, m4
punpcklbw m2, m4
punpcklbw m3, m4
mova [r0], m0
mova [r0+0x10], m1
mova [r0+0x20], m2
mova [r0+0x30], m3
movh m0, [r1]
movh m1, [r1+r2*1]
movh m2, [r1+r2*2]
movh m3, [r1+r3]
punpcklbw m0, m4
punpcklbw m1, m4
punpcklbw m2, m4
punpcklbw m3, m4
mova [r0+0x40], m0
mova [r0+0x50], m1
mova [r0+0x60], m2
mova [r0+0x70], m3
RET
INIT_MMX mmx
; diff_pixels_mmx(int16_t *block, const uint8_t *s1, const unint8_t *s2, stride)
cglobal diff_pixels, 4,5
movsxdifnidn r3, r3d
pxor m7, m7
add r0, 128
mov r4, -128
.loop:
mova m0, [r1]
mova m2, [r2]
mova m1, m0
mova m3, m2
punpcklbw m0, m7
punpckhbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
psubw m0, m2
psubw m1, m3
mova [r0+r4+0], m0
mova [r0+r4+8], m1
add r1, r3
add r2, r3
add r4, 16
jne .loop
REP_RET
INIT_MMX mmx
; pix_sum16_mmx(uint8_t * pix, int line_size)
cglobal pix_sum16, 2, 3
movsxdifnidn r1, r1d
mov r2, r1
neg r2
shl r2, 4
sub r0, r2
pxor m7, m7
pxor m6, m6
.loop:
mova m0, [r0+r2+0]
mova m1, [r0+r2+0]
mova m2, [r0+r2+8]
mova m3, [r0+r2+8]
punpcklbw m0, m7
punpckhbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
paddw m1, m0
paddw m3, m2
paddw m3, m1
paddw m6, m3
add r2, r1
js .loop
mova m5, m6
psrlq m6, 32
paddw m6, m5
mova m5, m6
psrlq m6, 16
paddw m6, m5
movd eax, m6
and eax, 0xffff
RET
INIT_MMX mmx
; pix_norm1_mmx(uint8_t *pix, int line_size)
cglobal pix_norm1, 2, 4
movsxdifnidn r1, r1d
mov r2, 16
pxor m0, m0
pxor m7, m7
.loop:
mova m2, [r0+0]
mova m3, [r0+8]
mova m1, m2
punpckhbw m1, m0
punpcklbw m2, m0
mova m4, m3
punpckhbw m3, m0
punpcklbw m4, m0
pmaddwd m1, m1
pmaddwd m2, m2
pmaddwd m3, m3
pmaddwd m4, m4
paddd m2, m1
paddd m4, m3
paddd m7, m2
add r0, r1
paddd m7, m4
dec r2
jne .loop
mova m1, m7
psrlq m7, 32
paddd m1, m7
movd eax, m1
RET

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,306 @@
;******************************************************************************
;* MMX optimized discrete wavelet trasnform
;* Copyright (c) 2010 David Conrad
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
pw_1: times 8 dw 1
pw_2: times 8 dw 2
pw_8: times 8 dw 8
pw_16: times 8 dw 16
pw_1991: times 4 dw 9,-1
section .text
; %1 -= (%2 + %3 + 2)>>2 %4 is pw_2
%macro COMPOSE_53iL0 4
paddw %2, %3
paddw %2, %4
psraw %2, 2
psubw %1, %2
%endm
; m1 = %1 + (-m0 + 9*m1 + 9*%2 -%3 + 8)>>4
; if %4 is supplied, %1 is loaded unaligned from there
; m2: clobbered m3: pw_8 m4: pw_1991
%macro COMPOSE_DD97iH0 3-4
paddw m0, %3
paddw m1, %2
psubw m0, m3
mova m2, m1
punpcklwd m1, m0
punpckhwd m2, m0
pmaddwd m1, m4
pmaddwd m2, m4
%if %0 > 3
movu %1, %4
%endif
psrad m1, 4
psrad m2, 4
packssdw m1, m2
paddw m1, %1
%endm
%macro COMPOSE_VERTICAL 1
; void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
; int width)
cglobal vertical_compose53iL0_%1, 4,4,1, b0, b1, b2, width
mova m2, [pw_2]
%if ARCH_X86_64
mov widthd, widthd
%endif
.loop:
sub widthq, mmsize/2
mova m1, [b0q+2*widthq]
mova m0, [b1q+2*widthq]
COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2
mova [b1q+2*widthq], m0
jg .loop
REP_RET
; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
; int width)
cglobal vertical_compose_dirac53iH0_%1, 4,4,1, b0, b1, b2, width
mova m1, [pw_1]
%if ARCH_X86_64
mov widthd, widthd
%endif
.loop:
sub widthq, mmsize/2
mova m0, [b0q+2*widthq]
paddw m0, [b2q+2*widthq]
paddw m0, m1
psraw m0, 1
paddw m0, [b1q+2*widthq]
mova [b1q+2*widthq], m0
jg .loop
REP_RET
; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
; IDWTELEM *b3, IDWTELEM *b4, int width)
cglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width
mova m3, [pw_8]
mova m4, [pw_1991]
%if ARCH_X86_64
mov widthd, widthd
%endif
.loop:
sub widthq, mmsize/2
mova m0, [b0q+2*widthq]
mova m1, [b1q+2*widthq]
COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq]
mova [b2q+2*widthq], m1
jg .loop
REP_RET
; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
; IDWTELEM *b3, IDWTELEM *b4, int width)
cglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width
mova m3, [pw_16]
mova m4, [pw_1991]
%if ARCH_X86_64
mov widthd, widthd
%endif
.loop:
sub widthq, mmsize/2
mova m0, [b0q+2*widthq]
mova m1, [b1q+2*widthq]
mova m5, [b2q+2*widthq]
paddw m0, [b4q+2*widthq]
paddw m1, [b3q+2*widthq]
psubw m0, m3
mova m2, m1
punpcklwd m1, m0
punpckhwd m2, m0
pmaddwd m1, m4
pmaddwd m2, m4
psrad m1, 5
psrad m2, 5
packssdw m1, m2
psubw m5, m1
mova [b2q+2*widthq], m5
jg .loop
REP_RET
; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width)
cglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width
mova m3, [pw_1]
%if ARCH_X86_64
mov widthd, widthd
%endif
.loop:
sub widthq, mmsize/2
mova m1, [b1q+2*widthq]
mova m0, [b0q+2*widthq]
mova m2, m1
paddw m1, m3
psraw m1, 1
psubw m0, m1
mova [b0q+2*widthq], m0
paddw m2, m0
mova [b1q+2*widthq], m2
jg .loop
REP_RET
%endmacro
; extend the left and right edges of the tmp array by %1 and %2 respectively
%macro EDGE_EXTENSION 3
mov %3, [tmpq]
%assign %%i 1
%rep %1
mov [tmpq-2*%%i], %3
%assign %%i %%i+1
%endrep
mov %3, [tmpq+2*w2q-2]
%assign %%i 0
%rep %2
mov [tmpq+2*w2q+2*%%i], %3
%assign %%i %%i+1
%endrep
%endmacro
%macro HAAR_HORIZONTAL 2
; void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width)
cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2
mov w2d, wd
xor xq, xq
shr w2d, 1
lea b_w2q, [bq+wq]
mova m3, [pw_1]
.lowpass_loop:
movu m1, [b_w2q + 2*xq]
mova m0, [bq + 2*xq]
paddw m1, m3
psraw m1, 1
psubw m0, m1
mova [tmpq + 2*xq], m0
add xq, mmsize/2
cmp xq, w2q
jl .lowpass_loop
xor xq, xq
and w2q, ~(mmsize/2 - 1)
cmp w2q, mmsize/2
jl .end
.highpass_loop:
movu m1, [b_w2q + 2*xq]
mova m0, [tmpq + 2*xq]
paddw m1, m0
; shift and interleave
%if %2 == 1
paddw m0, m3
paddw m1, m3
psraw m0, 1
psraw m1, 1
%endif
mova m2, m0
punpcklwd m0, m1
punpckhwd m2, m1
mova [bq+4*xq], m0
mova [bq+4*xq+mmsize], m2
add xq, mmsize/2
cmp xq, w2q
jl .highpass_loop
.end:
REP_RET
%endmacro
INIT_XMM
; void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int width)
cglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2
mov w2d, wd
xor xd, xd
shr w2d, 1
lea b_w2q, [bq+wq]
movu m4, [bq+wq]
mova m7, [pw_2]
pslldq m4, 14
.lowpass_loop:
movu m1, [b_w2q + 2*xq]
mova m0, [bq + 2*xq]
mova m2, m1
palignr m1, m4, 14
mova m4, m2
COMPOSE_53iL0 m0, m1, m2, m7
mova [tmpq + 2*xq], m0
add xd, mmsize/2
cmp xd, w2d
jl .lowpass_loop
EDGE_EXTENSION 1, 2, xw
; leave the last up to 7 (sse) or 3 (mmx) values for C
xor xd, xd
and w2d, ~(mmsize/2 - 1)
cmp w2d, mmsize/2
jl .end
mova m7, [tmpq-mmsize]
mova m0, [tmpq]
mova m5, [pw_1]
mova m3, [pw_8]
mova m4, [pw_1991]
.highpass_loop:
mova m6, m0
palignr m0, m7, 14
mova m7, [tmpq + 2*xq + 16]
mova m1, m7
mova m2, m7
palignr m1, m6, 2
palignr m2, m6, 4
COMPOSE_DD97iH0 m0, m6, m2, [b_w2q + 2*xq]
mova m0, m7
mova m7, m6
; shift and interleave
paddw m6, m5
paddw m1, m5
psraw m6, 1
psraw m1, 1
mova m2, m6
punpcklwd m6, m1
punpckhwd m2, m1
mova [bq+4*xq], m6
mova [bq+4*xq+mmsize], m2
add xd, mmsize/2
cmp xd, w2d
jl .highpass_loop
.end:
REP_RET
%if ARCH_X86_64 == 0
INIT_MMX
COMPOSE_VERTICAL mmx
HAAR_HORIZONTAL mmx, 0
HAAR_HORIZONTAL mmx, 1
%endif
;;INIT_XMM
INIT_XMM
COMPOSE_VERTICAL sse2
HAAR_HORIZONTAL sse2, 0
HAAR_HORIZONTAL sse2, 1

View File

@@ -0,0 +1,594 @@
/*
* MMX optimized forward DCT
* The gcc porting is Copyright (c) 2001 Fabrice Bellard.
* cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
* SSE2 optimization is Copyright (c) 2004 Denes Balatoni.
*
* from fdctam32.c - AP922 MMX(3D-Now) forward-DCT
*
* Intel Application Note AP-922 - fast, precise implementation of DCT
* http://developer.intel.com/vtune/cbts/appnotes.htm
*
* Also of inspiration:
* a page about fdct at http://www.geocities.com/ssavekar/dct.htm
* Skal's fdct at http://skal.planet-d.net/coding/dct.html
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/common.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/dct.h"
#if HAVE_MMX_INLINE
//////////////////////////////////////////////////////////////////////
//
// constants for the forward DCT
// -----------------------------
//
// Be sure to check that your compiler is aligning all constants to QWORD
// (8-byte) memory boundaries! Otherwise the unaligned memory access will
// severely stall MMX execution.
//
//////////////////////////////////////////////////////////////////////
#define BITS_FRW_ACC 3 //; 2 or 3 for accuracy
#define SHIFT_FRW_COL BITS_FRW_ACC
#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3)
#define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1))
//#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1))
#define X8(x) x,x,x,x,x,x,x,x
//concatenated table, for forward DCT transformation
DECLARE_ALIGNED(16, static const int16_t, fdct_tg_all_16)[24] = {
X8(13036), // tg * (2<<16) + 0.5
X8(27146), // tg * (2<<16) + 0.5
X8(-21746) // tg * (2<<16) + 0.5
};
DECLARE_ALIGNED(16, static const int16_t, ocos_4_16)[8] = {
X8(23170) //cos * (2<<15) + 0.5
};
DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) };
DECLARE_ALIGNED(8, static const int32_t, fdct_r_row)[2] = {RND_FRW_ROW, RND_FRW_ROW };
static const struct
{
DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4];
} fdct_r_row_sse2 =
{{
RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW
}};
//DECLARE_ALIGNED(16, static const long, fdct_r_row_sse2)[4] = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW};
DECLARE_ALIGNED(8, static const int16_t, tab_frw_01234567)[] = { // forward_dct coeff table
16384, 16384, 22725, 19266,
16384, 16384, 12873, 4520,
21407, 8867, 19266, -4520,
-8867, -21407, -22725, -12873,
16384, -16384, 12873, -22725,
-16384, 16384, 4520, 19266,
8867, -21407, 4520, -12873,
21407, -8867, 19266, -22725,
22725, 22725, 31521, 26722,
22725, 22725, 17855, 6270,
29692, 12299, 26722, -6270,
-12299, -29692, -31521, -17855,
22725, -22725, 17855, -31521,
-22725, 22725, 6270, 26722,
12299, -29692, 6270, -17855,
29692, -12299, 26722, -31521,
21407, 21407, 29692, 25172,
21407, 21407, 16819, 5906,
27969, 11585, 25172, -5906,
-11585, -27969, -29692, -16819,
21407, -21407, 16819, -29692,
-21407, 21407, 5906, 25172,
11585, -27969, 5906, -16819,
27969, -11585, 25172, -29692,
19266, 19266, 26722, 22654,
19266, 19266, 15137, 5315,
25172, 10426, 22654, -5315,
-10426, -25172, -26722, -15137,
19266, -19266, 15137, -26722,
-19266, 19266, 5315, 22654,
10426, -25172, 5315, -15137,
25172, -10426, 22654, -26722,
16384, 16384, 22725, 19266,
16384, 16384, 12873, 4520,
21407, 8867, 19266, -4520,
-8867, -21407, -22725, -12873,
16384, -16384, 12873, -22725,
-16384, 16384, 4520, 19266,
8867, -21407, 4520, -12873,
21407, -8867, 19266, -22725,
19266, 19266, 26722, 22654,
19266, 19266, 15137, 5315,
25172, 10426, 22654, -5315,
-10426, -25172, -26722, -15137,
19266, -19266, 15137, -26722,
-19266, 19266, 5315, 22654,
10426, -25172, 5315, -15137,
25172, -10426, 22654, -26722,
21407, 21407, 29692, 25172,
21407, 21407, 16819, 5906,
27969, 11585, 25172, -5906,
-11585, -27969, -29692, -16819,
21407, -21407, 16819, -29692,
-21407, 21407, 5906, 25172,
11585, -27969, 5906, -16819,
27969, -11585, 25172, -29692,
22725, 22725, 31521, 26722,
22725, 22725, 17855, 6270,
29692, 12299, 26722, -6270,
-12299, -29692, -31521, -17855,
22725, -22725, 17855, -31521,
-22725, 22725, 6270, 26722,
12299, -29692, 6270, -17855,
29692, -12299, 26722, -31521,
};
static const struct
{
DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256];
} tab_frw_01234567_sse2 =
{{
//DECLARE_ALIGNED(16, static const int16_t, tab_frw_01234567_sse2)[] = { // forward_dct coeff table
#define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \
C4, C4, C5, C7, C2, C6, C3, -C7, \
-C4, C4, C7, C3, C6, -C2, C7, -C5, \
C4, -C4, C5, -C1, C2, -C6, C3, -C1,
// c1..c7 * cos(pi/4) * 2^15
#define C1 22725
#define C2 21407
#define C3 19266
#define C4 16384
#define C5 12873
#define C6 8867
#define C7 4520
TABLE_SSE2
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 31521
#define C2 29692
#define C3 26722
#define C4 22725
#define C5 17855
#define C6 12299
#define C7 6270
TABLE_SSE2
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 29692
#define C2 27969
#define C3 25172
#define C4 21407
#define C5 16819
#define C6 11585
#define C7 5906
TABLE_SSE2
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 26722
#define C2 25172
#define C3 22654
#define C4 19266
#define C5 15137
#define C6 10426
#define C7 5315
TABLE_SSE2
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 22725
#define C2 21407
#define C3 19266
#define C4 16384
#define C5 12873
#define C6 8867
#define C7 4520
TABLE_SSE2
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 26722
#define C2 25172
#define C3 22654
#define C4 19266
#define C5 15137
#define C6 10426
#define C7 5315
TABLE_SSE2
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 29692
#define C2 27969
#define C3 25172
#define C4 21407
#define C5 16819
#define C6 11585
#define C7 5906
TABLE_SSE2
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 31521
#define C2 29692
#define C3 26722
#define C4 22725
#define C5 17855
#define C6 12299
#define C7 6270
TABLE_SSE2
}};
#define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long
#define FDCT_COL(cpu, mm, mov)\
static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\
{\
__asm__ volatile (\
#mov" 16(%0), %%"#mm"0 \n\t" \
#mov" 96(%0), %%"#mm"1 \n\t" \
#mov" %%"#mm"0, %%"#mm"2 \n\t" \
#mov" 32(%0), %%"#mm"3 \n\t" \
"paddsw %%"#mm"1, %%"#mm"0 \n\t" \
#mov" 80(%0), %%"#mm"4 \n\t" \
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \
#mov" (%0), %%"#mm"5 \n\t" \
"paddsw %%"#mm"3, %%"#mm"4 \n\t" \
"paddsw 112(%0), %%"#mm"5 \n\t" \
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \
#mov" %%"#mm"0, %%"#mm"6 \n\t" \
"psubsw %%"#mm"1, %%"#mm"2 \n\t" \
#mov" 16(%1), %%"#mm"1 \n\t" \
"psubsw %%"#mm"4, %%"#mm"0 \n\t" \
#mov" 48(%0), %%"#mm"7 \n\t" \
"pmulhw %%"#mm"0, %%"#mm"1 \n\t" \
"paddsw 64(%0), %%"#mm"7 \n\t" \
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"5 \n\t" \
"paddsw %%"#mm"4, %%"#mm"6 \n\t" \
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \
#mov" %%"#mm"5, %%"#mm"4 \n\t" \
"psubsw %%"#mm"7, %%"#mm"5 \n\t" \
"paddsw %%"#mm"5, %%"#mm"1 \n\t" \
"paddsw %%"#mm"7, %%"#mm"4 \n\t" \
"por (%2), %%"#mm"1 \n\t" \
"psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \
"pmulhw 16(%1), %%"#mm"5 \n\t" \
#mov" %%"#mm"4, %%"#mm"7 \n\t" \
"psubsw 80(%0), %%"#mm"3 \n\t" \
"psubsw %%"#mm"6, %%"#mm"4 \n\t" \
#mov" %%"#mm"1, 32(%3) \n\t" \
"paddsw %%"#mm"6, %%"#mm"7 \n\t" \
#mov" 48(%0), %%"#mm"1 \n\t" \
"psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \
"psubsw 64(%0), %%"#mm"1 \n\t" \
#mov" %%"#mm"2, %%"#mm"6 \n\t" \
#mov" %%"#mm"4, 64(%3) \n\t" \
"paddsw %%"#mm"3, %%"#mm"2 \n\t" \
"pmulhw (%4), %%"#mm"2 \n\t" \
"psubsw %%"#mm"3, %%"#mm"6 \n\t" \
"pmulhw (%4), %%"#mm"6 \n\t" \
"psubsw %%"#mm"0, %%"#mm"5 \n\t" \
"por (%2), %%"#mm"5 \n\t" \
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \
"por (%2), %%"#mm"2 \n\t" \
#mov" %%"#mm"1, %%"#mm"4 \n\t" \
#mov" (%0), %%"#mm"3 \n\t" \
"paddsw %%"#mm"6, %%"#mm"1 \n\t" \
"psubsw 112(%0), %%"#mm"3 \n\t" \
"psubsw %%"#mm"6, %%"#mm"4 \n\t" \
#mov" (%1), %%"#mm"0 \n\t" \
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \
#mov" 32(%1), %%"#mm"6 \n\t" \
"pmulhw %%"#mm"1, %%"#mm"0 \n\t" \
#mov" %%"#mm"7, (%3) \n\t" \
"pmulhw %%"#mm"4, %%"#mm"6 \n\t" \
#mov" %%"#mm"5, 96(%3) \n\t" \
#mov" %%"#mm"3, %%"#mm"7 \n\t" \
#mov" 32(%1), %%"#mm"5 \n\t" \
"psubsw %%"#mm"2, %%"#mm"7 \n\t" \
"paddsw %%"#mm"2, %%"#mm"3 \n\t" \
"pmulhw %%"#mm"7, %%"#mm"5 \n\t" \
"paddsw %%"#mm"3, %%"#mm"0 \n\t" \
"paddsw %%"#mm"4, %%"#mm"6 \n\t" \
"pmulhw (%1), %%"#mm"3 \n\t" \
"por (%2), %%"#mm"0 \n\t" \
"paddsw %%"#mm"7, %%"#mm"5 \n\t" \
"psubsw %%"#mm"6, %%"#mm"7 \n\t" \
#mov" %%"#mm"0, 16(%3) \n\t" \
"paddsw %%"#mm"4, %%"#mm"5 \n\t" \
#mov" %%"#mm"7, 48(%3) \n\t" \
"psubsw %%"#mm"1, %%"#mm"3 \n\t" \
#mov" %%"#mm"5, 80(%3) \n\t" \
#mov" %%"#mm"3, 112(%3) \n\t" \
: \
: "r" (in + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \
"r" (out + offset), "r" (ocos_4_16)); \
}
FDCT_COL(mmx, mm, movq)
FDCT_COL(sse2, xmm, movdqa)
static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out)
{
__asm__ volatile(
#define FDCT_ROW_SSE2_H1(i,t) \
"movq " #i "(%0), %%xmm2 \n\t" \
"movq " #i "+8(%0), %%xmm0 \n\t" \
"movdqa " #t "+32(%1), %%xmm3 \n\t" \
"movdqa " #t "+48(%1), %%xmm7 \n\t" \
"movdqa " #t "(%1), %%xmm4 \n\t" \
"movdqa " #t "+16(%1), %%xmm5 \n\t"
#define FDCT_ROW_SSE2_H2(i,t) \
"movq " #i "(%0), %%xmm2 \n\t" \
"movq " #i "+8(%0), %%xmm0 \n\t" \
"movdqa " #t "+32(%1), %%xmm3 \n\t" \
"movdqa " #t "+48(%1), %%xmm7 \n\t"
#define FDCT_ROW_SSE2(i) \
"movq %%xmm2, %%xmm1 \n\t" \
"pshuflw $27, %%xmm0, %%xmm0 \n\t" \
"paddsw %%xmm0, %%xmm1 \n\t" \
"psubsw %%xmm0, %%xmm2 \n\t" \
"punpckldq %%xmm2, %%xmm1 \n\t" \
"pshufd $78, %%xmm1, %%xmm2 \n\t" \
"pmaddwd %%xmm2, %%xmm3 \n\t" \
"pmaddwd %%xmm1, %%xmm7 \n\t" \
"pmaddwd %%xmm5, %%xmm2 \n\t" \
"pmaddwd %%xmm4, %%xmm1 \n\t" \
"paddd %%xmm7, %%xmm3 \n\t" \
"paddd %%xmm2, %%xmm1 \n\t" \
"paddd %%xmm6, %%xmm3 \n\t" \
"paddd %%xmm6, %%xmm1 \n\t" \
"psrad %3, %%xmm3 \n\t" \
"psrad %3, %%xmm1 \n\t" \
"packssdw %%xmm3, %%xmm1 \n\t" \
"movdqa %%xmm1, " #i "(%4) \n\t"
"movdqa (%2), %%xmm6 \n\t"
FDCT_ROW_SSE2_H1(0,0)
FDCT_ROW_SSE2(0)
FDCT_ROW_SSE2_H2(64,0)
FDCT_ROW_SSE2(64)
FDCT_ROW_SSE2_H1(16,64)
FDCT_ROW_SSE2(16)
FDCT_ROW_SSE2_H2(112,64)
FDCT_ROW_SSE2(112)
FDCT_ROW_SSE2_H1(32,128)
FDCT_ROW_SSE2(32)
FDCT_ROW_SSE2_H2(96,128)
FDCT_ROW_SSE2(96)
FDCT_ROW_SSE2_H1(48,192)
FDCT_ROW_SSE2(48)
FDCT_ROW_SSE2_H2(80,192)
FDCT_ROW_SSE2(80)
:
: "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2),
"r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7")
);
}
static av_always_inline void fdct_row_mmxext(const int16_t *in, int16_t *out,
const int16_t *table)
{
__asm__ volatile (
"pshufw $0x1B, 8(%0), %%mm5 \n\t"
"movq (%0), %%mm0 \n\t"
"movq %%mm0, %%mm1 \n\t"
"paddsw %%mm5, %%mm0 \n\t"
"psubsw %%mm5, %%mm1 \n\t"
"movq %%mm0, %%mm2 \n\t"
"punpckldq %%mm1, %%mm0 \n\t"
"punpckhdq %%mm1, %%mm2 \n\t"
"movq (%1), %%mm1 \n\t"
"movq 8(%1), %%mm3 \n\t"
"movq 16(%1), %%mm4 \n\t"
"movq 24(%1), %%mm5 \n\t"
"movq 32(%1), %%mm6 \n\t"
"movq 40(%1), %%mm7 \n\t"
"pmaddwd %%mm0, %%mm1 \n\t"
"pmaddwd %%mm2, %%mm3 \n\t"
"pmaddwd %%mm0, %%mm4 \n\t"
"pmaddwd %%mm2, %%mm5 \n\t"
"pmaddwd %%mm0, %%mm6 \n\t"
"pmaddwd %%mm2, %%mm7 \n\t"
"pmaddwd 48(%1), %%mm0 \n\t"
"pmaddwd 56(%1), %%mm2 \n\t"
"paddd %%mm1, %%mm3 \n\t"
"paddd %%mm4, %%mm5 \n\t"
"paddd %%mm6, %%mm7 \n\t"
"paddd %%mm0, %%mm2 \n\t"
"movq (%2), %%mm0 \n\t"
"paddd %%mm0, %%mm3 \n\t"
"paddd %%mm0, %%mm5 \n\t"
"paddd %%mm0, %%mm7 \n\t"
"paddd %%mm0, %%mm2 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t"
"packssdw %%mm5, %%mm3 \n\t"
"packssdw %%mm2, %%mm7 \n\t"
"movq %%mm3, (%3) \n\t"
"movq %%mm7, 8(%3) \n\t"
:
: "r" (in), "r" (table), "r" (fdct_r_row), "r" (out));
}
static av_always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table)
{
//FIXME reorder (I do not have an old MMX-only CPU here to benchmark ...)
__asm__ volatile(
"movd 12(%0), %%mm1 \n\t"
"punpcklwd 8(%0), %%mm1 \n\t"
"movq %%mm1, %%mm2 \n\t"
"psrlq $0x20, %%mm1 \n\t"
"movq 0(%0), %%mm0 \n\t"
"punpcklwd %%mm2, %%mm1 \n\t"
"movq %%mm0, %%mm5 \n\t"
"paddsw %%mm1, %%mm0 \n\t"
"psubsw %%mm1, %%mm5 \n\t"
"movq %%mm0, %%mm2 \n\t"
"punpckldq %%mm5, %%mm0 \n\t"
"punpckhdq %%mm5, %%mm2 \n\t"
"movq 0(%1), %%mm1 \n\t"
"movq 8(%1), %%mm3 \n\t"
"movq 16(%1), %%mm4 \n\t"
"movq 24(%1), %%mm5 \n\t"
"movq 32(%1), %%mm6 \n\t"
"movq 40(%1), %%mm7 \n\t"
"pmaddwd %%mm0, %%mm1 \n\t"
"pmaddwd %%mm2, %%mm3 \n\t"
"pmaddwd %%mm0, %%mm4 \n\t"
"pmaddwd %%mm2, %%mm5 \n\t"
"pmaddwd %%mm0, %%mm6 \n\t"
"pmaddwd %%mm2, %%mm7 \n\t"
"pmaddwd 48(%1), %%mm0 \n\t"
"pmaddwd 56(%1), %%mm2 \n\t"
"paddd %%mm1, %%mm3 \n\t"
"paddd %%mm4, %%mm5 \n\t"
"paddd %%mm6, %%mm7 \n\t"
"paddd %%mm0, %%mm2 \n\t"
"movq (%2), %%mm0 \n\t"
"paddd %%mm0, %%mm3 \n\t"
"paddd %%mm0, %%mm5 \n\t"
"paddd %%mm0, %%mm7 \n\t"
"paddd %%mm0, %%mm2 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t"
"packssdw %%mm5, %%mm3 \n\t"
"packssdw %%mm2, %%mm7 \n\t"
"movq %%mm3, 0(%3) \n\t"
"movq %%mm7, 8(%3) \n\t"
:
: "r" (in), "r" (table), "r" (fdct_r_row), "r" (out));
}
void ff_fdct_mmx(int16_t *block)
{
DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
int16_t * block1= (int16_t*)align_tmp;
const int16_t *table= tab_frw_01234567;
int i;
fdct_col_mmx(block, block1, 0);
fdct_col_mmx(block, block1, 4);
for(i=8;i>0;i--) {
fdct_row_mmx(block1, block, table);
block1 += 8;
table += 32;
block += 8;
}
}
#endif /* HAVE_MMX_INLINE */
#if HAVE_MMXEXT_INLINE
void ff_fdct_mmxext(int16_t *block)
{
DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
int16_t *block1= (int16_t*)align_tmp;
const int16_t *table= tab_frw_01234567;
int i;
fdct_col_mmx(block, block1, 0);
fdct_col_mmx(block, block1, 4);
for(i=8;i>0;i--) {
fdct_row_mmxext(block1, block, table);
block1 += 8;
table += 32;
block += 8;
}
}
#endif /* HAVE_MMXEXT_INLINE */
#if HAVE_SSE2_INLINE
void ff_fdct_sse2(int16_t *block)
{
DECLARE_ALIGNED(16, int64_t, align_tmp)[16];
int16_t * const block1= (int16_t*)align_tmp;
fdct_col_sse2(block, block1, 0);
fdct_row_sse2(block1, block);
}
#endif /* HAVE_SSE2_INLINE */

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,38 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_FFT_H
#define AVCODEC_X86_FFT_H
#include "libavcodec/fft.h"
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_avx(FFTContext *s, FFTComplex *z);
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dnow(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dnowext(FFTContext *s, FFTComplex *z);
void ff_imdct_calc_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_calc_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
#endif /* AVCODEC_X86_FFT_H */

View File

@@ -0,0 +1,57 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "fft.h"
av_cold void ff_fft_init_x86(FFTContext *s)
{
int cpu_flags = av_get_cpu_flags();
#if ARCH_X86_32
if (EXTERNAL_AMD3DNOW(cpu_flags)) {
/* 3DNow! for K6-2/3 */
s->imdct_calc = ff_imdct_calc_3dnow;
s->imdct_half = ff_imdct_half_3dnow;
s->fft_calc = ff_fft_calc_3dnow;
}
if (EXTERNAL_AMD3DNOWEXT(cpu_flags)) {
/* 3DNowEx for K7 */
s->imdct_calc = ff_imdct_calc_3dnowext;
s->imdct_half = ff_imdct_half_3dnowext;
s->fft_calc = ff_fft_calc_3dnowext;
}
#endif
if (EXTERNAL_SSE(cpu_flags)) {
/* SSE for P3/P4/K8 */
s->imdct_calc = ff_imdct_calc_sse;
s->imdct_half = ff_imdct_half_sse;
s->fft_permute = ff_fft_permute_sse;
s->fft_calc = ff_fft_calc_sse;
s->fft_permutation = FF_FFT_PERM_SWAP_LSBS;
}
if (EXTERNAL_AVX(cpu_flags) && s->nbits >= 5) {
/* AVX for SB */
s->imdct_half = ff_imdct_half_avx;
s->fft_calc = ff_fft_calc_avx;
s->fft_permutation = FF_FFT_PERM_AVX;
}
}

View File

@@ -0,0 +1,429 @@
;******************************************************************************
;* x86 optimized Format Conversion Utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_TEXT
%macro CVTPS2PI 2
%if cpuflag(sse)
cvtps2pi %1, %2
%elif cpuflag(3dnow)
pf2id %1, %2
%endif
%endmacro
;---------------------------------------------------------------------------------
; void int32_to_float_fmul_scalar(float *dst, const int32_t *src, float mul, int len);
;---------------------------------------------------------------------------------
%macro INT32_TO_FLOAT_FMUL_SCALAR 1
%if UNIX64
cglobal int32_to_float_fmul_scalar, 3, 3, %1, dst, src, len
%else
cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul, len
%endif
%if WIN64
SWAP 0, 2
%elif ARCH_X86_32
movss m0, mulm
%endif
SPLATD m0
shl lenq, 2
add srcq, lenq
add dstq, lenq
neg lenq
.loop:
%if cpuflag(sse2)
cvtdq2ps m1, [srcq+lenq ]
cvtdq2ps m2, [srcq+lenq+16]
%else
cvtpi2ps m1, [srcq+lenq ]
cvtpi2ps m3, [srcq+lenq+ 8]
cvtpi2ps m2, [srcq+lenq+16]
cvtpi2ps m4, [srcq+lenq+24]
movlhps m1, m3
movlhps m2, m4
%endif
mulps m1, m0
mulps m2, m0
mova [dstq+lenq ], m1
mova [dstq+lenq+16], m2
add lenq, 32
jl .loop
REP_RET
%endmacro
INIT_XMM sse
INT32_TO_FLOAT_FMUL_SCALAR 5
INIT_XMM sse2
INT32_TO_FLOAT_FMUL_SCALAR 3
;------------------------------------------------------------------------------
; void ff_float_to_int16(int16_t *dst, const float *src, long len);
;------------------------------------------------------------------------------
%macro FLOAT_TO_INT16 1
cglobal float_to_int16, 3, 3, %1, dst, src, len
add lenq, lenq
lea srcq, [srcq+2*lenq]
add dstq, lenq
neg lenq
.loop:
%if cpuflag(sse2)
cvtps2dq m0, [srcq+2*lenq ]
cvtps2dq m1, [srcq+2*lenq+16]
packssdw m0, m1
mova [dstq+lenq], m0
%else
CVTPS2PI m0, [srcq+2*lenq ]
CVTPS2PI m1, [srcq+2*lenq+ 8]
CVTPS2PI m2, [srcq+2*lenq+16]
CVTPS2PI m3, [srcq+2*lenq+24]
packssdw m0, m1
packssdw m2, m3
mova [dstq+lenq ], m0
mova [dstq+lenq+8], m2
%endif
add lenq, 16
js .loop
%if mmsize == 8
emms
%endif
REP_RET
%endmacro
INIT_XMM sse2
FLOAT_TO_INT16 2
INIT_MMX sse
FLOAT_TO_INT16 0
INIT_MMX 3dnow
FLOAT_TO_INT16 0
;------------------------------------------------------------------------------
; void ff_float_to_int16_step(int16_t *dst, const float *src, long len, long step);
;------------------------------------------------------------------------------
%macro FLOAT_TO_INT16_STEP 1
cglobal float_to_int16_step, 4, 7, %1, dst, src, len, step, step3, v1, v2
add lenq, lenq
lea srcq, [srcq+2*lenq]
lea step3q, [stepq*3]
neg lenq
.loop:
%if cpuflag(sse2)
cvtps2dq m0, [srcq+2*lenq ]
cvtps2dq m1, [srcq+2*lenq+16]
packssdw m0, m1
movd v1d, m0
psrldq m0, 4
movd v2d, m0
psrldq m0, 4
mov [dstq], v1w
mov [dstq+stepq*4], v2w
shr v1d, 16
shr v2d, 16
mov [dstq+stepq*2], v1w
mov [dstq+step3q*2], v2w
lea dstq, [dstq+stepq*8]
movd v1d, m0
psrldq m0, 4
movd v2d, m0
mov [dstq], v1w
mov [dstq+stepq*4], v2w
shr v1d, 16
shr v2d, 16
mov [dstq+stepq*2], v1w
mov [dstq+step3q*2], v2w
lea dstq, [dstq+stepq*8]
%else
CVTPS2PI m0, [srcq+2*lenq ]
CVTPS2PI m1, [srcq+2*lenq+ 8]
CVTPS2PI m2, [srcq+2*lenq+16]
CVTPS2PI m3, [srcq+2*lenq+24]
packssdw m0, m1
packssdw m2, m3
movd v1d, m0
psrlq m0, 32
movd v2d, m0
mov [dstq], v1w
mov [dstq+stepq*4], v2w
shr v1d, 16
shr v2d, 16
mov [dstq+stepq*2], v1w
mov [dstq+step3q*2], v2w
lea dstq, [dstq+stepq*8]
movd v1d, m2
psrlq m2, 32
movd v2d, m2
mov [dstq], v1w
mov [dstq+stepq*4], v2w
shr v1d, 16
shr v2d, 16
mov [dstq+stepq*2], v1w
mov [dstq+step3q*2], v2w
lea dstq, [dstq+stepq*8]
%endif
add lenq, 16
js .loop
%if mmsize == 8
emms
%endif
REP_RET
%endmacro
INIT_XMM sse2
FLOAT_TO_INT16_STEP 2
INIT_MMX sse
FLOAT_TO_INT16_STEP 0
INIT_MMX 3dnow
FLOAT_TO_INT16_STEP 0
;-------------------------------------------------------------------------------
; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
;-------------------------------------------------------------------------------
%macro FLOAT_TO_INT16_INTERLEAVE2 0
cglobal float_to_int16_interleave2, 3, 4, 2, dst, src0, src1, len
lea lenq, [4*r2q]
mov src1q, [src0q+gprsize]
mov src0q, [src0q]
add dstq, lenq
add src0q, lenq
add src1q, lenq
neg lenq
.loop:
%if cpuflag(sse2)
cvtps2dq m0, [src0q+lenq]
cvtps2dq m1, [src1q+lenq]
packssdw m0, m1
movhlps m1, m0
punpcklwd m0, m1
mova [dstq+lenq], m0
%else
CVTPS2PI m0, [src0q+lenq ]
CVTPS2PI m1, [src0q+lenq+8]
CVTPS2PI m2, [src1q+lenq ]
CVTPS2PI m3, [src1q+lenq+8]
packssdw m0, m1
packssdw m2, m3
mova m1, m0
punpcklwd m0, m2
punpckhwd m1, m2
mova [dstq+lenq ], m0
mova [dstq+lenq+8], m1
%endif
add lenq, 16
js .loop
%if mmsize == 8
emms
%endif
REP_RET
%endmacro
INIT_MMX 3dnow
FLOAT_TO_INT16_INTERLEAVE2
INIT_MMX sse
FLOAT_TO_INT16_INTERLEAVE2
INIT_XMM sse2
FLOAT_TO_INT16_INTERLEAVE2
%macro FLOAT_TO_INT16_INTERLEAVE6 0
; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
cglobal float_to_int16_interleave6, 2, 8, 0, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
mov lend, r2d
%else
%define lend dword r2m
%endif
mov src1q, [srcq+1*gprsize]
mov src2q, [srcq+2*gprsize]
mov src3q, [srcq+3*gprsize]
mov src4q, [srcq+4*gprsize]
mov src5q, [srcq+5*gprsize]
mov srcq, [srcq]
sub src1q, srcq
sub src2q, srcq
sub src3q, srcq
sub src4q, srcq
sub src5q, srcq
.loop:
CVTPS2PI mm0, [srcq]
CVTPS2PI mm1, [srcq+src1q]
CVTPS2PI mm2, [srcq+src2q]
CVTPS2PI mm3, [srcq+src3q]
CVTPS2PI mm4, [srcq+src4q]
CVTPS2PI mm5, [srcq+src5q]
packssdw mm0, mm3
packssdw mm1, mm4
packssdw mm2, mm5
PSWAPD mm3, mm0
punpcklwd mm0, mm1
punpckhwd mm1, mm2
punpcklwd mm2, mm3
PSWAPD mm3, mm0
punpckldq mm0, mm2
punpckhdq mm2, mm1
punpckldq mm1, mm3
movq [dstq ], mm0
movq [dstq+16], mm2
movq [dstq+ 8], mm1
add srcq, 8
add dstq, 24
sub lend, 2
jg .loop
emms
RET
%endmacro ; FLOAT_TO_INT16_INTERLEAVE6
INIT_MMX sse
FLOAT_TO_INT16_INTERLEAVE6
INIT_MMX 3dnow
FLOAT_TO_INT16_INTERLEAVE6
INIT_MMX 3dnowext
FLOAT_TO_INT16_INTERLEAVE6
;-----------------------------------------------------------------------------
; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
;-----------------------------------------------------------------------------
%macro FLOAT_INTERLEAVE6 1
cglobal float_interleave6, 2, 8, %1, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
mov lend, r2d
%else
%define lend dword r2m
%endif
mov src1q, [srcq+1*gprsize]
mov src2q, [srcq+2*gprsize]
mov src3q, [srcq+3*gprsize]
mov src4q, [srcq+4*gprsize]
mov src5q, [srcq+5*gprsize]
mov srcq, [srcq]
sub src1q, srcq
sub src2q, srcq
sub src3q, srcq
sub src4q, srcq
sub src5q, srcq
.loop:
%if cpuflag(sse)
movaps m0, [srcq]
movaps m1, [srcq+src1q]
movaps m2, [srcq+src2q]
movaps m3, [srcq+src3q]
movaps m4, [srcq+src4q]
movaps m5, [srcq+src5q]
SBUTTERFLYPS 0, 1, 6
SBUTTERFLYPS 2, 3, 6
SBUTTERFLYPS 4, 5, 6
movaps m6, m4
shufps m4, m0, 0xe4
movlhps m0, m2
movhlps m6, m2
movaps [dstq ], m0
movaps [dstq+16], m4
movaps [dstq+32], m6
movaps m6, m5
shufps m5, m1, 0xe4
movlhps m1, m3
movhlps m6, m3
movaps [dstq+48], m1
movaps [dstq+64], m5
movaps [dstq+80], m6
%else ; mmx
movq m0, [srcq]
movq m1, [srcq+src1q]
movq m2, [srcq+src2q]
movq m3, [srcq+src3q]
movq m4, [srcq+src4q]
movq m5, [srcq+src5q]
SBUTTERFLY dq, 0, 1, 6
SBUTTERFLY dq, 2, 3, 6
SBUTTERFLY dq, 4, 5, 6
movq [dstq ], m0
movq [dstq+ 8], m2
movq [dstq+16], m4
movq [dstq+24], m1
movq [dstq+32], m3
movq [dstq+40], m5
%endif
add srcq, mmsize
add dstq, mmsize*6
sub lend, mmsize/4
jg .loop
%if mmsize == 8
emms
%endif
REP_RET
%endmacro
INIT_MMX mmx
FLOAT_INTERLEAVE6 0
INIT_XMM sse
FLOAT_INTERLEAVE6 7
;-----------------------------------------------------------------------------
; void ff_float_interleave2(float *dst, const float **src, unsigned int len);
;-----------------------------------------------------------------------------
%macro FLOAT_INTERLEAVE2 1
cglobal float_interleave2, 3, 4, %1, dst, src, len, src1
mov src1q, [srcq+gprsize]
mov srcq, [srcq ]
sub src1q, srcq
.loop:
mova m0, [srcq ]
mova m1, [srcq+src1q ]
mova m3, [srcq +mmsize]
mova m4, [srcq+src1q+mmsize]
mova m2, m0
PUNPCKLDQ m0, m1
PUNPCKHDQ m2, m1
mova m1, m3
PUNPCKLDQ m3, m4
PUNPCKHDQ m1, m4
mova [dstq ], m0
mova [dstq+1*mmsize], m2
mova [dstq+2*mmsize], m3
mova [dstq+3*mmsize], m1
add srcq, mmsize*2
add dstq, mmsize*4
sub lend, mmsize/2
jg .loop
%if mmsize == 8
emms
%endif
REP_RET
%endmacro
INIT_MMX mmx
%define PUNPCKLDQ punpckldq
%define PUNPCKHDQ punpckhdq
FLOAT_INTERLEAVE2 0
INIT_XMM sse
%define PUNPCKLDQ unpcklps
%define PUNPCKHDQ unpckhps
FLOAT_INTERLEAVE2 5

View File

@@ -0,0 +1,147 @@
/*
* Format Conversion Utils
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/fmtconvert.h"
#if HAVE_YASM
void ff_int32_to_float_fmul_scalar_sse (float *dst, const int32_t *src, float mul, int len);
void ff_int32_to_float_fmul_scalar_sse2(float *dst, const int32_t *src, float mul, int len);
void ff_float_to_int16_3dnow(int16_t *dst, const float *src, long len);
void ff_float_to_int16_sse (int16_t *dst, const float *src, long len);
void ff_float_to_int16_sse2 (int16_t *dst, const float *src, long len);
void ff_float_to_int16_step_3dnow(int16_t *dst, const float *src, long len, long step);
void ff_float_to_int16_step_sse (int16_t *dst, const float *src, long len, long step);
void ff_float_to_int16_step_sse2 (int16_t *dst, const float *src, long len, long step);
void ff_float_to_int16_interleave2_3dnow(int16_t *dst, const float **src, long len);
void ff_float_to_int16_interleave2_sse (int16_t *dst, const float **src, long len);
void ff_float_to_int16_interleave2_sse2 (int16_t *dst, const float **src, long len);
void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dnowext(int16_t *dst, const float **src, int len);
#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
#define FLOAT_TO_INT16_INTERLEAVE(cpu) \
/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
int c;\
for(c=0; c<channels; c++){\
ff_float_to_int16_step_##cpu(dst+c, src[c], len, channels);\
}\
}\
\
static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
if(channels==1)\
ff_float_to_int16_##cpu(dst, src[0], len);\
else if(channels==2){\
ff_float_to_int16_interleave2_##cpu(dst, src, len);\
}else if(channels==6){\
ff_float_to_int16_interleave6_##cpu(dst, src, len);\
}else\
float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
}
FLOAT_TO_INT16_INTERLEAVE(3dnow)
FLOAT_TO_INT16_INTERLEAVE(sse)
FLOAT_TO_INT16_INTERLEAVE(sse2)
static void float_to_int16_interleave_3dnowext(int16_t *dst, const float **src,
long len, int channels)
{
if(channels==6)
ff_float_to_int16_interleave6_3dnowext(dst, src, len);
else
float_to_int16_interleave_3dnow(dst, src, len, channels);
}
void ff_float_interleave2_mmx(float *dst, const float **src, unsigned int len);
void ff_float_interleave2_sse(float *dst, const float **src, unsigned int len);
void ff_float_interleave6_mmx(float *dst, const float **src, unsigned int len);
void ff_float_interleave6_sse(float *dst, const float **src, unsigned int len);
static void float_interleave_mmx(float *dst, const float **src,
unsigned int len, int channels)
{
if (channels == 2) {
ff_float_interleave2_mmx(dst, src, len);
} else if (channels == 6)
ff_float_interleave6_mmx(dst, src, len);
else
ff_float_interleave_c(dst, src, len, channels);
}
static void float_interleave_sse(float *dst, const float **src,
unsigned int len, int channels)
{
if (channels == 2) {
ff_float_interleave2_sse(dst, src, len);
} else if (channels == 6)
ff_float_interleave6_sse(dst, src, len);
else
ff_float_interleave_c(dst, src, len, channels);
}
#endif /* HAVE_YASM */
av_cold void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
{
#if HAVE_YASM
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
c->float_interleave = float_interleave_mmx;
}
if (EXTERNAL_AMD3DNOW(cpu_flags)) {
if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
c->float_to_int16 = ff_float_to_int16_3dnow;
c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
}
}
if (EXTERNAL_AMD3DNOWEXT(cpu_flags)) {
if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
c->float_to_int16_interleave = float_to_int16_interleave_3dnowext;
}
}
if (EXTERNAL_SSE(cpu_flags)) {
c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse;
c->float_to_int16 = ff_float_to_int16_sse;
c->float_to_int16_interleave = float_to_int16_interleave_sse;
c->float_interleave = float_interleave_sse;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse2;
c->float_to_int16 = ff_float_to_int16_sse2;
c->float_to_int16_interleave = float_to_int16_interleave_sse2;
}
#endif /* HAVE_YASM */
}

View File

@@ -0,0 +1,106 @@
;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION .text
INIT_MMX mmxext
; void pixels(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PIXELS48 2
%if %2 == 4
%define OP movh
%else
%define OP mova
%endif
cglobal %1_pixels%2, 4,5
movsxdifnidn r2, r2d
lea r4, [r2*3]
.loop:
OP m0, [r1]
OP m1, [r1+r2]
OP m2, [r1+r2*2]
OP m3, [r1+r4]
lea r1, [r1+r2*4]
%ifidn %1, avg
pavgb m0, [r0]
pavgb m1, [r0+r2]
pavgb m2, [r0+r2*2]
pavgb m3, [r0+r4]
%endif
OP [r0], m0
OP [r0+r2], m1
OP [r0+r2*2], m2
OP [r0+r4], m3
sub r3d, 4
lea r0, [r0+r2*4]
jne .loop
RET
%endmacro
PIXELS48 put, 4
PIXELS48 avg, 4
PIXELS48 put, 8
PIXELS48 avg, 8
INIT_XMM sse2
; void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
cglobal put_pixels16, 4,5,4
lea r4, [r2*3]
.loop:
movu m0, [r1]
movu m1, [r1+r2]
movu m2, [r1+r2*2]
movu m3, [r1+r4]
lea r1, [r1+r2*4]
mova [r0], m0
mova [r0+r2], m1
mova [r0+r2*2], m2
mova [r0+r4], m3
sub r3d, 4
lea r0, [r0+r2*4]
jnz .loop
REP_RET
; void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
cglobal avg_pixels16, 4,5,4
lea r4, [r2*3]
.loop:
movu m0, [r1]
movu m1, [r1+r2]
movu m2, [r1+r2*2]
movu m3, [r1+r4]
lea r1, [r1+r2*4]
pavgb m0, [r0]
pavgb m1, [r0+r2]
pavgb m2, [r0+r2*2]
pavgb m3, [r0+r4]
mova [r0], m0
mova [r0+r2], m1
mova [r0+r2*2], m2
mova [r0+r4], m3
sub r3d, 4
lea r0, [r0+r2*4]
jnz .loop
REP_RET

View File

@@ -0,0 +1,139 @@
/*
* MMX-optimized avg/put pixel routines
*
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stddef.h>
#include <stdint.h>
#include "config.h"
#include "dsputil_x86.h"
#if HAVE_MMX_INLINE
// in case more speed is needed - unrolling would certainly help
void ff_avg_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
JUMPALIGN();
do {
__asm__ volatile(
"movq %0, %%mm0 \n\t"
"movq %1, %%mm1 \n\t"
PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
"movq %%mm2, %0 \n\t"
:"+m"(*block)
:"m"(*pixels)
:"memory");
pixels += line_size;
block += line_size;
}
while (--h);
}
void ff_avg_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
JUMPALIGN();
do {
__asm__ volatile(
"movq %0, %%mm0 \n\t"
"movq %1, %%mm1 \n\t"
PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
"movq %%mm2, %0 \n\t"
"movq 8%0, %%mm0 \n\t"
"movq 8%1, %%mm1 \n\t"
PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
"movq %%mm2, 8%0 \n\t"
:"+m"(*block)
:"m"(*pixels)
:"memory");
pixels += line_size;
block += line_size;
}
while (--h);
}
void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
{
__asm__ volatile (
"lea (%3, %3), %%"REG_a" \n\t"
".p2align 3 \n\t"
"1: \n\t"
"movq (%1 ), %%mm0 \n\t"
"movq (%1, %3), %%mm1 \n\t"
"movq %%mm0, (%2) \n\t"
"movq %%mm1, (%2, %3) \n\t"
"add %%"REG_a", %1 \n\t"
"add %%"REG_a", %2 \n\t"
"movq (%1 ), %%mm0 \n\t"
"movq (%1, %3), %%mm1 \n\t"
"movq %%mm0, (%2) \n\t"
"movq %%mm1, (%2, %3) \n\t"
"add %%"REG_a", %1 \n\t"
"add %%"REG_a", %2 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
: "+g"(h), "+r"(pixels), "+r"(block)
: "r"((x86_reg)line_size)
: "%"REG_a, "memory"
);
}
void ff_put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
{
__asm__ volatile (
"lea (%3, %3), %%"REG_a" \n\t"
".p2align 3 \n\t"
"1: \n\t"
"movq (%1 ), %%mm0 \n\t"
"movq 8(%1 ), %%mm4 \n\t"
"movq (%1, %3), %%mm1 \n\t"
"movq 8(%1, %3), %%mm5 \n\t"
"movq %%mm0, (%2) \n\t"
"movq %%mm4, 8(%2) \n\t"
"movq %%mm1, (%2, %3) \n\t"
"movq %%mm5, 8(%2, %3) \n\t"
"add %%"REG_a", %1 \n\t"
"add %%"REG_a", %2 \n\t"
"movq (%1 ), %%mm0 \n\t"
"movq 8(%1 ), %%mm4 \n\t"
"movq (%1, %3), %%mm1 \n\t"
"movq 8(%1, %3), %%mm5 \n\t"
"movq %%mm0, (%2) \n\t"
"movq %%mm4, 8(%2) \n\t"
"movq %%mm1, (%2, %3) \n\t"
"movq %%mm5, 8(%2, %3) \n\t"
"add %%"REG_a", %1 \n\t"
"add %%"REG_a", %2 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
: "+g"(h), "+r"(pixels), "+r"(block)
: "r"((x86_reg)line_size)
: "%"REG_a, "memory"
);
}
#endif /* HAVE_MMX_INLINE */

View File

@@ -0,0 +1,189 @@
;******************************************************************************
;* MMX-optimized H.263 loop filter
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
cextern pb_FC
cextern h263_loop_filter_strength
SECTION_TEXT
%macro H263_LOOP_FILTER 5
pxor m7, m7
mova m0, [%1]
mova m1, [%1]
mova m2, [%4]
mova m3, [%4]
punpcklbw m0, m7
punpckhbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
psubw m0, m2
psubw m1, m3
mova m2, [%2]
mova m3, [%2]
mova m4, [%3]
mova m5, [%3]
punpcklbw m2, m7
punpckhbw m3, m7
punpcklbw m4, m7
punpckhbw m5, m7
psubw m4, m2
psubw m5, m3
psllw m4, 2
psllw m5, 2
paddw m4, m0
paddw m5, m1
pxor m6, m6
pcmpgtw m6, m4
pcmpgtw m7, m5
pxor m4, m6
pxor m5, m7
psubw m4, m6
psubw m5, m7
psrlw m4, 3
psrlw m5, 3
packuswb m4, m5
packsswb m6, m7
pxor m7, m7
movd m2, %5
punpcklbw m2, m2
punpcklbw m2, m2
punpcklbw m2, m2
psubusb m2, m4
mova m3, m2
psubusb m3, m4
psubb m2, m3
mova m3, [%2]
mova m4, [%3]
pxor m3, m6
pxor m4, m6
paddusb m3, m2
psubusb m4, m2
pxor m3, m6
pxor m4, m6
paddusb m2, m2
packsswb m0, m1
pcmpgtb m7, m0
pxor m0, m7
psubb m0, m7
mova m1, m0
psubusb m0, m2
psubb m1, m0
pand m1, [pb_FC]
psrlw m1, 2
pxor m1, m7
psubb m1, m7
mova m5, [%1]
mova m6, [%4]
psubb m5, m1
paddb m6, m1
%endmacro
INIT_MMX mmx
; void h263_v_loop_filter(uint8_t *src, int stride, int qscale)
cglobal h263_v_loop_filter, 3,5
movsxdifnidn r1, r1d
movsxdifnidn r2, r2d
lea r4, [h263_loop_filter_strength]
movzx r3d, BYTE [r4+r2]
movsx r2, r3b
shl r2, 1
mov r3, r0
sub r3, r1
mov r4, r3
sub r4, r1
H263_LOOP_FILTER r4, r3, r0, r0+r1, r2d
mova [r3], m3
mova [r0], m4
mova [r4], m5
mova [r0+r1], m6
RET
%macro TRANSPOSE4X4 2
movd m0, [%1]
movd m1, [%1+r1]
movd m2, [%1+r1*2]
movd m3, [%1+r3]
punpcklbw m0, m1
punpcklbw m2, m3
mova m1, m0
punpcklwd m0, m2
punpckhwd m1, m2
movd [%2+ 0], m0
punpckhdq m0, m0
movd [%2+ 8], m0
movd [%2+16], m1
punpckhdq m1, m1
movd [%2+24], m1
%endmacro
; void h263_h_loop_filter(uint8_t *src, int stride, int qscale)
INIT_MMX mmx
cglobal h263_h_loop_filter, 3,5,0,32
movsxdifnidn r1, r1d
movsxdifnidn r2, r2d
lea r4, [h263_loop_filter_strength]
movzx r3d, BYTE [r4+r2]
movsx r2, r3b
shl r2, 1
sub r0, 2
lea r3, [r1*3]
TRANSPOSE4X4 r0, rsp
lea r4, [r0+r1*4]
TRANSPOSE4X4 r4, rsp+4
H263_LOOP_FILTER rsp, rsp+8, rsp+16, rsp+24, r2d
mova m1, m5
mova m0, m4
punpcklbw m5, m3
punpcklbw m4, m6
punpckhbw m1, m3
punpckhbw m0, m6
mova m3, m5
mova m6, m1
punpcklwd m5, m4
punpcklwd m1, m0
punpckhwd m3, m4
punpckhwd m6, m0
movd [r0], m5
punpckhdq m5, m5
movd [r0+r1*1], m5
movd [r0+r1*2], m3
punpckhdq m3, m3
movd [r0+r3], m3
movd [r4], m1
punpckhdq m1, m1
movd [r4+r1*1], m1
movd [r4+r1*2], m6
punpckhdq m6, m6
movd [r4+r3], m6
RET

View File

@@ -0,0 +1,678 @@
;******************************************************************************
;* MMX/SSSE3-optimized functions for H264 chroma MC
;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
;* 2005-2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
rnd_rv40_2d_tbl: times 4 dw 0
times 4 dw 16
times 4 dw 32
times 4 dw 16
times 4 dw 32
times 4 dw 28
times 4 dw 32
times 4 dw 28
times 4 dw 0
times 4 dw 32
times 4 dw 16
times 4 dw 32
times 4 dw 32
times 4 dw 28
times 4 dw 32
times 4 dw 28
rnd_rv40_1d_tbl: times 4 dw 0
times 4 dw 2
times 4 dw 4
times 4 dw 2
times 4 dw 4
times 4 dw 3
times 4 dw 4
times 4 dw 3
times 4 dw 0
times 4 dw 4
times 4 dw 2
times 4 dw 4
times 4 dw 4
times 4 dw 3
times 4 dw 4
times 4 dw 3
cextern pw_3
cextern pw_4
cextern pw_8
pw_28: times 8 dw 28
cextern pw_32
cextern pw_64
SECTION .text
%macro mv0_pixels_mc8 0
lea r4, [r2*2 ]
.next4rows:
movq mm0, [r1 ]
movq mm1, [r1+r2]
add r1, r4
CHROMAMC_AVG mm0, [r0 ]
CHROMAMC_AVG mm1, [r0+r2]
movq [r0 ], mm0
movq [r0+r2], mm1
add r0, r4
movq mm0, [r1 ]
movq mm1, [r1+r2]
add r1, r4
CHROMAMC_AVG mm0, [r0 ]
CHROMAMC_AVG mm1, [r0+r2]
movq [r0 ], mm0
movq [r0+r2], mm1
add r0, r4
sub r3d, 4
jne .next4rows
%endmacro
%macro chroma_mc8_mmx_func 2-3
%ifidn %2, rv40
%ifdef PIC
%define rnd_1d_rv40 r8
%define rnd_2d_rv40 r8
%define extra_regs 2
%else ; no-PIC
%define rnd_1d_rv40 rnd_rv40_1d_tbl
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%define extra_regs 1
%endif ; PIC
%else
%define extra_regs 0
%endif ; rv40
; put/avg_h264_chroma_mc8_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/,
; int stride, int h, int mx, int my)
cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0
%if ARCH_X86_64
movsxd r2, r2d
%endif
mov r6d, r5d
or r6d, r4d
jne .at_least_one_non_zero
; mx == 0 AND my == 0 - no filter needed
mv0_pixels_mc8
REP_RET
.at_least_one_non_zero:
%ifidn %2, rv40
%if ARCH_X86_64
mov r7, r5
and r7, 6 ; &~1 for mx/my=[0,7]
lea r7, [r7*4+r4]
sar r7d, 1
%define rnd_bias r7
%define dest_reg r0
%else ; x86-32
mov r0, r5
and r0, 6 ; &~1 for mx/my=[0,7]
lea r0, [r0*4+r4]
sar r0d, 1
%define rnd_bias r0
%define dest_reg r5
%endif
%else ; vc1, h264
%define rnd_bias 0
%define dest_reg r0
%endif
test r5d, r5d
mov r6, 1
je .my_is_zero
test r4d, r4d
mov r6, r2 ; dxy = x ? 1 : stride
jne .both_non_zero
.my_is_zero:
; mx == 0 XOR my == 0 - 1 dimensional filter only
or r4d, r5d ; x + y
%ifidn %2, rv40
%ifdef PIC
lea r8, [rnd_rv40_1d_tbl]
%endif
%if ARCH_X86_64 == 0
mov r5, r0m
%endif
%endif
movd m5, r4d
movq m4, [pw_8]
movq m6, [rnd_1d_%2+rnd_bias*8] ; mm6 = rnd >> 3
punpcklwd m5, m5
punpckldq m5, m5 ; mm5 = B = x
pxor m7, m7
psubw m4, m5 ; mm4 = A = 8-x
.next1drow:
movq m0, [r1 ] ; mm0 = src[0..7]
movq m2, [r1+r6] ; mm1 = src[1..8]
movq m1, m0
movq m3, m2
punpcklbw m0, m7
punpckhbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
pmullw m0, m4 ; [mm0,mm1] = A * src[0..7]
pmullw m1, m4
pmullw m2, m5 ; [mm2,mm3] = B * src[1..8]
pmullw m3, m5
paddw m0, m6
paddw m1, m6
paddw m0, m2
paddw m1, m3
psrlw m0, 3
psrlw m1, 3
packuswb m0, m1
CHROMAMC_AVG m0, [dest_reg]
movq [dest_reg], m0 ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3
add dest_reg, r2
add r1, r2
dec r3d
jne .next1drow
REP_RET
.both_non_zero: ; general case, bilinear
movd m4, r4d ; x
movd m6, r5d ; y
%ifidn %2, rv40
%ifdef PIC
lea r8, [rnd_rv40_2d_tbl]
%endif
%if ARCH_X86_64 == 0
mov r5, r0m
%endif
%endif
mov r6, rsp ; backup stack pointer
and rsp, ~(mmsize-1) ; align stack
sub rsp, 16 ; AA and DD
punpcklwd m4, m4
punpcklwd m6, m6
punpckldq m4, m4 ; mm4 = x words
punpckldq m6, m6 ; mm6 = y words
movq m5, m4
pmullw m4, m6 ; mm4 = x * y
psllw m5, 3
psllw m6, 3
movq m7, m5
paddw m7, m6
movq [rsp+8], m4 ; DD = x * y
psubw m5, m4 ; mm5 = B = 8x - xy
psubw m6, m4 ; mm6 = C = 8y - xy
paddw m4, [pw_64]
psubw m4, m7 ; mm4 = A = xy - (8x+8y) + 64
pxor m7, m7
movq [rsp ], m4
movq m0, [r1 ] ; mm0 = src[0..7]
movq m1, [r1+1] ; mm1 = src[1..8]
.next2drow:
add r1, r2
movq m2, m0
movq m3, m1
punpckhbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
pmullw m0, [rsp]
pmullw m2, [rsp]
pmullw m1, m5
pmullw m3, m5
paddw m2, m1 ; mm2 = A * src[0..3] + B * src[1..4]
paddw m3, m0 ; mm3 = A * src[4..7] + B * src[5..8]
movq m0, [r1]
movq m1, m0
punpcklbw m0, m7
punpckhbw m1, m7
pmullw m0, m6
pmullw m1, m6
paddw m2, m0
paddw m3, m1 ; [mm2,mm3] += C * src[0..7]
movq m1, [r1+1]
movq m0, m1
movq m4, m1
punpcklbw m0, m7
punpckhbw m4, m7
pmullw m0, [rsp+8]
pmullw m4, [rsp+8]
paddw m2, m0
paddw m3, m4 ; [mm2,mm3] += D * src[1..8]
movq m0, [r1]
paddw m2, [rnd_2d_%2+rnd_bias*8]
paddw m3, [rnd_2d_%2+rnd_bias*8]
psrlw m2, 6
psrlw m3, 6
packuswb m2, m3
CHROMAMC_AVG m2, [dest_reg]
movq [dest_reg], m2 ; dst[0..7] = ([mm2,mm3] + rnd) >> 6
add dest_reg, r2
dec r3d
jne .next2drow
mov rsp, r6 ; restore stack pointer
RET
%endmacro
%macro chroma_mc4_mmx_func 2
%define extra_regs 0
%ifidn %2, rv40
%ifdef PIC
%define extra_regs 1
%endif ; PIC
%endif ; rv40
cglobal %1_%2_chroma_mc4, 6, 6 + extra_regs, 0
%if ARCH_X86_64
movsxd r2, r2d
%endif
pxor m7, m7
movd m2, r4d ; x
movd m3, r5d ; y
movq m4, [pw_8]
movq m5, [pw_8]
punpcklwd m2, m2
punpcklwd m3, m3
punpcklwd m2, m2
punpcklwd m3, m3
psubw m4, m2
psubw m5, m3
%ifidn %2, rv40
%ifdef PIC
lea r6, [rnd_rv40_2d_tbl]
%define rnd_2d_rv40 r6
%else
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%endif
and r5, 6 ; &~1 for mx/my=[0,7]
lea r5, [r5*4+r4]
sar r5d, 1
%define rnd_bias r5
%else ; vc1, h264
%define rnd_bias 0
%endif
movd m0, [r1 ]
movd m6, [r1+1]
add r1, r2
punpcklbw m0, m7
punpcklbw m6, m7
pmullw m0, m4
pmullw m6, m2
paddw m6, m0
.next2rows:
movd m0, [r1 ]
movd m1, [r1+1]
add r1, r2
punpcklbw m0, m7
punpcklbw m1, m7
pmullw m0, m4
pmullw m1, m2
paddw m1, m0
movq m0, m1
pmullw m6, m5
pmullw m1, m3
paddw m6, [rnd_2d_%2+rnd_bias*8]
paddw m1, m6
psrlw m1, 6
packuswb m1, m1
CHROMAMC_AVG4 m1, m6, [r0]
movd [r0], m1
add r0, r2
movd m6, [r1 ]
movd m1, [r1+1]
add r1, r2
punpcklbw m6, m7
punpcklbw m1, m7
pmullw m6, m4
pmullw m1, m2
paddw m1, m6
movq m6, m1
pmullw m0, m5
pmullw m1, m3
paddw m0, [rnd_2d_%2+rnd_bias*8]
paddw m1, m0
psrlw m1, 6
packuswb m1, m1
CHROMAMC_AVG4 m1, m0, [r0]
movd [r0], m1
add r0, r2
sub r3d, 2
jnz .next2rows
REP_RET
%endmacro
%macro chroma_mc2_mmx_func 2
cglobal %1_%2_chroma_mc2, 6, 7, 0
%if ARCH_X86_64
movsxd r2, r2d
%endif
mov r6d, r4d
shl r4d, 16
sub r4d, r6d
add r4d, 8
imul r5d, r4d ; x*y<<16 | y*(8-x)
shl r4d, 3
sub r4d, r5d ; x*(8-y)<<16 | (8-x)*(8-y)
movd m5, r4d
movd m6, r5d
punpckldq m5, m5 ; mm5 = {A,B,A,B}
punpckldq m6, m6 ; mm6 = {C,D,C,D}
pxor m7, m7
movd m2, [r1]
punpcklbw m2, m7
pshufw m2, m2, 0x94 ; mm0 = src[0,1,1,2]
.nextrow:
add r1, r2
movq m1, m2
pmaddwd m1, m5 ; mm1 = A * src[0,1] + B * src[1,2]
movd m0, [r1]
punpcklbw m0, m7
pshufw m0, m0, 0x94 ; mm0 = src[0,1,1,2]
movq m2, m0
pmaddwd m0, m6
paddw m1, [rnd_2d_%2]
paddw m1, m0 ; mm1 += C * src[0,1] + D * src[1,2]
psrlw m1, 6
packssdw m1, m7
packuswb m1, m7
CHROMAMC_AVG4 m1, m3, [r0]
movd r5d, m1
mov [r0], r5w
add r0, r2
sub r3d, 1
jnz .nextrow
REP_RET
%endmacro
%define rnd_1d_h264 pw_4
%define rnd_2d_h264 pw_32
%define rnd_1d_vc1 pw_3
%define rnd_2d_vc1 pw_28
%macro NOTHING 2-3
%endmacro
%macro DIRECT_AVG 2
PAVGB %1, %2
%endmacro
%macro COPY_AVG 3
movd %2, %3
PAVGB %1, %2
%endmacro
INIT_MMX mmx
%define CHROMAMC_AVG NOTHING
%define CHROMAMC_AVG4 NOTHING
chroma_mc8_mmx_func put, h264, _rnd
chroma_mc8_mmx_func put, vc1, _nornd
chroma_mc8_mmx_func put, rv40
chroma_mc4_mmx_func put, h264
chroma_mc4_mmx_func put, rv40
INIT_MMX mmxext
chroma_mc2_mmx_func put, h264
%define CHROMAMC_AVG DIRECT_AVG
%define CHROMAMC_AVG4 COPY_AVG
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1, _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40
chroma_mc2_mmx_func avg, h264
INIT_MMX 3dnow
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1, _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40
%macro chroma_mc8_ssse3_func 2-3
cglobal %1_%2_chroma_mc8%3, 6, 7, 8
%if ARCH_X86_64
movsxd r2, r2d
%endif
mov r6d, r5d
or r6d, r4d
jne .at_least_one_non_zero
; mx == 0 AND my == 0 - no filter needed
mv0_pixels_mc8
REP_RET
.at_least_one_non_zero:
test r5d, r5d
je .my_is_zero
test r4d, r4d
je .mx_is_zero
; general case, bilinear
mov r6d, r4d
shl r4d, 8
sub r4, r6
mov r6, 8
add r4, 8 ; x*288+8 = x<<8 | (8-x)
sub r6d, r5d
imul r6, r4 ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x)
movd m7, r6d
movd m6, r4d
movdqa m5, [rnd_2d_%2]
movq m0, [r1 ]
movq m1, [r1+1]
pshuflw m7, m7, 0
pshuflw m6, m6, 0
punpcklbw m0, m1
movlhps m7, m7
movlhps m6, m6
.next2rows:
movq m1, [r1+r2*1 ]
movq m2, [r1+r2*1+1]
movq m3, [r1+r2*2 ]
movq m4, [r1+r2*2+1]
lea r1, [r1+r2*2]
punpcklbw m1, m2
movdqa m2, m1
punpcklbw m3, m4
movdqa m4, m3
pmaddubsw m0, m7
pmaddubsw m1, m6
pmaddubsw m2, m7
pmaddubsw m3, m6
paddw m0, m5
paddw m2, m5
paddw m1, m0
paddw m3, m2
psrlw m1, 6
movdqa m0, m4
psrlw m3, 6
%ifidn %1, avg
movq m2, [r0 ]
movhps m2, [r0+r2]
%endif
packuswb m1, m3
CHROMAMC_AVG m1, m2
movq [r0 ], m1
movhps [r0+r2], m1
sub r3d, 2
lea r0, [r0+r2*2]
jg .next2rows
REP_RET
.my_is_zero:
mov r5d, r4d
shl r4d, 8
add r4, 8
sub r4, r5 ; 255*x+8 = x<<8 | (8-x)
movd m7, r4d
movdqa m6, [rnd_1d_%2]
pshuflw m7, m7, 0
movlhps m7, m7
.next2xrows:
movq m0, [r1 ]
movq m1, [r1 +1]
movq m2, [r1+r2 ]
movq m3, [r1+r2+1]
punpcklbw m0, m1
punpcklbw m2, m3
pmaddubsw m0, m7
pmaddubsw m2, m7
%ifidn %1, avg
movq m4, [r0 ]
movhps m4, [r0+r2]
%endif
paddw m0, m6
paddw m2, m6
psrlw m0, 3
psrlw m2, 3
packuswb m0, m2
CHROMAMC_AVG m0, m4
movq [r0 ], m0
movhps [r0+r2], m0
sub r3d, 2
lea r0, [r0+r2*2]
lea r1, [r1+r2*2]
jg .next2xrows
REP_RET
.mx_is_zero:
mov r4d, r5d
shl r5d, 8
add r5, 8
sub r5, r4 ; 255*y+8 = y<<8 | (8-y)
movd m7, r5d
movdqa m6, [rnd_1d_%2]
pshuflw m7, m7, 0
movlhps m7, m7
.next2yrows:
movq m0, [r1 ]
movq m1, [r1+r2 ]
movdqa m2, m1
movq m3, [r1+r2*2]
lea r1, [r1+r2*2]
punpcklbw m0, m1
punpcklbw m2, m3
pmaddubsw m0, m7
pmaddubsw m2, m7
%ifidn %1, avg
movq m4, [r0 ]
movhps m4, [r0+r2]
%endif
paddw m0, m6
paddw m2, m6
psrlw m0, 3
psrlw m2, 3
packuswb m0, m2
CHROMAMC_AVG m0, m4
movq [r0 ], m0
movhps [r0+r2], m0
sub r3d, 2
lea r0, [r0+r2*2]
jg .next2yrows
REP_RET
%endmacro
%macro chroma_mc4_ssse3_func 2
cglobal %1_%2_chroma_mc4, 6, 7, 0
%if ARCH_X86_64
movsxd r2, r2d
%endif
mov r6, r4
shl r4d, 8
sub r4d, r6d
mov r6, 8
add r4d, 8 ; x*288+8
sub r6d, r5d
imul r6d, r4d ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x)
movd m7, r6d
movd m6, r4d
movq m5, [pw_32]
movd m0, [r1 ]
pshufw m7, m7, 0
punpcklbw m0, [r1+1]
pshufw m6, m6, 0
.next2rows:
movd m1, [r1+r2*1 ]
movd m3, [r1+r2*2 ]
punpcklbw m1, [r1+r2*1+1]
punpcklbw m3, [r1+r2*2+1]
lea r1, [r1+r2*2]
movq m2, m1
movq m4, m3
pmaddubsw m0, m7
pmaddubsw m1, m6
pmaddubsw m2, m7
pmaddubsw m3, m6
paddw m0, m5
paddw m2, m5
paddw m1, m0
paddw m3, m2
psrlw m1, 6
movq m0, m4
psrlw m3, 6
packuswb m1, m1
packuswb m3, m3
CHROMAMC_AVG m1, [r0 ]
CHROMAMC_AVG m3, [r0+r2]
movd [r0 ], m1
movd [r0+r2], m3
sub r3d, 2
lea r0, [r0+r2*2]
jg .next2rows
REP_RET
%endmacro
%define CHROMAMC_AVG NOTHING
INIT_XMM ssse3
chroma_mc8_ssse3_func put, h264, _rnd
chroma_mc8_ssse3_func put, vc1, _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func put, h264
%define CHROMAMC_AVG DIRECT_AVG
INIT_XMM ssse3
chroma_mc8_ssse3_func avg, h264, _rnd
chroma_mc8_ssse3_func avg, vc1, _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func avg, h264

View File

@@ -0,0 +1,271 @@
;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 chroma MC code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
cextern pw_4
cextern pw_8
cextern pw_32
cextern pw_64
SECTION .text
%macro MV0_PIXELS_MC8 0
lea r4, [r2*3 ]
lea r5, [r2*4 ]
.next4rows:
movu m0, [r1 ]
movu m1, [r1+r2 ]
CHROMAMC_AVG m0, [r0 ]
CHROMAMC_AVG m1, [r0+r2 ]
mova [r0 ], m0
mova [r0+r2 ], m1
movu m0, [r1+r2*2]
movu m1, [r1+r4 ]
CHROMAMC_AVG m0, [r0+r2*2]
CHROMAMC_AVG m1, [r0+r4 ]
mova [r0+r2*2], m0
mova [r0+r4 ], m1
add r1, r5
add r0, r5
sub r3d, 4
jne .next4rows
%endmacro
;-----------------------------------------------------------------------------
; void put/avg_h264_chroma_mc8(pixel *dst, pixel *src, int stride, int h, int mx, int my)
;-----------------------------------------------------------------------------
%macro CHROMA_MC8 1
; put/avg_h264_chroma_mc8_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/,
; int stride, int h, int mx, int my)
cglobal %1_h264_chroma_mc8_10, 6,7,8
movsxdifnidn r2, r2d
mov r6d, r5d
or r6d, r4d
jne .at_least_one_non_zero
; mx == 0 AND my == 0 - no filter needed
MV0_PIXELS_MC8
REP_RET
.at_least_one_non_zero:
mov r6d, 2
test r5d, r5d
je .x_interpolation
mov r6, r2 ; dxy = x ? 1 : stride
test r4d, r4d
jne .xy_interpolation
.x_interpolation:
; mx == 0 XOR my == 0 - 1 dimensional filter only
or r4d, r5d ; x + y
movd m5, r4d
mova m4, [pw_8]
mova m6, [pw_4] ; mm6 = rnd >> 3
SPLATW m5, m5 ; mm5 = B = x
psubw m4, m5 ; mm4 = A = 8-x
.next1drow:
movu m0, [r1 ] ; mm0 = src[0..7]
movu m2, [r1+r6] ; mm2 = src[1..8]
pmullw m0, m4 ; mm0 = A * src[0..7]
pmullw m2, m5 ; mm2 = B * src[1..8]
paddw m0, m6
paddw m0, m2
psrlw m0, 3
CHROMAMC_AVG m0, [r0]
mova [r0], m0 ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3
add r0, r2
add r1, r2
dec r3d
jne .next1drow
REP_RET
.xy_interpolation: ; general case, bilinear
movd m4, r4m ; x
movd m6, r5m ; y
SPLATW m4, m4 ; mm4 = x words
SPLATW m6, m6 ; mm6 = y words
psllw m5, m4, 3 ; mm5 = 8x
pmullw m4, m6 ; mm4 = x * y
psllw m6, 3 ; mm6 = 8y
paddw m1, m5, m6 ; mm7 = 8x+8y
mova m7, m4 ; DD = x * y
psubw m5, m4 ; mm5 = B = 8x - xy
psubw m6, m4 ; mm6 = C = 8y - xy
paddw m4, [pw_64]
psubw m4, m1 ; mm4 = A = xy - (8x+8y) + 64
movu m0, [r1 ] ; mm0 = src[0..7]
movu m1, [r1+2] ; mm1 = src[1..8]
.next2drow:
add r1, r2
pmullw m2, m0, m4
pmullw m1, m5
paddw m2, m1 ; mm2 = A * src[0..7] + B * src[1..8]
movu m0, [r1]
movu m1, [r1+2]
pmullw m3, m0, m6
paddw m2, m3 ; mm2 += C * src[0..7+strde]
pmullw m3, m1, m7
paddw m2, m3 ; mm2 += D * src[1..8+strde]
paddw m2, [pw_32]
psrlw m2, 6
CHROMAMC_AVG m2, [r0]
mova [r0], m2 ; dst[0..7] = (mm2 + 32) >> 6
add r0, r2
dec r3d
jne .next2drow
REP_RET
%endmacro
;-----------------------------------------------------------------------------
; void put/avg_h264_chroma_mc4(pixel *dst, pixel *src, int stride, int h, int mx, int my)
;-----------------------------------------------------------------------------
;TODO: xmm mc4
%macro MC4_OP 2
movq %1, [r1 ]
movq m1, [r1+2]
add r1, r2
pmullw %1, m4
pmullw m1, m2
paddw m1, %1
mova %1, m1
pmullw %2, m5
pmullw m1, m3
paddw %2, [pw_32]
paddw m1, %2
psrlw m1, 6
CHROMAMC_AVG m1, %2, [r0]
movq [r0], m1
add r0, r2
%endmacro
%macro CHROMA_MC4 1
cglobal %1_h264_chroma_mc4_10, 6,6,7
movsxdifnidn r2, r2d
movd m2, r4m ; x
movd m3, r5m ; y
mova m4, [pw_8]
mova m5, m4
SPLATW m2, m2
SPLATW m3, m3
psubw m4, m2
psubw m5, m3
movq m0, [r1 ]
movq m6, [r1+2]
add r1, r2
pmullw m0, m4
pmullw m6, m2
paddw m6, m0
.next2rows:
MC4_OP m0, m6
MC4_OP m6, m0
sub r3d, 2
jnz .next2rows
REP_RET
%endmacro
;-----------------------------------------------------------------------------
; void put/avg_h264_chroma_mc2(pixel *dst, pixel *src, int stride, int h, int mx, int my)
;-----------------------------------------------------------------------------
%macro CHROMA_MC2 1
cglobal %1_h264_chroma_mc2_10, 6,7
movsxdifnidn r2, r2d
mov r6d, r4d
shl r4d, 16
sub r4d, r6d
add r4d, 8
imul r5d, r4d ; x*y<<16 | y*(8-x)
shl r4d, 3
sub r4d, r5d ; x*(8-y)<<16 | (8-x)*(8-y)
movd m5, r4d
movd m6, r5d
punpckldq m5, m5 ; mm5 = {A,B,A,B}
punpckldq m6, m6 ; mm6 = {C,D,C,D}
pxor m7, m7
pshufw m2, [r1], 0x94 ; mm0 = src[0,1,1,2]
.nextrow:
add r1, r2
movq m1, m2
pmaddwd m1, m5 ; mm1 = A * src[0,1] + B * src[1,2]
pshufw m0, [r1], 0x94 ; mm0 = src[0,1,1,2]
movq m2, m0
pmaddwd m0, m6
paddw m1, [pw_32]
paddw m1, m0 ; mm1 += C * src[0,1] + D * src[1,2]
psrlw m1, 6
packssdw m1, m7
CHROMAMC_AVG m1, m3, [r0]
movd [r0], m1
add r0, r2
dec r3d
jnz .nextrow
REP_RET
%endmacro
%macro NOTHING 2-3
%endmacro
%macro AVG 2-3
%if %0==3
movq %2, %3
%endif
pavgw %1, %2
%endmacro
%define CHROMAMC_AVG NOTHING
INIT_XMM sse2
CHROMA_MC8 put
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CHROMA_MC8 put
%endif
INIT_MMX mmxext
CHROMA_MC4 put
CHROMA_MC2 put
%define CHROMAMC_AVG AVG
INIT_XMM sse2
CHROMA_MC8 avg
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CHROMA_MC8 avg
%endif
INIT_MMX mmxext
CHROMA_MC4 avg
CHROMA_MC2 avg

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,923 @@
;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Oskar Arvidsson <oskar@irock.se>
;* Loren Merritt <lorenm@u.washington.edu>
;* Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
pw_pixel_max: times 8 dw ((1 << 10)-1)
SECTION .text
cextern pw_2
cextern pw_3
cextern pw_4
; out: %4 = |%1-%2|-%3
; clobbers: %5
%macro ABS_SUB 5
psubusw %5, %2, %1
psubusw %4, %1, %2
por %4, %5
psubw %4, %3
%endmacro
; out: %4 = |%1-%2|<%3
%macro DIFF_LT 5
psubusw %4, %2, %1
psubusw %5, %1, %2
por %5, %4 ; |%1-%2|
pxor %4, %4
psubw %5, %3 ; |%1-%2|-%3
pcmpgtw %4, %5 ; 0 > |%1-%2|-%3
%endmacro
%macro LOAD_AB 4
movd %1, %3
movd %2, %4
SPLATW %1, %1
SPLATW %2, %2
%endmacro
; in: %2=tc reg
; out: %1=splatted tc
%macro LOAD_TC 2
movd %1, [%2]
punpcklbw %1, %1
%if mmsize == 8
pshufw %1, %1, 0
%else
pshuflw %1, %1, 01010000b
pshufd %1, %1, 01010000b
%endif
psraw %1, 6
%endmacro
; in: %1=p1, %2=p0, %3=q0, %4=q1
; %5=alpha, %6=beta, %7-%9=tmp
; out: %7=mask
%macro LOAD_MASK 9
ABS_SUB %2, %3, %5, %8, %7 ; |p0-q0| - alpha
ABS_SUB %1, %2, %6, %9, %7 ; |p1-p0| - beta
pand %8, %9
ABS_SUB %3, %4, %6, %9, %7 ; |q1-q0| - beta
pxor %7, %7
pand %8, %9
pcmpgtw %7, %8
%endmacro
; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', m2=q0'
%macro DEBLOCK_P0_Q0 7
psubw %3, %4
pxor %7, %7
paddw %3, [pw_4]
psubw %7, %5
psubw %6, %2, %1
psllw %6, 2
paddw %3, %6
psraw %3, 3
mova %6, [pw_pixel_max]
CLIPW %3, %7, %5
pxor %7, %7
paddw %1, %3
psubw %2, %3
CLIPW %1, %7, %6
CLIPW %2, %7, %6
%endmacro
; in: %1=x2, %2=x1, %3=p0, %4=q0 %5=mask&tc, %6=tmp
%macro LUMA_Q1 6
pavgw %6, %3, %4 ; (p0+q0+1)>>1
paddw %1, %6
pxor %6, %6
psraw %1, 1
psubw %6, %5
psubw %1, %2
CLIPW %1, %6, %5
paddw %1, %2
%endmacro
%macro LUMA_DEBLOCK_ONE 3
DIFF_LT m5, %1, bm, m4, m6
pxor m6, m6
mova %3, m4
pcmpgtw m6, tcm
pand m4, tcm
pandn m6, m7
pand m4, m6
LUMA_Q1 m5, %2, m1, m2, m4, m6
%endmacro
%macro LUMA_H_STORE 2
%if mmsize == 8
movq [r0-4], m0
movq [r0+r1-4], m1
movq [r0+r1*2-4], m2
movq [r0+%2-4], m3
%else
movq [r0-4], m0
movhps [r0+r1-4], m0
movq [r0+r1*2-4], m1
movhps [%1-4], m1
movq [%1+r1-4], m2
movhps [%1+r1*2-4], m2
movq [%1+%2-4], m3
movhps [%1+r1*4-4], m3
%endif
%endmacro
%macro DEBLOCK_LUMA 0
;-----------------------------------------------------------------------------
; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_10, 5,5,8*(mmsize/16)
%assign pad 5*mmsize+12-(stack_offset&15)
%define tcm [rsp]
%define ms1 [rsp+mmsize]
%define ms2 [rsp+mmsize*2]
%define am [rsp+mmsize*3]
%define bm [rsp+mmsize*4]
SUB rsp, pad
shl r2d, 2
shl r3d, 2
LOAD_AB m4, m5, r2d, r3d
mov r3, 32/mmsize
mov r2, r0
sub r0, r1
mova am, m4
sub r0, r1
mova bm, m5
sub r0, r1
.loop:
mova m0, [r0+r1]
mova m1, [r0+r1*2]
mova m2, [r2]
mova m3, [r2+r1]
LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6
LOAD_TC m6, r4
mova tcm, m6
mova m5, [r0]
LUMA_DEBLOCK_ONE m1, m0, ms1
mova [r0+r1], m5
mova m5, [r2+r1*2]
LUMA_DEBLOCK_ONE m2, m3, ms2
mova [r2+r1], m5
pxor m5, m5
mova m6, tcm
pcmpgtw m5, tcm
psubw m6, ms1
pandn m5, m7
psubw m6, ms2
pand m5, m6
DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
mova [r0+r1*2], m1
mova [r2], m2
add r0, mmsize
add r2, mmsize
add r4, mmsize/8
dec r3
jg .loop
ADD rsp, pad
RET
cglobal deblock_h_luma_10, 5,6,8*(mmsize/16)
%assign pad 7*mmsize+12-(stack_offset&15)
%define tcm [rsp]
%define ms1 [rsp+mmsize]
%define ms2 [rsp+mmsize*2]
%define p1m [rsp+mmsize*3]
%define p2m [rsp+mmsize*4]
%define am [rsp+mmsize*5]
%define bm [rsp+mmsize*6]
SUB rsp, pad
shl r2d, 2
shl r3d, 2
LOAD_AB m4, m5, r2d, r3d
mov r3, r1
mova am, m4
add r3, r1
mov r5, 32/mmsize
mova bm, m5
add r3, r1
%if mmsize == 16
mov r2, r0
add r2, r3
%endif
.loop:
%if mmsize == 8
movq m2, [r0-8] ; y q2 q1 q0
movq m7, [r0+0]
movq m5, [r0+r1-8]
movq m3, [r0+r1+0]
movq m0, [r0+r1*2-8]
movq m6, [r0+r1*2+0]
movq m1, [r0+r3-8]
TRANSPOSE4x4W 2, 5, 0, 1, 4
SWAP 2, 7
movq m7, [r0+r3]
TRANSPOSE4x4W 2, 3, 6, 7, 4
%else
movu m5, [r0-8] ; y q2 q1 q0 p0 p1 p2 x
movu m0, [r0+r1-8]
movu m2, [r0+r1*2-8]
movu m3, [r2-8]
TRANSPOSE4x4W 5, 0, 2, 3, 6
mova tcm, m3
movu m4, [r2+r1-8]
movu m1, [r2+r1*2-8]
movu m3, [r2+r3-8]
movu m7, [r2+r1*4-8]
TRANSPOSE4x4W 4, 1, 3, 7, 6
mova m6, tcm
punpcklqdq m6, m7
punpckhqdq m5, m4
SBUTTERFLY qdq, 0, 1, 7
SBUTTERFLY qdq, 2, 3, 7
%endif
mova p2m, m6
LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6
LOAD_TC m6, r4
mova tcm, m6
LUMA_DEBLOCK_ONE m1, m0, ms1
mova p1m, m5
mova m5, p2m
LUMA_DEBLOCK_ONE m2, m3, ms2
mova p2m, m5
pxor m5, m5
mova m6, tcm
pcmpgtw m5, tcm
psubw m6, ms1
pandn m5, m7
psubw m6, ms2
pand m5, m6
DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
mova m0, p1m
mova m3, p2m
TRANSPOSE4x4W 0, 1, 2, 3, 4
LUMA_H_STORE r2, r3
add r4, mmsize/8
lea r0, [r0+r1*(mmsize/2)]
lea r2, [r2+r1*(mmsize/2)]
dec r5
jg .loop
ADD rsp, pad
RET
%endmacro
%if ARCH_X86_64
; in: m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2
; m12=alpha, m13=beta
; out: m0=p1', m3=q1', m1=p0', m2=q0'
; clobbers: m4, m5, m6, m7, m10, m11, m14
%macro DEBLOCK_LUMA_INTER_SSE2 0
LOAD_MASK m0, m1, m2, m3, m12, m13, m7, m4, m6
LOAD_TC m6, r4
DIFF_LT m8, m1, m13, m10, m4
DIFF_LT m9, m2, m13, m11, m4
pand m6, m7
mova m14, m6
pxor m4, m4
pcmpgtw m6, m4
pand m6, m14
mova m5, m10
pand m5, m6
LUMA_Q1 m8, m0, m1, m2, m5, m4
mova m5, m11
pand m5, m6
LUMA_Q1 m9, m3, m1, m2, m5, m4
pxor m4, m4
psubw m6, m10
pcmpgtw m4, m14
pandn m4, m7
psubw m6, m11
pand m4, m6
DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6
SWAP 0, 8
SWAP 3, 9
%endmacro
%macro DEBLOCK_LUMA_64 0
cglobal deblock_v_luma_10, 5,5,15
%define p2 m8
%define p1 m0
%define p0 m1
%define q0 m2
%define q1 m3
%define q2 m9
%define mask0 m7
%define mask1 m10
%define mask2 m11
shl r2d, 2
shl r3d, 2
LOAD_AB m12, m13, r2d, r3d
mov r2, r0
sub r0, r1
sub r0, r1
sub r0, r1
mov r3, 2
.loop:
mova p2, [r0]
mova p1, [r0+r1]
mova p0, [r0+r1*2]
mova q0, [r2]
mova q1, [r2+r1]
mova q2, [r2+r1*2]
DEBLOCK_LUMA_INTER_SSE2
mova [r0+r1], p1
mova [r0+r1*2], p0
mova [r2], q0
mova [r2+r1], q1
add r0, mmsize
add r2, mmsize
add r4, 2
dec r3
jg .loop
REP_RET
cglobal deblock_h_luma_10, 5,7,15
shl r2d, 2
shl r3d, 2
LOAD_AB m12, m13, r2d, r3d
mov r2, r1
add r2, r1
add r2, r1
mov r5, r0
add r5, r2
mov r6, 2
.loop:
movu m8, [r0-8] ; y q2 q1 q0 p0 p1 p2 x
movu m0, [r0+r1-8]
movu m2, [r0+r1*2-8]
movu m9, [r5-8]
movu m5, [r5+r1-8]
movu m1, [r5+r1*2-8]
movu m3, [r5+r2-8]
movu m7, [r5+r1*4-8]
TRANSPOSE4x4W 8, 0, 2, 9, 10
TRANSPOSE4x4W 5, 1, 3, 7, 10
punpckhqdq m8, m5
SBUTTERFLY qdq, 0, 1, 10
SBUTTERFLY qdq, 2, 3, 10
punpcklqdq m9, m7
DEBLOCK_LUMA_INTER_SSE2
TRANSPOSE4x4W 0, 1, 2, 3, 4
LUMA_H_STORE r5, r2
add r4, 2
lea r0, [r0+r1*8]
lea r5, [r5+r1*8]
dec r6
jg .loop
REP_RET
%endmacro
INIT_XMM sse2
DEBLOCK_LUMA_64
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_64
%endif
%endif
%macro SWAPMOVA 2
%ifid %1
SWAP %1, %2
%else
mova %1, %2
%endif
%endmacro
; in: t0-t2: tmp registers
; %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0
; %8=mask1p %9=2 %10=p0' %11=p1' %12=p2'
%macro LUMA_INTRA_P012 12 ; p0..p3 in memory
%if ARCH_X86_64
paddw t0, %3, %2
mova t2, %4
paddw t2, %3
%else
mova t0, %3
mova t2, %4
paddw t0, %2
paddw t2, %3
%endif
paddw t0, %1
paddw t2, t2
paddw t0, %5
paddw t2, %9
paddw t0, %9 ; (p2 + p1 + p0 + q0 + 2)
paddw t2, t0 ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4)
psrlw t2, 3
psrlw t1, t0, 2
psubw t2, %3
psubw t1, %2
pand t2, %8
pand t1, %8
paddw t2, %3
paddw t1, %2
SWAPMOVA %11, t1
psubw t1, t0, %3
paddw t0, t0
psubw t1, %5
psubw t0, %3
paddw t1, %6
paddw t1, %2
paddw t0, %6
psrlw t1, 2 ; (2*p1 + p0 + q1 + 2)/4
psrlw t0, 3 ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3
pxor t0, t1
pxor t1, %1
pand t0, %8
pand t1, %7
pxor t0, t1
pxor t0, %1
SWAPMOVA %10, t0
SWAPMOVA %12, t2
%endmacro
%macro LUMA_INTRA_INIT 1
%xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15)
%define t0 m4
%define t1 m5
%define t2 m6
%define t3 m7
%assign i 4
%rep %1
CAT_XDEFINE t, i, [rsp+mmsize*(i-4)]
%assign i i+1
%endrep
SUB rsp, pad
%endmacro
; in: %1-%3=tmp, %4=p2, %5=q2
%macro LUMA_INTRA_INTER 5
LOAD_AB t0, t1, r2d, r3d
mova %1, t0
LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3
%if ARCH_X86_64
mova %2, t0 ; mask0
psrlw t3, %1, 2
%else
mova t3, %1
mova %2, t0 ; mask0
psrlw t3, 2
%endif
paddw t3, [pw_2] ; alpha/4+2
DIFF_LT m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2
pand t2, %2
mova t3, %5 ; q2
mova %1, t2 ; mask1
DIFF_LT t3, m2, t1, t2, t0 ; t2 = |q2-q0| < beta
pand t2, %1
mova t3, %4 ; p2
mova %3, t2 ; mask1q
DIFF_LT t3, m1, t1, t2, t0 ; t2 = |p2-p0| < beta
pand t2, %1
mova %1, t2 ; mask1p
%endmacro
%macro LUMA_H_INTRA_LOAD 0
%if mmsize == 8
movu t0, [r0-8]
movu t1, [r0+r1-8]
movu m0, [r0+r1*2-8]
movu m1, [r0+r4-8]
TRANSPOSE4x4W 4, 5, 0, 1, 2
mova t4, t0 ; p3
mova t5, t1 ; p2
movu m2, [r0]
movu m3, [r0+r1]
movu t0, [r0+r1*2]
movu t1, [r0+r4]
TRANSPOSE4x4W 2, 3, 4, 5, 6
mova t6, t0 ; q2
mova t7, t1 ; q3
%else
movu t0, [r0-8]
movu t1, [r0+r1-8]
movu m0, [r0+r1*2-8]
movu m1, [r0+r5-8]
movu m2, [r4-8]
movu m3, [r4+r1-8]
movu t2, [r4+r1*2-8]
movu t3, [r4+r5-8]
TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5
mova t4, t0 ; p3
mova t5, t1 ; p2
mova t6, t2 ; q2
mova t7, t3 ; q3
%endif
%endmacro
; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp
%macro LUMA_H_INTRA_STORE 9
%if mmsize == 8
TRANSPOSE4x4W %1, %2, %3, %4, %9
movq [r0-8], m%1
movq [r0+r1-8], m%2
movq [r0+r1*2-8], m%3
movq [r0+r4-8], m%4
movq m%1, %8
TRANSPOSE4x4W %5, %6, %7, %1, %9
movq [r0], m%5
movq [r0+r1], m%6
movq [r0+r1*2], m%7
movq [r0+r4], m%1
%else
TRANSPOSE2x4x4W %1, %2, %3, %4, %9
movq [r0-8], m%1
movq [r0+r1-8], m%2
movq [r0+r1*2-8], m%3
movq [r0+r5-8], m%4
movhps [r4-8], m%1
movhps [r4+r1-8], m%2
movhps [r4+r1*2-8], m%3
movhps [r4+r5-8], m%4
%ifnum %8
SWAP %1, %8
%else
mova m%1, %8
%endif
TRANSPOSE2x4x4W %5, %6, %7, %1, %9
movq [r0], m%5
movq [r0+r1], m%6
movq [r0+r1*2], m%7
movq [r0+r5], m%1
movhps [r4], m%5
movhps [r4+r1], m%6
movhps [r4+r1*2], m%7
movhps [r4+r5], m%1
%endif
%endmacro
%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA_INTRA_64 0
cglobal deblock_v_luma_intra_10, 4,7,16
%define t0 m1
%define t1 m2
%define t2 m4
%define p2 m8
%define p1 m9
%define p0 m10
%define q0 m11
%define q1 m12
%define q2 m13
%define aa m5
%define bb m14
lea r4, [r1*4]
lea r5, [r1*3] ; 3*stride
neg r4
add r4, r0 ; pix-4*stride
mov r6, 2
mova m0, [pw_2]
shl r2d, 2
shl r3d, 2
LOAD_AB aa, bb, r2d, r3d
.loop:
mova p2, [r4+r1]
mova p1, [r4+2*r1]
mova p0, [r4+r5]
mova q0, [r0]
mova q1, [r0+r1]
mova q2, [r0+2*r1]
LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1
mova t2, aa
psrlw t2, 2
paddw t2, m0 ; alpha/4+2
DIFF_LT p0, q0, t2, m6, t0 ; m6 = |p0-q0| < alpha/4+2
DIFF_LT p2, p0, bb, t1, t0 ; m7 = |p2-p0| < beta
DIFF_LT q2, q0, bb, m7, t0 ; t1 = |q2-q0| < beta
pand m6, m3
pand m7, m6
pand m6, t1
LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1]
LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1]
add r0, mmsize
add r4, mmsize
dec r6
jg .loop
REP_RET
;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10, 4,7,16
%define t0 m15
%define t1 m14
%define t2 m2
%define q3 m5
%define q2 m8
%define q1 m9
%define q0 m10
%define p0 m11
%define p1 m12
%define p2 m13
%define p3 m4
%define spill [rsp]
%assign pad 24-(stack_offset&15)
SUB rsp, pad
lea r4, [r1*4]
lea r5, [r1*3] ; 3*stride
add r4, r0 ; pix+4*stride
mov r6, 2
mova m0, [pw_2]
shl r2d, 2
shl r3d, 2
.loop:
movu q3, [r0-8]
movu q2, [r0+r1-8]
movu q1, [r0+r1*2-8]
movu q0, [r0+r5-8]
movu p0, [r4-8]
movu p1, [r4+r1-8]
movu p2, [r4+r1*2-8]
movu p3, [r4+r5-8]
TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1
LOAD_AB m1, m2, r2d, r3d
LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1
psrlw m1, 2
paddw m1, m0 ; alpha/4+2
DIFF_LT p0, q0, m1, m6, t0 ; m6 = |p0-q0| < alpha/4+2
DIFF_LT q2, q0, m2, t1, t0 ; t1 = |q2-q0| < beta
DIFF_LT p0, p2, m2, m7, t0 ; m7 = |p2-p0| < beta
pand m6, m3
pand m7, m6
pand m6, t1
mova spill, q3
LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2
LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2
mova m7, spill
LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14
lea r0, [r0+r1*8]
lea r4, [r4+r1*8]
dec r6
jg .loop
ADD rsp, pad
RET
%endmacro
INIT_XMM sse2
DEBLOCK_LUMA_INTRA_64
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_INTRA_64
%endif
%endif
%macro DEBLOCK_LUMA_INTRA 0
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_intra_10, 4,7,8*(mmsize/16)
LUMA_INTRA_INIT 3
lea r4, [r1*4]
lea r5, [r1*3]
neg r4
add r4, r0
mov r6, 32/mmsize
shl r2d, 2
shl r3d, 2
.loop:
mova m0, [r4+r1*2] ; p1
mova m1, [r4+r5] ; p0
mova m2, [r0] ; q0
mova m3, [r0+r1] ; q1
LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2]
LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1]
mova t3, [r0+r1*2] ; q2
LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1]
add r0, mmsize
add r4, mmsize
dec r6
jg .loop
ADD rsp, pad
RET
;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10, 4,7,8*(mmsize/16)
LUMA_INTRA_INIT 8
%if mmsize == 8
lea r4, [r1*3]
mov r5, 32/mmsize
%else
lea r4, [r1*4]
lea r5, [r1*3] ; 3*stride
add r4, r0 ; pix+4*stride
mov r6, 32/mmsize
%endif
shl r2d, 2
shl r3d, 2
.loop:
LUMA_H_INTRA_LOAD
LUMA_INTRA_INTER t8, t9, t10, t5, t6
LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11
mova t3, t6 ; q2
LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5
mova m2, t4
mova m0, t11
mova m1, t5
mova m3, t8
mova m6, t6
LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7
lea r0, [r0+r1*(mmsize/2)]
%if mmsize == 8
dec r5
%else
lea r4, [r4+r1*(mmsize/2)]
dec r6
%endif
jg .loop
ADD rsp, pad
RET
%endmacro
%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
INIT_XMM sse2
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
%endif
%endif
; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
%macro CHROMA_DEBLOCK_P0_Q0_INTRA 7
mova %6, [pw_2]
paddw %6, %3
paddw %6, %4
paddw %7, %6, %2
paddw %6, %1
paddw %6, %3
paddw %7, %4
psraw %6, 2
psraw %7, 2
psubw %6, %1
psubw %7, %2
pand %6, %5
pand %7, %5
paddw %1, %6
paddw %2, %7
%endmacro
%macro CHROMA_V_LOAD 1
mova m0, [r0] ; p1
mova m1, [r0+r1] ; p0
mova m2, [%1] ; q0
mova m3, [%1+r1] ; q1
%endmacro
%macro CHROMA_V_STORE 0
mova [r0+1*r1], m1
mova [r0+2*r1], m2
%endmacro
%macro CHROMA_V_LOAD_TC 2
movd %1, [%2]
punpcklbw %1, %1
punpcklwd %1, %1
psraw %1, 6
%endmacro
%macro DEBLOCK_CHROMA 0
;-----------------------------------------------------------------------------
; void deblock_v_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_10, 5,7-(mmsize/16),8*(mmsize/16)
mov r5, r0
sub r0, r1
sub r0, r1
shl r2d, 2
shl r3d, 2
%if mmsize < 16
mov r6, 16/mmsize
.loop:
%endif
CHROMA_V_LOAD r5
LOAD_AB m4, m5, r2d, r3d
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
pxor m4, m4
CHROMA_V_LOAD_TC m6, r4
psubw m6, [pw_3]
pmaxsw m6, m4
pand m7, m6
DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
CHROMA_V_STORE
%if mmsize < 16
add r0, mmsize
add r5, mmsize
add r4, mmsize/4
dec r6
jg .loop
REP_RET
%else
RET
%endif
;-----------------------------------------------------------------------------
; void deblock_v_chroma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16)
mov r4, r0
sub r0, r1
sub r0, r1
shl r2d, 2
shl r3d, 2
%if mmsize < 16
mov r5, 16/mmsize
.loop:
%endif
CHROMA_V_LOAD r4
LOAD_AB m4, m5, r2d, r3d
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
CHROMA_V_STORE
%if mmsize < 16
add r0, mmsize
add r4, mmsize
dec r5
jg .loop
REP_RET
%else
RET
%endif
%endmacro
%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_CHROMA
%endif
INIT_XMM sse2
DEBLOCK_CHROMA
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_CHROMA
%endif

View File

@@ -0,0 +1,204 @@
/*
* H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/**
* @file
* H.264 / AVC / MPEG4 part10 codec.
* non-MMX i386-specific optimizations for H.264
* @author Michael Niedermayer <michaelni@gmx.at>
*/
#ifndef AVCODEC_X86_H264_I386_H
#define AVCODEC_X86_H264_I386_H
#include <stddef.h>
#include "libavcodec/cabac.h"
#include "cabac.h"
#if HAVE_INLINE_ASM
//FIXME use some macros to avoid duplicating get_cabac (cannot be done yet
//as that would make optimization work hard)
#if HAVE_7REGS
#define decode_significance decode_significance_x86
static int decode_significance_x86(CABACContext *c, int max_coeff,
uint8_t *significant_coeff_ctx_base,
int *index, x86_reg last_off){
void *end= significant_coeff_ctx_base + max_coeff - 1;
int minusstart= -(intptr_t)significant_coeff_ctx_base;
int minusindex= 4-(intptr_t)index;
int bit;
x86_reg coeff_count;
#ifdef BROKEN_RELOCATIONS
void *tables;
__asm__ volatile(
"lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
: "=&r"(tables)
);
#endif
__asm__ volatile(
"3: \n\t"
BRANCHLESS_GET_CABAC("%4", "%q4", "(%1)", "%3", "%w3",
"%5", "%q5", "%k0", "%b0",
"%c11(%6)", "%c12(%6)",
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
"%13")
"test $1, %4 \n\t"
" jz 4f \n\t"
"add %10, %1 \n\t"
BRANCHLESS_GET_CABAC("%4", "%q4", "(%1)", "%3", "%w3",
"%5", "%q5", "%k0", "%b0",
"%c11(%6)", "%c12(%6)",
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
"%13")
"sub %10, %1 \n\t"
"mov %2, %0 \n\t"
"movl %7, %%ecx \n\t"
"add %1, %%"REG_c" \n\t"
"movl %%ecx, (%0) \n\t"
"test $1, %4 \n\t"
" jnz 5f \n\t"
"add"OPSIZE" $4, %2 \n\t"
"4: \n\t"
"add $1, %1 \n\t"
"cmp %8, %1 \n\t"
" jb 3b \n\t"
"mov %2, %0 \n\t"
"movl %7, %%ecx \n\t"
"add %1, %%"REG_c" \n\t"
"movl %%ecx, (%0) \n\t"
"5: \n\t"
"add %9, %k0 \n\t"
"shr $2, %k0 \n\t"
: "=&q"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index),
"+&r"(c->low), "=&r"(bit), "+&r"(c->range)
: "r"(c), "m"(minusstart), "m"(end), "m"(minusindex), "m"(last_off),
"i"(offsetof(CABACContext, bytestream)),
"i"(offsetof(CABACContext, bytestream_end))
TABLES_ARG
: "%"REG_c, "memory"
);
return coeff_count;
}
#define decode_significance_8x8 decode_significance_8x8_x86
static int decode_significance_8x8_x86(CABACContext *c,
uint8_t *significant_coeff_ctx_base,
int *index, uint8_t *last_coeff_ctx_base, const uint8_t *sig_off){
int minusindex= 4-(intptr_t)index;
int bit;
x86_reg coeff_count;
x86_reg last=0;
x86_reg state;
#ifdef BROKEN_RELOCATIONS
void *tables;
__asm__ volatile(
"lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
: "=&r"(tables)
);
#endif
__asm__ volatile(
"mov %1, %6 \n\t"
"3: \n\t"
"mov %10, %0 \n\t"
"movzbl (%0, %6), %k6 \n\t"
"add %9, %6 \n\t"
BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3",
"%5", "%q5", "%k0", "%b0",
"%c12(%7)", "%c13(%7)",
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
"%15")
"mov %1, %k6 \n\t"
"test $1, %4 \n\t"
" jz 4f \n\t"
#ifdef BROKEN_RELOCATIONS
"movzbl %c14(%15, %q6), %k6\n\t"
#else
"movzbl "MANGLE(ff_h264_cabac_tables)"+%c14(%k6), %k6\n\t"
#endif
"add %11, %6 \n\t"
BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3",
"%5", "%q5", "%k0", "%b0",
"%c12(%7)", "%c13(%7)",
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
"%15")
"mov %2, %0 \n\t"
"mov %1, %k6 \n\t"
"movl %k6, (%0) \n\t"
"test $1, %4 \n\t"
" jnz 5f \n\t"
"add"OPSIZE" $4, %2 \n\t"
"4: \n\t"
"addl $1, %k6 \n\t"
"mov %k6, %1 \n\t"
"cmpl $63, %k6 \n\t"
" jb 3b \n\t"
"mov %2, %0 \n\t"
"movl %k6, (%0) \n\t"
"5: \n\t"
"addl %8, %k0 \n\t"
"shr $2, %k0 \n\t"
: "=&q"(coeff_count), "+m"(last), "+m"(index), "+&r"(c->low),
"=&r"(bit), "+&r"(c->range), "=&r"(state)
: "r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base),
"m"(sig_off), "m"(last_coeff_ctx_base),
"i"(offsetof(CABACContext, bytestream)),
"i"(offsetof(CABACContext, bytestream_end)),
"i"(H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET) TABLES_ARG
: "%"REG_c, "memory"
);
return coeff_count;
}
#endif /* HAVE_7REGS && !defined(BROKEN_RELOCATIONS) */
#endif /* HAVE_INLINE_ASM */
#endif /* AVCODEC_X86_H264_I386_H */

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,589 @@
;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 iDCT code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
pw_pixel_max: times 8 dw ((1 << 10)-1)
pd_32: times 4 dd 32
SECTION .text
;-----------------------------------------------------------------------------
; void h264_idct_add(pixel *dst, dctcoef *block, int stride)
;-----------------------------------------------------------------------------
%macro STORE_DIFFx2 6
psrad %1, 6
psrad %2, 6
packssdw %1, %2
movq %3, [%5]
movhps %3, [%5+%6]
paddsw %1, %3
CLIPW %1, %4, [pw_pixel_max]
movq [%5], %1
movhps [%5+%6], %1
%endmacro
%macro STORE_DIFF16 5
psrad %1, 6
psrad %2, 6
packssdw %1, %2
paddsw %1, [%5]
CLIPW %1, %3, %4
mova [%5], %1
%endmacro
;dst, in, stride
%macro IDCT4_ADD_10 3
mova m0, [%2+ 0]
mova m1, [%2+16]
mova m2, [%2+32]
mova m3, [%2+48]
IDCT4_1D d,0,1,2,3,4,5
TRANSPOSE4x4D 0,1,2,3,4
paddd m0, [pd_32]
IDCT4_1D d,0,1,2,3,4,5
pxor m5, m5
mova [%2+ 0], m5
mova [%2+16], m5
mova [%2+32], m5
mova [%2+48], m5
STORE_DIFFx2 m0, m1, m4, m5, %1, %3
lea %1, [%1+%3*2]
STORE_DIFFx2 m2, m3, m4, m5, %1, %3
%endmacro
%macro IDCT_ADD_10 0
cglobal h264_idct_add_10, 3,3
IDCT4_ADD_10 r0, r1, r2
RET
%endmacro
INIT_XMM sse2
IDCT_ADD_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD_10
%endif
;-----------------------------------------------------------------------------
; h264_idct_add16(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
;;;;;;; NO FATE SAMPLES TRIGGER THIS
%macro ADD4x4IDCT 0
add4x4_idct %+ SUFFIX:
add r5, r0
mova m0, [r2+ 0]
mova m1, [r2+16]
mova m2, [r2+32]
mova m3, [r2+48]
IDCT4_1D d,0,1,2,3,4,5
TRANSPOSE4x4D 0,1,2,3,4
paddd m0, [pd_32]
IDCT4_1D d,0,1,2,3,4,5
pxor m5, m5
mova [r2+ 0], m5
mova [r2+16], m5
mova [r2+32], m5
mova [r2+48], m5
STORE_DIFFx2 m0, m1, m4, m5, r5, r3
lea r5, [r5+r3*2]
STORE_DIFFx2 m2, m3, m4, m5, r5, r3
ret
%endmacro
INIT_XMM sse2
ALIGN 16
ADD4x4IDCT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
ALIGN 16
ADD4x4IDCT
%endif
%macro ADD16_OP 2
cmp byte [r4+%2], 0
jz .skipblock%1
mov r5d, [r1+%1*4]
call add4x4_idct %+ SUFFIX
.skipblock%1:
%if %1<15
add r2, 64
%endif
%endmacro
%macro IDCT_ADD16_10 0
cglobal h264_idct_add16_10, 5,6
ADD16_OP 0, 4+1*8
ADD16_OP 1, 5+1*8
ADD16_OP 2, 4+2*8
ADD16_OP 3, 5+2*8
ADD16_OP 4, 6+1*8
ADD16_OP 5, 7+1*8
ADD16_OP 6, 6+2*8
ADD16_OP 7, 7+2*8
ADD16_OP 8, 4+3*8
ADD16_OP 9, 5+3*8
ADD16_OP 10, 4+4*8
ADD16_OP 11, 5+4*8
ADD16_OP 12, 6+3*8
ADD16_OP 13, 7+3*8
ADD16_OP 14, 6+4*8
ADD16_OP 15, 7+4*8
REP_RET
%endmacro
INIT_XMM sse2
IDCT_ADD16_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD16_10
%endif
;-----------------------------------------------------------------------------
; void h264_idct_dc_add(pixel *dst, dctcoef *block, int stride)
;-----------------------------------------------------------------------------
%macro IDCT_DC_ADD_OP_10 3
pxor m5, m5
%if avx_enabled
paddw m1, m0, [%1+0 ]
paddw m2, m0, [%1+%2 ]
paddw m3, m0, [%1+%2*2]
paddw m4, m0, [%1+%3 ]
%else
mova m1, [%1+0 ]
mova m2, [%1+%2 ]
mova m3, [%1+%2*2]
mova m4, [%1+%3 ]
paddw m1, m0
paddw m2, m0
paddw m3, m0
paddw m4, m0
%endif
CLIPW m1, m5, m6
CLIPW m2, m5, m6
CLIPW m3, m5, m6
CLIPW m4, m5, m6
mova [%1+0 ], m1
mova [%1+%2 ], m2
mova [%1+%2*2], m3
mova [%1+%3 ], m4
%endmacro
INIT_MMX mmxext
cglobal h264_idct_dc_add_10,3,3
movd m0, [r1]
mov dword [r1], 0
paddd m0, [pd_32]
psrad m0, 6
lea r1, [r2*3]
pshufw m0, m0, 0
mova m6, [pw_pixel_max]
IDCT_DC_ADD_OP_10 r0, r2, r1
RET
;-----------------------------------------------------------------------------
; void h264_idct8_dc_add(pixel *dst, dctcoef *block, int stride)
;-----------------------------------------------------------------------------
%macro IDCT8_DC_ADD 0
cglobal h264_idct8_dc_add_10,3,4,7
movd m0, [r1]
mov dword[r1], 0
paddd m0, [pd_32]
psrad m0, 6
lea r1, [r2*3]
SPLATW m0, m0, 0
mova m6, [pw_pixel_max]
IDCT_DC_ADD_OP_10 r0, r2, r1
lea r0, [r0+r2*4]
IDCT_DC_ADD_OP_10 r0, r2, r1
RET
%endmacro
INIT_XMM sse2
IDCT8_DC_ADD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_DC_ADD
%endif
;-----------------------------------------------------------------------------
; h264_idct_add16intra(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
%macro AC 1
.ac%1:
mov r5d, [r1+(%1+0)*4]
call add4x4_idct %+ SUFFIX
mov r5d, [r1+(%1+1)*4]
add r2, 64
call add4x4_idct %+ SUFFIX
add r2, 64
jmp .skipadd%1
%endmacro
%assign last_block 16
%macro ADD16_OP_INTRA 2
cmp word [r4+%2], 0
jnz .ac%1
mov r5d, [r2+ 0]
or r5d, [r2+64]
jz .skipblock%1
mov r5d, [r1+(%1+0)*4]
call idct_dc_add %+ SUFFIX
.skipblock%1:
%if %1<last_block-2
add r2, 128
%endif
.skipadd%1:
%endmacro
%macro IDCT_ADD16INTRA_10 0
idct_dc_add %+ SUFFIX:
add r5, r0
movq m0, [r2+ 0]
movhps m0, [r2+64]
mov dword [r2+ 0], 0
mov dword [r2+64], 0
paddd m0, [pd_32]
psrad m0, 6
pshufhw m0, m0, 0
pshuflw m0, m0, 0
lea r6, [r3*3]
mova m6, [pw_pixel_max]
IDCT_DC_ADD_OP_10 r5, r3, r6
ret
cglobal h264_idct_add16intra_10,5,7,8
ADD16_OP_INTRA 0, 4+1*8
ADD16_OP_INTRA 2, 4+2*8
ADD16_OP_INTRA 4, 6+1*8
ADD16_OP_INTRA 6, 6+2*8
ADD16_OP_INTRA 8, 4+3*8
ADD16_OP_INTRA 10, 4+4*8
ADD16_OP_INTRA 12, 6+3*8
ADD16_OP_INTRA 14, 6+4*8
REP_RET
AC 8
AC 10
AC 12
AC 14
AC 0
AC 2
AC 4
AC 6
%endmacro
INIT_XMM sse2
IDCT_ADD16INTRA_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD16INTRA_10
%endif
%assign last_block 36
;-----------------------------------------------------------------------------
; h264_idct_add8(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
%macro IDCT_ADD8 0
cglobal h264_idct_add8_10,5,8,7
%if ARCH_X86_64
mov r7, r0
%endif
add r2, 1024
mov r0, [r0]
ADD16_OP_INTRA 16, 4+ 6*8
ADD16_OP_INTRA 18, 4+ 7*8
add r2, 1024-128*2
%if ARCH_X86_64
mov r0, [r7+gprsize]
%else
mov r0, r0m
mov r0, [r0+gprsize]
%endif
ADD16_OP_INTRA 32, 4+11*8
ADD16_OP_INTRA 34, 4+12*8
REP_RET
AC 16
AC 18
AC 32
AC 34
%endmacro ; IDCT_ADD8
INIT_XMM sse2
IDCT_ADD8
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD8
%endif
;-----------------------------------------------------------------------------
; void h264_idct8_add(pixel *dst, dctcoef *block, int stride)
;-----------------------------------------------------------------------------
%macro IDCT8_1D 2
SWAP 0, 1
psrad m4, m5, 1
psrad m1, m0, 1
paddd m4, m5
paddd m1, m0
paddd m4, m7
paddd m1, m5
psubd m4, m0
paddd m1, m3
psubd m0, m3
psubd m5, m3
paddd m0, m7
psubd m5, m7
psrad m3, 1
psrad m7, 1
psubd m0, m3
psubd m5, m7
SWAP 1, 7
psrad m1, m7, 2
psrad m3, m4, 2
paddd m3, m0
psrad m0, 2
paddd m1, m5
psrad m5, 2
psubd m0, m4
psubd m7, m5
SWAP 5, 6
psrad m4, m2, 1
psrad m6, m5, 1
psubd m4, m5
paddd m6, m2
mova m2, %1
mova m5, %2
SUMSUB_BA d, 5, 2
SUMSUB_BA d, 6, 5
SUMSUB_BA d, 4, 2
SUMSUB_BA d, 7, 6
SUMSUB_BA d, 0, 4
SUMSUB_BA d, 3, 2
SUMSUB_BA d, 1, 5
SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro
%macro IDCT8_1D_FULL 1
mova m7, [%1+112*2]
mova m6, [%1+ 96*2]
mova m5, [%1+ 80*2]
mova m3, [%1+ 48*2]
mova m2, [%1+ 32*2]
mova m1, [%1+ 16*2]
IDCT8_1D [%1], [%1+ 64*2]
%endmacro
; %1=int16_t *block, %2=int16_t *dstblock
%macro IDCT8_ADD_SSE_START 2
IDCT8_1D_FULL %1
%if ARCH_X86_64
TRANSPOSE4x4D 0,1,2,3,8
mova [%2 ], m0
TRANSPOSE4x4D 4,5,6,7,8
mova [%2+8*2], m4
%else
mova [%1], m7
TRANSPOSE4x4D 0,1,2,3,7
mova m7, [%1]
mova [%2 ], m0
mova [%2+16*2], m1
mova [%2+32*2], m2
mova [%2+48*2], m3
TRANSPOSE4x4D 4,5,6,7,3
mova [%2+ 8*2], m4
mova [%2+24*2], m5
mova [%2+40*2], m6
mova [%2+56*2], m7
%endif
%endmacro
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_SSE_END 3
IDCT8_1D_FULL %2
mova [%2 ], m6
mova [%2+16*2], m7
pxor m7, m7
STORE_DIFFx2 m0, m1, m6, m7, %1, %3
lea %1, [%1+%3*2]
STORE_DIFFx2 m2, m3, m6, m7, %1, %3
mova m0, [%2 ]
mova m1, [%2+16*2]
lea %1, [%1+%3*2]
STORE_DIFFx2 m4, m5, m6, m7, %1, %3
lea %1, [%1+%3*2]
STORE_DIFFx2 m0, m1, m6, m7, %1, %3
%endmacro
%macro IDCT8_ADD 0
cglobal h264_idct8_add_10, 3,4,16
%if UNIX64 == 0
%assign pad 16-gprsize-(stack_offset&15)
sub rsp, pad
call h264_idct8_add1_10 %+ SUFFIX
add rsp, pad
RET
%endif
ALIGN 16
; TODO: does not need to use stack
h264_idct8_add1_10 %+ SUFFIX:
%assign pad 256+16-gprsize
sub rsp, pad
add dword [r1], 32
%if ARCH_X86_64
IDCT8_ADD_SSE_START r1, rsp
SWAP 1, 9
SWAP 2, 10
SWAP 3, 11
SWAP 5, 13
SWAP 6, 14
SWAP 7, 15
IDCT8_ADD_SSE_START r1+16, rsp+128
PERMUTE 1,9, 2,10, 3,11, 5,1, 6,2, 7,3, 9,13, 10,14, 11,15, 13,5, 14,6, 15,7
IDCT8_1D [rsp], [rsp+128]
SWAP 0, 8
SWAP 1, 9
SWAP 2, 10
SWAP 3, 11
SWAP 4, 12
SWAP 5, 13
SWAP 6, 14
SWAP 7, 15
IDCT8_1D [rsp+16], [rsp+144]
psrad m8, 6
psrad m0, 6
packssdw m8, m0
paddsw m8, [r0]
pxor m0, m0
mova [r1+ 0], m0
mova [r1+ 16], m0
mova [r1+ 32], m0
mova [r1+ 48], m0
mova [r1+ 64], m0
mova [r1+ 80], m0
mova [r1+ 96], m0
mova [r1+112], m0
mova [r1+128], m0
mova [r1+144], m0
mova [r1+160], m0
mova [r1+176], m0
mova [r1+192], m0
mova [r1+208], m0
mova [r1+224], m0
mova [r1+240], m0
CLIPW m8, m0, [pw_pixel_max]
mova [r0], m8
mova m8, [pw_pixel_max]
STORE_DIFF16 m9, m1, m0, m8, r0+r2
lea r0, [r0+r2*2]
STORE_DIFF16 m10, m2, m0, m8, r0
STORE_DIFF16 m11, m3, m0, m8, r0+r2
lea r0, [r0+r2*2]
STORE_DIFF16 m12, m4, m0, m8, r0
STORE_DIFF16 m13, m5, m0, m8, r0+r2
lea r0, [r0+r2*2]
STORE_DIFF16 m14, m6, m0, m8, r0
STORE_DIFF16 m15, m7, m0, m8, r0+r2
%else
IDCT8_ADD_SSE_START r1, rsp
IDCT8_ADD_SSE_START r1+16, rsp+128
lea r3, [r0+8]
IDCT8_ADD_SSE_END r0, rsp, r2
IDCT8_ADD_SSE_END r3, rsp+16, r2
mova [r1+ 0], m7
mova [r1+ 16], m7
mova [r1+ 32], m7
mova [r1+ 48], m7
mova [r1+ 64], m7
mova [r1+ 80], m7
mova [r1+ 96], m7
mova [r1+112], m7
mova [r1+128], m7
mova [r1+144], m7
mova [r1+160], m7
mova [r1+176], m7
mova [r1+192], m7
mova [r1+208], m7
mova [r1+224], m7
mova [r1+240], m7
%endif ; ARCH_X86_64
add rsp, pad
ret
%endmacro
INIT_XMM sse2
IDCT8_ADD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_ADD
%endif
;-----------------------------------------------------------------------------
; h264_idct8_add4(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
;;;;;;; NO FATE SAMPLES TRIGGER THIS
%macro IDCT8_ADD4_OP 2
cmp byte [r4+%2], 0
jz .skipblock%1
mov r0d, [r6+%1*4]
add r0, r5
call h264_idct8_add1_10 %+ SUFFIX
.skipblock%1:
%if %1<12
add r1, 256
%endif
%endmacro
%macro IDCT8_ADD4 0
cglobal h264_idct8_add4_10, 0,7,16
%assign pad 16-gprsize-(stack_offset&15)
SUB rsp, pad
mov r5, r0mp
mov r6, r1mp
mov r1, r2mp
mov r2d, r3m
movifnidn r4, r4mp
IDCT8_ADD4_OP 0, 4+1*8
IDCT8_ADD4_OP 4, 6+1*8
IDCT8_ADD4_OP 8, 4+3*8
IDCT8_ADD4_OP 12, 6+3*8
ADD rsp, pad
RET
%endmacro ; IDCT8_ADD4
INIT_XMM sse2
IDCT8_ADD4
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_ADD4
%endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,402 @@
/*
* Copyright (c) 2010 Jason Garrett-Glaser
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/h264pred.h"
#define PRED4x4(TYPE, DEPTH, OPT) \
void ff_pred4x4_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
const uint8_t *topright, \
ptrdiff_t stride);
PRED4x4(dc, 10, mmxext)
PRED4x4(down_left, 10, sse2)
PRED4x4(down_left, 10, avx)
PRED4x4(down_right, 10, sse2)
PRED4x4(down_right, 10, ssse3)
PRED4x4(down_right, 10, avx)
PRED4x4(vertical_left, 10, sse2)
PRED4x4(vertical_left, 10, avx)
PRED4x4(vertical_right, 10, sse2)
PRED4x4(vertical_right, 10, ssse3)
PRED4x4(vertical_right, 10, avx)
PRED4x4(horizontal_up, 10, mmxext)
PRED4x4(horizontal_down, 10, sse2)
PRED4x4(horizontal_down, 10, ssse3)
PRED4x4(horizontal_down, 10, avx)
#define PRED8x8(TYPE, DEPTH, OPT) \
void ff_pred8x8_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
ptrdiff_t stride);
PRED8x8(dc, 10, mmxext)
PRED8x8(dc, 10, sse2)
PRED8x8(top_dc, 10, sse2)
PRED8x8(plane, 10, sse2)
PRED8x8(vertical, 10, sse2)
PRED8x8(horizontal, 10, sse2)
#define PRED8x8L(TYPE, DEPTH, OPT)\
void ff_pred8x8l_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
int has_topleft, \
int has_topright, \
ptrdiff_t stride);
PRED8x8L(dc, 10, sse2)
PRED8x8L(dc, 10, avx)
PRED8x8L(128_dc, 10, mmxext)
PRED8x8L(128_dc, 10, sse2)
PRED8x8L(top_dc, 10, sse2)
PRED8x8L(top_dc, 10, avx)
PRED8x8L(vertical, 10, sse2)
PRED8x8L(vertical, 10, avx)
PRED8x8L(horizontal, 10, sse2)
PRED8x8L(horizontal, 10, ssse3)
PRED8x8L(horizontal, 10, avx)
PRED8x8L(down_left, 10, sse2)
PRED8x8L(down_left, 10, ssse3)
PRED8x8L(down_left, 10, avx)
PRED8x8L(down_right, 10, sse2)
PRED8x8L(down_right, 10, ssse3)
PRED8x8L(down_right, 10, avx)
PRED8x8L(vertical_right, 10, sse2)
PRED8x8L(vertical_right, 10, ssse3)
PRED8x8L(vertical_right, 10, avx)
PRED8x8L(horizontal_up, 10, sse2)
PRED8x8L(horizontal_up, 10, ssse3)
PRED8x8L(horizontal_up, 10, avx)
#define PRED16x16(TYPE, DEPTH, OPT)\
void ff_pred16x16_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
ptrdiff_t stride);
PRED16x16(dc, 10, mmxext)
PRED16x16(dc, 10, sse2)
PRED16x16(top_dc, 10, mmxext)
PRED16x16(top_dc, 10, sse2)
PRED16x16(128_dc, 10, mmxext)
PRED16x16(128_dc, 10, sse2)
PRED16x16(left_dc, 10, mmxext)
PRED16x16(left_dc, 10, sse2)
PRED16x16(vertical, 10, mmxext)
PRED16x16(vertical, 10, sse2)
PRED16x16(horizontal, 10, mmxext)
PRED16x16(horizontal, 10, sse2)
/* 8-bit versions */
PRED16x16(vertical, 8, mmx)
PRED16x16(vertical, 8, sse)
PRED16x16(horizontal, 8, mmx)
PRED16x16(horizontal, 8, mmxext)
PRED16x16(horizontal, 8, ssse3)
PRED16x16(dc, 8, mmxext)
PRED16x16(dc, 8, sse2)
PRED16x16(dc, 8, ssse3)
PRED16x16(plane_h264, 8, mmx)
PRED16x16(plane_h264, 8, mmxext)
PRED16x16(plane_h264, 8, sse2)
PRED16x16(plane_h264, 8, ssse3)
PRED16x16(plane_rv40, 8, mmx)
PRED16x16(plane_rv40, 8, mmxext)
PRED16x16(plane_rv40, 8, sse2)
PRED16x16(plane_rv40, 8, ssse3)
PRED16x16(plane_svq3, 8, mmx)
PRED16x16(plane_svq3, 8, mmxext)
PRED16x16(plane_svq3, 8, sse2)
PRED16x16(plane_svq3, 8, ssse3)
PRED16x16(tm_vp8, 8, mmx)
PRED16x16(tm_vp8, 8, mmxext)
PRED16x16(tm_vp8, 8, sse2)
PRED8x8(top_dc, 8, mmxext)
PRED8x8(dc_rv40, 8, mmxext)
PRED8x8(dc, 8, mmxext)
PRED8x8(vertical, 8, mmx)
PRED8x8(horizontal, 8, mmx)
PRED8x8(horizontal, 8, mmxext)
PRED8x8(horizontal, 8, ssse3)
PRED8x8(plane, 8, mmx)
PRED8x8(plane, 8, mmxext)
PRED8x8(plane, 8, sse2)
PRED8x8(plane, 8, ssse3)
PRED8x8(tm_vp8, 8, mmx)
PRED8x8(tm_vp8, 8, mmxext)
PRED8x8(tm_vp8, 8, sse2)
PRED8x8(tm_vp8, 8, ssse3)
PRED8x8L(top_dc, 8, mmxext)
PRED8x8L(top_dc, 8, ssse3)
PRED8x8L(dc, 8, mmxext)
PRED8x8L(dc, 8, ssse3)
PRED8x8L(horizontal, 8, mmxext)
PRED8x8L(horizontal, 8, ssse3)
PRED8x8L(vertical, 8, mmxext)
PRED8x8L(vertical, 8, ssse3)
PRED8x8L(down_left, 8, mmxext)
PRED8x8L(down_left, 8, sse2)
PRED8x8L(down_left, 8, ssse3)
PRED8x8L(down_right, 8, mmxext)
PRED8x8L(down_right, 8, sse2)
PRED8x8L(down_right, 8, ssse3)
PRED8x8L(vertical_right, 8, mmxext)
PRED8x8L(vertical_right, 8, sse2)
PRED8x8L(vertical_right, 8, ssse3)
PRED8x8L(vertical_left, 8, sse2)
PRED8x8L(vertical_left, 8, ssse3)
PRED8x8L(horizontal_up, 8, mmxext)
PRED8x8L(horizontal_up, 8, ssse3)
PRED8x8L(horizontal_down, 8, mmxext)
PRED8x8L(horizontal_down, 8, sse2)
PRED8x8L(horizontal_down, 8, ssse3)
PRED4x4(dc, 8, mmxext)
PRED4x4(down_left, 8, mmxext)
PRED4x4(down_right, 8, mmxext)
PRED4x4(vertical_left, 8, mmxext)
PRED4x4(vertical_right, 8, mmxext)
PRED4x4(horizontal_up, 8, mmxext)
PRED4x4(horizontal_down, 8, mmxext)
PRED4x4(tm_vp8, 8, mmx)
PRED4x4(tm_vp8, 8, mmxext)
PRED4x4(tm_vp8, 8, ssse3)
PRED4x4(vertical_vp8, 8, mmxext)
av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
const int bit_depth,
const int chroma_format_idc)
{
int cpu_flags = av_get_cpu_flags();
if (bit_depth == 8) {
if (EXTERNAL_MMX(cpu_flags)) {
h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_8_mmx;
h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_mmx;
if (chroma_format_idc == 1) {
h->pred8x8 [VERT_PRED8x8 ] = ff_pred8x8_vertical_8_mmx;
h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_mmx;
}
if (codec_id == AV_CODEC_ID_VP8) {
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_mmx;
h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_mmx;
h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_8_mmx;
} else {
if (chroma_format_idc == 1)
h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_mmx;
if (codec_id == AV_CODEC_ID_SVQ3) {
if (cpu_flags & AV_CPU_FLAG_CMOV)
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_mmx;
} else if (codec_id == AV_CODEC_ID_RV40) {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_mmx;
} else {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_mmx;
}
}
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_mmxext;
h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_mmxext;
if (chroma_format_idc == 1)
h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_mmxext;
h->pred8x8l [TOP_DC_PRED ] = ff_pred8x8l_top_dc_8_mmxext;
h->pred8x8l [DC_PRED ] = ff_pred8x8l_dc_8_mmxext;
h->pred8x8l [HOR_PRED ] = ff_pred8x8l_horizontal_8_mmxext;
h->pred8x8l [VERT_PRED ] = ff_pred8x8l_vertical_8_mmxext;
h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_mmxext;
h->pred8x8l [VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_8_mmxext;
h->pred8x8l [HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_8_mmxext;
h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_mmxext;
h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_mmxext;
h->pred4x4 [DIAG_DOWN_RIGHT_PRED ] = ff_pred4x4_down_right_8_mmxext;
h->pred4x4 [VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_8_mmxext;
h->pred4x4 [HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_8_mmxext;
h->pred4x4 [DC_PRED ] = ff_pred4x4_dc_8_mmxext;
if (codec_id == AV_CODEC_ID_VP8 || codec_id == AV_CODEC_ID_H264) {
h->pred4x4 [DIAG_DOWN_LEFT_PRED] = ff_pred4x4_down_left_8_mmxext;
}
if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) {
h->pred4x4 [VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_8_mmxext;
}
if (codec_id != AV_CODEC_ID_RV40) {
h->pred4x4 [HOR_UP_PRED ] = ff_pred4x4_horizontal_up_8_mmxext;
}
if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) {
if (chroma_format_idc == 1) {
h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_8_mmxext;
h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_8_mmxext;
}
}
if (codec_id == AV_CODEC_ID_VP8) {
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_mmxext;
h->pred8x8 [DC_PRED8x8 ] = ff_pred8x8_dc_rv40_8_mmxext;
h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_mmxext;
h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_8_mmxext;
h->pred4x4 [VERT_PRED ] = ff_pred4x4_vertical_vp8_8_mmxext;
} else {
if (chroma_format_idc == 1)
h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_mmxext;
if (codec_id == AV_CODEC_ID_SVQ3) {
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_svq3_8_mmxext;
} else if (codec_id == AV_CODEC_ID_RV40) {
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_rv40_8_mmxext;
} else {
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_h264_8_mmxext;
}
}
}
if (EXTERNAL_SSE(cpu_flags)) {
h->pred16x16[VERT_PRED8x8] = ff_pred16x16_vertical_8_sse;
}
if (EXTERNAL_SSE2(cpu_flags)) {
h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_sse2;
h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_sse2;
h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_sse2;
h->pred8x8l [VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_8_sse2;
h->pred8x8l [VERT_LEFT_PRED ] = ff_pred8x8l_vertical_left_8_sse2;
h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_sse2;
if (codec_id == AV_CODEC_ID_VP8) {
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_sse2;
h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_sse2;
} else {
if (chroma_format_idc == 1)
h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_sse2;
if (codec_id == AV_CODEC_ID_SVQ3) {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_sse2;
} else if (codec_id == AV_CODEC_ID_RV40) {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_sse2;
} else {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_sse2;
}
}
}
if (EXTERNAL_SSSE3(cpu_flags)) {
h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_ssse3;
h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_ssse3;
if (chroma_format_idc == 1)
h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_ssse3;
h->pred8x8l [TOP_DC_PRED ] = ff_pred8x8l_top_dc_8_ssse3;
h->pred8x8l [DC_PRED ] = ff_pred8x8l_dc_8_ssse3;
h->pred8x8l [HOR_PRED ] = ff_pred8x8l_horizontal_8_ssse3;
h->pred8x8l [VERT_PRED ] = ff_pred8x8l_vertical_8_ssse3;
h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_ssse3;
h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_ssse3;
h->pred8x8l [VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_8_ssse3;
h->pred8x8l [VERT_LEFT_PRED ] = ff_pred8x8l_vertical_left_8_ssse3;
h->pred8x8l [HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_8_ssse3;
h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_ssse3;
if (codec_id == AV_CODEC_ID_VP8) {
h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_ssse3;
h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_8_ssse3;
} else {
if (chroma_format_idc == 1)
h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_ssse3;
if (codec_id == AV_CODEC_ID_SVQ3) {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_ssse3;
} else if (codec_id == AV_CODEC_ID_RV40) {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_ssse3;
} else {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_ssse3;
}
}
}
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
h->pred4x4[DC_PRED ] = ff_pred4x4_dc_10_mmxext;
h->pred4x4[HOR_UP_PRED ] = ff_pred4x4_horizontal_up_10_mmxext;
if (chroma_format_idc == 1)
h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_mmxext;
h->pred8x8l[DC_128_PRED ] = ff_pred8x8l_128_dc_10_mmxext;
h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_10_mmxext;
h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_10_mmxext;
h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_10_mmxext;
h->pred16x16[LEFT_DC_PRED8x8 ] = ff_pred16x16_left_dc_10_mmxext;
h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_10_mmxext;
h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_10_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_sse2;
h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_sse2;
h->pred4x4[VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_10_sse2;
h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_sse2;
h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_sse2;
if (chroma_format_idc == 1) {
h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_sse2;
h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_10_sse2;
h->pred8x8[PLANE_PRED8x8 ] = ff_pred8x8_plane_10_sse2;
h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vertical_10_sse2;
h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_10_sse2;
}
h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_sse2;
h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_sse2;
h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_sse2;
h->pred8x8l[DC_128_PRED ] = ff_pred8x8l_128_dc_10_sse2;
h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_sse2;
h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_sse2;
h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_sse2;
h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_sse2;
h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_sse2;
h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_10_sse2;
h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_10_sse2;
h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_10_sse2;
h->pred16x16[LEFT_DC_PRED8x8 ] = ff_pred16x16_left_dc_10_sse2;
h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_10_sse2;
h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_10_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags)) {
h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_ssse3;
h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_ssse3;
h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_ssse3;
h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_ssse3;
h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_ssse3;
h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_ssse3;
h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_ssse3;
h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_ssse3;
}
if (EXTERNAL_AVX(cpu_flags)) {
h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_avx;
h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_avx;
h->pred4x4[VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_10_avx;
h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_avx;
h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_avx;
h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_avx;
h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_avx;
h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_avx;
h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_avx;
h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_avx;
h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_avx;
h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_avx;
h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_avx;
}
}
}

View File

@@ -0,0 +1,634 @@
/*
* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
* Copyright (c) 2011 Daniel Kang
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/h264qpel.h"
#include "libavcodec/mpegvideo.h"
#include "dsputil_x86.h"
#if HAVE_YASM
void ff_put_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_avg_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
#define ff_put_pixels8_l2_sse2 ff_put_pixels8_l2_mmxext
#define ff_avg_pixels8_l2_sse2 ff_avg_pixels8_l2_mmxext
#define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext
#define ff_avg_pixels16_l2_sse2 ff_avg_pixels16_l2_mmxext
PIXELS16(static, ff_avg, , , _mmxext)
PIXELS16(static, ff_put, , , _mmxext)
#define DEF_QPEL(OPNAME)\
void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_ssse3(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_l2_mmxext(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_mmxext(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_ssse3(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel4_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_op_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h);\
void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_sse2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h);\
void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_v_mmxext(uint8_t *src, int16_t *tmp, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_h_mmxext(int16_t *tmp, uint8_t *dst, int dstStride);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_mmxext(uint8_t *src, int16_t *tmp, int srcStride, int size);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(uint8_t *src, int16_t *tmp, int srcStride, int size);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_t *tmp, int dstStride, int unused, int h);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size);\
void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h);\
void ff_ ## OPNAME ## _pixels8_l2_shift5_mmxext(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h);
DEF_QPEL(avg)
DEF_QPEL(put)
#define QPEL_H264(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
int w=3;\
src -= 2*srcStride+2;\
while(w--){\
ff_ ## OPNAME ## h264_qpel4_hv_lowpass_v_mmxext(src, tmp, srcStride);\
tmp += 4;\
src += 4;\
}\
tmp -= 3*4;\
ff_ ## OPNAME ## h264_qpel4_hv_lowpass_h_mmxext(tmp, dst, dstStride);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
src -= 2*srcStride;\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\
src += 4;\
dst += 4;\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\
int w = (size+8)>>2;\
src -= 2*srcStride+2;\
while(w--){\
ff_ ## OPNAME ## h264_qpel8or16_hv1_lowpass_op_mmxext(src, tmp, srcStride, size);\
tmp += 4;\
src += 4;\
}\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
int w = size>>4;\
do{\
ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_op_mmxext(dst, tmp, dstStride, 0, size);\
tmp += 8;\
dst += 8;\
}while(w--);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
src += 8*srcStride;\
dst += 8*dstStride;\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
src += 8*dstStride;\
dst += 8*dstStride;\
src2 += 8*src2Stride;\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
ff_put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\
}\
\
static av_always_inline void ff_ ## OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\
ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
}\
#if ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
void ff_avg_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);
void ff_put_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);
#else // ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
src += 8*dstStride;\
dst += 8*dstStride;\
src2 += 8*src2Stride;\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}
#endif // ARCH_X86_64
#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
src += 8*srcStride;\
dst += 8*dstStride;\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\
#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}
static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp,
uint8_t *src,
int tmpStride,
int srcStride,
int size)
{
int w = (size+8)>>3;
src -= 2*srcStride+2;
while(w--){
ff_put_h264_qpel8or16_hv1_lowpass_op_sse2(src, tmp, srcStride, size);
tmp += 8;
src += 8;
}
}
#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
}\
#define ff_put_h264_qpel8_h_lowpass_l2_sse2 ff_put_h264_qpel8_h_lowpass_l2_mmxext
#define ff_avg_h264_qpel8_h_lowpass_l2_sse2 ff_avg_h264_qpel8_h_lowpass_l2_mmxext
#define ff_put_h264_qpel16_h_lowpass_l2_sse2 ff_put_h264_qpel16_h_lowpass_l2_mmxext
#define ff_avg_h264_qpel16_h_lowpass_l2_sse2 ff_avg_h264_qpel16_h_lowpass_l2_mmxext
#define ff_put_h264_qpel8_v_lowpass_ssse3 ff_put_h264_qpel8_v_lowpass_sse2
#define ff_avg_h264_qpel8_v_lowpass_ssse3 ff_avg_h264_qpel8_v_lowpass_sse2
#define ff_put_h264_qpel16_v_lowpass_ssse3 ff_put_h264_qpel16_v_lowpass_sse2
#define ff_avg_h264_qpel16_v_lowpass_ssse3 ff_avg_h264_qpel16_v_lowpass_sse2
#define ff_put_h264_qpel8or16_hv2_lowpass_sse2 ff_put_h264_qpel8or16_hv2_lowpass_mmxext
#define ff_avg_h264_qpel8or16_hv2_lowpass_sse2 ff_avg_h264_qpel8or16_hv2_lowpass_mmxext
#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\
static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src,
ptrdiff_t stride)
{
ff_put_pixels16_sse2(dst, src, stride, 16);
}
static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src,
ptrdiff_t stride)
{
ff_avg_pixels16_sse2(dst, src, stride, 16);
}
#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmxext
#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmxext
#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
ff_ ## OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
}\
#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
}\
#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
}\
#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
av_assert2(((int)temp & 7) == 0);\
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
av_assert2(((int)temp & 7) == 0);\
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
av_assert2(((int)temp & 7) == 0);\
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
av_assert2(((int)temp & 7) == 0);\
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
}\
#define H264_MC_4816(MMX)\
H264_MC(put_, 4, MMX, 8)\
H264_MC(put_, 8, MMX, 8)\
H264_MC(put_, 16,MMX, 8)\
H264_MC(avg_, 4, MMX, 8)\
H264_MC(avg_, 8, MMX, 8)\
H264_MC(avg_, 16,MMX, 8)\
#define H264_MC_816(QPEL, XMM)\
QPEL(put_, 8, XMM, 16)\
QPEL(put_, 16,XMM, 16)\
QPEL(avg_, 8, XMM, 16)\
QPEL(avg_, 16,XMM, 16)\
QPEL_H264(put_, PUT_OP, mmxext)
QPEL_H264(avg_, AVG_MMXEXT_OP, mmxext)
QPEL_H264_V_XMM(put_, PUT_OP, sse2)
QPEL_H264_V_XMM(avg_,AVG_MMXEXT_OP, sse2)
QPEL_H264_HV_XMM(put_, PUT_OP, sse2)
QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, sse2)
QPEL_H264_H_XMM(put_, PUT_OP, ssse3)
QPEL_H264_H_XMM(avg_,AVG_MMXEXT_OP, ssse3)
QPEL_H264_HV_XMM(put_, PUT_OP, ssse3)
QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, ssse3)
H264_MC_4816(mmxext)
H264_MC_816(H264_MC_V, sse2)
H264_MC_816(H264_MC_HV, sse2)
H264_MC_816(H264_MC_H, ssse3)
H264_MC_816(H264_MC_HV, ssse3)
//10bit
#define LUMA_MC_OP(OP, NUM, DEPTH, TYPE, OPT) \
void ff_ ## OP ## _h264_qpel ## NUM ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT \
(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
#define LUMA_MC_ALL(DEPTH, TYPE, OPT) \
LUMA_MC_OP(put, 4, DEPTH, TYPE, OPT) \
LUMA_MC_OP(avg, 4, DEPTH, TYPE, OPT) \
LUMA_MC_OP(put, 8, DEPTH, TYPE, OPT) \
LUMA_MC_OP(avg, 8, DEPTH, TYPE, OPT) \
LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)
#define LUMA_MC_816(DEPTH, TYPE, OPT) \
LUMA_MC_OP(put, 8, DEPTH, TYPE, OPT) \
LUMA_MC_OP(avg, 8, DEPTH, TYPE, OPT) \
LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)
LUMA_MC_ALL(10, mc00, mmxext)
LUMA_MC_ALL(10, mc10, mmxext)
LUMA_MC_ALL(10, mc20, mmxext)
LUMA_MC_ALL(10, mc30, mmxext)
LUMA_MC_ALL(10, mc01, mmxext)
LUMA_MC_ALL(10, mc11, mmxext)
LUMA_MC_ALL(10, mc21, mmxext)
LUMA_MC_ALL(10, mc31, mmxext)
LUMA_MC_ALL(10, mc02, mmxext)
LUMA_MC_ALL(10, mc12, mmxext)
LUMA_MC_ALL(10, mc22, mmxext)
LUMA_MC_ALL(10, mc32, mmxext)
LUMA_MC_ALL(10, mc03, mmxext)
LUMA_MC_ALL(10, mc13, mmxext)
LUMA_MC_ALL(10, mc23, mmxext)
LUMA_MC_ALL(10, mc33, mmxext)
LUMA_MC_816(10, mc00, sse2)
LUMA_MC_816(10, mc10, sse2)
LUMA_MC_816(10, mc10, sse2_cache64)
LUMA_MC_816(10, mc10, ssse3_cache64)
LUMA_MC_816(10, mc20, sse2)
LUMA_MC_816(10, mc20, sse2_cache64)
LUMA_MC_816(10, mc20, ssse3_cache64)
LUMA_MC_816(10, mc30, sse2)
LUMA_MC_816(10, mc30, sse2_cache64)
LUMA_MC_816(10, mc30, ssse3_cache64)
LUMA_MC_816(10, mc01, sse2)
LUMA_MC_816(10, mc11, sse2)
LUMA_MC_816(10, mc21, sse2)
LUMA_MC_816(10, mc31, sse2)
LUMA_MC_816(10, mc02, sse2)
LUMA_MC_816(10, mc12, sse2)
LUMA_MC_816(10, mc22, sse2)
LUMA_MC_816(10, mc32, sse2)
LUMA_MC_816(10, mc03, sse2)
LUMA_MC_816(10, mc13, sse2)
LUMA_MC_816(10, mc23, sse2)
LUMA_MC_816(10, mc33, sse2)
#define QPEL16_OPMC(OP, MC, MMX)\
void ff_ ## OP ## _h264_qpel16_ ## MC ## _10_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride){\
ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\
ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
src += 8*stride;\
dst += 8*stride;\
ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\
ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
}
#define QPEL16_OP(MC, MMX)\
QPEL16_OPMC(put, MC, MMX)\
QPEL16_OPMC(avg, MC, MMX)
#define QPEL16(MMX)\
QPEL16_OP(mc00, MMX)\
QPEL16_OP(mc01, MMX)\
QPEL16_OP(mc02, MMX)\
QPEL16_OP(mc03, MMX)\
QPEL16_OP(mc10, MMX)\
QPEL16_OP(mc11, MMX)\
QPEL16_OP(mc12, MMX)\
QPEL16_OP(mc13, MMX)\
QPEL16_OP(mc20, MMX)\
QPEL16_OP(mc21, MMX)\
QPEL16_OP(mc22, MMX)\
QPEL16_OP(mc23, MMX)\
QPEL16_OP(mc30, MMX)\
QPEL16_OP(mc31, MMX)\
QPEL16_OP(mc32, MMX)\
QPEL16_OP(mc33, MMX)
#if ARCH_X86_32 && HAVE_YASM && CONFIG_H264QPEL // ARCH_X86_64 implies SSE2+
QPEL16(mmxext)
#endif
#endif /* HAVE_YASM */
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
do { \
c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
} while (0)
#define H264_QPEL_FUNCS(x, y, CPU) \
do { \
c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc ## x ## y ## _ ## CPU; \
c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc ## x ## y ## _ ## CPU; \
} while (0)
#define H264_QPEL_FUNCS_10(x, y, CPU) \
do { \
c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
} while (0)
av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth)
{
#if HAVE_YASM
int high_bit_depth = bit_depth > 8;
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMXEXT(cpu_flags)) {
if (!high_bit_depth) {
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmxext, );
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmxext, );
} else if (bit_depth == 10) {
#if ARCH_X86_32
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_mmxext, ff_);
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_mmxext, ff_);
#endif
SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_);
}
}
if (EXTERNAL_SSE2(cpu_flags)) {
if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && !high_bit_depth) {
// these functions are slower than mmx on AMD, but faster on Intel
H264_QPEL_FUNCS(0, 0, sse2);
}
if (!high_bit_depth) {
H264_QPEL_FUNCS(0, 1, sse2);
H264_QPEL_FUNCS(0, 2, sse2);
H264_QPEL_FUNCS(0, 3, sse2);
H264_QPEL_FUNCS(1, 1, sse2);
H264_QPEL_FUNCS(1, 2, sse2);
H264_QPEL_FUNCS(1, 3, sse2);
H264_QPEL_FUNCS(2, 1, sse2);
H264_QPEL_FUNCS(2, 2, sse2);
H264_QPEL_FUNCS(2, 3, sse2);
H264_QPEL_FUNCS(3, 1, sse2);
H264_QPEL_FUNCS(3, 2, sse2);
H264_QPEL_FUNCS(3, 3, sse2);
}
if (bit_depth == 10) {
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_);
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_);
H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
}
}
if (EXTERNAL_SSSE3(cpu_flags)) {
if (!high_bit_depth) {
H264_QPEL_FUNCS(1, 0, ssse3);
H264_QPEL_FUNCS(1, 1, ssse3);
H264_QPEL_FUNCS(1, 2, ssse3);
H264_QPEL_FUNCS(1, 3, ssse3);
H264_QPEL_FUNCS(2, 0, ssse3);
H264_QPEL_FUNCS(2, 1, ssse3);
H264_QPEL_FUNCS(2, 2, ssse3);
H264_QPEL_FUNCS(2, 3, ssse3);
H264_QPEL_FUNCS(3, 0, ssse3);
H264_QPEL_FUNCS(3, 1, ssse3);
H264_QPEL_FUNCS(3, 2, ssse3);
H264_QPEL_FUNCS(3, 3, ssse3);
}
if (bit_depth == 10) {
H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
}
}
if (EXTERNAL_AVX(cpu_flags)) {
/* AVX implies 64 byte cache lines without the need to avoid unaligned
* memory accesses that cross the boundary between two cache lines.
* TODO: Port X264_CPU_CACHELINE_32/64 detection from x264 to avoid
* having to treat SSE2 functions with such properties as AVX. */
if (bit_depth == 10) {
H264_QPEL_FUNCS_10(1, 0, sse2);
H264_QPEL_FUNCS_10(2, 0, sse2);
H264_QPEL_FUNCS_10(3, 0, sse2);
}
}
#endif
}

View File

@@ -0,0 +1,884 @@
;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 qpel code
;*****************************************************************************
;* Copyright (C) 2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA 32
cextern pw_16
cextern pw_1
cextern pb_0
pw_pixel_max: times 8 dw ((1 << 10)-1)
pad10: times 8 dw 10*1023
pad20: times 8 dw 20*1023
pad30: times 8 dw 30*1023
depad: times 4 dd 32*20*1023 + 512
depad2: times 8 dw 20*1023 + 16*1022 + 16
unpad: times 8 dw 16*1022/32 ; needs to be mod 16
tap1: times 4 dw 1, -5
tap2: times 4 dw 20, 20
tap3: times 4 dw -5, 1
pd_0f: times 4 dd 0xffff
SECTION .text
%macro AVG_MOV 2
pavgw %2, %1
mova %1, %2
%endmacro
%macro ADDW 3
%if mmsize == 8
paddw %1, %2
%else
movu %3, %2
paddw %1, %3
%endif
%endmacro
%macro FILT_H 4
paddw %1, %4
psubw %1, %2 ; a-b
psraw %1, 2 ; (a-b)/4
psubw %1, %2 ; (a-b)/4-b
paddw %1, %3 ; (a-b)/4-b+c
psraw %1, 2 ; ((a-b)/4-b+c)/4
paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
%endmacro
%macro PRELOAD_V 0
lea r3, [r2*3]
sub r1, r3
movu m0, [r1+r2]
movu m1, [r1+r2*2]
add r1, r3
movu m2, [r1]
movu m3, [r1+r2]
movu m4, [r1+r2*2]
add r1, r3
%endmacro
%macro FILT_V 8
movu %6, [r1]
paddw %1, %6
mova %7, %2
paddw %7, %5
mova %8, %3
paddw %8, %4
FILT_H %1, %7, %8, [pw_16]
psraw %1, 1
CLIPW %1, [pb_0], [pw_pixel_max]
%endmacro
%macro MC 1
%define OP_MOV mova
INIT_MMX mmxext
%1 put, 4
INIT_XMM sse2
%1 put, 8
%define OP_MOV AVG_MOV
INIT_MMX mmxext
%1 avg, 4
INIT_XMM sse2
%1 avg, 8
%endmacro
%macro MCAxA_OP 7
%if ARCH_X86_32
cglobal %1_h264_qpel%4_%2_10, %5,%6,%7
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
mov r0, r0m
mov r1, r1m
add r0, %3*2
add r1, %3*2
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
mov r0, r0m
mov r1, r1m
lea r0, [r0+r2*%3]
lea r1, [r1+r2*%3]
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
mov r0, r0m
mov r1, r1m
lea r0, [r0+r2*%3+%3*2]
lea r1, [r1+r2*%3+%3*2]
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
RET
%else ; ARCH_X86_64
cglobal %1_h264_qpel%4_%2_10, %5,%6 + 2,%7
mov r%6, r0
%assign p1 %6+1
mov r %+ p1, r1
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
lea r0, [r%6+%3*2]
lea r1, [r %+ p1+%3*2]
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
lea r0, [r%6+r2*%3]
lea r1, [r %+ p1+r2*%3]
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
lea r0, [r%6+r2*%3+%3*2]
lea r1, [r %+ p1+r2*%3+%3*2]
%if UNIX64 == 0 ; fall through to function
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
RET
%endif
%endif
%endmacro
;cpu, put/avg, mc, 4/8, ...
%macro cglobal_mc 6
%assign i %3*2
%if ARCH_X86_32 || cpuflag(sse2)
MCAxA_OP %1, %2, %3, i, %4,%5,%6
%endif
cglobal %1_h264_qpel%3_%2_10, %4,%5,%6
%if UNIX64 == 0 ; no prologue or epilogue for UNIX64
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
RET
%endif
stub_%1_h264_qpel%3_%2_10 %+ SUFFIX:
%endmacro
;-----------------------------------------------------------------------------
; void h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro COPY4 0
movu m0, [r1 ]
OP_MOV [r0 ], m0
movu m0, [r1+r2 ]
OP_MOV [r0+r2 ], m0
movu m0, [r1+r2*2]
OP_MOV [r0+r2*2], m0
movu m0, [r1+r3 ]
OP_MOV [r0+r3 ], m0
%endmacro
%macro MC00 1
INIT_MMX mmxext
cglobal_mc %1, mc00, 4, 3,4,0
lea r3, [r2*3]
COPY4
ret
INIT_XMM sse2
cglobal %1_h264_qpel8_mc00_10, 3,4
lea r3, [r2*3]
COPY4
lea r0, [r0+r2*4]
lea r1, [r1+r2*4]
COPY4
RET
cglobal %1_h264_qpel16_mc00_10, 3,4
mov r3d, 8
.loop:
movu m0, [r1 ]
movu m1, [r1 +16]
OP_MOV [r0 ], m0
OP_MOV [r0 +16], m1
movu m0, [r1+r2 ]
movu m1, [r1+r2+16]
OP_MOV [r0+r2 ], m0
OP_MOV [r0+r2+16], m1
lea r0, [r0+r2*2]
lea r1, [r1+r2*2]
dec r3d
jg .loop
REP_RET
%endmacro
%define OP_MOV mova
MC00 put
%define OP_MOV AVG_MOV
MC00 avg
;-----------------------------------------------------------------------------
; void h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC_CACHE 1
%define OP_MOV mova
INIT_MMX mmxext
%1 put, 4
INIT_XMM sse2, cache64
%1 put, 8
INIT_XMM ssse3, cache64
%1 put, 8
INIT_XMM sse2
%1 put, 8
%define OP_MOV AVG_MOV
INIT_MMX mmxext
%1 avg, 4
INIT_XMM sse2, cache64
%1 avg, 8
INIT_XMM ssse3, cache64
%1 avg, 8
INIT_XMM sse2
%1 avg, 8
%endmacro
%macro MC20 2
cglobal_mc %1, mc20, %2, 3,4,9
mov r3d, %2
mova m1, [pw_pixel_max]
%if num_mmregs > 8
mova m8, [pw_16]
%define p16 m8
%else
%define p16 [pw_16]
%endif
.nextrow:
%if %0 == 4
movu m2, [r1-4]
movu m3, [r1-2]
movu m4, [r1+0]
ADDW m2, [r1+6], m5
ADDW m3, [r1+4], m5
ADDW m4, [r1+2], m5
%else ; movu is slow on these processors
%if mmsize==16
movu m2, [r1-4]
movu m0, [r1+6]
mova m6, m0
psrldq m0, 6
paddw m6, m2
PALIGNR m3, m0, m2, 2, m5
PALIGNR m7, m0, m2, 8, m5
paddw m3, m7
PALIGNR m4, m0, m2, 4, m5
PALIGNR m7, m0, m2, 6, m5
paddw m4, m7
SWAP 2, 6
%else
movu m2, [r1-4]
movu m6, [r1+4]
PALIGNR m3, m6, m2, 2, m5
paddw m3, m6
PALIGNR m4, m6, m2, 4, m5
PALIGNR m7, m6, m2, 6, m5
paddw m4, m7
paddw m2, [r1+6]
%endif
%endif
FILT_H m2, m3, m4, p16
psraw m2, 1
pxor m0, m0
CLIPW m2, m0, m1
OP_MOV [r0], m2
add r0, r2
add r1, r2
dec r3d
jg .nextrow
rep ret
%endmacro
MC_CACHE MC20
;-----------------------------------------------------------------------------
; void h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC30 2
cglobal_mc %1, mc30, %2, 3,5,9
lea r4, [r1+2]
jmp stub_%1_h264_qpel%2_mc10_10 %+ SUFFIX %+ .body
%endmacro
MC_CACHE MC30
;-----------------------------------------------------------------------------
; void h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC10 2
cglobal_mc %1, mc10, %2, 3,5,9
mov r4, r1
.body:
mov r3d, %2
mova m1, [pw_pixel_max]
%if num_mmregs > 8
mova m8, [pw_16]
%define p16 m8
%else
%define p16 [pw_16]
%endif
.nextrow:
%if %0 == 4
movu m2, [r1-4]
movu m3, [r1-2]
movu m4, [r1+0]
ADDW m2, [r1+6], m5
ADDW m3, [r1+4], m5
ADDW m4, [r1+2], m5
%else ; movu is slow on these processors
%if mmsize==16
movu m2, [r1-4]
movu m0, [r1+6]
mova m6, m0
psrldq m0, 6
paddw m6, m2
PALIGNR m3, m0, m2, 2, m5
PALIGNR m7, m0, m2, 8, m5
paddw m3, m7
PALIGNR m4, m0, m2, 4, m5
PALIGNR m7, m0, m2, 6, m5
paddw m4, m7
SWAP 2, 6
%else
movu m2, [r1-4]
movu m6, [r1+4]
PALIGNR m3, m6, m2, 2, m5
paddw m3, m6
PALIGNR m4, m6, m2, 4, m5
PALIGNR m7, m6, m2, 6, m5
paddw m4, m7
paddw m2, [r1+6]
%endif
%endif
FILT_H m2, m3, m4, p16
psraw m2, 1
pxor m0, m0
CLIPW m2, m0, m1
movu m3, [r4]
pavgw m2, m3
OP_MOV [r0], m2
add r0, r2
add r1, r2
add r4, r2
dec r3d
jg .nextrow
rep ret
%endmacro
MC_CACHE MC10
;-----------------------------------------------------------------------------
; void h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro V_FILT 10
v_filt%9_%10_10
add r4, r2
.no_addr4:
FILT_V m0, m1, m2, m3, m4, m5, m6, m7
add r1, r2
add r0, r2
ret
%endmacro
INIT_MMX mmxext
RESET_MM_PERMUTATION
%assign i 0
%rep 4
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
INIT_XMM sse2
RESET_MM_PERMUTATION
%assign i 0
%rep 6
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
%macro MC02 2
cglobal_mc %1, mc02, %2, 3,4,8
PRELOAD_V
sub r0, r2
%assign j 0
%rep %2
%assign i (j % 6)
call v_filt%2_ %+ i %+ _10.no_addr4
OP_MOV [r0], m0
SWAP 0,1,2,3,4,5
%assign j j+1
%endrep
ret
%endmacro
MC MC02
;-----------------------------------------------------------------------------
; void h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC01 2
cglobal_mc %1, mc01, %2, 3,5,8
mov r4, r1
.body:
PRELOAD_V
sub r4, r2
sub r0, r2
%assign j 0
%rep %2
%assign i (j % 6)
call v_filt%2_ %+ i %+ _10
movu m7, [r4]
pavgw m0, m7
OP_MOV [r0], m0
SWAP 0,1,2,3,4,5
%assign j j+1
%endrep
ret
%endmacro
MC MC01
;-----------------------------------------------------------------------------
; void h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC03 2
cglobal_mc %1, mc03, %2, 3,5,8
lea r4, [r1+r2]
jmp stub_%1_h264_qpel%2_mc01_10 %+ SUFFIX %+ .body
%endmacro
MC MC03
;-----------------------------------------------------------------------------
; void h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro H_FILT_AVG 2-3
h_filt%1_%2_10:
;FILT_H with fewer registers and averaged with the FILT_V result
;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are to be used next in the next iteration
;unfortunately I need three registers, so m5 will have to be re-read from memory
movu m5, [r4-4]
ADDW m5, [r4+6], m7
movu m6, [r4-2]
ADDW m6, [r4+4], m7
paddw m5, [pw_16]
psubw m5, m6 ; a-b
psraw m5, 2 ; (a-b)/4
psubw m5, m6 ; (a-b)/4-b
movu m6, [r4+0]
ADDW m6, [r4+2], m7
paddw m5, m6 ; (a-b)/4-b+c
psraw m5, 2 ; ((a-b)/4-b+c)/4
paddw m5, m6 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
psraw m5, 1
CLIPW m5, [pb_0], [pw_pixel_max]
;avg FILT_V, FILT_H
pavgw m0, m5
%if %0!=4
movu m5, [r1+r5]
%endif
ret
%endmacro
INIT_MMX mmxext
RESET_MM_PERMUTATION
%assign i 0
%rep 3
H_FILT_AVG 4, i
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
H_FILT_AVG 4, i, 0
INIT_XMM sse2
RESET_MM_PERMUTATION
%assign i 0
%rep 6
%if i==1
H_FILT_AVG 8, i, 0
%else
H_FILT_AVG 8, i
%endif
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
%macro MC11 2
; this REALLY needs x86_64
cglobal_mc %1, mc11, %2, 3,6,8
mov r4, r1
.body:
PRELOAD_V
sub r0, r2
sub r4, r2
mov r5, r2
neg r5
%assign j 0
%rep %2
%assign i (j % 6)
call v_filt%2_ %+ i %+ _10
call h_filt%2_ %+ i %+ _10
%if %2==8 && i==1
movu m5, [r1+r5]
%endif
OP_MOV [r0], m0
SWAP 0,1,2,3,4,5
%assign j j+1
%endrep
ret
%endmacro
MC MC11
;-----------------------------------------------------------------------------
; void h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC31 2
cglobal_mc %1, mc31, %2, 3,6,8
mov r4, r1
add r1, 2
jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
%endmacro
MC MC31
;-----------------------------------------------------------------------------
; void h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC13 2
cglobal_mc %1, mc13, %2, 3,7,12
lea r4, [r1+r2]
jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
%endmacro
MC MC13
;-----------------------------------------------------------------------------
; void h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC33 2
cglobal_mc %1, mc33, %2, 3,6,8
lea r4, [r1+r2]
add r1, 2
jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
%endmacro
MC MC33
;-----------------------------------------------------------------------------
; void h264_qpel_mc22(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro FILT_H2 3
psubw %1, %2 ; a-b
psubw %2, %3 ; b-c
psllw %2, 2
psubw %1, %2 ; a-5*b+4*c
psllw %3, 4
paddw %1, %3 ; a-5*b+20*c
%endmacro
%macro FILT_VNRD 8
movu %6, [r1]
paddw %1, %6
mova %7, %2
paddw %7, %5
mova %8, %3
paddw %8, %4
FILT_H2 %1, %7, %8
%endmacro
%macro HV 1
%if mmsize==16
%define PAD 12
%define COUNT 2
%else
%define PAD 4
%define COUNT 3
%endif
put_hv%1_10:
neg r2 ; This actually saves instructions
lea r1, [r1+r2*2-mmsize+PAD]
lea r4, [rsp+PAD+gprsize]
mov r3d, COUNT
.v_loop:
movu m0, [r1]
sub r1, r2
movu m1, [r1]
sub r1, r2
movu m2, [r1]
sub r1, r2
movu m3, [r1]
sub r1, r2
movu m4, [r1]
sub r1, r2
%assign i 0
%rep %1-1
FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
psubw m0, [pad20]
movu [r4+i*mmsize*3], m0
sub r1, r2
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
psubw m0, [pad20]
movu [r4+i*mmsize*3], m0
add r4, mmsize
lea r1, [r1+r2*8+mmsize]
%if %1==8
lea r1, [r1+r2*4]
%endif
dec r3d
jg .v_loop
neg r2
ret
%endmacro
INIT_MMX mmxext
HV 4
INIT_XMM sse2
HV 8
%macro H_LOOP 1
%if num_mmregs > 8
%define s1 m8
%define s2 m9
%define s3 m10
%define d1 m11
%else
%define s1 [tap1]
%define s2 [tap2]
%define s3 [tap3]
%define d1 [depad]
%endif
h%1_loop_op:
movu m1, [r1+mmsize-4]
movu m2, [r1+mmsize-2]
mova m3, [r1+mmsize+0]
movu m4, [r1+mmsize+2]
movu m5, [r1+mmsize+4]
movu m6, [r1+mmsize+6]
%if num_mmregs > 8
pmaddwd m1, s1
pmaddwd m2, s1
pmaddwd m3, s2
pmaddwd m4, s2
pmaddwd m5, s3
pmaddwd m6, s3
paddd m1, d1
paddd m2, d1
%else
mova m0, s1
pmaddwd m1, m0
pmaddwd m2, m0
mova m0, s2
pmaddwd m3, m0
pmaddwd m4, m0
mova m0, s3
pmaddwd m5, m0
pmaddwd m6, m0
mova m0, d1
paddd m1, m0
paddd m2, m0
%endif
paddd m3, m5
paddd m4, m6
paddd m1, m3
paddd m2, m4
psrad m1, 10
psrad m2, 10
pslld m2, 16
pand m1, [pd_0f]
por m1, m2
%if num_mmregs <= 8
pxor m0, m0
%endif
CLIPW m1, m0, m7
add r1, mmsize*3
ret
%endmacro
INIT_MMX mmxext
H_LOOP 4
INIT_XMM sse2
H_LOOP 8
%macro MC22 2
cglobal_mc %1, mc22, %2, 3,7,12
%define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel)
mov r6, rsp ; backup stack pointer
and rsp, ~(mmsize-1) ; align stack
sub rsp, PAD
call put_hv%2_10
mov r3d, %2
mova m7, [pw_pixel_max]
%if num_mmregs > 8
pxor m0, m0
mova m8, [tap1]
mova m9, [tap2]
mova m10, [tap3]
mova m11, [depad]
%endif
mov r1, rsp
.h_loop:
call h%2_loop_op
OP_MOV [r0], m1
add r0, r2
dec r3d
jg .h_loop
mov rsp, r6 ; restore stack pointer
ret
%endmacro
MC MC22
;-----------------------------------------------------------------------------
; void h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC12 2
cglobal_mc %1, mc12, %2, 3,7,12
%define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel)
mov r6, rsp ; backup stack pointer
and rsp, ~(mmsize-1) ; align stack
sub rsp, PAD
call put_hv%2_10
xor r4d, r4d
.body:
mov r3d, %2
pxor m0, m0
mova m7, [pw_pixel_max]
%if num_mmregs > 8
mova m8, [tap1]
mova m9, [tap2]
mova m10, [tap3]
mova m11, [depad]
%endif
mov r1, rsp
.h_loop:
call h%2_loop_op
movu m3, [r1+r4-2*mmsize] ; movu needed for mc32, etc
paddw m3, [depad2]
psrlw m3, 5
psubw m3, [unpad]
CLIPW m3, m0, m7
pavgw m1, m3
OP_MOV [r0], m1
add r0, r2
dec r3d
jg .h_loop
mov rsp, r6 ; restore stack pointer
ret
%endmacro
MC MC12
;-----------------------------------------------------------------------------
; void h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC32 2
cglobal_mc %1, mc32, %2, 3,7,12
%define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel)
mov r6, rsp ; backup stack pointer
and rsp, ~(mmsize-1) ; align stack
sub rsp, PAD
call put_hv%2_10
mov r4d, 2 ; sizeof(pixel)
jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
%endmacro
MC MC32
;-----------------------------------------------------------------------------
; void h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro H_NRD 1
put_h%1_10:
add rsp, gprsize
mov r3d, %1
xor r4d, r4d
mova m6, [pad20]
.nextrow:
movu m2, [r5-4]
movu m3, [r5-2]
movu m4, [r5+0]
ADDW m2, [r5+6], m5
ADDW m3, [r5+4], m5
ADDW m4, [r5+2], m5
FILT_H2 m2, m3, m4
psubw m2, m6
mova [rsp+r4], m2
add r4d, mmsize*3
add r5, r2
dec r3d
jg .nextrow
sub rsp, gprsize
ret
%endmacro
INIT_MMX mmxext
H_NRD 4
INIT_XMM sse2
H_NRD 8
%macro MC21 2
cglobal_mc %1, mc21, %2, 3,7,12
mov r5, r1
.body:
%define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel)
mov r6, rsp ; backup stack pointer
and rsp, ~(mmsize-1) ; align stack
sub rsp, PAD
call put_h%2_10
sub rsp, PAD
call put_hv%2_10
mov r4d, PAD-mmsize ; H buffer
jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
%endmacro
MC MC21
;-----------------------------------------------------------------------------
; void h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC23 2
cglobal_mc %1, mc23, %2, 3,7,12
lea r5, [r1+r2]
jmp stub_%1_h264_qpel%2_mc21_10 %+ SUFFIX %+ .body
%endmacro
MC MC23

View File

@@ -0,0 +1,862 @@
;*****************************************************************************
;* MMX/SSE2/SSSE3-optimized H.264 QPEL code
;*****************************************************************************
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2012 Daniel Kang
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA 32
cextern pw_16
cextern pw_5
cextern pb_0
SECTION .text
%macro op_avgh 3
movh %3, %2
pavgb %1, %3
movh %2, %1
%endmacro
%macro op_avg 2-3
pavgb %1, %2
mova %2, %1
%endmacro
%macro op_puth 2-3
movh %2, %1
%endmacro
%macro op_put 2-3
mova %2, %1
%endmacro
%macro QPEL4_H_LOWPASS_OP 1
cglobal %1_h264_qpel4_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
pxor m7, m7
mova m4, [pw_5]
mova m5, [pw_16]
mov r4d, 4
.loop:
movh m1, [r1-1]
movh m2, [r1+0]
movh m3, [r1+1]
movh m0, [r1+2]
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m0, m7
paddw m1, m0
paddw m2, m3
movh m0, [r1-2]
movh m3, [r1+3]
punpcklbw m0, m7
punpcklbw m3, m7
paddw m0, m3
psllw m2, 2
psubw m2, m1
pmullw m2, m4
paddw m0, m5
paddw m0, m2
psraw m0, 5
packuswb m0, m0
op_%1h m0, [r0], m6
add r0, r2
add r1, r3
dec r4d
jg .loop
REP_RET
%endmacro
INIT_MMX mmxext
QPEL4_H_LOWPASS_OP put
QPEL4_H_LOWPASS_OP avg
%macro QPEL8_H_LOWPASS_OP 1
cglobal %1_h264_qpel8_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
mov r4d, 8
pxor m7, m7
mova m6, [pw_5]
.loop:
mova m0, [r1]
mova m2, [r1+1]
mova m1, m0
mova m3, m2
punpcklbw m0, m7
punpckhbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
paddw m0, m2
paddw m1, m3
psllw m0, 2
psllw m1, 2
mova m2, [r1-1]
mova m4, [r1+2]
mova m3, m2
mova m5, m4
punpcklbw m2, m7
punpckhbw m3, m7
punpcklbw m4, m7
punpckhbw m5, m7
paddw m2, m4
paddw m5, m3
psubw m0, m2
psubw m1, m5
pmullw m0, m6
pmullw m1, m6
movd m2, [r1-2]
movd m5, [r1+7]
punpcklbw m2, m7
punpcklbw m5, m7
paddw m2, m3
paddw m4, m5
mova m5, [pw_16]
paddw m2, m5
paddw m4, m5
paddw m0, m2
paddw m1, m4
psraw m0, 5
psraw m1, 5
packuswb m0, m1
op_%1 m0, [r0], m4
add r0, r2
add r1, r3
dec r4d
jg .loop
REP_RET
%endmacro
INIT_MMX mmxext
QPEL8_H_LOWPASS_OP put
QPEL8_H_LOWPASS_OP avg
%macro QPEL8_H_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass, 4,5,8 ; dst, src, dstStride, srcStride
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
mov r4d, 8
pxor m7, m7
mova m6, [pw_5]
.loop:
movu m1, [r1-2]
mova m0, m1
punpckhbw m1, m7
punpcklbw m0, m7
mova m2, m1
mova m3, m1
mova m4, m1
mova m5, m1
palignr m4, m0, 2
palignr m3, m0, 4
palignr m2, m0, 6
palignr m1, m0, 8
palignr m5, m0, 10
paddw m0, m5
paddw m2, m3
paddw m1, m4
psllw m2, 2
psubw m2, m1
paddw m0, [pw_16]
pmullw m2, m6
paddw m2, m0
psraw m2, 5
packuswb m2, m2
op_%1h m2, [r0], m4
add r1, r3
add r0, r2
dec r4d
jne .loop
REP_RET
%endmacro
INIT_XMM ssse3
QPEL8_H_LOWPASS_OP_XMM put
QPEL8_H_LOWPASS_OP_XMM avg
%macro QPEL4_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel4_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, srcStride
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
pxor m7, m7
mova m4, [pw_5]
mova m5, [pw_16]
mov r5d, 4
.loop:
movh m1, [r1-1]
movh m2, [r1+0]
movh m3, [r1+1]
movh m0, [r1+2]
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m0, m7
paddw m1, m0
paddw m2, m3
movh m0, [r1-2]
movh m3, [r1+3]
punpcklbw m0, m7
punpcklbw m3, m7
paddw m0, m3
psllw m2, 2
psubw m2, m1
pmullw m2, m4
paddw m0, m5
paddw m0, m2
movh m3, [r2]
psraw m0, 5
packuswb m0, m0
pavgb m0, m3
op_%1h m0, [r0], m6
add r0, r3
add r1, r3
add r2, r4
dec r5d
jg .loop
REP_RET
%endmacro
INIT_MMX mmxext
QPEL4_H_LOWPASS_L2_OP put
QPEL4_H_LOWPASS_L2_OP avg
%macro QPEL8_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, srcStride
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
mov r5d, 8
pxor m7, m7
mova m6, [pw_5]
.loop:
mova m0, [r1]
mova m2, [r1+1]
mova m1, m0
mova m3, m2
punpcklbw m0, m7
punpckhbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
paddw m0, m2
paddw m1, m3
psllw m0, 2
psllw m1, 2
mova m2, [r1-1]
mova m4, [r1+2]
mova m3, m2
mova m5, m4
punpcklbw m2, m7
punpckhbw m3, m7
punpcklbw m4, m7
punpckhbw m5, m7
paddw m2, m4
paddw m5, m3
psubw m0, m2
psubw m1, m5
pmullw m0, m6
pmullw m1, m6
movd m2, [r1-2]
movd m5, [r1+7]
punpcklbw m2, m7
punpcklbw m5, m7
paddw m2, m3
paddw m4, m5
mova m5, [pw_16]
paddw m2, m5
paddw m4, m5
paddw m0, m2
paddw m1, m4
psraw m0, 5
psraw m1, 5
mova m4, [r2]
packuswb m0, m1
pavgb m0, m4
op_%1 m0, [r0], m4
add r0, r3
add r1, r3
add r2, r4
dec r5d
jg .loop
REP_RET
%endmacro
INIT_MMX mmxext
QPEL8_H_LOWPASS_L2_OP put
QPEL8_H_LOWPASS_L2_OP avg
%macro QPEL8_H_LOWPASS_L2_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, src2Stride
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
mov r5d, 8
pxor m7, m7
mova m6, [pw_5]
.loop:
lddqu m1, [r1-2]
mova m0, m1
punpckhbw m1, m7
punpcklbw m0, m7
mova m2, m1
mova m3, m1
mova m4, m1
mova m5, m1
palignr m4, m0, 2
palignr m3, m0, 4
palignr m2, m0, 6
palignr m1, m0, 8
palignr m5, m0, 10
paddw m0, m5
paddw m2, m3
paddw m1, m4
psllw m2, 2
movh m3, [r2]
psubw m2, m1
paddw m0, [pw_16]
pmullw m2, m6
paddw m2, m0
psraw m2, 5
packuswb m2, m2
pavgb m2, m3
op_%1h m2, [r0], m4
add r1, r3
add r0, r3
add r2, r4
dec r5d
jg .loop
REP_RET
%endmacro
INIT_XMM ssse3
QPEL8_H_LOWPASS_L2_OP_XMM put
QPEL8_H_LOWPASS_L2_OP_XMM avg
; All functions that call this are required to have function arguments of
; dst, src, dstStride, srcStride
%macro FILT_V 1
mova m6, m2
movh m5, [r1]
paddw m6, m3
psllw m6, 2
psubw m6, m1
psubw m6, m4
punpcklbw m5, m7
pmullw m6, [pw_5]
paddw m0, [pw_16]
add r1, r3
paddw m0, m5
paddw m6, m0
psraw m6, 5
packuswb m6, m6
op_%1h m6, [r0], m0 ; 1
add r0, r2
SWAP 0, 1, 2, 3, 4, 5
%endmacro
%macro QPEL4_V_LOWPASS_OP 1
cglobal %1_h264_qpel4_v_lowpass, 4,4 ; dst, src, dstStride, srcStride
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
sub r1, r3
sub r1, r3
pxor m7, m7
movh m0, [r1]
movh m1, [r1+r3]
lea r1, [r1+2*r3]
movh m2, [r1]
movh m3, [r1+r3]
lea r1, [r1+2*r3]
movh m4, [r1]
add r1, r3
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m4, m7
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
RET
%endmacro
INIT_MMX mmxext
QPEL4_V_LOWPASS_OP put
QPEL4_V_LOWPASS_OP avg
%macro QPEL8OR16_V_LOWPASS_OP 1
%if cpuflag(sse2)
cglobal %1_h264_qpel8or16_v_lowpass, 5,5,8 ; dst, src, dstStride, srcStride, h
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
sub r1, r3
sub r1, r3
%else
cglobal %1_h264_qpel8or16_v_lowpass_op, 5,5,8 ; dst, src, dstStride, srcStride, h
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
%endif
pxor m7, m7
movh m0, [r1]
movh m1, [r1+r3]
lea r1, [r1+2*r3]
movh m2, [r1]
movh m3, [r1+r3]
lea r1, [r1+2*r3]
movh m4, [r1]
add r1, r3
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m4, m7
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
cmp r4d, 16
jne .end
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
.end:
REP_RET
%endmacro
INIT_MMX mmxext
QPEL8OR16_V_LOWPASS_OP put
QPEL8OR16_V_LOWPASS_OP avg
INIT_XMM sse2
QPEL8OR16_V_LOWPASS_OP put
QPEL8OR16_V_LOWPASS_OP avg
; All functions that use this are required to have args:
; src, tmp, srcSize
%macro FILT_HV 1 ; offset
mova m6, m2
movh m5, [r0]
paddw m6, m3
psllw m6, 2
paddw m0, [pw_16]
psubw m6, m1
psubw m6, m4
punpcklbw m5, m7
pmullw m6, [pw_5]
paddw m0, m5
add r0, r2
paddw m6, m0
mova [r1+%1], m6
SWAP 0, 1, 2, 3, 4, 5
%endmacro
%macro QPEL4_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride
movsxdifnidn r2, r2d
pxor m7, m7
movh m0, [r0]
movh m1, [r0+r2]
lea r0, [r0+2*r2]
movh m2, [r0]
movh m3, [r0+r2]
lea r0, [r0+2*r2]
movh m4, [r0]
add r0, r2
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m4, m7
FILT_HV 0*24
FILT_HV 1*24
FILT_HV 2*24
FILT_HV 3*24
RET
cglobal %1_h264_qpel4_hv_lowpass_h, 3,4 ; tmp, dst, dstStride
movsxdifnidn r2, r2d
mov r3d, 4
.loop:
mova m0, [r0]
paddw m0, [r0+10]
mova m1, [r0+2]
paddw m1, [r0+8]
mova m2, [r0+4]
paddw m2, [r0+6]
psubw m0, m1
psraw m0, 2
psubw m0, m1
paddsw m0, m2
psraw m0, 2
paddw m0, m2
psraw m0, 6
packuswb m0, m0
op_%1h m0, [r1], m7
add r0, 24
add r1, r2
dec r3d
jnz .loop
REP_RET
%endmacro
INIT_MMX mmxext
QPEL4_HV1_LOWPASS_OP put
QPEL4_HV1_LOWPASS_OP avg
%macro QPEL8OR16_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size
movsxdifnidn r2, r2d
pxor m7, m7
movh m0, [r0]
movh m1, [r0+r2]
lea r0, [r0+2*r2]
movh m2, [r0]
movh m3, [r0+r2]
lea r0, [r0+2*r2]
movh m4, [r0]
add r0, r2
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m4, m7
FILT_HV 0*48
FILT_HV 1*48
FILT_HV 2*48
FILT_HV 3*48
FILT_HV 4*48
FILT_HV 5*48
FILT_HV 6*48
FILT_HV 7*48
cmp r3d, 16
jne .end
FILT_HV 8*48
FILT_HV 9*48
FILT_HV 10*48
FILT_HV 11*48
FILT_HV 12*48
FILT_HV 13*48
FILT_HV 14*48
FILT_HV 15*48
.end:
REP_RET
%endmacro
INIT_MMX mmxext
QPEL8OR16_HV1_LOWPASS_OP put
QPEL8OR16_HV1_LOWPASS_OP avg
INIT_XMM sse2
QPEL8OR16_HV1_LOWPASS_OP put
%macro QPEL8OR16_HV2_LOWPASS_OP 1
; unused is to match ssse3 and mmxext args
cglobal %1_h264_qpel8or16_hv2_lowpass_op, 5,5 ; dst, tmp, dstStride, unused, h
movsxdifnidn r2, r2d
.loop:
mova m0, [r1]
mova m3, [r1+8]
mova m1, [r1+2]
mova m4, [r1+10]
paddw m0, m4
paddw m1, m3
paddw m3, [r1+18]
paddw m4, [r1+16]
mova m2, [r1+4]
mova m5, [r1+12]
paddw m2, [r1+6]
paddw m5, [r1+14]
psubw m0, m1
psubw m3, m4
psraw m0, 2
psraw m3, 2
psubw m0, m1
psubw m3, m4
paddsw m0, m2
paddsw m3, m5
psraw m0, 2
psraw m3, 2
paddw m0, m2
paddw m3, m5
psraw m0, 6
psraw m3, 6
packuswb m0, m3
op_%1 m0, [r0], m7
add r1, 48
add r0, r2
dec r4d
jne .loop
REP_RET
%endmacro
INIT_MMX mmxext
QPEL8OR16_HV2_LOWPASS_OP put
QPEL8OR16_HV2_LOWPASS_OP avg
%macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8or16_hv2_lowpass, 5,5,8 ; dst, tmp, dstStride, tmpStride, size
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
cmp r4d, 16
je .op16
.loop8:
mova m1, [r1+16]
mova m0, [r1]
mova m2, m1
mova m3, m1
mova m4, m1
mova m5, m1
palignr m5, m0, 10
palignr m4, m0, 8
palignr m3, m0, 6
palignr m2, m0, 4
palignr m1, m0, 2
paddw m0, m5
paddw m1, m4
paddw m2, m3
psubw m0, m1
psraw m0, 2
psubw m0, m1
paddw m0, m2
psraw m0, 2
paddw m0, m2
psraw m0, 6
packuswb m0, m0
op_%1h m0, [r0], m7
add r1, 48
add r0, r2
dec r4d
jne .loop8
jmp .done
.op16:
mova m4, [r1+32]
mova m5, [r1+16]
mova m7, [r1]
mova m3, m4
mova m2, m4
mova m1, m4
mova m0, m4
palignr m0, m5, 10
palignr m1, m5, 8
palignr m2, m5, 6
palignr m3, m5, 4
palignr m4, m5, 2
paddw m0, m5
paddw m1, m4
paddw m2, m3
mova m6, m5
mova m4, m5
mova m3, m5
palignr m4, m7, 8
palignr m6, m7, 2
palignr m3, m7, 10
paddw m4, m6
mova m6, m5
palignr m5, m7, 6
palignr m6, m7, 4
paddw m3, m7
paddw m5, m6
psubw m0, m1
psubw m3, m4
psraw m0, 2
psraw m3, 2
psubw m0, m1
psubw m3, m4
paddw m0, m2
paddw m3, m5
psraw m0, 2
psraw m3, 2
paddw m0, m2
paddw m3, m5
psraw m0, 6
psraw m3, 6
packuswb m3, m0
op_%1 m3, [r0], m7
add r1, 48
add r0, r2
dec r4d
jne .op16
.done:
REP_RET
%endmacro
INIT_XMM ssse3
QPEL8OR16_HV2_LOWPASS_OP_XMM put
QPEL8OR16_HV2_LOWPASS_OP_XMM avg
%macro PIXELS4_L2_SHIFT5 1
cglobal %1_pixels4_l2_shift5,6,6 ; dst, src16, src8, dstStride, src8Stride, h
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
mova m0, [r1]
mova m1, [r1+24]
psraw m0, 5
psraw m1, 5
packuswb m0, m0
packuswb m1, m1
pavgb m0, [r2]
pavgb m1, [r2+r4]
op_%1h m0, [r0], m4
op_%1h m1, [r0+r3], m5
lea r2, [r2+r4*2]
lea r0, [r0+r3*2]
mova m0, [r1+48]
mova m1, [r1+72]
psraw m0, 5
psraw m1, 5
packuswb m0, m0
packuswb m1, m1
pavgb m0, [r2]
pavgb m1, [r2+r4]
op_%1h m0, [r0], m4
op_%1h m1, [r0+r3], m5
RET
%endmacro
INIT_MMX mmxext
PIXELS4_L2_SHIFT5 put
PIXELS4_L2_SHIFT5 avg
%macro PIXELS8_L2_SHIFT5 1
cglobal %1_pixels8_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
.loop:
mova m0, [r1]
mova m1, [r1+8]
mova m2, [r1+48]
mova m3, [r1+48+8]
psraw m0, 5
psraw m1, 5
psraw m2, 5
psraw m3, 5
packuswb m0, m1
packuswb m2, m3
pavgb m0, [r2]
pavgb m2, [r2+r4]
op_%1 m0, [r0], m4
op_%1 m2, [r0+r3], m5
lea r2, [r2+2*r4]
add r1, 48*2
lea r0, [r0+2*r3]
sub r5d, 2
jne .loop
REP_RET
%endmacro
INIT_MMX mmxext
PIXELS8_L2_SHIFT5 put
PIXELS8_L2_SHIFT5 avg
%if ARCH_X86_64
%macro QPEL16_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel16_h_lowpass_l2, 5, 6, 16 ; dst, src, src2, dstStride, src2Stride
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
mov r5d, 16
pxor m15, m15
mova m14, [pw_5]
mova m13, [pw_16]
.loop:
lddqu m1, [r1+6]
lddqu m7, [r1-2]
mova m0, m1
punpckhbw m1, m15
punpcklbw m0, m15
punpcklbw m7, m15
mova m2, m1
mova m6, m0
mova m3, m1
mova m8, m0
mova m4, m1
mova m9, m0
mova m12, m0
mova m11, m1
palignr m11, m0, 10
palignr m12, m7, 10
palignr m4, m0, 2
palignr m9, m7, 2
palignr m3, m0, 4
palignr m8, m7, 4
palignr m2, m0, 6
palignr m6, m7, 6
paddw m11, m0
palignr m1, m0, 8
palignr m0, m7, 8
paddw m7, m12
paddw m2, m3
paddw m6, m8
paddw m1, m4
paddw m0, m9
psllw m2, 2
psllw m6, 2
psubw m2, m1
psubw m6, m0
paddw m11, m13
paddw m7, m13
pmullw m2, m14
pmullw m6, m14
lddqu m3, [r2]
paddw m2, m11
paddw m6, m7
psraw m2, 5
psraw m6, 5
packuswb m6, m2
pavgb m6, m3
op_%1 m6, [r0], m11
add r1, r3
add r0, r3
add r2, r4
dec r5d
jg .loop
REP_RET
%endmacro
INIT_XMM ssse3
QPEL16_H_LOWPASS_L2_OP put
QPEL16_H_LOWPASS_L2_OP avg
%endif

View File

@@ -0,0 +1,317 @@
;*****************************************************************************
;* SSE2-optimized weighted prediction code
;*****************************************************************************
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION .text
;-----------------------------------------------------------------------------
; biweight pred:
;
; void h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
; int height, int log2_denom, int weightd,
; int weights, int offset);
; and
; void h264_weight_16_sse2(uint8_t *dst, int stride, int height,
; int log2_denom, int weight, int offset);
;-----------------------------------------------------------------------------
%macro WEIGHT_SETUP 0
add r5, r5
inc r5
movd m3, r4d
movd m5, r5d
movd m6, r3d
pslld m5, m6
psrld m5, 1
%if mmsize == 16
pshuflw m3, m3, 0
pshuflw m5, m5, 0
punpcklqdq m3, m3
punpcklqdq m5, m5
%else
pshufw m3, m3, 0
pshufw m5, m5, 0
%endif
pxor m7, m7
%endmacro
%macro WEIGHT_OP 2
movh m0, [r0+%1]
movh m1, [r0+%2]
punpcklbw m0, m7
punpcklbw m1, m7
pmullw m0, m3
pmullw m1, m3
paddsw m0, m5
paddsw m1, m5
psraw m0, m6
psraw m1, m6
packuswb m0, m1
%endmacro
INIT_MMX mmxext
cglobal h264_weight_16, 6, 6, 0
WEIGHT_SETUP
.nextrow:
WEIGHT_OP 0, 4
mova [r0 ], m0
WEIGHT_OP 8, 12
mova [r0+8], m0
add r0, r1
dec r2d
jnz .nextrow
REP_RET
%macro WEIGHT_FUNC_MM 2
cglobal h264_weight_%1, 6, 6, %2
WEIGHT_SETUP
.nextrow:
WEIGHT_OP 0, mmsize/2
mova [r0], m0
add r0, r1
dec r2d
jnz .nextrow
REP_RET
%endmacro
INIT_MMX mmxext
WEIGHT_FUNC_MM 8, 0
INIT_XMM sse2
WEIGHT_FUNC_MM 16, 8
%macro WEIGHT_FUNC_HALF_MM 2
cglobal h264_weight_%1, 6, 6, %2
WEIGHT_SETUP
sar r2d, 1
lea r3, [r1*2]
.nextrow:
WEIGHT_OP 0, r1
movh [r0], m0
%if mmsize == 16
movhps [r0+r1], m0
%else
psrlq m0, 32
movh [r0+r1], m0
%endif
add r0, r3
dec r2d
jnz .nextrow
REP_RET
%endmacro
INIT_MMX mmxext
WEIGHT_FUNC_HALF_MM 4, 0
INIT_XMM sse2
WEIGHT_FUNC_HALF_MM 8, 8
%macro BIWEIGHT_SETUP 0
%if ARCH_X86_64
%define off_regd r7d
%else
%define off_regd r3d
%endif
mov off_regd, r7m
add off_regd, 1
or off_regd, 1
add r4, 1
cmp r5, 128
jne .normal
sar r5, 1
sar r6, 1
sar off_regd, 1
sub r4, 1
.normal
%if cpuflag(ssse3)
movd m4, r5d
movd m0, r6d
%else
movd m3, r5d
movd m4, r6d
%endif
movd m5, off_regd
movd m6, r4d
pslld m5, m6
psrld m5, 1
%if cpuflag(ssse3)
punpcklbw m4, m0
pshuflw m4, m4, 0
pshuflw m5, m5, 0
punpcklqdq m4, m4
punpcklqdq m5, m5
%else
%if mmsize == 16
pshuflw m3, m3, 0
pshuflw m4, m4, 0
pshuflw m5, m5, 0
punpcklqdq m3, m3
punpcklqdq m4, m4
punpcklqdq m5, m5
%else
pshufw m3, m3, 0
pshufw m4, m4, 0
pshufw m5, m5, 0
%endif
pxor m7, m7
%endif
%endmacro
%macro BIWEIGHT_STEPA 3
movh m%1, [r0+%3]
movh m%2, [r1+%3]
punpcklbw m%1, m7
punpcklbw m%2, m7
pmullw m%1, m3
pmullw m%2, m4
paddsw m%1, m%2
%endmacro
%macro BIWEIGHT_STEPB 0
paddsw m0, m5
paddsw m1, m5
psraw m0, m6
psraw m1, m6
packuswb m0, m1
%endmacro
INIT_MMX mmxext
cglobal h264_biweight_16, 7, 8, 0
BIWEIGHT_SETUP
movifnidn r3d, r3m
.nextrow:
BIWEIGHT_STEPA 0, 1, 0
BIWEIGHT_STEPA 1, 2, 4
BIWEIGHT_STEPB
mova [r0], m0
BIWEIGHT_STEPA 0, 1, 8
BIWEIGHT_STEPA 1, 2, 12
BIWEIGHT_STEPB
mova [r0+8], m0
add r0, r2
add r1, r2
dec r3d
jnz .nextrow
REP_RET
%macro BIWEIGHT_FUNC_MM 2
cglobal h264_biweight_%1, 7, 8, %2
BIWEIGHT_SETUP
movifnidn r3d, r3m
.nextrow:
BIWEIGHT_STEPA 0, 1, 0
BIWEIGHT_STEPA 1, 2, mmsize/2
BIWEIGHT_STEPB
mova [r0], m0
add r0, r2
add r1, r2
dec r3d
jnz .nextrow
REP_RET
%endmacro
INIT_MMX mmxext
BIWEIGHT_FUNC_MM 8, 0
INIT_XMM sse2
BIWEIGHT_FUNC_MM 16, 8
%macro BIWEIGHT_FUNC_HALF_MM 2
cglobal h264_biweight_%1, 7, 8, %2
BIWEIGHT_SETUP
movifnidn r3d, r3m
sar r3, 1
lea r4, [r2*2]
.nextrow:
BIWEIGHT_STEPA 0, 1, 0
BIWEIGHT_STEPA 1, 2, r2
BIWEIGHT_STEPB
movh [r0], m0
%if mmsize == 16
movhps [r0+r2], m0
%else
psrlq m0, 32
movh [r0+r2], m0
%endif
add r0, r4
add r1, r4
dec r3d
jnz .nextrow
REP_RET
%endmacro
INIT_MMX mmxext
BIWEIGHT_FUNC_HALF_MM 4, 0
INIT_XMM sse2
BIWEIGHT_FUNC_HALF_MM 8, 8
%macro BIWEIGHT_SSSE3_OP 0
pmaddubsw m0, m4
pmaddubsw m2, m4
paddsw m0, m5
paddsw m2, m5
psraw m0, m6
psraw m2, m6
packuswb m0, m2
%endmacro
INIT_XMM ssse3
cglobal h264_biweight_16, 7, 8, 8
BIWEIGHT_SETUP
movifnidn r3d, r3m
.nextrow:
movh m0, [r0]
movh m2, [r0+8]
movh m3, [r1+8]
punpcklbw m0, [r1]
punpcklbw m2, m3
BIWEIGHT_SSSE3_OP
mova [r0], m0
add r0, r2
add r1, r2
dec r3d
jnz .nextrow
REP_RET
INIT_XMM ssse3
cglobal h264_biweight_8, 7, 8, 8
BIWEIGHT_SETUP
movifnidn r3d, r3m
sar r3, 1
lea r4, [r2*2]
.nextrow:
movh m0, [r0]
movh m1, [r1]
movh m2, [r0+r2]
movh m3, [r1+r2]
punpcklbw m0, m1
punpcklbw m2, m3
BIWEIGHT_SSSE3_OP
movh [r0], m0
movhps [r0+r2], m0
add r0, r4
add r1, r4
dec r3d
jnz .nextrow
REP_RET

View File

@@ -0,0 +1,282 @@
;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 weighted prediction code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA 32
pw_pixel_max: times 8 dw ((1 << 10)-1)
sq_1: dq 1
dq 0
cextern pw_1
SECTION .text
;-----------------------------------------------------------------------------
; void h264_weight(uint8_t *dst, int stride, int height, int log2_denom,
; int weight, int offset);
;-----------------------------------------------------------------------------
%macro WEIGHT_PROLOGUE 0
.prologue:
PROLOGUE 0,6,8
movifnidn r0, r0mp
movifnidn r1d, r1m
movifnidn r2d, r2m
movifnidn r4d, r4m
movifnidn r5d, r5m
%endmacro
%macro WEIGHT_SETUP 0
mova m0, [pw_1]
movd m2, r3m
pslld m0, m2 ; 1<<log2_denom
SPLATW m0, m0
shl r5, 19 ; *8, move to upper half of dword
lea r5, [r5+r4*2+0x10000]
movd m3, r5d ; weight<<1 | 1+(offset<<(3))
pshufd m3, m3, 0
mova m4, [pw_pixel_max]
paddw m2, [sq_1] ; log2_denom+1
%if notcpuflag(sse4)
pxor m7, m7
%endif
%endmacro
%macro WEIGHT_OP 1-2
%if %0==1
mova m5, [r0+%1]
punpckhwd m6, m5, m0
punpcklwd m5, m0
%else
movq m5, [r0+%1]
movq m6, [r0+%2]
punpcklwd m5, m0
punpcklwd m6, m0
%endif
pmaddwd m5, m3
pmaddwd m6, m3
psrad m5, m2
psrad m6, m2
%if cpuflag(sse4)
packusdw m5, m6
pminsw m5, m4
%else
packssdw m5, m6
CLIPW m5, m7, m4
%endif
%endmacro
%macro WEIGHT_FUNC_DBL 0
cglobal h264_weight_16_10
WEIGHT_PROLOGUE
WEIGHT_SETUP
.nextrow:
WEIGHT_OP 0
mova [r0 ], m5
WEIGHT_OP 16
mova [r0+16], m5
add r0, r1
dec r2d
jnz .nextrow
REP_RET
%endmacro
INIT_XMM sse2
WEIGHT_FUNC_DBL
INIT_XMM sse4
WEIGHT_FUNC_DBL
%macro WEIGHT_FUNC_MM 0
cglobal h264_weight_8_10
WEIGHT_PROLOGUE
WEIGHT_SETUP
.nextrow:
WEIGHT_OP 0
mova [r0], m5
add r0, r1
dec r2d
jnz .nextrow
REP_RET
%endmacro
INIT_XMM sse2
WEIGHT_FUNC_MM
INIT_XMM sse4
WEIGHT_FUNC_MM
%macro WEIGHT_FUNC_HALF_MM 0
cglobal h264_weight_4_10
WEIGHT_PROLOGUE
sar r2d, 1
WEIGHT_SETUP
lea r3, [r1*2]
.nextrow:
WEIGHT_OP 0, r1
movh [r0], m5
movhps [r0+r1], m5
add r0, r3
dec r2d
jnz .nextrow
REP_RET
%endmacro
INIT_XMM sse2
WEIGHT_FUNC_HALF_MM
INIT_XMM sse4
WEIGHT_FUNC_HALF_MM
;-----------------------------------------------------------------------------
; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int height,
; int log2_denom, int weightd, int weights, int offset);
;-----------------------------------------------------------------------------
%if ARCH_X86_32
DECLARE_REG_TMP 3
%else
DECLARE_REG_TMP 7
%endif
%macro BIWEIGHT_PROLOGUE 0
.prologue:
PROLOGUE 0,8,8
movifnidn r0, r0mp
movifnidn r1, r1mp
movifnidn r2d, r2m
movifnidn r5d, r5m
movifnidn r6d, r6m
movifnidn t0d, r7m
%endmacro
%macro BIWEIGHT_SETUP 0
lea t0, [t0*4+1] ; (offset<<2)+1
or t0, 1
shl r6, 16
or r5, r6
movd m4, r5d ; weightd | weights
movd m5, t0d ; (offset+1)|1
movd m6, r4m ; log2_denom
pslld m5, m6 ; (((offset<<2)+1)|1)<<log2_denom
paddd m6, [sq_1]
pshufd m4, m4, 0
pshufd m5, m5, 0
mova m3, [pw_pixel_max]
movifnidn r3d, r3m
%if notcpuflag(sse4)
pxor m7, m7
%endif
%endmacro
%macro BIWEIGHT 1-2
%if %0==1
mova m0, [r0+%1]
mova m1, [r1+%1]
punpckhwd m2, m0, m1
punpcklwd m0, m1
%else
movq m0, [r0+%1]
movq m1, [r1+%1]
punpcklwd m0, m1
movq m2, [r0+%2]
movq m1, [r1+%2]
punpcklwd m2, m1
%endif
pmaddwd m0, m4
pmaddwd m2, m4
paddd m0, m5
paddd m2, m5
psrad m0, m6
psrad m2, m6
%if cpuflag(sse4)
packusdw m0, m2
pminsw m0, m3
%else
packssdw m0, m2
CLIPW m0, m7, m3
%endif
%endmacro
%macro BIWEIGHT_FUNC_DBL 0
cglobal h264_biweight_16_10
BIWEIGHT_PROLOGUE
BIWEIGHT_SETUP
.nextrow:
BIWEIGHT 0
mova [r0 ], m0
BIWEIGHT 16
mova [r0+16], m0
add r0, r2
add r1, r2
dec r3d
jnz .nextrow
REP_RET
%endmacro
INIT_XMM sse2
BIWEIGHT_FUNC_DBL
INIT_XMM sse4
BIWEIGHT_FUNC_DBL
%macro BIWEIGHT_FUNC 0
cglobal h264_biweight_8_10
BIWEIGHT_PROLOGUE
BIWEIGHT_SETUP
.nextrow:
BIWEIGHT 0
mova [r0], m0
add r0, r2
add r1, r2
dec r3d
jnz .nextrow
REP_RET
%endmacro
INIT_XMM sse2
BIWEIGHT_FUNC
INIT_XMM sse4
BIWEIGHT_FUNC
%macro BIWEIGHT_FUNC_HALF 0
cglobal h264_biweight_4_10
BIWEIGHT_PROLOGUE
BIWEIGHT_SETUP
sar r3d, 1
lea r4, [r2*2]
.nextrow:
BIWEIGHT 0, r2
movh [r0 ], m0
movhps [r0+r2], m0
add r0, r4
add r1, r4
dec r3d
jnz .nextrow
REP_RET
%endmacro
INIT_XMM sse2
BIWEIGHT_FUNC_HALF
INIT_XMM sse4
BIWEIGHT_FUNC_HALF

View File

@@ -0,0 +1,119 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/h264chroma.h"
void ff_put_h264_chroma_mc8_rnd_mmx (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_rnd_mmxext(uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_rnd_3dnow(uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_mmxext (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_put_h264_chroma_mc2_mmxext (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc2_mmxext (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_put_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
#define CHROMA_MC(OP, NUM, DEPTH, OPT) \
void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
(uint8_t *dst, uint8_t *src, \
int stride, int h, int x, int y);
CHROMA_MC(put, 2, 10, mmxext)
CHROMA_MC(avg, 2, 10, mmxext)
CHROMA_MC(put, 4, 10, mmxext)
CHROMA_MC(avg, 4, 10, mmxext)
CHROMA_MC(put, 8, 10, sse2)
CHROMA_MC(avg, 8, 10, sse2)
CHROMA_MC(put, 8, 10, avx)
CHROMA_MC(avg, 8, 10, avx)
av_cold void ff_h264chroma_init_x86(H264ChromaContext *c, int bit_depth)
{
#if HAVE_YASM
int high_bit_depth = bit_depth > 8;
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags) && !high_bit_depth) {
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_mmx;
c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
}
if (EXTERNAL_AMD3DNOW(cpu_flags) && !high_bit_depth) {
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_3dnow;
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
}
if (EXTERNAL_MMXEXT(cpu_flags) && !high_bit_depth) {
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext;
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmxext;
c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmxext;
c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmxext;
}
if (EXTERNAL_MMXEXT(cpu_flags) && bit_depth > 8 && bit_depth <= 10) {
c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext;
c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext;
c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext;
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags) && bit_depth > 8 && bit_depth <= 10) {
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags) && !high_bit_depth) {
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_ssse3;
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_ssse3;
c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3;
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3;
}
if (EXTERNAL_AVX(cpu_flags) && bit_depth > 8 && bit_depth <= 10) {
// AVX implies !cache64.
// TODO: Port cache(32|64) detection from x264.
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
}
#endif
}

View File

@@ -0,0 +1,371 @@
/*
* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/h264dsp.h"
#include "dsputil_x86.h"
/***********************************/
/* IDCT */
#define IDCT_ADD_FUNC(NUM, DEPTH, OPT) \
void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst, \
int16_t *block, \
int stride);
IDCT_ADD_FUNC(, 8, mmx)
IDCT_ADD_FUNC(, 10, sse2)
IDCT_ADD_FUNC(_dc, 8, mmxext)
IDCT_ADD_FUNC(_dc, 10, mmxext)
IDCT_ADD_FUNC(8_dc, 8, mmxext)
IDCT_ADD_FUNC(8_dc, 10, sse2)
IDCT_ADD_FUNC(8, 8, mmx)
IDCT_ADD_FUNC(8, 8, sse2)
IDCT_ADD_FUNC(8, 10, sse2)
IDCT_ADD_FUNC(, 10, avx)
IDCT_ADD_FUNC(8_dc, 10, avx)
IDCT_ADD_FUNC(8, 10, avx)
#define IDCT_ADD_REP_FUNC(NUM, REP, DEPTH, OPT) \
void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
(uint8_t *dst, const int *block_offset, \
int16_t *block, int stride, const uint8_t nnzc[6 * 8]);
IDCT_ADD_REP_FUNC(8, 4, 8, mmx)
IDCT_ADD_REP_FUNC(8, 4, 8, mmxext)
IDCT_ADD_REP_FUNC(8, 4, 8, sse2)
IDCT_ADD_REP_FUNC(8, 4, 10, sse2)
IDCT_ADD_REP_FUNC(8, 4, 10, avx)
IDCT_ADD_REP_FUNC(, 16, 8, mmx)
IDCT_ADD_REP_FUNC(, 16, 8, mmxext)
IDCT_ADD_REP_FUNC(, 16, 8, sse2)
IDCT_ADD_REP_FUNC(, 16, 10, sse2)
IDCT_ADD_REP_FUNC(, 16intra, 8, mmx)
IDCT_ADD_REP_FUNC(, 16intra, 8, mmxext)
IDCT_ADD_REP_FUNC(, 16intra, 8, sse2)
IDCT_ADD_REP_FUNC(, 16intra, 10, sse2)
IDCT_ADD_REP_FUNC(, 16, 10, avx)
IDCT_ADD_REP_FUNC(, 16intra, 10, avx)
#define IDCT_ADD_REP_FUNC2(NUM, REP, DEPTH, OPT) \
void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
(uint8_t **dst, const int *block_offset, \
int16_t *block, int stride, const uint8_t nnzc[6 * 8]);
IDCT_ADD_REP_FUNC2(, 8, 8, mmx)
IDCT_ADD_REP_FUNC2(, 8, 8, mmxext)
IDCT_ADD_REP_FUNC2(, 8, 8, sse2)
IDCT_ADD_REP_FUNC2(, 8, 10, sse2)
IDCT_ADD_REP_FUNC2(, 8, 10, avx)
void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul);
void ff_h264_luma_dc_dequant_idct_sse2(int16_t *output, int16_t *input, int qmul);
/***********************************/
/* deblocking */
void ff_h264_loop_filter_strength_mmxext(int16_t bS[2][4][4], uint8_t nnz[40],
int8_t ref[2][40],
int16_t mv[2][40][2],
int bidir, int edges, int step,
int mask_mv0, int mask_mv1, int field);
#define LF_FUNC(DIR, TYPE, DEPTH, OPT) \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \
int stride, \
int alpha, \
int beta, \
int8_t *tc0);
#define LF_IFUNC(DIR, TYPE, DEPTH, OPT) \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \
int stride, \
int alpha, \
int beta);
#define LF_FUNCS(type, depth) \
LF_FUNC(h, chroma, depth, mmxext) \
LF_IFUNC(h, chroma_intra, depth, mmxext) \
LF_FUNC(v, chroma, depth, mmxext) \
LF_IFUNC(v, chroma_intra, depth, mmxext) \
LF_FUNC(h, luma, depth, mmxext) \
LF_IFUNC(h, luma_intra, depth, mmxext) \
LF_FUNC(h, luma, depth, sse2) \
LF_IFUNC(h, luma_intra, depth, sse2) \
LF_FUNC(v, luma, depth, sse2) \
LF_IFUNC(v, luma_intra, depth, sse2) \
LF_FUNC(h, chroma, depth, sse2) \
LF_IFUNC(h, chroma_intra, depth, sse2) \
LF_FUNC(v, chroma, depth, sse2) \
LF_IFUNC(v, chroma_intra, depth, sse2) \
LF_FUNC(h, luma, depth, avx) \
LF_IFUNC(h, luma_intra, depth, avx) \
LF_FUNC(v, luma, depth, avx) \
LF_IFUNC(v, luma_intra, depth, avx) \
LF_FUNC(h, chroma, depth, avx) \
LF_IFUNC(h, chroma_intra, depth, avx) \
LF_FUNC(v, chroma, depth, avx) \
LF_IFUNC(v, chroma_intra, depth, avx)
LF_FUNCS(uint8_t, 8)
LF_FUNCS(uint16_t, 10)
#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
LF_FUNC(v8, luma, 8, mmxext)
static void deblock_v_luma_8_mmxext(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0)
{
if ((tc0[0] & tc0[1]) >= 0)
ff_deblock_v8_luma_8_mmxext(pix + 0, stride, alpha, beta, tc0);
if ((tc0[2] & tc0[3]) >= 0)
ff_deblock_v8_luma_8_mmxext(pix + 8, stride, alpha, beta, tc0 + 2);
}
LF_IFUNC(v8, luma_intra, 8, mmxext)
static void deblock_v_luma_intra_8_mmxext(uint8_t *pix, int stride,
int alpha, int beta)
{
ff_deblock_v8_luma_intra_8_mmxext(pix + 0, stride, alpha, beta);
ff_deblock_v8_luma_intra_8_mmxext(pix + 8, stride, alpha, beta);
}
#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */
LF_FUNC(v, luma, 10, mmxext)
LF_IFUNC(v, luma_intra, 10, mmxext)
/***********************************/
/* weighted prediction */
#define H264_WEIGHT(W, OPT) \
void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, int stride, \
int height, int log2_denom, \
int weight, int offset);
#define H264_BIWEIGHT(W, OPT) \
void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, uint8_t *src, \
int stride, int height, \
int log2_denom, int weightd, \
int weights, int offset);
#define H264_BIWEIGHT_MMX(W) \
H264_WEIGHT(W, mmxext) \
H264_BIWEIGHT(W, mmxext)
#define H264_BIWEIGHT_MMX_SSE(W) \
H264_BIWEIGHT_MMX(W) \
H264_WEIGHT(W, sse2) \
H264_BIWEIGHT(W, sse2) \
H264_BIWEIGHT(W, ssse3)
H264_BIWEIGHT_MMX_SSE(16)
H264_BIWEIGHT_MMX_SSE(8)
H264_BIWEIGHT_MMX(4)
#define H264_WEIGHT_10(W, DEPTH, OPT) \
void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
int stride, \
int height, \
int log2_denom, \
int weight, \
int offset);
#define H264_BIWEIGHT_10(W, DEPTH, OPT) \
void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
uint8_t *src, \
int stride, \
int height, \
int log2_denom, \
int weightd, \
int weights, \
int offset);
#define H264_BIWEIGHT_10_SSE(W, DEPTH) \
H264_WEIGHT_10(W, DEPTH, sse2) \
H264_WEIGHT_10(W, DEPTH, sse4) \
H264_BIWEIGHT_10(W, DEPTH, sse2) \
H264_BIWEIGHT_10(W, DEPTH, sse4)
H264_BIWEIGHT_10_SSE(16, 10)
H264_BIWEIGHT_10_SSE(8, 10)
H264_BIWEIGHT_10_SSE(4, 10)
av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
const int chroma_format_idc)
{
#if HAVE_YASM
int cpu_flags = av_get_cpu_flags();
if (chroma_format_idc == 1 && EXTERNAL_MMXEXT(cpu_flags))
c->h264_loop_filter_strength = ff_h264_loop_filter_strength_mmxext;
if (bit_depth == 8) {
if (EXTERNAL_MMX(cpu_flags)) {
c->h264_idct_dc_add =
c->h264_idct_add = ff_h264_idct_add_8_mmx;
c->h264_idct8_dc_add =
c->h264_idct8_add = ff_h264_idct8_add_8_mmx;
c->h264_idct_add16 = ff_h264_idct_add16_8_mmx;
c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx;
if (chroma_format_idc == 1)
c->h264_idct_add8 = ff_h264_idct_add8_8_mmx;
c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx;
if (cpu_flags & AV_CPU_FLAG_CMOV)
c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx;
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->h264_idct_dc_add = ff_h264_idct_dc_add_8_mmxext;
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmxext;
c->h264_idct_add16 = ff_h264_idct_add16_8_mmxext;
c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmxext;
if (chroma_format_idc == 1)
c->h264_idct_add8 = ff_h264_idct_add8_8_mmxext;
c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmxext;
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_8_mmxext;
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_mmxext;
if (chroma_format_idc == 1) {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_mmxext;
c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmxext;
}
#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
c->h264_v_loop_filter_luma = deblock_v_luma_8_mmxext;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_mmxext;
c->h264_v_loop_filter_luma_intra = deblock_v_luma_intra_8_mmxext;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext;
#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */
c->weight_h264_pixels_tab[0] = ff_h264_weight_16_mmxext;
c->weight_h264_pixels_tab[1] = ff_h264_weight_8_mmxext;
c->weight_h264_pixels_tab[2] = ff_h264_weight_4_mmxext;
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_mmxext;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_mmxext;
c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->h264_idct8_add = ff_h264_idct8_add_8_sse2;
c->h264_idct_add16 = ff_h264_idct_add16_8_sse2;
c->h264_idct8_add4 = ff_h264_idct8_add4_8_sse2;
if (chroma_format_idc == 1)
c->h264_idct_add8 = ff_h264_idct_add8_8_sse2;
c->h264_idct_add16intra = ff_h264_idct_add16intra_8_sse2;
c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_sse2;
c->weight_h264_pixels_tab[0] = ff_h264_weight_16_sse2;
c->weight_h264_pixels_tab[1] = ff_h264_weight_8_sse2;
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_sse2;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_sse2;
c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags)) {
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_ssse3;
}
if (EXTERNAL_AVX(cpu_flags)) {
c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_avx;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx;
}
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
#if ARCH_X86_32
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_mmxext;
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmxext;
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_mmxext;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_mmxext;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmxext;
#endif /* ARCH_X86_32 */
c->h264_idct_dc_add = ff_h264_idct_dc_add_10_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->h264_idct_add = ff_h264_idct_add_10_sse2;
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_sse2;
c->h264_idct_add16 = ff_h264_idct_add16_10_sse2;
if (chroma_format_idc == 1)
c->h264_idct_add8 = ff_h264_idct_add8_10_sse2;
c->h264_idct_add16intra = ff_h264_idct_add16intra_10_sse2;
#if HAVE_ALIGNED_STACK
c->h264_idct8_add = ff_h264_idct8_add_10_sse2;
c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2;
#endif /* HAVE_ALIGNED_STACK */
c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2;
c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2;
c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2;
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2;
c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2;
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_sse2;
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_sse2;
#if HAVE_ALIGNED_STACK
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_sse2;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2;
#endif /* HAVE_ALIGNED_STACK */
}
if (EXTERNAL_SSE4(cpu_flags)) {
c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4;
c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4;
c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4;
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4;
c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4;
}
if (EXTERNAL_AVX(cpu_flags)) {
c->h264_idct_dc_add =
c->h264_idct_add = ff_h264_idct_add_10_avx;
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_avx;
c->h264_idct_add16 = ff_h264_idct_add16_10_avx;
if (chroma_format_idc == 1)
c->h264_idct_add8 = ff_h264_idct_add8_10_avx;
c->h264_idct_add16intra = ff_h264_idct_add16intra_10_avx;
#if HAVE_ALIGNED_STACK
c->h264_idct8_add = ff_h264_idct8_add_10_avx;
c->h264_idct8_add4 = ff_h264_idct8_add4_10_avx;
#endif /* HAVE_ALIGNED_STACK */
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_avx;
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_avx;
#if HAVE_ALIGNED_STACK
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_avx;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_avx;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx;
#endif /* HAVE_ALIGNED_STACK */
}
}
#endif
}

View File

@@ -0,0 +1,461 @@
;******************************************************************************
;*
;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice@bellard.org>
;* Copyright (c) Nick Kurshev <nickols_k@mail.ru>
;* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2002 Zdenek Kabelac <kabi@informatics.muni.cz>
;* Copyright (c) 2013 Daniel Kang
;*
;* MMX optimized hpel functions
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
cextern pb_1
SECTION_TEXT
; put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_PIXELS8_X2 0
cglobal put_pixels8_x2, 4,5
lea r4, [r2*2]
.loop:
mova m0, [r1]
mova m1, [r1+r2]
PAVGB m0, [r1+1]
PAVGB m1, [r1+r2+1]
mova [r0], m0
mova [r0+r2], m1
add r1, r4
add r0, r4
mova m0, [r1]
mova m1, [r1+r2]
PAVGB m0, [r1+1]
PAVGB m1, [r1+r2+1]
add r1, r4
mova [r0], m0
mova [r0+r2], m1
add r0, r4
sub r3d, 4
jne .loop
REP_RET
%endmacro
INIT_MMX mmxext
PUT_PIXELS8_X2
INIT_MMX 3dnow
PUT_PIXELS8_X2
; put_pixels16_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_PIXELS_16 0
cglobal put_pixels16_x2, 4,5
lea r4, [r2*2]
.loop:
mova m0, [r1]
mova m1, [r1+r2]
mova m2, [r1+8]
mova m3, [r1+r2+8]
PAVGB m0, [r1+1]
PAVGB m1, [r1+r2+1]
PAVGB m2, [r1+9]
PAVGB m3, [r1+r2+9]
mova [r0], m0
mova [r0+r2], m1
mova [r0+8], m2
mova [r0+r2+8], m3
add r1, r4
add r0, r4
mova m0, [r1]
mova m1, [r1+r2]
mova m2, [r1+8]
mova m3, [r1+r2+8]
PAVGB m0, [r1+1]
PAVGB m1, [r1+r2+1]
PAVGB m2, [r1+9]
PAVGB m3, [r1+r2+9]
add r1, r4
mova [r0], m0
mova [r0+r2], m1
mova [r0+8], m2
mova [r0+r2+8], m3
add r0, r4
sub r3d, 4
jne .loop
REP_RET
%endmacro
INIT_MMX mmxext
PUT_PIXELS_16
INIT_MMX 3dnow
PUT_PIXELS_16
; put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_NO_RND_PIXELS8_X2 0
cglobal put_no_rnd_pixels8_x2, 4,5
mova m6, [pb_1]
lea r4, [r2*2]
.loop:
mova m0, [r1]
mova m2, [r1+r2]
mova m1, [r1+1]
mova m3, [r1+r2+1]
add r1, r4
psubusb m0, m6
psubusb m2, m6
PAVGB m0, m1
PAVGB m2, m3
mova [r0], m0
mova [r0+r2], m2
mova m0, [r1]
mova m1, [r1+1]
mova m2, [r1+r2]
mova m3, [r1+r2+1]
add r0, r4
add r1, r4
psubusb m0, m6
psubusb m2, m6
PAVGB m0, m1
PAVGB m2, m3
mova [r0], m0
mova [r0+r2], m2
add r0, r4
sub r3d, 4
jne .loop
REP_RET
%endmacro
INIT_MMX mmxext
PUT_NO_RND_PIXELS8_X2
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_X2
; put_no_rnd_pixels8_x2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_NO_RND_PIXELS8_X2_EXACT 0
cglobal put_no_rnd_pixels8_x2_exact, 4,5
lea r4, [r2*3]
pcmpeqb m6, m6
.loop:
mova m0, [r1]
mova m2, [r1+r2]
mova m1, [r1+1]
mova m3, [r1+r2+1]
pxor m0, m6
pxor m2, m6
pxor m1, m6
pxor m3, m6
PAVGB m0, m1
PAVGB m2, m3
pxor m0, m6
pxor m2, m6
mova [r0], m0
mova [r0+r2], m2
mova m0, [r1+r2*2]
mova m1, [r1+r2*2+1]
mova m2, [r1+r4]
mova m3, [r1+r4+1]
pxor m0, m6
pxor m1, m6
pxor m2, m6
pxor m3, m6
PAVGB m0, m1
PAVGB m2, m3
pxor m0, m6
pxor m2, m6
mova [r0+r2*2], m0
mova [r0+r4], m2
lea r1, [r1+r2*4]
lea r0, [r0+r2*4]
sub r3d, 4
jg .loop
REP_RET
%endmacro
INIT_MMX mmxext
PUT_NO_RND_PIXELS8_X2_EXACT
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_X2_EXACT
; put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_PIXELS8_Y2 0
cglobal put_pixels8_y2, 4,5
lea r4, [r2*2]
mova m0, [r1]
sub r0, r2
.loop:
mova m1, [r1+r2]
mova m2, [r1+r4]
add r1, r4
PAVGB m0, m1
PAVGB m1, m2
mova [r0+r2], m0
mova [r0+r4], m1
mova m1, [r1+r2]
mova m0, [r1+r4]
add r0, r4
add r1, r4
PAVGB m2, m1
PAVGB m1, m0
mova [r0+r2], m2
mova [r0+r4], m1
add r0, r4
sub r3d, 4
jne .loop
REP_RET
%endmacro
INIT_MMX mmxext
PUT_PIXELS8_Y2
INIT_MMX 3dnow
PUT_PIXELS8_Y2
; put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_NO_RND_PIXELS8_Y2 0
cglobal put_no_rnd_pixels8_y2, 4,5
mova m6, [pb_1]
lea r4, [r2+r2]
mova m0, [r1]
sub r0, r2
.loop:
mova m1, [r1+r2]
mova m2, [r1+r4]
add r1, r4
psubusb m1, m6
PAVGB m0, m1
PAVGB m1, m2
mova [r0+r2], m0
mova [r0+r4], m1
mova m1, [r1+r2]
mova m0, [r1+r4]
add r0, r4
add r1, r4
psubusb m1, m6
PAVGB m2, m1
PAVGB m1, m0
mova [r0+r2], m2
mova [r0+r4], m1
add r0, r4
sub r3d, 4
jne .loop
REP_RET
%endmacro
INIT_MMX mmxext
PUT_NO_RND_PIXELS8_Y2
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_Y2
; put_no_rnd_pixels8_y2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_NO_RND_PIXELS8_Y2_EXACT 0
cglobal put_no_rnd_pixels8_y2_exact, 4,5
lea r4, [r2*3]
mova m0, [r1]
pcmpeqb m6, m6
add r1, r2
pxor m0, m6
.loop:
mova m1, [r1]
mova m2, [r1+r2]
pxor m1, m6
pxor m2, m6
PAVGB m0, m1
PAVGB m1, m2
pxor m0, m6
pxor m1, m6
mova [r0], m0
mova [r0+r2], m1
mova m1, [r1+r2*2]
mova m0, [r1+r4]
pxor m1, m6
pxor m0, m6
PAVGB m2, m1
PAVGB m1, m0
pxor m2, m6
pxor m1, m6
mova [r0+r2*2], m2
mova [r0+r4], m1
lea r1, [r1+r2*4]
lea r0, [r0+r2*4]
sub r3d, 4
jg .loop
REP_RET
%endmacro
INIT_MMX mmxext
PUT_NO_RND_PIXELS8_Y2_EXACT
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_Y2_EXACT
; avg_pixels8(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro AVG_PIXELS8 0
cglobal avg_pixels8, 4,5
lea r4, [r2*2]
.loop:
mova m0, [r0]
mova m1, [r0+r2]
PAVGB m0, [r1]
PAVGB m1, [r1+r2]
mova [r0], m0
mova [r0+r2], m1
add r1, r4
add r0, r4
mova m0, [r0]
mova m1, [r0+r2]
PAVGB m0, [r1]
PAVGB m1, [r1+r2]
add r1, r4
mova [r0], m0
mova [r0+r2], m1
add r0, r4
sub r3d, 4
jne .loop
REP_RET
%endmacro
INIT_MMX 3dnow
AVG_PIXELS8
; avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro AVG_PIXELS8_X2 0
cglobal avg_pixels8_x2, 4,5
lea r4, [r2*2]
.loop:
mova m0, [r1]
mova m2, [r1+r2]
PAVGB m0, [r1+1]
PAVGB m2, [r1+r2+1]
PAVGB m0, [r0]
PAVGB m2, [r0+r2]
add r1, r4
mova [r0], m0
mova [r0+r2], m2
mova m0, [r1]
mova m2, [r1+r2]
PAVGB m0, [r1+1]
PAVGB m2, [r1+r2+1]
add r0, r4
add r1, r4
PAVGB m0, [r0]
PAVGB m2, [r0+r2]
mova [r0], m0
mova [r0+r2], m2
add r0, r4
sub r3d, 4
jne .loop
REP_RET
%endmacro
INIT_MMX mmxext
AVG_PIXELS8_X2
INIT_MMX 3dnow
AVG_PIXELS8_X2
; avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro AVG_PIXELS8_Y2 0
cglobal avg_pixels8_y2, 4,5
lea r4, [r2*2]
mova m0, [r1]
sub r0, r2
.loop:
mova m1, [r1+r2]
mova m2, [r1+r4]
add r1, r4
PAVGB m0, m1
PAVGB m1, m2
mova m3, [r0+r2]
mova m4, [r0+r4]
PAVGB m0, m3
PAVGB m1, m4
mova [r0+r2], m0
mova [r0+r4], m1
mova m1, [r1+r2]
mova m0, [r1+r4]
PAVGB m2, m1
PAVGB m1, m0
add r0, r4
add r1, r4
mova m3, [r0+r2]
mova m4, [r0+r4]
PAVGB m2, m3
PAVGB m1, m4
mova [r0+r2], m2
mova [r0+r4], m1
add r0, r4
sub r3d, 4
jne .loop
REP_RET
%endmacro
INIT_MMX mmxext
AVG_PIXELS8_Y2
INIT_MMX 3dnow
AVG_PIXELS8_Y2
; avg_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro AVG_PIXELS8_XY2 0
cglobal avg_pixels8_xy2, 4,5
mova m6, [pb_1]
lea r4, [r2*2]
mova m0, [r1]
pavgb m0, [r1+1]
.loop:
mova m2, [r1+r4]
mova m1, [r1+r2]
psubusb m2, m6
pavgb m1, [r1+r2+1]
pavgb m2, [r1+r4+1]
add r1, r4
pavgb m0, m1
pavgb m1, m2
pavgb m0, [r0]
pavgb m1, [r0+r2]
mova [r0], m0
mova [r0+r2], m1
mova m1, [r1+r2]
mova m0, [r1+r4]
pavgb m1, [r1+r2+1]
pavgb m0, [r1+r4+1]
add r0, r4
add r1, r4
pavgb m2, m1
pavgb m1, m0
pavgb m2, [r0]
pavgb m1, [r0+r2]
mova [r0], m2
mova [r0+r2], m1
add r0, r4
sub r3d, 4
jne .loop
REP_RET
%endmacro
INIT_MMX mmxext
AVG_PIXELS8_XY2
INIT_MMX 3dnow
AVG_PIXELS8_XY2

View File

@@ -0,0 +1,269 @@
/*
* MMX optimized DSP utils
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
*/
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/hpeldsp.h"
#include "dsputil_x86.h"
void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block,
const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
#define avg_pixels8_mmx ff_avg_pixels8_mmx
#define avg_pixels8_x2_mmx ff_avg_pixels8_x2_mmx
#define avg_pixels16_mmx ff_avg_pixels16_mmx
#define avg_pixels8_xy2_mmx ff_avg_pixels8_xy2_mmx
#define avg_pixels16_xy2_mmx ff_avg_pixels16_xy2_mmx
#define put_pixels8_mmx ff_put_pixels8_mmx
#define put_pixels16_mmx ff_put_pixels16_mmx
#define put_pixels8_xy2_mmx ff_put_pixels8_xy2_mmx
#define put_pixels16_xy2_mmx ff_put_pixels16_xy2_mmx
#define avg_no_rnd_pixels16_mmx ff_avg_pixels16_mmx
#define put_no_rnd_pixels8_mmx ff_put_pixels8_mmx
#define put_no_rnd_pixels16_mmx ff_put_pixels16_mmx
#if HAVE_INLINE_ASM
/***********************************/
/* MMX no rounding */
#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
#define SET_RND MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
#define STATIC static
#include "rnd_template.c"
#include "hpeldsp_rnd_template.c"
#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef STATIC
PIXELS16(static, avg_no_rnd, , _y2, _mmx)
PIXELS16(static, put_no_rnd, , _y2, _mmx)
PIXELS16(static, avg_no_rnd, , _xy2, _mmx)
PIXELS16(static, put_no_rnd, , _xy2, _mmx)
/***********************************/
/* MMX rounding */
#define DEF(x, y) x ## _ ## y ## _mmx
#define SET_RND MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
#include "hpeldsp_rnd_template.c"
#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
PIXELS16(static, avg, , _y2, _mmx)
PIXELS16(static, put, , _y2, _mmx)
#endif /* HAVE_INLINE_ASM */
#if HAVE_YASM
#define HPELDSP_AVG_PIXELS16(CPUEXT) \
PIXELS16(static, put_no_rnd, ff_, _x2, CPUEXT) \
PIXELS16(static, put, ff_, _y2, CPUEXT) \
PIXELS16(static, put_no_rnd, ff_, _y2, CPUEXT) \
PIXELS16(static, avg, ff_, , CPUEXT) \
PIXELS16(static, avg, ff_, _x2, CPUEXT) \
PIXELS16(static, avg, ff_, _y2, CPUEXT) \
PIXELS16(static, avg, ff_, _xy2, CPUEXT)
HPELDSP_AVG_PIXELS16(_3dnow)
HPELDSP_AVG_PIXELS16(_mmxext)
#endif /* HAVE_YASM */
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
do { \
c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
} while (0)
static void hpeldsp_init_mmx(HpelDSPContext *c, int flags, int cpu_flags)
{
#if HAVE_MMX_INLINE
SET_HPEL_FUNCS(put, [0], 16, mmx);
SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
SET_HPEL_FUNCS(avg, [0], 16, mmx);
SET_HPEL_FUNCS(avg_no_rnd, , 16, mmx);
SET_HPEL_FUNCS(put, [1], 8, mmx);
SET_HPEL_FUNCS(put_no_rnd, [1], 8, mmx);
SET_HPEL_FUNCS(avg, [1], 8, mmx);
#endif /* HAVE_MMX_INLINE */
}
static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int cpu_flags)
{
#if HAVE_MMXEXT_EXTERNAL
c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
c->put_pixels_tab[0][2] = put_pixels16_y2_mmxext;
c->avg_pixels_tab[0][0] = avg_pixels16_mmxext;
c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext;
c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext;
c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
if (!(flags & CODEC_FLAG_BITEXACT)) {
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmxext;
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
}
if (flags & CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) {
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
}
#endif /* HAVE_MMXEXT_EXTERNAL */
}
static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int cpu_flags)
{
#if HAVE_AMD3DNOW_EXTERNAL
c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;
c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;
if (!(flags & CODEC_FLAG_BITEXACT)){
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;
c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
}
if (flags & CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) {
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
}
#endif /* HAVE_AMD3DNOW_EXTERNAL */
}
static void hpeldsp_init_sse2(HpelDSPContext *c, int flags, int cpu_flags)
{
#if HAVE_SSE2_EXTERNAL
if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW)) {
// these functions are slower than mmx on AMD, but faster on Intel
c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
}
#endif /* HAVE_SSE2_EXTERNAL */
}
void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
{
int cpu_flags = av_get_cpu_flags();
if (INLINE_MMX(cpu_flags))
hpeldsp_init_mmx(c, flags, cpu_flags);
if (EXTERNAL_MMXEXT(cpu_flags))
hpeldsp_init_mmxext(c, flags, cpu_flags);
if (EXTERNAL_AMD3DNOW(cpu_flags))
hpeldsp_init_3dnow(c, flags, cpu_flags);
if (EXTERNAL_SSE2(cpu_flags))
hpeldsp_init_sse2(c, flags, cpu_flags);
}

View File

@@ -0,0 +1,52 @@
/*
* MMX-optimized avg/put pixel routines
*
* Copyright (c) 2001 Fabrice Bellard
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stddef.h>
#include <stdint.h>
#include "config.h"
#include "dsputil_x86.h"
#if HAVE_MMX_INLINE
void ff_avg_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
JUMPALIGN();
do {
__asm__ volatile(
"movq %1, %%mm0 \n\t"
"movq 1%1, %%mm1 \n\t"
"movq %0, %%mm3 \n\t"
PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
"movq %%mm0, %0 \n\t"
:"+m"(*block)
:"m"(*pixels)
:"memory");
pixels += line_size;
block += line_size;
} while (--h);
}
#endif /* HAVE_MMX_INLINE */

View File

@@ -0,0 +1,198 @@
/*
* DSP utils mmx functions are compiled twice for rnd/no_rnd
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
* mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
* and improved by Zdenek Kabelac <kabi@users.sf.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
// put_pixels
static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
__asm__ volatile(
"lea (%3, %3), %%"REG_a" \n\t"
".p2align 3 \n\t"
"1: \n\t"
"movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm1 \n\t"
"movq (%1, %3), %%mm2 \n\t"
"movq 1(%1, %3), %%mm3 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
"add %%"REG_a", %1 \n\t"
"add %%"REG_a", %2 \n\t"
"movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm1 \n\t"
"movq (%1, %3), %%mm2 \n\t"
"movq 1(%1, %3), %%mm3 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
"add %%"REG_a", %1 \n\t"
"add %%"REG_a", %2 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block)
:"r"((x86_reg)line_size)
:REG_a, "memory");
}
static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
__asm__ volatile(
"lea (%3, %3), %%"REG_a" \n\t"
".p2align 3 \n\t"
"1: \n\t"
"movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm1 \n\t"
"movq (%1, %3), %%mm2 \n\t"
"movq 1(%1, %3), %%mm3 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
"movq 8(%1), %%mm0 \n\t"
"movq 9(%1), %%mm1 \n\t"
"movq 8(%1, %3), %%mm2 \n\t"
"movq 9(%1, %3), %%mm3 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, 8(%2) \n\t"
"movq %%mm5, 8(%2, %3) \n\t"
"add %%"REG_a", %1 \n\t"
"add %%"REG_a", %2 \n\t"
"movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm1 \n\t"
"movq (%1, %3), %%mm2 \n\t"
"movq 1(%1, %3), %%mm3 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
"movq 8(%1), %%mm0 \n\t"
"movq 9(%1), %%mm1 \n\t"
"movq 8(%1, %3), %%mm2 \n\t"
"movq 9(%1, %3), %%mm3 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, 8(%2) \n\t"
"movq %%mm5, 8(%2, %3) \n\t"
"add %%"REG_a", %1 \n\t"
"add %%"REG_a", %2 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block)
:"r"((x86_reg)line_size)
:REG_a, "memory");
}
static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
__asm__ volatile(
"lea (%3, %3), %%"REG_a" \n\t"
"movq (%1), %%mm0 \n\t"
".p2align 3 \n\t"
"1: \n\t"
"movq (%1, %3), %%mm1 \n\t"
"movq (%1, %%"REG_a"),%%mm2 \n\t"
PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
"add %%"REG_a", %1 \n\t"
"add %%"REG_a", %2 \n\t"
"movq (%1, %3), %%mm1 \n\t"
"movq (%1, %%"REG_a"),%%mm0 \n\t"
PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
"add %%"REG_a", %1 \n\t"
"add %%"REG_a", %2 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block)
:"r"((x86_reg)line_size)
:REG_a, "memory");
}
static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
JUMPALIGN();
do {
__asm__ volatile(
"movq %1, %%mm0 \n\t"
"movq 1%1, %%mm1 \n\t"
"movq %0, %%mm3 \n\t"
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
"movq %%mm0, %0 \n\t"
"movq 8%1, %%mm0 \n\t"
"movq 9%1, %%mm1 \n\t"
"movq 8%0, %%mm3 \n\t"
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
"movq %%mm0, 8%0 \n\t"
:"+m"(*block)
:"m"(*pixels)
:"memory");
pixels += line_size;
block += line_size;
} while (--h);
}
static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
__asm__ volatile(
"lea (%3, %3), %%"REG_a" \n\t"
"movq (%1), %%mm0 \n\t"
".p2align 3 \n\t"
"1: \n\t"
"movq (%1, %3), %%mm1 \n\t"
"movq (%1, %%"REG_a"), %%mm2 \n\t"
PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
"movq (%2), %%mm3 \n\t"
PAVGB_MMX(%%mm3, %%mm4, %%mm0, %%mm6)
"movq (%2, %3), %%mm3 \n\t"
PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6)
"movq %%mm0, (%2) \n\t"
"movq %%mm1, (%2, %3) \n\t"
"add %%"REG_a", %1 \n\t"
"add %%"REG_a", %2 \n\t"
"movq (%1, %3), %%mm1 \n\t"
"movq (%1, %%"REG_a"), %%mm0 \n\t"
PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
"movq (%2), %%mm3 \n\t"
PAVGB_MMX(%%mm3, %%mm4, %%mm2, %%mm6)
"movq (%2, %3), %%mm3 \n\t"
PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6)
"movq %%mm2, (%2) \n\t"
"movq %%mm1, (%2, %3) \n\t"
"add %%"REG_a", %1 \n\t"
"add %%"REG_a", %2 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block)
:"r"((x86_reg)line_size)
:REG_a, "memory");
}

View File

@@ -0,0 +1,562 @@
/*
* XVID MPEG-4 VIDEO CODEC
* - MMX and XMM forward discrete cosine transform -
*
* Copyright(C) 2001 Peter Ross <pross@xvid.org>
*
* Originally provided by Intel at AP-922
* http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
* (See more app notes at http://developer.intel.com/vtune/cbts/strmsimd/appnotes.htm)
* but in a limited edition.
* New macro implements a column part for precise iDCT
* The routine precision now satisfies IEEE standard 1180-1990.
*
* Copyright(C) 2000-2001 Peter Gubanov <peter@elecard.net.ru>
* Rounding trick Copyright(C) 2000 Michel Lespinasse <walken@zoy.org>
*
* http://www.elecard.com/peter/idct.html
* http://www.linuxvideo.org/mpeg2dec/
*
* These examples contain code fragments for first stage iDCT 8x8
* (for rows) and first stage DCT 8x8 (for columns)
*
* conversion to gcc syntax by Michael Niedermayer
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with FFmpeg; if not, write to the Free Software Foundation,
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <inttypes.h>
#include "config.h"
#include "libavcodec/avcodec.h"
#include "libavutil/mem.h"
#include "dsputil_x86.h"
#include "idct_xvid.h"
#if HAVE_MMX_INLINE
//=============================================================================
// Macros and other preprocessor constants
//=============================================================================
#define BITS_INV_ACC 5 // 4 or 5 for IEEE
#define SHIFT_INV_ROW (16 - BITS_INV_ACC) //11
#define SHIFT_INV_COL (1 + BITS_INV_ACC) //6
#define RND_INV_ROW (1024 * (6 - BITS_INV_ACC))
#define RND_INV_COL (16 * (BITS_INV_ACC - 3))
#define RND_INV_CORR (RND_INV_COL - 1)
#define BITS_FRW_ACC 3 // 2 or 3 for accuracy
#define SHIFT_FRW_COL BITS_FRW_ACC
#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17)
#define RND_FRW_ROW (262144*(BITS_FRW_ACC - 1))
//-----------------------------------------------------------------------------
// Various memory constants (trigonometric values or rounding values)
//-----------------------------------------------------------------------------
DECLARE_ALIGNED(8, static const int16_t, tg_1_16)[4*4] = {
13036,13036,13036,13036, // tg * (2<<16) + 0.5
27146,27146,27146,27146, // tg * (2<<16) + 0.5
-21746,-21746,-21746,-21746, // tg * (2<<16) + 0.5
23170,23170,23170,23170}; // cos * (2<<15) + 0.5
DECLARE_ALIGNED(8, static const int32_t, rounder_0)[2*8] = {
65536,65536,
3597,3597,
2260,2260,
1203,1203,
0,0,
120,120,
512,512,
512,512};
//-----------------------------------------------------------------------------
//
// The first stage iDCT 8x8 - inverse DCTs of rows
//
//-----------------------------------------------------------------------------
// The 8-point inverse DCT direct algorithm
//-----------------------------------------------------------------------------
//
// static const short w[32] = {
// FIX(cos_4_16), FIX(cos_2_16), FIX(cos_4_16), FIX(cos_6_16),
// FIX(cos_4_16), FIX(cos_6_16), -FIX(cos_4_16), -FIX(cos_2_16),
// FIX(cos_4_16), -FIX(cos_6_16), -FIX(cos_4_16), FIX(cos_2_16),
// FIX(cos_4_16), -FIX(cos_2_16), FIX(cos_4_16), -FIX(cos_6_16),
// FIX(cos_1_16), FIX(cos_3_16), FIX(cos_5_16), FIX(cos_7_16),
// FIX(cos_3_16), -FIX(cos_7_16), -FIX(cos_1_16), -FIX(cos_5_16),
// FIX(cos_5_16), -FIX(cos_1_16), FIX(cos_7_16), FIX(cos_3_16),
// FIX(cos_7_16), -FIX(cos_5_16), FIX(cos_3_16), -FIX(cos_1_16) };
//
// #define DCT_8_INV_ROW(x, y)
// {
// int a0, a1, a2, a3, b0, b1, b2, b3;
//
// a0 =x[0]*w[0]+x[2]*w[1]+x[4]*w[2]+x[6]*w[3];
// a1 =x[0]*w[4]+x[2]*w[5]+x[4]*w[6]+x[6]*w[7];
// a2 = x[0] * w[ 8] + x[2] * w[ 9] + x[4] * w[10] + x[6] * w[11];
// a3 = x[0] * w[12] + x[2] * w[13] + x[4] * w[14] + x[6] * w[15];
// b0 = x[1] * w[16] + x[3] * w[17] + x[5] * w[18] + x[7] * w[19];
// b1 = x[1] * w[20] + x[3] * w[21] + x[5] * w[22] + x[7] * w[23];
// b2 = x[1] * w[24] + x[3] * w[25] + x[5] * w[26] + x[7] * w[27];
// b3 = x[1] * w[28] + x[3] * w[29] + x[5] * w[30] + x[7] * w[31];
//
// y[0] = SHIFT_ROUND ( a0 + b0 );
// y[1] = SHIFT_ROUND ( a1 + b1 );
// y[2] = SHIFT_ROUND ( a2 + b2 );
// y[3] = SHIFT_ROUND ( a3 + b3 );
// y[4] = SHIFT_ROUND ( a3 - b3 );
// y[5] = SHIFT_ROUND ( a2 - b2 );
// y[6] = SHIFT_ROUND ( a1 - b1 );
// y[7] = SHIFT_ROUND ( a0 - b0 );
// }
//
//-----------------------------------------------------------------------------
//
// In this implementation the outputs of the iDCT-1D are multiplied
// for rows 0,4 - by cos_4_16,
// for rows 1,7 - by cos_1_16,
// for rows 2,6 - by cos_2_16,
// for rows 3,5 - by cos_3_16
// and are shifted to the left for better accuracy
//
// For the constants used,
// FIX(float_const) = (short) (float_const * (1<<15) + 0.5)
//
//-----------------------------------------------------------------------------
//-----------------------------------------------------------------------------
// Tables for mmx processors
//-----------------------------------------------------------------------------
// Table for rows 0,4 - constants are multiplied by cos_4_16
DECLARE_ALIGNED(8, static const int16_t, tab_i_04_mmx)[32*4] = {
16384,16384,16384,-16384, // movq-> w06 w04 w02 w00
21407,8867,8867,-21407, // w07 w05 w03 w01
16384,-16384,16384,16384, // w14 w12 w10 w08
-8867,21407,-21407,-8867, // w15 w13 w11 w09
22725,12873,19266,-22725, // w22 w20 w18 w16
19266,4520,-4520,-12873, // w23 w21 w19 w17
12873,4520,4520,19266, // w30 w28 w26 w24
-22725,19266,-12873,-22725, // w31 w29 w27 w25
// Table for rows 1,7 - constants are multiplied by cos_1_16
22725,22725,22725,-22725, // movq-> w06 w04 w02 w00
29692,12299,12299,-29692, // w07 w05 w03 w01
22725,-22725,22725,22725, // w14 w12 w10 w08
-12299,29692,-29692,-12299, // w15 w13 w11 w09
31521,17855,26722,-31521, // w22 w20 w18 w16
26722,6270,-6270,-17855, // w23 w21 w19 w17
17855,6270,6270,26722, // w30 w28 w26 w24
-31521,26722,-17855,-31521, // w31 w29 w27 w25
// Table for rows 2,6 - constants are multiplied by cos_2_16
21407,21407,21407,-21407, // movq-> w06 w04 w02 w00
27969,11585,11585,-27969, // w07 w05 w03 w01
21407,-21407,21407,21407, // w14 w12 w10 w08
-11585,27969,-27969,-11585, // w15 w13 w11 w09
29692,16819,25172,-29692, // w22 w20 w18 w16
25172,5906,-5906,-16819, // w23 w21 w19 w17
16819,5906,5906,25172, // w30 w28 w26 w24
-29692,25172,-16819,-29692, // w31 w29 w27 w25
// Table for rows 3,5 - constants are multiplied by cos_3_16
19266,19266,19266,-19266, // movq-> w06 w04 w02 w00
25172,10426,10426,-25172, // w07 w05 w03 w01
19266,-19266,19266,19266, // w14 w12 w10 w08
-10426,25172,-25172,-10426, // w15 w13 w11 w09
26722,15137,22654,-26722, // w22 w20 w18 w16
22654,5315,-5315,-15137, // w23 w21 w19 w17
15137,5315,5315,22654, // w30 w28 w26 w24
-26722,22654,-15137,-26722, // w31 w29 w27 w25
};
//-----------------------------------------------------------------------------
// Tables for xmm processors
//-----------------------------------------------------------------------------
// %3 for rows 0,4 - constants are multiplied by cos_4_16
DECLARE_ALIGNED(8, static const int16_t, tab_i_04_xmm)[32*4] = {
16384,21407,16384,8867, // movq-> w05 w04 w01 w00
16384,8867,-16384,-21407, // w07 w06 w03 w02
16384,-8867,16384,-21407, // w13 w12 w09 w08
-16384,21407,16384,-8867, // w15 w14 w11 w10
22725,19266,19266,-4520, // w21 w20 w17 w16
12873,4520,-22725,-12873, // w23 w22 w19 w18
12873,-22725,4520,-12873, // w29 w28 w25 w24
4520,19266,19266,-22725, // w31 w30 w27 w26
// %3 for rows 1,7 - constants are multiplied by cos_1_16
22725,29692,22725,12299, // movq-> w05 w04 w01 w00
22725,12299,-22725,-29692, // w07 w06 w03 w02
22725,-12299,22725,-29692, // w13 w12 w09 w08
-22725,29692,22725,-12299, // w15 w14 w11 w10
31521,26722,26722,-6270, // w21 w20 w17 w16
17855,6270,-31521,-17855, // w23 w22 w19 w18
17855,-31521,6270,-17855, // w29 w28 w25 w24
6270,26722,26722,-31521, // w31 w30 w27 w26
// %3 for rows 2,6 - constants are multiplied by cos_2_16
21407,27969,21407,11585, // movq-> w05 w04 w01 w00
21407,11585,-21407,-27969, // w07 w06 w03 w02
21407,-11585,21407,-27969, // w13 w12 w09 w08
-21407,27969,21407,-11585, // w15 w14 w11 w10
29692,25172,25172,-5906, // w21 w20 w17 w16
16819,5906,-29692,-16819, // w23 w22 w19 w18
16819,-29692,5906,-16819, // w29 w28 w25 w24
5906,25172,25172,-29692, // w31 w30 w27 w26
// %3 for rows 3,5 - constants are multiplied by cos_3_16
19266,25172,19266,10426, // movq-> w05 w04 w01 w00
19266,10426,-19266,-25172, // w07 w06 w03 w02
19266,-10426,19266,-25172, // w13 w12 w09 w08
-19266,25172,19266,-10426, // w15 w14 w11 w10
26722,22654,22654,-5315, // w21 w20 w17 w16
15137,5315,-26722,-15137, // w23 w22 w19 w18
15137,-26722,5315,-15137, // w29 w28 w25 w24
5315,22654,22654,-26722, // w31 w30 w27 w26
};
//=============================================================================
// Helper macros for the code
//=============================================================================
//-----------------------------------------------------------------------------
// DCT_8_INV_ROW_MMX( INP, OUT, TABLE, ROUNDER
//-----------------------------------------------------------------------------
#define DCT_8_INV_ROW_MMX(A1,A2,A3,A4)\
"movq " #A1 ",%%mm0 \n\t"/* 0 ; x3 x2 x1 x0*/\
"movq 8+" #A1 ",%%mm1 \n\t"/* 1 ; x7 x6 x5 x4*/\
"movq %%mm0,%%mm2 \n\t"/* 2 ; x3 x2 x1 x0*/\
"movq " #A3 ",%%mm3 \n\t"/* 3 ; w06 w04 w02 w00*/\
"punpcklwd %%mm1,%%mm0 \n\t"/* x5 x1 x4 x0*/\
"movq %%mm0,%%mm5 \n\t"/* 5 ; x5 x1 x4 x0*/\
"punpckldq %%mm0,%%mm0 \n\t"/* x4 x0 x4 x0*/\
"movq 8+" #A3 ",%%mm4 \n\t"/* 4 ; w07 w05 w03 w01*/\
"punpckhwd %%mm1,%%mm2 \n\t"/* 1 ; x7 x3 x6 x2*/\
"pmaddwd %%mm0,%%mm3 \n\t"/* x4*w06+x0*w04 x4*w02+x0*w00*/\
"movq %%mm2,%%mm6 \n\t"/* 6 ; x7 x3 x6 x2*/\
"movq 32+" #A3 ",%%mm1 \n\t"/* 1 ; w22 w20 w18 w16*/\
"punpckldq %%mm2,%%mm2 \n\t"/* x6 x2 x6 x2*/\
"pmaddwd %%mm2,%%mm4 \n\t"/* x6*w07+x2*w05 x6*w03+x2*w01*/\
"punpckhdq %%mm5,%%mm5 \n\t"/* x5 x1 x5 x1*/\
"pmaddwd 16+" #A3 ",%%mm0 \n\t"/* x4*w14+x0*w12 x4*w10+x0*w08*/\
"punpckhdq %%mm6,%%mm6 \n\t"/* x7 x3 x7 x3*/\
"movq 40+" #A3 ",%%mm7 \n\t"/* 7 ; w23 w21 w19 w17*/\
"pmaddwd %%mm5,%%mm1 \n\t"/* x5*w22+x1*w20 x5*w18+x1*w16*/\
"paddd " #A4 ",%%mm3 \n\t"/* +%4*/\
"pmaddwd %%mm6,%%mm7 \n\t"/* x7*w23+x3*w21 x7*w19+x3*w17*/\
"pmaddwd 24+" #A3 ",%%mm2 \n\t"/* x6*w15+x2*w13 x6*w11+x2*w09*/\
"paddd %%mm4,%%mm3 \n\t"/* 4 ; a1=sum(even1) a0=sum(even0)*/\
"pmaddwd 48+" #A3 ",%%mm5 \n\t"/* x5*w30+x1*w28 x5*w26+x1*w24*/\
"movq %%mm3,%%mm4 \n\t"/* 4 ; a1 a0*/\
"pmaddwd 56+" #A3 ",%%mm6 \n\t"/* x7*w31+x3*w29 x7*w27+x3*w25*/\
"paddd %%mm7,%%mm1 \n\t"/* 7 ; b1=sum(odd1) b0=sum(odd0)*/\
"paddd " #A4 ",%%mm0 \n\t"/* +%4*/\
"psubd %%mm1,%%mm3 \n\t"/* a1-b1 a0-b0*/\
"psrad $11,%%mm3 \n\t"/* y6=a1-b1 y7=a0-b0*/\
"paddd %%mm4,%%mm1 \n\t"/* 4 ; a1+b1 a0+b0*/\
"paddd %%mm2,%%mm0 \n\t"/* 2 ; a3=sum(even3) a2=sum(even2)*/\
"psrad $11,%%mm1 \n\t"/* y1=a1+b1 y0=a0+b0*/\
"paddd %%mm6,%%mm5 \n\t"/* 6 ; b3=sum(odd3) b2=sum(odd2)*/\
"movq %%mm0,%%mm4 \n\t"/* 4 ; a3 a2*/\
"paddd %%mm5,%%mm0 \n\t"/* a3+b3 a2+b2*/\
"psubd %%mm5,%%mm4 \n\t"/* 5 ; a3-b3 a2-b2*/\
"psrad $11,%%mm0 \n\t"/* y3=a3+b3 y2=a2+b2*/\
"psrad $11,%%mm4 \n\t"/* y4=a3-b3 y5=a2-b2*/\
"packssdw %%mm0,%%mm1 \n\t"/* 0 ; y3 y2 y1 y0*/\
"packssdw %%mm3,%%mm4 \n\t"/* 3 ; y6 y7 y4 y5*/\
"movq %%mm4,%%mm7 \n\t"/* 7 ; y6 y7 y4 y5*/\
"psrld $16,%%mm4 \n\t"/* 0 y6 0 y4*/\
"pslld $16,%%mm7 \n\t"/* y7 0 y5 0*/\
"movq %%mm1," #A2 " \n\t"/* 1 ; save y3 y2 y1 y0*/\
"por %%mm4,%%mm7 \n\t"/* 4 ; y7 y6 y5 y4*/\
"movq %%mm7,8 +" #A2 "\n\t"/* 7 ; save y7 y6 y5 y4*/\
//-----------------------------------------------------------------------------
// DCT_8_INV_ROW_XMM( INP, OUT, TABLE, ROUNDER
//-----------------------------------------------------------------------------
#define DCT_8_INV_ROW_XMM(A1,A2,A3,A4)\
"movq " #A1 ",%%mm0 \n\t"/* 0 ; x3 x2 x1 x0*/\
"movq 8+" #A1 ",%%mm1 \n\t"/* 1 ; x7 x6 x5 x4*/\
"movq %%mm0,%%mm2 \n\t"/* 2 ; x3 x2 x1 x0*/\
"movq " #A3 ",%%mm3 \n\t"/* 3 ; w05 w04 w01 w00*/\
"pshufw $0x88,%%mm0,%%mm0 \n\t"/* x2 x0 x2 x0*/\
"movq 8+" #A3 ",%%mm4 \n\t"/* 4 ; w07 w06 w03 w02*/\
"movq %%mm1,%%mm5 \n\t"/* 5 ; x7 x6 x5 x4*/\
"pmaddwd %%mm0,%%mm3 \n\t"/* x2*w05+x0*w04 x2*w01+x0*w00*/\
"movq 32+" #A3 ",%%mm6 \n\t"/* 6 ; w21 w20 w17 w16*/\
"pshufw $0x88,%%mm1,%%mm1 \n\t"/* x6 x4 x6 x4*/\
"pmaddwd %%mm1,%%mm4 \n\t"/* x6*w07+x4*w06 x6*w03+x4*w02*/\
"movq 40+" #A3 ",%%mm7 \n\t"/* 7 ; w23 w22 w19 w18*/\
"pshufw $0xdd,%%mm2,%%mm2 \n\t"/* x3 x1 x3 x1*/\
"pmaddwd %%mm2,%%mm6 \n\t"/* x3*w21+x1*w20 x3*w17+x1*w16*/\
"pshufw $0xdd,%%mm5,%%mm5 \n\t"/* x7 x5 x7 x5*/\
"pmaddwd %%mm5,%%mm7 \n\t"/* x7*w23+x5*w22 x7*w19+x5*w18*/\
"paddd " #A4 ",%%mm3 \n\t"/* +%4*/\
"pmaddwd 16+" #A3 ",%%mm0 \n\t"/* x2*w13+x0*w12 x2*w09+x0*w08*/\
"paddd %%mm4,%%mm3 \n\t"/* 4 ; a1=sum(even1) a0=sum(even0)*/\
"pmaddwd 24+" #A3 ",%%mm1 \n\t"/* x6*w15+x4*w14 x6*w11+x4*w10*/\
"movq %%mm3,%%mm4 \n\t"/* 4 ; a1 a0*/\
"pmaddwd 48+" #A3 ",%%mm2 \n\t"/* x3*w29+x1*w28 x3*w25+x1*w24*/\
"paddd %%mm7,%%mm6 \n\t"/* 7 ; b1=sum(odd1) b0=sum(odd0)*/\
"pmaddwd 56+" #A3 ",%%mm5 \n\t"/* x7*w31+x5*w30 x7*w27+x5*w26*/\
"paddd %%mm6,%%mm3 \n\t"/* a1+b1 a0+b0*/\
"paddd " #A4 ",%%mm0 \n\t"/* +%4*/\
"psrad $11,%%mm3 \n\t"/* y1=a1+b1 y0=a0+b0*/\
"paddd %%mm1,%%mm0 \n\t"/* 1 ; a3=sum(even3) a2=sum(even2)*/\
"psubd %%mm6,%%mm4 \n\t"/* 6 ; a1-b1 a0-b0*/\
"movq %%mm0,%%mm7 \n\t"/* 7 ; a3 a2*/\
"paddd %%mm5,%%mm2 \n\t"/* 5 ; b3=sum(odd3) b2=sum(odd2)*/\
"paddd %%mm2,%%mm0 \n\t"/* a3+b3 a2+b2*/\
"psrad $11,%%mm4 \n\t"/* y6=a1-b1 y7=a0-b0*/\
"psubd %%mm2,%%mm7 \n\t"/* 2 ; a3-b3 a2-b2*/\
"psrad $11,%%mm0 \n\t"/* y3=a3+b3 y2=a2+b2*/\
"psrad $11,%%mm7 \n\t"/* y4=a3-b3 y5=a2-b2*/\
"packssdw %%mm0,%%mm3 \n\t"/* 0 ; y3 y2 y1 y0*/\
"packssdw %%mm4,%%mm7 \n\t"/* 4 ; y6 y7 y4 y5*/\
"movq %%mm3, " #A2 " \n\t"/* 3 ; save y3 y2 y1 y0*/\
"pshufw $0xb1,%%mm7,%%mm7 \n\t"/* y7 y6 y5 y4*/\
"movq %%mm7,8 +" #A2 "\n\t"/* 7 ; save y7 y6 y5 y4*/\
//-----------------------------------------------------------------------------
//
// The first stage DCT 8x8 - forward DCTs of columns
//
// The %2puts are multiplied
// for rows 0,4 - on cos_4_16,
// for rows 1,7 - on cos_1_16,
// for rows 2,6 - on cos_2_16,
// for rows 3,5 - on cos_3_16
// and are shifted to the left for rise of accuracy
//
//-----------------------------------------------------------------------------
//
// The 8-point scaled forward DCT algorithm (26a8m)
//
//-----------------------------------------------------------------------------
//
// #define DCT_8_FRW_COL(x, y)
//{
// short t0, t1, t2, t3, t4, t5, t6, t7;
// short tp03, tm03, tp12, tm12, tp65, tm65;
// short tp465, tm465, tp765, tm765;
//
// t0 = LEFT_SHIFT ( x[0] + x[7] );
// t1 = LEFT_SHIFT ( x[1] + x[6] );
// t2 = LEFT_SHIFT ( x[2] + x[5] );
// t3 = LEFT_SHIFT ( x[3] + x[4] );
// t4 = LEFT_SHIFT ( x[3] - x[4] );
// t5 = LEFT_SHIFT ( x[2] - x[5] );
// t6 = LEFT_SHIFT ( x[1] - x[6] );
// t7 = LEFT_SHIFT ( x[0] - x[7] );
//
// tp03 = t0 + t3;
// tm03 = t0 - t3;
// tp12 = t1 + t2;
// tm12 = t1 - t2;
//
// y[0] = tp03 + tp12;
// y[4] = tp03 - tp12;
//
// y[2] = tm03 + tm12 * tg_2_16;
// y[6] = tm03 * tg_2_16 - tm12;
//
// tp65 =(t6 +t5 )*cos_4_16;
// tm65 =(t6 -t5 )*cos_4_16;
//
// tp765 = t7 + tp65;
// tm765 = t7 - tp65;
// tp465 = t4 + tm65;
// tm465 = t4 - tm65;
//
// y[1] = tp765 + tp465 * tg_1_16;
// y[7] = tp765 * tg_1_16 - tp465;
// y[5] = tm765 * tg_3_16 + tm465;
// y[3] = tm765 - tm465 * tg_3_16;
//}
//
//-----------------------------------------------------------------------------
//-----------------------------------------------------------------------------
// DCT_8_INV_COL_4 INP,OUT
//-----------------------------------------------------------------------------
#define DCT_8_INV_COL(A1,A2)\
"movq 2*8(%3),%%mm0\n\t"\
"movq 16*3+" #A1 ",%%mm3\n\t"\
"movq %%mm0,%%mm1 \n\t"/* tg_3_16*/\
"movq 16*5+" #A1 ",%%mm5\n\t"\
"pmulhw %%mm3,%%mm0 \n\t"/* x3*(tg_3_16-1)*/\
"movq (%3),%%mm4\n\t"\
"pmulhw %%mm5,%%mm1 \n\t"/* x5*(tg_3_16-1)*/\
"movq 16*7+" #A1 ",%%mm7\n\t"\
"movq %%mm4,%%mm2 \n\t"/* tg_1_16*/\
"movq 16*1+" #A1 ",%%mm6\n\t"\
"pmulhw %%mm7,%%mm4 \n\t"/* x7*tg_1_16*/\
"paddsw %%mm3,%%mm0 \n\t"/* x3*tg_3_16*/\
"pmulhw %%mm6,%%mm2 \n\t"/* x1*tg_1_16*/\
"paddsw %%mm3,%%mm1 \n\t"/* x3+x5*(tg_3_16-1)*/\
"psubsw %%mm5,%%mm0 \n\t"/* x3*tg_3_16-x5 = tm35*/\
"movq 3*8(%3),%%mm3\n\t"\
"paddsw %%mm5,%%mm1 \n\t"/* x3+x5*tg_3_16 = tp35*/\
"paddsw %%mm6,%%mm4 \n\t"/* x1+tg_1_16*x7 = tp17*/\
"psubsw %%mm7,%%mm2 \n\t"/* x1*tg_1_16-x7 = tm17*/\
"movq %%mm4,%%mm5 \n\t"/* tp17*/\
"movq %%mm2,%%mm6 \n\t"/* tm17*/\
"paddsw %%mm1,%%mm5 \n\t"/* tp17+tp35 = b0*/\
"psubsw %%mm0,%%mm6 \n\t"/* tm17-tm35 = b3*/\
"psubsw %%mm1,%%mm4 \n\t"/* tp17-tp35 = t1*/\
"paddsw %%mm0,%%mm2 \n\t"/* tm17+tm35 = t2*/\
"movq 1*8(%3),%%mm7\n\t"\
"movq %%mm4,%%mm1 \n\t"/* t1*/\
"movq %%mm5,3*16 +" #A2 "\n\t"/* save b0*/\
"paddsw %%mm2,%%mm1 \n\t"/* t1+t2*/\
"movq %%mm6,5*16 +" #A2 "\n\t"/* save b3*/\
"psubsw %%mm2,%%mm4 \n\t"/* t1-t2*/\
"movq 2*16+" #A1 ",%%mm5\n\t"\
"movq %%mm7,%%mm0 \n\t"/* tg_2_16*/\
"movq 6*16+" #A1 ",%%mm6\n\t"\
"pmulhw %%mm5,%%mm0 \n\t"/* x2*tg_2_16*/\
"pmulhw %%mm6,%%mm7 \n\t"/* x6*tg_2_16*/\
"pmulhw %%mm3,%%mm1 \n\t"/* ocos_4_16*(t1+t2) = b1/2*/\
"movq 0*16+" #A1 ",%%mm2\n\t"\
"pmulhw %%mm3,%%mm4 \n\t"/* ocos_4_16*(t1-t2) = b2/2*/\
"psubsw %%mm6,%%mm0 \n\t"/* t2*tg_2_16-x6 = tm26*/\
"movq %%mm2,%%mm3 \n\t"/* x0*/\
"movq 4*16+" #A1 ",%%mm6\n\t"\
"paddsw %%mm5,%%mm7 \n\t"/* x2+x6*tg_2_16 = tp26*/\
"paddsw %%mm6,%%mm2 \n\t"/* x0+x4 = tp04*/\
"psubsw %%mm6,%%mm3 \n\t"/* x0-x4 = tm04*/\
"movq %%mm2,%%mm5 \n\t"/* tp04*/\
"movq %%mm3,%%mm6 \n\t"/* tm04*/\
"psubsw %%mm7,%%mm2 \n\t"/* tp04-tp26 = a3*/\
"paddsw %%mm0,%%mm3 \n\t"/* tm04+tm26 = a1*/\
"paddsw %%mm1,%%mm1 \n\t"/* b1*/\
"paddsw %%mm4,%%mm4 \n\t"/* b2*/\
"paddsw %%mm7,%%mm5 \n\t"/* tp04+tp26 = a0*/\
"psubsw %%mm0,%%mm6 \n\t"/* tm04-tm26 = a2*/\
"movq %%mm3,%%mm7 \n\t"/* a1*/\
"movq %%mm6,%%mm0 \n\t"/* a2*/\
"paddsw %%mm1,%%mm3 \n\t"/* a1+b1*/\
"paddsw %%mm4,%%mm6 \n\t"/* a2+b2*/\
"psraw $6,%%mm3 \n\t"/* dst1*/\
"psubsw %%mm1,%%mm7 \n\t"/* a1-b1*/\
"psraw $6,%%mm6 \n\t"/* dst2*/\
"psubsw %%mm4,%%mm0 \n\t"/* a2-b2*/\
"movq 3*16+" #A2 ",%%mm1 \n\t"/* load b0*/\
"psraw $6,%%mm7 \n\t"/* dst6*/\
"movq %%mm5,%%mm4 \n\t"/* a0*/\
"psraw $6,%%mm0 \n\t"/* dst5*/\
"movq %%mm3,1*16+" #A2 "\n\t"\
"paddsw %%mm1,%%mm5 \n\t"/* a0+b0*/\
"movq %%mm6,2*16+" #A2 "\n\t"\
"psubsw %%mm1,%%mm4 \n\t"/* a0-b0*/\
"movq 5*16+" #A2 ",%%mm3 \n\t"/* load b3*/\
"psraw $6,%%mm5 \n\t"/* dst0*/\
"movq %%mm2,%%mm6 \n\t"/* a3*/\
"psraw $6,%%mm4 \n\t"/* dst7*/\
"movq %%mm0,5*16+" #A2 "\n\t"\
"paddsw %%mm3,%%mm2 \n\t"/* a3+b3*/\
"movq %%mm7,6*16+" #A2 "\n\t"\
"psubsw %%mm3,%%mm6 \n\t"/* a3-b3*/\
"movq %%mm5,0*16+" #A2 "\n\t"\
"psraw $6,%%mm2 \n\t"/* dst3*/\
"movq %%mm4,7*16+" #A2 "\n\t"\
"psraw $6,%%mm6 \n\t"/* dst4*/\
"movq %%mm2,3*16+" #A2 "\n\t"\
"movq %%mm6,4*16+" #A2 "\n\t"
//=============================================================================
// Code
//=============================================================================
//-----------------------------------------------------------------------------
// void idct_mmx(uint16_t block[64]);
//-----------------------------------------------------------------------------
void ff_idct_xvid_mmx(short *block){
__asm__ volatile(
//# Process each row
DCT_8_INV_ROW_MMX(0*16(%0), 0*16(%0), 64*0(%2), 8*0(%1))
DCT_8_INV_ROW_MMX(1*16(%0), 1*16(%0), 64*1(%2), 8*1(%1))
DCT_8_INV_ROW_MMX(2*16(%0), 2*16(%0), 64*2(%2), 8*2(%1))
DCT_8_INV_ROW_MMX(3*16(%0), 3*16(%0), 64*3(%2), 8*3(%1))
DCT_8_INV_ROW_MMX(4*16(%0), 4*16(%0), 64*0(%2), 8*4(%1))
DCT_8_INV_ROW_MMX(5*16(%0), 5*16(%0), 64*3(%2), 8*5(%1))
DCT_8_INV_ROW_MMX(6*16(%0), 6*16(%0), 64*2(%2), 8*6(%1))
DCT_8_INV_ROW_MMX(7*16(%0), 7*16(%0), 64*1(%2), 8*7(%1))
//# Process the columns (4 at a time)
DCT_8_INV_COL(0(%0), 0(%0))
DCT_8_INV_COL(8(%0), 8(%0))
:: "r"(block), "r"(rounder_0), "r"(tab_i_04_mmx), "r"(tg_1_16));
}
void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, int16_t *block)
{
ff_idct_xvid_mmx(block);
ff_put_pixels_clamped_mmx(block, dest, line_size);
}
void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, int16_t *block)
{
ff_idct_xvid_mmx(block);
ff_add_pixels_clamped_mmx(block, dest, line_size);
}
#endif /* HAVE_MMX_INLINE */
#if HAVE_MMXEXT_INLINE
//-----------------------------------------------------------------------------
// void idct_xmm(uint16_t block[64]);
//-----------------------------------------------------------------------------
void ff_idct_xvid_mmxext(short *block)
{
__asm__ volatile(
//# Process each row
DCT_8_INV_ROW_XMM(0*16(%0), 0*16(%0), 64*0(%2), 8*0(%1))
DCT_8_INV_ROW_XMM(1*16(%0), 1*16(%0), 64*1(%2), 8*1(%1))
DCT_8_INV_ROW_XMM(2*16(%0), 2*16(%0), 64*2(%2), 8*2(%1))
DCT_8_INV_ROW_XMM(3*16(%0), 3*16(%0), 64*3(%2), 8*3(%1))
DCT_8_INV_ROW_XMM(4*16(%0), 4*16(%0), 64*0(%2), 8*4(%1))
DCT_8_INV_ROW_XMM(5*16(%0), 5*16(%0), 64*3(%2), 8*5(%1))
DCT_8_INV_ROW_XMM(6*16(%0), 6*16(%0), 64*2(%2), 8*6(%1))
DCT_8_INV_ROW_XMM(7*16(%0), 7*16(%0), 64*1(%2), 8*7(%1))
//# Process the columns (4 at a time)
DCT_8_INV_COL(0(%0), 0(%0))
DCT_8_INV_COL(8(%0), 8(%0))
:: "r"(block), "r"(rounder_0), "r"(tab_i_04_xmm), "r"(tg_1_16));
}
void ff_idct_xvid_mmxext_put(uint8_t *dest, int line_size, int16_t *block)
{
ff_idct_xvid_mmxext(block);
ff_put_pixels_clamped_mmx(block, dest, line_size);
}
void ff_idct_xvid_mmxext_add(uint8_t *dest, int line_size, int16_t *block)
{
ff_idct_xvid_mmxext(block);
ff_add_pixels_clamped_mmx(block, dest, line_size);
}
#endif /* HAVE_MMXEXT_INLINE */

View File

@@ -0,0 +1,407 @@
/*
* XVID MPEG-4 VIDEO CODEC
* - SSE2 inverse discrete cosine transform -
*
* Copyright(C) 2003 Pascal Massimino <skal@planet-d.net>
*
* Conversion to gcc syntax with modifications
* by Alexander Strange <astrange@ithinksw.com>
*
* Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
*
* This file is part of FFmpeg.
*
* Vertical pass is an implementation of the scheme:
* Loeffler C., Ligtenberg A., and Moschytz C.S.:
* Practical Fast 1D DCT Algorithm with Eleven Multiplications,
* Proc. ICASSP 1989, 988-991.
*
* Horizontal pass is a double 4x4 vector/matrix multiplication,
* (see also Intel's Application Note 922:
* http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
* Copyright (C) 1999 Intel Corporation)
*
* More details at http://skal.planet-d.net/coding/dct.html
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with FFmpeg; if not, write to the Free Software Foundation,
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "idct_xvid.h"
#include "dsputil_x86.h"
#if HAVE_SSE2_INLINE
/**
* @file
* @brief SSE2 idct compatible with xvidmmx
*/
#define X8(x) x,x,x,x,x,x,x,x
#define ROW_SHIFT 11
#define COL_SHIFT 6
DECLARE_ASM_CONST(16, int16_t, tan1)[] = {X8(13036)}; // tan( pi/16)
DECLARE_ASM_CONST(16, int16_t, tan2)[] = {X8(27146)}; // tan(2pi/16) = sqrt(2)-1
DECLARE_ASM_CONST(16, int16_t, tan3)[] = {X8(43790)}; // tan(3pi/16)-1
DECLARE_ASM_CONST(16, int16_t, sqrt2)[]= {X8(23170)}; // 0.5/sqrt(2)
DECLARE_ASM_CONST(8, uint8_t, m127)[] = {X8(127)};
DECLARE_ASM_CONST(16, int16_t, iTab1)[] = {
0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d,
0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61,
0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7,
0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
};
DECLARE_ASM_CONST(16, int16_t, iTab2)[] = {
0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5,
0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04,
0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41,
0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
};
DECLARE_ASM_CONST(16, int16_t, iTab3)[] = {
0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf,
0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf,
0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d,
0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
};
DECLARE_ASM_CONST(16, int16_t, iTab4)[] = {
0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746,
0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac,
0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df,
0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
};
DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders)[] = {
65536, 65536, 65536, 65536,
3597, 3597, 3597, 3597,
2260, 2260, 2260, 2260,
1203, 1203, 1203, 1203,
120, 120, 120, 120,
512, 512, 512, 512
};
// Temporary storage before the column pass
#define ROW1 "%%xmm6"
#define ROW3 "%%xmm4"
#define ROW5 "%%xmm5"
#define ROW7 "%%xmm7"
#define CLEAR_ODD(r) "pxor "r","r" \n\t"
#define PUT_ODD(dst) "pshufhw $0x1B, %%xmm2, "dst" \n\t"
#if ARCH_X86_64
# define ROW0 "%%xmm8"
# define REG0 ROW0
# define ROW2 "%%xmm9"
# define REG2 ROW2
# define ROW4 "%%xmm10"
# define REG4 ROW4
# define ROW6 "%%xmm11"
# define REG6 ROW6
# define CLEAR_EVEN(r) CLEAR_ODD(r)
# define PUT_EVEN(dst) PUT_ODD(dst)
# define XMMS "%%xmm12"
# define MOV_32_ONLY "#"
# define SREG2 REG2
# define TAN3 "%%xmm13"
# define TAN1 "%%xmm14"
#else
# define ROW0 "(%0)"
# define REG0 "%%xmm4"
# define ROW2 "2*16(%0)"
# define REG2 "%%xmm4"
# define ROW4 "4*16(%0)"
# define REG4 "%%xmm6"
# define ROW6 "6*16(%0)"
# define REG6 "%%xmm6"
# define CLEAR_EVEN(r)
# define PUT_EVEN(dst) \
"pshufhw $0x1B, %%xmm2, %%xmm2 \n\t" \
"movdqa %%xmm2, "dst" \n\t"
# define XMMS "%%xmm2"
# define MOV_32_ONLY "movdqa "
# define SREG2 "%%xmm7"
# define TAN3 "%%xmm0"
# define TAN1 "%%xmm2"
#endif
#define ROUND(x) "paddd "MANGLE(x)
#define JZ(reg, to) \
"testl "reg","reg" \n\t" \
"jz "to" \n\t"
#define JNZ(reg, to) \
"testl "reg","reg" \n\t" \
"jnz "to" \n\t"
#define TEST_ONE_ROW(src, reg, clear) \
clear \
"movq "src", %%mm1 \n\t" \
"por 8+"src", %%mm1 \n\t" \
"paddusb %%mm0, %%mm1 \n\t" \
"pmovmskb %%mm1, "reg" \n\t"
#define TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2) \
clear1 \
clear2 \
"movq "row1", %%mm1 \n\t" \
"por 8+"row1", %%mm1 \n\t" \
"movq "row2", %%mm2 \n\t" \
"por 8+"row2", %%mm2 \n\t" \
"paddusb %%mm0, %%mm1 \n\t" \
"paddusb %%mm0, %%mm2 \n\t" \
"pmovmskb %%mm1, "reg1" \n\t" \
"pmovmskb %%mm2, "reg2" \n\t"
///IDCT pass on rows.
#define iMTX_MULT(src, table, rounder, put) \
"movdqa "src", %%xmm3 \n\t" \
"movdqa %%xmm3, %%xmm0 \n\t" \
"pshufd $0x11, %%xmm3, %%xmm1 \n\t" /* 4602 */ \
"punpcklqdq %%xmm0, %%xmm0 \n\t" /* 0246 */ \
"pmaddwd "table", %%xmm0 \n\t" \
"pmaddwd 16+"table", %%xmm1 \n\t" \
"pshufd $0xBB, %%xmm3, %%xmm2 \n\t" /* 5713 */ \
"punpckhqdq %%xmm3, %%xmm3 \n\t" /* 1357 */ \
"pmaddwd 32+"table", %%xmm2 \n\t" \
"pmaddwd 48+"table", %%xmm3 \n\t" \
"paddd %%xmm1, %%xmm0 \n\t" \
"paddd %%xmm3, %%xmm2 \n\t" \
rounder", %%xmm0 \n\t" \
"movdqa %%xmm2, %%xmm3 \n\t" \
"paddd %%xmm0, %%xmm2 \n\t" \
"psubd %%xmm3, %%xmm0 \n\t" \
"psrad $11, %%xmm2 \n\t" \
"psrad $11, %%xmm0 \n\t" \
"packssdw %%xmm0, %%xmm2 \n\t" \
put \
"1: \n\t"
#define iLLM_HEAD \
"movdqa "MANGLE(tan3)", "TAN3" \n\t" \
"movdqa "MANGLE(tan1)", "TAN1" \n\t" \
///IDCT pass on columns.
#define iLLM_PASS(dct) \
"movdqa "TAN3", %%xmm1 \n\t" \
"movdqa "TAN1", %%xmm3 \n\t" \
"pmulhw %%xmm4, "TAN3" \n\t" \
"pmulhw %%xmm5, %%xmm1 \n\t" \
"paddsw %%xmm4, "TAN3" \n\t" \
"paddsw %%xmm5, %%xmm1 \n\t" \
"psubsw %%xmm5, "TAN3" \n\t" \
"paddsw %%xmm4, %%xmm1 \n\t" \
"pmulhw %%xmm7, %%xmm3 \n\t" \
"pmulhw %%xmm6, "TAN1" \n\t" \
"paddsw %%xmm6, %%xmm3 \n\t" \
"psubsw %%xmm7, "TAN1" \n\t" \
"movdqa %%xmm3, %%xmm7 \n\t" \
"movdqa "TAN1", %%xmm6 \n\t" \
"psubsw %%xmm1, %%xmm3 \n\t" \
"psubsw "TAN3", "TAN1" \n\t" \
"paddsw %%xmm7, %%xmm1 \n\t" \
"paddsw %%xmm6, "TAN3" \n\t" \
"movdqa %%xmm3, %%xmm6 \n\t" \
"psubsw "TAN3", %%xmm3 \n\t" \
"paddsw %%xmm6, "TAN3" \n\t" \
"movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
"pmulhw %%xmm4, %%xmm3 \n\t" \
"pmulhw %%xmm4, "TAN3" \n\t" \
"paddsw "TAN3", "TAN3" \n\t" \
"paddsw %%xmm3, %%xmm3 \n\t" \
"movdqa "MANGLE(tan2)", %%xmm7 \n\t" \
MOV_32_ONLY ROW2", "REG2" \n\t" \
MOV_32_ONLY ROW6", "REG6" \n\t" \
"movdqa %%xmm7, %%xmm5 \n\t" \
"pmulhw "REG6", %%xmm7 \n\t" \
"pmulhw "REG2", %%xmm5 \n\t" \
"paddsw "REG2", %%xmm7 \n\t" \
"psubsw "REG6", %%xmm5 \n\t" \
MOV_32_ONLY ROW0", "REG0" \n\t" \
MOV_32_ONLY ROW4", "REG4" \n\t" \
MOV_32_ONLY" "TAN1", (%0) \n\t" \
"movdqa "REG0", "XMMS" \n\t" \
"psubsw "REG4", "REG0" \n\t" \
"paddsw "XMMS", "REG4" \n\t" \
"movdqa "REG4", "XMMS" \n\t" \
"psubsw %%xmm7, "REG4" \n\t" \
"paddsw "XMMS", %%xmm7 \n\t" \
"movdqa "REG0", "XMMS" \n\t" \
"psubsw %%xmm5, "REG0" \n\t" \
"paddsw "XMMS", %%xmm5 \n\t" \
"movdqa %%xmm5, "XMMS" \n\t" \
"psubsw "TAN3", %%xmm5 \n\t" \
"paddsw "XMMS", "TAN3" \n\t" \
"movdqa "REG0", "XMMS" \n\t" \
"psubsw %%xmm3, "REG0" \n\t" \
"paddsw "XMMS", %%xmm3 \n\t" \
MOV_32_ONLY" (%0), "TAN1" \n\t" \
"psraw $6, %%xmm5 \n\t" \
"psraw $6, "REG0" \n\t" \
"psraw $6, "TAN3" \n\t" \
"psraw $6, %%xmm3 \n\t" \
"movdqa "TAN3", 1*16("dct") \n\t" \
"movdqa %%xmm3, 2*16("dct") \n\t" \
"movdqa "REG0", 5*16("dct") \n\t" \
"movdqa %%xmm5, 6*16("dct") \n\t" \
"movdqa %%xmm7, %%xmm0 \n\t" \
"movdqa "REG4", %%xmm4 \n\t" \
"psubsw %%xmm1, %%xmm7 \n\t" \
"psubsw "TAN1", "REG4" \n\t" \
"paddsw %%xmm0, %%xmm1 \n\t" \
"paddsw %%xmm4, "TAN1" \n\t" \
"psraw $6, %%xmm1 \n\t" \
"psraw $6, %%xmm7 \n\t" \
"psraw $6, "TAN1" \n\t" \
"psraw $6, "REG4" \n\t" \
"movdqa %%xmm1, ("dct") \n\t" \
"movdqa "TAN1", 3*16("dct") \n\t" \
"movdqa "REG4", 4*16("dct") \n\t" \
"movdqa %%xmm7, 7*16("dct") \n\t"
///IDCT pass on columns, assuming rows 4-7 are zero.
#define iLLM_PASS_SPARSE(dct) \
"pmulhw %%xmm4, "TAN3" \n\t" \
"paddsw %%xmm4, "TAN3" \n\t" \
"movdqa %%xmm6, %%xmm3 \n\t" \
"pmulhw %%xmm6, "TAN1" \n\t" \
"movdqa %%xmm4, %%xmm1 \n\t" \
"psubsw %%xmm1, %%xmm3 \n\t" \
"paddsw %%xmm6, %%xmm1 \n\t" \
"movdqa "TAN1", %%xmm6 \n\t" \
"psubsw "TAN3", "TAN1" \n\t" \
"paddsw %%xmm6, "TAN3" \n\t" \
"movdqa %%xmm3, %%xmm6 \n\t" \
"psubsw "TAN3", %%xmm3 \n\t" \
"paddsw %%xmm6, "TAN3" \n\t" \
"movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
"pmulhw %%xmm4, %%xmm3 \n\t" \
"pmulhw %%xmm4, "TAN3" \n\t" \
"paddsw "TAN3", "TAN3" \n\t" \
"paddsw %%xmm3, %%xmm3 \n\t" \
"movdqa "MANGLE(tan2)", %%xmm5 \n\t" \
MOV_32_ONLY ROW2", "SREG2" \n\t" \
"pmulhw "SREG2", %%xmm5 \n\t" \
MOV_32_ONLY ROW0", "REG0" \n\t" \
"movdqa "REG0", %%xmm6 \n\t" \
"psubsw "SREG2", %%xmm6 \n\t" \
"paddsw "REG0", "SREG2" \n\t" \
MOV_32_ONLY" "TAN1", (%0) \n\t" \
"movdqa "REG0", "XMMS" \n\t" \
"psubsw %%xmm5, "REG0" \n\t" \
"paddsw "XMMS", %%xmm5 \n\t" \
"movdqa %%xmm5, "XMMS" \n\t" \
"psubsw "TAN3", %%xmm5 \n\t" \
"paddsw "XMMS", "TAN3" \n\t" \
"movdqa "REG0", "XMMS" \n\t" \
"psubsw %%xmm3, "REG0" \n\t" \
"paddsw "XMMS", %%xmm3 \n\t" \
MOV_32_ONLY" (%0), "TAN1" \n\t" \
"psraw $6, %%xmm5 \n\t" \
"psraw $6, "REG0" \n\t" \
"psraw $6, "TAN3" \n\t" \
"psraw $6, %%xmm3 \n\t" \
"movdqa "TAN3", 1*16("dct") \n\t" \
"movdqa %%xmm3, 2*16("dct") \n\t" \
"movdqa "REG0", 5*16("dct") \n\t" \
"movdqa %%xmm5, 6*16("dct") \n\t" \
"movdqa "SREG2", %%xmm0 \n\t" \
"movdqa %%xmm6, %%xmm4 \n\t" \
"psubsw %%xmm1, "SREG2" \n\t" \
"psubsw "TAN1", %%xmm6 \n\t" \
"paddsw %%xmm0, %%xmm1 \n\t" \
"paddsw %%xmm4, "TAN1" \n\t" \
"psraw $6, %%xmm1 \n\t" \
"psraw $6, "SREG2" \n\t" \
"psraw $6, "TAN1" \n\t" \
"psraw $6, %%xmm6 \n\t" \
"movdqa %%xmm1, ("dct") \n\t" \
"movdqa "TAN1", 3*16("dct") \n\t" \
"movdqa %%xmm6, 4*16("dct") \n\t" \
"movdqa "SREG2", 7*16("dct") \n\t"
inline void ff_idct_xvid_sse2(short *block)
{
__asm__ volatile(
"movq "MANGLE(m127)", %%mm0 \n\t"
iMTX_MULT("(%0)", MANGLE(iTab1), ROUND(walkenIdctRounders), PUT_EVEN(ROW0))
iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+1*16), PUT_ODD(ROW1))
iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+2*16), PUT_EVEN(ROW2))
TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4))
JZ("%%eax", "1f")
iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+3*16), PUT_ODD(ROW3))
TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6))
TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7))
iLLM_HEAD
".p2align 4 \n\t"
JNZ("%%ecx", "2f")
JNZ("%%eax", "3f")
JNZ("%%edx", "4f")
JNZ("%%esi", "5f")
iLLM_PASS_SPARSE("%0")
"jmp 6f \n\t"
"2: \n\t"
iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4))
"3: \n\t"
iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+4*16), PUT_ODD(ROW5))
JZ("%%edx", "1f")
"4: \n\t"
iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+5*16), PUT_EVEN(ROW6))
JZ("%%esi", "1f")
"5: \n\t"
iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+5*16), PUT_ODD(ROW7))
#if ARCH_X86_32
iLLM_HEAD
#endif
iLLM_PASS("%0")
"6: \n\t"
: "+r"(block)
:
: XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" ,
"%xmm4" , "%xmm5" , "%xmm6" , "%xmm7" ,)
#if ARCH_X86_64
XMM_CLOBBERS("%xmm8" , "%xmm9" , "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14",)
#endif
"%eax", "%ecx", "%edx", "%esi", "memory"
);
}
void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block)
{
ff_idct_xvid_sse2(block);
ff_put_pixels_clamped_mmx(block, dest, line_size);
}
void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block)
{
ff_idct_xvid_sse2(block);
ff_add_pixels_clamped_mmx(block, dest, line_size);
}
#endif /* HAVE_SSE2_INLINE */

View File

@@ -0,0 +1,43 @@
/*
* XVID MPEG-4 VIDEO CODEC
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/**
* @file
* header for Xvid IDCT functions
*/
#ifndef AVCODEC_X86_IDCT_XVID_H
#define AVCODEC_X86_IDCT_XVID_H
#include <stdint.h>
void ff_idct_xvid_mmx(short *block);
void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, int16_t *block);
void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, int16_t *block);
void ff_idct_xvid_mmxext(short *block);
void ff_idct_xvid_mmxext_put(uint8_t *dest, int line_size, int16_t *block);
void ff_idct_xvid_mmxext_add(uint8_t *dest, int line_size, int16_t *block);
void ff_idct_xvid_sse2(short *block);
void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block);
void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block);
#endif /* AVCODEC_X86_IDCT_XVID_H */

View File

@@ -0,0 +1,724 @@
;******************************************************************************
;* 36 point SSE-optimized IMDCT transform
;* Copyright (c) 2011 Vitor Sessak
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
align 16
ps_mask: dd 0, ~0, ~0, ~0
ps_mask2: dd 0, ~0, 0, ~0
ps_mask3: dd 0, 0, 0, ~0
ps_mask4: dd 0, ~0, 0, 0
ps_val1: dd -0.5, -0.5, -0.8660254038, -0.8660254038
ps_val2: dd 1.0, 1.0, 0.8660254038, 0.8660254038
ps_val3: dd 0.1736481777, 0.1736481777, 0.3420201433, 0.3420201433
ps_val4: dd -0.7660444431, -0.7660444431, 0.8660254038, 0.8660254038
ps_val5: dd -0.9396926208, -0.9396926208, -0.9848077530, -0.9848077530
ps_val6: dd 0.5, 0.5, -0.6427876097, -0.6427876097
ps_val7: dd 1.0, 1.0, -0.6427876097, -0.6427876097
ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000
ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000
ps_cosh: dd 1.0, 0.50190991877167369479, 1.0, 5.73685662283492756461
dd 1.0, 0.51763809020504152469, 1.0, 1.93185165257813657349
dd 1.0, 0.55168895948124587824, -1.0, -1.18310079157624925896
dd 1.0, 0.61038729438072803416, -1.0, -0.87172339781054900991
dd 1.0, 0.70710678118654752439, 0.0, 0.0
ps_cosh_sse3: dd 1.0, -0.50190991877167369479, 1.0, -5.73685662283492756461
dd 1.0, -0.51763809020504152469, 1.0, -1.93185165257813657349
dd 1.0, -0.55168895948124587824, -1.0, 1.18310079157624925896
dd 1.0, -0.61038729438072803416, -1.0, 0.87172339781054900991
dd 1.0, 0.70710678118654752439, 0.0, 0.0
costabs: times 4 dd 0.98480773
times 4 dd 0.93969262
times 4 dd 0.86602539
times 4 dd -0.76604444
times 4 dd -0.64278764
times 4 dd 0.50000000
times 4 dd -0.50000000
times 4 dd -0.34202015
times 4 dd -0.17364818
times 4 dd 0.50190992
times 4 dd 0.51763808
times 4 dd 0.55168896
times 4 dd 0.61038726
times 4 dd 0.70710677
times 4 dd 0.87172341
times 4 dd 1.18310082
times 4 dd 1.93185163
times 4 dd 5.73685646
%define SBLIMIT 32
SECTION_TEXT
%macro PSHUFD 3
%if cpuflag(sse2) && notcpuflag(avx)
pshufd %1, %2, %3
%else
shufps %1, %2, %2, %3
%endif
%endmacro
; input %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
; output %1={x3,x4,y1,y2}
%macro BUILDINVHIGHLOW 3
%if cpuflag(avx)
shufps %1, %2, %3, 0x4e
%else
movlhps %1, %3
movhlps %1, %2
%endif
%endmacro
; input %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
; output %1={x4,y1,y2,y3}
%macro ROTLEFT 3
%if cpuflag(ssse3)
palignr %1, %3, %2, 12
%else
BUILDINVHIGHLOW %1, %2, %3
shufps %1, %1, %3, 0x99
%endif
%endmacro
%macro INVERTHL 2
%if cpuflag(sse2)
PSHUFD %1, %2, 0x4e
%else
movhlps %1, %2
movlhps %1, %2
%endif
%endmacro
%macro BUTTERF 3
INVERTHL %2, %1
xorps %1, [ps_p1p1m1m1]
addps %1, %2
%if cpuflag(sse3)
mulps %1, %1, [ps_cosh_sse3 + %3]
PSHUFD %2, %1, 0xb1
addsubps %1, %1, %2
%else
mulps %1, [ps_cosh + %3]
PSHUFD %2, %1, 0xb1
xorps %1, [ps_p1m1p1m1]
addps %1, %2
%endif
%endmacro
%macro STORE 4
movhlps %2, %1
movss [%3 ], %1
movss [%3 + 2*%4], %2
shufps %1, %1, 0xb1
movss [%3 + %4], %1
movhlps %2, %1
movss [%3 + 3*%4], %2
%endmacro
%macro LOAD 4
movlps %1, [%3 ]
movhps %1, [%3 + %4]
movlps %2, [%3 + 2*%4]
movhps %2, [%3 + 3*%4]
shufps %1, %2, 0x88
%endmacro
%macro LOADA64 2
%if cpuflag(avx)
movu %1, [%2]
%else
movlps %1, [%2]
movhps %1, [%2 + 8]
%endif
%endmacro
%macro DEFINE_IMDCT 0
cglobal imdct36_float, 4,4,9, out, buf, in, win
; for(i=17;i>=1;i--) in[i] += in[i-1];
LOADA64 m0, inq
LOADA64 m1, inq + 16
ROTLEFT m5, m0, m1
PSHUFD m6, m0, 0x93
andps m6, m6, [ps_mask]
addps m0, m0, m6
LOADA64 m2, inq + 32
ROTLEFT m7, m1, m2
addps m1, m1, m5
LOADA64 m3, inq + 48
ROTLEFT m5, m2, m3
xorps m4, m4, m4
movlps m4, [inq+64]
BUILDINVHIGHLOW m6, m3, m4
shufps m6, m6, m4, 0xa9
addps m4, m4, m6
addps m2, m2, m7
addps m3, m3, m5
; for(i=17;i>=3;i-=2) in[i] += in[i-2];
movlhps m5, m5, m0
andps m5, m5, [ps_mask3]
BUILDINVHIGHLOW m7, m0, m1
andps m7, m7, [ps_mask2]
addps m0, m0, m5
BUILDINVHIGHLOW m6, m1, m2
andps m6, m6, [ps_mask2]
addps m1, m1, m7
BUILDINVHIGHLOW m7, m2, m3
andps m7, m7, [ps_mask2]
addps m2, m2, m6
movhlps m6, m6, m3
andps m6, m6, [ps_mask4]
addps m3, m3, m7
addps m4, m4, m6
; Populate tmp[]
movlhps m6, m1, m5 ; zero out high values
subps m6, m6, m4
subps m5, m0, m3
%if ARCH_X86_64
SWAP m5, m8
%endif
mulps m7, m2, [ps_val1]
%if ARCH_X86_64
mulps m5, m8, [ps_val2]
%else
mulps m5, m5, [ps_val2]
%endif
addps m7, m7, m5
mulps m5, m6, [ps_val1]
subps m7, m7, m5
%if ARCH_X86_64
SWAP m5, m8
%else
subps m5, m0, m3
%endif
subps m5, m5, m6
addps m5, m5, m2
shufps m6, m4, m3, 0xe4
subps m6, m6, m2
mulps m6, m6, [ps_val3]
addps m4, m4, m1
mulps m4, m4, [ps_val4]
shufps m1, m1, m0, 0xe4
addps m1, m1, m2
mulps m1, m1, [ps_val5]
mulps m3, m3, [ps_val6]
mulps m0, m0, [ps_val7]
addps m0, m0, m3
xorps m2, m1, [ps_p1p1m1m1]
subps m2, m2, m4
addps m2, m2, m0
addps m3, m4, m0
subps m3, m3, m6
xorps m3, m3, [ps_p1p1m1m1]
shufps m0, m0, m4, 0xe4
subps m0, m0, m1
addps m0, m0, m6
BUILDINVHIGHLOW m4, m2, m3
shufps m3, m3, m2, 0x4e
; we have tmp = {SwAPLH(m0), SwAPLH(m7), m3, m4, m5}
BUTTERF m0, m1, 0
BUTTERF m7, m2, 16
BUTTERF m3, m6, 32
BUTTERF m4, m1, 48
mulps m5, m5, [ps_cosh + 64]
PSHUFD m1, m5, 0xe1
xorps m5, m5, [ps_p1m1p1m1]
addps m5, m5, m1
; permutates:
; m0 0 1 2 3 => 2 6 10 14 m1
; m7 4 5 6 7 => 3 7 11 15 m2
; m3 8 9 10 11 => 17 13 9 5 m3
; m4 12 13 14 15 => 16 12 8 4 m5
; m5 16 17 xx xx => 0 1 xx xx m0
unpckhps m1, m0, m7
unpckhps m6, m3, m4
movhlps m2, m6, m1
movlhps m1, m1, m6
unpcklps m5, m5, m4
unpcklps m3, m3, m7
movhlps m4, m3, m5
movlhps m5, m5, m3
SWAP m4, m3
; permutation done
PSHUFD m6, m2, 0xb1
movss m4, [bufq + 4*68]
movss m7, [bufq + 4*64]
unpcklps m7, m7, m4
mulps m6, m6, [winq + 16*4]
addps m6, m6, m7
movss [outq + 64*SBLIMIT], m6
shufps m6, m6, m6, 0xb1
movss [outq + 68*SBLIMIT], m6
mulps m6, m3, [winq + 4*4]
LOAD m4, m7, bufq + 4*16, 16
addps m6, m6, m4
STORE m6, m7, outq + 16*SBLIMIT, 4*SBLIMIT
shufps m4, m0, m3, 0xb5
mulps m4, m4, [winq + 8*4]
LOAD m7, m6, bufq + 4*32, 16
addps m4, m4, m7
STORE m4, m6, outq + 32*SBLIMIT, 4*SBLIMIT
shufps m3, m3, m2, 0xb1
mulps m3, m3, [winq + 12*4]
LOAD m7, m6, bufq + 4*48, 16
addps m3, m3, m7
STORE m3, m7, outq + 48*SBLIMIT, 4*SBLIMIT
mulps m2, m2, [winq]
LOAD m6, m7, bufq, 16
addps m2, m2, m6
STORE m2, m7, outq, 4*SBLIMIT
mulps m4, m1, [winq + 20*4]
STORE m4, m7, bufq, 16
mulps m3, m5, [winq + 24*4]
STORE m3, m7, bufq + 4*16, 16
shufps m0, m0, m5, 0xb0
mulps m0, m0, [winq + 28*4]
STORE m0, m7, bufq + 4*32, 16
shufps m5, m5, m1, 0xb1
mulps m5, m5, [winq + 32*4]
STORE m5, m7, bufq + 4*48, 16
shufps m1, m1, m1, 0xb1
mulps m1, m1, [winq + 36*4]
movss [bufq + 4*64], m1
shufps m1, m1, 0xb1
movss [bufq + 4*68], m1
RET
%endmacro
INIT_XMM sse
DEFINE_IMDCT
INIT_XMM sse2
DEFINE_IMDCT
INIT_XMM sse3
DEFINE_IMDCT
INIT_XMM ssse3
DEFINE_IMDCT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEFINE_IMDCT
%endif
INIT_XMM sse
%if ARCH_X86_64
%define SPILL SWAP
%define UNSPILL SWAP
%define SPILLED(x) m %+ x
%else
%define SPILLED(x) [tmpq+(x-8)*16 + 32*4]
%macro SPILL 2 ; xmm#, mempos
movaps SPILLED(%2), m%1
%endmacro
%macro UNSPILL 2
movaps m%1, SPILLED(%2)
%endmacro
%endif
%macro DEFINE_FOUR_IMDCT 0
cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp
movlps m0, [inq+64]
movhps m0, [inq+64 + 72]
movlps m3, [inq+64 + 2*72]
movhps m3, [inq+64 + 3*72]
shufps m5, m0, m3, 0xdd
shufps m0, m0, m3, 0x88
mova m1, [inq+48]
movu m6, [inq+48 + 72]
mova m7, [inq+48 + 2*72]
movu m3, [inq+48 + 3*72]
TRANSPOSE4x4PS 1, 6, 7, 3, 4
addps m4, m6, m7
mova [tmpq+4*28], m4
addps m7, m3
addps m6, m1
addps m3, m0
addps m0, m5
addps m0, m7
addps m7, m6
mova [tmpq+4*12], m7
SPILL 3, 12
mova m4, [inq+32]
movu m5, [inq+32 + 72]
mova m2, [inq+32 + 2*72]
movu m7, [inq+32 + 3*72]
TRANSPOSE4x4PS 4, 5, 2, 7, 3
addps m1, m7
SPILL 1, 11
addps m3, m5, m2
SPILL 3, 13
addps m7, m2
addps m5, m4
addps m6, m7
mova [tmpq], m6
addps m7, m5
mova [tmpq+4*16], m7
mova m2, [inq+16]
movu m7, [inq+16 + 72]
mova m1, [inq+16 + 2*72]
movu m6, [inq+16 + 3*72]
TRANSPOSE4x4PS 2, 7, 1, 6, 3
addps m4, m6
addps m6, m1
addps m1, m7
addps m7, m2
addps m5, m6
SPILL 5, 15
addps m6, m7
mulps m6, [costabs + 16*2]
mova [tmpq+4*8], m6
SPILL 1, 10
SPILL 0, 14
mova m1, [inq]
movu m6, [inq + 72]
mova m3, [inq + 2*72]
movu m5, [inq + 3*72]
TRANSPOSE4x4PS 1, 6, 3, 5, 0
addps m2, m5
addps m5, m3
addps m7, m5
addps m3, m6
addps m6, m1
SPILL 7, 8
addps m5, m6
SPILL 6, 9
addps m6, m4, SPILLED(12)
subps m6, m2
UNSPILL 7, 11
SPILL 5, 11
subps m5, m1, m7
mulps m7, [costabs + 16*5]
addps m7, m1
mulps m0, m6, [costabs + 16*6]
addps m0, m5
mova [tmpq+4*24], m0
addps m6, m5
mova [tmpq+4*4], m6
addps m6, m4, m2
mulps m6, [costabs + 16*1]
subps m4, SPILLED(12)
mulps m4, [costabs + 16*8]
addps m2, SPILLED(12)
mulps m2, [costabs + 16*3]
subps m5, m7, m6
subps m5, m2
addps m6, m7
addps m6, m4
addps m7, m2
subps m7, m4
mova [tmpq+4*20], m7
mova m2, [tmpq+4*28]
mova [tmpq+4*28], m5
UNSPILL 7, 13
subps m5, m7, m2
mulps m5, [costabs + 16*7]
UNSPILL 1, 10
mulps m1, [costabs + 16*2]
addps m4, m3, m2
mulps m4, [costabs + 16*4]
addps m2, m7
addps m7, m3
mulps m7, [costabs]
subps m3, m2
mulps m3, [costabs + 16*2]
addps m2, m7, m5
addps m2, m1
SPILL 2, 10
addps m7, m4
subps m7, m1
SPILL 7, 12
subps m5, m4
subps m5, m1
UNSPILL 0, 14
SPILL 5, 13
addps m1, m0, SPILLED(15)
subps m1, SPILLED(8)
mova m4, [costabs + 16*5]
mulps m4, [tmpq]
UNSPILL 2, 9
addps m4, m2
subps m2, [tmpq]
mulps m5, m1, [costabs + 16*6]
addps m5, m2
SPILL 5, 9
addps m2, m1
SPILL 2, 14
UNSPILL 5, 15
subps m7, m5, m0
addps m5, SPILLED(8)
mulps m5, [costabs + 16*1]
mulps m7, [costabs + 16*8]
addps m0, SPILLED(8)
mulps m0, [costabs + 16*3]
subps m2, m4, m5
subps m2, m0
SPILL 2, 15
addps m5, m4
addps m5, m7
addps m4, m0
subps m4, m7
SPILL 4, 8
mova m7, [tmpq+4*16]
mova m2, [tmpq+4*12]
addps m0, m7, m2
subps m0, SPILLED(11)
mulps m0, [costabs + 16*2]
addps m4, m7, SPILLED(11)
mulps m4, [costabs]
subps m7, m2
mulps m7, [costabs + 16*7]
addps m2, SPILLED(11)
mulps m2, [costabs + 16*4]
addps m1, m7, [tmpq+4*8]
addps m1, m4
addps m4, m2
subps m4, [tmpq+4*8]
SPILL 4, 11
subps m7, m2
subps m7, [tmpq+4*8]
addps m4, m6, SPILLED(10)
subps m6, SPILLED(10)
addps m2, m5, m1
mulps m2, [costabs + 16*9]
subps m5, m1
mulps m5, [costabs + 16*17]
subps m1, m4, m2
addps m4, m2
mulps m2, m1, [winq+4*36]
addps m2, [bufq+4*36]
mova [outq+1152], m2
mulps m1, [winq+4*32]
addps m1, [bufq+4*32]
mova [outq+1024], m1
mulps m1, m4, [winq+4*116]
mova [bufq+4*36], m1
mulps m4, [winq+4*112]
mova [bufq+4*32], m4
addps m2, m6, m5
subps m6, m5
mulps m1, m6, [winq+4*68]
addps m1, [bufq+4*68]
mova [outq+2176], m1
mulps m6, [winq]
addps m6, [bufq]
mova [outq], m6
mulps m1, m2, [winq+4*148]
mova [bufq+4*68], m1
mulps m2, [winq+4*80]
mova [bufq], m2
addps m5, m3, [tmpq+4*24]
mova m2, [tmpq+4*24]
subps m2, m3
mova m1, SPILLED(9)
subps m1, m0
mulps m1, [costabs + 16*10]
addps m0, SPILLED(9)
mulps m0, [costabs + 16*16]
addps m6, m5, m1
subps m5, m1
mulps m3, m5, [winq+4*40]
addps m3, [bufq+4*40]
mova [outq+1280], m3
mulps m5, [winq+4*28]
addps m5, [bufq+4*28]
mova [outq+896], m5
mulps m1, m6, [winq+4*120]
mova [bufq+4*40], m1
mulps m6, [winq+4*108]
mova [bufq+4*28], m6
addps m1, m2, m0
subps m2, m0
mulps m5, m2, [winq+4*64]
addps m5, [bufq+4*64]
mova [outq+2048], m5
mulps m2, [winq+4*4]
addps m2, [bufq+4*4]
mova [outq+128], m2
mulps m0, m1, [winq+4*144]
mova [bufq+4*64], m0
mulps m1, [winq+4*84]
mova [bufq+4*4], m1
mova m1, [tmpq+4*28]
mova m5, m1
addps m1, SPILLED(13)
subps m5, SPILLED(13)
UNSPILL 3, 15
addps m2, m7, m3
mulps m2, [costabs + 16*11]
subps m3, m7
mulps m3, [costabs + 16*15]
addps m0, m2, m1
subps m1, m2
SWAP m0, m2
mulps m6, m1, [winq+4*44]
addps m6, [bufq+4*44]
mova [outq+1408], m6
mulps m1, [winq+4*24]
addps m1, [bufq+4*24]
mova [outq+768], m1
mulps m0, m2, [winq+4*124]
mova [bufq+4*44], m0
mulps m2, [winq+4*104]
mova [bufq+4*24], m2
addps m0, m5, m3
subps m5, m3
mulps m1, m5, [winq+4*60]
addps m1, [bufq+4*60]
mova [outq+1920], m1
mulps m5, [winq+4*8]
addps m5, [bufq+4*8]
mova [outq+256], m5
mulps m1, m0, [winq+4*140]
mova [bufq+4*60], m1
mulps m0, [winq+4*88]
mova [bufq+4*8], m0
mova m1, [tmpq+4*20]
addps m1, SPILLED(12)
mova m2, [tmpq+4*20]
subps m2, SPILLED(12)
UNSPILL 7, 8
subps m0, m7, SPILLED(11)
addps m7, SPILLED(11)
mulps m4, m7, [costabs + 16*12]
mulps m0, [costabs + 16*14]
addps m5, m1, m4
subps m1, m4
mulps m7, m1, [winq+4*48]
addps m7, [bufq+4*48]
mova [outq+1536], m7
mulps m1, [winq+4*20]
addps m1, [bufq+4*20]
mova [outq+640], m1
mulps m1, m5, [winq+4*128]
mova [bufq+4*48], m1
mulps m5, [winq+4*100]
mova [bufq+4*20], m5
addps m6, m2, m0
subps m2, m0
mulps m1, m2, [winq+4*56]
addps m1, [bufq+4*56]
mova [outq+1792], m1
mulps m2, [winq+4*12]
addps m2, [bufq+4*12]
mova [outq+384], m2
mulps m0, m6, [winq+4*136]
mova [bufq+4*56], m0
mulps m6, [winq+4*92]
mova [bufq+4*12], m6
UNSPILL 0, 14
mulps m0, [costabs + 16*13]
mova m3, [tmpq+4*4]
addps m2, m0, m3
subps m3, m0
mulps m0, m3, [winq+4*52]
addps m0, [bufq+4*52]
mova [outq+1664], m0
mulps m3, [winq+4*16]
addps m3, [bufq+4*16]
mova [outq+512], m3
mulps m0, m2, [winq+4*132]
mova [bufq+4*52], m0
mulps m2, [winq+4*96]
mova [bufq+4*16], m2
RET
%endmacro
INIT_XMM sse
DEFINE_FOUR_IMDCT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEFINE_FOUR_IMDCT
%endif

View File

@@ -0,0 +1,159 @@
/*
* MMX optimized LPC DSP utils
* Copyright (c) 2007 Loren Merritt
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/lpc.h"
DECLARE_ASM_CONST(16, double, pd_1)[2] = { 1.0, 1.0 };
DECLARE_ASM_CONST(16, double, pd_2)[2] = { 2.0, 2.0 };
#if HAVE_SSE2_INLINE
static void lpc_apply_welch_window_sse2(const int32_t *data, int len,
double *w_data)
{
double c = 2.0 / (len-1.0);
int n2 = len>>1;
x86_reg i = -n2*sizeof(int32_t);
x86_reg j = n2*sizeof(int32_t);
__asm__ volatile(
"movsd %4, %%xmm7 \n\t"
"movapd "MANGLE(pd_1)", %%xmm6 \n\t"
"movapd "MANGLE(pd_2)", %%xmm5 \n\t"
"movlhps %%xmm7, %%xmm7 \n\t"
"subpd %%xmm5, %%xmm7 \n\t"
"addsd %%xmm6, %%xmm7 \n\t"
"test $1, %5 \n\t"
"jz 2f \n\t"
#define WELCH(MOVPD, offset)\
"1: \n\t"\
"movapd %%xmm7, %%xmm1 \n\t"\
"mulpd %%xmm1, %%xmm1 \n\t"\
"movapd %%xmm6, %%xmm0 \n\t"\
"subpd %%xmm1, %%xmm0 \n\t"\
"pshufd $0x4e, %%xmm0, %%xmm1 \n\t"\
"cvtpi2pd (%3,%0), %%xmm2 \n\t"\
"cvtpi2pd "#offset"*4(%3,%1), %%xmm3 \n\t"\
"mulpd %%xmm0, %%xmm2 \n\t"\
"mulpd %%xmm1, %%xmm3 \n\t"\
"movapd %%xmm2, (%2,%0,2) \n\t"\
MOVPD" %%xmm3, "#offset"*8(%2,%1,2) \n\t"\
"subpd %%xmm5, %%xmm7 \n\t"\
"sub $8, %1 \n\t"\
"add $8, %0 \n\t"\
"jl 1b \n\t"\
WELCH("movupd", -1)
"jmp 3f \n\t"
"2: \n\t"
WELCH("movapd", -2)
"3: \n\t"
:"+&r"(i), "+&r"(j)
:"r"(w_data+n2), "r"(data+n2), "m"(c), "r"(len)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm5", "%xmm6", "%xmm7")
);
#undef WELCH
}
static void lpc_compute_autocorr_sse2(const double *data, int len, int lag,
double *autoc)
{
int j;
if((x86_reg)data & 15)
data++;
for(j=0; j<lag; j+=2){
x86_reg i = -len*sizeof(double);
if(j == lag-2) {
__asm__ volatile(
"movsd "MANGLE(pd_1)", %%xmm0 \n\t"
"movsd "MANGLE(pd_1)", %%xmm1 \n\t"
"movsd "MANGLE(pd_1)", %%xmm2 \n\t"
"1: \n\t"
"movapd (%2,%0), %%xmm3 \n\t"
"movupd -8(%3,%0), %%xmm4 \n\t"
"movapd (%3,%0), %%xmm5 \n\t"
"mulpd %%xmm3, %%xmm4 \n\t"
"mulpd %%xmm3, %%xmm5 \n\t"
"mulpd -16(%3,%0), %%xmm3 \n\t"
"addpd %%xmm4, %%xmm1 \n\t"
"addpd %%xmm5, %%xmm0 \n\t"
"addpd %%xmm3, %%xmm2 \n\t"
"add $16, %0 \n\t"
"jl 1b \n\t"
"movhlps %%xmm0, %%xmm3 \n\t"
"movhlps %%xmm1, %%xmm4 \n\t"
"movhlps %%xmm2, %%xmm5 \n\t"
"addsd %%xmm3, %%xmm0 \n\t"
"addsd %%xmm4, %%xmm1 \n\t"
"addsd %%xmm5, %%xmm2 \n\t"
"movsd %%xmm0, (%1) \n\t"
"movsd %%xmm1, 8(%1) \n\t"
"movsd %%xmm2, 16(%1) \n\t"
:"+&r"(i)
:"r"(autoc+j), "r"(data+len), "r"(data+len-j)
:"memory"
);
} else {
__asm__ volatile(
"movsd "MANGLE(pd_1)", %%xmm0 \n\t"
"movsd "MANGLE(pd_1)", %%xmm1 \n\t"
"1: \n\t"
"movapd (%3,%0), %%xmm3 \n\t"
"movupd -8(%4,%0), %%xmm4 \n\t"
"mulpd %%xmm3, %%xmm4 \n\t"
"mulpd (%4,%0), %%xmm3 \n\t"
"addpd %%xmm4, %%xmm1 \n\t"
"addpd %%xmm3, %%xmm0 \n\t"
"add $16, %0 \n\t"
"jl 1b \n\t"
"movhlps %%xmm0, %%xmm3 \n\t"
"movhlps %%xmm1, %%xmm4 \n\t"
"addsd %%xmm3, %%xmm0 \n\t"
"addsd %%xmm4, %%xmm1 \n\t"
"movsd %%xmm0, %1 \n\t"
"movsd %%xmm1, %2 \n\t"
:"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
:"r"(data+len), "r"(data+len-j)
);
}
}
}
#endif /* HAVE_SSE2_INLINE */
av_cold void ff_lpc_init_x86(LPCContext *c)
{
#if HAVE_SSE2_INLINE
int cpu_flags = av_get_cpu_flags();
if (HAVE_SSE2_INLINE && cpu_flags & (AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_SSE2SLOW)) {
c->lpc_apply_welch_window = lpc_apply_welch_window_sse2;
c->lpc_compute_autocorr = lpc_compute_autocorr_sse2;
}
#endif /* HAVE_SSE2_INLINE */
}

View File

@@ -0,0 +1,128 @@
/*
* simple math operations
* Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_MATHOPS_H
#define AVCODEC_X86_MATHOPS_H
#include "config.h"
#include "libavutil/common.h"
#if HAVE_INLINE_ASM
#if ARCH_X86_32
#define MULL MULL
static av_always_inline av_const int MULL(int a, int b, unsigned shift)
{
int rt, dummy;
__asm__ (
"imull %3 \n\t"
"shrdl %4, %%edx, %%eax \n\t"
:"=a"(rt), "=d"(dummy)
:"a"(a), "rm"(b), "ci"((uint8_t)shift)
);
return rt;
}
#define MULH MULH
static av_always_inline av_const int MULH(int a, int b)
{
int rt, dummy;
__asm__ (
"imull %3"
:"=d"(rt), "=a"(dummy)
:"a"(a), "rm"(b)
);
return rt;
}
#define MUL64 MUL64
static av_always_inline av_const int64_t MUL64(int a, int b)
{
int64_t rt;
__asm__ (
"imull %2"
:"=A"(rt)
:"a"(a), "rm"(b)
);
return rt;
}
#endif /* ARCH_X86_32 */
#if HAVE_I686
/* median of 3 */
#define mid_pred mid_pred
static inline av_const int mid_pred(int a, int b, int c)
{
int i=b;
__asm__ (
"cmp %2, %1 \n\t"
"cmovg %1, %0 \n\t"
"cmovg %2, %1 \n\t"
"cmp %3, %1 \n\t"
"cmovl %3, %1 \n\t"
"cmp %1, %0 \n\t"
"cmovg %1, %0 \n\t"
:"+&r"(i), "+&r"(a)
:"r"(b), "r"(c)
);
return i;
}
#define COPY3_IF_LT(x, y, a, b, c, d)\
__asm__ volatile(\
"cmpl %0, %3 \n\t"\
"cmovl %3, %0 \n\t"\
"cmovl %4, %1 \n\t"\
"cmovl %5, %2 \n\t"\
: "+&r" (x), "+&r" (a), "+r" (c)\
: "r" (y), "r" (b), "r" (d)\
);
#endif /* HAVE_I686 */
#define MASK_ABS(mask, level) \
__asm__ ("cltd \n\t" \
"xorl %1, %0 \n\t" \
"subl %1, %0 \n\t" \
: "+a"(level), "=&d"(mask))
// avoid +32 for shift optimization (gcc should do that ...)
#define NEG_SSR32 NEG_SSR32
static inline int32_t NEG_SSR32( int32_t a, int8_t s){
__asm__ ("sarl %1, %0\n\t"
: "+r" (a)
: "ic" ((uint8_t)(-s))
);
return a;
}
#define NEG_USR32 NEG_USR32
static inline uint32_t NEG_USR32(uint32_t a, int8_t s){
__asm__ ("shrl %1, %0\n\t"
: "+r" (a)
: "ic" ((uint8_t)(-s))
);
return a;
}
#endif /* HAVE_INLINE_ASM */
#endif /* AVCODEC_X86_MATHOPS_H */

View File

@@ -0,0 +1,186 @@
/*
* MLP DSP functions x86-optimized
* Copyright (c) 2009 Ramiro Polla
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/mlpdsp.h"
#include "libavcodec/mlp.h"
#if HAVE_7REGS && HAVE_INLINE_ASM
extern char ff_mlp_firorder_8;
extern char ff_mlp_firorder_7;
extern char ff_mlp_firorder_6;
extern char ff_mlp_firorder_5;
extern char ff_mlp_firorder_4;
extern char ff_mlp_firorder_3;
extern char ff_mlp_firorder_2;
extern char ff_mlp_firorder_1;
extern char ff_mlp_firorder_0;
extern char ff_mlp_iirorder_4;
extern char ff_mlp_iirorder_3;
extern char ff_mlp_iirorder_2;
extern char ff_mlp_iirorder_1;
extern char ff_mlp_iirorder_0;
static const void *firtable[9] = { &ff_mlp_firorder_0, &ff_mlp_firorder_1,
&ff_mlp_firorder_2, &ff_mlp_firorder_3,
&ff_mlp_firorder_4, &ff_mlp_firorder_5,
&ff_mlp_firorder_6, &ff_mlp_firorder_7,
&ff_mlp_firorder_8 };
static const void *iirtable[5] = { &ff_mlp_iirorder_0, &ff_mlp_iirorder_1,
&ff_mlp_iirorder_2, &ff_mlp_iirorder_3,
&ff_mlp_iirorder_4 };
#if ARCH_X86_64
#define MLPMUL(label, offset, offs, offc) \
LABEL_MANGLE(label)": \n\t" \
"movslq "offset"+"offs"(%0), %%rax\n\t" \
"movslq "offset"+"offc"(%1), %%rdx\n\t" \
"imul %%rdx, %%rax\n\t" \
"add %%rax, %%rsi\n\t"
#define FIRMULREG(label, offset, firc)\
LABEL_MANGLE(label)": \n\t" \
"movslq "#offset"(%0), %%rax\n\t" \
"imul %"#firc", %%rax\n\t" \
"add %%rax, %%rsi\n\t"
#define CLEAR_ACCUM \
"xor %%rsi, %%rsi\n\t"
#define SHIFT_ACCUM \
"shr %%cl, %%rsi\n\t"
#define ACCUM "%%rdx"
#define RESULT "%%rsi"
#define RESULT32 "%%esi"
#else /* if ARCH_X86_32 */
#define MLPMUL(label, offset, offs, offc) \
LABEL_MANGLE(label)": \n\t" \
"mov "offset"+"offs"(%0), %%eax\n\t" \
"imull "offset"+"offc"(%1) \n\t" \
"add %%eax , %%esi\n\t" \
"adc %%edx , %%ecx\n\t"
#define FIRMULREG(label, offset, firc) \
MLPMUL(label, #offset, "0", "0")
#define CLEAR_ACCUM \
"xor %%esi, %%esi\n\t" \
"xor %%ecx, %%ecx\n\t"
#define SHIFT_ACCUM \
"mov %%ecx, %%edx\n\t" \
"mov %%esi, %%eax\n\t" \
"movzbl %7 , %%ecx\n\t" \
"shrd %%cl, %%edx, %%eax\n\t" \
#define ACCUM "%%edx"
#define RESULT "%%eax"
#define RESULT32 "%%eax"
#endif /* !ARCH_X86_64 */
#define BINC AV_STRINGIFY(4* MAX_CHANNELS)
#define IOFFS AV_STRINGIFY(4*(MAX_FIR_ORDER + MAX_BLOCKSIZE))
#define IOFFC AV_STRINGIFY(4* MAX_FIR_ORDER)
#define FIRMUL(label, offset) MLPMUL(label, #offset, "0", "0")
#define IIRMUL(label, offset) MLPMUL(label, #offset, IOFFS, IOFFC)
static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff,
int firorder, int iirorder,
unsigned int filter_shift, int32_t mask,
int blocksize, int32_t *sample_buffer)
{
const void *firjump = firtable[firorder];
const void *iirjump = iirtable[iirorder];
blocksize = -blocksize;
__asm__ volatile(
"1: \n\t"
CLEAR_ACCUM
"jmp *%5 \n\t"
FIRMUL (ff_mlp_firorder_8, 0x1c )
FIRMUL (ff_mlp_firorder_7, 0x18 )
FIRMUL (ff_mlp_firorder_6, 0x14 )
FIRMUL (ff_mlp_firorder_5, 0x10 )
FIRMUL (ff_mlp_firorder_4, 0x0c )
FIRMULREG(ff_mlp_firorder_3, 0x08,10)
FIRMULREG(ff_mlp_firorder_2, 0x04, 9)
FIRMULREG(ff_mlp_firorder_1, 0x00, 8)
LABEL_MANGLE(ff_mlp_firorder_0)":\n\t"
"jmp *%6 \n\t"
IIRMUL (ff_mlp_iirorder_4, 0x0c )
IIRMUL (ff_mlp_iirorder_3, 0x08 )
IIRMUL (ff_mlp_iirorder_2, 0x04 )
IIRMUL (ff_mlp_iirorder_1, 0x00 )
LABEL_MANGLE(ff_mlp_iirorder_0)":\n\t"
SHIFT_ACCUM
"mov "RESULT" ,"ACCUM" \n\t"
"add (%2) ,"RESULT" \n\t"
"and %4 ,"RESULT" \n\t"
"sub $4 , %0 \n\t"
"mov "RESULT32", (%0) \n\t"
"mov "RESULT32", (%2) \n\t"
"add $"BINC" , %2 \n\t"
"sub "ACCUM" ,"RESULT" \n\t"
"mov "RESULT32","IOFFS"(%0) \n\t"
"incl %3 \n\t"
"js 1b \n\t"
: /* 0*/"+r"(state),
/* 1*/"+r"(coeff),
/* 2*/"+r"(sample_buffer),
#if ARCH_X86_64
/* 3*/"+r"(blocksize)
: /* 4*/"r"((x86_reg)mask), /* 5*/"r"(firjump),
/* 6*/"r"(iirjump) , /* 7*/"c"(filter_shift)
, /* 8*/"r"((int64_t)coeff[0])
, /* 9*/"r"((int64_t)coeff[1])
, /*10*/"r"((int64_t)coeff[2])
: "rax", "rdx", "rsi"
#else /* ARCH_X86_32 */
/* 3*/"+m"(blocksize)
: /* 4*/"m"( mask), /* 5*/"m"(firjump),
/* 6*/"m"(iirjump) , /* 7*/"m"(filter_shift)
: "eax", "edx", "esi", "ecx"
#endif /* !ARCH_X86_64 */
);
}
#endif /* HAVE_7REGS && HAVE_INLINE_ASM */
av_cold void ff_mlpdsp_init_x86(MLPDSPContext *c)
{
#if HAVE_7REGS && HAVE_INLINE_ASM
int cpu_flags = av_get_cpu_flags();
if (INLINE_MMX(cpu_flags))
c->mlp_filter_channel = mlp_filter_channel_x86;
#endif
}

View File

@@ -0,0 +1,474 @@
/*
* MMX optimized motion estimation
* Copyright (c) 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer
*
* mostly by Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/avassert.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "dsputil_x86.h"
#if HAVE_INLINE_ASM
DECLARE_ASM_CONST(8, uint64_t, round_tab)[3]={
0x0000000000000000ULL,
0x0001000100010001ULL,
0x0002000200020002ULL,
};
DECLARE_ASM_CONST(8, uint64_t, bone)= 0x0101010101010101LL;
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
x86_reg len= -(x86_reg)stride*h;
__asm__ volatile(
".p2align 4 \n\t"
"1: \n\t"
"movq (%1, %%"REG_a"), %%mm0 \n\t"
"movq (%2, %%"REG_a"), %%mm2 \n\t"
"movq (%2, %%"REG_a"), %%mm4 \n\t"
"add %3, %%"REG_a" \n\t"
"psubusb %%mm0, %%mm2 \n\t"
"psubusb %%mm4, %%mm0 \n\t"
"movq (%1, %%"REG_a"), %%mm1 \n\t"
"movq (%2, %%"REG_a"), %%mm3 \n\t"
"movq (%2, %%"REG_a"), %%mm5 \n\t"
"psubusb %%mm1, %%mm3 \n\t"
"psubusb %%mm5, %%mm1 \n\t"
"por %%mm2, %%mm0 \n\t"
"por %%mm1, %%mm3 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm3, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"punpckhbw %%mm7, %%mm2 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"paddw %%mm3, %%mm2 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"add %3, %%"REG_a" \n\t"
" js 1b \n\t"
: "+a" (len)
: "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg)stride)
);
}
static inline void sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2,
int stride, int h)
{
__asm__ volatile(
".p2align 4 \n\t"
"1: \n\t"
"movq (%1), %%mm0 \n\t"
"movq (%1, %3), %%mm1 \n\t"
"psadbw (%2), %%mm0 \n\t"
"psadbw (%2, %3), %%mm1 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"paddw %%mm1, %%mm6 \n\t"
"lea (%1,%3,2), %1 \n\t"
"lea (%2,%3,2), %2 \n\t"
"sub $2, %0 \n\t"
" jg 1b \n\t"
: "+r" (h), "+r" (blk1), "+r" (blk2)
: "r" ((x86_reg)stride)
);
}
static int sad16_sse2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)
{
int ret;
__asm__ volatile(
"pxor %%xmm2, %%xmm2 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movdqu (%1), %%xmm0 \n\t"
"movdqu (%1, %4), %%xmm1 \n\t"
"psadbw (%2), %%xmm0 \n\t"
"psadbw (%2, %4), %%xmm1 \n\t"
"paddw %%xmm0, %%xmm2 \n\t"
"paddw %%xmm1, %%xmm2 \n\t"
"lea (%1,%4,2), %1 \n\t"
"lea (%2,%4,2), %2 \n\t"
"sub $2, %0 \n\t"
" jg 1b \n\t"
"movhlps %%xmm2, %%xmm0 \n\t"
"paddw %%xmm0, %%xmm2 \n\t"
"movd %%xmm2, %3 \n\t"
: "+r" (h), "+r" (blk1), "+r" (blk2), "=r"(ret)
: "r" ((x86_reg)stride)
);
return ret;
}
static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2,
int stride, int h)
{
__asm__ volatile(
".p2align 4 \n\t"
"1: \n\t"
"movq (%1), %%mm0 \n\t"
"movq (%1, %3), %%mm1 \n\t"
"pavgb 1(%1), %%mm0 \n\t"
"pavgb 1(%1, %3), %%mm1 \n\t"
"psadbw (%2), %%mm0 \n\t"
"psadbw (%2, %3), %%mm1 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"paddw %%mm1, %%mm6 \n\t"
"lea (%1,%3,2), %1 \n\t"
"lea (%2,%3,2), %2 \n\t"
"sub $2, %0 \n\t"
" jg 1b \n\t"
: "+r" (h), "+r" (blk1), "+r" (blk2)
: "r" ((x86_reg)stride)
);
}
static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2,
int stride, int h)
{
__asm__ volatile(
"movq (%1), %%mm0 \n\t"
"add %3, %1 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%1), %%mm1 \n\t"
"movq (%1, %3), %%mm2 \n\t"
"pavgb %%mm1, %%mm0 \n\t"
"pavgb %%mm2, %%mm1 \n\t"
"psadbw (%2), %%mm0 \n\t"
"psadbw (%2, %3), %%mm1 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"paddw %%mm1, %%mm6 \n\t"
"movq %%mm2, %%mm0 \n\t"
"lea (%1,%3,2), %1 \n\t"
"lea (%2,%3,2), %2 \n\t"
"sub $2, %0 \n\t"
" jg 1b \n\t"
: "+r" (h), "+r" (blk1), "+r" (blk2)
: "r" ((x86_reg)stride)
);
}
static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2,
int stride, int h)
{
__asm__ volatile(
"movq "MANGLE(bone)", %%mm5 \n\t"
"movq (%1), %%mm0 \n\t"
"pavgb 1(%1), %%mm0 \n\t"
"add %3, %1 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%1), %%mm1 \n\t"
"movq (%1,%3), %%mm2 \n\t"
"pavgb 1(%1), %%mm1 \n\t"
"pavgb 1(%1,%3), %%mm2 \n\t"
"psubusb %%mm5, %%mm1 \n\t"
"pavgb %%mm1, %%mm0 \n\t"
"pavgb %%mm2, %%mm1 \n\t"
"psadbw (%2), %%mm0 \n\t"
"psadbw (%2,%3), %%mm1 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"paddw %%mm1, %%mm6 \n\t"
"movq %%mm2, %%mm0 \n\t"
"lea (%1,%3,2), %1 \n\t"
"lea (%2,%3,2), %2 \n\t"
"sub $2, %0 \n\t"
" jg 1b \n\t"
: "+r" (h), "+r" (blk1), "+r" (blk2)
: "r" ((x86_reg)stride)
);
}
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
{
x86_reg len= -(x86_reg)stride*h;
__asm__ volatile(
".p2align 4 \n\t"
"1: \n\t"
"movq (%1, %%"REG_a"), %%mm0 \n\t"
"movq (%2, %%"REG_a"), %%mm1 \n\t"
"movq (%1, %%"REG_a"), %%mm2 \n\t"
"movq (%2, %%"REG_a"), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpckhbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"paddw %%mm0, %%mm1 \n\t"
"paddw %%mm2, %%mm3 \n\t"
"movq (%3, %%"REG_a"), %%mm4 \n\t"
"movq (%3, %%"REG_a"), %%mm2 \n\t"
"paddw %%mm5, %%mm1 \n\t"
"paddw %%mm5, %%mm3 \n\t"
"psrlw $1, %%mm1 \n\t"
"psrlw $1, %%mm3 \n\t"
"packuswb %%mm3, %%mm1 \n\t"
"psubusb %%mm1, %%mm4 \n\t"
"psubusb %%mm2, %%mm1 \n\t"
"por %%mm4, %%mm1 \n\t"
"movq %%mm1, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"add %4, %%"REG_a" \n\t"
" js 1b \n\t"
: "+a" (len)
: "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((x86_reg)stride)
);
}
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
x86_reg len= -(x86_reg)stride*h;
__asm__ volatile(
"movq (%1, %%"REG_a"), %%mm0 \n\t"
"movq 1(%1, %%"REG_a"), %%mm2 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"paddw %%mm3, %%mm1 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%2, %%"REG_a"), %%mm2 \n\t"
"movq 1(%2, %%"REG_a"), %%mm4 \n\t"
"movq %%mm2, %%mm3 \n\t"
"movq %%mm4, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpckhbw %%mm7, %%mm5 \n\t"
"paddw %%mm4, %%mm2 \n\t"
"paddw %%mm5, %%mm3 \n\t"
"movq 16+"MANGLE(round_tab)", %%mm5 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"paddw %%mm3, %%mm1 \n\t"
"paddw %%mm5, %%mm0 \n\t"
"paddw %%mm5, %%mm1 \n\t"
"movq (%3, %%"REG_a"), %%mm4 \n\t"
"movq (%3, %%"REG_a"), %%mm5 \n\t"
"psrlw $2, %%mm0 \n\t"
"psrlw $2, %%mm1 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"psubusb %%mm0, %%mm4 \n\t"
"psubusb %%mm5, %%mm0 \n\t"
"por %%mm4, %%mm0 \n\t"
"movq %%mm0, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpckhbw %%mm7, %%mm4 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"paddw %%mm4, %%mm6 \n\t"
"movq %%mm2, %%mm0 \n\t"
"movq %%mm3, %%mm1 \n\t"
"add %4, %%"REG_a" \n\t"
" js 1b \n\t"
: "+a" (len)
: "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" ((x86_reg)stride)
);
}
static inline int sum_mmx(void)
{
int ret;
__asm__ volatile(
"movq %%mm6, %%mm0 \n\t"
"psrlq $32, %%mm6 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"movq %%mm6, %%mm0 \n\t"
"psrlq $16, %%mm6 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"movd %%mm6, %0 \n\t"
: "=r" (ret)
);
return ret&0xFFFF;
}
static inline int sum_mmxext(void)
{
int ret;
__asm__ volatile(
"movd %%mm6, %0 \n\t"
: "=r" (ret)
);
return ret;
}
static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
sad8_2_mmx(blk1, blk1+1, blk2, stride, h);
}
static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
sad8_2_mmx(blk1, blk1+stride, blk2, stride, h);
}
#define PIX_SAD(suf)\
static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
av_assert2(h==8);\
__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
"pxor %%mm6, %%mm6 \n\t":);\
\
sad8_1_ ## suf(blk1, blk2, stride, 8);\
\
return sum_ ## suf();\
}\
static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
av_assert2(h==8);\
__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
"pxor %%mm6, %%mm6 \n\t"\
"movq %0, %%mm5 \n\t"\
:: "m"(round_tab[1]) \
);\
\
sad8_x2a_ ## suf(blk1, blk2, stride, 8);\
\
return sum_ ## suf();\
}\
\
static int sad8_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
av_assert2(h==8);\
__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
"pxor %%mm6, %%mm6 \n\t"\
"movq %0, %%mm5 \n\t"\
:: "m"(round_tab[1]) \
);\
\
sad8_y2a_ ## suf(blk1, blk2, stride, 8);\
\
return sum_ ## suf();\
}\
\
static int sad8_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
av_assert2(h==8);\
__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
"pxor %%mm6, %%mm6 \n\t"\
::);\
\
sad8_4_ ## suf(blk1, blk2, stride, 8);\
\
return sum_ ## suf();\
}\
\
static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
"pxor %%mm6, %%mm6 \n\t":);\
\
sad8_1_ ## suf(blk1 , blk2 , stride, h);\
sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\
\
return sum_ ## suf();\
}\
static int sad16_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
"pxor %%mm6, %%mm6 \n\t"\
"movq %0, %%mm5 \n\t"\
:: "m"(round_tab[1]) \
);\
\
sad8_x2a_ ## suf(blk1 , blk2 , stride, h);\
sad8_x2a_ ## suf(blk1+8, blk2+8, stride, h);\
\
return sum_ ## suf();\
}\
static int sad16_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
"pxor %%mm6, %%mm6 \n\t"\
"movq %0, %%mm5 \n\t"\
:: "m"(round_tab[1]) \
);\
\
sad8_y2a_ ## suf(blk1 , blk2 , stride, h);\
sad8_y2a_ ## suf(blk1+8, blk2+8, stride, h);\
\
return sum_ ## suf();\
}\
static int sad16_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
"pxor %%mm6, %%mm6 \n\t"\
::);\
\
sad8_4_ ## suf(blk1 , blk2 , stride, h);\
sad8_4_ ## suf(blk1+8, blk2+8, stride, h);\
\
return sum_ ## suf();\
}\
PIX_SAD(mmx)
PIX_SAD(mmxext)
#endif /* HAVE_INLINE_ASM */
av_cold void ff_dsputil_init_pix_mmx(DSPContext *c, AVCodecContext *avctx)
{
#if HAVE_INLINE_ASM
int cpu_flags = av_get_cpu_flags();
if (INLINE_MMX(cpu_flags)) {
c->pix_abs[0][0] = sad16_mmx;
c->pix_abs[0][1] = sad16_x2_mmx;
c->pix_abs[0][2] = sad16_y2_mmx;
c->pix_abs[0][3] = sad16_xy2_mmx;
c->pix_abs[1][0] = sad8_mmx;
c->pix_abs[1][1] = sad8_x2_mmx;
c->pix_abs[1][2] = sad8_y2_mmx;
c->pix_abs[1][3] = sad8_xy2_mmx;
c->sad[0]= sad16_mmx;
c->sad[1]= sad8_mmx;
}
if (INLINE_MMXEXT(cpu_flags)) {
c->pix_abs[0][0] = sad16_mmxext;
c->pix_abs[1][0] = sad8_mmxext;
c->sad[0] = sad16_mmxext;
c->sad[1] = sad8_mmxext;
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
c->pix_abs[0][1] = sad16_x2_mmxext;
c->pix_abs[0][2] = sad16_y2_mmxext;
c->pix_abs[0][3] = sad16_xy2_mmxext;
c->pix_abs[1][1] = sad8_x2_mmxext;
c->pix_abs[1][2] = sad8_y2_mmxext;
c->pix_abs[1][3] = sad8_xy2_mmxext;
}
}
if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_3DNOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
c->sad[0]= sad16_sse2;
}
#endif /* HAVE_INLINE_ASM */
}

View File

@@ -0,0 +1,560 @@
;******************************************************************************
;* mpeg4 qpel
;* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
cextern pb_1
cextern pw_3
cextern pw_15
cextern pw_16
cextern pw_20
SECTION_TEXT
; put_no_rnd_pixels8_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
%macro PUT_NO_RND_PIXELS8_L2 0
cglobal put_no_rnd_pixels8_l2, 6,6
movsxdifnidn r4, r4d
movsxdifnidn r3, r3d
pcmpeqb m6, m6
test r5d, 1
je .loop
mova m0, [r1]
mova m1, [r2]
add r1, r4
add r2, 8
pxor m0, m6
pxor m1, m6
PAVGB m0, m1
pxor m0, m6
mova [r0], m0
add r0, r3
dec r5d
.loop:
mova m0, [r1]
add r1, r4
mova m1, [r1]
add r1, r4
mova m2, [r2]
mova m3, [r2+8]
pxor m0, m6
pxor m1, m6
pxor m2, m6
pxor m3, m6
PAVGB m0, m2
PAVGB m1, m3
pxor m0, m6
pxor m1, m6
mova [r0], m0
add r0, r3
mova [r0], m1
add r0, r3
mova m0, [r1]
add r1, r4
mova m1, [r1]
add r1, r4
mova m2, [r2+16]
mova m3, [r2+24]
pxor m0, m6
pxor m1, m6
pxor m2, m6
pxor m3, m6
PAVGB m0, m2
PAVGB m1, m3
pxor m0, m6
pxor m1, m6
mova [r0], m0
add r0, r3
mova [r0], m1
add r0, r3
add r2, 32
sub r5d, 4
jne .loop
REP_RET
%endmacro
INIT_MMX mmxext
PUT_NO_RND_PIXELS8_L2
; put_no_rnd_pixels16_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
%macro PUT_NO_RND_PIXELS16_l2 0
cglobal put_no_rnd_pixels16_l2, 6,6
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
pcmpeqb m6, m6
test r5d, 1
je .loop
mova m0, [r1]
mova m1, [r1+8]
mova m2, [r2]
mova m3, [r2+8]
pxor m0, m6
pxor m1, m6
pxor m2, m6
pxor m3, m6
PAVGB m0, m2
PAVGB m1, m3
pxor m0, m6
pxor m1, m6
add r1, r4
add r2, 16
mova [r0], m0
mova [r0+8], m1
add r0, r3
dec r5d
.loop:
mova m0, [r1]
mova m1, [r1+8]
add r1, r4
mova m2, [r2]
mova m3, [r2+8]
pxor m0, m6
pxor m1, m6
pxor m2, m6
pxor m3, m6
PAVGB m0, m2
PAVGB m1, m3
pxor m0, m6
pxor m1, m6
mova [r0], m0
mova [r0+8], m1
add r0, r3
mova m0, [r1]
mova m1, [r1+8]
add r1, r4
mova m2, [r2+16]
mova m3, [r2+24]
pxor m0, m6
pxor m1, m6
pxor m2, m6
pxor m3, m6
PAVGB m0, m2
PAVGB m1, m3
pxor m0, m6
pxor m1, m6
mova [r0], m0
mova [r0+8], m1
add r0, r3
add r2, 32
sub r5d, 2
jne .loop
REP_RET
%endmacro
INIT_MMX mmxext
PUT_NO_RND_PIXELS16_l2
INIT_MMX 3dnow
PUT_NO_RND_PIXELS16_l2
%macro MPEG4_QPEL16_H_LOWPASS 1
cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 16
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
pxor m7, m7
.loop:
mova m0, [r1]
mova m1, m0
mova m2, m0
punpcklbw m0, m7
punpckhbw m1, m7
pshufw m5, m0, 0x90
pshufw m6, m0, 0x41
mova m3, m2
mova m4, m2
psllq m2, 8
psllq m3, 16
psllq m4, 24
punpckhbw m2, m7
punpckhbw m3, m7
punpckhbw m4, m7
paddw m5, m3
paddw m6, m2
paddw m5, m5
psubw m6, m5
pshufw m5, m0, 6
pmullw m6, [pw_3]
paddw m0, m4
paddw m5, m1
pmullw m0, [pw_20]
psubw m0, m5
paddw m6, [PW_ROUND]
paddw m0, m6
psraw m0, 5
mova [rsp+8], m0
mova m0, [r1+5]
mova m5, m0
mova m6, m0
psrlq m0, 8
psrlq m5, 16
punpcklbw m0, m7
punpcklbw m5, m7
paddw m2, m0
paddw m3, m5
paddw m2, m2
psubw m3, m2
mova m2, m6
psrlq m6, 24
punpcklbw m2, m7
punpcklbw m6, m7
pmullw m3, [pw_3]
paddw m1, m2
paddw m4, m6
pmullw m1, [pw_20]
psubw m3, m4
paddw m1, [PW_ROUND]
paddw m3, m1
psraw m3, 5
mova m1, [rsp+8]
packuswb m1, m3
OP_MOV [r0], m1, m4
mova m1, [r1+9]
mova m4, m1
mova m3, m1
psrlq m1, 8
psrlq m4, 16
punpcklbw m1, m7
punpcklbw m4, m7
paddw m5, m1
paddw m0, m4
paddw m5, m5
psubw m0, m5
mova m5, m3
psrlq m3, 24
pmullw m0, [pw_3]
punpcklbw m3, m7
paddw m2, m3
psubw m0, m2
mova m2, m5
punpcklbw m2, m7
punpckhbw m5, m7
paddw m6, m2
pmullw m6, [pw_20]
paddw m0, [PW_ROUND]
paddw m0, m6
psraw m0, 5
paddw m3, m5
pshufw m6, m5, 0xf9
paddw m6, m4
pshufw m4, m5, 0xbe
pshufw m5, m5, 0x6f
paddw m4, m1
paddw m5, m2
paddw m6, m6
psubw m4, m6
pmullw m3, [pw_20]
pmullw m4, [pw_3]
psubw m3, m5
paddw m4, [PW_ROUND]
paddw m4, m3
psraw m4, 5
packuswb m0, m4
OP_MOV [r0+8], m0, m4
add r1, r3
add r0, r2
dec r4d
jne .loop
REP_RET
%endmacro
%macro PUT_OP 2-3
mova %1, %2
%endmacro
%macro AVG_OP 2-3
mova %3, %1
pavgb %2, %3
mova %1, %2
%endmacro
INIT_MMX mmxext
%define PW_ROUND pw_16
%define OP_MOV PUT_OP
MPEG4_QPEL16_H_LOWPASS put
%define PW_ROUND pw_16
%define OP_MOV AVG_OP
MPEG4_QPEL16_H_LOWPASS avg
%define PW_ROUND pw_15
%define OP_MOV PUT_OP
MPEG4_QPEL16_H_LOWPASS put_no_rnd
%macro MPEG4_QPEL8_H_LOWPASS 1
cglobal %1_mpeg4_qpel8_h_lowpass, 5, 5, 0, 8
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
pxor m7, m7
.loop:
mova m0, [r1]
mova m1, m0
mova m2, m0
punpcklbw m0, m7
punpckhbw m1, m7
pshufw m5, m0, 0x90
pshufw m6, m0, 0x41
mova m3, m2
mova m4, m2
psllq m2, 8
psllq m3, 16
psllq m4, 24
punpckhbw m2, m7
punpckhbw m3, m7
punpckhbw m4, m7
paddw m5, m3
paddw m6, m2
paddw m5, m5
psubw m6, m5
pshufw m5, m0, 0x6
pmullw m6, [pw_3]
paddw m0, m4
paddw m5, m1
pmullw m0, [pw_20]
psubw m0, m5
paddw m6, [PW_ROUND]
paddw m0, m6
psraw m0, 5
movh m5, [r1+5]
punpcklbw m5, m7
pshufw m6, m5, 0xf9
paddw m1, m5
paddw m2, m6
pshufw m6, m5, 0xbe
pshufw m5, m5, 0x6f
paddw m3, m6
paddw m4, m5
paddw m2, m2
psubw m3, m2
pmullw m1, [pw_20]
pmullw m3, [pw_3]
psubw m3, m4
paddw m1, [PW_ROUND]
paddw m3, m1
psraw m3, 5
packuswb m0, m3
OP_MOV [r0], m0, m4
add r1, r3
add r0, r2
dec r4d
jne .loop
REP_RET
%endmacro
INIT_MMX mmxext
%define PW_ROUND pw_16
%define OP_MOV PUT_OP
MPEG4_QPEL8_H_LOWPASS put
%define PW_ROUND pw_16
%define OP_MOV AVG_OP
MPEG4_QPEL8_H_LOWPASS avg
%define PW_ROUND pw_15
%define OP_MOV PUT_OP
MPEG4_QPEL8_H_LOWPASS put_no_rnd
%macro QPEL_V_LOW 5
paddw m0, m1
mova m4, [pw_20]
pmullw m4, m0
mova m0, %4
mova m5, %1
paddw m5, m0
psubw m4, m5
mova m5, %2
mova m6, %3
paddw m5, m3
paddw m6, m2
paddw m6, m6
psubw m5, m6
pmullw m5, [pw_3]
paddw m4, [PW_ROUND]
paddw m5, m4
psraw m5, 5
packuswb m5, m5
OP_MOV %5, m5, m7
SWAP 0,1,2,3
%endmacro
%macro MPEG4_QPEL16_V_LOWPASS 1
cglobal %1_mpeg4_qpel16_v_lowpass, 4, 6, 0, 544
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
mov r4d, 17
mov r5, rsp
pxor m7, m7
.looph:
mova m0, [r1]
mova m1, [r1]
mova m2, [r1+8]
mova m3, [r1+8]
punpcklbw m0, m7
punpckhbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
mova [r5], m0
mova [r5+0x88], m1
mova [r5+0x110], m2
mova [r5+0x198], m3
add r5, 8
add r1, r3
dec r4d
jne .looph
; NOTE: r1 CHANGES VALUES: r1 -> 4 - 14*dstStride
mov r4d, 4
mov r1, 4
neg r2
lea r1, [r1+r2*8]
lea r1, [r1+r2*4]
lea r1, [r1+r2*2]
neg r2
mov r5, rsp
.loopv:
pxor m7, m7
mova m0, [r5+ 0x0]
mova m1, [r5+ 0x8]
mova m2, [r5+0x10]
mova m3, [r5+0x18]
QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0]
QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2]
lea r0, [r0+r2*2]
QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0]
QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2]
lea r0, [r0+r2*2]
QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0]
QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x48], [r0+r2]
lea r0, [r0+r2*2]
QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x50], [r0]
QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x58], [r0+r2]
lea r0, [r0+r2*2]
QPEL_V_LOW [r5+0x28], [r5+0x30], [r5+0x38], [r5+0x60], [r0]
QPEL_V_LOW [r5+0x30], [r5+0x38], [r5+0x40], [r5+0x68], [r0+r2]
lea r0, [r0+r2*2]
QPEL_V_LOW [r5+0x38], [r5+0x40], [r5+0x48], [r5+0x70], [r0]
QPEL_V_LOW [r5+0x40], [r5+0x48], [r5+0x50], [r5+0x78], [r0+r2]
lea r0, [r0+r2*2]
QPEL_V_LOW [r5+0x48], [r5+0x50], [r5+0x58], [r5+0x80], [r0]
QPEL_V_LOW [r5+0x50], [r5+0x58], [r5+0x60], [r5+0x80], [r0+r2]
lea r0, [r0+r2*2]
QPEL_V_LOW [r5+0x58], [r5+0x60], [r5+0x68], [r5+0x78], [r0]
QPEL_V_LOW [r5+0x60], [r5+0x68], [r5+0x70], [r5+0x70], [r0+r2]
add r5, 0x88
add r0, r1
dec r4d
jne .loopv
REP_RET
%endmacro
%macro PUT_OPH 2-3
movh %1, %2
%endmacro
%macro AVG_OPH 2-3
movh %3, %1
pavgb %2, %3
movh %1, %2
%endmacro
INIT_MMX mmxext
%define PW_ROUND pw_16
%define OP_MOV PUT_OPH
MPEG4_QPEL16_V_LOWPASS put
%define PW_ROUND pw_16
%define OP_MOV AVG_OPH
MPEG4_QPEL16_V_LOWPASS avg
%define PW_ROUND pw_15
%define OP_MOV PUT_OPH
MPEG4_QPEL16_V_LOWPASS put_no_rnd
%macro MPEG4_QPEL8_V_LOWPASS 1
cglobal %1_mpeg4_qpel8_v_lowpass, 4, 6, 0, 288
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
mov r4d, 9
mov r5, rsp
pxor m7, m7
.looph:
mova m0, [r1]
mova m1, [r1]
punpcklbw m0, m7
punpckhbw m1, m7
mova [r5], m0
mova [r5+0x48], m1
add r5, 8
add r1, r3
dec r4d
jne .looph
; NOTE: r1 CHANGES VALUES: r1 -> 4 - 6*dstStride
mov r4d, 2
mov r1, 4
neg r2
lea r1, [r1+r2*4]
lea r1, [r1+r2*2]
neg r2
mov r5, rsp
.loopv:
pxor m7, m7
mova m0, [r5+ 0x0]
mova m1, [r5+ 0x8]
mova m2, [r5+0x10]
mova m3, [r5+0x18]
QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0]
QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2]
lea r0, [r0+r2*2]
QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0]
QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2]
lea r0, [r0+r2*2]
QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0]
QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x40], [r0+r2]
lea r0, [r0+r2*2]
QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x38], [r0]
QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x30], [r0+r2]
add r5, 0x48
add r0, r1
dec r4d
jne .loopv
REP_RET
%endmacro
INIT_MMX mmxext
%define PW_ROUND pw_16
%define OP_MOV PUT_OPH
MPEG4_QPEL8_V_LOWPASS put
%define PW_ROUND pw_16
%define OP_MOV AVG_OPH
MPEG4_QPEL8_V_LOWPASS avg
%define PW_ROUND pw_15
%define OP_MOV PUT_OPH
MPEG4_QPEL8_V_LOWPASS put_no_rnd

View File

@@ -0,0 +1,277 @@
/*
* MMX optimized MP3 decoding functions
* Copyright (c) 2010 Vitor Sessak
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/internal.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/mpegaudiodsp.h"
#define DECL(CPU)\
static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\
void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win);
DECL(sse)
DECL(sse2)
DECL(sse3)
DECL(ssse3)
DECL(avx)
void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
float *tmpbuf);
void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
float *tmpbuf);
DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
#if HAVE_SSE2_INLINE
#define MACS(rt, ra, rb) rt+=(ra)*(rb)
#define MLSS(rt, ra, rb) rt-=(ra)*(rb)
#define SUM8(op, sum, w, p) \
{ \
op(sum, (w)[0 * 64], (p)[0 * 64]); \
op(sum, (w)[1 * 64], (p)[1 * 64]); \
op(sum, (w)[2 * 64], (p)[2 * 64]); \
op(sum, (w)[3 * 64], (p)[3 * 64]); \
op(sum, (w)[4 * 64], (p)[4 * 64]); \
op(sum, (w)[5 * 64], (p)[5 * 64]); \
op(sum, (w)[6 * 64], (p)[6 * 64]); \
op(sum, (w)[7 * 64], (p)[7 * 64]); \
}
static void apply_window(const float *buf, const float *win1,
const float *win2, float *sum1, float *sum2, int len)
{
x86_reg count = - 4*len;
const float *win1a = win1+len;
const float *win2a = win2+len;
const float *bufa = buf+len;
float *sum1a = sum1+len;
float *sum2a = sum2+len;
#define MULT(a, b) \
"movaps " #a "(%1,%0), %%xmm1 \n\t" \
"movaps " #a "(%3,%0), %%xmm2 \n\t" \
"mulps %%xmm2, %%xmm1 \n\t" \
"subps %%xmm1, %%xmm0 \n\t" \
"mulps " #b "(%2,%0), %%xmm2 \n\t" \
"subps %%xmm2, %%xmm4 \n\t" \
__asm__ volatile(
"1: \n\t"
"xorps %%xmm0, %%xmm0 \n\t"
"xorps %%xmm4, %%xmm4 \n\t"
MULT( 0, 0)
MULT( 256, 64)
MULT( 512, 128)
MULT( 768, 192)
MULT(1024, 256)
MULT(1280, 320)
MULT(1536, 384)
MULT(1792, 448)
"movaps %%xmm0, (%4,%0) \n\t"
"movaps %%xmm4, (%5,%0) \n\t"
"add $16, %0 \n\t"
"jl 1b \n\t"
:"+&r"(count)
:"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
);
#undef MULT
}
static void apply_window_mp3(float *in, float *win, int *unused, float *out,
int incr)
{
LOCAL_ALIGNED_16(float, suma, [17]);
LOCAL_ALIGNED_16(float, sumb, [17]);
LOCAL_ALIGNED_16(float, sumc, [17]);
LOCAL_ALIGNED_16(float, sumd, [17]);
float sum;
/* copy to avoid wrap */
__asm__ volatile(
"movaps 0(%0), %%xmm0 \n\t" \
"movaps 16(%0), %%xmm1 \n\t" \
"movaps 32(%0), %%xmm2 \n\t" \
"movaps 48(%0), %%xmm3 \n\t" \
"movaps %%xmm0, 0(%1) \n\t" \
"movaps %%xmm1, 16(%1) \n\t" \
"movaps %%xmm2, 32(%1) \n\t" \
"movaps %%xmm3, 48(%1) \n\t" \
"movaps 64(%0), %%xmm0 \n\t" \
"movaps 80(%0), %%xmm1 \n\t" \
"movaps 96(%0), %%xmm2 \n\t" \
"movaps 112(%0), %%xmm3 \n\t" \
"movaps %%xmm0, 64(%1) \n\t" \
"movaps %%xmm1, 80(%1) \n\t" \
"movaps %%xmm2, 96(%1) \n\t" \
"movaps %%xmm3, 112(%1) \n\t"
::"r"(in), "r"(in+512)
:"memory"
);
apply_window(in + 16, win , win + 512, suma, sumc, 16);
apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
SUM8(MACS, suma[0], win + 32, in + 48);
sumc[ 0] = 0;
sumb[16] = 0;
sumd[16] = 0;
#define SUMS(suma, sumb, sumc, sumd, out1, out2) \
"movups " #sumd "(%4), %%xmm0 \n\t" \
"shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
"subps " #suma "(%1), %%xmm0 \n\t" \
"movaps %%xmm0," #out1 "(%0) \n\t" \
\
"movups " #sumc "(%3), %%xmm0 \n\t" \
"shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
"addps " #sumb "(%2), %%xmm0 \n\t" \
"movaps %%xmm0," #out2 "(%0) \n\t"
if (incr == 1) {
__asm__ volatile(
SUMS( 0, 48, 4, 52, 0, 112)
SUMS(16, 32, 20, 36, 16, 96)
SUMS(32, 16, 36, 20, 32, 80)
SUMS(48, 0, 52, 4, 48, 64)
:"+&r"(out)
:"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
:"memory"
);
out += 16*incr;
} else {
int j;
float *out2 = out + 32 * incr;
out[0 ] = -suma[ 0];
out += incr;
out2 -= incr;
for(j=1;j<16;j++) {
*out = -suma[ j] + sumd[16-j];
*out2 = sumb[16-j] + sumc[ j];
out += incr;
out2 -= incr;
}
}
sum = 0;
SUM8(MLSS, sum, win + 16 + 32, in + 32);
*out = sum;
}
#endif /* HAVE_SSE2_INLINE */
#if HAVE_YASM
#define DECL_IMDCT_BLOCKS(CPU1, CPU2) \
static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in, \
int count, int switch_point, int block_type) \
{ \
int align_end = count - (count & 3); \
int j; \
for (j = 0; j < align_end; j+= 4) { \
LOCAL_ALIGNED_16(float, tmpbuf, [1024]); \
float *win = mdct_win_sse[switch_point && j < 4][block_type]; \
/* apply window & overlap with previous buffer */ \
\
/* select window */ \
ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf); \
in += 4*18; \
buf += 4*18; \
out += 4; \
} \
for (; j < count; j++) { \
/* apply window & overlap with previous buffer */ \
\
/* select window */ \
int win_idx = (switch_point && j < 2) ? 0 : block_type; \
float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))]; \
\
ff_imdct36_float_ ## CPU1(out, buf, in, win); \
\
in += 18; \
buf++; \
out++; \
} \
}
#if HAVE_SSE
DECL_IMDCT_BLOCKS(sse,sse)
DECL_IMDCT_BLOCKS(sse2,sse)
DECL_IMDCT_BLOCKS(sse3,sse)
DECL_IMDCT_BLOCKS(ssse3,sse)
#endif
#if HAVE_AVX_EXTERNAL
DECL_IMDCT_BLOCKS(avx,avx)
#endif
#endif /* HAVE_YASM */
av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
{
int cpu_flags = av_get_cpu_flags();
int i, j;
for (j = 0; j < 4; j++) {
for (i = 0; i < 40; i ++) {
mdct_win_sse[0][j][4*i ] = ff_mdct_win_float[j ][i];
mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j ][i];
mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
mdct_win_sse[1][j][4*i ] = ff_mdct_win_float[0 ][i];
mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4 ][i];
mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j ][i];
mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
}
}
#if HAVE_SSE2_INLINE
if (cpu_flags & AV_CPU_FLAG_SSE2) {
s->apply_window_float = apply_window_mp3;
}
#endif /* HAVE_SSE2_INLINE */
#if HAVE_YASM
if (EXTERNAL_SSE(cpu_flags)) {
s->imdct36_blocks_float = imdct36_blocks_sse;
}
if (EXTERNAL_SSE2(cpu_flags)) {
s->imdct36_blocks_float = imdct36_blocks_sse2;
}
if (EXTERNAL_SSE3(cpu_flags)) {
s->imdct36_blocks_float = imdct36_blocks_sse3;
}
if (EXTERNAL_SSSE3(cpu_flags)) {
s->imdct36_blocks_float = imdct36_blocks_ssse3;
}
if (EXTERNAL_AVX(cpu_flags)) {
s->imdct36_blocks_float = imdct36_blocks_avx;
}
#endif /* HAVE_YASM */
}

View File

@@ -0,0 +1,577 @@
/*
* Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru>
* h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/mpegvideo.h"
#include "dsputil_x86.h"
#if HAVE_MMX_INLINE
static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
int16_t *block, int n, int qscale)
{
x86_reg level, qmul, qadd, nCoeffs;
qmul = qscale << 1;
av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
if (!s->h263_aic) {
if (n < 4)
level = block[0] * s->y_dc_scale;
else
level = block[0] * s->c_dc_scale;
qadd = (qscale - 1) | 1;
}else{
qadd = 0;
level= block[0];
}
if(s->ac_pred)
nCoeffs=63;
else
nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
__asm__ volatile(
"movd %1, %%mm6 \n\t" //qmul
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"movd %2, %%mm5 \n\t" //qadd
"pxor %%mm7, %%mm7 \n\t"
"packssdw %%mm5, %%mm5 \n\t"
"packssdw %%mm5, %%mm5 \n\t"
"psubw %%mm5, %%mm7 \n\t"
"pxor %%mm4, %%mm4 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%0, %3), %%mm0 \n\t"
"movq 8(%0, %3), %%mm1 \n\t"
"pmullw %%mm6, %%mm0 \n\t"
"pmullw %%mm6, %%mm1 \n\t"
"movq (%0, %3), %%mm2 \n\t"
"movq 8(%0, %3), %%mm3 \n\t"
"pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
"pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"paddw %%mm7, %%mm0 \n\t"
"paddw %%mm7, %%mm1 \n\t"
"pxor %%mm0, %%mm2 \n\t"
"pxor %%mm1, %%mm3 \n\t"
"pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0
"pandn %%mm2, %%mm0 \n\t"
"pandn %%mm3, %%mm1 \n\t"
"movq %%mm0, (%0, %3) \n\t"
"movq %%mm1, 8(%0, %3) \n\t"
"add $16, %3 \n\t"
"jng 1b \n\t"
::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
: "memory"
);
block[0]= level;
}
static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
int16_t *block, int n, int qscale)
{
x86_reg qmul, qadd, nCoeffs;
qmul = qscale << 1;
qadd = (qscale - 1) | 1;
av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
__asm__ volatile(
"movd %1, %%mm6 \n\t" //qmul
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"movd %2, %%mm5 \n\t" //qadd
"pxor %%mm7, %%mm7 \n\t"
"packssdw %%mm5, %%mm5 \n\t"
"packssdw %%mm5, %%mm5 \n\t"
"psubw %%mm5, %%mm7 \n\t"
"pxor %%mm4, %%mm4 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%0, %3), %%mm0 \n\t"
"movq 8(%0, %3), %%mm1 \n\t"
"pmullw %%mm6, %%mm0 \n\t"
"pmullw %%mm6, %%mm1 \n\t"
"movq (%0, %3), %%mm2 \n\t"
"movq 8(%0, %3), %%mm3 \n\t"
"pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
"pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"paddw %%mm7, %%mm0 \n\t"
"paddw %%mm7, %%mm1 \n\t"
"pxor %%mm0, %%mm2 \n\t"
"pxor %%mm1, %%mm3 \n\t"
"pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0
"pandn %%mm2, %%mm0 \n\t"
"pandn %%mm3, %%mm1 \n\t"
"movq %%mm0, (%0, %3) \n\t"
"movq %%mm1, 8(%0, %3) \n\t"
"add $16, %3 \n\t"
"jng 1b \n\t"
::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
: "memory"
);
}
static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
int16_t *block, int n, int qscale)
{
x86_reg nCoeffs;
const uint16_t *quant_matrix;
int block0;
av_assert2(s->block_last_index[n]>=0);
nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
if (n < 4)
block0 = block[0] * s->y_dc_scale;
else
block0 = block[0] * s->c_dc_scale;
/* XXX: only mpeg1 */
quant_matrix = s->intra_matrix;
__asm__ volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlw $15, %%mm7 \n\t"
"movd %2, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"mov %3, %%"REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%0, %%"REG_a"), %%mm0 \n\t"
"movq 8(%0, %%"REG_a"), %%mm1 \n\t"
"movq (%1, %%"REG_a"), %%mm4 \n\t"
"movq 8(%1, %%"REG_a"), %%mm5 \n\t"
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
"pxor %%mm2, %%mm2 \n\t"
"pxor %%mm3, %%mm3 \n\t"
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t" // abs(block[i])
"psubw %%mm3, %%mm1 \n\t" // abs(block[i])
"pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
"pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
"pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
"psraw $3, %%mm0 \n\t"
"psraw $3, %%mm1 \n\t"
"psubw %%mm7, %%mm0 \n\t"
"psubw %%mm7, %%mm1 \n\t"
"por %%mm7, %%mm0 \n\t"
"por %%mm7, %%mm1 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t"
"psubw %%mm3, %%mm1 \n\t"
"pandn %%mm0, %%mm4 \n\t"
"pandn %%mm1, %%mm5 \n\t"
"movq %%mm4, (%0, %%"REG_a") \n\t"
"movq %%mm5, 8(%0, %%"REG_a") \n\t"
"add $16, %%"REG_a" \n\t"
"js 1b \n\t"
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
: "%"REG_a, "memory"
);
block[0]= block0;
}
static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
int16_t *block, int n, int qscale)
{
x86_reg nCoeffs;
const uint16_t *quant_matrix;
av_assert2(s->block_last_index[n]>=0);
nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
quant_matrix = s->inter_matrix;
__asm__ volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlw $15, %%mm7 \n\t"
"movd %2, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"mov %3, %%"REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%0, %%"REG_a"), %%mm0 \n\t"
"movq 8(%0, %%"REG_a"), %%mm1 \n\t"
"movq (%1, %%"REG_a"), %%mm4 \n\t"
"movq 8(%1, %%"REG_a"), %%mm5 \n\t"
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
"pxor %%mm2, %%mm2 \n\t"
"pxor %%mm3, %%mm3 \n\t"
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t" // abs(block[i])
"psubw %%mm3, %%mm1 \n\t" // abs(block[i])
"paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2
"paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2
"paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1
"paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1
"pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q
"pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
"pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
"pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
"psraw $4, %%mm0 \n\t"
"psraw $4, %%mm1 \n\t"
"psubw %%mm7, %%mm0 \n\t"
"psubw %%mm7, %%mm1 \n\t"
"por %%mm7, %%mm0 \n\t"
"por %%mm7, %%mm1 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t"
"psubw %%mm3, %%mm1 \n\t"
"pandn %%mm0, %%mm4 \n\t"
"pandn %%mm1, %%mm5 \n\t"
"movq %%mm4, (%0, %%"REG_a") \n\t"
"movq %%mm5, 8(%0, %%"REG_a") \n\t"
"add $16, %%"REG_a" \n\t"
"js 1b \n\t"
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
: "%"REG_a, "memory"
);
}
static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
int16_t *block, int n, int qscale)
{
x86_reg nCoeffs;
const uint16_t *quant_matrix;
int block0;
av_assert2(s->block_last_index[n]>=0);
if(s->alternate_scan) nCoeffs= 63; //FIXME
else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
if (n < 4)
block0 = block[0] * s->y_dc_scale;
else
block0 = block[0] * s->c_dc_scale;
quant_matrix = s->intra_matrix;
__asm__ volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlw $15, %%mm7 \n\t"
"movd %2, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"mov %3, %%"REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%0, %%"REG_a"), %%mm0 \n\t"
"movq 8(%0, %%"REG_a"), %%mm1 \n\t"
"movq (%1, %%"REG_a"), %%mm4 \n\t"
"movq 8(%1, %%"REG_a"), %%mm5 \n\t"
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
"pxor %%mm2, %%mm2 \n\t"
"pxor %%mm3, %%mm3 \n\t"
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t" // abs(block[i])
"psubw %%mm3, %%mm1 \n\t" // abs(block[i])
"pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
"pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
"pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
"psraw $3, %%mm0 \n\t"
"psraw $3, %%mm1 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t"
"psubw %%mm3, %%mm1 \n\t"
"pandn %%mm0, %%mm4 \n\t"
"pandn %%mm1, %%mm5 \n\t"
"movq %%mm4, (%0, %%"REG_a") \n\t"
"movq %%mm5, 8(%0, %%"REG_a") \n\t"
"add $16, %%"REG_a" \n\t"
"jng 1b \n\t"
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
: "%"REG_a, "memory"
);
block[0]= block0;
//Note, we do not do mismatch control for intra as errors cannot accumulate
}
static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
int16_t *block, int n, int qscale)
{
x86_reg nCoeffs;
const uint16_t *quant_matrix;
av_assert2(s->block_last_index[n]>=0);
if(s->alternate_scan) nCoeffs= 63; //FIXME
else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
quant_matrix = s->inter_matrix;
__asm__ volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlq $48, %%mm7 \n\t"
"movd %2, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"mov %3, %%"REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%0, %%"REG_a"), %%mm0 \n\t"
"movq 8(%0, %%"REG_a"), %%mm1 \n\t"
"movq (%1, %%"REG_a"), %%mm4 \n\t"
"movq 8(%1, %%"REG_a"), %%mm5 \n\t"
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
"pxor %%mm2, %%mm2 \n\t"
"pxor %%mm3, %%mm3 \n\t"
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t" // abs(block[i])
"psubw %%mm3, %%mm1 \n\t" // abs(block[i])
"paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2
"paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2
"pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q
"paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q
"paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
"pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
"pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
"psrlw $4, %%mm0 \n\t"
"psrlw $4, %%mm1 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t"
"psubw %%mm3, %%mm1 \n\t"
"pandn %%mm0, %%mm4 \n\t"
"pandn %%mm1, %%mm5 \n\t"
"pxor %%mm4, %%mm7 \n\t"
"pxor %%mm5, %%mm7 \n\t"
"movq %%mm4, (%0, %%"REG_a") \n\t"
"movq %%mm5, 8(%0, %%"REG_a") \n\t"
"add $16, %%"REG_a" \n\t"
"jng 1b \n\t"
"movd 124(%0, %3), %%mm0 \n\t"
"movq %%mm7, %%mm6 \n\t"
"psrlq $32, %%mm7 \n\t"
"pxor %%mm6, %%mm7 \n\t"
"movq %%mm7, %%mm6 \n\t"
"psrlq $16, %%mm7 \n\t"
"pxor %%mm6, %%mm7 \n\t"
"pslld $31, %%mm7 \n\t"
"psrlq $15, %%mm7 \n\t"
"pxor %%mm7, %%mm0 \n\t"
"movd %%mm0, 124(%0, %3) \n\t"
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "r" (-2*nCoeffs)
: "%"REG_a, "memory"
);
}
static void denoise_dct_mmx(MpegEncContext *s, int16_t *block){
const int intra= s->mb_intra;
int *sum= s->dct_error_sum[intra];
uint16_t *offset= s->dct_offset[intra];
s->dct_count[intra]++;
__asm__ volatile(
"pxor %%mm7, %%mm7 \n\t"
"1: \n\t"
"pxor %%mm0, %%mm0 \n\t"
"pxor %%mm1, %%mm1 \n\t"
"movq (%0), %%mm2 \n\t"
"movq 8(%0), %%mm3 \n\t"
"pcmpgtw %%mm2, %%mm0 \n\t"
"pcmpgtw %%mm3, %%mm1 \n\t"
"pxor %%mm0, %%mm2 \n\t"
"pxor %%mm1, %%mm3 \n\t"
"psubw %%mm0, %%mm2 \n\t"
"psubw %%mm1, %%mm3 \n\t"
"movq %%mm2, %%mm4 \n\t"
"movq %%mm3, %%mm5 \n\t"
"psubusw (%2), %%mm2 \n\t"
"psubusw 8(%2), %%mm3 \n\t"
"pxor %%mm0, %%mm2 \n\t"
"pxor %%mm1, %%mm3 \n\t"
"psubw %%mm0, %%mm2 \n\t"
"psubw %%mm1, %%mm3 \n\t"
"movq %%mm2, (%0) \n\t"
"movq %%mm3, 8(%0) \n\t"
"movq %%mm4, %%mm2 \n\t"
"movq %%mm5, %%mm3 \n\t"
"punpcklwd %%mm7, %%mm4 \n\t"
"punpckhwd %%mm7, %%mm2 \n\t"
"punpcklwd %%mm7, %%mm5 \n\t"
"punpckhwd %%mm7, %%mm3 \n\t"
"paddd (%1), %%mm4 \n\t"
"paddd 8(%1), %%mm2 \n\t"
"paddd 16(%1), %%mm5 \n\t"
"paddd 24(%1), %%mm3 \n\t"
"movq %%mm4, (%1) \n\t"
"movq %%mm2, 8(%1) \n\t"
"movq %%mm5, 16(%1) \n\t"
"movq %%mm3, 24(%1) \n\t"
"add $16, %0 \n\t"
"add $32, %1 \n\t"
"add $16, %2 \n\t"
"cmp %3, %0 \n\t"
" jb 1b \n\t"
: "+r" (block), "+r" (sum), "+r" (offset)
: "r"(block+64)
);
}
static void denoise_dct_sse2(MpegEncContext *s, int16_t *block){
const int intra= s->mb_intra;
int *sum= s->dct_error_sum[intra];
uint16_t *offset= s->dct_offset[intra];
s->dct_count[intra]++;
__asm__ volatile(
"pxor %%xmm7, %%xmm7 \n\t"
"1: \n\t"
"pxor %%xmm0, %%xmm0 \n\t"
"pxor %%xmm1, %%xmm1 \n\t"
"movdqa (%0), %%xmm2 \n\t"
"movdqa 16(%0), %%xmm3 \n\t"
"pcmpgtw %%xmm2, %%xmm0 \n\t"
"pcmpgtw %%xmm3, %%xmm1 \n\t"
"pxor %%xmm0, %%xmm2 \n\t"
"pxor %%xmm1, %%xmm3 \n\t"
"psubw %%xmm0, %%xmm2 \n\t"
"psubw %%xmm1, %%xmm3 \n\t"
"movdqa %%xmm2, %%xmm4 \n\t"
"movdqa %%xmm3, %%xmm5 \n\t"
"psubusw (%2), %%xmm2 \n\t"
"psubusw 16(%2), %%xmm3 \n\t"
"pxor %%xmm0, %%xmm2 \n\t"
"pxor %%xmm1, %%xmm3 \n\t"
"psubw %%xmm0, %%xmm2 \n\t"
"psubw %%xmm1, %%xmm3 \n\t"
"movdqa %%xmm2, (%0) \n\t"
"movdqa %%xmm3, 16(%0) \n\t"
"movdqa %%xmm4, %%xmm6 \n\t"
"movdqa %%xmm5, %%xmm0 \n\t"
"punpcklwd %%xmm7, %%xmm4 \n\t"
"punpckhwd %%xmm7, %%xmm6 \n\t"
"punpcklwd %%xmm7, %%xmm5 \n\t"
"punpckhwd %%xmm7, %%xmm0 \n\t"
"paddd (%1), %%xmm4 \n\t"
"paddd 16(%1), %%xmm6 \n\t"
"paddd 32(%1), %%xmm5 \n\t"
"paddd 48(%1), %%xmm0 \n\t"
"movdqa %%xmm4, (%1) \n\t"
"movdqa %%xmm6, 16(%1) \n\t"
"movdqa %%xmm5, 32(%1) \n\t"
"movdqa %%xmm0, 48(%1) \n\t"
"add $32, %0 \n\t"
"add $64, %1 \n\t"
"add $32, %2 \n\t"
"cmp %3, %0 \n\t"
" jb 1b \n\t"
: "+r" (block), "+r" (sum), "+r" (offset)
: "r"(block+64)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7")
);
}
#endif /* HAVE_MMX_INLINE */
av_cold void ff_MPV_common_init_x86(MpegEncContext *s)
{
#if HAVE_MMX_INLINE
int cpu_flags = av_get_cpu_flags();
if (INLINE_MMX(cpu_flags)) {
s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx;
s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx;
s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx;
s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx;
if(!(s->flags & CODEC_FLAG_BITEXACT))
s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
s->denoise_dct = denoise_dct_mmx;
}
if (INLINE_SSE2(cpu_flags)) {
s->denoise_dct = denoise_dct_sse2;
}
#endif /* HAVE_MMX_INLINE */
}

View File

@@ -0,0 +1,107 @@
/*
* The simplest mpeg encoder (well, it was the simplest!)
* Copyright (c) 2000,2001 Fabrice Bellard
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/dct.h"
#include "libavcodec/mpegvideo.h"
#include "dsputil_x86.h"
extern uint16_t ff_inv_zigzag_direct16[64];
#if HAVE_MMX_INLINE
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_SSE2 0
#define COMPILE_TEMPLATE_SSSE3 0
#define RENAME(a) a ## _MMX
#define RENAMEl(a) a ## _mmx
#include "mpegvideoenc_template.c"
#endif /* HAVE_MMX_INLINE */
#if HAVE_MMXEXT_INLINE
#undef COMPILE_TEMPLATE_SSSE3
#undef COMPILE_TEMPLATE_SSE2
#undef COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 1
#define COMPILE_TEMPLATE_SSE2 0
#define COMPILE_TEMPLATE_SSSE3 0
#undef RENAME
#undef RENAMEl
#define RENAME(a) a ## _MMXEXT
#define RENAMEl(a) a ## _mmxext
#include "mpegvideoenc_template.c"
#endif /* HAVE_MMXEXT_INLINE */
#if HAVE_SSE2_INLINE
#undef COMPILE_TEMPLATE_MMXEXT
#undef COMPILE_TEMPLATE_SSE2
#undef COMPILE_TEMPLATE_SSSE3
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_SSE2 1
#define COMPILE_TEMPLATE_SSSE3 0
#undef RENAME
#undef RENAMEl
#define RENAME(a) a ## _SSE2
#define RENAMEl(a) a ## _sse2
#include "mpegvideoenc_template.c"
#endif /* HAVE_SSE2_INLINE */
#if HAVE_SSSE3_INLINE
#undef COMPILE_TEMPLATE_MMXEXT
#undef COMPILE_TEMPLATE_SSE2
#undef COMPILE_TEMPLATE_SSSE3
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_SSE2 1
#define COMPILE_TEMPLATE_SSSE3 1
#undef RENAME
#undef RENAMEl
#define RENAME(a) a ## _SSSE3
#define RENAMEl(a) a ## _sse2
#include "mpegvideoenc_template.c"
#endif /* HAVE_SSSE3_INLINE */
av_cold void ff_dct_encode_init_x86(MpegEncContext *s)
{
const int dct_algo = s->avctx->dct_algo;
if (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX) {
#if HAVE_MMX_INLINE
int cpu_flags = av_get_cpu_flags();
if (INLINE_MMX(cpu_flags))
s->dct_quantize = dct_quantize_MMX;
#endif
#if HAVE_MMXEXT_INLINE
if (INLINE_MMXEXT(cpu_flags))
s->dct_quantize = dct_quantize_MMXEXT;
#endif
#if HAVE_SSE2_INLINE
if (INLINE_SSE2(cpu_flags))
s->dct_quantize = dct_quantize_SSE2;
#endif
#if HAVE_SSSE3_INLINE
if (INLINE_SSSE3(cpu_flags))
s->dct_quantize = dct_quantize_SSSE3;
#endif
}
}

View File

@@ -0,0 +1,364 @@
/*
* MPEG video MMX templates
*
* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#undef MMREG_WIDTH
#undef MM
#undef MOVQ
#undef SPREADW
#undef PMAXW
#undef PMAX
#undef SAVE_SIGN
#undef RESTORE_SIGN
#if COMPILE_TEMPLATE_SSE2
#define MMREG_WIDTH "16"
#define MM "%%xmm"
#define MOVQ "movdqa"
#define SPREADW(a) \
"pshuflw $0, "a", "a" \n\t"\
"punpcklwd "a", "a" \n\t"
#define PMAXW(a,b) "pmaxsw "a", "b" \n\t"
#define PMAX(a,b) \
"movhlps "a", "b" \n\t"\
PMAXW(b, a)\
"pshuflw $0x0E, "a", "b" \n\t"\
PMAXW(b, a)\
"pshuflw $0x01, "a", "b" \n\t"\
PMAXW(b, a)
#else
#define MMREG_WIDTH "8"
#define MM "%%mm"
#define MOVQ "movq"
#if COMPILE_TEMPLATE_MMXEXT
#define SPREADW(a) "pshufw $0, "a", "a" \n\t"
#define PMAXW(a,b) "pmaxsw "a", "b" \n\t"
#define PMAX(a,b) \
"pshufw $0x0E, "a", "b" \n\t"\
PMAXW(b, a)\
"pshufw $0x01, "a", "b" \n\t"\
PMAXW(b, a)
#else
#define SPREADW(a) \
"punpcklwd "a", "a" \n\t"\
"punpcklwd "a", "a" \n\t"
#define PMAXW(a,b) \
"psubusw "a", "b" \n\t"\
"paddw "a", "b" \n\t"
#define PMAX(a,b) \
"movq "a", "b" \n\t"\
"psrlq $32, "a" \n\t"\
PMAXW(b, a)\
"movq "a", "b" \n\t"\
"psrlq $16, "a" \n\t"\
PMAXW(b, a)
#endif
#endif
#if COMPILE_TEMPLATE_SSSE3
#define SAVE_SIGN(a,b) \
"movdqa "b", "a" \n\t"\
"pabsw "b", "b" \n\t"
#define RESTORE_SIGN(a,b) \
"psignw "a", "b" \n\t"
#else
#define SAVE_SIGN(a,b) \
"pxor "a", "a" \n\t"\
"pcmpgtw "b", "a" \n\t" /* block[i] <= 0 ? 0xFF : 0x00 */\
"pxor "a", "b" \n\t"\
"psubw "a", "b" \n\t" /* ABS(block[i]) */
#define RESTORE_SIGN(a,b) \
"pxor "a", "b" \n\t"\
"psubw "a", "b" \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
#endif
static int RENAME(dct_quantize)(MpegEncContext *s,
int16_t *block, int n,
int qscale, int *overflow)
{
x86_reg last_non_zero_p1;
int level=0, q; //=0 is because gcc says uninitialized ...
const uint16_t *qmat, *bias;
LOCAL_ALIGNED_16(int16_t, temp_block, [64]);
av_assert2((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly?
//s->fdct (block);
RENAMEl(ff_fdct) (block); //cannot be anything else ...
if(s->dct_error_sum)
s->denoise_dct(s, block);
if (s->mb_intra) {
int dummy;
if (n < 4){
q = s->y_dc_scale;
bias = s->q_intra_matrix16[qscale][1];
qmat = s->q_intra_matrix16[qscale][0];
}else{
q = s->c_dc_scale;
bias = s->q_chroma_intra_matrix16[qscale][1];
qmat = s->q_chroma_intra_matrix16[qscale][0];
}
/* note: block[0] is assumed to be positive */
if (!s->h263_aic) {
__asm__ volatile (
"mul %%ecx \n\t"
: "=d" (level), "=a"(dummy)
: "a" ((block[0]>>2) + q), "c" (ff_inverse[q<<1])
);
} else
/* For AIC we skip quant/dequant of INTRADC */
level = (block[0] + 4)>>3;
block[0]=0; //avoid fake overflow
// temp_block[0] = (block[0] + (q >> 1)) / q;
last_non_zero_p1 = 1;
} else {
last_non_zero_p1 = 0;
bias = s->q_inter_matrix16[qscale][1];
qmat = s->q_inter_matrix16[qscale][0];
}
if((s->out_format == FMT_H263 || s->out_format == FMT_H261) && s->mpeg_quant==0){
__asm__ volatile(
"movd %%"REG_a", "MM"3 \n\t" // last_non_zero_p1
SPREADW(MM"3")
"pxor "MM"7, "MM"7 \n\t" // 0
"pxor "MM"4, "MM"4 \n\t" // 0
MOVQ" (%2), "MM"5 \n\t" // qmat[0]
"pxor "MM"6, "MM"6 \n\t"
"psubw (%3), "MM"6 \n\t" // -bias[0]
"mov $-128, %%"REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
MOVQ" (%1, %%"REG_a"), "MM"0 \n\t" // block[i]
SAVE_SIGN(MM"1", MM"0") // ABS(block[i])
"psubusw "MM"6, "MM"0 \n\t" // ABS(block[i]) + bias[0]
"pmulhw "MM"5, "MM"0 \n\t" // (ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16
"por "MM"0, "MM"4 \n\t"
RESTORE_SIGN(MM"1", MM"0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
MOVQ" "MM"0, (%5, %%"REG_a") \n\t"
"pcmpeqw "MM"7, "MM"0 \n\t" // out==0 ? 0xFF : 0x00
MOVQ" (%4, %%"REG_a"), "MM"1 \n\t"
MOVQ" "MM"7, (%1, %%"REG_a") \n\t" // 0
"pandn "MM"1, "MM"0 \n\t"
PMAXW(MM"0", MM"3")
"add $"MMREG_WIDTH", %%"REG_a" \n\t"
" js 1b \n\t"
PMAX(MM"3", MM"0")
"movd "MM"3, %%"REG_a" \n\t"
"movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1
: "+a" (last_non_zero_p1)
: "r" (block+64), "r" (qmat), "r" (bias),
"r" (ff_inv_zigzag_direct16+64), "r" (temp_block+64)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7")
);
}else{ // FMT_H263
__asm__ volatile(
"movd %%"REG_a", "MM"3 \n\t" // last_non_zero_p1
SPREADW(MM"3")
"pxor "MM"7, "MM"7 \n\t" // 0
"pxor "MM"4, "MM"4 \n\t" // 0
"mov $-128, %%"REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
MOVQ" (%1, %%"REG_a"), "MM"0 \n\t" // block[i]
SAVE_SIGN(MM"1", MM"0") // ABS(block[i])
MOVQ" (%3, %%"REG_a"), "MM"6 \n\t" // bias[0]
"paddusw "MM"6, "MM"0 \n\t" // ABS(block[i]) + bias[0]
MOVQ" (%2, %%"REG_a"), "MM"5 \n\t" // qmat[i]
"pmulhw "MM"5, "MM"0 \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16
"por "MM"0, "MM"4 \n\t"
RESTORE_SIGN(MM"1", MM"0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
MOVQ" "MM"0, (%5, %%"REG_a") \n\t"
"pcmpeqw "MM"7, "MM"0 \n\t" // out==0 ? 0xFF : 0x00
MOVQ" (%4, %%"REG_a"), "MM"1 \n\t"
MOVQ" "MM"7, (%1, %%"REG_a") \n\t" // 0
"pandn "MM"1, "MM"0 \n\t"
PMAXW(MM"0", MM"3")
"add $"MMREG_WIDTH", %%"REG_a" \n\t"
" js 1b \n\t"
PMAX(MM"3", MM"0")
"movd "MM"3, %%"REG_a" \n\t"
"movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1
: "+a" (last_non_zero_p1)
: "r" (block+64), "r" (qmat+64), "r" (bias+64),
"r" (ff_inv_zigzag_direct16+64), "r" (temp_block+64)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7")
);
}
__asm__ volatile(
"movd %1, "MM"1 \n\t" // max_qcoeff
SPREADW(MM"1")
"psubusw "MM"1, "MM"4 \n\t"
"packuswb "MM"4, "MM"4 \n\t"
#if COMPILE_TEMPLATE_SSE2
"packuswb "MM"4, "MM"4 \n\t"
#endif
"movd "MM"4, %0 \n\t" // *overflow
: "=g" (*overflow)
: "g" (s->max_qcoeff)
);
if(s->mb_intra) block[0]= level;
else block[0]= temp_block[0];
if(s->dsp.idct_permutation_type == FF_SIMPLE_IDCT_PERM){
if(last_non_zero_p1 <= 1) goto end;
block[0x08] = temp_block[0x01]; block[0x10] = temp_block[0x08];
block[0x20] = temp_block[0x10];
if(last_non_zero_p1 <= 4) goto end;
block[0x18] = temp_block[0x09]; block[0x04] = temp_block[0x02];
block[0x09] = temp_block[0x03];
if(last_non_zero_p1 <= 7) goto end;
block[0x14] = temp_block[0x0A]; block[0x28] = temp_block[0x11];
block[0x12] = temp_block[0x18]; block[0x02] = temp_block[0x20];
if(last_non_zero_p1 <= 11) goto end;
block[0x1A] = temp_block[0x19]; block[0x24] = temp_block[0x12];
block[0x19] = temp_block[0x0B]; block[0x01] = temp_block[0x04];
block[0x0C] = temp_block[0x05];
if(last_non_zero_p1 <= 16) goto end;
block[0x11] = temp_block[0x0C]; block[0x29] = temp_block[0x13];
block[0x16] = temp_block[0x1A]; block[0x0A] = temp_block[0x21];
block[0x30] = temp_block[0x28]; block[0x22] = temp_block[0x30];
block[0x38] = temp_block[0x29]; block[0x06] = temp_block[0x22];
if(last_non_zero_p1 <= 24) goto end;
block[0x1B] = temp_block[0x1B]; block[0x21] = temp_block[0x14];
block[0x1C] = temp_block[0x0D]; block[0x05] = temp_block[0x06];
block[0x0D] = temp_block[0x07]; block[0x15] = temp_block[0x0E];
block[0x2C] = temp_block[0x15]; block[0x13] = temp_block[0x1C];
if(last_non_zero_p1 <= 32) goto end;
block[0x0B] = temp_block[0x23]; block[0x34] = temp_block[0x2A];
block[0x2A] = temp_block[0x31]; block[0x32] = temp_block[0x38];
block[0x3A] = temp_block[0x39]; block[0x26] = temp_block[0x32];
block[0x39] = temp_block[0x2B]; block[0x03] = temp_block[0x24];
if(last_non_zero_p1 <= 40) goto end;
block[0x1E] = temp_block[0x1D]; block[0x25] = temp_block[0x16];
block[0x1D] = temp_block[0x0F]; block[0x2D] = temp_block[0x17];
block[0x17] = temp_block[0x1E]; block[0x0E] = temp_block[0x25];
block[0x31] = temp_block[0x2C]; block[0x2B] = temp_block[0x33];
if(last_non_zero_p1 <= 48) goto end;
block[0x36] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B];
block[0x23] = temp_block[0x34]; block[0x3C] = temp_block[0x2D];
block[0x07] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
block[0x0F] = temp_block[0x27]; block[0x35] = temp_block[0x2E];
if(last_non_zero_p1 <= 56) goto end;
block[0x2E] = temp_block[0x35]; block[0x33] = temp_block[0x3C];
block[0x3E] = temp_block[0x3D]; block[0x27] = temp_block[0x36];
block[0x3D] = temp_block[0x2F]; block[0x2F] = temp_block[0x37];
block[0x37] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
}else if(s->dsp.idct_permutation_type == FF_LIBMPEG2_IDCT_PERM){
if(last_non_zero_p1 <= 1) goto end;
block[0x04] = temp_block[0x01];
block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10];
if(last_non_zero_p1 <= 4) goto end;
block[0x0C] = temp_block[0x09]; block[0x01] = temp_block[0x02];
block[0x05] = temp_block[0x03];
if(last_non_zero_p1 <= 7) goto end;
block[0x09] = temp_block[0x0A]; block[0x14] = temp_block[0x11];
block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20];
if(last_non_zero_p1 <= 11) goto end;
block[0x1C] = temp_block[0x19];
block[0x11] = temp_block[0x12]; block[0x0D] = temp_block[0x0B];
block[0x02] = temp_block[0x04]; block[0x06] = temp_block[0x05];
if(last_non_zero_p1 <= 16) goto end;
block[0x0A] = temp_block[0x0C]; block[0x15] = temp_block[0x13];
block[0x19] = temp_block[0x1A]; block[0x24] = temp_block[0x21];
block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30];
block[0x2C] = temp_block[0x29]; block[0x21] = temp_block[0x22];
if(last_non_zero_p1 <= 24) goto end;
block[0x1D] = temp_block[0x1B]; block[0x12] = temp_block[0x14];
block[0x0E] = temp_block[0x0D]; block[0x03] = temp_block[0x06];
block[0x07] = temp_block[0x07]; block[0x0B] = temp_block[0x0E];
block[0x16] = temp_block[0x15]; block[0x1A] = temp_block[0x1C];
if(last_non_zero_p1 <= 32) goto end;
block[0x25] = temp_block[0x23]; block[0x29] = temp_block[0x2A];
block[0x34] = temp_block[0x31]; block[0x38] = temp_block[0x38];
block[0x3C] = temp_block[0x39]; block[0x31] = temp_block[0x32];
block[0x2D] = temp_block[0x2B]; block[0x22] = temp_block[0x24];
if(last_non_zero_p1 <= 40) goto end;
block[0x1E] = temp_block[0x1D]; block[0x13] = temp_block[0x16];
block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17];
block[0x1B] = temp_block[0x1E]; block[0x26] = temp_block[0x25];
block[0x2A] = temp_block[0x2C]; block[0x35] = temp_block[0x33];
if(last_non_zero_p1 <= 48) goto end;
block[0x39] = temp_block[0x3A]; block[0x3D] = temp_block[0x3B];
block[0x32] = temp_block[0x34]; block[0x2E] = temp_block[0x2D];
block[0x23] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
block[0x27] = temp_block[0x27]; block[0x2B] = temp_block[0x2E];
if(last_non_zero_p1 <= 56) goto end;
block[0x36] = temp_block[0x35]; block[0x3A] = temp_block[0x3C];
block[0x3E] = temp_block[0x3D]; block[0x33] = temp_block[0x36];
block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37];
block[0x3B] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
}else{
if(last_non_zero_p1 <= 1) goto end;
block[0x01] = temp_block[0x01];
block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10];
if(last_non_zero_p1 <= 4) goto end;
block[0x09] = temp_block[0x09]; block[0x02] = temp_block[0x02];
block[0x03] = temp_block[0x03];
if(last_non_zero_p1 <= 7) goto end;
block[0x0A] = temp_block[0x0A]; block[0x11] = temp_block[0x11];
block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20];
if(last_non_zero_p1 <= 11) goto end;
block[0x19] = temp_block[0x19];
block[0x12] = temp_block[0x12]; block[0x0B] = temp_block[0x0B];
block[0x04] = temp_block[0x04]; block[0x05] = temp_block[0x05];
if(last_non_zero_p1 <= 16) goto end;
block[0x0C] = temp_block[0x0C]; block[0x13] = temp_block[0x13];
block[0x1A] = temp_block[0x1A]; block[0x21] = temp_block[0x21];
block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30];
block[0x29] = temp_block[0x29]; block[0x22] = temp_block[0x22];
if(last_non_zero_p1 <= 24) goto end;
block[0x1B] = temp_block[0x1B]; block[0x14] = temp_block[0x14];
block[0x0D] = temp_block[0x0D]; block[0x06] = temp_block[0x06];
block[0x07] = temp_block[0x07]; block[0x0E] = temp_block[0x0E];
block[0x15] = temp_block[0x15]; block[0x1C] = temp_block[0x1C];
if(last_non_zero_p1 <= 32) goto end;
block[0x23] = temp_block[0x23]; block[0x2A] = temp_block[0x2A];
block[0x31] = temp_block[0x31]; block[0x38] = temp_block[0x38];
block[0x39] = temp_block[0x39]; block[0x32] = temp_block[0x32];
block[0x2B] = temp_block[0x2B]; block[0x24] = temp_block[0x24];
if(last_non_zero_p1 <= 40) goto end;
block[0x1D] = temp_block[0x1D]; block[0x16] = temp_block[0x16];
block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17];
block[0x1E] = temp_block[0x1E]; block[0x25] = temp_block[0x25];
block[0x2C] = temp_block[0x2C]; block[0x33] = temp_block[0x33];
if(last_non_zero_p1 <= 48) goto end;
block[0x3A] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B];
block[0x34] = temp_block[0x34]; block[0x2D] = temp_block[0x2D];
block[0x26] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
block[0x27] = temp_block[0x27]; block[0x2E] = temp_block[0x2E];
if(last_non_zero_p1 <= 56) goto end;
block[0x35] = temp_block[0x35]; block[0x3C] = temp_block[0x3C];
block[0x3D] = temp_block[0x3D]; block[0x36] = temp_block[0x36];
block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37];
block[0x3E] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
}
end:
return last_non_zero_p1 - 1;
}

View File

@@ -0,0 +1,173 @@
;******************************************************************************
;* x86 optimizations for PNG decoding
;*
;* Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu>
;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
cextern pw_255
SECTION_TEXT
; %1 = nr. of xmm registers used
%macro ADD_BYTES_FN 1
cglobal add_bytes_l2, 4, 6, %1, dst, src1, src2, wa, w, i
%if ARCH_X86_64
movsxd waq, wad
%endif
xor iq, iq
; vector loop
mov wq, waq
and waq, ~(mmsize*2-1)
jmp .end_v
.loop_v:
mova m0, [src1q+iq]
mova m1, [src1q+iq+mmsize]
paddb m0, [src2q+iq]
paddb m1, [src2q+iq+mmsize]
mova [dstq+iq ], m0
mova [dstq+iq+mmsize], m1
add iq, mmsize*2
.end_v:
cmp iq, waq
jl .loop_v
%if mmsize == 16
; vector loop
mov waq, wq
and waq, ~7
jmp .end_l
.loop_l:
movq mm0, [src1q+iq]
paddb mm0, [src2q+iq]
movq [dstq+iq ], mm0
add iq, 8
.end_l:
cmp iq, waq
jl .loop_l
%endif
; scalar loop for leftover
jmp .end_s
.loop_s:
mov wab, [src1q+iq]
add wab, [src2q+iq]
mov [dstq+iq], wab
inc iq
.end_s:
cmp iq, wq
jl .loop_s
REP_RET
%endmacro
%if ARCH_X86_32
INIT_MMX mmx
ADD_BYTES_FN 0
%endif
INIT_XMM sse2
ADD_BYTES_FN 2
%macro ADD_PAETH_PRED_FN 1
cglobal add_png_paeth_prediction, 5, 7, %1, dst, src, top, w, bpp, end, cntr
%if ARCH_X86_64
movsxd bppq, bppd
movsxd wq, wd
%endif
lea endq, [dstq+wq-(mmsize/2-1)]
sub topq, dstq
sub srcq, dstq
sub dstq, bppq
pxor m7, m7
PUSH dstq
lea cntrq, [bppq-1]
shr cntrq, 2 + mmsize/16
.bpp_loop:
lea dstq, [dstq+cntrq*(mmsize/2)]
movh m0, [dstq]
movh m1, [topq+dstq]
punpcklbw m0, m7
punpcklbw m1, m7
add dstq, bppq
.loop:
mova m2, m1
movh m1, [topq+dstq]
mova m3, m2
punpcklbw m1, m7
mova m4, m2
psubw m3, m1
psubw m4, m0
mova m5, m3
paddw m5, m4
%if cpuflag(ssse3)
pabsw m3, m3
pabsw m4, m4
pabsw m5, m5
%else ; !cpuflag(ssse3)
psubw m7, m5
pmaxsw m5, m7
pxor m6, m6
pxor m7, m7
psubw m6, m3
psubw m7, m4
pmaxsw m3, m6
pmaxsw m4, m7
pxor m7, m7
%endif ; cpuflag(ssse3)
mova m6, m4
pminsw m6, m5
pcmpgtw m3, m6
pcmpgtw m4, m5
mova m6, m4
pand m4, m3
pandn m6, m3
pandn m3, m0
movh m0, [srcq+dstq]
pand m6, m1
pand m2, m4
punpcklbw m0, m7
paddw m0, m6
paddw m3, m2
paddw m0, m3
pand m0, [pw_255]
mova m3, m0
packuswb m3, m3
movh [dstq], m3
add dstq, bppq
cmp dstq, endq
jle .loop
mov dstq, [rsp]
dec cntrq
jge .bpp_loop
POP dstq
RET
%endmacro
INIT_MMX mmxext
ADD_PAETH_PRED_FN 0
INIT_MMX ssse3
ADD_PAETH_PRED_FN 0

View File

@@ -0,0 +1,50 @@
/*
* x86 PNG optimizations.
* Copyright (c) 2008 Loren Merrit <lorenm@u.washington.edu>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/common.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/pngdsp.h"
void ff_add_png_paeth_prediction_mmxext(uint8_t *dst, uint8_t *src,
uint8_t *top, int w, int bpp);
void ff_add_png_paeth_prediction_ssse3(uint8_t *dst, uint8_t *src,
uint8_t *top, int w, int bpp);
void ff_add_bytes_l2_mmx (uint8_t *dst, uint8_t *src1,
uint8_t *src2, int w);
void ff_add_bytes_l2_sse2(uint8_t *dst, uint8_t *src1,
uint8_t *src2, int w);
av_cold void ff_pngdsp_init_x86(PNGDSPContext *dsp)
{
int cpu_flags = av_get_cpu_flags();
#if ARCH_X86_32
if (EXTERNAL_MMX(cpu_flags))
dsp->add_bytes_l2 = ff_add_bytes_l2_mmx;
#endif
if (EXTERNAL_MMXEXT(cpu_flags))
dsp->add_paeth_prediction = ff_add_png_paeth_prediction_mmxext;
if (EXTERNAL_SSE2(cpu_flags))
dsp->add_bytes_l2 = ff_add_bytes_l2_sse2;
if (EXTERNAL_SSSE3(cpu_flags))
dsp->add_paeth_prediction = ff_add_png_paeth_prediction_ssse3;
}

View File

@@ -0,0 +1,326 @@
;******************************************************************************
;* x86-SIMD-optimized IDCT for prores
;* this is identical to "simple" IDCT written by Michael Niedermayer
;* except for the clip range
;*
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
%define W1sh2 22725 ; W1 = 90901 = 22725<<2 + 1
%define W2sh2 21407 ; W2 = 85627 = 21407<<2 - 1
%define W3sh2 19265 ; W3 = 77062 = 19265<<2 + 2
%define W4sh2 16384 ; W4 = 65535 = 16384<<2 - 1
%define W5sh2 12873 ; W5 = 51491 = 12873<<2 - 1
%define W6sh2 8867 ; W6 = 35468 = 8867<<2
%define W7sh2 4520 ; W7 = 18081 = 4520<<2 + 1
%if ARCH_X86_64
SECTION_RODATA
w4_plus_w2: times 4 dw W4sh2, +W2sh2
w4_min_w2: times 4 dw W4sh2, -W2sh2
w4_plus_w6: times 4 dw W4sh2, +W6sh2
w4_min_w6: times 4 dw W4sh2, -W6sh2
w1_plus_w3: times 4 dw W1sh2, +W3sh2
w3_min_w1: times 4 dw W3sh2, -W1sh2
w7_plus_w3: times 4 dw W7sh2, +W3sh2
w3_min_w7: times 4 dw W3sh2, -W7sh2
w1_plus_w5: times 4 dw W1sh2, +W5sh2
w5_min_w1: times 4 dw W5sh2, -W1sh2
w5_plus_w7: times 4 dw W5sh2, +W7sh2
w7_min_w5: times 4 dw W7sh2, -W5sh2
pw_88: times 8 dw 0x2008
cextern pw_1
cextern pw_4
cextern pw_512
cextern pw_1019
section .text align=16
; interleave data while maintaining source
; %1=type, %2=dstlo, %3=dsthi, %4=src, %5=interleave
%macro SBUTTERFLY3 5
punpckl%1 m%2, m%4, m%5
punpckh%1 m%3, m%4, m%5
%endmacro
; %1/%2=src1/dst1, %3/%4=dst2, %5/%6=src2, %7=shift
; action: %3/%4 = %1/%2 - %5/%6; %1/%2 += %5/%6
; %1/%2/%3/%4 >>= %7; dword -> word (in %1/%3)
%macro SUMSUB_SHPK 7
psubd %3, %1, %5 ; { a0 - b0 }[0-3]
psubd %4, %2, %6 ; { a0 - b0 }[4-7]
paddd %1, %5 ; { a0 + b0 }[0-3]
paddd %2, %6 ; { a0 + b0 }[4-7]
psrad %1, %7
psrad %2, %7
psrad %3, %7
psrad %4, %7
packssdw %1, %2 ; row[0]
packssdw %3, %4 ; row[7]
%endmacro
; %1 = row or col (for rounding variable)
; %2 = number of bits to shift at the end
%macro IDCT_1D 2
; a0 = (W4 * row[0]) + (1 << (15 - 1));
; a1 = a0;
; a2 = a0;
; a3 = a0;
; a0 += W2 * row[2];
; a1 += W6 * row[2];
; a2 -= W6 * row[2];
; a3 -= W2 * row[2];
%ifidn %1, col
paddw m10,[pw_88]
%endif
%ifidn %1, row
paddw m10,[pw_1]
%endif
SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[0], row[2] }[0-3]/[4-7]
pmaddwd m2, m0, [w4_plus_w6]
pmaddwd m3, m1, [w4_plus_w6]
pmaddwd m4, m0, [w4_min_w6]
pmaddwd m5, m1, [w4_min_w6]
pmaddwd m6, m0, [w4_min_w2]
pmaddwd m7, m1, [w4_min_w2]
pmaddwd m0, [w4_plus_w2]
pmaddwd m1, [w4_plus_w2]
; a0: -1*row[0]-1*row[2]
; a1: -1*row[0]
; a2: -1*row[0]
; a3: -1*row[0]+1*row[2]
; a0 += W4*row[4] + W6*row[6]; i.e. -1*row[4]
; a1 -= W4*row[4] + W2*row[6]; i.e. -1*row[4]-1*row[6]
; a2 -= W4*row[4] - W2*row[6]; i.e. -1*row[4]+1*row[6]
; a3 += W4*row[4] - W6*row[6]; i.e. -1*row[4]
SBUTTERFLY3 wd, 8, 9, 13, 12 ; { row[4], row[6] }[0-3]/[4-7]
pmaddwd m10, m8, [w4_plus_w6]
pmaddwd m11, m9, [w4_plus_w6]
paddd m0, m10 ; a0[0-3]
paddd m1, m11 ; a0[4-7]
pmaddwd m10, m8, [w4_min_w6]
pmaddwd m11, m9, [w4_min_w6]
paddd m6, m10 ; a3[0-3]
paddd m7, m11 ; a3[4-7]
pmaddwd m10, m8, [w4_min_w2]
pmaddwd m11, m9, [w4_min_w2]
pmaddwd m8, [w4_plus_w2]
pmaddwd m9, [w4_plus_w2]
psubd m4, m10 ; a2[0-3] intermediate
psubd m5, m11 ; a2[4-7] intermediate
psubd m2, m8 ; a1[0-3] intermediate
psubd m3, m9 ; a1[4-7] intermediate
; load/store
mova [r2+ 0], m0
mova [r2+ 32], m2
mova [r2+ 64], m4
mova [r2+ 96], m6
mova m10,[r2+ 16] ; { row[1] }[0-7]
mova m8, [r2+ 48] ; { row[3] }[0-7]
mova m13,[r2+ 80] ; { row[5] }[0-7]
mova m14,[r2+112] ; { row[7] }[0-7]
mova [r2+ 16], m1
mova [r2+ 48], m3
mova [r2+ 80], m5
mova [r2+112], m7
%ifidn %1, row
pmullw m10,[r3+ 16]
pmullw m8, [r3+ 48]
pmullw m13,[r3+ 80]
pmullw m14,[r3+112]
%endif
; b0 = MUL(W1, row[1]);
; MAC(b0, W3, row[3]);
; b1 = MUL(W3, row[1]);
; MAC(b1, -W7, row[3]);
; b2 = MUL(W5, row[1]);
; MAC(b2, -W1, row[3]);
; b3 = MUL(W7, row[1]);
; MAC(b3, -W5, row[3]);
SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[1], row[3] }[0-3]/[4-7]
pmaddwd m2, m0, [w3_min_w7]
pmaddwd m3, m1, [w3_min_w7]
pmaddwd m4, m0, [w5_min_w1]
pmaddwd m5, m1, [w5_min_w1]
pmaddwd m6, m0, [w7_min_w5]
pmaddwd m7, m1, [w7_min_w5]
pmaddwd m0, [w1_plus_w3]
pmaddwd m1, [w1_plus_w3]
; b0: +1*row[1]+2*row[3]
; b1: +2*row[1]-1*row[3]
; b2: -1*row[1]-1*row[3]
; b3: +1*row[1]+1*row[3]
; MAC(b0, W5, row[5]);
; MAC(b0, W7, row[7]);
; MAC(b1, -W1, row[5]);
; MAC(b1, -W5, row[7]);
; MAC(b2, W7, row[5]);
; MAC(b2, W3, row[7]);
; MAC(b3, W3, row[5]);
; MAC(b3, -W1, row[7]);
SBUTTERFLY3 wd, 8, 9, 13, 14 ; { row[5], row[7] }[0-3]/[4-7]
; b0: -1*row[5]+1*row[7]
; b1: -1*row[5]+1*row[7]
; b2: +1*row[5]+2*row[7]
; b3: +2*row[5]-1*row[7]
pmaddwd m10, m8, [w1_plus_w5]
pmaddwd m11, m9, [w1_plus_w5]
pmaddwd m12, m8, [w5_plus_w7]
pmaddwd m13, m9, [w5_plus_w7]
psubd m2, m10 ; b1[0-3]
psubd m3, m11 ; b1[4-7]
paddd m0, m12 ; b0[0-3]
paddd m1, m13 ; b0[4-7]
pmaddwd m12, m8, [w7_plus_w3]
pmaddwd m13, m9, [w7_plus_w3]
pmaddwd m8, [w3_min_w1]
pmaddwd m9, [w3_min_w1]
paddd m4, m12 ; b2[0-3]
paddd m5, m13 ; b2[4-7]
paddd m6, m8 ; b3[0-3]
paddd m7, m9 ; b3[4-7]
; row[0] = (a0 + b0) >> 15;
; row[7] = (a0 - b0) >> 15;
; row[1] = (a1 + b1) >> 15;
; row[6] = (a1 - b1) >> 15;
; row[2] = (a2 + b2) >> 15;
; row[5] = (a2 - b2) >> 15;
; row[3] = (a3 + b3) >> 15;
; row[4] = (a3 - b3) >> 15;
mova m8, [r2+ 0] ; a0[0-3]
mova m9, [r2+16] ; a0[4-7]
SUMSUB_SHPK m8, m9, m10, m11, m0, m1, %2
mova m0, [r2+32] ; a1[0-3]
mova m1, [r2+48] ; a1[4-7]
SUMSUB_SHPK m0, m1, m9, m11, m2, m3, %2
mova m1, [r2+64] ; a2[0-3]
mova m2, [r2+80] ; a2[4-7]
SUMSUB_SHPK m1, m2, m11, m3, m4, m5, %2
mova m2, [r2+96] ; a3[0-3]
mova m3, [r2+112] ; a3[4-7]
SUMSUB_SHPK m2, m3, m4, m5, m6, m7, %2
%endmacro
; void prores_idct_put_10_<opt>(uint8_t *pixels, int stride,
; int16_t *block, const int16_t *qmat);
%macro idct_put_fn 1
cglobal prores_idct_put_10, 4, 4, %1
movsxd r1, r1d
pxor m15, m15 ; zero
; for (i = 0; i < 8; i++)
; idctRowCondDC(block + i*8);
mova m10,[r2+ 0] ; { row[0] }[0-7]
mova m8, [r2+32] ; { row[2] }[0-7]
mova m13,[r2+64] ; { row[4] }[0-7]
mova m12,[r2+96] ; { row[6] }[0-7]
pmullw m10,[r3+ 0]
pmullw m8, [r3+32]
pmullw m13,[r3+64]
pmullw m12,[r3+96]
IDCT_1D row, 15
; transpose for second part of IDCT
TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3
mova [r2+ 16], m0
mova [r2+ 48], m2
mova [r2+ 80], m11
mova [r2+112], m10
SWAP 8, 10
SWAP 1, 8
SWAP 4, 13
SWAP 9, 12
; for (i = 0; i < 8; i++)
; idctSparseColAdd(dest + i, line_size, block + i);
IDCT_1D col, 18
; clip/store
mova m3, [pw_4]
mova m5, [pw_1019]
pmaxsw m8, m3
pmaxsw m0, m3
pmaxsw m1, m3
pmaxsw m2, m3
pmaxsw m4, m3
pmaxsw m11, m3
pmaxsw m9, m3
pmaxsw m10, m3
pminsw m8, m5
pminsw m0, m5
pminsw m1, m5
pminsw m2, m5
pminsw m4, m5
pminsw m11, m5
pminsw m9, m5
pminsw m10, m5
lea r2, [r1*3]
mova [r0 ], m8
mova [r0+r1 ], m0
mova [r0+r1*2], m1
mova [r0+r2 ], m2
lea r0, [r0+r1*4]
mova [r0 ], m4
mova [r0+r1 ], m11
mova [r0+r1*2], m9
mova [r0+r2 ], m10
RET
%endmacro
%macro SIGNEXTEND 2-3
%if cpuflag(sse4) ; dstlow, dsthigh
movhlps %2, %1
pmovsxwd %1, %1
pmovsxwd %2, %2
%elif cpuflag(sse2) ; dstlow, dsthigh, tmp
pxor %3, %3
pcmpgtw %3, %1
mova %2, %1
punpcklwd %1, %3
punpckhwd %2, %3
%endif
%endmacro
INIT_XMM sse2
idct_put_fn 16
INIT_XMM sse4
idct_put_fn 16
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
idct_put_fn 16
%endif
%endif

View File

@@ -0,0 +1,58 @@
/*
* Apple ProRes compatible decoder
*
* Copyright (c) 2010-2011 Maxim Poliakovski
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/proresdsp.h"
void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
int16_t *block, const int16_t *qmat);
void ff_prores_idct_put_10_sse4(uint16_t *dst, int linesize,
int16_t *block, const int16_t *qmat);
void ff_prores_idct_put_10_avx (uint16_t *dst, int linesize,
int16_t *block, const int16_t *qmat);
av_cold void ff_proresdsp_x86_init(ProresDSPContext *dsp, AVCodecContext *avctx)
{
#if ARCH_X86_64
int cpu_flags = av_get_cpu_flags();
if(avctx->flags & CODEC_FLAG_BITEXACT)
return;
if (EXTERNAL_SSE2(cpu_flags)) {
dsp->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
dsp->idct_put = ff_prores_idct_put_10_sse2;
}
if (EXTERNAL_SSE4(cpu_flags)) {
dsp->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
dsp->idct_put = ff_prores_idct_put_10_sse4;
}
if (EXTERNAL_AVX(cpu_flags)) {
dsp->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
dsp->idct_put = ff_prores_idct_put_10_avx;
}
#endif /* ARCH_X86_64 */
}

View File

@@ -0,0 +1,176 @@
;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION .text
%macro op_avgh 3
movh %3, %2
pavgb %1, %3
movh %2, %1
%endmacro
%macro op_avg 2
pavgb %1, %2
mova %2, %1
%endmacro
%macro op_puth 2-3
movh %2, %1
%endmacro
%macro op_put 2
mova %2, %1
%endmacro
; void pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
%macro PIXELS4_L2 1
%define OP op_%1h
cglobal %1_pixels4_l2, 6,6
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
test r5d, 1
je .loop
movd m0, [r1]
movd m1, [r2]
add r1, r4
add r2, 4
pavgb m0, m1
OP m0, [r0], m3
add r0, r3
dec r5d
.loop:
mova m0, [r1]
mova m1, [r1+r4]
lea r1, [r1+2*r4]
pavgb m0, [r2]
pavgb m1, [r2+4]
OP m0, [r0], m3
OP m1, [r0+r3], m3
lea r0, [r0+2*r3]
mova m0, [r1]
mova m1, [r1+r4]
lea r1, [r1+2*r4]
pavgb m0, [r2+8]
pavgb m1, [r2+12]
OP m0, [r0], m3
OP m1, [r0+r3], m3
lea r0, [r0+2*r3]
add r2, 16
sub r5d, 4
jne .loop
REP_RET
%endmacro
INIT_MMX mmxext
PIXELS4_L2 put
PIXELS4_L2 avg
; void pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
%macro PIXELS8_L2 1
%define OP op_%1
cglobal %1_pixels8_l2, 6,6
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
test r5d, 1
je .loop
mova m0, [r1]
mova m1, [r2]
add r1, r4
add r2, 8
pavgb m0, m1
OP m0, [r0]
add r0, r3
dec r5d
.loop:
mova m0, [r1]
mova m1, [r1+r4]
lea r1, [r1+2*r4]
pavgb m0, [r2]
pavgb m1, [r2+8]
OP m0, [r0]
OP m1, [r0+r3]
lea r0, [r0+2*r3]
mova m0, [r1]
mova m1, [r1+r4]
lea r1, [r1+2*r4]
pavgb m0, [r2+16]
pavgb m1, [r2+24]
OP m0, [r0]
OP m1, [r0+r3]
lea r0, [r0+2*r3]
add r2, 32
sub r5d, 4
jne .loop
REP_RET
%endmacro
INIT_MMX mmxext
PIXELS8_L2 put
PIXELS8_L2 avg
; void pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
%macro PIXELS16_L2 1
%define OP op_%1
cglobal %1_pixels16_l2, 6,6
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
test r5d, 1
je .loop
mova m0, [r1]
mova m1, [r1+8]
pavgb m0, [r2]
pavgb m1, [r2+8]
add r1, r4
add r2, 16
OP m0, [r0]
OP m1, [r0+8]
add r0, r3
dec r5d
.loop:
mova m0, [r1]
mova m1, [r1+8]
add r1, r4
pavgb m0, [r2]
pavgb m1, [r2+8]
OP m0, [r0]
OP m1, [r0+8]
add r0, r3
mova m0, [r1]
mova m1, [r1+8]
add r1, r4
pavgb m0, [r2+16]
pavgb m1, [r2+24]
OP m0, [r0]
OP m1, [r0+8]
add r0, r3
add r2, 32
sub r5d, 2
jne .loop
REP_RET
%endmacro
INIT_MMX mmxext
PIXELS16_L2 put
PIXELS16_L2 avg

View File

@@ -0,0 +1,35 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "dsputil_x86.h"
#if HAVE_INLINE_ASM
#define DEF(x, y) ff_ ## x ## _ ## y ## _mmx
#define SET_RND MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
#define STATIC
#include "rnd_template.c"
PIXELS16(, ff_avg, , _xy2, _mmx)
PIXELS16(, ff_put, , _xy2, _mmx)
#endif /* HAVE_INLINE_ASM */

View File

@@ -0,0 +1,173 @@
/*
* DSP utils mmx functions are compiled twice for rnd/no_rnd
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
* mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
* and improved by Zdenek Kabelac <kabi@users.sf.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stddef.h>
#include <stdint.h>
// put_pixels
STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
{
MOVQ_ZERO(mm7);
SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
__asm__ volatile(
"movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm4 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm4, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"punpckhbw %%mm7, %%mm5 \n\t"
"paddusw %%mm0, %%mm4 \n\t"
"paddusw %%mm1, %%mm5 \n\t"
"xor %%"REG_a", %%"REG_a" \n\t"
"add %3, %1 \n\t"
".p2align 3 \n\t"
"1: \n\t"
"movq (%1, %%"REG_a"), %%mm0 \n\t"
"movq 1(%1, %%"REG_a"), %%mm2 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"paddusw %%mm2, %%mm0 \n\t"
"paddusw %%mm3, %%mm1 \n\t"
"paddusw %%mm6, %%mm4 \n\t"
"paddusw %%mm6, %%mm5 \n\t"
"paddusw %%mm0, %%mm4 \n\t"
"paddusw %%mm1, %%mm5 \n\t"
"psrlw $2, %%mm4 \n\t"
"psrlw $2, %%mm5 \n\t"
"packuswb %%mm5, %%mm4 \n\t"
"movq %%mm4, (%2, %%"REG_a") \n\t"
"add %3, %%"REG_a" \n\t"
"movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
"movq 1(%1, %%"REG_a"), %%mm4 \n\t"
"movq %%mm2, %%mm3 \n\t"
"movq %%mm4, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"punpckhbw %%mm7, %%mm5 \n\t"
"paddusw %%mm2, %%mm4 \n\t"
"paddusw %%mm3, %%mm5 \n\t"
"paddusw %%mm6, %%mm0 \n\t"
"paddusw %%mm6, %%mm1 \n\t"
"paddusw %%mm4, %%mm0 \n\t"
"paddusw %%mm5, %%mm1 \n\t"
"psrlw $2, %%mm0 \n\t"
"psrlw $2, %%mm1 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"movq %%mm0, (%2, %%"REG_a") \n\t"
"add %3, %%"REG_a" \n\t"
"subl $2, %0 \n\t"
"jnz 1b \n\t"
:"+g"(h), "+S"(pixels)
:"D"(block), "r"((x86_reg)line_size)
:REG_a, "memory");
}
// avg_pixels
// this routine is 'slightly' suboptimal but mostly unused
STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
{
MOVQ_ZERO(mm7);
SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
__asm__ volatile(
"movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm4 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm4, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"punpckhbw %%mm7, %%mm5 \n\t"
"paddusw %%mm0, %%mm4 \n\t"
"paddusw %%mm1, %%mm5 \n\t"
"xor %%"REG_a", %%"REG_a" \n\t"
"add %3, %1 \n\t"
".p2align 3 \n\t"
"1: \n\t"
"movq (%1, %%"REG_a"), %%mm0 \n\t"
"movq 1(%1, %%"REG_a"), %%mm2 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"paddusw %%mm2, %%mm0 \n\t"
"paddusw %%mm3, %%mm1 \n\t"
"paddusw %%mm6, %%mm4 \n\t"
"paddusw %%mm6, %%mm5 \n\t"
"paddusw %%mm0, %%mm4 \n\t"
"paddusw %%mm1, %%mm5 \n\t"
"psrlw $2, %%mm4 \n\t"
"psrlw $2, %%mm5 \n\t"
"movq (%2, %%"REG_a"), %%mm3 \n\t"
"packuswb %%mm5, %%mm4 \n\t"
"pcmpeqd %%mm2, %%mm2 \n\t"
"paddb %%mm2, %%mm2 \n\t"
PAVGB_MMX(%%mm3, %%mm4, %%mm5, %%mm2)
"movq %%mm5, (%2, %%"REG_a") \n\t"
"add %3, %%"REG_a" \n\t"
"movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
"movq 1(%1, %%"REG_a"), %%mm4 \n\t"
"movq %%mm2, %%mm3 \n\t"
"movq %%mm4, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"punpckhbw %%mm7, %%mm5 \n\t"
"paddusw %%mm2, %%mm4 \n\t"
"paddusw %%mm3, %%mm5 \n\t"
"paddusw %%mm6, %%mm0 \n\t"
"paddusw %%mm6, %%mm1 \n\t"
"paddusw %%mm4, %%mm0 \n\t"
"paddusw %%mm5, %%mm1 \n\t"
"psrlw $2, %%mm0 \n\t"
"psrlw $2, %%mm1 \n\t"
"movq (%2, %%"REG_a"), %%mm3 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"pcmpeqd %%mm2, %%mm2 \n\t"
"paddb %%mm2, %%mm2 \n\t"
PAVGB_MMX(%%mm3, %%mm0, %%mm1, %%mm2)
"movq %%mm1, (%2, %%"REG_a") \n\t"
"add %3, %%"REG_a" \n\t"
"subl $2, %0 \n\t"
"jnz 1b \n\t"
:"+g"(h), "+S"(pixels)
:"D"(block), "r"((x86_reg)line_size)
:REG_a, "memory");
}

View File

@@ -0,0 +1,196 @@
;******************************************************************************
;* MMX/SSE2-optimized functions for the RV30 and RV40 decoders
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
pw_row_coeffs: times 4 dw 13
times 4 dw 17
times 4 dw 7
pd_512: times 2 dd 0x200
pw_col_coeffs: dw 13, 13, 13, -13
dw 17, 7, 7, -17
dw 13, -13, 13, 13
dw -7, 17, -17, -7
SECTION .text
%macro IDCT_DC_NOROUND 1
imul %1, 13*13*3
sar %1, 11
%endmacro
%macro IDCT_DC_ROUND 1
imul %1, 13*13
add %1, 0x200
sar %1, 10
%endmacro
%macro rv34_idct 1
cglobal rv34_idct_%1, 1, 2, 0
movsx r1, word [r0]
IDCT_DC r1
movd m0, r1d
pshufw m0, m0, 0
movq [r0+ 0], m0
movq [r0+ 8], m0
movq [r0+16], m0
movq [r0+24], m0
REP_RET
%endmacro
INIT_MMX mmxext
%define IDCT_DC IDCT_DC_ROUND
rv34_idct dc
%define IDCT_DC IDCT_DC_NOROUND
rv34_idct dc_noround
; ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc);
INIT_MMX mmx
cglobal rv34_idct_dc_add, 3, 3
; calculate DC
IDCT_DC_ROUND r2
pxor m1, m1
movd m0, r2d
psubw m1, m0
packuswb m0, m0
packuswb m1, m1
punpcklbw m0, m0
punpcklbw m1, m1
punpcklwd m0, m0
punpcklwd m1, m1
; add DC
lea r2, [r0+r1*2]
movh m2, [r0]
movh m3, [r0+r1]
movh m4, [r2]
movh m5, [r2+r1]
paddusb m2, m0
paddusb m3, m0
paddusb m4, m0
paddusb m5, m0
psubusb m2, m1
psubusb m3, m1
psubusb m4, m1
psubusb m5, m1
movh [r0], m2
movh [r0+r1], m3
movh [r2], m4
movh [r2+r1], m5
RET
; Load coeffs and perform row transform
; Output: coeffs in mm[0467], rounder in mm5
%macro ROW_TRANSFORM 1
pxor mm7, mm7
mova mm0, [%1+ 0*8]
mova mm1, [%1+ 1*8]
mova mm2, [%1+ 2*8]
mova mm3, [%1+ 3*8]
mova [%1+ 0*8], mm7
mova [%1+ 1*8], mm7
mova [%1+ 2*8], mm7
mova [%1+ 3*8], mm7
mova mm4, mm0
mova mm6, [pw_row_coeffs+ 0]
paddsw mm0, mm2 ; b0 + b2
psubsw mm4, mm2 ; b0 - b2
pmullw mm0, mm6 ; *13 = z0
pmullw mm4, mm6 ; *13 = z1
mova mm5, mm1
pmullw mm1, [pw_row_coeffs+ 8] ; b1*17
pmullw mm5, [pw_row_coeffs+16] ; b1* 7
mova mm7, mm3
pmullw mm3, [pw_row_coeffs+ 8] ; b3*17
pmullw mm7, [pw_row_coeffs+16] ; b3* 7
paddsw mm1, mm7 ; z3 = b1*17 + b3* 7
psubsw mm5, mm3 ; z2 = b1* 7 - b3*17
mova mm7, mm0
mova mm6, mm4
paddsw mm0, mm1 ; z0 + z3
psubsw mm7, mm1 ; z0 - z3
paddsw mm4, mm5 ; z1 + z2
psubsw mm6, mm5 ; z1 - z2
mova mm5, [pd_512] ; 0x200
%endmacro
; ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block);
%macro COL_TRANSFORM 4
pshufw mm3, %2, 0xDD ; col. 1,3,1,3
pshufw %2, %2, 0x88 ; col. 0,2,0,2
pmaddwd %2, %3 ; 13*c0+13*c2 | 13*c0-13*c2 = z0 | z1
pmaddwd mm3, %4 ; 17*c1+ 7*c3 | 7*c1-17*c3 = z3 | z2
paddd %2, mm5
pshufw mm1, %2, 01001110b ; z1 | z0
pshufw mm2, mm3, 01001110b ; z2 | z3
paddd %2, mm3 ; z0+z3 | z1+z2
psubd mm1, mm2 ; z1-z2 | z0-z3
movd mm3, %1
psrad %2, 10
pxor mm2, mm2
psrad mm1, 10
punpcklbw mm3, mm2
packssdw %2, mm1
paddw %2, mm3
packuswb %2, %2
movd %1, %2
%endmacro
INIT_MMX mmxext
cglobal rv34_idct_add, 3,3,0, d, s, b
ROW_TRANSFORM bq
COL_TRANSFORM [dq], mm0, [pw_col_coeffs+ 0], [pw_col_coeffs+ 8]
mova mm0, [pw_col_coeffs+ 0]
COL_TRANSFORM [dq+sq], mm4, mm0, [pw_col_coeffs+ 8]
mova mm4, [pw_col_coeffs+ 8]
lea dq, [dq + 2*sq]
COL_TRANSFORM [dq], mm6, mm0, mm4
COL_TRANSFORM [dq+sq], mm7, mm0, mm4
ret
; ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc);
INIT_XMM sse4
cglobal rv34_idct_dc_add, 3, 3, 6
; load data
IDCT_DC_ROUND r2
pxor m1, m1
; calculate DC
movd m0, r2d
lea r2, [r0+r1*2]
movd m2, [r0]
movd m3, [r0+r1]
pshuflw m0, m0, 0
movd m4, [r2]
movd m5, [r2+r1]
punpcklqdq m0, m0
punpckldq m2, m3
punpckldq m4, m5
punpcklbw m2, m1
punpcklbw m4, m1
paddw m2, m0
paddw m4, m0
packuswb m2, m4
movd [r0], m2
pextrd [r0+r1], m2, 1
pextrd [r2], m2, 2
pextrd [r2+r1], m2, 3
RET

View File

@@ -0,0 +1,45 @@
/*
* RV30/40 MMX/SSE2 optimizations
* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/rv34dsp.h"
void ff_rv34_idct_dc_mmxext(int16_t *block);
void ff_rv34_idct_dc_noround_mmxext(int16_t *block);
void ff_rv34_idct_dc_add_mmx(uint8_t *dst, ptrdiff_t stride, int dc);
void ff_rv34_idct_dc_add_sse4(uint8_t *dst, ptrdiff_t stride, int dc);
void ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block);
av_cold void ff_rv34dsp_init_x86(RV34DSPContext* c)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags))
c->rv34_idct_dc_add = ff_rv34_idct_dc_add_mmx;
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->rv34_inv_transform_dc = ff_rv34_idct_dc_noround_mmxext;
c->rv34_idct_add = ff_rv34_idct_add_mmxext;
}
if (EXTERNAL_SSE4(cpu_flags))
c->rv34_idct_dc_add = ff_rv34_idct_dc_add_sse4;
}

View File

@@ -0,0 +1,505 @@
;******************************************************************************
;* MMX/SSE2-optimized functions for the RV40 decoder
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
align 16
pw_1024: times 8 dw 1 << (16 - 6) ; pw_1024
sixtap_filter_hb_m: times 8 db 1, -5
times 8 db 52, 20
; multiplied by 2 to have the same shift
times 8 db 2, -10
times 8 db 40, 40
; back to normal
times 8 db 1, -5
times 8 db 20, 52
sixtap_filter_v_m: times 8 dw 1
times 8 dw -5
times 8 dw 52
times 8 dw 20
; multiplied by 2 to have the same shift
times 8 dw 2
times 8 dw -10
times 8 dw 40
times 8 dw 40
; back to normal
times 8 dw 1
times 8 dw -5
times 8 dw 20
times 8 dw 52
%ifdef PIC
%define sixtap_filter_hw picregq
%define sixtap_filter_hb picregq
%define sixtap_filter_v picregq
%define npicregs 1
%else
%define sixtap_filter_hw sixtap_filter_hw_m
%define sixtap_filter_hb sixtap_filter_hb_m
%define sixtap_filter_v sixtap_filter_v_m
%define npicregs 0
%endif
filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11
cextern pw_32
cextern pw_16
cextern pw_512
SECTION .text
;-----------------------------------------------------------------------------
; subpel MC functions:
;
; void [put|rv40]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int deststride,
; uint8_t *src, int srcstride,
; int len, int m);
;----------------------------------------------------------------------
%macro LOAD 2
%if WIN64
movsxd %1q, %1d
%endif
%ifdef PIC
add %1q, picregq
%else
add %1q, %2
%endif
%endmacro
%macro STORE 3
%ifidn %3, avg
movh %2, [dstq]
%endif
packuswb %1, %1
%ifidn %3, avg
%if cpuflag(3dnow)
pavgusb %1, %2
%else
pavgb %1, %2
%endif
%endif
movh [dstq], %1
%endmacro
%macro FILTER_V 1
cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg
%ifdef PIC
lea picregq, [sixtap_filter_v_m]
%endif
pxor m7, m7
LOAD my, sixtap_filter_v
; read 5 lines
sub srcq, srcstrideq
sub srcq, srcstrideq
movh m0, [srcq]
movh m1, [srcq+srcstrideq]
movh m2, [srcq+srcstrideq*2]
lea srcq, [srcq+srcstrideq*2]
add srcq, srcstrideq
movh m3, [srcq]
movh m4, [srcq+srcstrideq]
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m4, m7
%ifdef m8
mova m8, [myq+ 0]
mova m9, [myq+16]
mova m10, [myq+32]
mova m11, [myq+48]
%define COEFF05 m8
%define COEFF14 m9
%define COEFF2 m10
%define COEFF3 m11
%else
%define COEFF05 [myq+ 0]
%define COEFF14 [myq+16]
%define COEFF2 [myq+32]
%define COEFF3 [myq+48]
%endif
.nextrow:
mova m6, m1
movh m5, [srcq+2*srcstrideq] ; read new row
paddw m6, m4
punpcklbw m5, m7
pmullw m6, COEFF14
paddw m0, m5
pmullw m0, COEFF05
paddw m6, m0
mova m0, m1
paddw m6, [pw_32]
mova m1, m2
pmullw m2, COEFF2
paddw m6, m2
mova m2, m3
pmullw m3, COEFF3
paddw m6, m3
; round/clip/store
mova m3, m4
psraw m6, 6
mova m4, m5
STORE m6, m5, %1
; go to next line
add dstq, dststrideq
add srcq, srcstrideq
dec heightd ; next row
jg .nextrow
REP_RET
%endmacro
%macro FILTER_H 1
cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg
%ifdef PIC
lea picregq, [sixtap_filter_v_m]
%endif
pxor m7, m7
LOAD mx, sixtap_filter_v
mova m6, [pw_32]
%ifdef m8
mova m8, [mxq+ 0]
mova m9, [mxq+16]
mova m10, [mxq+32]
mova m11, [mxq+48]
%define COEFF05 m8
%define COEFF14 m9
%define COEFF2 m10
%define COEFF3 m11
%else
%define COEFF05 [mxq+ 0]
%define COEFF14 [mxq+16]
%define COEFF2 [mxq+32]
%define COEFF3 [mxq+48]
%endif
.nextrow:
movq m0, [srcq-2]
movq m5, [srcq+3]
movq m1, [srcq-1]
movq m4, [srcq+2]
punpcklbw m0, m7
punpcklbw m5, m7
punpcklbw m1, m7
punpcklbw m4, m7
movq m2, [srcq-0]
movq m3, [srcq+1]
paddw m0, m5
paddw m1, m4
punpcklbw m2, m7
punpcklbw m3, m7
pmullw m0, COEFF05
pmullw m1, COEFF14
pmullw m2, COEFF2
pmullw m3, COEFF3
paddw m0, m6
paddw m1, m2
paddw m0, m3
paddw m0, m1
psraw m0, 6
STORE m0, m1, %1
; go to next line
add dstq, dststrideq
add srcq, srcstrideq
dec heightd ; next row
jg .nextrow
REP_RET
%endmacro
%if ARCH_X86_32
INIT_MMX mmx
FILTER_V put
FILTER_H put
INIT_MMX mmxext
FILTER_V avg
FILTER_H avg
INIT_MMX 3dnow
FILTER_V avg
FILTER_H avg
%endif
INIT_XMM sse2
FILTER_H put
FILTER_H avg
FILTER_V put
FILTER_V avg
%macro FILTER_SSSE3 1
cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg
%ifdef PIC
lea picregq, [sixtap_filter_hb_m]
%endif
; read 5 lines
sub srcq, srcstrideq
LOAD my, sixtap_filter_hb
sub srcq, srcstrideq
movh m0, [srcq]
movh m1, [srcq+srcstrideq]
movh m2, [srcq+srcstrideq*2]
lea srcq, [srcq+srcstrideq*2]
add srcq, srcstrideq
mova m5, [myq]
movh m3, [srcq]
movh m4, [srcq+srcstrideq]
lea srcq, [srcq+2*srcstrideq]
.nextrow:
mova m6, m2
punpcklbw m0, m1
punpcklbw m6, m3
pmaddubsw m0, m5
pmaddubsw m6, [myq+16]
movh m7, [srcq] ; read new row
paddw m6, m0
mova m0, m1
mova m1, m2
mova m2, m3
mova m3, m4
mova m4, m7
punpcklbw m7, m3
pmaddubsw m7, m5
paddw m6, m7
pmulhrsw m6, [pw_512]
STORE m6, m7, %1
; go to next line
add dstq, dststrideq
add srcq, srcstrideq
dec heightd ; next row
jg .nextrow
REP_RET
cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg
%ifdef PIC
lea picregq, [sixtap_filter_hb_m]
%endif
mova m3, [filter_h6_shuf2]
mova m4, [filter_h6_shuf3]
LOAD mx, sixtap_filter_hb
mova m5, [mxq] ; set up 6tap filter in bytes
mova m6, [mxq+16]
mova m7, [filter_h6_shuf1]
.nextrow:
movu m0, [srcq-2]
mova m1, m0
mova m2, m0
pshufb m0, m7
pshufb m1, m3
pshufb m2, m4
pmaddubsw m0, m5
pmaddubsw m1, m6
pmaddubsw m2, m5
paddw m0, m1
paddw m0, m2
pmulhrsw m0, [pw_512]
STORE m0, m1, %1
; go to next line
add dstq, dststrideq
add srcq, srcstrideq
dec heightd ; next row
jg .nextrow
REP_RET
%endmacro
INIT_XMM ssse3
FILTER_SSSE3 put
FILTER_SSSE3 avg
; %1=5bits weights?, %2=dst %3=src1 %4=src3 %5=stride if sse2
%macro RV40_WCORE 4-5
movh m4, [%3 + r6 + 0]
movh m5, [%4 + r6 + 0]
%if %0 == 4
%define OFFSET r6 + mmsize / 2
%else
; 8x8 block and sse2, stride was provided
%define OFFSET r6
add r6, r5
%endif
movh m6, [%3 + OFFSET]
movh m7, [%4 + OFFSET]
%if %1 == 0
; 14bits weights
punpcklbw m4, m0
punpcklbw m5, m0
punpcklbw m6, m0
punpcklbw m7, m0
psllw m4, 7
psllw m5, 7
psllw m6, 7
psllw m7, 7
pmulhw m4, m3
pmulhw m5, m2
pmulhw m6, m3
pmulhw m7, m2
paddw m4, m5
paddw m6, m7
%else
; 5bits weights
%if cpuflag(ssse3)
punpcklbw m4, m5
punpcklbw m6, m7
pmaddubsw m4, m3
pmaddubsw m6, m3
%else
punpcklbw m4, m0
punpcklbw m5, m0
punpcklbw m6, m0
punpcklbw m7, m0
pmullw m4, m3
pmullw m5, m2
pmullw m6, m3
pmullw m7, m2
paddw m4, m5
paddw m6, m7
%endif
%endif
; bias and shift down
%if cpuflag(ssse3)
pmulhrsw m4, m1
pmulhrsw m6, m1
%else
paddw m4, m1
paddw m6, m1
psrlw m4, 5
psrlw m6, 5
%endif
packuswb m4, m6
%if %0 == 5
; Only called for 8x8 blocks and sse2
sub r6, r5
movh [%2 + r6], m4
add r6, r5
movhps [%2 + r6], m4
%else
mova [%2 + r6], m4
%endif
%endmacro
%macro MAIN_LOOP 2
%if mmsize == 8
RV40_WCORE %2, r0, r1, r2
%if %1 == 16
RV40_WCORE %2, r0 + 8, r1 + 8, r2 + 8
%endif
; Prepare for next loop
add r6, r5
%else
%ifidn %1, 8
RV40_WCORE %2, r0, r1, r2, r5
; Prepare 2 next lines
add r6, r5
%else
RV40_WCORE %2, r0, r1, r2
; Prepare single next line
add r6, r5
%endif
%endif
%endmacro
; rv40_weight_func_%1(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
; %1=size %2=num of xmm regs
; The weights are FP0.14 notation of fractions depending on pts.
; For timebases without rounding error (i.e. PAL), the fractions
; can be simplified, and several operations can be avoided.
; Therefore, we check here whether they are multiples of 2^9 for
; those simplifications to occur.
%macro RV40_WEIGHT 3
cglobal rv40_weight_func_%1_%2, 6, 7, 8
%if cpuflag(ssse3)
mova m1, [pw_1024]
%else
mova m1, [pw_16]
%endif
pxor m0, m0
; Set loop counter and increments
mov r6, r5
shl r6, %3
add r0, r6
add r1, r6
add r2, r6
neg r6
movd m2, r3d
movd m3, r4d
%ifidn %1,rnd
%define RND 0
SPLATW m2, m2
%else
%define RND 1
%if cpuflag(ssse3)
punpcklbw m3, m2
%else
SPLATW m2, m2
%endif
%endif
SPLATW m3, m3
.loop:
MAIN_LOOP %2, RND
jnz .loop
REP_RET
%endmacro
INIT_MMX mmxext
RV40_WEIGHT rnd, 8, 3
RV40_WEIGHT rnd, 16, 4
RV40_WEIGHT nornd, 8, 3
RV40_WEIGHT nornd, 16, 4
INIT_XMM sse2
RV40_WEIGHT rnd, 8, 3
RV40_WEIGHT rnd, 16, 4
RV40_WEIGHT nornd, 8, 3
RV40_WEIGHT nornd, 16, 4
INIT_XMM ssse3
RV40_WEIGHT rnd, 8, 3
RV40_WEIGHT rnd, 16, 4
RV40_WEIGHT nornd, 8, 3
RV40_WEIGHT nornd, 16, 4

View File

@@ -0,0 +1,270 @@
/*
* RV40 decoder motion compensation functions x86-optimised
* Copyright (c) 2008 Konstantin Shishkov
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/**
* @file
* RV40 decoder motion compensation functions x86-optimised
* 2,0 and 0,2 have h264 equivalents.
* 3,3 is bugged in the rv40 format and maps to _xy2 version
*/
#include "libavcodec/rv34dsp.h"
#include "libavutil/attributes.h"
#include "libavutil/mem.h"
#include "libavutil/x86/cpu.h"
#include "dsputil_x86.h"
#if HAVE_YASM
void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc8_mmxext(uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc8_3dnow(uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_put_rv40_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc4_mmxext(uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc4_3dnow(uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
#define DECLARE_WEIGHT(opt) \
void ff_rv40_weight_func_rnd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
int w1, int w2, ptrdiff_t stride); \
void ff_rv40_weight_func_rnd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
int w1, int w2, ptrdiff_t stride); \
void ff_rv40_weight_func_nornd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
int w1, int w2, ptrdiff_t stride); \
void ff_rv40_weight_func_nornd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
int w1, int w2, ptrdiff_t stride);
DECLARE_WEIGHT(mmxext)
DECLARE_WEIGHT(sse2)
DECLARE_WEIGHT(ssse3)
/** @{ */
/**
* Define one qpel function.
* LOOPSIZE must be already set to the number of pixels processed per
* iteration in the inner loop of the called functions.
* COFF(x) must be already defined so as to provide the offset into any
* array of coeffs used by the called function for the qpel position x.
*/
#define QPEL_FUNC_DECL(OP, SIZE, PH, PV, OPT) \
static void OP ## rv40_qpel ##SIZE ##_mc ##PH ##PV ##OPT(uint8_t *dst, \
uint8_t *src, \
ptrdiff_t stride) \
{ \
int i; \
if (PH && PV) { \
DECLARE_ALIGNED(16, uint8_t, tmp)[SIZE * (SIZE + 5)]; \
uint8_t *tmpptr = tmp + SIZE * 2; \
src -= stride * 2; \
\
for (i = 0; i < SIZE; i += LOOPSIZE) \
ff_put_rv40_qpel_h ##OPT(tmp + i, SIZE, src + i, stride, \
SIZE + 5, HCOFF(PH)); \
for (i = 0; i < SIZE; i += LOOPSIZE) \
ff_ ##OP ##rv40_qpel_v ##OPT(dst + i, stride, tmpptr + i, \
SIZE, SIZE, VCOFF(PV)); \
} else if (PV) { \
for (i = 0; i < SIZE; i += LOOPSIZE) \
ff_ ##OP ##rv40_qpel_v ## OPT(dst + i, stride, src + i, \
stride, SIZE, VCOFF(PV)); \
} else { \
for (i = 0; i < SIZE; i += LOOPSIZE) \
ff_ ##OP ##rv40_qpel_h ## OPT(dst + i, stride, src + i, \
stride, SIZE, HCOFF(PH)); \
} \
};
/** Declare functions for sizes 8 and 16 and given operations
* and qpel position. */
#define QPEL_FUNCS_DECL(OP, PH, PV, OPT) \
QPEL_FUNC_DECL(OP, 8, PH, PV, OPT) \
QPEL_FUNC_DECL(OP, 16, PH, PV, OPT)
/** Declare all functions for all sizes and qpel positions */
#define QPEL_MC_DECL(OP, OPT) \
void ff_ ##OP ##rv40_qpel_h ##OPT(uint8_t *dst, ptrdiff_t dstStride, \
const uint8_t *src, \
ptrdiff_t srcStride, \
int len, int m); \
void ff_ ##OP ##rv40_qpel_v ##OPT(uint8_t *dst, ptrdiff_t dstStride, \
const uint8_t *src, \
ptrdiff_t srcStride, \
int len, int m); \
QPEL_FUNCS_DECL(OP, 0, 1, OPT) \
QPEL_FUNCS_DECL(OP, 0, 3, OPT) \
QPEL_FUNCS_DECL(OP, 1, 0, OPT) \
QPEL_FUNCS_DECL(OP, 1, 1, OPT) \
QPEL_FUNCS_DECL(OP, 1, 2, OPT) \
QPEL_FUNCS_DECL(OP, 1, 3, OPT) \
QPEL_FUNCS_DECL(OP, 2, 1, OPT) \
QPEL_FUNCS_DECL(OP, 2, 2, OPT) \
QPEL_FUNCS_DECL(OP, 2, 3, OPT) \
QPEL_FUNCS_DECL(OP, 3, 0, OPT) \
QPEL_FUNCS_DECL(OP, 3, 1, OPT) \
QPEL_FUNCS_DECL(OP, 3, 2, OPT)
/** @} */
#define LOOPSIZE 8
#define HCOFF(x) (32 * (x - 1))
#define VCOFF(x) (32 * (x - 1))
QPEL_MC_DECL(put_, _ssse3)
QPEL_MC_DECL(avg_, _ssse3)
#undef LOOPSIZE
#undef HCOFF
#undef VCOFF
#define LOOPSIZE 8
#define HCOFF(x) (64 * (x - 1))
#define VCOFF(x) (64 * (x - 1))
QPEL_MC_DECL(put_, _sse2)
QPEL_MC_DECL(avg_, _sse2)
#if ARCH_X86_32
#undef LOOPSIZE
#undef HCOFF
#undef VCOFF
#define LOOPSIZE 4
#define HCOFF(x) (64 * (x - 1))
#define VCOFF(x) (64 * (x - 1))
QPEL_MC_DECL(put_, _mmx)
#define ff_put_rv40_qpel_h_mmxext ff_put_rv40_qpel_h_mmx
#define ff_put_rv40_qpel_v_mmxext ff_put_rv40_qpel_v_mmx
QPEL_MC_DECL(avg_, _mmxext)
#define ff_put_rv40_qpel_h_3dnow ff_put_rv40_qpel_h_mmx
#define ff_put_rv40_qpel_v_3dnow ff_put_rv40_qpel_v_mmx
QPEL_MC_DECL(avg_, _3dnow)
#endif
/** @{ */
/** Set one function */
#define QPEL_FUNC_SET(OP, SIZE, PH, PV, OPT) \
c-> OP ## pixels_tab[2 - SIZE / 8][4 * PV + PH] = OP ## rv40_qpel ##SIZE ## _mc ##PH ##PV ##OPT;
/** Set functions put and avg for sizes 8 and 16 and a given qpel position */
#define QPEL_FUNCS_SET(OP, PH, PV, OPT) \
QPEL_FUNC_SET(OP, 8, PH, PV, OPT) \
QPEL_FUNC_SET(OP, 16, PH, PV, OPT)
/** Set all functions for all sizes and qpel positions */
#define QPEL_MC_SET(OP, OPT) \
QPEL_FUNCS_SET (OP, 0, 1, OPT) \
QPEL_FUNCS_SET (OP, 0, 3, OPT) \
QPEL_FUNCS_SET (OP, 1, 0, OPT) \
QPEL_FUNCS_SET (OP, 1, 1, OPT) \
QPEL_FUNCS_SET (OP, 1, 2, OPT) \
QPEL_FUNCS_SET (OP, 1, 3, OPT) \
QPEL_FUNCS_SET (OP, 2, 1, OPT) \
QPEL_FUNCS_SET (OP, 2, 2, OPT) \
QPEL_FUNCS_SET (OP, 2, 3, OPT) \
QPEL_FUNCS_SET (OP, 3, 0, OPT) \
QPEL_FUNCS_SET (OP, 3, 1, OPT) \
QPEL_FUNCS_SET (OP, 3, 2, OPT)
/** @} */
#endif /* HAVE_YASM */
#if HAVE_MMX_INLINE
static void put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src,
ptrdiff_t stride)
{
ff_put_pixels8_xy2_mmx(dst, src, stride, 8);
}
static void put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src,
ptrdiff_t stride)
{
ff_put_pixels16_xy2_mmx(dst, src, stride, 16);
}
static void avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src,
ptrdiff_t stride)
{
ff_avg_pixels8_xy2_mmx(dst, src, stride, 8);
}
static void avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src,
ptrdiff_t stride)
{
ff_avg_pixels16_xy2_mmx(dst, src, stride, 16);
}
#endif /* HAVE_MMX_INLINE */
av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
#if HAVE_MMX_INLINE
if (INLINE_MMX(cpu_flags)) {
c->put_pixels_tab[0][15] = put_rv40_qpel16_mc33_mmx;
c->put_pixels_tab[1][15] = put_rv40_qpel8_mc33_mmx;
c->avg_pixels_tab[0][15] = avg_rv40_qpel16_mc33_mmx;
c->avg_pixels_tab[1][15] = avg_rv40_qpel8_mc33_mmx;
}
#endif /* HAVE_MMX_INLINE */
#if HAVE_YASM
if (EXTERNAL_MMX(cpu_flags)) {
c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_mmx;
c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_mmx;
#if ARCH_X86_32
QPEL_MC_SET(put_, _mmx)
#endif
}
if (EXTERNAL_AMD3DNOW(cpu_flags)) {
c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_3dnow;
c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_3dnow;
#if ARCH_X86_32
QPEL_MC_SET(avg_, _3dnow)
#endif
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmxext;
c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_mmxext;
c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_mmxext;
c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_mmxext;
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_mmxext;
c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_mmxext;
#if ARCH_X86_32
QPEL_MC_SET(avg_, _mmxext)
#endif
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2;
c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_sse2;
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_sse2;
c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_sse2;
QPEL_MC_SET(put_, _sse2)
QPEL_MC_SET(avg_, _sse2)
}
if (EXTERNAL_SSSE3(cpu_flags)) {
c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_ssse3;
c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_ssse3;
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_ssse3;
c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_ssse3;
QPEL_MC_SET(put_, _ssse3)
QPEL_MC_SET(avg_, _ssse3)
}
#endif /* HAVE_YASM */
}

View File

@@ -0,0 +1,425 @@
;******************************************************************************
;* AAC Spectral Band Replication decoding functions
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
; mask equivalent for multiply by -1.0 1.0
ps_mask times 2 dd 1<<31, 0
ps_mask2 times 2 dd 0, 1<<31
ps_neg times 4 dd 1<<31
ps_noise0 times 2 dd 1.0, 0.0,
ps_noise2 times 2 dd -1.0, 0.0
ps_noise13 dd 0.0, 1.0, 0.0, -1.0
dd 0.0, -1.0, 0.0, 1.0
dd 0.0, 1.0, 0.0, -1.0
cextern sbr_noise_table
SECTION_TEXT
INIT_XMM sse
cglobal sbr_sum_square, 2, 3, 6
mov r2, r1
xorps m0, m0
xorps m1, m1
sar r2, 3
jz .prepare
.loop:
movu m2, [r0 + 0]
movu m3, [r0 + 16]
movu m4, [r0 + 32]
movu m5, [r0 + 48]
mulps m2, m2
mulps m3, m3
mulps m4, m4
mulps m5, m5
addps m0, m2
addps m1, m3
addps m0, m4
addps m1, m5
add r0, 64
dec r2
jnz .loop
.prepare:
and r1, 7
sar r1, 1
jz .end
; len is a multiple of 2, thus there are at least 4 elements to process
.endloop:
movu m2, [r0]
add r0, 16
mulps m2, m2
dec r1
addps m0, m2
jnz .endloop
.end:
addps m0, m1
movhlps m2, m0
addps m0, m2
movss m1, m0
shufps m0, m0, 1
addss m0, m1
%if ARCH_X86_64 == 0
movss r0m, m0
fld dword r0m
%endif
RET
%define STEP 40*4*2
cglobal sbr_hf_g_filt, 5, 6, 5
lea r1, [r1 + 8*r4] ; offset by ixh elements into X_high
mov r5, r3
and r3, 0xFC
lea r2, [r2 + r3*4]
lea r0, [r0 + r3*8]
neg r3
jz .loop1
.loop4:
movlps m0, [r2 + 4*r3 + 0]
movlps m1, [r2 + 4*r3 + 8]
movlps m2, [r1 + 0*STEP]
movlps m3, [r1 + 2*STEP]
movhps m2, [r1 + 1*STEP]
movhps m3, [r1 + 3*STEP]
unpcklps m0, m0
unpcklps m1, m1
mulps m0, m2
mulps m1, m3
movu [r0 + 8*r3 + 0], m0
movu [r0 + 8*r3 + 16], m1
add r1, 4*STEP
add r3, 4
jnz .loop4
and r5, 3 ; number of single element loops
jz .end
.loop1: ; element 0 and 1 can be computed at the same time
movss m0, [r2]
movlps m2, [r1]
unpcklps m0, m0
mulps m2, m0
movlps [r0], m2
add r0, 8
add r2, 4
add r1, STEP
dec r5
jnz .loop1
.end:
RET
; static void sbr_hf_gen_c(float (*X_high)[2], const float (*X_low)[2],
; const float alpha0[2], const float alpha1[2],
; float bw, int start, int end)
;
cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
; load alpha factors
%define bw m0
%if ARCH_X86_64 == 0 || WIN64
movss bw, BWm
%endif
movlps m2, [alpha1q]
movlps m1, [alpha0q]
shufps bw, bw, 0
mulps m2, bw ; (a1[0] a1[1])*bw
mulps m1, bw ; (a0[0] a0[1])*bw = (a2 a3)
mulps m2, bw ; (a1[0] a1[1])*bw*bw = (a0 a1)
mova m3, m1
mova m4, m2
; Set pointers
%if ARCH_X86_64 == 0 || WIN64
; start and end 6th and 7th args on stack
mov r2d, Sm
mov r3d, Em
%define start r2q
%define end r3q
%else
; BW does not actually occupy a register, so shift by 1
%define start BWq
%define end Sq
%endif
sub start, end ; neg num of loops
lea X_highq, [X_highq + end*2*4]
lea X_lowq, [X_lowq + end*2*4 - 2*2*4]
shl start, 3 ; offset from num loops
mova m0, [X_lowq + start]
shufps m3, m3, q1111
shufps m4, m4, q1111
xorps m3, [ps_mask]
shufps m1, m1, q0000
shufps m2, m2, q0000
xorps m4, [ps_mask]
.loop2:
movu m7, [X_lowq + start + 8] ; BbCc
mova m6, m0
mova m5, m7
shufps m0, m0, q2301 ; aAbB
shufps m7, m7, q2301 ; bBcC
mulps m0, m4
mulps m7, m3
mulps m6, m2
mulps m5, m1
addps m7, m0
mova m0, [X_lowq + start +16] ; CcDd
addps m7, m0
addps m6, m5
addps m7, m6
mova [X_highq + start], m7
add start, 16
jnz .loop2
RET
cglobal sbr_sum64x5, 1,2,4,z
lea r1q, [zq+ 256]
.loop:
mova m0, [zq+ 0]
mova m2, [zq+ 16]
mova m1, [zq+ 256]
mova m3, [zq+ 272]
addps m0, [zq+ 512]
addps m2, [zq+ 528]
addps m1, [zq+ 768]
addps m3, [zq+ 784]
addps m0, [zq+1024]
addps m2, [zq+1040]
addps m0, m1
addps m2, m3
mova [zq], m0
mova [zq+16], m2
add zq, 32
cmp zq, r1q
jne .loop
REP_RET
INIT_XMM sse
cglobal sbr_qmf_post_shuffle, 2,3,4,W,z
lea r2q, [zq + (64-4)*4]
mova m3, [ps_neg]
.loop:
mova m1, [zq]
xorps m0, m3, [r2q]
shufps m0, m0, m0, q0123
unpcklps m2, m0, m1
unpckhps m0, m0, m1
mova [Wq + 0], m2
mova [Wq + 16], m0
add Wq, 32
sub r2q, 16
add zq, 16
cmp zq, r2q
jl .loop
REP_RET
INIT_XMM sse
cglobal sbr_neg_odd_64, 1,2,4,z
lea r1q, [zq+256]
.loop:
mova m0, [zq+ 0]
mova m1, [zq+16]
mova m2, [zq+32]
mova m3, [zq+48]
xorps m0, [ps_mask2]
xorps m1, [ps_mask2]
xorps m2, [ps_mask2]
xorps m3, [ps_mask2]
mova [zq+ 0], m0
mova [zq+16], m1
mova [zq+32], m2
mova [zq+48], m3
add zq, 64
cmp zq, r1q
jne .loop
REP_RET
; sbr_qmf_deint_bfly(float *v, const float *src0, const float *src1)
%macro SBR_QMF_DEINT_BFLY 0
cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
mov cq, 64*4-2*mmsize
lea vrevq, [vq + 64*4]
.loop:
mova m0, [src0q+cq]
mova m1, [src1q]
mova m4, [src0q+cq+mmsize]
mova m5, [src1q+mmsize]
%if cpuflag(sse2)
pshufd m2, m0, q0123
pshufd m3, m1, q0123
pshufd m6, m4, q0123
pshufd m7, m5, q0123
%else
shufps m2, m0, m0, q0123
shufps m3, m1, m1, q0123
shufps m6, m4, m4, q0123
shufps m7, m5, m5, q0123
%endif
addps m5, m2
subps m0, m7
addps m1, m6
subps m4, m3
mova [vrevq], m1
mova [vrevq+mmsize], m5
mova [vq+cq], m0
mova [vq+cq+mmsize], m4
add src1q, 2*mmsize
add vrevq, 2*mmsize
sub cq, 2*mmsize
jge .loop
REP_RET
%endmacro
INIT_XMM sse
SBR_QMF_DEINT_BFLY
INIT_XMM sse2
SBR_QMF_DEINT_BFLY
INIT_XMM sse2
cglobal sbr_qmf_pre_shuffle, 1,4,6,z
%define OFFSET (32*4-2*mmsize)
mov r3q, OFFSET
lea r1q, [zq + (32+1)*4]
lea r2q, [zq + 64*4]
mova m5, [ps_neg]
.loop:
movu m0, [r1q]
movu m2, [r1q + mmsize]
movu m1, [zq + r3q + 4 + mmsize]
movu m3, [zq + r3q + 4]
pxor m2, m5
pxor m0, m5
pshufd m2, m2, q0123
pshufd m0, m0, q0123
SBUTTERFLY dq, 2, 3, 4
SBUTTERFLY dq, 0, 1, 4
mova [r2q + 2*r3q + 0*mmsize], m2
mova [r2q + 2*r3q + 1*mmsize], m3
mova [r2q + 2*r3q + 2*mmsize], m0
mova [r2q + 2*r3q + 3*mmsize], m1
add r1q, 2*mmsize
sub r3q, 2*mmsize
jge .loop
movq m2, [zq]
movq [r2q], m2
REP_RET
%ifdef PIC
%define NREGS 1
%if UNIX64
%define NOISE_TABLE r6q ; r5q is m_max
%else
%define NOISE_TABLE r5q
%endif
%else
%define NREGS 0
%define NOISE_TABLE sbr_noise_table
%endif
%macro LOAD_NST 1
%ifdef PIC
lea NOISE_TABLE, [%1]
mova m0, [kxq + NOISE_TABLE]
%else
mova m0, [kxq + %1]
%endif
%endmacro
INIT_XMM sse2
; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
; const float *q_filt, int noise,
; int kx, int m_max)
cglobal sbr_hf_apply_noise_0, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
mova m0, [ps_noise0]
jmp apply_noise_main
; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
; const float *q_filt, int noise,
; int kx, int m_max)
cglobal sbr_hf_apply_noise_1, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
and kxq, 1
shl kxq, 4
LOAD_NST ps_noise13
jmp apply_noise_main
; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
; const float *q_filt, int noise,
; int kx, int m_max)
cglobal sbr_hf_apply_noise_2, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
mova m0, [ps_noise2]
jmp apply_noise_main
; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
; const float *q_filt, int noise,
; int kx, int m_max)
cglobal sbr_hf_apply_noise_3, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
and kxq, 1
shl kxq, 4
LOAD_NST ps_noise13+16
apply_noise_main:
%if ARCH_X86_64 == 0 || WIN64
mov kxd, m_maxm
%define count kxq
%else
%define count m_maxq
%endif
dec noiseq
shl count, 2
%ifdef PIC
lea NOISE_TABLE, [sbr_noise_table]
%endif
lea Yq, [Yq + 2*count]
add s_mq, count
add q_filtq, count
shl noiseq, 3
pxor m5, m5
neg count
.loop:
mova m1, [q_filtq + count]
movu m3, [noiseq + NOISE_TABLE + 1*mmsize]
movu m4, [noiseq + NOISE_TABLE + 2*mmsize]
add noiseq, 2*mmsize
and noiseq, 0x1ff<<3
punpckhdq m2, m1, m1
punpckldq m1, m1
mulps m1, m3 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
mulps m2, m4 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
mova m3, [s_mq + count]
; TODO: replace by a vpermd in AVX2
punpckhdq m4, m3, m3
punpckldq m3, m3
pcmpeqd m6, m3, m5 ; m6 == 0
pcmpeqd m7, m4, m5 ; m7 == 0
mulps m3, m0 ; s_m[m] * phi_sign
mulps m4, m0 ; s_m[m] * phi_sign
pand m1, m6
pand m2, m7
movu m6, [Yq + 2*count]
movu m7, [Yq + 2*count + mmsize]
addps m3, m1
addps m4, m2
addps m6, m3
addps m7, m4
movu [Yq + 2*count], m6
movu [Yq + 2*count + mmsize], m7
add count, mmsize
jl .loop
RET

View File

@@ -0,0 +1,76 @@
/*
* AAC Spectral Band Replication decoding functions
* Copyright (c) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/sbrdsp.h"
float ff_sbr_sum_square_sse(float (*x)[2], int n);
void ff_sbr_sum64x5_sse(float *z);
void ff_sbr_hf_g_filt_sse(float (*Y)[2], const float (*X_high)[40][2],
const float *g_filt, int m_max, intptr_t ixh);
void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2],
const float alpha0[2], const float alpha1[2],
float bw, int start, int end);
void ff_sbr_neg_odd_64_sse(float *z);
void ff_sbr_qmf_post_shuffle_sse(float W[32][2], const float *z);
void ff_sbr_qmf_deint_bfly_sse(float *v, const float *src0, const float *src1);
void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1);
void ff_sbr_qmf_pre_shuffle_sse2(float *z);
void ff_sbr_hf_apply_noise_0_sse2(float (*Y)[2], const float *s_m,
const float *q_filt, int noise,
int kx, int m_max);
void ff_sbr_hf_apply_noise_1_sse2(float (*Y)[2], const float *s_m,
const float *q_filt, int noise,
int kx, int m_max);
void ff_sbr_hf_apply_noise_2_sse2(float (*Y)[2], const float *s_m,
const float *q_filt, int noise,
int kx, int m_max);
void ff_sbr_hf_apply_noise_3_sse2(float (*Y)[2], const float *s_m,
const float *q_filt, int noise,
int kx, int m_max);
av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE(cpu_flags)) {
s->neg_odd_64 = ff_sbr_neg_odd_64_sse;
s->sum_square = ff_sbr_sum_square_sse;
s->sum64x5 = ff_sbr_sum64x5_sse;
s->hf_g_filt = ff_sbr_hf_g_filt_sse;
s->hf_gen = ff_sbr_hf_gen_sse;
s->qmf_post_shuffle = ff_sbr_qmf_post_shuffle_sse;
s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_sse;
}
if (EXTERNAL_SSE2(cpu_flags)) {
s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_sse2;
s->qmf_pre_shuffle = ff_sbr_qmf_pre_shuffle_sse2;
s->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_sse2;
s->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_sse2;
s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_sse2;
s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_sse2;
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,902 @@
/*
* MMX and SSE2 optimized snow DSP utils
* Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/snow.h"
#include "libavcodec/snow_dwt.h"
#include "dsputil_x86.h"
#if HAVE_INLINE_ASM
static void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, IDWTELEM *temp, int width){
const int w2= (width+1)>>1;
const int w_l= (width>>1);
const int w_r= w2 - 1;
int i;
{ // Lift 0
IDWTELEM * const ref = b + w2 - 1;
IDWTELEM b_0 = b[0]; //By allowing the first entry in b[0] to be calculated twice
// (the first time erroneously), we allow the SSE2 code to run an extra pass.
// The savings in code and time are well worth having to store this value and
// calculate b[0] correctly afterwards.
i = 0;
__asm__ volatile(
"pcmpeqd %%xmm7, %%xmm7 \n\t"
"pcmpeqd %%xmm3, %%xmm3 \n\t"
"psllw $1, %%xmm3 \n\t"
"paddw %%xmm7, %%xmm3 \n\t"
"psllw $13, %%xmm3 \n\t"
::);
for(; i<w_l-15; i+=16){
__asm__ volatile(
"movdqu (%1), %%xmm1 \n\t"
"movdqu 16(%1), %%xmm5 \n\t"
"movdqu 2(%1), %%xmm2 \n\t"
"movdqu 18(%1), %%xmm6 \n\t"
"paddw %%xmm1, %%xmm2 \n\t"
"paddw %%xmm5, %%xmm6 \n\t"
"paddw %%xmm7, %%xmm2 \n\t"
"paddw %%xmm7, %%xmm6 \n\t"
"pmulhw %%xmm3, %%xmm2 \n\t"
"pmulhw %%xmm3, %%xmm6 \n\t"
"paddw (%0), %%xmm2 \n\t"
"paddw 16(%0), %%xmm6 \n\t"
"movdqa %%xmm2, (%0) \n\t"
"movdqa %%xmm6, 16(%0) \n\t"
:: "r"(&b[i]), "r"(&ref[i])
: "memory"
);
}
snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
}
{ // Lift 1
IDWTELEM * const dst = b+w2;
i = 0;
for(; (((x86_reg)&dst[i]) & 0x1F) && i<w_r; i++){
dst[i] = dst[i] - (b[i] + b[i + 1]);
}
for(; i<w_r-15; i+=16){
__asm__ volatile(
"movdqu (%1), %%xmm1 \n\t"
"movdqu 16(%1), %%xmm5 \n\t"
"movdqu 2(%1), %%xmm2 \n\t"
"movdqu 18(%1), %%xmm6 \n\t"
"paddw %%xmm1, %%xmm2 \n\t"
"paddw %%xmm5, %%xmm6 \n\t"
"movdqa (%0), %%xmm0 \n\t"
"movdqa 16(%0), %%xmm4 \n\t"
"psubw %%xmm2, %%xmm0 \n\t"
"psubw %%xmm6, %%xmm4 \n\t"
"movdqa %%xmm0, (%0) \n\t"
"movdqa %%xmm4, 16(%0) \n\t"
:: "r"(&dst[i]), "r"(&b[i])
: "memory"
);
}
snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
}
{ // Lift 2
IDWTELEM * const ref = b+w2 - 1;
IDWTELEM b_0 = b[0];
i = 0;
__asm__ volatile(
"psllw $15, %%xmm7 \n\t"
"pcmpeqw %%xmm6, %%xmm6 \n\t"
"psrlw $13, %%xmm6 \n\t"
"paddw %%xmm7, %%xmm6 \n\t"
::);
for(; i<w_l-15; i+=16){
__asm__ volatile(
"movdqu (%1), %%xmm0 \n\t"
"movdqu 16(%1), %%xmm4 \n\t"
"movdqu 2(%1), %%xmm1 \n\t"
"movdqu 18(%1), %%xmm5 \n\t" //FIXME try aligned reads and shifts
"paddw %%xmm6, %%xmm0 \n\t"
"paddw %%xmm6, %%xmm4 \n\t"
"paddw %%xmm7, %%xmm1 \n\t"
"paddw %%xmm7, %%xmm5 \n\t"
"pavgw %%xmm1, %%xmm0 \n\t"
"pavgw %%xmm5, %%xmm4 \n\t"
"psubw %%xmm7, %%xmm0 \n\t"
"psubw %%xmm7, %%xmm4 \n\t"
"psraw $1, %%xmm0 \n\t"
"psraw $1, %%xmm4 \n\t"
"movdqa (%0), %%xmm1 \n\t"
"movdqa 16(%0), %%xmm5 \n\t"
"paddw %%xmm1, %%xmm0 \n\t"
"paddw %%xmm5, %%xmm4 \n\t"
"psraw $2, %%xmm0 \n\t"
"psraw $2, %%xmm4 \n\t"
"paddw %%xmm1, %%xmm0 \n\t"
"paddw %%xmm5, %%xmm4 \n\t"
"movdqa %%xmm0, (%0) \n\t"
"movdqa %%xmm4, 16(%0) \n\t"
:: "r"(&b[i]), "r"(&ref[i])
: "memory"
);
}
snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
b[0] = b_0 + ((2 * ref[1] + W_BO-1 + 4 * b_0) >> W_BS);
}
{ // Lift 3
IDWTELEM * const src = b+w2;
i = 0;
for(; (((x86_reg)&temp[i]) & 0x1F) && i<w_r; i++){
temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS);
}
for(; i<w_r-7; i+=8){
__asm__ volatile(
"movdqu 2(%1), %%xmm2 \n\t"
"movdqu 18(%1), %%xmm6 \n\t"
"paddw (%1), %%xmm2 \n\t"
"paddw 16(%1), %%xmm6 \n\t"
"movdqu (%0), %%xmm0 \n\t"
"movdqu 16(%0), %%xmm4 \n\t"
"paddw %%xmm2, %%xmm0 \n\t"
"paddw %%xmm6, %%xmm4 \n\t"
"psraw $1, %%xmm2 \n\t"
"psraw $1, %%xmm6 \n\t"
"paddw %%xmm0, %%xmm2 \n\t"
"paddw %%xmm4, %%xmm6 \n\t"
"movdqa %%xmm2, (%2) \n\t"
"movdqa %%xmm6, 16(%2) \n\t"
:: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
: "memory"
);
}
snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
}
{
snow_interleave_line_header(&i, width, b, temp);
for (; (i & 0x3E) != 0x3E; i-=2){
b[i+1] = temp[i>>1];
b[i] = b[i>>1];
}
for (i-=62; i>=0; i-=64){
__asm__ volatile(
"movdqa (%1), %%xmm0 \n\t"
"movdqa 16(%1), %%xmm2 \n\t"
"movdqa 32(%1), %%xmm4 \n\t"
"movdqa 48(%1), %%xmm6 \n\t"
"movdqa (%1), %%xmm1 \n\t"
"movdqa 16(%1), %%xmm3 \n\t"
"movdqa 32(%1), %%xmm5 \n\t"
"movdqa 48(%1), %%xmm7 \n\t"
"punpcklwd (%2), %%xmm0 \n\t"
"punpcklwd 16(%2), %%xmm2 \n\t"
"punpcklwd 32(%2), %%xmm4 \n\t"
"punpcklwd 48(%2), %%xmm6 \n\t"
"movdqa %%xmm0, (%0) \n\t"
"movdqa %%xmm2, 32(%0) \n\t"
"movdqa %%xmm4, 64(%0) \n\t"
"movdqa %%xmm6, 96(%0) \n\t"
"punpckhwd (%2), %%xmm1 \n\t"
"punpckhwd 16(%2), %%xmm3 \n\t"
"punpckhwd 32(%2), %%xmm5 \n\t"
"punpckhwd 48(%2), %%xmm7 \n\t"
"movdqa %%xmm1, 16(%0) \n\t"
"movdqa %%xmm3, 48(%0) \n\t"
"movdqa %%xmm5, 80(%0) \n\t"
"movdqa %%xmm7, 112(%0) \n\t"
:: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1])
: "memory"
);
}
}
}
static void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, IDWTELEM *temp, int width){
const int w2= (width+1)>>1;
const int w_l= (width>>1);
const int w_r= w2 - 1;
int i;
{ // Lift 0
IDWTELEM * const ref = b + w2 - 1;
i = 1;
b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
__asm__ volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
"pcmpeqw %%mm3, %%mm3 \n\t"
"psllw $1, %%mm3 \n\t"
"paddw %%mm7, %%mm3 \n\t"
"psllw $13, %%mm3 \n\t"
::);
for(; i<w_l-7; i+=8){
__asm__ volatile(
"movq (%1), %%mm2 \n\t"
"movq 8(%1), %%mm6 \n\t"
"paddw 2(%1), %%mm2 \n\t"
"paddw 10(%1), %%mm6 \n\t"
"paddw %%mm7, %%mm2 \n\t"
"paddw %%mm7, %%mm6 \n\t"
"pmulhw %%mm3, %%mm2 \n\t"
"pmulhw %%mm3, %%mm6 \n\t"
"paddw (%0), %%mm2 \n\t"
"paddw 8(%0), %%mm6 \n\t"
"movq %%mm2, (%0) \n\t"
"movq %%mm6, 8(%0) \n\t"
:: "r"(&b[i]), "r"(&ref[i])
: "memory"
);
}
snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
}
{ // Lift 1
IDWTELEM * const dst = b+w2;
i = 0;
for(; i<w_r-7; i+=8){
__asm__ volatile(
"movq (%1), %%mm2 \n\t"
"movq 8(%1), %%mm6 \n\t"
"paddw 2(%1), %%mm2 \n\t"
"paddw 10(%1), %%mm6 \n\t"
"movq (%0), %%mm0 \n\t"
"movq 8(%0), %%mm4 \n\t"
"psubw %%mm2, %%mm0 \n\t"
"psubw %%mm6, %%mm4 \n\t"
"movq %%mm0, (%0) \n\t"
"movq %%mm4, 8(%0) \n\t"
:: "r"(&dst[i]), "r"(&b[i])
: "memory"
);
}
snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
}
{ // Lift 2
IDWTELEM * const ref = b+w2 - 1;
i = 1;
b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
__asm__ volatile(
"psllw $15, %%mm7 \n\t"
"pcmpeqw %%mm6, %%mm6 \n\t"
"psrlw $13, %%mm6 \n\t"
"paddw %%mm7, %%mm6 \n\t"
::);
for(; i<w_l-7; i+=8){
__asm__ volatile(
"movq (%1), %%mm0 \n\t"
"movq 8(%1), %%mm4 \n\t"
"movq 2(%1), %%mm1 \n\t"
"movq 10(%1), %%mm5 \n\t"
"paddw %%mm6, %%mm0 \n\t"
"paddw %%mm6, %%mm4 \n\t"
"paddw %%mm7, %%mm1 \n\t"
"paddw %%mm7, %%mm5 \n\t"
"pavgw %%mm1, %%mm0 \n\t"
"pavgw %%mm5, %%mm4 \n\t"
"psubw %%mm7, %%mm0 \n\t"
"psubw %%mm7, %%mm4 \n\t"
"psraw $1, %%mm0 \n\t"
"psraw $1, %%mm4 \n\t"
"movq (%0), %%mm1 \n\t"
"movq 8(%0), %%mm5 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"paddw %%mm5, %%mm4 \n\t"
"psraw $2, %%mm0 \n\t"
"psraw $2, %%mm4 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"paddw %%mm5, %%mm4 \n\t"
"movq %%mm0, (%0) \n\t"
"movq %%mm4, 8(%0) \n\t"
:: "r"(&b[i]), "r"(&ref[i])
: "memory"
);
}
snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
}
{ // Lift 3
IDWTELEM * const src = b+w2;
i = 0;
for(; i<w_r-7; i+=8){
__asm__ volatile(
"movq 2(%1), %%mm2 \n\t"
"movq 10(%1), %%mm6 \n\t"
"paddw (%1), %%mm2 \n\t"
"paddw 8(%1), %%mm6 \n\t"
"movq (%0), %%mm0 \n\t"
"movq 8(%0), %%mm4 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"paddw %%mm6, %%mm4 \n\t"
"psraw $1, %%mm2 \n\t"
"psraw $1, %%mm6 \n\t"
"paddw %%mm0, %%mm2 \n\t"
"paddw %%mm4, %%mm6 \n\t"
"movq %%mm2, (%2) \n\t"
"movq %%mm6, 8(%2) \n\t"
:: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
: "memory"
);
}
snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
}
{
snow_interleave_line_header(&i, width, b, temp);
for (; (i & 0x1E) != 0x1E; i-=2){
b[i+1] = temp[i>>1];
b[i] = b[i>>1];
}
for (i-=30; i>=0; i-=32){
__asm__ volatile(
"movq (%1), %%mm0 \n\t"
"movq 8(%1), %%mm2 \n\t"
"movq 16(%1), %%mm4 \n\t"
"movq 24(%1), %%mm6 \n\t"
"movq (%1), %%mm1 \n\t"
"movq 8(%1), %%mm3 \n\t"
"movq 16(%1), %%mm5 \n\t"
"movq 24(%1), %%mm7 \n\t"
"punpcklwd (%2), %%mm0 \n\t"
"punpcklwd 8(%2), %%mm2 \n\t"
"punpcklwd 16(%2), %%mm4 \n\t"
"punpcklwd 24(%2), %%mm6 \n\t"
"movq %%mm0, (%0) \n\t"
"movq %%mm2, 16(%0) \n\t"
"movq %%mm4, 32(%0) \n\t"
"movq %%mm6, 48(%0) \n\t"
"punpckhwd (%2), %%mm1 \n\t"
"punpckhwd 8(%2), %%mm3 \n\t"
"punpckhwd 16(%2), %%mm5 \n\t"
"punpckhwd 24(%2), %%mm7 \n\t"
"movq %%mm1, 8(%0) \n\t"
"movq %%mm3, 24(%0) \n\t"
"movq %%mm5, 40(%0) \n\t"
"movq %%mm7, 56(%0) \n\t"
:: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1])
: "memory"
);
}
}
}
#if HAVE_7REGS
#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\
""op" ("r",%%"REG_d"), %%"t0" \n\t"\
""op" 16("r",%%"REG_d"), %%"t1" \n\t"\
""op" 32("r",%%"REG_d"), %%"t2" \n\t"\
""op" 48("r",%%"REG_d"), %%"t3" \n\t"
#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\
snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3)
#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\
snow_vertical_compose_sse2_load_add("paddw",r,t0,t1,t2,t3)
#define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
"psubw %%"s0", %%"t0" \n\t"\
"psubw %%"s1", %%"t1" \n\t"\
"psubw %%"s2", %%"t2" \n\t"\
"psubw %%"s3", %%"t3" \n\t"
#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\
"movdqa %%"s0", ("w",%%"REG_d") \n\t"\
"movdqa %%"s1", 16("w",%%"REG_d") \n\t"\
"movdqa %%"s2", 32("w",%%"REG_d") \n\t"\
"movdqa %%"s3", 48("w",%%"REG_d") \n\t"
#define snow_vertical_compose_sra(n,t0,t1,t2,t3)\
"psraw $"n", %%"t0" \n\t"\
"psraw $"n", %%"t1" \n\t"\
"psraw $"n", %%"t2" \n\t"\
"psraw $"n", %%"t3" \n\t"
#define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
"paddw %%"s0", %%"t0" \n\t"\
"paddw %%"s1", %%"t1" \n\t"\
"paddw %%"s2", %%"t2" \n\t"\
"paddw %%"s3", %%"t3" \n\t"
#define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\
"pmulhw %%"s0", %%"t0" \n\t"\
"pmulhw %%"s1", %%"t1" \n\t"\
"pmulhw %%"s2", %%"t2" \n\t"\
"pmulhw %%"s3", %%"t3" \n\t"
#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\
"movdqa %%"s0", %%"t0" \n\t"\
"movdqa %%"s1", %%"t1" \n\t"\
"movdqa %%"s2", %%"t2" \n\t"\
"movdqa %%"s3", %%"t3" \n\t"
static void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
x86_reg i = width;
while(i & 0x1F)
{
i--;
b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
}
i+=i;
__asm__ volatile (
"jmp 2f \n\t"
"1: \n\t"
snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6")
"pcmpeqw %%xmm0, %%xmm0 \n\t"
"pcmpeqw %%xmm2, %%xmm2 \n\t"
"paddw %%xmm2, %%xmm2 \n\t"
"paddw %%xmm0, %%xmm2 \n\t"
"psllw $13, %%xmm2 \n\t"
snow_vertical_compose_r2r_add("xmm0","xmm0","xmm0","xmm0","xmm1","xmm3","xmm5","xmm7")
snow_vertical_compose_r2r_pmulhw("xmm2","xmm2","xmm2","xmm2","xmm1","xmm3","xmm5","xmm7")
snow_vertical_compose_sse2_add("%5","xmm1","xmm3","xmm5","xmm7")
snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7")
snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7")
snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6")
"pcmpeqw %%xmm7, %%xmm7 \n\t"
"pcmpeqw %%xmm5, %%xmm5 \n\t"
"psllw $15, %%xmm7 \n\t"
"psrlw $13, %%xmm5 \n\t"
"paddw %%xmm7, %%xmm5 \n\t"
snow_vertical_compose_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6")
"movq (%2,%%"REG_d"), %%xmm1 \n\t"
"movq 8(%2,%%"REG_d"), %%xmm3 \n\t"
"paddw %%xmm7, %%xmm1 \n\t"
"paddw %%xmm7, %%xmm3 \n\t"
"pavgw %%xmm1, %%xmm0 \n\t"
"pavgw %%xmm3, %%xmm2 \n\t"
"movq 16(%2,%%"REG_d"), %%xmm1 \n\t"
"movq 24(%2,%%"REG_d"), %%xmm3 \n\t"
"paddw %%xmm7, %%xmm1 \n\t"
"paddw %%xmm7, %%xmm3 \n\t"
"pavgw %%xmm1, %%xmm4 \n\t"
"pavgw %%xmm3, %%xmm6 \n\t"
snow_vertical_compose_r2r_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sra("2","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_store("%3","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_add("%1","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_add("%2","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_store("%2","xmm0","xmm2","xmm4","xmm6")
"2: \n\t"
"sub $64, %%"REG_d" \n\t"
"jge 1b \n\t"
:"+d"(i)
:"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
}
#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\
""op" ("r",%%"REG_d"), %%"t0" \n\t"\
""op" 8("r",%%"REG_d"), %%"t1" \n\t"\
""op" 16("r",%%"REG_d"), %%"t2" \n\t"\
""op" 24("r",%%"REG_d"), %%"t3" \n\t"
#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\
snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3)
#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\
snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3)
#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\
"movq %%"s0", ("w",%%"REG_d") \n\t"\
"movq %%"s1", 8("w",%%"REG_d") \n\t"\
"movq %%"s2", 16("w",%%"REG_d") \n\t"\
"movq %%"s3", 24("w",%%"REG_d") \n\t"
#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\
"movq %%"s0", %%"t0" \n\t"\
"movq %%"s1", %%"t1" \n\t"\
"movq %%"s2", %%"t2" \n\t"\
"movq %%"s3", %%"t3" \n\t"
static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
x86_reg i = width;
while(i & 15)
{
i--;
b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
}
i+=i;
__asm__ volatile(
"jmp 2f \n\t"
"1: \n\t"
snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7")
snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7")
"pcmpeqw %%mm0, %%mm0 \n\t"
"pcmpeqw %%mm2, %%mm2 \n\t"
"paddw %%mm2, %%mm2 \n\t"
"paddw %%mm0, %%mm2 \n\t"
"psllw $13, %%mm2 \n\t"
snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7")
snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7")
snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7")
snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7")
snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7")
snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6")
"pcmpeqw %%mm7, %%mm7 \n\t"
"pcmpeqw %%mm5, %%mm5 \n\t"
"psllw $15, %%mm7 \n\t"
"psrlw $13, %%mm5 \n\t"
"paddw %%mm7, %%mm5 \n\t"
snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6")
"movq (%2,%%"REG_d"), %%mm1 \n\t"
"movq 8(%2,%%"REG_d"), %%mm3 \n\t"
"paddw %%mm7, %%mm1 \n\t"
"paddw %%mm7, %%mm3 \n\t"
"pavgw %%mm1, %%mm0 \n\t"
"pavgw %%mm3, %%mm2 \n\t"
"movq 16(%2,%%"REG_d"), %%mm1 \n\t"
"movq 24(%2,%%"REG_d"), %%mm3 \n\t"
"paddw %%mm7, %%mm1 \n\t"
"paddw %%mm7, %%mm3 \n\t"
"pavgw %%mm1, %%mm4 \n\t"
"pavgw %%mm3, %%mm6 \n\t"
snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6")
snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6")
"2: \n\t"
"sub $32, %%"REG_d" \n\t"
"jge 1b \n\t"
:"+d"(i)
:"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
}
#endif //HAVE_7REGS
#define snow_inner_add_yblock_sse2_header \
IDWTELEM * * dst_array = sb->line + src_y;\
x86_reg tmp;\
__asm__ volatile(\
"mov %7, %%"REG_c" \n\t"\
"mov %6, %2 \n\t"\
"mov %4, %%"REG_S" \n\t"\
"pxor %%xmm7, %%xmm7 \n\t" /* 0 */\
"pcmpeqd %%xmm3, %%xmm3 \n\t"\
"psllw $15, %%xmm3 \n\t"\
"psrlw $12, %%xmm3 \n\t" /* FRAC_BITS >> 1 */\
"1: \n\t"\
"mov %1, %%"REG_D" \n\t"\
"mov (%%"REG_D"), %%"REG_D" \n\t"\
"add %3, %%"REG_D" \n\t"
#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\
"mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
"movq (%%"REG_d"), %%"out_reg1" \n\t"\
"movq (%%"REG_d", %%"REG_c"), %%"out_reg2" \n\t"\
"punpcklbw %%xmm7, %%"out_reg1" \n\t"\
"punpcklbw %%xmm7, %%"out_reg2" \n\t"\
"movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
"movq "s_offset"+16(%%"REG_S"), %%xmm4 \n\t"\
"punpcklbw %%xmm7, %%xmm0 \n\t"\
"punpcklbw %%xmm7, %%xmm4 \n\t"\
"pmullw %%xmm0, %%"out_reg1" \n\t"\
"pmullw %%xmm4, %%"out_reg2" \n\t"
#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\
"mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
"movq (%%"REG_d"), %%"out_reg1" \n\t"\
"movq 8(%%"REG_d"), %%"out_reg2" \n\t"\
"punpcklbw %%xmm7, %%"out_reg1" \n\t"\
"punpcklbw %%xmm7, %%"out_reg2" \n\t"\
"movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
"movq "s_offset"+8(%%"REG_S"), %%xmm4 \n\t"\
"punpcklbw %%xmm7, %%xmm0 \n\t"\
"punpcklbw %%xmm7, %%xmm4 \n\t"\
"pmullw %%xmm0, %%"out_reg1" \n\t"\
"pmullw %%xmm4, %%"out_reg2" \n\t"
#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \
snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\
"paddusw %%xmm2, %%xmm1 \n\t"\
"paddusw %%xmm6, %%xmm5 \n\t"
#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \
snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\
"paddusw %%xmm2, %%xmm1 \n\t"\
"paddusw %%xmm6, %%xmm5 \n\t"
#define snow_inner_add_yblock_sse2_end_common1\
"add $32, %%"REG_S" \n\t"\
"add %%"REG_c", %0 \n\t"\
"add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
"add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
"add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
"add %%"REG_c", (%%"REG_a") \n\t"
#define snow_inner_add_yblock_sse2_end_common2\
"jnz 1b \n\t"\
:"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
:\
"rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
"%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
#define snow_inner_add_yblock_sse2_end_8\
"sal $1, %%"REG_c" \n\t"\
"add"OPSIZE" $"PTR_SIZE"*2, %1 \n\t"\
snow_inner_add_yblock_sse2_end_common1\
"sar $1, %%"REG_c" \n\t"\
"sub $2, %2 \n\t"\
snow_inner_add_yblock_sse2_end_common2
#define snow_inner_add_yblock_sse2_end_16\
"add"OPSIZE" $"PTR_SIZE"*1, %1 \n\t"\
snow_inner_add_yblock_sse2_end_common1\
"dec %2 \n\t"\
snow_inner_add_yblock_sse2_end_common2
static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_sse2_header
snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0")
snow_inner_add_yblock_sse2_accum_8("2", "8")
snow_inner_add_yblock_sse2_accum_8("1", "128")
snow_inner_add_yblock_sse2_accum_8("0", "136")
"mov %0, %%"REG_d" \n\t"
"movdqa (%%"REG_D"), %%xmm0 \n\t"
"movdqa %%xmm1, %%xmm2 \n\t"
"punpckhwd %%xmm7, %%xmm1 \n\t"
"punpcklwd %%xmm7, %%xmm2 \n\t"
"paddd %%xmm2, %%xmm0 \n\t"
"movdqa 16(%%"REG_D"), %%xmm2 \n\t"
"paddd %%xmm1, %%xmm2 \n\t"
"paddd %%xmm3, %%xmm0 \n\t"
"paddd %%xmm3, %%xmm2 \n\t"
"mov %1, %%"REG_D" \n\t"
"mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t"
"add %3, %%"REG_D" \n\t"
"movdqa (%%"REG_D"), %%xmm4 \n\t"
"movdqa %%xmm5, %%xmm6 \n\t"
"punpckhwd %%xmm7, %%xmm5 \n\t"
"punpcklwd %%xmm7, %%xmm6 \n\t"
"paddd %%xmm6, %%xmm4 \n\t"
"movdqa 16(%%"REG_D"), %%xmm6 \n\t"
"paddd %%xmm5, %%xmm6 \n\t"
"paddd %%xmm3, %%xmm4 \n\t"
"paddd %%xmm3, %%xmm6 \n\t"
"psrad $8, %%xmm0 \n\t" /* FRAC_BITS. */
"psrad $8, %%xmm2 \n\t" /* FRAC_BITS. */
"packssdw %%xmm2, %%xmm0 \n\t"
"packuswb %%xmm7, %%xmm0 \n\t"
"movq %%xmm0, (%%"REG_d") \n\t"
"psrad $8, %%xmm4 \n\t" /* FRAC_BITS. */
"psrad $8, %%xmm6 \n\t" /* FRAC_BITS. */
"packssdw %%xmm6, %%xmm4 \n\t"
"packuswb %%xmm7, %%xmm4 \n\t"
"movq %%xmm4, (%%"REG_d",%%"REG_c");\n\t"
snow_inner_add_yblock_sse2_end_8
}
static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_sse2_header
snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0")
snow_inner_add_yblock_sse2_accum_16("2", "16")
snow_inner_add_yblock_sse2_accum_16("1", "512")
snow_inner_add_yblock_sse2_accum_16("0", "528")
"mov %0, %%"REG_d" \n\t"
"psrlw $4, %%xmm1 \n\t"
"psrlw $4, %%xmm5 \n\t"
"paddw (%%"REG_D"), %%xmm1 \n\t"
"paddw 16(%%"REG_D"), %%xmm5 \n\t"
"paddw %%xmm3, %%xmm1 \n\t"
"paddw %%xmm3, %%xmm5 \n\t"
"psraw $4, %%xmm1 \n\t" /* FRAC_BITS. */
"psraw $4, %%xmm5 \n\t" /* FRAC_BITS. */
"packuswb %%xmm5, %%xmm1 \n\t"
"movdqu %%xmm1, (%%"REG_d") \n\t"
snow_inner_add_yblock_sse2_end_16
}
#define snow_inner_add_yblock_mmx_header \
IDWTELEM * * dst_array = sb->line + src_y;\
x86_reg tmp;\
__asm__ volatile(\
"mov %7, %%"REG_c" \n\t"\
"mov %6, %2 \n\t"\
"mov %4, %%"REG_S" \n\t"\
"pxor %%mm7, %%mm7 \n\t" /* 0 */\
"pcmpeqd %%mm3, %%mm3 \n\t"\
"psllw $15, %%mm3 \n\t"\
"psrlw $12, %%mm3 \n\t" /* FRAC_BITS >> 1 */\
"1: \n\t"\
"mov %1, %%"REG_D" \n\t"\
"mov (%%"REG_D"), %%"REG_D" \n\t"\
"add %3, %%"REG_D" \n\t"
#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\
"mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
"movd "d_offset"(%%"REG_d"), %%"out_reg1" \n\t"\
"movd "d_offset"+4(%%"REG_d"), %%"out_reg2" \n\t"\
"punpcklbw %%mm7, %%"out_reg1" \n\t"\
"punpcklbw %%mm7, %%"out_reg2" \n\t"\
"movd "s_offset"(%%"REG_S"), %%mm0 \n\t"\
"movd "s_offset"+4(%%"REG_S"), %%mm4 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
"pmullw %%mm0, %%"out_reg1" \n\t"\
"pmullw %%mm4, %%"out_reg2" \n\t"
#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \
snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\
"paddusw %%mm2, %%mm1 \n\t"\
"paddusw %%mm6, %%mm5 \n\t"
#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\
"mov %0, %%"REG_d" \n\t"\
"psrlw $4, %%mm1 \n\t"\
"psrlw $4, %%mm5 \n\t"\
"paddw "read_offset"(%%"REG_D"), %%mm1 \n\t"\
"paddw "read_offset"+8(%%"REG_D"), %%mm5 \n\t"\
"paddw %%mm3, %%mm1 \n\t"\
"paddw %%mm3, %%mm5 \n\t"\
"psraw $4, %%mm1 \n\t"\
"psraw $4, %%mm5 \n\t"\
"packuswb %%mm5, %%mm1 \n\t"\
"movq %%mm1, "write_offset"(%%"REG_d") \n\t"
#define snow_inner_add_yblock_mmx_end(s_step)\
"add $"s_step", %%"REG_S" \n\t"\
"add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
"add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
"add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
"add %%"REG_c", (%%"REG_a") \n\t"\
"add"OPSIZE " $"PTR_SIZE"*1, %1 \n\t"\
"add %%"REG_c", %0 \n\t"\
"dec %2 \n\t"\
"jnz 1b \n\t"\
:"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
:\
"rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
"%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_mmx_header
snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
snow_inner_add_yblock_mmx_accum("2", "8", "0")
snow_inner_add_yblock_mmx_accum("1", "128", "0")
snow_inner_add_yblock_mmx_accum("0", "136", "0")
snow_inner_add_yblock_mmx_mix("0", "0")
snow_inner_add_yblock_mmx_end("16")
}
static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_mmx_header
snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
snow_inner_add_yblock_mmx_accum("2", "16", "0")
snow_inner_add_yblock_mmx_accum("1", "512", "0")
snow_inner_add_yblock_mmx_accum("0", "528", "0")
snow_inner_add_yblock_mmx_mix("0", "0")
snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8")
snow_inner_add_yblock_mmx_accum("2", "24", "8")
snow_inner_add_yblock_mmx_accum("1", "520", "8")
snow_inner_add_yblock_mmx_accum("0", "536", "8")
snow_inner_add_yblock_mmx_mix("16", "8")
snow_inner_add_yblock_mmx_end("32")
}
static void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
if (b_w == 16)
inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
else if (b_w == 8 && obmc_stride == 16) {
if (!(b_h & 1))
inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
else
inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
} else
ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
}
static void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
if (b_w == 16)
inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
else if (b_w == 8 && obmc_stride == 16)
inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
else
ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
}
#endif /* HAVE_INLINE_ASM */
void ff_dwt_init_x86(SnowDWTContext *c)
{
#if HAVE_INLINE_ASM
int mm_flags = av_get_cpu_flags();
if (mm_flags & AV_CPU_FLAG_MMX) {
if(mm_flags & AV_CPU_FLAG_SSE2 & 0){
c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
#if HAVE_7REGS
c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
#endif
c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
}
else{
if (mm_flags & AV_CPU_FLAG_MMXEXT) {
c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
#if HAVE_7REGS
c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
#endif
}
c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
}
}
#endif /* HAVE_INLINE_ASM */
}

View File

@@ -0,0 +1,48 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/cpu.h"
#include "libavcodec/v210dec.h"
extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
extern void ff_v210_planar_unpack_unaligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
extern void ff_v210_planar_unpack_aligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
av_cold void v210_x86_init(V210DecContext *s)
{
int cpu_flags = av_get_cpu_flags();
#if HAVE_YASM
if (s->aligned_input) {
if (cpu_flags & AV_CPU_FLAG_SSSE3)
s->unpack_frame = ff_v210_planar_unpack_aligned_ssse3;
if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX)
s->unpack_frame = ff_v210_planar_unpack_aligned_avx;
}
else {
if (cpu_flags & AV_CPU_FLAG_SSSE3)
s->unpack_frame = ff_v210_planar_unpack_unaligned_ssse3;
if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX)
s->unpack_frame = ff_v210_planar_unpack_unaligned_avx;
}
#endif
}

View File

@@ -0,0 +1,88 @@
;******************************************************************************
;* V210 SIMD unpack
;* Copyright (c) 2011 Loren Merritt <lorenm@u.washington.edu>
;* Copyright (c) 2011 Kieran Kunhya <kieran@kunhya.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
v210_mask: times 4 dd 0x3ff
v210_mult: dw 64,4,64,4,64,4,64,4
v210_luma_shuf: db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1
v210_chroma_shuf: db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1
SECTION .text
%macro v210_planar_unpack 2
; v210_planar_unpack(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
cglobal v210_planar_unpack_%1_%2, 5, 5, 7
movsxdifnidn r4, r4d
lea r1, [r1+2*r4]
add r2, r4
add r3, r4
neg r4
mova m3, [v210_mult]
mova m4, [v210_mask]
mova m5, [v210_luma_shuf]
mova m6, [v210_chroma_shuf]
.loop
%ifidn %1, unaligned
movu m0, [r0]
%else
mova m0, [r0]
%endif
pmullw m1, m0, m3
psrld m0, 10
psrlw m1, 6 ; u0 v0 y1 y2 v1 u2 y4 y5
pand m0, m4 ; y0 __ u1 __ y3 __ v2 __
shufps m2, m1, m0, 0x8d ; y1 y2 y4 y5 y0 __ y3 __
pshufb m2, m5 ; y0 y1 y2 y3 y4 y5 __ __
movu [r1+2*r4], m2
shufps m1, m0, 0xd8 ; u0 v0 v1 u2 u1 __ v2 __
pshufb m1, m6 ; u0 u1 u2 __ v0 v1 v2 __
movq [r2+r4], m1
movhps [r3+r4], m1
add r0, mmsize
add r4, 6
jl .loop
REP_RET
%endmacro
INIT_XMM
v210_planar_unpack unaligned, ssse3
%if HAVE_AVX_EXTERNAL
INIT_AVX
v210_planar_unpack unaligned, avx
%endif
INIT_XMM
v210_planar_unpack aligned, ssse3
%if HAVE_AVX_EXTERNAL
INIT_AVX
v210_planar_unpack aligned, avx
%endif

View File

@@ -0,0 +1,317 @@
;******************************************************************************
;* VC1 deblocking optimizations
;* Copyright (c) 2009 David Conrad
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
cextern pw_4
cextern pw_5
section .text
; dst_low, dst_high (src), zero
; zero-extends one vector from 8 to 16 bits
%macro UNPACK_8TO16 4
mova m%2, m%3
punpckh%1 m%3, m%4
punpckl%1 m%2, m%4
%endmacro
%macro STORE_4_WORDS 6
%if cpuflag(sse4)
pextrw %1, %5, %6+0
pextrw %2, %5, %6+1
pextrw %3, %5, %6+2
pextrw %4, %5, %6+3
%else
movd %6d, %5
%if mmsize==16
psrldq %5, 4
%else
psrlq %5, 32
%endif
mov %1, %6w
shr %6, 16
mov %2, %6w
movd %6d, %5
mov %3, %6w
shr %6, 16
mov %4, %6w
%endif
%endmacro
; in: p1 p0 q0 q1, clobbers p0
; out: p1 = (2*(p1 - q1) - 5*(p0 - q0) + 4) >> 3
%macro VC1_LOOP_FILTER_A0 4
psubw %1, %4
psubw %2, %3
paddw %1, %1
pmullw %2, [pw_5]
psubw %1, %2
paddw %1, [pw_4]
psraw %1, 3
%endmacro
; in: p0 q0 a0 a1 a2
; m0 m1 m7 m6 m5
; %1: size
; out: m0=p0' m1=q0'
%macro VC1_FILTER 1
PABSW m4, m7
PABSW m3, m6
PABSW m2, m5
mova m6, m4
pminsw m3, m2
pcmpgtw m6, m3 ; if (a2 < a0 || a1 < a0)
psubw m3, m4
pmullw m3, [pw_5] ; 5*(a3 - a0)
PABSW m2, m3
psraw m2, 3 ; abs(d/8)
pxor m7, m3 ; d_sign ^= a0_sign
pxor m5, m5
movd m3, r2d
%if %1 > 4
punpcklbw m3, m3
%endif
punpcklbw m3, m5
pcmpgtw m3, m4 ; if (a0 < pq)
pand m6, m3
mova m3, m0
psubw m3, m1
PABSW m4, m3
psraw m4, 1
pxor m3, m7 ; d_sign ^ clip_sign
psraw m3, 15
pminsw m2, m4 ; min(d, clip)
pcmpgtw m4, m5
pand m6, m4 ; filt3 (C return value)
; each set of 4 pixels is not filtered if the 3rd is not
%if mmsize==16
pshuflw m4, m6, 0xaa
%if %1 > 4
pshufhw m4, m4, 0xaa
%endif
%else
pshufw m4, m6, 0xaa
%endif
pandn m3, m4
pand m2, m6
pand m3, m2 ; d final
psraw m7, 15
pxor m3, m7
psubw m3, m7
psubw m0, m3
paddw m1, m3
packuswb m0, m0
packuswb m1, m1
%endmacro
; 1st param: size of filter
; 2nd param: mov suffix equivalent to the filter size
%macro VC1_V_LOOP_FILTER 2
pxor m5, m5
mov%2 m6, [r4]
mov%2 m4, [r4+r1]
mov%2 m7, [r4+2*r1]
mov%2 m0, [r4+r3]
punpcklbw m6, m5
punpcklbw m4, m5
punpcklbw m7, m5
punpcklbw m0, m5
VC1_LOOP_FILTER_A0 m6, m4, m7, m0
mov%2 m1, [r0]
mov%2 m2, [r0+r1]
punpcklbw m1, m5
punpcklbw m2, m5
mova m4, m0
VC1_LOOP_FILTER_A0 m7, m4, m1, m2
mov%2 m3, [r0+2*r1]
mov%2 m4, [r0+r3]
punpcklbw m3, m5
punpcklbw m4, m5
mova m5, m1
VC1_LOOP_FILTER_A0 m5, m2, m3, m4
VC1_FILTER %1
mov%2 [r4+r3], m0
mov%2 [r0], m1
%endmacro
; 1st param: size of filter
; NOTE: UNPACK_8TO16 this number of 8 bit numbers are in half a register
; 2nd (optional) param: temp register to use for storing words
%macro VC1_H_LOOP_FILTER 1-2
%if %1 == 4
movq m0, [r0 -4]
movq m1, [r0+ r1-4]
movq m2, [r0+2*r1-4]
movq m3, [r0+ r3-4]
TRANSPOSE4x4B 0, 1, 2, 3, 4
%else
movq m0, [r0 -4]
movq m4, [r0+ r1-4]
movq m1, [r0+2*r1-4]
movq m5, [r0+ r3-4]
movq m2, [r4 -4]
movq m6, [r4+ r1-4]
movq m3, [r4+2*r1-4]
movq m7, [r4+ r3-4]
punpcklbw m0, m4
punpcklbw m1, m5
punpcklbw m2, m6
punpcklbw m3, m7
TRANSPOSE4x4W 0, 1, 2, 3, 4
%endif
pxor m5, m5
UNPACK_8TO16 bw, 6, 0, 5
UNPACK_8TO16 bw, 7, 1, 5
VC1_LOOP_FILTER_A0 m6, m0, m7, m1
UNPACK_8TO16 bw, 4, 2, 5
mova m0, m1 ; m0 = p0
VC1_LOOP_FILTER_A0 m7, m1, m4, m2
UNPACK_8TO16 bw, 1, 3, 5
mova m5, m4
VC1_LOOP_FILTER_A0 m5, m2, m1, m3
SWAP 1, 4 ; m1 = q0
VC1_FILTER %1
punpcklbw m0, m1
%if %0 > 1
STORE_4_WORDS [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, %2
%if %1 > 4
psrldq m0, 4
STORE_4_WORDS [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, %2
%endif
%else
STORE_4_WORDS [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, 0
STORE_4_WORDS [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, 4
%endif
%endmacro
%macro START_V_FILTER 0
mov r4, r0
lea r3, [4*r1]
sub r4, r3
lea r3, [r1+2*r1]
imul r2, 0x01010101
%endmacro
%macro START_H_FILTER 1
lea r3, [r1+2*r1]
%if %1 > 4
lea r4, [r0+4*r1]
%endif
imul r2, 0x01010101
%endmacro
%macro VC1_LF 0
cglobal vc1_v_loop_filter_internal
VC1_V_LOOP_FILTER 4, d
ret
cglobal vc1_h_loop_filter_internal
VC1_H_LOOP_FILTER 4, r4
ret
; void ff_vc1_v_loop_filter4_mmxext(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter4, 3,5,0
START_V_FILTER
call vc1_v_loop_filter_internal
RET
; void ff_vc1_h_loop_filter4_mmxext(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter4, 3,5,0
START_H_FILTER 4
call vc1_h_loop_filter_internal
RET
; void ff_vc1_v_loop_filter8_mmxext(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter8, 3,5,0
START_V_FILTER
call vc1_v_loop_filter_internal
add r4, 4
add r0, 4
call vc1_v_loop_filter_internal
RET
; void ff_vc1_h_loop_filter8_mmxext(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter8, 3,5,0
START_H_FILTER 4
call vc1_h_loop_filter_internal
lea r0, [r0+4*r1]
call vc1_h_loop_filter_internal
RET
%endmacro
INIT_MMX mmxext
VC1_LF
INIT_XMM sse2
; void ff_vc1_v_loop_filter8_sse2(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter8, 3,5,8
START_V_FILTER
VC1_V_LOOP_FILTER 8, q
RET
; void ff_vc1_h_loop_filter8_sse2(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter8, 3,6,8
START_H_FILTER 8
VC1_H_LOOP_FILTER 8, r5
RET
INIT_MMX ssse3
; void ff_vc1_v_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter4, 3,5,0
START_V_FILTER
VC1_V_LOOP_FILTER 4, d
RET
; void ff_vc1_h_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter4, 3,5,0
START_H_FILTER 4
VC1_H_LOOP_FILTER 4, r4
RET
INIT_XMM ssse3
; void ff_vc1_v_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter8, 3,5,8
START_V_FILTER
VC1_V_LOOP_FILTER 8, q
RET
; void ff_vc1_h_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter8, 3,6,8
START_H_FILTER 8
VC1_H_LOOP_FILTER 8, r5
RET
INIT_XMM sse4
; void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter8, 3,5,8
START_H_FILTER 8
VC1_H_LOOP_FILTER 8
RET

View File

@@ -0,0 +1,29 @@
/*
* VC-1 and WMV3 decoder - X86 DSP init functions
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_VC1DSP_H
#define AVCODEC_X86_VC1DSP_H
#include "libavcodec/vc1dsp.h"
void ff_vc1dsp_init_mmx(VC1DSPContext *dsp);
void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp);
#endif /* AVCODEC_X86_VC1DSP_H */

View File

@@ -0,0 +1,131 @@
/*
* VC-1 and WMV3 - DSP functions MMX-optimized
* Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vc1dsp.h"
#include "dsputil_x86.h"
#include "vc1dsp.h"
#include "config.h"
#define LOOP_FILTER(EXT) \
void ff_vc1_v_loop_filter4_ ## EXT(uint8_t *src, int stride, int pq); \
void ff_vc1_h_loop_filter4_ ## EXT(uint8_t *src, int stride, int pq); \
void ff_vc1_v_loop_filter8_ ## EXT(uint8_t *src, int stride, int pq); \
void ff_vc1_h_loop_filter8_ ## EXT(uint8_t *src, int stride, int pq); \
\
static void vc1_v_loop_filter16_ ## EXT(uint8_t *src, int stride, int pq) \
{ \
ff_vc1_v_loop_filter8_ ## EXT(src, stride, pq); \
ff_vc1_v_loop_filter8_ ## EXT(src+8, stride, pq); \
} \
\
static void vc1_h_loop_filter16_ ## EXT(uint8_t *src, int stride, int pq) \
{ \
ff_vc1_h_loop_filter8_ ## EXT(src, stride, pq); \
ff_vc1_h_loop_filter8_ ## EXT(src+8*stride, stride, pq); \
}
#if HAVE_YASM
LOOP_FILTER(mmxext)
LOOP_FILTER(sse2)
LOOP_FILTER(ssse3)
void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq);
static void vc1_h_loop_filter16_sse4(uint8_t *src, int stride, int pq)
{
ff_vc1_h_loop_filter8_sse4(src, stride, pq);
ff_vc1_h_loop_filter8_sse4(src+8*stride, stride, pq);
}
static void avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride, int rnd)
{
ff_avg_pixels8_mmxext(dst, src, stride, 8);
}
#endif /* HAVE_YASM */
void ff_put_vc1_chroma_mc8_nornd_mmx (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_nornd_mmxext(uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_nornd_3dnow(uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_put_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
{
int cpu_flags = av_get_cpu_flags();
if (INLINE_MMX(cpu_flags))
ff_vc1dsp_init_mmx(dsp);
if (INLINE_MMXEXT(cpu_flags))
ff_vc1dsp_init_mmxext(dsp);
#define ASSIGN_LF(EXT) \
dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_ ## EXT; \
dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_ ## EXT; \
dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_ ## EXT; \
dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_ ## EXT; \
dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_ ## EXT; \
dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_ ## EXT
#if HAVE_YASM
if (EXTERNAL_MMX(cpu_flags)) {
dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_mmx;
}
if (EXTERNAL_AMD3DNOW(cpu_flags)) {
dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_3dnow;
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
ASSIGN_LF(mmxext);
dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_mmxext;
dsp->avg_vc1_mspel_pixels_tab[0] = avg_vc1_mspel_mc00_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_sse2;
dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_sse2;
dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_sse2;
dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags)) {
ASSIGN_LF(ssse3);
dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_ssse3;
dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_ssse3;
}
if (EXTERNAL_SSE4(cpu_flags)) {
dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_sse4;
dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse4;
}
#endif /* HAVE_YASM */
}

View File

@@ -0,0 +1,757 @@
/*
* VC-1 and WMV3 - DSP functions MMX-optimized
* Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vc1dsp.h"
#include "constants.h"
#include "dsputil_x86.h"
#include "vc1dsp.h"
#if HAVE_INLINE_ASM
#define OP_PUT(S,D)
#define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"
/** Add rounder from mm7 to mm3 and pack result at destination */
#define NORMALIZE_MMX(SHIFT) \
"paddw %%mm7, %%mm3 \n\t" /* +bias-r */ \
"paddw %%mm7, %%mm4 \n\t" /* +bias-r */ \
"psraw "SHIFT", %%mm3 \n\t" \
"psraw "SHIFT", %%mm4 \n\t"
#define TRANSFER_DO_PACK(OP) \
"packuswb %%mm4, %%mm3 \n\t" \
OP((%2), %%mm3) \
"movq %%mm3, (%2) \n\t"
#define TRANSFER_DONT_PACK(OP) \
OP(0(%2), %%mm3) \
OP(8(%2), %%mm4) \
"movq %%mm3, 0(%2) \n\t" \
"movq %%mm4, 8(%2) \n\t"
/** @see MSPEL_FILTER13_CORE for use as UNPACK macro */
#define DO_UNPACK(reg) "punpcklbw %%mm0, " reg "\n\t"
#define DONT_UNPACK(reg)
/** Compute the rounder 32-r or 8-r and unpacks it to mm7 */
#define LOAD_ROUNDER_MMX(ROUND) \
"movd "ROUND", %%mm7 \n\t" \
"punpcklwd %%mm7, %%mm7 \n\t" \
"punpckldq %%mm7, %%mm7 \n\t"
#define SHIFT2_LINE(OFF, R0,R1,R2,R3) \
"paddw %%mm"#R2", %%mm"#R1" \n\t" \
"movd (%0,%3), %%mm"#R0" \n\t" \
"pmullw %%mm6, %%mm"#R1" \n\t" \
"punpcklbw %%mm0, %%mm"#R0" \n\t" \
"movd (%0,%2), %%mm"#R3" \n\t" \
"psubw %%mm"#R0", %%mm"#R1" \n\t" \
"punpcklbw %%mm0, %%mm"#R3" \n\t" \
"paddw %%mm7, %%mm"#R1" \n\t" \
"psubw %%mm"#R3", %%mm"#R1" \n\t" \
"psraw %4, %%mm"#R1" \n\t" \
"movq %%mm"#R1", "#OFF"(%1) \n\t" \
"add %2, %0 \n\t"
/** Sacrifying mm6 allows to pipeline loads from src */
static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
const uint8_t *src, x86_reg stride,
int rnd, int64_t shift)
{
__asm__ volatile(
"mov $3, %%"REG_c" \n\t"
LOAD_ROUNDER_MMX("%5")
"movq "MANGLE(ff_pw_9)", %%mm6 \n\t"
"1: \n\t"
"movd (%0), %%mm2 \n\t"
"add %2, %0 \n\t"
"movd (%0), %%mm3 \n\t"
"punpcklbw %%mm0, %%mm2 \n\t"
"punpcklbw %%mm0, %%mm3 \n\t"
SHIFT2_LINE( 0, 1, 2, 3, 4)
SHIFT2_LINE( 24, 2, 3, 4, 1)
SHIFT2_LINE( 48, 3, 4, 1, 2)
SHIFT2_LINE( 72, 4, 1, 2, 3)
SHIFT2_LINE( 96, 1, 2, 3, 4)
SHIFT2_LINE(120, 2, 3, 4, 1)
SHIFT2_LINE(144, 3, 4, 1, 2)
SHIFT2_LINE(168, 4, 1, 2, 3)
"sub %6, %0 \n\t"
"add $8, %1 \n\t"
"dec %%"REG_c" \n\t"
"jnz 1b \n\t"
: "+r"(src), "+r"(dst)
: "r"(stride), "r"(-2*stride),
"m"(shift), "m"(rnd), "r"(9*stride-4)
: "%"REG_c, "memory"
);
}
/**
* Data is already unpacked, so some operations can directly be made from
* memory.
*/
#define VC1_HOR_16b_SHIFT2(OP, OPNAME)\
static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\
const int16_t *src, int rnd)\
{\
int h = 8;\
\
src -= 1;\
rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */\
__asm__ volatile(\
LOAD_ROUNDER_MMX("%4")\
"movq "MANGLE(ff_pw_128)", %%mm6\n\t"\
"movq "MANGLE(ff_pw_9)", %%mm5 \n\t"\
"1: \n\t"\
"movq 2*0+0(%1), %%mm1 \n\t"\
"movq 2*0+8(%1), %%mm2 \n\t"\
"movq 2*1+0(%1), %%mm3 \n\t"\
"movq 2*1+8(%1), %%mm4 \n\t"\
"paddw 2*3+0(%1), %%mm1 \n\t"\
"paddw 2*3+8(%1), %%mm2 \n\t"\
"paddw 2*2+0(%1), %%mm3 \n\t"\
"paddw 2*2+8(%1), %%mm4 \n\t"\
"pmullw %%mm5, %%mm3 \n\t"\
"pmullw %%mm5, %%mm4 \n\t"\
"psubw %%mm1, %%mm3 \n\t"\
"psubw %%mm2, %%mm4 \n\t"\
NORMALIZE_MMX("$7")\
/* Remove bias */\
"paddw %%mm6, %%mm3 \n\t"\
"paddw %%mm6, %%mm4 \n\t"\
TRANSFER_DO_PACK(OP)\
"add $24, %1 \n\t"\
"add %3, %2 \n\t"\
"decl %0 \n\t"\
"jnz 1b \n\t"\
: "+r"(h), "+r" (src), "+r" (dst)\
: "r"(stride), "m"(rnd)\
: "memory"\
);\
}
VC1_HOR_16b_SHIFT2(OP_PUT, put_)
VC1_HOR_16b_SHIFT2(OP_AVG, avg_)
/**
* Purely vertical or horizontal 1/2 shift interpolation.
* Sacrify mm6 for *9 factor.
*/
#define VC1_SHIFT2(OP, OPNAME)\
static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\
x86_reg stride, int rnd, x86_reg offset)\
{\
rnd = 8-rnd;\
__asm__ volatile(\
"mov $8, %%"REG_c" \n\t"\
LOAD_ROUNDER_MMX("%5")\
"movq "MANGLE(ff_pw_9)", %%mm6\n\t"\
"1: \n\t"\
"movd 0(%0 ), %%mm3 \n\t"\
"movd 4(%0 ), %%mm4 \n\t"\
"movd 0(%0,%2), %%mm1 \n\t"\
"movd 4(%0,%2), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm0, %%mm3 \n\t"\
"punpcklbw %%mm0, %%mm4 \n\t"\
"punpcklbw %%mm0, %%mm1 \n\t"\
"punpcklbw %%mm0, %%mm2 \n\t"\
"paddw %%mm1, %%mm3 \n\t"\
"paddw %%mm2, %%mm4 \n\t"\
"movd 0(%0,%3), %%mm1 \n\t"\
"movd 4(%0,%3), %%mm2 \n\t"\
"pmullw %%mm6, %%mm3 \n\t" /* 0,9,9,0*/\
"pmullw %%mm6, %%mm4 \n\t" /* 0,9,9,0*/\
"punpcklbw %%mm0, %%mm1 \n\t"\
"punpcklbw %%mm0, %%mm2 \n\t"\
"psubw %%mm1, %%mm3 \n\t" /*-1,9,9,0*/\
"psubw %%mm2, %%mm4 \n\t" /*-1,9,9,0*/\
"movd 0(%0,%2), %%mm1 \n\t"\
"movd 4(%0,%2), %%mm2 \n\t"\
"punpcklbw %%mm0, %%mm1 \n\t"\
"punpcklbw %%mm0, %%mm2 \n\t"\
"psubw %%mm1, %%mm3 \n\t" /*-1,9,9,-1*/\
"psubw %%mm2, %%mm4 \n\t" /*-1,9,9,-1*/\
NORMALIZE_MMX("$4")\
"packuswb %%mm4, %%mm3 \n\t"\
OP((%1), %%mm3)\
"movq %%mm3, (%1) \n\t"\
"add %6, %0 \n\t"\
"add %4, %1 \n\t"\
"dec %%"REG_c" \n\t"\
"jnz 1b \n\t"\
: "+r"(src), "+r"(dst)\
: "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\
"g"(stride-offset)\
: "%"REG_c, "memory"\
);\
}
VC1_SHIFT2(OP_PUT, put_)
VC1_SHIFT2(OP_AVG, avg_)
/**
* Core of the 1/4 and 3/4 shift bicubic interpolation.
*
* @param UNPACK Macro unpacking arguments from 8 to 16bits (can be empty).
* @param MOVQ "movd 1" or "movq 2", if data read is already unpacked.
* @param A1 Address of 1st tap (beware of unpacked/packed).
* @param A2 Address of 2nd tap
* @param A3 Address of 3rd tap
* @param A4 Address of 4th tap
*/
#define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4) \
MOVQ "*0+"A1", %%mm1 \n\t" \
MOVQ "*4+"A1", %%mm2 \n\t" \
UNPACK("%%mm1") \
UNPACK("%%mm2") \
"pmullw "MANGLE(ff_pw_3)", %%mm1\n\t" \
"pmullw "MANGLE(ff_pw_3)", %%mm2\n\t" \
MOVQ "*0+"A2", %%mm3 \n\t" \
MOVQ "*4+"A2", %%mm4 \n\t" \
UNPACK("%%mm3") \
UNPACK("%%mm4") \
"pmullw %%mm6, %%mm3 \n\t" /* *18 */ \
"pmullw %%mm6, %%mm4 \n\t" /* *18 */ \
"psubw %%mm1, %%mm3 \n\t" /* 18,-3 */ \
"psubw %%mm2, %%mm4 \n\t" /* 18,-3 */ \
MOVQ "*0+"A4", %%mm1 \n\t" \
MOVQ "*4+"A4", %%mm2 \n\t" \
UNPACK("%%mm1") \
UNPACK("%%mm2") \
"psllw $2, %%mm1 \n\t" /* 4* */ \
"psllw $2, %%mm2 \n\t" /* 4* */ \
"psubw %%mm1, %%mm3 \n\t" /* -4,18,-3 */ \
"psubw %%mm2, %%mm4 \n\t" /* -4,18,-3 */ \
MOVQ "*0+"A3", %%mm1 \n\t" \
MOVQ "*4+"A3", %%mm2 \n\t" \
UNPACK("%%mm1") \
UNPACK("%%mm2") \
"pmullw %%mm5, %%mm1 \n\t" /* *53 */ \
"pmullw %%mm5, %%mm2 \n\t" /* *53 */ \
"paddw %%mm1, %%mm3 \n\t" /* 4,53,18,-3 */ \
"paddw %%mm2, %%mm4 \n\t" /* 4,53,18,-3 */
/**
* Macro to build the vertical 16bits version of vc1_put_shift[13].
* Here, offset=src_stride. Parameters passed A1 to A4 must use
* %3 (src_stride) and %4 (3*src_stride).
*
* @param NAME Either 1 or 3
* @see MSPEL_FILTER13_CORE for information on A1->A4
*/
#define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \
static void \
vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src, \
x86_reg src_stride, \
int rnd, int64_t shift) \
{ \
int h = 8; \
src -= src_stride; \
__asm__ volatile( \
LOAD_ROUNDER_MMX("%5") \
"movq "MANGLE(ff_pw_53)", %%mm5\n\t" \
"movq "MANGLE(ff_pw_18)", %%mm6\n\t" \
".p2align 3 \n\t" \
"1: \n\t" \
MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
NORMALIZE_MMX("%6") \
TRANSFER_DONT_PACK(OP_PUT) \
/* Last 3 (in fact 4) bytes on the line */ \
"movd 8+"A1", %%mm1 \n\t" \
DO_UNPACK("%%mm1") \
"movq %%mm1, %%mm3 \n\t" \
"paddw %%mm1, %%mm1 \n\t" \
"paddw %%mm3, %%mm1 \n\t" /* 3* */ \
"movd 8+"A2", %%mm3 \n\t" \
DO_UNPACK("%%mm3") \
"pmullw %%mm6, %%mm3 \n\t" /* *18 */ \
"psubw %%mm1, %%mm3 \n\t" /*18,-3 */ \
"movd 8+"A3", %%mm1 \n\t" \
DO_UNPACK("%%mm1") \
"pmullw %%mm5, %%mm1 \n\t" /* *53 */ \
"paddw %%mm1, %%mm3 \n\t" /*53,18,-3 */ \
"movd 8+"A4", %%mm1 \n\t" \
DO_UNPACK("%%mm1") \
"psllw $2, %%mm1 \n\t" /* 4* */ \
"psubw %%mm1, %%mm3 \n\t" \
"paddw %%mm7, %%mm3 \n\t" \
"psraw %6, %%mm3 \n\t" \
"movq %%mm3, 16(%2) \n\t" \
"add %3, %1 \n\t" \
"add $24, %2 \n\t" \
"decl %0 \n\t" \
"jnz 1b \n\t" \
: "+r"(h), "+r" (src), "+r" (dst) \
: "r"(src_stride), "r"(3*src_stride), \
"m"(rnd), "m"(shift) \
: "memory" \
); \
}
/**
* Macro to build the horizontal 16bits version of vc1_put_shift[13].
* Here, offset=16bits, so parameters passed A1 to A4 should be simple.
*
* @param NAME Either 1 or 3
* @see MSPEL_FILTER13_CORE for information on A1->A4
*/
#define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME) \
static void \
OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride, \
const int16_t *src, int rnd) \
{ \
int h = 8; \
src -= 1; \
rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \
__asm__ volatile( \
LOAD_ROUNDER_MMX("%4") \
"movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
"movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
".p2align 3 \n\t" \
"1: \n\t" \
MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4) \
NORMALIZE_MMX("$7") \
/* Remove bias */ \
"paddw "MANGLE(ff_pw_128)", %%mm3 \n\t" \
"paddw "MANGLE(ff_pw_128)", %%mm4 \n\t" \
TRANSFER_DO_PACK(OP) \
"add $24, %1 \n\t" \
"add %3, %2 \n\t" \
"decl %0 \n\t" \
"jnz 1b \n\t" \
: "+r"(h), "+r" (src), "+r" (dst) \
: "r"(stride), "m"(rnd) \
: "memory" \
); \
}
/**
* Macro to build the 8bits, any direction, version of vc1_put_shift[13].
* Here, offset=src_stride. Parameters passed A1 to A4 must use
* %3 (offset) and %4 (3*offset).
*
* @param NAME Either 1 or 3
* @see MSPEL_FILTER13_CORE for information on A1->A4
*/
#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME) \
static void \
OPNAME ## vc1_## NAME ## _mmx(uint8_t *dst, const uint8_t *src, \
x86_reg stride, int rnd, x86_reg offset) \
{ \
int h = 8; \
src -= offset; \
rnd = 32-rnd; \
__asm__ volatile ( \
LOAD_ROUNDER_MMX("%6") \
"movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
"movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
".p2align 3 \n\t" \
"1: \n\t" \
MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
NORMALIZE_MMX("$6") \
TRANSFER_DO_PACK(OP) \
"add %5, %1 \n\t" \
"add %5, %2 \n\t" \
"decl %0 \n\t" \
"jnz 1b \n\t" \
: "+r"(h), "+r" (src), "+r" (dst) \
: "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd) \
: "memory" \
); \
}
/** 1/4 shift bicubic interpolation */
MSPEL_FILTER13_8B (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_PUT, put_)
MSPEL_FILTER13_8B (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )")
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_AVG, avg_)
/** 3/4 shift bicubic interpolation */
MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_PUT, put_)
MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )")
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_AVG, avg_)
typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, x86_reg src_stride, int rnd, int64_t shift);
typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, x86_reg dst_stride, const int16_t *src, int rnd);
typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_reg stride, int rnd, x86_reg offset);
/**
* Interpolate fractional pel values by applying proper vertical then
* horizontal filter.
*
* @param dst Destination buffer for interpolated pels.
* @param src Source buffer.
* @param stride Stride for both src and dst buffers.
* @param hmode Horizontal filter (expressed in quarter pixels shift).
* @param hmode Vertical filter.
* @param rnd Rounding bias.
*/
#define VC1_MSPEL_MC(OP)\
static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
int hmode, int vmode, int rnd)\
{\
static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
{ NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
{ NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx };\
static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\
{ NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx };\
\
__asm__ volatile(\
"pxor %%mm0, %%mm0 \n\t"\
::: "memory"\
);\
\
if (vmode) { /* Vertical filter to apply */\
if (hmode) { /* Horizontal filter to apply, output to tmp */\
static const int shift_value[] = { 0, 5, 1, 5 };\
int shift = (shift_value[hmode]+shift_value[vmode])>>1;\
int r;\
DECLARE_ALIGNED(16, int16_t, tmp)[12*8];\
\
r = (1<<(shift-1)) + rnd-1;\
vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\
\
vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);\
return;\
}\
else { /* No horizontal filter, output 8 lines to dst */\
vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);\
return;\
}\
}\
\
/* Horizontal mode with no vertical mode */\
vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\
}
VC1_MSPEL_MC(put_)
VC1_MSPEL_MC(avg_)
/** Macro to ease bicubic filter interpolation functions declarations */
#define DECLARE_FUNCTION(a, b) \
static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride, \
int rnd) \
{ \
put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
}\
static void avg_vc1_mspel_mc ## a ## b ## _mmxext(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride, \
int rnd) \
{ \
avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
}
DECLARE_FUNCTION(0, 1)
DECLARE_FUNCTION(0, 2)
DECLARE_FUNCTION(0, 3)
DECLARE_FUNCTION(1, 0)
DECLARE_FUNCTION(1, 1)
DECLARE_FUNCTION(1, 2)
DECLARE_FUNCTION(1, 3)
DECLARE_FUNCTION(2, 0)
DECLARE_FUNCTION(2, 1)
DECLARE_FUNCTION(2, 2)
DECLARE_FUNCTION(2, 3)
DECLARE_FUNCTION(3, 0)
DECLARE_FUNCTION(3, 1)
DECLARE_FUNCTION(3, 2)
DECLARE_FUNCTION(3, 3)
static void vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, int linesize,
int16_t *block)
{
int dc = block[0];
dc = (17 * dc + 4) >> 3;
dc = (17 * dc + 64) >> 7;
__asm__ volatile(
"movd %0, %%mm0 \n\t"
"pshufw $0, %%mm0, %%mm0 \n\t"
"pxor %%mm1, %%mm1 \n\t"
"psubw %%mm0, %%mm1 \n\t"
"packuswb %%mm0, %%mm0 \n\t"
"packuswb %%mm1, %%mm1 \n\t"
::"r"(dc)
);
__asm__ volatile(
"movd %0, %%mm2 \n\t"
"movd %1, %%mm3 \n\t"
"movd %2, %%mm4 \n\t"
"movd %3, %%mm5 \n\t"
"paddusb %%mm0, %%mm2 \n\t"
"paddusb %%mm0, %%mm3 \n\t"
"paddusb %%mm0, %%mm4 \n\t"
"paddusb %%mm0, %%mm5 \n\t"
"psubusb %%mm1, %%mm2 \n\t"
"psubusb %%mm1, %%mm3 \n\t"
"psubusb %%mm1, %%mm4 \n\t"
"psubusb %%mm1, %%mm5 \n\t"
"movd %%mm2, %0 \n\t"
"movd %%mm3, %1 \n\t"
"movd %%mm4, %2 \n\t"
"movd %%mm5, %3 \n\t"
:"+m"(*(uint32_t*)(dest+0*linesize)),
"+m"(*(uint32_t*)(dest+1*linesize)),
"+m"(*(uint32_t*)(dest+2*linesize)),
"+m"(*(uint32_t*)(dest+3*linesize))
);
}
static void vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, int linesize,
int16_t *block)
{
int dc = block[0];
dc = (17 * dc + 4) >> 3;
dc = (12 * dc + 64) >> 7;
__asm__ volatile(
"movd %0, %%mm0 \n\t"
"pshufw $0, %%mm0, %%mm0 \n\t"
"pxor %%mm1, %%mm1 \n\t"
"psubw %%mm0, %%mm1 \n\t"
"packuswb %%mm0, %%mm0 \n\t"
"packuswb %%mm1, %%mm1 \n\t"
::"r"(dc)
);
__asm__ volatile(
"movd %0, %%mm2 \n\t"
"movd %1, %%mm3 \n\t"
"movd %2, %%mm4 \n\t"
"movd %3, %%mm5 \n\t"
"paddusb %%mm0, %%mm2 \n\t"
"paddusb %%mm0, %%mm3 \n\t"
"paddusb %%mm0, %%mm4 \n\t"
"paddusb %%mm0, %%mm5 \n\t"
"psubusb %%mm1, %%mm2 \n\t"
"psubusb %%mm1, %%mm3 \n\t"
"psubusb %%mm1, %%mm4 \n\t"
"psubusb %%mm1, %%mm5 \n\t"
"movd %%mm2, %0 \n\t"
"movd %%mm3, %1 \n\t"
"movd %%mm4, %2 \n\t"
"movd %%mm5, %3 \n\t"
:"+m"(*(uint32_t*)(dest+0*linesize)),
"+m"(*(uint32_t*)(dest+1*linesize)),
"+m"(*(uint32_t*)(dest+2*linesize)),
"+m"(*(uint32_t*)(dest+3*linesize))
);
dest += 4*linesize;
__asm__ volatile(
"movd %0, %%mm2 \n\t"
"movd %1, %%mm3 \n\t"
"movd %2, %%mm4 \n\t"
"movd %3, %%mm5 \n\t"
"paddusb %%mm0, %%mm2 \n\t"
"paddusb %%mm0, %%mm3 \n\t"
"paddusb %%mm0, %%mm4 \n\t"
"paddusb %%mm0, %%mm5 \n\t"
"psubusb %%mm1, %%mm2 \n\t"
"psubusb %%mm1, %%mm3 \n\t"
"psubusb %%mm1, %%mm4 \n\t"
"psubusb %%mm1, %%mm5 \n\t"
"movd %%mm2, %0 \n\t"
"movd %%mm3, %1 \n\t"
"movd %%mm4, %2 \n\t"
"movd %%mm5, %3 \n\t"
:"+m"(*(uint32_t*)(dest+0*linesize)),
"+m"(*(uint32_t*)(dest+1*linesize)),
"+m"(*(uint32_t*)(dest+2*linesize)),
"+m"(*(uint32_t*)(dest+3*linesize))
);
}
static void vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, int linesize,
int16_t *block)
{
int dc = block[0];
dc = ( 3 * dc + 1) >> 1;
dc = (17 * dc + 64) >> 7;
__asm__ volatile(
"movd %0, %%mm0 \n\t"
"pshufw $0, %%mm0, %%mm0 \n\t"
"pxor %%mm1, %%mm1 \n\t"
"psubw %%mm0, %%mm1 \n\t"
"packuswb %%mm0, %%mm0 \n\t"
"packuswb %%mm1, %%mm1 \n\t"
::"r"(dc)
);
__asm__ volatile(
"movq %0, %%mm2 \n\t"
"movq %1, %%mm3 \n\t"
"movq %2, %%mm4 \n\t"
"movq %3, %%mm5 \n\t"
"paddusb %%mm0, %%mm2 \n\t"
"paddusb %%mm0, %%mm3 \n\t"
"paddusb %%mm0, %%mm4 \n\t"
"paddusb %%mm0, %%mm5 \n\t"
"psubusb %%mm1, %%mm2 \n\t"
"psubusb %%mm1, %%mm3 \n\t"
"psubusb %%mm1, %%mm4 \n\t"
"psubusb %%mm1, %%mm5 \n\t"
"movq %%mm2, %0 \n\t"
"movq %%mm3, %1 \n\t"
"movq %%mm4, %2 \n\t"
"movq %%mm5, %3 \n\t"
:"+m"(*(uint32_t*)(dest+0*linesize)),
"+m"(*(uint32_t*)(dest+1*linesize)),
"+m"(*(uint32_t*)(dest+2*linesize)),
"+m"(*(uint32_t*)(dest+3*linesize))
);
}
static void vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, int linesize,
int16_t *block)
{
int dc = block[0];
dc = (3 * dc + 1) >> 1;
dc = (3 * dc + 16) >> 5;
__asm__ volatile(
"movd %0, %%mm0 \n\t"
"pshufw $0, %%mm0, %%mm0 \n\t"
"pxor %%mm1, %%mm1 \n\t"
"psubw %%mm0, %%mm1 \n\t"
"packuswb %%mm0, %%mm0 \n\t"
"packuswb %%mm1, %%mm1 \n\t"
::"r"(dc)
);
__asm__ volatile(
"movq %0, %%mm2 \n\t"
"movq %1, %%mm3 \n\t"
"movq %2, %%mm4 \n\t"
"movq %3, %%mm5 \n\t"
"paddusb %%mm0, %%mm2 \n\t"
"paddusb %%mm0, %%mm3 \n\t"
"paddusb %%mm0, %%mm4 \n\t"
"paddusb %%mm0, %%mm5 \n\t"
"psubusb %%mm1, %%mm2 \n\t"
"psubusb %%mm1, %%mm3 \n\t"
"psubusb %%mm1, %%mm4 \n\t"
"psubusb %%mm1, %%mm5 \n\t"
"movq %%mm2, %0 \n\t"
"movq %%mm3, %1 \n\t"
"movq %%mm4, %2 \n\t"
"movq %%mm5, %3 \n\t"
:"+m"(*(uint32_t*)(dest+0*linesize)),
"+m"(*(uint32_t*)(dest+1*linesize)),
"+m"(*(uint32_t*)(dest+2*linesize)),
"+m"(*(uint32_t*)(dest+3*linesize))
);
dest += 4*linesize;
__asm__ volatile(
"movq %0, %%mm2 \n\t"
"movq %1, %%mm3 \n\t"
"movq %2, %%mm4 \n\t"
"movq %3, %%mm5 \n\t"
"paddusb %%mm0, %%mm2 \n\t"
"paddusb %%mm0, %%mm3 \n\t"
"paddusb %%mm0, %%mm4 \n\t"
"paddusb %%mm0, %%mm5 \n\t"
"psubusb %%mm1, %%mm2 \n\t"
"psubusb %%mm1, %%mm3 \n\t"
"psubusb %%mm1, %%mm4 \n\t"
"psubusb %%mm1, %%mm5 \n\t"
"movq %%mm2, %0 \n\t"
"movq %%mm3, %1 \n\t"
"movq %%mm4, %2 \n\t"
"movq %%mm5, %3 \n\t"
:"+m"(*(uint32_t*)(dest+0*linesize)),
"+m"(*(uint32_t*)(dest+1*linesize)),
"+m"(*(uint32_t*)(dest+2*linesize)),
"+m"(*(uint32_t*)(dest+3*linesize))
);
}
static void put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride, int rnd)
{
ff_put_pixels8_mmx(dst, src, stride, 8);
}
av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
{
dsp->put_vc1_mspel_pixels_tab[ 0] = put_vc1_mspel_mc00_mmx;
dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx;
dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx;
dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx;
dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_mmx;
dsp->put_vc1_mspel_pixels_tab[ 5] = put_vc1_mspel_mc11_mmx;
dsp->put_vc1_mspel_pixels_tab[ 9] = put_vc1_mspel_mc12_mmx;
dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_mmx;
dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_mmx;
dsp->put_vc1_mspel_pixels_tab[ 6] = put_vc1_mspel_mc21_mmx;
dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_mmx;
dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_mmx;
dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_mmx;
dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx;
dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx;
dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx;
}
av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)
{
dsp->avg_vc1_mspel_pixels_tab[ 4] = avg_vc1_mspel_mc01_mmxext;
dsp->avg_vc1_mspel_pixels_tab[ 8] = avg_vc1_mspel_mc02_mmxext;
dsp->avg_vc1_mspel_pixels_tab[12] = avg_vc1_mspel_mc03_mmxext;
dsp->avg_vc1_mspel_pixels_tab[ 1] = avg_vc1_mspel_mc10_mmxext;
dsp->avg_vc1_mspel_pixels_tab[ 5] = avg_vc1_mspel_mc11_mmxext;
dsp->avg_vc1_mspel_pixels_tab[ 9] = avg_vc1_mspel_mc12_mmxext;
dsp->avg_vc1_mspel_pixels_tab[13] = avg_vc1_mspel_mc13_mmxext;
dsp->avg_vc1_mspel_pixels_tab[ 2] = avg_vc1_mspel_mc20_mmxext;
dsp->avg_vc1_mspel_pixels_tab[ 6] = avg_vc1_mspel_mc21_mmxext;
dsp->avg_vc1_mspel_pixels_tab[10] = avg_vc1_mspel_mc22_mmxext;
dsp->avg_vc1_mspel_pixels_tab[14] = avg_vc1_mspel_mc23_mmxext;
dsp->avg_vc1_mspel_pixels_tab[ 3] = avg_vc1_mspel_mc30_mmxext;
dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmxext;
dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmxext;
dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmxext;
dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmxext;
dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmxext;
dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmxext;
dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmxext;
}
#endif /* HAVE_INLINE_ASM */

View File

@@ -0,0 +1,444 @@
;******************************************************************************
;* Core video DSP functions
;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION .text
; slow vertical extension loop function. Works with variable-width, and
; does per-line reading/writing of source data
%macro V_COPY_ROW 2 ; type (top/body/bottom), h
.%1_y_loop: ; do {
mov wq, r7mp ; initialize w (r7mp = wmp)
.%1_x_loop: ; do {
movu m0, [srcq+wq] ; m0 = read($mmsize)
movu [dstq+wq], m0 ; write(m0, $mmsize)
add wq, mmsize ; w -= $mmsize
cmp wq, -mmsize ; } while (w > $mmsize);
jl .%1_x_loop
movu m0, [srcq-mmsize] ; m0 = read($mmsize)
movu [dstq-mmsize], m0 ; write(m0, $mmsize)
%ifidn %1, body ; if ($type == body) {
add srcq, src_strideq ; src += src_stride
%endif ; }
add dstq, dst_strideq ; dst += dst_stride
dec %2 ; } while (--$h);
jnz .%1_y_loop
%endmacro
%macro vvar_fn 0
; .----. <- zero
; | | <- top is copied from first line in body of source
; |----| <- start_y
; | | <- body is copied verbatim (line-by-line) from source
; |----| <- end_y
; | | <- bottom is copied from last line in body of source
; '----' <- bh
%if ARCH_X86_64
cglobal emu_edge_vvar, 7, 8, 1, dst, dst_stride, src, src_stride, \
start_y, end_y, bh, w
%else ; x86-32
cglobal emu_edge_vvar, 1, 6, 1, dst, src, start_y, end_y, bh, w
%define src_strideq r3mp
%define dst_strideq r1mp
mov srcq, r2mp
mov start_yq, r4mp
mov end_yq, r5mp
mov bhq, r6mp
%endif
sub bhq, end_yq ; bh -= end_q
sub end_yq, start_yq ; end_q -= start_q
add srcq, r7mp ; (r7mp = wmp)
add dstq, r7mp ; (r7mp = wmp)
neg r7mp ; (r7mp = wmp)
test start_yq, start_yq ; if (start_q) {
jz .body
V_COPY_ROW top, start_yq ; v_copy_row(top, start_yq)
.body: ; }
V_COPY_ROW body, end_yq ; v_copy_row(body, end_yq)
test bhq, bhq ; if (bh) {
jz .end
sub srcq, src_strideq ; src -= src_stride
V_COPY_ROW bottom, bhq ; v_copy_row(bottom, bh)
.end: ; }
RET
%endmacro
%if ARCH_X86_32
INIT_MMX mmx
vvar_fn
%endif
INIT_XMM sse
vvar_fn
%macro hvar_fn 0
cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w
lea dstq, [dstq+n_wordsq*2]
neg n_wordsq
lea start_xq, [start_xq+n_wordsq*2]
.y_loop: ; do {
; FIXME also write a ssse3 version using pshufb
movzx wd, byte [dstq+start_xq] ; w = read(1)
imul wd, 0x01010101 ; w *= 0x01010101
movd m0, wd
mov wq, n_wordsq ; initialize w
%if cpuflag(sse2)
pshufd m0, m0, q0000 ; splat
%else ; mmx
punpckldq m0, m0 ; splat
%endif ; mmx/sse
.x_loop: ; do {
movu [dstq+wq*2], m0 ; write($reg, $mmsize)
add wq, mmsize/2 ; w -= $mmsize/2
cmp wq, -mmsize/2 ; } while (w > $mmsize/2)
jl .x_loop
movu [dstq-mmsize], m0 ; write($reg, $mmsize)
add dstq, dst_strideq ; dst += dst_stride
dec hq ; } while (h--)
jnz .y_loop
RET
%endmacro
%if ARCH_X86_32
INIT_MMX mmx
hvar_fn
%endif
INIT_XMM sse2
hvar_fn
; macro to read/write a horizontal number of pixels (%2) to/from registers
; on sse, - fills xmm0-15 for consecutive sets of 16 pixels
; - if (%2 & 8) fills 8 bytes into xmm$next
; - if (%2 & 4) fills 4 bytes into xmm$next
; - if (%2 & 3) fills 1, 2 or 4 bytes in eax
; on mmx, - fills mm0-7 for consecutive sets of 8 pixels
; - if (%2 & 4) fills 4 bytes into mm$next
; - if (%2 & 3) fills 1, 2 or 4 bytes in eax
; writing data out is in the same way
%macro READ_NUM_BYTES 2
%assign %%off 0 ; offset in source buffer
%assign %%mmx_idx 0 ; mmx register index
%assign %%xmm_idx 0 ; xmm register index
%rep %2/mmsize
%if mmsize == 16
movu xmm %+ %%xmm_idx, [srcq+%%off]
%assign %%xmm_idx %%xmm_idx+1
%else ; mmx
movu mm %+ %%mmx_idx, [srcq+%%off]
%assign %%mmx_idx %%mmx_idx+1
%endif
%assign %%off %%off+mmsize
%endrep ; %2/mmsize
%if mmsize == 16
%if (%2-%%off) >= 8
%if %2 > 16 && (%2-%%off) > 8
movu xmm %+ %%xmm_idx, [srcq+%2-16]
%assign %%xmm_idx %%xmm_idx+1
%assign %%off %2
%else
movq mm %+ %%mmx_idx, [srcq+%%off]
%assign %%mmx_idx %%mmx_idx+1
%assign %%off %%off+8
%endif
%endif ; (%2-%%off) >= 8
%endif
%if (%2-%%off) >= 4
%if %2 > 8 && (%2-%%off) > 4
movq mm %+ %%mmx_idx, [srcq+%2-8]
%assign %%off %2
%else
movd mm %+ %%mmx_idx, [srcq+%%off]
%assign %%off %%off+4
%endif
%assign %%mmx_idx %%mmx_idx+1
%endif ; (%2-%%off) >= 4
%if (%2-%%off) >= 1
%if %2 >= 4
movd mm %+ %%mmx_idx, [srcq+%2-4]
%elif (%2-%%off) == 1
mov valb, [srcq+%2-1]
%elif (%2-%%off) == 2
mov valw, [srcq+%2-2]
%elifidn %1, body
mov vald, [srcq+%2-3]
%else
movd mm %+ %%mmx_idx, [srcq+%2-3]
%endif
%endif ; (%2-%%off) >= 1
%endmacro ; READ_NUM_BYTES
%macro WRITE_NUM_BYTES 2
%assign %%off 0 ; offset in destination buffer
%assign %%mmx_idx 0 ; mmx register index
%assign %%xmm_idx 0 ; xmm register index
%rep %2/mmsize
%if mmsize == 16
movu [dstq+%%off], xmm %+ %%xmm_idx
%assign %%xmm_idx %%xmm_idx+1
%else ; mmx
movu [dstq+%%off], mm %+ %%mmx_idx
%assign %%mmx_idx %%mmx_idx+1
%endif
%assign %%off %%off+mmsize
%endrep ; %2/mmsize
%if mmsize == 16
%if (%2-%%off) >= 8
%if %2 > 16 && (%2-%%off) > 8
movu [dstq+%2-16], xmm %+ %%xmm_idx
%assign %%xmm_idx %%xmm_idx+1
%assign %%off %2
%else
movq [dstq+%%off], mm %+ %%mmx_idx
%assign %%mmx_idx %%mmx_idx+1
%assign %%off %%off+8
%endif
%endif ; (%2-%%off) >= 8
%endif
%if (%2-%%off) >= 4
%if %2 > 8 && (%2-%%off) > 4
movq [dstq+%2-8], mm %+ %%mmx_idx
%assign %%off %2
%else
movd [dstq+%%off], mm %+ %%mmx_idx
%assign %%off %%off+4
%endif
%assign %%mmx_idx %%mmx_idx+1
%endif ; (%2-%%off) >= 4
%if (%2-%%off) >= 1
%if %2 >= 4
movd [dstq+%2-4], mm %+ %%mmx_idx
%elif (%2-%%off) == 1
mov [dstq+%2-1], valb
%elif (%2-%%off) == 2
mov [dstq+%2-2], valw
%elifidn %1, body
mov [dstq+%2-3], valw
shr vald, 16
mov [dstq+%2-1], valb
%else
movd vald, mm %+ %%mmx_idx
mov [dstq+%2-3], valw
shr vald, 16
mov [dstq+%2-1], valb
%endif
%endif ; (%2-%%off) >= 1
%endmacro ; WRITE_NUM_BYTES
; vertical top/bottom extend and body copy fast loops
; these are function pointers to set-width line copy functions, i.e.
; they read a fixed number of pixels into set registers, and write
; those out into the destination buffer
%macro VERTICAL_EXTEND 2
%assign %%n %1
%rep 1+%2-%1
%if %%n <= 3
%if ARCH_X86_64
cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, dst_stride, src, src_stride, \
start_y, end_y, val, bh
mov bhq, r6mp ; r6mp = bhmp
%else ; x86-32
cglobal emu_edge_vfix %+ %%n, 0, 6, 0, val, dst, src, start_y, end_y, bh
mov dstq, r0mp
mov srcq, r2mp
mov start_yq, r4mp
mov end_yq, r5mp
mov bhq, r6mp
%define dst_strideq r1mp
%define src_strideq r3mp
%endif ; x86-64/32
%else
%if ARCH_X86_64
cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, dst_stride, src, src_stride, \
start_y, end_y, bh
%else ; x86-32
cglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src, start_y, end_y, bh
mov srcq, r2mp
mov start_yq, r4mp
mov end_yq, r5mp
mov bhq, r6mp
%define dst_strideq r1mp
%define src_strideq r3mp
%endif ; x86-64/32
%endif
; FIXME move this to c wrapper?
sub bhq, end_yq ; bh -= end_y
sub end_yq, start_yq ; end_y -= start_y
; extend pixels above body
test start_yq, start_yq ; if (start_y) {
jz .body_loop
READ_NUM_BYTES top, %%n ; $variable_regs = read($n)
.top_loop: ; do {
WRITE_NUM_BYTES top, %%n ; write($variable_regs, $n)
add dstq, dst_strideq ; dst += linesize
dec start_yq ; } while (--start_y)
jnz .top_loop ; }
; copy body pixels
.body_loop: ; do {
READ_NUM_BYTES body, %%n ; $variable_regs = read($n)
WRITE_NUM_BYTES body, %%n ; write($variable_regs, $n)
add dstq, dst_strideq ; dst += dst_stride
add srcq, src_strideq ; src += src_stride
dec end_yq ; } while (--end_y)
jnz .body_loop
; copy bottom pixels
test bhq, bhq ; if (block_h) {
jz .end
sub srcq, src_strideq ; src -= linesize
READ_NUM_BYTES bottom, %%n ; $variable_regs = read($n)
.bottom_loop: ; do {
WRITE_NUM_BYTES bottom, %%n ; write($variable_regs, $n)
add dstq, dst_strideq ; dst += linesize
dec bhq ; } while (--bh)
jnz .bottom_loop ; }
.end:
RET
%assign %%n %%n+1
%endrep ; 1+%2-%1
%endmacro ; VERTICAL_EXTEND
INIT_MMX mmx
VERTICAL_EXTEND 1, 15
%if ARCH_X86_32
VERTICAL_EXTEND 16, 22
%endif
INIT_XMM sse
VERTICAL_EXTEND 16, 22
; left/right (horizontal) fast extend functions
; these are essentially identical to the vertical extend ones above,
; just left/right separated because number of pixels to extend is
; obviously not the same on both sides.
%macro READ_V_PIXEL 2
movzx vald, byte %2
imul vald, 0x01010101
%if %1 >= 8
movd m0, vald
%if mmsize == 16
pshufd m0, m0, q0000
%else
punpckldq m0, m0
%endif ; mmsize == 16
%endif ; %1 > 16
%endmacro ; READ_V_PIXEL
%macro WRITE_V_PIXEL 2
%assign %%off 0
%if %1 >= 8
%rep %1/mmsize
movu [%2+%%off], m0
%assign %%off %%off+mmsize
%endrep ; %1/mmsize
%if mmsize == 16
%if %1-%%off >= 8
%if %1 > 16 && %1-%%off > 8
movu [%2+%1-16], m0
%assign %%off %1
%else
movq [%2+%%off], m0
%assign %%off %%off+8
%endif
%endif ; %1-%%off >= 8
%endif ; mmsize == 16
%if %1-%%off >= 4
%if %1 > 8 && %1-%%off > 4
movq [%2+%1-8], m0
%assign %%off %1
%else
movd [%2+%%off], m0
%assign %%off %%off+4
%endif
%endif ; %1-%%off >= 4
%else ; %1 < 8
%rep %1/4
mov [%2+%%off], vald
%assign %%off %%off+4
%endrep ; %1/4
%endif ; %1 >=/< 8
%if %1-%%off == 2
mov [%2+%%off], valw
%endif ; (%1-%%off)/2
%endmacro ; WRITE_V_PIXEL
%macro H_EXTEND 2
%assign %%n %1
%rep 1+(%2-%1)/2
cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val
.loop_y: ; do {
READ_V_PIXEL %%n, [dstq+start_xq] ; $variable_regs = read($n)
WRITE_V_PIXEL %%n, dstq ; write($variable_regs, $n)
add dstq, dst_strideq ; dst += dst_stride
dec bhq ; } while (--bh)
jnz .loop_y
RET
%assign %%n %%n+2
%endrep ; 1+(%2-%1)/2
%endmacro ; H_EXTEND
INIT_MMX mmx
H_EXTEND 2, 14
%if ARCH_X86_32
H_EXTEND 16, 22
%endif
INIT_XMM sse2
H_EXTEND 16, 22
%macro PREFETCH_FN 1
cglobal prefetch, 3, 3, 0, buf, stride, h
.loop:
%1 [bufq]
add bufq, strideq
dec hd
jg .loop
REP_RET
%endmacro
INIT_MMX mmxext
PREFETCH_FN prefetcht0
%if ARCH_X86_32
INIT_MMX 3dnow
PREFETCH_FN prefetch
%endif

View File

@@ -0,0 +1,266 @@
/*
* Copyright (C) 2002-2012 Michael Niedermayer
* Copyright (C) 2012 Ronald S. Bultje
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/avassert.h"
#include "libavutil/common.h"
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/videodsp.h"
#if HAVE_YASM
typedef void emu_edge_vfix_func(uint8_t *dst, x86_reg dst_stride,
const uint8_t *src, x86_reg src_stride,
x86_reg start_y, x86_reg end_y, x86_reg bh);
typedef void emu_edge_vvar_func(uint8_t *dst, x86_reg dst_stride,
const uint8_t *src, x86_reg src_stride,
x86_reg start_y, x86_reg end_y, x86_reg bh,
x86_reg w);
extern emu_edge_vfix_func ff_emu_edge_vfix1_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix2_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix3_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix4_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix5_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix6_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix7_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix8_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix9_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix10_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix11_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix12_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix13_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix14_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix15_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix16_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix17_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix18_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix19_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix20_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix21_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix22_mmx;
#if ARCH_X86_32
static emu_edge_vfix_func *vfixtbl_mmx[22] = {
&ff_emu_edge_vfix1_mmx, &ff_emu_edge_vfix2_mmx, &ff_emu_edge_vfix3_mmx,
&ff_emu_edge_vfix4_mmx, &ff_emu_edge_vfix5_mmx, &ff_emu_edge_vfix6_mmx,
&ff_emu_edge_vfix7_mmx, &ff_emu_edge_vfix8_mmx, &ff_emu_edge_vfix9_mmx,
&ff_emu_edge_vfix10_mmx, &ff_emu_edge_vfix11_mmx, &ff_emu_edge_vfix12_mmx,
&ff_emu_edge_vfix13_mmx, &ff_emu_edge_vfix14_mmx, &ff_emu_edge_vfix15_mmx,
&ff_emu_edge_vfix16_mmx, &ff_emu_edge_vfix17_mmx, &ff_emu_edge_vfix18_mmx,
&ff_emu_edge_vfix19_mmx, &ff_emu_edge_vfix20_mmx, &ff_emu_edge_vfix21_mmx,
&ff_emu_edge_vfix22_mmx
};
#endif
extern emu_edge_vvar_func ff_emu_edge_vvar_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix16_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix17_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix18_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix19_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix20_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix21_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix22_sse;
static emu_edge_vfix_func *vfixtbl_sse[22] = {
ff_emu_edge_vfix1_mmx, ff_emu_edge_vfix2_mmx, ff_emu_edge_vfix3_mmx,
ff_emu_edge_vfix4_mmx, ff_emu_edge_vfix5_mmx, ff_emu_edge_vfix6_mmx,
ff_emu_edge_vfix7_mmx, ff_emu_edge_vfix8_mmx, ff_emu_edge_vfix9_mmx,
ff_emu_edge_vfix10_mmx, ff_emu_edge_vfix11_mmx, ff_emu_edge_vfix12_mmx,
ff_emu_edge_vfix13_mmx, ff_emu_edge_vfix14_mmx, ff_emu_edge_vfix15_mmx,
ff_emu_edge_vfix16_sse, ff_emu_edge_vfix17_sse, ff_emu_edge_vfix18_sse,
ff_emu_edge_vfix19_sse, ff_emu_edge_vfix20_sse, ff_emu_edge_vfix21_sse,
ff_emu_edge_vfix22_sse
};
extern emu_edge_vvar_func ff_emu_edge_vvar_sse;
typedef void emu_edge_hfix_func(uint8_t *dst, x86_reg dst_stride,
x86_reg start_x, x86_reg bh);
typedef void emu_edge_hvar_func(uint8_t *dst, x86_reg dst_stride,
x86_reg start_x, x86_reg n_words, x86_reg bh);
extern emu_edge_hfix_func ff_emu_edge_hfix2_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix4_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix6_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix8_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix10_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix12_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix14_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix16_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix18_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix20_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix22_mmx;
#if ARCH_X86_32
static emu_edge_hfix_func *hfixtbl_mmx[11] = {
ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx,
ff_emu_edge_hfix8_mmx, ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx,
ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_mmx, ff_emu_edge_hfix18_mmx,
ff_emu_edge_hfix20_mmx, ff_emu_edge_hfix22_mmx
};
#endif
extern emu_edge_hvar_func ff_emu_edge_hvar_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix16_sse2;
extern emu_edge_hfix_func ff_emu_edge_hfix18_sse2;
extern emu_edge_hfix_func ff_emu_edge_hfix20_sse2;
extern emu_edge_hfix_func ff_emu_edge_hfix22_sse2;
static emu_edge_hfix_func *hfixtbl_sse2[11] = {
ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx,
ff_emu_edge_hfix8_mmx, ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx,
ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_sse2, ff_emu_edge_hfix18_sse2,
ff_emu_edge_hfix20_sse2, ff_emu_edge_hfix22_sse2
};
extern emu_edge_hvar_func ff_emu_edge_hvar_sse2;
static av_always_inline void emulated_edge_mc(uint8_t *dst, ptrdiff_t dst_stride,
const uint8_t *src, ptrdiff_t src_stride,
x86_reg block_w, x86_reg block_h,
x86_reg src_x, x86_reg src_y,
x86_reg w, x86_reg h,
emu_edge_vfix_func **vfix_tbl,
emu_edge_vvar_func *v_extend_var,
emu_edge_hfix_func **hfix_tbl,
emu_edge_hvar_func *h_extend_var)
{
x86_reg start_y, start_x, end_y, end_x, src_y_add = 0, p;
if(!w || !h)
return;
if (src_y >= h) {
src -= src_y*src_stride;
src_y_add = h - 1;
src_y = h - 1;
} else if (src_y <= -block_h) {
src -= src_y*src_stride;
src_y_add = 1 - block_h;
src_y = 1 - block_h;
}
if (src_x >= w) {
src += w - 1 - src_x;
src_x = w - 1;
} else if (src_x <= -block_w) {
src += 1 - block_w - src_x;
src_x = 1 - block_w;
}
start_y = FFMAX(0, -src_y);
start_x = FFMAX(0, -src_x);
end_y = FFMIN(block_h, h-src_y);
end_x = FFMIN(block_w, w-src_x);
av_assert2(start_x < end_x && block_w > 0);
av_assert2(start_y < end_y && block_h > 0);
// fill in the to-be-copied part plus all above/below
src += (src_y_add + start_y) * src_stride + start_x;
w = end_x - start_x;
if (w <= 22) {
vfix_tbl[w - 1](dst + start_x, dst_stride, src, src_stride,
start_y, end_y, block_h);
} else {
v_extend_var(dst + start_x, dst_stride, src, src_stride,
start_y, end_y, block_h, w);
}
// fill left
if (start_x) {
if (start_x <= 22) {
hfix_tbl[(start_x - 1) >> 1](dst, dst_stride, start_x, block_h);
} else {
h_extend_var(dst, dst_stride,
start_x, (start_x + 1) >> 1, block_h);
}
}
// fill right
p = block_w - end_x;
if (p) {
if (p <= 22) {
hfix_tbl[(p - 1) >> 1](dst + end_x - (p & 1), dst_stride,
-!(p & 1), block_h);
} else {
h_extend_var(dst + end_x - (p & 1), dst_stride,
-!(p & 1), (p + 1) >> 1, block_h);
}
}
}
#if ARCH_X86_32
static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, ptrdiff_t buf_stride,
const uint8_t *src, ptrdiff_t src_stride,
int block_w, int block_h,
int src_x, int src_y, int w, int h)
{
emulated_edge_mc(buf, buf_stride, src, src_stride, block_w, block_h,
src_x, src_y, w, h, vfixtbl_mmx, &ff_emu_edge_vvar_mmx,
hfixtbl_mmx, &ff_emu_edge_hvar_mmx);
}
static av_noinline void emulated_edge_mc_sse(uint8_t *buf, ptrdiff_t buf_stride,
const uint8_t *src, ptrdiff_t src_stride,
int block_w, int block_h,
int src_x, int src_y, int w, int h)
{
emulated_edge_mc(buf, buf_stride, src, src_stride, block_w, block_h,
src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
hfixtbl_mmx, &ff_emu_edge_hvar_mmx);
}
#endif
static av_noinline void emulated_edge_mc_sse2(uint8_t *buf, ptrdiff_t buf_stride,
const uint8_t *src, ptrdiff_t src_stride,
int block_w, int block_h,
int src_x, int src_y, int w, int h)
{
emulated_edge_mc(buf, buf_stride, src, src_stride, block_w, block_h, src_x,
src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
hfixtbl_sse2, &ff_emu_edge_hvar_sse2);
}
#endif /* HAVE_YASM */
void ff_prefetch_mmxext(uint8_t *buf, ptrdiff_t stride, int h);
void ff_prefetch_3dnow(uint8_t *buf, ptrdiff_t stride, int h);
av_cold void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc)
{
#if HAVE_YASM
int cpu_flags = av_get_cpu_flags();
#if ARCH_X86_32
if (EXTERNAL_MMX(cpu_flags) && bpc <= 8) {
ctx->emulated_edge_mc = emulated_edge_mc_mmx;
}
if (EXTERNAL_AMD3DNOW(cpu_flags)) {
ctx->prefetch = ff_prefetch_3dnow;
}
#endif /* ARCH_X86_32 */
if (EXTERNAL_MMXEXT(cpu_flags)) {
ctx->prefetch = ff_prefetch_mmxext;
}
#if ARCH_X86_32
if (EXTERNAL_SSE(cpu_flags) && bpc <= 8) {
ctx->emulated_edge_mc = emulated_edge_mc_sse;
}
#endif /* ARCH_X86_32 */
if (EXTERNAL_SSE2(cpu_flags) && bpc <= 8) {
ctx->emulated_edge_mc = emulated_edge_mc_sse2;
}
#endif /* HAVE_YASM */
}

View File

@@ -0,0 +1,83 @@
;******************************************************************************
;* Vorbis x86 optimizations
;* Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
pdw_80000000: times 4 dd 0x80000000
SECTION .text
%if ARCH_X86_32
INIT_MMX 3dnow
cglobal vorbis_inverse_coupling, 3, 3, 6, mag, ang, block_size
pxor m7, m7
lea magq, [magq+block_sizeq*4]
lea angq, [angq+block_sizeq*4]
neg block_sizeq
.loop:
mova m0, [magq+block_sizeq*4]
mova m1, [angq+block_sizeq*4]
mova m2, m0
mova m3, m1
pfcmpge m2, m7 ; m <= 0.0
pfcmpge m3, m7 ; a <= 0.0
pslld m2, 31 ; keep only the sign bit
pxor m1, m2
mova m4, m3
pand m3, m1
pandn m4, m1
pfadd m3, m0 ; a = m + ((a < 0) & (a ^ sign(m)))
pfsub m0, m4 ; m = m + ((a > 0) & (a ^ sign(m)))
mova [angq+block_sizeq*4], m3
mova [magq+block_sizeq*4], m0
add block_sizeq, 2
jl .loop
femms
RET
%endif
INIT_XMM sse
cglobal vorbis_inverse_coupling, 3, 4, 6, mag, ang, block_size, cntr
mova m5, [pdw_80000000]
xor cntrq, cntrq
align 16
.loop:
mova m0, [magq+cntrq*4]
mova m1, [angq+cntrq*4]
xorps m2, m2
xorps m3, m3
cmpleps m2, m0 ; m <= 0.0
cmpleps m3, m1 ; a <= 0.0
andps m2, m5 ; keep only the sign bit
xorps m1, m2
mova m4, m3
andps m3, m1
andnps m4, m1
addps m3, m0 ; a = m + ((a < 0) & (a ^ sign(m)))
subps m0, m4 ; m = m + ((a > 0) & (a ^ sign(m)))
mova [angq+cntrq*4], m3
mova [magq+cntrq*4], m0
add cntrq, 4
cmp cntrq, block_sizeq
jl .loop
RET

View File

@@ -0,0 +1,44 @@
/*
* Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vorbisdsp.h"
void ff_vorbis_inverse_coupling_3dnow(float *mag, float *ang,
intptr_t blocksize);
void ff_vorbis_inverse_coupling_sse(float *mag, float *ang,
intptr_t blocksize);
av_cold void ff_vorbisdsp_init_x86(VorbisDSPContext *dsp)
{
#if HAVE_YASM
int cpu_flags = av_get_cpu_flags();
#if ARCH_X86_32
if (EXTERNAL_AMD3DNOW(cpu_flags))
dsp->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_3dnow;
#endif /* ARCH_X86_32 */
if (EXTERNAL_SSE(cpu_flags))
dsp->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_sse;
#endif /* HAVE_YASM */
}

View File

@@ -0,0 +1,709 @@
;******************************************************************************
;* MMX/SSE2-optimized functions for the VP3 decoder
;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
; MMX-optimized functions cribbed from the original VP3 source code.
SECTION_RODATA
vp3_idct_data: times 8 dw 64277
times 8 dw 60547
times 8 dw 54491
times 8 dw 46341
times 8 dw 36410
times 8 dw 25080
times 8 dw 12785
pb_7: times 8 db 0x07
pb_1F: times 8 db 0x1f
pb_81: times 8 db 0x81
cextern pb_1
cextern pb_3
cextern pb_80
cextern pw_8
SECTION .text
; this is off by one or two for some cases when filter_limit is greater than 63
; in: p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1
; out: p1 in mm4, p2 in mm3
%macro VP3_LOOP_FILTER 0
movq m7, m6
pand m6, [pb_7] ; p0&7
psrlw m7, 3
pand m7, [pb_1F] ; p0>>3
movq m3, m2 ; p2
pxor m2, m4
pand m2, [pb_1] ; (p2^p1)&1
movq m5, m2
paddb m2, m2
paddb m2, m5 ; 3*(p2^p1)&1
paddb m2, m6 ; extra bits lost in shifts
pcmpeqb m0, m0
pxor m1, m0 ; 255 - p3
pavgb m1, m2 ; (256 - p3 + extrabits) >> 1
pxor m0, m4 ; 255 - p1
pavgb m0, m3 ; (256 + p2-p1) >> 1
paddb m1, [pb_3]
pavgb m1, m0 ; 128+2+( p2-p1 - p3) >> 2
pavgb m1, m0 ; 128+1+(3*(p2-p1) - p3) >> 3
paddusb m7, m1 ; d+128+1
movq m6, [pb_81]
psubusb m6, m7
psubusb m7, [pb_81]
movq m5, [r2+516] ; flim
pminub m6, m5
pminub m7, m5
movq m0, m6
movq m1, m7
paddb m6, m6
paddb m7, m7
pminub m6, m5
pminub m7, m5
psubb m6, m0
psubb m7, m1
paddusb m4, m7
psubusb m4, m6
psubusb m3, m7
paddusb m3, m6
%endmacro
%macro STORE_4_WORDS 1
movd r2d, %1
mov [r0 -1], r2w
psrlq %1, 32
shr r2, 16
mov [r0+r1 -1], r2w
movd r2d, %1
mov [r0+r1*2-1], r2w
shr r2, 16
mov [r0+r3 -1], r2w
%endmacro
INIT_MMX mmxext
cglobal vp3_v_loop_filter, 3, 4
%if ARCH_X86_64
movsxd r1, r1d
%endif
mov r3, r1
neg r1
movq m6, [r0+r1*2]
movq m4, [r0+r1 ]
movq m2, [r0 ]
movq m1, [r0+r3 ]
VP3_LOOP_FILTER
movq [r0+r1], m4
movq [r0 ], m3
RET
cglobal vp3_h_loop_filter, 3, 4
%if ARCH_X86_64
movsxd r1, r1d
%endif
lea r3, [r1*3]
movd m6, [r0 -2]
movd m4, [r0+r1 -2]
movd m2, [r0+r1*2-2]
movd m1, [r0+r3 -2]
lea r0, [r0+r1*4 ]
punpcklbw m6, [r0 -2]
punpcklbw m4, [r0+r1 -2]
punpcklbw m2, [r0+r1*2-2]
punpcklbw m1, [r0+r3 -2]
sub r0, r3
sub r0, r1
TRANSPOSE4x4B 6, 4, 2, 1, 0
VP3_LOOP_FILTER
SBUTTERFLY bw, 4, 3, 5
STORE_4_WORDS m4
lea r0, [r0+r1*4 ]
STORE_4_WORDS m3
RET
; from original comments: The Macro does IDct on 4 1-D Dcts
%macro BeginIDCT 0
movq m2, I(3)
movq m6, C(3)
movq m4, m2
movq m7, J(5)
pmulhw m4, m6 ; r4 = c3*i3 - i3
movq m1, C(5)
pmulhw m6, m7 ; r6 = c3*i5 - i5
movq m5, m1
pmulhw m1, m2 ; r1 = c5*i3 - i3
movq m3, I(1)
pmulhw m5, m7 ; r5 = c5*i5 - i5
movq m0, C(1)
paddw m4, m2 ; r4 = c3*i3
paddw m6, m7 ; r6 = c3*i5
paddw m2, m1 ; r2 = c5*i3
movq m1, J(7)
paddw m7, m5 ; r7 = c5*i5
movq m5, m0 ; r5 = c1
pmulhw m0, m3 ; r0 = c1*i1 - i1
paddsw m4, m7 ; r4 = C = c3*i3 + c5*i5
pmulhw m5, m1 ; r5 = c1*i7 - i7
movq m7, C(7)
psubsw m6, m2 ; r6 = D = c3*i5 - c5*i3
paddw m0, m3 ; r0 = c1*i1
pmulhw m3, m7 ; r3 = c7*i1
movq m2, I(2)
pmulhw m7, m1 ; r7 = c7*i7
paddw m5, m1 ; r5 = c1*i7
movq m1, m2 ; r1 = i2
pmulhw m2, C(2) ; r2 = c2*i2 - i2
psubsw m3, m5 ; r3 = B = c7*i1 - c1*i7
movq m5, J(6)
paddsw m0, m7 ; r0 = A = c1*i1 + c7*i7
movq m7, m5 ; r7 = i6
psubsw m0, m4 ; r0 = A - C
pmulhw m5, C(2) ; r5 = c2*i6 - i6
paddw m2, m1 ; r2 = c2*i2
pmulhw m1, C(6) ; r1 = c6*i2
paddsw m4, m4 ; r4 = C + C
paddsw m4, m0 ; r4 = C. = A + C
psubsw m3, m6 ; r3 = B - D
paddw m5, m7 ; r5 = c2*i6
paddsw m6, m6 ; r6 = D + D
pmulhw m7, C(6) ; r7 = c6*i6
paddsw m6, m3 ; r6 = D. = B + D
movq I(1), m4 ; save C. at I(1)
psubsw m1, m5 ; r1 = H = c6*i2 - c2*i6
movq m4, C(4)
movq m5, m3 ; r5 = B - D
pmulhw m3, m4 ; r3 = (c4 - 1) * (B - D)
paddsw m7, m2 ; r3 = (c4 - 1) * (B - D)
movq I(2), m6 ; save D. at I(2)
movq m2, m0 ; r2 = A - C
movq m6, I(0)
pmulhw m0, m4 ; r0 = (c4 - 1) * (A - C)
paddw m5, m3 ; r5 = B. = c4 * (B - D)
movq m3, J(4)
psubsw m5, m1 ; r5 = B.. = B. - H
paddw m2, m0 ; r0 = A. = c4 * (A - C)
psubsw m6, m3 ; r6 = i0 - i4
movq m0, m6
pmulhw m6, m4 ; r6 = (c4 - 1) * (i0 - i4)
paddsw m3, m3 ; r3 = i4 + i4
paddsw m1, m1 ; r1 = H + H
paddsw m3, m0 ; r3 = i0 + i4
paddsw m1, m5 ; r1 = H. = B + H
pmulhw m4, m3 ; r4 = (c4 - 1) * (i0 + i4)
paddsw m6, m0 ; r6 = F = c4 * (i0 - i4)
psubsw m6, m2 ; r6 = F. = F - A.
paddsw m2, m2 ; r2 = A. + A.
movq m0, I(1) ; r0 = C.
paddsw m2, m6 ; r2 = A.. = F + A.
paddw m4, m3 ; r4 = E = c4 * (i0 + i4)
psubsw m2, m1 ; r2 = R2 = A.. - H.
%endmacro
; RowIDCT gets ready to transpose
%macro RowIDCT 0
BeginIDCT
movq m3, I(2) ; r3 = D.
psubsw m4, m7 ; r4 = E. = E - G
paddsw m1, m1 ; r1 = H. + H.
paddsw m7, m7 ; r7 = G + G
paddsw m1, m2 ; r1 = R1 = A.. + H.
paddsw m7, m4 ; r1 = R1 = A.. + H.
psubsw m4, m3 ; r4 = R4 = E. - D.
paddsw m3, m3
psubsw m6, m5 ; r6 = R6 = F. - B..
paddsw m5, m5
paddsw m3, m4 ; r3 = R3 = E. + D.
paddsw m5, m6 ; r5 = R5 = F. + B..
psubsw m7, m0 ; r7 = R7 = G. - C.
paddsw m0, m0
movq I(1), m1 ; save R1
paddsw m0, m7 ; r0 = R0 = G. + C.
%endmacro
; Column IDCT normalizes and stores final results
%macro ColumnIDCT 0
BeginIDCT
paddsw m2, OC_8 ; adjust R2 (and R1) for shift
paddsw m1, m1 ; r1 = H. + H.
paddsw m1, m2 ; r1 = R1 = A.. + H.
psraw m2, 4 ; r2 = NR2
psubsw m4, m7 ; r4 = E. = E - G
psraw m1, 4 ; r1 = NR2
movq m3, I(2) ; r3 = D.
paddsw m7, m7 ; r7 = G + G
movq I(2), m2 ; store NR2 at I2
paddsw m7, m4 ; r7 = G. = E + G
movq I(1), m1 ; store NR1 at I1
psubsw m4, m3 ; r4 = R4 = E. - D.
paddsw m4, OC_8 ; adjust R4 (and R3) for shift
paddsw m3, m3 ; r3 = D. + D.
paddsw m3, m4 ; r3 = R3 = E. + D.
psraw m4, 4 ; r4 = NR4
psubsw m6, m5 ; r6 = R6 = F. - B..
psraw m3, 4 ; r3 = NR3
paddsw m6, OC_8 ; adjust R6 (and R5) for shift
paddsw m5, m5 ; r5 = B.. + B..
paddsw m5, m6 ; r5 = R5 = F. + B..
psraw m6, 4 ; r6 = NR6
movq J(4), m4 ; store NR4 at J4
psraw m5, 4 ; r5 = NR5
movq I(3), m3 ; store NR3 at I3
psubsw m7, m0 ; r7 = R7 = G. - C.
paddsw m7, OC_8 ; adjust R7 (and R0) for shift
paddsw m0, m0 ; r0 = C. + C.
paddsw m0, m7 ; r0 = R0 = G. + C.
psraw m7, 4 ; r7 = NR7
movq J(6), m6 ; store NR6 at J6
psraw m0, 4 ; r0 = NR0
movq J(5), m5 ; store NR5 at J5
movq J(7), m7 ; store NR7 at J7
movq I(0), m0 ; store NR0 at I0
%endmacro
; Following macro does two 4x4 transposes in place.
;
; At entry (we assume):
;
; r0 = a3 a2 a1 a0
; I(1) = b3 b2 b1 b0
; r2 = c3 c2 c1 c0
; r3 = d3 d2 d1 d0
;
; r4 = e3 e2 e1 e0
; r5 = f3 f2 f1 f0
; r6 = g3 g2 g1 g0
; r7 = h3 h2 h1 h0
;
; At exit, we have:
;
; I(0) = d0 c0 b0 a0
; I(1) = d1 c1 b1 a1
; I(2) = d2 c2 b2 a2
; I(3) = d3 c3 b3 a3
;
; J(4) = h0 g0 f0 e0
; J(5) = h1 g1 f1 e1
; J(6) = h2 g2 f2 e2
; J(7) = h3 g3 f3 e3
;
; I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
; J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
;
; Since r1 is free at entry, we calculate the Js first.
%macro Transpose 0
movq m1, m4 ; r1 = e3 e2 e1 e0
punpcklwd m4, m5 ; r4 = f1 e1 f0 e0
movq I(0), m0 ; save a3 a2 a1 a0
punpckhwd m1, m5 ; r1 = f3 e3 f2 e2
movq m0, m6 ; r0 = g3 g2 g1 g0
punpcklwd m6, m7 ; r6 = h1 g1 h0 g0
movq m5, m4 ; r5 = f1 e1 f0 e0
punpckldq m4, m6 ; r4 = h0 g0 f0 e0 = R4
punpckhdq m5, m6 ; r5 = h1 g1 f1 e1 = R5
movq m6, m1 ; r6 = f3 e3 f2 e2
movq J(4), m4
punpckhwd m0, m7 ; r0 = h3 g3 h2 g2
movq J(5), m5
punpckhdq m6, m0 ; r6 = h3 g3 f3 e3 = R7
movq m4, I(0) ; r4 = a3 a2 a1 a0
punpckldq m1, m0 ; r1 = h2 g2 f2 e2 = R6
movq m5, I(1) ; r5 = b3 b2 b1 b0
movq m0, m4 ; r0 = a3 a2 a1 a0
movq J(7), m6
punpcklwd m0, m5 ; r0 = b1 a1 b0 a0
movq J(6), m1
punpckhwd m4, m5 ; r4 = b3 a3 b2 a2
movq m5, m2 ; r5 = c3 c2 c1 c0
punpcklwd m2, m3 ; r2 = d1 c1 d0 c0
movq m1, m0 ; r1 = b1 a1 b0 a0
punpckldq m0, m2 ; r0 = d0 c0 b0 a0 = R0
punpckhdq m1, m2 ; r1 = d1 c1 b1 a1 = R1
movq m2, m4 ; r2 = b3 a3 b2 a2
movq I(0), m0
punpckhwd m5, m3 ; r5 = d3 c3 d2 c2
movq I(1), m1
punpckhdq m4, m5 ; r4 = d3 c3 b3 a3 = R3
punpckldq m2, m5 ; r2 = d2 c2 b2 a2 = R2
movq I(3), m4
movq I(2), m2
%endmacro
%macro VP3_1D_IDCT_SSE2 0
movdqa m2, I(3) ; xmm2 = i3
movdqa m6, C(3) ; xmm6 = c3
movdqa m4, m2 ; xmm4 = i3
movdqa m7, I(5) ; xmm7 = i5
pmulhw m4, m6 ; xmm4 = c3 * i3 - i3
movdqa m1, C(5) ; xmm1 = c5
pmulhw m6, m7 ; xmm6 = c3 * i5 - i5
movdqa m5, m1 ; xmm5 = c5
pmulhw m1, m2 ; xmm1 = c5 * i3 - i3
movdqa m3, I(1) ; xmm3 = i1
pmulhw m5, m7 ; xmm5 = c5 * i5 - i5
movdqa m0, C(1) ; xmm0 = c1
paddw m4, m2 ; xmm4 = c3 * i3
paddw m6, m7 ; xmm6 = c3 * i5
paddw m2, m1 ; xmm2 = c5 * i3
movdqa m1, I(7) ; xmm1 = i7
paddw m7, m5 ; xmm7 = c5 * i5
movdqa m5, m0 ; xmm5 = c1
pmulhw m0, m3 ; xmm0 = c1 * i1 - i1
paddsw m4, m7 ; xmm4 = c3 * i3 + c5 * i5 = C
pmulhw m5, m1 ; xmm5 = c1 * i7 - i7
movdqa m7, C(7) ; xmm7 = c7
psubsw m6, m2 ; xmm6 = c3 * i5 - c5 * i3 = D
paddw m0, m3 ; xmm0 = c1 * i1
pmulhw m3, m7 ; xmm3 = c7 * i1
movdqa m2, I(2) ; xmm2 = i2
pmulhw m7, m1 ; xmm7 = c7 * i7
paddw m5, m1 ; xmm5 = c1 * i7
movdqa m1, m2 ; xmm1 = i2
pmulhw m2, C(2) ; xmm2 = i2 * c2 -i2
psubsw m3, m5 ; xmm3 = c7 * i1 - c1 * i7 = B
movdqa m5, I(6) ; xmm5 = i6
paddsw m0, m7 ; xmm0 = c1 * i1 + c7 * i7 = A
movdqa m7, m5 ; xmm7 = i6
psubsw m0, m4 ; xmm0 = A - C
pmulhw m5, C(2) ; xmm5 = c2 * i6 - i6
paddw m2, m1 ; xmm2 = i2 * c2
pmulhw m1, C(6) ; xmm1 = c6 * i2
paddsw m4, m4 ; xmm4 = C + C
paddsw m4, m0 ; xmm4 = A + C = C.
psubsw m3, m6 ; xmm3 = B - D
paddw m5, m7 ; xmm5 = c2 * i6
paddsw m6, m6 ; xmm6 = D + D
pmulhw m7, C(6) ; xmm7 = c6 * i6
paddsw m6, m3 ; xmm6 = B + D = D.
movdqa I(1), m4 ; Save C. at I(1)
psubsw m1, m5 ; xmm1 = c6 * i2 - c2 * i6 = H
movdqa m4, C(4) ; xmm4 = C4
movdqa m5, m3 ; xmm5 = B - D
pmulhw m3, m4 ; xmm3 = ( c4 -1 ) * ( B - D )
paddsw m7, m2 ; xmm7 = c2 * i2 + c6 * i6 = G
movdqa I(2), m6 ; save D. at I(2)
movdqa m2, m0 ; xmm2 = A - C
movdqa m6, I(0) ; xmm6 = i0
pmulhw m0, m4 ; xmm0 = ( c4 - 1 ) * ( A - C ) = A.
paddw m5, m3 ; xmm5 = c4 * ( B - D ) = B.
movdqa m3, I(4) ; xmm3 = i4
psubsw m5, m1 ; xmm5 = B. - H = B..
paddw m2, m0 ; xmm2 = c4 * ( A - C) = A.
psubsw m6, m3 ; xmm6 = i0 - i4
movdqa m0, m6 ; xmm0 = i0 - i4
pmulhw m6, m4 ; xmm6 = (c4 - 1) * (i0 - i4) = F
paddsw m3, m3 ; xmm3 = i4 + i4
paddsw m1, m1 ; xmm1 = H + H
paddsw m3, m0 ; xmm3 = i0 + i4
paddsw m1, m5 ; xmm1 = B. + H = H.
pmulhw m4, m3 ; xmm4 = ( c4 - 1 ) * ( i0 + i4 )
paddw m6, m0 ; xmm6 = c4 * ( i0 - i4 )
psubsw m6, m2 ; xmm6 = F - A. = F.
paddsw m2, m2 ; xmm2 = A. + A.
movdqa m0, I(1) ; Load C. from I(1)
paddsw m2, m6 ; xmm2 = F + A. = A..
paddw m4, m3 ; xmm4 = c4 * ( i0 + i4 ) = 3
psubsw m2, m1 ; xmm2 = A.. - H. = R2
ADD(m2) ; Adjust R2 and R1 before shifting
paddsw m1, m1 ; xmm1 = H. + H.
paddsw m1, m2 ; xmm1 = A.. + H. = R1
SHIFT(m2) ; xmm2 = op2
psubsw m4, m7 ; xmm4 = E - G = E.
SHIFT(m1) ; xmm1 = op1
movdqa m3, I(2) ; Load D. from I(2)
paddsw m7, m7 ; xmm7 = G + G
paddsw m7, m4 ; xmm7 = E + G = G.
psubsw m4, m3 ; xmm4 = E. - D. = R4
ADD(m4) ; Adjust R4 and R3 before shifting
paddsw m3, m3 ; xmm3 = D. + D.
paddsw m3, m4 ; xmm3 = E. + D. = R3
SHIFT(m4) ; xmm4 = op4
psubsw m6, m5 ; xmm6 = F. - B..= R6
SHIFT(m3) ; xmm3 = op3
ADD(m6) ; Adjust R6 and R5 before shifting
paddsw m5, m5 ; xmm5 = B.. + B..
paddsw m5, m6 ; xmm5 = F. + B.. = R5
SHIFT(m6) ; xmm6 = op6
SHIFT(m5) ; xmm5 = op5
psubsw m7, m0 ; xmm7 = G. - C. = R7
ADD(m7) ; Adjust R7 and R0 before shifting
paddsw m0, m0 ; xmm0 = C. + C.
paddsw m0, m7 ; xmm0 = G. + C.
SHIFT(m7) ; xmm7 = op7
SHIFT(m0) ; xmm0 = op0
%endmacro
%macro PUT_BLOCK 8
movdqa O(0), m%1
movdqa O(1), m%2
movdqa O(2), m%3
movdqa O(3), m%4
movdqa O(4), m%5
movdqa O(5), m%6
movdqa O(6), m%7
movdqa O(7), m%8
%endmacro
%macro VP3_IDCT 1
%if mmsize == 16
%define I(x) [%1+16*x]
%define O(x) [%1+16*x]
%define C(x) [vp3_idct_data+16*(x-1)]
%define SHIFT(x)
%define ADD(x)
VP3_1D_IDCT_SSE2
%if ARCH_X86_64
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%1], [%1+16]
%endif
PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
%define SHIFT(x) psraw x, 4
%define ADD(x) paddsw x, [pw_8]
VP3_1D_IDCT_SSE2
PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
%else ; mmsize == 8
; eax = quantized input
; ebx = dequantizer matrix
; ecx = IDCT constants
; M(I) = ecx + MaskOffset(0) + I * 8
; C(I) = ecx + CosineOffset(32) + (I-1) * 8
; edx = output
; r0..r7 = mm0..mm7
%define OC_8 [pw_8]
%define C(x) [vp3_idct_data+16*(x-1)]
; at this point, function has completed dequantization + dezigzag +
; partial transposition; now do the idct itself
%define I(x) [%1+16*x]
%define J(x) [%1+16*x]
RowIDCT
Transpose
%define I(x) [%1+16*x+8]
%define J(x) [%1+16*x+8]
RowIDCT
Transpose
%define I(x) [%1+16* x]
%define J(x) [%1+16*(x-4)+8]
ColumnIDCT
%define I(x) [%1+16* x +64]
%define J(x) [%1+16*(x-4)+72]
ColumnIDCT
%endif ; mmsize == 16/8
%endmacro
%macro vp3_idct_funcs 0
cglobal vp3_idct_put, 3, 4, 9
VP3_IDCT r2
movsxdifnidn r1, r1d
mova m4, [pb_80]
lea r3, [r1*3]
%assign %%i 0
%rep 16/mmsize
mova m0, [r2+mmsize*0+%%i]
mova m1, [r2+mmsize*2+%%i]
mova m2, [r2+mmsize*4+%%i]
mova m3, [r2+mmsize*6+%%i]
%if mmsize == 8
packsswb m0, [r2+mmsize*8+%%i]
packsswb m1, [r2+mmsize*10+%%i]
packsswb m2, [r2+mmsize*12+%%i]
packsswb m3, [r2+mmsize*14+%%i]
%else
packsswb m0, [r2+mmsize*1+%%i]
packsswb m1, [r2+mmsize*3+%%i]
packsswb m2, [r2+mmsize*5+%%i]
packsswb m3, [r2+mmsize*7+%%i]
%endif
paddb m0, m4
paddb m1, m4
paddb m2, m4
paddb m3, m4
movq [r0 ], m0
%if mmsize == 8
movq [r0+r1 ], m1
movq [r0+r1*2], m2
movq [r0+r3 ], m3
%else
movhps [r0+r1 ], m0
movq [r0+r1*2], m1
movhps [r0+r3 ], m1
%endif
%if %%i == 0
lea r0, [r0+r1*4]
%endif
%if mmsize == 16
movq [r0 ], m2
movhps [r0+r1 ], m2
movq [r0+r1*2], m3
movhps [r0+r3 ], m3
%endif
%assign %%i %%i+8
%endrep
pxor m0, m0
%assign %%offset 0
%rep 128/mmsize
mova [r2+%%offset], m0
%assign %%offset %%offset+mmsize
%endrep
RET
cglobal vp3_idct_add, 3, 4, 9
VP3_IDCT r2
movsxdifnidn r1, r1d
lea r3, [r1*3]
pxor m4, m4
%if mmsize == 16
%assign %%i 0
%rep 2
movq m0, [r0]
movq m1, [r0+r1]
movq m2, [r0+r1*2]
movq m3, [r0+r3]
punpcklbw m0, m4
punpcklbw m1, m4
punpcklbw m2, m4
punpcklbw m3, m4
paddsw m0, [r2+ 0+%%i]
paddsw m1, [r2+16+%%i]
paddsw m2, [r2+32+%%i]
paddsw m3, [r2+48+%%i]
packuswb m0, m1
packuswb m2, m3
movq [r0 ], m0
movhps [r0+r1 ], m0
movq [r0+r1*2], m2
movhps [r0+r3 ], m2
%if %%i == 0
lea r0, [r0+r1*4]
%endif
%assign %%i %%i+64
%endrep
%else
%assign %%i 0
%rep 2
movq m0, [r0]
movq m1, [r0+r1]
movq m2, [r0+r1*2]
movq m3, [r0+r3]
movq m5, m0
movq m6, m1
movq m7, m2
punpcklbw m0, m4
punpcklbw m1, m4
punpcklbw m2, m4
punpckhbw m5, m4
punpckhbw m6, m4
punpckhbw m7, m4
paddsw m0, [r2+ 0+%%i]
paddsw m1, [r2+16+%%i]
paddsw m2, [r2+32+%%i]
paddsw m5, [r2+64+%%i]
paddsw m6, [r2+80+%%i]
paddsw m7, [r2+96+%%i]
packuswb m0, m5
movq m5, m3
punpcklbw m3, m4
punpckhbw m5, m4
packuswb m1, m6
paddsw m3, [r2+48+%%i]
paddsw m5, [r2+112+%%i]
packuswb m2, m7
packuswb m3, m5
movq [r0 ], m0
movq [r0+r1 ], m1
movq [r0+r1*2], m2
movq [r0+r3 ], m3
%if %%i == 0
lea r0, [r0+r1*4]
%endif
%assign %%i %%i+8
%endrep
%endif
%assign %%i 0
%rep 128/mmsize
mova [r2+%%i], m4
%assign %%i %%i+mmsize
%endrep
RET
%endmacro
%if ARCH_X86_32
INIT_MMX mmx
vp3_idct_funcs
%endif
INIT_XMM sse2
vp3_idct_funcs
%macro DC_ADD 0
movq m2, [r0 ]
movq m3, [r0+r1 ]
paddusb m2, m0
movq m4, [r0+r1*2]
paddusb m3, m0
movq m5, [r0+r2 ]
paddusb m4, m0
paddusb m5, m0
psubusb m2, m1
psubusb m3, m1
movq [r0 ], m2
psubusb m4, m1
movq [r0+r1 ], m3
psubusb m5, m1
movq [r0+r1*2], m4
movq [r0+r2 ], m5
%endmacro
INIT_MMX mmxext
cglobal vp3_idct_dc_add, 3, 4
%if ARCH_X86_64
movsxd r1, r1d
%endif
movsx r3, word [r2]
mov word [r2], 0
lea r2, [r1*3]
add r3, 15
sar r3, 5
movd m0, r3d
pshufw m0, m0, 0x0
pxor m1, m1
psubw m1, m0
packuswb m0, m0
packuswb m1, m1
DC_ADD
lea r0, [r0+r1*4]
DC_ADD
RET

View File

@@ -0,0 +1,128 @@
/*
* Copyright (c) 2009 David Conrad <lessen42@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/vp3dsp.h"
#include "config.h"
void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block);
void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block);
void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, int16_t *block);
void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, int16_t *block);
void ff_vp3_idct_dc_add_mmxext(uint8_t *dest, int line_size,
int16_t *block);
void ff_vp3_v_loop_filter_mmxext(uint8_t *src, int stride,
int *bounding_values);
void ff_vp3_h_loop_filter_mmxext(uint8_t *src, int stride,
int *bounding_values);
#if HAVE_MMX_INLINE
#define MOVQ_BFE(regd) \
__asm__ volatile ( \
"pcmpeqd %%"#regd", %%"#regd" \n\t" \
"paddb %%"#regd", %%"#regd" \n\t" ::)
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
"movq "#rega", "#regr" \n\t" \
"movq "#regc", "#regp" \n\t" \
"pand "#regb", "#regr" \n\t" \
"pand "#regd", "#regp" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pxor "#regc", "#regd" \n\t" \
"pand %%mm6, "#regb" \n\t" \
"pand %%mm6, "#regd" \n\t" \
"psrlq $1, "#regb" \n\t" \
"psrlq $1, "#regd" \n\t" \
"paddb "#regb", "#regr" \n\t" \
"paddb "#regd", "#regp" \n\t"
static void put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, ptrdiff_t stride, int h)
{
// START_TIMER
MOVQ_BFE(mm6);
__asm__ volatile(
"1: \n\t"
"movq (%1), %%mm0 \n\t"
"movq (%2), %%mm1 \n\t"
"movq (%1,%4), %%mm2 \n\t"
"movq (%2,%4), %%mm3 \n\t"
PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%3) \n\t"
"movq %%mm5, (%3,%4) \n\t"
"movq (%1,%4,2), %%mm0 \n\t"
"movq (%2,%4,2), %%mm1 \n\t"
"movq (%1,%5), %%mm2 \n\t"
"movq (%2,%5), %%mm3 \n\t"
"lea (%1,%4,4), %1 \n\t"
"lea (%2,%4,4), %2 \n\t"
PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%3,%4,2) \n\t"
"movq %%mm5, (%3,%5) \n\t"
"lea (%3,%4,4), %3 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
:"+r"(h), "+r"(a), "+r"(b), "+r"(dst)
:"r"((x86_reg)stride), "r"((x86_reg)3L*stride)
:"memory");
// STOP_TIMER("put_vp_no_rnd_pixels8_l2_mmx")
}
#endif /* HAVE_MMX_INLINE */
av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags)
{
int cpu_flags = av_get_cpu_flags();
#if HAVE_MMX_INLINE
c->put_no_rnd_pixels_l2 = put_vp_no_rnd_pixels8_l2_mmx;
#endif /* HAVE_MMX_INLINE */
#if ARCH_X86_32
if (EXTERNAL_MMX(cpu_flags)) {
c->idct_put = ff_vp3_idct_put_mmx;
c->idct_add = ff_vp3_idct_add_mmx;
}
#endif
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->idct_dc_add = ff_vp3_idct_dc_add_mmxext;
if (!(flags & CODEC_FLAG_BITEXACT)) {
c->v_loop_filter = ff_vp3_v_loop_filter_mmxext;
c->h_loop_filter = ff_vp3_h_loop_filter_mmxext;
}
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->idct_put = ff_vp3_idct_put_sse2;
c->idct_add = ff_vp3_idct_add_sse2;
}
}

View File

@@ -0,0 +1,54 @@
/**
* VP5 and VP6 compatible video decoder (arith decoder)
*
* Copyright (C) 2006 Aurelien Jacobs <aurel@gnuage.org>
* Copyright (C) 2010 Eli Friedman
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_VP56_ARITH_H
#define AVCODEC_X86_VP56_ARITH_H
#if HAVE_INLINE_ASM && HAVE_FAST_CMOV
#define vp56_rac_get_prob vp56_rac_get_prob
static av_always_inline int vp56_rac_get_prob(VP56RangeCoder *c, uint8_t prob)
{
unsigned int code_word = vp56_rac_renorm(c);
unsigned int high = c->high;
unsigned int low = 1 + (((high - 1) * prob) >> 8);
unsigned int low_shift = low << 16;
int bit = 0;
__asm__(
"subl %4, %1 \n\t"
"subl %3, %2 \n\t"
"leal (%2, %3), %3 \n\t"
"setae %b0 \n\t"
"cmovb %4, %1 \n\t"
"cmovb %3, %2 \n\t"
: "+q"(bit), "+r"(high), "+r"(code_word), "+r"(low_shift)
: "r"(low)
);
c->high = high;
c->code_word = code_word;
return bit;
}
#endif
#endif /* AVCODEC_X86_VP56_ARITH_H */

View File

@@ -0,0 +1,170 @@
;******************************************************************************
;* MMX/SSE2-optimized functions for the VP6 decoder
;* Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com>
;* Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
cextern pw_64
SECTION .text
%macro DIAG4 6
%if mmsize == 8
movq m0, [%1+%2]
movq m1, [%1+%3]
movq m3, m0
movq m4, m1
punpcklbw m0, m7
punpcklbw m1, m7
punpckhbw m3, m7
punpckhbw m4, m7
pmullw m0, [rsp+8*11] ; src[x-8 ] * biweight [0]
pmullw m1, [rsp+8*12] ; src[x ] * biweight [1]
pmullw m3, [rsp+8*11] ; src[x-8 ] * biweight [0]
pmullw m4, [rsp+8*12] ; src[x ] * biweight [1]
paddw m0, m1
paddw m3, m4
movq m1, [%1+%4]
movq m2, [%1+%5]
movq m4, m1
movq m5, m2
punpcklbw m1, m7
punpcklbw m2, m7
punpckhbw m4, m7
punpckhbw m5, m7
pmullw m1, [rsp+8*13] ; src[x+8 ] * biweight [2]
pmullw m2, [rsp+8*14] ; src[x+16] * biweight [3]
pmullw m4, [rsp+8*13] ; src[x+8 ] * biweight [2]
pmullw m5, [rsp+8*14] ; src[x+16] * biweight [3]
paddw m1, m2
paddw m4, m5
paddsw m0, m1
paddsw m3, m4
paddsw m0, m6 ; Add 64
paddsw m3, m6 ; Add 64
psraw m0, 7
psraw m3, 7
packuswb m0, m3
movq [%6], m0
%else ; mmsize == 16
movq m0, [%1+%2]
movq m1, [%1+%3]
punpcklbw m0, m7
punpcklbw m1, m7
pmullw m0, m4 ; src[x-8 ] * biweight [0]
pmullw m1, m5 ; src[x ] * biweight [1]
paddw m0, m1
movq m1, [%1+%4]
movq m2, [%1+%5]
punpcklbw m1, m7
punpcklbw m2, m7
pmullw m1, m6 ; src[x+8 ] * biweight [2]
pmullw m2, m3 ; src[x+16] * biweight [3]
paddw m1, m2
paddsw m0, m1
paddsw m0, [pw_64] ; Add 64
psraw m0, 7
packuswb m0, m0
movq [%6], m0
%endif ; mmsize == 8/16
%endmacro
%macro SPLAT4REGS 0
%if mmsize == 8
movq m5, m3
punpcklwd m3, m3
movq m4, m3
punpckldq m3, m3
punpckhdq m4, m4
punpckhwd m5, m5
movq m2, m5
punpckhdq m2, m2
punpckldq m5, m5
movq [rsp+8*11], m3
movq [rsp+8*12], m4
movq [rsp+8*13], m5
movq [rsp+8*14], m2
%else ; mmsize == 16
pshuflw m4, m3, 0x0
pshuflw m5, m3, 0x55
pshuflw m6, m3, 0xAA
pshuflw m3, m3, 0xFF
punpcklqdq m4, m4
punpcklqdq m5, m5
punpcklqdq m6, m6
punpcklqdq m3, m3
%endif ; mmsize == 8/16
%endmacro
%macro vp6_filter_diag4 0
; void ff_vp6_filter_diag4_<opt>(uint8_t *dst, uint8_t *src, int stride,
; const int16_t h_weight[4], const int16_t v_weights[4])
cglobal vp6_filter_diag4, 5, 7, 8
mov r5, rsp ; backup stack pointer
and rsp, ~(mmsize-1) ; align stack
%if mmsize == 16
sub rsp, 8*11
%else
sub rsp, 8*15
movq m6, [pw_64]
%endif
%if ARCH_X86_64
movsxd r2, r2d
%endif
sub r1, r2
pxor m7, m7
movq m3, [r3]
SPLAT4REGS
mov r3, rsp
mov r6, 11
.nextrow:
DIAG4 r1, -1, 0, 1, 2, r3
add r3, 8
add r1, r2
dec r6
jnz .nextrow
movq m3, [r4]
SPLAT4REGS
lea r3, [rsp+8]
mov r6, 8
.nextcol:
DIAG4 r3, -8, 0, 8, 16, r0
add r3, 8
add r0, r2
dec r6
jnz .nextcol
mov rsp, r5 ; restore stack pointer
RET
%endmacro
%if ARCH_X86_32
INIT_MMX mmx
vp6_filter_diag4
%endif
INIT_XMM sse2
vp6_filter_diag4

View File

@@ -0,0 +1,45 @@
/*
* VP6 MMX/SSE2 optimizations
* Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com>
* Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vp56dsp.h"
void ff_vp6_filter_diag4_mmx(uint8_t *dst, uint8_t *src, int stride,
const int16_t *h_weights,const int16_t *v_weights);
void ff_vp6_filter_diag4_sse2(uint8_t *dst, uint8_t *src, int stride,
const int16_t *h_weights,const int16_t *v_weights);
av_cold void ff_vp6dsp_init_x86(VP56DSPContext* c, enum AVCodecID codec)
{
int cpu_flags = av_get_cpu_flags();
#if ARCH_X86_32
if (EXTERNAL_MMX(cpu_flags)) {
c->vp6_filter_diag4 = ff_vp6_filter_diag4_mmx;
}
#endif
if (EXTERNAL_SSE2(cpu_flags)) {
c->vp6_filter_diag4 = ff_vp6_filter_diag4_sse2;
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,441 @@
/*
* VP8 DSP functions x86-optimized
* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vp8dsp.h"
#if HAVE_YASM
/*
* MC functions
*/
void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel8_h4_sse2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel8_h6_sse2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel8_v4_sse2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel8_v6_sse2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel4_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel4_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel4_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel4_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel8_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel8_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel8_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_bilinear8_h_sse2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_bilinear4_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_bilinear8_v_sse2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_bilinear4_v_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_bilinear8_v_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_pixels8_mmx (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_pixels16_mmx(uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_pixels16_sse(uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
#define TAP_W16(OPT, FILTERTYPE, TAPTYPE) \
static void ff_put_vp8_ ## FILTERTYPE ## 16_ ## TAPTYPE ## _ ## OPT( \
uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
ptrdiff_t srcstride, int height, int mx, int my) \
{ \
ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
dst, dststride, src, srcstride, height, mx, my); \
ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
dst + 8, dststride, src + 8, srcstride, height, mx, my); \
}
#define TAP_W8(OPT, FILTERTYPE, TAPTYPE) \
static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
ptrdiff_t srcstride, int height, int mx, int my) \
{ \
ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
dst, dststride, src, srcstride, height, mx, my); \
ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
dst + 4, dststride, src + 4, srcstride, height, mx, my); \
}
#if ARCH_X86_32
TAP_W8 (mmxext, epel, h4)
TAP_W8 (mmxext, epel, h6)
TAP_W16(mmxext, epel, h6)
TAP_W8 (mmxext, epel, v4)
TAP_W8 (mmxext, epel, v6)
TAP_W16(mmxext, epel, v6)
TAP_W8 (mmxext, bilinear, h)
TAP_W16(mmxext, bilinear, h)
TAP_W8 (mmxext, bilinear, v)
TAP_W16(mmxext, bilinear, v)
#endif
TAP_W16(sse2, epel, h6)
TAP_W16(sse2, epel, v6)
TAP_W16(sse2, bilinear, h)
TAP_W16(sse2, bilinear, v)
TAP_W16(ssse3, epel, h6)
TAP_W16(ssse3, epel, v6)
TAP_W16(ssse3, bilinear, h)
TAP_W16(ssse3, bilinear, v)
#define HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT) \
static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT( \
uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
ptrdiff_t srcstride, int height, int mx, int my) \
{ \
DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + TAPNUMY - 1)]; \
uint8_t *tmpptr = tmp + SIZE * (TAPNUMY / 2 - 1); \
src -= srcstride * (TAPNUMY / 2 - 1); \
ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## _ ## OPT( \
tmp, SIZE, src, srcstride, height + TAPNUMY - 1, mx, my); \
ff_put_vp8_epel ## SIZE ## _v ## TAPNUMY ## _ ## OPT( \
dst, dststride, tmpptr, SIZE, height, mx, my); \
}
#if ARCH_X86_32
#define HVTAPMMX(x, y) \
HVTAP(mmxext, 8, x, y, 4, 8) \
HVTAP(mmxext, 8, x, y, 8, 16)
HVTAP(mmxext, 8, 6, 6, 16, 16)
#else
#define HVTAPMMX(x, y) \
HVTAP(mmxext, 8, x, y, 4, 8)
#endif
HVTAPMMX(4, 4)
HVTAPMMX(4, 6)
HVTAPMMX(6, 4)
HVTAPMMX(6, 6)
#define HVTAPSSE2(x, y, w) \
HVTAP(sse2, 16, x, y, w, 16) \
HVTAP(ssse3, 16, x, y, w, 16)
HVTAPSSE2(4, 4, 8)
HVTAPSSE2(4, 6, 8)
HVTAPSSE2(6, 4, 8)
HVTAPSSE2(6, 6, 8)
HVTAPSSE2(6, 6, 16)
HVTAP(ssse3, 16, 4, 4, 4, 8)
HVTAP(ssse3, 16, 4, 6, 4, 8)
HVTAP(ssse3, 16, 6, 4, 4, 8)
HVTAP(ssse3, 16, 6, 6, 4, 8)
#define HVBILIN(OPT, ALIGN, SIZE, MAXHEIGHT) \
static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
ptrdiff_t srcstride, int height, int mx, int my) \
{ \
DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + 2)]; \
ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT( \
tmp, SIZE, src, srcstride, height + 1, mx, my); \
ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT( \
dst, dststride, tmp, SIZE, height, mx, my); \
}
HVBILIN(mmxext, 8, 4, 8)
#if ARCH_X86_32
HVBILIN(mmxext, 8, 8, 16)
HVBILIN(mmxext, 8, 16, 16)
#endif
HVBILIN(sse2, 8, 8, 16)
HVBILIN(sse2, 8, 16, 16)
HVBILIN(ssse3, 8, 4, 8)
HVBILIN(ssse3, 8, 8, 16)
HVBILIN(ssse3, 8, 16, 16)
void ff_vp8_idct_dc_add_mmx(uint8_t *dst, int16_t block[16],
ptrdiff_t stride);
void ff_vp8_idct_dc_add_sse4(uint8_t *dst, int16_t block[16],
ptrdiff_t stride);
void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, int16_t block[4][16],
ptrdiff_t stride);
void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, int16_t block[4][16],
ptrdiff_t stride);
void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, int16_t block[2][16],
ptrdiff_t stride);
void ff_vp8_luma_dc_wht_mmx(int16_t block[4][4][16], int16_t dc[16]);
void ff_vp8_luma_dc_wht_sse(int16_t block[4][4][16], int16_t dc[16]);
void ff_vp8_idct_add_mmx(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
void ff_vp8_idct_add_sse(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
#define DECLARE_LOOP_FILTER(NAME) \
void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst, \
ptrdiff_t stride, \
int flim); \
void ff_vp8_h_loop_filter_simple_ ## NAME(uint8_t *dst, \
ptrdiff_t stride, \
int flim); \
void ff_vp8_v_loop_filter16y_inner_ ## NAME (uint8_t *dst, \
ptrdiff_t stride, \
int e, int i, int hvt); \
void ff_vp8_h_loop_filter16y_inner_ ## NAME (uint8_t *dst, \
ptrdiff_t stride, \
int e, int i, int hvt); \
void ff_vp8_v_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \
uint8_t *dstV, \
ptrdiff_t s, \
int e, int i, int hvt); \
void ff_vp8_h_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \
uint8_t *dstV, \
ptrdiff_t s, \
int e, int i, int hvt); \
void ff_vp8_v_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \
ptrdiff_t stride, \
int e, int i, int hvt); \
void ff_vp8_h_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \
ptrdiff_t stride, \
int e, int i, int hvt); \
void ff_vp8_v_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \
uint8_t *dstV, \
ptrdiff_t s, \
int e, int i, int hvt); \
void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \
uint8_t *dstV, \
ptrdiff_t s, \
int e, int i, int hvt);
DECLARE_LOOP_FILTER(mmx)
DECLARE_LOOP_FILTER(mmxext)
DECLARE_LOOP_FILTER(sse2)
DECLARE_LOOP_FILTER(ssse3)
DECLARE_LOOP_FILTER(sse4)
#endif /* HAVE_YASM */
#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \
c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT; \
c->put_vp8_epel_pixels_tab[IDX][2][2] = ff_put_vp8_epel ## SIZE ## _h6v6_ ## OPT
#define VP8_MC_FUNC(IDX, SIZE, OPT) \
c->put_vp8_epel_pixels_tab[IDX][0][1] = ff_put_vp8_epel ## SIZE ## _h4_ ## OPT; \
c->put_vp8_epel_pixels_tab[IDX][1][0] = ff_put_vp8_epel ## SIZE ## _v4_ ## OPT; \
c->put_vp8_epel_pixels_tab[IDX][1][1] = ff_put_vp8_epel ## SIZE ## _h4v4_ ## OPT; \
c->put_vp8_epel_pixels_tab[IDX][1][2] = ff_put_vp8_epel ## SIZE ## _h6v4_ ## OPT; \
c->put_vp8_epel_pixels_tab[IDX][2][1] = ff_put_vp8_epel ## SIZE ## _h4v6_ ## OPT; \
VP8_LUMA_MC_FUNC(IDX, SIZE, OPT)
#define VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT) \
c->put_vp8_bilinear_pixels_tab[IDX][0][1] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
c->put_vp8_bilinear_pixels_tab[IDX][0][2] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
c->put_vp8_bilinear_pixels_tab[IDX][1][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
c->put_vp8_bilinear_pixels_tab[IDX][1][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
c->put_vp8_bilinear_pixels_tab[IDX][1][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
c->put_vp8_bilinear_pixels_tab[IDX][2][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
c->put_vp8_bilinear_pixels_tab[IDX][2][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
c->put_vp8_bilinear_pixels_tab[IDX][2][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT
av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
{
#if HAVE_YASM
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx;
c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx;
#if ARCH_X86_32
c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_mmx;
c->vp8_idct_add = ff_vp8_idct_add_mmx;
c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmx;
c->put_vp8_epel_pixels_tab[0][0][0] =
c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx;
#endif
c->put_vp8_epel_pixels_tab[1][0][0] =
c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
#if ARCH_X86_32
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx;
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx;
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx;
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx;
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx;
c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmx;
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmx;
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmx;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmx;
#endif
}
/* note that 4-tap width=16 functions are missing because w=16
* is only used for luma, and luma is always a copy or sixtap. */
if (EXTERNAL_MMXEXT(cpu_flags)) {
VP8_MC_FUNC(2, 4, mmxext);
VP8_BILINEAR_MC_FUNC(2, 4, mmxext);
#if ARCH_X86_32
VP8_LUMA_MC_FUNC(0, 16, mmxext);
VP8_MC_FUNC(1, 8, mmxext);
VP8_BILINEAR_MC_FUNC(0, 16, mmxext);
VP8_BILINEAR_MC_FUNC(1, 8, mmxext);
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmxext;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmxext;
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmxext;
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmxext;
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmxext;
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmxext;
c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmxext;
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmxext;
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmxext;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmxext;
#endif
}
if (EXTERNAL_SSE(cpu_flags)) {
c->vp8_idct_add = ff_vp8_idct_add_sse;
c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_sse;
c->put_vp8_epel_pixels_tab[0][0][0] =
c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
}
if (HAVE_SSE2_EXTERNAL && cpu_flags & (AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_SSE2SLOW)) {
VP8_LUMA_MC_FUNC(0, 16, sse2);
VP8_MC_FUNC(1, 8, sse2);
VP8_BILINEAR_MC_FUNC(0, 16, sse2);
VP8_BILINEAR_MC_FUNC(1, 8, sse2);
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;
c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2;
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags)) {
VP8_LUMA_MC_FUNC(0, 16, ssse3);
VP8_MC_FUNC(1, 8, ssse3);
VP8_MC_FUNC(2, 4, ssse3);
VP8_BILINEAR_MC_FUNC(0, 16, ssse3);
VP8_BILINEAR_MC_FUNC(1, 8, ssse3);
VP8_BILINEAR_MC_FUNC(2, 4, ssse3);
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3;
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3;
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3;
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3;
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_ssse3;
c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_ssse3;
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_ssse3;
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_ssse3;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3;
}
if (EXTERNAL_SSE4(cpu_flags)) {
c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4;
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4;
}
#endif /* HAVE_YASM */
}

Some files were not shown because too many files have changed in this diff Show More