ffmpeg-2.1.1: move directory

git-svn-id: svn://kolibrios.org@6148 a494cfbc-eb01-0410-851d-a64ba20cac60
This commit is contained in:
Sergey Semyonov (Serge)
2016-02-05 22:14:10 +00:00
parent a4b787f4b8
commit ecf3e862ea
4011 changed files with 1868 additions and 4 deletions

View File

@@ -0,0 +1,95 @@
ARCH_HEADERS = mathops.h

OBJS                                   += arm/fmtconvert_init_arm.o

OBJS-$(CONFIG_AAC_DECODER)             += arm/aacpsdsp_init_arm.o       \
                                          arm/sbrdsp_init_arm.o
OBJS-$(CONFIG_AC3DSP)                  += arm/ac3dsp_init_arm.o         \
                                          arm/ac3dsp_arm.o
OBJS-$(CONFIG_DCA_DECODER)             += arm/dcadsp_init_arm.o
OBJS-$(CONFIG_DSPUTIL)                 += arm/dsputil_init_arm.o        \
                                          arm/dsputil_arm.o             \
                                          arm/jrevdct_arm.o             \
                                          arm/simple_idct_arm.o
OBJS-$(CONFIG_FFT)                     += arm/fft_init_arm.o            \
                                          arm/fft_fixed_init_arm.o
OBJS-$(CONFIG_FLAC_DECODER)            += arm/flacdsp_init_arm.o        \
                                          arm/flacdsp_arm.o
OBJS-$(CONFIG_H264CHROMA)              += arm/h264chroma_init_arm.o
OBJS-$(CONFIG_H264DSP)                 += arm/h264dsp_init_arm.o
OBJS-$(CONFIG_H264PRED)                += arm/h264pred_init_arm.o
OBJS-$(CONFIG_H264QPEL)                += arm/h264qpel_init_arm.o
OBJS-$(CONFIG_HPELDSP)                 += arm/hpeldsp_init_arm.o        \
                                          arm/hpeldsp_arm.o
OBJS-$(CONFIG_MPEGAUDIODSP)            += arm/mpegaudiodsp_init_arm.o
OBJS-$(CONFIG_MPEGVIDEO)               += arm/mpegvideo_arm.o
OBJS-$(CONFIG_VORBIS_DECODER)          += arm/vorbisdsp_init_arm.o
OBJS-$(CONFIG_VP3DSP)                  += arm/vp3dsp_init_arm.o
OBJS-$(CONFIG_VP6_DECODER)             += arm/vp6dsp_init_arm.o
OBJS-$(CONFIG_VP8_DECODER)             += arm/vp8dsp_init_arm.o
OBJS-$(CONFIG_RV30_DECODER)            += arm/rv34dsp_init_arm.o
OBJS-$(CONFIG_RV40_DECODER)            += arm/rv34dsp_init_arm.o        \
                                          arm/rv40dsp_init_arm.o
# NOTE(review): this entry previously ended with a stray "\" continuation,
# which silently spliced the next ARMV5TE assignment into this list;
# the continuation has been removed.
OBJS-$(CONFIG_VIDEODSP)                += arm/videodsp_init_arm.o

ARMV5TE-OBJS-$(CONFIG_DSPUTIL)         += arm/dsputil_init_armv5te.o    \
                                          arm/simple_idct_armv5te.o
ARMV5TE-OBJS-$(CONFIG_MPEGVIDEO)       += arm/mpegvideo_armv5te.o       \
                                          arm/mpegvideo_armv5te_s.o
ARMV5TE-OBJS-$(CONFIG_VIDEODSP)        += arm/videodsp_init_armv5te.o   \
                                          arm/videodsp_armv5te.o

# NOTE(review): the DSPUTIL list below also ended with a stray "\" that
# merged the ARMV6 AC3DSP assignment into it; fixed likewise.
ARMV6-OBJS-$(CONFIG_DSPUTIL)           += arm/dsputil_init_armv6.o      \
                                          arm/dsputil_armv6.o           \
                                          arm/simple_idct_armv6.o
ARMV6-OBJS-$(CONFIG_AC3DSP)            += arm/ac3dsp_armv6.o
ARMV6-OBJS-$(CONFIG_H264DSP)           += arm/h264dsp_armv6.o
ARMV6-OBJS-$(CONFIG_HPELDSP)           += arm/hpeldsp_init_armv6.o      \
                                          arm/hpeldsp_armv6.o
ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP)      += arm/mpegaudiodsp_fixed_armv6.o
ARMV6-OBJS-$(CONFIG_VP8_DECODER)       += arm/vp8_armv6.o               \
                                          arm/vp8dsp_init_armv6.o       \
                                          arm/vp8dsp_armv6.o

VFP-OBJS                               += arm/fmtconvert_vfp.o
VFP-OBJS-$(CONFIG_DCA_DECODER)         += arm/dcadsp_vfp.o              \
                                          arm/synth_filter_vfp.o
VFP-OBJS-$(CONFIG_FFT)                 += arm/fft_vfp.o
VFP-OBJS-$(CONFIG_MDCT)                += arm/mdct_vfp.o
VFP-OBJS-$(HAVE_ARMV6)                 += arm/fmtconvert_vfp_armv6.o

NEON-OBJS                              += arm/fmtconvert_neon.o
NEON-OBJS-$(CONFIG_AC3DSP)             += arm/ac3dsp_neon.o
NEON-OBJS-$(CONFIG_AAC_DECODER)        += arm/aacpsdsp_neon.o           \
                                          arm/sbrdsp_neon.o
NEON-OBJS-$(CONFIG_DCA_DECODER)        += arm/dcadsp_neon.o             \
                                          arm/synth_filter_neon.o
NEON-OBJS-$(CONFIG_DSPUTIL)            += arm/dsputil_init_neon.o       \
                                          arm/dsputil_neon.o            \
                                          arm/int_neon.o                \
                                          arm/simple_idct_neon.o
NEON-OBJS-$(CONFIG_FFT)                += arm/fft_neon.o                \
                                          arm/fft_fixed_neon.o
NEON-OBJS-$(CONFIG_H264CHROMA)         += arm/h264cmc_neon.o
NEON-OBJS-$(CONFIG_H264DSP)            += arm/h264dsp_neon.o            \
                                          arm/h264idct_neon.o
NEON-OBJS-$(CONFIG_H264PRED)           += arm/h264pred_neon.o
NEON-OBJS-$(CONFIG_H264QPEL)           += arm/h264qpel_neon.o           \
                                          arm/hpeldsp_neon.o
NEON-OBJS-$(CONFIG_HPELDSP)            += arm/hpeldsp_init_neon.o       \
                                          arm/hpeldsp_neon.o
NEON-OBJS-$(CONFIG_MDCT)               += arm/mdct_neon.o               \
                                          arm/mdct_fixed_neon.o
NEON-OBJS-$(CONFIG_MPEGVIDEO)          += arm/mpegvideo_neon.o
NEON-OBJS-$(CONFIG_RDFT)               += arm/rdft_neon.o
NEON-OBJS-$(CONFIG_RV30_DECODER)       += arm/rv34dsp_neon.o
NEON-OBJS-$(CONFIG_RV40_DECODER)       += arm/rv34dsp_neon.o            \
                                          arm/rv40dsp_neon.o
NEON-OBJS-$(CONFIG_VORBIS_DECODER)     += arm/vorbisdsp_neon.o
NEON-OBJS-$(CONFIG_VP3DSP)             += arm/vp3dsp_neon.o
NEON-OBJS-$(CONFIG_VP6_DECODER)        += arm/vp6dsp_neon.o
NEON-OBJS-$(CONFIG_VP8_DECODER)        += arm/vp8dsp_init_neon.o        \
                                          arm/vp8dsp_neon.o

View File

@@ -0,0 +1,143 @@
/*
* Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_ARM_AAC_H
#define AVCODEC_ARM_AAC_H
#include "config.h"
#if HAVE_NEON_INLINE
#define VMUL2 VMUL2
/*
 * Dequantize two codebook values: two 4-bit fields of idx select two
 * floats from table v, both are multiplied by *scale and stored at dst.
 * Returns dst advanced past the two floats written (the vst1 store
 * post-increments the pointer).
 * The "=m"(dst[0..1]) outputs inform the compiler which memory the NEON
 * store modifies.
 */
static inline float *VMUL2(float *dst, const float *v, unsigned idx,
const float *scale)
{
unsigned v0, v1;
__asm__ ("ubfx %0, %6, #0, #4 \n\t"
"ubfx %1, %6, #4, #4 \n\t"
"ldr %0, [%5, %0, lsl #2] \n\t"
"ldr %1, [%5, %1, lsl #2] \n\t"
"vld1.32 {d1[]}, [%7,:32] \n\t"
"vmov d0, %0, %1 \n\t"
"vmul.f32 d0, d0, d1 \n\t"
"vst1.32 {d0}, [%2,:64]! \n\t"
: "=&r"(v0), "=&r"(v1), "+r"(dst), "=m"(dst[0]), "=m"(dst[1])
: "r"(v), "r"(idx), "r"(scale)
: "d0", "d1");
return dst;
}
#define VMUL4 VMUL4
/*
 * Dequantize four codebook values: four 2-bit fields of idx select four
 * floats from table v, all are multiplied by *scale (broadcast into
 * d2/d3) and stored at dst.  Returns dst advanced past the four floats
 * written.  ARM loads (ldr) and NEON moves are interleaved by hand to
 * overlap the two pipelines.
 */
static inline float *VMUL4(float *dst, const float *v, unsigned idx,
const float *scale)
{
unsigned v0, v1, v2, v3;
__asm__ ("ubfx %0, %10, #0, #2 \n\t"
"ubfx %1, %10, #2, #2 \n\t"
"ldr %0, [%9, %0, lsl #2] \n\t"
"ubfx %2, %10, #4, #2 \n\t"
"ldr %1, [%9, %1, lsl #2] \n\t"
"ubfx %3, %10, #6, #2 \n\t"
"ldr %2, [%9, %2, lsl #2] \n\t"
"vmov d0, %0, %1 \n\t"
"ldr %3, [%9, %3, lsl #2] \n\t"
"vld1.32 {d2[],d3[]},[%11,:32] \n\t"
"vmov d1, %2, %3 \n\t"
"vmul.f32 q0, q0, q1 \n\t"
"vst1.32 {q0}, [%4,:128]! \n\t"
: "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "+r"(dst),
"=m"(dst[0]), "=m"(dst[1]), "=m"(dst[2]), "=m"(dst[3])
: "r"(v), "r"(idx), "r"(scale)
: "d0", "d1", "d2", "d3");
return dst;
}
#define VMUL2S VMUL2S
/*
 * Like VMUL2, but additionally applies sign bits: two bits of `sign'
 * are shifted up to bit 31 (lsl #30 / lsl #31, with bic clearing the
 * stray bit 30) and XORed onto the looked-up floats, flipping their
 * sign, before the multiply by *scale.  Returns dst advanced past the
 * two floats written.
 */
static inline float *VMUL2S(float *dst, const float *v, unsigned idx,
unsigned sign, const float *scale)
{
unsigned v0, v1, v2, v3;
__asm__ ("ubfx %0, %8, #0, #4 \n\t"
"ubfx %1, %8, #4, #4 \n\t"
"ldr %0, [%7, %0, lsl #2] \n\t"
"lsl %2, %10, #30 \n\t"
"ldr %1, [%7, %1, lsl #2] \n\t"
"lsl %3, %10, #31 \n\t"
"vmov d0, %0, %1 \n\t"
"bic %2, %2, #1<<30 \n\t"
"vld1.32 {d1[]}, [%9,:32] \n\t"
"vmov d2, %2, %3 \n\t"
"veor d0, d0, d2 \n\t"
"vmul.f32 d0, d0, d1 \n\t"
"vst1.32 {d0}, [%4,:64]! \n\t"
: "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "+r"(dst),
"=m"(dst[0]), "=m"(dst[1])
: "r"(v), "r"(idx), "r"(scale), "r"(sign)
: "d0", "d1", "d2");
return dst;
}
#define VMUL4S VMUL4S
/*
 * Like VMUL4, but applies sign bits only to the nonzero coefficients:
 * bits 12+ of idx (reversed with rbit so lsls can test them MSB-first)
 * mark which of the four values are nonzero; for each marked value the
 * next bit of `sign' is consumed (lslcs advances it) and its top bit
 * is XORed onto the float to negate it.  Returns dst advanced past the
 * four floats written.  The lsls/it/lslcs sequence relies on the carry
 * flag, hence the "cc" clobber.
 */
static inline float *VMUL4S(float *dst, const float *v, unsigned idx,
unsigned sign, const float *scale)
{
unsigned v0, v1, v2, v3, nz;
__asm__ ("vld1.32 {d2[],d3[]},[%13,:32] \n\t"
"ubfx %0, %12, #0, #2 \n\t"
"ubfx %1, %12, #2, #2 \n\t"
"ldr %0, [%11,%0, lsl #2] \n\t"
"ubfx %2, %12, #4, #2 \n\t"
"ldr %1, [%11,%1, lsl #2] \n\t"
"ubfx %3, %12, #6, #2 \n\t"
"ldr %2, [%11,%2, lsl #2] \n\t"
"vmov d0, %0, %1 \n\t"
"ldr %3, [%11,%3, lsl #2] \n\t"
"lsr %6, %12, #12 \n\t"
"rbit %6, %6 \n\t"
"vmov d1, %2, %3 \n\t"
"lsls %6, %6, #1 \n\t"
"and %0, %5, #1<<31 \n\t"
"it cs \n\t"
"lslcs %5, %5, #1 \n\t"
"lsls %6, %6, #1 \n\t"
"and %1, %5, #1<<31 \n\t"
"it cs \n\t"
"lslcs %5, %5, #1 \n\t"
"lsls %6, %6, #1 \n\t"
"and %2, %5, #1<<31 \n\t"
"it cs \n\t"
"lslcs %5, %5, #1 \n\t"
"vmov d4, %0, %1 \n\t"
"and %3, %5, #1<<31 \n\t"
"vmov d5, %2, %3 \n\t"
"veor q0, q0, q2 \n\t"
"vmul.f32 q0, q0, q1 \n\t"
"vst1.32 {q0}, [%4,:128]! \n\t"
: "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "+r"(dst),
"+r"(sign), "=r"(nz),
"=m"(dst[0]), "=m"(dst[1]), "=m"(dst[2]), "=m"(dst[3])
: "r"(v), "r"(idx), "r"(scale)
: "cc", "d0", "d1", "d2", "d3", "d4", "d5");
return dst;
}
#endif /* HAVE_NEON_INLINE */
#endif /* AVCODEC_ARM_AAC_H */

View File

@@ -0,0 +1,57 @@
/*
* Copyright (c) 2012 Mans Rullgard
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/arm/cpu.h"
#include "libavutil/attributes.h"
#include "libavcodec/aacpsdsp.h"
void ff_ps_add_squares_neon(float *dst, const float (*src)[2], int n);
void ff_ps_mul_pair_single_neon(float (*dst)[2], float (*src0)[2],
float *src1, int n);
void ff_ps_hybrid_analysis_neon(float (*out)[2], float (*in)[2],
const float (*filter)[8][2],
int stride, int n);
void ff_ps_hybrid_analysis_ileave_neon(float (*out)[32][2], float L[2][38][64],
int i, int len);
void ff_ps_hybrid_synthesis_deint_neon(float out[2][38][64], float (*in)[32][2],
int i, int len);
void ff_ps_decorrelate_neon(float (*out)[2], float (*delay)[2],
float (*ap_delay)[PS_QMF_TIME_SLOTS+PS_MAX_AP_DELAY][2],
const float phi_fract[2], float (*Q_fract)[2],
const float *transient_gain, float g_decay_slope,
int len);
void ff_ps_stereo_interpolate_neon(float (*l)[2], float (*r)[2],
float h[2][4], float h_step[2][4],
int len);
/**
 * Arm-specific initialisation of the parametric-stereo DSP context.
 *
 * Replaces the generic implementations with the NEON routines declared
 * above when the CPU reports NEON support; otherwise leaves the context
 * untouched.  (The ileave/decorrelate prototypes above are declared but
 * not installed here.)
 */
av_cold void ff_psdsp_init_arm(PSDSPContext *s)
{
    int flags = av_get_cpu_flags();

    if (!have_neon(flags))
        return;

    s->add_squares            = ff_ps_add_squares_neon;
    s->mul_pair_single        = ff_ps_mul_pair_single_neon;
    s->hybrid_analysis        = ff_ps_hybrid_analysis_neon;
    s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_neon;
    s->stereo_interpolate[0]  = ff_ps_stereo_interpolate_neon;
}

View File

@@ -0,0 +1,272 @@
/*
* Copyright (c) 2012 Mans Rullgard
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
@ void ff_ps_add_squares_neon(float *dst, const float (*src)[2], int n)
@   r0 = dst, r1 = src, r2 = n
@ For each complex pair src[i] = {re, im}, adds re*re + im*im to dst[i].
@ Four outputs (four complex inputs) per iteration; the loop is
@ software-pipelined: r2 is pre-decremented by 4 and the final group is
@ flushed after the loop.
function ff_ps_add_squares_neon, export=1
mov r3, r0 @ r3 = store pointer; r0 keeps reading dst
sub r2, r2, #4
vld1.32 {q0}, [r1,:128]!
vmul.f32 q0, q0, q0
vld1.32 {q2}, [r1,:128]!
vmul.f32 q2, q2, q2
vld1.32 {q1}, [r0,:128]!
1:
vpadd.f32 d6, d0, d1 @ pairwise add -> re^2 + im^2
vld1.32 {q0}, [r1,:128]!
vpadd.f32 d7, d4, d5
vmul.f32 q0, q0, q0
vld1.32 {q2}, [r1,:128]!
vadd.f32 q3, q1, q3 @ accumulate onto dst
vld1.32 {q1}, [r0,:128]!
vmul.f32 q2, q2, q2
vst1.32 {q3}, [r3,:128]!
subs r2, r2, #4
bgt 1b
@ epilogue: flush the last pipelined group of 4
vpadd.f32 d6, d0, d1
vpadd.f32 d7, d4, d5
vadd.f32 q1, q1, q3
vst1.32 {q1}, [r3,:128]!
bx lr
endfunc
@ void ff_ps_mul_pair_single_neon(float (*dst)[2], float (*src0)[2],
@                                 float *src1, int n)
@   r0 = dst, r1 = src0, r2 = src1, r3 = n
@ dst[i] = src0[i] * src1[i]: each complex pair is scaled by a real
@ factor, four pairs per iteration.  "tst r1, #8" selects the second
@ code path (label 2:) when src0 is only 8-byte aligned, which uses a
@ d-register preload and a vmov rotation to stay on 16-byte loads.
@ Both paths pre-decrement r3 by 4 and peel the final iteration.
function ff_ps_mul_pair_single_neon, export=1
sub r3, r3, #4
tst r1, #8
bne 2f
@ --- src0 16-byte aligned ---
vld1.32 {q0}, [r1,:128]!
1:
vld1.32 {q3}, [r2,:128]!
vmul.f32 d4, d0, d6[0]
vmul.f32 d5, d1, d6[1]
vld1.32 {q1}, [r1,:128]!
vmul.f32 d6, d2, d7[0]
vmul.f32 d7, d3, d7[1]
vld1.32 {q0}, [r1,:128]!
vst1.32 {q2,q3}, [r0,:128]!
subs r3, r3, #4
bgt 1b
@ epilogue for the aligned path
vld1.32 {q3}, [r2,:128]!
vmul.f32 d4, d0, d6[0]
vmul.f32 d5, d1, d6[1]
vld1.32 {q1}, [r1,:128]!
vmul.f32 d6, d2, d7[0]
vmul.f32 d7, d3, d7[1]
vst1.32 {q2,q3}, [r0,:128]!
bx lr
@ --- src0 only 8-byte aligned ---
2:
vld1.32 {d0}, [r1,:64]!
vld1.32 {d1,d2}, [r1,:128]!
1:
vld1.32 {q3}, [r2,:128]!
vmul.f32 d4, d0, d6[0]
vmul.f32 d5, d1, d6[1]
vld1.32 {d0,d1}, [r1,:128]!
vmul.f32 d6, d2, d7[0]
vmul.f32 d7, d0, d7[1]
vmov d0, d1 @ rotate the extra half-load into place
vld1.32 {d1,d2}, [r1,:128]!
vst1.32 {q2,q3}, [r0,:128]!
subs r3, r3, #4
bgt 1b
@ epilogue for the unaligned path
vld1.32 {q3}, [r2,:128]!
vmul.f32 d4, d0, d6[0]
vmul.f32 d5, d1, d6[1]
vld1.32 {d0}, [r1,:64]!
vmul.f32 d6, d2, d7[0]
vmul.f32 d7, d0, d7[1]
vst1.32 {q2,q3}, [r0,:128]!
bx lr
endfunc
@ void ff_ps_hybrid_synthesis_deint_neon(float out[2][38][64],
@                                        float (*in)[32][2],
@                                        int i, int len)
@   r0 = out, r1 = in, r2 = i, r3 = len
@ De-interleaves complex samples from `in' into the separate real and
@ imaginary planes of `out'.  The planes are 38*64 floats apart (the
@ #38*64*4 offsets below) and r5 = 64*4 is the row stride in bytes.
@ Column i is done first with scalar-lane stores, then the remaining
@ 64 - i - 1 columns are processed 4 at a time with vst4 (label 1:),
@ with a 2-column variant (label 6:) used once to reach a multiple of 4.
function ff_ps_hybrid_synthesis_deint_neon, export=1
push {r4-r8,lr}
add r0, r0, r2, lsl #2 @ out += i (column offset)
add r1, r1, r2, lsl #5+1+2 @ in += i * 32 * 2 floats
rsb r2, r2, #64 @ r2 = remaining columns
mov r5, #64*4 @ row stride in bytes
mov lr, r0 @ lr = real plane, r4 = imag plane
add r4, r0, #38*64*4
mov r12, r3
@ single leading column, two samples per iteration
2:
vld1.32 {d0,d1}, [r1,:128]!
vst1.32 {d0[0]}, [lr,:32], r5
vst1.32 {d0[1]}, [r4,:32], r5
vst1.32 {d1[0]}, [lr,:32], r5
vst1.32 {d1[1]}, [r4,:32], r5
subs r12, r12, #2
bgt 2b
add r0, r0, #4
sub r2, r2, #1
tst r2, #2
bne 6f
@ main loop: 4 columns at a time via interleaved vst4 stores
1:
mov lr, r0
add r4, r0, #38*64*4
add r6, r1, # 32*2*4
add r7, r1, #2*32*2*4
add r8, r1, #3*32*2*4
mov r12, r3
2:
vld1.32 {d0,d1}, [r1,:128]!
vld1.32 {d2,d3}, [r6,:128]!
vld1.32 {d4,d5}, [r7,:128]!
vld1.32 {d6,d7}, [r8,:128]!
vst4.32 {d0[0],d2[0],d4[0],d6[0]}, [lr,:128], r5
vst4.32 {d0[1],d2[1],d4[1],d6[1]}, [r4,:128], r5
vst4.32 {d1[0],d3[0],d5[0],d7[0]}, [lr,:128], r5
vst4.32 {d1[1],d3[1],d5[1],d7[1]}, [r4,:128], r5
subs r12, r12, #2
bgt 2b
add r0, r0, #16
add r1, r1, #3*32*2*4 @ skip the 3 columns consumed via r6-r8
subs r2, r2, #4
bgt 1b
pop {r4-r8,pc}
@ one-off 2-column pass (when remaining count is 2 mod 4)
6:
mov lr, r0
add r4, r0, #38*64*4
add r6, r1, #32*2*4
mov r12, r3
2:
vld1.32 {d0,d1}, [r1,:128]!
vld1.32 {d2,d3}, [r6,:128]!
vst2.32 {d0[0],d2[0]}, [lr,:64], r5
vst2.32 {d0[1],d2[1]}, [r4,:64], r5
vst2.32 {d1[0],d3[0]}, [lr,:64], r5
vst2.32 {d1[1],d3[1]}, [r4,:64], r5
subs r12, r12, #2
bgt 2b
add r0, r0, #8
add r1, r1, #32*2*4
sub r2, r2, #2
b 1b
endfunc
@ void ff_ps_hybrid_analysis_neon(float (*out)[2], float (*in)[2],
@                                 const float (*filter)[8][2],
@                                 int stride, int n)
@   r0 = out, r1 = in, r2 = filter, r3 = stride (scaled to bytes below),
@   [sp] = n
@ Loads the 13 complex input samples once (d19-d31 via vldm) and runs n
@ filters over them, producing one complex output per filter, written
@ with stride r3.  The add/sub + vrev64/veor shuffle below appears to
@ fold the inputs using the symmetry of the 13-tap filter, so each
@ output needs only the folded MACs plus the centre tap
@ ("vmla.f32 d6, d25, d4[0]") -- NOTE(review): confirm the exact
@ folding against the C reference implementation.
@ The loop is software-pipelined: the accumulators for filter k+1 are
@ started before the result for filter k is stored.
function ff_ps_hybrid_analysis_neon, export=1
vldm r1, {d19-d31} @ 13 complex input samples
ldr r12, [sp] @ r12 = n
lsl r3, r3, #3 @ stride in bytes (complex floats)
@ fold inputs into sums and differences of mirrored taps
vadd.f32 d16, d19, d31
vadd.f32 d17, d20, d30
vsub.f32 d18, d19, d31
vsub.f32 d19, d20, d30
vsub.f32 d0, d21, d29
vsub.f32 d1, d22, d28
vadd.f32 d2, d21, d29
vadd.f32 d3, d22, d28
vadd.f32 d20, d23, d27
vadd.f32 d21, d24, d26
vsub.f32 d22, d23, d27
vsub.f32 d23, d24, d26
vmov.i32 d6, #1<<31 @ sign mask for conjugate-style negation
vmov.i32 d7, #0
vmov.f32 q14, #0.0 @ clear accumulators
vmov.f32 q15, #0.0
vtrn.32 d6, d7
vrev64.32 q9, q9
vrev64.32 q0, q0
vrev64.32 q11, q11
veor q9, q9, q3
veor q0, q0, q3
veor q11, q11, q3
vld1.32 {q13}, [r2,:128]! @ first filter coefficients
vtrn.32 q8, q9
vtrn.32 q1, q0
vtrn.32 q10, q11
sub r12, r12, #1
vmla.f32 q14, q8, q13
vld1.32 {q2}, [r2,:128]!
vmla.f32 q15, q9, q13
1:
vmla.f32 q14, q1, q2
vld1.32 {q13}, [r2,:128]!
vmla.f32 q15, q0, q2
vmla.f32 q14, q10, q13
vld1.32 {q2}, [r2,:128]!
vmla.f32 q15, q11, q13
vld1.32 {q13}, [r2,:128]!
vadd.f32 d6, d28, d29 @ reduce accumulators for this filter
vadd.f32 d7, d30, d31
vmov.f32 q14, #0.0 @ restart accumulators for the next one
vmov.f32 q15, #0.0
vmla.f32 q14, q8, q13
vpadd.f32 d6, d6, d7
vmla.f32 q15, q9, q13
vmla.f32 d6, d25, d4[0] @ centre tap
vld1.32 {q2}, [r2,:128]!
vst1.32 {d6}, [r0,:64], r3
subs r12, r12, #1
bgt 1b
@ epilogue: finish the last filter started inside the loop
vmla.f32 q14, q1, q2
vld1.32 {q13}, [r2,:128]!
vmla.f32 q15, q0, q2
vmla.f32 q14, q10, q13
vld1.32 {q2}, [r2,:128]!
vmla.f32 q15, q11, q13
vadd.f32 d6, d28, d29
vadd.f32 d7, d30, d31
vpadd.f32 d6, d6, d7
vmla.f32 d6, d25, d4[0]
vst1.32 {d6}, [r0,:64], r3
bx lr
endfunc
@ void ff_ps_stereo_interpolate_neon(float (*l)[2], float (*r)[2],
@                                    float h[2][4], float h_step[2][4],
@                                    int len)
@   r0 = l, r1 = r, r2 = h, r3 = h_step, [sp] = len
@ In-place mixing of the l/r complex pairs with four coefficients that
@ advance by h_step each sample.  Two samples are processed per loop
@ iteration (q15 = 2*step; q1/q0 hold the coefficients for the odd/even
@ sample), with a single-sample tail at label 2: for odd len.
function ff_ps_stereo_interpolate_neon, export=1
vld1.32 {q0}, [r2] @ current coefficients h
vld1.32 {q14}, [r3] @ per-sample step
vadd.f32 q15, q14, q14 @ step for 2 samples
mov r2, r0 @ r2/r3 = write pointers (in-place)
mov r3, r1
ldr r12, [sp] @ r12 = len
vadd.f32 q1, q0, q14 @ coeffs for sample 0
vadd.f32 q0, q0, q15 @ coeffs for sample 1
vld1.32 {q2}, [r0,:64]! @ q2 = l pair(s), q3 = r pair(s)
vld1.32 {q3}, [r1,:64]!
subs r12, r12, #1
beq 2f
1:
vmul.f32 d16, d4, d2[0]
vmul.f32 d17, d5, d0[0]
vmul.f32 d18, d4, d2[1]
vmul.f32 d19, d5, d0[1]
vmla.f32 d16, d6, d3[0]
vmla.f32 d17, d7, d1[0]
vmla.f32 d18, d6, d3[1]
vmla.f32 d19, d7, d1[1]
vadd.f32 q1, q1, q15 @ advance coefficients by 2 steps
vadd.f32 q0, q0, q15
vld1.32 {q2}, [r0,:64]!
vld1.32 {q3}, [r1,:64]!
vst1.32 {q8}, [r2,:64]!
vst1.32 {q9}, [r3,:64]!
subs r12, r12, #2
bgt 1b
it lt
bxlt lr @ even len: everything already written
@ tail: one remaining sample
2:
vmul.f32 d16, d4, d2[0]
vmul.f32 d18, d4, d2[1]
vmla.f32 d16, d6, d3[0]
vmla.f32 d18, d6, d3[1]
vst1.32 {d16}, [r2,:64]!
vst1.32 {d18}, [r3,:64]!
bx lr
endfunc

View File

@@ -0,0 +1,36 @@
/*
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
@ void ff_ac3_update_bap_counts_arm(uint16_t mant_cnt[16], uint8_t *bap,
@                                   int len)
@   r0 = mant_cnt, r1 = bap, r2 = len
@ Increments mant_cnt[bap[i]] for each of the len bap values.  The next
@ bap byte is loaded speculatively (ldrbgt) while the current counter is
@ incremented, overlapping the loads with the read-modify-write.
function ff_ac3_update_bap_counts_arm, export=1
push {lr}
ldrb lr, [r1], #1
1:
lsl r3, lr, #1 @ halfword offset into mant_cnt
ldrh r12, [r0, r3]
subs r2, r2, #1
it gt
ldrbgt lr, [r1], #1 @ prefetch next bap value
add r12, r12, #1
strh r12, [r0, r3]
bgt 1b
pop {pc}
endfunc

View File

@@ -0,0 +1,84 @@
/*
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
@ void ff_ac3_bit_alloc_calc_bap_armv6(int16_t *mask, int16_t *psd,
@                                      int start, int end, int snr_offset,
@                                      int floor, const uint8_t *bap_tab,
@                                      uint8_t *bap)
@   r0 = mask, r1 = psd, r2 = start, r3 = end,
@   stack: snr_offset, floor, bap_tab, bap
@ For each band, computes m = ((mask[band] - snr_offset - floor)
@ clamped to >= 0 and masked to 0x1fe0) + floor, then for every psd bin
@ in the band looks up bap_tab[usat6((psd - m) >> 5)].  Bins are
@ processed two at a time (label 2:), with a single-bin fixup (label 3:)
@ when the band length is odd.
@ snr_offset == -960 is a special "zero allocation" case: the 256-byte
@ bap array is cleared via a tail call to memset (label 4:).
function ff_ac3_bit_alloc_calc_bap_armv6, export=1
ldr r12, [sp] @ r12 = snr_offset
cmp r12, #-960
beq 4f
push {r4-r11,lr}
add r5, sp, #40 @ remaining stack args after push
movrelx r4, X(ff_ac3_bin_to_band_tab), r11
movrelx lr, X(ff_ac3_band_start_tab)
ldm r5, {r5-r7} @ r5 = floor, r6 = bap_tab, r7 = bap
ldrb r4, [r4, r2] @ band = bin_to_band[start]
add r1, r1, r2, lsl #1 @ psd + start
add r0, r0, r4, lsl #1 @ mask + band
add r4, r4, lr
add r7, r7, r2 @ bap + start
1:
ldrsh r9, [r0], #2 @ mask[band]
mov r8, #0xff0
sub r9, r9, r12 @ - snr_offset
ldrb r10, [r4, #1]! @ band_start_tab[++band]
subs r9, r9, r5 @ - floor
it lt
movlt r9, #0 @ clamp at zero
cmp r10, r3 @ - end
and r9, r9, r8, lsl #1 @ & 0x1fe0
ite gt
subgt r8, r3, r2 @ r8 = bins in this (possibly clipped) band
suble r8, r10, r2
mov r2, r10
add r9, r9, r5 @ + floor => m
tst r8, #1 @ odd band length?
add r11, r7, r8 @ r11 = end of band in bap
bne 3f
b 5f
@ two bins per iteration
2:
ldrsh r8, [r1], #2
ldrsh lr, [r1], #2
sub r8, r8, r9
sub lr, lr, r9
usat r8, #6, r8, asr #5 @ address
usat lr, #6, lr, asr #5
ldrb r8, [r6, r8] @ bap_tab[address]
ldrb lr, [r6, lr]
strb r8, [r7], #1 @ bap[bin]
strb lr, [r7], #1
5: cmp r7, r11
blo 2b
cmp r3, r10 @ more bands before end?
bgt 1b
pop {r4-r11,pc}
@ single leading bin for odd-length bands
3:
ldrsh r8, [r1], #2 @ psd[bin]
sub r8, r8, r9 @ - m
usat r8, #6, r8, asr #5 @ address
ldrb r8, [r6, r8] @ bap_tab[address]
strb r8, [r7], #1 @ bap[bin]
b 5b
@ snr_offset == -960: clear bap and return via memset
4:
ldr r0, [sp, #12] @ r0 = bap
mov r1, #0
mov r2, #256
b X(memset)
endfunc

View File

@@ -0,0 +1,70 @@
/*
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/arm/cpu.h"
#include "libavutil/attributes.h"
#include "libavcodec/ac3dsp.h"
#include "config.h"
void ff_ac3_exponent_min_neon(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
int ff_ac3_max_msb_abs_int16_neon(const int16_t *src, int len);
void ff_ac3_lshift_int16_neon(int16_t *src, unsigned len, unsigned shift);
void ff_ac3_rshift_int32_neon(int32_t *src, unsigned len, unsigned shift);
void ff_float_to_fixed24_neon(int32_t *dst, const float *src, unsigned int len);
void ff_ac3_extract_exponents_neon(uint8_t *exp, int32_t *coef, int nb_coefs);
void ff_ac3_sum_square_butterfly_int32_neon(int64_t sum[4],
const int32_t *coef0,
const int32_t *coef1,
int len);
void ff_ac3_sum_square_butterfly_float_neon(float sum[4],
const float *coef0,
const float *coef1,
int len);
void ff_ac3_bit_alloc_calc_bap_armv6(int16_t *mask, int16_t *psd,
int start, int end,
int snr_offset, int floor,
const uint8_t *bap_tab, uint8_t *bap);
void ff_ac3_update_bap_counts_arm(uint16_t mant_cnt[16], uint8_t *bap, int len);
/**
 * Arm-specific initialisation of the AC-3 DSP context.
 *
 * The plain-Arm bap-count update is installed unconditionally; the
 * bit-allocation routine requires ARMv6 and the remaining kernels
 * require NEON.  bit_exact is accepted for interface compatibility but
 * not consulted here.
 */
av_cold void ff_ac3dsp_init_arm(AC3DSPContext *c, int bit_exact)
{
    int flags = av_get_cpu_flags();

    c->update_bap_counts = ff_ac3_update_bap_counts_arm;

    if (have_armv6(flags))
        c->bit_alloc_calc_bap = ff_ac3_bit_alloc_calc_bap_armv6;

    if (have_neon(flags)) {
        c->ac3_exponent_min           = ff_ac3_exponent_min_neon;
        c->ac3_max_msb_abs_int16      = ff_ac3_max_msb_abs_int16_neon;
        c->ac3_lshift_int16           = ff_ac3_lshift_int16_neon;
        c->ac3_rshift_int32           = ff_ac3_rshift_int32_neon;
        c->float_to_fixed24           = ff_float_to_fixed24_neon;
        c->extract_exponents          = ff_ac3_extract_exponents_neon;
        c->sum_square_butterfly_int32 = ff_ac3_sum_square_butterfly_int32_neon;
        c->sum_square_butterfly_float = ff_ac3_sum_square_butterfly_float_neon;
    }
}

View File

@@ -0,0 +1,154 @@
/*
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
@ int ff_ac3_max_msb_abs_int16_neon(const int16_t *src, int len)
@   r0 = src, r1 = len (multiple of 16)
@ ORs together the absolute values of all samples across two
@ accumulators, then reduces the lanes with pairwise max.  Per the
@ function name only the position of the most significant set bit is
@ meaningful to the caller, and the OR accumulation preserves exactly
@ that bit.
function ff_ac3_max_msb_abs_int16_neon, export=1
vmov.i16 q0, #0
vmov.i16 q2, #0
1: vld1.16 {q1}, [r0,:128]!
vabs.s16 q1, q1
vld1.16 {q3}, [r0,:128]!
vabs.s16 q3, q3
vorr q0, q0, q1
vorr q2, q2, q3
subs r1, r1, #16
bgt 1b
@ reduce 16 lanes down to one scalar
vorr q0, q0, q2
vorr d0, d0, d1
vpmax.u16 d0, d0, d0
vpmax.u16 d0, d0, d0
vmov.u16 r0, d0[0]
bx lr
endfunc
@ void ff_ac3_exponent_min_neon(uint8_t *exp, int num_reuse_blocks,
@                               int nb_coefs)
@   r0 = exp, r1 = num_reuse_blocks, r2 = nb_coefs
@ For each group of 16 exponents in block 0, takes the element-wise
@ minimum over the following num_reuse_blocks blocks (blocks are 256
@ bytes apart, r12) and writes the result back in place.  Returns
@ immediately when there are no reuse blocks.
function ff_ac3_exponent_min_neon, export=1
cmp r1, #0
it eq
bxeq lr
push {lr}
mov r12, #256 @ inter-block stride in bytes
1:
vld1.8 {q0}, [r0,:128] @ current minima from block 0
mov lr, r1
add r3, r0, #256 @ first reuse block
2: vld1.8 {q1}, [r3,:128], r12
subs lr, lr, #1
vmin.u8 q0, q0, q1
bgt 2b
subs r2, r2, #16
vst1.8 {q0}, [r0,:128]!
bgt 1b
pop {pc}
endfunc
@ void ff_ac3_lshift_int16_neon(int16_t *src, unsigned len, unsigned shift)
@   r0 = src, r1 = len (multiple of 8), r2 = shift
@ In-place left shift of len 16-bit values by a uniform amount.
function ff_ac3_lshift_int16_neon, export=1
vdup.16 q0, r2
1: vld1.16 {q1}, [r0,:128]
vshl.s16 q1, q1, q0
vst1.16 {q1}, [r0,:128]!
subs r1, r1, #8
bgt 1b
bx lr
endfunc
@ void ff_ac3_rshift_int32_neon(int32_t *src, unsigned len, unsigned shift)
@   r0 = src, r1 = len (multiple of 4), r2 = shift
@ In-place arithmetic right shift of len 32-bit values; NEON has no
@ variable right shift, so vshl is used with the negated amount.
function ff_ac3_rshift_int32_neon, export=1
rsb r2, r2, #0 @ negate: vshl by -shift == shift right
vdup.32 q0, r2
1: vld1.32 {q1}, [r0,:128]
vshl.s32 q1, q1, q0
vst1.32 {q1}, [r0,:128]!
subs r1, r1, #4
bgt 1b
bx lr
endfunc
@ void ff_float_to_fixed24_neon(int32_t *dst, const float *src,
@                               unsigned int len)
@   r0 = dst, r1 = src, r2 = len (multiple of 16)
@ Converts floats to fixed point with 24 fractional bits (the #24 on
@ vcvt scales by 2^24), 16 values per iteration.
function ff_float_to_fixed24_neon, export=1
1: vld1.32 {q0-q1}, [r1,:128]!
vcvt.s32.f32 q0, q0, #24
vld1.32 {q2-q3}, [r1,:128]!
vcvt.s32.f32 q1, q1, #24
vcvt.s32.f32 q2, q2, #24
vst1.32 {q0-q1}, [r0,:128]!
vcvt.s32.f32 q3, q3, #24
vst1.32 {q2-q3}, [r0,:128]!
subs r2, r2, #16
bgt 1b
bx lr
endfunc
@ void ff_ac3_extract_exponents_neon(uint8_t *exp, int32_t *coef,
@                                    int nb_coefs)
@   r0 = exp, r1 = coef, r2 = nb_coefs (multiple of 4)
@ exp[i] = clz(|coef[i]|) - 8, i.e. the number of leading zero bits of
@ the 24-bit magnitude.  The 32-bit results are narrowed twice (32->16,
@ then 16->8, reusing d6 within q3) and 4 bytes are stored per loop.
function ff_ac3_extract_exponents_neon, export=1
vmov.i32 q15, #8
1:
vld1.32 {q0}, [r1,:128]!
vabs.s32 q1, q0
vclz.i32 q3, q1
vsub.i32 q3, q3, q15
vmovn.i32 d6, q3
vmovn.i16 d6, q3
vst1.32 {d6[0]}, [r0,:32]!
subs r2, r2, #4
bgt 1b
bx lr
endfunc
@ void ff_ac3_sum_square_butterfly_int32_neon(int64_t sum[4],
@                                             const int32_t *coef0,
@                                             const int32_t *coef1,
@                                             int len)
@   r0 = sum, r1 = coef0, r2 = coef1, r3 = len (multiple of 2)
@ Accumulates, with 64-bit accumulators (vmlal), the sums of squares of
@ coef0, coef1, their sum and their difference.  Two samples per
@ iteration; each q register holds two 64-bit lanes which are folded
@ together at the end before the four results are stored.
function ff_ac3_sum_square_butterfly_int32_neon, export=1
vmov.i64 q0, #0
vmov.i64 q1, #0
vmov.i64 q2, #0
vmov.i64 q3, #0
1:
vld1.32 {d16}, [r1]!
vld1.32 {d17}, [r2]!
vadd.s32 d18, d16, d17 @ butterfly: sum and difference
vsub.s32 d19, d16, d17
vmlal.s32 q0, d16, d16
vmlal.s32 q1, d17, d17
vmlal.s32 q2, d18, d18
vmlal.s32 q3, d19, d19
subs r3, r3, #2
bgt 1b
@ fold the two 64-bit lanes of each accumulator
vadd.s64 d0, d0, d1
vadd.s64 d1, d2, d3
vadd.s64 d2, d4, d5
vadd.s64 d3, d6, d7
vst1.64 {q0-q1}, [r0]
bx lr
endfunc
@ void ff_ac3_sum_square_butterfly_float_neon(float sum[4],
@                                             const float *coef0,
@                                             const float *coef1,
@                                             int len)
@   r0 = sum, r1 = coef0, r2 = coef1, r3 = len (multiple of 2)
@ Float counterpart of the int32 version above: accumulates sums of
@ squares of coef0, coef1, their sum and their difference into the four
@ d-register lanes of q0/q1, two samples per iteration, then reduces
@ each pair of lanes with vpadd before storing the four results.
function ff_ac3_sum_square_butterfly_float_neon, export=1
vmov.f32 q0, #0.0
vmov.f32 q1, #0.0
1:
vld1.32 {d16}, [r1]!
vld1.32 {d17}, [r2]!
vadd.f32 d18, d16, d17 @ butterfly: sum and difference
vsub.f32 d19, d16, d17
vmla.f32 d0, d16, d16
vmla.f32 d1, d17, d17
vmla.f32 d2, d18, d18
vmla.f32 d3, d19, d19
subs r3, r3, #2
bgt 1b
vpadd.f32 d0, d0, d1
vpadd.f32 d1, d2, d3
vst1.32 {q0}, [r0]
bx lr
endfunc

View File

@@ -0,0 +1,39 @@
/*
* Copyright (c) 2010 Mans Rullgard
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_ARM_ASM_OFFSETS_H
#define AVCODEC_ARM_ASM_OFFSETS_H
#ifndef __ASSEMBLER__
#include <stddef.h>
/* Build-time offset check: when offsetof(s, m) differs from the
 * expected value o, the array size becomes -1 and compilation fails. */
#define CHK_OFFS(s, m, o) struct check_##o { \
int x_##o[offsetof(s, m) == o? 1: -1]; \
}
#endif
/* MpegEncContext */
/* Byte offsets of MpegEncContext fields referenced from assembly.
 * NOTE(review): hard-coded for a specific struct layout; verify with
 * CHK_OFFS whenever the struct definition changes. */
#define Y_DC_SCALE 0xa8
#define C_DC_SCALE 0xac
#define AC_PRED 0xb0
#define BLOCK_LAST_INDEX 0xb4
#define H263_AIC 0xe4
#define INTER_SCANTAB_RASTER_END 0x12c
#endif /* AVCODEC_ARM_ASM_OFFSETS_H */

View File

@@ -0,0 +1,103 @@
/*
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_ARM_DCA_H
#define AVCODEC_ARM_DCA_H
#include <stdint.h>
#include "config.h"
#include "libavcodec/mathops.h"
#if HAVE_ARMV6_INLINE && AV_GCC_VERSION_AT_LEAST(4,4) && !CONFIG_THUMB
#define decode_blockcodes decode_blockcodes
/*
 * Decode two DCA block codes into 2 x 4 signed values.
 *
 * Each step splits off one base-`levels' digit: the quotient comes from
 * a multiply-high with ff_inverse[levels] (smmul) plus a correction
 * term, smlabb with -levels turns the previous value into the digit
 * (remainder), and (levels - 1) >> 1 is subtracted to centre each digit
 * around zero.  The two codes are decoded in lock-step, interleaving
 * their instruction streams; results go to values[0..3] (byte offsets
 * 0..12) and values[4..7] (offsets 16..28).
 * Returns the leftover quotients of both codes OR'ed together --
 * presumably nonzero indicates an invalid/overlong code; confirm
 * against the caller.
 */
static inline int decode_blockcodes(int code1, int code2, int levels,
int32_t *values)
{
int32_t v0, v1, v2, v3, v4, v5;
__asm__ ("smmul %0, %6, %10 \n"
"smmul %3, %7, %10 \n"
"smlabb %6, %0, %9, %6 \n"
"smlabb %7, %3, %9, %7 \n"
"smmul %1, %0, %10 \n"
"smmul %4, %3, %10 \n"
"sub %6, %6, %8, lsr #1 \n"
"sub %7, %7, %8, lsr #1 \n"
"smlabb %0, %1, %9, %0 \n"
"smlabb %3, %4, %9, %3 \n"
"smmul %2, %1, %10 \n"
"smmul %5, %4, %10 \n"
"str %6, [%11, #0] \n"
"str %7, [%11, #16] \n"
"sub %0, %0, %8, lsr #1 \n"
"sub %3, %3, %8, lsr #1 \n"
"smlabb %1, %2, %9, %1 \n"
"smlabb %4, %5, %9, %4 \n"
"smmul %6, %2, %10 \n"
"smmul %7, %5, %10 \n"
"str %0, [%11, #4] \n"
"str %3, [%11, #20] \n"
"sub %1, %1, %8, lsr #1 \n"
"sub %4, %4, %8, lsr #1 \n"
"smlabb %2, %6, %9, %2 \n"
"smlabb %5, %7, %9, %5 \n"
"str %1, [%11, #8] \n"
"str %4, [%11, #24] \n"
"sub %2, %2, %8, lsr #1 \n"
"sub %5, %5, %8, lsr #1 \n"
"str %2, [%11, #12] \n"
"str %5, [%11, #28] \n"
: "=&r"(v0), "=&r"(v1), "=&r"(v2),
"=&r"(v3), "=&r"(v4), "=&r"(v5),
"+&r"(code1), "+&r"(code2)
: "r"(levels - 1), "r"(-levels),
"r"(ff_inverse[levels]), "r"(values)
: "memory");
return code1 | code2;
}
#endif
#if HAVE_NEON_INLINE && HAVE_ASM_MOD_Y
#define int8x8_fmul_int32 int8x8_fmul_int32
/*
 * dst[i] = src[i] * (scale / 16) for eight int8 inputs: the integer
 * scale is converted to float with 4 fractional bits (vcvt ... #4),
 * the bytes are sign-extended to 32 bits, converted to float and
 * multiplied.  dst must hold 8 floats, 128-bit aligned; src must be
 * 64-bit aligned (the :64/:128 load/store qualifiers enforce this).
 * The %y / %m operand modifiers need HAVE_ASM_MOD_Y, checked above.
 */
static inline void int8x8_fmul_int32(float *dst, const int8_t *src, int scale)
{
__asm__ ("vcvt.f32.s32 %2, %2, #4 \n"
"vld1.8 {d0}, [%1,:64] \n"
"vmovl.s8 q0, d0 \n"
"vmovl.s16 q1, d1 \n"
"vmovl.s16 q0, d0 \n"
"vcvt.f32.s32 q0, q0 \n"
"vcvt.f32.s32 q1, q1 \n"
"vmul.f32 q0, q0, %y2 \n"
"vmul.f32 q1, q1, %y2 \n"
"vst1.32 {q0-q1}, [%m0,:128] \n"
: "=Um"(*(float (*)[8])dst)
: "r"(src), "x"(scale)
: "d0", "d1", "d2", "d3");
}
#endif
#endif /* AVCODEC_ARM_DCA_H */

View File

@@ -0,0 +1,70 @@
/*
* Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/arm/cpu.h"
#include "libavutil/attributes.h"
#include "libavcodec/dcadsp.h"
void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
int decifactor, float scale);
void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
SynthFilterContext *synth, FFTContext *imdct,
float synth_buf_ptr[512],
int *synth_buf_offset, float synth_buf2[32],
const float window[512], float *samples_out,
float raXin[32], float scale);
void ff_dca_lfe_fir_neon(float *out, const float *in, const float *coefs,
int decifactor, float scale);
void ff_synth_filter_float_vfp(FFTContext *imdct,
float *synth_buf_ptr, int *synth_buf_offset,
float synth_buf2[32], const float window[512],
float out[32], const float in[32],
float scale);
void ff_synth_filter_float_neon(FFTContext *imdct,
float *synth_buf_ptr, int *synth_buf_offset,
float synth_buf2[32], const float window[512],
float out[32], const float in[32],
float scale);
/**
 * Arm-specific initialisation of the DCA DSP context.
 *
 * The VFP routines are installed only on cores with VFP but without
 * VFPv3; the NEON LFE FIR is assigned afterwards so it takes precedence
 * whenever NEON is available.
 */
av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
{
    int flags = av_get_cpu_flags();

    if (have_vfp(flags) && !have_vfpv3(flags)) {
        s->lfe_fir         = ff_dca_lfe_fir_vfp;
        s->qmf_32_subbands = ff_dca_qmf_32_subbands_vfp;
    }

    if (have_neon(flags))
        s->lfe_fir = ff_dca_lfe_fir_neon;
}
/**
 * Arm-specific initialisation of the synthesis-filter context.
 *
 * Mirrors ff_dcadsp_init_arm: the VFP variant is used on VFP-but-not-
 * VFPv3 cores, and NEON (checked last) overrides it when present.
 */
av_cold void ff_synth_filter_init_arm(SynthFilterContext *s)
{
    int flags = av_get_cpu_flags();

    if (have_vfp(flags) && !have_vfpv3(flags))
        s->synth_filter_float = ff_synth_filter_float_vfp;

    if (have_neon(flags))
        s->synth_filter_float = ff_synth_filter_float_neon;
}

View File

@@ -0,0 +1,61 @@
/*
* Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
@ void ff_dca_lfe_fir_neon(float *out, const float *in, const float *coefs,
@                          int decifactor, float scale);
@ Produces decifactor output pairs (out[i] and out[decifactor + i]); each is a
@ scaled accumulation over 256/decifactor coefficient/input products, with the
@ coefficient table walked forwards (cf0) and backwards (cf1) simultaneously.
function ff_dca_lfe_fir_neon, export=1
push {r4-r6,lr}
add r4, r0, r3, lsl #2 @ out2 = out + decifactor
add r5, r2, #256*4-16 @ cf1 = last 4 floats of the 256-entry coef table
sub r1, r1, #12
cmp r3, #32
ite eq
moveq r6, #256/32 @ inner-loop sample count per output pair
movne r6, #256/64
NOVFP vldr s0, [sp, #16] @ scale (softfp: 5th argument lives on the stack)
mov lr, #-16 @ backwards step: 4 floats for cf1 and in
1:
vmov.f32 q2, #0.0 @ v0
vmov.f32 q3, #0.0 @ v1
mov r12, r6
2:
vld1.32 {q8}, [r2,:128]! @ cf0
vld1.32 {q9}, [r5,:128], lr @ cf1
vld1.32 {q1}, [r1], lr @ in
subs r12, r12, #4
@ q10 = cf0 with the two elements of each d-register swapped; the swapped
@ halves (d21, d20) are consumed below so v0 accumulates cf0 in reverse order.
vrev64.32 q10, q8
vmla.f32 q3, q1, q9
vmla.f32 d4, d2, d21
vmla.f32 d5, d3, d20
bne 2b
add r1, r1, r6, lsl #2 @ rewind the input pointer for the next output pair
subs r3, r3, #1
@ Horizontal-sum both accumulators, scale, and store one value per output row.
vadd.f32 d4, d4, d5
vadd.f32 d6, d6, d7
vpadd.f32 d4, d4, d6
vmul.f32 d5, d4, d0[0]
vst1.32 {d5[0]}, [r0,:32]!
vst1.32 {d5[1]}, [r4,:32]!
bne 1b
pop {r4-r6,pc}
endfunc

View File

@@ -0,0 +1,493 @@
/*
* Copyright (c) 2013 RISC OS Open Ltd
* Author: Ben Avison <bavison@riscosopen.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
POUT .req a1
PIN .req a2
PCOEF .req a3
DECIFACTOR .req a4
OLDFPSCR .req a4
COUNTER .req ip
SCALE32 .req s28 @ use vector of 4 in place of 9th scalar when decifactor=32 / JMAX=8
SCALE64 .req s0 @ spare register in scalar bank when decifactor=64 / JMAX=4
IN0 .req s4
IN1 .req s5
IN2 .req s6
IN3 .req s7
IN4 .req s0
IN5 .req s1
IN6 .req s2
IN7 .req s3
COEF0 .req s8 @ coefficient elements
COEF1 .req s9
COEF2 .req s10
COEF3 .req s11
COEF4 .req s12
COEF5 .req s13
COEF6 .req s14
COEF7 .req s15
ACCUM0 .req s16 @ double-buffered multiply-accumulate results
ACCUM4 .req s20
POST0 .req s24 @ do long-latency post-multiply in this vector in parallel
POST1 .req s25
POST2 .req s26
POST3 .req s27
@ One software-pipelined stage of the VFP LFE FIR.
@   \dir  - "up" walks the coefficient table forwards (X=0, Y=+4),
@           anything else walks it backwards from the end.
@   \head - if non-empty: load the next coefficient columns and start the
@           multiply-accumulates against IN0..IN3 (and IN4..IN7 when
@           decifactor == 32, i.e. JMAX == 8).
@   \tail - if non-empty: finish the previous stage (sum the two accumulator
@           vectors, apply the scale, store 4 outputs).
@ Head and tail work are interleaved so loads and the long-latency
@ post-multiply overlap with the accumulate chain (see POST0 comment above).
.macro inner_loop decifactor, dir, tail, head
.ifc "\dir","up"
.set X, 0
.set Y, 4
.else
.set X, 4*JMAX*4 - 4
.set Y, -4
.endif
.ifnc "\head",""
vldr COEF0, [PCOEF, #X + (0*JMAX + 0) * Y]
vldr COEF1, [PCOEF, #X + (1*JMAX + 0) * Y]
vldr COEF2, [PCOEF, #X + (2*JMAX + 0) * Y]
vldr COEF3, [PCOEF, #X + (3*JMAX + 0) * Y]
.endif
.ifnc "\tail",""
vadd.f POST0, ACCUM0, ACCUM4 @ vector operation
.endif
.ifnc "\head",""
vmul.f ACCUM0, COEF0, IN0 @ vector = vector * scalar
vldr COEF4, [PCOEF, #X + (0*JMAX + 1) * Y]
vldr COEF5, [PCOEF, #X + (1*JMAX + 1) * Y]
vldr COEF6, [PCOEF, #X + (2*JMAX + 1) * Y]
.endif
.ifnc "\tail",""
vmul.f POST0, POST0, SCALE\decifactor @ vector operation (SCALE may be scalar)
.endif
.ifnc "\head",""
vldr COEF7, [PCOEF, #X + (3*JMAX + 1) * Y]
@ The same ACCUM4 multiply is issued at one of two different points in the
@ schedule, depending on whether there is tail work to interleave with.
.ifc "\tail",""
vmul.f ACCUM4, COEF4, IN1 @ vector operation
.endif
vldr COEF0, [PCOEF, #X + (0*JMAX + 2) * Y]
vldr COEF1, [PCOEF, #X + (1*JMAX + 2) * Y]
.ifnc "\tail",""
vmul.f ACCUM4, COEF4, IN1 @ vector operation
.endif
vldr COEF2, [PCOEF, #X + (2*JMAX + 2) * Y]
vldr COEF3, [PCOEF, #X + (3*JMAX + 2) * Y]
.endif
.ifnc "\tail",""
vstmia POUT!, {POST0-POST3}
.endif
.ifnc "\head",""
vmla.f ACCUM0, COEF0, IN2 @ vector = vector * scalar
vldr COEF4, [PCOEF, #X + (0*JMAX + 3) * Y]
vldr COEF5, [PCOEF, #X + (1*JMAX + 3) * Y]
vldr COEF6, [PCOEF, #X + (2*JMAX + 3) * Y]
vldr COEF7, [PCOEF, #X + (3*JMAX + 3) * Y]
vmla.f ACCUM4, COEF4, IN3 @ vector = vector * scalar
@ decifactor == 32 processes 8 input samples per stage instead of 4.
.if \decifactor == 32
vldr COEF0, [PCOEF, #X + (0*JMAX + 4) * Y]
vldr COEF1, [PCOEF, #X + (1*JMAX + 4) * Y]
vldr COEF2, [PCOEF, #X + (2*JMAX + 4) * Y]
vldr COEF3, [PCOEF, #X + (3*JMAX + 4) * Y]
vmla.f ACCUM0, COEF0, IN4 @ vector = vector * scalar
vldr COEF4, [PCOEF, #X + (0*JMAX + 5) * Y]
vldr COEF5, [PCOEF, #X + (1*JMAX + 5) * Y]
vldr COEF6, [PCOEF, #X + (2*JMAX + 5) * Y]
vldr COEF7, [PCOEF, #X + (3*JMAX + 5) * Y]
vmla.f ACCUM4, COEF4, IN5 @ vector = vector * scalar
vldr COEF0, [PCOEF, #X + (0*JMAX + 6) * Y]
vldr COEF1, [PCOEF, #X + (1*JMAX + 6) * Y]
vldr COEF2, [PCOEF, #X + (2*JMAX + 6) * Y]
vldr COEF3, [PCOEF, #X + (3*JMAX + 6) * Y]
vmla.f ACCUM0, COEF0, IN6 @ vector = vector * scalar
vldr COEF4, [PCOEF, #X + (0*JMAX + 7) * Y]
vldr COEF5, [PCOEF, #X + (1*JMAX + 7) * Y]
vldr COEF6, [PCOEF, #X + (2*JMAX + 7) * Y]
vldr COEF7, [PCOEF, #X + (3*JMAX + 7) * Y]
vmla.f ACCUM4, COEF4, IN7 @ vector = vector * scalar
.endif
.endif
.endm
@ Body of ff_dca_lfe_fir_vfp, instantiated once per decifactor (32 or 64).
@ Saves the callee-saved VFP registers it uses, then runs the software
@ pipeline: a head-only stage, (decifactor/4 - 1) head+tail stages, and a
@ tail-only stage -- first walking the coefficient table "up", then "down".
@ Finally restores the VFP registers and the caller's FPSCR and returns.
.macro dca_lfe_fir decifactor
.if \decifactor == 32
.set JMAX, 8
vpush {s16-s31}
vmov SCALE32, s0 @ duplicate scalar across vector
vldr IN4, [PIN, #-4*4]
vldr IN5, [PIN, #-5*4]
vldr IN6, [PIN, #-6*4]
vldr IN7, [PIN, #-7*4]
.else
.set JMAX, 4
vpush {s16-s27}
.endif
mov COUNTER, #\decifactor/4 - 1
inner_loop \decifactor, up,, head
1: add PCOEF, PCOEF, #4*JMAX*4
subs COUNTER, COUNTER, #1
inner_loop \decifactor, up, tail, head
bne 1b
inner_loop \decifactor, up, tail
mov COUNTER, #\decifactor/4 - 1
inner_loop \decifactor, down,, head
1: sub PCOEF, PCOEF, #4*JMAX*4
subs COUNTER, COUNTER, #1
inner_loop \decifactor, down, tail, head
bne 1b
inner_loop \decifactor, down, tail
.if \decifactor == 32
vpop {s16-s31}
.else
vpop {s16-s27}
.endif
fmxr FPSCR, OLDFPSCR
bx lr
.endm
/* void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
* int decifactor, float scale)
*/
/* void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
 * int decifactor, float scale)
 */
function ff_dca_lfe_fir_vfp, export=1
teq DECIFACTOR, #32 @ select the 32- or 64-tap variant below
fmrx OLDFPSCR, FPSCR
ldr ip, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
fmxr FPSCR, ip
NOVFP vldr s0, [sp] @ softfp: scale arrives on the stack
@ Preload the first four input samples; the input is read at negative
@ offsets from PIN.
vldr IN0, [PIN, #-0*4]
vldr IN1, [PIN, #-1*4]
vldr IN2, [PIN, #-2*4]
vldr IN3, [PIN, #-3*4]
beq 32f
64: dca_lfe_fir 64
.ltorg @ literal pool between the two large macro expansions
32: dca_lfe_fir 32
endfunc
.unreq POUT
.unreq PIN
.unreq PCOEF
.unreq DECIFACTOR
.unreq OLDFPSCR
.unreq COUNTER
.unreq SCALE32
.unreq SCALE64
.unreq IN0
.unreq IN1
.unreq IN2
.unreq IN3
.unreq IN4
.unreq IN5
.unreq IN6
.unreq IN7
.unreq COEF0
.unreq COEF1
.unreq COEF2
.unreq COEF3
.unreq COEF4
.unreq COEF5
.unreq COEF6
.unreq COEF7
.unreq ACCUM0
.unreq ACCUM4
.unreq POST0
.unreq POST1
.unreq POST2
.unreq POST3
IN .req a1
SBACT .req a2
OLDFPSCR .req a3
IMDCT .req a4
WINDOW .req v1
OUT .req v2
BUF .req v3
SCALEINT .req v4 @ only used in softfp case
COUNT .req v5
SCALE .req s0
/* Stack layout differs in softfp and hardfp cases:
*
* hardfp
* fp -> 6 arg words saved by caller
* a3,a4,v1-v3,v5,fp,lr on entry (a3 just to pad to 8 bytes)
* s16-s23 on entry
* align 16
* buf -> 8*32*4 bytes buffer
* s0 on entry
* sp -> 3 arg words for callee
*
* softfp
* fp -> 7 arg words saved by caller
* a4,v1-v5,fp,lr on entry
* s16-s23 on entry
* align 16
* buf -> 8*32*4 bytes buffer
* sp -> 4 arg words for callee
*/
/* void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
* SynthFilterContext *synth, FFTContext *imdct,
* float (*synth_buf_ptr)[512],
* int *synth_buf_offset, float (*synth_buf2)[32],
* const float (*window)[512], float *samples_out,
* float (*raXin)[32], float scale);
*/
function ff_dca_qmf_32_subbands_vfp, export=1
VFP push {a3-a4,v1-v3,v5,fp,lr}
NOVFP push {a4,v1-v5,fp,lr}
add fp, sp, #8*4
vpush {s16-s23}
@ The buffer pointed at by raXin isn't big enough for us to do a
@ complete matrix transposition as we want to, so allocate an
@ alternative buffer from the stack. Align to 4 words for speed.
sub BUF, sp, #8*32*4
bic BUF, BUF, #15
mov sp, BUF
ldr lr, =0x03330000 @ RunFast mode, short vectors of length 4, stride 2
fmrx OLDFPSCR, FPSCR
fmxr FPSCR, lr
@ COUNT is used to count down 2 things at once:
@ bits 0-4 are the number of word pairs remaining in the output row
@ bits 5-31 are the number of words to copy (with possible negation)
@ from the source matrix before we start zeroing the remainder
mov COUNT, #(-4 << 5) + 16
adds COUNT, COUNT, SBACT, lsl #5
bmi 2f
@ Main loop: transpose four 8-sample input rows at a time into BUF,
@ negating selected samples on the way (elements 0 and 4 of the first
@ and fourth row of each group of four).
@ NOTE(review): the negation pattern presumably implements the QMF
@ alternate-sign convention -- verify against the C reference.
1:
vldr s8, [IN, #(0*8+0)*4]
vldr s10, [IN, #(0*8+1)*4]
vldr s12, [IN, #(0*8+2)*4]
vldr s14, [IN, #(0*8+3)*4]
vldr s16, [IN, #(0*8+4)*4]
vldr s18, [IN, #(0*8+5)*4]
vldr s20, [IN, #(0*8+6)*4]
vldr s22, [IN, #(0*8+7)*4]
vneg.f s8, s8
vldr s9, [IN, #(1*8+0)*4]
vldr s11, [IN, #(1*8+1)*4]
vldr s13, [IN, #(1*8+2)*4]
vldr s15, [IN, #(1*8+3)*4]
vneg.f s16, s16
vldr s17, [IN, #(1*8+4)*4]
vldr s19, [IN, #(1*8+5)*4]
vldr s21, [IN, #(1*8+6)*4]
vldr s23, [IN, #(1*8+7)*4]
vstr d4, [BUF, #(0*32+0)*4]
vstr d5, [BUF, #(1*32+0)*4]
vstr d6, [BUF, #(2*32+0)*4]
vstr d7, [BUF, #(3*32+0)*4]
vstr d8, [BUF, #(4*32+0)*4]
vstr d9, [BUF, #(5*32+0)*4]
vstr d10, [BUF, #(6*32+0)*4]
vstr d11, [BUF, #(7*32+0)*4]
vldr s9, [IN, #(3*8+0)*4]
vldr s11, [IN, #(3*8+1)*4]
vldr s13, [IN, #(3*8+2)*4]
vldr s15, [IN, #(3*8+3)*4]
vldr s17, [IN, #(3*8+4)*4]
vldr s19, [IN, #(3*8+5)*4]
vldr s21, [IN, #(3*8+6)*4]
vldr s23, [IN, #(3*8+7)*4]
vneg.f s9, s9
vldr s8, [IN, #(2*8+0)*4]
vldr s10, [IN, #(2*8+1)*4]
vldr s12, [IN, #(2*8+2)*4]
vldr s14, [IN, #(2*8+3)*4]
vneg.f s17, s17
vldr s16, [IN, #(2*8+4)*4]
vldr s18, [IN, #(2*8+5)*4]
vldr s20, [IN, #(2*8+6)*4]
vldr s22, [IN, #(2*8+7)*4]
vstr d4, [BUF, #(0*32+2)*4]
vstr d5, [BUF, #(1*32+2)*4]
vstr d6, [BUF, #(2*32+2)*4]
vstr d7, [BUF, #(3*32+2)*4]
vstr d8, [BUF, #(4*32+2)*4]
vstr d9, [BUF, #(5*32+2)*4]
vstr d10, [BUF, #(6*32+2)*4]
vstr d11, [BUF, #(7*32+2)*4]
add IN, IN, #4*8*4
add BUF, BUF, #4*4
subs COUNT, COUNT, #(4 << 5) + 2
bpl 1b
2: @ Now deal with trailing < 4 samples
adds COUNT, COUNT, #3 << 5
bmi 4f @ sb_act was a multiple of 4
bics lr, COUNT, #0x1F
bne 3f
@ sb_act was n*4+1
vldr s8, [IN, #(0*8+0)*4]
vldr s10, [IN, #(0*8+1)*4]
vldr s12, [IN, #(0*8+2)*4]
vldr s14, [IN, #(0*8+3)*4]
vldr s16, [IN, #(0*8+4)*4]
vldr s18, [IN, #(0*8+5)*4]
vldr s20, [IN, #(0*8+6)*4]
vldr s22, [IN, #(0*8+7)*4]
vneg.f s8, s8
vldr s9, zero
vldr s11, zero
vldr s13, zero
vldr s15, zero
vneg.f s16, s16
vldr s17, zero
vldr s19, zero
vldr s21, zero
vldr s23, zero
vstr d4, [BUF, #(0*32+0)*4]
vstr d5, [BUF, #(1*32+0)*4]
vstr d6, [BUF, #(2*32+0)*4]
vstr d7, [BUF, #(3*32+0)*4]
vstr d8, [BUF, #(4*32+0)*4]
vstr d9, [BUF, #(5*32+0)*4]
vstr d10, [BUF, #(6*32+0)*4]
vstr d11, [BUF, #(7*32+0)*4]
add BUF, BUF, #2*4
sub COUNT, COUNT, #1
b 4f
3: @ sb_act was n*4+2 or n*4+3, so do the first 2
vldr s8, [IN, #(0*8+0)*4]
vldr s10, [IN, #(0*8+1)*4]
vldr s12, [IN, #(0*8+2)*4]
vldr s14, [IN, #(0*8+3)*4]
vldr s16, [IN, #(0*8+4)*4]
vldr s18, [IN, #(0*8+5)*4]
vldr s20, [IN, #(0*8+6)*4]
vldr s22, [IN, #(0*8+7)*4]
vneg.f s8, s8
vldr s9, [IN, #(1*8+0)*4]
vldr s11, [IN, #(1*8+1)*4]
vldr s13, [IN, #(1*8+2)*4]
vldr s15, [IN, #(1*8+3)*4]
vneg.f s16, s16
vldr s17, [IN, #(1*8+4)*4]
vldr s19, [IN, #(1*8+5)*4]
vldr s21, [IN, #(1*8+6)*4]
vldr s23, [IN, #(1*8+7)*4]
vstr d4, [BUF, #(0*32+0)*4]
vstr d5, [BUF, #(1*32+0)*4]
vstr d6, [BUF, #(2*32+0)*4]
vstr d7, [BUF, #(3*32+0)*4]
vstr d8, [BUF, #(4*32+0)*4]
vstr d9, [BUF, #(5*32+0)*4]
vstr d10, [BUF, #(6*32+0)*4]
vstr d11, [BUF, #(7*32+0)*4]
add BUF, BUF, #2*4
sub COUNT, COUNT, #(2 << 5) + 1
bics lr, COUNT, #0x1F
bne 4f
@ sb_act was n*4+3
vldr s8, [IN, #(2*8+0)*4]
vldr s10, [IN, #(2*8+1)*4]
vldr s12, [IN, #(2*8+2)*4]
vldr s14, [IN, #(2*8+3)*4]
vldr s16, [IN, #(2*8+4)*4]
vldr s18, [IN, #(2*8+5)*4]
vldr s20, [IN, #(2*8+6)*4]
vldr s22, [IN, #(2*8+7)*4]
vldr s9, zero
vldr s11, zero
vldr s13, zero
vldr s15, zero
vldr s17, zero
vldr s19, zero
vldr s21, zero
vldr s23, zero
vstr d4, [BUF, #(0*32+0)*4]
vstr d5, [BUF, #(1*32+0)*4]
vstr d6, [BUF, #(2*32+0)*4]
vstr d7, [BUF, #(3*32+0)*4]
vstr d8, [BUF, #(4*32+0)*4]
vstr d9, [BUF, #(5*32+0)*4]
vstr d10, [BUF, #(6*32+0)*4]
vstr d11, [BUF, #(7*32+0)*4]
add BUF, BUF, #2*4
sub COUNT, COUNT, #1
4: @ Now fill the remainder with 0
vldr s8, zero
vldr s9, zero
ands COUNT, COUNT, #0x1F
beq 6f
5: vstr d4, [BUF, #(0*32+0)*4]
vstr d4, [BUF, #(1*32+0)*4]
vstr d4, [BUF, #(2*32+0)*4]
vstr d4, [BUF, #(3*32+0)*4]
vstr d4, [BUF, #(4*32+0)*4]
vstr d4, [BUF, #(5*32+0)*4]
vstr d4, [BUF, #(6*32+0)*4]
vstr d4, [BUF, #(7*32+0)*4]
add BUF, BUF, #2*4
subs COUNT, COUNT, #1
bne 5b
6:
@ Transposition done: restore the caller's FPSCR, then run the float
@ synthesis filter once for each of the 8 rows in BUF.
fmxr FPSCR, OLDFPSCR
ldr WINDOW, [fp, #3*4]
ldr OUT, [fp, #4*4]
sub BUF, BUF, #32*4
NOVFP ldr SCALEINT, [fp, #6*4]
mov COUNT, #8
VFP vpush {SCALE}
VFP sub sp, sp, #3*4
NOVFP sub sp, sp, #4*4
7:
@ Rebuild the argument list for ff_synth_filter_float_vfp on each pass
@ (the call clobbers a1-a4; see the stack-layout comment above the function).
VFP ldr a1, [fp, #-7*4] @ imdct
NOVFP ldr a1, [fp, #-8*4]
ldmia fp, {a2-a4}
VFP stmia sp, {WINDOW, OUT, BUF}
NOVFP stmia sp, {WINDOW, OUT, BUF, SCALEINT}
VFP vldr SCALE, [sp, #3*4]
bl X(ff_synth_filter_float_vfp)
add OUT, OUT, #32*4
add BUF, BUF, #32*4
subs COUNT, COUNT, #1
bne 7b
A sub sp, fp, #(8+8)*4
T sub fp, fp, #(8+8)*4
T mov sp, fp
vpop {s16-s23}
VFP pop {a3-a4,v1-v3,v5,fp,pc}
NOVFP pop {a4,v1-v5,fp,pc}
endfunc
.unreq IN
.unreq SBACT
.unreq OLDFPSCR
.unreq IMDCT
.unreq WINDOW
.unreq OUT
.unreq BUF
.unreq SCALEINT
.unreq COUNT
.unreq SCALE
.align 2
zero: .word 0

View File

@@ -0,0 +1,125 @@
@
@ ARMv4 optimized DSP utils
@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
@
@ This file is part of FFmpeg.
@
@ FFmpeg is free software; you can redistribute it and/or
@ modify it under the terms of the GNU Lesser General Public
@ License as published by the Free Software Foundation; either
@ version 2.1 of the License, or (at your option) any later version.
@
@ FFmpeg is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
@ Lesser General Public License for more details.
@
@ You should have received a copy of the GNU Lesser General Public
@ License along with FFmpeg; if not, write to the Free Software
@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@
#include "config.h"
#include "libavutil/arm/asm.S"
#if !HAVE_ARMV5TE_EXTERNAL
#define pld @
#endif
.align 5
@ void ff_add_pixels_clamped_arm(int16_t *block, uint8_t *dest, int stride)
/* Adds an 8x8 block of signed 16-bit coefficients to 8-bit dest pixels,
 * clamping every result to 0..255; four pixels are packed per 32-bit word.
 * Clamp trick: if the byte sum leaves 0..255, bit 8 of the sum is set, and
 * ~coeff >> 24 yields 0xFF for a positive (overflowing) coefficient or 0x00
 * for a negative (underflowing) one, since ldrsh sign-extends. */
function ff_add_pixels_clamped_arm, export=1
push {r4-r10}
mov r10, #8 /* 8 rows */
1:
ldr r4, [r1] /* load dest */
/* block[0] and block[1]*/
ldrsh r5, [r0]
ldrsh r7, [r0, #2]
and r6, r4, #0xFF
and r8, r4, #0xFF00
add r6, r6, r5
add r8, r7, r8, lsr #8
mvn r5, r5
mvn r7, r7
tst r6, #0x100
it ne
movne r6, r5, lsr #24 /* clamp: 0x00 or 0xFF from ~coeff */
tst r8, #0x100
it ne
movne r8, r7, lsr #24
mov r9, r6
ldrsh r5, [r0, #4] /* moved form [A] */
orr r9, r9, r8, lsl #8
/* block[2] and block[3] */
/* [A] */
ldrsh r7, [r0, #6]
and r6, r4, #0xFF0000
and r8, r4, #0xFF000000
add r6, r5, r6, lsr #16
add r8, r7, r8, lsr #24
mvn r5, r5
mvn r7, r7
tst r6, #0x100
it ne
movne r6, r5, lsr #24
tst r8, #0x100
it ne
movne r8, r7, lsr #24
orr r9, r9, r6, lsl #16
ldr r4, [r1, #4] /* moved form [B] */
orr r9, r9, r8, lsl #24
/* store dest */
ldrsh r5, [r0, #8] /* moved form [C] */
str r9, [r1]
/* load dest */
/* [B] */
/* block[4] and block[5] */
/* [C] */
ldrsh r7, [r0, #10]
and r6, r4, #0xFF
and r8, r4, #0xFF00
add r6, r6, r5
add r8, r7, r8, lsr #8
mvn r5, r5
mvn r7, r7
tst r6, #0x100
it ne
movne r6, r5, lsr #24
tst r8, #0x100
it ne
movne r8, r7, lsr #24
mov r9, r6
ldrsh r5, [r0, #12] /* moved from [D] */
orr r9, r9, r8, lsl #8
/* block[6] and block[7] */
/* [D] */
ldrsh r7, [r0, #14]
and r6, r4, #0xFF0000
and r8, r4, #0xFF000000
add r6, r5, r6, lsr #16
add r8, r7, r8, lsr #24
mvn r5, r5
mvn r7, r7
tst r6, #0x100
it ne
movne r6, r5, lsr #24
tst r8, #0x100
it ne
movne r8, r7, lsr #24
orr r9, r9, r6, lsl #16
add r0, r0, #16 /* moved from [E] */
orr r9, r9, r8, lsl #24
subs r10, r10, #1 /* moved from [F] */
/* store dest */
str r9, [r1, #4]
/* [E] */
/* [F] */
add r1, r1, r2 /* dest += stride */
bne 1b
pop {r4-r10}
bx lr
endfunc

View File

@@ -0,0 +1,32 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_ARM_DSPUTIL_H
#define AVCODEC_ARM_DSPUTIL_H
#include "libavcodec/avcodec.h"
#include "libavcodec/dsputil.h"
void ff_dsputil_init_armv5te(DSPContext* c, AVCodecContext *avctx);
void ff_dsputil_init_armv6(DSPContext* c, AVCodecContext *avctx);
void ff_dsputil_init_vfp(DSPContext* c, AVCodecContext *avctx);
void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx);
#endif /* AVCODEC_ARM_DSPUTIL_H */

View File

@@ -0,0 +1,381 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
@ Adds an 8x8 block of int16 coefficients to 8-bit dest pixels with unsigned
@ saturation, 8 pixels (one row) per iteration using the ARMv6 SIMD ops:
@ pkhbt/pkhtb regroup the coefficients into even/odd halfword lanes,
@ uxtab16 adds the destination bytes, usat16 clamps each lane to 0..255.
function ff_add_pixels_clamped_armv6, export=1
push {r4-r8,lr}
mov r3, #8 @ 8 rows
1:
ldm r0!, {r4,r5,r12,lr} @ 8 coefficients
ldrd r6, r7, [r1] @ 8 dest pixels
pkhbt r8, r4, r5, lsl #16
pkhtb r5, r5, r4, asr #16
pkhbt r4, r12, lr, lsl #16
pkhtb lr, lr, r12, asr #16
pld [r1, r2]
uxtab16 r8, r8, r6
uxtab16 r5, r5, r6, ror #8
uxtab16 r4, r4, r7
uxtab16 lr, lr, r7, ror #8
usat16 r8, #8, r8
usat16 r5, #8, r5
usat16 r4, #8, r4
usat16 lr, #8, lr
orr r6, r8, r5, lsl #8 @ repack to bytes
orr r7, r4, lr, lsl #8
subs r3, r3, #1
strd_post r6, r7, r1, r2 @ store row, dest += stride
bgt 1b
pop {r4-r8,pc}
endfunc
@ Widens an 8x8 block of 8-bit pixels (r1, stride r2) into int16
@ coefficients stored contiguously at r0, one row per iteration:
@ uxtb16 zero-extends alternate bytes, pkhbt/pkhtb restore source order.
function ff_get_pixels_armv6, export=1
pld [r1, r2]
push {r4-r8, lr}
mov lr, #8 @ 8 rows
1:
ldrd_post r4, r5, r1, r2 @ load row, src += stride
subs lr, lr, #1
uxtb16 r6, r4
uxtb16 r4, r4, ror #8
uxtb16 r12, r5
uxtb16 r8, r5, ror #8
pld [r1, r2]
pkhbt r5, r6, r4, lsl #16
pkhtb r6, r4, r6, asr #16
pkhbt r7, r12, r8, lsl #16
pkhtb r12, r8, r12, asr #16
stm r0!, {r5,r6,r7,r12} @ 8 halfwords out
bgt 1b
pop {r4-r8, pc}
endfunc
@ Stores s1 - s2 for an 8x8 block of 8-bit pixels (common stride r3) as
@ int16 coefficients at r0: bytes are widened with uxtb16, subtracted
@ lane-wise with ssub16, and repacked into source order with pkhbt/pkhtb.
function ff_diff_pixels_armv6, export=1
pld [r1, r3]
pld [r2, r3]
push {r4-r9, lr}
mov lr, #8 @ 8 rows
1:
ldrd_post r4, r5, r1, r3 @ row of s1, s1 += stride
ldrd_post r6, r7, r2, r3 @ row of s2, s2 += stride
uxtb16 r8, r4
uxtb16 r4, r4, ror #8
uxtb16 r9, r6
uxtb16 r6, r6, ror #8
pld [r1, r3]
ssub16 r9, r8, r9
ssub16 r6, r4, r6
uxtb16 r8, r5
uxtb16 r5, r5, ror #8
pld [r2, r3]
pkhbt r4, r9, r6, lsl #16
pkhtb r6, r6, r9, asr #16
uxtb16 r9, r7
uxtb16 r7, r7, ror #8
ssub16 r9, r8, r9
ssub16 r5, r5, r7
subs lr, lr, #1
pkhbt r8, r9, r5, lsl #16
pkhtb r9, r5, r9, asr #16
stm r0!, {r4,r6,r8,r9} @ 8 halfword differences out
bgt 1b
pop {r4-r9, pc}
endfunc
@ Sum of absolute differences over a 16-pixel-wide block, h rows
@ (h is the 5th argument, read from the stack before pushing).
@ usada8 accumulates |a-b| per byte into two running sums (r12, lr).
function ff_pix_abs16_armv6, export=1
ldr r0, [sp] @ r0 = h (row count)
push {r4-r9, lr}
mov r12, #0
mov lr, #0
ldm r1, {r4-r7} @ first row of blk1
ldr r8, [r2]
1:
ldr r9, [r2, #4]
pld [r1, r3]
usada8 r12, r4, r8, r12
ldr r8, [r2, #8]
pld [r2, r3]
usada8 lr, r5, r9, lr
ldr r9, [r2, #12]
usada8 r12, r6, r8, r12
subs r0, r0, #1
usada8 lr, r7, r9, lr
beq 2f
add r1, r1, r3
ldm r1, {r4-r7} @ next row of blk1
add r2, r2, r3
ldr r8, [r2]
b 1b
2:
add r0, r12, lr @ combine the two partial sums
pop {r4-r9, pc}
endfunc
@ SAD of a 16-wide block against blk2 half-pel interpolated horizontally:
@ each reference byte is the rounded average of blk2[x] and blk2[x+1],
@ built with uhadd8 plus a uadd8 correction from the carry mask in lr
@ (0x01010101), then accumulated with usada8.
function ff_pix_abs16_x2_armv6, export=1
ldr r12, [sp] @ r12 = h (row count)
push {r4-r11, lr}
mov r0, #0
mov lr, #1
orr lr, lr, lr, lsl #8
orr lr, lr, lr, lsl #16 @ lr = 0x01010101 rounding mask
1:
ldr r8, [r2]
ldr r9, [r2, #4]
lsr r10, r8, #8 @ blk2 shifted by one byte
ldr r4, [r1]
lsr r6, r9, #8
orr r10, r10, r9, lsl #24
ldr r5, [r2, #8]
eor r11, r8, r10
uhadd8 r7, r8, r10 @ truncating byte average
orr r6, r6, r5, lsl #24
and r11, r11, lr
uadd8 r7, r7, r11 @ add back the rounding bit
ldr r8, [r1, #4]
usada8 r0, r4, r7, r0
eor r7, r9, r6
lsr r10, r5, #8
and r7, r7, lr
uhadd8 r4, r9, r6
ldr r6, [r2, #12]
uadd8 r4, r4, r7
pld [r1, r3]
orr r10, r10, r6, lsl #24
usada8 r0, r8, r4, r0
ldr r4, [r1, #8]
eor r11, r5, r10
ldrb r7, [r2, #16] @ 17th byte completes the last shifted word
and r11, r11, lr
uhadd8 r8, r5, r10
ldr r5, [r1, #12]
uadd8 r8, r8, r11
pld [r2, r3]
lsr r10, r6, #8
usada8 r0, r4, r8, r0
orr r10, r10, r7, lsl #24
subs r12, r12, #1
eor r11, r6, r10
add r1, r1, r3
uhadd8 r9, r6, r10
and r11, r11, lr
uadd8 r9, r9, r11
add r2, r2, r3
usada8 r0, r5, r9, r0
bgt 1b
pop {r4-r11, pc}
endfunc
@ One row of vertically-half-pel SAD for a 16-wide block:
@ \p0..\p3 hold the previous reference row, \n0..\n3 receive the next row.
@ Each reference byte is the rounded average of the two rows (uhadd8 plus a
@ uadd8 carry correction using the 0x01010101 mask in lr), accumulated
@ against the current source row with usada8 into r0.
.macro usad_y2 p0, p1, p2, p3, n0, n1, n2, n3
ldr \n0, [r2]
eor \n1, \p0, \n0
uhadd8 \p0, \p0, \n0
and \n1, \n1, lr
ldr \n2, [r1]
uadd8 \p0, \p0, \n1
ldr \n1, [r2, #4]
usada8 r0, \p0, \n2, r0
pld [r1, r3]
eor \n3, \p1, \n1
uhadd8 \p1, \p1, \n1
and \n3, \n3, lr
ldr \p0, [r1, #4]
uadd8 \p1, \p1, \n3
ldr \n2, [r2, #8]
usada8 r0, \p1, \p0, r0
pld [r2, r3]
eor \p0, \p2, \n2
uhadd8 \p2, \p2, \n2
and \p0, \p0, lr
ldr \p1, [r1, #8]
uadd8 \p2, \p2, \p0
ldr \n3, [r2, #12]
usada8 r0, \p2, \p1, r0
eor \p1, \p3, \n3
uhadd8 \p3, \p3, \n3
and \p1, \p1, lr
ldr \p0, [r1, #12]
uadd8 \p3, \p3, \p1
add r1, r1, r3
usada8 r0, \p3, \p0, r0
add r2, r2, r3
.endm
@ SAD of a 16-wide block against blk2 half-pel interpolated vertically.
@ Two rows are processed per loop, ping-ponging the "previous row" between
@ r4-r7 and r8-r11 via the usad_y2 macro above.
@ NOTE(review): h is assumed even (two rows per iteration) -- verify callers.
function ff_pix_abs16_y2_armv6, export=1
pld [r1]
pld [r2]
ldr r12, [sp] @ r12 = h (row count)
push {r4-r11, lr}
mov r0, #0
mov lr, #1
orr lr, lr, lr, lsl #8
orr lr, lr, lr, lsl #16 @ lr = 0x01010101 rounding mask
ldr r4, [r2] @ prime the first reference row
ldr r5, [r2, #4]
ldr r6, [r2, #8]
ldr r7, [r2, #12]
add r2, r2, r3
1:
usad_y2 r4, r5, r6, r7, r8, r9, r10, r11
subs r12, r12, #2
usad_y2 r8, r9, r10, r11, r4, r5, r6, r7
bgt 1b
pop {r4-r11, pc}
endfunc
@ Sum of absolute differences over an 8-pixel-wide block, h rows,
@ two rows per iteration; partial sums kept in r0 and lr and combined
@ at the end.
@ NOTE(review): h is assumed even (two rows per iteration) -- verify callers.
function ff_pix_abs8_armv6, export=1
pld [r2, r3]
ldr r12, [sp] @ r12 = h (row count)
push {r4-r9, lr}
mov r0, #0
mov lr, #0
ldrd_post r4, r5, r1, r3 @ first row of blk1
1:
subs r12, r12, #2
ldr r7, [r2, #4]
ldr_post r6, r2, r3
ldrd_post r8, r9, r1, r3
usada8 r0, r4, r6, r0
pld [r2, r3]
usada8 lr, r5, r7, lr
ldr r7, [r2, #4]
ldr_post r6, r2, r3
beq 2f
ldrd_post r4, r5, r1, r3
usada8 r0, r8, r6, r0
pld [r2, r3]
usada8 lr, r9, r7, lr
b 1b
2: @ final pending row pair
usada8 r0, r8, r6, r0
usada8 lr, r9, r7, lr
add r0, r0, lr
pop {r4-r9, pc}
endfunc
@ Sum of squared errors over a 16-pixel-wide block, h rows:
@ bytes are widened to halfword lanes with uxtb16, differenced with
@ usub16, and the squares accumulated in pairs with smlad.
function ff_sse16_armv6, export=1
ldr r12, [sp] @ r12 = h (row count)
push {r4-r9, lr}
mov r0, #0
1:
ldrd r4, r5, [r1]
ldr r8, [r2]
uxtb16 lr, r4
uxtb16 r4, r4, ror #8
uxtb16 r9, r8
uxtb16 r8, r8, ror #8
ldr r7, [r2, #4]
usub16 lr, lr, r9
usub16 r4, r4, r8
smlad r0, lr, lr, r0
uxtb16 r6, r5
uxtb16 lr, r5, ror #8
uxtb16 r8, r7
uxtb16 r9, r7, ror #8
smlad r0, r4, r4, r0
ldrd r4, r5, [r1, #8]
usub16 r6, r6, r8
usub16 r8, lr, r9
ldr r7, [r2, #8]
smlad r0, r6, r6, r0
uxtb16 lr, r4
uxtb16 r4, r4, ror #8
uxtb16 r9, r7
uxtb16 r7, r7, ror #8
smlad r0, r8, r8, r0
ldr r8, [r2, #12]
usub16 lr, lr, r9
usub16 r4, r4, r7
smlad r0, lr, lr, r0
uxtb16 r6, r5
uxtb16 r5, r5, ror #8
uxtb16 r9, r8
uxtb16 r8, r8, ror #8
smlad r0, r4, r4, r0
usub16 r6, r6, r9
usub16 r5, r5, r8
smlad r0, r6, r6, r0
add r1, r1, r3 @ advance both rows by stride
add r2, r2, r3
subs r12, r12, #1
smlad r0, r5, r5, r0
bgt 1b
pop {r4-r9, pc}
endfunc
@ Sum of squared pixel values over a 16x16 block (r0 = pix, r1 = stride):
@ uxtb16 widens alternate bytes to halfword lanes and smlad accumulates
@ the pairwise squares into lr.
function ff_pix_norm1_armv6, export=1
push {r4-r6, lr}
mov r12, #16 @ 16 rows
mov lr, #0
1:
ldm r0, {r2-r5} @ 16 pixels
uxtb16 r6, r2
uxtb16 r2, r2, ror #8
smlad lr, r6, r6, lr
uxtb16 r6, r3
smlad lr, r2, r2, lr
uxtb16 r3, r3, ror #8
smlad lr, r6, r6, lr
uxtb16 r6, r4
smlad lr, r3, r3, lr
uxtb16 r4, r4, ror #8
smlad lr, r6, r6, lr
uxtb16 r6, r5
smlad lr, r4, r4, lr
uxtb16 r5, r5, ror #8
smlad lr, r6, r6, lr
subs r12, r12, #1
add r0, r0, r1 @ pix += stride
smlad lr, r5, r5, lr
bgt 1b
mov r0, lr
pop {r4-r6, pc}
endfunc
@ Sum of pixel values over a 16x16 block (r0 = pix, r1 = stride):
@ usada8 against a zero register (lr) sums the 4 bytes of each word,
@ accumulating into two partial sums (r2, r3) combined at the end.
function ff_pix_sum_armv6, export=1
push {r4-r7, lr}
mov r12, #16 @ 16 rows
mov r2, #0
mov r3, #0
mov lr, #0 @ zero operand: usada8 x,y,0 sums |bytes of x|
ldr r4, [r0]
1:
subs r12, r12, #1
ldr r5, [r0, #4]
usada8 r2, r4, lr, r2
ldr r6, [r0, #8]
usada8 r3, r5, lr, r3
ldr r7, [r0, #12]
usada8 r2, r6, lr, r2
beq 2f
ldr_pre r4, r0, r1 @ pix += stride; load next row's first word
usada8 r3, r7, lr, r3
bgt 1b
2: @ fold in the last pending word of the final row
usada8 r3, r7, lr, r3
add r0, r2, r3
pop {r4-r7, pc}
endfunc

View File

@@ -0,0 +1,86 @@
/*
* ARM optimized DSP utils
* Copyright (c) 2001 Lionel Ulmer
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/arm/cpu.h"
#include "dsputil_arm.h"
void ff_j_rev_dct_arm(int16_t *data);
void ff_simple_idct_arm(int16_t *data);
/* XXX: local hack */
static void (*ff_put_pixels_clamped)(const int16_t *block, uint8_t *pixels, int line_size);
static void (*ff_add_pixels_clamped)(const int16_t *block, uint8_t *pixels, int line_size);
void ff_add_pixels_clamped_arm(const int16_t *block, uint8_t *dest,
int line_size);
/* XXX: those functions should be suppressed ASAP when all IDCTs are
converted */
/* Run the ARM jrevdct IDCT in place, then store the result to dest
 * through the saved clamped-put helper. */
static void j_rev_dct_arm_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct_arm(block);
    ff_put_pixels_clamped(block, dest, line_size);
}
/* Run the ARM jrevdct IDCT in place, then merge the result into dest
 * through the saved clamped-add helper. */
static void j_rev_dct_arm_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct_arm(block);
    ff_add_pixels_clamped(block, dest, line_size);
}
/* Run the ARM simple IDCT in place, then store the result to dest
 * through the saved clamped-put helper. */
static void simple_idct_arm_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_simple_idct_arm(block);
    ff_put_pixels_clamped(block, dest, line_size);
}
/* Run the ARM simple IDCT in place, then merge the result into dest
 * through the saved clamped-add helper. */
static void simple_idct_arm_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_simple_idct_arm(block);
    ff_add_pixels_clamped(block, dest, line_size);
}
/**
 * Install the base ARM DSP routines and dispatch to the
 * ARMv5TE/ARMv6/NEON initializers as supported by the CPU.
 */
av_cold void ff_dsputil_init_arm(DSPContext *c, AVCodecContext *avctx)
{
    int cpu_flags = av_get_cpu_flags();

    /* Capture the clamp helpers the generic init installed; the IDCT
     * wrappers above forward to them after running the IDCT. */
    ff_put_pixels_clamped = c->put_pixels_clamped;
    ff_add_pixels_clamped = c->add_pixels_clamped;

    if (!avctx->lowres && avctx->bits_per_raw_sample <= 8) {
        if (avctx->idct_algo == FF_IDCT_AUTO ||
            avctx->idct_algo == FF_IDCT_ARM) {
            c->idct_put              = j_rev_dct_arm_put;
            c->idct_add              = j_rev_dct_arm_add;
            c->idct                  = ff_j_rev_dct_arm;
            c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
        } else if (avctx->idct_algo == FF_IDCT_SIMPLEARM) {
            c->idct_put              = simple_idct_arm_put;
            c->idct_add              = simple_idct_arm_add;
            c->idct                  = ff_simple_idct_arm;
            c->idct_permutation_type = FF_NO_IDCT_PERM;
        }
    }

    c->add_pixels_clamped = ff_add_pixels_clamped_arm;

    if (have_armv5te(cpu_flags))
        ff_dsputil_init_armv5te(c, avctx);
    if (have_armv6(cpu_flags))
        ff_dsputil_init_armv6(c, avctx);
    if (have_neon(cpu_flags))
        ff_dsputil_init_neon(c, avctx);
}

View File

@@ -0,0 +1,37 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "dsputil_arm.h"
void ff_simple_idct_armv5te(int16_t *data);
void ff_simple_idct_put_armv5te(uint8_t *dest, int line_size, int16_t *data);
void ff_simple_idct_add_armv5te(uint8_t *dest, int line_size, int16_t *data);
/**
 * Install the ARMv5TE simple IDCT, used only for plain 8-bit,
 * non-lowres decoding when the IDCT choice allows it.
 */
av_cold void ff_dsputil_init_armv5te(DSPContext *c, AVCodecContext *avctx)
{
    if (avctx->lowres || avctx->bits_per_raw_sample > 8)
        return;
    if (avctx->idct_algo != FF_IDCT_AUTO &&
        avctx->idct_algo != FF_IDCT_SIMPLEARMV5TE)
        return;

    c->idct_put              = ff_simple_idct_put_armv5te;
    c->idct_add              = ff_simple_idct_add_armv5te;
    c->idct                  = ff_simple_idct_armv5te;
    c->idct_permutation_type = FF_NO_IDCT_PERM;
}

View File

@@ -0,0 +1,85 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavcodec/avcodec.h"
#include "dsputil_arm.h"
void ff_simple_idct_armv6(int16_t *data);
void ff_simple_idct_put_armv6(uint8_t *dest, int line_size, int16_t *data);
void ff_simple_idct_add_armv6(uint8_t *dest, int line_size, int16_t *data);
void ff_add_pixels_clamped_armv6(const int16_t *block,
uint8_t *restrict pixels,
int line_size);
void ff_get_pixels_armv6(int16_t *block, const uint8_t *pixels, int stride);
void ff_diff_pixels_armv6(int16_t *block, const uint8_t *s1,
const uint8_t *s2, int stride);
int ff_pix_abs16_armv6(void *s, uint8_t *blk1, uint8_t *blk2,
int line_size, int h);
int ff_pix_abs16_x2_armv6(void *s, uint8_t *blk1, uint8_t *blk2,
int line_size, int h);
int ff_pix_abs16_y2_armv6(void *s, uint8_t *blk1, uint8_t *blk2,
int line_size, int h);
int ff_pix_abs8_armv6(void *s, uint8_t *blk1, uint8_t *blk2,
int line_size, int h);
int ff_sse16_armv6(void *s, uint8_t *blk1, uint8_t *blk2,
int line_size, int h);
int ff_pix_norm1_armv6(uint8_t *pix, int line_size);
int ff_pix_sum_armv6(uint8_t *pix, int line_size);
/**
 * Install the ARMv6 SIMD DSP routines.  The simple IDCT is limited to
 * plain 8-bit, non-lowres decoding; get_pixels likewise requires 8-bit
 * input, while the SAD/SSE/sum helpers are installed unconditionally.
 */
av_cold void ff_dsputil_init_armv6(DSPContext *c, AVCodecContext *avctx)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;
    const int idct_ok        = !avctx->lowres &&
                               avctx->bits_per_raw_sample <= 8 &&
                               (avctx->idct_algo == FF_IDCT_AUTO ||
                                avctx->idct_algo == FF_IDCT_SIMPLEARMV6);

    if (idct_ok) {
        c->idct_put              = ff_simple_idct_put_armv6;
        c->idct_add              = ff_simple_idct_add_armv6;
        c->idct                  = ff_simple_idct_armv6;
        c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
    }

    if (!high_bit_depth)
        c->get_pixels = ff_get_pixels_armv6;

    c->add_pixels_clamped = ff_add_pixels_clamped_armv6;
    c->diff_pixels        = ff_diff_pixels_armv6;

    c->pix_abs[0][0] = ff_pix_abs16_armv6;
    c->pix_abs[0][1] = ff_pix_abs16_x2_armv6;
    c->pix_abs[0][2] = ff_pix_abs16_y2_armv6;
    c->pix_abs[1][0] = ff_pix_abs8_armv6;

    c->sad[0] = ff_pix_abs16_armv6;
    c->sad[1] = ff_pix_abs8_armv6;
    c->sse[0] = ff_sse16_armv6;

    c->pix_norm1 = ff_pix_norm1_armv6;
    c->pix_sum   = ff_pix_sum_armv6;
}

View File

@@ -0,0 +1,81 @@
/*
* ARM NEON optimised DSP functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavcodec/avcodec.h"
#include "dsputil_arm.h"
void ff_simple_idct_neon(int16_t *data);
void ff_simple_idct_put_neon(uint8_t *dest, int line_size, int16_t *data);
void ff_simple_idct_add_neon(uint8_t *dest, int line_size, int16_t *data);
void ff_clear_block_neon(int16_t *block);
void ff_clear_blocks_neon(int16_t *blocks);
void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, int);
void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, int);
void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, int);
void ff_vector_clipf_neon(float *dst, const float *src, float min, float max,
int len);
void ff_vector_clip_int32_neon(int32_t *dst, const int32_t *src, int32_t min,
int32_t max, unsigned int len);
int32_t ff_scalarproduct_int16_neon(const int16_t *v1, const int16_t *v2, int len);
int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2,
const int16_t *v3, int len, int mul);
void ff_apply_window_int16_neon(int16_t *dst, const int16_t *src,
const int16_t *window, unsigned n);
/**
 * Install NEON-optimised implementations into a DSPContext.
 *
 * @param c     DSP context to be populated
 * @param avctx codec context; lowres / bits_per_raw_sample / idct_algo
 *              decide whether the NEON simple IDCT may be used
 */
av_cold void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
{
    const int bits = avctx->bits_per_raw_sample;

    /* The NEON simple IDCT handles only full-resolution 8-bit content. */
    if (!avctx->lowres && bits <= 8 &&
        (avctx->idct_algo == FF_IDCT_AUTO ||
         avctx->idct_algo == FF_IDCT_SIMPLENEON)) {
        c->idct_put              = ff_simple_idct_put_neon;
        c->idct_add              = ff_simple_idct_add_neon;
        c->idct                  = ff_simple_idct_neon;
        c->idct_permutation_type = FF_PARTTRANS_IDCT_PERM;
    }

    /* Block clearing operates on 16-bit coefficients: 8-bit depths only. */
    if (bits <= 8) {
        c->clear_block  = ff_clear_block_neon;
        c->clear_blocks = ff_clear_blocks_neon;
    }

    /* Bit-depth independent helpers. */
    c->add_pixels_clamped        = ff_add_pixels_clamped_neon;
    c->put_pixels_clamped        = ff_put_pixels_clamped_neon;
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon;

    c->vector_clipf              = ff_vector_clipf_neon;
    c->vector_clip_int32         = ff_vector_clip_int32_neon;

    c->scalarproduct_int16          = ff_scalarproduct_int16_neon;
    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_neon;

    c->apply_window_int16 = ff_apply_window_int16_neon;
}

View File

@@ -0,0 +1,209 @@
/*
* ARM NEON optimised DSP functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
@ void ff_clear_block_neon(int16_t *block)
@ Zero one 8x8 block of 16-bit coefficients (128 bytes) at r0.
function ff_clear_block_neon, export=1
vmov.i16 q0, #0 @ q0 = sixteen zero bytes
.rept 8
vst1.16 {q0}, [r0,:128]! @ store 16 bytes per iteration, post-increment
.endr
bx lr
endfunc
@ void ff_clear_blocks_neon(int16_t *blocks)
@ Zero six consecutive 8x8 coefficient blocks (6 * 128 bytes) at r0.
function ff_clear_blocks_neon, export=1
vmov.i16 q0, #0 @ q0 = sixteen zero bytes
.rept 8*6
vst1.16 {q0}, [r0,:128]! @ store 16 bytes per iteration, post-increment
.endr
bx lr
endfunc
@ void ff_put_pixels_clamped_neon(const int16_t *block, uint8_t *pixels, int line_size)
@ Saturate 64 signed 16-bit coefficients (r0) to unsigned 8-bit and store
@ them as an 8x8 pixel block at r1, advancing by line_size (r2) per row.
@ Loads and narrowing are interleaved with the stores to hide latency.
function ff_put_pixels_clamped_neon, export=1
vld1.16 {d16-d19}, [r0,:128]!
vqmovun.s16 d0, q8 @ rows 0-1: saturating signed->unsigned narrow
vld1.16 {d20-d23}, [r0,:128]!
vqmovun.s16 d1, q9
vld1.16 {d24-d27}, [r0,:128]!
vqmovun.s16 d2, q10
vld1.16 {d28-d31}, [r0,:128]!
vqmovun.s16 d3, q11
vst1.8 {d0}, [r1,:64], r2 @ store one 8-pixel row, step line_size
vqmovun.s16 d4, q12
vst1.8 {d1}, [r1,:64], r2
vqmovun.s16 d5, q13
vst1.8 {d2}, [r1,:64], r2
vqmovun.s16 d6, q14
vst1.8 {d3}, [r1,:64], r2
vqmovun.s16 d7, q15
vst1.8 {d4}, [r1,:64], r2
vst1.8 {d5}, [r1,:64], r2
vst1.8 {d6}, [r1,:64], r2
vst1.8 {d7}, [r1,:64], r2
bx lr
endfunc
@ void ff_put_signed_pixels_clamped_neon(const int16_t *block, uint8_t *pixels, int line_size)
@ Saturate 64 signed 16-bit coefficients (r0) to signed 8-bit, add a +128
@ bias to map them into unsigned pixel range, and store an 8x8 block at r1
@ with row stride line_size (r2).
function ff_put_signed_pixels_clamped_neon, export=1
vmov.u8 d31, #128 @ bias added after signed saturation
vld1.16 {d16-d17}, [r0,:128]!
vqmovn.s16 d0, q8 @ saturating narrow to signed 8-bit
vld1.16 {d18-d19}, [r0,:128]!
vqmovn.s16 d1, q9
vld1.16 {d16-d17}, [r0,:128]!
vqmovn.s16 d2, q8
vld1.16 {d18-d19}, [r0,:128]!
vadd.u8 d0, d0, d31 @ +128: signed -> unsigned pixel range
vld1.16 {d20-d21}, [r0,:128]!
vadd.u8 d1, d1, d31
vld1.16 {d22-d23}, [r0,:128]!
vadd.u8 d2, d2, d31
vst1.8 {d0}, [r1,:64], r2 @ store one 8-pixel row, step line_size
vqmovn.s16 d3, q9
vst1.8 {d1}, [r1,:64], r2
vqmovn.s16 d4, q10
vst1.8 {d2}, [r1,:64], r2
vqmovn.s16 d5, q11
vld1.16 {d24-d25}, [r0,:128]!
vadd.u8 d3, d3, d31
vld1.16 {d26-d27}, [r0,:128]!
vadd.u8 d4, d4, d31
vadd.u8 d5, d5, d31
vst1.8 {d3}, [r1,:64], r2
vqmovn.s16 d6, q12
vst1.8 {d4}, [r1,:64], r2
vqmovn.s16 d7, q13
vst1.8 {d5}, [r1,:64], r2
vadd.u8 d6, d6, d31
vadd.u8 d7, d7, d31
vst1.8 {d6}, [r1,:64], r2
vst1.8 {d7}, [r1,:64], r2
bx lr
endfunc
@ void ff_add_pixels_clamped_neon(const int16_t *block, uint8_t *pixels, int line_size)
@ Add 64 signed 16-bit coefficients (r0) to the existing 8x8 pixel block at
@ r1 (row stride r2), saturating the result to unsigned 8-bit.  r1 is read
@ ahead of r3, which trails as the write pointer to the same rows.
function ff_add_pixels_clamped_neon, export=1
mov r3, r1 @ r3 = write pointer, r1 = read pointer
vld1.8 {d16}, [r1,:64], r2
vld1.16 {d0-d1}, [r0,:128]!
vaddw.u8 q0, q0, d16 @ widen pixels and add coefficients
vld1.8 {d17}, [r1,:64], r2
vld1.16 {d2-d3}, [r0,:128]!
vqmovun.s16 d0, q0 @ saturate back to unsigned 8-bit
vld1.8 {d18}, [r1,:64], r2
vaddw.u8 q1, q1, d17
vld1.16 {d4-d5}, [r0,:128]!
vaddw.u8 q2, q2, d18
vst1.8 {d0}, [r3,:64], r2
vqmovun.s16 d2, q1
vld1.8 {d19}, [r1,:64], r2
vld1.16 {d6-d7}, [r0,:128]!
vaddw.u8 q3, q3, d19
vqmovun.s16 d4, q2
vst1.8 {d2}, [r3,:64], r2
vld1.8 {d16}, [r1,:64], r2
vqmovun.s16 d6, q3
vld1.16 {d0-d1}, [r0,:128]!
vaddw.u8 q0, q0, d16
vst1.8 {d4}, [r3,:64], r2
vld1.8 {d17}, [r1,:64], r2
vld1.16 {d2-d3}, [r0,:128]!
vaddw.u8 q1, q1, d17
vst1.8 {d6}, [r3,:64], r2
vqmovun.s16 d0, q0
vld1.8 {d18}, [r1,:64], r2
vld1.16 {d4-d5}, [r0,:128]!
vaddw.u8 q2, q2, d18
vst1.8 {d0}, [r3,:64], r2
vqmovun.s16 d2, q1
vld1.8 {d19}, [r1,:64], r2
vqmovun.s16 d4, q2
vld1.16 {d6-d7}, [r0,:128]!
vaddw.u8 q3, q3, d19
vst1.8 {d2}, [r3,:64], r2
vqmovun.s16 d6, q3
vst1.8 {d4}, [r3,:64], r2
vst1.8 {d6}, [r3,:64], r2
bx lr
endfunc
@ void ff_vector_clipf_neon(float *dst, const float *src, float min, float max, int len)
@ Clamp len floats from src into [min, max] and write them to dst.
@ VFP/NOVFP prefixes select argument pickup for hard-float vs soft-float
@ ABIs (min/max in d0 vs r2/r3, len on the stack for soft-float).
@ The loop consumes 8 floats per iteration and exits on len == 0, so len
@ must be a positive multiple of 8 (subs #8 / beq below).
function ff_vector_clipf_neon, export=1
VFP vdup.32 q1, d0[1] @ q1 = max (hard-float ABI)
VFP vdup.32 q0, d0[0] @ q0 = min
NOVFP vdup.32 q0, r2 @ q0 = min (soft-float ABI)
NOVFP vdup.32 q1, r3 @ q1 = max
NOVFP ldr r2, [sp] @ r2 = len
vld1.f32 {q2},[r1,:128]!
vmin.f32 q10, q2, q1 @ clamp to max
vld1.f32 {q3},[r1,:128]!
vmin.f32 q11, q3, q1
1: vmax.f32 q8, q10, q0 @ clamp to min
vmax.f32 q9, q11, q0
subs r2, r2, #8
beq 2f
vld1.f32 {q2},[r1,:128]! @ prefetch next 8 before storing current
vmin.f32 q10, q2, q1
vld1.f32 {q3},[r1,:128]!
vmin.f32 q11, q3, q1
vst1.f32 {q8},[r0,:128]!
vst1.f32 {q9},[r0,:128]!
b 1b
2: vst1.f32 {q8},[r0,:128]! @ flush final 8 results
vst1.f32 {q9},[r0,:128]!
bx lr
endfunc
@ void ff_apply_window_int16_neon(int16_t *dst, const int16_t *src, const int16_t *window, unsigned n)
@ Multiply src by a symmetric 16-bit window (rounding doubling multiply)
@ writing n samples from the front (r0/r1 forward) and n from the back
@ (lr/r4 backward with the window reversed) per the half-window layout.
@ Processes 16 front + 16 back samples per iteration, so n is assumed to
@ be a multiple of 16 (subs #16 / bgt) -- NOTE(review): confirm with callers.
function ff_apply_window_int16_neon, export=1
push {r4,lr}
add r4, r1, r3, lsl #1 @ r4 = src + n (back read pointer)
add lr, r0, r3, lsl #1 @ lr = dst + n (back write pointer)
sub r4, r4, #16
sub lr, lr, #16
mov r12, #-16 @ backward stride
1:
vld1.16 {q0}, [r1,:128]! @ front samples
vld1.16 {q2}, [r2,:128]! @ window coefficients
vld1.16 {q1}, [r4,:128], r12 @ back samples, walking down
vrev64.16 q3, q2 @ reversed window for the back half
vqrdmulh.s16 q0, q0, q2
vqrdmulh.s16 d2, d2, d7
vqrdmulh.s16 d3, d3, d6
vst1.16 {q0}, [r0,:128]!
vst1.16 {q1}, [lr,:128], r12
subs r3, r3, #16
bgt 1b
pop {r4,pc}
endfunc
@ void ff_vector_clip_int32_neon(int32_t *dst, const int32_t *src, int32_t min, int32_t max, unsigned len)
@ Clamp len 32-bit integers from src into [min, max] and store to dst.
@ Handles 8 values per iteration; len is decremented by 8 with bgt, so it
@ is assumed to be a multiple of 8 -- NOTE(review): confirm with callers.
function ff_vector_clip_int32_neon, export=1
vdup.32 q0, r2 @ q0 = min, broadcast
vdup.32 q1, r3 @ q1 = max, broadcast
ldr r2, [sp] @ r2 = len (5th argument on the stack)
1:
vld1.32 {q2-q3}, [r1,:128]!
vmin.s32 q2, q2, q1 @ clamp to max
vmin.s32 q3, q3, q1
vmax.s32 q2, q2, q0 @ clamp to min
vmax.s32 q3, q3, q0
vst1.32 {q2-q3}, [r0,:128]!
subs r2, r2, #8
bgt 1b
bx lr
endfunc

View File

@@ -0,0 +1,48 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/cpu.h"
#define CONFIG_FFT_FLOAT 0
#include "libavcodec/fft.h"
void ff_fft_fixed_calc_neon(FFTContext *s, FFTComplex *z);
void ff_mdct_fixed_calc_neon(FFTContext *s, FFTSample *o, const FFTSample *i);
void ff_mdct_fixed_calcw_neon(FFTContext *s, FFTDouble *o, const FFTSample *i);
/**
 * Hook up NEON fixed-point FFT/MDCT implementations if the CPU supports
 * them; otherwise leave the generic C function pointers untouched.
 */
av_cold void ff_fft_fixed_init_arm(FFTContext *s)
{
    int flags = av_get_cpu_flags();

    if (!have_neon(flags))
        return;

    s->fft_permutation = FF_FFT_PERM_SWAP_LSBS;

#if CONFIG_FFT
    s->fft_calc = ff_fft_fixed_calc_neon;
#endif

#if CONFIG_MDCT
    /* Forward MDCT only, and only for transforms of at least 8 points. */
    if (!s->inverse && s->nbits >= 3) {
        s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
        s->mdct_calc        = ff_mdct_fixed_calc_neon;
        s->mdct_calcw       = ff_mdct_fixed_calcw_neon;
    }
#endif
}

View File

@@ -0,0 +1,261 @@
/*
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
@ Butterfly step shared by the fixed-point FFT kernels below.
@ Halving adds/subs (vhadd/vhsub) are used throughout to keep the 16-bit
@ fixed-point values in range.
.macro bflies d0, d1, r0, r1
vrev64.32 \r0, \d1 @ t5, t6, t1, t2
vhsub.s16 \r1, \d1, \r0 @ t1-t5, t2-t6, t5-t1, t6-t2
vhadd.s16 \r0, \d1, \r0 @ t1+t5, t2+t6, t5+t1, t6+t2
vext.16 \r1, \r1, \r1, #1 @ t2-t6, t5-t1, t6-t2, t1-t5
vtrn.32 \r0, \r1 @ t1+t5, t2+t6, t2-t6, t5-t1
@ t5, t6, t4, t3
vhsub.s16 \d1, \d0, \r0
vhadd.s16 \d0, \d0, \r0
.endm
@ Twiddle-multiply \d3 by the complex coefficient (\c0, \c1) in Q15
@ (vmull/vmlal then vshrn #15), then apply the butterfly.
.macro transform01 q0, q1, d3, c0, c1, r0, w0, w1
vrev32.16 \r0, \d3
vmull.s16 \w0, \d3, \c0
vmlal.s16 \w0, \r0, \c1
vshrn.s32 \d3, \w0, #15
bflies \q0, \q1, \w0, \w1
.endm
@ Same as transform01 but twiddles two vectors (\d1 and \d3) with two
@ coefficient pairs before the butterfly.
.macro transform2 d0, d1, d2, d3, q0, q1, c0, c1, c2, c3, \
r0, r1, w0, w1
vrev32.16 \r0, \d1
vrev32.16 \r1, \d3
vmull.s16 \w0, \d1, \c0
vmlal.s16 \w0, \r0, \c1
vmull.s16 \w1, \d3, \c2
vmlal.s16 \w1, \r1, \c3
vshrn.s32 \d1, \w0, #15
vshrn.s32 \d3, \w1, #15
bflies \q0, \q1, \w0, \w1
.endm
@ 4-point fixed-point FFT on the interleaved complex values in \d0:\d1.
.macro fft4 d0, d1, r0, r1
vhsub.s16 \r0, \d0, \d1 @ t3, t4, t8, t7
vhsub.s16 \r1, \d1, \d0
vhadd.s16 \d0, \d0, \d1 @ t1, t2, t6, t5
vmov.i64 \d1, #0xffff00000000
vbit \r0, \r1, \d1
vrev64.16 \r1, \r0 @ t7, t8, t4, t3
vtrn.32 \r0, \r1 @ t3, t4, t7, t8
vtrn.32 \d0, \r0 @ t1, t2, t3, t4, t6, t5, t8, t7
vhsub.s16 \d1, \d0, \r0 @ r2, i2, r3, i1
vhadd.s16 \d0, \d0, \r0 @ r0, i0, r1, i3
.endm
@ 8-point fixed-point FFT: fft4 on the first half, then combine with the
@ second half through a twiddled butterfly.
.macro fft8 d0, d1, d2, d3, q0, q1, c0, c1, r0, r1, w0, w1
fft4 \d0, \d1, \r0, \r1
vtrn.32 \d0, \d1 @ z0, z2, z1, z3
vhadd.s16 \r0, \d2, \d3 @ t1, t2, t3, t4
vhsub.s16 \d3, \d2, \d3 @ z5, z7
vmov \d2, \r0
transform01 \q0, \q1, \d3, \c0, \c1, \r0, \w0, \w1
.endm
@ 4-point fixed-point FFT, in place on the FFTComplex array at r0.
function fft4_neon
vld1.16 {d0-d1}, [r0]
fft4 d0, d1, d2, d3
vst1.16 {d0-d1}, [r0]
bx lr
endfunc
@ 8-point fixed-point FFT, in place on the FFTComplex array at r0.
function fft8_neon
vld1.16 {d0-d3}, [r0,:128]
movrel r1, coefs @ Q15 twiddle constants
vld1.16 {d30}, [r1,:64]
vdup.16 d31, d30[0] @ d31 = sqrt(1/2) broadcast
fft8 d0, d1, d2, d3, q0, q1, d31, d30, d20, d21, q8, q9
vtrn.32 d0, d1 @ undo the fft4 interleaving
vtrn.32 d2, d3
vst1.16 {d0-d3}, [r0,:128]
bx lr
endfunc
@ 16-point fixed-point FFT, in place on the FFTComplex array at r0:
@ an 8-point FFT on the first half, two 4-point FFTs on the second,
@ then two twiddled combination passes.
function fft16_neon
vld1.16 {d0-d3}, [r0,:128]!
vld1.16 {d4-d7}, [r0,:128]
movrel r1, coefs
sub r0, r0, #32 @ rewind after the two loads
vld1.16 {d28-d31},[r1,:128]
vdup.16 d31, d28[0]
fft8 d0, d1, d2, d3, q0, q1, d31, d28, d20, d21, q8, q9
vswp d5, d6
fft4 q2, q3, q8, q9 @ two fft4s on the upper half
vswp d5, d6
vtrn.32 q0, q1 @ z0, z4, z2, z6, z1, z5, z3, z7
vtrn.32 q2, q3 @ z8, z12,z10,z14,z9, z13,z11,z15
vswp d1, d2
vdup.16 d31, d28[0]
transform01 q0, q2, d5, d31, d28, d20, q8, q9
vdup.16 d26, d29[0]
vdup.16 d27, d30[0]
transform2 d2, d6, d3, d7, q1, q3, d26, d30, d27, d29, \
d20, d21, q8, q9
vtrn.32 q0, q1 @ restore natural ordering
vtrn.32 q2, q3
vst1.16 {d0-d3}, [r0,:128]!
vst1.16 {d4-d7}, [r0,:128]
bx lr
endfunc
@ fft_pass_neon(z = r0, twiddle table = r1, n = r2)
@ One combination pass of the fixed-point split-radix FFT: merges four
@ sub-transforms spaced r12 = 8*n bytes apart, walking the cosine table
@ forward (r1) and backward (r3) simultaneously.  The first iteration is
@ peeled (falls through to label 2) because it uses the constant twiddle.
function fft_pass_neon
push {r4,lr}
movrel lr, coefs+24
vld1.16 {d30}, [lr,:64]
lsl r12, r2, #3 @ r12 = distance between sub-transforms
vmov d31, d30
add r3, r1, r2, lsl #2 @ r3 = end of twiddle table (walks down)
mov lr, #-8
sub r3, r3, #2
mov r4, r0
vld1.16 {d27[]}, [r3,:16]
sub r3, r3, #6
vld1.16 {q0}, [r4,:128], r12 @ load one row of each sub-transform
vld1.16 {q1}, [r4,:128], r12
vld1.16 {q2}, [r4,:128], r12
vld1.16 {q3}, [r4,:128], r12
vld1.16 {d28}, [r1,:64]! @ forward twiddles
vld1.16 {d29}, [r3,:64], lr @ backward twiddles
vswp d1, d2
vswp d5, d6
vtrn.32 d0, d1
vtrn.32 d4, d5
vdup.16 d25, d28[1]
vmul.s16 d27, d27, d31
transform01 q0, q2, d5, d25, d27, d20, q8, q9
b 2f @ skip the generic first-column twiddle
1:
mov r4, r0
vdup.16 d26, d29[0]
vld1.16 {q0}, [r4,:128], r12
vld1.16 {q1}, [r4,:128], r12
vld1.16 {q2}, [r4,:128], r12
vld1.16 {q3}, [r4,:128], r12
vld1.16 {d28}, [r1,:64]!
vld1.16 {d29}, [r3,:64], lr
vswp d1, d2
vswp d5, d6
vtrn.32 d0, d1
vtrn.32 d4, d5
vdup.16 d24, d28[0]
vdup.16 d25, d28[1]
vdup.16 d27, d29[3]
vmul.s16 q13, q13, q15 @ apply sign pattern to backward twiddles
transform2 d0, d4, d1, d5, q0, q2, d24, d26, d25, d27, \
d16, d17, q9, q10
2:
vtrn.32 d2, d3
vtrn.32 d6, d7
vdup.16 d24, d28[2]
vdup.16 d26, d29[2]
vdup.16 d25, d28[3]
vdup.16 d27, d29[1]
vmul.s16 q13, q13, q15
transform2 d2, d6, d3, d7, q1, q3, d24, d26, d25, d27, \
d16, d17, q9, q10
vtrn.32 d0, d1 @ restore natural ordering before store
vtrn.32 d2, d3
vtrn.32 d4, d5
vtrn.32 d6, d7
vswp d1, d2
vswp d5, d6
mov r4, r0
vst1.16 {q0}, [r4,:128], r12
vst1.16 {q1}, [r4,:128], r12
vst1.16 {q2}, [r4,:128], r12
vst1.16 {q3}, [r4,:128], r12
add r0, r0, #16 @ advance two complex values
subs r2, r2, #2
bgt 1b
pop {r4,pc}
endfunc
/* Q15 fixed-point twiddle constants: round(x * 32768). */
#define F_SQRT1_2 23170
#define F_COS_16_1 30274
#define F_COS_16_3 12540
const coefs, align=4
.short F_SQRT1_2, -F_SQRT1_2, -F_SQRT1_2, F_SQRT1_2
.short F_COS_16_1,-F_COS_16_1,-F_COS_16_1, F_COS_16_1
.short F_COS_16_3,-F_COS_16_3,-F_COS_16_3, F_COS_16_3
.short 1, -1, -1, 1
endconst
@ Generate fft<n>_neon recursively: one fft<n/2> plus two fft<n/4>
@ sub-transforms, combined by a tail-called fft_pass_neon using the
@ precomputed ff_cos_<n>_fixed table.  FFTComplex here is 4 bytes
@ (two int16_t), hence the *4 offsets.
.macro def_fft n, n2, n4
function fft\n\()_neon
push {r4, lr}
mov r4, r0
bl fft\n2\()_neon
add r0, r4, #\n4*2*4
bl fft\n4\()_neon
add r0, r4, #\n4*3*4
bl fft\n4\()_neon
mov r0, r4
pop {r4, lr}
movrelx r1, X(ff_cos_\n\()_fixed)
mov r2, #\n4/2
b fft_pass_neon @ tail call: combination pass returns to caller
endfunc
.endm
def_fft 32, 16, 8
def_fft 64, 32, 16
def_fft 128, 64, 32
def_fft 256, 128, 64
def_fft 512, 256, 128
def_fft 1024, 512, 256
def_fft 2048, 1024, 512
def_fft 4096, 2048, 1024
def_fft 8192, 4096, 2048
def_fft 16384, 8192, 4096
def_fft 32768, 16384, 8192
def_fft 65536, 32768, 16384
@ void ff_fft_fixed_calc_neon(FFTContext *s, FFTComplex *z)
@ Dispatch to the fft<2^nbits>_neon kernel: loads s->nbits (first struct
@ member) and indexes the jump table below with nbits - 2.
function ff_fft_fixed_calc_neon, export=1
ldr r2, [r0] @ r2 = s->nbits
sub r2, r2, #2 @ table starts at nbits == 2 (fft4)
movrel r3, fft_fixed_tab_neon
ldr r3, [r3, r2, lsl #2]
mov r0, r1 @ kernels take z in r0
bx r3
endfunc
@ Jump table indexed by nbits - 2.
const fft_fixed_tab_neon
.word fft4_neon
.word fft8_neon
.word fft16_neon
.word fft32_neon
.word fft64_neon
.word fft128_neon
.word fft256_neon
.word fft512_neon
.word fft1024_neon
.word fft2048_neon
.word fft4096_neon
.word fft8192_neon
.word fft16384_neon
.word fft32768_neon
.word fft65536_neon
endconst

View File

@@ -0,0 +1,70 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/cpu.h"
#include "libavcodec/fft.h"
#include "libavcodec/rdft.h"
#include "libavcodec/synth_filter.h"
void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
void ff_imdct_half_vfp(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_rdft_calc_neon(struct RDFTContext *s, FFTSample *z);
/**
 * Select ARM-optimised floating-point FFT/MDCT routines according to the
 * available CPU features (plain VFP fallback, then NEON).
 */
av_cold void ff_fft_init_arm(FFTContext *s)
{
    int flags = av_get_cpu_flags();

    if (have_vfp(flags)) {
#if CONFIG_MDCT
        /* Scalar VFP imdct_half only when VFPv3 is absent. */
        if (!have_vfpv3(flags))
            s->imdct_half = ff_imdct_half_vfp;
#endif
    }

    if (!have_neon(flags))
        return;

#if CONFIG_FFT
    s->fft_permute = ff_fft_permute_neon;
    s->fft_calc    = ff_fft_calc_neon;
#endif
#if CONFIG_MDCT
    s->imdct_calc       = ff_imdct_calc_neon;
    s->imdct_half       = ff_imdct_half_neon;
    s->mdct_calc        = ff_mdct_calc_neon;
    s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
#endif
}
#if CONFIG_RDFT
/** Install the NEON RDFT implementation when the CPU supports NEON. */
av_cold void ff_rdft_init_arm(RDFTContext *s)
{
    int flags = av_get_cpu_flags();

    if (have_neon(flags))
        s->rdft_calc = ff_rdft_calc_neon;
}
#endif

View File

@@ -0,0 +1,375 @@
/*
* ARM NEON optimised FFT
*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
* Copyright (c) 2009 Naotoshi Nojiri
*
* This algorithm (though not any of the implementation details) is
* based on libdjbfft by D. J. Bernstein.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
#define M_SQRT1_2 0.70710678118654752440
@ 4-point single-precision FFT, in place on the FFTComplex array at r0.
function fft4_neon
vld1.32 {d0-d3}, [r0,:128]
vext.32 q8, q1, q1, #1 @ i2,r3 d3=i3,r2
vsub.f32 d6, d0, d1 @ r0-r1,i0-i1
vsub.f32 d7, d16, d17 @ r3-r2,i2-i3
vadd.f32 d4, d0, d1 @ r0+r1,i0+i1
vadd.f32 d5, d2, d3 @ i2+i3,r2+r3
vadd.f32 d1, d6, d7
vsub.f32 d3, d6, d7
vadd.f32 d0, d4, d5
vsub.f32 d2, d4, d5
vst1.32 {d0-d3}, [r0,:128]
bx lr
endfunc
@ 8-point single-precision FFT, in place on the FFTComplex array at r0.
@ The sqrt(1/2) twiddle is materialised via movw/movt (0x3f3504f3 is the
@ IEEE-754 encoding of sqrt(1/2)), avoiding a literal-pool load.
function fft8_neon
mov r1, r0
vld1.32 {d0-d3}, [r1,:128]!
vld1.32 {d16-d19}, [r1,:128]
movw r2, #0x04f3 @ sqrt(1/2)
movt r2, #0x3f35
eor r3, r2, #1<<31 @ -sqrt(1/2)
vdup.32 d31, r2
vext.32 q11, q1, q1, #1 @ i2,r3,i3,r2
vadd.f32 d4, d16, d17 @ r4+r5,i4+i5
vmov d28, r3, r2
vadd.f32 d5, d18, d19 @ r6+r7,i6+i7
vsub.f32 d17, d16, d17 @ r4-r5,i4-i5
vsub.f32 d19, d18, d19 @ r6-r7,i6-i7
vrev64.32 d29, d28
vadd.f32 d20, d0, d1 @ r0+r1,i0+i1
vadd.f32 d21, d2, d3 @ r2+r3,i2+i3
vmul.f32 d26, d17, d28 @ -a2r*w,a2i*w
vext.32 q3, q2, q2, #1
vmul.f32 d27, d19, d29 @ a3r*w,-a3i*w
vsub.f32 d23, d22, d23 @ i2-i3,r3-r2
vsub.f32 d22, d0, d1 @ r0-r1,i0-i1
vmul.f32 d24, d17, d31 @ a2r*w,a2i*w
vmul.f32 d25, d19, d31 @ a3r*w,a3i*w
vadd.f32 d0, d20, d21
vsub.f32 d2, d20, d21
vadd.f32 d1, d22, d23
vrev64.32 q13, q13
vsub.f32 d3, d22, d23
vsub.f32 d6, d6, d7
vadd.f32 d24, d24, d26 @ a2r+a2i,a2i-a2r t1,t2
vadd.f32 d25, d25, d27 @ a3r-a3i,a3i+a3r t5,t6
vadd.f32 d7, d4, d5
vsub.f32 d18, d2, d6
vext.32 q13, q12, q12, #1
vadd.f32 d2, d2, d6
vsub.f32 d16, d0, d7
vadd.f32 d5, d25, d24
vsub.f32 d4, d26, d27
vadd.f32 d0, d0, d7
vsub.f32 d17, d1, d5
vsub.f32 d19, d3, d4
vadd.f32 d3, d3, d4
vadd.f32 d1, d1, d5
vst1.32 {d16-d19}, [r1,:128]
vst1.32 {d0-d3}, [r0,:128]
bx lr
endfunc
@ 16-point single-precision FFT, in place on the FFTComplex array at r0.
@ Structure: inline 8-point FFT on the lower half, two 4-point FFTs on
@ the upper half, then twiddled combination using ff_cos_16 and the sign
@ masks mppm/pmmp.  The register schedule is heavily software-pipelined;
@ existing per-line comments name the intermediate split-radix terms.
function fft16_neon
movrel r1, mppm
vld1.32 {d16-d19}, [r0,:128]! @ q8{r0,i0,r1,i1} q9{r2,i2,r3,i3}
pld [r0, #32]
vld1.32 {d2-d3}, [r1,:128]
vext.32 q13, q9, q9, #1
vld1.32 {d22-d25}, [r0,:128]! @ q11{r4,i4,r5,i5} q12{r6,i5,r7,i7}
vadd.f32 d4, d16, d17
vsub.f32 d5, d16, d17
vadd.f32 d18, d18, d19
vsub.f32 d19, d26, d27
vadd.f32 d20, d22, d23
vsub.f32 d22, d22, d23
vsub.f32 d23, d24, d25
vadd.f32 q8, q2, q9 @ {r0,i0,r1,i1}
vadd.f32 d21, d24, d25
vmul.f32 d24, d22, d2
vsub.f32 q9, q2, q9 @ {r2,i2,r3,i3}
vmul.f32 d25, d23, d3
vuzp.32 d16, d17 @ {r0,r1,i0,i1}
vmul.f32 q1, q11, d2[1]
vuzp.32 d18, d19 @ {r2,r3,i2,i3}
vrev64.32 q12, q12
vadd.f32 q11, q12, q1 @ {t1a,t2a,t5,t6}
vld1.32 {d24-d27}, [r0,:128]! @ q12{r8,i8,r9,i9} q13{r10,i10,r11,i11}
vzip.32 q10, q11
vld1.32 {d28-d31}, [r0,:128] @ q14{r12,i12,r13,i13} q15{r14,i14,r15,i15}
vadd.f32 d0, d22, d20
vadd.f32 d1, d21, d23
vsub.f32 d2, d21, d23
vsub.f32 d3, d22, d20
sub r0, r0, #96 @ rewind to the start of the array
vext.32 q13, q13, q13, #1
vsub.f32 q10, q8, q0 @ {r4,r5,i4,i5}
vadd.f32 q8, q8, q0 @ {r0,r1,i0,i1}
vext.32 q15, q15, q15, #1
vsub.f32 q11, q9, q1 @ {r6,r7,i6,i7}
vswp d25, d26 @ q12{r8,i8,i10,r11} q13{r9,i9,i11,r10}
vadd.f32 q9, q9, q1 @ {r2,r3,i2,i3}
vswp d29, d30 @ q14{r12,i12,i14,r15} q15{r13,i13,i15,r14}
vadd.f32 q0, q12, q13 @ {t1,t2,t5,t6}
vadd.f32 q1, q14, q15 @ {t1a,t2a,t5a,t6a}
movrelx r2, X(ff_cos_16)
vsub.f32 q13, q12, q13 @ {t3,t4,t7,t8}
vrev64.32 d1, d1
vsub.f32 q15, q14, q15 @ {t3a,t4a,t7a,t8a}
vrev64.32 d3, d3
movrel r3, pmmp
vswp d1, d26 @ q0{t1,t2,t3,t4} q13{t6,t5,t7,t8}
vswp d3, d30 @ q1{t1a,t2a,t3a,t4a} q15{t6a,t5a,t7a,t8a}
vadd.f32 q12, q0, q13 @ {r8,i8,r9,i9}
vadd.f32 q14, q1, q15 @ {r12,i12,r13,i13}
vld1.32 {d4-d5}, [r2,:64]
vsub.f32 q13, q0, q13 @ {r10,i10,r11,i11}
vsub.f32 q15, q1, q15 @ {r14,i14,r15,i15}
vswp d25, d28 @ q12{r8,i8,r12,i12} q14{r9,i9,r13,i13}
vld1.32 {d6-d7}, [r3,:128]
vrev64.32 q1, q14
vmul.f32 q14, q14, d4[1]
vmul.f32 q1, q1, q3
vmla.f32 q14, q1, d5[1] @ {t1a,t2a,t5a,t6a}
vswp d27, d30 @ q13{r10,i10,r14,i14} q15{r11,i11,r15,i15}
vzip.32 q12, q14
vadd.f32 d0, d28, d24
vadd.f32 d1, d25, d29
vsub.f32 d2, d25, d29
vsub.f32 d3, d28, d24
vsub.f32 q12, q8, q0 @ {r8,r9,i8,i9}
vadd.f32 q8, q8, q0 @ {r0,r1,i0,i1}
vsub.f32 q14, q10, q1 @ {r12,r13,i12,i13}
mov r1, #32 @ output row stride in bytes
vadd.f32 q10, q10, q1 @ {r4,r5,i4,i5}
vrev64.32 q0, q13
vmul.f32 q13, q13, d5[0]
vrev64.32 q1, q15
vmul.f32 q15, q15, d5[1]
vst2.32 {d16-d17},[r0,:128], r1
vmul.f32 q0, q0, q3
vst2.32 {d20-d21},[r0,:128], r1
vmul.f32 q1, q1, q3
vmla.f32 q13, q0, d5[0] @ {t1,t2,t5,t6}
vmla.f32 q15, q1, d4[1] @ {t1a,t2a,t5a,t6a}
vst2.32 {d24-d25},[r0,:128], r1
vst2.32 {d28-d29},[r0,:128]
vzip.32 q13, q15
sub r0, r0, #80
vadd.f32 d0, d30, d26
vadd.f32 d1, d27, d31
vsub.f32 d2, d27, d31
vsub.f32 d3, d30, d26
vsub.f32 q13, q9, q0 @ {r10,r11,i10,i11}
vadd.f32 q9, q9, q0 @ {r2,r3,i2,i3}
vsub.f32 q15, q11, q1 @ {r14,r15,i14,i15}
vadd.f32 q11, q11, q1 @ {r6,r7,i6,i7}
vst2.32 {d18-d19},[r0,:128], r1
vst2.32 {d22-d23},[r0,:128], r1
vst2.32 {d26-d27},[r0,:128], r1
vst2.32 {d30-d31},[r0,:128]
bx lr
endfunc
@ fft_pass_neon(z = r0, wre = r1, n = r2)
@ One combination pass of the float split-radix FFT: merges the four
@ sub-transforms at z, z+o1, z+o2, z+o3 using the forward cosine table
@ (r1, wre) and the same table walked backwards (r5, wim).  The first
@ iteration is peeled because its twiddle is trivial (wre[0] == 1).
function fft_pass_neon
push {r4-r6,lr}
mov r6, r2 @ n
lsl r5, r2, #3 @ 2 * n * sizeof FFTSample
lsl r4, r2, #4 @ 2 * n * sizeof FFTComplex
lsl r2, r2, #5 @ 4 * n * sizeof FFTComplex
add r3, r2, r4
add r4, r4, r0 @ &z[o1]
add r2, r2, r0 @ &z[o2]
add r3, r3, r0 @ &z[o3]
vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]}
movrel r12, pmmp
vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]}
add r5, r5, r1 @ wim
vld1.32 {d6-d7}, [r12,:128] @ pmmp
vswp d21, d22
vld1.32 {d4}, [r1,:64]! @ {wre[0],wre[1]}
sub r5, r5, #4 @ wim--
vrev64.32 q1, q11
vmul.f32 q11, q11, d4[1]
vmul.f32 q1, q1, q3
vld1.32 {d5[0]}, [r5,:32] @ d5[0] = wim[-1]
vmla.f32 q11, q1, d5[0] @ {t1a,t2a,t5a,t6a}
vld2.32 {d16-d17},[r0,:128] @ {z[0],z[1]}
sub r6, r6, #1 @ n--
vld2.32 {d18-d19},[r4,:128] @ {z[o1],z[o1+1]}
vzip.32 q10, q11
vadd.f32 d0, d22, d20
vadd.f32 d1, d21, d23
vsub.f32 d2, d21, d23
vsub.f32 d3, d22, d20
vsub.f32 q10, q8, q0
vadd.f32 q8, q8, q0
vsub.f32 q11, q9, q1
vadd.f32 q9, q9, q1
vst2.32 {d20-d21},[r2,:128]! @ {z[o2],z[o2+1]}
vst2.32 {d16-d17},[r0,:128]! @ {z[0],z[1]}
vst2.32 {d22-d23},[r3,:128]! @ {z[o3],z[o3+1]}
vst2.32 {d18-d19},[r4,:128]! @ {z[o1],z[o1+1]}
sub r5, r5, #8 @ wim -= 2
1:
vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]}
vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]}
vswp d21, d22
vld1.32 {d4}, [r1]! @ {wre[0],wre[1]}
vrev64.32 q0, q10
vmul.f32 q10, q10, d4[0]
vrev64.32 q1, q11
vmul.f32 q11, q11, d4[1]
vld1.32 {d5}, [r5] @ {wim[-1],wim[0]}
vmul.f32 q0, q0, q3
sub r5, r5, #8 @ wim -= 2
vmul.f32 q1, q1, q3
vmla.f32 q10, q0, d5[1] @ {t1,t2,t5,t6}
vmla.f32 q11, q1, d5[0] @ {t1a,t2a,t5a,t6a}
vld2.32 {d16-d17},[r0,:128] @ {z[0],z[1]}
subs r6, r6, #1 @ n--
vld2.32 {d18-d19},[r4,:128] @ {z[o1],z[o1+1]}
vzip.32 q10, q11
vadd.f32 d0, d22, d20
vadd.f32 d1, d21, d23
vsub.f32 d2, d21, d23
vsub.f32 d3, d22, d20
vsub.f32 q10, q8, q0
vadd.f32 q8, q8, q0
vsub.f32 q11, q9, q1
vadd.f32 q9, q9, q1
vst2.32 {d20-d21}, [r2,:128]! @ {z[o2],z[o2+1]}
vst2.32 {d16-d17}, [r0,:128]! @ {z[0],z[1]}
vst2.32 {d22-d23}, [r3,:128]! @ {z[o3],z[o3+1]}
vst2.32 {d18-d19}, [r4,:128]! @ {z[o1],z[o1+1]}
bne 1b
pop {r4-r6,pc}
endfunc
@ Generate fft<n>_neon recursively: one fft<n/2> plus two fft<n/4>
@ sub-transforms, combined by a tail-called fft_pass_neon using the
@ precomputed ff_cos_<n> table.  FFTComplex here is 8 bytes (two
@ floats), hence the *8 offsets.
.macro def_fft n, n2, n4
.align 6
function fft\n\()_neon
push {r4, lr}
mov r4, r0
bl fft\n2\()_neon
add r0, r4, #\n4*2*8
bl fft\n4\()_neon
add r0, r4, #\n4*3*8
bl fft\n4\()_neon
mov r0, r4
pop {r4, lr}
movrelx r1, X(ff_cos_\n)
mov r2, #\n4/2
b fft_pass_neon @ tail call: combination pass returns to caller
endfunc
.endm
def_fft 32, 16, 8
def_fft 64, 32, 16
def_fft 128, 64, 32
def_fft 256, 128, 64
def_fft 512, 256, 128
def_fft 1024, 512, 256
def_fft 2048, 1024, 512
def_fft 4096, 2048, 1024
def_fft 8192, 4096, 2048
def_fft 16384, 8192, 4096
def_fft 32768, 16384, 8192
def_fft 65536, 32768, 16384
@ void ff_fft_calc_neon(FFTContext *s, FFTComplex *z)
@ Dispatch to the fft<2^nbits>_neon kernel: loads s->nbits (first struct
@ member) and indexes the jump table below with nbits - 2.
function ff_fft_calc_neon, export=1
ldr r2, [r0] @ r2 = s->nbits
sub r2, r2, #2 @ table starts at nbits == 2 (fft4)
movrel r3, fft_tab_neon
ldr r3, [r3, r2, lsl #2]
mov r0, r1 @ kernels take z in r0
bx r3
endfunc
@ void ff_fft_permute_neon(FFTContext *s, FFTComplex *z)
@ Apply the bit-reversal permutation: scatter z into tmp_buf using the
@ revtab indices (two 16-bit indices packed per 32-bit word), then copy
@ tmp_buf back over z.  Struct offsets: +0 nbits, +8 revtab, +12 tmp_buf.
function ff_fft_permute_neon
push {r4,lr}
mov r12, #1
ldr r2, [r0] @ nbits
ldr r3, [r0, #12] @ tmp_buf
ldr r0, [r0, #8] @ revtab
lsl r12, r12, r2 @ r12 = 1 << nbits = transform length
mov r2, r12
1:
vld1.32 {d0-d1}, [r1,:128]! @ load two complex values
ldr r4, [r0], #4 @ two packed revtab entries
uxth lr, r4 @ low half: index for first value
uxth r4, r4, ror #16 @ high half: index for second value
add lr, r3, lr, lsl #3
add r4, r3, r4, lsl #3
vst1.32 {d0}, [lr,:64] @ scatter into tmp_buf
vst1.32 {d1}, [r4,:64]
subs r12, r12, #2
bgt 1b
sub r1, r1, r2, lsl #3 @ rewind z
1:
vld1.32 {d0-d3}, [r3,:128]! @ copy tmp_buf back to z
vst1.32 {d0-d3}, [r1,:128]!
subs r2, r2, #4
bgt 1b
pop {r4,pc}
endfunc
@ Jump table indexed by nbits - 2 (see ff_fft_calc_neon).
const fft_tab_neon
.word fft4_neon
.word fft8_neon
.word fft16_neon
.word fft32_neon
.word fft64_neon
.word fft128_neon
.word fft256_neon
.word fft512_neon
.word fft1024_neon
.word fft2048_neon
.word fft4096_neon
.word fft8192_neon
.word fft16384_neon
.word fft32768_neon
.word fft65536_neon
endconst
@ Sign-pattern vectors used by the twiddle multiplies above.
const pmmp, align=4
.float +1.0, -1.0, -1.0, +1.0
endconst
const mppm, align=4
.float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
endconst

View File

@@ -0,0 +1,298 @@
/*
* Copyright (c) 2013 RISC OS Open Ltd
* Author: Ben Avison <bavison@riscosopen.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
@ TODO: * FFTs wider than 16
@ * dispatch code
@ 4-point single-precision FFT using scalar VFP, in place on the
@ FFTComplex array at a1.  "@ stall" comments mark scheduling bubbles
@ inherent to the VFP pipeline on the targeted cores.
function fft4_vfp
vldr d0, [a1, #0*2*4] @ s0,s1 = z[0]
vldr d4, [a1, #1*2*4] @ s8,s9 = z[1]
vldr d1, [a1, #2*2*4] @ s2,s3 = z[2]
vldr d5, [a1, #3*2*4] @ s10,s11 = z[3]
@ stall
vadd.f s12, s0, s8 @ i0
vadd.f s13, s1, s9 @ i1
vadd.f s14, s2, s10 @ i2
vadd.f s15, s3, s11 @ i3
vsub.f s8, s0, s8 @ i4
vsub.f s9, s1, s9 @ i5
vsub.f s10, s2, s10 @ i6
vsub.f s11, s3, s11 @ i7
@ stall
@ stall
vadd.f s0, s12, s14 @ z[0].re
vsub.f s4, s12, s14 @ z[2].re
vadd.f s1, s13, s15 @ z[0].im
vsub.f s5, s13, s15 @ z[2].im
vadd.f s7, s9, s10 @ z[3].im
vsub.f s3, s9, s10 @ z[1].im
vadd.f s2, s8, s11 @ z[1].re
vsub.f s6, s8, s11 @ z[3].re
@ stall
@ stall
vstr d0, [a1, #0*2*4]
vstr d2, [a1, #2*2*4]
@ stall
@ stall
vstr d1, [a1, #1*2*4]
vstr d3, [a1, #3*2*4]
bx lr
endfunc
@ First part of an 8-point VFP FFT on the array at a1, written as a macro
@ so ff_fft16_vfp can interleave it with further work.  Relies on VFP
@ vector mode (length 4, set up by the callers via FPSCR) so single
@ vadd.f/vsub.f instructions operate on four registers at once; the
@ stores/reloads move values between register banks via memory.
@ Leaves the final two results in s16-s19 for macro_fft8_tail.
.macro macro_fft8_head
@ FFT4
vldr d4, [a1, #0 * 2*4]
vldr d6, [a1, #1 * 2*4]
vldr d5, [a1, #2 * 2*4]
vldr d7, [a1, #3 * 2*4]
@ BF
vldr d12, [a1, #4 * 2*4]
vadd.f s16, s8, s12 @ vector op
vldr d14, [a1, #5 * 2*4]
vldr d13, [a1, #6 * 2*4]
vldr d15, [a1, #7 * 2*4]
vsub.f s20, s8, s12 @ vector op
vadd.f s0, s16, s18
vsub.f s2, s16, s18
vadd.f s1, s17, s19
vsub.f s3, s17, s19
vadd.f s7, s21, s22
vsub.f s5, s21, s22
vadd.f s4, s20, s23
vsub.f s6, s20, s23
vsub.f s20, s24, s28 @ vector op
vstr d0, [a1, #0 * 2*4] @ transfer s0-s7 to s24-s31 via memory
vstr d1, [a1, #1 * 2*4]
vldr s0, cos1pi4
vadd.f s16, s24, s28 @ vector op
vstr d2, [a1, #2 * 2*4]
vstr d3, [a1, #3 * 2*4]
vldr d12, [a1, #0 * 2*4]
@ TRANSFORM
vmul.f s20, s20, s0 @ vector x scalar op
vldr d13, [a1, #1 * 2*4]
vldr d14, [a1, #2 * 2*4]
vldr d15, [a1, #3 * 2*4]
@ BUTTERFLIES
vadd.f s0, s18, s16
vadd.f s1, s17, s19
vsub.f s2, s17, s19
vsub.f s3, s18, s16
vadd.f s4, s21, s20
vsub.f s5, s21, s20
vadd.f s6, s22, s23
vsub.f s7, s22, s23
vadd.f s8, s0, s24 @ vector op
vstr d0, [a1, #0 * 2*4] @ transfer s0-s3 to s12-s15 via memory
vstr d1, [a1, #1 * 2*4]
vldr d6, [a1, #0 * 2*4]
vldr d7, [a1, #1 * 2*4]
vadd.f s1, s5, s6
vadd.f s0, s7, s4
vsub.f s2, s5, s6
vsub.f s3, s7, s4
vsub.f s12, s24, s12 @ vector op
vsub.f s5, s29, s1
vsub.f s4, s28, s0
vsub.f s6, s30, s2
vsub.f s7, s31, s3
vadd.f s16, s0, s28 @ vector op
vstr d6, [a1, #4 * 2*4]
vstr d7, [a1, #6 * 2*4]
vstr d4, [a1, #0 * 2*4]
vstr d5, [a1, #2 * 2*4]
vstr d2, [a1, #5 * 2*4]
vstr d3, [a1, #7 * 2*4]
.endm
@ Final stores of the 8-point FFT: writes the results left in s16-s19
@ by macro_fft8_head to elements 1 and 3.
.macro macro_fft8_tail
vstr d8, [a1, #1 * 2*4]
vstr d9, [a1, #3 * 2*4]
.endm
@ 8-point VFP FFT, in place on the FFTComplex array at a1.
@ Enables VFP short-vector mode (RunFast, vector length 4, stride 1)
@ for the duration of the macros and restores the caller's FPSCR after.
function fft8_vfp
ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
fmrx a2, FPSCR @ save caller's FPSCR
fmxr FPSCR, a3
vpush {s16-s31} @ callee-saved VFP registers
macro_fft8_head
macro_fft8_tail
vpop {s16-s31}
fmxr FPSCR, a2 @ restore caller's FPSCR
bx lr
endfunc
.align 3
cos1pi4: @ cos(1*pi/4) = sqrt(2)/2
.float 0.707106769084930419921875
cos1pi8: @ cos(1*pi/8) = sqrt(2+sqrt(2))/2
.float 0.92387950420379638671875
cos3pi8: @ cos(3*pi/8) = sqrt(2-sqrt(2))/2
.float 0.3826834261417388916015625
@ void ff_fft16_vfp(FFTComplex *z)  (z in a1)
@ 16-point FFT using VFP short-vector mode: 8-point FFT on the even
@ split (macro_fft8_head/tail), two 4-point FFTs on the odd splits,
@ then the split-radix TRANSFORM/TRANSFORM_ZERO combination passes
@ (marked by the existing @ TRANSFORM comments).  Values migrate
@ between VFP register banks via stores/reloads of z itself, so the
@ array doubles as scratch space throughout.
function ff_fft16_vfp
ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
fmrx a2, FPSCR @ save caller's FPSCR
fmxr FPSCR, a3
vpush {s16-s31} @ callee-saved VFP registers
macro_fft8_head
@ FFT4(z+8)
vldr d10, [a1, #8 * 2*4]
vldr d12, [a1, #9 * 2*4]
vldr d11, [a1, #10 * 2*4]
vldr d13, [a1, #11 * 2*4]
macro_fft8_tail
vadd.f s16, s20, s24 @ vector op
@ FFT4(z+12)
vldr d4, [a1, #12 * 2*4]
vldr d6, [a1, #13 * 2*4]
vldr d5, [a1, #14 * 2*4]
vsub.f s20, s20, s24 @ vector op
vldr d7, [a1, #15 * 2*4]
vadd.f s0, s16, s18
vsub.f s4, s16, s18
vadd.f s1, s17, s19
vsub.f s5, s17, s19
vadd.f s7, s21, s22
vsub.f s3, s21, s22
vadd.f s2, s20, s23
vsub.f s6, s20, s23
vadd.f s16, s8, s12 @ vector op
vstr d0, [a1, #8 * 2*4]
vstr d2, [a1, #10 * 2*4]
vstr d1, [a1, #9 * 2*4]
vsub.f s20, s8, s12
vstr d3, [a1, #11 * 2*4]
@ TRANSFORM(z[2],z[6],z[10],z[14],cos1pi4,cos1pi4)
vldr d12, [a1, #10 * 2*4]
vadd.f s0, s16, s18
vadd.f s1, s17, s19
vsub.f s6, s16, s18
vsub.f s7, s17, s19
vsub.f s3, s21, s22
vadd.f s2, s20, s23
vadd.f s5, s21, s22
vsub.f s4, s20, s23
vstr d0, [a1, #12 * 2*4]
vmov s0, s6
@ TRANSFORM(z[1],z[5],z[9],z[13],cos1pi8,cos3pi8)
vldr d6, [a1, #9 * 2*4]
vstr d1, [a1, #13 * 2*4]
vldr d1, cos1pi4 @ s2 = cos1pi4, s3 = cos1pi8
vstr d2, [a1, #15 * 2*4]
vldr d7, [a1, #13 * 2*4]
vadd.f s4, s25, s24
vsub.f s5, s25, s24
vsub.f s6, s0, s7
vadd.f s7, s0, s7
vmul.f s20, s12, s3 @ vector op
@ TRANSFORM(z[3],z[7],z[11],z[15],cos3pi8,cos1pi8)
vldr d4, [a1, #11 * 2*4]
vldr d5, [a1, #15 * 2*4]
vldr s1, cos3pi8
vmul.f s24, s4, s2 @ vector * scalar op
vmul.f s28, s12, s1 @ vector * scalar op
vmul.f s12, s8, s1 @ vector * scalar op
vadd.f s4, s20, s29
vsub.f s5, s21, s28
vsub.f s6, s22, s31
vadd.f s7, s23, s30
vmul.f s8, s8, s3 @ vector * scalar op
vldr d8, [a1, #1 * 2*4]
vldr d9, [a1, #5 * 2*4]
vldr d10, [a1, #3 * 2*4]
vldr d11, [a1, #7 * 2*4]
vldr d14, [a1, #2 * 2*4]
vadd.f s0, s6, s4
vadd.f s1, s5, s7
vsub.f s2, s5, s7
vsub.f s3, s6, s4
vadd.f s4, s12, s9
vsub.f s5, s13, s8
vsub.f s6, s14, s11
vadd.f s7, s15, s10
vadd.f s12, s0, s16 @ vector op
vstr d0, [a1, #1 * 2*4]
vstr d1, [a1, #5 * 2*4]
vldr d4, [a1, #1 * 2*4]
vldr d5, [a1, #5 * 2*4]
vadd.f s0, s6, s4
vadd.f s1, s5, s7
vsub.f s2, s5, s7
vsub.f s3, s6, s4
vsub.f s8, s16, s8 @ vector op
vstr d6, [a1, #1 * 2*4]
vstr d7, [a1, #5 * 2*4]
vldr d15, [a1, #6 * 2*4]
vsub.f s4, s20, s0
vsub.f s5, s21, s1
vsub.f s6, s22, s2
vsub.f s7, s23, s3
vadd.f s20, s0, s20 @ vector op
vstr d4, [a1, #9 * 2*4]
@ TRANSFORM_ZERO(z[0],z[4],z[8],z[12])
vldr d6, [a1, #8 * 2*4]
vstr d5, [a1, #13 * 2*4]
vldr d7, [a1, #12 * 2*4]
vstr d2, [a1, #11 * 2*4]
vldr d8, [a1, #0 * 2*4]
vstr d3, [a1, #15 * 2*4]
vldr d9, [a1, #4 * 2*4]
vadd.f s0, s26, s24
vadd.f s1, s25, s27
vsub.f s2, s25, s27
vsub.f s3, s26, s24
vadd.f s4, s14, s12
vadd.f s5, s13, s15
vsub.f s6, s13, s15
vsub.f s7, s14, s12
vadd.f s8, s0, s28 @ vector op
vstr d0, [a1, #3 * 2*4]
vstr d1, [a1, #7 * 2*4]
vldr d6, [a1, #3 * 2*4]
vldr d7, [a1, #7 * 2*4]
vsub.f s0, s16, s4
vsub.f s1, s17, s5
vsub.f s2, s18, s6
vsub.f s3, s19, s7
vsub.f s12, s28, s12 @ vector op
vadd.f s16, s4, s16 @ vector op
vstr d10, [a1, #3 * 2*4]
vstr d11, [a1, #7 * 2*4]
vstr d4, [a1, #2 * 2*4]
vstr d5, [a1, #6 * 2*4]
vstr d0, [a1, #8 * 2*4]
vstr d1, [a1, #12 * 2*4]
vstr d6, [a1, #10 * 2*4]
vstr d7, [a1, #14 * 2*4]
vstr d8, [a1, #0 * 2*4]
vstr d9, [a1, #4 * 2*4]
vpop {s16-s31}
fmxr FPSCR, a2 @ restore caller's FPSCR
bx lr
endfunc

View File

@@ -0,0 +1,146 @@
/*
* Copyright (c) 2012 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
@ Order-1 16-bit FLAC LPC:
@   samples[i] += (coeffs[0] * samples[i-1]) >> qlevel
@ r0 = samples, r1 = coeffs, r3 = qlevel, [sp] = len (5th arg).
@ Processes two samples per loop iteration; add_sh is the add-with-shift
@ helper macro from asm.S.
function flac_lpc_16_1_arm
ldr r12, [sp]                   @ r12 = len (read before push, so [sp] is the 5th arg)
push {r4, lr}
ldr r1, [r1]                    @ r1 = coeffs[0] (only coefficient for order 1)
subs r12, r12, #2
ldr lr, [r0], #4                @ lr = previous sample, r0 -> first sample to update
beq 2f                          @ exactly one pair boundary left: unrolled tail
it lt
poplt {r4, pc}                  @ nothing to do
1:
mul r4, lr, r1                  @ prediction for sample i
ldm r0, {r2, lr}                @ load samples i and i+1
add_sh r2, r2, r4, asr r3       @ samples[i] += pred >> qlevel
mul r4, r2, r1                  @ prediction for i+1 uses the just-updated sample
subs r12, r12, #2
add_sh lr, lr, r4, asr r3       @ samples[i+1] += pred >> qlevel
stm r0!, {r2, lr}
bgt 1b
it lt
poplt {r4, pc}                  @ odd remainder already consumed
2:
mul r4, lr, r1                  @ final single sample
ldr r2, [r0]
add_sh r2, r2, r4, asr r3
str r2, [r0]
pop {r4, pc}
endfunc
@ Order-2 16-bit FLAC LPC: each output is the residual plus the weighted
@ sum of the two previous (already reconstructed) samples, >> qlevel.
@ r0 = samples, r1 = coeffs, r2 = order (== 2), r3 = qlevel, [sp] = len.
function flac_lpc_16_2_arm
ldr r12, [sp]                   @ r12 = len
subs r12, r12, r2               @ number of samples to predict = len - order
it le
bxle lr                         @ nothing to predict
push {r4-r9, lr}
ldm r0!, {r6, r7}               @ r6,r7 = the two warm-up samples
ldm r1, {r8, r9}                @ r8,r9 = the two coefficients
subs r12, r12, #1
beq 2f                          @ only a single sample left: tail below
1:
mul r4, r6, r8                  @ accumulate prediction for sample i
mul r5, r7, r8                  @ start prediction for sample i+1
mla r4, r7, r9, r4
ldm r0, {r6, r7}                @ load residuals i and i+1
add_sh r6, r6, r4, asr r3       @ samples[i] += pred >> qlevel
mla r5, r6, r9, r5              @ i+1 prediction uses the updated samples[i]
add_sh r7, r7, r5, asr r3       @ samples[i+1] += pred >> qlevel
stm r0!, {r6, r7}
subs r12, r12, #2
bgt 1b
it lt
poplt {r4-r9, pc}
2:
mul r4, r6, r8                  @ final sample: plain two-tap dot product
mla r4, r7, r9, r4
ldr r5, [r0]
add_sh r5, r5, r4, asr r3
str r5, [r0]
pop {r4-r9, pc}
endfunc
@ void ff_flac_lpc_16_arm(int32_t *samples, const int coeffs[32], int order,
@                         int qlevel, int len)
@ Entry point: dispatches to the specialised order-1/order-2 routines above,
@ otherwise runs a generic loop computing two predictions per iteration over
@ an arbitrary-order FIR, rewinding the pointers after each output pair.
function ff_flac_lpc_16_arm, export=1
cmp r2, #2
blt flac_lpc_16_1_arm           @ order == 1
beq flac_lpc_16_2_arm           @ order == 2
ldr r12, [sp]                   @ r12 = len
subs r12, r12, r2               @ predictions to compute = len - order
it le
bxle lr
push {r4-r9, lr}
subs r12, r12, #1
beq 3f                          @ exactly one prediction: scalar dot product
1:
sub lr, r2, #2                  @ inner tap counter (first tap pre-loaded below)
mov r4, #0                      @ accumulator for sample i
mov r5, #0                      @ accumulator for sample i+1
ldr r7, [r0], #4
ldr r9, [r1], #4
2:
mla r4, r7, r9, r4              @ two taps per iteration, shared loads
ldm r0!, {r6, r7}
mla r5, r6, r9, r5
ldm r1!, {r8, r9}
mla r4, r6, r8, r4
subs lr, lr, #2
mla r5, r7, r8, r5
bgt 2b
blt 6f                          @ counter underflowed: skip the extra tap pair
mla r4, r7, r9, r4
ldr r7, [r0], #4
mla r5, r7, r9, r5
ldr r9, [r1], #4
6:
mla r4, r7, r9, r4              @ last tap for sample i
ldm r0, {r6, r7}
add_sh r6, r6, r4, asr r3       @ samples[i] += pred >> qlevel
mla r5, r6, r9, r5              @ i+1 prediction uses the updated samples[i]
add_sh r7, r7, r5, asr r3
stm r0!, {r6, r7}
sub r0, r0, r2, lsl #2          @ rewind sample pointer to the next window
sub r1, r1, r2, lsl #2          @ rewind coefficient pointer to coeffs[0]
subs r12, r12, #2
bgt 1b
it lt
poplt {r4-r9, pc}
3:
mov r4, #0
4:                              @ plain order-length dot product for the last sample
ldr r5, [r1], #4
ldr r6, [r0], #4
mla r4, r5, r6, r4
subs r2, r2, #1
bgt 4b
ldr r5, [r0]
add_sh r5, r5, r4, asr r3
str r5, [r0]
pop {r4-r9, pc}
endfunc

View File

@@ -0,0 +1,32 @@
/*
* Copyright (c) 2012 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavcodec/flacdsp.h"
#include "config.h"
void ff_flac_lpc_16_arm(int32_t *samples, const int coeffs[32], int order,
int qlevel, int len);
/**
 * Install the ARM-optimised FLAC LPC routine.
 *
 * Only a 16-bit implementation exists, so contexts decoding more than
 * 16 bits per sample keep the C fallback. The sample format argument is
 * unused here.
 */
av_cold void ff_flacdsp_init_arm(FLACDSPContext *c, enum AVSampleFormat fmt,
                                 int bps)
{
    if (bps > 16)
        return;

    c->lpc = ff_flac_lpc_16_arm;
}

View File

@@ -0,0 +1,65 @@
/*
* ARM optimized Format Conversion Utils
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/fmtconvert.h"
void ff_int32_to_float_fmul_scalar_neon(float *dst, const int32_t *src,
float mul, int len);
void ff_int32_to_float_fmul_scalar_vfp(float *dst, const int32_t *src,
float mul, int len);
void ff_int32_to_float_fmul_array8_vfp(FmtConvertContext *c, float *dst,
const int32_t *src, const float *mul,
int len);
void ff_float_to_int16_neon(int16_t *dst, const float *src, long len);
void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len);
/**
 * Select ARM-optimised format-conversion routines from the runtime CPU flags.
 *
 * NEON is tested after VFP so that, on cores with both, the NEON
 * int32_to_float_fmul_scalar pointer overrides the VFP one. The plain-VFP
 * int32->float kernels are installed only on pre-VFPv3 cores, and the NEON
 * float_to_int16 pair is skipped when bit-exact output is requested.
 */
av_cold void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx)
{
    int flags = av_get_cpu_flags();

    if (have_vfp(flags)) {
        if (!have_vfpv3(flags)) {
            c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_vfp;
            c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_vfp;
        }
        if (have_armv6(flags))
            c->float_to_int16 = ff_float_to_int16_vfp;
    }

    if (have_neon(flags)) {
        c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon;
        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->float_to_int16            = ff_float_to_int16_neon;
            c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
        }
    }
}

View File

@@ -0,0 +1,392 @@
/*
* ARM NEON optimised Format Conversion Utils
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/arm/asm.S"
@ void ff_float_to_int16_neon(int16_t *dst (r0), const float *src (r1), long len (r2))
@ vcvt with fixed-point #16 produces val*2^16 saturated to int32; the
@ narrowing >>16 then yields the integer value, so out-of-range floats clamp
@ to the s16 limits. 16 samples per main-loop iteration, software-pipelined.
@ NOTE(review): the :128 qualifiers require 16-byte-aligned src/dst and the
@ flow assumes len is a multiple of 8 -- confirm against callers.
function ff_float_to_int16_neon, export=1
subs r2, r2, #8
vld1.64 {d0-d1}, [r1,:128]!
vcvt.s32.f32 q8, q0, #16
vld1.64 {d2-d3}, [r1,:128]!
vcvt.s32.f32 q9, q1, #16
beq 3f                          @ exactly 8 samples
bics ip, r2, #15
beq 2f                          @ fewer than 16 remaining: single tail block
1: subs ip, ip, #16
vshrn.s32 d4, q8, #16
vld1.64 {d0-d1}, [r1,:128]!
vcvt.s32.f32 q0, q0, #16
vshrn.s32 d5, q9, #16
vld1.64 {d2-d3}, [r1,:128]!
vcvt.s32.f32 q1, q1, #16
vshrn.s32 d6, q0, #16
vst1.64 {d4-d5}, [r0,:128]!
vshrn.s32 d7, q1, #16
vld1.64 {d16-d17},[r1,:128]!
vcvt.s32.f32 q8, q8, #16
vld1.64 {d18-d19},[r1,:128]!
vcvt.s32.f32 q9, q9, #16
vst1.64 {d6-d7}, [r0,:128]!
bne 1b
ands r2, r2, #15
beq 3f
2: vld1.64 {d0-d1}, [r1,:128]!  @ drain pipeline plus one final 8-sample block
vshrn.s32 d4, q8, #16
vcvt.s32.f32 q0, q0, #16
vld1.64 {d2-d3}, [r1,:128]!
vshrn.s32 d5, q9, #16
vcvt.s32.f32 q1, q1, #16
vshrn.s32 d6, q0, #16
vst1.64 {d4-d5}, [r0,:128]!
vshrn.s32 d7, q1, #16
vst1.64 {d6-d7}, [r0,:128]!
bx lr
3: vshrn.s32 d4, q8, #16        @ drain pipeline only
vshrn.s32 d5, q9, #16
vst1.64 {d4-d5}, [r0,:128]!
bx lr
endfunc
@ void ff_float_to_int16_interleave_neon(int16_t *dst (r0), const float **src (r1),
@                                        long len (r2), int channels (r3))
@ Converts each channel with the same fixed-point vcvt #16 scheme as
@ ff_float_to_int16_neon and interleaves the results into dst.
@ Structure: mono tail-calls ff_float_to_int16_neon; a dedicated stereo path
@ uses vsri to merge pairs; for more channels, groups of 4, then 2, then 1
@ channel(s) are processed, stepping dst by ip = channels*2 bytes per sample.
@ The vsri.32 "shift-right-insert" packs one channel's value (>>16) into the
@ low half of each 32-bit lane while keeping the other channel's value in
@ the high half, producing interleaved s16 pairs directly.
@ NOTE(review): assumes len is a multiple of 8 and the per-channel buffers
@ are 16-byte aligned (:128 qualifiers) -- confirm against callers.
function ff_float_to_int16_interleave_neon, export=1
cmp r3, #2
itt lt
ldrlt r1, [r1]                  @ mono: unwrap the single channel pointer
blt ff_float_to_int16_neon      @ and reuse the non-interleaving routine
bne 4f
ldr r3, [r1]                    @ stereo fast path: r3/r1 = channel pointers
ldr r1, [r1, #4]
subs r2, r2, #8
vld1.64 {d0-d1}, [r3,:128]!
vcvt.s32.f32 q8, q0, #16
vld1.64 {d2-d3}, [r3,:128]!
vcvt.s32.f32 q9, q1, #16
vld1.64 {d20-d21},[r1,:128]!
vcvt.s32.f32 q10, q10, #16
vld1.64 {d22-d23},[r1,:128]!
vcvt.s32.f32 q11, q11, #16
beq 3f
bics ip, r2, #15
beq 2f
1: subs ip, ip, #16             @ stereo main loop: 16 frames/iteration
vld1.64 {d0-d1}, [r3,:128]!
vcvt.s32.f32 q0, q0, #16
vsri.32 q10, q8, #16            @ interleave ch0 into low halves of ch1 lanes
vld1.64 {d2-d3}, [r3,:128]!
vcvt.s32.f32 q1, q1, #16
vld1.64 {d24-d25},[r1,:128]!
vcvt.s32.f32 q12, q12, #16
vld1.64 {d26-d27},[r1,:128]!
vsri.32 q11, q9, #16
vst1.64 {d20-d21},[r0,:128]!
vcvt.s32.f32 q13, q13, #16
vst1.64 {d22-d23},[r0,:128]!
vsri.32 q12, q0, #16
vld1.64 {d16-d17},[r3,:128]!
vsri.32 q13, q1, #16
vst1.64 {d24-d25},[r0,:128]!
vcvt.s32.f32 q8, q8, #16
vld1.64 {d18-d19},[r3,:128]!
vcvt.s32.f32 q9, q9, #16
vld1.64 {d20-d21},[r1,:128]!
vcvt.s32.f32 q10, q10, #16
vld1.64 {d22-d23},[r1,:128]!
vcvt.s32.f32 q11, q11, #16
vst1.64 {d26-d27},[r0,:128]!
bne 1b
ands r2, r2, #15
beq 3f
2: vsri.32 q10, q8, #16         @ stereo tail: one extra 8-frame block
vld1.64 {d0-d1}, [r3,:128]!
vcvt.s32.f32 q0, q0, #16
vld1.64 {d2-d3}, [r3,:128]!
vcvt.s32.f32 q1, q1, #16
vld1.64 {d24-d25},[r1,:128]!
vcvt.s32.f32 q12, q12, #16
vsri.32 q11, q9, #16
vld1.64 {d26-d27},[r1,:128]!
vcvt.s32.f32 q13, q13, #16
vst1.64 {d20-d21},[r0,:128]!
vsri.32 q12, q0, #16
vst1.64 {d22-d23},[r0,:128]!
vsri.32 q13, q1, #16
vst1.64 {d24-d27},[r0,:128]!
bx lr
3: vsri.32 q10, q8, #16         @ stereo: drain pipeline only
vsri.32 q11, q9, #16
vst1.64 {d20-d23},[r0,:128]!
bx lr
4: push {r4-r8,lr}              @ generic path: more than 2 channels
cmp r3, #4
lsl ip, r3, #1                  @ ip = output stride = channels * sizeof(int16_t)
blt 4f
@ 4 channels
5: ldmia r1!, {r4-r7}           @ next 4 channel pointers
mov lr, r2
mov r8, r0
vld1.64 {d16-d17},[r4,:128]!
vcvt.s32.f32 q8, q8, #16
vld1.64 {d18-d19},[r5,:128]!
vcvt.s32.f32 q9, q9, #16
vld1.64 {d20-d21},[r6,:128]!
vcvt.s32.f32 q10, q10, #16
vld1.64 {d22-d23},[r7,:128]!
vcvt.s32.f32 q11, q11, #16
6: subs lr, lr, #8              @ 8 frames per iteration across 4 channels
vld1.64 {d0-d1}, [r4,:128]!
vcvt.s32.f32 q0, q0, #16
vsri.32 q9, q8, #16
vld1.64 {d2-d3}, [r5,:128]!
vcvt.s32.f32 q1, q1, #16
vsri.32 q11, q10, #16
vld1.64 {d4-d5}, [r6,:128]!
vcvt.s32.f32 q2, q2, #16
vzip.32 d18, d22                @ zip channel pairs into 4-sample groups
vld1.64 {d6-d7}, [r7,:128]!
vcvt.s32.f32 q3, q3, #16
vzip.32 d19, d23
vst1.64 {d18}, [r8], ip
vsri.32 q1, q0, #16
vst1.64 {d22}, [r8], ip
vsri.32 q3, q2, #16
vst1.64 {d19}, [r8], ip
vzip.32 d2, d6
vst1.64 {d23}, [r8], ip
vzip.32 d3, d7
beq 7f
vld1.64 {d16-d17},[r4,:128]!
vcvt.s32.f32 q8, q8, #16
vst1.64 {d2}, [r8], ip
vld1.64 {d18-d19},[r5,:128]!
vcvt.s32.f32 q9, q9, #16
vst1.64 {d6}, [r8], ip
vld1.64 {d20-d21},[r6,:128]!
vcvt.s32.f32 q10, q10, #16
vst1.64 {d3}, [r8], ip
vld1.64 {d22-d23},[r7,:128]!
vcvt.s32.f32 q11, q11, #16
vst1.64 {d7}, [r8], ip
b 6b
7: vst1.64 {d2}, [r8], ip       @ drain the last converted block
vst1.64 {d6}, [r8], ip
vst1.64 {d3}, [r8], ip
vst1.64 {d7}, [r8], ip
subs r3, r3, #4
it eq
popeq {r4-r8,pc}
cmp r3, #4
add r0, r0, #8                  @ advance dst past the 4 channels just done
bge 5b
@ 2 channels
4: cmp r3, #2
blt 4f
ldmia r1!, {r4-r5}
mov lr, r2
mov r8, r0
tst lr, #8                      @ len odd multiple of 8? handle via 8-frame peel
vld1.64 {d16-d17},[r4,:128]!
vcvt.s32.f32 q8, q8, #16
vld1.64 {d18-d19},[r5,:128]!
vcvt.s32.f32 q9, q9, #16
vld1.64 {d20-d21},[r4,:128]!
vcvt.s32.f32 q10, q10, #16
vld1.64 {d22-d23},[r5,:128]!
vcvt.s32.f32 q11, q11, #16
beq 6f
subs lr, lr, #8
beq 7f
vsri.32 d18, d16, #16
vsri.32 d19, d17, #16
vld1.64 {d16-d17},[r4,:128]!
vcvt.s32.f32 q8, q8, #16
vst1.32 {d18[0]}, [r8], ip
vsri.32 d22, d20, #16
vst1.32 {d18[1]}, [r8], ip
vsri.32 d23, d21, #16
vst1.32 {d19[0]}, [r8], ip
vst1.32 {d19[1]}, [r8], ip
vld1.64 {d18-d19},[r5,:128]!
vcvt.s32.f32 q9, q9, #16
vst1.32 {d22[0]}, [r8], ip
vst1.32 {d22[1]}, [r8], ip
vld1.64 {d20-d21},[r4,:128]!
vcvt.s32.f32 q10, q10, #16
vst1.32 {d23[0]}, [r8], ip
vst1.32 {d23[1]}, [r8], ip
vld1.64 {d22-d23},[r5,:128]!
vcvt.s32.f32 q11, q11, #16
6: subs lr, lr, #16             @ 2-channel main loop: 16 frames/iteration
vld1.64 {d0-d1}, [r4,:128]!
vcvt.s32.f32 q0, q0, #16
vsri.32 d18, d16, #16
vld1.64 {d2-d3}, [r5,:128]!
vcvt.s32.f32 q1, q1, #16
vsri.32 d19, d17, #16
vld1.64 {d4-d5}, [r4,:128]!
vcvt.s32.f32 q2, q2, #16
vld1.64 {d6-d7}, [r5,:128]!
vcvt.s32.f32 q3, q3, #16
vst1.32 {d18[0]}, [r8], ip
vsri.32 d22, d20, #16
vst1.32 {d18[1]}, [r8], ip
vsri.32 d23, d21, #16
vst1.32 {d19[0]}, [r8], ip
vsri.32 d2, d0, #16
vst1.32 {d19[1]}, [r8], ip
vsri.32 d3, d1, #16
vst1.32 {d22[0]}, [r8], ip
vsri.32 d6, d4, #16
vst1.32 {d22[1]}, [r8], ip
vsri.32 d7, d5, #16
vst1.32 {d23[0]}, [r8], ip
vst1.32 {d23[1]}, [r8], ip
beq 6f
vld1.64 {d16-d17},[r4,:128]!
vcvt.s32.f32 q8, q8, #16
vst1.32 {d2[0]}, [r8], ip
vst1.32 {d2[1]}, [r8], ip
vld1.64 {d18-d19},[r5,:128]!
vcvt.s32.f32 q9, q9, #16
vst1.32 {d3[0]}, [r8], ip
vst1.32 {d3[1]}, [r8], ip
vld1.64 {d20-d21},[r4,:128]!
vcvt.s32.f32 q10, q10, #16
vst1.32 {d6[0]}, [r8], ip
vst1.32 {d6[1]}, [r8], ip
vld1.64 {d22-d23},[r5,:128]!
vcvt.s32.f32 q11, q11, #16
vst1.32 {d7[0]}, [r8], ip
vst1.32 {d7[1]}, [r8], ip
bgt 6b
6: vst1.32 {d2[0]}, [r8], ip    @ drain final converted pairs
vst1.32 {d2[1]}, [r8], ip
vst1.32 {d3[0]}, [r8], ip
vst1.32 {d3[1]}, [r8], ip
vst1.32 {d6[0]}, [r8], ip
vst1.32 {d6[1]}, [r8], ip
vst1.32 {d7[0]}, [r8], ip
vst1.32 {d7[1]}, [r8], ip
b 8f
7: vsri.32 d18, d16, #16        @ exactly 8 frames total
vsri.32 d19, d17, #16
vst1.32 {d18[0]}, [r8], ip
vsri.32 d22, d20, #16
vst1.32 {d18[1]}, [r8], ip
vsri.32 d23, d21, #16
vst1.32 {d19[0]}, [r8], ip
vst1.32 {d19[1]}, [r8], ip
vst1.32 {d22[0]}, [r8], ip
vst1.32 {d22[1]}, [r8], ip
vst1.32 {d23[0]}, [r8], ip
vst1.32 {d23[1]}, [r8], ip
8: subs r3, r3, #2
add r0, r0, #4                  @ advance dst past the 2 channels just done
it eq
popeq {r4-r8,pc}
@ 1 channel
4: ldr r4, [r1],#4              @ remaining single channel, scalar 16-bit stores
tst r2, #8
mov lr, r2
mov r5, r0
vld1.64 {d0-d1}, [r4,:128]!
vcvt.s32.f32 q0, q0, #16
vld1.64 {d2-d3}, [r4,:128]!
vcvt.s32.f32 q1, q1, #16
bne 8f
6: subs lr, lr, #16
vld1.64 {d4-d5}, [r4,:128]!
vcvt.s32.f32 q2, q2, #16
vld1.64 {d6-d7}, [r4,:128]!
vcvt.s32.f32 q3, q3, #16
vst1.16 {d0[1]}, [r5,:16], ip   @ odd-index halves hold the high 16 bits = value
vst1.16 {d0[3]}, [r5,:16], ip
vst1.16 {d1[1]}, [r5,:16], ip
vst1.16 {d1[3]}, [r5,:16], ip
vst1.16 {d2[1]}, [r5,:16], ip
vst1.16 {d2[3]}, [r5,:16], ip
vst1.16 {d3[1]}, [r5,:16], ip
vst1.16 {d3[3]}, [r5,:16], ip
beq 7f
vld1.64 {d0-d1}, [r4,:128]!
vcvt.s32.f32 q0, q0, #16
vld1.64 {d2-d3}, [r4,:128]!
vcvt.s32.f32 q1, q1, #16
7: vst1.16 {d4[1]}, [r5,:16], ip
vst1.16 {d4[3]}, [r5,:16], ip
vst1.16 {d5[1]}, [r5,:16], ip
vst1.16 {d5[3]}, [r5,:16], ip
vst1.16 {d6[1]}, [r5,:16], ip
vst1.16 {d6[3]}, [r5,:16], ip
vst1.16 {d7[1]}, [r5,:16], ip
vst1.16 {d7[3]}, [r5,:16], ip
bgt 6b
pop {r4-r8,pc}
8: subs lr, lr, #8              @ peel one 8-frame block for odd multiples of 8
vst1.16 {d0[1]}, [r5,:16], ip
vst1.16 {d0[3]}, [r5,:16], ip
vst1.16 {d1[1]}, [r5,:16], ip
vst1.16 {d1[3]}, [r5,:16], ip
vst1.16 {d2[1]}, [r5,:16], ip
vst1.16 {d2[3]}, [r5,:16], ip
vst1.16 {d3[1]}, [r5,:16], ip
vst1.16 {d3[3]}, [r5,:16], ip
it eq
popeq {r4-r8,pc}
vld1.64 {d0-d1}, [r4,:128]!
vcvt.s32.f32 q0, q0, #16
vld1.64 {d2-d3}, [r4,:128]!
vcvt.s32.f32 q1, q1, #16
b 6b
endfunc
@ void ff_int32_to_float_fmul_scalar_neon(float *dst, const int32_t *src,
@                                         float mul, int len)
@ dst[i] = src[i] * mul, 8 elements per iteration, software-pipelined.
@ VFP/NOVFP select the calling convention: hard-float passes mul in s0 and
@ len in r2; soft-float passes mul in r2 and len in r3.
@ NOTE(review): loop structure assumes len is a positive multiple of 8 and
@ 16-byte-aligned buffers -- confirm against callers.
function ff_int32_to_float_fmul_scalar_neon, export=1
VFP vdup.32 q0, d0[0]           @ broadcast mul from s0 (hard-float ABI)
VFP len .req r2
NOVFP vdup.32 q0, r2            @ broadcast mul from r2 (soft-float ABI)
NOVFP len .req r3
vld1.32 {q1},[r1,:128]!
vcvt.f32.s32 q3, q1
vld1.32 {q2},[r1,:128]!
vcvt.f32.s32 q8, q2
1: subs len, len, #8
pld [r1, #16]
vmul.f32 q9, q3, q0
vmul.f32 q10, q8, q0
beq 2f
vld1.32 {q1},[r1,:128]!
vcvt.f32.s32 q3, q1
vld1.32 {q2},[r1,:128]!
vcvt.f32.s32 q8, q2
vst1.32 {q9}, [r0,:128]!
vst1.32 {q10},[r0,:128]!
b 1b
2: vst1.32 {q9}, [r0,:128]!     @ store the final pair of vectors
vst1.32 {q10},[r0,:128]!
bx lr
.unreq len
endfunc

View File

@@ -0,0 +1,221 @@
/*
* Copyright (c) 2013 RISC OS Open Ltd <bavison@riscosopen.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/arm/asm.S"
/**
* ARM VFP optimised int32 to float conversion.
* Assume len is a multiple of 8, destination buffer is at least 4 bytes aligned
* (16 bytes alignment is best for BCM2835), little-endian.
*/
@ void ff_int32_to_float_fmul_array8_vfp(FmtConvertContext *c, float *dst, const int32_t *src, const float *mul, int len)
@ Converts int32 -> float in groups of 8, multiplying each group by its own
@ scale factor from mul[]. FPSCR is switched to RunFast/short-vector mode
@ (LEN=8), which makes each vmul.f32 on the s8-s31 banks process 8 elements
@ in one instruction; the vcvt conversions are still issued per register.
@ The main path is a 3-stage software pipeline over register banks
@ s8-s15 / s16-s23 / s24-s31, with entry points chosen by (len/8) % 3;
@ short inputs (< 3 groups) fall back to the simple loop at 50.
function ff_int32_to_float_fmul_array8_vfp, export=1
push {lr}
ldr a1, [sp, #4]                @ a1 = len (5th argument)
subs lr, a1, #3*8
bcc 50f @ too short to pipeline
@ Now need to find (len / 8) % 3. The approximation
@ x / 24 = (x * 0xAB) >> 12
@ is good for x < 4096, which is true for both AC3 and DCA.
mov a1, #0xAB
ldr ip, =0x03070000 @ RunFast mode, short vectors of length 8, stride 1
mul a1, lr, a1
vpush {s16-s31}
mov a1, a1, lsr #12
add a1, a1, a1, lsl #1
rsb a1, a1, lr, lsr #3          @ a1 = (len/8 - 3) % 3
cmp a1, #1
fmrx a1, FPSCR                  @ save caller's FPSCR, restored on exit
fmxr FPSCR, ip
beq 11f
blo 10f
@ Array is (2 + multiple of 3) x 8 floats long
@ drop through...
vldmia a3!, {s16-s23}
vldmia a4!, {s2,s3}
vldmia a3!, {s24-s31}
vcvt.f32.s32 s16, s16
vcvt.f32.s32 s17, s17
vcvt.f32.s32 s18, s18
vcvt.f32.s32 s19, s19
vcvt.f32.s32 s20, s20
vcvt.f32.s32 s21, s21
vcvt.f32.s32 s22, s22
vcvt.f32.s32 s23, s23
vmul.f32 s16, s16, s2           @ short-vector op: scales s16-s23 by s2
@ drop through...
3:
vldmia a3!, {s8-s15}
vldmia a4!, {s1}
vcvt.f32.s32 s24, s24
vcvt.f32.s32 s25, s25
vcvt.f32.s32 s26, s26
vcvt.f32.s32 s27, s27
vcvt.f32.s32 s28, s28
vcvt.f32.s32 s29, s29
vcvt.f32.s32 s30, s30
vcvt.f32.s32 s31, s31
vmul.f32 s24, s24, s3           @ short-vector op: scales s24-s31 by s3
vstmia a2!, {s16-s19}
vstmia a2!, {s20-s23}
2:
vldmia a3!, {s16-s23}
vldmia a4!, {s2}
vcvt.f32.s32 s8, s8
vcvt.f32.s32 s9, s9
vcvt.f32.s32 s10, s10
vcvt.f32.s32 s11, s11
vcvt.f32.s32 s12, s12
vcvt.f32.s32 s13, s13
vcvt.f32.s32 s14, s14
vcvt.f32.s32 s15, s15
vmul.f32 s8, s8, s1             @ short-vector op: scales s8-s15 by s1
vstmia a2!, {s24-s27}
vstmia a2!, {s28-s31}
1:
vldmia a3!, {s24-s31}
vldmia a4!, {s3}
vcvt.f32.s32 s16, s16
vcvt.f32.s32 s17, s17
vcvt.f32.s32 s18, s18
vcvt.f32.s32 s19, s19
vcvt.f32.s32 s20, s20
vcvt.f32.s32 s21, s21
vcvt.f32.s32 s22, s22
vcvt.f32.s32 s23, s23
vmul.f32 s16, s16, s2
vstmia a2!, {s8-s11}
vstmia a2!, {s12-s15}
subs lr, lr, #8*3
bpl 3b
@ pipeline drain: finish the last two in-flight banks
vcvt.f32.s32 s24, s24
vcvt.f32.s32 s25, s25
vcvt.f32.s32 s26, s26
vcvt.f32.s32 s27, s27
vcvt.f32.s32 s28, s28
vcvt.f32.s32 s29, s29
vcvt.f32.s32 s30, s30
vcvt.f32.s32 s31, s31
vmul.f32 s24, s24, s3
vstmia a2!, {s16-s19}
vstmia a2!, {s20-s23}
vstmia a2!, {s24-s27}
vstmia a2!, {s28-s31}
fmxr FPSCR, a1                  @ restore caller's FPSCR
vpop {s16-s31}
pop {pc}
10: @ Array is (multiple of 3) x 8 floats long
vldmia a3!, {s8-s15}
vldmia a4!, {s1,s2}
vldmia a3!, {s16-s23}
vcvt.f32.s32 s8, s8
vcvt.f32.s32 s9, s9
vcvt.f32.s32 s10, s10
vcvt.f32.s32 s11, s11
vcvt.f32.s32 s12, s12
vcvt.f32.s32 s13, s13
vcvt.f32.s32 s14, s14
vcvt.f32.s32 s15, s15
vmul.f32 s8, s8, s1
b 1b
11: @ Array is (1 + multiple of 3) x 8 floats long
vldmia a3!, {s24-s31}
vldmia a4!, {s3}
vldmia a3!, {s8-s15}
vldmia a4!, {s1}
vcvt.f32.s32 s24, s24
vcvt.f32.s32 s25, s25
vcvt.f32.s32 s26, s26
vcvt.f32.s32 s27, s27
vcvt.f32.s32 s28, s28
vcvt.f32.s32 s29, s29
vcvt.f32.s32 s30, s30
vcvt.f32.s32 s31, s31
vmul.f32 s24, s24, s3
b 2b
50:                             @ short input: plain one-group-at-a-time loop
ldr lr, =0x03070000 @ RunFast mode, short vectors of length 8, stride 1
fmrx ip, FPSCR
fmxr FPSCR, lr
51:
vldmia a3!, {s8-s15}
vldmia a4!, {s0}
vcvt.f32.s32 s8, s8
vcvt.f32.s32 s9, s9
vcvt.f32.s32 s10, s10
vcvt.f32.s32 s11, s11
vcvt.f32.s32 s12, s12
vcvt.f32.s32 s13, s13
vcvt.f32.s32 s14, s14
vcvt.f32.s32 s15, s15
vmul.f32 s8, s8, s0
subs a1, a1, #8
vstmia a2!, {s8-s11}
vstmia a2!, {s12-s15}
bne 51b
fmxr FPSCR, ip
pop {pc}
endfunc
/**
* ARM VFP optimised int32 to float conversion.
* Assume len is a multiple of 8, destination buffer is at least 4 bytes aligned
* (16 bytes alignment is best for BCM2835), little-endian.
* TODO: could be further optimised by unrolling and interleaving, as above
*/
@ void ff_int32_to_float_fmul_scalar_vfp(float *dst, const int32_t *src, float mul, int len)
@ dst[i] = src[i] * mul, 8 elements per iteration, using FPSCR
@ RunFast/short-vector mode so the single vmul.f32 scales s8-s15 at once.
@ VFP/NOVFP select the ABI: hard-float has mul already in s0 and len in a3;
@ soft-float receives mul in a3 (moved to s0) and len in a4.
function ff_int32_to_float_fmul_scalar_vfp, export=1
VFP tmp .req a4
VFP len .req a3
NOVFP tmp .req a3
NOVFP len .req a4
NOVFP vmov s0, a3               @ soft-float: move mul into s0
ldr tmp, =0x03070000 @ RunFast mode, short vectors of length 8, stride 1
fmrx ip, FPSCR                  @ save caller's FPSCR, restored before return
fmxr FPSCR, tmp
1:
vldmia a2!, {s8-s15}
vcvt.f32.s32 s8, s8
vcvt.f32.s32 s9, s9
vcvt.f32.s32 s10, s10
vcvt.f32.s32 s11, s11
vcvt.f32.s32 s12, s12
vcvt.f32.s32 s13, s13
vcvt.f32.s32 s14, s14
vcvt.f32.s32 s15, s15
vmul.f32 s8, s8, s0             @ short-vector op: scales s8-s15 by s0
subs len, len, #8
vstmia a1!, {s8-s11}
vstmia a1!, {s12-s15}
bne 1b
fmxr FPSCR, ip
bx lr
endfunc
.unreq tmp
.unreq len

View File

@@ -0,0 +1,78 @@
/*
* Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/arm/asm.S"
/**
* ARM VFP optimized float to int16 conversion.
* Assume that len is a positive number and is multiple of 8, destination
* buffer is at least 4 bytes aligned (8 bytes alignment is better for
* performance), little-endian byte sex.
*/
@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len)
@ void ff_float_to_int16_vfp(int16_t *dst (r0), const float *src (r1), int len (r2))
@ Converts 8 floats per iteration: vcvt to int32, ssat clamps each value to
@ 16 bits, pkhbt packs value pairs into 32-bit words, stmia stores 8 samples
@ as 4 words. The next batch is loaded/converted conditionally (gt) so the
@ final iteration does not read past the end of src.
function ff_float_to_int16_vfp, export=1
push {r4-r8,lr}
vpush {d8-d11}                  @ s16-s23 are callee-saved
vldmia r1!, {s16-s23}
vcvt.s32.f32 s0, s16
vcvt.s32.f32 s1, s17
vcvt.s32.f32 s2, s18
vcvt.s32.f32 s3, s19
vcvt.s32.f32 s4, s20
vcvt.s32.f32 s5, s21
vcvt.s32.f32 s6, s22
vcvt.s32.f32 s7, s23
1:
subs r2, r2, #8
vmov r3, r4, s0, s1             @ move converted values to core registers
vmov r5, r6, s2, s3
vmov r7, r8, s4, s5
vmov ip, lr, s6, s7
it gt
vldmiagt r1!, {s16-s23}         @ prefetch next batch only if more remains
ssat r4, #16, r4                @ saturate to [-32768, 32767]
ssat r3, #16, r3
ssat r6, #16, r6
ssat r5, #16, r5
pkhbt r3, r3, r4, lsl #16       @ pack two s16 values per word
pkhbt r4, r5, r6, lsl #16
itttt gt
vcvtgt.s32.f32 s0, s16          @ convert next batch, interleaved with packing
vcvtgt.s32.f32 s1, s17
vcvtgt.s32.f32 s2, s18
vcvtgt.s32.f32 s3, s19
itttt gt
vcvtgt.s32.f32 s4, s20
vcvtgt.s32.f32 s5, s21
vcvtgt.s32.f32 s6, s22
vcvtgt.s32.f32 s7, s23
ssat r8, #16, r8
ssat r7, #16, r7
ssat lr, #16, lr
ssat ip, #16, ip
pkhbt r5, r7, r8, lsl #16
pkhbt r6, ip, lr, lsl #16
stmia r0!, {r3-r6}              @ store 8 samples in one burst
bgt 1b
vpop {d8-d11}
pop {r4-r8,pc}
endfunc

View File

@@ -0,0 +1,51 @@
/*
* ARM NEON optimised H.264 chroma functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/h264chroma.h"
void ff_put_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
void ff_put_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
void ff_put_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int);
void ff_avg_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
void ff_avg_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
void ff_avg_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int);
/**
 * Install the NEON H.264 chroma motion-compensation functions.
 *
 * The NEON routines only handle 8-bit samples, so nothing is installed
 * for higher bit depths or when NEON is unavailable. Table slots 0/1/2
 * receive the mc8/mc4/mc2 variants respectively, for both "put" and "avg".
 */
av_cold void ff_h264chroma_init_arm(H264ChromaContext *c, int bit_depth)
{
    const int high_bit_depth = bit_depth > 8;
    int cpu_flags = av_get_cpu_flags();

    if (!have_neon(cpu_flags) || high_bit_depth)
        return;

    c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon;
    c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon;
    c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_neon;
    c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_neon;
    c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_neon;
    c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_neon;
}

View File

@@ -0,0 +1,398 @@
/*
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
@ 8-pixel-wide bilinear chroma MC. Weights (eighths) derived from the
@ subpel offsets x (r4) and y (r5):
@   d0 = (8-x)*(8-y)  d1 = x*(8-y)  d2 = (8-x)*y  d3 = x*y
@ Three code paths: full 2-D filter (x*y != 0), 1-D vertical or horizontal
@ (one offset zero), each producing 2 rows per iteration.
@ h264 rounds with vrshrn >>6; rv40 adds a per-position bias from rv40bias
@ and uses a truncating vshrn. "avg" averages the result with existing dst.
.macro h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
push {r4-r7, lr}
ldrd r4, r5, [sp, #20]          @ r4 = x, r5 = y (stack arguments)
.ifc \type,avg
mov lr, r0                      @ keep a dst read pointer for averaging
.endif
pld [r1]
pld [r1, r2]
.ifc \codec,rv40
movrel r6, rv40bias             @ bias entry at (y>>1)*8 + (x>>1)*2 bytes
lsr r7, r5, #1
add r6, r6, r7, lsl #3
lsr r7, r4, #1
add r6, r6, r7, lsl #1
vld1.16 {d22[],d23[]}, [r6,:16] @ q11 = broadcast rounding bias
.endif
A muls r7, r4, r5
T mul r7, r4, r5
T cmp r7, #0
rsb r6, r7, r5, lsl #3          @ r6  = y*(8-x)
rsb r12, r7, r4, lsl #3         @ r12 = x*(8-y)
sub r4, r7, r4, lsl #3
sub r4, r4, r5, lsl #3
add r4, r4, #64                 @ r4 = (8-x)*(8-y), r7 = x*y
beq 2f                          @ x*y == 0: 1-D case
vdup.8 d0, r4
vdup.8 d1, r12
vld1.8 {d4, d5}, [r1], r2
vdup.8 d2, r6
vdup.8 d3, r7
vext.8 d5, d4, d5, #1           @ d5 = row shifted left one pixel
1: vld1.8 {d6, d7}, [r1], r2    @ 2-D loop, two output rows per iteration
vmull.u8 q8, d4, d0
vmlal.u8 q8, d5, d1
vext.8 d7, d6, d7, #1
vld1.8 {d4, d5}, [r1], r2
vmlal.u8 q8, d6, d2
pld [r1]
vext.8 d5, d4, d5, #1
vmlal.u8 q8, d7, d3
vmull.u8 q9, d6, d0
subs r3, r3, #2
vmlal.u8 q9, d7, d1
vmlal.u8 q9, d4, d2
vmlal.u8 q9, d5, d3
pld [r1, r2]
.ifc \codec,h264
vrshrn.u16 d16, q8, #6          @ h264: rounding narrow >>6
vrshrn.u16 d17, q9, #6
.else
vadd.u16 q8, q8, q11            @ rv40: bias then truncating >>6
vadd.u16 q9, q9, q11
vshrn.u16 d16, q8, #6
vshrn.u16 d17, q9, #6
.endif
.ifc \type,avg
vld1.8 {d20}, [lr,:64], r2
vld1.8 {d21}, [lr,:64], r2
vrhadd.u8 q8, q8, q10           @ round-halving average with dst
.endif
vst1.8 {d16}, [r0,:64], r2
vst1.8 {d17}, [r0,:64], r2
bgt 1b
pop {r4-r7, pc}
2: tst r6, r6                   @ r6 != 0 selects vertical-only filtering
add r12, r12, r6                @ fold both 1-D weights into d1's slot
vdup.8 d0, r4
vdup.8 d1, r12
beq 4f
vld1.8 {d4}, [r1], r2
3: vld1.8 {d6}, [r1], r2        @ vertical-only loop
vmull.u8 q8, d4, d0
vmlal.u8 q8, d6, d1
vld1.8 {d4}, [r1], r2
vmull.u8 q9, d6, d0
vmlal.u8 q9, d4, d1
pld [r1]
.ifc \codec,h264
vrshrn.u16 d16, q8, #6
vrshrn.u16 d17, q9, #6
.else
vadd.u16 q8, q8, q11
vadd.u16 q9, q9, q11
vshrn.u16 d16, q8, #6
vshrn.u16 d17, q9, #6
.endif
pld [r1, r2]
.ifc \type,avg
vld1.8 {d20}, [lr,:64], r2
vld1.8 {d21}, [lr,:64], r2
vrhadd.u8 q8, q8, q10
.endif
subs r3, r3, #2
vst1.8 {d16}, [r0,:64], r2
vst1.8 {d17}, [r0,:64], r2
bgt 3b
pop {r4-r7, pc}
4: vld1.8 {d4, d5}, [r1], r2    @ horizontal-only loop
vld1.8 {d6, d7}, [r1], r2
vext.8 d5, d4, d5, #1
vext.8 d7, d6, d7, #1
pld [r1]
subs r3, r3, #2
vmull.u8 q8, d4, d0
vmlal.u8 q8, d5, d1
vmull.u8 q9, d6, d0
vmlal.u8 q9, d7, d1
pld [r1, r2]
.ifc \codec,h264
vrshrn.u16 d16, q8, #6
vrshrn.u16 d17, q9, #6
.else
vadd.u16 q8, q8, q11
vadd.u16 q9, q9, q11
vshrn.u16 d16, q8, #6
vshrn.u16 d17, q9, #6
.endif
.ifc \type,avg
vld1.8 {d20}, [lr,:64], r2
vld1.8 {d21}, [lr,:64], r2
vrhadd.u8 q8, q8, q10
.endif
vst1.8 {d16}, [r0,:64], r2
vst1.8 {d17}, [r0,:64], r2
bgt 4b
pop {r4-r7, pc}
endfunc
.endm
/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
@ 4-pixel-wide bilinear chroma MC. Same weight derivation as mc8
@ (d0=(8-x)(8-y), d1=x(8-y), d2=(8-x)y, d3=xy), but two 4-pixel rows are
@ packed per d-register via vtrn.32 so the 8-wide multiplies stay full;
@ the vadd.i16 pair folds the two halves back before narrowing.
.macro h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
push {r4-r7, lr}
ldrd r4, r5, [sp, #20]          @ r4 = x, r5 = y
.ifc \type,avg
mov lr, r0                      @ dst read pointer for averaging
.endif
pld [r1]
pld [r1, r2]
.ifc \codec,rv40
movrel r6, rv40bias
lsr r7, r5, #1
add r6, r6, r7, lsl #3
lsr r7, r4, #1
add r6, r6, r7, lsl #1
vld1.16 {d22[],d23[]}, [r6,:16]
.endif
A muls r7, r4, r5
T mul r7, r4, r5
T cmp r7, #0
rsb r6, r7, r5, lsl #3          @ r6  = y*(8-x)
rsb r12, r7, r4, lsl #3         @ r12 = x*(8-y)
sub r4, r7, r4, lsl #3
sub r4, r4, r5, lsl #3
add r4, r4, #64                 @ r4 = (8-x)*(8-y), r7 = x*y
beq 2f
vdup.8 d0, r4
vdup.8 d1, r12
vld1.8 {d4}, [r1], r2
vdup.8 d2, r6
vdup.8 d3, r7
vext.8 d5, d4, d5, #1
vtrn.32 d4, d5                  @ pack row and shifted row into one register
vtrn.32 d0, d1                  @ pack the weights to match
vtrn.32 d2, d3
1: vld1.8 {d6}, [r1], r2        @ 2-D loop
vext.8 d7, d6, d7, #1
vtrn.32 d6, d7
vmull.u8 q8, d4, d0
vmlal.u8 q8, d6, d2
vld1.8 {d4}, [r1], r2
vext.8 d5, d4, d5, #1
vtrn.32 d4, d5
pld [r1]
vmull.u8 q9, d6, d0
vmlal.u8 q9, d4, d2
vadd.i16 d16, d16, d17          @ combine the packed halves
vadd.i16 d17, d18, d19
.ifc \codec,h264
vrshrn.u16 d16, q8, #6
.else
vadd.u16 q8, q8, q11
vshrn.u16 d16, q8, #6
.endif
subs r3, r3, #2
pld [r1, r2]
.ifc \type,avg
vld1.32 {d20[0]}, [lr,:32], r2
vld1.32 {d20[1]}, [lr,:32], r2
vrhadd.u8 d16, d16, d20
.endif
vst1.32 {d16[0]}, [r0,:32], r2
vst1.32 {d16[1]}, [r0,:32], r2
bgt 1b
pop {r4-r7, pc}
2: tst r6, r6                   @ r6 != 0 selects vertical-only filtering
add r12, r12, r6
vdup.8 d0, r4
vdup.8 d1, r12
vtrn.32 d0, d1
beq 4f
vext.32 d1, d0, d1, #1
vld1.32 {d4[0]}, [r1], r2
3: vld1.32 {d4[1]}, [r1], r2    @ vertical-only loop
vmull.u8 q8, d4, d0
vld1.32 {d4[0]}, [r1], r2
vmull.u8 q9, d4, d1
vadd.i16 d16, d16, d17
vadd.i16 d17, d18, d19
pld [r1]
.ifc \codec,h264
vrshrn.u16 d16, q8, #6
.else
vadd.u16 q8, q8, q11
vshrn.u16 d16, q8, #6
.endif
.ifc \type,avg
vld1.32 {d20[0]}, [lr,:32], r2
vld1.32 {d20[1]}, [lr,:32], r2
vrhadd.u8 d16, d16, d20
.endif
subs r3, r3, #2
pld [r1, r2]
vst1.32 {d16[0]}, [r0,:32], r2
vst1.32 {d16[1]}, [r0,:32], r2
bgt 3b
pop {r4-r7, pc}
4: vld1.8 {d4}, [r1], r2        @ horizontal-only loop
vld1.8 {d6}, [r1], r2
vext.8 d5, d4, d5, #1
vext.8 d7, d6, d7, #1
vtrn.32 d4, d5
vtrn.32 d6, d7
vmull.u8 q8, d4, d0
vmull.u8 q9, d6, d0
subs r3, r3, #2
vadd.i16 d16, d16, d17
vadd.i16 d17, d18, d19
pld [r1]
.ifc \codec,h264
vrshrn.u16 d16, q8, #6
.else
vadd.u16 q8, q8, q11
vshrn.u16 d16, q8, #6
.endif
.ifc \type,avg
vld1.32 {d20[0]}, [lr,:32], r2
vld1.32 {d20[1]}, [lr,:32], r2
vrhadd.u8 d16, d16, d20
.endif
pld [r1]
vst1.32 {d16[0]}, [r0,:32], r2
vst1.32 {d16[1]}, [r0,:32], r2
bgt 4b
pop {r4-r7, pc}
endfunc
.endm
@ 2-pixel-wide H.264 chroma MC (no rv40 variant). When both subpel offsets
@ are zero this is a plain 2-byte copy ("put") or average ("avg"); otherwise
@ three rows are gathered per iteration, rearranged with vrev/vext/vtrn so a
@ single vmull/vmlal pair evaluates the bilinear filter for two output rows.
.macro h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
push {r4-r6, lr}
ldr r4, [sp, #16]               @ r4 = x
ldr lr, [sp, #20]               @ lr = y
pld [r1]
pld [r1, r2]
orrs r5, r4, lr
beq 2f                          @ x == y == 0: copy/average path
mul r5, r4, lr                  @ same eighth-weights as mc8/mc4
rsb r6, r5, lr, lsl #3          @ r6  = y*(8-x)
rsb r12, r5, r4, lsl #3         @ r12 = x*(8-y)
sub r4, r5, r4, lsl #3
sub r4, r4, lr, lsl #3
add r4, r4, #64                 @ r4 = (8-x)*(8-y), r5 = x*y
vdup.8 d0, r4
vdup.8 d2, r12
vdup.8 d1, r6
vdup.8 d3, r5
vtrn.16 q0, q1                  @ interleave weights to match the data layout
1:
vld1.32 {d4[0]}, [r1], r2       @ gather rows n, n+1 and n+2
vld1.32 {d4[1]}, [r1], r2
vrev64.32 d5, d4
vld1.32 {d5[1]}, [r1]
vext.8 q3, q2, q2, #1           @ right-shifted copies for the x taps
vtrn.16 q2, q3
vmull.u8 q8, d4, d0
vmlal.u8 q8, d5, d1
.ifc \type,avg
vld1.16 {d18[0]}, [r0,:16], r2
vld1.16 {d18[1]}, [r0,:16]
sub r0, r0, r2
.endif
vtrn.32 d16, d17
vadd.i16 d16, d16, d17          @ fold partial sums
vrshrn.u16 d16, q8, #6
.ifc \type,avg
vrhadd.u8 d16, d16, d18
.endif
vst1.16 {d16[0]}, [r0,:16], r2
vst1.16 {d16[1]}, [r0,:16], r2
subs r3, r3, #2
bgt 1b
pop {r4-r6, pc}
2:
.ifc \type,put
ldrh_post r5, r1, r2            @ straight 2-byte copy, two rows at a time
strh_post r5, r0, r2
ldrh_post r6, r1, r2
strh_post r6, r0, r2
.else
vld1.16 {d16[0]}, [r1], r2      @ avg: load src and dst pairs, vrhadd them
vld1.16 {d16[1]}, [r1], r2
vld1.16 {d18[0]}, [r0,:16], r2
vld1.16 {d18[1]}, [r0,:16]
sub r0, r0, r2
vrhadd.u8 d16, d16, d18
vst1.16 {d16[0]}, [r0,:16], r2
vst1.16 {d16[1]}, [r0,:16], r2
.endif
subs r3, r3, #2
bgt 2b
pop {r4-r6, pc}
endfunc
.endm
@ Instantiate the H.264 chroma MC functions for 8-, 4- and 2-pixel widths,
@ in both "put" (overwrite dst) and "avg" (average with dst) flavours.
h264_chroma_mc8 put
h264_chroma_mc8 avg
h264_chroma_mc4 put
h264_chroma_mc4 avg
h264_chroma_mc2 put
h264_chroma_mc2 avg
#if CONFIG_RV40_DECODER
@ Rounding-bias table for the rv40 variants, added to the filter sum before
@ the truncating >>6. Indexed by (y>>1) row and (x>>1) column, 4 x 16-bit
@ entries per row (see the movrel/lsr address computation in the macros).
const rv40bias
.short 0, 16, 32, 16
.short 32, 28, 32, 28
.short 0, 32, 16, 32
.short 32, 28, 32, 28
endconst
h264_chroma_mc8 put, rv40
h264_chroma_mc8 avg, rv40
h264_chroma_mc4 put, rv40
h264_chroma_mc4 avg, rv40
#endif

View File

@@ -0,0 +1,253 @@
/*
* Copyright (c) 2013 RISC OS Open Ltd
* Author: Ben Avison <bavison@riscosopen.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
@ Register aliases.  a1 doubles as the buffer pointer on entry (BUF)
@ and the return value on exit (RESULT).
RESULT .req a1
BUF .req a1
SIZE .req a2
PATTERN .req a3
PTR .req a4
DAT0 .req v1
DAT1 .req v2
DAT2 .req v3
DAT3 .req v4
TMP0 .req v5
TMP1 .req v6
TMP2 .req ip
TMP3 .req lr
@ Number of 32-byte cachelines to prefetch ahead of the read pointer.
#define PRELOAD_DISTANCE 4
@ Test one word for a start-code candidate.  With PATTERN = 0x80008000
@ (so PATTERN >> 14 = 0x00020002) and big-endian loads, TMP0 ends up
@ nonzero (NE) iff some halfword of DAT0 is 0x0000 or 0x0001, i.e. the
@ word contains an aligned 00 00 or 00 01 byte pair.
.macro innerloop4
ldr DAT0, [PTR], #4
subs SIZE, SIZE, #4 @ C flag survives rest of macro
sub TMP0, DAT0, PATTERN, lsr #14
bic TMP0, TMP0, DAT0
ands TMP0, TMP0, PATTERN
.endm
@ Test 16 bytes (4 words) for a start-code candidate; same halfword
@ test as innerloop4.  Exits NE iff any word hit; the conditional
@ andseq chain stops updating the flags after the first hit, so the
@ TMPn registers identify which word it was (used by label 93).
.macro innerloop16 decrement, do_preload
ldmia PTR!, {DAT0,DAT1,DAT2,DAT3}
.ifnc "\do_preload",""
pld [PTR, #PRELOAD_DISTANCE*32]
.endif
.ifnc "\decrement",""
subs SIZE, SIZE, #\decrement @ C flag survives rest of macro
.endif
sub TMP0, DAT0, PATTERN, lsr #14
sub TMP1, DAT1, PATTERN, lsr #14
bic TMP0, TMP0, DAT0
bic TMP1, TMP1, DAT1
sub TMP2, DAT2, PATTERN, lsr #14
sub TMP3, DAT3, PATTERN, lsr #14
ands TMP0, TMP0, PATTERN
bic TMP2, TMP2, DAT2
it eq
andseq TMP1, TMP1, PATTERN
bic TMP3, TMP3, DAT3
itt eq
andseq TMP2, TMP2, PATTERN
andseq TMP3, TMP3, PATTERN
.endm
/* int ff_h264_find_start_code_candidate_armv6(const uint8_t *buf, int size) */
@ Scan buf for the first 00 00 / 00 01 byte pair (a start-code
@ candidate) and return the offset of its first zero byte, or the
@ number of bytes scanned when none is found.  Words are loaded
@ big-endian so the halfword tests in innerloop4/16 check aligned byte
@ pairs; the 9x labels narrow a word/16-byte hit down to the exact
@ byte, looking one byte back to catch unaligned pairs.
function ff_h264_find_start_code_candidate_armv6, export=1
push {v1-v6,lr}
mov PTR, BUF
@ Ensure there are at least (PRELOAD_DISTANCE+2) complete cachelines to go
@ before using code that does preloads
cmp SIZE, #(PRELOAD_DISTANCE+3)*32 - 1
blo 60f
@ Get to word-alignment, 1 byte at a time
tst PTR, #3
beq 2f
1: ldrb DAT0, [PTR], #1
sub SIZE, SIZE, #1
teq DAT0, #0
beq 90f
tst PTR, #3
bne 1b
2: @ Get to 4-word alignment, 1 word at a time
ldr PATTERN, =0x80008000
setend be
tst PTR, #12
beq 4f
3: innerloop4
bne 91f
tst PTR, #12
bne 3b
4: @ Get to cacheline (8-word) alignment
tst PTR, #16
beq 5f
innerloop16 16
bne 93f
5: @ Check complete cachelines, with preloading
@ We need to stop when there are still (PRELOAD_DISTANCE+1)
@ complete cachelines to go
sub SIZE, SIZE, #(PRELOAD_DISTANCE+2)*32
6: innerloop16 , do_preload
bne 93f
innerloop16 32
bne 93f
bcs 6b
@ Preload trailing part-cacheline, if any
tst SIZE, #31
beq 7f
pld [PTR, #(PRELOAD_DISTANCE+1)*32]
@ Check remaining data without doing any more preloads. First
@ do in chunks of 4 words:
7: adds SIZE, SIZE, #(PRELOAD_DISTANCE+2)*32 - 16
bmi 9f
8: innerloop16 16
bne 93f
bcs 8b
@ Then in words:
9: adds SIZE, SIZE, #16 - 4
bmi 11f
10: innerloop4
bne 91f
bcs 10b
11: setend le
@ Check second byte of final halfword
ldrb DAT0, [PTR, #-1]
teq DAT0, #0
beq 90f
@ Check any remaining bytes
tst SIZE, #3
beq 13f
12: ldrb DAT0, [PTR], #1
sub SIZE, SIZE, #1
teq DAT0, #0
beq 90f
tst SIZE, #3
bne 12b
@ No candidate found
13: sub RESULT, PTR, BUF
b 99f
60: @ Small buffer - simply check by looping over bytes
@ NOTE(review): with size == 0 the bcc below returns with a1 (RESULT
@ aliases BUF) still holding the buffer pointer — callers appear to
@ guarantee size > 0; confirm.
subs SIZE, SIZE, #1
bcc 99f
61: ldrb DAT0, [PTR], #1
subs SIZE, SIZE, #1
teq DAT0, #0
beq 90f
bcs 61b
@ No candidate found
sub RESULT, PTR, BUF
b 99f
90: @ Found a candidate at the preceding byte
sub RESULT, PTR, BUF
sub RESULT, RESULT, #1
b 99f
91: @ Found a candidate somewhere in the preceding 4 bytes
sub RESULT, PTR, BUF
sub RESULT, RESULT, #4
sub TMP0, DAT0, #0x20000
bics TMP0, TMP0, DAT0
itt pl
ldrbpl DAT0, [PTR, #-3]
addpl RESULT, RESULT, #2
bpl 92f
teq RESULT, #0
beq 98f @ don't look back a byte if found at first byte in buffer
ldrb DAT0, [PTR, #-5]
92: teq DAT0, #0
it eq
subeq RESULT, RESULT, #1
b 98f
93: @ Found a candidate somewhere in the preceding 16 bytes
sub RESULT, PTR, BUF
sub RESULT, RESULT, #16
teq TMP0, #0
beq 95f @ not in first 4 bytes
sub TMP0, DAT0, #0x20000
bics TMP0, TMP0, DAT0
itt pl
ldrbpl DAT0, [PTR, #-15]
addpl RESULT, RESULT, #2
bpl 94f
teq RESULT, #0
beq 98f @ don't look back a byte if found at first byte in buffer
ldrb DAT0, [PTR, #-17]
94: teq DAT0, #0
it eq
subeq RESULT, RESULT, #1
b 98f
95: add RESULT, RESULT, #4
teq TMP1, #0
beq 96f @ not in next 4 bytes
sub TMP1, DAT1, #0x20000
bics TMP1, TMP1, DAT1
itee mi
ldrbmi DAT0, [PTR, #-13]
ldrbpl DAT0, [PTR, #-11]
addpl RESULT, RESULT, #2
teq DAT0, #0
it eq
subeq RESULT, RESULT, #1
b 98f
96: add RESULT, RESULT, #4
teq TMP2, #0
beq 97f @ not in next 4 bytes
sub TMP2, DAT2, #0x20000
bics TMP2, TMP2, DAT2
itee mi
ldrbmi DAT0, [PTR, #-9]
ldrbpl DAT0, [PTR, #-7]
addpl RESULT, RESULT, #2
teq DAT0, #0
it eq
subeq RESULT, RESULT, #1
b 98f
97: add RESULT, RESULT, #4
sub TMP3, DAT3, #0x20000
bics TMP3, TMP3, DAT3
itee mi
ldrbmi DAT0, [PTR, #-5]
ldrbpl DAT0, [PTR, #-3]
addpl RESULT, RESULT, #2
teq DAT0, #0
it eq
subeq RESULT, RESULT, #1
@ drop through to 98f
98: setend le
99: pop {v1-v6,pc}
@ Use the project's endfunc macro (as every other function in this
@ commit does) rather than the deprecated raw GAS .endfunc directive,
@ so the ELF symbol size/type bookkeeping from asm.S is emitted.
endfunc
.unreq RESULT
.unreq BUF
.unreq SIZE
.unreq PATTERN
.unreq PTR
.unreq DAT0
.unreq DAT1
.unreq DAT2
.unreq DAT3
.unreq TMP0
.unreq TMP1
.unreq TMP2
.unreq TMP3

View File

@@ -0,0 +1,115 @@
/*
* Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/h264dsp.h"
int ff_h264_find_start_code_candidate_armv6(const uint8_t *buf, int size);
void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0);
void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0);
void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0);
void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0);
void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height,
int log2_den, int weight, int offset);
void ff_weight_h264_pixels_8_neon(uint8_t *dst, int stride, int height,
int log2_den, int weight, int offset);
void ff_weight_h264_pixels_4_neon(uint8_t *dst, int stride, int height,
int log2_den, int weight, int offset);
void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, int stride,
int height, int log2_den, int weightd,
int weights, int offset);
void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, int stride,
int height, int log2_den, int weightd,
int weights, int offset);
void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, int stride,
int height, int log2_den, int weightd,
int weights, int offset);
void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,
int16_t *block, int stride,
const uint8_t nnzc[6*8]);
void ff_h264_idct_add16intra_neon(uint8_t *dst, const int *block_offset,
int16_t *block, int stride,
const uint8_t nnzc[6*8]);
void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset,
int16_t *block, int stride,
const uint8_t nnzc[6*8]);
void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct8_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset,
int16_t *block, int stride,
const uint8_t nnzc[6*8]);
/**
 * Install the NEON H.264 DSP routines.
 *
 * Only 8-bit content is accelerated.  The chroma loop filters and
 * idct_add8 are installed only when chroma_format_idc == 1, matching
 * the checks in the original scalar setup.
 */
static av_cold void h264dsp_init_neon(H264DSPContext *c, const int bit_depth,
                                      const int chroma_format_idc)
{
#if HAVE_NEON
    if (bit_depth != 8)
        return;

    c->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_neon;
    c->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon;

    c->weight_h264_pixels_tab[0]   = ff_weight_h264_pixels_16_neon;
    c->weight_h264_pixels_tab[1]   = ff_weight_h264_pixels_8_neon;
    c->weight_h264_pixels_tab[2]   = ff_weight_h264_pixels_4_neon;
    c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16_neon;
    c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_8_neon;
    c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_4_neon;

    c->h264_idct_add        = ff_h264_idct_add_neon;
    c->h264_idct_dc_add     = ff_h264_idct_dc_add_neon;
    c->h264_idct_add16      = ff_h264_idct_add16_neon;
    c->h264_idct_add16intra = ff_h264_idct_add16intra_neon;
    c->h264_idct8_add       = ff_h264_idct8_add_neon;
    c->h264_idct8_dc_add    = ff_h264_idct8_dc_add_neon;
    c->h264_idct8_add4      = ff_h264_idct8_add4_neon;

    if (chroma_format_idc == 1) {
        c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
        c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
        c->h264_idct_add8            = ff_h264_idct_add8_neon;
    }
#endif // HAVE_NEON
}
/**
 * Runtime dispatch entry point: install the ARMv6 start-code scanner
 * and/or the NEON DSP routines according to av_get_cpu_flags().
 */
av_cold void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth,
                                 const int chroma_format_idc)
{
    const int cpu_flags = av_get_cpu_flags();

    if (have_armv6(cpu_flags)) {
        c->h264_find_start_code_candidate = ff_h264_find_start_code_candidate_armv6;
    }
    if (have_neon(cpu_flags)) {
        h264dsp_init_neon(c, bit_depth, chroma_format_idc);
    }
}

View File

@@ -0,0 +1,541 @@
/*
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
#include "neon.S"
/* H.264 loop filter */
@ Shared loop-filter prologue.  r2 = alpha, r3 = beta, [sp] = tc0
@ pointer.  Loads the four tc0 bytes into d24[0] and returns directly
@ to the function's caller when alpha == 0, beta == 0, or all four
@ tc0 bytes are negative (nothing to filter).
.macro h264_loop_filter_start
ldr r12, [sp]
tst r2, r2
ldr r12, [r12]
it ne
tstne r3, r3
vmov.32 d24[0], r12
@ AND the sign bits of the four tc0 bytes together: LT after the
@ second and means every tc0[i] < 0
and r12, r12, r12, lsl #16
it eq
bxeq lr
ands r12, r12, r12, lsl #8
it lt
bxlt lr
.endm
@ Filter one 16-pixel luma edge held in registers:
@ q8 = p0, q9 = p1, q10 = p2, q0 = q0, q1 = q1, q2 = q2,
@ d24[0] = the four tc0 bytes (expanded to per-pixel values in q12).
@ On exit q8/q0 hold the filtered p0/q0 and q4/q5 hold the output
@ p1/q1 rows (filtered value or original, selected by the vbsl masks).
.macro h264_loop_filter_luma
vdup.8 q11, r2 @ alpha
vmovl.u8 q12, d24
vabd.u8 q6, q8, q0 @ abs(p0 - q0)
vmovl.u16 q12, d24
vabd.u8 q14, q9, q8 @ abs(p1 - p0)
vsli.16 q12, q12, #8
vabd.u8 q15, q1, q0 @ abs(q1 - q0)
vsli.32 q12, q12, #16
vclt.u8 q6, q6, q11 @ < alpha
vdup.8 q11, r3 @ beta
vclt.s8 q7, q12, #0
vclt.u8 q14, q14, q11 @ < beta
vclt.u8 q15, q15, q11 @ < beta
@ q6 = filter-this-pixel mask (all three threshold tests, tc0 >= 0)
vbic q6, q6, q7
vabd.u8 q4, q10, q8 @ abs(p2 - p0)
vand q6, q6, q14
vabd.u8 q5, q2, q0 @ abs(q2 - q0)
vclt.u8 q4, q4, q11 @ < beta
vand q6, q6, q15
vclt.u8 q5, q5, q11 @ < beta
vand q4, q4, q6
vand q5, q5, q6
vand q12, q12, q6
@ candidate filtered p1/q1 values, clamped to p1 +/- tc0, q1 +/- tc0
vrhadd.u8 q14, q8, q0
vsub.i8 q6, q12, q4
vqadd.u8 q7, q9, q12
vhadd.u8 q10, q10, q14
vsub.i8 q6, q6, q5
vhadd.u8 q14, q2, q14
vmin.u8 q7, q7, q10
vqsub.u8 q11, q9, q12
vqadd.u8 q2, q1, q12
vmax.u8 q7, q7, q11
vqsub.u8 q11, q1, q12
vmin.u8 q14, q2, q14
@ delta = clip((((q0 - p0) << 2) + p1 - q1 + 4) >> 3, +/-tc)
@ where tc (q6) = tc0 plus one per extra p2/q2 condition met
vmovl.u8 q2, d0
vmax.u8 q14, q14, q11
vmovl.u8 q10, d1
vsubw.u8 q2, q2, d16
vsubw.u8 q10, q10, d17
vshl.i16 q2, q2, #2
vshl.i16 q10, q10, #2
vaddw.u8 q2, q2, d18
vaddw.u8 q10, q10, d19
vsubw.u8 q2, q2, d2
vsubw.u8 q10, q10, d3
vrshrn.i16 d4, q2, #3
vrshrn.i16 d5, q10, #3
vbsl q4, q7, q9
vbsl q5, q14, q1
vneg.s8 q7, q6
vmovl.u8 q14, d16
vmin.s8 q2, q2, q6
vmovl.u8 q6, d17
vmax.s8 q2, q2, q7
vmovl.u8 q11, d0
vmovl.u8 q12, d1
@ p0 += delta, q0 -= delta, with unsigned saturation on narrowing
vaddw.s8 q14, q14, d4
vaddw.s8 q6, q6, d5
vsubw.s8 q11, q11, d4
vsubw.s8 q12, q12, d5
vqmovun.s16 d16, q14
vqmovun.s16 d17, q6
vqmovun.s16 d0, q11
vqmovun.s16 d1, q12
.endm
@ void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride,
@                                      int alpha, int beta, int8_t *tc0)
@ Luma deblocking across a horizontal edge: load three rows on each
@ side of pix, filter, then write back p1/p0/q0/q1.
function ff_h264_v_loop_filter_luma_neon, export=1
h264_loop_filter_start
vld1.8 {d0, d1}, [r0,:128], r1
vld1.8 {d2, d3}, [r0,:128], r1
vld1.8 {d4, d5}, [r0,:128], r1
@ step back 6 rows to reach p2
sub r0, r0, r1, lsl #2
sub r0, r0, r1, lsl #1
vld1.8 {d20,d21}, [r0,:128], r1
vld1.8 {d18,d19}, [r0,:128], r1
vld1.8 {d16,d17}, [r0,:128], r1
vpush {d8-d15}
h264_loop_filter_luma
sub r0, r0, r1, lsl #1
vst1.8 {d8, d9}, [r0,:128], r1
vst1.8 {d16,d17}, [r0,:128], r1
vst1.8 {d0, d1}, [r0,:128], r1
vst1.8 {d10,d11}, [r0,:128]
vpop {d8-d15}
bx lr
endfunc
@ void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride,
@                                      int alpha, int beta, int8_t *tc0)
@ Luma deblocking across a vertical edge: load 16 rows of 8 bytes
@ straddling the edge, transpose so the filter can work on rows, then
@ transpose the four changed columns back and store them.
function ff_h264_h_loop_filter_luma_neon, export=1
h264_loop_filter_start
sub r0, r0, #4
vld1.8 {d6}, [r0], r1
vld1.8 {d20}, [r0], r1
vld1.8 {d18}, [r0], r1
vld1.8 {d16}, [r0], r1
vld1.8 {d0}, [r0], r1
vld1.8 {d2}, [r0], r1
vld1.8 {d4}, [r0], r1
vld1.8 {d26}, [r0], r1
vld1.8 {d7}, [r0], r1
vld1.8 {d21}, [r0], r1
vld1.8 {d19}, [r0], r1
vld1.8 {d17}, [r0], r1
vld1.8 {d1}, [r0], r1
vld1.8 {d3}, [r0], r1
vld1.8 {d5}, [r0], r1
vld1.8 {d27}, [r0], r1
transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13
vpush {d8-d15}
h264_loop_filter_luma
transpose_4x4 q4, q8, q0, q5
@ back to the first row, then write the middle 4 bytes of each row
sub r0, r0, r1, lsl #4
add r0, r0, #2
vst1.32 {d8[0]}, [r0], r1
vst1.32 {d16[0]}, [r0], r1
vst1.32 {d0[0]}, [r0], r1
vst1.32 {d10[0]}, [r0], r1
vst1.32 {d8[1]}, [r0], r1
vst1.32 {d16[1]}, [r0], r1
vst1.32 {d0[1]}, [r0], r1
vst1.32 {d10[1]}, [r0], r1
vst1.32 {d9[0]}, [r0], r1
vst1.32 {d17[0]}, [r0], r1
vst1.32 {d1[0]}, [r0], r1
vst1.32 {d11[0]}, [r0], r1
vst1.32 {d9[1]}, [r0], r1
vst1.32 {d17[1]}, [r0], r1
vst1.32 {d1[1]}, [r0], r1
vst1.32 {d11[1]}, [r0], r1
vpop {d8-d15}
bx lr
endfunc
@ Filter one 8-pixel chroma edge held in registers:
@ d16 = p0, d18 = p1, d0 = q0, d2 = q1, d24[0] = tc0 bytes
@ (duplicated per pixel pair by the vsli).  Outputs: d16 = p0',
@ d0 = q0'.  Chroma only ever modifies p0/q0.
.macro h264_loop_filter_chroma
vdup.8 d22, r2 @ alpha
vmovl.u8 q12, d24
vabd.u8 d26, d16, d0 @ abs(p0 - q0)
vmovl.u8 q2, d0
vabd.u8 d28, d18, d16 @ abs(p1 - p0)
vsubw.u8 q2, q2, d16
vsli.16 d24, d24, #8
vshl.i16 q2, q2, #2
vabd.u8 d30, d2, d0 @ abs(q1 - q0)
vaddw.u8 q2, q2, d18
vclt.u8 d26, d26, d22 @ < alpha
vsubw.u8 q2, q2, d2
vdup.8 d22, r3 @ beta
@ delta = clip((((q0 - p0) << 2) + p1 - q1 + 4) >> 3, +/-tc)
vrshrn.i16 d4, q2, #3
vclt.u8 d28, d28, d22 @ < beta
vclt.u8 d30, d30, d22 @ < beta
vmin.s8 d4, d4, d24
vneg.s8 d25, d24
vand d26, d26, d28
vmax.s8 d4, d4, d25
vand d26, d26, d30
vmovl.u8 q11, d0
@ zero delta where the threshold mask failed
vand d4, d4, d26
vmovl.u8 q14, d16
vaddw.s8 q14, q14, d4
vsubw.s8 q11, q11, d4
vqmovun.s16 d16, q14
vqmovun.s16 d0, q11
.endm
@ void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride,
@                                        int alpha, int beta, int8_t *tc0)
@ Chroma deblocking across a horizontal edge (two rows each side).
function ff_h264_v_loop_filter_chroma_neon, export=1
h264_loop_filter_start
sub r0, r0, r1, lsl #1
vld1.8 {d18}, [r0,:64], r1
vld1.8 {d16}, [r0,:64], r1
vld1.8 {d0}, [r0,:64], r1
vld1.8 {d2}, [r0,:64]
h264_loop_filter_chroma
sub r0, r0, r1, lsl #1
vst1.8 {d16}, [r0,:64], r1
vst1.8 {d0}, [r0,:64], r1
bx lr
endfunc
@ void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride,
@                                        int alpha, int beta, int8_t *tc0)
@ Chroma deblocking across a vertical edge: gather 8 rows of 4 bytes,
@ 4x4-transpose (vtrn pairs), filter, transpose back and store.
function ff_h264_h_loop_filter_chroma_neon, export=1
h264_loop_filter_start
sub r0, r0, #2
vld1.32 {d18[0]}, [r0], r1
vld1.32 {d16[0]}, [r0], r1
vld1.32 {d0[0]}, [r0], r1
vld1.32 {d2[0]}, [r0], r1
vld1.32 {d18[1]}, [r0], r1
vld1.32 {d16[1]}, [r0], r1
vld1.32 {d0[1]}, [r0], r1
vld1.32 {d2[1]}, [r0], r1
vtrn.16 d18, d0
vtrn.16 d16, d2
vtrn.8 d18, d16
vtrn.8 d0, d2
h264_loop_filter_chroma
vtrn.16 d18, d0
vtrn.16 d16, d2
vtrn.8 d18, d16
vtrn.8 d0, d2
sub r0, r0, r1, lsl #3
vst1.32 {d18[0]}, [r0], r1
vst1.32 {d16[0]}, [r0], r1
vst1.32 {d0[0]}, [r0], r1
vst1.32 {d2[0]}, [r0], r1
vst1.32 {d18[1]}, [r0], r1
vst1.32 {d16[1]}, [r0], r1
vst1.32 {d0[1]}, [r0], r1
vst1.32 {d2[1]}, [r0], r1
bx lr
endfunc
@ Biweighted prediction
@ Width-16 inner loop, two rows per iteration.  \macd/\macs are
@ vmlal.u8/vmlsl.u8, picked by biweight_func to match the weight
@ signs.  r0 = dst/src0, r1 = src1, r2 = stride, r3 = height,
@ r4/r5 = weight magnitudes, r6 = output pointer, q8 = rounding
@ term, q9 = ~log2_den (vshl.s16 by it => >> (log2_den + 1)).
.macro biweight_16 macs, macd
vdup.8 d0, r4
vdup.8 d1, r5
vmov q2, q8
vmov q3, q8
1: subs r3, r3, #2
vld1.8 {d20-d21},[r0,:128], r2
\macd q2, d0, d20
pld [r0]
\macd q3, d0, d21
vld1.8 {d22-d23},[r1,:128], r2
\macs q2, d1, d22
pld [r1]
\macs q3, d1, d23
vmov q12, q8
vld1.8 {d28-d29},[r0,:128], r2
vmov q13, q8
\macd q12, d0, d28
pld [r0]
\macd q13, d0, d29
vld1.8 {d30-d31},[r1,:128], r2
\macs q12, d1, d30
pld [r1]
\macs q13, d1, d31
vshl.s16 q2, q2, q9
vshl.s16 q3, q3, q9
vqmovun.s16 d4, q2
vqmovun.s16 d5, q3
vshl.s16 q12, q12, q9
vshl.s16 q13, q13, q9
vqmovun.s16 d24, q12
vqmovun.s16 d25, q13
@ re-seed the accumulators with the rounding term for the next pair
vmov q3, q8
vst1.8 {d4- d5}, [r6,:128], r2
vmov q2, q8
vst1.8 {d24-d25},[r6,:128], r2
bne 1b
pop {r4-r6, pc}
.endm
@ Width-8 biweight inner loop; same register contract as biweight_16.
.macro biweight_8 macs, macd
vdup.8 d0, r4
vdup.8 d1, r5
vmov q1, q8
vmov q10, q8
1: subs r3, r3, #2
vld1.8 {d4},[r0,:64], r2
\macd q1, d0, d4
pld [r0]
vld1.8 {d5},[r1,:64], r2
\macs q1, d1, d5
pld [r1]
vld1.8 {d6},[r0,:64], r2
\macd q10, d0, d6
pld [r0]
vld1.8 {d7},[r1,:64], r2
\macs q10, d1, d7
pld [r1]
vshl.s16 q1, q1, q9
vqmovun.s16 d2, q1
vshl.s16 q10, q10, q9
vqmovun.s16 d4, q10
vmov q10, q8
vst1.8 {d2},[r6,:64], r2
vmov q1, q8
vst1.8 {d4},[r6,:64], r2
bne 1b
pop {r4-r6, pc}
.endm
@ Width-4 biweight inner loop, four rows per iteration; the 2f tail
@ handles a trailing pair of rows when height % 4 == 2.
.macro biweight_4 macs, macd
vdup.8 d0, r4
vdup.8 d1, r5
vmov q1, q8
vmov q10, q8
1: subs r3, r3, #4
vld1.32 {d4[0]},[r0,:32], r2
vld1.32 {d4[1]},[r0,:32], r2
\macd q1, d0, d4
pld [r0]
vld1.32 {d5[0]},[r1,:32], r2
vld1.32 {d5[1]},[r1,:32], r2
\macs q1, d1, d5
pld [r1]
blt 2f
vld1.32 {d6[0]},[r0,:32], r2
vld1.32 {d6[1]},[r0,:32], r2
\macd q10, d0, d6
pld [r0]
vld1.32 {d7[0]},[r1,:32], r2
vld1.32 {d7[1]},[r1,:32], r2
\macs q10, d1, d7
pld [r1]
vshl.s16 q1, q1, q9
vqmovun.s16 d2, q1
vshl.s16 q10, q10, q9
vqmovun.s16 d4, q10
vmov q10, q8
vst1.32 {d2[0]},[r6,:32], r2
vst1.32 {d2[1]},[r6,:32], r2
vmov q1, q8
vst1.32 {d4[0]},[r6,:32], r2
vst1.32 {d4[1]},[r6,:32], r2
bne 1b
pop {r4-r6, pc}
2: vshl.s16 q1, q1, q9
vqmovun.s16 d2, q1
vst1.32 {d2[0]},[r6,:32], r2
vst1.32 {d2[1]},[r6,:32], r2
pop {r4-r6, pc}
.endm
@ void ff_biweight_h264_pixels_{16,8,4}_neon(uint8_t *dst, uint8_t *src,
@     int stride, int height, int log2_den, int weightd, int weights,
@     int offset)
@ Sets up the rounding term q8 = ((offset + 1) | 1) << log2_den and
@ the shift q9 = ~log2_den (=> arithmetic >> (log2_den + 1)), then
@ dispatches to one of four mlal/mlsl inner-loop variants according
@ to the sign combination of the two weights; r4/r5 are negated as
@ needed so the loops see magnitudes.
.macro biweight_func w
function ff_biweight_h264_pixels_\w\()_neon, export=1
push {r4-r6, lr}
ldr r12, [sp, #16]
add r4, sp, #20
ldm r4, {r4-r6} @ r4 = weightd, r5 = weights, r6 = offset
lsr lr, r4, #31
add r6, r6, #1
eors lr, lr, r5, lsr #30
orr r6, r6, #1
vdup.16 q9, r12
lsl r6, r6, r12
vmvn q9, q9
vdup.16 q8, r6
mov r6, r0
beq 10f
subs lr, lr, #1
beq 20f
subs lr, lr, #1
beq 30f
b 40f
10: biweight_\w vmlal.u8, vmlal.u8
20: rsb r4, r4, #0
biweight_\w vmlal.u8, vmlsl.u8
30: rsb r4, r4, #0
rsb r5, r5, #0
biweight_\w vmlsl.u8, vmlsl.u8
40: rsb r5, r5, #0
biweight_\w vmlsl.u8, vmlal.u8
endfunc
.endm
biweight_func 16
biweight_func 8
biweight_func 4
@ Weighted prediction
@ Width-16 inner loop, two rows per iteration.  \add is the
@ vadd/vsub (or vhadd/vhsub) variant picked by weight_func from the
@ weight sign.  r0 = src/dst, r1 = stride, r2 = height, r12 = |weight|,
@ r4 = output pointer, q8 = offset term, q9 = rounding-shift amount.
.macro weight_16 add
vdup.8 d0, r12
1: subs r2, r2, #2
vld1.8 {d20-d21},[r0,:128], r1
vmull.u8 q2, d0, d20
pld [r0]
vmull.u8 q3, d0, d21
vld1.8 {d28-d29},[r0,:128], r1
vmull.u8 q12, d0, d28
pld [r0]
vmull.u8 q13, d0, d29
\add q2, q8, q2
vrshl.s16 q2, q2, q9
\add q3, q8, q3
vrshl.s16 q3, q3, q9
vqmovun.s16 d4, q2
vqmovun.s16 d5, q3
\add q12, q8, q12
vrshl.s16 q12, q12, q9
\add q13, q8, q13
vrshl.s16 q13, q13, q9
vqmovun.s16 d24, q12
vqmovun.s16 d25, q13
vst1.8 {d4- d5}, [r4,:128], r1
vst1.8 {d24-d25},[r4,:128], r1
bne 1b
pop {r4, pc}
.endm
@ Width-8 weighted-prediction inner loop; same contract as weight_16.
.macro weight_8 add
vdup.8 d0, r12
1: subs r2, r2, #2
vld1.8 {d4},[r0,:64], r1
vmull.u8 q1, d0, d4
pld [r0]
vld1.8 {d6},[r0,:64], r1
vmull.u8 q10, d0, d6
\add q1, q8, q1
pld [r0]
vrshl.s16 q1, q1, q9
vqmovun.s16 d2, q1
\add q10, q8, q10
vrshl.s16 q10, q10, q9
vqmovun.s16 d4, q10
vst1.8 {d2},[r4,:64], r1
vst1.8 {d4},[r4,:64], r1
bne 1b
pop {r4, pc}
.endm
@ Width-4 weighted-prediction inner loop, four rows per iteration;
@ the 2f tail handles a trailing pair of rows.
.macro weight_4 add
vdup.8 d0, r12
vmov q1, q8
vmov q10, q8
1: subs r2, r2, #4
vld1.32 {d4[0]},[r0,:32], r1
vld1.32 {d4[1]},[r0,:32], r1
vmull.u8 q1, d0, d4
pld [r0]
blt 2f
vld1.32 {d6[0]},[r0,:32], r1
vld1.32 {d6[1]},[r0,:32], r1
vmull.u8 q10, d0, d6
pld [r0]
\add q1, q8, q1
vrshl.s16 q1, q1, q9
vqmovun.s16 d2, q1
\add q10, q8, q10
vrshl.s16 q10, q10, q9
vqmovun.s16 d4, q10
vmov q10, q8
vst1.32 {d2[0]},[r4,:32], r1
vst1.32 {d2[1]},[r4,:32], r1
vmov q1, q8
vst1.32 {d4[0]},[r4,:32], r1
vst1.32 {d4[1]},[r4,:32], r1
bne 1b
pop {r4, pc}
2: \add q1, q8, q1
vrshl.s16 q1, q1, q9
vqmovun.s16 d2, q1
vst1.32 {d2[0]},[r4,:32], r1
vst1.32 {d2[1]},[r4,:32], r1
pop {r4, pc}
.endm
@ void ff_weight_h264_pixels_{16,8,4}_neon(uint8_t *dst, int stride,
@     int height, int log2_den, int weight, int offset)
@ Preloads q8 = offset << log2_den (added before the shift) and picks
@ the shift amount/add variant: log2_den > 1 uses halving adds with a
@ shift of log2_den - 1, log2_den <= 1 uses plain adds with a shift
@ of log2_den; negative weights branch to the subtracting variants.
.macro weight_func w
function ff_weight_h264_pixels_\w\()_neon, export=1
push {r4, lr}
ldr r12, [sp, #8]
ldr r4, [sp, #12]
cmp r3, #1
lsl r4, r4, r3
vdup.16 q8, r4
mov r4, r0
ble 20f
rsb lr, r3, #1
vdup.16 q9, lr
cmp r12, #0
blt 10f
weight_\w vhadd.s16
10: rsb r12, r12, #0
weight_\w vhsub.s16
20: rsb lr, r3, #0
vdup.16 q9, lr
cmp r12, #0
blt 10f
weight_\w vadd.s16
10: rsb r12, r12, #0
weight_\w vsub.s16
endfunc
.endm
weight_func 16
weight_func 8
weight_func 4

View File

@@ -0,0 +1,413 @@
/*
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
@ void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, int stride)
@ 4x4 inverse transform of block, rounded ((x + 32) >> 6) and added to
@ dst with unsigned saturation.  The coefficient block is zeroed as it
@ is consumed and r1 is restored before returning.
function ff_h264_idct_add_neon, export=1
vld1.64 {d0-d3}, [r1,:128]
vmov.i16 q15, #0
@ first (column) pass on the transposed-by-load layout
vswp d1, d2
vst1.16 {q15}, [r1,:128]!
vadd.i16 d4, d0, d1
vst1.16 {q15}, [r1,:128]!
vshr.s16 q8, q1, #1
vsub.i16 d5, d0, d1
vadd.i16 d6, d2, d17
vsub.i16 d7, d16, d3
vadd.i16 q0, q2, q3
vsub.i16 q1, q2, q3
@ 4x4 transpose, then the second (row) pass
vtrn.16 d0, d1
vtrn.16 d3, d2
vtrn.32 d0, d3
vtrn.32 d1, d2
vadd.i16 d4, d0, d3
vld1.32 {d18[0]}, [r0,:32], r2
vswp d1, d3
vshr.s16 q8, q1, #1
vld1.32 {d19[1]}, [r0,:32], r2
vsub.i16 d5, d0, d1
vld1.32 {d18[1]}, [r0,:32], r2
vadd.i16 d6, d16, d3
vld1.32 {d19[0]}, [r0,:32], r2
vsub.i16 d7, d2, d17
sub r0, r0, r2, lsl #2
vadd.i16 q0, q2, q3
vsub.i16 q1, q2, q3
vrshr.s16 q0, q0, #6
vrshr.s16 q1, q1, #6
vaddw.u8 q0, q0, d18
vaddw.u8 q1, q1, d19
vqmovun.s16 d0, q0
vqmovun.s16 d1, q1
@ rows are interleaved across d0/d1 lanes; store in the same order
@ the destination rows were loaded above
vst1.32 {d0[0]}, [r0,:32], r2
vst1.32 {d1[1]}, [r0,:32], r2
vst1.32 {d0[1]}, [r0,:32], r2
vst1.32 {d1[0]}, [r0,:32], r2
sub r1, r1, #32
bx lr
endfunc
@ void ff_h264_idct_dc_add_neon(uint8_t *dst, int16_t *block, int stride)
@ DC-only fast path: broadcast (dc + 32) >> 6 and add it to the 4x4
@ destination block; the DC coefficient is cleared (strh below).
function ff_h264_idct_dc_add_neon, export=1
mov r3, #0
vld1.16 {d2[],d3[]}, [r1,:16]
strh r3, [r1]
vrshr.s16 q1, q1, #6
vld1.32 {d0[0]}, [r0,:32], r2
vld1.32 {d0[1]}, [r0,:32], r2
vaddw.u8 q2, q1, d0
vld1.32 {d1[0]}, [r0,:32], r2
vld1.32 {d1[1]}, [r0,:32], r2
vaddw.u8 q1, q1, d1
vqmovun.s16 d0, q2
vqmovun.s16 d1, q1
sub r0, r0, r2, lsl #2
vst1.32 {d0[0]}, [r0,:32], r2
vst1.32 {d0[1]}, [r0,:32], r2
vst1.32 {d1[0]}, [r0,:32], r2
vst1.32 {d1[1]}, [r0,:32], r2
bx lr
endfunc
@ void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,
@     int16_t *block, int stride, const uint8_t nnzc[6*8])
@ Loop over the 16 luma 4x4 blocks: skip a block when its nnzc entry
@ is 0, use the DC-only path when nnzc == 1 and the DC coefficient is
@ nonzero, otherwise run the full transform.
function ff_h264_idct_add16_neon, export=1
push {r4-r8,lr}
mov r4, r0 @ dst base
mov r5, r1 @ block_offset
mov r1, r2 @ block
mov r2, r3 @ stride
ldr r6, [sp, #24] @ nnzc
movrel r7, scan8
mov ip, #16
1: ldrb r8, [r7], #1
ldr r0, [r5], #4
ldrb r8, [r6, r8]
subs r8, r8, #1
blt 2f
ldrsh lr, [r1] @ DC coefficient
add r0, r0, r4
it ne
movne lr, #0
cmp lr, #0
ite ne
@ +CONFIG_THUMB keeps the Thumb bit set for the blx when built Thumb
adrne lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
adreq lr, ff_h264_idct_add_neon + CONFIG_THUMB
blx lr
2: subs ip, ip, #1
add r1, r1, #32
bne 1b
pop {r4-r8,pc}
endfunc
@ void ff_h264_idct_add16intra_neon(uint8_t *dst, const int *block_offset,
@     int16_t *block, int stride, const uint8_t nnzc[6*8])
@ Intra variant of idct_add16: nnzc != 0 runs the full transform;
@ nnzc == 0 still takes the DC path when the DC coefficient is set.
function ff_h264_idct_add16intra_neon, export=1
push {r4-r8,lr}
mov r4, r0 @ dst base
mov r5, r1 @ block_offset
mov r1, r2 @ block
mov r2, r3 @ stride
ldr r6, [sp, #24] @ nnzc
movrel r7, scan8
mov ip, #16
1: ldrb r8, [r7], #1
ldr r0, [r5], #4
ldrb r8, [r6, r8]
add r0, r0, r4
cmp r8, #0
ldrsh r8, [r1]
iteet ne
adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB
adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
cmpeq r8, #0
blxne lr
subs ip, ip, #1
add r1, r1, #32
bne 1b
pop {r4-r8,pc}
endfunc
@ void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset,
@     int16_t *block, int stride, const uint8_t nnzc[6*8])
@ Chroma blocks 16-19: same per-block dispatch as add16intra, using
@ scan8+16 and switching from dest[0] to dest[1] after block 4.
function ff_h264_idct_add8_neon, export=1
push {r4-r10,lr}
ldm r0, {r4,r9} @ r4 = dest[0], r9 = dest[1]
add r5, r1, #16*4
add r1, r2, #16*32
mov r2, r3
mov r10, r1
ldr r6, [sp, #32] @ nnzc
movrel r7, scan8+16
mov r12, #0
1: ldrb r8, [r7, r12]
ldr r0, [r5, r12, lsl #2]
ldrb r8, [r6, r8]
add r0, r0, r4
add r1, r10, r12, lsl #5
cmp r8, #0
ldrsh r8, [r1]
iteet ne
adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB
adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
cmpeq r8, #0
blxne lr
add r12, r12, #1
cmp r12, #4
itt eq
moveq r12, #16
moveq r4, r9 @ second chroma plane
cmp r12, #20
blt 1b
pop {r4-r10,pc}
endfunc
@ One butterfly pass of the 8x8 IDCT over q8-q15.  Pass 0 operates on
@ columns and also loads/zeroes the second half of the coefficient
@ buffer; pass 1 first transposes to rows.  qa/qb alias q2/q14 with
@ roles swapped between the passes so the shared core can be reused.
.macro idct8x8_cols pass
.if \pass == 0
qa .req q2
qb .req q14
vshr.s16 q2, q10, #1
vadd.i16 q0, q8, q12
vld1.16 {q14-q15},[r1,:128]
vst1.16 {q3}, [r1,:128]!
vst1.16 {q3}, [r1,:128]!
vsub.i16 q1, q8, q12
vshr.s16 q3, q14, #1
vsub.i16 q2, q2, q14
vadd.i16 q3, q3, q10
.else
qa .req q14
qb .req q2
@ transpose the 8x8 of 16-bit coefficients (vtrn + vswp of d halves)
vtrn.32 q8, q10
vtrn.16 q12, q13
vtrn.32 q9, q11
vtrn.32 q12, q2
vtrn.32 q13, q15
vswp d21, d4
vshr.s16 q14, q10, #1
vswp d17, d24
vshr.s16 q3, q2, #1
vswp d19, d26
vadd.i16 q0, q8, q12
vswp d23, d30
vsub.i16 q1, q8, q12
vsub.i16 q14, q14, q2
vadd.i16 q3, q3, q10
.endif
@ even part
vadd.i16 q10, q1, qa
vsub.i16 q12, q1, qa
vadd.i16 q8, q0, q3
vsub.i16 qb, q0, q3
@ odd part
vsub.i16 q0, q13, q11
vadd.i16 q1, q15, q9
vsub.i16 qa, q15, q9
vadd.i16 q3, q13, q11
vsub.i16 q0, q0, q15
vsub.i16 q1, q1, q11
vadd.i16 qa, qa, q13
vadd.i16 q3, q3, q9
vshr.s16 q9, q9, #1
vshr.s16 q11, q11, #1
vshr.s16 q13, q13, #1
vshr.s16 q15, q15, #1
vsub.i16 q0, q0, q15
vsub.i16 q1, q1, q11
vadd.i16 qa, qa, q13
vadd.i16 q3, q3, q9
vshr.s16 q9, q0, #2
vshr.s16 q11, q1, #2
vshr.s16 q13, qa, #2
vshr.s16 q15, q3, #2
vsub.i16 q3, q3, q9
vsub.i16 qa, q11, qa
vadd.i16 q1, q1, q13
vadd.i16 q0, q0, q15
@ final even/odd combine; pass 0 additionally pre-transposes pairs
.if \pass == 0
vsub.i16 q15, q8, q3
vadd.i16 q8, q8, q3
vadd.i16 q9, q10, q2
vsub.i16 q2, q10, q2
vtrn.16 q8, q9
vadd.i16 q10, q12, q1
vtrn.16 q2, q15
vadd.i16 q11, q14, q0
vsub.i16 q13, q12, q1
vtrn.16 q10, q11
vsub.i16 q12, q14, q0
.else
vsub.i16 q15, q8, q3
vadd.i16 q8, q8, q3
vadd.i16 q9, q10, q14
vsub.i16 q14, q10, q14
vadd.i16 q10, q12, q1
vsub.i16 q13, q12, q1
vadd.i16 q11, q2, q0
vsub.i16 q12, q2, q0
.endif
.unreq qa
.unreq qb
.endm
@ void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, int stride)
@ Full 8x8 inverse transform: two idct8x8_cols passes, rounded
@ ((x + 32) >> 6) and added to dst with saturation.  The coefficient
@ buffer is zeroed as it is loaded and r1 is restored at the end.
function ff_h264_idct8_add_neon, export=1
vmov.i16 q3, #0
vld1.16 {q8-q9}, [r1,:128]
vst1.16 {q3}, [r1,:128]!
vst1.16 {q3}, [r1,:128]!
vld1.16 {q10-q11},[r1,:128]
vst1.16 {q3}, [r1,:128]!
vst1.16 {q3}, [r1,:128]!
vld1.16 {q12-q13},[r1,:128]
vst1.16 {q3}, [r1,:128]!
vst1.16 {q3}, [r1,:128]!
idct8x8_cols 0
idct8x8_cols 1
@ add the 8 result rows (q8-q15) to the 8 destination rows
mov r3, r0
vrshr.s16 q8, q8, #6
vld1.8 {d0}, [r0,:64], r2
vrshr.s16 q9, q9, #6
vld1.8 {d1}, [r0,:64], r2
vrshr.s16 q10, q10, #6
vld1.8 {d2}, [r0,:64], r2
vrshr.s16 q11, q11, #6
vld1.8 {d3}, [r0,:64], r2
vrshr.s16 q12, q12, #6
vld1.8 {d4}, [r0,:64], r2
vrshr.s16 q13, q13, #6
vld1.8 {d5}, [r0,:64], r2
vrshr.s16 q14, q14, #6
vld1.8 {d6}, [r0,:64], r2
vrshr.s16 q15, q15, #6
vld1.8 {d7}, [r0,:64], r2
vaddw.u8 q8, q8, d0
vaddw.u8 q9, q9, d1
vaddw.u8 q10, q10, d2
vqmovun.s16 d0, q8
vaddw.u8 q11, q11, d3
vqmovun.s16 d1, q9
vaddw.u8 q12, q12, d4
vqmovun.s16 d2, q10
vst1.8 {d0}, [r3,:64], r2
vaddw.u8 q13, q13, d5
vqmovun.s16 d3, q11
vst1.8 {d1}, [r3,:64], r2
vaddw.u8 q14, q14, d6
vqmovun.s16 d4, q12
vst1.8 {d2}, [r3,:64], r2
vaddw.u8 q15, q15, d7
vqmovun.s16 d5, q13
vst1.8 {d3}, [r3,:64], r2
vqmovun.s16 d6, q14
vqmovun.s16 d7, q15
vst1.8 {d4}, [r3,:64], r2
vst1.8 {d5}, [r3,:64], r2
vst1.8 {d6}, [r3,:64], r2
vst1.8 {d7}, [r3,:64], r2
sub r1, r1, #128
bx lr
endfunc
@ void ff_h264_idct8_dc_add_neon(uint8_t *dst, int16_t *block, int stride)
@ DC-only 8x8 fast path: broadcast (dc + 32) >> 6 and add it to the
@ 8x8 destination block; the DC coefficient is cleared.
function ff_h264_idct8_dc_add_neon, export=1
mov r3, #0
vld1.16 {d30[],d31[]},[r1,:16]
strh r3, [r1]
vld1.32 {d0}, [r0,:64], r2
vrshr.s16 q15, q15, #6
vld1.32 {d1}, [r0,:64], r2
vld1.32 {d2}, [r0,:64], r2
vaddw.u8 q8, q15, d0
vld1.32 {d3}, [r0,:64], r2
vaddw.u8 q9, q15, d1
vld1.32 {d4}, [r0,:64], r2
vaddw.u8 q10, q15, d2
vld1.32 {d5}, [r0,:64], r2
vaddw.u8 q11, q15, d3
vld1.32 {d6}, [r0,:64], r2
vaddw.u8 q12, q15, d4
vld1.32 {d7}, [r0,:64], r2
vaddw.u8 q13, q15, d5
vaddw.u8 q14, q15, d6
vaddw.u8 q15, q15, d7
vqmovun.s16 d0, q8
vqmovun.s16 d1, q9
vqmovun.s16 d2, q10
vqmovun.s16 d3, q11
sub r0, r0, r2, lsl #3
vst1.32 {d0}, [r0,:64], r2
vqmovun.s16 d4, q12
vst1.32 {d1}, [r0,:64], r2
vqmovun.s16 d5, q13
vst1.32 {d2}, [r0,:64], r2
vqmovun.s16 d6, q14
vst1.32 {d3}, [r0,:64], r2
vqmovun.s16 d7, q15
vst1.32 {d4}, [r0,:64], r2
vst1.32 {d5}, [r0,:64], r2
vst1.32 {d6}, [r0,:64], r2
vst1.32 {d7}, [r0,:64], r2
bx lr
endfunc
@ void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset,
@     int16_t *block, int stride, const uint8_t nnzc[6*8])
@ Loop over the four luma 8x8 blocks (scan8 stepped by 4) with the
@ same skip / DC-only / full-transform dispatch as idct_add16.
function ff_h264_idct8_add4_neon, export=1
push {r4-r8,lr}
mov r4, r0 @ dst base
mov r5, r1 @ block_offset
mov r1, r2 @ block
mov r2, r3 @ stride
ldr r6, [sp, #24] @ nnzc
movrel r7, scan8
mov r12, #16
1: ldrb r8, [r7], #4
ldr r0, [r5], #16
ldrb r8, [r6, r8]
subs r8, r8, #1
blt 2f
ldrsh lr, [r1] @ DC coefficient
add r0, r0, r4
it ne
movne lr, #0
cmp lr, #0
ite ne
adrne lr, ff_h264_idct8_dc_add_neon + CONFIG_THUMB
adreq lr, ff_h264_idct8_add_neon + CONFIG_THUMB
blx lr
2: subs r12, r12, #4
add r1, r1, #128
bne 1b
pop {r4-r8,pc}
endfunc
@ scan8[]: position of each 4x4 block in the 8-wide non_zero_count
@ cache, indexed by block scan order (read with step 1 above for 4x4
@ blocks, step 4 for 8x8 blocks).
const scan8
.byte 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
.byte 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
.byte 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
.byte 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
.byte 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
.byte 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
.byte 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
.byte 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
.byte 4+11*8, 5+11*8, 4+12*8, 5+12*8
.byte 6+11*8, 7+11*8, 6+12*8, 7+12*8
.byte 4+13*8, 5+13*8, 4+14*8, 5+14*8
.byte 6+13*8, 7+13*8, 6+14*8, 7+14*8
endconst

View File

@@ -0,0 +1,92 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/h264pred.h"
void ff_pred16x16_vert_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred16x16_hor_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred16x16_plane_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred16x16_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred16x16_128_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred16x16_left_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred16x16_top_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_vert_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_hor_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_plane_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_128_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_left_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_top_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_l0t_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_0lt_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_l00_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_0l0_dc_neon(uint8_t *src, ptrdiff_t stride);
/**
 * Install the NEON intra-prediction functions (8-bit depths only).
 * The 8x8 chroma predictors are installed only for
 * chroma_format_idc == 1; plane and full-DC variants are withheld
 * for the codecs that define them differently (SVQ3, RV40, VP8),
 * matching the original checks.
 */
static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id,
                                        const int bit_depth,
                                        const int chroma_format_idc)
{
#if HAVE_NEON
    if (bit_depth > 8)
        return;

    h->pred16x16[DC_PRED8x8     ] = ff_pred16x16_dc_neon;
    h->pred16x16[VERT_PRED8x8   ] = ff_pred16x16_vert_neon;
    h->pred16x16[HOR_PRED8x8    ] = ff_pred16x16_hor_neon;
    h->pred16x16[LEFT_DC_PRED8x8] = ff_pred16x16_left_dc_neon;
    h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon;
    h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_neon;
    if (codec_id != AV_CODEC_ID_SVQ3 &&
        codec_id != AV_CODEC_ID_RV40 &&
        codec_id != AV_CODEC_ID_VP8)
        h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_neon;

    if (chroma_format_idc == 1) {
        h->pred8x8[VERT_PRED8x8  ] = ff_pred8x8_vert_neon;
        h->pred8x8[HOR_PRED8x8   ] = ff_pred8x8_hor_neon;
        h->pred8x8[DC_128_PRED8x8] = ff_pred8x8_128_dc_neon;
        if (codec_id != AV_CODEC_ID_VP8)
            h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_neon;
        if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP8) {
            h->pred8x8[DC_PRED8x8      ] = ff_pred8x8_dc_neon;
            h->pred8x8[LEFT_DC_PRED8x8 ] = ff_pred8x8_left_dc_neon;
            h->pred8x8[TOP_DC_PRED8x8  ] = ff_pred8x8_top_dc_neon;
            h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon;
            h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon;
            h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon;
            h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon;
        }
    }
#endif // HAVE_NEON
}
/**
 * ARM entry point for H.264 intra-prediction initialisation.
 * Queries the CPU feature flags once and, when NEON is available,
 * delegates to the NEON-specific init; otherwise leaves @h untouched.
 */
av_cold void ff_h264_pred_init_arm(H264PredContext *h, int codec_id,
int bit_depth, const int chroma_format_idc)
{
const int flags = av_get_cpu_flags();

if (!have_neon(flags))
return;

h264_pred_init_neon(h, codec_id, bit_depth, chroma_format_idc);
}

View File

@@ -0,0 +1,359 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
@ Load a column of \n bytes (stride \rt, \rs post-incremented) into the
@ lanes of d-register \rd.  \n is 4 or 8; for \n == 4, \hi selects the
@ destination half: hi=0 fills lanes 0-3, hi=1 fills lanes 4-7.
.macro ldcol.8 rd, rs, rt, n=8, hi=0
.if \n == 8 || \hi == 0
vld1.8 {\rd[0]}, [\rs], \rt
vld1.8 {\rd[1]}, [\rs], \rt
vld1.8 {\rd[2]}, [\rs], \rt
vld1.8 {\rd[3]}, [\rs], \rt
.endif
.if \n == 8 || \hi == 1
vld1.8 {\rd[4]}, [\rs], \rt
vld1.8 {\rd[5]}, [\rs], \rt
vld1.8 {\rd[6]}, [\rs], \rt
vld1.8 {\rd[7]}, [\rs], \rt
.endif
.endm
@ Sum 16 bytes for DC prediction: widen-add the 8-byte vectors \rl and
@ \rh into the u16 vector \dq (whose halves are \dl/\dh), then fold
@ \dl+\dh and pairwise-reduce twice so the lanes of \dl hold the total.
.macro add16x8 dq, dl, dh, rl, rh
vaddl.u8 \dq, \rl, \rh
vadd.u16 \dl, \dl, \dh
vpadd.u16 \dl, \dl, \dl
vpadd.u16 \dl, \dl, \dl
.endm
@ void ff_pred16x16_128_dc_neon(uint8_t *src, ptrdiff_t stride)
@ No neighbours available: fill the 16x16 block with the constant 128
@ via the shared DC store tail.
function ff_pred16x16_128_dc_neon, export=1
vmov.i8 q0, #128
b .L_pred16x16_dc_end
endfunc
@ void ff_pred16x16_top_dc_neon(uint8_t *src, ptrdiff_t stride)
@ DC from the 16 samples of the row above (r0 - stride):
@ sum, round ((x+8)>>4), broadcast to all of q0, then store.
function ff_pred16x16_top_dc_neon, export=1
sub r2, r0, r1
vld1.8 {q0}, [r2,:128]
add16x8 q0, d0, d1, d0, d1
vrshrn.u16 d0, q0, #4
vdup.8 q0, d0[0]
b .L_pred16x16_dc_end
endfunc
@ void ff_pred16x16_left_dc_neon(uint8_t *src, ptrdiff_t stride)
@ DC from the 16 samples of the column left of the block (src - 1):
@ two ldcol.8 loads walk 16 rows, then sum, round >>4 and broadcast.
function ff_pred16x16_left_dc_neon, export=1
sub r2, r0, #1
ldcol.8 d0, r2, r1
ldcol.8 d1, r2, r1
add16x8 q0, d0, d1, d0, d1
vrshrn.u16 d0, q0, #4
vdup.8 q0, d0[0]
b .L_pred16x16_dc_end
endfunc
@ void ff_pred16x16_dc_neon(uint8_t *src, ptrdiff_t stride)
@ Full DC: sum the 16 top neighbours and 16 left neighbours, round
@ ((x+16)>>5) and broadcast.  Ends in the store tail shared with the
@ 128/top/left DC variants above, which write 16 rows of q0.
function ff_pred16x16_dc_neon, export=1
sub r2, r0, r1
vld1.8 {q0}, [r2,:128]
sub r2, r0, #1
ldcol.8 d2, r2, r1
ldcol.8 d3, r2, r1
vaddl.u8 q0, d0, d1
vaddl.u8 q1, d2, d3
vadd.u16 q0, q0, q1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0, d0
vpadd.u16 d0, d0, d0
vrshrn.u16 d0, q0, #5
vdup.8 q0, d0[0]
@ Shared tail: store q0 into all 16 rows (two rows per iteration).
.L_pred16x16_dc_end:
mov r3, #8
6: vst1.8 {q0}, [r0,:128], r1
vst1.8 {q0}, [r0,:128], r1
subs r3, r3, #1
bne 6b
bx lr
endfunc
@ void ff_pred16x16_hor_neon(uint8_t *src, ptrdiff_t stride)
@ Horizontal prediction: replicate the left-neighbour byte of each of
@ the 16 rows across that whole row (vld1 with all-lanes broadcast).
function ff_pred16x16_hor_neon, export=1
sub r2, r0, #1
mov r3, #16
1: vld1.8 {d0[],d1[]},[r2], r1
vst1.8 {q0}, [r0,:128], r1
subs r3, r3, #1
bne 1b
bx lr
endfunc
@ void ff_pred16x16_vert_neon(uint8_t *src, ptrdiff_t stride)
@ Vertical prediction: copy the 16-byte row above the block into each
@ of the 16 rows (two stores per loop iteration).
function ff_pred16x16_vert_neon, export=1
sub r0, r0, r1
vld1.8 {q0}, [r0,:128], r1
mov r3, #8
1: vst1.8 {q0}, [r0,:128], r1
vst1.8 {q0}, [r0,:128], r1
subs r3, r3, #1
bne 1b
bx lr
endfunc
@ void ff_pred16x16_plane_neon(uint8_t *src, ptrdiff_t stride)
@ H.264 16x16 plane (gradient) prediction: derives horizontal and
@ vertical gradients from the border samples using the 1..8 weights in
@ p16weight, then generates each row as a clipped linear ramp.
@ NOTE(review): register/arithmetic flow follows the spec's a/b/c plane
@ parameters; verify any change against the H.264 8.3.3.4 formulas.
function ff_pred16x16_plane_neon, export=1
sub r3, r0, r1
add r2, r3, #8
sub r3, r3, #1
vld1.8 {d0}, [r3]
vld1.8 {d2}, [r2,:64], r1
ldcol.8 d1, r3, r1
add r3, r3, r1
ldcol.8 d3, r3, r1
vrev64.8 q0, q0
vaddl.u8 q8, d2, d3
vsubl.u8 q2, d2, d0
vsubl.u8 q3, d3, d1
@ weight the differences with 1..8 and reduce to the two gradient sums
movrel r3, p16weight
vld1.8 {q0}, [r3,:128]
vmul.s16 q2, q2, q0
vmul.s16 q3, q3, q0
vadd.i16 d4, d4, d5
vadd.i16 d5, d6, d7
vpadd.i16 d4, d4, d5
vpadd.i16 d4, d4, d4
vshll.s16 q3, d4, #2
vaddw.s16 q2, q3, d4
vrshrn.s32 d4, q2, #6
mov r3, #0
vtrn.16 d4, d5
vadd.i16 d2, d4, d5
vshl.i16 d3, d2, #3
vrev64.16 d16, d17
vsub.i16 d3, d3, d2
vadd.i16 d16, d16, d0
vshl.i16 d2, d16, #4
vsub.i16 d2, d2, d3
vshl.i16 d3, d4, #4
vext.16 q0, q0, q0, #7
vsub.i16 d6, d5, d3
vmov.16 d0[0], r3
vmul.i16 q0, q0, d4[0]
vdup.16 q1, d2[0]
vdup.16 q2, d4[0]
vdup.16 q3, d6[0]
vshl.i16 q2, q2, #3
vadd.i16 q1, q1, q0
vadd.i16 q3, q3, q2
@ emit 16 rows; vqshrun clips the 16-bit ramp back to unsigned bytes
mov r3, #16
1:
vqshrun.s16 d0, q1, #5
vadd.i16 q1, q1, q2
vqshrun.s16 d1, q1, #5
vadd.i16 q1, q1, q3
vst1.8 {q0}, [r0,:128], r1
subs r3, r3, #1
bne 1b
bx lr
endfunc
@ Weight table 1..8 used by the plane-prediction gradient sums.
const p16weight, align=4
.short 1,2,3,4,5,6,7,8
endconst
@ void ff_pred8x8_hor_neon(uint8_t *src, ptrdiff_t stride)
@ 8x8 horizontal prediction: broadcast each row's left neighbour.
function ff_pred8x8_hor_neon, export=1
sub r2, r0, #1
mov r3, #8
1: vld1.8 {d0[]}, [r2], r1
vst1.8 {d0}, [r0,:64], r1
subs r3, r3, #1
bne 1b
bx lr
endfunc
@ void ff_pred8x8_vert_neon(uint8_t *src, ptrdiff_t stride)
@ 8x8 vertical prediction: copy the row above into all eight rows.
function ff_pred8x8_vert_neon, export=1
sub r0, r0, r1
vld1.8 {d0}, [r0,:64], r1
mov r3, #4
1: vst1.8 {d0}, [r0,:64], r1
vst1.8 {d0}, [r0,:64], r1
subs r3, r3, #1
bne 1b
bx lr
endfunc
@ void ff_pred8x8_plane_neon(uint8_t *src, ptrdiff_t stride)
@ 8x8 (chroma) plane prediction, the 8-wide analogue of the 16x16
@ version above: gradients from the 4+4 border samples weighted 1..4,
@ then a clipped per-row linear ramp.
@ NOTE(review): mirrors the H.264 chroma plane formulas - keep the
@ instruction order intact when touching this.
function ff_pred8x8_plane_neon, export=1
sub r3, r0, r1
add r2, r3, #4
sub r3, r3, #1
vld1.32 {d0[0]}, [r3]
vld1.32 {d2[0]}, [r2,:32], r1
ldcol.8 d0, r3, r1, 4, hi=1
add r3, r3, r1
ldcol.8 d3, r3, r1, 4
vaddl.u8 q8, d2, d3
vrev32.8 d0, d0
vtrn.32 d2, d3
vsubl.u8 q2, d2, d0
movrel r3, p16weight
vld1.16 {q0}, [r3,:128]
vmul.s16 d4, d4, d0
vmul.s16 d5, d5, d0
vpadd.i16 d4, d4, d5
vpaddl.s16 d4, d4
vshl.i32 d5, d4, #4
vadd.s32 d4, d4, d5
vrshrn.s32 d4, q2, #5
mov r3, #0
vtrn.16 d4, d5
vadd.i16 d2, d4, d5
vshl.i16 d3, d2, #2
vrev64.16 d16, d16
vsub.i16 d3, d3, d2
vadd.i16 d16, d16, d0
vshl.i16 d2, d16, #4
vsub.i16 d2, d2, d3
vshl.i16 d3, d4, #3
vext.16 q0, q0, q0, #7
vsub.i16 d6, d5, d3
vmov.16 d0[0], r3
vmul.i16 q0, q0, d4[0]
vdup.16 q1, d2[0]
vdup.16 q2, d4[0]
vdup.16 q3, d6[0]
vshl.i16 q2, q2, #3
vadd.i16 q1, q1, q0
vadd.i16 q3, q3, q2
@ emit 8 rows, clipping the 16-bit ramp to bytes
mov r3, #8
1:
vqshrun.s16 d0, q1, #5
vadd.i16 q1, q1, q3
vst1.8 {d0}, [r0,:64], r1
subs r3, r3, #1
bne 1b
bx lr
endfunc
@ void ff_pred8x8_128_dc_neon(uint8_t *src, ptrdiff_t stride)
@ No neighbours: fill the 8x8 block with 128 via the shared store tail.
function ff_pred8x8_128_dc_neon, export=1
vmov.i8 q0, #128
b .L_pred8x8_dc_end
endfunc
@ void ff_pred8x8_top_dc_neon(uint8_t *src, ptrdiff_t stride)
@ DC from the row above only: each 4x4 quadrant column pair gets the
@ rounded average ((x+2)>>2) of its own four top samples.
function ff_pred8x8_top_dc_neon, export=1
sub r2, r0, r1
vld1.8 {d0}, [r2,:64]
vpaddl.u8 d0, d0
vpadd.u16 d0, d0, d0
vrshrn.u16 d0, q0, #2
vdup.8 d1, d0[1]
vdup.8 d0, d0[0]
vtrn.32 d0, d1
b .L_pred8x8_dc_end
endfunc
@ void ff_pred8x8_left_dc_neon(uint8_t *src, ptrdiff_t stride)
@ DC from the left column only: the upper four rows use the average of
@ the upper four left samples, the lower four rows the lower four.
function ff_pred8x8_left_dc_neon, export=1
sub r2, r0, #1
ldcol.8 d0, r2, r1
vpaddl.u8 d0, d0
vpadd.u16 d0, d0, d0
vrshrn.u16 d0, q0, #2
vdup.8 d1, d0[1]
vdup.8 d0, d0[0]
b .L_pred8x8_dc_end
endfunc
@ void ff_pred8x8_dc_neon(uint8_t *src, ptrdiff_t stride)
@ Full 8x8 (chroma) DC: each 4x4 quadrant gets its own DC as per the
@ H.264 chroma rule - corners combining top+left use (x+4)>>3, the
@ off-diagonal quadrants use only their own border with (x+2)>>2.
function ff_pred8x8_dc_neon, export=1
sub r2, r0, r1
vld1.8 {d0}, [r2,:64]
sub r2, r0, #1
ldcol.8 d1, r2, r1
vtrn.32 d0, d1
vpaddl.u8 q0, q0
vpadd.u16 d0, d0, d1
vpadd.u16 d1, d0, d0
vrshrn.u16 d2, q0, #3
vrshrn.u16 d3, q0, #2
vdup.8 d0, d2[4]
vdup.8 d1, d3[3]
vdup.8 d4, d3[2]
vdup.8 d5, d2[5]
vtrn.32 q0, q2
@ Shared tail: d0/d1 hold the top half rows, d4/d5 (via q2) were merged
@ into q0; store four row pairs, upper half at r0, lower at r0+4*stride.
.L_pred8x8_dc_end:
mov r3, #4
add r2, r0, r1, lsl #2
6: vst1.8 {d0}, [r0,:64], r1
vst1.8 {d1}, [r2,:64], r1
subs r3, r3, #1
bne 6b
bx lr
endfunc
@ void ff_pred8x8_l0t_dc_neon(uint8_t *src, ptrdiff_t stride)
@ Partial-neighbour DC variant: uses the full top row but only the
@ upper four left samples (lower-left unavailable); quadrant DCs are
@ derived accordingly and stored via the shared tail.
function ff_pred8x8_l0t_dc_neon, export=1
sub r2, r0, r1
vld1.8 {d0}, [r2,:64]
sub r2, r0, #1
ldcol.8 d1, r2, r1, 4
vtrn.32 d0, d1
vpaddl.u8 q0, q0
vpadd.u16 d0, d0, d1
vpadd.u16 d1, d0, d0
vrshrn.u16 d2, q0, #3
vrshrn.u16 d3, q0, #2
vdup.8 d0, d2[4]
vdup.8 d1, d3[0]
vdup.8 q2, d3[2]
vtrn.32 q0, q2
b .L_pred8x8_dc_end
endfunc
@ void ff_pred8x8_l00_dc_neon(uint8_t *src, ptrdiff_t stride)
@ Partial-neighbour DC variant: only the upper four left samples are
@ available; the top half uses their average, the bottom half 128.
function ff_pred8x8_l00_dc_neon, export=1
sub r2, r0, #1
ldcol.8 d0, r2, r1, 4
vpaddl.u8 d0, d0
vpadd.u16 d0, d0, d0
vrshrn.u16 d0, q0, #2
vmov.i8 d1, #128
vdup.8 d0, d0[0]
b .L_pred8x8_dc_end
endfunc
@ void ff_pred8x8_0lt_dc_neon(uint8_t *src, ptrdiff_t stride)
@ Partial-neighbour DC variant: full top row plus only the lower four
@ left samples (rows 4-7); quadrant DCs derived accordingly.
function ff_pred8x8_0lt_dc_neon, export=1
sub r2, r0, r1
vld1.8 {d0}, [r2,:64]
add r2, r0, r1, lsl #2
sub r2, r2, #1
ldcol.8 d1, r2, r1, 4, hi=1
vtrn.32 d0, d1
vpaddl.u8 q0, q0
vpadd.u16 d0, d0, d1
vpadd.u16 d1, d0, d0
vrshrn.u16 d3, q0, #2
vrshrn.u16 d2, q0, #3
vdup.8 d0, d3[0]
vdup.8 d1, d3[3]
vdup.8 d4, d3[2]
vdup.8 d5, d2[5]
vtrn.32 q0, q2
b .L_pred8x8_dc_end
endfunc
@ void ff_pred8x8_0l0_dc_neon(uint8_t *src, ptrdiff_t stride)
@ Partial-neighbour DC variant: only the lower four left samples are
@ available; the bottom half uses their average, the top half 128.
function ff_pred8x8_0l0_dc_neon, export=1
add r2, r0, r1, lsl #2
sub r2, r2, #1
ldcol.8 d1, r2, r1, 4
vpaddl.u8 d2, d1
vpadd.u16 d2, d2, d2
vrshrn.u16 d1, q1, #2
vmov.i8 d0, #128
vdup.8 d1, d1[0]
b .L_pred8x8_dc_end
endfunc

View File

@@ -0,0 +1,171 @@
/*
* ARM NEON optimised DSP functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/h264qpel.h"
void ff_put_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_put_h264_qpel16_mc10_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_put_h264_qpel16_mc20_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_put_h264_qpel16_mc30_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_put_h264_qpel16_mc01_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_put_h264_qpel16_mc11_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_put_h264_qpel16_mc21_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_put_h264_qpel16_mc31_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_put_h264_qpel16_mc02_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_put_h264_qpel16_mc12_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_put_h264_qpel16_mc22_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_put_h264_qpel16_mc32_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_put_h264_qpel16_mc03_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_put_h264_qpel16_mc13_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_put_h264_qpel16_mc23_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_put_h264_qpel16_mc33_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_put_h264_qpel8_mc00_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_put_h264_qpel8_mc10_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_put_h264_qpel8_mc20_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_put_h264_qpel8_mc30_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_put_h264_qpel8_mc01_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_put_h264_qpel8_mc11_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_put_h264_qpel8_mc21_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_put_h264_qpel8_mc31_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_put_h264_qpel8_mc02_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_put_h264_qpel8_mc12_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_put_h264_qpel8_mc22_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_put_h264_qpel8_mc32_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_put_h264_qpel8_mc03_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_put_h264_qpel8_mc13_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_put_h264_qpel8_mc23_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_put_h264_qpel8_mc33_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_avg_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_avg_h264_qpel16_mc10_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_avg_h264_qpel16_mc20_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_avg_h264_qpel16_mc30_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_avg_h264_qpel16_mc01_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_avg_h264_qpel16_mc11_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_avg_h264_qpel16_mc21_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_avg_h264_qpel16_mc31_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_avg_h264_qpel16_mc02_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_avg_h264_qpel16_mc12_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_avg_h264_qpel16_mc22_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_avg_h264_qpel16_mc32_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_avg_h264_qpel16_mc03_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_avg_h264_qpel16_mc13_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_avg_h264_qpel16_mc23_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_avg_h264_qpel16_mc33_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_avg_h264_qpel8_mc00_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_avg_h264_qpel8_mc10_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_avg_h264_qpel8_mc20_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_avg_h264_qpel8_mc30_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_avg_h264_qpel8_mc01_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_avg_h264_qpel8_mc11_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_avg_h264_qpel8_mc21_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_avg_h264_qpel8_mc31_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_avg_h264_qpel8_mc02_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_avg_h264_qpel8_mc12_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_avg_h264_qpel8_mc22_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_avg_h264_qpel8_mc32_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_avg_h264_qpel8_mc03_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_avg_h264_qpel8_mc13_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_avg_h264_qpel8_mc23_neon(uint8_t *, uint8_t *, ptrdiff_t);
void ff_avg_h264_qpel8_mc33_neon(uint8_t *, uint8_t *, ptrdiff_t);
/**
 * Install the NEON quarter-pel motion-compensation functions into @c.
 *
 * Table layout: [0] = 16x16 blocks, [1] = 8x8 blocks; the second index
 * is the quarter-pel position mcXY (X = horizontal, Y = vertical
 * fraction), enumerated 0..15.  Only 8-bit content is accelerated.
 */
av_cold void ff_h264qpel_init_arm(H264QpelContext *c, int bit_depth)
{
const int high_bit_depth = bit_depth > 8;
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags) && !high_bit_depth) {
/* put, 16x16 */
c->put_h264_qpel_pixels_tab[0][ 0] = ff_put_h264_qpel16_mc00_neon;
c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon;
c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon;
c->put_h264_qpel_pixels_tab[0][ 3] = ff_put_h264_qpel16_mc30_neon;
c->put_h264_qpel_pixels_tab[0][ 4] = ff_put_h264_qpel16_mc01_neon;
c->put_h264_qpel_pixels_tab[0][ 5] = ff_put_h264_qpel16_mc11_neon;
c->put_h264_qpel_pixels_tab[0][ 6] = ff_put_h264_qpel16_mc21_neon;
c->put_h264_qpel_pixels_tab[0][ 7] = ff_put_h264_qpel16_mc31_neon;
c->put_h264_qpel_pixels_tab[0][ 8] = ff_put_h264_qpel16_mc02_neon;
c->put_h264_qpel_pixels_tab[0][ 9] = ff_put_h264_qpel16_mc12_neon;
c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_neon;
c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_neon;
c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_neon;
c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_neon;
c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_neon;
c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_neon;
/* put, 8x8 */
c->put_h264_qpel_pixels_tab[1][ 0] = ff_put_h264_qpel8_mc00_neon;
c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon;
c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon;
c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon;
c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon;
c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon;
c->put_h264_qpel_pixels_tab[1][ 6] = ff_put_h264_qpel8_mc21_neon;
c->put_h264_qpel_pixels_tab[1][ 7] = ff_put_h264_qpel8_mc31_neon;
c->put_h264_qpel_pixels_tab[1][ 8] = ff_put_h264_qpel8_mc02_neon;
c->put_h264_qpel_pixels_tab[1][ 9] = ff_put_h264_qpel8_mc12_neon;
c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_neon;
c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_neon;
c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_neon;
c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_neon;
c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_neon;
c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_neon;
/* avg, 16x16 */
c->avg_h264_qpel_pixels_tab[0][ 0] = ff_avg_h264_qpel16_mc00_neon;
c->avg_h264_qpel_pixels_tab[0][ 1] = ff_avg_h264_qpel16_mc10_neon;
c->avg_h264_qpel_pixels_tab[0][ 2] = ff_avg_h264_qpel16_mc20_neon;
c->avg_h264_qpel_pixels_tab[0][ 3] = ff_avg_h264_qpel16_mc30_neon;
c->avg_h264_qpel_pixels_tab[0][ 4] = ff_avg_h264_qpel16_mc01_neon;
c->avg_h264_qpel_pixels_tab[0][ 5] = ff_avg_h264_qpel16_mc11_neon;
c->avg_h264_qpel_pixels_tab[0][ 6] = ff_avg_h264_qpel16_mc21_neon;
c->avg_h264_qpel_pixels_tab[0][ 7] = ff_avg_h264_qpel16_mc31_neon;
c->avg_h264_qpel_pixels_tab[0][ 8] = ff_avg_h264_qpel16_mc02_neon;
c->avg_h264_qpel_pixels_tab[0][ 9] = ff_avg_h264_qpel16_mc12_neon;
c->avg_h264_qpel_pixels_tab[0][10] = ff_avg_h264_qpel16_mc22_neon;
c->avg_h264_qpel_pixels_tab[0][11] = ff_avg_h264_qpel16_mc32_neon;
c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_neon;
c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_neon;
c->avg_h264_qpel_pixels_tab[0][14] = ff_avg_h264_qpel16_mc23_neon;
c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_neon;
/* avg, 8x8 */
c->avg_h264_qpel_pixels_tab[1][ 0] = ff_avg_h264_qpel8_mc00_neon;
c->avg_h264_qpel_pixels_tab[1][ 1] = ff_avg_h264_qpel8_mc10_neon;
c->avg_h264_qpel_pixels_tab[1][ 2] = ff_avg_h264_qpel8_mc20_neon;
c->avg_h264_qpel_pixels_tab[1][ 3] = ff_avg_h264_qpel8_mc30_neon;
c->avg_h264_qpel_pixels_tab[1][ 4] = ff_avg_h264_qpel8_mc01_neon;
c->avg_h264_qpel_pixels_tab[1][ 5] = ff_avg_h264_qpel8_mc11_neon;
c->avg_h264_qpel_pixels_tab[1][ 6] = ff_avg_h264_qpel8_mc21_neon;
c->avg_h264_qpel_pixels_tab[1][ 7] = ff_avg_h264_qpel8_mc31_neon;
c->avg_h264_qpel_pixels_tab[1][ 8] = ff_avg_h264_qpel8_mc02_neon;
c->avg_h264_qpel_pixels_tab[1][ 9] = ff_avg_h264_qpel8_mc12_neon;
c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_neon;
c->avg_h264_qpel_pixels_tab[1][11] = ff_avg_h264_qpel8_mc32_neon;
c->avg_h264_qpel_pixels_tab[1][12] = ff_avg_h264_qpel8_mc03_neon;
c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_neon;
c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_neon;
c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_neon;
}
}

View File

@@ -0,0 +1,955 @@
/*
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
#include "neon.S"
/* H.264 qpel MC */
@ Load the 6-tap filter constants into d6: movw/movt build 5 | (20<<16)
@ in \r, so d6[0]=5 and d6[1]=20 as 16-bit lanes.  The lowpass macros
@ below use vmla with d6[1] (+20) and vmls with d6[0] (-5) to realise
@ the H.264 (1,-5,20,20,-5,1) kernel.
.macro lowpass_const r
movw \r, #5
movt \r, #20
vmov.32 d6[0], \r
.endm
@ Apply the 6-tap horizontal lowpass to two 8-pixel rows.
@ (\r0,\r1) and (\r2,\r3) each hold 13+ consecutive source bytes; the
@ vext.8 extractions pick the five shifted neighbours per row.  With
@ narrow=1 the 16-bit results are rounded/clipped to bytes in \d0/\d1
@ (vqrshrun #5 = /32); with narrow=0 the raw 16-bit sums stay in the
@ q-registers \d0/\d1 for a following vertical pass.
.macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
.if \narrow
t0 .req q0
t1 .req q8
.else
t0 .req \d0
t1 .req \d1
.endif
vext.8 d2, \r0, \r1, #2
vext.8 d3, \r0, \r1, #3
vaddl.u8 q1, d2, d3
vext.8 d4, \r0, \r1, #1
vext.8 d5, \r0, \r1, #4
vaddl.u8 q2, d4, d5
vext.8 d30, \r0, \r1, #5
vaddl.u8 t0, \r0, d30
vext.8 d18, \r2, \r3, #2
vmla.i16 t0, q1, d6[1]
vext.8 d19, \r2, \r3, #3
vaddl.u8 q9, d18, d19
vext.8 d20, \r2, \r3, #1
vmls.i16 t0, q2, d6[0]
vext.8 d21, \r2, \r3, #4
vaddl.u8 q10, d20, d21
vext.8 d31, \r2, \r3, #5
vaddl.u8 t1, \r2, d31
vmla.i16 t1, q9, d6[1]
vmls.i16 t1, q10, d6[0]
.if \narrow
vqrshrun.s16 \d0, t0, #5
vqrshrun.s16 \d1, t1, #5
.endif
.unreq t0
.unreq t1
.endm
@ Single-row variant of lowpass_8: filter one 8-pixel row held in
@ (\r0,\r1); narrow selects clipped 8-bit output vs. raw 16-bit sums.
.macro lowpass_8_1 r0, r1, d0, narrow=1
.if \narrow
t0 .req q0
.else
t0 .req \d0
.endif
vext.8 d2, \r0, \r1, #2
vext.8 d3, \r0, \r1, #3
vaddl.u8 q1, d2, d3
vext.8 d4, \r0, \r1, #1
vext.8 d5, \r0, \r1, #4
vaddl.u8 q2, d4, d5
vext.8 d30, \r0, \r1, #5
vaddl.u8 t0, \r0, d30
vmla.i16 t0, q1, d6[1]
vmls.i16 t0, q2, d6[0]
.if \narrow
vqrshrun.s16 \d0, t0, #5
.endif
.unreq t0
.endm
@ 16-bit-input variant of the 6-tap lowpass, used for the second
@ (vertical) pass of the 2D hv filter.  Inputs are the intermediate
@ 16-bit sums; the tap weights 20/-5 are built from shifts
@ (x<<4 + x<<2 = 20x, x<<2 = 4x combined with the plain sum = 5x).
@ Output \d is rounded by #10 (/1024 total) and clipped to bytes.
.macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d
vext.16 q1, \r0, \r1, #2
vext.16 q0, \r0, \r1, #3
vaddl.s16 q9, d2, d0
vext.16 q2, \r0, \r1, #1
vaddl.s16 q1, d3, d1
vext.16 q3, \r0, \r1, #4
vaddl.s16 q10, d4, d6
vext.16 \r1, \r0, \r1, #5
vaddl.s16 q2, d5, d7
vaddl.s16 q0, \h0, \h1
vaddl.s16 q8, \l0, \l1
vshl.i32 q3, q9, #4
vshl.i32 q9, q9, #2
vshl.i32 q15, q10, #2
vadd.i32 q9, q9, q3
vadd.i32 q10, q10, q15
vshl.i32 q3, q1, #4
vshl.i32 q1, q1, #2
vshl.i32 q15, q2, #2
vadd.i32 q1, q1, q3
vadd.i32 q2, q2, q15
vadd.i32 q9, q9, q8
vsub.i32 q9, q9, q10
vadd.i32 q1, q1, q0
vsub.i32 q1, q1, q2
vrshrn.s32 d18, q9, #10
vrshrn.s32 d19, q1, #10
vqmovun.s16 \d, q9
.endm
@ Run the 8-wide horizontal lowpass twice to cover a 16-wide block,
@ with packed (8-byte pitch, r3 = #8) output; used to build the
@ intermediate buffer for combined h+v positions.  Preserves the
@ caller's lr in r4 across the first bl.
function put_h264_qpel16_h_lowpass_neon_packed
mov r4, lr
mov r12, #16
mov r3, #8
bl put_h264_qpel8_h_lowpass_neon
sub r1, r1, r2, lsl #4
add r1, r1, #8
mov r12, #16
mov lr, r4
b put_h264_qpel8_h_lowpass_neon
endfunc
@ Generate {put,avg}_h264_qpel{8,16}_h_lowpass_neon.
@ Registers: r0 = dst (stride r3), r1 = src (stride r2),
@ r12 = row count.  The 16-wide version processes the left 8 columns,
@ rewinds src/dst by 16 rows, steps 8 bytes right, and falls through
@ into the 8-wide body for the right half.  The avg variant
@ rounding-averages (vrhadd) the filtered rows with the existing dst.
.macro h264_qpel_h_lowpass type
function \type\()_h264_qpel16_h_lowpass_neon
push {lr}
mov r12, #16
bl \type\()_h264_qpel8_h_lowpass_neon
sub r0, r0, r3, lsl #4
sub r1, r1, r2, lsl #4
add r0, r0, #8
add r1, r1, #8
mov r12, #16
pop {lr}
endfunc
function \type\()_h264_qpel8_h_lowpass_neon
1: vld1.8 {d0, d1}, [r1], r2
vld1.8 {d16,d17}, [r1], r2
subs r12, r12, #2
lowpass_8 d0, d1, d16, d17, d0, d16
.ifc \type,avg
vld1.8 {d2}, [r0,:64], r3
vrhadd.u8 d0, d0, d2
vld1.8 {d3}, [r0,:64]
vrhadd.u8 d16, d16, d3
sub r0, r0, r3
.endif
vst1.8 {d0}, [r0,:64], r3
vst1.8 {d16}, [r0,:64], r3
bne 1b
bx lr
endfunc
.endm
h264_qpel_h_lowpass put
h264_qpel_h_lowpass avg
@ Generate {put,avg}_h264_qpel{8,16}_h_lowpass_l2_neon: horizontal
@ lowpass additionally rounding-averaged with a second source plane
@ (r3, same stride r2) - this realises the odd quarter-pel positions.
@ r0 = dst, r1 = filtered src, r3 = plain src, r12 = row count.
.macro h264_qpel_h_lowpass_l2 type
function \type\()_h264_qpel16_h_lowpass_l2_neon
push {lr}
mov r12, #16
bl \type\()_h264_qpel8_h_lowpass_l2_neon
sub r0, r0, r2, lsl #4
sub r1, r1, r2, lsl #4
sub r3, r3, r2, lsl #4
add r0, r0, #8
add r1, r1, #8
add r3, r3, #8
mov r12, #16
pop {lr}
endfunc
function \type\()_h264_qpel8_h_lowpass_l2_neon
1: vld1.8 {d0, d1}, [r1], r2
vld1.8 {d16,d17}, [r1], r2
vld1.8 {d28}, [r3], r2
vld1.8 {d29}, [r3], r2
subs r12, r12, #2
lowpass_8 d0, d1, d16, d17, d0, d1
vrhadd.u8 q0, q0, q14
.ifc \type,avg
vld1.8 {d2}, [r0,:64], r2
vrhadd.u8 d0, d0, d2
vld1.8 {d3}, [r0,:64]
vrhadd.u8 d1, d1, d3
sub r0, r0, r2
.endif
vst1.8 {d0}, [r0,:64], r2
vst1.8 {d1}, [r0,:64], r2
bne 1b
bx lr
endfunc
.endm
h264_qpel_h_lowpass_l2 put
h264_qpel_h_lowpass_l2 avg
@ Run the 8-wide vertical lowpass four times (two 8-row bands per
@ half, then the right 8 columns) with packed output (r2 = #8),
@ producing the intermediate plane for combined positions.
function put_h264_qpel16_v_lowpass_neon_packed
mov r4, lr
mov r2, #8
bl put_h264_qpel8_v_lowpass_neon
sub r1, r1, r3, lsl #2
bl put_h264_qpel8_v_lowpass_neon
sub r1, r1, r3, lsl #4
sub r1, r1, r3, lsl #2
add r1, r1, #8
bl put_h264_qpel8_v_lowpass_neon
sub r1, r1, r3, lsl #2
mov lr, r4
b put_h264_qpel8_v_lowpass_neon
endfunc
@ Generate {put,avg}_h264_qpel{8,16}_v_lowpass_neon.
@ The 8-wide body implements the vertical 6-tap filter by loading
@ 13 source rows (stride r3), transposing 8x8, reusing the horizontal
@ lowpass_8, and transposing back before storing 8 dst rows (stride
@ r2).  The avg variant folds existing dst rows in with vrhadd.
.macro h264_qpel_v_lowpass type
function \type\()_h264_qpel16_v_lowpass_neon
mov r4, lr
bl \type\()_h264_qpel8_v_lowpass_neon
sub r1, r1, r3, lsl #2
bl \type\()_h264_qpel8_v_lowpass_neon
sub r0, r0, r2, lsl #4
add r0, r0, #8
sub r1, r1, r3, lsl #4
sub r1, r1, r3, lsl #2
add r1, r1, #8
bl \type\()_h264_qpel8_v_lowpass_neon
sub r1, r1, r3, lsl #2
mov lr, r4
endfunc
function \type\()_h264_qpel8_v_lowpass_neon
vld1.8 {d8}, [r1], r3
vld1.8 {d10}, [r1], r3
vld1.8 {d12}, [r1], r3
vld1.8 {d14}, [r1], r3
vld1.8 {d22}, [r1], r3
vld1.8 {d24}, [r1], r3
vld1.8 {d26}, [r1], r3
vld1.8 {d28}, [r1], r3
vld1.8 {d9}, [r1], r3
vld1.8 {d11}, [r1], r3
vld1.8 {d13}, [r1], r3
vld1.8 {d15}, [r1], r3
vld1.8 {d23}, [r1]
transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
lowpass_8 d8, d9, d10, d11, d8, d10
lowpass_8 d12, d13, d14, d15, d12, d14
lowpass_8 d22, d23, d24, d25, d22, d24
lowpass_8 d26, d27, d28, d29, d26, d28
transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28
.ifc \type,avg
vld1.8 {d9}, [r0,:64], r2
vrhadd.u8 d8, d8, d9
vld1.8 {d11}, [r0,:64], r2
vrhadd.u8 d10, d10, d11
vld1.8 {d13}, [r0,:64], r2
vrhadd.u8 d12, d12, d13
vld1.8 {d15}, [r0,:64], r2
vrhadd.u8 d14, d14, d15
vld1.8 {d23}, [r0,:64], r2
vrhadd.u8 d22, d22, d23
vld1.8 {d25}, [r0,:64], r2
vrhadd.u8 d24, d24, d25
vld1.8 {d27}, [r0,:64], r2
vrhadd.u8 d26, d26, d27
vld1.8 {d29}, [r0,:64], r2
vrhadd.u8 d28, d28, d29
sub r0, r0, r2, lsl #3
.endif
vst1.8 {d8}, [r0,:64], r2
vst1.8 {d10}, [r0,:64], r2
vst1.8 {d12}, [r0,:64], r2
vst1.8 {d14}, [r0,:64], r2
vst1.8 {d22}, [r0,:64], r2
vst1.8 {d24}, [r0,:64], r2
vst1.8 {d26}, [r0,:64], r2
vst1.8 {d28}, [r0,:64], r2
bx lr
endfunc
.endm
h264_qpel_v_lowpass put
h264_qpel_v_lowpass avg
@ Generate {put,avg}_h264_qpel{8,16}_v_lowpass_l2_neon: vertical
@ lowpass (transpose / lowpass_8 / transpose, as above) additionally
@ rounding-averaged with a second plane read from r12 (stride r2)
@ before storing to r0 (stride r3).
.macro h264_qpel_v_lowpass_l2 type
function \type\()_h264_qpel16_v_lowpass_l2_neon
mov r4, lr
bl \type\()_h264_qpel8_v_lowpass_l2_neon
sub r1, r1, r3, lsl #2
bl \type\()_h264_qpel8_v_lowpass_l2_neon
sub r0, r0, r3, lsl #4
sub r12, r12, r2, lsl #4
add r0, r0, #8
add r12, r12, #8
sub r1, r1, r3, lsl #4
sub r1, r1, r3, lsl #2
add r1, r1, #8
bl \type\()_h264_qpel8_v_lowpass_l2_neon
sub r1, r1, r3, lsl #2
mov lr, r4
endfunc
function \type\()_h264_qpel8_v_lowpass_l2_neon
vld1.8 {d8}, [r1], r3
vld1.8 {d10}, [r1], r3
vld1.8 {d12}, [r1], r3
vld1.8 {d14}, [r1], r3
vld1.8 {d22}, [r1], r3
vld1.8 {d24}, [r1], r3
vld1.8 {d26}, [r1], r3
vld1.8 {d28}, [r1], r3
vld1.8 {d9}, [r1], r3
vld1.8 {d11}, [r1], r3
vld1.8 {d13}, [r1], r3
vld1.8 {d15}, [r1], r3
vld1.8 {d23}, [r1]
transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
lowpass_8 d8, d9, d10, d11, d8, d9
lowpass_8 d12, d13, d14, d15, d12, d13
lowpass_8 d22, d23, d24, d25, d22, d23
lowpass_8 d26, d27, d28, d29, d26, d27
transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27
vld1.8 {d0}, [r12], r2
vld1.8 {d1}, [r12], r2
vld1.8 {d2}, [r12], r2
vld1.8 {d3}, [r12], r2
vld1.8 {d4}, [r12], r2
vrhadd.u8 q0, q0, q4
vld1.8 {d5}, [r12], r2
vrhadd.u8 q1, q1, q6
vld1.8 {d10}, [r12], r2
vrhadd.u8 q2, q2, q11
vld1.8 {d11}, [r12], r2
vrhadd.u8 q5, q5, q13
.ifc \type,avg
vld1.8 {d16}, [r0,:64], r3
vrhadd.u8 d0, d0, d16
vld1.8 {d17}, [r0,:64], r3
vrhadd.u8 d1, d1, d17
vld1.8 {d16}, [r0,:64], r3
vrhadd.u8 d2, d2, d16
vld1.8 {d17}, [r0,:64], r3
vrhadd.u8 d3, d3, d17
vld1.8 {d16}, [r0,:64], r3
vrhadd.u8 d4, d4, d16
vld1.8 {d17}, [r0,:64], r3
vrhadd.u8 d5, d5, d17
vld1.8 {d16}, [r0,:64], r3
vrhadd.u8 d10, d10, d16
vld1.8 {d17}, [r0,:64], r3
vrhadd.u8 d11, d11, d17
sub r0, r0, r3, lsl #3
.endif
vst1.8 {d0}, [r0,:64], r3
vst1.8 {d1}, [r0,:64], r3
vst1.8 {d2}, [r0,:64], r3
vst1.8 {d3}, [r0,:64], r3
vst1.8 {d4}, [r0,:64], r3
vst1.8 {d5}, [r0,:64], r3
vst1.8 {d10}, [r0,:64], r3
vst1.8 {d11}, [r0,:64], r3
bx lr
endfunc
.endm
h264_qpel_v_lowpass_l2 put
h264_qpel_v_lowpass_l2 avg
@ Core of the 2D (hv) filter for one 8x8 block: horizontally lowpass
@ 13 source rows (stride r3) into a 16-bit scratch buffer at r4, then
@ transpose in-place and run the 16-bit vertical lowpass (lowpass_8.16,
@ /1024 rounding).  Results end up as bytes in d8-d15, transposed back
@ for the caller to store/average.  Clobbers q0-q15, r12.
function put_h264_qpel8_hv_lowpass_neon_top
lowpass_const r12
mov r12, #12
1: vld1.8 {d0, d1}, [r1], r3
vld1.8 {d16,d17}, [r1], r3
subs r12, r12, #2
lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0
vst1.8 {d22-d25}, [r4,:128]!
bne 1b
vld1.8 {d0, d1}, [r1]
lowpass_8_1 d0, d1, q12, narrow=0
@ walk the scratch buffer backwards (r12 = -16) and transpose
mov r12, #-16
add r4, r4, r12
vld1.8 {d30,d31}, [r4,:128], r12
vld1.8 {d20,d21}, [r4,:128], r12
vld1.8 {d18,d19}, [r4,:128], r12
vld1.8 {d16,d17}, [r4,:128], r12
vld1.8 {d14,d15}, [r4,:128], r12
vld1.8 {d12,d13}, [r4,:128], r12
vld1.8 {d10,d11}, [r4,:128], r12
vld1.8 {d8, d9}, [r4,:128], r12
vld1.8 {d6, d7}, [r4,:128], r12
vld1.8 {d4, d5}, [r4,:128], r12
vld1.8 {d2, d3}, [r4,:128], r12
vld1.8 {d0, d1}, [r4,:128]
swap4 d1, d3, d5, d7, d8, d10, d12, d14
transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7
swap4 d17, d19, d21, d31, d24, d26, d28, d22
transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11
vst1.8 {d30,d31}, [r4,:128]!
vst1.8 {d6, d7}, [r4,:128]!
vst1.8 {d20,d21}, [r4,:128]!
vst1.8 {d4, d5}, [r4,:128]!
vst1.8 {d18,d19}, [r4,:128]!
vst1.8 {d2, d3}, [r4,:128]!
vst1.8 {d16,d17}, [r4,:128]!
vst1.8 {d0, d1}, [r4,:128]
lowpass_8.16 q4, q12, d8, d9, d24, d25, d8
lowpass_8.16 q5, q13, d10, d11, d26, d27, d9
lowpass_8.16 q6, q14, d12, d13, d28, d29, d10
lowpass_8.16 q7, q11, d14, d15, d22, d23, d11
vld1.8 {d16,d17}, [r4,:128], r12
vld1.8 {d30,d31}, [r4,:128], r12
lowpass_8.16 q8, q15, d16, d17, d30, d31, d12
vld1.8 {d16,d17}, [r4,:128], r12
vld1.8 {d30,d31}, [r4,:128], r12
lowpass_8.16 q8, q15, d16, d17, d30, d31, d13
vld1.8 {d16,d17}, [r4,:128], r12
vld1.8 {d30,d31}, [r4,:128], r12
lowpass_8.16 q8, q15, d16, d17, d30, d31, d14
vld1.8 {d16,d17}, [r4,:128], r12
vld1.8 {d30,d31}, [r4,:128]
lowpass_8.16 q8, q15, d16, d17, d30, d31, d15
transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11
bx lr
endfunc
@ Generate {put,avg}_h264_qpel8_hv_lowpass_neon: run the shared hv
@ core, then store the eight filtered rows (d12-d15, d8-d11) to dst
@ (r0, stride r2), rounding-averaging with existing dst for avg.
.macro h264_qpel8_hv_lowpass type
function \type\()_h264_qpel8_hv_lowpass_neon
mov r10, lr
bl put_h264_qpel8_hv_lowpass_neon_top
.ifc \type,avg
vld1.8 {d0}, [r0,:64], r2
vrhadd.u8 d12, d12, d0
vld1.8 {d1}, [r0,:64], r2
vrhadd.u8 d13, d13, d1
vld1.8 {d2}, [r0,:64], r2
vrhadd.u8 d14, d14, d2
vld1.8 {d3}, [r0,:64], r2
vrhadd.u8 d15, d15, d3
vld1.8 {d4}, [r0,:64], r2
vrhadd.u8 d8, d8, d4
vld1.8 {d5}, [r0,:64], r2
vrhadd.u8 d9, d9, d5
vld1.8 {d6}, [r0,:64], r2
vrhadd.u8 d10, d10, d6
vld1.8 {d7}, [r0,:64], r2
vrhadd.u8 d11, d11, d7
sub r0, r0, r2, lsl #3
.endif
vst1.8 {d12}, [r0,:64], r2
vst1.8 {d13}, [r0,:64], r2
vst1.8 {d14}, [r0,:64], r2
vst1.8 {d15}, [r0,:64], r2
vst1.8 {d8}, [r0,:64], r2
vst1.8 {d9}, [r0,:64], r2
vst1.8 {d10}, [r0,:64], r2
vst1.8 {d11}, [r0,:64], r2
mov lr, r10
bx lr
endfunc
.endm
h264_qpel8_hv_lowpass put
h264_qpel8_hv_lowpass avg
@ Generate {put,avg}_h264_qpel8_hv_lowpass_l2_neon: hv core output
@ rounding-averaged with a second plane streamed from r2 before the
@ store to r0 (stride r3); avg folds the existing dst in as well.
.macro h264_qpel8_hv_lowpass_l2 type
function \type\()_h264_qpel8_hv_lowpass_l2_neon
mov r10, lr
bl put_h264_qpel8_hv_lowpass_neon_top
vld1.8 {d0, d1}, [r2,:128]!
vld1.8 {d2, d3}, [r2,:128]!
vrhadd.u8 q0, q0, q6
vld1.8 {d4, d5}, [r2,:128]!
vrhadd.u8 q1, q1, q7
vld1.8 {d6, d7}, [r2,:128]!
vrhadd.u8 q2, q2, q4
vrhadd.u8 q3, q3, q5
.ifc \type,avg
vld1.8 {d16}, [r0,:64], r3
vrhadd.u8 d0, d0, d16
vld1.8 {d17}, [r0,:64], r3
vrhadd.u8 d1, d1, d17
vld1.8 {d18}, [r0,:64], r3
vrhadd.u8 d2, d2, d18
vld1.8 {d19}, [r0,:64], r3
vrhadd.u8 d3, d3, d19
vld1.8 {d20}, [r0,:64], r3
vrhadd.u8 d4, d4, d20
vld1.8 {d21}, [r0,:64], r3
vrhadd.u8 d5, d5, d21
vld1.8 {d22}, [r0,:64], r3
vrhadd.u8 d6, d6, d22
vld1.8 {d23}, [r0,:64], r3
vrhadd.u8 d7, d7, d23
sub r0, r0, r3, lsl #3
.endif
vst1.8 {d0}, [r0,:64], r3
vst1.8 {d1}, [r0,:64], r3
vst1.8 {d2}, [r0,:64], r3
vst1.8 {d3}, [r0,:64], r3
vst1.8 {d4}, [r0,:64], r3
vst1.8 {d5}, [r0,:64], r3
vst1.8 {d6}, [r0,:64], r3
vst1.8 {d7}, [r0,:64], r3
mov lr, r10
bx lr
endfunc
.endm
h264_qpel8_hv_lowpass_l2 put
h264_qpel8_hv_lowpass_l2 avg
@ Generate the 16x16 hv wrappers: call the 8x8 hv (or hv_l2) routine
@ four times, stepping src/dst between the four 8x8 quadrants.  The
@ l2 variant points r2 at the packed intermediate buffer (r4 - 256).
.macro h264_qpel16_hv type
function \type\()_h264_qpel16_hv_lowpass_neon
mov r9, lr
bl \type\()_h264_qpel8_hv_lowpass_neon
sub r1, r1, r3, lsl #2
bl \type\()_h264_qpel8_hv_lowpass_neon
sub r1, r1, r3, lsl #4
sub r1, r1, r3, lsl #2
add r1, r1, #8
sub r0, r0, r2, lsl #4
add r0, r0, #8
bl \type\()_h264_qpel8_hv_lowpass_neon
sub r1, r1, r3, lsl #2
mov lr, r9
b \type\()_h264_qpel8_hv_lowpass_neon
endfunc
function \type\()_h264_qpel16_hv_lowpass_l2_neon
mov r9, lr
sub r2, r4, #256
bl \type\()_h264_qpel8_hv_lowpass_l2_neon
sub r1, r1, r3, lsl #2
bl \type\()_h264_qpel8_hv_lowpass_l2_neon
sub r1, r1, r3, lsl #4
sub r1, r1, r3, lsl #2
add r1, r1, #8
sub r0, r0, r3, lsl #4
add r0, r0, #8
bl \type\()_h264_qpel8_hv_lowpass_l2_neon
sub r1, r1, r3, lsl #2
mov lr, r9
b \type\()_h264_qpel8_hv_lowpass_l2_neon
endfunc
.endm
h264_qpel16_hv put
h264_qpel16_hv avg
@ Generate all sixteen 8x8 quarter-pel motion-compensation entry points
@ (ff_\type\()_h264_qpel8_mcXY_neon) for \type = put or avg.  Each entry
@ point only arranges the argument registers (r0 = dst, r1 = src,
@ r2 = stride) and any scratch stack space, then branches or calls into
@ the shared *_lowpass* helpers defined elsewhere in this file.
@ Lines prefixed A are assembled only for ARM mode, T only for Thumb
@ (asm.S convenience macros) -- they implement the same sp alignment.
.macro h264_qpel8 type
function ff_\type\()_h264_qpel8_mc10_neon, export=1
lowpass_const r3
mov r3, r1
sub r1, r1, #2
mov r12, #8
b \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc
function ff_\type\()_h264_qpel8_mc20_neon, export=1
lowpass_const r3
sub r1, r1, #2
mov r3, r2
mov r12, #8
b \type\()_h264_qpel8_h_lowpass_neon
endfunc
function ff_\type\()_h264_qpel8_mc30_neon, export=1
lowpass_const r3
@ second averaging input is src + 1 (one pixel to the right)
add r3, r1, #1
sub r1, r1, #2
mov r12, #8
b \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc
function ff_\type\()_h264_qpel8_mc01_neon, export=1
push {lr}
mov r12, r1
@ label reused by mc03, which enters with r12 = src + stride instead
\type\()_h264_qpel8_mc01:
lowpass_const r3
mov r3, r2
sub r1, r1, r2, lsl #1
vpush {d8-d15}
bl \type\()_h264_qpel8_v_lowpass_l2_neon
vpop {d8-d15}
pop {pc}
endfunc
function ff_\type\()_h264_qpel8_mc11_neon, export=1
push {r0, r1, r11, lr}
\type\()_h264_qpel8_mc11:
lowpass_const r3
@ align sp to 16 bytes and reserve a 64-byte temporary for the
@ horizontal pass; r11 keeps the original sp for restore/reload
mov r11, sp
A bic sp, sp, #15
T bic r0, r11, #15
T mov sp, r0
sub sp, sp, #64
mov r0, sp
sub r1, r1, #2
mov r3, #8
mov r12, #8
vpush {d8-d15}
bl put_h264_qpel8_h_lowpass_neon
@ reload the saved dst/src arguments pushed on entry
ldrd r0, r1, [r11], #8
mov r3, r2
add r12, sp, #64
sub r1, r1, r2, lsl #1
mov r2, #8
bl \type\()_h264_qpel8_v_lowpass_l2_neon
vpop {d8-d15}
mov sp, r11
pop {r11, pc}
endfunc
function ff_\type\()_h264_qpel8_mc21_neon, export=1
push {r0, r1, r4, r10, r11, lr}
\type\()_h264_qpel8_mc21:
lowpass_const r3
mov r11, sp
A bic sp, sp, #15
T bic r0, r11, #15
T mov sp, r0
@ scratch: 8x8 horizontal result plus 16x12 hv workspace
sub sp, sp, #(8*8+16*12)
sub r1, r1, #2
mov r3, #8
mov r0, sp
mov r12, #8
vpush {d8-d15}
bl put_h264_qpel8_h_lowpass_neon
mov r4, r0
ldrd r0, r1, [r11], #8
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, r2
sub r2, r4, #64
bl \type\()_h264_qpel8_hv_lowpass_l2_neon
vpop {d8-d15}
mov sp, r11
pop {r4, r10, r11, pc}
endfunc
function ff_\type\()_h264_qpel8_mc31_neon, export=1
@ same as mc11 but the saved src (reloaded via ldrd) is src + 1
add r1, r1, #1
push {r0, r1, r11, lr}
sub r1, r1, #1
b \type\()_h264_qpel8_mc11
endfunc
function ff_\type\()_h264_qpel8_mc02_neon, export=1
push {lr}
lowpass_const r3
sub r1, r1, r2, lsl #1
mov r3, r2
vpush {d8-d15}
bl \type\()_h264_qpel8_v_lowpass_neon
vpop {d8-d15}
pop {pc}
endfunc
function ff_\type\()_h264_qpel8_mc12_neon, export=1
push {r0, r1, r4, r10, r11, lr}
\type\()_h264_qpel8_mc12:
lowpass_const r3
mov r11, sp
A bic sp, sp, #15
T bic r0, r11, #15
T mov sp, r0
sub sp, sp, #(8*8+16*12)
sub r1, r1, r2, lsl #1
mov r3, r2
mov r2, #8
mov r0, sp
vpush {d8-d15}
bl put_h264_qpel8_v_lowpass_neon
mov r4, r0
ldrd r0, r1, [r11], #8
sub r1, r1, r3, lsl #1
sub r1, r1, #2
sub r2, r4, #64
bl \type\()_h264_qpel8_hv_lowpass_l2_neon
vpop {d8-d15}
mov sp, r11
pop {r4, r10, r11, pc}
endfunc
function ff_\type\()_h264_qpel8_mc22_neon, export=1
push {r4, r10, r11, lr}
mov r11, sp
A bic sp, sp, #15
T bic r4, r11, #15
T mov sp, r4
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, r2
sub sp, sp, #(16*12)
mov r4, sp
vpush {d8-d15}
bl \type\()_h264_qpel8_hv_lowpass_neon
vpop {d8-d15}
mov sp, r11
pop {r4, r10, r11, pc}
endfunc
@ The remaining positions reuse the mc01/mc11/mc12/mc21 bodies with the
@ source shifted right (+1) and/or down (+stride) before entry.
function ff_\type\()_h264_qpel8_mc32_neon, export=1
push {r0, r1, r4, r10, r11, lr}
add r1, r1, #1
b \type\()_h264_qpel8_mc12
endfunc
function ff_\type\()_h264_qpel8_mc03_neon, export=1
push {lr}
add r12, r1, r2
b \type\()_h264_qpel8_mc01
endfunc
function ff_\type\()_h264_qpel8_mc13_neon, export=1
push {r0, r1, r11, lr}
add r1, r1, r2
b \type\()_h264_qpel8_mc11
endfunc
function ff_\type\()_h264_qpel8_mc23_neon, export=1
push {r0, r1, r4, r10, r11, lr}
add r1, r1, r2
b \type\()_h264_qpel8_mc21
endfunc
function ff_\type\()_h264_qpel8_mc33_neon, export=1
add r1, r1, #1
push {r0, r1, r11, lr}
add r1, r1, r2
sub r1, r1, #1
b \type\()_h264_qpel8_mc11
endfunc
.endm
@ Instantiate the 8x8 quarter-pel entry points for both operation types.
h264_qpel8 put
h264_qpel8 avg
@ Generate all sixteen 16x16 quarter-pel motion-compensation entry points
@ (ff_\type\()_h264_qpel16_mcXY_neon) for \type = put or avg.  Structure
@ mirrors the h264_qpel8 macro above, with 16-wide helpers and larger
@ stack scratch buffers.
.macro h264_qpel16 type
function ff_\type\()_h264_qpel16_mc10_neon, export=1
lowpass_const r3
mov r3, r1
sub r1, r1, #2
b \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc
function ff_\type\()_h264_qpel16_mc20_neon, export=1
lowpass_const r3
sub r1, r1, #2
mov r3, r2
b \type\()_h264_qpel16_h_lowpass_neon
endfunc
function ff_\type\()_h264_qpel16_mc30_neon, export=1
lowpass_const r3
add r3, r1, #1
sub r1, r1, #2
b \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc
function ff_\type\()_h264_qpel16_mc01_neon, export=1
push {r4, lr}
mov r12, r1
@ label reused by mc03, which enters with r12 = src + stride instead
\type\()_h264_qpel16_mc01:
lowpass_const r3
mov r3, r2
sub r1, r1, r2, lsl #1
vpush {d8-d15}
bl \type\()_h264_qpel16_v_lowpass_l2_neon
vpop {d8-d15}
pop {r4, pc}
endfunc
function ff_\type\()_h264_qpel16_mc11_neon, export=1
push {r0, r1, r4, r11, lr}
\type\()_h264_qpel16_mc11:
lowpass_const r3
@ align sp and reserve a 256-byte temporary for the horizontal pass
mov r11, sp
A bic sp, sp, #15
T bic r0, r11, #15
T mov sp, r0
sub sp, sp, #256
mov r0, sp
sub r1, r1, #2
mov r3, #16
vpush {d8-d15}
bl put_h264_qpel16_h_lowpass_neon
ldrd r0, r1, [r11], #8
mov r3, r2
add r12, sp, #64
sub r1, r1, r2, lsl #1
mov r2, #16
bl \type\()_h264_qpel16_v_lowpass_l2_neon
vpop {d8-d15}
mov sp, r11
pop {r4, r11, pc}
endfunc
function ff_\type\()_h264_qpel16_mc21_neon, export=1
push {r0, r1, r4-r5, r9-r11, lr}
\type\()_h264_qpel16_mc21:
lowpass_const r3
mov r11, sp
A bic sp, sp, #15
T bic r0, r11, #15
T mov sp, r0
@ scratch: 16x16 horizontal result plus 16x12 hv workspace
sub sp, sp, #(16*16+16*12)
sub r1, r1, #2
mov r0, sp
vpush {d8-d15}
bl put_h264_qpel16_h_lowpass_neon_packed
mov r4, r0
ldrd r0, r1, [r11], #8
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, r2
bl \type\()_h264_qpel16_hv_lowpass_l2_neon
vpop {d8-d15}
mov sp, r11
pop {r4-r5, r9-r11, pc}
endfunc
function ff_\type\()_h264_qpel16_mc31_neon, export=1
@ same as mc11 but the saved src (reloaded via ldrd) is src + 1
add r1, r1, #1
push {r0, r1, r4, r11, lr}
sub r1, r1, #1
b \type\()_h264_qpel16_mc11
endfunc
function ff_\type\()_h264_qpel16_mc02_neon, export=1
push {r4, lr}
lowpass_const r3
sub r1, r1, r2, lsl #1
mov r3, r2
vpush {d8-d15}
bl \type\()_h264_qpel16_v_lowpass_neon
vpop {d8-d15}
pop {r4, pc}
endfunc
function ff_\type\()_h264_qpel16_mc12_neon, export=1
push {r0, r1, r4-r5, r9-r11, lr}
\type\()_h264_qpel16_mc12:
lowpass_const r3
mov r11, sp
A bic sp, sp, #15
T bic r0, r11, #15
T mov sp, r0
sub sp, sp, #(16*16+16*12)
sub r1, r1, r2, lsl #1
mov r0, sp
mov r3, r2
vpush {d8-d15}
bl put_h264_qpel16_v_lowpass_neon_packed
mov r4, r0
ldrd r0, r1, [r11], #8
sub r1, r1, r3, lsl #1
sub r1, r1, #2
mov r2, r3
bl \type\()_h264_qpel16_hv_lowpass_l2_neon
vpop {d8-d15}
mov sp, r11
pop {r4-r5, r9-r11, pc}
endfunc
function ff_\type\()_h264_qpel16_mc22_neon, export=1
push {r4, r9-r11, lr}
lowpass_const r3
mov r11, sp
A bic sp, sp, #15
T bic r4, r11, #15
T mov sp, r4
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, r2
sub sp, sp, #(16*12)
mov r4, sp
vpush {d8-d15}
bl \type\()_h264_qpel16_hv_lowpass_neon
vpop {d8-d15}
mov sp, r11
pop {r4, r9-r11, pc}
endfunc
@ The remaining positions reuse the mc01/mc11/mc12/mc21 bodies with the
@ source shifted right (+1) and/or down (+stride) before entry.
function ff_\type\()_h264_qpel16_mc32_neon, export=1
push {r0, r1, r4-r5, r9-r11, lr}
add r1, r1, #1
b \type\()_h264_qpel16_mc12
endfunc
function ff_\type\()_h264_qpel16_mc03_neon, export=1
push {r4, lr}
add r12, r1, r2
b \type\()_h264_qpel16_mc01
endfunc
function ff_\type\()_h264_qpel16_mc13_neon, export=1
push {r0, r1, r4, r11, lr}
add r1, r1, r2
b \type\()_h264_qpel16_mc11
endfunc
function ff_\type\()_h264_qpel16_mc23_neon, export=1
push {r0, r1, r4-r5, r9-r11, lr}
add r1, r1, r2
b \type\()_h264_qpel16_mc21
endfunc
function ff_\type\()_h264_qpel16_mc33_neon, export=1
add r1, r1, #1
push {r0, r1, r4, r11, lr}
add r1, r1, r2
sub r1, r1, #1
b \type\()_h264_qpel16_mc11
endfunc
.endm
@ Instantiate the 16x16 quarter-pel entry points for both operation types.
h264_qpel16 put
h264_qpel16 avg

View File

@@ -0,0 +1,611 @@
@
@ ARMv4 optimized DSP utils
@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
@
@ This file is part of FFmpeg.
@
@ FFmpeg is free software; you can redistribute it and/or
@ modify it under the terms of the GNU Lesser General Public
@ License as published by the Free Software Foundation; either
@ version 2.1 of the License, or (at your option) any later version.
@
@ FFmpeg is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
@ Lesser General Public License for more details.
@
@ You should have received a copy of the GNU Lesser General Public
@ License along with FFmpeg; if not, write to the Free Software
@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@
#include "config.h"
#include "libavutil/arm/asm.S"
@ pld (cache preload) is only available from ARMv5TE on; on older cores
@ define it away to the assembler comment character so the pld lines
@ below become no-ops.
#if !HAVE_ARMV5TE_EXTERNAL
#define pld @
#endif
@ Produce four word-aligned output words from five consecutive source
@ words, shifting the byte stream right by \shift bytes (little-endian).
@ Used to realign a 16-byte row loaded from an unaligned source pointer.
.macro ALIGN_QWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4
mov \Rd0, \Rn0, lsr #(\shift * 8)
mov \Rd1, \Rn1, lsr #(\shift * 8)
mov \Rd2, \Rn2, lsr #(\shift * 8)
mov \Rd3, \Rn3, lsr #(\shift * 8)
orr \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8)
orr \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8)
orr \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8)
orr \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8)
.endm
@ In-place variant: realign two words (\R0,\R1) from three source words,
@ shifting the byte stream right by \shift bytes.  \R2 is only read.
.macro ALIGN_DWORD shift, R0, R1, R2
mov \R0, \R0, lsr #(\shift * 8)
orr \R0, \R0, \R1, lsl #(32 - \shift * 8)
mov \R1, \R1, lsr #(\shift * 8)
orr \R1, \R1, \R2, lsl #(32 - \shift * 8)
.endm
@ Non-destructive variant: write the realigned double word into separate
@ destination registers, leaving the three source words intact.
.macro ALIGN_DWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2
mov \Rdst0, \Rsrc0, lsr #(\shift * 8)
mov \Rdst1, \Rsrc1, lsr #(\shift * 8)
orr \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8))
orr \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8))
.endm
@ Per-byte rounding average of two 8-byte rows (Rn0:Rn1 and Rm0:Rm1)
@ without SIMD instructions, using the identity given below.
.macro RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
@ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
@ Rmask = 0xFEFEFEFE
@ Rn = destroyed
eor \Rd0, \Rn0, \Rm0
eor \Rd1, \Rn1, \Rm1
orr \Rn0, \Rn0, \Rm0
orr \Rn1, \Rn1, \Rm1
and \Rd0, \Rd0, \Rmask
and \Rd1, \Rd1, \Rmask
sub \Rd0, \Rn0, \Rd0, lsr #1
sub \Rd1, \Rn1, \Rd1, lsr #1
.endm
@ Per-byte truncating (no-rounding) average of two 8-byte rows; same
@ trick as RND_AVG32 but built on AND/ADD instead of ORR/SUB.
.macro NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
@ Rd = (Rn & Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
@ Rmask = 0xFEFEFEFE
@ Rn = destroyed
eor \Rd0, \Rn0, \Rm0
eor \Rd1, \Rn1, \Rm1
and \Rn0, \Rn0, \Rm0
and \Rn1, \Rn1, \Rm1
and \Rd0, \Rd0, \Rmask
and \Rd1, \Rd1, \Rmask
add \Rd0, \Rn0, \Rd0, lsr #1
add \Rd1, \Rn1, \Rd1, lsr #1
.endm
@ Dispatch on the low two bits of the source pointer \reg: word-align
@ \reg and branch forward to local label 1/2/3/4 for byte alignment
@ 0/1/2/3 respectively.  Each caller provides those four code paths.
.macro JMP_ALIGN tmp, reg
ands \tmp, \reg, #3
bic \reg, \reg, #3
beq 1f
subs \tmp, \tmp, #1
beq 2f
subs \tmp, \tmp, #1
beq 3f
b 4f
.endm
@ ----------------------------------------------------------------
.align 5
@ Copy a 16-byte-wide block of h rows; one loop body per source
@ alignment (see JMP_ALIGN), realigning with ALIGN_QWORD_D as needed.
function ff_put_pixels16_arm, export=1
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
@ block = word aligned, pixels = unaligned
pld [r1]
push {r4-r11, lr}
JMP_ALIGN r5, r1
1:
ldm r1, {r4-r7}
add r1, r1, r2
stm r0, {r4-r7}
pld [r1]
subs r3, r3, #1
add r0, r0, r2
bne 1b
pop {r4-r11, pc}
.align 5
2:
ldm r1, {r4-r8}
add r1, r1, r2
ALIGN_QWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8
pld [r1]
subs r3, r3, #1
stm r0, {r9-r12}
add r0, r0, r2
bne 2b
pop {r4-r11, pc}
.align 5
3:
ldm r1, {r4-r8}
add r1, r1, r2
ALIGN_QWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8
pld [r1]
subs r3, r3, #1
stm r0, {r9-r12}
add r0, r0, r2
bne 3b
pop {r4-r11, pc}
.align 5
4:
ldm r1, {r4-r8}
add r1, r1, r2
ALIGN_QWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8
pld [r1]
subs r3, r3, #1
stm r0, {r9-r12}
add r0, r0, r2
bne 4b
pop {r4-r11,pc}
endfunc
@ ----------------------------------------------------------------
.align 5
@ Copy an 8-byte-wide block of h rows; four alignment-specific loops.
function ff_put_pixels8_arm, export=1
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
@ block = word aligned, pixels = unaligned
pld [r1]
push {r4-r5,lr}
JMP_ALIGN r5, r1
1:
ldm r1, {r4-r5}
add r1, r1, r2
subs r3, r3, #1
pld [r1]
stm r0, {r4-r5}
add r0, r0, r2
bne 1b
pop {r4-r5,pc}
.align 5
2:
ldm r1, {r4-r5, r12}
add r1, r1, r2
ALIGN_DWORD 1, r4, r5, r12
pld [r1]
subs r3, r3, #1
stm r0, {r4-r5}
add r0, r0, r2
bne 2b
pop {r4-r5,pc}
.align 5
3:
ldm r1, {r4-r5, r12}
add r1, r1, r2
ALIGN_DWORD 2, r4, r5, r12
pld [r1]
subs r3, r3, #1
stm r0, {r4-r5}
add r0, r0, r2
bne 3b
pop {r4-r5,pc}
.align 5
4:
ldm r1, {r4-r5, r12}
add r1, r1, r2
ALIGN_DWORD 3, r4, r5, r12
pld [r1]
subs r3, r3, #1
stm r0, {r4-r5}
add r0, r0, r2
bne 4b
pop {r4-r5,pc}
endfunc
@ ----------------------------------------------------------------
.align 5
@ 8-wide copy with rounding horizontal half-pel average (pixel averaged
@ with its right neighbour); r12 holds the 0xFEFEFEFE mask.
function ff_put_pixels8_x2_arm, export=1
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
@ block = word aligned, pixels = unaligned
pld [r1]
push {r4-r10,lr}
ldr r12, =0xfefefefe
JMP_ALIGN r5, r1
1:
ldm r1, {r4-r5, r10}
add r1, r1, r2
ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
pld [r1]
RND_AVG32 r8, r9, r4, r5, r6, r7, r12
subs r3, r3, #1
stm r0, {r8-r9}
add r0, r0, r2
bne 1b
pop {r4-r10,pc}
.align 5
2:
ldm r1, {r4-r5, r10}
add r1, r1, r2
ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
ALIGN_DWORD_D 2, r8, r9, r4, r5, r10
pld [r1]
RND_AVG32 r4, r5, r6, r7, r8, r9, r12
subs r3, r3, #1
stm r0, {r4-r5}
add r0, r0, r2
bne 2b
pop {r4-r10,pc}
.align 5
3:
ldm r1, {r4-r5, r10}
add r1, r1, r2
ALIGN_DWORD_D 2, r6, r7, r4, r5, r10
ALIGN_DWORD_D 3, r8, r9, r4, r5, r10
pld [r1]
RND_AVG32 r4, r5, r6, r7, r8, r9, r12
subs r3, r3, #1
stm r0, {r4-r5}
add r0, r0, r2
bne 3b
pop {r4-r10,pc}
.align 5
4:
ldm r1, {r4-r5, r10}
add r1, r1, r2
ALIGN_DWORD_D 3, r6, r7, r4, r5, r10
pld [r1]
RND_AVG32 r8, r9, r6, r7, r5, r10, r12
subs r3, r3, #1
stm r0, {r8-r9}
add r0, r0, r2
bne 4b
pop {r4-r10,pc}
endfunc
.align 5
@ Same as ff_put_pixels8_x2_arm but with the truncating (no-rounding)
@ average NO_RND_AVG32.
function ff_put_no_rnd_pixels8_x2_arm, export=1
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
@ block = word aligned, pixels = unaligned
pld [r1]
push {r4-r10,lr}
ldr r12, =0xfefefefe
JMP_ALIGN r5, r1
1:
ldm r1, {r4-r5, r10}
add r1, r1, r2
ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
pld [r1]
NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
subs r3, r3, #1
stm r0, {r8-r9}
add r0, r0, r2
bne 1b
pop {r4-r10,pc}
.align 5
2:
ldm r1, {r4-r5, r10}
add r1, r1, r2
ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
ALIGN_DWORD_D 2, r8, r9, r4, r5, r10
pld [r1]
NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
subs r3, r3, #1
stm r0, {r4-r5}
add r0, r0, r2
bne 2b
pop {r4-r10,pc}
.align 5
3:
ldm r1, {r4-r5, r10}
add r1, r1, r2
ALIGN_DWORD_D 2, r6, r7, r4, r5, r10
ALIGN_DWORD_D 3, r8, r9, r4, r5, r10
pld [r1]
NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
subs r3, r3, #1
stm r0, {r4-r5}
add r0, r0, r2
bne 3b
pop {r4-r10,pc}
.align 5
4:
ldm r1, {r4-r5, r10}
add r1, r1, r2
ALIGN_DWORD_D 3, r6, r7, r4, r5, r10
pld [r1]
NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12
subs r3, r3, #1
stm r0, {r8-r9}
add r0, r0, r2
bne 4b
pop {r4-r10,pc}
endfunc
@ ----------------------------------------------------------------
.align 5
@ 8-wide copy with rounding vertical half-pel average (each row averaged
@ with the row below).  Processes two output rows per inner iteration,
@ so h is pre-halved; r12 holds the 0xFEFEFEFE mask.
function ff_put_pixels8_y2_arm, export=1
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
@ block = word aligned, pixels = unaligned
pld [r1]
push {r4-r11,lr}
mov r3, r3, lsr #1
ldr r12, =0xfefefefe
JMP_ALIGN r5, r1
1:
ldm r1, {r4-r5}
add r1, r1, r2
6: ldm r1, {r6-r7}
add r1, r1, r2
pld [r1]
RND_AVG32 r8, r9, r4, r5, r6, r7, r12
ldm r1, {r4-r5}
add r1, r1, r2
stm r0, {r8-r9}
add r0, r0, r2
pld [r1]
RND_AVG32 r8, r9, r6, r7, r4, r5, r12
subs r3, r3, #1
stm r0, {r8-r9}
add r0, r0, r2
bne 6b
pop {r4-r11,pc}
.align 5
2:
ldm r1, {r4-r6}
add r1, r1, r2
pld [r1]
ALIGN_DWORD 1, r4, r5, r6
6: ldm r1, {r7-r9}
add r1, r1, r2
pld [r1]
ALIGN_DWORD 1, r7, r8, r9
RND_AVG32 r10, r11, r4, r5, r7, r8, r12
stm r0, {r10-r11}
add r0, r0, r2
ldm r1, {r4-r6}
add r1, r1, r2
pld [r1]
ALIGN_DWORD 1, r4, r5, r6
subs r3, r3, #1
RND_AVG32 r10, r11, r7, r8, r4, r5, r12
stm r0, {r10-r11}
add r0, r0, r2
bne 6b
pop {r4-r11,pc}
.align 5
3:
ldm r1, {r4-r6}
add r1, r1, r2
pld [r1]
ALIGN_DWORD 2, r4, r5, r6
6: ldm r1, {r7-r9}
add r1, r1, r2
pld [r1]
ALIGN_DWORD 2, r7, r8, r9
RND_AVG32 r10, r11, r4, r5, r7, r8, r12
stm r0, {r10-r11}
add r0, r0, r2
ldm r1, {r4-r6}
add r1, r1, r2
pld [r1]
ALIGN_DWORD 2, r4, r5, r6
subs r3, r3, #1
RND_AVG32 r10, r11, r7, r8, r4, r5, r12
stm r0, {r10-r11}
add r0, r0, r2
bne 6b
pop {r4-r11,pc}
.align 5
4:
ldm r1, {r4-r6}
add r1, r1, r2
pld [r1]
ALIGN_DWORD 3, r4, r5, r6
6: ldm r1, {r7-r9}
add r1, r1, r2
pld [r1]
ALIGN_DWORD 3, r7, r8, r9
RND_AVG32 r10, r11, r4, r5, r7, r8, r12
stm r0, {r10-r11}
add r0, r0, r2
ldm r1, {r4-r6}
add r1, r1, r2
pld [r1]
ALIGN_DWORD 3, r4, r5, r6
subs r3, r3, #1
RND_AVG32 r10, r11, r7, r8, r4, r5, r12
stm r0, {r10-r11}
add r0, r0, r2
bne 6b
pop {r4-r11,pc}
endfunc
.align 5
@ Same as ff_put_pixels8_y2_arm but with the truncating (no-rounding)
@ average NO_RND_AVG32.
function ff_put_no_rnd_pixels8_y2_arm, export=1
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
@ block = word aligned, pixels = unaligned
pld [r1]
push {r4-r11,lr}
mov r3, r3, lsr #1
ldr r12, =0xfefefefe
JMP_ALIGN r5, r1
1:
ldm r1, {r4-r5}
add r1, r1, r2
6: ldm r1, {r6-r7}
add r1, r1, r2
pld [r1]
NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
ldm r1, {r4-r5}
add r1, r1, r2
stm r0, {r8-r9}
add r0, r0, r2
pld [r1]
NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12
subs r3, r3, #1
stm r0, {r8-r9}
add r0, r0, r2
bne 6b
pop {r4-r11,pc}
.align 5
2:
ldm r1, {r4-r6}
add r1, r1, r2
pld [r1]
ALIGN_DWORD 1, r4, r5, r6
6: ldm r1, {r7-r9}
add r1, r1, r2
pld [r1]
ALIGN_DWORD 1, r7, r8, r9
NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
stm r0, {r10-r11}
add r0, r0, r2
ldm r1, {r4-r6}
add r1, r1, r2
pld [r1]
ALIGN_DWORD 1, r4, r5, r6
subs r3, r3, #1
NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
stm r0, {r10-r11}
add r0, r0, r2
bne 6b
pop {r4-r11,pc}
.align 5
3:
ldm r1, {r4-r6}
add r1, r1, r2
pld [r1]
ALIGN_DWORD 2, r4, r5, r6
6: ldm r1, {r7-r9}
add r1, r1, r2
pld [r1]
ALIGN_DWORD 2, r7, r8, r9
NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
stm r0, {r10-r11}
add r0, r0, r2
ldm r1, {r4-r6}
add r1, r1, r2
pld [r1]
ALIGN_DWORD 2, r4, r5, r6
subs r3, r3, #1
NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
stm r0, {r10-r11}
add r0, r0, r2
bne 6b
pop {r4-r11,pc}
.align 5
4:
ldm r1, {r4-r6}
add r1, r1, r2
pld [r1]
ALIGN_DWORD 3, r4, r5, r6
6: ldm r1, {r7-r9}
add r1, r1, r2
pld [r1]
ALIGN_DWORD 3, r7, r8, r9
NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
stm r0, {r10-r11}
add r0, r0, r2
ldm r1, {r4-r6}
add r1, r1, r2
pld [r1]
ALIGN_DWORD 3, r4, r5, r6
subs r3, r3, #1
NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
stm r0, {r10-r11}
add r0, r0, r2
bne 6b
pop {r4-r11,pc}
endfunc
.ltorg
@ ----------------------------------------------------------------
@ One row of the 2D (xy2) half-pel filter: load a source row at the
@ given \align-ment, realign it into two overlapping 8-byte rows a and
@ b, and compute per-byte partial sums (low 2 bits in r8/r9, high bits
@ pre-shifted in r10/r11).  \rnd is lsl for rounding, lsr for
@ no-rounding (selects whether the +0x02020202 bias survives).
.macro RND_XY2_IT align, rnd
@ l1= (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202)
@ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2)
.if \align == 0
ldm r1, {r6-r8}
.elseif \align == 3
ldm r1, {r5-r7}
.else
ldm r1, {r8-r10}
.endif
add r1, r1, r2
pld [r1]
.if \align == 0
ALIGN_DWORD_D 1, r4, r5, r6, r7, r8
.elseif \align == 1
ALIGN_DWORD_D 1, r4, r5, r8, r9, r10
ALIGN_DWORD_D 2, r6, r7, r8, r9, r10
.elseif \align == 2
ALIGN_DWORD_D 2, r4, r5, r8, r9, r10
ALIGN_DWORD_D 3, r6, r7, r8, r9, r10
.elseif \align == 3
ALIGN_DWORD_D 3, r4, r5, r5, r6, r7
.endif
ldr r14, =0x03030303
tst r3, #1
and r8, r4, r14
and r9, r5, r14
and r10, r6, r14
and r11, r7, r14
it eq
andeq r14, r14, r14, \rnd #1
add r8, r8, r10
add r9, r9, r11
ldr r12, =0xfcfcfcfc >> 2
itt eq
addeq r8, r8, r14
addeq r9, r9, r14
and r4, r12, r4, lsr #2
and r5, r12, r5, lsr #2
and r6, r12, r6, lsr #2
and r7, r12, r7, lsr #2
add r10, r4, r6
add r11, r5, r7
subs r3, r3, #1
.endm
@ Full xy2 loop body for one alignment: keep the previous row's partial
@ sums on the stack, combine them with the current row's, and store one
@ averaged 8-byte output row per iteration until r3 underflows.
.macro RND_XY2_EXPAND align, rnd
RND_XY2_IT \align, \rnd
6: push {r8-r11}
RND_XY2_IT \align, \rnd
pop {r4-r7}
add r4, r4, r8
add r5, r5, r9
ldr r14, =0x0f0f0f0f
add r6, r6, r10
add r7, r7, r11
and r4, r14, r4, lsr #2
and r5, r14, r5, lsr #2
add r4, r4, r6
add r5, r5, r7
stm r0, {r4-r5}
add r0, r0, r2
bge 6b
pop {r4-r11,pc}
.endm
.align 5
@ 8-wide copy with rounding 2D half-pel average; dispatches to one
@ RND_XY2_EXPAND instance per source alignment.
function ff_put_pixels8_xy2_arm, export=1
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
@ block = word aligned, pixels = unaligned
pld [r1]
push {r4-r11,lr} @ R14 is also called LR
JMP_ALIGN r5, r1
1: RND_XY2_EXPAND 0, lsl
.align 5
2: RND_XY2_EXPAND 1, lsl
.align 5
3: RND_XY2_EXPAND 2, lsl
.align 5
4: RND_XY2_EXPAND 3, lsl
endfunc
.align 5
@ No-rounding variant of ff_put_pixels8_xy2_arm (lsr selects the
@ truncating bias in RND_XY2_IT).
function ff_put_no_rnd_pixels8_xy2_arm, export=1
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
@ block = word aligned, pixels = unaligned
pld [r1]
push {r4-r11,lr}
JMP_ALIGN r5, r1
1: RND_XY2_EXPAND 0, lsr
.align 5
2: RND_XY2_EXPAND 1, lsr
.align 5
3: RND_XY2_EXPAND 2, lsr
.align 5
4: RND_XY2_EXPAND 3, lsr
endfunc

View File

@@ -0,0 +1,29 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_ARM_HPELDSP_H
#define AVCODEC_ARM_HPELDSP_H
#include "libavcodec/hpeldsp.h"
/* Per-ISA hpeldsp initializers; called from ff_hpeldsp_init_arm when
 * the corresponding CPU feature is detected at runtime. */
void ff_hpeldsp_init_armv6(HpelDSPContext *c, int flags);
void ff_hpeldsp_init_neon(HpelDSPContext *c, int flags);
#endif /* AVCODEC_ARM_HPELDSP_H */

View File

@@ -0,0 +1,259 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
@ Build a 16-pixel-wide function from its 8-pixel-wide counterpart by
@ invoking it twice, with dst (r0) and src (r1) advanced 8 bytes for
@ the second half (tail-called, so it returns to the original caller).
.macro call_2x_pixels type, subp
function ff_\type\()_pixels16\subp\()_armv6, export=1
push {r0-r3, lr}
bl ff_\type\()_pixels8\subp\()_armv6
pop {r0-r3, lr}
add r0, r0, #8
add r1, r1, #8
b ff_\type\()_pixels8\subp\()_armv6
endfunc
.endm
@ Generate the 16-wide wrappers from the 8-wide ARMv6 implementations.
call_2x_pixels avg
call_2x_pixels put, _x2
call_2x_pixels put, _y2
call_2x_pixels put, _x2_no_rnd
call_2x_pixels put, _y2_no_rnd
@ Copy a 16-byte-wide block, two rows per loop iteration (h assumed
@ even), using ldr/strd pairs; ldr_post/strd_post advance the pointers
@ by the stride.
function ff_put_pixels16_armv6, export=1
push {r4-r11}
1:
ldr r5, [r1, #4]
ldr r6, [r1, #8]
ldr r7, [r1, #12]
ldr_post r4, r1, r2
strd r6, r7, [r0, #8]
ldr r9, [r1, #4]
strd_post r4, r5, r0, r2
ldr r10, [r1, #8]
ldr r11, [r1, #12]
ldr_post r8, r1, r2
strd r10, r11, [r0, #8]
subs r3, r3, #2
strd_post r8, r9, r0, r2
bne 1b
pop {r4-r11}
bx lr
endfunc
@ Copy an 8-byte-wide block, two rows per loop iteration (h assumed
@ even).
function ff_put_pixels8_armv6, export=1
push {r4-r7}
1:
ldr r5, [r1, #4]
ldr_post r4, r1, r2
ldr r7, [r1, #4]
strd_post r4, r5, r0, r2
ldr_post r6, r1, r2
subs r3, r3, #2
strd_post r6, r7, r0, r2
bne 1b
pop {r4-r7}
bx lr
endfunc
@ 8-wide horizontal half-pel average with rounding, two rows per
@ iteration.  uhadd8 gives the truncating per-byte average; the
@ (a ^ b) & 0x01010101 correction added with uadd8 restores rounding.
function ff_put_pixels8_x2_armv6, export=1
push {r4-r11, lr}
@ r12 = 0x01010101 rounding mask
mov r12, #1
orr r12, r12, r12, lsl #8
orr r12, r12, r12, lsl #16
1:
ldr r4, [r1]
subs r3, r3, #2
ldr r5, [r1, #4]
ldr r7, [r1, #5]
lsr r6, r4, #8
ldr_pre r8, r1, r2
orr r6, r6, r5, lsl #24
ldr r9, [r1, #4]
ldr r11, [r1, #5]
lsr r10, r8, #8
add r1, r1, r2
orr r10, r10, r9, lsl #24
eor r14, r4, r6
uhadd8 r4, r4, r6
eor r6, r5, r7
uhadd8 r5, r5, r7
and r14, r14, r12
and r6, r6, r12
uadd8 r4, r4, r14
eor r14, r8, r10
uadd8 r5, r5, r6
eor r6, r9, r11
uhadd8 r8, r8, r10
and r14, r14, r12
uhadd8 r9, r9, r11
and r6, r6, r12
uadd8 r8, r8, r14
strd_post r4, r5, r0, r2
uadd8 r9, r9, r6
strd_post r8, r9, r0, r2
bne 1b
pop {r4-r11, pc}
endfunc
@ 8-wide vertical half-pel average with rounding, two rows per
@ iteration; the next row is pre-loaded outside the loop so each
@ iteration averages consecutive row pairs.
function ff_put_pixels8_y2_armv6, export=1
push {r4-r11}
@ r12 = 0x01010101 rounding mask
mov r12, #1
orr r12, r12, r12, lsl #8
orr r12, r12, r12, lsl #16
ldr r4, [r1]
ldr r5, [r1, #4]
ldr_pre r6, r1, r2
ldr r7, [r1, #4]
1:
subs r3, r3, #2
uhadd8 r8, r4, r6
eor r10, r4, r6
uhadd8 r9, r5, r7
eor r11, r5, r7
and r10, r10, r12
ldr_pre r4, r1, r2
uadd8 r8, r8, r10
and r11, r11, r12
uadd8 r9, r9, r11
ldr r5, [r1, #4]
uhadd8 r10, r4, r6
eor r6, r4, r6
uhadd8 r11, r5, r7
and r6, r6, r12
eor r7, r5, r7
uadd8 r10, r10, r6
and r7, r7, r12
ldr_pre r6, r1, r2
uadd8 r11, r11, r7
strd_post r8, r9, r0, r2
ldr r7, [r1, #4]
strd_post r10, r11, r0, r2
bne 1b
pop {r4-r11}
bx lr
endfunc
@ No-rounding variant of the horizontal half-pel average: plain uhadd8
@ truncating averages, no 0x01010101 correction needed.
function ff_put_pixels8_x2_no_rnd_armv6, export=1
push {r4-r9, lr}
1:
subs r3, r3, #2
ldr r4, [r1]
ldr r5, [r1, #4]
ldr r7, [r1, #5]
ldr_pre r8, r1, r2
ldr r9, [r1, #4]
ldr r14, [r1, #5]
add r1, r1, r2
lsr r6, r4, #8
orr r6, r6, r5, lsl #24
lsr r12, r8, #8
orr r12, r12, r9, lsl #24
uhadd8 r4, r4, r6
uhadd8 r5, r5, r7
uhadd8 r8, r8, r12
uhadd8 r9, r9, r14
stm r0, {r4,r5}
add r0, r0, r2
stm r0, {r8,r9}
add r0, r0, r2
bne 1b
pop {r4-r9, pc}
endfunc
@ No-rounding variant of the vertical half-pel average; plain uhadd8
@ truncating averages, two rows per iteration.
function ff_put_pixels8_y2_no_rnd_armv6, export=1
push {r4-r9, lr}
ldr r4, [r1]
ldr r5, [r1, #4]
ldr_pre r6, r1, r2
ldr r7, [r1, #4]
1:
subs r3, r3, #2
uhadd8 r8, r4, r6
ldr_pre r4, r1, r2
uhadd8 r9, r5, r7
ldr r5, [r1, #4]
uhadd8 r12, r4, r6
ldr_pre r6, r1, r2
uhadd8 r14, r5, r7
ldr r7, [r1, #4]
stm r0, {r8,r9}
add r0, r0, r2
stm r0, {r12,r14}
add r0, r0, r2
bne 1b
pop {r4-r9, pc}
endfunc
@ 8-wide rounding average of the source into the existing destination
@ (dst = avg(dst, src)), two rows per iteration, software-pipelined:
@ the loop keeps one row pair in flight and the 2: tail flushes the
@ final pair when r3 reaches zero.
function ff_avg_pixels8_armv6, export=1
pld [r1, r2]
push {r4-r10, lr}
@ lr = 0x01010101 rounding mask
mov lr, #1
orr lr, lr, lr, lsl #8
orr lr, lr, lr, lsl #16
ldrd r4, r5, [r0]
ldr r10, [r1, #4]
ldr_post r9, r1, r2
subs r3, r3, #2
1:
pld [r1, r2]
eor r8, r4, r9
uhadd8 r4, r4, r9
eor r12, r5, r10
ldrd_reg r6, r7, r0, r2
uhadd8 r5, r5, r10
and r8, r8, lr
ldr r10, [r1, #4]
and r12, r12, lr
uadd8 r4, r4, r8
ldr_post r9, r1, r2
eor r8, r6, r9
uadd8 r5, r5, r12
pld [r1, r2, lsl #1]
eor r12, r7, r10
uhadd8 r6, r6, r9
strd_post r4, r5, r0, r2
uhadd8 r7, r7, r10
beq 2f
and r8, r8, lr
ldrd_reg r4, r5, r0, r2
uadd8 r6, r6, r8
ldr r10, [r1, #4]
and r12, r12, lr
subs r3, r3, #2
uadd8 r7, r7, r12
ldr_post r9, r1, r2
strd_post r6, r7, r0, r2
b 1b
2:
@ flush the last pipelined row pair
and r8, r8, lr
and r12, r12, lr
uadd8 r6, r6, r8
uadd8 r7, r7, r12
strd_post r6, r7, r0, r2
pop {r4-r10, pc}
endfunc

View File

@@ -0,0 +1,72 @@
/*
* ARM optimized DSP utils
* Copyright (c) 2001 Lionel Ulmer
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/cpu.h"
#include "libavutil/attributes.h"
#include "libavcodec/bit_depth_template.c" // for CALL_2X_PIXELS
#include "libavcodec/rnd_avg.h"
#include "hpeldsp_arm.h"
/* Prototypes for the ARMv4 assembly implementations in dsputil_arm.S. */
void ff_put_pixels8_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
void ff_put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
void ff_put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
void ff_put_pixels16_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
/* Build 16-wide C wrappers from the 8-wide assembly functions. */
CALL_2X_PIXELS(ff_put_pixels16_x2_arm, ff_put_pixels8_x2_arm, 8)
CALL_2X_PIXELS(ff_put_pixels16_y2_arm, ff_put_pixels8_y2_arm, 8)
CALL_2X_PIXELS(ff_put_pixels16_xy2_arm, ff_put_pixels8_xy2_arm, 8)
CALL_2X_PIXELS(ff_put_no_rnd_pixels16_x2_arm, ff_put_no_rnd_pixels8_x2_arm, 8)
CALL_2X_PIXELS(ff_put_no_rnd_pixels16_y2_arm, ff_put_no_rnd_pixels8_y2_arm, 8)
CALL_2X_PIXELS(ff_put_no_rnd_pixels16_xy2_arm, ff_put_no_rnd_pixels8_xy2_arm,8)
/**
 * Install the baseline ARM half-pel functions into the HpelDSPContext,
 * then let the ARMv6 and NEON initializers override entries when the
 * runtime CPU supports those extensions.
 */
av_cold void ff_hpeldsp_init_arm(HpelDSPContext *c, int flags)
{
    int cpu_flags = av_get_cpu_flags();

    c->put_pixels_tab[0][0] = ff_put_pixels16_arm;
    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_arm;
    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_arm;
    c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_arm;
    c->put_pixels_tab[1][0] = ff_put_pixels8_arm;
    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_arm;
    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_arm;
    c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_arm;
    /* The full-pel entries have no rounding to skip, so the plain
     * copies are reused for the no-rnd tables. */
    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_arm;
    c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_arm;
    c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_arm;
    c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_arm;
    c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_arm;
    c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_arm;
    c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_arm;
    c->put_no_rnd_pixels_tab[1][3] = ff_put_no_rnd_pixels8_xy2_arm;
    if (have_armv6(cpu_flags))
        ff_hpeldsp_init_armv6(c, flags);
    if (have_neon(cpu_flags))
        ff_hpeldsp_init_neon(c, flags);
}

View File

@@ -0,0 +1,67 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stddef.h>
#include <stdint.h>
#include "libavutil/attributes.h"
#include "hpeldsp_arm.h"
/* Prototypes for the ARMv6 assembly implementations in hpeldsp_armv6.S
 * (the 16-wide variants are generated there by the call_2x_pixels
 * macro). */
void ff_put_pixels16_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels16_x2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels16_y2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels16_x2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels16_y2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_avg_pixels16_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels8_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels8_x2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels8_y2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels8_x2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels8_y2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_avg_pixels8_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
/**
 * Override HpelDSPContext entries with the ARMv6 implementations.
 * Only partially populated: xy2 variants and most avg variants have no
 * ARMv6 version (commented-out slots), so the entries installed by
 * ff_hpeldsp_init_arm remain in effect for those.
 */
av_cold void ff_hpeldsp_init_armv6(HpelDSPContext *c, int flags)
{
    c->put_pixels_tab[0][0] = ff_put_pixels16_armv6;
    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_armv6;
    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_armv6;
    /* c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_armv6; */
    c->put_pixels_tab[1][0] = ff_put_pixels8_armv6;
    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_armv6;
    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_armv6;
    /* c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_armv6; */
    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_armv6;
    c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_armv6;
    c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_armv6;
    /* c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_armv6; */
    c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_armv6;
    c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_armv6;
    c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_armv6;
    /* c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_armv6; */
    c->avg_pixels_tab[0][0] = ff_avg_pixels16_armv6;
    c->avg_pixels_tab[1][0] = ff_avg_pixels8_armv6;
}

View File

@@ -0,0 +1,88 @@
/*
* ARM NEON optimised DSP functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stddef.h>
#include <stdint.h>
#include "libavutil/attributes.h"
#include "hpeldsp_arm.h"
/*
 * Halfpel motion-compensation primitives implemented in NEON assembly
 * (hpeldsp_neon.S).  All share the signature (dst, src, line_size, h):
 * dst/src byte pointers, line_size the stride in bytes, h the block height.
 */
void ff_put_pixels16_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels16_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels16_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels8_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels8_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels8_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels8_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels8_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels8_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_avg_pixels16_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_avg_pixels16_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_avg_pixels16_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_avg_pixels16_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_avg_pixels8_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_avg_pixels8_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_avg_pixels8_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_avg_pixels8_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_avg_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_avg_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_avg_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
/**
 * Install the NEON halfpel functions into the HpelDSP context.
 *
 * Table indices [i][j]: i selects the block width (0 = 16 pixels,
 * 1 = 8 pixels), j the half-pel position (0 = full pel, 1 = horizontal
 * half, 2 = vertical half, 3 = both).  The full-pel "no rounding" slots
 * reuse the rounding versions: a plain copy/average of whole pixels
 * involves no rounding, so the two variants are identical there.
 * avg_no_rnd_pixels_tab is one-dimensional (16-pixel wide only).
 * 'flags' is accepted for interface uniformity; it is not used here.
 */
av_cold void ff_hpeldsp_init_neon(HpelDSPContext *c, int flags)
{
c->put_pixels_tab[0][0] = ff_put_pixels16_neon;
c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon;
c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon;
c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon;
c->put_pixels_tab[1][0] = ff_put_pixels8_neon;
c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon;
c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon;
c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon;
c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon;
c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon;
c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon;
c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon;
c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon;
c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon;
c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon;
c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon;
c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon;
c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_neon;
c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_neon;
c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_neon;
c->avg_pixels_tab[1][0] = ff_avg_pixels8_neon;
c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_neon;
c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_neon;
c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_neon;
c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_neon;
c->avg_no_rnd_pixels_tab[1] = ff_avg_pixels16_x2_no_rnd_neon;
c->avg_no_rnd_pixels_tab[2] = ff_avg_pixels16_y2_no_rnd_neon;
c->avg_no_rnd_pixels_tab[3] = ff_avg_pixels16_xy2_no_rnd_neon;
}

/* ---- end of file; next file (appears to be libavcodec/arm/hpeldsp_neon.S, 410 lines) begins below ---- */
/*
* ARM NEON optimised DSP functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
@ Copy a 16-byte-wide block from src (r1) to dst (r0); with avg=1 the new
@ data is rounding-averaged with the existing dst contents first.
@ r0 = dst, r1 = src, r2 = line stride, r3 = h (processed 4 rows/iteration).
@ With avg=1, r12 tracks a second dst pointer for the read-modify-write.
.macro pixels16 rnd=1, avg=0
.if \avg
mov r12, r0
.endif
1: vld1.8 {q0}, [r1], r2
vld1.8 {q1}, [r1], r2
vld1.8 {q2}, [r1], r2
@ prefetch the next four source rows
pld [r1, r2, lsl #2]
vld1.8 {q3}, [r1], r2
pld [r1]
pld [r1, r2]
pld [r1, r2, lsl #1]
.if \avg
vld1.8 {q8}, [r12,:128], r2
vrhadd.u8 q0, q0, q8
vld1.8 {q9}, [r12,:128], r2
vrhadd.u8 q1, q1, q9
vld1.8 {q10}, [r12,:128], r2
vrhadd.u8 q2, q2, q10
vld1.8 {q11}, [r12,:128], r2
vrhadd.u8 q3, q3, q11
.endif
subs r3, r3, #4
vst1.64 {q0}, [r0,:128], r2
vst1.64 {q1}, [r0,:128], r2
vst1.64 {q2}, [r0,:128], r2
vst1.64 {q3}, [r0,:128], r2
bne 1b
bx lr
.endm
@ Horizontal half-pel, 16 wide: each output byte is the average of a source
@ byte and its right neighbour (vext supplies the shifted copy).  The 'avg'
@ helper macro expands to vrhadd (rounding) or vhadd (truncating) depending
@ on the rnd parameter of the enclosing pixfunc; avg=1 additionally blends
@ with the existing dst.  Two rows per iteration.
.macro pixels16_x2 rnd=1, avg=0
1: vld1.8 {d0-d2}, [r1], r2
vld1.8 {d4-d6}, [r1], r2
pld [r1]
pld [r1, r2]
subs r3, r3, #2
vext.8 q1, q0, q1, #1
avg q0, q0, q1
vext.8 q3, q2, q3, #1
avg q2, q2, q3
.if \avg
vld1.8 {q1}, [r0,:128], r2
vld1.8 {q3}, [r0,:128]
vrhadd.u8 q0, q0, q1
vrhadd.u8 q2, q2, q3
sub r0, r0, r2
.endif
vst1.8 {q0}, [r0,:128], r2
vst1.8 {q2}, [r0,:128], r2
bne 1b
bx lr
.endm
@ Vertical half-pel, 16 wide: averages each row with the row below it.
@ The two most recent source rows live in q0/q1 across iterations; the loop
@ handles h-2 rows and the final pair is finished after the branch so no
@ source row past the block is loaded.
.macro pixels16_y2 rnd=1, avg=0
sub r3, r3, #2
vld1.8 {q0}, [r1], r2
vld1.8 {q1}, [r1], r2
1: subs r3, r3, #2
avg q2, q0, q1
vld1.8 {q0}, [r1], r2
avg q3, q0, q1
vld1.8 {q1}, [r1], r2
pld [r1]
pld [r1, r2]
.if \avg
vld1.8 {q8}, [r0,:128], r2
vld1.8 {q9}, [r0,:128]
vrhadd.u8 q2, q2, q8
vrhadd.u8 q3, q3, q9
sub r0, r0, r2
.endif
vst1.8 {q2}, [r0,:128], r2
vst1.8 {q3}, [r0,:128], r2
bne 1b
@ epilogue: the last two output rows
avg q2, q0, q1
vld1.8 {q0}, [r1], r2
avg q3, q0, q1
.if \avg
vld1.8 {q8}, [r0,:128], r2
vld1.8 {q9}, [r0,:128]
vrhadd.u8 q2, q2, q8
vrhadd.u8 q3, q3, q9
sub r0, r0, r2
.endif
vst1.8 {q2}, [r0,:128], r2
vst1.8 {q3}, [r0,:128], r2
bx lr
.endm
@ 2D half-pel, 16 wide: each output byte approximates the average of a 2x2
@ source neighbourhood, (a+b+c+d+2)>>2 with rounding or (a+b+c+d+1)>>2
@ without.  Horizontal pair sums are widened to 16 bits (vaddl) and carried
@ between iterations in q8-q11 so each source row is summed only once.
@ 'NRND' lines assemble only in the no-rounding variant, adding the +1 bias
@ (q13) explicitly because 'shrn' is then a plain, non-rounding vshrn.
.macro pixels16_xy2 rnd=1, avg=0
sub r3, r3, #2
vld1.8 {d0-d2}, [r1], r2
vld1.8 {d4-d6}, [r1], r2
NRND vmov.i16 q13, #1
pld [r1]
pld [r1, r2]
vext.8 q1, q0, q1, #1
vext.8 q3, q2, q3, #1
vaddl.u8 q8, d0, d2
vaddl.u8 q10, d1, d3
vaddl.u8 q9, d4, d6
vaddl.u8 q11, d5, d7
1: subs r3, r3, #2
vld1.8 {d0-d2}, [r1], r2
vadd.u16 q12, q8, q9
pld [r1]
NRND vadd.u16 q12, q12, q13
vext.8 q15, q0, q1, #1
vadd.u16 q1 , q10, q11
shrn d28, q12, #2
NRND vadd.u16 q1, q1, q13
shrn d29, q1, #2
.if \avg
vld1.8 {q8}, [r0,:128]
vrhadd.u8 q14, q14, q8
.endif
vaddl.u8 q8, d0, d30
vld1.8 {d2-d4}, [r1], r2
vaddl.u8 q10, d1, d31
vst1.8 {q14}, [r0,:128], r2
vadd.u16 q12, q8, q9
pld [r1, r2]
NRND vadd.u16 q12, q12, q13
vext.8 q2, q1, q2, #1
vadd.u16 q0, q10, q11
shrn d30, q12, #2
NRND vadd.u16 q0, q0, q13
shrn d31, q0, #2
.if \avg
vld1.8 {q9}, [r0,:128]
vrhadd.u8 q15, q15, q9
.endif
vaddl.u8 q9, d2, d4
vaddl.u8 q11, d3, d5
vst1.8 {q15}, [r0,:128], r2
bgt 1b
@ epilogue: last two rows, without loading past the end of the source
vld1.8 {d0-d2}, [r1], r2
vadd.u16 q12, q8, q9
NRND vadd.u16 q12, q12, q13
vext.8 q15, q0, q1, #1
vadd.u16 q1 , q10, q11
shrn d28, q12, #2
NRND vadd.u16 q1, q1, q13
shrn d29, q1, #2
.if \avg
vld1.8 {q8}, [r0,:128]
vrhadd.u8 q14, q14, q8
.endif
vaddl.u8 q8, d0, d30
vaddl.u8 q10, d1, d31
vst1.8 {q14}, [r0,:128], r2
vadd.u16 q12, q8, q9
NRND vadd.u16 q12, q12, q13
vadd.u16 q0, q10, q11
shrn d30, q12, #2
NRND vadd.u16 q0, q0, q13
shrn d31, q0, #2
.if \avg
vld1.8 {q9}, [r0,:128]
vrhadd.u8 q15, q15, q9
.endif
vst1.8 {q15}, [r0,:128], r2
bx lr
.endm
@ 8-wide variant of pixels16: copy (or, with avg=1, rounding-average with
@ the existing dst) an 8xh block.  Uses d registers since each row is
@ 8 bytes; four rows per iteration.  With avg=1 the dst rows are reloaded
@ through r0 itself, which is rewound afterwards (sub r0, r0, r2, lsl #2).
.macro pixels8 rnd=1, avg=0
1: vld1.8 {d0}, [r1], r2
vld1.8 {d1}, [r1], r2
vld1.8 {d2}, [r1], r2
pld [r1, r2, lsl #2]
vld1.8 {d3}, [r1], r2
pld [r1]
pld [r1, r2]
pld [r1, r2, lsl #1]
.if \avg
vld1.8 {d4}, [r0,:64], r2
vrhadd.u8 d0, d0, d4
vld1.8 {d5}, [r0,:64], r2
vrhadd.u8 d1, d1, d5
vld1.8 {d6}, [r0,:64], r2
vrhadd.u8 d2, d2, d6
vld1.8 {d7}, [r0,:64], r2
vrhadd.u8 d3, d3, d7
sub r0, r0, r2, lsl #2
.endif
subs r3, r3, #4
vst1.8 {d0}, [r0,:64], r2
vst1.8 {d1}, [r0,:64], r2
vst1.8 {d2}, [r0,:64], r2
vst1.8 {d3}, [r0,:64], r2
bne 1b
bx lr
.endm
@ Horizontal half-pel, 8 wide, two rows per iteration.  The vswp packs the
@ two rows' (pixel, right-neighbour) pairs into q0/q1 so one 'avg' handles
@ both rows at once.
.macro pixels8_x2 rnd=1, avg=0
1: vld1.8 {q0}, [r1], r2
vext.8 d1, d0, d1, #1
vld1.8 {q1}, [r1], r2
vext.8 d3, d2, d3, #1
pld [r1]
pld [r1, r2]
subs r3, r3, #2
vswp d1, d2
avg q0, q0, q1
.if \avg
vld1.8 {d4}, [r0,:64], r2
vld1.8 {d5}, [r0,:64]
vrhadd.u8 q0, q0, q2
sub r0, r0, r2
.endif
vst1.8 {d0}, [r0,:64], r2
vst1.8 {d1}, [r0,:64], r2
bne 1b
bx lr
.endm
@ Vertical half-pel, 8 wide: same structure as pixels16_y2 — the current
@ source row pair is kept in d0/d1, the loop covers h-2 rows, and the last
@ two rows are finished in the epilogue after the branch.
.macro pixels8_y2 rnd=1, avg=0
sub r3, r3, #2
vld1.8 {d0}, [r1], r2
vld1.8 {d1}, [r1], r2
1: subs r3, r3, #2
avg d4, d0, d1
vld1.8 {d0}, [r1], r2
avg d5, d0, d1
vld1.8 {d1}, [r1], r2
pld [r1]
pld [r1, r2]
.if \avg
vld1.8 {d2}, [r0,:64], r2
vld1.8 {d3}, [r0,:64]
vrhadd.u8 q2, q2, q1
sub r0, r0, r2
.endif
vst1.8 {d4}, [r0,:64], r2
vst1.8 {d5}, [r0,:64], r2
bne 1b
@ epilogue: the last two output rows
avg d4, d0, d1
vld1.8 {d0}, [r1], r2
avg d5, d0, d1
.if \avg
vld1.8 {d2}, [r0,:64], r2
vld1.8 {d3}, [r0,:64]
vrhadd.u8 q2, q2, q1
sub r0, r0, r2
.endif
vst1.8 {d4}, [r0,:64], r2
vst1.8 {d5}, [r0,:64], r2
bx lr
.endm
@ 2D half-pel, 8 wide: 8-lane version of pixels16_xy2.  Widened horizontal
@ pair sums are carried in q8/q9; 'NRND' lines assemble only for the
@ no-rounding variant (explicit +1 bias in q11, truncating 'shrn').
.macro pixels8_xy2 rnd=1, avg=0
sub r3, r3, #2
vld1.8 {q0}, [r1], r2
vld1.8 {q1}, [r1], r2
NRND vmov.i16 q11, #1
pld [r1]
pld [r1, r2]
vext.8 d4, d0, d1, #1
vext.8 d6, d2, d3, #1
vaddl.u8 q8, d0, d4
vaddl.u8 q9, d2, d6
1: subs r3, r3, #2
vld1.8 {q0}, [r1], r2
pld [r1]
vadd.u16 q10, q8, q9
vext.8 d4, d0, d1, #1
NRND vadd.u16 q10, q10, q11
vaddl.u8 q8, d0, d4
shrn d5, q10, #2
vld1.8 {q1}, [r1], r2
vadd.u16 q10, q8, q9
pld [r1, r2]
.if \avg
vld1.8 {d7}, [r0,:64]
vrhadd.u8 d5, d5, d7
.endif
NRND vadd.u16 q10, q10, q11
vst1.8 {d5}, [r0,:64], r2
shrn d7, q10, #2
.if \avg
vld1.8 {d5}, [r0,:64]
vrhadd.u8 d7, d7, d5
.endif
vext.8 d6, d2, d3, #1
vaddl.u8 q9, d2, d6
vst1.8 {d7}, [r0,:64], r2
bgt 1b
@ epilogue: last two rows, without loading past the end of the source
vld1.8 {q0}, [r1], r2
vadd.u16 q10, q8, q9
vext.8 d4, d0, d1, #1
NRND vadd.u16 q10, q10, q11
vaddl.u8 q8, d0, d4
shrn d5, q10, #2
vadd.u16 q10, q8, q9
.if \avg
vld1.8 {d7}, [r0,:64]
vrhadd.u8 d5, d5, d7
.endif
NRND vadd.u16 q10, q10, q11
vst1.8 {d5}, [r0,:64], r2
shrn d7, q10, #2
.if \avg
vld1.8 {d5}, [r0,:64]
vrhadd.u8 d7, d7, d5
.endif
vst1.8 {d7}, [r0,:64], r2
bx lr
.endm
@ Emit one exported function ff_<pfx><name><suf>_neon whose body is the
@ macro <name>.  Before instantiating it, three helper macros used inside
@ the bodies are defined according to the rounding mode:
@   avg  - byte average: vrhadd.u8 (round to nearest) or vhadd.u8 (truncate)
@   shrn - narrowing shift: vrshrn.u16 (rounding) or vshrn.u16 (truncating)
@   NRND - emits its argument only in the no-rounding case, nothing otherwise
@ The helpers are purged afterwards so the next instantiation may redefine
@ them with the other rounding mode.
.macro pixfunc pfx, name, suf, rnd=1, avg=0
.if \rnd
.macro avg rd, rn, rm
vrhadd.u8 \rd, \rn, \rm
.endm
.macro shrn rd, rn, rm
vrshrn.u16 \rd, \rn, \rm
.endm
.macro NRND insn:vararg
.endm
.else
.macro avg rd, rn, rm
vhadd.u8 \rd, \rn, \rm
.endm
.macro shrn rd, rn, rm
vshrn.u16 \rd, \rn, \rm
.endm
.macro NRND insn:vararg
\insn
.endm
.endif
function ff_\pfx\name\suf\()_neon, export=1
\name \rnd, \avg
endfunc
.purgem avg
.purgem shrn
.purgem NRND
.endm
@ Instantiate both rounding variants of a pixel macro: the plain function
@ and its _no_rnd twin.
.macro pixfunc2 pfx, name, avg=0
pixfunc \pfx, \name, rnd=1, avg=\avg
pixfunc \pfx, \name, _no_rnd, rnd=0, avg=\avg
.endm
@ Instantiation of all exported halfpel functions.  The qpel16/qpel8 mc00
@ entry points only load the fixed row count into r3 and then fall through
@ into the immediately following pixels16/pixels8 function body (there is
@ no return instruction before it), sharing its code.
@ pixfunc2 emits both rounding variants; where only 'pixfunc' is used the
@ no-rounding variant is not instantiated.
function ff_put_h264_qpel16_mc00_neon, export=1
mov r3, #16
endfunc
pixfunc put_, pixels16, avg=0
pixfunc2 put_, pixels16_x2, avg=0
pixfunc2 put_, pixels16_y2, avg=0
pixfunc2 put_, pixels16_xy2, avg=0
function ff_avg_h264_qpel16_mc00_neon, export=1
mov r3, #16
endfunc
pixfunc avg_, pixels16, avg=1
pixfunc2 avg_, pixels16_x2, avg=1
pixfunc2 avg_, pixels16_y2, avg=1
pixfunc2 avg_, pixels16_xy2, avg=1
function ff_put_h264_qpel8_mc00_neon, export=1
mov r3, #8
endfunc
pixfunc put_, pixels8, avg=0
pixfunc2 put_, pixels8_x2, avg=0
pixfunc2 put_, pixels8_y2, avg=0
pixfunc2 put_, pixels8_xy2, avg=0
function ff_avg_h264_qpel8_mc00_neon, export=1
mov r3, #8
endfunc
pixfunc avg_, pixels8, avg=1
pixfunc avg_, pixels8_x2, avg=1
pixfunc avg_, pixels8_y2, avg=1
pixfunc avg_, pixels8_xy2, avg=1

/* ---- end of file; next file (appears to be libavcodec/arm/int_neon.S, 92 lines) begins below ---- */
/*
* ARM NEON optimised integer operations
* Copyright (c) 2009 Konstantin Shishkov
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
.fpu neon
@ int32_t ff_scalarproduct_int16_neon(const int16_t *v1, const int16_t *v2,
@                                     int order)
@ Returns sum(v1[i] * v2[i]) for i in [0, order).  order is consumed in
@ steps of 16, and v2 (r1) must be 16-byte aligned.  Four independent q
@ accumulators hide the vmlal latency; they are reduced after the loop.
@
@ Fix: the reduction previously used d10/d11 (q5) as scratch.  d8-d15 are
@ callee-saved under the AAPCS, so clobbering them without saving corrupts
@ the caller's VFP state.  Use d18/d19 instead — caller-saved, and dead
@ after the loop (they last held v1 data already consumed by the vmlal).
function ff_scalarproduct_int16_neon, export=1
vmov.i16 q0, #0
vmov.i16 q1, #0
vmov.i16 q2, #0
vmov.i16 q3, #0
1: vld1.16 {d16-d17}, [r0]!
vld1.16 {d20-d21}, [r1,:128]!
vmlal.s16 q0, d16, d20
vld1.16 {d18-d19}, [r0]!
vmlal.s16 q1, d17, d21
vld1.16 {d22-d23}, [r1,:128]!
vmlal.s16 q2, d18, d22
vmlal.s16 q3, d19, d23
subs r2, r2, #16
bne 1b
@ horizontal reduction of q0-q3 down to one 32-bit value in r0
vpadd.s32 d16, d0, d1
vpadd.s32 d17, d2, d3
vpadd.s32 d18, d4, d5
vpadd.s32 d19, d6, d7
vpadd.s32 d0, d16, d17
vpadd.s32 d1, d18, d19
vpadd.s32 d2, d0, d1
vpaddl.s32 d3, d2
vmov.32 r0, d3[0]
bx lr
endfunc
@ int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1 /*aligned*/,
@                                              const int16_t *v2,
@                                              const int16_t *v3,
@                                              int order, int mul)
@ Returns sum(v1[i] * v2[i]) while simultaneously updating
@ v1[i] += v3[i] * mul.  order is consumed in steps of 16; v1 (r0) must be
@ 16-byte aligned.  'mul' is taken from the stack and splatted into q14;
@ r12 is a second v1 pointer used for the writeback.
@
@ Fix: as in ff_scalarproduct_int16_neon, the final reduction clobbered
@ the callee-saved d10/d11 (q5) in violation of the AAPCS; d18/d19
@ (caller-saved, dead after the loop) are used instead.
function ff_scalarproduct_and_madd_int16_neon, export=1
vld1.16 {d28[],d29[]}, [sp]
vmov.i16 q0, #0
vmov.i16 q1, #0
vmov.i16 q2, #0
vmov.i16 q3, #0
mov r12, r0
1: vld1.16 {d16-d17}, [r0,:128]!
vld1.16 {d18-d19}, [r1]!
vld1.16 {d20-d21}, [r2]!
vld1.16 {d22-d23}, [r0,:128]!
vld1.16 {d24-d25}, [r1]!
vld1.16 {d26-d27}, [r2]!
vmul.s16 q10, q10, q14
vmul.s16 q13, q13, q14
vmlal.s16 q0, d16, d18
vmlal.s16 q1, d17, d19
vadd.s16 q10, q8, q10
vadd.s16 q13, q11, q13
vmlal.s16 q2, d22, d24
vmlal.s16 q3, d23, d25
vst1.16 {q10}, [r12,:128]!
subs r3, r3, #16
vst1.16 {q13}, [r12,:128]!
bne 1b
@ horizontal reduction of q0-q3 down to one 32-bit value in r0
vpadd.s32 d16, d0, d1
vpadd.s32 d17, d2, d3
vpadd.s32 d18, d4, d5
vpadd.s32 d19, d6, d7
vpadd.s32 d0, d16, d17
vpadd.s32 d1, d18, d19
vpadd.s32 d2, d0, d1
vpaddl.s32 d3, d2
vmov.32 r0, d3[0]
bx lr
endfunc

/* ---- end of file; next file (appears to be libavcodec/arm/jrevdct_arm.S, 383 lines) begins below ---- */
/*
C-like prototype :
void j_rev_dct_arm(DCTBLOCK data)
With DCTBLOCK being a pointer to an array of 64 'signed shorts'
Copyright (c) 2001 Lionel Ulmer (lionel.ulmer@free.fr / bbrox@bbrox.org)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libavutil/arm/asm.S"
#define FIX_0_298631336 2446
#define FIX_0_541196100 4433
#define FIX_0_765366865 6270
#define FIX_1_175875602 9633
#define FIX_1_501321110 12299
#define FIX_2_053119869 16819
#define FIX_3_072711026 25172
#define FIX_M_0_390180644 -3196
#define FIX_M_0_899976223 -7373
#define FIX_M_1_847759065 -15137
#define FIX_M_1_961570560 -16069
#define FIX_M_2_562915447 -20995
#define FIX_0xFFFF 0xFFFF
#define FIX_0_298631336_ID 0
#define FIX_0_541196100_ID 4
#define FIX_0_765366865_ID 8
#define FIX_1_175875602_ID 12
#define FIX_1_501321110_ID 16
#define FIX_2_053119869_ID 20
#define FIX_3_072711026_ID 24
#define FIX_M_0_390180644_ID 28
#define FIX_M_0_899976223_ID 32
#define FIX_M_1_847759065_ID 36
#define FIX_M_1_961570560_ID 40
#define FIX_M_2_562915447_ID 44
#define FIX_0xFFFF_ID 48
@ void ff_j_rev_dct_arm(DCTBLOCK data)
@ In-place inverse DCT on an 8x8 block of int16_t, same structure as the
@ classic jpeg_idct_islow: a row pass (with shortcuts for all-zero rows and
@ DC-only rows) followed by a column pass (with a shortcut for columns
@ whose odd coefficients are all zero).  Constants are loaded from
@ const_array via the FIX_*_ID byte offsets; r11 holds its base, lr walks
@ the block, r12 counts rows/columns.
function ff_j_rev_dct_arm, export=1
push {r0, r4 - r11, lr}
mov lr, r0 @ lr = pointer to the current row
mov r12, #8 @ r12 = row-counter
movrel r11, const_array @ r11 = base pointer to the constants array
row_loop:
ldrsh r0, [lr, # 0] @ r0 = 'd0'
ldrsh r2, [lr, # 2] @ r2 = 'd2'
@ Optimization for row that have all items except the first set to 0
@ (this works as the int16_t are always 4-byte aligned)
ldr r5, [lr, # 0]
ldr r6, [lr, # 4]
ldr r3, [lr, # 8]
ldr r4, [lr, #12]
orr r3, r3, r4
orr r3, r3, r6
orrs r5, r3, r5
beq end_of_row_loop @ nothing to be done as ALL of them are '0'
orrs r3, r3, r2
beq empty_row
ldrsh r1, [lr, # 8] @ r1 = 'd1'
ldrsh r4, [lr, # 4] @ r4 = 'd4'
ldrsh r6, [lr, # 6] @ r6 = 'd6'
@ even part: tmp10/tmp13/tmp11/tmp12 from d0, d2, d4, d6
ldr r3, [r11, #FIX_0_541196100_ID]
add r7, r2, r6
ldr r5, [r11, #FIX_M_1_847759065_ID]
mul r7, r3, r7 @ r7 = z1
ldr r3, [r11, #FIX_0_765366865_ID]
mla r6, r5, r6, r7 @ r6 = tmp2
add r5, r0, r4 @ r5 = tmp0
mla r2, r3, r2, r7 @ r2 = tmp3
sub r3, r0, r4 @ r3 = tmp1
add r0, r2, r5, lsl #13 @ r0 = tmp10
rsb r2, r2, r5, lsl #13 @ r2 = tmp13
add r4, r6, r3, lsl #13 @ r4 = tmp11
rsb r3, r6, r3, lsl #13 @ r3 = tmp12
push {r0, r2, r3, r4} @ save on the stack tmp10, tmp13, tmp12, tmp11
@ odd part: tmp0..tmp3 from d1, d3, d5, d7
ldrsh r3, [lr, #10] @ r3 = 'd3'
ldrsh r5, [lr, #12] @ r5 = 'd5'
ldrsh r7, [lr, #14] @ r7 = 'd7'
add r0, r3, r5 @ r0 = 'z2'
add r2, r1, r7 @ r2 = 'z1'
add r4, r3, r7 @ r4 = 'z3'
add r6, r1, r5 @ r6 = 'z4'
ldr r9, [r11, #FIX_1_175875602_ID]
add r8, r4, r6 @ r8 = z3 + z4
ldr r10, [r11, #FIX_M_0_899976223_ID]
mul r8, r9, r8 @ r8 = 'z5'
ldr r9, [r11, #FIX_M_2_562915447_ID]
mul r2, r10, r2 @ r2 = 'z1'
ldr r10, [r11, #FIX_M_1_961570560_ID]
mul r0, r9, r0 @ r0 = 'z2'
ldr r9, [r11, #FIX_M_0_390180644_ID]
mla r4, r10, r4, r8 @ r4 = 'z3'
ldr r10, [r11, #FIX_0_298631336_ID]
mla r6, r9, r6, r8 @ r6 = 'z4'
ldr r9, [r11, #FIX_2_053119869_ID]
mla r7, r10, r7, r2 @ r7 = tmp0 + z1
ldr r10, [r11, #FIX_3_072711026_ID]
mla r5, r9, r5, r0 @ r5 = tmp1 + z2
ldr r9, [r11, #FIX_1_501321110_ID]
mla r3, r10, r3, r0 @ r3 = tmp2 + z2
add r7, r7, r4 @ r7 = tmp0
mla r1, r9, r1, r2 @ r1 = tmp3 + z1
add r5, r5, r6 @ r5 = tmp1
add r3, r3, r4 @ r3 = tmp2
add r1, r1, r6 @ r1 = tmp3
pop {r0, r2, r4, r6} @ r0 = tmp10 / r2 = tmp13 / r4 = tmp12 / r6 = tmp11
@ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0
@ output stage: butterflies, round and shift right by CONST_BITS-PASS1_BITS
@ Compute DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS)
add r8, r0, r1
add r8, r8, #(1<<10)
mov r8, r8, asr #11
strh r8, [lr, # 0]
@ Compute DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS)
sub r8, r0, r1
add r8, r8, #(1<<10)
mov r8, r8, asr #11
strh r8, [lr, #14]
@ Compute DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS)
add r8, r6, r3
add r8, r8, #(1<<10)
mov r8, r8, asr #11
strh r8, [lr, # 2]
@ Compute DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS)
sub r8, r6, r3
add r8, r8, #(1<<10)
mov r8, r8, asr #11
strh r8, [lr, #12]
@ Compute DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS)
add r8, r4, r5
add r8, r8, #(1<<10)
mov r8, r8, asr #11
strh r8, [lr, # 4]
@ Compute DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS)
sub r8, r4, r5
add r8, r8, #(1<<10)
mov r8, r8, asr #11
strh r8, [lr, #10]
@ Compute DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS)
add r8, r2, r7
add r8, r8, #(1<<10)
mov r8, r8, asr #11
strh r8, [lr, # 6]
@ Compute DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS)
sub r8, r2, r7
add r8, r8, #(1<<10)
mov r8, r8, asr #11
strh r8, [lr, # 8]
@ End of row loop
add lr, lr, #16
subs r12, r12, #1
bne row_loop
@ NOTE(review): the following beq is always taken when reached (the bne
@ above falls through only with Z set) — redundant but harmless.
beq start_column_loop
empty_row:
@ DC-only row: replicate the scaled DC value (d0 << PASS1_BITS, masked to
@ 16 bits and duplicated into both halves of the word) across the row.
ldr r1, [r11, #FIX_0xFFFF_ID]
mov r0, r0, lsl #2
and r0, r0, r1
add r0, r0, r0, lsl #16
str r0, [lr, # 0]
str r0, [lr, # 4]
str r0, [lr, # 8]
str r0, [lr, #12]
end_of_row_loop:
@ End of loop
add lr, lr, #16
subs r12, r12, #1
bne row_loop
start_column_loop:
@ Start of column loop
pop {lr}
mov r12, #8
column_loop:
ldrsh r0, [lr, #( 0*8)] @ r0 = 'd0'
ldrsh r2, [lr, #( 4*8)] @ r2 = 'd2'
ldrsh r4, [lr, #( 8*8)] @ r4 = 'd4'
ldrsh r6, [lr, #(12*8)] @ r6 = 'd6'
@ even part, same arithmetic as the row pass
ldr r3, [r11, #FIX_0_541196100_ID]
add r1, r2, r6
ldr r5, [r11, #FIX_M_1_847759065_ID]
mul r1, r3, r1 @ r1 = z1
ldr r3, [r11, #FIX_0_765366865_ID]
mla r6, r5, r6, r1 @ r6 = tmp2
add r5, r0, r4 @ r5 = tmp0
mla r2, r3, r2, r1 @ r2 = tmp3
sub r3, r0, r4 @ r3 = tmp1
add r0, r2, r5, lsl #13 @ r0 = tmp10
rsb r2, r2, r5, lsl #13 @ r2 = tmp13
add r4, r6, r3, lsl #13 @ r4 = tmp11
rsb r6, r6, r3, lsl #13 @ r6 = tmp12
ldrsh r1, [lr, #( 2*8)] @ r1 = 'd1'
ldrsh r3, [lr, #( 6*8)] @ r3 = 'd3'
ldrsh r5, [lr, #(10*8)] @ r5 = 'd5'
ldrsh r7, [lr, #(14*8)] @ r7 = 'd7'
@ Check for empty odd column (happens about 20 to 25 % of the time according to my stats)
orr r9, r1, r3
orr r10, r5, r7
orrs r10, r9, r10
beq empty_odd_column
push {r0, r2, r4, r6} @ save on the stack tmp10, tmp13, tmp12, tmp11
add r0, r3, r5 @ r0 = 'z2'
add r2, r1, r7 @ r2 = 'z1'
add r4, r3, r7 @ r4 = 'z3'
add r6, r1, r5 @ r6 = 'z4'
ldr r9, [r11, #FIX_1_175875602_ID]
add r8, r4, r6
ldr r10, [r11, #FIX_M_0_899976223_ID]
mul r8, r9, r8 @ r8 = 'z5'
ldr r9, [r11, #FIX_M_2_562915447_ID]
mul r2, r10, r2 @ r2 = 'z1'
ldr r10, [r11, #FIX_M_1_961570560_ID]
mul r0, r9, r0 @ r0 = 'z2'
ldr r9, [r11, #FIX_M_0_390180644_ID]
mla r4, r10, r4, r8 @ r4 = 'z3'
ldr r10, [r11, #FIX_0_298631336_ID]
mla r6, r9, r6, r8 @ r6 = 'z4'
ldr r9, [r11, #FIX_2_053119869_ID]
mla r7, r10, r7, r2 @ r7 = tmp0 + z1
ldr r10, [r11, #FIX_3_072711026_ID]
mla r5, r9, r5, r0 @ r5 = tmp1 + z2
ldr r9, [r11, #FIX_1_501321110_ID]
mla r3, r10, r3, r0 @ r3 = tmp2 + z2
add r7, r7, r4 @ r7 = tmp0
mla r1, r9, r1, r2 @ r1 = tmp3 + z1
add r5, r5, r6 @ r5 = tmp1
add r3, r3, r4 @ r3 = tmp2
add r1, r1, r6 @ r1 = tmp3
pop {r0, r2, r4, r6} @ r0 = tmp10 / r2 = tmp13 / r4 = tmp11 / r6 = tmp12
@ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0
@ output stage: final descale by CONST_BITS+PASS1_BITS+3
@ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
add r8, r0, r1
add r8, r8, #(1<<17)
mov r8, r8, asr #18
strh r8, [lr, #( 0*8)]
@ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
sub r8, r0, r1
add r8, r8, #(1<<17)
mov r8, r8, asr #18
strh r8, [lr, #(14*8)]
@ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
add r8, r4, r3
add r8, r8, #(1<<17)
mov r8, r8, asr #18
strh r8, [lr, #( 2*8)]
@ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
sub r8, r4, r3
add r8, r8, #(1<<17)
mov r8, r8, asr #18
strh r8, [lr, #(12*8)]
@ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
add r8, r6, r5
add r8, r8, #(1<<17)
mov r8, r8, asr #18
strh r8, [lr, #( 4*8)]
@ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
sub r8, r6, r5
add r8, r8, #(1<<17)
mov r8, r8, asr #18
strh r8, [lr, #(10*8)]
@ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
add r8, r2, r7
add r8, r8, #(1<<17)
mov r8, r8, asr #18
strh r8, [lr, #( 6*8)]
@ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
sub r8, r2, r7
add r8, r8, #(1<<17)
mov r8, r8, asr #18
strh r8, [lr, #( 8*8)]
@ End of row loop
add lr, lr, #2
subs r12, r12, #1
bne column_loop
@ NOTE(review): beq always taken here (see the row-pass equivalent above).
beq the_end
empty_odd_column:
@ Odd coefficients are all zero: each output pair collapses to the even
@ part alone, so descale tmp10/tmp11/tmp12/tmp13 once and store mirrored.
@ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
@ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
add r0, r0, #(1<<17)
mov r0, r0, asr #18
strh r0, [lr, #( 0*8)]
strh r0, [lr, #(14*8)]
@ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
@ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
add r4, r4, #(1<<17)
mov r4, r4, asr #18
strh r4, [lr, #( 2*8)]
strh r4, [lr, #(12*8)]
@ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
@ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
add r6, r6, #(1<<17)
mov r6, r6, asr #18
strh r6, [lr, #( 4*8)]
strh r6, [lr, #(10*8)]
@ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
@ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
add r2, r2, #(1<<17)
mov r2, r2, asr #18
strh r2, [lr, #( 6*8)]
strh r2, [lr, #( 8*8)]
@ End of row loop
add lr, lr, #2
subs r12, r12, #1
bne column_loop
the_end:
@ The end....
pop {r4 - r11, pc}
endfunc
@ Fixed-point constant table; entries are addressed through the FIX_*_ID
@ byte offsets defined above, so the order here must match those offsets.
const const_array
.word FIX_0_298631336
.word FIX_0_541196100
.word FIX_0_765366865
.word FIX_1_175875602
.word FIX_1_501321110
.word FIX_2_053119869
.word FIX_3_072711026
.word FIX_M_0_390180644
.word FIX_M_0_899976223
.word FIX_M_1_847759065
.word FIX_M_1_961570560
.word FIX_M_2_562915447
.word FIX_0xFFFF
endconst

/* ---- end of file; next file (appears to be libavcodec/arm/mathops.h, 108 lines) begins below ---- */
/*
* simple math operations
* Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_ARM_MATHOPS_H
#define AVCODEC_ARM_MATHOPS_H
#include <stdint.h>
#include "config.h"
#include "libavutil/common.h"
/* ARM inline-asm replacements for the generic mathops helpers; each macro
 * defines its own name first so the generic mathops.h skips its fallback. */
#if HAVE_INLINE_ASM
#if HAVE_ARMV6_INLINE
#define MULH MULH
/* High 32 bits of the signed 64-bit product a*b (single smmul). */
static inline av_const int MULH(int a, int b)
{
int r;
__asm__ ("smmul %0, %1, %2" : "=r"(r) : "r"(a), "r"(b));
return r;
}
#define FASTDIV FASTDIV
/* Fast unsigned-style division a/b via the ff_inverse reciprocal table;
 * for b <= 2 the table entry is bypassed and a simple shift is used. */
static av_always_inline av_const int FASTDIV(int a, int b)
{
int r;
__asm__ ("cmp %2, #2 \n\t"
"ldr %0, [%3, %2, lsl #2] \n\t"
"ite le \n\t"
"lsrle %0, %1, #1 \n\t"
"smmulgt %0, %0, %1 \n\t"
: "=&r"(r) : "r"(a), "r"(b), "r"(ff_inverse) : "cc");
return r;
}
#else /* HAVE_ARMV6_INLINE */
#define FASTDIV FASTDIV
/* Pre-ARMv6: multiply by the 32.32 reciprocal from ff_inverse and keep
 * the high word of the 64-bit product. */
static av_always_inline av_const int FASTDIV(int a, int b)
{
int r, t;
__asm__ ("umull %1, %0, %2, %3"
: "=&r"(r), "=&r"(t) : "r"(a), "r"(ff_inverse[b]));
return r;
}
#endif
/* NOTE(review): relies on a MAC64 definition that is not visible in this
 * excerpt — presumably provided earlier in this header or by the generic
 * mathops.h; confirm before relying on MLS64 here. */
#define MLS64(d, a, b) MAC64(d, -(a), b)
#if HAVE_ARMV5TE_INLINE
/* signed 16x16 -> 32 multiply add accumulate */
# define MAC16(rt, ra, rb) \
__asm__ ("smlabb %0, %1, %2, %0" : "+r"(rt) : "r"(ra), "r"(rb));
/* signed 16x16 -> 32 multiply */
# define MUL16 MUL16
static inline av_const int MUL16(int ra, int rb)
{
int rt;
__asm__ ("smulbb %0, %1, %2" : "=r"(rt) : "r"(ra), "r"(rb));
return rt;
}
#endif
#define mid_pred mid_pred
/* Median of three, branch-free: conditional moves (IT blocks in Thumb)
 * sort the operands without taking branches. */
static inline av_const int mid_pred(int a, int b, int c)
{
int m;
__asm__ (
"mov %0, %2 \n\t"
"cmp %1, %2 \n\t"
"itt gt \n\t"
"movgt %0, %1 \n\t"
"movgt %1, %2 \n\t"
"cmp %1, %3 \n\t"
"it le \n\t"
"movle %1, %3 \n\t"
"cmp %0, %1 \n\t"
"it gt \n\t"
"movgt %0, %1 \n\t"
: "=&r"(m), "+r"(a)
: "r"(b), "r"(c)
: "cc");
return m;
}
#endif /* HAVE_INLINE_ASM */
#endif /* AVCODEC_ARM_MATHOPS_H */

/* ---- end of file; next file (appears to be libavcodec/arm/mdct_fixed_neon.S, 193 lines) begins below ---- */
/*
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
@ Pre-rotation stage of the fixed-point forward MDCT: folds the input
@ quarters, rotates the folded samples by the interleaved cos/sin twiddles
@ from tcos, and scatters the rotated pairs to \dst through the revtab
@ bit-reversal permutation (r4 walks revtab forwards, \rt backwards).
@ Expects r2 = in, r4 = revtab, r6 = n (mdct_size), r7 = tcos.
@ Uses r3, r8-r12, lr and q0-q3, q8-q13 as scratch; r6 is consumed.
.macro prerot dst, rt
lsr r3, r6, #2 @ n4
add \rt, r4, r6, lsr #1 @ revtab + n4
add r9, r3, r3, lsl #1 @ n3
add r8, r7, r6 @ tcos + n4
add r3, r2, r6, lsr #1 @ in + n4
add r9, r2, r9, lsl #1 @ in + n3
sub r8, r8, #16
sub r10, r3, #16
sub r11, r9, #16
mov r12, #-16
1:
vld2.16 {d0,d1}, [r9, :128]!
vld2.16 {d2,d3}, [r11,:128], r12
vld2.16 {d4,d5}, [r3, :128]!
vld2.16 {d6,d7}, [r10,:128], r12
vld2.16 {d16,d17},[r7, :128]! @ cos, sin
vld2.16 {d18,d19},[r8, :128], r12
vrev64.16 q1, q1
vrev64.16 q3, q3
vrev64.16 q9, q9
vneg.s16 d0, d0
vneg.s16 d2, d2
vneg.s16 d16, d16
vneg.s16 d18, d18
vhsub.s16 d0, d0, d3 @ re
vhsub.s16 d4, d7, d4 @ im
vhsub.s16 d6, d6, d5
vhsub.s16 d2, d2, d1
@ complex multiply by the twiddles, Q15 result narrowed with vshrn
vmull.s16 q10, d0, d16
vmlsl.s16 q10, d4, d17
vmull.s16 q11, d0, d17
vmlal.s16 q11, d4, d16
vmull.s16 q12, d6, d18
vmlsl.s16 q12, d2, d19
vmull.s16 q13, d6, d19
vmlal.s16 q13, d2, d18
vshrn.s32 d0, q10, #15
vshrn.s32 d1, q11, #15
vshrn.s32 d2, q12, #15
vshrn.s32 d3, q13, #15
vzip.16 d0, d1
vzip.16 d2, d3
@ scatter the four re/im pairs through the bit-reversal tables
ldrh lr, [r4], #2
ldrh r2, [\rt, #-2]!
add lr, \dst, lr, lsl #2
add r2, \dst, r2, lsl #2
vst1.32 {d0[0]}, [lr,:32]
vst1.32 {d2[0]}, [r2,:32]
ldrh lr, [r4], #2
ldrh r2, [\rt, #-2]!
add lr, \dst, lr, lsl #2
add r2, \dst, r2, lsl #2
vst1.32 {d0[1]}, [lr,:32]
vst1.32 {d2[1]}, [r2,:32]
ldrh lr, [r4], #2
ldrh r2, [\rt, #-2]!
add lr, \dst, lr, lsl #2
add r2, \dst, r2, lsl #2
vst1.32 {d1[0]}, [lr,:32]
vst1.32 {d3[0]}, [r2,:32]
ldrh lr, [r4], #2
ldrh r2, [\rt, #-2]!
add lr, \dst, lr, lsl #2
add r2, \dst, r2, lsl #2
vst1.32 {d1[1]}, [lr,:32]
vst1.32 {d3[1]}, [r2,:32]
subs r6, r6, #32
bgt 1b
.endm
@ Fixed-point forward MDCT: pre-rotate the input into the output buffer,
@ run the in-place fixed-point FFT on it, then post-rotate the FFT output
@ working inwards from both ends.
@ r0 = MDCT context (revtab at +8, mdct_size at +16, tcos at +24),
@ r1 = output, r2 = input.  r1 is saved on the stack across the FFT call.
function ff_mdct_fixed_calc_neon, export=1
push {r1,r4-r11,lr}
ldr r4, [r0, #8] @ revtab
ldr r6, [r0, #16] @ mdct_size; n
ldr r7, [r0, #24] @ tcos
prerot r1, r5
mov r4, r0
bl X(ff_fft_fixed_calc_neon)
pop {r5} @ r5 = saved output pointer
mov r12, #-16
ldr r6, [r4, #16] @ mdct_size; n
ldr r7, [r4, #24] @ tcos
add r5, r5, r6, lsr #1
add r7, r7, r6, lsr #1
sub r1, r5, #16
sub r2, r7, #16
@ post-rotation: combine samples from both ends with the twiddles
1:
vld2.16 {d4,d5}, [r7,:128]!
vld2.16 {d6,d7}, [r2,:128], r12
vld2.16 {d0,d1}, [r5,:128]
vld2.16 {d2,d3}, [r1,:128]
vrev64.16 q3, q3
vrev64.16 q1, q1
vneg.s16 q3, q3
vneg.s16 q2, q2
vmull.s16 q11, d2, d6
vmlal.s16 q11, d3, d7
vmull.s16 q8, d0, d5
vmlsl.s16 q8, d1, d4
vmull.s16 q9, d0, d4
vmlal.s16 q9, d1, d5
vmull.s16 q10, d2, d7
vmlsl.s16 q10, d3, d6
vshrn.s32 d0, q11, #15
vshrn.s32 d1, q8, #15
vshrn.s32 d2, q9, #15
vshrn.s32 d3, q10, #15
vrev64.16 q0, q0
vst2.16 {d2,d3}, [r5,:128]!
vst2.16 {d0,d1}, [r1,:128], r12
subs r6, r6, #32
bgt 1b
pop {r4-r11,pc}
endfunc
@ Variant of ff_mdct_fixed_calc_neon producing 32-bit output coefficients:
@ pre-rotates into the context's tmp_buf, runs the FFT there, then
@ post-rotates into the caller's output buffer (r1, saved on the stack)
@ without the final narrowing shift.
function ff_mdct_fixed_calcw_neon, export=1
push {r1,r4-r11,lr}
ldrd r4, r5, [r0, #8] @ revtab, tmp_buf
ldr r6, [r0, #16] @ mdct_size; n
ldr r7, [r0, #24] @ tcos
prerot r5, r1
mov r4, r0
mov r1, r5
bl X(ff_fft_fixed_calc_neon)
pop {r7} @ r7 = saved 32-bit output pointer
mov r12, #-16
ldr r6, [r4, #16] @ mdct_size; n
ldr r9, [r4, #24] @ tcos
add r5, r5, r6, lsr #1
add r7, r7, r6
add r9, r9, r6, lsr #1
sub r3, r5, #16
sub r1, r7, #16
sub r2, r9, #16
@ post-rotation: as in calc, but the 32-bit products are stored directly
1:
vld2.16 {d4,d5}, [r9,:128]!
vld2.16 {d6,d7}, [r2,:128], r12
vld2.16 {d0,d1}, [r5,:128]!
vld2.16 {d2,d3}, [r3,:128], r12
vrev64.16 q3, q3
vrev64.16 q1, q1
vneg.s16 q3, q3
vneg.s16 q2, q2
vmull.s16 q8, d2, d6
vmlal.s16 q8, d3, d7
vmull.s16 q9, d0, d5
vmlsl.s16 q9, d1, d4
vmull.s16 q10, d0, d4
vmlal.s16 q10, d1, d5
vmull.s16 q11, d2, d7
vmlsl.s16 q11, d3, d6
vrev64.32 q8, q8
vrev64.32 q9, q9
vst2.32 {q10,q11},[r7,:128]!
vst2.32 {d16,d18},[r1,:128], r12
vst2.32 {d17,d19},[r1,:128], r12
subs r6, r6, #32
bgt 1b
pop {r4-r11,pc}
endfunc

/* ---- end of file; next file (appears to be libavcodec/arm/mdct_neon.S, 301 lines) begins below ---- */
/*
* ARM NEON optimised MDCT
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
#define ff_fft_calc_neon X(ff_fft_calc_neon)
@ void ff_imdct_half_neon(FFTContext *s, FFTSample *out, const FFTSample *in)
@ Floating-point half inverse MDCT: pre-rotates the input by the tcos
@ twiddles with a bit-reversed (revtab) scatter into out, runs the in-place
@ FFT, then post-rotates working inwards from both ends of the buffer.
@ Context fields used: revtab at +8, mdct_bits at +20, tcos at +24.
function ff_imdct_half_neon, export=1
push {r4-r8,lr}
mov r12, #1
ldr lr, [r0, #20] @ mdct_bits
ldr r4, [r0, #24] @ tcos
ldr r3, [r0, #8] @ revtab
lsl r12, r12, lr @ n = 1 << nbits
lsr lr, r12, #2 @ n4 = n >> 2
add r7, r2, r12, lsl #1
mov r12, #-16
sub r7, r7, #16
@ pre-rotation: read the input from both ends, rotate by (cos,sin),
@ scatter the complex results via revtab (two indices per ldr from r3)
vld2.32 {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0
vld2.32 {d0-d1}, [r2,:128]! @ d0 =m0,x d1 =m1,x
vrev64.32 d17, d17
vld2.32 {d2,d3}, [r4,:128]! @ d2=c0,c1 d3=s0,s2
vmul.f32 d6, d17, d2
vmul.f32 d7, d0, d2
1:
subs lr, lr, #2
ldr r6, [r3], #4
vmul.f32 d4, d0, d3
vmul.f32 d5, d17, d3
vsub.f32 d4, d6, d4
vadd.f32 d5, d5, d7
uxth r8, r6, ror #16
uxth r6, r6
add r8, r1, r8, lsl #3
add r6, r1, r6, lsl #3
beq 1f
vld2.32 {d16-d17},[r7,:128],r12
vld2.32 {d0-d1}, [r2,:128]!
vrev64.32 d17, d17
vld2.32 {d2,d3}, [r4,:128]! @ d2=c0,c1 d3=s0,s2
vmul.f32 d6, d17, d2
vmul.f32 d7, d0, d2
vst2.32 {d4[0],d5[0]}, [r6,:64]
vst2.32 {d4[1],d5[1]}, [r8,:64]
b 1b
1:
vst2.32 {d4[0],d5[0]}, [r6,:64]
vst2.32 {d4[1],d5[1]}, [r8,:64]
mov r4, r0
mov r6, r1
bl ff_fft_calc_neon
@ post-rotation: walk the buffer inwards from both ends, multiplying by
@ the twiddles around the n/8 midpoint
mov r12, #1
ldr lr, [r4, #20] @ mdct_bits
ldr r4, [r4, #24] @ tcos
lsl r12, r12, lr @ n = 1 << nbits
lsr lr, r12, #3 @ n8 = n >> 3
add r4, r4, lr, lsl #3
add r6, r6, lr, lsl #3
sub r1, r4, #16
sub r3, r6, #16
mov r7, #-16
mov r8, r6
mov r0, r3
vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0
vld2.32 {d20-d21},[r6,:128]! @ d20=i2,r2 d21=i3,r3
vld2.32 {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
1:
subs lr, lr, #2
vmul.f32 d7, d0, d18
vld2.32 {d17,d19},[r4,:128]! @ d17=c2,c3 d19=s2,s3
vmul.f32 d4, d1, d18
vmul.f32 d5, d21, d19
vmul.f32 d6, d20, d19
vmul.f32 d22, d1, d16
vmul.f32 d23, d21, d17
vmul.f32 d24, d0, d16
vmul.f32 d25, d20, d17
vadd.f32 d7, d7, d22
vadd.f32 d6, d6, d23
vsub.f32 d4, d4, d24
vsub.f32 d5, d5, d25
beq 1f
vld2.32 {d0-d1}, [r3,:128], r7
vld2.32 {d20-d21},[r6,:128]!
vld2.32 {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
vrev64.32 q3, q3
vst2.32 {d4,d6}, [r0,:128], r7
vst2.32 {d5,d7}, [r8,:128]!
b 1b
1:
vrev64.32 q3, q3
vst2.32 {d4,d6}, [r0,:128]
vst2.32 {d5,d7}, [r8,:128]
pop {r4-r8,pc}
endfunc
@ void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input)
@ Full-length inverse MDCT.  Runs the half transform into the upper half
@ of the output buffer, then reconstructs the remaining samples by
@ symmetry: the first quarter is a reversed, sign-flipped image of the
@ second, and the last quarter mirrors the third.
function ff_imdct_calc_neon, export=1
push {r4-r6,lr}
ldr r3, [r0, #20] @ mdct_bits
mov r4, #1
mov r5, r1 @ keep original output pointer
lsl r4, r4, r3 @ n = 1 << mdct_bits
add r1, r1, r4 @ half transform writes to output + n
bl ff_imdct_half_neon
add r0, r5, r4, lsl #2
add r1, r5, r4, lsl #1
sub r0, r0, #8
sub r2, r1, #16
mov r3, #-16
mov r6, #-8
vmov.i32 d30, #1<<31 @ sign bit: veor with this negates a float
1:
vld1.32 {d0-d1}, [r2,:128], r3
pld [r0, #-16]
vrev64.32 q0, q0
vld1.32 {d2-d3}, [r1,:128]!
veor d4, d1, d30
pld [r2, #-16]
vrev64.32 q1, q1
veor d5, d0, d30
vst1.32 {d2}, [r0,:64], r6
vst1.32 {d3}, [r0,:64], r6
vst1.32 {d4-d5}, [r5,:128]!
subs r4, r4, #16
bgt 1b
pop {r4-r6,pc}
endfunc
@ void ff_mdct_calc_neon(FFTContext *s, FFTSample *out, const FFTSample *in)
@ Forward MDCT: fold/pre-rotate the 2n input samples into n/2 complex
@ values scattered through the FFT buffer via revtab, run the complex
@ FFT in place, then post-rotate by the tcos/tsin twiddle factors,
@ walking the buffer from both ends towards the middle.
function ff_mdct_calc_neon, export=1
push {r4-r10,lr}
mov r12, #1
ldr lr, [r0, #20] @ mdct_bits
ldr r4, [r0, #24] @ tcos
ldr r3, [r0, #8] @ revtab
lsl lr, r12, lr @ n = 1 << nbits
add r7, r2, lr @ in4u
sub r9, r7, #16 @ in4d
add r2, r7, lr, lsl #1 @ in3u
add r8, r9, lr, lsl #1 @ in3d
add r5, r4, lr, lsl #1
sub r5, r5, #16
sub r3, r3, #4
mov r12, #-16
vld2.32 {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
vld2.32 {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
vld2.32 {d0, d2}, [r7,:128]! @ in4u0,in4u1 in2d1,in2d0
vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1
vld2.32 {d1, d3}, [r2,:128]! @ in3u0,in3u1 in1d1,in1d0
vsub.f32 d0, d18, d0 @ in4d-in4u I
vld2.32 {d20,d21},[r4,:128]! @ c0,c1 s0,s1
vrev64.32 q1, q1 @ in2d0,in2d1 in1d0,in1d1
vld2.32 {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
vadd.f32 d1, d1, d19 @ in3u+in3d -R
vsub.f32 d16, d16, d2 @ in0u-in2d R
vadd.f32 d17, d17, d3 @ in2u+in1d -I
@ Pre-rotation loop, software-pipelined: the next iteration's input is
@ loaded before the current results are scattered through revtab.
1:
vmul.f32 d7, d0, d21 @ I*s
A ldr r10, [r3, lr, lsr #1]
T lsr r10, lr, #1
T ldr r10, [r3, r10]
vmul.f32 d6, d1, d20 @ -R*c
ldr r6, [r3, #4]!
vmul.f32 d4, d1, d21 @ -R*s
vmul.f32 d5, d0, d20 @ I*c
vmul.f32 d24, d16, d30 @ R*c
vmul.f32 d25, d17, d31 @ -I*s
vmul.f32 d22, d16, d31 @ R*s
vmul.f32 d23, d17, d30 @ I*c
subs lr, lr, #16
vsub.f32 d6, d6, d7 @ -R*c-I*s
vadd.f32 d7, d4, d5 @ -R*s+I*c
vsub.f32 d24, d25, d24 @ I*s-R*c
vadd.f32 d25, d22, d23 @ R*s-I*c
beq 1f
mov r12, #-16
vld2.32 {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
vld2.32 {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
vneg.f32 d7, d7 @ R*s-I*c
vld2.32 {d0, d2}, [r7,:128]! @ in4u0,in4u1 in2d1,in2d0
vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1
vld2.32 {d1, d3}, [r2,:128]! @ in3u0,in3u1 in1d1,in1d0
vsub.f32 d0, d18, d0 @ in4d-in4u I
vld2.32 {d20,d21},[r4,:128]! @ c0,c1 s0,s1
vrev64.32 q1, q1 @ in2d0,in2d1 in1d0,in1d1
vld2.32 {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
vadd.f32 d1, d1, d19 @ in3u+in3d -R
vsub.f32 d16, d16, d2 @ in0u-in2d R
vadd.f32 d17, d17, d3 @ in2u+in1d -I
@ Scatter via revtab; each word holds two 16-bit indices (low/high half).
uxth r12, r6, ror #16
uxth r6, r6
add r12, r1, r12, lsl #3
add r6, r1, r6, lsl #3
vst2.32 {d6[0],d7[0]}, [r6,:64]
vst2.32 {d6[1],d7[1]}, [r12,:64]
uxth r6, r10, ror #16
uxth r10, r10
add r6 , r1, r6, lsl #3
add r10, r1, r10, lsl #3
vst2.32 {d24[0],d25[0]},[r10,:64]
vst2.32 {d24[1],d25[1]},[r6,:64]
b 1b
@ Loop epilogue: store the final iteration's results.
1:
vneg.f32 d7, d7 @ R*s-I*c
uxth r12, r6, ror #16
uxth r6, r6
add r12, r1, r12, lsl #3
add r6, r1, r6, lsl #3
vst2.32 {d6[0],d7[0]}, [r6,:64]
vst2.32 {d6[1],d7[1]}, [r12,:64]
uxth r6, r10, ror #16
uxth r10, r10
add r6 , r1, r6, lsl #3
add r10, r1, r10, lsl #3
vst2.32 {d24[0],d25[0]},[r10,:64]
vst2.32 {d24[1],d25[1]},[r6,:64]
@ In-place complex FFT over the pre-rotated data.
mov r4, r0
mov r6, r1
bl ff_fft_calc_neon
@ Post-rotation, identical structure to the tail of the half IMDCT.
mov r12, #1
ldr lr, [r4, #20] @ mdct_bits
ldr r4, [r4, #24] @ tcos
lsl r12, r12, lr @ n = 1 << nbits
lsr lr, r12, #3 @ n8 = n >> 3
add r4, r4, lr, lsl #3
add r6, r6, lr, lsl #3
sub r1, r4, #16
sub r3, r6, #16
mov r7, #-16
mov r8, r6
mov r0, r3
vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =r1,i1 d1 =r0,i0
vld2.32 {d20-d21},[r6,:128]! @ d20=r2,i2 d21=r3,i3
vld2.32 {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
1:
subs lr, lr, #2
vmul.f32 d7, d0, d18 @ r1*s1,r0*s0
vld2.32 {d17,d19},[r4,:128]! @ c2,c3 s2,s3
vmul.f32 d4, d1, d18 @ i1*s1,i0*s0
vmul.f32 d5, d21, d19 @ i2*s2,i3*s3
vmul.f32 d6, d20, d19 @ r2*s2,r3*s3
vmul.f32 d24, d0, d16 @ r1*c1,r0*c0
vmul.f32 d25, d20, d17 @ r2*c2,r3*c3
vmul.f32 d22, d21, d17 @ i2*c2,i3*c3
vmul.f32 d23, d1, d16 @ i1*c1,i0*c0
vadd.f32 d4, d4, d24 @ i1*s1+r1*c1,i0*s0+r0*c0
vadd.f32 d5, d5, d25 @ i2*s2+r2*c2,i3*s3+r3*c3
vsub.f32 d6, d22, d6 @ i2*c2-r2*s2,i3*c3-r3*s3
vsub.f32 d7, d23, d7 @ i1*c1-r1*s1,i0*c0-r0*s0
vneg.f32 q2, q2
beq 1f
vld2.32 {d0-d1}, [r3,:128], r7
vld2.32 {d20-d21},[r6,:128]!
vld2.32 {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
vrev64.32 q3, q3
vst2.32 {d4,d6}, [r0,:128], r7
vst2.32 {d5,d7}, [r8,:128]!
b 1b
1:
vrev64.32 q3, q3
vst2.32 {d4,d6}, [r0,:128]
vst2.32 {d5,d7}, [r8,:128]
pop {r4-r10,pc}
endfunc

View File

@@ -0,0 +1,205 @@
/*
* Copyright (c) 2013 RISC OS Open Ltd
* Author: Ben Avison <bavison@riscosopen.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
@ Symbolic register names using the AAPCS aliases
@ (a1-a4 = r0-r3 argument regs, v1-v5 = r4-r8 callee-saved).
CONTEXT .req a1 @ FFTContext *s
ORIGOUT .req a2 @ output pointer as passed by the caller
IN .req a3 @ input buffer
OUT .req v1 @ working copy of the output pointer
REVTAB .req v2 @ s->revtab
TCOS .req v3 @ s->tcos
TSIN .req v4 @ s->tsin
OLDFPSCR .req v5 @ caller's FPSCR, restored before returning
J0 .req a2 @ J0-J3: scatter-store output addresses
J1 .req a4
J2 .req ip
J3 .req lr
@ One unrolled step of the IMDCT pre-rotation: load two input pairs from
@ each end of the input buffer, rotate them by tcos/tsin using VFP short
@ vectors (vector length 4 set via FPSCR by the caller), and scatter the
@ four complex results through REVTAB.  k advances by 2 per expansion at
@ assembly time, so every offset below is a compile-time constant.
.macro prerotation_innerloop
.set trig_lo, k
.set trig_hi, n4 - k - 2
.set in_lo, trig_lo * 2
.set in_hi, trig_hi * 2
vldr d8, [TCOS, #trig_lo*4] @ s16,s17
vldr d9, [TCOS, #trig_hi*4] @ s18,s19
vldr s0, [IN, #in_hi*4 + 12]
vldr s1, [IN, #in_hi*4 + 4]
vldr s2, [IN, #in_lo*4 + 12]
vldr s3, [IN, #in_lo*4 + 4]
vmul.f s8, s0, s16 @ vector operation
vldr d10, [TSIN, #trig_lo*4] @ s20,s21
vldr d11, [TSIN, #trig_hi*4] @ s22,s23
vldr s4, [IN, #in_lo*4]
vldr s5, [IN, #in_lo*4 + 8]
vldr s6, [IN, #in_hi*4]
vldr s7, [IN, #in_hi*4 + 8]
ldr J0, [REVTAB, #trig_lo*2] @ two packed 16-bit revtab indices
vmul.f s12, s0, s20 @ vector operation
ldr J2, [REVTAB, #trig_hi*2]
mov J1, J0, lsr #16
and J0, J0, #255 @ halfword value will be < n4
vmls.f s8, s4, s20 @ vector operation
mov J3, J2, lsr #16
and J2, J2, #255 @ halfword value will be < n4
add J0, OUT, J0, lsl #3
vmla.f s12, s4, s16 @ vector operation
add J1, OUT, J1, lsl #3
add J2, OUT, J2, lsl #3
add J3, OUT, J3, lsl #3
vstr s8, [J0] @ scatter real parts ...
vstr s9, [J1]
vstr s10, [J2]
vstr s11, [J3]
vstr s12, [J0, #4] @ ... then imaginary parts
vstr s13, [J1, #4]
vstr s14, [J2, #4]
vstr s15, [J3, #4]
.set k, k + 2
.endm
@ One software-pipelined step of the IMDCT post-rotation.  Invoked with
@ "tail", "head" or both: the head part loads operands and starts the
@ multiplies for the next pair of butterflies while the tail part
@ finishes and stores the previous one.  The tcos loads double-buffer
@ between d10/d11 and d12/d13 depending on the parity of k, so a tail
@ can still read the coefficient its own head expansion loaded.
.macro postrotation_innerloop tail, head
.set trig_lo_head, n8 - k - 2
.set trig_hi_head, n8 + k
.set out_lo_head, trig_lo_head * 2
.set out_hi_head, trig_hi_head * 2
.set trig_lo_tail, n8 - (k - 2) - 2
.set trig_hi_tail, n8 + (k - 2)
.set out_lo_tail, trig_lo_tail * 2
.set out_hi_tail, trig_hi_tail * 2
.if (k & 2) == 0
TCOS_D0_HEAD .req d10 @ s20,s21
TCOS_D1_HEAD .req d11 @ s22,s23
TCOS_S0_TAIL .req s24
.else
TCOS_D0_HEAD .req d12 @ s24,s25
TCOS_D1_HEAD .req d13 @ s26,s27
TCOS_S0_TAIL .req s20
.endif
.ifnc "\tail",""
vmls.f s8, s0, TCOS_S0_TAIL @ vector operation
.endif
.ifnc "\head",""
vldr d8, [TSIN, #trig_lo_head*4] @ s16,s17
vldr d9, [TSIN, #trig_hi_head*4] @ s18,s19
vldr TCOS_D0_HEAD, [TCOS, #trig_lo_head*4]
.endif
.ifnc "\tail",""
vmla.f s12, s4, TCOS_S0_TAIL @ vector operation
.endif
.ifnc "\head",""
vldr s0, [OUT, #out_lo_head*4]
vldr s1, [OUT, #out_lo_head*4 + 8]
vldr s2, [OUT, #out_hi_head*4]
vldr s3, [OUT, #out_hi_head*4 + 8]
vldr s4, [OUT, #out_lo_head*4 + 4]
vldr s5, [OUT, #out_lo_head*4 + 12]
vldr s6, [OUT, #out_hi_head*4 + 4]
vldr s7, [OUT, #out_hi_head*4 + 12]
.endif
.ifnc "\tail",""
vstr s8, [OUT, #out_lo_tail*4]
vstr s9, [OUT, #out_lo_tail*4 + 8]
vstr s10, [OUT, #out_hi_tail*4]
vstr s11, [OUT, #out_hi_tail*4 + 8]
.endif
.ifnc "\head",""
vmul.f s8, s4, s16 @ vector operation
.endif
.ifnc "\tail",""
vstr s12, [OUT, #out_hi_tail*4 + 12]
vstr s13, [OUT, #out_hi_tail*4 + 4]
vstr s14, [OUT, #out_lo_tail*4 + 12]
vstr s15, [OUT, #out_lo_tail*4 + 4]
.endif
.ifnc "\head",""
vmul.f s12, s0, s16 @ vector operation
vldr TCOS_D1_HEAD, [TCOS, #trig_hi_head*4]
.endif
.unreq TCOS_D0_HEAD
.unreq TCOS_D1_HEAD
.unreq TCOS_S0_TAIL
.ifnc "\head",""
.set k, k + 2 @ only advance once per head/tail pair
.endif
.endm
/* void ff_imdct_half_vfp(FFTContext *s,
* FFTSample *output,
* const FFTSample *input)
*/
function ff_imdct_half_vfp, export=1
ldr ip, [CONTEXT, #5*4] @ mdct_bits
teq ip, #6
it ne
bne X(ff_imdct_half_c) @ only case currently accelerated is the one used by DCA
@ With n fixed at 64, every loop below fully unrolls and all offsets in
@ the inner-loop macros become immediate constants.
.set n, 1<<6
.set n2, n/2
.set n4, n/4
.set n8, n/8
push {v1-v5,lr}
vpush {s16-s27}
fmrx OLDFPSCR, FPSCR
ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
fmxr FPSCR, lr
mov OUT, ORIGOUT
ldr REVTAB, [CONTEXT, #2*4]
ldr TCOS, [CONTEXT, #6*4]
ldr TSIN, [CONTEXT, #7*4]
@ Pre-rotation: n8/2 fully-unrolled expansions.
.set k, 0
.rept n8/2
prerotation_innerloop
.endr
@ The FFT must run with the caller's FPSCR (no short vectors).
fmxr FPSCR, OLDFPSCR
mov a1, OUT
bl X(ff_fft16_vfp)
ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
fmxr FPSCR, lr
@ Post-rotation, software-pipelined: prologue (head only), steady state
@ (tail+head), epilogue (tail only).
.set k, 0
postrotation_innerloop , head
.rept n8/2 - 1
postrotation_innerloop tail, head
.endr
postrotation_innerloop tail
fmxr FPSCR, OLDFPSCR
vpop {s16-s27}
pop {v1-v5,pc}
endfunc
@ Release the symbolic register names defined above.
.unreq CONTEXT
.unreq ORIGOUT
.unreq IN
.unreq OUT
.unreq REVTAB
.unreq TCOS
.unreq TSIN
.unreq OLDFPSCR
.unreq J0
.unreq J1
.unreq J2
.unreq J3

View File

@@ -0,0 +1,143 @@
/*
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
@ No-op macro: passed to sum8 in place of "rsb" when the window
@ coefficients must not be negated.
.macro skip args:vararg
.endm
@ Accumulate one polyphase filter sum into the 64-bit pair \lo:\hi:
@ eight smlal terms of w[64*i + offs] * p[64*i].  \rsb is either "rsb"
@ (negate each window coefficient first) or "skip" (use it as-is).
.macro sum8 lo, hi, w, p, t1, t2, t3, t4, rsb=skip, offs=0
ldr \t1, [\w, #4*\offs]
ldr \t2, [\p, #4]!
\rsb \t1, \t1, #0
.irpc i, 135
ldr \t3, [\w, #4*64*\i+4*\offs]
ldr \t4, [\p, #4*64*\i]
smlal \lo, \hi, \t1, \t2
\rsb \t3, \t3, #0
ldr \t1, [\w, #4*64*(\i+1)+4*\offs]
ldr \t2, [\p, #4*64*(\i+1)]
smlal \lo, \hi, \t3, \t4
\rsb \t1, \t1, #0
.endr
ldr \t3, [\w, #4*64*7+4*\offs]
ldr \t4, [\p, #4*64*7]
smlal \lo, \hi, \t1, \t2
\rsb \t3, \t3, #0
smlal \lo, \hi, \t3, \t4
.endm
@ Round the 64-bit accumulator \lo:\hi to a signed-saturated 16-bit
@ sample in \rd (drop 24 fractional bits, then ssat to 16 bits).
@ The low 24 bits are left in \lo and \hi is cleared, carrying the
@ rounding residue into the next sample's accumulation.
.macro round rd, lo, hi
lsr \rd, \lo, #24
bic \lo, \lo, #0xff000000
orr \rd, \rd, \hi, lsl #8
mov \hi, #0
ssat \rd, #16, \rd
.endm
@ void ff_mpadsp_apply_window_fixed_armv6(int32_t *synth_buf, int32_t *window,
@                                         int *dither, int16_t *out, int incr)
@ Fixed-point MPEG audio synthesis windowing.  Copies the first 32 words
@ of synth_buf past its end (wrap-around for the windowing reads),
@ produces the first/centre samples with sum8, then computes the 15
@ remaining symmetric sample pairs in the main loop.  The 64-bit running
@ dither sum lives in r8:r9 and its final low word is stored back
@ through the dither pointer at the end.
function ff_mpadsp_apply_window_fixed_armv6, export=1
push {r2,r4-r11,lr}
add r4, r0, #4*512 @ synth_buf + 512
.rept 4
ldm r0!, {r5-r12}
stm r4!, {r5-r12}
.endr
ldr r4, [sp, #40] @ incr
sub r0, r0, #4*17 @ synth_buf + 16
ldr r8, [r2] @ sum:low
add r2, r0, #4*32 @ synth_buf + 48
rsb r5, r4, r4, lsl #5 @ 31 * incr
lsl r4, r4, #1
asr r9, r8, #31 @ sum:high
add r5, r3, r5, lsl #1 @ samples2
add r6, r1, #4*32 @ w2
str r4, [sp, #40] @ spill 2*incr for the loop body
sum8 r8, r9, r1, r0, r10, r11, r12, lr
sum8 r8, r9, r1, r2, r10, r11, r12, lr, rsb, 32
round r10, r8, r9
strh_post r10, r3, r4
@ Main loop: 15 iterations, one mirrored output sample pair each.
mov lr, #15
1:
ldr r12, [r0, #4]!
ldr r11, [r6, #-4]!
ldr r10, [r1, #4]!
.irpc i, 0246
.if \i
ldr r11, [r6, #4*64*\i]
ldr r10, [r1, #4*64*\i]
.endif
rsb r11, r11, #0
smlal r8, r9, r10, r12
ldr r10, [r0, #4*64*(\i+1)]
.ifeq \i
smull r4, r7, r11, r12
.else
smlal r4, r7, r11, r12
.endif
ldr r11, [r6, #4*64*(\i+1)]
ldr r12, [r1, #4*64*(\i+1)]
rsb r11, r11, #0
smlal r8, r9, r12, r10
.iflt \i-6
ldr r12, [r0, #4*64*(\i+2)]
.else
ldr r12, [r2, #-4]!
.endif
smlal r4, r7, r11, r10
.endr
.irpc i, 0246
ldr r10, [r1, #4*64*\i+4*32]
rsb r12, r12, #0
ldr r11, [r6, #4*64*\i+4*32]
smlal r8, r9, r10, r12
ldr r10, [r2, #4*64*(\i+1)]
smlal r4, r7, r11, r12
ldr r12, [r1, #4*64*(\i+1)+4*32]
rsb r10, r10, #0
ldr r11, [r6, #4*64*(\i+1)+4*32]
smlal r8, r9, r12, r10
.iflt \i-6
ldr r12, [r2, #4*64*(\i+2)]
.else
ldr r12, [sp, #40] @ reload 2*incr for the strh_post below
.endif
smlal r4, r7, r11, r10
.endr
round r10, r8, r9
adds r8, r8, r4
adc r9, r9, r7
strh_post r10, r3, r12
round r11, r8, r9
subs lr, lr, #1
strh_dpost r11, r5, r12
bgt 1b
sum8 r8, r9, r1, r0, r10, r11, r12, lr, rsb, 33
pop {r4} @ dither pointer saved by the initial push {r2,...}
round r10, r8, r9
str r8, [r4] @ write back dither state
strh r10, [r3]
pop {r4-r11,pc}
endfunc

View File

@@ -0,0 +1,38 @@
/*
* Copyright (c) 2011 Mans Rullgard
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/mpegaudiodsp.h"
#include "config.h"
void ff_mpadsp_apply_window_fixed_armv6(int32_t *synth_buf, int32_t *window,
int *dither, int16_t *out, int incr);
/**
 * Runtime selection of ARM-optimised MPEG audio DSP routines.
 * Installs the ARMv6 fixed-point windowing function when the CPU
 * reports ARMv6 support; otherwise the C defaults stay in place.
 */
av_cold void ff_mpadsp_init_arm(MPADSPContext *s)
{
    int flags = av_get_cpu_flags();

    if (!have_armv6(flags))
        return;

    s->apply_window_fixed = ff_mpadsp_apply_window_fixed_armv6;
}

View File

@@ -0,0 +1,52 @@
/*
* Copyright (c) 2002 Michael Niedermayer
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/mpegvideo.h"
#include "mpegvideo_arm.h"
#include "asm-offsets.h"
#if HAVE_NEON
/* Compile-time consistency checks: the numeric offsets in asm-offsets.h
 * used by the NEON assembly must match the current MpegEncContext
 * layout, so a struct change breaks the build here instead of producing
 * silently wrong code. */
CHK_OFFS(MpegEncContext, y_dc_scale, Y_DC_SCALE);
CHK_OFFS(MpegEncContext, c_dc_scale, C_DC_SCALE);
CHK_OFFS(MpegEncContext, ac_pred, AC_PRED);
CHK_OFFS(MpegEncContext, block_last_index, BLOCK_LAST_INDEX);
CHK_OFFS(MpegEncContext, inter_scantable.raster_end, INTER_SCANTAB_RASTER_END);
CHK_OFFS(MpegEncContext, h263_aic, H263_AIC);
#endif
void ff_dct_unquantize_h263_inter_neon(MpegEncContext *s, int16_t *block,
int n, int qscale);
void ff_dct_unquantize_h263_intra_neon(MpegEncContext *s, int16_t *block,
int n, int qscale);
/**
 * Runtime dispatch for ARM mpegvideo optimisations: chains to the
 * ARMv5TE initialiser and/or installs the NEON h263 dequantizers,
 * depending on the detected CPU features.
 */
av_cold void ff_MPV_common_init_arm(MpegEncContext *s)
{
    int flags = av_get_cpu_flags();

    if (have_armv5te(flags))
        ff_MPV_common_init_armv5te(s);

    if (have_neon(flags)) {
        s->dct_unquantize_h263_intra = ff_dct_unquantize_h263_intra_neon;
        s->dct_unquantize_h263_inter = ff_dct_unquantize_h263_inter_neon;
    }
}

View File

@@ -0,0 +1,26 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_ARM_MPEGVIDEO_H
#define AVCODEC_ARM_MPEGVIDEO_H
#include "libavcodec/mpegvideo.h"
/* Installs the ARMv5TE dct_unquantize_h263_* implementations; invoked
 * from ff_MPV_common_init_arm() when ARMv5TE support is detected. */
void ff_MPV_common_init_armv5te(MpegEncContext *s);
#endif /* AVCODEC_ARM_MPEGVIDEO_H */

View File

@@ -0,0 +1,102 @@
/*
* Optimization of some functions from mpegvideo.c for armv5te
* Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/avassert.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/mpegvideo.h"
#include "mpegvideo_arm.h"
void ff_dct_unquantize_h263_armv5te(int16_t *block, int qmul, int qadd, int count);
#ifdef ENABLE_ARM_TESTS
/**
* h263 dequantizer supplementary function, it is performance critical and needs to
* have optimized implementations for each architecture. Is also used as a reference
* implementation in regression tests
*/
static inline void dct_unquantize_h263_helper_c(int16_t *block, int qmul, int qadd, int count)
{
    int i;

    /* Scale every nonzero coefficient by qmul and push it away from
     * zero by qadd; zero coefficients are left untouched. */
    for (i = 0; i < count; i++) {
        int coeff = block[i];

        if (coeff > 0)
            block[i] = coeff * qmul + qadd;
        else if (coeff < 0)
            block[i] = coeff * qmul - qadd;
    }
}
#endif
/* Intra-block h263 dequantization.  The DC coefficient (block[0]) gets
 * separate scaling (by y_dc_scale/c_dc_scale, unless AIC is active) and
 * is restored after the bulk dequantization done by the ARMv5TE
 * assembly helper, which may overwrite it. */
static void dct_unquantize_h263_intra_armv5te(MpegEncContext *s,
int16_t *block, int n, int qscale)
{
int level, qmul, qadd;
int nCoeffs;
av_assert2(s->block_last_index[n]>=0);
qmul = qscale << 1;
if (!s->h263_aic) {
if (n < 4) /* n < 4 selects the luma DC scale */
level = block[0] * s->y_dc_scale;
else
level = block[0] * s->c_dc_scale;
qadd = (qscale - 1) | 1;
}else{
qadd = 0; /* AIC: no offset, DC passed through unscaled */
level = block[0];
}
if(s->ac_pred)
nCoeffs=63; /* AC prediction: all 64 coefficients may be set */
else
nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
ff_dct_unquantize_h263_armv5te(block, qmul, qadd, nCoeffs + 1);
block[0] = level; /* put the separately computed DC value back */
}
/* Inter-block h263 dequantization: no special DC handling, the whole
 * coefficient run goes straight to the ARMv5TE assembly helper. */
static void dct_unquantize_h263_inter_armv5te(MpegEncContext *s,
                                              int16_t *block, int n, int qscale)
{
    const int qmul = qscale << 1;
    const int qadd = (qscale - 1) | 1;
    int last;

    av_assert2(s->block_last_index[n] >= 0);

    last = s->inter_scantable.raster_end[s->block_last_index[n]];
    ff_dct_unquantize_h263_armv5te(block, qmul, qadd, last + 1);
}
/* Install the ARMv5TE h263 dequantizers into the context. */
av_cold void ff_MPV_common_init_armv5te(MpegEncContext *s)
{
s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_armv5te;
s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_armv5te;
}

View File

@@ -0,0 +1,114 @@
/*
* Optimization of some functions from mpegvideo.c for armv5te
* Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/arm/asm.S"
/*
* Special optimized version of dct_unquantize_h263_helper_c, it
* requires the block to be at least 8 bytes aligned, and may process
* more elements than requested. But it is guaranteed to never
* process more than 64 elements provided that count argument is <= 64,
* so it is safe. This function is optimized for a common distribution
* of values for nCoeffs (they are mostly multiple of 8 plus one or
* two extra elements). So this function processes data as 8 elements
* per loop iteration and contains optional 2 elements processing in
* the end.
*
* Inner loop should take 6 cycles per element on arm926ej-s (Nokia 770)
*/
@ Dequantize the top halfword of \src: when nonzero,
@ \dst = src.hi * \mul + (\add for positive, -\add for negative).
@ The rsbs against ip (kept at 0 by the caller) sets the flags from the
@ coefficient value so gt/lt pick the sign of the offset and ne skips
@ zero coefficients — \dst is then left unchanged.
.macro dequant_t dst, src, mul, add, tmp
rsbs \tmp, ip, \src, asr #16
it gt
addgt \tmp, \add, #0
it lt
rsblt \tmp, \add, #0
it ne
smlatbne \dst, \src, \mul, \tmp
.endm
@ Same as dequant_t but for the bottom halfword of \src: the sign test
@ uses lsl #16 and the multiply-accumulate uses smlabb.
.macro dequant_b dst, src, mul, add, tmp
rsbs \tmp, ip, \src, lsl #16
it gt
addgt \tmp, \add, #0
it lt
rsblt \tmp, \add, #0
it ne
smlabbne \dst, \src, \mul, \tmp
.endm
@ void ff_dct_unquantize_h263_armv5te(int16_t *block, int qmul, int qadd, int count)
@ In-place dequantization, 8 coefficients per main-loop iteration loaded
@ as two 64-bit pairs.  May process up to 2 extra elements past count
@ (documented and allowed for by the C callers).  ip is kept at 0 as the
@ zero reference for the sign tests inside dequant_t/dequant_b; a scalar
@ tail path handles the final 1-2 coefficients.
function ff_dct_unquantize_h263_armv5te, export=1
push {r4-r9,lr}
mov ip, #0
subs r3, r3, #2
ble 2f
ldrd r4, r5, [r0, #0]
1:
ldrd r6, r7, [r0, #8]
dequant_t r9, r4, r1, r2, r9
dequant_t lr, r5, r1, r2, lr
dequant_b r4, r4, r1, r2, r8
dequant_b r5, r5, r1, r2, r8
strh r4, [r0], #2
strh r9, [r0], #2
strh r5, [r0], #2
strh lr, [r0], #2
dequant_t r9, r6, r1, r2, r9
dequant_t lr, r7, r1, r2, lr
dequant_b r6, r6, r1, r2, r8
dequant_b r7, r7, r1, r2, r8
strh r6, [r0], #2
strh r9, [r0], #2
strh r7, [r0], #2
strh lr, [r0], #2
subs r3, r3, #8
it gt
ldrdgt r4, r5, [r0, #0] /* load data early to avoid load/use pipeline stall */
bgt 1b
adds r3, r3, #2
it le
pople {r4-r9,pc}
@ Tail: dequantize the remaining pair with plain scalar code.
2:
ldrsh r9, [r0, #0]
ldrsh lr, [r0, #2]
mov r8, r2
cmp r9, #0
it lt
rsblt r8, r2, #0
it ne
smlabbne r9, r9, r1, r8
mov r8, r2
cmp lr, #0
it lt
rsblt r8, r2, #0
it ne
smlabbne lr, lr, r1, r8
strh r9, [r0], #2
strh lr, [r0], #2
pop {r4-r9,pc}
endfunc

View File

@@ -0,0 +1,107 @@
/*
* Copyright (c) 2010 Mans Rullgard
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
#include "asm-offsets.h"
@ void ff_dct_unquantize_h263_inter_neon(MpegEncContext *s, int16_t *block,
@                                        int n, int qscale)
@ Computes the coefficient count from the scantable, then sets up
@ r0 = qmul (qscale*2), r2 = qadd ((qscale-1)|1) and r3 = count, and
@ falls straight through into ff_dct_unquantize_h263_neon below —
@ there is deliberately no return instruction here.
function ff_dct_unquantize_h263_inter_neon, export=1
add r12, r0, #BLOCK_LAST_INDEX
ldr r12, [r12, r2, lsl #2]
add r0, r0, #INTER_SCANTAB_RASTER_END
ldrb r12, [r0, r12]
sub r2, r3, #1
lsl r0, r3, #1
orr r2, r2, #1
add r3, r12, #1
endfunc
@ Core NEON h263 dequantizer.  Inputs: r0 = qmul, r1 = block,
@ r2 = qadd, r3 = count.  Main loop handles 16 coefficients per
@ iteration: each value is multiplied by qmul, then vbsl selects +qadd
@ or -qadd per lane by sign, and vbif restores zero for lanes that were
@ zero.  A trailing 4-coefficient path covers short counts.
function ff_dct_unquantize_h263_neon, export=1
vdup.16 q15, r0 @ qmul
vdup.16 q14, r2 @ qadd
vneg.s16 q13, q14
cmp r3, #4
mov r0, r1
ble 2f
1:
vld1.16 {q0}, [r0,:128]!
vclt.s16 q3, q0, #0
vld1.16 {q8}, [r0,:128]!
vceq.s16 q1, q0, #0
vmul.s16 q2, q0, q15
vclt.s16 q11, q8, #0
vmul.s16 q10, q8, q15
vbsl q3, q13, q14
vbsl q11, q13, q14
vadd.s16 q2, q2, q3
vceq.s16 q9, q8, #0
vadd.s16 q10, q10, q11
vbif q0, q2, q1
vbif q8, q10, q9
subs r3, r3, #16
vst1.16 {q0}, [r1,:128]!
vst1.16 {q8}, [r1,:128]!
it le
bxle lr
cmp r3, #8
bgt 1b
@ Tail: one 4-coefficient d-register pass.
2:
vld1.16 {d0}, [r0,:64]
vclt.s16 d3, d0, #0
vceq.s16 d1, d0, #0
vmul.s16 d2, d0, d30
vbsl d3, d26, d28
vadd.s16 d2, d2, d3
vbif d0, d2, d1
vst1.16 {d0}, [r1,:64]
bx lr
endfunc
@ void ff_dct_unquantize_h263_intra_neon(MpegEncContext *s, int16_t *block,
@                                        int n, int qscale)
@ Intra variant: compute the DC value separately (scaled by the DC scale
@ unless AIC), run the shared NEON dequantizer over the block, then
@ store the saved DC value back into block[0].
function ff_dct_unquantize_h263_intra_neon, export=1
push {r4-r6,lr}
add r12, r0, #BLOCK_LAST_INDEX
ldr r6, [r0, #AC_PRED]
add lr, r0, #INTER_SCANTAB_RASTER_END
cmp r6, #0
it ne
movne r12, #63 @ AC prediction: all 64 coefficients may be set
bne 1f
ldr r12, [r12, r2, lsl #2]
ldrb r12, [lr, r12]
1: ldr r5, [r0, #H263_AIC]
ldrsh r4, [r1] @ save DC coefficient
cmp r5, #0
mov r5, r1 @ r5 = block, for the final DC store
it ne
movne r2, #0 @ AIC: qadd = 0, DC left unscaled
bne 2f
@ Chroma blocks (n >= 4) step forward 4 bytes from Y_DC_SCALE,
@ i.e. to c_dc_scale — assumes the fields are adjacent (checked by
@ the CHK_OFFS asserts in the init file).
cmp r2, #4
it ge
addge r0, r0, #4
sub r2, r3, #1
ldr r6, [r0, #Y_DC_SCALE]
orr r2, r2, #1 @ qadd = (qscale - 1) | 1
smulbb r4, r4, r6 @ DC * dc_scale
2: lsl r0, r3, #1 @ qmul = qscale * 2
add r3, r12, #1 @ count = nCoeffs + 1
bl ff_dct_unquantize_h263_neon
vmov.16 d0[0], r4
vst1.16 {d0[0]}, [r5] @ restore the separately computed DC
pop {r4-r6,pc}
endfunc

View File

@@ -0,0 +1,59 @@
/*
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@ Transpose an 8x8 matrix of 8-bit elements held across eight d
@ registers, using progressively finer vtrn sizes (32 -> 16 -> 8 bits).
.macro transpose_8x8 r0, r1, r2, r3, r4, r5, r6, r7
vtrn.32 \r0, \r4
vtrn.32 \r1, \r5
vtrn.32 \r2, \r6
vtrn.32 \r3, \r7
vtrn.16 \r0, \r2
vtrn.16 \r1, \r3
vtrn.16 \r4, \r6
vtrn.16 \r5, \r7
vtrn.8 \r0, \r1
vtrn.8 \r2, \r3
vtrn.8 \r4, \r5
vtrn.8 \r6, \r7
.endm
@ Transpose a 4x4 matrix of 8-bit elements held across four registers
@ (vtrn.16 then vtrn.8).
.macro transpose_4x4 r0, r1, r2, r3
vtrn.16 \r0, \r2
vtrn.16 \r1, \r3
vtrn.8 \r0, \r1
vtrn.8 \r2, \r3
.endm
@ Exchange the contents of two groups of four registers pairwise.
.macro swap4 r0, r1, r2, r3, r4, r5, r6, r7
vswp \r0, \r4
vswp \r1, \r5
vswp \r2, \r6
vswp \r3, \r7
.endm
@ Transpose 4x4 blocks of 16-bit elements across eight d registers
@ (vtrn.32 then vtrn.16 within each group of four).
.macro transpose16_4x4 r0, r1, r2, r3, r4, r5, r6, r7
vtrn.32 \r0, \r2
vtrn.32 \r1, \r3
vtrn.32 \r4, \r6
vtrn.32 \r5, \r7
vtrn.16 \r0, \r1
vtrn.16 \r2, \r3
vtrn.16 \r4, \r5
vtrn.16 \r6, \r7
.endm

View File

@@ -0,0 +1,150 @@
/*
* ARM NEON optimised RDFT
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
@ void ff_rdft_calc_neon(RDFTContext *s, FFTSample *data)
@ Real (inverse) FFT.  For the forward transform the embedded complex
@ FFT (FFTContext at offset 20 in the RDFT context) is run first; the
@ conjugate-symmetric pairs from both ends of the buffer are then
@ combined with the tcos/tsin twiddles.  For the inverse transform the
@ combination runs first and the function tail-calls the complex FFT.
function ff_rdft_calc_neon, export=1
push {r4-r8,lr}
ldr r6, [r0, #4] @ inverse
mov r4, r0
mov r5, r1
lsls r6, r6, #31
bne 1f
@ Forward transform: complex FFT first.
add r0, r4, #20
bl X(ff_fft_permute_neon)
add r0, r4, #20
mov r1, r5
bl X(ff_fft_calc_neon)
1:
ldr r12, [r4, #0] @ nbits
mov r2, #1
lsl r12, r2, r12
add r0, r5, #8
add r1, r5, r12, lsl #2
lsr r12, r12, #2
ldr r2, [r4, #12] @ tcos
sub r12, r12, #2
ldr r3, [r4, #16] @ tsin
mov r7, r0
sub r1, r1, #8
mov lr, r1
mov r8, #-8
vld1.32 {d0}, [r0,:64]! @ d1[0,1]
vld1.32 {d1}, [r1,:64], r8 @ d2[0,1]
vld1.32 {d4}, [r2,:64]! @ tcos[i]
vld1.32 {d5}, [r3,:64]! @ tsin[i]
vmov.f32 d18, #0.5 @ k1
vdup.32 d19, r6
pld [r0, #32]
veor d19, d18, d19 @ k2
vmov.i32 d16, #0
vmov.i32 d17, #1<<31
pld [r1, #-32]
vtrn.32 d16, d17
pld [r2, #32]
vrev64.32 d16, d16 @ d16=1,0 d17=0,1
pld [r3, #32]
@ Main loop: combine two pairs from each end per iteration.
2:
veor q1, q0, q8 @ -d1[0],d1[1], d2[0],-d2[1]
vld1.32 {d24}, [r0,:64]! @ d1[0,1]
vadd.f32 d0, d0, d3 @ d1[0]+d2[0], d1[1]-d2[1]
vld1.32 {d25}, [r1,:64], r8 @ d2[0,1]
vadd.f32 d1, d2, d1 @ -d1[0]+d2[0], d1[1]+d2[1]
veor q3, q12, q8 @ -d1[0],d1[1], d2[0],-d2[1]
pld [r0, #32]
vmul.f32 q10, q0, q9 @ ev.re, ev.im, od.im, od.re
pld [r1, #-32]
vadd.f32 d0, d24, d7 @ d1[0]+d2[0], d1[1]-d2[1]
vadd.f32 d1, d6, d25 @ -d1[0]+d2[0], d1[1]+d2[1]
vmul.f32 q11, q0, q9 @ ev.re, ev.im, od.im, od.re
veor d7, d21, d16 @ -od.im, od.re
vrev64.32 d3, d21 @ od.re, od.im
veor d6, d20, d17 @ ev.re,-ev.im
veor d2, d3, d16 @ -od.re, od.im
vmla.f32 d20, d3, d4[1]
vmla.f32 d20, d7, d5[1]
vmla.f32 d6, d2, d4[1]
vmla.f32 d6, d21, d5[1]
vld1.32 {d4}, [r2,:64]! @ tcos[i]
veor d7, d23, d16 @ -od.im, od.re
vld1.32 {d5}, [r3,:64]! @ tsin[i]
veor d24, d22, d17 @ ev.re,-ev.im
vrev64.32 d3, d23 @ od.re, od.im
pld [r2, #32]
veor d2, d3, d16 @ -od.re, od.im
pld [r3, #32]
vmla.f32 d22, d3, d4[0]
vmla.f32 d22, d7, d5[0]
vmla.f32 d24, d2, d4[0]
vmla.f32 d24, d23, d5[0]
vld1.32 {d0}, [r0,:64]! @ d1[0,1]
vld1.32 {d1}, [r1,:64], r8 @ d2[0,1]
vst1.32 {d20}, [r7,:64]!
vst1.32 {d6}, [lr,:64], r8
vst1.32 {d22}, [r7,:64]!
vst1.32 {d24}, [lr,:64], r8
subs r12, r12, #2
bgt 2b
@ Epilogue: final pair plus the special middle/DC elements.
veor q1, q0, q8 @ -d1[0],d1[1], d2[0],-d2[1]
vadd.f32 d0, d0, d3 @ d1[0]+d2[0], d1[1]-d2[1]
vadd.f32 d1, d2, d1 @ -d1[0]+d2[0], d1[1]+d2[1]
ldr r2, [r4, #8] @ sign_convention
vmul.f32 q10, q0, q9 @ ev.re, ev.im, od.im, od.re
add r0, r0, #4
bfc r2, #0, #31
vld1.32 {d0[0]}, [r0,:32]
veor d7, d21, d16 @ -od.im, od.re
vrev64.32 d3, d21 @ od.re, od.im
veor d6, d20, d17 @ ev.re,-ev.im
vld1.32 {d22}, [r5,:64]
vdup.32 d1, r2
vmov d23, d22
veor d2, d3, d16 @ -od.re, od.im
vtrn.32 d22, d23
veor d0, d0, d1
veor d23, d23, d17
vmla.f32 d20, d3, d4[1]
vmla.f32 d20, d7, d5[1]
vmla.f32 d6, d2, d4[1]
vmla.f32 d6, d21, d5[1]
vadd.f32 d22, d22, d23
vst1.32 {d20}, [r7,:64]
vst1.32 {d6}, [lr,:64]
vst1.32 {d0[0]}, [r0,:32]
vst1.32 {d22}, [r5,:64]
cmp r6, #0
it eq
popeq {r4-r8,pc} @ forward transform: done
@ Inverse: scale by k1 = 0.5 and finish with permute + complex FFT
@ (tail call, stack already popped).
vmul.f32 d22, d22, d18
vst1.32 {d22}, [r5,:64]
add r0, r4, #20
mov r1, r5
bl X(ff_fft_permute_neon)
add r0, r4, #20
mov r1, r5
pop {r4-r8,lr}
b X(ff_fft_calc_neon)
endfunc

View File

@@ -0,0 +1,46 @@
/*
* Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/rv34dsp.h"
#include "libavutil/arm/cpu.h"
void ff_rv34_inv_transform_noround_neon(int16_t *block);
void ff_rv34_inv_transform_noround_dc_neon(int16_t *block);
void ff_rv34_idct_add_neon(uint8_t *dst, ptrdiff_t stride, int16_t *block);
void ff_rv34_idct_dc_add_neon(uint8_t *dst, ptrdiff_t stride, int dc);
/**
 * Install NEON implementations of the RV30/40 inverse transform and
 * IDCT-add routines when the runtime CPU reports NEON support.
 */
av_cold void ff_rv34dsp_init_arm(RV34DSPContext *c)
{
    int flags = av_get_cpu_flags();

    if (!have_neon(flags))
        return;

    c->rv34_inv_transform    = ff_rv34_inv_transform_noround_neon;
    c->rv34_inv_transform_dc = ff_rv34_inv_transform_noround_dc_neon;
    c->rv34_idct_add         = ff_rv34_idct_add_neon;
    c->rv34_idct_dc_add      = ff_rv34_idct_dc_add_neon;
}

View File

@@ -0,0 +1,156 @@
/*
* Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
#include "neon.S"
@ RV34 4x4 inverse transform core (both passes), no final rounding.
@ \r0 points to 16 int16 coefficients, loaded into q14-q15.
@ Multiplies by 7 are built as (x << 3) - x, by 17 as (x << 4) + x,
@ and by 13 via vmull with d0 = #13.
.macro rv34_inv_transform r0
vld1.16 {q14-q15}, [\r0,:128]
vmov.s16 d0, #13
@ --- first (row) pass, widening to 32 bit ---
vshll.s16 q12, d29, #3
vshll.s16 q13, d29, #4
vshll.s16 q9, d31, #3
vshll.s16 q1, d31, #4
vmull.s16 q10, d28, d0
vmlal.s16 q10, d30, d0
vmull.s16 q11, d28, d0
vmlsl.s16 q11, d30, d0
vsubw.s16 q12, q12, d29 @ z2 = block[i+4*1]*7
vaddw.s16 q13, q13, d29 @ z3 = block[i+4*1]*17
vsubw.s16 q9, q9, d31
vaddw.s16 q1, q1, d31
vadd.s32 q13, q13, q9 @ z3 = 17*block[i+4*1] + 7*block[i+4*3]
vsub.s32 q12, q12, q1 @ z2 = 7*block[i+4*1] - 17*block[i+4*3]
vadd.s32 q1, q10, q13 @ z0 + z3
vadd.s32 q2, q11, q12 @ z1 + z2
vsub.s32 q8, q10, q13 @ z0 - z3
vsub.s32 q3, q11, q12 @ z1 - z2
@ --- transpose 4x4 of 32-bit lanes so the second pass works on columns ---
vtrn.32 q1, q2
vtrn.32 q3, q8
vswp d3, d6
vswp d5, d16
vmov.s32 d0, #13
@ --- second (column) pass, same butterfly on 32-bit values ---
vadd.s32 q10, q1, q3
vsub.s32 q11, q1, q3
vshl.s32 q12, q2, #3
vshl.s32 q9, q2, #4
vmul.s32 q13, q11, d0[0]
vshl.s32 q11, q8, #4
vadd.s32 q9, q9, q2
vshl.s32 q15, q8, #3
vsub.s32 q12, q12, q2
vadd.s32 q11, q11, q8
vmul.s32 q14, q10, d0[0]
vsub.s32 q8, q15, q8
vsub.s32 q12, q12, q11
vadd.s32 q9, q9, q8
@ Results left in q1/q2/q3/q15 for the caller to shift and narrow.
vadd.s32 q2, q13, q12 @ z1 + z2
vadd.s32 q1, q14, q9 @ z0 + z3
vsub.s32 q3, q13, q12 @ z1 - z2
vsub.s32 q15, q14, q9 @ z0 - z3
.endm
/* void rv34_idct_add_c(uint8_t *dst, int stride, int16_t *block) */
@ Runs the full inverse transform on block (r2), rounds >>10, adds the
@ result to the 4x4 destination (r0/stride r1) with unsigned saturation,
@ and zeroes the coefficient block afterwards.
function ff_rv34_idct_add_neon, export=1
mov r3, r0 @ keep original dst for the store loop
rv34_inv_transform r2
vmov.i16 q12, #0
vrshrn.s32 d16, q1, #10 @ (z0 + z3) >> 10
vrshrn.s32 d17, q2, #10 @ (z1 + z2) >> 10
vrshrn.s32 d18, q3, #10 @ (z1 - z2) >> 10
vrshrn.s32 d19, q15, #10 @ (z0 - z3) >> 10
vld1.32 {d28[]}, [r0,:32], r1
vld1.32 {d29[]}, [r0,:32], r1
vtrn.32 q8, q9
vld1.32 {d28[1]}, [r0,:32], r1
vld1.32 {d29[1]}, [r0,:32], r1
vst1.16 {q12}, [r2,:128]! @ memset(block, 0, 16)
vst1.16 {q12}, [r2,:128] @ memset(block+16, 0, 16)
vtrn.16 d16, d17
vtrn.32 d28, d29
vtrn.16 d18, d19
vaddw.u8 q0, q8, d28 @ add residual to pixels, widened to 16 bit
vaddw.u8 q1, q9, d29
vqmovun.s16 d28, q0 @ saturate back to u8
vqmovun.s16 d29, q1
vst1.32 {d28[0]}, [r3,:32], r1
vst1.32 {d28[1]}, [r3,:32], r1
vst1.32 {d29[0]}, [r3,:32], r1
vst1.32 {d29[1]}, [r3,:32], r1
bx lr
endfunc
/* void rv34_inv_transform_noround_neon(int16_t *block); */
@ In-place inverse transform variant without rounding: output is
@ (result * 3) >> 11, written back interleaved with vst4 so the
@ transposed rows land in natural block order.
function ff_rv34_inv_transform_noround_neon, export=1
rv34_inv_transform r0
@ multiply by 3 as (x << 1) + x
vshl.s32 q11, q2, #1
vshl.s32 q10, q1, #1
vshl.s32 q12, q3, #1
vshl.s32 q13, q15, #1
vadd.s32 q11, q11, q2
vadd.s32 q10, q10, q1
vadd.s32 q12, q12, q3
vadd.s32 q13, q13, q15
vshrn.s32 d0, q10, #11 @ (z0 + z3)*3 >> 11
vshrn.s32 d1, q11, #11 @ (z1 + z2)*3 >> 11
vshrn.s32 d2, q12, #11 @ (z1 - z2)*3 >> 11
vshrn.s32 d3, q13, #11 @ (z0 - z3)*3 >> 11
vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0,:64]!
vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0,:64]!
vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r0,:64]!
vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r0,:64]!
bx lr
endfunc
/* void ff_rv34_idct_dc_add_neon(uint8_t *dst, int stride, int dc) */
@ DC-only IDCT: adds (dc * 13 * 13 + 0x200) >> 10 to every pixel of the
@ 4x4 destination block with unsigned saturation.
function ff_rv34_idct_dc_add_neon, export=1
mov r3, r0 @ keep original dst for the store loop
vld1.32 {d28[]}, [r0,:32], r1
vld1.32 {d29[]}, [r0,:32], r1
vdup.16 d0, r2
vmov.s16 d1, #169 @ 13 * 13
vld1.32 {d28[1]}, [r0,:32], r1
vmull.s16 q1, d0, d1 @ dc * 13 * 13
vld1.32 {d29[1]}, [r0,:32], r1
vrshrn.s32 d0, q1, #10 @ (dc * 13 * 13 + 0x200) >> 10
vmov d1, d0
vaddw.u8 q2, q0, d28
vaddw.u8 q3, q0, d29
vqmovun.s16 d28, q2
vqmovun.s16 d29, q3
vst1.32 {d28[0]}, [r3,:32], r1
vst1.32 {d29[0]}, [r3,:32], r1
vst1.32 {d28[1]}, [r3,:32], r1
vst1.32 {d29[1]}, [r3,:32], r1
bx lr
endfunc
/* void rv34_inv_transform_dc_noround_c(int16_t *block) */
@ DC-only noround inverse transform: fills all 16 coefficients with
@ (block[0] * 13*13*3) >> 11.  The constant 507 = 13^2 * 3 is built as
@ 251 | 256 because vmov.i16 cannot encode 507 directly.
function ff_rv34_inv_transform_noround_dc_neon, export=1
vld1.16 {d28[]}, [r0,:16] @ block[0]
vmov.i16 d4, #251
vorr.s16 d4, #256 @ 13^2 * 3
vmull.s16 q3, d28, d4
vshrn.s32 d0, q3, #11
vmov.i16 d1, d0
vst1.64 {q0}, [r0,:128]!
vst1.64 {q0}, [r0,:128]!
bx lr
endfunc

View File

@@ -0,0 +1,148 @@
/*
* Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/rv34dsp.h"
#include "libavutil/arm/cpu.h"
/* Declare one qpel motion-compensation prototype:
 * ff_{put,avg}_rv40_qpel{16,8}_mc{x}{y}_neon(). */
#define DECL_QPEL3(type, w, pos) \
void ff_##type##_rv40_qpel##w##_mc##pos##_neon(uint8_t *dst, uint8_t *src,\
                                               ptrdiff_t stride)

/* Declare both the put and avg variants. */
#define DECL_QPEL2(w, pos) \
    DECL_QPEL3(put, w, pos); \
    DECL_QPEL3(avg, w, pos)

/* Declare the 16x16 and 8x8 variants for subpel position (x, y). */
#define DECL_QPEL_XY(x, y) \
    DECL_QPEL2(16, x ## y); \
    DECL_QPEL2(8,  x ## y)

/* Declare all four horizontal phases for vertical phase y.
 * No trailing backslash after the last line: a continuation here would
 * pull the first DECL_QPEL_Y(0) invocation below into the macro body,
 * leaving an unexpandable recursive call in every later expansion and
 * dropping the y == 0 prototypes entirely. */
#define DECL_QPEL_Y(y) \
    DECL_QPEL_XY(0, y); \
    DECL_QPEL_XY(1, y); \
    DECL_QPEL_XY(2, y); \
    DECL_QPEL_XY(3, y)

DECL_QPEL_Y(0);
DECL_QPEL_Y(1);
DECL_QPEL_Y(2);
DECL_QPEL_Y(3);
void ff_put_rv40_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
void ff_put_rv40_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
void ff_avg_rv40_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
void ff_avg_rv40_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
void ff_rv40_weight_func_16_neon(uint8_t *, uint8_t *, uint8_t *, int, int, ptrdiff_t);
void ff_rv40_weight_func_8_neon(uint8_t *, uint8_t *, uint8_t *, int, int, ptrdiff_t);
int ff_rv40_h_loop_filter_strength_neon(uint8_t *src, ptrdiff_t stride,
int beta, int beta2, int edge,
int *p1, int *q1);
int ff_rv40_v_loop_filter_strength_neon(uint8_t *src, ptrdiff_t stride,
int beta, int beta2, int edge,
int *p1, int *q1);
void ff_rv40_h_weak_loop_filter_neon(uint8_t *src, ptrdiff_t stride, int filter_p1,
int filter_q1, int alpha, int beta,
int lim_p0q0, int lim_q1, int lim_p1);
void ff_rv40_v_weak_loop_filter_neon(uint8_t *src, ptrdiff_t stride, int filter_p1,
int filter_q1, int alpha, int beta,
int lim_p0q0, int lim_q1, int lim_p1);
/**
 * Populate the RV40 DSP context with the NEON routines.
 * Indices into put/avg_pixels_tab follow the x + 4*y subpel layout;
 * positions 0 (full-pel), 2 and 8 (half-pel handled elsewhere) are
 * left at their existing values.
 */
static av_cold void rv40dsp_init_neon(RV34DSPContext *c)
{
    c->put_pixels_tab[0][ 1] = ff_put_rv40_qpel16_mc10_neon;
    c->put_pixels_tab[0][ 3] = ff_put_rv40_qpel16_mc30_neon;
    c->put_pixels_tab[0][ 4] = ff_put_rv40_qpel16_mc01_neon;
    c->put_pixels_tab[0][ 5] = ff_put_rv40_qpel16_mc11_neon;
    c->put_pixels_tab[0][ 6] = ff_put_rv40_qpel16_mc21_neon;
    c->put_pixels_tab[0][ 7] = ff_put_rv40_qpel16_mc31_neon;
    c->put_pixels_tab[0][ 9] = ff_put_rv40_qpel16_mc12_neon;
    c->put_pixels_tab[0][10] = ff_put_rv40_qpel16_mc22_neon;
    c->put_pixels_tab[0][11] = ff_put_rv40_qpel16_mc32_neon;
    c->put_pixels_tab[0][12] = ff_put_rv40_qpel16_mc03_neon;
    c->put_pixels_tab[0][13] = ff_put_rv40_qpel16_mc13_neon;
    c->put_pixels_tab[0][14] = ff_put_rv40_qpel16_mc23_neon;
    c->put_pixels_tab[0][15] = ff_put_rv40_qpel16_mc33_neon;
    c->avg_pixels_tab[0][ 1] = ff_avg_rv40_qpel16_mc10_neon;
    c->avg_pixels_tab[0][ 3] = ff_avg_rv40_qpel16_mc30_neon;
    c->avg_pixels_tab[0][ 4] = ff_avg_rv40_qpel16_mc01_neon;
    c->avg_pixels_tab[0][ 5] = ff_avg_rv40_qpel16_mc11_neon;
    c->avg_pixels_tab[0][ 6] = ff_avg_rv40_qpel16_mc21_neon;
    c->avg_pixels_tab[0][ 7] = ff_avg_rv40_qpel16_mc31_neon;
    c->avg_pixels_tab[0][ 9] = ff_avg_rv40_qpel16_mc12_neon;
    c->avg_pixels_tab[0][10] = ff_avg_rv40_qpel16_mc22_neon;
    c->avg_pixels_tab[0][11] = ff_avg_rv40_qpel16_mc32_neon;
    c->avg_pixels_tab[0][12] = ff_avg_rv40_qpel16_mc03_neon;
    c->avg_pixels_tab[0][13] = ff_avg_rv40_qpel16_mc13_neon;
    c->avg_pixels_tab[0][14] = ff_avg_rv40_qpel16_mc23_neon;
    c->avg_pixels_tab[0][15] = ff_avg_rv40_qpel16_mc33_neon;
    c->put_pixels_tab[1][ 1] = ff_put_rv40_qpel8_mc10_neon;
    c->put_pixels_tab[1][ 3] = ff_put_rv40_qpel8_mc30_neon;
    c->put_pixels_tab[1][ 4] = ff_put_rv40_qpel8_mc01_neon;
    c->put_pixels_tab[1][ 5] = ff_put_rv40_qpel8_mc11_neon;
    c->put_pixels_tab[1][ 6] = ff_put_rv40_qpel8_mc21_neon;
    c->put_pixels_tab[1][ 7] = ff_put_rv40_qpel8_mc31_neon;
    c->put_pixels_tab[1][ 9] = ff_put_rv40_qpel8_mc12_neon;
    c->put_pixels_tab[1][10] = ff_put_rv40_qpel8_mc22_neon;
    c->put_pixels_tab[1][11] = ff_put_rv40_qpel8_mc32_neon;
    c->put_pixels_tab[1][12] = ff_put_rv40_qpel8_mc03_neon;
    c->put_pixels_tab[1][13] = ff_put_rv40_qpel8_mc13_neon;
    c->put_pixels_tab[1][14] = ff_put_rv40_qpel8_mc23_neon;
    c->put_pixels_tab[1][15] = ff_put_rv40_qpel8_mc33_neon;
    c->avg_pixels_tab[1][ 1] = ff_avg_rv40_qpel8_mc10_neon;
    c->avg_pixels_tab[1][ 3] = ff_avg_rv40_qpel8_mc30_neon;
    c->avg_pixels_tab[1][ 4] = ff_avg_rv40_qpel8_mc01_neon;
    c->avg_pixels_tab[1][ 5] = ff_avg_rv40_qpel8_mc11_neon;
    c->avg_pixels_tab[1][ 6] = ff_avg_rv40_qpel8_mc21_neon;
    c->avg_pixels_tab[1][ 7] = ff_avg_rv40_qpel8_mc31_neon;
    c->avg_pixels_tab[1][ 9] = ff_avg_rv40_qpel8_mc12_neon;
    c->avg_pixels_tab[1][10] = ff_avg_rv40_qpel8_mc22_neon;
    c->avg_pixels_tab[1][11] = ff_avg_rv40_qpel8_mc32_neon;
    c->avg_pixels_tab[1][12] = ff_avg_rv40_qpel8_mc03_neon;
    c->avg_pixels_tab[1][13] = ff_avg_rv40_qpel8_mc13_neon;
    c->avg_pixels_tab[1][14] = ff_avg_rv40_qpel8_mc23_neon;
    c->avg_pixels_tab[1][15] = ff_avg_rv40_qpel8_mc33_neon;
    /* chroma MC, weighted prediction and loop filter helpers */
    c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_neon;
    c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_neon;
    c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_neon;
    c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_neon;
    c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_16_neon;
    c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_8_neon;
    c->rv40_loop_filter_strength[0] = ff_rv40_h_loop_filter_strength_neon;
    c->rv40_loop_filter_strength[1] = ff_rv40_v_loop_filter_strength_neon;
    c->rv40_weak_loop_filter[0] = ff_rv40_h_weak_loop_filter_neon;
    c->rv40_weak_loop_filter[1] = ff_rv40_v_weak_loop_filter_neon;
}
/**
 * Entry point for ARM-specific RV40 DSP initialisation: installs the
 * NEON routines when the runtime CPU flags report NEON support.
 */
av_cold void ff_rv40dsp_init_arm(RV34DSPContext *c)
{
    int flags = av_get_cpu_flags();

    if (have_neon(flags))
        rv40dsp_init_neon(c);
}

View File

@@ -0,0 +1,920 @@
/*
* Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
#include "neon.S"
@ 6-tap qpel lowpass over one 8-pixel row held in \r0:\r1.
@ Filter: rc1*src[0] + rc2*src[1] - 5*(src[-1]+src[2]) + src[-3..3] edge
@ taps, rounded-shifted right by \shift; result replaces \r0.
@ \rc1/\rc2 are the centre-tap coefficient registers (52/20 or 20/52).
.macro qpel_lowpass r0, r1, rc1, rc2, shift
vext.8 d25, \r0, \r1, #1 @ src[-1]
vext.8 d26, \r0, \r1, #4 @ src[ 2]
vext.8 d24, \r0, \r1, #5 @ src[ 3]
vaddl.u8 q9, d25, d26
vaddl.u8 q8, \r0, d24
vext.8 d27, \r0, \r1, #2 @ src[ 0]
vshl.s16 q12, q9, #2
vsub.s16 q8, q8, q9
vext.8 d28, \r0, \r1, #3 @ src[ 1]
vsub.s16 q8, q8, q12 @ outer taps minus 5*(src[-1]+src[2])
vmlal.u8 q8, d27, \rc1
vmlal.u8 q8, d28, \rc2
vqrshrun.s16 \r0, q8, #\shift
.endm
@ Same 6-tap lowpass as qpel_lowpass, applied to two rows at once
@ (\r0:\r1 and \r2:\r3) with interleaved scheduling to hide latency.
@ Results land in \r0 and \r2.
.macro qpel_lowpass_x2 r0, r1, r2, r3, rc1, rc2, shift
vext.8 d25, \r0, \r1, #1 @ src[-1]
vext.8 d26, \r0, \r1, #4 @ src[ 2]
vext.8 d24, \r0, \r1, #5 @ src[ 3]
vaddl.u8 q9, d25, d26
vaddl.u8 q8, \r0, d24
vext.8 d29, \r0, \r1, #2 @ src[ 0]
vext.8 d28, \r0, \r1, #3 @ src[ 1]
vshl.s16 q10, q9, #2
@ second row starts here, reusing \r1/\r0 as scratch
vext.8 \r1, \r2, \r3, #1 @ src[-1]
vsub.s16 q8, q8, q9
vext.8 d22, \r2, \r3, #4 @ src[ 2]
vext.8 \r0, \r2, \r3, #5 @ src[ 3]
vaddl.u8 q13, \r1, d22
vaddl.u8 q12, \r2, \r0
vsub.s16 q8, q8, q10
vshl.s16 q9, q13, #2
vsub.s16 q12, q12, q13
vmlal.u8 q8, d29, \rc1
vmlal.u8 q8, d28, \rc2
vsub.s16 q12, q12, q9
vext.8 d26, \r2, \r3, #2 @ src[ 0]
vext.8 d27, \r2, \r3, #3 @ src[ 1]
vmlal.u8 q12, d26, \rc1
vmlal.u8 q12, d27, \rc2
vqrshrun.s16 \r0, q8, #\shift
vqrshrun.s16 \r2, q12, #\shift
.endm
@ Instantiate put_rv40_qpel8_h_lp_packed_s<shift>_neon: horizontal 8-wide
@ lowpass over r3(+1) rows from r1/stride r2, packed contiguously into the
@ scratch buffer at r12 for a following vertical pass.
.macro rv40_qpel8_h shift
function put_rv40_qpel8_h_lp_packed_s\shift\()_neon
1:
vld1.8 {q2}, [r1], r2
vld1.8 {q3}, [r1], r2
qpel_lowpass_x2 d4, d5, d6, d7, d0, d1, \shift
vst1.8 {d4}, [r12,:64]!
vst1.8 {d6}, [r12,:64]!
subs r3, r3, #2
bgt 1b
@ one extra row (row count is odd: 8 output rows need 13 input rows)
vld1.8 {q2}, [r1]
qpel_lowpass d4, d5, d0, d1, \shift
vst1.8 {d4}, [r12,:64]!
bx lr
endfunc
.endm
@ Instantiate \type\()_rv40_qpel8_v_lp_packed_s<shift>_neon: vertical
@ lowpass over 13 packed 8-byte rows from r1, implemented as
@ transpose -> horizontal lowpass -> transpose back.
@ avg variant additionally averages with the existing dst pixels.
.macro rv40_qpel8_v shift, type
function \type\()_rv40_qpel8_v_lp_packed_s\shift\()_neon
vld1.64 {d2}, [r1,:64]!
vld1.64 {d3}, [r1,:64]!
vld1.64 {d4}, [r1,:64]!
vld1.64 {d5}, [r1,:64]!
vld1.64 {d6}, [r1,:64]!
vld1.64 {d7}, [r1,:64]!
vld1.64 {d8}, [r1,:64]!
vld1.64 {d9}, [r1,:64]!
vld1.64 {d10}, [r1,:64]!
vld1.64 {d11}, [r1,:64]!
vld1.64 {d12}, [r1,:64]!
vld1.64 {d13}, [r1,:64]!
vld1.64 {d14}, [r1,:64]!
transpose_8x8 d2, d3, d4, d5, d6, d7, d8, d9
transpose_8x8 d10, d11, d12, d13, d14, d15, d30, d31
qpel_lowpass_x2 d2, d10, d3, d11, d0, d1, \shift
qpel_lowpass_x2 d4, d12, d5, d13, d0, d1, \shift
qpel_lowpass_x2 d6, d14, d7, d15, d0, d1, \shift
qpel_lowpass_x2 d8, d30, d9, d31, d0, d1, \shift
transpose_8x8 d2, d3, d4, d5, d6, d7, d8, d9
.ifc \type,avg
vld1.64 d12, [r0,:64], r2
vld1.64 d13, [r0,:64], r2
vld1.64 d14, [r0,:64], r2
vld1.64 d15, [r0,:64], r2
vld1.64 d16, [r0,:64], r2
vld1.64 d17, [r0,:64], r2
vld1.64 d18, [r0,:64], r2
vld1.64 d19, [r0,:64], r2
sub r0, r0, r2, lsl #3 @ rewind dst 8 rows
vrhadd.u8 q1, q1, q6
vrhadd.u8 q2, q2, q7
vrhadd.u8 q3, q3, q8
vrhadd.u8 q4, q4, q9
.endif
vst1.64 d2, [r0,:64], r2
vst1.64 d3, [r0,:64], r2
vst1.64 d4, [r0,:64], r2
vst1.64 d5, [r0,:64], r2
vst1.64 d6, [r0,:64], r2
vst1.64 d7, [r0,:64], r2
vst1.64 d8, [r0,:64], r2
vst1.64 d9, [r0,:64], r2
bx lr
endfunc
.endm
@ horizontal packed-pass helpers for shifts 5 and 6
rv40_qpel8_h 5
rv40_qpel8_h 6
@ rv40_qpel instantiates all put/avg qpel MC entry points for one \type.
@ Register convention throughout: r0 = dst, r1 = src, r2 = stride,
@ d0/d1 = centre-tap coefficients (52/20 or 20/52), r3 = row count.
.macro rv40_qpel type
@ horizontal-only lowpass straight to dst (mc10/mc30 path)
function \type\()_rv40_qpel8_h_lowpass_neon
.ifc \type,avg
mov r12, r0
.endif
1:
vld1.8 {q2}, [r1], r2
vld1.8 {q3}, [r1], r2
qpel_lowpass_x2 d4, d5, d6, d7, d0, d1, 6
.ifc \type,avg
vld1.8 {d3}, [r12,:64], r2
vld1.8 {d16}, [r12,:64], r2
vrhadd.u8 d4, d4, d3
vrhadd.u8 d6, d6, d16
.endif
vst1.8 {d4}, [r0,:64], r2
vst1.8 {d6}, [r0,:64], r2
subs r3, r3, #2
bgt 1b
bx lr
endfunc
@ vertical-only lowpass straight to dst (mc01/mc03 path); loads 13 rows
@ from src with stride, transposes, filters horizontally, transposes back.
function \type\()_rv40_qpel8_v_lowpass_neon
vld1.64 {d2}, [r1], r2
vld1.64 {d3}, [r1], r2
vld1.64 {d4}, [r1], r2
vld1.64 {d5}, [r1], r2
vld1.64 {d6}, [r1], r2
vld1.64 {d7}, [r1], r2
vld1.64 {d8}, [r1], r2
vld1.64 {d9}, [r1], r2
vld1.64 {d10}, [r1], r2
vld1.64 {d11}, [r1], r2
vld1.64 {d12}, [r1], r2
vld1.64 {d13}, [r1], r2
vld1.64 {d14}, [r1]
transpose_8x8 d2, d3, d4, d5, d6, d7, d8, d9
transpose_8x8 d10, d11, d12, d13, d14, d15, d30, d31
qpel_lowpass_x2 d2, d10, d3, d11, d0, d1, 6
qpel_lowpass_x2 d4, d12, d5, d13, d0, d1, 6
qpel_lowpass_x2 d6, d14, d7, d15, d0, d1, 6
qpel_lowpass_x2 d8, d30, d9, d31, d0, d1, 6
transpose_8x8 d2, d3, d4, d5, d6, d7, d8, d9
.ifc \type,avg
vld1.64 d12, [r0,:64], r2
vld1.64 d13, [r0,:64], r2
vld1.64 d14, [r0,:64], r2
vld1.64 d15, [r0,:64], r2
vld1.64 d16, [r0,:64], r2
vld1.64 d17, [r0,:64], r2
vld1.64 d18, [r0,:64], r2
vld1.64 d19, [r0,:64], r2
sub r0, r0, r2, lsl #3 @ rewind dst 8 rows
vrhadd.u8 q1, q1, q6
vrhadd.u8 q2, q2, q7
vrhadd.u8 q3, q3, q8
vrhadd.u8 q4, q4, q9
.endif
vst1.64 d2, [r0,:64], r2
vst1.64 d3, [r0,:64], r2
vst1.64 d4, [r0,:64], r2
vst1.64 d5, [r0,:64], r2
vst1.64 d6, [r0,:64], r2
vst1.64 d7, [r0,:64], r2
vst1.64 d8, [r0,:64], r2
vst1.64 d9, [r0,:64], r2
bx lr
endfunc
@ vertical packed-pass helpers for this type, shifts 5 and 6
rv40_qpel8_v 5, \type
rv40_qpel8_v 6, \type
@ mc10: horizontal quarter-pel, coefficients 52/20
function ff_\type\()_rv40_qpel8_mc10_neon, export=1
sub r1, r1, #2 @ filter reads from src[-2]
mov r3, #8
vmov.i8 d0, #52
vmov.i8 d1, #20
b \type\()_rv40_qpel8_h_lowpass_neon
endfunc
@ mc30: horizontal three-quarter-pel, coefficients swapped to 20/52
function ff_\type\()_rv40_qpel8_mc30_neon, export=1
sub r1, r1, #2
mov r3, #8
vmov.i8 d0, #20
vmov.i8 d1, #52
b \type\()_rv40_qpel8_h_lowpass_neon
endfunc
@ mc01: vertical quarter-pel
function ff_\type\()_rv40_qpel8_mc01_neon, export=1
push {r4, lr}
vpush {d8-d15}
sub r1, r1, r2, lsl #1 @ filter reads from src[-2*stride]
vmov.i8 d0, #52
vmov.i8 d1, #20
bl \type\()_rv40_qpel8_v_lowpass_neon
vpop {d8-d15}
pop {r4, pc}
endfunc
@ mc11: h pass (s6) into an 8-byte-aligned 14*8 stack scratch buffer,
@ then v pass (s6) from the scratch to dst
function ff_\type\()_rv40_qpel8_mc11_neon, export=1
push {r4, lr}
vpush {d8-d15}
sub sp, sp, #14*8
add r12, sp, #7
bic r12, r12, #7 @ align scratch pointer to 8
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, #12
vmov.i8 d0, #52
vmov.i8 d1, #20
bl put_rv40_qpel8_h_lp_packed_s6_neon
add r1, sp, #7
bic r1, r1, #7
bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
add sp, sp, #14*8
vpop {d8-d15}
pop {r4, pc}
endfunc
@ mc21: h pass with 20/20 at shift 5, v pass with 52/20 at shift 6
function ff_\type\()_rv40_qpel8_mc21_neon, export=1
push {r4, lr}
vpush {d8-d15}
sub sp, sp, #14*8
add r12, sp, #7
bic r12, r12, #7
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, #12
vmov.i8 d0, #20
vmov.i8 d1, #20
bl put_rv40_qpel8_h_lp_packed_s5_neon
add r1, sp, #7
bic r1, r1, #7
vmov.i8 d0, #52 @ switch centre tap for the vertical pass
bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
add sp, sp, #14*8
vpop {d8-d15}
pop {r4, pc}
endfunc
@ mc31: h pass 20/52 (s6), then coefficients swapped for the v pass (s6)
function ff_\type\()_rv40_qpel8_mc31_neon, export=1
push {r4, lr}
vpush {d8-d15}
sub sp, sp, #14*8
add r12, sp, #7
bic r12, r12, #7
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, #12
vmov.i8 d0, #20
vmov.i8 d1, #52
bl put_rv40_qpel8_h_lp_packed_s6_neon
add r1, sp, #7
bic r1, r1, #7
vswp d0, d1 @ 52/20 for the vertical pass
bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
add sp, sp, #14*8
vpop {d8-d15}
pop {r4, pc}
endfunc
@ mc12: h pass 52/20 (s6), v pass 20/20 (s5)
function ff_\type\()_rv40_qpel8_mc12_neon, export=1
push {r4, lr}
vpush {d8-d15}
sub sp, sp, #14*8
add r12, sp, #7
bic r12, r12, #7
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, #12
vmov.i8 d0, #52
vmov.i8 d1, #20
bl put_rv40_qpel8_h_lp_packed_s6_neon
add r1, sp, #7
bic r1, r1, #7
vmov.i8 d0, #20
bl \type\()_rv40_qpel8_v_lp_packed_s5_neon
add sp, sp, #14*8
vpop {d8-d15}
pop {r4, pc}
endfunc
@ mc22: half-pel both directions, 20/20 taps, shift 5 in both passes
function ff_\type\()_rv40_qpel8_mc22_neon, export=1
push {r4, lr}
vpush {d8-d15}
sub sp, sp, #14*8
add r12, sp, #7
bic r12, r12, #7
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, #12
vmov.i8 d0, #20
vmov.i8 d1, #20
bl put_rv40_qpel8_h_lp_packed_s5_neon
add r1, sp, #7
bic r1, r1, #7
bl \type\()_rv40_qpel8_v_lp_packed_s5_neon
add sp, sp, #14*8
vpop {d8-d15}
pop {r4, pc}
endfunc
@ mc32: h pass 20/52 (s6), v pass 20/20 (s5)
function ff_\type\()_rv40_qpel8_mc32_neon, export=1
push {r4, lr}
vpush {d8-d15}
sub sp, sp, #14*8
add r12, sp, #7
bic r12, r12, #7
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, #12
vmov.i8 d0, #20
vmov.i8 d1, #52
bl put_rv40_qpel8_h_lp_packed_s6_neon
add r1, sp, #7
bic r1, r1, #7
vmov.i8 d1, #20
bl \type\()_rv40_qpel8_v_lp_packed_s5_neon
add sp, sp, #14*8
vpop {d8-d15}
pop {r4, pc}
endfunc
@ mc03: vertical three-quarter-pel, coefficients 20/52
function ff_\type\()_rv40_qpel8_mc03_neon, export=1
push {r4, lr}
vpush {d8-d15}
sub r1, r1, r2, lsl #1
vmov.i8 d0, #20
vmov.i8 d1, #52
bl \type\()_rv40_qpel8_v_lowpass_neon
vpop {d8-d15}
pop {r4, pc}
endfunc
@ mc33: plain xy2 half-pel average, delegates to the shared hpel helper
function ff_\type\()_rv40_qpel8_mc33_neon, export=1
mov r3, #8
b X(ff_\type\()_pixels8_xy2_neon)
endfunc
@ mc13: h pass 52/20 (s6), coefficients swapped for the v pass (s6)
function ff_\type\()_rv40_qpel8_mc13_neon, export=1
push {r4, lr}
vpush {d8-d15}
sub sp, sp, #14*8
add r12, sp, #7
bic r12, r12, #7
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, #12
vmov.i8 d0, #52
vmov.i8 d1, #20
bl put_rv40_qpel8_h_lp_packed_s6_neon
add r1, sp, #7
bic r1, r1, #7
vswp d0, d1
bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
add sp, sp, #14*8
vpop {d8-d15}
pop {r4, pc}
endfunc
@ mc23: h pass 20/20 (s5), v pass 20/52 (s6)
function ff_\type\()_rv40_qpel8_mc23_neon, export=1
push {r4, lr}
vpush {d8-d15}
sub sp, sp, #14*8
add r12, sp, #7
bic r12, r12, #7
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, #12
vmov.i8 d0, #20
vmov.i8 d1, #20
bl put_rv40_qpel8_h_lp_packed_s5_neon
add r1, sp, #7
bic r1, r1, #7
vmov.i8 d1, #52
bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
add sp, sp, #14*8
vpop {d8-d15}
pop {r4, pc}
endfunc
@ 16x16 variants: run the 8-wide routine on the left half (16 rows),
@ then rewind dst/src and run it again on the right half.
function ff_\type\()_rv40_qpel16_mc10_neon, export=1
vmov.i8 d0, #52
vmov.i8 d1, #20
.L\type\()_rv40_qpel16_h:
push {r1, lr}
sub r1, r1, #2
mov r3, #16
bl \type\()_rv40_qpel8_h_lowpass_neon
pop {r1, lr}
sub r0, r0, r2, lsl #4 @ rewind dst 16 rows
add r0, r0, #8 @ right half of dst
add r1, r1, #6 @ right half of src (8 - 2 filter offset)
mov r3, #16
b \type\()_rv40_qpel8_h_lowpass_neon
endfunc
function ff_\type\()_rv40_qpel16_mc30_neon, export=1
vmov.i8 d0, #20
vmov.i8 d1, #52
b .L\type\()_rv40_qpel16_h
endfunc
@ 16x16 vertical-only: four 8x8 vertical passes (top/bottom, left/right)
function ff_\type\()_rv40_qpel16_mc01_neon, export=1
vmov.i8 d0, #52
vmov.i8 d1, #20
.L\type\()_rv40_qpel16_v:
sub r1, r1, r2, lsl #1
push {r1, lr}
vpush {d8-d15}
bl \type\()_rv40_qpel8_v_lowpass_neon
sub r1, r1, r2, lsl #2 @ back up: passes overlap by the 5 extra rows
bl \type\()_rv40_qpel8_v_lowpass_neon
ldr r1, [sp, #64] @ reload saved src (past the 64-byte vpush)
sub r0, r0, r2, lsl #4
add r0, r0, #8
add r1, r1, #8
bl \type\()_rv40_qpel8_v_lowpass_neon
sub r1, r1, r2, lsl #2
bl \type\()_rv40_qpel8_v_lowpass_neon
vpop {d8-d15}
pop {r1, pc}
endfunc
@ 16x16 two-pass: both 8-wide columns filtered horizontally (20 rows each)
@ into a 44*8 stack scratch, then four vertical passes from the scratch.
function ff_\type\()_rv40_qpel16_mc11_neon, export=1
sub r1, r1, r2, lsl #1
sub r1, r1, #2
push {r1, lr}
vpush {d8-d15}
sub sp, sp, #44*8
add r12, sp, #7
bic r12, r12, #7 @ 8-byte-align scratch pointer
mov r3, #20
vmov.i8 d0, #52
vmov.i8 d1, #20
bl put_rv40_qpel8_h_lp_packed_s6_neon
ldr r1, [sp, #416] @ saved src: 44*8 scratch + 64-byte vpush
add r1, r1, #8 @ right half
mov r3, #20
bl put_rv40_qpel8_h_lp_packed_s6_neon
.L\type\()_rv40_qpel16_v_s6:
add r1, sp, #7
bic r1, r1, #7
bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
sub r1, r1, #40 @ rewind 5 overlap rows (5*8 bytes)
bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
sub r0, r0, r2, lsl #4
add r0, r0, #8
bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
sub r1, r1, #40
bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
add sp, sp, #44*8
vpop {d8-d15}
pop {r1, pc}
endfunc
@ mc21: h passes 20/20 (s5), then shared s6 vertical tail with 52/20
function ff_\type\()_rv40_qpel16_mc21_neon, export=1
sub r1, r1, r2, lsl #1
sub r1, r1, #2
push {r1, lr}
vpush {d8-d15}
sub sp, sp, #44*8
add r12, sp, #7
bic r12, r12, #7
mov r3, #20
vmov.i8 d0, #20
vmov.i8 d1, #20
bl put_rv40_qpel8_h_lp_packed_s5_neon
ldr r1, [sp, #416]
add r1, r1, #8
mov r3, #20
bl put_rv40_qpel8_h_lp_packed_s5_neon
vmov.i8 d0, #52
b .L\type\()_rv40_qpel16_v_s6
endfunc
@ mc31: h passes 20/52 (s6), coefficients swapped for the s6 vertical tail
function ff_\type\()_rv40_qpel16_mc31_neon, export=1
sub r1, r1, r2, lsl #1
sub r1, r1, #2
push {r1, lr}
vpush {d8-d15}
sub sp, sp, #44*8
add r12, sp, #7
bic r12, r12, #7
mov r3, #20
vmov.i8 d0, #20
vmov.i8 d1, #52
bl put_rv40_qpel8_h_lp_packed_s6_neon
ldr r1, [sp, #416]
add r1, r1, #8
mov r3, #20
bl put_rv40_qpel8_h_lp_packed_s6_neon
vswp d0, d1
b .L\type\()_rv40_qpel16_v_s6
endfunc
@ mc12: h passes 52/20 (s6), then the shared s5 vertical tail with 20/20
function ff_\type\()_rv40_qpel16_mc12_neon, export=1
sub r1, r1, r2, lsl #1
sub r1, r1, #2
push {r1, lr}
vpush {d8-d15}
sub sp, sp, #44*8
add r12, sp, #7
bic r12, r12, #7
mov r3, #20
vmov.i8 d0, #52
vmov.i8 d1, #20
bl put_rv40_qpel8_h_lp_packed_s6_neon
ldr r1, [sp, #416]
add r1, r1, #8
mov r3, #20
bl put_rv40_qpel8_h_lp_packed_s6_neon
vmov.i8 d0, #20
.L\type\()_rv40_qpel16_v_s5:
add r1, sp, #7
bic r1, r1, #7
bl \type\()_rv40_qpel8_v_lp_packed_s5_neon
sub r1, r1, #40 @ rewind 5 overlap rows
bl \type\()_rv40_qpel8_v_lp_packed_s5_neon
sub r0, r0, r2, lsl #4
add r0, r0, #8
bl \type\()_rv40_qpel8_v_lp_packed_s5_neon
sub r1, r1, #40
bl \type\()_rv40_qpel8_v_lp_packed_s5_neon
add sp, sp, #44*8
vpop {d8-d15}
pop {r1, pc}
endfunc
@ mc22: 20/20 taps, shift 5 in both passes
function ff_\type\()_rv40_qpel16_mc22_neon, export=1
sub r1, r1, r2, lsl #1
sub r1, r1, #2
push {r1, lr}
vpush {d8-d15}
sub sp, sp, #44*8
add r12, sp, #7
bic r12, r12, #7
mov r3, #20
vmov.i8 d0, #20
vmov.i8 d1, #20
bl put_rv40_qpel8_h_lp_packed_s5_neon
ldr r1, [sp, #416]
add r1, r1, #8
mov r3, #20
bl put_rv40_qpel8_h_lp_packed_s5_neon
b .L\type\()_rv40_qpel16_v_s5
endfunc
@ mc32: h passes 20/52 (s6), then the s5 vertical tail with 20/20
function ff_\type\()_rv40_qpel16_mc32_neon, export=1
sub r1, r1, r2, lsl #1
sub r1, r1, #2
push {r1, lr}
vpush {d8-d15}
sub sp, sp, #44*8
add r12, sp, #7
bic r12, r12, #7
mov r3, #20
vmov.i8 d0, #20
vmov.i8 d1, #52
bl put_rv40_qpel8_h_lp_packed_s6_neon
ldr r1, [sp, #416]
add r1, r1, #8
mov r3, #20
bl put_rv40_qpel8_h_lp_packed_s6_neon
vmov.i8 d1, #20
b .L\type\()_rv40_qpel16_v_s5
endfunc
@ mc03: vertical three-quarter-pel via the shared 16x16 vertical path
function ff_\type\()_rv40_qpel16_mc03_neon, export=1
vmov.i8 d0, #20
vmov.i8 d1, #52
b .L\type\()_rv40_qpel16_v
endfunc
@ mc13: h passes 52/20 (s6), coefficients swapped for the s6 vertical tail
function ff_\type\()_rv40_qpel16_mc13_neon, export=1
sub r1, r1, r2, lsl #1
sub r1, r1, #2
push {r1, lr}
vpush {d8-d15}
sub sp, sp, #44*8
add r12, sp, #7
bic r12, r12, #7
mov r3, #20
vmov.i8 d0, #52
vmov.i8 d1, #20
bl put_rv40_qpel8_h_lp_packed_s6_neon
ldr r1, [sp, #416]
add r1, r1, #8
mov r3, #20
bl put_rv40_qpel8_h_lp_packed_s6_neon
vswp d0, d1
b .L\type\()_rv40_qpel16_v_s6
endfunc
@ mc23: h passes 20/20 (s5), s6 vertical tail with 20/52
function ff_\type\()_rv40_qpel16_mc23_neon, export=1
sub r1, r1, r2, lsl #1
sub r1, r1, #2
push {r1, lr}
vpush {d8-d15}
sub sp, sp, #44*8
add r12, sp, #7
bic r12, r12, #7
mov r3, #20
vmov.i8 d0, #20
vmov.i8 d1, #20
bl put_rv40_qpel8_h_lp_packed_s5_neon
ldr r1, [sp, #416]
add r1, r1, #8
mov r3, #20
bl put_rv40_qpel8_h_lp_packed_s5_neon
vmov.i8 d1, #52
b .L\type\()_rv40_qpel16_v_s6
endfunc
@ mc33: plain xy2 half-pel average via the shared hpel helper
function ff_\type\()_rv40_qpel16_mc33_neon, export=1
mov r3, #16
b X(ff_\type\()_pixels16_xy2_neon)
endfunc
.endm
@ instantiate the full set for both prediction types
rv40_qpel put
rv40_qpel avg
@ Weighted-prediction core for 16 pixels: widens src1 (q1) and src2 (q2)
@ to 16 bit, multiplies by w1 (d0[2]) and w2 (d0[0]) with 32-bit
@ accumulation, narrows >>9, sums both contributions and rounds >>5
@ back into q1.
.macro rv40_weight
vmovl.u8 q8, d2
vmovl.u8 q9, d3
vmovl.u8 q10, d4
vmovl.u8 q11, d5
vmull.u16 q2, d16, d0[2]
vmull.u16 q3, d17, d0[2]
vmull.u16 q8, d18, d0[2]
vmull.u16 q9, d19, d0[2]
vmull.u16 q12, d20, d0[0]
vmull.u16 q13, d21, d0[0]
vmull.u16 q14, d22, d0[0]
vmull.u16 q15, d23, d0[0]
vshrn.i32 d4, q2, #9
vshrn.i32 d5, q3, #9
vshrn.i32 d6, q8, #9
vshrn.i32 d7, q9, #9
vshrn.i32 d16, q12, #9
vshrn.i32 d17, q13, #9
vshrn.i32 d18, q14, #9
vshrn.i32 d19, q15, #9
vadd.u16 q2, q2, q8
vadd.u16 q3, q3, q9
vrshrn.i16 d2, q2, #5
vrshrn.i16 d3, q3, #5
.endm
/* void ff_rv40_weight_func_16_neon(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int w1, int w2, int stride) */
@ 16x16 weighted prediction; d0 holds {w2, _, w1, _}, r12 = stride.
function ff_rv40_weight_func_16_neon, export=1
ldr r12, [sp] @ w2
vmov d0, r3, r12 @ pack w1 (r3) and w2 into d0
ldr r12, [sp, #4] @ stride
mov r3, #16 @ row counter
1:
vld1.8 {q1}, [r1,:128], r12
vld1.8 {q2}, [r2,:128], r12
rv40_weight
vst1.8 {q1}, [r0,:128], r12
subs r3, r3, #1
bne 1b
bx lr
endfunc
/* void ff_rv40_weight_func_8_neon(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int w1, int w2, int stride) */
@ 8x8 weighted prediction; processes two rows per iteration so the
@ 16-pixel rv40_weight core is fully used.
function ff_rv40_weight_func_8_neon, export=1
ldr r12, [sp] @ w2
vmov d0, r3, r12 @ pack w1 (r3) and w2 into d0
ldr r12, [sp, #4] @ stride
mov r3, #8 @ row counter
1:
vld1.8 {d2}, [r1,:64], r12
vld1.8 {d3}, [r1,:64], r12
vld1.8 {d4}, [r2,:64], r12
vld1.8 {d5}, [r2,:64], r12
rv40_weight
vst1.8 {d2}, [r0,:64], r12
vst1.8 {d3}, [r0,:64], r12
subs r3, r3, #2
bne 1b
bx lr
endfunc
@ int ff_rv40_h_loop_filter_strength_neon(uint8_t *src, ptrdiff_t stride,
@                                         int beta, int beta2, int edge,
@                                         int *p1, int *q1)
@ Horizontal edge strength: compares pairwise row sums against the packed
@ thresholds (beta2, beta << 2), stores the p1/q1 filter flags, and
@ returns the strong-filter decision (0 when !edge).  Fast path: if the
@ two rows next to the edge are identical words, everything is 0.
function ff_rv40_h_loop_filter_strength_neon, export=1
pkhbt r2, r3, r2, lsl #18 @ pack beta2 | (beta << 2) into one word
ldr r3, [r0]
ldr_dpre r12, r0, r1
teq r3, r12
beq 1f @ rows equal -> no filtering
sub r0, r0, r1, lsl #1
vld1.32 {d4[]}, [r0,:32], r1 @ -3
vld1.32 {d0[]}, [r0,:32], r1 @ -2
vld1.32 {d4[1]}, [r0,:32], r1 @ -1
vld1.32 {d5[]}, [r0,:32], r1 @ 0
vld1.32 {d1[]}, [r0,:32], r1 @ 1
vld1.32 {d5[0]}, [r0,:32], r1 @ 2
vpaddl.u8 q8, q0 @ -2, -2, -2, -2, 1, 1, 1, 1
vpaddl.u8 q9, q2 @ -3, -3, -1, -1, 2, 2, 0, 0
vdup.32 d30, r2 @ beta2, beta << 2
vpadd.u16 d16, d16, d17 @ -2, -2, 1, 1
vpadd.u16 d18, d18, d19 @ -3, -1, 2, 0
vabd.u16 d16, d18, d16
vclt.u16 d16, d16, d30 @ per-row threshold comparison
ldrd r2, r3, [sp, #4] @ p1, q1 output pointers
vmovl.u16 q12, d16
vtrn.16 d16, d17
vshr.u32 q12, q12, #15 @ reduce masks to 0/1 flags
ldr r0, [sp] @ edge
vst1.32 {d24[1]}, [r2,:32]
vst1.32 {d25[1]}, [r3,:32]
cmp r0, #0
it eq
bxeq lr @ !edge: flags stored, return value unused path
vand d18, d16, d17 @ combine all comparisons for the edge decision
vtrn.32 d18, d19
vand d18, d18, d19
vmov.u16 r0, d18[0]
bx lr
1:
@ identical rows: zero both flag outputs and return 0
ldrd r2, r3, [sp, #4]
mov r0, #0
str r0, [r2]
str r0, [r3]
bx lr
endfunc
@ int ff_rv40_v_loop_filter_strength_neon(uint8_t *src, ptrdiff_t stride,
@                                         int beta, int beta2, int edge,
@                                         int *p1, int *q1)
@ Vertical edge counterpart: sums four rows columnwise, compares adjacent
@ column sums against the packed (beta2, beta << 2) thresholds, stores the
@ p1/q1 flags and returns the strong-filter decision (skipped when !edge).
function ff_rv40_v_loop_filter_strength_neon, export=1
sub r0, r0, #3 @ start 3 columns left of the edge
pkhbt r2, r3, r2, lsl #18 @ pack beta2 | (beta << 2)
vld1.8 {d0}, [r0], r1
vld1.8 {d1}, [r0], r1
vld1.8 {d2}, [r0], r1
vld1.8 {d3}, [r0], r1
vaddl.u8 q0, d0, d1
vaddl.u8 q1, d2, d3
vdup.32 q15, r2
vadd.u16 q0, q0, q1 @ -3, -2, -1, 0, 1, 2
vext.16 q1, q0, q0, #1 @ -2, -1, 0, 1, 2
vabd.u16 q0, q1, q0
vclt.u16 q0, q0, q15
ldrd r2, r3, [sp, #4] @ p1, q1 output pointers
vmovl.u16 q1, d0
vext.16 d1, d0, d1, #3
vshr.u32 q1, q1, #15 @ reduce masks to 0/1 flags
ldr r0, [sp] @ edge
vst1.32 {d2[1]}, [r2,:32]
vst1.32 {d3[1]}, [r3,:32]
cmp r0, #0
it eq
bxeq lr
vand d0, d0, d1 @ combine comparisons for the edge decision
vtrn.16 d0, d1
vand d0, d0, d1
vmov.u16 r0, d0[0]
bx lr
endfunc
@ Shared weak loop-filter core for the h/v entry points.
@ Inputs: d4/d5 = the four edge rows/columns (p2 p1 p0 q0 q1 q2 packed),
@ d0/d1 = outer rows, r2/r3 = filter_p1/filter_q1, stack = alpha, beta,
@ lim_p0q0, lim_q1, lim_p1.  Produces the filtered pixels in d4 (p0/q0)
@ and d5 (p1/q1), clamped to the lim_* ranges.
.macro rv40_weak_loop_filter
vdup.16 d30, r2 @ filter_p1
vdup.16 d31, r3 @ filter_q1
ldrd r2, r3, [sp]
vdup.16 d28, r2 @ alpha
vdup.16 d29, r3 @ beta
ldr r12, [sp, #8]
vdup.16 d25, r12 @ lim_p0q0
ldrd r2, r3, [sp, #12]
vsubl.u8 q9, d5, d4 @ x, t
vabdl.u8 q8, d5, d4 @ x, abs(t)
vneg.s16 q15, q15
vceq.i16 d16, d19, #0 @ !t
vshl.s16 d19, d19, #2 @ t << 2
vmul.u16 d18, d17, d28 @ alpha * abs(t)
vand d24, d30, d31 @ filter_p1 & filter_q1
vsubl.u8 q1, d0, d4 @ p1p2, p1p0
vsubl.u8 q3, d1, d5 @ q1q2, q1q0
vmov.i16 d22, #3
vshr.u16 d18, d18, #7
vadd.i16 d22, d22, d24 @ 3 - (filter_p1 & filter_q1)
vsubl.u8 q10, d0, d1 @ src[-2] - src[1]
vcle.u16 d18, d18, d22
vand d20, d20, d24
vneg.s16 d23, d25 @ -lim_p0q0
vadd.s16 d19, d19, d20
vbic d16, d18, d16 @ t && u <= 3 - (fp1 & fq1)
vtrn.32 d4, d5 @ -3, 2, -1, 0
vrshr.s16 d19, d19, #3
vmov d28, d29 @ beta
vswp d3, d6 @ q1q2, p1p0
vmin.s16 d19, d19, d25 @ clamp diff to +/- lim_p0q0
vand d30, d30, d16
vand d31, d31, d16
vadd.s16 q10, q1, q3 @ p1p2 + p1p0, q1q2 + q1q0
vmax.s16 d19, d19, d23 @ diff
vabs.s16 q1, q1 @ abs(p1p2), abs(q1q2)
vand d18, d19, d16 @ diff
vcle.u16 q1, q1, q14
vneg.s16 d19, d18 @ -diff
vdup.16 d26, r3 @ lim_p1
vaddw.u8 q2, q9, d5 @ src[-1]+diff, src[0]-diff
vhsub.s16 q11, q10, q9
vand q1, q1, q15
vqmovun.s16 d4, q2 @ -1, 0
vand q9, q11, q1
vdup.16 d27, r2 @ lim_q1
vneg.s16 q9, q9
vneg.s16 q14, q13
vmin.s16 q9, q9, q13 @ clamp p1/q1 update to +/- lim
vtrn.32 d0, d1 @ -2, 1, -2, 1
vmax.s16 q9, q9, q14
vaddw.u8 q3, q9, d0
vqmovun.s16 d5, q3 @ -2, 1
.endm
@ Horizontal weak loop filter: loads rows -3..2 around the edge into the
@ register layout the shared core expects, filters, writes rows -2..1 back.
function ff_rv40_h_weak_loop_filter_neon, export=1
sub r0, r0, r1, lsl #1
sub r0, r0, r1 @ point at row -3
vld1.32 {d4[]}, [r0,:32], r1
vld1.32 {d0[]}, [r0,:32], r1
vld1.32 {d4[1]}, [r0,:32], r1
vld1.32 {d5[]}, [r0,:32], r1
vld1.32 {d1[]}, [r0,:32], r1
vld1.32 {d5[0]}, [r0,:32]
sub r0, r0, r1, lsl #2 @ rewind to row -2 for the stores
rv40_weak_loop_filter
vst1.32 {d5[0]}, [r0,:32], r1
vst1.32 {d4[0]}, [r0,:32], r1
vst1.32 {d4[1]}, [r0,:32], r1
vst1.32 {d5[1]}, [r0,:32], r1
bx lr
endfunc
@ Vertical weak loop filter: loads a 4x8 column neighbourhood, transposes
@ it into the shared core's layout, filters, and scatters the four
@ filtered columns back with vst4.
function ff_rv40_v_weak_loop_filter_neon, export=1
sub r12, r0, #3 @ read from column -3
sub r0, r0, #2 @ write back from column -2
vld1.8 {d4}, [r12], r1
vld1.8 {d5}, [r12], r1
vld1.8 {d2}, [r12], r1
vld1.8 {d3}, [r12], r1
@ transpose the 4 rows into the column layout expected by the core
vtrn.16 q2, q1
vtrn.8 d4, d5
vtrn.8 d2, d3
vrev64.32 d5, d5
vtrn.32 q2, q1
vdup.32 d0, d3[0]
vdup.32 d1, d2[0]
rv40_weak_loop_filter
vtrn.32 q2, q3
vswp d4, d5
vst4.8 {d4[0],d5[0],d6[0],d7[0]}, [r0], r1
vst4.8 {d4[1],d5[1],d6[1],d7[1]}, [r0], r1
vst4.8 {d4[2],d5[2],d6[2],d7[2]}, [r0], r1
vst4.8 {d4[3],d5[3],d6[3],d7[3]}, [r0], r1
bx lr
endfunc

View File

@@ -0,0 +1,73 @@
/*
* Copyright (c) 2012 Mans Rullgard
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/arm/cpu.h"
#include "libavutil/attributes.h"
#include "libavcodec/sbrdsp.h"
void ff_sbr_sum64x5_neon(float *z);
float ff_sbr_sum_square_neon(float (*x)[2], int n);
void ff_sbr_neg_odd_64_neon(float *x);
void ff_sbr_qmf_pre_shuffle_neon(float *z);
void ff_sbr_qmf_post_shuffle_neon(float W[32][2], const float *z);
void ff_sbr_qmf_deint_neg_neon(float *v, const float *src);
void ff_sbr_qmf_deint_bfly_neon(float *v, const float *src0, const float *src1);
void ff_sbr_hf_g_filt_neon(float (*Y)[2], const float (*X_high)[40][2],
const float *g_filt, int m_max, intptr_t ixh);
void ff_sbr_hf_gen_neon(float (*X_high)[2], const float (*X_low)[2],
const float alpha0[2], const float alpha1[2],
float bw, int start, int end);
void ff_sbr_autocorrelate_neon(const float x[40][2], float phi[3][2][2]);
void ff_sbr_hf_apply_noise_0_neon(float Y[64][2], const float *s_m,
const float *q_filt, int noise,
int kx, int m_max);
void ff_sbr_hf_apply_noise_1_neon(float Y[64][2], const float *s_m,
const float *q_filt, int noise,
int kx, int m_max);
void ff_sbr_hf_apply_noise_2_neon(float Y[64][2], const float *s_m,
const float *q_filt, int noise,
int kx, int m_max);
void ff_sbr_hf_apply_noise_3_neon(float Y[64][2], const float *s_m,
const float *q_filt, int noise,
int kx, int m_max);
/**
 * Install the NEON SBR DSP routines into the context when the runtime
 * CPU flags report NEON support; otherwise leave the C defaults alone.
 */
av_cold void ff_sbrdsp_init_arm(SBRDSPContext *s)
{
    if (!have_neon(av_get_cpu_flags()))
        return;

    s->sum64x5           = ff_sbr_sum64x5_neon;
    s->sum_square        = ff_sbr_sum_square_neon;
    s->neg_odd_64        = ff_sbr_neg_odd_64_neon;
    s->qmf_pre_shuffle   = ff_sbr_qmf_pre_shuffle_neon;
    s->qmf_post_shuffle  = ff_sbr_qmf_post_shuffle_neon;
    s->qmf_deint_neg     = ff_sbr_qmf_deint_neg_neon;
    s->qmf_deint_bfly    = ff_sbr_qmf_deint_bfly_neon;
    s->hf_g_filt         = ff_sbr_hf_g_filt_neon;
    s->hf_gen            = ff_sbr_hf_gen_neon;
    s->autocorrelate     = ff_sbr_autocorrelate_neon;
    s->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_neon;
    s->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_neon;
    s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_neon;
    s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_neon;
}

View File

@@ -0,0 +1,411 @@
/*
* Copyright (c) 2012 Mans Rullgard
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
@ void ff_sbr_sum64x5_neon(float *z)
@ In-place 5-way sum over a 320-float buffer:
@   z[i] += z[i+64] + z[i+128] + z[i+192] + z[i+256]  for i = 0..63,
@ processed four floats per iteration (r12 counts down from 64 by 4).
function ff_sbr_sum64x5_neon, export=1
push {lr}
@ r1..r3 and lr address the four source blocks at 64/128/192/256 floats in.
add r1, r0, # 64*4
add r2, r0, #128*4
add r3, r0, #192*4
add lr, r0, #256*4
mov r12, #64
1:
vld1.32 {q0}, [r0,:128]
vld1.32 {q1}, [r1,:128]!
vadd.f32 q0, q0, q1
vld1.32 {q2}, [r2,:128]!
vadd.f32 q0, q0, q2
vld1.32 {q3}, [r3,:128]!
vadd.f32 q0, q0, q3
vld1.32 {q8}, [lr,:128]!
vadd.f32 q0, q0, q8
@ write the accumulated 4 floats back to z[i..i+3] and advance.
vst1.32 {q0}, [r0,:128]!
subs r12, #4
bgt 1b
pop {pc}
endfunc
@ float ff_sbr_sum_square_neon(float (*x)[2], int n)
@ Returns the sum of squares of n complex (2-float) entries.  q0 keeps
@ four running partial sums which are reduced horizontally at the end.
@ NOTE(review): no tail handling — assumes n is even (4 floats per
@ iteration); confirm against the callers in sbr code.
function ff_sbr_sum_square_neon, export=1
vmov.f32 q0, #0.0
1:
vld1.32 {q1}, [r0,:128]!
vmla.f32 q0, q1, q1
@ two complex values (four floats) consumed per iteration.
subs r1, r1, #2
bgt 1b
@ horizontal reduction: 4 lanes -> 2 -> 1; result ends up in s0.
vadd.f32 d0, d0, d1
vpadd.f32 d0, d0, d0
@ softfp ABI returns the float in r0; hardfp returns it in s0 directly.
NOVFP vmov.32 r0, d0[0]
bx lr
endfunc
@ void ff_sbr_neg_odd_64_neon(float *x)
@ Negates the odd-indexed floats of a 64-element array by XORing their
@ sign bit.  vld2/vst2 de-interleave even (q0/q2) and odd (q1/q3) lanes;
@ q8 holds the 0x80000000 sign mask.  The software-pipelined .rept body
@ overlaps each store with the next load.
function ff_sbr_neg_odd_64_neon, export=1
mov r1, r0
vmov.i32 q8, #1<<31
vld2.32 {q0,q1}, [r0,:128]!
veor q1, q1, q8
vld2.32 {q2,q3}, [r0,:128]!
.rept 3
vst2.32 {q0,q1}, [r1,:128]!
veor q3, q3, q8
vld2.32 {q0,q1}, [r0,:128]!
vst2.32 {q2,q3}, [r1,:128]!
veor q1, q1, q8
vld2.32 {q2,q3}, [r0,:128]!
.endr
@ drain the pipeline: flip and store the final two batches.
veor q3, q3, q8
vst2.32 {q0,q1}, [r1,:128]!
vst2.32 {q2,q3}, [r1,:128]!
bx lr
endfunc
@ void ff_sbr_qmf_pre_shuffle_neon(float *z)
@ QMF pre-shuffle: copies z[0..1] to z[64..65], then fills the rest of
@ the second half by interleaving sign-flipped (XOR 0x80000000, q8),
@ element-reversed values read backwards from z+60*4 (r1, step r3 = -16)
@ with forward values read from z (r0).  r2 writes the output at z+64.
@ NOTE(review): intended to mirror the C ff_sbr_qmf_pre_shuffle in
@ sbrdsp.c — verify element ordering against that reference.
function ff_sbr_qmf_pre_shuffle_neon, export=1
add r1, r0, #60*4
add r2, r0, #64*4
vld1.32 {d0}, [r0,:64]!
vst1.32 {d0}, [r2,:64]!
mov r3, #-16
mov r12, #24
vmov.i32 q8, #1<<31
vld1.32 {q0}, [r1,:128], r3
vld1.32 {d2}, [r0,:64]!
1:
vld1.32 {d3,d4}, [r0,:128]!
vrev64.32 q0, q0
vld1.32 {q9}, [r1,:128], r3
veor q0, q0, q8
vld1.32 {d5,d6}, [r0,:128]!
vswp d0, d1
vrev64.32 q9, q9
vst2.32 {q0,q1}, [r2,:64]!
vmov q10, q2
veor q9, q9, q8
vmov d2, d6
vswp d18, d19
vld1.32 {q0}, [r1,:128], r3
vst2.32 {q9,q10}, [r2,:64]!
subs r12, r12, #8
bgt 1b
@ epilogue: same shuffle for the final, partially-filled batch.
vld1.32 {d3,d4}, [r0,:128]!
vrev64.32 q0, q0
vld1.32 {q9}, [r1,:128], r3
veor q0, q0, q8
vld1.32 {d5}, [r0,:64]!
vswp d0, d1
vrev64.32 q9, q9
vst2.32 {q0,q1}, [r2,:64]!
vswp d4, d5
veor q1, q9, q8
vst2.32 {d3,d5}, [r2,:64]!
vst2.32 {d2[0],d4[0]}, [r2,:64]!
bx lr
endfunc
@ void ff_sbr_qmf_post_shuffle_neon(float W[32][2], const float *z)
@ QMF post-shuffle: r2 reads z backwards from z+60*4 (step r3 = -16)
@ with the sign bit flipped via q8 and elements reversed, r1 reads z
@ forwards, and vst2 interleaves the two streams into W.  Loop is
@ unrolled twice (8 outputs per iteration, r12 = 32 total).
function ff_sbr_qmf_post_shuffle_neon, export=1
add r2, r1, #60*4
mov r3, #-16
mov r12, #32
vmov.i32 q8, #1<<31
vld1.32 {q0}, [r2,:128], r3
vld1.32 {q1}, [r1,:128]!
1:
pld [r2, #-32]
vrev64.32 q0, q0
vswp d2, d3
veor q0, q0, q8
vld1.32 {q2}, [r2,:128], r3
vld1.32 {q3}, [r1,:128]!
vst2.32 {d1,d3}, [r0,:128]!
vst2.32 {d0,d2}, [r0,:128]!
@ second unrolled copy of the same shuffle on q2/q3.
pld [r2, #-32]
vrev64.32 q2, q2
vswp d6, d7
veor q2, q2, q8
vld1.32 {q0}, [r2,:128], r3
vld1.32 {q1}, [r1,:128]!
vst2.32 {d5,d7}, [r0,:128]!
vst2.32 {d4,d6}, [r0,:128]!
subs r12, r12, #8
bgt 1b
bx lr
endfunc
@ void ff_sbr_qmf_deint_neg_neon(float *v, const float *src)
@ De-interleaves 64 floats from the end of src (r1 walks backwards,
@ step r3 = -16): even-lane elements are sign-flipped (XOR with
@ d2 = 0x80000000) and stored backwards from v+62*4 (r2), odd-lane
@ elements are reversed and stored forwards from v (r0).
function ff_sbr_qmf_deint_neg_neon, export=1
add r1, r1, #60*4
add r2, r0, #62*4
mov r3, #-16
mov r12, #32
vmov.i32 d2, #1<<31
1:
vld2.32 {d0,d1}, [r1,:128], r3
veor d0, d0, d2
vrev64.32 d1, d1
vst1.32 {d0}, [r2,:64]
vst1.32 {d1}, [r0,:64]!
@ r2 moves down one 2-float pair per iteration.
sub r2, r2, #8
subs r12, r12, #2
bgt 1b
bx lr
endfunc
@ void ff_sbr_qmf_deint_bfly_neon(float *v, const float *src0, const float *src1)
@ Butterfly de-interleave: src0 is read forwards (r1), src1 backwards
@ from src1+60*4 (r2, step lr = -16).  Element-reversed sums go to the
@ top half of v (r3, backwards from v+124*4) and differences to the
@ bottom half (r0, forwards), four floats per iteration.
function ff_sbr_qmf_deint_bfly_neon, export=1
push {lr}
add r2, r2, #60*4
add r3, r0, #124*4
mov r12, #64
mov lr, #-16
1:
vld1.32 {q0}, [r1,:128]!
vld1.32 {q1}, [r2,:128], lr
@ q2/q3 are the 64-bit-lane reversed copies used for the butterfly.
vrev64.32 q2, q0
vrev64.32 q3, q1
vadd.f32 d3, d4, d3
vadd.f32 d2, d5, d2
vsub.f32 d0, d0, d7
vsub.f32 d1, d1, d6
vst1.32 {q1}, [r3,:128], lr
vst1.32 {q0}, [r0,:128]!
subs r12, r12, #4
bgt 1b
pop {pc}
endfunc
@ void ff_sbr_hf_g_filt_neon(float (*Y)[2], const float (*X_high)[40][2],
@                            const float *g_filt, int m_max, intptr_t ixh)
@ Y[m] = X_high[m][ixh] * g_filt[m].  The fifth argument (ixh) is on the
@ stack; r1 is offset by ixh complex values, and r12 = 40*2*4 is the byte
@ stride between consecutive X_high rows.  Two m per iteration, with a
@ single-element tail when the count is odd.
function ff_sbr_hf_g_filt_neon, export=1
@ ixh (5th arg) from the stack.
ldr r12, [sp]
add r1, r1, r12, lsl #3
mov r12, #40*2*4
sub r3, r3, #1
vld2.32 {d2[],d3[]},[r2,:64]!
vld1.32 {d0}, [r1,:64], r12
1:
vld1.32 {d1}, [r1,:64], r12
vmul.f32 q3, q0, q1
vld2.32 {d2[],d3[]},[r2,:64]!
vld1.32 {d0}, [r1,:64], r12
vst1.32 {q3}, [r0,:64]!
subs r3, r3, #2
bgt 1b
@ early exit when no odd tail element remains.
it lt
bxlt lr
vmul.f32 d0, d0, d2
vst1.32 {d0}, [r0,:64]!
bx lr
endfunc
@ void ff_sbr_hf_gen_neon(float (*X_high)[2], const float (*X_low)[2],
@                         const float alpha0[2], const float alpha1[2],
@                         float bw, int start, int end)
@ High-frequency generation: X_high[i] = X_low[i] plus a two-tap complex
@ prediction from X_low[i-1] and X_low[i-2].  After the setup below,
@ d1 = alpha0 * bw and d0 = alpha1 * bw^2 (the scaled filter taps).
@ NOVFP/VFP: bw arrives in s0 under hardfp, on the stack under softfp;
@ the ldrd offset likewise depends on HAVE_VFP_ARGS.
function ff_sbr_hf_gen_neon, export=1
NOVFP vld1.32 {d1[]}, [sp,:32]
VFP vdup.32 d1, d0[0]
vmul.f32 d0, d1, d1
vld1.32 {d3}, [r2,:64]
vld1.32 {d2}, [r3,:64]
vmul.f32 q0, q0, q1
@ start/end from the stack (offset differs for hardfp vs softfp).
ldrd r2, r3, [sp, #4*!HAVE_VFP_ARGS]
vtrn.32 d0, d1
@ d18 = {-imag, real} pattern used for the complex multiply-accumulate.
vneg.f32 d18, d1
vtrn.32 d18, d1
add r0, r0, r2, lsl #3
add r1, r1, r2, lsl #3
sub r1, r1, #2*8
sub r3, r3, r2
vld1.32 {q1}, [r1,:128]!
1:
vld1.32 {q3}, [r1,:128]!
vrev64.32 q2, q1
vmov q8, q3
vrev64.32 d20, d3
vrev64.32 d21, d6
vmla.f32 q3, q1, d0[0]
vmla.f32 d6, d4, d18
vmla.f32 d7, d20, d18
vmla.f32 d6, d3, d0[1]
vmla.f32 d7, d16, d0[1]
vmla.f32 d6, d5, d1
vmla.f32 d7, d21, d1
vmov q1, q8
vst1.32 {q3}, [r0,:128]!
subs r3, r3, #2
bgt 1b
bx lr
endfunc
@ void ff_sbr_autocorrelate_neon(const float x[40][2], float phi[3][2][2])
@ Autocorrelation of 38 complex samples: the main loop (r12 counts 36 in
@ steps of 2) accumulates lag-1 products in q1, lag-2 products in q3 and
@ energies in q10; an unrolled tail then finishes the last pair and
@ assembles the phi matrix.  NOTE(review): the exact phi store layout is
@ meant to match the C ff_sbr_autocorrelate in sbrdsp.c — confirm there.
function ff_sbr_autocorrelate_neon, export=1
vld1.32 {q0}, [r0,:128]!
vmov.f32 q1, #0.0
vmov.f32 q3, #0.0
vmov.f32 d20, #0.0
vmul.f32 d21, d1, d1
vmov q8, q0
vmov q11, q0
mov r12, #36
1:
vld1.32 {q2}, [r0,:128]!
@ q12 = element-reversed copy for the conjugate parts of the products.
vrev64.32 q12, q2
vmla.f32 q10, q2, q2
vmla.f32 d2, d1, d4
vmla.f32 d3, d1, d24
vmla.f32 d6, d0, d4
vmla.f32 d7, d0, d24
vmla.f32 d2, d4, d5
vmla.f32 d3, d4, d25
vmla.f32 d6, d1, d5
vmla.f32 d7, d1, d25
vmov q0, q2
subs r12, r12, #2
bgt 1b
@ tail: last two samples, then fold everything into the outputs.
vld1.32 {q2}, [r0,:128]!
vrev64.32 q12, q2
vmla.f32 d2, d1, d4
vmla.f32 d3, d1, d24
vmla.f32 d6, d0, d4
vmla.f32 d7, d0, d24
vadd.f32 d20, d20, d21
vrev64.32 d18, d17
vmla.f32 d6, d1, d5
vmla.f32 d7, d1, d25
vmov q0, q1
vmla.f32 d0, d16, d17
vmla.f32 d1, d16, d18
vmla.f32 d2, d4, d5
vmla.f32 d3, d4, d25
vneg.f32 s15, s15
vmov d21, d20
vpadd.f32 d0, d0, d2
vpadd.f32 d7, d6, d7
vtrn.32 d1, d3
vsub.f32 d6, d1, d3
vmla.f32 d20, d22, d22
vmla.f32 d21, d4, d4
vtrn.32 d0, d6
vpadd.f32 d20, d20, d21
@ scatter results into the phi[3][2][2] output at r1.
vst1.32 {q3}, [r1,:128]!
vst1.32 {d20[1]}, [r1,:32]
add r1, r1, #2*4
vst1.32 {d0}, [r1,:64]
add r1, r1, #4*4
vst1.32 {d20[0]}, [r1,:32]
bx lr
endfunc
@ void ff_sbr_hf_apply_noise_0_neon(float Y[64][2], const float *s_m,
@                                   const float *q_filt, int noise,
@                                   int kx, int m_max)
@ Phase-pattern 0.  The shared body at .Lhf_apply_noise_0 is also the
@ tail of the _2 variant, which enters with a different sign word in d3:
@ for each m, if s_m[m] == 0 add ff_sbr_noise_table[noise] * q_filt[m]
@ to Y[m], otherwise add s_m[m] (XORed with d3) to Y[m]'s real lane.
@ r3 is the noise index, wrapped to [0,511] by bfc r3, #9, #23; the
@ 6th argument m_max is read from the stack.
function ff_sbr_hf_apply_noise_0_neon, export=1
vmov.i32 d3, #0
.Lhf_apply_noise_0:
push {r4,lr}
movrelx r4, X(ff_sbr_noise_table)
ldr r12, [sp, #12]
add r3, r3, #1
bfc r3, #9, #23
sub r12, r12, #1
1:
add lr, r4, r3, lsl #3
vld2.32 {q0}, [r0,:64]
vld2.32 {q3}, [lr,:64]
vld1.32 {d2}, [r1,:64]!
vld1.32 {d18}, [r2,:64]!
@ d16: all-ones lanes where s_m[m] == 0 (use noise there).
vceq.f32 d16, d2, #0
veor d2, d2, d3
vmov q2, q0
vmla.f32 d0, d6, d18
vmla.f32 d1, d7, d18
vadd.f32 d4, d4, d2
add r3, r3, #2
bfc r3, #9, #23
@ select noise-added (d0/d1) or s_m-added (d4/d5) result per lane.
vbif d0, d4, d16
vbif d1, d5, d16
vst2.32 {q0}, [r0,:64]!
subs r12, r12, #2
bgt 1b
blt 2f
@ odd m_max: one final scalar element.
add lr, r4, r3, lsl #3
vld1.32 {d0}, [r0,:64]
vld1.32 {d6}, [lr,:64]
vld1.32 {d2[]}, [r1,:32]!
vld1.32 {d3[]}, [r2,:32]!
vceq.f32 d4, d2, #0
veor d2, d2, d3
vmov d1, d0
vmla.f32 d0, d6, d3
vadd.f32 s2, s2, s4
vbif d0, d1, d4
vst1.32 {d0}, [r0,:64]!
2:
pop {r4,pc}
endfunc
@ void ff_sbr_hf_apply_noise_1_neon(float Y[64][2], const float *s_m,
@                                   const float *q_filt, int noise,
@                                   int kx, int m_max)
@ Phase-pattern 1: like pattern 0 but s_m is added to Y[m]'s imaginary
@ lane (d5) with alternating sign.  The initial sign comes from bit 0 of
@ kx (5th arg, read from the stack and shifted into the sign bit); d3
@ holds the {sign, ~sign} pair applied via veor.  The body after building
@ d3 (.Lhf_apply_noise_1) is shared with the _3 variant.
function ff_sbr_hf_apply_noise_1_neon, export=1
ldr r12, [sp]
push {r4,lr}
lsl r12, r12, #31
eor lr, r12, #1<<31
vmov d3, r12, lr
.Lhf_apply_noise_1:
movrelx r4, X(ff_sbr_noise_table)
ldr r12, [sp, #12]
add r3, r3, #1
bfc r3, #9, #23
sub r12, r12, #1
1:
add lr, r4, r3, lsl #3
vld2.32 {q0}, [r0,:64]
vld2.32 {q3}, [lr,:64]
vld1.32 {d2}, [r1,:64]!
vld1.32 {d18}, [r2,:64]!
@ d16: all-ones lanes where s_m[m] == 0 (use noise there).
vceq.f32 d16, d2, #0
veor d2, d2, d3
vmov q2, q0
vmla.f32 d0, d6, d18
vmla.f32 d1, d7, d18
@ note: s_m goes to the imaginary lane (d5) here, unlike pattern 0.
vadd.f32 d5, d5, d2
add r3, r3, #2
bfc r3, #9, #23
vbif d0, d4, d16
vbif d1, d5, d16
vst2.32 {q0}, [r0,:64]!
subs r12, r12, #2
bgt 1b
blt 2f
@ odd m_max: one final scalar element.
add lr, r4, r3, lsl #3
vld1.32 {d0}, [r0,:64]
vld1.32 {d6}, [lr,:64]
vld1.32 {d2[]}, [r1,:32]!
vld1.32 {d18[]}, [r2,:32]!
vceq.f32 d4, d2, #0
veor d2, d2, d3
vmov d1, d0
vmla.f32 d0, d6, d18
vadd.f32 s3, s3, s5
vbif d0, d1, d4
vst1.32 {d0}, [r0,:64]!
2:
pop {r4,pc}
endfunc
@ Phase-pattern 2: same as pattern 0 but with d3 = 0x80000000, so s_m
@ is sign-flipped (subtracted) before being added to the real lane.
function ff_sbr_hf_apply_noise_2_neon, export=1
vmov.i32 d3, #1<<31
b .Lhf_apply_noise_0
endfunc
@ Phase-pattern 3: same as pattern 1 but with the two sign words in d3
@ swapped (vmov d3, lr, r12 instead of r12, lr), inverting the
@ alternating sign applied to the imaginary lane.
function ff_sbr_hf_apply_noise_3_neon, export=1
ldr r12, [sp]
push {r4,lr}
lsl r12, r12, #31
eor lr, r12, #1<<31
vmov d3, lr, r12
b .Lhf_apply_noise_1
endfunc

View File

@@ -0,0 +1,479 @@
/*
* Copyright (C) 2002 Frederic 'dilb' Boulay
*
* Author: Frederic Boulay <dilb@handhelds.org>
*
* The function defined in this file is derived from the simple_idct function
* from the libavcodec library part of the FFmpeg project.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
/* useful constants for the algorithm */
#define W1 22725
#define W2 21407
#define W3 19266
#define W4 16383
#define W5 12873
#define W6 8867
#define W7 4520
#define MASK_MSHW 0xFFFF0000
#define ROW_SHIFT 11
#define ROW_SHIFT2MSHW (16-11)
#define COL_SHIFT 20
#define ROW_SHIFTED_1 1024 /* 1<< (ROW_SHIFT-1) */
#define COL_SHIFTED_1 524288 /* 1<< (COL_SHIFT-1) */
function ff_simple_idct_arm, export=1
@@ void simple_idct_arm(int16_t *block)
@@ save stack for reg needed (take all of them),
@@ R0-R3 are scratch regs, so no need to save them, but R0 contains the pointer to block
@@ so it must not be overwritten, if it is not saved!!
@@ R12 is another scratch register, so it should not be saved too
@@ save all registers
stmfd sp!, {r4-r11, r14} @ R14 is also called LR
@@ at this point, R0=block, other registers are free.
add r14, r0, #112 @ R14=&block[8*7], better start from the last row, and decrease the value until row=0, i.e. R12=block.
@@ add 2 temporary variables in the stack: R0 and R14
sub sp, sp, #8 @ allow 2 local variables
str r0, [sp, #0] @ save block in sp[0]
@@ stack status
@@ sp+4 free
@@ sp+0 R0 (block)
@@ at this point, R0=block, R14=&block[56], R12=__const_ptr_, R1-R11 free
__row_loop:
@@ read the row and check if it is null, almost null, or not, according to strongarm specs, it is not necessary to optimize ldr accesses (i.e. split 32bits in 2 16bits words), at least it gives more usable registers :)
ldr r1, [r14, #0] @ R1=(int32)(R12)[0]=ROWr32[0] (relative row cast to a 32b pointer)
ldr r2, [r14, #4] @ R2=(int32)(R12)[1]=ROWr32[1]
ldr r3, [r14, #8] @ R3=ROWr32[2]
ldr r4, [r14, #12] @ R4=ROWr32[3]
@@ check if the words are null, if all of them are null, then proceed with next row (branch __end_row_loop),
@@ if ROWr16[0] is the only one not null, then proceed with this special case (branch __almost_empty_row)
@@ else follow the complete algorithm.
@@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
@@ R3=ROWr32[2], R4=ROWr32[3], R5-R11 free
orr r5, r4, r3 @ R5=R4 | R3
orr r5, r5, r2 @ R5=R4 | R3 | R2
orrs r6, r5, r1 @ Test R5 | R1 (the aim is to check if everything is null)
beq __end_row_loop
mov r7, r1, asr #16 @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later)
ldrsh r6, [r14, #0] @ R6=ROWr16[0]
orrs r5, r5, r7 @ R5=R4 | R3 | R2 | R7
beq __almost_empty_row
@@ __b_evaluation:
@@ at this point, R0=block (temp), R1(free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3],
@@ R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free,
@@ R12=__const_ptr_, R14=&block[n]
@@ to save some registers/calls, proceed with b0-b3 first, followed by a0-a3
@@ MUL16(b0, W1, row[1]);
@@ MUL16(b1, W3, row[1]);
@@ MUL16(b2, W5, row[1]);
@@ MUL16(b3, W7, row[1]);
@@ MAC16(b0, W3, row[3]);
@@ MAC16(b1, -W7, row[3]);
@@ MAC16(b2, -W1, row[3]);
@@ MAC16(b3, -W5, row[3]);
ldr r8, =W1 @ R8=W1
mov r2, r2, asr #16 @ R2=ROWr16[3]
mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
ldr r9, =W3 @ R9=W3
ldr r10, =W5 @ R10=W5
mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
ldr r11, =W7 @ R11=W7
mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
teq r2, #0 @ if null avoid muls
itttt ne
mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
rsbne r2, r2, #0 @ R2=-ROWr16[3]
mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
it ne
mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
@@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
@@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
@@ R12=__const_ptr_, R14=&block[n]
@@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
@@ if (temp != 0) {}
orrs r2, r3, r4 @ R2=ROWr32[2] | ROWr32[3]
beq __end_b_evaluation
@@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
@@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
@@ R12=__const_ptr_, R14=&block[n]
@@ MAC16(b0, W5, row[5]);
@@ MAC16(b2, W7, row[5]);
@@ MAC16(b3, W3, row[5]);
@@ MAC16(b1, -W1, row[5]);
@@ MAC16(b0, W7, row[7]);
@@ MAC16(b2, W3, row[7]);
@@ MAC16(b3, -W1, row[7]);
@@ MAC16(b1, -W5, row[7]);
mov r3, r3, asr #16 @ R3=ROWr16[5]
teq r3, #0 @ if null avoid muls
it ne
mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5]=b0
mov r4, r4, asr #16 @ R4=ROWr16[7]
itttt ne
mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5]=b2
mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5]=b3
rsbne r3, r3, #0 @ R3=-ROWr16[5]
mlane r1, r8, r3, r1 @ R7-=W1*ROWr16[5]=b1
@@ R3 is free now
teq r4, #0 @ if null avoid muls
itttt ne
mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7]=b0
mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7]=b2
rsbne r4, r4, #0 @ R4=-ROWr16[7]
mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7]=b3
it ne
mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7]=b1
@@ R4 is free now
__end_b_evaluation:
@@ at this point, R0=b0, R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free),
@@ R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
@@ R12=__const_ptr_, R14=&block[n]
@@ __a_evaluation:
@@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
@@ a1 = a0 + W6 * row[2];
@@ a2 = a0 - W6 * row[2];
@@ a3 = a0 - W2 * row[2];
@@ a0 = a0 + W2 * row[2];
ldr r9, =W4 @ R9=W4
mul r6, r9, r6 @ R6=W4*ROWr16[0]
ldr r10, =W6 @ R10=W6
ldrsh r4, [r14, #4] @ R4=ROWr16[2] (a3 not defined yet)
add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0)
mul r11, r10, r4 @ R11=W6*ROWr16[2]
ldr r8, =W2 @ R8=W2
sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2)
@@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
@@ if (temp != 0) {}
teq r2, #0
beq __end_bef_a_evaluation
add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1)
mul r11, r8, r4 @ R11=W2*ROWr16[2]
sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3)
add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0)
@@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
@@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
@@ R12=__const_ptr_, R14=&block[n]
@@ a0 += W4*row[4]
@@ a1 -= W4*row[4]
@@ a2 -= W4*row[4]
@@ a3 += W4*row[4]
ldrsh r11, [r14, #8] @ R11=ROWr16[4]
teq r11, #0 @ if null avoid muls
it ne
mulne r11, r9, r11 @ R11=W4*ROWr16[4]
@@ R9 is free now
ldrsh r9, [r14, #12] @ R9=ROWr16[6]
itttt ne
addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0)
subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1)
subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2)
addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3)
@@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
teq r9, #0 @ if null avoid muls
itttt ne
mulne r11, r10, r9 @ R11=W6*ROWr16[6]
addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0)
mulne r10, r8, r9 @ R10=W2*ROWr16[6]
@@ a0 += W6*row[6];
@@ a3 -= W6*row[6];
@@ a1 -= W2*row[6];
@@ a2 += W2*row[6];
subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3)
itt ne
subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1)
addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2)
__end_a_evaluation:
@@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
@@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
@@ R12=__const_ptr_, R14=&block[n]
@@ row[0] = (a0 + b0) >> ROW_SHIFT;
@@ row[1] = (a1 + b1) >> ROW_SHIFT;
@@ row[2] = (a2 + b2) >> ROW_SHIFT;
@@ row[3] = (a3 + b3) >> ROW_SHIFT;
@@ row[4] = (a3 - b3) >> ROW_SHIFT;
@@ row[5] = (a2 - b2) >> ROW_SHIFT;
@@ row[6] = (a1 - b1) >> ROW_SHIFT;
@@ row[7] = (a0 - b0) >> ROW_SHIFT;
add r8, r6, r0 @ R8=a0+b0
add r9, r2, r1 @ R9=a1+b1
@@ put 2 16 bits half-words in a 32bits word
@@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (only Little Endian compliant then!!!)
ldr r10, =MASK_MSHW @ R10=0xFFFF0000
and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5)
mvn r11, r10 @ R11= NOT R10= 0x0000FFFF
and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11)
orr r8, r8, r9
str r8, [r14, #0]
add r8, r3, r5 @ R8=a2+b2
add r9, r4, r7 @ R9=a3+b3
and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5)
and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11)
orr r8, r8, r9
str r8, [r14, #4]
sub r8, r4, r7 @ R8=a3-b3
sub r9, r3, r5 @ R9=a2-b2
and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5)
and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11)
orr r8, r8, r9
str r8, [r14, #8]
sub r8, r2, r1 @ R8=a1-b1
sub r9, r6, r0 @ R9=a0-b0
and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5)
and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11)
orr r8, r8, r9
str r8, [r14, #12]
bal __end_row_loop
__almost_empty_row:
@@ the row was empty, except ROWr16[0], now, management of this special case
@@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
@@ R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1],
@@ R8=0xFFFF (temp), R9-R11 free
mov r8, #0x10000 @ R8=0xFFFF (2 steps needed!) it saves a ldr call (because of delay run).
sub r8, r8, #1 @ R8 is now ready.
and r5, r8, r6, lsl #3 @ R5=R8 & (R6<<3)= (ROWr16[0]<<3) & 0xFFFF
orr r5, r5, r5, lsl #16 @ R5=R5 | (R5<<16)
str r5, [r14, #0] @ R14[0]=ROWr32[0]=R5
str r5, [r14, #4] @ R14[4]=ROWr32[1]=R5
str r5, [r14, #8] @ R14[8]=ROWr32[2]=R5
str r5, [r14, #12] @ R14[12]=ROWr32[3]=R5
__end_row_loop:
@@ at this point, R0-R11 (free)
@@ R12=__const_ptr_, R14=&block[n]
ldr r0, [sp, #0] @ R0=block
teq r0, r14 @ compare current &block[8*n] to block, when block is reached, the loop is finished.
sub r14, r14, #16
bne __row_loop
@@ at this point, R0=block, R1-R11 (free)
@@ R12=__const_ptr_, R14=&block[n]
add r14, r0, #14 @ R14=&block[7], better start from the last col, and decrease the value until col=0, i.e. R14=block.
__col_loop:
@@ __b_evaluation2:
@@ at this point, R0=block (temp), R1-R11 (free)
@@ R12=__const_ptr_, R14=&block[n]
@@ proceed with b0-b3 first, followed by a0-a3
@@ MUL16(b0, W1, col[8x1]);
@@ MUL16(b1, W3, col[8x1]);
@@ MUL16(b2, W5, col[8x1]);
@@ MUL16(b3, W7, col[8x1]);
@@ MAC16(b0, W3, col[8x3]);
@@ MAC16(b1, -W7, col[8x3]);
@@ MAC16(b2, -W1, col[8x3]);
@@ MAC16(b3, -W5, col[8x3]);
ldr r8, =W1 @ R8=W1
ldrsh r7, [r14, #16]
mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
ldr r9, =W3 @ R9=W3
ldr r10, =W5 @ R10=W5
mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
ldr r11, =W7 @ R11=W7
mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
ldrsh r2, [r14, #48]
mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
teq r2, #0 @ if 0, then avoid muls
itttt ne
mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
rsbne r2, r2, #0 @ R2=-ROWr16[3]
mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
it ne
mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
@@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
@@ R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
@@ R12=__const_ptr_, R14=&block[n]
@@ MAC16(b0, W5, col[5x8]);
@@ MAC16(b2, W7, col[5x8]);
@@ MAC16(b3, W3, col[5x8]);
@@ MAC16(b1, -W1, col[5x8]);
@@ MAC16(b0, W7, col[7x8]);
@@ MAC16(b2, W3, col[7x8]);
@@ MAC16(b3, -W1, col[7x8]);
@@ MAC16(b1, -W5, col[7x8]);
ldrsh r3, [r14, #80] @ R3=COLr16[5x8]
teq r3, #0 @ if 0 then avoid muls
itttt ne
mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5x8]=b0
mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5x8]=b2
mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5x8]=b3
rsbne r3, r3, #0 @ R3=-ROWr16[5x8]
ldrsh r4, [r14, #112] @ R4=COLr16[7x8]
it ne
mlane r1, r8, r3, r1 @ R7-=W1*ROWr16[5x8]=b1
@@ R3 is free now
teq r4, #0 @ if 0 then avoid muls
itttt ne
mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7x8]=b0
mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7x8]=b2
rsbne r4, r4, #0 @ R4=-ROWr16[7x8]
mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7x8]=b3
it ne
mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7x8]=b1
@@ R4 is free now
@@ __end_b_evaluation2:
@@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
@@ R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
@@ R12=__const_ptr_, R14=&block[n]
@@ __a_evaluation2:
@@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1));
@@ a1 = a0 + W6 * row[2];
@@ a2 = a0 - W6 * row[2];
@@ a3 = a0 - W2 * row[2];
@@ a0 = a0 + W2 * row[2];
ldrsh r6, [r14, #0]
ldr r9, =W4 @ R9=W4
mul r6, r9, r6 @ R6=W4*ROWr16[0]
ldr r10, =W6 @ R10=W6
ldrsh r4, [r14, #32] @ R4=ROWr16[2] (a3 not defined yet)
add r6, r6, #COL_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(COL_SHIFT-1) (a0)
mul r11, r10, r4 @ R11=W6*ROWr16[2]
ldr r8, =W2 @ R8=W2
add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1)
sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2)
mul r11, r8, r4 @ R11=W2*ROWr16[2]
sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3)
add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0)
@@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
@@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
@@ R12=__const_ptr_, R14=&block[n]
@@ a0 += W4*row[4]
@@ a1 -= W4*row[4]
@@ a2 -= W4*row[4]
@@ a3 += W4*row[4]
ldrsh r11, [r14, #64] @ R11=ROWr16[4]
teq r11, #0 @ if null avoid muls
itttt ne
mulne r11, r9, r11 @ R11=W4*ROWr16[4]
@@ R9 is free now
addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0)
subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1)
subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2)
ldrsh r9, [r14, #96] @ R9=ROWr16[6]
it ne
addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3)
@@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
teq r9, #0 @ if null avoid muls
itttt ne
mulne r11, r10, r9 @ R11=W6*ROWr16[6]
addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0)
mulne r10, r8, r9 @ R10=W2*ROWr16[6]
@@ a0 += W6*row[6];
@@ a3 -= W6*row[6];
@@ a1 -= W2*row[6];
@@ a2 += W2*row[6];
subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3)
itt ne
subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1)
addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2)
@@ __end_a_evaluation2:
@@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
@@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
@@ R12=__const_ptr_, R14=&block[n]
@@ col[0 ] = ((a0 + b0) >> COL_SHIFT);
@@ col[8 ] = ((a1 + b1) >> COL_SHIFT);
@@ col[16] = ((a2 + b2) >> COL_SHIFT);
@@ col[24] = ((a3 + b3) >> COL_SHIFT);
@@ col[32] = ((a3 - b3) >> COL_SHIFT);
@@ col[40] = ((a2 - b2) >> COL_SHIFT);
@@ col[48] = ((a1 - b1) >> COL_SHIFT);
@@ col[56] = ((a0 - b0) >> COL_SHIFT);
@@@@@ no optimization here @@@@@
add r8, r6, r0 @ R8=a0+b0
add r9, r2, r1 @ R9=a1+b1
mov r8, r8, asr #COL_SHIFT
mov r9, r9, asr #COL_SHIFT
strh r8, [r14, #0]
strh r9, [r14, #16]
add r8, r3, r5 @ R8=a2+b2
add r9, r4, r7 @ R9=a3+b3
mov r8, r8, asr #COL_SHIFT
mov r9, r9, asr #COL_SHIFT
strh r8, [r14, #32]
strh r9, [r14, #48]
sub r8, r4, r7 @ R8=a3-b3
sub r9, r3, r5 @ R9=a2-b2
mov r8, r8, asr #COL_SHIFT
mov r9, r9, asr #COL_SHIFT
strh r8, [r14, #64]
strh r9, [r14, #80]
sub r8, r2, r1 @ R8=a1-b1
sub r9, r6, r0 @ R9=a0-b0
mov r8, r8, asr #COL_SHIFT
mov r9, r9, asr #COL_SHIFT
strh r8, [r14, #96]
strh r9, [r14, #112]
@@ __end_col_loop:
@@ at this point, R0-R11 (free)
@@ R12=__const_ptr_, R14=&block[n]
ldr r0, [sp, #0] @ R0=block
teq r0, r14 @ compare current &block[n] to block, when block is reached, the loop is finished.
sub r14, r14, #2
bne __col_loop
@@ __end_simple_idct_arm:
@@ restore registers to previous status!
add sp, sp, #8 @@ the local variables!
ldmfd sp!, {r4-r11, r15} @@ update PC with LR content.
@@ kind of sub-function, here not to overload the common case.
__end_bef_a_evaluation:
add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1)
mul r11, r8, r4 @ R11=W2*ROWr16[2]
sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3)
add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0)
bal __end_a_evaluation

View File

@@ -0,0 +1,620 @@
/*
* Simple IDCT
*
* Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
* Copyright (c) 2006 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
#define W1 22725 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W2 21407 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W3 19266 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W4 16383 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W5 12873 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W6 8867 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W7 4520 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define ROW_SHIFT 11
#define COL_SHIFT 20
#define W13 (W1 | (W3 << 16))
#define W26 (W2 | (W6 << 16))
#define W57 (W5 | (W7 << 16))
@ idct_row_armv5te: one 8-point IDCT row pass on the int16 row at a1,
@ using ARMv5TE 16x16 multiplies (smulxy/smlaxy) with the packed
@ coefficient pairs W13/W26/W57.  Rows whose coefficients 1..7 are all
@ zero take the row_dc_only shortcut.  Outputs are written back as two
@ 16-bit values per word, shifted right by ROW_SHIFT (11).
function idct_row_armv5te
str lr, [sp, #-4]!
ldrd v1, v2, [a1, #8]
ldrd a3, a4, [a1] /* a3 = row[1:0], a4 = row[3:2] */
@ detect an all-zero AC row: row[1..7] == 0 -> DC-only fast path.
orrs v1, v1, v2
itt eq
cmpeq v1, a4
cmpeq v1, a3, lsr #16
beq row_dc_only
@ even part from row[0]/row[2]: v1..v4 = a0..a3 accumulators.
mov v1, #(1<<(ROW_SHIFT-1))
mov ip, #16384
sub ip, ip, #1 /* ip = W4 */
smlabb v1, ip, a3, v1 /* v1 = W4*row[0]+(1<<(RS-1)) */
ldr ip, =W26 /* ip = W2 | (W6 << 16) */
smultb a2, ip, a4
smulbb lr, ip, a4
add v2, v1, a2
sub v3, v1, a2
sub v4, v1, lr
add v1, v1, lr
@ odd part from row[1]/row[3]: v5..v7, fp = b0..b3 accumulators.
ldr ip, =W13 /* ip = W1 | (W3 << 16) */
ldr lr, =W57 /* lr = W5 | (W7 << 16) */
smulbt v5, ip, a3
smultt v6, lr, a4
smlatt v5, ip, a4, v5
smultt a2, ip, a3
smulbt v7, lr, a3
sub v6, v6, a2
smulbt a2, ip, a4
smultt fp, lr, a3
sub v7, v7, a2
smulbt a2, lr, a4
ldrd a3, a4, [a1, #8] /* a3=row[5:4] a4=row[7:6] */
sub fp, fp, a2
@ skip the row[4..7] contributions when that half is all zero.
orrs a2, a3, a4
beq 1f
smlabt v5, lr, a3, v5
smlabt v6, ip, a3, v6
smlatt v5, lr, a4, v5
smlabt v6, lr, a4, v6
smlatt v7, lr, a3, v7
smlatt fp, ip, a3, fp
smulbt a2, ip, a4
smlatt v7, ip, a4, v7
sub fp, fp, a2
ldr ip, =W26 /* ip = W2 | (W6 << 16) */
mov a2, #16384
sub a2, a2, #1 /* a2 = W4 */
smulbb a2, a2, a3 /* a2 = W4*row[4] */
smultb lr, ip, a4 /* lr = W6*row[6] */
add v1, v1, a2 /* v1 += W4*row[4] */
add v1, v1, lr /* v1 += W6*row[6] */
add v4, v4, a2 /* v4 += W4*row[4] */
sub v4, v4, lr /* v4 -= W6*row[6] */
smulbb lr, ip, a4 /* lr = W2*row[6] */
sub v2, v2, a2 /* v2 -= W4*row[4] */
sub v2, v2, lr /* v2 -= W2*row[6] */
sub v3, v3, a2 /* v3 -= W4*row[4] */
add v3, v3, lr /* v3 += W2*row[6] */
@ final butterflies: (a +/- b) >> 11, packed two 16-bit results per
@ word; bic clears bits the second half-word's shift smeared in.
1: add a2, v1, v5
mov a3, a2, lsr #11
bic a3, a3, #0x1f0000
sub a2, v2, v6
mov a2, a2, lsr #11
add a3, a3, a2, lsl #16
add a2, v3, v7
mov a4, a2, lsr #11
bic a4, a4, #0x1f0000
add a2, v4, fp
mov a2, a2, lsr #11
add a4, a4, a2, lsl #16
strd a3, a4, [a1]
sub a2, v4, fp
mov a3, a2, lsr #11
bic a3, a3, #0x1f0000
sub a2, v3, v7
mov a2, a2, lsr #11
add a3, a3, a2, lsl #16
add a2, v2, v6
mov a4, a2, lsr #11
bic a4, a4, #0x1f0000
sub a2, v1, v5
mov a2, a2, lsr #11
add a4, a4, a2, lsl #16
strd a3, a4, [a1, #8]
ldr pc, [sp], #4
@ DC-only row: replicate (row[0] << 3) into all 8 outputs.
row_dc_only:
orr a3, a3, a3, lsl #16
bic a3, a3, #0xe000
mov a3, a3, lsl #3
mov a4, a3
strd a3, a4, [a1]
strd a3, a4, [a1, #8]
ldr pc, [sp], #4
endfunc
@ idct_col: computes one IDCT column (16-byte stride between rows) of
@ the int16 block at a1, two adjacent columns at a time via packed
@ 16-bit halves.  The even-part accumulators (v1..fp) are pushed on
@ the stack (stmfd) and the odd part is recomputed into v1..fp; callers
@ pop the even values pairwise and combine with >> COL_SHIFT.
@ The #else branch scales the DC term by W4 up front so the rounding
@ matches the C simple_idct (see the inline note).
.macro idct_col
ldr a4, [a1] /* a4 = col[1:0] */
mov ip, #16384
sub ip, ip, #1 /* ip = W4 */
#if 0
mov v1, #(1<<(COL_SHIFT-1))
smlabt v2, ip, a4, v1 /* v2 = W4*col[1] + (1<<(COL_SHIFT-1)) */
smlabb v1, ip, a4, v1 /* v1 = W4*col[0] + (1<<(COL_SHIFT-1)) */
ldr a4, [a1, #(16*4)]
#else
mov v1, #((1<<(COL_SHIFT-1))/W4) /* this matches the C version */
add v2, v1, a4, asr #16
rsb v2, v2, v2, lsl #14
mov a4, a4, lsl #16
add v1, v1, a4, asr #16
ldr a4, [a1, #(16*4)]
rsb v1, v1, v1, lsl #14
#endif
@ even part: accumulate col[4], col[2], col[6] contributions into
@ v1/v3/v5/v7 (low column) and v2/v4/v6/fp (high column).
smulbb lr, ip, a4
smulbt a3, ip, a4
sub v3, v1, lr
sub v5, v1, lr
add v7, v1, lr
add v1, v1, lr
sub v4, v2, a3
sub v6, v2, a3
add fp, v2, a3
ldr ip, =W26
ldr a4, [a1, #(16*2)]
add v2, v2, a3
smulbb lr, ip, a4
smultb a3, ip, a4
add v1, v1, lr
sub v7, v7, lr
add v3, v3, a3
sub v5, v5, a3
smulbt lr, ip, a4
smultt a3, ip, a4
add v2, v2, lr
sub fp, fp, lr
add v4, v4, a3
ldr a4, [a1, #(16*6)]
sub v6, v6, a3
smultb lr, ip, a4
smulbb a3, ip, a4
add v1, v1, lr
sub v7, v7, lr
sub v3, v3, a3
add v5, v5, a3
smultt lr, ip, a4
smulbt a3, ip, a4
add v2, v2, lr
sub fp, fp, lr
sub v4, v4, a3
add v6, v6, a3
@ park the even part on the stack; reuse v1..fp for the odd part.
stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp}
ldr ip, =W13
ldr a4, [a1, #(16*1)]
ldr lr, =W57
smulbb v1, ip, a4
smultb v3, ip, a4
smulbb v5, lr, a4
smultb v7, lr, a4
smulbt v2, ip, a4
smultt v4, ip, a4
smulbt v6, lr, a4
smultt fp, lr, a4
rsb v4, v4, #0
ldr a4, [a1, #(16*3)]
rsb v3, v3, #0
smlatb v1, ip, a4, v1
smlatb v3, lr, a4, v3
smulbb a3, ip, a4
smulbb a2, lr, a4
sub v5, v5, a3
sub v7, v7, a2
smlatt v2, ip, a4, v2
smlatt v4, lr, a4, v4
smulbt a3, ip, a4
smulbt a2, lr, a4
sub v6, v6, a3
ldr a4, [a1, #(16*5)]
sub fp, fp, a2
smlabb v1, lr, a4, v1
smlabb v3, ip, a4, v3
smlatb v5, lr, a4, v5
smlatb v7, ip, a4, v7
smlabt v2, lr, a4, v2
smlabt v4, ip, a4, v4
smlatt v6, lr, a4, v6
ldr a3, [a1, #(16*7)]
smlatt fp, ip, a4, fp
smlatb v1, lr, a3, v1
smlabb v3, lr, a3, v3
smlatb v5, ip, a3, v5
smulbb a4, ip, a3
smlatt v2, lr, a3, v2
sub v7, v7, a4
smlabt v4, lr, a3, v4
smulbt a4, ip, a3
smlatt v6, ip, a3, v6
sub fp, fp, a4
.endm
/*
 * In-place IDCT of one 4-byte column pair (two interleaved 16-bit
 * columns, row stride 16 bytes).
 * a1 = pointer to the column data.
 * The idct_col macro leaves the even-part sums (A terms) paired on the
 * stack and the odd-part sums (B terms) in v1/v2, v3/v4, v5/v6, v7/fp;
 * each output sample is (A +/- B) >> 20 (COL_SHIFT), with the two
 * 16-bit column results re-packed into one 32-bit store.
 * The lsr #20 / it mi / orrmi #0xf000 sequence is an arithmetic shift
 * for the low halfword: the logical shift drops the sign bits, and
 * orrmi restores bits 12-15 when the pre-shift value was negative
 * (flags set by the preceding adds/subs).
 */
function idct_col_armv5te
str lr, [sp, #-4]!
idct_col
/* rows 0 and 7 */
ldmfd sp!, {a3, a4}
adds a2, a3, v1
mov a2, a2, lsr #20
it mi
orrmi a2, a2, #0xf000
add ip, a4, v2
mov ip, ip, asr #20
orr a2, a2, ip, lsl #16
str a2, [a1]
subs a3, a3, v1
mov a2, a3, lsr #20
it mi
orrmi a2, a2, #0xf000
sub a4, a4, v2
mov a4, a4, asr #20
orr a2, a2, a4, lsl #16
ldmfd sp!, {a3, a4}
str a2, [a1, #(16*7)]
/* rows 1 and 6 */
subs a2, a3, v3
mov a2, a2, lsr #20
it mi
orrmi a2, a2, #0xf000
sub ip, a4, v4
mov ip, ip, asr #20
orr a2, a2, ip, lsl #16
str a2, [a1, #(16*1)]
adds a3, a3, v3
mov a2, a3, lsr #20
it mi
orrmi a2, a2, #0xf000
add a4, a4, v4
mov a4, a4, asr #20
orr a2, a2, a4, lsl #16
ldmfd sp!, {a3, a4}
str a2, [a1, #(16*6)]
/* rows 2 and 5 */
adds a2, a3, v5
mov a2, a2, lsr #20
it mi
orrmi a2, a2, #0xf000
add ip, a4, v6
mov ip, ip, asr #20
orr a2, a2, ip, lsl #16
str a2, [a1, #(16*2)]
subs a3, a3, v5
mov a2, a3, lsr #20
it mi
orrmi a2, a2, #0xf000
sub a4, a4, v6
mov a4, a4, asr #20
orr a2, a2, a4, lsl #16
ldmfd sp!, {a3, a4}
str a2, [a1, #(16*5)]
/* rows 3 and 4 */
adds a2, a3, v7
mov a2, a2, lsr #20
it mi
orrmi a2, a2, #0xf000
add ip, a4, fp
mov ip, ip, asr #20
orr a2, a2, ip, lsl #16
str a2, [a1, #(16*3)]
subs a3, a3, v7
mov a2, a3, lsr #20
it mi
orrmi a2, a2, #0xf000
sub a4, a4, fp
mov a4, a4, asr #20
orr a2, a2, a4, lsl #16
str a2, [a1, #(16*4)]
ldr pc, [sp], #4
endfunc
/*
 * clip dst, src...
 * dst = \src clamped to [0, 255].
 * \src may include a shift (e.g. "a2, asr #20"); movs sets the flags
 * so negative values are forced to 0, then values above 255 are
 * saturated to 255.
 */
.macro clip dst, src:vararg
movs \dst, \src
it mi
movmi \dst, #0
cmp \dst, #255
it gt
movgt \dst, #255
.endm
/*
 * aclip dst, src...
 * dst = (\dst + \src) clamped to [0, 255] — the additive variant of
 * clip above, used when accumulating onto existing pixel values.
 */
.macro aclip dst, src:vararg
adds \dst, \src
it mi
movmi \dst, #0
cmp \dst, #255
it gt
movgt \dst, #255
.endm
/*
 * IDCT one column pair and store the results as saturated 8-bit pixel
 * pairs into the destination image.
 * a1        = column data pointer (as for idct_col_armv5te)
 * [sp, #28] = destination pointer (advanced by 2 here for the caller's
 *             next column pair)
 * [sp, #32] = line size in bytes
 * (stack offsets as seen after the lr push and the 32 bytes pushed by
 * idct_col; the values come from the a1/a2 the caller pushed in
 * ff_simple_idct_put_armv5te).
 * v2 is set to 7*linesize so rows can be written from both ends
 * inward; strh_pre/strh_dpre are halfword stores with pre-increment/
 * pre-decrement writeback (macros from libavutil/arm/asm.S).
 */
function idct_col_put_armv5te
str lr, [sp, #-4]!
idct_col
ldmfd sp!, {a3, a4}
ldr lr, [sp, #32]
/* rows 0 and 7 */
add a2, a3, v1
clip a2, a2, asr #20
add ip, a4, v2
clip ip, ip, asr #20
orr a2, a2, ip, lsl #8
sub a3, a3, v1
clip a3, a3, asr #20
sub a4, a4, v2
clip a4, a4, asr #20
ldr v1, [sp, #28]
strh a2, [v1]
add a2, v1, #2
str a2, [sp, #28]
orr a2, a3, a4, lsl #8
rsb v2, lr, lr, lsl #3
ldmfd sp!, {a3, a4}
strh_pre a2, v2, v1
/* rows 1 and 6 */
sub a2, a3, v3
clip a2, a2, asr #20
sub ip, a4, v4
clip ip, ip, asr #20
orr a2, a2, ip, lsl #8
strh_pre a2, v1, lr
add a3, a3, v3
clip a2, a3, asr #20
add a4, a4, v4
clip a4, a4, asr #20
orr a2, a2, a4, lsl #8
ldmfd sp!, {a3, a4}
strh_dpre a2, v2, lr
/* rows 2 and 5 */
add a2, a3, v5
clip a2, a2, asr #20
add ip, a4, v6
clip ip, ip, asr #20
orr a2, a2, ip, lsl #8
strh_pre a2, v1, lr
sub a3, a3, v5
clip a2, a3, asr #20
sub a4, a4, v6
clip a4, a4, asr #20
orr a2, a2, a4, lsl #8
ldmfd sp!, {a3, a4}
strh_dpre a2, v2, lr
/* rows 3 and 4 */
add a2, a3, v7
clip a2, a2, asr #20
add ip, a4, fp
clip ip, ip, asr #20
orr a2, a2, ip, lsl #8
strh a2, [v1, lr]
sub a3, a3, v7
clip a2, a3, asr #20
sub a4, a4, fp
clip a4, a4, asr #20
orr a2, a2, a4, lsl #8
strh_dpre a2, v2, lr
ldr pc, [sp], #4
endfunc
/*
 * IDCT one column pair, add the results to the existing destination
 * pixels and store with 8-bit saturation (the "add" variant).
 * a1        = column data pointer
 * [sp, #36] = destination pointer (offset as seen right after the lr
 *             push + idct_col's 32-byte push; updated at [sp, #28])
 * [sp, #32] = line size in bytes (seen after the first ldmfd)
 * Existing pixel pairs are fetched with ldrh, split into low byte
 * (and #255) and high byte (lsr #8), each IDCT result is added via
 * aclip (clamp to [0,255]) and the pair is re-packed and stored.
 * ldrh_pre/ldrh_dpre/strh are pre-inc/dec halfword accesses
 * (macros from libavutil/arm/asm.S); v2 starts at dest + 7*linesize
 * so rows are processed from both ends inward.
 */
function idct_col_add_armv5te
str lr, [sp, #-4]!
idct_col
ldr lr, [sp, #36]
ldmfd sp!, {a3, a4}
/* rows 0 and 7 */
ldrh ip, [lr]
add a2, a3, v1
sub a3, a3, v1
and v1, ip, #255
aclip a2, v1, a2, asr #20
add v1, a4, v2
mov v1, v1, asr #20
aclip v1, v1, ip, lsr #8
orr a2, a2, v1, lsl #8
ldr v1, [sp, #32]
sub a4, a4, v2
rsb v2, v1, v1, lsl #3
ldrh_pre ip, v2, lr
strh a2, [lr]
and a2, ip, #255
aclip a3, a2, a3, asr #20
mov a4, a4, asr #20
aclip a4, a4, ip, lsr #8
add a2, lr, #2
str a2, [sp, #28]
orr a2, a3, a4, lsl #8
strh a2, [v2]
ldmfd sp!, {a3, a4}
/* rows 1 and 6 */
ldrh_pre ip, lr, v1
sub a2, a3, v3
add a3, a3, v3
and v3, ip, #255
aclip a2, v3, a2, asr #20
sub v3, a4, v4
mov v3, v3, asr #20
aclip v3, v3, ip, lsr #8
orr a2, a2, v3, lsl #8
add a4, a4, v4
ldrh_dpre ip, v2, v1
strh a2, [lr]
and a2, ip, #255
aclip a3, a2, a3, asr #20
mov a4, a4, asr #20
aclip a4, a4, ip, lsr #8
orr a2, a3, a4, lsl #8
strh a2, [v2]
ldmfd sp!, {a3, a4}
/* rows 2 and 5 */
ldrh_pre ip, lr, v1
add a2, a3, v5
sub a3, a3, v5
and v3, ip, #255
aclip a2, v3, a2, asr #20
add v3, a4, v6
mov v3, v3, asr #20
aclip v3, v3, ip, lsr #8
orr a2, a2, v3, lsl #8
sub a4, a4, v6
ldrh_dpre ip, v2, v1
strh a2, [lr]
and a2, ip, #255
aclip a3, a2, a3, asr #20
mov a4, a4, asr #20
aclip a4, a4, ip, lsr #8
orr a2, a3, a4, lsl #8
strh a2, [v2]
ldmfd sp!, {a3, a4}
/* rows 3 and 4 */
ldrh_pre ip, lr, v1
add a2, a3, v7
sub a3, a3, v7
and v3, ip, #255
aclip a2, v3, a2, asr #20
add v3, a4, fp
mov v3, v3, asr #20
aclip v3, v3, ip, lsr #8
orr a2, a2, v3, lsl #8
sub a4, a4, fp
ldrh_dpre ip, v2, v1
strh a2, [lr]
and a2, ip, #255
aclip a3, a2, a3, asr #20
mov a4, a4, asr #20
aclip a4, a4, ip, lsr #8
orr a2, a3, a4, lsl #8
strh a2, [v2]
ldr pc, [sp], #4
endfunc
/*
 * void ff_simple_idct_armv5te(int16_t *data);
 * Full 8x8 IDCT, in place: eight horizontal row passes (16 bytes =
 * one row apart), then four vertical passes, each of which handles
 * two 16-bit columns at once (hence the #4 byte step, 4 passes for
 * 8 columns).
 */
function ff_simple_idct_armv5te, export=1
stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr}
bl idct_row_armv5te
add a1, a1, #16
bl idct_row_armv5te
add a1, a1, #16
bl idct_row_armv5te
add a1, a1, #16
bl idct_row_armv5te
add a1, a1, #16
bl idct_row_armv5te
add a1, a1, #16
bl idct_row_armv5te
add a1, a1, #16
bl idct_row_armv5te
add a1, a1, #16
bl idct_row_armv5te
/* rewind to the top of the block for the column passes */
sub a1, a1, #(16*7)
bl idct_col_armv5te
add a1, a1, #4
bl idct_col_armv5te
add a1, a1, #4
bl idct_col_armv5te
add a1, a1, #4
bl idct_col_armv5te
ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
endfunc
/*
 * ff_simple_idct_add_armv5te(uint8_t *dest, int line_size, int16_t *data);
 * 8x8 IDCT of *data with the result added to the dest image.
 * dest (a1) and line_size (a2) are pushed so idct_col_add_armv5te can
 * read them from the stack; the coefficient pointer (a3) becomes a1.
 * The final "add sp, sp, #8" drops those two stacked arguments.
 */
function ff_simple_idct_add_armv5te, export=1
stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}
mov a1, a3
bl idct_row_armv5te
add a1, a1, #16
bl idct_row_armv5te
add a1, a1, #16
bl idct_row_armv5te
add a1, a1, #16
bl idct_row_armv5te
add a1, a1, #16
bl idct_row_armv5te
add a1, a1, #16
bl idct_row_armv5te
add a1, a1, #16
bl idct_row_armv5te
add a1, a1, #16
bl idct_row_armv5te
sub a1, a1, #(16*7)
bl idct_col_add_armv5te
add a1, a1, #4
bl idct_col_add_armv5te
add a1, a1, #4
bl idct_col_add_armv5te
add a1, a1, #4
bl idct_col_add_armv5te
add sp, sp, #8
ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
endfunc
/*
 * ff_simple_idct_put_armv5te(uint8_t *dest, int line_size, int16_t *data);
 * 8x8 IDCT of *data with the result written (not added) to the dest
 * image. Same structure as ff_simple_idct_add_armv5te above, but the
 * column passes use the saturating-store idct_col_put_armv5te.
 */
function ff_simple_idct_put_armv5te, export=1
stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}
mov a1, a3
bl idct_row_armv5te
add a1, a1, #16
bl idct_row_armv5te
add a1, a1, #16
bl idct_row_armv5te
add a1, a1, #16
bl idct_row_armv5te
add a1, a1, #16
bl idct_row_armv5te
add a1, a1, #16
bl idct_row_armv5te
add a1, a1, #16
bl idct_row_armv5te
add a1, a1, #16
bl idct_row_armv5te
sub a1, a1, #(16*7)
bl idct_col_put_armv5te
add a1, a1, #4
bl idct_col_put_armv5te
add a1, a1, #4
bl idct_col_put_armv5te
add a1, a1, #4
bl idct_col_put_armv5te
/* drop the stacked dest/line_size arguments */
add sp, sp, #8
ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
endfunc

View File

@@ -0,0 +1,425 @@
/*
* Simple IDCT
*
* Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
* Copyright (c) 2007 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
#define W1 22725 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W2 21407 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W3 19266 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W4 16383 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W5 12873 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W6 8867 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W7 4520 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define ROW_SHIFT 11
#define COL_SHIFT 20
#define W13 (W1 | (W3 << 16))
#define W26 (W2 | (W6 << 16))
#define W42 (W4 | (W2 << 16))
#define W42n (-W4&0xffff | (-W2 << 16))
#define W46 (W4 | (W6 << 16))
#define W57 (W5 | (W7 << 16))
/*
Compute partial IDCT of single row.
shift = left-shift amount
r0 = source address
r2 = row[2,0] <= 2 cycles
r3 = row[3,1]
ip = w42 <= 2 cycles
Output in registers r4--r11
*/
/*
 * Partial IDCT of a full row using the ARMv6 dual-16-bit multiply
 * instructions (smlad/smlsd/smuad/smusdx).
 * On entry: r0 = row address, r2 = row[2,0], r3 = row[3,1],
 * ip = W4|(W2<<16); \shift selects the rounding constant.
 * Leaves the even part A0..A3 in r4-r7 and the odd part in
 * r8 = B0, r9 = -B1, r10 = B2, r11 = B3.
 */
.macro idct_row shift
ldr lr, =W46 /* lr = W4 | (W6 << 16) */
mov r1, #(1<<(\shift-1))
smlad r4, r2, ip, r1
smlsd r7, r2, ip, r1
ldr ip, =W13 /* ip = W1 | (W3 << 16) */
ldr r10,=W57 /* r10 = W5 | (W7 << 16) */
smlad r5, r2, lr, r1
smlsd r6, r2, lr, r1
smuad r8, r3, ip /* r8 = B0 = W1*row[1] + W3*row[3] */
smusdx r11,r3, r10 /* r11 = B3 = W7*row[1] - W5*row[3] */
ldr lr, [r0, #12] /* lr = row[7,5] */
pkhtb r2, ip, r10,asr #16 /* r2 = W7 | (W3 << 16) */
pkhbt r1, ip, r10,lsl #16 /* r1 = W1 | (W5 << 16) */
smusdx r9, r2, r3 /* r9 = -B1 = W7*row[3] - W3*row[1] */
smlad r8, lr, r10,r8 /* B0 += W5*row[5] + W7*row[7] */
smusdx r10,r3, r1 /* r10 = B2 = W5*row[1] - W1*row[3] */
ldr r3, =W42n /* r3 = -W4 | (-W2 << 16) */
smlad r10,lr, r2, r10 /* B2 += W7*row[5] + W3*row[7] */
ldr r2, [r0, #4] /* r2 = row[6,4] */
smlsdx r11,lr, ip, r11 /* B3 += W3*row[5] - W1*row[7] */
ldr ip, =W46 /* ip = W4 | (W6 << 16) */
smlad r9, lr, r1, r9 /* B1 -= W1*row[5] + W5*row[7] */
smlad r5, r2, r3, r5 /* A1 += -W4*row[4] - W2*row[6] */
smlsd r6, r2, r3, r6 /* A2 += -W4*row[4] + W2*row[6] */
smlad r4, r2, ip, r4 /* A0 += W4*row[4] + W6*row[6] */
smlsd r7, r2, ip, r7 /* A3 += W4*row[4] - W6*row[6] */
.endm
/*
Compute partial IDCT of half row.
shift = left-shift amount
r2 = row[2,0]
r3 = row[3,1]
ip = w42
Output in registers r4--r11
*/
/*
 * Partial IDCT of a half row (elements 4-7 known to be zero) — the
 * cheap path taken by idct_row_armv6 when the upper half is empty.
 * Same register contract as idct_row: r2 = row[2,0], r3 = row[3,1],
 * ip = W4|(W2<<16); outputs A0..A3 in r4-r7, B0/-B1/B2/B3 in r8-r11.
 */
.macro idct_row4 shift
ldr lr, =W46 /* lr = W4 | (W6 << 16) */
ldr r10,=W57 /* r10 = W5 | (W7 << 16) */
mov r1, #(1<<(\shift-1))
smlad r4, r2, ip, r1
smlsd r7, r2, ip, r1
ldr ip, =W13 /* ip = W1 | (W3 << 16) */
smlad r5, r2, lr, r1
smlsd r6, r2, lr, r1
smusdx r11,r3, r10 /* r11 = B3 = W7*row[1] - W5*row[3] */
smuad r8, r3, ip /* r8 = B0 = W1*row[1] + W3*row[3] */
pkhtb r2, ip, r10,asr #16 /* r2 = W7 | (W3 << 16) */
pkhbt r1, ip, r10,lsl #16 /* r1 = W1 | (W5 << 16) */
smusdx r9, r2, r3 /* r9 = -B1 = W7*row[3] - W3*row[1] */
smusdx r10,r3, r1 /* r10 = B2 = W5*row[1] - W1*row[3] */
.endm
/*
Compute final part of IDCT single row without shift.
Input in registers r4--r11
Output in registers ip, r4--r6, lr, r8--r10
*/
/*
 * Butterfly stage of the IDCT, no shift.
 * Input: A0..A3 in r4-r7, B0/-B1/B2/B3 in r8-r11 (note r9 holds -B1,
 * which is why the A1/B1 pair uses sub for the sum and add for the
 * difference).
 * Output: sums in ip, r4, r5, r6 and differences in lr, r8, r9, r10.
 */
.macro idct_finish
add ip, r4, r8 /* ip = A0 + B0 */
sub lr, r4, r8 /* lr = A0 - B0 */
sub r4, r5, r9 /* r4 = A1 + B1 */
add r8, r5, r9 /* r8 = A1 - B1 */
add r5, r6, r10 /* r5 = A2 + B2 */
sub r9, r6, r10 /* r9 = A2 - B2 */
add r6, r7, r11 /* r6 = A3 + B3 */
sub r10,r7, r11 /* r10 = A3 - B3 */
.endm
/*
Compute final part of IDCT single row.
shift = right-shift amount
Input/output in registers r4--r11
*/
/*
 * Butterfly stage with arithmetic right shift by \shift.
 * Input: A0..A3 in r4-r7, B0/-B1/B2/B3 in r8-r11 (r9 = -B1, hence the
 * swapped add/sub in the second pair).
 * Output: (A+B)>>shift in r4-r7, (A-B)>>shift in r8-r11.
 */
.macro idct_finish_shift shift
add r3, r4, r8 /* r3 = A0 + B0 */
sub r2, r4, r8 /* r2 = A0 - B0 */
mov r4, r3, asr #\shift
mov r8, r2, asr #\shift
sub r3, r5, r9 /* r3 = A1 + B1 */
add r2, r5, r9 /* r2 = A1 - B1 */
mov r5, r3, asr #\shift
mov r9, r2, asr #\shift
add r3, r6, r10 /* r3 = A2 + B2 */
sub r2, r6, r10 /* r2 = A2 - B2 */
mov r6, r3, asr #\shift
mov r10,r2, asr #\shift
add r3, r7, r11 /* r3 = A3 + B3 */
sub r2, r7, r11 /* r2 = A3 - B3 */
mov r7, r3, asr #\shift
mov r11,r2, asr #\shift
.endm
/*
Compute final part of IDCT single row, saturating results at 8 bits.
shift = right-shift amount
Input/output in registers r4--r11
*/
/*
 * Butterfly stage with shift and unsigned 8-bit saturation (usat),
 * for writing pixels directly.
 * Input: A0..A3 in r4-r7, B0/-B1/B2/B3 in r8-r11 (r9 = -B1).
 * Output: sat8((A+B)>>shift) in r4-r7, sat8((A-B)>>shift) in r8-r11.
 */
.macro idct_finish_shift_sat shift
add r3, r4, r8 /* r3 = A0 + B0 */
sub ip, r4, r8 /* ip = A0 - B0 */
usat r4, #8, r3, asr #\shift
usat r8, #8, ip, asr #\shift
sub r3, r5, r9 /* r3 = A1 + B1 */
add ip, r5, r9 /* ip = A1 - B1 */
usat r5, #8, r3, asr #\shift
usat r9, #8, ip, asr #\shift
add r3, r6, r10 /* r3 = A2 + B2 */
sub ip, r6, r10 /* ip = A2 - B2 */
usat r6, #8, r3, asr #\shift
usat r10,#8, ip, asr #\shift
add r3, r7, r11 /* r3 = A3 + B3 */
sub ip, r7, r11 /* ip = A3 - B3 */
usat r7, #8, r3, asr #\shift
usat r11,#8, ip, asr #\shift
.endm
/*
Compute IDCT of single row, storing as column.
r0 = source
r1 = dest
*/
/*
 * IDCT of a single row, stored transposed (as a column) into dest.
 * r0 = source row, r1 = dest.
 * Fast paths:
 *  - if everything except the DC coefficient is zero, branch to 1:
 *    and replicate DC << 3 (= DC scaled by the row pass) to all eight
 *    outputs;
 *  - if only elements 0-3 are non-zero, use the half-row idct_row4.
 */
function idct_row_armv6
push {lr}
ldr lr, [r0, #12] /* lr = row[7,5] */
ldr ip, [r0, #4] /* ip = row[6,4] */
ldr r3, [r0, #8] /* r3 = row[3,1] */
ldr r2, [r0] /* r2 = row[2,0] */
orrs lr, lr, ip
itt eq
cmpeq lr, r3
cmpeq lr, r2, lsr #16
beq 1f /* only DC (row[0]) can be non-zero */
push {r1}
ldr ip, =W42 /* ip = W4 | (W2 << 16) */
cmp lr, #0
beq 2f /* upper half all zero: half-row path */
idct_row ROW_SHIFT
b 3f
2: idct_row4 ROW_SHIFT
3: pop {r1}
idct_finish_shift ROW_SHIFT
/* store transposed: consecutive outputs 16 bytes apart */
strh r4, [r1]
strh r5, [r1, #(16*2)]
strh r6, [r1, #(16*4)]
strh r7, [r1, #(16*6)]
strh r11,[r1, #(16*1)]
strh r10,[r1, #(16*3)]
strh r9, [r1, #(16*5)]
strh r8, [r1, #(16*7)]
pop {pc}
1: mov r2, r2, lsl #3 /* DC-only row: output = DC << 3 everywhere */
strh r2, [r1]
strh r2, [r1, #(16*2)]
strh r2, [r1, #(16*4)]
strh r2, [r1, #(16*6)]
strh r2, [r1, #(16*1)]
strh r2, [r1, #(16*3)]
strh r2, [r1, #(16*5)]
strh r2, [r1, #(16*7)]
pop {pc}
endfunc
/*
Compute IDCT of single column, read as row.
r0 = source
r1 = dest
*/
/*
 * IDCT of a single column that was stored as a row by the first pass;
 * results are written transposed again (16 bytes apart).
 * r0 = source, r1 = dest.
 */
function idct_col_armv6
push {r1, lr}
ldr r2, [r0] /* r2 = row[2,0] */
ldr ip, =W42 /* ip = W4 | (W2 << 16) */
ldr r3, [r0, #8] /* r3 = row[3,1] */
idct_row COL_SHIFT
pop {r1}
idct_finish_shift COL_SHIFT
strh r4, [r1]
strh r5, [r1, #(16*1)]
strh r6, [r1, #(16*2)]
strh r7, [r1, #(16*3)]
strh r11,[r1, #(16*4)]
strh r10,[r1, #(16*5)]
strh r9, [r1, #(16*6)]
strh r8, [r1, #(16*7)]
pop {pc}
endfunc
/*
Compute IDCT of single column, read as row, store saturated 8-bit.
r0 = source
r1 = dest
r2 = line size
*/
/*
 * IDCT of a single column (read as row), stored as saturated 8-bit
 * pixels, one per image line.
 * r0 = source, r1 = dest, r2 = line size.
 * strb_post is a byte store with post-increment writeback (macro from
 * libavutil/arm/asm.S); r1 is rewound 8 lines afterwards so the caller
 * can step to the next column.
 */
function idct_col_put_armv6
push {r1, r2, lr}
ldr r2, [r0] /* r2 = row[2,0] */
ldr ip, =W42 /* ip = W4 | (W2 << 16) */
ldr r3, [r0, #8] /* r3 = row[3,1] */
idct_row COL_SHIFT
pop {r1, r2}
idct_finish_shift_sat COL_SHIFT
strb_post r4, r1, r2
strb_post r5, r1, r2
strb_post r6, r1, r2
strb_post r7, r1, r2
strb_post r11,r1, r2
strb_post r10,r1, r2
strb_post r9, r1, r2
strb_post r8, r1, r2
sub r1, r1, r2, lsl #3
pop {pc}
endfunc
/*
Compute IDCT of single column, read as row, add/store saturated 8-bit.
r0 = source
r1 = dest
r2 = line size
*/
/*
 * IDCT of a single column (read as row); each result is shifted,
 * added to the existing destination pixel, saturated to 8 bits and
 * stored back.
 * r0 = source, r1 = dest, r2 = line size.
 * Pixel loads are interleaved ahead of their uses (both [r1, r2] and
 * [r1, r2, lsl #2] offsets) to hide load latency; r1 is rewound
 * 8 lines at the end for the caller's next column.
 */
function idct_col_add_armv6
push {r1, r2, lr}
ldr r2, [r0] /* r2 = row[2,0] */
ldr ip, =W42 /* ip = W4 | (W2 << 16) */
ldr r3, [r0, #8] /* r3 = row[3,1] */
idct_row COL_SHIFT
pop {r1, r2}
idct_finish
ldrb r3, [r1]
ldrb r7, [r1, r2]
ldrb r11,[r1, r2, lsl #2]
add ip, r3, ip, asr #COL_SHIFT
usat ip, #8, ip
add r4, r7, r4, asr #COL_SHIFT
strb_post ip, r1, r2
ldrb ip, [r1, r2]
usat r4, #8, r4
ldrb r11,[r1, r2, lsl #2]
add r5, ip, r5, asr #COL_SHIFT
usat r5, #8, r5
strb_post r4, r1, r2
ldrb r3, [r1, r2]
ldrb ip, [r1, r2, lsl #2]
strb_post r5, r1, r2
ldrb r7, [r1, r2]
ldrb r4, [r1, r2, lsl #2]
add r6, r3, r6, asr #COL_SHIFT
usat r6, #8, r6
add r10,r7, r10,asr #COL_SHIFT
usat r10,#8, r10
add r9, r11,r9, asr #COL_SHIFT
usat r9, #8, r9
add r8, ip, r8, asr #COL_SHIFT
usat r8, #8, r8
add lr, r4, lr, asr #COL_SHIFT
usat lr, #8, lr
strb_post r6, r1, r2
strb_post r10,r1, r2
strb_post r9, r1, r2
strb_post r8, r1, r2
strb_post lr, r1, r2
sub r1, r1, r2, lsl #3
pop {pc}
endfunc
/*
Compute 8 IDCT row transforms.
func = IDCT row->col function
width = width of columns in bytes
*/
/*
 * Run \func on all 8 rows of the block (two 16-byte rows per step
 * since each call consumes 32 bytes of coefficients), stepping the
 * output pointer by \width per call; the source pointer is rewound at
 * the halfway point and at the end so r0 ends where it started.
 */
.macro idct_rows func width
bl \func
add r0, r0, #(16*2)
add r1, r1, #\width
bl \func
add r0, r0, #(16*2)
add r1, r1, #\width
bl \func
add r0, r0, #(16*2)
add r1, r1, #\width
bl \func
sub r0, r0, #(16*5)
add r1, r1, #\width
bl \func
add r0, r0, #(16*2)
add r1, r1, #\width
bl \func
add r0, r0, #(16*2)
add r1, r1, #\width
bl \func
add r0, r0, #(16*2)
add r1, r1, #\width
bl \func
sub r0, r0, #(16*7)
.endm
/* void ff_simple_idct_armv6(int16_t *data); */
/* void ff_simple_idct_armv6(int16_t *data); */
/*
 * In-place 8x8 IDCT: row pass writes transposed into a 128-byte
 * scratch buffer on the stack, column pass (idct_col_armv6 on the
 * transposed data) writes back into the caller's buffer.
 */
function ff_simple_idct_armv6, export=1
push {r4-r11, lr}
sub sp, sp, #128 /* 8x8 int16 scratch for the transposed rows */
mov r1, sp
idct_rows idct_row_armv6, 2
mov r1, r0
mov r0, sp
idct_rows idct_col_armv6, 2
add sp, sp, #128
pop {r4-r11, pc}
endfunc
/* ff_simple_idct_add_armv6(uint8_t *dest, int line_size, int16_t *data); */
/* ff_simple_idct_add_armv6(uint8_t *dest, int line_size, int16_t *data); */
/*
 * 8x8 IDCT added to the destination image: dest/line_size are stacked
 * so the column pass can reload them after the scratch-buffer row
 * pass; the final sp adjustment drops scratch + both stacked args.
 */
function ff_simple_idct_add_armv6, export=1
push {r0, r1, r4-r11, lr}
sub sp, sp, #128 /* 8x8 int16 scratch for the transposed rows */
mov r0, r2
mov r1, sp
idct_rows idct_row_armv6, 2
mov r0, sp
ldr r1, [sp, #128] /* reload dest */
ldr r2, [sp, #(128+4)] /* reload line_size */
idct_rows idct_col_add_armv6, 1
add sp, sp, #(128+8)
pop {r4-r11, pc}
endfunc
/* ff_simple_idct_put_armv6(uint8_t *dest, int line_size, int16_t *data); */
/* ff_simple_idct_put_armv6(uint8_t *dest, int line_size, int16_t *data); */
/*
 * 8x8 IDCT written to the destination image; identical structure to
 * ff_simple_idct_add_armv6 but the column pass stores (saturated)
 * instead of accumulating.
 */
function ff_simple_idct_put_armv6, export=1
push {r0, r1, r4-r11, lr}
sub sp, sp, #128 /* 8x8 int16 scratch for the transposed rows */
mov r0, r2
mov r1, sp
idct_rows idct_row_armv6, 2
mov r0, sp
ldr r1, [sp, #128] /* reload dest */
ldr r2, [sp, #(128+4)] /* reload line_size */
idct_rows idct_col_put_armv6, 1
add sp, sp, #(128+8)
pop {r4-r11, pc}
endfunc

View File

@@ -0,0 +1,375 @@
/*
* ARM NEON IDCT
*
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* Based on Simple IDCT
* Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
#define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define W4c ((1<<(COL_SHIFT-1))/W4)
#define ROW_SHIFT 11
#define COL_SHIFT 20
#define w1 d0[0]
#define w2 d0[1]
#define w3 d0[2]
#define w4 d0[3]
#define w5 d1[0]
#define w6 d1[1]
#define w7 d1[2]
#define w4c d1[3]
/*
 * First (always-executed) part of a 4-column NEON IDCT pass:
 * combines coefficients 1-3 with the DC term already in q15.
 * Even part: q11-q14 = q15 +/- W2/W6 * col[2].
 * Odd part:  q9/q10/q5/q6 from W1/W3/W5/W7 * col[1] and col[3].
 */
.macro idct_col4_top
vmull.s16 q7, d6, w2 /* q7 = W2 * col[2] */
vmull.s16 q8, d6, w6 /* q8 = W6 * col[2] */
vmull.s16 q9, d4, w1 /* q9 = W1 * col[1] */
vadd.i32 q11, q15, q7
vmull.s16 q10, d4, w3 /* q10 = W3 * col[1] */
vadd.i32 q12, q15, q8
vmull.s16 q5, d4, w5 /* q5 = W5 * col[1] */
vsub.i32 q13, q15, q8
vmull.s16 q6, d4, w7 /* q6 = W7 * col[1] */
vsub.i32 q14, q15, q7
vmlal.s16 q9, d8, w3 /* q9 += W3 * col[3] */
vmlsl.s16 q10, d8, w7 /* q10 -= W7 * col[3] */
vmlsl.s16 q5, d8, w1 /* q5 -= W1 * col[3] */
vmlsl.s16 q6, d8, w5 /* q6 -= W5 * col[3] */
.endm
.text
.align 6
/*
 * Preload destination cache lines (r0 = dest, r1 = line size) for the
 * put/add entry points, then fall straight through into
 * idct_row4_neon below — note there is deliberately no return here.
 * The "A" line is assembled for ARM mode only (negative register
 * offsets are not encodable in Thumb pld).
 */
function idct_row4_pld_neon
pld [r0]
add r3, r0, r1, lsl #2
pld [r0, r1]
pld [r0, r1, lsl #1]
A pld [r3, -r1]
pld [r3]
pld [r3, r1]
add r3, r3, r1, lsl #1
pld [r3]
pld [r3, r1]
endfunc
/*
 * Row pass over four rows at once (64 bytes of coefficients at r2),
 * in place: computes the 1-D IDCT with ROW_SHIFT rounding, transposes
 * the 4x8 result with the vtrn ladder and stores it back, leaving r2
 * advanced by 64 bytes.
 * The upper coefficients (d3,d5,d7,d9) are OR-ed together and tested
 * in r3/r4: when they are all zero the second multiply block is
 * skipped (common for sparse blocks).
 */
function idct_row4_neon
vmov.i32 q15, #(1<<(ROW_SHIFT-1)) /* rounding constant */
vld1.64 {d2-d5}, [r2,:128]!
vmlal.s16 q15, d2, w4 /* q15 += W4 * col[0] */
vld1.64 {d6,d7}, [r2,:128]!
vorr d10, d3, d5
vld1.64 {d8,d9}, [r2,:128]!
add r2, r2, #-64
vorr d11, d7, d9
vorr d10, d10, d11
vmov r3, r4, d10 /* r3|r4 == 0 iff upper half is all zero */
idct_col4_top
orrs r3, r3, r4
beq 1f
vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */
vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */
vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */
vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */
vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */
vadd.i32 q11, q11, q7
vsub.i32 q12, q12, q7
vsub.i32 q13, q13, q7
vadd.i32 q14, q14, q7
vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */
vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */
vmlal.s16 q9, d9, w7
vmlsl.s16 q10, d9, w5
vmlal.s16 q5, d9, w3
vmlsl.s16 q6, d9, w1
vadd.i32 q11, q11, q7
vsub.i32 q12, q12, q8
vadd.i32 q13, q13, q8
vsub.i32 q14, q14, q7
/* butterfly, narrow with ROW_SHIFT, and transpose for the col pass */
1: vadd.i32 q3, q11, q9
vadd.i32 q4, q12, q10
vshrn.i32 d2, q3, #ROW_SHIFT
vshrn.i32 d4, q4, #ROW_SHIFT
vadd.i32 q7, q13, q5
vadd.i32 q8, q14, q6
vtrn.16 d2, d4
vshrn.i32 d6, q7, #ROW_SHIFT
vshrn.i32 d8, q8, #ROW_SHIFT
vsub.i32 q14, q14, q6
vsub.i32 q11, q11, q9
vtrn.16 d6, d8
vsub.i32 q13, q13, q5
vshrn.i32 d3, q14, #ROW_SHIFT
vtrn.32 d2, d6
vsub.i32 q12, q12, q10
vtrn.32 d4, d8
vshrn.i32 d5, q13, #ROW_SHIFT
vshrn.i32 d7, q12, #ROW_SHIFT
vshrn.i32 d9, q11, #ROW_SHIFT
vtrn.16 d3, d5
vtrn.16 d7, d9
vtrn.32 d3, d7
vtrn.32 d5, d9
vst1.64 {d2-d5}, [r2,:128]!
vst1.64 {d6-d9}, [r2,:128]!
bx lr
endfunc
/*
 * Column pass over four columns at once; r2 = transposed data, rows
 * 16 bytes apart. Each of coefficients 4-7 is tested for all-zero
 * (via the scalar ldrd/orrs pairs, which run in parallel with the
 * NEON work) and its multiply block skipped when possible.
 * The DC bias trick: w4c = (1<<(COL_SHIFT-1))/W4 is added to col[0]
 * before multiplying by W4, which folds the rounding constant into
 * q15. Results are narrowed with vaddhn/vsubhn (built-in >>16; the
 * remaining COL_SHIFT-16 is applied by the store helpers) into
 * q1-q4 / d2-d9.
 */
function idct_col4_neon
mov ip, #16
vld1.64 {d2}, [r2,:64], ip /* d2 = col[0] */
vdup.16 d30, w4c
vld1.64 {d4}, [r2,:64], ip /* d4 = col[1] */
vadd.i16 d30, d30, d2
vld1.64 {d6}, [r2,:64], ip /* d6 = col[2] */
vmull.s16 q15, d30, w4 /* q15 = W4*(col[0]+(1<<COL_SHIFT-1)/W4)*/
vld1.64 {d8}, [r2,:64], ip /* d8 = col[3] */
ldrd r4, r5, [r2] /* col[4], tested below */
ldrd r6, r7, [r2, #16] /* col[5] */
orrs r4, r4, r5
idct_col4_top
it eq
addeq r2, r2, #16 /* col[4] all zero: skip it */
beq 1f
vld1.64 {d3}, [r2,:64], ip /* d3 = col[4] */
vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */
vadd.i32 q11, q11, q7
vsub.i32 q12, q12, q7
vsub.i32 q13, q13, q7
vadd.i32 q14, q14, q7
1: orrs r6, r6, r7
ldrd r4, r5, [r2, #16] /* col[6] */
it eq
addeq r2, r2, #16 /* col[5] all zero: skip it */
beq 2f
vld1.64 {d5}, [r2,:64], ip /* d5 = col[5] */
vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */
vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */
vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */
vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */
2: orrs r4, r4, r5
ldrd r4, r5, [r2, #16] /* col[7] */
it eq
addeq r2, r2, #16 /* col[6] all zero: skip it */
beq 3f
vld1.64 {d7}, [r2,:64], ip /* d7 = col[6] */
vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */
vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */
vadd.i32 q11, q11, q7
vsub.i32 q14, q14, q7
vsub.i32 q12, q12, q8
vadd.i32 q13, q13, q8
3: orrs r4, r4, r5
it eq
addeq r2, r2, #16 /* col[7] all zero: skip it */
beq 4f
vld1.64 {d9}, [r2,:64], ip /* d9 = col[7] */
vmlal.s16 q9, d9, w7
vmlsl.s16 q10, d9, w5
vmlal.s16 q5, d9, w3
vmlsl.s16 q6, d9, w1
/* butterfly + narrowing >>16 */
4: vaddhn.i32 d2, q11, q9
vaddhn.i32 d3, q12, q10
vaddhn.i32 d4, q13, q5
vaddhn.i32 d5, q14, q6
vsubhn.i32 d9, q11, q9
vsubhn.i32 d8, q12, q10
vsubhn.i32 d7, q13, q5
vsubhn.i32 d6, q14, q6
bx lr
endfunc
.align 6
/*
 * Store the four columns produced by idct_col4_neon as saturated
 * unsigned 8-bit pixels ("put"): apply the remaining COL_SHIFT-16
 * with saturating narrow, then write 4 bytes per image line.
 * r0 = dest (advanced by 8 lines), r1 = line size.
 */
.align 6
function idct_col4_st8_neon
vqshrun.s16 d2, q1, #COL_SHIFT-16
vqshrun.s16 d3, q2, #COL_SHIFT-16
vqshrun.s16 d4, q3, #COL_SHIFT-16
vqshrun.s16 d5, q4, #COL_SHIFT-16
vst1.32 {d2[0]}, [r0,:32], r1
vst1.32 {d2[1]}, [r0,:32], r1
vst1.32 {d3[0]}, [r0,:32], r1
vst1.32 {d3[1]}, [r0,:32], r1
vst1.32 {d4[0]}, [r0,:32], r1
vst1.32 {d4[1]}, [r0,:32], r1
vst1.32 {d5[0]}, [r0,:32], r1
vst1.32 {d5[1]}, [r0,:32], r1
bx lr
endfunc
/* 16-bit coefficient table; loaded into d0/d1 by idct_start so the
 * w1..w7/w4c scalar-lane aliases above refer to these values. */
const idct_coeff_neon, align=4
.short W1, W2, W3, W4, W5, W6, W7, W4c
endconst
/*
 * Common NEON IDCT prologue: save registers, prefetch the coefficient
 * block at \data, and load the coefficient table into d0/d1.
 */
.macro idct_start data
push {r4-r7, lr}
pld [\data]
pld [\data, #64]
vpush {d8-d15} /* callee-saved NEON registers */
movrel r3, idct_coeff_neon
vld1.64 {d0,d1}, [r3,:128]
.endm
/* Common NEON IDCT epilogue: restore registers and return. */
.macro idct_end
vpop {d8-d15}
pop {r4-r7, pc}
.endm
/* void ff_simple_idct_put_neon(uint8_t *dst, int line_size, int16_t *data); */
/* void ff_simple_idct_put_neon(uint8_t *dst, int line_size, int16_t *data); */
/*
 * Full 8x8 IDCT written to dst: two 4-row passes (the first also
 * prefetches dst), then two 4-column passes each followed by a
 * saturated 8-bit store; between halves dst is rewound 8 lines and
 * moved 4 pixels right, and r2 is stepped back to the second set of
 * columns (-120 = -128 + 8).
 */
function ff_simple_idct_put_neon, export=1
idct_start r2
bl idct_row4_pld_neon
bl idct_row4_neon
add r2, r2, #-128
bl idct_col4_neon
bl idct_col4_st8_neon
sub r0, r0, r1, lsl #3
add r0, r0, #4
add r2, r2, #-120
bl idct_col4_neon
bl idct_col4_st8_neon
idct_end
endfunc
.align 6
/*
 * Add the four columns produced by idct_col4_neon to the existing
 * destination pixels and store with unsigned 8-bit saturation.
 * r0 = dest (loads advance it; stores go through ip from the saved
 * start so loads stay ahead of stores), r1 = line size.
 * On return r0 has advanced by 8 lines.
 */
.align 6
function idct_col4_add8_neon
mov ip, r0
vld1.32 {d10[0]}, [r0,:32], r1
vshr.s16 q1, q1, #COL_SHIFT-16 /* finish the column shift */
vld1.32 {d10[1]}, [r0,:32], r1
vshr.s16 q2, q2, #COL_SHIFT-16
vld1.32 {d11[0]}, [r0,:32], r1
vshr.s16 q3, q3, #COL_SHIFT-16
vld1.32 {d11[1]}, [r0,:32], r1
vshr.s16 q4, q4, #COL_SHIFT-16
vld1.32 {d12[0]}, [r0,:32], r1
vaddw.u8 q1, q1, d10 /* widen pixels and accumulate */
vld1.32 {d12[1]}, [r0,:32], r1
vaddw.u8 q2, q2, d11
vld1.32 {d13[0]}, [r0,:32], r1
vqmovun.s16 d2, q1 /* saturate back to u8 */
vld1.32 {d13[1]}, [r0,:32], r1
vaddw.u8 q3, q3, d12
vst1.32 {d2[0]}, [ip,:32], r1
vqmovun.s16 d3, q2
vst1.32 {d2[1]}, [ip,:32], r1
vaddw.u8 q4, q4, d13
vst1.32 {d3[0]}, [ip,:32], r1
vqmovun.s16 d4, q3
vst1.32 {d3[1]}, [ip,:32], r1
vqmovun.s16 d5, q4
vst1.32 {d4[0]}, [ip,:32], r1
vst1.32 {d4[1]}, [ip,:32], r1
vst1.32 {d5[0]}, [ip,:32], r1
vst1.32 {d5[1]}, [ip,:32], r1
bx lr
endfunc
/* void ff_simple_idct_add_neon(uint8_t *dst, int line_size, int16_t *data); */
/* void ff_simple_idct_add_neon(uint8_t *dst, int line_size, int16_t *data); */
/*
 * Full 8x8 IDCT added to dst; same structure as
 * ff_simple_idct_put_neon but using the accumulate-and-saturate
 * column store.
 */
function ff_simple_idct_add_neon, export=1
idct_start r2
bl idct_row4_pld_neon
bl idct_row4_neon
add r2, r2, #-128
bl idct_col4_neon
bl idct_col4_add8_neon
sub r0, r0, r1, lsl #3
add r0, r0, #4
add r2, r2, #-120
bl idct_col4_neon
bl idct_col4_add8_neon
idct_end
endfunc
.align 6
/*
 * Store the four columns produced by idct_col4_neon back into the
 * coefficient block as 16-bit values (for the in-place
 * ff_simple_idct_neon): finish the column shift and write one d
 * register per row, rows 16 bytes apart.
 * r2 = dest (advanced by 8 rows).
 */
.align 6
function idct_col4_st16_neon
mov ip, #16
vshr.s16 q1, q1, #COL_SHIFT-16
vshr.s16 q2, q2, #COL_SHIFT-16
vst1.64 {d2}, [r2,:64], ip
vshr.s16 q3, q3, #COL_SHIFT-16
vst1.64 {d3}, [r2,:64], ip
vshr.s16 q4, q4, #COL_SHIFT-16
vst1.64 {d4}, [r2,:64], ip
vst1.64 {d5}, [r2,:64], ip
vst1.64 {d6}, [r2,:64], ip
vst1.64 {d7}, [r2,:64], ip
vst1.64 {d8}, [r2,:64], ip
vst1.64 {d9}, [r2,:64], ip
bx lr
endfunc
/* void ff_simple_idct_neon(int16_t *data); */
/* void ff_simple_idct_neon(int16_t *data); */
/*
 * Full in-place 8x8 IDCT: two 4-row passes, then two 4-column passes
 * each followed by a 16-bit store back into the block. The r2
 * adjustments rewind after each pass (+/-128 = whole block,
 * -120 = back to the start of the second column half).
 */
function ff_simple_idct_neon, export=1
idct_start r0
mov r2, r0
bl idct_row4_neon
bl idct_row4_neon
add r2, r2, #-128
bl idct_col4_neon
add r2, r2, #-128
bl idct_col4_st16_neon
add r2, r2, #-120
bl idct_col4_neon
add r2, r2, #-128
bl idct_col4_st16_neon
idct_end
endfunc

View File

@@ -0,0 +1,115 @@
/*
* Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
/*
 * NEON synthesis filter (float): runs ff_imdct_half_neon into the
 * 512-sample ring buffer at synth_buf, then produces 32 output
 * samples by accumulating windowed taps across the buffer.
 * r0 = imdct context, r1 = synth_buf base, r2 = &synth_buf_offset;
 * window, out and scale arrive on the stack / in s0 (their comments
 * below mark where each is fetched). The offset is decremented by 32
 * and wrapped modulo 512 (bfc of bits >= 9) for the next call.
 * Outer loop (1:) runs 4 times, 8 outputs per iteration; the inner
 * loop (2:) walks the ring buffer in 64-sample strides (lr = 64*4
 * bytes) and rewinds all four walking pointers by 512 samples when
 * the count reaches the wrap point r4.
 * VFP/NOVFP lines select between hardfp (scale already in s0/d0) and
 * softfp (scale on the stack) calling conventions.
 */
function ff_synth_filter_float_neon, export=1
push {r3-r11,lr}
ldr r4, [r2] @ synth_buf_offset
add r1, r1, r4, lsl #2 @ synth_buf
sub r12, r4, #32
bfc r12, #9, #23 @ (offset - 32) mod 512
bic r4, r4, #63
str r12, [r2] @ store rotated offset for next call
ldr r2, [sp, #12*4] @ in
mov r9, r1 @ synth_buf
VFP vpush {d0} @ preserve scale across the imdct call
bl X(ff_imdct_half_neon)
VFP vpop {d0}
pop {r3}
ldr r5, [sp, #9*4] @ window
ldr r2, [sp, #10*4] @ out
NOVFP vldr s0, [sp, #12*4] @ scale
add r8, r9, #12*4
mov lr, #64*4 @ ring-buffer stride in bytes
mov r1, #4 @ outer loop count
1:
add r10, r9, #16*4 @ synth_buf
add r11, r8, #16*4
add r0, r5, #16*4 @ window
add r6, r5, #32*4
add r7, r5, #48*4
vld1.32 {q10}, [r3,:128] @ a
add r3, r3, #16*4
vld1.32 {q1}, [r3,:128] @ b
vmov.f32 q2, #0.0 @ c
vmov.f32 q3, #0.0 @ d
mov r12, #512 @ samples left in the ring buffer
2:
vld1.32 {q9}, [r8, :128], lr
vrev64.32 q9, q9
vld1.32 {q8}, [r5, :128], lr
vmls.f32 d20, d16, d19
vld1.32 {q11}, [r0, :128], lr
vmls.f32 d21, d17, d18
vld1.32 {q12}, [r9, :128], lr
vmla.f32 d2, d22, d24
vld1.32 {q8}, [r6, :128], lr
vmla.f32 d3, d23, d25
vld1.32 {q9}, [r10,:128], lr
vmla.f32 d4, d16, d18
vld1.32 {q12}, [r11,:128], lr
vmla.f32 d5, d17, d19
vrev64.32 q12, q12
vld1.32 {q11}, [r7, :128], lr
vmla.f32 d6, d22, d25
vmla.f32 d7, d23, d24
subs r12, r12, #64
beq 3f
cmp r12, r4
bne 2b
/* hit the end of the ring buffer: wrap all buffer pointers */
sub r8, r8, #512*4
sub r9, r9, #512*4
sub r10, r10, #512*4
sub r11, r11, #512*4
b 2b
3:
vmul.f32 q8, q10, d0[0] @ scale the a/b accumulators
vmul.f32 q9, q1, d0[0]
vst1.32 {q3}, [r3,:128] @ save c/d into synth_buf2
sub r3, r3, #16*4
vst1.32 {q2}, [r3,:128]
vst1.32 {q8}, [r2,:128] @ write 8 output samples
add r2, r2, #16*4
vst1.32 {q9}, [r2,:128]
subs r1, r1, #1
it eq
popeq {r4-r11,pc}
/* advance by 4 samples for the next group of outputs */
cmp r4, #0
itt eq
subeq r8, r8, #512*4
subeq r9, r9, #512*4
sub r5, r5, #512*4
sub r2, r2, #12*4 @ out
add r3, r3, #4*4 @ synth_buf2
add r5, r5, #4*4 @ window
add r9, r9, #4*4 @ synth_buf
sub r8, r8, #4*4 @ synth_buf
b 1b
endfunc

View File

@@ -0,0 +1,243 @@
/*
* Copyright (c) 2013 RISC OS Open Ltd
* Author: Ben Avison <bavison@riscosopen.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
IMDCT .req r0
ORIG_P_SB .req r1
P_SB_OFF .req r2
I .req r0
P_SB2_UP .req r1
OLDFPSCR .req r2
P_SB2_DN .req r3
P_WIN_DN .req r4
P_OUT_DN .req r5
P_SB .req r6
J_WRAP .req r7
P_WIN_UP .req r12
P_OUT_UP .req r14
SCALE .req s0
SBUF_DAT_REV0 .req s4
SBUF_DAT_REV1 .req s5
SBUF_DAT_REV2 .req s6
SBUF_DAT_REV3 .req s7
VA0 .req s8
VA3 .req s11
VB0 .req s12
VB3 .req s15
VC0 .req s8
VC3 .req s11
VD0 .req s12
VD3 .req s15
SBUF_DAT0 .req s16
SBUF_DAT1 .req s17
SBUF_DAT2 .req s18
SBUF_DAT3 .req s19
SBUF_DAT_ALT0 .req s20
SBUF_DAT_ALT1 .req s21
SBUF_DAT_ALT2 .req s22
SBUF_DAT_ALT3 .req s23
WIN_DN_DAT0 .req s24
WIN_UP_DAT0 .req s28
/*
 * One software-pipelined step of the synthesis-filter inner loop,
 * unrolled at assembly time: the .set symbols OFFSET and J advance by
 * 64 samples per expansion, so "head" issues the loads for this step
 * and "tail" retires the multiply-accumulate of the previous one.
 * \half selects which accumulator pair is updated: "ab" (VA/VB, first
 * outer loop) or "cd" (VC/VD, second outer loop); all vmla/vmls run
 * as length-4 VFP short vectors (FPSCR set up by the caller).
 * Two SBUF register banks (SBUF_DAT vs SBUF_DAT_ALT) alternate with
 * OFFSET parity so a load never overwrites data still being consumed.
 * The teq/bne/sub sequence rewinds P_SB by 512 samples exactly when
 * the unrolled index J reaches the ring-buffer wrap point J_WRAP.
 */
.macro inner_loop half, tail, head
.if (OFFSET & (64*4)) == 0 @ even numbered call
SBUF_DAT_THIS0 .req SBUF_DAT0
SBUF_DAT_THIS1 .req SBUF_DAT1
SBUF_DAT_THIS2 .req SBUF_DAT2
SBUF_DAT_THIS3 .req SBUF_DAT3
.ifnc "\head",""
vldr d8, [P_SB, #OFFSET] @ d8 = SBUF_DAT
vldr d9, [P_SB, #OFFSET+8]
.endif
.else
SBUF_DAT_THIS0 .req SBUF_DAT_ALT0
SBUF_DAT_THIS1 .req SBUF_DAT_ALT1
SBUF_DAT_THIS2 .req SBUF_DAT_ALT2
SBUF_DAT_THIS3 .req SBUF_DAT_ALT3
.ifnc "\head",""
vldr d10, [P_SB, #OFFSET] @ d10 = SBUF_DAT_ALT
vldr d11, [P_SB, #OFFSET+8]
.endif
.endif
.ifnc "\tail",""
.ifc "\half","ab"
vmls.f VA0, SBUF_DAT_REV0, WIN_DN_DAT0 @ all operands treated as vectors
.else
vmla.f VD0, SBUF_DAT_REV0, WIN_DN_DAT0 @ all operands treated as vectors
.endif
.endif
.ifnc "\head",""
vldr d14, [P_WIN_UP, #OFFSET] @ d14 = WIN_UP_DAT
vldr d15, [P_WIN_UP, #OFFSET+8]
vldr d12, [P_WIN_DN, #OFFSET] @ d12 = WIN_DN_DAT
vldr d13, [P_WIN_DN, #OFFSET+8]
/* build the element-reversed copy consumed by next step's tail */
vmov SBUF_DAT_REV3, SBUF_DAT_THIS0
vmov SBUF_DAT_REV2, SBUF_DAT_THIS1
vmov SBUF_DAT_REV1, SBUF_DAT_THIS2
vmov SBUF_DAT_REV0, SBUF_DAT_THIS3
.ifc "\half","ab"
vmla.f VB0, SBUF_DAT_THIS0, WIN_UP_DAT0
.else
vmla.f VC0, SBUF_DAT_THIS0, WIN_UP_DAT0
.endif
teq J_WRAP, #J
bne 2f @ strongly predictable, so better than cond exec in this case
sub P_SB, P_SB, #512*4 @ wrap the ring-buffer pointer
2:
.set J, J - 64
.set OFFSET, OFFSET + 64*4
.endif
.unreq SBUF_DAT_THIS0
.unreq SBUF_DAT_THIS1
.unreq SBUF_DAT_THIS2
.unreq SBUF_DAT_THIS3
.endm
/* void ff_synth_filter_float_vfp(FFTContext *imdct,
* float *synth_buf_ptr, int *synth_buf_offset,
* float synth_buf2[32], const float window[512],
* float out[32], const float in[32], float scale)
*/
/* void ff_synth_filter_float_vfp(FFTContext *imdct,
 *      float *synth_buf_ptr, int *synth_buf_offset,
 *      float synth_buf2[32], const float window[512],
 *      float out[32], const float in[32], float scale)
 */
/*
 * VFP synthesis filter: ff_imdct_half_vfp into the 512-sample ring
 * buffer, then two outer loops of 4 iterations each, every iteration
 * expanding inner_loop 9 times (one head-only, 7 head+tail, one
 * tail-only — classic software pipelining). The first outer loop
 * accumulates the VA/VB output halves (scaled and stored to out),
 * the second the VC/VD halves (stored to synth_buf2 for the next
 * call). FPSCR is switched to RunFast + length-4 short vectors for
 * the duration and restored at the end; IMM_OFF_SKEW biases the
 * pointers so the unrolled vldr immediates stay in range.
 */
function ff_synth_filter_float_vfp, export=1
push {r3-r7,lr}
vpush {s16-s31}
ldr lr, [P_SB_OFF]
add a2, ORIG_P_SB, lr, LSL #2 @ calculate synth_buf to pass to imdct_half
mov P_SB, a2 @ and keep a copy for ourselves
bic J_WRAP, lr, #63 @ mangled to make testing for wrap easier in inner loop
sub lr, lr, #32
and lr, lr, #512-32
str lr, [P_SB_OFF] @ rotate offset, modulo buffer size, ready for next call
ldr a3, [sp, #(16+6+2)*4] @ fetch in from stack, to pass to imdct_half
VFP vmov s16, SCALE @ imdct_half is free to corrupt s0, but it contains one of our arguments in hardfp case
bl X(ff_imdct_half_vfp)
VFP vmov SCALE, s16
fmrx OLDFPSCR, FPSCR
ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
fmxr FPSCR, lr
ldr P_SB2_DN, [sp, #16*4]
ldr P_WIN_DN, [sp, #(16+6+0)*4]
ldr P_OUT_DN, [sp, #(16+6+1)*4]
NOVFP vldr SCALE, [sp, #(16+6+3)*4] @ softfp: scale was passed on the stack
#define IMM_OFF_SKEW 956 /* also valid immediate constant when you add 16*4 */
add P_SB, P_SB, #IMM_OFF_SKEW @ so we can use -ve offsets to use full immediate offset range
add P_SB2_UP, P_SB2_DN, #16*4
add P_WIN_UP, P_WIN_DN, #16*4+IMM_OFF_SKEW
add P_OUT_UP, P_OUT_DN, #16*4
add P_SB2_DN, P_SB2_DN, #16*4
add P_WIN_DN, P_WIN_DN, #12*4+IMM_OFF_SKEW
add P_OUT_DN, P_OUT_DN, #16*4
mov I, #4
/* first half: accumulate VA (downward) and VB (upward) outputs */
1:
vldmia P_SB2_UP!, {VB0-VB3} @ seed accumulators from synth_buf2
vldmdb P_SB2_DN!, {VA0-VA3}
.set J, 512 - 64
.set OFFSET, -IMM_OFF_SKEW
inner_loop ab,, head
.rept 7
inner_loop ab, tail, head
.endr
inner_loop ab, tail
add P_WIN_UP, P_WIN_UP, #4*4
sub P_WIN_DN, P_WIN_DN, #4*4
vmul.f VB0, VB0, SCALE @ SCALE treated as scalar
add P_SB, P_SB, #(512+4)*4
subs I, I, #1
vmul.f VA0, VA0, SCALE
vstmia P_OUT_UP!, {VB0-VB3}
vstmdb P_OUT_DN!, {VA0-VA3}
bne 1b
/* second half: compute VC/VD and save them into synth_buf2 */
add P_SB2_DN, P_SB2_DN, #(16+28-12)*4
sub P_SB2_UP, P_SB2_UP, #(16+16)*4
add P_WIN_DN, P_WIN_DN, #(32+16+28-12)*4
mov I, #4
1:
vldr.d d4, zero @ d4 = VC0
vldr.d d5, zero
vldr.d d6, zero @ d6 = VD0
vldr.d d7, zero
.set J, 512 - 64
.set OFFSET, -IMM_OFF_SKEW
inner_loop cd,, head
.rept 7
inner_loop cd, tail, head
.endr
inner_loop cd, tail
add P_WIN_UP, P_WIN_UP, #4*4
sub P_WIN_DN, P_WIN_DN, #4*4
add P_SB, P_SB, #(512+4)*4
subs I, I, #1
vstmia P_SB2_UP!, {VC0-VC3}
vstmdb P_SB2_DN!, {VD0-VD3}
bne 1b
fmxr FPSCR, OLDFPSCR @ restore the caller's FP mode
vpop {s16-s31}
pop {r3-r7,pc}
endfunc
@ Release the .req register aliases used by ff_synth_filter_float_vfp above
@ so the names can be redefined by subsequent code.
.unreq IMDCT
.unreq ORIG_P_SB
.unreq P_SB_OFF
.unreq I
.unreq P_SB2_UP
.unreq OLDFPSCR
.unreq P_SB2_DN
.unreq P_WIN_DN
.unreq P_OUT_DN
.unreq P_SB
.unreq J_WRAP
.unreq P_WIN_UP
.unreq P_OUT_UP
.unreq SCALE
.unreq SBUF_DAT_REV0
.unreq SBUF_DAT_REV1
.unreq SBUF_DAT_REV2
.unreq SBUF_DAT_REV3
.unreq VA0
.unreq VA3
.unreq VB0
.unreq VB3
.unreq VC0
.unreq VC3
.unreq VD0
.unreq VD3
.unreq SBUF_DAT0
.unreq SBUF_DAT1
.unreq SBUF_DAT2
.unreq SBUF_DAT3
.unreq SBUF_DAT_ALT0
.unreq SBUF_DAT_ALT1
.unreq SBUF_DAT_ALT2
.unreq SBUF_DAT_ALT3
.unreq WIN_DN_DAT0
.unreq WIN_UP_DAT0
@ 8-byte-aligned doubleword of zeros, loaded via "vldr.d dN, zero" above to
@ clear accumulator registers.
.align 3
zero: .word 0, 0

View File

@@ -0,0 +1,29 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_ARM_VIDEODSP_ARM_H
#define AVCODEC_ARM_VIDEODSP_ARM_H
#include "libavcodec/avcodec.h"
#include "libavcodec/videodsp.h"
/* Install the ARMv5TE-optimised VideoDSP routines into ctx; bpc is the
 * bits-per-component value passed through from the generic init. */
void ff_videodsp_init_armv5te(VideoDSPContext* ctx, int bpc);
#endif /* AVCODEC_ARM_VIDEODSP_ARM_H */

View File

@@ -0,0 +1,31 @@
@
@ ARMv5te optimized DSP utils
@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
@
@ This file is part of FFmpeg
@
@ FFmpeg is free software; you can redistribute it and/or
@ modify it under the terms of the GNU Lesser General Public
@ License as published by the Free Software Foundation; either
@ version 2.1 of the License, or (at your option) any later version.
@
@ FFmpeg is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
@ Lesser General Public License for more details.
@
@ You should have received a copy of the GNU Lesser General Public
@ License along with FFmpeg; if not, write to the Free Software
@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@
#include "config.h"
#include "libavutil/arm/asm.S"
@ void ff_prefetch_arm(uint8_t *mem, ptrdiff_t stride, int h)
@ Issue a PLD (preload) hint for h rows starting at mem, advancing by stride.
@ Fix: loop via a local label instead of branching to the exported symbol —
@ referencing the global name fails to link on targets that prefix C symbols
@ (e.g. Mach-O adds a leading underscore), and a local branch is always
@ resolved PC-relative.
function ff_prefetch_arm, export=1
1:
        subs            r2,  r2,  #1
        pld             [r0]
        add             r0,  r0,  r1
        bne             1b
        bx              lr
endfunc

View File

@@ -0,0 +1,30 @@
/*
* Copyright (C) 2012 Ronald S. Bultje
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/videodsp.h"
#include "videodsp_arm.h"
/**
 * Runtime dispatcher: probe the host CPU once and install whichever
 * ARM-optimised VideoDSP routines it can execute.
 */
av_cold void ff_videodsp_init_arm(VideoDSPContext *ctx, int bpc)
{
    int flags = av_get_cpu_flags();

    if (have_armv5te(flags)) {
        ff_videodsp_init_armv5te(ctx, bpc);
    }
}

View File

@@ -0,0 +1,33 @@
/*
* Copyright (C) 2012 Ronald S. Bultje
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/videodsp.h"
#include "videodsp_arm.h"
void ff_prefetch_arm(uint8_t *mem, ptrdiff_t stride, int h);
/* Hook up the ARMv5TE assembly prefetch routine when the build enables
 * external ARMv5TE assembly; bpc is unused in this path. */
av_cold void ff_videodsp_init_armv5te(VideoDSPContext *ctx, int bpc)
{
#if HAVE_ARMV5TE_EXTERNAL
ctx->prefetch = ff_prefetch_arm;
#endif
}

View File

@@ -0,0 +1,37 @@
/*
* ARM NEON optimised DSP functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/vorbisdsp.h"
void ff_vorbis_inverse_coupling_neon(float *mag, float *ang,
intptr_t blocksize);
/**
 * Point the Vorbis DSP context at the NEON implementation of inverse
 * channel coupling when the host CPU reports NEON support.
 */
av_cold void ff_vorbisdsp_init_arm(VorbisDSPContext *c)
{
    int flags = av_get_cpu_flags();

    if (!have_neon(flags))
        return;

    c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon;
}

View File

@@ -0,0 +1,83 @@
/*
* ARM NEON optimised DSP functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
@ void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, intptr_t blocksize)
@ In-place inverse channel coupling, 4 floats per group. The main loop is
@ software-pipelined: loads for the next group are issued before the stores
@ of the previous one; label 3 handles the final (or only) group of 4.
function ff_vorbis_inverse_coupling_neon, export=1
vmov.i32 q10, #1<<31 @ sign-bit mask in every lane
subs r2, r2, #4
mov r3, r0 @ r3/r12 = store pointers (r0/r1 run ahead for loads)
mov r12, r1
beq 3f
vld1.32 {d24-d25},[r1,:128]!
vld1.32 {d22-d23},[r0,:128]!
vcle.s32 q8, q12, #0
vand q9, q11, q10
veor q12, q12, q9
vand q2, q12, q8
vbic q3, q12, q8
vadd.f32 q12, q11, q2
vsub.f32 q11, q11, q3
@ Steady state: process group n in q0/q1 while storing group n-1 from q11/q12.
1: vld1.32 {d2-d3}, [r1,:128]!
vld1.32 {d0-d1}, [r0,:128]!
vcle.s32 q8, q1, #0
vand q9, q0, q10
veor q1, q1, q9
vst1.32 {d24-d25},[r3, :128]!
vst1.32 {d22-d23},[r12,:128]!
vand q2, q1, q8
vbic q3, q1, q8
vadd.f32 q1, q0, q2
vsub.f32 q0, q0, q3
subs r2, r2, #8
ble 2f
vld1.32 {d24-d25},[r1,:128]!
vld1.32 {d22-d23},[r0,:128]!
vcle.s32 q8, q12, #0
vand q9, q11, q10
veor q12, q12, q9
vst1.32 {d2-d3}, [r3, :128]!
vst1.32 {d0-d1}, [r12,:128]!
vand q2, q12, q8
vbic q3, q12, q8
vadd.f32 q12, q11, q2
vsub.f32 q11, q11, q3
b 1b
@ Drain the pipeline; fall through to the tail only when 4 elements remain.
2: vst1.32 {d2-d3}, [r3, :128]!
vst1.32 {d0-d1}, [r12,:128]!
it lt
bxlt lr
3: vld1.32 {d2-d3}, [r1,:128]
vld1.32 {d0-d1}, [r0,:128]
vcle.s32 q8, q1, #0
vand q9, q0, q10
veor q1, q1, q9
vand q2, q1, q8
vbic q3, q1, q8
vadd.f32 q1, q0, q2
vsub.f32 q0, q0, q3
vst1.32 {d2-d3}, [r0,:128]!
vst1.32 {d0-d1}, [r1,:128]!
bx lr
endfunc

View File

@@ -0,0 +1,45 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/vp3dsp.h"
void ff_vp3_idct_put_neon(uint8_t *dest, int line_size, int16_t *data);
void ff_vp3_idct_add_neon(uint8_t *dest, int line_size, int16_t *data);
void ff_vp3_idct_dc_add_neon(uint8_t *dest, int line_size, const int16_t *data);
void ff_vp3_v_loop_filter_neon(uint8_t *, int, int *);
void ff_vp3_h_loop_filter_neon(uint8_t *, int, int *);
/**
 * Replace the scalar VP3 DSP entry points (IDCT and loop filters) with
 * their NEON versions when the runtime CPU reports NEON support.
 */
av_cold void ff_vp3dsp_init_arm(VP3DSPContext *c, int flags)
{
    if (!have_neon(av_get_cpu_flags()))
        return;

    c->idct_put      = ff_vp3_idct_put_neon;
    c->idct_add      = ff_vp3_idct_add_neon;
    c->idct_dc_add   = ff_vp3_idct_dc_add_neon;
    c->v_loop_filter = ff_vp3_v_loop_filter_neon;
    c->h_loop_filter = ff_vp3_h_loop_filter_neon;
}

View File

@@ -0,0 +1,395 @@
/*
* Copyright (c) 2009 David Conrad
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
@ VP3/Theora IDCT cosine constants C1..C7 in 16-bit fixed point; loaded into
@ d0/d1 and addressed lane-wise through the xC*S* defines below.
const vp3_idct_constants, align=4
.short 64277, 60547, 54491, 46341, 36410, 25080, 12785
endconst
#define xC1S7 d0[0]
#define xC2S6 d0[1]
#define xC3S5 d0[2]
#define xC4S4 d0[3]
#define xC5S3 d1[0]
#define xC6S2 d1[1]
#define xC7S1 d1[2]
@ Core of the VP3 deblocking filter: expects the four pixel rows/columns
@ p[-2s],p[-s],p[0],p[s] in d16-d19 and the filter limit in r2; leaves the
@ filtered p[-s] in d0 and filtered p[0] in d1.
.macro vp3_loop_filter
vsubl.u8 q3, d18, d17
vsubl.u8 q2, d16, d19
vadd.i16 q1, q3, q3
vadd.i16 q2, q2, q3
vadd.i16 q0, q1, q2 @ 3*(p[0]-p[-s]) + p[-2s]-p[s]
vrshr.s16 q0, q0, #3 @ filter value
vmovl.u8 q9, d18
vdup.u16 q15, r2 @ broadcast the limit
vabs.s16 q1, q0
vshr.s16 q0, q0, #15 @ sign of the filter value
vqsub.u16 q2, q15, q1
vqsub.u16 q3, q2, q1
vsub.i16 q1, q2, q3 @ clamped magnitude
veor q1, q1, q0
vsub.i16 q0, q1, q0 @ re-apply the sign
vaddw.u8 q2, q0, d17
vsub.i16 q3, q9, q0
vqmovun.s16 d0, q2 @ saturate back to 8-bit pixels
vqmovun.s16 d1, q3
.endm
@ void ff_vp3_v_loop_filter_neon(uint8_t *pix, int stride, int *bounding_values)
@ Horizontal-edge (vertical) filter: loads the two rows above and below the
@ edge, applies vp3_loop_filter, writes back the two rows next to the edge.
@ NOTE(review): only the low byte of bounding_values[129] is read as the
@ filter limit — assumes the table entry fits in 8 bits; confirm with caller.
function ff_vp3_v_loop_filter_neon, export=1
sub ip, r0, r1
sub r0, r0, r1, lsl #1
vld1.64 {d16}, [r0,:64], r1
vld1.64 {d17}, [r0,:64], r1
vld1.64 {d18}, [r0,:64], r1
vld1.64 {d19}, [r0,:64], r1
ldrb r2, [r2, #129*4] @ fetch the filter limit from the table
vp3_loop_filter
vst1.64 {d0}, [ip,:64], r1
vst1.64 {d1}, [ip,:64], r1
bx lr
endfunc
@ void ff_vp3_h_loop_filter_neon(uint8_t *pix, int stride, int *bounding_values)
@ Vertical-edge (horizontal) filter: gathers a 4x8 strip two pixels either
@ side of the edge, transposes so the filter works on rows, filters, then
@ transposes back and scatters 16-bit pairs.
function ff_vp3_h_loop_filter_neon, export=1
sub ip, r0, #1
sub r0, r0, #2
vld1.32 {d16[]}, [r0], r1
vld1.32 {d17[]}, [r0], r1
vld1.32 {d18[]}, [r0], r1
vld1.32 {d19[]}, [r0], r1
vld1.32 {d16[1]}, [r0], r1
vld1.32 {d17[1]}, [r0], r1
vld1.32 {d18[1]}, [r0], r1
vld1.32 {d19[1]}, [r0], r1
ldrb r2, [r2, #129*4]
vtrn.8 d16, d17
vtrn.8 d18, d19
vtrn.16 d16, d18
vtrn.16 d17, d19
vp3_loop_filter
vtrn.8 d0, d1
vst1.16 {d0[0]}, [ip], r1
vst1.16 {d1[0]}, [ip], r1
vst1.16 {d0[1]}, [ip], r1
vst1.16 {d1[1]}, [ip], r1
vst1.16 {d0[2]}, [ip], r1
vst1.16 {d1[2]}, [ip], r1
vst1.16 {d0[3]}, [ip], r1
vst1.16 {d1[3]}, [ip], r1
bx lr
endfunc
@ Shared IDCT prologue: load the cosine table into d0/d1 and all 64
@ coefficients into q8-q15, zeroing the coefficient block behind the loads;
@ q1/q8 pre-compute ip[0]+ip[4] and ip[0]-ip[4], then fall through into the
@ first 1-D pass.
function vp3_idct_start_neon
vpush {d8-d15}
vmov.i16 q4, #0
vmov.i16 q5, #0
movrel r3, vp3_idct_constants
vld1.64 {d0-d1}, [r3,:128]
vld1.64 {d16-d19}, [r2,:128]
vst1.64 {q4-q5}, [r2,:128]!
vld1.64 {d20-d23}, [r2,:128]
vst1.64 {q4-q5}, [r2,:128]!
vld1.64 {d24-d27}, [r2,:128]
vst1.64 {q4-q5}, [r2,:128]!
vadd.s16 q1, q8, q12
vsub.s16 q8, q8, q12
vld1.64 {d28-d31}, [r2,:128]
vst1.64 {q4-q5}, [r2,:128]!
vp3_idct_core_neon: @ re-entered via bl for the second (column) pass
vmull.s16 q2, d18, xC1S7 // (ip[1] * C1) << 16
vmull.s16 q3, d19, xC1S7
vmull.s16 q4, d2, xC4S4 // ((ip[0] + ip[4]) * C4) << 16
vmull.s16 q5, d3, xC4S4
vmull.s16 q6, d16, xC4S4 // ((ip[0] - ip[4]) * C4) << 16
vmull.s16 q7, d17, xC4S4
vshrn.s32 d4, q2, #16
vshrn.s32 d5, q3, #16
vshrn.s32 d6, q4, #16
vshrn.s32 d7, q5, #16
vshrn.s32 d8, q6, #16
vshrn.s32 d9, q7, #16
vadd.s16 q12, q1, q3 // E = (ip[0] + ip[4]) * C4
vadd.s16 q8, q8, q4 // F = (ip[0] - ip[4]) * C4
vadd.s16 q1, q2, q9 // ip[1] * C1
vmull.s16 q2, d30, xC1S7 // (ip[7] * C1) << 16
vmull.s16 q3, d31, xC1S7
vmull.s16 q4, d30, xC7S1 // (ip[7] * C7) << 16
vmull.s16 q5, d31, xC7S1
vmull.s16 q6, d18, xC7S1 // (ip[1] * C7) << 16
vmull.s16 q7, d19, xC7S1
vshrn.s32 d4, q2, #16
vshrn.s32 d5, q3, #16
vshrn.s32 d6, q4, #16 // ip[7] * C7
vshrn.s32 d7, q5, #16
vshrn.s32 d8, q6, #16 // ip[1] * C7
vshrn.s32 d9, q7, #16
vadd.s16 q2, q2, q15 // ip[7] * C1
vadd.s16 q9, q1, q3 // A = ip[1] * C1 + ip[7] * C7
vsub.s16 q15, q4, q2 // B = ip[1] * C7 - ip[7] * C1
vmull.s16 q2, d22, xC5S3 // (ip[3] * C5) << 16
vmull.s16 q3, d23, xC5S3
vmull.s16 q4, d22, xC3S5 // (ip[3] * C3) << 16
vmull.s16 q5, d23, xC3S5
vmull.s16 q6, d26, xC5S3 // (ip[5] * C5) << 16
vmull.s16 q7, d27, xC5S3
vshrn.s32 d4, q2, #16
vshrn.s32 d5, q3, #16
vshrn.s32 d6, q4, #16
vshrn.s32 d7, q5, #16
vshrn.s32 d8, q6, #16
vshrn.s32 d9, q7, #16
vadd.s16 q3, q3, q11 // ip[3] * C3
vadd.s16 q4, q4, q13 // ip[5] * C5
vadd.s16 q1, q2, q11 // ip[3] * C5
vadd.s16 q11, q3, q4 // C = ip[3] * C3 + ip[5] * C5
vmull.s16 q2, d26, xC3S5 // (ip[5] * C3) << 16
vmull.s16 q3, d27, xC3S5
vmull.s16 q4, d20, xC2S6 // (ip[2] * C2) << 16
vmull.s16 q5, d21, xC2S6
vmull.s16 q6, d28, xC6S2 // (ip[6] * C6) << 16
vmull.s16 q7, d29, xC6S2
vshrn.s32 d4, q2, #16
vshrn.s32 d5, q3, #16
vshrn.s32 d6, q4, #16
vshrn.s32 d7, q5, #16
vshrn.s32 d8, q6, #16 // ip[6] * C6
vshrn.s32 d9, q7, #16
vadd.s16 q2, q2, q13 // ip[5] * C3
vadd.s16 q3, q3, q10 // ip[2] * C2
vsub.s16 q13, q2, q1 // D = ip[5] * C3 - ip[3] * C5
vsub.s16 q1, q9, q11 // (A - C)
vadd.s16 q11, q9, q11 // Cd = A + C
vsub.s16 q9, q15, q13 // (B - D)
vadd.s16 q13, q15, q13 // Dd = B + D
vadd.s16 q15, q3, q4 // G = ip[2] * C2 + ip[6] * C6
vmull.s16 q2, d2, xC4S4 // ((A - C) * C4) << 16
vmull.s16 q3, d3, xC4S4
vmull.s16 q4, d28, xC2S6 // (ip[6] * C2) << 16
vmull.s16 q5, d29, xC2S6
vmull.s16 q6, d20, xC6S2 // (ip[2] * C6) << 16
vmull.s16 q7, d21, xC6S2
vshrn.s32 d4, q2, #16
vshrn.s32 d5, q3, #16
vshrn.s32 d6, q4, #16
vshrn.s32 d7, q5, #16
vshrn.s32 d8, q6, #16 // ip[2] * C6
vmull.s16 q5, d18, xC4S4 // ((B - D) * C4) << 16
vmull.s16 q6, d19, xC4S4
vshrn.s32 d9, q7, #16
vadd.s16 q3, q3, q14 // ip[6] * C2
vadd.s16 q10, q1, q2 // Ad = (A - C) * C4
vsub.s16 q14, q4, q3 // H = ip[2] * C6 - ip[6] * C2
bx lr
endfunc
@ Epilogue of one 1-D IDCT pass: final butterflies producing the 8 outputs
@ in q8-q15. The "row" variant also transposes the 8x8 block and pre-computes
@ q1/q8 for the next pass; the "col" variant first adds the bias in r3
@ (rounding, plus the signed->unsigned offset for idct_put).
.macro VP3_IDCT_END type
function vp3_idct_end_\type\()_neon
.ifc \type, col
vdup.16 q0, r3
vadd.s16 q12, q12, q0
vadd.s16 q8, q8, q0
.endif
vshrn.s32 d2, q5, #16
vshrn.s32 d3, q6, #16
vadd.s16 q2, q12, q15 // Gd = E + G
vadd.s16 q9, q1, q9 // (B - D) * C4
vsub.s16 q12, q12, q15 // Ed = E - G
vsub.s16 q3, q8, q10 // Fd = F - Ad
vadd.s16 q10, q8, q10 // Add = F + Ad
vadd.s16 q4, q9, q14 // Hd = Bd + H
vsub.s16 q14, q9, q14 // Bdd = Bd - H
vadd.s16 q8, q2, q11 // [0] = Gd + Cd
vsub.s16 q15, q2, q11 // [7] = Gd - Cd
vadd.s16 q9, q10, q4 // [1] = Add + Hd
vsub.s16 q10, q10, q4 // [2] = Add - Hd
vadd.s16 q11, q12, q13 // [3] = Ed + Dd
vsub.s16 q12, q12, q13 // [4] = Ed - Dd
.ifc \type, row
vtrn.16 q8, q9
.endif
vadd.s16 q13, q3, q14 // [5] = Fd + Bdd
vsub.s16 q14, q3, q14 // [6] = Fd - Bdd
.ifc \type, row
// 8x8 transpose
vtrn.16 q10, q11
vtrn.16 q12, q13
vtrn.16 q14, q15
vtrn.32 q8, q10
vtrn.32 q9, q11
vtrn.32 q12, q14
vtrn.32 q13, q15
vswp d17, d24
vswp d19, d26
vadd.s16 q1, q8, q12 // prime ip[0]+ip[4] for the column pass
vswp d21, d28
vsub.s16 q8, q8, q12 // prime ip[0]-ip[4] for the column pass
vswp d23, d30
.endif
bx lr
endfunc
.endm
VP3_IDCT_END row
VP3_IDCT_END col
@ void ff_vp3_idct_put_neon(uint8_t *dest, int line_size, int16_t *data)
@ Full 2-D IDCT, overwriting dest: row pass, column pass with a +2048 bias
@ so the final saturating >>4 (vqshrun #4) yields unsigned 8-bit pixels.
function ff_vp3_idct_put_neon, export=1
mov ip, lr
bl vp3_idct_start_neon
bl vp3_idct_end_row_neon
mov r3, #8
add r3, r3, #2048 // convert signed pixel to unsigned
bl vp3_idct_core_neon
bl vp3_idct_end_col_neon
mov lr, ip
vpop {d8-d15}
vqshrun.s16 d0, q8, #4
vqshrun.s16 d1, q9, #4
vqshrun.s16 d2, q10, #4
vqshrun.s16 d3, q11, #4
vst1.64 {d0}, [r0,:64], r1
vqshrun.s16 d4, q12, #4
vst1.64 {d1}, [r0,:64], r1
vqshrun.s16 d5, q13, #4
vst1.64 {d2}, [r0,:64], r1
vqshrun.s16 d6, q14, #4
vst1.64 {d3}, [r0,:64], r1
vqshrun.s16 d7, q15, #4
vst1.64 {d4}, [r0,:64], r1
vst1.64 {d5}, [r0,:64], r1
vst1.64 {d6}, [r0,:64], r1
vst1.64 {d7}, [r0,:64], r1
bx lr
endfunc
@ void ff_vp3_idct_add_neon(uint8_t *dest, int line_size, int16_t *data)
@ Full 2-D IDCT of the residual (rounding bias 8, >>4), added to the
@ existing pixels at dest and saturated back to 8 bits. Loads (via r0) are
@ interleaved with stores (via r2) for pipelining.
function ff_vp3_idct_add_neon, export=1
mov ip, lr
bl vp3_idct_start_neon
bl vp3_idct_end_row_neon
mov r3, #8
bl vp3_idct_core_neon
bl vp3_idct_end_col_neon
mov lr, ip
vpop {d8-d15}
mov r2, r0
vld1.64 {d0}, [r0,:64], r1
vshr.s16 q8, q8, #4
vld1.64 {d1}, [r0,:64], r1
vshr.s16 q9, q9, #4
vld1.64 {d2}, [r0,:64], r1
vaddw.u8 q8, q8, d0
vld1.64 {d3}, [r0,:64], r1
vaddw.u8 q9, q9, d1
vld1.64 {d4}, [r0,:64], r1
vshr.s16 q10, q10, #4
vld1.64 {d5}, [r0,:64], r1
vshr.s16 q11, q11, #4
vld1.64 {d6}, [r0,:64], r1
vqmovun.s16 d0, q8
vld1.64 {d7}, [r0,:64], r1
vqmovun.s16 d1, q9
vaddw.u8 q10, q10, d2
vaddw.u8 q11, q11, d3
vshr.s16 q12, q12, #4
vshr.s16 q13, q13, #4
vqmovun.s16 d2, q10
vqmovun.s16 d3, q11
vaddw.u8 q12, q12, d4
vaddw.u8 q13, q13, d5
vshr.s16 q14, q14, #4
vshr.s16 q15, q15, #4
vst1.64 {d0}, [r2,:64], r1
vqmovun.s16 d4, q12
vst1.64 {d1}, [r2,:64], r1
vqmovun.s16 d5, q13
vst1.64 {d2}, [r2,:64], r1
vaddw.u8 q14, q14, d6
vst1.64 {d3}, [r2,:64], r1
vaddw.u8 q15, q15, d7
vst1.64 {d4}, [r2,:64], r1
vqmovun.s16 d6, q14
vst1.64 {d5}, [r2,:64], r1
vqmovun.s16 d7, q15
vst1.64 {d6}, [r2,:64], r1
vst1.64 {d7}, [r2,:64], r1
bx lr
endfunc
@ void ff_vp3_idct_dc_add_neon(uint8_t *dest, int line_size, const int16_t *data)
@ DC-only IDCT: broadcast dc = (data[0] + 15) >> 5, add it to all 64 pixels
@ with unsigned saturation, and clear data[0] for the next block.
@ Fix: "mov r12, 0" was missing the '#' immediate prefix; every other
@ immediate in this file uses it, and stricter assemblers reject the bare
@ form.
function ff_vp3_idct_dc_add_neon, export=1
        ldrsh           r12, [r2]
        mov             r3,  r0
        add             r12, r12, #15
        vdup.16         q15, r12
        mov             r12, #0
        strh            r12, [r2]               @ zero the DC coefficient
        vshr.s16        q15, q15, #5            @ q15 = (dc + 15) >> 5 in all lanes
        vld1.8          {d0}, [r0,:64], r1
        vld1.8          {d1}, [r0,:64], r1
        vld1.8          {d2}, [r0,:64], r1
        vaddw.u8        q8,  q15, d0
        vld1.8          {d3}, [r0,:64], r1
        vaddw.u8        q9,  q15, d1
        vld1.8          {d4}, [r0,:64], r1
        vaddw.u8        q10, q15, d2
        vld1.8          {d5}, [r0,:64], r1
        vaddw.u8        q11, q15, d3
        vld1.8          {d6}, [r0,:64], r1
        vaddw.u8        q12, q15, d4
        vld1.8          {d7}, [r0,:64], r1
        vaddw.u8        q13, q15, d5
        vqmovun.s16     d0,  q8
        vaddw.u8        q14, q15, d6
        vqmovun.s16     d1,  q9
        vaddw.u8        q15, q15, d7            @ q15 (dc) no longer needed after this
        vqmovun.s16     d2,  q10
        vst1.8          {d0}, [r3,:64], r1
        vqmovun.s16     d3,  q11
        vst1.8          {d1}, [r3,:64], r1
        vqmovun.s16     d4,  q12
        vst1.8          {d2}, [r3,:64], r1
        vqmovun.s16     d5,  q13
        vst1.8          {d3}, [r3,:64], r1
        vqmovun.s16     d6,  q14
        vst1.8          {d4}, [r3,:64], r1
        vqmovun.s16     d7,  q15
        vst1.8          {d5}, [r3,:64], r1
        vst1.8          {d6}, [r3,:64], r1
        vst1.8          {d7}, [r3,:64], r1
        bx              lr
endfunc

View File

@@ -0,0 +1,121 @@
/*
* Copyright (C) 2010 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_ARM_VP56_ARITH_H
#define AVCODEC_ARM_VP56_ARITH_H
/* A(x): emit x only when assembling for ARM mode; T(x): only for Thumb. */
#if CONFIG_THUMB
# define A(x)
# define T(x) x
#else
# define A(x) x
# define T(x)
#endif
/* L(x)/U(x) pick between the legacy "ldrcsh" and UAL "ldrhcs" spellings of
 * the conditional halfword load, depending on the assembler dialect. */
#if CONFIG_THUMB || defined __clang__
# define L(x)
# define U(x) x
#else
# define L(x) x
# define U(x)
#endif
#if HAVE_ARMV6_INLINE
#define vp56_rac_get_prob vp56_rac_get_prob_armv6
/* Decode one bit with probability pr from the VP5/6 range coder.
 * ARMv6 fast path: renormalize by shift, refill 16 bits from the buffer when
 * the cached bit count overflowed (carry-set path, byte-swapped via rev16),
 * split the range with smlabb, then compare/subtract to yield the bit. */
static inline int vp56_rac_get_prob_armv6(VP56RangeCoder *c, int pr)
{
unsigned shift = ff_vp56_norm_shift[c->high];
unsigned code_word = c->code_word << shift;
unsigned high = c->high << shift;
unsigned bit;
__asm__ ("adds %3, %3, %0 \n"
"itt cs \n"
"cmpcs %7, %4 \n"
L("ldrcsh %2, [%4], #2 \n")
U("ldrhcs %2, [%4], #2 \n")
"rsb %0, %6, #256 \n"
"smlabb %0, %5, %6, %0 \n"
T("itttt cs \n")
"rev16cs %2, %2 \n"
T("lslcs %2, %2, %3 \n")
T("orrcs %1, %1, %2 \n")
A("orrcs %1, %1, %2, lsl %3 \n")
"subcs %3, %3, #16 \n"
"lsr %0, %0, #8 \n"
"cmp %1, %0, lsl #16 \n"
"ittte ge \n"
"subge %1, %1, %0, lsl #16 \n"
"subge %0, %5, %0 \n"
"movge %2, #1 \n"
"movlt %2, #0 \n"
: "=&r"(c->high), "=&r"(c->code_word), "=&r"(bit),
"+&r"(c->bits), "+&r"(c->buffer)
: "r"(high), "r"(pr), "r"(c->end - 1),
"0"(shift), "1"(code_word)
: "cc");
return bit;
}
#define vp56_rac_get_prob_branchy vp56_rac_get_prob_branchy_armv6
/* Same renormalize/refill/split sequence as vp56_rac_get_prob_armv6, but
 * the asm stops after computing the split point (tmp = low << 16); the
 * final comparison and state update are left in C so the compiler can
 * schedule the branch for callers that branch on the result. */
static inline int vp56_rac_get_prob_branchy_armv6(VP56RangeCoder *c, int pr)
{
unsigned shift = ff_vp56_norm_shift[c->high];
unsigned code_word = c->code_word << shift;
unsigned high = c->high << shift;
unsigned low;
unsigned tmp;
__asm__ ("adds %3, %3, %0 \n"
"itt cs \n"
"cmpcs %7, %4 \n"
L("ldrcsh %2, [%4], #2 \n")
U("ldrhcs %2, [%4], #2 \n")
"rsb %0, %6, #256 \n"
"smlabb %0, %5, %6, %0 \n"
T("itttt cs \n")
"rev16cs %2, %2 \n"
T("lslcs %2, %2, %3 \n")
T("orrcs %1, %1, %2 \n")
A("orrcs %1, %1, %2, lsl %3 \n")
"subcs %3, %3, #16 \n"
"lsr %0, %0, #8 \n"
"lsl %2, %0, #16 \n"
: "=&r"(low), "+&r"(code_word), "=&r"(tmp),
"+&r"(c->bits), "+&r"(c->buffer)
: "r"(high), "r"(pr), "r"(c->end - 1), "0"(shift)
: "cc");
if (code_word >= tmp) {
c->high = high - low;
c->code_word = code_word - tmp;
return 1;
}
c->high = low;
c->code_word = code_word;
return 0;
}
#endif
#endif /* AVCODEC_ARM_VP56_ARITH_H */

View File

@@ -0,0 +1,39 @@
/*
* Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/vp56dsp.h"
void ff_vp6_edge_filter_hor_neon(uint8_t *yuv, int stride, int t);
void ff_vp6_edge_filter_ver_neon(uint8_t *yuv, int stride, int t);
/**
 * Select the NEON implementations of the VP6 edge filters when the
 * runtime CPU supports NEON. The codec id is unused in this path.
 */
av_cold void ff_vp6dsp_init_arm(VP56DSPContext *s, enum AVCodecID codec)
{
    if (!have_neon(av_get_cpu_flags()))
        return;

    s->edge_filter_hor = ff_vp6_edge_filter_hor_neon;
    s->edge_filter_ver = ff_vp6_edge_filter_ver_neon;
}

View File

@@ -0,0 +1,121 @@
/*
* Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
@ Core of the VP6 deblocking filter. Expects the four pixel lines
@ p[-2s],p[-s],p[0],p[s] in q8-q11 and the threshold t in r2; the q-register
@ lanes handle 8 pixels and the d28.. path 4 more (the callers below store
@ 12 filtered pixels). Filtered p[-s] lands in d18/d19, p[0] in d20/d21.
.macro vp6_edge_filter
vdup.16 q3, r2 @ t
vmov.i16 q13, #1
vsubl.u8 q0, d20, d18 @ p[ 0] - p[-s]
vsubl.u8 q1, d16, d22 @ p[-2*s] - p[ s]
vsubl.u8 q14, d21, d19
vsubl.u8 q15, d17, d23
vadd.i16 q2, q0, q0 @ 2*(p[0]-p[-s])
vadd.i16 d29, d28, d28
vadd.i16 q0, q0, q1 @ p[0]-p[-s] + p[-2*s]-p[s]
vadd.i16 d28, d28, d30
vadd.i16 q0, q0, q2 @ 3*(p[0]-p[-s]) + p[-2*s]-p[s]
vadd.i16 d28, d28, d29
vrshr.s16 q0, q0, #3 @ v
vrshr.s16 d28, d28, #3
vsub.i16 q8, q3, q13 @ t-1
vabs.s16 q1, q0 @ V
vshr.s16 q2, q0, #15 @ s
vabs.s16 d30, d28
vshr.s16 d29, d28, #15
vsub.i16 q12, q1, q3 @ V-t
vsub.i16 d31, d30, d6
vsub.i16 q12, q12, q13 @ V-t-1
vsub.i16 d31, d31, d26
vcge.u16 q12, q12, q8 @ V-t-1 >= t-1
vcge.u16 d31, d31, d16
vadd.i16 q13, q3, q3 @ 2*t
vadd.i16 d16, d6, d6
vsub.i16 q13, q13, q1 @ 2*t - V
vsub.i16 d16, d16, d30
vadd.i16 q13, q13, q2 @ += s
vadd.i16 d16, d16, d29
veor q13, q13, q2 @ ^= s
veor d16, d16, d29
vbif q0, q13, q12 @ select v or the folded-back value per lane
vbif d28, d16, d31
vmovl.u8 q1, d20
vmovl.u8 q15, d21
vaddw.u8 q2, q0, d18
vaddw.u8 q3, q14, d19
vsub.i16 q1, q1, q0
vsub.i16 d30, d30, d28
vqmovun.s16 d18, q2 @ filtered p[-s], saturated to 8 bits
vqmovun.s16 d19, q3
vqmovun.s16 d20, q1 @ filtered p[0]
vqmovun.s16 d21, q15
.endm
@ void ff_vp6_edge_filter_ver_neon(uint8_t *yuv, int stride, int t)
@ Horizontal-edge filter: loads the 4 lines around the edge, filters, and
@ writes back 12 pixels of the two lines adjacent to the edge.
function ff_vp6_edge_filter_ver_neon, export=1
sub r0, r0, r1, lsl #1
vld1.8 {q8}, [r0], r1 @ p[-2*s]
vld1.8 {q9}, [r0], r1 @ p[-s]
vld1.8 {q10}, [r0], r1 @ p[0]
vld1.8 {q11}, [r0] @ p[s]
vp6_edge_filter
sub r0, r0, r1, lsl #1
sub r1, r1, #8
vst1.8 {d18}, [r0]!
vst1.32 {d19[0]}, [r0], r1 @ 8 + 4 = 12 pixels per line
vst1.8 {d20}, [r0]!
vst1.32 {d21[0]}, [r0]
bx lr
endfunc
@ void ff_vp6_edge_filter_hor_neon(uint8_t *yuv, int stride, int t)
@ Vertical-edge filter: gathers 4-byte strips from 12 rows two pixels either
@ side of the edge, transposes so the filter works on rows, filters, then
@ transposes back and scatters 16-bit pairs next to the edge.
function ff_vp6_edge_filter_hor_neon, export=1
sub r3, r0, #1
sub r0, r0, #2
vld1.32 {d16[0]}, [r0], r1
vld1.32 {d18[0]}, [r0], r1
vld1.32 {d20[0]}, [r0], r1
vld1.32 {d22[0]}, [r0], r1
vld1.32 {d16[1]}, [r0], r1
vld1.32 {d18[1]}, [r0], r1
vld1.32 {d20[1]}, [r0], r1
vld1.32 {d22[1]}, [r0], r1
vld1.32 {d17[0]}, [r0], r1
vld1.32 {d19[0]}, [r0], r1
vld1.32 {d21[0]}, [r0], r1
vld1.32 {d23[0]}, [r0], r1
vtrn.8 q8, q9
vtrn.8 q10, q11
vtrn.16 q8, q10
vtrn.16 q9, q11
vp6_edge_filter
vtrn.8 q9, q10
vst1.16 {d18[0]}, [r3], r1
vst1.16 {d20[0]}, [r3], r1
vst1.16 {d18[1]}, [r3], r1
vst1.16 {d20[1]}, [r3], r1
vst1.16 {d18[2]}, [r3], r1
vst1.16 {d20[2]}, [r3], r1
vst1.16 {d18[3]}, [r3], r1
vst1.16 {d20[3]}, [r3], r1
vst1.16 {d19[0]}, [r3], r1
vst1.16 {d21[0]}, [r3], r1
vst1.16 {d19[1]}, [r3], r1
vst1.16 {d21[1]}, [r3], r1
bx lr
endfunc

View File

@@ -0,0 +1,35 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_ARM_VP8_H
#define AVCODEC_ARM_VP8_H
#include <stdint.h>
#include "config.h"
#include "libavcodec/vp56.h"
#include "libavcodec/vp8.h"
#if HAVE_ARMV6_EXTERNAL
/* Substitute the C coefficient decoder with the hand-written ARMv6 version
 * below (the VP8 decoder calls decode_block_coeffs_internal). */
#define decode_block_coeffs_internal ff_decode_block_coeffs_armv6
int ff_decode_block_coeffs_armv6(VP56RangeCoder *rc, int16_t block[16],
uint8_t probs[8][3][NUM_DCT_TOKENS-1],
int i, uint8_t *token_prob, int16_t qmul[2]);
#endif
#endif /* AVCODEC_ARM_VP8_H */

View File

@@ -0,0 +1,248 @@
/*
* Copyright (C) 2010 Mans Rullgard
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
@ Range-coder "decode bit with probability pr" — assembly counterpart of the
@ inline-asm in vp56_arith.h. On entry \t0 holds the normalization shift;
@ refills 16 bits from \buf when the cached bit count (\bs) overflows, splits
@ the range with smlabb, and leaves the flags from the final cmp/sub set so
@ the caller can branch on ge (bit = 1 path) / lt (bit = 0 path).
.macro rac_get_prob h, bs, buf, cw, pr, t0, t1
adds \bs, \bs, \t0
lsl \cw, \cw, \t0
lsl \t0, \h, \t0
rsb \h, \pr, #256
it cs
ldrhcs \t1, [\buf], #2
smlabb \h, \t0, \pr, \h
T itttt cs
rev16cs \t1, \t1
A orrcs \cw, \cw, \t1, lsl \bs
T lslcs \t1, \t1, \bs
T orrcs \cw, \cw, \t1
subcs \bs, \bs, #16
lsr \h, \h, #8
cmp \cw, \h, lsl #16
itt ge
subge \cw, \cw, \h, lsl #16
subge \h, \t0, \h
.endm
@ Same as rac_get_prob but for the fixed probability 128 (an even split),
@ which lets the split point be formed with a shift instead of smlabb.
.macro rac_get_128 h, bs, buf, cw, t0, t1
adds \bs, \bs, \t0
lsl \cw, \cw, \t0
lsl \t0, \h, \t0
it cs
ldrhcs \t1, [\buf], #2
mov \h, #128
it cs
rev16cs \t1, \t1
add \h, \h, \t0, lsl #7
A orrcs \cw, \cw, \t1, lsl \bs
T ittt cs
T lslcs \t1, \t1, \bs
T orrcs \cw, \cw, \t1
subcs \bs, \bs, #16
lsr \h, \h, #8
cmp \cw, \h, lsl #16
itt ge
subge \cw, \cw, \h, lsl #16
subge \h, \t0, \h
.endm
@ int ff_decode_block_coeffs_armv6(VP56RangeCoder *rc,                    r0
@                                  int16_t block[16],                     r1
@                                  uint8_t probs[8][3][NUM_DCT_TOKENS-1], r2
@                                  int i,                                 r3
@                                  uint8_t *token_prob,                   stack
@                                  int16_t qmul[2])                       stack
@ Decode the DCT coefficient tokens of one VP8 block starting at index i;
@ returns the index one past the last decoded coefficient.
@ Body register use: r5/r6/r7/r8 = coder high/bits/buf/code_word,
@ lr = ff_vp56_norm_shift table, r11 = packed qmul pair, r3 = coeff index,
@ r4 = current token probability pointer (33 bytes per coefficient row).
function ff_decode_block_coeffs_armv6, export=1
push {r0,r1,r4-r11,lr} @ keep rc at [sp] and block at [sp, #4] for reloads
movrelx lr, X(ff_vp56_norm_shift)
ldrd r4, r5, [sp, #44] @ token_prob, qmul
cmp r3, #0
ldr r11, [r5] @ r11 = qmul[1]:qmul[0] packed in one word
ldm r0, {r5-r7} @ high, bits, buf
it ne
pkhtbne r11, r11, r11, asr #16 @ i != 0: DC done, use qmul[1] in both halves
ldr r8, [r0, #16] @ code_word
0: @ top of the per-coefficient loop
ldrb r9, [lr, r5] @ renormalization shift for the current "high"
add r3, r3, #1
ldrb r0, [r4, #1] @ token_prob[1]: zero vs non-zero coefficient
rac_get_prob r5, r6, r7, r8, r0, r9, r10
blt 2f @ coefficient is 0
ldrb r9, [lr, r5]
ldrb r0, [r4, #2] @ token_prob[2]: magnitude 1 vs larger
rac_get_prob r5, r6, r7, r8, r0, r9, r10
ldrb r9, [lr, r5]
bge 3f @ magnitude > 1
@ magnitude 1: compute r4 = &probs[r3] + 11 (context "last was 1") while an
@ inlined rac_get_128 decodes the sign bit
add r4, r3, r3, lsl #5 @ r3 * 33, the row stride of probs
sxth r12, r11 @ coefficient value = qmul (sign applied below)
add r4, r4, r2
adds r6, r6, r9
add r4, r4, #11
lsl r8, r8, r9
it cs
ldrhcs r10, [r7], #2 @ refill: next 16 coded bits
lsl r9, r5, r9
mov r5, #128
it cs
rev16cs r10, r10
add r5, r5, r9, lsl #7 @ split numerator = high * 128 + 128
T ittt cs
T lslcs r10, r10, r6
T orrcs r8, r8, r10
A orrcs r8, r8, r10, lsl r6
subcs r6, r6, #16
lsr r5, r5, #8
cmp r8, r5, lsl #16 @ GE <=> sign bit is 1
movrel r10, zigzag_scan-1 @ -1 compensates the pre-incremented r3
itt ge
subge r8, r8, r5, lsl #16
subge r5, r9, r5
ldrb r10, [r10, r3] @ byte offset of this coefficient in scan order
it ge
rsbge r12, r12, #0 @ sign bit set: negate
cmp r3, #16
strh r12, [r1, r10]
bge 6f @ all 16 coefficients done
5: @ decode the end-of-block flag for the next position
ldrb r9, [lr, r5]
ldrb r0, [r4] @ token_prob[0]: more coefficients vs EOB
rac_get_prob r5, r6, r7, r8, r0, r9, r10
pkhtb r11, r11, r11, asr #16 @ past DC: qmul[1] from now on
bge 0b
6: @ write the range coder state back and return
ldr r0, [sp] @ reload rc
ldr r9, [r0, #12] @ coder buffer end pointer
cmp r7, r9
it hi
movhi r7, r9 @ clamp buf so it never points past the end
stm r0, {r5-r7} @ high, bits, buf
str r8, [r0, #16] @ code_word
add sp, sp, #8 @ drop the saved r0/r1
mov r0, r3 @ return the coefficient index
pop {r4-r11,pc}
2: @ zero coefficient: next probs row, context 0
add r4, r3, r3, lsl #5
cmp r3, #16
add r4, r4, r2
pkhtb r11, r11, r11, asr #16
bne 0b
b 6b
3: @ magnitude > 1: walk the rest of the token tree
ldrb r0, [r4, #3] @ token_prob[3]: literal 2..4 vs extra-bit categories
rac_get_prob r5, r6, r7, r8, r0, r9, r10
ldrb r9, [lr, r5]
bge 1f
@ literal magnitudes 2, 3 and 4
mov r12, #2
ldrb r0, [r4, #4]
rac_get_prob r5, r6, r7, r8, r0, r9, r10
it ge
addge r12, #1
ldrb r9, [lr, r5]
blt 4f @ r12 = 2
ldrb r0, [r4, #5]
rac_get_prob r5, r6, r7, r8, r0, r9, r10
it ge
addge r12, #1 @ r12 = 3 or 4
ldrb r9, [lr, r5]
b 4f
1: @ extra-bit categories
ldrb r0, [r4, #6] @ cat1/cat2 vs cat3..cat6
rac_get_prob r5, r6, r7, r8, r0, r9, r10
ldrb r9, [lr, r5]
bge 3f
ldrb r0, [r4, #7] @ cat1 vs cat2
rac_get_prob r5, r6, r7, r8, r0, r9, r10
ldrb r9, [lr, r5]
bge 2f
@ cat1: base value 5 plus one extra bit (probability 159)
mov r12, #5
mov r0, #159
rac_get_prob r5, r6, r7, r8, r0, r9, r10
it ge
addge r12, r12, #1
ldrb r9, [lr, r5]
b 4f
2: @ cat2: base value 7 plus two extra bits (probabilities 165, 145)
mov r12, #7
mov r0, #165
rac_get_prob r5, r6, r7, r8, r0, r9, r10
it ge
addge r12, r12, #2
ldrb r9, [lr, r5]
mov r0, #145
rac_get_prob r5, r6, r7, r8, r0, r9, r10
it ge
addge r12, r12, #1
ldrb r9, [lr, r5]
b 4f
3: @ cat3..cat6: pick the category, then read its extra bits
ldrb r0, [r4, #8]
rac_get_prob r5, r6, r7, r8, r0, r9, r10
it ge
addge r4, r4, #1 @ select the upper probability pair
ldrb r9, [lr, r5]
ite ge
movge r12, #2
movlt r12, #0
ldrb r0, [r4, #9]
rac_get_prob r5, r6, r7, r8, r0, r9, r10
mov r9, #8
it ge
addge r12, r12, #1 @ r12 = category index 0..3 (cat3..cat6)
movrelx r4, X(ff_vp8_dct_cat_prob), r1
lsl r9, r9, r12 @ 8 << cat
ldr r4, [r4, r12, lsl #2] @ r4 -> that category's extra-bit prob list
add r12, r9, #3 @ base value: 11, 19, 35 or 67
mov r1, #0
ldrb r0, [r4], #1
1: @ accumulate extra bits MSB-first until the 0-terminated prob list ends
ldrb r9, [lr, r5]
lsl r1, r1, #1
rac_get_prob r5, r6, r7, r8, r0, r9, r10
ldrb r0, [r4], #1
it ge
addge r1, r1, #1
cmp r0, #0
bne 1b
ldrb r9, [lr, r5]
add r12, r12, r1 @ magnitude = base + extra bits
ldr r1, [sp, #4] @ reload the block pointer (r1 was used as scratch)
4: @ common tail for magnitude > 1: sign, quantizer scale, store
add r4, r3, r3, lsl #5
add r4, r4, r2
add r4, r4, #22 @ next context: "last was > 1"
rac_get_128 r5, r6, r7, r8, r9, r10
it ge
rsbge r12, r12, #0 @ sign bit set: negate
smulbb r12, r12, r11 @ scale by the current qmul
movrel r9, zigzag_scan-1 @ -1 compensates the pre-incremented r3
ldrb r9, [r9, r3]
cmp r3, #16
strh r12, [r1, r9] @ store at the scan-order position
bge 6b
b 5b
endfunc
@ VP8 4x4 zig-zag scan order, pre-multiplied by 2 so each entry is a byte
@ offset into an int16_t block[16].
const zigzag_scan
.byte 0, 2, 8, 16
.byte 10, 4, 6, 12
.byte 18, 24, 26, 20
.byte 14, 22, 28, 30
endconst

View File

@@ -0,0 +1,78 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_ARM_VP8DSP_H
#define AVCODEC_ARM_VP8DSP_H
#include "libavcodec/vp8dsp.h"
/* Per-CPU initializers: each installs the function pointers it provides
 * into *dsp and leaves every other entry untouched. */
void ff_vp8dsp_init_armv6(VP8DSPContext *dsp);
void ff_vp8dsp_init_neon(VP8DSPContext *dsp);
/* Declare one luma (16 px wide) loop filter: hv is the direction (h/v),
 * inner is "" or "_inner", opt is the instruction-set suffix. */
#define VP8_LF_Y(hv, inner, opt) \
void ff_vp8_##hv##_loop_filter16##inner##_##opt(uint8_t *dst, \
ptrdiff_t stride, \
int flim_E, int flim_I, \
int hev_thresh)
/* Declare one chroma loop filter, which processes the U and V planes
 * (8 px wide each) together. */
#define VP8_LF_UV(hv, inner, opt) \
void ff_vp8_##hv##_loop_filter8uv##inner##_##opt(uint8_t *dstU, \
uint8_t *dstV, \
ptrdiff_t stride, \
int flim_E, int flim_I, \
int hev_thresh)
/* Declare one "simple" loop filter, which takes a single limit. */
#define VP8_LF_SIMPLE(hv, opt) \
void ff_vp8_##hv##_loop_filter16_simple_##opt(uint8_t *dst, \
ptrdiff_t stride, \
int flim)
/* Both directions of one loop-filter flavour. */
#define VP8_LF_HV(inner, opt) \
VP8_LF_Y(h, inner, opt); \
VP8_LF_Y(v, inner, opt); \
VP8_LF_UV(h, inner, opt); \
VP8_LF_UV(v, inner, opt)
/* All loop-filter prototypes for one instruction set. */
#define VP8_LF(opt) \
VP8_LF_HV(, opt); \
VP8_LF_HV(_inner, opt); \
VP8_LF_SIMPLE(h, opt); \
VP8_LF_SIMPLE(v, opt)
/* Declare one motion-compensation function. */
#define VP8_MC(n, opt) \
void ff_put_vp8_##n##_##opt(uint8_t *dst, ptrdiff_t dststride, \
uint8_t *src, ptrdiff_t srcstride, \
int h, int x, int y)
/* All subpel (4/6-tap) MC prototypes for one block width. */
#define VP8_EPEL(w, opt) \
VP8_MC(pixels ## w, opt); \
VP8_MC(epel ## w ## _h4, opt); \
VP8_MC(epel ## w ## _h6, opt); \
VP8_MC(epel ## w ## _v4, opt); \
VP8_MC(epel ## w ## _h4v4, opt); \
VP8_MC(epel ## w ## _h6v4, opt); \
VP8_MC(epel ## w ## _v6, opt); \
VP8_MC(epel ## w ## _h4v6, opt); \
VP8_MC(epel ## w ## _h6v6, opt)
/* All bilinear MC prototypes for one block width. */
#define VP8_BILIN(w, opt) \
VP8_MC(bilin ## w ## _h, opt); \
VP8_MC(bilin ## w ## _v, opt); \
VP8_MC(bilin ## w ## _hv, opt)
#endif /* AVCODEC_ARM_VP8DSP_H */

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,34 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/vp8dsp.h"
#include "vp8dsp.h"
/**
 * ARM entry point: query the CPU once and layer in the optimised VP8 DSP
 * implementations it supports.  The ARMv6 pointers are installed first so
 * that NEON, when present, overrides them.
 */
av_cold void ff_vp8dsp_init_arm(VP8DSPContext *dsp)
{
    int flags = av_get_cpu_flags();

    if (have_armv6(flags))
        ff_vp8dsp_init_armv6(dsp);
    if (have_neon(flags))
        ff_vp8dsp_init_neon(dsp);
}

View File

@@ -0,0 +1,120 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavcodec/vp8dsp.h"
#include "vp8dsp.h"
/* Transform implementations provided in assembly. */
void ff_vp8_luma_dc_wht_armv6(int16_t block[4][4][16], int16_t dc[16]);
void ff_vp8_luma_dc_wht_dc_armv6(int16_t block[4][4][16], int16_t dc[16]);
void ff_vp8_idct_add_armv6(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
void ff_vp8_idct_dc_add_armv6(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
void ff_vp8_idct_dc_add4y_armv6(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
void ff_vp8_idct_dc_add4uv_armv6(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
/* Loop-filter and MC prototypes expanded from the vp8dsp.h macros. */
VP8_LF(armv6);
VP8_EPEL(16, armv6);
VP8_EPEL(8, armv6);
VP8_EPEL(4, armv6);
VP8_BILIN(16, armv6);
VP8_BILIN(8, armv6);
VP8_BILIN(4, armv6);
/**
 * Install the ARMv6-optimised VP8 DSP routines into *dsp.
 * Only the pointers assigned below are replaced; every other entry keeps
 * the value it had on entry.
 */
av_cold void ff_vp8dsp_init_armv6(VP8DSPContext *dsp)
{
/* inverse transforms */
dsp->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_armv6;
dsp->vp8_luma_dc_wht_dc = ff_vp8_luma_dc_wht_dc_armv6;
dsp->vp8_idct_add = ff_vp8_idct_add_armv6;
dsp->vp8_idct_dc_add = ff_vp8_idct_dc_add_armv6;
dsp->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_armv6;
dsp->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_armv6;
/* loop filters: normal, inner and simple variants */
dsp->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_armv6;
dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_armv6;
dsp->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_armv6;
dsp->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_armv6;
dsp->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_armv6;
dsp->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_armv6;
dsp->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_armv6;
dsp->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_armv6;
dsp->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter16_simple_armv6;
dsp->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter16_simple_armv6;
/* subpel MC table for 16/8/4 wide blocks */
dsp->put_vp8_epel_pixels_tab[0][0][0] = ff_put_vp8_pixels16_armv6;
dsp->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_armv6;
dsp->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_armv6;
dsp->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_armv6;
dsp->put_vp8_epel_pixels_tab[1][0][0] = ff_put_vp8_pixels8_armv6;
dsp->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_armv6;
dsp->put_vp8_epel_pixels_tab[1][0][2] = ff_put_vp8_epel8_h6_armv6;
dsp->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_armv6;
dsp->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_armv6;
dsp->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_armv6;
dsp->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_armv6;
dsp->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_armv6;
dsp->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_armv6;
dsp->put_vp8_epel_pixels_tab[2][0][0] = ff_put_vp8_pixels4_armv6;
dsp->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_armv6;
dsp->put_vp8_epel_pixels_tab[2][0][2] = ff_put_vp8_epel4_h6_armv6;
dsp->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_armv6;
dsp->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_armv6;
dsp->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_armv6;
dsp->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_armv6;
dsp->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_armv6;
dsp->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_armv6;
/* bilinear MC table: one filter covers both the 4- and 6-tap subpel
 * positions, hence the duplicated entries */
dsp->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_armv6;
dsp->put_vp8_bilinear_pixels_tab[0][0][1] = ff_put_vp8_bilin16_h_armv6;
dsp->put_vp8_bilinear_pixels_tab[0][0][2] = ff_put_vp8_bilin16_h_armv6;
dsp->put_vp8_bilinear_pixels_tab[0][1][0] = ff_put_vp8_bilin16_v_armv6;
dsp->put_vp8_bilinear_pixels_tab[0][1][1] = ff_put_vp8_bilin16_hv_armv6;
dsp->put_vp8_bilinear_pixels_tab[0][1][2] = ff_put_vp8_bilin16_hv_armv6;
dsp->put_vp8_bilinear_pixels_tab[0][2][0] = ff_put_vp8_bilin16_v_armv6;
dsp->put_vp8_bilinear_pixels_tab[0][2][1] = ff_put_vp8_bilin16_hv_armv6;
dsp->put_vp8_bilinear_pixels_tab[0][2][2] = ff_put_vp8_bilin16_hv_armv6;
dsp->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_armv6;
dsp->put_vp8_bilinear_pixels_tab[1][0][1] = ff_put_vp8_bilin8_h_armv6;
dsp->put_vp8_bilinear_pixels_tab[1][0][2] = ff_put_vp8_bilin8_h_armv6;
dsp->put_vp8_bilinear_pixels_tab[1][1][0] = ff_put_vp8_bilin8_v_armv6;
dsp->put_vp8_bilinear_pixels_tab[1][1][1] = ff_put_vp8_bilin8_hv_armv6;
dsp->put_vp8_bilinear_pixels_tab[1][1][2] = ff_put_vp8_bilin8_hv_armv6;
dsp->put_vp8_bilinear_pixels_tab[1][2][0] = ff_put_vp8_bilin8_v_armv6;
dsp->put_vp8_bilinear_pixels_tab[1][2][1] = ff_put_vp8_bilin8_hv_armv6;
dsp->put_vp8_bilinear_pixels_tab[1][2][2] = ff_put_vp8_bilin8_hv_armv6;
dsp->put_vp8_bilinear_pixels_tab[2][0][0] = ff_put_vp8_pixels4_armv6;
dsp->put_vp8_bilinear_pixels_tab[2][0][1] = ff_put_vp8_bilin4_h_armv6;
dsp->put_vp8_bilinear_pixels_tab[2][0][2] = ff_put_vp8_bilin4_h_armv6;
dsp->put_vp8_bilinear_pixels_tab[2][1][0] = ff_put_vp8_bilin4_v_armv6;
dsp->put_vp8_bilinear_pixels_tab[2][1][1] = ff_put_vp8_bilin4_hv_armv6;
dsp->put_vp8_bilinear_pixels_tab[2][1][2] = ff_put_vp8_bilin4_hv_armv6;
dsp->put_vp8_bilinear_pixels_tab[2][2][0] = ff_put_vp8_bilin4_v_armv6;
dsp->put_vp8_bilinear_pixels_tab[2][2][1] = ff_put_vp8_bilin4_hv_armv6;
dsp->put_vp8_bilinear_pixels_tab[2][2][2] = ff_put_vp8_bilin4_hv_armv6;
}

View File

@@ -0,0 +1,116 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavcodec/vp8dsp.h"
#include "vp8dsp.h"
/* Transform implementations provided in assembly.  Note: there is no
 * NEON luma_dc_wht_dc variant, so that pointer is not touched below. */
void ff_vp8_luma_dc_wht_neon(int16_t block[4][4][16], int16_t dc[16]);
void ff_vp8_idct_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
void ff_vp8_idct_dc_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
void ff_vp8_idct_dc_add4y_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
void ff_vp8_idct_dc_add4uv_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
/* Loop-filter and MC prototypes expanded from the vp8dsp.h macros. */
VP8_LF(neon);
VP8_EPEL(16, neon);
VP8_EPEL(8, neon);
VP8_EPEL(4, neon);
VP8_BILIN(16, neon);
VP8_BILIN(8, neon);
VP8_BILIN(4, neon);
/**
 * Install the NEON-optimised VP8 DSP routines into *dsp.
 * Only the pointers assigned below are replaced; slots not set here (e.g.
 * [2][0][0], the 4-pixel copy) keep whatever an earlier init installed.
 */
av_cold void ff_vp8dsp_init_neon(VP8DSPContext *dsp)
{
/* inverse transforms */
dsp->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_neon;
dsp->vp8_idct_add = ff_vp8_idct_add_neon;
dsp->vp8_idct_dc_add = ff_vp8_idct_dc_add_neon;
dsp->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_neon;
dsp->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_neon;
/* loop filters: normal, inner and simple variants */
dsp->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_neon;
dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_neon;
dsp->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_neon;
dsp->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_neon;
dsp->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_neon;
dsp->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_neon;
dsp->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_neon;
dsp->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_neon;
dsp->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter16_simple_neon;
dsp->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter16_simple_neon;
/* subpel MC table for 16/8/4 wide blocks */
dsp->put_vp8_epel_pixels_tab[0][0][0] = ff_put_vp8_pixels16_neon;
dsp->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_neon;
dsp->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_neon;
dsp->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_neon;
dsp->put_vp8_epel_pixels_tab[1][0][0] = ff_put_vp8_pixels8_neon;
dsp->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_neon;
dsp->put_vp8_epel_pixels_tab[1][0][2] = ff_put_vp8_epel8_h6_neon;
dsp->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_neon;
dsp->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_neon;
dsp->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_neon;
dsp->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_neon;
dsp->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_neon;
dsp->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_neon;
dsp->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_neon;
dsp->put_vp8_epel_pixels_tab[2][0][2] = ff_put_vp8_epel4_h6_neon;
dsp->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_neon;
dsp->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_neon;
dsp->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_neon;
dsp->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_neon;
dsp->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_neon;
dsp->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_neon;
/* bilinear MC table: one filter covers both the 4- and 6-tap subpel
 * positions, hence the duplicated entries */
dsp->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_neon;
dsp->put_vp8_bilinear_pixels_tab[0][0][1] = ff_put_vp8_bilin16_h_neon;
dsp->put_vp8_bilinear_pixels_tab[0][0][2] = ff_put_vp8_bilin16_h_neon;
dsp->put_vp8_bilinear_pixels_tab[0][1][0] = ff_put_vp8_bilin16_v_neon;
dsp->put_vp8_bilinear_pixels_tab[0][1][1] = ff_put_vp8_bilin16_hv_neon;
dsp->put_vp8_bilinear_pixels_tab[0][1][2] = ff_put_vp8_bilin16_hv_neon;
dsp->put_vp8_bilinear_pixels_tab[0][2][0] = ff_put_vp8_bilin16_v_neon;
dsp->put_vp8_bilinear_pixels_tab[0][2][1] = ff_put_vp8_bilin16_hv_neon;
dsp->put_vp8_bilinear_pixels_tab[0][2][2] = ff_put_vp8_bilin16_hv_neon;
dsp->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_neon;
dsp->put_vp8_bilinear_pixels_tab[1][0][1] = ff_put_vp8_bilin8_h_neon;
dsp->put_vp8_bilinear_pixels_tab[1][0][2] = ff_put_vp8_bilin8_h_neon;
dsp->put_vp8_bilinear_pixels_tab[1][1][0] = ff_put_vp8_bilin8_v_neon;
dsp->put_vp8_bilinear_pixels_tab[1][1][1] = ff_put_vp8_bilin8_hv_neon;
dsp->put_vp8_bilinear_pixels_tab[1][1][2] = ff_put_vp8_bilin8_hv_neon;
dsp->put_vp8_bilinear_pixels_tab[1][2][0] = ff_put_vp8_bilin8_v_neon;
dsp->put_vp8_bilinear_pixels_tab[1][2][1] = ff_put_vp8_bilin8_hv_neon;
dsp->put_vp8_bilinear_pixels_tab[1][2][2] = ff_put_vp8_bilin8_hv_neon;
dsp->put_vp8_bilinear_pixels_tab[2][0][1] = ff_put_vp8_bilin4_h_neon;
dsp->put_vp8_bilinear_pixels_tab[2][0][2] = ff_put_vp8_bilin4_h_neon;
dsp->put_vp8_bilinear_pixels_tab[2][1][0] = ff_put_vp8_bilin4_v_neon;
dsp->put_vp8_bilinear_pixels_tab[2][1][1] = ff_put_vp8_bilin4_hv_neon;
dsp->put_vp8_bilinear_pixels_tab[2][1][2] = ff_put_vp8_bilin4_hv_neon;
dsp->put_vp8_bilinear_pixels_tab[2][2][0] = ff_put_vp8_bilin4_v_neon;
dsp->put_vp8_bilinear_pixels_tab[2][2][1] = ff_put_vp8_bilin4_hv_neon;
dsp->put_vp8_bilinear_pixels_tab[2][2][2] = ff_put_vp8_bilin4_hv_neon;
}

File diff suppressed because it is too large Load Diff