ffmpeg-2.8.5

git-svn-id: svn://kolibrios.org@6147 a494cfbc-eb01-0410-851d-a64ba20cac60
Sergey Semyonov (Serge)
2016-02-05 22:08:02 +00:00
parent a08f61ddb9
commit a4b787f4b8
5429 changed files with 1356786 additions and 0 deletions


@@ -0,0 +1,142 @@
ARCH_HEADERS = mathops.h
# subsystems
OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_init_arm.o \
arm/ac3dsp_arm.o
OBJS-$(CONFIG_AUDIODSP) += arm/audiodsp_init_arm.o
OBJS-$(CONFIG_BLOCKDSP) += arm/blockdsp_init_arm.o
OBJS-$(CONFIG_FFT) += arm/fft_init_arm.o \
arm/fft_fixed_init_arm.o
OBJS-$(CONFIG_FMTCONVERT) += arm/fmtconvert_init_arm.o
OBJS-$(CONFIG_H264CHROMA) += arm/h264chroma_init_arm.o
OBJS-$(CONFIG_H264DSP) += arm/h264dsp_init_arm.o
OBJS-$(CONFIG_H264PRED) += arm/h264pred_init_arm.o
OBJS-$(CONFIG_H264QPEL) += arm/h264qpel_init_arm.o
OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_arm.o \
arm/hpeldsp_arm.o
OBJS-$(CONFIG_IDCTDSP) += arm/idctdsp_init_arm.o \
arm/idctdsp_arm.o \
arm/jrevdct_arm.o \
arm/simple_idct_arm.o
OBJS-$(CONFIG_FLACDSP) += arm/flacdsp_init_arm.o \
arm/flacdsp_arm.o
OBJS-$(CONFIG_G722DSP) += arm/g722dsp_init_arm.o
OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_init_arm.o
OBJS-$(CONFIG_ME_CMP) += arm/me_cmp_init_arm.o
OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_init_arm.o
OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_arm.o
OBJS-$(CONFIG_MPEGVIDEOENC) += arm/mpegvideoencdsp_init_arm.o
OBJS-$(CONFIG_NEON_CLOBBER_TEST) += arm/neontest.o
OBJS-$(CONFIG_PIXBLOCKDSP) += arm/pixblockdsp_init_arm.o
OBJS-$(CONFIG_RV34DSP) += arm/rv34dsp_init_arm.o
OBJS-$(CONFIG_VIDEODSP) += arm/videodsp_init_arm.o
OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_init_arm.o
OBJS-$(CONFIG_VP8DSP) += arm/vp8dsp_init_arm.o
# decoders/encoders
OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \
arm/sbrdsp_init_arm.o
OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_init_arm.o
OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_arm.o
OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o
OBJS-$(CONFIG_RV40_DECODER) += arm/rv40dsp_init_arm.o
OBJS-$(CONFIG_VC1_DECODER) += arm/vc1dsp_init_arm.o
OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_init_arm.o
OBJS-$(CONFIG_VP6_DECODER) += arm/vp6dsp_init_arm.o
# ARMv5 optimizations
# subsystems
ARMV5TE-OBJS-$(CONFIG_IDCTDSP) += arm/idctdsp_init_armv5te.o \
arm/simple_idct_armv5te.o
ARMV5TE-OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_armv5te.o \
arm/mpegvideo_armv5te_s.o
ARMV5TE-OBJS-$(CONFIG_VIDEODSP) += arm/videodsp_init_armv5te.o \
arm/videodsp_armv5te.o
# decoders/encoders
ARMV5TE-OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_armv5te.o
# ARMv6 optimizations
# subsystems
ARMV6-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_armv6.o
ARMV6-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_armv6.o \
arm/hpeldsp_armv6.o
ARMV6-OBJS-$(CONFIG_IDCTDSP) += arm/idctdsp_init_armv6.o \
arm/idctdsp_armv6.o \
arm/simple_idct_armv6.o
ARMV6-OBJS-$(CONFIG_ME_CMP) += arm/me_cmp_armv6.o
ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_fixed_armv6.o
ARMV6-OBJS-$(CONFIG_MPEGVIDEOENC) += arm/mpegvideoencdsp_armv6.o
ARMV6-OBJS-$(CONFIG_PIXBLOCKDSP) += arm/pixblockdsp_armv6.o
ARMV6-OBJS-$(CONFIG_VP8DSP) += arm/vp8_armv6.o \
arm/vp8dsp_init_armv6.o \
arm/vp8dsp_armv6.o
# decoders/encoders
ARMV6-OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_armv6.o
ARMV6-OBJS-$(CONFIG_STARTCODE) += arm/startcode_armv6.o
# VFP optimizations
# subsystems
VFP-OBJS-$(CONFIG_FFT) += arm/fft_vfp.o
VFP-OBJS-$(CONFIG_FMTCONVERT) += arm/fmtconvert_vfp.o
VFP-OBJS-$(CONFIG_MDCT) += arm/mdct_vfp.o
# decoders/encoders
VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_vfp.o \
arm/synth_filter_vfp.o
# NEON optimizations
# subsystems
NEON-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_neon.o
NEON-OBJS-$(CONFIG_AUDIODSP) += arm/audiodsp_init_neon.o \
arm/audiodsp_neon.o \
arm/int_neon.o
NEON-OBJS-$(CONFIG_BLOCKDSP) += arm/blockdsp_init_neon.o \
arm/blockdsp_neon.o
NEON-OBJS-$(CONFIG_FFT) += arm/fft_neon.o \
arm/fft_fixed_neon.o
NEON-OBJS-$(CONFIG_FMTCONVERT) += arm/fmtconvert_neon.o
NEON-OBJS-$(CONFIG_G722DSP) += arm/g722dsp_neon.o
NEON-OBJS-$(CONFIG_H264CHROMA) += arm/h264cmc_neon.o
NEON-OBJS-$(CONFIG_H264DSP) += arm/h264dsp_neon.o \
arm/h264idct_neon.o
NEON-OBJS-$(CONFIG_H264PRED) += arm/h264pred_neon.o
NEON-OBJS-$(CONFIG_H264QPEL) += arm/h264qpel_neon.o \
arm/hpeldsp_neon.o
NEON-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_neon.o \
arm/hpeldsp_neon.o
NEON-OBJS-$(CONFIG_IDCTDSP) += arm/idctdsp_init_neon.o \
arm/idctdsp_neon.o \
arm/simple_idct_neon.o
NEON-OBJS-$(CONFIG_MDCT) += arm/mdct_neon.o \
arm/mdct_fixed_neon.o
NEON-OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_neon.o
NEON-OBJS-$(CONFIG_RDFT) += arm/rdft_neon.o
NEON-OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_neon.o
NEON-OBJS-$(CONFIG_VP8DSP) += arm/vp8dsp_init_neon.o \
arm/vp8dsp_neon.o
# decoders/encoders
NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \
arm/sbrdsp_neon.o
NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o
NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_neon.o \
arm/synth_filter_neon.o
NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \
arm/hevcdsp_deblock_neon.o \
arm/hevcdsp_idct_neon.o \
arm/hevcdsp_qpel_neon.o
NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o
NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \
arm/rv40dsp_neon.o
NEON-OBJS-$(CONFIG_VC1_DECODER) += arm/vc1dsp_init_neon.o \
arm/vc1dsp_neon.o
NEON-OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_neon.o
NEON-OBJS-$(CONFIG_VP6_DECODER) += arm/vp6dsp_neon.o


@@ -0,0 +1,143 @@
/*
* Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_ARM_AAC_H
#define AVCODEC_ARM_AAC_H
#include "config.h"
#if HAVE_NEON_INLINE
#define VMUL2 VMUL2
static inline float *VMUL2(float *dst, const float *v, unsigned idx,
const float *scale)
{
unsigned v0, v1;
__asm__ ("ubfx %0, %6, #0, #4 \n\t"
"ubfx %1, %6, #4, #4 \n\t"
"ldr %0, [%5, %0, lsl #2] \n\t"
"ldr %1, [%5, %1, lsl #2] \n\t"
"vld1.32 {d1[]}, [%7,:32] \n\t"
"vmov d0, %0, %1 \n\t"
"vmul.f32 d0, d0, d1 \n\t"
"vst1.32 {d0}, [%2,:64]! \n\t"
: "=&r"(v0), "=&r"(v1), "+r"(dst), "=m"(dst[0]), "=m"(dst[1])
: "r"(v), "r"(idx), "r"(scale)
: "d0", "d1");
return dst;
}
#define VMUL4 VMUL4
static inline float *VMUL4(float *dst, const float *v, unsigned idx,
const float *scale)
{
unsigned v0, v1, v2, v3;
__asm__ ("ubfx %0, %10, #0, #2 \n\t"
"ubfx %1, %10, #2, #2 \n\t"
"ldr %0, [%9, %0, lsl #2] \n\t"
"ubfx %2, %10, #4, #2 \n\t"
"ldr %1, [%9, %1, lsl #2] \n\t"
"ubfx %3, %10, #6, #2 \n\t"
"ldr %2, [%9, %2, lsl #2] \n\t"
"vmov d0, %0, %1 \n\t"
"ldr %3, [%9, %3, lsl #2] \n\t"
"vld1.32 {d2[],d3[]},[%11,:32] \n\t"
"vmov d1, %2, %3 \n\t"
"vmul.f32 q0, q0, q1 \n\t"
"vst1.32 {q0}, [%4,:128]! \n\t"
: "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "+r"(dst),
"=m"(dst[0]), "=m"(dst[1]), "=m"(dst[2]), "=m"(dst[3])
: "r"(v), "r"(idx), "r"(scale)
: "d0", "d1", "d2", "d3");
return dst;
}
#define VMUL2S VMUL2S
static inline float *VMUL2S(float *dst, const float *v, unsigned idx,
unsigned sign, const float *scale)
{
unsigned v0, v1, v2, v3;
__asm__ ("ubfx %0, %8, #0, #4 \n\t"
"ubfx %1, %8, #4, #4 \n\t"
"ldr %0, [%7, %0, lsl #2] \n\t"
"lsl %2, %10, #30 \n\t"
"ldr %1, [%7, %1, lsl #2] \n\t"
"lsl %3, %10, #31 \n\t"
"vmov d0, %0, %1 \n\t"
"bic %2, %2, #1<<30 \n\t"
"vld1.32 {d1[]}, [%9,:32] \n\t"
"vmov d2, %2, %3 \n\t"
"veor d0, d0, d2 \n\t"
"vmul.f32 d0, d0, d1 \n\t"
"vst1.32 {d0}, [%4,:64]! \n\t"
: "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "+r"(dst),
"=m"(dst[0]), "=m"(dst[1])
: "r"(v), "r"(idx), "r"(scale), "r"(sign)
: "d0", "d1", "d2");
return dst;
}
#define VMUL4S VMUL4S
static inline float *VMUL4S(float *dst, const float *v, unsigned idx,
unsigned sign, const float *scale)
{
unsigned v0, v1, v2, v3, nz;
__asm__ ("vld1.32 {d2[],d3[]},[%13,:32] \n\t"
"ubfx %0, %12, #0, #2 \n\t"
"ubfx %1, %12, #2, #2 \n\t"
"ldr %0, [%11,%0, lsl #2] \n\t"
"ubfx %2, %12, #4, #2 \n\t"
"ldr %1, [%11,%1, lsl #2] \n\t"
"ubfx %3, %12, #6, #2 \n\t"
"ldr %2, [%11,%2, lsl #2] \n\t"
"vmov d0, %0, %1 \n\t"
"ldr %3, [%11,%3, lsl #2] \n\t"
"lsr %6, %12, #12 \n\t"
"rbit %6, %6 \n\t"
"vmov d1, %2, %3 \n\t"
"lsls %6, %6, #1 \n\t"
"and %0, %5, #1<<31 \n\t"
"it cs \n\t"
"lslcs %5, %5, #1 \n\t"
"lsls %6, %6, #1 \n\t"
"and %1, %5, #1<<31 \n\t"
"it cs \n\t"
"lslcs %5, %5, #1 \n\t"
"lsls %6, %6, #1 \n\t"
"and %2, %5, #1<<31 \n\t"
"it cs \n\t"
"lslcs %5, %5, #1 \n\t"
"vmov d4, %0, %1 \n\t"
"and %3, %5, #1<<31 \n\t"
"vmov d5, %2, %3 \n\t"
"veor q0, q0, q2 \n\t"
"vmul.f32 q0, q0, q1 \n\t"
"vst1.32 {q0}, [%4,:128]! \n\t"
: "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "+r"(dst),
"+r"(sign), "=r"(nz),
"=m"(dst[0]), "=m"(dst[1]), "=m"(dst[2]), "=m"(dst[3])
: "r"(v), "r"(idx), "r"(scale)
: "cc", "d0", "d1", "d2", "d3", "d4", "d5");
return dst;
}
#endif /* HAVE_NEON_INLINE */
#endif /* AVCODEC_ARM_AAC_H */
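
For reference, these NEON macros override scalar helpers in the AAC decoder that dequantise codebook entries. A minimal C sketch of what VMUL2 computes (mirroring the generic fallback; VMUL4 does the same with four 2-bit indices, and the *S variants additionally XOR in sign bits taken from the top of `sign`):

/* Scalar sketch of VMUL2: pick two codebook entries selected by the low
 * nibbles of idx and scale them by *scale. */
static inline float *vmul2_ref(float *dst, const float *v, unsigned idx,
                               const float *scale)
{
    float s = *scale;

    *dst++ = v[idx        & 15] * s;
    *dst++ = v[(idx >> 4) & 15] * s;
    return dst;
}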


@@ -0,0 +1,57 @@
/*
* Copyright (c) 2012 Mans Rullgard
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/arm/cpu.h"
#include "libavutil/attributes.h"
#include "libavcodec/aacpsdsp.h"
void ff_ps_add_squares_neon(float *dst, const float (*src)[2], int n);
void ff_ps_mul_pair_single_neon(float (*dst)[2], float (*src0)[2],
float *src1, int n);
void ff_ps_hybrid_analysis_neon(float (*out)[2], float (*in)[2],
const float (*filter)[8][2],
int stride, int n);
void ff_ps_hybrid_analysis_ileave_neon(float (*out)[32][2], float L[2][38][64],
int i, int len);
void ff_ps_hybrid_synthesis_deint_neon(float out[2][38][64], float (*in)[32][2],
int i, int len);
void ff_ps_decorrelate_neon(float (*out)[2], float (*delay)[2],
float (*ap_delay)[PS_QMF_TIME_SLOTS+PS_MAX_AP_DELAY][2],
const float phi_fract[2], float (*Q_fract)[2],
const float *transient_gain, float g_decay_slope,
int len);
void ff_ps_stereo_interpolate_neon(float (*l)[2], float (*r)[2],
float h[2][4], float h_step[2][4],
int len);
av_cold void ff_psdsp_init_arm(PSDSPContext *s)
{
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags)) {
s->add_squares = ff_ps_add_squares_neon;
s->mul_pair_single = ff_ps_mul_pair_single_neon;
s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_neon;
s->hybrid_analysis = ff_ps_hybrid_analysis_neon;
s->stereo_interpolate[0] = ff_ps_stereo_interpolate_neon;
}
}
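
The NEON routines registered above implement the parametric-stereo DSP hooks declared earlier in this file. A scalar sketch of the two simplest ones, matching those prototypes:

/* dst[i] accumulates the squared magnitude of each complex QMF sample. */
static void ps_add_squares_ref(float *dst, const float (*src)[2], int n)
{
    for (int i = 0; i < n; i++)
        dst[i] += src[i][0] * src[i][0] + src[i][1] * src[i][1];
}

/* Each complex sample in src0 is scaled by the matching real gain in src1. */
static void ps_mul_pair_single_ref(float (*dst)[2], float (*src0)[2],
                                   float *src1, int n)
{
    for (int i = 0; i < n; i++) {
        dst[i][0] = src0[i][0] * src1[i];
        dst[i][1] = src0[i][1] * src1[i];
    }
}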


@@ -0,0 +1,272 @@
/*
* Copyright (c) 2012 Mans Rullgard
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
function ff_ps_add_squares_neon, export=1
mov r3, r0
sub r2, r2, #4
vld1.32 {q0}, [r1,:128]!
vmul.f32 q0, q0, q0
vld1.32 {q2}, [r1,:128]!
vmul.f32 q2, q2, q2
vld1.32 {q1}, [r0,:128]!
1:
vpadd.f32 d6, d0, d1
vld1.32 {q0}, [r1,:128]!
vpadd.f32 d7, d4, d5
vmul.f32 q0, q0, q0
vld1.32 {q2}, [r1,:128]!
vadd.f32 q3, q1, q3
vld1.32 {q1}, [r0,:128]!
vmul.f32 q2, q2, q2
vst1.32 {q3}, [r3,:128]!
subs r2, r2, #4
bgt 1b
vpadd.f32 d6, d0, d1
vpadd.f32 d7, d4, d5
vadd.f32 q1, q1, q3
vst1.32 {q1}, [r3,:128]!
bx lr
endfunc
function ff_ps_mul_pair_single_neon, export=1
sub r3, r3, #4
tst r1, #8
bne 2f
vld1.32 {q0}, [r1,:128]!
1:
vld1.32 {q3}, [r2,:128]!
vmul.f32 d4, d0, d6[0]
vmul.f32 d5, d1, d6[1]
vld1.32 {q1}, [r1,:128]!
vmul.f32 d6, d2, d7[0]
vmul.f32 d7, d3, d7[1]
vld1.32 {q0}, [r1,:128]!
vst1.32 {q2,q3}, [r0,:128]!
subs r3, r3, #4
bgt 1b
vld1.32 {q3}, [r2,:128]!
vmul.f32 d4, d0, d6[0]
vmul.f32 d5, d1, d6[1]
vld1.32 {q1}, [r1,:128]!
vmul.f32 d6, d2, d7[0]
vmul.f32 d7, d3, d7[1]
vst1.32 {q2,q3}, [r0,:128]!
bx lr
2:
vld1.32 {d0}, [r1,:64]!
vld1.32 {d1,d2}, [r1,:128]!
1:
vld1.32 {q3}, [r2,:128]!
vmul.f32 d4, d0, d6[0]
vmul.f32 d5, d1, d6[1]
vld1.32 {d0,d1}, [r1,:128]!
vmul.f32 d6, d2, d7[0]
vmul.f32 d7, d0, d7[1]
vmov d0, d1
vld1.32 {d1,d2}, [r1,:128]!
vst1.32 {q2,q3}, [r0,:128]!
subs r3, r3, #4
bgt 1b
vld1.32 {q3}, [r2,:128]!
vmul.f32 d4, d0, d6[0]
vmul.f32 d5, d1, d6[1]
vld1.32 {d0}, [r1,:64]!
vmul.f32 d6, d2, d7[0]
vmul.f32 d7, d0, d7[1]
vst1.32 {q2,q3}, [r0,:128]!
bx lr
endfunc
function ff_ps_hybrid_synthesis_deint_neon, export=1
push {r4-r8,lr}
add r0, r0, r2, lsl #2
add r1, r1, r2, lsl #5+1+2
rsb r2, r2, #64
mov r5, #64*4
mov lr, r0
add r4, r0, #38*64*4
mov r12, r3
2:
vld1.32 {d0,d1}, [r1,:128]!
vst1.32 {d0[0]}, [lr,:32], r5
vst1.32 {d0[1]}, [r4,:32], r5
vst1.32 {d1[0]}, [lr,:32], r5
vst1.32 {d1[1]}, [r4,:32], r5
subs r12, r12, #2
bgt 2b
add r0, r0, #4
sub r2, r2, #1
tst r2, #2
bne 6f
1:
mov lr, r0
add r4, r0, #38*64*4
add r6, r1, # 32*2*4
add r7, r1, #2*32*2*4
add r8, r1, #3*32*2*4
mov r12, r3
2:
vld1.32 {d0,d1}, [r1,:128]!
vld1.32 {d2,d3}, [r6,:128]!
vld1.32 {d4,d5}, [r7,:128]!
vld1.32 {d6,d7}, [r8,:128]!
vst4.32 {d0[0],d2[0],d4[0],d6[0]}, [lr,:128], r5
vst4.32 {d0[1],d2[1],d4[1],d6[1]}, [r4,:128], r5
vst4.32 {d1[0],d3[0],d5[0],d7[0]}, [lr,:128], r5
vst4.32 {d1[1],d3[1],d5[1],d7[1]}, [r4,:128], r5
subs r12, r12, #2
bgt 2b
add r0, r0, #16
add r1, r1, #3*32*2*4
subs r2, r2, #4
bgt 1b
pop {r4-r8,pc}
6:
mov lr, r0
add r4, r0, #38*64*4
add r6, r1, #32*2*4
mov r12, r3
2:
vld1.32 {d0,d1}, [r1,:128]!
vld1.32 {d2,d3}, [r6,:128]!
vst2.32 {d0[0],d2[0]}, [lr,:64], r5
vst2.32 {d0[1],d2[1]}, [r4,:64], r5
vst2.32 {d1[0],d3[0]}, [lr,:64], r5
vst2.32 {d1[1],d3[1]}, [r4,:64], r5
subs r12, r12, #2
bgt 2b
add r0, r0, #8
add r1, r1, #32*2*4
sub r2, r2, #2
b 1b
endfunc
function ff_ps_hybrid_analysis_neon, export=1
vldm r1, {d19-d31}
ldr r12, [sp]
lsl r3, r3, #3
vadd.f32 d16, d19, d31
vadd.f32 d17, d20, d30
vsub.f32 d18, d19, d31
vsub.f32 d19, d20, d30
vsub.f32 d0, d21, d29
vsub.f32 d1, d22, d28
vadd.f32 d2, d21, d29
vadd.f32 d3, d22, d28
vadd.f32 d20, d23, d27
vadd.f32 d21, d24, d26
vsub.f32 d22, d23, d27
vsub.f32 d23, d24, d26
vmov.i32 d6, #1<<31
vmov.i32 d7, #0
vmov.f32 q14, #0.0
vmov.f32 q15, #0.0
vtrn.32 d6, d7
vrev64.32 q9, q9
vrev64.32 q0, q0
vrev64.32 q11, q11
veor q9, q9, q3
veor q0, q0, q3
veor q11, q11, q3
vld1.32 {q13}, [r2,:128]!
vtrn.32 q8, q9
vtrn.32 q1, q0
vtrn.32 q10, q11
sub r12, r12, #1
vmla.f32 q14, q8, q13
vld1.32 {q2}, [r2,:128]!
vmla.f32 q15, q9, q13
1:
vmla.f32 q14, q1, q2
vld1.32 {q13}, [r2,:128]!
vmla.f32 q15, q0, q2
vmla.f32 q14, q10, q13
vld1.32 {q2}, [r2,:128]!
vmla.f32 q15, q11, q13
vld1.32 {q13}, [r2,:128]!
vadd.f32 d6, d28, d29
vadd.f32 d7, d30, d31
vmov.f32 q14, #0.0
vmov.f32 q15, #0.0
vmla.f32 q14, q8, q13
vpadd.f32 d6, d6, d7
vmla.f32 q15, q9, q13
vmla.f32 d6, d25, d4[0]
vld1.32 {q2}, [r2,:128]!
vst1.32 {d6}, [r0,:64], r3
subs r12, r12, #1
bgt 1b
vmla.f32 q14, q1, q2
vld1.32 {q13}, [r2,:128]!
vmla.f32 q15, q0, q2
vmla.f32 q14, q10, q13
vld1.32 {q2}, [r2,:128]!
vmla.f32 q15, q11, q13
vadd.f32 d6, d28, d29
vadd.f32 d7, d30, d31
vpadd.f32 d6, d6, d7
vmla.f32 d6, d25, d4[0]
vst1.32 {d6}, [r0,:64], r3
bx lr
endfunc
function ff_ps_stereo_interpolate_neon, export=1
vld1.32 {q0}, [r2]
vld1.32 {q14}, [r3]
vadd.f32 q15, q14, q14
mov r2, r0
mov r3, r1
ldr r12, [sp]
vadd.f32 q1, q0, q14
vadd.f32 q0, q0, q15
vld1.32 {q2}, [r0,:64]!
vld1.32 {q3}, [r1,:64]!
subs r12, r12, #1
beq 2f
1:
vmul.f32 d16, d4, d2[0]
vmul.f32 d17, d5, d0[0]
vmul.f32 d18, d4, d2[1]
vmul.f32 d19, d5, d0[1]
vmla.f32 d16, d6, d3[0]
vmla.f32 d17, d7, d1[0]
vmla.f32 d18, d6, d3[1]
vmla.f32 d19, d7, d1[1]
vadd.f32 q1, q1, q15
vadd.f32 q0, q0, q15
vld1.32 {q2}, [r0,:64]!
vld1.32 {q3}, [r1,:64]!
vst1.32 {q8}, [r2,:64]!
vst1.32 {q9}, [r3,:64]!
subs r12, r12, #2
bgt 1b
it lt
bxlt lr
2:
vmul.f32 d16, d4, d2[0]
vmul.f32 d18, d4, d2[1]
vmla.f32 d16, d6, d3[0]
vmla.f32 d18, d6, d3[1]
vst1.32 {d16}, [r2,:64]!
vst1.32 {d18}, [r3,:64]!
bx lr
endfunc
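
ff_ps_stereo_interpolate_neon above applies a per-sample, linearly interpolated 2x2 mixing matrix to the left/right channel pairs. A scalar sketch of the same operation (following the generic C version; only the non-IPD/OPD variant is implemented here):

static void ps_stereo_interpolate_ref(float (*l)[2], float (*r)[2],
                                      float h[2][4], float h_step[2][4],
                                      int len)
{
    float h0 = h[0][0], h1 = h[0][1], h2 = h[0][2], h3 = h[0][3];

    for (int n = 0; n < len; n++) {
        float l_re = l[n][0], l_im = l[n][1];
        float r_re = r[n][0], r_im = r[n][1];

        /* Step the mixing coefficients before use, as the assembly does by
         * pre-adding h_step ahead of the loop. */
        h0 += h_step[0][0];
        h1 += h_step[0][1];
        h2 += h_step[0][2];
        h3 += h_step[0][3];

        l[n][0] = h0 * l_re + h2 * r_re;
        l[n][1] = h0 * l_im + h2 * r_im;
        r[n][0] = h1 * l_re + h3 * r_re;
        r[n][1] = h1 * l_im + h3 * r_im;
    }
}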


@@ -0,0 +1,36 @@
/*
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
function ff_ac3_update_bap_counts_arm, export=1
push {lr}
ldrb lr, [r1], #1
1:
lsl r3, lr, #1
ldrh r12, [r0, r3]
subs r2, r2, #1
it gt
ldrbgt lr, [r1], #1
add r12, r12, #1
strh r12, [r0, r3]
bgt 1b
pop {pc}
endfunc
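
ff_ac3_update_bap_counts_arm above is a small histogram update; the equivalent scalar loop (mirroring the generic C routine) is:

#include <stdint.h>

/* Count how many mantissas of each bit-allocation size occur in bap[0..len). */
static void ac3_update_bap_counts_ref(uint16_t mant_cnt[16], uint8_t *bap, int len)
{
    for (int i = 0; i < len; i++)
        mant_cnt[bap[i]]++;
}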


@@ -0,0 +1,84 @@
/*
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
function ff_ac3_bit_alloc_calc_bap_armv6, export=1
ldr r12, [sp]
cmp r12, #-960
beq 4f
push {r4-r11,lr}
add r5, sp, #40
movrelx r4, X(ff_ac3_bin_to_band_tab), r11
movrelx lr, X(ff_ac3_band_start_tab)
ldm r5, {r5-r7}
ldrb r4, [r4, r2]
add r1, r1, r2, lsl #1 @ psd + start
add r0, r0, r4, lsl #1 @ mask + band
add r4, r4, lr
add r7, r7, r2 @ bap + start
1:
ldrsh r9, [r0], #2 @ mask[band]
mov r8, #0xff0
sub r9, r9, r12 @ - snr_offset
ldrb r10, [r4, #1]! @ band_start_tab[++band]
subs r9, r9, r5 @ - floor
it lt
movlt r9, #0
cmp r10, r3 @ - end
and r9, r9, r8, lsl #1 @ & 0x1fe0
ite gt
subgt r8, r3, r2
suble r8, r10, r2
mov r2, r10
add r9, r9, r5 @ + floor => m
tst r8, #1
add r11, r7, r8
bne 3f
b 5f
2:
ldrsh r8, [r1], #2
ldrsh lr, [r1], #2
sub r8, r8, r9
sub lr, lr, r9
usat r8, #6, r8, asr #5 @ address
usat lr, #6, lr, asr #5
ldrb r8, [r6, r8] @ bap_tab[address]
ldrb lr, [r6, lr]
strb r8, [r7], #1 @ bap[bin]
strb lr, [r7], #1
5: cmp r7, r11
blo 2b
cmp r3, r10
bgt 1b
pop {r4-r11,pc}
3:
ldrsh r8, [r1], #2 @ psd[bin]
sub r8, r8, r9 @ - m
usat r8, #6, r8, asr #5 @ address
ldrb r8, [r6, r8] @ bap_tab[address]
strb r8, [r7], #1 @ bap[bin]
b 5b
4:
ldr r0, [sp, #12]
mov r1, #0
mov r2, #256
b X(memset)
endfunc
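
A plain-C sketch of the bit-allocation loop implemented above, closely paraphrasing the generic routine; the band tables, the 0x1FE0 mask and the "snr_offset == -960" shortcut all correspond to the annotated instructions in the assembly:

#include <string.h>
#include "libavutil/common.h"
#include "libavcodec/ac3tab.h"   /* ff_ac3_bin_to_band_tab, ff_ac3_band_start_tab */

static void ac3_bit_alloc_calc_bap_ref(int16_t *mask, int16_t *psd,
                                       int start, int end,
                                       int snr_offset, int floor,
                                       const uint8_t *bap_tab, uint8_t *bap)
{
    int bin, band, band_end;

    /* Special case: snr_offset == -960 zeroes all 256 bap entries. */
    if (snr_offset == -960) {
        memset(bap, 0, 256);
        return;
    }

    bin  = start;
    band = ff_ac3_bin_to_band_tab[start];
    do {
        int m    = (FFMAX(mask[band] - snr_offset - floor, 0) & 0x1FE0) + floor;
        band_end = FFMIN(ff_ac3_band_start_tab[++band], end);

        for (; bin < band_end; bin++) {
            int address = av_clip_uintp2((psd[bin] - m) >> 5, 6);
            bap[bin]    = bap_tab[address];
        }
    } while (end > band_end);
}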


@@ -0,0 +1,73 @@
/*
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/arm/cpu.h"
#include "libavutil/attributes.h"
#include "libavcodec/ac3dsp.h"
#include "config.h"
void ff_ac3_exponent_min_neon(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
int ff_ac3_max_msb_abs_int16_neon(const int16_t *src, int len);
void ff_ac3_lshift_int16_neon(int16_t *src, unsigned len, unsigned shift);
void ff_ac3_rshift_int32_neon(int32_t *src, unsigned len, unsigned shift);
void ff_float_to_fixed24_neon(int32_t *dst, const float *src, unsigned int len);
void ff_ac3_extract_exponents_neon(uint8_t *exp, int32_t *coef, int nb_coefs);
void ff_apply_window_int16_neon(int16_t *dst, const int16_t *src,
const int16_t *window, unsigned n);
void ff_ac3_sum_square_butterfly_int32_neon(int64_t sum[4],
const int32_t *coef0,
const int32_t *coef1,
int len);
void ff_ac3_sum_square_butterfly_float_neon(float sum[4],
const float *coef0,
const float *coef1,
int len);
void ff_ac3_bit_alloc_calc_bap_armv6(int16_t *mask, int16_t *psd,
int start, int end,
int snr_offset, int floor,
const uint8_t *bap_tab, uint8_t *bap);
void ff_ac3_update_bap_counts_arm(uint16_t mant_cnt[16], uint8_t *bap, int len);
av_cold void ff_ac3dsp_init_arm(AC3DSPContext *c, int bit_exact)
{
int cpu_flags = av_get_cpu_flags();
c->update_bap_counts = ff_ac3_update_bap_counts_arm;
if (have_armv6(cpu_flags)) {
c->bit_alloc_calc_bap = ff_ac3_bit_alloc_calc_bap_armv6;
}
if (have_neon(cpu_flags)) {
c->ac3_exponent_min = ff_ac3_exponent_min_neon;
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_neon;
c->ac3_lshift_int16 = ff_ac3_lshift_int16_neon;
c->ac3_rshift_int32 = ff_ac3_rshift_int32_neon;
c->float_to_fixed24 = ff_float_to_fixed24_neon;
c->extract_exponents = ff_ac3_extract_exponents_neon;
c->apply_window_int16 = ff_apply_window_int16_neon;
c->sum_square_butterfly_int32 = ff_ac3_sum_square_butterfly_int32_neon;
c->sum_square_butterfly_float = ff_ac3_sum_square_butterfly_float_neon;
}
}


@@ -0,0 +1,177 @@
/*
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
function ff_ac3_max_msb_abs_int16_neon, export=1
vmov.i16 q0, #0
vmov.i16 q2, #0
1: vld1.16 {q1}, [r0,:128]!
vabs.s16 q1, q1
vld1.16 {q3}, [r0,:128]!
vabs.s16 q3, q3
vorr q0, q0, q1
vorr q2, q2, q3
subs r1, r1, #16
bgt 1b
vorr q0, q0, q2
vorr d0, d0, d1
vpmax.u16 d0, d0, d0
vpmax.u16 d0, d0, d0
vmov.u16 r0, d0[0]
bx lr
endfunc
function ff_ac3_exponent_min_neon, export=1
cmp r1, #0
it eq
bxeq lr
push {lr}
mov r12, #256
1:
vld1.8 {q0}, [r0,:128]
mov lr, r1
add r3, r0, #256
2: vld1.8 {q1}, [r3,:128], r12
subs lr, lr, #1
vmin.u8 q0, q0, q1
bgt 2b
subs r2, r2, #16
vst1.8 {q0}, [r0,:128]!
bgt 1b
pop {pc}
endfunc
function ff_ac3_lshift_int16_neon, export=1
vdup.16 q0, r2
1: vld1.16 {q1}, [r0,:128]
vshl.s16 q1, q1, q0
vst1.16 {q1}, [r0,:128]!
subs r1, r1, #8
bgt 1b
bx lr
endfunc
function ff_ac3_rshift_int32_neon, export=1
rsb r2, r2, #0
vdup.32 q0, r2
1: vld1.32 {q1}, [r0,:128]
vshl.s32 q1, q1, q0
vst1.32 {q1}, [r0,:128]!
subs r1, r1, #4
bgt 1b
bx lr
endfunc
function ff_float_to_fixed24_neon, export=1
1: vld1.32 {q0-q1}, [r1,:128]!
vcvt.s32.f32 q0, q0, #24
vld1.32 {q2-q3}, [r1,:128]!
vcvt.s32.f32 q1, q1, #24
vcvt.s32.f32 q2, q2, #24
vst1.32 {q0-q1}, [r0,:128]!
vcvt.s32.f32 q3, q3, #24
vst1.32 {q2-q3}, [r0,:128]!
subs r2, r2, #16
bgt 1b
bx lr
endfunc
function ff_ac3_extract_exponents_neon, export=1
vmov.i32 q15, #8
1:
vld1.32 {q0}, [r1,:128]!
vabs.s32 q1, q0
vclz.i32 q3, q1
vsub.i32 q3, q3, q15
vmovn.i32 d6, q3
vmovn.i16 d6, q3
vst1.32 {d6[0]}, [r0,:32]!
subs r2, r2, #4
bgt 1b
bx lr
endfunc
function ff_apply_window_int16_neon, export=1
push {r4,lr}
add r4, r1, r3, lsl #1
add lr, r0, r3, lsl #1
sub r4, r4, #16
sub lr, lr, #16
mov r12, #-16
1:
vld1.16 {q0}, [r1,:128]!
vld1.16 {q2}, [r2,:128]!
vld1.16 {q1}, [r4,:128], r12
vrev64.16 q3, q2
vqrdmulh.s16 q0, q0, q2
vqrdmulh.s16 d2, d2, d7
vqrdmulh.s16 d3, d3, d6
vst1.16 {q0}, [r0,:128]!
vst1.16 {q1}, [lr,:128], r12
subs r3, r3, #16
bgt 1b
pop {r4,pc}
endfunc
function ff_ac3_sum_square_butterfly_int32_neon, export=1
vmov.i64 q0, #0
vmov.i64 q1, #0
vmov.i64 q2, #0
vmov.i64 q3, #0
1:
vld1.32 {d16}, [r1]!
vld1.32 {d17}, [r2]!
vadd.s32 d18, d16, d17
vsub.s32 d19, d16, d17
vmlal.s32 q0, d16, d16
vmlal.s32 q1, d17, d17
vmlal.s32 q2, d18, d18
vmlal.s32 q3, d19, d19
subs r3, r3, #2
bgt 1b
vadd.s64 d0, d0, d1
vadd.s64 d1, d2, d3
vadd.s64 d2, d4, d5
vadd.s64 d3, d6, d7
vst1.64 {q0-q1}, [r0]
bx lr
endfunc
function ff_ac3_sum_square_butterfly_float_neon, export=1
vmov.f32 q0, #0.0
vmov.f32 q1, #0.0
1:
vld1.32 {d16}, [r1]!
vld1.32 {d17}, [r2]!
vadd.f32 d18, d16, d17
vsub.f32 d19, d16, d17
vmla.f32 d0, d16, d16
vmla.f32 d1, d17, d17
vmla.f32 d2, d18, d18
vmla.f32 d3, d19, d19
subs r3, r3, #2
bgt 1b
vpadd.f32 d0, d0, d1
vpadd.f32 d1, d2, d3
vst1.32 {q0}, [r0]
bx lr
endfunc
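
Two of the NEON routines above have particularly compact scalar equivalents, shown here as a sketch (one caveat: the vcvt fixed-point conversion truncates, so results can differ from lrintf() by one LSB):

#include <math.h>
#include <stdlib.h>
#include "libavutil/common.h"

/* dst[i] is src[i] in signed Q24, matching the "vcvt.s32.f32 ..., #24" above. */
static void float_to_fixed24_ref(int32_t *dst, const float *src, unsigned int len)
{
    for (unsigned int i = 0; i < len; i++)
        dst[i] = lrintf(src[i] * (1 << 24));
}

/* exp[i] = clz(|coef[i]|) - 8, i.e. 24 for zero coefficients, as computed by
 * the vclz/vsub pair in ff_ac3_extract_exponents_neon. */
static void ac3_extract_exponents_ref(uint8_t *exp, int32_t *coef, int nb_coefs)
{
    for (int i = 0; i < nb_coefs; i++) {
        int v  = abs(coef[i]);
        exp[i] = v ? 23 - av_log2(v) : 24;
    }
}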


@@ -0,0 +1,32 @@
/*
* Copyright (c) 2010 Mans Rullgard
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_ARM_ASM_OFFSETS_H
#define AVCODEC_ARM_ASM_OFFSETS_H
/* MpegEncContext */
#define Y_DC_SCALE 0x04
#define C_DC_SCALE 0x08
#define AC_PRED 0x0c
#define BLOCK_LAST_INDEX 0x10
#define H263_AIC 0x40
#define INTER_SCANTAB_RASTER_END 0x88
#endif /* AVCODEC_ARM_ASM_OFFSETS_H */


@@ -0,0 +1,26 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_ARM_AUDIODSP_ARM_H
#define AVCODEC_ARM_AUDIODSP_ARM_H
#include "libavcodec/audiodsp.h"
void ff_audiodsp_init_neon(AudioDSPContext *c);
#endif /* AVCODEC_ARM_AUDIODSP_ARM_H */


@@ -0,0 +1,33 @@
/*
* ARM optimized audio functions
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/audiodsp.h"
#include "audiodsp_arm.h"
av_cold void ff_audiodsp_init_arm(AudioDSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags))
ff_audiodsp_init_neon(c);
}


@@ -0,0 +1,41 @@
/*
* ARM NEON optimised audio functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavcodec/audiodsp.h"
#include "audiodsp_arm.h"
void ff_vector_clipf_neon(float *dst, const float *src, float min, float max,
int len);
void ff_vector_clip_int32_neon(int32_t *dst, const int32_t *src, int32_t min,
int32_t max, unsigned int len);
int32_t ff_scalarproduct_int16_neon(const int16_t *v1, const int16_t *v2, int len);
av_cold void ff_audiodsp_init_neon(AudioDSPContext *c)
{
c->vector_clip_int32 = ff_vector_clip_int32_neon;
c->vector_clipf = ff_vector_clipf_neon;
c->scalarproduct_int16 = ff_scalarproduct_int16_neon;
}


@@ -0,0 +1,64 @@
/*
* ARM NEON optimised audio functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
function ff_vector_clipf_neon, export=1
VFP vdup.32 q1, d0[1]
VFP vdup.32 q0, d0[0]
NOVFP vdup.32 q0, r2
NOVFP vdup.32 q1, r3
NOVFP ldr r2, [sp]
vld1.f32 {q2},[r1,:128]!
vmin.f32 q10, q2, q1
vld1.f32 {q3},[r1,:128]!
vmin.f32 q11, q3, q1
1: vmax.f32 q8, q10, q0
vmax.f32 q9, q11, q0
subs r2, r2, #8
beq 2f
vld1.f32 {q2},[r1,:128]!
vmin.f32 q10, q2, q1
vld1.f32 {q3},[r1,:128]!
vmin.f32 q11, q3, q1
vst1.f32 {q8},[r0,:128]!
vst1.f32 {q9},[r0,:128]!
b 1b
2: vst1.f32 {q8},[r0,:128]!
vst1.f32 {q9},[r0,:128]!
bx lr
endfunc
function ff_vector_clip_int32_neon, export=1
vdup.32 q0, r2
vdup.32 q1, r3
ldr r2, [sp]
1:
vld1.32 {q2-q3}, [r1,:128]!
vmin.s32 q2, q2, q1
vmin.s32 q3, q3, q1
vmax.s32 q2, q2, q0
vmax.s32 q3, q3, q0
vst1.32 {q2-q3}, [r0,:128]!
subs r2, r2, #8
bgt 1b
bx lr
endfunc
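
The clip routines above process eight elements per iteration; their scalar meaning, matching the prototypes in audiodsp_init_neon.c earlier in this commit, is simply:

#include "libavutil/common.h"

static void vector_clipf_ref(float *dst, const float *src,
                             float min, float max, int len)
{
    for (int i = 0; i < len; i++)
        dst[i] = av_clipf(src[i], min, max);
}

static void vector_clip_int32_ref(int32_t *dst, const int32_t *src,
                                  int32_t min, int32_t max, unsigned int len)
{
    for (unsigned int i = 0; i < len; i++)
        dst[i] = av_clip(src[i], min, max);
}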


@@ -0,0 +1,26 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_ARM_BLOCKDSP_ARM_H
#define AVCODEC_ARM_BLOCKDSP_ARM_H
#include "libavcodec/blockdsp.h"
void ff_blockdsp_init_neon(BlockDSPContext *c, unsigned high_bit_depth);
#endif /* AVCODEC_ARM_BLOCKDSP_ARM_H */


@@ -0,0 +1,33 @@
/*
* ARM optimized block operations
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/blockdsp.h"
#include "blockdsp_arm.h"
av_cold void ff_blockdsp_init_arm(BlockDSPContext *c, unsigned high_bit_depth)
{
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags))
ff_blockdsp_init_neon(c, high_bit_depth);
}


@@ -0,0 +1,37 @@
/*
* ARM NEON optimised block operations
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavcodec/blockdsp.h"
#include "blockdsp_arm.h"
void ff_clear_block_neon(int16_t *block);
void ff_clear_blocks_neon(int16_t *blocks);
av_cold void ff_blockdsp_init_neon(BlockDSPContext *c, unsigned high_bit_depth)
{
if (!high_bit_depth) {
c->clear_block = ff_clear_block_neon;
c->clear_blocks = ff_clear_blocks_neon;
}
}


@@ -0,0 +1,38 @@
/*
* ARM NEON optimised block functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
function ff_clear_block_neon, export=1
vmov.i16 q0, #0
.rept 8
vst1.16 {q0}, [r0,:128]!
.endr
bx lr
endfunc
function ff_clear_blocks_neon, export=1
vmov.i16 q0, #0
.rept 8*6
vst1.16 {q0}, [r0,:128]!
.endr
bx lr
endfunc
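
The .rept blocks above emit 8 (or 8*6) zeroing stores of a q register, which is equivalent to clearing one 64-coefficient block (or six consecutive blocks):

#include <string.h>
#include <stdint.h>

static void clear_block_ref(int16_t *block)   { memset(block,  0,     64 * sizeof(int16_t)); }
static void clear_blocks_ref(int16_t *blocks) { memset(blocks, 0, 6 * 64 * sizeof(int16_t)); }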


@@ -0,0 +1,108 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_ARM_CABAC_H
#define AVCODEC_ARM_CABAC_H
#include "config.h"
#if HAVE_ARMV6T2_INLINE
#include "libavutil/attributes.h"
#include "libavutil/internal.h"
#include "libavcodec/cabac.h"
#define get_cabac_inline get_cabac_inline_arm
static av_always_inline int get_cabac_inline_arm(CABACContext *c,
uint8_t *const state)
{
int bit;
void *reg_b, *reg_c, *tmp;
__asm__ volatile(
"ldrb %[bit] , [%[state]] \n\t"
"add %[r_b] , %[tables] , %[lps_off] \n\t"
"mov %[tmp] , %[range] \n\t"
"and %[range] , %[range] , #0xC0 \n\t"
"add %[r_b] , %[r_b] , %[bit] \n\t"
"ldrb %[range] , [%[r_b], %[range], lsl #1] \n\t"
"add %[r_b] , %[tables] , %[norm_off] \n\t"
"sub %[r_c] , %[tmp] , %[range] \n\t"
"lsl %[tmp] , %[r_c] , #17 \n\t"
"cmp %[tmp] , %[low] \n\t"
"it gt \n\t"
"movgt %[range] , %[r_c] \n\t"
"itt cc \n\t"
"mvncc %[bit] , %[bit] \n\t"
"subcc %[low] , %[low] , %[tmp] \n\t"
"add %[r_c] , %[tables] , %[mlps_off] \n\t"
"ldrb %[tmp] , [%[r_b], %[range]] \n\t"
"ldrb %[r_b] , [%[r_c], %[bit]] \n\t"
"lsl %[low] , %[low] , %[tmp] \n\t"
"lsl %[range] , %[range] , %[tmp] \n\t"
"uxth %[r_c] , %[low] \n\t"
"strb %[r_b] , [%[state]] \n\t"
"tst %[r_c] , %[r_c] \n\t"
"bne 2f \n\t"
"ldr %[r_c] , [%[c], %[byte]] \n\t"
#if UNCHECKED_BITSTREAM_READER
"ldrh %[tmp] , [%[r_c]] \n\t"
"add %[r_c] , %[r_c] , #2 \n\t"
"str %[r_c] , [%[c], %[byte]] \n\t"
#else
"ldr %[r_b] , [%[c], %[end]] \n\t"
"ldrh %[tmp] , [%[r_c]] \n\t"
"cmp %[r_c] , %[r_b] \n\t"
"itt lt \n\t"
"addlt %[r_c] , %[r_c] , #2 \n\t"
"strlt %[r_c] , [%[c], %[byte]] \n\t"
#endif
"sub %[r_c] , %[low] , #1 \n\t"
"add %[r_b] , %[tables] , %[norm_off] \n\t"
"eor %[r_c] , %[low] , %[r_c] \n\t"
"rev %[tmp] , %[tmp] \n\t"
"lsr %[r_c] , %[r_c] , #15 \n\t"
"lsr %[tmp] , %[tmp] , #15 \n\t"
"ldrb %[r_c] , [%[r_b], %[r_c]] \n\t"
"movw %[r_b] , #0xFFFF \n\t"
"sub %[tmp] , %[tmp] , %[r_b] \n\t"
"rsb %[r_c] , %[r_c] , #7 \n\t"
"lsl %[tmp] , %[tmp] , %[r_c] \n\t"
"add %[low] , %[low] , %[tmp] \n\t"
"2: \n\t"
: [bit]"=&r"(bit),
[low]"+&r"(c->low),
[range]"+&r"(c->range),
[r_b]"=&r"(reg_b),
[r_c]"=&r"(reg_c),
[tmp]"=&r"(tmp)
: [c]"r"(c),
[state]"r"(state),
[tables]"r"(ff_h264_cabac_tables),
[byte]"M"(offsetof(CABACContext, bytestream)),
[end]"M"(offsetof(CABACContext, bytestream_end)),
[norm_off]"I"(H264_NORM_SHIFT_OFFSET),
[lps_off]"I"(H264_LPS_RANGE_OFFSET),
[mlps_off]"I"(H264_MLPS_STATE_OFFSET + 128)
: "memory", "cc"
);
return bit & 1;
}
#endif /* HAVE_ARMV6T2_INLINE */
#endif /* AVCODEC_ARM_CABAC_H */
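
The `#define get_cabac_inline get_cabac_inline_arm` line is what lets this header transparently replace the generic CABAC bin-decoding primitive. A sketch of the consumer-side selection pattern, with the fallback name assumed rather than taken from this commit:

/* Hypothetical include-side dispatch, in the spirit of cabac_functions.h. */
#include "config.h"

#if ARCH_ARM
#   include "arm/cabac.h"        /* may #define get_cabac_inline */
#endif

#ifndef get_cabac_inline
#   define get_cabac_inline get_cabac_inline_c   /* assumed generic fallback */
#endif

/* Call sites then always use get_cabac_inline(c, state) and pick up whichever
 * implementation was selected at preprocessing time. */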


@@ -0,0 +1,82 @@
/*
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_ARM_DCA_H
#define AVCODEC_ARM_DCA_H
#include <stdint.h>
#include "config.h"
#include "libavcodec/dcadsp.h"
#include "libavcodec/mathops.h"
#if HAVE_ARMV6_INLINE && AV_GCC_VERSION_AT_LEAST(4,4) && !CONFIG_THUMB
#define decode_blockcodes decode_blockcodes
static inline int decode_blockcodes(int code1, int code2, int levels,
int32_t *values)
{
int32_t v0, v1, v2, v3, v4, v5;
__asm__ ("smmul %0, %6, %10 \n"
"smmul %3, %7, %10 \n"
"smlabb %6, %0, %9, %6 \n"
"smlabb %7, %3, %9, %7 \n"
"smmul %1, %0, %10 \n"
"smmul %4, %3, %10 \n"
"sub %6, %6, %8, lsr #1 \n"
"sub %7, %7, %8, lsr #1 \n"
"smlabb %0, %1, %9, %0 \n"
"smlabb %3, %4, %9, %3 \n"
"smmul %2, %1, %10 \n"
"smmul %5, %4, %10 \n"
"str %6, [%11, #0] \n"
"str %7, [%11, #16] \n"
"sub %0, %0, %8, lsr #1 \n"
"sub %3, %3, %8, lsr #1 \n"
"smlabb %1, %2, %9, %1 \n"
"smlabb %4, %5, %9, %4 \n"
"smmul %6, %2, %10 \n"
"smmul %7, %5, %10 \n"
"str %0, [%11, #4] \n"
"str %3, [%11, #20] \n"
"sub %1, %1, %8, lsr #1 \n"
"sub %4, %4, %8, lsr #1 \n"
"smlabb %2, %6, %9, %2 \n"
"smlabb %5, %7, %9, %5 \n"
"str %1, [%11, #8] \n"
"str %4, [%11, #24] \n"
"sub %2, %2, %8, lsr #1 \n"
"sub %5, %5, %8, lsr #1 \n"
"str %2, [%11, #12] \n"
"str %5, [%11, #28] \n"
: "=&r"(v0), "=&r"(v1), "=&r"(v2),
"=&r"(v3), "=&r"(v4), "=&r"(v5),
"+&r"(code1), "+&r"(code2)
: "r"(levels - 1), "r"(-levels),
"r"(ff_inverse[levels]), "r"(values)
: "memory");
return code1 | code2;
}
#endif
#endif /* AVCODEC_ARM_DCA_H */
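
The inline assembly above unpacks two packed DCA block codes into eight centred values, replacing divisions with multiplications by ff_inverse[levels]. A plain-C sketch of the same arithmetic using ordinary division:

#include <stdint.h>

static inline int decode_blockcodes_ref(int code1, int code2, int levels,
                                        int32_t *values)
{
    int offset = (levels - 1) >> 1;

    for (int i = 0; i < 4; i++) {
        values[i]     = code1 % levels - offset;   /* stored at [%11, #0..12]  */
        code1        /= levels;
        values[i + 4] = code2 % levels - offset;   /* stored at [%11, #16..28] */
        code2        /= levels;
    }
    /* A non-zero remainder of either code marks it as invalid. */
    return code1 | code2;
}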


@@ -0,0 +1,82 @@
/*
* Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/arm/cpu.h"
#include "libavutil/attributes.h"
#include "libavcodec/dcadsp.h"
void ff_dca_lfe_fir0_neon(float *out, const float *in, const float *coefs);
void ff_dca_lfe_fir1_neon(float *out, const float *in, const float *coefs);
void ff_dca_lfe_fir32_vfp(float *out, const float *in, const float *coefs);
void ff_dca_lfe_fir64_vfp(float *out, const float *in, const float *coefs);
void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
SynthFilterContext *synth, FFTContext *imdct,
float synth_buf_ptr[512],
int *synth_buf_offset, float synth_buf2[32],
const float window[512], float *samples_out,
float raXin[32], float scale);
void ff_synth_filter_float_vfp(FFTContext *imdct,
float *synth_buf_ptr, int *synth_buf_offset,
float synth_buf2[32], const float window[512],
float out[32], const float in[32],
float scale);
void ff_synth_filter_float_neon(FFTContext *imdct,
float *synth_buf_ptr, int *synth_buf_offset,
float synth_buf2[32], const float window[512],
float out[32], const float in[32],
float scale);
void ff_decode_hf_neon(float dst[DCA_SUBBANDS][8],
const int32_t vq_num[DCA_SUBBANDS],
const int8_t hf_vq[1024][32], intptr_t vq_offset,
int32_t scale[DCA_SUBBANDS][2],
intptr_t start, intptr_t end);
av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
{
int cpu_flags = av_get_cpu_flags();
if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags)) {
s->lfe_fir[0] = ff_dca_lfe_fir32_vfp;
s->lfe_fir[1] = ff_dca_lfe_fir64_vfp;
s->qmf_32_subbands = ff_dca_qmf_32_subbands_vfp;
}
if (have_neon(cpu_flags)) {
s->lfe_fir[0] = ff_dca_lfe_fir0_neon;
s->lfe_fir[1] = ff_dca_lfe_fir1_neon;
s->decode_hf = ff_decode_hf_neon;
}
}
av_cold void ff_synth_filter_init_arm(SynthFilterContext *s)
{
int cpu_flags = av_get_cpu_flags();
if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags))
s->synth_filter_float = ff_synth_filter_float_vfp;
if (have_neon(cpu_flags))
s->synth_filter_float = ff_synth_filter_float_neon;
}


@@ -0,0 +1,93 @@
/*
* Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
function ff_decode_hf_neon, export=1
push {r4-r5,lr}
add r2, r2, r3
ldr r3, [sp, #12]
ldrd r4, r5, [sp, #16]
add r3, r3, r4, lsl #3
add r1, r1, r4, lsl #2
add r0, r0, r4, lsl #5
1: ldr_post lr, r1, #4
add r4, r4, #1
add lr, r2, lr, lsl #5
cmp r4, r5
vld1.32 {d7}, [r3]!
vld1.8 {d0}, [lr,:64]
vcvt.f32.s32 d7, d7, #4
vmovl.s8 q1, d0
vmovl.s16 q0, d2
vmovl.s16 q1, d3
vcvt.f32.s32 q0, q0
vcvt.f32.s32 q1, q1
vmul.f32 q0, q0, d7[0]
vmul.f32 q1, q1, d7[0]
vst1.32 {q0-q1}, [r0,:128]!
bne 1b
pop {r4-r5,pc}
endfunc
function ff_dca_lfe_fir0_neon, export=1
push {r4-r6,lr}
mov r3, #32 @ decifactor
mov r6, #256/32
b dca_lfe_fir
endfunc
function ff_dca_lfe_fir1_neon, export=1
push {r4-r6,lr}
mov r3, #64 @ decifactor
mov r6, #256/64
dca_lfe_fir:
add r4, r0, r3, lsl #2 @ out2
add r5, r2, #256*4-16 @ cf1
sub r1, r1, #12
mov lr, #-16
1:
vmov.f32 q2, #0.0 @ v0
vmov.f32 q3, #0.0 @ v1
mov r12, r6
2:
vld1.32 {q8}, [r2,:128]! @ cf0
vld1.32 {q9}, [r5,:128], lr @ cf1
vld1.32 {q1}, [r1], lr @ in
subs r12, r12, #4
vrev64.32 q10, q8
vmla.f32 q3, q1, q9
vmla.f32 d4, d2, d21
vmla.f32 d5, d3, d20
bne 2b
add r1, r1, r6, lsl #2
subs r3, r3, #1
vadd.f32 d4, d4, d5
vadd.f32 d6, d6, d7
vpadd.f32 d5, d4, d6
vst1.32 {d5[0]}, [r0,:32]!
vst1.32 {d5[1]}, [r4,:32]!
bne 1b
pop {r4-r6,pc}
endfunc
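
ff_decode_hf_neon above dequantises high-frequency VQ vectors; the "vcvt.f32.s32 d7, d7, #4" corresponds to the 1/16 scale factor below. A scalar sketch following the generic routine:

#include <stdint.h>
#include "libavcodec/dcadsp.h"   /* DCA_SUBBANDS */

static void decode_hf_ref(float dst[DCA_SUBBANDS][8],
                          const int32_t vq_num[DCA_SUBBANDS],
                          const int8_t hf_vq[1024][32], intptr_t vq_offset,
                          int32_t scale[DCA_SUBBANDS][2],
                          intptr_t start, intptr_t end)
{
    for (intptr_t l = start; l < end; l++) {
        /* One 32-entry codebook vector per subband; only 8 samples are used. */
        const int8_t *ptr = &hf_vq[vq_num[l]][vq_offset];
        float fscale      = scale[l][0] * (1.0f / 16);

        for (int i = 0; i < 8; i++)
            dst[l][i] = ptr[i] * fscale;
    }
}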


@@ -0,0 +1,476 @@
/*
* Copyright (c) 2013 RISC OS Open Ltd
* Author: Ben Avison <bavison@riscosopen.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
POUT .req a1
PIN .req a2
PCOEF .req a3
OLDFPSCR .req a4
COUNTER .req ip
IN0 .req s4
IN1 .req s5
IN2 .req s6
IN3 .req s7
IN4 .req s0
IN5 .req s1
IN6 .req s2
IN7 .req s3
COEF0 .req s8 @ coefficient elements
COEF1 .req s9
COEF2 .req s10
COEF3 .req s11
COEF4 .req s12
COEF5 .req s13
COEF6 .req s14
COEF7 .req s15
ACCUM0 .req s16 @ double-buffered multiply-accumulate results
ACCUM4 .req s20
POST0 .req s24 @ do long-latency post-multiply in this vector in parallel
POST1 .req s25
POST2 .req s26
POST3 .req s27
.macro inner_loop decifactor, dir, tail, head
.ifc "\dir","up"
.set X, 0
.set Y, 4
.else
.set X, 4*JMAX*4 - 4
.set Y, -4
.endif
.ifnc "\head",""
vldr COEF0, [PCOEF, #X + (0*JMAX + 0) * Y]
vldr COEF1, [PCOEF, #X + (1*JMAX + 0) * Y]
vldr COEF2, [PCOEF, #X + (2*JMAX + 0) * Y]
vldr COEF3, [PCOEF, #X + (3*JMAX + 0) * Y]
.endif
.ifnc "\tail",""
vadd.f POST0, ACCUM0, ACCUM4 @ vector operation
.endif
.ifnc "\head",""
vmul.f ACCUM0, COEF0, IN0 @ vector = vector * scalar
vldr COEF4, [PCOEF, #X + (0*JMAX + 1) * Y]
vldr COEF5, [PCOEF, #X + (1*JMAX + 1) * Y]
vldr COEF6, [PCOEF, #X + (2*JMAX + 1) * Y]
.endif
.ifnc "\head",""
vldr COEF7, [PCOEF, #X + (3*JMAX + 1) * Y]
.ifc "\tail",""
vmul.f ACCUM4, COEF4, IN1 @ vector operation
.endif
vldr COEF0, [PCOEF, #X + (0*JMAX + 2) * Y]
vldr COEF1, [PCOEF, #X + (1*JMAX + 2) * Y]
.ifnc "\tail",""
vmul.f ACCUM4, COEF4, IN1 @ vector operation
.endif
vldr COEF2, [PCOEF, #X + (2*JMAX + 2) * Y]
vldr COEF3, [PCOEF, #X + (3*JMAX + 2) * Y]
.endif
.ifnc "\tail",""
vstmia POUT!, {POST0-POST3}
.endif
.ifnc "\head",""
vmla.f ACCUM0, COEF0, IN2 @ vector = vector * scalar
vldr COEF4, [PCOEF, #X + (0*JMAX + 3) * Y]
vldr COEF5, [PCOEF, #X + (1*JMAX + 3) * Y]
vldr COEF6, [PCOEF, #X + (2*JMAX + 3) * Y]
vldr COEF7, [PCOEF, #X + (3*JMAX + 3) * Y]
vmla.f ACCUM4, COEF4, IN3 @ vector = vector * scalar
.if \decifactor == 32
vldr COEF0, [PCOEF, #X + (0*JMAX + 4) * Y]
vldr COEF1, [PCOEF, #X + (1*JMAX + 4) * Y]
vldr COEF2, [PCOEF, #X + (2*JMAX + 4) * Y]
vldr COEF3, [PCOEF, #X + (3*JMAX + 4) * Y]
vmla.f ACCUM0, COEF0, IN4 @ vector = vector * scalar
vldr COEF4, [PCOEF, #X + (0*JMAX + 5) * Y]
vldr COEF5, [PCOEF, #X + (1*JMAX + 5) * Y]
vldr COEF6, [PCOEF, #X + (2*JMAX + 5) * Y]
vldr COEF7, [PCOEF, #X + (3*JMAX + 5) * Y]
vmla.f ACCUM4, COEF4, IN5 @ vector = vector * scalar
vldr COEF0, [PCOEF, #X + (0*JMAX + 6) * Y]
vldr COEF1, [PCOEF, #X + (1*JMAX + 6) * Y]
vldr COEF2, [PCOEF, #X + (2*JMAX + 6) * Y]
vldr COEF3, [PCOEF, #X + (3*JMAX + 6) * Y]
vmla.f ACCUM0, COEF0, IN6 @ vector = vector * scalar
vldr COEF4, [PCOEF, #X + (0*JMAX + 7) * Y]
vldr COEF5, [PCOEF, #X + (1*JMAX + 7) * Y]
vldr COEF6, [PCOEF, #X + (2*JMAX + 7) * Y]
vldr COEF7, [PCOEF, #X + (3*JMAX + 7) * Y]
vmla.f ACCUM4, COEF4, IN7 @ vector = vector * scalar
.endif
.endif
.endm
.macro dca_lfe_fir decifactor
function ff_dca_lfe_fir\decifactor\()_vfp, export=1
fmrx OLDFPSCR, FPSCR
ldr ip, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
fmxr FPSCR, ip
vldr IN0, [PIN, #-0*4]
vldr IN1, [PIN, #-1*4]
vldr IN2, [PIN, #-2*4]
vldr IN3, [PIN, #-3*4]
.if \decifactor == 32
.set JMAX, 8
vpush {s16-s31}
vldr IN4, [PIN, #-4*4]
vldr IN5, [PIN, #-5*4]
vldr IN6, [PIN, #-6*4]
vldr IN7, [PIN, #-7*4]
.else
.set JMAX, 4
vpush {s16-s27}
.endif
mov COUNTER, #\decifactor/4 - 1
inner_loop \decifactor, up,, head
1: add PCOEF, PCOEF, #4*JMAX*4
subs COUNTER, COUNTER, #1
inner_loop \decifactor, up, tail, head
bne 1b
inner_loop \decifactor, up, tail
mov COUNTER, #\decifactor/4 - 1
inner_loop \decifactor, down,, head
1: sub PCOEF, PCOEF, #4*JMAX*4
subs COUNTER, COUNTER, #1
inner_loop \decifactor, down, tail, head
bne 1b
inner_loop \decifactor, down, tail
.if \decifactor == 32
vpop {s16-s31}
.else
vpop {s16-s27}
.endif
fmxr FPSCR, OLDFPSCR
bx lr
endfunc
.endm
dca_lfe_fir 64
.ltorg
dca_lfe_fir 32
.unreq POUT
.unreq PIN
.unreq PCOEF
.unreq OLDFPSCR
.unreq COUNTER
.unreq IN0
.unreq IN1
.unreq IN2
.unreq IN3
.unreq IN4
.unreq IN5
.unreq IN6
.unreq IN7
.unreq COEF0
.unreq COEF1
.unreq COEF2
.unreq COEF3
.unreq COEF4
.unreq COEF5
.unreq COEF6
.unreq COEF7
.unreq ACCUM0
.unreq ACCUM4
.unreq POST0
.unreq POST1
.unreq POST2
.unreq POST3
IN .req a1
SBACT .req a2
OLDFPSCR .req a3
IMDCT .req a4
WINDOW .req v1
OUT .req v2
BUF .req v3
SCALEINT .req v4 @ only used in softfp case
COUNT .req v5
SCALE .req s0
/* Stack layout differs in softfp and hardfp cases:
*
* hardfp
* fp -> 6 arg words saved by caller
* a3,a4,v1-v3,v5,fp,lr on entry (a3 just to pad to 8 bytes)
* s16-s23 on entry
* align 16
* buf -> 8*32*4 bytes buffer
* s0 on entry
* sp -> 3 arg words for callee
*
* softfp
* fp -> 7 arg words saved by caller
* a4,v1-v5,fp,lr on entry
* s16-s23 on entry
* align 16
* buf -> 8*32*4 bytes buffer
* sp -> 4 arg words for callee
*/
/* void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
* SynthFilterContext *synth, FFTContext *imdct,
* float (*synth_buf_ptr)[512],
* int *synth_buf_offset, float (*synth_buf2)[32],
* const float (*window)[512], float *samples_out,
* float (*raXin)[32], float scale);
*/
function ff_dca_qmf_32_subbands_vfp, export=1
VFP push {a3-a4,v1-v3,v5,fp,lr}
NOVFP push {a4,v1-v5,fp,lr}
add fp, sp, #8*4
vpush {s16-s23}
@ The buffer pointed at by raXin isn't big enough for us to do a
@ complete matrix transposition as we want to, so allocate an
@ alternative buffer from the stack. Align to 4 words for speed.
sub BUF, sp, #8*32*4
bic BUF, BUF, #15
mov sp, BUF
ldr lr, =0x03330000 @ RunFast mode, short vectors of length 4, stride 2
fmrx OLDFPSCR, FPSCR
fmxr FPSCR, lr
@ COUNT is used to count down 2 things at once:
@ bits 0-4 are the number of word pairs remaining in the output row
@ bits 5-31 are the number of words to copy (with possible negation)
@ from the source matrix before we start zeroing the remainder
mov COUNT, #(-4 << 5) + 16
adds COUNT, COUNT, SBACT, lsl #5
bmi 2f
1:
vldr s8, [IN, #(0*8+0)*4]
vldr s10, [IN, #(0*8+1)*4]
vldr s12, [IN, #(0*8+2)*4]
vldr s14, [IN, #(0*8+3)*4]
vldr s16, [IN, #(0*8+4)*4]
vldr s18, [IN, #(0*8+5)*4]
vldr s20, [IN, #(0*8+6)*4]
vldr s22, [IN, #(0*8+7)*4]
vneg.f s8, s8
vldr s9, [IN, #(1*8+0)*4]
vldr s11, [IN, #(1*8+1)*4]
vldr s13, [IN, #(1*8+2)*4]
vldr s15, [IN, #(1*8+3)*4]
vneg.f s16, s16
vldr s17, [IN, #(1*8+4)*4]
vldr s19, [IN, #(1*8+5)*4]
vldr s21, [IN, #(1*8+6)*4]
vldr s23, [IN, #(1*8+7)*4]
vstr d4, [BUF, #(0*32+0)*4]
vstr d5, [BUF, #(1*32+0)*4]
vstr d6, [BUF, #(2*32+0)*4]
vstr d7, [BUF, #(3*32+0)*4]
vstr d8, [BUF, #(4*32+0)*4]
vstr d9, [BUF, #(5*32+0)*4]
vstr d10, [BUF, #(6*32+0)*4]
vstr d11, [BUF, #(7*32+0)*4]
vldr s9, [IN, #(3*8+0)*4]
vldr s11, [IN, #(3*8+1)*4]
vldr s13, [IN, #(3*8+2)*4]
vldr s15, [IN, #(3*8+3)*4]
vldr s17, [IN, #(3*8+4)*4]
vldr s19, [IN, #(3*8+5)*4]
vldr s21, [IN, #(3*8+6)*4]
vldr s23, [IN, #(3*8+7)*4]
vneg.f s9, s9
vldr s8, [IN, #(2*8+0)*4]
vldr s10, [IN, #(2*8+1)*4]
vldr s12, [IN, #(2*8+2)*4]
vldr s14, [IN, #(2*8+3)*4]
vneg.f s17, s17
vldr s16, [IN, #(2*8+4)*4]
vldr s18, [IN, #(2*8+5)*4]
vldr s20, [IN, #(2*8+6)*4]
vldr s22, [IN, #(2*8+7)*4]
vstr d4, [BUF, #(0*32+2)*4]
vstr d5, [BUF, #(1*32+2)*4]
vstr d6, [BUF, #(2*32+2)*4]
vstr d7, [BUF, #(3*32+2)*4]
vstr d8, [BUF, #(4*32+2)*4]
vstr d9, [BUF, #(5*32+2)*4]
vstr d10, [BUF, #(6*32+2)*4]
vstr d11, [BUF, #(7*32+2)*4]
add IN, IN, #4*8*4
add BUF, BUF, #4*4
subs COUNT, COUNT, #(4 << 5) + 2
bpl 1b
2: @ Now deal with trailing < 4 samples
adds COUNT, COUNT, #3 << 5
bmi 4f @ sb_act was a multiple of 4
bics lr, COUNT, #0x1F
bne 3f
@ sb_act was n*4+1
vldr s8, [IN, #(0*8+0)*4]
vldr s10, [IN, #(0*8+1)*4]
vldr s12, [IN, #(0*8+2)*4]
vldr s14, [IN, #(0*8+3)*4]
vldr s16, [IN, #(0*8+4)*4]
vldr s18, [IN, #(0*8+5)*4]
vldr s20, [IN, #(0*8+6)*4]
vldr s22, [IN, #(0*8+7)*4]
vneg.f s8, s8
vldr s9, zero
vldr s11, zero
vldr s13, zero
vldr s15, zero
vneg.f s16, s16
vldr s17, zero
vldr s19, zero
vldr s21, zero
vldr s23, zero
vstr d4, [BUF, #(0*32+0)*4]
vstr d5, [BUF, #(1*32+0)*4]
vstr d6, [BUF, #(2*32+0)*4]
vstr d7, [BUF, #(3*32+0)*4]
vstr d8, [BUF, #(4*32+0)*4]
vstr d9, [BUF, #(5*32+0)*4]
vstr d10, [BUF, #(6*32+0)*4]
vstr d11, [BUF, #(7*32+0)*4]
add BUF, BUF, #2*4
sub COUNT, COUNT, #1
b 4f
3: @ sb_act was n*4+2 or n*4+3, so do the first 2
vldr s8, [IN, #(0*8+0)*4]
vldr s10, [IN, #(0*8+1)*4]
vldr s12, [IN, #(0*8+2)*4]
vldr s14, [IN, #(0*8+3)*4]
vldr s16, [IN, #(0*8+4)*4]
vldr s18, [IN, #(0*8+5)*4]
vldr s20, [IN, #(0*8+6)*4]
vldr s22, [IN, #(0*8+7)*4]
vneg.f s8, s8
vldr s9, [IN, #(1*8+0)*4]
vldr s11, [IN, #(1*8+1)*4]
vldr s13, [IN, #(1*8+2)*4]
vldr s15, [IN, #(1*8+3)*4]
vneg.f s16, s16
vldr s17, [IN, #(1*8+4)*4]
vldr s19, [IN, #(1*8+5)*4]
vldr s21, [IN, #(1*8+6)*4]
vldr s23, [IN, #(1*8+7)*4]
vstr d4, [BUF, #(0*32+0)*4]
vstr d5, [BUF, #(1*32+0)*4]
vstr d6, [BUF, #(2*32+0)*4]
vstr d7, [BUF, #(3*32+0)*4]
vstr d8, [BUF, #(4*32+0)*4]
vstr d9, [BUF, #(5*32+0)*4]
vstr d10, [BUF, #(6*32+0)*4]
vstr d11, [BUF, #(7*32+0)*4]
add BUF, BUF, #2*4
sub COUNT, COUNT, #(2 << 5) + 1
bics lr, COUNT, #0x1F
bne 4f
@ sb_act was n*4+3
vldr s8, [IN, #(2*8+0)*4]
vldr s10, [IN, #(2*8+1)*4]
vldr s12, [IN, #(2*8+2)*4]
vldr s14, [IN, #(2*8+3)*4]
vldr s16, [IN, #(2*8+4)*4]
vldr s18, [IN, #(2*8+5)*4]
vldr s20, [IN, #(2*8+6)*4]
vldr s22, [IN, #(2*8+7)*4]
vldr s9, zero
vldr s11, zero
vldr s13, zero
vldr s15, zero
vldr s17, zero
vldr s19, zero
vldr s21, zero
vldr s23, zero
vstr d4, [BUF, #(0*32+0)*4]
vstr d5, [BUF, #(1*32+0)*4]
vstr d6, [BUF, #(2*32+0)*4]
vstr d7, [BUF, #(3*32+0)*4]
vstr d8, [BUF, #(4*32+0)*4]
vstr d9, [BUF, #(5*32+0)*4]
vstr d10, [BUF, #(6*32+0)*4]
vstr d11, [BUF, #(7*32+0)*4]
add BUF, BUF, #2*4
sub COUNT, COUNT, #1
4: @ Now fill the remainder with 0
vldr s8, zero
vldr s9, zero
ands COUNT, COUNT, #0x1F
beq 6f
5: vstr d4, [BUF, #(0*32+0)*4]
vstr d4, [BUF, #(1*32+0)*4]
vstr d4, [BUF, #(2*32+0)*4]
vstr d4, [BUF, #(3*32+0)*4]
vstr d4, [BUF, #(4*32+0)*4]
vstr d4, [BUF, #(5*32+0)*4]
vstr d4, [BUF, #(6*32+0)*4]
vstr d4, [BUF, #(7*32+0)*4]
add BUF, BUF, #2*4
subs COUNT, COUNT, #1
bne 5b
6:
fmxr FPSCR, OLDFPSCR
ldr WINDOW, [fp, #3*4]
ldr OUT, [fp, #4*4]
sub BUF, BUF, #32*4
NOVFP ldr SCALEINT, [fp, #6*4]
mov COUNT, #8
VFP vpush {SCALE}
VFP sub sp, sp, #3*4
NOVFP sub sp, sp, #4*4
7:
VFP ldr a1, [fp, #-7*4] @ imdct
NOVFP ldr a1, [fp, #-8*4]
ldmia fp, {a2-a4}
VFP stmia sp, {WINDOW, OUT, BUF}
NOVFP stmia sp, {WINDOW, OUT, BUF, SCALEINT}
VFP vldr SCALE, [sp, #3*4]
bl X(ff_synth_filter_float_vfp)
add OUT, OUT, #32*4
add BUF, BUF, #32*4
subs COUNT, COUNT, #1
bne 7b
A sub sp, fp, #(8+8)*4
T sub fp, fp, #(8+8)*4
T mov sp, fp
vpop {s16-s23}
VFP pop {a3-a4,v1-v3,v5,fp,pc}
NOVFP pop {a4,v1-v5,fp,pc}
endfunc
.unreq IN
.unreq SBACT
.unreq OLDFPSCR
.unreq IMDCT
.unreq WINDOW
.unreq OUT
.unreq BUF
.unreq SCALEINT
.unreq COUNT
.unreq SCALE
.align 2
zero: .word 0

View File

@@ -0,0 +1,40 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "idct.h"
static const struct algo fdct_tab_arch[] = {
{ 0 }
};
static const struct algo idct_tab_arch[] = {
{ "SIMPLE-ARM", ff_simple_idct_arm, FF_IDCT_PERM_NONE },
{ "INT-ARM", ff_j_rev_dct_arm, FF_IDCT_PERM_LIBMPEG2 },
#if HAVE_ARMV5TE
{ "SIMPLE-ARMV5TE", ff_simple_idct_armv5te, FF_IDCT_PERM_NONE, AV_CPU_FLAG_ARMV5TE },
#endif
#if HAVE_ARMV6
{ "SIMPLE-ARMV6", ff_simple_idct_armv6, FF_IDCT_PERM_LIBMPEG2, AV_CPU_FLAG_ARMV6 },
#endif
#if HAVE_NEON
{ "SIMPLE-NEON", ff_simple_idct_neon, FF_IDCT_PERM_PARTTRANS, AV_CPU_FLAG_NEON },
#endif
{ 0 }
};

View File

@@ -0,0 +1,48 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/cpu.h"
#define FFT_FLOAT 0
#include "libavcodec/fft.h"
void ff_fft_fixed_calc_neon(FFTContext *s, FFTComplex *z);
void ff_mdct_fixed_calc_neon(FFTContext *s, FFTSample *o, const FFTSample *i);
void ff_mdct_fixed_calcw_neon(FFTContext *s, FFTDouble *o, const FFTSample *i);
av_cold void ff_fft_fixed_init_arm(FFTContext *s)
{
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags)) {
s->fft_permutation = FF_FFT_PERM_SWAP_LSBS;
#if CONFIG_FFT
s->fft_calc = ff_fft_fixed_calc_neon;
#endif
#if CONFIG_MDCT
if (!s->inverse && s->nbits >= 3) {
s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
s->mdct_calc = ff_mdct_fixed_calc_neon;
s->mdct_calcw = ff_mdct_fixed_calcw_neon;
}
#endif
}
}

View File

@@ -0,0 +1,261 @@
/*
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
.macro bflies d0, d1, r0, r1
vrev64.32 \r0, \d1 @ t5, t6, t1, t2
vhsub.s16 \r1, \d1, \r0 @ t1-t5, t2-t6, t5-t1, t6-t2
vhadd.s16 \r0, \d1, \r0 @ t1+t5, t2+t6, t5+t1, t6+t2
vext.16 \r1, \r1, \r1, #1 @ t2-t6, t5-t1, t6-t2, t1-t5
vtrn.32 \r0, \r1 @ t1+t5, t2+t6, t2-t6, t5-t1
@ t5, t6, t4, t3
vhsub.s16 \d1, \d0, \r0
vhadd.s16 \d0, \d0, \r0
.endm
.macro transform01 q0, q1, d3, c0, c1, r0, w0, w1
vrev32.16 \r0, \d3
vmull.s16 \w0, \d3, \c0
vmlal.s16 \w0, \r0, \c1
vshrn.s32 \d3, \w0, #15
bflies \q0, \q1, \w0, \w1
.endm
.macro transform2 d0, d1, d2, d3, q0, q1, c0, c1, c2, c3, \
r0, r1, w0, w1
vrev32.16 \r0, \d1
vrev32.16 \r1, \d3
vmull.s16 \w0, \d1, \c0
vmlal.s16 \w0, \r0, \c1
vmull.s16 \w1, \d3, \c2
vmlal.s16 \w1, \r1, \c3
vshrn.s32 \d1, \w0, #15
vshrn.s32 \d3, \w1, #15
bflies \q0, \q1, \w0, \w1
.endm
.macro fft4 d0, d1, r0, r1
vhsub.s16 \r0, \d0, \d1 @ t3, t4, t8, t7
vhsub.s16 \r1, \d1, \d0
vhadd.s16 \d0, \d0, \d1 @ t1, t2, t6, t5
vmov.i64 \d1, #0xffff00000000
vbit \r0, \r1, \d1
vrev64.16 \r1, \r0 @ t7, t8, t4, t3
vtrn.32 \r0, \r1 @ t3, t4, t7, t8
vtrn.32 \d0, \r0 @ t1, t2, t3, t4, t6, t5, t8, t7
vhsub.s16 \d1, \d0, \r0 @ r2, i2, r3, i1
vhadd.s16 \d0, \d0, \r0 @ r0, i0, r1, i3
.endm
.macro fft8 d0, d1, d2, d3, q0, q1, c0, c1, r0, r1, w0, w1
fft4 \d0, \d1, \r0, \r1
vtrn.32 \d0, \d1 @ z0, z2, z1, z3
vhadd.s16 \r0, \d2, \d3 @ t1, t2, t3, t4
vhsub.s16 \d3, \d2, \d3 @ z5, z7
vmov \d2, \r0
transform01 \q0, \q1, \d3, \c0, \c1, \r0, \w0, \w1
.endm
function fft4_neon
vld1.16 {d0-d1}, [r0]
fft4 d0, d1, d2, d3
vst1.16 {d0-d1}, [r0]
bx lr
endfunc
function fft8_neon
vld1.16 {d0-d3}, [r0,:128]
movrel r1, coefs
vld1.16 {d30}, [r1,:64]
vdup.16 d31, d30[0]
fft8 d0, d1, d2, d3, q0, q1, d31, d30, d20, d21, q8, q9
vtrn.32 d0, d1
vtrn.32 d2, d3
vst1.16 {d0-d3}, [r0,:128]
bx lr
endfunc
function fft16_neon
vld1.16 {d0-d3}, [r0,:128]!
vld1.16 {d4-d7}, [r0,:128]
movrel r1, coefs
sub r0, r0, #32
vld1.16 {d28-d31},[r1,:128]
vdup.16 d31, d28[0]
fft8 d0, d1, d2, d3, q0, q1, d31, d28, d20, d21, q8, q9
vswp d5, d6
fft4 q2, q3, q8, q9
vswp d5, d6
vtrn.32 q0, q1 @ z0, z4, z2, z6, z1, z5, z3, z7
vtrn.32 q2, q3 @ z8, z12,z10,z14,z9, z13,z11,z15
vswp d1, d2
vdup.16 d31, d28[0]
transform01 q0, q2, d5, d31, d28, d20, q8, q9
vdup.16 d26, d29[0]
vdup.16 d27, d30[0]
transform2 d2, d6, d3, d7, q1, q3, d26, d30, d27, d29, \
d20, d21, q8, q9
vtrn.32 q0, q1
vtrn.32 q2, q3
vst1.16 {d0-d3}, [r0,:128]!
vst1.16 {d4-d7}, [r0,:128]
bx lr
endfunc
function fft_pass_neon
push {r4,lr}
movrel lr, coefs + 24
vld1.16 {d30}, [lr,:64]
lsl r12, r2, #3
vmov d31, d30
add r3, r1, r2, lsl #2
mov lr, #-8
sub r3, r3, #2
mov r4, r0
vld1.16 {d27[]}, [r3,:16]
sub r3, r3, #6
vld1.16 {q0}, [r4,:128], r12
vld1.16 {q1}, [r4,:128], r12
vld1.16 {q2}, [r4,:128], r12
vld1.16 {q3}, [r4,:128], r12
vld1.16 {d28}, [r1,:64]!
vld1.16 {d29}, [r3,:64], lr
vswp d1, d2
vswp d5, d6
vtrn.32 d0, d1
vtrn.32 d4, d5
vdup.16 d25, d28[1]
vmul.s16 d27, d27, d31
transform01 q0, q2, d5, d25, d27, d20, q8, q9
b 2f
1:
mov r4, r0
vdup.16 d26, d29[0]
vld1.16 {q0}, [r4,:128], r12
vld1.16 {q1}, [r4,:128], r12
vld1.16 {q2}, [r4,:128], r12
vld1.16 {q3}, [r4,:128], r12
vld1.16 {d28}, [r1,:64]!
vld1.16 {d29}, [r3,:64], lr
vswp d1, d2
vswp d5, d6
vtrn.32 d0, d1
vtrn.32 d4, d5
vdup.16 d24, d28[0]
vdup.16 d25, d28[1]
vdup.16 d27, d29[3]
vmul.s16 q13, q13, q15
transform2 d0, d4, d1, d5, q0, q2, d24, d26, d25, d27, \
d16, d17, q9, q10
2:
vtrn.32 d2, d3
vtrn.32 d6, d7
vdup.16 d24, d28[2]
vdup.16 d26, d29[2]
vdup.16 d25, d28[3]
vdup.16 d27, d29[1]
vmul.s16 q13, q13, q15
transform2 d2, d6, d3, d7, q1, q3, d24, d26, d25, d27, \
d16, d17, q9, q10
vtrn.32 d0, d1
vtrn.32 d2, d3
vtrn.32 d4, d5
vtrn.32 d6, d7
vswp d1, d2
vswp d5, d6
mov r4, r0
vst1.16 {q0}, [r4,:128], r12
vst1.16 {q1}, [r4,:128], r12
vst1.16 {q2}, [r4,:128], r12
vst1.16 {q3}, [r4,:128], r12
add r0, r0, #16
subs r2, r2, #2
bgt 1b
pop {r4,pc}
endfunc
#define F_SQRT1_2 23170
#define F_COS_16_1 30274
#define F_COS_16_3 12540
const coefs, align=4
.short F_SQRT1_2, -F_SQRT1_2, -F_SQRT1_2, F_SQRT1_2
.short F_COS_16_1,-F_COS_16_1,-F_COS_16_1, F_COS_16_1
.short F_COS_16_3,-F_COS_16_3,-F_COS_16_3, F_COS_16_3
.short 1, -1, -1, 1
endconst
.macro def_fft n, n2, n4
function fft\n\()_neon
push {r4, lr}
mov r4, r0
bl fft\n2\()_neon
add r0, r4, #\n4*2*4
bl fft\n4\()_neon
add r0, r4, #\n4*3*4
bl fft\n4\()_neon
mov r0, r4
pop {r4, lr}
movrelx r1, X(ff_cos_\n\()_fixed)
mov r2, #\n4/2
b fft_pass_neon
endfunc
.endm
def_fft 32, 16, 8
def_fft 64, 32, 16
def_fft 128, 64, 32
def_fft 256, 128, 64
def_fft 512, 256, 128
def_fft 1024, 512, 256
def_fft 2048, 1024, 512
def_fft 4096, 2048, 1024
def_fft 8192, 4096, 2048
def_fft 16384, 8192, 4096
def_fft 32768, 16384, 8192
def_fft 65536, 32768, 16384
function ff_fft_fixed_calc_neon, export=1
ldr r2, [r0]
sub r2, r2, #2
movrel r3, fft_fixed_tab_neon
ldr r3, [r3, r2, lsl #2]
mov r0, r1
bx r3
endfunc
const fft_fixed_tab_neon, relocate=1
.word fft4_neon
.word fft8_neon
.word fft16_neon
.word fft32_neon
.word fft64_neon
.word fft128_neon
.word fft256_neon
.word fft512_neon
.word fft1024_neon
.word fft2048_neon
.word fft4096_neon
.word fft8192_neon
.word fft16384_neon
.word fft32768_neon
.word fft65536_neon
endconst

View File

@@ -0,0 +1,72 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/cpu.h"
#include "libavcodec/fft.h"
#include "libavcodec/rdft.h"
#include "libavcodec/synth_filter.h"
void ff_fft_calc_vfp(FFTContext *s, FFTComplex *z);
void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
void ff_imdct_half_vfp(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_rdft_calc_neon(struct RDFTContext *s, FFTSample *z);
av_cold void ff_fft_init_arm(FFTContext *s)
{
int cpu_flags = av_get_cpu_flags();
if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags)) {
s->fft_calc = ff_fft_calc_vfp;
#if CONFIG_MDCT
s->imdct_half = ff_imdct_half_vfp;
#endif
}
if (have_neon(cpu_flags)) {
#if CONFIG_FFT
s->fft_permute = ff_fft_permute_neon;
s->fft_calc = ff_fft_calc_neon;
#endif
#if CONFIG_MDCT
s->imdct_calc = ff_imdct_calc_neon;
s->imdct_half = ff_imdct_half_neon;
s->mdct_calc = ff_mdct_calc_neon;
s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
#endif
}
}
#if CONFIG_RDFT
av_cold void ff_rdft_init_arm(RDFTContext *s)
{
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags))
s->rdft_calc = ff_rdft_calc_neon;
}
#endif

View File

@@ -0,0 +1,375 @@
/*
* ARM NEON optimised FFT
*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
* Copyright (c) 2009 Naotoshi Nojiri
*
* This algorithm (though not any of the implementation details) is
* based on libdjbfft by D. J. Bernstein.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
#define M_SQRT1_2 0.70710678118654752440
function fft4_neon
vld1.32 {d0-d3}, [r0,:128]
vext.32 q8, q1, q1, #1 @ d16=i2,r3 d17=i3,r2
vsub.f32 d6, d0, d1 @ r0-r1,i0-i1
vsub.f32 d7, d16, d17 @ r3-r2,i2-i3
vadd.f32 d4, d0, d1 @ r0+r1,i0+i1
vadd.f32 d5, d2, d3 @ i2+i3,r2+r3
vadd.f32 d1, d6, d7
vsub.f32 d3, d6, d7
vadd.f32 d0, d4, d5
vsub.f32 d2, d4, d5
vst1.32 {d0-d3}, [r0,:128]
bx lr
endfunc
function fft8_neon
mov r1, r0
vld1.32 {d0-d3}, [r1,:128]!
vld1.32 {d16-d19}, [r1,:128]
movw r2, #0x04f3 @ sqrt(1/2)
movt r2, #0x3f35
eor r3, r2, #1<<31
vdup.32 d31, r2
vext.32 q11, q1, q1, #1 @ i2,r3,i3,r2
vadd.f32 d4, d16, d17 @ r4+r5,i4+i5
vmov d28, r3, r2
vadd.f32 d5, d18, d19 @ r6+r7,i6+i7
vsub.f32 d17, d16, d17 @ r4-r5,i4-i5
vsub.f32 d19, d18, d19 @ r6-r7,i6-i7
vrev64.32 d29, d28
vadd.f32 d20, d0, d1 @ r0+r1,i0+i1
vadd.f32 d21, d2, d3 @ r2+r3,i2+i3
vmul.f32 d26, d17, d28 @ -a2r*w,a2i*w
vext.32 q3, q2, q2, #1
vmul.f32 d27, d19, d29 @ a3r*w,-a3i*w
vsub.f32 d23, d22, d23 @ i2-i3,r3-r2
vsub.f32 d22, d0, d1 @ r0-r1,i0-i1
vmul.f32 d24, d17, d31 @ a2r*w,a2i*w
vmul.f32 d25, d19, d31 @ a3r*w,a3i*w
vadd.f32 d0, d20, d21
vsub.f32 d2, d20, d21
vadd.f32 d1, d22, d23
vrev64.32 q13, q13
vsub.f32 d3, d22, d23
vsub.f32 d6, d6, d7
vadd.f32 d24, d24, d26 @ a2r+a2i,a2i-a2r t1,t2
vadd.f32 d25, d25, d27 @ a3r-a3i,a3i+a3r t5,t6
vadd.f32 d7, d4, d5
vsub.f32 d18, d2, d6
vext.32 q13, q12, q12, #1
vadd.f32 d2, d2, d6
vsub.f32 d16, d0, d7
vadd.f32 d5, d25, d24
vsub.f32 d4, d26, d27
vadd.f32 d0, d0, d7
vsub.f32 d17, d1, d5
vsub.f32 d19, d3, d4
vadd.f32 d3, d3, d4
vadd.f32 d1, d1, d5
vst1.32 {d16-d19}, [r1,:128]
vst1.32 {d0-d3}, [r0,:128]
bx lr
endfunc
function fft16_neon
movrel r1, mppm
vld1.32 {d16-d19}, [r0,:128]! @ q8{r0,i0,r1,i1} q9{r2,i2,r3,i3}
pld [r0, #32]
vld1.32 {d2-d3}, [r1,:128]
vext.32 q13, q9, q9, #1
vld1.32 {d22-d25}, [r0,:128]! @ q11{r4,i4,r5,i5} q12{r6,i6,r7,i7}
vadd.f32 d4, d16, d17
vsub.f32 d5, d16, d17
vadd.f32 d18, d18, d19
vsub.f32 d19, d26, d27
vadd.f32 d20, d22, d23
vsub.f32 d22, d22, d23
vsub.f32 d23, d24, d25
vadd.f32 q8, q2, q9 @ {r0,i0,r1,i1}
vadd.f32 d21, d24, d25
vmul.f32 d24, d22, d2
vsub.f32 q9, q2, q9 @ {r2,i2,r3,i3}
vmul.f32 d25, d23, d3
vuzp.32 d16, d17 @ {r0,r1,i0,i1}
vmul.f32 q1, q11, d2[1]
vuzp.32 d18, d19 @ {r2,r3,i2,i3}
vrev64.32 q12, q12
vadd.f32 q11, q12, q1 @ {t1a,t2a,t5,t6}
vld1.32 {d24-d27}, [r0,:128]! @ q12{r8,i8,r9,i9} q13{r10,i10,r11,i11}
vzip.32 q10, q11
vld1.32 {d28-d31}, [r0,:128] @ q14{r12,i12,r13,i13} q15{r14,i14,r15,i15}
vadd.f32 d0, d22, d20
vadd.f32 d1, d21, d23
vsub.f32 d2, d21, d23
vsub.f32 d3, d22, d20
sub r0, r0, #96
vext.32 q13, q13, q13, #1
vsub.f32 q10, q8, q0 @ {r4,r5,i4,i5}
vadd.f32 q8, q8, q0 @ {r0,r1,i0,i1}
vext.32 q15, q15, q15, #1
vsub.f32 q11, q9, q1 @ {r6,r7,i6,i7}
vswp d25, d26 @ q12{r8,i8,i10,r11} q13{r9,i9,i11,r10}
vadd.f32 q9, q9, q1 @ {r2,r3,i2,i3}
vswp d29, d30 @ q14{r12,i12,i14,r15} q15{r13,i13,i15,r14}
vadd.f32 q0, q12, q13 @ {t1,t2,t5,t6}
vadd.f32 q1, q14, q15 @ {t1a,t2a,t5a,t6a}
movrelx r2, X(ff_cos_16)
vsub.f32 q13, q12, q13 @ {t3,t4,t7,t8}
vrev64.32 d1, d1
vsub.f32 q15, q14, q15 @ {t3a,t4a,t7a,t8a}
vrev64.32 d3, d3
movrel r3, pmmp
vswp d1, d26 @ q0{t1,t2,t3,t4} q13{t6,t5,t7,t8}
vswp d3, d30 @ q1{t1a,t2a,t3a,t4a} q15{t6a,t5a,t7a,t8a}
vadd.f32 q12, q0, q13 @ {r8,i8,r9,i9}
vadd.f32 q14, q1, q15 @ {r12,i12,r13,i13}
vld1.32 {d4-d5}, [r2,:64]
vsub.f32 q13, q0, q13 @ {r10,i10,r11,i11}
vsub.f32 q15, q1, q15 @ {r14,i14,r15,i15}
vswp d25, d28 @ q12{r8,i8,r12,i12} q14{r9,i9,r13,i13}
vld1.32 {d6-d7}, [r3,:128]
vrev64.32 q1, q14
vmul.f32 q14, q14, d4[1]
vmul.f32 q1, q1, q3
vmla.f32 q14, q1, d5[1] @ {t1a,t2a,t5a,t6a}
vswp d27, d30 @ q13{r10,i10,r14,i14} q15{r11,i11,r15,i15}
vzip.32 q12, q14
vadd.f32 d0, d28, d24
vadd.f32 d1, d25, d29
vsub.f32 d2, d25, d29
vsub.f32 d3, d28, d24
vsub.f32 q12, q8, q0 @ {r8,r9,i8,i9}
vadd.f32 q8, q8, q0 @ {r0,r1,i0,i1}
vsub.f32 q14, q10, q1 @ {r12,r13,i12,i13}
mov r1, #32
vadd.f32 q10, q10, q1 @ {r4,r5,i4,i5}
vrev64.32 q0, q13
vmul.f32 q13, q13, d5[0]
vrev64.32 q1, q15
vmul.f32 q15, q15, d5[1]
vst2.32 {d16-d17},[r0,:128], r1
vmul.f32 q0, q0, q3
vst2.32 {d20-d21},[r0,:128], r1
vmul.f32 q1, q1, q3
vmla.f32 q13, q0, d5[0] @ {t1,t2,t5,t6}
vmla.f32 q15, q1, d4[1] @ {t1a,t2a,t5a,t6a}
vst2.32 {d24-d25},[r0,:128], r1
vst2.32 {d28-d29},[r0,:128]
vzip.32 q13, q15
sub r0, r0, #80
vadd.f32 d0, d30, d26
vadd.f32 d1, d27, d31
vsub.f32 d2, d27, d31
vsub.f32 d3, d30, d26
vsub.f32 q13, q9, q0 @ {r10,r11,i10,i11}
vadd.f32 q9, q9, q0 @ {r2,r3,i2,i3}
vsub.f32 q15, q11, q1 @ {r14,r15,i14,i15}
vadd.f32 q11, q11, q1 @ {r6,r7,i6,i7}
vst2.32 {d18-d19},[r0,:128], r1
vst2.32 {d22-d23},[r0,:128], r1
vst2.32 {d26-d27},[r0,:128], r1
vst2.32 {d30-d31},[r0,:128]
bx lr
endfunc
function fft_pass_neon
push {r4-r6,lr}
mov r6, r2 @ n
lsl r5, r2, #3 @ 2 * n * sizeof FFTSample
lsl r4, r2, #4 @ 2 * n * sizeof FFTComplex
lsl r2, r2, #5 @ 4 * n * sizeof FFTComplex
add r3, r2, r4
add r4, r4, r0 @ &z[o1]
add r2, r2, r0 @ &z[o2]
add r3, r3, r0 @ &z[o3]
vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]}
movrel r12, pmmp
vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]}
add r5, r5, r1 @ wim
vld1.32 {d6-d7}, [r12,:128] @ pmmp
vswp d21, d22
vld1.32 {d4}, [r1,:64]! @ {wre[0],wre[1]}
sub r5, r5, #4 @ wim--
vrev64.32 q1, q11
vmul.f32 q11, q11, d4[1]
vmul.f32 q1, q1, q3
vld1.32 {d5[0]}, [r5,:32] @ d5[0] = wim[-1]
vmla.f32 q11, q1, d5[0] @ {t1a,t2a,t5a,t6a}
vld2.32 {d16-d17},[r0,:128] @ {z[0],z[1]}
sub r6, r6, #1 @ n--
vld2.32 {d18-d19},[r4,:128] @ {z[o1],z[o1+1]}
vzip.32 q10, q11
vadd.f32 d0, d22, d20
vadd.f32 d1, d21, d23
vsub.f32 d2, d21, d23
vsub.f32 d3, d22, d20
vsub.f32 q10, q8, q0
vadd.f32 q8, q8, q0
vsub.f32 q11, q9, q1
vadd.f32 q9, q9, q1
vst2.32 {d20-d21},[r2,:128]! @ {z[o2],z[o2+1]}
vst2.32 {d16-d17},[r0,:128]! @ {z[0],z[1]}
vst2.32 {d22-d23},[r3,:128]! @ {z[o3],z[o3+1]}
vst2.32 {d18-d19},[r4,:128]! @ {z[o1],z[o1+1]}
sub r5, r5, #8 @ wim -= 2
1:
vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]}
vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]}
vswp d21, d22
vld1.32 {d4}, [r1]! @ {wre[0],wre[1]}
vrev64.32 q0, q10
vmul.f32 q10, q10, d4[0]
vrev64.32 q1, q11
vmul.f32 q11, q11, d4[1]
vld1.32 {d5}, [r5] @ {wim[-1],wim[0]}
vmul.f32 q0, q0, q3
sub r5, r5, #8 @ wim -= 2
vmul.f32 q1, q1, q3
vmla.f32 q10, q0, d5[1] @ {t1,t2,t5,t6}
vmla.f32 q11, q1, d5[0] @ {t1a,t2a,t5a,t6a}
vld2.32 {d16-d17},[r0,:128] @ {z[0],z[1]}
subs r6, r6, #1 @ n--
vld2.32 {d18-d19},[r4,:128] @ {z[o1],z[o1+1]}
vzip.32 q10, q11
vadd.f32 d0, d22, d20
vadd.f32 d1, d21, d23
vsub.f32 d2, d21, d23
vsub.f32 d3, d22, d20
vsub.f32 q10, q8, q0
vadd.f32 q8, q8, q0
vsub.f32 q11, q9, q1
vadd.f32 q9, q9, q1
vst2.32 {d20-d21}, [r2,:128]! @ {z[o2],z[o2+1]}
vst2.32 {d16-d17}, [r0,:128]! @ {z[0],z[1]}
vst2.32 {d22-d23}, [r3,:128]! @ {z[o3],z[o3+1]}
vst2.32 {d18-d19}, [r4,:128]! @ {z[o1],z[o1+1]}
bne 1b
pop {r4-r6,pc}
endfunc
.macro def_fft n, n2, n4
.align 6
function fft\n\()_neon
push {r4, lr}
mov r4, r0
bl fft\n2\()_neon
add r0, r4, #\n4*2*8
bl fft\n4\()_neon
add r0, r4, #\n4*3*8
bl fft\n4\()_neon
mov r0, r4
pop {r4, lr}
movrelx r1, X(ff_cos_\n)
mov r2, #\n4/2
b fft_pass_neon
endfunc
.endm
def_fft 32, 16, 8
def_fft 64, 32, 16
def_fft 128, 64, 32
def_fft 256, 128, 64
def_fft 512, 256, 128
def_fft 1024, 512, 256
def_fft 2048, 1024, 512
def_fft 4096, 2048, 1024
def_fft 8192, 4096, 2048
def_fft 16384, 8192, 4096
def_fft 32768, 16384, 8192
def_fft 65536, 32768, 16384
function ff_fft_calc_neon, export=1
ldr r2, [r0]
sub r2, r2, #2
movrel r3, fft_tab_neon
ldr r3, [r3, r2, lsl #2]
mov r0, r1
bx r3
endfunc
function ff_fft_permute_neon, export=1
push {r4,lr}
mov r12, #1
ldr r2, [r0] @ nbits
ldr r3, [r0, #12] @ tmp_buf
ldr r0, [r0, #8] @ revtab
lsl r12, r12, r2
mov r2, r12
1:
vld1.32 {d0-d1}, [r1,:128]!
ldr r4, [r0], #4
uxth lr, r4
uxth r4, r4, ror #16
add lr, r3, lr, lsl #3
add r4, r3, r4, lsl #3
vst1.32 {d0}, [lr,:64]
vst1.32 {d1}, [r4,:64]
subs r12, r12, #2
bgt 1b
sub r1, r1, r2, lsl #3
1:
vld1.32 {d0-d3}, [r3,:128]!
vst1.32 {d0-d3}, [r1,:128]!
subs r2, r2, #4
bgt 1b
pop {r4,pc}
endfunc
const fft_tab_neon, relocate=1
.word fft4_neon
.word fft8_neon
.word fft16_neon
.word fft32_neon
.word fft64_neon
.word fft128_neon
.word fft256_neon
.word fft512_neon
.word fft1024_neon
.word fft2048_neon
.word fft4096_neon
.word fft8192_neon
.word fft16384_neon
.word fft32768_neon
.word fft65536_neon
endconst
const pmmp, align=4
.float +1.0, -1.0, -1.0, +1.0
endconst
const mppm, align=4
.float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
endconst

View File

@@ -0,0 +1,530 @@
/*
* Copyright (c) 2013 RISC OS Open Ltd
* Author: Ben Avison <bavison@riscosopen.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
@ The fftx_internal_vfp versions of the functions obey a modified AAPCS:
@ VFP is in RunFast mode, vector length 4, stride 1 throughout, and
@ all single-precision VFP registers may be corrupted on exit. The a2
@ register may not be clobbered in these functions, as it holds the
@ stored original FPSCR.
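@ The exported wrappers further below (fft8_vfp, ff_fft16_vfp and the fftN_vfp
@ entries generated by def_fft) save the caller's FPSCR and s16-s31, switch to
@ this RunFast/vector mode, call the matching .LfftN_internal_vfp routine and
@ restore FPSCR before returning.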
function ff_fft_calc_vfp, export=1
ldr ip, [a1, #0] @ nbits
mov a1, a2
movrel a2, (fft_tab_vfp - 8)
ldr pc, [a2, ip, lsl #2]
endfunc
const fft_tab_vfp, relocate=1
.word fft4_vfp
.word fft8_vfp
.word X(ff_fft16_vfp) @ this one alone is exported
.word fft32_vfp
.word fft64_vfp
.word fft128_vfp
.word fft256_vfp
.word fft512_vfp
.word fft1024_vfp
.word fft2048_vfp
.word fft4096_vfp
.word fft8192_vfp
.word fft16384_vfp
.word fft32768_vfp
.word fft65536_vfp
endconst
function fft4_vfp
vldr d0, [a1, #0*2*4] @ s0,s1 = z[0]
vldr d4, [a1, #1*2*4] @ s8,s9 = z[1]
vldr d1, [a1, #2*2*4] @ s2,s3 = z[2]
vldr d5, [a1, #3*2*4] @ s10,s11 = z[3]
@ stall
vadd.f s12, s0, s8 @ i0
vadd.f s13, s1, s9 @ i1
vadd.f s14, s2, s10 @ i2
vadd.f s15, s3, s11 @ i3
vsub.f s8, s0, s8 @ i4
vsub.f s9, s1, s9 @ i5
vsub.f s10, s2, s10 @ i6
vsub.f s11, s3, s11 @ i7
@ stall
@ stall
vadd.f s0, s12, s14 @ z[0].re
vsub.f s4, s12, s14 @ z[2].re
vadd.f s1, s13, s15 @ z[0].im
vsub.f s5, s13, s15 @ z[2].im
vadd.f s7, s9, s10 @ z[3].im
vsub.f s3, s9, s10 @ z[1].im
vadd.f s2, s8, s11 @ z[1].re
vsub.f s6, s8, s11 @ z[3].re
@ stall
@ stall
vstr d0, [a1, #0*2*4]
vstr d2, [a1, #2*2*4]
@ stall
@ stall
vstr d1, [a1, #1*2*4]
vstr d3, [a1, #3*2*4]
bx lr
endfunc
.macro macro_fft8_head
@ FFT4
vldr d4, [a1, #0 * 2*4]
vldr d6, [a1, #1 * 2*4]
vldr d5, [a1, #2 * 2*4]
vldr d7, [a1, #3 * 2*4]
@ BF
vldr d12, [a1, #4 * 2*4]
vadd.f s16, s8, s12 @ vector op
vldr d14, [a1, #5 * 2*4]
vldr d13, [a1, #6 * 2*4]
vldr d15, [a1, #7 * 2*4]
vsub.f s20, s8, s12 @ vector op
vadd.f s0, s16, s18
vsub.f s2, s16, s18
vadd.f s1, s17, s19
vsub.f s3, s17, s19
vadd.f s7, s21, s22
vsub.f s5, s21, s22
vadd.f s4, s20, s23
vsub.f s6, s20, s23
vsub.f s20, s24, s28 @ vector op
vstr d0, [a1, #0 * 2*4] @ transfer s0-s7 to s24-s31 via memory
vstr d1, [a1, #1 * 2*4]
vldr s0, cos1pi4
vadd.f s16, s24, s28 @ vector op
vstr d2, [a1, #2 * 2*4]
vstr d3, [a1, #3 * 2*4]
vldr d12, [a1, #0 * 2*4]
@ TRANSFORM
vmul.f s20, s20, s0 @ vector x scalar op
vldr d13, [a1, #1 * 2*4]
vldr d14, [a1, #2 * 2*4]
vldr d15, [a1, #3 * 2*4]
@ BUTTERFLIES
vadd.f s0, s18, s16
vadd.f s1, s17, s19
vsub.f s2, s17, s19
vsub.f s3, s18, s16
vadd.f s4, s21, s20
vsub.f s5, s21, s20
vadd.f s6, s22, s23
vsub.f s7, s22, s23
vadd.f s8, s0, s24 @ vector op
vstr d0, [a1, #0 * 2*4] @ transfer s0-s3 to s12-s15 via memory
vstr d1, [a1, #1 * 2*4]
vldr d6, [a1, #0 * 2*4]
vldr d7, [a1, #1 * 2*4]
vadd.f s1, s5, s6
vadd.f s0, s7, s4
vsub.f s2, s5, s6
vsub.f s3, s7, s4
vsub.f s12, s24, s12 @ vector op
vsub.f s5, s29, s1
vsub.f s4, s28, s0
vsub.f s6, s30, s2
vsub.f s7, s31, s3
vadd.f s16, s0, s28 @ vector op
vstr d6, [a1, #4 * 2*4]
vstr d7, [a1, #6 * 2*4]
vstr d4, [a1, #0 * 2*4]
vstr d5, [a1, #2 * 2*4]
vstr d2, [a1, #5 * 2*4]
vstr d3, [a1, #7 * 2*4]
.endm
.macro macro_fft8_tail
vstr d8, [a1, #1 * 2*4]
vstr d9, [a1, #3 * 2*4]
.endm
function .Lfft8_internal_vfp
macro_fft8_head
macro_fft8_tail
bx lr
endfunc
function fft8_vfp
ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
fmrx a2, FPSCR
fmxr FPSCR, a3
vpush {s16-s31}
mov ip, lr
bl .Lfft8_internal_vfp
vpop {s16-s31}
fmxr FPSCR, a2
bx ip
endfunc
.align 3
cos1pi4: @ cos(1*pi/4) = sqrt(2)/2
.float 0.707106769084930419921875
cos1pi8: @ cos(1*pi/8) = sqrt(2+sqrt(2))/2
.float 0.92387950420379638671875
cos3pi8: @ cos(3*pi/8) = sqrt(2-sqrt(2))/2
.float 0.3826834261417388916015625
function .Lfft16_internal_vfp
macro_fft8_head
@ FFT4(z+8)
vldr d10, [a1, #8 * 2*4]
vldr d12, [a1, #9 * 2*4]
vldr d11, [a1, #10 * 2*4]
vldr d13, [a1, #11 * 2*4]
macro_fft8_tail
vadd.f s16, s20, s24 @ vector op
@ FFT4(z+12)
vldr d4, [a1, #12 * 2*4]
vldr d6, [a1, #13 * 2*4]
vldr d5, [a1, #14 * 2*4]
vsub.f s20, s20, s24 @ vector op
vldr d7, [a1, #15 * 2*4]
vadd.f s0, s16, s18
vsub.f s4, s16, s18
vadd.f s1, s17, s19
vsub.f s5, s17, s19
vadd.f s7, s21, s22
vsub.f s3, s21, s22
vadd.f s2, s20, s23
vsub.f s6, s20, s23
vadd.f s16, s8, s12 @ vector op
vstr d0, [a1, #8 * 2*4]
vstr d2, [a1, #10 * 2*4]
vstr d1, [a1, #9 * 2*4]
vsub.f s20, s8, s12
vstr d3, [a1, #11 * 2*4]
@ TRANSFORM(z[2],z[6],z[10],z[14],cos1pi4,cos1pi4)
vldr d12, [a1, #10 * 2*4]
vadd.f s0, s16, s18
vadd.f s1, s17, s19
vsub.f s6, s16, s18
vsub.f s7, s17, s19
vsub.f s3, s21, s22
vadd.f s2, s20, s23
vadd.f s5, s21, s22
vsub.f s4, s20, s23
vstr d0, [a1, #12 * 2*4]
vmov s0, s6
@ TRANSFORM(z[1],z[5],z[9],z[13],cos1pi8,cos3pi8)
vldr d6, [a1, #9 * 2*4]
vstr d1, [a1, #13 * 2*4]
vldr d1, cos1pi4 @ s2 = cos1pi4, s3 = cos1pi8
vstr d2, [a1, #15 * 2*4]
vldr d7, [a1, #13 * 2*4]
vadd.f s4, s25, s24
vsub.f s5, s25, s24
vsub.f s6, s0, s7
vadd.f s7, s0, s7
vmul.f s20, s12, s3 @ vector op
@ TRANSFORM(z[3],z[7],z[11],z[15],cos3pi8,cos1pi8)
vldr d4, [a1, #11 * 2*4]
vldr d5, [a1, #15 * 2*4]
vldr s1, cos3pi8
vmul.f s24, s4, s2 @ vector * scalar op
vmul.f s28, s12, s1 @ vector * scalar op
vmul.f s12, s8, s1 @ vector * scalar op
vadd.f s4, s20, s29
vsub.f s5, s21, s28
vsub.f s6, s22, s31
vadd.f s7, s23, s30
vmul.f s8, s8, s3 @ vector * scalar op
vldr d8, [a1, #1 * 2*4]
vldr d9, [a1, #5 * 2*4]
vldr d10, [a1, #3 * 2*4]
vldr d11, [a1, #7 * 2*4]
vldr d14, [a1, #2 * 2*4]
vadd.f s0, s6, s4
vadd.f s1, s5, s7
vsub.f s2, s5, s7
vsub.f s3, s6, s4
vadd.f s4, s12, s9
vsub.f s5, s13, s8
vsub.f s6, s14, s11
vadd.f s7, s15, s10
vadd.f s12, s0, s16 @ vector op
vstr d0, [a1, #1 * 2*4]
vstr d1, [a1, #5 * 2*4]
vldr d4, [a1, #1 * 2*4]
vldr d5, [a1, #5 * 2*4]
vadd.f s0, s6, s4
vadd.f s1, s5, s7
vsub.f s2, s5, s7
vsub.f s3, s6, s4
vsub.f s8, s16, s8 @ vector op
vstr d6, [a1, #1 * 2*4]
vstr d7, [a1, #5 * 2*4]
vldr d15, [a1, #6 * 2*4]
vsub.f s4, s20, s0
vsub.f s5, s21, s1
vsub.f s6, s22, s2
vsub.f s7, s23, s3
vadd.f s20, s0, s20 @ vector op
vstr d4, [a1, #9 * 2*4]
@ TRANSFORM_ZERO(z[0],z[4],z[8],z[12])
vldr d6, [a1, #8 * 2*4]
vstr d5, [a1, #13 * 2*4]
vldr d7, [a1, #12 * 2*4]
vstr d2, [a1, #11 * 2*4]
vldr d8, [a1, #0 * 2*4]
vstr d3, [a1, #15 * 2*4]
vldr d9, [a1, #4 * 2*4]
vadd.f s0, s26, s24
vadd.f s1, s25, s27
vsub.f s2, s25, s27
vsub.f s3, s26, s24
vadd.f s4, s14, s12
vadd.f s5, s13, s15
vsub.f s6, s13, s15
vsub.f s7, s14, s12
vadd.f s8, s0, s28 @ vector op
vstr d0, [a1, #3 * 2*4]
vstr d1, [a1, #7 * 2*4]
vldr d6, [a1, #3 * 2*4]
vldr d7, [a1, #7 * 2*4]
vsub.f s0, s16, s4
vsub.f s1, s17, s5
vsub.f s2, s18, s6
vsub.f s3, s19, s7
vsub.f s12, s28, s12 @ vector op
vadd.f s16, s4, s16 @ vector op
vstr d10, [a1, #3 * 2*4]
vstr d11, [a1, #7 * 2*4]
vstr d4, [a1, #2 * 2*4]
vstr d5, [a1, #6 * 2*4]
vstr d0, [a1, #8 * 2*4]
vstr d1, [a1, #12 * 2*4]
vstr d6, [a1, #10 * 2*4]
vstr d7, [a1, #14 * 2*4]
vstr d8, [a1, #0 * 2*4]
vstr d9, [a1, #4 * 2*4]
bx lr
endfunc
function ff_fft16_vfp, export=1
ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
fmrx a2, FPSCR
fmxr FPSCR, a3
vpush {s16-s31}
mov ip, lr
bl .Lfft16_internal_vfp
vpop {s16-s31}
fmxr FPSCR, a2
bx ip
endfunc
.macro pass n, z0, z1, z2, z3
add v6, v5, #4*2*\n
@ TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3])
@ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
@ TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0])
@ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
vldr d8, [\z2, #8*(o2+1)] @ s16,s17
vldmdb v6!, {s2}
vldr d9, [\z3, #8*(o3+1)] @ s18,s19
vldmia v5!, {s0,s1} @ s0 is unused
vldr s7, [\z2, #8*o2] @ t1
vmul.f s20, s16, s2 @ vector * scalar
vldr s0, [\z3, #8*o3] @ t5
vldr s6, [\z2, #8*o2+4] @ t2
vldr s3, [\z3, #8*o3+4] @ t6
vmul.f s16, s16, s1 @ vector * scalar
ldr a4, =\n-1
1: add \z0, \z0, #8*2
.if \n*4*2 >= 512
add \z1, \z1, #8*2
.endif
.if \n*4*2 >= 256
add \z2, \z2, #8*2
.endif
.if \n*4*2 >= 512
add \z3, \z3, #8*2
.endif
@ up to 2 stalls (VFP vector issuing / waiting for s0)
@ depending upon whether this is the first iteration and
@ how many add instructions are inserted above
vadd.f s4, s0, s7 @ t5
vadd.f s5, s6, s3 @ t6
vsub.f s6, s6, s3 @ t4
vsub.f s7, s0, s7 @ t3
vldr d6, [\z0, #8*0-8*2] @ s12,s13
vadd.f s0, s16, s21 @ t1
vldr d7, [\z1, #8*o1-8*2] @ s14,s15
vsub.f s1, s18, s23 @ t5
vadd.f s8, s4, s12 @ vector + vector
@ stall (VFP vector issuing)
@ stall (VFP vector issuing)
@ stall (VFP vector issuing)
vsub.f s4, s12, s4
vsub.f s5, s13, s5
vsub.f s6, s14, s6
vsub.f s7, s15, s7
vsub.f s2, s17, s20 @ t2
vadd.f s3, s19, s22 @ t6
vstr d4, [\z0, #8*0-8*2] @ s8,s9
vstr d5, [\z1, #8*o1-8*2] @ s10,s11
@ stall (waiting for s5)
vstr d2, [\z2, #8*o2-8*2] @ s4,s5
vadd.f s4, s1, s0 @ t5
vstr d3, [\z3, #8*o3-8*2] @ s6,s7
vsub.f s7, s1, s0 @ t3
vadd.f s5, s2, s3 @ t6
vsub.f s6, s2, s3 @ t4
vldr d6, [\z0, #8*1-8*2] @ s12,s13
vldr d7, [\z1, #8*(o1+1)-8*2] @ s14,s15
vldr d4, [\z2, #8*o2] @ s8,s9
vldmdb v6!, {s2,s3}
vldr d5, [\z3, #8*o3] @ s10,s11
vadd.f s20, s4, s12 @ vector + vector
vldmia v5!, {s0,s1}
vldr d8, [\z2, #8*(o2+1)] @ s16,s17
@ stall (VFP vector issuing)
vsub.f s4, s12, s4
vsub.f s5, s13, s5
vsub.f s6, s14, s6
vsub.f s7, s15, s7
vmul.f s12, s8, s3 @ vector * scalar
vstr d10, [\z0, #8*1-8*2] @ s20,s21
vldr d9, [\z3, #8*(o3+1)] @ s18,s19
vstr d11, [\z1, #8*(o1+1)-8*2] @ s22,s23
vmul.f s8, s8, s0 @ vector * scalar
vstr d2, [\z2, #8*(o2+1)-8*2] @ s4,s5
@ stall (waiting for s7)
vstr d3, [\z3, #8*(o3+1)-8*2] @ s6,s7
vmul.f s20, s16, s2 @ vector * scalar
@ stall (VFP vector issuing)
@ stall (VFP vector issuing)
@ stall (VFP vector issuing)
vadd.f s7, s8, s13 @ t1
vsub.f s6, s9, s12 @ t2
vsub.f s0, s10, s15 @ t5
vadd.f s3, s11, s14 @ t6
vmul.f s16, s16, s1 @ vector * scalar
subs a4, a4, #1
bne 1b
@ What remains is identical to the first two indentations of
@ the above, but without the increment of z
vadd.f s4, s0, s7 @ t5
vadd.f s5, s6, s3 @ t6
vsub.f s6, s6, s3 @ t4
vsub.f s7, s0, s7 @ t3
vldr d6, [\z0, #8*0] @ s12,s13
vadd.f s0, s16, s21 @ t1
vldr d7, [\z1, #8*o1] @ s14,s15
vsub.f s1, s18, s23 @ t5
vadd.f s8, s4, s12 @ vector + vector
vsub.f s4, s12, s4
vsub.f s5, s13, s5
vsub.f s6, s14, s6
vsub.f s7, s15, s7
vsub.f s2, s17, s20 @ t2
vadd.f s3, s19, s22 @ t6
vstr d4, [\z0, #8*0] @ s8,s9
vstr d5, [\z1, #8*o1] @ s10,s11
vstr d2, [\z2, #8*o2] @ s4,s5
vadd.f s4, s1, s0 @ t5
vstr d3, [\z3, #8*o3] @ s6,s7
vsub.f s7, s1, s0 @ t3
vadd.f s5, s2, s3 @ t6
vsub.f s6, s2, s3 @ t4
vldr d6, [\z0, #8*1] @ s12,s13
vldr d7, [\z1, #8*(o1+1)] @ s14,s15
vadd.f s20, s4, s12 @ vector + vector
vsub.f s4, s12, s4
vsub.f s5, s13, s5
vsub.f s6, s14, s6
vsub.f s7, s15, s7
vstr d10, [\z0, #8*1] @ s20,s21
vstr d11, [\z1, #8*(o1+1)] @ s22,s23
vstr d2, [\z2, #8*(o2+1)] @ s4,s5
vstr d3, [\z3, #8*(o3+1)] @ s6,s7
.endm
.macro def_fft n, n2, n4
function .Lfft\n\()_internal_vfp
.if \n >= 512
push {v1-v6,lr}
.elseif \n >= 256
push {v1-v2,v5-v6,lr}
.else
push {v1,v5-v6,lr}
.endif
mov v1, a1
bl .Lfft\n2\()_internal_vfp
add a1, v1, #8*(\n/4)*2
bl .Lfft\n4\()_internal_vfp
movrelx v5, X(ff_cos_\n), a1
add a1, v1, #8*(\n/4)*3
bl .Lfft\n4\()_internal_vfp
.if \n >= 512
.set o1, 0*(\n/4/2)
.set o2, 0*(\n/4/2)
.set o3, 0*(\n/4/2)
add v2, v1, #8*2*(\n/4/2)
add v3, v1, #8*4*(\n/4/2)
add v4, v1, #8*6*(\n/4/2)
pass (\n/4/2), v1, v2, v3, v4
pop {v1-v6,pc}
.elseif \n >= 256
.set o1, 2*(\n/4/2)
.set o2, 0*(\n/4/2)
.set o3, 2*(\n/4/2)
add v2, v1, #8*4*(\n/4/2)
pass (\n/4/2), v1, v1, v2, v2
pop {v1-v2,v5-v6,pc}
.else
.set o1, 2*(\n/4/2)
.set o2, 4*(\n/4/2)
.set o3, 6*(\n/4/2)
pass (\n/4/2), v1, v1, v1, v1
pop {v1,v5-v6,pc}
.endif
endfunc
function fft\n\()_vfp
ldr a3, =0x03030000 /* RunFast mode, vector length 4, stride 1 */
fmrx a2, FPSCR
fmxr FPSCR, a3
vpush {s16-s31}
mov ip, lr
bl .Lfft\n\()_internal_vfp
vpop {s16-s31}
fmxr FPSCR, a2
bx ip
endfunc
.ltorg
.endm
def_fft 32, 16, 8
def_fft 64, 32, 16
def_fft 128, 64, 32
def_fft 256, 128, 64
def_fft 512, 256, 128
def_fft 1024, 512, 256
def_fft 2048, 1024, 512
def_fft 4096, 2048, 1024
def_fft 8192, 4096, 2048
def_fft 16384, 8192, 4096
def_fft 32768, 16384, 8192
def_fft 65536, 32768, 16384

View File

@@ -0,0 +1,146 @@
/*
* Copyright (c) 2012 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
function flac_lpc_16_1_arm
ldr r12, [sp]
push {r4, lr}
ldr r1, [r1]
subs r12, r12, #2
ldr lr, [r0], #4
beq 2f
it lt
poplt {r4, pc}
1:
mul r4, lr, r1
ldm r0, {r2, lr}
add_sh r2, r2, r4, asr r3
mul r4, r2, r1
subs r12, r12, #2
add_sh lr, lr, r4, asr r3
stm r0!, {r2, lr}
bgt 1b
it lt
poplt {r4, pc}
2:
mul r4, lr, r1
ldr r2, [r0]
add_sh r2, r2, r4, asr r3
str r2, [r0]
pop {r4, pc}
endfunc
function flac_lpc_16_2_arm
ldr r12, [sp]
subs r12, r12, r2
it le
bxle lr
push {r4-r9, lr}
ldm r0!, {r6, r7}
ldm r1, {r8, r9}
subs r12, r12, #1
beq 2f
1:
mul r4, r6, r8
mul r5, r7, r8
mla r4, r7, r9, r4
ldm r0, {r6, r7}
add_sh r6, r6, r4, asr r3
mla r5, r6, r9, r5
add_sh r7, r7, r5, asr r3
stm r0!, {r6, r7}
subs r12, r12, #2
bgt 1b
it lt
poplt {r4-r9, pc}
2:
mul r4, r6, r8
mla r4, r7, r9, r4
ldr r5, [r0]
add_sh r5, r5, r4, asr r3
str r5, [r0]
pop {r4-r9, pc}
endfunc
function ff_flac_lpc_16_arm, export=1
cmp r2, #2
blt flac_lpc_16_1_arm
beq flac_lpc_16_2_arm
ldr r12, [sp]
subs r12, r12, r2
it le
bxle lr
push {r4-r9, lr}
subs r12, r12, #1
beq 3f
1:
sub lr, r2, #2
mov r4, #0
mov r5, #0
ldr r7, [r0], #4
ldr r9, [r1], #4
2:
mla r4, r7, r9, r4
ldm r0!, {r6, r7}
mla r5, r6, r9, r5
ldm r1!, {r8, r9}
mla r4, r6, r8, r4
subs lr, lr, #2
mla r5, r7, r8, r5
bgt 2b
blt 6f
mla r4, r7, r9, r4
ldr r7, [r0], #4
mla r5, r7, r9, r5
ldr r9, [r1], #4
6:
mla r4, r7, r9, r4
ldm r0, {r6, r7}
add_sh r6, r6, r4, asr r3
mla r5, r6, r9, r5
add_sh r7, r7, r5, asr r3
stm r0!, {r6, r7}
sub r0, r0, r2, lsl #2
sub r1, r1, r2, lsl #2
subs r12, r12, #2
bgt 1b
it lt
poplt {r4-r9, pc}
3:
mov r4, #0
4:
ldr r5, [r1], #4
ldr r6, [r0], #4
mla r4, r5, r6, r4
subs r2, r2, #1
bgt 4b
ldr r5, [r0]
add_sh r5, r5, r4, asr r3
str r5, [r0]
pop {r4-r9, pc}
endfunc

View File

@@ -0,0 +1,32 @@
/*
* Copyright (c) 2012 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavcodec/flacdsp.h"
#include "config.h"
void ff_flac_lpc_16_arm(int32_t *samples, const int coeffs[32], int order,
int qlevel, int len);
av_cold void ff_flacdsp_init_arm(FLACDSPContext *c, enum AVSampleFormat fmt, int channels,
int bps)
{
if (CONFIG_FLAC_DECODER)
c->lpc16 = ff_flac_lpc_16_arm;
}

View File

@@ -0,0 +1,51 @@
/*
* ARM optimized Format Conversion Utils
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/fmtconvert.h"
void ff_int32_to_float_fmul_scalar_neon(float *dst, const int32_t *src,
float mul, int len);
void ff_int32_to_float_fmul_scalar_vfp(float *dst, const int32_t *src,
float mul, int len);
void ff_int32_to_float_fmul_array8_vfp(FmtConvertContext *c, float *dst,
const int32_t *src, const float *mul,
int len);
av_cold void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx)
{
int cpu_flags = av_get_cpu_flags();
if (have_vfp(cpu_flags)) {
if (!have_vfpv3(cpu_flags)) {
c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_vfp;
c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_vfp;
}
}
if (have_neon(cpu_flags)) {
c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon;
}
}

View File

@@ -0,0 +1,51 @@
/*
* ARM NEON optimised Format Conversion Utils
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/arm/asm.S"
function ff_int32_to_float_fmul_scalar_neon, export=1
VFP vdup.32 q0, d0[0]
VFP len .req r2
NOVFP vdup.32 q0, r2
NOVFP len .req r3
vld1.32 {q1},[r1,:128]!
vcvt.f32.s32 q3, q1
vld1.32 {q2},[r1,:128]!
vcvt.f32.s32 q8, q2
1: subs len, len, #8
pld [r1, #16]
vmul.f32 q9, q3, q0
vmul.f32 q10, q8, q0
beq 2f
vld1.32 {q1},[r1,:128]!
vcvt.f32.s32 q3, q1
vld1.32 {q2},[r1,:128]!
vcvt.f32.s32 q8, q2
vst1.32 {q9}, [r0,:128]!
vst1.32 {q10},[r0,:128]!
b 1b
2: vst1.32 {q9}, [r0,:128]!
vst1.32 {q10},[r0,:128]!
bx lr
.unreq len
endfunc

View File

@@ -0,0 +1,221 @@
/*
* Copyright (c) 2013 RISC OS Open Ltd <bavison@riscosopen.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/arm/asm.S"
/**
* ARM VFP optimised int32 to float conversion.
* Assumes len is a multiple of 8 and that the destination buffer is at least
* 4-byte aligned (16-byte alignment is best for BCM2835); little-endian.
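* Each group of 8 output floats is scaled by the next element of mul[] (one
* scale factor per 8 samples).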
*/
@ void ff_int32_to_float_fmul_array8_vfp(FmtConvertContext *c, float *dst, const int32_t *src, const float *mul, int len)
function ff_int32_to_float_fmul_array8_vfp, export=1
push {lr}
ldr a1, [sp, #4]
subs lr, a1, #3*8
bcc 50f @ too short to pipeline
@ Now need to find (len / 8) % 3. The approximation
@ x / 24 = (x * 0xAB) >> 12
@ is good for x < 4096, which is true for both AC3 and DCA.
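@ Illustrative check: len = 256 gives lr = 232, (232 * 0xAB) >> 12 = 39672 >> 12
@ = 9 = 232 / 24, so (lr >> 3) - 3 * 9 = 29 - 27 = 2 = (len / 8) % 3.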
mov a1, #0xAB
ldr ip, =0x03070000 @ RunFast mode, short vectors of length 8, stride 1
mul a1, lr, a1
vpush {s16-s31}
mov a1, a1, lsr #12
add a1, a1, a1, lsl #1
rsb a1, a1, lr, lsr #3
cmp a1, #1
fmrx a1, FPSCR
fmxr FPSCR, ip
beq 11f
blo 10f
@ Array is (2 + multiple of 3) x 8 floats long
@ drop through...
vldmia a3!, {s16-s23}
vldmia a4!, {s2,s3}
vldmia a3!, {s24-s31}
vcvt.f32.s32 s16, s16
vcvt.f32.s32 s17, s17
vcvt.f32.s32 s18, s18
vcvt.f32.s32 s19, s19
vcvt.f32.s32 s20, s20
vcvt.f32.s32 s21, s21
vcvt.f32.s32 s22, s22
vcvt.f32.s32 s23, s23
vmul.f32 s16, s16, s2
@ drop through...
3:
vldmia a3!, {s8-s15}
vldmia a4!, {s1}
vcvt.f32.s32 s24, s24
vcvt.f32.s32 s25, s25
vcvt.f32.s32 s26, s26
vcvt.f32.s32 s27, s27
vcvt.f32.s32 s28, s28
vcvt.f32.s32 s29, s29
vcvt.f32.s32 s30, s30
vcvt.f32.s32 s31, s31
vmul.f32 s24, s24, s3
vstmia a2!, {s16-s19}
vstmia a2!, {s20-s23}
2:
vldmia a3!, {s16-s23}
vldmia a4!, {s2}
vcvt.f32.s32 s8, s8
vcvt.f32.s32 s9, s9
vcvt.f32.s32 s10, s10
vcvt.f32.s32 s11, s11
vcvt.f32.s32 s12, s12
vcvt.f32.s32 s13, s13
vcvt.f32.s32 s14, s14
vcvt.f32.s32 s15, s15
vmul.f32 s8, s8, s1
vstmia a2!, {s24-s27}
vstmia a2!, {s28-s31}
1:
vldmia a3!, {s24-s31}
vldmia a4!, {s3}
vcvt.f32.s32 s16, s16
vcvt.f32.s32 s17, s17
vcvt.f32.s32 s18, s18
vcvt.f32.s32 s19, s19
vcvt.f32.s32 s20, s20
vcvt.f32.s32 s21, s21
vcvt.f32.s32 s22, s22
vcvt.f32.s32 s23, s23
vmul.f32 s16, s16, s2
vstmia a2!, {s8-s11}
vstmia a2!, {s12-s15}
subs lr, lr, #8*3
bpl 3b
vcvt.f32.s32 s24, s24
vcvt.f32.s32 s25, s25
vcvt.f32.s32 s26, s26
vcvt.f32.s32 s27, s27
vcvt.f32.s32 s28, s28
vcvt.f32.s32 s29, s29
vcvt.f32.s32 s30, s30
vcvt.f32.s32 s31, s31
vmul.f32 s24, s24, s3
vstmia a2!, {s16-s19}
vstmia a2!, {s20-s23}
vstmia a2!, {s24-s27}
vstmia a2!, {s28-s31}
fmxr FPSCR, a1
vpop {s16-s31}
pop {pc}
10: @ Array is (multiple of 3) x 8 floats long
vldmia a3!, {s8-s15}
vldmia a4!, {s1,s2}
vldmia a3!, {s16-s23}
vcvt.f32.s32 s8, s8
vcvt.f32.s32 s9, s9
vcvt.f32.s32 s10, s10
vcvt.f32.s32 s11, s11
vcvt.f32.s32 s12, s12
vcvt.f32.s32 s13, s13
vcvt.f32.s32 s14, s14
vcvt.f32.s32 s15, s15
vmul.f32 s8, s8, s1
b 1b
11: @ Array is (1 + multiple of 3) x 8 floats long
vldmia a3!, {s24-s31}
vldmia a4!, {s3}
vldmia a3!, {s8-s15}
vldmia a4!, {s1}
vcvt.f32.s32 s24, s24
vcvt.f32.s32 s25, s25
vcvt.f32.s32 s26, s26
vcvt.f32.s32 s27, s27
vcvt.f32.s32 s28, s28
vcvt.f32.s32 s29, s29
vcvt.f32.s32 s30, s30
vcvt.f32.s32 s31, s31
vmul.f32 s24, s24, s3
b 2b
50:
ldr lr, =0x03070000 @ RunFast mode, short vectors of length 8, stride 1
fmrx ip, FPSCR
fmxr FPSCR, lr
51:
vldmia a3!, {s8-s15}
vldmia a4!, {s0}
vcvt.f32.s32 s8, s8
vcvt.f32.s32 s9, s9
vcvt.f32.s32 s10, s10
vcvt.f32.s32 s11, s11
vcvt.f32.s32 s12, s12
vcvt.f32.s32 s13, s13
vcvt.f32.s32 s14, s14
vcvt.f32.s32 s15, s15
vmul.f32 s8, s8, s0
subs a1, a1, #8
vstmia a2!, {s8-s11}
vstmia a2!, {s12-s15}
bne 51b
fmxr FPSCR, ip
pop {pc}
endfunc
/**
* ARM VFP optimised int32 to float conversion.
* Assumes len is a multiple of 8 and that the destination buffer is at least
* 4-byte aligned (16-byte alignment is best for BCM2835); little-endian.
* TODO: could be further optimised by unrolling and interleaving, as above
*/
@ void ff_int32_to_float_fmul_scalar_vfp(float *dst, const int32_t *src, float mul, int len)
function ff_int32_to_float_fmul_scalar_vfp, export=1
VFP tmp .req a4
VFP len .req a3
NOVFP tmp .req a3
NOVFP len .req a4
NOVFP vmov s0, a3
ldr tmp, =0x03070000 @ RunFast mode, short vectors of length 8, stride 1
fmrx ip, FPSCR
fmxr FPSCR, tmp
1:
vldmia a2!, {s8-s15}
vcvt.f32.s32 s8, s8
vcvt.f32.s32 s9, s9
vcvt.f32.s32 s10, s10
vcvt.f32.s32 s11, s11
vcvt.f32.s32 s12, s12
vcvt.f32.s32 s13, s13
vcvt.f32.s32 s14, s14
vcvt.f32.s32 s15, s15
vmul.f32 s8, s8, s0
subs len, len, #8
vstmia a1!, {s8-s11}
vstmia a1!, {s12-s15}
bne 1b
fmxr FPSCR, ip
bx lr
endfunc
.unreq tmp
.unreq len

View File

@@ -0,0 +1,35 @@
/*
* Copyright (c) 2015 Peter Meerwald <pmeerw@pmeerw.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/g722dsp.h"
extern void ff_g722_apply_qmf_neon(const int16_t *prev_samples, int xout[2]);
av_cold void ff_g722dsp_init_arm(G722DSPContext *dsp)
{
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags))
dsp->apply_qmf = ff_g722_apply_qmf_neon;
}

View File

@@ -0,0 +1,69 @@
/*
* ARM NEON optimised DSP functions for G722 coding
* Copyright (c) 2015 Peter Meerwald <pmeerw@pmeerw.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
function ff_g722_apply_qmf_neon, export=1, align=4
movrel r3, qmf_coeffs
vld1.s16 {d2,d3,d4}, [r0]! /* load prev_samples */
vld1.s16 {d16,d17,d18}, [r3,:64]! /* load qmf_coeffs */
vmull.s16 q0, d2, d16
vmlal.s16 q0, d3, d17
vmlal.s16 q0, d4, d18
vld1.s16 {d5,d6,d7}, [r0]! /* load prev_samples */
vld1.s16 {d19,d20,d21}, [r3,:64]! /* load qmf_coeffs */
vmlal.s16 q0, d5, d19
vmlal.s16 q0, d6, d20
vmlal.s16 q0, d7, d21
vadd.s32 d0, d1, d0
vrev64.32 d0, d0
vst1.s32 {d0}, [r1]
bx lr
endfunc
const qmf_coeffs, align=4
.hword 3
.hword -11
.hword -11
.hword 53
.hword 12
.hword -156
.hword 32
.hword 362
.hword -210
.hword -805
.hword 951
.hword 3876
.hword 3876
.hword 951
.hword -805
.hword -210
.hword 362
.hword 32
.hword -156
.hword 12
.hword 53
.hword -11
.hword -11
.hword 3
endconst

View File

@@ -0,0 +1,51 @@
/*
* ARM NEON optimised H.264 chroma functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/h264chroma.h"
void ff_put_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
void ff_put_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
void ff_put_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int);
void ff_avg_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
void ff_avg_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
void ff_avg_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int);
av_cold void ff_h264chroma_init_arm(H264ChromaContext *c, int bit_depth)
{
const int high_bit_depth = bit_depth > 8;
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags) && !high_bit_depth) {
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon;
c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon;
c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_neon;
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_neon;
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_neon;
c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_neon;
}
}

View File

@@ -0,0 +1,463 @@
/*
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
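/* The x/y offsets select eighth-pel bilinear weights A = (8-x)*(8-y),
 * B = x*(8-y), C = (8-x)*y, D = x*y over the four neighbouring source pixels
 * a,b,c,d; each output pixel is (A*a + B*b + C*c + D*d + 32) >> 6 for H.264,
 * while the rv40/vc1 variants add their bias and use a non-rounding shift. */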
.macro h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
push {r4-r7, lr}
ldrd r4, r5, [sp, #20]
.ifc \type,avg
mov lr, r0
.endif
pld [r1]
pld [r1, r2]
.ifc \codec,rv40
movrel r6, rv40bias
lsr r7, r5, #1
add r6, r6, r7, lsl #3
lsr r7, r4, #1
add r6, r6, r7, lsl #1
vld1.16 {d22[],d23[]}, [r6,:16]
.endif
.ifc \codec,vc1
vmov.u16 q11, #28
.endif
A muls r7, r4, r5
T mul r7, r4, r5
T cmp r7, #0
rsb r6, r7, r5, lsl #3
rsb r12, r7, r4, lsl #3
sub r4, r7, r4, lsl #3
sub r4, r4, r5, lsl #3
add r4, r4, #64
beq 2f
vdup.8 d0, r4
vdup.8 d1, r12
vld1.8 {d4, d5}, [r1], r2
vdup.8 d2, r6
vdup.8 d3, r7
vext.8 d5, d4, d5, #1
1: vld1.8 {d6, d7}, [r1], r2
vmull.u8 q8, d4, d0
vmlal.u8 q8, d5, d1
vext.8 d7, d6, d7, #1
vld1.8 {d4, d5}, [r1], r2
vmlal.u8 q8, d6, d2
pld [r1]
vext.8 d5, d4, d5, #1
vmlal.u8 q8, d7, d3
vmull.u8 q9, d6, d0
subs r3, r3, #2
vmlal.u8 q9, d7, d1
vmlal.u8 q9, d4, d2
vmlal.u8 q9, d5, d3
pld [r1, r2]
.ifc \codec,h264
vrshrn.u16 d16, q8, #6
vrshrn.u16 d17, q9, #6
.else
vadd.u16 q8, q8, q11
vadd.u16 q9, q9, q11
vshrn.u16 d16, q8, #6
vshrn.u16 d17, q9, #6
.endif
.ifc \type,avg
vld1.8 {d20}, [lr,:64], r2
vld1.8 {d21}, [lr,:64], r2
vrhadd.u8 q8, q8, q10
.endif
vst1.8 {d16}, [r0,:64], r2
vst1.8 {d17}, [r0,:64], r2
bgt 1b
pop {r4-r7, pc}
2: adds r12, r12, r6
vdup.8 d0, r4
beq 5f
tst r6, r6
vdup.8 d1, r12
beq 4f
vld1.8 {d4}, [r1], r2
3: vld1.8 {d6}, [r1], r2
vmull.u8 q8, d4, d0
vmlal.u8 q8, d6, d1
vld1.8 {d4}, [r1], r2
vmull.u8 q9, d6, d0
vmlal.u8 q9, d4, d1
pld [r1]
.ifc \codec,h264
vrshrn.u16 d16, q8, #6
vrshrn.u16 d17, q9, #6
.else
vadd.u16 q8, q8, q11
vadd.u16 q9, q9, q11
vshrn.u16 d16, q8, #6
vshrn.u16 d17, q9, #6
.endif
pld [r1, r2]
.ifc \type,avg
vld1.8 {d20}, [lr,:64], r2
vld1.8 {d21}, [lr,:64], r2
vrhadd.u8 q8, q8, q10
.endif
subs r3, r3, #2
vst1.8 {d16}, [r0,:64], r2
vst1.8 {d17}, [r0,:64], r2
bgt 3b
pop {r4-r7, pc}
4: vld1.8 {d4, d5}, [r1], r2
vld1.8 {d6, d7}, [r1], r2
vext.8 d5, d4, d5, #1
vext.8 d7, d6, d7, #1
pld [r1]
subs r3, r3, #2
vmull.u8 q8, d4, d0
vmlal.u8 q8, d5, d1
vmull.u8 q9, d6, d0
vmlal.u8 q9, d7, d1
pld [r1, r2]
.ifc \codec,h264
vrshrn.u16 d16, q8, #6
vrshrn.u16 d17, q9, #6
.else
vadd.u16 q8, q8, q11
vadd.u16 q9, q9, q11
vshrn.u16 d16, q8, #6
vshrn.u16 d17, q9, #6
.endif
.ifc \type,avg
vld1.8 {d20}, [lr,:64], r2
vld1.8 {d21}, [lr,:64], r2
vrhadd.u8 q8, q8, q10
.endif
vst1.8 {d16}, [r0,:64], r2
vst1.8 {d17}, [r0,:64], r2
bgt 4b
pop {r4-r7, pc}
5: vld1.8 {d4}, [r1], r2
vld1.8 {d5}, [r1], r2
pld [r1]
subs r3, r3, #2
vmull.u8 q8, d4, d0
vmull.u8 q9, d5, d0
pld [r1, r2]
.ifc \codec,h264
vrshrn.u16 d16, q8, #6
vrshrn.u16 d17, q9, #6
.else
vadd.u16 q8, q8, q11
vadd.u16 q9, q9, q11
vshrn.u16 d16, q8, #6
vshrn.u16 d17, q9, #6
.endif
.ifc \type,avg
vld1.8 {d20}, [lr,:64], r2
vld1.8 {d21}, [lr,:64], r2
vrhadd.u8 q8, q8, q10
.endif
vst1.8 {d16}, [r0,:64], r2
vst1.8 {d17}, [r0,:64], r2
bgt 5b
pop {r4-r7, pc}
endfunc
.endm
/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
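/*
 * 4-pixel-wide variant.  Weights and special cases are the same as in
 * mc8, but pixels and their right-hand neighbours (or two rows, in the
 * one-dimensional cases) are interleaved into a single d register with
 * vtrn.32 so that one vmull.u8 evaluates both filter taps; each loop
 * iteration still produces two output rows of four pixels.
 */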
.macro h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
push {r4-r7, lr}
ldrd r4, r5, [sp, #20]
.ifc \type,avg
mov lr, r0
.endif
pld [r1]
pld [r1, r2]
.ifc \codec,rv40
movrel r6, rv40bias
lsr r7, r5, #1
add r6, r6, r7, lsl #3
lsr r7, r4, #1
add r6, r6, r7, lsl #1
vld1.16 {d22[],d23[]}, [r6,:16]
.endif
.ifc \codec,vc1
vmov.u16 q11, #28
.endif
A muls r7, r4, r5
T mul r7, r4, r5
T cmp r7, #0
rsb r6, r7, r5, lsl #3
rsb r12, r7, r4, lsl #3
sub r4, r7, r4, lsl #3
sub r4, r4, r5, lsl #3
add r4, r4, #64
beq 2f
vdup.8 d0, r4
vdup.8 d1, r12
vld1.8 {d4}, [r1], r2
vdup.8 d2, r6
vdup.8 d3, r7
vext.8 d5, d4, d5, #1
vtrn.32 d4, d5
vtrn.32 d0, d1
vtrn.32 d2, d3
1: vld1.8 {d6}, [r1], r2
vext.8 d7, d6, d7, #1
vtrn.32 d6, d7
vmull.u8 q8, d4, d0
vmlal.u8 q8, d6, d2
vld1.8 {d4}, [r1], r2
vext.8 d5, d4, d5, #1
vtrn.32 d4, d5
pld [r1]
vmull.u8 q9, d6, d0
vmlal.u8 q9, d4, d2
vadd.i16 d16, d16, d17
vadd.i16 d17, d18, d19
.ifc \codec,h264
vrshrn.u16 d16, q8, #6
.else
vadd.u16 q8, q8, q11
vshrn.u16 d16, q8, #6
.endif
subs r3, r3, #2
pld [r1, r2]
.ifc \type,avg
vld1.32 {d20[0]}, [lr,:32], r2
vld1.32 {d20[1]}, [lr,:32], r2
vrhadd.u8 d16, d16, d20
.endif
vst1.32 {d16[0]}, [r0,:32], r2
vst1.32 {d16[1]}, [r0,:32], r2
bgt 1b
pop {r4-r7, pc}
2: adds r12, r12, r6
vdup.8 d0, r4
beq 5f
tst r6, r6
vdup.8 d1, r12
vtrn.32 d0, d1
beq 4f
vext.32 d1, d0, d1, #1
vld1.32 {d4[0]}, [r1], r2
3: vld1.32 {d4[1]}, [r1], r2
vmull.u8 q8, d4, d0
vld1.32 {d4[0]}, [r1], r2
vmull.u8 q9, d4, d1
vadd.i16 d16, d16, d17
vadd.i16 d17, d18, d19
pld [r1]
.ifc \codec,h264
vrshrn.u16 d16, q8, #6
.else
vadd.u16 q8, q8, q11
vshrn.u16 d16, q8, #6
.endif
.ifc \type,avg
vld1.32 {d20[0]}, [lr,:32], r2
vld1.32 {d20[1]}, [lr,:32], r2
vrhadd.u8 d16, d16, d20
.endif
subs r3, r3, #2
pld [r1, r2]
vst1.32 {d16[0]}, [r0,:32], r2
vst1.32 {d16[1]}, [r0,:32], r2
bgt 3b
pop {r4-r7, pc}
4: vld1.8 {d4}, [r1], r2
vld1.8 {d6}, [r1], r2
vext.8 d5, d4, d5, #1
vext.8 d7, d6, d7, #1
vtrn.32 d4, d5
vtrn.32 d6, d7
vmull.u8 q8, d4, d0
vmull.u8 q9, d6, d0
subs r3, r3, #2
vadd.i16 d16, d16, d17
vadd.i16 d17, d18, d19
pld [r1]
.ifc \codec,h264
vrshrn.u16 d16, q8, #6
.else
vadd.u16 q8, q8, q11
vshrn.u16 d16, q8, #6
.endif
.ifc \type,avg
vld1.32 {d20[0]}, [lr,:32], r2
vld1.32 {d20[1]}, [lr,:32], r2
vrhadd.u8 d16, d16, d20
.endif
pld [r1]
vst1.32 {d16[0]}, [r0,:32], r2
vst1.32 {d16[1]}, [r0,:32], r2
bgt 4b
pop {r4-r7, pc}
5: vld1.32 {d4[0]}, [r1], r2
vld1.32 {d4[1]}, [r1], r2
vmull.u8 q8, d4, d0
subs r3, r3, #2
pld [r1]
.ifc \codec,h264
vrshrn.u16 d16, q8, #6
.else
vadd.u16 q8, q8, q11
vshrn.u16 d16, q8, #6
.endif
.ifc \type,avg
vld1.32 {d20[0]}, [lr,:32], r2
vld1.32 {d20[1]}, [lr,:32], r2
vrhadd.u8 d16, d16, d20
.endif
pld [r1]
vst1.32 {d16[0]}, [r0,:32], r2
vst1.32 {d16[1]}, [r0,:32], r2
bgt 5b
pop {r4-r7, pc}
endfunc
.endm
.macro h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
push {r4-r6, lr}
ldr r4, [sp, #16]
ldr lr, [sp, #20]
pld [r1]
pld [r1, r2]
orrs r5, r4, lr
beq 2f
mul r5, r4, lr
rsb r6, r5, lr, lsl #3
rsb r12, r5, r4, lsl #3
sub r4, r5, r4, lsl #3
sub r4, r4, lr, lsl #3
add r4, r4, #64
vdup.8 d0, r4
vdup.8 d2, r12
vdup.8 d1, r6
vdup.8 d3, r5
vtrn.16 q0, q1
1:
vld1.32 {d4[0]}, [r1], r2
vld1.32 {d4[1]}, [r1], r2
vrev64.32 d5, d4
vld1.32 {d5[1]}, [r1]
vext.8 q3, q2, q2, #1
vtrn.16 q2, q3
vmull.u8 q8, d4, d0
vmlal.u8 q8, d5, d1
.ifc \type,avg
vld1.16 {d18[0]}, [r0,:16], r2
vld1.16 {d18[1]}, [r0,:16]
sub r0, r0, r2
.endif
vtrn.32 d16, d17
vadd.i16 d16, d16, d17
vrshrn.u16 d16, q8, #6
.ifc \type,avg
vrhadd.u8 d16, d16, d18
.endif
vst1.16 {d16[0]}, [r0,:16], r2
vst1.16 {d16[1]}, [r0,:16], r2
subs r3, r3, #2
bgt 1b
pop {r4-r6, pc}
2:
.ifc \type,put
ldrh_post r5, r1, r2
strh_post r5, r0, r2
ldrh_post r6, r1, r2
strh_post r6, r0, r2
.else
vld1.16 {d16[0]}, [r1], r2
vld1.16 {d16[1]}, [r1], r2
vld1.16 {d18[0]}, [r0,:16], r2
vld1.16 {d18[1]}, [r0,:16]
sub r0, r0, r2
vrhadd.u8 d16, d16, d18
vst1.16 {d16[0]}, [r0,:16], r2
vst1.16 {d16[1]}, [r0,:16], r2
.endif
subs r3, r3, #2
bgt 2b
pop {r4-r6, pc}
endfunc
.endm
h264_chroma_mc8 put
h264_chroma_mc8 avg
h264_chroma_mc4 put
h264_chroma_mc4 avg
h264_chroma_mc2 put
h264_chroma_mc2 avg
#if CONFIG_RV40_DECODER
const rv40bias
.short 0, 16, 32, 16
.short 32, 28, 32, 28
.short 0, 32, 16, 32
.short 32, 28, 32, 28
endconst
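@ RV40 rounds chroma MC with a bias that depends on the subpel position
@ instead of H.264's fixed +32; the macros above load it from this table
@ as rv40bias[(y >> 1)*4 + (x >> 1)].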
h264_chroma_mc8 put, rv40
h264_chroma_mc8 avg, rv40
h264_chroma_mc4 put, rv40
h264_chroma_mc4 avg, rv40
#endif
#if CONFIG_VC1_DECODER
h264_chroma_mc8 put, vc1
h264_chroma_mc8 avg, vc1
h264_chroma_mc4 put, vc1
h264_chroma_mc4 avg, vc1
#endif


@@ -0,0 +1,116 @@
/*
* Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/h264dsp.h"
#include "libavcodec/arm/startcode.h"
void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0);
void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0);
void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0);
void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0);
void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height,
int log2_den, int weight, int offset);
void ff_weight_h264_pixels_8_neon(uint8_t *dst, int stride, int height,
int log2_den, int weight, int offset);
void ff_weight_h264_pixels_4_neon(uint8_t *dst, int stride, int height,
int log2_den, int weight, int offset);
void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, int stride,
int height, int log2_den, int weightd,
int weights, int offset);
void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, int stride,
int height, int log2_den, int weightd,
int weights, int offset);
void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, int stride,
int height, int log2_den, int weightd,
int weights, int offset);
void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,
int16_t *block, int stride,
const uint8_t nnzc[6*8]);
void ff_h264_idct_add16intra_neon(uint8_t *dst, const int *block_offset,
int16_t *block, int stride,
const uint8_t nnzc[6*8]);
void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset,
int16_t *block, int stride,
const uint8_t nnzc[6*8]);
void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct8_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset,
int16_t *block, int stride,
const uint8_t nnzc[6*8]);
static av_cold void h264dsp_init_neon(H264DSPContext *c, const int bit_depth,
const int chroma_format_idc)
{
#if HAVE_NEON
if (bit_depth == 8) {
c->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_neon;
c->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon;
c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
if(chroma_format_idc == 1){
c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
}
c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_4_neon;
c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16_neon;
c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_8_neon;
c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_4_neon;
c->h264_idct_add = ff_h264_idct_add_neon;
c->h264_idct_dc_add = ff_h264_idct_dc_add_neon;
c->h264_idct_add16 = ff_h264_idct_add16_neon;
c->h264_idct_add16intra = ff_h264_idct_add16intra_neon;
if (chroma_format_idc <= 1)
c->h264_idct_add8 = ff_h264_idct_add8_neon;
c->h264_idct8_add = ff_h264_idct8_add_neon;
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_neon;
c->h264_idct8_add4 = ff_h264_idct8_add4_neon;
}
#endif // HAVE_NEON
}
av_cold void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth,
const int chroma_format_idc)
{
int cpu_flags = av_get_cpu_flags();
#if HAVE_ARMV6
if (have_setend(cpu_flags))
c->startcode_find_candidate = ff_startcode_find_candidate_armv6;
#endif
if (have_neon(cpu_flags))
h264dsp_init_neon(c, bit_depth, chroma_format_idc);
}


@@ -0,0 +1,541 @@
/*
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
#include "neon.S"
/* H.264 loop filter */
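/*
 * In-loop deblocking.  A line of pixels across the edge
 * (p2 p1 p0 | q0 q1 q2) is filtered only if
 *
 *   |p0-q0| < alpha  &&  |p1-p0| < beta  &&  |q1-q0| < beta
 *
 * and the 4x4-block strength tc0 (packed into d24 by
 * h264_loop_filter_start, which also bails out early when nothing
 * needs filtering) is non-negative.  The luma filter clamps the
 * correction ((q0-p0)*4 + p1 - q1 + 4) >> 3 to +/-tc, where tc is tc0
 * plus one for each of |p2-p0| < beta and |q2-q0| < beta; those same
 * conditions gate the p1/q1 updates.
 */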
.macro h264_loop_filter_start
ldr r12, [sp]
tst r2, r2
ldr r12, [r12]
it ne
tstne r3, r3
vmov.32 d24[0], r12
and r12, r12, r12, lsl #16
it eq
bxeq lr
ands r12, r12, r12, lsl #8
it lt
bxlt lr
.endm
.macro h264_loop_filter_luma
vdup.8 q11, r2 @ alpha
vmovl.u8 q12, d24
vabd.u8 q6, q8, q0 @ abs(p0 - q0)
vmovl.u16 q12, d24
vabd.u8 q14, q9, q8 @ abs(p1 - p0)
vsli.16 q12, q12, #8
vabd.u8 q15, q1, q0 @ abs(q1 - q0)
vsli.32 q12, q12, #16
vclt.u8 q6, q6, q11 @ < alpha
vdup.8 q11, r3 @ beta
vclt.s8 q7, q12, #0
vclt.u8 q14, q14, q11 @ < beta
vclt.u8 q15, q15, q11 @ < beta
vbic q6, q6, q7
vabd.u8 q4, q10, q8 @ abs(p2 - p0)
vand q6, q6, q14
vabd.u8 q5, q2, q0 @ abs(q2 - q0)
vclt.u8 q4, q4, q11 @ < beta
vand q6, q6, q15
vclt.u8 q5, q5, q11 @ < beta
vand q4, q4, q6
vand q5, q5, q6
vand q12, q12, q6
vrhadd.u8 q14, q8, q0
vsub.i8 q6, q12, q4
vqadd.u8 q7, q9, q12
vhadd.u8 q10, q10, q14
vsub.i8 q6, q6, q5
vhadd.u8 q14, q2, q14
vmin.u8 q7, q7, q10
vqsub.u8 q11, q9, q12
vqadd.u8 q2, q1, q12
vmax.u8 q7, q7, q11
vqsub.u8 q11, q1, q12
vmin.u8 q14, q2, q14
vmovl.u8 q2, d0
vmax.u8 q14, q14, q11
vmovl.u8 q10, d1
vsubw.u8 q2, q2, d16
vsubw.u8 q10, q10, d17
vshl.i16 q2, q2, #2
vshl.i16 q10, q10, #2
vaddw.u8 q2, q2, d18
vaddw.u8 q10, q10, d19
vsubw.u8 q2, q2, d2
vsubw.u8 q10, q10, d3
vrshrn.i16 d4, q2, #3
vrshrn.i16 d5, q10, #3
vbsl q4, q7, q9
vbsl q5, q14, q1
vneg.s8 q7, q6
vmovl.u8 q14, d16
vmin.s8 q2, q2, q6
vmovl.u8 q6, d17
vmax.s8 q2, q2, q7
vmovl.u8 q11, d0
vmovl.u8 q12, d1
vaddw.s8 q14, q14, d4
vaddw.s8 q6, q6, d5
vsubw.s8 q11, q11, d4
vsubw.s8 q12, q12, d5
vqmovun.s16 d16, q14
vqmovun.s16 d17, q6
vqmovun.s16 d0, q11
vqmovun.s16 d1, q12
.endm
function ff_h264_v_loop_filter_luma_neon, export=1
h264_loop_filter_start
vld1.8 {d0, d1}, [r0,:128], r1
vld1.8 {d2, d3}, [r0,:128], r1
vld1.8 {d4, d5}, [r0,:128], r1
sub r0, r0, r1, lsl #2
sub r0, r0, r1, lsl #1
vld1.8 {d20,d21}, [r0,:128], r1
vld1.8 {d18,d19}, [r0,:128], r1
vld1.8 {d16,d17}, [r0,:128], r1
vpush {d8-d15}
h264_loop_filter_luma
sub r0, r0, r1, lsl #1
vst1.8 {d8, d9}, [r0,:128], r1
vst1.8 {d16,d17}, [r0,:128], r1
vst1.8 {d0, d1}, [r0,:128], r1
vst1.8 {d10,d11}, [r0,:128]
vpop {d8-d15}
bx lr
endfunc
function ff_h264_h_loop_filter_luma_neon, export=1
h264_loop_filter_start
sub r0, r0, #4
vld1.8 {d6}, [r0], r1
vld1.8 {d20}, [r0], r1
vld1.8 {d18}, [r0], r1
vld1.8 {d16}, [r0], r1
vld1.8 {d0}, [r0], r1
vld1.8 {d2}, [r0], r1
vld1.8 {d4}, [r0], r1
vld1.8 {d26}, [r0], r1
vld1.8 {d7}, [r0], r1
vld1.8 {d21}, [r0], r1
vld1.8 {d19}, [r0], r1
vld1.8 {d17}, [r0], r1
vld1.8 {d1}, [r0], r1
vld1.8 {d3}, [r0], r1
vld1.8 {d5}, [r0], r1
vld1.8 {d27}, [r0], r1
transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13
vpush {d8-d15}
h264_loop_filter_luma
transpose_4x4 q4, q8, q0, q5
sub r0, r0, r1, lsl #4
add r0, r0, #2
vst1.32 {d8[0]}, [r0], r1
vst1.32 {d16[0]}, [r0], r1
vst1.32 {d0[0]}, [r0], r1
vst1.32 {d10[0]}, [r0], r1
vst1.32 {d8[1]}, [r0], r1
vst1.32 {d16[1]}, [r0], r1
vst1.32 {d0[1]}, [r0], r1
vst1.32 {d10[1]}, [r0], r1
vst1.32 {d9[0]}, [r0], r1
vst1.32 {d17[0]}, [r0], r1
vst1.32 {d1[0]}, [r0], r1
vst1.32 {d11[0]}, [r0], r1
vst1.32 {d9[1]}, [r0], r1
vst1.32 {d17[1]}, [r0], r1
vst1.32 {d1[1]}, [r0], r1
vst1.32 {d11[1]}, [r0], r1
vpop {d8-d15}
bx lr
endfunc
.macro h264_loop_filter_chroma
vdup.8 d22, r2 @ alpha
vmovl.u8 q12, d24
vabd.u8 d26, d16, d0 @ abs(p0 - q0)
vmovl.u8 q2, d0
vabd.u8 d28, d18, d16 @ abs(p1 - p0)
vsubw.u8 q2, q2, d16
vsli.16 d24, d24, #8
vshl.i16 q2, q2, #2
vabd.u8 d30, d2, d0 @ abs(q1 - q0)
vaddw.u8 q2, q2, d18
vclt.u8 d26, d26, d22 @ < alpha
vsubw.u8 q2, q2, d2
vdup.8 d22, r3 @ beta
vrshrn.i16 d4, q2, #3
vclt.u8 d28, d28, d22 @ < beta
vclt.u8 d30, d30, d22 @ < beta
vmin.s8 d4, d4, d24
vneg.s8 d25, d24
vand d26, d26, d28
vmax.s8 d4, d4, d25
vand d26, d26, d30
vmovl.u8 q11, d0
vand d4, d4, d26
vmovl.u8 q14, d16
vaddw.s8 q14, q14, d4
vsubw.s8 q11, q11, d4
vqmovun.s16 d16, q14
vqmovun.s16 d0, q11
.endm
function ff_h264_v_loop_filter_chroma_neon, export=1
h264_loop_filter_start
sub r0, r0, r1, lsl #1
vld1.8 {d18}, [r0,:64], r1
vld1.8 {d16}, [r0,:64], r1
vld1.8 {d0}, [r0,:64], r1
vld1.8 {d2}, [r0,:64]
h264_loop_filter_chroma
sub r0, r0, r1, lsl #1
vst1.8 {d16}, [r0,:64], r1
vst1.8 {d0}, [r0,:64], r1
bx lr
endfunc
function ff_h264_h_loop_filter_chroma_neon, export=1
h264_loop_filter_start
sub r0, r0, #2
vld1.32 {d18[0]}, [r0], r1
vld1.32 {d16[0]}, [r0], r1
vld1.32 {d0[0]}, [r0], r1
vld1.32 {d2[0]}, [r0], r1
vld1.32 {d18[1]}, [r0], r1
vld1.32 {d16[1]}, [r0], r1
vld1.32 {d0[1]}, [r0], r1
vld1.32 {d2[1]}, [r0], r1
vtrn.16 d18, d0
vtrn.16 d16, d2
vtrn.8 d18, d16
vtrn.8 d0, d2
h264_loop_filter_chroma
vtrn.16 d18, d0
vtrn.16 d16, d2
vtrn.8 d18, d16
vtrn.8 d0, d2
sub r0, r0, r1, lsl #3
vst1.32 {d18[0]}, [r0], r1
vst1.32 {d16[0]}, [r0], r1
vst1.32 {d0[0]}, [r0], r1
vst1.32 {d2[0]}, [r0], r1
vst1.32 {d18[1]}, [r0], r1
vst1.32 {d16[1]}, [r0], r1
vst1.32 {d0[1]}, [r0], r1
vst1.32 {d2[1]}, [r0], r1
bx lr
endfunc
@ Biweighted prediction
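@ Explicit bidirectional weighted prediction:
@   dst[i] = clip((dst[i]*weightd + src[i]*weights + rnd) >> (log2_den+1))
@ with rnd = ((offset + 1) | 1) << log2_den pre-loaded into q8 and
@ -(log2_den+1) kept in q9 for the final vshl.s16.  The wrapper macro
@ below dispatches to one of four loop bodies (labels 10-40) according
@ to the signs of the two weights, negating them as needed so the
@ 8-bit multiplies can stay unsigned (vmlal/vmlsl).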
.macro biweight_16 macs, macd
vdup.8 d0, r4
vdup.8 d1, r5
vmov q2, q8
vmov q3, q8
1: subs r3, r3, #2
vld1.8 {d20-d21},[r0,:128], r2
\macd q2, d0, d20
pld [r0]
\macd q3, d0, d21
vld1.8 {d22-d23},[r1,:128], r2
\macs q2, d1, d22
pld [r1]
\macs q3, d1, d23
vmov q12, q8
vld1.8 {d28-d29},[r0,:128], r2
vmov q13, q8
\macd q12, d0, d28
pld [r0]
\macd q13, d0, d29
vld1.8 {d30-d31},[r1,:128], r2
\macs q12, d1, d30
pld [r1]
\macs q13, d1, d31
vshl.s16 q2, q2, q9
vshl.s16 q3, q3, q9
vqmovun.s16 d4, q2
vqmovun.s16 d5, q3
vshl.s16 q12, q12, q9
vshl.s16 q13, q13, q9
vqmovun.s16 d24, q12
vqmovun.s16 d25, q13
vmov q3, q8
vst1.8 {d4- d5}, [r6,:128], r2
vmov q2, q8
vst1.8 {d24-d25},[r6,:128], r2
bne 1b
pop {r4-r6, pc}
.endm
.macro biweight_8 macs, macd
vdup.8 d0, r4
vdup.8 d1, r5
vmov q1, q8
vmov q10, q8
1: subs r3, r3, #2
vld1.8 {d4},[r0,:64], r2
\macd q1, d0, d4
pld [r0]
vld1.8 {d5},[r1,:64], r2
\macs q1, d1, d5
pld [r1]
vld1.8 {d6},[r0,:64], r2
\macd q10, d0, d6
pld [r0]
vld1.8 {d7},[r1,:64], r2
\macs q10, d1, d7
pld [r1]
vshl.s16 q1, q1, q9
vqmovun.s16 d2, q1
vshl.s16 q10, q10, q9
vqmovun.s16 d4, q10
vmov q10, q8
vst1.8 {d2},[r6,:64], r2
vmov q1, q8
vst1.8 {d4},[r6,:64], r2
bne 1b
pop {r4-r6, pc}
.endm
.macro biweight_4 macs, macd
vdup.8 d0, r4
vdup.8 d1, r5
vmov q1, q8
vmov q10, q8
1: subs r3, r3, #4
vld1.32 {d4[0]},[r0,:32], r2
vld1.32 {d4[1]},[r0,:32], r2
\macd q1, d0, d4
pld [r0]
vld1.32 {d5[0]},[r1,:32], r2
vld1.32 {d5[1]},[r1,:32], r2
\macs q1, d1, d5
pld [r1]
blt 2f
vld1.32 {d6[0]},[r0,:32], r2
vld1.32 {d6[1]},[r0,:32], r2
\macd q10, d0, d6
pld [r0]
vld1.32 {d7[0]},[r1,:32], r2
vld1.32 {d7[1]},[r1,:32], r2
\macs q10, d1, d7
pld [r1]
vshl.s16 q1, q1, q9
vqmovun.s16 d2, q1
vshl.s16 q10, q10, q9
vqmovun.s16 d4, q10
vmov q10, q8
vst1.32 {d2[0]},[r6,:32], r2
vst1.32 {d2[1]},[r6,:32], r2
vmov q1, q8
vst1.32 {d4[0]},[r6,:32], r2
vst1.32 {d4[1]},[r6,:32], r2
bne 1b
pop {r4-r6, pc}
2: vshl.s16 q1, q1, q9
vqmovun.s16 d2, q1
vst1.32 {d2[0]},[r6,:32], r2
vst1.32 {d2[1]},[r6,:32], r2
pop {r4-r6, pc}
.endm
.macro biweight_func w
function ff_biweight_h264_pixels_\w\()_neon, export=1
push {r4-r6, lr}
ldr r12, [sp, #16]
add r4, sp, #20
ldm r4, {r4-r6}
lsr lr, r4, #31
add r6, r6, #1
eors lr, lr, r5, lsr #30
orr r6, r6, #1
vdup.16 q9, r12
lsl r6, r6, r12
vmvn q9, q9
vdup.16 q8, r6
mov r6, r0
beq 10f
subs lr, lr, #1
beq 20f
subs lr, lr, #1
beq 30f
b 40f
10: biweight_\w vmlal.u8, vmlal.u8
20: rsb r4, r4, #0
biweight_\w vmlal.u8, vmlsl.u8
30: rsb r4, r4, #0
rsb r5, r5, #0
biweight_\w vmlsl.u8, vmlsl.u8
40: rsb r5, r5, #0
biweight_\w vmlsl.u8, vmlal.u8
endfunc
.endm
biweight_func 16
biweight_func 8
biweight_func 4
@ Weighted prediction
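@ Explicit unidirectional weighted prediction:
@   dst[i] = clip(((src[i]*weight + 2^(log2_den-1)) >> log2_den) + offset)
@ offset << log2_den is pre-loaded into q8 and the rounding shift is a
@ vrshl by the negative count in q9.  A negative weight selects the
@ subtracting loop variant (with the weight negated), and for larger
@ log2_den the halving vhadd/vhsub forms are used to keep the 16-bit
@ accumulators in range.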
.macro weight_16 add
vdup.8 d0, r12
1: subs r2, r2, #2
vld1.8 {d20-d21},[r0,:128], r1
vmull.u8 q2, d0, d20
pld [r0]
vmull.u8 q3, d0, d21
vld1.8 {d28-d29},[r0,:128], r1
vmull.u8 q12, d0, d28
pld [r0]
vmull.u8 q13, d0, d29
\add q2, q8, q2
vrshl.s16 q2, q2, q9
\add q3, q8, q3
vrshl.s16 q3, q3, q9
vqmovun.s16 d4, q2
vqmovun.s16 d5, q3
\add q12, q8, q12
vrshl.s16 q12, q12, q9
\add q13, q8, q13
vrshl.s16 q13, q13, q9
vqmovun.s16 d24, q12
vqmovun.s16 d25, q13
vst1.8 {d4- d5}, [r4,:128], r1
vst1.8 {d24-d25},[r4,:128], r1
bne 1b
pop {r4, pc}
.endm
.macro weight_8 add
vdup.8 d0, r12
1: subs r2, r2, #2
vld1.8 {d4},[r0,:64], r1
vmull.u8 q1, d0, d4
pld [r0]
vld1.8 {d6},[r0,:64], r1
vmull.u8 q10, d0, d6
\add q1, q8, q1
pld [r0]
vrshl.s16 q1, q1, q9
vqmovun.s16 d2, q1
\add q10, q8, q10
vrshl.s16 q10, q10, q9
vqmovun.s16 d4, q10
vst1.8 {d2},[r4,:64], r1
vst1.8 {d4},[r4,:64], r1
bne 1b
pop {r4, pc}
.endm
.macro weight_4 add
vdup.8 d0, r12
vmov q1, q8
vmov q10, q8
1: subs r2, r2, #4
vld1.32 {d4[0]},[r0,:32], r1
vld1.32 {d4[1]},[r0,:32], r1
vmull.u8 q1, d0, d4
pld [r0]
blt 2f
vld1.32 {d6[0]},[r0,:32], r1
vld1.32 {d6[1]},[r0,:32], r1
vmull.u8 q10, d0, d6
pld [r0]
\add q1, q8, q1
vrshl.s16 q1, q1, q9
vqmovun.s16 d2, q1
\add q10, q8, q10
vrshl.s16 q10, q10, q9
vqmovun.s16 d4, q10
vmov q10, q8
vst1.32 {d2[0]},[r4,:32], r1
vst1.32 {d2[1]},[r4,:32], r1
vmov q1, q8
vst1.32 {d4[0]},[r4,:32], r1
vst1.32 {d4[1]},[r4,:32], r1
bne 1b
pop {r4, pc}
2: \add q1, q8, q1
vrshl.s16 q1, q1, q9
vqmovun.s16 d2, q1
vst1.32 {d2[0]},[r4,:32], r1
vst1.32 {d2[1]},[r4,:32], r1
pop {r4, pc}
.endm
.macro weight_func w
function ff_weight_h264_pixels_\w\()_neon, export=1
push {r4, lr}
ldr r12, [sp, #8]
ldr r4, [sp, #12]
cmp r3, #1
lsl r4, r4, r3
vdup.16 q8, r4
mov r4, r0
ble 20f
rsb lr, r3, #1
vdup.16 q9, lr
cmp r12, #0
blt 10f
weight_\w vhadd.s16
10: rsb r12, r12, #0
weight_\w vhsub.s16
20: rsb lr, r3, #0
vdup.16 q9, lr
cmp r12, #0
blt 10f
weight_\w vadd.s16
10: rsb r12, r12, #0
weight_\w vsub.s16
endfunc
.endm
weight_func 16
weight_func 8
weight_func 4


@@ -0,0 +1,413 @@
/*
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
function ff_h264_idct_add_neon, export=1
vld1.64 {d0-d3}, [r1,:128]
vmov.i16 q15, #0
vswp d1, d2
vst1.16 {q15}, [r1,:128]!
vadd.i16 d4, d0, d1
vst1.16 {q15}, [r1,:128]!
vshr.s16 q8, q1, #1
vsub.i16 d5, d0, d1
vadd.i16 d6, d2, d17
vsub.i16 d7, d16, d3
vadd.i16 q0, q2, q3
vsub.i16 q1, q2, q3
vtrn.16 d0, d1
vtrn.16 d3, d2
vtrn.32 d0, d3
vtrn.32 d1, d2
vadd.i16 d4, d0, d3
vld1.32 {d18[0]}, [r0,:32], r2
vswp d1, d3
vshr.s16 q8, q1, #1
vld1.32 {d19[1]}, [r0,:32], r2
vsub.i16 d5, d0, d1
vld1.32 {d18[1]}, [r0,:32], r2
vadd.i16 d6, d16, d3
vld1.32 {d19[0]}, [r0,:32], r2
vsub.i16 d7, d2, d17
sub r0, r0, r2, lsl #2
vadd.i16 q0, q2, q3
vsub.i16 q1, q2, q3
vrshr.s16 q0, q0, #6
vrshr.s16 q1, q1, #6
vaddw.u8 q0, q0, d18
vaddw.u8 q1, q1, d19
vqmovun.s16 d0, q0
vqmovun.s16 d1, q1
vst1.32 {d0[0]}, [r0,:32], r2
vst1.32 {d1[1]}, [r0,:32], r2
vst1.32 {d0[1]}, [r0,:32], r2
vst1.32 {d1[0]}, [r0,:32], r2
sub r1, r1, #32
bx lr
endfunc
function ff_h264_idct_dc_add_neon, export=1
mov r3, #0
vld1.16 {d2[],d3[]}, [r1,:16]
strh r3, [r1]
vrshr.s16 q1, q1, #6
vld1.32 {d0[0]}, [r0,:32], r2
vld1.32 {d0[1]}, [r0,:32], r2
vaddw.u8 q2, q1, d0
vld1.32 {d1[0]}, [r0,:32], r2
vld1.32 {d1[1]}, [r0,:32], r2
vaddw.u8 q1, q1, d1
vqmovun.s16 d0, q2
vqmovun.s16 d1, q1
sub r0, r0, r2, lsl #2
vst1.32 {d0[0]}, [r0,:32], r2
vst1.32 {d0[1]}, [r0,:32], r2
vst1.32 {d1[0]}, [r0,:32], r2
vst1.32 {d1[1]}, [r0,:32], r2
bx lr
endfunc
function ff_h264_idct_add16_neon, export=1
push {r4-r8,lr}
mov r4, r0
mov r5, r1
mov r1, r2
mov r2, r3
ldr r6, [sp, #24]
movrel r7, scan8
mov ip, #16
1: ldrb r8, [r7], #1
ldr r0, [r5], #4
ldrb r8, [r6, r8]
subs r8, r8, #1
blt 2f
ldrsh lr, [r1]
add r0, r0, r4
it ne
movne lr, #0
cmp lr, #0
ite ne
adrne lr, X(ff_h264_idct_dc_add_neon) + CONFIG_THUMB
adreq lr, X(ff_h264_idct_add_neon) + CONFIG_THUMB
blx lr
2: subs ip, ip, #1
add r1, r1, #32
bne 1b
pop {r4-r8,pc}
endfunc
function ff_h264_idct_add16intra_neon, export=1
push {r4-r8,lr}
mov r4, r0
mov r5, r1
mov r1, r2
mov r2, r3
ldr r6, [sp, #24]
movrel r7, scan8
mov ip, #16
1: ldrb r8, [r7], #1
ldr r0, [r5], #4
ldrb r8, [r6, r8]
add r0, r0, r4
cmp r8, #0
ldrsh r8, [r1]
iteet ne
adrne lr, X(ff_h264_idct_add_neon) + CONFIG_THUMB
adreq lr, X(ff_h264_idct_dc_add_neon) + CONFIG_THUMB
cmpeq r8, #0
blxne lr
subs ip, ip, #1
add r1, r1, #32
bne 1b
pop {r4-r8,pc}
endfunc
function ff_h264_idct_add8_neon, export=1
push {r4-r10,lr}
ldm r0, {r4,r9}
add r5, r1, #16*4
add r1, r2, #16*32
mov r2, r3
mov r10, r1
ldr r6, [sp, #32]
movrel r7, scan8+16
mov r12, #0
1: ldrb r8, [r7, r12]
ldr r0, [r5, r12, lsl #2]
ldrb r8, [r6, r8]
add r0, r0, r4
add r1, r10, r12, lsl #5
cmp r8, #0
ldrsh r8, [r1]
iteet ne
adrne lr, X(ff_h264_idct_add_neon) + CONFIG_THUMB
adreq lr, X(ff_h264_idct_dc_add_neon) + CONFIG_THUMB
cmpeq r8, #0
blxne lr
add r12, r12, #1
cmp r12, #4
itt eq
moveq r12, #16
moveq r4, r9
cmp r12, #20
blt 1b
pop {r4-r10,pc}
endfunc
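@ The 8x8 inverse transform runs the idct8x8_cols macro twice, once per
@ dimension of the block, with a vtrn/vswp transpose at the start of the
@ second pass; ff_h264_idct8_add_neon then rounds with (x + 32) >> 6 and
@ adds the result to the 8x8 prediction block.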
.macro idct8x8_cols pass
.if \pass == 0
qa .req q2
qb .req q14
vshr.s16 q2, q10, #1
vadd.i16 q0, q8, q12
vld1.16 {q14-q15},[r1,:128]
vst1.16 {q3}, [r1,:128]!
vst1.16 {q3}, [r1,:128]!
vsub.i16 q1, q8, q12
vshr.s16 q3, q14, #1
vsub.i16 q2, q2, q14
vadd.i16 q3, q3, q10
.else
qa .req q14
qb .req q2
vtrn.32 q8, q10
vtrn.16 q12, q13
vtrn.32 q9, q11
vtrn.32 q12, q2
vtrn.32 q13, q15
vswp d21, d4
vshr.s16 q14, q10, #1
vswp d17, d24
vshr.s16 q3, q2, #1
vswp d19, d26
vadd.i16 q0, q8, q12
vswp d23, d30
vsub.i16 q1, q8, q12
vsub.i16 q14, q14, q2
vadd.i16 q3, q3, q10
.endif
vadd.i16 q10, q1, qa
vsub.i16 q12, q1, qa
vadd.i16 q8, q0, q3
vsub.i16 qb, q0, q3
vsub.i16 q0, q13, q11
vadd.i16 q1, q15, q9
vsub.i16 qa, q15, q9
vadd.i16 q3, q13, q11
vsub.i16 q0, q0, q15
vsub.i16 q1, q1, q11
vadd.i16 qa, qa, q13
vadd.i16 q3, q3, q9
vshr.s16 q9, q9, #1
vshr.s16 q11, q11, #1
vshr.s16 q13, q13, #1
vshr.s16 q15, q15, #1
vsub.i16 q0, q0, q15
vsub.i16 q1, q1, q11
vadd.i16 qa, qa, q13
vadd.i16 q3, q3, q9
vshr.s16 q9, q0, #2
vshr.s16 q11, q1, #2
vshr.s16 q13, qa, #2
vshr.s16 q15, q3, #2
vsub.i16 q3, q3, q9
vsub.i16 qa, q11, qa
vadd.i16 q1, q1, q13
vadd.i16 q0, q0, q15
.if \pass == 0
vsub.i16 q15, q8, q3
vadd.i16 q8, q8, q3
vadd.i16 q9, q10, q2
vsub.i16 q2, q10, q2
vtrn.16 q8, q9
vadd.i16 q10, q12, q1
vtrn.16 q2, q15
vadd.i16 q11, q14, q0
vsub.i16 q13, q12, q1
vtrn.16 q10, q11
vsub.i16 q12, q14, q0
.else
vsub.i16 q15, q8, q3
vadd.i16 q8, q8, q3
vadd.i16 q9, q10, q14
vsub.i16 q14, q10, q14
vadd.i16 q10, q12, q1
vsub.i16 q13, q12, q1
vadd.i16 q11, q2, q0
vsub.i16 q12, q2, q0
.endif
.unreq qa
.unreq qb
.endm
function ff_h264_idct8_add_neon, export=1
vmov.i16 q3, #0
vld1.16 {q8-q9}, [r1,:128]
vst1.16 {q3}, [r1,:128]!
vst1.16 {q3}, [r1,:128]!
vld1.16 {q10-q11},[r1,:128]
vst1.16 {q3}, [r1,:128]!
vst1.16 {q3}, [r1,:128]!
vld1.16 {q12-q13},[r1,:128]
vst1.16 {q3}, [r1,:128]!
vst1.16 {q3}, [r1,:128]!
idct8x8_cols 0
idct8x8_cols 1
mov r3, r0
vrshr.s16 q8, q8, #6
vld1.8 {d0}, [r0,:64], r2
vrshr.s16 q9, q9, #6
vld1.8 {d1}, [r0,:64], r2
vrshr.s16 q10, q10, #6
vld1.8 {d2}, [r0,:64], r2
vrshr.s16 q11, q11, #6
vld1.8 {d3}, [r0,:64], r2
vrshr.s16 q12, q12, #6
vld1.8 {d4}, [r0,:64], r2
vrshr.s16 q13, q13, #6
vld1.8 {d5}, [r0,:64], r2
vrshr.s16 q14, q14, #6
vld1.8 {d6}, [r0,:64], r2
vrshr.s16 q15, q15, #6
vld1.8 {d7}, [r0,:64], r2
vaddw.u8 q8, q8, d0
vaddw.u8 q9, q9, d1
vaddw.u8 q10, q10, d2
vqmovun.s16 d0, q8
vaddw.u8 q11, q11, d3
vqmovun.s16 d1, q9
vaddw.u8 q12, q12, d4
vqmovun.s16 d2, q10
vst1.8 {d0}, [r3,:64], r2
vaddw.u8 q13, q13, d5
vqmovun.s16 d3, q11
vst1.8 {d1}, [r3,:64], r2
vaddw.u8 q14, q14, d6
vqmovun.s16 d4, q12
vst1.8 {d2}, [r3,:64], r2
vaddw.u8 q15, q15, d7
vqmovun.s16 d5, q13
vst1.8 {d3}, [r3,:64], r2
vqmovun.s16 d6, q14
vqmovun.s16 d7, q15
vst1.8 {d4}, [r3,:64], r2
vst1.8 {d5}, [r3,:64], r2
vst1.8 {d6}, [r3,:64], r2
vst1.8 {d7}, [r3,:64], r2
sub r1, r1, #128
bx lr
endfunc
function ff_h264_idct8_dc_add_neon, export=1
mov r3, #0
vld1.16 {d30[],d31[]},[r1,:16]
strh r3, [r1]
vld1.32 {d0}, [r0,:64], r2
vrshr.s16 q15, q15, #6
vld1.32 {d1}, [r0,:64], r2
vld1.32 {d2}, [r0,:64], r2
vaddw.u8 q8, q15, d0
vld1.32 {d3}, [r0,:64], r2
vaddw.u8 q9, q15, d1
vld1.32 {d4}, [r0,:64], r2
vaddw.u8 q10, q15, d2
vld1.32 {d5}, [r0,:64], r2
vaddw.u8 q11, q15, d3
vld1.32 {d6}, [r0,:64], r2
vaddw.u8 q12, q15, d4
vld1.32 {d7}, [r0,:64], r2
vaddw.u8 q13, q15, d5
vaddw.u8 q14, q15, d6
vaddw.u8 q15, q15, d7
vqmovun.s16 d0, q8
vqmovun.s16 d1, q9
vqmovun.s16 d2, q10
vqmovun.s16 d3, q11
sub r0, r0, r2, lsl #3
vst1.32 {d0}, [r0,:64], r2
vqmovun.s16 d4, q12
vst1.32 {d1}, [r0,:64], r2
vqmovun.s16 d5, q13
vst1.32 {d2}, [r0,:64], r2
vqmovun.s16 d6, q14
vst1.32 {d3}, [r0,:64], r2
vqmovun.s16 d7, q15
vst1.32 {d4}, [r0,:64], r2
vst1.32 {d5}, [r0,:64], r2
vst1.32 {d6}, [r0,:64], r2
vst1.32 {d7}, [r0,:64], r2
bx lr
endfunc
function ff_h264_idct8_add4_neon, export=1
push {r4-r8,lr}
mov r4, r0
mov r5, r1
mov r1, r2
mov r2, r3
ldr r6, [sp, #24]
movrel r7, scan8
mov r12, #16
1: ldrb r8, [r7], #4
ldr r0, [r5], #16
ldrb r8, [r6, r8]
subs r8, r8, #1
blt 2f
ldrsh lr, [r1]
add r0, r0, r4
it ne
movne lr, #0
cmp lr, #0
ite ne
adrne lr, X(ff_h264_idct8_dc_add_neon) + CONFIG_THUMB
adreq lr, X(ff_h264_idct8_add_neon) + CONFIG_THUMB
blx lr
2: subs r12, r12, #4
add r1, r1, #128
bne 1b
pop {r4-r8,pc}
endfunc
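@ scan8 maps a 4x4 block index (16 luma blocks followed by the chroma
@ blocks) to its offset in the 8-wide non-zero-count cache; the add16,
@ add16intra, add8 and add4 functions above use it to decide, per
@ sub-block, whether to run the full IDCT, the DC-only version or
@ nothing.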
const scan8
.byte 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
.byte 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
.byte 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
.byte 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
.byte 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
.byte 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
.byte 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
.byte 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
.byte 4+11*8, 5+11*8, 4+12*8, 5+12*8
.byte 6+11*8, 7+11*8, 6+12*8, 7+12*8
.byte 4+13*8, 5+13*8, 4+14*8, 5+14*8
.byte 6+13*8, 7+13*8, 6+14*8, 7+14*8
endconst


@@ -0,0 +1,95 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/h264pred.h"
void ff_pred16x16_vert_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred16x16_hor_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred16x16_plane_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred16x16_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred16x16_128_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred16x16_left_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred16x16_top_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_vert_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_hor_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_plane_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_128_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_left_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_top_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_l0t_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_0lt_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_l00_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_0l0_dc_neon(uint8_t *src, ptrdiff_t stride);
static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id,
const int bit_depth,
const int chroma_format_idc)
{
#if HAVE_NEON
const int high_depth = bit_depth > 8;
if (high_depth)
return;
if (chroma_format_idc <= 1) {
h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vert_neon;
h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_hor_neon;
if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_neon;
h->pred8x8[DC_128_PRED8x8 ] = ff_pred8x8_128_dc_neon;
if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP7 &&
codec_id != AV_CODEC_ID_VP8) {
h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_neon;
h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon;
h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon;
h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon;
h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon;
h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon;
h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon;
}
}
h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_neon;
h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vert_neon;
h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_hor_neon;
h->pred16x16[LEFT_DC_PRED8x8] = ff_pred16x16_left_dc_neon;
h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon;
h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_neon;
if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 &&
codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_neon;
#endif // HAVE_NEON
}
av_cold void ff_h264_pred_init_arm(H264PredContext *h, int codec_id,
int bit_depth, const int chroma_format_idc)
{
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags))
h264_pred_init_neon(h, codec_id, bit_depth, chroma_format_idc);
}


@@ -0,0 +1,359 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
.macro ldcol.8 rd, rs, rt, n=8, hi=0
.if \n == 8 || \hi == 0
vld1.8 {\rd[0]}, [\rs], \rt
vld1.8 {\rd[1]}, [\rs], \rt
vld1.8 {\rd[2]}, [\rs], \rt
vld1.8 {\rd[3]}, [\rs], \rt
.endif
.if \n == 8 || \hi == 1
vld1.8 {\rd[4]}, [\rs], \rt
vld1.8 {\rd[5]}, [\rs], \rt
vld1.8 {\rd[6]}, [\rs], \rt
vld1.8 {\rd[7]}, [\rs], \rt
.endif
.endm
.macro add16x8 dq, dl, dh, rl, rh
vaddl.u8 \dq, \rl, \rh
vadd.u16 \dl, \dl, \dh
vpadd.u16 \dl, \dl, \dl
vpadd.u16 \dl, \dl, \dl
.endm
function ff_pred16x16_128_dc_neon, export=1
vmov.i8 q0, #128
b .L_pred16x16_dc_end
endfunc
function ff_pred16x16_top_dc_neon, export=1
sub r2, r0, r1
vld1.8 {q0}, [r2,:128]
add16x8 q0, d0, d1, d0, d1
vrshrn.u16 d0, q0, #4
vdup.8 q0, d0[0]
b .L_pred16x16_dc_end
endfunc
function ff_pred16x16_left_dc_neon, export=1
sub r2, r0, #1
ldcol.8 d0, r2, r1
ldcol.8 d1, r2, r1
add16x8 q0, d0, d1, d0, d1
vrshrn.u16 d0, q0, #4
vdup.8 q0, d0[0]
b .L_pred16x16_dc_end
endfunc
function ff_pred16x16_dc_neon, export=1
sub r2, r0, r1
vld1.8 {q0}, [r2,:128]
sub r2, r0, #1
ldcol.8 d2, r2, r1
ldcol.8 d3, r2, r1
vaddl.u8 q0, d0, d1
vaddl.u8 q1, d2, d3
vadd.u16 q0, q0, q1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0, d0
vpadd.u16 d0, d0, d0
vrshrn.u16 d0, q0, #5
vdup.8 q0, d0[0]
.L_pred16x16_dc_end:
mov r3, #8
6: vst1.8 {q0}, [r0,:128], r1
vst1.8 {q0}, [r0,:128], r1
subs r3, r3, #1
bne 6b
bx lr
endfunc
function ff_pred16x16_hor_neon, export=1
sub r2, r0, #1
mov r3, #16
1: vld1.8 {d0[],d1[]},[r2], r1
vst1.8 {q0}, [r0,:128], r1
subs r3, r3, #1
bne 1b
bx lr
endfunc
function ff_pred16x16_vert_neon, export=1
sub r0, r0, r1
vld1.8 {q0}, [r0,:128], r1
mov r3, #8
1: vst1.8 {q0}, [r0,:128], r1
vst1.8 {q0}, [r0,:128], r1
subs r3, r3, #1
bne 1b
bx lr
endfunc
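@ 16x16 plane prediction: H and V are weighted sums of differences
@ across the top row and left column (weights 1..8 from p16weight),
@ scaled to b = (5*H + 32) >> 6 and c = (5*V + 32) >> 6, and the block
@ is filled with clip((a + b*(x-7) + c*(y-7) + 16) >> 5), where a is
@ 16*(top-right + bottom-left neighbour).  The 8x8 chroma version
@ further down follows the same pattern with weights 1..4 and a
@ (17*H + 16) >> 5 scale.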
function ff_pred16x16_plane_neon, export=1
sub r3, r0, r1
add r2, r3, #8
sub r3, r3, #1
vld1.8 {d0}, [r3]
vld1.8 {d2}, [r2,:64], r1
ldcol.8 d1, r3, r1
add r3, r3, r1
ldcol.8 d3, r3, r1
vrev64.8 q0, q0
vaddl.u8 q8, d2, d3
vsubl.u8 q2, d2, d0
vsubl.u8 q3, d3, d1
movrel r3, p16weight
vld1.8 {q0}, [r3,:128]
vmul.s16 q2, q2, q0
vmul.s16 q3, q3, q0
vadd.i16 d4, d4, d5
vadd.i16 d5, d6, d7
vpadd.i16 d4, d4, d5
vpadd.i16 d4, d4, d4
vshll.s16 q3, d4, #2
vaddw.s16 q2, q3, d4
vrshrn.s32 d4, q2, #6
mov r3, #0
vtrn.16 d4, d5
vadd.i16 d2, d4, d5
vshl.i16 d3, d2, #3
vrev64.16 d16, d17
vsub.i16 d3, d3, d2
vadd.i16 d16, d16, d0
vshl.i16 d2, d16, #4
vsub.i16 d2, d2, d3
vshl.i16 d3, d4, #4
vext.16 q0, q0, q0, #7
vsub.i16 d6, d5, d3
vmov.16 d0[0], r3
vmul.i16 q0, q0, d4[0]
vdup.16 q1, d2[0]
vdup.16 q2, d4[0]
vdup.16 q3, d6[0]
vshl.i16 q2, q2, #3
vadd.i16 q1, q1, q0
vadd.i16 q3, q3, q2
mov r3, #16
1:
vqshrun.s16 d0, q1, #5
vadd.i16 q1, q1, q2
vqshrun.s16 d1, q1, #5
vadd.i16 q1, q1, q3
vst1.8 {q0}, [r0,:128], r1
subs r3, r3, #1
bne 1b
bx lr
endfunc
const p16weight, align=4
.short 1,2,3,4,5,6,7,8
endconst
function ff_pred8x8_hor_neon, export=1
sub r2, r0, #1
mov r3, #8
1: vld1.8 {d0[]}, [r2], r1
vst1.8 {d0}, [r0,:64], r1
subs r3, r3, #1
bne 1b
bx lr
endfunc
function ff_pred8x8_vert_neon, export=1
sub r0, r0, r1
vld1.8 {d0}, [r0,:64], r1
mov r3, #4
1: vst1.8 {d0}, [r0,:64], r1
vst1.8 {d0}, [r0,:64], r1
subs r3, r3, #1
bne 1b
bx lr
endfunc
function ff_pred8x8_plane_neon, export=1
sub r3, r0, r1
add r2, r3, #4
sub r3, r3, #1
vld1.32 {d0[0]}, [r3]
vld1.32 {d2[0]}, [r2,:32], r1
ldcol.8 d0, r3, r1, 4, hi=1
add r3, r3, r1
ldcol.8 d3, r3, r1, 4
vaddl.u8 q8, d2, d3
vrev32.8 d0, d0
vtrn.32 d2, d3
vsubl.u8 q2, d2, d0
movrel r3, p16weight
vld1.16 {q0}, [r3,:128]
vmul.s16 d4, d4, d0
vmul.s16 d5, d5, d0
vpadd.i16 d4, d4, d5
vpaddl.s16 d4, d4
vshl.i32 d5, d4, #4
vadd.s32 d4, d4, d5
vrshrn.s32 d4, q2, #5
mov r3, #0
vtrn.16 d4, d5
vadd.i16 d2, d4, d5
vshl.i16 d3, d2, #2
vrev64.16 d16, d16
vsub.i16 d3, d3, d2
vadd.i16 d16, d16, d0
vshl.i16 d2, d16, #4
vsub.i16 d2, d2, d3
vshl.i16 d3, d4, #3
vext.16 q0, q0, q0, #7
vsub.i16 d6, d5, d3
vmov.16 d0[0], r3
vmul.i16 q0, q0, d4[0]
vdup.16 q1, d2[0]
vdup.16 q2, d4[0]
vdup.16 q3, d6[0]
vshl.i16 q2, q2, #3
vadd.i16 q1, q1, q0
vadd.i16 q3, q3, q2
mov r3, #8
1:
vqshrun.s16 d0, q1, #5
vadd.i16 q1, q1, q3
vst1.8 {d0}, [r0,:64], r1
subs r3, r3, #1
bne 1b
bx lr
endfunc
function ff_pred8x8_128_dc_neon, export=1
vmov.i8 q0, #128
b .L_pred8x8_dc_end
endfunc
function ff_pred8x8_top_dc_neon, export=1
sub r2, r0, r1
vld1.8 {d0}, [r2,:64]
vpaddl.u8 d0, d0
vpadd.u16 d0, d0, d0
vrshrn.u16 d0, q0, #2
vdup.8 d1, d0[1]
vdup.8 d0, d0[0]
vtrn.32 d0, d1
b .L_pred8x8_dc_end
endfunc
function ff_pred8x8_left_dc_neon, export=1
sub r2, r0, #1
ldcol.8 d0, r2, r1
vpaddl.u8 d0, d0
vpadd.u16 d0, d0, d0
vrshrn.u16 d0, q0, #2
vdup.8 d1, d0[1]
vdup.8 d0, d0[0]
b .L_pred8x8_dc_end
endfunc
function ff_pred8x8_dc_neon, export=1
sub r2, r0, r1
vld1.8 {d0}, [r2,:64]
sub r2, r0, #1
ldcol.8 d1, r2, r1
vtrn.32 d0, d1
vpaddl.u8 q0, q0
vpadd.u16 d0, d0, d1
vpadd.u16 d1, d0, d0
vrshrn.u16 d2, q0, #3
vrshrn.u16 d3, q0, #2
vdup.8 d0, d2[4]
vdup.8 d1, d3[3]
vdup.8 d4, d3[2]
vdup.8 d5, d2[5]
vtrn.32 q0, q2
.L_pred8x8_dc_end:
mov r3, #4
add r2, r0, r1, lsl #2
6: vst1.8 {d0}, [r0,:64], r1
vst1.8 {d1}, [r2,:64], r1
subs r3, r3, #1
bne 6b
bx lr
endfunc
function ff_pred8x8_l0t_dc_neon, export=1
sub r2, r0, r1
vld1.8 {d0}, [r2,:64]
sub r2, r0, #1
ldcol.8 d1, r2, r1, 4
vtrn.32 d0, d1
vpaddl.u8 q0, q0
vpadd.u16 d0, d0, d1
vpadd.u16 d1, d0, d0
vrshrn.u16 d2, q0, #3
vrshrn.u16 d3, q0, #2
vdup.8 d0, d2[4]
vdup.8 d1, d3[0]
vdup.8 q2, d3[2]
vtrn.32 q0, q2
b .L_pred8x8_dc_end
endfunc
function ff_pred8x8_l00_dc_neon, export=1
sub r2, r0, #1
ldcol.8 d0, r2, r1, 4
vpaddl.u8 d0, d0
vpadd.u16 d0, d0, d0
vrshrn.u16 d0, q0, #2
vmov.i8 d1, #128
vdup.8 d0, d0[0]
b .L_pred8x8_dc_end
endfunc
function ff_pred8x8_0lt_dc_neon, export=1
sub r2, r0, r1
vld1.8 {d0}, [r2,:64]
add r2, r0, r1, lsl #2
sub r2, r2, #1
ldcol.8 d1, r2, r1, 4, hi=1
vtrn.32 d0, d1
vpaddl.u8 q0, q0
vpadd.u16 d0, d0, d1
vpadd.u16 d1, d0, d0
vrshrn.u16 d3, q0, #2
vrshrn.u16 d2, q0, #3
vdup.8 d0, d3[0]
vdup.8 d1, d3[3]
vdup.8 d4, d3[2]
vdup.8 d5, d2[5]
vtrn.32 q0, q2
b .L_pred8x8_dc_end
endfunc
function ff_pred8x8_0l0_dc_neon, export=1
add r2, r0, r1, lsl #2
sub r2, r2, #1
ldcol.8 d1, r2, r1, 4
vpaddl.u8 d2, d1
vpadd.u16 d2, d2, d2
vrshrn.u16 d1, q1, #2
vmov.i8 d0, #128
vdup.8 d1, d1[0]
b .L_pred8x8_dc_end
endfunc


@@ -0,0 +1,171 @@
/*
* ARM NEON optimised DSP functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/h264qpel.h"
void ff_put_h264_qpel16_mc00_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc10_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc20_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc30_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc01_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc11_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc21_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc31_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc02_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc12_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc22_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc32_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc03_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc13_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc23_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel16_mc33_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc00_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc10_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc30_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc01_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc11_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc21_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc31_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc02_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc12_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc22_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc32_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc03_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc13_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc23_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_put_h264_qpel8_mc33_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc00_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc10_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc20_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc30_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc01_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc11_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc21_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc31_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc02_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc12_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc22_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc32_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc03_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc13_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc23_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel16_mc33_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc00_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc10_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc30_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc01_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc11_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc21_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc31_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc02_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc12_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc22_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc32_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc03_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc13_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc23_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc33_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
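/* The tables are indexed as *_h264_qpel_pixels_tab[size][4*dy + dx]:
 * size 0 selects the 16x16 functions and size 1 the 8x8 ones, while
 * dx/dy are the quarter-pel offsets encoded in the mcXY suffixes
 * above. */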
av_cold void ff_h264qpel_init_arm(H264QpelContext *c, int bit_depth)
{
const int high_bit_depth = bit_depth > 8;
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags) && !high_bit_depth) {
c->put_h264_qpel_pixels_tab[0][ 0] = ff_put_h264_qpel16_mc00_neon;
c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon;
c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon;
c->put_h264_qpel_pixels_tab[0][ 3] = ff_put_h264_qpel16_mc30_neon;
c->put_h264_qpel_pixels_tab[0][ 4] = ff_put_h264_qpel16_mc01_neon;
c->put_h264_qpel_pixels_tab[0][ 5] = ff_put_h264_qpel16_mc11_neon;
c->put_h264_qpel_pixels_tab[0][ 6] = ff_put_h264_qpel16_mc21_neon;
c->put_h264_qpel_pixels_tab[0][ 7] = ff_put_h264_qpel16_mc31_neon;
c->put_h264_qpel_pixels_tab[0][ 8] = ff_put_h264_qpel16_mc02_neon;
c->put_h264_qpel_pixels_tab[0][ 9] = ff_put_h264_qpel16_mc12_neon;
c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_neon;
c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_neon;
c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_neon;
c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_neon;
c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_neon;
c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_neon;
c->put_h264_qpel_pixels_tab[1][ 0] = ff_put_h264_qpel8_mc00_neon;
c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon;
c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon;
c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon;
c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon;
c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon;
c->put_h264_qpel_pixels_tab[1][ 6] = ff_put_h264_qpel8_mc21_neon;
c->put_h264_qpel_pixels_tab[1][ 7] = ff_put_h264_qpel8_mc31_neon;
c->put_h264_qpel_pixels_tab[1][ 8] = ff_put_h264_qpel8_mc02_neon;
c->put_h264_qpel_pixels_tab[1][ 9] = ff_put_h264_qpel8_mc12_neon;
c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_neon;
c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_neon;
c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_neon;
c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_neon;
c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_neon;
c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_neon;
c->avg_h264_qpel_pixels_tab[0][ 0] = ff_avg_h264_qpel16_mc00_neon;
c->avg_h264_qpel_pixels_tab[0][ 1] = ff_avg_h264_qpel16_mc10_neon;
c->avg_h264_qpel_pixels_tab[0][ 2] = ff_avg_h264_qpel16_mc20_neon;
c->avg_h264_qpel_pixels_tab[0][ 3] = ff_avg_h264_qpel16_mc30_neon;
c->avg_h264_qpel_pixels_tab[0][ 4] = ff_avg_h264_qpel16_mc01_neon;
c->avg_h264_qpel_pixels_tab[0][ 5] = ff_avg_h264_qpel16_mc11_neon;
c->avg_h264_qpel_pixels_tab[0][ 6] = ff_avg_h264_qpel16_mc21_neon;
c->avg_h264_qpel_pixels_tab[0][ 7] = ff_avg_h264_qpel16_mc31_neon;
c->avg_h264_qpel_pixels_tab[0][ 8] = ff_avg_h264_qpel16_mc02_neon;
c->avg_h264_qpel_pixels_tab[0][ 9] = ff_avg_h264_qpel16_mc12_neon;
c->avg_h264_qpel_pixels_tab[0][10] = ff_avg_h264_qpel16_mc22_neon;
c->avg_h264_qpel_pixels_tab[0][11] = ff_avg_h264_qpel16_mc32_neon;
c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_neon;
c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_neon;
c->avg_h264_qpel_pixels_tab[0][14] = ff_avg_h264_qpel16_mc23_neon;
c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_neon;
c->avg_h264_qpel_pixels_tab[1][ 0] = ff_avg_h264_qpel8_mc00_neon;
c->avg_h264_qpel_pixels_tab[1][ 1] = ff_avg_h264_qpel8_mc10_neon;
c->avg_h264_qpel_pixels_tab[1][ 2] = ff_avg_h264_qpel8_mc20_neon;
c->avg_h264_qpel_pixels_tab[1][ 3] = ff_avg_h264_qpel8_mc30_neon;
c->avg_h264_qpel_pixels_tab[1][ 4] = ff_avg_h264_qpel8_mc01_neon;
c->avg_h264_qpel_pixels_tab[1][ 5] = ff_avg_h264_qpel8_mc11_neon;
c->avg_h264_qpel_pixels_tab[1][ 6] = ff_avg_h264_qpel8_mc21_neon;
c->avg_h264_qpel_pixels_tab[1][ 7] = ff_avg_h264_qpel8_mc31_neon;
c->avg_h264_qpel_pixels_tab[1][ 8] = ff_avg_h264_qpel8_mc02_neon;
c->avg_h264_qpel_pixels_tab[1][ 9] = ff_avg_h264_qpel8_mc12_neon;
c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_neon;
c->avg_h264_qpel_pixels_tab[1][11] = ff_avg_h264_qpel8_mc32_neon;
c->avg_h264_qpel_pixels_tab[1][12] = ff_avg_h264_qpel8_mc03_neon;
c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_neon;
c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_neon;
c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_neon;
}
}


@@ -0,0 +1,955 @@
/*
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
#include "neon.S"
/* H.264 qpel MC */
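/*
 * Luma quarter-pel interpolation.  Half-pel samples come from the
 * 6-tap filter (1, -5, 20, 20, -5, 1):
 *
 *   h = clip((x0 - 5*x1 + 20*x2 + 20*x3 - 5*x4 + x5 + 16) >> 5)
 *
 * lowpass_const puts the 5 and 20 multipliers into d6 so the
 * lowpass_8 macros can use vmla/vmls with a scalar operand.
 * lowpass_8.16 is the 32-bit-intermediate form used for the centre
 * (hv) positions, which are filtered in both directions and rounded
 * with (x + 512) >> 10.  Quarter-pel positions are vrhadd averages of
 * neighbouring integer/half-pel planes, as in the *_l2 variants.
 */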
.macro lowpass_const r
movw \r, #5
movt \r, #20
vmov.32 d6[0], \r
.endm
.macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
.if \narrow
t0 .req q0
t1 .req q8
.else
t0 .req \d0
t1 .req \d1
.endif
vext.8 d2, \r0, \r1, #2
vext.8 d3, \r0, \r1, #3
vaddl.u8 q1, d2, d3
vext.8 d4, \r0, \r1, #1
vext.8 d5, \r0, \r1, #4
vaddl.u8 q2, d4, d5
vext.8 d30, \r0, \r1, #5
vaddl.u8 t0, \r0, d30
vext.8 d18, \r2, \r3, #2
vmla.i16 t0, q1, d6[1]
vext.8 d19, \r2, \r3, #3
vaddl.u8 q9, d18, d19
vext.8 d20, \r2, \r3, #1
vmls.i16 t0, q2, d6[0]
vext.8 d21, \r2, \r3, #4
vaddl.u8 q10, d20, d21
vext.8 d31, \r2, \r3, #5
vaddl.u8 t1, \r2, d31
vmla.i16 t1, q9, d6[1]
vmls.i16 t1, q10, d6[0]
.if \narrow
vqrshrun.s16 \d0, t0, #5
vqrshrun.s16 \d1, t1, #5
.endif
.unreq t0
.unreq t1
.endm
.macro lowpass_8_1 r0, r1, d0, narrow=1
.if \narrow
t0 .req q0
.else
t0 .req \d0
.endif
vext.8 d2, \r0, \r1, #2
vext.8 d3, \r0, \r1, #3
vaddl.u8 q1, d2, d3
vext.8 d4, \r0, \r1, #1
vext.8 d5, \r0, \r1, #4
vaddl.u8 q2, d4, d5
vext.8 d30, \r0, \r1, #5
vaddl.u8 t0, \r0, d30
vmla.i16 t0, q1, d6[1]
vmls.i16 t0, q2, d6[0]
.if \narrow
vqrshrun.s16 \d0, t0, #5
.endif
.unreq t0
.endm
.macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d
vext.16 q1, \r0, \r1, #2
vext.16 q0, \r0, \r1, #3
vaddl.s16 q9, d2, d0
vext.16 q2, \r0, \r1, #1
vaddl.s16 q1, d3, d1
vext.16 q3, \r0, \r1, #4
vaddl.s16 q10, d4, d6
vext.16 \r1, \r0, \r1, #5
vaddl.s16 q2, d5, d7
vaddl.s16 q0, \h0, \h1
vaddl.s16 q8, \l0, \l1
vshl.i32 q3, q9, #4
vshl.i32 q9, q9, #2
vshl.i32 q15, q10, #2
vadd.i32 q9, q9, q3
vadd.i32 q10, q10, q15
vshl.i32 q3, q1, #4
vshl.i32 q1, q1, #2
vshl.i32 q15, q2, #2
vadd.i32 q1, q1, q3
vadd.i32 q2, q2, q15
vadd.i32 q9, q9, q8
vsub.i32 q9, q9, q10
vadd.i32 q1, q1, q0
vsub.i32 q1, q1, q2
vrshrn.s32 d18, q9, #10
vrshrn.s32 d19, q1, #10
vqmovun.s16 \d, q9
.endm
function put_h264_qpel16_h_lowpass_neon_packed
mov r4, lr
mov r12, #16
mov r3, #8
bl put_h264_qpel8_h_lowpass_neon
sub r1, r1, r2, lsl #4
add r1, r1, #8
mov r12, #16
mov lr, r4
b put_h264_qpel8_h_lowpass_neon
endfunc
.macro h264_qpel_h_lowpass type
function \type\()_h264_qpel16_h_lowpass_neon
push {lr}
mov r12, #16
bl \type\()_h264_qpel8_h_lowpass_neon
sub r0, r0, r3, lsl #4
sub r1, r1, r2, lsl #4
add r0, r0, #8
add r1, r1, #8
mov r12, #16
pop {lr}
endfunc
function \type\()_h264_qpel8_h_lowpass_neon
1: vld1.8 {d0, d1}, [r1], r2
vld1.8 {d16,d17}, [r1], r2
subs r12, r12, #2
lowpass_8 d0, d1, d16, d17, d0, d16
.ifc \type,avg
vld1.8 {d2}, [r0,:64], r3
vrhadd.u8 d0, d0, d2
vld1.8 {d3}, [r0,:64]
vrhadd.u8 d16, d16, d3
sub r0, r0, r3
.endif
vst1.8 {d0}, [r0,:64], r3
vst1.8 {d16}, [r0,:64], r3
bne 1b
bx lr
endfunc
.endm
h264_qpel_h_lowpass put
h264_qpel_h_lowpass avg
.macro h264_qpel_h_lowpass_l2 type
function \type\()_h264_qpel16_h_lowpass_l2_neon
push {lr}
mov r12, #16
bl \type\()_h264_qpel8_h_lowpass_l2_neon
sub r0, r0, r2, lsl #4
sub r1, r1, r2, lsl #4
sub r3, r3, r2, lsl #4
add r0, r0, #8
add r1, r1, #8
add r3, r3, #8
mov r12, #16
pop {lr}
endfunc
function \type\()_h264_qpel8_h_lowpass_l2_neon
1: vld1.8 {d0, d1}, [r1], r2
vld1.8 {d16,d17}, [r1], r2
vld1.8 {d28}, [r3], r2
vld1.8 {d29}, [r3], r2
subs r12, r12, #2
lowpass_8 d0, d1, d16, d17, d0, d1
vrhadd.u8 q0, q0, q14
.ifc \type,avg
vld1.8 {d2}, [r0,:64], r2
vrhadd.u8 d0, d0, d2
vld1.8 {d3}, [r0,:64]
vrhadd.u8 d1, d1, d3
sub r0, r0, r2
.endif
vst1.8 {d0}, [r0,:64], r2
vst1.8 {d1}, [r0,:64], r2
bne 1b
bx lr
endfunc
.endm
h264_qpel_h_lowpass_l2 put
h264_qpel_h_lowpass_l2 avg
function put_h264_qpel16_v_lowpass_neon_packed
mov r4, lr
mov r2, #8
bl put_h264_qpel8_v_lowpass_neon
sub r1, r1, r3, lsl #2
bl put_h264_qpel8_v_lowpass_neon
sub r1, r1, r3, lsl #4
sub r1, r1, r3, lsl #2
add r1, r1, #8
bl put_h264_qpel8_v_lowpass_neon
sub r1, r1, r3, lsl #2
mov lr, r4
b put_h264_qpel8_v_lowpass_neon
endfunc
.macro h264_qpel_v_lowpass type
function \type\()_h264_qpel16_v_lowpass_neon
mov r4, lr
bl \type\()_h264_qpel8_v_lowpass_neon
sub r1, r1, r3, lsl #2
bl \type\()_h264_qpel8_v_lowpass_neon
sub r0, r0, r2, lsl #4
add r0, r0, #8
sub r1, r1, r3, lsl #4
sub r1, r1, r3, lsl #2
add r1, r1, #8
bl \type\()_h264_qpel8_v_lowpass_neon
sub r1, r1, r3, lsl #2
mov lr, r4
endfunc
function \type\()_h264_qpel8_v_lowpass_neon
vld1.8 {d8}, [r1], r3
vld1.8 {d10}, [r1], r3
vld1.8 {d12}, [r1], r3
vld1.8 {d14}, [r1], r3
vld1.8 {d22}, [r1], r3
vld1.8 {d24}, [r1], r3
vld1.8 {d26}, [r1], r3
vld1.8 {d28}, [r1], r3
vld1.8 {d9}, [r1], r3
vld1.8 {d11}, [r1], r3
vld1.8 {d13}, [r1], r3
vld1.8 {d15}, [r1], r3
vld1.8 {d23}, [r1]
transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
lowpass_8 d8, d9, d10, d11, d8, d10
lowpass_8 d12, d13, d14, d15, d12, d14
lowpass_8 d22, d23, d24, d25, d22, d24
lowpass_8 d26, d27, d28, d29, d26, d28
transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28
.ifc \type,avg
vld1.8 {d9}, [r0,:64], r2
vrhadd.u8 d8, d8, d9
vld1.8 {d11}, [r0,:64], r2
vrhadd.u8 d10, d10, d11
vld1.8 {d13}, [r0,:64], r2
vrhadd.u8 d12, d12, d13
vld1.8 {d15}, [r0,:64], r2
vrhadd.u8 d14, d14, d15
vld1.8 {d23}, [r0,:64], r2
vrhadd.u8 d22, d22, d23
vld1.8 {d25}, [r0,:64], r2
vrhadd.u8 d24, d24, d25
vld1.8 {d27}, [r0,:64], r2
vrhadd.u8 d26, d26, d27
vld1.8 {d29}, [r0,:64], r2
vrhadd.u8 d28, d28, d29
sub r0, r0, r2, lsl #3
.endif
vst1.8 {d8}, [r0,:64], r2
vst1.8 {d10}, [r0,:64], r2
vst1.8 {d12}, [r0,:64], r2
vst1.8 {d14}, [r0,:64], r2
vst1.8 {d22}, [r0,:64], r2
vst1.8 {d24}, [r0,:64], r2
vst1.8 {d26}, [r0,:64], r2
vst1.8 {d28}, [r0,:64], r2
bx lr
endfunc
.endm
h264_qpel_v_lowpass put
h264_qpel_v_lowpass avg
.macro h264_qpel_v_lowpass_l2 type
function \type\()_h264_qpel16_v_lowpass_l2_neon
mov r4, lr
bl \type\()_h264_qpel8_v_lowpass_l2_neon
sub r1, r1, r3, lsl #2
bl \type\()_h264_qpel8_v_lowpass_l2_neon
sub r0, r0, r3, lsl #4
sub r12, r12, r2, lsl #4
add r0, r0, #8
add r12, r12, #8
sub r1, r1, r3, lsl #4
sub r1, r1, r3, lsl #2
add r1, r1, #8
bl \type\()_h264_qpel8_v_lowpass_l2_neon
sub r1, r1, r3, lsl #2
mov lr, r4
endfunc
function \type\()_h264_qpel8_v_lowpass_l2_neon
vld1.8 {d8}, [r1], r3
vld1.8 {d10}, [r1], r3
vld1.8 {d12}, [r1], r3
vld1.8 {d14}, [r1], r3
vld1.8 {d22}, [r1], r3
vld1.8 {d24}, [r1], r3
vld1.8 {d26}, [r1], r3
vld1.8 {d28}, [r1], r3
vld1.8 {d9}, [r1], r3
vld1.8 {d11}, [r1], r3
vld1.8 {d13}, [r1], r3
vld1.8 {d15}, [r1], r3
vld1.8 {d23}, [r1]
transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
lowpass_8 d8, d9, d10, d11, d8, d9
lowpass_8 d12, d13, d14, d15, d12, d13
lowpass_8 d22, d23, d24, d25, d22, d23
lowpass_8 d26, d27, d28, d29, d26, d27
transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27
vld1.8 {d0}, [r12], r2
vld1.8 {d1}, [r12], r2
vld1.8 {d2}, [r12], r2
vld1.8 {d3}, [r12], r2
vld1.8 {d4}, [r12], r2
vrhadd.u8 q0, q0, q4
vld1.8 {d5}, [r12], r2
vrhadd.u8 q1, q1, q6
vld1.8 {d10}, [r12], r2
vrhadd.u8 q2, q2, q11
vld1.8 {d11}, [r12], r2
vrhadd.u8 q5, q5, q13
.ifc \type,avg
vld1.8 {d16}, [r0,:64], r3
vrhadd.u8 d0, d0, d16
vld1.8 {d17}, [r0,:64], r3
vrhadd.u8 d1, d1, d17
vld1.8 {d16}, [r0,:64], r3
vrhadd.u8 d2, d2, d16
vld1.8 {d17}, [r0,:64], r3
vrhadd.u8 d3, d3, d17
vld1.8 {d16}, [r0,:64], r3
vrhadd.u8 d4, d4, d16
vld1.8 {d17}, [r0,:64], r3
vrhadd.u8 d5, d5, d17
vld1.8 {d16}, [r0,:64], r3
vrhadd.u8 d10, d10, d16
vld1.8 {d17}, [r0,:64], r3
vrhadd.u8 d11, d11, d17
sub r0, r0, r3, lsl #3
.endif
vst1.8 {d0}, [r0,:64], r3
vst1.8 {d1}, [r0,:64], r3
vst1.8 {d2}, [r0,:64], r3
vst1.8 {d3}, [r0,:64], r3
vst1.8 {d4}, [r0,:64], r3
vst1.8 {d5}, [r0,:64], r3
vst1.8 {d10}, [r0,:64], r3
vst1.8 {d11}, [r0,:64], r3
bx lr
endfunc
.endm
h264_qpel_v_lowpass_l2 put
h264_qpel_v_lowpass_l2 avg
function put_h264_qpel8_hv_lowpass_neon_top
lowpass_const r12
mov r12, #12
1: vld1.8 {d0, d1}, [r1], r3
vld1.8 {d16,d17}, [r1], r3
subs r12, r12, #2
lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0
vst1.8 {d22-d25}, [r4,:128]!
bne 1b
vld1.8 {d0, d1}, [r1]
lowpass_8_1 d0, d1, q12, narrow=0
mov r12, #-16
add r4, r4, r12
vld1.8 {d30,d31}, [r4,:128], r12
vld1.8 {d20,d21}, [r4,:128], r12
vld1.8 {d18,d19}, [r4,:128], r12
vld1.8 {d16,d17}, [r4,:128], r12
vld1.8 {d14,d15}, [r4,:128], r12
vld1.8 {d12,d13}, [r4,:128], r12
vld1.8 {d10,d11}, [r4,:128], r12
vld1.8 {d8, d9}, [r4,:128], r12
vld1.8 {d6, d7}, [r4,:128], r12
vld1.8 {d4, d5}, [r4,:128], r12
vld1.8 {d2, d3}, [r4,:128], r12
vld1.8 {d0, d1}, [r4,:128]
swap4 d1, d3, d5, d7, d8, d10, d12, d14
transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7
swap4 d17, d19, d21, d31, d24, d26, d28, d22
transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11
vst1.8 {d30,d31}, [r4,:128]!
vst1.8 {d6, d7}, [r4,:128]!
vst1.8 {d20,d21}, [r4,:128]!
vst1.8 {d4, d5}, [r4,:128]!
vst1.8 {d18,d19}, [r4,:128]!
vst1.8 {d2, d3}, [r4,:128]!
vst1.8 {d16,d17}, [r4,:128]!
vst1.8 {d0, d1}, [r4,:128]
lowpass_8.16 q4, q12, d8, d9, d24, d25, d8
lowpass_8.16 q5, q13, d10, d11, d26, d27, d9
lowpass_8.16 q6, q14, d12, d13, d28, d29, d10
lowpass_8.16 q7, q11, d14, d15, d22, d23, d11
vld1.8 {d16,d17}, [r4,:128], r12
vld1.8 {d30,d31}, [r4,:128], r12
lowpass_8.16 q8, q15, d16, d17, d30, d31, d12
vld1.8 {d16,d17}, [r4,:128], r12
vld1.8 {d30,d31}, [r4,:128], r12
lowpass_8.16 q8, q15, d16, d17, d30, d31, d13
vld1.8 {d16,d17}, [r4,:128], r12
vld1.8 {d30,d31}, [r4,:128], r12
lowpass_8.16 q8, q15, d16, d17, d30, d31, d14
vld1.8 {d16,d17}, [r4,:128], r12
vld1.8 {d30,d31}, [r4,:128]
lowpass_8.16 q8, q15, d16, d17, d30, d31, d15
transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11
bx lr
endfunc
.macro h264_qpel8_hv_lowpass type
function \type\()_h264_qpel8_hv_lowpass_neon
mov r10, lr
bl put_h264_qpel8_hv_lowpass_neon_top
.ifc \type,avg
vld1.8 {d0}, [r0,:64], r2
vrhadd.u8 d12, d12, d0
vld1.8 {d1}, [r0,:64], r2
vrhadd.u8 d13, d13, d1
vld1.8 {d2}, [r0,:64], r2
vrhadd.u8 d14, d14, d2
vld1.8 {d3}, [r0,:64], r2
vrhadd.u8 d15, d15, d3
vld1.8 {d4}, [r0,:64], r2
vrhadd.u8 d8, d8, d4
vld1.8 {d5}, [r0,:64], r2
vrhadd.u8 d9, d9, d5
vld1.8 {d6}, [r0,:64], r2
vrhadd.u8 d10, d10, d6
vld1.8 {d7}, [r0,:64], r2
vrhadd.u8 d11, d11, d7
sub r0, r0, r2, lsl #3
.endif
vst1.8 {d12}, [r0,:64], r2
vst1.8 {d13}, [r0,:64], r2
vst1.8 {d14}, [r0,:64], r2
vst1.8 {d15}, [r0,:64], r2
vst1.8 {d8}, [r0,:64], r2
vst1.8 {d9}, [r0,:64], r2
vst1.8 {d10}, [r0,:64], r2
vst1.8 {d11}, [r0,:64], r2
mov lr, r10
bx lr
endfunc
.endm
h264_qpel8_hv_lowpass put
h264_qpel8_hv_lowpass avg
.macro h264_qpel8_hv_lowpass_l2 type
function \type\()_h264_qpel8_hv_lowpass_l2_neon
mov r10, lr
bl put_h264_qpel8_hv_lowpass_neon_top
vld1.8 {d0, d1}, [r2,:128]!
vld1.8 {d2, d3}, [r2,:128]!
vrhadd.u8 q0, q0, q6
vld1.8 {d4, d5}, [r2,:128]!
vrhadd.u8 q1, q1, q7
vld1.8 {d6, d7}, [r2,:128]!
vrhadd.u8 q2, q2, q4
vrhadd.u8 q3, q3, q5
.ifc \type,avg
vld1.8 {d16}, [r0,:64], r3
vrhadd.u8 d0, d0, d16
vld1.8 {d17}, [r0,:64], r3
vrhadd.u8 d1, d1, d17
vld1.8 {d18}, [r0,:64], r3
vrhadd.u8 d2, d2, d18
vld1.8 {d19}, [r0,:64], r3
vrhadd.u8 d3, d3, d19
vld1.8 {d20}, [r0,:64], r3
vrhadd.u8 d4, d4, d20
vld1.8 {d21}, [r0,:64], r3
vrhadd.u8 d5, d5, d21
vld1.8 {d22}, [r0,:64], r3
vrhadd.u8 d6, d6, d22
vld1.8 {d23}, [r0,:64], r3
vrhadd.u8 d7, d7, d23
sub r0, r0, r3, lsl #3
.endif
vst1.8 {d0}, [r0,:64], r3
vst1.8 {d1}, [r0,:64], r3
vst1.8 {d2}, [r0,:64], r3
vst1.8 {d3}, [r0,:64], r3
vst1.8 {d4}, [r0,:64], r3
vst1.8 {d5}, [r0,:64], r3
vst1.8 {d6}, [r0,:64], r3
vst1.8 {d7}, [r0,:64], r3
mov lr, r10
bx lr
endfunc
.endm
h264_qpel8_hv_lowpass_l2 put
h264_qpel8_hv_lowpass_l2 avg
.macro h264_qpel16_hv type
function \type\()_h264_qpel16_hv_lowpass_neon
mov r9, lr
bl \type\()_h264_qpel8_hv_lowpass_neon
sub r1, r1, r3, lsl #2
bl \type\()_h264_qpel8_hv_lowpass_neon
sub r1, r1, r3, lsl #4
sub r1, r1, r3, lsl #2
add r1, r1, #8
sub r0, r0, r2, lsl #4
add r0, r0, #8
bl \type\()_h264_qpel8_hv_lowpass_neon
sub r1, r1, r3, lsl #2
mov lr, r9
b \type\()_h264_qpel8_hv_lowpass_neon
endfunc
function \type\()_h264_qpel16_hv_lowpass_l2_neon
mov r9, lr
sub r2, r4, #256
bl \type\()_h264_qpel8_hv_lowpass_l2_neon
sub r1, r1, r3, lsl #2
bl \type\()_h264_qpel8_hv_lowpass_l2_neon
sub r1, r1, r3, lsl #4
sub r1, r1, r3, lsl #2
add r1, r1, #8
sub r0, r0, r3, lsl #4
add r0, r0, #8
bl \type\()_h264_qpel8_hv_lowpass_l2_neon
sub r1, r1, r3, lsl #2
mov lr, r9
b \type\()_h264_qpel8_hv_lowpass_l2_neon
endfunc
.endm
h264_qpel16_hv put
h264_qpel16_hv avg
.macro h264_qpel8 type
function ff_\type\()_h264_qpel8_mc10_neon, export=1
lowpass_const r3
mov r3, r1
sub r1, r1, #2
mov r12, #8
b \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc
function ff_\type\()_h264_qpel8_mc20_neon, export=1
lowpass_const r3
sub r1, r1, #2
mov r3, r2
mov r12, #8
b \type\()_h264_qpel8_h_lowpass_neon
endfunc
function ff_\type\()_h264_qpel8_mc30_neon, export=1
lowpass_const r3
add r3, r1, #1
sub r1, r1, #2
mov r12, #8
b \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc
function ff_\type\()_h264_qpel8_mc01_neon, export=1
push {lr}
mov r12, r1
\type\()_h264_qpel8_mc01:
lowpass_const r3
mov r3, r2
sub r1, r1, r2, lsl #1
vpush {d8-d15}
bl \type\()_h264_qpel8_v_lowpass_l2_neon
vpop {d8-d15}
pop {pc}
endfunc
function ff_\type\()_h264_qpel8_mc11_neon, export=1
push {r0, r1, r11, lr}
\type\()_h264_qpel8_mc11:
lowpass_const r3
mov r11, sp
A bic sp, sp, #15
T bic r0, r11, #15
T mov sp, r0
sub sp, sp, #64
mov r0, sp
sub r1, r1, #2
mov r3, #8
mov r12, #8
vpush {d8-d15}
bl put_h264_qpel8_h_lowpass_neon
ldrd r0, r1, [r11], #8
mov r3, r2
add r12, sp, #64
sub r1, r1, r2, lsl #1
mov r2, #8
bl \type\()_h264_qpel8_v_lowpass_l2_neon
vpop {d8-d15}
mov sp, r11
pop {r11, pc}
endfunc
function ff_\type\()_h264_qpel8_mc21_neon, export=1
push {r0, r1, r4, r10, r11, lr}
\type\()_h264_qpel8_mc21:
lowpass_const r3
mov r11, sp
A bic sp, sp, #15
T bic r0, r11, #15
T mov sp, r0
sub sp, sp, #(8*8+16*12)
sub r1, r1, #2
mov r3, #8
mov r0, sp
mov r12, #8
vpush {d8-d15}
bl put_h264_qpel8_h_lowpass_neon
mov r4, r0
ldrd r0, r1, [r11], #8
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, r2
sub r2, r4, #64
bl \type\()_h264_qpel8_hv_lowpass_l2_neon
vpop {d8-d15}
mov sp, r11
pop {r4, r10, r11, pc}
endfunc
function ff_\type\()_h264_qpel8_mc31_neon, export=1
add r1, r1, #1
push {r0, r1, r11, lr}
sub r1, r1, #1
b \type\()_h264_qpel8_mc11
endfunc
function ff_\type\()_h264_qpel8_mc02_neon, export=1
push {lr}
lowpass_const r3
sub r1, r1, r2, lsl #1
mov r3, r2
vpush {d8-d15}
bl \type\()_h264_qpel8_v_lowpass_neon
vpop {d8-d15}
pop {pc}
endfunc
function ff_\type\()_h264_qpel8_mc12_neon, export=1
push {r0, r1, r4, r10, r11, lr}
\type\()_h264_qpel8_mc12:
lowpass_const r3
mov r11, sp
A bic sp, sp, #15
T bic r0, r11, #15
T mov sp, r0
sub sp, sp, #(8*8+16*12)
sub r1, r1, r2, lsl #1
mov r3, r2
mov r2, #8
mov r0, sp
vpush {d8-d15}
bl put_h264_qpel8_v_lowpass_neon
mov r4, r0
ldrd r0, r1, [r11], #8
sub r1, r1, r3, lsl #1
sub r1, r1, #2
sub r2, r4, #64
bl \type\()_h264_qpel8_hv_lowpass_l2_neon
vpop {d8-d15}
mov sp, r11
pop {r4, r10, r11, pc}
endfunc
function ff_\type\()_h264_qpel8_mc22_neon, export=1
push {r4, r10, r11, lr}
mov r11, sp
A bic sp, sp, #15
T bic r4, r11, #15
T mov sp, r4
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, r2
sub sp, sp, #(16*12)
mov r4, sp
vpush {d8-d15}
bl \type\()_h264_qpel8_hv_lowpass_neon
vpop {d8-d15}
mov sp, r11
pop {r4, r10, r11, pc}
endfunc
function ff_\type\()_h264_qpel8_mc32_neon, export=1
push {r0, r1, r4, r10, r11, lr}
add r1, r1, #1
b \type\()_h264_qpel8_mc12
endfunc
function ff_\type\()_h264_qpel8_mc03_neon, export=1
push {lr}
add r12, r1, r2
b \type\()_h264_qpel8_mc01
endfunc
function ff_\type\()_h264_qpel8_mc13_neon, export=1
push {r0, r1, r11, lr}
add r1, r1, r2
b \type\()_h264_qpel8_mc11
endfunc
function ff_\type\()_h264_qpel8_mc23_neon, export=1
push {r0, r1, r4, r10, r11, lr}
add r1, r1, r2
b \type\()_h264_qpel8_mc21
endfunc
function ff_\type\()_h264_qpel8_mc33_neon, export=1
add r1, r1, #1
push {r0, r1, r11, lr}
add r1, r1, r2
sub r1, r1, #1
b \type\()_h264_qpel8_mc11
endfunc
.endm
h264_qpel8 put
h264_qpel8 avg
.macro h264_qpel16 type
function ff_\type\()_h264_qpel16_mc10_neon, export=1
lowpass_const r3
mov r3, r1
sub r1, r1, #2
b \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc
function ff_\type\()_h264_qpel16_mc20_neon, export=1
lowpass_const r3
sub r1, r1, #2
mov r3, r2
b \type\()_h264_qpel16_h_lowpass_neon
endfunc
function ff_\type\()_h264_qpel16_mc30_neon, export=1
lowpass_const r3
add r3, r1, #1
sub r1, r1, #2
b \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc
function ff_\type\()_h264_qpel16_mc01_neon, export=1
push {r4, lr}
mov r12, r1
\type\()_h264_qpel16_mc01:
lowpass_const r3
mov r3, r2
sub r1, r1, r2, lsl #1
vpush {d8-d15}
bl \type\()_h264_qpel16_v_lowpass_l2_neon
vpop {d8-d15}
pop {r4, pc}
endfunc
function ff_\type\()_h264_qpel16_mc11_neon, export=1
push {r0, r1, r4, r11, lr}
\type\()_h264_qpel16_mc11:
lowpass_const r3
mov r11, sp
A bic sp, sp, #15
T bic r0, r11, #15
T mov sp, r0
sub sp, sp, #256
mov r0, sp
sub r1, r1, #2
mov r3, #16
vpush {d8-d15}
bl put_h264_qpel16_h_lowpass_neon
ldrd r0, r1, [r11], #8
mov r3, r2
add r12, sp, #64
sub r1, r1, r2, lsl #1
mov r2, #16
bl \type\()_h264_qpel16_v_lowpass_l2_neon
vpop {d8-d15}
mov sp, r11
pop {r4, r11, pc}
endfunc
function ff_\type\()_h264_qpel16_mc21_neon, export=1
push {r0, r1, r4-r5, r9-r11, lr}
\type\()_h264_qpel16_mc21:
lowpass_const r3
mov r11, sp
A bic sp, sp, #15
T bic r0, r11, #15
T mov sp, r0
sub sp, sp, #(16*16+16*12)
sub r1, r1, #2
mov r0, sp
vpush {d8-d15}
bl put_h264_qpel16_h_lowpass_neon_packed
mov r4, r0
ldrd r0, r1, [r11], #8
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, r2
bl \type\()_h264_qpel16_hv_lowpass_l2_neon
vpop {d8-d15}
mov sp, r11
pop {r4-r5, r9-r11, pc}
endfunc
function ff_\type\()_h264_qpel16_mc31_neon, export=1
add r1, r1, #1
push {r0, r1, r4, r11, lr}
sub r1, r1, #1
b \type\()_h264_qpel16_mc11
endfunc
function ff_\type\()_h264_qpel16_mc02_neon, export=1
push {r4, lr}
lowpass_const r3
sub r1, r1, r2, lsl #1
mov r3, r2
vpush {d8-d15}
bl \type\()_h264_qpel16_v_lowpass_neon
vpop {d8-d15}
pop {r4, pc}
endfunc
function ff_\type\()_h264_qpel16_mc12_neon, export=1
push {r0, r1, r4-r5, r9-r11, lr}
\type\()_h264_qpel16_mc12:
lowpass_const r3
mov r11, sp
A bic sp, sp, #15
T bic r0, r11, #15
T mov sp, r0
sub sp, sp, #(16*16+16*12)
sub r1, r1, r2, lsl #1
mov r0, sp
mov r3, r2
vpush {d8-d15}
bl put_h264_qpel16_v_lowpass_neon_packed
mov r4, r0
ldrd r0, r1, [r11], #8
sub r1, r1, r3, lsl #1
sub r1, r1, #2
mov r2, r3
bl \type\()_h264_qpel16_hv_lowpass_l2_neon
vpop {d8-d15}
mov sp, r11
pop {r4-r5, r9-r11, pc}
endfunc
function ff_\type\()_h264_qpel16_mc22_neon, export=1
push {r4, r9-r11, lr}
lowpass_const r3
mov r11, sp
A bic sp, sp, #15
T bic r4, r11, #15
T mov sp, r4
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, r2
sub sp, sp, #(16*12)
mov r4, sp
vpush {d8-d15}
bl \type\()_h264_qpel16_hv_lowpass_neon
vpop {d8-d15}
mov sp, r11
pop {r4, r9-r11, pc}
endfunc
function ff_\type\()_h264_qpel16_mc32_neon, export=1
push {r0, r1, r4-r5, r9-r11, lr}
add r1, r1, #1
b \type\()_h264_qpel16_mc12
endfunc
function ff_\type\()_h264_qpel16_mc03_neon, export=1
push {r4, lr}
add r12, r1, r2
b \type\()_h264_qpel16_mc01
endfunc
function ff_\type\()_h264_qpel16_mc13_neon, export=1
push {r0, r1, r4, r11, lr}
add r1, r1, r2
b \type\()_h264_qpel16_mc11
endfunc
function ff_\type\()_h264_qpel16_mc23_neon, export=1
push {r0, r1, r4-r5, r9-r11, lr}
add r1, r1, r2
b \type\()_h264_qpel16_mc21
endfunc
function ff_\type\()_h264_qpel16_mc33_neon, export=1
add r1, r1, #1
push {r0, r1, r4, r11, lr}
add r1, r1, r2
sub r1, r1, #1
b \type\()_h264_qpel16_mc11
endfunc
.endm
h264_qpel16 put
h264_qpel16 avg


@@ -0,0 +1,26 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_ARM_HEVCDSP_ARM_H
#define AVCODEC_ARM_HEVCDSP_ARM_H
#include "libavcodec/hevcdsp.h"
void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth);
#endif /* AVCODEC_ARM_HEVCDSP_ARM_H */


@@ -0,0 +1,385 @@
/*
* Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
#include "neon.S"
.macro hevc_loop_filter_chroma_start
ldr r12, [r2]
ldr r3, [r2, #4]
add r2, r3, r12
cmp r2, #0
it eq
bxeq lr
.endm
.macro hevc_loop_filter_chroma_body
vsubl.u8 q3, d4, d2
vsubl.u8 q11, d18, d19
vshl.i16 q3, #2
vadd.i16 q11, q3
vdup.16 d0, r12
vdup.16 d1, r3
vrshr.s16 q11, q11, #3
vneg.s16 q12, q0
vmovl.u8 q2, d4
vmin.s16 q11, q11, q0
vmax.s16 q11, q11, q12
vaddw.u8 q1, q11, d2
vsub.i16 q2, q11
vqmovun.s16 d2, q1
vqmovun.s16 d4, q2
.endm
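// The macro above computes the HEVC chroma deblocking delta for eight pixels at once,
// with d2/d4 holding p0/q0, d18/d19 holding p1/q1 and q0 (d0/d1) holding the two tc
// values.  Per pixel it is roughly equivalent to this C (names are illustrative):
//
//     int delta = av_clip(((q0 - p0) * 4 + p1 - q1 + 4) >> 3, -tc, tc);
//     p0 = av_clip_uint8(p0 + delta);
//     q0 = av_clip_uint8(q0 - delta);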
.macro hevc_loop_filter_luma_start
ldr r12, [r3]
ldr r3, [r3, #4]
lsl r3, #16
orr r3, r12
cmp r3, #0
it eq
bxeq lr
lsr r3, #16
.endm
.macro hevc_loop_filter_luma_body
vmovl.u8 q8, d16
vmovl.u8 q9, d18
vmovl.u8 q10, d20
vmovl.u8 q11, d22
vmovl.u8 q12, d24
vmovl.u8 q13, d26
vmovl.u8 q14, d28
vmovl.u8 q15, d30
vadd.i16 q7, q9, q11
vadd.i16 q6, q14, q12
vsub.i16 q7, q10
vsub.i16 q6, q13
vabd.s16 q7, q7, q10
vabd.s16 q6, q6, q13
vdup.16 q0, r2
vmov q4, q7
vmov q5, q6
vdup.16 d4, r12
vtrn.16 q7, q4
vtrn.16 q6, q5
vshl.u64 q7, #32
vshr.u64 q4, #32
vshl.u64 q6, #32
vshr.u64 q5, #32
vshr.u64 q7, #32
vshr.u64 q6, #32
vshl.u64 q5, #32
vshl.u64 q4, #32
vorr q6, q5
vorr q7, q4
vdup.16 d5, r3
vadd.i16 q5, q7, q6
vmov q4, q5
vmov q3, q5
vtrn.32 q3, q4
vadd.i16 q4, q3
vshl.s16 q5, q5, #1
vcgt.s16 q3, q0, q4
vmovn.i16 d6, q3
vshr.s16 q1, q0, #2
vmovn.i16 d6, q3
vcgt.s16 q5, q1, q5
vmov r7, s12
cmp r7, #0
beq bypasswrite
vpadd.i32 d0, d14, d12
vpadd.i32 d1, d15, d13
vmov q4, q2
vshl.s16 q2, #2
vshr.s16 q1, q1, #1
vrhadd.s16 q2, q4
vabd.s16 q7, q8, q11
vaba.s16 q7, q15, q12
vmovn.i32 d0, q0
vmov r5, r6, s0, s1
vcgt.s16 q6, q1, q7
vand q5, q5, q6
vabd.s16 q7, q11, q12
vcgt.s16 q6, q2, q7
vand q5, q5, q6
vmov q2, q5
vtrn.s16 q5, q2
vshr.u64 q2, #32
vshl.u64 q5, #32
vshl.u64 q2, #32
vshr.u64 q5, #32
vorr q5, q2
vmov q2, q5
vshl.i16 q7, q4, #1
vtrn.32 q2, q5
vand q5, q2
vneg.s16 q6, q7
vmovn.i16 d4, q5
vmovn.i16 d4, q2
vmov r8, s8
and r9, r8, r7
cmp r9, #0
beq weakfilter_\@
vadd.i16 q2, q11, q12
vadd.i16 q4, q9, q8
vadd.i16 q1, q2, q10
vdup.16 d10, r9
vadd.i16 q0, q1, q9
vshl.i16 q4, #1
lsr r9, #16
vadd.i16 q1, q0
vrshr.s16 q3, q0, #2
vadd.i16 q1, q13
vadd.i16 q4, q0
vsub.i16 q3, q10
vrshr.s16 q1, #3
vrshr.s16 q4, #3
vmax.s16 q3, q6
vsub.i16 q1, q11
vsub.i16 q4, q9
vmin.s16 q3, q7
vmax.s16 q4, q6
vmax.s16 q1, q6
vadd.i16 q3, q10
vmin.s16 q4, q7
vmin.s16 q1, q7
vdup.16 d11, r9
vadd.i16 q4, q9
vadd.i16 q1, q11
vbit q9, q4, q5
vadd.i16 q4, q2, q13
vbit q11, q1, q5
vadd.i16 q0, q4, q14
vadd.i16 q2, q15, q14
vadd.i16 q4, q0
vshl.i16 q2, #1
vadd.i16 q4, q10
vbit q10, q3, q5
vrshr.s16 q4, #3
vadd.i16 q2, q0
vrshr.s16 q3, q0, #2
vsub.i16 q4, q12
vrshr.s16 q2, #3
vsub.i16 q3, q13
vmax.s16 q4, q6
vsub.i16 q2, q14
vmax.s16 q3, q6
vmin.s16 q4, q7
vmax.s16 q2, q6
vmin.s16 q3, q7
vadd.i16 q4, q12
vmin.s16 q2, q7
vadd.i16 q3, q13
vbit q12, q4, q5
vadd.i16 q2, q14
vbit q13, q3, q5
vbit q14, q2, q5
weakfilter_\@:
mvn r8, r8
and r9, r8, r7
cmp r9, #0
beq ready_\@
vdup.16 q4, r2
vdup.16 d10, r9
lsr r9, #16
vmov q1, q4
vdup.16 d11, r9
vshr.s16 q1, #1
vsub.i16 q2, q12, q11
vadd.i16 q4, q1
vshl.s16 q0, q2, #3
vshr.s16 q4, #3
vadd.i16 q2, q0
vsub.i16 q0, q13, q10
vsub.i16 q2, q0
vshl.i16 q0, q0, #1
vsub.i16 q2, q0
vshl.s16 q1, q7, #2
vrshr.s16 q2, q2, #4
vadd.i16 q1, q7
vabs.s16 q3, q2
vshr.s16 q6, q6, #1
vcgt.s16 q1, q1, q3
vand q5, q1
vshr.s16 q7, q7, #1
vmax.s16 q2, q2, q6
vmin.s16 q2, q2, q7
vshr.s16 q7, q7, #1
vrhadd.s16 q3, q9, q11
vneg.s16 q6, q7
vsub.s16 q3, q10
vdup.16 d2, r5
vhadd.s16 q3, q2
vdup.16 d3, r6
vmax.s16 q3, q3, q6
vcgt.s16 q1, q4, q1
vmin.s16 q3, q3, q7
vand q1, q5
vadd.i16 q3, q10
lsr r5, #16
lsr r6, #16
vbit q10, q3, q1
vrhadd.s16 q3, q14, q12
vdup.16 d2, r5
vsub.s16 q3, q13
vdup.16 d3, r6
vhsub.s16 q3, q2
vcgt.s16 q1, q4, q1
vmax.s16 q3, q3, q6
vand q1, q5
vmin.s16 q3, q3, q7
vadd.i16 q3, q13
vbit q13, q3, q1
vadd.i16 q0, q11, q2
vsub.i16 q4, q12, q2
vbit q11, q0, q5
vbit q12, q4, q5
ready_\@:
vqmovun.s16 d16, q8
vqmovun.s16 d18, q9
vqmovun.s16 d20, q10
vqmovun.s16 d22, q11
vqmovun.s16 d24, q12
vqmovun.s16 d26, q13
vqmovun.s16 d28, q14
vqmovun.s16 d30, q15
.endm
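// The body above widens p3..q3 into q8..q15, derives the per-block dp/dq activity sums
// and compares them against beta (r2) and the two tc values (r12, r3), then applies the
// strong filter, the normal filter, or skips the block.  A rough C sketch of the
// normal-filter core it implements (names are illustrative):
//
//     int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
//     if (abs(delta0) < 10 * tc) {
//         delta0 = av_clip(delta0, -tc, tc);
//         p0 = av_clip_uint8(p0 + delta0);
//         q0 = av_clip_uint8(q0 - delta0);
//         // p1/q1 are conditionally adjusted by half-clipped deltas
//         // (the vrhadd/vhadd/vhsub sequence), per the dp/dq side conditions.
//     }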
function ff_hevc_v_loop_filter_luma_neon, export=1
hevc_loop_filter_luma_start
push {r5-r11}
vpush {d8-d15}
sub r0, #4
vld1.8 {d16}, [r0], r1
vld1.8 {d18}, [r0], r1
vld1.8 {d20}, [r0], r1
vld1.8 {d22}, [r0], r1
vld1.8 {d24}, [r0], r1
vld1.8 {d26}, [r0], r1
vld1.8 {d28}, [r0], r1
vld1.8 {d30}, [r0], r1
sub r0, r0, r1, lsl #3
transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30
hevc_loop_filter_luma_body
transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30
vst1.8 {d16}, [r0], r1
vst1.8 {d18}, [r0], r1
vst1.8 {d20}, [r0], r1
vst1.8 {d22}, [r0], r1
vst1.8 {d24}, [r0], r1
vst1.8 {d26}, [r0], r1
vst1.8 {d28}, [r0], r1
vst1.8 {d30}, [r0]
vpop {d8-d15}
pop {r5-r11}
bx lr
endfunc
function ff_hevc_h_loop_filter_luma_neon, export=1
hevc_loop_filter_luma_start
push {r5-r11}
vpush {d8-d15}
sub r0, r0, r1, lsl #2
vld1.8 {d16}, [r0], r1
vld1.8 {d18}, [r0], r1
vld1.8 {d20}, [r0], r1
vld1.8 {d22}, [r0], r1
vld1.8 {d24}, [r0], r1
vld1.8 {d26}, [r0], r1
vld1.8 {d28}, [r0], r1
vld1.8 {d30}, [r0], r1
sub r0, r0, r1, lsl #3
add r0, r1
hevc_loop_filter_luma_body
vst1.8 {d18}, [r0], r1
vst1.8 {d20}, [r0], r1
vst1.8 {d22}, [r0], r1
vst1.8 {d24}, [r0], r1
vst1.8 {d26}, [r0], r1
vst1.8 {d28}, [r0]
bypasswrite:
vpop {d8-d15}
pop {r5-r11}
bx lr
endfunc
function ff_hevc_v_loop_filter_chroma_neon, export=1
hevc_loop_filter_chroma_start
sub r0, #4
vld1.8 {d16}, [r0], r1
vld1.8 {d17}, [r0], r1
vld1.8 {d18}, [r0], r1
vld1.8 {d2}, [r0], r1
vld1.8 {d4}, [r0], r1
vld1.8 {d19}, [r0], r1
vld1.8 {d20}, [r0], r1
vld1.8 {d21}, [r0], r1
sub r0, r0, r1, lsl #3
transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21
hevc_loop_filter_chroma_body
transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21
vst1.8 {d16}, [r0], r1
vst1.8 {d17}, [r0], r1
vst1.8 {d18}, [r0], r1
vst1.8 {d2}, [r0], r1
vst1.8 {d4}, [r0], r1
vst1.8 {d19}, [r0], r1
vst1.8 {d20}, [r0], r1
vst1.8 {d21}, [r0]
bx lr
endfunc
function ff_hevc_h_loop_filter_chroma_neon, export=1
hevc_loop_filter_chroma_start
sub r0, r0, r1, lsl #1
vld1.8 {d18}, [r0], r1
vld1.8 {d2}, [r0], r1
vld1.8 {d4}, [r0], r1
vld1.8 {d19}, [r0]
sub r0, r0, r1, lsl #1
hevc_loop_filter_chroma_body
vst1.8 {d2}, [r0], r1
vst1.8 {d4}, [r0]
bx lr
endfunc


@@ -0,0 +1,465 @@
/*
* Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
#include "neon.S"
function ff_hevc_idct_4x4_dc_neon_8, export=1
ldrsh r1, [r0]
ldr r2, =0x20
add r1, #1
asr r1, #1
add r1, r2
asr r1, #6
vdup.16 q0, r1
vdup.16 q1, r1
vst1.16 {q0, q1}, [r0]
bx lr
endfunc
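// The _dc functions replace a DC-only coefficient block with its reconstructed value;
// in C terms (the 8x8/16x16/32x32 variants below reuse the same formula):
//
//     int dc = (((coeffs[0] + 1) >> 1) + 32) >> 6;
//     for (int i = 0; i < size * size; i++)
//         coeffs[i] = dc;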
function ff_hevc_idct_8x8_dc_neon_8, export=1
ldrsh r1, [r0]
ldr r2, =0x20
add r1, #1
asr r1, #1
add r1, r2
asr r1, #6
vdup.16 q8, r1
vdup.16 q9, r1
vmov.16 q10, q8
vmov.16 q11, q8
vmov.16 q12, q8
vmov.16 q13, q8
vmov.16 q14, q8
vmov.16 q15, q8
vstm r0, {q8-q15}
bx lr
endfunc
function ff_hevc_idct_16x16_dc_neon_8, export=1
ldrsh r1, [r0]
ldr r2, =0x20
add r1, #1
asr r1, #1
add r1, r2
asr r1, #6
vdup.16 q8, r1
vdup.16 q9, r1
vmov.16 q10, q8
vmov.16 q11, q8
vmov.16 q12, q8
vmov.16 q13, q8
vmov.16 q14, q8
vmov.16 q15, q8
vstm r0!, {q8-q15}
vstm r0!, {q8-q15}
vstm r0!, {q8-q15}
vstm r0, {q8-q15}
bx lr
endfunc
function ff_hevc_idct_32x32_dc_neon_8, export=1
ldrsh r1, [r0]
ldr r2, =0x20
add r1, #1
asr r1, #1
add r1, r2
asr r1, #6
mov r3, #16
vdup.16 q8, r1
vdup.16 q9, r1
vmov.16 q10, q8
vmov.16 q11, q8
vmov.16 q12, q8
vmov.16 q13, q8
vmov.16 q14, q8
vmov.16 q15, q8
1: subs r3, #1
vstm r0!, {q8-q15}
bne 1b
bx lr
endfunc
function ff_hevc_transform_add_4x4_neon_8, export=1
vldm r1, {q0-q1}
vld1.32 d4[0], [r0], r2
vld1.32 d4[1], [r0], r2
vld1.32 d5[0], [r0], r2
vld1.32 d5[1], [r0], r2
sub r0, r0, r2, lsl #2
vmovl.u8 q8, d4
vmovl.u8 q9, d5
vqadd.s16 q0, q0, q8
vqadd.s16 q1, q1, q9
vqmovun.s16 d0, q0
vqmovun.s16 d1, q1
vst1.32 d0[0], [r0], r2
vst1.32 d0[1], [r0], r2
vst1.32 d1[0], [r0], r2
vst1.32 d1[1], [r0], r2
bx lr
endfunc
function ff_hevc_transform_add_8x8_neon_8, export=1
mov r3, #8
1: subs r3, #1
vld1.16 {q0}, [r1]!
vld1.8 d16, [r0]
vmovl.u8 q8, d16
vqadd.s16 q0, q8
vqmovun.s16 d0, q0
vst1.32 d0, [r0], r2
bne 1b
bx lr
endfunc
function ff_hevc_transform_add_16x16_neon_8, export=1
mov r3, #16
1: subs r3, #1
vld1.16 {q0, q1}, [r1]!
vld1.8 {q8}, [r0]
vmovl.u8 q9, d16
vmovl.u8 q10, d17
vqadd.s16 q0, q9
vqadd.s16 q1, q10
vqmovun.s16 d0, q0
vqmovun.s16 d1, q1
vst1.8 {q0}, [r0], r2
bne 1b
bx lr
endfunc
function ff_hevc_transform_add_32x32_neon_8, export=1
mov r3, #32
1: subs r3, #1
vldm r1!, {q0-q3}
vld1.8 {q8, q9}, [r0]
vmovl.u8 q10, d16
vmovl.u8 q11, d17
vmovl.u8 q12, d18
vmovl.u8 q13, d19
vqadd.s16 q0, q10
vqadd.s16 q1, q11
vqadd.s16 q2, q12
vqadd.s16 q3, q13
vqmovun.s16 d0, q0
vqmovun.s16 d1, q1
vqmovun.s16 d2, q2
vqmovun.s16 d3, q3
vst1.8 {q0, q1}, [r0], r2
bne 1b
bx lr
endfunc
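// Each transform_add function above adds the residual coefficients to the destination
// pixels with unsigned 8-bit saturation; a scalar equivalent:
//
//     for (int y = 0; y < size; y++) {
//         for (int x = 0; x < size; x++)
//             dst[x] = av_clip_uint8(dst[x] + coeffs[y * size + x]);
//         dst += stride;
//     }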
.macro transpose_16b_8x8 r0, r1, r2, r3, r4, r5, r6, r7
vtrn.64 \r0, \r4
vtrn.64 \r1, \r5
vtrn.64 \r2, \r6
vtrn.64 \r3, \r7
vtrn.32 \r0, \r2
vtrn.32 \r1, \r3
vtrn.32 \r4, \r6
vtrn.32 \r5, \r7
vtrn.16 \r0, \r1
vtrn.16 \r2, \r3
vtrn.16 \r4, \r5
vtrn.16 \r6, \r7
.endm
// in 4 q regs
// output 8 d regs
.macro transpose_16b_4x4 r0, r1, r2, r3
vtrn.32 \r0, \r2
vtrn.32 \r1, \r3
vtrn.16 \r0, \r1
vtrn.16 \r2, \r3
.endm
/* uses registers q2 - q9 for temp values */
/* TODO: reorder */
.macro tr4_luma_shift r0, r1, r2, r3, shift
vaddl.s16 q5, \r0, \r2 // c0 = src0 + src2
vaddl.s16 q2, \r2, \r3 // c1 = src2 + src3
vsubl.s16 q4, \r0, \r3 // c2 = src0 - src3
vmull.s16 q6, \r1, d0[0] // c3 = 74 * src1
vaddl.s16 q7, \r0, \r3 // src0 + src3
vsubw.s16 q7, q7, \r2 // src0 - src2 + src3
vmul.s32 q7, q7, d0[0] // dst2 = 74 * (src0 - src2 + src3)
vmul.s32 q8, q5, d0[1] // 29 * c0
vmul.s32 q9, q2, d1[0] // 55 * c1
vadd.s32 q8, q9 // 29 * c0 + 55 * c1
vadd.s32 q8, q6 // dst0 = 29 * c0 + 55 * c1 + c3
vmul.s32 q2, q2, d0[1] // 29 * c1
vmul.s32 q9, q4, d1[0] // 55 * c2
vsub.s32 q9, q2 // 55 * c2 - 29 * c1
vadd.s32 q9, q6 // dst1 = 55 * c2 - 29 * c1 + c3
vmul.s32 q5, q5, d1[0] // 55 * c0
vmul.s32 q4, q4, d0[1] // 29 * c2
vadd.s32 q5, q4 // 55 * c0 + 29 * c2
vsub.s32 q5, q6 // dst3 = 55 * c0 + 29 * c2 - c3
vqrshrn.s32 \r0, q8, \shift
vqrshrn.s32 \r1, q9, \shift
vqrshrn.s32 \r2, q7, \shift
vqrshrn.s32 \r3, q5, \shift
.endm
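// In C, with src0..src3 one column of coefficients and add = 1 << (shift - 1), the
// 4x4 DST above reduces to:
//
//     int c0 = src0 + src2, c1 = src2 + src3, c2 = src0 - src3, c3 = 74 * src1;
//     dst0 = av_clip_int16((29 * c0 + 55 * c1 + c3 + add) >> shift);
//     dst1 = av_clip_int16((55 * c2 - 29 * c1 + c3 + add) >> shift);
//     dst2 = av_clip_int16((74 * (src0 - src2 + src3) + add) >> shift);
//     dst3 = av_clip_int16((55 * c0 + 29 * c2 - c3 + add) >> shift);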
/* uses registers q2 - q6 for temp values */
.macro tr4 r0, r1, r2, r3
vmull.s16 q4, \r1, d0[0] // 83 * src1
vmull.s16 q6, \r1, d0[1] // 36 * src1
vshll.s16 q2, \r0, #6 // 64 * src0
vshll.s16 q3, \r2, #6 // 64 * src2
vadd.s32 q5, q2, q3 // 64 * (src0 + src2) e0
vsub.s32 q2, q2, q3 // 64 * (src0 - src2) e1
vmlal.s16 q4, \r3, d0[1] // 83 * src1 + 36 * src3 o0
vmlsl.s16 q6, \r3, d0[0] // 36 * src1 - 83 * src3 o1
vsub.s32 q3, q5, q4 // e0 - o0
vadd.s32 q4, q5, q4 // e0 + o0
vadd.s32 q5, q2, q6 // e1 + o1
vsub.s32 q6, q2, q6 // e1 - o1
.endm
.macro tr4_shift r0, r1, r2, r3, shift
vmull.s16 q4, \r1, d0[0] // 83 * src1
vmull.s16 q6, \r1, d0[1] // 36 * src1
vshll.s16 q2, \r0, #6 // 64 * src0
vshll.s16 q3, \r2, #6 // 64 * src2
vadd.s32 q5, q2, q3 // 64 * (src0 + src2) e0
vsub.s32 q2, q2, q3 // 64 * (src0 - src2) e1
vmlal.s16 q4, \r3, d0[1] // 83 * src1 + 36 * src3 o0
vmlsl.s16 q6, \r3, d0[0] // 36 * src1 - 83 * src3 o1
vsub.s32 q3, q5, q4 // e0 - o0
vadd.s32 q4, q5, q4 // e0 + o0
vadd.s32 q5, q2, q6 // e1 + o1
vsub.s32 q6, q2, q6 // e1 - o1
vqrshrn.s32 \r0, q4, \shift
vqrshrn.s32 \r1, q5, \shift
vqrshrn.s32 \r2, q6, \shift
vqrshrn.s32 \r3, q3, \shift
.endm
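// tr4/tr4_shift implement one 4-point inverse-transform butterfly:
//
//     e0 = 64 * (src0 + src2);    o0 = 83 * src1 + 36 * src3;
//     e1 = 64 * (src0 - src2);    o1 = 36 * src1 - 83 * src3;
//     dst0 = e0 + o0;  dst1 = e1 + o1;  dst2 = e1 - o1;  dst3 = e0 - o0;
//
// tr4_shift additionally narrows each result with rounding and int16 saturation:
//     dstN = av_clip_int16((dstN + (1 << (shift - 1))) >> shift);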
function ff_hevc_transform_4x4_neon_8, export=1
vpush {d8-d15}
vld1.16 {q14, q15}, [r0] // coeffs
ldr r3, =0x00240053 // 36 and 83
vmov.32 d0[0], r3
tr4_shift d28, d29, d30, d31, #7
vtrn.16 d28, d29
vtrn.16 d30, d31
vtrn.32 q14, q15
tr4_shift d28, d29, d30, d31, #12
vtrn.16 d28, d29
vtrn.16 d30, d31
vtrn.32 q14, q15
vst1.16 {q14, q15}, [r0]
vpop {d8-d15}
bx lr
endfunc
function ff_hevc_transform_luma_4x4_neon_8, export=1
vpush {d8-d15}
vld1.16 {q14, q15}, [r0] // coeffs
ldr r3, =0x4a // 74
vmov.32 d0[0], r3
ldr r3, =0x1d // 29
vmov.32 d0[1], r3
ldr r3, =0x37 // 55
vmov.32 d1[0], r3
tr4_luma_shift d28, d29, d30, d31, #7
vtrn.16 d28, d29
vtrn.16 d30, d31
vtrn.32 q14, q15
tr4_luma_shift d28, d29, d30, d31, #12
vtrn.16 d28, d29
vtrn.16 d30, d31
vtrn.32 q14, q15
vst1.16 {q14, q15}, [r0]
vpop {d8-d15}
bx lr
endfunc
.macro tr8_begin in0, in1, in2, in3
vmull.s16 q7, \in0, d1[1] // 89 * src1
vmull.s16 q8, \in0, d1[0] // 75 * src1
vmull.s16 q9, \in0, d1[3] // 50 * src1
vmull.s16 q10, \in0, d1[2] // 18 * src1
vmlal.s16 q7, \in1, d1[0] // 75 * src3
vmlsl.s16 q8, \in1, d1[2] //-18 * src3
vmlsl.s16 q9, \in1, d1[1] //-89 * src3
vmlsl.s16 q10, \in1, d1[3] //-50 * src3
vmlal.s16 q7, \in2, d1[3] // 50 * src5
vmlsl.s16 q8, \in2, d1[1] //-89 * src5
vmlal.s16 q9, \in2, d1[2] // 18 * src5
vmlal.s16 q10, \in2, d1[0] // 75 * src5
vmlal.s16 q7, \in3, d1[2] // 18 * src7
vmlsl.s16 q8, \in3, d1[3] //-50 * src7
vmlal.s16 q9, \in3, d1[0] // 75 * src7
vmlsl.s16 q10, \in3, d1[1] //-89 * src7
.endm
.macro tr8_end shift
vadd.s32 q1, q4, q7 // e_8[0] + o_8[0], dst[0]
vsub.s32 q4, q4, q7 // e_8[0] - o_8[0], dst[7]
vadd.s32 q2, q5, q8 // e_8[1] + o_8[1], dst[1]
vsub.s32 q5, q5, q8 // e_8[1] - o_8[1], dst[6]
vadd.s32 q11, q6, q9 // e_8[2] + o_8[2], dst[2]
vsub.s32 q6, q6, q9 // e_8[2] - o_8[2], dst[5]
vadd.s32 q12, q3, q10 // e_8[3] + o_8[3], dst[3]
vsub.s32 q3, q3, q10 // e_8[3] - o_8[3], dst[4]
vqrshrn.s32 d2, q1, \shift
vqrshrn.s32 d3, q2, \shift
vqrshrn.s32 d4, q11, \shift
vqrshrn.s32 d5, q12, \shift
vqrshrn.s32 d6, q3, \shift
vqrshrn.s32 d7, q6, \shift
vqrshrn.s32 d9, q4, \shift
vqrshrn.s32 d8, q5, \shift
.endm
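// tr8_begin computes the odd half of the 8-point inverse transform and tr8_end folds
// it with the even half produced by tr4 (q4, q5, q6, q3); in C:
//
//     o0 = 89*s1 + 75*s3 + 50*s5 + 18*s7;
//     o1 = 75*s1 - 18*s3 - 89*s5 - 50*s7;
//     o2 = 50*s1 - 89*s3 + 18*s5 + 75*s7;
//     o3 = 18*s1 - 50*s3 + 75*s5 - 89*s7;
//     dst[k]     = (e[k] + o[k] + add) >> shift;   // k = 0..3, saturated to int16
//     dst[7 - k] = (e[k] - o[k] + add) >> shift;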
function ff_hevc_transform_8x8_neon_8, export=1
push {r4-r8}
vpush {d8-d15}
mov r5, #16
adr r3, tr4f
vld1.16 {d0, d1}, [r3]
// left half
vld1.16 {d24}, [r0], r5
vld1.16 {d25}, [r0], r5
vld1.16 {d26}, [r0], r5
vld1.16 {d27}, [r0], r5
vld1.16 {d28}, [r0], r5
vld1.16 {d29}, [r0], r5
vld1.16 {d30}, [r0], r5
vld1.16 {d31}, [r0], r5
sub r0, #128
tr8_begin d25, d27, d29, d31
tr4 d24, d26, d28, d30
tr8_end #7
vst1.16 {d2}, [r0], r5
vst1.16 {d3}, [r0], r5
vst1.16 {d4}, [r0], r5
vst1.16 {d5}, [r0], r5
vst1.16 {d6}, [r0], r5
vst1.16 {d7}, [r0], r5
vst1.16 {d8}, [r0], r5
vst1.16 {d9}, [r0], r5
sub r0, #128
//skip right half if col_limit in r1 is less than 4
cmp r1, #4
blt 1f
//right half
add r0, #8
vld1.16 {d24}, [r0], r5
vld1.16 {d25}, [r0], r5
vld1.16 {d26}, [r0], r5
vld1.16 {d27}, [r0], r5
vld1.16 {d28}, [r0], r5
vld1.16 {d29}, [r0], r5
vld1.16 {d30}, [r0], r5
vld1.16 {d31}, [r0], r5
sub r0, #128
tr8_begin d25, d27, d29, d31
tr4 d24, d26, d28, d30
tr8_end #7
vst1.16 {d2}, [r0], r5
vst1.16 {d3}, [r0], r5
vst1.16 {d4}, [r0], r5
vst1.16 {d5}, [r0], r5
vst1.16 {d6}, [r0], r5
vst1.16 {d7}, [r0], r5
vst1.16 {d8}, [r0], r5
vst1.16 {d9}, [r0], r5
sub r0, #136
1:
// top half
vldm r0, {q12-q15} // coeffs
transpose_16b_4x4 d24, d26, d28, d30
transpose_16b_4x4 d25, d27, d29, d31
tr8_begin d26, d30, d27, d31
tr4 d24, d28, d25, d29
tr8_end #12
transpose_16b_4x4 d2, d3, d4, d5
transpose_16b_4x4 d6, d7, d8, d9
vswp d7, d5
vswp d7, d8
vswp d3, d6
vswp d6, d4
vstm r0!, {q1-q4}
// bottom half
vldm r0, {q12-q15} // coeffs
transpose_16b_4x4 d24, d26, d28, d30
transpose_16b_4x4 d25, d27, d29, d31
tr8_begin d26, d30, d27, d31
tr4 d24, d28, d25, d29
tr8_end #12
transpose_16b_4x4 d2, d3, d4, d5
transpose_16b_4x4 d6, d7, d8, d9
vswp d7, d5
vswp d7, d8
vswp d3, d6
vswp d6, d4
//vstm r0, {q1-q4}
vst1.16 {q1-q2}, [r0]
add r0, #32
vst1.16 {q3-q4}, [r0]
sub r0, #32
vpop {d8-d15}
pop {r4-r8}
bx lr
endfunc
.align 4
tr4f:
.word 0x00240053 // 36 and d1[0] = 83
.word 0x00000000
tr8f:
.word 0x0059004b // 89, d0[0] = 75
.word 0x00320012 // 50, d0[2] = 18
tr16:
.word 0x005a0057 // 90, d2[0] = 87
.word 0x00500046 // 80, d2[2] = 70
.word 0x0039002b // 57, d2[0] = 43
.word 0x00190009 // 25, d2[2] = 9


@@ -0,0 +1,32 @@
/*
* Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/hevcdsp.h"
#include "hevcdsp_arm.h"
av_cold void ff_hevcdsp_init_arm(HEVCDSPContext *c, const int bit_depth)
{
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags))
ff_hevcdsp_init_neon(c, bit_depth);
}


@@ -0,0 +1,224 @@
/*
* Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/hevcdsp.h"
#include "hevcdsp_arm.h"
void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
void ff_hevc_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
void ff_hevc_h_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
void ff_hevc_transform_4x4_neon_8(int16_t *coeffs, int col_limit);
void ff_hevc_transform_8x8_neon_8(int16_t *coeffs, int col_limit);
void ff_hevc_idct_4x4_dc_neon_8(int16_t *coeffs);
void ff_hevc_idct_8x8_dc_neon_8(int16_t *coeffs);
void ff_hevc_idct_16x16_dc_neon_8(int16_t *coeffs);
void ff_hevc_idct_32x32_dc_neon_8(int16_t *coeffs);
void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs);
void ff_hevc_transform_add_4x4_neon_8(uint8_t *_dst, int16_t *coeffs,
ptrdiff_t stride);
void ff_hevc_transform_add_8x8_neon_8(uint8_t *_dst, int16_t *coeffs,
ptrdiff_t stride);
void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
ptrdiff_t stride);
void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
ptrdiff_t stride);
#define PUT_PIXELS(name) \
void name(int16_t *dst, uint8_t *src, \
ptrdiff_t srcstride, int height, \
intptr_t mx, intptr_t my, int width)
PUT_PIXELS(ff_hevc_put_pixels_w2_neon_8);
PUT_PIXELS(ff_hevc_put_pixels_w4_neon_8);
PUT_PIXELS(ff_hevc_put_pixels_w6_neon_8);
PUT_PIXELS(ff_hevc_put_pixels_w8_neon_8);
PUT_PIXELS(ff_hevc_put_pixels_w12_neon_8);
PUT_PIXELS(ff_hevc_put_pixels_w16_neon_8);
PUT_PIXELS(ff_hevc_put_pixels_w24_neon_8);
PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8);
PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8);
PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8);
#undef PUT_PIXELS
static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
int height, int width);
static void (*put_hevc_qpel_uw_neon[4][4])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
int width, int height, int16_t* src2, ptrdiff_t src2stride);
void ff_hevc_put_qpel_neon_wrapper(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_qpel_uni_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
int16_t *src2,
int height, intptr_t mx, intptr_t my, int width);
#define QPEL_FUNC(name) \
void name(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, \
int height, int width)
QPEL_FUNC(ff_hevc_put_qpel_v1_neon_8);
QPEL_FUNC(ff_hevc_put_qpel_v2_neon_8);
QPEL_FUNC(ff_hevc_put_qpel_v3_neon_8);
QPEL_FUNC(ff_hevc_put_qpel_h1_neon_8);
QPEL_FUNC(ff_hevc_put_qpel_h2_neon_8);
QPEL_FUNC(ff_hevc_put_qpel_h3_neon_8);
QPEL_FUNC(ff_hevc_put_qpel_h1v1_neon_8);
QPEL_FUNC(ff_hevc_put_qpel_h1v2_neon_8);
QPEL_FUNC(ff_hevc_put_qpel_h1v3_neon_8);
QPEL_FUNC(ff_hevc_put_qpel_h2v1_neon_8);
QPEL_FUNC(ff_hevc_put_qpel_h2v2_neon_8);
QPEL_FUNC(ff_hevc_put_qpel_h2v3_neon_8);
QPEL_FUNC(ff_hevc_put_qpel_h3v1_neon_8);
QPEL_FUNC(ff_hevc_put_qpel_h3v2_neon_8);
QPEL_FUNC(ff_hevc_put_qpel_h3v3_neon_8);
#undef QPEL_FUNC
#define QPEL_FUNC_UW_PIX(name) \
void name(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, \
int height, intptr_t mx, intptr_t my, int width);
QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w4_neon_8);
QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w8_neon_8);
QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w16_neon_8);
QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w24_neon_8);
QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w32_neon_8);
QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w48_neon_8);
QPEL_FUNC_UW_PIX(ff_hevc_put_qpel_uw_pixels_w64_neon_8);
#undef QPEL_FUNC_UW_PIX
#define QPEL_FUNC_UW(name) \
void name(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, \
int width, int height, int16_t* src2, ptrdiff_t src2stride);
QPEL_FUNC_UW(ff_hevc_put_qpel_uw_pixels_neon_8);
QPEL_FUNC_UW(ff_hevc_put_qpel_uw_v1_neon_8);
QPEL_FUNC_UW(ff_hevc_put_qpel_uw_v2_neon_8);
QPEL_FUNC_UW(ff_hevc_put_qpel_uw_v3_neon_8);
QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1_neon_8);
QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2_neon_8);
QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3_neon_8);
QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1v1_neon_8);
QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1v2_neon_8);
QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h1v3_neon_8);
QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v1_neon_8);
QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v2_neon_8);
QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v3_neon_8);
QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v1_neon_8);
QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v2_neon_8);
QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v3_neon_8);
#undef QPEL_FUNC_UW
void ff_hevc_put_qpel_neon_wrapper(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
int height, intptr_t mx, intptr_t my, int width) {
put_hevc_qpel_neon[my][mx](dst, MAX_PB_SIZE, src, srcstride, height, width);
}
void ff_hevc_put_qpel_uni_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
int height, intptr_t mx, intptr_t my, int width) {
put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, NULL, 0);
}
void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
int16_t *src2,
int height, intptr_t mx, intptr_t my, int width) {
put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE);
}
av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
{
if (bit_depth == 8) {
int x;
c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_neon;
c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_neon;
c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_neon;
c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_neon;
c->idct[0] = ff_hevc_transform_4x4_neon_8;
c->idct[1] = ff_hevc_transform_8x8_neon_8;
c->idct_dc[0] = ff_hevc_idct_4x4_dc_neon_8;
c->idct_dc[1] = ff_hevc_idct_8x8_dc_neon_8;
c->idct_dc[2] = ff_hevc_idct_16x16_dc_neon_8;
c->idct_dc[3] = ff_hevc_idct_32x32_dc_neon_8;
c->transform_add[0] = ff_hevc_transform_add_4x4_neon_8;
c->transform_add[1] = ff_hevc_transform_add_8x8_neon_8;
c->transform_add[2] = ff_hevc_transform_add_16x16_neon_8;
c->transform_add[3] = ff_hevc_transform_add_32x32_neon_8;
c->idct_4x4_luma = ff_hevc_transform_luma_4x4_neon_8;
put_hevc_qpel_neon[1][0] = ff_hevc_put_qpel_v1_neon_8;
put_hevc_qpel_neon[2][0] = ff_hevc_put_qpel_v2_neon_8;
put_hevc_qpel_neon[3][0] = ff_hevc_put_qpel_v3_neon_8;
put_hevc_qpel_neon[0][1] = ff_hevc_put_qpel_h1_neon_8;
put_hevc_qpel_neon[0][2] = ff_hevc_put_qpel_h2_neon_8;
put_hevc_qpel_neon[0][3] = ff_hevc_put_qpel_h3_neon_8;
put_hevc_qpel_neon[1][1] = ff_hevc_put_qpel_h1v1_neon_8;
put_hevc_qpel_neon[1][2] = ff_hevc_put_qpel_h2v1_neon_8;
put_hevc_qpel_neon[1][3] = ff_hevc_put_qpel_h3v1_neon_8;
put_hevc_qpel_neon[2][1] = ff_hevc_put_qpel_h1v2_neon_8;
put_hevc_qpel_neon[2][2] = ff_hevc_put_qpel_h2v2_neon_8;
put_hevc_qpel_neon[2][3] = ff_hevc_put_qpel_h3v2_neon_8;
put_hevc_qpel_neon[3][1] = ff_hevc_put_qpel_h1v3_neon_8;
put_hevc_qpel_neon[3][2] = ff_hevc_put_qpel_h2v3_neon_8;
put_hevc_qpel_neon[3][3] = ff_hevc_put_qpel_h3v3_neon_8;
put_hevc_qpel_uw_neon[1][0] = ff_hevc_put_qpel_uw_v1_neon_8;
put_hevc_qpel_uw_neon[2][0] = ff_hevc_put_qpel_uw_v2_neon_8;
put_hevc_qpel_uw_neon[3][0] = ff_hevc_put_qpel_uw_v3_neon_8;
put_hevc_qpel_uw_neon[0][1] = ff_hevc_put_qpel_uw_h1_neon_8;
put_hevc_qpel_uw_neon[0][2] = ff_hevc_put_qpel_uw_h2_neon_8;
put_hevc_qpel_uw_neon[0][3] = ff_hevc_put_qpel_uw_h3_neon_8;
put_hevc_qpel_uw_neon[1][1] = ff_hevc_put_qpel_uw_h1v1_neon_8;
put_hevc_qpel_uw_neon[1][2] = ff_hevc_put_qpel_uw_h2v1_neon_8;
put_hevc_qpel_uw_neon[1][3] = ff_hevc_put_qpel_uw_h3v1_neon_8;
put_hevc_qpel_uw_neon[2][1] = ff_hevc_put_qpel_uw_h1v2_neon_8;
put_hevc_qpel_uw_neon[2][2] = ff_hevc_put_qpel_uw_h2v2_neon_8;
put_hevc_qpel_uw_neon[2][3] = ff_hevc_put_qpel_uw_h3v2_neon_8;
put_hevc_qpel_uw_neon[3][1] = ff_hevc_put_qpel_uw_h1v3_neon_8;
put_hevc_qpel_uw_neon[3][2] = ff_hevc_put_qpel_uw_h2v3_neon_8;
put_hevc_qpel_uw_neon[3][3] = ff_hevc_put_qpel_uw_h3v3_neon_8;
for (x = 0; x < 10; x++) {
c->put_hevc_qpel[x][1][0] = ff_hevc_put_qpel_neon_wrapper;
c->put_hevc_qpel[x][0][1] = ff_hevc_put_qpel_neon_wrapper;
c->put_hevc_qpel[x][1][1] = ff_hevc_put_qpel_neon_wrapper;
c->put_hevc_qpel_uni[x][1][0] = ff_hevc_put_qpel_uni_neon_wrapper;
c->put_hevc_qpel_uni[x][0][1] = ff_hevc_put_qpel_uni_neon_wrapper;
c->put_hevc_qpel_uni[x][1][1] = ff_hevc_put_qpel_uni_neon_wrapper;
c->put_hevc_qpel_bi[x][1][0] = ff_hevc_put_qpel_bi_neon_wrapper;
c->put_hevc_qpel_bi[x][0][1] = ff_hevc_put_qpel_bi_neon_wrapper;
c->put_hevc_qpel_bi[x][1][1] = ff_hevc_put_qpel_bi_neon_wrapper;
}
c->put_hevc_qpel[0][0][0] = ff_hevc_put_pixels_w2_neon_8;
c->put_hevc_qpel[1][0][0] = ff_hevc_put_pixels_w4_neon_8;
c->put_hevc_qpel[2][0][0] = ff_hevc_put_pixels_w6_neon_8;
c->put_hevc_qpel[3][0][0] = ff_hevc_put_pixels_w8_neon_8;
c->put_hevc_qpel[4][0][0] = ff_hevc_put_pixels_w12_neon_8;
c->put_hevc_qpel[5][0][0] = ff_hevc_put_pixels_w16_neon_8;
c->put_hevc_qpel[6][0][0] = ff_hevc_put_pixels_w24_neon_8;
c->put_hevc_qpel[7][0][0] = ff_hevc_put_pixels_w32_neon_8;
c->put_hevc_qpel[8][0][0] = ff_hevc_put_pixels_w48_neon_8;
c->put_hevc_qpel[9][0][0] = ff_hevc_put_pixels_w64_neon_8;
c->put_hevc_qpel_uni[1][0][0] = ff_hevc_put_qpel_uw_pixels_w4_neon_8;
c->put_hevc_qpel_uni[3][0][0] = ff_hevc_put_qpel_uw_pixels_w8_neon_8;
c->put_hevc_qpel_uni[5][0][0] = ff_hevc_put_qpel_uw_pixels_w16_neon_8;
c->put_hevc_qpel_uni[6][0][0] = ff_hevc_put_qpel_uw_pixels_w24_neon_8;
c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_qpel_uw_pixels_w32_neon_8;
c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_qpel_uw_pixels_w48_neon_8;
c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_qpel_uw_pixels_w64_neon_8;
}
}


@@ -0,0 +1,999 @@
/*
* Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
#include "neon.S"
#define MAX_PB_SIZE #64
.macro regshuffle_d8
vmov d16, d17
vmov d17, d18
vmov d18, d19
vmov d19, d20
vmov d20, d21
vmov d21, d22
vmov d22, d23
.endm
.macro regshuffle_q8
vmov q0, q1
vmov q1, q2
vmov q2, q3
vmov q3, q4
vmov q4, q5
vmov q5, q6
vmov q6, q7
.endm
.macro vextin8
pld [r2]
vld1.8 {q11}, [r2], r3
vext.8 d16, d22, d23, #1
vext.8 d17, d22, d23, #2
vext.8 d18, d22, d23, #3
vext.8 d19, d22, d23, #4
vext.8 d20, d22, d23, #5
vext.8 d21, d22, d23, #6
vext.8 d22, d22, d23, #7
.endm
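// vextin8 loads 16 consecutive bytes from r2 and leaves eight overlapping 8-byte
// windows (offsets 1..8 into the load) in d16..d23; the qpel_filter_X macros then
// read them as the taps a..h for eight adjacent output pixels.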
.macro loadin8
pld [r2]
vld1.8 {d16}, [r2], r3
pld [r2]
vld1.8 {d17}, [r2], r3
pld [r2]
vld1.8 {d18}, [r2], r3
pld [r2]
vld1.8 {d19}, [r2], r3
pld [r2]
vld1.8 {d20}, [r2], r3
pld [r2]
vld1.8 {d21}, [r2], r3
pld [r2]
vld1.8 {d22}, [r2], r3
pld [r2]
vld1.8 {d23}, [r2], r3
.endm
.macro qpel_filter_1_32b
vmov.i16 d16, #58
vmov.i16 d17, #10
vmull.s16 q9, d6, d16 // 58 * d0
vmull.s16 q10, d7, d16 // 58 * d1
vmov.i16 d16, #17
vmull.s16 q11, d4, d17 // 10 * c0
vmull.s16 q12, d5, d17 // 10 * c1
vmov.i16 d17, #5
vmull.s16 q13, d8, d16 // 17 * e0
vmull.s16 q14, d9, d16 // 17 * e1
vmull.s16 q15, d10, d17 // 5 * f0
vmull.s16 q8, d11, d17 // 5 * f1
vsub.s32 q9, q11 // 58 * d0 - 10 * c0
vsub.s32 q10, q12 // 58 * d1 - 10 * c1
vshll.s16 q11, d2, #2 // 4 * b0
vshll.s16 q12, d3, #2 // 4 * b1
vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0
vadd.s32 q10, q14 // 58 * d1 - 10 * c1 + 17 * e1
vsubl.s16 q13, d12, d0 // g0 - a0
vsubl.s16 q14, d13, d1 // g1 - a1
vadd.s32 q9, q11 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0
vadd.s32 q10, q12 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1
vsub.s32 q13, q15 // g0 - a0 - 5 * f0
vsub.s32 q14, q8 // g1 - a1 - 5 * f1
vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + g0 - a0 - 5 * f0
vadd.s32 q10, q14 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + g1 - a1 - 5 * f1
vqshrn.s32 d16, q9, #6
vqshrn.s32 d17, q10, #6
.endm
// input q0 - q7
// output q8
.macro qpel_filter_2_32b
vmov.i32 q8, #11
vaddl.s16 q9, d6, d8 // d0 + e0
vaddl.s16 q10, d7, d9 // d1 + e1
vaddl.s16 q11, d4, d10 // c0 + f0
vaddl.s16 q12, d5, d11 // c1 + f1
vmul.s32 q11, q8 // 11 * (c0 + f0)
vmul.s32 q12, q8 // 11 * (c1 + f1)
vmov.i32 q8, #40
vaddl.s16 q15, d2, d12 // b0 + g0
vmul.s32 q9, q8 // 40 * (d0 + e0)
vmul.s32 q10, q8 // 40 * (d1 + e1)
vaddl.s16 q8, d3, d13 // b1 + g1
vaddl.s16 q13, d0, d14 // a0 + h0
vaddl.s16 q14, d1, d15 // a1 + h1
vshl.s32 q15, #2 // 4*(b0+g0)
vshl.s32 q8, #2 // 4*(b1+g1)
vadd.s32 q11, q13 // 11 * (c0 + f0) + a0 + h0
vadd.s32 q12, q14 // 11 * (c1 + f1) + a1 + h1
vadd.s32 q9, q15 // 40 * (d0 + e0) + 4*(b0+g0)
vadd.s32 q10, q8 // 40 * (d1 + e1) + 4*(b1+g1)
vsub.s32 q9, q11 // 40 * (d0 + e0) + 4*(b0+g0) - (11 * (c0 + f0) + a0 + h0)
vsub.s32 q10, q12 // 40 * (d1 + e1) + 4*(b1+g1) - (11 * (c1 + f1) + a1 + h1)
vqshrn.s32 d16, q9, #6
vqshrn.s32 d17, q10, #6
.endm
.macro qpel_filter_3_32b
vmov.i16 d16, #58
vmov.i16 d17, #10
vmull.s16 q9, d8, d16 // 58 * d0
vmull.s16 q10, d9, d16 // 58 * d1
vmov.i16 d16, #17
vmull.s16 q11, d10, d17 // 10 * c0
vmull.s16 q12, d11, d17 // 10 * c1
vmov.i16 d17, #5
vmull.s16 q13, d6, d16 // 17 * e0
vmull.s16 q14, d7, d16 // 17 * e1
vmull.s16 q15, d4, d17 // 5 * f0
vmull.s16 q8, d5, d17 // 5 * f1
vsub.s32 q9, q11 // 58 * d0 - 10 * c0
vsub.s32 q10, q12 // 58 * d1 - 10 * c1
vshll.s16 q11, d12, #2 // 4 * b0
vshll.s16 q12, d13, #2 // 4 * b1
vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0
vadd.s32 q10, q14 // 58 * d1 - 10 * c1 + 17 * e1
vsubl.s16 q13, d2, d14 // g0 - a0
vsubl.s16 q14, d3, d15 // g1 - a1
vadd.s32 q9, q11 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0
vadd.s32 q10, q12 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1
vsub.s32 q13, q15 // g0 - a0 - 5 * f0
vsub.s32 q14, q8 // g1 - a1 - 5 * f1
vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + g0 - a0 - 5 * f0
vadd.s32 q10, q14 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + g1 - a1 - 5 * f1
vqshrn.s32 d16, q9, #6
vqshrn.s32 d17, q10, #6
.endm
.macro qpel_filter_1 out=q7
vmov.u8 d24, #58
vmov.u8 d25, #10
vshll.u8 q13, d20, #4 // 16*e
vshll.u8 q14, d21, #2 // 4*f
vmull.u8 \out, d19, d24 // 58*d
vaddw.u8 q13, q13, d20 // 17*e
vmull.u8 q15, d18, d25 // 10*c
vaddw.u8 q14, q14, d21 // 5*f
vsubl.u8 q12, d22, d16 // g - a
vadd.u16 \out, q13 // 58d + 17e
vshll.u8 q13, d17, #2 // 4*b
vadd.u16 q15, q14 // 10*c + 5*f
vadd.s16 q13, q12 // - a + 4*b + g
vsub.s16 \out, q15 // -10*c + 58*d + 17*e -5*f
vadd.s16 \out, q13 // -a + 4*b -10*c + 58*d + 17*e -5*f
.endm
.macro qpel_filter_2 out=q7
vmov.i16 q12, #10
vmov.i16 q14, #11
vaddl.u8 q13, d19, d20 // d + e
vaddl.u8 q15, d18, d21 // c + f
vmul.u16 q13, q12 // 10 * (d+e)
vmul.u16 q15, q14 // 11 * ( c + f)
vaddl.u8 \out, d17, d22 // b + g
vaddl.u8 q12, d16, d23 // a + h
vadd.u16 \out, q13 // b + 10 * (d + e) + g
vadd.s16 q12, q15
vshl.u16 \out, #2 // 4 * (b + 10 * (d + e) + g)
vsub.s16 \out, q12
.endm
.macro qpel_filter_3 out=q7
vmov.u8 d24, #58
vmov.u8 d25, #10
vshll.u8 q13, d19, #4 // 16*e
vshll.u8 q14, d18, #2 // 4*f
vmull.u8 \out, d20, d24 // 58*d
vaddw.u8 q13, q13, d19 // 17*e
vmull.u8 q15, d21, d25 // 10*c
vaddw.u8 q14, q14, d18 // 5*f
vsubl.u8 q12, d17, d23 // g - a
vadd.u16 \out, q13 // 58d + 17e
vshll.u8 q13, d22, #2 // 4*b
vadd.u16 q15, q14 // 10*c + 5*f
vadd.s16 q13, q12 // - a + 4*b + g
vsub.s16 \out, q15 // -10*c + 58*d + 17*e -5*f
vadd.s16 \out, q13 // -a + 4*b -10*c + 58*d + 17*e -5*f
.endm
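// The three filter variants above correspond to the HEVC luma 8-tap interpolation
// coefficients, applied to the eight samples a..h around the target position:
//
//     qpel_filter_1:  { -1,  4, -10, 58, 17,  -5,  1,  0 }
//     qpel_filter_2:  { -1,  4, -11, 40, 40, -11,  4, -1 }
//     qpel_filter_3:  {  0,  1,  -5, 17, 58, -10,  4, -1 }
//
// The 16-bit macros keep the raw weighted sum (stored as the int16 intermediate); the
// _32b variants used for the combined h+v case narrow their result with vqshrn #6.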
.macro hevc_put_qpel_vX_neon_8 filter
push {r4, r5, r6, r7}
ldr r4, [sp, #16] // height
ldr r5, [sp, #20] // width
vpush {d8-d15}
sub r2, r2, r3, lsl #1
sub r2, r3
mov r12, r4
mov r6, r0
mov r7, r2
lsl r1, #1
0: loadin8
cmp r5, #4
beq 4f
8: subs r4, #1
\filter
vst1.16 {q7}, [r0], r1
regshuffle_d8
vld1.8 {d23}, [r2], r3
bne 8b
subs r5, #8
beq 99f
mov r4, r12
add r6, #16
mov r0, r6
add r7, #8
mov r2, r7
b 0b
4: subs r4, #1
\filter
vst1.16 d14, [r0], r1
regshuffle_d8
vld1.32 {d23[0]}, [r2], r3
bne 4b
99: vpop {d8-d15}
pop {r4, r5, r6, r7}
bx lr
.endm
.macro hevc_put_qpel_uw_vX_neon_8 filter
push {r4-r10}
ldr r5, [sp, #28] // width
ldr r4, [sp, #32] // height
ldr r8, [sp, #36] // src2
ldr r9, [sp, #40] // src2stride
vpush {d8-d15}
sub r2, r2, r3, lsl #1
sub r2, r3
mov r12, r4
mov r6, r0
mov r7, r2
cmp r8, #0
bne .Lbi\@
0: loadin8
cmp r5, #4
beq 4f
8: subs r4, #1
\filter
vqrshrun.s16 d0, q7, #6
vst1.8 d0, [r0], r1
regshuffle_d8
vld1.8 {d23}, [r2], r3
bne 8b
subs r5, #8
beq 99f
mov r4, r12
add r6, #8
mov r0, r6
add r7, #8
mov r2, r7
b 0b
4: subs r4, #1
\filter
vqrshrun.s16 d0, q7, #6
vst1.32 d0[0], [r0], r1
regshuffle_d8
vld1.32 {d23[0]}, [r2], r3
bne 4b
b 99f
.Lbi\@: lsl r9, #1
mov r10, r8
0: loadin8
cmp r5, #4
beq 4f
8: subs r4, #1
\filter
vld1.16 {q0}, [r8], r9
vqadd.s16 q0, q7
vqrshrun.s16 d0, q0, #7
vst1.8 d0, [r0], r1
regshuffle_d8
vld1.8 {d23}, [r2], r3
bne 8b
subs r5, #8
beq 99f
mov r4, r12
add r6, #8
mov r0, r6
add r10, #16
mov r8, r10
add r7, #8
mov r2, r7
b 0b
4: subs r4, #1
\filter
vld1.16 d0, [r8], r9
vqadd.s16 d0, d14
vqrshrun.s16 d0, q0, #7
vst1.32 d0[0], [r0], r1
regshuffle_d8
vld1.32 {d23[0]}, [r2], r3
bne 4b
99: vpop {d8-d15}
pop {r4-r10}
bx lr
.endm
function ff_hevc_put_qpel_v1_neon_8, export=1
hevc_put_qpel_vX_neon_8 qpel_filter_1
endfunc
function ff_hevc_put_qpel_v2_neon_8, export=1
hevc_put_qpel_vX_neon_8 qpel_filter_2
endfunc
function ff_hevc_put_qpel_v3_neon_8, export=1
hevc_put_qpel_vX_neon_8 qpel_filter_3
endfunc
function ff_hevc_put_qpel_uw_v1_neon_8, export=1
hevc_put_qpel_uw_vX_neon_8 qpel_filter_1
endfunc
function ff_hevc_put_qpel_uw_v2_neon_8, export=1
hevc_put_qpel_uw_vX_neon_8 qpel_filter_2
endfunc
function ff_hevc_put_qpel_uw_v3_neon_8, export=1
hevc_put_qpel_uw_vX_neon_8 qpel_filter_3
endfunc
.macro hevc_put_qpel_hX_neon_8 filter
push {r4, r5, r6, r7}
ldr r4, [sp, #16] // height
ldr r5, [sp, #20] // width
vpush {d8-d15}
sub r2, #4
lsl r1, #1
mov r12, r4
mov r6, r0
mov r7, r2
cmp r5, #4
beq 4f
8: subs r4, #1
vextin8
\filter
vst1.16 {q7}, [r0], r1
bne 8b
subs r5, #8
beq 99f
mov r4, r12
add r6, #16
mov r0, r6
add r7, #8
mov r2, r7
cmp r5, #4
bne 8b
4: subs r4, #1
vextin8
\filter
vst1.16 d14, [r0], r1
bne 4b
99: vpop {d8-d15}
pop {r4, r5, r6, r7}
bx lr
.endm
.macro hevc_put_qpel_uw_hX_neon_8 filter
push {r4-r10}
ldr r5, [sp, #28] // width
ldr r4, [sp, #32] // height
ldr r8, [sp, #36] // src2
ldr r9, [sp, #40] // src2stride
vpush {d8-d15}
sub r2, #4
mov r12, r4
mov r6, r0
mov r7, r2
cmp r8, #0
bne .Lbi\@
cmp r5, #4
beq 4f
8: subs r4, #1
vextin8
\filter
vqrshrun.s16 d0, q7, #6
vst1.8 d0, [r0], r1
bne 8b
subs r5, #8
beq 99f
mov r4, r12
add r6, #8
mov r0, r6
add r7, #8
mov r2, r7
cmp r5, #4
bne 8b
4: subs r4, #1
vextin8
\filter
vqrshrun.s16 d0, q7, #6
vst1.32 d0[0], [r0], r1
bne 4b
b 99f
.Lbi\@:
lsl r9, #1
cmp r5, #4
beq 4f
mov r10, r8
8: subs r4, #1
vextin8
\filter
vld1.16 {q0}, [r8], r9
vqadd.s16 q0, q7
vqrshrun.s16 d0, q0, #7
vst1.8 d0, [r0], r1
bne 8b
subs r5, #8
beq 99f
mov r4, r12
add r6, #8
add r10, #16
mov r8, r10
mov r0, r6
add r7, #8
mov r2, r7
cmp r5, #4
bne 8b
4: subs r4, #1
vextin8
\filter
vld1.16 d0, [r8], r9
vqadd.s16 d0, d14
vqrshrun.s16 d0, q0, #7
vst1.32 d0[0], [r0], r1
bne 4b
99: vpop {d8-d15}
pop {r4-r10}
bx lr
.endm
function ff_hevc_put_qpel_h1_neon_8, export=1
hevc_put_qpel_hX_neon_8 qpel_filter_1
endfunc
function ff_hevc_put_qpel_h2_neon_8, export=1
hevc_put_qpel_hX_neon_8 qpel_filter_2
endfunc
function ff_hevc_put_qpel_h3_neon_8, export=1
hevc_put_qpel_hX_neon_8 qpel_filter_3
endfunc
function ff_hevc_put_qpel_uw_h1_neon_8, export=1
hevc_put_qpel_uw_hX_neon_8 qpel_filter_1
endfunc
function ff_hevc_put_qpel_uw_h2_neon_8, export=1
hevc_put_qpel_uw_hX_neon_8 qpel_filter_2
endfunc
function ff_hevc_put_qpel_uw_h3_neon_8, export=1
hevc_put_qpel_uw_hX_neon_8 qpel_filter_3
endfunc
.macro hevc_put_qpel_hXvY_neon_8 filterh filterv
push {r4, r5, r6, r7}
ldr r4, [sp, #16] // height
ldr r5, [sp, #20] // width
vpush {d8-d15}
sub r2, #4
sub r2, r2, r3, lsl #1
sub r2, r3 // extra_before 3
lsl r1, #1
mov r12, r4
mov r6, r0
mov r7, r2
0: vextin8
\filterh q0
vextin8
\filterh q1
vextin8
\filterh q2
vextin8
\filterh q3
vextin8
\filterh q4
vextin8
\filterh q5
vextin8
\filterh q6
vextin8
\filterh q7
cmp r5, #4
beq 4f
8: subs r4, #1
\filterv
vst1.16 {q8}, [r0], r1
regshuffle_q8
vextin8
\filterh q7
bne 8b
subs r5, #8
beq 99f
mov r4, r12
add r6, #16
mov r0, r6
add r7, #8
mov r2, r7
b 0b
4: subs r4, #1
\filterv
vst1.16 d16, [r0], r1
regshuffle_q8
vextin8
\filterh q7
bne 4b
99: vpop {d8-d15}
pop {r4, r5, r6, r7}
bx lr
.endm
.macro hevc_put_qpel_uw_hXvY_neon_8 filterh filterv
push {r4-r10}
ldr r5, [sp, #28] // width
ldr r4, [sp, #32] // height
ldr r8, [sp, #36] // src2
ldr r9, [sp, #40] // src2stride
vpush {d8-d15}
sub r2, #4
sub r2, r2, r3, lsl #1
sub r2, r3 // extra_before 3
mov r12, r4
mov r6, r0
mov r7, r2
cmp r8, #0
bne .Lbi\@
0: vextin8
\filterh q0
vextin8
\filterh q1
vextin8
\filterh q2
vextin8
\filterh q3
vextin8
\filterh q4
vextin8
\filterh q5
vextin8
\filterh q6
vextin8
\filterh q7
cmp r5, #4
beq 4f
8: subs r4, #1
\filterv
vqrshrun.s16 d0, q8, #6
vst1.8 d0, [r0], r1
regshuffle_q8
vextin8
\filterh q7
bne 8b
subs r5, #8
beq 99f
mov r4, r12
add r6, #8
mov r0, r6
add r7, #8
mov r2, r7
b 0b
4: subs r4, #1
\filterv
vqrshrun.s16 d0, q8, #6
vst1.32 d0[0], [r0], r1
regshuffle_q8
vextin8
\filterh q7
bne 4b
b 99f
.Lbi\@: lsl r9, #1
mov r10, r8
0: vextin8
\filterh q0
vextin8
\filterh q1
vextin8
\filterh q2
vextin8
\filterh q3
vextin8
\filterh q4
vextin8
\filterh q5
vextin8
\filterh q6
vextin8
\filterh q7
cmp r5, #4
beq 4f
8: subs r4, #1
\filterv
vld1.16 {q0}, [r8], r9
vqadd.s16 q0, q8
vqrshrun.s16 d0, q0, #7
vst1.8 d0, [r0], r1
regshuffle_q8
vextin8
\filterh q7
bne 8b
subs r5, #8
beq 99f
mov r4, r12
add r6, #8
mov r0, r6
add r10, #16
mov r8, r10
add r7, #8
mov r2, r7
b 0b
4: subs r4, #1
\filterv
vld1.16 d0, [r8], r9
vqadd.s16 d0, d16
vqrshrun.s16 d0, q0, #7
vst1.32 d0[0], [r0], r1
regshuffle_q8
vextin8
\filterh q7
bne 4b
99: vpop {d8-d15}
pop {r4-r10}
bx lr
.endm
function ff_hevc_put_qpel_h1v1_neon_8, export=1
hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_1_32b
endfunc
function ff_hevc_put_qpel_h2v1_neon_8, export=1
hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_1_32b
endfunc
function ff_hevc_put_qpel_h3v1_neon_8, export=1
hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_1_32b
endfunc
function ff_hevc_put_qpel_h1v2_neon_8, export=1
hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_2_32b
endfunc
function ff_hevc_put_qpel_h2v2_neon_8, export=1
hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_2_32b
endfunc
function ff_hevc_put_qpel_h3v2_neon_8, export=1
hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_2_32b
endfunc
function ff_hevc_put_qpel_h1v3_neon_8, export=1
hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_3_32b
endfunc
function ff_hevc_put_qpel_h2v3_neon_8, export=1
hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_3_32b
endfunc
function ff_hevc_put_qpel_h3v3_neon_8, export=1
hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_3_32b
endfunc
function ff_hevc_put_qpel_uw_h1v1_neon_8, export=1
hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_1_32b
endfunc
function ff_hevc_put_qpel_uw_h2v1_neon_8, export=1
hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_1_32b
endfunc
function ff_hevc_put_qpel_uw_h3v1_neon_8, export=1
hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_1_32b
endfunc
function ff_hevc_put_qpel_uw_h1v2_neon_8, export=1
hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_2_32b
endfunc
function ff_hevc_put_qpel_uw_h2v2_neon_8, export=1
hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_2_32b
endfunc
function ff_hevc_put_qpel_uw_h3v2_neon_8, export=1
hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_2_32b
endfunc
function ff_hevc_put_qpel_uw_h1v3_neon_8, export=1
hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_3_32b
endfunc
function ff_hevc_put_qpel_uw_h2v3_neon_8, export=1
hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_3_32b
endfunc
function ff_hevc_put_qpel_uw_h3v3_neon_8, export=1
hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_3_32b
endfunc
.macro init_put_pixels
pld [r1]
pld [r1, r2]
mov r12, MAX_PB_SIZE
lsl r12, #1
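@ r12 = 2 * MAX_PB_SIZE: byte stride of the 16-bit intermediate destination buffer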
.endm
function ff_hevc_put_pixels_w2_neon_8, export=1
init_put_pixels
vmov.u8 d5, #255
vshr.u64 d5, #32
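@ d5 = 0x00000000ffffffff: vbit below updates only the first two 16-bit output
@ samples of each row and leaves the rest of the destination untouched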
0: subs r3, #1
vld1.32 {d0[0]}, [r1], r2
pld [r1]
vld1.32 d6, [r0]
vshll.u8 q0, d0, #6
vbit d6, d0, d5
vst1.32 d6, [r0], r12
bne 0b
bx lr
endfunc
function ff_hevc_put_pixels_w4_neon_8, export=1
init_put_pixels
0: subs r3, #2
vld1.32 {d0[0]}, [r1], r2
vld1.32 {d0[1]}, [r1], r2
pld [r1]
pld [r1, r2]
vshll.u8 q0, d0, #6
vst1.64 {d0}, [r0], r12
vst1.64 {d1}, [r0], r12
bne 0b
bx lr
endfunc
function ff_hevc_put_pixels_w6_neon_8, export=1
init_put_pixels
vmov.u8 q10, #255
vshr.u64 d21, #32
0: subs r3, #1
vld1.16 {d0}, [r1], r2
pld [r1]
vshll.u8 q0, d0, #6
vld1.8 {q12}, [r0]
vbit q12, q0, q10
vst1.8 {q12}, [r0], r12
bne 0b
bx lr
endfunc
function ff_hevc_put_pixels_w8_neon_8, export=1
init_put_pixels
0: subs r3, #2
vld1.8 {d0}, [r1], r2
vld1.8 {d2}, [r1], r2
pld [r1]
pld [r1, r2]
vshll.u8 q0, d0, #6
vshll.u8 q1, d2, #6
vst1.16 {q0}, [r0], r12
vst1.16 {q1}, [r0], r12
bne 0b
bx lr
endfunc
function ff_hevc_put_pixels_w12_neon_8, export=1
init_put_pixels
0: subs r3, #2
vld1.64 {d0}, [r1]
add r1, #8
vld1.32 {d1[0]}, [r1], r2
sub r1, #8
vld1.64 {d2}, [r1]
add r1, #8
vld1.32 {d1[1]}, [r1], r2
sub r1, #8
pld [r1]
pld [r1, r2]
vshll.u8 q8, d0, #6
vshll.u8 q9, d1, #6
vshll.u8 q10, d2, #6
vmov d22, d19
vst1.64 {d16, d17, d18}, [r0], r12
vst1.64 {d20, d21, d22}, [r0], r12
bne 0b
bx lr
endfunc
function ff_hevc_put_pixels_w16_neon_8, export=1
init_put_pixels
0: subs r3, #2
vld1.8 {q0}, [r1], r2
vld1.8 {q1}, [r1], r2
pld [r1]
pld [r1, r2]
vshll.u8 q8, d0, #6
vshll.u8 q9, d1, #6
vshll.u8 q10, d2, #6
vshll.u8 q11, d3, #6
vst1.8 {q8, q9}, [r0], r12
vst1.8 {q10, q11}, [r0], r12
bne 0b
bx lr
endfunc
function ff_hevc_put_pixels_w24_neon_8, export=1
init_put_pixels
0: subs r3, #1
vld1.8 {d0, d1, d2}, [r1], r2
pld [r1]
vshll.u8 q10, d0, #6
vshll.u8 q11, d1, #6
vshll.u8 q12, d2, #6
vstm r0, {q10, q11, q12}
add r0, r12
bne 0b
bx lr
endfunc
function ff_hevc_put_pixels_w32_neon_8, export=1
init_put_pixels
0: subs r3, #1
vld1.8 {q0, q1}, [r1], r2
pld [r1]
vshll.u8 q8, d0, #6
vshll.u8 q9, d1, #6
vshll.u8 q10, d2, #6
vshll.u8 q11, d3, #6
vstm r0, {q8, q9, q10, q11}
add r0, r12
bne 0b
bx lr
endfunc
function ff_hevc_put_pixels_w48_neon_8, export=1
init_put_pixels
0: subs r3, #1
vld1.8 {q0, q1}, [r1]
add r1, #32
vld1.8 {q2}, [r1], r2
sub r1, #32
pld [r1]
vshll.u8 q8, d0, #6
vshll.u8 q9, d1, #6
vshll.u8 q10, d2, #6
vshll.u8 q11, d3, #6
vshll.u8 q12, d4, #6
vshll.u8 q13, d5, #6
vstm r0, {q8, q9, q10, q11, q12, q13}
add r0, r12
bne 0b
bx lr
endfunc
function ff_hevc_put_pixels_w64_neon_8, export=1
init_put_pixels
0: subs r3, #1
vld1.8 {q0, q1}, [r1]
add r1, #32
vld1.8 {q2, q3}, [r1], r2
sub r1, #32
pld [r1]
vshll.u8 q8, d0, #6
vshll.u8 q9, d1, #6
vshll.u8 q10, d2, #6
vshll.u8 q11, d3, #6
vshll.u8 q12, d4, #6
vshll.u8 q13, d5, #6
vshll.u8 q14, d6, #6
vshll.u8 q15, d7, #6
vstm r0, {q8, q9, q10, q11, q12, q13, q14, q15}
add r0, r12
bne 0b
bx lr
endfunc
function ff_hevc_put_qpel_uw_pixels_neon_8, export=1
push {r4-r9}
ldr r5, [sp, #24] // width
ldr r4, [sp, #28] // height
ldr r8, [sp, #32] // src2
ldr r9, [sp, #36] // src2stride
vpush {d8-d15}
cmp r8, #0
bne 2f
1: subs r4, #1
vld1.8 {d0}, [r2], r3
vst1.8 d0, [r0], r1
bne 1b
vpop {d8-d15}
pop {r4-r9}
bx lr
2: subs r4, #1
vld1.8 {d0}, [r2], r3
vld1.16 {q1}, [r8], r9
vshll.u8 q0, d0, #6
vqadd.s16 q0, q1
vqrshrun.s16 d0, q0, #7
vst1.8 d0, [r0], r1
bne 2b
vpop {d8-d15}
pop {r4-r9}
bx lr
endfunc
.macro put_qpel_uw_pixels width, regs, regs2, regs3, regs4
function ff_hevc_put_qpel_uw_pixels_w\width\()_neon_8, export=1
ldr r12, [sp] // height
1: subs r12, #4
vld1.32 {\regs} , [r2], r3
vld1.32 {\regs2} , [r2], r3
vld1.32 {\regs3} , [r2], r3
vld1.32 {\regs4} , [r2], r3
vst1.32 {\regs} , [r0], r1
vst1.32 {\regs2} , [r0], r1
vst1.32 {\regs3} , [r0], r1
vst1.32 {\regs4} , [r0], r1
bne 1b
bx lr
endfunc
.endm
.macro put_qpel_uw_pixels_m width, regs, regs2, regs3, regs4
function ff_hevc_put_qpel_uw_pixels_w\width\()_neon_8, export=1
push {r4-r5}
ldr r12, [sp, #8] // height
1: subs r12, #2
mov r4, r2
vld1.32 {\regs} , [r2]!
vld1.32 {\regs2} , [r2]
add r2, r4, r3
mov r4, r2
vld1.32 {\regs3} , [r2]!
vld1.32 {\regs4} , [r2]
add r2, r4, r3
mov r5, r0
vst1.32 {\regs} , [r0]!
vst1.32 {\regs2} , [r0]
add r0, r5, r1
mov r5, r0
vst1.32 {\regs3} , [r0]!
vst1.32 {\regs4} , [r0]
add r0, r5, r1
bne 1b
pop {r4-r5}
bx lr
endfunc
.endm
put_qpel_uw_pixels 4, d0[0], d0[1], d1[0], d1[1]
put_qpel_uw_pixels 8, d0, d1, d2, d3
put_qpel_uw_pixels_m 12, d0, d1[0], d2, d3[0]
put_qpel_uw_pixels 16, q0, q1, q2, q3
put_qpel_uw_pixels 24, d0-d2, d3-d5, d16-d18, d19-d21
put_qpel_uw_pixels 32, q0-q1, q2-q3, q8-q9, q10-q11
put_qpel_uw_pixels_m 48, q0-q1, q2, q8-q9, q10
put_qpel_uw_pixels_m 64, q0-q1, q2-q3, q8-q9, q10-q11

@@ -0,0 +1,603 @@
@
@ ARMv4-optimized halfpel functions
@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
@
@ This file is part of FFmpeg.
@
@ FFmpeg is free software; you can redistribute it and/or
@ modify it under the terms of the GNU Lesser General Public
@ License as published by the Free Software Foundation; either
@ version 2.1 of the License, or (at your option) any later version.
@
@ FFmpeg is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
@ Lesser General Public License for more details.
@
@ You should have received a copy of the GNU Lesser General Public
@ License along with FFmpeg; if not, write to the Free Software
@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@
#include "config.h"
#include "libavutil/arm/asm.S"
#if !HAVE_ARMV5TE_EXTERNAL
#define pld @
#endif
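@ The ALIGN_* macros below rebuild aligned words from a source misaligned by
@ \shift bytes: each result word is (Rn[i] >> 8*\shift) | (Rn[i+1] << (32 - 8*\shift))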
.macro ALIGN_QWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4
mov \Rd0, \Rn0, lsr #(\shift * 8)
mov \Rd1, \Rn1, lsr #(\shift * 8)
mov \Rd2, \Rn2, lsr #(\shift * 8)
mov \Rd3, \Rn3, lsr #(\shift * 8)
orr \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8)
orr \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8)
orr \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8)
orr \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8)
.endm
.macro ALIGN_DWORD shift, R0, R1, R2
mov \R0, \R0, lsr #(\shift * 8)
orr \R0, \R0, \R1, lsl #(32 - \shift * 8)
mov \R1, \R1, lsr #(\shift * 8)
orr \R1, \R1, \R2, lsl #(32 - \shift * 8)
.endm
.macro ALIGN_DWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2
mov \Rdst0, \Rsrc0, lsr #(\shift * 8)
mov \Rdst1, \Rsrc1, lsr #(\shift * 8)
orr \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8))
orr \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8))
.endm
.macro RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
@ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
@ Rmask = 0xFEFEFEFE
@ Rn = destroy
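@ per byte this is (a | b) - (((a ^ b) & 0xfe) >> 1) == (a + b + 1) >> 1,
@ i.e. a rounding average of the two packed operands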
eor \Rd0, \Rn0, \Rm0
eor \Rd1, \Rn1, \Rm1
orr \Rn0, \Rn0, \Rm0
orr \Rn1, \Rn1, \Rm1
and \Rd0, \Rd0, \Rmask
and \Rd1, \Rd1, \Rmask
sub \Rd0, \Rn0, \Rd0, lsr #1
sub \Rd1, \Rn1, \Rd1, lsr #1
.endm
.macro NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
@ Rd = (Rn & Rm) + (((Rn ^ Rm) & ~0x01010101) >> 1)
@ Rmask = 0xFEFEFEFE
@ Rn = destroy
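@ per byte this is (a & b) + (((a ^ b) & 0xfe) >> 1) == (a + b) >> 1,
@ i.e. a truncating (no-rounding) average of the two packed operands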
eor \Rd0, \Rn0, \Rm0
eor \Rd1, \Rn1, \Rm1
and \Rn0, \Rn0, \Rm0
and \Rn1, \Rn1, \Rm1
and \Rd0, \Rd0, \Rmask
and \Rd1, \Rd1, \Rmask
add \Rd0, \Rn0, \Rd0, lsr #1
add \Rd1, \Rn1, \Rd1, lsr #1
.endm
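@ JMP_ALIGN: word-aligns \reg and branches on its low two bits (the source
@ misalignment 0-3) to the local labels 1:, 2:, 3: or 4: that follow each use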
.macro JMP_ALIGN tmp, reg
ands \tmp, \reg, #3
bic \reg, \reg, #3
beq 1f
subs \tmp, \tmp, #1
beq 2f
subs \tmp, \tmp, #1
beq 3f
b 4f
.endm
@ ----------------------------------------------------------------
function ff_put_pixels16_arm, export=1, align=5
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
@ block = word aligned, pixels = unaligned
pld [r1]
push {r4-r11, lr}
JMP_ALIGN r5, r1
1:
ldm r1, {r4-r7}
add r1, r1, r2
stm r0, {r4-r7}
pld [r1]
subs r3, r3, #1
add r0, r0, r2
bne 1b
pop {r4-r11, pc}
.align 5
2:
ldm r1, {r4-r8}
add r1, r1, r2
ALIGN_QWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8
pld [r1]
subs r3, r3, #1
stm r0, {r9-r12}
add r0, r0, r2
bne 2b
pop {r4-r11, pc}
.align 5
3:
ldm r1, {r4-r8}
add r1, r1, r2
ALIGN_QWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8
pld [r1]
subs r3, r3, #1
stm r0, {r9-r12}
add r0, r0, r2
bne 3b
pop {r4-r11, pc}
.align 5
4:
ldm r1, {r4-r8}
add r1, r1, r2
ALIGN_QWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8
pld [r1]
subs r3, r3, #1
stm r0, {r9-r12}
add r0, r0, r2
bne 4b
pop {r4-r11,pc}
endfunc
@ ----------------------------------------------------------------
function ff_put_pixels8_arm, export=1, align=5
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
@ block = word aligned, pixels = unaligned
pld [r1]
push {r4-r5,lr}
JMP_ALIGN r5, r1
1:
ldm r1, {r4-r5}
add r1, r1, r2
subs r3, r3, #1
pld [r1]
stm r0, {r4-r5}
add r0, r0, r2
bne 1b
pop {r4-r5,pc}
.align 5
2:
ldm r1, {r4-r5, r12}
add r1, r1, r2
ALIGN_DWORD 1, r4, r5, r12
pld [r1]
subs r3, r3, #1
stm r0, {r4-r5}
add r0, r0, r2
bne 2b
pop {r4-r5,pc}
.align 5
3:
ldm r1, {r4-r5, r12}
add r1, r1, r2
ALIGN_DWORD 2, r4, r5, r12
pld [r1]
subs r3, r3, #1
stm r0, {r4-r5}
add r0, r0, r2
bne 3b
pop {r4-r5,pc}
.align 5
4:
ldm r1, {r4-r5, r12}
add r1, r1, r2
ALIGN_DWORD 3, r4, r5, r12
pld [r1]
subs r3, r3, #1
stm r0, {r4-r5}
add r0, r0, r2
bne 4b
pop {r4-r5,pc}
endfunc
@ ----------------------------------------------------------------
function ff_put_pixels8_x2_arm, export=1, align=5
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
@ block = word aligned, pixels = unaligned
pld [r1]
push {r4-r10,lr}
ldr r12, =0xfefefefe
JMP_ALIGN r5, r1
1:
ldm r1, {r4-r5, r10}
add r1, r1, r2
ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
pld [r1]
RND_AVG32 r8, r9, r4, r5, r6, r7, r12
subs r3, r3, #1
stm r0, {r8-r9}
add r0, r0, r2
bne 1b
pop {r4-r10,pc}
.align 5
2:
ldm r1, {r4-r5, r10}
add r1, r1, r2
ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
ALIGN_DWORD_D 2, r8, r9, r4, r5, r10
pld [r1]
RND_AVG32 r4, r5, r6, r7, r8, r9, r12
subs r3, r3, #1
stm r0, {r4-r5}
add r0, r0, r2
bne 2b
pop {r4-r10,pc}
.align 5
3:
ldm r1, {r4-r5, r10}
add r1, r1, r2
ALIGN_DWORD_D 2, r6, r7, r4, r5, r10
ALIGN_DWORD_D 3, r8, r9, r4, r5, r10
pld [r1]
RND_AVG32 r4, r5, r6, r7, r8, r9, r12
subs r3, r3, #1
stm r0, {r4-r5}
add r0, r0, r2
bne 3b
pop {r4-r10,pc}
.align 5
4:
ldm r1, {r4-r5, r10}
add r1, r1, r2
ALIGN_DWORD_D 3, r6, r7, r4, r5, r10
pld [r1]
RND_AVG32 r8, r9, r6, r7, r5, r10, r12
subs r3, r3, #1
stm r0, {r8-r9}
add r0, r0, r2
bne 4b
pop {r4-r10,pc}
endfunc
function ff_put_no_rnd_pixels8_x2_arm, export=1, align=5
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
@ block = word aligned, pixels = unaligned
pld [r1]
push {r4-r10,lr}
ldr r12, =0xfefefefe
JMP_ALIGN r5, r1
1:
ldm r1, {r4-r5, r10}
add r1, r1, r2
ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
pld [r1]
NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
subs r3, r3, #1
stm r0, {r8-r9}
add r0, r0, r2
bne 1b
pop {r4-r10,pc}
.align 5
2:
ldm r1, {r4-r5, r10}
add r1, r1, r2
ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
ALIGN_DWORD_D 2, r8, r9, r4, r5, r10
pld [r1]
NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
subs r3, r3, #1
stm r0, {r4-r5}
add r0, r0, r2
bne 2b
pop {r4-r10,pc}
.align 5
3:
ldm r1, {r4-r5, r10}
add r1, r1, r2
ALIGN_DWORD_D 2, r6, r7, r4, r5, r10
ALIGN_DWORD_D 3, r8, r9, r4, r5, r10
pld [r1]
NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
subs r3, r3, #1
stm r0, {r4-r5}
add r0, r0, r2
bne 3b
pop {r4-r10,pc}
.align 5
4:
ldm r1, {r4-r5, r10}
add r1, r1, r2
ALIGN_DWORD_D 3, r6, r7, r4, r5, r10
pld [r1]
NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12
subs r3, r3, #1
stm r0, {r8-r9}
add r0, r0, r2
bne 4b
pop {r4-r10,pc}
endfunc
@ ----------------------------------------------------------------
function ff_put_pixels8_y2_arm, export=1, align=5
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
@ block = word aligned, pixels = unaligned
pld [r1]
push {r4-r11,lr}
mov r3, r3, lsr #1
ldr r12, =0xfefefefe
JMP_ALIGN r5, r1
1:
ldm r1, {r4-r5}
add r1, r1, r2
6: ldm r1, {r6-r7}
add r1, r1, r2
pld [r1]
RND_AVG32 r8, r9, r4, r5, r6, r7, r12
ldm r1, {r4-r5}
add r1, r1, r2
stm r0, {r8-r9}
add r0, r0, r2
pld [r1]
RND_AVG32 r8, r9, r6, r7, r4, r5, r12
subs r3, r3, #1
stm r0, {r8-r9}
add r0, r0, r2
bne 6b
pop {r4-r11,pc}
.align 5
2:
ldm r1, {r4-r6}
add r1, r1, r2
pld [r1]
ALIGN_DWORD 1, r4, r5, r6
6: ldm r1, {r7-r9}
add r1, r1, r2
pld [r1]
ALIGN_DWORD 1, r7, r8, r9
RND_AVG32 r10, r11, r4, r5, r7, r8, r12
stm r0, {r10-r11}
add r0, r0, r2
ldm r1, {r4-r6}
add r1, r1, r2
pld [r1]
ALIGN_DWORD 1, r4, r5, r6
subs r3, r3, #1
RND_AVG32 r10, r11, r7, r8, r4, r5, r12
stm r0, {r10-r11}
add r0, r0, r2
bne 6b
pop {r4-r11,pc}
.align 5
3:
ldm r1, {r4-r6}
add r1, r1, r2
pld [r1]
ALIGN_DWORD 2, r4, r5, r6
6: ldm r1, {r7-r9}
add r1, r1, r2
pld [r1]
ALIGN_DWORD 2, r7, r8, r9
RND_AVG32 r10, r11, r4, r5, r7, r8, r12
stm r0, {r10-r11}
add r0, r0, r2
ldm r1, {r4-r6}
add r1, r1, r2
pld [r1]
ALIGN_DWORD 2, r4, r5, r6
subs r3, r3, #1
RND_AVG32 r10, r11, r7, r8, r4, r5, r12
stm r0, {r10-r11}
add r0, r0, r2
bne 6b
pop {r4-r11,pc}
.align 5
4:
ldm r1, {r4-r6}
add r1, r1, r2
pld [r1]
ALIGN_DWORD 3, r4, r5, r6
6: ldm r1, {r7-r9}
add r1, r1, r2
pld [r1]
ALIGN_DWORD 3, r7, r8, r9
RND_AVG32 r10, r11, r4, r5, r7, r8, r12
stm r0, {r10-r11}
add r0, r0, r2
ldm r1, {r4-r6}
add r1, r1, r2
pld [r1]
ALIGN_DWORD 3, r4, r5, r6
subs r3, r3, #1
RND_AVG32 r10, r11, r7, r8, r4, r5, r12
stm r0, {r10-r11}
add r0, r0, r2
bne 6b
pop {r4-r11,pc}
endfunc
function ff_put_no_rnd_pixels8_y2_arm, export=1, align=5
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
@ block = word aligned, pixels = unaligned
pld [r1]
push {r4-r11,lr}
mov r3, r3, lsr #1
ldr r12, =0xfefefefe
JMP_ALIGN r5, r1
1:
ldm r1, {r4-r5}
add r1, r1, r2
6: ldm r1, {r6-r7}
add r1, r1, r2
pld [r1]
NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
ldm r1, {r4-r5}
add r1, r1, r2
stm r0, {r8-r9}
add r0, r0, r2
pld [r1]
NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12
subs r3, r3, #1
stm r0, {r8-r9}
add r0, r0, r2
bne 6b
pop {r4-r11,pc}
.align 5
2:
ldm r1, {r4-r6}
add r1, r1, r2
pld [r1]
ALIGN_DWORD 1, r4, r5, r6
6: ldm r1, {r7-r9}
add r1, r1, r2
pld [r1]
ALIGN_DWORD 1, r7, r8, r9
NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
stm r0, {r10-r11}
add r0, r0, r2
ldm r1, {r4-r6}
add r1, r1, r2
pld [r1]
ALIGN_DWORD 1, r4, r5, r6
subs r3, r3, #1
NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
stm r0, {r10-r11}
add r0, r0, r2
bne 6b
pop {r4-r11,pc}
.align 5
3:
ldm r1, {r4-r6}
add r1, r1, r2
pld [r1]
ALIGN_DWORD 2, r4, r5, r6
6: ldm r1, {r7-r9}
add r1, r1, r2
pld [r1]
ALIGN_DWORD 2, r7, r8, r9
NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
stm r0, {r10-r11}
add r0, r0, r2
ldm r1, {r4-r6}
add r1, r1, r2
pld [r1]
ALIGN_DWORD 2, r4, r5, r6
subs r3, r3, #1
NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
stm r0, {r10-r11}
add r0, r0, r2
bne 6b
pop {r4-r11,pc}
.align 5
4:
ldm r1, {r4-r6}
add r1, r1, r2
pld [r1]
ALIGN_DWORD 3, r4, r5, r6
6: ldm r1, {r7-r9}
add r1, r1, r2
pld [r1]
ALIGN_DWORD 3, r7, r8, r9
NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
stm r0, {r10-r11}
add r0, r0, r2
ldm r1, {r4-r6}
add r1, r1, r2
pld [r1]
ALIGN_DWORD 3, r4, r5, r6
subs r3, r3, #1
NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
stm r0, {r10-r11}
add r0, r0, r2
bne 6b
pop {r4-r11,pc}
endfunc
.ltorg
@ ----------------------------------------------------------------
.macro RND_XY2_IT align, rnd
@ l1= (a & 0x03030303) + (b & 0x03030303) (+ 0x02020202 when rounding)
@ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2)
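@ i.e. each output byte is (A + B + C + D + 2) >> 2 (rounding) or
@ (A + B + C + D + 1) >> 2 (no rounding), with the low two bits (l1) and the
@ pre-shifted high bits (h1) accumulated separately to avoid overflow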
.if \align == 0
ldm r1, {r6-r8}
.elseif \align == 3
ldm r1, {r5-r7}
.else
ldm r1, {r8-r10}
.endif
add r1, r1, r2
pld [r1]
.if \align == 0
ALIGN_DWORD_D 1, r4, r5, r6, r7, r8
.elseif \align == 1
ALIGN_DWORD_D 1, r4, r5, r8, r9, r10
ALIGN_DWORD_D 2, r6, r7, r8, r9, r10
.elseif \align == 2
ALIGN_DWORD_D 2, r4, r5, r8, r9, r10
ALIGN_DWORD_D 3, r6, r7, r8, r9, r10
.elseif \align == 3
ALIGN_DWORD_D 3, r4, r5, r5, r6, r7
.endif
ldr r14, =0x03030303
tst r3, #1
and r8, r4, r14
and r9, r5, r14
and r10, r6, r14
and r11, r7, r14
it eq
andeq r14, r14, r14, \rnd #1
add r8, r8, r10
add r9, r9, r11
ldr r12, =0xfcfcfcfc >> 2
itt eq
addeq r8, r8, r14
addeq r9, r9, r14
and r4, r12, r4, lsr #2
and r5, r12, r5, lsr #2
and r6, r12, r6, lsr #2
and r7, r12, r7, lsr #2
add r10, r4, r6
add r11, r5, r7
subs r3, r3, #1
.endm
.macro RND_XY2_EXPAND align, rnd
RND_XY2_IT \align, \rnd
6: push {r8-r11}
RND_XY2_IT \align, \rnd
pop {r4-r7}
add r4, r4, r8
add r5, r5, r9
ldr r14, =0x0f0f0f0f
add r6, r6, r10
add r7, r7, r11
and r4, r14, r4, lsr #2
and r5, r14, r5, lsr #2
add r4, r4, r6
add r5, r5, r7
stm r0, {r4-r5}
add r0, r0, r2
bge 6b
pop {r4-r11,pc}
.endm
function ff_put_pixels8_xy2_arm, export=1, align=5
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
@ block = word aligned, pixels = unaligned
pld [r1]
push {r4-r11,lr} @ R14 is also called LR
JMP_ALIGN r5, r1
1: RND_XY2_EXPAND 0, lsl
.align 5
2: RND_XY2_EXPAND 1, lsl
.align 5
3: RND_XY2_EXPAND 2, lsl
.align 5
4: RND_XY2_EXPAND 3, lsl
endfunc
function ff_put_no_rnd_pixels8_xy2_arm, export=1, align=5
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
@ block = word aligned, pixels = unaligned
pld [r1]
push {r4-r11,lr}
JMP_ALIGN r5, r1
1: RND_XY2_EXPAND 0, lsr
.align 5
2: RND_XY2_EXPAND 1, lsr
.align 5
3: RND_XY2_EXPAND 2, lsr
.align 5
4: RND_XY2_EXPAND 3, lsr
endfunc

@@ -0,0 +1,29 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_ARM_HPELDSP_ARM_H
#define AVCODEC_ARM_HPELDSP_ARM_H
#include "libavcodec/hpeldsp.h"
void ff_hpeldsp_init_armv6(HpelDSPContext *c, int flags);
void ff_hpeldsp_init_neon(HpelDSPContext *c, int flags);
#endif /* AVCODEC_ARM_HPELDSP_ARM_H */

@@ -0,0 +1,261 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
.macro call_2x_pixels type, subp
function ff_\type\()_pixels16\subp\()_armv6, export=1
push {r0-r3, lr}
bl X(ff_\type\()_pixels8\subp\()_armv6)
pop {r0-r3, lr}
add r0, r0, #8
add r1, r1, #8
b X(ff_\type\()_pixels8\subp\()_armv6)
endfunc
.endm
call_2x_pixels avg
call_2x_pixels put, _x2
call_2x_pixels put, _y2
call_2x_pixels put, _x2_no_rnd
call_2x_pixels put, _y2_no_rnd
function ff_put_pixels16_armv6, export=1
push {r4-r11}
1:
ldr r5, [r1, #4]
ldr r6, [r1, #8]
ldr r7, [r1, #12]
ldr_post r4, r1, r2
strd r6, r7, [r0, #8]
ldr r9, [r1, #4]
strd_post r4, r5, r0, r2
ldr r10, [r1, #8]
ldr r11, [r1, #12]
ldr_post r8, r1, r2
strd r10, r11, [r0, #8]
subs r3, r3, #2
strd_post r8, r9, r0, r2
bne 1b
pop {r4-r11}
bx lr
endfunc
function ff_put_pixels8_armv6, export=1
push {r4-r7}
1:
ldr r5, [r1, #4]
ldr_post r4, r1, r2
ldr r7, [r1, #4]
strd_post r4, r5, r0, r2
ldr_post r6, r1, r2
subs r3, r3, #2
strd_post r6, r7, r0, r2
bne 1b
pop {r4-r7}
bx lr
endfunc
function ff_put_pixels8_x2_armv6, export=1
push {r4-r11, lr}
mov r12, #1
orr r12, r12, r12, lsl #8
orr r12, r12, r12, lsl #16
1:
ldr r4, [r1]
subs r3, r3, #2
ldr r5, [r1, #4]
ldr r7, [r1, #5]
lsr r6, r4, #8
ldr_pre r8, r1, r2
orr r6, r6, r5, lsl #24
ldr r9, [r1, #4]
ldr r11, [r1, #5]
lsr r10, r8, #8
add r1, r1, r2
orr r10, r10, r9, lsl #24
eor r14, r4, r6
uhadd8 r4, r4, r6
eor r6, r5, r7
uhadd8 r5, r5, r7
and r14, r14, r12
and r6, r6, r12
uadd8 r4, r4, r14
eor r14, r8, r10
uadd8 r5, r5, r6
eor r6, r9, r11
uhadd8 r8, r8, r10
and r14, r14, r12
uhadd8 r9, r9, r11
and r6, r6, r12
uadd8 r8, r8, r14
strd_post r4, r5, r0, r2
uadd8 r9, r9, r6
strd_post r8, r9, r0, r2
bne 1b
pop {r4-r11, pc}
endfunc
function ff_put_pixels8_y2_armv6, export=1
push {r4-r11}
mov r12, #1
orr r12, r12, r12, lsl #8
orr r12, r12, r12, lsl #16
ldr r4, [r1]
ldr r5, [r1, #4]
ldr_pre r6, r1, r2
ldr r7, [r1, #4]
1:
subs r3, r3, #2
uhadd8 r8, r4, r6
eor r10, r4, r6
uhadd8 r9, r5, r7
eor r11, r5, r7
and r10, r10, r12
ldr_pre r4, r1, r2
uadd8 r8, r8, r10
and r11, r11, r12
uadd8 r9, r9, r11
ldr r5, [r1, #4]
uhadd8 r10, r4, r6
eor r6, r4, r6
uhadd8 r11, r5, r7
and r6, r6, r12
eor r7, r5, r7
uadd8 r10, r10, r6
and r7, r7, r12
ldrc_pre ne, r6, r1, r2
uadd8 r11, r11, r7
strd_post r8, r9, r0, r2
it ne
ldrne r7, [r1, #4]
strd_post r10, r11, r0, r2
bne 1b
pop {r4-r11}
bx lr
endfunc
function ff_put_pixels8_x2_no_rnd_armv6, export=1
push {r4-r9, lr}
1:
subs r3, r3, #2
ldr r4, [r1]
ldr r5, [r1, #4]
ldr r7, [r1, #5]
ldr_pre r8, r1, r2
ldr r9, [r1, #4]
ldr r14, [r1, #5]
add r1, r1, r2
lsr r6, r4, #8
orr r6, r6, r5, lsl #24
lsr r12, r8, #8
orr r12, r12, r9, lsl #24
uhadd8 r4, r4, r6
uhadd8 r5, r5, r7
uhadd8 r8, r8, r12
uhadd8 r9, r9, r14
stm r0, {r4,r5}
add r0, r0, r2
stm r0, {r8,r9}
add r0, r0, r2
bne 1b
pop {r4-r9, pc}
endfunc
function ff_put_pixels8_y2_no_rnd_armv6, export=1
push {r4-r9, lr}
ldr r4, [r1]
ldr r5, [r1, #4]
ldr_pre r6, r1, r2
ldr r7, [r1, #4]
1:
subs r3, r3, #2
uhadd8 r8, r4, r6
ldr_pre r4, r1, r2
uhadd8 r9, r5, r7
ldr r5, [r1, #4]
uhadd8 r12, r4, r6
ldrc_pre ne, r6, r1, r2
uhadd8 r14, r5, r7
it ne
ldrne r7, [r1, #4]
stm r0, {r8,r9}
add r0, r0, r2
stm r0, {r12,r14}
add r0, r0, r2
bne 1b
pop {r4-r9, pc}
endfunc
function ff_avg_pixels8_armv6, export=1
pld [r1, r2]
push {r4-r10, lr}
mov lr, #1
orr lr, lr, lr, lsl #8
orr lr, lr, lr, lsl #16
ldrd r4, r5, [r0]
ldr r10, [r1, #4]
ldr_post r9, r1, r2
subs r3, r3, #2
1:
pld [r1, r2]
eor r8, r4, r9
uhadd8 r4, r4, r9
eor r12, r5, r10
ldrd_reg r6, r7, r0, r2
uhadd8 r5, r5, r10
and r8, r8, lr
ldr r10, [r1, #4]
and r12, r12, lr
uadd8 r4, r4, r8
ldr_post r9, r1, r2
eor r8, r6, r9
uadd8 r5, r5, r12
pld [r1, r2, lsl #1]
eor r12, r7, r10
uhadd8 r6, r6, r9
strd_post r4, r5, r0, r2
uhadd8 r7, r7, r10
beq 2f
and r8, r8, lr
ldrd_reg r4, r5, r0, r2
uadd8 r6, r6, r8
ldr r10, [r1, #4]
and r12, r12, lr
subs r3, r3, #2
uadd8 r7, r7, r12
ldr_post r9, r1, r2
strd_post r6, r7, r0, r2
b 1b
2:
and r8, r8, lr
and r12, r12, lr
uadd8 r6, r6, r8
uadd8 r7, r7, r12
strd_post r6, r7, r0, r2
pop {r4-r10, pc}
endfunc

@@ -0,0 +1,71 @@
/*
* ARM-optimized halfpel functions
* Copyright (c) 2001 Lionel Ulmer
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/cpu.h"
#include "libavutil/attributes.h"
#include "libavcodec/pixels.h"
#include "hpeldsp_arm.h"
void ff_put_pixels8_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
void ff_put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
void ff_put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
void ff_put_pixels16_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
CALL_2X_PIXELS(ff_put_pixels16_x2_arm, ff_put_pixels8_x2_arm, 8)
CALL_2X_PIXELS(ff_put_pixels16_y2_arm, ff_put_pixels8_y2_arm, 8)
CALL_2X_PIXELS(ff_put_pixels16_xy2_arm, ff_put_pixels8_xy2_arm, 8)
CALL_2X_PIXELS(ff_put_no_rnd_pixels16_x2_arm, ff_put_no_rnd_pixels8_x2_arm, 8)
CALL_2X_PIXELS(ff_put_no_rnd_pixels16_y2_arm, ff_put_no_rnd_pixels8_y2_arm, 8)
CALL_2X_PIXELS(ff_put_no_rnd_pixels16_xy2_arm, ff_put_no_rnd_pixels8_xy2_arm,8)
av_cold void ff_hpeldsp_init_arm(HpelDSPContext *c, int flags)
{
int cpu_flags = av_get_cpu_flags();
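/* [i][j] indexing: i selects the block size (0: 16x16, 1: 8x8),
 * j the halfpel position (0: full-pel, 1: x half, 2: y half, 3: xy half) */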
c->put_pixels_tab[0][0] = ff_put_pixels16_arm;
c->put_pixels_tab[0][1] = ff_put_pixels16_x2_arm;
c->put_pixels_tab[0][2] = ff_put_pixels16_y2_arm;
c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_arm;
c->put_pixels_tab[1][0] = ff_put_pixels8_arm;
c->put_pixels_tab[1][1] = ff_put_pixels8_x2_arm;
c->put_pixels_tab[1][2] = ff_put_pixels8_y2_arm;
c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_arm;
c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_arm;
c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_arm;
c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_arm;
c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_arm;
c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_arm;
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_arm;
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_arm;
c->put_no_rnd_pixels_tab[1][3] = ff_put_no_rnd_pixels8_xy2_arm;
if (have_armv6(cpu_flags))
ff_hpeldsp_init_armv6(c, flags);
if (have_neon(cpu_flags))
ff_hpeldsp_init_neon(c, flags);
}

@@ -0,0 +1,67 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stddef.h>
#include <stdint.h>
#include "libavutil/attributes.h"
#include "hpeldsp_arm.h"
void ff_put_pixels16_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels16_x2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels16_y2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels16_x2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels16_y2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_avg_pixels16_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels8_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels8_x2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels8_y2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels8_x2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels8_y2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_avg_pixels8_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
av_cold void ff_hpeldsp_init_armv6(HpelDSPContext *c, int flags)
{
c->put_pixels_tab[0][0] = ff_put_pixels16_armv6;
c->put_pixels_tab[0][1] = ff_put_pixels16_x2_armv6;
c->put_pixels_tab[0][2] = ff_put_pixels16_y2_armv6;
/* c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_armv6; */
c->put_pixels_tab[1][0] = ff_put_pixels8_armv6;
c->put_pixels_tab[1][1] = ff_put_pixels8_x2_armv6;
c->put_pixels_tab[1][2] = ff_put_pixels8_y2_armv6;
/* c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_armv6; */
c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_armv6;
c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_armv6;
c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_armv6;
/* c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_armv6; */
c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_armv6;
c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_armv6;
c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_armv6;
/* c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_armv6; */
c->avg_pixels_tab[0][0] = ff_avg_pixels16_armv6;
c->avg_pixels_tab[1][0] = ff_avg_pixels8_armv6;
}

@@ -0,0 +1,88 @@
/*
* ARM NEON optimised DSP functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stddef.h>
#include <stdint.h>
#include "libavutil/attributes.h"
#include "hpeldsp_arm.h"
void ff_put_pixels16_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels16_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels16_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels8_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels8_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels8_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels8_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels8_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels8_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_avg_pixels16_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_avg_pixels16_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_avg_pixels16_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_avg_pixels16_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_avg_pixels8_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_avg_pixels8_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_avg_pixels8_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_avg_pixels8_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_avg_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_avg_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
void ff_avg_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
av_cold void ff_hpeldsp_init_neon(HpelDSPContext *c, int flags)
{
c->put_pixels_tab[0][0] = ff_put_pixels16_neon;
c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon;
c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon;
c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon;
c->put_pixels_tab[1][0] = ff_put_pixels8_neon;
c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon;
c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon;
c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon;
c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon;
c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon;
c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon;
c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon;
c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon;
c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon;
c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon;
c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon;
c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon;
c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_neon;
c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_neon;
c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_neon;
c->avg_pixels_tab[1][0] = ff_avg_pixels8_neon;
c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_neon;
c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_neon;
c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_neon;
c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_neon;
c->avg_no_rnd_pixels_tab[1] = ff_avg_pixels16_x2_no_rnd_neon;
c->avg_no_rnd_pixels_tab[2] = ff_avg_pixels16_y2_no_rnd_neon;
c->avg_no_rnd_pixels_tab[3] = ff_avg_pixels16_xy2_no_rnd_neon;
}

@@ -0,0 +1,410 @@
/*
* ARM NEON optimised DSP functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
.macro pixels16 rnd=1, avg=0
.if \avg
mov r12, r0
.endif
1: vld1.8 {q0}, [r1], r2
vld1.8 {q1}, [r1], r2
vld1.8 {q2}, [r1], r2
pld [r1, r2, lsl #2]
vld1.8 {q3}, [r1], r2
pld [r1]
pld [r1, r2]
pld [r1, r2, lsl #1]
.if \avg
vld1.8 {q8}, [r12,:128], r2
vrhadd.u8 q0, q0, q8
vld1.8 {q9}, [r12,:128], r2
vrhadd.u8 q1, q1, q9
vld1.8 {q10}, [r12,:128], r2
vrhadd.u8 q2, q2, q10
vld1.8 {q11}, [r12,:128], r2
vrhadd.u8 q3, q3, q11
.endif
subs r3, r3, #4
vst1.64 {q0}, [r0,:128], r2
vst1.64 {q1}, [r0,:128], r2
vst1.64 {q2}, [r0,:128], r2
vst1.64 {q3}, [r0,:128], r2
bne 1b
bx lr
.endm
.macro pixels16_x2 rnd=1, avg=0
1: vld1.8 {d0-d2}, [r1], r2
vld1.8 {d4-d6}, [r1], r2
pld [r1]
pld [r1, r2]
subs r3, r3, #2
vext.8 q1, q0, q1, #1
avg q0, q0, q1
vext.8 q3, q2, q3, #1
avg q2, q2, q3
.if \avg
vld1.8 {q1}, [r0,:128], r2
vld1.8 {q3}, [r0,:128]
vrhadd.u8 q0, q0, q1
vrhadd.u8 q2, q2, q3
sub r0, r0, r2
.endif
vst1.8 {q0}, [r0,:128], r2
vst1.8 {q2}, [r0,:128], r2
bne 1b
bx lr
.endm
.macro pixels16_y2 rnd=1, avg=0
sub r3, r3, #2
vld1.8 {q0}, [r1], r2
vld1.8 {q1}, [r1], r2
1: subs r3, r3, #2
avg q2, q0, q1
vld1.8 {q0}, [r1], r2
avg q3, q0, q1
vld1.8 {q1}, [r1], r2
pld [r1]
pld [r1, r2]
.if \avg
vld1.8 {q8}, [r0,:128], r2
vld1.8 {q9}, [r0,:128]
vrhadd.u8 q2, q2, q8
vrhadd.u8 q3, q3, q9
sub r0, r0, r2
.endif
vst1.8 {q2}, [r0,:128], r2
vst1.8 {q3}, [r0,:128], r2
bne 1b
avg q2, q0, q1
vld1.8 {q0}, [r1], r2
avg q3, q0, q1
.if \avg
vld1.8 {q8}, [r0,:128], r2
vld1.8 {q9}, [r0,:128]
vrhadd.u8 q2, q2, q8
vrhadd.u8 q3, q3, q9
sub r0, r0, r2
.endif
vst1.8 {q2}, [r0,:128], r2
vst1.8 {q3}, [r0,:128], r2
bx lr
.endm
.macro pixels16_xy2 rnd=1, avg=0
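@ 2x2 halfpel: widen to 16 bits, sum the four neighbouring pixels and narrow
@ by 2 bits; the rounding variant uses vrshrn, the no_rnd one adds the NRND #1 bias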
sub r3, r3, #2
vld1.8 {d0-d2}, [r1], r2
vld1.8 {d4-d6}, [r1], r2
NRND vmov.i16 q13, #1
pld [r1]
pld [r1, r2]
vext.8 q1, q0, q1, #1
vext.8 q3, q2, q3, #1
vaddl.u8 q8, d0, d2
vaddl.u8 q10, d1, d3
vaddl.u8 q9, d4, d6
vaddl.u8 q11, d5, d7
1: subs r3, r3, #2
vld1.8 {d0-d2}, [r1], r2
vadd.u16 q12, q8, q9
pld [r1]
NRND vadd.u16 q12, q12, q13
vext.8 q15, q0, q1, #1
vadd.u16 q1 , q10, q11
shrn d28, q12, #2
NRND vadd.u16 q1, q1, q13
shrn d29, q1, #2
.if \avg
vld1.8 {q8}, [r0,:128]
vrhadd.u8 q14, q14, q8
.endif
vaddl.u8 q8, d0, d30
vld1.8 {d2-d4}, [r1], r2
vaddl.u8 q10, d1, d31
vst1.8 {q14}, [r0,:128], r2
vadd.u16 q12, q8, q9
pld [r1, r2]
NRND vadd.u16 q12, q12, q13
vext.8 q2, q1, q2, #1
vadd.u16 q0, q10, q11
shrn d30, q12, #2
NRND vadd.u16 q0, q0, q13
shrn d31, q0, #2
.if \avg
vld1.8 {q9}, [r0,:128]
vrhadd.u8 q15, q15, q9
.endif
vaddl.u8 q9, d2, d4
vaddl.u8 q11, d3, d5
vst1.8 {q15}, [r0,:128], r2
bgt 1b
vld1.8 {d0-d2}, [r1], r2
vadd.u16 q12, q8, q9
NRND vadd.u16 q12, q12, q13
vext.8 q15, q0, q1, #1
vadd.u16 q1 , q10, q11
shrn d28, q12, #2
NRND vadd.u16 q1, q1, q13
shrn d29, q1, #2
.if \avg
vld1.8 {q8}, [r0,:128]
vrhadd.u8 q14, q14, q8
.endif
vaddl.u8 q8, d0, d30
vaddl.u8 q10, d1, d31
vst1.8 {q14}, [r0,:128], r2
vadd.u16 q12, q8, q9
NRND vadd.u16 q12, q12, q13
vadd.u16 q0, q10, q11
shrn d30, q12, #2
NRND vadd.u16 q0, q0, q13
shrn d31, q0, #2
.if \avg
vld1.8 {q9}, [r0,:128]
vrhadd.u8 q15, q15, q9
.endif
vst1.8 {q15}, [r0,:128], r2
bx lr
.endm
.macro pixels8 rnd=1, avg=0
1: vld1.8 {d0}, [r1], r2
vld1.8 {d1}, [r1], r2
vld1.8 {d2}, [r1], r2
pld [r1, r2, lsl #2]
vld1.8 {d3}, [r1], r2
pld [r1]
pld [r1, r2]
pld [r1, r2, lsl #1]
.if \avg
vld1.8 {d4}, [r0,:64], r2
vrhadd.u8 d0, d0, d4
vld1.8 {d5}, [r0,:64], r2
vrhadd.u8 d1, d1, d5
vld1.8 {d6}, [r0,:64], r2
vrhadd.u8 d2, d2, d6
vld1.8 {d7}, [r0,:64], r2
vrhadd.u8 d3, d3, d7
sub r0, r0, r2, lsl #2
.endif
subs r3, r3, #4
vst1.8 {d0}, [r0,:64], r2
vst1.8 {d1}, [r0,:64], r2
vst1.8 {d2}, [r0,:64], r2
vst1.8 {d3}, [r0,:64], r2
bne 1b
bx lr
.endm
.macro pixels8_x2 rnd=1, avg=0
1: vld1.8 {q0}, [r1], r2
vext.8 d1, d0, d1, #1
vld1.8 {q1}, [r1], r2
vext.8 d3, d2, d3, #1
pld [r1]
pld [r1, r2]
subs r3, r3, #2
vswp d1, d2
avg q0, q0, q1
.if \avg
vld1.8 {d4}, [r0,:64], r2
vld1.8 {d5}, [r0,:64]
vrhadd.u8 q0, q0, q2
sub r0, r0, r2
.endif
vst1.8 {d0}, [r0,:64], r2
vst1.8 {d1}, [r0,:64], r2
bne 1b
bx lr
.endm
.macro pixels8_y2 rnd=1, avg=0
sub r3, r3, #2
vld1.8 {d0}, [r1], r2
vld1.8 {d1}, [r1], r2
1: subs r3, r3, #2
avg d4, d0, d1
vld1.8 {d0}, [r1], r2
avg d5, d0, d1
vld1.8 {d1}, [r1], r2
pld [r1]
pld [r1, r2]
.if \avg
vld1.8 {d2}, [r0,:64], r2
vld1.8 {d3}, [r0,:64]
vrhadd.u8 q2, q2, q1
sub r0, r0, r2
.endif
vst1.8 {d4}, [r0,:64], r2
vst1.8 {d5}, [r0,:64], r2
bne 1b
avg d4, d0, d1
vld1.8 {d0}, [r1], r2
avg d5, d0, d1
.if \avg
vld1.8 {d2}, [r0,:64], r2
vld1.8 {d3}, [r0,:64]
vrhadd.u8 q2, q2, q1
sub r0, r0, r2
.endif
vst1.8 {d4}, [r0,:64], r2
vst1.8 {d5}, [r0,:64], r2
bx lr
.endm
.macro pixels8_xy2 rnd=1, avg=0
sub r3, r3, #2
vld1.8 {q0}, [r1], r2
vld1.8 {q1}, [r1], r2
NRND vmov.i16 q11, #1
pld [r1]
pld [r1, r2]
vext.8 d4, d0, d1, #1
vext.8 d6, d2, d3, #1
vaddl.u8 q8, d0, d4
vaddl.u8 q9, d2, d6
1: subs r3, r3, #2
vld1.8 {q0}, [r1], r2
pld [r1]
vadd.u16 q10, q8, q9
vext.8 d4, d0, d1, #1
NRND vadd.u16 q10, q10, q11
vaddl.u8 q8, d0, d4
shrn d5, q10, #2
vld1.8 {q1}, [r1], r2
vadd.u16 q10, q8, q9
pld [r1, r2]
.if \avg
vld1.8 {d7}, [r0,:64]
vrhadd.u8 d5, d5, d7
.endif
NRND vadd.u16 q10, q10, q11
vst1.8 {d5}, [r0,:64], r2
shrn d7, q10, #2
.if \avg
vld1.8 {d5}, [r0,:64]
vrhadd.u8 d7, d7, d5
.endif
vext.8 d6, d2, d3, #1
vaddl.u8 q9, d2, d6
vst1.8 {d7}, [r0,:64], r2
bgt 1b
vld1.8 {q0}, [r1], r2
vadd.u16 q10, q8, q9
vext.8 d4, d0, d1, #1
NRND vadd.u16 q10, q10, q11
vaddl.u8 q8, d0, d4
shrn d5, q10, #2
vadd.u16 q10, q8, q9
.if \avg
vld1.8 {d7}, [r0,:64]
vrhadd.u8 d5, d5, d7
.endif
NRND vadd.u16 q10, q10, q11
vst1.8 {d5}, [r0,:64], r2
shrn d7, q10, #2
.if \avg
vld1.8 {d5}, [r0,:64]
vrhadd.u8 d7, d7, d5
.endif
vst1.8 {d7}, [r0,:64], r2
bx lr
.endm
.macro pixfunc pfx, name, suf, rnd=1, avg=0
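@ rnd=1 expands avg/shrn to the rounding forms (vrhadd.u8/vrshrn.u16) and drops
@ NRND-prefixed lines; rnd=0 uses vhadd.u8/vshrn.u16 and keeps them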
.if \rnd
.macro avg rd, rn, rm
vrhadd.u8 \rd, \rn, \rm
.endm
.macro shrn rd, rn, rm
vrshrn.u16 \rd, \rn, \rm
.endm
.macro NRND insn:vararg
.endm
.else
.macro avg rd, rn, rm
vhadd.u8 \rd, \rn, \rm
.endm
.macro shrn rd, rn, rm
vshrn.u16 \rd, \rn, \rm
.endm
.macro NRND insn:vararg
\insn
.endm
.endif
function ff_\pfx\name\suf\()_neon, export=1
\name \rnd, \avg
endfunc
.purgem avg
.purgem shrn
.purgem NRND
.endm
.macro pixfunc2 pfx, name, avg=0
pixfunc \pfx, \name, rnd=1, avg=\avg
pixfunc \pfx, \name, _no_rnd, rnd=0, avg=\avg
.endm
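@ the h264 qpel mc00 entry points below only set the height in r3 and fall
@ through into the pixels16/pixels8 functions emitted immediately after them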
function ff_put_h264_qpel16_mc00_neon, export=1
mov r3, #16
endfunc
pixfunc put_, pixels16, avg=0
pixfunc2 put_, pixels16_x2, avg=0
pixfunc2 put_, pixels16_y2, avg=0
pixfunc2 put_, pixels16_xy2, avg=0
function ff_avg_h264_qpel16_mc00_neon, export=1
mov r3, #16
endfunc
pixfunc avg_, pixels16, avg=1
pixfunc2 avg_, pixels16_x2, avg=1
pixfunc2 avg_, pixels16_y2, avg=1
pixfunc2 avg_, pixels16_xy2, avg=1
function ff_put_h264_qpel8_mc00_neon, export=1
mov r3, #8
endfunc
pixfunc put_, pixels8, avg=0
pixfunc2 put_, pixels8_x2, avg=0
pixfunc2 put_, pixels8_y2, avg=0
pixfunc2 put_, pixels8_xy2, avg=0
function ff_avg_h264_qpel8_mc00_neon, export=1
mov r3, #8
endfunc
pixfunc avg_, pixels8, avg=1
pixfunc avg_, pixels8_x2, avg=1
pixfunc avg_, pixels8_y2, avg=1
pixfunc avg_, pixels8_xy2, avg=1

@@ -0,0 +1,40 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_ARM_IDCT_H
#define AVCODEC_ARM_IDCT_H
#include <stdint.h>
void ff_j_rev_dct_arm(int16_t *data);
void ff_simple_idct_arm(int16_t *data);
void ff_simple_idct_armv5te(int16_t *data);
void ff_simple_idct_put_armv5te(uint8_t *dest, int line_size, int16_t *data);
void ff_simple_idct_add_armv5te(uint8_t *dest, int line_size, int16_t *data);
void ff_simple_idct_armv6(int16_t *data);
void ff_simple_idct_put_armv6(uint8_t *dest, int line_size, int16_t *data);
void ff_simple_idct_add_armv6(uint8_t *dest, int line_size, int16_t *data);
void ff_simple_idct_neon(int16_t *data);
void ff_simple_idct_put_neon(uint8_t *dest, int line_size, int16_t *data);
void ff_simple_idct_add_neon(uint8_t *dest, int line_size, int16_t *data);
#endif /* AVCODEC_ARM_IDCT_H */

@@ -0,0 +1,120 @@
@
@ ARMv4-optimized IDCT functions
@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
@
@ This file is part of FFmpeg.
@
@ FFmpeg is free software; you can redistribute it and/or
@ modify it under the terms of the GNU Lesser General Public
@ License as published by the Free Software Foundation; either
@ version 2.1 of the License, or (at your option) any later version.
@
@ FFmpeg is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
@ Lesser General Public License for more details.
@
@ You should have received a copy of the GNU Lesser General Public
@ License along with FFmpeg; if not, write to the Free Software
@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@
#include "config.h"
#include "libavutil/arm/asm.S"
@ void ff_add_pixels_clamped_arm(int16_t *block, uint8_t *dest, ptrdiff_t stride)
function ff_add_pixels_clamped_arm, export=1, align=5
push {r4-r10}
mov r10, #8
1:
ldr r4, [r1] /* load dest */
/* block[0] and block[1]*/
ldrsh r5, [r0]
ldrsh r7, [r0, #2]
and r6, r4, #0xFF
and r8, r4, #0xFF00
add r6, r6, r5
add r8, r7, r8, lsr #8
mvn r5, r5
mvn r7, r7
tst r6, #0x100
it ne
movne r6, r5, lsr #24
tst r8, #0x100
it ne
movne r8, r7, lsr #24
mov r9, r6
ldrsh r5, [r0, #4] /* moved from [A] */
orr r9, r9, r8, lsl #8
/* block[2] and block[3] */
/* [A] */
ldrsh r7, [r0, #6]
and r6, r4, #0xFF0000
and r8, r4, #0xFF000000
add r6, r5, r6, lsr #16
add r8, r7, r8, lsr #24
mvn r5, r5
mvn r7, r7
tst r6, #0x100
it ne
movne r6, r5, lsr #24
tst r8, #0x100
it ne
movne r8, r7, lsr #24
orr r9, r9, r6, lsl #16
ldr r4, [r1, #4] /* moved from [B] */
orr r9, r9, r8, lsl #24
/* store dest */
ldrsh r5, [r0, #8] /* moved from [C] */
str r9, [r1]
/* load dest */
/* [B] */
/* block[4] and block[5] */
/* [C] */
ldrsh r7, [r0, #10]
and r6, r4, #0xFF
and r8, r4, #0xFF00
add r6, r6, r5
add r8, r7, r8, lsr #8
mvn r5, r5
mvn r7, r7
tst r6, #0x100
it ne
movne r6, r5, lsr #24
tst r8, #0x100
it ne
movne r8, r7, lsr #24
mov r9, r6
ldrsh r5, [r0, #12] /* moved from [D] */
orr r9, r9, r8, lsl #8
/* block[6] and block[7] */
/* [D] */
ldrsh r7, [r0, #14]
and r6, r4, #0xFF0000
and r8, r4, #0xFF000000
add r6, r5, r6, lsr #16
add r8, r7, r8, lsr #24
mvn r5, r5
mvn r7, r7
tst r6, #0x100
it ne
movne r6, r5, lsr #24
tst r8, #0x100
it ne
movne r8, r7, lsr #24
orr r9, r9, r6, lsl #16
add r0, r0, #16 /* moved from [E] */
orr r9, r9, r8, lsl #24
subs r10, r10, #1 /* moved from [F] */
/* store dest */
str r9, [r1, #4]
/* [E] */
/* [F] */
add r1, r1, r2
bne 1b
pop {r4-r10}
bx lr
endfunc

@@ -0,0 +1,34 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_ARM_IDCTDSP_ARM_H
#define AVCODEC_ARM_IDCTDSP_ARM_H
#include "libavcodec/avcodec.h"
#include "libavcodec/idctdsp.h"
void ff_idctdsp_init_armv5te(IDCTDSPContext *c, AVCodecContext *avctx,
unsigned high_bit_depth);
void ff_idctdsp_init_armv6(IDCTDSPContext *c, AVCodecContext *avctx,
unsigned high_bit_depth);
void ff_idctdsp_init_neon(IDCTDSPContext *c, AVCodecContext *avctx,
unsigned high_bit_depth);
#endif /* AVCODEC_ARM_IDCTDSP_ARM_H */

@@ -0,0 +1,48 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
function ff_add_pixels_clamped_armv6, export=1
push {r4-r8,lr}
mov r3, #8
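@ 8 rows: each iteration adds eight 16-bit coefficients to eight destination
@ bytes (uxtab16), saturates to [0,255] (usat16) and repacks the bytes with orr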
1:
ldm r0!, {r4,r5,r12,lr}
ldrd r6, r7, [r1]
pkhbt r8, r4, r5, lsl #16
pkhtb r5, r5, r4, asr #16
pkhbt r4, r12, lr, lsl #16
pkhtb lr, lr, r12, asr #16
pld [r1, r2]
uxtab16 r8, r8, r6
uxtab16 r5, r5, r6, ror #8
uxtab16 r4, r4, r7
uxtab16 lr, lr, r7, ror #8
usat16 r8, #8, r8
usat16 r5, #8, r5
usat16 r4, #8, r4
usat16 lr, #8, lr
orr r6, r8, r5, lsl #8
orr r7, r4, lr, lsl #8
subs r3, r3, #1
strd_post r6, r7, r1, r2
bgt 1b
pop {r4-r8,pc}
endfunc

@@ -0,0 +1,89 @@
/*
* ARM-optimized IDCT functions
* Copyright (c) 2001 Lionel Ulmer
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/idctdsp.h"
#include "idct.h"
#include "idctdsp_arm.h"
void ff_add_pixels_clamped_arm(const int16_t *block, uint8_t *dest,
ptrdiff_t line_size);
/* XXX: these functions should be removed as soon as all IDCTs are
 * converted */
static void j_rev_dct_arm_put(uint8_t *dest, int line_size, int16_t *block)
{
ff_j_rev_dct_arm(block);
ff_put_pixels_clamped(block, dest, line_size);
}
static void j_rev_dct_arm_add(uint8_t *dest, int line_size, int16_t *block)
{
ff_j_rev_dct_arm(block);
ff_add_pixels_clamped(block, dest, line_size);
}
static void simple_idct_arm_put(uint8_t *dest, int line_size, int16_t *block)
{
ff_simple_idct_arm(block);
ff_put_pixels_clamped(block, dest, line_size);
}
static void simple_idct_arm_add(uint8_t *dest, int line_size, int16_t *block)
{
ff_simple_idct_arm(block);
ff_add_pixels_clamped(block, dest, line_size);
}
av_cold void ff_idctdsp_init_arm(IDCTDSPContext *c, AVCodecContext *avctx,
unsigned high_bit_depth)
{
int cpu_flags = av_get_cpu_flags();
if (!avctx->lowres && !high_bit_depth) {
if ((avctx->idct_algo == FF_IDCT_AUTO && !(avctx->flags & AV_CODEC_FLAG_BITEXACT)) ||
avctx->idct_algo == FF_IDCT_ARM) {
c->idct_put = j_rev_dct_arm_put;
c->idct_add = j_rev_dct_arm_add;
c->idct = ff_j_rev_dct_arm;
c->perm_type = FF_IDCT_PERM_LIBMPEG2;
} else if (avctx->idct_algo == FF_IDCT_SIMPLEARM) {
c->idct_put = simple_idct_arm_put;
c->idct_add = simple_idct_arm_add;
c->idct = ff_simple_idct_arm;
c->perm_type = FF_IDCT_PERM_NONE;
}
}
c->add_pixels_clamped = ff_add_pixels_clamped_arm;
if (have_armv5te(cpu_flags))
ff_idctdsp_init_armv5te(c, avctx, high_bit_depth);
if (have_armv6(cpu_flags))
ff_idctdsp_init_armv6(c, avctx, high_bit_depth);
if (have_neon(cpu_flags))
ff_idctdsp_init_neon(c, avctx, high_bit_depth);
}

@@ -0,0 +1,41 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/idctdsp.h"
#include "idct.h"
#include "idctdsp_arm.h"
av_cold void ff_idctdsp_init_armv5te(IDCTDSPContext *c, AVCodecContext *avctx,
unsigned high_bit_depth)
{
if (!avctx->lowres && !high_bit_depth &&
(avctx->idct_algo == FF_IDCT_AUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEARMV5TE)) {
c->idct_put = ff_simple_idct_put_armv5te;
c->idct_add = ff_simple_idct_add_armv5te;
c->idct = ff_simple_idct_armv5te;
c->perm_type = FF_IDCT_PERM_NONE;
}
}

@@ -0,0 +1,45 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/idctdsp.h"
#include "idct.h"
#include "idctdsp_arm.h"
void ff_add_pixels_clamped_armv6(const int16_t *block, uint8_t *pixels,
ptrdiff_t line_size);
av_cold void ff_idctdsp_init_armv6(IDCTDSPContext *c, AVCodecContext *avctx,
unsigned high_bit_depth)
{
if (!avctx->lowres && !high_bit_depth) {
if ((avctx->idct_algo == FF_IDCT_AUTO && !(avctx->flags & AV_CODEC_FLAG_BITEXACT)) ||
avctx->idct_algo == FF_IDCT_SIMPLEARMV6) {
c->idct_put = ff_simple_idct_put_armv6;
c->idct_add = ff_simple_idct_add_armv6;
c->idct = ff_simple_idct_armv6;
c->perm_type = FF_IDCT_PERM_LIBMPEG2;
}
}
c->add_pixels_clamped = ff_add_pixels_clamped_armv6;
}

@@ -0,0 +1,51 @@
/*
* ARM-NEON-optimized IDCT functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/idctdsp.h"
#include "idct.h"
#include "idctdsp_arm.h"
void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
av_cold void ff_idctdsp_init_neon(IDCTDSPContext *c, AVCodecContext *avctx,
unsigned high_bit_depth)
{
if (!avctx->lowres && !high_bit_depth) {
if (avctx->idct_algo == FF_IDCT_AUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
avctx->idct_algo == FF_IDCT_SIMPLENEON) {
c->idct_put = ff_simple_idct_put_neon;
c->idct_add = ff_simple_idct_add_neon;
c->idct = ff_simple_idct_neon;
c->perm_type = FF_IDCT_PERM_PARTTRANS;
}
}
c->add_pixels_clamped = ff_add_pixels_clamped_neon;
c->put_pixels_clamped = ff_put_pixels_clamped_neon;
c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon;
}

View File

@@ -0,0 +1,128 @@
/*
* ARM-NEON-optimized IDCT functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
function ff_put_pixels_clamped_neon, export=1
vld1.16 {d16-d19}, [r0,:128]!
vqmovun.s16 d0, q8
vld1.16 {d20-d23}, [r0,:128]!
vqmovun.s16 d1, q9
vld1.16 {d24-d27}, [r0,:128]!
vqmovun.s16 d2, q10
vld1.16 {d28-d31}, [r0,:128]!
vqmovun.s16 d3, q11
vst1.8 {d0}, [r1,:64], r2
vqmovun.s16 d4, q12
vst1.8 {d1}, [r1,:64], r2
vqmovun.s16 d5, q13
vst1.8 {d2}, [r1,:64], r2
vqmovun.s16 d6, q14
vst1.8 {d3}, [r1,:64], r2
vqmovun.s16 d7, q15
vst1.8 {d4}, [r1,:64], r2
vst1.8 {d5}, [r1,:64], r2
vst1.8 {d6}, [r1,:64], r2
vst1.8 {d7}, [r1,:64], r2
bx lr
endfunc
function ff_put_signed_pixels_clamped_neon, export=1
vmov.u8 d31, #128
vld1.16 {d16-d17}, [r0,:128]!
vqmovn.s16 d0, q8
vld1.16 {d18-d19}, [r0,:128]!
vqmovn.s16 d1, q9
vld1.16 {d16-d17}, [r0,:128]!
vqmovn.s16 d2, q8
vld1.16 {d18-d19}, [r0,:128]!
vadd.u8 d0, d0, d31
vld1.16 {d20-d21}, [r0,:128]!
vadd.u8 d1, d1, d31
vld1.16 {d22-d23}, [r0,:128]!
vadd.u8 d2, d2, d31
vst1.8 {d0}, [r1,:64], r2
vqmovn.s16 d3, q9
vst1.8 {d1}, [r1,:64], r2
vqmovn.s16 d4, q10
vst1.8 {d2}, [r1,:64], r2
vqmovn.s16 d5, q11
vld1.16 {d24-d25}, [r0,:128]!
vadd.u8 d3, d3, d31
vld1.16 {d26-d27}, [r0,:128]!
vadd.u8 d4, d4, d31
vadd.u8 d5, d5, d31
vst1.8 {d3}, [r1,:64], r2
vqmovn.s16 d6, q12
vst1.8 {d4}, [r1,:64], r2
vqmovn.s16 d7, q13
vst1.8 {d5}, [r1,:64], r2
vadd.u8 d6, d6, d31
vadd.u8 d7, d7, d31
vst1.8 {d6}, [r1,:64], r2
vst1.8 {d7}, [r1,:64], r2
bx lr
endfunc
function ff_add_pixels_clamped_neon, export=1
mov r3, r1
vld1.8 {d16}, [r1,:64], r2
vld1.16 {d0-d1}, [r0,:128]!
vaddw.u8 q0, q0, d16
vld1.8 {d17}, [r1,:64], r2
vld1.16 {d2-d3}, [r0,:128]!
vqmovun.s16 d0, q0
vld1.8 {d18}, [r1,:64], r2
vaddw.u8 q1, q1, d17
vld1.16 {d4-d5}, [r0,:128]!
vaddw.u8 q2, q2, d18
vst1.8 {d0}, [r3,:64], r2
vqmovun.s16 d2, q1
vld1.8 {d19}, [r1,:64], r2
vld1.16 {d6-d7}, [r0,:128]!
vaddw.u8 q3, q3, d19
vqmovun.s16 d4, q2
vst1.8 {d2}, [r3,:64], r2
vld1.8 {d16}, [r1,:64], r2
vqmovun.s16 d6, q3
vld1.16 {d0-d1}, [r0,:128]!
vaddw.u8 q0, q0, d16
vst1.8 {d4}, [r3,:64], r2
vld1.8 {d17}, [r1,:64], r2
vld1.16 {d2-d3}, [r0,:128]!
vaddw.u8 q1, q1, d17
vst1.8 {d6}, [r3,:64], r2
vqmovun.s16 d0, q0
vld1.8 {d18}, [r1,:64], r2
vld1.16 {d4-d5}, [r0,:128]!
vaddw.u8 q2, q2, d18
vst1.8 {d0}, [r3,:64], r2
vqmovun.s16 d2, q1
vld1.8 {d19}, [r1,:64], r2
vqmovun.s16 d4, q2
vld1.16 {d6-d7}, [r0,:128]!
vaddw.u8 q3, q3, d19
vst1.8 {d2}, [r3,:64], r2
vqmovun.s16 d6, q3
vst1.8 {d4}, [r3,:64], r2
vst1.8 {d6}, [r3,:64], r2
bx lr
endfunc

View File

@@ -0,0 +1,51 @@
/*
* ARM NEON optimised integer operations
* Copyright (c) 2009 Konstantin Shishkov
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
function ff_scalarproduct_int16_neon, export=1
vmov.i16 q0, #0
vmov.i16 q1, #0
vmov.i16 q2, #0
vmov.i16 q3, #0
1: vld1.16 {d16-d17}, [r0]!
vld1.16 {d20-d21}, [r1,:128]!
vmlal.s16 q0, d16, d20
vld1.16 {d18-d19}, [r0]!
vmlal.s16 q1, d17, d21
vld1.16 {d22-d23}, [r1,:128]!
vmlal.s16 q2, d18, d22
vmlal.s16 q3, d19, d23
subs r2, r2, #16
bgt 1b
vpadd.s32 d16, d0, d1
vpadd.s32 d17, d2, d3
vpadd.s32 d18, d4, d5
vpadd.s32 d19, d6, d7
vpadd.s32 d0, d16, d17
vpadd.s32 d1, d18, d19
vpadd.s32 d2, d0, d1
vpaddl.s32 d3, d2
vmov.32 r0, d3[0]
bx lr
endfunc

View File

@@ -0,0 +1,383 @@
/*
C-like prototype :
void j_rev_dct_arm(DCTBLOCK data)
With DCTBLOCK being a pointer to an array of 64 'signed shorts'
Copyright (c) 2001 Lionel Ulmer (lionel.ulmer@free.fr / bbrox@bbrox.org)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libavutil/arm/asm.S"
#define FIX_0_298631336 2446
#define FIX_0_541196100 4433
#define FIX_0_765366865 6270
#define FIX_1_175875602 9633
#define FIX_1_501321110 12299
#define FIX_2_053119869 16819
#define FIX_3_072711026 25172
#define FIX_M_0_390180644 -3196
#define FIX_M_0_899976223 -7373
#define FIX_M_1_847759065 -15137
#define FIX_M_1_961570560 -16069
#define FIX_M_2_562915447 -20995
#define FIX_0xFFFF 0xFFFF
#define FIX_0_298631336_ID 0
#define FIX_0_541196100_ID 4
#define FIX_0_765366865_ID 8
#define FIX_1_175875602_ID 12
#define FIX_1_501321110_ID 16
#define FIX_2_053119869_ID 20
#define FIX_3_072711026_ID 24
#define FIX_M_0_390180644_ID 28
#define FIX_M_0_899976223_ID 32
#define FIX_M_1_847759065_ID 36
#define FIX_M_1_961570560_ID 40
#define FIX_M_2_562915447_ID 44
#define FIX_0xFFFF_ID 48
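@ The FIX_* values above appear to be the usual libjpeg LLM IDCT constants
@ in 13-bit fixed point, i.e. round(x * (1 << 13)), e.g.
@ round(0.298631336 * 8192) = 2446.  The *_ID defines are the byte offsets
@ of the corresponding words in const_array at the end of this file.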
function ff_j_rev_dct_arm, export=1
push {r0, r4 - r11, lr}
mov lr, r0 @ lr = pointer to the current row
mov r12, #8 @ r12 = row-counter
movrel r11, const_array @ r11 = base pointer to the constants array
row_loop:
ldrsh r0, [lr, # 0] @ r0 = 'd0'
ldrsh r2, [lr, # 2] @ r2 = 'd2'
@ Optimization for rows that have all items except the first set to 0
@ (this works as the int16_t values are always 4-byte aligned)
ldr r5, [lr, # 0]
ldr r6, [lr, # 4]
ldr r3, [lr, # 8]
ldr r4, [lr, #12]
orr r3, r3, r4
orr r3, r3, r6
orrs r5, r3, r5
beq end_of_row_loop @ nothing to be done as ALL of them are '0'
orrs r3, r3, r2
beq empty_row
ldrsh r1, [lr, # 8] @ r1 = 'd1'
ldrsh r4, [lr, # 4] @ r4 = 'd4'
ldrsh r6, [lr, # 6] @ r6 = 'd6'
ldr r3, [r11, #FIX_0_541196100_ID]
add r7, r2, r6
ldr r5, [r11, #FIX_M_1_847759065_ID]
mul r7, r3, r7 @ r7 = z1
ldr r3, [r11, #FIX_0_765366865_ID]
mla r6, r5, r6, r7 @ r6 = tmp2
add r5, r0, r4 @ r5 = tmp0
mla r2, r3, r2, r7 @ r2 = tmp3
sub r3, r0, r4 @ r3 = tmp1
add r0, r2, r5, lsl #13 @ r0 = tmp10
rsb r2, r2, r5, lsl #13 @ r2 = tmp13
add r4, r6, r3, lsl #13 @ r4 = tmp11
rsb r3, r6, r3, lsl #13 @ r3 = tmp12
push {r0, r2, r3, r4} @ save on the stack tmp10, tmp13, tmp12, tmp11
ldrsh r3, [lr, #10] @ r3 = 'd3'
ldrsh r5, [lr, #12] @ r5 = 'd5'
ldrsh r7, [lr, #14] @ r7 = 'd7'
add r0, r3, r5 @ r0 = 'z2'
add r2, r1, r7 @ r2 = 'z1'
add r4, r3, r7 @ r4 = 'z3'
add r6, r1, r5 @ r6 = 'z4'
ldr r9, [r11, #FIX_1_175875602_ID]
add r8, r4, r6 @ r8 = z3 + z4
ldr r10, [r11, #FIX_M_0_899976223_ID]
mul r8, r9, r8 @ r8 = 'z5'
ldr r9, [r11, #FIX_M_2_562915447_ID]
mul r2, r10, r2 @ r2 = 'z1'
ldr r10, [r11, #FIX_M_1_961570560_ID]
mul r0, r9, r0 @ r0 = 'z2'
ldr r9, [r11, #FIX_M_0_390180644_ID]
mla r4, r10, r4, r8 @ r4 = 'z3'
ldr r10, [r11, #FIX_0_298631336_ID]
mla r6, r9, r6, r8 @ r6 = 'z4'
ldr r9, [r11, #FIX_2_053119869_ID]
mla r7, r10, r7, r2 @ r7 = tmp0 + z1
ldr r10, [r11, #FIX_3_072711026_ID]
mla r5, r9, r5, r0 @ r5 = tmp1 + z2
ldr r9, [r11, #FIX_1_501321110_ID]
mla r3, r10, r3, r0 @ r3 = tmp2 + z2
add r7, r7, r4 @ r7 = tmp0
mla r1, r9, r1, r2 @ r1 = tmp3 + z1
add r5, r5, r6 @ r5 = tmp1
add r3, r3, r4 @ r3 = tmp2
add r1, r1, r6 @ r1 = tmp3
pop {r0, r2, r4, r6} @ r0 = tmp10 / r2 = tmp13 / r4 = tmp12 / r6 = tmp11
@ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0
@ Compute DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS)
add r8, r0, r1
add r8, r8, #(1<<10)
mov r8, r8, asr #11
strh r8, [lr, # 0]
@ Compute DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS)
sub r8, r0, r1
add r8, r8, #(1<<10)
mov r8, r8, asr #11
strh r8, [lr, #14]
@ Compute DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS)
add r8, r6, r3
add r8, r8, #(1<<10)
mov r8, r8, asr #11
strh r8, [lr, # 2]
@ Compute DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS)
sub r8, r6, r3
add r8, r8, #(1<<10)
mov r8, r8, asr #11
strh r8, [lr, #12]
@ Compute DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS)
add r8, r4, r5
add r8, r8, #(1<<10)
mov r8, r8, asr #11
strh r8, [lr, # 4]
@ Compute DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS)
sub r8, r4, r5
add r8, r8, #(1<<10)
mov r8, r8, asr #11
strh r8, [lr, #10]
@ Compute DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS)
add r8, r2, r7
add r8, r8, #(1<<10)
mov r8, r8, asr #11
strh r8, [lr, # 6]
@ Compute DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS)
sub r8, r2, r7
add r8, r8, #(1<<10)
mov r8, r8, asr #11
strh r8, [lr, # 8]
@ End of row loop
add lr, lr, #16
subs r12, r12, #1
bne row_loop
beq start_column_loop
empty_row:
ldr r1, [r11, #FIX_0xFFFF_ID]
mov r0, r0, lsl #2
and r0, r0, r1
add r0, r0, r0, lsl #16
str r0, [lr, # 0]
str r0, [lr, # 4]
str r0, [lr, # 8]
str r0, [lr, #12]
end_of_row_loop:
@ End of loop
add lr, lr, #16
subs r12, r12, #1
bne row_loop
start_column_loop:
@ Start of column loop
pop {lr}
mov r12, #8
column_loop:
ldrsh r0, [lr, #( 0*8)] @ r0 = 'd0'
ldrsh r2, [lr, #( 4*8)] @ r2 = 'd2'
ldrsh r4, [lr, #( 8*8)] @ r4 = 'd4'
ldrsh r6, [lr, #(12*8)] @ r6 = 'd6'
ldr r3, [r11, #FIX_0_541196100_ID]
add r1, r2, r6
ldr r5, [r11, #FIX_M_1_847759065_ID]
mul r1, r3, r1 @ r1 = z1
ldr r3, [r11, #FIX_0_765366865_ID]
mla r6, r5, r6, r1 @ r6 = tmp2
add r5, r0, r4 @ r5 = tmp0
mla r2, r3, r2, r1 @ r2 = tmp3
sub r3, r0, r4 @ r3 = tmp1
add r0, r2, r5, lsl #13 @ r0 = tmp10
rsb r2, r2, r5, lsl #13 @ r2 = tmp13
add r4, r6, r3, lsl #13 @ r4 = tmp11
rsb r6, r6, r3, lsl #13 @ r6 = tmp12
ldrsh r1, [lr, #( 2*8)] @ r1 = 'd1'
ldrsh r3, [lr, #( 6*8)] @ r3 = 'd3'
ldrsh r5, [lr, #(10*8)] @ r5 = 'd5'
ldrsh r7, [lr, #(14*8)] @ r7 = 'd7'
@ Check for empty odd column (happens about 20 to 25 % of the time according to my stats)
orr r9, r1, r3
orr r10, r5, r7
orrs r10, r9, r10
beq empty_odd_column
push {r0, r2, r4, r6} @ save on the stack tmp10, tmp13, tmp12, tmp11
add r0, r3, r5 @ r0 = 'z2'
add r2, r1, r7 @ r2 = 'z1'
add r4, r3, r7 @ r4 = 'z3'
add r6, r1, r5 @ r6 = 'z4'
ldr r9, [r11, #FIX_1_175875602_ID]
add r8, r4, r6
ldr r10, [r11, #FIX_M_0_899976223_ID]
mul r8, r9, r8 @ r8 = 'z5'
ldr r9, [r11, #FIX_M_2_562915447_ID]
mul r2, r10, r2 @ r2 = 'z1'
ldr r10, [r11, #FIX_M_1_961570560_ID]
mul r0, r9, r0 @ r0 = 'z2'
ldr r9, [r11, #FIX_M_0_390180644_ID]
mla r4, r10, r4, r8 @ r4 = 'z3'
ldr r10, [r11, #FIX_0_298631336_ID]
mla r6, r9, r6, r8 @ r6 = 'z4'
ldr r9, [r11, #FIX_2_053119869_ID]
mla r7, r10, r7, r2 @ r7 = tmp0 + z1
ldr r10, [r11, #FIX_3_072711026_ID]
mla r5, r9, r5, r0 @ r5 = tmp1 + z2
ldr r9, [r11, #FIX_1_501321110_ID]
mla r3, r10, r3, r0 @ r3 = tmp2 + z2
add r7, r7, r4 @ r7 = tmp0
mla r1, r9, r1, r2 @ r1 = tmp3 + z1
add r5, r5, r6 @ r5 = tmp1
add r3, r3, r4 @ r3 = tmp2
add r1, r1, r6 @ r1 = tmp3
pop {r0, r2, r4, r6} @ r0 = tmp10 / r2 = tmp13 / r4 = tmp11 / r6 = tmp12
@ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0
@ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
add r8, r0, r1
add r8, r8, #(1<<17)
mov r8, r8, asr #18
strh r8, [lr, #( 0*8)]
@ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
sub r8, r0, r1
add r8, r8, #(1<<17)
mov r8, r8, asr #18
strh r8, [lr, #(14*8)]
@ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
add r8, r4, r3
add r8, r8, #(1<<17)
mov r8, r8, asr #18
strh r8, [lr, #( 2*8)]
@ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
sub r8, r4, r3
add r8, r8, #(1<<17)
mov r8, r8, asr #18
strh r8, [lr, #(12*8)]
@ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
add r8, r6, r5
add r8, r8, #(1<<17)
mov r8, r8, asr #18
strh r8, [lr, #( 4*8)]
@ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
sub r8, r6, r5
add r8, r8, #(1<<17)
mov r8, r8, asr #18
strh r8, [lr, #(10*8)]
@ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
add r8, r2, r7
add r8, r8, #(1<<17)
mov r8, r8, asr #18
strh r8, [lr, #( 6*8)]
@ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
sub r8, r2, r7
add r8, r8, #(1<<17)
mov r8, r8, asr #18
strh r8, [lr, #( 8*8)]
@ End of column loop
add lr, lr, #2
subs r12, r12, #1
bne column_loop
beq the_end
empty_odd_column:
@ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
@ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
add r0, r0, #(1<<17)
mov r0, r0, asr #18
strh r0, [lr, #( 0*8)]
strh r0, [lr, #(14*8)]
@ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
@ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
add r4, r4, #(1<<17)
mov r4, r4, asr #18
strh r4, [lr, #( 2*8)]
strh r4, [lr, #(12*8)]
@ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
@ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
add r6, r6, #(1<<17)
mov r6, r6, asr #18
strh r6, [lr, #( 4*8)]
strh r6, [lr, #(10*8)]
@ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
@ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
add r2, r2, #(1<<17)
mov r2, r2, asr #18
strh r2, [lr, #( 6*8)]
strh r2, [lr, #( 8*8)]
@ End of column loop
add lr, lr, #2
subs r12, r12, #1
bne column_loop
the_end:
@ The end....
pop {r4 - r11, pc}
endfunc
const const_array
.word FIX_0_298631336
.word FIX_0_541196100
.word FIX_0_765366865
.word FIX_1_175875602
.word FIX_1_501321110
.word FIX_2_053119869
.word FIX_3_072711026
.word FIX_M_0_390180644
.word FIX_M_0_899976223
.word FIX_M_1_847759065
.word FIX_M_1_961570560
.word FIX_M_2_562915447
.word FIX_0xFFFF
endconst

View File

@@ -0,0 +1,38 @@
/*
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/lossless_audiodsp.h"
int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2,
const int16_t *v3, int len, int mul);
av_cold void ff_llauddsp_init_arm(LLAudDSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags)) {
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_neon;
}
}

View File

@@ -0,0 +1,62 @@
/*
* ARM NEON optimised integer operations
* Copyright (c) 2009 Kostya Shishkov
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
@ scalarproduct_and_madd_int16(/*aligned*/v0,v1,v2,order,mul)
function ff_scalarproduct_and_madd_int16_neon, export=1
vld1.16 {d28[],d29[]}, [sp]
vmov.i16 q0, #0
vmov.i16 q1, #0
vmov.i16 q2, #0
vmov.i16 q3, #0
mov r12, r0
1: vld1.16 {d16-d17}, [r0,:128]!
vld1.16 {d18-d19}, [r1]!
vld1.16 {d20-d21}, [r2]!
vld1.16 {d22-d23}, [r0,:128]!
vld1.16 {d24-d25}, [r1]!
vld1.16 {d26-d27}, [r2]!
vmul.s16 q10, q10, q14
vmul.s16 q13, q13, q14
vmlal.s16 q0, d16, d18
vmlal.s16 q1, d17, d19
vadd.s16 q10, q8, q10
vadd.s16 q13, q11, q13
vmlal.s16 q2, d22, d24
vmlal.s16 q3, d23, d25
vst1.16 {q10}, [r12,:128]!
subs r3, r3, #16
vst1.16 {q13}, [r12,:128]!
bgt 1b
vpadd.s32 d16, d0, d1
vpadd.s32 d17, d2, d3
vpadd.s32 d18, d4, d5
vpadd.s32 d19, d6, d7
vpadd.s32 d0, d16, d17
vpadd.s32 d1, d18, d19
vpadd.s32 d2, d0, d1
vpaddl.s32 d3, d2
vmov.32 r0, d3[0]
bx lr
endfunc

View File

@@ -0,0 +1,108 @@
/*
* simple math operations
* Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_ARM_MATHOPS_H
#define AVCODEC_ARM_MATHOPS_H
#include <stdint.h>
#include "config.h"
#include "libavutil/common.h"
#if HAVE_INLINE_ASM
#if HAVE_ARMV6_INLINE
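/* MULH: high 32 bits of the signed 64-bit product a * b; on ARMv6 the
 * smmul instruction does this directly. */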
#define MULH MULH
static inline av_const int MULH(int a, int b)
{
int r;
__asm__ ("smmul %0, %1, %2" : "=r"(r) : "r"(a), "r"(b));
return r;
}
#define FASTDIV FASTDIV
static av_always_inline av_const int FASTDIV(int a, int b)
{
int r;
__asm__ ("cmp %2, #2 \n\t"
"ldr %0, [%3, %2, lsl #2] \n\t"
"ite le \n\t"
"lsrle %0, %1, #1 \n\t"
"smmulgt %0, %0, %1 \n\t"
: "=&r"(r) : "r"(a), "r"(b), "r"(ff_inverse) : "cc");
return r;
}
#else /* HAVE_ARMV6_INLINE */
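/* Fallback FASTDIV: as best I can tell this approximates a / b as the high
 * 32 bits of the 64-bit product a * ff_inverse[b], with ff_inverse[] being
 * a table of precomputed ~2^32 / b reciprocals. */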
#define FASTDIV FASTDIV
static av_always_inline av_const int FASTDIV(int a, int b)
{
int r, t;
__asm__ ("umull %1, %0, %2, %3"
: "=&r"(r), "=&r"(t) : "r"(a), "r"(ff_inverse[b]));
return r;
}
#endif
#define MLS64(d, a, b) MAC64(d, -(a), b)
#if HAVE_ARMV5TE_INLINE
/* signed 16x16 -> 32 multiply add accumulate */
# define MAC16(rt, ra, rb) \
__asm__ ("smlabb %0, %1, %2, %0" : "+r"(rt) : "r"(ra), "r"(rb));
/* signed 16x16 -> 32 multiply */
# define MUL16 MUL16
static inline av_const int MUL16(int ra, int rb)
{
int rt;
__asm__ ("smulbb %0, %1, %2" : "=r"(rt) : "r"(ra), "r"(rb));
return rt;
}
#endif
#define mid_pred mid_pred
static inline av_const int mid_pred(int a, int b, int c)
{
int m;
__asm__ (
"mov %0, %2 \n\t"
"cmp %1, %2 \n\t"
"itt gt \n\t"
"movgt %0, %1 \n\t"
"movgt %1, %2 \n\t"
"cmp %1, %3 \n\t"
"it le \n\t"
"movle %1, %3 \n\t"
"cmp %0, %1 \n\t"
"it gt \n\t"
"movgt %0, %1 \n\t"
: "=&r"(m), "+r"(a)
: "r"(b), "r"(c)
: "cc");
return m;
}
#endif /* HAVE_INLINE_ASM */
#endif /* AVCODEC_ARM_MATHOPS_H */

View File

@@ -0,0 +1,193 @@
/*
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
.macro prerot dst, rt
lsr r3, r6, #2 @ n4
add \rt, r4, r6, lsr #1 @ revtab + n4
add r9, r3, r3, lsl #1 @ n3
add r8, r7, r6 @ tcos + n4
add r3, r2, r6, lsr #1 @ in + n4
add r9, r2, r9, lsl #1 @ in + n3
sub r8, r8, #16
sub r10, r3, #16
sub r11, r9, #16
mov r12, #-16
1:
vld2.16 {d0,d1}, [r9, :128]!
vld2.16 {d2,d3}, [r11,:128], r12
vld2.16 {d4,d5}, [r3, :128]!
vld2.16 {d6,d7}, [r10,:128], r12
vld2.16 {d16,d17},[r7, :128]! @ cos, sin
vld2.16 {d18,d19},[r8, :128], r12
vrev64.16 q1, q1
vrev64.16 q3, q3
vrev64.16 q9, q9
vneg.s16 d0, d0
vneg.s16 d2, d2
vneg.s16 d16, d16
vneg.s16 d18, d18
vhsub.s16 d0, d0, d3 @ re
vhsub.s16 d4, d7, d4 @ im
vhsub.s16 d6, d6, d5
vhsub.s16 d2, d2, d1
vmull.s16 q10, d0, d16
vmlsl.s16 q10, d4, d17
vmull.s16 q11, d0, d17
vmlal.s16 q11, d4, d16
vmull.s16 q12, d6, d18
vmlsl.s16 q12, d2, d19
vmull.s16 q13, d6, d19
vmlal.s16 q13, d2, d18
vshrn.s32 d0, q10, #15
vshrn.s32 d1, q11, #15
vshrn.s32 d2, q12, #15
vshrn.s32 d3, q13, #15
vzip.16 d0, d1
vzip.16 d2, d3
ldrh lr, [r4], #2
ldrh r2, [\rt, #-2]!
add lr, \dst, lr, lsl #2
add r2, \dst, r2, lsl #2
vst1.32 {d0[0]}, [lr,:32]
vst1.32 {d2[0]}, [r2,:32]
ldrh lr, [r4], #2
ldrh r2, [\rt, #-2]!
add lr, \dst, lr, lsl #2
add r2, \dst, r2, lsl #2
vst1.32 {d0[1]}, [lr,:32]
vst1.32 {d2[1]}, [r2,:32]
ldrh lr, [r4], #2
ldrh r2, [\rt, #-2]!
add lr, \dst, lr, lsl #2
add r2, \dst, r2, lsl #2
vst1.32 {d1[0]}, [lr,:32]
vst1.32 {d3[0]}, [r2,:32]
ldrh lr, [r4], #2
ldrh r2, [\rt, #-2]!
add lr, \dst, lr, lsl #2
add r2, \dst, r2, lsl #2
vst1.32 {d1[1]}, [lr,:32]
vst1.32 {d3[1]}, [r2,:32]
subs r6, r6, #32
bgt 1b
.endm
function ff_mdct_fixed_calc_neon, export=1
push {r1,r4-r11,lr}
ldr r4, [r0, #8] @ revtab
ldr r6, [r0, #16] @ mdct_size; n
ldr r7, [r0, #24] @ tcos
prerot r1, r5
mov r4, r0
bl X(ff_fft_fixed_calc_neon)
pop {r5}
mov r12, #-16
ldr r6, [r4, #16] @ mdct_size; n
ldr r7, [r4, #24] @ tcos
add r5, r5, r6, lsr #1
add r7, r7, r6, lsr #1
sub r1, r5, #16
sub r2, r7, #16
1:
vld2.16 {d4,d5}, [r7,:128]!
vld2.16 {d6,d7}, [r2,:128], r12
vld2.16 {d0,d1}, [r5,:128]
vld2.16 {d2,d3}, [r1,:128]
vrev64.16 q3, q3
vrev64.16 q1, q1
vneg.s16 q3, q3
vneg.s16 q2, q2
vmull.s16 q11, d2, d6
vmlal.s16 q11, d3, d7
vmull.s16 q8, d0, d5
vmlsl.s16 q8, d1, d4
vmull.s16 q9, d0, d4
vmlal.s16 q9, d1, d5
vmull.s16 q10, d2, d7
vmlsl.s16 q10, d3, d6
vshrn.s32 d0, q11, #15
vshrn.s32 d1, q8, #15
vshrn.s32 d2, q9, #15
vshrn.s32 d3, q10, #15
vrev64.16 q0, q0
vst2.16 {d2,d3}, [r5,:128]!
vst2.16 {d0,d1}, [r1,:128], r12
subs r6, r6, #32
bgt 1b
pop {r4-r11,pc}
endfunc
function ff_mdct_fixed_calcw_neon, export=1
push {r1,r4-r11,lr}
ldrd r4, r5, [r0, #8] @ revtab, tmp_buf
ldr r6, [r0, #16] @ mdct_size; n
ldr r7, [r0, #24] @ tcos
prerot r5, r1
mov r4, r0
mov r1, r5
bl X(ff_fft_fixed_calc_neon)
pop {r7}
mov r12, #-16
ldr r6, [r4, #16] @ mdct_size; n
ldr r9, [r4, #24] @ tcos
add r5, r5, r6, lsr #1
add r7, r7, r6
add r9, r9, r6, lsr #1
sub r3, r5, #16
sub r1, r7, #16
sub r2, r9, #16
1:
vld2.16 {d4,d5}, [r9,:128]!
vld2.16 {d6,d7}, [r2,:128], r12
vld2.16 {d0,d1}, [r5,:128]!
vld2.16 {d2,d3}, [r3,:128], r12
vrev64.16 q3, q3
vrev64.16 q1, q1
vneg.s16 q3, q3
vneg.s16 q2, q2
vmull.s16 q8, d2, d6
vmlal.s16 q8, d3, d7
vmull.s16 q9, d0, d5
vmlsl.s16 q9, d1, d4
vmull.s16 q10, d0, d4
vmlal.s16 q10, d1, d5
vmull.s16 q11, d2, d7
vmlsl.s16 q11, d3, d6
vrev64.32 q8, q8
vrev64.32 q9, q9
vst2.32 {q10,q11},[r7,:128]!
vst2.32 {d16,d18},[r1,:128], r12
vst2.32 {d17,d19},[r1,:128], r12
subs r6, r6, #32
bgt 1b
pop {r4-r11,pc}
endfunc

View File

@@ -0,0 +1,301 @@
/*
* ARM NEON optimised MDCT
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
#define ff_fft_calc_neon X(ff_fft_calc_neon)
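@ Register use below suggests the usual prototype:
@ void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input)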
function ff_imdct_half_neon, export=1
push {r4-r8,lr}
mov r12, #1
ldr lr, [r0, #20] @ mdct_bits
ldr r4, [r0, #24] @ tcos
ldr r3, [r0, #8] @ revtab
lsl r12, r12, lr @ n = 1 << nbits
lsr lr, r12, #2 @ n4 = n >> 2
add r7, r2, r12, lsl #1
mov r12, #-16
sub r7, r7, #16
vld2.32 {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0
vld2.32 {d0-d1}, [r2,:128]! @ d0 =m0,x d1 =m1,x
vrev64.32 d17, d17
vld2.32 {d2,d3}, [r4,:128]! @ d2=c0,c1 d3=s0,s2
vmul.f32 d6, d17, d2
vmul.f32 d7, d0, d2
1:
subs lr, lr, #2
ldr r6, [r3], #4
vmul.f32 d4, d0, d3
vmul.f32 d5, d17, d3
vsub.f32 d4, d6, d4
vadd.f32 d5, d5, d7
uxth r8, r6, ror #16
uxth r6, r6
add r8, r1, r8, lsl #3
add r6, r1, r6, lsl #3
beq 1f
vld2.32 {d16-d17},[r7,:128],r12
vld2.32 {d0-d1}, [r2,:128]!
vrev64.32 d17, d17
vld2.32 {d2,d3}, [r4,:128]! @ d2=c0,c1 d3=s0,s2
vmul.f32 d6, d17, d2
vmul.f32 d7, d0, d2
vst2.32 {d4[0],d5[0]}, [r6,:64]
vst2.32 {d4[1],d5[1]}, [r8,:64]
b 1b
1:
vst2.32 {d4[0],d5[0]}, [r6,:64]
vst2.32 {d4[1],d5[1]}, [r8,:64]
mov r4, r0
mov r6, r1
bl ff_fft_calc_neon
mov r12, #1
ldr lr, [r4, #20] @ mdct_bits
ldr r4, [r4, #24] @ tcos
lsl r12, r12, lr @ n = 1 << nbits
lsr lr, r12, #3 @ n8 = n >> 3
add r4, r4, lr, lsl #3
add r6, r6, lr, lsl #3
sub r1, r4, #16
sub r3, r6, #16
mov r7, #-16
mov r8, r6
mov r0, r3
vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0
vld2.32 {d20-d21},[r6,:128]! @ d20=i2,r2 d21=i3,r3
vld2.32 {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
1:
subs lr, lr, #2
vmul.f32 d7, d0, d18
vld2.32 {d17,d19},[r4,:128]! @ d17=c2,c3 d19=s2,s3
vmul.f32 d4, d1, d18
vmul.f32 d5, d21, d19
vmul.f32 d6, d20, d19
vmul.f32 d22, d1, d16
vmul.f32 d23, d21, d17
vmul.f32 d24, d0, d16
vmul.f32 d25, d20, d17
vadd.f32 d7, d7, d22
vadd.f32 d6, d6, d23
vsub.f32 d4, d4, d24
vsub.f32 d5, d5, d25
beq 1f
vld2.32 {d0-d1}, [r3,:128], r7
vld2.32 {d20-d21},[r6,:128]!
vld2.32 {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
vrev64.32 q3, q3
vst2.32 {d4,d6}, [r0,:128], r7
vst2.32 {d5,d7}, [r8,:128]!
b 1b
1:
vrev64.32 q3, q3
vst2.32 {d4,d6}, [r0,:128]
vst2.32 {d5,d7}, [r8,:128]
pop {r4-r8,pc}
endfunc
function ff_imdct_calc_neon, export=1
push {r4-r6,lr}
ldr r3, [r0, #20]
mov r4, #1
mov r5, r1
lsl r4, r4, r3
add r1, r1, r4
bl X(ff_imdct_half_neon)
add r0, r5, r4, lsl #2
add r1, r5, r4, lsl #1
sub r0, r0, #8
sub r2, r1, #16
mov r3, #-16
mov r6, #-8
vmov.i32 d30, #1<<31
1:
vld1.32 {d0-d1}, [r2,:128], r3
pld [r0, #-16]
vrev64.32 q0, q0
vld1.32 {d2-d3}, [r1,:128]!
veor d4, d1, d30
pld [r2, #-16]
vrev64.32 q1, q1
veor d5, d0, d30
vst1.32 {d2}, [r0,:64], r6
vst1.32 {d3}, [r0,:64], r6
vst1.32 {d4-d5}, [r5,:128]!
subs r4, r4, #16
bgt 1b
pop {r4-r6,pc}
endfunc
function ff_mdct_calc_neon, export=1
push {r4-r10,lr}
mov r12, #1
ldr lr, [r0, #20] @ mdct_bits
ldr r4, [r0, #24] @ tcos
ldr r3, [r0, #8] @ revtab
lsl lr, r12, lr @ n = 1 << nbits
add r7, r2, lr @ in4u
sub r9, r7, #16 @ in4d
add r2, r7, lr, lsl #1 @ in3u
add r8, r9, lr, lsl #1 @ in3d
add r5, r4, lr, lsl #1
sub r5, r5, #16
sub r3, r3, #4
mov r12, #-16
vld2.32 {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
vld2.32 {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
vld2.32 {d0, d2}, [r7,:128]! @ in4u0,in4u1 in2d1,in2d0
vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1
vld2.32 {d1, d3}, [r2,:128]! @ in3u0,in3u1 in1d1,in1d0
vsub.f32 d0, d18, d0 @ in4d-in4u I
vld2.32 {d20,d21},[r4,:128]! @ c0,c1 s0,s1
vrev64.32 q1, q1 @ in2d0,in2d1 in1d0,in1d1
vld2.32 {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
vadd.f32 d1, d1, d19 @ in3u+in3d -R
vsub.f32 d16, d16, d2 @ in0u-in2d R
vadd.f32 d17, d17, d3 @ in2u+in1d -I
1:
vmul.f32 d7, d0, d21 @ I*s
A ldr r10, [r3, lr, lsr #1]
T lsr r10, lr, #1
T ldr r10, [r3, r10]
vmul.f32 d6, d1, d20 @ -R*c
ldr r6, [r3, #4]!
vmul.f32 d4, d1, d21 @ -R*s
vmul.f32 d5, d0, d20 @ I*c
vmul.f32 d24, d16, d30 @ R*c
vmul.f32 d25, d17, d31 @ -I*s
vmul.f32 d22, d16, d31 @ R*s
vmul.f32 d23, d17, d30 @ I*c
subs lr, lr, #16
vsub.f32 d6, d6, d7 @ -R*c-I*s
vadd.f32 d7, d4, d5 @ -R*s+I*c
vsub.f32 d24, d25, d24 @ I*s-R*c
vadd.f32 d25, d22, d23 @ R*s-I*c
beq 1f
mov r12, #-16
vld2.32 {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
vld2.32 {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
vneg.f32 d7, d7 @ R*s-I*c
vld2.32 {d0, d2}, [r7,:128]! @ in4u0,in4u1 in2d1,in2d0
vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1
vld2.32 {d1, d3}, [r2,:128]! @ in3u0,in3u1 in1d1,in1d0
vsub.f32 d0, d18, d0 @ in4d-in4u I
vld2.32 {d20,d21},[r4,:128]! @ c0,c1 s0,s1
vrev64.32 q1, q1 @ in2d0,in2d1 in1d0,in1d1
vld2.32 {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
vadd.f32 d1, d1, d19 @ in3u+in3d -R
vsub.f32 d16, d16, d2 @ in0u-in2d R
vadd.f32 d17, d17, d3 @ in2u+in1d -I
uxth r12, r6, ror #16
uxth r6, r6
add r12, r1, r12, lsl #3
add r6, r1, r6, lsl #3
vst2.32 {d6[0],d7[0]}, [r6,:64]
vst2.32 {d6[1],d7[1]}, [r12,:64]
uxth r6, r10, ror #16
uxth r10, r10
add r6 , r1, r6, lsl #3
add r10, r1, r10, lsl #3
vst2.32 {d24[0],d25[0]},[r10,:64]
vst2.32 {d24[1],d25[1]},[r6,:64]
b 1b
1:
vneg.f32 d7, d7 @ R*s-I*c
uxth r12, r6, ror #16
uxth r6, r6
add r12, r1, r12, lsl #3
add r6, r1, r6, lsl #3
vst2.32 {d6[0],d7[0]}, [r6,:64]
vst2.32 {d6[1],d7[1]}, [r12,:64]
uxth r6, r10, ror #16
uxth r10, r10
add r6 , r1, r6, lsl #3
add r10, r1, r10, lsl #3
vst2.32 {d24[0],d25[0]},[r10,:64]
vst2.32 {d24[1],d25[1]},[r6,:64]
mov r4, r0
mov r6, r1
bl ff_fft_calc_neon
mov r12, #1
ldr lr, [r4, #20] @ mdct_bits
ldr r4, [r4, #24] @ tcos
lsl r12, r12, lr @ n = 1 << nbits
lsr lr, r12, #3 @ n8 = n >> 3
add r4, r4, lr, lsl #3
add r6, r6, lr, lsl #3
sub r1, r4, #16
sub r3, r6, #16
mov r7, #-16
mov r8, r6
mov r0, r3
vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =r1,i1 d1 =r0,i0
vld2.32 {d20-d21},[r6,:128]! @ d20=r2,i2 d21=r3,i3
vld2.32 {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
1:
subs lr, lr, #2
vmul.f32 d7, d0, d18 @ r1*s1,r0*s0
vld2.32 {d17,d19},[r4,:128]! @ c2,c3 s2,s3
vmul.f32 d4, d1, d18 @ i1*s1,i0*s0
vmul.f32 d5, d21, d19 @ i2*s2,i3*s3
vmul.f32 d6, d20, d19 @ r2*s2,r3*s3
vmul.f32 d24, d0, d16 @ r1*c1,r0*c0
vmul.f32 d25, d20, d17 @ r2*c2,r3*c3
vmul.f32 d22, d21, d17 @ i2*c2,i3*c3
vmul.f32 d23, d1, d16 @ i1*c1,i0*c0
vadd.f32 d4, d4, d24 @ i1*s1+r1*c1,i0*s0+r0*c0
vadd.f32 d5, d5, d25 @ i2*s2+r2*c2,i3*s3+r3*c3
vsub.f32 d6, d22, d6 @ i2*c2-r2*s2,i3*c3-r3*s3
vsub.f32 d7, d23, d7 @ i1*c1-r1*s1,i0*c0-r0*s0
vneg.f32 q2, q2
beq 1f
vld2.32 {d0-d1}, [r3,:128], r7
vld2.32 {d20-d21},[r6,:128]!
vld2.32 {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
vrev64.32 q3, q3
vst2.32 {d4,d6}, [r0,:128], r7
vst2.32 {d5,d7}, [r8,:128]!
b 1b
1:
vrev64.32 q3, q3
vst2.32 {d4,d6}, [r0,:128]
vst2.32 {d5,d7}, [r8,:128]
pop {r4-r10,pc}
endfunc

View File

@@ -0,0 +1,347 @@
/*
* Copyright (c) 2013 RISC OS Open Ltd
* Author: Ben Avison <bavison@riscosopen.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
CONTEXT .req a1
ORIGOUT .req a2
IN .req a3
OUT .req v1
REVTAB .req v2
TCOS .req v3
TSIN .req v4
OLDFPSCR .req v5
J0 .req a2
J1 .req a4
J2 .req ip
J3 .req lr
REVTAB_HI .req v5
IN_HI .req v6
OUT_HI .req v6
TCOS_HI .req sl
TSIN_HI .req fp
.macro prerotation_innerloop
.set trig_lo, k
.set trig_hi, n4 - k - 2
.set in_lo, trig_lo * 2
.set in_hi, trig_hi * 2
vldr d8, [TCOS, #trig_lo*4] @ s16,s17
vldr d9, [TCOS, #trig_hi*4] @ s18,s19
vldr s0, [IN, #in_hi*4 + 12]
vldr s1, [IN, #in_hi*4 + 4]
vldr s2, [IN, #in_lo*4 + 12]
vldr s3, [IN, #in_lo*4 + 4]
vmul.f s8, s0, s16 @ vector operation
vldr d10, [TSIN, #trig_lo*4] @ s20,s21
vldr d11, [TSIN, #trig_hi*4] @ s22,s23
vldr s4, [IN, #in_lo*4]
vldr s5, [IN, #in_lo*4 + 8]
vldr s6, [IN, #in_hi*4]
vldr s7, [IN, #in_hi*4 + 8]
ldr J0, [REVTAB, #trig_lo*2]
vmul.f s12, s0, s20 @ vector operation
ldr J2, [REVTAB, #trig_hi*2]
mov J1, J0, lsr #16
and J0, J0, #255 @ halfword value will be < n4
vmls.f s8, s4, s20 @ vector operation
mov J3, J2, lsr #16
and J2, J2, #255 @ halfword value will be < n4
add J0, OUT, J0, lsl #3
vmla.f s12, s4, s16 @ vector operation
add J1, OUT, J1, lsl #3
add J2, OUT, J2, lsl #3
add J3, OUT, J3, lsl #3
vstr s8, [J0]
vstr s9, [J1]
vstr s10, [J2]
vstr s11, [J3]
vstr s12, [J0, #4]
vstr s13, [J1, #4]
vstr s14, [J2, #4]
vstr s15, [J3, #4]
.set k, k + 2
.endm
.macro prerotation_innerloop_rolled
vldmia TCOS!, {s16,s17}
vldmdb TCOS_HI!, {s18,s19}
vldr s0, [IN_HI, #-4]
vldr s1, [IN_HI, #-12]
vldr s2, [IN, #12]
vldr s3, [IN, #4]
vmul.f s8, s0, s16 @ vector operation
vldmia TSIN!, {s20,s21}
vldmdb TSIN_HI!, {s22,s23}
vldr s4, [IN]
vldr s5, [IN, #8]
vldr s6, [IN_HI, #-16]
vldr s7, [IN_HI, #-8]
vmul.f s12, s0, s20 @ vector operation
add IN, IN, #16
sub IN_HI, IN_HI, #16
ldrh J0, [REVTAB], #2
ldrh J1, [REVTAB], #2
vmls.f s8, s4, s20 @ vector operation
ldrh J3, [REVTAB_HI, #-2]!
ldrh J2, [REVTAB_HI, #-2]!
add J0, OUT, J0, lsl #3
vmla.f s12, s4, s16 @ vector operation
add J1, OUT, J1, lsl #3
add J2, OUT, J2, lsl #3
add J3, OUT, J3, lsl #3
vstr s8, [J0]
vstr s9, [J1]
vstr s10, [J2]
vstr s11, [J3]
vstr s12, [J0, #4]
vstr s13, [J1, #4]
vstr s14, [J2, #4]
vstr s15, [J3, #4]
.endm
.macro postrotation_innerloop tail, head
.set trig_lo_head, n8 - k - 2
.set trig_hi_head, n8 + k
.set out_lo_head, trig_lo_head * 2
.set out_hi_head, trig_hi_head * 2
.set trig_lo_tail, n8 - (k - 2) - 2
.set trig_hi_tail, n8 + (k - 2)
.set out_lo_tail, trig_lo_tail * 2
.set out_hi_tail, trig_hi_tail * 2
.if (k & 2) == 0
TCOS_D0_HEAD .req d10 @ s20,s21
TCOS_D1_HEAD .req d11 @ s22,s23
TCOS_S0_TAIL .req s24
.else
TCOS_D0_HEAD .req d12 @ s24,s25
TCOS_D1_HEAD .req d13 @ s26,s27
TCOS_S0_TAIL .req s20
.endif
.ifnc "\tail",""
vmls.f s8, s0, TCOS_S0_TAIL @ vector operation
.endif
.ifnc "\head",""
vldr d8, [TSIN, #trig_lo_head*4] @ s16,s17
vldr d9, [TSIN, #trig_hi_head*4] @ s18,s19
vldr TCOS_D0_HEAD, [TCOS, #trig_lo_head*4]
.endif
.ifnc "\tail",""
vmla.f s12, s4, TCOS_S0_TAIL @ vector operation
.endif
.ifnc "\head",""
vldr s0, [OUT, #out_lo_head*4]
vldr s1, [OUT, #out_lo_head*4 + 8]
vldr s2, [OUT, #out_hi_head*4]
vldr s3, [OUT, #out_hi_head*4 + 8]
vldr s4, [OUT, #out_lo_head*4 + 4]
vldr s5, [OUT, #out_lo_head*4 + 12]
vldr s6, [OUT, #out_hi_head*4 + 4]
vldr s7, [OUT, #out_hi_head*4 + 12]
.endif
.ifnc "\tail",""
vstr s8, [OUT, #out_lo_tail*4]
vstr s9, [OUT, #out_lo_tail*4 + 8]
vstr s10, [OUT, #out_hi_tail*4]
vstr s11, [OUT, #out_hi_tail*4 + 8]
.endif
.ifnc "\head",""
vmul.f s8, s4, s16 @ vector operation
.endif
.ifnc "\tail",""
vstr s12, [OUT, #out_hi_tail*4 + 12]
vstr s13, [OUT, #out_hi_tail*4 + 4]
vstr s14, [OUT, #out_lo_tail*4 + 12]
vstr s15, [OUT, #out_lo_tail*4 + 4]
.endif
.ifnc "\head",""
vmul.f s12, s0, s16 @ vector operation
vldr TCOS_D1_HEAD, [TCOS, #trig_hi_head*4]
.endif
.unreq TCOS_D0_HEAD
.unreq TCOS_D1_HEAD
.unreq TCOS_S0_TAIL
.ifnc "\head",""
.set k, k + 2
.endif
.endm
.macro postrotation_innerloop_rolled tail, head, tcos_s0_head, tcos_s1_head, tcos_s2_head, tcos_s3_head, tcos_s0_tail, out_offset_head, out_offset_tail
.ifnc "\tail",""
vmls.f s8, s0, \tcos_s0_tail @ vector operation
.endif
.ifnc "\head",""
vldmia TSIN!, {s16,s17}
vldmdb TSIN_HI!, {s18,s19}
vldmia TCOS!, {\tcos_s0_head,\tcos_s1_head}
.endif
.ifnc "\tail",""
vmla.f s12, s4, \tcos_s0_tail @ vector operation
.endif
.ifnc "\head",""
vldr s0, [OUT, #+\out_offset_head+0]
vldr s1, [OUT, #+\out_offset_head+8]
vldr s2, [OUT_HI, #-\out_offset_head-16]
vldr s3, [OUT_HI, #-\out_offset_head-8]
vldr s4, [OUT, #+\out_offset_head+4]
vldr s5, [OUT, #+\out_offset_head+12]
vldr s6, [OUT_HI, #-\out_offset_head-12]
vldr s7, [OUT_HI, #-\out_offset_head-4]
.endif
.ifnc "\tail",""
vstr s8, [OUT, #+\out_offset_tail+0]
vstr s9, [OUT, #+\out_offset_tail+8]
vstr s10, [OUT_HI, #-\out_offset_tail-16]
vstr s11, [OUT_HI, #-\out_offset_tail-8]
.endif
.ifnc "\head",""
vmul.f s8, s4, s16 @ vector operation
.endif
.ifnc "\tail",""
vstr s12, [OUT_HI, #-\out_offset_tail-4]
vstr s13, [OUT_HI, #-\out_offset_tail-12]
vstr s14, [OUT, #+\out_offset_tail+12]
vstr s15, [OUT, #+\out_offset_tail+4]
.endif
.ifnc "\head",""
vmul.f s12, s0, s16 @ vector operation
vldmdb TCOS_HI!, {\tcos_s2_head,\tcos_s3_head}
.endif
.endm
/* void ff_imdct_half_vfp(FFTContext *s,
* FFTSample *output,
* const FFTSample *input)
*/
function ff_imdct_half_vfp, export=1
ldr ip, [CONTEXT, #5*4] @ mdct_bits
teq ip, #6
bne 10f
.set n, 1<<6
.set n2, n/2
.set n4, n/4
.set n8, n/8
push {v1-v5,lr}
vpush {s16-s27}
fmrx OLDFPSCR, FPSCR
ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
fmxr FPSCR, lr
mov OUT, ORIGOUT
ldr REVTAB, [CONTEXT, #2*4]
ldr TCOS, [CONTEXT, #6*4]
ldr TSIN, [CONTEXT, #7*4]
.set k, 0
.rept n8/2
prerotation_innerloop
.endr
fmxr FPSCR, OLDFPSCR
mov a1, OUT
bl X(ff_fft16_vfp)
ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
fmxr FPSCR, lr
.set k, 0
postrotation_innerloop , head
.rept n8/2 - 1
postrotation_innerloop tail, head
.endr
postrotation_innerloop tail
fmxr FPSCR, OLDFPSCR
vpop {s16-s27}
pop {v1-v5,pc}
10:
push {v1-v6,sl,fp,lr}
vpush {s16-s27}
fmrx OLDFPSCR, FPSCR
ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
fmxr FPSCR, lr
mov lr, #1
mov OUT, ORIGOUT
ldr REVTAB, [CONTEXT, #2*4]
ldr TCOS, [CONTEXT, #6*4]
ldr TSIN, [CONTEXT, #7*4]
mov lr, lr, lsl ip
push {CONTEXT,OLDFPSCR}
add IN_HI, IN, lr, lsl #1
add REVTAB_HI, REVTAB, lr, lsr #1
add TCOS_HI, TCOS, lr
add TSIN_HI, TSIN, lr
0: prerotation_innerloop_rolled
teq IN, IN_HI
bne 0b
ldmia sp, {CONTEXT,OLDFPSCR}
mov ORIGOUT, OUT
fmxr FPSCR, OLDFPSCR
ldr ip, [CONTEXT, #9*4]
blx ip @ s->fft_calc(s, output)
pop {CONTEXT,OLDFPSCR}
ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
ldr ip, [CONTEXT, #5*4] @ mdct_bits
fmxr FPSCR, lr
mov lr, #1
mov lr, lr, lsl ip
sub TCOS, TCOS, lr, lsr #1
sub TSIN, TSIN, lr, lsr #1
add OUT_HI, OUT, lr, lsl #1
add TCOS_HI, TCOS, lr
add TSIN_HI, TSIN, lr
postrotation_innerloop_rolled , head, s20, s21, s22, s23,, 0
b 1f
0: add OUT, OUT, #32
sub OUT_HI, OUT_HI, #32
postrotation_innerloop_rolled tail, head, s20, s21, s22, s23, s24, 0, -16
1: postrotation_innerloop_rolled tail, head, s24, s25, s26, s27, s20, 16, 0
teq TSIN, TSIN_HI
bne 0b
postrotation_innerloop_rolled tail,,,,,, s24,, 16
fmxr FPSCR, OLDFPSCR
vpop {s16-s27}
pop {v1-v6,sl,fp,pc}
endfunc
.unreq CONTEXT
.unreq ORIGOUT
.unreq IN
.unreq OUT
.unreq REVTAB
.unreq TCOS
.unreq TSIN
.unreq OLDFPSCR
.unreq J0
.unreq J1
.unreq J2
.unreq J3
.unreq REVTAB_HI
.unreq IN_HI
.unreq OUT_HI
.unreq TCOS_HI
.unreq TSIN_HI

View File

@@ -0,0 +1,244 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
function ff_pix_abs16_armv6, export=1
ldr r0, [sp]
push {r4-r9, lr}
mov r12, #0
mov lr, #0
ldm r1, {r4-r7}
ldr r8, [r2]
1:
ldr r9, [r2, #4]
pld [r1, r3]
usada8 r12, r4, r8, r12
ldr r8, [r2, #8]
pld [r2, r3]
usada8 lr, r5, r9, lr
ldr r9, [r2, #12]
usada8 r12, r6, r8, r12
subs r0, r0, #1
usada8 lr, r7, r9, lr
beq 2f
add r1, r1, r3
ldm r1, {r4-r7}
add r2, r2, r3
ldr r8, [r2]
b 1b
2:
add r0, r12, lr
pop {r4-r9, pc}
endfunc
function ff_pix_abs16_x2_armv6, export=1
ldr r12, [sp]
push {r4-r11, lr}
mov r0, #0
mov lr, #1
orr lr, lr, lr, lsl #8
orr lr, lr, lr, lsl #16
1:
ldr r8, [r2]
ldr r9, [r2, #4]
lsr r10, r8, #8
ldr r4, [r1]
lsr r6, r9, #8
orr r10, r10, r9, lsl #24
ldr r5, [r2, #8]
eor r11, r8, r10
uhadd8 r7, r8, r10
orr r6, r6, r5, lsl #24
and r11, r11, lr
uadd8 r7, r7, r11
ldr r8, [r1, #4]
usada8 r0, r4, r7, r0
eor r7, r9, r6
lsr r10, r5, #8
and r7, r7, lr
uhadd8 r4, r9, r6
ldr r6, [r2, #12]
uadd8 r4, r4, r7
pld [r1, r3]
orr r10, r10, r6, lsl #24
usada8 r0, r8, r4, r0
ldr r4, [r1, #8]
eor r11, r5, r10
ldrb r7, [r2, #16]
and r11, r11, lr
uhadd8 r8, r5, r10
ldr r5, [r1, #12]
uadd8 r8, r8, r11
pld [r2, r3]
lsr r10, r6, #8
usada8 r0, r4, r8, r0
orr r10, r10, r7, lsl #24
subs r12, r12, #1
eor r11, r6, r10
add r1, r1, r3
uhadd8 r9, r6, r10
and r11, r11, lr
uadd8 r9, r9, r11
add r2, r2, r3
usada8 r0, r5, r9, r0
bgt 1b
pop {r4-r11, pc}
endfunc
.macro usad_y2 p0, p1, p2, p3, n0, n1, n2, n3
ldr \n0, [r2]
eor \n1, \p0, \n0
uhadd8 \p0, \p0, \n0
and \n1, \n1, lr
ldr \n2, [r1]
uadd8 \p0, \p0, \n1
ldr \n1, [r2, #4]
usada8 r0, \p0, \n2, r0
pld [r1, r3]
eor \n3, \p1, \n1
uhadd8 \p1, \p1, \n1
and \n3, \n3, lr
ldr \p0, [r1, #4]
uadd8 \p1, \p1, \n3
ldr \n2, [r2, #8]
usada8 r0, \p1, \p0, r0
pld [r2, r3]
eor \p0, \p2, \n2
uhadd8 \p2, \p2, \n2
and \p0, \p0, lr
ldr \p1, [r1, #8]
uadd8 \p2, \p2, \p0
ldr \n3, [r2, #12]
usada8 r0, \p2, \p1, r0
eor \p1, \p3, \n3
uhadd8 \p3, \p3, \n3
and \p1, \p1, lr
ldr \p0, [r1, #12]
uadd8 \p3, \p3, \p1
add r1, r1, r3
usada8 r0, \p3, \p0, r0
add r2, r2, r3
.endm
function ff_pix_abs16_y2_armv6, export=1
pld [r1]
pld [r2]
ldr r12, [sp]
push {r4-r11, lr}
mov r0, #0
mov lr, #1
orr lr, lr, lr, lsl #8
orr lr, lr, lr, lsl #16
ldr r4, [r2]
ldr r5, [r2, #4]
ldr r6, [r2, #8]
ldr r7, [r2, #12]
add r2, r2, r3
1:
usad_y2 r4, r5, r6, r7, r8, r9, r10, r11
subs r12, r12, #2
usad_y2 r8, r9, r10, r11, r4, r5, r6, r7
bgt 1b
pop {r4-r11, pc}
endfunc
function ff_pix_abs8_armv6, export=1
pld [r2, r3]
ldr r12, [sp]
push {r4-r9, lr}
mov r0, #0
mov lr, #0
ldrd_post r4, r5, r1, r3
1:
subs r12, r12, #2
ldr r7, [r2, #4]
ldr_post r6, r2, r3
ldrd_post r8, r9, r1, r3
usada8 r0, r4, r6, r0
pld [r2, r3]
usada8 lr, r5, r7, lr
ldr r7, [r2, #4]
ldr_post r6, r2, r3
beq 2f
ldrd_post r4, r5, r1, r3
usada8 r0, r8, r6, r0
pld [r2, r3]
usada8 lr, r9, r7, lr
b 1b
2:
usada8 r0, r8, r6, r0
usada8 lr, r9, r7, lr
add r0, r0, lr
pop {r4-r9, pc}
endfunc
function ff_sse16_armv6, export=1
ldr r12, [sp]
push {r4-r9, lr}
mov r0, #0
1:
ldrd r4, r5, [r1]
ldr r8, [r2]
uxtb16 lr, r4
uxtb16 r4, r4, ror #8
uxtb16 r9, r8
uxtb16 r8, r8, ror #8
ldr r7, [r2, #4]
usub16 lr, lr, r9
usub16 r4, r4, r8
smlad r0, lr, lr, r0
uxtb16 r6, r5
uxtb16 lr, r5, ror #8
uxtb16 r8, r7
uxtb16 r9, r7, ror #8
smlad r0, r4, r4, r0
ldrd r4, r5, [r1, #8]
usub16 r6, r6, r8
usub16 r8, lr, r9
ldr r7, [r2, #8]
smlad r0, r6, r6, r0
uxtb16 lr, r4
uxtb16 r4, r4, ror #8
uxtb16 r9, r7
uxtb16 r7, r7, ror #8
smlad r0, r8, r8, r0
ldr r8, [r2, #12]
usub16 lr, lr, r9
usub16 r4, r4, r7
smlad r0, lr, lr, r0
uxtb16 r6, r5
uxtb16 r5, r5, ror #8
uxtb16 r9, r8
uxtb16 r8, r8, ror #8
smlad r0, r4, r4, r0
usub16 r6, r6, r9
usub16 r5, r5, r8
smlad r0, r6, r6, r0
add r1, r1, r3
add r2, r2, r3
subs r12, r12, #1
smlad r0, r5, r5, r0
bgt 1b
pop {r4-r9, pc}
endfunc

View File

@@ -0,0 +1,57 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/me_cmp.h"
#include "libavcodec/mpegvideo.h"
int ff_pix_abs16_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
ptrdiff_t stride, int h);
int ff_pix_abs16_x2_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
ptrdiff_t stride, int h);
int ff_pix_abs16_y2_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
ptrdiff_t stride, int h);
int ff_pix_abs8_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
ptrdiff_t stride, int h);
int ff_sse16_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2,
ptrdiff_t stride, int h);
av_cold void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx)
{
int cpu_flags = av_get_cpu_flags();
if (have_armv6(cpu_flags)) {
c->pix_abs[0][0] = ff_pix_abs16_armv6;
c->pix_abs[0][1] = ff_pix_abs16_x2_armv6;
c->pix_abs[0][2] = ff_pix_abs16_y2_armv6;
c->pix_abs[1][0] = ff_pix_abs8_armv6;
c->sad[0] = ff_pix_abs16_armv6;
c->sad[1] = ff_pix_abs8_armv6;
c->sse[0] = ff_sse16_armv6;
}
}

View File

@@ -0,0 +1,662 @@
/*
* Copyright (c) 2014 RISC OS Open Ltd
* Author: Ben Avison <bavison@riscosopen.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
#define MAX_CHANNELS 8
#define MAX_FIR_ORDER 8
#define MAX_IIR_ORDER 4
#define MAX_RATEFACTOR 4
#define MAX_BLOCKSIZE (40 * MAX_RATEFACTOR)
PST .req a1
PCO .req a2
AC0 .req a3
AC1 .req a4
CO0 .req v1
CO1 .req v2
CO2 .req v3
CO3 .req v4
ST0 .req v5
ST1 .req v6
ST2 .req sl
ST3 .req fp
I .req ip
PSAMP .req lr
.macro branch_pic_label first, remainder:vararg
A .word \first - 4
T .hword (\first) / 2
.ifnb \remainder
branch_pic_label \remainder
.endif
.endm
// Some macros that do loads/multiplies where the register number is determined
// from an assembly-time expression. Boy is GNU assembler's syntax ugly...
.macro load group, index, base, offset
.altmacro
load_ \group, %(\index), \base, \offset
.noaltmacro
.endm
.macro load_ group, index, base, offset
ldr \group\index, [\base, #\offset]
.endm
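// e.g. "load CO, 2, PCO, 8" expands, via .altmacro's %() numeric evaluation,
// to "ldr CO2, [PCO, #8]".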
.macro loadd group, index, base, offset
.altmacro
loadd_ \group, %(\index), %(\index+1), \base, \offset
.noaltmacro
.endm
.macro loadd_ group, index0, index1, base, offset
A .if \offset >= 256
A ldr \group\index0, [\base, #\offset]
A ldr \group\index1, [\base, #(\offset) + 4]
A .else
ldrd \group\index0, \group\index1, [\base, #\offset]
A .endif
.endm
.macro multiply index, accumulate, long
.altmacro
multiply_ %(\index), \accumulate, \long
.noaltmacro
.endm
.macro multiply_ index, accumulate, long
.if \long
.if \accumulate
smlal AC0, AC1, CO\index, ST\index
.else
smull AC0, AC1, CO\index, ST\index
.endif
.else
.if \accumulate
mla AC0, CO\index, ST\index, AC0
.else
mul AC0, CO\index, ST\index
.endif
.endif
.endm
// A macro to update the load register number and load offsets
.macro inc howmany
.set LOAD_REG, (LOAD_REG + \howmany) & 3
.set OFFSET_CO, OFFSET_CO + 4 * \howmany
.set OFFSET_ST, OFFSET_ST + 4 * \howmany
.if FIR_REMAIN > 0
.set FIR_REMAIN, FIR_REMAIN - \howmany
.if FIR_REMAIN == 0
.set OFFSET_CO, 4 * MAX_FIR_ORDER
.set OFFSET_ST, 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)
.endif
.elseif IIR_REMAIN > 0
.set IIR_REMAIN, IIR_REMAIN - \howmany
.endif
.endm
// Macro to implement the inner loop for one specific combination of parameters
.macro implement_filter mask_minus1, shift_0, shift_8, iir_taps, fir_taps
.set TOTAL_TAPS, \iir_taps + \fir_taps
// Deal with register allocation...
.set DEFINED_SHIFT, 0
.set DEFINED_MASK, 0
.set SHUFFLE_SHIFT, 0
.set SHUFFLE_MASK, 0
.set SPILL_SHIFT, 0
.set SPILL_MASK, 0
.if TOTAL_TAPS == 0
// Little register pressure in this case - just keep MASK where it was
.if !\mask_minus1
MASK .req ST1
.set DEFINED_MASK, 1
.endif
.else
.if \shift_0
.if !\mask_minus1
// AC1 is unused with shift 0
MASK .req AC1
.set DEFINED_MASK, 1
.set SHUFFLE_MASK, 1
.endif
.elseif \shift_8
.if !\mask_minus1
.if TOTAL_TAPS <= 4
// All coefficients are preloaded (so pointer not needed)
MASK .req PCO
.set DEFINED_MASK, 1
.set SHUFFLE_MASK, 1
.else
.set SPILL_MASK, 1
.endif
.endif
.else // shift not 0 or 8
.if TOTAL_TAPS <= 3
// All coefficients are preloaded, and at least one CO register is unused
.if \fir_taps & 1
SHIFT .req CO0
.set DEFINED_SHIFT, 1
.set SHUFFLE_SHIFT, 1
.else
SHIFT .req CO3
.set DEFINED_SHIFT, 1
.set SHUFFLE_SHIFT, 1
.endif
.if !\mask_minus1
MASK .req PCO
.set DEFINED_MASK, 1
.set SHUFFLE_MASK, 1
.endif
.elseif TOTAL_TAPS == 4
// All coefficients are preloaded
SHIFT .req PCO
.set DEFINED_SHIFT, 1
.set SHUFFLE_SHIFT, 1
.if !\mask_minus1
.set SPILL_MASK, 1
.endif
.else
.set SPILL_SHIFT, 1
.if !\mask_minus1
.set SPILL_MASK, 1
.endif
.endif
.endif
.endif
.if SPILL_SHIFT
SHIFT .req ST0
.set DEFINED_SHIFT, 1
.endif
.if SPILL_MASK
MASK .req ST1
.set DEFINED_MASK, 1
.endif
// Preload coefficients if possible
.if TOTAL_TAPS <= 4
.set OFFSET_CO, 0
.if \fir_taps & 1
.set LOAD_REG, 1
.else
.set LOAD_REG, 0
.endif
.rept \fir_taps
load CO, LOAD_REG, PCO, OFFSET_CO
.set LOAD_REG, (LOAD_REG + 1) & 3
.set OFFSET_CO, OFFSET_CO + 4
.endr
.set OFFSET_CO, 4 * MAX_FIR_ORDER
.rept \iir_taps
load CO, LOAD_REG, PCO, OFFSET_CO
.set LOAD_REG, (LOAD_REG + 1) & 3
.set OFFSET_CO, OFFSET_CO + 4
.endr
.endif
// Move mask/shift to final positions if necessary
// Need to do this after preloading, because in some cases we
// reuse the coefficient pointer register
.if SHUFFLE_SHIFT
mov SHIFT, ST0
.endif
.if SHUFFLE_MASK
mov MASK, ST1
.endif
// Begin loop
01:
.if TOTAL_TAPS == 0
// Things simplify a lot in this case
// In fact this could be pipelined further if it's worth it...
ldr ST0, [PSAMP]
subs I, I, #1
.if !\mask_minus1
and ST0, ST0, MASK
.endif
str ST0, [PST, #-4]!
str ST0, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)]
str ST0, [PSAMP], #4 * MAX_CHANNELS
bne 01b
.else
.if \fir_taps & 1
.set LOAD_REG, 1
.else
.set LOAD_REG, 0
.endif
.set LOAD_BANK, 0
.set FIR_REMAIN, \fir_taps
.set IIR_REMAIN, \iir_taps
.if FIR_REMAIN == 0 // only IIR terms
.set OFFSET_CO, 4 * MAX_FIR_ORDER
.set OFFSET_ST, 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)
.else
.set OFFSET_CO, 0
.set OFFSET_ST, 0
.endif
.set MUL_REG, LOAD_REG
.set COUNTER, 0
.rept TOTAL_TAPS + 2
// Do load(s)
.if FIR_REMAIN != 0 || IIR_REMAIN != 0
.if COUNTER == 0
.if TOTAL_TAPS > 4
load CO, LOAD_REG, PCO, OFFSET_CO
.endif
load ST, LOAD_REG, PST, OFFSET_ST
inc 1
.elseif COUNTER == 1 && (\fir_taps & 1) == 0
.if TOTAL_TAPS > 4
load CO, LOAD_REG, PCO, OFFSET_CO
.endif
load ST, LOAD_REG, PST, OFFSET_ST
inc 1
.elseif LOAD_BANK == 0
.if TOTAL_TAPS > 4
.if FIR_REMAIN == 0 && IIR_REMAIN == 1
load CO, LOAD_REG, PCO, OFFSET_CO
.else
loadd CO, LOAD_REG, PCO, OFFSET_CO
.endif
.endif
.set LOAD_BANK, 1
.else
.if FIR_REMAIN == 0 && IIR_REMAIN == 1
load ST, LOAD_REG, PST, OFFSET_ST
inc 1
.else
loadd ST, LOAD_REG, PST, OFFSET_ST
inc 2
.endif
.set LOAD_BANK, 0
.endif
.endif
// Do interleaved multiplies, slightly delayed
.if COUNTER >= 2
multiply MUL_REG, COUNTER > 2, !\shift_0
.set MUL_REG, (MUL_REG + 1) & 3
.endif
.set COUNTER, COUNTER + 1
.endr
// Post-process the result of the multiplies
.if SPILL_SHIFT
ldr SHIFT, [sp, #9*4 + 0*4]
.endif
.if SPILL_MASK
ldr MASK, [sp, #9*4 + 1*4]
.endif
ldr ST2, [PSAMP]
subs I, I, #1
.if \shift_8
mov AC0, AC0, lsr #8
orr AC0, AC0, AC1, lsl #24
.elseif !\shift_0
rsb ST3, SHIFT, #32
mov AC0, AC0, lsr SHIFT
A orr AC0, AC0, AC1, lsl ST3
T mov AC1, AC1, lsl ST3
T orr AC0, AC0, AC1
.endif
.if \mask_minus1
add ST3, ST2, AC0
.else
add ST2, ST2, AC0
and ST3, ST2, MASK
sub ST2, ST3, AC0
.endif
str ST3, [PST, #-4]!
str ST2, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)]
str ST3, [PSAMP], #4 * MAX_CHANNELS
bne 01b
.endif
b 99f
.if DEFINED_SHIFT
.unreq SHIFT
.endif
.if DEFINED_MASK
.unreq MASK
.endif
.endm
.macro switch_on_fir_taps mask_minus1, shift_0, shift_8, iir_taps
A ldr CO0, [pc, a3, lsl #2] // firorder is in range 0-(8-iir_taps)
A add pc, pc, CO0
T tbh [pc, a3, lsl #1]
0:
branch_pic_label (70f - 0b), (71f - 0b), (72f - 0b), (73f - 0b)
branch_pic_label (74f - 0b)
.if \iir_taps <= 3
branch_pic_label (75f - 0b)
.if \iir_taps <= 2
branch_pic_label (76f - 0b)
.if \iir_taps <= 1
branch_pic_label (77f - 0b)
.if \iir_taps == 0
branch_pic_label (78f - 0b)
.endif
.endif
.endif
.endif
70: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 0
71: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 1
72: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 2
73: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 3
74: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 4
.if \iir_taps <= 3
75: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 5
.if \iir_taps <= 2
76: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 6
.if \iir_taps <= 1
77: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 7
.if \iir_taps == 0
78: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 8
.endif
.endif
.endif
.endif
.endm
.macro switch_on_iir_taps mask_minus1, shift_0, shift_8
A ldr CO0, [pc, a4, lsl #2] // iirorder is in range 0-4
A add pc, pc, CO0
T tbh [pc, a4, lsl #1]
0:
branch_pic_label (60f - 0b), (61f - 0b), (62f - 0b), (63f - 0b)
branch_pic_label (64f - 0b)
60: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 0
61: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 1
62: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 2
63: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 3
64: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 4
.endm
/* void ff_mlp_filter_channel_arm(int32_t *state, const int32_t *coeff,
* int firorder, int iirorder,
* unsigned int filter_shift, int32_t mask,
* int blocksize, int32_t *sample_buffer);
*/
function ff_mlp_filter_channel_arm, export=1
push {v1-fp,lr}
add v1, sp, #9*4 // point at arguments on stack
ldm v1, {ST0,ST1,I,PSAMP}
cmp ST1, #-1
bne 30f
movs ST2, ST0, lsl #29 // shift is in range 0-15; we want to special-case 0 and 8
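// lsl #29 leaves Z set only when bits 0-2 of the shift are clear (shift 0
// or 8) and puts bit 3 in C, so carry distinguishes shift 8 from shift 0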
bne 20f
bcs 10f
switch_on_iir_taps 1, 1, 0
10: switch_on_iir_taps 1, 0, 1
20: switch_on_iir_taps 1, 0, 0
30: movs ST2, ST0, lsl #29 // shift is in range 0-15; we want to special-case 0 and 8
bne 50f
bcs 40f
switch_on_iir_taps 0, 1, 0
40: switch_on_iir_taps 0, 0, 1
50: switch_on_iir_taps 0, 0, 0
99: pop {v1-fp,pc}
endfunc
.unreq PST
.unreq PCO
.unreq AC0
.unreq AC1
.unreq CO0
.unreq CO1
.unreq CO2
.unreq CO3
.unreq ST0
.unreq ST1
.unreq ST2
.unreq ST3
.unreq I
.unreq PSAMP
/********************************************************************/
PSA .req a1 // samples
PCO .req a2 // coeffs
PBL .req a3 // bypassed_lsbs
INDEX .req a4
CO0 .req v1
CO1 .req v2
CO2 .req v3
CO3 .req v4
SA0 .req v5
SA1 .req v6
SA2 .req sl
SA3 .req fp
AC0 .req ip
AC1 .req lr
NOISE .req SA0
LSB .req SA1
DCH .req SA2 // dest_ch
MASK .req SA3
// INDEX is used as follows:
// bits 0..6 index2 (values up to 17, but wider so that we can
// add to index field without needing to mask)
// bits 7..14 i (values up to 160)
// bit 15 underflow detect for i
// bits 25..31 (if access_unit_size_pow2 == 128) \ index
// bits 26..31 (if access_unit_size_pow2 == 64) /
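// In the loop this packing lets one register drive everything: the top
// field indexes noise_buffer (INDEX, lsr #32-6 or #32-7) and is advanced
// by adding index2 back in at the top (INDEX, lsl #32-6 or #32-7), i is
// decremented with "sub INDEX, INDEX, #1<<7", and the loop ends once the
// borrow sets bit 15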
.macro implement_rematrix shift, index_mask, mask_minus1, maxchan
.if \maxchan == 1
// We can just leave the coefficients in registers in this case
ldrd CO0, CO1, [PCO]
.endif
1:
.if \maxchan == 1
ldrd SA0, SA1, [PSA]
smull AC0, AC1, CO0, SA0
.elseif \maxchan == 5
ldr CO0, [PCO, #0]
ldr SA0, [PSA, #0]
ldr CO1, [PCO, #4]
ldr SA1, [PSA, #4]
ldrd CO2, CO3, [PCO, #8]
smull AC0, AC1, CO0, SA0
ldrd SA2, SA3, [PSA, #8]
smlal AC0, AC1, CO1, SA1
ldrd CO0, CO1, [PCO, #16]
smlal AC0, AC1, CO2, SA2
ldrd SA0, SA1, [PSA, #16]
smlal AC0, AC1, CO3, SA3
smlal AC0, AC1, CO0, SA0
.else // \maxchan == 7
ldr CO2, [PCO, #0]
ldr SA2, [PSA, #0]
ldr CO3, [PCO, #4]
ldr SA3, [PSA, #4]
ldrd CO0, CO1, [PCO, #8]
smull AC0, AC1, CO2, SA2
ldrd SA0, SA1, [PSA, #8]
smlal AC0, AC1, CO3, SA3
ldrd CO2, CO3, [PCO, #16]
smlal AC0, AC1, CO0, SA0
ldrd SA2, SA3, [PSA, #16]
smlal AC0, AC1, CO1, SA1
ldrd CO0, CO1, [PCO, #24]
smlal AC0, AC1, CO2, SA2
ldrd SA0, SA1, [PSA, #24]
smlal AC0, AC1, CO3, SA3
smlal AC0, AC1, CO0, SA0
.endif
ldm sp, {NOISE, DCH, MASK}
smlal AC0, AC1, CO1, SA1
.if \shift != 0
.if \index_mask == 63
add NOISE, NOISE, INDEX, lsr #32-6
ldrb LSB, [PBL], #MAX_CHANNELS
ldrsb NOISE, [NOISE]
add INDEX, INDEX, INDEX, lsl #32-6
.else // \index_mask == 127
add NOISE, NOISE, INDEX, lsr #32-7
ldrb LSB, [PBL], #MAX_CHANNELS
ldrsb NOISE, [NOISE]
add INDEX, INDEX, INDEX, lsl #32-7
.endif
sub INDEX, INDEX, #1<<7
adds AC0, AC0, NOISE, lsl #\shift + 7
adc AC1, AC1, NOISE, asr #31
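// 64-bit accumulate of the sign-extended noise byte scaled by << (shift + 7)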
.else
ldrb LSB, [PBL], #MAX_CHANNELS
sub INDEX, INDEX, #1<<7
.endif
add PSA, PSA, #MAX_CHANNELS*4
mov AC0, AC0, lsr #14
orr AC0, AC0, AC1, lsl #18
.if !\mask_minus1
and AC0, AC0, MASK
.endif
add AC0, AC0, LSB
tst INDEX, #1<<15
str AC0, [PSA, DCH, lsl #2] // DCH is precompensated for the early increment of PSA
beq 1b
b 98f
.endm
.macro switch_on_maxchan shift, index_mask, mask_minus1
cmp v4, #5
blo 51f
beq 50f
implement_rematrix \shift, \index_mask, \mask_minus1, 7
50: implement_rematrix \shift, \index_mask, \mask_minus1, 5
51: implement_rematrix \shift, \index_mask, \mask_minus1, 1
.endm
.macro switch_on_mask shift, index_mask
cmp sl, #-1
bne 40f
switch_on_maxchan \shift, \index_mask, 1
40: switch_on_maxchan \shift, \index_mask, 0
.endm
.macro switch_on_au_size shift
.if \shift == 0
switch_on_mask \shift, undefined
.else
teq v6, #64
bne 30f
orr INDEX, INDEX, v1, lsl #32-6
switch_on_mask \shift, 63
30: orr INDEX, INDEX, v1, lsl #32-7
switch_on_mask \shift, 127
.endif
.endm
/* void ff_mlp_rematrix_channel_arm(int32_t *samples,
* const int32_t *coeffs,
* const uint8_t *bypassed_lsbs,
* const int8_t *noise_buffer,
* int index,
* unsigned int dest_ch,
* uint16_t blockpos,
* unsigned int maxchan,
* int matrix_noise_shift,
* int access_unit_size_pow2,
* int32_t mask);
*/
function ff_mlp_rematrix_channel_arm, export=1
push {v1-fp,lr}
add v1, sp, #9*4 // point at arguments on stack
ldm v1, {v1-sl}
teq v4, #1
itt ne
teqne v4, #5
teqne v4, #7
bne 99f
teq v6, #64
it ne
teqne v6, #128
bne 99f
sub v2, v2, #MAX_CHANNELS
push {a4,v2,sl} // initialise NOISE,DCH,MASK; make sp dword-aligned
movs INDEX, v3, lsl #7
beq 98f // just in case, do nothing if blockpos = 0
subs INDEX, INDEX, #1<<7 // offset by 1 so we borrow at the right time
adc lr, v1, v1 // calculate index2 (C was set by preceding subs)
orr INDEX, INDEX, lr
// Switch on matrix_noise_shift: values 0 and 1 are
// disproportionately common so do those in a form the branch
// predictor can accelerate. Values can only go up to 15.
cmp v5, #1
beq 11f
blo 10f
A ldr v5, [pc, v5, lsl #2]
A add pc, pc, v5
T tbh [pc, v5, lsl #1]
0:
branch_pic_label 0, 0, (12f - 0b), (13f - 0b)
branch_pic_label (14f - 0b), (15f - 0b), (16f - 0b), (17f - 0b)
branch_pic_label (18f - 0b), (19f - 0b), (20f - 0b), (21f - 0b)
branch_pic_label (22f - 0b), (23f - 0b), (24f - 0b), (25f - 0b)
10: switch_on_au_size 0
11: switch_on_au_size 1
12: switch_on_au_size 2
13: switch_on_au_size 3
14: switch_on_au_size 4
15: switch_on_au_size 5
16: switch_on_au_size 6
17: switch_on_au_size 7
18: switch_on_au_size 8
19: switch_on_au_size 9
20: switch_on_au_size 10
21: switch_on_au_size 11
22: switch_on_au_size 12
23: switch_on_au_size 13
24: switch_on_au_size 14
25: switch_on_au_size 15
98: add sp, sp, #3*4
pop {v1-fp,pc}
99: // Can't handle these parameters, drop back to C
pop {v1-fp,lr}
b X(ff_mlp_rematrix_channel)
endfunc
.unreq PSA
.unreq PCO
.unreq PBL
.unreq INDEX
.unreq CO0
.unreq CO1
.unreq CO2
.unreq CO3
.unreq SA0
.unreq SA1
.unreq SA2
.unreq SA3
.unreq AC0
.unreq AC1
.unreq NOISE
.unreq LSB
.unreq DCH
.unreq MASK


@@ -0,0 +1,533 @@
/*
* Copyright (c) 2014 RISC OS Open Ltd
* Author: Ben Avison <bavison@riscosopen.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
.macro loadregoffsh2 group, index, base, offgroup, offindex
.altmacro
loadregoffsh2_ \group, %(\index), \base, \offgroup, %(\offindex)
.noaltmacro
.endm
.macro loadregoffsh2_ group, index, base, offgroup, offindex
ldr \group\index, [\base, \offgroup\offindex, lsl #2]
.endm
.macro eorlslreg check, data, group, index
.altmacro
eorlslreg_ \check, \data, \group, %(\index)
.noaltmacro
.endm
.macro eorlslreg_ check, data, group, index
eor \check, \check, \data, lsl \group\index
.endm
.macro decr_modulo var, by, modulus
.set \var, \var - \by
.if \var == 0
.set \var, \modulus
.endif
.endm
.macro load_group1 size, channels, r0, r1, r2, r3, pointer_dead=0
.if \size == 2
ldrd \r0, \r1, [IN], #(\size + 8 - \channels) * 4
.else // size == 4
.if IDX1 > 4 || \channels==8
ldm IN!, {\r0, \r1, \r2, \r3}
.else
ldm IN, {\r0, \r1, \r2, \r3}
.if !\pointer_dead
add IN, IN, #(4 + 8 - \channels) * 4
.endif
.endif
.endif
decr_modulo IDX1, \size, \channels
.endm
.macro load_group2 size, channels, r0, r1, r2, r3, pointer_dead=0
.if \size == 2
.if IDX1 > 2
ldm IN!, {\r2, \r3}
.else
//A .ifc \r2, ip
//A .if \pointer_dead
//A ldm IN, {\r2, \r3}
//A .else
//A ldr \r2, [IN], #4
//A ldr \r3, [IN], #(\size - 1 + 8 - \channels) * 4
//A .endif
//A .else
ldrd \r2, \r3, [IN], #(\size + 8 - \channels) * 4
//A .endif
.endif
.endif
decr_modulo IDX1, \size, \channels
.endm
.macro implement_pack inorder, channels, shift
.if \inorder
.ifc \shift, mixed
CHECK .req a1
COUNT .req a2
IN .req a3
OUT .req a4
DAT0 .req v1
DAT1 .req v2
DAT2 .req v3
DAT3 .req v4
SHIFT0 .req v5
SHIFT1 .req v6
SHIFT2 .req sl
SHIFT3 .req fp
SHIFT4 .req ip
SHIFT5 .req lr
.macro output4words
.set SIZE_GROUP1, IDX1
.if SIZE_GROUP1 > 4
.set SIZE_GROUP1, 4
.endif
.set SIZE_GROUP2, 4 - SIZE_GROUP1
load_group1 SIZE_GROUP1, \channels, DAT0, DAT1, DAT2, DAT3
load_group2 SIZE_GROUP2, \channels, DAT0, DAT1, DAT2, DAT3
.if \channels == 2
lsl DAT0, SHIFT0
lsl DAT1, SHIFT1
lsl DAT2, SHIFT0
lsl DAT3, SHIFT1
.elseif \channels == 6
.if IDX2 == 6
lsl DAT0, SHIFT0
lsl DAT1, SHIFT1
lsl DAT2, SHIFT2
lsl DAT3, SHIFT3
.elseif IDX2 == 2
lsl DAT0, SHIFT4
lsl DAT1, SHIFT5
lsl DAT2, SHIFT0
lsl DAT3, SHIFT1
.else // IDX2 == 4
lsl DAT0, SHIFT2
lsl DAT1, SHIFT3
lsl DAT2, SHIFT4
lsl DAT3, SHIFT5
.endif
.elseif \channels == 8
.if IDX2 == 8
uxtb SHIFT0, SHIFT4, ror #0
uxtb SHIFT1, SHIFT4, ror #8
uxtb SHIFT2, SHIFT4, ror #16
uxtb SHIFT3, SHIFT4, ror #24
.else
uxtb SHIFT0, SHIFT5, ror #0
uxtb SHIFT1, SHIFT5, ror #8
uxtb SHIFT2, SHIFT5, ror #16
uxtb SHIFT3, SHIFT5, ror #24
.endif
lsl DAT0, SHIFT0
lsl DAT1, SHIFT1
lsl DAT2, SHIFT2
lsl DAT3, SHIFT3
.endif
eor CHECK, CHECK, DAT0, lsr #8 - (\channels - IDX2)
eor CHECK, CHECK, DAT1, lsr #7 - (\channels - IDX2)
decr_modulo IDX2, 2, \channels
eor CHECK, CHECK, DAT2, lsr #8 - (\channels - IDX2)
eor CHECK, CHECK, DAT3, lsr #7 - (\channels - IDX2)
decr_modulo IDX2, 2, \channels
stm OUT!, {DAT0 - DAT3}
.endm
.set WORDS_PER_LOOP, \channels // calculate LCM (channels, 4)
.if (WORDS_PER_LOOP % 2) == 0
.set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
.endif
.if (WORDS_PER_LOOP % 2) == 0
.set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
.endif
.set WORDS_PER_LOOP, WORDS_PER_LOOP * 4
.set SAMPLES_PER_LOOP, WORDS_PER_LOOP / \channels
function ff_mlp_pack_output_inorder_\channels\()ch_mixedshift_armv6, export=1
.if SAMPLES_PER_LOOP > 1
tst COUNT, #SAMPLES_PER_LOOP - 1 // COUNT always seems to be a multiple in practice
it ne
bne X(ff_mlp_pack_output) // but just in case, branch to C implementation if not
.endif
teq COUNT, #0
it eq
bxeq lr
push {v1-v6,sl,fp,lr}
ldr SHIFT0, [sp, #(9+1)*4] // get output_shift from stack
ldr SHIFT1, =0x08080808
ldr SHIFT4, [SHIFT0]
.if \channels == 2
uadd8 SHIFT4, SHIFT4, SHIFT1 // increase all shifts by 8
uxtb SHIFT0, SHIFT4, ror #0
uxtb SHIFT1, SHIFT4, ror #8
.else
ldr SHIFT5, [SHIFT0, #4]
uadd8 SHIFT4, SHIFT4, SHIFT1 // increase all shifts by 8
uadd8 SHIFT5, SHIFT5, SHIFT1
.if \channels == 6
uxtb SHIFT0, SHIFT4, ror #0
uxtb SHIFT1, SHIFT4, ror #8
uxtb SHIFT2, SHIFT4, ror #16
uxtb SHIFT3, SHIFT4, ror #24
uxtb SHIFT4, SHIFT5, ror #0
uxtb SHIFT5, SHIFT5, ror #8
.endif
.endif
.set IDX1, \channels
.set IDX2, \channels
0:
.rept WORDS_PER_LOOP / 4
output4words
.endr
subs COUNT, COUNT, #SAMPLES_PER_LOOP
bne 0b
pop {v1-v6,sl,fp,pc}
.ltorg
endfunc
.purgem output4words
.unreq CHECK
.unreq COUNT
.unreq IN
.unreq OUT
.unreq DAT0
.unreq DAT1
.unreq DAT2
.unreq DAT3
.unreq SHIFT0
.unreq SHIFT1
.unreq SHIFT2
.unreq SHIFT3
.unreq SHIFT4
.unreq SHIFT5
.else // not mixed
CHECK .req a1
COUNT .req a2
IN .req a3
OUT .req a4
DAT0 .req v1
DAT1 .req v2
DAT2 .req v3
DAT3 .req v4
DAT4 .req v5
DAT5 .req v6
DAT6 .req sl // use these rather than the otherwise unused
DAT7 .req fp // ip and lr so that we can load them using LDRD
.macro output4words tail, head, r0, r1, r2, r3, r4, r5, r6, r7, pointer_dead=0
.if \head
.set SIZE_GROUP1, IDX1
.if SIZE_GROUP1 > 4
.set SIZE_GROUP1, 4
.endif
.set SIZE_GROUP2, 4 - SIZE_GROUP1
load_group1 SIZE_GROUP1, \channels, \r0, \r1, \r2, \r3, \pointer_dead
.endif
.if \tail
eor CHECK, CHECK, \r4, lsr #8 - (\channels - IDX2)
eor CHECK, CHECK, \r5, lsr #7 - (\channels - IDX2)
decr_modulo IDX2, 2, \channels
.endif
.if \head
load_group2 SIZE_GROUP2, \channels, \r0, \r1, \r2, \r3, \pointer_dead
.endif
.if \tail
eor CHECK, CHECK, \r6, lsr #8 - (\channels - IDX2)
eor CHECK, CHECK, \r7, lsr #7 - (\channels - IDX2)
decr_modulo IDX2, 2, \channels
stm OUT!, {\r4, \r5, \r6, \r7}
.endif
.if \head
lsl \r0, #8 + \shift
lsl \r1, #8 + \shift
lsl \r2, #8 + \shift
lsl \r3, #8 + \shift
.endif
.endm
.set WORDS_PER_LOOP, \channels // calculate LCM (channels, 8)
.if (WORDS_PER_LOOP % 2) == 0
.set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
.endif
.if (WORDS_PER_LOOP % 2) == 0
.set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
.endif
.if (WORDS_PER_LOOP % 2) == 0
.set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
.endif
.set WORDS_PER_LOOP, WORDS_PER_LOOP * 8
.set SAMPLES_PER_LOOP, WORDS_PER_LOOP / \channels
function ff_mlp_pack_output_inorder_\channels\()ch_\shift\()shift_armv6, export=1
.if SAMPLES_PER_LOOP > 1
tst COUNT, #SAMPLES_PER_LOOP - 1 // COUNT always seems to be a multiple in practice
it ne
bne X(ff_mlp_pack_output) // but just in case, branch to C implementation if not
.endif
subs COUNT, COUNT, #SAMPLES_PER_LOOP
it lo
bxlo lr
push {v1-v6,sl,fp,lr}
.set IDX1, \channels
.set IDX2, \channels
output4words 0, 1, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7
0: beq 1f
.rept WORDS_PER_LOOP / 8
output4words 1, 1, DAT4, DAT5, DAT6, DAT7, DAT0, DAT1, DAT2, DAT3
output4words 1, 1, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7
.endr
subs COUNT, COUNT, #SAMPLES_PER_LOOP
bne 0b
1:
.rept WORDS_PER_LOOP / 8 - 1
output4words 1, 1, DAT4, DAT5, DAT6, DAT7, DAT0, DAT1, DAT2, DAT3
output4words 1, 1, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7
.endr
output4words 1, 1, DAT4, DAT5, DAT6, DAT7, DAT0, DAT1, DAT2, DAT3, pointer_dead=1
output4words 1, 0, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7
pop {v1-v6,sl,fp,pc}
endfunc
.purgem output4words
.unreq CHECK
.unreq COUNT
.unreq IN
.unreq OUT
.unreq DAT0
.unreq DAT1
.unreq DAT2
.unreq DAT3
.unreq DAT4
.unreq DAT5
.unreq DAT6
.unreq DAT7
.endif // mixed
.else // not inorder
.ifc \shift, mixed
// This case is not currently handled
.else // not mixed
#if !CONFIG_THUMB
CHECK .req a1
COUNT .req a2
IN .req a3
OUT .req a4
DAT0 .req v1
DAT1 .req v2
DAT2 .req v3
DAT3 .req v4
CHAN0 .req v5
CHAN1 .req v6
CHAN2 .req sl
CHAN3 .req fp
CHAN4 .req ip
CHAN5 .req lr
.macro output4words
.if \channels == 8
.if IDX1 == 8
uxtb CHAN0, CHAN4, ror #0
uxtb CHAN1, CHAN4, ror #8
uxtb CHAN2, CHAN4, ror #16
uxtb CHAN3, CHAN4, ror #24
.else
uxtb CHAN0, CHAN5, ror #0
uxtb CHAN1, CHAN5, ror #8
uxtb CHAN2, CHAN5, ror #16
uxtb CHAN3, CHAN5, ror #24
.endif
ldr DAT0, [IN, CHAN0, lsl #2]
ldr DAT1, [IN, CHAN1, lsl #2]
ldr DAT2, [IN, CHAN2, lsl #2]
ldr DAT3, [IN, CHAN3, lsl #2]
.if IDX1 == 4
add IN, IN, #8*4
.endif
decr_modulo IDX1, 4, \channels
.else
.set SIZE_GROUP1, IDX1
.if SIZE_GROUP1 > 4
.set SIZE_GROUP1, 4
.endif
.set SIZE_GROUP2, 4 - SIZE_GROUP1
.if SIZE_GROUP1 == 2
loadregoffsh2 DAT, 0, IN, CHAN, 0 + (\channels - IDX1)
loadregoffsh2 DAT, 1, IN, CHAN, 1 + (\channels - IDX1)
add IN, IN, #8*4
.else // SIZE_GROUP1 == 4
loadregoffsh2 DAT, 0, IN, CHAN, 0 + (\channels - IDX1)
loadregoffsh2 DAT, 1, IN, CHAN, 1 + (\channels - IDX1)
loadregoffsh2 DAT, 2, IN, CHAN, 2 + (\channels - IDX1)
loadregoffsh2 DAT, 3, IN, CHAN, 3 + (\channels - IDX1)
.if IDX1 == 4
add IN, IN, #8*4
.endif
.endif
decr_modulo IDX1, SIZE_GROUP1, \channels
.if SIZE_GROUP2 == 2
loadregoffsh2 DAT, 2, IN, CHAN, 0 + (\channels - IDX1)
loadregoffsh2 DAT, 3, IN, CHAN, 1 + (\channels - IDX1)
.if IDX1 == 2
add IN, IN, #8*4
.endif
.endif
decr_modulo IDX1, SIZE_GROUP2, \channels
.endif
.if \channels == 8 // in this case we can corrupt CHAN0-3
rsb CHAN0, CHAN0, #8
rsb CHAN1, CHAN1, #8
rsb CHAN2, CHAN2, #8
rsb CHAN3, CHAN3, #8
lsl DAT0, #8 + \shift
lsl DAT1, #8 + \shift
lsl DAT2, #8 + \shift
lsl DAT3, #8 + \shift
eor CHECK, CHECK, DAT0, lsr CHAN0
eor CHECK, CHECK, DAT1, lsr CHAN1
eor CHECK, CHECK, DAT2, lsr CHAN2
eor CHECK, CHECK, DAT3, lsr CHAN3
.else
.if \shift != 0
lsl DAT0, #\shift
lsl DAT1, #\shift
lsl DAT2, #\shift
lsl DAT3, #\shift
.endif
bic DAT0, DAT0, #0xff000000
bic DAT1, DAT1, #0xff000000
bic DAT2, DAT2, #0xff000000
bic DAT3, DAT3, #0xff000000
eorlslreg CHECK, DAT0, CHAN, 0 + (\channels - IDX2)
eorlslreg CHECK, DAT1, CHAN, 1 + (\channels - IDX2)
decr_modulo IDX2, 2, \channels
eorlslreg CHECK, DAT2, CHAN, 0 + (\channels - IDX2)
eorlslreg CHECK, DAT3, CHAN, 1 + (\channels - IDX2)
decr_modulo IDX2, 2, \channels
lsl DAT0, #8
lsl DAT1, #8
lsl DAT2, #8
lsl DAT3, #8
.endif
stm OUT!, {DAT0 - DAT3}
.endm
.set WORDS_PER_LOOP, \channels // calculate LCM (channels, 4)
.if (WORDS_PER_LOOP % 2) == 0
.set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
.endif
.if (WORDS_PER_LOOP % 2) == 0
.set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
.endif
.set WORDS_PER_LOOP, WORDS_PER_LOOP * 4
.set SAMPLES_PER_LOOP, WORDS_PER_LOOP / \channels
function ff_mlp_pack_output_outoforder_\channels\()ch_\shift\()shift_armv6, export=1
.if SAMPLES_PER_LOOP > 1
tst COUNT, #SAMPLES_PER_LOOP - 1 // COUNT always seems to be a multiple in practice
it ne
bne X(ff_mlp_pack_output) // but just in case, branch to C implementation if not
.endif
teq COUNT, #0
it eq
bxeq lr
push {v1-v6,sl,fp,lr}
ldr CHAN0, [sp, #(9+0)*4] // get ch_assign from stack
ldr CHAN4, [CHAN0]
.if \channels == 2
uxtb CHAN0, CHAN4, ror #0
uxtb CHAN1, CHAN4, ror #8
.else
ldr CHAN5, [CHAN0, #4]
.if \channels == 6
uxtb CHAN0, CHAN4, ror #0
uxtb CHAN1, CHAN4, ror #8
uxtb CHAN2, CHAN4, ror #16
uxtb CHAN3, CHAN4, ror #24
uxtb CHAN4, CHAN5, ror #0
uxtb CHAN5, CHAN5, ror #8
.endif
.endif
.set IDX1, \channels
.set IDX2, \channels
0:
.rept WORDS_PER_LOOP / 4
output4words
.endr
subs COUNT, COUNT, #SAMPLES_PER_LOOP
bne 0b
pop {v1-v6,sl,fp,pc}
.ltorg
endfunc
.purgem output4words
.unreq CHECK
.unreq COUNT
.unreq IN
.unreq OUT
.unreq DAT0
.unreq DAT1
.unreq DAT2
.unreq DAT3
.unreq CHAN0
.unreq CHAN1
.unreq CHAN2
.unreq CHAN3
.unreq CHAN4
.unreq CHAN5
#endif // !CONFIG_THUMB
.endif // mixed
.endif // inorder
.endm // implement_pack
.macro pack_channels inorder, channels
implement_pack \inorder, \channels, 0
implement_pack \inorder, \channels, 1
implement_pack \inorder, \channels, 2
implement_pack \inorder, \channels, 3
implement_pack \inorder, \channels, 4
implement_pack \inorder, \channels, 5
implement_pack \inorder, \channels, mixed
.endm
.macro pack_order inorder
pack_channels \inorder, 2
pack_channels \inorder, 6
pack_channels \inorder, 8
.endm
pack_order 0
pack_order 1


@@ -0,0 +1,146 @@
/*
* Copyright (c) 2014 RISC OS Open Ltd
* Author: Ben Avison <bavison@riscosopen.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/arm/cpu.h"
#include "libavutil/attributes.h"
#include "libavcodec/mlpdsp.h"
void ff_mlp_filter_channel_arm(int32_t *state, const int32_t *coeff,
int firorder, int iirorder,
unsigned int filter_shift, int32_t mask,
int blocksize, int32_t *sample_buffer);
void ff_mlp_rematrix_channel_arm(int32_t *samples,
const int32_t *coeffs,
const uint8_t *bypassed_lsbs,
const int8_t *noise_buffer,
int index,
unsigned int dest_ch,
uint16_t blockpos,
unsigned int maxchan,
int matrix_noise_shift,
int access_unit_size_pow2,
int32_t mask);
#define DECLARE_PACK(order,channels,shift) \
int32_t ff_mlp_pack_output_##order##order_##channels##ch_##shift##shift_armv6(int32_t, uint16_t, int32_t (*)[], void *, uint8_t*, int8_t *, uint8_t, int);
#define ENUMERATE_PACK(order,channels,shift) \
ff_mlp_pack_output_##order##order_##channels##ch_##shift##shift_armv6,
#define PACK_CHANNELS(macro,order,channels) \
macro(order,channels,0) \
macro(order,channels,1) \
macro(order,channels,2) \
macro(order,channels,3) \
macro(order,channels,4) \
macro(order,channels,5) \
macro(order,channels,mixed)
#define PACK_ORDER(macro,order) \
PACK_CHANNELS(macro,order,2) \
PACK_CHANNELS(macro,order,6) \
PACK_CHANNELS(macro,order,8)
#define PACK_ALL(macro) \
PACK_ORDER(macro,outof) \
PACK_ORDER(macro,in)
PACK_ALL(DECLARE_PACK)
#define ff_mlp_pack_output_outoforder_2ch_mixedshift_armv6 0
#define ff_mlp_pack_output_outoforder_6ch_mixedshift_armv6 0
#define ff_mlp_pack_output_outoforder_8ch_mixedshift_armv6 0
#if CONFIG_THUMB
#define ff_mlp_pack_output_outoforder_2ch_0shift_armv6 0
#define ff_mlp_pack_output_outoforder_2ch_1shift_armv6 0
#define ff_mlp_pack_output_outoforder_2ch_2shift_armv6 0
#define ff_mlp_pack_output_outoforder_2ch_3shift_armv6 0
#define ff_mlp_pack_output_outoforder_2ch_4shift_armv6 0
#define ff_mlp_pack_output_outoforder_2ch_5shift_armv6 0
#define ff_mlp_pack_output_outoforder_6ch_0shift_armv6 0
#define ff_mlp_pack_output_outoforder_6ch_1shift_armv6 0
#define ff_mlp_pack_output_outoforder_6ch_2shift_armv6 0
#define ff_mlp_pack_output_outoforder_6ch_3shift_armv6 0
#define ff_mlp_pack_output_outoforder_6ch_4shift_armv6 0
#define ff_mlp_pack_output_outoforder_6ch_5shift_armv6 0
#define ff_mlp_pack_output_outoforder_8ch_0shift_armv6 0
#define ff_mlp_pack_output_outoforder_8ch_1shift_armv6 0
#define ff_mlp_pack_output_outoforder_8ch_2shift_armv6 0
#define ff_mlp_pack_output_outoforder_8ch_3shift_armv6 0
#define ff_mlp_pack_output_outoforder_8ch_4shift_armv6 0
#define ff_mlp_pack_output_outoforder_8ch_5shift_armv6 0
#endif
static int32_t (*mlp_select_pack_output_armv6(uint8_t *ch_assign,
int8_t *output_shift,
uint8_t max_matrix_channel,
int is32))(int32_t, uint16_t, int32_t (*)[], void *, uint8_t*, int8_t *, uint8_t, int)
{
int ch_index;
int shift = output_shift[0] < 0 || output_shift[0] > 5 ? 6 : output_shift[0];
int inorder = 1;
static int32_t (*const routine[2*3*7])(int32_t, uint16_t, int32_t (*)[], void *, uint8_t*, int8_t *, uint8_t, int) = {
PACK_ALL(ENUMERATE_PACK)
};
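// routine[] follows the PACK_ALL expansion order: out-of-order entries
// first, then in-order; within each, channels 2/6/8; within each, shifts
// 0-5 then mixed (slot 6) - hence the (inorder*3+ch_index)*7+shift lookup
// below.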
int i;
if (!is32) // don't support 16-bit output (it's not used by TrueHD)
return ff_mlp_pack_output;
switch (max_matrix_channel) {
case 1:
ch_index = 0;
break;
case 5:
ch_index = 1;
break;
case 7:
ch_index = 2;
break;
default:
return ff_mlp_pack_output;
}
for (i = 0; i <= max_matrix_channel; i++) {
if (shift != 6 && output_shift[i] != shift)
shift = 6; // indicate mixed shifts
if (ch_assign[i] != i)
inorder = 0;
}
#if CONFIG_THUMB
if (!inorder)
return ff_mlp_pack_output; // can't currently handle an order array except in ARM mode
#else
if (shift == 6 && !inorder)
return ff_mlp_pack_output; // can't currently handle both an order array and a shift array
#endif
return routine[(inorder*3+ch_index)*7+shift];
}
av_cold void ff_mlpdsp_init_arm(MLPDSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
if (have_armv5te(cpu_flags)) {
c->mlp_filter_channel = ff_mlp_filter_channel_arm;
c->mlp_rematrix_channel = ff_mlp_rematrix_channel_arm;
}
if (have_armv6(cpu_flags))
c->mlp_select_pack_output = mlp_select_pack_output_armv6;
}

View File

@@ -0,0 +1,143 @@
/*
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
.macro skip args:vararg
.endm
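@ sum8: advance \p by one word, then accumulate w[\offs + 64*i] * p[64*i]
@ (word offsets) for i = 0..7 into the 64-bit pair \lo:\hi; passing rsb
@ negates each window value before the multiply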
.macro sum8 lo, hi, w, p, t1, t2, t3, t4, rsb=skip, offs=0
ldr \t1, [\w, #4*\offs]
ldr \t2, [\p, #4]!
\rsb \t1, \t1, #0
.irpc i, 135
ldr \t3, [\w, #4*64*\i+4*\offs]
ldr \t4, [\p, #4*64*\i]
smlal \lo, \hi, \t1, \t2
\rsb \t3, \t3, #0
ldr \t1, [\w, #4*64*(\i+1)+4*\offs]
ldr \t2, [\p, #4*64*(\i+1)]
smlal \lo, \hi, \t3, \t4
\rsb \t1, \t1, #0
.endr
ldr \t3, [\w, #4*64*7+4*\offs]
ldr \t4, [\p, #4*64*7]
smlal \lo, \hi, \t1, \t2
\rsb \t3, \t3, #0
smlal \lo, \hi, \t3, \t4
.endm
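@ round: rd = 64-bit accumulator (hi:lo) >> 24, saturated to 16 bits;
@ the low 24 bits of lo are kept (hi cleared) as the rounding remainder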
.macro round rd, lo, hi
lsr \rd, \lo, #24
bic \lo, \lo, #0xff000000
orr \rd, \rd, \hi, lsl #8
mov \hi, #0
ssat \rd, #16, \rd
.endm
function ff_mpadsp_apply_window_fixed_armv6, export=1
push {r2,r4-r11,lr}
add r4, r0, #4*512 @ synth_buf + 512
.rept 4
ldm r0!, {r5-r12}
stm r4!, {r5-r12}
.endr
ldr r4, [sp, #40] @ incr
sub r0, r0, #4*17 @ synth_buf + 16
ldr r8, [r2] @ sum:low
add r2, r0, #4*32 @ synth_buf + 48
rsb r5, r4, r4, lsl #5 @ 31 * incr
lsl r4, r4, #1
asr r9, r8, #31 @ sum:high
add r5, r3, r5, lsl #1 @ samples2
add r6, r1, #4*32 @ w2
str r4, [sp, #40]
sum8 r8, r9, r1, r0, r10, r11, r12, lr
sum8 r8, r9, r1, r2, r10, r11, r12, lr, rsb, 32
round r10, r8, r9
strh_post r10, r3, r4
mov lr, #15
1:
ldr r12, [r0, #4]!
ldr r11, [r6, #-4]!
ldr r10, [r1, #4]!
.irpc i, 0246
.if \i
ldr r11, [r6, #4*64*\i]
ldr r10, [r1, #4*64*\i]
.endif
rsb r11, r11, #0
smlal r8, r9, r10, r12
ldr r10, [r0, #4*64*(\i+1)]
.ifeq \i
smull r4, r7, r11, r12
.else
smlal r4, r7, r11, r12
.endif
ldr r11, [r6, #4*64*(\i+1)]
ldr r12, [r1, #4*64*(\i+1)]
rsb r11, r11, #0
smlal r8, r9, r12, r10
.iflt \i-6
ldr r12, [r0, #4*64*(\i+2)]
.else
ldr r12, [r2, #-4]!
.endif
smlal r4, r7, r11, r10
.endr
.irpc i, 0246
ldr r10, [r1, #4*64*\i+4*32]
rsb r12, r12, #0
ldr r11, [r6, #4*64*\i+4*32]
smlal r8, r9, r10, r12
ldr r10, [r2, #4*64*(\i+1)]
smlal r4, r7, r11, r12
ldr r12, [r1, #4*64*(\i+1)+4*32]
rsb r10, r10, #0
ldr r11, [r6, #4*64*(\i+1)+4*32]
smlal r8, r9, r12, r10
.iflt \i-6
ldr r12, [r2, #4*64*(\i+2)]
.else
ldr r12, [sp, #40]
.endif
smlal r4, r7, r11, r10
.endr
round r10, r8, r9
adds r8, r8, r4
adc r9, r9, r7
strh_post r10, r3, r12
round r11, r8, r9
subs lr, lr, #1
strh_dpost r11, r5, r12
bgt 1b
sum8 r8, r9, r1, r0, r10, r11, r12, lr, rsb, 33
pop {r4}
round r10, r8, r9
str r8, [r4]
strh r10, [r3]
pop {r4-r11,pc}
endfunc


@@ -0,0 +1,38 @@
/*
* Copyright (c) 2011 Mans Rullgard
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/mpegaudiodsp.h"
#include "config.h"
void ff_mpadsp_apply_window_fixed_armv6(int32_t *synth_buf, int32_t *window,
int *dither, int16_t *out, int incr);
av_cold void ff_mpadsp_init_arm(MPADSPContext *s)
{
int cpu_flags = av_get_cpu_flags();
if (have_armv6(cpu_flags)) {
s->apply_window_fixed = ff_mpadsp_apply_window_fixed_armv6;
}
}


@@ -0,0 +1,54 @@
/*
* Copyright (c) 2002 Michael Niedermayer
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/internal.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/mpegvideo.h"
#include "mpegvideo_arm.h"
#include "asm-offsets.h"
#if HAVE_NEON
AV_CHECK_OFFSET(MpegEncContext, y_dc_scale, Y_DC_SCALE);
AV_CHECK_OFFSET(MpegEncContext, c_dc_scale, C_DC_SCALE);
AV_CHECK_OFFSET(MpegEncContext, ac_pred, AC_PRED);
AV_CHECK_OFFSET(MpegEncContext, block_last_index, BLOCK_LAST_INDEX);
AV_CHECK_OFFSET(MpegEncContext, inter_scantable.raster_end,
INTER_SCANTAB_RASTER_END);
AV_CHECK_OFFSET(MpegEncContext, h263_aic, H263_AIC);
#endif
void ff_dct_unquantize_h263_inter_neon(MpegEncContext *s, int16_t *block,
int n, int qscale);
void ff_dct_unquantize_h263_intra_neon(MpegEncContext *s, int16_t *block,
int n, int qscale);
av_cold void ff_mpv_common_init_arm(MpegEncContext *s)
{
int cpu_flags = av_get_cpu_flags();
if (have_armv5te(cpu_flags))
ff_mpv_common_init_armv5te(s);
if (have_neon(cpu_flags)) {
s->dct_unquantize_h263_intra = ff_dct_unquantize_h263_intra_neon;
s->dct_unquantize_h263_inter = ff_dct_unquantize_h263_inter_neon;
}
}


@@ -0,0 +1,26 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_ARM_MPEGVIDEO_ARM_H
#define AVCODEC_ARM_MPEGVIDEO_ARM_H
#include "libavcodec/mpegvideo.h"
void ff_mpv_common_init_armv5te(MpegEncContext *s);
#endif /* AVCODEC_ARM_MPEGVIDEO_ARM_H */


@@ -0,0 +1,102 @@
/*
* Optimization of some functions from mpegvideo.c for armv5te
* Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/avassert.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/mpegvideo.h"
#include "mpegvideo_arm.h"
void ff_dct_unquantize_h263_armv5te(int16_t *block, int qmul, int qadd, int count);
#ifdef ENABLE_ARM_TESTS
/**
* h263 dequantizer supplementary function; it is performance critical and needs to
* have optimized implementations for each architecture. It is also used as a reference
* implementation in regression tests.
*/
static inline void dct_unquantize_h263_helper_c(int16_t *block, int qmul, int qadd, int count)
{
int i, level;
for (i = 0; i < count; i++) {
level = block[i];
if (level) {
if (level < 0) {
level = level * qmul - qadd;
} else {
level = level * qmul + qadd;
}
block[i] = level;
}
}
}
#endif
static void dct_unquantize_h263_intra_armv5te(MpegEncContext *s,
int16_t *block, int n, int qscale)
{
int level, qmul, qadd;
int nCoeffs;
av_assert2(s->block_last_index[n]>=0);
qmul = qscale << 1;
if (!s->h263_aic) {
if (n < 4)
level = block[0] * s->y_dc_scale;
else
level = block[0] * s->c_dc_scale;
qadd = (qscale - 1) | 1;
}else{
qadd = 0;
level = block[0];
}
if(s->ac_pred)
nCoeffs=63;
else
nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
ff_dct_unquantize_h263_armv5te(block, qmul, qadd, nCoeffs + 1);
block[0] = level;
}
static void dct_unquantize_h263_inter_armv5te(MpegEncContext *s,
int16_t *block, int n, int qscale)
{
int qmul, qadd;
int nCoeffs;
av_assert2(s->block_last_index[n]>=0);
qadd = (qscale - 1) | 1;
qmul = qscale << 1;
nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
ff_dct_unquantize_h263_armv5te(block, qmul, qadd, nCoeffs + 1);
}
av_cold void ff_mpv_common_init_armv5te(MpegEncContext *s)
{
s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_armv5te;
s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_armv5te;
}


@@ -0,0 +1,114 @@
/*
* Optimization of some functions from mpegvideo.c for armv5te
* Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/arm/asm.S"
/*
* Special optimized version of dct_unquantize_h263_helper_c. It
* requires the block to be at least 8-byte aligned, and may process
* more elements than requested, but it is guaranteed never to
* process more than 64 elements provided that the count argument is <= 64,
* so it is safe. This function is optimized for a common distribution
* of nCoeffs values (mostly a multiple of 8 plus one or
* two extra elements), so it processes data as 8 elements
* per loop iteration with optional processing of 2 extra elements
* at the end.
*
* Inner loop should take 6 cycles per element on arm926ej-s (Nokia 770)
*/
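@ dequant_t / dequant_b handle the top / bottom halfword of \src: the rsbs
@ tests that halfword (against ip == 0) and sets the flags, \tmp is then
@ set to +\add or -\add according to the sign, and smlatb/smlabb writes
@ level * \mul + \tmp to \dst only when the level is non-zero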
.macro dequant_t dst, src, mul, add, tmp
rsbs \tmp, ip, \src, asr #16
it gt
addgt \tmp, \add, #0
it lt
rsblt \tmp, \add, #0
it ne
smlatbne \dst, \src, \mul, \tmp
.endm
.macro dequant_b dst, src, mul, add, tmp
rsbs \tmp, ip, \src, lsl #16
it gt
addgt \tmp, \add, #0
it lt
rsblt \tmp, \add, #0
it ne
smlabbne \dst, \src, \mul, \tmp
.endm
function ff_dct_unquantize_h263_armv5te, export=1
push {r4-r9,lr}
mov ip, #0
subs r3, r3, #2
ble 2f
ldrd r4, r5, [r0, #0]
1:
ldrd r6, r7, [r0, #8]
dequant_t r9, r4, r1, r2, r9
dequant_t lr, r5, r1, r2, lr
dequant_b r4, r4, r1, r2, r8
dequant_b r5, r5, r1, r2, r8
strh r4, [r0], #2
strh r9, [r0], #2
strh r5, [r0], #2
strh lr, [r0], #2
dequant_t r9, r6, r1, r2, r9
dequant_t lr, r7, r1, r2, lr
dequant_b r6, r6, r1, r2, r8
dequant_b r7, r7, r1, r2, r8
strh r6, [r0], #2
strh r9, [r0], #2
strh r7, [r0], #2
strh lr, [r0], #2
subs r3, r3, #8
it gt
ldrdgt r4, r5, [r0, #0] /* load data early to avoid load/use pipeline stall */
bgt 1b
adds r3, r3, #2
it le
pople {r4-r9,pc}
2:
ldrsh r9, [r0, #0]
ldrsh lr, [r0, #2]
mov r8, r2
cmp r9, #0
it lt
rsblt r8, r2, #0
it ne
smlabbne r9, r9, r1, r8
mov r8, r2
cmp lr, #0
it lt
rsblt r8, r2, #0
it ne
smlabbne lr, lr, r1, r8
strh r9, [r0], #2
strh lr, [r0], #2
pop {r4-r9,pc}
endfunc


@@ -0,0 +1,107 @@
/*
* Copyright (c) 2010 Mans Rullgard
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
#include "asm-offsets.h"
function ff_dct_unquantize_h263_inter_neon, export=1
add r12, r0, #BLOCK_LAST_INDEX
ldr r12, [r12, r2, lsl #2]
add r0, r0, #INTER_SCANTAB_RASTER_END
ldrb r12, [r0, r12]
sub r2, r3, #1
lsl r0, r3, #1
orr r2, r2, #1
add r3, r12, #1
endfunc
function ff_dct_unquantize_h263_neon, export=1
vdup.16 q15, r0 @ qmul
vdup.16 q14, r2 @ qadd
vneg.s16 q13, q14
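@ q13 = -qadd: in the loops below vbsl selects -qadd for negative levels
@ and +qadd otherwise, and vbif leaves zero coefficients untouched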
cmp r3, #4
mov r0, r1
ble 2f
1:
vld1.16 {q0}, [r0,:128]!
vclt.s16 q3, q0, #0
vld1.16 {q8}, [r0,:128]!
vceq.s16 q1, q0, #0
vmul.s16 q2, q0, q15
vclt.s16 q11, q8, #0
vmul.s16 q10, q8, q15
vbsl q3, q13, q14
vbsl q11, q13, q14
vadd.s16 q2, q2, q3
vceq.s16 q9, q8, #0
vadd.s16 q10, q10, q11
vbif q0, q2, q1
vbif q8, q10, q9
subs r3, r3, #16
vst1.16 {q0}, [r1,:128]!
vst1.16 {q8}, [r1,:128]!
it le
bxle lr
cmp r3, #8
bgt 1b
2:
vld1.16 {d0}, [r0,:64]
vclt.s16 d3, d0, #0
vceq.s16 d1, d0, #0
vmul.s16 d2, d0, d30
vbsl d3, d26, d28
vadd.s16 d2, d2, d3
vbif d0, d2, d1
vst1.16 {d0}, [r1,:64]
bx lr
endfunc
function ff_dct_unquantize_h263_intra_neon, export=1
push {r4-r6,lr}
add r12, r0, #BLOCK_LAST_INDEX
ldr r6, [r0, #AC_PRED]
add lr, r0, #INTER_SCANTAB_RASTER_END
cmp r6, #0
it ne
movne r12, #63
bne 1f
ldr r12, [r12, r2, lsl #2]
ldrb r12, [lr, r12]
1: ldr r5, [r0, #H263_AIC]
ldrsh r4, [r1]
cmp r5, #0
mov r5, r1
it ne
movne r2, #0
bne 2f
cmp r2, #4
it ge
addge r0, r0, #4
sub r2, r3, #1
ldr r6, [r0, #Y_DC_SCALE]
orr r2, r2, #1
smulbb r4, r4, r6
2: lsl r0, r3, #1
add r3, r12, #1
bl X(ff_dct_unquantize_h263_neon)
vmov.16 d0[0], r4
vst1.16 {d0[0]}, [r5]
pop {r4-r6,pc}
endfunc


@@ -0,0 +1,76 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
function ff_pix_norm1_armv6, export=1
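@ Sum of squares over a 16x16 block: uxtb16 unpacks byte pairs into
@ halfwords and smlad accumulates two squared pixels per instruction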
push {r4-r6, lr}
mov r12, #16
mov lr, #0
1:
ldm r0, {r2-r5}
uxtb16 r6, r2
uxtb16 r2, r2, ror #8
smlad lr, r6, r6, lr
uxtb16 r6, r3
smlad lr, r2, r2, lr
uxtb16 r3, r3, ror #8
smlad lr, r6, r6, lr
uxtb16 r6, r4
smlad lr, r3, r3, lr
uxtb16 r4, r4, ror #8
smlad lr, r6, r6, lr
uxtb16 r6, r5
smlad lr, r4, r4, lr
uxtb16 r5, r5, ror #8
smlad lr, r6, r6, lr
subs r12, r12, #1
add r0, r0, r1
smlad lr, r5, r5, lr
bgt 1b
mov r0, lr
pop {r4-r6, pc}
endfunc
function ff_pix_sum_armv6, export=1
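@ Sum over a 16x16 block: with lr held at zero, usada8 adds the absolute
@ differences against zero, i.e. the four pixel bytes of each word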
push {r4-r7, lr}
mov r12, #16
mov r2, #0
mov r3, #0
mov lr, #0
ldr r4, [r0]
1:
subs r12, r12, #1
ldr r5, [r0, #4]
usada8 r2, r4, lr, r2
ldr r6, [r0, #8]
usada8 r3, r5, lr, r3
ldr r7, [r0, #12]
usada8 r2, r6, lr, r2
beq 2f
ldr_pre r4, r0, r1
usada8 r3, r7, lr, r3
bgt 1b
2:
usada8 r3, r7, lr, r3
add r0, r2, r3
pop {r4-r7, pc}
endfunc


@@ -0,0 +1,38 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/cpu.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/mpegvideoencdsp.h"
int ff_pix_norm1_armv6(uint8_t *pix, int line_size);
int ff_pix_sum_armv6(uint8_t *pix, int line_size);
av_cold void ff_mpegvideoencdsp_init_arm(MpegvideoEncDSPContext *c,
AVCodecContext *avctx)
{
int cpu_flags = av_get_cpu_flags();
if (have_armv6(cpu_flags)) {
c->pix_norm1 = ff_pix_norm1_armv6;
c->pix_sum = ff_pix_sum_armv6;
}
}


@@ -0,0 +1,59 @@
/*
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
.macro transpose_8x8 r0, r1, r2, r3, r4, r5, r6, r7
vtrn.32 \r0, \r4
vtrn.32 \r1, \r5
vtrn.32 \r2, \r6
vtrn.32 \r3, \r7
vtrn.16 \r0, \r2
vtrn.16 \r1, \r3
vtrn.16 \r4, \r6
vtrn.16 \r5, \r7
vtrn.8 \r0, \r1
vtrn.8 \r2, \r3
vtrn.8 \r4, \r5
vtrn.8 \r6, \r7
.endm
.macro transpose_4x4 r0, r1, r2, r3
vtrn.16 \r0, \r2
vtrn.16 \r1, \r3
vtrn.8 \r0, \r1
vtrn.8 \r2, \r3
.endm
.macro swap4 r0, r1, r2, r3, r4, r5, r6, r7
vswp \r0, \r4
vswp \r1, \r5
vswp \r2, \r6
vswp \r3, \r7
.endm
.macro transpose16_4x4 r0, r1, r2, r3, r4, r5, r6, r7
vtrn.32 \r0, \r2
vtrn.32 \r1, \r3
vtrn.32 \r4, \r6
vtrn.32 \r5, \r7
vtrn.16 \r0, \r1
vtrn.16 \r2, \r3
vtrn.16 \r4, \r5
vtrn.16 \r6, \r7
.endm


@@ -0,0 +1,79 @@
/*
* check NEON registers for clobbers
* Copyright (c) 2013 Martin Storsjo
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavcodec/avcodec.h"
#include "libavutil/arm/neontest.h"
wrap(avcodec_open2(AVCodecContext *avctx,
AVCodec *codec,
AVDictionary **options))
{
testneonclobbers(avcodec_open2, avctx, codec, options);
}
wrap(avcodec_decode_audio4(AVCodecContext *avctx,
AVFrame *frame,
int *got_frame_ptr,
AVPacket *avpkt))
{
testneonclobbers(avcodec_decode_audio4, avctx, frame,
got_frame_ptr, avpkt);
}
wrap(avcodec_decode_video2(AVCodecContext *avctx,
AVFrame *picture,
int *got_picture_ptr,
AVPacket *avpkt))
{
testneonclobbers(avcodec_decode_video2, avctx, picture,
got_picture_ptr, avpkt);
}
wrap(avcodec_decode_subtitle2(AVCodecContext *avctx,
AVSubtitle *sub,
int *got_sub_ptr,
AVPacket *avpkt))
{
testneonclobbers(avcodec_decode_subtitle2, avctx, sub,
got_sub_ptr, avpkt);
}
wrap(avcodec_encode_audio2(AVCodecContext *avctx,
AVPacket *avpkt,
const AVFrame *frame,
int *got_packet_ptr))
{
testneonclobbers(avcodec_encode_audio2, avctx, avpkt, frame,
got_packet_ptr);
}
wrap(avcodec_encode_subtitle(AVCodecContext *avctx,
uint8_t *buf, int buf_size,
const AVSubtitle *sub))
{
testneonclobbers(avcodec_encode_subtitle, avctx, buf, buf_size, sub);
}
wrap(avcodec_encode_video2(AVCodecContext *avctx, AVPacket *avpkt,
const AVFrame *frame, int *got_packet_ptr))
{
testneonclobbers(avcodec_encode_video2, avctx, avpkt, frame, got_packet_ptr);
}


@@ -0,0 +1,76 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
function ff_get_pixels_armv6, export=1
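@ Expands 8 pixels per iteration from bytes to int16_t: uxtb16 unpacks
@ even and odd bytes, pkhbt/pkhtb re-pair them in their original order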
pld [r1, r2]
push {r4-r8, lr}
mov lr, #8
1:
ldrd_post r4, r5, r1, r2
subs lr, lr, #1
uxtb16 r6, r4
uxtb16 r4, r4, ror #8
uxtb16 r12, r5
uxtb16 r8, r5, ror #8
pld [r1, r2]
pkhbt r5, r6, r4, lsl #16
pkhtb r6, r4, r6, asr #16
pkhbt r7, r12, r8, lsl #16
pkhtb r12, r8, r12, asr #16
stm r0!, {r5,r6,r7,r12}
bgt 1b
pop {r4-r8, pc}
endfunc
function ff_diff_pixels_armv6, export=1
pld [r1, r3]
pld [r2, r3]
push {r4-r9, lr}
mov lr, #8
1:
ldrd_post r4, r5, r1, r3
ldrd_post r6, r7, r2, r3
uxtb16 r8, r4
uxtb16 r4, r4, ror #8
uxtb16 r9, r6
uxtb16 r6, r6, ror #8
pld [r1, r3]
ssub16 r9, r8, r9
ssub16 r6, r4, r6
uxtb16 r8, r5
uxtb16 r5, r5, ror #8
pld [r2, r3]
pkhbt r4, r9, r6, lsl #16
pkhtb r6, r6, r9, asr #16
uxtb16 r9, r7
uxtb16 r7, r7, ror #8
ssub16 r9, r8, r9
ssub16 r5, r5, r7
subs lr, lr, #1
pkhbt r8, r9, r5, lsl #16
pkhtb r9, r5, r9, asr #16
stm r0!, {r4,r6,r8,r9}
bgt 1b
pop {r4-r9, pc}
endfunc


@@ -0,0 +1,42 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/pixblockdsp.h"
void ff_get_pixels_armv6(int16_t *block, const uint8_t *pixels, ptrdiff_t stride);
void ff_diff_pixels_armv6(int16_t *block, const uint8_t *s1,
const uint8_t *s2, int stride);
av_cold void ff_pixblockdsp_init_arm(PixblockDSPContext *c,
AVCodecContext *avctx,
unsigned high_bit_depth)
{
int cpu_flags = av_get_cpu_flags();
if (have_armv6(cpu_flags)) {
if (!high_bit_depth)
c->get_pixels = ff_get_pixels_armv6;
c->diff_pixels = ff_diff_pixels_armv6;
}
}


@@ -0,0 +1,150 @@
/*
* ARM NEON optimised RDFT
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
function ff_rdft_calc_neon, export=1
push {r4-r8,lr}
ldr r6, [r0, #4] @ inverse
mov r4, r0
mov r5, r1
lsls r6, r6, #31
bne 1f
add r0, r4, #20
bl X(ff_fft_permute_neon)
add r0, r4, #20
mov r1, r5
bl X(ff_fft_calc_neon)
1:
ldr r12, [r4, #0] @ nbits
mov r2, #1
lsl r12, r2, r12
add r0, r5, #8
add r1, r5, r12, lsl #2
lsr r12, r12, #2
ldr r2, [r4, #12] @ tcos
sub r12, r12, #2
ldr r3, [r4, #16] @ tsin
mov r7, r0
sub r1, r1, #8
mov lr, r1
mov r8, #-8
vld1.32 {d0}, [r0,:64]! @ d1[0,1]
vld1.32 {d1}, [r1,:64], r8 @ d2[0,1]
vld1.32 {d4}, [r2,:64]! @ tcos[i]
vld1.32 {d5}, [r3,:64]! @ tsin[i]
vmov.f32 d18, #0.5 @ k1
vdup.32 d19, r6
pld [r0, #32]
veor d19, d18, d19 @ k2
vmov.i32 d16, #0
vmov.i32 d17, #1<<31
pld [r1, #-32]
vtrn.32 d16, d17
pld [r2, #32]
vrev64.32 d16, d16 @ d16=1,0 d17=0,1
pld [r3, #32]
2:
veor q1, q0, q8 @ -d1[0],d1[1], d2[0],-d2[1]
vld1.32 {d24}, [r0,:64]! @ d1[0,1]
vadd.f32 d0, d0, d3 @ d1[0]+d2[0], d1[1]-d2[1]
vld1.32 {d25}, [r1,:64], r8 @ d2[0,1]
vadd.f32 d1, d2, d1 @ -d1[0]+d2[0], d1[1]+d2[1]
veor q3, q12, q8 @ -d1[0],d1[1], d2[0],-d2[1]
pld [r0, #32]
vmul.f32 q10, q0, q9 @ ev.re, ev.im, od.im, od.re
pld [r1, #-32]
vadd.f32 d0, d24, d7 @ d1[0]+d2[0], d1[1]-d2[1]
vadd.f32 d1, d6, d25 @ -d1[0]+d2[0], d1[1]+d2[1]
vmul.f32 q11, q0, q9 @ ev.re, ev.im, od.im, od.re
veor d7, d21, d16 @ -od.im, od.re
vrev64.32 d3, d21 @ od.re, od.im
veor d6, d20, d17 @ ev.re,-ev.im
veor d2, d3, d16 @ -od.re, od.im
vmla.f32 d20, d3, d4[1]
vmla.f32 d20, d7, d5[1]
vmla.f32 d6, d2, d4[1]
vmla.f32 d6, d21, d5[1]
vld1.32 {d4}, [r2,:64]! @ tcos[i]
veor d7, d23, d16 @ -od.im, od.re
vld1.32 {d5}, [r3,:64]! @ tsin[i]
veor d24, d22, d17 @ ev.re,-ev.im
vrev64.32 d3, d23 @ od.re, od.im
pld [r2, #32]
veor d2, d3, d16 @ -od.re, od.im
pld [r3, #32]
vmla.f32 d22, d3, d4[0]
vmla.f32 d22, d7, d5[0]
vmla.f32 d24, d2, d4[0]
vmla.f32 d24, d23, d5[0]
vld1.32 {d0}, [r0,:64]! @ d1[0,1]
vld1.32 {d1}, [r1,:64], r8 @ d2[0,1]
vst1.32 {d20}, [r7,:64]!
vst1.32 {d6}, [lr,:64], r8
vst1.32 {d22}, [r7,:64]!
vst1.32 {d24}, [lr,:64], r8
subs r12, r12, #2
bgt 2b
veor q1, q0, q8 @ -d1[0],d1[1], d2[0],-d2[1]
vadd.f32 d0, d0, d3 @ d1[0]+d2[0], d1[1]-d2[1]
vadd.f32 d1, d2, d1 @ -d1[0]+d2[0], d1[1]+d2[1]
ldr r2, [r4, #8] @ sign_convention
vmul.f32 q10, q0, q9 @ ev.re, ev.im, od.im, od.re
add r0, r0, #4
bfc r2, #0, #31
vld1.32 {d0[0]}, [r0,:32]
veor d7, d21, d16 @ -od.im, od.re
vrev64.32 d3, d21 @ od.re, od.im
veor d6, d20, d17 @ ev.re,-ev.im
vld1.32 {d22}, [r5,:64]
vdup.32 d1, r2
vmov d23, d22
veor d2, d3, d16 @ -od.re, od.im
vtrn.32 d22, d23
veor d0, d0, d1
veor d23, d23, d17
vmla.f32 d20, d3, d4[1]
vmla.f32 d20, d7, d5[1]
vmla.f32 d6, d2, d4[1]
vmla.f32 d6, d21, d5[1]
vadd.f32 d22, d22, d23
vst1.32 {d20}, [r7,:64]
vst1.32 {d6}, [lr,:64]
vst1.32 {d0[0]}, [r0,:32]
vst1.32 {d22}, [r5,:64]
cmp r6, #0
it eq
popeq {r4-r8,pc}
vmul.f32 d22, d22, d18
vst1.32 {d22}, [r5,:64]
add r0, r4, #20
mov r1, r5
bl X(ff_fft_permute_neon)
add r0, r4, #20
mov r1, r5
pop {r4-r8,lr}
b X(ff_fft_calc_neon)
endfunc


@@ -0,0 +1,46 @@
/*
* Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/rv34dsp.h"
#include "libavutil/arm/cpu.h"
void ff_rv34_inv_transform_noround_neon(int16_t *block);
void ff_rv34_inv_transform_noround_dc_neon(int16_t *block);
void ff_rv34_idct_add_neon(uint8_t *dst, ptrdiff_t stride, int16_t *block);
void ff_rv34_idct_dc_add_neon(uint8_t *dst, ptrdiff_t stride, int dc);
av_cold void ff_rv34dsp_init_arm(RV34DSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags)) {
c->rv34_inv_transform = ff_rv34_inv_transform_noround_neon;
c->rv34_inv_transform_dc = ff_rv34_inv_transform_noround_dc_neon;
c->rv34_idct_add = ff_rv34_idct_add_neon;
c->rv34_idct_dc_add = ff_rv34_idct_dc_add_neon;
}
}


@@ -0,0 +1,156 @@
/*
* Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
#include "neon.S"
.macro rv34_inv_transform r0
vld1.16 {q14-q15}, [\r0,:128]
vmov.s16 d0, #13
vshll.s16 q12, d29, #3
vshll.s16 q13, d29, #4
vshll.s16 q9, d31, #3
vshll.s16 q1, d31, #4
vmull.s16 q10, d28, d0
vmlal.s16 q10, d30, d0
vmull.s16 q11, d28, d0
vmlsl.s16 q11, d30, d0
vsubw.s16 q12, q12, d29 @ z2 = block[i+4*1]*7
vaddw.s16 q13, q13, d29 @ z3 = block[i+4*1]*17
vsubw.s16 q9, q9, d31
vaddw.s16 q1, q1, d31
vadd.s32 q13, q13, q9 @ z3 = 17*block[i+4*1] + 7*block[i+4*3]
vsub.s32 q12, q12, q1 @ z2 = 7*block[i+4*1] - 17*block[i+4*3]
vadd.s32 q1, q10, q13 @ z0 + z3
vadd.s32 q2, q11, q12 @ z1 + z2
vsub.s32 q8, q10, q13 @ z0 - z3
vsub.s32 q3, q11, q12 @ z1 - z2
vtrn.32 q1, q2
vtrn.32 q3, q8
vswp d3, d6
vswp d5, d16
vmov.s32 d0, #13
vadd.s32 q10, q1, q3
vsub.s32 q11, q1, q3
vshl.s32 q12, q2, #3
vshl.s32 q9, q2, #4
vmul.s32 q13, q11, d0[0]
vshl.s32 q11, q8, #4
vadd.s32 q9, q9, q2
vshl.s32 q15, q8, #3
vsub.s32 q12, q12, q2
vadd.s32 q11, q11, q8
vmul.s32 q14, q10, d0[0]
vsub.s32 q8, q15, q8
vsub.s32 q12, q12, q11
vadd.s32 q9, q9, q8
vadd.s32 q2, q13, q12 @ z1 + z2
vadd.s32 q1, q14, q9 @ z0 + z3
vsub.s32 q3, q13, q12 @ z1 - z2
vsub.s32 q15, q14, q9 @ z0 - z3
.endm
/* void rv34_idct_add_c(uint8_t *dst, int stride, int16_t *block) */
function ff_rv34_idct_add_neon, export=1
mov r3, r0
rv34_inv_transform r2
vmov.i16 q12, #0
vrshrn.s32 d16, q1, #10 @ (z0 + z3) >> 10
vrshrn.s32 d17, q2, #10 @ (z1 + z2) >> 10
vrshrn.s32 d18, q3, #10 @ (z1 - z2) >> 10
vrshrn.s32 d19, q15, #10 @ (z0 - z3) >> 10
vld1.32 {d28[]}, [r0,:32], r1
vld1.32 {d29[]}, [r0,:32], r1
vtrn.32 q8, q9
vld1.32 {d28[1]}, [r0,:32], r1
vld1.32 {d29[1]}, [r0,:32], r1
vst1.16 {q12}, [r2,:128]! @ memset(block, 0, 16)
vst1.16 {q12}, [r2,:128] @ memset(block+16, 0, 16)
vtrn.16 d16, d17
vtrn.32 d28, d29
vtrn.16 d18, d19
vaddw.u8 q0, q8, d28
vaddw.u8 q1, q9, d29
vqmovun.s16 d28, q0
vqmovun.s16 d29, q1
vst1.32 {d28[0]}, [r3,:32], r1
vst1.32 {d28[1]}, [r3,:32], r1
vst1.32 {d29[0]}, [r3,:32], r1
vst1.32 {d29[1]}, [r3,:32], r1
bx lr
endfunc
/* void rv34_inv_transform_noround_neon(int16_t *block); */
function ff_rv34_inv_transform_noround_neon, export=1
rv34_inv_transform r0
vshl.s32 q11, q2, #1
vshl.s32 q10, q1, #1
vshl.s32 q12, q3, #1
vshl.s32 q13, q15, #1
vadd.s32 q11, q11, q2
vadd.s32 q10, q10, q1
vadd.s32 q12, q12, q3
vadd.s32 q13, q13, q15
vshrn.s32 d0, q10, #11 @ (z0 + z3)*3 >> 11
vshrn.s32 d1, q11, #11 @ (z1 + z2)*3 >> 11
vshrn.s32 d2, q12, #11 @ (z1 - z2)*3 >> 11
vshrn.s32 d3, q13, #11 @ (z0 - z3)*3 >> 11
vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0,:64]!
vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0,:64]!
vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r0,:64]!
vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r0,:64]!
bx lr
endfunc
/* void ff_rv34_idct_dc_add_neon(uint8_t *dst, ptrdiff_t stride, int dc) */
function ff_rv34_idct_dc_add_neon, export=1
mov r3, r0
vld1.32 {d28[]}, [r0,:32], r1
vld1.32 {d29[]}, [r0,:32], r1
vdup.16 d0, r2
vmov.s16 d1, #169
vld1.32 {d28[1]}, [r0,:32], r1
vmull.s16 q1, d0, d1 @ dc * 13 * 13
vld1.32 {d29[1]}, [r0,:32], r1
vrshrn.s32 d0, q1, #10 @ (dc * 13 * 13 + 0x200) >> 10
vmov d1, d0
vaddw.u8 q2, q0, d28
vaddw.u8 q3, q0, d29
vqmovun.s16 d28, q2
vqmovun.s16 d29, q3
vst1.32 {d28[0]}, [r3,:32], r1
vst1.32 {d29[0]}, [r3,:32], r1
vst1.32 {d28[1]}, [r3,:32], r1
vst1.32 {d29[1]}, [r3,:32], r1
bx lr
endfunc
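/* For reference, a scalar sketch of what ff_rv34_idct_dc_add_neon computes,
 * inferred from the #169 multiplier and the rounding shift above (the helper
 * name is illustrative; av_clip_uint8() is the libavutil clip helper):
 *
 *     static void rv34_idct_dc_add(uint8_t *dst, ptrdiff_t stride, int dc)
 *     {
 *         const int dc_val = (13 * 13 * dc + 0x200) >> 10;
 *         for (int y = 0; y < 4; y++) {
 *             for (int x = 0; x < 4; x++)
 *                 dst[x] = av_clip_uint8(dst[x] + dc_val);
 *             dst += stride;
 *         }
 *     }
 *
 * The NEON code performs the clipped add with vaddw.u8/vqmovun.s16 on two rows
 * at a time.
 */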
/* void rv34_inv_transform_dc_noround_c(int16_t *block) */
function ff_rv34_inv_transform_noround_dc_neon, export=1
vld1.16 {d28[]}, [r0,:16] @ block[0]
vmov.i16 d4, #251
vorr.s16 d4, #256 @ 13^2 * 3
vmull.s16 q3, d28, d4
vshrn.s32 d0, q3, #11
vmov.i16 d1, d0
vst1.64 {q0}, [r0,:128]!
vst1.64 {q0}, [r0,:128]!
bx lr
endfunc
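/* Scalar sketch of the DC-only no-round transform above (names illustrative).
 * The constant 507 = 13*13*3 is built as #251 | #256 because a vmov.i16
 * immediate cannot encode 507 directly, and the truncating vshrn matches the
 * "noround" in the name:
 *
 *     static void rv34_inv_transform_dc_noround(int16_t *block)
 *     {
 *         const int16_t dc = (13 * 13 * 3 * block[0]) >> 11;
 *         for (int i = 0; i < 16; i++)
 *             block[i] = dc;
 *     }
 */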

View File

@@ -0,0 +1,150 @@
/*
* Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/rv34dsp.h"
#include "libavutil/arm/cpu.h"
#define DECL_QPEL3(type, w, pos) \
void ff_ ## type ## _rv40_qpel ## w ## _mc ## pos ## _neon(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride)
#define DECL_QPEL2(w, pos) \
DECL_QPEL3(put, w, pos); \
DECL_QPEL3(avg, w, pos)
#define DECL_QPEL_XY(x, y) \
DECL_QPEL2(16, x ## y); \
DECL_QPEL2(8, x ## y)
#define DECL_QPEL_Y(y) \
DECL_QPEL_XY(0, y); \
DECL_QPEL_XY(1, y); \
DECL_QPEL_XY(2, y); \
DECL_QPEL_XY(3, y)
DECL_QPEL_Y(0);
DECL_QPEL_Y(1);
DECL_QPEL_Y(2);
DECL_QPEL_Y(3);
void ff_put_rv40_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
void ff_put_rv40_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
void ff_avg_rv40_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
void ff_avg_rv40_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
void ff_rv40_weight_func_16_neon(uint8_t *, uint8_t *, uint8_t *, int, int, ptrdiff_t);
void ff_rv40_weight_func_8_neon(uint8_t *, uint8_t *, uint8_t *, int, int, ptrdiff_t);
int ff_rv40_h_loop_filter_strength_neon(uint8_t *src, ptrdiff_t stride,
int beta, int beta2, int edge,
int *p1, int *q1);
int ff_rv40_v_loop_filter_strength_neon(uint8_t *src, ptrdiff_t stride,
int beta, int beta2, int edge,
int *p1, int *q1);
void ff_rv40_h_weak_loop_filter_neon(uint8_t *src, ptrdiff_t stride, int filter_p1,
int filter_q1, int alpha, int beta,
int lim_p0q0, int lim_q1, int lim_p1);
void ff_rv40_v_weak_loop_filter_neon(uint8_t *src, ptrdiff_t stride, int filter_p1,
int filter_q1, int alpha, int beta,
int lim_p0q0, int lim_q1, int lim_p1);
static av_cold void rv40dsp_init_neon(RV34DSPContext *c)
{
c->put_pixels_tab[0][ 1] = ff_put_rv40_qpel16_mc10_neon;
c->put_pixels_tab[0][ 3] = ff_put_rv40_qpel16_mc30_neon;
c->put_pixels_tab[0][ 4] = ff_put_rv40_qpel16_mc01_neon;
c->put_pixels_tab[0][ 5] = ff_put_rv40_qpel16_mc11_neon;
c->put_pixels_tab[0][ 6] = ff_put_rv40_qpel16_mc21_neon;
c->put_pixels_tab[0][ 7] = ff_put_rv40_qpel16_mc31_neon;
c->put_pixels_tab[0][ 9] = ff_put_rv40_qpel16_mc12_neon;
c->put_pixels_tab[0][10] = ff_put_rv40_qpel16_mc22_neon;
c->put_pixels_tab[0][11] = ff_put_rv40_qpel16_mc32_neon;
c->put_pixels_tab[0][12] = ff_put_rv40_qpel16_mc03_neon;
c->put_pixels_tab[0][13] = ff_put_rv40_qpel16_mc13_neon;
c->put_pixels_tab[0][14] = ff_put_rv40_qpel16_mc23_neon;
c->put_pixels_tab[0][15] = ff_put_rv40_qpel16_mc33_neon;
c->avg_pixels_tab[0][ 1] = ff_avg_rv40_qpel16_mc10_neon;
c->avg_pixels_tab[0][ 3] = ff_avg_rv40_qpel16_mc30_neon;
c->avg_pixels_tab[0][ 4] = ff_avg_rv40_qpel16_mc01_neon;
c->avg_pixels_tab[0][ 5] = ff_avg_rv40_qpel16_mc11_neon;
c->avg_pixels_tab[0][ 6] = ff_avg_rv40_qpel16_mc21_neon;
c->avg_pixels_tab[0][ 7] = ff_avg_rv40_qpel16_mc31_neon;
c->avg_pixels_tab[0][ 9] = ff_avg_rv40_qpel16_mc12_neon;
c->avg_pixels_tab[0][10] = ff_avg_rv40_qpel16_mc22_neon;
c->avg_pixels_tab[0][11] = ff_avg_rv40_qpel16_mc32_neon;
c->avg_pixels_tab[0][12] = ff_avg_rv40_qpel16_mc03_neon;
c->avg_pixels_tab[0][13] = ff_avg_rv40_qpel16_mc13_neon;
c->avg_pixels_tab[0][14] = ff_avg_rv40_qpel16_mc23_neon;
c->avg_pixels_tab[0][15] = ff_avg_rv40_qpel16_mc33_neon;
c->put_pixels_tab[1][ 1] = ff_put_rv40_qpel8_mc10_neon;
c->put_pixels_tab[1][ 3] = ff_put_rv40_qpel8_mc30_neon;
c->put_pixels_tab[1][ 4] = ff_put_rv40_qpel8_mc01_neon;
c->put_pixels_tab[1][ 5] = ff_put_rv40_qpel8_mc11_neon;
c->put_pixels_tab[1][ 6] = ff_put_rv40_qpel8_mc21_neon;
c->put_pixels_tab[1][ 7] = ff_put_rv40_qpel8_mc31_neon;
c->put_pixels_tab[1][ 9] = ff_put_rv40_qpel8_mc12_neon;
c->put_pixels_tab[1][10] = ff_put_rv40_qpel8_mc22_neon;
c->put_pixels_tab[1][11] = ff_put_rv40_qpel8_mc32_neon;
c->put_pixels_tab[1][12] = ff_put_rv40_qpel8_mc03_neon;
c->put_pixels_tab[1][13] = ff_put_rv40_qpel8_mc13_neon;
c->put_pixels_tab[1][14] = ff_put_rv40_qpel8_mc23_neon;
c->put_pixels_tab[1][15] = ff_put_rv40_qpel8_mc33_neon;
c->avg_pixels_tab[1][ 1] = ff_avg_rv40_qpel8_mc10_neon;
c->avg_pixels_tab[1][ 3] = ff_avg_rv40_qpel8_mc30_neon;
c->avg_pixels_tab[1][ 4] = ff_avg_rv40_qpel8_mc01_neon;
c->avg_pixels_tab[1][ 5] = ff_avg_rv40_qpel8_mc11_neon;
c->avg_pixels_tab[1][ 6] = ff_avg_rv40_qpel8_mc21_neon;
c->avg_pixels_tab[1][ 7] = ff_avg_rv40_qpel8_mc31_neon;
c->avg_pixels_tab[1][ 9] = ff_avg_rv40_qpel8_mc12_neon;
c->avg_pixels_tab[1][10] = ff_avg_rv40_qpel8_mc22_neon;
c->avg_pixels_tab[1][11] = ff_avg_rv40_qpel8_mc32_neon;
c->avg_pixels_tab[1][12] = ff_avg_rv40_qpel8_mc03_neon;
c->avg_pixels_tab[1][13] = ff_avg_rv40_qpel8_mc13_neon;
c->avg_pixels_tab[1][14] = ff_avg_rv40_qpel8_mc23_neon;
c->avg_pixels_tab[1][15] = ff_avg_rv40_qpel8_mc33_neon;
c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_neon;
c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_neon;
c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_neon;
c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_neon;
c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_16_neon;
c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_8_neon;
c->rv40_loop_filter_strength[0] = ff_rv40_h_loop_filter_strength_neon;
c->rv40_loop_filter_strength[1] = ff_rv40_v_loop_filter_strength_neon;
c->rv40_weak_loop_filter[0] = ff_rv40_h_weak_loop_filter_neon;
c->rv40_weak_loop_filter[1] = ff_rv40_v_weak_loop_filter_neon;
}
av_cold void ff_rv40dsp_init_arm(RV34DSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags))
rv40dsp_init_neon(c);
}
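/* Note on the table layout, inferred from the assignments above rather than
 * from the generic RV34/40 code: the first index selects the block size
 * (0 = 16x16, 1 = 8x8) and the second is the quarter-pel phase packed as
 * x + 4*y from the mc"xy" suffix, e.g. ff_put_rv40_qpel16_mc21_neon lands in
 * put_pixels_tab[0][6]. A caller would then dispatch roughly as
 *
 *     c->put_pixels_tab[size_idx][x + 4*y](dst, src, stride);
 *
 * Slots 0, 2 and 8 are not overwritten here, so those phases keep whatever the
 * generic initialization installed.
 */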

View File

@@ -0,0 +1,920 @@
/*
* Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
#include "neon.S"
.macro qpel_lowpass r0, r1, rc1, rc2, shift
vext.8 d25, \r0, \r1, #1 @ src[-1]
vext.8 d26, \r0, \r1, #4 @ src[ 2]
vext.8 d24, \r0, \r1, #5 @ src[ 3]
vaddl.u8 q9, d25, d26
vaddl.u8 q8, \r0, d24
vext.8 d27, \r0, \r1, #2 @ src[ 0]
vshl.s16 q12, q9, #2
vsub.s16 q8, q8, q9
vext.8 d28, \r0, \r1, #3 @ src[ 1]
vsub.s16 q8, q8, q12
vmlal.u8 q8, d27, \rc1
vmlal.u8 q8, d28, \rc2
vqrshrun.s16 \r0, q8, #\shift
.endm
.macro qpel_lowpass_x2 r0, r1, r2, r3, rc1, rc2, shift
vext.8 d25, \r0, \r1, #1 @ src[-1]
vext.8 d26, \r0, \r1, #4 @ src[ 2]
vext.8 d24, \r0, \r1, #5 @ src[ 3]
vaddl.u8 q9, d25, d26
vaddl.u8 q8, \r0, d24
vext.8 d29, \r0, \r1, #2 @ src[ 0]
vext.8 d28, \r0, \r1, #3 @ src[ 1]
vshl.s16 q10, q9, #2
vext.8 \r1, \r2, \r3, #1 @ src[-1]
vsub.s16 q8, q8, q9
vext.8 d22, \r2, \r3, #4 @ src[ 2]
vext.8 \r0, \r2, \r3, #5 @ src[ 3]
vaddl.u8 q13, \r1, d22
vaddl.u8 q12, \r2, \r0
vsub.s16 q8, q8, q10
vshl.s16 q9, q13, #2
vsub.s16 q12, q12, q13
vmlal.u8 q8, d29, \rc1
vmlal.u8 q8, d28, \rc2
vsub.s16 q12, q12, q9
vext.8 d26, \r2, \r3, #2 @ src[ 0]
vext.8 d27, \r2, \r3, #3 @ src[ 1]
vmlal.u8 q12, d26, \rc1
vmlal.u8 q12, d27, \rc2
vqrshrun.s16 \r0, q8, #\shift
vqrshrun.s16 \r2, q12, #\shift
.endm
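/* Both macros above apply the same 6-tap RV40 filter to one row of 8 samples.
 * A scalar sketch of a single output sample, reconstructed from the vext
 * offsets and the multiply-accumulate chain (the function name is
 * illustrative; av_clip_uint8() is the libavutil clip helper):
 *
 *     static uint8_t rv40_qpel_filter(const uint8_t *src, int c1, int c2, int shift)
 *     {
 *         const int sum = src[-2] + src[3] - 5 * (src[-1] + src[2])
 *                       + c1 * src[0] + c2 * src[1];
 *         return av_clip_uint8((sum + (1 << (shift - 1))) >> shift);
 *     }
 *
 * (c1, c2, shift) is (52, 20, 6), (20, 52, 6) or (20, 20, 5) depending on the
 * sub-pel phase; see the vmov.i8 d0/d1 setup in the mcXY entry points below.
 * vqrshrun.s16 supplies the rounding add and the clip to unsigned 8 bits.
 */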
.macro rv40_qpel8_h shift
function put_rv40_qpel8_h_lp_packed_s\shift\()_neon
1:
vld1.8 {q2}, [r1], r2
vld1.8 {q3}, [r1], r2
qpel_lowpass_x2 d4, d5, d6, d7, d0, d1, \shift
vst1.8 {d4}, [r12,:64]!
vst1.8 {d6}, [r12,:64]!
subs r3, r3, #2
bgt 1b
vld1.8 {q2}, [r1]
qpel_lowpass d4, d5, d0, d1, \shift
vst1.8 {d4}, [r12,:64]!
bx lr
endfunc
.endm
.macro rv40_qpel8_v shift, type
function \type\()_rv40_qpel8_v_lp_packed_s\shift\()_neon
vld1.64 {d2}, [r1,:64]!
vld1.64 {d3}, [r1,:64]!
vld1.64 {d4}, [r1,:64]!
vld1.64 {d5}, [r1,:64]!
vld1.64 {d6}, [r1,:64]!
vld1.64 {d7}, [r1,:64]!
vld1.64 {d8}, [r1,:64]!
vld1.64 {d9}, [r1,:64]!
vld1.64 {d10}, [r1,:64]!
vld1.64 {d11}, [r1,:64]!
vld1.64 {d12}, [r1,:64]!
vld1.64 {d13}, [r1,:64]!
vld1.64 {d14}, [r1,:64]!
transpose_8x8 d2, d3, d4, d5, d6, d7, d8, d9
transpose_8x8 d10, d11, d12, d13, d14, d15, d30, d31
qpel_lowpass_x2 d2, d10, d3, d11, d0, d1, \shift
qpel_lowpass_x2 d4, d12, d5, d13, d0, d1, \shift
qpel_lowpass_x2 d6, d14, d7, d15, d0, d1, \shift
qpel_lowpass_x2 d8, d30, d9, d31, d0, d1, \shift
transpose_8x8 d2, d3, d4, d5, d6, d7, d8, d9
.ifc \type,avg
vld1.64 d12, [r0,:64], r2
vld1.64 d13, [r0,:64], r2
vld1.64 d14, [r0,:64], r2
vld1.64 d15, [r0,:64], r2
vld1.64 d16, [r0,:64], r2
vld1.64 d17, [r0,:64], r2
vld1.64 d18, [r0,:64], r2
vld1.64 d19, [r0,:64], r2
sub r0, r0, r2, lsl #3
vrhadd.u8 q1, q1, q6
vrhadd.u8 q2, q2, q7
vrhadd.u8 q3, q3, q8
vrhadd.u8 q4, q4, q9
.endif
vst1.64 d2, [r0,:64], r2
vst1.64 d3, [r0,:64], r2
vst1.64 d4, [r0,:64], r2
vst1.64 d5, [r0,:64], r2
vst1.64 d6, [r0,:64], r2
vst1.64 d7, [r0,:64], r2
vst1.64 d8, [r0,:64], r2
vst1.64 d9, [r0,:64], r2
bx lr
endfunc
.endm
rv40_qpel8_h 5
rv40_qpel8_h 6
.macro rv40_qpel type
function \type\()_rv40_qpel8_h_lowpass_neon
.ifc \type,avg
mov r12, r0
.endif
1:
vld1.8 {q2}, [r1], r2
vld1.8 {q3}, [r1], r2
qpel_lowpass_x2 d4, d5, d6, d7, d0, d1, 6
.ifc \type,avg
vld1.8 {d3}, [r12,:64], r2
vld1.8 {d16}, [r12,:64], r2
vrhadd.u8 d4, d4, d3
vrhadd.u8 d6, d6, d16
.endif
vst1.8 {d4}, [r0,:64], r2
vst1.8 {d6}, [r0,:64], r2
subs r3, r3, #2
bgt 1b
bx lr
endfunc
function \type\()_rv40_qpel8_v_lowpass_neon
vld1.64 {d2}, [r1], r2
vld1.64 {d3}, [r1], r2
vld1.64 {d4}, [r1], r2
vld1.64 {d5}, [r1], r2
vld1.64 {d6}, [r1], r2
vld1.64 {d7}, [r1], r2
vld1.64 {d8}, [r1], r2
vld1.64 {d9}, [r1], r2
vld1.64 {d10}, [r1], r2
vld1.64 {d11}, [r1], r2
vld1.64 {d12}, [r1], r2
vld1.64 {d13}, [r1], r2
vld1.64 {d14}, [r1]
transpose_8x8 d2, d3, d4, d5, d6, d7, d8, d9
transpose_8x8 d10, d11, d12, d13, d14, d15, d30, d31
qpel_lowpass_x2 d2, d10, d3, d11, d0, d1, 6
qpel_lowpass_x2 d4, d12, d5, d13, d0, d1, 6
qpel_lowpass_x2 d6, d14, d7, d15, d0, d1, 6
qpel_lowpass_x2 d8, d30, d9, d31, d0, d1, 6
transpose_8x8 d2, d3, d4, d5, d6, d7, d8, d9
.ifc \type,avg
vld1.64 d12, [r0,:64], r2
vld1.64 d13, [r0,:64], r2
vld1.64 d14, [r0,:64], r2
vld1.64 d15, [r0,:64], r2
vld1.64 d16, [r0,:64], r2
vld1.64 d17, [r0,:64], r2
vld1.64 d18, [r0,:64], r2
vld1.64 d19, [r0,:64], r2
sub r0, r0, r2, lsl #3
vrhadd.u8 q1, q1, q6
vrhadd.u8 q2, q2, q7
vrhadd.u8 q3, q3, q8
vrhadd.u8 q4, q4, q9
.endif
vst1.64 d2, [r0,:64], r2
vst1.64 d3, [r0,:64], r2
vst1.64 d4, [r0,:64], r2
vst1.64 d5, [r0,:64], r2
vst1.64 d6, [r0,:64], r2
vst1.64 d7, [r0,:64], r2
vst1.64 d8, [r0,:64], r2
vst1.64 d9, [r0,:64], r2
bx lr
endfunc
rv40_qpel8_v 5, \type
rv40_qpel8_v 6, \type
function ff_\type\()_rv40_qpel8_mc10_neon, export=1
sub r1, r1, #2
mov r3, #8
vmov.i8 d0, #52
vmov.i8 d1, #20
b \type\()_rv40_qpel8_h_lowpass_neon
endfunc
function ff_\type\()_rv40_qpel8_mc30_neon, export=1
sub r1, r1, #2
mov r3, #8
vmov.i8 d0, #20
vmov.i8 d1, #52
b \type\()_rv40_qpel8_h_lowpass_neon
endfunc
function ff_\type\()_rv40_qpel8_mc01_neon, export=1
push {r4, lr}
vpush {d8-d15}
sub r1, r1, r2, lsl #1
vmov.i8 d0, #52
vmov.i8 d1, #20
bl \type\()_rv40_qpel8_v_lowpass_neon
vpop {d8-d15}
pop {r4, pc}
endfunc
function ff_\type\()_rv40_qpel8_mc11_neon, export=1
push {r4, lr}
vpush {d8-d15}
sub sp, sp, #14*8
add r12, sp, #7
bic r12, r12, #7
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, #12
vmov.i8 d0, #52
vmov.i8 d1, #20
bl put_rv40_qpel8_h_lp_packed_s6_neon
add r1, sp, #7
bic r1, r1, #7
bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
add sp, sp, #14*8
vpop {d8-d15}
pop {r4, pc}
endfunc
function ff_\type\()_rv40_qpel8_mc21_neon, export=1
push {r4, lr}
vpush {d8-d15}
sub sp, sp, #14*8
add r12, sp, #7
bic r12, r12, #7
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, #12
vmov.i8 d0, #20
vmov.i8 d1, #20
bl put_rv40_qpel8_h_lp_packed_s5_neon
add r1, sp, #7
bic r1, r1, #7
vmov.i8 d0, #52
bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
add sp, sp, #14*8
vpop {d8-d15}
pop {r4, pc}
endfunc
function ff_\type\()_rv40_qpel8_mc31_neon, export=1
push {r4, lr}
vpush {d8-d15}
sub sp, sp, #14*8
add r12, sp, #7
bic r12, r12, #7
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, #12
vmov.i8 d0, #20
vmov.i8 d1, #52
bl put_rv40_qpel8_h_lp_packed_s6_neon
add r1, sp, #7
bic r1, r1, #7
vswp d0, d1
bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
add sp, sp, #14*8
vpop {d8-d15}
pop {r4, pc}
endfunc
function ff_\type\()_rv40_qpel8_mc12_neon, export=1
push {r4, lr}
vpush {d8-d15}
sub sp, sp, #14*8
add r12, sp, #7
bic r12, r12, #7
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, #12
vmov.i8 d0, #52
vmov.i8 d1, #20
bl put_rv40_qpel8_h_lp_packed_s6_neon
add r1, sp, #7
bic r1, r1, #7
vmov.i8 d0, #20
bl \type\()_rv40_qpel8_v_lp_packed_s5_neon
add sp, sp, #14*8
vpop {d8-d15}
pop {r4, pc}
endfunc
function ff_\type\()_rv40_qpel8_mc22_neon, export=1
push {r4, lr}
vpush {d8-d15}
sub sp, sp, #14*8
add r12, sp, #7
bic r12, r12, #7
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, #12
vmov.i8 d0, #20
vmov.i8 d1, #20
bl put_rv40_qpel8_h_lp_packed_s5_neon
add r1, sp, #7
bic r1, r1, #7
bl \type\()_rv40_qpel8_v_lp_packed_s5_neon
add sp, sp, #14*8
vpop {d8-d15}
pop {r4, pc}
endfunc
function ff_\type\()_rv40_qpel8_mc32_neon, export=1
push {r4, lr}
vpush {d8-d15}
sub sp, sp, #14*8
add r12, sp, #7
bic r12, r12, #7
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, #12
vmov.i8 d0, #20
vmov.i8 d1, #52
bl put_rv40_qpel8_h_lp_packed_s6_neon
add r1, sp, #7
bic r1, r1, #7
vmov.i8 d1, #20
bl \type\()_rv40_qpel8_v_lp_packed_s5_neon
add sp, sp, #14*8
vpop {d8-d15}
pop {r4, pc}
endfunc
function ff_\type\()_rv40_qpel8_mc03_neon, export=1
push {r4, lr}
vpush {d8-d15}
sub r1, r1, r2, lsl #1
vmov.i8 d0, #20
vmov.i8 d1, #52
bl \type\()_rv40_qpel8_v_lowpass_neon
vpop {d8-d15}
pop {r4, pc}
endfunc
function ff_\type\()_rv40_qpel8_mc33_neon, export=1
mov r3, #8
b X(ff_\type\()_pixels8_xy2_neon)
endfunc
function ff_\type\()_rv40_qpel8_mc13_neon, export=1
push {r4, lr}
vpush {d8-d15}
sub sp, sp, #14*8
add r12, sp, #7
bic r12, r12, #7
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, #12
vmov.i8 d0, #52
vmov.i8 d1, #20
bl put_rv40_qpel8_h_lp_packed_s6_neon
add r1, sp, #7
bic r1, r1, #7
vswp d0, d1
bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
add sp, sp, #14*8
vpop {d8-d15}
pop {r4, pc}
endfunc
function ff_\type\()_rv40_qpel8_mc23_neon, export=1
push {r4, lr}
vpush {d8-d15}
sub sp, sp, #14*8
add r12, sp, #7
bic r12, r12, #7
sub r1, r1, r2, lsl #1
sub r1, r1, #2
mov r3, #12
vmov.i8 d0, #20
vmov.i8 d1, #20
bl put_rv40_qpel8_h_lp_packed_s5_neon
add r1, sp, #7
bic r1, r1, #7
vmov.i8 d1, #52
bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
add sp, sp, #14*8
vpop {d8-d15}
pop {r4, pc}
endfunc
function ff_\type\()_rv40_qpel16_mc10_neon, export=1
vmov.i8 d0, #52
vmov.i8 d1, #20
.L\type\()_rv40_qpel16_h:
push {r1, lr}
sub r1, r1, #2
mov r3, #16
bl \type\()_rv40_qpel8_h_lowpass_neon
pop {r1, lr}
sub r0, r0, r2, lsl #4
add r0, r0, #8
add r1, r1, #6
mov r3, #16
b \type\()_rv40_qpel8_h_lowpass_neon
endfunc
function ff_\type\()_rv40_qpel16_mc30_neon, export=1
vmov.i8 d0, #20
vmov.i8 d1, #52
b .L\type\()_rv40_qpel16_h
endfunc
function ff_\type\()_rv40_qpel16_mc01_neon, export=1
vmov.i8 d0, #52
vmov.i8 d1, #20
.L\type\()_rv40_qpel16_v:
sub r1, r1, r2, lsl #1
push {r1, lr}
vpush {d8-d15}
bl \type\()_rv40_qpel8_v_lowpass_neon
sub r1, r1, r2, lsl #2
bl \type\()_rv40_qpel8_v_lowpass_neon
ldr r1, [sp, #64]
sub r0, r0, r2, lsl #4
add r0, r0, #8
add r1, r1, #8
bl \type\()_rv40_qpel8_v_lowpass_neon
sub r1, r1, r2, lsl #2
bl \type\()_rv40_qpel8_v_lowpass_neon
vpop {d8-d15}
pop {r1, pc}
endfunc
function ff_\type\()_rv40_qpel16_mc11_neon, export=1
sub r1, r1, r2, lsl #1
sub r1, r1, #2
push {r1, lr}
vpush {d8-d15}
sub sp, sp, #44*8
add r12, sp, #7
bic r12, r12, #7
mov r3, #20
vmov.i8 d0, #52
vmov.i8 d1, #20
bl put_rv40_qpel8_h_lp_packed_s6_neon
ldr r1, [sp, #416]
add r1, r1, #8
mov r3, #20
bl put_rv40_qpel8_h_lp_packed_s6_neon
.L\type\()_rv40_qpel16_v_s6:
add r1, sp, #7
bic r1, r1, #7
bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
sub r1, r1, #40
bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
sub r0, r0, r2, lsl #4
add r0, r0, #8
bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
sub r1, r1, #40
bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
add sp, sp, #44*8
vpop {d8-d15}
pop {r1, pc}
endfunc
function ff_\type\()_rv40_qpel16_mc21_neon, export=1
sub r1, r1, r2, lsl #1
sub r1, r1, #2
push {r1, lr}
vpush {d8-d15}
sub sp, sp, #44*8
add r12, sp, #7
bic r12, r12, #7
mov r3, #20
vmov.i8 d0, #20
vmov.i8 d1, #20
bl put_rv40_qpel8_h_lp_packed_s5_neon
ldr r1, [sp, #416]
add r1, r1, #8
mov r3, #20
bl put_rv40_qpel8_h_lp_packed_s5_neon
vmov.i8 d0, #52
b .L\type\()_rv40_qpel16_v_s6
endfunc
function ff_\type\()_rv40_qpel16_mc31_neon, export=1
sub r1, r1, r2, lsl #1
sub r1, r1, #2
push {r1, lr}
vpush {d8-d15}
sub sp, sp, #44*8
add r12, sp, #7
bic r12, r12, #7
mov r3, #20
vmov.i8 d0, #20
vmov.i8 d1, #52
bl put_rv40_qpel8_h_lp_packed_s6_neon
ldr r1, [sp, #416]
add r1, r1, #8
mov r3, #20
bl put_rv40_qpel8_h_lp_packed_s6_neon
vswp d0, d1
b .L\type\()_rv40_qpel16_v_s6
endfunc
function ff_\type\()_rv40_qpel16_mc12_neon, export=1
sub r1, r1, r2, lsl #1
sub r1, r1, #2
push {r1, lr}
vpush {d8-d15}
sub sp, sp, #44*8
add r12, sp, #7
bic r12, r12, #7
mov r3, #20
vmov.i8 d0, #52
vmov.i8 d1, #20
bl put_rv40_qpel8_h_lp_packed_s6_neon
ldr r1, [sp, #416]
add r1, r1, #8
mov r3, #20
bl put_rv40_qpel8_h_lp_packed_s6_neon
vmov.i8 d0, #20
.L\type\()_rv40_qpel16_v_s5:
add r1, sp, #7
bic r1, r1, #7
bl \type\()_rv40_qpel8_v_lp_packed_s5_neon
sub r1, r1, #40
bl \type\()_rv40_qpel8_v_lp_packed_s5_neon
sub r0, r0, r2, lsl #4
add r0, r0, #8
bl \type\()_rv40_qpel8_v_lp_packed_s5_neon
sub r1, r1, #40
bl \type\()_rv40_qpel8_v_lp_packed_s5_neon
add sp, sp, #44*8
vpop {d8-d15}
pop {r1, pc}
endfunc
function ff_\type\()_rv40_qpel16_mc22_neon, export=1
sub r1, r1, r2, lsl #1
sub r1, r1, #2
push {r1, lr}
vpush {d8-d15}
sub sp, sp, #44*8
add r12, sp, #7
bic r12, r12, #7
mov r3, #20
vmov.i8 d0, #20
vmov.i8 d1, #20
bl put_rv40_qpel8_h_lp_packed_s5_neon
ldr r1, [sp, #416]
add r1, r1, #8
mov r3, #20
bl put_rv40_qpel8_h_lp_packed_s5_neon
b .L\type\()_rv40_qpel16_v_s5
endfunc
function ff_\type\()_rv40_qpel16_mc32_neon, export=1
sub r1, r1, r2, lsl #1
sub r1, r1, #2
push {r1, lr}
vpush {d8-d15}
sub sp, sp, #44*8
add r12, sp, #7
bic r12, r12, #7
mov r3, #20
vmov.i8 d0, #20
vmov.i8 d1, #52
bl put_rv40_qpel8_h_lp_packed_s6_neon
ldr r1, [sp, #416]
add r1, r1, #8
mov r3, #20
bl put_rv40_qpel8_h_lp_packed_s6_neon
vmov.i8 d1, #20
b .L\type\()_rv40_qpel16_v_s5
endfunc
function ff_\type\()_rv40_qpel16_mc03_neon, export=1
vmov.i8 d0, #20
vmov.i8 d1, #52
b .L\type\()_rv40_qpel16_v
endfunc
function ff_\type\()_rv40_qpel16_mc13_neon, export=1
sub r1, r1, r2, lsl #1
sub r1, r1, #2
push {r1, lr}
vpush {d8-d15}
sub sp, sp, #44*8
add r12, sp, #7
bic r12, r12, #7
mov r3, #20
vmov.i8 d0, #52
vmov.i8 d1, #20
bl put_rv40_qpel8_h_lp_packed_s6_neon
ldr r1, [sp, #416]
add r1, r1, #8
mov r3, #20
bl put_rv40_qpel8_h_lp_packed_s6_neon
vswp d0, d1
b .L\type\()_rv40_qpel16_v_s6
endfunc
function ff_\type\()_rv40_qpel16_mc23_neon, export=1
sub r1, r1, r2, lsl #1
sub r1, r1, #2
push {r1, lr}
vpush {d8-d15}
sub sp, sp, #44*8
add r12, sp, #7
bic r12, r12, #7
mov r3, #20
vmov.i8 d0, #20
vmov.i8 d1, #20
bl put_rv40_qpel8_h_lp_packed_s5_neon
ldr r1, [sp, #416]
add r1, r1, #8
mov r3, #20
bl put_rv40_qpel8_h_lp_packed_s5_neon
vmov.i8 d1, #52
b .L\type\()_rv40_qpel16_v_s6
endfunc
function ff_\type\()_rv40_qpel16_mc33_neon, export=1
mov r3, #16
b X(ff_\type\()_pixels16_xy2_neon)
endfunc
.endm
rv40_qpel put
rv40_qpel avg
.macro rv40_weight
vmovl.u8 q8, d2
vmovl.u8 q9, d3
vmovl.u8 q10, d4
vmovl.u8 q11, d5
vmull.u16 q2, d16, d0[2]
vmull.u16 q3, d17, d0[2]
vmull.u16 q8, d18, d0[2]
vmull.u16 q9, d19, d0[2]
vmull.u16 q12, d20, d0[0]
vmull.u16 q13, d21, d0[0]
vmull.u16 q14, d22, d0[0]
vmull.u16 q15, d23, d0[0]
vshrn.i32 d4, q2, #9
vshrn.i32 d5, q3, #9
vshrn.i32 d6, q8, #9
vshrn.i32 d7, q9, #9
vshrn.i32 d16, q12, #9
vshrn.i32 d17, q13, #9
vshrn.i32 d18, q14, #9
vshrn.i32 d19, q15, #9
vadd.u16 q2, q2, q8
vadd.u16 q3, q3, q9
vrshrn.i16 d2, q2, #5
vrshrn.i16 d3, q3, #5
.endm
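/* Scalar sketch of one row of the weighted average computed by the macro
 * above, reconstructed from the shift amounts and the operand order of the
 * vmull instructions (src1 is scaled by w2, src2 by w1); the helper name and
 * the explicit row length n are illustrative:
 *
 *     static void rv40_weight_row(uint8_t *dst, const uint8_t *src1,
 *                                 const uint8_t *src2, int w1, int w2, int n)
 *     {
 *         for (int i = 0; i < n; i++)
 *             dst[i] = (((w2 * src1[i]) >> 9) + ((w1 * src2[i]) >> 9) + 0x10) >> 5;
 *     }
 *
 * The exported functions below run this over 16x16 or 8x8 blocks with the
 * given stride; vrshrn.i16 #5 supplies the +0x10 rounding term.
 */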
/* void ff_rv40_weight_func_16_neon(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int w1, int w2, ptrdiff_t stride) */
function ff_rv40_weight_func_16_neon, export=1
ldr r12, [sp]
vmov d0, r3, r12
ldr r12, [sp, #4]
mov r3, #16
1:
vld1.8 {q1}, [r1,:128], r12
vld1.8 {q2}, [r2,:128], r12
rv40_weight
vst1.8 {q1}, [r0,:128], r12
subs r3, r3, #1
bne 1b
bx lr
endfunc
/* void ff_rv40_weight_func_8_neon(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int w1, int w2, ptrdiff_t stride) */
function ff_rv40_weight_func_8_neon, export=1
ldr r12, [sp]
vmov d0, r3, r12
ldr r12, [sp, #4]
mov r3, #8
1:
vld1.8 {d2}, [r1,:64], r12
vld1.8 {d3}, [r1,:64], r12
vld1.8 {d4}, [r2,:64], r12
vld1.8 {d5}, [r2,:64], r12
rv40_weight
vst1.8 {d2}, [r0,:64], r12
vst1.8 {d3}, [r0,:64], r12
subs r3, r3, #2
bne 1b
bx lr
endfunc
function ff_rv40_h_loop_filter_strength_neon, export=1
pkhbt r2, r3, r2, lsl #18
ldr r3, [r0]
ldr_dpre r12, r0, r1
teq r3, r12
beq 1f
sub r0, r0, r1, lsl #1
vld1.32 {d4[]}, [r0,:32], r1 @ -3
vld1.32 {d0[]}, [r0,:32], r1 @ -2
vld1.32 {d4[1]}, [r0,:32], r1 @ -1
vld1.32 {d5[]}, [r0,:32], r1 @ 0
vld1.32 {d1[]}, [r0,:32], r1 @ 1
vld1.32 {d5[0]}, [r0,:32], r1 @ 2
vpaddl.u8 q8, q0 @ -2, -2, -2, -2, 1, 1, 1, 1
vpaddl.u8 q9, q2 @ -3, -3, -1, -1, 2, 2, 0, 0
vdup.32 d30, r2 @ beta2, beta << 2
vpadd.u16 d16, d16, d17 @ -2, -2, 1, 1
vpadd.u16 d18, d18, d19 @ -3, -1, 2, 0
vabd.u16 d16, d18, d16
vclt.u16 d16, d16, d30
ldrd r2, r3, [sp, #4]
vmovl.u16 q12, d16
vtrn.16 d16, d17
vshr.u32 q12, q12, #15
ldr r0, [sp]
vst1.32 {d24[1]}, [r2,:32]
vst1.32 {d25[1]}, [r3,:32]
cmp r0, #0
it eq
bxeq lr
vand d18, d16, d17
vtrn.32 d18, d19
vand d18, d18, d19
vmov.u16 r0, d18[0]
bx lr
1:
ldrd r2, r3, [sp, #4]
mov r0, #0
str r0, [r2]
str r0, [r3]
bx lr
endfunc
function ff_rv40_v_loop_filter_strength_neon, export=1
sub r0, r0, #3
pkhbt r2, r3, r2, lsl #18
vld1.8 {d0}, [r0], r1
vld1.8 {d1}, [r0], r1
vld1.8 {d2}, [r0], r1
vld1.8 {d3}, [r0], r1
vaddl.u8 q0, d0, d1
vaddl.u8 q1, d2, d3
vdup.32 q15, r2
vadd.u16 q0, q0, q1 @ -3, -2, -1, 0, 1, 2
vext.16 q1, q0, q0, #1 @ -2, -1, 0, 1, 2
vabd.u16 q0, q1, q0
vclt.u16 q0, q0, q15
ldrd r2, r3, [sp, #4]
vmovl.u16 q1, d0
vext.16 d1, d0, d1, #3
vshr.u32 q1, q1, #15
ldr r0, [sp]
vst1.32 {d2[1]}, [r2,:32]
vst1.32 {d3[1]}, [r3,:32]
cmp r0, #0
it eq
bxeq lr
vand d0, d0, d1
vtrn.16 d0, d1
vand d0, d0, d1
vmov.u16 r0, d0[0]
bx lr
endfunc
.macro rv40_weak_loop_filter
vdup.16 d30, r2 @ filter_p1
vdup.16 d31, r3 @ filter_q1
ldrd r2, r3, [sp]
vdup.16 d28, r2 @ alpha
vdup.16 d29, r3 @ beta
ldr r12, [sp, #8]
vdup.16 d25, r12 @ lim_p0q0
ldrd r2, r3, [sp, #12]
vsubl.u8 q9, d5, d4 @ x, t
vabdl.u8 q8, d5, d4 @ x, abs(t)
vneg.s16 q15, q15
vceq.i16 d16, d19, #0 @ !t
vshl.s16 d19, d19, #2 @ t << 2
vmul.u16 d18, d17, d28 @ alpha * abs(t)
vand d24, d30, d31 @ filter_p1 & filter_q1
vsubl.u8 q1, d0, d4 @ p1p2, p1p0
vsubl.u8 q3, d1, d5 @ q1q2, q1q0
vmov.i16 d22, #3
vshr.u16 d18, d18, #7
vadd.i16 d22, d22, d24 @ 3 - (filter_p1 & filter_q1)
vsubl.u8 q10, d0, d1 @ src[-2] - src[1]
vcle.u16 d18, d18, d22
vand d20, d20, d24
vneg.s16 d23, d25 @ -lim_p0q0
vadd.s16 d19, d19, d20
vbic d16, d18, d16 @ t && u <= 3 - (fp1 & fq1)
vtrn.32 d4, d5 @ -3, 2, -1, 0
vrshr.s16 d19, d19, #3
vmov d28, d29 @ beta
vswp d3, d6 @ q1q2, p1p0
vmin.s16 d19, d19, d25
vand d30, d30, d16
vand d31, d31, d16
vadd.s16 q10, q1, q3 @ p1p2 + p1p0, q1q2 + q1q0
vmax.s16 d19, d19, d23 @ diff
vabs.s16 q1, q1 @ abs(p1p2), abs(q1q2)
vand d18, d19, d16 @ diff
vcle.u16 q1, q1, q14
vneg.s16 d19, d18 @ -diff
vdup.16 d26, r3 @ lim_p1
vaddw.u8 q2, q9, d5 @ src[-1]+diff, src[0]-diff
vhsub.s16 q11, q10, q9
vand q1, q1, q15
vqmovun.s16 d4, q2 @ -1, 0
vand q9, q11, q1
vdup.16 d27, r2 @ lim_q1
vneg.s16 q9, q9
vneg.s16 q14, q13
vmin.s16 q9, q9, q13
vtrn.32 d0, d1 @ -2, 1, -2, 1
vmax.s16 q9, q9, q14
vaddw.u8 q3, q9, d0
vqmovun.s16 d5, q3 @ -2, 1
.endm
function ff_rv40_h_weak_loop_filter_neon, export=1
sub r0, r0, r1, lsl #1
sub r0, r0, r1
vld1.32 {d4[]}, [r0,:32], r1
vld1.32 {d0[]}, [r0,:32], r1
vld1.32 {d4[1]}, [r0,:32], r1
vld1.32 {d5[]}, [r0,:32], r1
vld1.32 {d1[]}, [r0,:32], r1
vld1.32 {d5[0]}, [r0,:32]
sub r0, r0, r1, lsl #2
rv40_weak_loop_filter
vst1.32 {d5[0]}, [r0,:32], r1
vst1.32 {d4[0]}, [r0,:32], r1
vst1.32 {d4[1]}, [r0,:32], r1
vst1.32 {d5[1]}, [r0,:32], r1
bx lr
endfunc
function ff_rv40_v_weak_loop_filter_neon, export=1
sub r12, r0, #3
sub r0, r0, #2
vld1.8 {d4}, [r12], r1
vld1.8 {d5}, [r12], r1
vld1.8 {d2}, [r12], r1
vld1.8 {d3}, [r12], r1
vtrn.16 q2, q1
vtrn.8 d4, d5
vtrn.8 d2, d3
vrev64.32 d5, d5
vtrn.32 q2, q1
vdup.32 d0, d3[0]
vdup.32 d1, d2[0]
rv40_weak_loop_filter
vtrn.32 q2, q3
vswp d4, d5
vst4.8 {d4[0],d5[0],d6[0],d7[0]}, [r0], r1
vst4.8 {d4[1],d5[1],d6[1],d7[1]}, [r0], r1
vst4.8 {d4[2],d5[2],d6[2],d7[2]}, [r0], r1
vst4.8 {d4[3],d5[3],d6[3],d7[3]}, [r0], r1
bx lr
endfunc

View File

@@ -0,0 +1,73 @@
/*
* Copyright (c) 2012 Mans Rullgard
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/arm/cpu.h"
#include "libavutil/attributes.h"
#include "libavcodec/sbrdsp.h"
void ff_sbr_sum64x5_neon(float *z);
float ff_sbr_sum_square_neon(float (*x)[2], int n);
void ff_sbr_neg_odd_64_neon(float *x);
void ff_sbr_qmf_pre_shuffle_neon(float *z);
void ff_sbr_qmf_post_shuffle_neon(float W[32][2], const float *z);
void ff_sbr_qmf_deint_neg_neon(float *v, const float *src);
void ff_sbr_qmf_deint_bfly_neon(float *v, const float *src0, const float *src1);
void ff_sbr_hf_g_filt_neon(float (*Y)[2], const float (*X_high)[40][2],
const float *g_filt, int m_max, intptr_t ixh);
void ff_sbr_hf_gen_neon(float (*X_high)[2], const float (*X_low)[2],
const float alpha0[2], const float alpha1[2],
float bw, int start, int end);
void ff_sbr_autocorrelate_neon(const float x[40][2], float phi[3][2][2]);
void ff_sbr_hf_apply_noise_0_neon(float Y[64][2], const float *s_m,
const float *q_filt, int noise,
int kx, int m_max);
void ff_sbr_hf_apply_noise_1_neon(float Y[64][2], const float *s_m,
const float *q_filt, int noise,
int kx, int m_max);
void ff_sbr_hf_apply_noise_2_neon(float Y[64][2], const float *s_m,
const float *q_filt, int noise,
int kx, int m_max);
void ff_sbr_hf_apply_noise_3_neon(float Y[64][2], const float *s_m,
const float *q_filt, int noise,
int kx, int m_max);
av_cold void ff_sbrdsp_init_arm(SBRDSPContext *s)
{
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags)) {
s->sum64x5 = ff_sbr_sum64x5_neon;
s->sum_square = ff_sbr_sum_square_neon;
s->neg_odd_64 = ff_sbr_neg_odd_64_neon;
s->qmf_pre_shuffle = ff_sbr_qmf_pre_shuffle_neon;
s->qmf_post_shuffle = ff_sbr_qmf_post_shuffle_neon;
s->qmf_deint_neg = ff_sbr_qmf_deint_neg_neon;
s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_neon;
s->hf_g_filt = ff_sbr_hf_g_filt_neon;
s->hf_gen = ff_sbr_hf_gen_neon;
s->autocorrelate = ff_sbr_autocorrelate_neon;
s->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_neon;
s->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_neon;
s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_neon;
s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_neon;
}
}

View File

@@ -0,0 +1,411 @@
/*
* Copyright (c) 2012 Mans Rullgard
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
function ff_sbr_sum64x5_neon, export=1
push {lr}
add r1, r0, # 64*4
add r2, r0, #128*4
add r3, r0, #192*4
add lr, r0, #256*4
mov r12, #64
1:
vld1.32 {q0}, [r0,:128]
vld1.32 {q1}, [r1,:128]!
vadd.f32 q0, q0, q1
vld1.32 {q2}, [r2,:128]!
vadd.f32 q0, q0, q2
vld1.32 {q3}, [r3,:128]!
vadd.f32 q0, q0, q3
vld1.32 {q8}, [lr,:128]!
vadd.f32 q0, q0, q8
vst1.32 {q0}, [r0,:128]!
subs r12, #4
bgt 1b
pop {pc}
endfunc
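/* Scalar sketch of ff_sbr_sum64x5_neon, reconstructed from the 64/128/192/256
 * float offsets and the 64-element loop above (the function name is
 * illustrative):
 *
 *     static void sbr_sum64x5(float *z)
 *     {
 *         for (int k = 0; k < 64; k++)
 *             z[k] += z[k + 64] + z[k + 128] + z[k + 192] + z[k + 256];
 *     }
 */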
function ff_sbr_sum_square_neon, export=1
vmov.f32 q0, #0.0
1:
vld1.32 {q1}, [r0,:128]!
vmla.f32 q0, q1, q1
subs r1, r1, #2
bgt 1b
vadd.f32 d0, d0, d1
vpadd.f32 d0, d0, d0
NOVFP vmov.32 r0, d0[0]
bx lr
endfunc
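/* Scalar sketch of ff_sbr_sum_square_neon: n counts complex (two-float)
 * samples, which is why the loop above decrements r1 by 2 per 4-float load;
 * on a softfloat ABI the NOVFP vmov.32 moves the result into r0:
 *
 *     static float sbr_sum_square(float (*x)[2], int n)
 *     {
 *         float sum = 0.0f;
 *         for (int i = 0; i < n; i++)
 *             sum += x[i][0] * x[i][0] + x[i][1] * x[i][1];
 *         return sum;
 *     }
 */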
function ff_sbr_neg_odd_64_neon, export=1
mov r1, r0
vmov.i32 q8, #1<<31
vld2.32 {q0,q1}, [r0,:128]!
veor q1, q1, q8
vld2.32 {q2,q3}, [r0,:128]!
.rept 3
vst2.32 {q0,q1}, [r1,:128]!
veor q3, q3, q8
vld2.32 {q0,q1}, [r0,:128]!
vst2.32 {q2,q3}, [r1,:128]!
veor q1, q1, q8
vld2.32 {q2,q3}, [r0,:128]!
.endr
veor q3, q3, q8
vst2.32 {q0,q1}, [r1,:128]!
vst2.32 {q2,q3}, [r1,:128]!
bx lr
endfunc
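/* Scalar sketch of ff_sbr_neg_odd_64_neon: the vld2/veor/vst2 sequence above
 * de-interleaves 64 floats and flips the sign bit (1<<31) of every odd-indexed
 * element:
 *
 *     static void sbr_neg_odd_64(float *x)
 *     {
 *         for (int i = 1; i < 64; i += 2)
 *             x[i] = -x[i];
 *     }
 */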
function ff_sbr_qmf_pre_shuffle_neon, export=1
add r1, r0, #60*4
add r2, r0, #64*4
vld1.32 {d0}, [r0,:64]!
vst1.32 {d0}, [r2,:64]!
mov r3, #-16
mov r12, #24
vmov.i32 q8, #1<<31
vld1.32 {q0}, [r1,:128], r3
vld1.32 {d2}, [r0,:64]!
1:
vld1.32 {d3,d4}, [r0,:128]!
vrev64.32 q0, q0
vld1.32 {q9}, [r1,:128], r3
veor q0, q0, q8
vld1.32 {d5,d6}, [r0,:128]!
vswp d0, d1
vrev64.32 q9, q9
vst2.32 {q0,q1}, [r2,:64]!
vmov q10, q2
veor q9, q9, q8
vmov d2, d6
vswp d18, d19
vld1.32 {q0}, [r1,:128], r3
vst2.32 {q9,q10}, [r2,:64]!
subs r12, r12, #8
bgt 1b
vld1.32 {d3,d4}, [r0,:128]!
vrev64.32 q0, q0
vld1.32 {q9}, [r1,:128], r3
veor q0, q0, q8
vld1.32 {d5}, [r0,:64]!
vswp d0, d1
vrev64.32 q9, q9
vst2.32 {q0,q1}, [r2,:64]!
vswp d4, d5
veor q1, q9, q8
vst2.32 {d3,d5}, [r2,:64]!
vst2.32 {d2[0],d4[0]}, [r2,:64]!
bx lr
endfunc
function ff_sbr_qmf_post_shuffle_neon, export=1
add r2, r1, #60*4
mov r3, #-16
mov r12, #32
vmov.i32 q8, #1<<31
vld1.32 {q0}, [r2,:128], r3
vld1.32 {q1}, [r1,:128]!
1:
pld [r2, #-32]
vrev64.32 q0, q0
vswp d2, d3
veor q0, q0, q8
vld1.32 {q2}, [r2,:128], r3
vld1.32 {q3}, [r1,:128]!
vst2.32 {d1,d3}, [r0,:128]!
vst2.32 {d0,d2}, [r0,:128]!
pld [r2, #-32]
vrev64.32 q2, q2
vswp d6, d7
veor q2, q2, q8
vld1.32 {q0}, [r2,:128], r3
vld1.32 {q1}, [r1,:128]!
vst2.32 {d5,d7}, [r0,:128]!
vst2.32 {d4,d6}, [r0,:128]!
subs r12, r12, #8
bgt 1b
bx lr
endfunc
function ff_sbr_qmf_deint_neg_neon, export=1
add r1, r1, #60*4
add r2, r0, #62*4
mov r3, #-16
mov r12, #32
vmov.i32 d2, #1<<31
1:
vld2.32 {d0,d1}, [r1,:128], r3
veor d0, d0, d2
vrev64.32 d1, d1
vst1.32 {d0}, [r2,:64]
vst1.32 {d1}, [r0,:64]!
sub r2, r2, #8
subs r12, r12, #2
bgt 1b
bx lr
endfunc
function ff_sbr_qmf_deint_bfly_neon, export=1
push {lr}
add r2, r2, #60*4
add r3, r0, #124*4
mov r12, #64
mov lr, #-16
1:
vld1.32 {q0}, [r1,:128]!
vld1.32 {q1}, [r2,:128], lr
vrev64.32 q2, q0
vrev64.32 q3, q1
vadd.f32 d3, d4, d3
vadd.f32 d2, d5, d2
vsub.f32 d0, d0, d7
vsub.f32 d1, d1, d6
vst1.32 {q1}, [r3,:128], lr
vst1.32 {q0}, [r0,:128]!
subs r12, r12, #4
bgt 1b
pop {pc}
endfunc
function ff_sbr_hf_g_filt_neon, export=1
ldr r12, [sp]
add r1, r1, r12, lsl #3
mov r12, #40*2*4
sub r3, r3, #1
vld2.32 {d2[],d3[]},[r2,:64]!
vld1.32 {d0}, [r1,:64], r12
1:
vld1.32 {d1}, [r1,:64], r12
vmul.f32 q3, q0, q1
vld2.32 {d2[],d3[]},[r2,:64]!
vld1.32 {d0}, [r1,:64], r12
vst1.32 {q3}, [r0,:64]!
subs r3, r3, #2
bgt 1b
it lt
bxlt lr
vmul.f32 d0, d0, d2
vst1.32 {d0}, [r0,:64]!
bx lr
endfunc
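/* Scalar sketch of ff_sbr_hf_g_filt_neon, reconstructed from the addressing
 * above (r1 is first offset by ixh complex samples, then advanced by
 * 40*2*4 bytes per iteration, i.e. one X_high[m] row):
 *
 *     static void sbr_hf_g_filt(float (*Y)[2], const float (*X_high)[40][2],
 *                               const float *g_filt, int m_max, intptr_t ixh)
 *     {
 *         for (int m = 0; m < m_max; m++) {
 *             Y[m][0] = X_high[m][ixh][0] * g_filt[m];
 *             Y[m][1] = X_high[m][ixh][1] * g_filt[m];
 *         }
 *     }
 */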
function ff_sbr_hf_gen_neon, export=1
NOVFP vld1.32 {d1[]}, [sp,:32]
VFP vdup.32 d1, d0[0]
vmul.f32 d0, d1, d1
vld1.32 {d3}, [r2,:64]
vld1.32 {d2}, [r3,:64]
vmul.f32 q0, q0, q1
ldrd r2, r3, [sp, #4*!HAVE_VFP_ARGS]
vtrn.32 d0, d1
vneg.f32 d18, d1
vtrn.32 d18, d1
add r0, r0, r2, lsl #3
add r1, r1, r2, lsl #3
sub r1, r1, #2*8
sub r3, r3, r2
vld1.32 {q1}, [r1,:128]!
1:
vld1.32 {q3}, [r1,:128]!
vrev64.32 q2, q1
vmov q8, q3
vrev64.32 d20, d3
vrev64.32 d21, d6
vmla.f32 q3, q1, d0[0]
vmla.f32 d6, d4, d18
vmla.f32 d7, d20, d18
vmla.f32 d6, d3, d0[1]
vmla.f32 d7, d16, d0[1]
vmla.f32 d6, d5, d1
vmla.f32 d7, d21, d1
vmov q1, q8
vst1.32 {q3}, [r0,:128]!
subs r3, r3, #2
bgt 1b
bx lr
endfunc
function ff_sbr_autocorrelate_neon, export=1
vld1.32 {q0}, [r0,:128]!
vmov.f32 q1, #0.0
vmov.f32 q3, #0.0
vmov.f32 d20, #0.0
vmul.f32 d21, d1, d1
vmov q8, q0
vmov q11, q0
mov r12, #36
1:
vld1.32 {q2}, [r0,:128]!
vrev64.32 q12, q2
vmla.f32 q10, q2, q2
vmla.f32 d2, d1, d4
vmla.f32 d3, d1, d24
vmla.f32 d6, d0, d4
vmla.f32 d7, d0, d24
vmla.f32 d2, d4, d5
vmla.f32 d3, d4, d25
vmla.f32 d6, d1, d5
vmla.f32 d7, d1, d25
vmov q0, q2
subs r12, r12, #2
bgt 1b
vld1.32 {q2}, [r0,:128]!
vrev64.32 q12, q2
vmla.f32 d2, d1, d4
vmla.f32 d3, d1, d24
vmla.f32 d6, d0, d4
vmla.f32 d7, d0, d24
vadd.f32 d20, d20, d21
vrev64.32 d18, d17
vmla.f32 d6, d1, d5
vmla.f32 d7, d1, d25
vmov q0, q1
vmla.f32 d0, d16, d17
vmla.f32 d1, d16, d18
vmla.f32 d2, d4, d5
vmla.f32 d3, d4, d25
vneg.f32 s15, s15
vmov d21, d20
vpadd.f32 d0, d0, d2
vpadd.f32 d7, d6, d7
vtrn.32 d1, d3
vsub.f32 d6, d1, d3
vmla.f32 d20, d22, d22
vmla.f32 d21, d4, d4
vtrn.32 d0, d6
vpadd.f32 d20, d20, d21
vst1.32 {q3}, [r1,:128]!
vst1.32 {d20[1]}, [r1,:32]
add r1, r1, #2*4
vst1.32 {d0}, [r1,:64]
add r1, r1, #4*4
vst1.32 {d20[0]}, [r1,:32]
bx lr
endfunc
function ff_sbr_hf_apply_noise_0_neon, export=1
vmov.i32 d3, #0
.Lhf_apply_noise_0:
push {r4,lr}
movrelx r4, X(ff_sbr_noise_table)
ldr r12, [sp, #12]
add r3, r3, #1
bfc r3, #9, #23
sub r12, r12, #1
1:
add lr, r4, r3, lsl #3
vld2.32 {q0}, [r0,:64]
vld2.32 {q3}, [lr,:64]
vld1.32 {d2}, [r1,:64]!
vld1.32 {d18}, [r2,:64]!
vceq.f32 d16, d2, #0
veor d2, d2, d3
vmov q2, q0
vmla.f32 d0, d6, d18
vmla.f32 d1, d7, d18
vadd.f32 d4, d4, d2
add r3, r3, #2
bfc r3, #9, #23
vbif d0, d4, d16
vbif d1, d5, d16
vst2.32 {q0}, [r0,:64]!
subs r12, r12, #2
bgt 1b
blt 2f
add lr, r4, r3, lsl #3
vld1.32 {d0}, [r0,:64]
vld1.32 {d6}, [lr,:64]
vld1.32 {d2[]}, [r1,:32]!
vld1.32 {d3[]}, [r2,:32]!
vceq.f32 d4, d2, #0
veor d2, d2, d3
vmov d1, d0
vmla.f32 d0, d6, d3
vadd.f32 s2, s2, s4
vbif d0, d1, d4
vst1.32 {d0}, [r0,:64]!
2:
pop {r4,pc}
endfunc
function ff_sbr_hf_apply_noise_1_neon, export=1
ldr r12, [sp]
push {r4,lr}
lsl r12, r12, #31
eor lr, r12, #1<<31
vmov d3, r12, lr
.Lhf_apply_noise_1:
movrelx r4, X(ff_sbr_noise_table)
ldr r12, [sp, #12]
add r3, r3, #1
bfc r3, #9, #23
sub r12, r12, #1
1:
add lr, r4, r3, lsl #3
vld2.32 {q0}, [r0,:64]
vld2.32 {q3}, [lr,:64]
vld1.32 {d2}, [r1,:64]!
vld1.32 {d18}, [r2,:64]!
vceq.f32 d16, d2, #0
veor d2, d2, d3
vmov q2, q0
vmla.f32 d0, d6, d18
vmla.f32 d1, d7, d18
vadd.f32 d5, d5, d2
add r3, r3, #2
bfc r3, #9, #23
vbif d0, d4, d16
vbif d1, d5, d16
vst2.32 {q0}, [r0,:64]!
subs r12, r12, #2
bgt 1b
blt 2f
add lr, r4, r3, lsl #3
vld1.32 {d0}, [r0,:64]
vld1.32 {d6}, [lr,:64]
vld1.32 {d2[]}, [r1,:32]!
vld1.32 {d18[]}, [r2,:32]!
vceq.f32 d4, d2, #0
veor d2, d2, d3
vmov d1, d0
vmla.f32 d0, d6, d18
vadd.f32 s3, s3, s5
vbif d0, d1, d4
vst1.32 {d0}, [r0,:64]!
2:
pop {r4,pc}
endfunc
function ff_sbr_hf_apply_noise_2_neon, export=1
vmov.i32 d3, #1<<31
b .Lhf_apply_noise_0
endfunc
function ff_sbr_hf_apply_noise_3_neon, export=1
ldr r12, [sp]
push {r4,lr}
lsl r12, r12, #31
eor lr, r12, #1<<31
vmov d3, lr, r12
b .Lhf_apply_noise_1
endfunc

View File

@@ -0,0 +1,480 @@
/*
* Copyright (C) 2002 Frederic 'dilb' Boulay
*
* Author: Frederic Boulay <dilb@handhelds.org>
*
* The function defined in this file is derived from the simple_idct function
* from the libavcodec library part of the FFmpeg project.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
/* useful constants for the algorithm */
#define W1 22725
#define W2 21407
#define W3 19266
#define W4 16383
#define W5 12873
#define W6 8867
#define W7 4520
#define MASK_MSHW 0xFFFF0000
#define ROW_SHIFT 11
#define ROW_SHIFT2MSHW (16-11)
#define COL_SHIFT 20
#define ROW_SHIFTED_1 1024 /* 1<< (ROW_SHIFT-1) */
#define COL_SHIFTED_1 524288 /* 1<< (COL_SHIFT-1) */
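/* For reference, the row pass as described by the MUL16/MAC16 comments inside
 * the function below, written out in C with the W1..W7 and ROW_SHIFT constants
 * above (a sketch only: the assembly also skips multiplies by zero
 * coefficients and has a fast path for rows where only row[0] is non-zero).
 * The column pass is the same butterfly with COL_SHIFT/COL_SHIFTED_1 and a
 * stride of 8 between samples:
 *
 *     static void idct_row(int16_t row[8])
 *     {
 *         int a0 = W4 * row[0] + ROW_SHIFTED_1;
 *         int a1 = a0 + W6 * row[2] - W4 * row[4] - W2 * row[6];
 *         int a2 = a0 - W6 * row[2] - W4 * row[4] + W2 * row[6];
 *         int a3 = a0 - W2 * row[2] + W4 * row[4] - W6 * row[6];
 *         a0     = a0 + W2 * row[2] + W4 * row[4] + W6 * row[6];
 *         const int b0 = W1 * row[1] + W3 * row[3] + W5 * row[5] + W7 * row[7];
 *         const int b1 = W3 * row[1] - W7 * row[3] - W1 * row[5] - W5 * row[7];
 *         const int b2 = W5 * row[1] - W1 * row[3] + W7 * row[5] + W3 * row[7];
 *         const int b3 = W7 * row[1] - W5 * row[3] + W3 * row[5] - W1 * row[7];
 *         row[0] = (a0 + b0) >> ROW_SHIFT;  row[7] = (a0 - b0) >> ROW_SHIFT;
 *         row[1] = (a1 + b1) >> ROW_SHIFT;  row[6] = (a1 - b1) >> ROW_SHIFT;
 *         row[2] = (a2 + b2) >> ROW_SHIFT;  row[5] = (a2 - b2) >> ROW_SHIFT;
 *         row[3] = (a3 + b3) >> ROW_SHIFT;  row[4] = (a3 - b3) >> ROW_SHIFT;
 *     }
 */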
function ff_simple_idct_arm, export=1
@@ void simple_idct_arm(int16_t *block)
@@ save the registers we need on the stack (take all of them);
@@ R0-R3 are scratch regs, so they need not be saved, but R0 holds the pointer to block,
@@ so it must not be overwritten unless it has been saved first.
@@ R12 is another scratch register, so it does not need to be saved either.
@@ save all registers
stmfd sp!, {r4-r11, r14} @ R14 is also called LR
@@ at this point, R0=block, other registers are free.
add r14, r0, #112 @ R14=&block[8*7]; start from the last row and step back until row 0, i.e. until R14=block.
@@ add 2 temporary variables in the stack: R0 and R14
sub sp, sp, #8 @ allow 2 local variables
str r0, [sp, #0] @ save block in sp[0]
@@ stack status
@@ sp+4 free
@@ sp+0 R0 (block)
@@ at this point, R0=block, R14=&block[56], R12=__const_ptr_, R1-R11 free
__row_loop:
@@ read the row and check whether it is null, almost null, or neither; according to the StrongARM specs it is not necessary to optimize the ldr accesses (i.e. to split 32-bit loads into two 16-bit ones), and keeping them whole leaves more usable registers :)
ldr r1, [r14, #0] @ R1=(int32)(R12)[0]=ROWr32[0] (relative row cast to a 32b pointer)
ldr r2, [r14, #4] @ R2=(int32)(R12)[1]=ROWr32[1]
ldr r3, [r14, #8] @ R3=ROWr32[2]
ldr r4, [r14, #12] @ R4=ROWr32[3]
@@ check if the words are null, if all of them are null, then proceed with next row (branch __end_row_loop),
@@ if ROWr16[0] is the only one not null, then proceed with this special case (branch __almost_empty_row)
@@ else follow the complete algorithm.
@@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
@@ R3=ROWr32[2], R4=ROWr32[3], R5-R11 free
orr r5, r4, r3 @ R5=R4 | R3
orr r5, r5, r2 @ R5=R4 | R3 | R2
orrs r6, r5, r1 @ Test R5 | R1 (the aim is to check if everything is null)
beq __end_row_loop
mov r7, r1, asr #16 @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later)
ldrsh r6, [r14, #0] @ R6=ROWr16[0]
orrs r5, r5, r7 @ R5=R4 | R3 | R2 | R7
beq __almost_empty_row
@@ __b_evaluation:
@@ at this point, R0=block (temp), R1(free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3],
@@ R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free,
@@ R12=__const_ptr_, R14=&block[n]
@@ to save some registers/calls, proceed with b0-b3 first, followed by a0-a3
@@ MUL16(b0, W1, row[1]);
@@ MUL16(b1, W3, row[1]);
@@ MUL16(b2, W5, row[1]);
@@ MUL16(b3, W7, row[1]);
@@ MAC16(b0, W3, row[3]);
@@ MAC16(b1, -W7, row[3]);
@@ MAC16(b2, -W1, row[3]);
@@ MAC16(b3, -W5, row[3]);
ldr r8, =W1 @ R8=W1
mov r2, r2, asr #16 @ R2=ROWr16[3]
mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
ldr r9, =W3 @ R9=W3
ldr r10, =W5 @ R10=W5
mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
ldr r11, =W7 @ R11=W7
mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
teq r2, #0 @ if null avoid muls
itttt ne
mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
rsbne r2, r2, #0 @ R2=-ROWr16[3]
mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
it ne
mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
@@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
@@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
@@ R12=__const_ptr_, R14=&block[n]
@@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
@@ if (temp != 0) {}
orrs r2, r3, r4 @ R2=ROWr32[2] | ROWr32[3]
beq __end_b_evaluation
@@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
@@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
@@ R12=__const_ptr_, R14=&block[n]
@@ MAC16(b0, W5, row[5]);
@@ MAC16(b2, W7, row[5]);
@@ MAC16(b3, W3, row[5]);
@@ MAC16(b1, -W1, row[5]);
@@ MAC16(b0, W7, row[7]);
@@ MAC16(b2, W3, row[7]);
@@ MAC16(b3, -W1, row[7]);
@@ MAC16(b1, -W5, row[7]);
mov r3, r3, asr #16 @ R3=ROWr16[5]
teq r3, #0 @ if null avoid muls
it ne
mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5]=b0
mov r4, r4, asr #16 @ R4=ROWr16[7]
itttt ne
mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5]=b2
mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5]=b3
rsbne r3, r3, #0 @ R3=-ROWr16[5]
mlane r1, r8, r3, r1 @ R1-=W1*ROWr16[5]=b1
@@ R3 is free now
teq r4, #0 @ if null avoid muls
itttt ne
mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7]=b0
mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7]=b2
rsbne r4, r4, #0 @ R4=-ROWr16[7]
mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7]=b3
it ne
mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7]=b1
@@ R4 is free now
__end_b_evaluation:
@@ at this point, R0=b0, R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free),
@@ R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
@@ R12=__const_ptr_, R14=&block[n]
@@ __a_evaluation:
@@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
@@ a1 = a0 + W6 * row[2];
@@ a2 = a0 - W6 * row[2];
@@ a3 = a0 - W2 * row[2];
@@ a0 = a0 + W2 * row[2];
ldr r9, =W4 @ R9=W4
mul r6, r9, r6 @ R6=W4*ROWr16[0]
ldr r10, =W6 @ R10=W6
ldrsh r4, [r14, #4] @ R4=ROWr16[2] (a3 not defined yet)
add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0)
mul r11, r10, r4 @ R11=W6*ROWr16[2]
ldr r8, =W2 @ R8=W2
sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2)
@@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
@@ if (temp != 0) {}
teq r2, #0
beq __end_bef_a_evaluation
add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1)
mul r11, r8, r4 @ R11=W2*ROWr16[2]
sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3)
add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0)
@@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
@@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
@@ R12=__const_ptr_, R14=&block[n]
@@ a0 += W4*row[4]
@@ a1 -= W4*row[4]
@@ a2 -= W4*row[4]
@@ a3 += W4*row[4]
ldrsh r11, [r14, #8] @ R11=ROWr16[4]
teq r11, #0 @ if null avoid muls
it ne
mulne r11, r9, r11 @ R11=W4*ROWr16[4]
@@ R9 is free now
ldrsh r9, [r14, #12] @ R9=ROWr16[6]
itttt ne
addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0)
subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1)
subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2)
addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3)
@@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
teq r9, #0 @ if null avoid muls
itttt ne
mulne r11, r10, r9 @ R11=W6*ROWr16[6]
addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0)
mulne r10, r8, r9 @ R10=W2*ROWr16[6]
@@ a0 += W6*row[6];
@@ a3 -= W6*row[6];
@@ a1 -= W2*row[6];
@@ a2 += W2*row[6];
subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3)
itt ne
subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1)
addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2)
__end_a_evaluation:
@@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
@@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
@@ R12=__const_ptr_, R14=&block[n]
@@ row[0] = (a0 + b0) >> ROW_SHIFT;
@@ row[1] = (a1 + b1) >> ROW_SHIFT;
@@ row[2] = (a2 + b2) >> ROW_SHIFT;
@@ row[3] = (a3 + b3) >> ROW_SHIFT;
@@ row[4] = (a3 - b3) >> ROW_SHIFT;
@@ row[5] = (a2 - b2) >> ROW_SHIFT;
@@ row[6] = (a1 - b1) >> ROW_SHIFT;
@@ row[7] = (a0 - b0) >> ROW_SHIFT;
add r8, r6, r0 @ R8=a0+b0
add r9, r2, r1 @ R9=a1+b1
@@ put 2 16 bits half-words in a 32bits word
@@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (only Little Endian compliant then!!!)
ldr r10, =MASK_MSHW @ R10=0xFFFF0000
and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5)
mvn r11, r10 @ R11= NOT R10= 0x0000FFFF
and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11)
orr r8, r8, r9
str r8, [r14, #0]
add r8, r3, r5 @ R8=a2+b2
add r9, r4, r7 @ R9=a3+b3
and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5)
and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11)
orr r8, r8, r9
str r8, [r14, #4]
sub r8, r4, r7 @ R8=a3-b3
sub r9, r3, r5 @ R9=a2-b2
and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5)
and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11)
orr r8, r8, r9
str r8, [r14, #8]
sub r8, r2, r1 @ R8=a1-b1
sub r9, r6, r0 @ R9=a0-b0
and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5)
and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11)
orr r8, r8, r9
str r8, [r14, #12]
bal __end_row_loop
__almost_empty_row:
@@ the row was empty, except ROWr16[0], now, management of this special case
@@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
@@ R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1],
@@ R8=0xFFFF (temp), R9-R11 free
mov r8, #0x10000 @ R8=0x10000; 0xFFFF is built in two steps, which avoids an ldr from the literal pool.
sub r8, r8, #1 @ R8 is now ready.
and r5, r8, r6, lsl #3 @ R5=R8 & (R6<<3)= (ROWr16[0]<<3) & 0xFFFF
orr r5, r5, r5, lsl #16 @ R5=R5 | (R5<<16)
str r5, [r14, #0] @ R14[0]=ROWr32[0]=R5
str r5, [r14, #4] @ R14[4]=ROWr32[1]=R5
str r5, [r14, #8] @ R14[8]=ROWr32[2]=R5
str r5, [r14, #12] @ R14[12]=ROWr32[3]=R5
__end_row_loop:
@@ at this point, R0-R11 (free)
@@ R12=__const_ptr_, R14=&block[n]
ldr r0, [sp, #0] @ R0=block
teq r0, r14 @ compare current &block[8*n] to block, when block is reached, the loop is finished.
sub r14, r14, #16
bne __row_loop
@@ at this point, R0=block, R1-R11 (free)
@@ R12=__const_ptr_, R14=&block[n]
add r14, r0, #14 @ R14=&block[7]; start from the last column and step back until col 0, i.e. until R14=block.
__col_loop:
@@ __b_evaluation2:
@@ at this point, R0=block (temp), R1-R11 (free)
@@ R12=__const_ptr_, R14=&block[n]
@@ proceed with b0-b3 first, followed by a0-a3
@@ MUL16(b0, W1, col[8x1]);
@@ MUL16(b1, W3, col[8x1]);
@@ MUL16(b2, W5, col[8x1]);
@@ MUL16(b3, W7, col[8x1]);
@@ MAC16(b0, W3, col[8x3]);
@@ MAC16(b1, -W7, col[8x3]);
@@ MAC16(b2, -W1, col[8x3]);
@@ MAC16(b3, -W5, col[8x3]);
ldr r8, =W1 @ R8=W1
ldrsh r7, [r14, #16]
mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
ldr r9, =W3 @ R9=W3
ldr r10, =W5 @ R10=W5
mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
ldr r11, =W7 @ R11=W7
mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
ldrsh r2, [r14, #48]
mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
teq r2, #0 @ if 0, then avoid muls
itttt ne
mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
rsbne r2, r2, #0 @ R2=-ROWr16[3]
mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
it ne
mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
@@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
@@ R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
@@ R12=__const_ptr_, R14=&block[n]
@@ MAC16(b0, W5, col[5x8]);
@@ MAC16(b2, W7, col[5x8]);
@@ MAC16(b3, W3, col[5x8]);
@@ MAC16(b1, -W1, col[5x8]);
@@ MAC16(b0, W7, col[7x8]);
@@ MAC16(b2, W3, col[7x8]);
@@ MAC16(b3, -W1, col[7x8]);
@@ MAC16(b1, -W5, col[7x8]);
ldrsh r3, [r14, #80] @ R3=COLr16[5x8]
teq r3, #0 @ if 0 then avoid muls
itttt ne
mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5x8]=b0
mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5x8]=b2
mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5x8]=b3
rsbne r3, r3, #0 @ R3=-ROWr16[5x8]
ldrsh r4, [r14, #112] @ R4=COLr16[7x8]
it ne
mlane r1, r8, r3, r1 @ R1-=W1*ROWr16[5x8]=b1
@@ R3 is free now
teq r4, #0 @ if 0 then avoid muls
itttt ne
mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7x8]=b0
mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7x8]=b2
rsbne r4, r4, #0 @ R4=-ROWr16[7x8]
mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7x8]=b3
it ne
mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7x8]=b1
@@ R4 is free now
@@ __end_b_evaluation2:
@@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
@@ R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
@@ R12=__const_ptr_, R14=&block[n]
@@ __a_evaluation2:
@@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1));
@@ a1 = a0 + W6 * row[2];
@@ a2 = a0 - W6 * row[2];
@@ a3 = a0 - W2 * row[2];
@@ a0 = a0 + W2 * row[2];
ldrsh r6, [r14, #0]
ldr r9, =W4 @ R9=W4
mul r6, r9, r6 @ R6=W4*ROWr16[0]
ldr r10, =W6 @ R10=W6
ldrsh r4, [r14, #32] @ R4=ROWr16[2] (a3 not defined yet)
add r6, r6, #COL_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(COL_SHIFT-1) (a0)
mul r11, r10, r4 @ R11=W6*ROWr16[2]
ldr r8, =W2 @ R8=W2
add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1)
sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2)
mul r11, r8, r4 @ R11=W2*ROWr16[2]
sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3)
add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0)
@@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
@@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
@@ R12=__const_ptr_, R14=&block[n]
@@ a0 += W4*row[4]
@@ a1 -= W4*row[4]
@@ a2 -= W4*row[4]
@@ a3 += W4*row[4]
ldrsh r11, [r14, #64] @ R11=ROWr16[4]
teq r11, #0 @ if null avoid muls
itttt ne
mulne r11, r9, r11 @ R11=W4*ROWr16[4]
@@ R9 is free now
addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0)
subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1)
subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2)
ldrsh r9, [r14, #96] @ R9=ROWr16[6]
it ne
addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3)
@@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
teq r9, #0 @ if null avoid muls
itttt ne
mulne r11, r10, r9 @ R11=W6*ROWr16[6]
addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0)
mulne r10, r8, r9 @ R10=W2*ROWr16[6]
@@ a0 += W6*row[6];
@@ a3 -= W6*row[6];
@@ a1 -= W2*row[6];
@@ a2 += W2*row[6];
subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3)
itt ne
subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1)
addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2)
@@ __end_a_evaluation2:
@@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
@@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
@@ R12=__const_ptr_, R14=&block[n]
@@ col[0 ] = ((a0 + b0) >> COL_SHIFT);
@@ col[8 ] = ((a1 + b1) >> COL_SHIFT);
@@ col[16] = ((a2 + b2) >> COL_SHIFT);
@@ col[24] = ((a3 + b3) >> COL_SHIFT);
@@ col[32] = ((a3 - b3) >> COL_SHIFT);
@@ col[40] = ((a2 - b2) >> COL_SHIFT);
@@ col[48] = ((a1 - b1) >> COL_SHIFT);
@@ col[56] = ((a0 - b0) >> COL_SHIFT);
@@@@@ no optimization here @@@@@
add r8, r6, r0 @ R8=a0+b0
add r9, r2, r1 @ R9=a1+b1
mov r8, r8, asr #COL_SHIFT
mov r9, r9, asr #COL_SHIFT
strh r8, [r14, #0]
strh r9, [r14, #16]
add r8, r3, r5 @ R8=a2+b2
add r9, r4, r7 @ R9=a3+b3
mov r8, r8, asr #COL_SHIFT
mov r9, r9, asr #COL_SHIFT
strh r8, [r14, #32]
strh r9, [r14, #48]
sub r8, r4, r7 @ R8=a3-b3
sub r9, r3, r5 @ R9=a2-b2
mov r8, r8, asr #COL_SHIFT
mov r9, r9, asr #COL_SHIFT
strh r8, [r14, #64]
strh r9, [r14, #80]
sub r8, r2, r1 @ R8=a1-b1
sub r9, r6, r0 @ R9=a0-b0
mov r8, r8, asr #COL_SHIFT
mov r9, r9, asr #COL_SHIFT
strh r8, [r14, #96]
strh r9, [r14, #112]
@@ __end_col_loop:
@@ at this point, R0-R11 (free)
@@ R12=__const_ptr_, R14=&block[n]
ldr r0, [sp, #0] @ R0=block
teq r0, r14 @ compare current &block[n] to block, when block is reached, the loop is finished.
sub r14, r14, #2
bne __col_loop
@@ __end_simple_idct_arm:
@@ restore registers to previous status!
add sp, sp, #8 @@ the local variables!
ldmfd sp!, {r4-r11, r15} @@ update PC with LR content.
@@ kind of sub-function, here not to overload the common case.
__end_bef_a_evaluation:
add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1)
mul r11, r8, r4 @ R11=W2*ROWr16[2]
sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3)
add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0)
bal __end_a_evaluation
endfunc

Some files were not shown because too many files have changed in this diff.