Sergey Semyonov (Serge) a316fa7c9d libmpg123 1.9.0
git-svn-id: svn://kolibrios.org@1905 a494cfbc-eb01-0410-851d-a64ba20cac60
2011-03-11 10:57:03 +00:00

247 lines
5.2 KiB
C

/*
decode_sse3d: Synth for SSE and extended 3DNow (yeah, the name is a relic)
copyright 2006-2007 by Zuxy Meng/the mpg123 project - free software under the terms of the LGPL 2.1
see COPYING and AUTHORS files in distribution or http://mpg123.org
initially written by the mysterious higway for MMX (apparently)
then developed into SSE opt by Zuxy Meng, also building on Romain Dolbeau's AltiVec
Both have agreed to distribution under LGPL 2.1 .
Transformed back into standalone asm, with help of
gcc -S -DHAVE_CONFIG_H -I. -march=pentium -O3 -Wall -pedantic -fno-strict-aliasing -DREAL_IS_FLOAT -c -o decode_mmxsse.{S,c}
The difference between SSE and 3DNowExt is the dct64 function and the synth function name.
This template here uses the SYNTH_NAME and MPL_DCT64 macros for this - see decode_sse.S and decode_3dnowext.S...
That's not memory efficient since there's doubled code, but it's easier than giving another function pointer.
Maybe I'll change it in future, but now I need something that works.
Original comment from MPlayer source follows:
*/
/*
* this code comes under GPL
* This code was taken from http://www.mpg123.org
* See ChangeLog of mpg123-0.59s-pre.1 for detail
* Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
*
* Local ChangeLog:
* - Partial loops unrolling and removing MOVW insn from loops
*/
#include "mangle.h"
.data
ALIGN8
one_null:
.long -65536
.long -65536
ALIGN8
null_one:
.long 65535
.long 65535
.text
ALIGN16,,15
/* void SYNTH_NAME(real *bandPtr, int channel, short *samples, short *buffs, int *bo, float *decwins) */
.globl SYNTH_NAME
SYNTH_NAME:
pushl %ebp
/* stack:0=ebp 4=back 8=bandptr 12=channel 16=samples 20=buffs 24=bo 28=decwins */
movl %esp, %ebp
/* Now the old stack addresses are preserved via %epb. */
subl $4,%esp /* What has been called temp before. */
pushl %edi
pushl %esi
pushl %ebx
#define TEMP 12(%esp)
/* APP */
movl 12(%ebp),%ecx
movl 16(%ebp),%edi
movl $15,%ebx
movl 24(%ebp),%edx
leal (%edi,%ecx,2),%edi
decl %ecx
movl 20(%ebp),%esi
movl (%edx),%eax
jecxz .L01
decl %eax
andl %ebx,%eax
leal 1088(%esi),%esi
movl %eax,(%edx)
.L01:
leal (%esi,%eax,2),%edx
movl %eax,TEMP
incl %eax
andl %ebx,%eax
leal 544(%esi,%eax,2),%ecx
incl %ebx
testl $1, %eax
jnz .L02
xchgl %edx,%ecx
incl TEMP
leal 544(%esi),%esi
.L02:
pushl 8(%ebp)
pushl %edx
pushl %ecx
call MPL_DCT64
addl $12, %esp
leal 1(%ebx), %ecx
subl TEMP,%ebx
pushl %ecx
/* leal ASM_NAME(decwins)(%ebx,%ebx,1), %edx */
movl 28(%ebp),%ecx
leal (%ecx,%ebx,2), %edx
movl (%esp),%ecx /* restore, but leave value on stack */
shrl $1, %ecx
ALIGN16
.L03:
movq (%edx),%mm0
movq 64(%edx),%mm4
pmaddwd (%esi),%mm0
pmaddwd 32(%esi),%mm4
movq 8(%edx),%mm1
movq 72(%edx),%mm5
pmaddwd 8(%esi),%mm1
pmaddwd 40(%esi),%mm5
movq 16(%edx),%mm2
movq 80(%edx),%mm6
pmaddwd 16(%esi),%mm2
pmaddwd 48(%esi),%mm6
movq 24(%edx),%mm3
movq 88(%edx),%mm7
pmaddwd 24(%esi),%mm3
pmaddwd 56(%esi),%mm7
paddd %mm1,%mm0
paddd %mm5,%mm4
paddd %mm2,%mm0
paddd %mm6,%mm4
paddd %mm3,%mm0
paddd %mm7,%mm4
movq %mm0,%mm1
movq %mm4,%mm5
psrlq $32,%mm1
psrlq $32,%mm5
paddd %mm1,%mm0
paddd %mm5,%mm4
psrad $13,%mm0
psrad $13,%mm4
packssdw %mm0,%mm0
packssdw %mm4,%mm4
movq (%edi), %mm1
punpckldq %mm4, %mm0
pand one_null, %mm1
pand null_one, %mm0
por %mm0, %mm1
movq %mm1,(%edi)
leal 64(%esi),%esi
leal 128(%edx),%edx
leal 8(%edi),%edi
decl %ecx
jnz .L03
popl %ecx
andl $1, %ecx
jecxz .next_loop
movq (%edx),%mm0
pmaddwd (%esi),%mm0
movq 8(%edx),%mm1
pmaddwd 8(%esi),%mm1
movq 16(%edx),%mm2
pmaddwd 16(%esi),%mm2
movq 24(%edx),%mm3
pmaddwd 24(%esi),%mm3
paddd %mm1,%mm0
paddd %mm2,%mm0
paddd %mm3,%mm0
movq %mm0,%mm1
psrlq $32,%mm1
paddd %mm1,%mm0
psrad $13,%mm0
packssdw %mm0,%mm0
movd %mm0,%eax
movw %ax, (%edi)
leal 32(%esi),%esi
leal 64(%edx),%edx
leal 4(%edi),%edi
.next_loop:
subl $64,%esi
movl $7,%ecx
ALIGN16
.L04:
movq (%edx),%mm0
movq 64(%edx),%mm4
pmaddwd (%esi),%mm0
pmaddwd -32(%esi),%mm4
movq 8(%edx),%mm1
movq 72(%edx),%mm5
pmaddwd 8(%esi),%mm1
pmaddwd -24(%esi),%mm5
movq 16(%edx),%mm2
movq 80(%edx),%mm6
pmaddwd 16(%esi),%mm2
pmaddwd -16(%esi),%mm6
movq 24(%edx),%mm3
movq 88(%edx),%mm7
pmaddwd 24(%esi),%mm3
pmaddwd -8(%esi),%mm7
paddd %mm1,%mm0
paddd %mm5,%mm4
paddd %mm2,%mm0
paddd %mm6,%mm4
paddd %mm3,%mm0
paddd %mm7,%mm4
movq %mm0,%mm1
movq %mm4,%mm5
psrlq $32,%mm1
psrlq $32,%mm5
paddd %mm0,%mm1
paddd %mm4,%mm5
psrad $13,%mm1
psrad $13,%mm5
packssdw %mm1,%mm1
packssdw %mm5,%mm5
psubd %mm0,%mm0
psubd %mm4,%mm4
psubsw %mm1,%mm0
psubsw %mm5,%mm4
movq (%edi), %mm1
punpckldq %mm4, %mm0
pand one_null, %mm1
pand null_one, %mm0
por %mm0, %mm1
movq %mm1,(%edi)
subl $64,%esi
addl $128,%edx
leal 8(%edi),%edi
decl %ecx
jnz .L04
movq (%edx),%mm0
pmaddwd (%esi),%mm0
movq 8(%edx),%mm1
pmaddwd 8(%esi),%mm1
movq 16(%edx),%mm2
pmaddwd 16(%esi),%mm2
movq 24(%edx),%mm3
pmaddwd 24(%esi),%mm3
paddd %mm1,%mm0
paddd %mm2,%mm0
paddd %mm3,%mm0
movq %mm0,%mm1
psrlq $32,%mm1
paddd %mm0,%mm1
psrad $13,%mm1
packssdw %mm1,%mm1
psubd %mm0,%mm0
psubsw %mm1,%mm0
movd %mm0,%eax
movw %ax,(%edi)
emms
/* NO_APP */
popl %ebx
popl %esi
popl %edi
addl $4,%esp
popl %ebp
ret