/*
 * forked from KolibriOS/kolibrios, commit a316fa7c9d
 * git-svn-id: svn://kolibrios.org@1905 a494cfbc-eb01-0410-851d-a64ba20cac60
 */
/*
	decode_sse3d: Synth for SSE and extended 3DNow (yeah, the name is a relic)

	copyright 2006-2007 by Zuxy Meng/the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by the mysterious higway for MMX (apparently)
	then developed into SSE opt by Zuxy Meng, also building on Romain Dolbeau's AltiVec
	Both have agreed to distribution under LGPL 2.1 .

	Transformed back into standalone asm, with help of
	gcc -S -DHAVE_CONFIG_H -I. -march=pentium -O3 -Wall -pedantic -fno-strict-aliasing -DREAL_IS_FLOAT -c -o decode_mmxsse.{S,c}

	The difference between SSE and 3DNowExt is the dct64 function and the synth function name.
	This template here uses the SYNTH_NAME and MPL_DCT64 macros for this - see decode_sse.S and decode_3dnowext.S...
	That's not memory efficient since there's doubled code, but it's easier than giving another function pointer.
	Maybe I'll change it in future, but now I need something that works.

	Original comment from MPlayer source follows:
*/
/*
 * this code comes under GPL
 * This code was taken from http://www.mpg123.org
 * See ChangeLog of mpg123-0.59s-pre.1 for detail
 * Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
 *
 * Local ChangeLog:
 * - Partial loops unrolling and removing MOVW insn from loops
 */
#include "mangle.h"
|
|
|
|
.data
|
|
ALIGN8
|
|
one_null:
|
|
.long -65536
|
|
.long -65536
|
|
ALIGN8
|
|
null_one:
|
|
.long 65535
|
|
.long 65535
|
|
|
|
.text
|
|
ALIGN16,,15
|
|
/* void SYNTH_NAME(real *bandPtr, int channel, short *samples, short *buffs, int *bo, float *decwins) */
|
|
.globl SYNTH_NAME
|
|
SYNTH_NAME:
|
|
pushl %ebp
|
|
/* stack:0=ebp 4=back 8=bandptr 12=channel 16=samples 20=buffs 24=bo 28=decwins */
|
|
movl %esp, %ebp
|
|
/* Now the old stack addresses are preserved via %epb. */
|
|
subl $4,%esp /* What has been called temp before. */
|
|
pushl %edi
|
|
pushl %esi
|
|
pushl %ebx
|
|
#define TEMP 12(%esp)
|
|
/* APP */
|
|
movl 12(%ebp),%ecx
|
|
movl 16(%ebp),%edi
|
|
movl $15,%ebx
|
|
movl 24(%ebp),%edx
|
|
leal (%edi,%ecx,2),%edi
|
|
decl %ecx
|
|
movl 20(%ebp),%esi
|
|
movl (%edx),%eax
|
|
jecxz .L01
|
|
decl %eax
|
|
andl %ebx,%eax
|
|
leal 1088(%esi),%esi
|
|
movl %eax,(%edx)
|
|
.L01:
|
|
leal (%esi,%eax,2),%edx
|
|
movl %eax,TEMP
|
|
incl %eax
|
|
andl %ebx,%eax
|
|
leal 544(%esi,%eax,2),%ecx
|
|
incl %ebx
|
|
testl $1, %eax
|
|
jnz .L02
|
|
xchgl %edx,%ecx
|
|
incl TEMP
|
|
leal 544(%esi),%esi
|
|
.L02:
|
|
pushl 8(%ebp)
|
|
pushl %edx
|
|
pushl %ecx
|
|
call MPL_DCT64
|
|
addl $12, %esp
|
|
leal 1(%ebx), %ecx
|
|
subl TEMP,%ebx
|
|
pushl %ecx
|
|
/* leal ASM_NAME(decwins)(%ebx,%ebx,1), %edx */
|
|
movl 28(%ebp),%ecx
|
|
leal (%ecx,%ebx,2), %edx
|
|
movl (%esp),%ecx /* restore, but leave value on stack */
|
|
shrl $1, %ecx
|
|
ALIGN16
|
|
.L03:
|
|
movq (%edx),%mm0
|
|
movq 64(%edx),%mm4
|
|
pmaddwd (%esi),%mm0
|
|
pmaddwd 32(%esi),%mm4
|
|
movq 8(%edx),%mm1
|
|
movq 72(%edx),%mm5
|
|
pmaddwd 8(%esi),%mm1
|
|
pmaddwd 40(%esi),%mm5
|
|
movq 16(%edx),%mm2
|
|
movq 80(%edx),%mm6
|
|
pmaddwd 16(%esi),%mm2
|
|
pmaddwd 48(%esi),%mm6
|
|
movq 24(%edx),%mm3
|
|
movq 88(%edx),%mm7
|
|
pmaddwd 24(%esi),%mm3
|
|
pmaddwd 56(%esi),%mm7
|
|
paddd %mm1,%mm0
|
|
paddd %mm5,%mm4
|
|
paddd %mm2,%mm0
|
|
paddd %mm6,%mm4
|
|
paddd %mm3,%mm0
|
|
paddd %mm7,%mm4
|
|
movq %mm0,%mm1
|
|
movq %mm4,%mm5
|
|
psrlq $32,%mm1
|
|
psrlq $32,%mm5
|
|
paddd %mm1,%mm0
|
|
paddd %mm5,%mm4
|
|
psrad $13,%mm0
|
|
psrad $13,%mm4
|
|
packssdw %mm0,%mm0
|
|
packssdw %mm4,%mm4
|
|
movq (%edi), %mm1
|
|
punpckldq %mm4, %mm0
|
|
pand one_null, %mm1
|
|
pand null_one, %mm0
|
|
por %mm0, %mm1
|
|
movq %mm1,(%edi)
|
|
leal 64(%esi),%esi
|
|
leal 128(%edx),%edx
|
|
leal 8(%edi),%edi
|
|
decl %ecx
|
|
jnz .L03
|
|
popl %ecx
|
|
andl $1, %ecx
|
|
jecxz .next_loop
|
|
movq (%edx),%mm0
|
|
pmaddwd (%esi),%mm0
|
|
movq 8(%edx),%mm1
|
|
pmaddwd 8(%esi),%mm1
|
|
movq 16(%edx),%mm2
|
|
pmaddwd 16(%esi),%mm2
|
|
movq 24(%edx),%mm3
|
|
pmaddwd 24(%esi),%mm3
|
|
paddd %mm1,%mm0
|
|
paddd %mm2,%mm0
|
|
paddd %mm3,%mm0
|
|
movq %mm0,%mm1
|
|
psrlq $32,%mm1
|
|
paddd %mm1,%mm0
|
|
psrad $13,%mm0
|
|
packssdw %mm0,%mm0
|
|
movd %mm0,%eax
|
|
movw %ax, (%edi)
|
|
leal 32(%esi),%esi
|
|
leal 64(%edx),%edx
|
|
leal 4(%edi),%edi
|
|
.next_loop:
|
|
subl $64,%esi
|
|
movl $7,%ecx
|
|
ALIGN16
|
|
.L04:
|
|
movq (%edx),%mm0
|
|
movq 64(%edx),%mm4
|
|
pmaddwd (%esi),%mm0
|
|
pmaddwd -32(%esi),%mm4
|
|
movq 8(%edx),%mm1
|
|
movq 72(%edx),%mm5
|
|
pmaddwd 8(%esi),%mm1
|
|
pmaddwd -24(%esi),%mm5
|
|
movq 16(%edx),%mm2
|
|
movq 80(%edx),%mm6
|
|
pmaddwd 16(%esi),%mm2
|
|
pmaddwd -16(%esi),%mm6
|
|
movq 24(%edx),%mm3
|
|
movq 88(%edx),%mm7
|
|
pmaddwd 24(%esi),%mm3
|
|
pmaddwd -8(%esi),%mm7
|
|
paddd %mm1,%mm0
|
|
paddd %mm5,%mm4
|
|
paddd %mm2,%mm0
|
|
paddd %mm6,%mm4
|
|
paddd %mm3,%mm0
|
|
paddd %mm7,%mm4
|
|
movq %mm0,%mm1
|
|
movq %mm4,%mm5
|
|
psrlq $32,%mm1
|
|
psrlq $32,%mm5
|
|
paddd %mm0,%mm1
|
|
paddd %mm4,%mm5
|
|
psrad $13,%mm1
|
|
psrad $13,%mm5
|
|
packssdw %mm1,%mm1
|
|
packssdw %mm5,%mm5
|
|
psubd %mm0,%mm0
|
|
psubd %mm4,%mm4
|
|
psubsw %mm1,%mm0
|
|
psubsw %mm5,%mm4
|
|
movq (%edi), %mm1
|
|
punpckldq %mm4, %mm0
|
|
pand one_null, %mm1
|
|
pand null_one, %mm0
|
|
por %mm0, %mm1
|
|
movq %mm1,(%edi)
|
|
subl $64,%esi
|
|
addl $128,%edx
|
|
leal 8(%edi),%edi
|
|
decl %ecx
|
|
jnz .L04
|
|
movq (%edx),%mm0
|
|
pmaddwd (%esi),%mm0
|
|
movq 8(%edx),%mm1
|
|
pmaddwd 8(%esi),%mm1
|
|
movq 16(%edx),%mm2
|
|
pmaddwd 16(%esi),%mm2
|
|
movq 24(%edx),%mm3
|
|
pmaddwd 24(%esi),%mm3
|
|
paddd %mm1,%mm0
|
|
paddd %mm2,%mm0
|
|
paddd %mm3,%mm0
|
|
movq %mm0,%mm1
|
|
psrlq $32,%mm1
|
|
paddd %mm0,%mm1
|
|
psrad $13,%mm1
|
|
packssdw %mm1,%mm1
|
|
psubd %mm0,%mm0
|
|
psubsw %mm1,%mm0
|
|
movd %mm0,%eax
|
|
movw %ax,(%edi)
|
|
emms
|
|
|
|
/* NO_APP */
|
|
popl %ebx
|
|
popl %esi
|
|
popl %edi
|
|
addl $4,%esp
|
|
popl %ebp
|
|
ret
|