fast Hartley transform presented

git-svn-id: svn://kolibrios.org@2210 a494cfbc-eb01-0410-851d-a64ba20cac60
This commit is contained in:
Artem Jerdev (art_zh) 2011-09-18 21:08:53 +00:00
parent 6bbd71b6f8
commit 6641043d17
6 changed files with 1210 additions and 138 deletions

View File

@ -79,7 +79,7 @@ end if
;{
include "..\..\..\macros.inc"
; purge mov
include "debug.inc"
include "..\..\debug.inc"
include 'dump.inc'
;}
end if

View File

@ -67,7 +67,7 @@ debug_outstr:
mov cl,[edx]
test cl,cl
jz @f
int 40h
mcall
inc edx
jmp @b
@@:

View File

@ -0,0 +1,28 @@
This is a draft version of my Fast Hartley Transform (FHT) routine for KolibriOS
Hartley transform is a real-basis version of well-known Fourier transform:
1) basis function: cas(x) = cos(x) + sin(x);
2) forward transform: H(f) = sum(k=0..N-1) [X(k)*cas(kf/(2*pi*N))]
3) reverse transform: X(k) = 1/N * sum(f=0..N-1) [H(f)*cas(kf/(2*pi*N))]
FHT is known to be faster than most conventional fast Fourier transform (FHT) methods.
It also uses half-length arrays due to no need of imaginary data storage.
FHT can be easily converted to FFT (and back) with no loss of information.
Most of general tasks FFT used for (correlation, convolution, energy spectra, noise
filtration, differential math, phase detection ect.) may be done directly with FHT.
====================================================================================
Copyright (C) A. Jerdev 1999, 2003 and 2010.
The code can be used, changed and redistributed in any KolibriOS application
with only two limitations:
1) the author's name and copyright information cannot be deleted or changed;
2) the code is not allowed to be ported to or distributed with other operation systems.
18/09/2010
Artem Jerdev <kolibri@jerdev.co.uk>

View File

@ -0,0 +1,203 @@
;========================================================================
;= =
;= Fast Hartley Transform demo for KolibriOS =
;= =
;= Copyright (C) 2010, Artem Jerdev <kolibri@jerdev.co.uk> =
;= =
;= refer to wiki.kolibtios.org for all details =
;= =
;========================================================================
use32
org 0x0
db 'MENUET01' ; 8 byte id
dd 0x01 ; header version
dd START ; start of code
dd I_END ; size of image
dd 0x100000 ; memory for app
dd 0xbfffc ; esp
dd 0x0 , 0x0 ; I_Param , I_Icon
include '../../macros.inc'
include '../debug.inc'
include 'FHT4i.inc'
START: ; start of execution
call main
mov eax,-1 ; close this program
int 0x40
;=============================================================
;Func: calculates a simple function
; ff = (512*2^(-t) * cos (2.5*t))
; uses: eax, ebx
;------------
Func:
; 9 : {
; 10 : double x,t;
; 11 : int f;
; 12 :
; 13 : x = (i < N2) ? i : i - NUM_POINTS;
mov eax, [ii]
cmp eax, 512
jge .index_negative
jmp .index_correct
.index_negative:
sub eax, 1024
.index_correct:
mov [temp], eax
; fild [temp]
; 14 : t = x / 16.0;
; f2xm1 argument (abs) must be less than 1, so
mov [t_mod], eax
and [t_mod], 0x0F ; x % 16
shr eax, 4 ; x / 16
mov [t_div], eax
fild [temp]
; 15 : if (t<0) t = -t;
fabs
exp_ok:
; 16 : f = (512*2^(-t) * cos (2.5*t));
fchs
f2xm1
fmul [f500]
fstp [tv93]
fld [f2_5]
fmul [tt]
fcos
fmul [tv93]
fstp [tt]
mov bx, word[tt+6]
shr bx,4
and bx,0x07FF
add ax,bx
shl ax,4
and word[tt+6], 0x800F
or word[tt+6], ax
fld [tt]
fstp [ff]
; 17 : return f;
; 18 : }
ret
;---------------------------------------------------------
; test data filler
;
; uses eax, ebx, ecx
FillData:
; 29 : for (i=0; i<NUM_POINTS; i++)
; here : ecx = i
xor ecx, ecx
.funcloop:
; 30 : {
; 31 : ia[i] = Func(i);
mov [ii], ecx
call Func
fld [ff]
fstp qword [edx+ecx*8]
; 32 : }
inc ecx
cmp ecx, [_in] ; ecx == N ?
jne .funcloop
ret
;====================================================================
; main
;====================================================================
_ia dd 0
_ii dd 0
_ip dd 0
_in dd 0
_it dd 0
;-----------------
main:
mov eax, 68
mov ebx, 11
int 0x40
fninit
mov cl, 2 ; power of 4
mov byte[_ip], cl
mov eax, 1
shl eax, cl
shl eax, cl
mov [_in], eax
mov dl, cl
call CreateSinCosTable
mov [_it], edx
mov ecx, [_in]
shl ecx, 3
mov ebx, 12
mov eax, 68
int 0x40
mov [_ia], eax
mov edx, eax
call FillData
cpuid
rdtsc
mov [t_0], eax
push [_it]
push [_ia]
push [_ip]
push [_in]
call FHT_4
add esp, 16
cpuid
rdtsc
mov [t_1], eax
sub eax, [t_0]
debug_print_hex eax
print '<- fht time'
mov edx, [_it]
call DestroySinCosTable
mov ecx, [_ia]
mov ebx, 13
mov eax, 68
int 0x40
ret
; ========================================================
; static data
;----------------
align 8
;f18 dq 0x4032000000000000
f256 dq 256.01f
f14_2 dq 14.2f
f500 dq 0x407f400000000000
f2_5 dq 0x4004000000000000
tt dq ?
tv93 dq ?
t_div dd ?
t_mod dd ?
temp dd ?
ff dq ? ; return value (int)
ii dd ? ; argument (int) = array index
t_1 dd ?
t_0 dd ?
fcontrol dw 0x0037f
title db ' Fast Hartley Transform Test - A.Jerdev 2010'
I_END:

View File

@ -0,0 +1,841 @@
; Fast Hartley Transform routine
; Copyright (C) 1999, 2004, 2010
; Artem Jerdev artem@jerdev.co.uk
;
; free KolibriOS version - not to be ported to other OSes
; ==========================================================
; global constants
align 8
_r dq 1.41421356237309504880169 ; = sqrt(2)
_r2 dq 0.70710678118654752440084 ; = sqrt(2)/2
_c1 dq 0.92387953251128675612818 ; = cos(pi/8)
_s1 dq 0.38268343236508977172846 ; = sin(pi/8)
;=================================================================
; parameter1:
; -- reg dl (bits[3:0]) = Power_of_4
; returns:
; -- reg edx = _CosTable address (4k-aligned)
; assumes: _SinTable = _CosTable + (N/2)*8
; user heap has to be initialized
; destroys:
; -- eax, ebx, ecx
;; ==========================
align 4
CreateSinCosTable:
xor eax, eax
inc eax
mov cl, dl
and cl, 15
shl eax, cl
shl eax, cl
mov ecx, eax ; now ecx = N
shl ecx, 3
mov ebx, 12
mov eax, 68
int 0x40 ; getmem(N*sizeof(double))
mov edx, eax ; edx = _CosTable
shr ecx, 1
mov ebx, eax
add ebx, ecx ; ebx = _SinTable
shr ecx, 3
push ecx ; [esp] = ecx = N/2
xor eax, eax
fldpi
fidiv dword[esp] ; st : dx = 2*pi/N
pop ecx
fldz ; st : 0, dx
.loop:
fld st0 ; st : x, x, dx
FSINCOS ; st : cos, sin, x, dx
fstp qword [edx+eax*8] ; st : sin, x, dx
fstp qword [ebx+eax*8] ; st : x, dx
fadd st0, st1 ; st : x+dx, dx
inc eax
cmp eax, ecx
jne .loop
fstp st0 ; st : dx
fstp st0 ; st : <empty>
ret
;=================================================================
; parameter1:
; -- reg edx = _CosTable address
; destroys:
; -- eax, ebx, ecx
;; ==========================
align 4
DestroySinCosTable:
mov ecx, edx
mov ebx, 13
mov eax, 68
int 0x40 ; free(SinCosTable)
ret
;=================================================================
; parameter1:
; -- reg dl (bits[3:0]) = Power_of_4
; -- reg edx && (-16) = 4k-aligned data array address
; returns:
; -- edx = Power_of_4
; -- ecx = N
; destroys:
; -- eax, ebx, ecx, edx, esi
;; ==========================
align 4
BitInvert:
mov esi, edx
and esi, 0xFFFFFFF0
and edx, 0x0F
push edx
mov cl, dl
xor eax, eax
inc eax
shl eax, cl
shl eax, cl
push eax
xor ecx, ecx ; index term
.newterm:
inc ecx
cmp ecx, [esp] ; N
jge .done
xor eax, eax
mov edx, ecx
xor bl, bl
.do_invert:
inc bl
cmp bl, byte[esp+4] ; Power_of_4
jg .switch
mov bh, dl
and bh, 3
shl eax, 2
or al, bh
shr edx, 2
jmp .do_invert
.switch:
cmp eax, ecx
jle .newterm
fld qword [esi+eax*8]
fld qword [esi+ecx*8]
fstp qword [esi+eax*8]
fstp qword [esi+ecx*8]
jmp .newterm
.done:
pop ecx
pop edx
ret
;=================================================================
;=================================================================
; stdcall parameters:
; -- [esp+4] = N
; -- [esp+8] = 4k-aligned data array address
; returns:
; -- nothing
; destroys:
; -- ebx, esi
;; ==========================
align 4
step1:
mov ebx, [esp+8]
mov esi, [esp+4]
shl esi, 3
add esi, ebx
.loop:
fld qword[ebx]
fld qword[ebx+8]
fld st1
fsub st0, st1 ; st : t2, f[i+1], f[i]
fxch st1 ; st : f[i+1], t2, f[i]
faddp st2, st0 ; st : t2, t1
fld qword[ebx+16]
fld qword[ebx+24]
fld st1 ; st : f[i+2], f[i+3], f[i+2], t2, t1
fadd st0, st1 ; st : t3, f[i+3], f[i+2], t2, t1
fxch st2 ; st : f[i+2], f[i+3], t3, t2, t1
fsub st0, st1 ; st : t4, f[i+3], t3, t2, t1
fstp st1 ; st : t4, t3, t2, t1
fld st2 ; st : t2, t4, t3, t2, t1
fadd st0, st1 ; st : t2+t4, t4, t3, t2, t1
fstp qword[ebx+16] ; st : t4, t3, t2, t1
fsubp st2, st0 ; st : t3, t2-t4, t1
fld st2 ; st : t1, t3, t2-t4, t1
fadd st0, st1 ; st : t1+t3, t3, t2-t4, t1
fstp qword[ebx] ; st : t3, t2-t4, t1
fsubp st2, st0 ; st : t2-t4, t1-t3
fstp qword[ebx+24] ; st : t1-t3
fstp qword[ebx+8] ; st : <empty>
add ebx, 32
cmp ebx, esi
jnz .loop
ret
; local stack definitions
;===========================================================================
_t0 equ dword [esp]
_t1 equ dword[esp+4]
_t2 equ dword[esp+8]
_t3 equ dword[esp+12]
_t4 equ dword[esp+16]
_t5 equ dword[esp+20]
_t6 equ dword[esp+24]
_t7 equ dword[esp+28]
_t8 equ dword[esp+32]
_t9 equ dword[esp+36]
_l1 equ dword[esp+40]
_l2 equ dword[esp+44]
_l3 equ dword[esp+48]
_l4 equ dword[esp+52]
_l5 equ dword[esp+56]
_l6 equ dword[esp+60]
_l7 equ dword[esp+64]
_l8 equ dword[esp+68]
_l9 equ dword[esp+72]
_l0 equ dword[esp+76]
_d1 equ dword[esp+80]
_d2 equ dword[esp+84]
_d3 equ dword[esp+88]
_d4 equ dword[esp+92]
_d5 equ dword[esp+96]
_d6 equ dword[esp+100]
_j5 equ dword[esp+104]
_jj equ dword[esp+108]
_end_of_array equ dword[esp+112]
_step equ word [esp+116]
;=================================================================
; cdecl parameters:
; -- [ebp+8] = N
; -- [ebp+12] = 4k-aligned data array address
; returns:
; -- nothing
; destroys:
; -- eax, ebx
; locals:
; -- 10 stack-located dwords (_t0 ... _t9)
;; ==========================
align 4
step2:
push ebp
mov ebp, esp
sub esp, 40
mov ebx, [ebp+12]
mov eax, [ebp+ 8]
shl eax, 3
add eax, ebx
.loop_i:
; -- quad subelements +0, +4, +8 and +12 (simpliest operations)
fld qword[ebx]
fld qword[ebx+8*4]
fld st0
fadd st0, st2 ; st : t1, f_4, f_0
fxch st1
fsubp st2, st0 ; st : t1, t2
fld qword[ebx+8*8]
fld qword[ebx+8*12]
fld st0
fadd st0, st2 ; st : t3, f_12, t1, t2
fxch st1
fsubp st2, st0 ; st : t3, t4, t1, t2
; ------
fld st2 ; st : t1, t3, t4, t1, t2
fadd st0, st1
fstp qword[ebx] ; st : t3, t4, t1, t2
fsub st0, st2 ; st : t3-t1, t4, t1, t2
fchs ; st : t1-t3, t4, t1, t2
fstp qword[ebx+8*4] ; st : t4, t1, t2
fst st1 ; st : t4, t4, t2
fadd st0, st2 ; st : t2+t4, t4, t2
fstp qword[ebx+8*8] ; st : t4, t2
fsubp st1, st0 ; st : t2-t4
fstp qword[ebx+8*12] ; st : <empty>
; -- even subelements +2, +6, +10 and +14 (2 multiplications needed)
fld qword[ebx+8*2]
fld qword[ebx+8*6]
fld [_r]
fmul st1, st0 ; st : r, t2, t1
fld qword[ebx+8*10]
fxch st1 ; st : r, t3, t2, t1
fmul qword[ebx+8*14] ; st : t4, t3, t2, t1
; ------
fld st3 ; st : t1, t4, t3, t2, t1
fadd st0, st3 ;
fadd st0, st2 ;
fst qword[ebx+8*2] ; store f[i+8] = t1+t2+t3
fsub st0, st3 ;
fsub st0, st3 ;
fstp qword[ebx+8*10] ; store f[i+10]= t1-t2+t3
fld st3 ; st : t1, t4, t3, t2, t1
fsub st0, st2 ;
fsub st0, st1 ;
fst qword[ebx+8*14] ; store f[i+14]= t1-t3-t4
fadd st0, st1 ;
faddp st1, st0 ; st : t1-t3+t4, t3, t2, t1
fstp qword[ebx+8*6] ; store f[i+6]
fstp st0 ; st : t2, t1
fstp st0 ; st : t1
fstp st0 ; st : <empty>
; -- odd subelements
fld qword[ebx+8*9]
fld qword[ebx+8*11]
fld st1
fsub st0, st1
fxch st1
faddp st2, st0 ; st : (f[l3]-f[l7]), (f[l3]+f[l7])
fld [_r2]
fmul st2, st0
fmulp st1, st0 ; st : t9, t6
fld qword[ebx+8*3]
fld st0
fadd st0, st2 ; st : t1, f[l5], t9, t6
fstp _t1
fsub st0, st1
fstp _t2
fstp _t9 ; (t9 never used)
fstp _t6 ; st : <empty>
fld [_c1]
fld [_s1]
fld qword[ebx+8*5]
fld qword[ebx+8*7]
fld st3 ; st: c1, f[l6], f[l2], s1, c1
fmul st0, st2 ; st: f_2*c, f_6, f_2, s, c
fld st1 ; st: f_6, f_2*c, f_6, f_2, s, c
fmul st0, st4 ; st: f_6*s, f_2*c, f_6, f_2, s, c
faddp st1, st0 ; st: t5, f_6, f_2, s, c
fstp _t5 ; st: f_6, f_2, s, c
fld st3 ; st: c, f_6, f_2, s, c
fmul st0, st1
fld st3
fmul st0, st3 ; st: f_2*s, f_6*c, f_6, f_2, s, c
fsubp st1, st0 ; st: t8, f_6, f_2, s, c
fstp _t8 ; st: f_6, f_2, s, c
fstp st0 ; st: f_2, s, c
fstp st0 ; st: s, c
fld qword[ebx+8*13]
fld qword[ebx+8*15]
fld st3 ; st: c1, f[l8], f[l4], s1, c1
fmul st0, st1
fld st3
fmul st0, st3 ; st: f_4*s, f_8*c, f_8, f_4, s, c
faddp st1, st0 ; st: t7, f_8, f_4, s, c
fld _t5 ; st: t5, t7, f_8, f_4, s, c
fsub st0, st1 ; st: t4, t7, f_8, f_4, s, c
fstp _t4
fstp _t7 ; st: f_8, f_4, s, c
fld st3 ; st: c, f_8, f_4, s, c
fmul st0, st2
fld st3
fmul st0, st2 ; st: f_8*s, f_4*c, f_8, f_4, s, c
fsubp st1, st0 ; st:-t0, f_8, f_4, s, c
fchs
fld _t8
fchs ; st:-t8, t0, f_8, f_4, s, c
fsub st0, st1 ; st: t3, t0, f_8, f_4, s, c
fstp _t3
fstp _t0 ; st: f_8, f_4, s, c
fstp st0 ; st: f_4, s, c
fstp st0 ; st: s, c
fstp st0 ; st: c
fstp st0 ; st: <empty>
fld _t1
fld _t4
fld st1
fsub st0, st1
fstp qword[ebx+8*11] ; f[l7] = t1-t4
faddp st1, st0
fstp qword[ebx+8*3] ; f[l5] = t1+t4
fld _t2
fld _t3
fld st1
fsub st0, st1
fstp qword[ebx+8*15] ; f[l8]
faddp st1, st0
fstp qword[ebx+8*7] ; f[l6]
fld _t6
fld qword[ebx+8]
fld st1
fsub st0, st1
fxch st1
faddp st2, st0 ; st : t2, t1
fld _t8
fsub _t0
fld _t5
fadd _t7 ; st : t4, t3, t2, t1
fld st3
fsub st0, st1
fstp qword[ebx+8*9] ; f[l3] = t1-t4
fadd st0, st3
fstp qword[ebx+8] ; f[l1] = t1+t4
fld st1 ; st : t2, t3, t2, t1
fsub st0, st1 ; f[l4] = t2-t3
fstp qword[ebx+8*13] ; st : t3, t2, t1
faddp st1, st0 ; st : t2+t3, t1
fstp qword[ebx+8*5] ; f[l2] = t2+t3
fstp st0 ; st : <empty>
add ebx, 16*8
cmp ebx, eax
jb .loop_i
mov esp, ebp
pop ebp
ret
;=================================================================
; cdecl parameters:
; -- [ebp+8] = N
; -- [ebp+12] = p
; -- [ebp+16] = 4k-aligned data array address
; -- [ebp+20] = 4k-aligned SinCosTable address
; returns:
; -- nothing
; destroys:
; -- all GPRegs
; locals:
; -- 120 stack-located dwords (_t0 ... _t9, _l0..._step)
;; ==========================
align 4
step3:
push ebp
mov ebp, esp
sub esp, 120
; 283 : {
; 293 : for (l=3; l<=p; l++)
mov cx, 0x0200
.newstep:
inc ch
cmp ch, byte[ebp+12]
jg .done
mov _step, cx
; 294 : {
; 295 : d1 = 1 << (l + l - 3);
mov cl, ch
add cl, cl
sub cl, 3
mov edx, 1
shl edx, cl
mov _d1, edx
; 296 : d2 = d1 << 1;
shl edx, 1
mov _d2, edx
mov eax, edx
; 297 : d3 = d2 << 1;
shl edx, 1
mov _d3, edx
; 298 : d4 = d2 + d3;
add eax, edx
mov _d4, eax
; 299 : d5 = d3 << 1;
shl edx, 1
mov _d5, edx
shl edx, 3
mov _d6, edx ; d6 = d5*8 to simplify index operations
; 339 : j5 = N / d5; ; moved out of internal loop
mov cl, [ebp+12]
sub cl, ch
add cl, cl
mov edx, 1
shl edx, cl
mov _j5, edx
; 300 :
; 301 : for (j=0; j<N; j+=d5)
mov ebx, [ebp+16]
mov esi, [ebp+8]
shl esi, 3
add esi, ebx
mov _end_of_array, esi
.next_j:
; {
; t1 = f[j] + f[j+d2];
mov eax, _d2
fld qword[ebx]
fld qword[ebx+eax*8]
fld st1
fadd st0, st1
fstp _t1
; t2 = f[j] - f[j+d2];
fsubp st1, st0
fstp _t2
; t3 = f[j+d3] + f[j+d4];
mov edi, _d3
fld qword[ebx+edi*8]
mov edx, _d4
fld qword[ebx+edx*8]
fld st1
fsub st0, st1 ; st : t4, f4, f3
fxch st1 ; st : f4, t4, f3
; t4 = f[j+d3] - f[j+d4];
faddp st2, st0 ; st : t4, t3
; f[j+d4] = t2 - t4;
; f[j+d3] = t2 + t4;
fld _t2
fld st0
fsub st0, st2 ; st : f4, t2, t4, t3
fstp qword[ebx+edx*8] ; st : t2, t4, t3
fadd st0, st1 ; st : f3, t4, t3
fstp qword[ebx+edi*8] ; st : t4, t3
; f[j+d2] = t1 - t3;
; f[j] = t1 + t3;
fld _t1
fst st1
fsub st0, st2 ; st : f2, t1, t3
fstp qword[ebx+eax*8] ; st : t1, t3
fadd st0, st1 ; st : f0, t3
fstp qword[ebx] ; st : t3
fstp st0
; jj = j + d1; / ??
mov edi, _d1
shl edi, 3 ; = d1*8
mov edx, edi
mov eax, edi
add eax, eax ; eax = d2*8
shl edx, 2 ; = d3*8
add edi, ebx ; now [edi] points to f[jj]
add edx, edi ; and [edx] points to f[jj+d3]
; t1 = f[jj];
fld qword [edi] ; st : t1
; t3 = f[jj+d3];
fld qword [edx] ; st : t3, t1
; t2 = f[jj+d2] * r;
fld qword [edi+eax]
fld [_r]
fmul st1, st0 ; st : r, t2, t3, t1
; t4 = f[jj+d4] * r
fmul qword [edx+eax] ; st : t4, t2, t3, t1
; f[jj] = t1 + t2 + t3;
fld st3 ; st : t1, t4, t2, t3, t1
fadd st0, st3
fadd st0, st2
fstp qword [edi]
; f[jj+d2] = t1 - t3 + t4;
fld st3
fsub st0, st3 ; st : (t1-t3), t4, t2, t3, t1
fld st0
fadd st0, st2 ; st : f2, (t1-t3), t4, t2, t3, t1
fstp qword [edi+eax]
; f[jj+d4] = t1 - t3 - t4;
fsub st0, st1 ; st : f4, t4, t2, t3, t1
fstp qword [edx+eax]
; f[jj+d3] = t1 - t2 + t3;
fstp st0 ; st : t2, t3, t1
fsubp st1, st0 ; st : (t3-t2), t1
faddp st1, st0 ; st : f3
fstp qword [edx]
; for (k=1; k<d1; k++)
xor ecx, ecx ; ecx = k
mov _jj, ecx
.next_k:
inc ecx
cmp ecx, _d1
jge .done_k
; {
mov eax, _d2 ; the sector increment
; l1 = j + k;
mov edx, ecx
mov _l1, edx ; [ebx+edx*8] --> f[j+k]
; l2 = l1 + d2;
add edx, eax
mov _l2, edx
; l3 = l1 + d3;
add edx, eax
mov _l3, edx
; l4 = l1 + d4;
add edx, eax
mov _l4, edx
; l5 = j + d2 - k;
mov edx, eax
sub edx, ecx
mov _l5, edx
; l6 = l5 + d2;
add edx, eax
mov _l6, edx
; l7 = l5 + d3;
add edx, eax
mov _l7, edx
; l8 = l5 + d4;
add edx, eax
mov _l8, edx
; 340 : j5 *= k; // add-substituted multiplication
mov eax, _jj
add eax, _j5
mov _jj, eax
; c1 = C[jj];
; s1 = S[jj];
mov edi, [ebp+20]
fld qword[edi+eax*8]
mov esi, [ebp+8]
shl esi, 2
add esi, edi
fld qword[esi+eax*8] ; st : s1, c1
; t5 = f[l2] * c1 + f[l6] * s1;
; t8 = f[l6] * c1 - f[l2] * s1;
mov edx, _l6
fld qword[ebx+edx*8]
mov edx, _l2
fld st0
fmul st0, st2
fxch st1
fmul st0, st3
fld qword[ebx+edx*8] ; st : f[l2], f[l6]*c, f[l6]*s, s, c
fmul st4, st0
fmulp st3, st0 ; st : f[l6]*c, f[l6]*s, f[l2]*s, f[l2]*c
fsub st0, st2 ; st : t8, f[l6]*s, f[l2]*s, f[l2]*c
fstp _t8
faddp st2, st0 ; st : f[l2]*s, t5
fstp st0 ; st : t5
fstp _t5 ; st : <empty>
; c2 = C[2*jj];
; s2 = S[2*jj];
shl eax, 1
fld qword[edi+eax*8]
fld qword[esi+eax*8] ; st : s2, c2
; t6 = f[l3] * c2 + f[l7] * s2;
; t9 = f[l7] * c2 - f[l3] * s2;
mov edx, _l7
fld qword[ebx+edx*8]
mov edx, _l3
fld st0
fmul st0, st2
fxch st1
fmul st0, st3
fld qword[ebx+edx*8] ; st : f[l3], f[l7]*c, f[l7]*s, s, c
fmul st4, st0
fmulp st3, st0 ; st : f[l7]*c, f[l7]*s, f[l3]*s, f[l3]*c
fsub st0, st2 ; st : t9, f[l7]*s, f[l3]*s, f[l3]*c
fstp _t9
faddp st2, st0 ; st : f[l2]*s, t6
fstp st0 ; st : t6
fstp _t6 ; st : <empty>
; c3 = C[3*jj];
; s3 = S[3*jj];
add eax, _jj
fld qword[edi+eax*8]
fld qword[esi+eax*8] ; st : s3, c3
; t7 = f[l4] * c3 + f[l8] * s3;
; t0 = f[l8] * c3 - f[l4] * s3;
mov edx, _l8
fld qword[ebx+edx*8]
mov edx, _l4
fld st0
fmul st0, st2
fxch st1
fmul st0, st3
fld qword[ebx+edx*8] ; st : f[l4], f[l8]*c, f[l8]*s, s, c
fmul st4, st0
fmulp st3, st0 ; st : f[l8]*c, f[l8]*s, f[l4]*s, f[l4]*c
fsub st0, st2 ; st : t9, f[l8]*s, f[l4]*s, f[l4]*c
fstp _t0
faddp st2, st0 ; st : f[l2]*s, t7
fstp st0 ; st : t7
fstp _t7 ; st : <empty>
; t1 = f[l5] - t9;
; t2 = f[l5] + t9;
mov eax, _l5
fld qword [ebx+eax*8]
fld _t9
fld st0
fadd st0, st2
fstp _t2
fsubp st1, st0
fstp _t1
; t3 = - t8 - t0;
fld _t8
fadd _t0
fchs
fstp _t3
; t4 = t5 - t7;
fld _t5
fsub _t7
fstp _t4
; f[l5] = t1 + t4;
fld _t1
fld _t4
fld st0
fadd st0, st2
fstp qword [ebx+eax*8]
; f[l7] = t1 - t4;
mov eax, _l7
fsubp st1, st0
fstp qword [ebx+eax*8]
; f[l6] = t2 + t3;
mov eax, _l6
fld _t2
fld _t3
fld st0
fadd st0, st2
fstp qword [ebx+eax*8]
; f[l8] = t2 - t3;
mov eax, _l8
fsubp st1, st0
fstp qword [ebx+eax*8]
; t1 = f[l1] + t6;
mov eax, _l1
fld qword [ebx+eax*8]
fld _t6
fld st0
fadd st0, st2
fstp _t1
; t2 = f[l1] - t6;
fsubp st1, st0
fstp _t2
; t3 = t8 - t0;
fld _t8
fsub _t0
fstp _t3
; t4 = t5 + t7;
fld _t5
fadd _t7
fstp _t4
; f[l1] = t1 + t4;
mov eax, _l1
fld _t1
fld _t4
fld st0
fadd st0, st2
fstp qword [ebx+eax*8]
; f[l3] = t1 - t4;
mov eax, _l3
fsubp st1, st0
fstp qword [ebx+eax*8]
; f[l2] = t2 + t3;
mov eax, _l2
fld _t2
fld _t3
fld st0
fadd st0, st2
fstp qword [ebx+eax*8]
; f[l4] = t2 - t3;
mov eax, _l4
fsubp st1, st0
fstp qword [ebx+eax*8]
; 374 : }
jmp .next_k
.done_k:
; 375 : }
add ebx, _d6 ; d6 = d5*8
cmp ebx, _end_of_array
jb .next_j
; 376 : }
mov cx, _step
jmp .newstep
.done:
mov esp, ebp
pop ebp
; 377 : }
ret
;=========== Step3 ends here ===========
; =================================================================
;=================================================================
; parameters:
; -- [ebp+8] = N
; -- [ebp+12] = p
; -- [ebp+16] = 4k-aligned data array address
; -- [ebp+20] = 4k-aligned SinCosTable address
; returns:
; -- nothing
; destroys:
; -- all GPRegs
;; ==========================
align 4
FHT_4:
push ebp
mov ebp, esp
mov edx, [ebp+16]
add edx, [ebp+12]
call BitInvert
push dword[ebp+16]
push dword[ebp+8]
call step1
call step2
pop edx ; N
pop ecx ; a
push dword[ebp+20] ; t
push ecx
push dword[ebp+12] ; p
push edx ; N
call step3
mov esp, ebp
pop ebp
ret

View File

@ -69,7 +69,7 @@ include '../../../develop/libraries/box_lib/load_lib.mac'
@use_library
;include 'debug.inc'
;include '../../debug.inc'
if ~ RENDER eq PIX
TOP=TOP+4