From 6641043d1785be0ff77ed88f04b8c85d54e9b99c Mon Sep 17 00:00:00 2001 From: "Artem Jerdev (art_zh)" Date: Sun, 18 Sep 2011 21:08:53 +0000 Subject: [PATCH] fast Hartley transform presented git-svn-id: svn://kolibrios.org@2210 a494cfbc-eb01-0410-851d-a64ba20cac60 --- programs/other/archer/trunk/@RCHER.ASM | 2 +- programs/other/{archer/trunk => }/debug.inc | 272 +++---- programs/other/fft/FHT.txt | 28 + programs/other/fft/FHT4A.asm | 203 +++++ programs/other/fft/FHT4i.inc | 841 ++++++++++++++++++++ programs/other/rtfread/trunk/rtfread.asm | 2 +- 6 files changed, 1210 insertions(+), 138 deletions(-) rename programs/other/{archer/trunk => }/debug.inc (91%) create mode 100644 programs/other/fft/FHT.txt create mode 100644 programs/other/fft/FHT4A.asm create mode 100644 programs/other/fft/FHT4i.inc diff --git a/programs/other/archer/trunk/@RCHER.ASM b/programs/other/archer/trunk/@RCHER.ASM index 2d9fbab0d3..ea3bc7ba28 100644 --- a/programs/other/archer/trunk/@RCHER.ASM +++ b/programs/other/archer/trunk/@RCHER.ASM @@ -79,7 +79,7 @@ end if ;{ include "..\..\..\macros.inc" ; purge mov - include "debug.inc" + include "..\..\debug.inc" include 'dump.inc' ;} end if diff --git a/programs/other/archer/trunk/debug.inc b/programs/other/debug.inc similarity index 91% rename from programs/other/archer/trunk/debug.inc rename to programs/other/debug.inc index 0efb67985c..243a49bee6 100644 --- a/programs/other/archer/trunk/debug.inc +++ b/programs/other/debug.inc @@ -1,137 +1,137 @@ -macro debug_print str -{ - local ..string, ..label - - jmp ..label - ..string db str,0 - ..label: - - pushf - pushad - mov edx,..string - call debug_outstr - popad - popf -} - -dps fix debug_print - -macro debug_print_dec arg -{ - pushf - pushad - if ~arg eq eax - mov eax,arg - end if - call debug_outdec - popad - popf -} - -dpd fix debug_print_dec - -;--------------------------------- -debug_outdec: ;(eax - num, edi-str) - push 10 ;2 - pop ecx ;1 - push -'0' ;2 - .l0: - xor edx,edx ;2 - div ecx ;2 - push edx ;1 - test eax,eax ;2 - jnz .l0 ;2 - .l1: - pop eax ;1 - add al,'0' ;2 - call debug_outchar ; stosb - jnz .l1 ;2 - ret ;1 -;--------------------------------- - -debug_outchar: ; al - char - pushf - pushad - mov cl,al - mov eax,63 - mov ebx,1 - mcall - popad - popf -ret - -debug_outstr: - mov eax,63 - mov ebx,1 - @@: - mov cl,[edx] - test cl,cl - jz @f - int 40h - inc edx - jmp @b - @@: - ret - -_debug_crlf db 13, 10, 0 - -macro newline -{ - pushf - pushad - mov edx, _debug_crlf - call debug_outstr - popad - popf -} - -macro print message -{ - dps message - newline -} - -macro pregs -{ - dps "EAX: " - dpd eax - dps " EBX: " - dpd ebx - newline - dps "ECX: " - dpd ecx - dps " EDX: " - dpd edx - newline -} - -macro debug_print_hex arg -{ - pushf - pushad - if ~arg eq eax - mov eax, arg - end if - call debug_outhex - popad - popf -} -dph fix debug_print_hex - -debug_outhex: - ; eax - number - mov edx, 8 - .new_char: - rol eax, 4 - movzx ecx, al - and cl, 0x0f - mov cl, [__hexdigits + ecx] - pushad - mcall 63, 1 - popad - dec edx - jnz .new_char -ret - -__hexdigits: +macro debug_print str +{ + local ..string, ..label + + jmp ..label + ..string db str,0 + ..label: + + pushf + pushad + mov edx,..string + call debug_outstr + popad + popf +} + +dps fix debug_print + +macro debug_print_dec arg +{ + pushf + pushad + if ~arg eq eax + mov eax,arg + end if + call debug_outdec + popad + popf +} + +dpd fix debug_print_dec + +;--------------------------------- +debug_outdec: ;(eax - num, edi-str) + push 10 ;2 + pop ecx ;1 + push -'0' ;2 + .l0: + xor edx,edx ;2 + div ecx ;2 + push edx ;1 + test eax,eax ;2 + jnz .l0 ;2 + .l1: + pop eax ;1 + add al,'0' ;2 + call debug_outchar ; stosb + jnz .l1 ;2 + ret ;1 +;--------------------------------- + +debug_outchar: ; al - char + pushf + pushad + mov cl,al + mov eax,63 + mov ebx,1 + mcall + popad + popf +ret + +debug_outstr: + mov eax,63 + mov ebx,1 + @@: + mov cl,[edx] + test cl,cl + jz @f + mcall + inc edx + jmp @b + @@: + ret + +_debug_crlf db 13, 10, 0 + +macro newline +{ + pushf + pushad + mov edx, _debug_crlf + call debug_outstr + popad + popf +} + +macro print message +{ + dps message + newline +} + +macro pregs +{ + dps "EAX: " + dpd eax + dps " EBX: " + dpd ebx + newline + dps "ECX: " + dpd ecx + dps " EDX: " + dpd edx + newline +} + +macro debug_print_hex arg +{ + pushf + pushad + if ~arg eq eax + mov eax, arg + end if + call debug_outhex + popad + popf +} +dph fix debug_print_hex + +debug_outhex: + ; eax - number + mov edx, 8 + .new_char: + rol eax, 4 + movzx ecx, al + and cl, 0x0f + mov cl, [__hexdigits + ecx] + pushad + mcall 63, 1 + popad + dec edx + jnz .new_char +ret + +__hexdigits: db '0123456789ABCDEF' \ No newline at end of file diff --git a/programs/other/fft/FHT.txt b/programs/other/fft/FHT.txt new file mode 100644 index 0000000000..4529ec2fb7 --- /dev/null +++ b/programs/other/fft/FHT.txt @@ -0,0 +1,28 @@ +This is a draft version of my Fast Hartley Transform (FHT) routine for KolibriOS + +Hartley transform is a real-basis version of well-known Fourier transform: + +1) basis function: cas(x) = cos(x) + sin(x); +2) forward transform: H(f) = sum(k=0..N-1) [X(k)*cas(kf/(2*pi*N))] +3) reverse transform: X(k) = 1/N * sum(f=0..N-1) [H(f)*cas(kf/(2*pi*N))] + +FHT is known to be faster than most conventional fast Fourier transform (FHT) methods. +It also uses half-length arrays due to no need of imaginary data storage. + +FHT can be easily converted to FFT (and back) with no loss of information. +Most of general tasks FFT used for (correlation, convolution, energy spectra, noise +filtration, differential math, phase detection ect.) may be done directly with FHT. + +==================================================================================== + +Copyright (C) A. Jerdev 1999, 2003 and 2010. + +The code can be used, changed and redistributed in any KolibriOS application +with only two limitations: + +1) the author's name and copyright information cannot be deleted or changed; + +2) the code is not allowed to be ported to or distributed with other operation systems. + +18/09/2010 +Artem Jerdev diff --git a/programs/other/fft/FHT4A.asm b/programs/other/fft/FHT4A.asm new file mode 100644 index 0000000000..43962aa2ce --- /dev/null +++ b/programs/other/fft/FHT4A.asm @@ -0,0 +1,203 @@ +;======================================================================== +;= = +;= Fast Hartley Transform demo for KolibriOS = +;= = +;= Copyright (C) 2010, Artem Jerdev = +;= = +;= refer to wiki.kolibtios.org for all details = +;= = +;======================================================================== + + + + +use32 + + org 0x0 + + db 'MENUET01' ; 8 byte id + dd 0x01 ; header version + dd START ; start of code + dd I_END ; size of image + dd 0x100000 ; memory for app + dd 0xbfffc ; esp + dd 0x0 , 0x0 ; I_Param , I_Icon + +include '../../macros.inc' +include '../debug.inc' +include 'FHT4i.inc' + + +START: ; start of execution + + call main + + + mov eax,-1 ; close this program + int 0x40 + + +;============================================================= +;Func: calculates a simple function +; ff = (512*2^(-t) * cos (2.5*t)) +; uses: eax, ebx +;------------ +Func: + +; 9 : { + +; 10 : double x,t; +; 11 : int f; +; 12 : +; 13 : x = (i < N2) ? i : i - NUM_POINTS; + mov eax, [ii] + cmp eax, 512 + jge .index_negative + jmp .index_correct +.index_negative: + sub eax, 1024 +.index_correct: + mov [temp], eax +; fild [temp] + +; 14 : t = x / 16.0; +; f2xm1 argument (abs) must be less than 1, so + mov [t_mod], eax + and [t_mod], 0x0F ; x % 16 + shr eax, 4 ; x / 16 + mov [t_div], eax + fild [temp] + +; 15 : if (t<0) t = -t; + fabs +exp_ok: +; 16 : f = (512*2^(-t) * cos (2.5*t)); + fchs + f2xm1 + fmul [f500] + fstp [tv93] + fld [f2_5] + fmul [tt] + fcos + fmul [tv93] + fstp [tt] + mov bx, word[tt+6] + shr bx,4 + and bx,0x07FF + add ax,bx + shl ax,4 + and word[tt+6], 0x800F + or word[tt+6], ax + fld [tt] + fstp [ff] + +; 17 : return f; +; 18 : } + ret +;--------------------------------------------------------- +; test data filler +; +; uses eax, ebx, ecx +FillData: +; 29 : for (i=0; i +ret + +;================================================================= +; parameter1: +; -- reg edx = _CosTable address +; destroys: +; -- eax, ebx, ecx +;; ========================== +align 4 +DestroySinCosTable: + mov ecx, edx + mov ebx, 13 + mov eax, 68 + int 0x40 ; free(SinCosTable) +ret + +;================================================================= +; parameter1: +; -- reg dl (bits[3:0]) = Power_of_4 +; -- reg edx && (-16) = 4k-aligned data array address +; returns: +; -- edx = Power_of_4 +; -- ecx = N +; destroys: +; -- eax, ebx, ecx, edx, esi +;; ========================== +align 4 +BitInvert: + mov esi, edx + and esi, 0xFFFFFFF0 + and edx, 0x0F + push edx + mov cl, dl + xor eax, eax + inc eax + shl eax, cl + shl eax, cl + push eax + xor ecx, ecx ; index term +.newterm: + inc ecx + cmp ecx, [esp] ; N + jge .done + + xor eax, eax + mov edx, ecx + xor bl, bl + +.do_invert: + inc bl + cmp bl, byte[esp+4] ; Power_of_4 + jg .switch + + mov bh, dl + and bh, 3 + shl eax, 2 + or al, bh + shr edx, 2 + jmp .do_invert + +.switch: + cmp eax, ecx + jle .newterm + + fld qword [esi+eax*8] + fld qword [esi+ecx*8] + fstp qword [esi+eax*8] + fstp qword [esi+ecx*8] + jmp .newterm + +.done: + pop ecx + pop edx + ret + +;================================================================= + + +;================================================================= +; stdcall parameters: +; -- [esp+4] = N +; -- [esp+8] = 4k-aligned data array address +; returns: +; -- nothing +; destroys: +; -- ebx, esi +;; ========================== +align 4 +step1: + mov ebx, [esp+8] + mov esi, [esp+4] + shl esi, 3 + add esi, ebx + +.loop: + fld qword[ebx] + fld qword[ebx+8] + fld st1 + fsub st0, st1 ; st : t2, f[i+1], f[i] + fxch st1 ; st : f[i+1], t2, f[i] + faddp st2, st0 ; st : t2, t1 + fld qword[ebx+16] + fld qword[ebx+24] + fld st1 ; st : f[i+2], f[i+3], f[i+2], t2, t1 + fadd st0, st1 ; st : t3, f[i+3], f[i+2], t2, t1 + fxch st2 ; st : f[i+2], f[i+3], t3, t2, t1 + fsub st0, st1 ; st : t4, f[i+3], t3, t2, t1 + fstp st1 ; st : t4, t3, t2, t1 + fld st2 ; st : t2, t4, t3, t2, t1 + fadd st0, st1 ; st : t2+t4, t4, t3, t2, t1 + fstp qword[ebx+16] ; st : t4, t3, t2, t1 + fsubp st2, st0 ; st : t3, t2-t4, t1 + fld st2 ; st : t1, t3, t2-t4, t1 + fadd st0, st1 ; st : t1+t3, t3, t2-t4, t1 + fstp qword[ebx] ; st : t3, t2-t4, t1 + fsubp st2, st0 ; st : t2-t4, t1-t3 + fstp qword[ebx+24] ; st : t1-t3 + fstp qword[ebx+8] ; st : + + add ebx, 32 + cmp ebx, esi + jnz .loop +ret + +; local stack definitions +;=========================================================================== +_t0 equ dword [esp] +_t1 equ dword[esp+4] +_t2 equ dword[esp+8] +_t3 equ dword[esp+12] +_t4 equ dword[esp+16] +_t5 equ dword[esp+20] +_t6 equ dword[esp+24] +_t7 equ dword[esp+28] +_t8 equ dword[esp+32] +_t9 equ dword[esp+36] + +_l1 equ dword[esp+40] +_l2 equ dword[esp+44] +_l3 equ dword[esp+48] +_l4 equ dword[esp+52] +_l5 equ dword[esp+56] +_l6 equ dword[esp+60] +_l7 equ dword[esp+64] +_l8 equ dword[esp+68] +_l9 equ dword[esp+72] +_l0 equ dword[esp+76] +_d1 equ dword[esp+80] +_d2 equ dword[esp+84] +_d3 equ dword[esp+88] +_d4 equ dword[esp+92] +_d5 equ dword[esp+96] +_d6 equ dword[esp+100] +_j5 equ dword[esp+104] +_jj equ dword[esp+108] +_end_of_array equ dword[esp+112] +_step equ word [esp+116] + + +;================================================================= +; cdecl parameters: +; -- [ebp+8] = N +; -- [ebp+12] = 4k-aligned data array address +; returns: +; -- nothing +; destroys: +; -- eax, ebx +; locals: +; -- 10 stack-located dwords (_t0 ... _t9) +;; ========================== +align 4 +step2: + push ebp + mov ebp, esp + sub esp, 40 + mov ebx, [ebp+12] + mov eax, [ebp+ 8] + shl eax, 3 + add eax, ebx + +.loop_i: + +; -- quad subelements +0, +4, +8 and +12 (simpliest operations) + fld qword[ebx] + fld qword[ebx+8*4] + fld st0 + fadd st0, st2 ; st : t1, f_4, f_0 + fxch st1 + fsubp st2, st0 ; st : t1, t2 + fld qword[ebx+8*8] + fld qword[ebx+8*12] + fld st0 + fadd st0, st2 ; st : t3, f_12, t1, t2 + fxch st1 + fsubp st2, st0 ; st : t3, t4, t1, t2 + ; ------ + fld st2 ; st : t1, t3, t4, t1, t2 + fadd st0, st1 + fstp qword[ebx] ; st : t3, t4, t1, t2 + fsub st0, st2 ; st : t3-t1, t4, t1, t2 + fchs ; st : t1-t3, t4, t1, t2 + fstp qword[ebx+8*4] ; st : t4, t1, t2 + fst st1 ; st : t4, t4, t2 + fadd st0, st2 ; st : t2+t4, t4, t2 + fstp qword[ebx+8*8] ; st : t4, t2 + fsubp st1, st0 ; st : t2-t4 + fstp qword[ebx+8*12] ; st : + +; -- even subelements +2, +6, +10 and +14 (2 multiplications needed) + fld qword[ebx+8*2] + fld qword[ebx+8*6] + fld [_r] + fmul st1, st0 ; st : r, t2, t1 + fld qword[ebx+8*10] + fxch st1 ; st : r, t3, t2, t1 + fmul qword[ebx+8*14] ; st : t4, t3, t2, t1 + ; ------ + fld st3 ; st : t1, t4, t3, t2, t1 + fadd st0, st3 ; + fadd st0, st2 ; + fst qword[ebx+8*2] ; store f[i+8] = t1+t2+t3 + fsub st0, st3 ; + fsub st0, st3 ; + fstp qword[ebx+8*10] ; store f[i+10]= t1-t2+t3 + fld st3 ; st : t1, t4, t3, t2, t1 + fsub st0, st2 ; + fsub st0, st1 ; + fst qword[ebx+8*14] ; store f[i+14]= t1-t3-t4 + fadd st0, st1 ; + faddp st1, st0 ; st : t1-t3+t4, t3, t2, t1 + fstp qword[ebx+8*6] ; store f[i+6] + fstp st0 ; st : t2, t1 + fstp st0 ; st : t1 + fstp st0 ; st : + +; -- odd subelements + fld qword[ebx+8*9] + fld qword[ebx+8*11] + fld st1 + fsub st0, st1 + fxch st1 + faddp st2, st0 ; st : (f[l3]-f[l7]), (f[l3]+f[l7]) + fld [_r2] + fmul st2, st0 + fmulp st1, st0 ; st : t9, t6 + fld qword[ebx+8*3] + fld st0 + fadd st0, st2 ; st : t1, f[l5], t9, t6 + fstp _t1 + fsub st0, st1 + fstp _t2 + fstp _t9 ; (t9 never used) + fstp _t6 ; st : + + fld [_c1] + fld [_s1] + fld qword[ebx+8*5] + fld qword[ebx+8*7] + fld st3 ; st: c1, f[l6], f[l2], s1, c1 + fmul st0, st2 ; st: f_2*c, f_6, f_2, s, c + fld st1 ; st: f_6, f_2*c, f_6, f_2, s, c + fmul st0, st4 ; st: f_6*s, f_2*c, f_6, f_2, s, c + faddp st1, st0 ; st: t5, f_6, f_2, s, c + fstp _t5 ; st: f_6, f_2, s, c + fld st3 ; st: c, f_6, f_2, s, c + fmul st0, st1 + fld st3 + fmul st0, st3 ; st: f_2*s, f_6*c, f_6, f_2, s, c + fsubp st1, st0 ; st: t8, f_6, f_2, s, c + fstp _t8 ; st: f_6, f_2, s, c + fstp st0 ; st: f_2, s, c + fstp st0 ; st: s, c + + fld qword[ebx+8*13] + fld qword[ebx+8*15] + fld st3 ; st: c1, f[l8], f[l4], s1, c1 + fmul st0, st1 + fld st3 + fmul st0, st3 ; st: f_4*s, f_8*c, f_8, f_4, s, c + faddp st1, st0 ; st: t7, f_8, f_4, s, c + fld _t5 ; st: t5, t7, f_8, f_4, s, c + fsub st0, st1 ; st: t4, t7, f_8, f_4, s, c + fstp _t4 + fstp _t7 ; st: f_8, f_4, s, c + fld st3 ; st: c, f_8, f_4, s, c + fmul st0, st2 + fld st3 + fmul st0, st2 ; st: f_8*s, f_4*c, f_8, f_4, s, c + fsubp st1, st0 ; st:-t0, f_8, f_4, s, c + fchs + fld _t8 + fchs ; st:-t8, t0, f_8, f_4, s, c + fsub st0, st1 ; st: t3, t0, f_8, f_4, s, c + fstp _t3 + fstp _t0 ; st: f_8, f_4, s, c + fstp st0 ; st: f_4, s, c + fstp st0 ; st: s, c + fstp st0 ; st: c + fstp st0 ; st: + + fld _t1 + fld _t4 + fld st1 + fsub st0, st1 + fstp qword[ebx+8*11] ; f[l7] = t1-t4 + faddp st1, st0 + fstp qword[ebx+8*3] ; f[l5] = t1+t4 + fld _t2 + fld _t3 + fld st1 + fsub st0, st1 + fstp qword[ebx+8*15] ; f[l8] + faddp st1, st0 + fstp qword[ebx+8*7] ; f[l6] + + fld _t6 + fld qword[ebx+8] + fld st1 + fsub st0, st1 + fxch st1 + faddp st2, st0 ; st : t2, t1 + fld _t8 + fsub _t0 + fld _t5 + fadd _t7 ; st : t4, t3, t2, t1 + + fld st3 + fsub st0, st1 + fstp qword[ebx+8*9] ; f[l3] = t1-t4 + fadd st0, st3 + fstp qword[ebx+8] ; f[l1] = t1+t4 + fld st1 ; st : t2, t3, t2, t1 + fsub st0, st1 ; f[l4] = t2-t3 + fstp qword[ebx+8*13] ; st : t3, t2, t1 + faddp st1, st0 ; st : t2+t3, t1 + fstp qword[ebx+8*5] ; f[l2] = t2+t3 + fstp st0 ; st : + + add ebx, 16*8 + cmp ebx, eax + jb .loop_i + + mov esp, ebp + pop ebp +ret + + + + +;================================================================= +; cdecl parameters: +; -- [ebp+8] = N +; -- [ebp+12] = p +; -- [ebp+16] = 4k-aligned data array address +; -- [ebp+20] = 4k-aligned SinCosTable address +; returns: +; -- nothing +; destroys: +; -- all GPRegs +; locals: +; -- 120 stack-located dwords (_t0 ... _t9, _l0..._step) +;; ========================== +align 4 +step3: + push ebp + mov ebp, esp + sub esp, 120 +; 283 : { + + +; 293 : for (l=3; l<=p; l++) + mov cx, 0x0200 +.newstep: + inc ch + cmp ch, byte[ebp+12] + jg .done + mov _step, cx + +; 294 : { +; 295 : d1 = 1 << (l + l - 3); + + mov cl, ch + add cl, cl + sub cl, 3 + mov edx, 1 + shl edx, cl + mov _d1, edx + +; 296 : d2 = d1 << 1; + shl edx, 1 + mov _d2, edx + mov eax, edx + +; 297 : d3 = d2 << 1; + shl edx, 1 + mov _d3, edx + +; 298 : d4 = d2 + d3; + add eax, edx + mov _d4, eax + +; 299 : d5 = d3 << 1; + shl edx, 1 + mov _d5, edx + shl edx, 3 + mov _d6, edx ; d6 = d5*8 to simplify index operations + +; 339 : j5 = N / d5; ; moved out of internal loop + mov cl, [ebp+12] + sub cl, ch + add cl, cl + mov edx, 1 + shl edx, cl + mov _j5, edx + +; 300 : +; 301 : for (j=0; j f[j+k] +; l2 = l1 + d2; + add edx, eax + mov _l2, edx +; l3 = l1 + d3; + add edx, eax + mov _l3, edx +; l4 = l1 + d4; + add edx, eax + mov _l4, edx + +; l5 = j + d2 - k; + mov edx, eax + sub edx, ecx + mov _l5, edx +; l6 = l5 + d2; + add edx, eax + mov _l6, edx +; l7 = l5 + d3; + add edx, eax + mov _l7, edx +; l8 = l5 + d4; + add edx, eax + mov _l8, edx + + +; 340 : j5 *= k; // add-substituted multiplication + mov eax, _jj + add eax, _j5 + mov _jj, eax + +; c1 = C[jj]; +; s1 = S[jj]; + mov edi, [ebp+20] + fld qword[edi+eax*8] + mov esi, [ebp+8] + shl esi, 2 + add esi, edi + fld qword[esi+eax*8] ; st : s1, c1 + +; t5 = f[l2] * c1 + f[l6] * s1; +; t8 = f[l6] * c1 - f[l2] * s1; + mov edx, _l6 + fld qword[ebx+edx*8] + mov edx, _l2 + fld st0 + fmul st0, st2 + fxch st1 + fmul st0, st3 + fld qword[ebx+edx*8] ; st : f[l2], f[l6]*c, f[l6]*s, s, c + fmul st4, st0 + fmulp st3, st0 ; st : f[l6]*c, f[l6]*s, f[l2]*s, f[l2]*c + fsub st0, st2 ; st : t8, f[l6]*s, f[l2]*s, f[l2]*c + fstp _t8 + faddp st2, st0 ; st : f[l2]*s, t5 + fstp st0 ; st : t5 + fstp _t5 ; st : + +; c2 = C[2*jj]; +; s2 = S[2*jj]; + shl eax, 1 + fld qword[edi+eax*8] + fld qword[esi+eax*8] ; st : s2, c2 + +; t6 = f[l3] * c2 + f[l7] * s2; +; t9 = f[l7] * c2 - f[l3] * s2; + mov edx, _l7 + fld qword[ebx+edx*8] + mov edx, _l3 + fld st0 + fmul st0, st2 + fxch st1 + fmul st0, st3 + fld qword[ebx+edx*8] ; st : f[l3], f[l7]*c, f[l7]*s, s, c + fmul st4, st0 + fmulp st3, st0 ; st : f[l7]*c, f[l7]*s, f[l3]*s, f[l3]*c + fsub st0, st2 ; st : t9, f[l7]*s, f[l3]*s, f[l3]*c + fstp _t9 + faddp st2, st0 ; st : f[l2]*s, t6 + fstp st0 ; st : t6 + fstp _t6 ; st : + +; c3 = C[3*jj]; +; s3 = S[3*jj]; + add eax, _jj + fld qword[edi+eax*8] + fld qword[esi+eax*8] ; st : s3, c3 + +; t7 = f[l4] * c3 + f[l8] * s3; +; t0 = f[l8] * c3 - f[l4] * s3; + mov edx, _l8 + fld qword[ebx+edx*8] + mov edx, _l4 + fld st0 + fmul st0, st2 + fxch st1 + fmul st0, st3 + fld qword[ebx+edx*8] ; st : f[l4], f[l8]*c, f[l8]*s, s, c + fmul st4, st0 + fmulp st3, st0 ; st : f[l8]*c, f[l8]*s, f[l4]*s, f[l4]*c + fsub st0, st2 ; st : t9, f[l8]*s, f[l4]*s, f[l4]*c + fstp _t0 + faddp st2, st0 ; st : f[l2]*s, t7 + fstp st0 ; st : t7 + fstp _t7 ; st : + +; t1 = f[l5] - t9; +; t2 = f[l5] + t9; + mov eax, _l5 + fld qword [ebx+eax*8] + fld _t9 + fld st0 + fadd st0, st2 + fstp _t2 + fsubp st1, st0 + fstp _t1 + +; t3 = - t8 - t0; + fld _t8 + fadd _t0 + fchs + fstp _t3 +; t4 = t5 - t7; + fld _t5 + fsub _t7 + fstp _t4 + +; f[l5] = t1 + t4; + fld _t1 + fld _t4 + fld st0 + fadd st0, st2 + fstp qword [ebx+eax*8] +; f[l7] = t1 - t4; + mov eax, _l7 + fsubp st1, st0 + fstp qword [ebx+eax*8] + +; f[l6] = t2 + t3; + mov eax, _l6 + fld _t2 + fld _t3 + fld st0 + fadd st0, st2 + fstp qword [ebx+eax*8] +; f[l8] = t2 - t3; + mov eax, _l8 + fsubp st1, st0 + fstp qword [ebx+eax*8] + +; t1 = f[l1] + t6; + mov eax, _l1 + fld qword [ebx+eax*8] + fld _t6 + fld st0 + fadd st0, st2 + fstp _t1 +; t2 = f[l1] - t6; + fsubp st1, st0 + fstp _t2 + +; t3 = t8 - t0; + fld _t8 + fsub _t0 + fstp _t3 +; t4 = t5 + t7; + fld _t5 + fadd _t7 + fstp _t4 + +; f[l1] = t1 + t4; + mov eax, _l1 + fld _t1 + fld _t4 + fld st0 + fadd st0, st2 + fstp qword [ebx+eax*8] +; f[l3] = t1 - t4; + mov eax, _l3 + fsubp st1, st0 + fstp qword [ebx+eax*8] + +; f[l2] = t2 + t3; + mov eax, _l2 + fld _t2 + fld _t3 + fld st0 + fadd st0, st2 + fstp qword [ebx+eax*8] +; f[l4] = t2 - t3; + mov eax, _l4 + fsubp st1, st0 + fstp qword [ebx+eax*8] + +; 374 : } + jmp .next_k + +.done_k: +; 375 : } + add ebx, _d6 ; d6 = d5*8 + cmp ebx, _end_of_array + jb .next_j + +; 376 : } + mov cx, _step + jmp .newstep +.done: + mov esp, ebp + pop ebp +; 377 : } + ret + + + ;=========== Step3 ends here =========== + + +; ================================================================= + +;================================================================= +; parameters: +; -- [ebp+8] = N +; -- [ebp+12] = p +; -- [ebp+16] = 4k-aligned data array address +; -- [ebp+20] = 4k-aligned SinCosTable address +; returns: +; -- nothing +; destroys: +; -- all GPRegs +;; ========================== + +align 4 + +FHT_4: + + push ebp + mov ebp, esp + mov edx, [ebp+16] + add edx, [ebp+12] + call BitInvert + push dword[ebp+16] + push dword[ebp+8] + call step1 + call step2 + pop edx ; N + pop ecx ; a + push dword[ebp+20] ; t + push ecx + push dword[ebp+12] ; p + push edx ; N + call step3 + mov esp, ebp + pop ebp + +ret diff --git a/programs/other/rtfread/trunk/rtfread.asm b/programs/other/rtfread/trunk/rtfread.asm index 9290858dbb..98a7bed706 100644 --- a/programs/other/rtfread/trunk/rtfread.asm +++ b/programs/other/rtfread/trunk/rtfread.asm @@ -69,7 +69,7 @@ include '../../../develop/libraries/box_lib/load_lib.mac' @use_library -;include 'debug.inc' +;include '../../debug.inc' if ~ RENDER eq PIX TOP=TOP+4