From 3f2315327d64cc067196e6fb1e1e9bdab34b795e Mon Sep 17 00:00:00 2001 From: "Artem Jerdev (art_zh)" Date: Tue, 16 Apr 2013 23:52:35 +0000 Subject: [PATCH] fht4code: a very old bug fixed in Step2 git-svn-id: svn://kolibrios.org@3474 a494cfbc-eb01-0410-851d-a64ba20cac60 --- programs/other/fft/fht4code.asm | 1683 ++++++++++++++++--------------- 1 file changed, 842 insertions(+), 841 deletions(-) diff --git a/programs/other/fft/fht4code.asm b/programs/other/fft/fht4code.asm index 746a8285ba..f66f5abdd5 100644 --- a/programs/other/fft/fht4code.asm +++ b/programs/other/fft/fht4code.asm @@ -1,190 +1,190 @@ -; Fast Hartley Transform routine -; Copyright (C) 1999, 2004, 2010 -; Artem Jerdev artem@jerdev.co.uk -; -; free KolibriOS version - not to be ported to other OSes -; ========================================================== - - -; global constants -align 8 -_r dq 1.41421356237309504880169 ; = sqrt(2) -_r2 dq 0.70710678118654752440084 ; = sqrt(2)/2 -_c1 dq 0.92387953251128675612818 ; = cos(pi/8) -_s1 dq 0.38268343236508977172846 ; = sin(pi/8) - -;================================================================= -; parameter1: -; -- reg dl (bits[3:0]) = Power_of_4 -; returns: -; -- reg edx = _CosTable address (4k-aligned) -; assumes: _SinTable = _CosTable + (N/2)*8 -; user heap has to be initialized -; destroys: -; -- eax, ebx, ecx -;; ========================== -align 4 -CreateSinCosTable: - xor eax, eax - inc eax - mov cl, dl - and cl, 15 - shl eax, cl - shl eax, cl - mov ecx, eax ; now ecx = N - shl ecx, 3 - mov ebx, 12 - mov eax, 68 - int 0x40 ; getmem(N*sizeof(double)) - - mov edx, eax ; edx = _CosTable - shr ecx, 1 - mov ebx, eax - add ebx, ecx ; ebx = _SinTable - shr ecx, 3 - push ecx ; [esp] = ecx = N/2 - - xor eax, eax - fldpi - fidiv dword[esp] ; st : dx = 2*pi/N - pop ecx - fldz ; st : 0, dx -.loop: - fld st0 ; st : x, x, dx - FSINCOS ; st : cos, sin, x, dx - fstp qword [edx+eax*8] ; st : sin, x, dx - fstp qword [ebx+eax*8] ; st : x, dx - fadd st0, st1 ; st : x+dx, dx - - inc eax - cmp eax, ecx - jne .loop - fstp st0 ; st : dx - fstp st0 ; st : -ret - -;================================================================= -; parameter1: -; -- reg edx = _CosTable address -; destroys: -; -- eax, ebx, ecx -;; ========================== -align 4 -DestroySinCosTable: - mov ecx, edx - mov ebx, 13 - mov eax, 68 - int 0x40 ; free(SinCosTable) -ret - -;================================================================= -; parameter1: -; -- reg dl (bits[3:0]) = Power_of_4 -; -- reg edx && (-16) = 4k-aligned data array address -; returns: -; -- edx = Power_of_4 -; -- ecx = N -; destroys: -; -- eax, ebx, ecx, edx, esi -;; ========================== -align 4 -BitInvert: - mov esi, edx - and esi, 0xFFFFFFF0 - and edx, 0x0F - push edx - mov cl, dl - xor eax, eax - inc eax - shl eax, cl - shl eax, cl - push eax - xor ecx, ecx ; index term -.newterm: - inc ecx - cmp ecx, [esp] ; N - jge .done - - xor eax, eax - mov edx, ecx - xor bl, bl - -.do_invert: - inc bl - cmp bl, byte[esp+4] ; Power_of_4 - jg .switch - - mov bh, dl - and bh, 3 - shl eax, 2 - or al, bh - shr edx, 2 - jmp .do_invert - -.switch: - cmp eax, ecx - jle .newterm - - fld qword [esi+eax*8] - fld qword [esi+ecx*8] - fstp qword [esi+eax*8] - fstp qword [esi+ecx*8] - jmp .newterm - -.done: - pop ecx - pop edx - ret - -;================================================================= - - -;================================================================= -; stdcall parameters: -; -- [esp+4] = N -; -- [esp+8] = 4k-aligned data array address -; returns: -; -- nothing -; destroys: -; -- ebx, esi -;; ========================== -align 4 -step1: - mov ebx, [esp+8] - mov esi, [esp+4] - shl esi, 3 - add esi, ebx - -.loop: - fld qword[ebx] - fld qword[ebx+8] - fld st1 - fsub st0, st1 ; st : t2, f[i+1], f[i] - fxch st1 ; st : f[i+1], t2, f[i] - faddp st2, st0 ; st : t2, t1 - fld qword[ebx+16] - fld qword[ebx+24] - fld st1 ; st : f[i+2], f[i+3], f[i+2], t2, t1 - fadd st0, st1 ; st : t3, f[i+3], f[i+2], t2, t1 - fxch st2 ; st : f[i+2], f[i+3], t3, t2, t1 - fsub st0, st1 ; st : t4, f[i+3], t3, t2, t1 - fstp st1 ; st : t4, t3, t2, t1 - fld st2 ; st : t2, t4, t3, t2, t1 - fadd st0, st1 ; st : t2+t4, t4, t3, t2, t1 - fstp qword[ebx+16] ; st : t4, t3, t2, t1 - fsubp st2, st0 ; st : t3, t2-t4, t1 - fld st2 ; st : t1, t3, t2-t4, t1 - fadd st0, st1 ; st : t1+t3, t3, t2-t4, t1 - fstp qword[ebx] ; st : t3, t2-t4, t1 - fsubp st2, st0 ; st : t2-t4, t1-t3 - fstp qword[ebx+24] ; st : t1-t3 - fstp qword[ebx+8] ; st : - - add ebx, 32 - cmp ebx, esi - jnz .loop +; Fast Hartley Transform routine +; Copyright (C) 1999, 2004, 2010 +; Artem Jerdev artem@jerdev.co.uk +; +; free KolibriOS version - not to be ported to other OSes +; ========================================================== + + +; global constants +align 8 +_r dq 1.41421356237309504880169 ; = sqrt(2) +_r2 dq 0.70710678118654752440084 ; = sqrt(2)/2 +_c1 dq 0.92387953251128675612818 ; = cos(pi/8) +_s1 dq 0.38268343236508977172846 ; = sin(pi/8) + +;================================================================= +; parameter1: +; -- reg dl (bits[3:0]) = Power_of_4 +; returns: +; -- reg edx = _CosTable address (4k-aligned) +; assumes: _SinTable = _CosTable + (N/2)*8 +; user heap has to be initialized +; destroys: +; -- eax, ebx, ecx +;; ========================== +align 4 +CreateSinCosTable: + xor eax, eax + inc eax + mov cl, dl + and cl, 15 + shl eax, cl + shl eax, cl + mov ecx, eax ; now ecx = N + shl ecx, 3 + mov ebx, 12 + mov eax, 68 + int 0x40 ; getmem(N*sizeof(double)) + + mov edx, eax ; edx = _CosTable + shr ecx, 1 + mov ebx, eax + add ebx, ecx ; ebx = _SinTable + shr ecx, 3 + push ecx ; [esp] = ecx = N/2 + + xor eax, eax + fldpi + fidiv dword[esp] ; st : dx = 2*pi/N + pop ecx + fldz ; st : 0, dx +.loop: + fld st0 ; st : x, x, dx + FSINCOS ; st : cos, sin, x, dx + fstp qword [edx+eax*8] ; st : sin, x, dx + fstp qword [ebx+eax*8] ; st : x, dx + fadd st0, st1 ; st : x+dx, dx + + inc eax + cmp eax, ecx + jne .loop + fstp st0 ; st : dx + fstp st0 ; st : ret - + +;================================================================= +; parameter1: +; -- reg edx = _CosTable address +; destroys: +; -- eax, ebx, ecx +;; ========================== +align 4 +DestroySinCosTable: + mov ecx, edx + mov ebx, 13 + mov eax, 68 + int 0x40 ; free(SinCosTable) +ret + +;================================================================= +; parameter1: +; -- reg dl (bits[3:0]) = Power_of_4 +; -- reg edx && (-16) = 4k-aligned data array address +; returns: +; -- edx = Power_of_4 +; -- ecx = N +; destroys: +; -- eax, ebx, ecx, edx, esi +;; ========================== +align 4 +BitInvert: + mov esi, edx + and esi, 0xFFFFFFF0 + and edx, 0x0F + push edx + mov cl, dl + xor eax, eax + inc eax + shl eax, cl + shl eax, cl + push eax + xor ecx, ecx ; index term +.newterm: + inc ecx + cmp ecx, [esp] ; N + jge .done + + xor eax, eax + mov edx, ecx + xor bl, bl + +.do_invert: + inc bl + cmp bl, byte[esp+4] ; Power_of_4 + jg .switch + + mov bh, dl + and bh, 3 + shl eax, 2 + or al, bh + shr edx, 2 + jmp .do_invert + +.switch: + cmp eax, ecx + jle .newterm + + fld qword [esi+eax*8] + fld qword [esi+ecx*8] + fstp qword [esi+eax*8] + fstp qword [esi+ecx*8] + jmp .newterm + +.done: + pop ecx + pop edx + ret + +;================================================================= + + +;================================================================= +; stdcall parameters: +; -- [esp+4] = N +; -- [esp+8] = 4k-aligned data array address +; returns: +; -- nothing +; destroys: +; -- ebx, esi +;; ========================== +align 4 +step1: + mov ebx, [esp+8] + mov esi, [esp+4] + shl esi, 3 + add esi, ebx + +.loop: + fld qword[ebx] + fld qword[ebx+8] + fld st1 + fsub st0, st1 ; st : t2, f[i+1], f[i] + fxch st1 ; st : f[i+1], t2, f[i] + faddp st2, st0 ; st : t2, t1 + fld qword[ebx+16] + fld qword[ebx+24] + fld st1 ; st : f[i+2], f[i+3], f[i+2], t2, t1 + fadd st0, st1 ; st : t3, f[i+3], f[i+2], t2, t1 + fxch st2 ; st : f[i+2], f[i+3], t3, t2, t1 + fsub st0, st1 ; st : t4, f[i+3], t3, t2, t1 + fstp st1 ; st : t4, t3, t2, t1 + fld st2 ; st : t2, t4, t3, t2, t1 + fadd st0, st1 ; st : t2+t4, t4, t3, t2, t1 + fstp qword[ebx+16] ; st : t4, t3, t2, t1 + fsubp st2, st0 ; st : t3, t2-t4, t1 + fld st2 ; st : t1, t3, t2-t4, t1 + fadd st0, st1 ; st : t1+t3, t3, t2-t4, t1 + fstp qword[ebx] ; st : t3, t2-t4, t1 + fsubp st2, st0 ; st : t2-t4, t1-t3 + fstp qword[ebx+24] ; st : t1-t3 + fstp qword[ebx+8] ; st : + + add ebx, 32 + cmp ebx, esi + jnz .loop +ret + ;================================================================= ; SSE3 version: Step1 ; @@ -223,658 +223,659 @@ step1_sse: cmp ebx, esi jnz .loop ret - -; local stack definitions -;=========================================================================== -_t0 equ dword [esp] -_t1 equ dword[esp+4] -_t2 equ dword[esp+8] -_t3 equ dword[esp+12] -_t4 equ dword[esp+16] -_t5 equ dword[esp+20] -_t6 equ dword[esp+24] -_t7 equ dword[esp+28] -_t8 equ dword[esp+32] -_t9 equ dword[esp+36] - -_l1 equ dword[esp+40] -_l2 equ dword[esp+44] -_l3 equ dword[esp+48] -_l4 equ dword[esp+52] -_l5 equ dword[esp+56] -_l6 equ dword[esp+60] -_l7 equ dword[esp+64] -_l8 equ dword[esp+68] -_l9 equ dword[esp+72] -_l0 equ dword[esp+76] -_d1 equ dword[esp+80] -_d2 equ dword[esp+84] -_d3 equ dword[esp+88] -_d4 equ dword[esp+92] -_d5 equ dword[esp+96] -_d6 equ dword[esp+100] -_j5 equ dword[esp+104] -_jj equ dword[esp+108] -_end_of_array equ dword[esp+112] -_step equ word [esp+116] - - -;================================================================= -; cdecl parameters: -; -- [ebp+8] = N -; -- [ebp+12] = 4k-aligned data array address -; returns: -; -- nothing -; destroys: -; -- eax, ebx -; locals: -; -- 10 stack-located dwords (_t0 ... _t9) -;; ========================== -align 4 -step2: - push ebp - mov ebp, esp - sub esp, 40 - mov ebx, [ebp+12] - mov eax, [ebp+ 8] - shl eax, 3 - add eax, ebx - -.loop_i: - -; -- quad subelements +0, +4, +8 and +12 (simpliest operations) - fld qword[ebx] - fld qword[ebx+8*4] - fld st0 - fadd st0, st2 ; st : t1, f_4, f_0 - fxch st1 - fsubp st2, st0 ; st : t1, t2 - fld qword[ebx+8*8] - fld qword[ebx+8*12] - fld st0 - fadd st0, st2 ; st : t3, f_12, t1, t2 - fxch st1 - fsubp st2, st0 ; st : t3, t4, t1, t2 - ; ------ - fld st2 ; st : t1, t3, t4, t1, t2 - fadd st0, st1 - fstp qword[ebx] ; st : t3, t4, t1, t2 - fsub st0, st2 ; st : t3-t1, t4, t1, t2 - fchs ; st : t1-t3, t4, t1, t2 - fstp qword[ebx+8*4] ; st : t4, t1, t2 - fst st1 ; st : t4, t4, t2 - fadd st0, st2 ; st : t2+t4, t4, t2 - fstp qword[ebx+8*8] ; st : t4, t2 - fsubp st1, st0 ; st : t2-t4 - fstp qword[ebx+8*12] ; st : - -; -- even subelements +2, +6, +10 and +14 (2 multiplications needed) - fld qword[ebx+8*2] - fld qword[ebx+8*6] - fld [_r] - fmul st1, st0 ; st : r, t2, t1 - fld qword[ebx+8*10] - fxch st1 ; st : r, t3, t2, t1 - fmul qword[ebx+8*14] ; st : t4, t3, t2, t1 - ; ------ - fld st3 ; st : t1, t4, t3, t2, t1 - fadd st0, st3 ; - fadd st0, st2 ; - fst qword[ebx+8*2] ; store f[i+8] = t1+t2+t3 - fsub st0, st3 ; - fsub st0, st3 ; - fstp qword[ebx+8*10] ; store f[i+10]= t1-t2+t3 - fld st3 ; st : t1, t4, t3, t2, t1 - fsub st0, st2 ; - fsub st0, st1 ; - fst qword[ebx+8*14] ; store f[i+14]= t1-t3-t4 - fadd st0, st1 ; - faddp st1, st0 ; st : t1-t3+t4, t3, t2, t1 - fstp qword[ebx+8*6] ; store f[i+6] - fstp st0 ; st : t2, t1 - fstp st0 ; st : t1 - fstp st0 ; st : - -; -- odd subelements - fld qword[ebx+8*9] - fld qword[ebx+8*11] - fld st1 - fsub st0, st1 - fxch st1 - faddp st2, st0 ; st : (f[l3]-f[l7]), (f[l3]+f[l7]) - fld [_r2] - fmul st2, st0 - fmulp st1, st0 ; st : t9, t6 - fld qword[ebx+8*3] - fld st0 - fadd st0, st2 ; st : t1, f[l5], t9, t6 - fstp _t1 - fsub st0, st1 - fstp _t2 - fstp _t9 ; (t9 never used) - fstp _t6 ; st : - - fld [_c1] - fld [_s1] - fld qword[ebx+8*5] - fld qword[ebx+8*7] - fld st3 ; st: c1, f[l6], f[l2], s1, c1 - fmul st0, st2 ; st: f_2*c, f_6, f_2, s, c - fld st1 ; st: f_6, f_2*c, f_6, f_2, s, c - fmul st0, st4 ; st: f_6*s, f_2*c, f_6, f_2, s, c - faddp st1, st0 ; st: t5, f_6, f_2, s, c - fstp _t5 ; st: f_6, f_2, s, c - fld st3 ; st: c, f_6, f_2, s, c - fmul st0, st1 - fld st3 - fmul st0, st3 ; st: f_2*s, f_6*c, f_6, f_2, s, c - fsubp st1, st0 ; st: t8, f_6, f_2, s, c - fstp _t8 ; st: f_6, f_2, s, c - fstp st0 ; st: f_2, s, c - fstp st0 ; st: s, c - - fld qword[ebx+8*13] - fld qword[ebx+8*15] - fld st3 ; st: c1, f[l8], f[l4], s1, c1 - fmul st0, st1 - fld st3 - fmul st0, st3 ; st: f_4*s, f_8*c, f_8, f_4, s, c - faddp st1, st0 ; st: t7, f_8, f_4, s, c - fld _t5 ; st: t5, t7, f_8, f_4, s, c - fsub st0, st1 ; st: t4, t7, f_8, f_4, s, c - fstp _t4 - fstp _t7 ; st: f_8, f_4, s, c - fld st3 ; st: c, f_8, f_4, s, c - fmul st0, st2 - fld st3 - fmul st0, st2 ; st: f_8*s, f_4*c, f_8, f_4, s, c - fsubp st1, st0 ; st:-t0, f_8, f_4, s, c - fchs - fld _t8 - fchs ; st:-t8, t0, f_8, f_4, s, c - fsub st0, st1 ; st: t3, t0, f_8, f_4, s, c - fstp _t3 - fstp _t0 ; st: f_8, f_4, s, c - fstp st0 ; st: f_4, s, c - fstp st0 ; st: s, c - fstp st0 ; st: c - fstp st0 ; st: - - fld _t1 - fld _t4 - fld st1 - fsub st0, st1 - fstp qword[ebx+8*11] ; f[l7] = t1-t4 - faddp st1, st0 - fstp qword[ebx+8*3] ; f[l5] = t1+t4 - fld _t2 - fld _t3 - fld st1 - fsub st0, st1 - fstp qword[ebx+8*15] ; f[l8] - faddp st1, st0 - fstp qword[ebx+8*7] ; f[l6] - - fld _t6 - fld qword[ebx+8] - fld st1 - fsub st0, st1 - fxch st1 - faddp st2, st0 ; st : t2, t1 - fld _t8 - fsub _t0 - fld _t5 - fadd _t7 ; st : t4, t3, t2, t1 - - fld st3 - fsub st0, st1 - fstp qword[ebx+8*9] ; f[l3] = t1-t4 - fadd st0, st3 - fstp qword[ebx+8] ; f[l1] = t1+t4 - fld st1 ; st : t2, t3, t2, t1 - fsub st0, st1 ; f[l4] = t2-t3 - fstp qword[ebx+8*13] ; st : t3, t2, t1 - faddp st1, st0 ; st : t2+t3, t1 - fstp qword[ebx+8*5] ; f[l2] = t2+t3 - fstp st0 ; st : - - add ebx, 16*8 - cmp ebx, eax - jb .loop_i - - mov esp, ebp - pop ebp -ret - - - - -;================================================================= -; cdecl parameters: -; -- [ebp+8] = N -; -- [ebp+12] = p -; -- [ebp+16] = 4k-aligned data array address -; -- [ebp+20] = 4k-aligned SinCosTable address -; returns: -; -- nothing -; destroys: -; -- all GPRegs -; locals: -; -- 120 stack-located dwords (_t0 ... _t9, _l0..._step) -;; ========================== -align 4 -step3: - push ebp - mov ebp, esp - sub esp, 120 -; 283 : { - - -; 293 : for (l=3; l<=p; l++) - mov cx, 0x0200 -.newstep: - inc ch - cmp ch, byte[ebp+12] - jg .done - mov _step, cx - -; 294 : { -; 295 : d1 = 1 << (l + l - 3); - - mov cl, ch - add cl, cl - sub cl, 3 - mov edx, 1 - shl edx, cl - mov _d1, edx - -; 296 : d2 = d1 << 1; - shl edx, 1 - mov _d2, edx - mov eax, edx - -; 297 : d3 = d2 << 1; - shl edx, 1 - mov _d3, edx - -; 298 : d4 = d2 + d3; - add eax, edx - mov _d4, eax - -; 299 : d5 = d3 << 1; - shl edx, 1 - mov _d5, edx - shl edx, 3 - mov _d6, edx ; d6 = d5*8 to simplify index operations - -; 339 : j5 = N / d5; ; moved out of internal loop - mov cl, [ebp+12] - sub cl, ch - add cl, cl - mov edx, 1 - shl edx, cl - mov _j5, edx - -; 300 : -; 301 : for (j=0; j f[j+k] -; l2 = l1 + d2; - add edx, eax - mov _l2, edx -; l3 = l1 + d3; - add edx, eax - mov _l3, edx -; l4 = l1 + d4; - add edx, eax - mov _l4, edx - -; l5 = j + d2 - k; - mov edx, eax - sub edx, ecx - mov _l5, edx -; l6 = l5 + d2; - add edx, eax - mov _l6, edx -; l7 = l5 + d3; - add edx, eax - mov _l7, edx -; l8 = l5 + d4; - add edx, eax - mov _l8, edx - - -; 340 : j5 *= k; // add-substituted multiplication - mov eax, _jj - add eax, _j5 - mov _jj, eax - -; c1 = C[jj]; -; s1 = S[jj]; - mov edi, [ebp+20] - fld qword[edi+eax*8] - mov esi, [ebp+8] - shl esi, 2 - add esi, edi - fld qword[esi+eax*8] ; st : s1, c1 - -; t5 = f[l2] * c1 + f[l6] * s1; -; t8 = f[l6] * c1 - f[l2] * s1; - mov edx, _l6 - fld qword[ebx+edx*8] - mov edx, _l2 - fld st0 - fmul st0, st2 - fxch st1 - fmul st0, st3 - fld qword[ebx+edx*8] ; st : f[l2], f[l6]*c, f[l6]*s, s, c - fmul st4, st0 - fmulp st3, st0 ; st : f[l6]*c, f[l6]*s, f[l2]*s, f[l2]*c - fsub st0, st2 ; st : t8, f[l6]*s, f[l2]*s, f[l2]*c - fstp _t8 - faddp st2, st0 ; st : f[l2]*s, t5 - fstp st0 ; st : t5 - fstp _t5 ; st : - -; c2 = C[2*jj]; -; s2 = S[2*jj]; - shl eax, 1 - fld qword[edi+eax*8] - fld qword[esi+eax*8] ; st : s2, c2 - -; t6 = f[l3] * c2 + f[l7] * s2; -; t9 = f[l7] * c2 - f[l3] * s2; - mov edx, _l7 - fld qword[ebx+edx*8] - mov edx, _l3 - fld st0 - fmul st0, st2 - fxch st1 - fmul st0, st3 - fld qword[ebx+edx*8] ; st : f[l3], f[l7]*c, f[l7]*s, s, c - fmul st4, st0 - fmulp st3, st0 ; st : f[l7]*c, f[l7]*s, f[l3]*s, f[l3]*c - fsub st0, st2 ; st : t9, f[l7]*s, f[l3]*s, f[l3]*c - fstp _t9 - faddp st2, st0 ; st : f[l2]*s, t6 - fstp st0 ; st : t6 - fstp _t6 ; st : - -; c3 = C[3*jj]; -; s3 = S[3*jj]; - add eax, _jj - fld qword[edi+eax*8] - fld qword[esi+eax*8] ; st : s3, c3 - -; t7 = f[l4] * c3 + f[l8] * s3; -; t0 = f[l8] * c3 - f[l4] * s3; - mov edx, _l8 - fld qword[ebx+edx*8] - mov edx, _l4 - fld st0 - fmul st0, st2 - fxch st1 - fmul st0, st3 - fld qword[ebx+edx*8] ; st : f[l4], f[l8]*c, f[l8]*s, s, c - fmul st4, st0 - fmulp st3, st0 ; st : f[l8]*c, f[l8]*s, f[l4]*s, f[l4]*c - fsub st0, st2 ; st : t9, f[l8]*s, f[l4]*s, f[l4]*c - fstp _t0 - faddp st2, st0 ; st : f[l2]*s, t7 - fstp st0 ; st : t7 - fstp _t7 ; st : - -; t1 = f[l5] - t9; -; t2 = f[l5] + t9; - mov eax, _l5 - fld qword [ebx+eax*8] - fld _t9 - fld st0 - fadd st0, st2 - fstp _t2 - fsubp st1, st0 - fstp _t1 - -; t3 = - t8 - t0; - fld _t8 - fadd _t0 - fchs - fstp _t3 -; t4 = t5 - t7; - fld _t5 - fsub _t7 - fstp _t4 - -; f[l5] = t1 + t4; - fld _t1 - fld _t4 - fld st0 - fadd st0, st2 - fstp qword [ebx+eax*8] -; f[l7] = t1 - t4; - mov eax, _l7 - fsubp st1, st0 - fstp qword [ebx+eax*8] - -; f[l6] = t2 + t3; - mov eax, _l6 - fld _t2 - fld _t3 - fld st0 - fadd st0, st2 - fstp qword [ebx+eax*8] -; f[l8] = t2 - t3; - mov eax, _l8 - fsubp st1, st0 - fstp qword [ebx+eax*8] - -; t1 = f[l1] + t6; - mov eax, _l1 - fld qword [ebx+eax*8] - fld _t6 - fld st0 - fadd st0, st2 - fstp _t1 -; t2 = f[l1] - t6; - fsubp st1, st0 - fstp _t2 - -; t3 = t8 - t0; - fld _t8 - fsub _t0 - fstp _t3 -; t4 = t5 + t7; - fld _t5 - fadd _t7 - fstp _t4 - -; f[l1] = t1 + t4; - mov eax, _l1 - fld _t1 - fld _t4 - fld st0 - fadd st0, st2 - fstp qword [ebx+eax*8] -; f[l3] = t1 - t4; - mov eax, _l3 - fsubp st1, st0 - fstp qword [ebx+eax*8] - -; f[l2] = t2 + t3; - mov eax, _l2 - fld _t2 - fld _t3 - fld st0 - fadd st0, st2 - fstp qword [ebx+eax*8] -; f[l4] = t2 - t3; - mov eax, _l4 - fsubp st1, st0 - fstp qword [ebx+eax*8] - -; 374 : } - jmp .next_k - -.done_k: -; 375 : } - add ebx, _d6 ; d6 = d5*8 - cmp ebx, _end_of_array - jb .next_j - -; 376 : } - mov cx, _step - jmp .newstep -.done: - mov esp, ebp - pop ebp -; 377 : } - ret - - - ;=========== Step3 ends here =========== - - -; ================================================================= - -;================================================================= -; parameters: -; -- [ebp+8] = N -; -- [ebp+12] = p -; -- [ebp+16] = 4k-aligned data array address -; -- [ebp+20] = 4k-aligned SinCosTable address -; returns: -; -- nothing -; destroys: -; -- all GPRegs -;; ========================== - -align 4 - -FHT_4: - - push ebp - mov ebp, esp - mov edx, [ebp+16] - add edx, [ebp+12] - call BitInvert - push dword[ebp+16] - push dword[ebp+8] - call step1 - call step2 - pop edx ; N - pop ecx ; a - push dword[ebp+20] ; t - push ecx - push dword[ebp+12] ; p - push edx ; N - call step3 - mov esp, ebp - pop ebp - -ret + +; local stack definitions +;=========================================================================== +_t0 equ dword [esp] +_t1 equ dword[esp+4] +_t2 equ dword[esp+8] +_t3 equ dword[esp+12] +_t4 equ dword[esp+16] +_t5 equ dword[esp+20] +_t6 equ dword[esp+24] +_t7 equ dword[esp+28] +_t8 equ dword[esp+32] +_t9 equ dword[esp+36] + +_l1 equ dword[esp+40] +_l2 equ dword[esp+44] +_l3 equ dword[esp+48] +_l4 equ dword[esp+52] +_l5 equ dword[esp+56] +_l6 equ dword[esp+60] +_l7 equ dword[esp+64] +_l8 equ dword[esp+68] +_l9 equ dword[esp+72] +_l0 equ dword[esp+76] +_d1 equ dword[esp+80] +_d2 equ dword[esp+84] +_d3 equ dword[esp+88] +_d4 equ dword[esp+92] +_d5 equ dword[esp+96] +_d6 equ dword[esp+100] +_j5 equ dword[esp+104] +_jj equ dword[esp+108] +_end_of_array equ dword[esp+112] +_step equ word [esp+116] + + +;================================================================= +; cdecl parameters: +; -- [ebp+8] = N +; -- [ebp+12] = 4k-aligned data array address +; returns: +; -- nothing +; destroys: +; -- eax, ebx +; locals: +; -- 10 stack-located dwords (_t0 ... _t9) +;; ========================== +align 4 +step2: + push ebp + mov ebp, esp + sub esp, 40 + mov ebx, [ebp+12] + mov eax, [ebp+ 8] + shl eax, 3 + add eax, ebx + +.loop_i: + +; -- quad subelements +0, +4, +8 and +12 (simpliest operations) + fld qword[ebx] + fld qword[ebx+8*4] + fld st0 + fadd st0, st2 ; st : t1, f_4, f_0 + fxch st1 + fsubp st2, st0 ; st : t1, t2 + fld qword[ebx+8*8] + fld qword[ebx+8*12] + fld st0 + fadd st0, st2 ; st : t3, f_12, t1, t2 + fxch st1 + fsubp st2, st0 ; st : t3, t4, t1, t2 + ; ------ + fld st2 ; st : t1, t3, t4, t1, t2 + fadd st0, st1 + fstp qword[ebx] ; st : t3, t4, t1, t2 + fsub st0, st2 ; st : t3-t1, t4, t1, t2 + fchs ; st : t1-t3, t4, t1, t2 + fstp qword[ebx+8*4] ; st : t4, t1, t2 + fst st1 ; st : t4, t4, t2 + fadd st0, st2 ; st : t2+t4, t4, t2 + fstp qword[ebx+8*8] ; st : t4, t2 + fsubp st1, st0 ; st : t2-t4 + fstp qword[ebx+8*12] ; st : + +; -- even subelements +2, +6, +10 and +14 (2 multiplications needed) + fld qword[ebx+8*2] + fld qword[ebx+8*6] + fld [_r] + fmul st1, st0 ; st : r, t2, t1 + fld qword[ebx+8*10] + fxch st1 ; st : r, t3, t2, t1 + fmul qword[ebx+8*14] ; st : t4, t3, t2, t1 + ; ------ + fld st3 ; st : t1, t4, t3, t2, t1 + fadd st0, st3 ; + fadd st0, st2 ; + fst qword[ebx+8*2] ; store f[i+8] = t1+t2+t3 + fsub st0, st3 ; + fsub st0, st3 ; + fstp qword[ebx+8*10] ; store f[i+10]= t1-t2+t3 + fld st3 ; st : t1, t4, t3, t2, t1 + fsub st0, st2 ; + fsub st0, st1 ; + fst qword[ebx+8*14] ; store f[i+14]= t1-t3-t4 + fadd st0, st1 ; + faddp st1, st0 ; st : t1-t3+t4, t3, t2, t1 + fstp qword[ebx+8*6] ; store f[i+6] + fstp st0 ; st : t2, t1 + fstp st0 ; st : t1 + fstp st0 ; st : + +; -- odd subelements + fld qword[ebx+8*9] + fld qword[ebx+8*11] + fld st1 + fsub st0, st1 + fxch st1 + faddp st2, st0 ; st : (f[l3]-f[l7]), (f[l3]+f[l7]) + fld [_r2] + fmul st2, st0 + fmulp st1, st0 ; st : t9, t6 + fld qword[ebx+8*3] + fld st0 + fadd st0, st2 ; st : t1, f[l5], t9, t6 + fstp _t1 + fsub st0, st1 + fstp _t2 + fstp _t9 ; (t9 never used) + fstp _t6 ; st : + + fld [_c1] + fld [_s1] + fld qword[ebx+8*5] + fld qword[ebx+8*7] + fld st3 ; st: c1, f[l6], f[l2], s1, c1 + fmul st0, st2 ; st: f_2*c, f_6, f_2, s, c + fld st1 ; st: f_6, f_2*c, f_6, f_2, s, c + fmul st0, st4 ; st: f_6*s, f_2*c, f_6, f_2, s, c + faddp st1, st0 ; st: t5, f_6, f_2, s, c + fstp _t5 ; st: f_6, f_2, s, c + fld st3 ; st: c, f_6, f_2, s, c + fmul st0, st1 + fld st3 + fmul st0, st3 ; st: f_2*s, f_6*c, f_6, f_2, s, c + fsubp st1, st0 ; st: t8, f_6, f_2, s, c + fstp _t8 ; st: f_6, f_2, s, c + fstp st0 ; st: f_2, s, c + fstp st0 ; st: s, c + + fld qword[ebx+8*13] + fld qword[ebx+8*15] + fld st3 ; st: c1, f[l8], f[l4], s1, c1 + fmul st0, st1 + fld st3 + fmul st0, st3 ; st: f_4*s, f_8*c, f_8, f_4, s, c + faddp st1, st0 ; st: t7, f_8, f_4, s, c + fld _t5 ; st: t5, t7, f_8, f_4, s, c + fsub st0, st1 ; st: t4, t7, f_8, f_4, s, c + fstp _t4 + fstp _t7 ; st: f_8, f_4, s, c + fld st3 ; st: c, f_8, f_4, s, c + fmul st0, st2 + fld st3 + fmul st0, st2 ; st: f_8*s, f_4*c, f_8, f_4, s, c + fsubp st1, st0 ; st:-t0, f_8, f_4, s, c + fchs + fld _t8 + fchs ; st:-t8, t0, f_8, f_4, s, c + fsub st0, st1 ; st: t3, t0, f_8, f_4, s, c + fstp _t3 + fstp _t0 ; st: f_8, f_4, s, c + fstp st0 ; st: f_4, s, c + fstp st0 ; st: s, c + fstp st0 ; st: c + fstp st0 ; st: + + fld _t1 + fld _t4 + fld st1 + fsub st0, st1 + fstp qword[ebx+8*11] ; f[l7] = t1-t4 + faddp st1, st0 + fstp qword[ebx+8*3] ; f[l5] = t1+t4 + fld _t2 + fld _t3 + fld st1 + fsub st0, st1 + fstp qword[ebx+8*15] ; f[l8] + faddp st1, st0 + fstp qword[ebx+8*7] ; f[l6] + + fld _t6 + fld qword[ebx+8] + fld st1 + fld st1 + faddp st3, st0 + fsubp st1, st0 ; st : t2, t1 + + fld _t8 + fsub _t0 + fld _t5 + fadd _t7 ; st : t4, t3, t2, t1 + + fld st3 + fsub st0, st1 + fstp qword[ebx+8*9] ; f[l3] = t1-t4 + fadd st0, st3 + fstp qword[ebx+8] ; f[l1] = t1+t4 + fld st1 ; st : t2, t3, t2, t1 + fsub st0, st1 ; f[l4] = t2-t3 + fstp qword[ebx+8*13] ; st : t3, t2, t1 + faddp st1, st0 ; st : t2+t3, t1 + fstp qword[ebx+8*5] ; f[l2] = t2+t3 + fstp st0 ; st : + + add ebx, 16*8 + cmp ebx, eax + jb .loop_i + + mov esp, ebp + pop ebp +ret + + + + +;================================================================= +; cdecl parameters: +; -- [ebp+8] = N +; -- [ebp+12] = p +; -- [ebp+16] = 4k-aligned data array address +; -- [ebp+20] = 4k-aligned SinCosTable address +; returns: +; -- nothing +; destroys: +; -- all GPRegs +; locals: +; -- 120 stack-located dwords (_t0 ... _t9, _l0..._step) +;; ========================== +align 4 +step3: + push ebp + mov ebp, esp + sub esp, 120 +; 283 : { + + +; 293 : for (l=3; l<=p; l++) + mov cx, 0x0200 +.newstep: + inc ch + cmp ch, byte[ebp+12] + jg .done + mov _step, cx + +; 294 : { +; 295 : d1 = 1 << (l + l - 3); + + mov cl, ch + add cl, cl + sub cl, 3 + mov edx, 1 + shl edx, cl + mov _d1, edx + +; 296 : d2 = d1 << 1; + shl edx, 1 + mov _d2, edx + mov eax, edx + +; 297 : d3 = d2 << 1; + shl edx, 1 + mov _d3, edx + +; 298 : d4 = d2 + d3; + add eax, edx + mov _d4, eax + +; 299 : d5 = d3 << 1; + shl edx, 1 + mov _d5, edx + shl edx, 3 + mov _d6, edx ; d6 = d5*8 to simplify index operations + +; 339 : j5 = N / d5; ; moved out of internal loop + mov cl, [ebp+12] + sub cl, ch + add cl, cl + mov edx, 1 + shl edx, cl + mov _j5, edx + +; 300 : +; 301 : for (j=0; j f[j+k] +; l2 = l1 + d2; + add edx, eax + mov _l2, edx +; l3 = l1 + d3; + add edx, eax + mov _l3, edx +; l4 = l1 + d4; + add edx, eax + mov _l4, edx + +; l5 = j + d2 - k; + mov edx, eax + sub edx, ecx + mov _l5, edx +; l6 = l5 + d2; + add edx, eax + mov _l6, edx +; l7 = l5 + d3; + add edx, eax + mov _l7, edx +; l8 = l5 + d4; + add edx, eax + mov _l8, edx + + +; 340 : j5 *= k; // add-substituted multiplication + mov eax, _jj + add eax, _j5 + mov _jj, eax + +; c1 = C[jj]; +; s1 = S[jj]; + mov edi, [ebp+20] + fld qword[edi+eax*8] + mov esi, [ebp+8] + shl esi, 2 + add esi, edi + fld qword[esi+eax*8] ; st : s1, c1 + +; t5 = f[l2] * c1 + f[l6] * s1; +; t8 = f[l6] * c1 - f[l2] * s1; + mov edx, _l6 + fld qword[ebx+edx*8] + mov edx, _l2 + fld st0 + fmul st0, st2 + fxch st1 + fmul st0, st3 + fld qword[ebx+edx*8] ; st : f[l2], f[l6]*c, f[l6]*s, s, c + fmul st4, st0 + fmulp st3, st0 ; st : f[l6]*c, f[l6]*s, f[l2]*s, f[l2]*c + fsub st0, st2 ; st : t8, f[l6]*s, f[l2]*s, f[l2]*c + fstp _t8 + faddp st2, st0 ; st : f[l2]*s, t5 + fstp st0 ; st : t5 + fstp _t5 ; st : + +; c2 = C[2*jj]; +; s2 = S[2*jj]; + shl eax, 1 + fld qword[edi+eax*8] + fld qword[esi+eax*8] ; st : s2, c2 + +; t6 = f[l3] * c2 + f[l7] * s2; +; t9 = f[l7] * c2 - f[l3] * s2; + mov edx, _l7 + fld qword[ebx+edx*8] + mov edx, _l3 + fld st0 + fmul st0, st2 + fxch st1 + fmul st0, st3 + fld qword[ebx+edx*8] ; st : f[l3], f[l7]*c, f[l7]*s, s, c + fmul st4, st0 + fmulp st3, st0 ; st : f[l7]*c, f[l7]*s, f[l3]*s, f[l3]*c + fsub st0, st2 ; st : t9, f[l7]*s, f[l3]*s, f[l3]*c + fstp _t9 + faddp st2, st0 ; st : f[l2]*s, t6 + fstp st0 ; st : t6 + fstp _t6 ; st : + +; c3 = C[3*jj]; +; s3 = S[3*jj]; + add eax, _jj + fld qword[edi+eax*8] + fld qword[esi+eax*8] ; st : s3, c3 + +; t7 = f[l4] * c3 + f[l8] * s3; +; t0 = f[l8] * c3 - f[l4] * s3; + mov edx, _l8 + fld qword[ebx+edx*8] + mov edx, _l4 + fld st0 + fmul st0, st2 + fxch st1 + fmul st0, st3 + fld qword[ebx+edx*8] ; st : f[l4], f[l8]*c, f[l8]*s, s, c + fmul st4, st0 + fmulp st3, st0 ; st : f[l8]*c, f[l8]*s, f[l4]*s, f[l4]*c + fsub st0, st2 ; st : t9, f[l8]*s, f[l4]*s, f[l4]*c + fstp _t0 + faddp st2, st0 ; st : f[l2]*s, t7 + fstp st0 ; st : t7 + fstp _t7 ; st : + +; t1 = f[l5] - t9; +; t2 = f[l5] + t9; + mov eax, _l5 + fld qword [ebx+eax*8] + fld _t9 + fld st0 + fadd st0, st2 + fstp _t2 + fsubp st1, st0 + fstp _t1 + +; t3 = - t8 - t0; + fld _t8 + fadd _t0 + fchs + fstp _t3 +; t4 = t5 - t7; + fld _t5 + fsub _t7 + fstp _t4 + +; f[l5] = t1 + t4; + fld _t1 + fld _t4 + fld st0 + fadd st0, st2 + fstp qword [ebx+eax*8] +; f[l7] = t1 - t4; + mov eax, _l7 + fsubp st1, st0 + fstp qword [ebx+eax*8] + +; f[l6] = t2 + t3; + mov eax, _l6 + fld _t2 + fld _t3 + fld st0 + fadd st0, st2 + fstp qword [ebx+eax*8] +; f[l8] = t2 - t3; + mov eax, _l8 + fsubp st1, st0 + fstp qword [ebx+eax*8] + +; t1 = f[l1] + t6; + mov eax, _l1 + fld qword [ebx+eax*8] + fld _t6 + fld st0 + fadd st0, st2 + fstp _t1 +; t2 = f[l1] - t6; + fsubp st1, st0 + fstp _t2 + +; t3 = t8 - t0; + fld _t8 + fsub _t0 + fstp _t3 +; t4 = t5 + t7; + fld _t5 + fadd _t7 + fstp _t4 + +; f[l1] = t1 + t4; + mov eax, _l1 + fld _t1 + fld _t4 + fld st0 + fadd st0, st2 + fstp qword [ebx+eax*8] +; f[l3] = t1 - t4; + mov eax, _l3 + fsubp st1, st0 + fstp qword [ebx+eax*8] + +; f[l2] = t2 + t3; + mov eax, _l2 + fld _t2 + fld _t3 + fld st0 + fadd st0, st2 + fstp qword [ebx+eax*8] +; f[l4] = t2 - t3; + mov eax, _l4 + fsubp st1, st0 + fstp qword [ebx+eax*8] + +; 374 : } + jmp .next_k + +.done_k: +; 375 : } + add ebx, _d6 ; d6 = d5*8 + cmp ebx, _end_of_array + jb .next_j + +; 376 : } + mov cx, _step + jmp .newstep +.done: + mov esp, ebp + pop ebp +; 377 : } + ret + + + ;=========== Step3 ends here =========== + + +; ================================================================= + +;================================================================= +; parameters: +; -- [ebp+8] = N +; -- [ebp+12] = p +; -- [ebp+16] = 4k-aligned data array address +; -- [ebp+20] = 4k-aligned SinCosTable address +; returns: +; -- nothing +; destroys: +; -- all GPRegs +;; ========================== + +align 4 + +FHT_4: + + push ebp + mov ebp, esp + mov edx, [ebp+16] + add edx, [ebp+12] + call BitInvert + push dword[ebp+16] + push dword[ebp+8] + call step1 + call step2 + pop edx ; N + pop ecx ; a + push dword[ebp+20] ; t + push ecx + push dword[ebp+12] ; p + push edx ; N + call step3 + mov esp, ebp + pop ebp + +ret