; Fast Hartley Transform routine ; Copyright (C) 1999, 2004, 2010 ; Artem Jerdev artem@jerdev.co.uk ; ; free KolibriOS version - not to be ported to other OSes ; ========================================================== Power_of_4 equ 5 NumPoints equ 1024 N_2 equ NumPoints / 2 N_4 equ NumPoints / 4 ;================================================================= ; global constants align 8 _root dq 1.41421356237309504880169 ; = sqrt(2) _root2 dq 0.70710678118654752440084 ; = sqrt(2)/2 _c1 dq 0.92387953251128675612818 ; = cos(pi/8) _s1 dq 0.38268343236508977172846 ; = sin(pi/8) _dx dq 0.00613592315154296875 ; pi/512 ;[_CosTable] dd 0 ; N_2 elements ;[_SinTable] dd 0 ; N_2 elements ;; ========================== align 4 MakeSinCosTable: mov ebx, [_Sines] mov ecx, [_Cosins] xor eax, eax fld [_dx] ; st : dx fldz ; st : 0, dx .loop: fld st0 ; st : x, x, dx FSINCOS ; st : cos, sin, x, dx fstp qword [ecx+eax*8] ; st : sin, x, dx fstp qword [ebx+eax*8] ; st : x, dx fadd st0, st1 ; st : x+dx, dx inc eax cmp eax, N_2 jne .loop fstp st0 ; st : dx fstp st0 ; st : ret ; ================================================================ align 4 BitInvert: mov esi, [x] ; array of qwords xor ecx, ecx ; index term .newterm: inc ecx cmp ecx, NumPoints jge .done xor eax, eax mov edx, ecx xor bl, bl .do_invert: inc bl cmp bl, Power_of_4 jg .switch mov bh, dl and bh, 3 shl eax, 2 or al, bh shr edx, 2 jmp .do_invert .switch: cmp eax, ecx jle .newterm fld qword [esi+eax*8] fld qword [esi+ecx*8] fstp qword [esi+eax*8] fstp qword [esi+ecx*8] jmp .newterm .done: ret ;================================================================= align 4 step1: mov esi, [x] mov ebx, esi add esi, NumPoints*8 .loop: fld qword[ebx] fld qword[ebx+8] fld st1 fsub st0, st1 ; st : t2, f[i+1], f[i] fxch st1 ; st : f[i+1], t2, f[i] faddp st2, st0 ; st : t2, t1 fld qword[ebx+16] fld qword[ebx+24] fld st1 ; st : f[i+2], f[i+3], f[i+2], t2, t1 fadd st0, st1 ; st : t3, f[i+3], f[i+2], t2, t1 fxch st2 ; st : f[i+2], f[i+3], t3, t2, t1 fsub st0, st1 ; st : t4, f[i+3], t3, t2, t1 fstp st1 ; st : t4, t3, t2, t1 fld st2 ; st : t2, t4, t3, t2, t1 fadd st0, st1 ; st : t2+t4, t4, t3, t2, t1 fstp qword[ebx+16] ; st : t4, t3, t2, t1 fsubp st2, st0 ; st : t3, t2-t4, t1 fld st2 ; st : t1, t3, t2-t4, t1 fadd st0, st1 ; st : t1+t3, t3, t2-t4, t1 fstp qword[ebx] ; st : t3, t2-t4, t1 fsubp st2, st0 ; st : t2-t4, t1-t3 fstp qword[ebx+24] ; st : t1-t3 fstp qword[ebx+8] ; st : add ebx, 32 cmp ebx, esi jnz .loop ret ; ;=========================================================================== step2: ; Step2 ;=========================================================================== mov eax, [_f] mov ebx, eax add eax, NumPoints*8 .loop_i: ; -- quad subelements +0, +4, +8 and +12 (simpliest operations) fld qword[ebx] fld qword[ebx+8*4] fld st0 fadd st0, st2 ; st : t1, f_4, f_0 fxch st1 fsubp st2, st0 ; st : t1, t2 fld qword[ebx+8*8] fld qword[ebx+8*12] fld st0 fadd st0, st2 ; st : t3, f_12, t1, t2 fxch st1 fsubp st2, st0 ; st : t3, t4, t1, t2 ; ------ fld st2 ; st : t1, t3, t4, t1, t2 fadd st0, st1 fstp qword[ebx] ; st : t3, t4, t1, t2 fsub st0, st2 ; st : t3-t1, t4, t1, t2 fchs ; st : t1-t3, t4, t1, t2 fstp qword[ebx+8*4] ; st : t4, t1, t2 fst st1 ; st : t4, t4, t2 fadd st0, st2 ; st : t2+t4, t4, t2 fstp qword[ebx+8*8] ; st : t4, t2 fsubp st1, st0 ; st : t2-t4 fstp qword[ebx+8*12] ; st : ; -- even subelements +2, +6, +10 and +14 (2 multiplications needed) fld qword[ebx+8*2] fld qword[ebx+8*6] fld [_root] fmul st1, st0 ; st : r, t2, t1 fld qword[ebx+8*10] fxch st1 ; st : r, t3, t2, t1 fmul qword[ebx+8*14] ; st : t4, t3, t2, t1 ; ------ fld st3 ; st : t1, t4, t3, t2, t1 fadd st0, st3 ; fadd st0, st2 ; fst qword[ebx+8*2] ; store f[i+8] = t1+t2+t3 fsub st0, st3 ; fsub st0, st3 ; fstp qword[ebx+8*10] ; store f[i+10]= t1-t2+t3 fld st3 ; st : t1, t4, t3, t2, t1 fsub st0, st2 ; fsub st0, st1 ; fst qword[ebx+8*14] ; store f[i+14]= t1-t3-t4 fadd st0, st1 ; faddp st1, st0 ; st : t1-t3+t4, t3, t2, t1 fstp qword[ebx+8*6] ; store f[i+6] fstp st0 ; st : t2, t1 fstp st0 ; st : t1 fstp st0 ; st : ; -- odd subelements fld qword[ebx+8*9] fld qword[ebx+8*11] fld st1 fsub st0, st1 fxch st1 faddp st2, st0 ; st : (f[l3]-f[l7]), (f[l3]+f[l7]) fld [_root2] fmul st2, st0 fmulp st1, st0 ; st : t9, t6 fld qword[ebx+8*3] fld st0 fadd st0, st2 ; st : t1, f[l5], t9, t6 fstp [_t1] fsub st0, st1 fstp [_t2] fstp [_t9] ; (t9 never used) fstp [_t6] ; st : fld [_c1] fld [_s1] fld qword[ebx+8*5] fld qword[ebx+8*7] fld st3 ; st: c1, f[l6], f[l2], s1, c1 fmul st0, st2 ; st: f_2*c, f_6, f_2, s, c fld st1 ; st: f_6, f_2*c, f_6, f_2, s, c fmul st0, st4 ; st: f_6*s, f_2*c, f_6, f_2, s, c faddp st1, st0 ; st: t5, f_6, f_2, s, c fstp [_t5] ; st: f_6, f_2, s, c fld st3 ; st: c, f_6, f_2, s, c fmul st0, st1 fld st3 fmul st0, st3 ; st: f_2*s, f_6*c, f_6, f_2, s, c fsubp st1, st0 ; st: t8, f_6, f_2, s, c fstp [_t8] ; st: f_6, f_2, s, c fstp st0 ; st: f_2, s, c fstp st0 ; st: s, c fld qword[ebx+8*13] fld qword[ebx+8*15] fld st3 ; st: c1, f[l8], f[l4], s1, c1 fmul st0, st1 fld st3 fmul st0, st3 ; st: f_4*s, f_8*c, f_8, f_4, s, c faddp st1, st0 ; st: t7, f_8, f_4, s, c fld [_t5] ; st: t5, t7, f_8, f_4, s, c fsub st0, st1 ; st: t4, t7, f_8, f_4, s, c fstp [_t4] fstp [_t7] ; st: f_8, f_4, s, c fld st3 ; st: c, f_8, f_4, s, c fmul st0, st2 fld st3 fmul st0, st2 ; st: f_8*s, f_4*c, f_8, f_4, s, c fsubp st1, st0 ; st:-t0, f_8, f_4, s, c fchs fld [_t8] fchs ; st:-t8, t0, f_8, f_4, s, c fsub st0, st1 ; st: t3, t0, f_8, f_4, s, c fstp [_t3] fstp [_t0] ; st: f_8, f_4, s, c fstp st0 ; st: f_4, s, c fstp st0 ; st: s, c fstp st0 ; st: c fstp st0 ; st: fld [_t1] fld [_t4] fld st1 fsub st0, st1 fstp qword[ebx+8*11] ; f[l7] = t1-t4 faddp st1, st0 fstp qword[ebx+8*3] ; f[l5] = t1+t4 fld [_t2] fld [_t3] fld st1 fsub st0, st1 fstp qword[ebx+8*15] ; f[l8] faddp st1, st0 fstp qword[ebx+8*7] ; f[l6] fld [_t6] fld qword[ebx+8] fld st1 fsub st0, st1 fxch st1 faddp st2, st0 ; st : t2, t1 fld [_t8] fsub [_t0] fld [_t5] fadd [_t7] ; st : t4, t3, t2, t1 fld st3 fsub st0, st1 fstp qword[ebx+8*9] ; f[l3] = t1-t4 fadd st0, st3 fstp qword[ebx+8] ; f[l1] = t1+t4 fld st1 ; st : t2, t3, t2, t1 fsub st0, st1 ; f[l4] = t2-t3 fstp qword[ebx+8*13] ; st : t3, t2, t1 faddp st1, st0 ; st : t2+t3, t1 fstp qword[ebx+8*5] ; f[l2] = t2+t3 fstp st0 ; st : add ebx, 16*8 cmp ebx, eax jb .loop_i ret align 8 ; shared local vars _t0 dq 0 _t1 dq 0 _t2 dq 0 _t3 dq 0 _t4 dq 0 _t5 dq 0 _t6 dq 0 _t7 dq 0 _t8 dq 0 _t9 dq 0 ;=================================================================== step3: ;=================================================================== ; 283 : { ; 293 : for (l=3; l<=p; l++) mov cx, 0x0200 .newstep: inc ch cmp ch, Power_of_4 jg .done mov [.step], cx ; 294 : { ; 295 : d1 = 1 << (l + l - 3); mov cl, ch add cl, cl sub cl, 3 mov edx, 1 shl edx, cl mov [.d1], edx ; 296 : d2 = d1 << 1; shl edx, 1 mov [.d2], edx mov eax, edx ; 297 : d3 = d2 << 1; shl edx, 1 mov [.d3], edx ; 298 : d4 = d2 + d3; add eax, edx mov [.d4], eax ; 299 : d5 = d3 << 1; shl edx, 1 mov [.d5], edx shl edx, 3 mov [.d6], edx ; d6 = d5*8 to simplify index operations ; 339 : j5 = N / d5; ; moved out of internal loop mov cl, Power_of_4 sub cl, ch add cl, cl mov edx, 1 shl edx, cl mov [.j5], edx ; 300 : ; 301 : for (j=0; j f[j+k] ; l2 = l1 + d2; add edx, eax mov [.l2], edx ; l3 = l1 + d3; add edx, eax mov [.l3], edx ; l4 = l1 + d4; add edx, eax mov [.l4], edx ; l5 = j + d2 - k; mov edx, eax sub edx, ecx mov [.l5], edx ; l6 = l5 + d2; add edx, eax mov [.l6], edx ; l7 = l5 + d3; add edx, eax mov [.l7], edx ; l8 = l5 + d4; add edx, eax mov [.l8], edx ; 340 : j5 *= k; // add-substituted multiplication mov eax, [.jj] add eax, [.j5] mov [.jj], eax ; c1 = C[jj]; ; s1 = S[jj]; mov edi, [_Cosins] fld qword[edi+eax*8] mov esi, [_Sines] fld qword[esi+eax*8] ; st : s1, c1 ; t5 = f[l2] * c1 + f[l6] * s1; ; t8 = f[l6] * c1 - f[l2] * s1; mov edx, [.l6] fld qword[ebx+edx*8] mov edx, [.l2] fld st0 fmul st0, st2 fxch st1 fmul st0, st3 fld qword[ebx+edx*8] ; st : f[l2], f[l6]*c, f[l6]*s, s, c fmul st4, st0 fmulp st3, st0 ; st : f[l6]*c, f[l6]*s, f[l2]*s, f[l2]*c fsub st0, st2 ; st : t8, f[l6]*s, f[l2]*s, f[l2]*c fstp [_t8] faddp st2, st0 ; st : f[l2]*s, t5 fstp st0 ; st : t5 fstp [_t5] ; st : ; c2 = C[2*jj]; ; s2 = S[2*jj]; shl eax, 1 fld qword[edi+eax*8] fld qword[esi+eax*8] ; st : s2, c2 ; t6 = f[l3] * c2 + f[l7] * s2; ; t9 = f[l7] * c2 - f[l3] * s2; mov edx, [.l7] fld qword[ebx+edx*8] mov edx, [.l3] fld st0 fmul st0, st2 fxch st1 fmul st0, st3 fld qword[ebx+edx*8] ; st : f[l3], f[l7]*c, f[l7]*s, s, c fmul st4, st0 fmulp st3, st0 ; st : f[l7]*c, f[l7]*s, f[l3]*s, f[l3]*c fsub st0, st2 ; st : t9, f[l7]*s, f[l3]*s, f[l3]*c fstp [_t9] faddp st2, st0 ; st : f[l2]*s, t6 fstp st0 ; st : t6 fstp [_t6] ; st : ; c3 = C[3*jj]; ; s3 = S[3*jj]; add eax, [.jj] fld qword[edi+eax*8] fld qword[esi+eax*8] ; st : s3, c3 ; t7 = f[l4] * c3 + f[l8] * s3; ; t0 = f[l8] * c3 - f[l4] * s3; mov edx, [.l8] fld qword[ebx+edx*8] mov edx, [.l4] fld st0 fmul st0, st2 fxch st1 fmul st0, st3 fld qword[ebx+edx*8] ; st : f[l4], f[l8]*c, f[l8]*s, s, c fmul st4, st0 fmulp st3, st0 ; st : f[l8]*c, f[l8]*s, f[l4]*s, f[l4]*c fsub st0, st2 ; st : t9, f[l8]*s, f[l4]*s, f[l4]*c fstp [_t0] faddp st2, st0 ; st : f[l2]*s, t7 fstp st0 ; st : t7 fstp [_t7] ; st : ; t1 = f[l5] - t9; ; t2 = f[l5] + t9; mov eax, [.l5] fld qword [ebx+eax*8] fld [_t9] fld st0 fadd st0, st2 fstp [_t2] fsubp st1, st0 fstp [_t1] ; t3 = - t8 - t0; fld [_t8] fadd [_t0] fchs fstp [_t3] ; t4 = t5 - t7; fld [_t5] fsub [_t7] fstp [_t4] ; f[l5] = t1 + t4; fld [_t1] fld [_t4] fld st0 fadd st0, st2 fstp qword [ebx+eax*8] ; f[l7] = t1 - t4; mov eax, [.l7] fsubp st1, st0 fstp qword [ebx+eax*8] ; f[l6] = t2 + t3; mov eax, [.l6] fld [_t2] fld [_t3] fld st0 fadd st0, st2 fstp qword [ebx+eax*8] ; f[l8] = t2 - t3; mov eax, [.l8] fsubp st1, st0 fstp qword [ebx+eax*8] ; t1 = f[l1] + t6; mov eax, [.l1] fld qword [ebx+eax*8] fld [_t6] fld st0 fadd st0, st2 fstp [_t1] ; t2 = f[l1] - t6; fsubp st1, st0 fstp [_t2] ; t3 = t8 - t0; fld [_t8] fsub [_t0] fstp [_t3] ; t4 = t5 + t7; fld [_t5] fadd [_t7] fstp [_t4] ; f[l1] = t1 + t4; mov eax, [.l1] fld [_t1] fld [_t4] fld st0 fadd st0, st2 fstp qword [ebx+eax*8] ; f[l3] = t1 - t4; mov eax, [.l3] fsubp st1, st0 fstp qword [ebx+eax*8] ; f[l2] = t2 + t3; mov eax, [.l2] fld [_t2] fld [_t3] fld st0 fadd st0, st2 fstp qword [ebx+eax*8] ; f[l4] = t2 - t3; mov eax, [.l4] fsubp st1, st0 fstp qword [ebx+eax*8] ; 374 : } jmp .next_k .done_k: ; 375 : } add ebx, [.d6] ; d6 = d5*8 cmp ebx, [.end_of_array] jb .next_j ; 376 : } mov cx, [.step] jmp .newstep .done: ; 377 : } ret align 4 .l1 dd 0 .l2 dd 0 .l3 dd 0 .l4 dd 0 .l5 dd 0 .l6 dd 0 .l7 dd 0 .l8 dd 0 .l9 dd 0 .l0 dd 0 .d1 dd 0 .d2 dd 0 .d3 dd 0 .d4 dd 0 .d5 dd 0 .d6 dd 0 .j5 dd 0 .jj dd 0 .end_of_array dd 0 .step dw 0 align 8 ;=========== Step3 ends here =========== ; ================================================================= ; syscall entry ; _f dd ? _N dd 1024 ; number of points _a dd ? ; initial data array x dd 0 ; tranformed (float) data array _Cosins dd 0 _Sines dd 0 FFT4: or al, al jnz .trans mov cl, Power_of_4 mov eax, 1 shl eax, cl shl eax, cl mov [_N], eax shl eax, 2 ; size of Sine table in bytes add eax, ebx mov [_Sines], ebx mov [_Cosins], eax cpuid rdtsc mov [.time], eax call MakeSinCosTable cpuid rdtsc sub eax, [.time] ret .trans: mov [x], ebx mov [_f], ebx cli ;----- cpuid rdtsc mov [.time], eax call BitInvert call step1 call step2 call step3 cpuid rdtsc sti ;---- sub eax, [.time] ret .time dd 0