Artem Jerdev (art_zh) 8c44a1652a a test version with FFT library inside kernel
git-svn-id: svn://kolibrios.org@1626 a494cfbc-eb01-0410-851d-a64ba20cac60
2010-09-26 16:16:10 +00:00

761 lines
15 KiB
PHP

; Fast Hartley Transform routine
; Copyright (C) 1999, 2004, 2010
; Artem Jerdev artem@jerdev.co.uk
;
; free KolibriOS version - not to be ported to other OSes
; ==========================================================
Power_of_4 equ 5
NumPoints equ 1024
N_2 equ NumPoints / 2
N_4 equ NumPoints / 4
;=================================================================
; global constants
align 8
_root dq 1.41421356237309504880169 ; = sqrt(2)
_root2 dq 0.70710678118654752440084 ; = sqrt(2)/2
_c1 dq 0.92387953251128675612818 ; = cos(pi/8)
_s1 dq 0.38268343236508977172846 ; = sin(pi/8)
_dx dq 0.00613592315154296875 ; pi/512
;[_CosTable] dd 0 ; N_2 elements
;[_SinTable] dd 0 ; N_2 elements
;; ==========================
align 4
MakeSinCosTable:
mov ebx, [_Sines]
mov ecx, [_Cosins]
xor eax, eax
fld [_dx] ; st : dx
fldz ; st : 0, dx
.loop:
fld st0 ; st : x, x, dx
FSINCOS ; st : cos, sin, x, dx
fstp qword [ecx+eax*8] ; st : sin, x, dx
fstp qword [ebx+eax*8] ; st : x, dx
fadd st0, st1 ; st : x+dx, dx
inc eax
cmp eax, N_2
jne .loop
fstp st0 ; st : dx
fstp st0 ; st : <empty>
ret
; ================================================================
align 4
BitInvert:
mov esi, [x] ; array of qwords
xor ecx, ecx ; index term
.newterm:
inc ecx
cmp ecx, NumPoints
jge .done
xor eax, eax
mov edx, ecx
xor bl, bl
.do_invert:
inc bl
cmp bl, Power_of_4
jg .switch
mov bh, dl
and bh, 3
shl eax, 2
or al, bh
shr edx, 2
jmp .do_invert
.switch:
cmp eax, ecx
jle .newterm
fld qword [esi+eax*8]
fld qword [esi+ecx*8]
fstp qword [esi+eax*8]
fstp qword [esi+ecx*8]
jmp .newterm
.done:
ret
;=================================================================
align 4
step1:
mov esi, [x]
mov ebx, esi
add esi, NumPoints*8
.loop:
fld qword[ebx]
fld qword[ebx+8]
fld st1
fsub st0, st1 ; st : t2, f[i+1], f[i]
fxch st1 ; st : f[i+1], t2, f[i]
faddp st2, st0 ; st : t2, t1
fld qword[ebx+16]
fld qword[ebx+24]
fld st1 ; st : f[i+2], f[i+3], f[i+2], t2, t1
fadd st0, st1 ; st : t3, f[i+3], f[i+2], t2, t1
fxch st2 ; st : f[i+2], f[i+3], t3, t2, t1
fsub st0, st1 ; st : t4, f[i+3], t3, t2, t1
fstp st1 ; st : t4, t3, t2, t1
fld st2 ; st : t2, t4, t3, t2, t1
fadd st0, st1 ; st : t2+t4, t4, t3, t2, t1
fstp qword[ebx+16] ; st : t4, t3, t2, t1
fsubp st2, st0 ; st : t3, t2-t4, t1
fld st2 ; st : t1, t3, t2-t4, t1
fadd st0, st1 ; st : t1+t3, t3, t2-t4, t1
fstp qword[ebx] ; st : t3, t2-t4, t1
fsubp st2, st0 ; st : t2-t4, t1-t3
fstp qword[ebx+24] ; st : t1-t3
fstp qword[ebx+8] ; st : <empty>
add ebx, 32
cmp ebx, esi
jnz .loop
ret
;
;===========================================================================
step2: ; Step2
;===========================================================================
mov eax, [_f]
mov ebx, eax
add eax, NumPoints*8
.loop_i:
; -- quad subelements +0, +4, +8 and +12 (simpliest operations)
fld qword[ebx]
fld qword[ebx+8*4]
fld st0
fadd st0, st2 ; st : t1, f_4, f_0
fxch st1
fsubp st2, st0 ; st : t1, t2
fld qword[ebx+8*8]
fld qword[ebx+8*12]
fld st0
fadd st0, st2 ; st : t3, f_12, t1, t2
fxch st1
fsubp st2, st0 ; st : t3, t4, t1, t2
; ------
fld st2 ; st : t1, t3, t4, t1, t2
fadd st0, st1
fstp qword[ebx] ; st : t3, t4, t1, t2
fsub st0, st2 ; st : t3-t1, t4, t1, t2
fchs ; st : t1-t3, t4, t1, t2
fstp qword[ebx+8*4] ; st : t4, t1, t2
fst st1 ; st : t4, t4, t2
fadd st0, st2 ; st : t2+t4, t4, t2
fstp qword[ebx+8*8] ; st : t4, t2
fsubp st1, st0 ; st : t2-t4
fstp qword[ebx+8*12] ; st : <empty>
; -- even subelements +2, +6, +10 and +14 (2 multiplications needed)
fld qword[ebx+8*2]
fld qword[ebx+8*6]
fld [_root]
fmul st1, st0 ; st : r, t2, t1
fld qword[ebx+8*10]
fxch st1 ; st : r, t3, t2, t1
fmul qword[ebx+8*14] ; st : t4, t3, t2, t1
; ------
fld st3 ; st : t1, t4, t3, t2, t1
fadd st0, st3 ;
fadd st0, st2 ;
fst qword[ebx+8*2] ; store f[i+8] = t1+t2+t3
fsub st0, st3 ;
fsub st0, st3 ;
fstp qword[ebx+8*10] ; store f[i+10]= t1-t2+t3
fld st3 ; st : t1, t4, t3, t2, t1
fsub st0, st2 ;
fsub st0, st1 ;
fst qword[ebx+8*14] ; store f[i+14]= t1-t3-t4
fadd st0, st1 ;
faddp st1, st0 ; st : t1-t3+t4, t3, t2, t1
fstp qword[ebx+8*6] ; store f[i+6]
fstp st0 ; st : t2, t1
fstp st0 ; st : t1
fstp st0 ; st : <empty>
; -- odd subelements
fld qword[ebx+8*9]
fld qword[ebx+8*11]
fld st1
fsub st0, st1
fxch st1
faddp st2, st0 ; st : (f[l3]-f[l7]), (f[l3]+f[l7])
fld [_root2]
fmul st2, st0
fmulp st1, st0 ; st : t9, t6
fld qword[ebx+8*3]
fld st0
fadd st0, st2 ; st : t1, f[l5], t9, t6
fstp [_t1]
fsub st0, st1
fstp [_t2]
fstp [_t9] ; (t9 never used)
fstp [_t6] ; st : <empty>
fld [_c1]
fld [_s1]
fld qword[ebx+8*5]
fld qword[ebx+8*7]
fld st3 ; st: c1, f[l6], f[l2], s1, c1
fmul st0, st2 ; st: f_2*c, f_6, f_2, s, c
fld st1 ; st: f_6, f_2*c, f_6, f_2, s, c
fmul st0, st4 ; st: f_6*s, f_2*c, f_6, f_2, s, c
faddp st1, st0 ; st: t5, f_6, f_2, s, c
fstp [_t5] ; st: f_6, f_2, s, c
fld st3 ; st: c, f_6, f_2, s, c
fmul st0, st1
fld st3
fmul st0, st3 ; st: f_2*s, f_6*c, f_6, f_2, s, c
fsubp st1, st0 ; st: t8, f_6, f_2, s, c
fstp [_t8] ; st: f_6, f_2, s, c
fstp st0 ; st: f_2, s, c
fstp st0 ; st: s, c
fld qword[ebx+8*13]
fld qword[ebx+8*15]
fld st3 ; st: c1, f[l8], f[l4], s1, c1
fmul st0, st1
fld st3
fmul st0, st3 ; st: f_4*s, f_8*c, f_8, f_4, s, c
faddp st1, st0 ; st: t7, f_8, f_4, s, c
fld [_t5] ; st: t5, t7, f_8, f_4, s, c
fsub st0, st1 ; st: t4, t7, f_8, f_4, s, c
fstp [_t4]
fstp [_t7] ; st: f_8, f_4, s, c
fld st3 ; st: c, f_8, f_4, s, c
fmul st0, st2
fld st3
fmul st0, st2 ; st: f_8*s, f_4*c, f_8, f_4, s, c
fsubp st1, st0 ; st:-t0, f_8, f_4, s, c
fchs
fld [_t8]
fchs ; st:-t8, t0, f_8, f_4, s, c
fsub st0, st1 ; st: t3, t0, f_8, f_4, s, c
fstp [_t3]
fstp [_t0] ; st: f_8, f_4, s, c
fstp st0 ; st: f_4, s, c
fstp st0 ; st: s, c
fstp st0 ; st: c
fstp st0 ; st: <empty>
fld [_t1]
fld [_t4]
fld st1
fsub st0, st1
fstp qword[ebx+8*11] ; f[l7] = t1-t4
faddp st1, st0
fstp qword[ebx+8*3] ; f[l5] = t1+t4
fld [_t2]
fld [_t3]
fld st1
fsub st0, st1
fstp qword[ebx+8*15] ; f[l8]
faddp st1, st0
fstp qword[ebx+8*7] ; f[l6]
fld [_t6]
fld qword[ebx+8]
fld st1
fsub st0, st1
fxch st1
faddp st2, st0 ; st : t2, t1
fld [_t8]
fsub [_t0]
fld [_t5]
fadd [_t7] ; st : t4, t3, t2, t1
fld st3
fsub st0, st1
fstp qword[ebx+8*9] ; f[l3] = t1-t4
fadd st0, st3
fstp qword[ebx+8] ; f[l1] = t1+t4
fld st1 ; st : t2, t3, t2, t1
fsub st0, st1 ; f[l4] = t2-t3
fstp qword[ebx+8*13] ; st : t3, t2, t1
faddp st1, st0 ; st : t2+t3, t1
fstp qword[ebx+8*5] ; f[l2] = t2+t3
fstp st0 ; st : <empty>
add ebx, 16*8
cmp ebx, eax
jb .loop_i
ret
align 8 ; shared local vars
_t0 dq 0
_t1 dq 0
_t2 dq 0
_t3 dq 0
_t4 dq 0
_t5 dq 0
_t6 dq 0
_t7 dq 0
_t8 dq 0
_t9 dq 0
;===================================================================
step3:
;===================================================================
; 283 : {
; 293 : for (l=3; l<=p; l++)
mov cx, 0x0200
.newstep:
inc ch
cmp ch, Power_of_4
jg .done
mov [.step], cx
; 294 : {
; 295 : d1 = 1 << (l + l - 3);
mov cl, ch
add cl, cl
sub cl, 3
mov edx, 1
shl edx, cl
mov [.d1], edx
; 296 : d2 = d1 << 1;
shl edx, 1
mov [.d2], edx
mov eax, edx
; 297 : d3 = d2 << 1;
shl edx, 1
mov [.d3], edx
; 298 : d4 = d2 + d3;
add eax, edx
mov [.d4], eax
; 299 : d5 = d3 << 1;
shl edx, 1
mov [.d5], edx
shl edx, 3
mov [.d6], edx ; d6 = d5*8 to simplify index operations
; 339 : j5 = N / d5; ; moved out of internal loop
mov cl, Power_of_4
sub cl, ch
add cl, cl
mov edx, 1
shl edx, cl
mov [.j5], edx
; 300 :
; 301 : for (j=0; j<N; j+=d5)
mov esi, [_f]
mov ebx, esi
add esi, NumPoints*8
mov [.end_of_array], esi
.next_j:
; {
; t1 = f[j] + f[j+d2];
mov eax, [.d2]
fld qword[ebx]
fld qword[ebx+eax*8]
fld st1
fadd st0, st1
fstp [_t1]
; t2 = f[j] - f[j+d2];
fsubp st1, st0
fstp [_t2]
; t3 = f[j+d3] + f[j+d4];
mov edi, [.d3]
fld qword[ebx+edi*8]
mov edx, [.d4]
fld qword[ebx+edx*8]
fld st1
fsub st0, st1 ; st : t4, f4, f3
fxch st1 ; st : f4, t4, f3
; t4 = f[j+d3] - f[j+d4];
faddp st2, st0 ; st : t4, t3
; f[j+d4] = t2 - t4;
; f[j+d3] = t2 + t4;
fld [_t2]
fld st0
fsub st0, st2 ; st : f4, t2, t4, t3
fstp qword[ebx+edx*8] ; st : t2, t4, t3
fadd st0, st1 ; st : f3, t4, t3
fstp qword[ebx+edi*8] ; st : t4, t3
; f[j+d2] = t1 - t3;
; f[j] = t1 + t3;
fld [_t1]
fst st1
fsub st0, st2 ; st : f2, t1, t3
fstp qword[ebx+eax*8] ; st : t1, t3
fadd st0, st1 ; st : f0, t3
fstp qword[ebx] ; st : t3
fstp st0
; jj = j + d1; / ??
mov edi, [.d1]
shl edi, 3 ; = d1*8
mov edx, edi
mov eax, edi
add eax, eax ; eax = d2*8
shl edx, 2 ; = d3*8
add edi, ebx ; now [edi] points to f[jj]
add edx, edi ; and [edx] points to f[jj+d3]
; t1 = f[jj];
fld qword [edi] ; st : t1
; t3 = f[jj+d3];
fld qword [edx] ; st : t3, t1
; t2 = f[jj+d2] * r;
fld qword [edi+eax]
fld [_root]
fmul st1, st0 ; st : r, t2, t3, t1
; t4 = f[jj+d4] * r
fmul qword [edx+eax] ; st : t4, t2, t3, t1
; f[jj] = t1 + t2 + t3;
fld st3 ; st : t1, t4, t2, t3, t1
fadd st0, st3
fadd st0, st2
fstp qword [edi]
; f[jj+d2] = t1 - t3 + t4;
fld st3
fsub st0, st3 ; st : (t1-t3), t4, t2, t3, t1
fld st0
fadd st0, st2 ; st : f2, (t1-t3), t4, t2, t3, t1
fstp qword [edi+eax]
; f[jj+d4] = t1 - t3 - t4;
fsub st0, st1 ; st : f4, t4, t2, t3, t1
fstp qword [edx+eax]
; f[jj+d3] = t1 - t2 + t3;
fstp st0 ; st : t2, t3, t1
fsubp st1, st0 ; st : (t3-t2), t1
faddp st1, st0 ; st : f3
fstp qword [edx]
; for (k=1; k<d1; k++)
xor ecx, ecx ; ecx = k
mov [.jj], ecx
.next_k:
inc ecx
cmp ecx, [.d1]
jge .done_k
; {
mov eax, [.d2] ; the sector increment
; l1 = j + k;
mov edx, ecx
mov [.l1], edx ; [ebx+edx*8] --> f[j+k]
; l2 = l1 + d2;
add edx, eax
mov [.l2], edx
; l3 = l1 + d3;
add edx, eax
mov [.l3], edx
; l4 = l1 + d4;
add edx, eax
mov [.l4], edx
; l5 = j + d2 - k;
mov edx, eax
sub edx, ecx
mov [.l5], edx
; l6 = l5 + d2;
add edx, eax
mov [.l6], edx
; l7 = l5 + d3;
add edx, eax
mov [.l7], edx
; l8 = l5 + d4;
add edx, eax
mov [.l8], edx
; 340 : j5 *= k; // add-substituted multiplication
mov eax, [.jj]
add eax, [.j5]
mov [.jj], eax
; c1 = C[jj];
; s1 = S[jj];
mov edi, [_Cosins]
fld qword[edi+eax*8]
mov esi, [_Sines]
fld qword[esi+eax*8] ; st : s1, c1
; t5 = f[l2] * c1 + f[l6] * s1;
; t8 = f[l6] * c1 - f[l2] * s1;
mov edx, [.l6]
fld qword[ebx+edx*8]
mov edx, [.l2]
fld st0
fmul st0, st2
fxch st1
fmul st0, st3
fld qword[ebx+edx*8] ; st : f[l2], f[l6]*c, f[l6]*s, s, c
fmul st4, st0
fmulp st3, st0 ; st : f[l6]*c, f[l6]*s, f[l2]*s, f[l2]*c
fsub st0, st2 ; st : t8, f[l6]*s, f[l2]*s, f[l2]*c
fstp [_t8]
faddp st2, st0 ; st : f[l2]*s, t5
fstp st0 ; st : t5
fstp [_t5] ; st : <empty>
; c2 = C[2*jj];
; s2 = S[2*jj];
shl eax, 1
fld qword[edi+eax*8]
fld qword[esi+eax*8] ; st : s2, c2
; t6 = f[l3] * c2 + f[l7] * s2;
; t9 = f[l7] * c2 - f[l3] * s2;
mov edx, [.l7]
fld qword[ebx+edx*8]
mov edx, [.l3]
fld st0
fmul st0, st2
fxch st1
fmul st0, st3
fld qword[ebx+edx*8] ; st : f[l3], f[l7]*c, f[l7]*s, s, c
fmul st4, st0
fmulp st3, st0 ; st : f[l7]*c, f[l7]*s, f[l3]*s, f[l3]*c
fsub st0, st2 ; st : t9, f[l7]*s, f[l3]*s, f[l3]*c
fstp [_t9]
faddp st2, st0 ; st : f[l2]*s, t6
fstp st0 ; st : t6
fstp [_t6] ; st : <empty>
; c3 = C[3*jj];
; s3 = S[3*jj];
add eax, [.jj]
fld qword[edi+eax*8]
fld qword[esi+eax*8] ; st : s3, c3
; t7 = f[l4] * c3 + f[l8] * s3;
; t0 = f[l8] * c3 - f[l4] * s3;
mov edx, [.l8]
fld qword[ebx+edx*8]
mov edx, [.l4]
fld st0
fmul st0, st2
fxch st1
fmul st0, st3
fld qword[ebx+edx*8] ; st : f[l4], f[l8]*c, f[l8]*s, s, c
fmul st4, st0
fmulp st3, st0 ; st : f[l8]*c, f[l8]*s, f[l4]*s, f[l4]*c
fsub st0, st2 ; st : t9, f[l8]*s, f[l4]*s, f[l4]*c
fstp [_t0]
faddp st2, st0 ; st : f[l2]*s, t7
fstp st0 ; st : t7
fstp [_t7] ; st : <empty>
; t1 = f[l5] - t9;
; t2 = f[l5] + t9;
mov eax, [.l5]
fld qword [ebx+eax*8]
fld [_t9]
fld st0
fadd st0, st2
fstp [_t2]
fsubp st1, st0
fstp [_t1]
; t3 = - t8 - t0;
fld [_t8]
fadd [_t0]
fchs
fstp [_t3]
; t4 = t5 - t7;
fld [_t5]
fsub [_t7]
fstp [_t4]
; f[l5] = t1 + t4;
fld [_t1]
fld [_t4]
fld st0
fadd st0, st2
fstp qword [ebx+eax*8]
; f[l7] = t1 - t4;
mov eax, [.l7]
fsubp st1, st0
fstp qword [ebx+eax*8]
; f[l6] = t2 + t3;
mov eax, [.l6]
fld [_t2]
fld [_t3]
fld st0
fadd st0, st2
fstp qword [ebx+eax*8]
; f[l8] = t2 - t3;
mov eax, [.l8]
fsubp st1, st0
fstp qword [ebx+eax*8]
; t1 = f[l1] + t6;
mov eax, [.l1]
fld qword [ebx+eax*8]
fld [_t6]
fld st0
fadd st0, st2
fstp [_t1]
; t2 = f[l1] - t6;
fsubp st1, st0
fstp [_t2]
; t3 = t8 - t0;
fld [_t8]
fsub [_t0]
fstp [_t3]
; t4 = t5 + t7;
fld [_t5]
fadd [_t7]
fstp [_t4]
; f[l1] = t1 + t4;
mov eax, [.l1]
fld [_t1]
fld [_t4]
fld st0
fadd st0, st2
fstp qword [ebx+eax*8]
; f[l3] = t1 - t4;
mov eax, [.l3]
fsubp st1, st0
fstp qword [ebx+eax*8]
; f[l2] = t2 + t3;
mov eax, [.l2]
fld [_t2]
fld [_t3]
fld st0
fadd st0, st2
fstp qword [ebx+eax*8]
; f[l4] = t2 - t3;
mov eax, [.l4]
fsubp st1, st0
fstp qword [ebx+eax*8]
; 374 : }
jmp .next_k
.done_k:
; 375 : }
add ebx, [.d6] ; d6 = d5*8
cmp ebx, [.end_of_array]
jb .next_j
; 376 : }
mov cx, [.step]
jmp .newstep
.done:
; 377 : }
ret
align 4
.l1 dd 0
.l2 dd 0
.l3 dd 0
.l4 dd 0
.l5 dd 0
.l6 dd 0
.l7 dd 0
.l8 dd 0
.l9 dd 0
.l0 dd 0
.d1 dd 0
.d2 dd 0
.d3 dd 0
.d4 dd 0
.d5 dd 0
.d6 dd 0
.j5 dd 0
.jj dd 0
.end_of_array dd 0
.step dw 0
align 8
;=========== Step3 ends here ===========
; =================================================================
; syscall entry
;
_f dd ?
_N dd 1024 ; number of points
_a dd ? ; initial data array
x dd 0 ; tranformed (float) data array
_Cosins dd 0
_Sines dd 0
FFT4:
or al, al
jnz .trans
mov cl, Power_of_4
mov eax, 1
shl eax, cl
shl eax, cl
mov [_N], eax
shl eax, 2 ; size of Sine table in bytes
add eax, ebx
mov [_Sines], ebx
mov [_Cosins], eax
cpuid
rdtsc
mov [.time], eax
call MakeSinCosTable
cpuid
rdtsc
sub eax, [.time]
ret
.trans:
mov [x], ebx
mov [_f], ebx
cli ;-----
cpuid
rdtsc
mov [.time], eax
call BitInvert
call step1
call step2
call step3
cpuid
rdtsc
sti ;----
sub eax, [.time]
ret
.time dd 0