From 8c44a1652a5c70edf75991158fb56d19e6fd1864 Mon Sep 17 00:00:00 2001 From: "Artem Jerdev (art_zh)" Date: Sun, 26 Sep 2010 16:16:10 +0000 Subject: [PATCH] a test version with FFT library inside kernel git-svn-id: svn://kolibrios.org@1626 a494cfbc-eb01-0410-851d-a64ba20cac60 --- .../branches/Kolibri-A/trunk/core/syscall.inc | 8 +- kernel/branches/Kolibri-A/trunk/kernel32.inc | 3 +- kernel/branches/Kolibri-A/trunk/sound/FFT.inc | 760 ++++++++++++++++++ 3 files changed, 766 insertions(+), 5 deletions(-) create mode 100644 kernel/branches/Kolibri-A/trunk/sound/FFT.inc diff --git a/kernel/branches/Kolibri-A/trunk/core/syscall.inc b/kernel/branches/Kolibri-A/trunk/core/syscall.inc index 4bafe7ebee..caad5d8a44 100644 --- a/kernel/branches/Kolibri-A/trunk/core/syscall.inc +++ b/kernel/branches/Kolibri-A/trunk/core/syscall.inc @@ -76,10 +76,10 @@ align 32 syscall_entry: ; push ecx sti - xor eax, 3 + and eax, 3 call dword [servetable3 + eax * 4] -; pop ecx + ; pop ecx sysret iglobal @@ -187,8 +187,8 @@ iglobal align 4 servetable3: - dd paleholder ; 0 - dd paleholder ; 1 + dd FFT4 ; 0 + dd FFT4 ; 1 dd paleholder ; 2 dd sys_end ; last diff --git a/kernel/branches/Kolibri-A/trunk/kernel32.inc b/kernel/branches/Kolibri-A/trunk/kernel32.inc index b5c9dc13cf..9bdd43c726 100644 --- a/kernel/branches/Kolibri-A/trunk/kernel32.inc +++ b/kernel/branches/Kolibri-A/trunk/kernel32.inc @@ -243,7 +243,8 @@ include "fs/ext2.inc" ; read / write for ext2 filesystem ; sound -include "sound/playnote.inc" ; player Note for Speaker PC +include "sound/playnote.inc" ; player Note for Speaker PC +include "sound/FFT.inc" ; fast Fourier transform routines ; display diff --git a/kernel/branches/Kolibri-A/trunk/sound/FFT.inc b/kernel/branches/Kolibri-A/trunk/sound/FFT.inc new file mode 100644 index 0000000000..a3fd33373c --- /dev/null +++ b/kernel/branches/Kolibri-A/trunk/sound/FFT.inc @@ -0,0 +1,760 @@ +; Fast Hartley Transform routine +; Copyright (C) 1999, 2004, 2010 +; Artem Jerdev artem@jerdev.co.uk +; +; free KolibriOS version - not to be ported to other OSes +; ========================================================== + +Power_of_4 equ 5 +NumPoints equ 1024 +N_2 equ NumPoints / 2 +N_4 equ NumPoints / 4 + +;================================================================= +; global constants +align 8 +_root dq 1.41421356237309504880169 ; = sqrt(2) +_root2 dq 0.70710678118654752440084 ; = sqrt(2)/2 +_c1 dq 0.92387953251128675612818 ; = cos(pi/8) +_s1 dq 0.38268343236508977172846 ; = sin(pi/8) +_dx dq 0.00613592315154296875 ; pi/512 + +;[_CosTable] dd 0 ; N_2 elements +;[_SinTable] dd 0 ; N_2 elements + +;; ========================== +align 4 +MakeSinCosTable: + mov ebx, [_Sines] + mov ecx, [_Cosins] + xor eax, eax + fld [_dx] ; st : dx + fldz ; st : 0, dx +.loop: + fld st0 ; st : x, x, dx + FSINCOS ; st : cos, sin, x, dx + fstp qword [ecx+eax*8] ; st : sin, x, dx + fstp qword [ebx+eax*8] ; st : x, dx + fadd st0, st1 ; st : x+dx, dx + + inc eax + cmp eax, N_2 + jne .loop + fstp st0 ; st : dx + fstp st0 ; st : + ret + +; ================================================================ +align 4 +BitInvert: + mov esi, [x] ; array of qwords + xor ecx, ecx ; index term +.newterm: + inc ecx + cmp ecx, NumPoints + jge .done + + xor eax, eax + mov edx, ecx + xor bl, bl + +.do_invert: + inc bl + cmp bl, Power_of_4 + jg .switch + + mov bh, dl + and bh, 3 + shl eax, 2 + or al, bh + shr edx, 2 + jmp .do_invert + +.switch: + cmp eax, ecx + jle .newterm + + fld qword [esi+eax*8] + fld qword [esi+ecx*8] + fstp qword [esi+eax*8] + fstp qword [esi+ecx*8] + jmp .newterm + +.done: + ret + +;================================================================= +align 4 + +step1: + mov esi, [x] + mov ebx, esi + add esi, NumPoints*8 + +.loop: + fld qword[ebx] + fld qword[ebx+8] + fld st1 + fsub st0, st1 ; st : t2, f[i+1], f[i] + fxch st1 ; st : f[i+1], t2, f[i] + faddp st2, st0 ; st : t2, t1 + fld qword[ebx+16] + fld qword[ebx+24] + fld st1 ; st : f[i+2], f[i+3], f[i+2], t2, t1 + fadd st0, st1 ; st : t3, f[i+3], f[i+2], t2, t1 + fxch st2 ; st : f[i+2], f[i+3], t3, t2, t1 + fsub st0, st1 ; st : t4, f[i+3], t3, t2, t1 + fstp st1 ; st : t4, t3, t2, t1 + fld st2 ; st : t2, t4, t3, t2, t1 + fadd st0, st1 ; st : t2+t4, t4, t3, t2, t1 + fstp qword[ebx+16] ; st : t4, t3, t2, t1 + fsubp st2, st0 ; st : t3, t2-t4, t1 + fld st2 ; st : t1, t3, t2-t4, t1 + fadd st0, st1 ; st : t1+t3, t3, t2-t4, t1 + fstp qword[ebx] ; st : t3, t2-t4, t1 + fsubp st2, st0 ; st : t2-t4, t1-t3 + fstp qword[ebx+24] ; st : t1-t3 + fstp qword[ebx+8] ; st : + + add ebx, 32 + cmp ebx, esi + jnz .loop + + ret + + +; +;=========================================================================== +step2: ; Step2 +;=========================================================================== + + mov eax, [_f] + mov ebx, eax + add eax, NumPoints*8 + +.loop_i: + +; -- quad subelements +0, +4, +8 and +12 (simpliest operations) + fld qword[ebx] + fld qword[ebx+8*4] + fld st0 + fadd st0, st2 ; st : t1, f_4, f_0 + fxch st1 + fsubp st2, st0 ; st : t1, t2 + fld qword[ebx+8*8] + fld qword[ebx+8*12] + fld st0 + fadd st0, st2 ; st : t3, f_12, t1, t2 + fxch st1 + fsubp st2, st0 ; st : t3, t4, t1, t2 + ; ------ + fld st2 ; st : t1, t3, t4, t1, t2 + fadd st0, st1 + fstp qword[ebx] ; st : t3, t4, t1, t2 + fsub st0, st2 ; st : t3-t1, t4, t1, t2 + fchs ; st : t1-t3, t4, t1, t2 + fstp qword[ebx+8*4] ; st : t4, t1, t2 + fst st1 ; st : t4, t4, t2 + fadd st0, st2 ; st : t2+t4, t4, t2 + fstp qword[ebx+8*8] ; st : t4, t2 + fsubp st1, st0 ; st : t2-t4 + fstp qword[ebx+8*12] ; st : + +; -- even subelements +2, +6, +10 and +14 (2 multiplications needed) + fld qword[ebx+8*2] + fld qword[ebx+8*6] + fld [_root] + fmul st1, st0 ; st : r, t2, t1 + fld qword[ebx+8*10] + fxch st1 ; st : r, t3, t2, t1 + fmul qword[ebx+8*14] ; st : t4, t3, t2, t1 + ; ------ + fld st3 ; st : t1, t4, t3, t2, t1 + fadd st0, st3 ; + fadd st0, st2 ; + fst qword[ebx+8*2] ; store f[i+8] = t1+t2+t3 + fsub st0, st3 ; + fsub st0, st3 ; + fstp qword[ebx+8*10] ; store f[i+10]= t1-t2+t3 + fld st3 ; st : t1, t4, t3, t2, t1 + fsub st0, st2 ; + fsub st0, st1 ; + fst qword[ebx+8*14] ; store f[i+14]= t1-t3-t4 + fadd st0, st1 ; + faddp st1, st0 ; st : t1-t3+t4, t3, t2, t1 + fstp qword[ebx+8*6] ; store f[i+6] + fstp st0 ; st : t2, t1 + fstp st0 ; st : t1 + fstp st0 ; st : + +; -- odd subelements + fld qword[ebx+8*9] + fld qword[ebx+8*11] + fld st1 + fsub st0, st1 + fxch st1 + faddp st2, st0 ; st : (f[l3]-f[l7]), (f[l3]+f[l7]) + fld [_root2] + fmul st2, st0 + fmulp st1, st0 ; st : t9, t6 + fld qword[ebx+8*3] + fld st0 + fadd st0, st2 ; st : t1, f[l5], t9, t6 + fstp [_t1] + fsub st0, st1 + fstp [_t2] + fstp [_t9] ; (t9 never used) + fstp [_t6] ; st : + + fld [_c1] + fld [_s1] + fld qword[ebx+8*5] + fld qword[ebx+8*7] + fld st3 ; st: c1, f[l6], f[l2], s1, c1 + fmul st0, st2 ; st: f_2*c, f_6, f_2, s, c + fld st1 ; st: f_6, f_2*c, f_6, f_2, s, c + fmul st0, st4 ; st: f_6*s, f_2*c, f_6, f_2, s, c + faddp st1, st0 ; st: t5, f_6, f_2, s, c + fstp [_t5] ; st: f_6, f_2, s, c + fld st3 ; st: c, f_6, f_2, s, c + fmul st0, st1 + fld st3 + fmul st0, st3 ; st: f_2*s, f_6*c, f_6, f_2, s, c + fsubp st1, st0 ; st: t8, f_6, f_2, s, c + fstp [_t8] ; st: f_6, f_2, s, c + fstp st0 ; st: f_2, s, c + fstp st0 ; st: s, c + + fld qword[ebx+8*13] + fld qword[ebx+8*15] + fld st3 ; st: c1, f[l8], f[l4], s1, c1 + fmul st0, st1 + fld st3 + fmul st0, st3 ; st: f_4*s, f_8*c, f_8, f_4, s, c + faddp st1, st0 ; st: t7, f_8, f_4, s, c + fld [_t5] ; st: t5, t7, f_8, f_4, s, c + fsub st0, st1 ; st: t4, t7, f_8, f_4, s, c + fstp [_t4] + fstp [_t7] ; st: f_8, f_4, s, c + fld st3 ; st: c, f_8, f_4, s, c + fmul st0, st2 + fld st3 + fmul st0, st2 ; st: f_8*s, f_4*c, f_8, f_4, s, c + fsubp st1, st0 ; st:-t0, f_8, f_4, s, c + fchs + fld [_t8] + fchs ; st:-t8, t0, f_8, f_4, s, c + fsub st0, st1 ; st: t3, t0, f_8, f_4, s, c + fstp [_t3] + fstp [_t0] ; st: f_8, f_4, s, c + fstp st0 ; st: f_4, s, c + fstp st0 ; st: s, c + fstp st0 ; st: c + fstp st0 ; st: + + fld [_t1] + fld [_t4] + fld st1 + fsub st0, st1 + fstp qword[ebx+8*11] ; f[l7] = t1-t4 + faddp st1, st0 + fstp qword[ebx+8*3] ; f[l5] = t1+t4 + fld [_t2] + fld [_t3] + fld st1 + fsub st0, st1 + fstp qword[ebx+8*15] ; f[l8] + faddp st1, st0 + fstp qword[ebx+8*7] ; f[l6] + + fld [_t6] + fld qword[ebx+8] + fld st1 + fsub st0, st1 + fxch st1 + faddp st2, st0 ; st : t2, t1 + fld [_t8] + fsub [_t0] + fld [_t5] + fadd [_t7] ; st : t4, t3, t2, t1 + + fld st3 + fsub st0, st1 + fstp qword[ebx+8*9] ; f[l3] = t1-t4 + fadd st0, st3 + fstp qword[ebx+8] ; f[l1] = t1+t4 + fld st1 ; st : t2, t3, t2, t1 + fsub st0, st1 ; f[l4] = t2-t3 + fstp qword[ebx+8*13] ; st : t3, t2, t1 + faddp st1, st0 ; st : t2+t3, t1 + fstp qword[ebx+8*5] ; f[l2] = t2+t3 + fstp st0 ; st : + + add ebx, 16*8 + cmp ebx, eax + jb .loop_i + + ret + +align 8 ; shared local vars +_t0 dq 0 +_t1 dq 0 +_t2 dq 0 +_t3 dq 0 +_t4 dq 0 +_t5 dq 0 +_t6 dq 0 +_t7 dq 0 +_t8 dq 0 +_t9 dq 0 + + + +;=================================================================== +step3: +;=================================================================== + +; 283 : { + + +; 293 : for (l=3; l<=p; l++) + mov cx, 0x0200 +.newstep: + inc ch + cmp ch, Power_of_4 + jg .done + mov [.step], cx + +; 294 : { +; 295 : d1 = 1 << (l + l - 3); + + mov cl, ch + add cl, cl + sub cl, 3 + mov edx, 1 + shl edx, cl + mov [.d1], edx + +; 296 : d2 = d1 << 1; + shl edx, 1 + mov [.d2], edx + mov eax, edx + +; 297 : d3 = d2 << 1; + shl edx, 1 + mov [.d3], edx + +; 298 : d4 = d2 + d3; + add eax, edx + mov [.d4], eax + +; 299 : d5 = d3 << 1; + shl edx, 1 + mov [.d5], edx + shl edx, 3 + mov [.d6], edx ; d6 = d5*8 to simplify index operations + +; 339 : j5 = N / d5; ; moved out of internal loop + mov cl, Power_of_4 + sub cl, ch + add cl, cl + mov edx, 1 + shl edx, cl + mov [.j5], edx + +; 300 : +; 301 : for (j=0; j f[j+k] +; l2 = l1 + d2; + add edx, eax + mov [.l2], edx +; l3 = l1 + d3; + add edx, eax + mov [.l3], edx +; l4 = l1 + d4; + add edx, eax + mov [.l4], edx + +; l5 = j + d2 - k; + mov edx, eax + sub edx, ecx + mov [.l5], edx +; l6 = l5 + d2; + add edx, eax + mov [.l6], edx +; l7 = l5 + d3; + add edx, eax + mov [.l7], edx +; l8 = l5 + d4; + add edx, eax + mov [.l8], edx + + +; 340 : j5 *= k; // add-substituted multiplication + mov eax, [.jj] + add eax, [.j5] + mov [.jj], eax + +; c1 = C[jj]; +; s1 = S[jj]; + mov edi, [_Cosins] + fld qword[edi+eax*8] + mov esi, [_Sines] + fld qword[esi+eax*8] ; st : s1, c1 + +; t5 = f[l2] * c1 + f[l6] * s1; +; t8 = f[l6] * c1 - f[l2] * s1; + mov edx, [.l6] + fld qword[ebx+edx*8] + mov edx, [.l2] + fld st0 + fmul st0, st2 + fxch st1 + fmul st0, st3 + fld qword[ebx+edx*8] ; st : f[l2], f[l6]*c, f[l6]*s, s, c + fmul st4, st0 + fmulp st3, st0 ; st : f[l6]*c, f[l6]*s, f[l2]*s, f[l2]*c + fsub st0, st2 ; st : t8, f[l6]*s, f[l2]*s, f[l2]*c + fstp [_t8] + faddp st2, st0 ; st : f[l2]*s, t5 + fstp st0 ; st : t5 + fstp [_t5] ; st : + +; c2 = C[2*jj]; +; s2 = S[2*jj]; + shl eax, 1 + fld qword[edi+eax*8] + fld qword[esi+eax*8] ; st : s2, c2 + +; t6 = f[l3] * c2 + f[l7] * s2; +; t9 = f[l7] * c2 - f[l3] * s2; + mov edx, [.l7] + fld qword[ebx+edx*8] + mov edx, [.l3] + fld st0 + fmul st0, st2 + fxch st1 + fmul st0, st3 + fld qword[ebx+edx*8] ; st : f[l3], f[l7]*c, f[l7]*s, s, c + fmul st4, st0 + fmulp st3, st0 ; st : f[l7]*c, f[l7]*s, f[l3]*s, f[l3]*c + fsub st0, st2 ; st : t9, f[l7]*s, f[l3]*s, f[l3]*c + fstp [_t9] + faddp st2, st0 ; st : f[l2]*s, t6 + fstp st0 ; st : t6 + fstp [_t6] ; st : + +; c3 = C[3*jj]; +; s3 = S[3*jj]; + add eax, [.jj] + fld qword[edi+eax*8] + fld qword[esi+eax*8] ; st : s3, c3 + +; t7 = f[l4] * c3 + f[l8] * s3; +; t0 = f[l8] * c3 - f[l4] * s3; + mov edx, [.l8] + fld qword[ebx+edx*8] + mov edx, [.l4] + fld st0 + fmul st0, st2 + fxch st1 + fmul st0, st3 + fld qword[ebx+edx*8] ; st : f[l4], f[l8]*c, f[l8]*s, s, c + fmul st4, st0 + fmulp st3, st0 ; st : f[l8]*c, f[l8]*s, f[l4]*s, f[l4]*c + fsub st0, st2 ; st : t9, f[l8]*s, f[l4]*s, f[l4]*c + fstp [_t0] + faddp st2, st0 ; st : f[l2]*s, t7 + fstp st0 ; st : t7 + fstp [_t7] ; st : + +; t1 = f[l5] - t9; +; t2 = f[l5] + t9; + mov eax, [.l5] + fld qword [ebx+eax*8] + fld [_t9] + fld st0 + fadd st0, st2 + fstp [_t2] + fsubp st1, st0 + fstp [_t1] + +; t3 = - t8 - t0; + fld [_t8] + fadd [_t0] + fchs + fstp [_t3] +; t4 = t5 - t7; + fld [_t5] + fsub [_t7] + fstp [_t4] + +; f[l5] = t1 + t4; + fld [_t1] + fld [_t4] + fld st0 + fadd st0, st2 + fstp qword [ebx+eax*8] +; f[l7] = t1 - t4; + mov eax, [.l7] + fsubp st1, st0 + fstp qword [ebx+eax*8] + +; f[l6] = t2 + t3; + mov eax, [.l6] + fld [_t2] + fld [_t3] + fld st0 + fadd st0, st2 + fstp qword [ebx+eax*8] +; f[l8] = t2 - t3; + mov eax, [.l8] + fsubp st1, st0 + fstp qword [ebx+eax*8] + +; t1 = f[l1] + t6; + mov eax, [.l1] + fld qword [ebx+eax*8] + fld [_t6] + fld st0 + fadd st0, st2 + fstp [_t1] +; t2 = f[l1] - t6; + fsubp st1, st0 + fstp [_t2] + +; t3 = t8 - t0; + fld [_t8] + fsub [_t0] + fstp [_t3] +; t4 = t5 + t7; + fld [_t5] + fadd [_t7] + fstp [_t4] + +; f[l1] = t1 + t4; + mov eax, [.l1] + fld [_t1] + fld [_t4] + fld st0 + fadd st0, st2 + fstp qword [ebx+eax*8] +; f[l3] = t1 - t4; + mov eax, [.l3] + fsubp st1, st0 + fstp qword [ebx+eax*8] + +; f[l2] = t2 + t3; + mov eax, [.l2] + fld [_t2] + fld [_t3] + fld st0 + fadd st0, st2 + fstp qword [ebx+eax*8] +; f[l4] = t2 - t3; + mov eax, [.l4] + fsubp st1, st0 + fstp qword [ebx+eax*8] + +; 374 : } + jmp .next_k + +.done_k: +; 375 : } + add ebx, [.d6] ; d6 = d5*8 + cmp ebx, [.end_of_array] + jb .next_j + +; 376 : } + mov cx, [.step] + jmp .newstep +.done: + +; 377 : } + ret + +align 4 +.l1 dd 0 +.l2 dd 0 +.l3 dd 0 +.l4 dd 0 +.l5 dd 0 +.l6 dd 0 +.l7 dd 0 +.l8 dd 0 +.l9 dd 0 +.l0 dd 0 +.d1 dd 0 +.d2 dd 0 +.d3 dd 0 +.d4 dd 0 +.d5 dd 0 +.d6 dd 0 +.j5 dd 0 +.jj dd 0 +.end_of_array dd 0 +.step dw 0 + +align 8 + + ;=========== Step3 ends here =========== + + + + + +; ================================================================= +; syscall entry +; +_f dd ? +_N dd 1024 ; number of points + +_a dd ? ; initial data array +x dd 0 ; tranformed (float) data array +_Cosins dd 0 +_Sines dd 0 + +FFT4: + or al, al + jnz .trans + mov cl, Power_of_4 + mov eax, 1 + shl eax, cl + shl eax, cl + mov [_N], eax + shl eax, 2 ; size of Sine table in bytes + add eax, ebx + mov [_Sines], ebx + mov [_Cosins], eax + cpuid + rdtsc + mov [.time], eax + call MakeSinCosTable + cpuid + rdtsc + sub eax, [.time] + ret +.trans: + mov [x], ebx + mov [_f], ebx + cli ;----- + cpuid + rdtsc + mov [.time], eax + call BitInvert + call step1 + call step2 + call step3 + cpuid + rdtsc + sti ;---- + sub eax, [.time] + ret + +.time dd 0