a test version with FFT library inside kernel

git-svn-id: svn://kolibrios.org@1626 a494cfbc-eb01-0410-851d-a64ba20cac60
2010-09-26 16:16:10 +00:00 · 2010-09-26 16:16:10 +00:00 · 8c44a1652a
commit 8c44a1652a
parent 2e0a186ba0
3 changed files with 766 additions and 5 deletions
--- a/kernel/branches/Kolibri-A/trunk/core/syscall.inc
+++ b/kernel/branches/Kolibri-A/trunk/core/syscall.inc
@ -76,10 +76,10 @@ align 32
 syscall_entry:
 ;       push    ecx
 	  sti
-        xor   eax, 3
+        and   eax, 3
        call    dword [servetable3 + eax * 4]
-;       pop     ecx
+ ;      pop     ecx
        sysret
 iglobal
@ -187,8 +187,8 @@ iglobal
 align 4
 servetable3:
-	dd 	paleholder	; 0
+	dd 	FFT4	; 0
-	dd 	paleholder	; 1
+	dd 	FFT4	; 1
 	dd 	paleholder	; 2
 	dd 	sys_end	; last
--- a/kernel/branches/Kolibri-A/trunk/kernel32.inc
+++ b/kernel/branches/Kolibri-A/trunk/kernel32.inc
@ -243,7 +243,8 @@ include "fs/ext2.inc"     ; read / write for ext2 filesystem
 ; sound
-include "sound/playnote.inc" ; player Note for Speaker PC
+include "sound/playnote.inc" 	; player Note for Speaker PC
 include "sound/FFT.inc" 	; fast Fourier transform routines
 ; display
--- a/kernel/branches/Kolibri-A/trunk/sound/FFT.inc
+++ b/kernel/branches/Kolibri-A/trunk/sound/FFT.inc
@ -0,0 +1,760 @@
 ;           Fast Hartley Transform routine
 ;           Copyright (C) 1999, 2004, 2010
 ;          Artem Jerdev  artem@jerdev.co.uk
 ;
 ; free KolibriOS version - not to be ported to other OSes
 ; ==========================================================
 Power_of_4	equ	5
 NumPoints	equ	1024
 N_2		equ	NumPoints / 2
 N_4		equ	NumPoints / 4
 ;=================================================================
 ; global constants
 align 8
 _root		dq	1.41421356237309504880169	; = sqrt(2)
 _root2	dq	0.70710678118654752440084	; = sqrt(2)/2
 _c1		dq	0.92387953251128675612818	; = cos(pi/8)
 _s1		dq	0.38268343236508977172846	; = sin(pi/8)
 _dx		dq	0.00613592315154296875		; pi/512
 ;[_CosTable]	dd	0 ; N_2 elements
 ;[_SinTable]	dd	0 ; N_2 elements
 ;; ==========================
 align 4
 MakeSinCosTable:
 	mov	ebx, [_Sines]
 	mov	ecx, [_Cosins]
 	xor	eax,  eax
 	fld	[_dx]			; st : dx
 	fldz				; st : 0, dx
 .loop:
 	fld	st0			; st : x, x, dx
 	FSINCOS 			; st : cos, sin, x, dx
 	fstp	qword [ecx+eax*8]	; st : sin, x, dx
 	fstp	qword [ebx+eax*8]	; st : x, dx
 	fadd	st0, st1		; st : x+dx, dx
 	inc	eax
 	cmp	eax, N_2
 	jne	.loop
 	fstp	st0			; st : dx
 	fstp	st0			; st : <empty>
 	ret
 ; ================================================================
 align 4
 BitInvert:
 	mov	esi, [x]	; array of qwords
 	xor	ecx, ecx	; index term
 .newterm:
 	inc	ecx
 	cmp	ecx, NumPoints
 	jge	.done
 	xor	eax, eax
 	mov	edx, ecx
 	xor	bl, bl
 .do_invert:
 	inc	bl
 	cmp	bl, Power_of_4
 	jg	.switch
 	mov	bh, dl
 	and	bh,  3
 	shl	eax, 2
 	or	al, bh
 	shr	edx, 2
 	jmp	.do_invert
 .switch:
 	cmp	eax, ecx
 	jle	.newterm
 	fld	qword [esi+eax*8]
 	fld	qword [esi+ecx*8]
 	fstp	qword [esi+eax*8]
 	fstp	qword [esi+ecx*8]
 	jmp	.newterm
 .done:
 	ret
 ;=================================================================
 align 4
 step1:
 	mov	esi, [x]
 	mov	ebx, esi
 	add	esi, NumPoints*8
 .loop:
 	fld	qword[ebx]
 	fld	qword[ebx+8]
 	fld	st1
 	fsub	st0, st1	; st : t2, f[i+1], f[i]
 	fxch	st1		; st : f[i+1], t2, f[i]
 	faddp	st2, st0	; st : t2, t1
 	fld	qword[ebx+16]
 	fld	qword[ebx+24]
 	fld	st1		; st : f[i+2], f[i+3], f[i+2], t2, t1
 	fadd	st0, st1	; st : t3, f[i+3], f[i+2], t2, t1
 	fxch	st2		; st : f[i+2], f[i+3], t3, t2, t1
 	fsub	st0, st1	; st : t4, f[i+3], t3, t2, t1
 	fstp	st1		; st : t4, t3, t2, t1
 	fld	st2		; st : t2, t4, t3, t2, t1
 	fadd	st0, st1	; st : t2+t4, t4, t3, t2, t1
 	fstp	qword[ebx+16]	; st : t4, t3, t2, t1
 	fsubp	st2, st0	; st : t3, t2-t4, t1
 	fld	st2		; st : t1, t3, t2-t4, t1
 	fadd	st0, st1	; st : t1+t3, t3, t2-t4, t1
 	fstp	qword[ebx]	; st : t3, t2-t4, t1
 	fsubp	st2, st0	; st : t2-t4, t1-t3
 	fstp	qword[ebx+24]	; st : t1-t3
 	fstp	qword[ebx+8]	; st : <empty>
 	add	ebx, 32
 	cmp	ebx, esi
 	jnz	.loop
 	ret
 ;
 ;===========================================================================
 step2:				; Step2
 ;===========================================================================
 	mov	eax, [_f]
 	mov	ebx, eax
 	add	eax, NumPoints*8
 .loop_i:
 ; -- quad subelements  +0, +4, +8 and +12 (simpliest operations)
 	fld	qword[ebx]
 	fld	qword[ebx+8*4]
 	fld	st0
 	fadd	st0, st2	; st : t1, f_4, f_0
 	fxch	st1
 	fsubp	st2, st0	; st : t1, t2
 	fld	qword[ebx+8*8]
 	fld	qword[ebx+8*12]
 	fld	st0
 	fadd	st0, st2	; st : t3, f_12, t1, t2
 	fxch	st1
 	fsubp	st2, st0	; st : t3, t4, t1, t2
 	; ------
 	fld	st2		; st : t1, t3, t4, t1, t2
 	fadd	st0, st1
 	fstp	qword[ebx]	; st : t3, t4, t1, t2
 	fsub	st0, st2	; st : t3-t1, t4, t1, t2
 	fchs			; st : t1-t3, t4, t1, t2
 	fstp	qword[ebx+8*4]	; st : t4, t1, t2
 	fst	st1		; st : t4, t4, t2
 	fadd	st0, st2	; st : t2+t4, t4, t2
 	fstp	qword[ebx+8*8]	; st : t4, t2
 	fsubp	st1, st0	; st : t2-t4
 	fstp	qword[ebx+8*12] ; st : <empty>
 ; -- even subelements  +2, +6, +10 and +14 (2 multiplications needed)
 	fld	qword[ebx+8*2]
 	fld	qword[ebx+8*6]
 	fld	[_root]
 	fmul	st1, st0	; st : r, t2, t1
 	fld	qword[ebx+8*10]
 	fxch	st1		; st : r, t3, t2, t1
 	fmul	qword[ebx+8*14] ; st : t4, t3, t2, t1
 	; ------
 	fld	st3		; st : t1, t4, t3, t2, t1
 	fadd	st0, st3	;
 	fadd	st0, st2	;
 	fst	qword[ebx+8*2]	; store f[i+8] = t1+t2+t3
 	fsub	st0, st3	;
 	fsub	st0, st3	;
 	fstp	qword[ebx+8*10] ; store f[i+10]= t1-t2+t3
 	fld	st3		; st : t1, t4, t3, t2, t1
 	fsub	st0, st2	;
 	fsub	st0, st1	;
 	fst	qword[ebx+8*14] ; store f[i+14]= t1-t3-t4
 	fadd	st0, st1	;
 	faddp	st1, st0	; st : t1-t3+t4, t3, t2, t1
 	fstp	qword[ebx+8*6]	; store f[i+6]
 	fstp	st0		; st : t2, t1
 	fstp	st0		; st : t1
 	fstp	st0		; st : <empty>
 ; -- odd subelements
 	fld	qword[ebx+8*9]
 	fld	qword[ebx+8*11]
 	fld	st1
 	fsub	st0, st1
 	fxch	st1
 	faddp	st2, st0	; st : (f[l3]-f[l7]), (f[l3]+f[l7])
 	fld	[_root2]
 	fmul	st2, st0
 	fmulp	st1, st0	; st : t9, t6
 	fld	qword[ebx+8*3]
 	fld	st0
 	fadd	st0, st2	; st : t1, f[l5], t9, t6
 	fstp	[_t1]
 	fsub	st0, st1
 	fstp	[_t2]
 	fstp	[_t9]	; (t9 never used)
 	fstp	[_t6]		; st : <empty>
 	fld	[_c1]
 	fld	[_s1]
 	fld	qword[ebx+8*5]
 	fld	qword[ebx+8*7]
 	fld	st3		; st: c1, f[l6], f[l2], s1, c1
 	fmul	st0, st2	; st: f_2*c, f_6, f_2, s, c
 	fld	st1		; st: f_6, f_2*c, f_6, f_2, s, c
 	fmul	st0, st4	; st: f_6*s, f_2*c, f_6, f_2, s, c
 	faddp	st1, st0	; st: t5, f_6, f_2, s, c
 	fstp	[_t5]		; st: f_6, f_2, s, c
 	fld	st3		; st: c, f_6, f_2, s, c
 	fmul	st0, st1
 	fld	st3
 	fmul	st0, st3	; st: f_2*s, f_6*c, f_6, f_2, s, c
 	fsubp	st1, st0	; st: t8, f_6, f_2, s, c
 	fstp	[_t8]		; st: f_6, f_2, s, c
 	fstp	st0		; st: f_2, s, c
 	fstp	st0		; st: s, c
 	fld	qword[ebx+8*13]
 	fld	qword[ebx+8*15]
 	fld	st3		; st: c1, f[l8], f[l4], s1, c1
 	fmul	st0, st1
 	fld	st3
 	fmul	st0, st3	; st: f_4*s, f_8*c, f_8, f_4, s, c
 	faddp	st1, st0	; st: t7, f_8, f_4, s, c
 	fld	[_t5]		; st: t5, t7, f_8, f_4, s, c
 	fsub	st0, st1	; st: t4, t7, f_8, f_4, s, c
 	fstp	[_t4]
 	fstp	[_t7]		; st: f_8, f_4, s, c
 	fld	st3		; st: c, f_8, f_4, s, c
 	fmul	st0, st2
 	fld	st3
 	fmul	st0, st2	; st: f_8*s, f_4*c, f_8, f_4, s, c
 	fsubp	st1, st0	; st:-t0, f_8, f_4, s, c
 	fchs
 	fld	[_t8]
 	fchs			; st:-t8, t0, f_8, f_4, s, c
 	fsub	st0, st1	; st: t3, t0, f_8, f_4, s, c
 	fstp	[_t3]
 	fstp	[_t0]		; st: f_8, f_4, s, c
 	fstp	st0		; st: f_4, s, c
 	fstp	st0		; st: s, c
 	fstp	st0		; st: c
 	fstp	st0		; st: <empty>
 	fld	[_t1]
 	fld	[_t4]
 	fld	st1
 	fsub	st0, st1
 	fstp	qword[ebx+8*11] ; f[l7] = t1-t4
 	faddp	st1, st0
 	fstp	qword[ebx+8*3]	; f[l5] = t1+t4
 	fld	[_t2]
 	fld	[_t3]
 	fld	st1
 	fsub	st0, st1
 	fstp	qword[ebx+8*15] ; f[l8]
 	faddp	st1, st0
 	fstp	qword[ebx+8*7]	; f[l6]
 	fld	[_t6]
 	fld	qword[ebx+8]
 	fld	st1
 	fsub	st0, st1
 	fxch	st1
 	faddp	st2, st0	; st : t2, t1
 	fld	[_t8]
 	fsub	[_t0]
 	fld	[_t5]
 	fadd	[_t7]		; st : t4, t3, t2, t1
 	fld	st3
 	fsub	st0, st1
 	fstp	qword[ebx+8*9]	; f[l3] = t1-t4
 	fadd	st0, st3
 	fstp	qword[ebx+8]	; f[l1] = t1+t4
 	fld	st1		; st : t2, t3, t2, t1
 	fsub	st0, st1	; f[l4] = t2-t3
 	fstp	qword[ebx+8*13] ; st : t3, t2, t1
 	faddp	st1, st0	; st : t2+t3, t1
 	fstp	qword[ebx+8*5]	; f[l2] = t2+t3
 	fstp	st0		; st : <empty>
 	add	ebx, 16*8
 	cmp	ebx, eax
 	jb	.loop_i
 	ret
 align 8 	; shared local vars
 _t0	dq	0
 _t1	dq	0
 _t2	dq	0
 _t3	dq	0
 _t4	dq	0
 _t5	dq	0
 _t6	dq	0
 _t7	dq	0
 _t8	dq	0
 _t9	dq	0
 ;===================================================================
 step3:
 ;===================================================================
 ; 283  : {
 ; 293  :   for (l=3; l<=p; l++)
 	mov	cx, 0x0200
 .newstep:
 	inc	ch
 	cmp	ch, Power_of_4
 	jg	.done
 	mov	[.step], cx
 ; 294  :   {
 ; 295  :     d1 = 1 << (l + l - 3);
 	mov	cl, ch
 	add	cl, cl
 	sub	cl, 3
 	mov	edx, 1
 	shl	edx, cl
 	mov	[.d1], edx
 ; 296  :     d2 = d1 << 1;
 	shl	edx, 1
 	mov	[.d2], edx
 	mov	eax, edx
 ; 297  :     d3 = d2 << 1;
 	shl	edx, 1
 	mov	[.d3], edx
 ; 298  :     d4 = d2 + d3;
 	add	eax, edx
 	mov	[.d4], eax
 ; 299  :     d5 = d3 << 1;
 	shl	edx, 1
 	mov	[.d5], edx
 	shl	edx, 3
 	mov	[.d6], edx	; d6 = d5*8 to simplify index operations
 ; 339  :         j5 = N / d5;   ; moved out of internal loop
 	mov	cl, Power_of_4
 	sub	cl, ch
 	add	cl, cl
 	mov	edx, 1
 	shl	edx, cl
 	mov	[.j5], edx
 ; 300  :
 ; 301  :     for (j=0; j<N; j+=d5)
 	mov	esi, [_f]
 	mov	ebx, esi
 	add	esi, NumPoints*8
 	mov	[.end_of_array], esi
 .next_j:
 ; {
 ; t1 = f[j] + f[j+d2];
 	mov	eax, [.d2]
 	fld	qword[ebx]
 	fld	qword[ebx+eax*8]
 	fld	st1
 	fadd	st0, st1
 	fstp	[_t1]
 ; t2 = f[j] - f[j+d2];
 	fsubp	st1, st0
 	fstp	[_t2]
 ; t3 = f[j+d3] + f[j+d4];
 	mov	edi, [.d3]
 	fld	qword[ebx+edi*8]
 	mov	edx, [.d4]
 	fld	qword[ebx+edx*8]
 	fld	st1
 	fsub	st0, st1		; st : t4, f4, f3
 	fxch	st1			; st : f4, t4, f3
 ; t4 = f[j+d3] - f[j+d4];
 	faddp	st2, st0		; st : t4, t3
 ; f[j+d4] = t2 - t4;
 ; f[j+d3] = t2 + t4;
 	fld	[_t2]
 	fld	st0
 	fsub	st0, st2		; st : f4, t2, t4, t3
 	fstp	qword[ebx+edx*8]	; st : t2, t4, t3
 	fadd	st0, st1		; st : f3, t4, t3
 	fstp	qword[ebx+edi*8]	; st : t4, t3
 ; f[j+d2] = t1 - t3;
 ; f[j]    = t1 + t3;
 	fld	[_t1]
 	fst	st1
 	fsub	st0, st2		; st : f2, t1, t3
 	fstp	qword[ebx+eax*8]	; st : t1, t3
 	fadd	st0, st1		; st : f0, t3
 	fstp	qword[ebx]		; st : t3
 	fstp	st0
 ; jj = j + d1;     / ??
 	mov	edi, [.d1]
 	shl	edi, 3		; = d1*8
 	mov	edx, edi
 	mov	eax, edi
 	add	eax, eax	; eax = d2*8
 	shl	edx, 2		; = d3*8
 	add	edi, ebx	; now [edi] points to f[jj]
 	add	edx, edi	; and [edx] points to f[jj+d3]
 ; t1 = f[jj];
 	fld	qword [edi]	; st : t1
 ; t3 = f[jj+d3];
 	fld	qword [edx]	; st : t3, t1
 ; t2 = f[jj+d2] * r;
 	fld	qword [edi+eax]
 	fld	[_root]
 	fmul	st1, st0	; st : r,  t2, t3, t1
 ; t4 = f[jj+d4] * r
 	fmul	qword [edx+eax] ; st : t4, t2, t3, t1
 ; f[jj]    = t1 + t2 + t3;
 	fld	st3		; st : t1, t4, t2, t3, t1
 	fadd	st0, st3
 	fadd	st0, st2
 	fstp	qword [edi]
 ; f[jj+d2] = t1 - t3 + t4;
 	fld	st3
 	fsub	st0, st3	; st : (t1-t3), t4, t2, t3, t1
 	fld	st0
 	fadd	st0, st2	; st : f2, (t1-t3), t4, t2, t3, t1
 	fstp	qword [edi+eax]
 ; f[jj+d4] = t1 - t3 - t4;
 	fsub	st0, st1	; st : f4, t4, t2, t3, t1
 	fstp	qword [edx+eax]
 ; f[jj+d3] = t1 - t2 + t3;
 	fstp	st0		; st : t2, t3,  t1
 	fsubp	st1, st0	; st : (t3-t2), t1
 	faddp	st1, st0	; st : f3
 	fstp	qword [edx]
 ; for (k=1; k<d1; k++)
 	xor	ecx, ecx	; ecx = k
 	mov	[.jj], ecx
 .next_k:
 	inc	ecx
 	cmp	ecx, [.d1]
 	jge	.done_k
 ; {
 	mov	eax, [.d2]	; the sector increment
 ; l1 = j  + k;
 	mov	edx, ecx
 	mov	[.l1], edx	; [ebx+edx*8] --> f[j+k]
 ; l2 = l1 + d2;
 	add	edx, eax
 	mov	[.l2], edx
 ; l3 = l1 + d3;
 	add	edx, eax
 	mov	[.l3], edx
 ; l4 = l1 + d4;
 	add	edx, eax
 	mov	[.l4], edx
 ; l5 = j  + d2 - k;
 	mov	edx, eax
 	sub	edx, ecx
 	mov	[.l5], edx
 ; l6 = l5 + d2;
 	add	edx, eax
 	mov	[.l6], edx
 ; l7 = l5 + d3;
 	add	edx, eax
 	mov	[.l7], edx
 ; l8 = l5 + d4;
 	add	edx, eax
 	mov	[.l8], edx
 ; 340  :         j5 *= k;       // add-substituted multiplication
 	mov	eax, [.jj]
 	add	eax, [.j5]
 	mov	[.jj], eax
 ; c1 = C[jj];
 ; s1 = S[jj];
 	mov	edi, [_Cosins]
 	fld	qword[edi+eax*8]
 	mov	esi, [_Sines]
 	fld	qword[esi+eax*8]	; st : s1, c1
 ; t5 = f[l2] * c1 + f[l6] * s1;
 ; t8 = f[l6] * c1 - f[l2] * s1;
 	mov	edx, [.l6]
 	fld	qword[ebx+edx*8]
 	mov	edx, [.l2]
 	fld	st0
 	fmul	st0, st2
 	fxch	st1
 	fmul	st0, st3
 	fld	qword[ebx+edx*8]	; st : f[l2], f[l6]*c, f[l6]*s, s, c
 	fmul	st4, st0
 	fmulp	st3, st0		; st : f[l6]*c, f[l6]*s, f[l2]*s, f[l2]*c
 	fsub	st0, st2		; st :   t8,    f[l6]*s, f[l2]*s, f[l2]*c
 	fstp	[_t8]
 	faddp	st2, st0		; st :  f[l2]*s, t5
 	fstp	st0			; st :  t5
 	fstp	[_t5]			; st :  <empty>
 ; c2 = C[2*jj];
 ; s2 = S[2*jj];
 	shl	eax, 1
 	fld	qword[edi+eax*8]
 	fld	qword[esi+eax*8]	; st : s2, c2
 ; t6 = f[l3] * c2 + f[l7] * s2;
 ; t9 = f[l7] * c2 - f[l3] * s2;
 	mov	edx, [.l7]
 	fld	qword[ebx+edx*8]
 	mov	edx, [.l3]
 	fld	st0
 	fmul	st0, st2
 	fxch	st1
 	fmul	st0, st3
 	fld	qword[ebx+edx*8]	; st : f[l3], f[l7]*c, f[l7]*s, s, c
 	fmul	st4, st0
 	fmulp	st3, st0		; st : f[l7]*c, f[l7]*s, f[l3]*s, f[l3]*c
 	fsub	st0, st2		; st :   t9,    f[l7]*s, f[l3]*s, f[l3]*c
 	fstp	[_t9]
 	faddp	st2, st0		; st :  f[l2]*s, t6
 	fstp	st0			; st :  t6
 	fstp	[_t6]			; st :  <empty>
 ; c3 = C[3*jj];
 ; s3 = S[3*jj];
 	add	eax, [.jj]
 	fld	qword[edi+eax*8]
 	fld	qword[esi+eax*8]	; st : s3, c3
 ; t7 = f[l4] * c3 + f[l8] * s3;
 ; t0 = f[l8] * c3 - f[l4] * s3;
 	mov	edx, [.l8]
 	fld	qword[ebx+edx*8]
 	mov	edx, [.l4]
 	fld	st0
 	fmul	st0, st2
 	fxch	st1
 	fmul	st0, st3
 	fld	qword[ebx+edx*8]	; st : f[l4], f[l8]*c, f[l8]*s, s, c
 	fmul	st4, st0
 	fmulp	st3, st0		; st : f[l8]*c, f[l8]*s, f[l4]*s, f[l4]*c
 	fsub	st0, st2		; st :   t9,    f[l8]*s, f[l4]*s, f[l4]*c
 	fstp	[_t0]
 	faddp	st2, st0		; st : f[l2]*s, t7
 	fstp	st0			; st :  t7
 	fstp	[_t7]			; st :  <empty>
 ; t1 = f[l5] - t9;
 ; t2 = f[l5] + t9;
 	mov	eax, [.l5]
 	fld	qword [ebx+eax*8]
 	fld	[_t9]
 	fld	st0
 	fadd	st0, st2
 	fstp	[_t2]
 	fsubp	st1, st0
 	fstp	[_t1]
 ; t3 = - t8  - t0;
 	fld	[_t8]
 	fadd	[_t0]
 	fchs
 	fstp	[_t3]
 ; t4 =   t5  - t7;
 	fld	[_t5]
 	fsub	[_t7]
 	fstp	[_t4]
 ; f[l5] = t1 + t4;
 	fld	[_t1]
 	fld	[_t4]
 	fld	st0
 	fadd	st0, st2
 	fstp	qword [ebx+eax*8]
 ; f[l7] = t1 - t4;
 	mov	eax, [.l7]
 	fsubp	st1, st0
 	fstp	qword [ebx+eax*8]
 ; f[l6] = t2 + t3;
 	mov	eax, [.l6]
 	fld	[_t2]
 	fld	[_t3]
 	fld	st0
 	fadd	st0, st2
 	fstp	qword [ebx+eax*8]
 ; f[l8] = t2 - t3;
 	mov	eax, [.l8]
 	fsubp	st1, st0
 	fstp	qword [ebx+eax*8]
 ; t1 = f[l1] + t6;
 	mov	eax, [.l1]
 	fld	qword [ebx+eax*8]
 	fld	[_t6]
 	fld	st0
 	fadd	st0, st2
 	fstp	[_t1]
 ; t2 = f[l1] - t6;
 	fsubp	st1, st0
 	fstp	[_t2]
 ; t3 =    t8 - t0;
 	fld	[_t8]
 	fsub	[_t0]
 	fstp	[_t3]
 ; t4 =    t5 + t7;
 	fld	[_t5]
 	fadd	[_t7]
 	fstp	[_t4]
 ; f[l1] = t1 + t4;
 	mov	eax, [.l1]
 	fld	[_t1]
 	fld	[_t4]
      fld     st0
 	fadd	st0, st2
 	fstp	qword [ebx+eax*8]
 ; f[l3] = t1 - t4;
 	mov	eax, [.l3]
 	fsubp	st1, st0
 	fstp	qword [ebx+eax*8]
 ; f[l2] = t2 + t3;
 	mov	eax, [.l2]
 	fld	[_t2]
 	fld	[_t3]
 	fld	st0
 	fadd	st0, st2
 	fstp	qword [ebx+eax*8]
 ; f[l4] = t2 - t3;
 	mov	eax, [.l4]
 	fsubp	st1, st0
 	fstp	qword [ebx+eax*8]
 ; 374  :       }
 	jmp	.next_k
 .done_k:
 ; 375  :     }
 	add	ebx, [.d6]	; d6 = d5*8
 	cmp	ebx, [.end_of_array]
 	jb	.next_j
 ; 376  :   }
 	mov	cx, [.step]
 	jmp	.newstep
 .done:
 ; 377  : }
 	ret
 align 4
 .l1   dd	0
 .l2   dd	0
 .l3   dd	0
 .l4   dd	0
 .l5   dd	0
 .l6   dd	0
 .l7   dd	0
 .l8   dd	0
 .l9   dd	0
 .l0   dd	0
 .d1   dd	0
 .d2   dd	0
 .d3   dd	0
 .d4   dd	0
 .d5   dd	0
 .d6   dd	0
 .j5   dd	0
 .jj   dd	0
 .end_of_array	dd	  0
 .step dw	0
 align 8
 		;=========== Step3 ends here ===========
 ; =================================================================
 ;	syscall entry
 ;
 _f  dd ?
 _N  dd 1024	; number of points
 _a		dd ?		; initial   data array
 x      	dd 0    	; tranformed (float) data array
 _Cosins 	dd 0
 _Sines	dd 0
 FFT4:
 	or	al, al
 	jnz	.trans
 	mov	cl, Power_of_4
 	mov	eax, 1
 	shl	eax, cl
 	shl	eax, cl
 	mov	[_N], eax
 	shl	eax, 2	; size of Sine table in bytes
 	add	eax, ebx
 	mov 	[_Sines], ebx
 	mov	[_Cosins], eax
 	cpuid
 	rdtsc
 	mov	[.time], eax
 	call	MakeSinCosTable
 	cpuid
 	rdtsc
 	sub	eax, [.time]
 	ret
 .trans:
 	mov	[x], 	ebx
 	mov	[_f], ebx
 	cli	;-----
 	cpuid
 	rdtsc
 	mov	[.time], eax
 	call BitInvert
 	call	step1
 	call	step2
 	call	step3
 	cpuid
 	rdtsc
 	sti	;----
 	sub	eax, [.time]
 	ret
 .time	dd	0