an experimental kernel with a mad syscall and FHT inside

git-svn-id: svn://kolibrios.org@1641 a494cfbc-eb01-0410-851d-a64ba20cac60
2010-10-03 23:11:42 +00:00 · 2010-10-03 23:11:42 +00:00 · 112a3665cb
commit 112a3665cb
parent abbc09c677
5 changed files with 661 additions and 479 deletions
--- a/kernel/branches/Kolibri-A/trunk/bus/pci/PCIe.inc
+++ b/kernel/branches/Kolibri-A/trunk/bus/pci/PCIe.inc
@ -76,9 +76,6 @@ pci_ext_config:
 	shl	eax, 8
 	test	eax, 0x000F0000 	; MMIO Base must be bus0-aligned
 	jnz	.no_pcie_cfg
-
-;       -- it looks like a true PCIe config space;
-
 	ret	; <<<<<<<<<<< OK >>>>>>>>>>>

 .no_pcie_cfg:
@ -92,6 +89,7 @@ pci_ext_config:
 .pcie_failed:
 	mov	esi, boot_pcie_fail
 	call	boot_log
+	xor	eax, eax
 	ret	; <<<<<<<<< FAILURE >>>>>>>>>


--- a/kernel/branches/Kolibri-A/trunk/core/syscall.inc
+++ b/kernel/branches/Kolibri-A/trunk/core/syscall.inc
@ -1,4 +1,4 @@
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;                                                              ;;
 ;; Copyright (C) KolibriOS team 2004-2007. All rights reserved. ;;
 ;; Distributed under terms of the GNU General Public License    ;;
@ -21,37 +21,6 @@ cross_order:
 	call	dword [servetable+edi*4]
 	ret

-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;                                                            ;;
-;;                     SYSENTER ENTRY                         ;;
-;;                                  (not used on AMD systems) ;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-;align 32
-;sysenter_entry:
-;        ; Íàñòðàèâàåì ñòåê
-;        mov     esp, [ss:tss._esp0]
-;        sti
-;        push    ebp                     ; save app esp + 4
-;        mov     ebp, [ebp]              ; ebp - original ebp
-;        ;------------------
-;        pushad
-;        cld
-;
-;        movzx   eax, al
-;        call    dword [servetable2 + eax * 4]
-
-;       popad
-;       ;------------------
-;       xchg    ecx, [ss:esp]           ; â âåðøèí ñòåêà - app ecx, ecx - app esp + 4
-;        sub     ecx, 4
-;        xchg    edx, [ecx]              ; edx - return point, & save original edx
-;        push    edx
-;        mov     edx, [ss:esp + 4]
-;        mov     [ecx + 4], edx          ; save original ecx
-;        pop     edx
-;        sysexit
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;                                                            ;;
 ;;                   SYSTEM CALL ENTRY                        ;;
@ -69,17 +38,17 @@ i40:

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;                                                            ;;
-;;                     SYSCALL ENTRY                          ;;
+;;               SYSCALL ENTRY   --    NEW  !!!               ;;
 ;;                                                            ;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
 align 32
 syscall_entry:
-;       push    ecx
-	  sti
+;        sti			; (sti stays commented out in this revision)
+	push ecx			; ecx holds the return address sysret will use (AMD SYSCALL/SYSRET), so save it around the handler; NOTE(review): esp is not reloaded here, so this push appears to use the caller's stack - confirm
 	and   eax, 3
 	call	dword [servetable3 + eax * 4]
-
- ;      pop     ecx
+	pop	ecx			; restore the value saved on entry before returning
 	sysret

 iglobal
@ -89,7 +58,6 @@ iglobal

  align 4
  servetable:
-
      dd socket 		 ; 53-Socket interface
      dd 0
      dd 0
@ -187,8 +155,8 @@ iglobal
 align 4
 servetable3:

-	dd 	FFT4	; 0
-	dd 	FFT4	; 1
+	dd	FHT_4	 ; 0 - FHT_4 handler (reached through syscall_entry with eax = 0)
+	dd	FHT_4	 ; 1 - same handler as entry 0
 	dd	paleholder	; 2
 	dd	sys_end ; last

--- a/kernel/branches/Kolibri-A/trunk/kernel32.inc
+++ b/kernel/branches/Kolibri-A/trunk/kernel32.inc
@ -16,30 +16,6 @@
 $Revision$


-;struc db [a] { common . db a
-;  if ~used .
-;    display 'not used db: ',`.,13,10
-;  end if }
-;struc dw [a] { common . dw a
-;  if ~used .
-;    display 'not used dw: ',`.,13,10
-;  end if }
-;struc dd [a] { common . dd a
-;  if ~used .
-;    display 'not used dd: ',`.,13,10
-;  end if }
-;struc dp [a] { common . dp a
-;  if ~used .
-;    display 'not used dp: ',`.,13,10
-;  end if }
-;struc dq [a] { common . dq a
-;  if ~used .
-;    display 'not used dq: ',`.,13,10
-;  end if }
-;struc dt [a] { common . dt a
-;  if ~used .
-;    display 'not used dt: ',`.,13,10
-;  end if }

 struc POINT {
  .x dd ?
@ -244,7 +220,7 @@ include "fs/ext2.inc"     ; read / write for ext2 filesystem
 ; sound

 include "sound/playnote.inc"	; player Note for Speaker PC
-include "sound/FFT.inc" 	; fast Fourier transform routines
+include "sound/FHT.inc" 	; fast Hartley transform (FHT) routines

 ; display

@ -311,3 +287,4 @@ include "core/ext_lib.inc"

 ; list of external functions
 include "imports.inc"
+
--- a/kernel/branches/Kolibri-A/trunk/sound/FHT.INC
+++ b/kernel/branches/Kolibri-A/trunk/sound/FHT.INC
@ -5,62 +5,51 @@
 ; free KolibriOS version - not to be ported to other OSes
 ; ==========================================================

-Power_of_4	equ	5
-NumPoints	equ	1024
-N_2		equ	NumPoints / 2
-N_4		equ	NumPoints / 4

-;=================================================================
 ; global constants
 align 8
-_root		dq	1.41421356237309504880169	; = sqrt(2)
-_root2	dq	0.70710678118654752440084	; = sqrt(2)/2
-_c1		dq	0.92387953251128675612818	; = cos(pi/8)
-_s1		dq	0.38268343236508977172846	; = sin(pi/8)
-_dx		dq	0.00613592315154296875		; pi/512
+fht_r	   dq	   1.41421356237309504880169	   ; = sqrt(2)
+fht_r2	   dq	   0.70710678118654752440084	   ; = sqrt(2)/2
+fht_c1	   dq	   0.92387953251128675612818	   ; = cos(pi/8)
+fht_s1	   dq	   0.38268343236508977172846	   ; = sin(pi/8)

-;[_CosTable]	dd	0 ; N_2 elements
-;[_SinTable]	dd	0 ; N_2 elements

+;=================================================================
+; parameter1:
+; -- reg  dl (bits[3:0])   = Power_of_4
+; -- reg edx && (-16) = 4k-aligned data array address
+; returns:
+; -- edx = Power_of_4
+; -- ecx = N
+; destroys:
+; -- eax, ebx, ecx, edx, esi
 ;; ==========================
 align 4
-MakeSinCosTable:
-	mov	ebx, [_Sines]
-	mov	ecx, [_Cosins]
-	xor	eax,  eax
-	fld	[_dx]			; st : dx
-	fldz				; st : 0, dx
-.loop:
-	fld	st0			; st : x, x, dx
-	FSINCOS 			; st : cos, sin, x, dx
-	fstp	qword [ecx+eax*8]	; st : sin, x, dx
-	fstp	qword [ebx+eax*8]	; st : x, dx
-	fadd	st0, st1		; st : x+dx, dx
-
-	inc	eax
-	cmp	eax, N_2
-	jne	.loop
-	fstp	st0			; st : dx
-	fstp	st0			; st : <empty>
-	ret
-
-; ================================================================
-align 4
 BitInvert:
-	mov	esi, [x]	; array of qwords
+	mov	esi, edx
+	and	esi, 0xFFFFFFF0
+	and	edx, 0x0F
+	push	edx
+	mov	cl, dl
+	xor	eax, eax
+	inc	eax
+	shl	eax, cl
+	shl	eax, cl
+	push	eax
 	xor	ecx, ecx		; index term
+align 4
 .newterm:
 	inc	ecx
-	cmp	ecx, NumPoints
+	cmp	ecx, [esp]		; N
 	jge	.done

 	xor	eax, eax
 	mov	edx, ecx
 	xor	bl, bl
-
+align 4
 .do_invert:
 	inc	bl
-	cmp	bl, Power_of_4
+	cmp	bl, byte[esp+4] ; Power_of_4
 	jg	.switch

 	mov	bh, dl
@ -69,6 +58,7 @@ BitInvert:
 	or	al, bh
 	shr	edx, 2
 	jmp	.do_invert
+align 8

 .switch:
 	cmp	eax, ecx
@ -80,17 +70,32 @@ BitInvert:
 	fstp	qword [esi+ecx*8]
 	jmp	.newterm

+align 4
 .done:
+	pop	ecx
+	pop	edx
 	ret

 ;=================================================================
+
+
+;=================================================================
+; cdecl-style parameters (the routine ends in a plain ret; the caller owns the stack arguments):
+; -- [esp+4]  = N
+; -- [esp+8]  = 4k-aligned data array  address
+; returns:
+; -- nothing
+; destroys:
+; -- ebx, esi
+;; ==========================
 align 4
-
 step1:
-	mov	esi, [x]
-	mov	ebx, esi
-	add	esi, NumPoints*8
+	mov	ebx, [esp+8]
+	mov	esi, [esp+4]
+	shl	esi, 3
+	add	esi, ebx

+align 4
 .loop:
 	fld	qword[ebx]
 	fld	qword[ebx+8]
@ -119,19 +124,65 @@ step1:
 	add	ebx, 32
 	cmp	ebx, esi
 	jnz	.loop
+ret

-	ret
-
-
-;
-;===========================================================================
-step2:				; Step2
+;       local stack definitions: esp-relative names for the scratch area that step2/step3 reserve with "sub esp, ..."; valid only while esp stays where the prologue left it
 ;===========================================================================
+_t0	equ	dword [esp]	; _t0.._t9: ten 4-byte temporaries at the bottom of the frame
+_t1	equ	dword[esp+4]	; NOTE(review): dword operands make fld/fstp through these names 32-bit; the removed globals of the same names were dq - confirm the narrowing is intended
+_t2	equ	dword[esp+8]
+_t3	equ	dword[esp+12]
+_t4	equ	dword[esp+16]
+_t5	equ	dword[esp+20]
+_t6	equ	dword[esp+24]
+_t7	equ	dword[esp+28]
+_t8	equ	dword[esp+32]
+_t9	equ	dword[esp+36]	; last slot inside the 40 bytes that step2 reserves

-	mov	eax, [_f]
-	mov	ebx, eax
-	add	eax, NumPoints*8
+_l1   equ	dword[esp+40]	; _l1.._l8: element indices computed in step3's inner (k) loop
+_l2   equ	dword[esp+44]
+_l3   equ	dword[esp+48]
+_l4   equ	dword[esp+52]
+_l5   equ	dword[esp+56]
+_l6   equ	dword[esp+60]
+_l7   equ	dword[esp+64]
+_l8   equ	dword[esp+68]
+_l9   equ	dword[esp+72]	; (not referenced in the visible part of step3)
+_l0   equ	dword[esp+76]	; (not referenced in the visible part of step3)
+_d1   equ	dword[esp+80]	; _d1.._d5: per-pass strides, d1 = 1 << (2*l - 3) and multiples of it
+_d2   equ	dword[esp+84]
+_d3   equ	dword[esp+88]
+_d4   equ	dword[esp+92]
+_d5   equ	dword[esp+96]
+_d6   equ	dword[esp+100]	; d5*8: byte distance between successive j blocks
+_j5   equ	dword[esp+104]	; per-pass increment of the sin/cos table index
+_jj   equ	dword[esp+108]	; running sin/cos table index, advanced by _j5 for every k
+_end_of_array	equ	dword[esp+112]	; address just past the last data element
+_step		equ	word [esp+116]	; saved cx for the current pass (ch = pass number); ends at esp+117, inside step3's 120-byte frame

+
+;=================================================================
+; cdecl parameters:
+; -- [ebp+8]   = N
+; -- [ebp+12]  = 4k-aligned data array  address
+; returns:
+; -- nothing
+; destroys:
+; -- eax, ebx
+; locals:
+; -- 10 stack-located dwords (_t0 ... _t9)
+;; ==========================
+align 4
+step2:
+	push	ebp
+	mov	ebp, esp
+	sub	esp, 40
+	mov	ebx, [ebp+12]
+	mov	eax, [ebp+ 8]
+	shl	eax, 3
+	add	eax, ebx
+
+align 4
 .loop_i:

 ; -- quad subelements  +0, +4, +8 and +12 (simpliest operations)
@ -163,7 +214,7 @@ step2:				; Step2
 ; -- even subelements  +2, +6, +10 and +14 (2 multiplications needed)
 	fld	qword[ebx+8*2]
 	fld	qword[ebx+8*6]
-	fld	[_root]
+	fld	[fht_r]
 	fmul	st1, st0	; st : r, t2, t1
 	fld	qword[ebx+8*10]
 	fxch	st1		; st : r, t3, t2, t1
@ -194,20 +245,20 @@ step2:				; Step2
 	fsub	st0, st1
 	fxch	st1
 	faddp	st2, st0	; st : (f[l3]-f[l7]), (f[l3]+f[l7])
-	fld	[_root2]
+	fld	[fht_r2]
 	fmul	st2, st0
 	fmulp	st1, st0	; st : t9, t6
 	fld	qword[ebx+8*3]
 	fld	st0
 	fadd	st0, st2	; st : t1, f[l5], t9, t6
-	fstp	[_t1]
+	fstp	_t1
 	fsub	st0, st1
-	fstp	[_t2]
-	fstp	[_t9]	; (t9 never used)
-	fstp	[_t6]		; st : <empty>
+	fstp	_t2
+	fstp	_t9	; (t9 never used)
+	fstp	_t6		; st : <empty>

-	fld	[_c1]
-	fld	[_s1]
+	fld	[fht_c1]
+	fld	[fht_s1]
 	fld	qword[ebx+8*5]
 	fld	qword[ebx+8*7]
 	fld	st3		; st: c1, f[l6], f[l2], s1, c1
@ -215,13 +266,13 @@ step2:				; Step2
 	fld	st1		; st: f_6, f_2*c, f_6, f_2, s, c
 	fmul	st0, st4	; st: f_6*s, f_2*c, f_6, f_2, s, c
 	faddp	st1, st0	; st: t5, f_6, f_2, s, c
-	fstp	[_t5]		; st: f_6, f_2, s, c
+	fstp	_t5		; st: f_6, f_2, s, c
 	fld	st3		; st: c, f_6, f_2, s, c
 	fmul	st0, st1
 	fld	st3
 	fmul	st0, st3	; st: f_2*s, f_6*c, f_6, f_2, s, c
 	fsubp	st1, st0	; st: t8, f_6, f_2, s, c
-	fstp	[_t8]		; st: f_6, f_2, s, c
+	fstp	_t8		; st: f_6, f_2, s, c
 	fstp	st0		; st: f_2, s, c
 	fstp	st0		; st: s, c

@ -232,51 +283,51 @@ step2:				; Step2
 	fld	st3
 	fmul	st0, st3	; st: f_4*s, f_8*c, f_8, f_4, s, c
 	faddp	st1, st0	; st: t7, f_8, f_4, s, c
-	fld	[_t5]		; st: t5, t7, f_8, f_4, s, c
+	fld	_t5		; st: t5, t7, f_8, f_4, s, c
 	fsub	st0, st1	; st: t4, t7, f_8, f_4, s, c
-	fstp	[_t4]
-	fstp	[_t7]		; st: f_8, f_4, s, c
+	fstp	_t4
+	fstp	_t7		; st: f_8, f_4, s, c
 	fld	st3		; st: c, f_8, f_4, s, c
 	fmul	st0, st2
 	fld	st3
 	fmul	st0, st2	; st: f_8*s, f_4*c, f_8, f_4, s, c
 	fsubp	st1, st0	; st:-t0, f_8, f_4, s, c
 	fchs
-	fld	[_t8]
+	fld	_t8
 	fchs			; st:-t8, t0, f_8, f_4, s, c
 	fsub	st0, st1	; st: t3, t0, f_8, f_4, s, c
-	fstp	[_t3]
-	fstp	[_t0]		; st: f_8, f_4, s, c
+	fstp	_t3
+	fstp	_t0		; st: f_8, f_4, s, c
 	fstp	st0		; st: f_4, s, c
 	fstp	st0		; st: s, c
 	fstp	st0		; st: c
 	fstp	st0		; st: <empty>

-	fld	[_t1]
-	fld	[_t4]
+	fld	_t1
+	fld	_t4
 	fld	st1
 	fsub	st0, st1
 	fstp	qword[ebx+8*11] ; f[l7] = t1-t4
 	faddp	st1, st0
 	fstp	qword[ebx+8*3]	; f[l5] = t1+t4
-	fld	[_t2]
-	fld	[_t3]
+	fld	_t2
+	fld	_t3
 	fld	st1
 	fsub	st0, st1
 	fstp	qword[ebx+8*15] ; f[l8]
 	faddp	st1, st0
 	fstp	qword[ebx+8*7]	; f[l6]

-	fld	[_t6]
+	fld	_t6
 	fld	qword[ebx+8]
 	fld	st1
 	fsub	st0, st1
 	fxch	st1
 	faddp	st2, st0	; st : t2, t1
-	fld	[_t8]
-	fsub	[_t0]
-	fld	[_t5]
-	fadd	[_t7]		; st : t4, t3, t2, t1
+	fld	_t8
+	fsub	_t0
+	fld	_t5
+	fadd	_t7		; st : t4, t3, t2, t1

 	fld	st3
 	fsub	st0, st1
@ -294,36 +345,42 @@ step2:				; Step2
 	cmp	ebx, eax
 	jb	.loop_i

-	ret
-
-align 8 	; shared local vars
-_t0	dq	0
-_t1	dq	0
-_t2	dq	0
-_t3	dq	0
-_t4	dq	0
-_t5	dq	0
-_t6	dq	0
-_t7	dq	0
-_t8	dq	0
-_t9	dq	0
+	mov	esp, ebp
+	pop	ebp
+ret



-;===================================================================
+
+;=================================================================
+; cdecl parameters:
+; -- [ebp+8]   = N
+; -- [ebp+12]  = p
+; -- [ebp+16]  = 4k-aligned data array  address
+; -- [ebp+20]  = 4k-aligned SinCosTable address
+; returns:
+; -- nothing
+; destroys:
+; -- all GPRegs
+; locals:
+; -- 120 bytes of stack-located locals (_t0 ... _t9, _l0..._step)
+;; ==========================
+align 4
 step3:
-;===================================================================
-
+	push	ebp
+	mov	ebp, esp
+	sub	esp, 120
 ; 283  : {


 ; 293  :   for (l=3; l<=p; l++)
 	mov	cx, 0x0200
+align 4
 .newstep:
 	inc	ch
-	cmp	ch, Power_of_4
+	cmp	ch, byte[ebp+12]
 	jg	.done
-	mov	[.step], cx
+	mov	_step, cx

 ; 294  :   {
 ; 295  :     d1 = 1 << (l + l - 3);
@ -333,61 +390,63 @@ step3:
 	sub	cl, 3
 	mov	edx, 1
 	shl	edx, cl
-	mov	[.d1], edx
+	mov	_d1, edx

 ; 296  :     d2 = d1 << 1;
 	shl	edx, 1
-	mov	[.d2], edx
+	mov	_d2, edx
 	mov	eax, edx

 ; 297  :     d3 = d2 << 1;
 	shl	edx, 1
-	mov	[.d3], edx
+	mov	_d3, edx

 ; 298  :     d4 = d2 + d3;
 	add	eax, edx
-	mov	[.d4], eax
+	mov	_d4, eax

 ; 299  :     d5 = d3 << 1;
 	shl	edx, 1
-	mov	[.d5], edx
+	mov	_d5, edx
 	shl	edx, 3
-	mov	[.d6], edx	; d6 = d5*8 to simplify index operations
+	mov	_d6, edx	; d6 = d5*8 to simplify index operations

 ; 339  :         j5 = N / d5;   ; moved out of internal loop
-	mov	cl, Power_of_4
+	mov	cl, [ebp+12]
 	sub	cl, ch
 	add	cl, cl
 	mov	edx, 1
 	shl	edx, cl
-	mov	[.j5], edx
+	mov	_j5, edx

 ; 300  :
 ; 301  :     for (j=0; j<N; j+=d5)
-	mov	esi, [_f]
-	mov	ebx, esi
-	add	esi, NumPoints*8
-	mov	[.end_of_array], esi
+	mov	ebx, [ebp+16]
+	mov	esi, [ebp+8]
+	shl	esi, 3
+	add	esi, ebx
+	mov	_end_of_array, esi

+align 4
 .next_j:

 ; {
 ; t1 = f[j] + f[j+d2];
-	mov	eax, [.d2]
+	mov	eax, _d2
 	fld	qword[ebx]
 	fld	qword[ebx+eax*8]
 	fld	st1
 	fadd	st0, st1
-	fstp	[_t1]
+	fstp	_t1

 ; t2 = f[j] - f[j+d2];
 	fsubp	st1, st0
-	fstp	[_t2]
+	fstp	_t2

 ; t3 = f[j+d3] + f[j+d4];
-	mov	edi, [.d3]
+	mov	edi, _d3
 	fld	qword[ebx+edi*8]
-	mov	edx, [.d4]
+	mov	edx, _d4
 	fld	qword[ebx+edx*8]
 	fld	st1
 	fsub	st0, st1		; st : t4, f4, f3
@ -398,7 +457,7 @@ step3:

 ; f[j+d4] = t2 - t4;
 ; f[j+d3] = t2 + t4;
-	fld	[_t2]
+	fld	_t2
 	fld	st0
 	fsub	st0, st2		; st : f4, t2, t4, t3
 	fstp	qword[ebx+edx*8]	; st : t2, t4, t3
@ -407,7 +466,7 @@ step3:

 ; f[j+d2] = t1 - t3;
 ; f[j]    = t1 + t3;
-	fld	[_t1]
+	fld	_t1
 	fst	st1
 	fsub	st0, st2		; st : f2, t1, t3
 	fstp	qword[ebx+eax*8]	; st : t1, t3
@ -416,7 +475,7 @@ step3:
 	fstp	st0

 ; jj = j + d1;     / ??
-	mov	edi, [.d1]
+	mov	edi, _d1
 	shl	edi, 3		; = d1*8
 	mov	edx, edi
 	mov	eax, edi
@ -432,7 +491,7 @@ step3:

 ; t2 = f[jj+d2] * r;
 	fld	qword [edi+eax]
-	fld	[_root]
+	fld	[fht_r]
 	fmul	st1, st0	; st : r,  t2, t3, t1
 ; t4 = f[jj+d4] * r
 	fmul	qword [edx+eax] ; st : t4, t2, t3, t1
@ -461,58 +520,61 @@ step3:

 ; for (k=1; k<d1; k++)
 	xor	ecx, ecx	; ecx = k
-	mov	[.jj], ecx
+	mov	_jj, ecx
+align 4
 .next_k:
 	inc	ecx
-	cmp	ecx, [.d1]
+	cmp	ecx, _d1
 	jge	.done_k
 ; {
-	mov	eax, [.d2]	; the sector increment
+	mov	eax, _d2	; the sector increment
 ; l1 = j  + k;
 	mov	edx, ecx
-	mov	[.l1], edx	; [ebx+edx*8] --> f[j+k]
+	mov	_l1, edx	; [ebx+edx*8] --> f[j+k]
 ; l2 = l1 + d2;
 	add	edx, eax
-	mov	[.l2], edx
+	mov	_l2, edx
 ; l3 = l1 + d3;
 	add	edx, eax
-	mov	[.l3], edx
+	mov	_l3, edx
 ; l4 = l1 + d4;
 	add	edx, eax
-	mov	[.l4], edx
+	mov	_l4, edx

 ; l5 = j  + d2 - k;
 	mov	edx, eax
 	sub	edx, ecx
-	mov	[.l5], edx
+	mov	_l5, edx
 ; l6 = l5 + d2;
 	add	edx, eax
-	mov	[.l6], edx
+	mov	_l6, edx
 ; l7 = l5 + d3;
 	add	edx, eax
-	mov	[.l7], edx
+	mov	_l7, edx
 ; l8 = l5 + d4;
 	add	edx, eax
-	mov	[.l8], edx
+	mov	_l8, edx


 ; 340  :         j5 *= k;       // add-substituted multiplication
-	mov	eax, [.jj]
-	add	eax, [.j5]
-	mov	[.jj], eax
+	mov	eax, _jj
+	add	eax, _j5
+	mov	_jj, eax

 ; c1 = C[jj];
 ; s1 = S[jj];
-	mov	edi, [_Cosins]
+	mov	edi, [ebp+20]
 	fld	qword[edi+eax*8]
-	mov	esi, [_Sines]
+	mov	esi, [ebp+8]
+	shl	esi, 2
+	add	esi, edi
 	fld	qword[esi+eax*8]	; st : s1, c1

 ; t5 = f[l2] * c1 + f[l6] * s1;
 ; t8 = f[l6] * c1 - f[l2] * s1;
-	mov	edx, [.l6]
+	mov	edx, _l6
 	fld	qword[ebx+edx*8]
-	mov	edx, [.l2]
+	mov	edx, _l2
 	fld	st0
 	fmul	st0, st2
 	fxch	st1
@ -521,10 +583,10 @@ step3:
 	fmul	st4, st0
 	fmulp	st3, st0		; st : f[l6]*c, f[l6]*s, f[l2]*s, f[l2]*c
 	fsub	st0, st2		; st :   t8,    f[l6]*s, f[l2]*s, f[l2]*c
-	fstp	[_t8]
+	fstp	_t8
 	faddp	st2, st0		; st :  f[l2]*s, t5
 	fstp	st0			; st :  t5
-	fstp	[_t5]			; st :  <empty>
+	fstp	_t5			; st :  <empty>

 ; c2 = C[2*jj];
 ; s2 = S[2*jj];
@ -534,9 +596,9 @@ step3:

 ; t6 = f[l3] * c2 + f[l7] * s2;
 ; t9 = f[l7] * c2 - f[l3] * s2;
-	mov	edx, [.l7]
+	mov	edx, _l7
 	fld	qword[ebx+edx*8]
-	mov	edx, [.l3]
+	mov	edx, _l3
 	fld	st0
 	fmul	st0, st2
 	fxch	st1
@ -545,22 +607,22 @@ step3:
 	fmul	st4, st0
 	fmulp	st3, st0		; st : f[l7]*c, f[l7]*s, f[l3]*s, f[l3]*c
 	fsub	st0, st2		; st :   t9,    f[l7]*s, f[l3]*s, f[l3]*c
-	fstp	[_t9]
+	fstp	_t9
 	faddp	st2, st0		; st :  f[l2]*s, t6
 	fstp	st0			; st :  t6
-	fstp	[_t6]			; st :  <empty>
+	fstp	_t6			; st :  <empty>

 ; c3 = C[3*jj];
 ; s3 = S[3*jj];
-	add	eax, [.jj]
+	add	eax, _jj
 	fld	qword[edi+eax*8]
 	fld	qword[esi+eax*8]	; st : s3, c3

 ; t7 = f[l4] * c3 + f[l8] * s3;
 ; t0 = f[l8] * c3 - f[l4] * s3;
-	mov	edx, [.l8]
+	mov	edx, _l8
 	fld	qword[ebx+edx*8]
-	mov	edx, [.l4]
+	mov	edx, _l4
 	fld	st0
 	fmul	st0, st2
 	fxch	st1
@ -569,192 +631,162 @@ step3:
 	fmul	st4, st0
 	fmulp	st3, st0		; st : f[l8]*c, f[l8]*s, f[l4]*s, f[l4]*c
 	fsub	st0, st2		; st :   t9,    f[l8]*s, f[l4]*s, f[l4]*c
-	fstp	[_t0]
+	fstp	_t0
 	faddp	st2, st0		; st : f[l2]*s, t7
 	fstp	st0			; st :  t7
-	fstp	[_t7]			; st :  <empty>
+	fstp	_t7			; st :  <empty>

 ; t1 = f[l5] - t9;
 ; t2 = f[l5] + t9;
-	mov	eax, [.l5]
+	mov	eax, _l5
 	fld	qword [ebx+eax*8]
-	fld	[_t9]
+	fld	_t9
 	fld	st0
 	fadd	st0, st2
-	fstp	[_t2]
+	fstp	_t2
 	fsubp	st1, st0
-	fstp	[_t1]
+	fstp	_t1

 ; t3 = - t8  - t0;
-	fld	[_t8]
-	fadd	[_t0]
+	fld	_t8
+	fadd	_t0
 	fchs
-	fstp	[_t3]
+	fstp	_t3
 ; t4 =   t5  - t7;
-	fld	[_t5]
-	fsub	[_t7]
-	fstp	[_t4]
+	fld	_t5
+	fsub	_t7
+	fstp	_t4

 ; f[l5] = t1 + t4;
-	fld	[_t1]
-	fld	[_t4]
+	fld	_t1
+	fld	_t4
 	fld	st0
 	fadd	st0, st2
 	fstp	qword [ebx+eax*8]
 ; f[l7] = t1 - t4;
-	mov	eax, [.l7]
+	mov	eax, _l7
 	fsubp	st1, st0
 	fstp	qword [ebx+eax*8]

 ; f[l6] = t2 + t3;
-	mov	eax, [.l6]
-	fld	[_t2]
-	fld	[_t3]
+	mov	eax, _l6
+	fld	_t2
+	fld	_t3
 	fld	st0
 	fadd	st0, st2
 	fstp	qword [ebx+eax*8]
 ; f[l8] = t2 - t3;
-	mov	eax, [.l8]
+	mov	eax, _l8
 	fsubp	st1, st0
 	fstp	qword [ebx+eax*8]

 ; t1 = f[l1] + t6;
-	mov	eax, [.l1]
+	mov	eax, _l1
 	fld	qword [ebx+eax*8]
-	fld	[_t6]
+	fld	_t6
 	fld	st0
 	fadd	st0, st2
-	fstp	[_t1]
+	fstp	_t1
 ; t2 = f[l1] - t6;
 	fsubp	st1, st0
-	fstp	[_t2]
+	fstp	_t2

 ; t3 =    t8 - t0;
-	fld	[_t8]
-	fsub	[_t0]
-	fstp	[_t3]
+	fld	_t8
+	fsub	_t0
+	fstp	_t3
 ; t4 =    t5 + t7;
-	fld	[_t5]
-	fadd	[_t7]
-	fstp	[_t4]
+	fld	_t5
+	fadd	_t7
+	fstp	_t4

 ; f[l1] = t1 + t4;
-	mov	eax, [.l1]
-	fld	[_t1]
-	fld	[_t4]
+	mov	eax, _l1
+	fld	_t1
+	fld	_t4
      fld     st0
 	fadd	st0, st2
 	fstp	qword [ebx+eax*8]
 ; f[l3] = t1 - t4;
-	mov	eax, [.l3]
+	mov	eax, _l3
 	fsubp	st1, st0
 	fstp	qword [ebx+eax*8]

 ; f[l2] = t2 + t3;
-	mov	eax, [.l2]
-	fld	[_t2]
-	fld	[_t3]
+	mov	eax, _l2
+	fld	_t2
+	fld	_t3
 	fld	st0
 	fadd	st0, st2
 	fstp	qword [ebx+eax*8]
 ; f[l4] = t2 - t3;
-	mov	eax, [.l4]
+	mov	eax, _l4
 	fsubp	st1, st0
 	fstp	qword [ebx+eax*8]

 ; 374  :       }
 	jmp	.next_k

+align 4
 .done_k:
 ; 375  :     }
-	add	ebx, [.d6]	; d6 = d5*8
-	cmp	ebx, [.end_of_array]
+	add	ebx, _d6	; d6 = d5*8
+	cmp	ebx, _end_of_array
 	jb	.next_j

 ; 376  :   }
-	mov	cx, [.step]
+	mov	cx, _step
 	jmp	.newstep
 .done:
-
+	mov	esp, ebp
+	pop	ebp
 ; 377  : }
 	ret

-align 4
-.l1   dd	0
-.l2   dd	0
-.l3   dd	0
-.l4   dd	0
-.l5   dd	0
-.l6   dd	0
-.l7   dd	0
-.l8   dd	0
-.l9   dd	0
-.l0   dd	0
-.d1   dd	0
-.d2   dd	0
-.d3   dd	0
-.d4   dd	0
-.d5   dd	0
-.d6   dd	0
-.j5   dd	0
-.jj   dd	0
-.end_of_array	dd	  0
-.step dw	0
-
-align 8

 		;=========== Step3 ends here ===========


-
-
-
 ; =================================================================
-;	syscall entry
-;
-_f  dd ?
-_N  dd 1024	; number of points

-_a		dd ?		; initial   data array
-x      	dd 0    	; tranformed (float) data array
-_Cosins 	dd 0
-_Sines	dd 0
+;=================================================================
+; parameters:
+; -- [ebp+12]   = N
+; -- [ebp+16]  = p
+; -- [ebp+20]  = 4k-aligned data array  address
+; -- [ebp+24]  = 4k-aligned SinCosTable address
+; returns:
+; -- nothing
+; destroys:
+; -- all GPRegs
+;; ==========================

-FFT4:
-	or	al, al
-	jnz	.trans
-	mov	cl, Power_of_4
-	mov	eax, 1
-	shl	eax, cl
-	shl	eax, cl
-	mov	[_N], eax
-	shl	eax, 2	; size of Sine table in bytes
-	add	eax, ebx
-	mov 	[_Sines], ebx
-	mov	[_Cosins], eax
-	cpuid
-	rdtsc
-	mov	[.time], eax
-	call	MakeSinCosTable
-	cpuid
-	rdtsc
-	sub	eax, [.time]
-	ret
-.trans:
-	mov	[x], 	ebx
-	mov	[_f], ebx
-	cli	;-----
-	cpuid
-	rdtsc
-	mov	[.time], eax
+align 4
+
+FHT_4:
+; Fast Hartley Transform driver (entries 0 and 1 of servetable3).
+; Order of work: bit-reversal permutation, the 4-point pass (step1),
+; the 16-point pass (step2) only when p >= 2, and the remaining
+; radix-4 passes (step3) only when p >= 3.
+; Stack arguments, addressed through ebp after the prologue below:
+;   [ebp+12] = N, [ebp+16] = p (BitInvert derives N as 1 << p << p),
+;   [ebp+20] = data array address, [ebp+24] = sin/cos table address
+; Returns nothing; all general-purpose registers may be changed.
+	push	ebp
+	mov	ebp, esp
+
+	mov	edx, [ebp+20]	; a
+	mov	dl, byte[ebp+16]	; p rides in the low bits of the aligned address; BitInvert separates them again
 	call BitInvert
-	call	step1
-	call	step2
+	push	dword[ebp+20]	; a
+	push	ecx		; N
+	call	step1		; 4-point transform
+	cmp	byte[ebp+16],1	; p = 1 ?  (the old "cmp cl, 1" tested the low byte of N, which is never 1 for p >= 1,
+	jz	.done		;           so a 4-point input still went through the 16-point pass and overran the array)
+	call	step2		; 16-point transform; reuses the (N, a) pair still on the stack
+	cmp	byte[ebp+16],2	; p = 2 ?  (N = 16: step3 has no pass left to run)
+	jz	.done
+	pop	edx		; N
+	pop	ecx		; a
+	push	dword[ebp+24]	; t
+	push	ecx
+	push	dword[ebp+16]	; p
+	push	edx		; N
 	call	step3
-	cpuid
-	rdtsc
-	sti	;----
-	sub	eax, [.time]
-	ret
+.done:
+	mov	esp, ebp	; also discards whatever arguments are still pushed for step1/step2/step3
+	pop	ebp

-.time	dd	0
+ret
--- a/kernel/branches/Kolibri-A/utilities/FFT/FHT4B.ASM
+++ b/kernel/branches/Kolibri-A/utilities/FFT/FHT4B.ASM
@ -0,0 +1,207 @@
+;========================================================================
+;=                                                                      =
+;=         Fast Hartley Transform  routine demo for KolibriOS           =
+;=                                                                      =
+;=       Copyright (C) 2010, Artem Jerdev <kolibri@jerdev.co.uk>        =
+;=                                                                      =
+;=             refer to wiki.kolibrios.org for all details              =
+;=                                                                      =
+;========================================================================
+
+
+
+
+use32
+
+	       org    0x0
+
+	       db     'MENUET01'	      ; 8 byte id
+	       dd     0x01		      ; header version
+	       dd     START		      ; start of code
+	       dd     I_END		      ; size of image
+	       dd     0x100000		      ; memory for app
+	       dd     0xbfffc		      ; esp
+	       dd     0x0 , 0x0 	      ; I_Param , I_Icon
+
+
+include 'macros.inc'
+include 'debug.inc'
+include 'FHT4i.inc'
+
+
+START:				; entry point named in the MENUET01 header
+
+	call	main		; run the whole demo; main returns here
+
+
+	or	eax, -1 	; eax = -1 selects "close this program"
+	int	0x40
+
+
+;=============================================================
+;Func: calculates a simple test function of the index in [ii]
+;      ff = (int)(500*exp(-t) * cos (2.5*t))   <- intent; the result is stored in [ff] by fstp (see below)
+;               uses: eax, ebx   (plus the x87 stack and the scratch variables temp, t_div, t_mod, tv93, tt)
+;------------
+Func:
+
+; 9    : {
+
+; 10   :   double x,t;
+; 11   :   int f;
+; 12   :
+; 13   :   x = (i < N2) ? i : i - NUM_POINTS;
+	mov	eax, [ii]
+	cmp	eax, 512	; N2 is hard-coded as 512 here; NOTE(review): main runs with N = 16 - confirm the constants are intentional
+	jge	.index_negative
+	jmp	.index_correct
+.index_negative:
+	sub	eax, 1024	; NUM_POINTS is hard-coded as 1024
+.index_correct:
+	mov	[temp], eax	; temp = x as a signed integer
+;        fild    [temp]
+
+; 14   :   t = x / 16.0;
+; f2xm1 argument (abs) must be less than 1, so x is split into x % 16 and x / 16
+	mov	[t_mod], eax
+	and	[t_mod], 0x0F	; x % 16  (stored, but not read again in the visible source)
+	shr	eax, 4		; x / 16  NOTE(review): logical shift - a negative x does not yield a negative quotient; confirm
+	mov	[t_div], eax	; stored, but not read again in the visible source; ax keeps the value for the exponent patch below
+	fild	[temp]		; st0 = x
+
+; 15   :   if (t<0) t = -t;
+	fabs			; st0 = |x|
+exp_ok:
+; 16   :   f = (int)(512*2^(-t) * cos (2.5*t));
+	fchs			; st0 = -|x|
+	f2xm1			; NOTE(review): the operand here is -|x|, not the reduced fraction - confirm it is within f2xm1's valid range
+	fmul	[f500]		; scale by 500.0
+	fstp	[tv93]		; tv93 = scaled exponential term
+	fld	[f2_5]		; st0 = 2.5
+	fmul	[tt]		; NOTE(review): tt is only written further down (first call reads "dq ?", later calls the previous result) - confirm the intended operand
+	fcos
+	fmul	[tv93]
+	fstp	[tt]		; tt = product, as a 64-bit float
+	mov	bx, word[tt+6]	; top 16 bits of the double: sign, 11-bit exponent, 4 mantissa bits
+	shr	bx,4
+	and	bx,0x07FF	; bx = biased exponent field
+	add	ax,bx		; add the x/16 value still held in ax
+	shl	ax,4		; move the sum back to the exponent position
+	and	word[tt+6], 0x800F	; keep sign and mantissa bits, clear the old exponent
+	or	word[tt+6], ax	; insert the adjusted exponent; NOTE(review): bits carried past the field would land on the sign bit - confirm
+	fld	[tt]
+	fstp   [ff]		; result is left in [ff] as a 64-bit float
+
+; 17   :   return  f;
+; 18   : }
+	ret
+;---------------------------------------------------------
+;       FillData: writes Func(i) for i = 0 .. [_in]-1 into the qword array at edx
+;       in:   edx = base address of the destination array (8 bytes per element)
+;       uses eax, ebx, ecx
+FillData:
+; C equivalent:  for (i=0; i<NUM_POINTS; i++)  ia[i] = Func(i);
+; register role: ecx = i
+	sub	ecx, ecx		; i = 0
+.store_next:
+	mov	[ii], ecx		; Func takes its index through [ii]
+	call	Func			; ... and leaves its result in [ff]
+	fld	[ff]
+	fstp	qword [edx+ecx*8]	; ia[i] = result
+	add	ecx, 1			; ++i
+	cmp	ecx, [_in]		; reached N yet?
+	jnz	.store_next
+	ret
+
+;====================================================================
+; main: allocates and fills a test array, times one FHT run made through the syscall instruction, prints the tick count, frees everything
+;====================================================================
+align 4
+
+_ia	dd 0		; address of the data array (result of the allocation call below)
+_ii	dd 0		; (not referenced in the visible code)
+_ip	dd 0		; p, the power of 4 (only the low byte is written)
+_in	dd 0		; N = 4^p, the number of points
+_it	dd 0		; sin/cos table address as left in edx by CreateSinCosTable
+;-----------------
+main:
+	 mov	eax, 68
+	 mov	ebx, 11 	; system function 68.11 (heap initialisation per the KolibriOS API docs)
+	 int	0x40
+	fninit			; reset the FPU before any floating-point work
+	 mov	cl,   2 	; power of 4
+	 mov	byte[_ip], cl
+	 mov	eax, 1
+	 shl	eax, cl
+	 shl	eax, cl 	; eax = 1 << p << p = 4^p
+	 mov	[_in], eax
+	 mov	dl, cl		; p is handed over in dl
+	 call	CreateSinCosTable	; not defined in this file - presumably from FHT4i.inc; verify its register contract there
+	 mov	[_it], edx
+	 mov	ecx, [_in]
+	 shl	ecx, 3		; N * 8 bytes: one qword per point
+	 mov	ebx, 12 	; system function 68.12 (memory allocation, ecx = size)
+	 mov	eax, 68
+	 int	0x40
+	 mov	[_ia], eax	; returned address becomes the data array
+	 mov	edx, eax	; FillData expects the array base in edx
+
+	call FillData
+
+	cpuid			; executed right before rdtsc; its outputs are not used
+	rdtsc
+	mov	[t_0], eax	; low 32 bits of the start timestamp
+
+	push	[_it]		; arguments pushed last-to-first: t, a, p, N
+	push	[_ia]
+	push	[_ip]
+	push	[_in]
+;        call    FHT_4
+	xor	eax, eax	; service number 0
+	syscall
+
+	add	esp, 16 	; drop the four pushed arguments
+	cpuid
+	rdtsc
+	mov	[t_1], eax	; low 32 bits of the end timestamp
+
+	sub	eax, [t_0]	; elapsed ticks (low dword only)
+	debug_print_hex eax
+	print	'<- fht time'
+
+	mov	edx, [_it]
+	call	DestroySinCosTable	; not defined in this file - presumably from FHT4i.inc
+	mov	ecx, [_ia]
+	mov	ebx, 13 	; system function 68.13 (free the block at ecx)
+	mov	eax, 68
+	int	0x40
+	ret			; back to START, which terminates the program
+
+
+; ========================================================
+; static data
+;----------------
+
+align 8
+
+;f18     dq 0x4032000000000000
+f256	dq 256.01f	; (not referenced in the visible code)
+f14_2	dq 14.2f	; (not referenced in the visible code)
+f500	dq 0x407f400000000000	; bit pattern of the double 500.0
+f2_5	dq 0x4004000000000000	; bit pattern of the double 2.5
+tt	dq ?		; Func scratch: cosine product, then exponent-patched in place
+tv93	dq ?		; Func scratch: scaled exponential term
+t_div	dd ?		; Func: x / 16 as computed by shr
+t_mod	dd ?		; Func: x % 16
+temp	dd ?		; Func: x as a signed integer, loaded with fild
+ff	dq ?	; Func return value, stored by fstp as a 64-bit float
+ii	dd ?	; argument (int) = array index
+t_1	dd ?		; low dword of the timestamp taken after the transform
+t_0	dd ?		; low dword of the timestamp taken before the transform
+fcontrol dw  0x0037f	; (not referenced in the visible code)
+title	db ' Fast Hartley Transform Test - A.Jerdev 2010'
+
+I_END: