an experimental kernel with a mad syscall and FHT inside

git-svn-id: svn://kolibrios.org@1641 a494cfbc-eb01-0410-851d-a64ba20cac60
This commit is contained in:
Artem Jerdev (art_zh) 2010-10-03 23:11:42 +00:00
parent abbc09c677
commit 112a3665cb
5 changed files with 661 additions and 479 deletions

View File

@ -76,9 +76,6 @@ pci_ext_config:
shl eax, 8
test eax, 0x000F0000 ; MMIO Base must be bus0-aligned
jnz .no_pcie_cfg
; -- it looks like a true PCIe config space;
ret ; <<<<<<<<<<< OK >>>>>>>>>>>
.no_pcie_cfg:
@ -92,6 +89,7 @@ pci_ext_config:
.pcie_failed:
mov esi, boot_pcie_fail
call boot_log
xor eax, eax
ret ; <<<<<<<<< FAILURE >>>>>>>>>

View File

@ -21,37 +21,6 @@ cross_order:
call dword [servetable+edi*4]
ret
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ;;
;; SYSENTER ENTRY ;;
;; (not used on AMD systems) ;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;align 32
;sysenter_entry:
; ; Íàñòðàèâàåì ñòåê
; mov esp, [ss:tss._esp0]
; sti
; push ebp ; save app esp + 4
; mov ebp, [ebp] ; ebp - original ebp
; ;------------------
; pushad
; cld
;
; movzx eax, al
; call dword [servetable2 + eax * 4]
; popad
; ;------------------
; xchg ecx, [ss:esp] ; â âåðøèí ñòåêà - app ecx, ecx - app esp + 4
; sub ecx, 4
; xchg edx, [ecx] ; edx - return point, & save original edx
; push edx
; mov edx, [ss:esp + 4]
; mov [ecx + 4], edx ; save original ecx
; pop edx
; sysexit
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ;;
;; SYSTEM CALL ENTRY ;;
@ -69,17 +38,17 @@ i40:
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ;;
;; SYSCALL ENTRY ;;
;; SYSCALL ENTRY -- NEW !!! ;;
;; ;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
align 32
; Kernel entry point for the SYSCALL instruction.
; The low two bits of eax select one of the handlers in servetable3; ecx is
; pushed before the dispatched call and popped again before SYSRET.
; NOTE(review): presumably ecx is saved because it carries the return address
; that SYSRET reloads -- confirm against the AMD SYSCALL/SYSRET description.
; NOTE(review): this listing is a rendered diff without +/- markers; the
; `; push ecx` / `sti` / `; pop ecx` lines look like the pre-commit version and
; `; sti` / `push ecx` / `pop ecx` like the post-commit one -- confirm against
; the real source file before relying on either set.
syscall_entry:
; push ecx
sti
; sti
push ecx
and eax, 3 ; keep only bits 1:0 of the service number
call dword [servetable3 + eax * 4] ; dispatch through the service table
; pop ecx
pop ecx
sysret
iglobal
@ -89,7 +58,6 @@ iglobal
align 4
servetable:
dd socket ; 53-Socket interface
dd 0
dd 0
@ -187,8 +155,8 @@ iglobal
align 4
servetable3:
dd FFT4 ; 0
dd FFT4 ; 1
dd FHT_4 ; 0
dd FHT_4 ; 1
dd paleholder ; 2
dd sys_end ; last

View File

@ -16,30 +16,6 @@
$Revision$
;struc db [a] { common . db a
; if ~used .
; display 'not used db: ',`.,13,10
; end if }
;struc dw [a] { common . dw a
; if ~used .
; display 'not used dw: ',`.,13,10
; end if }
;struc dd [a] { common . dd a
; if ~used .
; display 'not used dd: ',`.,13,10
; end if }
;struc dp [a] { common . dp a
; if ~used .
; display 'not used dp: ',`.,13,10
; end if }
;struc dq [a] { common . dq a
; if ~used .
; display 'not used dq: ',`.,13,10
; end if }
;struc dt [a] { common . dt a
; if ~used .
; display 'not used dt: ',`.,13,10
; end if }
struc POINT {
.x dd ?
@ -244,7 +220,7 @@ include "fs/ext2.inc" ; read / write for ext2 filesystem
; sound
include "sound/playnote.inc" ; player Note for Speaker PC
include "sound/FFT.inc" ; fast Fourier transform routines
include "sound/FHT.inc" ; fast Hartley transform routines
; display
@ -311,3 +287,4 @@ include "core/ext_lib.inc"
; list of external functions
include "imports.inc"

View File

@ -5,62 +5,51 @@
; free KolibriOS version - not to be ported to other OSes
; ==========================================================
Power_of_4 equ 5
NumPoints equ 1024
N_2 equ NumPoints / 2
N_4 equ NumPoints / 4
;=================================================================
; global constants
align 8
_root dq 1.41421356237309504880169 ; = sqrt(2)
_root2 dq 0.70710678118654752440084 ; = sqrt(2)/2
_c1 dq 0.92387953251128675612818 ; = cos(pi/8)
_s1 dq 0.38268343236508977172846 ; = sin(pi/8)
_dx dq 0.00613592315154296875 ; pi/512
fht_r dq 1.41421356237309504880169 ; = sqrt(2)
fht_r2 dq 0.70710678118654752440084 ; = sqrt(2)/2
fht_c1 dq 0.92387953251128675612818 ; = cos(pi/8)
fht_s1 dq 0.38268343236508977172846 ; = sin(pi/8)
;[_CosTable] dd 0 ; N_2 elements
;[_SinTable] dd 0 ; N_2 elements
;=================================================================
; parameter1:
; -- reg dl (bits[3:0]) = Power_of_4
; -- reg edx & (-16) = 4k-aligned data array address
; returns:
; -- edx = Power_of_4
; -- ecx = N
; destroys:
; -- eax, ebx, ecx, edx, esi
;; ==========================
align 4
MakeSinCosTable:
mov ebx, [_Sines]
mov ecx, [_Cosins]
xor eax, eax
fld [_dx] ; st : dx
fldz ; st : 0, dx
.loop:
fld st0 ; st : x, x, dx
FSINCOS ; st : cos, sin, x, dx
fstp qword [ecx+eax*8] ; st : sin, x, dx
fstp qword [ebx+eax*8] ; st : x, dx
fadd st0, st1 ; st : x+dx, dx
inc eax
cmp eax, N_2
jne .loop
fstp st0 ; st : dx
fstp st0 ; st : <empty>
ret
; ================================================================
align 4
BitInvert:
mov esi, [x] ; array of qwords
mov esi, edx
and esi, 0xFFFFFFF0
and edx, 0x0F
push edx
mov cl, dl
xor eax, eax
inc eax
shl eax, cl
shl eax, cl
push eax
xor ecx, ecx ; index term
align 4
.newterm:
inc ecx
cmp ecx, NumPoints
cmp ecx, [esp] ; N
jge .done
xor eax, eax
mov edx, ecx
xor bl, bl
align 4
.do_invert:
inc bl
cmp bl, Power_of_4
cmp bl, byte[esp+4] ; Power_of_4
jg .switch
mov bh, dl
@ -69,6 +58,7 @@ BitInvert:
or al, bh
shr edx, 2
jmp .do_invert
align 8
.switch:
cmp eax, ecx
@ -80,17 +70,32 @@ BitInvert:
fstp qword [esi+ecx*8]
jmp .newterm
align 4
.done:
pop ecx
pop edx
ret
;=================================================================
;=================================================================
; cdecl-style parameters (plain ret; the caller removes them from the stack):
; -- [esp+4] = N
; -- [esp+8] = 4k-aligned data array address
; returns:
; -- nothing
; destroys:
; -- ebx, esi
;; ==========================
align 4
step1:
mov esi, [x]
mov ebx, esi
add esi, NumPoints*8
mov ebx, [esp+8]
mov esi, [esp+4]
shl esi, 3
add esi, ebx
align 4
.loop:
fld qword[ebx]
fld qword[ebx+8]
@ -119,19 +124,65 @@ step1:
add ebx, 32
cmp ebx, esi
jnz .loop
ret
;
;===========================================================================
step2: ; Step2
; local stack definitions
;===========================================================================
_t0 equ dword [esp]
_t1 equ dword[esp+4]
_t2 equ dword[esp+8]
_t3 equ dword[esp+12]
_t4 equ dword[esp+16]
_t5 equ dword[esp+20]
_t6 equ dword[esp+24]
_t7 equ dword[esp+28]
_t8 equ dword[esp+32]
_t9 equ dword[esp+36]
mov eax, [_f]
mov ebx, eax
add eax, NumPoints*8
_l1 equ dword[esp+40]
_l2 equ dword[esp+44]
_l3 equ dword[esp+48]
_l4 equ dword[esp+52]
_l5 equ dword[esp+56]
_l6 equ dword[esp+60]
_l7 equ dword[esp+64]
_l8 equ dword[esp+68]
_l9 equ dword[esp+72]
_l0 equ dword[esp+76]
_d1 equ dword[esp+80]
_d2 equ dword[esp+84]
_d3 equ dword[esp+88]
_d4 equ dword[esp+92]
_d5 equ dword[esp+96]
_d6 equ dword[esp+100]
_j5 equ dword[esp+104]
_jj equ dword[esp+108]
_end_of_array equ dword[esp+112]
_step equ word [esp+116]
;=================================================================
; cdecl parameters:
; -- [ebp+8] = N
; -- [ebp+12] = 4k-aligned data array address
; returns:
; -- nothing
; destroys:
; -- eax, ebx
; locals:
; -- 10 stack-located dwords (_t0 ... _t9)
;; ==========================
align 4
step2:
push ebp
mov ebp, esp
sub esp, 40
mov ebx, [ebp+12]
mov eax, [ebp+ 8]
shl eax, 3
add eax, ebx
align 4
.loop_i:
; -- quad subelements +0, +4, +8 and +12 (simplest operations)
@ -163,7 +214,7 @@ step2: ; Step2
; -- even subelements +2, +6, +10 and +14 (2 multiplications needed)
fld qword[ebx+8*2]
fld qword[ebx+8*6]
fld [_root]
fld [fht_r]
fmul st1, st0 ; st : r, t2, t1
fld qword[ebx+8*10]
fxch st1 ; st : r, t3, t2, t1
@ -194,20 +245,20 @@ step2: ; Step2
fsub st0, st1
fxch st1
faddp st2, st0 ; st : (f[l3]-f[l7]), (f[l3]+f[l7])
fld [_root2]
fld [fht_r2]
fmul st2, st0
fmulp st1, st0 ; st : t9, t6
fld qword[ebx+8*3]
fld st0
fadd st0, st2 ; st : t1, f[l5], t9, t6
fstp [_t1]
fstp _t1
fsub st0, st1
fstp [_t2]
fstp [_t9] ; (t9 never used)
fstp [_t6] ; st : <empty>
fstp _t2
fstp _t9 ; (t9 never used)
fstp _t6 ; st : <empty>
fld [_c1]
fld [_s1]
fld [fht_c1]
fld [fht_s1]
fld qword[ebx+8*5]
fld qword[ebx+8*7]
fld st3 ; st: c1, f[l6], f[l2], s1, c1
@ -215,13 +266,13 @@ step2: ; Step2
fld st1 ; st: f_6, f_2*c, f_6, f_2, s, c
fmul st0, st4 ; st: f_6*s, f_2*c, f_6, f_2, s, c
faddp st1, st0 ; st: t5, f_6, f_2, s, c
fstp [_t5] ; st: f_6, f_2, s, c
fstp _t5 ; st: f_6, f_2, s, c
fld st3 ; st: c, f_6, f_2, s, c
fmul st0, st1
fld st3
fmul st0, st3 ; st: f_2*s, f_6*c, f_6, f_2, s, c
fsubp st1, st0 ; st: t8, f_6, f_2, s, c
fstp [_t8] ; st: f_6, f_2, s, c
fstp _t8 ; st: f_6, f_2, s, c
fstp st0 ; st: f_2, s, c
fstp st0 ; st: s, c
@ -232,51 +283,51 @@ step2: ; Step2
fld st3
fmul st0, st3 ; st: f_4*s, f_8*c, f_8, f_4, s, c
faddp st1, st0 ; st: t7, f_8, f_4, s, c
fld [_t5] ; st: t5, t7, f_8, f_4, s, c
fld _t5 ; st: t5, t7, f_8, f_4, s, c
fsub st0, st1 ; st: t4, t7, f_8, f_4, s, c
fstp [_t4]
fstp [_t7] ; st: f_8, f_4, s, c
fstp _t4
fstp _t7 ; st: f_8, f_4, s, c
fld st3 ; st: c, f_8, f_4, s, c
fmul st0, st2
fld st3
fmul st0, st2 ; st: f_8*s, f_4*c, f_8, f_4, s, c
fsubp st1, st0 ; st:-t0, f_8, f_4, s, c
fchs
fld [_t8]
fld _t8
fchs ; st:-t8, t0, f_8, f_4, s, c
fsub st0, st1 ; st: t3, t0, f_8, f_4, s, c
fstp [_t3]
fstp [_t0] ; st: f_8, f_4, s, c
fstp _t3
fstp _t0 ; st: f_8, f_4, s, c
fstp st0 ; st: f_4, s, c
fstp st0 ; st: s, c
fstp st0 ; st: c
fstp st0 ; st: <empty>
fld [_t1]
fld [_t4]
fld _t1
fld _t4
fld st1
fsub st0, st1
fstp qword[ebx+8*11] ; f[l7] = t1-t4
faddp st1, st0
fstp qword[ebx+8*3] ; f[l5] = t1+t4
fld [_t2]
fld [_t3]
fld _t2
fld _t3
fld st1
fsub st0, st1
fstp qword[ebx+8*15] ; f[l8]
faddp st1, st0
fstp qword[ebx+8*7] ; f[l6]
fld [_t6]
fld _t6
fld qword[ebx+8]
fld st1
fsub st0, st1
fxch st1
faddp st2, st0 ; st : t2, t1
fld [_t8]
fsub [_t0]
fld [_t5]
fadd [_t7] ; st : t4, t3, t2, t1
fld _t8
fsub _t0
fld _t5
fadd _t7 ; st : t4, t3, t2, t1
fld st3
fsub st0, st1
@ -294,36 +345,42 @@ step2: ; Step2
cmp ebx, eax
jb .loop_i
mov esp, ebp
pop ebp
ret
align 8 ; shared local vars
_t0 dq 0
_t1 dq 0
_t2 dq 0
_t3 dq 0
_t4 dq 0
_t5 dq 0
_t6 dq 0
_t7 dq 0
_t8 dq 0
_t9 dq 0
;===================================================================
;=================================================================
; cdecl parameters:
; -- [ebp+8] = N
; -- [ebp+12] = p
; -- [ebp+16] = 4k-aligned data array address
; -- [ebp+20] = 4k-aligned SinCosTable address
; returns:
; -- nothing
; destroys:
; -- all GPRegs
; locals:
; -- 120 bytes (30 dwords) of stack-located locals (_t0 ... _t9, _l1 ... _step)
;; ==========================
align 4
step3:
;===================================================================
push ebp
mov ebp, esp
sub esp, 120
; 283 : {
; 293 : for (l=3; l<=p; l++)
mov cx, 0x0200
align 4
.newstep:
inc ch
cmp ch, Power_of_4
cmp ch, byte[ebp+12]
jg .done
mov [.step], cx
mov _step, cx
; 294 : {
; 295 : d1 = 1 << (l + l - 3);
@ -333,61 +390,63 @@ step3:
sub cl, 3
mov edx, 1
shl edx, cl
mov [.d1], edx
mov _d1, edx
; 296 : d2 = d1 << 1;
shl edx, 1
mov [.d2], edx
mov _d2, edx
mov eax, edx
; 297 : d3 = d2 << 1;
shl edx, 1
mov [.d3], edx
mov _d3, edx
; 298 : d4 = d2 + d3;
add eax, edx
mov [.d4], eax
mov _d4, eax
; 299 : d5 = d3 << 1;
shl edx, 1
mov [.d5], edx
mov _d5, edx
shl edx, 3
mov [.d6], edx ; d6 = d5*8 to simplify index operations
mov _d6, edx ; d6 = d5*8 to simplify index operations
; 339 : j5 = N / d5; ; moved out of internal loop
mov cl, Power_of_4
mov cl, [ebp+12]
sub cl, ch
add cl, cl
mov edx, 1
shl edx, cl
mov [.j5], edx
mov _j5, edx
; 300 :
; 301 : for (j=0; j<N; j+=d5)
mov esi, [_f]
mov ebx, esi
add esi, NumPoints*8
mov [.end_of_array], esi
mov ebx, [ebp+16]
mov esi, [ebp+8]
shl esi, 3
add esi, ebx
mov _end_of_array, esi
align 4
.next_j:
; {
; t1 = f[j] + f[j+d2];
mov eax, [.d2]
mov eax, _d2
fld qword[ebx]
fld qword[ebx+eax*8]
fld st1
fadd st0, st1
fstp [_t1]
fstp _t1
; t2 = f[j] - f[j+d2];
fsubp st1, st0
fstp [_t2]
fstp _t2
; t3 = f[j+d3] + f[j+d4];
mov edi, [.d3]
mov edi, _d3
fld qword[ebx+edi*8]
mov edx, [.d4]
mov edx, _d4
fld qword[ebx+edx*8]
fld st1
fsub st0, st1 ; st : t4, f4, f3
@ -398,7 +457,7 @@ step3:
; f[j+d4] = t2 - t4;
; f[j+d3] = t2 + t4;
fld [_t2]
fld _t2
fld st0
fsub st0, st2 ; st : f4, t2, t4, t3
fstp qword[ebx+edx*8] ; st : t2, t4, t3
@ -407,7 +466,7 @@ step3:
; f[j+d2] = t1 - t3;
; f[j] = t1 + t3;
fld [_t1]
fld _t1
fst st1
fsub st0, st2 ; st : f2, t1, t3
fstp qword[ebx+eax*8] ; st : t1, t3
@ -416,7 +475,7 @@ step3:
fstp st0
; jj = j + d1; / ??
mov edi, [.d1]
mov edi, _d1
shl edi, 3 ; = d1*8
mov edx, edi
mov eax, edi
@ -432,7 +491,7 @@ step3:
; t2 = f[jj+d2] * r;
fld qword [edi+eax]
fld [_root]
fld [fht_r]
fmul st1, st0 ; st : r, t2, t3, t1
; t4 = f[jj+d4] * r
fmul qword [edx+eax] ; st : t4, t2, t3, t1
@ -461,58 +520,61 @@ step3:
; for (k=1; k<d1; k++)
xor ecx, ecx ; ecx = k
mov [.jj], ecx
mov _jj, ecx
align 4
.next_k:
inc ecx
cmp ecx, [.d1]
cmp ecx, _d1
jge .done_k
; {
mov eax, [.d2] ; the sector increment
mov eax, _d2 ; the sector increment
; l1 = j + k;
mov edx, ecx
mov [.l1], edx ; [ebx+edx*8] --> f[j+k]
mov _l1, edx ; [ebx+edx*8] --> f[j+k]
; l2 = l1 + d2;
add edx, eax
mov [.l2], edx
mov _l2, edx
; l3 = l1 + d3;
add edx, eax
mov [.l3], edx
mov _l3, edx
; l4 = l1 + d4;
add edx, eax
mov [.l4], edx
mov _l4, edx
; l5 = j + d2 - k;
mov edx, eax
sub edx, ecx
mov [.l5], edx
mov _l5, edx
; l6 = l5 + d2;
add edx, eax
mov [.l6], edx
mov _l6, edx
; l7 = l5 + d3;
add edx, eax
mov [.l7], edx
mov _l7, edx
; l8 = l5 + d4;
add edx, eax
mov [.l8], edx
mov _l8, edx
; 340 : j5 *= k; // add-substituted multiplication
mov eax, [.jj]
add eax, [.j5]
mov [.jj], eax
mov eax, _jj
add eax, _j5
mov _jj, eax
; c1 = C[jj];
; s1 = S[jj];
mov edi, [_Cosins]
mov edi, [ebp+20]
fld qword[edi+eax*8]
mov esi, [_Sines]
mov esi, [ebp+8]
shl esi, 2
add esi, edi
fld qword[esi+eax*8] ; st : s1, c1
; t5 = f[l2] * c1 + f[l6] * s1;
; t8 = f[l6] * c1 - f[l2] * s1;
mov edx, [.l6]
mov edx, _l6
fld qword[ebx+edx*8]
mov edx, [.l2]
mov edx, _l2
fld st0
fmul st0, st2
fxch st1
@ -521,10 +583,10 @@ step3:
fmul st4, st0
fmulp st3, st0 ; st : f[l6]*c, f[l6]*s, f[l2]*s, f[l2]*c
fsub st0, st2 ; st : t8, f[l6]*s, f[l2]*s, f[l2]*c
fstp [_t8]
fstp _t8
faddp st2, st0 ; st : f[l2]*s, t5
fstp st0 ; st : t5
fstp [_t5] ; st : <empty>
fstp _t5 ; st : <empty>
; c2 = C[2*jj];
; s2 = S[2*jj];
@ -534,9 +596,9 @@ step3:
; t6 = f[l3] * c2 + f[l7] * s2;
; t9 = f[l7] * c2 - f[l3] * s2;
mov edx, [.l7]
mov edx, _l7
fld qword[ebx+edx*8]
mov edx, [.l3]
mov edx, _l3
fld st0
fmul st0, st2
fxch st1
@ -545,22 +607,22 @@ step3:
fmul st4, st0
fmulp st3, st0 ; st : f[l7]*c, f[l7]*s, f[l3]*s, f[l3]*c
fsub st0, st2 ; st : t9, f[l7]*s, f[l3]*s, f[l3]*c
fstp [_t9]
fstp _t9
faddp st2, st0 ; st : f[l2]*s, t6
fstp st0 ; st : t6
fstp [_t6] ; st : <empty>
fstp _t6 ; st : <empty>
; c3 = C[3*jj];
; s3 = S[3*jj];
add eax, [.jj]
add eax, _jj
fld qword[edi+eax*8]
fld qword[esi+eax*8] ; st : s3, c3
; t7 = f[l4] * c3 + f[l8] * s3;
; t0 = f[l8] * c3 - f[l4] * s3;
mov edx, [.l8]
mov edx, _l8
fld qword[ebx+edx*8]
mov edx, [.l4]
mov edx, _l4
fld st0
fmul st0, st2
fxch st1
@ -569,192 +631,162 @@ step3:
fmul st4, st0
fmulp st3, st0 ; st : f[l8]*c, f[l8]*s, f[l4]*s, f[l4]*c
fsub st0, st2 ; st : t9, f[l8]*s, f[l4]*s, f[l4]*c
fstp [_t0]
fstp _t0
faddp st2, st0 ; st : f[l2]*s, t7
fstp st0 ; st : t7
fstp [_t7] ; st : <empty>
fstp _t7 ; st : <empty>
; t1 = f[l5] - t9;
; t2 = f[l5] + t9;
mov eax, [.l5]
mov eax, _l5
fld qword [ebx+eax*8]
fld [_t9]
fld _t9
fld st0
fadd st0, st2
fstp [_t2]
fstp _t2
fsubp st1, st0
fstp [_t1]
fstp _t1
; t3 = - t8 - t0;
fld [_t8]
fadd [_t0]
fld _t8
fadd _t0
fchs
fstp [_t3]
fstp _t3
; t4 = t5 - t7;
fld [_t5]
fsub [_t7]
fstp [_t4]
fld _t5
fsub _t7
fstp _t4
; f[l5] = t1 + t4;
fld [_t1]
fld [_t4]
fld _t1
fld _t4
fld st0
fadd st0, st2
fstp qword [ebx+eax*8]
; f[l7] = t1 - t4;
mov eax, [.l7]
mov eax, _l7
fsubp st1, st0
fstp qword [ebx+eax*8]
; f[l6] = t2 + t3;
mov eax, [.l6]
fld [_t2]
fld [_t3]
mov eax, _l6
fld _t2
fld _t3
fld st0
fadd st0, st2
fstp qword [ebx+eax*8]
; f[l8] = t2 - t3;
mov eax, [.l8]
mov eax, _l8
fsubp st1, st0
fstp qword [ebx+eax*8]
; t1 = f[l1] + t6;
mov eax, [.l1]
mov eax, _l1
fld qword [ebx+eax*8]
fld [_t6]
fld _t6
fld st0
fadd st0, st2
fstp [_t1]
fstp _t1
; t2 = f[l1] - t6;
fsubp st1, st0
fstp [_t2]
fstp _t2
; t3 = t8 - t0;
fld [_t8]
fsub [_t0]
fstp [_t3]
fld _t8
fsub _t0
fstp _t3
; t4 = t5 + t7;
fld [_t5]
fadd [_t7]
fstp [_t4]
fld _t5
fadd _t7
fstp _t4
; f[l1] = t1 + t4;
mov eax, [.l1]
fld [_t1]
fld [_t4]
mov eax, _l1
fld _t1
fld _t4
fld st0
fadd st0, st2
fstp qword [ebx+eax*8]
; f[l3] = t1 - t4;
mov eax, [.l3]
mov eax, _l3
fsubp st1, st0
fstp qword [ebx+eax*8]
; f[l2] = t2 + t3;
mov eax, [.l2]
fld [_t2]
fld [_t3]
mov eax, _l2
fld _t2
fld _t3
fld st0
fadd st0, st2
fstp qword [ebx+eax*8]
; f[l4] = t2 - t3;
mov eax, [.l4]
mov eax, _l4
fsubp st1, st0
fstp qword [ebx+eax*8]
; 374 : }
jmp .next_k
align 4
.done_k:
; 375 : }
add ebx, [.d6] ; d6 = d5*8
cmp ebx, [.end_of_array]
add ebx, _d6 ; d6 = d5*8
cmp ebx, _end_of_array
jb .next_j
; 376 : }
mov cx, [.step]
mov cx, _step
jmp .newstep
.done:
mov esp, ebp
pop ebp
; 377 : }
ret
align 4
.l1 dd 0
.l2 dd 0
.l3 dd 0
.l4 dd 0
.l5 dd 0
.l6 dd 0
.l7 dd 0
.l8 dd 0
.l9 dd 0
.l0 dd 0
.d1 dd 0
.d2 dd 0
.d3 dd 0
.d4 dd 0
.d5 dd 0
.d6 dd 0
.j5 dd 0
.jj dd 0
.end_of_array dd 0
.step dw 0
align 8
;=========== Step3 ends here ===========
; =================================================================
;=================================================================
; syscall entry
;
_f dd ?
_N dd 1024 ; number of points
; parameters:
; -- [ebp+12] = N
; -- [ebp+16] = p
; -- [ebp+20] = 4k-aligned data array address
; -- [ebp+24] = 4k-aligned SinCosTable address
; returns:
; -- nothing
; destroys:
; -- all GPRegs
;; ==========================
_a dd ? ; initial data array
x dd 0 ; tranformed (float) data array
_Cosins dd 0
_Sines dd 0
align 4
FFT4:
or al, al
jnz .trans
mov cl, Power_of_4
mov eax, 1
shl eax, cl
shl eax, cl
mov [_N], eax
shl eax, 2 ; size of Sine table in bytes
add eax, ebx
mov [_Sines], ebx
mov [_Cosins], eax
cpuid
rdtsc
mov [.time], eax
call MakeSinCosTable
cpuid
rdtsc
sub eax, [.time]
ret
.trans:
mov [x], ebx
mov [_f], ebx
cli ;-----
cpuid
rdtsc
mov [.time], eax
; FHT_4 -- driver for the in-place radix-4 fast Hartley transform.
; Sequence: BitInvert (index permutation), step1 (4-point pass), step2
; (16-point pass), then step3 for the remaining passes.
; Arguments are read relative to ebp: [ebp+16] = p (power of 4),
; [ebp+20] = data array, [ebp+24] = sin/cos table. N is not read from the
; stack here; the value returned in ecx by BitInvert is used instead.
; NOTE(review): this listing is a rendered diff without +/- markers. The bare
; `call step1` / `call step2` right after BitInvert, the cpuid/rdtsc/sti/
; sub/ret run after `call step3`, and `.time dd 0` look like lines of the old
; FFT4 routine that the commit removed -- confirm against the real file.
FHT_4:
push ebp
mov ebp, esp
mov edx, [ebp+20] ; a
mov dl, byte[ebp+16] ; pack p into the low byte of the array address for BitInvert
call BitInvert ; pops N into ecx and Power_of_4 into edx before returning
call step1
call step2
push dword[ebp+20] ; a
push ecx ; N
call step1 ; 4-point transform
; NOTE(review): cl is the low byte of N here (assuming step1 leaves ecx
; untouched, as its header states), so this compares N's low byte with 1,
; not p. Presumably the intent was "p == 1 (N == 4): done" -- verify.
cmp cl, 1
jz .done
call step2 ; 16-point transform
; NOTE(review): the comment says "p = 2 ?" but the operand compared is 1.
; With p == 2 execution falls through to step3, whose pass counter starts
; at 3 and exits when it exceeds p -- confirm which value was intended.
cmp byte[ebp+16],1 ; p = 2 ?
jz .done
pop edx ; N
pop ecx ; a
; re-push the arguments in the order step3 documents: N, p, a, table
push dword[ebp+24] ; t
push ecx
push dword[ebp+16] ; p
push edx ; N
call step3
cpuid
rdtsc
sti ;----
sub eax, [.time]
ret
.done:
mov esp, ebp ; drop whatever arguments are still on the stack
pop ebp
.time dd 0
ret

View File

@ -0,0 +1,207 @@
;========================================================================
;= =
;= Fast Hartley Transform routine demo for KolibriOS =
;= =
;= Copyright (C) 2010, Artem Jerdev <kolibri@jerdev.co.uk> =
;= =
;= refer to wiki.kolibrios.org for all details =
;= =
;========================================================================
; ---- MENUET01 application header (flat 32-bit image based at 0) ----
        use32
        org     0x0

        db      'MENUET01'              ; 8-byte signature
        dd      0x01                    ; header version
        dd      START                   ; entry point
        dd      I_END                   ; size of the image
        dd      0x100000                ; memory requested for the application
        dd      0xbfffc                 ; initial esp
        dd      0x0                     ; I_Param
        dd      0x0                     ; I_Icon

; ---- shared macros and the FHT include used by this demo ----
        include 'macros.inc'
        include 'debug.inc'
        include 'FHT4i.inc'
START:                                  ; entry point named in the header
        call    main                    ; run the whole demo
        mov     eax, -1                 ; system function -1: close this program
        int     0x40
;=============================================================
;Func: produces one sample of the test signal for index [ii]
; intended formula (from the original C listing):
; ff = (int)(500*exp(-t) * cos (2.5*t))
; in: [ii] = sample index
; out: [ff] = sample, stored as a 64-bit float
; uses: eax, ebx, the x87 stack, and the scratch variables temp, t_mod,
; t_div, tv93 and tt
; NOTE(review): several steps below do not match the formula in the
; comments; they are flagged individually and left unchanged.
;------------
Func:
; 9 : {
; 10 : double x,t;
; 11 : int f;
; 12 :
; 13 : x = (i < N2) ? i : i - NUM_POINTS;
mov eax, [ii]
; NOTE(review): 512 and 1024 are hard-coded here, while FillData loops up to
; [_in] -- presumably left over from a fixed 1024-point version.
cmp eax, 512
jge .index_negative
jmp .index_correct
.index_negative:
sub eax, 1024
.index_correct:
mov [temp], eax ; temp = x (signed index)
; fild [temp]
; 14 : t = x / 16.0;
; f2xm1 argument (abs) must be less than 1, so
; x is split into x % 16 and x / 16
mov [t_mod], eax
and [t_mod], 0x0F ; x % 16
; NOTE(review): shr is a logical shift, so a negative x does not give a
; signed x / 16 here.
shr eax, 4 ; x / 16
mov [t_div], eax
fild [temp] ; st0 = x
; 15 : if (t<0) t = -t;
fabs ; st0 = |x|
exp_ok:
; 16 : f = (int)(512*2^(-t) * cos (2.5*t));
fchs ; st0 = -|x|
; NOTE(review): f2xm1 yields 2^st0 - 1 and is only specified for st0 in
; [-1, +1]; here st0 is the unscaled -|x| (t_mod is not used), and no 1.0 is
; added back -- verify.
f2xm1
fmul [f500] ; scale by 500.0
fstp [tv93] ; tv93 = amplitude term
fld [f2_5] ; st0 = 2.5
; NOTE(review): tt is not written earlier in this routine, so this multiplies
; by whatever the previous call left in tt rather than by t.
fmul [tt]
fcos
fmul [tv93]
fstp [tt] ; tt = amplitude * cos(...)
; patch the exponent field of the double in tt: add the low 16 bits of
; eax (x / 16 from above) to the 11-bit biased exponent
; NOTE(review): adding scales the value by 2^(x/16), whereas the formula
; above uses 2^(-t) -- the sign looks reversed; confirm.
mov bx, word[tt+6] ; top 16 bits of the double: sign, exponent, 4 mantissa bits
shr bx,4
and bx,0x07FF ; bx = biased exponent
add ax,bx
shl ax,4 ; move the new exponent back into bits 14:4
and word[tt+6], 0x800F ; keep the sign and mantissa bits, clear the exponent
or word[tt+6], ax
fld [tt]
fstp [ff] ; result is returned in [ff]
; 17 : return f;
; 18 : }
ret
;---------------------------------------------------------
; FillData: fills the data array with test samples
;
; in:   edx   = base address of the array of qwords
;       [_in] = number of samples to generate
; out:  array[i] = Func(i) for i = 0 .. [_in]-1
; uses: eax, ebx (through Func), ecx, x87 stack
FillData:
        xor     ecx, ecx                ; ecx = i, the sample index
.next_sample:
        mov     [ii], ecx               ; hand the index to Func
        call    Func                    ; leaves the sample in [ff]
        fld     qword [ff]
        fstp    qword [edx+ecx*8]       ; array[i] = sample
        inc     ecx
        cmp     ecx, [_in]              ; i == N ?
        jne     .next_sample
        ret
;====================================================================
; main -- FHT demo driver
;
; Initialises the heap and the FPU, builds the sin/cos table, allocates
; and fills an N-point array of doubles, runs the kernel transform through
; SYSCALL service 0 and prints the elapsed TSC ticks (low dword) to the
; debug board. Everything allocated here is released before returning.
;
; in:   nothing
; out:  nothing
; uses: all general-purpose registers, x87 state
;====================================================================
align 4
_ia     dd      0                       ; address of the data array
_ii     dd      0                       ; not referenced in the visible code
_ip     dd      0                       ; p: N = 4^p (only the low byte is written)
_in     dd      0                       ; N: number of points
_it     dd      0                       ; address of the sin/cos table
;-----------------
main:
        mov     eax, 68
        mov     ebx, 11
        int     0x40                    ; sysfn 68.11: initialise the heap

        fninit

        ; ---- problem size: p = 2, N = 4^p ----
        mov     cl, 2                   ; power of 4
        mov     byte[_ip], cl
        mov     eax, 1
        shl     eax, cl
        shl     eax, cl                 ; eax = 1 << 2p
        mov     [_in], eax

        ; ---- sin/cos table (routine comes from FHT4i.inc) ----
        mov     dl, cl                  ; dl = p
        call    CreateSinCosTable
        mov     [_it], edx              ; table address is taken from edx

        ; ---- data array: N qwords ----
        mov     ecx, [_in]
        shl     ecx, 3                  ; size in bytes = N * 8
        mov     ebx, 12
        mov     eax, 68
        int     0x40                    ; sysfn 68.12: allocate memory
        test    eax, eax                ; allocation failed -> do not touch the array
        jz      .no_memory
        mov     [_ia], eax
        mov     edx, eax
        call    FillData

        ; ---- timed transform ----
        xor     eax, eax                ; CPUID leaf 0; used only to serialise
        cpuid
        rdtsc
        mov     [t_0], eax

        push    [_it]                   ; sin/cos table
        push    [_ia]                   ; data array
        push    [_ip]                   ; p
        push    [_in]                   ; N
;       call    FHT_4
        xor     eax, eax                ; SYSCALL service 0
        syscall
        add     esp, 16                 ; drop the four arguments

        xor     eax, eax                ; CPUID leaf 0; used only to serialise
        cpuid
        rdtsc
        mov     [t_1], eax
        sub     eax, [t_0]              ; elapsed ticks (low dword only)

        debug_print_hex eax
        print '<- fht time'

        ; ---- cleanup ----
        mov     edx, [_it]
        call    DestroySinCosTable
        mov     ecx, [_ia]
        mov     ebx, 13
        mov     eax, 68
        int     0x40                    ; sysfn 68.13: free the data array
        ret

.no_memory:
        ; the data array was never allocated: release only the table
        print 'fht: out of memory'
        mov     edx, [_it]
        call    DestroySinCosTable
        ret
; ========================================================
; static data
; Constants and scratch variables used by Func and main.
;----------------
align 8
;f18 dq 0x4032000000000000
f256 dq 256.01f ; not referenced in the visible code
f14_2 dq 14.2f ; not referenced in the visible code
f500 dq 0x407f400000000000 ; 500.0 as an IEEE-754 double (amplitude in Func)
f2_5 dq 0x4004000000000000 ; 2.5 as an IEEE-754 double (cosine factor in Func)
tt dq ? ; Func scratch: cosine term; its exponent is patched in place
tv93 dq ? ; Func scratch: amplitude term
t_div dd ? ; x / 16, written by Func
t_mod dd ? ; x % 16, written by Func
temp dd ? ; signed sample index loaded with fild in Func
ff dq ? ; Func return value, stored as a 64-bit float
ii dd ? ; argument (int) = array index
t_1 dd ? ; low dword of the TSC after the transform
t_0 dd ? ; low dword of the TSC before the transform
fcontrol dw 0x0037f ; not referenced in the visible code (0x037F is the x87 control word FNINIT sets)
title db ' Fast Hartley Transform Test - A.Jerdev 2010' ; not referenced in the visible code
I_END: ; end of the image; its address is stored in the header