an experimental kernel with a mad syscall and FHT inside

git-svn-id: svn://kolibrios.org@1641 a494cfbc-eb01-0410-851d-a64ba20cac60
This commit is contained in:
Artem Jerdev (art_zh) 2010-10-03 23:11:42 +00:00
parent abbc09c677
commit 112a3665cb
5 changed files with 661 additions and 479 deletions

View File

@ -76,9 +76,6 @@ pci_ext_config:
shl eax, 8 shl eax, 8
test eax, 0x000F0000 ; MMIO Base must be bus0-aligned test eax, 0x000F0000 ; MMIO Base must be bus0-aligned
jnz .no_pcie_cfg jnz .no_pcie_cfg
; -- it looks like a true PCIe config space;
ret ; <<<<<<<<<<< OK >>>>>>>>>>> ret ; <<<<<<<<<<< OK >>>>>>>>>>>
.no_pcie_cfg: .no_pcie_cfg:
@ -92,6 +89,7 @@ pci_ext_config:
.pcie_failed: .pcie_failed:
mov esi, boot_pcie_fail mov esi, boot_pcie_fail
call boot_log call boot_log
xor eax, eax
ret ; <<<<<<<<< FAILURE >>>>>>>>> ret ; <<<<<<<<< FAILURE >>>>>>>>>

View File

@ -21,37 +21,6 @@ cross_order:
call dword [servetable+edi*4] call dword [servetable+edi*4]
ret ret
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ;;
;; SYSENTER ENTRY ;;
;; (not used on AMD systems) ;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;align 32
;sysenter_entry:
; ; Íàñòðàèâàåì ñòåê
; mov esp, [ss:tss._esp0]
; sti
; push ebp ; save app esp + 4
; mov ebp, [ebp] ; ebp - original ebp
; ;------------------
; pushad
; cld
;
; movzx eax, al
; call dword [servetable2 + eax * 4]
; popad
; ;------------------
; xchg ecx, [ss:esp] ; â âåðøèí ñòåêà - app ecx, ecx - app esp + 4
; sub ecx, 4
; xchg edx, [ecx] ; edx - return point, & save original edx
; push edx
; mov edx, [ss:esp + 4]
; mov [ecx + 4], edx ; save original ecx
; pop edx
; sysexit
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ;; ;; ;;
;; SYSTEM CALL ENTRY ;; ;; SYSTEM CALL ENTRY ;;
@ -69,17 +38,17 @@ i40:
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ;; ;; ;;
;; SYSCALL ENTRY ;; ;; SYSCALL ENTRY -- NEW !!! ;;
;; ;; ;; ;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
align 32 align 32
syscall_entry: syscall_entry:
; push ecx ; sti
sti push ecx
and eax, 3 and eax, 3
call dword [servetable3 + eax * 4] call dword [servetable3 + eax * 4]
pop ecx
; pop ecx
sysret sysret
iglobal iglobal
@ -89,7 +58,6 @@ iglobal
align 4 align 4
servetable: servetable:
dd socket ; 53-Socket interface dd socket ; 53-Socket interface
dd 0 dd 0
dd 0 dd 0
@ -187,8 +155,8 @@ iglobal
align 4 align 4
servetable3: servetable3:
dd FFT4 ; 0 dd FHT_4 ; 0
dd FFT4 ; 1 dd FHT_4 ; 1
dd paleholder ; 2 dd paleholder ; 2
dd sys_end ; last dd sys_end ; last

View File

@ -16,30 +16,6 @@
$Revision$ $Revision$
;struc db [a] { common . db a
; if ~used .
; display 'not used db: ',`.,13,10
; end if }
;struc dw [a] { common . dw a
; if ~used .
; display 'not used dw: ',`.,13,10
; end if }
;struc dd [a] { common . dd a
; if ~used .
; display 'not used dd: ',`.,13,10
; end if }
;struc dp [a] { common . dp a
; if ~used .
; display 'not used dp: ',`.,13,10
; end if }
;struc dq [a] { common . dq a
; if ~used .
; display 'not used dq: ',`.,13,10
; end if }
;struc dt [a] { common . dt a
; if ~used .
; display 'not used dt: ',`.,13,10
; end if }
struc POINT { struc POINT {
.x dd ? .x dd ?
@ -244,7 +220,7 @@ include "fs/ext2.inc" ; read / write for ext2 filesystem
; sound ; sound
include "sound/playnote.inc" ; player Note for Speaker PC include "sound/playnote.inc" ; player Note for Speaker PC
include "sound/FFT.inc" ; fast Fourier transform routines include "sound/FHT.inc" ; fast Fourier transform routines
; display ; display
@ -311,3 +287,4 @@ include "core/ext_lib.inc"
; list of external functions ; list of external functions
include "imports.inc" include "imports.inc"

View File

@ -5,62 +5,51 @@
; free KolibriOS version - not to be ported to other OSes ; free KolibriOS version - not to be ported to other OSes
; ========================================================== ; ==========================================================
Power_of_4 equ 5
NumPoints equ 1024
N_2 equ NumPoints / 2
N_4 equ NumPoints / 4
;=================================================================
; global constants ; global constants
align 8 align 8
_root dq 1.41421356237309504880169 ; = sqrt(2) fht_r dq 1.41421356237309504880169 ; = sqrt(2)
_root2 dq 0.70710678118654752440084 ; = sqrt(2)/2 fht_r2 dq 0.70710678118654752440084 ; = sqrt(2)/2
_c1 dq 0.92387953251128675612818 ; = cos(pi/8) fht_c1 dq 0.92387953251128675612818 ; = cos(pi/8)
_s1 dq 0.38268343236508977172846 ; = sin(pi/8) fht_s1 dq 0.38268343236508977172846 ; = sin(pi/8)
_dx dq 0.00613592315154296875 ; pi/512
;[_CosTable] dd 0 ; N_2 elements
;[_SinTable] dd 0 ; N_2 elements
;=================================================================
; parameter1:
; -- reg dl (bits[3:0]) = Power_of_4
; -- reg edx && (-16) = 4k-aligned data array address
; returns:
; -- edx = Power_of_4
; -- ecx = N
; destroys:
; -- eax, ebx, ecx, edx, esi
;; ========================== ;; ==========================
align 4 align 4
MakeSinCosTable:
mov ebx, [_Sines]
mov ecx, [_Cosins]
xor eax, eax
fld [_dx] ; st : dx
fldz ; st : 0, dx
.loop:
fld st0 ; st : x, x, dx
FSINCOS ; st : cos, sin, x, dx
fstp qword [ecx+eax*8] ; st : sin, x, dx
fstp qword [ebx+eax*8] ; st : x, dx
fadd st0, st1 ; st : x+dx, dx
inc eax
cmp eax, N_2
jne .loop
fstp st0 ; st : dx
fstp st0 ; st : <empty>
ret
; ================================================================
align 4
BitInvert: BitInvert:
mov esi, [x] ; array of qwords mov esi, edx
and esi, 0xFFFFFFF0
and edx, 0x0F
push edx
mov cl, dl
xor eax, eax
inc eax
shl eax, cl
shl eax, cl
push eax
xor ecx, ecx ; index term xor ecx, ecx ; index term
align 4
.newterm: .newterm:
inc ecx inc ecx
cmp ecx, NumPoints cmp ecx, [esp] ; N
jge .done jge .done
xor eax, eax xor eax, eax
mov edx, ecx mov edx, ecx
xor bl, bl xor bl, bl
align 4
.do_invert: .do_invert:
inc bl inc bl
cmp bl, Power_of_4 cmp bl, byte[esp+4] ; Power_of_4
jg .switch jg .switch
mov bh, dl mov bh, dl
@ -69,6 +58,7 @@ BitInvert:
or al, bh or al, bh
shr edx, 2 shr edx, 2
jmp .do_invert jmp .do_invert
align 8
.switch: .switch:
cmp eax, ecx cmp eax, ecx
@ -80,17 +70,32 @@ BitInvert:
fstp qword [esi+ecx*8] fstp qword [esi+ecx*8]
jmp .newterm jmp .newterm
align 4
.done: .done:
pop ecx
pop edx
ret ret
;================================================================= ;=================================================================
;=================================================================
; stack parameters (caller cleans up -- step1 returns with a plain ret):
; -- [esp+4] = N
; -- [esp+8] = 4k-aligned data array address
; returns:
; -- nothing
; destroys:
; -- ebx, esi
;; ==========================
align 4 align 4
step1: step1:
mov esi, [x] mov ebx, [esp+8]
mov ebx, esi mov esi, [esp+4]
add esi, NumPoints*8 shl esi, 3
add esi, ebx
align 4
.loop: .loop:
fld qword[ebx] fld qword[ebx]
fld qword[ebx+8] fld qword[ebx+8]
@ -119,19 +124,65 @@ step1:
add ebx, 32 add ebx, 32
cmp ebx, esi cmp ebx, esi
jnz .loop jnz .loop
ret ret
; local stack definitions
;
;===========================================================================
step2: ; Step2
;=========================================================================== ;===========================================================================
_t0 equ dword [esp]
_t1 equ dword[esp+4]
_t2 equ dword[esp+8]
_t3 equ dword[esp+12]
_t4 equ dword[esp+16]
_t5 equ dword[esp+20]
_t6 equ dword[esp+24]
_t7 equ dword[esp+28]
_t8 equ dword[esp+32]
_t9 equ dword[esp+36]
mov eax, [_f] _l1 equ dword[esp+40]
mov ebx, eax _l2 equ dword[esp+44]
add eax, NumPoints*8 _l3 equ dword[esp+48]
_l4 equ dword[esp+52]
_l5 equ dword[esp+56]
_l6 equ dword[esp+60]
_l7 equ dword[esp+64]
_l8 equ dword[esp+68]
_l9 equ dword[esp+72]
_l0 equ dword[esp+76]
_d1 equ dword[esp+80]
_d2 equ dword[esp+84]
_d3 equ dword[esp+88]
_d4 equ dword[esp+92]
_d5 equ dword[esp+96]
_d6 equ dword[esp+100]
_j5 equ dword[esp+104]
_jj equ dword[esp+108]
_end_of_array equ dword[esp+112]
_step equ word [esp+116]
;=================================================================
; cdecl parameters:
; -- [ebp+8] = N
; -- [ebp+12] = 4k-aligned data array address
; returns:
; -- nothing
; destroys:
; -- eax, ebx
; locals:
; -- 10 stack-located dwords (_t0 ... _t9)
;; ==========================
align 4
step2:
push ebp
mov ebp, esp
sub esp, 40
mov ebx, [ebp+12]
mov eax, [ebp+ 8]
shl eax, 3
add eax, ebx
align 4
.loop_i: .loop_i:
; -- quad subelements +0, +4, +8 and +12 (simpliest operations) ; -- quad subelements +0, +4, +8 and +12 (simpliest operations)
@ -163,7 +214,7 @@ step2: ; Step2
; -- even subelements +2, +6, +10 and +14 (2 multiplications needed) ; -- even subelements +2, +6, +10 and +14 (2 multiplications needed)
fld qword[ebx+8*2] fld qword[ebx+8*2]
fld qword[ebx+8*6] fld qword[ebx+8*6]
fld [_root] fld [fht_r]
fmul st1, st0 ; st : r, t2, t1 fmul st1, st0 ; st : r, t2, t1
fld qword[ebx+8*10] fld qword[ebx+8*10]
fxch st1 ; st : r, t3, t2, t1 fxch st1 ; st : r, t3, t2, t1
@ -194,20 +245,20 @@ step2: ; Step2
fsub st0, st1 fsub st0, st1
fxch st1 fxch st1
faddp st2, st0 ; st : (f[l3]-f[l7]), (f[l3]+f[l7]) faddp st2, st0 ; st : (f[l3]-f[l7]), (f[l3]+f[l7])
fld [_root2] fld [fht_r2]
fmul st2, st0 fmul st2, st0
fmulp st1, st0 ; st : t9, t6 fmulp st1, st0 ; st : t9, t6
fld qword[ebx+8*3] fld qword[ebx+8*3]
fld st0 fld st0
fadd st0, st2 ; st : t1, f[l5], t9, t6 fadd st0, st2 ; st : t1, f[l5], t9, t6
fstp [_t1] fstp _t1
fsub st0, st1 fsub st0, st1
fstp [_t2] fstp _t2
fstp [_t9] ; (t9 never used) fstp _t9 ; (t9 never used)
fstp [_t6] ; st : <empty> fstp _t6 ; st : <empty>
fld [_c1] fld [fht_c1]
fld [_s1] fld [fht_s1]
fld qword[ebx+8*5] fld qword[ebx+8*5]
fld qword[ebx+8*7] fld qword[ebx+8*7]
fld st3 ; st: c1, f[l6], f[l2], s1, c1 fld st3 ; st: c1, f[l6], f[l2], s1, c1
@ -215,13 +266,13 @@ step2: ; Step2
fld st1 ; st: f_6, f_2*c, f_6, f_2, s, c fld st1 ; st: f_6, f_2*c, f_6, f_2, s, c
fmul st0, st4 ; st: f_6*s, f_2*c, f_6, f_2, s, c fmul st0, st4 ; st: f_6*s, f_2*c, f_6, f_2, s, c
faddp st1, st0 ; st: t5, f_6, f_2, s, c faddp st1, st0 ; st: t5, f_6, f_2, s, c
fstp [_t5] ; st: f_6, f_2, s, c fstp _t5 ; st: f_6, f_2, s, c
fld st3 ; st: c, f_6, f_2, s, c fld st3 ; st: c, f_6, f_2, s, c
fmul st0, st1 fmul st0, st1
fld st3 fld st3
fmul st0, st3 ; st: f_2*s, f_6*c, f_6, f_2, s, c fmul st0, st3 ; st: f_2*s, f_6*c, f_6, f_2, s, c
fsubp st1, st0 ; st: t8, f_6, f_2, s, c fsubp st1, st0 ; st: t8, f_6, f_2, s, c
fstp [_t8] ; st: f_6, f_2, s, c fstp _t8 ; st: f_6, f_2, s, c
fstp st0 ; st: f_2, s, c fstp st0 ; st: f_2, s, c
fstp st0 ; st: s, c fstp st0 ; st: s, c
@ -232,51 +283,51 @@ step2: ; Step2
fld st3 fld st3
fmul st0, st3 ; st: f_4*s, f_8*c, f_8, f_4, s, c fmul st0, st3 ; st: f_4*s, f_8*c, f_8, f_4, s, c
faddp st1, st0 ; st: t7, f_8, f_4, s, c faddp st1, st0 ; st: t7, f_8, f_4, s, c
fld [_t5] ; st: t5, t7, f_8, f_4, s, c fld _t5 ; st: t5, t7, f_8, f_4, s, c
fsub st0, st1 ; st: t4, t7, f_8, f_4, s, c fsub st0, st1 ; st: t4, t7, f_8, f_4, s, c
fstp [_t4] fstp _t4
fstp [_t7] ; st: f_8, f_4, s, c fstp _t7 ; st: f_8, f_4, s, c
fld st3 ; st: c, f_8, f_4, s, c fld st3 ; st: c, f_8, f_4, s, c
fmul st0, st2 fmul st0, st2
fld st3 fld st3
fmul st0, st2 ; st: f_8*s, f_4*c, f_8, f_4, s, c fmul st0, st2 ; st: f_8*s, f_4*c, f_8, f_4, s, c
fsubp st1, st0 ; st:-t0, f_8, f_4, s, c fsubp st1, st0 ; st:-t0, f_8, f_4, s, c
fchs fchs
fld [_t8] fld _t8
fchs ; st:-t8, t0, f_8, f_4, s, c fchs ; st:-t8, t0, f_8, f_4, s, c
fsub st0, st1 ; st: t3, t0, f_8, f_4, s, c fsub st0, st1 ; st: t3, t0, f_8, f_4, s, c
fstp [_t3] fstp _t3
fstp [_t0] ; st: f_8, f_4, s, c fstp _t0 ; st: f_8, f_4, s, c
fstp st0 ; st: f_4, s, c fstp st0 ; st: f_4, s, c
fstp st0 ; st: s, c fstp st0 ; st: s, c
fstp st0 ; st: c fstp st0 ; st: c
fstp st0 ; st: <empty> fstp st0 ; st: <empty>
fld [_t1] fld _t1
fld [_t4] fld _t4
fld st1 fld st1
fsub st0, st1 fsub st0, st1
fstp qword[ebx+8*11] ; f[l7] = t1-t4 fstp qword[ebx+8*11] ; f[l7] = t1-t4
faddp st1, st0 faddp st1, st0
fstp qword[ebx+8*3] ; f[l5] = t1+t4 fstp qword[ebx+8*3] ; f[l5] = t1+t4
fld [_t2] fld _t2
fld [_t3] fld _t3
fld st1 fld st1
fsub st0, st1 fsub st0, st1
fstp qword[ebx+8*15] ; f[l8] fstp qword[ebx+8*15] ; f[l8]
faddp st1, st0 faddp st1, st0
fstp qword[ebx+8*7] ; f[l6] fstp qword[ebx+8*7] ; f[l6]
fld [_t6] fld _t6
fld qword[ebx+8] fld qword[ebx+8]
fld st1 fld st1
fsub st0, st1 fsub st0, st1
fxch st1 fxch st1
faddp st2, st0 ; st : t2, t1 faddp st2, st0 ; st : t2, t1
fld [_t8] fld _t8
fsub [_t0] fsub _t0
fld [_t5] fld _t5
fadd [_t7] ; st : t4, t3, t2, t1 fadd _t7 ; st : t4, t3, t2, t1
fld st3 fld st3
fsub st0, st1 fsub st0, st1
@ -294,36 +345,42 @@ step2: ; Step2
cmp ebx, eax cmp ebx, eax
jb .loop_i jb .loop_i
mov esp, ebp
pop ebp
ret ret
align 8 ; shared local vars
_t0 dq 0
_t1 dq 0
_t2 dq 0
_t3 dq 0
_t4 dq 0
_t5 dq 0
_t6 dq 0
_t7 dq 0
_t8 dq 0
_t9 dq 0
;=================================================================== ;=================================================================
; cdecl parameters:
; -- [ebp+8] = N
; -- [ebp+12] = p
; -- [ebp+16] = 4k-aligned data array address
; -- [ebp+20] = 4k-aligned SinCosTable address
; returns:
; -- nothing
; destroys:
; -- all GPRegs
; locals:
; -- 120 bytes of stack-located locals (_t0 ... _t9, _l1 ... _step)
;; ==========================
align 4
step3: step3:
;=================================================================== push ebp
mov ebp, esp
sub esp, 120
; 283 : { ; 283 : {
; 293 : for (l=3; l<=p; l++) ; 293 : for (l=3; l<=p; l++)
mov cx, 0x0200 mov cx, 0x0200
align 4
.newstep: .newstep:
inc ch inc ch
cmp ch, Power_of_4 cmp ch, byte[ebp+12]
jg .done jg .done
mov [.step], cx mov _step, cx
; 294 : { ; 294 : {
; 295 : d1 = 1 << (l + l - 3); ; 295 : d1 = 1 << (l + l - 3);
@ -333,61 +390,63 @@ step3:
sub cl, 3 sub cl, 3
mov edx, 1 mov edx, 1
shl edx, cl shl edx, cl
mov [.d1], edx mov _d1, edx
; 296 : d2 = d1 << 1; ; 296 : d2 = d1 << 1;
shl edx, 1 shl edx, 1
mov [.d2], edx mov _d2, edx
mov eax, edx mov eax, edx
; 297 : d3 = d2 << 1; ; 297 : d3 = d2 << 1;
shl edx, 1 shl edx, 1
mov [.d3], edx mov _d3, edx
; 298 : d4 = d2 + d3; ; 298 : d4 = d2 + d3;
add eax, edx add eax, edx
mov [.d4], eax mov _d4, eax
; 299 : d5 = d3 << 1; ; 299 : d5 = d3 << 1;
shl edx, 1 shl edx, 1
mov [.d5], edx mov _d5, edx
shl edx, 3 shl edx, 3
mov [.d6], edx ; d6 = d5*8 to simplify index operations mov _d6, edx ; d6 = d5*8 to simplify index operations
; 339 : j5 = N / d5; ; moved out of internal loop ; 339 : j5 = N / d5; ; moved out of internal loop
mov cl, Power_of_4 mov cl, [ebp+12]
sub cl, ch sub cl, ch
add cl, cl add cl, cl
mov edx, 1 mov edx, 1
shl edx, cl shl edx, cl
mov [.j5], edx mov _j5, edx
; 300 : ; 300 :
; 301 : for (j=0; j<N; j+=d5) ; 301 : for (j=0; j<N; j+=d5)
mov esi, [_f] mov ebx, [ebp+16]
mov ebx, esi mov esi, [ebp+8]
add esi, NumPoints*8 shl esi, 3
mov [.end_of_array], esi add esi, ebx
mov _end_of_array, esi
align 4
.next_j: .next_j:
; { ; {
; t1 = f[j] + f[j+d2]; ; t1 = f[j] + f[j+d2];
mov eax, [.d2] mov eax, _d2
fld qword[ebx] fld qword[ebx]
fld qword[ebx+eax*8] fld qword[ebx+eax*8]
fld st1 fld st1
fadd st0, st1 fadd st0, st1
fstp [_t1] fstp _t1
; t2 = f[j] - f[j+d2]; ; t2 = f[j] - f[j+d2];
fsubp st1, st0 fsubp st1, st0
fstp [_t2] fstp _t2
; t3 = f[j+d3] + f[j+d4]; ; t3 = f[j+d3] + f[j+d4];
mov edi, [.d3] mov edi, _d3
fld qword[ebx+edi*8] fld qword[ebx+edi*8]
mov edx, [.d4] mov edx, _d4
fld qword[ebx+edx*8] fld qword[ebx+edx*8]
fld st1 fld st1
fsub st0, st1 ; st : t4, f4, f3 fsub st0, st1 ; st : t4, f4, f3
@ -398,7 +457,7 @@ step3:
; f[j+d4] = t2 - t4; ; f[j+d4] = t2 - t4;
; f[j+d3] = t2 + t4; ; f[j+d3] = t2 + t4;
fld [_t2] fld _t2
fld st0 fld st0
fsub st0, st2 ; st : f4, t2, t4, t3 fsub st0, st2 ; st : f4, t2, t4, t3
fstp qword[ebx+edx*8] ; st : t2, t4, t3 fstp qword[ebx+edx*8] ; st : t2, t4, t3
@ -407,7 +466,7 @@ step3:
; f[j+d2] = t1 - t3; ; f[j+d2] = t1 - t3;
; f[j] = t1 + t3; ; f[j] = t1 + t3;
fld [_t1] fld _t1
fst st1 fst st1
fsub st0, st2 ; st : f2, t1, t3 fsub st0, st2 ; st : f2, t1, t3
fstp qword[ebx+eax*8] ; st : t1, t3 fstp qword[ebx+eax*8] ; st : t1, t3
@ -416,7 +475,7 @@ step3:
fstp st0 fstp st0
; jj = j + d1; / ?? ; jj = j + d1; / ??
mov edi, [.d1] mov edi, _d1
shl edi, 3 ; = d1*8 shl edi, 3 ; = d1*8
mov edx, edi mov edx, edi
mov eax, edi mov eax, edi
@ -432,7 +491,7 @@ step3:
; t2 = f[jj+d2] * r; ; t2 = f[jj+d2] * r;
fld qword [edi+eax] fld qword [edi+eax]
fld [_root] fld [fht_r]
fmul st1, st0 ; st : r, t2, t3, t1 fmul st1, st0 ; st : r, t2, t3, t1
; t4 = f[jj+d4] * r ; t4 = f[jj+d4] * r
fmul qword [edx+eax] ; st : t4, t2, t3, t1 fmul qword [edx+eax] ; st : t4, t2, t3, t1
@ -461,58 +520,61 @@ step3:
; for (k=1; k<d1; k++) ; for (k=1; k<d1; k++)
xor ecx, ecx ; ecx = k xor ecx, ecx ; ecx = k
mov [.jj], ecx mov _jj, ecx
align 4
.next_k: .next_k:
inc ecx inc ecx
cmp ecx, [.d1] cmp ecx, _d1
jge .done_k jge .done_k
; { ; {
mov eax, [.d2] ; the sector increment mov eax, _d2 ; the sector increment
; l1 = j + k; ; l1 = j + k;
mov edx, ecx mov edx, ecx
mov [.l1], edx ; [ebx+edx*8] --> f[j+k] mov _l1, edx ; [ebx+edx*8] --> f[j+k]
; l2 = l1 + d2; ; l2 = l1 + d2;
add edx, eax add edx, eax
mov [.l2], edx mov _l2, edx
; l3 = l1 + d3; ; l3 = l1 + d3;
add edx, eax add edx, eax
mov [.l3], edx mov _l3, edx
; l4 = l1 + d4; ; l4 = l1 + d4;
add edx, eax add edx, eax
mov [.l4], edx mov _l4, edx
; l5 = j + d2 - k; ; l5 = j + d2 - k;
mov edx, eax mov edx, eax
sub edx, ecx sub edx, ecx
mov [.l5], edx mov _l5, edx
; l6 = l5 + d2; ; l6 = l5 + d2;
add edx, eax add edx, eax
mov [.l6], edx mov _l6, edx
; l7 = l5 + d3; ; l7 = l5 + d3;
add edx, eax add edx, eax
mov [.l7], edx mov _l7, edx
; l8 = l5 + d4; ; l8 = l5 + d4;
add edx, eax add edx, eax
mov [.l8], edx mov _l8, edx
; 340 : j5 *= k; // add-substituted multiplication ; 340 : j5 *= k; // add-substituted multiplication
mov eax, [.jj] mov eax, _jj
add eax, [.j5] add eax, _j5
mov [.jj], eax mov _jj, eax
; c1 = C[jj]; ; c1 = C[jj];
; s1 = S[jj]; ; s1 = S[jj];
mov edi, [_Cosins] mov edi, [ebp+20]
fld qword[edi+eax*8] fld qword[edi+eax*8]
mov esi, [_Sines] mov esi, [ebp+8]
shl esi, 2
add esi, edi
fld qword[esi+eax*8] ; st : s1, c1 fld qword[esi+eax*8] ; st : s1, c1
; t5 = f[l2] * c1 + f[l6] * s1; ; t5 = f[l2] * c1 + f[l6] * s1;
; t8 = f[l6] * c1 - f[l2] * s1; ; t8 = f[l6] * c1 - f[l2] * s1;
mov edx, [.l6] mov edx, _l6
fld qword[ebx+edx*8] fld qword[ebx+edx*8]
mov edx, [.l2] mov edx, _l2
fld st0 fld st0
fmul st0, st2 fmul st0, st2
fxch st1 fxch st1
@ -521,10 +583,10 @@ step3:
fmul st4, st0 fmul st4, st0
fmulp st3, st0 ; st : f[l6]*c, f[l6]*s, f[l2]*s, f[l2]*c fmulp st3, st0 ; st : f[l6]*c, f[l6]*s, f[l2]*s, f[l2]*c
fsub st0, st2 ; st : t8, f[l6]*s, f[l2]*s, f[l2]*c fsub st0, st2 ; st : t8, f[l6]*s, f[l2]*s, f[l2]*c
fstp [_t8] fstp _t8
faddp st2, st0 ; st : f[l2]*s, t5 faddp st2, st0 ; st : f[l2]*s, t5
fstp st0 ; st : t5 fstp st0 ; st : t5
fstp [_t5] ; st : <empty> fstp _t5 ; st : <empty>
; c2 = C[2*jj]; ; c2 = C[2*jj];
; s2 = S[2*jj]; ; s2 = S[2*jj];
@ -534,9 +596,9 @@ step3:
; t6 = f[l3] * c2 + f[l7] * s2; ; t6 = f[l3] * c2 + f[l7] * s2;
; t9 = f[l7] * c2 - f[l3] * s2; ; t9 = f[l7] * c2 - f[l3] * s2;
mov edx, [.l7] mov edx, _l7
fld qword[ebx+edx*8] fld qword[ebx+edx*8]
mov edx, [.l3] mov edx, _l3
fld st0 fld st0
fmul st0, st2 fmul st0, st2
fxch st1 fxch st1
@ -545,22 +607,22 @@ step3:
fmul st4, st0 fmul st4, st0
fmulp st3, st0 ; st : f[l7]*c, f[l7]*s, f[l3]*s, f[l3]*c fmulp st3, st0 ; st : f[l7]*c, f[l7]*s, f[l3]*s, f[l3]*c
fsub st0, st2 ; st : t9, f[l7]*s, f[l3]*s, f[l3]*c fsub st0, st2 ; st : t9, f[l7]*s, f[l3]*s, f[l3]*c
fstp [_t9] fstp _t9
faddp st2, st0 ; st : f[l2]*s, t6 faddp st2, st0 ; st : f[l2]*s, t6
fstp st0 ; st : t6 fstp st0 ; st : t6
fstp [_t6] ; st : <empty> fstp _t6 ; st : <empty>
; c3 = C[3*jj]; ; c3 = C[3*jj];
; s3 = S[3*jj]; ; s3 = S[3*jj];
add eax, [.jj] add eax, _jj
fld qword[edi+eax*8] fld qword[edi+eax*8]
fld qword[esi+eax*8] ; st : s3, c3 fld qword[esi+eax*8] ; st : s3, c3
; t7 = f[l4] * c3 + f[l8] * s3; ; t7 = f[l4] * c3 + f[l8] * s3;
; t0 = f[l8] * c3 - f[l4] * s3; ; t0 = f[l8] * c3 - f[l4] * s3;
mov edx, [.l8] mov edx, _l8
fld qword[ebx+edx*8] fld qword[ebx+edx*8]
mov edx, [.l4] mov edx, _l4
fld st0 fld st0
fmul st0, st2 fmul st0, st2
fxch st1 fxch st1
@ -569,192 +631,162 @@ step3:
fmul st4, st0 fmul st4, st0
fmulp st3, st0 ; st : f[l8]*c, f[l8]*s, f[l4]*s, f[l4]*c fmulp st3, st0 ; st : f[l8]*c, f[l8]*s, f[l4]*s, f[l4]*c
fsub st0, st2 ; st : t9, f[l8]*s, f[l4]*s, f[l4]*c fsub st0, st2 ; st : t9, f[l8]*s, f[l4]*s, f[l4]*c
fstp [_t0] fstp _t0
faddp st2, st0 ; st : f[l2]*s, t7 faddp st2, st0 ; st : f[l2]*s, t7
fstp st0 ; st : t7 fstp st0 ; st : t7
fstp [_t7] ; st : <empty> fstp _t7 ; st : <empty>
; t1 = f[l5] - t9; ; t1 = f[l5] - t9;
; t2 = f[l5] + t9; ; t2 = f[l5] + t9;
mov eax, [.l5] mov eax, _l5
fld qword [ebx+eax*8] fld qword [ebx+eax*8]
fld [_t9] fld _t9
fld st0 fld st0
fadd st0, st2 fadd st0, st2
fstp [_t2] fstp _t2
fsubp st1, st0 fsubp st1, st0
fstp [_t1] fstp _t1
; t3 = - t8 - t0; ; t3 = - t8 - t0;
fld [_t8] fld _t8
fadd [_t0] fadd _t0
fchs fchs
fstp [_t3] fstp _t3
; t4 = t5 - t7; ; t4 = t5 - t7;
fld [_t5] fld _t5
fsub [_t7] fsub _t7
fstp [_t4] fstp _t4
; f[l5] = t1 + t4; ; f[l5] = t1 + t4;
fld [_t1] fld _t1
fld [_t4] fld _t4
fld st0 fld st0
fadd st0, st2 fadd st0, st2
fstp qword [ebx+eax*8] fstp qword [ebx+eax*8]
; f[l7] = t1 - t4; ; f[l7] = t1 - t4;
mov eax, [.l7] mov eax, _l7
fsubp st1, st0 fsubp st1, st0
fstp qword [ebx+eax*8] fstp qword [ebx+eax*8]
; f[l6] = t2 + t3; ; f[l6] = t2 + t3;
mov eax, [.l6] mov eax, _l6
fld [_t2] fld _t2
fld [_t3] fld _t3
fld st0 fld st0
fadd st0, st2 fadd st0, st2
fstp qword [ebx+eax*8] fstp qword [ebx+eax*8]
; f[l8] = t2 - t3; ; f[l8] = t2 - t3;
mov eax, [.l8] mov eax, _l8
fsubp st1, st0 fsubp st1, st0
fstp qword [ebx+eax*8] fstp qword [ebx+eax*8]
; t1 = f[l1] + t6; ; t1 = f[l1] + t6;
mov eax, [.l1] mov eax, _l1
fld qword [ebx+eax*8] fld qword [ebx+eax*8]
fld [_t6] fld _t6
fld st0 fld st0
fadd st0, st2 fadd st0, st2
fstp [_t1] fstp _t1
; t2 = f[l1] - t6; ; t2 = f[l1] - t6;
fsubp st1, st0 fsubp st1, st0
fstp [_t2] fstp _t2
; t3 = t8 - t0; ; t3 = t8 - t0;
fld [_t8] fld _t8
fsub [_t0] fsub _t0
fstp [_t3] fstp _t3
; t4 = t5 + t7; ; t4 = t5 + t7;
fld [_t5] fld _t5
fadd [_t7] fadd _t7
fstp [_t4] fstp _t4
; f[l1] = t1 + t4; ; f[l1] = t1 + t4;
mov eax, [.l1] mov eax, _l1
fld [_t1] fld _t1
fld [_t4] fld _t4
fld st0 fld st0
fadd st0, st2 fadd st0, st2
fstp qword [ebx+eax*8] fstp qword [ebx+eax*8]
; f[l3] = t1 - t4; ; f[l3] = t1 - t4;
mov eax, [.l3] mov eax, _l3
fsubp st1, st0 fsubp st1, st0
fstp qword [ebx+eax*8] fstp qword [ebx+eax*8]
; f[l2] = t2 + t3; ; f[l2] = t2 + t3;
mov eax, [.l2] mov eax, _l2
fld [_t2] fld _t2
fld [_t3] fld _t3
fld st0 fld st0
fadd st0, st2 fadd st0, st2
fstp qword [ebx+eax*8] fstp qword [ebx+eax*8]
; f[l4] = t2 - t3; ; f[l4] = t2 - t3;
mov eax, [.l4] mov eax, _l4
fsubp st1, st0 fsubp st1, st0
fstp qword [ebx+eax*8] fstp qword [ebx+eax*8]
; 374 : } ; 374 : }
jmp .next_k jmp .next_k
align 4
.done_k: .done_k:
; 375 : } ; 375 : }
add ebx, [.d6] ; d6 = d5*8 add ebx, _d6 ; d6 = d5*8
cmp ebx, [.end_of_array] cmp ebx, _end_of_array
jb .next_j jb .next_j
; 376 : } ; 376 : }
mov cx, [.step] mov cx, _step
jmp .newstep jmp .newstep
.done: .done:
mov esp, ebp
pop ebp
; 377 : } ; 377 : }
ret ret
align 4
.l1 dd 0
.l2 dd 0
.l3 dd 0
.l4 dd 0
.l5 dd 0
.l6 dd 0
.l7 dd 0
.l8 dd 0
.l9 dd 0
.l0 dd 0
.d1 dd 0
.d2 dd 0
.d3 dd 0
.d4 dd 0
.d5 dd 0
.d6 dd 0
.j5 dd 0
.jj dd 0
.end_of_array dd 0
.step dw 0
align 8
;=========== Step3 ends here =========== ;=========== Step3 ends here ===========
; =================================================================
;================================================================= ;=================================================================
; syscall entry ; parameters:
; ; -- [ebp+12] = N
_f dd ? ; -- [ebp+16] = p
_N dd 1024 ; number of points ; -- [ebp+20] = 4k-aligned data array address
; -- [ebp+24] = 4k-aligned SinCosTable address
; returns:
; -- nothing
; destroys:
; -- all GPRegs
;; ==========================
_a dd ? ; initial data array align 4
x dd 0 ; tranformed (float) data array
_Cosins dd 0
_Sines dd 0
FFT4: FHT_4:
or al, al push ebp
jnz .trans mov ebp, esp
mov cl, Power_of_4
mov eax, 1 mov edx, [ebp+20] ; a
shl eax, cl mov dl, byte[ebp+16]
shl eax, cl
mov [_N], eax
shl eax, 2 ; size of Sine table in bytes
add eax, ebx
mov [_Sines], ebx
mov [_Cosins], eax
cpuid
rdtsc
mov [.time], eax
call MakeSinCosTable
cpuid
rdtsc
sub eax, [.time]
ret
.trans:
mov [x], ebx
mov [_f], ebx
cli ;-----
cpuid
rdtsc
mov [.time], eax
call BitInvert call BitInvert
call step1 push dword[ebp+20] ; a
call step2 push ecx ; N
call step1 ; 4-point transform
cmp cl, 1
jz .done
call step2 ; 16-point transform
cmp byte[ebp+16],1 ; p = 2 ?
jz .done
pop edx ; N
pop ecx ; a
push dword[ebp+24] ; t
push ecx
push dword[ebp+16] ; p
push edx ; N
call step3 call step3
cpuid .done:
rdtsc mov esp, ebp
sti ;---- pop ebp
sub eax, [.time]
ret
.time dd 0 ret

View File

@ -0,0 +1,207 @@
;========================================================================
;= =
;= Fast Hartley Transform routine demo for KolibriOS =
;= =
;= Copyright (C) 2010, Artem Jerdev <kolibri@jerdev.co.uk> =
;= =
;= refer to wiki.kolibrios.org for all details =
;= =
;========================================================================
; ---- executable header ('MENUET01' format) -----------------------------
; The header fields below are emitted at offset 0 of the image (org 0x0),
; so they must stay ahead of every include that emits code or data.
use32
org 0x0
db 'MENUET01' ; 8-byte signature
dd 0x01 ; header version
dd START ; entry point
dd I_END ; size of image (I_END is the last label of this file)
dd 0x100000 ; amount of memory requested for the application
dd 0xbfffc ; initial esp
dd 0x0 , 0x0 ; I_Param , I_Icon (both unused here)
include 'macros.inc'
include 'debug.inc'
; NOTE(review): CreateSinCosTable / DestroySinCosTable called from main are
; not defined in this file -- presumably they come from FHT4i.inc; confirm.
include 'FHT4i.inc'
; Entry point: run the benchmark once, then terminate.
START: ; start of execution
call main
mov eax,-1 ; system function -1: close this program
int 0x40
;=============================================================
; Func: test-signal generator (a decaying cosine)
;
;       ff = 500 * 2^(-t) * cos(2.5*t),   t = |x| / 16
;       x  = i         for i <  N/2
;       x  = i - N     for i >= N/2     (upper half mirrors the lower one)
;
; In:   [ii]  = i, sample index
;       [_in] = N, number of points
; Out:  [ff]  = result, stored as a qword float
; Uses: eax, ebx, flags
;       [temp], [t_div] as integer scratch
;       up to 4 FPU registers; the FPU stack depth is the same on exit
;
; 2^(-t) is built the standard x87 way: split -t into a rounded integer n
; and a remainder r (|r| < 1), take 2^r from f2xm1 (whose argument must lie
; within [-1, +1]) and scale it by 2^n with fscale.
;------------
Func:
; x = (i < N/2) ? i : i - N;
        mov     eax, [ii]
        mov     ebx, [_in]
        shr     ebx, 1                  ; ebx = N/2
        cmp     eax, ebx
        jb      .index_correct
        sub     eax, [_in]              ; upper half -> negative x
.index_correct:
        mov     [temp], eax
        mov     dword [t_div], 16       ; divisor for t = |x| / 16
        fild    [temp]                  ; st : x
        fabs                            ; st : |x|
exp_ok:                                 ; (label kept from the original listing)
        fidiv   [t_div]                 ; st : t
; e = 2^(-t)
        fld     st0                     ; st : t, t
        fchs                            ; st : -t, t
        fld     st0                     ; st : -t, -t, t
        frndint                         ; st : n, -t, t
        fsub    st1, st0                ; st : n, r, t        (r = -t - n)
        fxch    st1                     ; st : r, n, t
        f2xm1                           ; st : 2^r - 1, n, t
        fld1                            ; st : 1, 2^r - 1, n, t
        faddp   st1, st0                ; st : 2^r, n, t
        fscale                          ; st : 2^(r+n) = 2^(-t), n, t
        fstp    st1                     ; st : 2^(-t), t
; f = 500 * e * cos(2.5*t)
        fmul    [f500]                  ; st : 500*2^(-t), t
        fxch    st1                     ; st : t, 500*2^(-t)
        fmul    [f2_5]                  ; st : 2.5*t, 500*2^(-t)
        fcos                            ; st : cos(2.5*t), 500*2^(-t)
        fmulp   st1, st0                ; st : f
        fstp    [ff]                    ; st : <empty>
        ret
;---------------------------------------------------------
; FillData: test data filler
;
;       for (i = 0; i < N; i++)  a[i] = Func(i);
;
; In:   edx   = address of the destination array (N qwords)
;       [_in] = N, number of points (0 is allowed: nothing is written)
; Out:  array filled; ecx = N
; Uses: eax, ebx (both through Func), ecx, flags; [ii], [ff]
;       relies on Func leaving ecx and edx untouched
FillData:
        xor     ecx, ecx                ; ecx = i
.next_point:
        cmp     ecx, [_in]              ; tested at the top so that N = 0 is safe
        jae     .filled
        mov     [ii], ecx               ; argument for Func
        call    Func
        fld     [ff]                    ; Func's result ...
        fstp    qword [edx+ecx*8]       ; ... goes to a[i]
        inc     ecx
        jmp     .next_point
.filled:
        ret
;====================================================================
; main
;====================================================================
; Benchmark parameters, filled in by main before the transform is invoked.
align 4
_ia dd 0 ; address of the data array (result of sysfn 68.12)
_ii dd 0 ; not referenced in the code shown in this file
_ip dd 0 ; p, the power of 4 (main writes only the low byte)
_in dd 0 ; N = 4^p, number of points
_it dd 0 ; value returned in edx by CreateSinCosTable
;-----------------
; main: one timed run of the kernel-side Fast Hartley Transform
;
;   1. initialise the heap and the FPU
;   2. choose p (power of 4), derive N = 4^p
;   3. build the sine/cosine table, allocate and fill the data array
;   4. pass (N, p, array, table) on the stack to the kernel through
;      SYSCALL with eax = 0, bracketed by serialised rdtsc reads
;   5. print the elapsed TSC ticks (low dword) on the debug board
;   6. release the table and the array
;
; In:   nothing
; Out:  nothing; returns early without running the test if the heap cannot
;       be initialised or the data array cannot be allocated
; Uses: all general-purpose registers, flags, FPU state
main:
        mov     eax, 68
        mov     ebx, 11
        int     0x40                    ; sysfn 68.11: initialise the heap
        test    eax, eax
        jnz     .heap_ok
        ret                             ; no heap -> nothing to run or free
.heap_ok:
        fninit

        mov     cl, 2                   ; power of 4
        mov     byte[_ip], cl
        mov     eax, 1
        shl     eax, cl
        shl     eax, cl                 ; eax = 4^p
        mov     [_in], eax

        mov     dl, cl
        call    CreateSinCosTable
        mov     [_it], edx

        mov     ecx, [_in]
        shl     ecx, 3                  ; N qwords
        mov     ebx, 12
        mov     eax, 68
        int     0x40                    ; sysfn 68.12: allocate the data array
        test    eax, eax                ; eax = 0 is treated as "allocation failed"
        jnz     .alloc_ok
        mov     edx, [_it]              ; give the table back and bail out
        call    DestroySinCosTable
        ret
.alloc_ok:
        mov     [_ia], eax
        mov     edx, eax
        call    FillData

        xor     eax, eax                ; defined cpuid leaf; cpuid serialises
        cpuid
        rdtsc
        mov     [t_0], eax

        push    [_it]
        push    [_ia]
        push    [_ip]
        push    [_in]
;       call    FHT_4
        xor     eax, eax                ; service number 0 for the syscall entry
        syscall
        add     esp, 16                 ; drop the four arguments

        xor     eax, eax
        cpuid
        rdtsc
        mov     [t_1], eax
        sub     eax, [t_0]              ; elapsed ticks, low 32 bits only

        debug_print_hex eax
        print '<- fht time'

        mov     edx, [_it]
        call    DestroySinCosTable

        mov     ecx, [_ia]
        mov     ebx, 13
        mov     eax, 68
        int     0x40                    ; sysfn 68.13: free the data array
        ret
; ========================================================
; static data
;----------------
; Constants and scratch storage for Func / FillData / main.
; I_END at the very end marks the image size stored in the header.
align 8
;f18 dq 0x4032000000000000
; f256, f14_2, fcontrol and title are not referenced in the code shown here.
f256 dq 256.01f
f14_2 dq 14.2f
f500 dq 0x407f400000000000 ; = 500.0 (amplitude used by Func)
f2_5 dq 0x4004000000000000 ; = 2.5   (cosine argument multiplier in Func)
; scratch storage for Func
tt dq ?
tv93 dq ?
t_div dd ?
t_mod dd ?
temp dd ?
ff dq ? ; result of Func; FillData loads it as a qword float
ii dd ? ; argument (int) = array index
; low dwords of the time-stamp counter, written by main
t_1 dd ? ; after the transform
t_0 dd ? ; before the transform
fcontrol dw 0x0037f
title db ' Fast Hartley Transform Test - A.Jerdev 2010'
I_END: