an experimental kernel with a mad syscall and FHT inside

git-svn-id: svn://kolibrios.org@1641 a494cfbc-eb01-0410-851d-a64ba20cac60
This commit is contained in:
Artem Jerdev (art_zh) 2010-10-03 23:11:42 +00:00
parent abbc09c677
commit 112a3665cb
5 changed files with 661 additions and 479 deletions

View File

@ -76,9 +76,6 @@ pci_ext_config:
shl eax, 8 shl eax, 8
test eax, 0x000F0000 ; MMIO Base must be bus0-aligned test eax, 0x000F0000 ; MMIO Base must be bus0-aligned
jnz .no_pcie_cfg jnz .no_pcie_cfg
; -- it looks like a true PCIe config space;
ret ; <<<<<<<<<<< OK >>>>>>>>>>> ret ; <<<<<<<<<<< OK >>>>>>>>>>>
.no_pcie_cfg: .no_pcie_cfg:
@ -92,6 +89,7 @@ pci_ext_config:
.pcie_failed: .pcie_failed:
mov esi, boot_pcie_fail mov esi, boot_pcie_fail
call boot_log call boot_log
xor eax, eax
ret ; <<<<<<<<< FAILURE >>>>>>>>> ret ; <<<<<<<<< FAILURE >>>>>>>>>

View File

@ -1,4 +1,4 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ;; ;; ;;
;; Copyright (C) KolibriOS team 2004-2007. All rights reserved. ;; ;; Copyright (C) KolibriOS team 2004-2007. All rights reserved. ;;
;; Distributed under terms of the GNU General Public License ;; ;; Distributed under terms of the GNU General Public License ;;
@ -10,47 +10,16 @@ $Revision$
; Old style system call converter ; Old style system call converter
align 16 align 16
cross_order: cross_order:
; load all registers in crossed order ; load all registers in crossed order
mov eax, ebx mov eax, ebx
mov ebx, ecx mov ebx, ecx
mov ecx, edx mov ecx, edx
mov edx, esi mov edx, esi
mov esi, edi mov esi, edi
movzx edi, byte[esp+28 + 4] movzx edi, byte[esp+28 + 4]
sub edi, 53 ; all zeroes before sub edi, 53 ; all zeroes before
call dword [servetable+edi*4] call dword [servetable+edi*4]
ret ret
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ;;
;; SYSENTER ENTRY ;;
;; (not used on AMD systems) ;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;align 32
;sysenter_entry:
; ; Íàñòðàèâàåì ñòåê
; mov esp, [ss:tss._esp0]
; sti
; push ebp ; save app esp + 4
; mov ebp, [ebp] ; ebp - original ebp
; ;------------------
; pushad
; cld
;
; movzx eax, al
; call dword [servetable2 + eax * 4]
; popad
; ;------------------
; xchg ecx, [ss:esp] ; â âåðøèí ñòåêà - app ecx, ecx - app esp + 4
; sub ecx, 4
; xchg edx, [ecx] ; edx - return point, & save original edx
; push edx
; mov edx, [ss:esp + 4]
; mov [ecx + 4], edx ; save original ecx
; pop edx
; sysexit
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ;; ;; ;;
@ -60,27 +29,27 @@ cross_order:
align 16 align 16
i40: i40:
pushad pushad
cld cld
and eax, 0x07F and eax, 0x07F
call dword [servetable2 + eax * 4] call dword [servetable2 + eax * 4]
popad popad
iretd iretd
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ;; ;; ;;
;; SYSCALL ENTRY ;; ;; SYSCALL ENTRY -- NEW !!! ;;
;; ;; ;; ;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
align 32 align 32
syscall_entry: syscall_entry:
; push ecx ; sti
sti push ecx
and eax, 3 and eax, 3
call dword [servetable3 + eax * 4] call dword [servetable3 + eax * 4]
pop ecx
; pop ecx sysret
sysret
iglobal iglobal
;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@ -89,18 +58,17 @@ iglobal
align 4 align 4
servetable: servetable:
dd socket ; 53-Socket interface
dd socket ; 53-Socket interface
dd 0 dd 0
dd 0 dd 0
dd 0 dd 0
dd 0 dd 0
dd file_system ; 58-Common file system interface dd file_system ; 58-Common file system interface
dd 0 dd 0
dd 0 dd 0
dd 0 dd 0
dd sys_pci ; 62-PCI functions dd sys_pci ; 62-PCI functions
dd sys_msg_board ; 63-System message board dd sys_msg_board ; 63-System message board
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; NEW SYSTEM FUNCTIONS TABLE ;; ;; NEW SYSTEM FUNCTIONS TABLE ;;
@ -108,88 +76,88 @@ iglobal
align 4 align 4
servetable2: servetable2:
dd syscall_draw_window ; 0-DrawWindow dd syscall_draw_window ; 0-DrawWindow
dd syscall_setpixel ; 1-SetPixel dd syscall_setpixel ; 1-SetPixel
dd sys_getkey ; 2-GetKey dd sys_getkey ; 2-GetKey
dd sys_clock ; 3-GetTime dd sys_clock ; 3-GetTime
dd syscall_writetext ; 4-WriteText dd syscall_writetext ; 4-WriteText
dd delay_hs ; 5-DelayHs dd delay_hs ; 5-DelayHs
dd syscall_openramdiskfile ; 6-OpenRamdiskFile dd syscall_openramdiskfile ; 6-OpenRamdiskFile
dd syscall_putimage ; 7-PutImage dd syscall_putimage ; 7-PutImage
dd syscall_button ; 8-DefineButton dd syscall_button ; 8-DefineButton
dd sys_cpuusage ; 9-GetProcessInfo dd sys_cpuusage ; 9-GetProcessInfo
dd sys_waitforevent ; 10-WaitForEvent dd sys_waitforevent ; 10-WaitForEvent
dd sys_getevent ; 11-CheckForEvent dd sys_getevent ; 11-CheckForEvent
dd sys_redrawstat ; 12-BeginDraw and EndDraw dd sys_redrawstat ; 12-BeginDraw and EndDraw
dd syscall_drawrect ; 13-DrawRect dd syscall_drawrect ; 13-DrawRect
dd syscall_getscreensize ; 14-GetScreenSize dd syscall_getscreensize ; 14-GetScreenSize
dd sys_background ; 15-bgr dd sys_background ; 15-bgr
dd sys_cachetodiskette ; 16-FlushFloppyCache dd sys_cachetodiskette ; 16-FlushFloppyCache
dd sys_getbutton ; 17-GetButton dd sys_getbutton ; 17-GetButton
dd sys_system ; 18-System Services dd sys_system ; 18-System Services
dd paleholder ; 19-reserved dd paleholder ; 19-reserved
dd sys_midi ; 20-ResetMidi and OutputMidi dd sys_midi ; 20-ResetMidi and OutputMidi
dd sys_setup ; 21-SetMidiBase,SetKeymap,SetShiftKeymap,. dd sys_setup ; 21-SetMidiBase,SetKeymap,SetShiftKeymap,.
dd sys_settime ; 22-setting date,time,clock and alarm-clock dd sys_settime ; 22-setting date,time,clock and alarm-clock
dd sys_wait_event_timeout ; 23-TimeOutWaitForEvent dd sys_wait_event_timeout ; 23-TimeOutWaitForEvent
dd syscall_cdaudio ; 24-PlayCdTrack,StopCd and GetCdPlaylist dd syscall_cdaudio ; 24-PlayCdTrack,StopCd and GetCdPlaylist
dd undefined_syscall ; 25-reserved dd undefined_syscall ; 25-reserved
dd sys_getsetup ; 26-GetMidiBase,GetKeymap,GetShiftKeymap,. dd sys_getsetup ; 26-GetMidiBase,GetKeymap,GetShiftKeymap,.
dd undefined_syscall ; 27-reserved dd undefined_syscall ; 27-reserved
dd undefined_syscall ; 28-reserved dd undefined_syscall ; 28-reserved
dd sys_date ; 29-GetDate dd sys_date ; 29-GetDate
dd sys_current_directory ; 30-Get/SetCurrentDirectory dd sys_current_directory ; 30-Get/SetCurrentDirectory
dd undefined_syscall ; 31-reserved dd undefined_syscall ; 31-reserved
dd undefined_syscall ; 32-reserved dd undefined_syscall ; 32-reserved
dd undefined_syscall ; 33-reserved dd undefined_syscall ; 33-reserved
dd undefined_syscall ; 34-reserved dd undefined_syscall ; 34-reserved
dd syscall_getpixel ; 35-GetPixel dd syscall_getpixel ; 35-GetPixel
dd syscall_getarea ; 36-GetArea dd syscall_getarea ; 36-GetArea
dd readmousepos ; 37-GetMousePosition_ScreenRelative,. dd readmousepos ; 37-GetMousePosition_ScreenRelative,.
dd syscall_drawline ; 38-DrawLine dd syscall_drawline ; 38-DrawLine
dd sys_getbackground ; 39-GetBackgroundSize,ReadBgrData,. dd sys_getbackground ; 39-GetBackgroundSize,ReadBgrData,.
dd set_app_param ; 40-WantEvents dd set_app_param ; 40-WantEvents
dd syscall_getirqowner ; 41-GetIrqOwner dd syscall_getirqowner ; 41-GetIrqOwner
dd get_irq_data ; 42-ReadIrqData dd get_irq_data ; 42-ReadIrqData
dd sys_outport ; 43-SendDeviceData dd sys_outport ; 43-SendDeviceData
dd sys_programirq ; 44-ProgramIrqs dd sys_programirq ; 44-ProgramIrqs
dd reserve_free_irq ; 45-ReserveIrq and FreeIrq dd reserve_free_irq ; 45-ReserveIrq and FreeIrq
dd syscall_reserveportarea ; 46-ReservePortArea and FreePortArea dd syscall_reserveportarea ; 46-ReservePortArea and FreePortArea
dd display_number ; 47-WriteNum dd display_number ; 47-WriteNum
dd syscall_display_settings ; 48-SetRedrawType and SetButtonType dd syscall_display_settings ; 48-SetRedrawType and SetButtonType
dd sys_apm ; 49-Advanced Power Management (APM) dd sys_apm ; 49-Advanced Power Management (APM)
dd syscall_set_window_shape ; 50-Window shape & scale dd syscall_set_window_shape ; 50-Window shape & scale
dd syscall_threads ; 51-Threads dd syscall_threads ; 51-Threads
dd stack_driver_stat ; 52-Stack driver status dd stack_driver_stat ; 52-Stack driver status
dd cross_order ; 53-Socket interface dd cross_order ; 53-Socket interface
dd undefined_syscall ; 54-reserved dd undefined_syscall ; 54-reserved
dd sound_interface ; 55-Sound interface dd sound_interface ; 55-Sound interface
dd undefined_syscall ; 56-reserved dd undefined_syscall ; 56-reserved
dd sys_pcibios ; 57-PCI BIOS32 dd sys_pcibios ; 57-PCI BIOS32
dd cross_order ; 58-Common file system interface dd cross_order ; 58-Common file system interface
dd undefined_syscall ; 59-reserved dd undefined_syscall ; 59-reserved
dd sys_IPC ; 60-Inter Process Communication dd sys_IPC ; 60-Inter Process Communication
dd sys_gs ; 61-Direct graphics access dd sys_gs ; 61-Direct graphics access
dd cross_order ; 62-PCI functions dd cross_order ; 62-PCI functions
dd cross_order ; 63-System message board dd cross_order ; 63-System message board
dd sys_resize_app_memory ; 64-Resize application memory usage dd sys_resize_app_memory ; 64-Resize application memory usage
dd sys_putimage_palette ; 65-PutImagePalette dd sys_putimage_palette ; 65-PutImagePalette
dd sys_process_def ; 66-Process definitions - keyboard dd sys_process_def ; 66-Process definitions - keyboard
dd syscall_move_window ; 67-Window move or resize dd syscall_move_window ; 67-Window move or resize
dd f68 ; 68-Some internal services dd f68 ; 68-Some internal services
dd sys_debug_services ; 69-Debug dd sys_debug_services ; 69-Debug
dd file_system_lfn ; 70-Common file system interface, version 2 dd file_system_lfn ; 70-Common file system interface, version 2
dd syscall_window_settings ; 71-Window settings dd syscall_window_settings ; 71-Window settings
dd sys_sendwindowmsg ; 72-Send window message dd sys_sendwindowmsg ; 72-Send window message
times 127 - ( ($-servetable2) /4 ) dd undefined_syscall times 127 - ( ($-servetable2) /4 ) dd undefined_syscall
dd sys_end ; -1-end application dd sys_end ; -1-end application
align 4 align 4
servetable3: servetable3:
dd FFT4 ; 0 dd FHT_4 ; 0
dd FFT4 ; 1 dd FHT_4 ; 1
dd paleholder ; 2 dd paleholder ; 2
dd sys_end ; last dd sys_end ; last
endg endg

View File

@ -16,30 +16,6 @@
$Revision$ $Revision$
;struc db [a] { common . db a
; if ~used .
; display 'not used db: ',`.,13,10
; end if }
;struc dw [a] { common . dw a
; if ~used .
; display 'not used dw: ',`.,13,10
; end if }
;struc dd [a] { common . dd a
; if ~used .
; display 'not used dd: ',`.,13,10
; end if }
;struc dp [a] { common . dp a
; if ~used .
; display 'not used dp: ',`.,13,10
; end if }
;struc dq [a] { common . dq a
; if ~used .
; display 'not used dq: ',`.,13,10
; end if }
;struc dt [a] { common . dt a
; if ~used .
; display 'not used dt: ',`.,13,10
; end if }
struc POINT { struc POINT {
.x dd ? .x dd ?
@ -52,7 +28,7 @@ end virtual
struc RECT { struc RECT {
.left dd ? .left dd ?
.top dd ? .top dd ?
.right dd ? .right dd ?
.bottom dd ? .bottom dd ?
.sizeof: .sizeof:
@ -63,7 +39,7 @@ end virtual
struc BOX { struc BOX {
.left dd ? .left dd ?
.top dd ? .top dd ?
.width dd ? .width dd ?
.height dd ? .height dd ?
.sizeof: .sizeof:
@ -75,17 +51,17 @@ end virtual
struc DISPMODE { struc DISPMODE {
.width rw 1 .width rw 1
.height rw 1 .height rw 1
.bpp rw 1 .bpp rw 1
.freq rw 1 .freq rw 1
} }
; constants definition ; constants definition
WSTATE_NORMAL = 00000000b WSTATE_NORMAL = 00000000b
WSTATE_MAXIMIZED = 00000001b WSTATE_MAXIMIZED = 00000001b
WSTATE_MINIMIZED = 00000010b WSTATE_MINIMIZED = 00000010b
WSTATE_ROLLEDUP = 00000100b WSTATE_ROLLEDUP = 00000100b
WSTATE_REDRAW = 00000001b WSTATE_REDRAW = 00000001b
WSTATE_WNDDRAWN = 00000010b WSTATE_WNDDRAWN = 00000010b
WSTYLE_HASCAPTION = 00010000b WSTYLE_HASCAPTION = 00010000b
@ -94,13 +70,13 @@ WSTYLE_CLIENTRELATIVE = 00100000b
struc TASKDATA struc TASKDATA
{ {
.event_mask dd ? .event_mask dd ?
.pid dd ? .pid dd ?
dw ? dw ?
.state db ? .state db ?
db ? db ?
dw ? dw ?
.wnd_number db ? .wnd_number db ?
db ? db ?
.mem_start dd ? .mem_start dd ?
.counter_sum dd ? .counter_sum dd ?
.counter_add dd ? .counter_add dd ?
@ -110,24 +86,24 @@ virtual at 0
TASKDATA TASKDATA TASKDATA TASKDATA
end virtual end virtual
TSTATE_RUNNING = 0 TSTATE_RUNNING = 0
TSTATE_RUN_SUSPENDED = 1 TSTATE_RUN_SUSPENDED = 1
TSTATE_WAIT_SUSPENDED = 2 TSTATE_WAIT_SUSPENDED = 2
TSTATE_ZOMBIE = 3 TSTATE_ZOMBIE = 3
TSTATE_TERMINATING = 4 TSTATE_TERMINATING = 4
TSTATE_WAITING = 5 TSTATE_WAITING = 5
TSTATE_FREE = 9 TSTATE_FREE = 9
; structures definition ; structures definition
struc WDATA { struc WDATA {
.box BOX .box BOX
.cl_workarea dd ? .cl_workarea dd ?
.cl_titlebar dd ? .cl_titlebar dd ?
.cl_frames dd ? .cl_frames dd ?
.reserved db ? .reserved db ?
.fl_wstate db ? .fl_wstate db ?
.fl_wdrawn db ? .fl_wdrawn db ?
.fl_redraw db ? .fl_redraw db ?
.sizeof: .sizeof:
} }
virtual at 0 virtual at 0
@ -137,47 +113,47 @@ label WDATA.fl_wstyle byte at WDATA.cl_workarea + 3
struc APPDATA struc APPDATA
{ {
.app_name db 11 dup(?) .app_name db 11 dup(?)
db 5 dup(?) db 5 dup(?)
.fpu_state dd ? ;+16 .fpu_state dd ? ;+16
.ev_count_ dd ? ;unused ;+20 .ev_count_ dd ? ;unused ;+20
.exc_handler dd ? ;+24 .exc_handler dd ? ;+24
.except_mask dd ? ;+28 .except_mask dd ? ;+28
.pl0_stack dd ? ;unused ;+32 .pl0_stack dd ? ;unused ;+32
.heap_base dd ? ;+36 .heap_base dd ? ;+36
.heap_top dd ? ;+40 .heap_top dd ? ;+40
.cursor dd ? ;+44 .cursor dd ? ;+44
.fd_ev dd ? ;+48 .fd_ev dd ? ;+48
.bk_ev dd ? ;+52 .bk_ev dd ? ;+52
.fd_obj dd ? ;+56 .fd_obj dd ? ;+56
.bk_obj dd ? ;+60 .bk_obj dd ? ;+60
.saved_esp dd ? ;+64 .saved_esp dd ? ;+64
.io_map rd 2 ;+68 .io_map rd 2 ;+68
.dbg_state dd ? ;+76 .dbg_state dd ? ;+76
.cur_dir dd ? ;+80 .cur_dir dd ? ;+80
.wait_timeout dd ? ;+84 .wait_timeout dd ? ;+84
.saved_esp0 dd ? ;+88 .saved_esp0 dd ? ;+88
.wait_begin dd ? ;+92 +++ .wait_begin dd ? ;+92 +++
.wait_test dd ? ;+96 +++ .wait_test dd ? ;+96 +++
.wait_param dd ? ;+100 +++ .wait_param dd ? ;+100 +++
.tls_base dd ? ;+104 .tls_base dd ? ;+104
.dlls_list_ptr dd ? ;+108 .dlls_list_ptr dd ? ;+108
db 16 dup(?) ;+112 db 16 dup(?) ;+112
.wnd_shape dd ? ;+128 .wnd_shape dd ? ;+128
.wnd_shape_scale dd ? ;+132 .wnd_shape_scale dd ? ;+132
dd ? ;+136 dd ? ;+136
.mem_size dd ? ;+140 .mem_size dd ? ;+140
.saved_box BOX .saved_box BOX
.ipc_start dd ? .ipc_start dd ?
.ipc_size dd ? .ipc_size dd ?
.event_mask dd ? .event_mask dd ?
.debugger_slot dd ? .debugger_slot dd ?
dd ? dd ?
.keyboard_mode db ? .keyboard_mode db ?
db 3 dup(?) db 3 dup(?)
.dir_table dd ? .dir_table dd ?
.dbg_event_mem dd ? .dbg_event_mem dd ?
.dbg_regs: .dbg_regs:
.dbg_regs.dr0 dd ? .dbg_regs.dr0 dd ?
@ -185,7 +161,7 @@ struc APPDATA
.dbg_regs.dr2 dd ? .dbg_regs.dr2 dd ?
.dbg_regs.dr3 dd ? .dbg_regs.dr3 dd ?
.dbg_regs.dr7 dd ? .dbg_regs.dr7 dd ?
.wnd_caption dd ? .wnd_caption dd ?
.wnd_clientbox BOX .wnd_clientbox BOX
} }
virtual at 0 virtual at 0
@ -211,7 +187,7 @@ include "core/sync.inc" ; macros for synhronization objects
include "core/sys32.inc" ; process management include "core/sys32.inc" ; process management
include "core/sched.inc" ; process scheduling include "core/sched.inc" ; process scheduling
include "core/syscall.inc" ; system call include "core/syscall.inc" ; system call
include "core/fpu.inc" ; all fpu/sse support include "core/fpu.inc" ; all fpu/sse support
include "core/memory.inc" include "core/memory.inc"
include "core/heap.inc" ; kernel and app heap include "core/heap.inc" ; kernel and app heap
include "core/malloc.inc" ; small kernel heap include "core/malloc.inc" ; small kernel heap
@ -220,7 +196,7 @@ include "core/dll.inc"
include "core/peload.inc" ; include "core/peload.inc" ;
include "core/exports.inc" include "core/exports.inc"
include "core/string.inc" include "core/string.inc"
include "core/v86.inc" ; virtual-8086 manager include "core/v86.inc" ; virtual-8086 manager
; GUI stuff ; GUI stuff
include "gui/window.inc" include "gui/window.inc"
@ -232,19 +208,19 @@ include "gui/button.inc"
; file system ; file system
include "fs/fs.inc" ; syscall include "fs/fs.inc" ; syscall
include "fs/fat32.inc" ; read / write for fat32 filesystem include "fs/fat32.inc" ; read / write for fat32 filesystem
include "fs/ntfs.inc" ; read / write for ntfs filesystem include "fs/ntfs.inc" ; read / write for ntfs filesystem
include "fs/fat12.inc" ; read / write for fat12 filesystem include "fs/fat12.inc" ; read / write for fat12 filesystem
include "blkdev/rd.inc" ; ramdisk read /write include "blkdev/rd.inc" ; ramdisk read /write
include "fs/fs_lfn.inc" ; syscall, version 2 include "fs/fs_lfn.inc" ; syscall, version 2
include "fs/iso9660.inc" ; read for iso9660 filesystem CD include "fs/iso9660.inc" ; read for iso9660 filesystem CD
include "fs/ext2.inc" ; read / write for ext2 filesystem include "fs/ext2.inc" ; read / write for ext2 filesystem
; sound ; sound
include "sound/playnote.inc" ; player Note for Speaker PC include "sound/playnote.inc" ; player Note for Speaker PC
include "sound/FFT.inc" ; fast Fourier transform routines include "sound/FHT.inc" ; fast Fourier transform routines
; display ; display
@ -311,3 +287,4 @@ include "core/ext_lib.inc"
; list of external functions ; list of external functions
include "imports.inc" include "imports.inc"

View File

@ -5,62 +5,51 @@
; free KolibriOS version - not to be ported to other OSes ; free KolibriOS version - not to be ported to other OSes
; ========================================================== ; ==========================================================
Power_of_4 equ 5
NumPoints equ 1024
N_2 equ NumPoints / 2
N_4 equ NumPoints / 4
;=================================================================
; global constants ; global constants
align 8 align 8
_root dq 1.41421356237309504880169 ; = sqrt(2) fht_r dq 1.41421356237309504880169 ; = sqrt(2)
_root2 dq 0.70710678118654752440084 ; = sqrt(2)/2 fht_r2 dq 0.70710678118654752440084 ; = sqrt(2)/2
_c1 dq 0.92387953251128675612818 ; = cos(pi/8) fht_c1 dq 0.92387953251128675612818 ; = cos(pi/8)
_s1 dq 0.38268343236508977172846 ; = sin(pi/8) fht_s1 dq 0.38268343236508977172846 ; = sin(pi/8)
_dx dq 0.00613592315154296875 ; pi/512
;[_CosTable] dd 0 ; N_2 elements
;[_SinTable] dd 0 ; N_2 elements
;=================================================================
; parameter1:
; -- reg dl (bits[3:0]) = Power_of_4
; -- reg edx && (-16) = 4k-aligned data array address
; returns:
; -- edx = Power_of_4
; -- ecx = N
; destroys:
; -- eax, ebx, ecx, edx, esi
;; ========================== ;; ==========================
align 4 align 4
MakeSinCosTable:
mov ebx, [_Sines]
mov ecx, [_Cosins]
xor eax, eax
fld [_dx] ; st : dx
fldz ; st : 0, dx
.loop:
fld st0 ; st : x, x, dx
FSINCOS ; st : cos, sin, x, dx
fstp qword [ecx+eax*8] ; st : sin, x, dx
fstp qword [ebx+eax*8] ; st : x, dx
fadd st0, st1 ; st : x+dx, dx
inc eax
cmp eax, N_2
jne .loop
fstp st0 ; st : dx
fstp st0 ; st : <empty>
ret
; ================================================================
align 4
BitInvert: BitInvert:
mov esi, [x] ; array of qwords mov esi, edx
xor ecx, ecx ; index term and esi, 0xFFFFFFF0
and edx, 0x0F
push edx
mov cl, dl
xor eax, eax
inc eax
shl eax, cl
shl eax, cl
push eax
xor ecx, ecx ; index term
align 4
.newterm: .newterm:
inc ecx inc ecx
cmp ecx, NumPoints cmp ecx, [esp] ; N
jge .done jge .done
xor eax, eax xor eax, eax
mov edx, ecx mov edx, ecx
xor bl, bl xor bl, bl
align 4
.do_invert: .do_invert:
inc bl inc bl
cmp bl, Power_of_4 cmp bl, byte[esp+4] ; Power_of_4
jg .switch jg .switch
mov bh, dl mov bh, dl
@ -69,6 +58,7 @@ BitInvert:
or al, bh or al, bh
shr edx, 2 shr edx, 2
jmp .do_invert jmp .do_invert
align 8
.switch: .switch:
cmp eax, ecx cmp eax, ecx
@ -80,17 +70,32 @@ BitInvert:
fstp qword [esi+ecx*8] fstp qword [esi+ecx*8]
jmp .newterm jmp .newterm
align 4
.done: .done:
pop ecx
pop edx
ret ret
;================================================================= ;=================================================================
;=================================================================
; stdcall parameters:
; -- [esp+4] = N
; -- [esp+8] = 4k-aligned data array address
; returns:
; -- nothing
; destroys:
; -- ebx, esi
;; ==========================
align 4 align 4
step1: step1:
mov esi, [x] mov ebx, [esp+8]
mov ebx, esi mov esi, [esp+4]
add esi, NumPoints*8 shl esi, 3
add esi, ebx
align 4
.loop: .loop:
fld qword[ebx] fld qword[ebx]
fld qword[ebx+8] fld qword[ebx+8]
@ -119,19 +124,65 @@ step1:
add ebx, 32 add ebx, 32
cmp ebx, esi cmp ebx, esi
jnz .loop jnz .loop
ret
ret ; local stack definitions
;
;===========================================================================
step2: ; Step2
;=========================================================================== ;===========================================================================
_t0 equ dword [esp]
_t1 equ dword[esp+4]
_t2 equ dword[esp+8]
_t3 equ dword[esp+12]
_t4 equ dword[esp+16]
_t5 equ dword[esp+20]
_t6 equ dword[esp+24]
_t7 equ dword[esp+28]
_t8 equ dword[esp+32]
_t9 equ dword[esp+36]
mov eax, [_f] _l1 equ dword[esp+40]
mov ebx, eax _l2 equ dword[esp+44]
add eax, NumPoints*8 _l3 equ dword[esp+48]
_l4 equ dword[esp+52]
_l5 equ dword[esp+56]
_l6 equ dword[esp+60]
_l7 equ dword[esp+64]
_l8 equ dword[esp+68]
_l9 equ dword[esp+72]
_l0 equ dword[esp+76]
_d1 equ dword[esp+80]
_d2 equ dword[esp+84]
_d3 equ dword[esp+88]
_d4 equ dword[esp+92]
_d5 equ dword[esp+96]
_d6 equ dword[esp+100]
_j5 equ dword[esp+104]
_jj equ dword[esp+108]
_end_of_array equ dword[esp+112]
_step equ word [esp+116]
;=================================================================
; cdecl parameters:
; -- [ebp+8] = N
; -- [ebp+12] = 4k-aligned data array address
; returns:
; -- nothing
; destroys:
; -- eax, ebx
; locals:
; -- 10 stack-located dwords (_t0 ... _t9)
;; ==========================
align 4
step2:
push ebp
mov ebp, esp
sub esp, 40
mov ebx, [ebp+12]
mov eax, [ebp+ 8]
shl eax, 3
add eax, ebx
align 4
.loop_i: .loop_i:
; -- quad subelements +0, +4, +8 and +12 (simpliest operations) ; -- quad subelements +0, +4, +8 and +12 (simpliest operations)
@ -163,7 +214,7 @@ step2: ; Step2
; -- even subelements +2, +6, +10 and +14 (2 multiplications needed) ; -- even subelements +2, +6, +10 and +14 (2 multiplications needed)
fld qword[ebx+8*2] fld qword[ebx+8*2]
fld qword[ebx+8*6] fld qword[ebx+8*6]
fld [_root] fld [fht_r]
fmul st1, st0 ; st : r, t2, t1 fmul st1, st0 ; st : r, t2, t1
fld qword[ebx+8*10] fld qword[ebx+8*10]
fxch st1 ; st : r, t3, t2, t1 fxch st1 ; st : r, t3, t2, t1
@ -194,20 +245,20 @@ step2: ; Step2
fsub st0, st1 fsub st0, st1
fxch st1 fxch st1
faddp st2, st0 ; st : (f[l3]-f[l7]), (f[l3]+f[l7]) faddp st2, st0 ; st : (f[l3]-f[l7]), (f[l3]+f[l7])
fld [_root2] fld [fht_r2]
fmul st2, st0 fmul st2, st0
fmulp st1, st0 ; st : t9, t6 fmulp st1, st0 ; st : t9, t6
fld qword[ebx+8*3] fld qword[ebx+8*3]
fld st0 fld st0
fadd st0, st2 ; st : t1, f[l5], t9, t6 fadd st0, st2 ; st : t1, f[l5], t9, t6
fstp [_t1] fstp _t1
fsub st0, st1 fsub st0, st1
fstp [_t2] fstp _t2
fstp [_t9] ; (t9 never used) fstp _t9 ; (t9 never used)
fstp [_t6] ; st : <empty> fstp _t6 ; st : <empty>
fld [_c1] fld [fht_c1]
fld [_s1] fld [fht_s1]
fld qword[ebx+8*5] fld qword[ebx+8*5]
fld qword[ebx+8*7] fld qword[ebx+8*7]
fld st3 ; st: c1, f[l6], f[l2], s1, c1 fld st3 ; st: c1, f[l6], f[l2], s1, c1
@ -215,13 +266,13 @@ step2: ; Step2
fld st1 ; st: f_6, f_2*c, f_6, f_2, s, c fld st1 ; st: f_6, f_2*c, f_6, f_2, s, c
fmul st0, st4 ; st: f_6*s, f_2*c, f_6, f_2, s, c fmul st0, st4 ; st: f_6*s, f_2*c, f_6, f_2, s, c
faddp st1, st0 ; st: t5, f_6, f_2, s, c faddp st1, st0 ; st: t5, f_6, f_2, s, c
fstp [_t5] ; st: f_6, f_2, s, c fstp _t5 ; st: f_6, f_2, s, c
fld st3 ; st: c, f_6, f_2, s, c fld st3 ; st: c, f_6, f_2, s, c
fmul st0, st1 fmul st0, st1
fld st3 fld st3
fmul st0, st3 ; st: f_2*s, f_6*c, f_6, f_2, s, c fmul st0, st3 ; st: f_2*s, f_6*c, f_6, f_2, s, c
fsubp st1, st0 ; st: t8, f_6, f_2, s, c fsubp st1, st0 ; st: t8, f_6, f_2, s, c
fstp [_t8] ; st: f_6, f_2, s, c fstp _t8 ; st: f_6, f_2, s, c
fstp st0 ; st: f_2, s, c fstp st0 ; st: f_2, s, c
fstp st0 ; st: s, c fstp st0 ; st: s, c
@ -232,51 +283,51 @@ step2: ; Step2
fld st3 fld st3
fmul st0, st3 ; st: f_4*s, f_8*c, f_8, f_4, s, c fmul st0, st3 ; st: f_4*s, f_8*c, f_8, f_4, s, c
faddp st1, st0 ; st: t7, f_8, f_4, s, c faddp st1, st0 ; st: t7, f_8, f_4, s, c
fld [_t5] ; st: t5, t7, f_8, f_4, s, c fld _t5 ; st: t5, t7, f_8, f_4, s, c
fsub st0, st1 ; st: t4, t7, f_8, f_4, s, c fsub st0, st1 ; st: t4, t7, f_8, f_4, s, c
fstp [_t4] fstp _t4
fstp [_t7] ; st: f_8, f_4, s, c fstp _t7 ; st: f_8, f_4, s, c
fld st3 ; st: c, f_8, f_4, s, c fld st3 ; st: c, f_8, f_4, s, c
fmul st0, st2 fmul st0, st2
fld st3 fld st3
fmul st0, st2 ; st: f_8*s, f_4*c, f_8, f_4, s, c fmul st0, st2 ; st: f_8*s, f_4*c, f_8, f_4, s, c
fsubp st1, st0 ; st:-t0, f_8, f_4, s, c fsubp st1, st0 ; st:-t0, f_8, f_4, s, c
fchs fchs
fld [_t8] fld _t8
fchs ; st:-t8, t0, f_8, f_4, s, c fchs ; st:-t8, t0, f_8, f_4, s, c
fsub st0, st1 ; st: t3, t0, f_8, f_4, s, c fsub st0, st1 ; st: t3, t0, f_8, f_4, s, c
fstp [_t3] fstp _t3
fstp [_t0] ; st: f_8, f_4, s, c fstp _t0 ; st: f_8, f_4, s, c
fstp st0 ; st: f_4, s, c fstp st0 ; st: f_4, s, c
fstp st0 ; st: s, c fstp st0 ; st: s, c
fstp st0 ; st: c fstp st0 ; st: c
fstp st0 ; st: <empty> fstp st0 ; st: <empty>
fld [_t1] fld _t1
fld [_t4] fld _t4
fld st1 fld st1
fsub st0, st1 fsub st0, st1
fstp qword[ebx+8*11] ; f[l7] = t1-t4 fstp qword[ebx+8*11] ; f[l7] = t1-t4
faddp st1, st0 faddp st1, st0
fstp qword[ebx+8*3] ; f[l5] = t1+t4 fstp qword[ebx+8*3] ; f[l5] = t1+t4
fld [_t2] fld _t2
fld [_t3] fld _t3
fld st1 fld st1
fsub st0, st1 fsub st0, st1
fstp qword[ebx+8*15] ; f[l8] fstp qword[ebx+8*15] ; f[l8]
faddp st1, st0 faddp st1, st0
fstp qword[ebx+8*7] ; f[l6] fstp qword[ebx+8*7] ; f[l6]
fld [_t6] fld _t6
fld qword[ebx+8] fld qword[ebx+8]
fld st1 fld st1
fsub st0, st1 fsub st0, st1
fxch st1 fxch st1
faddp st2, st0 ; st : t2, t1 faddp st2, st0 ; st : t2, t1
fld [_t8] fld _t8
fsub [_t0] fsub _t0
fld [_t5] fld _t5
fadd [_t7] ; st : t4, t3, t2, t1 fadd _t7 ; st : t4, t3, t2, t1
fld st3 fld st3
fsub st0, st1 fsub st0, st1
@ -294,36 +345,42 @@ step2: ; Step2
cmp ebx, eax cmp ebx, eax
jb .loop_i jb .loop_i
ret mov esp, ebp
pop ebp
align 8 ; shared local vars ret
_t0 dq 0
_t1 dq 0
_t2 dq 0
_t3 dq 0
_t4 dq 0
_t5 dq 0
_t6 dq 0
_t7 dq 0
_t8 dq 0
_t9 dq 0
;===================================================================
;=================================================================
; cdecl parameters:
; -- [ebp+8] = N
; -- [ebp+12] = p
; -- [ebp+16] = 4k-aligned data array address
; -- [ebp+20] = 4k-aligned SinCosTable address
; returns:
; -- nothing
; destroys:
; -- all GPRegs
; locals:
; -- 120 stack-located dwords (_t0 ... _t9, _l0..._step)
;; ==========================
align 4
step3: step3:
;=================================================================== push ebp
mov ebp, esp
sub esp, 120
; 283 : { ; 283 : {
; 293 : for (l=3; l<=p; l++) ; 293 : for (l=3; l<=p; l++)
mov cx, 0x0200 mov cx, 0x0200
align 4
.newstep: .newstep:
inc ch inc ch
cmp ch, Power_of_4 cmp ch, byte[ebp+12]
jg .done jg .done
mov [.step], cx mov _step, cx
; 294 : { ; 294 : {
; 295 : d1 = 1 << (l + l - 3); ; 295 : d1 = 1 << (l + l - 3);
@ -333,61 +390,63 @@ step3:
sub cl, 3 sub cl, 3
mov edx, 1 mov edx, 1
shl edx, cl shl edx, cl
mov [.d1], edx mov _d1, edx
; 296 : d2 = d1 << 1; ; 296 : d2 = d1 << 1;
shl edx, 1 shl edx, 1
mov [.d2], edx mov _d2, edx
mov eax, edx mov eax, edx
; 297 : d3 = d2 << 1; ; 297 : d3 = d2 << 1;
shl edx, 1 shl edx, 1
mov [.d3], edx mov _d3, edx
; 298 : d4 = d2 + d3; ; 298 : d4 = d2 + d3;
add eax, edx add eax, edx
mov [.d4], eax mov _d4, eax
; 299 : d5 = d3 << 1; ; 299 : d5 = d3 << 1;
shl edx, 1 shl edx, 1
mov [.d5], edx mov _d5, edx
shl edx, 3 shl edx, 3
mov [.d6], edx ; d6 = d5*8 to simplify index operations mov _d6, edx ; d6 = d5*8 to simplify index operations
; 339 : j5 = N / d5; ; moved out of internal loop ; 339 : j5 = N / d5; ; moved out of internal loop
mov cl, Power_of_4 mov cl, [ebp+12]
sub cl, ch sub cl, ch
add cl, cl add cl, cl
mov edx, 1 mov edx, 1
shl edx, cl shl edx, cl
mov [.j5], edx mov _j5, edx
; 300 : ; 300 :
; 301 : for (j=0; j<N; j+=d5) ; 301 : for (j=0; j<N; j+=d5)
mov esi, [_f] mov ebx, [ebp+16]
mov ebx, esi mov esi, [ebp+8]
add esi, NumPoints*8 shl esi, 3
mov [.end_of_array], esi add esi, ebx
mov _end_of_array, esi
align 4
.next_j: .next_j:
; { ; {
; t1 = f[j] + f[j+d2]; ; t1 = f[j] + f[j+d2];
mov eax, [.d2] mov eax, _d2
fld qword[ebx] fld qword[ebx]
fld qword[ebx+eax*8] fld qword[ebx+eax*8]
fld st1 fld st1
fadd st0, st1 fadd st0, st1
fstp [_t1] fstp _t1
; t2 = f[j] - f[j+d2]; ; t2 = f[j] - f[j+d2];
fsubp st1, st0 fsubp st1, st0
fstp [_t2] fstp _t2
; t3 = f[j+d3] + f[j+d4]; ; t3 = f[j+d3] + f[j+d4];
mov edi, [.d3] mov edi, _d3
fld qword[ebx+edi*8] fld qword[ebx+edi*8]
mov edx, [.d4] mov edx, _d4
fld qword[ebx+edx*8] fld qword[ebx+edx*8]
fld st1 fld st1
fsub st0, st1 ; st : t4, f4, f3 fsub st0, st1 ; st : t4, f4, f3
@ -398,7 +457,7 @@ step3:
; f[j+d4] = t2 - t4; ; f[j+d4] = t2 - t4;
; f[j+d3] = t2 + t4; ; f[j+d3] = t2 + t4;
fld [_t2] fld _t2
fld st0 fld st0
fsub st0, st2 ; st : f4, t2, t4, t3 fsub st0, st2 ; st : f4, t2, t4, t3
fstp qword[ebx+edx*8] ; st : t2, t4, t3 fstp qword[ebx+edx*8] ; st : t2, t4, t3
@ -407,7 +466,7 @@ step3:
; f[j+d2] = t1 - t3; ; f[j+d2] = t1 - t3;
; f[j] = t1 + t3; ; f[j] = t1 + t3;
fld [_t1] fld _t1
fst st1 fst st1
fsub st0, st2 ; st : f2, t1, t3 fsub st0, st2 ; st : f2, t1, t3
fstp qword[ebx+eax*8] ; st : t1, t3 fstp qword[ebx+eax*8] ; st : t1, t3
@ -416,7 +475,7 @@ step3:
fstp st0 fstp st0
; jj = j + d1; / ?? ; jj = j + d1; / ??
mov edi, [.d1] mov edi, _d1
shl edi, 3 ; = d1*8 shl edi, 3 ; = d1*8
mov edx, edi mov edx, edi
mov eax, edi mov eax, edi
@ -432,7 +491,7 @@ step3:
; t2 = f[jj+d2] * r; ; t2 = f[jj+d2] * r;
fld qword [edi+eax] fld qword [edi+eax]
fld [_root] fld [fht_r]
fmul st1, st0 ; st : r, t2, t3, t1 fmul st1, st0 ; st : r, t2, t3, t1
; t4 = f[jj+d4] * r ; t4 = f[jj+d4] * r
fmul qword [edx+eax] ; st : t4, t2, t3, t1 fmul qword [edx+eax] ; st : t4, t2, t3, t1
@ -461,58 +520,61 @@ step3:
; for (k=1; k<d1; k++) ; for (k=1; k<d1; k++)
xor ecx, ecx ; ecx = k xor ecx, ecx ; ecx = k
mov [.jj], ecx mov _jj, ecx
align 4
.next_k: .next_k:
inc ecx inc ecx
cmp ecx, [.d1] cmp ecx, _d1
jge .done_k jge .done_k
; { ; {
mov eax, [.d2] ; the sector increment mov eax, _d2 ; the sector increment
; l1 = j + k; ; l1 = j + k;
mov edx, ecx mov edx, ecx
mov [.l1], edx ; [ebx+edx*8] --> f[j+k] mov _l1, edx ; [ebx+edx*8] --> f[j+k]
; l2 = l1 + d2; ; l2 = l1 + d2;
add edx, eax add edx, eax
mov [.l2], edx mov _l2, edx
; l3 = l1 + d3; ; l3 = l1 + d3;
add edx, eax add edx, eax
mov [.l3], edx mov _l3, edx
; l4 = l1 + d4; ; l4 = l1 + d4;
add edx, eax add edx, eax
mov [.l4], edx mov _l4, edx
; l5 = j + d2 - k; ; l5 = j + d2 - k;
mov edx, eax mov edx, eax
sub edx, ecx sub edx, ecx
mov [.l5], edx mov _l5, edx
; l6 = l5 + d2; ; l6 = l5 + d2;
add edx, eax add edx, eax
mov [.l6], edx mov _l6, edx
; l7 = l5 + d3; ; l7 = l5 + d3;
add edx, eax add edx, eax
mov [.l7], edx mov _l7, edx
; l8 = l5 + d4; ; l8 = l5 + d4;
add edx, eax add edx, eax
mov [.l8], edx mov _l8, edx
; 340 : j5 *= k; // add-substituted multiplication ; 340 : j5 *= k; // add-substituted multiplication
mov eax, [.jj] mov eax, _jj
add eax, [.j5] add eax, _j5
mov [.jj], eax mov _jj, eax
; c1 = C[jj]; ; c1 = C[jj];
; s1 = S[jj]; ; s1 = S[jj];
mov edi, [_Cosins] mov edi, [ebp+20]
fld qword[edi+eax*8] fld qword[edi+eax*8]
mov esi, [_Sines] mov esi, [ebp+8]
shl esi, 2
add esi, edi
fld qword[esi+eax*8] ; st : s1, c1 fld qword[esi+eax*8] ; st : s1, c1
; t5 = f[l2] * c1 + f[l6] * s1; ; t5 = f[l2] * c1 + f[l6] * s1;
; t8 = f[l6] * c1 - f[l2] * s1; ; t8 = f[l6] * c1 - f[l2] * s1;
mov edx, [.l6] mov edx, _l6
fld qword[ebx+edx*8] fld qword[ebx+edx*8]
mov edx, [.l2] mov edx, _l2
fld st0 fld st0
fmul st0, st2 fmul st0, st2
fxch st1 fxch st1
@ -521,10 +583,10 @@ step3:
fmul st4, st0 fmul st4, st0
fmulp st3, st0 ; st : f[l6]*c, f[l6]*s, f[l2]*s, f[l2]*c fmulp st3, st0 ; st : f[l6]*c, f[l6]*s, f[l2]*s, f[l2]*c
fsub st0, st2 ; st : t8, f[l6]*s, f[l2]*s, f[l2]*c fsub st0, st2 ; st : t8, f[l6]*s, f[l2]*s, f[l2]*c
fstp [_t8] fstp _t8
faddp st2, st0 ; st : f[l2]*s, t5 faddp st2, st0 ; st : f[l2]*s, t5
fstp st0 ; st : t5 fstp st0 ; st : t5
fstp [_t5] ; st : <empty> fstp _t5 ; st : <empty>
; c2 = C[2*jj]; ; c2 = C[2*jj];
; s2 = S[2*jj]; ; s2 = S[2*jj];
@ -534,9 +596,9 @@ step3:
; t6 = f[l3] * c2 + f[l7] * s2; ; t6 = f[l3] * c2 + f[l7] * s2;
; t9 = f[l7] * c2 - f[l3] * s2; ; t9 = f[l7] * c2 - f[l3] * s2;
mov edx, [.l7] mov edx, _l7
fld qword[ebx+edx*8] fld qword[ebx+edx*8]
mov edx, [.l3] mov edx, _l3
fld st0 fld st0
fmul st0, st2 fmul st0, st2
fxch st1 fxch st1
@ -545,22 +607,22 @@ step3:
fmul st4, st0 fmul st4, st0
fmulp st3, st0 ; st : f[l7]*c, f[l7]*s, f[l3]*s, f[l3]*c fmulp st3, st0 ; st : f[l7]*c, f[l7]*s, f[l3]*s, f[l3]*c
fsub st0, st2 ; st : t9, f[l7]*s, f[l3]*s, f[l3]*c fsub st0, st2 ; st : t9, f[l7]*s, f[l3]*s, f[l3]*c
fstp [_t9] fstp _t9
faddp st2, st0 ; st : f[l2]*s, t6 faddp st2, st0 ; st : f[l2]*s, t6
fstp st0 ; st : t6 fstp st0 ; st : t6
fstp [_t6] ; st : <empty> fstp _t6 ; st : <empty>
; c3 = C[3*jj]; ; c3 = C[3*jj];
; s3 = S[3*jj]; ; s3 = S[3*jj];
add eax, [.jj] add eax, _jj
fld qword[edi+eax*8] fld qword[edi+eax*8]
fld qword[esi+eax*8] ; st : s3, c3 fld qword[esi+eax*8] ; st : s3, c3
; t7 = f[l4] * c3 + f[l8] * s3; ; t7 = f[l4] * c3 + f[l8] * s3;
; t0 = f[l8] * c3 - f[l4] * s3; ; t0 = f[l8] * c3 - f[l4] * s3;
mov edx, [.l8] mov edx, _l8
fld qword[ebx+edx*8] fld qword[ebx+edx*8]
mov edx, [.l4] mov edx, _l4
fld st0 fld st0
fmul st0, st2 fmul st0, st2
fxch st1 fxch st1
@ -569,192 +631,162 @@ step3:
fmul st4, st0 fmul st4, st0
fmulp st3, st0 ; st : f[l8]*c, f[l8]*s, f[l4]*s, f[l4]*c fmulp st3, st0 ; st : f[l8]*c, f[l8]*s, f[l4]*s, f[l4]*c
fsub st0, st2 ; st : t9, f[l8]*s, f[l4]*s, f[l4]*c fsub st0, st2 ; st : t9, f[l8]*s, f[l4]*s, f[l4]*c
fstp [_t0] fstp _t0
faddp st2, st0 ; st : f[l2]*s, t7 faddp st2, st0 ; st : f[l2]*s, t7
fstp st0 ; st : t7 fstp st0 ; st : t7
fstp [_t7] ; st : <empty> fstp _t7 ; st : <empty>
; t1 = f[l5] - t9; ; t1 = f[l5] - t9;
; t2 = f[l5] + t9; ; t2 = f[l5] + t9;
mov eax, [.l5] mov eax, _l5
fld qword [ebx+eax*8] fld qword [ebx+eax*8]
fld [_t9] fld _t9
fld st0 fld st0
fadd st0, st2 fadd st0, st2
fstp [_t2] fstp _t2
fsubp st1, st0 fsubp st1, st0
fstp [_t1] fstp _t1
; t3 = - t8 - t0; ; t3 = - t8 - t0;
fld [_t8] fld _t8
fadd [_t0] fadd _t0
fchs fchs
fstp [_t3] fstp _t3
; t4 = t5 - t7; ; t4 = t5 - t7;
fld [_t5] fld _t5
fsub [_t7] fsub _t7
fstp [_t4] fstp _t4
; f[l5] = t1 + t4; ; f[l5] = t1 + t4;
fld [_t1] fld _t1
fld [_t4] fld _t4
fld st0 fld st0
fadd st0, st2 fadd st0, st2
fstp qword [ebx+eax*8] fstp qword [ebx+eax*8]
; f[l7] = t1 - t4; ; f[l7] = t1 - t4;
mov eax, [.l7] mov eax, _l7
fsubp st1, st0 fsubp st1, st0
fstp qword [ebx+eax*8] fstp qword [ebx+eax*8]
; f[l6] = t2 + t3; ; f[l6] = t2 + t3;
mov eax, [.l6] mov eax, _l6
fld [_t2] fld _t2
fld [_t3] fld _t3
fld st0 fld st0
fadd st0, st2 fadd st0, st2
fstp qword [ebx+eax*8] fstp qword [ebx+eax*8]
; f[l8] = t2 - t3; ; f[l8] = t2 - t3;
mov eax, [.l8] mov eax, _l8
fsubp st1, st0 fsubp st1, st0
fstp qword [ebx+eax*8] fstp qword [ebx+eax*8]
; t1 = f[l1] + t6; ; t1 = f[l1] + t6;
mov eax, [.l1] mov eax, _l1
fld qword [ebx+eax*8] fld qword [ebx+eax*8]
fld [_t6] fld _t6
fld st0 fld st0
fadd st0, st2 fadd st0, st2
fstp [_t1] fstp _t1
; t2 = f[l1] - t6; ; t2 = f[l1] - t6;
fsubp st1, st0 fsubp st1, st0
fstp [_t2] fstp _t2
; t3 = t8 - t0; ; t3 = t8 - t0;
fld [_t8] fld _t8
fsub [_t0] fsub _t0
fstp [_t3] fstp _t3
; t4 = t5 + t7; ; t4 = t5 + t7;
fld [_t5] fld _t5
fadd [_t7] fadd _t7
fstp [_t4] fstp _t4
; f[l1] = t1 + t4; ; f[l1] = t1 + t4;
mov eax, [.l1] mov eax, _l1
fld [_t1] fld _t1
fld [_t4] fld _t4
fld st0 fld st0
fadd st0, st2 fadd st0, st2
fstp qword [ebx+eax*8] fstp qword [ebx+eax*8]
; f[l3] = t1 - t4; ; f[l3] = t1 - t4;
mov eax, [.l3] mov eax, _l3
fsubp st1, st0 fsubp st1, st0
fstp qword [ebx+eax*8] fstp qword [ebx+eax*8]
; f[l2] = t2 + t3; ; f[l2] = t2 + t3;
mov eax, [.l2] mov eax, _l2
fld [_t2] fld _t2
fld [_t3] fld _t3
fld st0 fld st0
fadd st0, st2 fadd st0, st2
fstp qword [ebx+eax*8] fstp qword [ebx+eax*8]
; f[l4] = t2 - t3; ; f[l4] = t2 - t3;
mov eax, [.l4] mov eax, _l4
fsubp st1, st0 fsubp st1, st0
fstp qword [ebx+eax*8] fstp qword [ebx+eax*8]
; 374 : } ; 374 : }
jmp .next_k jmp .next_k
align 4
.done_k: .done_k:
; 375 : } ; 375 : }
add ebx, [.d6] ; d6 = d5*8 add ebx, _d6 ; d6 = d5*8
cmp ebx, [.end_of_array] cmp ebx, _end_of_array
jb .next_j jb .next_j
; 376 : } ; 376 : }
mov cx, [.step] mov cx, _step
jmp .newstep jmp .newstep
.done: .done:
mov esp, ebp
pop ebp
; 377 : } ; 377 : }
ret ret
align 4
.l1 dd 0
.l2 dd 0
.l3 dd 0
.l4 dd 0
.l5 dd 0
.l6 dd 0
.l7 dd 0
.l8 dd 0
.l9 dd 0
.l0 dd 0
.d1 dd 0
.d2 dd 0
.d3 dd 0
.d4 dd 0
.d5 dd 0
.d6 dd 0
.j5 dd 0
.jj dd 0
.end_of_array dd 0
.step dw 0
align 8
;=========== Step3 ends here =========== ;=========== Step3 ends here ===========
; ================================================================= ; =================================================================
; syscall entry
;
_f dd ?
_N dd 1024 ; number of points
_a dd ? ; initial data array ;=================================================================
x dd 0 ; tranformed (float) data array ; parameters:
_Cosins dd 0 ; -- [ebp+12] = N
_Sines dd 0 ; -- [ebp+16] = p
; -- [ebp+20] = 4k-aligned data array address
; -- [ebp+24] = 4k-aligned SinCosTable address
; returns:
; -- nothing
; destroys:
; -- all GPRegs
;; ==========================
FFT4: align 4
or al, al
jnz .trans FHT_4:
mov cl, Power_of_4 push ebp
mov eax, 1 mov ebp, esp
shl eax, cl
shl eax, cl mov edx, [ebp+20] ; a
mov [_N], eax mov dl, byte[ebp+16]
shl eax, 2 ; size of Sine table in bytes
add eax, ebx
mov [_Sines], ebx
mov [_Cosins], eax
cpuid
rdtsc
mov [.time], eax
call MakeSinCosTable
cpuid
rdtsc
sub eax, [.time]
ret
.trans:
mov [x], ebx
mov [_f], ebx
cli ;-----
cpuid
rdtsc
mov [.time], eax
call BitInvert call BitInvert
call step1 push dword[ebp+20] ; a
call step2 push ecx ; N
call step1 ; 4-point transform
cmp cl, 1
jz .done
call step2 ; 16-point transform
cmp byte[ebp+16],1 ; p = 2 ?
jz .done
pop edx ; N
pop ecx ; a
push dword[ebp+24] ; t
push ecx
push dword[ebp+16] ; p
push edx ; N
call step3 call step3
cpuid .done:
rdtsc mov esp, ebp
sti ;---- pop ebp
sub eax, [.time]
ret
.time dd 0 ret

View File

@ -0,0 +1,207 @@
;========================================================================
;= =
;= Fast Hartley Transform routine demo for KolibriOS =
;= =
;= Copyright (C) 2010, Artem Jerdev <kolibri@jerdev.co.uk> =
;= =
;= refer to wiki.kolibtios.org for all details =
;= =
;========================================================================
use32
org 0x0
db 'MENUET01' ; 8 byte id
dd 0x01 ; header version
dd START ; start of code
dd I_END ; size of image
dd 0x100000 ; memory for app
dd 0xbfffc ; esp
dd 0x0 , 0x0 ; I_Param , I_Icon
include 'macros.inc'
include 'debug.inc'
include 'FHT4i.inc'
START: ; start of execution
call main
mov eax,-1 ; close this program
int 0x40
;=============================================================
;Func: calculates a simple function
; ff = (int)(500*exp(-t) * cos (2.5*t))
; uses: eax, ebx
;------------
Func:
; 9 : {
; 10 : double x,t;
; 11 : int f;
; 12 :
; 13 : x = (i < N2) ? i : i - NUM_POINTS;
mov eax, [ii]
cmp eax, 512
jge .index_negative
jmp .index_correct
.index_negative:
sub eax, 1024
.index_correct:
mov [temp], eax
; fild [temp]
; 14 : t = x / 16.0;
; f2xm1 argument (abs) must be less than 1, so
mov [t_mod], eax
and [t_mod], 0x0F ; x % 16
shr eax, 4 ; x / 16
mov [t_div], eax
fild [temp]
; 15 : if (t<0) t = -t;
fabs
exp_ok:
; 16 : f = (int)(512*2^(-t) * cos (2.5*t));
fchs
f2xm1
fmul [f500]
fstp [tv93]
fld [f2_5]
fmul [tt]
fcos
fmul [tv93]
fstp [tt]
mov bx, word[tt+6]
shr bx,4
and bx,0x07FF
add ax,bx
shl ax,4
and word[tt+6], 0x800F
or word[tt+6], ax
fld [tt]
fstp [ff]
; 17 : return f;
; 18 : }
ret
;---------------------------------------------------------
; test data filler
;
; uses eax, ebx, ecx
FillData:
; 29 : for (i=0; i<NUM_POINTS; i++)
; here : ecx = i
xor ecx, ecx
.funcloop:
; 30 : {
; 31 : ia[i] = Func(i);
mov [ii], ecx
call Func
fld [ff]
fstp qword [edx+ecx*8]
; 32 : }
inc ecx
cmp ecx, [_in] ; ecx == N ?
jne .funcloop
ret
;====================================================================
; main
;====================================================================
align 4
_ia dd 0
_ii dd 0
_ip dd 0
_in dd 0
_it dd 0
;-----------------
main:
mov eax, 68
mov ebx, 11
int 0x40
fninit
mov cl, 2 ; power of 4
mov byte[_ip], cl
mov eax, 1
shl eax, cl
shl eax, cl
mov [_in], eax
mov dl, cl
call CreateSinCosTable
mov [_it], edx
mov ecx, [_in]
shl ecx, 3
mov ebx, 12
mov eax, 68
int 0x40
mov [_ia], eax
mov edx, eax
call FillData
cpuid
rdtsc
mov [t_0], eax
push [_it]
push [_ia]
push [_ip]
push [_in]
; call FHT_4
xor eax, eax
syscall
add esp, 16
cpuid
rdtsc
mov [t_1], eax
sub eax, [t_0]
debug_print_hex eax
print '<- fht time'
mov edx, [_it]
call DestroySinCosTable
mov ecx, [_ia]
mov ebx, 13
mov eax, 68
int 0x40
ret
; ========================================================
; static data
;----------------
align 8
;f18 dq 0x4032000000000000
f256 dq 256.01f
f14_2 dq 14.2f
f500 dq 0x407f400000000000
f2_5 dq 0x4004000000000000
tt dq ?
tv93 dq ?
t_div dd ?
t_mod dd ?
temp dd ?
ff dq ? ; return value (int)
ii dd ? ; argument (int) = array index
t_1 dd ?
t_0 dd ?
fcontrol dw 0x0037f
title db ' Fast Hartley Transform Test - A.Jerdev 2010'
I_END: