fast Hartley transform presented

git-svn-id: svn://kolibrios.org@2210 a494cfbc-eb01-0410-851d-a64ba20cac60
2011-09-18 21:08:53 +00:00 · 2011-09-18 21:08:53 +00:00 · 6641043d17
commit 6641043d17
parent 6bbd71b6f8
6 changed files with 1210 additions and 138 deletions
--- a/programs/other/archer/trunk/@RCHER.ASM
+++ b/programs/other/archer/trunk/@RCHER.ASM
@ -79,7 +79,7 @@ end if
 ;{
    include "..\..\..\macros.inc"
 ;    purge mov
-    include "debug.inc"
+    include "..\..\debug.inc"
    include 'dump.inc'
 ;}
 end if
--- a/programs/other/archer/trunk/debug.inc
+++ b/programs/other/archer/trunk/debug.inc
@ -67,7 +67,7 @@ debug_outstr:
   mov  cl,[edx]
   test cl,cl
   jz   @f
-   int  40h
+   mcall
   inc  edx
   jmp  @b
 @@:
--- a/programs/other/fft/FHT.txt
+++ b/programs/other/fft/FHT.txt
@ -0,0 +1,28 @@
+This is a draft version of my Fast Hartley Transform (FHT) routine for KolibriOS
+
+Hartley transform is a real-basis version of well-known Fourier transform:
+
+1) basis function:    cas(x) = cos(x) + sin(x);
+2) forward transform: H(f) = sum(k=0..N-1) [X(k)*cas(kf/(2*pi*N))] 
+3) reverse transform: X(k) = 1/N * sum(f=0..N-1) [H(f)*cas(kf/(2*pi*N))] 
+
+FHT is known to be faster than most conventional fast Fourier transform (FHT) methods.
+It also uses half-length arrays due to no need of imaginary data storage.
+
+FHT can be easily converted to FFT (and back) with no loss of information.
+Most of general tasks FFT used for (correlation, convolution, energy spectra, noise
+filtration, differential math, phase detection ect.) may be done directly with FHT.
+
+====================================================================================
+
+Copyright (C) A. Jerdev 1999, 2003 and 2010. 
+
+The code can be used, changed and redistributed in any KolibriOS application
+with only two limitations:
+
+1) the author's name and copyright information cannot be deleted or changed;
+
+2) the code is not allowed to be ported to or distributed with other operation systems.
+
+18/09/2010
+Artem Jerdev <kolibri@jerdev.co.uk>
--- a/programs/other/fft/FHT4A.asm
+++ b/programs/other/fft/FHT4A.asm
@ -0,0 +1,203 @@
+;========================================================================
+;=                                                                      =
+;=         Fast Hartley Transform   demo for KolibriOS                  =
+;=                                                                      =
+;=       Copyright (C) 2010, Artem Jerdev <kolibri@jerdev.co.uk>        =
+;=                                                                      =
+;=             refer to wiki.kolibtios.org for all details              =
+;=                                                                      =
+;========================================================================
+
+
+
+
+use32
+
+	       org    0x0
+
+	       db     'MENUET01'	      ; 8 byte id
+	       dd     0x01		      ; header version
+	       dd     START		      ; start of code
+	       dd     I_END		      ; size of image
+	       dd     0x100000		      ; memory for app
+	       dd     0xbfffc		      ; esp
+	       dd     0x0 , 0x0 	      ; I_Param , I_Icon
+
+include '../../macros.inc'
+include '../debug.inc'
+include 'FHT4i.inc'
+
+
+START:				; start of execution
+
+     call main
+
+
+    mov  eax,-1 		; close this program
+    int  0x40
+
+
+;=============================================================
+;Func: calculates a simple function
+;      ff = (512*2^(-t) * cos (2.5*t))
+;               uses: eax, ebx
+;------------
+Func:
+
+; 9    : {
+
+; 10   :   double x,t;
+; 11   :   int f;
+; 12   :
+; 13   :   x = (i < N2) ? i : i - NUM_POINTS;
+	mov	eax, [ii]
+	cmp	eax, 512
+	jge	.index_negative
+	jmp	.index_correct
+.index_negative:
+	sub	eax, 1024
+.index_correct:
+	mov	[temp], eax
+;        fild    [temp]
+
+; 14   :   t = x / 16.0;
+; f2xm1 argument (abs) must be less than 1, so
+	mov	[t_mod], eax
+	and	[t_mod], 0x0F	; x % 16
+	shr	eax, 4		; x / 16
+	mov	[t_div], eax
+	fild	[temp]
+
+; 15   :   if (t<0) t = -t;
+	fabs
+exp_ok:
+; 16   :   f = (512*2^(-t) * cos (2.5*t));
+	fchs
+	f2xm1
+	fmul	[f500]
+	fstp	[tv93]
+	fld	[f2_5]
+	fmul	[tt]
+	fcos
+	fmul	[tv93]
+	fstp	[tt]
+	mov	bx, word[tt+6]
+	shr	bx,4
+	and	bx,0x07FF
+	add	ax,bx
+	shl	ax,4
+	and	word[tt+6], 0x800F
+	or	word[tt+6], ax
+	fld	[tt]
+	fstp   [ff]
+
+; 17   :   return  f;
+; 18   : }
+	ret
+;---------------------------------------------------------
+;       test data filler
+;
+;       uses eax, ebx, ecx
+FillData:
+; 29   :    for (i=0; i<NUM_POINTS; i++)
+; here : ecx = i
+	xor	ecx, ecx
+.funcloop:
+; 30   :    {
+; 31   :       ia[i] = Func(i);
+	mov	[ii], ecx
+	call	Func
+	fld	[ff]
+	fstp	qword [edx+ecx*8]
+; 32   :    }
+	inc	ecx
+	cmp	ecx, [_in]	   ; ecx == N ?
+	jne	.funcloop
+	ret
+
+;====================================================================
+; main
+;====================================================================
+
+_ia	dd 0
+_ii	dd 0
+_ip	dd 0
+_in	dd 0
+_it	dd 0
+;-----------------
+
+main:
+	 mov	eax, 68
+	 mov	ebx, 11
+	 int	0x40
+	fninit
+	 mov	cl,   2 	; power of 4
+	 mov	byte[_ip], cl
+	 mov	eax, 1
+	 shl	eax, cl
+	 shl	eax, cl
+	 mov	[_in], eax
+	 mov	dl, cl
+	 call	CreateSinCosTable
+	 mov	[_it], edx
+	 mov	ecx, [_in]
+	 shl	ecx, 3
+	 mov	ebx, 12
+	 mov	eax, 68
+	 int	0x40
+	 mov	[_ia], eax
+	 mov	edx, eax
+
+	call FillData
+
+	cpuid
+	rdtsc
+	mov	[t_0], eax
+
+	push	[_it]
+	push	[_ia]
+	push	[_ip]
+	push	[_in]
+	call	FHT_4
+	add	esp, 16
+	cpuid
+	rdtsc
+	mov	[t_1], eax
+
+	sub	eax, [t_0]
+	debug_print_hex eax
+	print	'<- fht time'
+
+	mov	edx, [_it]
+	call	DestroySinCosTable
+	mov	ecx, [_ia]
+	mov	ebx, 13
+	mov	eax, 68
+	int	0x40
+	ret
+
+
+; ========================================================
+; static data
+;----------------
+
+align 8
+
+;f18     dq 0x4032000000000000
+f256	dq 256.01f
+f14_2	dq 14.2f
+f500	dq 0x407f400000000000
+f2_5	dq 0x4004000000000000
+tt	dq ?
+tv93	dq ?
+t_div	dd ?
+t_mod	dd ?
+temp	dd ?
+ff	dq ?	; return value (int)
+ii	dd ?	; argument (int) = array index
+t_1	dd ?
+t_0	dd ?
+fcontrol dw  0x0037f
+title	db ' Fast Hartley Transform Test - A.Jerdev 2010'
+
+I_END:
--- a/programs/other/fft/FHT4i.inc
+++ b/programs/other/fft/FHT4i.inc
@ -0,0 +1,841 @@
+;           Fast Hartley Transform routine
+;           Copyright (C) 1999, 2004, 2010
+;          Artem Jerdev  artem@jerdev.co.uk
+;
+; free KolibriOS version - not to be ported to other OSes
+; ==========================================================
+
+
+; global constants
+align 8
+_r	dq	1.41421356237309504880169	; = sqrt(2)
+_r2	dq	0.70710678118654752440084	; = sqrt(2)/2
+_c1	dq	0.92387953251128675612818	; = cos(pi/8)
+_s1	dq	0.38268343236508977172846	; = sin(pi/8)
+
+;=================================================================
+; parameter1:
+; -- reg dl (bits[3:0])   = Power_of_4
+; returns:
+; -- reg edx = _CosTable address (4k-aligned)
+; assumes:     _SinTable  = _CosTable + (N/2)*8
+; user heap has to be initialized
+; destroys:
+; -- eax, ebx, ecx
+;; ==========================
+align 4
+CreateSinCosTable:
+	xor	eax,  eax
+	inc	eax
+	mov	cl,  dl
+	and	cl,  15
+	shl	eax, cl
+	shl	eax, cl
+	mov	ecx, eax		; now ecx = N
+	shl	ecx, 3
+	mov	ebx, 12
+	mov	eax, 68
+	int	0x40			; getmem(N*sizeof(double))
+
+	mov	edx, eax		; edx = _CosTable
+	shr	ecx, 1
+	mov	ebx, eax
+	add	ebx, ecx		; ebx = _SinTable
+	shr	ecx, 3
+	push	ecx			; [esp] = ecx = N/2
+
+	xor	eax,  eax
+	fldpi
+	fidiv	dword[esp]		; st : dx = 2*pi/N
+	pop	ecx
+	fldz				; st : 0, dx
+.loop:
+	fld	st0			; st : x, x, dx
+	FSINCOS 			; st : cos, sin, x, dx
+	fstp	qword [edx+eax*8]	; st : sin, x, dx
+	fstp	qword [ebx+eax*8]	; st : x, dx
+	fadd	st0, st1		; st : x+dx, dx
+
+	inc	eax
+	cmp	eax, ecx
+	jne	.loop
+	fstp	st0			; st : dx
+	fstp	st0			; st : <empty>
+ret
+
+;=================================================================
+; parameter1:
+; -- reg edx = _CosTable address
+; destroys:
+; -- eax, ebx, ecx
+;; ==========================
+align 4
+DestroySinCosTable:
+	mov	ecx, edx
+	mov	ebx, 13
+	mov	eax, 68
+	int	0x40			; free(SinCosTable)
+ret
+
+;=================================================================
+; parameter1:
+; -- reg  dl (bits[3:0])   = Power_of_4
+; -- reg edx && (-16) = 4k-aligned data array address
+; returns:
+; -- edx = Power_of_4
+; -- ecx = N
+; destroys:
+; -- eax, ebx, ecx, edx, esi
+;; ==========================
+align 4
+BitInvert:
+	mov	esi, edx
+	and	esi, 0xFFFFFFF0
+	and	edx, 0x0F
+	push	edx
+	mov	cl, dl
+	xor	eax, eax
+	inc	eax
+	shl	eax, cl
+	shl	eax, cl
+	push	eax
+	xor	ecx, ecx		; index term
+.newterm:
+	inc	ecx
+	cmp	ecx, [esp]		; N
+	jge	.done
+
+	xor	eax, eax
+	mov	edx, ecx
+	xor	bl, bl
+
+.do_invert:
+	inc	bl
+	cmp	bl, byte[esp+4] ; Power_of_4
+	jg	.switch
+
+	mov	bh, dl
+	and	bh,  3
+	shl	eax, 2
+	or	al, bh
+	shr	edx, 2
+	jmp	.do_invert
+
+.switch:
+	cmp	eax, ecx
+	jle	.newterm
+
+	fld	qword [esi+eax*8]
+	fld	qword [esi+ecx*8]
+	fstp	qword [esi+eax*8]
+	fstp	qword [esi+ecx*8]
+	jmp	.newterm
+
+.done:
+	pop	ecx
+	pop	edx
+	ret
+
+;=================================================================
+
+
+;=================================================================
+; stdcall parameters:
+; -- [esp+4]  = N
+; -- [esp+8]  = 4k-aligned data array  address
+; returns:
+; -- nothing
+; destroys:
+; -- ebx, esi
+;; ==========================
+align 4
+step1:
+	mov	ebx, [esp+8]
+	mov	esi, [esp+4]
+	shl	esi, 3
+	add	esi, ebx
+
+.loop:
+	fld	qword[ebx]
+	fld	qword[ebx+8]
+	fld	st1
+	fsub	st0, st1	; st : t2, f[i+1], f[i]
+	fxch	st1		; st : f[i+1], t2, f[i]
+	faddp	st2, st0	; st : t2, t1
+	fld	qword[ebx+16]
+	fld	qword[ebx+24]
+	fld	st1		; st : f[i+2], f[i+3], f[i+2], t2, t1
+	fadd	st0, st1	; st : t3, f[i+3], f[i+2], t2, t1
+	fxch	st2		; st : f[i+2], f[i+3], t3, t2, t1
+	fsub	st0, st1	; st : t4, f[i+3], t3, t2, t1
+	fstp	st1		; st : t4, t3, t2, t1
+	fld	st2		; st : t2, t4, t3, t2, t1
+	fadd	st0, st1	; st : t2+t4, t4, t3, t2, t1
+	fstp	qword[ebx+16]	; st : t4, t3, t2, t1
+	fsubp	st2, st0	; st : t3, t2-t4, t1
+	fld	st2		; st : t1, t3, t2-t4, t1
+	fadd	st0, st1	; st : t1+t3, t3, t2-t4, t1
+	fstp	qword[ebx]	; st : t3, t2-t4, t1
+	fsubp	st2, st0	; st : t2-t4, t1-t3
+	fstp	qword[ebx+24]	; st : t1-t3
+	fstp	qword[ebx+8]	; st : <empty>
+
+	add	ebx, 32
+	cmp	ebx, esi
+	jnz	.loop
+ret
+
+;       local stack definitions
+;===========================================================================
+_t0	equ	dword [esp]
+_t1	equ	dword[esp+4]
+_t2	equ	dword[esp+8]
+_t3	equ	dword[esp+12]
+_t4	equ	dword[esp+16]
+_t5	equ	dword[esp+20]
+_t6	equ	dword[esp+24]
+_t7	equ	dword[esp+28]
+_t8	equ	dword[esp+32]
+_t9	equ	dword[esp+36]
+
+_l1   equ	dword[esp+40]
+_l2   equ	dword[esp+44]
+_l3   equ	dword[esp+48]
+_l4   equ	dword[esp+52]
+_l5   equ	dword[esp+56]
+_l6   equ	dword[esp+60]
+_l7   equ	dword[esp+64]
+_l8   equ	dword[esp+68]
+_l9   equ	dword[esp+72]
+_l0   equ	dword[esp+76]
+_d1   equ	dword[esp+80]
+_d2   equ	dword[esp+84]
+_d3   equ	dword[esp+88]
+_d4   equ	dword[esp+92]
+_d5   equ	dword[esp+96]
+_d6   equ	dword[esp+100]
+_j5   equ	dword[esp+104]
+_jj   equ	dword[esp+108]
+_end_of_array	equ	dword[esp+112]
+_step		equ	word [esp+116]
+
+
+;=================================================================
+; cdecl parameters:
+; -- [ebp+8]   = N
+; -- [ebp+12]  = 4k-aligned data array  address
+; returns:
+; -- nothing
+; destroys:
+; -- eax, ebx
+; locals:
+; -- 10 stack-located dwords (_t0 ... _t9)
+;; ==========================
+align 4
+step2:
+	push	ebp
+	mov	ebp, esp
+	sub	esp, 40
+	mov	ebx, [ebp+12]
+	mov	eax, [ebp+ 8]
+	shl	eax, 3
+	add	eax, ebx
+
+.loop_i:
+
+; -- quad subelements  +0, +4, +8 and +12 (simpliest operations)
+	fld	qword[ebx]
+	fld	qword[ebx+8*4]
+	fld	st0
+	fadd	st0, st2	; st : t1, f_4, f_0
+	fxch	st1
+	fsubp	st2, st0	; st : t1, t2
+	fld	qword[ebx+8*8]
+	fld	qword[ebx+8*12]
+	fld	st0
+	fadd	st0, st2	; st : t3, f_12, t1, t2
+	fxch	st1
+	fsubp	st2, st0	; st : t3, t4, t1, t2
+	; ------
+	fld	st2		; st : t1, t3, t4, t1, t2
+	fadd	st0, st1
+	fstp	qword[ebx]	; st : t3, t4, t1, t2
+	fsub	st0, st2	; st : t3-t1, t4, t1, t2
+	fchs			; st : t1-t3, t4, t1, t2
+	fstp	qword[ebx+8*4]	; st : t4, t1, t2
+	fst	st1		; st : t4, t4, t2
+	fadd	st0, st2	; st : t2+t4, t4, t2
+	fstp	qword[ebx+8*8]	; st : t4, t2
+	fsubp	st1, st0	; st : t2-t4
+	fstp	qword[ebx+8*12] ; st : <empty>
+
+; -- even subelements  +2, +6, +10 and +14 (2 multiplications needed)
+	fld	qword[ebx+8*2]
+	fld	qword[ebx+8*6]
+	fld	[_r]
+	fmul	st1, st0	; st : r, t2, t1
+	fld	qword[ebx+8*10]
+	fxch	st1		; st : r, t3, t2, t1
+	fmul	qword[ebx+8*14] ; st : t4, t3, t2, t1
+	; ------
+	fld	st3		; st : t1, t4, t3, t2, t1
+	fadd	st0, st3	;
+	fadd	st0, st2	;
+	fst	qword[ebx+8*2]	; store f[i+8] = t1+t2+t3
+	fsub	st0, st3	;
+	fsub	st0, st3	;
+	fstp	qword[ebx+8*10] ; store f[i+10]= t1-t2+t3
+	fld	st3		; st : t1, t4, t3, t2, t1
+	fsub	st0, st2	;
+	fsub	st0, st1	;
+	fst	qword[ebx+8*14] ; store f[i+14]= t1-t3-t4
+	fadd	st0, st1	;
+	faddp	st1, st0	; st : t1-t3+t4, t3, t2, t1
+	fstp	qword[ebx+8*6]	; store f[i+6]
+	fstp	st0		; st : t2, t1
+	fstp	st0		; st : t1
+	fstp	st0		; st : <empty>
+
+; -- odd subelements
+	fld	qword[ebx+8*9]
+	fld	qword[ebx+8*11]
+	fld	st1
+	fsub	st0, st1
+	fxch	st1
+	faddp	st2, st0	; st : (f[l3]-f[l7]), (f[l3]+f[l7])
+	fld	[_r2]
+	fmul	st2, st0
+	fmulp	st1, st0	; st : t9, t6
+	fld	qword[ebx+8*3]
+	fld	st0
+	fadd	st0, st2	; st : t1, f[l5], t9, t6
+	fstp	_t1
+	fsub	st0, st1
+	fstp	_t2
+	fstp	_t9	; (t9 never used)
+	fstp	_t6		; st : <empty>
+
+	fld	[_c1]
+	fld	[_s1]
+	fld	qword[ebx+8*5]
+	fld	qword[ebx+8*7]
+	fld	st3		; st: c1, f[l6], f[l2], s1, c1
+	fmul	st0, st2	; st: f_2*c, f_6, f_2, s, c
+	fld	st1		; st: f_6, f_2*c, f_6, f_2, s, c
+	fmul	st0, st4	; st: f_6*s, f_2*c, f_6, f_2, s, c
+	faddp	st1, st0	; st: t5, f_6, f_2, s, c
+	fstp	_t5		; st: f_6, f_2, s, c
+	fld	st3		; st: c, f_6, f_2, s, c
+	fmul	st0, st1
+	fld	st3
+	fmul	st0, st3	; st: f_2*s, f_6*c, f_6, f_2, s, c
+	fsubp	st1, st0	; st: t8, f_6, f_2, s, c
+	fstp	_t8		; st: f_6, f_2, s, c
+	fstp	st0		; st: f_2, s, c
+	fstp	st0		; st: s, c
+
+	fld	qword[ebx+8*13]
+	fld	qword[ebx+8*15]
+	fld	st3		; st: c1, f[l8], f[l4], s1, c1
+	fmul	st0, st1
+	fld	st3
+	fmul	st0, st3	; st: f_4*s, f_8*c, f_8, f_4, s, c
+	faddp	st1, st0	; st: t7, f_8, f_4, s, c
+	fld	_t5		; st: t5, t7, f_8, f_4, s, c
+	fsub	st0, st1	; st: t4, t7, f_8, f_4, s, c
+	fstp	_t4
+	fstp	_t7		; st: f_8, f_4, s, c
+	fld	st3		; st: c, f_8, f_4, s, c
+	fmul	st0, st2
+	fld	st3
+	fmul	st0, st2	; st: f_8*s, f_4*c, f_8, f_4, s, c
+	fsubp	st1, st0	; st:-t0, f_8, f_4, s, c
+	fchs
+	fld	_t8
+	fchs			; st:-t8, t0, f_8, f_4, s, c
+	fsub	st0, st1	; st: t3, t0, f_8, f_4, s, c
+	fstp	_t3
+	fstp	_t0		; st: f_8, f_4, s, c
+	fstp	st0		; st: f_4, s, c
+	fstp	st0		; st: s, c
+	fstp	st0		; st: c
+	fstp	st0		; st: <empty>
+
+	fld	_t1
+	fld	_t4
+	fld	st1
+	fsub	st0, st1
+	fstp	qword[ebx+8*11] ; f[l7] = t1-t4
+	faddp	st1, st0
+	fstp	qword[ebx+8*3]	; f[l5] = t1+t4
+	fld	_t2
+	fld	_t3
+	fld	st1
+	fsub	st0, st1
+	fstp	qword[ebx+8*15] ; f[l8]
+	faddp	st1, st0
+	fstp	qword[ebx+8*7]	; f[l6]
+
+	fld	_t6
+	fld	qword[ebx+8]
+	fld	st1
+	fsub	st0, st1
+	fxch	st1
+	faddp	st2, st0	; st : t2, t1
+	fld	_t8
+	fsub	_t0
+	fld	_t5
+	fadd	_t7		; st : t4, t3, t2, t1
+
+	fld	st3
+	fsub	st0, st1
+	fstp	qword[ebx+8*9]	; f[l3] = t1-t4
+	fadd	st0, st3
+	fstp	qword[ebx+8]	; f[l1] = t1+t4
+	fld	st1		; st : t2, t3, t2, t1
+	fsub	st0, st1	; f[l4] = t2-t3
+	fstp	qword[ebx+8*13] ; st : t3, t2, t1
+	faddp	st1, st0	; st : t2+t3, t1
+	fstp	qword[ebx+8*5]	; f[l2] = t2+t3
+	fstp	st0		; st : <empty>
+
+	add	ebx, 16*8
+	cmp	ebx, eax
+	jb	.loop_i
+
+	mov	esp, ebp
+	pop	ebp
+ret
+
+
+
+
+;=================================================================
+; cdecl parameters:
+; -- [ebp+8]   = N
+; -- [ebp+12]  = p
+; -- [ebp+16]  = 4k-aligned data array  address
+; -- [ebp+20]  = 4k-aligned SinCosTable address
+; returns:
+; -- nothing
+; destroys:
+; -- all GPRegs
+; locals:
+; -- 120 stack-located dwords (_t0 ... _t9, _l0..._step)
+;; ==========================
+align 4
+step3:
+	push	ebp
+	mov	ebp, esp
+	sub	esp, 120
+; 283  : {
+
+
+; 293  :   for (l=3; l<=p; l++)
+	mov	cx, 0x0200
+.newstep:
+	inc	ch
+	cmp	ch, byte[ebp+12]
+	jg	.done
+	mov	_step, cx
+
+; 294  :   {
+; 295  :     d1 = 1 << (l + l - 3);
+
+	mov	cl, ch
+	add	cl, cl
+	sub	cl, 3
+	mov	edx, 1
+	shl	edx, cl
+	mov	_d1, edx
+
+; 296  :     d2 = d1 << 1;
+	shl	edx, 1
+	mov	_d2, edx
+	mov	eax, edx
+
+; 297  :     d3 = d2 << 1;
+	shl	edx, 1
+	mov	_d3, edx
+
+; 298  :     d4 = d2 + d3;
+	add	eax, edx
+	mov	_d4, eax
+
+; 299  :     d5 = d3 << 1;
+	shl	edx, 1
+	mov	_d5, edx
+	shl	edx, 3
+	mov	_d6, edx	; d6 = d5*8 to simplify index operations
+
+; 339  :         j5 = N / d5;   ; moved out of internal loop
+	mov	cl, [ebp+12]
+	sub	cl, ch
+	add	cl, cl
+	mov	edx, 1
+	shl	edx, cl
+	mov	_j5, edx
+
+; 300  :
+; 301  :     for (j=0; j<N; j+=d5)
+	mov	ebx, [ebp+16]
+	mov	esi, [ebp+8]
+	shl	esi, 3
+	add	esi, ebx
+	mov	_end_of_array, esi
+
+.next_j:
+
+; {
+; t1 = f[j] + f[j+d2];
+	mov	eax, _d2
+	fld	qword[ebx]
+	fld	qword[ebx+eax*8]
+	fld	st1
+	fadd	st0, st1
+	fstp	_t1
+
+; t2 = f[j] - f[j+d2];
+	fsubp	st1, st0
+	fstp	_t2
+
+; t3 = f[j+d3] + f[j+d4];
+	mov	edi, _d3
+	fld	qword[ebx+edi*8]
+	mov	edx, _d4
+	fld	qword[ebx+edx*8]
+	fld	st1
+	fsub	st0, st1		; st : t4, f4, f3
+	fxch	st1			; st : f4, t4, f3
+
+; t4 = f[j+d3] - f[j+d4];
+	faddp	st2, st0		; st : t4, t3
+
+; f[j+d4] = t2 - t4;
+; f[j+d3] = t2 + t4;
+	fld	_t2
+	fld	st0
+	fsub	st0, st2		; st : f4, t2, t4, t3
+	fstp	qword[ebx+edx*8]	; st : t2, t4, t3
+	fadd	st0, st1		; st : f3, t4, t3
+	fstp	qword[ebx+edi*8]	; st : t4, t3
+
+; f[j+d2] = t1 - t3;
+; f[j]    = t1 + t3;
+	fld	_t1
+	fst	st1
+	fsub	st0, st2		; st : f2, t1, t3
+	fstp	qword[ebx+eax*8]	; st : t1, t3
+	fadd	st0, st1		; st : f0, t3
+	fstp	qword[ebx]		; st : t3
+	fstp	st0
+
+; jj = j + d1;     / ??
+	mov	edi, _d1
+	shl	edi, 3		; = d1*8
+	mov	edx, edi
+	mov	eax, edi
+	add	eax, eax	; eax = d2*8
+	shl	edx, 2		; = d3*8
+	add	edi, ebx	; now [edi] points to f[jj]
+	add	edx, edi	; and [edx] points to f[jj+d3]
+
+; t1 = f[jj];
+	fld	qword [edi]	; st : t1
+; t3 = f[jj+d3];
+	fld	qword [edx]	; st : t3, t1
+
+; t2 = f[jj+d2] * r;
+	fld	qword [edi+eax]
+	fld	[_r]
+	fmul	st1, st0	; st : r,  t2, t3, t1
+; t4 = f[jj+d4] * r
+	fmul	qword [edx+eax] ; st : t4, t2, t3, t1
+
+; f[jj]    = t1 + t2 + t3;
+	fld	st3		; st : t1, t4, t2, t3, t1
+	fadd	st0, st3
+	fadd	st0, st2
+	fstp	qword [edi]
+
+; f[jj+d2] = t1 - t3 + t4;
+	fld	st3
+	fsub	st0, st3	; st : (t1-t3), t4, t2, t3, t1
+	fld	st0
+	fadd	st0, st2	; st : f2, (t1-t3), t4, t2, t3, t1
+	fstp	qword [edi+eax]
+; f[jj+d4] = t1 - t3 - t4;
+	fsub	st0, st1	; st : f4, t4, t2, t3, t1
+	fstp	qword [edx+eax]
+
+; f[jj+d3] = t1 - t2 + t3;
+	fstp	st0		; st : t2, t3,  t1
+	fsubp	st1, st0	; st : (t3-t2), t1
+	faddp	st1, st0	; st : f3
+	fstp	qword [edx]
+
+; for (k=1; k<d1; k++)
+	xor	ecx, ecx	; ecx = k
+	mov	_jj, ecx
+.next_k:
+	inc	ecx
+	cmp	ecx, _d1
+	jge	.done_k
+; {
+	mov	eax, _d2	; the sector increment
+; l1 = j  + k;
+	mov	edx, ecx
+	mov	_l1, edx	; [ebx+edx*8] --> f[j+k]
+; l2 = l1 + d2;
+	add	edx, eax
+	mov	_l2, edx
+; l3 = l1 + d3;
+	add	edx, eax
+	mov	_l3, edx
+; l4 = l1 + d4;
+	add	edx, eax
+	mov	_l4, edx
+
+; l5 = j  + d2 - k;
+	mov	edx, eax
+	sub	edx, ecx
+	mov	_l5, edx
+; l6 = l5 + d2;
+	add	edx, eax
+	mov	_l6, edx
+; l7 = l5 + d3;
+	add	edx, eax
+	mov	_l7, edx
+; l8 = l5 + d4;
+	add	edx, eax
+	mov	_l8, edx
+
+
+; 340  :         j5 *= k;       // add-substituted multiplication
+	mov	eax, _jj
+	add	eax, _j5
+	mov	_jj, eax
+
+; c1 = C[jj];
+; s1 = S[jj];
+	mov	edi, [ebp+20]
+	fld	qword[edi+eax*8]
+	mov	esi, [ebp+8]
+	shl	esi, 2
+	add	esi, edi
+	fld	qword[esi+eax*8]	; st : s1, c1
+
+; t5 = f[l2] * c1 + f[l6] * s1;
+; t8 = f[l6] * c1 - f[l2] * s1;
+	mov	edx, _l6
+	fld	qword[ebx+edx*8]
+	mov	edx, _l2
+	fld	st0
+	fmul	st0, st2
+	fxch	st1
+	fmul	st0, st3
+	fld	qword[ebx+edx*8]	; st : f[l2], f[l6]*c, f[l6]*s, s, c
+	fmul	st4, st0
+	fmulp	st3, st0		; st : f[l6]*c, f[l6]*s, f[l2]*s, f[l2]*c
+	fsub	st0, st2		; st :   t8,    f[l6]*s, f[l2]*s, f[l2]*c
+	fstp	_t8
+	faddp	st2, st0		; st :  f[l2]*s, t5
+	fstp	st0			; st :  t5
+	fstp	_t5			; st :  <empty>
+
+; c2 = C[2*jj];
+; s2 = S[2*jj];
+	shl	eax, 1
+	fld	qword[edi+eax*8]
+	fld	qword[esi+eax*8]	; st : s2, c2
+
+; t6 = f[l3] * c2 + f[l7] * s2;
+; t9 = f[l7] * c2 - f[l3] * s2;
+	mov	edx, _l7
+	fld	qword[ebx+edx*8]
+	mov	edx, _l3
+	fld	st0
+	fmul	st0, st2
+	fxch	st1
+	fmul	st0, st3
+	fld	qword[ebx+edx*8]	; st : f[l3], f[l7]*c, f[l7]*s, s, c
+	fmul	st4, st0
+	fmulp	st3, st0		; st : f[l7]*c, f[l7]*s, f[l3]*s, f[l3]*c
+	fsub	st0, st2		; st :   t9,    f[l7]*s, f[l3]*s, f[l3]*c
+	fstp	_t9
+	faddp	st2, st0		; st :  f[l2]*s, t6
+	fstp	st0			; st :  t6
+	fstp	_t6			; st :  <empty>
+
+; c3 = C[3*jj];
+; s3 = S[3*jj];
+	add	eax, _jj
+	fld	qword[edi+eax*8]
+	fld	qword[esi+eax*8]	; st : s3, c3
+
+; t7 = f[l4] * c3 + f[l8] * s3;
+; t0 = f[l8] * c3 - f[l4] * s3;
+	mov	edx, _l8
+	fld	qword[ebx+edx*8]
+	mov	edx, _l4
+	fld	st0
+	fmul	st0, st2
+	fxch	st1
+	fmul	st0, st3
+	fld	qword[ebx+edx*8]	; st : f[l4], f[l8]*c, f[l8]*s, s, c
+	fmul	st4, st0
+	fmulp	st3, st0		; st : f[l8]*c, f[l8]*s, f[l4]*s, f[l4]*c
+	fsub	st0, st2		; st :   t9,    f[l8]*s, f[l4]*s, f[l4]*c
+	fstp	_t0
+	faddp	st2, st0		; st : f[l2]*s, t7
+	fstp	st0			; st :  t7
+	fstp	_t7			; st :  <empty>
+
+; t1 = f[l5] - t9;
+; t2 = f[l5] + t9;
+	mov	eax, _l5
+	fld	qword [ebx+eax*8]
+	fld	_t9
+	fld	st0
+	fadd	st0, st2
+	fstp	_t2
+	fsubp	st1, st0
+	fstp	_t1
+
+; t3 = - t8  - t0;
+	fld	_t8
+	fadd	_t0
+	fchs
+	fstp	_t3
+; t4 =   t5  - t7;
+	fld	_t5
+	fsub	_t7
+	fstp	_t4
+
+; f[l5] = t1 + t4;
+	fld	_t1
+	fld	_t4
+	fld	st0
+	fadd	st0, st2
+	fstp	qword [ebx+eax*8]
+; f[l7] = t1 - t4;
+	mov	eax, _l7
+	fsubp	st1, st0
+	fstp	qword [ebx+eax*8]
+
+; f[l6] = t2 + t3;
+	mov	eax, _l6
+	fld	_t2
+	fld	_t3
+	fld	st0
+	fadd	st0, st2
+	fstp	qword [ebx+eax*8]
+; f[l8] = t2 - t3;
+	mov	eax, _l8
+	fsubp	st1, st0
+	fstp	qword [ebx+eax*8]
+
+; t1 = f[l1] + t6;
+	mov	eax, _l1
+	fld	qword [ebx+eax*8]
+	fld	_t6
+	fld	st0
+	fadd	st0, st2
+	fstp	_t1
+; t2 = f[l1] - t6;
+	fsubp	st1, st0
+	fstp	_t2
+
+; t3 =    t8 - t0;
+	fld	_t8
+	fsub	_t0
+	fstp	_t3
+; t4 =    t5 + t7;
+	fld	_t5
+	fadd	_t7
+	fstp	_t4
+
+; f[l1] = t1 + t4;
+	mov	eax, _l1
+	fld	_t1
+	fld	_t4
+      fld     st0
+	fadd	st0, st2
+	fstp	qword [ebx+eax*8]
+; f[l3] = t1 - t4;
+	mov	eax, _l3
+	fsubp	st1, st0
+	fstp	qword [ebx+eax*8]
+
+; f[l2] = t2 + t3;
+	mov	eax, _l2
+	fld	_t2
+	fld	_t3
+	fld	st0
+	fadd	st0, st2
+	fstp	qword [ebx+eax*8]
+; f[l4] = t2 - t3;
+	mov	eax, _l4
+	fsubp	st1, st0
+	fstp	qword [ebx+eax*8]
+
+; 374  :       }
+	jmp	.next_k
+
+.done_k:
+; 375  :     }
+	add	ebx, _d6	; d6 = d5*8
+	cmp	ebx, _end_of_array
+	jb	.next_j
+
+; 376  :   }
+	mov	cx, _step
+	jmp	.newstep
+.done:
+	mov	esp, ebp
+	pop	ebp
+; 377  : }
+	ret
+
+
+		;=========== Step3 ends here ===========
+
+
+; =================================================================
+
+;=================================================================
+; parameters:
+; -- [ebp+8]   = N
+; -- [ebp+12]  = p
+; -- [ebp+16]  = 4k-aligned data array  address
+; -- [ebp+20]  = 4k-aligned SinCosTable address
+; returns:
+; -- nothing
+; destroys:
+; -- all GPRegs
+;; ==========================
+
+align 4
+
+FHT_4:
+
+	push	ebp
+	mov	ebp, esp
+	mov	edx, [ebp+16]
+	add	edx, [ebp+12]
+	call BitInvert
+	push	dword[ebp+16]
+	push	dword[ebp+8]
+	call	step1
+	call	step2
+	pop	edx		; N
+	pop	ecx		; a
+	push	dword[ebp+20]	; t
+	push	ecx
+	push	dword[ebp+12]	; p
+	push	edx		; N
+	call	step3
+	mov	esp, ebp
+	pop	ebp
+
+ret
--- a/programs/other/rtfread/trunk/rtfread.asm
+++ b/programs/other/rtfread/trunk/rtfread.asm
@ -69,7 +69,7 @@ include '../../../develop/libraries/box_lib/load_lib.mac'

@use_library

-;include 'debug.inc'
+;include '../../debug.inc'

 if ~ RENDER eq PIX
  TOP=TOP+4