kolibrios-fun/programs/other/fft/fht4code.asm

;           Fast Hartley Transform routine
;           Copyright (C) 1999, 2004, 2010
;          Artem Jerdev  artem@jerdev.co.uk
;
; free KolibriOS version - not to be ported to other OSes
; ==========================================================


; global constants
align 8
_r	dq	1.41421356237309504880169	; = sqrt(2)
_r2	dq	0.70710678118654752440084	; = sqrt(2)/2
_c1	dq	0.92387953251128675612818	; = cos(pi/8)
_s1	dq	0.38268343236508977172846	; = sin(pi/8)

;=================================================================
; parameter1:
; -- reg dl (bits[3:0])   = Power_of_4
; returns:
; -- reg edx = _CosTable address (4k-aligned)
; assumes:     _SinTable  = _CosTable + (N/2)*8
; user heap has to be initialized
; destroys:
; -- eax, ebx, ecx
;; ==========================
align 4
CreateSinCosTable:
	xor	eax,  eax
	inc	eax
	mov	cl,  dl
	and	cl,  15
	shl	eax, cl
	shl	eax, cl
	mov	ecx, eax		; now ecx = N
	shl	ecx, 3
	mov	ebx, 12
	mov	eax, 68
	int	0x40			; getmem(N*sizeof(double))

	mov	edx, eax		; edx = _CosTable
	shr	ecx, 1
	mov	ebx, eax
	add	ebx, ecx		; ebx = _SinTable
	shr	ecx, 3
	push	ecx			; [esp] = ecx = N/2

	xor	eax,  eax
	fldpi
	fidiv	dword[esp]		; st : dx = 2*pi/N
	pop	ecx
	fldz				; st : 0, dx
.loop:
	fld	st0			; st : x, x, dx
	FSINCOS 			; st : cos, sin, x, dx
	fstp	qword [edx+eax*8]	; st : sin, x, dx
	fstp	qword [ebx+eax*8]	; st : x, dx
	fadd	st0, st1		; st : x+dx, dx

	inc	eax
	cmp	eax, ecx
	jne	.loop
	fstp	st0			; st : dx
	fstp	st0			; st : <empty>
ret

;=================================================================
; parameter1:
; -- reg edx = _CosTable address
; destroys:
; -- eax, ebx, ecx
;; ==========================
align 4
DestroySinCosTable:
	mov	ecx, edx
	mov	ebx, 13
	mov	eax, 68
	int	0x40			; free(SinCosTable)
ret

;=================================================================
; parameter1:
; -- reg  dl (bits[3:0])   = Power_of_4
; -- reg edx && (-16) = 4k-aligned data array address
; returns:
; -- edx = Power_of_4
; -- ecx = N
; destroys:
; -- eax, ebx, ecx, edx, esi
;; ==========================
align 4
BitInvert:
	mov	esi, edx
	and	esi, 0xFFFFFFF0
	and	edx, 0x0F
	push	edx
	mov	cl, dl
	xor	eax, eax
	inc	eax
	shl	eax, cl
	shl	eax, cl
	push	eax
	xor	ecx, ecx		; index term
.newterm:
	inc	ecx
	cmp	ecx, [esp]		; N
	jge	.done

	xor	eax, eax
	mov	edx, ecx
	xor	bl, bl

.do_invert:
	inc	bl
	cmp	bl, byte[esp+4] ; Power_of_4
	jg	.switch

	mov	bh, dl
	and	bh,  3
	shl	eax, 2
	or	al, bh
	shr	edx, 2
	jmp	.do_invert

.switch:
	cmp	eax, ecx
	jle	.newterm

	fld	qword [esi+eax*8]
	fld	qword [esi+ecx*8]
	fstp	qword [esi+eax*8]
	fstp	qword [esi+ecx*8]
	jmp	.newterm

.done:
	pop	ecx
	pop	edx
	ret

;=================================================================


;=================================================================
; stdcall parameters:
; -- [esp+4]  = N
; -- [esp+8]  = 4k-aligned data array  address
; returns:
; -- nothing
; destroys:
; -- ebx, esi
;; ==========================
align 4
step1:
	mov	ebx, [esp+8]
	mov	esi, [esp+4]
	shl	esi, 3
	add	esi, ebx

.loop:
	fld	qword[ebx]
	fld	qword[ebx+8]
	fld	st1
	fsub	st0, st1	; st : t2, f[i+1], f[i]
	fxch	st1		; st : f[i+1], t2, f[i]
	faddp	st2, st0	; st : t2, t1
	fld	qword[ebx+16]
	fld	qword[ebx+24]
	fld	st1		; st : f[i+2], f[i+3], f[i+2], t2, t1
	fadd	st0, st1	; st : t3, f[i+3], f[i+2], t2, t1
	fxch	st2		; st : f[i+2], f[i+3], t3, t2, t1
	fsub	st0, st1	; st : t4, f[i+3], t3, t2, t1
	fstp	st1		; st : t4, t3, t2, t1
	fld	st2		; st : t2, t4, t3, t2, t1
	fadd	st0, st1	; st : t2+t4, t4, t3, t2, t1
	fstp	qword[ebx+16]	; st : t4, t3, t2, t1
	fsubp	st2, st0	; st : t3, t2-t4, t1
	fld	st2		; st : t1, t3, t2-t4, t1
	fadd	st0, st1	; st : t1+t3, t3, t2-t4, t1
	fstp	qword[ebx]	; st : t3, t2-t4, t1
	fsubp	st2, st0	; st : t2-t4, t1-t3
	fstp	qword[ebx+24]	; st : t1-t3
	fstp	qword[ebx+8]	; st : <empty>

	add	ebx, 32
	cmp	ebx, esi
	jnz	.loop
ret

;=================================================================
; SSE3 version:	Step1
;
;==========================

align 4
step1_sse:
	mov	ebx, [esp+8]
	mov	esi, [esp+4]
	shl	esi, 3
	add	esi, ebx

.loop:
	movddup     xmm0, [ebx]     ;   xmm0: f0 ; f0
	movddup     xmm1, [ebx+8]   ;   xmm1: f1 ; f1
	addsubpd    xmm0, xmm1      ;   xmm0: t1 ; t2   ( + - )
    movddup     xmm1, [ebx+16]  ;   xmm1: f2 ; f2
    movddup     xmm2, [ebx+24]  ;   xmm2: f3 ; f3
	addsubpd    xmm1, xmm2      ;   xmm1: t3 ; t4   ( + - )

    movddup     xmm2, xmm0      ;   xmm2: t2 ; t2
    movddup     xmm3, xmm1      ;   xmm3: t4 ; t4
	addsubpd    xmm2, xmm3      ;   xmm2: 2+4; 2-4  
    shufpd      xmm2, xmm2, 1   ;   xmm2: 2-4; 2+4
    movapd      [ebx+16], xmm2

    shufpd      xmm0, xmm0, 1   ;   xmm0: t2 ; t1
    shufpd      xmm1, xmm1, 1   ;   xmm1: t4 ; t3
    movddup     xmm2, xmm0      ;   xmm2: t1 ; t1
    movddup     xmm3, xmm1      ;   xmm3: t3 ; t3
	addsubpd    xmm2, xmm3      ;   xmm2: 1+3; 1-3  
    shufpd      xmm2, xmm2, 1   ;   xmm2: 1-3; 1+3
    movapd      [ebx], xmm2

	add	ebx, 32
	cmp	ebx, esi
	jnz	.loop
ret

;       local stack definitions
;===========================================================================
_t0	equ	dword [esp]
_t1	equ	dword[esp+4]
_t2	equ	dword[esp+8]
_t3	equ	dword[esp+12]
_t4	equ	dword[esp+16]
_t5	equ	dword[esp+20]
_t6	equ	dword[esp+24]
_t7	equ	dword[esp+28]
_t8	equ	dword[esp+32]
_t9	equ	dword[esp+36]

_l1   equ	dword[esp+40]
_l2   equ	dword[esp+44]
_l3   equ	dword[esp+48]
_l4   equ	dword[esp+52]
_l5   equ	dword[esp+56]
_l6   equ	dword[esp+60]
_l7   equ	dword[esp+64]
_l8   equ	dword[esp+68]
_l9   equ	dword[esp+72]
_l0   equ	dword[esp+76]
_d1   equ	dword[esp+80]
_d2   equ	dword[esp+84]
_d3   equ	dword[esp+88]
_d4   equ	dword[esp+92]
_d5   equ	dword[esp+96]
_d6   equ	dword[esp+100]
_j5   equ	dword[esp+104]
_jj   equ	dword[esp+108]
_end_of_array	equ	dword[esp+112]
_step		equ	word [esp+116]


;=================================================================
; cdecl parameters:
; -- [ebp+8]   = N
; -- [ebp+12]  = 4k-aligned data array  address
; returns:
; -- nothing
; destroys:
; -- eax, ebx
; locals:
; -- 10 stack-located dwords (_t0 ... _t9)
;; ==========================
align 4
step2:
	push	ebp
	mov	ebp, esp
	sub	esp, 40
	mov	ebx, [ebp+12]
	mov	eax, [ebp+ 8]
	shl	eax, 3
	add	eax, ebx

.loop_i:

; -- quad subelements  +0, +4, +8 and +12 (simpliest operations)
	fld	qword[ebx]
	fld	qword[ebx+8*4]
	fld	st0
	fadd	st0, st2	; st : t1, f_4, f_0
	fxch	st1
	fsubp	st2, st0	; st : t1, t2
	fld	qword[ebx+8*8]
	fld	qword[ebx+8*12]
	fld	st0
	fadd	st0, st2	; st : t3, f_12, t1, t2
	fxch	st1
	fsubp	st2, st0	; st : t3, t4, t1, t2
	; ------
	fld	st2		; st : t1, t3, t4, t1, t2
	fadd	st0, st1
	fstp	qword[ebx]	; st : t3, t4, t1, t2
	fsub	st0, st2	; st : t3-t1, t4, t1, t2
	fchs			; st : t1-t3, t4, t1, t2
	fstp	qword[ebx+8*4]	; st : t4, t1, t2
	fst	st1		; st : t4, t4, t2
	fadd	st0, st2	; st : t2+t4, t4, t2
	fstp	qword[ebx+8*8]	; st : t4, t2
	fsubp	st1, st0	; st : t2-t4
	fstp	qword[ebx+8*12] ; st : <empty>

; -- even subelements  +2, +6, +10 and +14 (2 multiplications needed)
	fld	qword[ebx+8*2]
	fld	qword[ebx+8*6]
	fld	[_r]
	fmul	st1, st0	; st : r, t2, t1
	fld	qword[ebx+8*10]
	fxch	st1		; st : r, t3, t2, t1
	fmul	qword[ebx+8*14] ; st : t4, t3, t2, t1
	; ------
	fld	st3		; st : t1, t4, t3, t2, t1
	fadd	st0, st3	;
	fadd	st0, st2	;
	fst	qword[ebx+8*2]	; store f[i+8] = t1+t2+t3
	fsub	st0, st3	;
	fsub	st0, st3	;
	fstp	qword[ebx+8*10] ; store f[i+10]= t1-t2+t3
	fld	st3		; st : t1, t4, t3, t2, t1
	fsub	st0, st2	;
	fsub	st0, st1	;
	fst	qword[ebx+8*14] ; store f[i+14]= t1-t3-t4
	fadd	st0, st1	;
	faddp	st1, st0	; st : t1-t3+t4, t3, t2, t1
	fstp	qword[ebx+8*6]	; store f[i+6]
	fstp	st0		; st : t2, t1
	fstp	st0		; st : t1
	fstp	st0		; st : <empty>

; -- odd subelements
	fld	qword[ebx+8*9]
	fld	qword[ebx+8*11]
	fld	st1
	fsub	st0, st1
	fxch	st1
	faddp	st2, st0	; st : (f[l3]-f[l7]), (f[l3]+f[l7])
	fld	[_r2]
	fmul	st2, st0
	fmulp	st1, st0	; st : t9, t6
	fld	qword[ebx+8*3]
	fld	st0
	fadd	st0, st2	; st : t1, f[l5], t9, t6
	fstp	_t1
	fsub	st0, st1
	fstp	_t2
	fstp	_t9	; (t9 never used)
	fstp	_t6		; st : <empty>

	fld	[_c1]
	fld	[_s1]
	fld	qword[ebx+8*5]
	fld	qword[ebx+8*7]
	fld	st3		; st: c1, f[l6], f[l2], s1, c1
	fmul	st0, st2	; st: f_2*c, f_6, f_2, s, c
	fld	st1		; st: f_6, f_2*c, f_6, f_2, s, c
	fmul	st0, st4	; st: f_6*s, f_2*c, f_6, f_2, s, c
	faddp	st1, st0	; st: t5, f_6, f_2, s, c
	fstp	_t5		; st: f_6, f_2, s, c
	fld	st3		; st: c, f_6, f_2, s, c
	fmul	st0, st1
	fld	st3
	fmul	st0, st3	; st: f_2*s, f_6*c, f_6, f_2, s, c
	fsubp	st1, st0	; st: t8, f_6, f_2, s, c
	fstp	_t8		; st: f_6, f_2, s, c
	fstp	st0		; st: f_2, s, c
	fstp	st0		; st: s, c

	fld	qword[ebx+8*13]
	fld	qword[ebx+8*15]
	fld	st3		; st: c1, f[l8], f[l4], s1, c1
	fmul	st0, st1
	fld	st3
	fmul	st0, st3	; st: f_4*s, f_8*c, f_8, f_4, s, c
	faddp	st1, st0	; st: t7, f_8, f_4, s, c
	fld	_t5		; st: t5, t7, f_8, f_4, s, c
	fsub	st0, st1	; st: t4, t7, f_8, f_4, s, c
	fstp	_t4
	fstp	_t7		; st: f_8, f_4, s, c
	fld	st3		; st: c, f_8, f_4, s, c
	fmul	st0, st2
	fld	st3
	fmul	st0, st2	; st: f_8*s, f_4*c, f_8, f_4, s, c
	fsubp	st1, st0	; st:-t0, f_8, f_4, s, c
	fchs
	fld	_t8
	fchs			; st:-t8, t0, f_8, f_4, s, c
	fsub	st0, st1	; st: t3, t0, f_8, f_4, s, c
	fstp	_t3
	fstp	_t0		; st: f_8, f_4, s, c
	fstp	st0		; st: f_4, s, c
	fstp	st0		; st: s, c
	fstp	st0		; st: c
	fstp	st0		; st: <empty>

	fld	_t1
	fld	_t4
	fld	st1
	fsub	st0, st1
	fstp	qword[ebx+8*11] ; f[l7] = t1-t4
	faddp	st1, st0
	fstp	qword[ebx+8*3]	; f[l5] = t1+t4
	fld	_t2
	fld	_t3
	fld	st1
	fsub	st0, st1
	fstp	qword[ebx+8*15] ; f[l8]
	faddp	st1, st0
	fstp	qword[ebx+8*7]	; f[l6]

	fld	_t6
	fld	qword[ebx+8]
	fld	st1
        fld     st1
	faddp	st3, st0	
	fsubp	st1, st0	; st : t2, t1

	fld	_t8
	fsub	_t0
	fld	_t5
	fadd	_t7		; st : t4, t3, t2, t1

	fld	st3
	fsub	st0, st1
	fstp	qword[ebx+8*9]	; f[l3] = t1-t4
	fadd	st0, st3
	fstp	qword[ebx+8]      ; f[l1] = t1+t4
	fld	st1		; st : t2, t3, t2, t1
	fsub	st0, st1	; f[l4] = t2-t3
	fstp	qword[ebx+8*13] ; st : t3, t2, t1
	faddp	st1, st0	; st : t2+t3, t1
	fstp	qword[ebx+8*5]	; f[l2] = t2+t3
	fstp	st0		; st : <empty>

	add	ebx, 16*8
	cmp	ebx, eax
	jb	.loop_i

	mov	esp, ebp
	pop	ebp
ret


;=================================================================
; cdecl parameters:
; -- [ebp+8]   = N
; -- [ebp+12]  = p
; -- [ebp+16]  = 4k-aligned data array  address
; -- [ebp+20]  = 4k-aligned SinCosTable address
; returns:
; -- nothing
; destroys:
; -- all GPRegs
; locals:
; -- 120 stack-located dwords (_t0 ... _t9, _l0..._step)
;; ==========================
align 4
step3:
	push	ebp
	mov	ebp, esp
	sub	esp, 120
; 283  : {


; 293  :   for (l=3; l<=p; l++)
	mov	cx, 0x0200
.newstep:
	inc	ch
	cmp	ch, byte[ebp+12]
	jg	.done
	mov	_step, cx

; 294  :   {
; 295  :     d1 = 1 << (l + l - 3);

	mov	cl, ch
	add	cl, cl
	sub	cl, 3
	mov	edx, 1
	shl	edx, cl
	mov	_d1, edx

; 296  :     d2 = d1 << 1;
	shl	edx, 1
	mov	_d2, edx
	mov	eax, edx

; 297  :     d3 = d2 << 1;
	shl	edx, 1
	mov	_d3, edx

; 298  :     d4 = d2 + d3;
	add	eax, edx
	mov	_d4, eax

; 299  :     d5 = d3 << 1;
	shl	edx, 1
	mov	_d5, edx
	shl	edx, 3
	mov	_d6, edx	; d6 = d5*8 to simplify index operations

; 339  :         j5 = N / d5;   ; moved out of internal loop
	mov	cl, [ebp+12]
	sub	cl, ch
	add	cl, cl
	mov	edx, 1
	shl	edx, cl
	mov	_j5, edx

; 300  :
; 301  :     for (j=0; j<N; j+=d5)
	mov	ebx, [ebp+16]
	mov	esi, [ebp+8]
	shl	esi, 3
	add	esi, ebx
	mov	_end_of_array, esi

.next_j:

; {
; t1 = f[j] + f[j+d2];
	mov	eax, _d2
	fld	qword[ebx]
	fld	qword[ebx+eax*8]
	fld	st1
	fadd	st0, st1
	fstp	_t1

; t2 = f[j] - f[j+d2];
	fsubp	st1, st0
	fstp	_t2

; t3 = f[j+d3] + f[j+d4];
	mov	edi, _d3
	fld	qword[ebx+edi*8]
	mov	edx, _d4
	fld	qword[ebx+edx*8]
	fld	st1
	fsub	st0, st1		; st : t4, f4, f3
	fxch	st1			; st : f4, t4, f3

; t4 = f[j+d3] - f[j+d4];
	faddp	st2, st0		; st : t4, t3

; f[j+d4] = t2 - t4;
; f[j+d3] = t2 + t4;
	fld	_t2
	fld	st0
	fsub	st0, st2		; st : f4, t2, t4, t3
	fstp	qword[ebx+edx*8]	; st : t2, t4, t3
	fadd	st0, st1		; st : f3, t4, t3
	fstp	qword[ebx+edi*8]	; st : t4, t3

; f[j+d2] = t1 - t3;
; f[j]    = t1 + t3;
	fld	_t1
	fst	st1
	fsub	st0, st2		; st : f2, t1, t3
	fstp	qword[ebx+eax*8]	; st : t1, t3
	fadd	st0, st1		; st : f0, t3
	fstp	qword[ebx]		; st : t3
	fstp	st0

; jj = j + d1;     / ??
	mov	edi, _d1
	shl	edi, 3		; = d1*8
	mov	edx, edi
	mov	eax, edi
	add	eax, eax	; eax = d2*8
	shl	edx, 2		; = d3*8
	add	edi, ebx	; now [edi] points to f[jj]
	add	edx, edi	; and [edx] points to f[jj+d3]

; t1 = f[jj];
	fld	qword [edi]	; st : t1
; t3 = f[jj+d3];
	fld	qword [edx]	; st : t3, t1

; t2 = f[jj+d2] * r;
	fld	qword [edi+eax]
	fld	[_r]
	fmul	st1, st0	; st : r,  t2, t3, t1
; t4 = f[jj+d4] * r
	fmul	qword [edx+eax] ; st : t4, t2, t3, t1

; f[jj]    = t1 + t2 + t3;
	fld	st3		; st : t1, t4, t2, t3, t1
	fadd	st0, st3
	fadd	st0, st2
	fstp	qword [edi]

; f[jj+d2] = t1 - t3 + t4;
	fld	st3
	fsub	st0, st3	; st : (t1-t3), t4, t2, t3, t1
	fld	st0
	fadd	st0, st2	; st : f2, (t1-t3), t4, t2, t3, t1
	fstp	qword [edi+eax]
; f[jj+d4] = t1 - t3 - t4;
	fsub	st0, st1	; st : f4, t4, t2, t3, t1
	fstp	qword [edx+eax]

; f[jj+d3] = t1 - t2 + t3;
	fstp	st0		; st : t2, t3,  t1
	fsubp	st1, st0	; st : (t3-t2), t1
	faddp	st1, st0	; st : f3
	fstp	qword [edx]

; for (k=1; k<d1; k++)
	xor	ecx, ecx	; ecx = k
	mov	_jj, ecx
.next_k:
	inc	ecx
	cmp	ecx, _d1
	jge	.done_k
; {
	mov	eax, _d2	; the sector increment
; l1 = j  + k;
	mov	edx, ecx
	mov	_l1, edx	; [ebx+edx*8] --> f[j+k]
; l2 = l1 + d2;
	add	edx, eax
	mov	_l2, edx
; l3 = l1 + d3;
	add	edx, eax
	mov	_l3, edx
; l4 = l1 + d4;
	add	edx, eax
	mov	_l4, edx

; l5 = j  + d2 - k;
	mov	edx, eax
	sub	edx, ecx
	mov	_l5, edx
; l6 = l5 + d2;
	add	edx, eax
	mov	_l6, edx
; l7 = l5 + d3;
	add	edx, eax
	mov	_l7, edx
; l8 = l5 + d4;
	add	edx, eax
	mov	_l8, edx


; 340  :         j5 *= k;       // add-substituted multiplication
	mov	eax, _jj
	add	eax, _j5
	mov	_jj, eax

; c1 = C[jj];
; s1 = S[jj];
	mov	edi, [ebp+20]
	fld	qword[edi+eax*8]
	mov	esi, [ebp+8]
	shl	esi, 2
	add	esi, edi
	fld	qword[esi+eax*8]	; st : s1, c1

; t5 = f[l2] * c1 + f[l6] * s1;
; t8 = f[l6] * c1 - f[l2] * s1;
	mov	edx, _l6
	fld	qword[ebx+edx*8]
	mov	edx, _l2
	fld	st0
	fmul	st0, st2
	fxch	st1
	fmul	st0, st3
	fld	qword[ebx+edx*8]	; st : f[l2], f[l6]*c, f[l6]*s, s, c
	fmul	st4, st0
	fmulp	st3, st0		; st : f[l6]*c, f[l6]*s, f[l2]*s, f[l2]*c
	fsub	st0, st2		; st :   t8,    f[l6]*s, f[l2]*s, f[l2]*c
	fstp	_t8
	faddp	st2, st0		; st :  f[l2]*s, t5
	fstp	st0			; st :  t5
	fstp	_t5			; st :  <empty>

; c2 = C[2*jj];
; s2 = S[2*jj];
	shl	eax, 1
	fld	qword[edi+eax*8]
	fld	qword[esi+eax*8]	; st : s2, c2

; t6 = f[l3] * c2 + f[l7] * s2;
; t9 = f[l7] * c2 - f[l3] * s2;
	mov	edx, _l7
	fld	qword[ebx+edx*8]
	mov	edx, _l3
	fld	st0
	fmul	st0, st2
	fxch	st1
	fmul	st0, st3
	fld	qword[ebx+edx*8]	; st : f[l3], f[l7]*c, f[l7]*s, s, c
	fmul	st4, st0
	fmulp	st3, st0		; st : f[l7]*c, f[l7]*s, f[l3]*s, f[l3]*c
	fsub	st0, st2		; st :   t9,    f[l7]*s, f[l3]*s, f[l3]*c
	fstp	_t9
	faddp	st2, st0		; st :  f[l2]*s, t6
	fstp	st0			; st :  t6
	fstp	_t6			; st :  <empty>

; c3 = C[3*jj];
; s3 = S[3*jj];
	add	eax, _jj
	fld	qword[edi+eax*8]
	fld	qword[esi+eax*8]	; st : s3, c3

; t7 = f[l4] * c3 + f[l8] * s3;
; t0 = f[l8] * c3 - f[l4] * s3;
	mov	edx, _l8
	fld	qword[ebx+edx*8]
	mov	edx, _l4
	fld	st0
	fmul	st0, st2
	fxch	st1
	fmul	st0, st3
	fld	qword[ebx+edx*8]	; st : f[l4], f[l8]*c, f[l8]*s, s, c
	fmul	st4, st0
	fmulp	st3, st0		; st : f[l8]*c, f[l8]*s, f[l4]*s, f[l4]*c
	fsub	st0, st2		; st :   t9,    f[l8]*s, f[l4]*s, f[l4]*c
	fstp	_t0
	faddp	st2, st0		; st : f[l2]*s, t7
	fstp	st0			; st :  t7
	fstp	_t7			; st :  <empty>

; t1 = f[l5] - t9;
; t2 = f[l5] + t9;
	mov	eax, _l5
	fld	qword [ebx+eax*8]
	fld	_t9
	fld	st0
	fadd	st0, st2
	fstp	_t2
	fsubp	st1, st0
	fstp	_t1

; t3 = - t8  - t0;
	fld	_t8
	fadd	_t0
	fchs
	fstp	_t3
; t4 =   t5  - t7;
	fld	_t5
	fsub	_t7
	fstp	_t4

; f[l5] = t1 + t4;
	fld	_t1
	fld	_t4
	fld	st0
	fadd	st0, st2
	fstp	qword [ebx+eax*8]
; f[l7] = t1 - t4;
	mov	eax, _l7
	fsubp	st1, st0
	fstp	qword [ebx+eax*8]

; f[l6] = t2 + t3;
	mov	eax, _l6
	fld	_t2
	fld	_t3
	fld	st0
	fadd	st0, st2
	fstp	qword [ebx+eax*8]
; f[l8] = t2 - t3;
	mov	eax, _l8
	fsubp	st1, st0
	fstp	qword [ebx+eax*8]

; t1 = f[l1] + t6;
	mov	eax, _l1
	fld	qword [ebx+eax*8]
	fld	_t6
	fld	st0
	fadd	st0, st2
	fstp	_t1
; t2 = f[l1] - t6;
	fsubp	st1, st0
	fstp	_t2

; t3 =    t8 - t0;
	fld	_t8
	fsub	_t0
	fstp	_t3
; t4 =    t5 + t7;
	fld	_t5
	fadd	_t7
	fstp	_t4

; f[l1] = t1 + t4;
	mov	eax, _l1
	fld	_t1
	fld	_t4
      fld     st0
	fadd	st0, st2
	fstp	qword [ebx+eax*8]
; f[l3] = t1 - t4;
	mov	eax, _l3
	fsubp	st1, st0
	fstp	qword [ebx+eax*8]

; f[l2] = t2 + t3;
	mov	eax, _l2
	fld	_t2
	fld	_t3
	fld	st0
	fadd	st0, st2
	fstp	qword [ebx+eax*8]
; f[l4] = t2 - t3;
	mov	eax, _l4
	fsubp	st1, st0
	fstp	qword [ebx+eax*8]

; 374  :       }
	jmp	.next_k

.done_k:
; 375  :     }
	add	ebx, _d6	; d6 = d5*8
	cmp	ebx, _end_of_array
	jb	.next_j

; 376  :   }
	mov	cx, _step
	jmp	.newstep
.done:
	mov	esp, ebp
	pop	ebp
; 377  : }
	ret


		;=========== Step3 ends here ===========


; =================================================================

;=================================================================
; parameters:
; -- [ebp+8]   = N
; -- [ebp+12]  = p
; -- [ebp+16]  = 4k-aligned data array  address
; -- [ebp+20]  = 4k-aligned SinCosTable address
; returns:
; -- nothing
; destroys:
; -- all GPRegs
;; ==========================

align 4

FHT_4:

	push	ebp
	mov	ebp, esp
	mov	edx, [ebp+16]
	add	edx, [ebp+12]
	call BitInvert
	push	dword[ebp+16]
	push	dword[ebp+8]
	call	step1
	call	step2
	pop	edx		; N
	pop	ecx		; a
	push	dword[ebp+20]	; t
	push	ecx
	push	dword[ebp+12]	; p
	push	edx		; N
	call	step3
	mov	esp, ebp
	pop	ebp

ret
fht4code: a very old bug fixed in Step2 git-svn-id: svn://kolibrios.org@3474 a494cfbc-eb01-0410-851d-a64ba20cac60 2013-04-17 01:52:35 +02:00			`; Fast Hartley Transform routine`
			`; Copyright (C) 1999, 2004, 2010`
			`; Artem Jerdev artem@jerdev.co.uk`
			`;`
			`; free KolibriOS version - not to be ported to other OSes`
			`; ==========================================================`


			`; global constants`
			`align 8`
			`_r dq 1.41421356237309504880169 ; = sqrt(2)`
			`_r2 dq 0.70710678118654752440084 ; = sqrt(2)/2`
			`_c1 dq 0.92387953251128675612818 ; = cos(pi/8)`
			`_s1 dq 0.38268343236508977172846 ; = sin(pi/8)`

			`;=================================================================`
			`; parameter1:`
			`; -- reg dl (bits[3:0]) = Power_of_4`
			`; returns:`
			`; -- reg edx = _CosTable address (4k-aligned)`
			`; assumes: _SinTable = _CosTable + (N/2)*8`
			`; user heap has to be initialized`
			`; destroys:`
			`; -- eax, ebx, ecx`
			`;; ==========================`
			`align 4`
			`CreateSinCosTable:`
			`xor eax, eax`
			`inc eax`
			`mov cl, dl`
			`and cl, 15`
			`shl eax, cl`
			`shl eax, cl`
			`mov ecx, eax ; now ecx = N`
			`shl ecx, 3`
			`mov ebx, 12`
			`mov eax, 68`
			`int 0x40 ; getmem(N*sizeof(double))`

			`mov edx, eax ; edx = _CosTable`
			`shr ecx, 1`
			`mov ebx, eax`
			`add ebx, ecx ; ebx = _SinTable`
			`shr ecx, 3`
			`push ecx ; [esp] = ecx = N/2`

			`xor eax, eax`
			`fldpi`
			`fidiv dword[esp] ; st : dx = 2*pi/N`
			`pop ecx`
			`fldz ; st : 0, dx`
			`.loop:`
			`fld st0 ; st : x, x, dx`
			`FSINCOS ; st : cos, sin, x, dx`
			`fstp qword [edx+eax*8] ; st : sin, x, dx`
			`fstp qword [ebx+eax*8] ; st : x, dx`
			`fadd st0, st1 ; st : x+dx, dx`

			`inc eax`
			`cmp eax, ecx`
			`jne .loop`
			`fstp st0 ; st : dx`
			`fstp st0 ; st : <empty>`
			`ret`

			`;=================================================================`
			`; parameter1:`
			`; -- reg edx = _CosTable address`
			`; destroys:`
			`; -- eax, ebx, ecx`
			`;; ==========================`
			`align 4`
			`DestroySinCosTable:`
			`mov ecx, edx`
			`mov ebx, 13`
			`mov eax, 68`
			`int 0x40 ; free(SinCosTable)`
			`ret`

			`;=================================================================`
			`; parameter1:`
			`; -- reg dl (bits[3:0]) = Power_of_4`
			`; -- reg edx && (-16) = 4k-aligned data array address`
			`; returns:`
			`; -- edx = Power_of_4`
			`; -- ecx = N`
			`; destroys:`
			`; -- eax, ebx, ecx, edx, esi`
			`;; ==========================`
			`align 4`
			`BitInvert:`
			`mov esi, edx`
			`and esi, 0xFFFFFFF0`
			`and edx, 0x0F`
			`push edx`
			`mov cl, dl`
			`xor eax, eax`
			`inc eax`
			`shl eax, cl`
			`shl eax, cl`
			`push eax`
			`xor ecx, ecx ; index term`
			`.newterm:`
			`inc ecx`
			`cmp ecx, [esp] ; N`
			`jge .done`

			`xor eax, eax`
			`mov edx, ecx`
			`xor bl, bl`

			`.do_invert:`
			`inc bl`
			`cmp bl, byte[esp+4] ; Power_of_4`
			`jg .switch`

			`mov bh, dl`
			`and bh, 3`
			`shl eax, 2`
			`or al, bh`
			`shr edx, 2`
			`jmp .do_invert`

			`.switch:`
			`cmp eax, ecx`
			`jle .newterm`

			`fld qword [esi+eax*8]`
			`fld qword [esi+ecx*8]`
			`fstp qword [esi+eax*8]`
			`fstp qword [esi+ecx*8]`
			`jmp .newterm`

			`.done:`
			`pop ecx`
			`pop edx`
			`ret`

			`;=================================================================`


			`;=================================================================`
			`; stdcall parameters:`
			`; -- [esp+4] = N`
			`; -- [esp+8] = 4k-aligned data array address`
			`; returns:`
			`; -- nothing`
			`; destroys:`
			`; -- ebx, esi`
			`;; ==========================`
			`align 4`
			`step1:`
			`mov ebx, [esp+8]`
			`mov esi, [esp+4]`
			`shl esi, 3`
			`add esi, ebx`

			`.loop:`
			`fld qword[ebx]`
			`fld qword[ebx+8]`
			`fld st1`
			`fsub st0, st1 ; st : t2, f[i+1], f[i]`
			`fxch st1 ; st : f[i+1], t2, f[i]`
			`faddp st2, st0 ; st : t2, t1`
			`fld qword[ebx+16]`
			`fld qword[ebx+24]`
			`fld st1 ; st : f[i+2], f[i+3], f[i+2], t2, t1`
			`fadd st0, st1 ; st : t3, f[i+3], f[i+2], t2, t1`
			`fxch st2 ; st : f[i+2], f[i+3], t3, t2, t1`
			`fsub st0, st1 ; st : t4, f[i+3], t3, t2, t1`
			`fstp st1 ; st : t4, t3, t2, t1`
			`fld st2 ; st : t2, t4, t3, t2, t1`
			`fadd st0, st1 ; st : t2+t4, t4, t3, t2, t1`
			`fstp qword[ebx+16] ; st : t4, t3, t2, t1`
			`fsubp st2, st0 ; st : t3, t2-t4, t1`
			`fld st2 ; st : t1, t3, t2-t4, t1`
			`fadd st0, st1 ; st : t1+t3, t3, t2-t4, t1`
			`fstp qword[ebx] ; st : t3, t2-t4, t1`
			`fsubp st2, st0 ; st : t2-t4, t1-t3`
			`fstp qword[ebx+24] ; st : t1-t3`
			`fstp qword[ebx+8] ; st : <empty>`

			`add ebx, 32`
			`cmp ebx, esi`
			`jnz .loop`
FHT: SSE_patch_1 git-svn-id: svn://kolibrios.org@2215 a494cfbc-eb01-0410-851d-a64ba20cac60 2011-09-20 17:34:33 +02:00			`ret`
fht4code: a very old bug fixed in Step2 git-svn-id: svn://kolibrios.org@3474 a494cfbc-eb01-0410-851d-a64ba20cac60 2013-04-17 01:52:35 +02:00
FHT: SSE_patch_1 git-svn-id: svn://kolibrios.org@2215 a494cfbc-eb01-0410-851d-a64ba20cac60 2011-09-20 17:34:33 +02:00			`;=================================================================`
			`; SSE3 version: Step1`
			`;`
			`;==========================`

			`align 4`
			`step1_sse:`
			`mov ebx, [esp+8]`
			`mov esi, [esp+4]`
			`shl esi, 3`
			`add esi, ebx`

			`.loop:`
			`movddup xmm0, [ebx] ; xmm0: f0 ; f0`
			`movddup xmm1, [ebx+8] ; xmm1: f1 ; f1`
			`addsubpd xmm0, xmm1 ; xmm0: t1 ; t2 ( + - )`
			`movddup xmm1, [ebx+16] ; xmm1: f2 ; f2`
			`movddup xmm2, [ebx+24] ; xmm2: f3 ; f3`
			`addsubpd xmm1, xmm2 ; xmm1: t3 ; t4 ( + - )`

			`movddup xmm2, xmm0 ; xmm2: t2 ; t2`
			`movddup xmm3, xmm1 ; xmm3: t4 ; t4`
			`addsubpd xmm2, xmm3 ; xmm2: 2+4; 2-4`
			`shufpd xmm2, xmm2, 1 ; xmm2: 2-4; 2+4`
			`movapd [ebx+16], xmm2`

			`shufpd xmm0, xmm0, 1 ; xmm0: t2 ; t1`
			`shufpd xmm1, xmm1, 1 ; xmm1: t4 ; t3`
			`movddup xmm2, xmm0 ; xmm2: t1 ; t1`
			`movddup xmm3, xmm1 ; xmm3: t3 ; t3`
			`addsubpd xmm2, xmm3 ; xmm2: 1+3; 1-3`
			`shufpd xmm2, xmm2, 1 ; xmm2: 1-3; 1+3`
			`movapd [ebx], xmm2`

			`add ebx, 32`
			`cmp ebx, esi`
			`jnz .loop`
			`ret`
fht4code: a very old bug fixed in Step2 git-svn-id: svn://kolibrios.org@3474 a494cfbc-eb01-0410-851d-a64ba20cac60 2013-04-17 01:52:35 +02:00
			`; local stack definitions`
			`;===========================================================================`
			`_t0 equ dword [esp]`
			`_t1 equ dword[esp+4]`
			`_t2 equ dword[esp+8]`
			`_t3 equ dword[esp+12]`
			`_t4 equ dword[esp+16]`
			`_t5 equ dword[esp+20]`
			`_t6 equ dword[esp+24]`
			`_t7 equ dword[esp+28]`
			`_t8 equ dword[esp+32]`
			`_t9 equ dword[esp+36]`

			`_l1 equ dword[esp+40]`
			`_l2 equ dword[esp+44]`
			`_l3 equ dword[esp+48]`
			`_l4 equ dword[esp+52]`
			`_l5 equ dword[esp+56]`
			`_l6 equ dword[esp+60]`
			`_l7 equ dword[esp+64]`
			`_l8 equ dword[esp+68]`
			`_l9 equ dword[esp+72]`
			`_l0 equ dword[esp+76]`
			`_d1 equ dword[esp+80]`
			`_d2 equ dword[esp+84]`
			`_d3 equ dword[esp+88]`
			`_d4 equ dword[esp+92]`
			`_d5 equ dword[esp+96]`
			`_d6 equ dword[esp+100]`
			`_j5 equ dword[esp+104]`
			`_jj equ dword[esp+108]`
			`_end_of_array equ dword[esp+112]`
			`_step equ word [esp+116]`


			`;=================================================================`
			`; cdecl parameters:`
			`; -- [ebp+8] = N`
			`; -- [ebp+12] = 4k-aligned data array address`
			`; returns:`
			`; -- nothing`
			`; destroys:`
			`; -- eax, ebx`
			`; locals:`
			`; -- 10 stack-located dwords (_t0 ... _t9)`
			`;; ==========================`
			`align 4`
			`step2:`
			`push ebp`
			`mov ebp, esp`
			`sub esp, 40`
			`mov ebx, [ebp+12]`
			`mov eax, [ebp+ 8]`
			`shl eax, 3`
			`add eax, ebx`

			`.loop_i:`

			`; -- quad subelements +0, +4, +8 and +12 (simpliest operations)`
			`fld qword[ebx]`
			`fld qword[ebx+8*4]`
			`fld st0`
			`fadd st0, st2 ; st : t1, f_4, f_0`
			`fxch st1`
			`fsubp st2, st0 ; st : t1, t2`
			`fld qword[ebx+8*8]`
			`fld qword[ebx+8*12]`
			`fld st0`
			`fadd st0, st2 ; st : t3, f_12, t1, t2`
			`fxch st1`
			`fsubp st2, st0 ; st : t3, t4, t1, t2`
			`; ------`
			`fld st2 ; st : t1, t3, t4, t1, t2`
			`fadd st0, st1`
			`fstp qword[ebx] ; st : t3, t4, t1, t2`
			`fsub st0, st2 ; st : t3-t1, t4, t1, t2`
			`fchs ; st : t1-t3, t4, t1, t2`
			`fstp qword[ebx+8*4] ; st : t4, t1, t2`
			`fst st1 ; st : t4, t4, t2`
			`fadd st0, st2 ; st : t2+t4, t4, t2`
			`fstp qword[ebx+8*8] ; st : t4, t2`
			`fsubp st1, st0 ; st : t2-t4`
			`fstp qword[ebx+8*12] ; st : <empty>`

			`; -- even subelements +2, +6, +10 and +14 (2 multiplications needed)`
			`fld qword[ebx+8*2]`
			`fld qword[ebx+8*6]`
			`fld [_r]`
			`fmul st1, st0 ; st : r, t2, t1`
			`fld qword[ebx+8*10]`
			`fxch st1 ; st : r, t3, t2, t1`
			`fmul qword[ebx+8*14] ; st : t4, t3, t2, t1`
			`; ------`
			`fld st3 ; st : t1, t4, t3, t2, t1`
			`fadd st0, st3 ;`
			`fadd st0, st2 ;`
			`fst qword[ebx+8*2] ; store f[i+8] = t1+t2+t3`
			`fsub st0, st3 ;`
			`fsub st0, st3 ;`
			`fstp qword[ebx+8*10] ; store f[i+10]= t1-t2+t3`
			`fld st3 ; st : t1, t4, t3, t2, t1`
			`fsub st0, st2 ;`
			`fsub st0, st1 ;`
			`fst qword[ebx+8*14] ; store f[i+14]= t1-t3-t4`
			`fadd st0, st1 ;`
			`faddp st1, st0 ; st : t1-t3+t4, t3, t2, t1`
			`fstp qword[ebx+8*6] ; store f[i+6]`
			`fstp st0 ; st : t2, t1`
			`fstp st0 ; st : t1`
			`fstp st0 ; st : <empty>`

			`; -- odd subelements`
			`fld qword[ebx+8*9]`
			`fld qword[ebx+8*11]`
			`fld st1`
			`fsub st0, st1`
			`fxch st1`
			`faddp st2, st0 ; st : (f[l3]-f[l7]), (f[l3]+f[l7])`
			`fld [_r2]`
			`fmul st2, st0`
			`fmulp st1, st0 ; st : t9, t6`
			`fld qword[ebx+8*3]`
			`fld st0`
			`fadd st0, st2 ; st : t1, f[l5], t9, t6`
			`fstp _t1`
			`fsub st0, st1`
			`fstp _t2`
			`fstp _t9 ; (t9 never used)`
			`fstp _t6 ; st : <empty>`

			`fld [_c1]`
			`fld [_s1]`
			`fld qword[ebx+8*5]`
			`fld qword[ebx+8*7]`
			`fld st3 ; st: c1, f[l6], f[l2], s1, c1`
			`fmul st0, st2 ; st: f_2*c, f_6, f_2, s, c`
			`fld st1 ; st: f_6, f_2*c, f_6, f_2, s, c`
			`fmul st0, st4 ; st: f_6s, f_2c, f_6, f_2, s, c`
			`faddp st1, st0 ; st: t5, f_6, f_2, s, c`
			`fstp _t5 ; st: f_6, f_2, s, c`
			`fld st3 ; st: c, f_6, f_2, s, c`
			`fmul st0, st1`
			`fld st3`
			`fmul st0, st3 ; st: f_2s, f_6c, f_6, f_2, s, c`
			`fsubp st1, st0 ; st: t8, f_6, f_2, s, c`
			`fstp _t8 ; st: f_6, f_2, s, c`
			`fstp st0 ; st: f_2, s, c`
			`fstp st0 ; st: s, c`

			`fld qword[ebx+8*13]`
			`fld qword[ebx+8*15]`
			`fld st3 ; st: c1, f[l8], f[l4], s1, c1`
			`fmul st0, st1`
			`fld st3`
			`fmul st0, st3 ; st: f_4s, f_8c, f_8, f_4, s, c`
			`faddp st1, st0 ; st: t7, f_8, f_4, s, c`
			`fld _t5 ; st: t5, t7, f_8, f_4, s, c`
			`fsub st0, st1 ; st: t4, t7, f_8, f_4, s, c`
			`fstp _t4`
			`fstp _t7 ; st: f_8, f_4, s, c`
			`fld st3 ; st: c, f_8, f_4, s, c`
			`fmul st0, st2`
			`fld st3`
			`fmul st0, st2 ; st: f_8s, f_4c, f_8, f_4, s, c`
			`fsubp st1, st0 ; st:-t0, f_8, f_4, s, c`
			`fchs`
			`fld _t8`
			`fchs ; st:-t8, t0, f_8, f_4, s, c`
			`fsub st0, st1 ; st: t3, t0, f_8, f_4, s, c`
			`fstp _t3`
			`fstp _t0 ; st: f_8, f_4, s, c`
			`fstp st0 ; st: f_4, s, c`
			`fstp st0 ; st: s, c`
			`fstp st0 ; st: c`
			`fstp st0 ; st: <empty>`

			`fld _t1`
			`fld _t4`
			`fld st1`
			`fsub st0, st1`
			`fstp qword[ebx+8*11] ; f[l7] = t1-t4`
			`faddp st1, st0`
			`fstp qword[ebx+8*3] ; f[l5] = t1+t4`
			`fld _t2`
			`fld _t3`
			`fld st1`
			`fsub st0, st1`
			`fstp qword[ebx+8*15] ; f[l8]`
			`faddp st1, st0`
			`fstp qword[ebx+8*7] ; f[l6]`

			`fld _t6`
			`fld qword[ebx+8]`
			`fld st1`
			`fld st1`
			`faddp st3, st0`
			`fsubp st1, st0 ; st : t2, t1`

			`fld _t8`
			`fsub _t0`
			`fld _t5`
			`fadd _t7 ; st : t4, t3, t2, t1`

			`fld st3`
			`fsub st0, st1`
			`fstp qword[ebx+8*9] ; f[l3] = t1-t4`
			`fadd st0, st3`
			`fstp qword[ebx+8] ; f[l1] = t1+t4`
			`fld st1 ; st : t2, t3, t2, t1`
			`fsub st0, st1 ; f[l4] = t2-t3`
			`fstp qword[ebx+8*13] ; st : t3, t2, t1`
			`faddp st1, st0 ; st : t2+t3, t1`
			`fstp qword[ebx+8*5] ; f[l2] = t2+t3`
			`fstp st0 ; st : <empty>`

			`add ebx, 16*8`
			`cmp ebx, eax`
			`jb .loop_i`

			`mov esp, ebp`
			`pop ebp`
			`ret`




			`;=================================================================`
			`; cdecl parameters:`
			`; -- [ebp+8] = N`
			`; -- [ebp+12] = p`
			`; -- [ebp+16] = 4k-aligned data array address`
			`; -- [ebp+20] = 4k-aligned SinCosTable address`
			`; returns:`
			`; -- nothing`
			`; destroys:`
			`; -- all GPRegs`
			`; locals:`
			`; -- 120 stack-located dwords (_t0 ... _t9, _l0..._step)`
			`;; ==========================`
			`align 4`
			`step3:`
			`push ebp`
			`mov ebp, esp`
			`sub esp, 120`
			`; 283 : {`


			`; 293 : for (l=3; l<=p; l++)`
			`mov cx, 0x0200`
			`.newstep:`
			`inc ch`
			`cmp ch, byte[ebp+12]`
			`jg .done`
			`mov _step, cx`

			`; 294 : {`
			`; 295 : d1 = 1 << (l + l - 3);`

			`mov cl, ch`
			`add cl, cl`
			`sub cl, 3`
			`mov edx, 1`
			`shl edx, cl`
			`mov _d1, edx`

			`; 296 : d2 = d1 << 1;`
			`shl edx, 1`
			`mov _d2, edx`
			`mov eax, edx`

			`; 297 : d3 = d2 << 1;`
			`shl edx, 1`
			`mov _d3, edx`

			`; 298 : d4 = d2 + d3;`
			`add eax, edx`
			`mov _d4, eax`

			`; 299 : d5 = d3 << 1;`
			`shl edx, 1`
			`mov _d5, edx`
			`shl edx, 3`
			`mov _d6, edx ; d6 = d5*8 to simplify index operations`

			`; 339 : j5 = N / d5; ; moved out of internal loop`
			`mov cl, [ebp+12]`
			`sub cl, ch`
			`add cl, cl`
			`mov edx, 1`
			`shl edx, cl`
			`mov _j5, edx`

			`; 300 :`
			`; 301 : for (j=0; j<N; j+=d5)`
			`mov ebx, [ebp+16]`
			`mov esi, [ebp+8]`
			`shl esi, 3`
			`add esi, ebx`
			`mov _end_of_array, esi`

			`.next_j:`

			`; {`
			`; t1 = f[j] + f[j+d2];`
			`mov eax, _d2`
			`fld qword[ebx]`
			`fld qword[ebx+eax*8]`
			`fld st1`
			`fadd st0, st1`
			`fstp _t1`

			`; t2 = f[j] - f[j+d2];`
			`fsubp st1, st0`
			`fstp _t2`

			`; t3 = f[j+d3] + f[j+d4];`
			`mov edi, _d3`
			`fld qword[ebx+edi*8]`
			`mov edx, _d4`
			`fld qword[ebx+edx*8]`
			`fld st1`
			`fsub st0, st1 ; st : t4, f4, f3`
			`fxch st1 ; st : f4, t4, f3`

			`; t4 = f[j+d3] - f[j+d4];`
			`faddp st2, st0 ; st : t4, t3`

			`; f[j+d4] = t2 - t4;`
			`; f[j+d3] = t2 + t4;`
			`fld _t2`
			`fld st0`
			`fsub st0, st2 ; st : f4, t2, t4, t3`
			`fstp qword[ebx+edx*8] ; st : t2, t4, t3`
			`fadd st0, st1 ; st : f3, t4, t3`
			`fstp qword[ebx+edi*8] ; st : t4, t3`

			`; f[j+d2] = t1 - t3;`
			`; f[j] = t1 + t3;`
			`fld _t1`
			`fst st1`
			`fsub st0, st2 ; st : f2, t1, t3`
			`fstp qword[ebx+eax*8] ; st : t1, t3`
			`fadd st0, st1 ; st : f0, t3`
			`fstp qword[ebx] ; st : t3`
			`fstp st0`

			`; jj = j + d1; / ??`
			`mov edi, _d1`
			`shl edi, 3 ; = d1*8`
			`mov edx, edi`
			`mov eax, edi`
			`add eax, eax ; eax = d2*8`
			`shl edx, 2 ; = d3*8`
			`add edi, ebx ; now [edi] points to f[jj]`
			`add edx, edi ; and [edx] points to f[jj+d3]`

			`; t1 = f[jj];`
			`fld qword [edi] ; st : t1`
			`; t3 = f[jj+d3];`
			`fld qword [edx] ; st : t3, t1`

			`; t2 = f[jj+d2] * r;`
			`fld qword [edi+eax]`
			`fld [_r]`
			`fmul st1, st0 ; st : r, t2, t3, t1`
			`; t4 = f[jj+d4] * r`
			`fmul qword [edx+eax] ; st : t4, t2, t3, t1`

			`; f[jj] = t1 + t2 + t3;`
			`fld st3 ; st : t1, t4, t2, t3, t1`
			`fadd st0, st3`
			`fadd st0, st2`
			`fstp qword [edi]`

			`; f[jj+d2] = t1 - t3 + t4;`
			`fld st3`
			`fsub st0, st3 ; st : (t1-t3), t4, t2, t3, t1`
			`fld st0`
			`fadd st0, st2 ; st : f2, (t1-t3), t4, t2, t3, t1`
			`fstp qword [edi+eax]`
			`; f[jj+d4] = t1 - t3 - t4;`
			`fsub st0, st1 ; st : f4, t4, t2, t3, t1`
			`fstp qword [edx+eax]`

			`; f[jj+d3] = t1 - t2 + t3;`
			`fstp st0 ; st : t2, t3, t1`
			`fsubp st1, st0 ; st : (t3-t2), t1`
			`faddp st1, st0 ; st : f3`
			`fstp qword [edx]`

			`; for (k=1; k<d1; k++)`
			`xor ecx, ecx ; ecx = k`
			`mov _jj, ecx`
			`.next_k:`
			`inc ecx`
			`cmp ecx, _d1`
			`jge .done_k`
			`; {`
			`mov eax, _d2 ; the sector increment`
			`; l1 = j + k;`
			`mov edx, ecx`
			`mov _l1, edx ; [ebx+edx*8] --> f[j+k]`
			`; l2 = l1 + d2;`
			`add edx, eax`
			`mov _l2, edx`
			`; l3 = l1 + d3;`
			`add edx, eax`
			`mov _l3, edx`
			`; l4 = l1 + d4;`
			`add edx, eax`
			`mov _l4, edx`

			`; l5 = j + d2 - k;`
			`mov edx, eax`
			`sub edx, ecx`
			`mov _l5, edx`
			`; l6 = l5 + d2;`
			`add edx, eax`
			`mov _l6, edx`
			`; l7 = l5 + d3;`
			`add edx, eax`
			`mov _l7, edx`
			`; l8 = l5 + d4;`
			`add edx, eax`
			`mov _l8, edx`


			`; 340 : j5 *= k; // add-substituted multiplication`
			`mov eax, _jj`
			`add eax, _j5`
			`mov _jj, eax`

			`; c1 = C[jj];`
			`; s1 = S[jj];`
			`mov edi, [ebp+20]`
			`fld qword[edi+eax*8]`
			`mov esi, [ebp+8]`
			`shl esi, 2`
			`add esi, edi`
			`fld qword[esi+eax*8] ; st : s1, c1`

			`; t5 = f[l2] * c1 + f[l6] * s1;`
			`; t8 = f[l6] * c1 - f[l2] * s1;`
			`mov edx, _l6`
			`fld qword[ebx+edx*8]`
			`mov edx, _l2`
			`fld st0`
			`fmul st0, st2`
			`fxch st1`
			`fmul st0, st3`
			`fld qword[ebx+edx8] ; st : f[l2], f[l6]c, f[l6]*s, s, c`
			`fmul st4, st0`
			`fmulp st3, st0 ; st : f[l6]c, f[l6]s, f[l2]s, f[l2]c`
			`fsub st0, st2 ; st : t8, f[l6]s, f[l2]s, f[l2]*c`
			`fstp _t8`
			`faddp st2, st0 ; st : f[l2]*s, t5`
			`fstp st0 ; st : t5`
			`fstp _t5 ; st : <empty>`

			`; c2 = C[2*jj];`
			`; s2 = S[2*jj];`
			`shl eax, 1`
			`fld qword[edi+eax*8]`
			`fld qword[esi+eax*8] ; st : s2, c2`

			`; t6 = f[l3] * c2 + f[l7] * s2;`
			`; t9 = f[l7] * c2 - f[l3] * s2;`
			`mov edx, _l7`
			`fld qword[ebx+edx*8]`
			`mov edx, _l3`
			`fld st0`
			`fmul st0, st2`
			`fxch st1`
			`fmul st0, st3`
			`fld qword[ebx+edx8] ; st : f[l3], f[l7]c, f[l7]*s, s, c`
			`fmul st4, st0`
			`fmulp st3, st0 ; st : f[l7]c, f[l7]s, f[l3]s, f[l3]c`
			`fsub st0, st2 ; st : t9, f[l7]s, f[l3]s, f[l3]*c`
			`fstp _t9`
			`faddp st2, st0 ; st : f[l2]*s, t6`
			`fstp st0 ; st : t6`
			`fstp _t6 ; st : <empty>`

			`; c3 = C[3*jj];`
			`; s3 = S[3*jj];`
			`add eax, _jj`
			`fld qword[edi+eax*8]`
			`fld qword[esi+eax*8] ; st : s3, c3`

			`; t7 = f[l4] * c3 + f[l8] * s3;`
			`; t0 = f[l8] * c3 - f[l4] * s3;`
			`mov edx, _l8`
			`fld qword[ebx+edx*8]`
			`mov edx, _l4`
			`fld st0`
			`fmul st0, st2`
			`fxch st1`
			`fmul st0, st3`
			`fld qword[ebx+edx8] ; st : f[l4], f[l8]c, f[l8]*s, s, c`
			`fmul st4, st0`
			`fmulp st3, st0 ; st : f[l8]c, f[l8]s, f[l4]s, f[l4]c`
			`fsub st0, st2 ; st : t9, f[l8]s, f[l4]s, f[l4]*c`
			`fstp _t0`
			`faddp st2, st0 ; st : f[l2]*s, t7`
			`fstp st0 ; st : t7`
			`fstp _t7 ; st : <empty>`

			`; t1 = f[l5] - t9;`
			`; t2 = f[l5] + t9;`
			`mov eax, _l5`
			`fld qword [ebx+eax*8]`
			`fld _t9`
			`fld st0`
			`fadd st0, st2`
			`fstp _t2`
			`fsubp st1, st0`
			`fstp _t1`

			`; t3 = - t8 - t0;`
			`fld _t8`
			`fadd _t0`
			`fchs`
			`fstp _t3`
			`; t4 = t5 - t7;`
			`fld _t5`
			`fsub _t7`
			`fstp _t4`

			`; f[l5] = t1 + t4;`
			`fld _t1`
			`fld _t4`
			`fld st0`
			`fadd st0, st2`
			`fstp qword [ebx+eax*8]`
			`; f[l7] = t1 - t4;`
			`mov eax, _l7`
			`fsubp st1, st0`
			`fstp qword [ebx+eax*8]`

			`; f[l6] = t2 + t3;`
			`mov eax, _l6`
			`fld _t2`
			`fld _t3`
			`fld st0`
			`fadd st0, st2`
			`fstp qword [ebx+eax*8]`
			`; f[l8] = t2 - t3;`
			`mov eax, _l8`
			`fsubp st1, st0`
			`fstp qword [ebx+eax*8]`

			`; t1 = f[l1] + t6;`
			`mov eax, _l1`
			`fld qword [ebx+eax*8]`
			`fld _t6`
			`fld st0`
			`fadd st0, st2`
			`fstp _t1`
			`; t2 = f[l1] - t6;`
			`fsubp st1, st0`
			`fstp _t2`

			`; t3 = t8 - t0;`
			`fld _t8`
			`fsub _t0`
			`fstp _t3`
			`; t4 = t5 + t7;`
			`fld _t5`
			`fadd _t7`
			`fstp _t4`

			`; f[l1] = t1 + t4;`
			`mov eax, _l1`
			`fld _t1`
			`fld _t4`
			`fld st0`
			`fadd st0, st2`
			`fstp qword [ebx+eax*8]`
			`; f[l3] = t1 - t4;`
			`mov eax, _l3`
			`fsubp st1, st0`
			`fstp qword [ebx+eax*8]`

			`; f[l2] = t2 + t3;`
			`mov eax, _l2`
			`fld _t2`
			`fld _t3`
			`fld st0`
			`fadd st0, st2`
			`fstp qword [ebx+eax*8]`
			`; f[l4] = t2 - t3;`
			`mov eax, _l4`
			`fsubp st1, st0`
			`fstp qword [ebx+eax*8]`

			`; 374 : }`
			`jmp .next_k`

			`.done_k:`
			`; 375 : }`
			`add ebx, _d6 ; d6 = d5*8`
			`cmp ebx, _end_of_array`
			`jb .next_j`

			`; 376 : }`
			`mov cx, _step`
			`jmp .newstep`
			`.done:`
			`mov esp, ebp`
			`pop ebp`
			`; 377 : }`
			`ret`


			`;=========== Step3 ends here ===========`


			`; =================================================================`

			`;=================================================================`
			`; parameters:`
			`; -- [ebp+8] = N`
			`; -- [ebp+12] = p`
			`; -- [ebp+16] = 4k-aligned data array address`
			`; -- [ebp+20] = 4k-aligned SinCosTable address`
			`; returns:`
			`; -- nothing`
			`; destroys:`
			`; -- all GPRegs`
			`;; ==========================`

			`align 4`

			`FHT_4:`

			`push ebp`
			`mov ebp, esp`
			`mov edx, [ebp+16]`
			`add edx, [ebp+12]`
			`call BitInvert`
			`push dword[ebp+16]`
			`push dword[ebp+8]`
			`call step1`
			`call step2`
			`pop edx ; N`
			`pop ecx ; a`
			`push dword[ebp+20] ; t`
			`push ecx`
			`push dword[ebp+12] ; p`
			`push edx ; N`
			`call step3`
			`mov esp, ebp`
			`pop ebp`

			`ret`