1
0
kolibrios/programs/demos/life2/sse2life.inc

319 lines
6.3 KiB
PHP
Raw Normal View History

;// fast life generator: ~2.8 pixel*generations per CPU clock cycle ("tact")
; live_shl x,do_shl
; Conditionally shifts every 64-bit lane of vector register x left by one bit.
; The shift is emitted only when do_shl is the literal token `yes`;
; for any other token the macro expands to nothing.
macro live_shl x,do_shl
{
        match =yes, do_shl \{ psllq x,1 \}
}
; live_shr x,do_shr
; Conditionally shifts every 64-bit lane of vector register x right by one bit.
; The shift is emitted only when do_shr is the literal token `yes`;
; for any other token the macro expands to nothing.
macro live_shr x,do_shr
{
        match =yes, do_shr \{ psrlq x,1 \}
}
; live_mov x,reg,how
; Loads vector register x from memory based at general register reg.
;   how = high : x is zeroed, then its upper 8 bytes are loaded from [reg+edx]
;   how = low  : x is zeroed, then its lower 8 bytes are loaded from [reg+24]
;   otherwise  : aligned 16-byte load from [reg+ecx]
; The three cases are mutually exclusive, so their order is irrelevant.
macro live_mov x,reg,how
{
  if how eq high
        xorps   x,x
        movhps  x,[reg+edx]
  else if how eq low
        xorps   x,x
        movlps  x,[reg+24]
  else
        movaps  x,[reg+ecx]
  end if
}
; live_load x,y,z,t,shl_edi,shr_esi,how
; Loads one vector from each of the three sources addressed through edi, ebx
; and esi (the addressing form is chosen by `how`, see live_mov) and adds them
; bit-by-bit like a full adder:
;   y = sum bit   : edi ^ ebx ^ esi
;   x = carry bit : set where at least two of the three inputs are set
; Before the addition the edi vector is shifted left by one bit per qword when
; shl_edi = yes, and the esi vector is shifted right by one bit per qword when
; shr_esi = yes.
; z and t are scratch registers; both are overwritten.
macro live_load x,y,z,t,shl_edi,shr_esi,how
{
live_mov y,edi,how
live_mov x,ebx,how
live_shl y,shl_edi
movaps t,y ; t = edi vector (after optional shift)
xorps y,x ; y = edi ^ ebx
live_mov z,esi,how
andps x,t ; x = edi & ebx
live_shr z,shr_esi
movaps t,y ; t = edi ^ ebx
xorps y,z ; y = edi ^ ebx ^ esi            (sum bit)
andps t,z ; t = (edi ^ ebx) & esi
orps x,t ; x = carry of the three-input add (carry bit)
}
; live_operation a,A,b,B,c,C,d,D
; Computes one vector of next-generation bits and stores it at [ebp+ecx].
;
; Inputs are three (carry,sum) register pairs as produced by live_load:
; lowercase = carry bit (weight 2), uppercase = sum bit (weight 1).
; In live_cycle the pair (b,B) is the one loaded from the same offset as the
; current-state vector read here from [ebx+ecx+16]; (a,A) and (c,C) are the
; pairs loaded from the offsets 16 bytes above and below it.
;
; For every bit position let N = A+B+C + 2*(a+b+c), i.e. the number of set
; bits among the nine inputs that went into the three pairs (this count
; includes the cell itself). The stored result bit is
;   1  if N == 3, or if N == 4 and the current-state bit is 1
;   0  otherwise
; which is the rule of Conway's Life expressed with the cell included in N.
;
; Clobbers a, A, d, D. Leaves b, B, c, C unchanged (they are only read).
macro live_operation a,A,b,B,c,C,d,D
{
movaps D,A
xorps A,B ; A = A ^ B
andps D,B ; D = A & B (carry out of the two sum bits, weight 2)
movaps d,a
xorps a,D
andps d,D
movaps D,a
xorps a,b
andps D,b
orps d,D
movaps D,a
xorps a,c ; a = bit 0 of (a + A&B + b + c)
andps D,c
xorps d,D ; d = bit 1 of (a + A&B + b + c)
xorps a,d
movaps D,A
orps D,C ; D = (A ^ B) | C
xorps A,C ; A = A ^ B ^ C = lowest bit of N
xorps d,D
orps A,[ebx+ecx+16] ; A = (N is odd) | current-state bit
andps a,d
andps a,A ; a = next-generation bits
movaps [ebp+ecx],a
}
; live_cycle shl_edi,shr_esi
; Processes one block of edx bytes: for every 16-byte vector of the block it
; combines three consecutive live_load results with live_operation and stores
; the result to [ebp+ecx].
;
; Register contract (as used by the callers in this file):
;   edi, ebx, esi = bases of the three source blocks fed to live_load
;   ebp           = destination base
;   edx           = block length in bytes
;   eax           = amount subtracted from ecx per vector (callers set 16)
; shl_edi / shr_esi are passed straight through to live_load.
;
; ecx runs downwards from edx. The loop body is unrolled four times and only
; tests ecx after the fourth step, so with eax = 16 it consumes 64 bytes per
; pass; edx therefore has to be a positive multiple of 64 for ecx to reach
; exactly 0.
;
; The (carry,sum) pairs rotate through xmm0/1, xmm2/3, xmm4/5, xmm6/7 so that
; each live_load result is reused by three consecutive live_operation calls.
; The first and last vectors get a synthetic neighbour built by live_mov's
; `low` and `high` forms (one qword from memory, the other qword zero).
;
; Clobbers ecx, xmm0-xmm7 and flags.
macro live_cycle shl_edi,shr_esi
{
local cycle
local cycle_entry
mov ecx,edx
; prime the pipeline: synthetic pair, then the vector at [reg+edx]
live_load xmm2,xmm3,xmm4,xmm5,shl_edi,shr_esi,low
live_load xmm4,xmm5,xmm6,xmm7,shl_edi,shr_esi
sub ecx,eax
jmp cycle_entry ; enter the unrolled loop at its second stage
cycle:
live_load xmm4,xmm5,xmm6,xmm7,shl_edi,shr_esi
live_operation xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7
sub ecx,eax
cycle_entry:
live_load xmm6,xmm7,xmm0,xmm1,shl_edi,shr_esi
live_operation xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm0,xmm1
sub ecx,eax
live_load xmm0,xmm1,xmm2,xmm3,shl_edi,shr_esi
live_operation xmm4,xmm5,xmm6,xmm7,xmm0,xmm1,xmm2,xmm3
sub ecx,eax
live_load xmm2,xmm3,xmm4,xmm5,shl_edi,shr_esi
live_operation xmm6,xmm7,xmm0,xmm1,xmm2,xmm3,xmm4,xmm5
sub ecx,eax
jg cycle ; continue while ecx > 0
; ecx is 0 here: finish the last vector using the synthetic `high` pair
live_load xmm4,xmm5,xmm6,xmm7,shl_edi,shr_esi,high
live_operation xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7
}
; OneGeneration_Flag12
; Internal helper of @OneGeneration$qqsiipvpxvi: optionally clears one qword
; per block and/or one bit per qword in the SOURCE buffer before the
; generation is computed, depending on bits 0..3 of the 5th stack argument.
;
; In:   eax = number of blocks - 1
;       ebx = source base as prepared by the caller (data starts at [ebx+16])
;       edx = block length in bytes
;       stack: both call sites push exactly one dword between the caller's four
;       register saves and the call, so after `push edi` below the caller's
;       arguments are at [esp+32] (1st), [esp+36] (2nd) ... [esp+48] (5th).
; Out:  nothing
; Keeps ebx, edx, edi, ebp. Clobbers eax, ecx, esi, mm0, flags.
; Uses MMX without emms; both callers later fall through to the emms at
; OneGeneration_flag4_end.
OneGeneration_Flag12:
push edi
lea esi,[eax+1] ; esi = number of blocks
; --- part 1: runs only when flag bit 1 is set and flag bit 3 is clear ---
bt dword [esp+48],1
jnc OneGeneration_flag2_end
bt dword [esp+48],3
jc OneGeneration_flag2_end
mov edi,[esp+36] ; edi = 2nd argument
shl edi,4 ; 16 bytes per index
cmp edi,edx
jb OneGeneration_flag2_uphalf ; offset fits in the block: low qword of that vector
sub edi,edx ; otherwise wrap once ...
cmp edi,edx
jnb OneGeneration_flag2_end ; still outside the block: nothing to clear
add edi,8 ; ... and address the high qword of the vector
OneGeneration_flag2_uphalf:
mov ecx,esi ; one store per block
lea edi,[edi+ebx+16]
pxor mm0,mm0
OneGeneration_flag2_cycle:
movq [edi],mm0 ; zero this qword
add edi,edx ; same offset in the next block
loop OneGeneration_flag2_cycle
OneGeneration_flag2_end:
; --- part 2: runs only when flag bit 0 is set and flag bit 2 is clear ---
bt dword [esp+48],0
jnc OneGeneration_flag1_end
bt dword [esp+48],2
jc OneGeneration_flag1_end
push edx ; div overwrites edx; stack offsets grow by 4 until the pop
mov eax,[esp+36] ; eax = 1st argument (offset shifted by the push above)
xor edx,edx
div esi ; eax = arg1 / blocks (bit index), edx = arg1 mod blocks (block index)
mov esi,edx
pop edx
cmp eax,64
jnb OneGeneration_flag1_end ; bit index outside a qword: nothing to clear
imul esi,edx
lea esi,[esi+ebx+16] ; esi = start of the selected block
btr eax,5 ; bit index >= 32 ? reduce it to 0..31 ...
jnc OneGeneration_flag1_noadd4
add esi,4 ; ... and address the upper dword of each qword
OneGeneration_flag1_noadd4:
lea ecx,[edx-8] ; offset of the last qword in the block
mov edi,8 ; step: one qword
; clear the selected bit in every qword of the block, four qwords per pass
OneGeneration_flag1_cycle:
btr dword [esi+ecx],eax
sub ecx,edi
btr dword [esi+ecx],eax
sub ecx,edi
btr dword [esi+ecx],eax
sub ecx,edi
btr dword [esi+ecx],eax
sub ecx,edi
jnl OneGeneration_flag1_cycle ; until the offset drops below 0
OneGeneration_flag1_end:
pop edi
ret
; @OneGeneration$qqsiipvpxvi
; Computes one generation from a source bit buffer into a destination bit
; buffer, then optionally copies edge qwords/bits inside the destination.
;
; Five dword stack arguments, removed by the callee (`ret 20`). The mangled
; name suggests OneGeneration(int, int, void*, const void*, int) with a
; stdcall-style convention - TODO confirm against the C++ declaration.
; After the four pushes below they are read as:
;   [esp+20] arg1  size along the axis that is split into blocks of 64
;                  (presumably the field width - confirm)
;   [esp+24] arg2  size along the axis stored inside a block
;                  (presumably the field height - confirm)
;   [esp+28] arg3  destination buffer, rounded UP to a 16-byte boundary
;   [esp+32] arg4  source buffer, rounded UP to a 16-byte boundary
;   [esp+36] arg5  flag bits 0..3 (see OneGeneration_Flag12 and the two
;                  post-processing sections below)
;
; Derived values:
;   blocks     = ((arg1-1) >> 6) + 1
;   block size = ((arg2+7) >> 3) * 64 bytes (edx)
; Returns immediately when arg1 <= 0, when the block size is <= 0, or when
; (blocks-1) * block size overflows.
; Keeps ebp, ebx, esi, edi. Clobbers eax, ecx, edx, xmm0-xmm7, mm0, flags.
@OneGeneration$qqsiipvpxvi:
push ebp
push ebx
push esi
push edi
mov eax,[esp+20]
mov edx,[esp+24]
mov ebp,[esp+28]
mov ebx,[esp+32]
dec eax
jl OneGeneration_end ; arg1 <= 0
add edx,7
add ebp,15
dec ebx
shr eax,6 ; eax = blocks - 1
shl edx,3
and ebp,not 15 ; ebp = arg3 rounded up to 16
and ebx,not 15 ; ebx = (arg4 rounded up to 16) - 16, so data starts at [ebx+16]
and edx,not 63 ; edx = block size in bytes, a multiple of 64
jng OneGeneration_end ; block size <= 0
test eax,eax
jz OneGeneration_single ; exactly one block
; ---- several blocks ----
mov edi,edx
imul edi,eax
jo OneGeneration_end ; (blocks-1) * block size does not fit in 32 bits
push eax ; keep blocks-1 on the stack
add edi,ebx ; edi = last block
call OneGeneration_Flag12
lea esi,[ebx+edx] ; esi = second block
push dword [esp] ; loop counter = blocks-1
mov eax,16 ; ecx step used by live_cycle
; first block: its edi-side input is the LAST block shifted left by one bit
live_cycle yes,no
jmp OneGeneration_cycle_fin
OneGeneration_cycle:
; middle blocks: slide the edi/ebx/esi window and the destination by one block
mov edi,ebx
mov ebx,esi
add ebp,edx
add esi,edx
live_cycle no,no
OneGeneration_cycle_fin:
dec dword [esp]
jg OneGeneration_cycle
; last block: its esi-side input is the FIRST block shifted right by one bit
mov edi,ebx
pop ecx ; drop the loop counter
mov ebx,esi
mov esi,edx
add ebp,edx
imul esi,[esp] ; block size * (blocks-1)
neg esi
add esi,ebx ; esi = first block
live_cycle no,yes
jmp OneGeneration_flag48
OneGeneration_single:
; ---- one block: it is its own neighbour on both sides ----
push eax ; eax = 0 here; same stack layout as the multi-block path
mov edi,ebx
call OneGeneration_Flag12
mov esi,ebx
mov eax,16 ; ecx step used by live_cycle
live_cycle yes,yes
OneGeneration_flag48:
pop ebp ; blocks-1 saved above
inc ebp ; ebp = number of blocks
; ---- flag bit 3: copy two qwords per block inside the destination ----
bt dword [esp+36],3
jnc OneGeneration_flag8_end
mov edi,[esp+24]
mov ebx,[esp+28]
dec edi
add ebx,15
shl edi,4 ; edi = (arg2-1) * 16
lea esi,[edi-16] ; esi = (arg2-2) * 16
; NOTE(review): with arg2 == 1, esi is -16 and the branch below is taken, so
; the first movq in the loop reads from [ebx-16] - confirm callers never
; request this with bit 3 set.
and ebx,not 15 ; ebx = destination rounded up to 16
cmp edi,edx
jb OneGeneration_flag8_uphalf ; both offsets lie inside the block
sub edi,edx ; otherwise wrap edi and use the high qword
add edi,8
cmp esi,edx
jb OneGeneration_flag8_uphalf
sub esi,edx ; same for esi
add esi,8
OneGeneration_flag8_uphalf:
mov ecx,ebp ; once per block
OneGeneration_flag8_cycle:
movq mm0,[ebx+esi]
movq [ebx],mm0 ; qword at offset 0  <- qword for index arg2-2
movq mm0,[ebx+16]
movq [ebx+edi],mm0 ; qword for index arg2-1 <- qword at offset 16
add ebx,edx
loop OneGeneration_flag8_cycle
OneGeneration_flag8_end:
; ---- flag bit 2: copy two bit columns inside the destination ----
bt dword [esp+36],2
jnc OneGeneration_flag4_end
mov eax,[esp+20]
push edx ; div overwrites edx; stack offsets grow by 4 until the pop
dec eax
xor edx,edx
mov ebx,[esp+32] ; arg3 (offset shifted by the push above)
div ebp ; eax = (arg1-1) / blocks, edx = (arg1-1) mod blocks
add ebx,15
mov esi,eax ; esi = bit index for position arg1-1
mov edi,edx ; edi = block index for position arg1-1
and ebx,not 15 ; ebx = destination rounded up to 16
; step back one position: previous block, or last block and previous bit
dec edx
jl OneGeneration_flag4_dec0
mov ebp,edx
jmp OneGeneration_flag4_after_dec
OneGeneration_flag4_dec0:
; NOTE(review): with arg1 == 1 both eax and edx were 0, so eax becomes -1
; here and is used as a bit index below - confirm callers never request this.
dec ebp
dec eax
OneGeneration_flag4_after_dec:
pop edx ; edx = block size again
imul edi,edx
imul ebp,edx
add edi,ebx ; edi = block holding position arg1-1
add ebp,ebx ; ebp = block holding position arg1-2
btr esi,5 ; bit index >= 32: use the upper dword of each qword
jnc OneGeneration_flag4_noadd4f
add edi,4
OneGeneration_flag4_noadd4f:
btr eax,5 ; same for the arg1-2 bit index
jnc OneGeneration_flag4_noadd4s
add ebp,4
OneGeneration_flag4_noadd4s:
; loop 0: for every qword, bit 0 of the first block <- bit for position arg1-2
mov ecx,edx
jmp OneGeneration_flag4_cycle0_entry
OneGeneration_flag4_cycle0:
btr dword [ebx+ecx],0
OneGeneration_flag4_cycle0_entry:
sub ecx,8
jl OneGeneration_flag4_cycle0_end
bt dword [ebp+ecx],eax
jnc OneGeneration_flag4_cycle0
bts dword [ebx+ecx],0
jmp OneGeneration_flag4_cycle0_entry
OneGeneration_flag4_cycle0_end:
; loop 1: bit for position arg1-1 <- bit 0 of the second block when
; arg1 > 64, otherwise bit 1 of the only block
xor eax,eax
cmp dword [esp+20],64
jng OneGeneration_flag4_single
add ebx,edx
jmp OneGeneration_flag4_cycle1_entry
OneGeneration_flag4_single:
inc eax
jmp OneGeneration_flag4_cycle1_entry
OneGeneration_flag4_cycle1:
btr dword [edi+edx],esi
OneGeneration_flag4_cycle1_entry:
sub edx,8 ; edx doubles as the loop offset; it is not needed afterwards
jl OneGeneration_flag4_end
bt dword [ebx+edx],eax
jnc OneGeneration_flag4_cycle1
bts dword [edi+edx],esi
jmp OneGeneration_flag4_cycle1_entry
OneGeneration_flag4_end:
emms ; leave MMX state clean after the movq uses above and in OneGeneration_Flag12
OneGeneration_end:
pop edi
pop esi
pop ebx
pop ebp
ret 20 ; callee removes the five dword arguments