kolibrios-gitea/programs/demos/life2/mmxlife.inc

321 lines
6.2 KiB
PHP
Raw Normal View History

;// Fast Life generator: about 2.2 pixel*generations per clock cycle
; live_shl reg,enable
; Emits a one-bit left shift of the 64-bit MMX register reg, but only when
; the assembly-time argument enable is the symbol yes. For any other value
; the macro expands to nothing.
macro live_shl reg,enable
{
  if enable eq yes
        psllq   reg,1
  end if
}
; live_shr reg,enable
; Emits a one-bit logical right shift of the 64-bit MMX register reg, but
; only when the assembly-time argument enable is the symbol yes. For any
; other value the macro expands to nothing.
macro live_shr reg,enable
{
  if enable eq yes
        psrlq   reg,1
  end if
}
; live_zero first,second
; Clears two MMX registers: first is zeroed by xor with itself, then copied
; into second.
macro live_zero first,second
{
        pxor    first,first
        movq    second,first
}
; live_load cnt_hi,cnt_lo,w3,tmp,shl_prev,shr_next
; Bitwise full adder over three 64-bit words taken at byte offset ecx:
;   p = [edi+ecx]  (shifted left by one bit when shl_prev is yes)
;   q = [ebx+ecx]
;   r = [esi+ecx]  (shifted right by one bit when shr_next is yes)
; On exit, for every bit position:
;   cnt_lo = p xor q xor r                   (low bit of p+q+r)
;   cnt_hi = (p and q) or ((p xor q) and r)  (high bit of p+q+r)
; w3 and tmp are scratch MMX registers and are overwritten.
macro live_load cnt_hi,cnt_lo,w3,tmp,shl_prev,shr_next
{
        movq    cnt_lo,[edi+ecx]        ; p
        movq    cnt_hi,[ebx+ecx]        ; q
        live_shl cnt_lo,shl_prev
        movq    tmp,cnt_lo
        pxor    cnt_lo,cnt_hi           ; p xor q
        movq    w3,[esi+ecx]            ; r
        pand    cnt_hi,tmp              ; p and q
        live_shr w3,shr_next
        movq    tmp,cnt_lo
        pxor    cnt_lo,w3               ; low bit of the sum
        pand    tmp,w3                  ; (p xor q) and r
        por     cnt_hi,tmp              ; high bit of the sum
}
; live_operation a,A,b,B,c,C,d,D,shift
; Adds three 2-bit per-lane counts and stores the resulting cell bits.
;   a:A, b:B, c:C - MMX registers holding (high:low) bit planes of three
;                   counts, in the form produced by live_load
;   d, D          - scratch MMX registers, overwritten
;   shift         - byte offset, register or constant; the current cells are
;                   read from [ebx+shift+16] and the result is written to
;                   [ebp+shift]
; a and A are overwritten as well; b, B, c and C are only read.
; Let S be the per-bit sum of the three counts. The stored bit is 1 when
; S = 3, or when S = 4 and the bit read from [ebx+shift+16] is 1. As invoked
; from live_cycle, b:B is the count for the row at that same offset, so S
; includes the cell itself.
; Parameter names differ only by letter case, which is significant here.
macro live_operation a,A,b,B,c,C,d,D,shift
{
movq D,A
; A = low bit of A+B
pxor A,B
; D = carry out of the low bits
pand D,B
movq d,a
pxor a,D
; d = carry out of a + D
pand d,D
movq D,a
pxor a,b
pand D,b
; the two carries are mutually exclusive; first two counts = A + 2*a + 4*d
por d,D
movq D,a
; add the high bit of the third count
pxor a,c
pand D,c
; d = parity of the weight-4 carries
pxor d,D
pxor a,d
movq D,A
por D,C
; A = low bit of the total
pxor A,C
pxor d,D
; A = total is odd, or the cell is currently set
por A,[ebx+shift+16]
pand a,d
pand a,A
movq [ebp+shift],a
}
; live_cycle shl_edi,shr_esi
; Runs live_load / live_operation over every row word of one strip.
; Register inputs, as read by the code below:
;   edi, ebx, esi - bases of the three source strips combined by live_load;
;                   live_operation also reads the current cells via ebx
;   ebp           - base that live_operation stores results to
;   edx           - size of one strip in bytes
;   eax           - step between consecutive offsets; every invocation in
;                   this file is preceded by mov eax,16
; shl_edi and shr_esi are forwarded unchanged to live_load.
; Overwrites ecx and mm0-mm7.
; Offsets are visited downwards. The first pass loads offsets edx-8,
; edx-24, ... and leaves the unrolled loop with cl = -8; a second pass then
; loads edx-16, edx-32, ... . The loop-exit tests are only evaluated once
; per four steps, so they rely on edx being a multiple of 64 (the caller
; masks edx with not 63).
; The register roles rotate through mm0-mm7 so that each live_operation sees
; the counts of three consecutively loaded rows without any register moves.
macro live_cycle shl_edi,shr_esi
{
local cycle
local cycle_entry
local last_oper
lea ecx,[edx-8]
; a zero count stands in for the row before the first one loaded
live_zero mm2,mm3
live_load mm4,mm5,mm6,mm7,shl_edi,shr_esi
sub ecx,eax
jmp cycle_entry
cycle:
live_load mm4,mm5,mm6,mm7,shl_edi,shr_esi
live_operation mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7,ecx
sub ecx,eax
cycle_entry:
live_load mm6,mm7,mm0,mm1,shl_edi,shr_esi
live_operation mm2,mm3,mm4,mm5,mm6,mm7,mm0,mm1,ecx
sub ecx,eax
live_load mm0,mm1,mm2,mm3,shl_edi,shr_esi
live_operation mm4,mm5,mm6,mm7,mm0,mm1,mm2,mm3,ecx
sub ecx,eax
live_load mm2,mm3,mm4,mm5,shl_edi,shr_esi
live_operation mm6,mm7,mm0,mm1,mm2,mm3,mm4,mm5,ecx
sub ecx,eax
jnl cycle
; cl = -8 only after the first pass; any other value means both are done
cmp cl,-8
jnz last_oper
; start of the second pass: the row at offset 8 gets the row at edx-16 as
; its remaining neighbour
lea ecx,[edx-16]
live_load mm4,mm5,mm6,mm7,shl_edi,shr_esi
live_operation mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7,-8
sub ecx,eax
jmp cycle_entry
last_oper:
; final row (offset 0): a zero count stands in for the missing neighbour
live_zero mm4,mm5
live_operation mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7,ecx
}
; OneGeneration_Flag12
; Helper called from @OneGeneration$qqsiipvpxvi before the MMX pass; it
; clears one row slot and/or one bit column in the buffer at ebx, depending
; on the flag argument.
; In:  eax = strip count minus one, ebx = aligned buffer base,
;      edx = strip size in bytes.
; Stack offsets below are relative to esp after push edi and match both call
; sites (four saved registers, one pushed dword, then the call):
;      [esp+32] = 1st argument, [esp+36] = 2nd argument, [esp+48] = flags.
; Out: edi, ebx, edx preserved; eax, ecx, esi, mm0 and the flags are
;      overwritten.
OneGeneration_Flag12:
push edi
; esi = number of strips
lea esi,[eax+1]
; flag bit 1 set and bit 3 clear: zero one 8-byte row slot in every strip
bt dword [esp+48],1
jnc OneGeneration_flag2_end
bt dword [esp+48],3
jc OneGeneration_flag2_end
; row index = 2nd argument, 16 bytes per index
mov edi,[esp+36]
shl edi,4
cmp edi,edx
jb OneGeneration_flag2_uphalf
; index falls in the second interleaved half: wrap and use the +8 slot
sub edi,edx
cmp edi,edx
; beyond both halves: nothing to clear
jnb OneGeneration_flag2_end
add edi,8
OneGeneration_flag2_uphalf:
mov ecx,esi
add edi,ebx
pxor mm0,mm0
OneGeneration_flag2_cycle:
movq [edi],mm0
; same slot in the next strip
add edi,edx
loop OneGeneration_flag2_cycle
OneGeneration_flag2_end:
; flag bit 0 set and bit 2 clear: clear one bit in every row of one strip
bt dword [esp+48],0
jnc OneGeneration_flag1_end
bt dword [esp+48],2
jc OneGeneration_flag1_end
push edx
; after push edx this is the 1st argument
mov eax,[esp+36]
xor edx,edx
; quotient = bit number, remainder = strip number
div esi
mov esi,edx
pop edx
; bit numbers of 64 and above do not exist in a 64-bit row word
cmp eax,64
jnb OneGeneration_flag1_end
imul esi,edx
add esi,ebx
; bits 32..63 are addressed through the upper dword
btr eax,5
jnc OneGeneration_flag1_noadd4
add esi,4
OneGeneration_flag1_noadd4:
lea ecx,[edx-8]
mov edi,8
; unrolled four times; relies on edx being a multiple of 32
OneGeneration_flag1_cycle:
btr dword [esi+ecx],eax
sub ecx,edi
btr dword [esi+ecx],eax
sub ecx,edi
btr dword [esi+ecx],eax
sub ecx,edi
btr dword [esi+ecx],eax
sub ecx,edi
jnl OneGeneration_flag1_cycle
OneGeneration_flag1_end:
pop edi
ret
; @OneGeneration$qqsiipvpxvi
; Computes one Life generation from a source bit buffer into a destination
; bit buffer. Takes five dword stack arguments and removes them itself
; (ret 20). NOTE(review): the label looks like a Borland-mangled __stdcall
; OneGeneration(int, int, void*, const void*, int) - confirm against the
; C++ declaration.
; Arguments, at these offsets once the four registers are pushed:
;   [esp+20] arg1 - cell count along the bit axis (presumably the width);
;                   the number of 64-bit strips is ((arg1-1) shr 6)+1
;   [esp+24] arg2 - cell count along the row axis (presumably the height)
;   [esp+28] arg3 - destination buffer, rounded up to 16 bytes here
;   [esp+32] arg4 - source buffer, rounded up to 16 bytes here
;   [esp+36] arg5 - flag bits 0..3, tested here and in OneGeneration_Flag12
; Returns immediately if arg1 < 1, if the computed strip size is not
; positive, or if strip size times (strips-1) overflows.
; ebp, ebx, esi and edi are saved and restored; eax, ecx, edx and the MMX
; registers are overwritten. emms is executed on every path that used MMX.
@OneGeneration$qqsiipvpxvi:
push ebp
push ebx
push esi
push edi
mov eax,[esp+20]
mov edx,[esp+24]
mov ebp,[esp+28]
mov ebx,[esp+32]
dec eax
jl OneGeneration_end
add edx,7
; ebp = aligned destination base + 16, matching the +16 in live_operation
add ebp,31
add ebx,15
; eax = strip count - 1
shr eax,6
shl edx,3
and ebp,not 15
and ebx,not 15
; edx = strip size in bytes = 64 * ceil(arg2 / 8)
and edx,not 63
jng OneGeneration_end
test eax,eax
jz OneGeneration_single
; edi = last strip, used as the left-shifted neighbour of the first strip
mov edi,edx
imul edi,eax
jo OneGeneration_end
push eax
add edi,ebx
call OneGeneration_Flag12
lea esi,[ebx+edx]
; second copy of the count serves as the loop counter
push dword [esp]
mov eax,16
; first strip
live_cycle yes,no
jmp OneGeneration_cycle_fin
OneGeneration_cycle:
; slide the three-strip window forward by one strip
mov edi,ebx
mov ebx,esi
add ebp,edx
add esi,edx
live_cycle no,no
OneGeneration_cycle_fin:
dec dword [esp]
jg OneGeneration_cycle
; last strip: its right-hand neighbour is the first strip, shifted right
mov edi,ebx
pop ecx
mov ebx,esi
mov esi,edx
add ebp,edx
imul esi,[esp]
neg esi
add esi,ebx
live_cycle no,yes
jmp OneGeneration_flag48
OneGeneration_single:
; one strip only: it is its own neighbour on both sides
push eax
mov edi,ebx
call OneGeneration_Flag12
mov esi,ebx
mov eax,16
live_cycle yes,yes
OneGeneration_flag48:
; ebp = strip count
pop ebp
inc ebp
; flag bit 3: copy two rows inside the destination, in every strip
bt dword [esp+36],3
jnc OneGeneration_flag8_end
mov edi,[esp+24]
mov ebx,[esp+28]
dec edi
add ebx,15
; edi = byte offset for row index arg2-1, esi = the same for arg2-2
shl edi,4
lea esi,[edi-16]
and ebx,not 15
cmp edi,edx
jb OneGeneration_flag8_uphalf
; offsets past the first half map to the +8 slots of the second half
sub edi,edx
add edi,8
cmp esi,edx
jb OneGeneration_flag8_uphalf
sub esi,edx
add esi,8
OneGeneration_flag8_uphalf:
mov ecx,ebp
OneGeneration_flag8_cycle:
; row index 0 receives row arg2-2; row arg2-1 receives row index 1
movq mm0,[ebx+esi]
movq [ebx],mm0
movq mm0,[ebx+16]
movq [ebx+edi],mm0
add ebx,edx
loop OneGeneration_flag8_cycle
OneGeneration_flag8_end:
; flag bit 2: copy two bit columns inside the destination
bt dword [esp+36],2
jnc OneGeneration_flag4_end
mov eax,[esp+20]
push edx
dec eax
xor edx,edx
; after push edx this is arg3, the destination
mov ebx,[esp+32]
; column arg1-1: quotient = bit number, remainder = strip number
div ebp
add ebx,15
mov esi,eax
mov edi,edx
and ebx,not 15
; locate column arg1-2: previous strip, or last strip and previous bit
dec edx
jl OneGeneration_flag4_dec0
mov ebp,edx
jmp OneGeneration_flag4_after_dec
OneGeneration_flag4_dec0:
dec ebp
dec eax
OneGeneration_flag4_after_dec:
pop edx
imul edi,edx
imul ebp,edx
add edi,ebx
add ebp,ebx
; bit numbers 32..63 are addressed through the upper dword
btr esi,5
jnc OneGeneration_flag4_noadd4f
add edi,4
OneGeneration_flag4_noadd4f:
btr eax,5
jnc OneGeneration_flag4_noadd4s
add ebp,4
OneGeneration_flag4_noadd4s:
; first loop: bit 0 of strip 0 receives column arg1-2, row by row
mov ecx,edx
jmp OneGeneration_flag4_cycle0_entry
OneGeneration_flag4_cycle0:
btr dword [ebx+ecx],0
OneGeneration_flag4_cycle0_entry:
sub ecx,8
jl OneGeneration_flag4_cycle0_end
bt dword [ebp+ecx],eax
jnc OneGeneration_flag4_cycle0
bts dword [ebx+ecx],0
jmp OneGeneration_flag4_cycle0_entry
OneGeneration_flag4_cycle0_end:
; second loop source: bit 0 of strip 1, or bit 1 of strip 0 when arg1 <= 64
xor eax,eax
cmp dword [esp+20],64
jng OneGeneration_flag4_single
add ebx,edx
jmp OneGeneration_flag4_cycle1_entry
OneGeneration_flag4_single:
inc eax
jmp OneGeneration_flag4_cycle1_entry
OneGeneration_flag4_cycle1:
btr dword [edi+edx],esi
OneGeneration_flag4_cycle1_entry:
; edx doubles as the row offset here; the strip size is no longer needed
sub edx,8
jl OneGeneration_flag4_end
bt dword [ebx+edx],eax
jnc OneGeneration_flag4_cycle1
bts dword [edi+edx],esi
jmp OneGeneration_flag4_cycle1_entry
OneGeneration_flag4_end:
; leave MMX state so that later x87 code works
emms
OneGeneration_end:
pop edi
pop esi
pop ebx
pop ebp
ret 20