sse2 mixers

git-svn-id: svn://kolibrios.org@293 a494cfbc-eb01-0410-851d-a64ba20cac60
This commit is contained in:
Sergey Semyonov (Serge) 2007-01-24 15:51:55 +00:00
parent 72cb45f7f1
commit 73aa6c6d6a
4 changed files with 524 additions and 385 deletions

View File

@ -19,17 +19,21 @@ include 'proc32.inc'
include 'main.inc'
include 'imports.inc'
USE_MMX equ 0
USE_MMX_128 equ 0
USE_SSE equ 0
FORCE_MMX equ 0 ;set to 1 to force use mmx or
FORCE_MMX_128 equ 0 ;integer sse2 extensions
;and reduce driver size
;USE_SSE equ 0
DEBUG equ 1
DEBUG equ 1
EVENT_NOTIFY equ 0x00000200
EVENT_NOTIFY equ 0x00000200
OS_BASE equ 0
new_app_base equ 0x60400000
PROC_BASE equ OS_BASE+0x0080000
CAPS_SSE2 equ 26
OS_BASE equ 0; 0x80400000
new_app_base equ 0x60400000; 0x01000000
PROC_BASE equ OS_BASE+0x0080000
public START
public service_proc
@ -79,6 +83,42 @@ proc START stdcall, state:dword
mov [str.fd], eax
mov [str.bk], eax
if FORCE_MMX
if FORCE_MMX_128
display 'Use only FORCE_MMX or FORCE_MMX_128 not both together',13,10
stop
end if
mov [mix_2_core], mmx_mix_2
mov [mix_3_core], mmx_mix_3
mov [mix_4_core], mmx_mix_4
end if
if FORCE_MMX_128
if FORCE_MMX
display 'Use only FORCE_MMX or FORCE_MMX_128 not both together',13,10
stop
end if
mov [mix_2_core], mmx128_mix_2
mov [mix_3_core], mmx128_mix_3
mov [mix_4_core], mmx128_mix_4
end if
if ~(FORCE_MMX or FORCE_MMX_128) ;autodetect
mov eax, 1
cpuid
bt edx, CAPS_SSE2
jc .mmx128
;old 64-bit mmx
mov [mix_2_core], mmx_mix_2
mov [mix_3_core], mmx_mix_3
mov [mix_4_core], mmx_mix_4
jmp @F
.mmx128: ;new 128-bit sse2 extensions
mov [mix_2_core], mmx128_mix_2
mov [mix_3_core], mmx128_mix_3
mov [mix_4_core], mmx128_mix_4
@@:
end if
stdcall set_handler, [hSound], new_mix
stdcall RegService, szInfinity, service_proc
ret
@ -563,14 +603,8 @@ proc dev_play stdcall, hsrv:dword
endp
include 'mixer.asm'
;if USE_MMX
; include 'mix_mmx.inc'
;end if
if USE_MMX_128
include 'mix_sse2.inc'
end if
include 'mix_mmx.inc'
include 'mix_sse2.inc'
;if USE_SSE
; include 'mix_sse.inc'
@ -664,7 +698,7 @@ mix_buff_map rd 1
str.fd rd 1
str.bk rd 1
mix_2_1.core rd 1
mix_3_1.core rd 1
mix_4_1.core rd 1
mix_2_core rd 1
mix_3_core rd 1
mix_4_core rd 1

View File

@ -0,0 +1,241 @@
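; params (shared by mmx_mix_2 / mmx_mix_3 / mmx_mix_4)
; edi= output
; eax= input stream 1
; ebx= input stream 2
; ecx= input stream 3 (mmx_mix_3 and mmx_mix_4)
; edx= input stream 4 (mmx_mix_4 only)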
if used mmx_mix_2
;-----------------------------------------------------------------------
; mmx_mix_2 - mix two streams: 128 bytes of packed signed 16-bit words
;             are added with signed saturation (paddsw), 64-bit MMX.
; In:    eax = input stream 1
;        ebx = input stream 2
;        edi = output
; Out:   [edi .. edi+127] = [eax ..] +sat [ebx ..]
; Clobb: mm0-mm7. General-purpose registers are left untouched, and no
;        emms is executed here.
;-----------------------------------------------------------------------
align 4
mmx_mix_2:
        ; ---- bytes 0..63 ------------------------------------------------
        movq    mm0, [eax]
        movq    mm1, [eax+8]
        movq    mm2, [eax+16]
        movq    mm3, [eax+24]
        movq    mm4, [eax+32]
        movq    mm5, [eax+40]
        movq    mm6, [eax+48]
        movq    mm7, [eax+56]

        paddsw  mm0, [ebx]
        movq    [edi], mm0
        paddsw  mm1, [ebx+8]
        movq    [edi+8], mm1
        paddsw  mm2, [ebx+16]
        movq    [edi+16], mm2
        paddsw  mm3, [ebx+24]
        movq    [edi+24], mm3
        paddsw  mm4, [ebx+32]
        movq    [edi+32], mm4
        paddsw  mm5, [ebx+40]
        movq    [edi+40], mm5
        paddsw  mm6, [ebx+48]
        movq    [edi+48], mm6
        paddsw  mm7, [ebx+56]
        movq    [edi+56], mm7

        ; ---- bytes 64..127 ----------------------------------------------
        movq    mm0, [eax+64]
        movq    mm1, [eax+72]
        movq    mm2, [eax+80]
        movq    mm3, [eax+88]
        movq    mm4, [eax+96]
        movq    mm5, [eax+104]
        movq    mm6, [eax+112]
        movq    mm7, [eax+120]

        paddsw  mm0, [ebx+64]
        movq    [edi+64], mm0
        paddsw  mm1, [ebx+72]
        movq    [edi+72], mm1
        paddsw  mm2, [ebx+80]
        movq    [edi+80], mm2
        paddsw  mm3, [ebx+88]
        movq    [edi+88], mm3
        paddsw  mm4, [ebx+96]
        movq    [edi+96], mm4
        paddsw  mm5, [ebx+104]          ; second source is ebx (ecx is not an input here)
        movq    [edi+104], mm5          ; destination is edi (edx is not an input here)
        paddsw  mm6, [ebx+112]
        movq    [edi+112], mm6
        paddsw  mm7, [ebx+120]
        movq    [edi+120], mm7
        ret
;-----------------------------------------------------------------------
; mmx_mix3_half base - assembly-time helper for mmx_mix_3.
; Emits the code that mixes the 64 bytes starting at offset `base`:
; load eight qwords of stream 1, add stream 2, then add stream 3 and
; store each result as soon as it is complete.
;-----------------------------------------------------------------------
macro mmx_mix3_half base
{
        movq    mm0, [eax+base]
        movq    mm1, [eax+base+8]
        movq    mm2, [eax+base+16]
        movq    mm3, [eax+base+24]
        movq    mm4, [eax+base+32]
        movq    mm5, [eax+base+40]
        movq    mm6, [eax+base+48]
        movq    mm7, [eax+base+56]

        paddsw  mm0, [ebx+base]
        paddsw  mm1, [ebx+base+8]
        paddsw  mm2, [ebx+base+16]
        paddsw  mm3, [ebx+base+24]
        paddsw  mm4, [ebx+base+32]
        paddsw  mm5, [ebx+base+40]
        paddsw  mm6, [ebx+base+48]
        paddsw  mm7, [ebx+base+56]

        paddsw  mm0, [ecx+base]
        movq    [edi+base], mm0
        paddsw  mm1, [ecx+base+8]
        movq    [edi+base+8], mm1
        paddsw  mm2, [ecx+base+16]
        movq    [edi+base+16], mm2
        paddsw  mm3, [ecx+base+24]
        movq    [edi+base+24], mm3
        paddsw  mm4, [ecx+base+32]
        movq    [edi+base+32], mm4
        paddsw  mm5, [ecx+base+40]
        movq    [edi+base+40], mm5
        paddsw  mm6, [ecx+base+48]
        movq    [edi+base+48], mm6
        paddsw  mm7, [ecx+base+56]
        movq    [edi+base+56], mm7
}

;-----------------------------------------------------------------------
; mmx_mix_3 - mix three streams: 128 bytes of packed signed 16-bit
;             words, saturating adds (paddsw), 64-bit MMX.
; In:    eax = input stream 1
;        ebx = input stream 2
;        ecx = input stream 3
;        edi = output
; Out:   [edi .. edi+127] = ([eax ..] +sat [ebx ..]) +sat [ecx ..]
; Clobb: mm0-mm7. General-purpose registers are left untouched, and no
;        emms is executed here.
;-----------------------------------------------------------------------
align 4
mmx_mix_3:
        mmx_mix3_half 0                 ; bytes 0..63
        mmx_mix3_half 64                ; bytes 64..127
        ret
;-----------------------------------------------------------------------
; mmx_mix4_quarter base - assembly-time helper for mmx_mix_4.
; Emits the code that mixes the 32 bytes starting at offset `base`.
; Even registers (mm0/2/4/6) accumulate stream 1 + stream 3, odd
; registers (mm1/3/5/7) accumulate stream 2 + stream 4; each even/odd
; pair is then combined and stored.
; Generating all four quarters from one template keeps the register
; pairing and the destination register identical in every quarter.
;-----------------------------------------------------------------------
macro mmx_mix4_quarter base
{
        movq    mm0, [eax+base]
        movq    mm2, [eax+base+8]
        movq    mm4, [eax+base+16]
        movq    mm6, [eax+base+24]

        movq    mm1, [ebx+base]
        movq    mm3, [ebx+base+8]
        movq    mm5, [ebx+base+16]
        movq    mm7, [ebx+base+24]

        paddsw  mm0, [ecx+base]
        paddsw  mm2, [ecx+base+8]
        paddsw  mm4, [ecx+base+16]
        paddsw  mm6, [ecx+base+24]

        paddsw  mm1, [edx+base]
        paddsw  mm3, [edx+base+8]
        paddsw  mm5, [edx+base+16]
        paddsw  mm7, [edx+base+24]

        paddsw  mm0, mm1
        movq    [edi+base], mm0
        paddsw  mm2, mm3
        movq    [edi+base+8], mm2
        paddsw  mm4, mm5
        movq    [edi+base+16], mm4
        paddsw  mm6, mm7
        movq    [edi+base+24], mm6
}

;-----------------------------------------------------------------------
; mmx_mix_4 - mix four streams: 128 bytes of packed signed 16-bit
;             words, saturating adds (paddsw), 64-bit MMX.
; In:    eax = input stream 1
;        ebx = input stream 2
;        ecx = input stream 3
;        edx = input stream 4
;        edi = output
; Out:   [edi .. edi+127] =
;            ([eax ..] +sat [ecx ..]) +sat ([ebx ..] +sat [edx ..])
; Clobb: mm0-mm7. General-purpose registers and the input buffers are
;        left untouched, and no emms is executed here.
;-----------------------------------------------------------------------
align 4
mmx_mix_4:
        mmx_mix4_quarter 0              ; bytes  0..31
        mmx_mix4_quarter 32             ; bytes 32..63
        mmx_mix4_quarter 64             ; bytes 64..95
        mmx_mix4_quarter 96             ; bytes 96..127
        ret
end if

View File

@ -0,0 +1,139 @@
if used mmx128_mix_2
;-----------------------------------------------------------------------
; mmx128_mix_2 - mix two streams: 128 bytes of packed signed 16-bit
;                words, saturating adds (paddsw) on 128-bit registers.
; In:    eax = input stream 1
;        ebx = input stream 2
;        edi = output
; Out:   [edi .. edi+127] = [eax ..] +sat [ebx ..]
; Clobb: xmm0-xmm7. General-purpose registers are left untouched.
; Note:  every memory operand is accessed with movaps or as a paddsw
;        source, so all three pointers must be 16-byte aligned.
;-----------------------------------------------------------------------
align 4
mmx128_mix_2:
        ; hint the block that follows the one being mixed
        prefetcht1 [eax+128]
        prefetcht1 [ebx+128]

        ; load the whole 128-byte block of stream 1 into xmm0..xmm7
irp n, 0,1,2,3,4,5,6,7 {
        movaps  xmm#n, [eax+n*16]
}

        ; add stream 2 and store each 16-byte lane as soon as it is ready
irp n, 0,1,2,3,4,5,6,7 {
        paddsw  xmm#n, [ebx+n*16]
        movaps  [edi+n*16], xmm#n
}
        ret
;-----------------------------------------------------------------------
; mmx128_mix_3 - mix three streams: 128 bytes of packed signed 16-bit
;                words, saturating adds (paddsw) on 128-bit registers.
; In:    eax = input stream 1
;        ebx = input stream 2
;        ecx = input stream 3
;        edi = output
; Out:   [edi .. edi+127] = ([eax ..] +sat [ebx ..]) +sat [ecx ..]
; Clobb: xmm0-xmm7. General-purpose registers are left untouched.
; Note:  every memory operand is accessed with movaps or as a paddsw
;        source, so all four pointers must be 16-byte aligned.
;-----------------------------------------------------------------------
align 4
mmx128_mix_3:
        ; hint the block that follows the one being mixed
        prefetcht1 [eax+128]
        prefetcht1 [ebx+128]
        prefetcht1 [ecx+128]

        ; xmm0..xmm7 = stream 1
irp n, 0,1,2,3,4,5,6,7 {
        movaps  xmm#n, [eax+n*16]
}

        ; += stream 2
irp n, 0,1,2,3,4,5,6,7 {
        paddsw  xmm#n, [ebx+n*16]
}

        ; += stream 3, then store the finished lane
irp n, 0,1,2,3,4,5,6,7 {
        paddsw  xmm#n, [ecx+n*16]
        movaps  [edi+n*16], xmm#n
}
        ret
;-----------------------------------------------------------------------
; mmx128_mix_4 - mix four streams: 128 bytes of packed signed 16-bit
;                words, saturating adds (paddsw) on 128-bit registers.
; In:    eax = input stream 1
;        ebx = input stream 2
;        ecx = input stream 3
;        edx = input stream 4
;        edi = output
; Out:   [edi .. edi+127] =
;            ([eax ..] +sat [ecx ..]) +sat ([ebx ..] +sat [edx ..])
; Clobb: xmm0-xmm7. General-purpose registers are left untouched.
; Note:  every memory operand is accessed with movaps or as a paddsw
;        source, so all five pointers must be 16-byte aligned.
;-----------------------------------------------------------------------
align 4
mmx128_mix_4:
        ; hint the block that follows the one being mixed
        prefetcht1 [eax+128]
        prefetcht1 [ebx+128]
        prefetcht1 [ecx+128]
        prefetcht1 [edx+128]

        ; The block is processed as two 64-byte halves. Within a half,
        ; even registers hold stream 1 + stream 3 and odd registers
        ; hold stream 2 + stream 4; each pair is then summed and stored.
irp base, 0,64 {
        movaps  xmm0, [eax+base]
        movaps  xmm2, [eax+base+16]
        movaps  xmm4, [eax+base+32]
        movaps  xmm6, [eax+base+48]

        movaps  xmm1, [ebx+base]
        movaps  xmm3, [ebx+base+16]
        movaps  xmm5, [ebx+base+32]
        movaps  xmm7, [ebx+base+48]

        paddsw  xmm0, [ecx+base]
        paddsw  xmm2, [ecx+base+16]
        paddsw  xmm4, [ecx+base+32]
        paddsw  xmm6, [ecx+base+48]

        paddsw  xmm1, [edx+base]
        paddsw  xmm3, [edx+base+16]
        paddsw  xmm5, [edx+base+32]
        paddsw  xmm7, [edx+base+48]

        paddsw  xmm0, xmm1
        movaps  [edi+base], xmm0
        paddsw  xmm2, xmm3
        movaps  [edi+base+16], xmm2
        paddsw  xmm4, xmm5
        movaps  [edi+base+32], xmm4
        paddsw  xmm6, xmm7
        movaps  [edi+base+48], xmm6
}
        ret
end if

View File

@ -90,7 +90,7 @@ proc new_mix stdcall, output:dword
.m3:
add [output],512
sub [main_count], 1
dec [main_count]
jnz .l00
call update_stream
@ -324,8 +324,8 @@ align 16
mov [edi], ebx
add edi, 4
add eax, [esp+16]
cmp eax, [esp+24]
add eax, [esp+16]
cmp eax, [esp+24]
jb .l1
mov ebp, esp
@ -382,8 +382,8 @@ align 16
mov [edi], ebx
add edi, 4
add esi, [esp+16]
cmp esi, [esp+24]
add esi, [esp+16]
cmp esi, [esp+24]
jb .l1
mov ebp, esp
@ -622,6 +622,7 @@ proc alloc_mix_buff
ret
endp
align 4
proc m16_s_mmx
movq mm0, [esi]
@ -777,56 +778,59 @@ proc m8_s_mmx
ret
endp
align 4
;-----------------------------------------------------------------------
; mix_2_1 - mix two source streams into one output block.
; In:    output = destination buffer
;        str0   = source stream 1
;        str1   = source stream 2
; The block is produced in four 128-byte steps (4 * 128 = 512 bytes).
; Each step calls the routine whose address is stored in [mix_2_core]
; with edi = destination, eax = stream 1, ebx = stream 2.
; The pointers are kept in registers between the calls, so the selected
; core must leave eax, ebx, esi and edi unchanged.
; Clobb: eax, ebx, esi, edi, plus whatever the selected core clobbers.
;-----------------------------------------------------------------------
proc mix_2_1 stdcall, output:dword, str0:dword, str1:dword

           mov edi, [output]
           mov eax, [str0]
           mov ebx, [str1]
           mov esi, 128                 ;step size in bytes

           call [mix_2_core]            ;edi, eax, ebx

           add edi, esi
           add eax, esi
           add ebx, esi
           call [mix_2_core]            ;edi, eax, ebx

           add edi, esi
           add eax, esi
           add ebx, esi
           call [mix_2_core]            ;edi, eax, ebx

           add edi, esi
           add eax, esi
           add ebx, esi
           call [mix_2_core]            ;edi, eax, ebx
           ret
endp
align 4
;-----------------------------------------------------------------------
; mix_3_1 - mix three source streams into one output block.
; In:    output = destination buffer
;        str0   = source stream 1
;        str1   = source stream 2
;        str2   = source stream 3
; The block is produced in four 128-byte steps (4 * 128 = 512 bytes).
; Each step calls the routine whose address is stored in [mix_3_core]
; with edi = destination, eax/ebx/ecx = streams 1/2/3.
; The pointers are kept in registers between the calls, so the selected
; core must leave eax, ebx, ecx, esi and edi unchanged.
; Clobb: eax, ebx, ecx, esi, edi, plus whatever the selected core
;        clobbers.
;-----------------------------------------------------------------------
proc mix_3_1 stdcall, output:dword, str0:dword, str1:dword, str2:dword

           mov edi, [output]
           mov eax, [str0]
           mov ebx, [str1]
           mov ecx, [str2]
           mov esi, 128                 ;step size in bytes

           call [mix_3_core]            ;edi, eax, ebx, ecx

           add edi, esi
           add eax, esi
           add ebx, esi
           add ecx, esi
           call [mix_3_core]            ;edi, eax, ebx, ecx

           add edi, esi
           add eax, esi
           add ebx, esi
           add ecx, esi
           call [mix_3_core]            ;edi, eax, ebx, ecx

           add edi, esi
           add eax, esi
           add ebx, esi
           add ecx, esi
           call [mix_3_core]            ;edi, eax, ebx, ecx
           ret
endp
@ -839,29 +843,35 @@ proc mix_4_1 stdcall, str0:dword, str1:dword,\
call alloc_mix_buff
and eax, eax
jz .err
mov [output], eax
mov edi, eax
mov eax, [str0]
mov ebx, [str1]
mov ecx, [str2]
mov edx, [str3]
mov esi, 128
call [mix_4_core] ;edi, eax, ebx, ecx, edx
stdcall mix_4_1_mmx, edi, [str0],[str1],[str2],[str3]
add edi, 128
add [str0], 128
add [str1], 128
add [str2], 128
add [str3], 128
stdcall mix_4_1_mmx, edi, [str0],[str1],[str2],[str3]
add edi, 128
add [str0], 128
add [str1], 128
add [str2], 128
add [str3], 128
stdcall mix_4_1_mmx, edi, [str0],[str1],[str2],[str3]
add edi, 128
add [str0], 128
add [str1], 128
add [str2], 128
add [str3], 128
stdcall mix_4_1_mmx, edi, [str0],[str1],[str2],[str3]
add edi, esi
add eax, esi
add ebx, esi
add ecx, esi
add edx, esi
call [mix_4_core] ;edi, eax, ebx, ecx, edx
add edi, esi
add eax, esi
add ebx, esi
add ecx, esi
add edx, esi
call [mix_4_core] ;edi, eax, ebx, ecx, edx
add edi, esi
add eax, esi
add ebx, esi
add ecx, esi
add edx, esi
call [mix_4_core] ;edi, eax, ebx, ecx, edx
mov eax, [output]
ret
.err:
@ -876,318 +886,33 @@ proc final_mix stdcall, output:dword, str0:dword, str1:dword,\
mov edi, [output]
stdcall mix_4_1_mmx, edi, [str0],[str1],[str2],[str3]
add edi, 128
add [str0], 128
add [str1], 128
add [str2], 128
add [str3], 128
stdcall mix_4_1_mmx, edi, [str0],[str1],[str2],[str3]
add edi, 128
add [str0], 128
add [str1], 128
add [str2], 128
add [str3], 128
stdcall mix_4_1_mmx, edi, [str0],[str1],[str2],[str3]
add edi, 128
add [str0], 128
add [str1], 128
add [str2], 128
add [str3], 128
stdcall mix_4_1_mmx, edi, [str0],[str1],[str2],[str3]
ret
endp
align 4
proc mix_2_1_mmx stdcall, output:dword, str0:dword, str1:dword
mov edx, [output]
mov eax, [str0]
mov ecx, [str1]
movq mm0, [eax]
paddsw mm0, [ecx]
movq [edx], mm0
movq mm1, [eax+8]
paddsw mm1,[ecx+8]
movq [edx+8], mm1
movq mm2, [eax+16]
paddsw mm2, [ecx+16]
movq [edx+16], mm2
movq mm3, [eax+24]
paddsw mm3, [ecx+24]
movq [edx+24], mm3
movq mm0, [eax+32]
paddsw mm0, [ecx+32]
movq [edx+32], mm0
movq mm1, [eax+40]
paddsw mm1, [ecx+40]
movq [edx+40], mm1
movq mm2, [eax+48]
paddsw mm2, [ecx+48]
movq [edx+48], mm2
movq mm3, [eax+56]
paddsw mm3, [ecx+56]
movq [edx+56], mm3
movq mm0, [eax+64]
paddsw mm0, [ecx+64]
movq [edx+64], mm0
movq mm1, [eax+72]
paddsw mm1, [ecx+72]
movq [edx+72], mm1
movq mm2, [eax+80]
paddsw mm2, [ecx+80]
movq [edx+80], mm2
movq mm3, [eax+88]
paddsw mm3, [ecx+88]
movq [edx+88], mm3
movq mm0, [eax+96]
paddsw mm0, [ecx+96]
movq [edx+96], mm0
movq mm1, [eax+104]
paddsw mm1, [ecx+104]
movq [edx+104], mm1
movq mm2, [eax+112]
paddsw mm2, [ecx+112]
movq [edx+112], mm2
movq mm3, [eax+120]
paddsw mm3, [ecx+120]
movq [edx+120], mm3
ret
endp
align 4
proc mix_3_1_mmx stdcall, output:dword, str0:dword, str1:dword, str2:dword
mov edx, [output]
mov eax, [str0]
mov ebx, [str1]
mov ecx, [str2]
mov edx, [str3]
mov esi, 128
call [mix_4_core] ;edi, eax, ebx, ecx, edx
movq mm0, [eax]
paddsw mm0, [ebx]
paddsw mm0, [ecx]
movq [edx], mm0
add edi, esi
add eax, esi
add ebx, esi
add ecx, esi
add edx, esi
call [mix_4_core] ;edi, eax, ebx, ecx, edx
movq mm1, [eax+8]
paddsw mm1,[ebx+8]
paddsw mm1,[ecx+8]
movq [edx+8], mm1
movq mm2, [eax+16]
paddsw mm2, [ebx+16]
paddsw mm2, [ecx+16]
movq [edx+16], mm2
movq mm3, [eax+24]
paddsw mm3, [ebx+24]
paddsw mm3, [ecx+24]
movq [edx+24], mm3
movq mm0, [eax+32]
paddsw mm0, [ebx+32]
paddsw mm0, [ecx+32]
movq [edx+32], mm0
movq mm1, [eax+40]
paddsw mm1, [ebx+40]
paddsw mm1, [ecx+40]
movq [edx+40], mm1
movq mm2, [eax+48]
paddsw mm2, [ebx+48]
paddsw mm2, [ecx+48]
movq [edx+48], mm2
movq mm3, [eax+56]
paddsw mm3, [ebx+56]
paddsw mm3, [ecx+56]
movq [edx+56], mm3
movq mm0, [eax+64]
paddsw mm0, [ebx+64]
paddsw mm0, [ecx+64]
movq [edx+64], mm0
movq mm1, [eax+72]
paddsw mm1, [ebx+72]
paddsw mm1, [ecx+72]
movq [edx+72], mm1
movq mm2, [eax+80]
paddsw mm2, [ebx+80]
paddsw mm2, [ecx+80]
movq [edx+80], mm2
movq mm3, [eax+88]
paddsw mm3, [ebx+88]
paddsw mm3, [ecx+88]
movq [edx+88], mm3
movq mm0, [eax+96]
paddsw mm0, [ebx+96]
paddsw mm0, [ecx+96]
movq [edx+96], mm0
movq mm1, [eax+104]
paddsw mm1, [ebx+104]
paddsw mm1, [ecx+104]
movq [edx+104], mm1
movq mm2, [eax+112]
paddsw mm2, [ebx+112]
paddsw mm2, [ecx+112]
movq [edx+112], mm2
movq mm3, [eax+120]
paddsw mm3, [ebx+120]
paddsw mm3, [ecx+120]
movq [edx+120], mm3
ret
endp
align 4
proc mix_4_1_mmx stdcall, output:dword, str0:dword, str1:dword,\
str2:dword, str3:dword
mov edx, [output]
mov esi, [str0]
mov eax, [str1]
mov ebx, [str2]
mov ecx, [str3]
movq mm0, [esi]
movq mm1, [eax]
paddsw mm0, [ebx]
paddsw mm1, [ecx]
paddsw mm0, mm1
movq [edx], mm0
movq mm2, [esi+8]
movq mm3, [eax+8]
paddsw mm2, [ebx+8]
paddsw mm3, [ecx+8]
paddsw mm2, mm3
movq [edx+8], mm2
movq mm0, [esi+16]
movq mm1, [eax+16]
paddsw mm0, [ebx+16]
paddsw mm1, [ecx+16]
paddsw mm0, mm1
movq [edx+16], mm0
movq mm2, [esi+24]
movq mm3, [eax+24]
paddsw mm2, [ebx+24]
paddsw mm3, [ecx+24]
paddsw mm2, mm3
movq [edx+24], mm2
movq mm0, [esi+32]
movq mm1, [eax+32]
paddsw mm0, [ebx+32]
paddsw mm1, [ecx+32]
paddsw mm0, mm1
movq [edx+32], mm0
movq mm2, [esi+40]
movq mm3, [eax+40]
paddsw mm2, [ebx+40]
paddsw mm3, [ecx+40]
paddsw mm2, mm3
movq [edx+40], mm2
movq mm0, [esi+48]
movq mm1, [eax+48]
paddsw mm0, [ebx+48]
paddsw mm1, [ecx+48]
paddsw mm0, mm1
movq [edx+48], mm0
movq mm2, [esi+56]
movq mm3, [eax+56]
paddsw mm2, [ebx+56]
paddsw mm3, [ecx+56]
paddsw mm2, mm3
movq [edx+56], mm2
movq mm0, [esi+64]
movq mm1, [eax+64]
paddsw mm0, [ebx+64]
paddsw mm1, [ecx+64]
paddsw mm0, mm1
movq [edx+64], mm0
movq mm2, [esi+72]
movq mm3, [eax+72]
paddsw mm2, [ebx+72]
paddsw mm3, [ecx+72]
paddsw mm2, mm3
movq [edx+72], mm2
movq mm2, [esi+80]
movq mm3, [eax+80]
paddsw mm2, [ebx+80]
paddsw mm3, [ecx+80]
paddsw mm2, mm3
movq [edx+80], mm2
movq mm2, [esi+88]
movq mm3, [eax+88]
paddsw mm2, [ebx+88]
paddsw mm3, [ecx+88]
paddsw mm2, mm3
movq [edx+88], mm2
movq mm2, [esi+96]
movq mm3, [eax+96]
paddsw mm2, [ebx+96]
paddsw mm3, [ecx+96]
paddsw mm2, mm3
movq [edx+96], mm2
movq mm2, [esi+104]
movq mm3, [eax+104]
paddsw mm2, [ebx+104]
paddsw mm3, [ecx+104]
paddsw mm2, mm3
movq [edx+104], mm2
movq mm2, [esi+112]
movq mm3, [eax+112]
paddsw mm2, [ebx+112]
paddsw mm3, [ecx+112]
paddsw mm2, mm3
movq [edx+112], mm2
movq mm2, [esi+120]
movq mm3, [eax+120]
paddsw mm2, [ebx+120]
paddsw mm3, [ecx+120]
paddsw mm2, mm3
movq [edx+120], mm2
add edi, esi
add eax, esi
add ebx, esi
add ecx, esi
add edx, esi
call [mix_4_core] ;edi, eax, ebx, ecx, edx
add edi, esi
add eax, esi
add ebx, esi
add ecx, esi
add edx, esi
call [mix_4_core] ;edi, eax, ebx, ecx, edx
ret
endp