From 73aa6c6d6abb557e31e6214ff9b7bb4bfbd04d38 Mon Sep 17 00:00:00 2001
From: "Sergey Semyonov (Serge)"
Date: Wed, 24 Jan 2007 15:51:55 +0000
Subject: [PATCH] sse2 mixers

git-svn-id: svn://kolibrios.org@293 a494cfbc-eb01-0410-851d-a64ba20cac60
---
 kernel/trunk/drivers/infinity.asm |  72 +++--
 kernel/trunk/drivers/mix_mmx.inc  | 241 ++++++++++++++++
 kernel/trunk/drivers/mix_sse2.inc | 139 +++++++++
 kernel/trunk/drivers/mixer.asm    | 457 ++++++------------------------
 4 files changed, 524 insertions(+), 385 deletions(-)
 create mode 100644 kernel/trunk/drivers/mix_mmx.inc
 create mode 100644 kernel/trunk/drivers/mix_sse2.inc

diff --git a/kernel/trunk/drivers/infinity.asm b/kernel/trunk/drivers/infinity.asm
index 7b82a07c2d..fd24c81599 100644
--- a/kernel/trunk/drivers/infinity.asm
+++ b/kernel/trunk/drivers/infinity.asm
@@ -19,17 +19,21 @@ include 'proc32.inc'
 include 'main.inc'
 include 'imports.inc'
 
-USE_MMX         equ 0
-USE_MMX_128     equ 0
-USE_SSE         equ 0
+FORCE_MMX         equ 0  ;set to 1 to force use mmx or
+FORCE_MMX_128     equ 0  ;integer sse2 extensions
+                         ;and reduce driver size
+;USE_SSE           equ 0
 
-DEBUG           equ 1
+DEBUG             equ 1
 
-EVENT_NOTIFY    equ 0x00000200
+EVENT_NOTIFY      equ 0x00000200
+
+OS_BASE           equ 0
+new_app_base      equ 0x60400000
+PROC_BASE         equ OS_BASE+0x0080000
+
+CAPS_SSE2         equ 26  ;bit tested in edx after cpuid with eax=1 (SSE2 feature flag)
 
-OS_BASE         equ 0;  0x80400000
-new_app_base    equ 0x60400000;   0x01000000
-PROC_BASE       equ OS_BASE+0x0080000
 
 public START
 public service_proc
@@ -79,6 +83,42 @@ proc START stdcall, state:dword
            mov [str.fd], eax
            mov [str.bk], eax
 
+if FORCE_MMX                               ;build-time choice: 64-bit mmx cores only
+ if FORCE_MMX_128
+  display 'Use only FORCE_MMX or FORCE_MMX_128 not both together',13,10
+  stop
+ end if
+           mov [mix_2_core], mmx_mix_2
+           mov [mix_3_core], mmx_mix_3
+           mov [mix_4_core], mmx_mix_4
+end if
+
+if FORCE_MMX_128                           ;build-time choice: 128-bit sse2 cores only
+ if FORCE_MMX
+  display 'Use only FORCE_MMX or FORCE_MMX_128 not both together',13,10
+  stop
+ end if
+           mov [mix_2_core], mmx128_mix_2
+           mov [mix_3_core], mmx128_mix_3
+           mov [mix_4_core], mmx128_mix_4
+end if
+
+if ~(FORCE_MMX or FORCE_MMX_128)           ;autodetect
+           mov eax, 1                      ;cpuid leaf 1: feature flags
+           cpuid                           ;overwrites eax, ebx, ecx, edx; NOTE(review): confirm START may clobber ebx
+           bt edx, CAPS_SSE2               ;CF = SSE2 feature bit
+           jc .mmx128
+                                           ;old 64-bit mmx
+           mov [mix_2_core], mmx_mix_2
+           mov [mix_3_core], mmx_mix_3
+           mov [mix_4_core], mmx_mix_4
+           jmp @F
+.mmx128:                                   ;new 128-bit sse2 extensions
+           mov [mix_2_core], mmx128_mix_2
+           mov [mix_3_core], mmx128_mix_3
+           mov [mix_4_core], mmx128_mix_4
+@@:
+end if
            stdcall set_handler, [hSound], new_mix
            stdcall RegService, szInfinity, service_proc
            ret
@@ -563,14 +603,8 @@ proc dev_play stdcall, hsrv:dword
 endp
 
 include 'mixer.asm'
-
-;if USE_MMX
-; include 'mix_mmx.inc'
-;end if
-
-if USE_MMX_128
- include 'mix_sse2.inc'
-end if
+include 'mix_mmx.inc'
+include 'mix_sse2.inc'
 
 ;if USE_SSE
 ; include 'mix_sse.inc'
@@ -664,7 +698,7 @@ mix_buff_map rd 1
 str.fd            rd 1
 str.bk            rd 1
 
-mix_2_1.core      rd 1
-mix_3_1.core      rd 1
-mix_4_1.core      rd 1
+mix_2_core        rd 1   ;address of the 2-stream core set in START, called by mix_2_1
+mix_3_core        rd 1   ;address of the 3-stream core set in START, called by mix_3_1
+mix_4_core        rd 1   ;address of the 4-stream core set in START, called by mix_4_1/final_mix
 
diff --git a/kernel/trunk/drivers/mix_mmx.inc b/kernel/trunk/drivers/mix_mmx.inc
new file mode 100644
index 0000000000..9413c142d1
--- /dev/null
+++ b/kernel/trunk/drivers/mix_mmx.inc
@@ -0,0 +1,241 @@
+
+; params (one call mixes one 128-byte block; only mm0-mm7 are modified)
+; edi= output
+; eax= input stream 1
+; ebx= input stream 2 (mmx_mix_3 also reads ecx; mmx_mix_4 also reads ecx and edx)
+
+if used mmx_mix_2                          ;mmx_mix_3 and mmx_mix_4 live inside the same conditional block
+
+align 4
+mmx_mix_2:                                 ;[edi] = [eax] + [ebx], signed saturating 16-bit adds
+           movq mm0, [eax]                 ;first 64 bytes of stream 1
+           movq mm1, [eax+8]
+           movq mm2, [eax+16]
+           movq mm3, [eax+24]
+           movq mm4, [eax+32]
+           movq mm5, [eax+40]
+           movq mm6, [eax+48]
+           movq mm7, [eax+56]
+
+           paddsw mm0, [ebx]               ;add stream 2, then store to output
+           movq [edi], mm0
+           paddsw mm1,[ebx+8]
+           movq [edi+8], mm1
+           paddsw mm2, [ebx+16]
+           movq [edi+16], mm2
+           paddsw mm3, [ebx+24]
+           movq [edi+24], mm3
+           paddsw mm4, [ebx+32]
+           movq [edi+32], mm4
+           paddsw mm5, [ebx+40]
+           movq [edi+40], mm5
+           paddsw mm6, [ebx+48]
+           movq [edi+48], mm6
+           paddsw mm7, [ebx+56]
+           movq [edi+56], mm7
+
+           movq mm0, [eax+64]              ;second 64 bytes of stream 1
+           movq mm1, [eax+72]
+           movq mm2, [eax+80]
+           movq mm3, [eax+88]
+           movq mm4, [eax+96]
+           movq mm5, [eax+104]
+           movq mm6, [eax+112]
+           movq mm7, [eax+120]
+
+           paddsw mm0, [ebx+64]
+           movq [edi+64], mm0
+           paddsw mm1, [ebx+72]
+           movq [edi+72], mm1
+           paddsw mm2, [ebx+80]
+           movq [edi+80], mm2
+           paddsw mm3, [ebx+88]
+           movq [edi+88], mm3
+           paddsw mm4, [ebx+96]
+           movq [edi+96], mm4
+           paddsw mm5, [ebx+104]           ;stream 2 is ebx; ecx is not a parameter of this routine
+           movq [edi+104], mm5             ;output is edi; edx is not a parameter of this routine
+           paddsw mm6, [ebx+112]
+           movq [edi+112], mm6
+           paddsw mm7, [ebx+120]
+           movq [edi+120], mm7
+           ret
+
+align 4
+mmx_mix_3:                                 ;[edi] = [eax] + [ebx] + [ecx], signed saturating 16-bit adds
+           movq mm0, [eax]                 ;first 64 bytes of stream 1
+           movq mm1, [eax+8]
+           movq mm2, [eax+16]
+           movq mm3, [eax+24]
+           movq mm4, [eax+32]
+           movq mm5, [eax+40]
+           movq mm6, [eax+48]
+           movq mm7, [eax+56]
+
+           paddsw mm0, [ebx]               ;add stream 2
+           paddsw mm1, [ebx+8]
+           paddsw mm2, [ebx+16]
+           paddsw mm3, [ebx+24]
+           paddsw mm4, [ebx+32]
+           paddsw mm5, [ebx+40]
+           paddsw mm6, [ebx+48]
+           paddsw mm7, [ebx+56]
+           paddsw mm0, [ecx]               ;add stream 3, then store to output
+           movq [edi], mm0
+           paddsw mm1,[ecx+8]
+           movq [edi+8], mm1
+           paddsw mm2, [ecx+16]
+           movq [edi+16], mm2
+           paddsw mm3, [ecx+24]
+           movq [edi+24], mm3
+           paddsw mm4, [ecx+32]
+           movq [edi+32], mm4
+           paddsw mm5, [ecx+40]
+           movq [edi+40], mm5
+           paddsw mm6, [ecx+48]
+           movq [edi+48], mm6
+           paddsw mm7, [ecx+56]
+           movq [edi+56], mm7
+
+           movq mm0, [eax+64]              ;second 64 bytes of stream 1
+           movq mm1, [eax+72]
+           movq mm2, [eax+80]
+           movq mm3, [eax+88]
+           movq mm4, [eax+96]
+           movq mm5, [eax+104]
+           movq mm6, [eax+112]
+           movq mm7, [eax+120]
+           paddsw mm0, [ebx+64]
+           paddsw mm1, [ebx+72]
+           paddsw mm2, [ebx+80]
+           paddsw mm3, [ebx+88]
+           paddsw mm4, [ebx+96]
+           paddsw mm5, [ebx+104]
+           paddsw mm6, [ebx+112]
+           paddsw mm7, [ebx+120]
+           paddsw mm0, [ecx+64]
+           movq [edi+64], mm0
+           paddsw mm1, [ecx+72]
+           movq [edi+72], mm1
+           paddsw mm2, [ecx+80]
+           movq [edi+80], mm2
+           paddsw mm3, [ecx+88]
+           movq [edi+88], mm3
+           paddsw mm4, [ecx+96]
+           movq [edi+96], mm4
+           paddsw mm5, [ecx+104]
+           movq [edi+104], mm5
+           paddsw mm6, [ecx+112]
+           movq [edi+112], mm6
+           paddsw mm7, [ecx+120]
+           movq [edi+120], mm7
+           ret
+
+align 4
+mmx_mix_4:                                 ;[edi] = ([eax] + [ecx]) + ([ebx] + [edx]), signed saturating 16-bit adds
+
+           movq mm0, [eax]                 ;even regs: stream 1, bytes 0..31
+           movq mm2, [eax+8]
+           movq mm4, [eax+16]
+           movq mm6, [eax+24]
+           movq mm1, [ebx]                 ;odd regs: stream 2, bytes 0..31
+           movq mm3, [ebx+8]
+           movq mm5, [ebx+16]
+           movq mm7, [ebx+24]
+           paddsw mm0, [ecx]               ;even regs += stream 3
+           paddsw mm2, [ecx+8]
+           paddsw mm4, [ecx+16]
+           paddsw mm6, [ecx+24]
+           paddsw mm1, [edx]               ;odd regs += stream 4
+           paddsw mm3, [edx+8]
+           paddsw mm5, [edx+16]
+           paddsw mm7, [edx+24]
+
+           paddsw mm0, mm1                 ;combine each even/odd pair, then store
+           movq [edi], mm0
+           paddsw mm2, mm3
+           movq [edi+8], mm2
+           paddsw mm4, mm5
+           movq [edi+16], mm4
+           paddsw mm6, mm7                 ;pair for offset 24 is mm6/mm7
+           movq [edi+24], mm6
+
+           movq mm0, [eax+32]              ;bytes 32..63, same register roles
+           movq mm2, [eax+40]
+           movq mm4, [eax+48]
+           movq mm6, [eax+56]
+           movq mm1, [ebx+32]
+           movq mm3, [ebx+40]
+           movq mm5, [ebx+48]
+           movq mm7, [ebx+56]
+           paddsw mm0, [ecx+32]
+           paddsw mm2, [ecx+40]
+           paddsw mm4, [ecx+48]
+           paddsw mm6, [ecx+56]
+           paddsw mm1, [edx+32]
+           paddsw mm3, [edx+40]
+           paddsw mm5, [edx+48]
+           paddsw mm7, [edx+56]
+
+           paddsw mm0, mm1
+           movq [edi+32], mm0
+           paddsw mm2, mm3                 ;pair for offset 40 is mm2/mm3
+           movq [edi+40], mm2
+           paddsw mm4, mm5
+           movq [edi+48], mm4
+           paddsw mm6, mm7
+           movq [edi+56], mm6
+
+           movq mm0, [eax+64]              ;bytes 64..95
+           movq mm2, [eax+72]
+           movq mm4, [eax+80]
+           movq mm6, [eax+88]
+           movq mm1, [ebx+64]
+           movq mm3, [ebx+72]
+           movq mm5, [ebx+80]
+           movq mm7, [ebx+88]
+           paddsw mm0, [ecx+64]
+           paddsw mm2, [ecx+72]
+           paddsw mm4, [ecx+80]
+           paddsw mm6, [ecx+88]
+           paddsw mm1, [edx+64]
+           paddsw mm3, [edx+72]
+           paddsw mm5, [edx+80]
+           paddsw mm7, [edx+88]
+
+           paddsw mm0, mm1
+           movq [edi+64], mm0
+           paddsw mm2, mm3
+           movq [edi+72], mm2
+           paddsw mm4, mm5
+           movq [edi+80], mm4
+           paddsw mm6, mm7                 ;pair for offset 88 is mm6/mm7
+           movq [edi+88], mm6              ;store the combined sum, not the partial one in mm7
+
+           movq mm0, [eax+96]              ;bytes 96..127
+           movq mm2, [eax+104]
+           movq mm4, [eax+112]
+           movq mm6, [eax+120]
+           movq mm1, [ebx+96]
+           movq mm3, [ebx+104]
+           movq mm5, [ebx+112]
+           movq mm7, [ebx+120]
+           paddsw mm0, [ecx+96]
+           paddsw mm2, [ecx+104]
+           paddsw mm4, [ecx+112]
+           paddsw mm6, [ecx+120]
+           paddsw mm1, [edx+96]
+           paddsw mm3, [edx+104]
+           paddsw mm5, [edx+112]
+           paddsw mm7, [edx+120]
+           paddsw mm0, mm1
+           movq [edi+96], mm0              ;result goes to the output buffer; eax is input stream 1
+           paddsw mm2, mm3
+           movq [edi+104], mm2
+           paddsw mm4, mm5
+           movq [edi+112], mm4
+           paddsw mm6, mm7
+           movq [edi+120], mm6
+           ret
+
+end if
diff --git a/kernel/trunk/drivers/mix_sse2.inc b/kernel/trunk/drivers/mix_sse2.inc
new file mode 100644
index 0000000000..4ca0a9e63d
--- /dev/null
+++ b/kernel/trunk/drivers/mix_sse2.inc
@@ -0,0 +1,139 @@
+
+if used mmx128_mix_2                       ;mmx128_mix_3 and mmx128_mix_4 live inside the same conditional block
+
+align 4
+mmx128_mix_2:                              ;[edi] = [eax] + [ebx], 128 bytes; movaps/paddsw m128 fault on addresses not 16-byte aligned
+           prefetcht1 [eax+128]            ;hint for the 128 bytes following this block
+           prefetcht1 [ebx+128]
+
+           movaps xmm0, [eax]              ;whole 128-byte block of stream 1
+           movaps xmm1, [eax+16]
+           movaps xmm2, [eax+32]
+           movaps xmm3, [eax+48]
+           movaps xmm4, [eax+64]
+           movaps xmm5, [eax+80]
+           movaps xmm6, [eax+96]
+           movaps xmm7, [eax+112]
+
+           paddsw xmm0, [ebx]              ;signed saturating 16-bit add of stream 2, then store
+           movaps [edi], xmm0
+           paddsw xmm1,[ebx+16]
+           movaps [edi+16], xmm1
+           paddsw xmm2, [ebx+32]
+           movaps [edi+32], xmm2
+           paddsw xmm3, [ebx+48]
+           movaps [edi+48], xmm3
+           paddsw xmm4, [ebx+64]
+           movaps [edi+64], xmm4
+           paddsw xmm5, [ebx+80]
+           movaps [edi+80], xmm5
+           paddsw xmm6, [ebx+96]
+           movaps [edi+96], xmm6
+           paddsw xmm7, [ebx+112]
+           movaps [edi+112], xmm7
+           ret
+
+align 4
+mmx128_mix_3:                              ;[edi] = [eax] + [ebx] + [ecx], 128 bytes, same alignment requirement
+           prefetcht1 [eax+128]
+           prefetcht1 [ebx+128]
+           prefetcht1 [ecx+128]
+
+           movaps xmm0, [eax]              ;stream 1
+           movaps xmm1, [eax+16]
+           movaps xmm2, [eax+32]
+           movaps xmm3, [eax+48]
+           movaps xmm4, [eax+64]
+           movaps xmm5, [eax+80]
+           movaps xmm6, [eax+96]
+           movaps xmm7, [eax+112]
+
+           paddsw xmm0, [ebx]              ;add stream 2
+           paddsw xmm1, [ebx+16]
+           paddsw xmm2, [ebx+32]
+           paddsw xmm3, [ebx+48]
+           paddsw xmm4, [ebx+64]
+           paddsw xmm5, [ebx+80]
+           paddsw xmm6, [ebx+96]
+           paddsw xmm7, [ebx+112]
+
+           paddsw xmm0, [ecx]              ;add stream 3, then store
+           movaps [edi], xmm0
+           paddsw xmm1, [ecx+16]
+           movaps [edi+16], xmm1
+           paddsw xmm2, [ecx+32]
+           movaps [edi+32], xmm2
+           paddsw xmm3, [ecx+48]
+           movaps [edi+48], xmm3
+           paddsw xmm4, [ecx+64]
+           movaps [edi+64], xmm4
+           paddsw xmm5, [ecx+80]
+           movaps [edi+80], xmm5
+           paddsw xmm6, [ecx+96]
+           movaps [edi+96], xmm6
+           paddsw xmm7, [ecx+112]
+           movaps [edi+112], xmm7
+           ret
+
+align 4
+mmx128_mix_4:                              ;[edi] = ([eax] + [ecx]) + ([ebx] + [edx]), 128 bytes, same alignment requirement
+           prefetcht1 [eax+128]
+           prefetcht1 [ebx+128]
+           prefetcht1 [ecx+128]
+           prefetcht1 [edx+128]
+
+           movaps xmm0, [eax]              ;even regs: stream 1, bytes 0..63
+           movaps xmm2, [eax+16]
+           movaps xmm4, [eax+32]
+           movaps xmm6, [eax+48]
+           movaps xmm1, [ebx]              ;odd regs: stream 2, bytes 0..63
+           movaps xmm3, [ebx+16]
+           movaps xmm5, [ebx+32]
+           movaps xmm7, [ebx+48]
+
+           paddsw xmm0, [ecx]              ;even regs += stream 3
+           paddsw xmm2, [ecx+16]
+           paddsw xmm4, [ecx+32]
+           paddsw xmm6, [ecx+48]
+           paddsw xmm1, [edx]              ;odd regs += stream 4
+           paddsw xmm3, [edx+16]
+           paddsw xmm5, [edx+32]
+           paddsw xmm7, [edx+48]
+
+           paddsw xmm0, xmm1               ;combine each even/odd pair, then store
+           movaps [edi], xmm0
+           paddsw xmm2, xmm3
+           movaps [edi+16], xmm2
+           paddsw xmm4, xmm5
+           movaps [edi+32], xmm4
+           paddsw xmm6, xmm7
+           movaps [edi+48], xmm6
+
+           movaps xmm0, [eax+64]           ;bytes 64..127, same register roles
+           movaps xmm2, [eax+80]
+           movaps xmm4, [eax+96]
+           movaps xmm6, [eax+112]
+
+           movaps xmm1, [ebx+64]
+           movaps xmm3, [ebx+80]
+           movaps xmm5, [ebx+96]
+           movaps xmm7, [ebx+112]
+           paddsw xmm0, [ecx+64]
+           paddsw xmm2, [ecx+80]
+           paddsw xmm4, [ecx+96]
+           paddsw xmm6, [ecx+112]
+
+           paddsw xmm1, [edx+64]
+           paddsw xmm3, [edx+80]
+           paddsw xmm5, [edx+96]
+           paddsw xmm7, [edx+112]
+           paddsw xmm0, xmm1
+           movaps [edi+64], xmm0
+           paddsw xmm2, xmm3
+           movaps [edi+80], xmm2
+           paddsw xmm4, xmm5
+           movaps [edi+96], xmm4
+           paddsw xmm6, xmm7
+           movaps [edi+112], xmm6
+           ret
+end if
diff --git a/kernel/trunk/drivers/mixer.asm b/kernel/trunk/drivers/mixer.asm
index f7c1226179..cc70e8908f 100644
--- a/kernel/trunk/drivers/mixer.asm
+++ b/kernel/trunk/drivers/mixer.asm
@@ -90,7 +90,7 @@ proc new_mix stdcall, output:dword
 .m3:
            add [output],512
 
-           sub [main_count], 1
+           dec [main_count]                ;ZF drives the jnz below, same as the former sub
            jnz .l00
 
            call update_stream
@@ -324,8 +324,8 @@ align 16
            mov [edi], ebx
            add edi, 4
 
-           add  eax, [esp+16]
-           cmp  eax, [esp+24]
+           add eax, [esp+16]
+           cmp eax, [esp+24]
            jb .l1
 
            mov ebp, esp
@@ -382,8 +382,8 @@ align 16
            mov [edi], ebx
            add edi, 4
 
-           add  esi, [esp+16]
-           cmp  esi, [esp+24]
+           add esi, [esp+16]
+           cmp esi, [esp+24]
            jb .l1
 
            mov ebp, esp
@@ -622,6 +622,7 @@ proc alloc_mix_buff
            ret
 endp
 
+align 4
 proc m16_s_mmx
 
            movq mm0, [esi]
@@ -777,56 +778,59 @@ proc m8_s_mmx
            ret
 endp
 
-
 align 4
 proc mix_2_1 stdcall, output:dword, str0:dword, str1:dword
 
            mov edi, [output]
+           mov eax, [str0]
+           mov ebx, [str1]
+           mov esi, 128                    ;bytes mixed by one core call; 4 calls = 512 bytes
+           call [mix_2_core]   ;edi, eax, ebx
 
-           stdcall mix_2_1_mmx, edi, [str0],[str1]
-;          stdcall mix_2_1_sse, edi, [str0],[str1]
-           add edi, 128
-           add [str0], 128
-           add [str1], 128
-           stdcall mix_2_1_mmx, edi, [str0],[str1]
-;          stdcall mix_2_1_sse, edi, [str0],[str1]
-           add edi, 128
-           add [str0], 128
-           add [str1], 128
-           stdcall mix_2_1_mmx, edi, [str0],[str1]
-;          stdcall mix_2_1_sse, edi, [str0],[str1]
-           add edi, 128
-           add [str0], 128
-           add [str1], 128
-           stdcall mix_2_1_mmx, edi, [str0],[str1]
-;          stdcall mix_2_1_sse, edi, [str0],[str1]
+           add edi, esi                    ;the cores leave the pointers untouched, so step them here
+           add eax, esi
+           add ebx, esi
+           call [mix_2_core]   ;edi, eax, ebx
 
+           add edi, esi
+           add eax, esi
+           add ebx, esi
+           call [mix_2_core]   ;edi, eax, ebx
+
+           add edi, esi
+           add eax, esi
+           add ebx, esi
+           call [mix_2_core]   ;edi, eax, ebx
            ret
 endp
 
-
 align 4
 proc mix_3_1 stdcall, output:dword, str0:dword, str1:dword, str2:dword
 
            mov edi, [output]
+           mov eax, [str0]
+           mov ebx, [str1]
+           mov ecx, [str2]
+           mov esi, 128                    ;bytes mixed by one core call; 4 calls = 512 bytes
+           call [mix_3_core]               ;edi, eax, ebx, ecx
 
-           stdcall mix_3_1_mmx, edi, [str0],[str1],[str2]
-           add edi, 128
-           add [str0], 128
-           add [str1], 128
-           add [str2], 128
-           stdcall mix_3_1_mmx, edi, [str0],[str1],[str2]
-           add edi, 128
-           add [str0], 128
-           add [str1], 128
-           add [str2], 128
-           stdcall mix_3_1_mmx, edi, [str0],[str1],[str2]
-           add edi, 128
-           add [str0], 128
-           add [str1], 128
-           add [str2], 128
-           stdcall mix_3_1_mmx, edi, [str0],[str1],[str2]
+           add edi, esi
+           add eax, esi
+           add ebx, esi
+           add ecx, esi
+           call [mix_3_core]
 
+           add edi, esi
+           add eax, esi
+           add ebx, esi
+           add ecx, esi
+           call [mix_3_core]
+
+           add edi, esi
+           add eax, esi
+           add ebx, esi
+           add ecx, esi
+           call [mix_3_core]
            ret
 endp
 
@@ -839,29 +843,35 @@ proc mix_4_1 stdcall, str0:dword, str1:dword,\
            call alloc_mix_buff
            and eax, eax
            jz .err
-
            mov [output], eax
            mov edi, eax
+           mov eax, [str0]
+           mov ebx, [str1]
+           mov ecx, [str2]
+           mov edx, [str3]
+           mov esi, 128                    ;bytes mixed by one core call; 4 calls = 512 bytes
+           call [mix_4_core]  ;edi, eax, ebx, ecx, edx
 
-           stdcall mix_4_1_mmx, edi, [str0],[str1],[str2],[str3]
-           add edi, 128
-           add [str0], 128
-           add [str1], 128
-           add [str2], 128
-           add [str3], 128
-           stdcall mix_4_1_mmx, edi, [str0],[str1],[str2],[str3]
-           add edi, 128
-           add [str0], 128
-           add [str1], 128
-           add [str2], 128
-           add [str3], 128
-           stdcall mix_4_1_mmx, edi, [str0],[str1],[str2],[str3]
-           add edi, 128
-           add [str0], 128
-           add [str1], 128
-           add [str2], 128
-           add [str3], 128
-           stdcall mix_4_1_mmx, edi, [str0],[str1],[str2],[str3]
+           add edi, esi
+           add eax, esi
+           add ebx, esi
+           add ecx, esi
+           add edx, esi
+           call [mix_4_core]  ;edi, eax, ebx, ecx, edx
+
+           add edi, esi
+           add eax, esi
+           add ebx, esi
+           add ecx, esi
+           add edx, esi
+           call [mix_4_core]  ;edi, eax, ebx, ecx, edx
+
+           add edi, esi
+           add eax, esi
+           add ebx, esi
+           add ecx, esi
+           add edx, esi
+           call [mix_4_core]  ;edi, eax, ebx, ecx, edx
            mov eax, [output]
            ret
 .err:
@@ -876,318 +886,33 @@ proc final_mix stdcall, output:dword, str0:dword, str1:dword,\
 
            mov edi, [output]
 
-           stdcall mix_4_1_mmx, edi, [str0],[str1],[str2],[str3]
-           add edi, 128
-           add [str0], 128
-           add [str1], 128
-           add [str2], 128
-           add [str3], 128
-           stdcall mix_4_1_mmx, edi, [str0],[str1],[str2],[str3]
-           add edi, 128
-           add [str0], 128
-           add [str1], 128
-           add [str2], 128
-           add [str3], 128
-           stdcall mix_4_1_mmx, edi, [str0],[str1],[str2],[str3]
-           add edi, 128
-           add [str0], 128
-           add [str1], 128
-           add [str2], 128
-           add [str3], 128
-           stdcall mix_4_1_mmx, edi, [str0],[str1],[str2],[str3]
-
-           ret
-endp
-
-align 4
-proc mix_2_1_mmx stdcall, output:dword, str0:dword, str1:dword
-
-           mov edx, [output]
-           mov eax, [str0]
-           mov ecx, [str1]
-
-           movq mm0, [eax]
-           paddsw mm0, [ecx]
-           movq [edx], mm0
-
-           movq mm1, [eax+8]
-           paddsw mm1,[ecx+8]
-           movq [edx+8], mm1
-
-           movq mm2, [eax+16]
-           paddsw mm2, [ecx+16]
-           movq [edx+16], mm2
-
-           movq mm3, [eax+24]
-           paddsw mm3, [ecx+24]
-           movq [edx+24], mm3
-
-           movq mm0, [eax+32]
-           paddsw mm0, [ecx+32]
-           movq [edx+32], mm0
-
-           movq mm1, [eax+40]
-           paddsw mm1, [ecx+40]
-           movq [edx+40], mm1
-
-           movq mm2, [eax+48]
-           paddsw mm2, [ecx+48]
-           movq [edx+48], mm2
-
-           movq mm3, [eax+56]
-           paddsw mm3, [ecx+56]
-           movq [edx+56], mm3
-
-           movq mm0, [eax+64]
-           paddsw mm0, [ecx+64]
-           movq [edx+64], mm0
-
-           movq mm1, [eax+72]
-           paddsw mm1, [ecx+72]
-           movq [edx+72], mm1
-
-           movq mm2, [eax+80]
-           paddsw mm2, [ecx+80]
-           movq [edx+80], mm2
-
-           movq mm3, [eax+88]
-           paddsw mm3, [ecx+88]
-           movq [edx+88], mm3
-
-           movq mm0, [eax+96]
-           paddsw mm0, [ecx+96]
-           movq [edx+96], mm0
-
-           movq mm1, [eax+104]
-           paddsw mm1, [ecx+104]
-           movq [edx+104], mm1
-
-           movq mm2, [eax+112]
-           paddsw mm2, [ecx+112]
-           movq [edx+112], mm2
-
-           movq mm3, [eax+120]
-           paddsw mm3, [ecx+120]
-           movq [edx+120], mm3
-
-           ret
-endp
-
-
-
-align 4
-proc mix_3_1_mmx stdcall, output:dword, str0:dword, str1:dword, str2:dword
-
-           mov edx, [output]
            mov eax, [str0]
            mov ebx, [str1]
            mov ecx, [str2]
+           mov edx, [str3]
+           mov esi, 128                    ;bytes mixed by one core call; 4 calls = 512 bytes
+           call [mix_4_core]  ;edi, eax, ebx, ecx, edx
 
-           movq mm0, [eax]
-           paddsw mm0, [ebx]
-           paddsw mm0, [ecx]
-           movq [edx], mm0
+           add edi, esi
+           add eax, esi
+           add ebx, esi
+           add ecx, esi
+           add edx, esi
+           call [mix_4_core]  ;edi, eax, ebx, ecx, edx
 
-           movq mm1, [eax+8]
-           paddsw mm1,[ebx+8]
-           paddsw mm1,[ecx+8]
-           movq [edx+8], mm1
-
-           movq mm2, [eax+16]
-           paddsw mm2, [ebx+16]
-           paddsw mm2, [ecx+16]
-           movq [edx+16], mm2
-
-           movq mm3, [eax+24]
-           paddsw mm3, [ebx+24]
-           paddsw mm3, [ecx+24]
-           movq [edx+24], mm3
-
-           movq mm0, [eax+32]
-           paddsw mm0, [ebx+32]
-           paddsw mm0, [ecx+32]
-           movq [edx+32], mm0
-
-           movq mm1, [eax+40]
-           paddsw mm1, [ebx+40]
-           paddsw mm1, [ecx+40]
-           movq [edx+40], mm1
-
-           movq mm2, [eax+48]
-           paddsw mm2, [ebx+48]
-           paddsw mm2, [ecx+48]
-           movq [edx+48], mm2
-
-           movq mm3, [eax+56]
-           paddsw mm3, [ebx+56]
-           paddsw mm3, [ecx+56]
-           movq [edx+56], mm3
-
-           movq mm0, [eax+64]
-           paddsw mm0, [ebx+64]
-           paddsw mm0, [ecx+64]
-           movq [edx+64], mm0
-
-           movq mm1, [eax+72]
-           paddsw mm1, [ebx+72]
-           paddsw mm1, [ecx+72]
-           movq [edx+72], mm1
-
-           movq mm2, [eax+80]
-           paddsw mm2, [ebx+80]
-           paddsw mm2, [ecx+80]
-           movq [edx+80], mm2
-
-           movq mm3, [eax+88]
-           paddsw mm3, [ebx+88]
-           paddsw mm3, [ecx+88]
-           movq [edx+88], mm3
-
-           movq mm0, [eax+96]
-           paddsw mm0, [ebx+96]
-           paddsw mm0, [ecx+96]
-           movq [edx+96], mm0
-
-           movq mm1, [eax+104]
-           paddsw mm1, [ebx+104]
-           paddsw mm1, [ecx+104]
-           movq [edx+104], mm1
-
-           movq mm2, [eax+112]
-           paddsw mm2, [ebx+112]
-           paddsw mm2, [ecx+112]
-           movq [edx+112], mm2
-
-           movq mm3, [eax+120]
-           paddsw mm3, [ebx+120]
-           paddsw mm3, [ecx+120]
-           movq [edx+120], mm3
-
-           ret
-endp
-
-align 4
-proc mix_4_1_mmx stdcall, output:dword, str0:dword, str1:dword,\
-                      str2:dword, str3:dword
-
-           mov edx, [output]
-           mov esi, [str0]
-           mov eax, [str1]
-           mov ebx, [str2]
-           mov ecx, [str3]
-
-           movq mm0, [esi]
-           movq mm1, [eax]
-           paddsw mm0, [ebx]
-           paddsw mm1, [ecx]
-           paddsw mm0, mm1
-           movq [edx], mm0
-
-           movq mm2, [esi+8]
-           movq mm3, [eax+8]
-           paddsw mm2, [ebx+8]
-           paddsw mm3, [ecx+8]
-           paddsw mm2, mm3
-           movq [edx+8], mm2
-
-           movq mm0, [esi+16]
-           movq mm1, [eax+16]
-           paddsw mm0, [ebx+16]
-           paddsw mm1, [ecx+16]
-           paddsw mm0, mm1
-           movq [edx+16], mm0
-
-           movq mm2, [esi+24]
-           movq mm3, [eax+24]
-           paddsw mm2, [ebx+24]
-           paddsw mm3, [ecx+24]
-           paddsw mm2, mm3
-           movq [edx+24], mm2
-
-           movq mm0, [esi+32]
-           movq mm1, [eax+32]
-           paddsw mm0, [ebx+32]
-           paddsw mm1, [ecx+32]
-           paddsw mm0, mm1
-           movq [edx+32], mm0
-
-           movq mm2, [esi+40]
-           movq mm3, [eax+40]
-           paddsw mm2, [ebx+40]
-           paddsw mm3, [ecx+40]
-           paddsw mm2, mm3
-           movq [edx+40], mm2
-
-           movq mm0, [esi+48]
-           movq mm1, [eax+48]
-           paddsw mm0, [ebx+48]
-           paddsw mm1, [ecx+48]
-           paddsw mm0, mm1
-           movq [edx+48], mm0
-
-           movq mm2, [esi+56]
-           movq mm3, [eax+56]
-           paddsw mm2, [ebx+56]
-           paddsw mm3, [ecx+56]
-           paddsw mm2, mm3
-           movq [edx+56], mm2
-
-           movq mm0, [esi+64]
-           movq mm1, [eax+64]
-           paddsw mm0, [ebx+64]
-           paddsw mm1, [ecx+64]
-           paddsw mm0, mm1
-           movq [edx+64], mm0
-
-           movq mm2, [esi+72]
-           movq mm3, [eax+72]
-           paddsw mm2, [ebx+72]
-           paddsw mm3, [ecx+72]
-           paddsw mm2, mm3
-           movq [edx+72], mm2
-
-           movq mm2, [esi+80]
-           movq mm3, [eax+80]
-           paddsw mm2, [ebx+80]
-           paddsw mm3, [ecx+80]
-           paddsw mm2, mm3
-           movq [edx+80], mm2
-
-           movq mm2, [esi+88]
-           movq mm3, [eax+88]
-           paddsw mm2, [ebx+88]
-           paddsw mm3, [ecx+88]
-           paddsw mm2, mm3
-           movq [edx+88], mm2
-
-           movq mm2, [esi+96]
-           movq mm3, [eax+96]
-           paddsw mm2, [ebx+96]
-           paddsw mm3, [ecx+96]
-           paddsw mm2, mm3
-           movq [edx+96], mm2
-
-           movq mm2, [esi+104]
-           movq mm3, [eax+104]
-           paddsw mm2, [ebx+104]
-           paddsw mm3, [ecx+104]
-           paddsw mm2, mm3
-           movq [edx+104], mm2
-
-           movq mm2, [esi+112]
-           movq mm3, [eax+112]
-           paddsw mm2, [ebx+112]
-           paddsw mm3, [ecx+112]
-           paddsw mm2, mm3
-           movq [edx+112], mm2
-
-           movq mm2, [esi+120]
-           movq mm3, [eax+120]
-           paddsw mm2, [ebx+120]
-           paddsw mm3, [ecx+120]
-           paddsw mm2, mm3
-           movq [edx+120], mm2
+           add edi, esi
+           add eax, esi
+           add ebx, esi
+           add ecx, esi
+           add edx, esi
+           call [mix_4_core]  ;edi, eax, ebx, ecx, edx
 
+           add edi, esi
+           add eax, esi
+           add ebx, esi
+           add ecx, esi
+           add edx, esi
+           call [mix_4_core]  ;edi, eax, ebx, ecx, edx
            ret
 endp
 