; adler32.asm -- compute the Adler-32 checksum of a data stream
; Copyright (C) 1995-2011 Mark Adler
; For conditions of distribution and use, see copyright notice in zlib.h
;
; Syntax: flat assembler (FASM), 32-bit x86, using the proc/locals/stdcall
; macro package. Z_NULL and (optionally) NO_DIVIDE are expected to be defined
; by the including file.

BASE equ 65521 ;largest prime smaller than 65536
NMAX equ 5552
; NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1

; ---------------------------------------------------------------------------
; DO1..DO16 -- accumulate 1..16 bytes starting at [buf+i].
; They operate on the caller's running sums:
;   [adler] = sum1 (byte sum), edi = sum2 (sum of the sum1 values).
; Clobbers: eax, flags.
; ---------------------------------------------------------------------------
macro DO1 buf,i
{
	movzx eax,byte[buf+i]
	add [adler],eax
	add edi,[adler]
}
macro DO2 buf,i
{
	DO1 buf,i
	DO1 buf,i+1
}
macro DO4 buf,i
{
	DO2 buf,i
	DO2 buf,i+2
}
macro DO8 buf,i
{
	DO4 buf,i
	DO4 buf,i+4
}
macro DO16 buf
{
	DO8 buf,0
	DO8 buf,8
}

; use NO_DIVIDE if your processor does not do division in hardware --
; try it both ways to see which is faster
; note that this assumes BASE is 65521, where 65536 % 65521 == 15
; (thank you to John Reiser for pointing this out)

; CHOP a -- one folding step: a = (a & 0xffff) + 15*(a >> 16).
; Emits code only when NO_DIVIDE eq 1. Clobbers eax, flags.
macro CHOP a
{
if NO_DIVIDE eq 1
	mov eax,a
	shr eax,16
	and a,0xffff
	shl eax,4
	add a,eax
	shr eax,4
	sub a,eax
end if
}

; MOD28 a -- reduce a modulo BASE.
; NO_DIVIDE path: one CHOP followed by one conditional subtraction
;   (clobbers eax, flags).
; Division path: eax/ecx/edx are saved and restored around the div, so
;   'a' must not be eax, ecx or edx -- the result would be overwritten by
;   the final pop.
macro MOD28 a
{
if NO_DIVIDE eq 1
local .end0
	CHOP a
	cmp a,BASE
	jb .end0 ;if (..>=..) -- unsigned compare, a is an unsigned sum
		sub a,BASE
	.end0:
else
	push eax ecx edx
	mov eax,a
	xor edx,edx ;div takes edx:eax as the dividend
	mov ecx,BASE
	div ecx
	mov a,edx ;remainder
	pop edx ecx eax
end if
}

; MOD a -- reduce a full 32-bit value modulo BASE.
; Same clobbers and operand restriction as MOD28.
macro MOD a
{
if NO_DIVIDE eq 1
	CHOP a
	MOD28 a
else
	push eax ecx edx
	mov eax,a
	xor edx,edx ;div takes edx:eax as the dividend
	mov ecx,BASE
	div ecx
	mov a,edx ;remainder
	pop edx ecx eax
end if
}

; MOD63 a -- reduce a modulo BASE.
; NOTE: the NO_DIVIDE branch below is only commented-out C and emits no
; code, so under NO_DIVIDE this macro leaves 'a' unchanged. The division
; branch works on a 32-bit operand, with the same operand restriction as
; MOD28.
macro MOD63 a
{
if NO_DIVIDE eq 1
;this assumes a is not negative
;	z_off64_t tmp = a >> 32;
;	a &= 0xffffffff;
;	a += (tmp << 8) - (tmp << 5) + tmp;
;	tmp = a >> 16;
;	a &= 0xffff;
;	a += (tmp << 4) - tmp;
;	tmp = a >> 16;
;	a &= 0xffff;
;	a += (tmp << 4) - tmp;
;	if (a >= BASE) a -= BASE;
else
	push eax ecx edx
	mov eax,a
	xor edx,edx ;div takes edx:eax as the dividend
	mov ecx,BASE
	div ecx
	mov a,edx ;remainder
	pop edx ecx eax
end if
}

; =========================================================================
;uLong (uLong adler, const Bytef *buf, uInt len)
;
; Updates a running Adler-32 checksum with len bytes from buf.
; In:  adler = current checksum (sum2 in the high 16 bits, sum1 in the low)
;      buf   = pointer to the data; Z_NULL requests the initial value
;      len   = number of bytes
; Out: eax   = updated checksum (1 when buf == Z_NULL and len != 1)
; Registers inside the procedure:
;      [adler] = sum1, edi = sum2, ebx = read pointer,
;      ecx = bytes remaining, edx = 16-byte group counter, eax = scratch
; ebx, ecx, edx and edi are preserved for the caller.
; The len == 1 fast path reads [buf] before the Z_NULL test, so buf must be
; readable in that case.
align 16
proc adler32 uses ebx ecx edx edi, adler:dword, buf:dword, len:dword
	; split Adler-32 into component sums
	mov edi,[adler]
	shr edi,16
	and dword[adler],0xffff
	mov ebx,[buf]
	mov ecx,[len]

	; in case user likes doing a byte at a time, keep it fast
	cmp ecx,1
	jne .end0 ;if (..==..)
		movzx eax,byte[ebx]
		add [adler],eax
		cmp dword[adler],BASE
		jb @f ;if (..>=..)
			sub dword[adler],BASE
		@@:
		add edi,[adler]
		cmp edi,BASE
		jb .combine ;if (..>=..) -- only subtract when sum2 reached BASE
			sub edi,BASE
		jmp .combine
align 4
	.end0:

	; initial Adler-32 value (deferred check for len == 1 speed)
	cmp ebx,Z_NULL
	jne @f ;if (..==0)
		xor eax,eax
		inc eax ;return 1
		jmp .end_f
align 4
	@@:

	; in case short lengths are provided, keep it somewhat fast
	cmp ecx,16
	jae .cycle3 ;if (..<..)
		.cycle0:
			test ecx,ecx
			jz @f ;while (len--)
			movzx eax,byte[ebx]
			add [adler],eax
			inc ebx
			add edi,[adler]
			dec ecx
			jmp .cycle0
align 4
		@@:
		cmp dword[adler],BASE
		jb @f ;if (..>=..)
			sub dword[adler],BASE
		@@:
		MOD28 edi ;only added so many BASE's
		jmp .combine

	; do length NMAX blocks -- requires just one modulo operation
align 4
	.cycle3:
	cmp ecx,NMAX
	jb .cycle3end ;while (..>=..)
		sub ecx,NMAX
		mov edx,NMAX/16 ;NMAX is divisible by 16
		.cycle1: ;do
			DO16 ebx ;16 sums unrolled
			add ebx,16
			dec edx
			jne .cycle1 ;while (..)
		MOD [adler]
		MOD edi
		jmp .cycle3
align 4
	.cycle3end:

	; do remaining bytes (less than NMAX, still just one modulo)
	test ecx,ecx
	jz .combine ;if (..) ;avoid modulos if none remaining
		@@:
		cmp ecx,16
		jb .cycle2 ;while (..>=..)
			sub ecx,16
			DO16 ebx
			add ebx,16
			jmp @b
align 4
		.cycle2:
			test ecx,ecx
			jz @f ;while (len--)
			movzx eax,byte[ebx]
			add [adler],eax
			inc ebx
			add edi,[adler]
			dec ecx
			jmp .cycle2
align 4
		@@:
		MOD [adler]
		MOD edi

	; return recombined sums
.combine:
	mov eax,edi
	shl eax,16
	or eax,[adler]
.end_f:
	ret
endp

; =========================================================================
;uLong (uLong adler1, uLong adler2, z_off64_t len2)
;
; Combines two Adler-32 checksums into the checksum of the concatenated
; data, given the length of the second data block.
; In:  adler1 = checksum of the first block
;      adler2 = checksum of the second block
;      len2   = length of the second block (handled as a signed dword here)
; Out: eax    = combined checksum, or 0xffffffff when len2 is negative
; ecx and edx are preserved for the caller.
align 4
proc adler32_combine_ uses ecx edx, adler1:dword, adler2:dword, len2:dword
locals
	sum1 dd ? ;uLong
	sum2 dd ? ;uLong
	rem_len dd ? ;unsigned, len2 % BASE
endl
	; for negative len, return invalid adler32 as a clue for debugging
	cmp dword[len2],0
	jge @f ;if (..<0)
		mov eax,0xffffffff
		jmp .end_f
	@@:

	; the derivation of this formula is left as an exercise for the reader
	mov ecx,BASE ;divisor for both reductions below

	; rem = (unsigned)(len2 % BASE)
	mov eax,[len2]
	xor edx,edx ;div takes edx:eax as the dividend
	div ecx
	mov [rem_len],edx

	; sum1 = adler1 & 0xffff
	mov eax,[adler1]
	and eax,0xffff
	mov [sum1],eax

	; sum2 = (rem * sum1) % BASE
	; both factors are below 2^16, so the product fits in eax and mul
	; leaves edx = 0, which is exactly what the following div needs
	mul dword[rem_len]
	div ecx
	mov [sum2],edx

	; sum1 += (adler2 & 0xffff) + BASE - 1
	mov eax,[adler2]
	and eax,0xffff
	add eax,BASE-1
	add [sum1],eax

	; sum2 += ((adler1 >> 16) & 0xffff) + ((adler2 >> 16) & 0xffff) + BASE - rem
	; (a logical shift right by 16 already leaves only 16 significant bits)
	mov eax,[adler1]
	shr eax,16
	mov edx,[adler2]
	shr edx,16
	add eax,edx
	add eax,BASE
	sub eax,[rem_len]
	add [sum2],eax

	; final reductions -- unsigned compares, the sums are unsigned
	cmp dword[sum1],BASE
	jb @f ;if (..>=..)
		sub dword[sum1],BASE
	@@:
	cmp dword[sum1],BASE
	jb @f ;if (..>=..)
		sub dword[sum1],BASE
	@@:
	cmp dword[sum2],BASE shl 1
	jb @f ;if (..>=..)
		sub dword[sum2],BASE shl 1
	@@:
	cmp dword[sum2],BASE
	jb @f ;if (..>=..)
		sub dword[sum2],BASE
	@@:

	; return sum1 | (sum2 << 16)
	mov eax,[sum2]
	shl eax,16
	or eax,[sum1]
.end_f:
	ret
endp

; =========================================================================
;uLong (adler1, adler2, len2)
;	uLong adler1
;	uLong adler2
;	z_off_t len2
;
; Forwards its three arguments unchanged to adler32_combine_.
align 4
proc adler32_combine, adler1:dword, adler2:dword, len2:dword
	stdcall adler32_combine_, [adler1], [adler2], [len2]
	ret
endp

;uLong (adler1, adler2, len2)
;	uLong adler1
;	uLong adler2
;	z_off64_t len2
;
; Forwards its three arguments unchanged to adler32_combine_.
; Note that len2 is declared and passed as a single dword here.
align 4
proc adler32_combine64, adler1:dword, adler2:dword, len2:dword
	stdcall adler32_combine_, [adler1], [adler2], [len2]
	ret
endp