IgorA 88bb295c51 small fixes & optimize
git-svn-id: svn://kolibrios.org@6851 a494cfbc-eb01-0410-851d-a64ba20cac60
2017-02-01 17:23:05 +00:00

310 lines
5.6 KiB
NASM

; adler32.asm -- compute the Adler-32 checksum of a data stream
; Copyright (C) 1995-2011 Mark Adler
; For conditions of distribution and use, see copyright notice in zlib.h
; Modulus applied to both running sums of the checksum.
BASE equ 65521 ;largest prime smaller than 65536
; Number of bytes adler32 accumulates between two modulo reductions.
; The block loop unrolls 16 bytes at a time and relies on NMAX being a
; multiple of 16 (it uses NMAX/16 as its iteration count).
NMAX equ 5552
; NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
; DO1 buf,i -- fold one input byte into both running sums:
;     adler += byte[buf+i]
;     sum2  += adler
; buf : 32-bit operand holding the data address (adler32 passes ebx)
; i   : constant byte offset
; Expects dword variables [adler] and [sum2] to be in scope at the point
; of use. Clobbers eax (left holding the new value of [adler]) and flags.
macro DO1 buf,i
{
	mov eax,buf
	movzx eax,byte[eax+i]	; eax = next data byte
	add eax,[adler]		; eax = adler + byte
	mov [adler],eax
	add [sum2],eax		; sum2 += updated adler
}
; Unrolling helpers built on DO1. DOn buf,i applies DO1 to the n
; consecutive bytes buf+i .. buf+i+n-1, in ascending order.
; Clobbers: the same as DO1 (eax and flags).
macro DO2 buf,i
{
	DO1 buf,i
	DO1 buf,i+1
}
macro DO4 buf,i
{
	DO1 buf,i
	DO1 buf,i+1
	DO1 buf,i+2
	DO1 buf,i+3
}
macro DO8 buf,i
{
	DO4 buf,i
	DO4 buf,i+4
}
; DO16 buf -- process the 16 bytes starting at buf (offsets 0..15)
macro DO16 buf
{
	DO4 buf,0
	DO4 buf,4
	DO4 buf,8
	DO4 buf,12
}
; Define NO_DIVIDE as 1 if your processor does not do division in
; hardware -- try it both ways to see which is faster.
; The shift-and-add reduction assumes BASE is 65521, where
; 65536 % 65521 == 15 (thank you to John Reiser for pointing this out).
;
; CHOP a -- one folding step:  a = (a & 0xffff) + 15*(a >> 16)
; a : dword operand (adler32 passes memory variables); it must not be
;     eax or be addressed through eax, because eax is the scratch register.
; Clobbers eax (left holding the original a >> 16) and flags.
; Expands to nothing unless NO_DIVIDE is 1.
macro CHOP a
{
if NO_DIVIDE eq 1
	mov eax,a
	and a,0xffff	; keep the low half in place
	shr eax,16	; eax = tmp = high half
	shl eax,4
	add a,eax	; a += tmp*16
	shr eax,4
	sub a,eax	; a -= tmp, net effect a += tmp*15
end if
}
; MOD28 a -- reduce a modulo BASE; the cheap variant for values that only
; accumulated a limited number of BASE's.
; NO_DIVIDE = 1 : one CHOP fold followed by a single conditional
;                 subtraction of BASE; clobbers eax and flags.
; otherwise     : a = a mod BASE by hardware division; eax, ecx and edx
;                 are saved on the stack and restored, flags are clobbered.
; a : dword operand. Because of the push/pop and scratch registers it must
;     not be eax/ecx/edx nor be addressed through them or through esp.
macro MOD28 a
{
if NO_DIVIDE eq 1
	local .reduced
	CHOP a
	; CHOP leaves a below 2^20, so the signed compare is safe here
	cmp a,BASE
	jl .reduced	;if (a >= BASE)
	sub a,BASE
.reduced:
else
	push eax
	push ecx
	push edx
	mov eax,a
	mov ecx,BASE
	xor edx,edx	; dividend is edx:eax = 0:a
	div ecx
	mov a,edx	; keep the remainder
	pop edx
	pop ecx
	pop eax
end if
}
; MOD a -- reduce any 32-bit value a modulo BASE.
; NO_DIVIDE = 1 : a CHOP fold, then MOD28 (a second fold plus one
;                 conditional subtraction); clobbers eax and flags.
; otherwise     : a = a mod BASE by hardware division; eax, ecx and edx
;                 are saved on the stack and restored, flags are clobbered.
; a : dword operand; must not be eax/ecx/edx nor be addressed through
;     them or through esp.
macro MOD a
{
if NO_DIVIDE eq 1
	CHOP a
	MOD28 a
else
	push eax
	push ecx
	push edx
	mov eax,a
	mov ecx,BASE
	xor edx,edx	; dividend is edx:eax = 0:a
	div ecx
	mov a,edx	; keep the remainder
	pop edx
	pop ecx
	pop eax
end if
}
; MOD63 a -- reduce a non-negative length a modulo BASE.
; In zlib's C source this macro handles a 64-bit z_off64_t. Here the
; operand is moved through eax, i.e. it is a 32-bit dword, so the
; "tmp = a >> 32" folding step of the C version is always zero and only
; the two 16-bit folds remain:
;     tmp = a >> 16;  a &= 0xffff;  a += (tmp << 4) - tmp;   (twice)
;     if (a >= BASE) a -= BASE;
; NO_DIVIDE = 1 : shift-and-add reduction as above; clobbers eax, flags.
;                 (Previously this branch expanded to nothing and left a
;                 unreduced.)
; otherwise     : a = a mod BASE by hardware division; eax, ecx and edx
;                 are saved on the stack and restored, flags are clobbered.
; a : dword operand; must not be eax/ecx/edx nor be addressed through
;     them or through esp.
macro MOD63 a
{
if NO_DIVIDE eq 1
	local .reduced
	;this assumes a is not negative
	CHOP a		; a <= 16*0xffff afterwards
	CHOP a		; a <= 0xffff + 15*15 < 2*BASE afterwards
	cmp a,BASE
	jb .reduced	;if (a >= BASE)
	sub a,BASE
.reduced:
else
	push eax ecx edx
	mov eax,a
	xor edx,edx	; dividend is edx:eax = 0:a
	mov ecx,BASE
	div ecx
	mov a,edx	; keep the remainder
	pop edx ecx eax
end if
}
; =========================================================================
; uLong adler32(uLong adler, const Bytef *buf, uInt len)
;
; Updates a running Adler-32 checksum with len bytes starting at buf.
; In:  adler - checksum so far: low 16 bits are the byte sum, high 16 bits
;              the sum of the intermediate byte sums
;      buf   - pointer to the data; Z_NULL requests the initial value
;      len   - number of bytes, treated as unsigned
; Out: eax = updated checksum, or 1 when buf is Z_NULL
; Note: the len == 1 fast path is taken before the Z_NULL test, so buf is
;       dereferenced in that case without being checked.
; Registers: result in eax; ebx and edx are saved/restored by `uses`;
;            flags are clobbered.
align 4
proc adler32 uses ebx edx, adler:dword, buf:dword, len:dword
locals
	sum2 dd ? ;uLong
endl
;zlib_debug 'adler32 adler = %d',[adler]
	; split Adler-32 into component sums
	mov eax,[adler]
	shr eax,16
	mov [sum2],eax
	and dword[adler],0xffff
	mov ebx,[buf]		; ebx = read pointer for the whole routine

	; in case user likes doing a byte at a time, keep it fast
	cmp dword[len],1
	jne .end0 ;if (len == 1)
		movzx eax,byte[ebx]
		add [adler],eax
		cmp dword[adler],BASE
		jb @f ;if (adler >= BASE)
			sub dword[adler],BASE
		@@:
		mov eax,[adler]
		add [sum2],eax
		cmp dword[sum2],BASE
		jb @f ;if (sum2 >= BASE)
			sub dword[sum2],BASE
		@@:
		jmp .combine
align 4
.end0:

	; initial Adler-32 value (deferred check for len == 1 speed)
	cmp ebx,Z_NULL
	jne @f ;if (buf == Z_NULL)
		xor eax,eax
		inc eax		; return 1
		jmp .end_f
align 4
	@@:

	; in case short lengths are provided, keep it somewhat fast
	cmp dword[len],16
	jae .end1 ;if (len < 16), len is unsigned
	.cycle0:
		cmp dword[len],0
		je @f ;while (len--) -- leave the loop once len reaches 0
		movzx eax,byte[ebx]
		inc ebx
		add [adler],eax
		mov eax,[adler]
		add [sum2],eax
		dec dword[len]
		jmp .cycle0
align 4
	@@:
		; fewer than 16 bytes were added: one subtraction is enough
		cmp dword[adler],BASE
		jb @f ;if (adler >= BASE)
			sub dword[adler],BASE
		@@:
		MOD28 dword[sum2] ;only added so many BASE's
		jmp .combine
align 4
.end1:

	; do length NMAX blocks -- requires just one modulo operation
	.cycle3:
		cmp dword[len],NMAX
		jb .cycle3end ;while (len >= NMAX)
		sub dword[len],NMAX
		mov edx,NMAX/16 ;NMAX is divisible by 16
		.cycle1: ;do
			DO16 ebx ;16 sums unrolled
			add ebx,16
			dec edx
			jnz .cycle1 ;while (--n)
		MOD [adler]
		MOD [sum2]
		jmp .cycle3
align 4
	.cycle3end:

	; do remaining bytes (less than NMAX, still just one modulo)
	cmp dword[len],0
	je .end2 ;if (len) -- avoid modulos if none remaining
		@@:
			cmp dword[len],16
			jb .cycle2 ;while (len >= 16)
			sub dword[len],16
			DO16 ebx
			add ebx,16
			jmp @b
align 4
		.cycle2:
			cmp dword[len],0
			je @f ;while (len--) -- leave the loop once len reaches 0
			movzx eax,byte[ebx]
			inc ebx
			add [adler],eax
			mov eax,[adler]
			add [sum2],eax
			dec dword[len]
			jmp .cycle2
align 4
		@@:
		MOD [adler]
		MOD [sum2]
	.end2:

	; return recombined sums
.combine:
	mov eax,[sum2]
	shl eax,16
	or eax,[adler]
.end_f:
;zlib_debug ' adler32.ret = %d',eax
	ret
endp
; =========================================================================
; uLong adler32_combine_(uLong adler1, uLong adler2, z_off64_t len2)
;
; Combines two Adler-32 checksums: given adler1 for a first sequence and
; adler2 for a second sequence of len2 bytes, returns the checksum of the
; two sequences concatenated.
; In:  adler1 - checksum of the first sequence
;      adler2 - checksum of the second sequence
;      len2   - length of the second sequence; declared as a dword in this
;               port, interpreted as signed
; Out: eax = combined checksum, or 0xffffffff when len2 is negative
; Registers: result in eax; ecx and edx are saved/restored by `uses`;
;            flags are clobbered.
; The reductions use `div` directly, so the result does not depend on the
; NO_DIVIDE setting.
align 4
proc adler32_combine_ uses ecx edx, adler1:dword, adler2:dword, len2:dword
locals
	sum1 dd ? ;uLong
	sum2 dd ? ;uLong
	len_rem dd ? ;unsigned, len2 mod BASE
endl
	; for negative len, return invalid adler32 as a clue for debugging
	mov eax,[len2]
	test eax,eax
	jns @f ;if (len2 < 0)
		mov eax,0xffffffff
		jmp .end_f
	@@:

	; the derivation of this formula is left as an exercise for the reader
	; rem = len2 % BASE
	mov ecx,BASE
	xor edx,edx
	div ecx
	mov [len_rem],edx

	; sum1 = adler1 & 0xffff
	mov eax,[adler1]
	and eax,0xffff
	mov [sum1],eax

	; sum2 = (rem * sum1) % BASE
	; rem < BASE and sum1 < 65536, so the product fits in 32 bits and
	; edx is zero after the mul, as the following div requires
	mul dword[len_rem]
	div ecx
	mov [sum2],edx

	; sum1 += (adler2 & 0xffff) + BASE - 1
	mov eax,[adler2]
	and eax,0xffff
	add eax,BASE-1
	add [sum1],eax

	; sum2 += (adler1 >> 16) + (adler2 >> 16) + BASE - rem
	; (a 32-bit value shifted right by 16 is already within 0..0xffff)
	mov eax,[adler1]
	shr eax,16
	mov edx,[adler2]
	shr edx,16
	lea eax,[eax+edx+BASE]
	sub eax,[len_rem]
	add [sum2],eax

	; bring both sums back below BASE with conditional subtractions
	cmp dword[sum1],BASE
	jb @f ;if (sum1 >= BASE)
		sub dword[sum1],BASE
	@@:
	cmp dword[sum1],BASE
	jb @f ;if (sum1 >= BASE)
		sub dword[sum1],BASE
	@@:
	cmp dword[sum2],BASE shl 1
	jb @f ;if (sum2 >= 2*BASE)
		sub dword[sum2],BASE shl 1
	@@:
	cmp dword[sum2],BASE
	jb @f ;if (sum2 >= BASE)
		sub dword[sum2],BASE
	@@:

	; return sum1 | (sum2 << 16)
	mov eax,[sum2]
	shl eax,16
	or eax,[sum1]
.end_f:
	ret
endp
; =========================================================================
; uLong adler32_combine(uLong adler1, uLong adler2, z_off_t len2)
;
; Entry point that forwards its three arguments unchanged to
; adler32_combine_ and returns whatever that routine leaves in eax.
align 4
proc adler32_combine, adler1:dword, adler2:dword, len2:dword
	; stdcall order: arguments are pushed right to left, callee cleans up
	push dword[len2]
	push dword[adler2]
	push dword[adler1]
	call adler32_combine_
	ret
endp
; uLong adler32_combine64(uLong adler1, uLong adler2, z_off64_t len2)
;
; Entry point that forwards its three arguments unchanged to
; adler32_combine_ and returns whatever that routine leaves in eax.
; len2 is declared and passed as a single dword here.
align 4
proc adler32_combine64, adler1:dword, adler2:dword, len2:dword
	; stdcall order: arguments are pushed right to left, callee cleans up
	push dword[len2]
	push dword[adler2]
	push dword[adler1]
	call adler32_combine_
	ret
endp