; adler32.asm -- compute the Adler-32 checksum of a data stream
; Copyright (C) 1995-2011 Mark Adler
; For conditions of distribution and use, see copyright notice in zlib.h


; Adler-32 parameters shared by the macros and procedures below.
; BASE is the modulus applied to both running sums.
BASE equ 65521 ;largest prime smaller than 65536
; NMAX is the block length (in bytes) processed between two modulo reductions.
NMAX equ 5552
; NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1

; DO1 buf,i -- fold one input byte into the running sums.
;   buf : operand holding the address of the data (a register in every use in this file)
;   i   : constant byte offset from buf
; Effect:  [adler] += byte[buf+i] ; [sum2] += [adler]
; Expects the names adler and sum2 to be addressable dwords at the expansion site.
; Clobbers eax (left holding the updated adler value) and flags.
macro DO1 buf,i
{
	mov eax,buf
	movzx eax,byte[eax+i] ;zero-extended data byte
	add eax,[adler] ;eax = adler + byte
	mov [adler],eax
	add [sum2],eax
}
; DO2 / DO4 / DO8 / DO16 -- unrolled runs of DO1 over consecutive bytes.
;   buf : operand holding the address of the data
;   i   : constant offset of the first byte of the run (DO16 always starts at 0)
; Each macro expands to 2, 4, 8 or 16 DO1 steps at offsets i, i+1, ...
; Clobbers whatever DO1 clobbers (eax, flags).
macro DO2 buf,i
{
	DO1 buf,i
	DO1 buf,i+1
}
macro DO4 buf,i
{
	DO1 buf,i
	DO1 buf,i+1
	DO1 buf,i+2
	DO1 buf,i+3
}
macro DO8 buf,i
{
	DO2 buf,i
	DO2 buf,i+2
	DO2 buf,i+4
	DO2 buf,i+6
}
macro DO16 buf
{
	DO4 buf,0
	DO4 buf,4
	DO4 buf,8
	DO4 buf,12
}

; use NO_DIVIDE if your processor does not do division in hardware --
; try it both ways to see which is faster
; note that this assumes BASE is 65521, where 65536 % 65521 == 15
; (thank you to John Reiser for pointing this out)
; CHOP a -- one folding step of the division-free reduction modulo BASE.
;   a : 32-bit operand updated in place; must not be eax, and must not be
;       addressed relative to esp (eax is pushed while a is being accessed)
; Computes  tmp = a >> 16 ; a = (a & 0xffff) + 15*tmp
; which keeps a congruent modulo BASE because 65536 % 65521 == 15.
; Expands to nothing unless NO_DIVIDE is 1.
; eax is used as scratch but saved/restored, so that register liveness is the
; same as in the divide-based branches of MOD28/MOD/MOD63 (which preserve
; every register). Flags are modified.
macro CHOP a
{
if NO_DIVIDE eq 1
	push eax
	mov eax,a
	shr eax,16 ;eax = tmp
	and a,0xffff
	shl eax,4
	add a,eax ;a += tmp*16
	shr eax,4
	sub a,eax ;a -= tmp
	pop eax
end if
}
; MOD28 a -- reduce a modulo BASE in place.
;   a : 32-bit operand. In the divide branch the result is written to a and
;       then eax/ecx/edx are popped, so a must not be one of those registers.
; NO_DIVIDE = 1 : one CHOP fold followed by a single conditional subtract;
;                 only sufficient when a is small enough that one fold leaves
;                 it below 2*BASE (the call site in this file notes
;                 "only added so many BASE's").
; otherwise     : unsigned division by BASE, eax/ecx/edx preserved.
macro MOD28 a
{
local .below
if NO_DIVIDE eq 1
	CHOP a
	cmp a,BASE
	jb .below ;if (a >= BASE)
		sub a,BASE
	.below:
else
	push eax
	push ecx
	push edx
	mov eax,a
	mov ecx,BASE
	xor edx,edx ;edx:eax = a
	div ecx
	mov a,edx ;remainder
	pop edx
	pop ecx
	pop eax
end if
}
; MOD a -- reduce an arbitrary 32-bit value modulo BASE in place.
;   a : 32-bit operand. In the divide branch the result is written to a and
;       then eax/ecx/edx are popped, so a must not be one of those registers.
; NO_DIVIDE = 1 : two CHOP folds followed by one conditional subtract.
; otherwise     : unsigned division by BASE, eax/ecx/edx preserved.
macro MOD a
{
local .below
if NO_DIVIDE eq 1
	CHOP a
	CHOP a
	cmp a,BASE
	jb .below ;if (a >= BASE)
		sub a,BASE
	.below:
else
	push eax
	push ecx
	push edx
	mov eax,a
	mov ecx,BASE
	xor edx,edx ;edx:eax = a
	div ecx
	mov a,edx ;remainder
	pop edx
	pop ecx
	pop eax
end if
}
; MOD63 a -- reduce a non-negative length modulo BASE in place.
;   a : 32-bit operand. In the divide branch the result is written to a and
;       then eax/ecx/edx are popped, so a must not be one of those registers.
; The C reference operates on a 64-bit z_off64_t:
;        tmp = a >> 32;  a &= 0xffffffff;  a += (tmp << 8) - (tmp << 5) + tmp;
;        tmp = a >> 16;  a &= 0xffff;      a += (tmp << 4) - tmp;
;        tmp = a >> 16;  a &= 0xffff;      a += (tmp << 4) - tmp;
;        if (a >= BASE) a -= BASE;
; This macro only handles 32-bit operands (as the divide branch always did),
; so the first line is a no-op and the remainder is two CHOP folds plus one
; conditional subtract: after the first fold a <= 16*0xffff, after the second
; a <= 0xffff + 15*15, which is below 2*BASE.
; The NO_DIVIDE branch was previously empty, leaving a unreduced.
macro MOD63 a
{
local .below
if NO_DIVIDE eq 1
;this assumes a is not negative
	CHOP a
	CHOP a
	cmp a,BASE
	jb .below ;if (a >= BASE)
		sub a,BASE
	.below:
else
push eax ecx edx
	mov eax,a
	xor edx,edx
	mov ecx,BASE
	div ecx
	mov a,edx
pop edx ecx eax
end if
}

; =========================================================================
;uLong adler32(adler, buf, len)
;    uLong adler        - running checksum: low 16 bits = byte sum, high 16 bits = sum of sums
;    const Bytef *buf   - data to add to the checksum; Z_NULL requests the initial value
;    uInt len           - number of bytes at buf (unsigned)
; Returns in eax the updated Adler-32 value, or 1 when buf is Z_NULL
; (the Z_NULL test is deferred, so it is not performed when len == 1).
; Preserves ebx and edx (proc 'uses'); eax is the only register left modified.
; The byte sum is kept in the [adler] argument slot, the sum of sums in [sum2];
; ebx walks through the buffer and [len] counts the bytes still to process.
align 4
proc adler32 uses ebx edx, adler:dword, buf:dword, len:dword
locals
	sum2 dd ? ;uLong
endl
;zlib_debug 'adler32 adler = %d',[adler]
	; split Adler-32 into component sums
	mov eax,[adler]
	shr eax,16
	mov [sum2],eax
	and [adler],0xffff
	mov ebx,[buf]

	; in case user likes doing a byte at a time, keep it fast
	cmp dword[len],1
	jne .end0 ;if (len == 1)
		movzx eax,byte[ebx]
		add [adler],eax
		cmp dword[adler],BASE
		jb @f ;if (adler >= BASE)
			sub dword[adler],BASE
		@@:
		mov eax,[adler]
		add [sum2],eax
		cmp dword[sum2],BASE
		jb @f ;if (sum2 >= BASE)
			sub dword[sum2],BASE
		@@:
		jmp .combine
align 4
	.end0:

	; initial Adler-32 value (deferred check for len == 1 speed)
	cmp ebx,Z_NULL
	jne @f ;if (buf == Z_NULL)
		xor eax,eax
		inc eax ;return 1
		jmp .end_f
align 4
	@@:

	; in case short lengths are provided, keep it somewhat fast
	cmp dword[len],16
	jae .end1 ;if (len < 16), len is unsigned
		.cycle0:
			cmp dword[len],0
			je @f ;while (len--) : leave the loop once no bytes remain
			movzx eax,byte[ebx]
			inc ebx
			add [adler],eax
			mov eax,[adler]
			add [sum2],eax
			dec dword[len]
			jmp .cycle0
align 4
		@@:
		; at most 15 bytes were added, so one subtraction is enough for adler
		cmp dword[adler],BASE
		jb @f ;if (adler >= BASE)
			sub dword[adler],BASE
		@@:
		MOD28 dword[sum2] ;only added so many BASE's
		jmp .combine
align 4
	.end1:

	; do length NMAX blocks -- requires just one modulo operation
	.cycle3:
	cmp dword[len],NMAX
	jb .cycle3end ;while (len >= NMAX), len is unsigned
		sub dword[len],NMAX
		mov edx,NMAX/16 ;NMAX is divisible by 16
		.cycle1: ;do
			DO16 ebx ;16 sums unrolled
			add ebx,16
			dec edx
			jnz .cycle1 ;while (--n)
		MOD [adler]
		MOD [sum2]
		jmp .cycle3
align 4
	.cycle3end:

	; do remaining bytes (less than NMAX, still just one modulo)
	cmp dword[len],0
	je .end2 ;if (len) ;avoid modulos if none remaining
		@@:
		cmp dword[len],16
		jb .cycle2 ;while (len >= 16)
			sub dword[len],16
			DO16 ebx
			add ebx,16
			jmp @b
align 4
		.cycle2:
			cmp dword[len],0
			je @f ;while (len--) : leave the loop once no bytes remain
			movzx eax,byte[ebx]
			inc ebx
			add [adler],eax
			mov eax,[adler]
			add [sum2],eax
			dec dword[len]
			jmp .cycle2
align 4
		@@:
		MOD [adler]
		MOD [sum2]
	.end2:

	; return recombined sums
.combine:
	mov eax,[sum2]
	shl eax,16
	or eax,[adler]
.end_f:
;zlib_debug '  adler32.ret = %d',eax
	ret
endp

; =========================================================================
;uLong adler32_combine_(adler1, adler2, len2)
;    uLong adler1    - Adler-32 of the first data block
;    uLong adler2    - Adler-32 of the second data block
;    z_off64_t len2  - length of the second block
; Returns in eax the Adler-32 of the two blocks concatenated, or 0xffffffff
; when len2 is negative. Preserves ecx and edx (proc 'uses').
; NOTE(review): len2 is declared and read as a single dword here, so only
; 32-bit lengths are supported despite the z_off64_t name -- confirm callers.
align 4
proc adler32_combine_ uses ecx edx, adler1:dword, adler2:dword, len2:dword
locals
	sum1 dd ? ;uLong
	sum2 dd ? ;uLong
	rem dd ? ;unsigned
endl
	; for negative len, return invalid adler32 as a clue for debugging
	cmp dword[len2],0
	jge @f ;if (len2 < 0)
		mov eax,0xffffffff
		jmp .end_f
	@@:

	; the derivation of this formula is left as an exercise for the reader

	; rem = len2 % BASE (len2 >= 0 here)
	; MOD rather than MOD63 is used: the operand is 32 bits wide, and MOD
	; works on memory operands in both the NO_DIVIDE and the divide build.
	mov eax,[len2]
	mov [rem],eax
	MOD [rem]

	; sum1 = adler1 & 0xffff
	mov eax,[adler1]
	and eax,0xffff
	mov [sum1],eax

	; sum2 = (rem * sum1) % BASE
	; rem < BASE and sum1 <= 0xffff, so the product fits in 32 bits (edx = 0)
	mul dword[rem]
	mov [sum2],eax
	MOD [sum2]

	; sum1 += (adler2 & 0xffff) + BASE - 1
	mov eax,[adler2]
	and eax,0xffff
	add eax,BASE-1
	add [sum1],eax

	; sum2 += ((adler1 >> 16) & 0xffff) + ((adler2 >> 16) & 0xffff) + BASE - rem
	; (a 32-bit value shifted right by 16 is already <= 0xffff)
	mov eax,[adler1]
	shr eax,16
	mov ecx,[adler2]
	shr ecx,16
	add eax,ecx
	add eax,BASE
	sub eax,[rem]
	add [sum2],eax

	; bring both sums back below BASE
	cmp dword[sum1],BASE
	jb @f ;if (sum1 >= BASE)
		sub dword[sum1],BASE
	@@:
	cmp dword[sum1],BASE
	jb @f ;if (sum1 >= BASE)
		sub dword[sum1],BASE
	@@:
	cmp dword[sum2],BASE shl 1
	jb @f ;if (sum2 >= (BASE << 1))
		sub dword[sum2],BASE shl 1
	@@:
	cmp dword[sum2],BASE
	jb @f ;if (sum2 >= BASE)
		sub dword[sum2],BASE
	@@:

	; return sum1 | (sum2 << 16)
	mov eax,[sum2]
	shl eax,16
	or eax,[sum1]
.end_f:
	ret
endp

; =========================================================================
;uLong adler32_combine(adler1, adler2, len2)
;    uLong adler1
;    uLong adler2
;    z_off_t len2
; Thin wrapper: forwards its three arguments unchanged to adler32_combine_
; and returns that routine's eax result.
align 4
proc adler32_combine, adler1:dword, adler2:dword, len2:dword
	; arguments are pushed right-to-left; the callee is invoked as stdcall
	push dword[len2]
	push dword[adler2]
	push dword[adler1]
	call adler32_combine_
	ret
endp

;uLong adler32_combine64(adler1, adler2, len2)
;    uLong adler1
;    uLong adler2
;    z_off64_t len2
; Thin wrapper: forwards its three arguments unchanged to adler32_combine_
; and returns that routine's eax result.
; len2 is passed on as a single dword, exactly as it is received.
align 4
proc adler32_combine64, adler1:dword, adler2:dword, len2:dword
	; arguments are pushed right-to-left; the callee is invoked as stdcall
	push dword[len2]
	push dword[adler2]
	push dword[adler1]
	call adler32_combine_
	ret
endp