kolibrios/programs/fs/kfar/trunk/zlib/adler32.asm

; adler32.asm -- compute the Adler-32 checksum of a data stream
; Copyright (C) 1995-2011 Mark Adler
; For conditions of distribution and use, see copyright notice in zlib.h


BASE equ 65521 ;largest prime smaller than 65536
NMAX equ 5552
; NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1

; DO1 buf,i -- fold the single byte at address buf+i into the running sums:
;     adler += byte[buf+i]
;     sum2  += adler
; Expects the enclosing procedure to provide the dword variables `adler`
; and `sum2`. Clobbers eax (left holding the new value of adler) and flags.
macro DO1 buf,i
{
	mov eax,buf
	movzx eax,byte[eax+i] ;eax = next input byte, zero-extended
	add eax,[adler]       ;eax = adler + byte
	mov [adler],eax
	add [sum2],eax        ;sum2 accumulates every intermediate adler
}
; DO2 buf,i -- fold the two consecutive bytes at buf+i and buf+i+1 into the
; running sums by expanding DO1 twice, in ascending address order (the order
; matters because sum2 accumulates each intermediate value of adler).
macro DO2 buf,i
{
	DO1 buf,i
	DO1 buf,i+1
}
; DO4 buf,i -- fold the four consecutive bytes at buf+i .. buf+i+3 into the
; running sums, one DO1 expansion per byte in ascending address order.
macro DO4 buf,i
{
	DO1 buf,i
	DO1 buf,i+1
	DO1 buf,i+2
	DO1 buf,i+3
}
; DO8 buf,i -- fold the eight consecutive bytes at buf+i .. buf+i+7 into the
; running sums, as four byte pairs in ascending address order.
macro DO8 buf,i
{
	DO2 buf,i
	DO2 buf,i+2
	DO2 buf,i+4
	DO2 buf,i+6
}
; DO16 buf -- fold the sixteen bytes at buf .. buf+15 into the running sums,
; fully unrolled. The caller advances buf by 16 afterwards.
macro DO16 buf
{
	DO4 buf,0
	DO4 buf,4
	DO4 buf,8
	DO4 buf,12
}

; use NO_DIVIDE if your processor does not do division in hardware --
; try it both ways to see which is faster
; note that this assumes BASE is 65521, where 65536 % 65521 == 15
; (thank you to John Reiser for pointing this out)
; CHOP a -- one division-free folding step towards a mod BASE:
;     tmp = a >> 16
;     a   = (a & 0xffff) + (tmp << 4) - tmp      ; i.e. low + 15*high
; This keeps a congruent to its old value modulo BASE because
; 65536 mod 65521 = 15. The result is smaller but not necessarily < BASE.
; Emits code only when NO_DIVIDE is 1; otherwise expands to nothing.
; Clobbers eax (left holding tmp) and flags.
macro CHOP a
{
if NO_DIVIDE eq 1
	mov eax,a
	shr eax,16    ;eax = tmp = high 16 bits of a
	and a,0xffff  ;a = low 16 bits
	shl eax,4
	add a,eax     ;a += tmp*16
	shr eax,4     ;eax = tmp again
	sub a,eax     ;a -= tmp  => a = low + 15*tmp
end if
}
; MOD28 a -- reduce a modulo BASE.
; NO_DIVIDE = 1: one CHOP fold followed by a single conditional subtraction
;                of BASE (clobbers eax and flags). This is the cheap variant
;                intended for values that are known to be small.
; otherwise:     unsigned division by BASE; the remainder is stored back to a.
;                eax, ecx and edx are saved and restored around the division,
;                so a must not itself be one of those registers.
macro MOD28 a
{
if NO_DIVIDE eq 1
local .below_base
	CHOP a
	cmp a,BASE
	jl .below_base ;if (a >= BASE)
		sub a,BASE
	.below_base:
else
	push eax ecx edx
	xor edx,edx   ;dividend is edx:eax, high half must be zero
	mov eax,a
	mov ecx,BASE
	div ecx       ;edx = a mod BASE
	mov a,edx
	pop edx ecx eax
end if
}
; MOD a -- reduce a modulo BASE.
; NO_DIVIDE = 1: two CHOP folds and one conditional subtraction (the second
;                fold and the subtraction come from MOD28); clobbers eax.
; otherwise:     the plain division performed by MOD28, which preserves
;                eax, ecx and edx.
macro MOD a
{
if NO_DIVIDE eq 1
	CHOP a ;extra fold, needed only on the division-free path
end if
	MOD28 a
}
; MOD63 a -- reduce a non-negative length a modulo BASE.
;
; Reference C for the division-free path (64-bit operand):
;        z_off64_t tmp = a >> 32;
;        a &= 0xffffffff;
;        a += (tmp << 8) - (tmp << 5) + tmp;
;        tmp = a >> 16;
;        a &= 0xffff;
;        a += (tmp << 4) - tmp;
;        tmp = a >> 16;
;        a &= 0xffff;
;        a += (tmp << 4) - tmp;
;        if (a >= BASE) a -= BASE;
;
; In this 32-bit port a is a dword, so `a >> 32` is zero and the first three
; statements are a no-op. What remains is two folds plus one conditional
; subtraction, which is exactly what MOD emits when NO_DIVIDE is 1.
; Previously the NO_DIVIDE branch emitted no code at all and left a unreduced.
; Without NO_DIVIDE, MOD performs the same division this macro used to
; spell out, so both configurations delegate to it.
; Clobbers eax and flags when NO_DIVIDE is 1.
macro MOD63 a
{
	MOD a
}

; =========================================================================
;uLong adler32(adler, buf, len)
;    uLong adler       - running checksum (low word: byte sum, high word: sum of sums)
;    const Bytef *buf  - data to add to the checksum
;    uInt len          - number of bytes at buf (unsigned)
;
; Returns in eax the checksum updated with buf[0..len-1].
; If buf is Z_NULL (and len != 1) the initial checksum value 1 is returned.
; As in the C original, the len == 1 fast path reads buf[0] before the
; Z_NULL check.
; ebx and edx are preserved (see `uses`); the argument slots adler and len
; are used as working storage.
align 4
proc adler32 uses ebx edx, adler:dword, buf:dword, len:dword
locals
	sum2 dd ? ;uLong, high half of the checksum
endl
;zlib_debug 'adler32 adler = %d',[adler]
	; split Adler-32 into component sums
	mov eax,[adler]
	shr eax,16
	mov [sum2],eax
	and [adler],0xffff
	mov ebx,[buf] ;ebx = read pointer for the rest of the procedure

	; in case user likes doing a byte at a time, keep it fast
	cmp dword[len],1
	jne .end0 ;if (len == 1)
		movzx eax,byte[ebx]
		add [adler],eax
		cmp dword[adler],BASE
		jb @f ;if (adler >= BASE)
			sub dword[adler],BASE
		@@:
		mov eax,[adler]
		add [sum2],eax
		cmp dword[sum2],BASE
		jb @f ;if (sum2 >= BASE)
			sub dword[sum2],BASE
		@@:
		jmp .combine
align 4
	.end0:

	; initial Adler-32 value (deferred check for len == 1 speed)
	cmp ebx,Z_NULL
	jne @f ;if (buf == Z_NULL)
		xor eax,eax
		inc eax ;return 1
		jmp .end_f
align 4
	@@:

	; in case short lengths are provided, keep it somewhat fast
	cmp dword[len],16
	jae .end1 ;if (len < 16), len is unsigned
		.cycle0:
			cmp dword[len],0
			je @f ;while (len--): leave the loop once nothing is left
			movzx eax,byte[ebx]
			inc ebx
			add [adler],eax
			mov eax,[adler]
			add [sum2],eax
			dec dword[len]
			jmp .cycle0
align 4
		@@:
		cmp dword[adler],BASE
		jb @f ;if (adler >= BASE)
			sub dword[adler],BASE
		@@:
		MOD28 dword[sum2] ;only added so many BASE's
		jmp .combine
align 4
	.end1:

	; do length NMAX blocks -- requires just one modulo operation
	.cycle3:
	cmp dword[len],NMAX
	jb .cycle3end ;while (len >= NMAX), unsigned
		sub dword[len],NMAX
		mov edx,NMAX/16 ;NMAX is divisible by 16
		.cycle1: ;do
			DO16 ebx ;16 sums unrolled
			add ebx,16
			dec edx
			jnz .cycle1 ;while (--n)
		MOD [adler]
		MOD [sum2]
		jmp .cycle3
align 4
	.cycle3end:

	; do remaining bytes (less than NMAX, still just one modulo)
	cmp dword[len],0
	je .end2 ;if (len) ;avoid modulos if none remaining
		@@:
		cmp dword[len],16
		jb .cycle2 ;while (len >= 16)
			sub dword[len],16
			DO16 ebx
			add ebx,16
			jmp @b
align 4
		.cycle2:
			cmp dword[len],0
			je @f ;while (len--): leave the loop once nothing is left
			movzx eax,byte[ebx]
			inc ebx
			add [adler],eax
			mov eax,[adler]
			add [sum2],eax
			dec dword[len]
			jmp .cycle2
align 4
		@@:
		MOD [adler]
		MOD [sum2]
	.end2:

	; return recombined sums
.combine:
	mov eax,[sum2]
	shl eax,16
	or eax,[adler]
.end_f:
;zlib_debug '  adler32.ret = %d',eax
	ret
endp

; =========================================================================
;uLong adler32_combine_(adler1, adler2, len2)
;    uLong adler1    - Adler-32 of the first data block
;    uLong adler2    - Adler-32 of the second data block
;    z_off64_t len2  - length of the second block (a dword in this port)
;
; Returns in eax the Adler-32 of the two blocks concatenated, or 0xffffffff
; when len2 is negative. ebx, ecx and edx are preserved.
;
; Register use inside the procedure:
;    ebx = rem  (len2 mod BASE)
;    edx = sum2 (high half of the result)
;    eax = sum1 (low half of the result)
;
; The reductions use `div` directly so the procedure gives the same result
; whether or not NO_DIVIDE is set.
align 4
proc adler32_combine_ uses ebx ecx edx, adler1:dword, adler2:dword, len2:dword
	; for negative len, return invalid adler32 as a clue for debugging
	mov eax,[len2]
	test eax,eax
	jns @f ;if (len2 < 0)
		mov eax,0xffffffff
		jmp .end_f
	@@:

	; the derivation of this formula is left as an exercise for the reader
	; rem = len2 % BASE
	mov ecx,BASE
	xor edx,edx
	div ecx
	mov ebx,edx

	; sum2 = (rem * (adler1 & 0xffff)) % BASE
	mov eax,[adler1]
	and eax,0xffff
	mul ebx ;rem < BASE and sum1 < 65536, so the product fits in eax and edx = 0
	div ecx ;edx = sum2

	; sum2 += ((adler1 >> 16) & 0xffff) + ((adler2 >> 16) & 0xffff) + BASE - rem
	mov eax,[adler1]
	shr eax,16
	add edx,eax
	mov eax,[adler2]
	shr eax,16
	add edx,eax
	add edx,BASE
	sub edx,ebx

	; sum1 = (adler1 & 0xffff) + (adler2 & 0xffff) + BASE - 1
	mov eax,[adler1]
	and eax,0xffff
	mov ecx,[adler2]
	and ecx,0xffff
	lea eax,[eax+ecx+BASE-1]

	; bring both sums back below BASE (unsigned compares)
	cmp eax,BASE
	jb @f ;if (sum1 >= BASE)
		sub eax,BASE
	@@:
	cmp eax,BASE
	jb @f ;if (sum1 >= BASE)
		sub eax,BASE
	@@:
	cmp edx,BASE shl 1
	jb @f ;if (sum2 >= (BASE << 1))
		sub edx,BASE shl 1
	@@:
	cmp edx,BASE
	jb @f ;if (sum2 >= BASE)
		sub edx,BASE
	@@:

	; return sum1 | (sum2 << 16)
	shl edx,16
	or eax,edx
.end_f:
	ret
endp

; =========================================================================
;uLong adler32_combine(adler1, adler2, len2)
;    uLong adler1
;    uLong adler2
;    z_off_t len2
; Public entry point: forwards its three arguments unchanged to
; adler32_combine_ and returns that procedure's eax.
align 4
proc adler32_combine, adler1:dword, adler2:dword, len2:dword
	; arguments are pushed right-to-left, as the stdcall macro would do
	push dword[len2]
	push dword[adler2]
	push dword[adler1]
	call adler32_combine_
	ret
endp

;uLong adler32_combine64(adler1, adler2, len2)
;    uLong adler1
;    uLong adler2
;    z_off64_t len2 (passed as a single dword here)
; Public entry point: forwards its three arguments unchanged to
; adler32_combine_ and returns that procedure's eax.
align 4
proc adler32_combine64, adler1:dword, adler2:dword, len2:dword
	; arguments are pushed right-to-left, as the stdcall macro would do
	push dword[len2]
	push dword[adler2]
	push dword[adler1]
	call adler32_combine_
	ret
endp
add 'zlib.obj' git-svn-id: svn://kolibrios.org@6617 a494cfbc-eb01-0410-851d-a64ba20cac60 2016-10-26 21:35:35 +02:00			`; adler32.asm -- compute the Adler-32 checksum of a data stream`
			`; Copyright (C) 1995-2011 Mark Adler`
			`; For conditions of distribution and use, see copyright notice in zlib.h`


			`BASE equ 65521 ;largest prime smaller than 65536`
			`NMAX equ 5552`
			`; NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1`

			`macro DO1 buf,i`
			`{`
			`mov eax,buf`
			`add eax,i`
			`movzx eax,byte[eax]`
			`add [adler],eax`
			`mov eax,[adler]`
			`add [sum2],eax`
			`}`
			`macro DO2 buf,i`
			`{`
			`DO1 buf,i`
			`DO1 buf,i+1`
			`}`
			`macro DO4 buf,i`
			`{`
			`DO2 buf,i`
			`DO2 buf,i+2`
			`}`
			`macro DO8 buf,i`
			`{`
			`DO4 buf,i`
			`DO4 buf,i+4`
			`}`
			`macro DO16 buf`
			`{`
			`DO8 buf,0`
			`DO8 buf,8`
			`}`

			`; use NO_DIVIDE if your processor does not do division in hardware --`
			`; try it both ways to see which is faster`
			`; note that this assumes BASE is 65521, where 65536 % 65521 == 15`
			`; (thank you to John Reiser for pointing this out)`
			`macro CHOP a`
			`{`
			`if NO_DIVIDE eq 1`
			`mov eax,a`
			`shr eax,16`
			`and a,0xffff`
			`shl eax,4`
			`add a,eax`
			`shr eax,4`
			`sub a,eax`
			`end if`
			`}`
			`macro MOD28 a`
			`{`
			`if NO_DIVIDE eq 1`
			`local .end0`
			`CHOP a`
			`cmp a,BASE`
			`jl .end0 ;if (..>=..)`
			`sub a,BASE`
			`.end0:`
			`else`
			`push eax ecx edx`
			`mov eax,a`
			`xor edx,edx`
			`mov ecx,BASE`
			`div ecx`
			`mov a,edx`
			`pop edx ecx eax`
			`end if`
			`}`
			`macro MOD a`
			`{`
			`if NO_DIVIDE eq 1`
			`CHOP a`
			`MOD28 a`
			`else`
			`push eax ecx edx`
			`mov eax,a`
			`xor edx,edx`
			`mov ecx,BASE`
			`div ecx`
			`mov a,edx`
			`pop edx ecx eax`
			`end if`
			`}`
			`macro MOD63 a`
			`{`
			`if NO_DIVIDE eq 1`
			`;this assumes a is not negative`
			`; z_off64_t tmp = a >> 32;`
			`; a &= 0xffffffff;`
			`; a += (tmp << 8) - (tmp << 5) + tmp;`
			`; tmp = a >> 16;`
			`; a &= 0xffff;`
			`; a += (tmp << 4) - tmp;`
			`; tmp = a >> 16;`
			`; a &= 0xffff;`
			`; a += (tmp << 4) - tmp;`
			`; if (a >= BASE) a -= BASE;`
			`else`
			`push eax ecx edx`
			`mov eax,a`
			`xor edx,edx`
			`mov ecx,BASE`
			`div ecx`
			`mov a,edx`
			`pop edx ecx eax`
			`end if`
			`}`

			`; =========================================================================`
			`;uLong (adler, buf, len)`
			`; uLong adler`
			`; const Bytef *buf`
			`; uInt len`
			`align 4`
			`proc adler32 uses ebx edx, adler:dword, buf:dword, len:dword`
			`locals`
			`sum2 dd ? ;uLong`
			`endl`
			`;zlib_debug 'adler32 adler = %d',[adler]`
			`; split Adler-32 into component sums`
			`mov eax,[adler]`
			`shr eax,16`
			`mov [sum2],eax`
			`and [adler],0xffff`
			`mov ebx,[buf]`

			`; in case user likes doing a byte at a time, keep it fast`
			`cmp dword[len],1`
			`jne .end0 ;if (..==..)`
			`movzx eax,byte[ebx]`
			`add [adler],eax`
			`cmp dword[adler],BASE`
			`jl @f ;if (..>=..)`
			`sub dword[adler],BASE`
			`@@:`
			`mov eax,[adler]`
			`add [sum2],eax`
			`cmp dword[sum2],BASE`
			`jl @f ;if (..>=..)`
			`sub dword[sum2],BASE`
			`@@:`
			`jmp .combine`
			`align 4`
			`.end0:`

			`; initial Adler-32 value (deferred check for len == 1 speed)`
			`cmp ebx,Z_NULL`
			`jne @f ;if (..==0)`
			`xor eax,eax`
			`inc eax`
			`jmp .end_f`
			`align 4`
			`@@:`

			`; in case short lengths are provided, keep it somewhat fast`
			`cmp dword[len],16`
			`jge .end1 ;if (..<..)`
			`.cycle0:`
			`cmp dword[len],0`
			`jne @f ;while (..)`
			`movzx eax,byte[ebx]`
			`inc ebx`
			`add [adler],eax`
			`mov eax,[adler]`
			`add [sum2],eax`
			`dec dword[len]`
			`jmp .cycle0`
			`align 4`
			`@@:`
			`cmp dword[adler],BASE`
			`jl @f ;if (..>=..)`
			`sub dword[adler],BASE`
			`@@:`
			`MOD28 dword[sum2] ;only added so many BASE's`
			`jmp .combine`
			`align 4`
			`.end1:`

			`; do length NMAX blocks -- requires just one modulo operation`
			`.cycle3:`
			`cmp dword[len],NMAX`
			`jl .cycle3end ;while (..>=..)`
			`sub dword[len],NMAX`
			`mov edx,NMAX/16 ;NMAX is divisible by 16`
			`.cycle1: ;do`
			`DO16 ebx ;16 sums unrolled`
			`add ebx,16`
			`dec edx`
			`cmp edx,0`
			`jg .cycle1 ;while (..)`
			`MOD [adler]`
			`MOD [sum2]`
			`jmp .cycle3`
			`align 4`
			`.cycle3end:`

			`; do remaining bytes (less than NMAX, still just one modulo)`
			`cmp dword[len],0`
			`jne .end2 ;if (..) ;avoid modulos if none remaining`
			`@@:`
			`cmp dword[len],16`
			`jl .cycle2 ;while (..>=..)`
			`sub dword[len],16`
			`DO16 ebx`
			`add ebx,16`
			`jmp @b`
			`align 4`
			`.cycle2:`
			`cmp dword[len],0`
			`jne @f ;while (..)`
			`movzx eax,byte[ebx]`
			`inc ebx`
			`add [adler],eax`
			`mov eax,[adler]`
			`add [sum2],eax`
			`dec dword[len]`
			`jmp .cycle2`
			`align 4`
			`@@:`
			`MOD [adler]`
			`MOD [sum2]`
			`.end2:`

			`; return recombined sums`
			`.combine:`
			`mov eax,[sum2]`
			`shl eax,16`
			`or eax,[adler]`
			`.end_f:`
			`;zlib_debug ' adler32.ret = %d',eax`
			`ret`
			`endp`

			`; =========================================================================`
			`;uLong (adler1, adler2, len2)`
			`; uLong adler1`
			`; uLong adler2`
			`; z_off64_t len2`
			`align 4`
			`proc adler32_combine_, adler1:dword, adler2:dword, len2:dword`
			`locals`
			`sum1 dd ? ;uLong`
			`sum2 dd ? ;uLong`
			`; unsigned rem;`
			`endl`
			`; for negative len, return invalid adler32 as a clue for debugging`
			`cmp dword[len2],0`
			`jge @f ;if (..<0)`
			`mov eax,0xffffffff`
			`jmp .end_f`
			`@@:`

			`; the derivation of this formula is left as an exercise for the reader`
			`; MOD63(len2) ;assumes len2 >= 0`
			`; rem = (unsigned)len2;`
			`; sum1 = adler1 & 0xffff;`
			`; sum2 = rem * sum1;`
			`; MOD(sum2);`
			`; sum1 += (adler2 & 0xffff) + BASE - 1;`
			`; sum2 += ((adler1 >> 16) & 0xffff) + ((adler2 >> 16) & 0xffff) + BASE - rem;`
			`cmp dword[sum1],BASE`
			`jl @f ;if (..>=..)`
			`sub dword[sum1],BASE`
			`@@:`
			`cmp dword[sum1],BASE`
			`jl @f ;if (..>=..)`
			`sub dword[sum1],BASE`
			`@@:`
			`cmp dword[sum2],BASE shl 1`
			`jl @f ;if (..>=..)`
			`sub dword[sum2],BASE shl 1`
			`@@:`
			`cmp dword[sum2],BASE`
			`jl @f ;if (..>=..)`
			`sub dword[sum2],BASE`
			`@@:`
			`mov eax,[sum2]`
			`shl eax,16`
			`or eax,[sum1]`
			`.end_f:`
			`ret`
			`endp`

			`; =========================================================================`
			`;uLong (adler1, adler2, len2)`
			`; uLong adler1`
			`; uLong adler2`
			`; z_off_t len2`
			`align 4`
			`proc adler32_combine, adler1:dword, adler2:dword, len2:dword`
			`stdcall adler32_combine_, [adler1], [adler2], [len2]`
			`ret`
			`endp`

			`;uLong (adler1, adler2, len2)`
			`; uLong adler1`
			`; uLong adler2`
			`; z_off64_t len2`
			`align 4`
			`proc adler32_combine64, adler1:dword, adler2:dword, len2:dword`
			`stdcall adler32_combine_, [adler1], [adler2], [len2]`
			`ret`
			`endp`