;;================================================================================================;;
;;//// blend_sse.asm //// (c) dunkaist, 2011-2012 ////////////////////////////////////////////////;;
;;================================================================================================;;
;;                                                                                                ;;
;; This file is part of Common development libraries (Libs-Dev).                                  ;;
;;                                                                                                ;;
;; Libs-Dev is free software: you can redistribute it and/or modify it under the terms of the GNU ;;
;; Lesser General Public License as published by the Free Software Foundation, either version 2.1 ;;
;; of the License, or (at your option) any later version.                                         ;;
;;                                                                                                ;;
;; Libs-Dev is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without  ;;
;; even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU  ;;
;; Lesser General Public License for more details.                                                ;;
;;                                                                                                ;;
;; You should have received a copy of the GNU Lesser General Public License along with Libs-Dev.  ;;
;; If not, see <http://www.gnu.org/licenses/>.                                                    ;;
;;                                                                                                ;;
;;================================================================================================;;

; Blends two pixels held as 16-bit lanes: xmm3 = (xmm2 * 256 + (xmm3 - xmm2) * k) >> 8,
; packed back to bytes, where k is a per-pixel ratio derived from two alpha values.
; For an alpha pair (a, b):  n = 255 - (((255 - a) * (255 - b)) >> 8)
;                            k = 255 if a == n, otherwise (a * 256) / n
; in:  al, bh			- a and b for lanes 0..3
;      ah, bits 16..23 of ebx	- a and b for lanes 4..7
;      xmm0			- interleave/pack source (callers in this file clear it)
;      xmm2, xmm3		- eight 16-bit lanes each
; out: xmm3 - packed bytes; lane 3 is taken from bh and lane 7 from bits 16..23 of ebx
; clobbers eax, ebx, xmm1, xmm2, xmm7
proc	xcf._.blend_rgb

	push		eax ebx

	; ratio for lanes 0..3
	xchg		al, bh			; al = b, bh = a
	mov		ah, bh			; ah = a
	not		ax			; al = 255 - b, ah = 255 - a
	mul		ah
	not		ah			; ah = n
	xchg		ah, bh			; bh = n, ah = a

	mov		al, 0xff
	cmp		ah, bh
	je		.first_ready
	not		al			; ax = a * 256
	div		bh
  .first_ready:
	mov		ah, al
	movd		xmm1, eax

	pop		ebx eax
	push		ebx

	; ratio for lanes 4..7: bring the second alpha pair into al / bh
	shr		eax, 8
	shr		ebx, 8

	xchg		al, bh
	mov		ah, bh
	not		ax
	mul		ah
	not		ah
	xchg		ah, bh

	mov		al, 0xff
	cmp		ah, bh
	je		.second_ready
	not		al
	div		bh
  .second_ready:
	mov		ah, al
	pinsrw		xmm1, eax, 1		; low dword of xmm1 = k1 k1 k0 k0

	pop		ebx

; disabled alternative kept from the original source
;	movdqu		xmm1, xword[xcf._.xmm_000000ff]
;	movdqa		xmm4, xmm1
;	movdqa		xmm5, xmm1
;	movdqa		xmm6, xmm2
;	psrldq		xmm6, 3
;	pand		xmm6, xmm1
;	psubw		xmm4, xmm6
;	movdqa		xmm6, xmm3
;	psrldq		xmm6, 3
;	pand		xmm6, xmm1
;	psubw		xmm5, xmm6
;	pmullw		xmm4, xmm5
;	psrlw		xmm4, 8
;	psubw		xmm1, xmm4
;	movdqa		xmm4, xmm1
;	movdqa		xmm1, xmm6
;	divps		xmm1, xmm4
;	packuswb	xmm1, xmm0
;	packuswb	xmm1, xmm0
;	punpcklbw	xmm1, xmm1

	; spread k0 over lanes 0..3 and k1 over lanes 4..7
	punpcklbw	xmm1, xmm1
	punpcklbw	xmm1, xmm0

	psubw		xmm3, xmm2

	; k += k >> 7, so that 128..255 becomes 129..256
	movdqa		xmm7, xmm1
	psrlw		xmm7, 7
	paddw		xmm1, xmm7

	pmullw		xmm3, xmm1
	psllw		xmm2, 8
	paddw		xmm3, xmm2
	pinsrw		xmm3, ebx, 3		; high byte of bx survives the shift below
	shr		ebx, 8
	pinsrw		xmm3, ebx, 7
	psrlw		xmm3, 8
	packuswb	xmm3, xmm0

	ret
endp


; Blends 16-bit lanes: xmm3 = (xmm2 * 256 + (xmm3 - xmm2) * k) >> 8, packed back to bytes.
; For the alpha pair (a, b) = (al, bh):
;	n = 255 - (((255 - a) * (255 - b)) >> 8)
;	k = 255 if a == n, otherwise (a * 256) / n
; in:  al, bh - alpha pair; xmm0 - interleave/pack source (merge_8a clears it before the call)
;      xmm2, xmm3 - 16-bit lanes
; out: xmm3 - packed bytes; lane 1 is taken from bx, whose high byte holds n at that point
; clobbers eax, bh, xmm1, xmm2, xmm7
proc	xcf._.blend_gray

	xchg		al, bh			; al = b, bh = a
	mov		ah, bh			; ah = a
	not		ax			; al = 255 - b, ah = 255 - a
	mul		ah
	not		ah			; ah = n
	xchg		ah, bh			; bh = n, ah = a

	mov		al, 0xff
	cmp		ah, bh
	je		.ratio_ready
	not		al			; ax = a * 256
	div		bh
  .ratio_ready:
	mov		ah, al

	movd		xmm1, eax
	punpcklbw	xmm1, xmm1
	punpcklbw	xmm1, xmm0

	psubw		xmm3, xmm2

	; k += k >> 7
	movq		xmm7, xmm1
	psrlw		xmm7, 7
	paddw		xmm1, xmm7

	pmullw		xmm3, xmm1
	psllw		xmm2, 8
	paddw		xmm3, xmm2
	pinsrw		xmm3, ebx, 1
	psrlw		xmm3, 8
	packuswb	xmm3, xmm0

	ret
endp


; Merges a rectangle of 32-bit pixels: for every pixel (pair) the routine at edx is
; called on xmm2/xmm3 and its result is blended by xcf._.blend_rgb.
; in:  esi - pixels loaded into xmm3, advanced by _img_total_bpl after each row
;      edi - pixels loaded into xmm2 and overwritten with the result,
;            advanced by _bottom_total_bpl after each row
;      edx - address of the compositing routine
;      _copy_width, _copy_height - rectangle size in pixels and rows
; Alpha is byte 3 of each pixel. blend_rgb receives
;      al / ah           = min(alpha at [esi], alpha at [edi]) of pixel 0 / pixel 1
;      bh / ebx[16..23]  = alpha at [edi] of pixel 0 / pixel 1, bl = 0
; clobbers eax, ebx, ecx, esi, edi, xmm0..xmm4 and whatever the called routines clobber
proc	xcf._.merge_32 _copy_width, _copy_height, _img_total_bpl, _bottom_total_bpl

	pxor		xmm0, xmm0

  .line:
	mov		ecx, [_copy_width]
	bt		ecx, 0
	jnc		.even
  .odd:
	; odd width: handle one leading pixel, the upper pixel slot stays zero
	movd		xmm2, [edi]
	movd		xmm3, [esi]
	add		esi, 4

	movdqa		xmm4, xmm2
	pminub		xmm4, xmm3
	pextrw		eax, xmm4, 3
	pextrw		ebx, xmm4, 1
	mov		al, bh			; al = min alpha 0, ah = min alpha 1

	push		eax
	pextrw		eax, xmm2, 3		; ah = alpha 1 at [edi]
	pextrw		ebx, xmm2, 1		; bh = alpha 0 at [edi]
	shr		eax, 8
	shl		eax, 16			; alpha 1 -> bits 16..23
	xor		bl, bl
	or		ebx, eax		; ebx = 00 a1 a0 00, as blend_rgb pairs al with bh
	pop		eax

	call		edx
	call		xcf._.blend_rgb
	movd		[edi], xmm3
	add		edi, 4

	cmp		ecx, 1
	je		.next_line		; single-pixel row: still advance to the next row

  .even:
	sub		ecx, 2
  .pixel:
	movq		xmm2, [edi]
	movq		xmm3, [esi]
	add		esi, 8

	movdqa		xmm4, xmm2
	pminub		xmm4, xmm3
	pextrw		eax, xmm4, 3
	pextrw		ebx, xmm4, 1
	mov		al, bh			; al = min alpha 0, ah = min alpha 1

	push		eax
	pextrw		eax, xmm2, 3		; ah = alpha 1 at [edi]
	pextrw		ebx, xmm2, 1		; bh = alpha 0 at [edi]
	shr		eax, 8
	shl		eax, 16			; alpha 1 -> bits 16..23
	xor		bl, bl
	or		ebx, eax		; ebx = 00 a1 a0 00
	pop		eax

	call		edx
	call		xcf._.blend_rgb
	movq		[edi], xmm3
	add		edi, 8
	sub		ecx, 2
	jns		.pixel
  .next_line:
	add		esi, [_img_total_bpl]
	add		edi, [_bottom_total_bpl]
	dec		[_copy_height]
	jnz		.line
  .done:
	ret
endp


; Merges a rectangle of 16-bit pixels (byte 1 is used as alpha): for every pixel the
; routine at edx is called on xmm2/xmm3 and its result is blended by xcf._.blend_gray.
; in:  esi - pixels loaded into xmm3 (lodsw), advanced by _img_total_bpl after each row
;      edi - pixels loaded into xmm2 and overwritten (stosw),
;            advanced by _bottom_total_bpl after each row
;      edx - address of the compositing routine
; blend_gray receives al = min(alpha at [esi], alpha at [edi]) and bh = alpha at [edi].
proc	xcf._.merge_8a _copy_width, _copy_height, _img_total_bpl, _bottom_total_bpl
  .row:
	mov		ecx, [_copy_width]
  .next:
	pxor		xmm0, xmm0		; re-zeroed for every pixel
	mov		bx,  word[edi]
	lodsw
	movd		xmm2, ebx
	movd		xmm3, eax
	shr		eax, 8			; al = alpha of the pixel from [esi]
	cmp		al, bh
	jbe		.alpha_ready
	mov		al, bh			; keep the smaller alpha
  .alpha_ready:
	call		edx
	call		xcf._.blend_gray
	movd		eax, xmm3
	stosw
	dec		ecx
	jnz		.next
	add		esi, [_img_total_bpl]
	add		edi, [_bottom_total_bpl]
	dec		[_copy_height]
	jnz		.row
	ret
endp


; Composites a rectangle of 32-bit pixels from [esi] over [edi] (alpha is byte 3).
; For top alpha a and bottom alpha b of a pixel:
;	n = 255 - (((255 - a) * (255 - b)) >> 8)	resulting alpha
;	k = 255 if a == n, otherwise (a * 256) / n	blend ratio
;	colour = (bottom * 256 + (top - bottom) * k) >> 8
; in:  esi - top pixels, advanced by _img_total_bpl after each row
;      edi - bottom pixels, overwritten with the result,
;            advanced by _bottom_total_bpl after each row
;      _copy_width, _copy_height - rectangle size in pixels and rows
; Both the single-pixel path and the two-pixel path store n as the resulting alpha.
; clobbers eax, ebx, ecx, esi, edi, xmm0..xmm3
proc	xcf._.composite_rgb_00 _copy_width, _copy_height, _bottom_total_bpl, _img_total_bpl

	pxor		xmm0, xmm0

  .line:
	mov		ecx, [_copy_width]
	bt		ecx, 0
	jnc		.even
  .odd:
	; odd width: one leading pixel, loaded as exactly 4 bytes
	movd		xmm2, [edi]
	movd		xmm3, [esi]
	add		esi, 4

	pextrw		eax, xmm3, 1
	shr		eax, 8			; al = a
	pextrw		ebx, xmm2, 1
	xor		bl, bl			; bh = b, bl = 0

	xchg		al, bh			; al = b, bh = a
	mov		ah, bh
	not		ax			; al = 255 - b, ah = 255 - a
	mul		ah
	not		ah			; ah = n
	xchg		ah, bh			; bh = n, ah = a

	mov		al, 0xff
	cmp		ah, bh
	je		@f
	inc		al			; ax = a * 256
	div		bh
    @@:
	mov		ah, al
	movd		xmm1, eax

	punpcklbw	xmm1, xmm1
	punpcklbw	xmm1, xmm0
	punpcklbw	xmm2, xmm0
	punpcklbw	xmm3, xmm0

	psubsw		xmm3, xmm2
	pmullw		xmm3, xmm1
	psllw		xmm2, 8
	paddw		xmm3, xmm2
	pinsrw		xmm3, ebx, 3		; alpha lane = n << 8
	psrlw		xmm3, 8
	packuswb	xmm3, xmm0

	movd		[edi], xmm3
	add		edi, 4

	cmp		ecx, 1
	je		.next_line		; single-pixel row: still advance to the next row

  .even:
	sub		ecx, 2
  .pixel:
	movq		xmm2, [edi]
	movq		xmm3, [esi]
	add		esi, 8

	pextrw		eax, xmm3, 3
	pextrw		ebx, xmm3, 1
	mov		al, bh			; al = a0, ah = a1

	push		eax
	pextrw		eax, xmm2, 3		; ah = b1
	pextrw		ebx, xmm2, 1		; bh = b0
	shr		eax, 8
	shl		eax, 16			; b1 -> bits 16..23
	xor		bl, bl
	or		ebx, eax		; ebx = 00 b1 b0 00
	pop		eax

	push		eax ebx

	; pixel 0: al = a0, bh = b0
	xchg		al, bh
	mov		ah, bh
	not		ax
	mul		ah
	not		ah
	xchg		ah, bh			; bh = n0

	mov		al, 0xff
	cmp		ah, bh
	je		@f
	inc		al
	div		bh
    @@:
	mov		ah, al
	movd		xmm1, eax		; low word = k0 k0

	mov		byte [esp + 1], bh	; saved ebx: b0 is no longer needed, keep n0 there
	pop		ebx eax			; ebx = 00 b1 n0 00, eax = 00 00 a1 a0

	; pixel 1: al = a1, bh = b1, bl carries n0
	shr		eax, 8
	shr		ebx, 8

	xchg		al, bh
	mov		ah, bh
	not		ax
	mul		ah
	not		ah
	xchg		ah, bh			; bh = n1

	mov		al, 0xff
	cmp		ah, bh
	je		@f
	inc		al
	div		bh
    @@:
	mov		ah, al
	pinsrw		xmm1, eax, 1		; low dword = k1 k1 k0 k0

	xchg		bl, bh			; bh = n0, bl = n1

	punpcklbw	xmm1, xmm1
	punpcklbw	xmm1, xmm0
	punpcklbw	xmm2, xmm0
	punpcklbw	xmm3, xmm0

	psubsw		xmm3, xmm2
	pmullw		xmm3, xmm1
	psllw		xmm2, 8
	paddw		xmm3, xmm2
	pinsrw		xmm3, ebx, 3		; high byte = n0
	shl		ebx, 8
	pinsrw		xmm3, ebx, 7		; high byte = n1
	psrlw		xmm3, 8
	packuswb	xmm3, xmm0

	movq		[edi], xmm3
	add		edi, 8
	sub		ecx, 2
	jns		.pixel
  .next_line:
	add		esi, [_img_total_bpl]
	add		edi, [_bottom_total_bpl]
	dec		[_copy_height]
	jnz		.line
  .done:
	ret
endp


; Composites a rectangle of 16-bit pixels from [esi] over [edi] (byte 1 is used as alpha).
; For top alpha a and bottom alpha b of a pixel:
;	n = 255 - (((255 - a) * (255 - b)) >> 8)	stored as the resulting alpha
;	k = 255 if a == n, otherwise (a * 256) / n
;	byte 0 = (bottom * 256 + (top - bottom) * k) >> 8
; in:  esi - top pixels (lodsw), advanced by _img_total_bpl after each row
;      edi - bottom pixels, overwritten (stosw), advanced by _bottom_total_bpl after each row
proc	xcf._.composite_gray_00 _copy_width, _copy_height, _bottom_total_bpl, _img_total_bpl

  .row:
	mov		ecx, [_copy_width]
  .next:
	pxor		xmm0, xmm0
	mov		bx, [edi]
	lodsw
	movd		xmm2, ebx
	movd		xmm3, eax

	shr		eax, 8			; al = a

	xchg		al, bh			; al = b, bh = a
	mov		ah, bh
	not		ax			; al = 255 - b, ah = 255 - a
	mul		ah
	not		ah			; ah = n
	xchg		ah, bh			; bh = n, ah = a

	mov		al, 0xff
	cmp		ah, bh
	je		.ratio_ready
	not		al			; ax = a * 256
	div		bh
  .ratio_ready:
	mov		ah, al

	movd		xmm1, eax
	punpcklbw	xmm1, xmm1
	punpcklbw	xmm1, xmm0
	punpcklbw	xmm2, xmm0
	punpcklbw	xmm3, xmm0

	psubw		xmm3, xmm2
	pmullw		xmm3, xmm1
	psllw		xmm2, 8
	paddw		xmm3, xmm2
	pinsrw		xmm3, ebx, 1		; high byte of bx = n
	psrlw		xmm3, 8
	packuswb	xmm3, xmm0
	movd		eax, xmm3
	stosw

	dec		ecx
	jnz		.next
	add		esi, [_img_total_bpl]
	add		edi, [_bottom_total_bpl]
	dec		[_copy_height]
	jnz		.row

	ret
endp


; Copies a rectangle of 16-bit pixels from [esi] to [edi] with a one-bit alpha test:
; a source pixel whose byte 1 has bit 7 set is stored with byte 1 forced to 0xff,
; otherwise the pixel already at [edi] is written back unchanged.
; in:  esi - source pixels (lodsw), advanced by _img_total_bpl after each row
;      edi - destination pixels (stosw), advanced by _bottom_total_bpl after each row
proc	xcf._.composite_indexed_00 _copy_width, _copy_height, _bottom_total_bpl, _img_total_bpl

  .row:
	mov		ecx, [_copy_width]
  .next:
	mov		bx, [edi]
	lodsw

	or		ah, 0x7f		; sign flag = bit 7 of the source byte 1
	js		.store
	mov		ax, bx
  .store:
	stosw

	dec		ecx
	jnz		.next
	add		esi, [_img_total_bpl]
	add		edi, [_bottom_total_bpl]
	dec		[_copy_height]
	jnz		.row
	ret
endp


; Dithered copy of a rectangle of 32-bit pixels from [esi] to [edi] (alpha is byte 3).
; A pseudo-random byte r is produced per pixel by state = state * random_a + random_c
; (state starts at random_b, r = bits 17..24 of the new state). The source pixel is
; stored with alpha forced to 0xff when its alpha is non-zero and r <= alpha;
; otherwise the pixel already at [edi] is written back.
; in:  esi - source pixels, advanced by _img_total_bpl after each row
;      edi - destination pixels, advanced by _bottom_total_bpl after each row
; General-purpose registers are preserved (pushad/popad); xmm0, xmm1, xmm2, xmm4 are used.
proc	xcf._.composite_rgb_01 _copy_width, _copy_height, _bottom_total_bpl, _img_total_bpl
	pushad

	movd		xmm1, [xcf._.random_a]	; multiplier
	movd		xmm2, [xcf._.random_c]	; increment
	movd		xmm4, [xcf._.random_b]	; generator state

  .row:
	mov		ecx, [_copy_width]
  .next:
	; advance the generator, keep the low 32 bits as the new state
	movq		xmm0, xmm4
	pmuludq		xmm0, xmm1
	paddq		xmm0, xmm2
	movd		edx, xmm0
	movd		xmm4, edx
	pxor		xmm0, xmm0

	mov		ebx, [edi]
	lodsd

	rol		eax, 8			; al = source alpha
	test		al, al
	jz		.keep_bottom
	shr		edx, 17
	cmp		dl, al
	ja		.keep_bottom
	ror		eax, 8
	or		eax, 0xff000000
	jmp		.store
  .keep_bottom:
	mov		eax, ebx
  .store:
	stosd
	dec		ecx
	jnz		.next
	add		esi, [_img_total_bpl]
	add		edi, [_bottom_total_bpl]
	dec		[_copy_height]
	jnz		.row

  .quit:
	popad
	ret
endp


; Dithered copy of a rectangle of 16-bit pixels from [esi] to [edi] (byte 1 is alpha).
; A pseudo-random byte r is produced per pixel by state = state * random_a + random_c
; (state starts at random_b, r = bits 17..24 of the new state). The source pixel is
; stored with alpha forced to 0xff when its alpha is non-zero and r <= alpha;
; otherwise the pixel already at [edi] is written back.
; in:  esi - source pixels (lodsw), advanced by _img_total_bpl after each row
;      edi - destination pixels (stosw), advanced by _bottom_total_bpl after each row
; General-purpose registers are preserved (pushad/popad); xmm0, xmm1, xmm2, xmm4 are used.
proc	xcf._.composite_gray_01 _copy_width, _copy_height, _bottom_total_bpl, _img_total_bpl
	pushad

	movd		xmm4, [xcf._.random_b]	; generator state
	movd		xmm1, [xcf._.random_a]	; multiplier
	movd		xmm2, [xcf._.random_c]	; increment

  .line:
	mov		ecx, [_copy_width]
  .pixel:
	mov		bx, [edi]		; only the 2 bytes of this pixel are read
	lodsw

	; advance the generator, keep the low 32 bits as the new state
	movq		xmm0, xmm4
	pmuludq		xmm0, xmm1
	paddq		xmm0, xmm2
	movd		edx, xmm0
	movd		xmm4, edx
	pxor		xmm0, xmm0

	test		ah, ah
	jz		@f
	shr		edx, 17
	cmp		dl, ah
	ja		@f
	or		ax, 0xff00
	jmp		.done
    @@:
	mov		ax, bx
  .done:
	stosw
	dec		ecx
	jnz		.pixel
	add		esi, [_img_total_bpl]
	add		edi, [_bottom_total_bpl]
	dec		[_copy_height]
	jnz		.line

  .quit:
	popad
	ret
endp


; in:  xmm2, xmm3 - packed bytes in the low qword; xmm0 - interleave source (zeroed by callers in this file)
; out: xmm2 - its bytes widened to 16-bit lanes
;      xmm3 - per lane (xmm3 * xmm2) >> 8
proc	xcf._.composite_rgb_03			; Multiply

	punpcklbw	xmm3, xmm0
	punpcklbw	xmm2, xmm0
	pmullw		xmm3, xmm2
	psrlw		xmm3, 8

	ret
endp


; in:  xmm2, xmm3 - packed bytes in the low qword; xmm0 - interleave source (zeroed by callers in this file)
; out: xmm2 - its bytes widened to 16-bit lanes
;      xmm3 - per lane 255 - (((255 - xmm2) * (255 - xmm3)) >> 8)
; clobbers xmm4, xmm5
proc	xcf._.composite_rgb_04			; Screen

	movdqu		xmm4, xword[xcf._.xmm_00ff]
	punpcklbw	xmm3, xmm0
	movdqa		xmm5, xmm4
	punpcklbw	xmm2, xmm0
	psubw		xmm5, xmm3		; 255 - xmm3
	movdqa		xmm3, xmm4		; 255 in every lane
	psubw		xmm4, xmm2		; 255 - xmm2
	pmullw		xmm4, xmm5
	psrlw		xmm4, 8
	psubw		xmm3, xmm4
	ret
endp


; in:  xmm2, xmm3 - packed bytes in the low qword; xmm0 - interleave source (zeroed by callers in this file)
; out: xmm2 - its bytes widened to 16-bit lanes
;      xmm3 - per lane ((((xmm3 * (255 - xmm2)) >> 7) + xmm2) * xmm2) >> 8
; clobbers xmm4
proc	xcf._.composite_rgb_05			; Overlay

	movdqu		xmm4, xword[xcf._.xmm_00ff]
	punpcklbw	xmm2, xmm0
	punpcklbw	xmm3, xmm0
	psubw		xmm4, xmm2		; 255 - xmm2
	pmullw		xmm3, xmm4
	psrlw		xmm3, 7
	paddw		xmm3, xmm2
	pmullw		xmm3, xmm2
	psrlw		xmm3, 8

	ret
endp


; in:  xmm2, xmm3 - packed bytes; xmm0 - interleave source (zeroed by callers in this file)
; out: xmm2 - its low 8 bytes widened to 16-bit lanes
;      xmm3 - per byte max(xmm3, xmm2) - min(xmm3, xmm2), widened the same way
; clobbers xmm4
proc	xcf._.composite_rgb_06			; Difference

	movdqa		xmm4, xmm3
	pmaxub		xmm3, xmm2
	pminub		xmm4, xmm2
	psubusb		xmm3, xmm4
	punpcklbw	xmm3, xmm0
	punpcklbw	xmm2, xmm0

	ret
endp


; in:  xmm2, xmm3 - packed bytes; xmm0 - interleave source (zeroed by callers in this file)
; out: xmm2 - its low 8 bytes widened to 16-bit lanes
;      xmm3 - per byte unsigned saturating xmm3 + xmm2, widened the same way
proc	xcf._.composite_rgb_07			; Addition

	paddusb		xmm3, xmm2
	punpcklbw	xmm3, xmm0
	punpcklbw	xmm2, xmm0

	ret
endp


; in:  xmm2, xmm3 - packed bytes; xmm0 - interleave source (zeroed by callers in this file)
; out: xmm2 - its low 8 bytes widened to 16-bit lanes
;      xmm3 - per byte unsigned saturating xmm2 - xmm3, widened the same way
; clobbers xmm4
proc	xcf._.composite_rgb_08			; Subtract

	movdqa		xmm4, xmm2
	psubusb		xmm4, xmm3
	movdqa		xmm3, xmm4		; only the low qword survives the interleave below
	punpcklbw	xmm3, xmm0
	punpcklbw	xmm2, xmm0

	ret
endp


; in:  xmm2, xmm3 - packed bytes; xmm0 - interleave source (zeroed by callers in this file)
; out: xmm2 - its low 8 bytes widened to 16-bit lanes
;      xmm3 - per byte min(xmm3, xmm2), widened the same way
proc	xcf._.composite_rgb_09			; Darken Only

	pminub		xmm3, xmm2
	punpcklbw	xmm3, xmm0
	punpcklbw	xmm2, xmm0

	ret
endp


; in:  xmm2, xmm3 - packed bytes; xmm0 - interleave source (zeroed by callers in this file)
; out: xmm2 - its low 8 bytes widened to 16-bit lanes
;      xmm3 - per byte max(xmm3, xmm2), widened the same way
proc	xcf._.composite_rgb_10			; Lighten Only

	pmaxub		xmm3, xmm2
	punpcklbw	xmm3, xmm0
	punpcklbw	xmm2, xmm0

	ret
endp


; Processes the two 32-bit pixels in the low qword of xmm2 and xmm3, one at a time:
; both pixels go through xcf._.rgb2hsv, bytes of the two converted values are mixed
; (see below) and the mix goes through xcf._.hsv2rgb. The two results are stored
; in the low qword of xmm3; finally xmm2 and xmm3 are widened to 16-bit lanes.
; Mixing: the low 16 bits always come from the converted xmm2 pixel; byte 2 comes from
; the converted xmm3 pixel, unless byte 1 of that value is zero, in which case byte 2
; is taken from the converted xmm2 pixel as well.
; eax, ebx, ecx, edx are saved and restored; xmm1 is used as scratch.
; NOTE(review): presumably the converted value holds H in byte 2, S in byte 1 and V in
; byte 0, and the helpers take/return it in eax while preserving ebx - confirm against
; xcf._.rgb2hsv / xcf._.hsv2rgb, which are defined outside this part of the file.
proc	xcf._.composite_rgb_11			; Hue (H of HSV)
	push		eax ebx ecx edx

	; pixel in the low dword
	movd		eax, xmm3
	movd		ebx, xmm2

	; convert both values, each one passing through eax
	call		xcf._.rgb2hsv
	xchg		eax, ebx
	call		xcf._.rgb2hsv
	xchg		eax, ebx

	; byte 1 of eax is zero: take byte 2 from ebx
	test		ah, ah
	jnz		@f
	ror		eax, 8
	ror		ebx, 8
	mov		ah, bh
	rol		eax, 8
	rol		ebx, 8
    @@:
	mov		ax, bx

	call		xcf._.hsv2rgb

	; keep the first result on the stack while the second pixel is processed
	push		eax

	; pixel in the second dword
	movq		xmm1, xmm3
	psrldq		xmm1, 4
	movd		eax, xmm1
	movq		xmm1, xmm2
	psrldq		xmm1, 4
	movd		ebx, xmm1

	call		xcf._.rgb2hsv
	xchg		eax, ebx
	call		xcf._.rgb2hsv
	xchg		eax, ebx

	test		ah, ah
	jnz		@f
	ror		eax, 8
	ror		ebx, 8
	mov		ah, bh
	rol		eax, 8
	rol		ebx, 8
    @@:
	mov		ax, bx

	call		xcf._.hsv2rgb

	; xmm3 = second result in dword 1, first result in dword 0
	movd		xmm3, eax
	pslldq		xmm3, 4
	pop		eax
	movd		xmm1, eax
	paddq		xmm3, xmm1

	punpcklbw	xmm2, xmm0
	punpcklbw	xmm3, xmm0

  .quit:
	pop		edx ecx ebx eax
	ret
endp


; Processes the two 32-bit pixels in the low qword of xmm2 and xmm3, one at a time:
; both pixels go through xcf._.rgb2hsv, then byte 2 and byte 0 of the converted xmm3
; pixel are replaced with those of the converted xmm2 pixel (byte 1 is kept) and the
; mix goes through xcf._.hsv2rgb. The two results are stored in the low qword of xmm3;
; finally xmm2 and xmm3 are widened to 16-bit lanes.
; eax, ebx, ecx, edx are saved and restored; xmm1 is used as scratch.
; NOTE(review): presumably the converted value holds H in byte 2, S in byte 1 and V in
; byte 0, and the helpers take/return it in eax while preserving ebx - confirm against
; xcf._.rgb2hsv / xcf._.hsv2rgb, which are defined outside this part of the file.
proc	xcf._.composite_rgb_12			; Saturation (S of HSV)
	push		eax ebx ecx edx

	; pixel in the low dword
	movd		eax, xmm3
	movd		ebx, xmm2

	; convert both values, each one passing through eax
	call		xcf._.rgb2hsv
	xchg		eax, ebx
	call		xcf._.rgb2hsv
	xchg		eax, ebx

	; byte 2 and byte 0 come from ebx
	ror		eax, 8
	ror		ebx, 8
	mov		ah, bh
	rol		eax, 8
	rol		ebx, 8
	mov		al, bl

	call		xcf._.hsv2rgb

	; keep the first result on the stack while the second pixel is processed
	push		eax
	movq		xmm1, xmm3
	psrldq		xmm1, 4
	movd		eax, xmm1
	movq		xmm1, xmm2
	psrldq		xmm1, 4
	movd		ebx, xmm1

	call		xcf._.rgb2hsv
	xchg		eax, ebx
	call		xcf._.rgb2hsv
	xchg		eax, ebx

	ror		eax, 8
	ror		ebx, 8
	mov		ah, bh
	rol		eax, 8
	rol		ebx, 8
	mov		al, bl

	call		xcf._.hsv2rgb


	; xmm3 = second result in dword 1, first result in dword 0
	movd		xmm3, eax
	pslldq		xmm3, 4
	pop		eax
	movd		xmm1, eax
	paddq		xmm3, xmm1

	punpcklbw	xmm2, xmm0
	punpcklbw	xmm3, xmm0

  .quit:
	pop		edx ecx ebx eax
	ret
endp


; Processes the two 32-bit pixels in the low qword of xmm2 and xmm3, one at a time:
; both pixels go through xcf._.rgb2hsl, then byte 0 of the converted xmm3 pixel is
; replaced with byte 0 of the converted xmm2 pixel and the mix goes through
; xcf._.hsl2rgb. The two results are stored in the low qword of xmm3; finally xmm2
; and xmm3 are widened to 16-bit lanes.
; eax, ebx, ecx, edx are saved and restored; xmm1 is used as scratch.
; NOTE(review): presumably byte 0 of the converted value is L, and the helpers
; take/return the value in eax while preserving ebx - confirm against
; xcf._.rgb2hsl / xcf._.hsl2rgb, which are defined outside this part of the file.
proc	xcf._.composite_rgb_13			; Color (H and S of HSL)
	push		eax ebx ecx edx

	; pixel in the low dword
	movd		eax, xmm3
	movd		ebx, xmm2

	; convert both values, each one passing through eax
	call		xcf._.rgb2hsl
	xchg		eax,    ebx
	call		xcf._.rgb2hsl
	xchg		eax,    ebx

	; byte 0 comes from ebx
	mov		al, bl

	call		xcf._.hsl2rgb

	; keep the first result on the stack while the second pixel is processed
	push		eax
	movq		xmm1, xmm3
	psrldq		xmm1, 4
	movd		eax, xmm1
	movq		xmm1, xmm2
	psrldq		xmm1, 4
	movd		ebx, xmm1

	call		xcf._.rgb2hsl
	xchg		eax,    ebx
	call		xcf._.rgb2hsl
	xchg		eax,    ebx

	mov		al, bl

	call		xcf._.hsl2rgb

	; xmm3 = second result in dword 1, first result in dword 0
	movd		xmm3, eax
	pslldq		xmm3, 4
	pop		eax
	movd		xmm1, eax
	paddq		xmm3, xmm1

	punpcklbw	xmm2, xmm0
	punpcklbw	xmm3, xmm0

  .quit:
	pop		edx ecx ebx eax
	ret
endp


; Processes the two 32-bit pixels in the low qword of xmm2 and xmm3, one at a time:
; both pixels go through xcf._.rgb2hsv, then bytes 1 and 2 of the converted xmm3 pixel
; are replaced with those of the converted xmm2 pixel (byte 0 is kept) and the mix
; goes through xcf._.hsv2rgb. The two results are stored in the low qword of xmm3;
; finally xmm2 and xmm3 are widened to 16-bit lanes.
; eax, ebx, ecx, edx are saved and restored; xmm1 is used as scratch.
; NOTE(review): presumably the converted value holds H in byte 2, S in byte 1 and V in
; byte 0, and the helpers take/return it in eax while preserving ebx - confirm against
; xcf._.rgb2hsv / xcf._.hsv2rgb, which are defined outside this part of the file.
proc	xcf._.composite_rgb_14			; Value (V of HSV)
	push		eax ebx ecx edx

	; pixel in the low dword
	movd		eax, xmm3
	movd		ebx, xmm2

	; convert both values, each one passing through eax
	call		xcf._.rgb2hsv
	xchg		eax, ebx
	call		xcf._.rgb2hsv
	xchg		eax, ebx

	; after the rotate ax/bx hold bytes 1 and 2: copy them from ebx
	ror		eax, 8
	ror		ebx, 8
	mov		ax, bx
	rol		eax, 8
	rol		ebx, 8

	call		xcf._.hsv2rgb

	; keep the first result on the stack while the second pixel is processed
	push		eax
	movq		xmm1, xmm3
	psrldq		xmm1, 4
	movd		eax, xmm1
	movq		xmm1, xmm2
	psrldq		xmm1, 4
	movd		ebx, xmm1

	call		xcf._.rgb2hsv
	xchg		eax, ebx
	call		xcf._.rgb2hsv
	xchg		eax, ebx

	ror		eax, 8
	ror		ebx, 8
	mov		ax, bx
	rol		eax, 8
	rol		ebx, 8

	call		xcf._.hsv2rgb

	; xmm3 = second result in dword 1, first result in dword 0
	movd		xmm3, eax
	pslldq		xmm3, 4
	pop		eax
	movd		xmm1, eax
	paddq		xmm3, xmm1

	punpcklbw	xmm2, xmm0
	punpcklbw	xmm3, xmm0

  .quit:
	pop		edx ecx ebx eax
	ret
endp


; For the two 32-bit pixels in the low qword of xmm2 / xmm3, bytes 0..2 of xmm3 become
;	0x00                 if both the xmm3 byte d and the xmm2 byte v are zero
;	0xff                 if d is zero and v is not, or if v >= d
;	(v * 256) / d        otherwise
; Byte 3 of each result is a copy of byte 0. Afterwards xmm2 and xmm3 are widened to
; 16-bit lanes with xmm0 (zeroed by callers in this file).
; eax, ebx, ecx are saved and restored; xmm1 is used as scratch.
proc	xcf._.composite_rgb_15			; Divide
	push		eax ebx ecx

	; ---- pixel in the low dword: eax = dividend pixel, ebx = divisor pixel ----
	movd		ebx, xmm3
	movd		eax, xmm2
	rol		eax, 8
	rol		ebx, 8

	mov		ecx, 3

  .channel_lo:
	rol		eax, 8			; next byte into al / bl, previous result into byte 2
	rol		ebx, 8
	shl		ax, 8			; ax = v * 256, al = 0
	test		bl, bl
	jz		.zero_lo
	cmp		ah, bl
	jae		.full_lo
	div		bl
	jmp		.store_lo
  .zero_lo:
	test		ah, ah
	jz		.store_lo		; 0 / 0: al is already 0
  .full_lo:
	mov		al, 0xff
  .store_lo:
	mov		ah, al
	loop		.channel_lo

	ror		eax, 8

	push		eax			; first result

	; ---- pixel in the second dword ----
	movq		xmm1, xmm3
	psrldq		xmm1, 4
	movd		ebx, xmm1
	movq		xmm1, xmm2
	psrldq		xmm1, 4
	movd		eax, xmm1

	rol		eax, 8
	rol		ebx, 8

	mov		ecx, 3

  .channel_hi:
	rol		eax, 8
	rol		ebx, 8
	shl		ax, 8
	test		bl, bl
	jz		.zero_hi
	cmp		ah, bl
	jae		.full_hi
	div		bl
	jmp		.store_hi
  .zero_hi:
	test		ah, ah
	jz		.store_hi
  .full_hi:
	mov		al, 0xff
  .store_hi:
	mov		ah, al
	loop		.channel_hi

	ror		eax, 8

	; xmm3 = second result in dword 1, first result in dword 0
	movd		xmm3, eax
	pslldq		xmm3, 4
	pop		eax
	movd		xmm1, eax
	por		xmm3, xmm1

	punpcklbw	xmm2, xmm0
	punpcklbw	xmm3, xmm0

	pop		ecx ebx eax
	ret
endp


; For the two 32-bit pixels in the low qword of xmm2 / xmm3, with v = xmm2 byte and
; d = 255 - xmm3 byte, bytes 0..2 of xmm3 become
;	0x00                 if both d and v are zero
;	0xff                 if d is zero and v is not, or if v >= d
;	(v * 256) / d        otherwise
; Byte 3 of each result is a copy of byte 0. Afterwards xmm2 and xmm3 are widened to
; 16-bit lanes with xmm0 (zeroed by callers in this file).
; eax, ebx, ecx are saved and restored; xmm1 is used as scratch.
proc	xcf._.composite_rgb_16			; Dodge
	push		eax ebx ecx

	; ---- pixel in the low dword: eax = dividend pixel, ebx = pixel to invert ----
	movd		ebx, xmm3
	movd		eax, xmm2
	rol		eax, 8
	rol		ebx, 8

	mov		ecx, 3

  .channel_lo:
	rol		eax, 8			; next byte into al / bl, previous result into byte 2
	rol		ebx, 8
	shl		ax, 8			; ax = v * 256, al = 0
	not		bl			; d = 255 - byte
	test		bl, bl
	jz		.zero_lo
	cmp		ah,  bl
	jae		.full_lo
	div		bl
	jmp		.store_lo
  .zero_lo:
	test		ah, ah
	jz		.store_lo		; 0 / 0: al is already 0
  .full_lo:
	mov		al, 0xff
  .store_lo:
	mov		ah, al
	loop		.channel_lo

	ror		eax, 8

	push		eax			; first result

	; ---- pixel in the second dword ----
	movq		xmm1, xmm3
	psrldq		xmm1, 4
	movd		ebx, xmm1
	movq		xmm1, xmm2
	psrldq		xmm1, 4
	movd		eax, xmm1

	rol		eax, 8
	rol		ebx, 8

	mov		ecx, 3

  .channel_hi:
	rol		eax, 8
	rol		ebx, 8
	shl		ax, 8
	not		bl
	test		bl, bl
	jz		.zero_hi
	cmp		ah,  bl
	jae		.full_hi
	div		bl
	jmp		.store_hi
  .zero_hi:
	test		ah, ah
	jz		.store_hi
  .full_hi:
	mov		al, 0xff
  .store_hi:
	mov		ah, al
	loop		.channel_hi

	ror		eax, 8

	; xmm3 = second result in dword 1, first result in dword 0
	movd		xmm3, eax
	pslldq		xmm3, 4
	pop		eax
	movd		xmm1, eax
	por		xmm3, xmm1

	punpcklbw	xmm2, xmm0
	punpcklbw	xmm3, xmm0

	pop		ecx ebx eax
	ret
endp


; For the two 32-bit pixels in the low qword of xmm2 / xmm3, with v = 255 - xmm2 byte
; and d = xmm3 byte, a quotient q is formed as
;	0x00                 if both d and v are zero
;	0xff                 if d is zero and v is not, or if v >= d
;	(v * 256) / d        otherwise
; and bytes 0..2 of xmm3 become 255 - q. Byte 3 of each result is q of byte 0.
; Afterwards xmm2 and xmm3 are widened to 16-bit lanes with xmm0 (zeroed by callers
; in this file).
; eax, ebx, ecx are saved and restored; xmm1 is used as scratch.
proc	xcf._.composite_rgb_17			; Burn
	push		eax ebx ecx

	; ---- pixel in the low dword: eax = pixel to invert, ebx = divisor pixel ----
	movd		ebx, xmm3
	movd		eax, xmm2
	rol		eax, 8
	rol		ebx, 8

	mov		ecx, 3

  .channel_lo:
	rol		eax, 8			; next byte into al / bl, previous result into byte 2
	rol		ebx, 8
	shl		ax, 8			; ah = byte, al = 0
	not		ah			; v = 255 - byte
	test		bl, bl
	jz		.zero_lo
	cmp		ah, bl
	jae		.full_lo
	div		bl
	jmp		.store_lo
  .zero_lo:
	test		ah, ah
	jz		.store_lo		; 0 / 0: al is already 0
  .full_lo:
	mov		al, 0xff
  .store_lo:
	mov		ah, al
	neg		ah
	add		ah, 0xff		; ah = 255 - q
	loop		.channel_lo

	ror		eax, 8

	push		eax			; first result

	; ---- pixel in the second dword ----
	movq		xmm1, xmm3
	psrldq		xmm1, 4
	movd		ebx, xmm1
	movq		xmm1, xmm2
	psrldq		xmm1, 4
	movd		eax, xmm1

	rol		eax, 8
	rol		ebx, 8

	mov		ecx, 3

  .channel_hi:
	rol		eax, 8
	rol		ebx, 8
	shl		ax, 8
	not		ah
	test		bl, bl
	jz		.zero_hi
	cmp		ah, bl
	jae		.full_hi
	div		bl
	jmp		.store_hi
  .zero_hi:
	test		ah, ah
	jz		.store_hi
  .full_hi:
	mov		al, 0xff
  .store_hi:
	mov		ah, al
	neg		ah
	add		ah, 0xff
	loop		.channel_hi

	ror		eax, 8

	; xmm3 = second result in dword 1, first result in dword 0
	movd		xmm3, eax
	pslldq		xmm3, 4
	pop		eax
	movd		xmm1, eax
	por		xmm3, xmm1

	punpcklbw	xmm2, xmm0
	punpcklbw	xmm3, xmm0

	pop		ecx ebx eax
	ret
endp


; For the two 32-bit pixels in the low qword of xmm2 / xmm3, with t = xmm3 byte and
; b = xmm2 byte, bytes 0..2 of xmm3 become
;	high byte of (t * b * 2)                             if t <= 127
;	255 - high byte of ((255 - t) * (255 - b) * 2)       otherwise
; Byte 3 of each result is whatever al holds after the last multiplication.
; Afterwards xmm2 and xmm3 are widened to 16-bit lanes with xmm0 (zeroed by callers
; in this file).
; eax, ebx, ecx are saved and restored; xmm1 is used as scratch.
proc	xcf._.composite_rgb_18			; Hard Light
	push		eax ebx ecx

	; ---- pixel in the low dword ----
	movd		ebx, xmm2
	movd		eax, xmm3
	rol		eax, 8
	rol		ebx, 8

	mov		ecx, 3

  .channel_lo:
	rol		eax, 8			; next byte into al / bl, previous result into byte 2
	rol		ebx, 8
	cmp		al, 127
	jbe		.dark_lo
	mov		ah, bl
	not		ah			; ah = 255 - b
	not		al			; al = 255 - t
	mul		ah
	shl		ax, 1
	neg		ah
	add		ah, 0xff		; ah = 255 - ah
	jmp		.next_lo
  .dark_lo:
	mul		bl
	shl		ax, 1
  .next_lo:
	loop		.channel_lo

	ror		eax, 8

	push		eax			; first result

	; ---- pixel in the second dword ----
	movq		xmm1, xmm3
	psrldq		xmm1, 4
	movd		eax, xmm1
	movq		xmm1, xmm2
	psrldq		xmm1, 4
	movd		ebx, xmm1

	rol		eax, 8
	rol		ebx, 8

	mov		ecx, 3

  .channel_hi:
	rol		eax, 8
	rol		ebx, 8
	cmp		al, 127
	jbe		.dark_hi
	mov		ah, bl
	not		ah
	not		al
	mul		ah
	shl		ax, 1
	neg		ah
	add		ah, 0xff
	jmp		.next_hi
  .dark_hi:
	mul		bl
	shl		ax, 1
  .next_hi:
	loop		.channel_hi

	ror		eax, 8

	; xmm3 = second result in dword 1, first result in dword 0
	movd		xmm3, eax
	pslldq		xmm3, 4
	pop		eax
	movd		xmm1, eax
	por		xmm3, xmm1

	punpcklbw	xmm2, xmm0
	punpcklbw	xmm3, xmm0

	pop		ecx ebx eax
	ret
endp


; in:  xmm2, xmm3 - packed bytes in the low qword; xmm0 - interleave/pack source (zeroed by callers in this file)
; out: xmm2 - its bytes widened to 16-bit lanes
;      xmm3 - per lane xmm2 - xmm3 + 128, clamped to 0..255 by the pack/unpack pair
; clobbers xmm4
proc	xcf._.composite_rgb_20			; Grain Extract

	punpcklbw	xmm2, xmm0
	punpcklbw	xmm3, xmm0
	; the constant is loaded with movdqu because nothing guarantees its 16-byte
	; alignment, which a memory operand of psubw would require
	movdqu		xmm4, xword[xcf._.xmm_0080]
	psubw		xmm3, xmm4		; xmm3 - 128
	movdqa		xmm4, xmm2
	psubw		xmm4, xmm3		; xmm2 - xmm3 + 128
	movdqa		xmm3, xmm4
	packuswb	xmm3, xmm0
	punpcklbw	xmm3, xmm0
	ret
endp


; in:  xmm2, xmm3 - packed bytes in the low qword; xmm0 - interleave/pack source (zeroed by callers in this file)
; out: xmm2 - its bytes widened to 16-bit lanes
;      xmm3 - per lane xmm3 + xmm2 - 128, clamped to 0..255
;             (psubusw clamps at 0, the pack/unpack pair clamps at 255)
; clobbers xmm4
proc	xcf._.composite_rgb_21			; Grain Merge

	punpcklbw	xmm2, xmm0
	punpcklbw	xmm3, xmm0
	paddw		xmm3, xmm2
	; the constant is loaded with movdqu because nothing guarantees its 16-byte
	; alignment, which a memory operand of psubusw would require
	movdqu		xmm4, xword[xcf._.xmm_0080]
	psubusw		xmm3, xmm4
	packuswb	xmm3, xmm0
	punpcklbw	xmm3, xmm0
	ret
endp


; starting numbers for pseudo-random number generator
; (composite_rgb_01 / composite_gray_01 compute state = state * random_a + random_c,
;  with random_b as the initial state)
xcf._.random_a		dd	1103515245
xcf._.random_b		dd	777
xcf._.random_c		dd	12345

; 128-bit constants for the SSE routines. No alignment directive precedes them,
; so they must not be assumed to be 16-byte aligned.
; xmm_0080 and xmm_00ff hold 128 and 255 in every 16-bit lane; xmm_8080, xmm_0100 and
; xmm_000000ff are not referenced by active code in this part of the file.
xcf._.xmm_8080		dq	0x8080808080808080, 0x8080808080808080
xcf._.xmm_0080		dq	0x0080008000800080, 0x0080008000800080
xcf._.xmm_00ff		dq	0x00ff00ff00ff00ff, 0x00ff00ff00ff00ff
xcf._.xmm_0100		dq	0x0100010001000100, 0x0100010001000100
xcf._.xmm_000000ff	dq	0x000000ff000000ff, 0x0000000000000000