proc blend_rgb
; Second stage of merge_32: alpha-blends one 32-bit pixel.
; in:  al  = alpha of the top pixel as prepared by the caller
;      bh  = alpha of the bottom pixel
;      mm0 = 0 (merge_32 clears it before calling)
;      mm2 = bottom pixel, mm3 = colour to blend in; both are expected to be
;            unpacked to words already (this routine does not unpack them)
; out: eax = packed result pixel, bh = resulting alpha
; clobbers: mm1, mm2, mm3, mm7, flags

        ; resulting alpha = 255 - high byte of (255 - bottom) * (255 - top)
        xchg    al,     bh              ; al = bottom alpha, bh = top alpha
        mov     ah,     bh
        not     ax                      ; both bytes become 255 - alpha
        mul     ah
        not     ah
        xchg    ah,     bh              ; bh = resulting alpha, ah = top alpha

        ; blend factor = 256 * top alpha / resulting alpha, 0xff when they match
        mov     al,     0xff
        cmp     ah,     bh
         je     .factor_ready
        xor     al,     al              ; ax = top alpha * 256
        div     bh
.factor_ready:

        ; broadcast the factor into all four words of mm1
        mov     ah,     al
        movd    mm1,    eax
        punpcklbw   mm1,    mm1
        punpcklbw   mm1,    mm0

        psubw       mm3,    mm2         ; top - bottom, per channel

        ; factors >= 128 get +1, so that 255 turns into an exact 256
        movq        mm7,    mm1
        psrlw       mm7,    7
        paddw       mm1,    mm7

        ; result = (bottom * 256 + (top - bottom) * factor) >> 8
        pmullw      mm3,    mm1
        psllw       mm2,    8
        paddw       mm3,    mm2
        pinsrw      mm3,    ebx,    3   ; word 3 = bx; the shift below leaves bh
        psrlw       mm3,    8
        packuswb    mm3,    mm0
        movd        eax,    mm3

        ret
endp


proc blend_gray
; Second stage of merge_8a: alpha-blends one 16-bit gray+alpha pixel.
; in:  al  = alpha of the top pixel as prepared by the caller
;      bh  = alpha of the bottom pixel
;      mm0 = 0 (merge_8a clears it before calling)
;      mm2 = bottom pixel, mm3 = value to blend in; both are expected to be
;            unpacked to words already (this routine does not unpack them)
; out: ax = packed result pixel (upper half of eax is not meaningful),
;      bh = resulting alpha
; clobbers: mm1, mm2, mm3, mm7, flags

        ; resulting alpha = 255 - high byte of (255 - bottom) * (255 - top)
        xchg    al,     bh              ; al = bottom alpha, bh = top alpha
        mov     ah,     bh
        not     ax                      ; both bytes become 255 - alpha
        mul     ah
        not     ah
        xchg    ah,     bh              ; bh = resulting alpha, ah = top alpha

        ; blend factor = 256 * top alpha / resulting alpha, 0xff when they match
        mov     al,     0xff
        cmp     ah,     bh
         je     .factor_ready
        xor     al,     al              ; ax = top alpha * 256
        div     bh
.factor_ready:

        ; broadcast the factor into all four words of mm1
        mov     ah,     al
        movd    mm1,    eax
        punpcklbw   mm1,    mm1
        punpcklbw   mm1,    mm0

        psubw       mm3,    mm2         ; top - bottom, per channel

        ; factors >= 128 get +1, so that 255 turns into an exact 256
        movq        mm7,    mm1
        psrlw       mm7,    7
        paddw       mm1,    mm7

        ; result = (bottom * 256 + (top - bottom) * factor) >> 8
        pmullw      mm3,    mm1
        psllw       mm2,    8
        paddw       mm3,    mm2
        pinsrw      mm3,    ebx,    1   ; word 1 = bx; the shift below leaves bh
        psrlw       mm3,    8
        packuswb    mm3,    mm0
        movd        eax,    mm3

        ret
endp


proc merge_32 _copy_width, _copy_height, _img_total_bpl, _bottom_total_bpl
; Runs a per-pixel blend routine over a rectangle of 32-bit pixels.
; in:  esi = source (top) pixels, edi = destination (bottom) pixels,
;      edx = address of the blend-mode routine called for every pixel
;      _copy_width       - pixels per row
;      _copy_height      - number of rows (the stack argument is counted down)
;      _img_total_bpl    - added to esi after each row
;      _bottom_total_bpl - added to edi after each row
; Both dimensions are assumed to be non-zero; string instructions are assumed
; to run with DF = 0.
.rgb_line:
        mov     ecx,    [_copy_width]
.rgb_pixel:
        lodsd                           ; eax = top pixel
        mov     ebx,    [edi]           ; ebx = bottom pixel
        pxor    mm0,    mm0             ; zero register for the unpack steps
        movd    mm3,    eax
        movd    mm2,    ebx

        shr     eax,    24              ; al = top alpha
        shr     ebx,    16              ; bh = bottom alpha
        cmp     al,     bh              ; al = min(top alpha, bottom alpha)
         jbe    .alpha_ok
        mov     al,     bh
.alpha_ok:
        call    edx                     ; blend mode: mm2 / mm3 -> mm3
        call    blend_rgb               ; alpha blend -> eax
        stosd

        dec     ecx
         jnz    .rgb_pixel
        add     esi,    [_img_total_bpl]
        add     edi,    [_bottom_total_bpl]
        dec     [_copy_height]
         jnz    .rgb_line

        emms                            ; leave the FPU usable again
        ret
endp


proc merge_8a _copy_width, _copy_height, _img_total_bpl, _bottom_total_bpl
; Runs a per-pixel blend routine over a rectangle of 16-bit pixels
; (value in the low byte, alpha in the high byte).
; in:  esi = source (top) pixels, edi = destination (bottom) pixels,
;      edx = address of the blend-mode routine called for every pixel
;      _copy_width       - pixels per row
;      _copy_height      - number of rows (the stack argument is counted down)
;      _img_total_bpl    - added to esi after each row
;      _bottom_total_bpl - added to edi after each row
; Both dimensions are assumed to be non-zero; string instructions are assumed
; to run with DF = 0.
.gray_line:
        mov     ecx,    [_copy_width]
.gray_pixel:
        lodsw                           ; ax = top pixel
        mov     bx,     word[edi]       ; bx = bottom pixel, bh = its alpha
        pxor    mm0,    mm0             ; zero register for the unpack steps
        movd    mm3,    eax
        movd    mm2,    ebx

        shr     eax,    8               ; al = top alpha
        cmp     al,     bh              ; al = min(top alpha, bottom alpha)
         jbe    .alpha_ok
        mov     al,     bh
.alpha_ok:
        call    edx                     ; blend mode: mm2 / mm3 -> mm3
        call    blend_gray              ; alpha blend -> ax
        stosw

        dec     ecx
         jnz    .gray_pixel
        add     esi,    [_img_total_bpl]
        add     edi,    [_bottom_total_bpl]
        dec     [_copy_height]
         jnz    .gray_line

        emms                            ; leave the FPU usable again
        ret
endp


proc composite_rgb_00 _copy_width, _copy_height, _bottom_total_bpl, _img_total_bpl
; Plain alpha blend of a rectangle of 32-bit pixels (alpha in byte 3).
; in:  esi = source (top) pixels, edi = destination (bottom) pixels
;      _copy_width       - pixels per row
;      _copy_height      - number of rows (the stack argument is counted down)
;      _bottom_total_bpl - added to edi after each row
;      _img_total_bpl    - added to esi after each row
; Both dimensions are assumed to be non-zero; string instructions are assumed
; to run with DF = 0.
; clobbers: eax, ebx, ecx, esi, edi, mm0-mm3, mm7, flags

.line:  mov     ecx,    [_copy_width]
.pixel: mov     ebx,    [edi]           ; bottom pixel
        lodsd                           ; top pixel
        movd    mm2,    ebx
        movd    mm3,    eax
        
        shr     eax,    24              ; al = top alpha
        shr     ebx,    16              ; bh = bottom alpha

        ; resulting alpha = 255 - high byte of (255 - bottom) * (255 - top)
        xchg    al,     bh
        mov     ah,     bh
        neg     ax
        add     ax,     0xffff          ; ax = not ax
        mul     ah
        neg     ah
        add     ah,     0xff            ; ah = not ah
        xchg    ah,     bh              ; bh = resulting alpha, ah = top alpha

        ; blend factor = 256 * top alpha / resulting alpha, 0xff when they match
        mov     al,     0xff
        cmp     ah,     bh
         je    @f
        not     al
        div     bh
@@:

        mov     ah,     al
        movd    mm1,    eax
        pxor    mm0,    mm0
        punpcklbw   mm1,    mm1
        punpcklbw   mm1,    mm0         ; all four words = factor
        punpcklbw   mm2,    mm0
        punpcklbw   mm3,    mm0

        ; Same rounding step as blend_rgb: factors >= 128 get +1, so a factor
        ; of 255 becomes an exact 256.  Without it an opaque top pixel is
        ; scaled by 255/256 and e.g. white over black comes out as 254.
        movq        mm7,    mm1
        psrlw       mm7,    7
        paddw       mm1,    mm7

        ; result = (bottom * 256 + (top - bottom) * factor) >> 8
        psubsw      mm3,    mm2
        pmullw      mm3,    mm1
        psllw       mm2,    8
        paddw       mm3,    mm2
        pinsrw      mm3,    ebx,    3   ; word 3 = bx; the shift below leaves bh
        psrlw       mm3,    8
        packuswb    mm3,    mm0
        movd        eax,    mm3
        stosd

        dec     ecx
         jnz    .pixel
        add     esi,    [_img_total_bpl]
        add     edi,    [_bottom_total_bpl]
        dec     [_copy_height]
         jnz    .line

        emms                            ; as merge_32 does: leave the FPU usable
        ret
endp


proc composite_gray_00 _copy_width, _copy_height, _bottom_total_bpl, _img_total_bpl
; Plain alpha blend of a rectangle of 16-bit pixels
; (value in the low byte, alpha in the high byte).
; in:  esi = source (top) pixels, edi = destination (bottom) pixels
;      _copy_width       - pixels per row
;      _copy_height      - number of rows (the stack argument is counted down)
;      _bottom_total_bpl - added to edi after each row
;      _img_total_bpl    - added to esi after each row
; Both dimensions are assumed to be non-zero; string instructions are assumed
; to run with DF = 0.
; clobbers: eax, ebx, ecx, esi, edi, mm0-mm3, mm7, flags

.line:  mov     ecx,    [_copy_width]
.pixel: mov     bx,     [edi]           ; bottom pixel, bh = its alpha
        lodsw                           ; top pixel
        movd    mm2,    ebx
        movd    mm3,    eax
        
        shr     eax,    8               ; al = top alpha

        ; resulting alpha = 255 - high byte of (255 - bottom) * (255 - top)
        xchg    al,     bh
        mov     ah,     bh
        neg     ax
        add     ax,     0xffff          ; ax = not ax
        mul     ah
        neg     ah
        add     ah,     0xff            ; ah = not ah
        xchg    ah,     bh              ; bh = resulting alpha, ah = top alpha

        ; blend factor = 256 * top alpha / resulting alpha, 0xff when they match
        mov     al,     0xff
        cmp     ah,     bh
         je    @f
        not     al
        div     bh
@@:

        mov     ah,     al
        movd    mm1,    eax
        pxor    mm0,    mm0
        punpcklbw   mm1,    mm1
        punpcklbw   mm1,    mm0         ; all four words = factor
        punpcklbw   mm2,    mm0
        punpcklbw   mm3,    mm0

        ; Same rounding step as blend_gray: factors >= 128 get +1, so a factor
        ; of 255 becomes an exact 256.  Without it an opaque top pixel is
        ; scaled by 255/256 and e.g. white over black comes out as 254.
        movq        mm7,    mm1
        psrlw       mm7,    7
        paddw       mm1,    mm7

        ; result = (bottom * 256 + (top - bottom) * factor) >> 8
        psubw       mm3,    mm2
        pmullw      mm3,    mm1
        psllw       mm2,    8
        paddw       mm3,    mm2
        pinsrw      mm3,    ebx,    1   ; word 1 = bx; the shift below leaves bh
        psrlw       mm3,    8
        packuswb    mm3,    mm0
        movd        eax,    mm3
        stosw

        dec     ecx
         jnz    .pixel
        add     esi,    [_img_total_bpl]
        add     edi,    [_bottom_total_bpl]
        dec     [_copy_height]
         jnz    .line

        emms                            ; as merge_8a does: leave the FPU usable
        ret
endp


proc composite_indexed_00 _copy_width, _copy_height, _bottom_total_bpl, _img_total_bpl
; Copies a rectangle of 16-bit pixels with a one-bit alpha decision:
; a source pixel whose high byte has bit 7 set is written with that byte
; forced to 0xff, any other source pixel leaves the destination as it was.
; in:  esi = source pixels, edi = destination pixels
;      _copy_width       - pixels per row
;      _copy_height      - number of rows (the stack argument is counted down)
;      _bottom_total_bpl - added to edi after each row
;      _img_total_bpl    - added to esi after each row
; Both dimensions are assumed to be non-zero; string instructions are assumed
; to run with DF = 0.

.line:  mov     ecx,    [_copy_width]
.pixel: mov     bx,     [edi]           ; destination pixel
        lodsw                           ; source pixel

        or      ah,     0x7f            ; SF = bit 7 of the source high byte
        cmovns  ax,     bx              ; bit clear: keep the destination pixel
        stosw

        dec     ecx
         jnz    .pixel
        add     esi,    [_img_total_bpl]
        add     edi,    [_bottom_total_bpl]
        dec     [_copy_height]
         jnz    .line
        ret
endp


proc composite_rgb_01 _copy_width, _copy_height, _bottom_total_bpl, _img_total_bpl
; Stochastic composite of a rectangle of 32-bit pixels (alpha in byte 3):
; for each pixel a pseudo-random byte is compared with the source alpha and
; either the source pixel (made fully opaque) or the destination pixel is
; written.
; in:  esi = source pixels, edi = destination pixels
;      _copy_width       - pixels per row
;      _copy_height      - number of rows (the stack argument is counted down)
;      _bottom_total_bpl - added to edi after each row
;      _img_total_bpl    - added to esi after each row
; The generator is state = state * [random_a] + [random_c], started from
; [random_b]; the final state is not written back.
; General registers are preserved (pushad/popad); mm0, mm1, mm2, mm4 are not.
; Both dimensions are assumed to be non-zero; string instructions are assumed
; to run with DF = 0.
        pushad

        movd    mm4,    [random_b]      ; movd from memory zero-extends to 64 bits
        movd    mm1,    [random_a]
        movd    mm2,    [random_c]
        
.line:  mov     ecx,    [_copy_width]
.pixel: mov     ebx,    [edi]           ; destination pixel
        lodsd                           ; source pixel
        
        ; advance the generator; edx = new 32-bit state
        movq    mm0,    mm4
        pmuludq mm0,    mm1
        paddq   mm0,    mm2
        movd    edx,    mm0
        movd    mm4,    edx
        pxor    mm0,    mm0

        rol     eax,    8               ; al = source alpha
        test    al,     al
         jz     @f                      ; fully transparent: keep destination
        shr     edx,    17              ; dl = bits 17..24 of the state
        cmp     dl,     al
         ja     @f                      ; random byte above alpha: keep destination
        ror     eax,    8
        or      eax,    0xff000000      ; source pixel, fully opaque
         jmp    .done
@@:     mov     eax,    ebx
.done:  stosd
        dec     ecx
         jnz    .pixel
        add     esi,    [_img_total_bpl]
        add     edi,    [_bottom_total_bpl]
        dec     [_copy_height]
         jnz    .line

.quit:  emms                            ; as merge_32 does: leave the FPU usable
        popad
        ret
endp


proc composite_gray_01 _copy_width, _copy_height, _bottom_total_bpl, _img_total_bpl
; Stochastic composite of a rectangle of 16-bit pixels (value in the low
; byte, alpha in the high byte): for each pixel a pseudo-random byte is
; compared with the source alpha and either the source pixel (made fully
; opaque) or the destination pixel is written.
; in:  esi = source pixels, edi = destination pixels
;      _copy_width       - pixels per row
;      _copy_height      - number of rows (the stack argument is counted down)
;      _bottom_total_bpl - added to edi after each row
;      _img_total_bpl    - added to esi after each row
; The generator is state = state * [random_a] + [random_c], started from
; [random_b]; the final state is not written back.
; General registers are preserved (pushad/popad); mm0, mm1, mm2, mm4 are not.
; Both dimensions are assumed to be non-zero; string instructions are assumed
; to run with DF = 0.
        pushad
        
        movd    mm4,    [random_b]      ; movd from memory zero-extends to 64 bits
        movd    mm1,    [random_a]
        movd    mm2,    [random_c]
        
.line:  mov     ecx,    [_copy_width]
.pixel: movzx   ebx,    word [edi]      ; destination pixel; read exactly two bytes
                                        ; so the last pixel does not touch memory
                                        ; past the end of the destination
        lodsw                           ; source pixel
        
        ; advance the generator; edx = new 32-bit state
        movq    mm0,    mm4
        pmuludq mm0,    mm1
        paddq   mm0,    mm2
        movd    edx,    mm0
        movd    mm4,    edx
        pxor    mm0,    mm0

        test    ah,     ah              ; ah = source alpha
         jz     @f                      ; fully transparent: keep destination
        shr     edx,    17              ; dl = bits 17..24 of the state
        cmp     dl,     ah
         ja     @f                      ; random byte above alpha: keep destination
        or      ax,     0xff00          ; source pixel, fully opaque
         jmp    .done
@@:     mov     eax,    ebx
.done:  stosw
        dec     ecx
         jnz    .pixel
        add     esi,    [_img_total_bpl]
        add     edi,    [_bottom_total_bpl]
        dec     [_copy_height]
         jnz    .line

.quit:  emms                            ; as merge_8a does: leave the FPU usable
        popad
        ret
endp


;proc composite_indexed_01 _copy_width, _copy_height, _bottom_total_bpl, _img_total_bpl
;        pushad
;        
;        pxor    mm4,    mm4
;        movd    mm4,    [random_b]
;        movd    mm1,    [random_a]
;        movd    mm2,    [random_c]
;        
;.line:  mov     ecx,    [_copy_width]
;.pixel: mov     ebx,    [edi]
;        lodsw       
;
;        movq    mm0,    mm4
;        pmuludq mm0,    mm1
;        paddq   mm0,    mm2
;        movd    edx,    mm0
;        movd    mm4,    edx
;        pxor    mm0,    mm0
;
;        test    ah,     ah
;         jz     @f
;        shr     edx,    17
;        cmp     dl,     ah
;         ja     @f
;        or      ax,     0xff00
;         jmp    .done
;@@:     mov     eax,    ebx
;.done:  stosw
;        dec     ecx
;         jnz    .pixel
;        add     esi,    [_img_total_bpl]
;        add     edi,    [_bottom_total_bpl]
;        dec     [_copy_height]
;         jnz    .line
;
;.quit:  popad
;        ret
;endp


proc composite_rgb_03               ; Multiply
; in:  mm2 = bottom pixel, mm3 = top pixel (packed bytes), mm0 = zero source
;      for the unpack steps (presumably cleared by the merge_* caller)
; out: mm2 = bottom as words, mm3 = (top * bottom) >> 8 per word

        punpcklbw   mm3,    mm0
        punpcklbw   mm2,    mm0
        pmullw      mm3,    mm2
        psrlw       mm3,    8

        ret
endp


proc composite_rgb_04               ; Screen
; in:  mm2 = bottom pixel, mm3 = top pixel (packed bytes), mm0 = zero source
;      for the unpack steps (presumably cleared by the merge_* caller)
; out: mm2 = bottom as words,
;      mm3 = 255 - (((255 - bottom) * (255 - top)) >> 8) per word
; clobbers: mm4, mm5

        punpcklbw   mm3,    mm0
        punpcklbw   mm2,    mm0
        movq        mm5,    [mmx_00ff]
        movq        mm4,    mm5
        psubw       mm5,    mm3         ; 255 - top
        movq        mm3,    mm4         ; mm3 = 255 in every word
        psubw       mm4,    mm2         ; 255 - bottom
        pmullw      mm4,    mm5
        psrlw       mm4,    8
        psubw       mm3,    mm4

        ret
endp


proc composite_rgb_05               ; Overlay
; in:  mm2 = bottom pixel, mm3 = top pixel (packed bytes), mm0 = zero source
;      for the unpack steps (presumably cleared by the merge_* caller)
; out: mm2 = bottom as words,
;      mm3 = (bottom * (bottom + ((top * (255 - bottom)) >> 7))) >> 8 per word
; clobbers: mm4

        punpcklbw   mm2,    mm0
        movq        mm4,    [mmx_00ff]
        punpcklbw   mm3,    mm0
        psubw       mm4,    mm2         ; 255 - bottom
        pmullw      mm3,    mm4
        psrlw       mm3,    7           ; 2 * top * (255 - bottom) / 256
        paddw       mm3,    mm2
        pmullw      mm3,    mm2
        psrlw       mm3,    8

        ret
endp


proc composite_rgb_06               ; Difference
; in:  mm2 = bottom pixel, mm3 = top pixel (packed bytes), mm0 = zero source
;      for the unpack steps (presumably cleared by the merge_* caller)
; out: mm2 = bottom as words, mm3 = |top - bottom| per channel, as words
; clobbers: mm4

        movq        mm4,    mm2
        pminub      mm4,    mm3         ; smaller of the two, per byte
        pmaxub      mm3,    mm2         ; larger of the two, per byte
        psubusb     mm3,    mm4
        punpcklbw   mm3,    mm0
        punpcklbw   mm2,    mm0

        ret
endp


proc composite_rgb_07               ; Addition
; in:  mm2 = bottom pixel, mm3 = top pixel (packed bytes), mm0 = zero source
;      for the unpack steps (presumably cleared by the merge_* caller)
; out: mm2 = bottom as words, mm3 = min(top + bottom, 255) per channel, as words

        paddusb     mm3,    mm2         ; unsigned saturating add
        punpcklbw   mm3,    mm0
        punpcklbw   mm2,    mm0

        ret
endp


proc composite_rgb_08               ; Subtract
; in:  mm2 = bottom pixel, mm3 = top pixel (packed bytes), mm0 = zero source
;      for the unpack steps (presumably cleared by the merge_* caller)
; out: mm2 = bottom as words, mm3 = max(bottom - top, 0) per channel, as words
; clobbers: mm4 (left holding the packed result)

        movq        mm4,    mm2
        punpcklbw   mm2,    mm0
        psubusb     mm4,    mm3         ; unsigned saturating bottom - top
        movq        mm3,    mm4
        punpcklbw   mm3,    mm0

        ret
endp


proc composite_rgb_09               ; Darken Only
; in:  mm2 = bottom pixel, mm3 = top pixel (packed bytes), mm0 = zero source
;      for the unpack steps (presumably cleared by the merge_* caller)
; out: mm2 = bottom as words, mm3 = min(top, bottom) per channel, as words

        pminub      mm3,    mm2
        punpcklbw   mm3,    mm0
        punpcklbw   mm2,    mm0

        ret
endp


proc composite_rgb_10               ; Lighten Only
; in:  mm2 = bottom pixel, mm3 = top pixel (packed bytes), mm0 = zero source
;      for the unpack steps (presumably cleared by the merge_* caller)
; out: mm2 = bottom as words, mm3 = max(top, bottom) per channel, as words

        pmaxub      mm3,    mm2
        punpcklbw   mm3,    mm0
        punpcklbw   mm2,    mm0

        ret
endp


proc composite_rgb_11               ; Hue (H of HSV)
; in:  mm2 = bottom pixel, mm3 = top pixel (packed bytes), mm0 = zero source
;      for the unpack steps (presumably cleared by the merge_* caller)
; out: mm2 = bottom as words, mm3 = result as words; eax..edx preserved
; pixel_rgb2hsv / pixel_hsv2rgb are used as "convert the pixel in eax in
; place"; ecx and edx are saved presumably because they clobber them.
; NOTE(review): judging by which byte each HSV mode keeps, the converted
; pixel appears to hold V in byte 0, S in byte 1 and H in byte 2 - confirm
; against pixel_rgb2hsv.
        push    eax ebx ecx edx

        movd    eax,    mm3
        movd    ebx,    mm2
        call    pixel_rgb2hsv           ; top pixel
        xchg    eax,    ebx
        call    pixel_rgb2hsv           ; bottom pixel
        xchg    eax,    ebx             ; eax = top, ebx = bottom (converted)

        test    ah,     ah              ; byte 1 of the top pixel
        mov     ax,     bx              ; bytes 0..1 always come from the bottom
                                        ; (mov keeps the flags of the test)
         jnz    .convert
        ; byte 1 of the top pixel was zero: take byte 2 from the bottom as well,
        ; i.e. eax = ebx except for byte 3
        xor     eax,    ebx
        and     eax,    0xff000000
        xor     eax,    ebx
.convert:
        call    pixel_hsv2rgb

        movd    mm3,    eax
        punpcklbw   mm3,    mm0
        punpcklbw   mm2,    mm0

        pop     edx ecx ebx eax
        ret
endp


proc composite_rgb_12               ; Saturation (S of HSV)
; in:  mm2 = bottom pixel, mm3 = top pixel (packed bytes), mm0 = zero source
;      for the unpack steps (presumably cleared by the merge_* caller)
; out: mm2 = bottom as words, mm3 = result as words; eax..edx preserved
; pixel_rgb2hsv / pixel_hsv2rgb are used as "convert the pixel in eax in
; place"; ecx and edx are saved presumably because they clobber them.
        push    eax ebx ecx edx

        movd    eax,    mm3
        movd    ebx,    mm2
        call    pixel_rgb2hsv           ; top pixel
        xchg    eax,    ebx
        call    pixel_rgb2hsv           ; bottom pixel
        xchg    eax,    ebx             ; eax = top, ebx = bottom (converted)

        ; keep bytes 1 and 3 of the top pixel, take bytes 0 and 2 from the bottom
        xor     eax,    ebx
        and     eax,    0xff00ff00
        xor     eax,    ebx

        call    pixel_hsv2rgb

        movd    mm3,    eax
        punpcklbw   mm3,    mm0
        punpcklbw   mm2,    mm0

        pop     edx ecx ebx eax
        ret
endp


proc composite_rgb_13               ; Color (H and S of HSL)
; in:  mm2 = bottom pixel, mm3 = top pixel (packed bytes), mm0 = zero source
;      for the unpack steps (presumably cleared by the merge_* caller)
; out: mm2 = bottom as words, mm3 = result as words; eax..edx preserved
; pixel_rgb2hsl / pixel_hsl2rgb are used as "convert the pixel in eax in
; place"; ecx and edx are saved presumably because they clobber them.
        push    eax ebx ecx edx

        movd    eax,    mm3
        movd    ebx,    mm2
        call    pixel_rgb2hsl           ; top pixel
        xchg    eax,    ebx
        call    pixel_rgb2hsl           ; bottom pixel
        xchg    eax,    ebx             ; eax = top, ebx = bottom (converted)

        mov     al,     bl              ; byte 0 comes from the bottom, the rest
                                        ; stays as in the top pixel
        call    pixel_hsl2rgb

        movd    mm3,    eax
        punpcklbw   mm3,    mm0
        punpcklbw   mm2,    mm0

        pop     edx ecx ebx eax
        ret
endp


proc composite_rgb_14               ; Value (V of HSV)
; in:  mm2 = bottom pixel, mm3 = top pixel (packed bytes), mm0 = zero source
;      for the unpack steps (presumably cleared by the merge_* caller)
; out: mm2 = bottom as words, mm3 = result as words; eax..edx preserved
; pixel_rgb2hsv / pixel_hsv2rgb are used as "convert the pixel in eax in
; place"; ecx and edx are saved presumably because they clobber them.
        push    eax ebx ecx edx

        movd    eax,    mm3
        movd    ebx,    mm2
        call    pixel_rgb2hsv           ; top pixel
        xchg    eax,    ebx
        call    pixel_rgb2hsv           ; bottom pixel
        xchg    eax,    ebx             ; eax = top, ebx = bottom (converted)

        ; keep bytes 0 and 3 of the top pixel, take bytes 1 and 2 from the bottom
        xor     eax,    ebx
        and     eax,    0xff0000ff
        xor     eax,    ebx

        call    pixel_hsv2rgb

        movd    mm3,    eax
        punpcklbw   mm3,    mm0
        punpcklbw   mm2,    mm0

        pop     edx ecx ebx eax
        ret
endp


proc composite_rgb_15               ; Divide
; in:  mm2 = bottom pixel, mm3 = top pixel (packed bytes), mm0 = zero source
;      for the unpack steps (presumably cleared by the merge_* caller)
; out: mm2 = bottom as words, mm3 = result as words; eax, ebx, ecx preserved
; Per colour byte: result = min(bottom * 256 / top, 255); with top = 0 the
; result is 0 for bottom = 0 and 255 otherwise.  Byte 3 of the result is a
; copy of byte 0, not an alpha value.
        push    eax ebx ecx

        movd    eax,    mm2             ; eax = bottom (dividend bytes)
        movd    ebx,    mm3             ; ebx = top (divisor bytes)
        rol     eax,    8
        rol     ebx,    8

        mov     ecx,    3               ; bytes 2, 1, 0 in that order

.color: rol     eax,    8
        rol     ebx,    8
        shl     ax,     8               ; ax = bottom byte * 256
        test    bl,     bl
         jz     .by_zero
        cmp     ah,     bl
         jae    .saturate               ; quotient would not fit into a byte
        div     bl
         jmp    .done
.by_zero:
        test    ah,     ah
         jz     .done                   ; 0 / 0 -> 0, al is already zero
.saturate:
        mov     al,     0xff
.done:  mov     ah,     al              ; ah travels on with the next rotate
        dec     ecx
         jnz    .color

        ror     eax,    8
        movd    mm3,    eax

        punpcklbw   mm3,    mm0
        punpcklbw   mm2,    mm0

        pop     ecx ebx eax
        ret
endp


proc composite_rgb_16               ; Dodge
; in:  mm2 = bottom pixel, mm3 = top pixel (packed bytes), mm0 = zero source
;      for the unpack steps (presumably cleared by the merge_* caller)
; out: mm2 = bottom as words, mm3 = result as words; eax, ebx, ecx preserved
; Per colour byte: result = min(bottom * 256 / (255 - top), 255); with
; top = 255 the result is 0 for bottom = 0 and 255 otherwise.  Byte 3 of the
; result is a copy of byte 0, not an alpha value.
        push    eax ebx ecx

        movd    eax,    mm2             ; eax = bottom (dividend bytes)
        movd    ebx,    mm3             ; ebx = top (divisor bytes)
        rol     eax,    8
        rol     ebx,    8

        mov     ecx,    3               ; bytes 2, 1, 0 in that order

.color: rol     eax,    8
        rol     ebx,    8
        shl     ax,     8               ; ax = bottom byte * 256
        not     bl                      ; bl = 255 - top byte
        test    bl,     bl
         jz     .by_zero
        cmp     ah,     bl
         jae    .saturate               ; quotient would not fit into a byte
        div     bl
         jmp    .done
.by_zero:
        test    ah,     ah
         jz     .done                   ; 0 / 0 -> 0, al is already zero
.saturate:
        mov     al,     0xff
.done:  mov     ah,     al              ; ah travels on with the next rotate
        dec     ecx
         jnz    .color

        ror     eax,    8
        movd    mm3,    eax

        punpcklbw   mm3,    mm0
        punpcklbw   mm2,    mm0

        pop     ecx ebx eax
        ret
endp


proc composite_rgb_17               ; Burn
; in:  mm2 = bottom pixel, mm3 = top pixel (packed bytes), mm0 = zero source
;      for the unpack steps (presumably cleared by the merge_* caller)
; out: mm2 = bottom as words, mm3 = result as words; eax, ebx, ecx preserved
; Per colour byte: result = 255 - min((255 - bottom) * 256 / top, 255); with
; top = 0 the result is 255 for bottom = 255 and 0 otherwise.  Byte 3 of the
; result is not an alpha value.
        push    eax ebx ecx

        movd    eax,    mm2             ; eax = bottom (dividend bytes)
        movd    ebx,    mm3             ; ebx = top (divisor bytes)
        rol     eax,    8
        rol     ebx,    8

        mov     ecx,    3               ; bytes 2, 1, 0 in that order

.color: rol     eax,    8
        rol     ebx,    8
        shl     ax,     8
        not     ah                      ; ax = (255 - bottom byte) * 256
        test    bl,     bl
         jz     .by_zero
        cmp     ah,     bl
         jae    .saturate               ; quotient would not fit into a byte
        div     bl
         jmp    .done
.by_zero:
        test    ah,     ah
         jz     .done                   ; 0 / 0 -> 0, al is already zero
.saturate:
        mov     al,     0xff
.done:  mov     ah,     al
        not     ah                      ; ah = 255 - quotient, travels on with
                                        ; the next rotate
        dec     ecx
         jnz    .color

        ror     eax,    8
        movd    mm3,    eax

        punpcklbw   mm3,    mm0
        punpcklbw   mm2,    mm0

        pop     ecx ebx eax
        ret
endp


proc composite_rgb_18               ; Hard Light
; in:  mm2 = bottom pixel, mm3 = top pixel (packed bytes), mm0 = zero source
;      for the unpack steps (presumably cleared by the merge_* caller)
; out: mm2 = bottom as words, mm3 = result as words; eax, ebx, ecx preserved
; Per colour byte:
;   top <= 127: result = (2 * top * bottom) >> 8
;   top >  127: result = 255 - ((2 * (255 - top) * (255 - bottom)) >> 8)
; Byte 3 of the result is not an alpha value.
        push eax ebx ecx

        movd    eax,    mm3             ; eax = top
        movd    ebx,    mm2             ; ebx = bottom
        rol     eax,    8
        rol     ebx,    8

        mov     ecx,    3               ; bytes 2, 1, 0 in that order

.color: rol     eax,    8
        rol     ebx,    8
        test    al,     al
         js     .bright                 ; top byte > 127
        mul     bl
        add     ax,     ax              ; ah = (2 * top * bottom) >> 8
         jmp    .done
.bright:
        mov     ah,     bl
        not     ah                      ; ah = 255 - bottom
        not     al                      ; al = 255 - top
        mul     ah
        add     ax,     ax
        not     ah                      ; ah = 255 - high byte
.done:  dec     ecx
         jnz    .color

        ror     eax,    8
        movd    mm3,    eax

        punpcklbw   mm3,    mm0
        punpcklbw   mm2,    mm0

        pop     ecx ebx eax
        ret
endp


proc composite_rgb_20               ; Grain Extract
; in:  mm2 = bottom pixel, mm3 = top pixel (packed bytes), mm0 = zero source
;      for the unpack steps (presumably cleared by the merge_* caller)
; out: mm2 = bottom as words,
;      mm3 = bottom - top + 128 clamped to 0..255 per word
; clobbers: mm4, mm5 (mm5 is left equal to mm3)

        punpcklbw   mm3,    mm0
        punpcklbw   mm2,    mm0
        movq        mm4,    [mmx_0080]
        psubw       mm3,    mm4         ; top - 128
        movq        mm5,    mm2
        psubsw      mm5,    mm3         ; bottom - (top - 128)
        packuswb    mm5,    mm0         ; clamp to 0..255 ...
        punpcklbw   mm5,    mm0         ; ... and widen to words again
        movq        mm3,    mm5

        ret
endp


proc composite_rgb_21               ; Grain Merge
; in:  mm2 = bottom pixel, mm3 = top pixel (packed bytes), mm0 = zero source
;      for the unpack steps (presumably cleared by the merge_* caller)
; out: mm2 = bottom as words,
;      mm3 = top + bottom - 128 clamped to 0..255 per word
; clobbers: mm4, mm5

        punpcklbw   mm3,    mm0
        punpcklbw   mm2,    mm0
        movq        mm4,    [mmx_0080]
        movq        mm5,    mm2
        psubw       mm5,    mm4         ; bottom - 128
        paddsw      mm3,    mm5         ; top + (bottom - 128)
        packuswb    mm3,    mm0         ; clamp to 0..255 ...
        punpcklbw   mm3,    mm0         ; ... and widen to words again

        ret
endp


; Packed-word constants for the MMX blend modes (one value repeated in all
; four 16-bit words).
mmx_0080        dq  0x0080008000800080      ; 128: bias used by Grain Extract / Grain Merge
mmx_00ff        dq  0x00ff00ff00ff00ff      ; 255: complement base used by Screen / Overlay
mmx_0100        dq  0x0100010001000100      ; 256: not referenced in the visible part of the file