;SIZE_X equ 350
;SIZE_Y equ 350
;ROUND equ 8
;TEX_X equ 512
;TEX_Y equ 512
;TEXTURE_SIZE EQU (512*512)-1
;TEX_SHIFT EQU 9

;CATMULL_SHIFT equ 8
;TEXTURE_SIZE EQU (TEX_X * TEX_Y)-1
;Ext = SSE
;SSE = 3
;MMX = 1
;NON = 0
;use32
;------- Big thanks to Majuma (www.majuma.xt.pl) for absolutely great---
;------- DOS 13h mode demos --------------------------------------------
;------- Procedure draws a triangle with two overlapped textures, using -
;------- the Catmull Z-buffer algorithm (Z coordinate interpolation). ----
;------- Each texture pixel is computed as: col1*col2/256 ---------------
two_tex_triangle_z:
;------------------------------------------------------------------------
; Draws one triangle covered by two overlapped textures (b. and e.) with
; a per-pixel Z test.  The three vertices are sorted by ascending y, the
; per-scanline deltas of x, both texture coordinate pairs and z are
; computed for the edges 1-2, 1-3 and 2-3, and every scanline is passed
; to two_tex_line_z through .call_line.
;
; No register is preserved: ebp is overwritten (not saved) and used as
; the frame base.  The 34 bytes of stack parameters are removed on
; return (ret 34).
;------------------------------------------------------------------------
;------------------in - eax - x1 shl 16 + y1 -----------
;---------------------- ebx - x2 shl 16 + y2 -----------
;---------------------- ecx - x3 shl 16 + y3 -----------
;---------------------- edx - pointer to b. texture-----
;---------------------- esi - pointer to e. texture-----
;---------------------- edi - pointer to screen buffer--
;---------------------- stack : b. tex coordinates------
;----------------------         e. tex coordinates------
;----------------------         Z position coordinates--
;----------------------         pointer to Z buffer-----
;-- Z-buffer - filled with coordinates as dword --------
;-- (Z coor. as word) shl CATMULL_SHIFT ----------------
;
; Stack parameters (ebp+4 is the first byte above the return address):
.b_x1	equ ebp+4   ; procedure don't save registers !!!
.b_y1	equ ebp+6   ; each coordinate as word
.b_x2	equ ebp+8
.b_y2	equ ebp+10	 ; b - first texture
.b_x3	equ ebp+12
.b_y3	equ ebp+14	 ; e - second texture
.e_x1	equ ebp+16
.e_y1	equ ebp+18
.e_x2	equ ebp+20
.e_y2	equ ebp+22
.e_x3	equ ebp+24
.e_y3	equ ebp+26
.z1	equ word[ebp+28]
.z2	equ word[ebp+30]
.z3	equ word[ebp+32]
.z_buff equ dword[ebp+34]	; pointer to Z-buffer

; Locals.  They are NOT allocated by a single "sub esp,N": the frame is
; built by the pushes / "sub esp" in the code below, so the offsets here
; depend on the exact order of those instructions.
.t_bmap equ dword[ebp-4]	; pointer to b. texture
.t_emap equ dword[ebp-8]	; pointer to e. texture
.x1	equ word[ebp-10]	; high word of the pushed eax
.y1	equ word[ebp-12]	; low  word of the pushed eax
.x2	equ word[ebp-14]
.y2	equ word[ebp-16]
.x3	equ word[ebp-18]
.y3	equ word[ebp-20]

.dx12  equ dword[ebp-24]	; deltas along edge 1-2 (per scanline)
.dbx12 equ dword[ebp-28]
.dby12 equ dword[ebp-32]
.dby12q equ	[ebp-32]	; qword view: dby12 (lo), dbx12 (hi)
.dex12 equ dword[ebp-36]
.dey12 equ dword[ebp-40]
.dey12q equ	[ebp-40]	; qword view: dey12 (lo), dex12 (hi)
.dz12  equ dword[ebp-44]

.dx13  equ dword[ebp-48]	; deltas along edge 1-3
.dbx13 equ dword[ebp-52]
.dby13 equ dword[ebp-56]
.dby13q equ	[ebp-56]
.dex13 equ dword[ebp-60]
.dey13 equ dword[ebp-64]
.dey13q equ	[ebp-64]
.dz13  equ dword[ebp-68]

.dx23  equ dword[ebp-72]	; deltas along edge 2-3
.dbx23 equ dword[ebp-76]
.dby23 equ dword[ebp-80]
.dby23q equ	[ebp-80]
.dex23 equ dword[ebp-84]
.dey23 equ dword[ebp-88]
.dey23q equ	[ebp-88]
.dz23  equ dword[ebp-92]

.cx1   equ dword[ebp-96]   ; current variables
.cx2   equ dword[ebp-100]
.cbx1  equ dword[ebp-104]
.cby1  equ [ebp-108]
.cex1  equ dword[ebp-112]
.cey1  equ [ebp-116]
.cbx2  equ dword[ebp-120]
.cby2  equ [ebp-124]
.cex2  equ dword[ebp-128]
.cey2  equ [ebp-132]

.cz1   equ dword[ebp-136]
.cz2   equ dword[ebp-140]

    if Ext >= MMX
       emms
    else
       cld
    end if
       mov     ebp,esp
       push    edx esi	     ; .t_bmap = edx, .t_emap = esi
;       push    esi        ; store e. map
     ; sub     esp,120
 .sort3:		  ; sort triangle coordinates by ascending y
       cmp     ax,bx	     ; ax = y1, bx = y2
       jle     .sort1
       xchg    eax,ebx	     ; swap vertex 1 <-> vertex 2 ...
       mov     edx,dword[.b_x1]     ; ... with their b. texture coords
       xchg    edx,dword[.b_x2]
       mov     dword[.b_x1],edx
       mov     edx,dword[.e_x1]     ; ... e. texture coords
       xchg    edx,dword[.e_x2]
       mov     dword[.e_x1],edx
       mov     dx,.z1		     ; ... and z
       xchg    dx,.z2
       mov     .z1,dx
 .sort1:
       cmp	bx,cx	     ; bx = y2, cx = y3
       jle	.sort2
       xchg	ebx,ecx      ; swap vertex 2 <-> vertex 3 (same scheme)
       mov	edx,dword[.b_x2]
       xchg	edx,dword[.b_x3]
       mov	dword[.b_x2],edx
       mov	edx,dword[.e_x2]
       xchg	edx,dword[.e_x3]
       mov	dword[.e_x2],edx
       mov     dx,.z2
       xchg    dx,.z3
       mov     .z2,dx
       jmp	.sort3	     ; vertex 2 changed - recheck 1 against 2
 .sort2:
       push	eax ebx ecx    ; store triangle coords in variables
;       push     ebx
;       push     ecx

	 mov	  edx,80008000h  ; eax,ebx,ecx are ANDd together into edx which means that
	 and	  edx,ebx	 ; if *all* of them are negative a sign flag is raised
	 and	  edx,ecx
	 and	  edx,eax
	 test	  edx,80008000h  ; Check both X&Y at once
	 jne	  .loop23_done
    ;   mov     edx,eax         ; eax,ebx,ecx are ORd together into edx which means that
    ;   or      edx,ebx         ; if any *one* of them is negative a sign flag is raised
    ;   or      edx,ecx
    ;   test    edx,80000000h   ; Check only X
    ;   jne     .loop23_done

    ;   cmp     .x1,SIZE_X    ; {
    ;   jg      .loop23_done
    ;   cmp     .x2,SIZE_X     ; This can be optimized with effort
    ;   jg      .loop23_done
    ;   cmp     .x3,SIZE_X
    ;   jg      .loop23_done    ; {


       mov	bx,.y2	     ; calc delta 12
       sub	bx,.y1
       jnz	.bt_dx12_make
       mov	ecx,6	     ; y2 = y1: edge 1-2 is horizontal, so push six
       xor	edx,edx      ; zero dwords (.dx12 .. .dz12)
     @@:
       push	edx   ;dword 0
       loop	@b
       jmp	.bt_dx12_done
 .bt_dx12_make:
       mov	ax,.x2
       sub	ax,.x1
       cwde
       movsx	ebx,bx	     ; ebx = y2 - y1, divisor for all deltas below
       shl	eax,ROUND
       cdq
       idiv	ebx
;      mov      .dx12,eax
       push	 eax	     ; .dx12

if Ext=SSE

       sub	 esp,16 	     ; reserve .dbx12 .. .dey12
       cvtsi2ss  xmm3,ebx	     ;rcps
    ;   mov       eax,255
       cvtsi2ss  xmm4,[i255d] ;eax   ; i255d is defined outside this routine
       divss	 xmm3,xmm4
       rcpss	 xmm3,xmm3	     ; xmm3 ~ [i255d] / (y2 - y1)
    ;   mulss     xmm3,xmm4
       shufps	 xmm3,xmm3,0	     ; broadcast to all four lanes

       movd	 mm0,[.b_x1]
       movd	 mm1,[.b_x2]
       movd	 mm2,[.e_x1]
       movd	 mm3,[.e_x2]
     ;  psubsw    mm3,mm2
     ;  psubsw    mm1,mm0
       pxor	 mm4,mm4
       punpcklwd mm0,mm4	     ; zero-extend the coordinate words
       punpcklwd mm1,mm4
       punpcklwd mm2,mm4
       punpcklwd mm3,mm4
   ;    pslld     mm0,ROUND
   ;    pslld     mm1,ROUND
   ;    pslld     mm2,ROUND
   ;    pslld     mm3,ROUND
       cvtpi2ps  xmm0,mm0
       movlhps	 xmm0,xmm0
       cvtpi2ps  xmm0,mm2	     ; xmm0 = ex1, ey1, bx1, by1 (lo -> hi)
       cvtpi2ps  xmm1,mm1
       movlhps	 xmm1,xmm1
       cvtpi2ps  xmm1,mm3	     ; xmm1 = ex2, ey2, bx2, by2 (lo -> hi)
       subps	 xmm1,xmm0

     ;  pxor      mm4,mm4
     ;  movq      mm5,mm1
     ;  movq      mm6,mm1
     ;  pcmpeqb   mm5,mm4
;       psubd     mm1,mm0
;       psubd     mm3,mm2

    ;   movq      mm0,[.b_x1]      ; bx1  by1   bx2    by2
    ;   movq      mm1,[.e_x1]      ; ex1  ey1   ex2    ey2
    ;   pxor
    ;   punpcklhd mm0,mm1   ; lwd  ;
    ;   psubw     mm1,mm0   ; mm1, mm0
    ;   pxor      mm2,mm2
  ;     pmovmaskb eax,mm1
  ;     and       eax,10101010b
    ;   pcmpgtw   mm2,mm1
    ;   punpcklwd mm1,mm2
 ;      psllw     mm0,ROUND
 ;      psllw     mm1,ROUND
 ;      movq      mm2,mm0
 ;      psrlq     mm0,32

;       cvtpi2ps  xmm0,mm1
;       movlhps   xmm0,xmm0
;       cvtpi2ps  xmm0,mm3
  ;     divps     xmm1,xmm3
       mulps	 xmm1,xmm3
       shufps	 xmm1,xmm1,10110001b ; swap x/y inside each pair
       cvtps2pi  mm0,xmm1	   ; mm0 -> 2 delta dwords
       movhlps	 xmm1,xmm1
       cvtps2pi  mm1,xmm1
       movq	 .dey12q,mm0
       movq	 .dby12q,mm1

;       movd      .dex12,mm0
;       psrlq     mm0,32
;       movd      .dey12,mm0
;       movhlps   xmm1,xmm1
;       cvtps2pi  mm0,xmm1
;       movd      .dbx12,mm0
;       psrlq     mm0,32
;       movd      .dby12,mm0

else
       mov	ax,word[.b_x2]
       sub	ax,word[.b_x1]
       cwde
       shl	eax,ROUND
       cdq
       idiv	ebx
 ;     mov      .dbx12,eax
       push	 eax	     ; .dbx12

       mov	ax,word[.b_y2]
       sub	ax,word[.b_y1]
       cwde
       shl	eax,ROUND
       cdq
       idiv	ebx
 ;     mov      .dby12,eax
       push	 eax	     ; .dby12

 ;     mov       eax,.dbx12
 ;     mov       ebx,.dby12
 ;     int3

       mov	ax,word[.e_x2]
       sub	ax,word[.e_x1]
       cwde
       shl	eax,ROUND
       cdq
       idiv	ebx
 ;      mov      .dex12,eax
       push	 eax	     ; .dex12

       mov	ax,word[.e_y2]
       sub	ax,word[.e_y1]
       cwde
       shl	eax,ROUND
       cdq
       idiv	ebx
 ;      mov      .dey12,eax
       push	 eax	     ; .dey12

end if
	mov	ax,.z2
	sub	ax,.z1
	cwde
	shl	eax,CATMULL_SHIFT
	cdq
	idiv	ebx
	push	eax	     ; .dz12
   .bt_dx12_done:

       mov	bx,.y3	     ; calc delta13
       sub	bx,.y1
       jnz	.bt_dx13_make
       mov	ecx,6	     ; y3 = y1: push six zero dwords (.dx13 .. .dz13)
       xor	edx,edx
     @@:
       push	edx   ;dword 0
       loop	@b
       jmp	.bt_dx13_done
 .bt_dx13_make:
       mov	ax,.x3
       sub	ax,.x1
       cwde
       movsx	ebx,bx	     ; ebx = y3 - y1
       shl	eax,ROUND
       cdq
       idiv	ebx
 ;      mov      .dx13,eax
       push	 eax	     ; .dx13

if Ext=SSE

       cvtsi2ss  xmm3,ebx
  ;     mov       eax,255
       cvtsi2ss  xmm4,[i255d]
       divss	 xmm3,xmm4
       rcpss	 xmm3,xmm3	     ; xmm3 ~ [i255d] / (y3 - y1)
;       mulss     xmm3,xmm4
       shufps	 xmm3,xmm3,0
       sub	 esp,16 	     ; reserve .dbx13 .. .dey13

       movd	 mm0,[.b_x1]
       movd	 mm1,[.b_x3]
       movd	 mm2,[.e_x1]
       movd	 mm3,[.e_x3]

       pxor	 mm4,mm4
       punpcklwd mm0,mm4
       punpcklwd mm1,mm4
       punpcklwd mm2,mm4
       punpcklwd mm3,mm4

       cvtpi2ps  xmm0,mm0
       movlhps	 xmm0,xmm0
       cvtpi2ps  xmm0,mm2
       cvtpi2ps  xmm1,mm1
       movlhps	 xmm1,xmm1
       cvtpi2ps  xmm1,mm3
       subps	 xmm1,xmm0

   ;    divps     xmm1,xmm3
       mulps	 xmm1,xmm3
       shufps	 xmm1,xmm1,10110001b
       cvtps2pi  mm0,xmm1	   ; mm0 -> 2 delta dwords
       movhlps	 xmm1,xmm1
       cvtps2pi  mm1,xmm1
       movq	 .dey13q,mm0
       movq	 .dby13q,mm1

else

       mov	ax,word[.b_x3]
       sub	ax,word[.b_x1]
       cwde
       shl	eax,ROUND
       cdq
       idiv	ebx
 ;      mov      .dbx13,eax
       push	 eax	     ; .dbx13

       mov	ax,word[.b_y3]
       sub	ax,word[.b_y1]
       cwde
       shl	eax,ROUND
       cdq
       idiv	ebx
 ;      mov      .dby13,eax
       push	 eax	     ; .dby13

       mov	ax,word[.e_x3]
       sub	ax,word[.e_x1]
       cwde
       shl	eax,ROUND
       cdq
       idiv	ebx
 ;      mov      .dex13,eax
       push	 eax	     ; .dex13

       mov	ax,word[.e_y3]
       sub	ax,word[.e_y1]
       cwde
       shl	eax,ROUND
       cdq
       idiv	ebx
 ;      mov      .dey13,eax
       push	 eax	     ; .dey13

end if

       mov     ax,.z3
       sub     ax,.z1
       cwde
       shl     eax,CATMULL_SHIFT
       cdq
       idiv    ebx
  ;    mov    .dz13,eax
       push    eax	     ; .dz13
   .bt_dx13_done:

       mov	bx,.y3	     ; calc delta23
       sub	bx,.y2
       jnz	.bt_dx23_make
       mov	ecx,6	     ; y3 = y2: push six zero dwords (.dx23 .. .dz23)
       xor	edx,edx
     @@:
       push	edx   ;dword 0
       loop	@b
       jmp	.bt_dx23_done
 .bt_dx23_make:
       mov	ax,.x3
       sub	ax,.x2
       cwde
       movsx	ebx,bx	     ; ebx = y3 - y2
       shl	eax,ROUND
       cdq
       idiv	ebx
 ;      mov      .dx23,eax
       push	 eax	     ; .dx23

if Ext=SSE

       cvtsi2ss  xmm3,ebx
     ;  mov       eax,255
       cvtsi2ss  xmm4,[i255d] ;eax
       divss	 xmm3,xmm4	     ; xmm3 = (y3 - y2) / [i255d]; this edge
       shufps	 xmm3,xmm3,0	     ; uses an exact divps instead of rcpss
       sub	 esp,16 	     ; reserve .dbx23 .. .dey23

       movd	 mm0,[.b_x2]
       movd	 mm1,[.b_x3]
       movd	 mm2,[.e_x2]
       movd	 mm3,[.e_x3]

       pxor	 mm4,mm4
       punpcklwd mm0,mm4
       punpcklwd mm1,mm4
       punpcklwd mm2,mm4
       punpcklwd mm3,mm4

       cvtpi2ps  xmm0,mm0
       movlhps	 xmm0,xmm0
       cvtpi2ps  xmm0,mm2
       cvtpi2ps  xmm1,mm1
       movlhps	 xmm1,xmm1
       cvtpi2ps  xmm1,mm3
       subps	 xmm1,xmm0

       divps	 xmm1,xmm3
       shufps	 xmm1,xmm1,10110001b
       cvtps2pi  mm0,xmm1	   ; mm0 -> 2 delta dwords
       movhlps	 xmm1,xmm1
       cvtps2pi  mm1,xmm1
       movq	 .dey23q,mm0
       movq	 .dby23q,mm1

else

       mov	ax,word[.b_x3]
       sub	ax,word[.b_x2]
       cwde
       shl	eax,ROUND
       cdq
       idiv	ebx
 ;      mov      .dbx23,eax
       push	 eax	     ; .dbx23

       mov	ax,word[.b_y3]
       sub	ax,word[.b_y2]
       cwde
       shl	eax,ROUND
       cdq
       idiv	ebx
 ;      mov      .dby23,eax
       push	 eax	     ; .dby23

       mov	ax,word[.e_x3]
       sub	ax,word[.e_x2]
       cwde
       shl	eax,ROUND
       cdq
       idiv	ebx
 ;      mov      .dex23,eax
       push	 eax	     ; .dex23

       mov	ax,word[.e_y3]
       sub	ax,word[.e_y2]
       cwde
       shl	eax,ROUND
       cdq
       idiv	ebx
 ;      mov      .dey23,eax
       push	 eax	     ; .dey23
end if
       mov     ax,.z3
       sub     ax,.z2
       cwde
       shl     eax,CATMULL_SHIFT
       cdq
       idiv    ebx
      ; mov     .dz23,eax
       push    eax	     ; .dz23
      ;  sub     esp,40
   .bt_dx23_done:
       movsx	eax,.x1      ; both edges start at vertex 1
       shl	eax,ROUND
     ;  mov      .cx1,eax
     ;  mov      .cx2,eax
       push	eax eax      ; .cx1, .cx2
     ; push     eax

       ; Reserve the eight slots .cbx1 .. .cey2 BEFORE storing into them,
       ; so that nothing is ever written below esp (the other reserved
       ; areas in this routine are also allocated before use).
       sub	esp,32

       movsx	eax,word[.b_x1]
       shl	eax,ROUND
       mov	.cbx1,eax
       mov	.cbx2,eax
     ;  push     eax eax
     ; push     eax

       movsx	eax,word[.b_y1]
       shl	eax,ROUND
       mov	.cby1,eax
       mov	.cby2,eax
     ;  push     eax eax
     ; push     eax

       movsx	eax,word[.e_x1]
       shl	eax,ROUND
       mov	.cex1,eax
       mov	.cex2,eax
      ; push     eax eax
      ;push     eax

       movsx	eax,word[.e_y1]
       shl	eax,ROUND
       mov	.cey1,eax
       mov	.cey2,eax
      ; push     eax eax
      ;push     eax

       movsx	eax,.z1
       shl	eax,CATMULL_SHIFT
      ; mov      .cz1,eax
      ; mov      .cz2,eax
      push     eax eax	     ; .cz1, .cz2
      ;push     eax

       movsx	ecx,.y1      ; ecx = current scanline
       cmp	cx,.y2
       jge	.loop12_done
  .loop12:		     ; upper part: edge 1-3 against edge 1-2
       call	.call_line

       mov	eax,.dx13
       add	.cx1,eax
       mov	ebx,.dx12
       add	.cx2,ebx
if Ext>= SSE2
       movups	xmm0,.cey1   ; cey1, cex1, cby1, cbx1
       movups	xmm1,.cey2   ; cey2, cex2, cby2, cbx2
       movups	xmm2,.dey12q
       movups	xmm3,.dey13q
       paddd	xmm0,xmm3
       paddd	xmm1,xmm2
       movups	.cey1,xmm0
       movups	.cey2,xmm1
else if (Ext = MMX) | (Ext=SSE)
       movq	mm0,.cby2	; with this optimization object
       movq	mm1,.cby1	; looks bit annoying
       movq	mm2,.cey2
       movq	mm3,.cey1
       paddd	mm0,.dby12q
       paddd	mm1,.dby13q
       paddd	mm2,.dey12q
       paddd	mm3,.dey13q
       movq	.cby2,mm0
       movq	.cby1,mm1
       movq	.cey1,mm3
       movq	.cey2,mm2
else
       mov	edx,.dbx13
       add	.cbx1,edx
       mov	eax,.dbx12
       add	.cbx2,eax
       mov	ebx,.dby13
       add	.cby1,ebx
       mov	edx,.dby12
       add	.cby2,edx

       mov	eax,.dex13
       add	.cex1,eax
       mov	ebx,.dex12
       add	.cex2,ebx
       mov	edx,.dey13
       add	.cey1,edx
       mov	eax,.dey12
       add	.cey2,eax

end if
       mov	ebx,.dz13
       add	.cz1,ebx
       mov	edx,.dz12
       add	.cz2,edx

       inc	ecx
       cmp	cx,.y2
       jl	.loop12
    .loop12_done:

       movsx	ecx,.y2
       cmp	cx,.y3
       jge	.loop23_done

       ; restart the second edge exactly at vertex 2
       movsx	eax,.z2
       shl	eax,CATMULL_SHIFT
       mov	.cz2,eax

       movsx	eax,.x2
       shl	eax,ROUND
       mov	.cx2,eax

       movzx	eax,word[.b_x2]
       shl	eax,ROUND
       mov	.cbx2,eax

       movzx	eax,word[.b_y2]
       shl	eax,ROUND
       mov	.cby2,eax

       movzx	eax,word[.e_x2]
       shl	eax,ROUND
       mov	.cex2,eax

       movzx	eax,word[.e_y2]
       shl	eax,ROUND
       mov	.cey2,eax

     .loop23:		     ; lower part: edge 1-3 against edge 2-3
       call	.call_line
;if Ext = NON
       mov	eax,.dx13
       add	.cx1,eax
       mov	ebx,.dx23
       add	.cx2,ebx
if Ext>= SSE2
       movups	xmm0,.cey1
       movups	xmm1,.cey2
       movups	xmm2,.dey23q
       movups	xmm3,.dey13q
       paddd	xmm0,xmm3
       paddd	xmm1,xmm2
       movups	.cey1,xmm0
       movups	.cey2,xmm1
else if (Ext = MMX) | ( Ext = SSE)
       movq	mm0,.cby2	 ;    with this mmx optimization object looks bit
       movq	mm1,.cby1	 ;    annoying
       movq	mm2,.cey2
       movq	mm3,.cey1
       paddd	mm0,.dby23q
       paddd	mm1,.dby13q
       paddd	mm2,.dey23q
       paddd	mm3,.dey13q
       movq	.cby2,mm0
       movq	.cby1,mm1
       movq	.cey2,mm2
       movq	.cey1,mm3

else
       mov	edx,.dbx13
       add	.cbx1,edx
       mov	eax,.dbx23
       add	.cbx2,eax
       mov	ebx,.dby13
       add	.cby1,ebx
       mov	edx,.dby23
       add	.cby2,edx

       mov	eax,.dex13
       add	.cex1,eax
       mov	ebx,.dex23
       add	.cex2,ebx
       mov	edx,.dey13
       add	.cey1,edx
       mov	eax,.dey23
       add	.cey2,eax
end if

       mov	ebx,.dz13
       add	.cz1,ebx
       mov	edx,.dz23
       add	.cz2,edx
;else
;       movq     mm0,.db13q
;       movq     mm1,.cbx1q

       inc	ecx
       cmp	cx,.y3
       jl	.loop23
    .loop23_done:

       mov	esp,ebp      ; drop every local at once
ret   34		     ; pop the 34 bytes of stack parameters

.call_line:
; Scanline helper of two_tex_triangle_z: hands the current edge state to
; two_tex_line_z.  ecx holds the scanline number, edi the screen buffer
; pointer.  Every general register (ebp included, which the callee
; overwrites) is restored by the pushad / popad pair.

       pushad

       ; eax / ebx = integer x of both edges on this scanline.  Loading
       ; them first is safe: the pushes below take only memory operands
       ; and ecx.
       mov	ebx,.cx2
       mov	eax,.cx1
       sar	ebx,ROUND
       sar	eax,ROUND

       ; Stack arguments, pushed in reverse of the callee's frame
       ; ([ebp+56] down to [ebp+4]); two_tex_line_z removes them (ret 56).
       push	.cz1		; -> .z1
       push	.cz2		; -> .z2
       push	.z_buff 	; -> .z_buff
       push	.t_bmap 	; -> .bmap
       push	.t_emap 	; -> .emap
       push	dword .cey2	; -> .ey2
       push	.cex2		; -> .ex2
       push	dword .cey1	; -> .ey1
       push	.cex1		; -> .ex1
       push	dword .cby2	; -> .by2
       push	.cbx2		; -> .bx2
       push	dword .cby1	; -> .by1
       push	.cbx1		; -> .bx1
       push	ecx		; -> .y

       call	two_tex_line_z

       popad
ret
two_tex_line_z:
;------------------------------------------------------------------------
; Draws one horizontal span of a two-texture, Z-tested triangle.
; For every pixel of the span whose interpolated z is below the value
; stored in the Z-buffer, the pixel gets (b. texel * e. texel) / 256 per
; colour byte and the Z-buffer entry is updated.
;
; Screen and textures use 3 bytes per pixel, the Z-buffer one dword per
; pixel.  Rows with y < 0 or y >= size_y_var-1 and spans completely
; outside 0 .. size_x_var-1 are rejected; the span is clipped to the
; screen width.
;
; ebp is overwritten (not saved) and the other general registers are
; clobbered.  The 56 bytes of stack arguments are removed on return.
;------------------------------------------------------------------------
;--------------in: eax - x1
;--------------    ebx - x2
;--------------    edi - pointer to screen buffer
;stack - another parameters :
.y	equ dword [ebp+4]
.bx1	equ  [ebp+8]   ;   ---
.by1	equ  [ebp+12]  ;       |
.bx2	equ  [ebp+16]  ;       |
.by2	equ  [ebp+20]  ;       |>   b. texture and e. texture coords
.ex1	equ  [ebp+24]  ;       |>   shifted shl ROUND
.ey1	equ  [ebp+28]  ;       |
.ex2	equ  [ebp+32]  ;       |
.ey2	equ  [ebp+36]  ;   ---
.emap	equ  [ebp+40]  ; e texture offset
.bmap	equ  [ebp+44]  ; b texture offset
.z_buff equ dword [ebp+48]
.z2	equ dword [ebp+52]  ;   -- |>   z coords shifted
.z1	equ dword [ebp+56]  ;   --       shl  CATMULL_SHIFT

; Locals - created by the pushes / "sub esp" below, in exactly this order.
.x1	equ dword [ebp-4]
.x2	equ dword [ebp-8]
.dbx	equ [ebp-12]	    ; per-pixel deltas
.dex	equ [ebp-16]
.dby	equ [ebp-20]
.dey	equ [ebp-24]
.dz	equ dword [ebp-28]
.cbx	equ [ebp-32]	    ; current values
.cex	equ [ebp-36]
.cby	equ [ebp-40]
.cey	equ [ebp-44]
.cz	equ dword [ebp-48]
.czbuff equ dword [ebp-52]

	mov	ebp,esp

	mov	ecx,.y
	or	ecx,ecx
	jl	.bl_end 	; row above the screen
	mov	dx,word[size_y_var]
	dec	dx
	cmp	cx,dx	;word[size_y_var]  ;SIZE_Y
	jge	.bl_end 	; row at / below size_y_var-1

	cmp	eax,ebx
	jl	@f
	je	.bl_end 	; empty span

	xchg	eax,ebx 	; x1 > x2: swap both ends with all attributes
if Ext=NON
	mov	edx,.bx1
	xchg	edx,.bx2
	mov	.bx1,edx
	mov	edx,.by1
	xchg	edx,.by2
	mov	.by1,edx

	mov	edx,.ex1
	xchg	edx,.ex2
	mov	.ex1,edx
	mov	edx,.ey1
	xchg	edx,.ey2
	mov	.ey1,edx
else
	movq	mm0,.bx1	; bx1,by1
	movq	mm1,.ex1	; ex1,ey1
	movq	mm2,.bx2	; bx2,by2
	movq	mm3,.ex2	; ex2,ey2
	movq	.bx2,mm0
	movq	.ex2,mm1
	movq	.bx1,mm2
	movq	.ex1,mm3
end if
	mov	edx,.z1
	xchg	edx,.z2
	mov	.z1,edx
    @@:
	push	eax ebx    ;store x1, x2
	mov	ebx,.x1
	movzx	edx,word[size_x_var]
	dec	edx
	cmp	ebx,edx
  ;      cmp     bx,word[size_x_var]  ;SIZE_X
	jg	.bl_end 	; span starts right of the screen
	cmp	.x2,0
	jle	.bl_end 	; span ends left of the screen

	mov	ebx,.x2
	sub	ebx,.x1 	; ebx = span length (> 0), divisor below

if  Ext >= SSE

       sub	 esp,16 	     ; reserve .dbx, .dex, .dby, .dey
       cvtsi2ss  xmm3,ebx	     ;rcps
       shufps	 xmm3,xmm3,0

  ;     movq      mm0,.bx1q
  ;     movq      mm1,.bx2q
  ;     movq      mm2,.ex1q
  ;     movq      mm3,.ex2q
  ;     psubd     mm1,mm0
  ;     psubd     mm3,mm2
  ;     cvtpi2ps  xmm1,mm1
  ;     movlhps   xmm1,xmm1
  ;     cvtpi2ps  xmm1,mm3

       cvtpi2ps  xmm0,.bx1 ;mm0     ; bx1; by1
       movlhps	 xmm0,xmm0
       cvtpi2ps  xmm0,.ex1 ;mm2     ; ex1; ey1
       cvtpi2ps  xmm1,.bx2 ;mm1     ; bx2; by2
       movlhps	 xmm1,xmm1
       cvtpi2ps  xmm1,.ex2 ;mm3     ; ex2; ey2
       subps	 xmm1,xmm0
				    ; hi             lo
       divps	 xmm1,xmm3 ; xmm1 -> dby; dbx; dey; dex

       shufps	 xmm1,xmm1,11011000b ; -> dby; dey; dbx; dex
       cvtps2pi  mm0,xmm1	   ; mm0 -> 2 delta dwords
       movhlps	 xmm1,xmm1
       cvtps2pi  mm1,xmm1
       movq	 .dex,mm0 ; hi - lo  ->  dbx, dex
       movq	 .dey,mm1 ; hi - lo  ->  dby, dey

else

	mov	eax,.bx2       ; calc .dbx
	sub	eax,.bx1
	cdq
	idiv	ebx
	push	eax

	mov	eax,.ex2       ; calc .dex
	sub	eax,.ex1
	cdq
	idiv	ebx
	push	eax

	mov	eax,.by2       ; calc .dby
	sub	eax,.by1
	cdq
	idiv	ebx
	push	eax

	mov	eax,.ey2       ; calc .dey
	sub	eax,.ey1
	cdq
	idiv	ebx
	push	eax

end if

	mov	eax,.z2        ; calc .dz
	sub	eax,.z1
	cdq
	idiv	ebx
	push	eax

	cmp	.x1,0	      ; set correctly begin variable
	jge	@f	      ; CLIPPING ON FUNCTION
			      ; cutting triangle exceedes screen
	mov	ebx,.x1
	neg	ebx
	imul	ebx	      ; eax = .dz * abs(.x1)
	add	.z1,eax
	mov	.x1,0

	mov	eax,.dbx      ; advance every attribute by abs(.x1) steps
	imul	ebx
	add    .bx1,eax

	mov	eax,.dby
	imul	ebx
	add	.by1,eax

	mov	eax,.dex
	imul	ebx
	add	.ex1,eax

	mov	eax,.dey
	imul	ebx
	add	.ey1,eax
      @@:
	movzx	eax,word[size_x_var] ;SIZE_X  ;word[size_x_var]
	mov	ebx,.x2
	cmp	eax,ebx
	jg	@f
	mov	.x2,eax       ; clip the right end to the screen width
      @@:
      ;  movd    mm0,eax
      ;  movd    mm1,.x2
      ;  pminsw  mm0,mm1
      ;  movd    .x2,mm0
 ;       cmp     .x2,SIZE_X  ;eax   |
 ;       jl      @f                 |>       this dont work idk cause
 ;       mov     .x2,SIZE_X ;eax    |
      @@:
      ;  movzx   eax,word[size_x_var]       ;calc memory begin in buffers
	mov	ebx,.y	      ; eax still holds size_x_var here
	mul	ebx
	mov	ebx,.x1
	add	eax,ebx       ; eax = y * size_x + x1 (pixel index)
	mov	ebx,eax
	lea	eax,[eax*3]
	add	edi,eax 	  ; edi - screen
	mov	esi,.z_buff	  ; z-buffer filled with dd variables
	shl	ebx,2
	add	esi,ebx 	  ; esi - Z buffer

	mov	ecx,.x2
	sub	ecx,.x1 	  ; ecx = number of pixels to process
	; init current variables
	push	dword .bx1 ;.by1 .ex1 .ey1 .z1 esi   ; .cbx
	push	dword .ex1			     ; .cex
	push	dword .by1			     ; .cby
	push	dword .ey1			     ; .cey

	push	.z1		 ; current z shl CATMULL_SHIFT
	push	esi		 ; .czbuff

if Ext >= MMX
     pxor   mm0,mm0
     movq   mm3,.cex   ; hi - lo -> cbx; cex
     movq   mm4,.cey   ; hi - lo -> cby; cey
;     movq   mm5,mm3
;     movq   mm6,mm4
;     psrad  mm5,ROUND
;     psrad  mm6,ROUND
;     movq   .ceyq,mm5
;     movq   .cbyq,mm6
     mov    edx,.czbuff
else
     cld		; DF stays clear for the lodsb/stosb in the loop
end if
     .draw:
    ; if TEX = SHIFTING   ;bump drawing only in shifting mode
if Ext=NON
	mov	esi,.czbuff	 ; .czbuff current address in buffer
	mov	ebx,.cz 	 ; .cz - cur z position
	cmp	ebx,dword[esi]
else
	mov	ebx,.cz
	cmp	ebx,dword[edx]
end if
	jge	.skip		 ; not closer than the stored z - keep pixel

if Ext=NON
	mov	eax,.cby
	mov	esi,.cbx
	sar	eax,ROUND
	sar	esi,ROUND
	shl	eax,TEX_SHIFT	;-
	add	esi,eax
	lea	esi,[esi*3]	    ;-  ; esi - current b. texture addres
	add	esi,.bmap

	mov	ebx,.cex       ;.cex - current env map X
	mov	eax,.cey       ;.cey - current  env map y
	sar	ebx,ROUND
	sar	eax,ROUND

	shl	eax,TEX_SHIFT
	add	ebx,eax
	lea	ebx,[ebx*3]
	add	ebx,.emap	; ebx - current e. texture address


else
	movq	mm5,mm4 ;.cey
	psrad	mm5,ROUND
	pslld	mm5,TEX_SHIFT
	movq	mm6,mm3 ;.cex
	psrad	mm6,ROUND
	paddd	mm5,mm6
	movq	mm6,mm5
	paddd	mm5,mm5
	paddd	mm5,mm6 	; texel index * 3 for both textures
	paddd	mm5,.emap	; lo += .emap, hi += .bmap
	movd	esi,mm5 	; esi - current e. texture address
	psrlq	mm5,32
	movd	ebx,mm5 	; ebx - current b. texture address
end if
if Ext>=MMX
	movd	  mm1,[esi]
	movd	  mm2,[ebx]
	punpcklbw mm1,mm0
	punpcklbw mm2,mm0
	pmullw	  mm1,mm2
	psrlw	  mm1,8
	packuswb  mm1,mm0
	; A pixel is 3 bytes wide: store exactly 3 bytes.  A 4-byte movd
	; store would also overwrite the first byte of the next pixel,
	; which is visible at the right end of every span (the non-MMX
	; path below writes exactly 3 bytes as well).
	movd	  eax,mm1	; eax is free here, it is reloaded below
	mov	  [edi],ax
	shr	  eax,16
	mov	  [edi+2],al
	mov	  ebx,.cz
	mov	  dword[edx],ebx
else
				; esi - tex b.
	lodsb			; ebx - tex e.
	mov	dl,[ebx]
	mul	dl
	shr	ax,8
	stosb
	inc	ebx
	lodsb
	mov	dl,[ebx]
	mul	dl
	shr	ax,8
	stosb
	inc	ebx
	lodsb
	mov	dl,[ebx]
	mul	dl
	shr	ax,8
	stosb
	mov	ebx,.cz
	mov	esi,.czbuff
	mov	dword[esi],ebx
	jmp	.no_skip	; stosb already advanced edi by 3
end if
     .skip:
	add	edi,3

   if Ext = NON
     .no_skip:
	add	.czbuff,4
	mov	eax,.dbx
	add	.cbx,eax
	mov	eax,.dby
	add	.cby,eax
	mov	eax,.dex
	add	.cex,eax
	mov	eax,.dey
	add	.cey,eax
    else
	add	edx,4
	paddd	mm3,.dex
	paddd	mm4,.dey
  ;      movq    mm5,mm3
  ;      movq    mm6,mm4
  ;      psrad   mm5,ROUND
  ;      psrad   mm6,ROUND
     ;   movq    .cex,mm3
     ;   movq    .cey,mm4
    end if
	mov	eax,.dz
	add	.cz,eax
	; dec/jnz for every Ext: unlike "loop" it is not limited to a
	; short (8-bit) displacement.
	dec	ecx
	jnz	.draw

  .bl_end:
	mov	esp,ebp
ret 56