diff --git a/programs/demos/view3ds/3dmath.inc b/programs/demos/view3ds/3dmath.inc
index b144f91dd1..e8c2174c6d 100644
--- a/programs/demos/view3ds/3dmath.inc
+++ b/programs/demos/view3ds/3dmath.inc
@@ -600,8 +600,9 @@ else
 
     add      esi,12
     add      edi,12
-    dec      ecx
-    jne      .again
+;    dec      ecx
+;    jne      .again
+   loop     .again
     mov      [edi],dword -1
 end if
 ret
@@ -667,7 +668,7 @@ translate_points:  ; just convert into integer; z coord still needed
   end if
 
   .again:
-  if 0
+  if   0
     fld    dword[esi+8]
  ;   fmul   [rsscale]
     fist   word[edi+4]
@@ -695,7 +696,7 @@ translate_points:  ; just convert into integer; z coord still needed
     fiadd  [vect_y]
     fistp  word[edi+2]
    end if
-   if Ext>=SSE
+  if Ext>=SSE2
     movups   xmm0,[esi]
     cvtps2dq xmm0,xmm0
     packssdw xmm0,xmm0
@@ -722,8 +723,6 @@ translate_points:  ; just convert into integer; z coord still needed
 
     add    esi,12
     add    edi,6
-  ;  dec    ecx
-  ;  jnz    .again
     loop    .again
 
 ret
diff --git a/programs/demos/view3ds/3glass.inc b/programs/demos/view3ds/3glass.inc
index 293cbf3881..74648e2fdf 100644
--- a/programs/demos/view3ds/3glass.inc
+++ b/programs/demos/view3ds/3glass.inc
@@ -344,7 +344,7 @@ end if
       pop   ebp
 
 ret
-align 16
+
 glass_line:
 ; in:
 ;    xmm0 - normal vector 1
@@ -362,7 +362,7 @@ glass_line:
 
    push  ebp
    mov   ebp,esp
-   sub   esp,256
+   sub   esp,192         ; local frame; kept a multiple of 4 so esp stays dword-aligned for later push/call
    sub   ebp,16
    and   ebp,0xfffffff0
 
@@ -537,7 +537,7 @@ align 16
         jnz      .ddraw
 
   .end_rp_line:
-        add      esp,256
+        add      esp,192  ; must equal the prologue's sub esp (ebp was realigned, so it cannot restore esp)
         pop      ebp
 
 ret
diff --git a/programs/demos/view3ds/3glass_tex.inc b/programs/demos/view3ds/3glass_tex.inc
index d78fd986d6..a2022ef566 100644
--- a/programs/demos/view3ds/3glass_tex.inc
+++ b/programs/demos/view3ds/3glass_tex.inc
@@ -452,7 +452,7 @@ end if
       pop   ebp
 
 ret
-align 16
+
 glass_tex_line:
 ; in:
 ;    xmm0 - normal vector 1
@@ -747,7 +747,6 @@ end if
      .skip:
         add      edi,4
         add      esi,4
-      ;  addps    xmm0,.dn
         movaps   xmm0,.n1     ; cur normal
         addps    xmm0,.dn
         addps    xmm2,.dtx
diff --git a/programs/demos/view3ds/3ray_shd.inc b/programs/demos/view3ds/3ray_shd.inc
index 7da685927e..86e7a87ba0 100644
--- a/programs/demos/view3ds/3ray_shd.inc
+++ b/programs/demos/view3ds/3ray_shd.inc
@@ -372,7 +372,7 @@ ray_shd_l:
 
    push  ebp
    mov   ebp,esp
-   sub   esp,320
+   sub   esp,272         ; local frame; kept a multiple of 4 so esp stays dword-aligned for later push/call
    sub   ebp,16
    and   ebp,0xfffffff0
 
@@ -421,6 +421,7 @@ ray_shd_l:
         shufps  xmm3,xmm3,11100001b
     @@:
         movd    .cur_tri,mm7
+  ;      sub     .cur_tri,dword 1
         cmp     ax,.x_max
         jge     .end_rp_line
         cmp     bx,.x_min
@@ -689,7 +690,7 @@ end if
         dec      ecx
         jnz      .ddraw
   .end_rp_line:
-        add      esp,320
+        add      esp,272  ; must equal the prologue's sub esp (ebp was realigned, so it cannot restore esp)
         pop      ebp
 
 ret
diff --git a/programs/demos/view3ds/a_procs.inc b/programs/demos/view3ds/a_procs.inc
index 05799e390e..9ad72476cf 100644
--- a/programs/demos/view3ds/a_procs.inc
+++ b/programs/demos/view3ds/a_procs.inc
@@ -62,20 +62,6 @@ ret
 if Ext > SSE2
  ;--------------------------------------------------------------------
 init_point_lights:
-  ;      mov       eax,1000
-  ;      cvtsi2ss  xmm1,eax
-  ;      shufps    xmm1,xmm1,11000000b
-  ;      mov       esi,lights_aligned
-  ;      mov       edi,point_light_coords
-  ;      mov       ecx,3
-  ;    @@:
-  ;      movaps    xmm0,[esi]
-  ;      addps     xmm0,[f05xz]
-  ;      mulps     xmm0,xmm1
-  ;      movaps    [edi],xmm0
-  ;      add       esi,64
-   ;     add       edi,16
-   ;     loop      @b
       mov       ecx,3
       mov       edi,point_light_coords
     @@:
@@ -90,16 +76,11 @@ init_point_lights:
       call      random
       cvtsi2ss  xmm0,eax
       movss     [edi+4],xmm0
-   ;   movzx     ebx,word[size_x_var]
-   ;   shl       ebx,2
-   ;   neg       ebx
       mov       ecx,-1900
-   ;   sub       ecx,100
       mov       edx,-600
       call      random
       cvtsi2ss  xmm0,eax
       movss     [edi+8],xmm0
-   ;   mov       dword[edi+8],-1700.0
       mov       [edi+12],dword 0
       add       edi,16
       pop       ecx
@@ -174,7 +155,7 @@ intersect_tri: ; Moeller-Trumbore method
 ;      or       eax,eax
 ;      jz       @f
       comiss   xmm0,[eps]
-      jl       @f
+      jb       @f          ; COMISS reports through CF/ZF/PF (SF and OF are cleared): taken when xmm0 < [eps] or unordered
 
       rcpss    xmm0,.det
       movss    .invdet,xmm0
@@ -228,7 +209,7 @@ intersect_tri: ; Moeller-Trumbore method
    ;   test     eax,1
    ;   jz       @f
       comiss   xmm1,[eps]
-      jl       @f
+      jb       @f          ; COMISS reports through CF/ZF/PF (SF and OF are cleared): taken when xmm1 < [eps] or unordered
 
       mov      eax,1
       cmp      .ift,0
@@ -264,6 +245,16 @@ do_edges_list:
     .edd_ptr  equ [ebp-8]
     .counter  equ [ebp-12]
 
+       mov     ebx, 12            ; eax/ebx = 68/12 select the system call issued by int 0x40 below
+       mov     eax, 68
+       mov     ecx,[triangles_count_var]
+       lea     ecx,[ecx*3]        ; ecx = 3 * triangle count (one entry per triangle side)
+       shl     ecx,4              ; 16 bytes per entry requested (the loops below step 8 bytes per entry)
+       add     ecx,1024           ; plus 1024 bytes of slack
+       mov     edx,[edges_ptr]    ; NOTE(review): old pointer passed as for a realloc, yet subfunction is 12 - confirm the previous buffer is released
+       int     0x40                   ;  -> allocate memory to edges
+       mov     [edges_ptr], eax   ;  -> eax = pointer to allocated mem
+
 
     mov     ebx,[edges_ptr]
     mov     eax,[triangles_ptr]
@@ -280,17 +271,18 @@ do_edges_list:
     loop    @b
 
 
+
     mov     ebx,[edges_ptr]
     mov     ecx,[triangles_count_var]
     lea     ecx,[ecx*3]
  .mxd:
    mov     eax,[ebx]
+    mov     edx,[ebx+4]    ; eax/edx = the two dwords of the current 8-byte entry
    cmp     eax,[ebx+4]
-    jl      @f
-    movq    xmm0,[ebx]
-    pshufd  xmm0,xmm0,11100001b
-    movq    [ebx],xmm0
-  @@:
+    cmovg   eax,edx        ; first > second (signed): eax = the smaller dword
+    cmovg   edx,[ebx]      ; ...and edx = the larger one (CMOV leaves the flags of the cmp intact)
+    mov     [ebx],eax      ; write the pair back ordered: smaller dword first
+    mov     [ebx+4],edx
    add     ebx,8
    loop    .mxd
 
@@ -303,20 +295,20 @@ do_edges_list:
    mov    esi,ecx
    shl    esi,3
    add    esi,ebx
-
+   dec    ecx             ; one pass fewer than ecx: every pass reads the pair [ebx] and [ebx+8]
 .ccc:
   mov    eax,[ebx+8]
   cmp    eax,[ebx]
-   jge    .g
+   jae    .g              ; first dwords compared as unsigned values: already in order, next pair
   movq   xmm0,[ebx+8]
   push   ebx
 .c:
   cmp    ebx,esi
-   jge    .done
+   jae    .done           ; ebx/esi are addresses, so the bound check must be unsigned
   cmp    ebx,[edges_ptr]
-   jl     .done
+   jb     .done           ; stop once ebx has moved below the start of the buffer (unsigned address compare)
   cmp    eax,[ebx]
-   jge    .done
+   jae    .done           ; found the slot: key in eax is not below [ebx] (unsigned)
    movq   xmm7,[ebx]
    movq   [ebx+8],xmm7
    sub    ebx,8
@@ -328,10 +320,7 @@ do_edges_list:
    pop    ebx
  .g:
    add    ebx,8
-   dec    ecx
-   cmp    ecx,1
-   jnz    .ccc
-
+   loop    .ccc
 
   ; insert sort again
    mov    ebx,[edges_ptr]
@@ -350,7 +339,7 @@ do_edges_list:
    inc    ecx
    add    ebx,8
    cmp    ebx,esi
-   jge    .br         ; break
+   jae    .br         ; break
    cmp    eax,[ebx]
    je     .aa
    mov    .counter,ecx
@@ -368,12 +357,12 @@ do_edges_list:
    mov    eax,[ebx+12]
    mov    edx,[ebx+8]
    cmp    eax,[ebx+4]
-   jge    .gg2
+   jae    .gg2
    movq   xmm0,[ebx+8]
    push   ebx
  .c2:
    cmp    eax,[ebx+4]
-   jge    .done2
+   jae    .done2
    movq   xmm7,[ebx]
    movq   [ebx+8],xmm7
 
@@ -405,60 +394,68 @@ do_edges_list:
    add   esp,8
  .ff:
 
+
    ; count edges
-   mov    ecx,0
-   mov    edx,[triangles_count_var]
-   lea    edx,[edx*3]
-   mov    ebx,[edges_ptr]
-;   mov    esi,edx
-;   shl    esi,3
-;   add    esi,[edges_ptr]
+
+   mov    ecx,[triangles_count_var]
+   lea    ecx,[ecx*3+3]   ; entries to scan; NOTE(review): 3 more than the 3*triangles used by the loops above - confirm reading slack is intended
+   mov    esi,[edges_ptr]
+   xor    edx,edx         ; edx = running count of entries that differ from their successor
+   cld                    ; lodsd must advance esi upward
 .nx:
-   movq    xmm0,[ebx]
-   add     ebx,8
-;   cmp     ebx,esi
-;   jae     @f
-   movq    xmm1,[ebx]
-; @@:
-   pcmpeqd xmm0,xmm1
-   pmovmskb eax,xmm0
-   and     eax,0xff
-   cmp     eax,0xff
-   jz      @f
-   inc     ecx
-  @@:
-   dec     edx
-   jnz     .nx
+   lodsd                  ; first dword of the current entry
+   mov    ebx,eax
+   lodsd                  ; second dword; esi now points at the following entry
+   cmp    ebx,[esi]       ; compare both dwords with the following entry
+   jnz    .ic
+   cmp    eax,[esi+4]
+   jnz    .ic
+   loop   .nx             ; identical to successor: not counted
+   jmp    .endc
+ .ic:
+
+   inc    edx             ; differs from successor: count it
+   loop   .nx
+  .endc:
+    mov     .ed_cnt,edx   ; remember the count
+    mov     ecx,edx       ; ecx = count, used next to size the new buffer
 
 
-    mov     .ed_cnt,ecx
-    lea     ecx,[ecx*3]
-    shl     ecx,2
+    shl     ecx,3
     add     ecx,65536
     mov     ebx,12
     mov     eax,68
     mov     edx,.edd_ptr
-    int     0x40                   ;  -> allocate memory to triangles
+    int     0x40            ;  -> allocate memory to new edges
     mov     .edd_ptr, eax   ;  -> eax = pointer to allocated mem
 
 
 
-   mov      ebx,[edges_ptr]
-   mov      ecx,[triangles_count_var]
-   lea      ecx,[ecx*3]
-  .seek:
-   movq     xmm0,[ebx]
-   movq     xmm1,[ebx+8]
-   pcmpeqd  xmm1,xmm0
-   pmovmskb edx,xmm1
-   and      edx,0xff
-   cmp      edx,0xff
-   je       @f
-   movq     [eax],xmm0
-   add      eax,8
- @@:
-   add      ebx,8
-   loop     .seek
+   mov    ecx,[triangles_count_var]
+   lea    ecx,[ecx*3]
+   add    ecx,ecx         ; NOTE(review): 6*triangles iterations of 8 bytes each, twice what the counting pass scanned - confirm the output cannot outgrow its buffer
+   mov    esi,[edges_ptr] ; source: the sorted entries
+   mov    edi,eax         ; destination: eax still holds the pointer returned by the allocation call above
+   xor    edx,edx         ; edx = number of entries copied
+   cld                    ; lodsd/stosd must advance upward
+ .nx1:
+   lodsd                  ; first dword of the current entry
+   mov    ebx,eax
+   lodsd                  ; second dword; esi now points at the following entry
+   cmp    ebx,[esi]       ; compare both dwords with the following entry
+   jnz    .ic1
+   cmp    eax,[esi+4]
+   jnz    .ic1
+   loop   .nx1            ; identical to successor: skip it
+   jmp    .endc1
+ .ic1:
+   xchg   eax,ebx         ; eax = first dword, ebx = second dword
+   stosd                  ; emit first dword
+   mov    eax,ebx
+   stosd                  ; emit second dword
+   inc    edx
+   loop   .nx1
+  .endc1:
 
    mov       eax,68
    mov       ebx,13
@@ -595,8 +592,8 @@ draw_dots:
    mov     edi,[screen_ptr]
    lea     eax,[eax*3]
    add     edi,eax
-   xor     eax,eax
-   not     eax
+   or      eax,-1
+;   not     eax
    stosd
  @@:
    loop    .drw
diff --git a/programs/demos/view3ds/bump_tex.inc b/programs/demos/view3ds/bump_tex.inc
index 12c696b6a7..3b645496a1 100644
--- a/programs/demos/view3ds/bump_tex.inc
+++ b/programs/demos/view3ds/bump_tex.inc
@@ -707,7 +707,7 @@ if  Ext >= SSE2
        movups  .cty2,xmm3
 end if
 
-if  (Ext = MMX)
+if  (Ext = MMX)| (Ext = SSE)
        movq     mm0,.cby2
        movq     mm1,.cby1
        movq     mm2,.cey2
@@ -843,7 +843,7 @@ if  Ext >= SSE2
 end if
 
 
-if (Ext = MMX)
+if  (Ext = MMX)| (Ext = SSE)
        movq     mm0,.cby2
        movq     mm1,.cby1
        movq     mm2,.cey2
@@ -1469,13 +1469,3 @@ end if
   .bl_end:
         mov     esp,ebp
 ret 76
-;Ext = MMX
-
-;     else
-;        movq    mm5, qword[.temp1]  ;-
-;        paddd   mm5, qword[.temp5]  ; .temp5 == low dword = TEX_X, high dword = -TEX_X
-;        pand    mm5, qword[.temp3]  ; .temp3 == low = high dword = TEX_SIZE
-;        paddd   mm5, qword[.temp4]  ; .temp4 == low = high dword = offset .bmap
-;        movd    ebx,mm5
-;        psrlq   mm5,32
-;     end if
diff --git a/programs/demos/view3ds/chunks.inc b/programs/demos/view3ds/chunks.inc
index 051c1a8a1b..e1882f8470 100644
--- a/programs/demos/view3ds/chunks.inc
+++ b/programs/demos/view3ds/chunks.inc
@@ -76,14 +76,19 @@ detect_chunks:
    mov     .chmr,eax     ; chunks mark if bit is set - tri was used
 
    mov     edi,eax
-   pxor    xmm0,xmm0
+;   pxor    xmm0,xmm0
   mov     ecx,[triangles_count_var]
-   shr     ecx,7
+   shr     ecx,5          ; dword count = triangles / 32 (the inc below rounds it up)
   inc     ecx
- @@:
-   movdqa  [edi],xmm0
-   add     edi,16
-   loop    @b
+   xor     eax,eax        ; fill value 0
+   cld                    ; stosd must advance edi upward
+   rep     stosd          ; zero ecx dwords starting at edi
+;   shr     ecx,7
+;   inc     ecx
+; @@:
+;   movdqa  [edi],xmm0
+;   add     edi,16
+;   loop    @b
 
 
   mov      eax,[points_count_var]
@@ -293,6 +298,7 @@ detect_chunks:
 
         mov     .up,esi
         mov     .str,edi
+;        mov     edi,.tri_ch1
     .lb1:                     ; nx chunk
         cmp     edi,.ltch1
         jnb     .endl
@@ -399,7 +405,7 @@ detect_chunks:
 
 
 
-;   mov    ebx,.chunks
+   mov    ebx,.chunks
    mov    ecx,.ch_cnt
 
    mov  esi,.tri_ch
diff --git a/programs/demos/view3ds/data.inc b/programs/demos/view3ds/data.inc
index 4feac6fdb6..f32b274501 100644
--- a/programs/demos/view3ds/data.inc
+++ b/programs/demos/view3ds/data.inc
@@ -357,7 +357,7 @@ base_vector:
       if Ext=SSE3
         db   ' (SSE3)'
       end if
-        db   ' 0.076',0
+        db   ' 0.077',0
     labellen:
         STRdata db '-1        '
      lab_vert:
@@ -488,7 +488,7 @@ end if
     the_one:
            times 4 dd 1.0
 
-     eps:  times 4 dd 0.00000
+     eps:  times 4 dd 0.000001
 
         vect_x:        dw SIZE_X / 2
         vect_y        dw SIZE_Y / 2
@@ -500,9 +500,9 @@ end if
         xres_var                dw SIZE_X
 
 
-     epsone      dd 1.0001
+     epsone      dd 1.00001
     aprox       dd 0.0001
-     epsminus    dd -0.0001
+     epsminus    dd 0.00001   ; NOTE(review): now positive although the name says "minus" (was -0.0001) - confirm every user expects this sign
 
 
         file_info:
@@ -513,9 +513,9 @@ end if
          fptr   dd      0 ;workarea
         file_name:
                db      '/sys/3d/house.3ds',0
-             ;  db      '/tmp0/1/sc.3ds',0
+             ;  db      '/tmp0/1/bmwm3.3ds',0
 
-      rb 256
+      rb 1024
 
 
 I_END:
diff --git a/programs/demos/view3ds/grd_cat.inc b/programs/demos/view3ds/grd_cat.inc
index 5480b12b6d..c154ec785f 100644
--- a/programs/demos/view3ds/grd_cat.inc
+++ b/programs/demos/view3ds/grd_cat.inc
@@ -37,19 +37,19 @@ gouraud_triangle_z:
 .dz12  equ dword[ebp-20]
 .dc12r equ dword[ebp-24]
 .dc12g equ dword[ebp-28]
-.dc12b equ dword[ebp-32]
+.dc12b equ [ebp-32]
 
 .dx13  equ dword[ebp-36]
 .dz13  equ dword[ebp-40]
 .dc13r equ dword[ebp-44]
 .dc13g equ dword[ebp-48]
-.dc13b equ dword[ebp-52]
+.dc13b equ [ebp-52]
 
 .dx23  equ dword[ebp-56]
 .dz23  equ dword[ebp-60]
 .dc23r equ dword[ebp-64]
 .dc23g equ dword[ebp-68]
-.dc23b equ dword[ebp-72]
+.dc23b equ [ebp-72]
 
 .zz1   equ dword[ebp-76]
 .c1r   equ dword[ebp-80]
@@ -78,7 +78,7 @@ end if
 
        mov     ebp,esp
      ;  sub     esp,84
- .sort3:		  ; sort triangle coordinates...
+ .sort3:                  ; sort triangle coordinates...
        cmp     ax,bx
        jle     .sort1
        xchg    eax,ebx
@@ -89,389 +89,491 @@ end if
        xchg    edx,dword[.col2b]
        mov     dword[.col1b],edx
  .sort1:
-       cmp	bx,cx
-       jle	.sort2
-       xchg	ebx,ecx
-       mov	edx,dword[.col2r]
-       xchg	edx,dword[.col3r]
-       mov	dword[.col2r],edx
-       mov	edx,dword[.col2b]
-       xchg	edx,dword[.col3b]
-       mov	dword[.col2b],edx
+       cmp      bx,cx
+       jle      .sort2
+       xchg     ebx,ecx
+       mov      edx,dword[.col2r]
+       xchg     edx,dword[.col3r]
+       mov      dword[.col2r],edx
+       mov      edx,dword[.col2b]
+       xchg     edx,dword[.col3b]
+       mov      dword[.col2b],edx
        jmp .sort3
  .sort2:
-       push	eax	     ; store in variables
-       push	ebx
-       push	ecx
-	 mov	  edx,80008000h  ; eax,ebx,ecx are ANDd together into edx which means that
-	 and	  edx,ebx	 ; if *all* of them are negative a sign flag is raised
-	 and	  edx,ecx
-	 and	  edx,eax
-	 test	  edx,80008000h  ; Check both X&Y at once
-	 jne	  .gt_loop2_end
+       push     eax          ; store in variables
+       push     ebx
+       push     ecx
+         mov      edx,80008000h  ; eax,ebx,ecx are ANDd together into edx which means that
+         and      edx,ebx        ; if *all* of them are negative a sign flag is raised
+         and      edx,ecx
+         and      edx,eax
+         test     edx,80008000h  ; Check both X&Y at once
+         jne      .gt_loop2_end
 
-       mov	bx,.y2	     ; calc deltas
-       sub	bx,.y1
-       jnz	.gt_dx12_make
+       mov      bx,.y2       ; calc deltas
+       sub      bx,.y1
+       jnz      .gt_dx12_make
       ; mov      .dx12,0
       ; mov      .dz12,0
       ; mov      .dc12r,0
       ; mov      .dc12g,0
       ; mov      .dc12b,0
-       mov	ecx,5
+       mov      ecx,5
      @@:
-       push	dword 0
-       loop	@b
-       jmp	.gt_dx12_done
+       push     dword 0
+       loop     @b
+       jmp      .gt_dx12_done
   .gt_dx12_make:
-       mov	ax,.x2
-       sub	ax,.x1
-       cwde
-       movsx	ebx,bx
-       shl	eax,ROUND
+
+if Ext>= SSE2
+
+       movsx    ebx,bx
+       mov      eax,1 shl 15
        cdq
-       idiv	ebx
+       idiv     ebx                    ; eax = (1 shl 15) / (y2 - y1): fixed-point reciprocal of the row count
+    ;   push     eax
+       mov      ebx,eax                ; ebx = reciprocal, reused for every delta of this edge
+
+
+       mov      ax,.x2
+       sub      ax,.x1
+       cwde
+       imul     ebx                    ; (x2 - x1) * reciprocal
+       sar      eax,15 - ROUND         ; rescale from 15 to ROUND fractional bits
+       push     eax                    ; x step per row, in the slot named by the commented-out line below
+    ;   mov      .dx12,eax
+
+       sub       esp,4*4               ; reserve the four dword slots written by the movdqu below
+       movd      xmm0,ebx
+       pshuflw   xmm0,xmm0,0           ; low 16 bits of the reciprocal in four word lanes; NOTE(review): 32768 (one row) reads as -32768 - confirm harmless
+       movq      xmm1,[.col1r]         ; four words starting at .col1r (presumably r,g,b,z - layout is outside this hunk)
+       movq      xmm2,[.col2r]         ; the same four words of vertex 2
+       psubw     xmm2,xmm1             ; per-word differences, vertex 2 minus vertex 1
+       movdqa    xmm3,xmm2
+       pmullw    xmm2,xmm0             ; low halves of the signed 16x16 products
+       pmulhw    xmm3,xmm0             ; high halves
+       punpcklwd xmm2,xmm3             ; interleave into four signed 32-bit products
+       psrad     xmm2,15 - ROUND       ; NOTE(review): lane 3 lands in the .dz12 slot; the scalar branch scales dz by CATMULL_SHIFT - confirm
+       pshufd    xmm2,xmm2,11000110b   ; swap dwords 0 and 2 so the .col1r lane ends up in the .dc12r slot
+       movdqu    .dc12b,xmm2           ; 16 bytes at [ebp-32]: .dc12b, .dc12g, .dc12r, .dz12
+else
+       mov      ax,.x2
+       sub      ax,.x1
+       cwde
+       movsx    ebx,bx
+       shl      eax,ROUND
+       cdq
+       idiv     ebx
  ;      mov      .dx12,eax
-       push	 eax
+       push      eax
 
-       mov	ax,word[.z2]
-       sub	ax,word[.z1]
+       mov      ax,word[.z2]
+       sub      ax,word[.z1]
        cwde
-       shl	eax,CATMULL_SHIFT
+       shl      eax,CATMULL_SHIFT
        cdq
-       idiv	ebx
-       push	eax
+       idiv     ebx
+       push     eax
 
-       mov	ax,word[.col2r]
-       sub	ax,word[.col1r]
+       mov      ax,word[.col2r]
+       sub      ax,word[.col1r]
        cwde
-       shl	eax,ROUND
+       shl      eax,ROUND
        cdq
-       idiv	ebx
+       idiv     ebx
       ; mov      .dc12r,eax
-       push	  eax
-       mov	  ax,word[.col2g]
-       sub	  ax,word[.col1g]
+       push       eax
+       mov        ax,word[.col2g]
+       sub        ax,word[.col1g]
        cwde
-       shl	eax,ROUND
+       shl      eax,ROUND
        cdq
-       idiv	ebx
+       idiv     ebx
      ;  mov .dc12g,eax
-       push	eax
-       mov	ax,word[.col2b]        ;;---
-       sub	ax,word[.col1b]
+       push     eax
+       mov      ax,word[.col2b]        ;;---
+       sub      ax,word[.col1b]
        cwde
-       shl	eax,ROUND
+       shl      eax,ROUND
        cdq
-       idiv	ebx
+       idiv     ebx
       ; mov .dc12b,eax
-       push	eax
+       push     eax
+end if
    .gt_dx12_done:
 
-       mov	bx,.y3	     ; calc deltas
-       sub	bx,.y1
-       jnz	.gt_dx13_make
+       mov      bx,.y3       ; calc deltas
+       sub      bx,.y1
+       jnz      .gt_dx13_make
       ; mov      .dx13,0
       ; mov      .dz13,0
       ; mov      .dc13r,0
       ; mov      .dc13g,0
       ; mov      .dc13b,0
-       mov	ecx,5
+       mov      ecx,5
      @@:
-       push	dword 0
-       loop	@b
-       jmp	.gt_dx13_done
+       push     dword 0
+       loop     @b
+       jmp      .gt_dx13_done
     .gt_dx13_make:
-       mov	ax,.x3
-       sub	ax,.x1
-       cwde
-       movsx	ebx,bx
-       shl	eax,ROUND
+
+if Ext>= SSE2
+
+       movsx    ebx,bx
+       mov      eax,1 shl 15
        cdq
-       idiv	ebx
+       idiv     ebx                    ; eax = (1 shl 15) / (y3 - y1): fixed-point reciprocal of the row count
+       mov      ebx,eax                ; ebx = reciprocal, reused for every delta of this edge
+
+
+       mov      ax,.x3
+       sub      ax,.x1
+       cwde
+       imul     ebx                    ; (x3 - x1) * reciprocal
+       sar      eax,15 - ROUND         ; rescale from 15 to ROUND fractional bits
+       push     eax                    ; x step per row (presumably the .dx13 slot)
+
+       sub       esp,4*4               ; reserve the four dword slots written by the movdqu below
+       movd      xmm0,ebx
+       pshuflw   xmm0,xmm0,0           ; low 16 bits of the reciprocal in four word lanes; NOTE(review): 32768 (one row) reads as -32768 - confirm harmless
+       movq      xmm1,[.col1r]         ; four words starting at .col1r (presumably r,g,b,z - layout is outside this hunk)
+       movq      xmm2,[.col3r]         ; the same four words of vertex 3
+       psubw     xmm2,xmm1             ; per-word differences, vertex 3 minus vertex 1
+       movdqa    xmm3,xmm2
+       pmullw    xmm2,xmm0             ; low halves of the signed 16x16 products
+       pmulhw    xmm3,xmm0             ; high halves
+       punpcklwd xmm2,xmm3             ; interleave into four signed 32-bit products
+       psrad     xmm2,15 - ROUND       ; NOTE(review): lane 3 lands in the .dz13 slot; the scalar branch scales dz by CATMULL_SHIFT - confirm
+       pshufd    xmm2,xmm2,11000110b   ; swap dwords 0 and 2 so the .col1r lane ends up in the .dc13r slot
+       movdqu    .dc13b,xmm2           ; 16 bytes at [ebp-52]: .dc13b, .dc13g, .dc13r, .dz13
+else
+
+       mov      ax,.x3
+       sub      ax,.x1
+       cwde
+       movsx    ebx,bx
+       shl      eax,ROUND
+       cdq
+       idiv     ebx
  ;      mov      .dx13,eax
-       push	 eax
+       push      eax
 
-       mov	ax,word[.z3]
-       sub	ax,word[.z1]
+       mov      ax,word[.z3]
+       sub      ax,word[.z1]
        cwde
-       shl	eax,CATMULL_SHIFT
+       shl      eax,CATMULL_SHIFT
        cdq
-       idiv	ebx
-       push	eax
+       idiv     ebx
+       push     eax
 
-       mov	ax,word[.col3r]
-       sub	ax,word[.col1r]
+       mov      ax,word[.col3r]
+       sub      ax,word[.col1r]
        cwde
-       shl	eax,ROUND
+       shl      eax,ROUND
        cdq
-       idiv	ebx
+       idiv     ebx
       ; mov      .dc13r,eax
-       push	  eax
-       mov	  ax,word[.col3g]
-       sub	  ax,word[.col1g]
+       push       eax
+       mov        ax,word[.col3g]
+       sub        ax,word[.col1g]
        cwde
-       shl	eax,ROUND
+       shl      eax,ROUND
        cdq
-       idiv	ebx
+       idiv     ebx
      ;  mov .dc13g,eax
-       push	eax
-       mov	ax,word[.col3b]
-       sub	ax,word[.col1b]
+       push     eax
+       mov      ax,word[.col3b]
+       sub      ax,word[.col1b]
        cwde
-       shl	eax,ROUND
+       shl      eax,ROUND
        cdq
-       idiv	ebx
+       idiv     ebx
       ; mov .dc13b,eax
-       push	eax
+       push     eax
+end if
    .gt_dx13_done:
 
-       mov	bx,.y3	     ; calc deltas
-       sub	bx,.y2
-       jnz	.gt_dx23_make
+       mov      bx,.y3       ; calc deltas
+       sub      bx,.y2
+       jnz      .gt_dx23_make
       ; mov      .dx23,0
       ; mov      .dz23,0
       ; mov      .dc23r,0
       ; mov      .dc23g,0
       ; mov      .dc23b,0
-       mov	ecx,5
+       mov      ecx,5
      @@:
-       push	dword 0
-       loop	@b
-       jmp	.gt_dx23_done
+       push     dword 0
+       loop     @b
+       jmp      .gt_dx23_done
     .gt_dx23_make:
-       mov	ax,.x3
-       sub	ax,.x2
-       cwde
-       movsx	ebx,bx
-       shl	eax,ROUND
+
+if Ext>= SSE2
+
+       movsx    ebx,bx
+       mov      eax,1 shl 15
        cdq
-       idiv	ebx
+       idiv     ebx                    ; eax = (1 shl 15) / (y3 - y2): fixed-point reciprocal of the row count
+    ;   push     eax
+       mov      ebx,eax                ; ebx = reciprocal, reused for every delta of this edge
+
+       mov      ax,.x3
+       sub      ax,.x2
+       cwde
+       imul     ebx                    ; (x3 - x2) * reciprocal
+       sar      eax,15 - ROUND         ; rescale from 15 to ROUND fractional bits
+       push     eax                    ; x step per row (presumably the .dx23 slot)
+
+       sub       esp,4*4               ; reserve the four dword slots written by the movdqu below
+       movd      xmm0,ebx
+       pshuflw   xmm0,xmm0,0           ; low 16 bits of the reciprocal in four word lanes; NOTE(review): 32768 (one row) reads as -32768 - confirm harmless
+       movq      xmm1,[.col2r]         ; four words starting at .col2r (presumably r,g,b,z - layout is outside this hunk)
+       movq      xmm2,[.col3r]         ; the same four words of vertex 3
+       psubw     xmm2,xmm1             ; per-word differences, vertex 3 minus vertex 2
+       movdqa    xmm3,xmm2
+       pmullw    xmm2,xmm0             ; low halves of the signed 16x16 products
+       pmulhw    xmm3,xmm0             ; high halves
+       punpcklwd xmm2,xmm3             ; interleave into four signed 32-bit products
+       psrad     xmm2,15 - ROUND       ; NOTE(review): lane 3 lands in the .dz23 slot; the scalar branch scales dz by CATMULL_SHIFT - confirm
+       pshufd    xmm2,xmm2,11000110b   ; swap dwords 0 and 2 so the .col2r lane ends up in the .dc23r slot
+       movdqu    .dc23b,xmm2           ; 16 bytes at [ebp-72]: .dc23b, .dc23g, .dc23r, .dz23
+else
+
+
+       mov      ax,.x3
+       sub      ax,.x2
+       cwde
+       movsx    ebx,bx
+       shl      eax,ROUND
+       cdq
+       idiv     ebx
  ;      mov      .dx23,eax
-       push	 eax
+       push      eax
 
-       mov	ax,word[.z3]
-       sub	ax,word[.z2]
+       mov      ax,word[.z3]
+       sub      ax,word[.z2]
        cwde
-       shl	eax,CATMULL_SHIFT
+       shl      eax,CATMULL_SHIFT
        cdq
-       idiv	ebx
-       push	eax
+       idiv     ebx
+       push     eax
 
-       mov	ax,word[.col3r]
-       sub	ax,word[.col2r]
+       mov      ax,word[.col3r]
+       sub      ax,word[.col2r]
        cwde
-       shl	eax,ROUND
+       shl      eax,ROUND
        cdq
-       idiv	ebx
+       idiv     ebx
       ; mov     .dc23r,eax
-       push	eax
-       mov	ax,word[.col3g]
-       sub	ax,word[.col2g]
+       push     eax
+       mov      ax,word[.col3g]
+       sub      ax,word[.col2g]
        cwde
-       shl	eax,ROUND
+       shl      eax,ROUND
        cdq
-       idiv	ebx
+       idiv     ebx
      ;  mov .dc23g,eax
-       push	eax
-       mov	ax,word[.col3b]
-       sub	ax,word[.col2b]
+       push     eax
+       mov      ax,word[.col3b]
+       sub      ax,word[.col2b]
        cwde
-       shl	eax,ROUND
+       shl      eax,ROUND
        cdq
-       idiv	ebx
+       idiv     ebx
       ; mov .dc23b,eax
-       push	eax
+       push     eax
+end if
    .gt_dx23_done:
-       sub	esp,32
+       sub      esp,32
 
-       movsx	eax,.x1 		   ; eax - cur x1
-       shl	eax,ROUND		   ; ebx - cur x2
-       mov	ebx,eax
-       movsx	edx,word[.z1]
-       shl	edx,CATMULL_SHIFT
-       mov	.zz1,edx
-       mov	.zz2,edx
-       movzx	edx,word[.col1r]
-       shl	edx,ROUND
-       mov	.c1r,edx
-       mov	.c2r,edx
-       movzx	edx,word[.col1g]
-       shl	edx,ROUND
-       mov	.c1g,edx
-       mov	.c2g,edx
-       movzx	edx,word[.col1b]
-       shl	edx,ROUND
-       mov	.c1b,edx
-       mov	.c2b,edx
-       mov	cx,.y1
-       cmp	cx,.y2
-       jge	.gt_loop1_end
+       movsx    eax,.x1                    ; eax - cur x1
+       shl      eax,ROUND                  ; ebx - cur x2
+       mov      ebx,eax
+       movsx    edx,word[.z1]
+       shl      edx,CATMULL_SHIFT
+       mov      .zz1,edx
+       mov      .zz2,edx
+       movzx    edx,word[.col1r]
+       shl      edx,ROUND
+       mov      .c1r,edx
+       mov      .c2r,edx
+       movzx    edx,word[.col1g]
+       shl      edx,ROUND
+       mov      .c1g,edx
+       mov      .c2g,edx
+       movzx    edx,word[.col1b]
+       shl      edx,ROUND
+       mov      .c1b,edx
+       mov      .c2b,edx
+       mov      cx,.y1
+       cmp      cx,.y2
+       jge      .gt_loop1_end
 
     .gt_loop1:
        pushad
     ; macro .debug
 
-       mov	edx,.c2r	      ; c2r,c2g,c2b,c1r,c1g,c1b - current colors
-       sar	edx,ROUND
-       push	dx
-       mov	edx,.c2g
-       sar	edx,ROUND
-       push	dx
-       mov	edx,.c2b
-       sar	edx,ROUND
-       push	dx
-       sar	ebx,ROUND    ; x2
-       push	bx
-       mov	edx,.c1r
-       sar	edx,ROUND
-       push	dx
-       mov	edx,.c1g
-       sar	edx,ROUND
-       push	dx
-       mov	edx,.c1b
-       sar	edx,ROUND
-       push	dx
-       sar	eax,ROUND
-       push	ax	      ; x1
-       push	cx	      ; y
-       push	.zz2
-       push	.zz1
-       call	gouraud_line_z
+       mov      edx,.c2r              ; c2r,c2g,c2b,c1r,c1g,c1b - current colors
+       sar      edx,ROUND
+       push     dx
+       mov      edx,.c2g
+       sar      edx,ROUND
+       push     dx
+       mov      edx,.c2b
+       sar      edx,ROUND
+       push     dx
+       sar      ebx,ROUND    ; x2
+       push     bx
+       mov      edx,.c1r
+       sar      edx,ROUND
+       push     dx
+       mov      edx,.c1g
+       sar      edx,ROUND
+       push     dx
+       mov      edx,.c1b
+       sar      edx,ROUND
+       push     dx
+       sar      eax,ROUND
+       push     ax            ; x1
+       push     cx            ; y
+       push     .zz2
+       push     .zz1
+       call     gouraud_line_z
 
        popad
+
 if Ext >= MMX
-       movq	mm0,.c1bM
-       paddd	mm0,qword .dc13bM
-       movq	.c1bM,mm0
-       movq	mm1,.c2bM
-       paddd	mm1,qword .dc12bM
-       movq	.c2bM,mm1
+       movq     mm0,.c1bM
+       paddd    mm0,qword .dc13bM
+       movq     .c1bM,mm0
+       movq     mm1,.c2bM
+       paddd    mm1,qword .dc12bM
+       movq     .c2bM,mm1
 
-       movq	mm0,.c1rM
-       paddd	mm0,qword .dc13rM
-       movq	.c1rM,mm0
-       movq	mm1,.c2rM
-       paddd	mm1,qword .dc12rM
-       movq	.c2rM,mm1
+       movq     mm0,.c1rM
+       paddd    mm0,qword .dc13rM
+       movq     .c1rM,mm0
+       movq     mm1,.c2rM
+       paddd    mm1,qword .dc12rM
+       movq     .c2rM,mm1
 else
-       mov	edx,.dc13r
-       add	.c1r,edx
-       mov	edx,.dc13g
-       add	.c1g,edx
-       mov	edx,.dc13b
-       add	.c1b,edx
-       mov	edx,.dc12r
-       add	.c2r,edx
-       mov	edx,.dc12g
-       add	.c2g,edx
-       mov	edx,.dc12b
-       add	.c2b,edx
+       mov      edx,.dc13r
+       add      .c1r,edx
+       mov      edx,.dc13g
+       add      .c1g,edx
+       mov      edx,.dc13b
+       add      .c1b,edx
+       mov      edx,.dc12r
+       add      .c2r,edx
+       mov      edx,.dc12g
+       add      .c2g,edx
+       mov      edx,.dc12b
+       add      .c2b,edx
 
-       mov	edx,.dz13
-       add	.zz1,edx
-       mov	edx,.dz12
-       add	.zz2,edx
+       mov      edx,.dz13
+       add      .zz1,edx
+       mov      edx,.dz12
+       add      .zz2,edx
 end if
-       add	eax,.dx13
-       add	ebx,.dx12
-       inc	cx
-       cmp	cx,.y2
-       jl	.gt_loop1
+       add      eax,.dx13
+       add      ebx,.dx12
+       inc      cx
+       cmp      cx,.y2
+       jl       .gt_loop1
 
    .gt_loop1_end:
-       mov	cx,.y2
-       cmp	cx,.y3
-       jge	.gt_loop2_end
+       mov      cx,.y2
+       cmp      cx,.y3
+       jge      .gt_loop2_end
 
-       movsx	ebx,.x2 		   ; eax - cur x1
-       shl	ebx,ROUND		   ; ebx - cur x2
-       movsx	edx,word[.z2]
-       shl	edx,CATMULL_SHIFT
-       mov	.zz2,edx
-       movzx	edx,word[.col2r]
-       shl	edx,ROUND
-       mov	.c2r,edx
-       movzx	edx,word[.col2g]
-       shl	edx,ROUND
-       mov	.c2g,edx
-       movzx	edx,word[.col2b]
-       shl	edx,ROUND
-       mov	.c2b,edx
+       movsx    ebx,.x2                    ; eax - cur x1
+       shl      ebx,ROUND                  ; ebx - cur x2
+       movsx    edx,word[.z2]
+       shl      edx,CATMULL_SHIFT
+       mov      .zz2,edx
+       movzx    edx,word[.col2r]
+       shl      edx,ROUND
+       mov      .c2r,edx
+       movzx    edx,word[.col2g]
+       shl      edx,ROUND
+       mov      .c2g,edx
+       movzx    edx,word[.col2b]
+       shl      edx,ROUND
+       mov      .c2b,edx
 
     .gt_loop2:
        pushad
     ; macro .debug
 
-       mov	edx,.c2r	      ; c2r,c2g,c2b,c1r,c1g,c1b - current colors
-       sar	edx,ROUND
-       push	dx
-       mov	edx,.c2g
-       sar	edx,ROUND
-       push	dx
-       mov	edx,.c2b
-       sar	edx,ROUND
-       push	dx
-       sar	ebx,ROUND    ; x2
-       push	bx
-       mov	edx,.c1r
-       sar	edx,ROUND
-       push	dx
-       mov	edx,.c1g
-       sar	edx,ROUND
-       push	dx
-       mov	edx,.c1b
-       sar	edx,ROUND
-       push	dx
-       sar	eax,ROUND
-       push	ax	      ; x1
-       push	cx	      ; y
-       push	.zz2
-       push	.zz1
-       call	gouraud_line_z
+       mov      edx,.c2r              ; c2r,c2g,c2b,c1r,c1g,c1b - current colors
+       sar      edx,ROUND
+       push     dx
+       mov      edx,.c2g
+       sar      edx,ROUND
+       push     dx
+       mov      edx,.c2b
+       sar      edx,ROUND
+       push     dx
+       sar      ebx,ROUND    ; x2
+       push     bx
+       mov      edx,.c1r
+       sar      edx,ROUND
+       push     dx
+       mov      edx,.c1g
+       sar      edx,ROUND
+       push     dx
+       mov      edx,.c1b
+       sar      edx,ROUND
+       push     dx
+       sar      eax,ROUND
+       push     ax            ; x1
+       push     cx            ; y
+       push     .zz2
+       push     .zz1
+       call     gouraud_line_z
 
        popad
 
 if Ext >= MMX
-       movq	mm0,.c1bM
-       paddd	mm0,qword .dc13bM
-       movq	.c1bM,mm0
-       movq	mm1,.c2bM
-       paddd	mm1,qword .dc23bM
-       movq	.c2bM,mm1
+       movq     mm0,.c1bM
+       paddd    mm0,qword .dc13bM
+       movq     .c1bM,mm0
+       movq     mm1,.c2bM
+       paddd    mm1,qword .dc23bM
+       movq     .c2bM,mm1
 
-       movq	mm0,.c1rM
-       paddd	mm0,qword .dc13rM
-       movq	.c1rM,mm0
-       movq	mm1,.c2rM
-       paddd	mm1,qword .dc23rM
-       movq	.c2rM,mm1
+       movq     mm0,.c1rM
+       paddd    mm0,qword .dc13rM
+       movq     .c1rM,mm0
+       movq     mm1,.c2rM
+       paddd    mm1,qword .dc23rM
+       movq     .c2rM,mm1
 else
-       mov	edx,.dc13r
-       add	.c1r,edx
-       mov	edx,.dc13g
-       add	.c1g,edx
-       mov	edx,.dc13b
-       add	.c1b,edx
-       mov	edx,.dc23r
-       add	.c2r,edx
-       mov	edx,.dc23g
-       add	.c2g,edx
-       mov	edx,.dc23b
-       add	.c2b,edx
-       mov	edx,.dz13
-       add	.zz1,edx
-       mov	edx,.dz23
-       add	.zz2,edx
+       mov      edx,.dc13r
+       add      .c1r,edx
+       mov      edx,.dc13g
+       add      .c1g,edx
+       mov      edx,.dc13b
+       add      .c1b,edx
+       mov      edx,.dc23r
+       add      .c2r,edx
+       mov      edx,.dc23g
+       add      .c2g,edx
+       mov      edx,.dc23b
+       add      .c2b,edx
+       mov      edx,.dz13
+       add      .zz1,edx
+       mov      edx,.dz23
+       add      .zz2,edx
 end if
-       add	eax,.dx13
-       add	ebx,.dx23
-       inc	cx
-       cmp	cx,.y3
-       jl	.gt_loop2
+       add      eax,.dx13
+       add      ebx,.dx23
+       inc      cx
+       cmp      cx,.y3
+       jl       .gt_loop2
    .gt_loop2_end:
 
-       mov	esp,ebp
+       mov      esp,ebp
 ret 24
 gouraud_line_z:
 ;----------------- procedure drawing gouraud line
@@ -479,10 +581,11 @@ gouraud_line_z:
 ;----------------- esi - pointer to Z_buffer
 ;----------------- edi - pointer to screen buffer
 ;----------------- stack:
-.z1  equ dword[ebp+4]	; z coordiunate shifted left CATMULL_SHIFT
+.z1  equ dword[ebp+4]   ; z coordinate shifted left CATMULL_SHIFT
 .z2  equ dword[ebp+8]
 .y   equ word[ebp+12]
 .x1  equ ebp+14
+
 .c1b equ ebp+16
 .c1g equ ebp+18
 .c1r equ ebp+20
@@ -509,138 +612,191 @@ gouraud_line_z:
 .dc_rM equ ebp-16
 .dc_gM equ ebp-12
 .dc_bM equ ebp-8
-	mov	  ebp,esp
+        mov       ebp,esp
 
-	mov	ax,.y
-	or	ax,ax
-	jl	.gl_quit
-	mov	bx,[size_y_var]
-	dec	bx
-	cmp	ax,bx ;SIZE_Y
-	jge	.gl_quit
+        mov     ax,.y
+        or      ax,ax
+        jl      .gl_quit
+        mov     bx,[size_y_var]
+        dec     bx
+        cmp     ax,bx ;SIZE_Y
+        jge     .gl_quit
 
-	mov	eax,dword[.x1]
-	cmp	ax,word[.x2]
-	je	.gl_quit
-	jl	@f
+        mov     eax,dword[.x1]
+        cmp     ax,word[.x2]
+        je      .gl_quit
+        jl      @f
 
-	xchg	eax,dword[.x2]
-	mov	dword[.x1],eax
-	mov	eax,dword[.c1g]
-	xchg	eax,dword[.c2g]
-	mov	dword[.c1g],eax
-	mov	eax,.z1
-	xchg	eax,.z2
-	mov	.z1,eax
+        xchg    eax,dword[.x2]
+        mov     dword[.x1],eax
+        mov     eax,dword[.c1g]
+        xchg    eax,dword[.c2g]
+        mov     dword[.c1g],eax
+        mov     eax,.z1
+        xchg    eax,.z2
+        mov     .z1,eax
    @@:
-	mov	bx,[size_x_var]
-	dec	bx
-	cmp	word[.x1],bx  ;SIZE_X
-	jge	.gl_quit
-	cmp	word[.x2],0
-	jle	.gl_quit
+        mov     bx,[size_x_var]
+        dec     bx
+        cmp     word[.x1],bx  ;SIZE_X
+        jge     .gl_quit
+        cmp     word[.x2],0
+        jle     .gl_quit
 
-	mov	eax,.z2
-	sub	eax,.z1
-	cdq
-	mov	bx,word[.x2]	  ; dz = z2-z1/x2-x1
-	sub	bx,word[.x1]
-	movsx	ebx,bx
-	idiv	ebx
-	push	eax
+if 0
+       mov     bx,word[.x2]      ; dz = z2-z1/x2-x1
+       sub     bx,word[.x1]
+       movsx   ebx,bx
 
-	mov	ax,word[.c2b]
-	sub	ax,word[.c1b]
-	cwde
-	shl	eax,ROUND
-	cdq
-	idiv	ebx
-	push	eax
 
-	mov	ax,word[.c2g]
-	sub	ax,word[.c1g]
-	cwde
-	shl	eax,ROUND
-	cdq
-	idiv	ebx
-	push	eax
+       mov      eax,1 shl 15
+       cdq
+       idiv     ebx
+       mov      ebx,eax
 
-	mov	ax,word[.c2r]
-	sub	ax,word[.c1r]
-	cwde
-	shl	eax,ROUND	  ; dc_r = c2r-c1r/x2-x1
-	cdq
-	idiv	ebx
-	push	eax
 
-	cmp	word[.x1],0	; clipping on function
-	jg	@f
-	mov	eax,.dz
-	movsx	ebx,word[.x1]
-	neg	ebx
-	imul	ebx
-	add	.z1,eax
-	mov	word[.x1],0
+       mov      eax,.x3
+       sub      eax,.x1
+       cwde
+       imul     ebx
+       sar      eax,15 - ROUND
+       push     eax
 
-	mov	eax,.dc_r
-	imul	ebx
-	sar	eax,ROUND
-	add	word[.c1r],ax
+       sub       esp,4*4
+       movd      xmm0,ebx
+       pshuflw   xmm0,xmm0,0
+       movq      xmm1,[.col1r]
+       movq      xmm2,[.col3r]
+       psubw     xmm2,xmm1
+       movdqa    xmm3,xmm2
+       pmullw    xmm2,xmm0
+       pmulhw    xmm3,xmm0
+       punpcklwd xmm2,xmm3
+       psrad     xmm2,15 - ROUND
+       pshufd    xmm2,xmm2,11000110b
+       movdqu    .dc13b,xmm2
 
-	mov	eax,.dc_g
-	imul	ebx
-	sar	eax,ROUND
-	add	word[.c1g],ax
 
-	mov	eax,.dc_b
-	imul	ebx
-	sar	eax,ROUND
-	add	word[.c1b],ax
+end if
+
+
+        mov     eax,.z2           ; --- per-pixel deltas along the scanline ---
+        sub     eax,.z1           ; eax = z2 - z1
+        cdq                       ; sign-extend into edx:eax for idiv
+        mov     bx,word[.x2]      ; dz = (z2-z1)/(x2-x1)
+        sub     bx,word[.x1]
+        movsx   ebx,bx            ; ebx = x2 - x1 (non-zero: x1 = x2 quits earlier)
+        idiv    ebx
+        push    eax               ; first push after 'mov ebp,esp' -> dz lands at [ebp-4]
+
+
+        mov      eax,1 shl 15     ; ebx = (1 shl 15)/(x2-x1): fixed-point reciprocal,
+        cdq                       ; computed once so the three colour deltas below
+        idiv     ebx              ; need a multiply each instead of a division each
+        mov      ebx,eax
+
+
+        mov     ax,word[.c2b]     ; blue: eax = c2b - c1b, sign-extended to 32 bits
+        sub     ax,word[.c1b]
+        cwde
+        imul     ebx              ; edx:eax = delta * reciprocal (only eax is used)
+        sar      eax,15 - ROUND   ; ~ (delta shl ROUND)/(x2-x1)
+        push     eax              ; second push -> [ebp-8]  (.dc_bM)
+
+
+        mov     ax,word[.c2g]     ; green: same scheme as blue
+        sub     ax,word[.c1g]
+        cwde
+        imul     ebx
+        sar      eax,15 - ROUND
+        push     eax              ; third push -> [ebp-12] (.dc_gM)
+
+
+
+        mov     ax,word[.c2r]     ; red: same scheme as blue
+        sub     ax,word[.c1r]
+        cwde
+        imul     ebx
+        sar      eax,15 - ROUND
+        push     eax              ; fourth push -> [ebp-16] (.dc_rM)
+
+        cmp     word[.x1],0     ; clipping on function
+        jg      @f
+        mov     eax,.dz
+        movsx   ebx,word[.x1]
+        neg     ebx
+        imul    ebx
+        add     .z1,eax
+        mov     word[.x1],0
+
+        mov     eax,.dc_r
+        imul    ebx
+        sar     eax,ROUND
+        add     word[.c1r],ax
+
+        mov     eax,.dc_g
+        imul    ebx
+        sar     eax,ROUND
+        add     word[.c1g],ax
+
+        mov     eax,.dc_b
+        imul    ebx
+        sar     eax,ROUND
+        add     word[.c1b],ax
 
       @@:
-	mov	bx,[size_x_var]
-	dec	bx
-	cmp	word[.x2],bx  ;SIZE_X
-	jl	@f
-	mov	word[.x2],bx  ;SIZE_X
+        mov     bx,[size_x_var]
+        dec     bx
+        cmp     word[.x2],bx  ;SIZE_X
+        jl      @f
+        mov     word[.x2],bx  ;SIZE_X
      @@:
-	sub	esp,16	    ; calculate memory begin
-	movzx	edx,word[size_x_var]  ;SIZE_X       ; in buffers
-	movzx	eax,.y
-	mul	edx
-	movzx	edx,word[.x1]
-	add	eax,edx
-	push	eax
-	lea	eax,[eax*3]
-	add	edi,eax
-	pop	eax
-	shl	eax,2
-	add	esi,eax
+        sub     esp,16      ; calculate memory begin
+        movzx   edx,word[size_x_var]  ;SIZE_X       ; in buffers
+        movzx   eax,.y
+        mul     edx
+        movzx   edx,word[.x1]
+        add     eax,edx
+        push    eax
+        lea     eax,[eax*3]
+        add     edi,eax
+        pop     eax
+        shl     eax,2
+        add     esi,eax
 
-	mov	cx,word[.x2]
-	sub	cx,word[.x1]
-	movzx	ecx,cx
-	mov	ebx,.z1 	 ; ebx - currrent z shl CATMULL_SIFT
+        mov     cx,word[.x2]
+        sub     cx,word[.x1]
+        movzx   ecx,cx
+        mov     ebx,.z1          ; ebx - currrent z shl CATMULL_SIFT
 ;if Ext >= SSE
 ;        mov     .cz,edx
 ;end if
-	mov	edx,.dz 	 ; edx - delta z
-	movzx	eax,word[.c1r]
-	shl	eax,ROUND
-	mov	.cr,eax
-	movzx	eax,word[.c1g]
-	shl	eax,ROUND
-	mov	.cg,eax
-	movzx	eax,word[.c1b]
-	shl	eax,ROUND
-	mov	.cb,eax
-if Ext = MMX
+        mov     edx,.dz          ; edx - delta z
+if Ext >= SSE2                    ; SSE2 path: keep all colour accumulators in xmm7
+        movq      xmm7,[.c1b]     ; load 4 words from ebp+16: c1b, c1g, c1r and the word at ebp+22
+        pshuflw   xmm7,xmm7,11000110b ; reorder low words to c1r, c1g, c1b (word 3 unchanged)
+        punpcklwd xmm7,[the_zero] ; interleave with the_zero (presumably all zeros -> words widened to dwords); NOTE(review): 128-bit memory operand must be 16-byte aligned - confirm the_zero is
+        pslld     xmm7,ROUND      ; scale to fixed point with ROUND fractional bits
+        movdqu    xmm1,[.dc_rM]   ; 4 dwords from ebp-16: dc_r, dc_g, dc_b and the dword at ebp-4
+end if
+if Ext = NON
+        movzx   eax,word[.c1r]
+        shl     eax,ROUND
+        mov     .cr,eax
+        movzx   eax,word[.c1g]
+        shl     eax,ROUND
+        mov     .cg,eax
+        movzx   eax,word[.c1b]
+        shl     eax,ROUND
+        mov     .cb,eax
+end if
+if (Ext = MMX) | (Ext=SSE)
 ;        mov     .c_z,edx
-	movd	mm2,[.dc_bM]	     ; delta color blue MMX
-	movd	mm3,[.cbM]	     ; current blue MMX
-	movq	mm5,[.dc_rM]
-	movq	mm4,[.crM]
-	pxor	mm6,mm6
+        movd    mm2,[.dc_bM]         ; delta color blue MMX
+        movd    mm3,[.cbM]           ; current blue MMX
+        movq    mm5,[.dc_rM]
+        movq    mm4,[.crM]
+        pxor    mm6,mm6
 end if
 
 
@@ -650,55 +806,70 @@ end if
 ;        psrsq   mm0,32
 ;        movd    ebx,mm0
 ;end if
-	cmp	ebx,dword[esi]	 ; esi - z_buffer
-	jge	@f		 ; edi - Screen buffer
-if Ext = MMX
-	movq	mm0,mm3 	 ; mm0, mm1 - temp registers
-	psrld	mm0,ROUND
-	movq	mm1,mm4
-	psrld	mm1,ROUND
-	packssdw  mm1,mm0
-	packuswb  mm1,mm6
-;        movd     [edi],mm1
-	movd	  eax,mm1
-	stosw
-	shr	  eax,16
-	stosb
-else
-	mov	eax,.cr
-	sar	eax,ROUND
-	stosb
-	mov	eax,.cg
-	sar	eax,ROUND
-	stosb
-	mov	eax,.cb
-	sar	eax,ROUND
-	stosb
+        cmp     ebx,dword[esi]   ; esi - z_buffer
+        jge     @f               ; edi - Screen buffer
+if Ext >= SSE2                    ; SSE2 path: write one pixel from the xmm7 accumulators
+        movdqa   xmm0,xmm7        ; work on a copy so xmm7 keeps its fractional bits
+        psrld    xmm0,ROUND       ; drop the ROUND fractional bits of every dword
+        packssdw xmm0,xmm0        ; dwords -> words (signed saturation)
+        packuswb xmm0,xmm0        ; words  -> bytes (unsigned saturation)
+        movd     eax,xmm0         ; eax = low four packed bytes
+        stosw                     ; store the two low bytes at [edi]
+        shr      eax,16           ; bring the third byte down into al
+        stosb                     ; store the third byte at [edi]
 end if
-	mov	dword[esi],ebx
+if (Ext=MMX) | (Ext=SSE)
+        movq    mm0,mm3          ; mm0, mm1 - temp registers
+        psrld   mm0,ROUND
+        movq    mm1,mm4
+        psrld   mm1,ROUND
+        packssdw  mm1,mm0
+        packuswb  mm1,mm6
+;        movd     [edi],mm1
+        movd      eax,mm1
+        stosw
+        shr       eax,16
+        stosb
+end if
+if Ext=NON
+        mov     eax,.cr
+        sar     eax,ROUND
+        stosb
+        mov     eax,.cg
+        sar     eax,ROUND
+        stosb
+        mov     eax,.cb
+        sar     eax,ROUND
+        stosb
+end if
+        mov     dword[esi],ebx
 ;if Ext = NON
-	jmp	.no_skip
+        jmp     .no_skip
 ;end if
       @@:
-	add	edi,3
+        add     edi,3
       .no_skip:
-	add	esi,4
+        add     esi,4
 ;if Ext=NON
-	add	ebx,edx
+        add     ebx,edx
 ;end if
-if Ext=MMX
-	paddd	mm3,mm2
-	paddd	mm4,mm5
-else
-	mov	eax,.dc_g
-	add	.cg,eax
-	mov	eax,.dc_b
-	add	.cb,eax
-	mov	eax,.dc_r
-	add	.cr,eax
+if Ext >=SSE2
+        paddd   xmm7,xmm1
 end if
-	loop	.ddraw
+if (Ext=MMX) | (Ext=SSE)
+        paddd   mm3,mm2
+        paddd   mm4,mm5
+end if
+if Ext = NON
+        mov     eax,.dc_g
+        add     .cg,eax
+        mov     eax,.dc_b
+        add     .cb,eax
+        mov     eax,.dc_r
+        add     .cr,eax
+end if
+        loop    .ddraw
 
    .gl_quit:
-	mov	  esp,ebp
+        mov       esp,ebp
 ret 26
diff --git a/programs/demos/view3ds/history.txt b/programs/demos/view3ds/history.txt
index a6c8faef2b..47f94b4b82 100644
--- a/programs/demos/view3ds/history.txt
+++ b/programs/demos/view3ds/history.txt
@@ -1,3 +1,25 @@
+View3ds 0.076 - XII 2021
+1. Procedure for detecting manifold chunks, based on a kind of sorted pivot
+   table. Chunks are counted and their number is displayed.
+2. New procedure for calculating normal vectors that uses some data produced
+   by the new chunks routine. Loading big objects is now fast. I load an object that
+   contains ~500000 vertices,  ~700000 faces and  ~2000 0000 unique edges
+   in a few seconds on an i5 2nd gen. Earlier, calculating such objects
+   took rather longer than acceptable time limits.
+3. On http://board.flatassembler.net there are occasionally some discussions
+   about optimizing. Some clever people, whose skills and competence I trust,
+   claim that for CPUs manufactured in the last ~15 years the size of code is crucial
+   for speed (better utilization of the CPU cache).
+   So I wrote some 'movsd' mnemonics instead of 'mov [edi],sth'; 'loop' instead of
+   'dec ecx,jnz sth'. Moreover I came back to initializing some local variables
+   by 'push' (flat_cat.inc). I took the effort to change divisions to
+   multiplications in two_tex.inc (works ok in the fpu-only Ext = NON mode and
+   of course in Ext = SSE3 mode), grd_tex.inc (single line, not parallel
+   muls; whole drawing routine uses 4 divs instead of 27 divisions),
+   bump_tex.inc - 3 divs in SSE2 mode. See sources for details.
+4. The editor button now also allows editing by vertex for objects with more than 65535 vertices.
+----------------------------------------------------------------------------------
+
 View3ds 0.075 - XII 2021
 1. Cusom rotate using keys and mouse scroll support by Leency.
 ----------------------------------------------------------------------------------
@@ -8,8 +30,7 @@ View3ds 0.074 - IX 2021
 3. New rendering model - ray casted shadows and appropiate button to
    set 'on' this option. Note that is non real time model, especially when
    complex object is computed. I took effort to introduce accelerating
-   structure - AABB (Axis Aligned Bounding Boxes).. but it is disabled 
-  
+   structure - AABB (Axis Aligned Bounding Boxes).. but it is disabled 
    for now - seems to work incorrect(slow).
 ----------------------------------------------------------------------------------
 
diff --git a/programs/demos/view3ds/readme.txt b/programs/demos/view3ds/readme.txt
index e680574552..646fb3469e 100644
--- a/programs/demos/view3ds/readme.txt
+++ b/programs/demos/view3ds/readme.txt
@@ -1,33 +1,16 @@
-View3ds 0.076 - tiny viewer to .3ds and .asc files with several graphics
+View3ds 0.077 - tiny viewer to .3ds and .asc files with several graphics
                 effects implementation.
 
-What's new?
-1. Detecting manifold chunks procedure based on kind of sorted pivot 
-    table. Chunks are counted and this number displayed.
-2. New calculating normal vectors proc that use some data produced
-    by new chunks routine. Now big object loading is fast. I load object that 
-    contains ~500000 vertices,  ~700000 faces and  ~2000 0000 unique edges
-    in few seconds on i5 2cond gen. Earlier such objects calculating was
-    rather above time limits.
-3. On http://board.flatassembler.net occasionaly there are some disccusions
-    about optimizing. Some clever people, wich skills and competence I trust,
-    claims - for CPU's manufactured last  ~15 years size of code is crucial 
-    for speed. (Better utilize CPU cache).
-    So I wrote some 'movsd' mnemonics instead  'mov [edi],sth'; 'loop' instead
-    'dec ecx,jnz sth'. Moreover I come back to init some local varibles 
-    by 'push' (flat_cat.inc). I took effort to change divisions to 
-    multiplications  two_tex.inc  (works ok in fpu only Ext = NON mode and
-    of course in Ext = SSE3 mode),  grd_tex.inc (single line not parallel 
-    muls, whole drawing routine  4 divs instead 27 divisions), 
-    bump_tex.inc - 3 divs in SSE2 mode.s  See sources for details. 
-4. Editor button allows now editing by vertex all above 65535 vert objects.
-
-	
+What's new?
+1. More divs eliminated compared to ver 0.076 - grd_cat.inc file.
+2. Some 3ds objects I have were read with invalid normals - fixed.
+3. Invalid submit edition bug - fixed. Smaller size of adjacent proc.
+4. Edges detection fix.
 
 
 Buttons description:
-1.  rotary: choosing rotary axle: x, y, x+y, keys - for object translate
-    using keyboard.	 .
+1.  rotary: choose the rotation axis: x, y, x+y, keys - for custom object rotation
+    using the keyboard - keys <, >, PgUp, PgDown.
 2.  shd. model: choosing shading model: flat, grd (smooth), env (spherical
     environment mapping, bump (bump mapping), tex (texture mapping),
     pos (position shading depend), dots (app draws only points - nodes of object),
@@ -40,7 +23,7 @@ Buttons description:
 6.  ray shadow: calc ray casted shadows.
 7.  culling: backface culling on/ off.
 8.  rand. light: Randomize 3 unlinear lights( so called Phong's illumination).
-9.  Blur: blur N times; N=0,1,2,3,4,5
+9.  blur: blur N times; N=0,1,2,3,4,5
 10.11,12,13. loseless operations (rotary 90, 180 degrees).
 12. emboss: Do emboss effect( flat bumps ), use 'bumps deep' button to do edges
      more deep.
@@ -56,9 +39,9 @@ Buttons description:
 20. bright - -> decrease picture brightness.
 21. wav effect -> do effect based sine function.
 22. editor -> setting editing option. If is "on" then red bars are draw according to each
-   vertex, Pressing  and moving left mouse button (cursor must be on handler)- change
-   vertex position. If left mouse button is released apply current position. You may also
-   decrease whole handlers count by enable culling (using appropriate button) - some
-   back handlers become hidden.
+    vertex. Pressing and moving the left mouse button (cursor must be on a handler) changes
+    the vertex position. Releasing the left mouse button applies the current position. You may also
+    decrease the whole handlers count by enabling culling (using the appropriate button) - some
+    back handlers become hidden.
 
-                         Maciej Guba             XII 2021
+                         Maciej Guba   March 2022
diff --git a/programs/demos/view3ds/view3ds.asm b/programs/demos/view3ds/view3ds.asm
index e0caf2ba4d..9fc7afd9f7 100644
--- a/programs/demos/view3ds/view3ds.asm
+++ b/programs/demos/view3ds/view3ds.asm
@@ -1,5 +1,5 @@
 
-; application : View3ds ver. 0.076 - tiny .3ds and .asc files viewer
+; application : View3ds ver. 0.077 - tiny .3ds and .asc files viewer
 ;               with a few graphics effects demonstration.
 ; compiler    : FASM
 ; system      : KolibriOS
@@ -64,53 +64,74 @@ START:    ; start of execution
         fstp   [rsscale]
         pop    ebx
 
-        call   alloc_buffer_mem
-        call   read_param
-        call   read_from_disk    ; read, if all is ok eax = 0
-        cmp    eax,0
-        jne    .gen
-        mov    esi,[fptr]
-        cmp    [esi],word 4D4Dh
-        jne    .asc
-        call   read_tp_variables ; init points and triangles count variables
-        cmp    eax,0
-
+        call    alloc_buffer_mem
+        call    read_param
+        call    read_from_disk    ; read, if all is ok eax = 0
+        btr     eax,31            ; mark 1
+        cmp     eax,0
+        jne     .gen
+        bts     eax,31            ; mark 2
+        mov     esi,[fptr]
+        cmp     [esi],word 4D4Dh
+        jne     .asc_gen
+        call    read_tp_variables ; init points and triangles count variables
+        cmp     eax,0
         jne    .malloc
+        xor    eax,eax            ; if failed read -> generate
     .gen:
-     ; if no house.3ds on board - generate
-        xor      bl,bl ; reallocate memory
+    .asc_gen:   ; read asc file or generate
+        push    eax
+     ; if no house.3ds on rd - generate
+        xor      bl,bl ; allocate memory
         mov      [triangles_count_var],20000
         mov      [points_count_var],20000
         call     alloc_mem_for_tp
+        pop      eax
+        bt       eax,31
+        jc       .asc
+        mov      bl,[generator_flag]
+        call     generate_object
+        mov      ax,1  ;mark
 
-        mov    bl,[generator_flag]
-        call   generate_object
-        jmp    .opt
+        jmp      .opt
     .asc:
-        mov    [triangles_count_var],10000  ; to do: read asc header
-        mov    [points_count_var],10000
-        call   alloc_mem_for_tp
+   ;     xor    bl,bl
+   ;     mov    [triangles_count_var],20000  ; to do: read asc header
+   ;     mov    [points_count_var],20000
+   ;     call   alloc_mem_for_tp
         call   read_asc
+        xor    ax,ax
         jmp    .opt
     .malloc:
         call   alloc_mem_for_tp
         call   read_from_file
     .opt:
+      if     Ext >= SSE2
+        push   ax
+      end if
         call   optimize_object1     ;  proc in file b_procs.asm
                                     ;  set point(0,0,0) in center and  calc all coords
                                     ;  to be in <-1.0,1.0>
         call   normalize_all_light_vectors
         call   copy_lights ; to aligned float
-        call   init_triangles_normals2
+ ;       call   init_triangles_normals2
+
 
      if Ext >= SSE2
+              ; if first byte of ax set -> old style normal vectors finding
         call   detect_chunks
         mov    [chunks_number],ecx
         mov    [chunks_ptr],ebx
+        push   esi
+        push   edi
+        call   init_triangles_normals2
+     ;  esi -   tri_ch
+     ;  edi -   t_ptr - every vertice index  - pointer to to all triangles
+     ;          that have this index
+        pop    edi
+        pop    esi
+        pop    ax
 
-     ;   esi -   tri_ch
-     ;   edi -   t_ptr - every vertice index  - pointer to to all triangles
-     ;           that have this index
      end if
 
         call   init_point_normals
@@ -122,7 +143,6 @@ START:    ; start of execution
         call   do_color_buffer   ; intit color_map
      if Ext >= SSE3
         call   init_point_lights
-        mov    [fire_flag],0     ; proteza
      end if
         mov    edi,bumpmap
         call   calc_bumpmap
@@ -206,10 +226,22 @@ START:    ; start of execution
         jmp     noclose
 
     red:   ; redraw
+     ;   xor     edx,edx
+     ; @@:
+     ;   push    edx
         mov     eax,9  ; get process info
         mov     ebx,procinfo
-        mov     ecx,-1
+        or      ecx,-1
         int     0x40
+     ;   pop     edx
+     ;   inc     edx
+     ;   cmp     dword[procinfo+26],50000000  ; ~ 10 Mbytes
+     ;   jb      @f
+     ;   cmp     edx,1
+     ;   je      @b
+
+
+    ; @@:
         mov     eax,[procinfo+42]    ; read params of window
         sub     eax,225
         mov     [size_x_var],ax
@@ -297,14 +329,14 @@ START:    ; start of execution
         call    update_flags          ; update flags and write labels of flags
 
                                       ; do other operations according to flag
-        cmp     ah,3                  ; ah = 3 -> shading model
-        jne     .next_m6
-        cmp     [dr_flag],2
-        jne     @f
+;        cmp     ah,3                 ; ah = 3 -> shading model
+;        jne     .next_m6
+;        cmp     [dr_flag],2
+;        jne     @f
    ;     call    init_envmap2    ;   <----! this don't works in env mode
                                  ;          and more than ~18 kb objects
  ;       call    init_envmap_cub2
-     @@:
+;     @@:
         cmp     [dr_flag],4
         jne     @f
         call    generate_texture2
@@ -402,7 +434,7 @@ START:    ; start of execution
         call   detect_chunks
         mov    [chunks_number],ecx
         mov    [chunks_ptr],ebx
-
+        mov    ax,1  ; - old style detecting normal vectors
      ;   esi -   tri_ch
      ;   edi -   t_ptr - every vertice index  - pointer to to all triangles
      ;           that have this index
@@ -412,6 +444,7 @@ START:    ; start of execution
         call    calc_bumpmap_coords   ; bump and texture mapping
         call    do_edges_list
         call    write_info
+
      .next_m2:
         cmp      ah,19
         je       @f
@@ -693,6 +726,7 @@ START:    ; start of execution
     lea     ecx,[eax*4]
 
 if (Ext = MMX)|(Ext = SSE)
+    emms
     mov      bh,bl
     push     bx
     shl      ebx,16
@@ -884,9 +918,10 @@ clear_vertices_index:
     movzx ecx,word[size_y_var]
     imul  ecx,eax
     xor   eax,eax
-    shr   ecx,1
+ ;   shr   ecx,1
     rep   stosd
 ret
+
 edit:     ; mmx required, edit mesh by vertex
         push   ebp
         mov    ebp,esp
@@ -895,9 +930,9 @@ edit:     ; mmx required, edit mesh by vertex
         .y_coord equ ebp-2
         .x_coord equ ebp-4
         .points_translated equ ebp-10
-        .points            equ ebp-22
-        .points_rotated    equ ebp-34
-        .mx                equ ebp-70
+        .points            equ ebp-26
+        .points_rotated    equ ebp-26-16
+        .mx                equ ebp-26-56
 
     macro check_bar
     {
@@ -906,17 +941,11 @@ edit:     ; mmx required, edit mesh by vertex
         movzx  edx,word[size_x_var]
         imul   edx,ecx
         add    ebx,edx
-        push   ebx
         mov    ecx,ebx
-        shl     ecx,2
-       ; lea    ecx,[ebx*2]
+        shl    ecx,2
         lea    ebx,[ebx*3]
-
-        cmp    [dr_flag],12
-        jl    @f
-        add    ebx,[esp]
-      @@:
-        add    esp,4
+        cmp    [dr_flag],10
+        cmovg  ebx,ecx
         add    ebx,[screen_ptr]
         mov    ebx,[ebx]
         and    ebx,0x00ffffff
@@ -935,10 +964,9 @@ edit:     ; mmx required, edit mesh by vertex
         pcmpgtw mm0,mm1
         pcmpgtw mm3,mm1
         pxor    mm3,mm0
-        movd    eax,mm3
-        mov     cx,ax
-        shr     eax,16
-        and     ax,cx
+        pmovmskb eax,mm3
+        and     eax,1111b
+
         or      ax,ax
         jz      .no_edit
 
@@ -949,15 +977,12 @@ edit:     ; mmx required, edit mesh by vertex
 
       ; store both x and y coordinates
         ror    eax,16
-       ; push   eax
-       ; sub    esp,256
         mov    [.x_coord],eax
         test   word[mouse_state],100000000b
         jz     .not_press  ; check if left mouse button press
 
         ;  left button  pressed
 
-
         check_bar
         jne    .no_edit
         add    ecx,[vertices_index_ptr]
@@ -992,29 +1017,17 @@ edit:     ; mmx required, edit mesh by vertex
         check_bar
         jne    .end
 
-        mov    esi,[vertex_edit_no]
-    ;    dec    esi
-        lea    esi,[esi*3]
-        add    esi,esi
-        add    esi,[points_translated_ptr]
-        emms
+        movd        xmm0,[edit_end_x]
+        punpcklwd   xmm0,[the_zero]
+        movd        xmm1,[vect_x]
+        punpcklwd   xmm1,[the_zero]
+   ;     movd        xmm2,[offset_y]
+   ;     punpcklwd   xmm2,[the_zero]
+        psubd       xmm0,xmm1
+   ;     psubd       xmm0,xmm2
+        cvtdq2ps    xmm0,xmm0
+        movups      [.points],xmm0
 
-        movd    mm1,dword[esi]
-        paddw   mm1,mm0
-        psubw   mm1,qword[vect_x]
-        movd    dword[esi],mm1
-
-        lea    edi,[.points]
-     ; detranslate
-        fninit
-        fild word[esi+4]
-        fstp dword[edi+8]
-        fild word[esi+2]
-        fisub word[offset_x]
-        fstp dword[edi+4]
-        fild word[esi]
-        fisub word[offset_y]   ; proteza
-        fstp dword[edi]
 
         mov     esi,matrix
         lea     edi,[.mx]
@@ -1028,7 +1041,7 @@ edit:     ; mmx required, edit mesh by vertex
 
    ;    inject into vertex list
         mov     edi,[vertex_edit_no]
-    ;    dec     edi
+      ;  dec     edi
         lea     edi,[edi*3]
         shl     edi,2
         add     edi,[points_ptr]
@@ -1037,11 +1050,8 @@ edit:     ; mmx required, edit mesh by vertex
         movsd
         movsd
         movsd
-     ;   mov     ecx,3
-     ;   cld
-     ;   rep     movsd
-
 
+        mov    dword[edit_start_x],0
         mov    dword[edit_end_x],0
         mov    [vertex_edit_no],-1
 
@@ -1096,7 +1106,7 @@ alloc_buffer_mem:
 
     mov      esp,ebp
     pop      ebp
-
+ret
 
 
 
@@ -1511,6 +1521,7 @@ init_point_normals:
 ;in:
 ;    esi - tri_ch
 ;    edi - t_ptr
+;    ax =  1 -> old style finding normals
 .z equ dword [ebp-8]
 .y equ dword [ebp-12]
 .x equ [ebp-16]
@@ -1519,6 +1530,7 @@ init_point_normals:
 .t_ptr        equ dword [ebp-36]
 .tri_ch       equ dword [ebp-40]
 .max_val      equ dword [ebp-44]
+.mark         equ word  [ebp-45]
 
         push      ebp
         mov       ebp,esp
@@ -1527,9 +1539,9 @@ init_point_normals:
         mov       .t_ptr,edi
         mov       .tri_ch,esi
 
-
-
-
+;        mov       .mark,ax
+        bt        ax,0
+        jc        .old1
 
 
         mov       ecx,[triangles_count_var]
@@ -1581,6 +1593,9 @@ init_point_normals:
 
         jmp      .end
 
+      .old1:
+
+        xor     edx,edx
 
       .old:
 
@@ -1644,6 +1659,9 @@ init_point_normals:
         mov       edx,.point_number
         cmp       edx,[points_count_var]
         jne       .ipn_loop
+ ;       cmp       .mark,1
+ ;       je        .end1
+ ;        always free if Ext>=SSE2
      .end:
 
         mov     eax,68
@@ -1656,7 +1674,7 @@ init_point_normals:
         mov     ecx,.tri_ch
         int     0x40
 
-
+  ;   .end1:
 
 
         add       esp,64
@@ -1817,38 +1835,37 @@ clrscr:
         movzx   ecx,word[size_x_var]
         movzx   eax,word[size_y_var]
         imul    ecx,eax
-
-
+        cld
         xor     eax,eax
-      if Ext=NON
+  ;    if Ext=NON
         rep     stosd
-      else if Ext = MMX
-        pxor    mm0,mm0
-      @@:
-        movq    [edi+00],mm0
-        movq    [edi+08],mm0
-        movq    [edi+16],mm0
-        movq    [edi+24],mm0
-        add     edi,32
-        sub     ecx,8
-        jnc     @b
-      else
-        push    ecx
-        mov     ecx,edi
-        and     ecx,0x0000000f
-        rep     stosb
-        pop     ecx
-        and     ecx,0xfffffff0
-        xorps   xmm0,xmm0
-      @@:
-        movaps  [edi],xmm0
-        movaps  [edi+16],xmm0
-        movaps  [edi+32],xmm0
-        movaps  [edi+48],xmm0
-        add     edi,64
-        sub     ecx,16
-        jnz     @b
-      end if
+;      else if Ext = MMX
+;        pxor    mm0,mm0
+ ;     @@:
+ ;       movq    [edi+00],mm0
+;        movq    [edi+08],mm0
+;        movq    [edi+16],mm0
+ ;       movq    [edi+24],mm0
+ ;       add     edi,32
+ ;       sub     ecx,8
+ ;       jnc     @b
+ ;     else
+ ;       push    ecx
+ ;       mov     ecx,edi
+ ;       and     ecx,0x0000000f
+ ;       rep     stosb
+ ;       pop     ecx
+ ;       and     ecx,0xfffffff0
+ ;       xorps   xmm0,xmm0
+ ;     @@:
+ ;       movaps  [edi],xmm0
+ ;       movaps  [edi+16],xmm0
+ ;       movaps  [edi+32],xmm0
+ ;       movaps  [edi+48],xmm0
+ ;       add     edi,64
+ ;       sub     ecx,16
+ ;       jnz     @b
+ ;     end if
 
 ret
 
@@ -1879,7 +1896,7 @@ draw_triangles:
 
         push    ebp
         mov     ebp,esp
-        sub     esp,60
+        sub     esp,64
 
  ;       movzx   ax,[dr_flag]
         mov     .dr_flag,ax
@@ -2777,6 +2794,7 @@ if Ext >= SSE3
    ;     je      @f
    ;     int3
    ;   @@:
+
         mov     eax, .index1x12
         mov     ebx, .index2x12
         mov     ecx, .index3x12
@@ -2945,7 +2963,7 @@ end if
 
 
    .eend:
-        add      esp,60
+        add      esp,64
         pop      ebp
 
 ret
@@ -2956,7 +2974,7 @@ draw_handlers:
        ;  in eax - render model
         push  ebp
         mov   ebp,esp
-;        emms
+        emms
        .fac         equ  dword[ebp-16]
        .xplus_scr    equ ebp-8
        .xplus_index  equ ebp-12
@@ -3320,12 +3338,12 @@ alloc_mem_for_tp:
         int     0x40                   ;  -> allocate memory to triangles
         mov     [triangles_ptr], eax   ;  -> eax = pointer to allocated mem
 
-        mov     eax, 68
-        mov     ecx,[triangles_count_var]
-        imul    ecx,[i36]
-        mov     edx,[edges_ptr]
-        int     0x40                   ;  -> allocate memory to triangles
-        mov     [edges_ptr], eax   ;  -> eax = pointer to allocated mem
+;        mov     eax, 68
+;        mov     ecx,[triangles_count_var]
+;        imul    ecx,[i36]
+;        mov     edx,[edges_ptr]
+;        int     0x40                   ;  -> allocate memory to triangles
+;        mov     [edges_ptr], eax   ;  -> eax = pointer to allocated mem
 
 
                                             ; ststic  memory
@@ -3411,6 +3429,7 @@ read_from_disk:
   ;  eax = 0   -> ok file loaded
 ret
 read_param:
+    cld
     mov        esi,I_Param
     cmp        dword[esi],0
     je         .end
@@ -3543,9 +3562,6 @@ ret
 ;   *******  WINDOW DEFINITIONS AND DRAW ********
 ;   *********************************************
     draw_window:
-        movzx   eax,[fire_flag]
-        push    eax
-    ;    int3
         mov     eax,12          ; function 12:tell os about windowdraw
         mov     ebx,1           ; 1, start of draw
         int     0x40
@@ -3732,8 +3748,8 @@ ret
         mov     eax,12          ; function 12:tell os about windowdraw
         mov     ebx,2           ; 2, end of draw
         int     0x40
-        pop     eax
-        mov     [fire_flag],al
+      ;  pop     eax
+      ;  mov     [fire_flag],al
         ret