From 68825a9e23acbd1ecf322a1ef35b8f99dda3862e Mon Sep 17 00:00:00 2001 From: IgorA Date: Mon, 8 Feb 2016 15:32:44 +0000 Subject: [PATCH] small speed optimize git-svn-id: svn://kolibrios.org@6172 a494cfbc-eb01-0410-851d-a64ba20cac60 --- .../libraries/TinyGL/asm_fork/clip.asm | 116 +++++++++--------- .../libraries/TinyGL/asm_fork/zbuffer.asm | 103 ++++++---------- 2 files changed, 94 insertions(+), 125 deletions(-) diff --git a/programs/develop/libraries/TinyGL/asm_fork/clip.asm b/programs/develop/libraries/TinyGL/asm_fork/clip.asm index 55971c1e48..0931023187 100644 --- a/programs/develop/libraries/TinyGL/asm_fork/clip.asm +++ b/programs/develop/libraries/TinyGL/asm_fork/clip.asm @@ -131,58 +131,59 @@ endp ; line -align 16 -proc interpolate uses eax ebx ecx, q:dword,p0:dword,p1:dword,t:dword - mov eax,[q] - mov ebx,[p0] - mov ecx,[p1] +;input: +;q - регистр с адресом вершины для интерполяции +;p0 - регистр с адресом 1-й вершины +;p1 - регистр с адресом 2-й вершины +;t - float +macro interpolate q, p0, p1, t +{ fld dword[t] ; интерполяция по координатам - fld dword[ecx+offs_vert_pc] - fsub dword[ebx+offs_vert_pc] + fld dword[p1+offs_vert_pc] + fsub dword[p0+offs_vert_pc] fmul st0,st1 - fadd dword[ebx+offs_vert_pc] - fstp dword[eax+offs_vert_pc] ;q.pc.X = p0.pc.X + (p1.pc.X - p0.pc.X) * t + fadd dword[p0+offs_vert_pc] + fstp dword[q+offs_vert_pc] ;q.pc.X = p0.pc.X + (p1.pc.X - p0.pc.X) * t - fld dword[ecx+offs_vert_pc+offs_Y] - fsub dword[ebx+offs_vert_pc+offs_Y] + fld dword[p1+offs_vert_pc+offs_Y] + fsub dword[p0+offs_vert_pc+offs_Y] fmul st0,st1 - fadd dword[ebx+offs_vert_pc+offs_Y] - fstp dword[eax+offs_vert_pc+offs_Y] + fadd dword[p0+offs_vert_pc+offs_Y] + fstp dword[q+offs_vert_pc+offs_Y] - fld dword[ecx+offs_vert_pc+offs_Z] - fsub dword[ebx+offs_vert_pc+offs_Z] + fld dword[p1+offs_vert_pc+offs_Z] + fsub dword[p0+offs_vert_pc+offs_Z] fmul st0,st1 - fadd dword[ebx+offs_vert_pc+offs_Z] - fstp dword[eax+offs_vert_pc+offs_Z] + fadd dword[p0+offs_vert_pc+offs_Z] + fstp dword[q+offs_vert_pc+offs_Z] - fld dword[ecx+offs_vert_pc+offs_W] - fsub dword[ebx+offs_vert_pc+offs_W] + fld dword[p1+offs_vert_pc+offs_W] + fsub dword[p0+offs_vert_pc+offs_W] fmul st0,st1 - fadd dword[ebx+offs_vert_pc+offs_W] - fstp dword[eax+offs_vert_pc+offs_W] + fadd dword[p0+offs_vert_pc+offs_W] + fstp dword[q+offs_vert_pc+offs_W] ; интерполяция по цвету - fld dword[ecx+offs_vert_color] - fsub dword[ebx+offs_vert_color] + fld dword[p1+offs_vert_color] + fsub dword[p0+offs_vert_color] fmul st0,st1 - fadd dword[ebx+offs_vert_color] - fstp dword[eax+offs_vert_color] + fadd dword[p0+offs_vert_color] + fstp dword[q+offs_vert_color] - fld dword[ecx+offs_vert_color+4] - fsub dword[ebx+offs_vert_color+4] + fld dword[p1+offs_vert_color+4] + fsub dword[p0+offs_vert_color+4] fmul st0,st1 - fadd dword[ebx+offs_vert_color+4] - fstp dword[eax+offs_vert_color+4] + fadd dword[p0+offs_vert_color+4] + fstp dword[q+offs_vert_color+4] - fld dword[ecx+offs_vert_color+8] - fsub dword[ebx+offs_vert_color+8] + fld dword[p1+offs_vert_color+8] + fsub dword[p0+offs_vert_color+8] fmulp - fadd dword[ebx+offs_vert_color+8] - fstp dword[eax+offs_vert_color+8] - ret -endp + fadd dword[p0+offs_vert_color+8] + fstp dword[q+offs_vert_color+8] +} ; ; Line Clipping @@ -421,10 +422,10 @@ align 4 mov eax,ebp sub eax,8+2*sizeof.GLVertex ;eax = &q1 - stdcall interpolate, eax,edi,esi,[tmin] + interpolate eax,edi,esi,tmin stdcall gl_transform_to_viewport, edx,eax add eax,sizeof.GLVertex ;eax = &q2 - stdcall interpolate, eax,edi,esi,[tmax] + interpolate eax,edi,esi,tmax stdcall gl_transform_to_viewport, edx,eax sub eax,sizeof.GLVertex ;eax = &q1 @@ -586,9 +587,10 @@ endp align 4 clip_proc dd clip_xmin,clip_xmax, clip_ymin,clip_ymax, clip_zmin,clip_zmax +;input: +;edi - q align 16 -proc updateTmp uses eax ebx ecx edx, context:dword, q:dword, p0:dword, p1:dword, t:dword - mov ebx,[q] +proc updateTmp uses eax ecx edx, context:dword, p0:dword, p1:dword, t:dword mov edx,[context] mov eax,[p0] cmp dword[edx+offs_cont_current_shade_model],GL_SMOOTH ;if (context.current_shade_model == GL_SMOOTH) @@ -598,26 +600,26 @@ proc updateTmp uses eax ebx ecx edx, context:dword, q:dword, p0:dword, p1:dword, fsub dword[eax+offs_vert_color] fmul dword[t] fadd dword[eax+offs_vert_color] - fstp dword[ebx+offs_vert_color] ;q.color.v[0]=p0.color.v[0] + (p1.color.v[0]-p0.color.v[0])*t + fstp dword[edi+offs_vert_color] ;q.color.v[0]=p0.color.v[0] + (p1.color.v[0]-p0.color.v[0])*t fld dword[ecx+offs_vert_color+4] fsub dword[eax+offs_vert_color+4] fmul dword[t] fadd dword[eax+offs_vert_color+4] - fstp dword[ebx+offs_vert_color+4] ;q.color.v[1]=p0.color.v[1] + (p1.color.v[1]-p0.color.v[1])*t + fstp dword[edi+offs_vert_color+4] ;q.color.v[1]=p0.color.v[1] + (p1.color.v[1]-p0.color.v[1])*t fld dword[ecx+offs_vert_color+8] fsub dword[eax+offs_vert_color+8] fmul dword[t] fadd dword[eax+offs_vert_color+8] - fstp dword[ebx+offs_vert_color+8] ;q.color.v[2]=p0.color.v[2] + (p1.color.v[2]-p0.color.v[2])*t + fstp dword[edi+offs_vert_color+8] ;q.color.v[2]=p0.color.v[2] + (p1.color.v[2]-p0.color.v[2])*t jmp @f align 4 .els_0: mov ecx,[eax+offs_vert_color] - mov [ebx+offs_vert_color],ecx ;q.color.v[0]=p0.color.v[0] + mov [edi+offs_vert_color],ecx ;q.color.v[0]=p0.color.v[0] mov ecx,[eax+offs_vert_color+4] - mov [ebx+offs_vert_color+4],ecx ;q.color.v[1]=p0.color.v[1] + mov [edi+offs_vert_color+4],ecx ;q.color.v[1]=p0.color.v[1] mov ecx,[eax+offs_vert_color+8] - mov [ebx+offs_vert_color+8],ecx ;q.color.v[2]=p0.color.v[2] + mov [edi+offs_vert_color+8],ecx ;q.color.v[2]=p0.color.v[2] @@: cmp dword[edx+offs_cont_texture_2d_enabled],0 ;if (context.texture_2d_enabled) @@ -627,28 +629,28 @@ align 4 fsub dword[eax+offs_vert_tex_coord+offs_X] fmul dword[t] fadd dword[eax+offs_vert_tex_coord+offs_X] - fstp dword[ebx+offs_vert_tex_coord+offs_X] ;q.tex_coord.X=p0.tex_coord.X + (p1.tex_coord.X-p0.tex_coord.X)*t + fstp dword[edi+offs_vert_tex_coord+offs_X] ;q.tex_coord.X=p0.tex_coord.X + (p1.tex_coord.X-p0.tex_coord.X)*t fld dword[ecx+offs_vert_tex_coord+offs_Y] fsub dword[eax+offs_vert_tex_coord+offs_Y] fmul dword[t] fadd dword[eax+offs_vert_tex_coord+offs_Y] - fstp dword[ebx+offs_vert_tex_coord+offs_Y] ;q.tex_coord.Y=p0.tex_coord.Y + (p1.tex_coord.Y-p0.tex_coord.Y)*t + fstp dword[edi+offs_vert_tex_coord+offs_Y] ;q.tex_coord.Y=p0.tex_coord.Y + (p1.tex_coord.Y-p0.tex_coord.Y)*t @@: - stdcall gl_clipcode, [ebx+offs_vert_pc+offs_X],[ebx+offs_vert_pc+offs_Y],\ - [ebx+offs_vert_pc+offs_Z],[ebx+offs_vert_pc+offs_W] - mov dword[ebx+offs_vert_clip_code],eax + stdcall gl_clipcode, [edi+offs_vert_pc+offs_X],[edi+offs_vert_pc+offs_Y],\ + [edi+offs_vert_pc+offs_Z],[edi+offs_vert_pc+offs_W] + mov dword[edi+offs_vert_clip_code],eax or eax,eax ;if (q.clip_code==0) jnz @f - stdcall gl_transform_to_viewport,[context],ebx - mov eax,ebx + stdcall gl_transform_to_viewport,[context],edi + mov eax,edi add eax,offs_vert_zp+offs_zbup_b push eax add eax,offs_zbup_g-offs_zbup_b push eax add eax,offs_zbup_r-offs_zbup_g push eax - stdcall RGBFtoRGBI, dword[ebx+offs_vert_color],dword[ebx+offs_vert_color+4],dword[ebx+offs_vert_color+8] + stdcall RGBFtoRGBI, dword[edi+offs_vert_color],dword[edi+offs_vert_color+4],dword[edi+offs_vert_color+8] @@: ret endp @@ -885,7 +887,7 @@ align 4 sub ebx,offs_vert_pc sub ecx,offs_vert_pc - stdcall updateTmp,[context],edi,ebx,ecx,eax ;updateTmp(c,&tmp1,q[0],q[1],tt) + stdcall updateTmp,[context],ebx,ecx,eax ;(c,&tmp1,q[0],q[1],tt) add ebx,offs_vert_pc lea eax,[clip_proc] @@ -898,7 +900,7 @@ align 4 sub edi,offs_vert_pc sub ebx,offs_vert_pc sub edx,offs_vert_pc - stdcall updateTmp,[context],edi,ebx,edx,eax ;updateTmp(c,&tmp2,q[0],q[2],tt) + stdcall updateTmp,[context],ebx,edx,eax ;(c,&tmp2,q[0],q[2],tt) mov eax,[ebx+offs_vert_edge_flag] mov [tmp1.edge_flag],eax ;q[0].edge_flag @@ -967,7 +969,7 @@ align 4 sub edi,(2*sizeof.GLVertex)-offs_vert_pc stdcall dword[eax],edi,ebx,ecx ;clip_proc[clip_bit](&tmp1.pc,&q[0].pc,&q[1].pc) sub edi,offs_vert_pc - stdcall updateTmp,[context],edi,[q],[q+4],eax + stdcall updateTmp,[context],[q],[q+4],eax lea eax,[clip_proc] mov edi,[clip_bit] @@ -977,7 +979,7 @@ align 4 sub edi,sizeof.GLVertex-offs_vert_pc stdcall dword[eax],edi,ebx,edx ;clip_proc[clip_bit](&tmp2.pc,&q[0].pc,&q[2].pc) sub edi,offs_vert_pc - stdcall updateTmp,[context],edi,[q],[q+8],eax + stdcall updateTmp,[context],[q],[q+8],eax mov dword[tmp1.edge_flag],1 mov eax,[edx+offs_vert_edge_flag-offs_vert_pc] diff --git a/programs/develop/libraries/TinyGL/asm_fork/zbuffer.asm b/programs/develop/libraries/TinyGL/asm_fork/zbuffer.asm index 8ef1774173..ea13e503e3 100644 --- a/programs/develop/libraries/TinyGL/asm_fork/zbuffer.asm +++ b/programs/develop/libraries/TinyGL/asm_fork/zbuffer.asm @@ -7,7 +7,7 @@ ;output: ; eax - указатель на ZBuffer (0 если не удача) -align 4 +align 16 proc ZB_open uses ecx edi, xsize:dword, ysize:dword, mode:dword,\ nb_colors:dword, color_indexes:dword, color_table:dword, frame_buffer:dword @@ -30,13 +30,6 @@ proc ZB_open uses ecx edi, xsize:dword, ysize:dword, mode:dword,\ mov eax,[mode] mov [edi+offs_zbuf_mode],eax -if TGL_FEATURE_8_BITS eq 1 - cmp eax,ZB_MODE_INDEX - jne @f -;ZB_initDither(edi, nb_colors, color_indexes, color_table); - jmp .end_s - @@: -end if if TGL_FEATURE_32_BITS eq 1 cmp eax,ZB_MODE_RGBA je .correct @@ -83,19 +76,14 @@ endp ;void ZB_close(ZBuffer * zb) ;{ -if TGL_FEATURE_8_BITS eq 1 -; if (zb->mode == ZB_MODE_INDEX) -; ZB_closeDither(zb); -end if -; ; if (zb->frame_buffer_allocated) ; gl_free(zb->pbuf); -; + ; gl_free(zb->zbuf); ; gl_free(zb); ;} -align 4 +align 16 proc ZB_resize uses eax ebx ecx edi esi, zb:dword, frame_buffer:dword, xsize:dword, ysize:dword mov ebx,[zb] @@ -149,7 +137,7 @@ endp ; unsigned char *p1; ; PIXEL *q; ; int y, n; -; + ; q = zb->pbuf; ; p1 = buf; ; n = zb->xsize * PSZB; @@ -159,7 +147,7 @@ endp ; q = (PIXEL *) ((char *) q + zb->linesize); ; } ;} -; + ;#if TGL_FEATURE_RENDER_BITS == 16 ;/* 32 bpp copy */ @@ -183,10 +171,10 @@ endp ; unsigned short *q; ; unsigned int *p, *p1, v, w0, w1; ; int y, n; -; + ; q = zb->pbuf; ; p1 = (unsigned int *) buf; -; + ; for (y = 0; y < zb->ysize; y++) { ; p = p1; ; n = zb->xsize >> 2; @@ -199,7 +187,7 @@ endp ;#endif ; p[0] = w0; ; p[1] = w1; -; + ; v = *(unsigned int *) (q + 2); ;#if BYTE_ORDER == BIG_ENDIAN ; RGB16_TO_RGB32(w1, w0, v); @@ -208,11 +196,11 @@ endp ;#endif ; p[2] = w0; ; p[3] = w1; -; + ; q += 4; ; p += 4; ; } while (--n > 0); -; + ; p1 += linesize; ; } ;} @@ -272,11 +260,11 @@ endp ; unsigned short *q; ; unsigned int *p, *p1, w0, w1, w2, v0, v1; ; int y, n; -; + ; q = zb->pbuf; ; p1 = (unsigned int *) buf; ; linesize = linesize * 3; -; + ; for (y = 0; y < zb->ysize; y++) { ; p = p1; ; n = zb->xsize >> 2; @@ -302,16 +290,6 @@ endp ; int linesize) ;{ ; switch (zb->mode) { -;#ifdef TGL_FEATURE_8_BITS -; case ZB_MODE_INDEX: -; ZB_ditherFrameBuffer(zb, buf, linesize >> 1); -; break; -;#endif -;#ifdef TGL_FEATURE_16_BITS -; case ZB_MODE_5R6G5B: -; ZB_copyBuffer(zb, buf, linesize); -; break; -;#endif ;#ifdef TGL_FEATURE_32_BITS ; case ZB_MODE_RGBA: ; ZB_copyFrameBufferRGB32(zb, buf, linesize >> 1); @@ -341,10 +319,10 @@ endp ; PIXEL *q; ; unsigned short *p, *p1; ; int y, n; -; + ; q = zb->pbuf; ; p1 = (unsigned short *) buf; -; + ; for (y = 0; y < zb->ysize; y++) { ; p = p1; ; n = zb->xsize >> 2; @@ -364,11 +342,6 @@ endp ; int linesize) ;{ ; switch (zb->mode) { -;#ifdef TGL_FEATURE_16_BITS -; case ZB_MODE_5R6G5B: -; ZB_copyFrameBuffer5R6G5B(zb, buf, linesize); -; break; -;#endif ;#ifdef TGL_FEATURE_24_BITS ; case ZB_MODE_RGB24: ; ZB_copyBuffer(zb, buf, linesize); @@ -393,10 +366,10 @@ endp ; PIXEL *q; ; unsigned short *p, *p1; ; int y, n; -; + ; q = zb->pbuf; ; p1 = (unsigned short *) buf; -; + ; for (y = 0; y < zb->ysize; y++) { ; p = p1; ; n = zb->xsize >> 2; @@ -411,16 +384,11 @@ endp ; p1 = (unsigned short *)((char *)p1 + linesize); ; } ;} -; + ;void ZB_copyFrameBuffer(ZBuffer * zb, void *buf, ; int linesize) ;{ ; switch (zb->mode) { -;#ifdef TGL_FEATURE_16_BITS -; case ZB_MODE_5R6G5B: -; ZB_copyFrameBuffer5R6G5B(zb, buf, linesize); -; break; -;#endif ;#ifdef TGL_FEATURE_32_BITS ; case ZB_MODE_RGBA: ; ZB_copyBuffer(zb, buf, linesize); @@ -430,15 +398,17 @@ endp ; assert(0); ; } ;} -; + ;#endif /* TGL_FEATURE_RENDER_BITS == 32 */ ; ; adr must be aligned on an 'int' ; -align 4 -proc memset_s uses eax ecx edi, adr:dword, val:dword, count:dword +;destroy: +; ecx, edi +align 16 +proc memset_s uses eax, adr:dword, val:dword, count:dword mov eax,[val] mov di,ax ror eax,16 @@ -455,7 +425,7 @@ proc memset_s uses eax ecx edi, adr:dword, val:dword, count:dword ret endp -align 4 +align 16 proc memset_l uses eax ecx edi, adr:dword, val:dword, count:dword mov eax,[val] mov ecx,[count] @@ -465,8 +435,10 @@ proc memset_l uses eax ecx edi, adr:dword, val:dword, count:dword endp ; count must be a multiple of 4 and >= 4 -align 4 -proc memset_RGB24 uses eax ecx edi esi, adr:dword, r:dword, g:dword, b:dword, count:dword +;destroy: +; edi, esi +align 16 +proc memset_RGB24 uses eax ecx, adr:dword, r:dword, g:dword, b:dword, count:dword mov esi,[adr] mov eax,[r] ;копируем в буфер первые 12 байт (минимальное число кратное 3 и 4) mov byte[esi],al @@ -501,10 +473,8 @@ proc memset_RGB24 uses eax ecx edi esi, adr:dword, r:dword, g:dword, b:dword, co sub ecx,esi ;ecx*=3 rep stosd jmp .end_f - @@: - - ;если r!=g или g!=b или b!=r - @@: +align 16 + @@: ;если r!=g или g!=b или b!=r movsd movsd movsd @@ -514,9 +484,9 @@ proc memset_RGB24 uses eax ecx edi esi, adr:dword, r:dword, g:dword, b:dword, co ret endp -align 4 -proc ZB_clear uses eax ebx ecx, zb:dword, clear_z:dword, z:dword, clear_color:dword,\ - r:dword, g:dword, b:dword +align 16 +proc ZB_clear uses eax ebx ecx edi esi, zb:dword, clear_z:dword, z:dword,\ + clear_color:dword, r:dword, g:dword, b:dword ;if TGL_FEATURE_RENDER_BITS != 24 ; color dd ? ;end if @@ -546,14 +516,11 @@ if TGL_FEATURE_RENDER_BITS eq 24 end if mov ebx,[eax+offs_zbuf_pbuf] mov ecx,[eax+offs_zbuf_ysize] +align 4 .cycle_0: -if (TGL_FEATURE_RENDER_BITS eq 15) ;or (TGL_FEATURE_RENDER_BITS eq 16) - ;color = RGB_TO_PIXEL(r, g, b); - ;memset_s(ebx, color, zb->xsize); -end if if TGL_FEATURE_RENDER_BITS eq 32 - ;color = RGB_TO_PIXEL(r, g, b); - ;memset_l(ebx, color, zb->xsize); + ;color = RGB_TO_PIXEL(r, g, b) + ;memset_l(ebx, color, zb->xsize) end if if TGL_FEATURE_RENDER_BITS eq 24 sub esp,16