;405 412 586
;
;
bufer_size=1024*16+2
fichero=4
mcu_ptr=8
color_ptr=12
estado=16
color_c=17
nbits=color_c
idct=20
tmp_bits=24
actable=28
matriz_limit=32
sourcebits=36
sourcebits_index=40
sourcebits_limit=44
qt_ptrs=48
ht_dc_ptrs=64
ht_ac_ptrs=80
matrices=96
tmp_bufer=100
x_org=104
y_org=108
x_mcu=112
y_mcu=116
x_size=120
y_size=124
x_org2=128
y_org2=132
x_mcu2=136
y_mcu2=140
x_size2=144
y_size2=148
q_ptr=152
dc=164
position=204
draw_ptr=208
struct_size=212

jpeg_info:  ;fichero en eax
            ;retorna ebp
        xor ebp,ebp
        pushad
        mov ebp,esp
        mov ecx,6
        sub esp,ecx
        mov edi,esp
        call read
        pop dx
        cmp dx,0d8ffh
        je .l1
        mov esp,ebp
        popad
        ret
   .l1: push eax
        mov ecx,struct_size
        call mallocz
        mov [edi],ebp
        mov ebp,edi
        pop dword [ebp+fichero]
        pop ax
        pop cx
        jmp .l3
   .l2: mov ebx,[ebp+tmp_bufer]
        add ebx,[ebx-4]
        mov cx,[ebx-2]
        mov ax,[ebx-4]
   .l3: push .l2
        xchg cl,ch
        add cx,2
        cmp ch,3
        jnc .l4
        cmp al,0ffh
        jne eoi
        cmp ah,0dbh
        je dqt
        cmp ah,0c4h
        je dht
        cmp ah,0c0h
        je sof0
        cmp ah,0dah
        je sos
        cmp ah,0c2h
        je eoi
        cmp ah,0c9h
        je eoi
        cmp ah,0d9h
        je eoi
   .l4: lea edx,[ecx-4]
        xor ecx,ecx
        mov eax,[ebp+fichero]
        call skip
        mov ecx,4
        call READ
        cmp ecx,[edi-4]
        jne eoi
        ret

eoi:
      mov esp,[ebp]
; do not close file - this will be done by caller
        and       dword [ebp+fichero], 0
      call jpeg_close
      popad
      xor ebp,ebp
      ret

jpeg_close:
      test ebp,ebp
      jz .l2
      pushad
      mov eax,[ebp+fichero]
      call close
      mov edi,[ebp+sourcebits]
      call free
      lea esi,[ebp+qt_ptrs]
      mov ecx,14
 .l1: mov edi,[esi]
      add esi,4
      call free
      loop .l1
      mov edi,ebp
      call free
      popad
 .l2: ret

dqt:  call READ
      mov esi,edi
      lea eax,[edi+ecx]
      push eax
 .l1: xor eax,eax
      lodsb
      cmp al,4
      jnc eoi
      lea ebx,[ebp+qt_ptrs+eax*4]
      test dword [ebx],-1
      jnz eoi
      mov ecx,64
      xor eax,eax
      sub esp,128
      mov edi,esp
 .l2: lodsb
      stosw
      loop .l2
      mov ecx,256
      call malloc
      mov [ebx],edi
      mov eax,esi
      mov esi,esp
      pushad
      mov ebp,zigzag
      fninit
      mov cl,64
      xor eax,eax
  .l3: fild word[esi]
      mov al,[ebp]
      and al,-2
      add ebp,2
      add esi,2
      mov ebx,eax
      and ebx,28
      fmul dword [ebx+k2]
      mov ebx,eax
      shr ebx,3
      and ebx,4+8+16
      fmul dword [ebx+k2]
      fstp dword [edi+eax]
      dec cl
      jnz .l3
      popad
      mov esi,eax
      add esp,128
      mov eax,[esp]
      sub eax,esi
      jc eoi
      cmp eax,4
      ja .l1
      jne eoi
      pop eax
      ret

sof0:  call READ
       cmp byte [edi],8
       jne eoi  ;precision
       mov ax,[edi+1]
       xchg al,ah
       mov [ebp+y_size],ax
       mov ax,[edi+3]
       xchg al,ah
       mov [ebp+x_size],ax
       mov al,[edi+5] ;ncomponentes
       mov cl,al
       mov [ebp+color_c],al
       mov edx,modes
       dec al
       jz .l1
       dec al
       dec al
       jnz eoi
       mov al,[edi+10]
       mov ah,[edi+13]
       cmp ax,1111h
       jne eoi
       mov al,[edi+7]
       add edx,16
       cmp al,11h
       je .l1
       add edx,16
       cmp al,21h
       je .l1
       add edx,16
       cmp al,22h
       jne eoi
  .l1: lea ebx,[ebp+q_ptr]
       lea esi,[ebp+qt_ptrs]
       mov [ebp+mcu_ptr],edx
  .l2: movzx eax,byte [edi+8]
       add edi,3
       cmp al,4
       jnc eoi
       lea eax,[eax*4+esi]
       mov [ebx],eax
       add ebx,16
       dec cl
       jnz .l2
       ret

READ:  mov eax,[ebp+fichero]
       mov edi,[ebp+tmp_bufer]
       movzx ecx,cx
       call mresize
       mov [ebp+tmp_bufer],edi
       jmp read

dht:    call READ
        mov esi,edi
        lea eax,[edi+ecx]
        push eax
   .l1: lodsb
        mov edi,esi
        mov ebx,3+16
        and bl,al
        cmp bl,al
        jne eoi
        shr bl,2
        and al,3
        or bl,al
        lea ebx,[ebp+ht_dc_ptrs+ebx*4]
        test dword [ebx],-1
        jnz eoi
        mov cl,15
        mov al,[edi]
   .l2: inc edi  ;calcular numero de codigos
        add al,[edi]
        jc eoi
        dec cl
        jnz .l2
        movzx ecx,al
        lea ecx,[ecx*4+2]
        call malloc
        mov [ebx],edi
        call arbol_hf
        mov eax,[esp]
        sub eax,ebx
        jc eoi
        mov esi,ebx
        cmp eax,4
        ja .l1
        jne eoi
        pop eax
        ret

arbol_hf:   ;esi=ht edi=memoria para el arbol
            ;retorna en ebx  el final de ht
            ;codigos: bits 0-3=nbits del siguiente numero
            ;         bits 4-7=numero de zeros
            ;         bits 8-14=longitud de este codigo o error si =127
            ;         bit 15=codigo/puntero
        push ebp
        lea ebx,[edi-2]
        add ebx,[ebx-2]
        mov word [ebx],-1 ;codigo de error si encontrado
        push ebx
        push esi
        lea ebx,[esi+16]
        mov ebp,esp
        xor ecx,ecx
        push edi
        push ecx
        add edi,2
        mov dx,1
        add dh,[esi]
        jz .l3
        jmp .l2
  .l1:  push edi
        push ecx
        add edi,2
  .l2:  inc cl
        cmp cl,dl
        jc .l1
        mov al,[ebx]
        inc ebx
        mov ah,128 ;marca de codigo
        or ah,dl
        cmp edi,[ebp+4]
        jnc .l5
        stosw
        cmp esp,ebp
        jnc .l5
        pop ecx
        pop esi
        lea eax,[edi-2]
        sub eax,esi
        mov [esi],ax
        dec dh
        jnz .l2 ;ncodigos
        mov esi,[ebp]
   .l3: inc esi
        inc dl
        cmp dl,17
        jnc .l4
        add dh,[esi]
        jz .l3
        mov [ebp],esi
        jmp .l2
   .l4: lea esp,[ebp+8]
        pop ebp
        ret
   .l5: mov ebp,[ebp+8]
        jmp eoi

sos:   sub ecx,4  ;a continuacion vienen los datos de la imagen
       call READ
       mov eax,[ebp+fichero]
       call ftell
       mov [ebp+position],edx
       mov esi,edi
       lea edi,[ebp+q_ptr]
       lodsb ;numero de componentes
       sub [ebp+color_c],al
       jnz eoi
       mov dh,al
  .l1: mov ebx,[edi]
       mov eax,[ebx]
       stosd
       lodsw
       mov cl,ah
       and eax,0f00h
       and ecx,0f0h
       shr eax,6
       shr ecx,2
       lea ebx,[ebp+ht_ac_ptrs+eax]
       mov eax,[ebx]
       lea ebx,[ebp+ht_dc_ptrs+ecx]
       mov ecx,[ebx]
       test eax,eax
       jz eoi
       test ecx,ecx
       jz eoi
       stosd
       mov eax,ecx
       stosd
       add edi,4
       dec dh
       jnz .l1
       mov edx,[ebp+mcu_ptr]
       cmp edx,modes
       jne .l2
       lea esi,[ebp+q_ptr]
       lea edi,[ebp+q_ptr+32]
       movsd
       movsd
       movsd
   .l2:
       mov esi,edx
       push dword [esi]
       pop dword [ebp+mcu_ptr]
       push dword [esi+4]
       pop dword[ebp+color_ptr]
       push dword [esi+12]
       pop dword [ebp+y_mcu]
       push dword [esi+8]
       pop dword [ebp+x_mcu]
       mov ecx,64*18
       call malloc
       mov [ebp+matrices],edi
       mov ecx,bufer_size
       call malloc
       mov [ebp+sourcebits],edi
       mov esp,[ebp]
       mov [esp+8],ebp
       popad
       ret

jpeg_display:
       test ebp,ebp
       jnz .inicio
       ret
      .inicio:
       pushad
       mov [ebp],esp
       mov eax,[ebp+fichero]
       mov edx,[ebp+position]
       call lseek
       mov edi,[ebp+sourcebits]
       add edi,bufer_size
       mov [ebp+sourcebits_index],edi
       sub edi,2
       mov [ebp+sourcebits_limit],edi
       mov edi,[ebp+matrices]
       mov [ebp+matriz_limit],edi
       xor eax,eax
       mov [esp+8],eax
       mov [ebp+estado],eax
       mov [ebp+tmp_bits],eax
       mov [ebp+dc],eax
       mov [ebp+dc+16],eax
       mov [ebp+dc+32],eax

       mov eax,[ebp+y_mcu]
       mov ecx,[ebp+y_org]
       sub ecx,eax
       mov [ebp+y_org2],ecx
       mov [ebp+y_mcu2],eax
       push dword [ebp+y_size]
       pop dword [ebp+y_size2]
  .l3: push dword [ebp+x_org]
       pop dword [ebp+x_org2]
       push dword [ebp+x_mcu]
       pop dword [ebp+x_mcu2]
       push dword [ebp+x_size]
       pop dword [ebp+x_size2]
       mov eax,[ebp+y_mcu2]
       add [ebp+y_org2],eax
       sub [ebp+y_size2],eax
       jnc .l4
       add eax,[ebp+y_size2]
       jnz .cont
       mov [esp+8],ebp
       popad
       ret
     .cont:
       mov dword [ebp+y_size2],0
       mov [ebp+y_mcu2],eax
      .l4:
        mov eax,[ebp+x_mcu2]
        sub [ebp+x_size2],eax
        jnc .l5
        add eax,[ebp+x_size2]
        jz .l3
        mov dword [ebp+x_size2],0
        mov [ebp+x_mcu2],eax
        call dword [ebp+mcu_ptr]
        mov eax,[ebp+x_mcu]
        mov ecx,[ebp+x_mcu2]
        mov edx,[ebp+y_mcu2]
        call recortar
        jmp .l6
      .l5:
        call dword [ebp+mcu_ptr]
        mov ecx,[ebp+x_mcu2]
        mov edx,[ebp+y_mcu2]
      .l6:
        mov eax,[ebp+x_org2]
        mov ebx,[ebp+y_org2]
        call dword [ebp+draw_ptr]
        add [ebp+x_org2],ecx
        mov ax,[ebp+estado]
        test al,15
        jz .l4
        cmp ah,8
        jnc .l4
        xor edx,edx
        mov [ebp+tmp_bits],edx
        mov [ebp+dc],edx
        mov [ebp+dc+16],edx
        mov [ebp+dc+32],edx
        add dword [ebp+sourcebits_index],2
        and word [ebp+estado],0c0h
        test al,32
        jz .l4
        jmp .l3

color100:
        push edi
    .l1: lodsw
        mov dl,ah
        mov ah,al
        stosw
        mov ah,dl
        stosb
        mov al,dl
        stosb
        stosw
        dec cl
        jnz .l1
        pop edi
        ret

color111:
        push edi
   .l1: lodsw
        mov bx,[esi+62]
        mov dx,[esi+126]
        xchg ah,bh
        xchg ah,dl
        xchg ah,bl
        stosw
        mov ax,bx
        stosw
        mov ax,dx
        stosw
        dec cl
        jnz .l1
        pop edi
        mov ecx,64*3
        jmp ybr_bgr

color411:
        push ebp
        push edi
        lea ebp,[esi+ecx*8]
     .l1: push ecx
        mov ax,[esi]
        mov cx,[ebp]
        mov dx,[ebp+64]
        add ebp,2
        xchg ch,dl
        mov bx,ax
        mov ah,cl
        mov bl,ch
        mov [edi],ax
        mov [edi+2],bx
        mov [edi+4],cx
        mov ax,[esi+8]
        mov bh,ah
        mov ah,cl
        mov [edi+48],ax
        mov [edi+48+2],bx
        mov [edi+48+4],cx
        mov ax,[esi+2]
        mov bx,ax
        mov ah,dl
        mov bl,dh
        mov [edi+6],ax
        mov [edi+2+6],bx
        mov [edi+4+6],dx
        mov ax,[esi+8+2]
        mov bh,ah
        mov ah,dl
        mov [edi+48+6],ax
        mov [edi+48+2+6],bx
        mov [edi+48+4+6],dx
        pop ecx
        add edi,12
        dec ecx
        add esi,4
        test cl,1
        jnz .l1
        add esi,64-8
        test cl,2
        jnz .l1
        sub esi,128-16
        add edi,48
        test cl,15
        jnz .l1
        add esi,64
        test cl,cl
        jnz .l1
        pop edi
        pop ebp
        mov ecx,64*4*3
        jmp ybr_bgr

color211:
        push ebp
        push edi
        lea ebp,[esi+ecx*4]
    .l1: push ecx
        mov ax,[esi]
        mov cx,[ebp]
        mov dx,[ebp+64]
        add ebp,2
        xchg ch,dl
        mov bx,ax
        mov ah,cl
        mov bl,ch
        mov [edi],ax
        mov [edi+2],bx
        mov [edi+4],cx
        mov ax,[esi+2]
        mov bx,ax
        mov ah,dl
        mov bl,dh
        mov [edi+6],ax
        mov [edi+2+6],bx
        mov [edi+4+6],dx
        pop ecx
        add edi,12
        dec cl
        add esi,4
        test cl,1
        jnz .l1
        add esi,64-8
        test cl,2
        jnz .l1
        sub esi,128-8
        test cl,cl
        jnz .l1
        pop edi
        pop ebp
        mov ecx,64*3*2
        jmp ybr_bgr


mcu411: lea ebx,[ebp+q_ptr]
        call hufdecode
        lea ebx,[ebp+q_ptr]
        call hufdecode
mcu211: lea ebx,[ebp+q_ptr]
        call hufdecode
mcu111: lea ebx,[ebp+q_ptr]
        call hufdecode
        lea ebx,[ebp+q_ptr+16]
        call hufdecode
mcu100: lea ebx,[ebp+q_ptr+32]
        call hufdecode
        mov esi,[ebp+matrices]
        mov dword [ebp+matriz_limit],esi
        mov ecx,32
        lea edi,[esi+64*6]
        jmp dword [ebp+color_ptr]

cargar_bits: ;edx=bits,cl=nbits,
             ;bp=data struct
             ;cr: cl,edx,eax,si
             ;ncr bx,bp,di,ch

        mov esi,[ebp+sourcebits_index]
        cmp esi,[ebp+sourcebits_limit]
        jnc .l6
        movzx eax,byte [esi]
        inc esi
        add cl,8
        cmp al,-1
        je .l2
        mov ah,al
        lodsb
        add cl,8
        cmp al,-1
        je .l2
   .l1: ror eax,cl
        or edx,eax
        mov [ebp+sourcebits_index],esi
        ret
   .l2: lodsb
        test al,al
        jnz .l3
        mov al,-1
        call .l1
        cmp cl,16
        jc cargar_bits
        ret
   .l3: sub esi,2
        sub cl,8
        sub al,0d0h
        cmp al,8
        jc .l4
        sub al,9
        mov al,63
        jz .l4
        mov al,127
    .l4: inc al
        or [ebp+estado],al
        movzx eax,ah
        jmp .l1
    .l5: mov [ebp+sourcebits_limit],edi
        mov word [edi],0d9ffh
        popad
        jmp cargar_bits
   .l6: ;read file
        pushad
        mov ecx,bufer_size-2
        mov edx,[ebp+sourcebits_limit]
        mov edi,[ebp+sourcebits]
        mov ax,[edx]
        sub edx,edi
        stosw
        sub esi,edx
        mov [ebp+sourcebits_index],esi
        cmp edx,ecx
        jne .l5
        mov eax,[ebp+fichero]
        call read
        lea ecx,[edi+ecx-2]
        mov [ebp+sourcebits_limit],ecx
        popad
        jmp cargar_bits


hufdecode:  ;si->dctable [bp+20]->actable di->outbufer edx->bits cl->bits en edx


            ;[bp+24]->sourcebits
            ;[bp+22]=outbufer+128
            ;[bx]       q ptr para aa&n
            ;[bx+2]     a ptr
            ;[bx+4]     d ptr
            ;[bx+8]     dc componente
        fninit
        push dword [ebx]
        mov cl,[ebp+nbits]
        mov edx,[ebp+tmp_bits]
        cmp cl,16
        jnc .l1
        call cargar_bits
    .l1: mov eax,[ebx+4]
        mov esi,[ebx+8]
        mov [ebp+actable],eax
        movzx eax,word [esi]
        add esi,2
   .l2: add edx,edx
        jnc .l3
        add esi,eax
   .l3: lodsw
        test ax,ax
        jns .l2
        ;codigo encontrado
        and ax,7f0fh
        mov edi,[ebp+matriz_limit] ;arrays
        sub cl,ah
        jns .l4
        fldz
   .error:
        xor ecx,ecx
        or byte [ebp+estado],32
        jmp .l12
   .l4: cmp cl,al
        jnc .l5
        push eax
        call cargar_bits
        pop eax
   .l5: sub cl,al
        mov ch,cl
        mov cl,al
        mov eax,edx
        shl edx,cl
        sar eax,17
        xor ax,8000h
        xor cl,15
        sar ax,cl
        mov cl,ch
        mov ch,2
        add ax,8000h ;incrementar si negativo
        adc ax,8000h
        add [ebx+12],ax
        fild word [ebx+12]
        push ecx
        mov ecx,64
        xor eax,eax
        add [ebp+matriz_limit],ecx
        rep stosd
        pop ecx
        sub edi,64*4
        mov ebx,[esp]
        fmul dword [ebx]
    .l6: cmp cl,16
        jnc .l7
        call cargar_bits
   .l7: mov esi,[ebp+actable]
        movzx eax,word[esi]
        add esi,2
   .l8: add edx,edx
        jnc .l9
        add esi,eax
   .l9: lodsw
        test ax,ax
        jns .l8
        ;codigo encontrado
        and ah,127
        xor ebx,ebx
        sub cl,ah
        js .error
        or bl,al
        jz .l12
        and al,0f0h
        shr al,3
        add ch,al
        js .error
        and bl,0fh
        jz .l11
        cmp cl,bl
        jnc .l10
        call cargar_bits
    .l10: sub cl,bl
        xchg bl,cl
        mov eax,edx
        shl edx,cl
        sar eax,17
        xor cl,15
        xor ax,8000h
        sar ax,cl
        add ax,8000h ;incrementar si negativo
        adc ax,8000h
        mov cl,bl
        mov bl,ch
        mov [ebp+tmp_bits],ax
        mov ax,[ebx+zigzag]
        mov ebx,[esp]
        fild word [ebp+tmp_bits]
        or [ebp+idct],ax
        and eax,11111100b
        fmul dword [ebx+eax]
        fstp dword [edi+eax]
  .l11: add ch,2
        jns .l6
  .l12: mov [ebp+nbits],cl
        mov [ebp+tmp_bits],edx
        xor ebx,ebx
        add esp,4
        xchg ebx,[ebp+idct]
        cmp ch,2
        je idctf1
        fstp dword [edi]
        test bh,0feh
        jnz idctf3
idctf2a: test bh,1
        mov esi,edi
        jz .l1
        test bl,1
        jnz idctf3
        push idctf2b
        jmp idctf3b
   .l1: call idctf3a
        mov cl,4
        call limit
        mov eax,[edi-8]
        mov edx,[edi-4]
        mov cl,7
   .l2: mov [edi],eax
        mov [edi+4],edx
        add edi,8
        dec cl
        jnz .l2
        ret

idctf1: fistp word[edi+64]
        mov ax,128
        add ax,[edi+64]
        jz .l2
        test ah,ah
        jz .l1
        mov al,-1
        js .l2
   .l1: mov ah,al
        stosw
        stosw
        mov eax,[edi-4]
        mov ecx,15
        rep stosd
   .l2: ret

idctf3: mov bl,8
        mov esi,edi
    .l1: rcr bh,1
        jc .l3
        mov eax,[esi]
        test eax,eax
        jz .l4
        mov cl,7
    .l2: add esi,32
        mov [esi],eax
        dec cl
        jnz .l2
        sub esi,32*7-4
        dec bl
        jnz .l1
        jmp .l5
   .l3: call idctf3b
   .l4: add esi,4
        dec bl
        jnz .l1
   .l5: mov esi,edi
        mov cl,8
   .l6: call idctf3a
        add esi,32
        add edi,16
        dec cl
        jnz .l6
        sub edi,128
        mov esi,edi
        mov cl,32
limit:  mov dx,[esi]
        mov bx,[esi+2]
        add esi,4
        add dx,128
        add bx,128
        test dh,dh
        mov ax,dx
        jz .l1
        mov al,0
        js .l1
        mov al,-1
   .l1: test bh,bh
        mov ah,bl
        jz .l2
        mov ah,0
        js .l2
        mov ah,-1
   .l2: stosw
        dec cl
        jnz limit
        ret

idctf2b:
        mov dl,8
  .l1:  fld dword[esi]
        add esi,32
        mov ax,128
        fistp word [edi]
        add ax,[edi]
        test ah,ah
        jz .l2
        mov al,0
        js .l2
        mov al,-1
  .l2:  mov ah,al
        stosw
        stosw
        stosw
        stosw
        dec dl
        jnz .l1
        ret

idctf3a: ;si(d float),di(w int) ncr
fld dword[esi+1*4]      ;f1                ;t21=f1+f7
fld st0
fld dword[esi+7*4]          ;f7
fadd st2,st0
fsubp st1,st0          ;t22=f1-f7
fld dword[esi+5*4]
fld st0           ;f5       ;t23=f5+f3
fld dword[esi+3*4]                   ;f3
fadd st2,st0
fsubp st1,st0          ;t20=f5-f3
fld st0
fadd st0,st3                ;t25=(t20+t22)*k2
fmul dword[k+4]                 ;k2 ;t25,t20,t23,t22,t21
fld st4                         ;t7=t21+t23
fadd st0,st3                        ;t7,t25,t20,t23,t22,t21
fld dword[k+12]                 ;k4     ;t6=k4*t20+t25-t7
fmulp st3,st0
fsub st2,st0
fld st1
faddp st3,st0                   ;t7,t25,t6,t23,t22,t21
fld st5                         ;t5=(t21-t23)*k1-t6
fsub st0,st4
fmul dword[k]          ;k1
fsub st0,st3
fstp st6               ;t7,t25,t6,t23,t22,t5
fstp st3                        ;t25,t6,t7,t22,t5
fxch st3
fmul dword[k+8]    ;k3      ;t4=k3*t22-t25+t5
fadd st0,st4                        ;t22*k3+t5,t6,t7,t25,t5
fsubrp st3,st0                  ;t6,t7,t4,t5
fld dword[esi]                       ;f0           ;t10=f0+f4
fst st5                         ;f0,t4,t5,t6,t7,f0
fld dword[esi+4*4]                   ;f4
fsub st6,st0                    ;t11=f0-f4
faddp st1,st0
fld st0               ;t10,t10,t6,t7,t4,t5,t11
fld dword[esi+2*4]                   ;f2      ;t13=f2+f6
fadd dword[esi+6*4]                  ;f6      ;t13,t10,t10,t6,t7,t4,t5,t11
fadd st2,st0                    ;t13,t10,t0,t6,t7,t4,t5,t11 ;t0=t10+t13
fsubp st1,st0                      ;t3,t0,t6,t7,t4,t5,t11 ;t3=t10-t13
fld st0                         ;p3=t3-t4
fsub st0,st5
fistp word [edi+3*2]        ;p3
fadd st0,st4                        ;p4=t3+t4
fld dword[esi+2*4]                   ;f2
fstp st5
fistp word [edi+4*2]        ;p4 ;t0,t6,t7,f2,t5,t11
fld st0                         ;p0=t0+t7
fsub st0,st3
fistp word [edi+7*2]        ;p7
fadd st0,st2                        ;p7=t0-t7
fistp word [edi]                     ;p0 ;t6,t7,f2,t5,t11
fld st2                         ;f2  ;f2,t6,t7,f2,t5,t11  ;t12=(f2-f6)*k1-t13
fld dword[esi+6*4]                   ;f6
fadd st4,st0                    ;f6,f2,t6,t7,t13,t5,t11
fsubp st1,st0
fmul dword[k]         ;k1
fsub st0,st3
fst st3                ;t12,t6,t7,t12,t5,t11
fadd st0,st5                        ;t1=t11+t12
fst st2                         ;t1,t6,t1,t12,t5,t11
fadd st0,st1                        ;p1=t1+t6
fistp word [edi+2]                   ;p1    ;t6,t1,t12,t5,t11
fsubp st1,st0                       ;p6=t1-t6
fistp word [edi+6*2]                 ;p6  ;t12,t5,t11
fsubp st2,st0                   ;t2=t11-t12  ;t5,t2
fld st0
fadd st0,st2                ;p2=t2+t5
fistp word [edi+2*2]                 ;p2
fsubp st1,st0                       ;p5=t2-t5   ;t5,t2
fistp word [edi+5*2]
ret             ;p5




idctf3b: ;si ncr
fld dword[esi+1*32]
fld st0          ;f1       ;t21=f1+f7
fld dword[esi+7*32]
fadd st2,st0     ;f7
fsubp st1,st0                       ;t22=f1-f7
fld dword[esi+5*32]
fld st0          ;f5       ;t23=f5+f3
fld dword[esi+3*32]                  ;f3
fadd st2,st0
fsubp st1,st0
fld st0  ;t20=f5-f3
fadd st0,st3                        ;t25=(t20+t22)*k2
fmul dword[k+4]                 ;k2 ;t25,t20,t23,t22,t21
fld st4                         ;t7=t21+t23
fadd st0,st3                        ;t7,t25,t20,t23,t22,t21
fld dword[k+12]                 ;k4     ;t6=k4*t20+t25-t7
fmulp st3,st0
fsub st2,st0
fld st1
faddp st3,st0                   ;t7,t25,t6,t23,t22,t21
fld st5                         ;t5=(t21-t23)*k1-t6
fsub st0,st4
fmul dword[k]          ;k1
fsub st0,st3
fstp st6               ;t7,t25,t6,t23,t22,t5
fstp st3
fxch st3               ;t25,t6,t7,t22,t5
fmul dword[k+8]                 ;k3      ;t4=k3*t22-t25+t5
fadd st0,st4                        ;t22*k3+t5,t6,t7,t25,t5
fsubrp st3,st0                  ;t6,t7,t4,t5
fld dword[esi]                       ;f0           ;t10=f0+f4
fst st5                         ;f0,t4,t5,t6,t7,f0
fld dword[esi+4*32]                  ;f4
fsub st6,st0                    ;t11=f0-f4
faddp st1,st0
fld st0               ;t10,t10,t6,t7,t4,t5,t11
fld dword[esi+2*32]                  ;f2      ;t13=f2+f6
fadd dword[esi+6*32]                 ;f6      ;t13,t10,t10,t6,t7,t4,t5,t11
fadd st2,st0                    ;t13,t10,t0,t6,t7,t4,t5,t11 ;t0=t10+t13
fsubp st1,st0                       ;t3,t0,t6,t7,t4,t5,t11 ;t3=t10-t13
fld st0                         ;p3=t3-t4
fsub st0,st5
fstp dword[esi+3*32]        ;p3
fadd st0,st4                        ;p4=t3+t4
fld dword[esi+2*32]                  ;f2
fstp st5
fstp dword[esi+4*32]        ;p4 ;t0,t6,t7,f2,t5,t11
fld st0
fsub st0,st3                ;p0=t0+t7
fstp dword[esi+7*32]                 ;p7
fadd st0,st2                        ;p7=t0-t7
fstp dword[esi]                      ;p0 ;t6,t7,f2,t5,t11
fld st2                         ;f2  ;f2,t6,t7,f2,t5,t11  ;t12=(f2-f6)*k1-t13
fld dword[esi+6*32]                  ;f6
fadd st4,st0                    ;f6,f2,t6,t7,t13,t5,t11
fsubp st1,st0
fmul dword[k]         ;k1
fsub st0,st3
fst st3                ;t12,t6,t7,t12,t5,t11
fadd st0,st5                        ;t1=t11+t12
fst st2                         ;t1,t6,t1,t12,t5,t11
fadd st0,st1                        ;p1=t1+t6
fstp dword[esi+1*32]                 ;p1    ;t6,t1,t12,t5,t11
fsubp st1,st0                       ;p6=t1-t6
fstp dword[esi+6*32]                 ;p6  ;t12,t5,t11
fsubp st2,st0
fld st0           ;t2=t11-t12  ;t5,t2
fadd st0,st2                        ;p2=t2+t5
fstp dword[esi+2*32]                 ;p2
fsubp st1,st0                       ;p5=t2-t5   ;t5,t2
fstp dword[esi+5*32]
ret             ;p5

ybr_bgr:  ;edi=bmp ecx=n_BYTES
	  ;retorna edi+=ecx
	pushad
	mov esi,edi
	add edi,ecx
	push edi
	mov edi,[colortabla]
   .l1: lodsw
	movzx ebx,ah
	movzx ebp,al
	movzx eax,al
	movzx ecx,byte[esi]
	lea ebx,[ebx*4+edi+1024]
	lea ecx,[ecx*4+edi]
	add eax,[ebx] ;cb   ;solo se usan 16 bits
	mov edx,[ebx+2]     ;pero el codigo de 32 bits es mas rapido
	mov ebx,[ecx] ;cr
	add eax,[ecx+2]
	add ebx,ebp ;b
	add edx,ebp ;r
	test ah,ah
	jz .l2
	mov al,0
	js .l2
	mov al,-1
   .l2: test dh,dh
	jz .l3
	mov dl,0
	js .l3
	mov dl,-1
   .l3: test bh,bh
	mov dh,al
	jz .l4
	mov bl,0
	js .l4
	mov bl,-1
   .l4: mov [esi-2],dx
	mov [esi],bl
	inc esi
	cmp esi,[esp]
	jc .l1
	pop edi
	popad
	ret

recortar:  ;edi=bufer eax=ancho en pixels (ecx,edx)tama�o deseado
        pushad
        dec edx
        jz .l2
        lea ebx,[ecx*3]
        lea eax,[eax*3]
        lea esi,[edi+eax]
        add edi,ebx
        sub eax,ebx
  .l1:  mov ecx,ebx
        call movedata
        add esi,eax
        dec edx
        jnz .l1
   .l2: popad
        ret

;R = Y                    + 1.402  *(Cr-128)
;G = Y - 0.34414*(Cb-128) - 0.71414*(Cr-128)
;B = Y + 1.772  *(Cb-128)

colortabla: dd 0

colorprecalc: ;prepara la tabla para convertir ycb a rgb
	 mov ecx,1024*2
	 call malloc
	 mov [colortabla],edi
	 fninit
	 fld dword [.k+4]
	 fld dword [.k]
	 mov dl,0
	 call .l1
	 fld dword [.k+12]
	 fld dword[.k+8]
    .l1: mov cx,-128
    .l2: mov [edi],ecx
	 inc ecx
	 fild word[edi]
	 fld st0
	 fmul st0,st2
	 fistp word[edi]
	 fmul st0,st2
	 fistp word[edi+2]
	 add edi,4
	 inc dl
	 jnz .l2
	 ret

     .k: dd 1.402,-0.71414,-0.34414,+1.772