;;================================================================================================;;
;;//// jpeg.asm //// (c) diamond, 2008-2009 //////////////////////////////////////////////////////;;
;;================================================================================================;;
;;                                                                                                ;;
;; This file is part of Common development libraries (Libs-Dev).                                  ;;
;;                                                                                                ;;
;; Libs-Dev is free software: you can redistribute it and/or modify it under the terms of the GNU ;;
;; Lesser General Public License as published by the Free Software Foundation, either version 2.1 ;;
;; of the License, or (at your option) any later version.                                         ;;
;;                                                                                                ;;
;; Libs-Dev is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without  ;;
;; even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU  ;;
;; Lesser General Public License for more details.                                                ;;
;;                                                                                                ;;
;; You should have received a copy of the GNU Lesser General Public License along with Libs-Dev.  ;;
;; If not, see <http://www.gnu.org/licenses/>.                                                    ;;
;;                                                                                                ;;
;;================================================================================================;;

include 'jpeg.inc'

;--------------------------------------------------------------------------------------------------
; img.is.jpg - quick signature test.
; in:  two dword stack arguments (removed by the callee, ret 8):
;        1st = pointer to the data, 2nd = size of the data in bytes
; out: eax = 1 if get_marker finds a marker and it is SOI (0xD8), otherwise eax = 0
; esi and ebp are preserved.
;--------------------------------------------------------------------------------------------------
img.is.jpg:
        push esi ebp
        mov esi, [esp+12] ; esi -> JPEG data
        mov ebp, [esp+16] ; ebp = data size
        call get_marker
        jc .no
        cmp al, 0xD8 ; SOI marker?
        push 1 ; push/pop do not touch the flags set by cmp
        pop eax
        jz .ok
.no:
        xor eax, eax
.ok:
        pop ebp esi
        ret 8

;--------------------------------------------------------------------------------------------------
; img.decode.jpg - decode a JPEG stream.
; in:  two dword stack arguments (removed by the callee, ret 8):
;        1st = pointer to the data, 2nd = size of the data in bytes
; out: eax = value of jpeg.work.image at exit (0 if no image was created or if the
;      work area could not be allocated)
; All general-purpose registers except eax are preserved (pushad/popad); the FPU is reset
; with finit.
; Register roles inside the function: ebx -> jpeg.work area, esi -> current input byte,
; ebp = number of input bytes left.
;--------------------------------------------------------------------------------------------------
img.decode.jpg:
        finit
        pushad
        mov esi, [esp+20h+4] ; esi -> JPEG data
        mov ebp, [esp+20h+8] ; ebp = data size
@@:
; allocate area for JPEG processing
        push sizeof.jpeg.work
        call [mem.alloc]
        test eax, eax
        jz .ret
        mov ebx, eax
        xor ecx, ecx
        mov [ebx + jpeg.work.image], ecx
        mov [ebx + jpeg.work.dct_buffer], ecx
        mov [ebx + jpeg.work._esp], esp ; stack level right after pushad, used by error exits
; check for SOI [Start-Of-Image] marker
        call get_marker
        jc .end
        cmp al, 0xD8 ; SOI?
        jz .soi_ok
.end:
; general exit from the function
; Some error exits arrive here with extra dwords still on the stack (e.g. the SOS parser
; jumps out while a saved ecx is pushed). Return to the stack level recorded at entry so
; that popad/ret below always see the pushad frame.
        mov esp, [ebx + jpeg.work._esp]
; for progressive mode: convert loaded DCT coefficients to image
        call handle_progressive
; convert full-color images to RGB
        call convert_to_rgb
        push [ebx + jpeg.work.image] ; keep the result across the free of the work area
        push ebx
        call [mem.free]
        pop eax ; eax = image pointer (or 0)
.ret:
        mov [esp+28], eax ; store the result into the eax slot of the pushad frame
        popad
        ret 8
.soi_ok:
        mov [ebx + jpeg.work.restart_interval], ecx ; ecx = 0 here
        mov [ebx + jpeg.work.adobe_ycck], cl
; loop until start of frame (real data), parse markers
.markers_loop:
        call get_marker
        jc .end
; markers RSTn do not have parameters
; N.B. They can not exist in this part of JPEG, but let's be liberal :)
        cmp al, 0xD0
        jb @f
        cmp al, 0xD8
        jb .markers_loop
@@:
        cmp al, 0xD9 ; EOI? [invalid here]
        jz .end
; ok, this is marker segment
; first word is length of the segment (big-endian)
        cmp ebp, 2
        jb .end
        xor edx, edx
        mov dl, [esi+1]
        mov dh, [esi] ; edx = marker length, al = marker value
        sub ebp, edx ; the whole segment must fit into the remaining data
        jb .end
        cmp al, 0xDB ; DQT?
        jz .dqt
        cmp al, 0xC4 ; DHT?
        jz .dht
        cmp al, 0xCC ; DAC? [ignored - no arithmetic coding]
        jz .next_marker
        cmp al, 0xDD ; DRI?
        jz .dri
        cmp al, 0xDA ; SOS?
        jz .sos
        cmp al, 0xC0
        jb @f
        cmp al, 0xD0
        jb .sofn
@@:
        cmp al, 0xEE ; APP14?
        jz .app14
; unrecognized marker; let's skip it and hope for the best
.next_marker:
        add esi, edx
        jmp .markers_loop
.app14:
; check for special Adobe marker: "Adobe" at offset 2, transform byte at offset 13
        cmp dx, 14
        jb .next_marker
        cmp byte [esi+2], 'A'
        jnz .next_marker
        cmp dword [esi+3], 'dobe'
        jnz .next_marker
        cmp byte [esi+13], 2
        setz [ebx + jpeg.work.adobe_ycck] ; 1 only when the transform byte equals 2
        jmp .next_marker
.dqt:
; DQT marker found
; length: 2 bytes for length field + 65 bytes per table
        sub edx, 2
        jc .end
        lodsw ; skip the length field
.dqt_loop:
        test edx, edx
        jz .markers_loop
        sub edx, 1+64
        jc .end
        lodsb ; al = precision (high nibble) and table id (low nibble)
; 8-bit DCT-based process shall not use a 16-bit precision quantization table.
; ---- img.decode.jpg, continued: body of the DQT loop, then DRI, DHT, SOFn and SOS handling ----
; On entry to this part: al = precision/id byte of the current quantization table,
; esi -> its 64 entries, edx = bytes left in the DQT segment, ebx -> jpeg.work.
        test al, 0xF0
        jnz .end
        and eax, 3
        mov [ebx+jpeg.work.quant_tables_defined+eax], 1
        shl eax, 8 ; each table takes 64 dwords = 256 bytes
        lea edi, [ebx+eax+jpeg.work.quant_tables]
        xor ecx, ecx
@@:
; load one byte entry as a float and scale it by two idct_pre_table factors,
; then store it at the position given by the zigzag table
        xor eax, eax
        lodsb
        push eax
        fild dword [esp]
        pop eax
        movzx eax, byte [zigzag+ecx]
        add eax, eax ; zigzag entry doubled = dword offset inside the table
        push eax
        and eax, 7*4 ; dword offset selected by the low 3 bits of the position
        fmul dword [idct_pre_table+eax]
        pop eax
        push eax
        shr eax, 3
        and eax, 7*4 ; dword offset selected by the next 3 bits of the position
        fmul dword [idct_pre_table+eax]
        pop eax
        fstp dword [edi+eax]
        inc ecx
        cmp ecx, 64
        jb @b
        jmp .dqt_loop
.dri:
; DRI marker found
        cmp edx, 4 ; length must be 4
        jnz .end2
        movzx eax, word [esi+2]
        xchg al, ah ; big-endian -> little-endian
        mov [ebx+jpeg.work.restart_interval], eax
        jmp .next_marker
.dht:
; DHT marker found
        sub edx, 2
        jc .end2
        lodsw ; skip the length field
.dht_loop:
        test edx, edx
        jz .markers_loop
        sub edx, 17 ; 1 byte class/id + 16 bytes of code counts
        jc .end2
; next Huffman table; find place for it
        lodsb
        mov edi, eax
        and eax, 0x10 ; table class bit
        and edi, 3 ; table id
        shr eax, 2
        or edi, eax ; edi = id + 4*class
        mov [ebx+jpeg.work.dc_huffman_defined+edi], 1
; shl edi, 11
        imul edi, max_hufftable_size
        lea edi, [ebx+edi+jpeg.work.dc_huffman] ; edi -> destination table
; get table size: sum of the 16 per-length code counts
        xor eax, eax
        push 16
        pop ecx
@@:
        add al, [esi]
        adc ah, 0
        inc esi
        loop @b
        cmp ax, 0x100
        ja .end2
        sub edx, eax
        jc .end2
; construct Huffman tree
        push ebx edx
; (old tree-based construction, disabled)
; lea eax, [edi+256*8]
; push eax
; push 16
; mov edx, esi
; @@:
; cmp byte [edx-1], 0
; jnz @f
; dec edx
; dec dword [esp]
; jmp @b
; @@:
; sub edx, [esp]
; lea eax, [edi+8]
; push 2
; pop ecx
; .lenloop:
; mov bl, byte [edx]
; test bl, bl
; jz .len1done
; push eax
; xor eax, eax
; .len1loop:
; dec ecx
; js .dhterr
; cmp edi, [esp+8]
; jae .dhterr
; lodsb
; stosd
; dec bl
; jnz .len1loop
; pop eax
; .len1done:
; jecxz .len2done
; push ecx
; .len2loop:
; cmp eax, [esp+8]
; jb @f
; or eax, -1
; @@:
; cmp edi, [esp+8]
; jae .dhterr
; stosd
; add eax, 8
; jnb @f
; or eax, -1
; @@:
; loop .len2loop
; pop ecx
; .len2done:
; add ecx, ecx
; inc edx
; dec dword [esp]
; jnz .lenloop
; pop eax
; pop eax
; sub eax, edi
; shr eax, 2
; cmp eax, ecx
; ja @f
; mov ecx, eax
; @@:
; or eax, -1
; rep stosd
; pop edx ebx
; jmp .dht_loop
; .dhterr:
; ;pop eax eax eax edx ebx
; add esp, 5*4
; Current construction: a lookup table of 2-byte entries (code length, value).
; First level: 256 entries; a code of length L (1..8) fills 128 >> (L-1) consecutive entries.
        lea eax, [edi+256*2]
        push eax ; [esp] = end of the first level
        lea edx, [esi-16] ; edx -> per-length code counts
        mov ah, 1 ; ah = current code length
        mov ecx, 128 ; ecx = entries per code of the current length
.dht_l1:
        movzx ebx, byte [edx]
        inc edx
        test ebx, ebx
        jz .dht_l3
.dht_l2:
        cmp edi, [esp]
        jae .dhterr1 ; more codes than the first level can hold
        lodsb
        xchg al, ah ; al = length, ah = value
        push ecx
        rep stosw
        pop ecx
        xchg al, ah
        dec ebx
        jnz .dht_l2
.dht_l3:
        inc ah
        shr ecx, 1
        jnz .dht_l1
; Lengths 9..12: codes go into 16-entry sub-tables (16*2 bytes each) placed after the
; first level; remaining first-level entries get length byte 9 and, it appears, the
; sub-table number in the second byte (the decoder reads it that way) - TODO confirm.
        push edi
        mov edi, [esp+4]
        push edi
        mov eax, 0x00090100
        mov cl, 8
.dht_l4:
        movzx ebx, byte [edx]
        inc edx
        test ebx, ebx
        jz .dht_l6
.dht_l5:
        cmp edi, [esp]
        jb @f
; current sub-table is full: add a link entry in the first level and open a new one
        mov edi, [esp+4]
        rol eax, 16
        cmp edi, [esp+8]
        jae .dhterr2
        stosw
        inc ah
        mov [esp+4], edi
        pop edi
        push edi
        rol eax, 16
        add dword [esp], 16*2
@@:
        lodsb
        xchg al, ah
        push ecx
        rep stosw
        pop ecx
        xchg al, ah
        dec ebx
        jnz .dht_l5
.dht_l6:
        inc ah
        shr ecx, 1
        jnz .dht_l4
; Lengths 13..16: third level, again 16-entry sub-tables.
; NOTE(review): the arithmetic below appears to reserve room for the second-level
; sub-tables still needed before placing the third level - confirm.
        push edi
        movzx ebx, byte [edx]
        add ebx, ebx
        add bl, [edx+1]
        adc bh, 0
        add ebx, ebx
        add bl, [edx+2]
        adc bh, 0
        add ebx, ebx
        add bl, [edx+3]
        adc bh, 0
        add ebx, 15
        shr ebx, 4
        mov cl, 8
        lea ebx, [edi+ebx*2]
        sub ebx, [esp+12]
        add ebx, 31
        shr ebx, 5
        mov edi, ebx
        shl edi, 5
        add edi, [esp+12]
        xor ebx, 9
        shl ebx, 16
        xor eax, ebx
        push edi
.dht_l7:
        movzx ebx, byte [edx]
        inc edx
        test ebx, ebx
        jz .dht_l10
.dht_l8:
        cmp edi, [esp]
        jb .dht_l9
        mov edi, [esp+4]
        cmp edi, [esp+8]
        jb @f
        mov edi, [esp+12]
        cmp edi, [esp+16]
        jae .dhterr3
        mov al, 9
        stosb
        rol eax, 8
        stosb
        inc eax
        ror eax, 8
        mov [esp+12], edi
        mov edi, [esp+8]
        add dword [esp+8], 16*2
@@:
        mov al, 9
        stosb
        rol eax, 16
        stosb
        inc eax
        ror eax, 16
        mov [esp+4], edi
        pop edi
        push edi
        add dword [esp], 16*2
.dht_l9:
        lodsb
        xchg al, ah
        push ecx
        rep stosw
        pop ecx
        xchg al, ah
        dec ebx
        jnz .dht_l8
.dht_l10:
        inc ah
        shr ecx, 1
        jnz .dht_l7
; fill the unused tail of each of the three regions with 0xFF bytes
        push -1
        pop eax
        pop ecx
        sub ecx, edi
        rep stosb
        pop edi
        pop ecx
        sub ecx, edi
        rep stosb
        pop edi
        pop ecx
        sub ecx, edi
        rep stosb
        pop edx ebx
        jmp .dht_loop
; error exits of the table builder: drop the temporaries pushed so far
.dhterr3:
        pop eax eax
.dhterr2:
        pop eax eax
.dhterr1:
        pop eax
        pop edx ebx
.end2:
        jmp .end
.sofn:
; SOFn marker found
        cmp [ebx+jpeg.work.image], 0
        jnz .end2 ; only one frame is allowed
; only SOF0 [baseline sequential], SOF1 [extended sequential], SOF2 [progressive]
; nobody supports other compression methods
        cmp al, 0xC2
        ja .end2
        setz [ebx+jpeg.work.progressive]
; Length must be at least 8
        sub edx, 8
        jb .end2
; Sample precision in JFIF must be 8 bits
        cmp byte [esi+2], 8
        jnz .end2
; Color space in JFIF is either YCbCr (color images, 3 components)
; or Y (grey images, 1 component)
        movzx eax, byte [esi+7]
        cmp al, 1
        jz @f
        cmp al, 3
        jz @f
; Adobe products sometimes use YCCK color space with 4 components
        cmp al, 4
        jnz .end2
        cmp [ebx+jpeg.work.adobe_ycck], 0
        jz .end2
@@:
        mov edi, eax ; edi = number of components
        lea eax, [eax*3]
        sub edx, eax ; the rest of the segment must be exactly 3 bytes per component
        jnz .end2
; image type: 8 bpp for grayscale JPEGs, 24 bpp for normal,
; 32 bpp for Adobe YCCK
        push Image.bpp8
        pop eax ; Image.bpp8 = 1
        cmp edi, eax
        jz @f
        inc eax ; Image.bpp24 = 2
        cmp edi, 3
        jz @f
        inc eax ; Image.bpp32 = 3
@@:
        push eax ; third argument of img.create below
; get width and height
; width must be nonzero
; height must be nonzero - nobody supports DNL markers
; NOTE(review): neither value is actually tested for zero here - confirm img.create rejects it
        mov ah, [esi+3]
        mov al, [esi+4] ; eax = height
        xor ecx, ecx
        mov ch, [esi+5]
        mov cl, [esi+6] ; ecx = width
; allocate memory for image
        stdcall img.create, ecx, eax
        test eax, eax
        jz .end2
        mov [ebx + jpeg.work.image], eax
; create grayscale palette if needed
        cmp edi, 1
        jnz .no_create_palette
        push ecx edi
        mov edi, [eax + Image.Palette]
        xor eax, eax
        mov ecx, 256
@@:
        stosd
        add eax, 0x010101
        loop @b
        pop edi ecx
.no_create_palette:
; other image characteristics
; (ecx is used below as the image width, so it is presumably preserved by img.create - verify)
        mov eax, edi
        shl eax, 3
        mov [ebx + jpeg.work.delta_x], eax ; 8 pixels, in bytes
        mov [ebx + jpeg.work.pixel_size], edi
;mov eax, edi
        imul eax, ecx
        mov [ebx + jpeg.work.delta_y], eax ; 8 image rows, in bytes
        shr eax, 3
        mov [ebx + jpeg.work.line_size], eax ; one image row, in bytes
        add esi, 8
        mov ecx, edi
        lea edi, [ebx + jpeg.work.components]
        xor eax, eax
        xor edx, edx
; each component gets a 6-byte record: id, V, H, VFactor, HFactor, quantization table id
.sof_parse_comp:
        movsb ; db ComponentIdentifier
        lodsb
        mov ah, al
        and al, 0xF ; low nibble of the sampling byte
        jz .end3
        shr ah, 4 ; high nibble of the sampling byte
        jz .end3
        stosd ; db V, db H, db ?, db ? (will be filled later)
        cmp dl, al
        ja @f
        mov dl, al ; dl = maximum of the low nibbles
@@:
        cmp dh, ah
        ja @f
        mov dh, ah ; dh = maximum of the high nibbles
@@:
        movsb ; db QuantizationTableID
        loop .sof_parse_comp
        mov word [ebx + jpeg.work.max_v], dx
        movzx eax, dh
        movzx edx, dl
        push eax edx
        shl eax, 3
        shl edx, 3
        mov [ebx + jpeg.work.block_width], eax
        mov [ebx + jpeg.work.block_height], edx
        pop edx eax
        push eax edx
        imul eax, [ebx + jpeg.work.delta_x]
        mov [ebx + jpeg.work.block_delta_x], eax
        imul edx, [ebx + jpeg.work.delta_y]
        mov [ebx + jpeg.work.block_delta_y], edx
; number of blocks = dimension / block size, rounded up
        mov ecx, [ebx + jpeg.work.image]
        mov eax, [ecx + Image.Width]
        add eax, [ebx + jpeg.work.block_width]
        dec eax
        xor edx, edx
        div [ebx + jpeg.work.block_width]
        mov [ebx + jpeg.work.x_num_blocks], eax
        mov eax, [ecx + Image.Height]
        add eax, [ebx + jpeg.work.block_height]
        dec eax
        xor edx, edx
        div [ebx + jpeg.work.block_height]
        mov [ebx + jpeg.work.y_num_blocks], eax
        mov ecx, [ebx + jpeg.work.pixel_size]
        pop edx
        lea edi, [ebx + jpeg.work.components]
@@:
        mov eax, edx
        div byte [edi+1] ; VMax / V_i = VFactor_i
        mov byte [edi+3], al ; db VFactor
        pop eax
        push eax
        div byte [edi+2] ; HMax / H_i = HFactor_i
        mov byte [edi+4], al ; db HFactor
        add edi, 6
        loop @b
        pop eax
        cmp [ebx + jpeg.work.progressive], 0
        jz .sof_noprogressive
; progressive mode: allocate a buffer for the DCT coefficients of all components
        mov eax, [ebx + jpeg.work.x_num_blocks]
        mul [ebx + jpeg.work.block_width]
        mul [ebx + jpeg.work.y_num_blocks]
        mul [ebx + jpeg.work.block_height]
        add eax, eax ; 2 bytes per coefficient
        mov [ebx + jpeg.work.dct_buffer_size], eax
        mul [ebx + jpeg.work.pixel_size]
        push eax
        call [mem.alloc]
        test eax, eax
        jnz @f
; allocation failed: destroy the image and clear jpeg.work.image (eax = 0 here)
        xchg eax, [ebx + jpeg.work.image]
        push eax
        call img.destroy
        jmp .end
@@:
        mov [ebx + jpeg.work.dct_buffer], eax
.sof_noprogressive:
        jmp .markers_loop
.end3:
        jmp .end
.sos:
; SOS marker found
; frame must be already opened
        cmp [ebx + jpeg.work.image], 0
        jz .end3
        cmp edx, 6
        jb .end3
; parse marker
        movzx eax, byte [esi+2] ; number of components in this scan
        test eax, eax
        jz .end3 ; must be nonzero
        cmp al, byte [ebx + jpeg.work.pixel_size]
        ja .end3 ; must be <= total number of components
; mov [ns], eax
        cmp al, 1
        setz [ebx + jpeg.work.not_interleaved]
        lea ecx, [6+eax+eax]
        cmp edx, ecx ; segment length must be 6 + 2 * number of components
        jnz .end3
        mov ecx, eax
        lea edi, [ebx + jpeg.work.cur_components]
        add esi, 3
; For every scan component a 56-byte record is built at edi:
;   +0  db V, db H, db VFactor, db HFactor     +4  dd HIncrement    +8  dd VIncrement
;   +12 dd QuantizationTable   +16 dd DCTable  +20 dd ACTable
;   +24 dd width / HFactor (stored twice, +24 and +28)   +32 dd HFactor+1 - (width % HFactor)
;   +36 dd height / VFactor (stored twice, +36 and +40)  +44 dd VFactor+1 - (height % VFactor)
;   +48 dd DCPrediction        +52 dd ComponentOffset
.sos_find_comp:
        lodsb ; got ComponentID, look for component info
        push ecx esi
        mov ecx, [ebx + jpeg.work.pixel_size]
        lea esi, [ebx + jpeg.work.components]
        and dword [edi+48], 0
        and dword [edi+52], 0
@@:
        cmp [esi], al
        jz @f
        inc dword [edi+52] ; counts the records skipped = index of the component
        add esi, 6
        loop @b
@@:
        mov eax, [esi+1]
        mov dl, [esi+5] ; dl = quantization table id
        pop esi ecx
        jnz .end3 ; bad ComponentID (ZF still comes from the search loop)
        cmp [ebx + jpeg.work.not_interleaved], 0
        jz @f
        mov ax, 0x0101 ; single-component scan: V = H = 1
@@:
        stosd ; db V, db H, db VFactor, db HFactor
        push ecx
; NOTE(review): the three "jz .end3" exits between here and the matching "pop ecx"
; leave this dword on the stack - confirm the common exit rebalances esp.
        xor eax, eax
        mov al, byte [edi-1] ; get HFactor
        mul byte [ebx+jpeg.work.pixel_size] ; number of components
        stosd ; HIncrement_i = HFactor_i * sizeof(pixel)
        mov al, byte [edi-4-2] ; get VFactor
        mul byte [ebx+jpeg.work.pixel_size] ; number of components
        mov ecx, [ebx+jpeg.work.image]
        imul eax, [ecx+Image.Width] ; image width
        stosd ; VIncrement_i = VFactor_i * sizeof(row)
        xchg eax, edx
        and eax, 3
        cmp [ebx+jpeg.work.quant_tables_defined+eax], 0
        jz .end3
        shl eax, 8
        lea eax, [ebx+eax+jpeg.work.quant_tables]
        stosd ; dd QuantizationTable
        lodsb ; DC table id (high nibble), AC table id (low nibble)
        movzx eax, al
        mov edx, eax
        shr eax, 4
        and edx, 3
        and eax, 3
; a missing Huffman table is an error in sequential mode; in progressive mode a null
; pointer is stored instead and checked later, when the scan type is known
        cmp [ebx+jpeg.work.dc_huffman_defined+eax], 0
        jnz .dc_table_ok
        cmp [ebx+jpeg.work.progressive], 0
        jz .end3
        xor eax, eax
        jmp .dc_table_done
.dc_table_ok:
; shl eax, 11
        imul eax, max_hufftable_size
        lea eax, [ebx+jpeg.work.dc_huffman+eax]
.dc_table_done:
        cmp [ebx+jpeg.work.ac_huffman_defined+edx], 0
        jnz .ac_table_ok
        cmp [ebx+jpeg.work.progressive], 0
        jz .end3
        xor edx, edx
        jmp .ac_table_done
.ac_table_ok:
; shl edx, 11
        imul edx, max_hufftable_size
        lea edx, [ebx+jpeg.work.ac_huffman+edx]
.ac_table_done:
        stosd ; dd DCTable
        xchg eax, edx
        stosd ; dd ACTable
        mov eax, [ecx+Image.Width]
        movzx ecx, byte [edi-21] ; get HFactor
        cdq ; edx:eax = width (width<0x10000, so as dword it is unsigned)
        div ecx
        stosd ; dd width / HFactor_i
        stosd
        xchg eax, ecx
        inc eax
        sub eax, edx
        stosd ; dd HFactor_i+1 - (width % HFactor_i)
        mov ecx, [ebx+jpeg.work.image]
        mov eax, [ecx+Image.Height]
        movzx ecx, byte [edi-34] ; get VFactor
        cdq
        div ecx
        stosd ; dd height / VFactor_i
        stosd
        xchg eax, ecx
        inc eax
        sub eax, edx
        stosd ; dd VFactor_i+1 - (height % VFactor_i)
        pop ecx
        scasd ; dd DCPrediction (scasd is used only to advance edi by 4)
; set the top byte of DCPrediction to 0x80 or 0 depending on the parity flag of the
; component index stored at [edi]
        cmp dword [edi], 0
        setnp al
        ror al, 1
        mov byte [edi-1], al
        scasd ; dd ComponentOffset
        dec ecx
        jnz .sos_find_comp
        mov [ebx+jpeg.work.cur_components_end], edi
; last three bytes of the scan header: start and end of spectral selection and the
; successive approximation byte, split into its low and high nibbles (stored in that order)
        lea edi, [ebx+jpeg.work.ScanStart]
        movsb
        cmp byte [esi], 63
        ja .end3
        movsb
        lodsb
        push eax
        and al, 0xF
        stosb
        pop eax
        shr al, 4
        stosb
; now unpack data
        call init_limits
        and [ebx+jpeg.work.decoded_MCUs], 0
        mov [ebx+jpeg.work.cur_rst_marker], 7 ; so that the first expected marker is RST0
        and [ebx+jpeg.work.huffman_bits], 0
        cmp [ebx+jpeg.work.progressive], 0
        jz .sos_noprogressive
; progressive mode - only decode DCT coefficients
; initialize pointers to coefficients data
; zero number of EOBs for AC coefficients
; redefine HIncrement and VIncrement
        lea edi, [ebx+jpeg.work.cur_components]
.coeff_init:
        mov eax, [ebx+jpeg.work.dct_buffer_size]
        mul dword [edi+52] ; component index * size of one component's coefficients
        add eax, [ebx+jpeg.work.dct_buffer]
        mov [edi+12], eax ; +12 now holds the pointer to the current coefficients
        and dword [edi+52], 0
; the Huffman table needed by this kind of scan must exist (AC if ScanStart != 0, else DC)
        cmp [ebx+jpeg.work.ScanStart], 0
        jz .scan_dc
        cmp dword [edi+20], 0
        jz .end3
        jmp @f
.scan_dc:
        cmp dword [edi+16], 0
        jz .end3
@@:
        movzx eax, byte [edi+1]
        shl eax, 7 ; H blocks * 128 bytes (64 coefficients of 2 bytes)
        mov [edi+4], eax
        mov eax, [edi+28]
        mov cl, [edi+3]
        cmp cl, [edi+32]
        sbb eax, -7-1
        shr eax, 3
        shl eax, 7
        mov [edi+8], eax
        add edi, 56
        cmp edi, [ebx+jpeg.work.cur_components_end]
        jb .coeff_init
; unpack coefficients
; N.B. Speed optimization makes sense here.
.coeff_decode_loop:
        lea edx, [ebx+jpeg.work.cur_components]
.coeff_components_loop:
        mov edi, [edx+12]
        movzx ecx, byte [edx]
        push dword [edx+40]
        push edi
.coeff_y_loop:
        push ecx
        movzx eax, byte [edx+1]
        push dword [edx+28]
        push edi
.coeff_x_loop:
; [edx+40] / [edx+28] going negative marks blocks outside the real data of the component
        cmp dword [edx+40], 0
        jl @f
        cmp dword [edx+28], 0
        jge .realdata
@@:
        cmp [ebx+jpeg.work.not_interleaved], 0
        jnz .norealdata
; interleaved scan: the block is present in the stream, decode it into a scratch buffer
        push eax edi
        lea edi, [ebx+jpeg.work.dct_coeff]
        call decode_progressive_coeff
        pop edi eax
        jmp .norealdata
.realdata:
        push eax
        call decode_progressive_coeff
        add edi, 64*2
        pop eax
.norealdata:
        sub dword [edx+28], 8
        sub eax, 1
        jnz .coeff_x_loop
        pop edi
        pop dword [edx+28]
        add edi, [edx+8]
        pop ecx
        sub dword [edx+40], 8
        sub ecx, 1
        jnz .coeff_y_loop
        movzx eax, byte [edx+1]
        shl eax, 3
        pop edi
        add edi, [edx+4]
        pop dword [edx+40]
        sub [edx+28], eax
        mov [edx+12], edi
        add edx, 56
        cmp edx, [ebx+jpeg.work.cur_components_end]
        jnz .coeff_components_loop
        call next_MCU
        jc .norst
        sub [ebx+jpeg.work.cur_x], 1
        jnz .coeff_decode_loop
        call next_line
; move every component's coefficient pointer to the start of the next row of blocks
        lea edx, [ebx+jpeg.work.cur_components]
@@:
        mov eax, [ebx+jpeg.work.max_x]
        imul eax, [edx+4]
        sub [edx+12], eax
        movzx eax, byte [edx]
        imul eax, [edx+8]
        add [edx+12], eax
        add edx, 56
        cmp edx, [ebx+jpeg.work.cur_components_end]
        jnz @b
        sub [ebx+jpeg.work.cur_y], 1
        jnz .coeff_decode_loop
        jmp .markers_loop
.norst:
.end4:
        jmp .end3
.sos_noprogressive:
; normal mode - unpack JPEG image
        mov edi, [ebx+jpeg.work.image]
        mov edi, [edi+Image.Data]
        mov [ebx+jpeg.work.cur_out_ptr], edi
; N.B. Speed optimization makes sense here.
.decode_loop:
; (img.decode.jpg, sequential scan) decode MCU by MCU, row by row
        call decode_MCU
        call next_MCU
        jc .end4
        sub [ebx+jpeg.work.cur_x], 1
        jnz .decode_loop
        call next_line
        sub [ebx+jpeg.work.cur_y], 1
        jnz .decode_loop
        jmp .markers_loop

;--------------------------------------------------------------------------------------------------
; get_marker - read the next marker from the input stream.
; in:  esi -> data, ebp = number of bytes left
; out: CF=0, al=marker value - ok
;      CF=1 - no marker
;      esi/ebp are advanced past everything that was consumed
;--------------------------------------------------------------------------------------------------
get_marker:
        sub ebp, 1
        jc .ret
        lodsb
if 1
        cmp al, 0xFF
        jae @f
; Some encoders write a marker length field that is two less than the true
; value (in JPEG, length of marker = length of data INCLUDING the
; length field itself). To open such files, allow 2 bytes
; before next marker.
; The two skipped bytes are charged to ebp: previously they were read without being
; counted, which left ebp two bytes too large for the rest of the decoding.
        sub ebp, 2
        jc .ret
        lodsb
        lodsb
end if
        cmp al, 0xFF
        jb .ret ; CF=1: still no 0xFF prefix
@@:
        sub ebp, 1
        jc .ret
        lodsb
        cmp al, 0xFF ; 0xFF may be repeated as fill before the marker code
        jz @b
        clc
.ret:
        ret

;--------------------------------------------------------------------------------------------------
; decode_MCU - decode one MCU of the current scan and copy its pixels into the image.
; in:  ebx -> jpeg.work, esi/ebp = input stream (consumed by decode_data_unit)
; Walks the 56-byte records from jpeg.work.cur_components to cur_components_end; for each
; record decodes byte[+0] rows of byte[+1] data units and, for units that are not outside
; the image ([edx+40] and [edx+28] non-negative), copies the 8*8 block from
; jpeg.work.decoded_data into the image, replicating each sample byte[+3] times
; horizontally and byte[+2] times vertically.
; On return jpeg.work.cur_out_ptr is advanced by jpeg.work.cur_block_dx.
;--------------------------------------------------------------------------------------------------
align 16
decode_MCU:
        lea edx, [ebx+jpeg.work.cur_components]
.components_loop:
; decode each component
        push [ebx+jpeg.work.cur_out_ptr]
        movzx ecx, byte [edx]
        push dword [edx+40]
; we have H_i * V_i blocks of packed data, decode them
.y_loop_1:
        push [ebx+jpeg.work.cur_out_ptr]
        push ecx
        movzx eax, byte [edx+1]
        push dword [edx+28]
.x_loop_1:
        push eax
        call decode_data_unit
        cmp dword [edx+40], 0
        jl .nocopyloop
        cmp dword [edx+28], 0
        jl .nocopyloop
; now we have decoded block 8*8 in decoded_data
; H_i * V_i packed blocks 8*8 make up one block (8*HMax) * (8*VMax)
; so each pixel in packed block corresponds to HFact * VFact pixels
        movzx ecx, byte [edx+2]
        push esi ebp
        mov edi, [ebx+jpeg.work.cur_out_ptr]
        add edi, [edx+52]
.y_loop_2:
        push ecx edi
; ecx = number of rows to copy on this pass: 8, or fewer at the bottom edge
        cmp ecx, [edx+44]
        mov ecx, [edx+40]
        sbb ecx, 8-1
        sbb eax, eax
        and ecx, eax
        add ecx, 8
        jz .skip_x_loop_2
        movzx eax, byte [edx+3]
.x_loop_2:
        push eax ecx edi
; eax = 0, or minus the number of columns to skip at the right edge
        cmp eax, [edx+32]
        mov eax, [edx+28]
        sbb eax, 8-1
        sbb ebp, ebp
        and eax, ebp
; computed jump target: 3 bytes forward per skipped column (eax is <= 0 here)
        mov ebp, .copyiter_all
        lea esi, [ebx+jpeg.work.decoded_data]
        sub ebp, eax
        sub ebp, eax
        sub ebp, eax
        mov eax, [edx+4]
        sub eax, 1 ; movsb already advances edi by 1
.copyloop:
        push esi edi
        jmp ebp
.copyiter_all:
        movsb
repeat 7
        add edi, eax
        movsb
end repeat
        nop
        nop
        pop edi esi
        add edi, [edx+8]
        add esi, 8
        sub ecx, 1
        jnz .copyloop
        pop edi ecx eax
        add edi, [ebx+jpeg.work.pixel_size]
        sub eax, 1
        jnz .x_loop_2
.skip_x_loop_2:
        pop edi ecx
        add edi, [ebx+jpeg.work.line_size]
        sub ecx, 1
        jnz .y_loop_2
        pop ebp esi
.nocopyloop:
        mov eax, [ebx+jpeg.work.delta_x]
        add [ebx+jpeg.work.cur_out_ptr], eax
        pop eax
        sub dword [edx+28], 8
        sub eax, 1
        jnz .x_loop_1
        pop dword [edx+28]
        pop ecx
        pop eax
        sub dword [edx+40], 8
        add eax, [ebx+jpeg.work.delta_y]
        mov [ebx+jpeg.work.cur_out_ptr], eax
        sub ecx, 1
        jnz .y_loop_1
        movzx eax, byte [edx+1]
        pop dword [edx+40]
        shl eax, 3
        pop [ebx+jpeg.work.cur_out_ptr]
        sub dword [edx+28], eax
        add edx, 56
        cmp edx, [ebx+jpeg.work.cur_components_end]
        jb .components_loop
        mov eax, [ebx+jpeg.work.cur_block_dx]
        add [ebx+jpeg.work.cur_out_ptr], eax
        ret

;--------------------------------------------------------------------------------------------------
; next_MCU - bookkeeping after each MCU, including restart-marker handling.
; in:  ebx -> jpeg.work, esi/ebp = input stream
; out: CF=0 - ok; CF=1 - the expected RSTn marker is missing
; When jpeg.work.restart_interval MCUs have been decoded (and this was not the last MCU of
; the scan), the next two bytes must be 0xFF, 0xD0 + n with n cycling through 0..7; the
; bit buffer state and the 16-bit DC predictions of all scan components are then reset.
;--------------------------------------------------------------------------------------------------
align 16
next_MCU:
        add [ebx+jpeg.work.decoded_MCUs], 1
        mov eax, [ebx+jpeg.work.restart_interval]
        test eax, eax
        jz .no_restart
        cmp [ebx+jpeg.work.decoded_MCUs], eax
        jb .no_restart
        and [ebx+jpeg.work.decoded_MCUs], 0
        and [ebx+jpeg.work.huffman_bits], 0
; no marker is expected after the very last MCU (cur_x = cur_y = 1)
        cmp [ebx+jpeg.work.cur_x], 1
        jnz @f
        cmp [ebx+jpeg.work.cur_y], 1
        jz .no_restart
@@:
; restart marker must be present
        sub ebp, 2
        js .error
        cmp byte [esi], 0xFF
        jnz .error
        mov al, [ebx+jpeg.work.cur_rst_marker]
        inc eax
        and al, 7
        mov [ebx+jpeg.work.cur_rst_marker], al
        add al, 0xD0
        cmp [esi+1], al
        jnz .error
        add esi, 2
; handle restart marker - zero all DC predictions
        lea edx, [ebx+jpeg.work.cur_components]
@@:
        and word [edx+48], 0
        add edx, 56
        cmp edx, [ebx+jpeg.work.cur_components_end]
        jb @b
.no_restart:
        clc
        ret
.error:
        stc
        ret

;--------------------------------------------------------------------------------------------------
; next_line - switch to the next row of MCUs.
; in:  ebx -> jpeg.work
; Reloads cur_x from max_x, moves cur_out_ptr back by max_x * cur_block_dx and forward by
; cur_block_dy, and for every scan component restores [+28] from [+24] and lowers [+40]
; by 8 * byte[+0].
; Destroys eax, edx.
;--------------------------------------------------------------------------------------------------
next_line:
        mov eax, [ebx+jpeg.work.max_x]
        mov [ebx+jpeg.work.cur_x], eax
        mul [ebx+jpeg.work.cur_block_dx]
        sub eax, [ebx+jpeg.work.cur_block_dy]
        sub [ebx+jpeg.work.cur_out_ptr], eax
        lea edx, [ebx+jpeg.work.cur_components]
@@:
        mov eax, [edx+24]
        mov [edx+28], eax
        movzx eax, byte [edx]
        shl eax, 3
        sub [edx+40], eax
        add edx, 56
        cmp edx, [ebx+jpeg.work.cur_components_end]
        jb @b
        ret

;--------------------------------------------------------------------------------------------------
; init_limits - set up the MCU counters and steps for the current scan.
; in:  ebx -> jpeg.work
; Starts from the frame-wide values (x_num_blocks, y_num_blocks, block_delta_x,
; block_delta_y).
;--------------------------------------------------------------------------------------------------
init_limits:
        push [ebx+jpeg.work.x_num_blocks]
        pop [ebx+jpeg.work.max_x]
        push [ebx+jpeg.work.y_num_blocks]
        pop [ebx+jpeg.work.max_y]
; ---- init_limits, continued ----
        push [ebx+jpeg.work.block_delta_x]
        pop [ebx+jpeg.work.cur_block_dx]
        push [ebx+jpeg.work.block_delta_y]
        pop [ebx+jpeg.work.cur_block_dy]
        cmp [ebx+jpeg.work.not_interleaved], 0
        jz @f
; single-component scan: limits and steps are derived from the first scan component
; (dword +28/+40 rounded up to whole 8-sample blocks; steps scaled by bytes +3 and +2)
        mov eax, dword [ebx+jpeg.work.cur_components+28]
        movzx ecx, byte [ebx+jpeg.work.cur_components+3]
        cmp cl, [ebx+jpeg.work.cur_components+32]
        sbb eax, -7-1
        shr eax, 3
        mov [ebx+jpeg.work.max_x], eax
        mov eax, dword [ebx+jpeg.work.cur_components+40]
        movzx edx, byte [ebx+jpeg.work.cur_components+2]
        cmp dl, [ebx+jpeg.work.cur_components+44]
        sbb eax, -7-1
        shr eax, 3
        mov [ebx+jpeg.work.max_y], eax
        imul ecx, [ebx+jpeg.work.delta_x]
        mov [ebx+jpeg.work.cur_block_dx], ecx
        imul edx, [ebx+jpeg.work.delta_y]
        mov [ebx+jpeg.work.cur_block_dy], edx
@@:
        push [ebx+jpeg.work.max_x]
        pop [ebx+jpeg.work.cur_x]
        push [ebx+jpeg.work.max_y]
        pop [ebx+jpeg.work.cur_y]
        ret

; (old version of get_bit, disabled)
;macro get_bit
;{
;local .l1,.l2,.marker
; add cl, cl
; jnz .l1
; sub ebp, 1
; js decode_data_unit.eof
; mov cl, [esi]
; cmp cl, 0xFF
; jnz .l2
;.marker:
; add esi, 1
; sub ebp, 1
; js decode_data_unit.eof
; cmp byte [esi], 0xFF
; jz .marker
; cmp byte [esi], 0
; jnz decode_data_unit.eof
;.l2:
; sub esi, -1
; adc cl, cl
;.l1:
;}

; Bit reader state shared by the macros below:
;   ch = bits not yet consumed, left-aligned; cl = number of such bits;
;   esi -> next input byte; ebp = input bytes left.
; A 0xFF byte in the stream must be followed by 0x00 (after optional extra 0xFF bytes);
; anything else, or running out of input, jumps to the label .eof_pop<stack_depth> of the
; function that uses the macro, where <stack_depth> is the number of dwords that function
; has to drop from the stack.

; get_bit: shift the next bit of the stream into CF.
macro get_bit stack_depth
{
local .l1,.l2,.marker
        sub cl, 1
        jns .l1
; buffer is empty: load the next byte
        sub ebp, 1
        js .eof_pop#stack_depth
        mov ch, [esi]
        cmp ch, 0xFF
        jnz .l2
.marker:
        add esi, 1
        sub ebp, 1
        js .eof_pop#stack_depth
        cmp byte [esi], 0xFF
        jz .marker
        cmp byte [esi], 0
        jnz .eof_pop#stack_depth
.l2:
        add esi, 1
        mov cl, 7
.l1:
        add ch, ch
}

; get_bits: read bl bits and return them in eax as a signed value (the final arithmetic
; appears to implement the JPEG "extend" rule for coefficient magnitudes - TODO confirm).
; ebx is preserved (it is pushed inside, hence stack_depth_p1 for the eof exits);
; edx is destroyed unless restore_edx is "true", in which case it is popped from the stack.
macro get_bits stack_depth,stack_depth_p1,restore_edx
{
local .l1,.l2,.l3,.marker2
        movzx eax, ch
        mov dl, cl
        shl eax, 24 ; collected bits are kept left-aligned in eax
        neg cl
        push ebx
        add cl, 24 ; cl = shift for the next loaded byte
.l1:
        cmp bl, dl
        jbe .l2 ; enough bits collected
        sub bl, dl
        sub ebp, 1
        js .eof_pop#stack_depth_p1
        mov ch, [esi]
        cmp ch, 0xFF
        jnz .l3
.marker2:
        add esi, 1
        sub ebp, 1
        js .eof_pop#stack_depth_p1
        cmp byte [esi], 0xFF
        jz .marker2
        cmp byte [esi], 0
        jnz .eof_pop#stack_depth_p1
.l3:
        movzx edx, ch
        add esi, 1
        shl edx, cl
        sub cl, 8
        or eax, edx
        mov dl, 8
        jmp .l1
.l2:
        mov cl, bl
        sub dl, bl
        shl ch, cl ; drop the consumed bits from the buffer
        pop ebx
        cmp eax, 80000000h
        rcr eax, 1
        mov cl, 31
        sub cl, bl
        sar eax, cl
        mov cl, dl ; cl = bits left in ch
if restore_edx eq true
        pop edx
end if
        add eax, 80000000h
        adc eax, 80000000h
}

; (old tree-walking version of get_huffman_code, disabled)
; macro get_huffman_code
; {
; local .l1
; xor ebx, ebx
; .l1:
; get_bit
; adc ebx, ebx
; mov eax, [eax+4*ebx]
; xor ebx, ebx
; cmp eax, -1
; jz .eof_pop
; cmp eax, 0x1000
; jae .l1
; mov ebx, eax
; }

; get_huffman_code: decode one Huffman symbol.
; in:  eax -> Huffman lookup table (2-byte entries: code length, value; first level of 256
;      entries, then 16-entry sub-tables starting at offset 0x200), bit reader state in
;      ch/cl/esi/ebp
; out: eax = ebx = decoded value; bit reader state updated; edx destroyed
macro get_huffman_code stack_depth,stack_depth_p1
{
local .l1,.l2,.l3,.l4,.l5,.l6,.nomarker1,.marker1,.nomarker2,.marker2,.nomarker3,.marker3,.done
; 1. (First level in Huffman table) Does the current Huffman code fit in 8 bits
; and have we got enough bits?
        movzx ebx, ch
        cmp byte [eax+ebx*2], cl
        jbe .l1
; 2a. No; load next byte
        sub ebp, 1
        js .eof_pop#stack_depth
        mov ch, [esi]
        movzx edx, ch
        cmp ch, 0xFF
        jnz .nomarker1
.marker1:
        add esi, 1
        sub ebp, 1
        js .eof_pop#stack_depth
        cmp byte [esi], 0xFF
        jz .marker1
        cmp byte [esi], 0
        jnz .eof_pop#stack_depth
.nomarker1:
        shr edx, cl
        add esi, 1
        or ebx, edx
; 3a. (First level in Huffman table, >=8 bits known) Does the current Huffman code fit in 8 bits?
        cmp byte [eax+ebx*2], 8
        jbe .l2
        jl .eof_pop#stack_depth ; length byte with the sign bit set (0xFF filler): invalid code
; 4aa. No; go to next level
        movzx ebx, byte [eax+ebx*2+1]
        mov dl, ch
        shl ebx, 5
        ror edx, cl
        lea ebx, [eax+ebx+0x200]
        shr edx, 24
        push edx
        shr edx, 4
; 5aa. (Second level in Huffman table) Does the current Huffman code fit in 12 bits
; and have we got enough bits?
        cmp byte [ebx+edx*2], cl
        jbe .l3
; 6aaa. No; have we got 12 bits?
        cmp cl, 4
        jae .l4
; 7aaaa. No; load next byte
        pop edx
        sub ebp, 1
        js .eof_pop#stack_depth
        mov ch, [esi]
        cmp ch, 0xFF
        jnz .nomarker2
.marker2:
        add esi, 1
        sub ebp, 1
        js .eof_pop#stack_depth
        cmp byte [esi], 0xFF
        jz .marker2
        cmp byte [esi], 0
        jnz .eof_pop#stack_depth
.nomarker2:
        push ecx
        shr ch, cl
        add esi, 1
        or dl, ch
        pop ecx
        push edx
        shr edx, 4
; 8aaaa. (Second level in Huffman table) Does the current Huffman code fit in 12 bits?
        cmp byte [ebx+edx*2], 4
        jbe .l5
        jl .eof_pop#stack_depth_p1
; 9aaaaa. No; go to next level
        movzx ebx, byte [ebx+edx*2+1]
        pop edx
        shl ebx, 5
        and edx, 0xF
        lea ebx, [eax+ebx+0x200]
; 10aaaaa. Get current code length and value
        sub cl, [ebx+edx*2]
        movzx eax, byte [ebx+edx*2+1]
        neg cl
        shl ch, cl
        neg cl
        add cl, 8
        jmp .done
.l5:
; 9aaaab. Yes; get current code length and value
        sub cl, [ebx+edx*2]
        movzx eax, byte [ebx+edx*2+1]
        neg cl
        pop edx
        shl ch, cl
        neg cl
        add cl, 8
        jmp .done
.l4:
; 7aaab. Yes; go to next level
        movzx ebx, byte [ebx+edx*2+1]
        pop edx
        shl ebx, 5
        and edx, 0xF
        lea ebx, [eax+ebx+0x200]
; 8aaab. (Third level in Huffman table) Have we got enough bits?
        cmp [ebx+edx*2], cl
        jbe .l6
; 9aaaba. No; load next byte
        sub ebp, 1
        js .eof_pop#stack_depth
        mov ch, [esi]
        cmp ch, 0xFF
        jnz .nomarker3
.marker3:
        add esi, 1
        sub ebp, 1
        js .eof_pop#stack_depth
        cmp byte [esi], 0xFF
        jz .marker3
        cmp byte [esi], 0
        jnz .eof_pop#stack_depth
.nomarker3:
        push ecx
        shr ch, cl
        add esi, 1
        or dl, ch
        pop ecx
; 10aaaba. Get current code length and value
        sub cl, [ebx+edx*2]
        movzx eax, byte [ebx+edx*2+1]
        neg cl
        shl ch, cl
        neg cl
        add cl, 8
        jmp .done
.l3:
; 6aab. Yes; get current code length and value
        pop eax
.l6:
; 9aaabb. Yes; get current code length and value
        sub cl, [ebx+edx*2]
        movzx eax, byte [ebx+edx*2+1]
        xor cl, 7
        shl ch, cl
        xor cl, 7
        add ch, ch
        jmp .done
.l2:
; 3ab. Yes; get current code length and value
        sub cl, [eax+ebx*2]
        movzx eax, byte [eax+ebx*2+1]
        neg cl
        shl ch, cl
        neg cl
        add cl, 8
        jmp .done
.l1:
; 3b. Yes; get current code length and value
        mov dl, [eax+ebx*2]
        movzx eax, byte [eax+ebx*2+1]
        xchg cl, dl
        sub dl, cl
        shl ch, cl
        mov cl, dl
.done:
        mov ebx, eax
}

; Decode DCT coefficients for one 8*8 block in progressive mode
; from input stream, given by pointer esi and length ebp
; N.B. Speed optimization makes sense here.
align 16
; decode_progressive_coeff
; in:  ebx -> jpeg.work, edx -> 56-byte record of the current scan component,
;      edi -> 64 16-bit coefficients of the block, esi/ebp = input stream
; The kind of pass is selected by jpeg.work.ScanStart (0 = DC scan) and
; jpeg.work.ApproxPosHigh (0 = first DC pass); jpeg.work.huffman_bits holds the bit reader
; state (loaded into ecx on entry, stored back on normal exit).
; On input errors control leaves through decode_data_unit.eof_pop0 / .eof_pop3.
decode_progressive_coeff:
        mov ecx, [ebx+jpeg.work.huffman_bits]
        cmp [ebx+jpeg.work.ScanStart], 0
        jnz .ac
; DC coefficient
        cmp [ebx+jpeg.work.ApproxPosHigh], 0
        jz .dc_first
; DC coefficient, subsequent passes: one more bit, OR-ed in at position ApproxPosLow
        xor eax, eax
        get_bit 0
        adc eax, eax
        mov [ebx+jpeg.work.huffman_bits], ecx
        mov cl, [ebx+jpeg.work.ApproxPosLow]
        shl eax, cl
        or [edi], ax
        ret
.dc_first:
; DC coefficient, first pass
        mov eax, [edx+16] ; DC Huffman table
        push ebx
        push edx
        get_huffman_code 2,3
        get_bits 2,3,true
        pop ebx
        add eax, [edx+48] ; add the prediction and remember the new one
        mov [edx+48], ax
        mov [ebx+jpeg.work.huffman_bits], ecx
        mov cl, [ebx+jpeg.work.ApproxPosLow]
        shl eax, cl
        mov [edi], ax
        ret
.ac:
; AC coefficients
; eax = index of the current coefficient (ScanStart..ScanEnd); [edx+52] = number of
; blocks still covered by a pending end-of-band run
        movzx eax, [ebx+jpeg.work.ScanStart]
        cmp al, [ebx+jpeg.work.ScanEnd]
        ja .ret
        cmp dword [edx+52], 0
        jnz .was_eob
        push ebx
.acloop:
        push edx
        push eax
        mov eax, [edx+20] ; AC Huffman table
        get_huffman_code 3,4
        pop eax
        test ebx, 15 ; low nibble = size of the new coefficient
        jz .band
        push eax ebx
        and ebx, 15
        get_bits 4,5,false
        pop ebx
        xchg eax, [esp] ; [esp] = new value, eax = coefficient index
        shr ebx, 4 ; ebx = run length
        mov edx, [esp+8] ; edx -> jpeg.work (the ebx saved before .acloop)
.zeroloop1:
; skip ebx coefficients that are still zero; each nonzero coefficient passed on the way
; gets one correction bit
        push eax ebx
        movzx eax, byte [zigzag+eax]
        xor ebx, ebx
        cmp word [edi+eax], bx
        jz .zeroloop2
        get_bit 5
        jnc @f
; correction bit is 1: move the coefficient away from zero by 1 << ApproxPosLow
        push ecx
        mov cl, [edx+jpeg.work.ApproxPosLow]
        xor ebx, ebx
        cmp byte [edi+eax+1], 80h
        adc ebx, 0
        add ebx, ebx
        sub ebx, 1 ; ebx = +1 for a non-negative coefficient, -1 for a negative one
        shl ebx, cl
        pop ecx
        add [edi+eax], bx
@@:
        pop ebx eax
@@:
        add eax, 1
        cmp al, [edx+jpeg.work.ScanEnd]
        ja decode_data_unit.eof_pop3
        jmp .zeroloop1
.zeroloop2:
        pop ebx eax
        sub ebx, 1
        jns @b
.nozero1:
        pop ebx ; ebx = new coefficient value (0 after a ZRL code)
        test ebx, ebx
        jz @f
        push eax
        movzx eax, byte [zigzag+eax]
        push ecx
        mov cl, [edx+jpeg.work.ApproxPosLow]
        shl ebx, cl
        pop ecx
        mov [edi+eax], bx
        pop eax
@@:
        add eax, 1
        cmp al, [edx+jpeg.work.ScanEnd]
        pop edx
        jbe .acloop
        pop ebx
        mov [ebx+jpeg.work.huffman_bits], ecx
.ret:
        ret
; error exits used by the bit-reader macros: drop N dwords, then leave through
; decode_data_unit
.eof_pop5:
        pop ebx
.eof_pop4:
        pop ebx
.eof_pop3:
        pop ebx
.eof_pop2:
        pop ebx
.eof_pop1:
        pop ebx
.eof_pop0:
        jmp decode_data_unit.eof_pop0
.band:
; size nibble is 0: either ZRL (run nibble = 15) or an end-of-band run
        shr ebx, 4
        cmp ebx, 15
        jnz .eob
        mov edx, [esp+4]
        push 0 ; no new coefficient value
        jmp .zeroloop1
.eob:
; end-of-band run: length = (1 << ebx) + value of the next ebx bits
        pop edx
        push eax
        mov eax, 1
        test ebx, ebx
        jz .eob0
@@:
        get_bit 2
        adc eax, eax
        sub ebx, 1
        jnz @b
.eob0:
        mov [edx+52], eax
        pop eax
        pop ebx
.was_eob:
; the current block is part of an end-of-band run: only correction bits for the
; coefficients that are already nonzero are read
        sub dword [edx+52], 1
        cmp al, [ebx+jpeg.work.ScanEnd]
        ja .ret2
        push edx
.zeroloop3:
        push eax
        movzx eax, byte [zigzag+eax]
        xor edx, edx
        cmp word [edi+eax], dx
        jz @f
        get_bit 2
        jnc @f
        push ecx
        mov cl, [ebx+jpeg.work.ApproxPosLow]
        xor edx, edx
        cmp byte [edi+eax+1], 80h
        adc edx, 0
        add edx, edx
        sub edx, 1
        shl edx, cl
        pop ecx
        add [edi+eax], dx
@@:
        pop eax
        add eax, 1
        cmp al, [ebx+jpeg.work.ScanEnd]
        jbe .zeroloop3
        pop edx
.ret2:
        mov [ebx+jpeg.work.huffman_bits], ecx
        ret

; handle_progressive
; in:  ebx -> jpeg.work
; Does nothing when jpeg.work.dct_buffer is 0. Otherwise, for each frame component in
; turn, builds a single scan-component record (V = H = 1), points it at that component's
; part of the coefficient buffer and runs decode_MCU over the whole image; finally the
; coefficient buffer is freed (also when a component refers to an undefined quantization
; table).
; Destroys eax, ecx, edx, esi, edi, ebp at least.
handle_progressive:
        cmp [ebx+jpeg.work.dct_buffer], 0
        jnz @f
        ret
@@:
; information for all components
        lea esi, [ebx+jpeg.work.components]
        xor ebp, ebp ; ebp = index of the current component
        mov ecx, [ebx+jpeg.work.pixel_size]
.next_component:
        lea edi, [ebx+jpeg.work.cur_components]
        lodsb ; ComponentID
        lodsd
        mov ax, 0x0101
        stosd ; db V, db H, db VFactor, db HFactor
        xor eax, eax
        mov al, byte [edi-1] ; get HFactor
        mul byte [ebx+jpeg.work.pixel_size] ; number of components
        stosd ; HIncrement_i = HFactor_i * sizeof(pixel)
        movzx eax, byte [edi-4-2] ; get VFactor
        mul [ebx+jpeg.work.line_size] ; number of components * image width
        stosd ; VIncrement_i = VFactor_i * sizeof(row)
        lodsb
        and eax, 3
        cmp [ebx+jpeg.work.quant_tables_defined+eax], 0
        jz .error
        shl eax, 8
        lea eax, [ebx+jpeg.work.quant_tables+eax]
        stosd ; dd QuantizationTable
        stosd ; dd DCTable - ignored
        mov eax, ebp
        mul [ebx+jpeg.work.dct_buffer_size]
        add eax, [ebx+jpeg.work.dct_buffer]
        stosd ; instead of dd ACTable - pointer to current DCT coefficients
        push ecx
        mov eax, [ebx+jpeg.work.image]
        mov eax, [eax+Image.Width]
        movzx ecx, byte [edi-21] ; get HFactor
; cdq ; edx = 0 as a result of previous mul
        div ecx
        stosd ; dd width / HFactor_i
        stosd
        xchg eax, ecx
        inc eax
        sub eax, edx
        stosd ; dd HFactor_i+1 - (width % HFactor_i)
        mov eax, [ebx+jpeg.work.image]
        mov eax, [eax+Image.Height]
        movzx ecx, byte [edi-34] ; get VFactor
        cdq
        div ecx
        stosd ; dd height / VFactor_i
        stosd
        xchg eax, ecx
        inc eax
        sub eax, edx
        stosd ; dd VFactor_i+1 - (height % VFactor_i)
        pop ecx
; DCPrediction = 0x80000000 or 0, depending on the parity flag of the component index
        xor eax, eax
        test ebp, ebp
        setnp al
        ror eax, 1
        stosd ; dd DCPrediction
        mov eax, ebp
        stosd ; dd ComponentOffset
        inc ebp
        push ecx
        mov [ebx+jpeg.work.cur_components_end], edi
        lea edx, [edi-56]
; do IDCT and unpack
        mov edi, [ebx+jpeg.work.image]
        mov edi, [edi+Image.Data]
        mov [ebx+jpeg.work.cur_out_ptr], edi
        mov [ebx+jpeg.work.not_interleaved], 1
        call init_limits
.decode_loop:
        call decode_MCU
        sub [ebx+jpeg.work.cur_x], 1
        jnz .decode_loop
        call next_line
        sub [ebx+jpeg.work.cur_y], 1
        jnz .decode_loop
        pop ecx
        dec ecx
        jnz .next_component
; image unpacked, return
.error:
        push [ebx+jpeg.work.dct_buffer]
        call [mem.free]
        ret

; Support for YCbCr -> RGB conversion
; R = Y + 1.402 * (Cr - 128)
; G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
; B = Y + 1.772 * (Cb - 128)
; When converting YCbCr -> RGB, we need to do some multiplications;
; to be faster, we precalculate the table for all 256 possible values
; Also we approximate fractions with N/65536, this gives sufficient precision

; img.initialize.jpeg - fill the lookup tables starting at color_table_1.
; No arguments; all registers are preserved (pushad/popad).
; Four tables of 256 dwords are written one after another (the conversion code addresses
; them as color_table_1..color_table_4); each table is produced in two runs of 128
; entries, the second run starting from the negated end value of the first.
img.initialize.jpeg:
;initialize_color_table:
; 1.402 = 1 + 26345/65536, -0.71414 = -46802/65536
; -0.34414 = -22554/65536, 1.772 = 1 + 50594/65536
        pushad
        mov edi, color_table_1
        mov ecx, 128
; 1. Cb -> 1.772*Cb
        xor eax, eax
        mov dx, 8000h ; dx = 16-bit fraction, preset to one half
.l1:
        push ecx
@@:
        stosd
        add dx, 50594
        adc eax, 1
        loop @b
        neg dx
        adc eax, -1
        neg eax
        pop ecx
        jnz .l1 ; second run starts from the negated value; then eax = 0 ends the loop
; 2. Cb -> -0.34414*Cb
        mov ax, dx
.l2:
        push ecx
@@:
        stosd
        sub eax, 22554
        loop @b
        neg eax
        pop ecx
        cmp ax, dx
        jnz .l2
        xor eax, eax
; 3. Cr -> -0.71414*Cr
.l3:
        push ecx
@@:
        stosd
        sub eax, 46802
        loop @b
        neg eax
        pop ecx
        jnz .l3
; 4. Cr -> 1.402*Cr
.l4:
        push ecx
@@:
        stosd
        add dx, 26345
        adc eax, 1
        loop @b
        neg dx
        adc eax, -1
        neg eax
        pop ecx
        jnz .l4
        popad
        ret

; this function is called in the end of image loading
; convert_to_rgb
; in:  ebx -> jpeg.work
; Does nothing if there is no image or if pixel_size is neither 3 nor 4.
; pixel_size = 3: every 3-byte pixel (Y, Cb, Cr) is replaced in place by (B, G, R).
; pixel_size = 4: the same conversion, then each channel is inverted and multiplied by
;                 the 4th byte of the pixel (divided by 255 with rounding); the 4th byte
;                 itself is left unchanged.
; Every channel is clamped to 0..255. ebx is preserved; eax, ecx, edx, esi, edi, ebp are
; destroyed.
convert_to_rgb:
; some checks
        mov eax, [ebx+jpeg.work.image]
        test eax, eax ; image exists?
        jz .ret
        cmp byte [ebx+jpeg.work.pixel_size], 3 ; full-color image?
        jz .ycc2rgb
        cmp byte [ebx+jpeg.work.pixel_size], 4
        jz .ycck2rgb
.ret:
        ret
.ycc2rgb:
; conversion is needed
        mov esi, [eax+Image.Width]
        imul esi, [eax+Image.Height] ; esi = number of pixels
        mov edi, [eax+Image.Data]
        push ebx
; N.B. Speed optimization makes sense here.
align 16
.loop:
; mov ebx, [edi]
; mov edx, ebx
; mov ecx, ebx
; movzx ebx, bl ; ebx = Y
; shr edx, 16
; mov eax, ebx
; movzx edx, dl ; edx = Cr
; movzx ecx, ch ; ecx = Cb
        movzx ebx, byte [edi] ; ebx = Y
        movzx ecx, byte [edi+1] ; ecx = Cb
        mov eax, ebx
        movzx edx, byte [edi+2] ; edx = Cr
; B = Y + color_table_1[Cb]
        add eax, [color_table_1+ecx*4]
        mov ebp, [color_table_2+ecx*4]
        cmp eax, 80000000h
        sbb ecx, ecx
        and eax, ecx ; negative -> 0
        add ebp, [color_table_3+edx*4]
        cmp eax, 0x100
        sbb ecx, ecx
        not ecx
        sar ebp, 16 ; tables 2 and 3 hold 16.16 fixed-point values
        or eax, ecx ; >= 0x100 -> al = 0xFF
        mov [edi], al
; G = Y + color_table_2[Cb] + color_table_3[Cr]
        lea eax, [ebx+ebp]
        cmp eax, 80000000h
        sbb ecx, ecx
        and eax, ecx
        cmp eax, 0x100
        sbb ecx, ecx
        not ecx
        or eax, ecx
        mov [edi+1], al
; R = Y + color_table_4[Cr]
        mov eax, ebx
        add eax, [color_table_4+edx*4]
        cmp eax, 80000000h
        sbb ecx, ecx
        and eax, ecx
        cmp eax, 0x100
        sbb ecx, ecx
        not ecx
        or eax, ecx
        mov [edi+2], al
        add edi, 3
        sub esi, 1
        jnz .loop
        pop ebx
        ret
.ycck2rgb:
; conversion is needed
        mov esi, [eax+Image.Width]
        imul esi, [eax+Image.Height]
        push ebx
        push esi ; [esp] = number of pixels left
        mov edi, [eax+Image.Data]
        mov esi, edi
; N.B. Speed optimization makes sense here.
align 16
.kloop:
; mov ebx, [esi]
; mov edx, ebx
; mov ecx, ebx
; movzx ebx, bl ; ebx = Y
; shr edx, 16
; mov eax, ebx
; movzx edx, dl ; edx = Cr
; movzx ecx, ch ; ecx = Cb
        movzx ebx, byte [esi] ; ebx = Y
        movzx ecx, byte [esi+1] ; ecx = Cb
        mov eax, ebx
        movzx edx, byte [esi+2] ; edx = Cr
; B = Y + color_table_1[Cb]
        add eax, [color_table_1+ecx*4]
        mov ebp, [color_table_2+ecx*4]
        cmp eax, 80000000h
        sbb ecx, ecx
        and eax, ecx
        add ebp, [color_table_3+edx*4]
        cmp eax, 0x100
        sbb ecx, ecx
        not ecx
        sar ebp, 16
        or eax, ecx
; invert and scale by the 4th channel: ah = round(al * K / 255)
        xor al, 0xFF
        mul byte [esi+3]
        add al, ah
        adc ah, 0
        add al, 80h
        adc ah, 0
        mov byte [edi], ah
; G = Y + color_table_2[Cb] + color_table_3[Cr]
        lea eax, [ebx+ebp]
        cmp eax, 80000000h
        sbb ecx, ecx
        and eax, ecx
        cmp eax, 0x100
        sbb ecx, ecx
        not ecx
        or eax, ecx
        xor al, 0xFF
        mul byte [esi+3]
        add al, ah
        adc ah, 0
        add al, 80h
        adc ah, 0
        mov byte [edi+1], ah
; R = Y + color_table_4[Cr]
        mov eax, ebx
        add eax, [color_table_4+edx*4]
        cmp eax, 80000000h
        sbb ecx, ecx
        and eax, ecx
        cmp eax, 0x100
        sbb ecx, ecx
        not ecx
        or eax, ecx
        xor al, 0xFF
        mul byte [esi+3]
        add al, ah
        adc ah, 0
        add al, 80h
        adc ah, 0
        mov byte [edi+2], ah
        add esi, 4
        add edi, 4 ;3
        sub dword [esp], 1
        jnz .kloop
        pop eax
        pop ebx
; release some memory - must succeed because we decrease size
; add ecx, 44+1
; mov edx, ebx
; push 68
; pop eax
; push 20
; pop ebx
; int 0x40
; mov ebx, eax
        ret

; Decodes one data unit, that is, 8*8 block,
; from input stream, given by pointer esi and length ebp
; N.B. Speed optimization makes sense here.
;-----------------------------------------------------------------------
; decode_data_unit
; In:    ebx -> jpeg.work
;        edx -> component data, addressed below by raw offsets:
;               +12  pointer to the float multipliers used for dequantization
;               +16  value handed to get_huffman_code for the DC coefficient
;               +20  value handed to get_huffman_code for AC coefficients;
;                    in progressive mode it is instead used as a pointer to
;                    the stored coefficients and is advanced by 64*2 bytes
;               +48  DC predictor (added to the decoded DC, then updated)
;               +51  byte subtracted from every output sample
;        esi, ebp = input stream pointer and remaining length
; Out:   64 bytes at jpeg.work.decoded_data;
;        jpeg.work.huffman_bits updated (non-progressive path);
;        ebx again points to jpeg.work on return.
; Clobb: eax, ecx, edi, flags; all eight x87 registers are used (st7 is
;        referenced), so the x87 stack has to be empty on entry.
;        NOTE(review): get_huffman_code / get_bits are macros defined outside
;        this view; their register and stack effects (esi, ebp, edx, ebx)
;        are not visible here.
; Error: on bad data control goes to the .eof_pop* labels, which restore
;        esp from jpeg.work._esp and jump to img.decode.jpg.end instead of
;        returning to the caller.
;-----------------------------------------------------------------------
align 16
decode_data_unit:
; edx -> component data
	cmp	[ebx+jpeg.work.progressive], 0
	jz	@f
; progressive: coefficients are already stored, just take the next block
	mov	edi, [edx+20]
	add	dword [edx+20], 64*2
	jmp	.coeff_decoded
@@:
; clear the 64 coefficient words
	lea	edi, [ebx+jpeg.work.dct_coeff]
	mov	ecx, 64*2/4
	xor	eax, eax
	rep stosd
	mov	edi, zigzag+1		; zigzag entry 0 (DC) is handled separately
	mov	ecx, [ebx+jpeg.work.huffman_bits]
; read DC coefficient
	push	ebx
	mov	eax, [edx+16]
	push	edx
	get_huffman_code 2,3
	get_bits 2,3,true
	pop	ebx
	add	eax, [edx+48]		; add the predictor
	mov	[ebx+jpeg.work.dct_coeff], ax
	mov	[edx+48], ax		; new predictor
; read AC coefficients
	push	ebx
@@:
	mov	eax, [edx+20]
	push	edx
	get_huffman_code 2,3
; the code below uses eax >> 4 as the number of coefficients to skip and
; ebx & 15 as the size of the value that follows (0 = no value)
	shr	eax, 4
	and	ebx, 15
	jz	.band
	add	edi, eax
	cmp	edi, zigzag+64
	jae	.eof_pop2		; run goes past the end of the block
	get_bits 2,3,true
	movzx	ebx, byte [edi]		; byte offset of this coefficient
; NOTE(review): [esp] is expected to hold the jpeg.work pointer pushed
; before the loop (assumes the macros have already popped edx)
	add	ebx, [esp]
	mov	[jpeg.work.dct_coeff+ebx], ax
	add	edi, 1
	cmp	edi, zigzag+64
	jb	@b
	jmp	.do_idct
.band:
	pop	edx
	cmp	al, 15
	jnz	.do_idct		; anything but a run of 15 ends the block
	add	edi, 16			; run of 15 with no value: skip 16 coefficients
	cmp	edi, zigzag+64
	jb	@b
;	jmp	.eof_pop1
.do_idct:
	pop	ebx
	lea	edi, [ebx+jpeg.work.dct_coeff]
	mov	[ebx+jpeg.work.huffman_bits], ecx
; coefficients loaded, now IDCT
.coeff_decoded:
	mov	eax, [edx+12]
	add	ebx, jpeg.work.idct_tmp_area	; undone by the final sub before ret
	push	8			; [esp] = columns left
; first pass: one column per iteration (rows are 16 bytes apart in the
; word input, 32 bytes apart in the multipliers and in the float output)
.idct_loop1:
	mov	cx, word [edi+1*16]
repeat 6
	or	cx, word [edi+(%+1)*16]
end repeat
	jnz	.real_transform
; rows 1..7 of this column are all zero: every output is the scaled row 0
	fild	word [edi]
	fmul	dword [eax]
	fstp	dword [ebx]
	mov	ecx, [ebx]
repeat 7
	mov	[ebx+%*32], ecx
end repeat
	jmp	.idct_next1
.real_transform:
; S0,...,S7 - transformed values, s0,...,s7 - sought-for values
; S0,...,S7 are dequantized;
; dequantization table elements were multiplied to [idct_pre_table],
; so S0,S1,... later denote S0/2\sqrt{2},S1*\cos{\pi/16}/2,...
; sqrt2 = \sqrt{2}, cos = 2\cos{\pi/8},
; cos_sum = -2(\cos{\pi/8}+\cos{3\pi/8}), cos_diff = 2(\cos{\pi/8}-\cos{3\pi/8})
; Now formulas:
; s0 = ((S0+S4)+(S2+S6)) + ((S1+S7)+(S3+S5))
; s7 = ((S0+S4)+(S2+S6)) - ((S1+S7)+(S3+S5))
; val0 = ((cos-1)S1-(cos+cos_sum+1)S3+(cos+cos_sum-1)S5-(cos+1)S7)
; s1 = ((S0-S4)+((sqrt2-1)S2-(sqrt2+1)S6)) + val0
; s6 = ((S0-S4)+((sqrt2-1)S2-(sqrt2+1)S6)) - val0
; val1 = (S1+S7-S3-S5)sqrt2 - val0
; s2 = ((S0-S4)-((sqrt2-1)S2-(sqrt2+1)S6)) + val1
; s5 = ((S0-S4)-((sqrt2-1)S2-(sqrt2+1)S6)) - val1
; val2 = (S1-S7)cos_diff - (S1-S3+S5-S7)cos + val1
; s3 = ((S0+S4)-(S2+S6)) - val2
; s4 = ((S0+S4)-(S2+S6)) + val2
	fild	word [edi+3*16]
	fmul	dword [eax+3*32]
	fild	word [edi+5*16]
	fmul	dword [eax+5*32]
; st0=S5,st1=S3
	fadd	st1,st0
	fadd	st0,st0
	fsub	st0,st1
; st0=S5-S3,st1=S5+S3
	fild	word [edi+1*16]
	fmul	dword [eax+1*32]
	fild	word [edi+7*16]
	fmul	dword [eax+7*32]
; st0=S7,st1=S1
	fsub	st1,st0
	fadd	st0,st0
	fadd	st0,st1
; st0=S1+S7,st1=S1-S7,st2=S5-S3,st3=S5+S3
	fadd	st3,st0
	fadd	st0,st0
	fsub	st0,st3
; st0=S1-S3-S5+S7,st1=S1-S7,st2=S5-S3,st3=S1+S3+S5+S7
	fmul	[idct_sqrt2]
	fld	st2
	fadd	st0,st2
	fmul	[idct_cos]
; st0=(S1-S3+S5-S7)cos,st1=(S1-S3-S5+S7)sqrt2,
; st2=S1-S7,st3=S5-S3,st4=S1+S3+S5+S7
	fxch	st2
	fmul	[idct_cos_diff]
	fsub	st0,st2
; st0=(S1-S7)cos_diff - (S1-S3+S5-S7)cos
	fxch	st3
	fmul	[idct_cos_sum]
	fadd	st0,st2
; st0=(S5-S3)cos_sum+(S1-S3+S5-S7)cos
	fsub	st0,st4
; st0=val0
	fsub	st1,st0
; st0=val0,st1=val1,st2=(S1-S3+S5-S7)cos,
; st3=(S1-S7)cos_diff-(S1-S3+S5-S7)cos,st4=S1+S3+S5+S7
	fxch	st2
	fstp	st0
	fadd	st2,st0
; st0=val1,st1=val0,st2=val2,st3=S1+S3+S5+S7
	fild	word [edi+0*16]
	fmul	dword [eax+0*32]
	fild	word [edi+4*16]
	fmul	dword [eax+4*32]
; st0=S4,st1=S0
	fsub	st1,st0
	fadd	st0,st0
	fadd	st0,st1
; st0=S0+S4,st1=S0-S4
	fild	word [edi+6*16]
	fmul	dword [eax+6*32]
	fild	word [edi+2*16]
	fmul	dword [eax+2*32]
; st0=S2,st1=S6
	fadd	st1,st0
	fadd	st0,st0
	fsub	st0,st1
; st0=S2-S6,st1=S2+S6
	fmul	[idct_sqrt2]
	fsub	st0,st1
	fsub	st3,st0
	fadd	st0,st0
	fadd	st0,st3
; st0=(S0-S4)+((S2-S6)sqrt2-(S2+S6))
; st3=(S0-S4)-((S2-S6)sqrt2-(S2+S6))
	fxch	st1
	fsub	st2,st0
	fadd	st0,st0
	fadd	st0,st2
; st0=(S0+S4)+(S2+S6),st1=(S0-S4)+((S2-S6)sqrt2-(S2+S6)),
; st2=(S0+S4)-(S2+S6),st3=(S0-S4)-((S2-S6)sqrt2-(S2+S6))
; st4=val1,st5=val0,st6=val2,st7=S1+S3+S5+S7
	fsubr	st7,st0
	fadd	st0,st0
	fsub	st0,st7
	fstp	dword [ebx+0*32]
	fsubr	st4,st0
	fadd	st0,st0
	fsub	st0,st4
	fstp	dword [ebx+1*32]
	fadd	st4,st0
	fadd	st0,st0
	fsub	st0,st4
	fstp	dword [ebx+3*32]
	fsubr	st1,st0
	fadd	st0,st0
	fsub	st0,st1
	fstp	dword [ebx+2*32]
	fstp	dword [ebx+5*32]
	fstp	dword [ebx+6*32]
	fstp	dword [ebx+4*32]
	fstp	dword [ebx+7*32]
.idct_next1:
	add	ebx, 4			; next output column
	add	edi, 2			; next input column
	add	eax, 4			; next multiplier column
	sub	dword [esp], 1
	jnz	.idct_loop1
	pop	ecx			; drop the exhausted column counter
	sub	ebx, 8*4		; back to the start of the temporary area
	mov	ecx, 8
; second pass: one 8-float row (32 bytes) per iteration, transformed in
; place and stored back as integers (fistp)
.idct_loop2:
	fld	dword [ebx+3*4]
	fld	dword [ebx+5*4]
	fadd	st1,st0
	fadd	st0,st0
	fsub	st0,st1
; st0=S5-S3,st1=S5+S3
	fld	dword [ebx+1*4]
	fld	dword [ebx+7*4]
	fsub	st1,st0
	fadd	st0,st0
	fadd	st0,st1
; st0=S1+S7,st1=S1-S7,st2=S5-S3,st3=S5+S3
	fadd	st3,st0
	fadd	st0,st0
	fsub	st0,st3
; st0=S1-S3-S5+S7,st1=S1-S7,st2=S5-S3,st3=S1+S3+S5+S7
	fmul	[idct_sqrt2]
	fld	st2
	fadd	st0,st2
	fmul	[idct_cos]
; st0=(S1-S3+S5-S7)cos,st1=(S1-S3-S5+S7)sqrt2,
; st2=S1-S7,st3=S5-S3,st4=S1+S3+S5+S7
	fxch	st2
	fmul	[idct_cos_diff]
	fsub	st0,st2
; st0=(S1-S7)cos_diff - (S1-S3+S5-S7)cos
	fxch	st3
	fmul	[idct_cos_sum]
	fadd	st0,st2
; st0=(S5-S3)cos_sum+(S1-S3+S5-S7)cos
	fsub	st0,st4
; st0=val0
	fsub	st1,st0
; st0=val0,st1=val1,st2=(S1-S3+S5-S7)cos,
; st3=(S1-S7)cos_diff-(S1-S3+S5-S7)cos,st4=S1+S3+S5+S7
	fxch	st2
	fstp	st0
	fadd	st2,st0
; st0=val1,st1=val0,st2=val2,st3=S1+S3+S5+S7
	fld	dword [ebx+0*4]
	fld	dword [ebx+4*4]
	fsub	st1,st0
	fadd	st0,st0
	fadd	st0,st1
; st0=S0+S4,st1=S0-S4
	fld	dword [ebx+6*4]
	fld	dword [ebx+2*4]
	fadd	st1,st0
	fadd	st0,st0
	fsub	st0,st1
; st0=S2-S6,st1=S2+S6
	fmul	[idct_sqrt2]
	fsub	st0,st1
	fsub	st3,st0
	fadd	st0,st0
	fadd	st0,st3
; st0=(S0-S4)+((S2-S6)sqrt2-(S2+S6))
; st3=(S0-S4)-((S2-S6)sqrt2-(S2+S6))
	fxch	st1
	fsub	st2,st0
	fadd	st0,st0
	fadd	st0,st2
; st0=(S0+S4)+(S2+S6),st1=(S0-S4)+((S2-S6)sqrt2-(S2+S6)),
; st2=(S0+S4)-(S2+S6),st3=(S0-S4)-((S2-S6)sqrt2-(S2+S6))
; st4=val1,st5=val0,st6=val2,st7=S1+S3+S5+S7
	fsubr	st7,st0
	fadd	st0,st0
	fsub	st0,st7
	fistp	dword [ebx+0*4]
	fsubr	st4,st0
	fadd	st0,st0
	fsub	st0,st4
	fistp	dword [ebx+1*4]
	fadd	st4,st0
	fadd	st0,st0
	fsub	st0,st4
	fistp	dword [ebx+3*4]
	fsubr	st1,st0
	fadd	st0,st0
	fsub	st0,st1
	fistp	dword [ebx+2*4]
	fistp	dword [ebx+5*4]
	fistp	dword [ebx+6*4]
	fistp	dword [ebx+4*4]
	fistp	dword [ebx+7*4]
	add	ebx, 32
	sub	ecx, 1
	jnz	.idct_loop2
	sub	ebx, 32*8		; back to the start of the temporary area
	mov	ecx, 64
; edi starts one byte before decoded_data because the loop pre-increments it
	lea	edi, [ebx - jpeg.work.idct_tmp_area + jpeg.work.decoded_data - 1]
	push	esi			; esi serves as the clamp mask below
; third pass: add 80h, clamp to 0..255, subtract the byte at [edx+51],
; store one byte per sample
.idct_loop3:
	mov	eax, [ebx]
	add	ebx, 4
	add	eax, 80h
	cmp	eax, 80000000h
	sbb	esi, esi
	add	edi, 1
	and	eax, esi		; negative -> 0
	cmp	eax, 100h
	sbb	esi, esi
	not	esi
	or	eax, esi		; above 255 -> al = FFh
	sub	al, [edx+51]
	sub	ecx, 1
	mov	[edi], al		; mov keeps the flags of the sub for the jnz
	jnz	.idct_loop3
	pop	esi
	sub	ebx, 64*4 + jpeg.work.idct_tmp_area	; ebx -> jpeg.work again
; done
	ret

; Error exits: each label pops one more dword, so that the value pushed
; first ends up in ebx; on the path visible above (.eof_pop2 from the AC
; loop) that is the jpeg.work pointer.
; NOTE(review): .eof_pop3/.eof_pop1/.eof_pop0 have no visible jump to them
; in this block - presumably used by the macros or other routines.
.eof_pop3:
	pop	ebx
.eof_pop2:
	pop	ebx
.eof_pop1:
	pop	ebx
.eof_pop0:
; EOF or incorrect data during scanning
	mov	esp, [ebx + jpeg.work._esp]	; unwind to the frame saved by img.decode.jpg
	jmp	img.decode.jpg.end

; Encoding is not implemented: returns eax = 0 and removes 8 bytes of
; arguments from the stack.
img.encode.jpg:
	xor	eax, eax
	ret	8

; Zig-zag scan order as byte offsets into the 8x8 word array of coefficients;
; 64 entries generated at assembly time, one anti-diagonal at a time
; (8 growing diagonals, then 7 shrinking ones), alternating direction.
zigzag:
; (x,y) -> 2*(x+y*8)
repeat 8
	.cur = %
	if .cur and 1
		repeat %
			db	2*((%-1) + (.cur-%)*8)
		end repeat
	else
		repeat %
			db	2*((.cur-%) + (%-1)*8)
		end repeat
	end if
end repeat
repeat 7
	.cur = %
	if .cur and 1
		repeat 8-%
			db	2*((%+.cur-1) + (8-%)*8)
		end repeat
	else
		repeat 8-%
			db	2*((8-%) + (%+.cur-1)*8)
		end repeat
	end if
end repeat

align 4
idct_pre_table:
; c_0 = 1/(2\sqrt{2}), c_i = cos(i*\pi/16)/2
	dd	0.35355339, 0.49039264, 0.461939766, 0.41573481
	dd	0.35355339, 0.27778512, 0.19134172, 0.09754516
idct_sqrt2	dd	1.41421356	; \sqrt{2}
idct_cos	dd	1.847759065	; 2\cos{\pi/8}
idct_cos_sum	dd	-2.61312593	; -2(\cos{\pi/8} + \cos{3\pi/8})
idct_cos_diff	dd	1.08239220	; 2(\cos{\pi/8} - \cos{3\pi/8})
;---------------------------------------------------------------------