kolibrios-gitea/programs/media/zsea/plugins/jpeg/cnv_jpeg.asm

2379 lines
46 KiB
NASM
Raw Normal View History

;*****************************************************************************
; JPEG to RAW convert plugin - for zSea image viewer
; Copyright (c) 2008, 2009, Evgeny Grechnikov aka Diamond
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in the
; documentation and/or other materials provided with the distribution.
; * Neither the name of the <organization> nor the
; names of its contributors may be used to endorse or promote products
; derived from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY Evgeny Grechnikov ''AS IS'' AND ANY
; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL <copyright holder> BE LIABLE FOR ANY
; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;*****************************************************************************
; Some small changes (c) 2011 Marat Zakiyanov aka Mario79, aka Mario
;*****************************************************************************
format MS COFF
public EXPORTS
section '.flat' code readable align 16
START:
pushad
finit
mov eax,dword [esp+36]
mov esi, [eax] ; esi -> JPEG data
mov ebp, [eax+12] ; ebp = file size
mov [_esp], esp
; initialize constant tables, if not yet
cmp [color_table_1+4], 0
jnz @f
call initialize_color_table
@@:
xor ebx, ebx ; ebx -> RAW data, not allocated yet
mov [dct_buffer], ebx
mov [_ebx], ebx
; check for SOI [Start-Of-Image] marker
call get_marker
jc .end
cmp al, 0xD8 ; SOI?
jz .soi_ok
.end:
; general exit from the function
; for progressive mode: convert loaded DCT coefficients to image
call handle_progressive
; convert full-color images to RGB
call convert_to_rgb
xor eax, eax
test ebx, ebx
jnz @f
inc eax ; ebx=0 => bad image
@@:
.ret:
mov ecx, [esp+28]
mov [ecx+4], ebx ; save RAW data ptr
mov [ecx+8], eax ; save result
popad
ret 4
.soi_ok:
mov [restart_interval], ebx
mov [adobe_ycck], 0
; loop until start of frame (real data), parse markers
.markers_loop:
call get_marker
jc .end
; markers RSTn do not have parameters
; N.B. They can not exist in this part of JPEG, but let's be liberal :)
cmp al, 0xD0
jb @f
cmp al, 0xD8
jb .markers_loop
@@:
cmp al, 0xD9 ; EOI? [invalid here]
jz .end
; ok, this is marker segment
; first word is length of the segment
cmp ebp, 2
jb .end
xor edx, edx
mov dl, [esi+1]
mov dh, [esi] ; edx = marker length, al = marker value
sub ebp, edx
jb .end
cmp al, 0xDB ; DQT?
jz .dqt
cmp al, 0xC4 ; DHT?
jz .dht
cmp al, 0xCC ; DAC? [ignored - no arithmetic coding]
jz .next_marker
cmp al, 0xDD ; DRI?
jz .dri
cmp al, 0xDA ; SOS?
jz .sos
cmp al, 0xC0
jb @f
cmp al, 0xD0
jb .sofn
@@:
cmp al, 0xEE ; APP14?
jz .app14
; unrecognized marker; let's skip it and hope for the best
.next_marker:
add esi, edx
jmp .markers_loop
.app14:
; check for special Adobe marker
cmp dx, 14
jb .next_marker
cmp byte [esi+2], 'A'
jnz .next_marker
cmp dword [esi+3], 'dobe'
jnz .next_marker
cmp byte [esi+13], 2
setz [adobe_ycck]
jmp .next_marker
.dqt:
; DQT marker found
; length: 2 bytes for length field + 65 bytes per table
sub edx, 2
jc .end
lodsw
.dqt_loop:
test edx, edx
jz .markers_loop
sub edx, 1+64
jc .end
lodsb
; 8-bit DCT-based process shall not use a 16-bit precision quantization table.
test al, 0xF0
jnz .end
and eax, 3
mov [eax+quant_tables_defined], 1
shl eax, 8
lea edi, [eax+quant_tables]
xor ecx, ecx
@@:
xor eax, eax
lodsb
push eax
fild dword [esp]
pop eax
movzx eax, byte [zigzag+ecx]
add eax, eax
push eax
and eax, 7*4
fmul dword [idct_pre_table+eax]
pop eax
push eax
shr eax, 3
and eax, 7*4
fmul dword [idct_pre_table+eax]
pop eax
fstp dword [edi+eax]
inc ecx
cmp ecx, 64
jb @b
jmp .dqt_loop
.dri:
; DRI marker found
cmp edx, 4 ; length must be 4
jnz .end2
movzx eax, word [esi+2]
xchg al, ah
mov [restart_interval], eax
jmp .next_marker
.dht:
; DHT marker found
sub edx, 2
jc .end2
lodsw
.dht_loop:
test edx, edx
jz .markers_loop
sub edx, 17
jc .end2
; next Huffman table; find place for it
lodsb
mov edi, eax
and eax, 0x10
and edi, 3
shr eax, 2
or edi, eax
mov [dc_huffman_defined+edi], 1
; shl edi, 11
imul edi, max_hufftable_size
add edi, dc_huffman ; edi -> destination table
; get table size
xor eax, eax
push 16
pop ecx
@@:
add al, [esi]
adc ah, 0
inc esi
loop @b
cmp ax, 0x100
ja .end2
sub edx, eax
jc .end2
; construct Huffman tree
push ebx edx
; lea eax, [edi+256*8]
; push eax
; push 16
; mov edx, esi
; @@:
; cmp byte [edx-1], 0
; jnz @f
; dec edx
; dec dword [esp]
; jmp @b
; @@:
; sub edx, [esp]
; lea eax, [edi+8]
; push 2
; pop ecx
; .lenloop:
; mov bl, byte [edx]
; test bl, bl
; jz .len1done
; push eax
; xor eax, eax
; .len1loop:
; dec ecx
; js .dhterr
; cmp edi, [esp+8]
; jae .dhterr
; lodsb
; stosd
; dec bl
; jnz .len1loop
; pop eax
; .len1done:
; jecxz .len2done
; push ecx
; .len2loop:
; cmp eax, [esp+8]
; jb @f
; or eax, -1
; @@:
; cmp edi, [esp+8]
; jae .dhterr
; stosd
; add eax, 8
; jnb @f
; or eax, -1
; @@:
; loop .len2loop
; pop ecx
; .len2done:
; add ecx, ecx
; inc edx
; dec dword [esp]
; jnz .lenloop
; pop eax
; pop eax
; sub eax, edi
; shr eax, 2
; cmp eax, ecx
; ja @f
; mov ecx, eax
; @@:
; or eax, -1
; rep stosd
; pop edx ebx
; jmp .dht_loop
; .dhterr:
; ;pop eax eax eax edx ebx
; add esp, 5*4
lea eax, [edi+256*2]
push eax
lea edx, [esi-16]
mov ah, 1
mov ecx, 128
.dht_l1:
movzx ebx, byte [edx]
inc edx
test ebx, ebx
jz .dht_l3
.dht_l2:
cmp edi, [esp]
jae .dhterr1
lodsb
xchg al, ah
push ecx
rep stosw
pop ecx
xchg al, ah
dec ebx
jnz .dht_l2
.dht_l3:
inc ah
shr ecx, 1
jnz .dht_l1
push edi
mov edi, [esp+4]
push edi
mov eax, 0x00090100
mov cl, 8
.dht_l4:
movzx ebx, byte [edx]
inc edx
test ebx, ebx
jz .dht_l6
.dht_l5:
cmp edi, [esp]
jb @f
mov edi, [esp+4]
rol eax, 16
cmp edi, [esp+8]
jae .dhterr2
stosw
inc ah
mov [esp+4], edi
pop edi
push edi
rol eax, 16
add dword [esp], 16*2
@@:
lodsb
xchg al, ah
push ecx
rep stosw
pop ecx
xchg al, ah
dec ebx
jnz .dht_l5
.dht_l6:
inc ah
shr ecx, 1
jnz .dht_l4
push edi
movzx ebx, byte [edx]
add ebx, ebx
add bl, [edx+1]
adc bh, 0
add ebx, ebx
add bl, [edx+2]
adc bh, 0
add ebx, ebx
add bl, [edx+3]
adc bh, 0
add ebx, 15
shr ebx, 4
mov cl, 8
lea ebx, [edi+ebx*2]
sub ebx, [esp+12]
add ebx, 31
shr ebx, 5
mov edi, ebx
shl edi, 5
add edi, [esp+12]
xor ebx, 9
shl ebx, 16
xor eax, ebx
push edi
.dht_l7:
movzx ebx, byte [edx]
inc edx
test ebx, ebx
jz .dht_l10
.dht_l8:
cmp edi, [esp]
jb .dht_l9
mov edi, [esp+4]
cmp edi, [esp+8]
jb @f
mov edi, [esp+12]
cmp edi, [esp+16]
jae .dhterr3
mov al, 9
stosb
rol eax, 8
stosb
inc eax
ror eax, 8
mov [esp+12], edi
mov edi, [esp+8]
add dword [esp+8], 16*2
@@:
mov al, 9
stosb
rol eax, 16
stosb
inc eax
ror eax, 16
mov [esp+4], edi
pop edi
push edi
add dword [esp], 16*2
.dht_l9:
lodsb
xchg al, ah
push ecx
rep stosw
pop ecx
xchg al, ah
dec ebx
jnz .dht_l8
.dht_l10:
inc ah
shr ecx, 1
jnz .dht_l7
push -1
pop eax
pop ecx
sub ecx, edi
rep stosb
pop edi
pop ecx
sub ecx, edi
rep stosb
pop edi
pop ecx
sub ecx, edi
rep stosb
pop edx ebx
jmp .dht_loop
.dhterr3:
pop eax eax
.dhterr2:
pop eax eax
.dhterr1:
pop eax
pop edx ebx
.end2:
jmp .end
.sofn:
; SOFn marker found
test ebx, ebx
jnz .end2 ; only one frame is allowed
; only SOF0 [baseline sequential], SOF1 [extended sequential], SOF2 [progressive]
; nobody supports other compression methods
cmp al, 0xC2
ja .end2
setz [progressive]
; Length must be at least 8
sub edx, 8
jb .end2
; Sample precision in JFIF must be 8 bits
cmp byte [esi+2], 8
jnz .end2
; Color space in JFIF is either YCbCr (color images, 3 components)
; or Y (grey images, 1 component)
movzx eax, byte [esi+7]
cmp al, 1
jz @f
cmp al, 3
jz @f
; Adobe products sometimes use YCCK color space with 4 components
cmp al, 4
jnz .end2
cmp [adobe_ycck], 0
jz .end2
@@:
mov edi, eax ; edi = number of components
lea eax, [eax*3]
sub edx, eax
jnz .end2
; get width and height
; width must be nonzero
; height must be nonzero - nobody supports DNL markers
mov ah, [esi+3]
mov al, [esi+4] ; eax = height
test eax, eax
jz .end2
xor ecx, ecx
mov ch, [esi+5]
mov cl, [esi+6] ; ecx = width
jecxz .end3
push eax ecx
imul ecx, eax
cmp ecx, 4000000h
jb @f
pop ecx eax
.end3:
jmp .end2
@@:
imul ecx, edi
push ecx
add ecx, 44+1
; for grayscale images, allocate additional memory for palette
cmp edi, 1
jnz @f
add ecx, 256*4-1
@@:
push 68
pop eax
push 12
pop ebx
int 0x40
mov ebx, eax
test eax, eax
jnz @f
pop ecx
mov al, 2
jmp .ret
@@:
; OS zeroes all allocated memory
; push edx edi
; mov edi, eax
; mov edx, ecx
; shr ecx, 2
; xor eax, eax
; rep stosd
; mov ecx, edx
; and ecx, 3
; rep stosb
; mov ecx, edx
; pop edi edx
mov [_ebx], ebx
pop dword [ebx+32] ; size of pixels area
push 44
pop eax
mov dword [ebx+28], eax ; pointer to RGB data
; create grayscale palette if needed
cmp edi, 1
jnz .no_create_palette
mov dword [ebx+20], eax
mov ecx, 256*4
mov dword [ebx+24], ecx
add dword [ebx+28], ecx
push edi
shr ecx, 2
xor eax, eax
lea edi, [ebx+44]
@@:
stosd
add eax, 0x010101
loop @b
pop edi
.no_create_palette:
; other image characteristics
pop ecx eax
mov dword [ebx], 'RAW ' ; signature
mov dword [ebx+4], ecx ; width
mov dword [ebx+8], eax ; height
mov eax, edi
shl eax, 3
mov dword [ebx+12], eax ; total pixel size
mov byte [ebx+16], 8 ; 8 bits per component
mov word [ebx+18], di ; number of components
mov [delta_x], eax
mov [pixel_size], edi
;mov eax, edi
imul eax, ecx
mov [delta_y], eax
shr eax, 3
mov [line_size], eax
; and dword [ebx+36], 0 ; transparency data pointer = NULL
; and dword [ebx+40], 0 ; transparency data size
add esi, 8
mov ecx, edi
mov edi, components
xor eax, eax
xor edx, edx
.sof_parse_comp:
movsb ; db ComponentIdentifier
lodsb
mov ah, al
and al, 0xF
jz .end3
shr ah, 4
jz .end3
stosd ; db V, db H, db ?, db ? (will be filled later)
cmp dl, al
ja @f
mov dl, al
@@:
cmp dh, ah
ja @f
mov dh, ah
@@:
movsb ; db QuantizationTableID
loop .sof_parse_comp
mov word [max_v], dx
movzx eax, dh
movzx edx, dl
push eax edx
shl eax, 3
shl edx, 3
mov [block_width], eax
mov [block_height], edx
pop edx eax
push eax edx
imul eax, [delta_x]
mov [block_delta_x], eax
imul edx, [delta_y]
mov [block_delta_y], edx
mov eax, [ebx+4]
add eax, [block_width]
dec eax
xor edx, edx
div [block_width]
mov [x_num_blocks], eax
mov eax, [ebx+8]
add eax, [block_height]
dec eax
xor edx, edx
div [block_height]
mov [y_num_blocks], eax
mov cl, [ebx+18]
pop edx
mov edi, components
@@:
mov eax, edx
div byte [edi+1] ; VMax / V_i = VFactor_i
mov byte [edi+3], al ; db VFactor
pop eax
push eax
div byte [edi+2] ; HMax / H_i = HFactor_i
mov byte [edi+4], al ; db HFactor
add edi, 6
loop @b
pop eax
cmp [progressive], 0
jz .sof_noprogressive
mov eax, [x_num_blocks]
mul [block_width]
mul [y_num_blocks]
mul [block_height]
add eax, eax
mov [dct_buffer_size], eax
mul [pixel_size]
xchg eax, ecx
push ebx
push 68
pop eax
push 12
pop ebx
int 0x40
pop ebx
test eax, eax
jnz @f
mov ecx, ebx
push 68
pop eax
push 13
pop ebx
int 0x40
xor ebx, ebx
jmp .end
@@:
mov [dct_buffer], eax
.sof_noprogressive:
jmp .markers_loop
.sos:
; SOS marker found
; frame must be already opened
test ebx, ebx
jz .end3
cmp edx, 6
jb .end3
; parse marker
movzx eax, byte [esi+2] ; number of components in this scan
test eax, eax
jz .end3 ; must be nonzero
cmp al, [ebx+18]
ja .end3 ; must be <= total number of components
; mov [ns], eax
cmp al, 1
setz [not_interleaved]
lea ecx, [6+eax+eax]
cmp edx, ecx
jnz .end3
mov ecx, eax
mov edi, cur_components
add esi, 3
.sos_find_comp:
lodsb ; got ComponentID, look for component info
push ecx esi
mov cl, [ebx+18]
mov esi, components
and dword [edi+48], 0
and dword [edi+52], 0
@@:
cmp [esi], al
jz @f
inc dword [edi+52]
add esi, 6
loop @b
@@:
mov eax, [esi+1]
mov dl, [esi+5]
pop esi ecx
jnz .end3 ; bad ComponentID
cmp [not_interleaved], 0
jz @f
mov ax, 0x0101
@@:
stosd ; db V, db H, db VFactor, db HFactor
xor eax, eax
mov al, byte [edi-1] ; get HFactor
mul byte [ebx+18] ; number of components
stosd ; HIncrement_i = HFactor_i * sizeof(pixel)
mov al, byte [edi-4-2] ; get VFactor
mul byte [ebx+18] ; number of components
imul eax, [ebx+4] ; image width
stosd ; VIncrement_i = VFactor_i * sizeof(row)
xchg eax, edx
and eax, 3
cmp [quant_tables_defined+eax], 0
jz .end3
shl eax, 8
add eax, quant_tables
stosd ; dd QuantizationTable
lodsb
movzx eax, al
mov edx, eax
shr eax, 4
and edx, 3
and eax, 3
cmp [dc_huffman_defined+eax], 0
jnz .dc_table_ok
cmp [progressive], 0
jz .end3
xor eax, eax
jmp .dc_table_done
.dc_table_ok:
; shl eax, 11
imul eax, max_hufftable_size
add eax, dc_huffman
.dc_table_done:
cmp [ac_huffman_defined+edx], 0
jnz .ac_table_ok
cmp [progressive], 0
jz .end3
xor edx, edx
jmp .ac_table_done
.ac_table_ok:
; shl edx, 11
imul edx, max_hufftable_size
add edx, ac_huffman
.ac_table_done:
stosd ; dd DCTable
xchg eax, edx
stosd ; dd ACTable
push ecx
mov eax, [ebx+4]
movzx ecx, byte [edi-21] ; get HFactor
cdq ; edx:eax = width (width<0x10000, so as dword it is unsigned)
div ecx
stosd ; dd width / HFactor_i
stosd
xchg eax, ecx
inc eax
sub eax, edx
stosd ; dd HFactor_i+1 - (width % HFactor_i)
mov eax, [ebx+8]
movzx ecx, byte [edi-34] ; get VFactor
cdq
div ecx
stosd ; dd height / VFactor_i
stosd
xchg eax, ecx
inc eax
sub eax, edx
stosd ; dd VFactor_i+1 - (height % VFactor_i)
pop ecx
scasd ; dd DCPrediction
cmp dword [edi], 0
setnp al
ror al, 1
mov byte [edi-1], al
scasd ; dd ComponentOffset
dec ecx
jnz .sos_find_comp
mov [cur_components_end], edi
mov edi, ScanStart
movsb
cmp byte [esi], 63
ja .end3
movsb
lodsb
push eax
and al, 0xF
stosb
pop eax
shr al, 4
stosb
; now unpack data
call init_limits
and [decoded_MCUs], 0
mov [cur_rst_marker], 7
and [huffman_bits], 0
cmp [progressive], 0
jz .sos_noprogressive
; progressive mode - only decode DCT coefficients
; initialize pointers to coefficients data
; zero number of EOBs for AC coefficients
; redefine HIncrement and VIncrement
mov edi, cur_components
.coeff_init:
mov eax, [dct_buffer_size]
mul dword [edi+52]
add eax, [dct_buffer]
mov [edi+12], eax
and dword [edi+52], 0
cmp [ScanStart], 0
jz .scan_dc
cmp dword [edi+20], 0
jz .end3
jmp @f
.scan_dc:
cmp dword [edi+16], 0
jz .end3
@@:
movzx eax, byte [edi+1]
shl eax, 7
mov [edi+4], eax
mov eax, [edi+28]
mov cl, [edi+3]
cmp cl, [edi+32]
sbb eax, -7-1
shr eax, 3
shl eax, 7
mov [edi+8], eax
add edi, 56
cmp edi, [cur_components_end]
jb .coeff_init
; unpack coefficients
; N.B. Speed optimization has sense here.
push ebx
.coeff_decode_loop:
mov edx, cur_components
.coeff_components_loop:
mov edi, [edx+12]
movzx ecx, byte [edx]
push dword [edx+40]
push edi
.coeff_y_loop:
push ecx
movzx eax, byte [edx+1]
push dword [edx+28]
push edi
.coeff_x_loop:
cmp dword [edx+40], 0
jl @f
cmp dword [edx+28], 0
jge .realdata
@@:
cmp [not_interleaved], 0
jnz .norealdata
push eax edi
mov edi, dct_coeff
call decode_progressive_coeff
pop edi eax
jmp .norealdata
.realdata:
push eax
call decode_progressive_coeff
add edi, 64*2
pop eax
.norealdata:
sub dword [edx+28], 8
sub eax, 1
jnz .coeff_x_loop
pop edi
pop dword [edx+28]
add edi, [edx+8]
pop ecx
sub dword [edx+40], 8
sub ecx, 1
jnz .coeff_y_loop
movzx eax, byte [edx+1]
shl eax, 3
pop edi
add edi, [edx+4]
pop dword [edx+40]
sub [edx+28], eax
mov [edx+12], edi
add edx, 56
cmp edx, [cur_components_end]
jnz .coeff_components_loop
call next_MCU
jc .norst
sub [cur_x], 1
jnz .coeff_decode_loop
call next_line
mov edx, cur_components
@@:
mov eax, [max_x]
imul eax, [edx+4]
sub [edx+12], eax
movzx eax, byte [edx]
imul eax, [edx+8]
add [edx+12], eax
add edx, 56
cmp edx, [cur_components_end]
jnz @b
sub [cur_y], 1
jnz .coeff_decode_loop
pop ebx
jmp .markers_loop
.norst:
pop ebx
jmp .end4
.sos_noprogressive:
; normal mode - unpack JPEG image
mov edi, [ebx+28]
add edi, ebx
mov [cur_out_ptr], edi
; N.B. Speed optimization has sense here.
.decode_loop:
call decode_MCU
call next_MCU
jc .end4
sub [cur_x], 1
jnz .decode_loop
call next_line
sub [cur_y], 1
jnz .decode_loop
jmp .markers_loop
.end4:
jmp .end3
;---------------------------------------------------------------------
get_marker:
; in: esi -> data
; out: CF=0, al=marker value - ok
; CF=1 - no marker
sub ebp, 1
jc .ret
lodsb
if 1
cmp al, 0xFF
jae @f
; Some stupid men, which do not read specifications and manuals,
; sometimes create markers with length field two less than true
; value (in JPEG length of marker = length of data INCLUDING
; length field itself). To open such files, allow 2 bytes
; before next marker.
cmp ebp, 2
jb .ret
lodsb
lodsb
end if
cmp al, 0xFF
jb .ret
@@:
sub ebp, 1
jc .ret
lodsb
cmp al, 0xFF
jz @b
clc
.ret:
ret
;---------------------------------------------------------------------
align 16
decode_MCU:
mov edx, cur_components
.components_loop:
; decode each component
push [cur_out_ptr]
movzx ecx, byte [edx]
push dword [edx+40]
; we have H_i * V_i blocks of packed data, decode them
.y_loop_1:
push [cur_out_ptr]
push ecx
movzx eax, byte [edx+1]
push dword [edx+28]
.x_loop_1:
push eax
call decode_data_unit
cmp dword [edx+40], 0
jl .nocopyloop
cmp dword [edx+28], 0
jl .nocopyloop
; now we have decoded block 8*8 in decoded_data
; H_i * V_i packed blocks 8*8 make up one block (8*HMax) * (8*VMax)
; so each pixel in packed block corresponds to HFact * VFact pixels
movzx ecx, byte [edx+2]
push esi ebp
mov edi, [cur_out_ptr]
add edi, [edx+52]
.y_loop_2:
push ecx edi
cmp ecx, [edx+44]
mov ecx, [edx+40]
sbb ecx, 8-1
sbb eax, eax
and ecx, eax
add ecx, 8
jz .skip_x_loop_2
movzx eax, byte [edx+3]
.x_loop_2:
push eax ecx edi
cmp eax, [edx+32]
mov eax, [edx+28]
sbb eax, 8-1
sbb ebp, ebp
and eax, ebp
mov ebp, .copyiter_all
mov esi, decoded_data
sub ebp, eax
sub ebp, eax
sub ebp, eax
mov eax, [edx+4]
sub eax, 1
.copyloop:
push esi edi
jmp ebp
.copyiter_all:
movsb
repeat 7
add edi, eax
movsb
end repeat
nop
nop
pop edi esi
add edi, [edx+8]
add esi, 8
sub ecx, 1
jnz .copyloop
pop edi ecx eax
add edi, [pixel_size]
sub eax, 1
jnz .x_loop_2
.skip_x_loop_2:
pop edi ecx
add edi, [line_size]
sub ecx, 1
jnz .y_loop_2
pop ebp esi
.nocopyloop:
mov eax, [delta_x]
add [cur_out_ptr], eax
pop eax
sub dword [edx+28], 8
sub eax, 1
jnz .x_loop_1
pop dword [edx+28]
pop ecx
pop eax
sub dword [edx+40], 8
add eax, [delta_y]
mov [cur_out_ptr], eax
sub ecx, 1
jnz .y_loop_1
movzx eax, byte [edx+1]
pop dword [edx+40]
shl eax, 3
pop [cur_out_ptr]
sub dword [edx+28], eax
add edx, 56
cmp edx, [cur_components_end]
jb .components_loop
mov eax, [cur_block_dx]
add [cur_out_ptr], eax
ret
align 16
next_MCU:
add [decoded_MCUs], 1
mov eax, [restart_interval]
test eax, eax
jz .no_restart
cmp [decoded_MCUs], eax
jb .no_restart
and [decoded_MCUs], 0
and [huffman_bits], 0
cmp [cur_x], 1
jnz @f
cmp [cur_y], 1
jz .no_restart
@@:
; restart marker must be present
sub ebp, 2
js .error
cmp byte [esi], 0xFF
jnz .error
mov al, [cur_rst_marker]
inc eax
and al, 7
mov [cur_rst_marker], al
add al, 0xD0
cmp [esi+1], al
jnz .error
add esi, 2
; handle restart marker - zero all DC predictions
mov edx, cur_components
@@:
and word [edx+48], 0
add edx, 56
cmp edx, [cur_components_end]
jb @b
.no_restart:
clc
ret
.error:
stc
ret
next_line:
mov eax, [max_x]
mov [cur_x], eax
mul [cur_block_dx]
sub eax, [cur_block_dy]
sub [cur_out_ptr], eax
mov edx, cur_components
@@:
mov eax, [edx+24]
mov [edx+28], eax
movzx eax, byte [edx]
shl eax, 3
sub [edx+40], eax
add edx, 56
cmp edx, [cur_components_end]
jb @b
ret
init_limits:
push [x_num_blocks]
pop [max_x]
push [y_num_blocks]
pop [max_y]
push [block_delta_x]
pop [cur_block_dx]
push [block_delta_y]
pop [cur_block_dy]
cmp [not_interleaved], 0
jz @f
mov eax, dword [cur_components+28]
movzx ecx, byte [cur_components+3]
cmp cl, [cur_components+32]
sbb eax, -7-1
shr eax, 3
mov [max_x], eax
mov eax, dword [cur_components+40]
movzx edx, byte [cur_components+2]
cmp dl, [cur_components+44]
sbb eax, -7-1
shr eax, 3
mov [max_y], eax
imul ecx, [delta_x]
mov [cur_block_dx], ecx
imul edx, [delta_y]
mov [cur_block_dy], edx
@@:
push [max_x]
pop [cur_x]
push [max_y]
pop [cur_y]
ret
;macro get_bit
;{
;local .l1,.l2,.marker
; add cl, cl
; jnz .l1
; sub ebp, 1
; js decode_data_unit.eof
; mov cl, [esi]
; cmp cl, 0xFF
; jnz .l2
;.marker:
; add esi, 1
; sub ebp, 1
; js decode_data_unit.eof
; cmp byte [esi], 0xFF
; jz .marker
; cmp byte [esi], 0
; jnz decode_data_unit.eof
;.l2:
; sub esi, -1
; adc cl, cl
;.l1:
;}
macro get_bit
{
local .l1,.l2,.marker
sub cl, 1
jns .l1
sub ebp, 1
js .eof
mov ch, [esi]
cmp ch, 0xFF
jnz .l2
.marker:
add esi, 1
sub ebp, 1
js .eof
cmp byte [esi], 0xFF
jz .marker
cmp byte [esi], 0
jnz .eof
.l2:
add esi, 1
mov cl, 7
.l1:
add ch, ch
}
macro get_bits restore_edx
{
local .l1,.l2,.l3,.marker2
movzx eax, ch
mov dl, cl
shl eax, 24
neg cl
push ebx
add cl, 24
.l1:
cmp bl, dl
jbe .l2
sub bl, dl
sub ebp, 1
js .eof
mov ch, [esi]
cmp ch, 0xFF
jnz .l3
.marker2:
add esi, 1
sub ebp, 1
js .eof
cmp byte [esi], 0xFF
jz .marker2
cmp byte [esi], 0
jnz .eof
.l3:
movzx edx, ch
add esi, 1
shl edx, cl
sub cl, 8
or eax, edx
mov dl, 8
jmp .l1
.l2:
mov cl, bl
sub dl, bl
shl ch, cl
pop ebx
cmp eax, 80000000h
rcr eax, 1
mov cl, 31
sub cl, bl
sar eax, cl
mov cl, dl
if restore_edx eq true
pop edx
end if
add eax, 80000000h
adc eax, 80000000h
}
; macro get_huffman_code
; {
; local .l1
; xor ebx, ebx
; .l1:
; get_bit
; adc ebx, ebx
; mov eax, [eax+4*ebx]
; xor ebx, ebx
; cmp eax, -1
; jz .eof
; cmp eax, 0x1000
; jae .l1
; mov ebx, eax
; }
macro get_huffman_code
{
local .l1,.l2,.l3,.l4,.l5,.l6,.nomarker1,.marker1,.nomarker2,.marker2,.nomarker3,.marker3,.done
; 1. (First level in Huffman table) Does the current Huffman code fit in 8 bits
; and have we got enough bits?
movzx ebx, ch
cmp byte [eax+ebx*2], cl
jbe .l1
; 2a. No; load next byte
sub ebp, 1
js .eof
mov ch, [esi]
movzx edx, ch
cmp ch, 0xFF
jnz .nomarker1
.marker1:
add esi, 1
sub ebp, 1
js .eof
cmp byte [esi], 0xFF
jz .marker1
cmp byte [esi], 0
jnz .eof
.nomarker1:
shr edx, cl
add esi, 1
or ebx, edx
; 3a. (First level in Huffman table, >=8 bits known) Does the current Huffman code fit in 8 bits?
cmp byte [eax+ebx*2], 8
jbe .l2
jl .eof
; 4aa. No; go to next level
movzx ebx, byte [eax+ebx*2+1]
mov dl, ch
shl ebx, 5
ror edx, cl
lea ebx, [eax+ebx+0x200]
shr edx, 24
push edx
shr edx, 4
; 5aa. (Second level in Huffman table) Does the current Huffman code fit in 12 bits
; and have we got enough bits?
cmp byte [ebx+edx*2], cl
jbe .l3
; 6aaa. No; have we got 12 bits?
cmp cl, 4
jae .l4
; 7aaaa. No; load next byte
pop edx
sub ebp, 1
js .eof
mov ch, [esi]
cmp ch, 0xFF
jnz .nomarker2
.marker2:
add esi, 1
sub ebp, 1
js .eof
cmp byte [esi], 0xFF
jz .marker2
cmp byte [esi], 0
jnz .eof
.nomarker2:
push ecx
shr ch, cl
add esi, 1
or dl, ch
pop ecx
push edx
shr edx, 4
; 8aaaa. (Second level in Huffman table) Does the current Huffman code fit in 12 bits?
cmp byte [ebx+edx*2], 4
jbe .l5
jl .eof
; 9aaaaa. No; go to next level
movzx ebx, byte [ebx+edx*2+1]
pop edx
shl ebx, 5
and edx, 0xF
lea ebx, [eax+ebx+0x200]
; 10aaaaa. Get current code length and value
sub cl, [ebx+edx*2]
movzx eax, byte [ebx+edx*2+1]
neg cl
shl ch, cl
neg cl
add cl, 8
jmp .done
.l5:
; 9aaaab. Yes; get current code length and value
sub cl, [ebx+edx*2]
movzx eax, byte [ebx+edx*2+1]
neg cl
pop edx
shl ch, cl
neg cl
add cl, 8
jmp .done
.l4:
; 7aaab. Yes; go to next level
movzx ebx, byte [ebx+edx*2+1]
pop edx
shl ebx, 5
and edx, 0xF
lea ebx, [eax+ebx+0x200]
; 8aaab. (Third level in Huffman table) Have we got enough bits?
cmp [ebx+edx*2], cl
jbe .l6
; 9aaaba. No; load next byte
sub ebp, 1
js .eof
mov ch, [esi]
cmp ch, 0xFF
jnz .nomarker3
.marker3:
add esi, 1
sub ebp, 1
js .eof
cmp byte [esi], 0xFF
jz .marker3
cmp byte [esi], 0
jnz .eof
.nomarker3:
push ecx
shr ch, cl
add esi, 1
or dl, ch
pop ecx
; 10aaaba. Get current code length and value
sub cl, [ebx+edx*2]
movzx eax, byte [ebx+edx*2+1]
neg cl
shl ch, cl
neg cl
add cl, 8
jmp .done
.l3:
; 6aab. Yes; get current code length and value
pop eax
.l6:
; 9aaabb. Yes; get current code length and value
sub cl, [ebx+edx*2]
movzx eax, byte [ebx+edx*2+1]
xor cl, 7
shl ch, cl
xor cl, 7
add ch, ch
jmp .done
.l2:
; 3ab. Yes; get current code length and value
sub cl, [eax+ebx*2]
movzx eax, byte [eax+ebx*2+1]
neg cl
shl ch, cl
neg cl
add cl, 8
jmp .done
.l1:
; 3b. Yes; get current code length and value
mov dl, [eax+ebx*2]
movzx eax, byte [eax+ebx*2+1]
xchg cl, dl
sub dl, cl
shl ch, cl
mov cl, dl
.done:
mov ebx, eax
}
; Decode DCT coefficients for one 8*8 block in progressive mode
; from input stream, given by pointer esi and length ebp
; N.B. Speed optimization has sense here.
align 16
decode_progressive_coeff:
mov ecx, [huffman_bits]
cmp [ScanStart], 0
jnz .ac
; DC coefficient
cmp [ApproxPosHigh], 0
jz .dc_first
; DC coefficient, subsequent passes
xor eax, eax
get_bit
adc eax, eax
mov [huffman_bits], ecx
mov cl, [ApproxPosLow]
shl eax, cl
or [edi], ax
ret
.dc_first:
; DC coefficient, first pass
mov eax, [edx+16]
push edx
get_huffman_code
get_bits true
add eax, [edx+48]
mov [edx+48], ax
mov [huffman_bits], ecx
mov cl, [ApproxPosLow]
shl eax, cl
mov [edi], ax
ret
.ac:
; AC coefficients
movzx eax, [ScanStart]
cmp al, [ScanEnd]
ja .ret
cmp dword [edx+52], 0
jnz .was_eob
.acloop:
push edx
push eax
mov eax, [edx+20]
get_huffman_code
pop eax
test ebx, 15
jz .band
push eax ebx
and ebx, 15
get_bits false
pop ebx
xchg eax, [esp]
shr ebx, 4
.zeroloop1:
push eax ebx
movzx eax, byte [zigzag+eax]
xor ebx, ebx
cmp word [edi+eax], bx
jz .zeroloop2
get_bit
jnc @f
push ecx
mov cl, [ApproxPosLow]
xor ebx, ebx
cmp byte [edi+eax+1], 80h
adc ebx, 0
add ebx, ebx
sub ebx, 1
shl ebx, cl
pop ecx
add [edi+eax], bx
@@:
pop ebx eax
@@:
add eax, 1
cmp al, [ScanEnd]
ja decode_data_unit.eof
jmp .zeroloop1
.zeroloop2:
pop ebx eax
sub ebx, 1
jns @b
.nozero1:
pop ebx
test ebx, ebx
jz @f
push eax
movzx eax, byte [zigzag+eax]
push ecx
mov cl, [ApproxPosLow]
shl ebx, cl
pop ecx
mov [edi+eax], bx
pop eax
@@:
add eax, 1
pop edx
cmp al, [ScanEnd]
jbe .acloop
mov [huffman_bits], ecx
.ret:
ret
.eof:
jmp decode_data_unit.eof
.band:
shr ebx, 4
cmp ebx, 15
jnz .eob
push 0
jmp .zeroloop1
.eob:
pop edx
push eax
mov eax, 1
test ebx, ebx
jz .eob0
@@:
get_bit
adc eax, eax
sub ebx, 1
jnz @b
.eob0:
mov [edx+52], eax
pop eax
.was_eob:
sub dword [edx+52], 1
cmp al, [ScanEnd]
ja .ret2
.zeroloop3:
push eax
movzx eax, byte [zigzag+eax]
xor ebx, ebx
cmp word [edi+eax], bx
jz @f
get_bit
jnc @f
push ecx
mov cl, [ApproxPosLow]
xor ebx, ebx
cmp byte [edi+eax+1], 80h
adc ebx, 0
add ebx, ebx
sub ebx, 1
shl ebx, cl
pop ecx
add [edi+eax], bx
@@:
pop eax
add eax, 1
cmp al, [ScanEnd]
jbe .zeroloop3
.ret2:
mov [huffman_bits], ecx
ret
handle_progressive:
cmp [dct_buffer], 0
jnz @f
ret
@@:
; information for all components
mov esi, components
xor ebp, ebp
movzx ecx, byte [ebx+18]
.next_component:
mov edi, cur_components
lodsb ; ComponentID
lodsd
mov ax, 0x0101
stosd ; db V, db H, db VFactor, db HFactor
xor eax, eax
mov al, byte [edi-1] ; get HFactor
mul byte [ebx+18] ; number of components
stosd ; HIncrement_i = HFactor_i * sizeof(pixel)
mov al, byte [edi-4-2] ; get VFactor
mul byte [ebx+18] ; number of components
mul dword [ebx+4] ; image width
stosd ; VIncrement_i = VFactor_i * sizeof(row)
lodsb
and eax, 3
cmp [quant_tables_defined+eax], 0
jz .error
shl eax, 8
add eax, quant_tables
stosd ; dd QuantizationTable
stosd ; dd DCTable - ignored
mov eax, ebp
mul [dct_buffer_size]
add eax, [dct_buffer]
stosd ; instead of dd ACTable - pointer to current DCT coefficients
push ecx
mov eax, [ebx+4]
movzx ecx, byte [edi-21] ; get HFactor
; cdq ; edx = 0 as a result of previous mul
div ecx
stosd ; dd width / HFactor_i
stosd
xchg eax, ecx
inc eax
sub eax, edx
stosd ; dd HFactor_i+1 - (width % HFactor_i)
mov eax, [ebx+8]
movzx ecx, byte [edi-34] ; get VFactor
cdq
div ecx
stosd ; dd height / VFactor_i
stosd
xchg eax, ecx
inc eax
sub eax, edx
stosd ; dd VFactor_i+1 - (height % VFactor_i)
pop ecx
xor eax, eax
test ebp, ebp
setnp al
ror eax, 1
stosd ; dd DCPrediction
mov eax, ebp
stosd ; dd ComponentOffset
inc ebp
push ecx
mov [cur_components_end], edi
lea edx, [edi-56]
; do IDCT and unpack
mov edi, [ebx+28]
add edi, ebx
mov [cur_out_ptr], edi
mov [not_interleaved], 1
call init_limits
.decode_loop:
call decode_MCU
sub [cur_x], 1
jnz .decode_loop
call next_line
sub [cur_y], 1
jnz .decode_loop
pop ecx
dec ecx
jnz .next_component
; image unpacked, return
.error:
push ebx
push 68
pop eax
push 13
pop ebx
mov ecx, [dct_buffer]
int 0x40
pop ebx
ret
; Support for YCbCr -> RGB conversion
; R = Y + 1.402 * (Cr - 128)
; G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
; B = Y + 1.772 * (Cb - 128)
; When converting YCbCr -> RGB, we need to do some multiplications;
; to be faster, we precalculate the table for all 256 possible values
; Also we approximate fractions with N/65536, this gives sufficient precision
initialize_color_table:
; 1.402 = 1 + 26345/65536, -0.71414 = -46802/65536
; -0.34414 = -22554/65536, 1.772 = 1 + 50594/65536
mov edi, color_table_1
mov ecx, 128
; 1. Cb -> 1.772*Cb
xor eax, eax
mov dx, 8000h
.l1:
push ecx
@@:
stosd
add dx, 50594
adc eax, 1
loop @b
neg dx
adc eax, -1
neg eax
pop ecx
jnz .l1
; 2. Cb -> -0.34414*Cb
mov ax, dx
.l2:
push ecx
@@:
stosd
sub eax, 22554
loop @b
neg eax
pop ecx
cmp ax, dx
jnz .l2
xor eax, eax
; 3. Cr -> -0.71414*Cr
.l3:
push ecx
@@:
stosd
sub eax, 46802
loop @b
neg eax
pop ecx
jnz .l3
; 4. Cr -> 1.402*Cr
.l4:
push ecx
@@:
stosd
add dx, 26345
adc eax, 1
loop @b
neg dx
adc eax, -1
neg eax
pop ecx
jnz .l4
ret
; this function is called in the end of image loading
convert_to_rgb:
; some checks
test ebx, ebx ; image exists?
jz .ret
cmp byte [ebx+18], 3 ; full-color image?
jz .ycc2rgb
cmp byte [ebx+18], 4
jz .ycck2rgb
.ret:
ret
.ycc2rgb:
; conversion is needed
mov esi, [ebx+4]
imul esi, [ebx+8]
lea edi, [ebx+44]
push ebx
; N.B. Speed optimization has sense here.
align 16
.loop:
; mov ebx, [edi]
; mov edx, ebx
; mov ecx, ebx
; movzx ebx, bl ; ebx = Y
; shr edx, 16
; mov eax, ebx
; movzx edx, dl ; edx = Cr
; movzx ecx, ch ; ecx = Cb
movzx ebx, byte [edi]
movzx ecx, byte [edi+1]
mov eax, ebx
movzx edx, byte [edi+2]
; B = Y + color_table_1[Cb]
add eax, [color_table_1+ecx*4]
mov ebp, [color_table_2+ecx*4]
cmp eax, 80000000h
sbb ecx, ecx
and eax, ecx
add ebp, [color_table_3+edx*4]
cmp eax, 0x100
sbb ecx, ecx
not ecx
sar ebp, 16
or eax, ecx
mov [edi], al
; G = Y + color_table_2[Cb] + color_table_3[Cr]
lea eax, [ebx+ebp]
cmp eax, 80000000h
sbb ecx, ecx
and eax, ecx
cmp eax, 0x100
sbb ecx, ecx
not ecx
or eax, ecx
mov [edi+1], al
; R = Y + color_table_4[Cr]
mov eax, ebx
add eax, [color_table_4+edx*4]
cmp eax, 80000000h
sbb ecx, ecx
and eax, ecx
cmp eax, 0x100
sbb ecx, ecx
not ecx
or eax, ecx
mov [edi+2], al
add edi, 3
sub esi, 1
jnz .loop
pop ebx
ret
.ycck2rgb:
; conversion is needed
mov esi, [ebx+4]
imul esi, [ebx+8]
push ebx
push esi
lea edi, [ebx+44]
mov esi, edi
; N.B. Speed optimization has sense here.
align 16
.kloop:
; mov ebx, [esi]
; mov edx, ebx
; mov ecx, ebx
; movzx ebx, bl ; ebx = Y
; shr edx, 16
; mov eax, ebx
; movzx edx, dl ; edx = Cr
; movzx ecx, ch ; ecx = Cb
movzx ebx, byte [esi]
movzx ecx, byte [esi+1]
mov eax, ebx
movzx edx, byte [esi+2]
; B = Y + color_table_1[Cb]
add eax, [color_table_1+ecx*4]
mov ebp, [color_table_2+ecx*4]
cmp eax, 80000000h
sbb ecx, ecx
and eax, ecx
add ebp, [color_table_3+edx*4]
cmp eax, 0x100
sbb ecx, ecx
not ecx
sar ebp, 16
or eax, ecx
xor al, 0xFF
mul byte [esi+3]
add al, ah
adc ah, 0
add al, 80h
adc ah, 0
mov byte [edi], ah
; G = Y + color_table_2[Cb] + color_table_3[Cr]
lea eax, [ebx+ebp]
cmp eax, 80000000h
sbb ecx, ecx
and eax, ecx
cmp eax, 0x100
sbb ecx, ecx
not ecx
or eax, ecx
xor al, 0xFF
mul byte [esi+3]
add al, ah
adc ah, 0
add al, 80h
adc ah, 0
mov byte [edi+1], ah
; R = Y + color_table_4[Cr]
mov eax, ebx
add eax, [color_table_4+edx*4]
cmp eax, 80000000h
sbb ecx, ecx
and eax, ecx
cmp eax, 0x100
sbb ecx, ecx
not ecx
or eax, ecx
xor al, 0xFF
mul byte [esi+3]
add al, ah
adc ah, 0
add al, 80h
adc ah, 0
mov byte [edi+2], ah
add esi, 4
add edi, 3
sub dword [esp], 1
jnz .kloop
pop eax
pop ebx
; now correct values in RAW header
mov byte [ebx+12], 24
mov byte [ebx+18], 3
mov ecx, [ebx+32]
shr ecx, 2
lea ecx, [ecx*3]
mov [ebx+22], ecx
; release some memory - must succeed because we decrease size
add ecx, 44+1
mov edx, ebx
push 68
pop eax
push 20
pop ebx
int 0x40
mov ebx, eax
ret
; Decodes one data unit, that is, 8*8 block,
; from input stream, given by pointer esi and length ebp
; N.B. Speed optimization has sense here.
align 16
decode_data_unit:
; edx -> component data
cmp [progressive], 0
jz @f
mov edi, [edx+20]
add dword [edx+20], 64*2
jmp .coeff_decoded
@@:
mov edi, dct_coeff
mov ecx, 64*2/4
xor eax, eax
rep stosd
mov edi, zigzag+1
mov ecx, [huffman_bits]
; read DC coefficient
mov eax, [edx+16]
push edx
get_huffman_code
get_bits true
add eax, [edx+48]
mov [dct_coeff], ax
mov [edx+48], ax
; read AC coefficients
@@:
mov eax, [edx+20]
push edx
get_huffman_code
shr eax, 4
and ebx, 15
jz .band
add edi, eax
cmp edi, zigzag+64
jae .eof
get_bits true
movzx ebx, byte [edi]
mov [dct_coeff+ebx], ax
add edi, 1
cmp edi, zigzag+64
jb @b
jmp .do_idct
.band:
pop edx
cmp al, 15
jnz .do_idct
add edi, 16
cmp edi, zigzag+64
jb @b
; jmp .eof
.do_idct:
mov edi, dct_coeff
mov [huffman_bits], ecx
; coefficients loaded, now IDCT
.coeff_decoded:
mov eax, [edx+12]
mov ebx, idct_tmp_area
.idct_loop1:
mov cx, word [edi+1*16]
repeat 6
or cx, word [edi+(%+1)*16]
end repeat
jnz .real_transform
fild word [edi]
fmul dword [eax]
fstp dword [ebx]
mov ecx, [ebx]
repeat 7
mov [ebx+%*32], ecx
end repeat
jmp .idct_next1
.real_transform:
; S0,...,S7 - transformed values, s0,...,s7 - sought-for values
; S0,...,S7 are dequantized;
; dequantization table elements were multiplied to [idct_pre_table],
; so S0,S1,... later denote S0/2\sqrt{2},S1*\cos{\pi/16}/2,...
; sqrt2 = \sqrt{2}, cos = 2\cos{\pi/8},
; cos_sum = -2(\cos{\pi/8}+\cos{3\pi/8}), cos_diff = 2(\cos{\pi/8}-\cos{3\pi/8})
; Now formulas:
; s0 = ((S0+S4)+(S2+S6)) + ((S1+S7)+(S3+S5))
; s7 = ((S0+S4)+(S2+S6)) - ((S1+S7)+(S3+S5))
; val0 = ((cos-1)S1-(cos+cos_sum+1)S3+(cos+cos_sum-1)S5-(cos+1)S7)
; s1 = ((S0-S4)+((sqrt2-1)S2-(sqrt2+1)S6)) + val0
; s6 = ((S0-S4)+((sqrt2-1)S2-(sqrt2+1)S6)) - val0
; val1 = (S1+S7-S3-S5)sqrt2 - val0
; s2 = ((S0-S4)-((sqrt2-1)S2-(sqrt2+1)S6)) + val1
; s5 = ((S0-S4)-((sqrt2-1)S2-(sqrt2+1)S6)) - val1
; val2 = (S1-S7)cos_diff - (S1-S3+S5-S7)cos + val1
; s3 = ((S0+S4)-(S2+S6)) - val2
; s4 = ((S0+S4)-(S2+S6)) + val2
fild word [edi+3*16]
fmul dword [eax+3*32]
fild word [edi+5*16]
fmul dword [eax+5*32] ; st0=S5,st1=S3
fadd st1,st0
fadd st0,st0
fsub st0,st1 ; st0=S5-S3,st1=S5+S3
fild word [edi+1*16]
fmul dword [eax+1*32]
fild word [edi+7*16]
fmul dword [eax+7*32] ; st0=S7,st1=S1
fsub st1,st0
fadd st0,st0
fadd st0,st1 ; st0=S1+S7,st1=S1-S7,st2=S5-S3,st3=S5+S3
fadd st3,st0
fadd st0,st0
fsub st0,st3 ; st0=S1-S3-S5+S7,st1=S1-S7,st2=S5-S3,st3=S1+S3+S5+S7
fmul [idct_sqrt2]
fld st2
fadd st0,st2
fmul [idct_cos] ; st0=(S1-S3+S5-S7)cos,st1=(S1-S3-S5+S7)sqrt2,
; st2=S1-S7,st3=S5-S3,st4=S1+S3+S5+S7
fxch st2
fmul [idct_cos_diff]
fsub st0,st2 ; st0=(S1-S7)cos_diff - (S1-S3+S5-S7)cos
fxch st3
fmul [idct_cos_sum]
fadd st0,st2 ; st0=(S5-S3)cos_sum+(S1-S3+S5-S7)cos
fsub st0,st4 ; st0=val0
fsub st1,st0 ; st0=val0,st1=val1,st2=(S1-S3+S5-S7)cos,
; st3=(S1-S7)cos_diff-(S1-S3+S5-S7)cos,st4=S1+S3+S5+S7
fxch st2
fstp st0
fadd st2,st0 ; st0=val1,st1=val0,st2=val2,st3=S1+S3+S5+S7
fild word [edi+0*16]
fmul dword [eax+0*32]
fild word [edi+4*16]
fmul dword [eax+4*32] ; st0=S4,st1=S0
fsub st1,st0
fadd st0,st0
fadd st0,st1 ; st0=S0+S4,st1=S0-S4
fild word [edi+6*16]
fmul dword [eax+6*32]
fild word [edi+2*16]
fmul dword [eax+2*32] ; st0=S2,st1=S6
fadd st1,st0
fadd st0,st0
fsub st0,st1 ; st0=S2-S6,st1=S2+S6
fmul [idct_sqrt2]
fsub st0,st1
fsub st3,st0
fadd st0,st0
fadd st0,st3 ; st0=(S0-S4)+((S2-S6)sqrt2-(S2+S6))
; st3=(S0-S4)-((S2-S6)sqrt2-(S2+S6))
fxch st1
fsub st2,st0
fadd st0,st0
fadd st0,st2 ; st0=(S0+S4)+(S2+S6),st1=(S0-S4)+((S2-S6)sqrt2-(S2+S6)),
; st2=(S0+S4)-(S2+S6),st3=(S0-S4)-((S2-S6)sqrt2-(S2+S6))
; st4=val1,st5=val0,st6=val2,st7=S1+S3+S5+S7
fsubr st7,st0
fadd st0,st0
fsub st0,st7
fstp dword [ebx+0*32]
fsubr st4,st0
fadd st0,st0
fsub st0,st4
fstp dword [ebx+1*32]
fadd st4,st0
fadd st0,st0
fsub st0,st4
fstp dword [ebx+3*32]
fsubr st1,st0
fadd st0,st0
fsub st0,st1
fstp dword [ebx+2*32]
fstp dword [ebx+5*32]
fstp dword [ebx+6*32]
fstp dword [ebx+4*32]
fstp dword [ebx+7*32]
.idct_next1:
add ebx, 4
add edi, 2
add eax, 4
cmp ebx, idct_tmp_area+8*4
jb .idct_loop1
sub ebx, 8*4
.idct_loop2:
fld dword [ebx+3*4]
fld dword [ebx+5*4]
fadd st1,st0
fadd st0,st0
fsub st0,st1 ; st0=S5-S3,st1=S5+S3
fld dword [ebx+1*4]
fld dword [ebx+7*4]
fsub st1,st0
fadd st0,st0
fadd st0,st1 ; st0=S1+S7,st1=S1-S7,st2=S5-S3,st3=S5+S3
fadd st3,st0
fadd st0,st0
fsub st0,st3 ; st0=S1-S3-S5+S7,st1=S1-S7,st2=S5-S3,st3=S1+S3+S5+S7
fmul [idct_sqrt2]
fld st2
fadd st0,st2
fmul [idct_cos] ; st0=(S1-S3+S5-S7)cos,st1=(S1-S3-S5+S7)sqrt2,
; st2=S1-S7,st3=S5-S3,st4=S1+S3+S5+S7
fxch st2
fmul [idct_cos_diff]
fsub st0,st2 ; st0=(S1-S7)cos_diff - (S1-S3+S5-S7)cos
fxch st3
fmul [idct_cos_sum]
fadd st0,st2 ; st0=(S5-S3)cos_sum+(S1-S3+S5-S7)cos
fsub st0,st4 ; st0=val0
fsub st1,st0 ; st0=val0,st1=val1,st2=(S1-S3+S5-S7)cos,
; st3=(S1-S7)cos_diff-(S1-S3+S5-S7)cos,st4=S1+S3+S5+S7
fxch st2
fstp st0
fadd st2,st0 ; st0=val1,st1=val0,st2=val2,st3=S1+S3+S5+S7
fld dword [ebx+0*4]
fld dword [ebx+4*4]
fsub st1,st0
fadd st0,st0
fadd st0,st1 ; st0=S0+S4,st1=S0-S4
fld dword [ebx+6*4]
fld dword [ebx+2*4]
fadd st1,st0
fadd st0,st0
fsub st0,st1 ; st0=S2-S6,st1=S2+S6
fmul [idct_sqrt2]
fsub st0,st1
fsub st3,st0
fadd st0,st0
fadd st0,st3 ; st0=(S0-S4)+((S2-S6)sqrt2-(S2+S6))
; st3=(S0-S4)-((S2-S6)sqrt2-(S2+S6))
fxch st1
fsub st2,st0
fadd st0,st0
fadd st0,st2 ; st0=(S0+S4)+(S2+S6),st1=(S0-S4)+((S2-S6)sqrt2-(S2+S6)),
; st2=(S0+S4)-(S2+S6),st3=(S0-S4)-((S2-S6)sqrt2-(S2+S6))
; st4=val1,st5=val0,st6=val2,st7=S1+S3+S5+S7
fsubr st7,st0
fadd st0,st0
fsub st0,st7
fistp dword [ebx+0*4]
fsubr st4,st0
fadd st0,st0
fsub st0,st4
fistp dword [ebx+1*4]
fadd st4,st0
fadd st0,st0
fsub st0,st4
fistp dword [ebx+3*4]
fsubr st1,st0
fadd st0,st0
fsub st0,st1
fistp dword [ebx+2*4]
fistp dword [ebx+5*4]
fistp dword [ebx+6*4]
fistp dword [ebx+4*4]
fistp dword [ebx+7*4]
add ebx, 32
cmp ebx, idct_tmp_area+32*8
jb .idct_loop2
mov ecx, idct_tmp_area
mov edi, decoded_data - 1
.idct_loop3:
mov eax, [ecx]
add ecx, 4
add eax, 80h
cmp eax, 80000000h
sbb ebx, ebx
add edi, 1
and eax, ebx
cmp eax, 100h
sbb ebx, ebx
not ebx
or eax, ebx
sub al, [edx+51]
cmp ecx, idct_tmp_area+64*4
mov [edi], al
jb .idct_loop3
; done
mov ebx, [_ebx]
ret
.eof:
; EOF or incorrect data during scanning
mov esp, [_esp]
mov ebx, [_ebx]
jmp START.end
;---------------------------------------------------------------------
check_header:
pushad
mov eax,dword [esp+36]
push eax
and dword [eax+8], 0
mov esi, [eax]
mov ebp, [eax+12]
call get_marker
jc .err
cmp al, 0xD8
jnz .err
pop eax
popad
ret 4
.err:
pop eax
inc dword [eax+8]
popad
ret 4
;---------------------------------------------------------------------
zigzag:
; (x,y) -> 2*(x+y*8)
repeat 8
.cur = %
if .cur and 1
repeat %
db 2*((%-1) + (.cur-%)*8)
end repeat
else
repeat %
db 2*((.cur-%) + (%-1)*8)
end repeat
end if
end repeat
repeat 7
.cur = %
if .cur and 1
repeat 8-%
db 2*((%+.cur-1) + (8-%)*8)
end repeat
else
repeat 8-%
db 2*((8-%) + (%+.cur-1)*8)
end repeat
end if
end repeat
align 4
idct_pre_table:
; c_0 = 1/(2\sqrt{2}), c_i = cos(i*\pi/16)/2
dd 0.35355339, 0.49039264, 0.461939766, 0.41573481
dd 0.35355339, 0.27778512, 0.19134172, 0.09754516
idct_sqrt2 dd 1.41421356 ; \sqrt{2}
idct_cos dd 1.847759065 ; 2\cos{\pi/8}
idct_cos_sum dd -2.61312593 ; -2(\cos{\pi/8} + \cos{3\pi/8})
idct_cos_diff dd 1.08239220 ; 2(\cos{\pi/8} - \cos{3\pi/8})
;---------------------------------------------------------------------
Associations:
dd Associations.end - Associations
db 'JPEG',0
db 'JPG',0
db 'JPE',0
.end:
db 0
;---------------------------------------------------------------------
align 4
EXPORTS:
dd szStart, START
dd szVersion, 0x00010002
dd szCheck, check_header
dd szAssoc, Associations
dd 0
szStart db 'START',0
szVersion db 'version',0
szCheck db 'Check_Header',0
szAssoc db 'Associations',0
section '.data' data readable writable align 16
; up to 4 quantization tables
quant_tables rd 4*64
quant_tables_defined rb 4
; Huffman tables
dc_huffman_defined rb 4
ac_huffman_defined rb 4
; up to 4 DC Huffman tables
;dc_huffman rd 4*256*2
; up to 4 AC Huffman tables
;ac_huffman rd 4*256*2
max_hufftable_size = (256 + (9+128)*16)*2
dc_huffman rb 4*max_hufftable_size
ac_huffman rb 4*max_hufftable_size
; restart interval
restart_interval dd ?
decoded_MCUs dd ?
; base esp,ebx values
_esp dd ?
_ebx dd ?
; components information, up to 4 components
; db ComponentIdentifier, db V, db H, db VFactor, db HFactor, db QuantizationTable
components rb 4*6
max_v db ?
max_h db ?
cur_rst_marker db ?
db ?
huffman_bits dd ?
block_width dd ?
block_height dd ?
block_delta_x dd ?
block_delta_y dd ?
cur_block_dx dd ?
cur_block_dy dd ?
x_num_blocks dd ?
y_num_blocks dd ?
delta_x dd ?
delta_y dd ?
pixel_size dd ?
line_size dd ?
cur_x dd ?
cur_y dd ?
max_x dd ?
max_y dd ?
cur_out_ptr dd ?
dct_buffer dd ?
dct_buffer_size dd ?
;ns dd ?
; +0: db V, db H, db VFactor, db HFactor, dd HIncrement, dd VIncrement,
; +12: dd QuantizationTable, dd DCTable, dd ACTable,
; +24: dd width/HFactor, dd width/HFactor-8k, dd HFactor+1-(width%HFactor),
; +36: dd height/VFactor, dd height/VFactor-8m, dd VFactor+1-(height%VFactor),
; +48: dw DCPrediction, db ?, db (0 for Y, 80h for Cb,Cr), dd ComponentOffset
cur_components rb 4*56
cur_components_end dd ?
; progressive JPEG?
progressive db ?
; one component in the scan?
not_interleaved db ?
; Adobe YCCK file?
adobe_ycck db ?
rb 1
; parameters for progressive scan
ScanStart db ?
ScanEnd db ?
ApproxPosLow db ?
ApproxPosHigh db ?
; Fourier coefficients
dct_coeff rw 64
; Temporary space for IDCT
idct_tmp_area rd 64
; decoded block 8*8
decoded_data rb 8*8
; data for YCbCr -> RGB translation
color_table_1 rd 256
color_table_2 rd 256
color_table_3 rd 256
color_table_4 rd 256