kolibrios/programs/develop/libraries/libs-dev/libimg/jpeg/jpeg.asm

2232 lines
46 KiB
NASM
Raw Normal View History

;;================================================================================================;;
;;//// jpeg.asm //// (c) diamond, 2008-2009 //////////////////////////////////////////////////////;;
;;================================================================================================;;
;; ;;
;; This file is part of Common development libraries (Libs-Dev). ;;
;; ;;
;; Libs-Dev is free software: you can redistribute it and/or modify it under the terms of the GNU ;;
;; Lesser General Public License as published by the Free Software Foundation, either version 2.1 ;;
;; of the License, or (at your option) any later version. ;;
;; ;;
;; Libs-Dev is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without ;;
;; even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;;
;; Lesser General Public License for more details. ;;
;; ;;
;; You should have received a copy of the GNU Lesser General Public License along with Libs-Dev. ;;
;; If not, see <http://www.gnu.org/licenses/>. ;;
;; ;;
;;================================================================================================;;
include 'jpeg.inc'
;; img.is.jpg: check whether a memory buffer starts like a JPEG file.
;; in:  [esp+4] -> data, [esp+8] = data size (two dword arguments, removed by ret 8)
;; out: eax = 1 if the first marker found by get_marker is SOI (0xD8), 0 otherwise
;;      esi and ebp are preserved
img.is.jpg:
	push	esi ebp
	mov	esi, [esp+12]		; esi -> JPEG data (two saved registers + return address)
	mov	ebp, [esp+16]		; ebp = data size
	call	get_marker
	jc	.not_jpeg		; no marker could be read
	cmp	al, 0xD8		; SOI marker?
	jne	.not_jpeg
	mov	eax, 1
	jmp	.done
.not_jpeg:
	xor	eax, eax
.done:
	pop	ebp esi
	ret	8
;; img.decode.jpg: decode a JPEG image held in memory.
;; in:  [esp+4] -> JPEG data, [esp+8] = data size; 12 bytes of arguments are removed
;;      on return (the third dword is not read by this part of the code)
;; out: eax = value of jpeg.work.image at exit (0 if no image was created),
;;      or 0 if the work area could not be allocated; all other registers preserved
;; Register roles throughout the decoder:
;;      esi -> next unread input byte, ebp = number of input bytes left, ebx -> jpeg.work
img.decode.jpg:
	finit				; the decoder uses x87 instructions (e.g. for quantization tables)
	pushad
	mov	esi, [esp+20h+4]	; esi -> JPEG data
	mov	ebp, [esp+20h+8]	; ebp = data size
@@:
; allocate area for JPEG processing
	push	sizeof.jpeg.work
	call	[mem.alloc]
	test	eax, eax
	jz	.ret			; eax = 0 is returned to the caller
	mov	ebx, eax
	xor	ecx, ecx		; ecx stays 0 until .soi_ok has used it
	mov	[ebx + jpeg.work.image], ecx
	mov	[ebx + jpeg.work.dct_buffer], ecx
; remember the stack level right after pushad, so that any exit path
; can unwind whatever temporaries are still on the stack
	mov	[ebx + jpeg.work._esp], esp
; check for SOI [Start-Of-Image] marker
	call	get_marker
	jc	.end
	cmp	al, 0xD8	; SOI?
	jz	.soi_ok
.end:
; general exit from the function
; Some error exits (e.g. an undefined quantization or Huffman table found while
; parsing SOS) arrive here with temporaries still pushed. Restore the stack level
; saved at entry; without this, popad/ret below would use a misaligned frame.
; On balanced paths this is a no-op.
	mov	esp, [ebx + jpeg.work._esp]
; for progressive mode: convert loaded DCT coefficients to image
	call	handle_progressive
; convert full-color images to RGB
	call	convert_to_rgb
; free the work area, but keep the image pointer as the return value
	push	[ebx + jpeg.work.image]
	push	ebx
	call	[mem.free]
	pop	eax
.ret:
	mov	[esp+28], eax		; store into the eax slot of the pushad frame
	popad
	ret	12
.soi_ok:
; SOI was found: reset per-image state.
; ecx is still 0 here (cleared by the setup code; get_marker does not touch it).
mov [ebx + jpeg.work.restart_interval], ecx
mov [ebx + jpeg.work.adobe_ycck], cl
; loop until start of frame (real data), parse markers
; On every iteration: esi -> next marker, ebp = bytes left, ebx -> jpeg.work
.markers_loop:
call get_marker
jc .end
; markers RSTn do not have parameters
; N.B. They can not exist in this part of JPEG, but let's be liberal :)
cmp al, 0xD0
jb @f
cmp al, 0xD8
jb .markers_loop ; 0xD0..0xD7 = RSTn: nothing to skip
@@:
cmp al, 0xD9 ; EOI? [invalid here]
jz .end
; ok, this is marker segment
; first word is length of the segment
; (stored high byte first; the length counts the two length bytes themselves)
cmp ebp, 2
jb .end
xor edx, edx
mov dl, [esi+1]
mov dh, [esi] ; edx = marker length, al = marker value
; the whole segment must fit in the remaining input
sub ebp, edx
jb .end
; dispatch on the marker value; handlers receive esi -> length field, edx = length
cmp al, 0xDB ; DQT?
jz .dqt
cmp al, 0xC4 ; DHT?
jz .dht
cmp al, 0xCC ; DAC? [ignored - no arithmetic coding]
jz .next_marker
cmp al, 0xDD ; DRI?
jz .dri
cmp al, 0xDA ; SOS?
jz .sos
; 0xC0..0xCF (DHT and DAC were already handled above) are SOFn markers
cmp al, 0xC0
jb @f
cmp al, 0xD0
jb .sofn
@@:
cmp al, 0xEE ; APP14?
jz .app14
; unrecognized marker; let's skip it and hope for the best
.next_marker:
; skip the whole segment (edx bytes, already subtracted from ebp)
add esi, edx
jmp .markers_loop
.app14:
; check for special Adobe marker
; the segment must be at least 14 bytes long so that [esi+13] is inside it
cmp dx, 14
jb .next_marker
; signature "Adobe" directly after the length field
cmp byte [esi+2], 'A'
jnz .next_marker
cmp dword [esi+3], 'dobe'
jnz .next_marker
; byte at offset 13 equal to 2 sets the adobe_ycck flag
; (the flag is what allows 4-component frames in the SOFn handler)
cmp byte [esi+13], 2
setz [ebx + jpeg.work.adobe_ycck]
jmp .next_marker
.dqt:
; DQT marker found
; length: 2 bytes for length field + 65 bytes per table
; in: esi -> length field, edx = segment length
sub edx, 2
jc .end
lodsw ; step over the length field
.dqt_loop:
; edx = bytes of this segment still to be processed
test edx, edx
jz .markers_loop
sub edx, 1+64
jc .end
lodsb ; al = precision (high nibble) and table number (low nibble)
; 8-bit DCT-based process shall not use a 16-bit precision quantization table.
test al, 0xF0
jnz .end
and eax, 3 ; eax = table number 0..3
mov [ebx+jpeg.work.quant_tables_defined+eax], 1
; each table occupies 256 bytes (64 dword entries)
shl eax, 8
lea edi, [ebx+eax+jpeg.work.quant_tables]
xor ecx, ecx ; ecx = index of the entry in the order it is stored in the file
@@:
; load the next 8-bit entry onto the FPU stack as an integer
xor eax, eax
lodsb
push eax
fild dword [esp]
pop eax
; eax = zigzag[ecx]*2 is used as the byte offset of the destination dword;
; bits 2..4 and bits 5..7 of that offset each select one idct_pre_table factor
; (presumably the column and row scale factors of the IDCT - confirm against
; the definitions of zigzag and idct_pre_table)
movzx eax, byte [zigzag+ecx]
add eax, eax
push eax
and eax, 7*4
fmul dword [idct_pre_table+eax]
pop eax
push eax
shr eax, 3
and eax, 7*4
fmul dword [idct_pre_table+eax]
pop eax
; store the prescaled entry as a 32-bit float
fstp dword [edi+eax]
inc ecx
cmp ecx, 64
jb @b
jmp .dqt_loop
.dri:
; DRI marker found
cmp edx, 4 ; length must be 4
jnz .end2
; the restart interval is a 16-bit value stored high byte first
movzx eax, word [esi+2]
xchg al, ah
mov [ebx+jpeg.work.restart_interval], eax
jmp .next_marker
.dht:
; DHT marker found
; in: esi -> length field, edx = segment length
; One segment may define several tables. Each table is: 1 byte class/number,
; 16 bytes with the number of codes of length 1..16, then the code values.
; The table is converted into a multi-level lookup structure:
; level 1: 256 entries of 2 bytes, indexed by the next 8 bits of input;
; levels 2 and 3: groups of 16 entries of 2 bytes, indexed by 4 more bits each.
; Every entry is a byte pair. For a level-1 entry that leads to a deeper level,
; the first byte is 9 and the second byte is the number of the 32-byte group.
sub edx, 2
jc .end2
lodsw ; step over the length field
.dht_loop:
; edx = bytes of this segment still to be processed
test edx, edx
jz .markers_loop
sub edx, 17
jc .end2
; next Huffman table; find place for it
lodsb
; edi = (bit 4 of the byte, moved to bit 2) | (low 2 bits of the byte),
; i.e. slots 0..3 for class 0 and 4..7 for class 1
mov edi, eax
and eax, 0x10
and edi, 3
shr eax, 2
or edi, eax
mov [ebx+jpeg.work.dc_huffman_defined+edi], 1
; shl edi, 11
imul edi, max_hufftable_size
lea edi, [ebx+edi+jpeg.work.dc_huffman] ; edi -> destination table
; get table size
; ax = sum of the 16 length counts = number of code values that follow
xor eax, eax
push 16
pop ecx
@@:
add al, [esi]
adc ah, 0
inc esi
loop @b
; no more than 256 code values, and they must fit in the segment
cmp ax, 0x100
ja .end2
sub edx, eax
jc .end2
; construct Huffman tree
; ebx and edx are reused as scratch registers below and restored at the end
push ebx edx
; (the commented-out code below is an older table builder, kept for reference)
; lea eax, [edi+256*8]
; push eax
; push 16
; mov edx, esi
; @@:
; cmp byte [edx-1], 0
; jnz @f
; dec edx
; dec dword [esp]
; jmp @b
; @@:
; sub edx, [esp]
; lea eax, [edi+8]
; push 2
; pop ecx
; .lenloop:
; mov bl, byte [edx]
; test bl, bl
; jz .len1done
; push eax
; xor eax, eax
; .len1loop:
; dec ecx
; js .dhterr
; cmp edi, [esp+8]
; jae .dhterr
; lodsb
; stosd
; dec bl
; jnz .len1loop
; pop eax
; .len1done:
; jecxz .len2done
; push ecx
; .len2loop:
; cmp eax, [esp+8]
; jb @f
; or eax, -1
; @@:
; cmp edi, [esp+8]
; jae .dhterr
; stosd
; add eax, 8
; jnb @f
; or eax, -1
; @@:
; loop .len2loop
; pop ecx
; .len2done:
; add ecx, ecx
; inc edx
; dec dword [esp]
; jnz .lenloop
; pop eax
; pop eax
; sub eax, edi
; shr eax, 2
; cmp eax, ecx
; ja @f
; mov ecx, eax
; @@:
; or eax, -1
; rep stosd
; pop edx ebx
; jmp .dht_loop
; .dherr:
; ;pop eax eax eax edx ebx
; add esp, 5*4
; ---- stage 1: codes of length 1..8 fill level 1 directly ----
; [esp] = end of level 1 (also the start of the area used for deeper levels)
lea eax, [edi+256*2]
push eax
lea edx, [esi-16] ; edx -> the 16 length counts; esi -> code values
mov ah, 1 ; ah = current code length
mov ecx, 128 ; ecx = number of level-1 entries covered by one code of this length
.dht_l1:
movzx ebx, byte [edx] ; ebx = number of codes with the current length
inc edx
test ebx, ebx
jz .dht_l3
.dht_l2:
; level 1 must not overflow (the code lengths would be inconsistent)
cmp edi, [esp]
jae .dhterr1
lodsb ; al = code value
; store ecx copies of the byte pair (length, value)
xchg al, ah
push ecx
rep stosw
pop ecx
xchg al, ah
dec ebx
jnz .dht_l2
.dht_l3:
inc ah
shr ecx, 1
jnz .dht_l1
; ---- stage 2: codes of length 9..12 go to 16-entry groups ----
; stack after the two pushes below:
; [esp] = end of the group being filled (initially equal to its start, so the
; first code allocates a group), [esp+4] = current position in level 1,
; [esp+8] = end of level 1
push edi
mov edi, [esp+4]
push edi
; eax: ah = length stored in the entries, starting from 1 on this level;
; high word = link entry for level 1: byte 9, then the group number (from 0)
mov eax, 0x00090100
mov cl, 8 ; entries covered by one code: 8, 4, 2, 1
.dht_l4:
movzx ebx, byte [edx]
inc edx
test ebx, ebx
jz .dht_l6
.dht_l5:
cmp edi, [esp]
jb @f
; the current group is full: write a link entry into level 1 and open a new group
mov edi, [esp+4]
rol eax, 16
cmp edi, [esp+8]
jae .dhterr2
stosw
inc ah ; next group number
mov [esp+4], edi
pop edi
push edi ; edi = start of the new group (the old end)
rol eax, 16
add dword [esp], 16*2
@@:
lodsb
xchg al, ah
push ecx
rep stosw
pop ecx
xchg al, ah
dec ebx
jnz .dht_l5
.dht_l6:
inc ah
shr ecx, 1
jnz .dht_l4
; ---- stage 3: codes of length 13..16 go to third-level groups ----
push edi ; save the current position inside the second-level groups
; ebx = 8*n13 + 4*n14 + 2*n15 + n16 = number of third-level entries needed
movzx ebx, byte [edx]
add ebx, ebx
add bl, [edx+1]
adc bh, 0
add ebx, ebx
add bl, [edx+2]
adc bh, 0
add ebx, ebx
add bl, [edx+3]
adc bh, 0
; round up to whole 16-entry groups; each of them needs one 2-byte link entry
; on the second level
add ebx, 15
shr ebx, 4
mov cl, 8
; ebx = total number of 32-byte second-level groups, including the space needed
; for those link entries; edi = start of the third-level area right after them
lea ebx, [edi+ebx*2]
sub ebx, [esp+12]
add ebx, 31
shr ebx, 5
mov edi, ebx
shl edi, 5
add edi, [esp+12]
; replace the byte 9 in bits 16..23 of eax by the number of the first third-level
; group; bits 24..31 keep the next second-level group number
xor ebx, 9
shl ebx, 16
xor eax, ebx
; stack: [esp] = end of the third-level group being filled, [esp+4] = position
; inside the second-level group, [esp+8] = end of that second-level group,
; [esp+12] = position in level 1, [esp+16] = end of level 1
push edi
.dht_l7:
movzx ebx, byte [edx]
inc edx
test ebx, ebx
jz .dht_l10
.dht_l8:
cmp edi, [esp]
jb .dht_l9
; the third-level group is full: a link entry is needed on the second level
mov edi, [esp+4]
cmp edi, [esp+8]
jb @f
; the second-level group is full too: link a new one from level 1 first
mov edi, [esp+12]
cmp edi, [esp+16]
jae .dhterr3
mov al, 9
stosb
rol eax, 8 ; al = next second-level group number
stosb
inc eax
ror eax, 8
mov [esp+12], edi
mov edi, [esp+8]
add dword [esp+8], 16*2
@@:
; write the link entry (9, third-level group number) on the second level
mov al, 9
stosb
rol eax, 16 ; al = next third-level group number
stosb
inc eax
ror eax, 16
mov [esp+4], edi
pop edi
push edi ; edi = start of the new third-level group
add dword [esp], 16*2
.dht_l9:
lodsb
xchg al, ah
push ecx
rep stosw
pop ecx
xchg al, ah
dec ebx
jnz .dht_l8
.dht_l10:
inc ah
shr ecx, 1
jnz .dht_l7
; ---- fill everything that is still unused with 0xFF bytes ----
; the three regions are: the rest of the current third-level group, the rest of
; the current second-level group, and the rest of level 1
push -1
pop eax
pop ecx
sub ecx, edi
rep stosb
pop edi
pop ecx
sub ecx, edi
rep stosb
pop edi
pop ecx
sub ecx, edi
rep stosb
pop edx ebx
jmp .dht_loop
; error exits: drop the temporaries of the corresponding stage, restore edx and
; ebx, then leave through the common exit
.dhterr3:
pop eax eax
.dhterr2:
pop eax eax
.dhterr1:
pop eax
pop edx ebx
.end2:
jmp .end
.sofn:
; SOFn marker found
; in: al = marker value, esi -> length field, edx = segment length
; Segment layout, as read below: +2 precision, +3 height (2 bytes, high byte
; first), +5 width (2 bytes, high byte first), +7 number of components,
; then 3 bytes per component (identifier, sampling factors, quantization table).
	cmp	[ebx+jpeg.work.image], 0
	jnz	.end2	; only one frame is allowed
; only SOF0 [baseline sequential], SOF1 [extended sequential], SOF2 [progressive]
; nobody supports other compression methods
	cmp	al, 0xC2
	ja	.end2
	setz	[ebx+jpeg.work.progressive]
; Length must be at least 8
	sub	edx, 8
	jb	.end2
; Sample precision in JFIF must be 8 bits
	cmp	byte [esi+2], 8
	jnz	.end2
; width must be nonzero
; height must be nonzero - nobody supports DNL markers
; A zero dimension would give zero block counts, and the decode loops below
; count their iterations down with "sub ..., 1 / jnz", which would wrap around.
; Reject such frames before any state is created.
	cmp	word [esi+3], 0
	jz	.end2
	cmp	word [esi+5], 0
	jz	.end2
; Color space in JFIF is either YCbCr (color images, 3 components)
; or Y (grey images, 1 component)
	movzx	eax, byte [esi+7]
	cmp	al, 1
	jz	@f
	cmp	al, 3
	jz	@f
; Adobe products sometimes use YCCK color space with 4 components
	cmp	al, 4
	jnz	.end2
	cmp	[ebx+jpeg.work.adobe_ycck], 0
	jz	.end2
@@:
	mov	edi, eax	; edi = number of components
; the segment must contain exactly 3 bytes per component after the 8-byte header
	lea	eax, [eax*3]
	sub	edx, eax
	jnz	.end2
; image type: 8 bpp for grayscale JPEGs, 24 bpp for normal,
; 32 bpp for Adobe YCCK
	push	Image.bpp8i
	pop	eax		; Image.bpp8i = 1
	cmp	edi, eax
	jz	@f
	inc	eax		; Image.bpp24 = 2
	cmp	edi, 3
	jz	@f
	inc	eax		; Image.bpp32 = 3
@@:
	push	eax		; image type: becomes the last argument of img.create
; get width and height (the upper half of eax is zero, because eax <= 3 here)
	mov	ah, [esi+3]
	mov	al, [esi+4]	; eax = height
	xor	ecx, ecx
	mov	ch, [esi+5]
	mov	cl, [esi+6]	; ecx = width
; allocate memory for image
	stdcall	img.create, ecx, eax
	test	eax, eax
	jz	.end2
	mov	[ebx + jpeg.work.image], eax
; create grayscale palette if needed
	cmp	edi, 1
	jnz	.no_create_palette
	push	ecx edi
	mov	edi, [eax + Image.Palette]
	xor	eax, eax
	mov	ecx, 256
@@:
; entry i = 0x00iiiiii, i.e. the same value in the three low bytes
	stosd
	add	eax, 0x010101
	loop	@b
	pop	edi ecx
.no_create_palette:
; other image characteristics
; (ecx is expected to still hold the width here, i.e. img.create must preserve it)
	mov	eax, edi
	shl	eax, 3
	mov	[ebx + jpeg.work.delta_x], eax		; bytes per 8 pixels in a row
	mov	[ebx + jpeg.work.pixel_size], edi	; bytes per pixel = number of components
	;mov	eax, edi
	imul	eax, ecx
	mov	[ebx + jpeg.work.delta_y], eax		; bytes per 8 rows
	shr	eax, 3
	mov	[ebx + jpeg.work.line_size], eax	; bytes per row
; parse the component descriptions into 6-byte records:
; +0 identifier, +1 V, +2 H, +3 VFactor, +4 HFactor, +5 quantization table
	add	esi, 8
	mov	ecx, edi
	lea	edi, [ebx + jpeg.work.components]
	xor	eax, eax
	xor	edx, edx	; dl = largest V seen so far, dh = largest H
.sof_parse_comp:
	movsb		; db ComponentIdentifier
	lodsb
	mov	ah, al
	and	al, 0xF		; al = V (low nibble), must be nonzero
	jz	.end3
	shr	ah, 4		; ah = H (high nibble), must be nonzero
	jz	.end3
	stosd		; db V, db H, db ?, db ? (will be filled later)
	cmp	dl, al
	ja	@f
	mov	dl, al
@@:
	cmp	dh, ah
	ja	@f
	mov	dh, ah
@@:
	movsb		; db QuantizationTableID
	loop	.sof_parse_comp
	mov	word [ebx + jpeg.work.max_v], dx
	movzx	eax, dh		; eax = HMax
	movzx	edx, dl		; edx = VMax
	push	eax edx
; one MCU covers (8*HMax) x (8*VMax) pixels
	shl	eax, 3
	shl	edx, 3
	mov	[ebx + jpeg.work.block_width], eax
	mov	[ebx + jpeg.work.block_height], edx
	pop	edx eax
	push	eax edx
; byte distance between horizontally / vertically adjacent MCUs in the image
	imul	eax, [ebx + jpeg.work.delta_x]
	mov	[ebx + jpeg.work.block_delta_x], eax
	imul	edx, [ebx + jpeg.work.delta_y]
	mov	[ebx + jpeg.work.block_delta_y], edx
; number of MCUs per row and per column, rounded up
	mov	ecx, [ebx + jpeg.work.image]
	mov	eax, [ecx + Image.Width]
	add	eax, [ebx + jpeg.work.block_width]
	dec	eax
	xor	edx, edx
	div	[ebx + jpeg.work.block_width]
	mov	[ebx + jpeg.work.x_num_blocks], eax
	mov	eax, [ecx + Image.Height]
	add	eax, [ebx + jpeg.work.block_height]
	dec	eax
	xor	edx, edx
	div	[ebx + jpeg.work.block_height]
	mov	[ebx + jpeg.work.y_num_blocks], eax
; fill in the scaling factors of every component
	mov	ecx, [ebx + jpeg.work.pixel_size]
	pop	edx		; edx = VMax; HMax stays on the stack
	lea	edi, [ebx + jpeg.work.components]
@@:
	mov	eax, edx
	div	byte [edi+1]	; VMax / V_i = VFactor_i
	mov	byte [edi+3], al	; db VFactor
	pop	eax
	push	eax
	div	byte [edi+2]	; HMax / H_i = HFactor_i
	mov	byte [edi+4], al	; db HFactor
	add	edi, 6
	loop	@b
	pop	eax		; drop HMax
	cmp	[ebx + jpeg.work.progressive], 0
	jz	.sof_noprogressive
; progressive mode: allocate a buffer for all DCT coefficients of the frame,
; 2 bytes per coefficient, dct_buffer_size bytes per component
	mov	eax, [ebx + jpeg.work.x_num_blocks]
	mul	[ebx + jpeg.work.block_width]
	mul	[ebx + jpeg.work.y_num_blocks]
	mul	[ebx + jpeg.work.block_height]
	add	eax, eax
	mov	[ebx + jpeg.work.dct_buffer_size], eax
	mul	[ebx + jpeg.work.pixel_size]
	push	eax
	call	[mem.alloc]
	test	eax, eax
	jnz	@f
; out of memory: destroy the image and clear the pointer (eax = 0 is swapped in),
; so that the function returns 0
	xchg	eax, [ebx + jpeg.work.image]
	push	eax
	call	img.destroy
	jmp	.end
@@:
	mov	[ebx + jpeg.work.dct_buffer], eax
.sof_noprogressive:
	jmp	.markers_loop
.end3:
	jmp	.end
.sos:
; SOS marker found
; in: esi -> length field, edx = segment length
; Segment layout, as read below: +2 number of components in the scan, then
; 2 bytes per component (identifier, DC/AC table numbers), then 3 bytes:
; start of spectral selection, end of spectral selection, approximation byte.
; For every component of the scan a 56-byte record is built in cur_components:
; +0 db V, +1 db H, +2 db VFactor, +3 db HFactor
; +4 dd HIncrement, +8 dd VIncrement, +12 dd QuantizationTable
; +16 dd DCTable, +20 dd ACTable
; +24 dd width / HFactor, +28 dd the same (working copy),
; +32 dd HFactor+1 - (width % HFactor)
; +36 dd height / VFactor, +40 dd the same (working copy),
; +44 dd VFactor+1 - (height % VFactor)
; +48 dd DCPrediction, +52 dd ComponentOffset
; frame must be already opened
cmp [ebx + jpeg.work.image], 0
jz .end3
cmp edx, 6
jb .end3
; parse marker
movzx eax, byte [esi+2] ; number of components in this scan
test eax, eax
jz .end3 ; must be nonzero
cmp al, byte [ebx + jpeg.work.pixel_size]
ja .end3 ; must be <= total number of components
; mov [ns], eax
; a scan with a single component is not interleaved
cmp al, 1
setz [ebx + jpeg.work.not_interleaved]
; the segment length must be exactly 6 + 2 * (number of components)
lea ecx, [6+eax+eax]
cmp edx, ecx
jnz .end3
mov ecx, eax ; ecx = number of components left to parse
lea edi, [ebx + jpeg.work.cur_components]
add esi, 3
.sos_find_comp:
lodsb ; got ComponentID, look for component info
push ecx esi
mov ecx, [ebx + jpeg.work.pixel_size]
lea esi, [ebx + jpeg.work.components]
; DCPrediction = 0; ComponentOffset counts the frame components that were skipped
and dword [edi+48], 0
and dword [edi+52], 0
@@:
cmp [esi], al
jz @f
inc dword [edi+52]
add esi, 6
loop @b
@@:
; ZF=1 if the component was found; mov and pop below do not change the flags
mov eax, [esi+1] ; eax = V, H, VFactor, HFactor
mov dl, [esi+5] ; dl = quantization table number
pop esi ecx
jnz .end3 ; bad ComponentID
; in a non-interleaved scan every MCU is a single block: force V = H = 1
cmp [ebx + jpeg.work.not_interleaved], 0
jz @f
mov ax, 0x0101
@@:
stosd ; db V, db H, db VFactor, db HFactor
push ecx
; NOTE(review): until the matching "pop ecx" below, the three "jz .end3" error
; exits are taken with this dword still on the stack
xor eax, eax
mov al, byte [edi-1] ; get HFactor
mul byte [ebx+jpeg.work.pixel_size] ; number of components
stosd ; HIncrement_i = HFactor_i * sizeof(pixel)
mov al, byte [edi-4-2] ; get VFactor
mul byte [ebx+jpeg.work.pixel_size] ; number of components
mov ecx, [ebx+jpeg.work.image]
imul eax, [ecx+Image.Width] ; image width
stosd ; VIncrement_i = VFactor_i * sizeof(row)
xchg eax, edx
and eax, 3 ; eax = quantization table number
cmp [ebx+jpeg.work.quant_tables_defined+eax], 0
jz .end3
shl eax, 8
lea eax, [ebx+eax+jpeg.work.quant_tables]
stosd ; dd QuantizationTable
lodsb ; high nibble = DC table number, low nibble = AC table number
movzx eax, al
mov edx, eax
shr eax, 4
and edx, 3
and eax, 3
; an undefined table is an error in sequential mode; in progressive mode a zero
; pointer is stored instead and checked later, when the scan is known to need it
cmp [ebx+jpeg.work.dc_huffman_defined+eax], 0
jnz .dc_table_ok
cmp [ebx+jpeg.work.progressive], 0
jz .end3
xor eax, eax
jmp .dc_table_done
.dc_table_ok:
; shl eax, 11
imul eax, max_hufftable_size
lea eax, [ebx+jpeg.work.dc_huffman+eax]
.dc_table_done:
cmp [ebx+jpeg.work.ac_huffman_defined+edx], 0
jnz .ac_table_ok
cmp [ebx+jpeg.work.progressive], 0
jz .end3
xor edx, edx
jmp .ac_table_done
.ac_table_ok:
; shl edx, 11
imul edx, max_hufftable_size
lea edx, [ebx+jpeg.work.ac_huffman+edx]
.ac_table_done:
stosd ; dd DCTable
xchg eax, edx
stosd ; dd ACTable
mov eax, [ecx+Image.Width]
movzx ecx, byte [edi-21] ; get HFactor
cdq ; edx:eax = width (width<0x10000, so as dword it is unsigned)
div ecx
stosd ; dd width / HFactor_i
stosd
xchg eax, ecx
inc eax
sub eax, edx
stosd ; dd HFactor_i+1 - (width % HFactor_i)
mov ecx, [ebx+jpeg.work.image]
mov eax, [ecx+Image.Height]
movzx ecx, byte [edi-34] ; get VFactor
cdq
div ecx
stosd ; dd height / VFactor_i
stosd
xchg eax, ecx
inc eax
sub eax, edx
stosd ; dd VFactor_i+1 - (height % VFactor_i)
pop ecx
; scasd is used here only to advance edi by 4; its comparison result is ignored
scasd ; dd DCPrediction
; the top byte of DCPrediction becomes 0x80 if the low byte of ComponentOffset
; has an odd number of set bits (offsets 1 and 2), and 0 otherwise
cmp dword [edi], 0
setnp al
ror al, 1
mov byte [edi-1], al
scasd ; dd ComponentOffset
dec ecx
jnz .sos_find_comp
mov [ebx+jpeg.work.cur_components_end], edi
; copy the three trailing bytes into the 4 bytes starting at ScanStart:
; start of spectral selection, end of spectral selection (at most 63),
; then the low and the high nibble of the approximation byte
; (presumably the fields ScanEnd, ApproxPosLow and ApproxPosHigh directly follow
; ScanStart in jpeg.work - confirm in jpeg.inc)
lea edi, [ebx+jpeg.work.ScanStart]
movsb
cmp byte [esi], 63
ja .end3
movsb
lodsb
push eax
and al, 0xF
stosb
pop eax
shr al, 4
stosb
; now unpack data
call init_limits
and [ebx+jpeg.work.decoded_MCUs], 0
; next_MCU increments this value modulo 8 before use, so RST0 is expected first
mov [ebx+jpeg.work.cur_rst_marker], 7
and [ebx+jpeg.work.huffman_bits], 0
cmp [ebx+jpeg.work.progressive], 0
jz .sos_noprogressive
; progressive mode - only decode DCT coefficients
; initialize pointers to coefficients data
; zero number of EOBs for AC coefficients
; redefine HIncrement and VIncrement
lea edi, [ebx+jpeg.work.cur_components]
.coeff_init:
; [edi+12] = dct_buffer + ComponentOffset * dct_buffer_size
mov eax, [ebx+jpeg.work.dct_buffer_size]
mul dword [edi+52]
add eax, [ebx+jpeg.work.dct_buffer]
mov [edi+12], eax
; from now on [edi+52] holds the number of pending EOBs
and dword [edi+52], 0
; the Huffman table this scan needs must have been defined:
; the DC table if ScanStart = 0, the AC table otherwise
cmp [ebx+jpeg.work.ScanStart], 0
jz .scan_dc
cmp dword [edi+20], 0
jz .end3
jmp @f
.scan_dc:
cmp dword [edi+16], 0
jz .end3
@@:
; HIncrement = H * 128: one block is 64 coefficients of 2 bytes
movzx eax, byte [edi+1]
shl eax, 7
mov [edi+4], eax
; VIncrement = 128 * number of blocks in a row of this component.
; sbb adds 7 if HFactor < [edi+32] (the width is a multiple of HFactor)
; and 8 otherwise, so the division by 8 rounds up
mov eax, [edi+28]
mov cl, [edi+3]
cmp cl, [edi+32]
sbb eax, -7-1
shr eax, 3
shl eax, 7
mov [edi+8], eax
add edi, 56
cmp edi, [ebx+jpeg.work.cur_components_end]
jb .coeff_init
; unpack coefficients
; N.B. Speed optimization has sense here.
.coeff_decode_loop:
; one iteration = one MCU
lea edx, [ebx+jpeg.work.cur_components]
.coeff_components_loop:
mov edi, [edx+12] ; edi -> coefficients of the first block of this MCU
movzx ecx, byte [edx] ; ecx = V = number of block rows in the MCU
; [edx+40] and [edx+28] count the remaining height and width; they are
; decremented inside the loops and restored from the stack afterwards
push dword [edx+40]
push edi
.coeff_y_loop:
push ecx
movzx eax, byte [edx+1] ; eax = H = number of blocks in a row of the MCU
push dword [edx+28]
push edi
.coeff_x_loop:
; a block that lies completely outside the image has no place in the buffer
cmp dword [edx+40], 0
jl @f
cmp dword [edx+28], 0
jge .realdata
@@:
; in a non-interleaved scan such blocks are not present in the data at all;
; otherwise they must still be decoded, into the scratch buffer dct_coeff
cmp [ebx+jpeg.work.not_interleaved], 0
jnz .norealdata
push eax edi
lea edi, [ebx+jpeg.work.dct_coeff]
call decode_progressive_coeff
pop edi eax
jmp .norealdata
.realdata:
push eax
call decode_progressive_coeff
add edi, 64*2
pop eax
.norealdata:
sub dword [edx+28], 8
sub eax, 1
jnz .coeff_x_loop
pop edi
pop dword [edx+28]
add edi, [edx+8] ; next row of blocks
pop ecx
sub dword [edx+40], 8
sub ecx, 1
jnz .coeff_y_loop
; advance to the next MCU in the same row
movzx eax, byte [edx+1]
shl eax, 3
pop edi
add edi, [edx+4]
pop dword [edx+40]
sub [edx+28], eax
mov [edx+12], edi
add edx, 56
cmp edx, [ebx+jpeg.work.cur_components_end]
jnz .coeff_components_loop
call next_MCU
jc .norst
sub [ebx+jpeg.work.cur_x], 1
jnz .coeff_decode_loop
; end of a row of MCUs
call next_line
lea edx, [ebx+jpeg.work.cur_components]
@@:
; move every coefficient pointer back to the start of the row,
; then down by V rows of blocks
mov eax, [ebx+jpeg.work.max_x]
imul eax, [edx+4]
sub [edx+12], eax
movzx eax, byte [edx]
imul eax, [edx+8]
add [edx+12], eax
add edx, 56
cmp edx, [ebx+jpeg.work.cur_components_end]
jnz @b
sub [ebx+jpeg.work.cur_y], 1
jnz .coeff_decode_loop
; scan finished: go back to marker parsing, further scans may follow
jmp .markers_loop
.norst:
.end4:
jmp .end3
.sos_noprogressive:
; normal mode - unpack JPEG image
mov edi, [ebx+jpeg.work.image]
mov edi, [edi+Image.Data]
mov [ebx+jpeg.work.cur_out_ptr], edi
; N.B. Speed optimization has sense here.
.decode_loop:
; cur_x / cur_y count the MCUs left in the current row / the rows of MCUs left
call decode_MCU
call next_MCU
jc .end4
sub [ebx+jpeg.work.cur_x], 1
jnz .decode_loop
call next_line
sub [ebx+jpeg.work.cur_y], 1
jnz .decode_loop
jmp .markers_loop
get_marker:
; Read the next marker from the input stream.
; in: esi -> data, ebp = number of bytes left
; out: CF=0, al=marker value - ok; esi and ebp are advanced past the marker
;      CF=1 - no marker (input exhausted, or no 0xFF byte where one is expected);
;             esi and ebp are not meaningful in this case
; Only eax (al), esi, ebp and the flags are changed.
	sub	ebp, 1
	jc	.ret
	lodsb
if 1
	cmp	al, 0xFF
	jae	@f
; Some encoders write a segment length that is two less than the true value
; (in JPEG, the length of a marker segment = length of data INCLUDING
; the length field itself). To open such files, allow 2 stray bytes
; before the next marker.
; The two skipped bytes must also be subtracted from the remaining size;
; otherwise ebp stays 2 too large and later bounds checks would accept
; reads past the end of the buffer.
	sub	ebp, 2
	jb	.ret
	lodsb
	lodsb
end if
	cmp	al, 0xFF
	jb	.ret			; CF=1: not a marker prefix
@@:
; a marker may be preceded by any number of 0xFF fill bytes; the first byte
; that differs from 0xFF is the marker value
	sub	ebp, 1
	jc	.ret
	lodsb
	cmp	al, 0xFF
	jz	@b
	clc				; cmp above has set CF, because al < 0xFF
.ret:
	ret
align 16
decode_MCU:
; Decode one MCU and store its pixels into the image.
; in: ebx -> jpeg.work; [ebx+jpeg.work.cur_out_ptr] -> position of the MCU in the
;     image; esi/ebp = input stream (used by decode_data_unit, defined elsewhere)
; out: cur_out_ptr is advanced by cur_block_dx; the working width counter
;      [edx+28] of every component is decreased by 8*H
; Offsets relative to edx refer to the 56-byte component records in cur_components.
lea edx, [ebx+jpeg.work.cur_components]
.components_loop:
; decode each component
push [ebx+jpeg.work.cur_out_ptr]
movzx ecx, byte [edx] ; ecx = V
push dword [edx+40] ; saved: the remaining-height counter is changed below
; we have H_i * V_i blocks of packed data, decode them
.y_loop_1:
push [ebx+jpeg.work.cur_out_ptr]
push ecx
movzx eax, byte [edx+1] ; eax = H
push dword [edx+28] ; saved: the remaining-width counter is changed below
.x_loop_1:
push eax
call decode_data_unit
; a block that lies completely outside the image is decoded but not copied
cmp dword [edx+40], 0
jl .nocopyloop
cmp dword [edx+28], 0
jl .nocopyloop
; now we have decoded block 8*8 in decoded_data
; H_i * V_i packed blocks 8*8 make up one block (8*HMax) * (8*VMax)
; so each pixel in packed block corresponds to HFact * VFact pixels
movzx ecx, byte [edx+2] ; ecx = VFactor = number of vertical replications
push esi ebp ; esi and ebp are used as scratch registers in the copy loops
mov edi, [ebx+jpeg.work.cur_out_ptr]
add edi, [edx+52] ; byte of this component inside the pixel
.y_loop_2:
push ecx edi
; ecx = number of rows to copy, at most 8, fewer at the bottom edge of the
; image: 8 + min(0, [edx+40] - 7 - CF), where CF = (ecx < [edx+44])
cmp ecx, [edx+44]
mov ecx, [edx+40]
sbb ecx, 8-1
sbb eax, eax
and ecx, eax
add ecx, 8
jz .skip_x_loop_2
movzx eax, byte [edx+3] ; eax = HFactor = number of horizontal replications
.x_loop_2:
push eax ecx edi
; eax = min(0, [edx+28] - 7 - CF) = minus the number of columns to skip
; at the right edge of the image
cmp eax, [edx+32]
mov eax, [edx+28]
sbb eax, 8-1
sbb ebp, ebp
and eax, ebp
; ebp = entry point into the unrolled copy below. The offset is 3 bytes per
; skipped column; this relies on "add edi, eax" + "movsb" being assembled
; into 3 bytes.
mov ebp, .copyiter_all
lea esi, [ebx+jpeg.work.decoded_data]
sub ebp, eax
sub ebp, eax
sub ebp, eax
; eax = HIncrement - 1, because movsb itself advances edi by 1
mov eax, [edx+4]
sub eax, 1
.copyloop:
push esi edi
jmp ebp
.copyiter_all:
movsb
repeat 7
add edi, eax
movsb
end repeat
; the two nops make the sequence 24 bytes long, so that the entry offset for
; 8 skipped columns (8*3 = 24) lands right after it and copies nothing
nop
nop
pop edi esi
add edi, [edx+8] ; next output row for this component
add esi, 8 ; next row of the decoded block
sub ecx, 1
jnz .copyloop
pop edi ecx eax
add edi, [ebx+jpeg.work.pixel_size] ; next horizontal replica
sub eax, 1
jnz .x_loop_2
.skip_x_loop_2:
pop edi ecx
add edi, [ebx+jpeg.work.line_size] ; next vertical replica
sub ecx, 1
jnz .y_loop_2
pop ebp esi
.nocopyloop:
; advance to the next block in the row
mov eax, [ebx+jpeg.work.delta_x]
add [ebx+jpeg.work.cur_out_ptr], eax
pop eax
sub dword [edx+28], 8
sub eax, 1
jnz .x_loop_1
; advance to the next row of blocks of this component
pop dword [edx+28]
pop ecx
pop eax ; eax = cur_out_ptr at the start of this row of blocks
sub dword [edx+40], 8
add eax, [ebx+jpeg.work.delta_y]
mov [ebx+jpeg.work.cur_out_ptr], eax
sub ecx, 1
jnz .y_loop_1
; the component is done: restore the height counter and the output pointer,
; account for the 8*H columns consumed by this MCU
movzx eax, byte [edx+1]
pop dword [edx+40]
shl eax, 3
pop [ebx+jpeg.work.cur_out_ptr]
sub dword [edx+28], eax
add edx, 56
cmp edx, [ebx+jpeg.work.cur_components_end]
jb .components_loop
; move the output pointer to the next MCU in the row
mov eax, [ebx+jpeg.work.cur_block_dx]
add [ebx+jpeg.work.cur_out_ptr], eax
ret
align 16
next_MCU:
; Account for one decoded MCU and consume a restart marker when one is due.
; in: ebx -> jpeg.work, esi -> input, ebp = bytes left
; out: CF=0 - ok; CF=1 - an expected restart marker is missing
; changes eax, edx; esi and ebp are advanced when a marker is consumed
add [ebx+jpeg.work.decoded_MCUs], 1
mov eax, [ebx+jpeg.work.restart_interval]
test eax, eax
jz .no_restart ; restart intervals are not used
cmp [ebx+jpeg.work.decoded_MCUs], eax
jb .no_restart
; a restart interval is complete: reset the MCU counter and drop buffered bits
and [ebx+jpeg.work.decoded_MCUs], 0
and [ebx+jpeg.work.huffman_bits], 0
; no marker is expected after the very last MCU (cur_x = cur_y = 1)
cmp [ebx+jpeg.work.cur_x], 1
jnz @f
cmp [ebx+jpeg.work.cur_y], 1
jz .no_restart
@@:
; restart marker must be present
sub ebp, 2
js .error
cmp byte [esi], 0xFF
jnz .error
; restart markers must come in the order RST0, RST1, ..., RST7, RST0, ...
mov al, [ebx+jpeg.work.cur_rst_marker]
inc eax
and al, 7
mov [ebx+jpeg.work.cur_rst_marker], al
add al, 0xD0
cmp [esi+1], al
jnz .error
add esi, 2
; handle restart marker - zero all DC predictions
; (only the low word is cleared; the upper bytes of the dword are kept)
lea edx, [ebx+jpeg.work.cur_components]
@@:
and word [edx+48], 0
add edx, 56
cmp edx, [ebx+jpeg.work.cur_components_end]
jb @b
.no_restart:
clc
ret
.error:
stc
ret
next_line:
; Switch to the next row of MCUs.
; in: ebx -> jpeg.work
; Reloads cur_x from max_x, moves cur_out_ptr back by max_x * cur_block_dx and
; forward by cur_block_dy, and for every component of the scan reloads the width
; counter (+28) from its initial value (+24) and decreases the height counter
; (+40) by 8*V.
; changes eax, edx
	mov	eax, [ebx+jpeg.work.max_x]
	mov	[ebx+jpeg.work.cur_x], eax
	mul	[ebx+jpeg.work.cur_block_dx]
	sub	eax, [ebx+jpeg.work.cur_block_dy]
	sub	[ebx+jpeg.work.cur_out_ptr], eax
	lea	edx, [ebx+jpeg.work.cur_components]
.rewind_component:
	mov	eax, [edx+24]
	mov	[edx+28], eax
	movzx	eax, byte [edx]		; V
	lea	eax, [eax*8]
	sub	[edx+40], eax
	add	edx, 56
	cmp	edx, [ebx+jpeg.work.cur_components_end]
	jb	.rewind_component
	ret
init_limits:
; Set up the MCU counters and the MCU strides for a scan.
; in: ebx -> jpeg.work; for a non-interleaved scan the first record of
;     cur_components must be filled
; out: max_x, max_y, cur_x, cur_y, cur_block_dx, cur_block_dy are set
; changes eax, ecx, edx only for a non-interleaved scan
; defaults (interleaved scan): MCU counts and strides of the whole frame
	push	[ebx+jpeg.work.block_delta_y]
	pop	[ebx+jpeg.work.cur_block_dy]
	push	[ebx+jpeg.work.block_delta_x]
	pop	[ebx+jpeg.work.cur_block_dx]
	push	[ebx+jpeg.work.y_num_blocks]
	pop	[ebx+jpeg.work.max_y]
	push	[ebx+jpeg.work.x_num_blocks]
	pop	[ebx+jpeg.work.max_x]
	cmp	[ebx+jpeg.work.not_interleaved], 0
	jz	.set_counters
; non-interleaved scan: an MCU is one 8*8 block of the only component.
; max_x = number of blocks in a row: the counter at +28 divided by 8 with
; rounding up; one more is added before the division unless HFactor < [+32]
	mov	eax, dword [ebx+jpeg.work.cur_components+28]
	movzx	ecx, byte [ebx+jpeg.work.cur_components+3]	; HFactor
	cmp	cl, [ebx+jpeg.work.cur_components+32]
	sbb	eax, -8
	shr	eax, 3
	mov	[ebx+jpeg.work.max_x], eax
; max_y = the same for the vertical direction
	mov	eax, dword [ebx+jpeg.work.cur_components+40]
	movzx	edx, byte [ebx+jpeg.work.cur_components+2]	; VFactor
	cmp	dl, [ebx+jpeg.work.cur_components+44]
	sbb	eax, -8
	shr	eax, 3
	mov	[ebx+jpeg.work.max_y], eax
; strides between adjacent blocks of this component in the image
	imul	ecx, [ebx+jpeg.work.delta_x]
	mov	[ebx+jpeg.work.cur_block_dx], ecx
	imul	edx, [ebx+jpeg.work.delta_y]
	mov	[ebx+jpeg.work.cur_block_dy], edx
.set_counters:
	push	[ebx+jpeg.work.max_x]
	pop	[ebx+jpeg.work.cur_x]
	push	[ebx+jpeg.work.max_y]
	pop	[ebx+jpeg.work.cur_y]
	ret
;macro get_bit
;{
;local .l1,.l2,.marker
; add cl, cl
; jnz .l1
; sub ebp, 1
; js decode_data_unit.eof
; mov cl, [esi]
; cmp cl, 0xFF
; jnz .l2
;.marker:
; add esi, 1
; sub ebp, 1
; js decode_data_unit.eof
; cmp byte [esi], 0xFF
; jz .marker
; cmp byte [esi], 0
; jnz decode_data_unit.eof
;.l2:
; sub esi, -1
; adc cl, cl
;.l1:
;}
; get_bit: read one bit from the entropy-coded stream.
; in: esi -> input, ebp = bytes left, cl = number of unread bits in ch,
;     ch = those bits, left-aligned
; out: CF = the bit; cl, ch, esi, ebp updated
; stack_depth selects the error label: on end of data or on a marker the macro
; jumps to .eof_pop<stack_depth>, which must be a local label of the routine
; that uses the macro.
macro get_bit stack_depth
{
local .l1,.l2,.marker
sub cl, 1
jns .l1
; no bits left: load the next byte
sub ebp, 1
js .eof_pop#stack_depth
mov ch, [esi]
cmp ch, 0xFF
jnz .l2
; a data byte 0xFF must be followed by a zero byte (after optional 0xFF fill
; bytes); anything else is a marker, which is treated as the end of data
.marker:
add esi, 1
sub ebp, 1
js .eof_pop#stack_depth
cmp byte [esi], 0xFF
jz .marker
cmp byte [esi], 0
jnz .eof_pop#stack_depth
.l2:
add esi, 1
mov cl, 7 ; 7 bits remain after the one returned now
.l1:
add ch, ch ; shift the top bit of ch out into CF
}
; get_bits: read a group of bl bits from the stream and convert it to a signed
; value.
; in: bl = number of bits to read, cl/ch/esi/ebp = stream state as for get_bit
; out: eax = result; cl, ch, esi, ebp updated; ebx preserved; edx is changed,
;      or popped from the stack if restore_edx is "true"
; The conversion at the end appears to be the usual JPEG rule for additional
; bits: a leading 1 bit means the value itself, a leading 0 bit means
; value - (2^bits - 1), and no bits gives 0 - TODO confirm.
; stack_depth_p1 selects the error label used while ebx is saved on the stack
; (one dword more than stack_depth, which is not used by the macro body).
macro get_bits stack_depth,stack_depth_p1,restore_edx
{
local .l1,.l2,.l3,.marker2
; eax collects the bits left-aligned, starting with those still unread in ch
movzx eax, ch
mov dl, cl ; dl = number of valid bits in the most recently added byte
shl eax, 24
neg cl
push ebx
add cl, 24 ; cl = shift that places the next byte right after the collected bits
.l1:
cmp bl, dl
jbe .l2 ; enough bits collected
sub bl, dl ; bl = number of bits still missing
; load the next byte, handling 0xFF in the same way as get_bit
sub ebp, 1
js .eof_pop#stack_depth_p1
mov ch, [esi]
cmp ch, 0xFF
jnz .l3
.marker2:
add esi, 1
sub ebp, 1
js .eof_pop#stack_depth_p1
cmp byte [esi], 0xFF
jz .marker2
cmp byte [esi], 0
jnz .eof_pop#stack_depth_p1
.l3:
movzx edx, ch
add esi, 1
shl edx, cl
sub cl, 8
or eax, edx
mov dl, 8
jmp .l1
.l2:
; drop the consumed bits from ch; dl = number of unread bits left in ch
mov cl, bl
sub dl, bl
shl ch, cl
pop ebx
; CF = 1 if the first bit of the group is 0; rcr moves it into the sign bit,
; so that the arithmetic shift below extends it
cmp eax, 80000000h
rcr eax, 1
mov cl, 31
sub cl, bl
sar eax, cl
mov cl, dl
if restore_edx eq true
pop edx
end if
; final correction of the shifted value; the carry of the first addition
; is used by the second one
add eax, 80000000h
adc eax, 80000000h
}
; macro get_huffman_code
; {
; local .l1
; xor ebx, ebx
; .l1:
; get_bit
; adc ebx, ebx
; mov eax, [eax+4*ebx]
; xor ebx, ebx
; cmp eax, -1
; jz .eof_pop
; cmp eax, 0x1000
; jae .l1
; mov ebx, eax
; }
; get_huffman_code: decode one Huffman-coded symbol.
; in: eax -> Huffman table as built by the DHT handler (level 1: 256 byte pairs
;     at offset 0; deeper levels: 32-byte groups starting at offset 0x200),
;     cl/ch/esi/ebp = stream state as for get_bit
; out: eax = ebx = decoded symbol; cl, ch, esi, ebp updated; edx is changed
; Each table entry is a byte pair: the first byte is compared with the number
; of available bits (code length), the second byte is the symbol, or the number
; of the next-level group when the code is longer than this level can resolve.
; On end of data, on a marker or on an invalid code the macro jumps to
; .eof_pop<stack_depth> (.eof_pop<stack_depth_p1> while one extra dword is on
; the stack).
macro get_huffman_code stack_depth,stack_depth_p1
{
local .l1,.l2,.l3,.l4,.l5,.l6,.nomarker1,.marker1,.nomarker2,.marker2,.nomarker3,.marker3,.done
; 1. (First level in Huffman table) Does the current Huffman code fit in 8 bits
; and have we got enough bits?
movzx ebx, ch
cmp byte [eax+ebx*2], cl
jbe .l1
; 2a. No; load next byte
sub ebp, 1
js .eof_pop#stack_depth
mov ch, [esi]
movzx edx, ch
cmp ch, 0xFF
jnz .nomarker1
; 0xFF in the data must be followed by 0 (see get_bit)
.marker1:
add esi, 1
sub ebp, 1
js .eof_pop#stack_depth
cmp byte [esi], 0xFF
jz .marker1
cmp byte [esi], 0
jnz .eof_pop#stack_depth
.nomarker1:
; append the top bits of the new byte to the cl bits still held in ebx
shr edx, cl
add esi, 1
or ebx, edx
; 3a. (First level in Huffman table, >=8 bits known) Does the current Huffman code fit in 8 bits?
cmp byte [eax+ebx*2], 8
jbe .l2
; length bytes of 0x80 and above (e.g. the 0xFF filler of unused entries)
; are negative as signed values: invalid code
jl .eof_pop#stack_depth
; 4aa. No; go to next level
movzx ebx, byte [eax+ebx*2+1]
mov dl, ch
shl ebx, 5
ror edx, cl
lea ebx, [eax+ebx+0x200] ; ebx -> second-level group
shr edx, 24 ; edx = the 8 bits that follow the first 8 bits (as far as they are known)
push edx
shr edx, 4 ; the upper 4 of them index the second level
; 5aa. (Second level in Huffman table) Does the current Huffman code fit in 12 bits
; and have we got enough bits?
cmp byte [ebx+edx*2], cl
jbe .l3
; 6aaa. No; have we got 12 bits?
cmp cl, 4
jae .l4
; 7aaaa. No; load next byte
pop edx
sub ebp, 1
js .eof_pop#stack_depth
mov ch, [esi]
cmp ch, 0xFF
jnz .nomarker2
.marker2:
add esi, 1
sub ebp, 1
js .eof_pop#stack_depth
cmp byte [esi], 0xFF
jz .marker2
cmp byte [esi], 0
jnz .eof_pop#stack_depth
.nomarker2:
; append the top bits of the new byte to dl; ecx is saved because ch is
; used as a scratch register
push ecx
shr ch, cl
add esi, 1
or dl, ch
pop ecx
push edx
shr edx, 4
; 8aaaa. (Second level in Huffman table) Does the current Huffman code fit in 12 bits?
cmp byte [ebx+edx*2], 4
jbe .l5
jl .eof_pop#stack_depth_p1
; 9aaaaa. No; go to next level
movzx ebx, byte [ebx+edx*2+1]
pop edx
shl ebx, 5
and edx, 0xF ; the lower 4 bits index the third level
lea ebx, [eax+ebx+0x200]
; 10aaaaa. Get current code length and value
; the sequences of this form consume the code bits from ch and leave the
; number of unread bits in cl
sub cl, [ebx+edx*2]
movzx eax, byte [ebx+edx*2+1]
neg cl
shl ch, cl
neg cl
add cl, 8
jmp .done
.l5:
; 9aaaab. Yes; get current code length and value
sub cl, [ebx+edx*2]
movzx eax, byte [ebx+edx*2+1]
neg cl
pop edx
shl ch, cl
neg cl
add cl, 8
jmp .done
.l4:
; 7aaab. Yes; go to next level
movzx ebx, byte [ebx+edx*2+1]
pop edx
shl ebx, 5
and edx, 0xF
lea ebx, [eax+ebx+0x200]
; 8aaab. (Third level in Huffman table) Have we got enough bits?
cmp [ebx+edx*2], cl
jbe .l6
; 9aaaba. No; load next byte
sub ebp, 1
js .eof_pop#stack_depth
mov ch, [esi]
cmp ch, 0xFF
jnz .nomarker3
.marker3:
add esi, 1
sub ebp, 1
js .eof_pop#stack_depth
cmp byte [esi], 0xFF
jz .marker3
cmp byte [esi], 0
jnz .eof_pop#stack_depth
.nomarker3:
push ecx
shr ch, cl
add esi, 1
or dl, ch
pop ecx
; 10aaaba. Get current code length and value
sub cl, [ebx+edx*2]
movzx eax, byte [ebx+edx*2+1]
neg cl
shl ch, cl
neg cl
add cl, 8
jmp .done
.l3:
; 6aab. Yes; get current code length and value
pop eax ; drop the saved bits; eax is reloaded below
.l6:
; 9aaabb. Yes; get current code length and value
sub cl, [ebx+edx*2]
movzx eax, byte [ebx+edx*2+1]
xor cl, 7
shl ch, cl
xor cl, 7
add ch, ch
jmp .done
.l2:
; 3ab. Yes; get current code length and value
sub cl, [eax+ebx*2]
movzx eax, byte [eax+ebx*2+1]
neg cl
shl ch, cl
neg cl
add cl, 8
jmp .done
.l1:
; 3b. Yes; get current code length and value
mov dl, [eax+ebx*2] ; dl = code length
movzx eax, byte [eax+ebx*2+1]
xchg cl, dl
sub dl, cl ; dl = number of bits left after the code
shl ch, cl ; consume the code bits
mov cl, dl
.done:
mov ebx, eax
}
; Decode DCT coefficients for one 8*8 block in progressive mode
; from input stream, given by pointer esi and length ebp
; N.B. Speed optimization has sense here.
align 16
decode_progressive_coeff:
; in: ebx -> jpeg.work, edx -> record of the current component in
;     cur_components, edi -> the 64 coefficients (words) of the block,
;     esi/ebp = input stream
; out: coefficients updated; the stream state is saved in jpeg.work.huffman_bits
; changes eax, ecx; edx and ebx are preserved on normal return
; On end of data the routine does not return: control passes to
; decode_data_unit.eof_pop0 (defined elsewhere) after the temporaries have been
; removed from the stack.
; ecx holds the bit buffer: cl = number of unread bits, ch = the bits
mov ecx, [ebx+jpeg.work.huffman_bits]
cmp [ebx+jpeg.work.ScanStart], 0
jnz .ac
; DC coefficient
cmp [ebx+jpeg.work.ApproxPosHigh], 0
jz .dc_first
; DC coefficient, subsequent passes
; one more bit of precision: read it and OR it in at position ApproxPosLow
xor eax, eax
get_bit 0
adc eax, eax
mov [ebx+jpeg.work.huffman_bits], ecx
mov cl, [ebx+jpeg.work.ApproxPosLow]
shl eax, cl
or [edi], ax
ret
.dc_first:
; DC coefficient, first pass
mov eax, [edx+16] ; eax -> DC Huffman table
push ebx
push edx
; the decoded symbol (in ebx) is the number of additional bits to read
get_huffman_code 2,3
get_bits 2,3,true
pop ebx
; add the prediction and store the low word as the new prediction
add eax, [edx+48]
mov [edx+48], ax
mov [ebx+jpeg.work.huffman_bits], ecx
; the stored coefficient is shifted left by ApproxPosLow
mov cl, [ebx+jpeg.work.ApproxPosLow]
shl eax, cl
mov [edi], ax
ret
.ac:
; AC coefficients
; eax = index of the current coefficient, running from ScanStart to ScanEnd
movzx eax, [ebx+jpeg.work.ScanStart]
cmp al, [ebx+jpeg.work.ScanEnd]
ja .ret
; [edx+52] = number of pending end-of-band blocks; if nonzero, this block
; contains no new coefficients
cmp dword [edx+52], 0
jnz .was_eob
push ebx
.acloop:
push edx
push eax
mov eax, [edx+20] ; eax -> AC Huffman table
get_huffman_code 3,4
pop eax
; ebx = decoded symbol: high nibble = run length, low nibble = number of
; additional bits
test ebx, 15
jz .band
push eax ebx
and ebx, 15
get_bits 4,5,false
pop ebx
xchg eax, [esp] ; eax = coefficient index, [esp] = new coefficient value
shr ebx, 4 ; ebx = number of zero coefficients to skip
mov edx, [esp+8] ; edx -> jpeg.work (the ebx saved at the start of the loop)
.zeroloop1:
; skip ebx coefficients that are still zero; every nonzero coefficient passed
; on the way gets a correction bit
push eax ebx
movzx eax, byte [zigzag+eax]
xor ebx, ebx
cmp word [edi+eax], bx
jz .zeroloop2
get_bit 5
jnc @f
; the correction bit is set: add 1 shl ApproxPosLow to a coefficient whose high
; byte is below 80h (non-negative), subtract it otherwise
push ecx
mov cl, [edx+jpeg.work.ApproxPosLow]
xor ebx, ebx
cmp byte [edi+eax+1], 80h
adc ebx, 0
add ebx, ebx
sub ebx, 1
shl ebx, cl
pop ecx
add [edi+eax], bx
@@:
pop ebx eax
@@:
add eax, 1
cmp al, [edx+jpeg.work.ScanEnd]
ja decode_data_unit.eof_pop3
jmp .zeroloop1
.zeroloop2:
pop ebx eax
sub ebx, 1
jns @b
.nozero1:
; eax = index of the coefficient that receives the new value
pop ebx ; ebx = new value, 0 if there is none
test ebx, ebx
jz @f
push eax
movzx eax, byte [zigzag+eax]
push ecx
mov cl, [edx+jpeg.work.ApproxPosLow]
shl ebx, cl
pop ecx
mov [edi+eax], bx
pop eax
@@:
add eax, 1
cmp al, [edx+jpeg.work.ScanEnd]
pop edx
jbe .acloop
pop ebx
mov [ebx+jpeg.work.huffman_bits], ecx
.ret:
ret
; end of data: remove the given number of dwords from the stack and leave
; through the common handler
.eof_pop5:
pop ebx
.eof_pop4:
pop ebx
.eof_pop3:
pop ebx
.eof_pop2:
pop ebx
.eof_pop1:
pop ebx
.eof_pop0:
jmp decode_data_unit.eof_pop0
.band:
; symbol without additional bits: run length 15 skips zero coefficients
; without storing a new value, any other run length starts a series of
; end-of-band blocks
shr ebx, 4
cmp ebx, 15
jnz .eob
mov edx, [esp+4] ; edx -> jpeg.work
push 0 ; no new value
jmp .zeroloop1
.eob:
pop edx
push eax
; number of blocks in the series = (1 shl ebx) + the value of the next ebx bits
mov eax, 1
test ebx, ebx
jz .eob0
@@:
get_bit 2
adc eax, eax
sub ebx, 1
jnz @b
.eob0:
mov [edx+52], eax
pop eax
pop ebx
.was_eob:
; this block belongs to the series: count it
sub dword [edx+52], 1
cmp al, [ebx+jpeg.work.ScanEnd]
ja .ret2
push edx
.zeroloop3:
; read correction bits for all nonzero coefficients that remain in the band
push eax
movzx eax, byte [zigzag+eax]
xor edx, edx
cmp word [edi+eax], dx
jz @f
get_bit 2
jnc @f
push ecx
mov cl, [ebx+jpeg.work.ApproxPosLow]
xor edx, edx
cmp byte [edi+eax+1], 80h
adc edx, 0
add edx, edx
sub edx, 1
shl edx, cl
pop ecx
add [edi+eax], dx
@@:
pop eax
add eax, 1
cmp al, [ebx+jpeg.work.ScanEnd]
jbe .zeroloop3
pop edx
.ret2:
mov [ebx+jpeg.work.huffman_bits], ecx
ret
handle_progressive:
; For progressive images: convert the collected DCT coefficients into pixels,
; then free the coefficient buffer.
; in: ebx -> jpeg.work
; Does nothing if no coefficient buffer was allocated.
; Components are processed one at a time, each as a non-interleaved scan.
; changes eax, ecx, edx, esi, edi, ebp
cmp [ebx+jpeg.work.dct_buffer], 0
jnz @f
ret
@@:
; information for all components
lea esi, [ebx+jpeg.work.components]
xor ebp, ebp ; ebp = index of the current component
mov ecx, [ebx+jpeg.work.pixel_size] ; ecx = number of components left
.next_component:
; build a single record in cur_components (same layout as in the SOS handler)
lea edi, [ebx+jpeg.work.cur_components]
lodsb ; ComponentID
lodsd
mov ax, 0x0101 ; one block per MCU: V = H = 1
stosd ; db V, db H, db VFactor, db HFactor
xor eax, eax
mov al, byte [edi-1] ; get HFactor
mul byte [ebx+jpeg.work.pixel_size] ; number of components
stosd ; HIncrement_i = HFactor_i * sizeof(pixel)
movzx eax, byte [edi-4-2] ; get VFactor
mul [ebx+jpeg.work.line_size] ; number of components * image width
stosd ; VIncrement_i = VFactor_i * sizeof(row)
lodsb ; quantization table number
and eax, 3
cmp [ebx+jpeg.work.quant_tables_defined+eax], 0
jz .error
shl eax, 8
lea eax, [ebx+jpeg.work.quant_tables+eax]
stosd ; dd QuantizationTable
stosd ; dd DCTable - ignored
mov eax, ebp
mul [ebx+jpeg.work.dct_buffer_size]
add eax, [ebx+jpeg.work.dct_buffer]
stosd ; instead of dd ACTable - pointer to current DCT coefficients
push ecx
mov eax, [ebx+jpeg.work.image]
mov eax, [eax+Image.Width]
movzx ecx, byte [edi-21] ; get HFactor
; cdq ; edx = 0 as a result of previous mul
div ecx
stosd ; dd width / HFactor_i
stosd
xchg eax, ecx
inc eax
sub eax, edx
stosd ; dd HFactor_i+1 - (width % HFactor_i)
mov eax, [ebx+jpeg.work.image]
mov eax, [eax+Image.Height]
movzx ecx, byte [edi-34] ; get VFactor
cdq
div ecx
stosd ; dd height / VFactor_i
stosd
xchg eax, ecx
inc eax
sub eax, edx
stosd ; dd VFactor_i+1 - (height % VFactor_i)
pop ecx
; DCPrediction = 80000000h if the component index has an odd number of set
; bits (indices 1 and 2), and 0 otherwise
xor eax, eax
test ebp, ebp
setnp al
ror eax, 1
stosd ; dd DCPrediction
mov eax, ebp
stosd ; dd ComponentOffset
inc ebp
push ecx
mov [ebx+jpeg.work.cur_components_end], edi
lea edx, [edi-56]
; do IDCT and unpack
mov edi, [ebx+jpeg.work.image]
mov edi, [edi+Image.Data]
mov [ebx+jpeg.work.cur_out_ptr], edi
mov [ebx+jpeg.work.not_interleaved], 1
call init_limits
.decode_loop:
; same loop shape as the sequential decoder, but without next_MCU
; (no restart markers are processed here)
call decode_MCU
sub [ebx+jpeg.work.cur_x], 1
jnz .decode_loop
call next_line
sub [ebx+jpeg.work.cur_y], 1
jnz .decode_loop
pop ecx
dec ecx
jnz .next_component
; image unpacked, return
.error:
; reached both after success and when a quantization table is missing
push [ebx+jpeg.work.dct_buffer]
call [mem.free]
ret
; Support for YCbCr -> RGB conversion
; R = Y + 1.402 * (Cr - 128)
; G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
; B = Y + 1.772 * (Cb - 128)
; When converting YCbCr -> RGB, we need to do some multiplications;
; to be faster, we precalculate the table for all 256 possible values
; Also we approximate fractions with N/65536, this gives sufficient precision
img.initialize.jpeg:
; Fill the four lookup tables used for YCbCr -> RGB conversion.
; Starting at color_table_1, 4 * 256 dwords are written one after another
; (this assumes that color_table_2..4 directly follow color_table_1 - confirm
; at their definitions).
; All registers are preserved (pushad/popad).
; Every table is produced in two halves of 128 entries: the loop steps upwards
; from the value for 128, then the accumulator is negated and the loop runs
; once more for the other 128 entries. The second negation brings the
; accumulator back to its starting value, which ends the outer loop.
;initialize_color_table:
; 1.402 = 1 + 26345/65536, -0.71414 = -46802/65536
; -0.34414 = -22554/65536, 1.772 = 1 + 50594/65536
pushad
mov edi, color_table_1
mov ecx, 128
; 1. Cb -> 1.772*Cb
; eax = integer part (the stored value), dx = fraction in 1/65536 units,
; starting from 1/2 so that the stored integer is rounded
xor eax, eax
mov dx, 8000h
.l1:
push ecx
@@:
stosd
add dx, 50594
adc eax, 1
loop @b
; negate the pair eax:dx
neg dx
adc eax, -1
neg eax
pop ecx
; flags are still those of "neg eax" (pop does not change them)
jnz .l1
; 2. Cb -> -0.34414*Cb
; here the whole 16.16 fixed-point value is stored; the low word starts from dx
mov ax, dx
.l2:
push ecx
@@:
stosd
sub eax, 22554
loop @b
neg eax
pop ecx
cmp ax, dx
jnz .l2
xor eax, eax
; 3. Cr -> -0.71414*Cr
; also stored as a 16.16 fixed-point value
.l3:
push ecx
@@:
stosd
sub eax, 46802
loop @b
neg eax
pop ecx
jnz .l3
; 4. Cr -> 1.402*Cr
; same scheme as table 1; eax = 0 and dx holds the value left by part 1
.l4:
push ecx
@@:
stosd
add dx, 26345
adc eax, 1
loop @b
neg dx
adc eax, -1
neg eax
pop ecx
jnz .l4
popad
ret
; This function is called at the end of image loading.
; Converts the decoded pixels of the image in place, using the lookup tables
; color_table_1..color_table_4.
; in: ebx -> jpeg.work structure
; out: nothing; eax, ecx, edx, esi, edi and ebp are clobbered, ebx is preserved
; Pixels are rewritten only when jpeg.work.pixel_size is 3 or 4; if there is
; no image or the pixel size is anything else, the routine returns at once.
; NOTE(review): both loops count Width*Height down with sub/jnz, so a zero
; product would wrap around; presumably such images never get here - confirm.
convert_to_rgb:
; some checks
mov eax, [ebx+jpeg.work.image]
test eax, eax ; image exists?
jz .ret
cmp byte [ebx+jpeg.work.pixel_size], 3 ; full-color image?
jz .ycc2rgb
cmp byte [ebx+jpeg.work.pixel_size], 4
jz .ycck2rgb
.ret:
ret
.ycc2rgb:
; conversion is needed
; 3 bytes per pixel: Y, Cb, Cr are read and B, G, R are written to the same place
; esi = number of pixels left, edi -> current pixel
mov esi, [eax+Image.Width]
imul esi, [eax+Image.Height]
mov edi, [eax+Image.Data]
push ebx
; N.B. Speed optimization makes sense here.
align 16
.loop:
; mov ebx, [edi]
; mov edx, ebx
; mov ecx, ebx
; movzx ebx, bl ; ebx = Y
; shr edx, 16
; mov eax, ebx
; movzx edx, dl ; edx = Cr
; movzx ecx, ch ; ecx = Cb
; ebx = Y, ecx = Cb, edx = Cr
movzx ebx, byte [edi]
movzx ecx, byte [edi+1]
mov eax, ebx
movzx edx, byte [edi+2]
; B = Y + color_table_1[Cb]
; The clamp to 0..255 is branch-free:
;   cmp eax, 80000000h / sbb / and  - zeroes eax when it is negative
;   cmp eax, 0x100 / sbb / not / or - sets all bits (al = 0xFF) when eax >= 256
; The G chroma term is accumulated in ebp in between the clamp steps.
add eax, [color_table_1+ecx*4]
mov ebp, [color_table_2+ecx*4]
cmp eax, 80000000h
sbb ecx, ecx
and eax, ecx
add ebp, [color_table_3+edx*4]
cmp eax, 0x100
sbb ecx, ecx
not ecx
; the sum of the two 16.16 fixed-point table entries is reduced to an integer
sar ebp, 16
or eax, ecx
mov [edi], al
; G = Y + color_table_2[Cb] + color_table_3[Cr]
lea eax, [ebx+ebp]
cmp eax, 80000000h
sbb ecx, ecx
and eax, ecx
cmp eax, 0x100
sbb ecx, ecx
not ecx
or eax, ecx
mov [edi+1], al
; R = Y + color_table_4[Cr]
mov eax, ebx
add eax, [color_table_4+edx*4]
cmp eax, 80000000h
sbb ecx, ecx
and eax, ecx
cmp eax, 0x100
sbb ecx, ecx
not ecx
or eax, ecx
mov [edi+2], al
add edi, 3
sub esi, 1
jnz .loop
pop ebx
ret
.ycck2rgb:
; conversion is needed
; 4 bytes per pixel. The first three bytes are converted exactly as above;
; each clamped result is then inverted and scaled by the fourth byte
; (presumably the K channel of a YCCK image - confirm). The fourth byte itself
; is left unchanged, and input and output pointers start at the same address.
mov esi, [eax+Image.Width]
imul esi, [eax+Image.Height]
push ebx
; the pixel counter lives on the stack because esi is needed as source pointer
push esi
mov edi, [eax+Image.Data]
mov esi, edi
; N.B. Speed optimization makes sense here.
align 16
.kloop:
; mov ebx, [esi]
; mov edx, ebx
; mov ecx, ebx
; movzx ebx, bl ; ebx = Y
; shr edx, 16
; mov eax, ebx
; movzx edx, dl ; edx = Cr
; movzx ecx, ch ; ecx = Cb
; ebx = Y, ecx = Cb, edx = Cr
movzx ebx, byte [esi]
movzx ecx, byte [esi+1]
mov eax, ebx
movzx edx, byte [esi+2]
; B = Y + color_table_1[Cb]
add eax, [color_table_1+ecx*4]
mov ebp, [color_table_2+ecx*4]
cmp eax, 80000000h
sbb ecx, ecx
and eax, ecx
add ebp, [color_table_3+edx*4]
cmp eax, 0x100
sbb ecx, ecx
not ecx
sar ebp, 16
or eax, ecx
; al = 255 - clamped value; ax = al * fourth byte;
; then ah = (ax + (ax >> 8) + 80h) >> 8, which approximates ax/255 with rounding
xor al, 0xFF
mul byte [esi+3]
add al, ah
adc ah, 0
add al, 80h
adc ah, 0
mov byte [edi], ah
; G = Y + color_table_2[Cb] + color_table_3[Cr]
lea eax, [ebx+ebp]
cmp eax, 80000000h
sbb ecx, ecx
and eax, ecx
cmp eax, 0x100
sbb ecx, ecx
not ecx
or eax, ecx
; same invert-and-scale step as for B
xor al, 0xFF
mul byte [esi+3]
add al, ah
adc ah, 0
add al, 80h
adc ah, 0
mov byte [edi+1], ah
; R = Y + color_table_4[Cr]
mov eax, ebx
add eax, [color_table_4+edx*4]
cmp eax, 80000000h
sbb ecx, ecx
and eax, ecx
cmp eax, 0x100
sbb ecx, ecx
not ecx
or eax, ecx
; same invert-and-scale step as for B
xor al, 0xFF
mul byte [esi+3]
add al, ah
adc ah, 0
add al, 80h
adc ah, 0
mov byte [edi+2], ah
add esi, 4
add edi, 4 ;3
; count down the pixel counter kept on the stack
sub dword [esp], 1
jnz .kloop
; drop the exhausted counter, then restore ebx
pop eax
pop ebx
; release some memory - must succeed because we decrease size
; add ecx, 44+1
; mov edx, ebx
; push 68
; pop eax
; push 20
; pop ebx
; int 0x40
; mov ebx, eax
ret
; Decodes one data unit, that is, an 8*8 block,
; from the input stream given by pointer esi and length ebp:
; reads the coefficients, runs the inverse DCT and stores 64 clamped bytes
; in jpeg.work.decoded_data.
; in: ebx -> jpeg.work structure, edx -> component data
; out: ebx is restored to the jpeg.work pointer, esi is preserved by the IDCT
;      part; eax, ecx and edi are clobbered
; Component fields used here, by offset from edx:
;   +12 pointer to 64 dword floating-point dequantization factors
;   +16 value handed to get_huffman_code for the DC coefficient
;   +20 value handed to get_huffman_code for AC coefficients; in progressive
;       mode, the pointer to the next stored block of 64 word coefficients
;   +48 DC prediction (low word is updated here)
;   +51 byte subtracted from every output sample
; get_huffman_code and get_bits are macros defined outside this part of the
; file; what they do with esi/ebp/ecx/ebx/edx is not visible here.
; On end of data or bad input, control leaves through the .eof_pop* labels.
; N.B. Speed optimization makes sense here.
align 16
decode_data_unit:
; edx -> component data
cmp [ebx+jpeg.work.progressive], 0
jz @f
; progressive mode: the coefficients were stored earlier; take the next
; 64-word block, advance the per-component pointer, and go straight to the IDCT
mov edi, [edx+20]
add dword [edx+20], 64*2
jmp .coeff_decoded
@@:
; clear the 64 word coefficients of the working buffer
lea edi, [ebx+jpeg.work.dct_coeff]
mov ecx, 64*2/4
xor eax, eax
rep stosd
; edi walks the zigzag table; entry 0 belongs to the DC coefficient, so the
; AC scan starts at entry 1
mov edi, zigzag+1
mov ecx, [ebx+jpeg.work.huffman_bits]
; read DC coefficient
push ebx
mov eax, [edx+16]
push edx
get_huffman_code 2,3
get_bits 2,3,true
; only ebx is popped here, so get_bits with 'true' is expected to have
; restored edx already - confirm against the macro definition
pop ebx
; DC values are coded as differences from the previous block of the component
add eax, [edx+48]
mov [ebx+jpeg.work.dct_coeff], ax
; only the low word is written back, bytes +50 and +51 stay untouched
mov [edx+48], ax
; read AC coefficients
push ebx
@@:
mov eax, [edx+20]
push edx
get_huffman_code 2,3
; the decoded symbol is used as: eax >> 4 = number of zero coefficients to
; skip, ebx and 15 = size of the value that follows (0 = no value)
shr eax, 4
and ebx, 15
jz .band
add edi, eax
cmp edi, zigzag+64
; a run past the end of the block is treated as corrupt data; two dwords
; (ebx, edx) are on the stack at this point
jae .eof_pop2
get_bits 2,3,true
; zigzag entry = byte offset of the coefficient inside the 8*8 word matrix;
; [esp] is the jpeg.work pointer saved by "push ebx" above
movzx ebx, byte [edi]
add ebx, [esp]
mov [jpeg.work.dct_coeff+ebx], ax
add edi, 1
cmp edi, zigzag+64
jb @b
jmp .do_idct
.band:
; size 0: a run of 15 means "skip 16 zero coefficients", any other run value
; ends the block
pop edx
cmp al, 15
jnz .do_idct
add edi, 16
cmp edi, zigzag+64
jb @b
; jmp .eof_pop1
.do_idct:
pop ebx
lea edi, [ebx+jpeg.work.dct_coeff]
mov [ebx+jpeg.work.huffman_bits], ecx
; coefficients loaded, now IDCT
.coeff_decoded:
; Pass 1: one iteration per column.
; edi -> column of word coefficients (rows are 16 bytes apart),
; eax -> column of dequantization factors (rows are 32 bytes apart),
; ebx -> column of dword float results in idct_tmp_area (rows 32 bytes apart);
; the column counter is kept on the stack.
mov eax, [edx+12]
add ebx, jpeg.work.idct_tmp_area
push 8
.idct_loop1:
; shortcut: if rows 1..7 of this column are all zero, every output of the
; column equals the dequantized row-0 value
mov cx, word [edi+1*16]
repeat 6
or cx, word [edi+(%+1)*16]
end repeat
jnz .real_transform
fild word [edi]
fmul dword [eax]
fstp dword [ebx]
mov ecx, [ebx]
repeat 7
mov [ebx+%*32], ecx
end repeat
jmp .idct_next1
.real_transform:
; S0,...,S7 - transformed values, s0,...,s7 - sought-for values
; S0,...,S7 are dequantized;
; dequantization table elements were multiplied by [idct_pre_table],
; so S0,S1,... later denote S0/2\sqrt{2},S1*\cos{\pi/16}/2,...
; sqrt2 = \sqrt{2}, cos = 2\cos{\pi/8},
; cos_sum = -2(\cos{\pi/8}+\cos{3\pi/8}), cos_diff = 2(\cos{\pi/8}-\cos{3\pi/8})
; Now formulas:
; s0 = ((S0+S4)+(S2+S6)) + ((S1+S7)+(S3+S5))
; s7 = ((S0+S4)+(S2+S6)) - ((S1+S7)+(S3+S5))
; val0 = ((cos-1)S1-(cos+cos_sum+1)S3+(cos+cos_sum-1)S5-(cos+1)S7)
; s1 = ((S0-S4)+((sqrt2-1)S2-(sqrt2+1)S6)) + val0
; s6 = ((S0-S4)+((sqrt2-1)S2-(sqrt2+1)S6)) - val0
; val1 = (S1+S7-S3-S5)sqrt2 - val0
; s2 = ((S0-S4)-((sqrt2-1)S2-(sqrt2+1)S6)) + val1
; s5 = ((S0-S4)-((sqrt2-1)S2-(sqrt2+1)S6)) - val1
; val2 = (S1-S7)cos_diff - (S1-S3+S5-S7)cos + val1
; s3 = ((S0+S4)-(S2+S6)) - val2
; s4 = ((S0+S4)-(S2+S6)) + val2
; The whole computation uses all eight FPU stack registers.
fild word [edi+3*16]
fmul dword [eax+3*32]
fild word [edi+5*16]
fmul dword [eax+5*32] ; st0=S5,st1=S3
fadd st1,st0
fadd st0,st0
fsub st0,st1 ; st0=S5-S3,st1=S5+S3
fild word [edi+1*16]
fmul dword [eax+1*32]
fild word [edi+7*16]
fmul dword [eax+7*32] ; st0=S7,st1=S1
fsub st1,st0
fadd st0,st0
fadd st0,st1 ; st0=S1+S7,st1=S1-S7,st2=S5-S3,st3=S5+S3
fadd st3,st0
fadd st0,st0
fsub st0,st3 ; st0=S1-S3-S5+S7,st1=S1-S7,st2=S5-S3,st3=S1+S3+S5+S7
fmul [idct_sqrt2]
fld st2
fadd st0,st2
fmul [idct_cos] ; st0=(S1-S3+S5-S7)cos,st1=(S1-S3-S5+S7)sqrt2,
; st2=S1-S7,st3=S5-S3,st4=S1+S3+S5+S7
fxch st2
fmul [idct_cos_diff]
fsub st0,st2 ; st0=(S1-S7)cos_diff - (S1-S3+S5-S7)cos
fxch st3
fmul [idct_cos_sum]
fadd st0,st2 ; st0=(S5-S3)cos_sum+(S1-S3+S5-S7)cos
fsub st0,st4 ; st0=val0
fsub st1,st0 ; st0=val0,st1=val1,st2=(S1-S3+S5-S7)cos,
; st3=(S1-S7)cos_diff-(S1-S3+S5-S7)cos,st4=S1+S3+S5+S7
fxch st2
fstp st0
fadd st2,st0 ; st0=val1,st1=val0,st2=val2,st3=S1+S3+S5+S7
fild word [edi+0*16]
fmul dword [eax+0*32]
fild word [edi+4*16]
fmul dword [eax+4*32] ; st0=S4,st1=S0
fsub st1,st0
fadd st0,st0
fadd st0,st1 ; st0=S0+S4,st1=S0-S4
fild word [edi+6*16]
fmul dword [eax+6*32]
fild word [edi+2*16]
fmul dword [eax+2*32] ; st0=S2,st1=S6
fadd st1,st0
fadd st0,st0
fsub st0,st1 ; st0=S2-S6,st1=S2+S6
fmul [idct_sqrt2]
fsub st0,st1
fsub st3,st0
fadd st0,st0
fadd st0,st3 ; st0=(S0-S4)+((S2-S6)sqrt2-(S2+S6))
; st3=(S0-S4)-((S2-S6)sqrt2-(S2+S6))
fxch st1
fsub st2,st0
fadd st0,st0
fadd st0,st2 ; st0=(S0+S4)+(S2+S6),st1=(S0-S4)+((S2-S6)sqrt2-(S2+S6)),
; st2=(S0+S4)-(S2+S6),st3=(S0-S4)-((S2-S6)sqrt2-(S2+S6))
; st4=val1,st5=val0,st6=val2,st7=S1+S3+S5+S7
fsubr st7,st0
fadd st0,st0
fsub st0,st7
fstp dword [ebx+0*32]
fsubr st4,st0
fadd st0,st0
fsub st0,st4
fstp dword [ebx+1*32]
fadd st4,st0
fadd st0,st0
fsub st0,st4
fstp dword [ebx+3*32]
fsubr st1,st0
fadd st0,st0
fsub st0,st1
fstp dword [ebx+2*32]
fstp dword [ebx+5*32]
fstp dword [ebx+6*32]
fstp dword [ebx+4*32]
fstp dword [ebx+7*32]
.idct_next1:
; advance to the next column in all three arrays
add ebx, 4
add edi, 2
add eax, 4
sub dword [esp], 1
jnz .idct_loop1
; drop the exhausted column counter
pop ecx
; Pass 2: the same butterfly applied to each row of the intermediate floats
; (elements 4 bytes apart, rows 32 bytes apart); fistp replaces every float
; with its integer value in place
sub ebx, 8*4
mov ecx, 8
.idct_loop2:
fld dword [ebx+3*4]
fld dword [ebx+5*4]
fadd st1,st0
fadd st0,st0
fsub st0,st1 ; st0=S5-S3,st1=S5+S3
fld dword [ebx+1*4]
fld dword [ebx+7*4]
fsub st1,st0
fadd st0,st0
fadd st0,st1 ; st0=S1+S7,st1=S1-S7,st2=S5-S3,st3=S5+S3
fadd st3,st0
fadd st0,st0
fsub st0,st3 ; st0=S1-S3-S5+S7,st1=S1-S7,st2=S5-S3,st3=S1+S3+S5+S7
fmul [idct_sqrt2]
fld st2
fadd st0,st2
fmul [idct_cos] ; st0=(S1-S3+S5-S7)cos,st1=(S1-S3-S5+S7)sqrt2,
; st2=S1-S7,st3=S5-S3,st4=S1+S3+S5+S7
fxch st2
fmul [idct_cos_diff]
fsub st0,st2 ; st0=(S1-S7)cos_diff - (S1-S3+S5-S7)cos
fxch st3
fmul [idct_cos_sum]
fadd st0,st2 ; st0=(S5-S3)cos_sum+(S1-S3+S5-S7)cos
fsub st0,st4 ; st0=val0
fsub st1,st0 ; st0=val0,st1=val1,st2=(S1-S3+S5-S7)cos,
; st3=(S1-S7)cos_diff-(S1-S3+S5-S7)cos,st4=S1+S3+S5+S7
fxch st2
fstp st0
fadd st2,st0 ; st0=val1,st1=val0,st2=val2,st3=S1+S3+S5+S7
fld dword [ebx+0*4]
fld dword [ebx+4*4]
fsub st1,st0
fadd st0,st0
fadd st0,st1 ; st0=S0+S4,st1=S0-S4
fld dword [ebx+6*4]
fld dword [ebx+2*4]
fadd st1,st0
fadd st0,st0
fsub st0,st1 ; st0=S2-S6,st1=S2+S6
fmul [idct_sqrt2]
fsub st0,st1
fsub st3,st0
fadd st0,st0
fadd st0,st3 ; st0=(S0-S4)+((S2-S6)sqrt2-(S2+S6))
; st3=(S0-S4)-((S2-S6)sqrt2-(S2+S6))
fxch st1
fsub st2,st0
fadd st0,st0
fadd st0,st2 ; st0=(S0+S4)+(S2+S6),st1=(S0-S4)+((S2-S6)sqrt2-(S2+S6)),
; st2=(S0+S4)-(S2+S6),st3=(S0-S4)-((S2-S6)sqrt2-(S2+S6))
; st4=val1,st5=val0,st6=val2,st7=S1+S3+S5+S7
fsubr st7,st0
fadd st0,st0
fsub st0,st7
fistp dword [ebx+0*4]
fsubr st4,st0
fadd st0,st0
fsub st0,st4
fistp dword [ebx+1*4]
fadd st4,st0
fadd st0,st0
fsub st0,st4
fistp dword [ebx+3*4]
fsubr st1,st0
fadd st0,st0
fsub st0,st1
fistp dword [ebx+2*4]
fistp dword [ebx+5*4]
fistp dword [ebx+6*4]
fistp dword [ebx+4*4]
fistp dword [ebx+7*4]
add ebx, 32
sub ecx, 1
jnz .idct_loop2
; Pass 3: add 128 to each of the 64 integers, clamp to 0..255, subtract the
; per-component byte and store the result in jpeg.work.decoded_data.
; edi starts one byte before the destination because it is incremented
; before each store; esi serves as scratch for the clamp masks.
sub ebx, 32*8
mov ecx, 64
lea edi, [ebx - jpeg.work.idct_tmp_area + jpeg.work.decoded_data - 1]
push esi
.idct_loop3:
mov eax, [ebx]
add ebx, 4
add eax, 80h
; negative -> 0
cmp eax, 80000000h
sbb esi, esi
add edi, 1
and eax, esi
; 256 and above -> al = 0xFF
cmp eax, 100h
sbb esi, esi
not esi
or eax, esi
; presumably 80h for chroma components, so that they come out as signed
; offsets suitable for indexing the colour tables - confirm against the
; component setup code
sub al, [edx+51]
sub ecx, 1
; mov does not change flags, so jnz still tests the result of "sub ecx, 1"
mov [edi], al
jnz .idct_loop3
pop esi
; ebx -> jpeg.work structure again
sub ebx, 64*4 + jpeg.work.idct_tmp_area
; done
ret
; Error exits. The label number is the count of dwords to drop from the stack;
; the bottom-most of them is the saved jpeg.work pointer, which therefore ends
; up in ebx. Labels not referenced in this routine are presumably targets of
; the get_huffman_code/get_bits macros.
.eof_pop3:
pop ebx
.eof_pop2:
pop ebx
.eof_pop1:
pop ebx
.eof_pop0:
; EOF or incorrect data during scanning
; unwind to the stack pointer saved in jpeg.work._esp and continue at the
; common exit code of img.decode.jpg
mov esp, [ebx + jpeg.work._esp]
jmp img.decode.jpg.end
; Stub entry point for JPEG encoding: does no work at all.
; out: eax = 0 (presumably read by the caller as "nothing encoded" - confirm)
; Removes 8 bytes of arguments (two dwords) from the stack on return.
img.encode.jpg:
xor eax, eax
ret 8
; Zigzag scan order table, 64 bytes, generated at assembly time.
; Entry k is the byte offset, inside an 8*8 matrix of words, of the k-th
; coefficient in zigzag order (hence the factor 2).
; decode_data_unit starts its AC scan at zigzag+1 and stops at zigzag+64.
zigzag:
; (x,y) -> 2*(x+y*8)
; first 8 anti-diagonals, of lengths 1..8; .cur is the diagonal number and
; its parity selects the direction in which the diagonal is walked
repeat 8
.cur = %
if .cur and 1
repeat %
db 2*((%-1) + (.cur-%)*8)
end repeat
else
repeat %
db 2*((.cur-%) + (%-1)*8)
end repeat
end if
end repeat
; remaining 7 anti-diagonals, of lengths 7..1
repeat 7
.cur = %
if .cur and 1
repeat 8-%
db 2*((%+.cur-1) + (8-%)*8)
end repeat
else
repeat 8-%
db 2*((8-%) + (%+.cur-1)*8)
end repeat
end if
end repeat
; Single-precision constants for the inverse DCT, dword-aligned.
align 4
; Per-index scale factors; according to the comments in decode_data_unit,
; the dequantization table elements are multiplied by these values in advance
idct_pre_table:
; c_0 = 1/(2\sqrt{2}), c_i = cos(i*\pi/16)/2
dd 0.35355339, 0.49039264, 0.461939766, 0.41573481
dd 0.35355339, 0.27778512, 0.19134172, 0.09754516
; multipliers used directly by the butterfly code in decode_data_unit
idct_sqrt2 dd 1.41421356 ; \sqrt{2}
idct_cos dd 1.847759065 ; 2\cos{\pi/8}
idct_cos_sum dd -2.61312593 ; -2(\cos{\pi/8} + \cos{3\pi/8})
idct_cos_diff dd 1.08239220 ; 2(\cos{\pi/8} - \cos{3\pi/8})
;---------------------------------------------------------------------