293 lines
9.7 KiB
PHP
293 lines
9.7 KiB
PHP
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||
|
;; ;;
|
||
|
;; Copyright (C) KolibriOS team 2004-2013. All rights reserved. ;;
|
||
|
;; Distributed under terms of the GNU General Public License ;;
|
||
|
;; ;;
|
||
|
;; Written by CleverMouse ;;
|
||
|
;; ;;
|
||
|
;; GNU GENERAL PUBLIC LICENSE ;;
|
||
|
;; Version 2, June 1991 ;;
|
||
|
;; ;;
|
||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||
|
|
||
|
uglobal
; State of get_byte_utf8 that has to survive between calls, because one
; UTF-8 sequence may be split between two packets.
utf8_bytes_rest rd 1            ; continuation bytes still expected in the current sequence
utf8_char       rd 1            ; bits of the current character collected so far
endg
|
||
|
|
||
|
|
||
|
;get_next_byte:
|
||
|
;; Load next byte from the packet, translating to cp866 if necessary
|
||
|
;; At input esi = pointer to data, edx = limit of data
|
||
|
;; Output is either (translated) byte in al with CF set or CF cleared.
|
||
|
; mov eax, [encoding]
|
||
|
; jmp [get_byte_table+eax*4]
|
||
|
;
|
||
|
;get_byte_cp866:
|
||
|
; cmp esi, edx
|
||
|
; jae .nothing
|
||
|
; lodsb
|
||
|
;.nothing:
|
||
|
; ret
|
||
|
;
|
||
|
;get_byte_cp1251:
|
||
|
; cmp esi, edx
|
||
|
; jae .nothing
|
||
|
; lodsb
|
||
|
; cmp al, 0x80
|
||
|
; jb @f
|
||
|
; and eax, 0x7F
|
||
|
; mov al, [cp1251_table+eax]
|
||
|
;@@:
|
||
|
; stc
|
||
|
;.nothing:
|
||
|
; ret
|
||
|
|
||
|
;-----------------------------------------------------------------------
; get_byte_utf8
; Read the next UTF-8 character from the data and translate it to cp866.
;
; In:    esi = pointer to the next unread byte
;        edx = limit of data (nothing is read once esi >= edx)
; Out:   CF = 1 -> al = cp866 byte; '?' is returned for characters that
;                  have no translation in this routine
;        CF = 0 -> the data ended before a complete character was read
;        esi is advanced past every byte that was consumed
; State: utf8_bytes_rest / utf8_char hold a partially read sequence, so
;        a character split across two calls is still decoded.
; Clobb: eax (only al is meaningful on return), ecx, flags
; NOTE(review): lodsb is used, so the direction flag is presumably
;        expected to be clear - confirm against callers.
;-----------------------------------------------------------------------
get_byte_utf8:
; UTF8 decoding is slightly complicated.
; One character can occupy one or more bytes.
; The boundary in packets theoretically can be anywhere in data,
; so this procedure keeps internal state between calls and handles
; one byte at a time, looping until a character is read or the packet is over.
; Globally, there are two distinct tasks: decode a byte sequence to a unicode
; char and convert this unicode char to our base encoding (that is cp866).

; 1. Check that there is data.
;    JAE is taken only when the compare left CF = 0, so .nothing returns
;    with CF = 0, i.e. "no character".
        cmp     esi, edx
        jae     .nothing
; 2. Load the byte; ecx keeps a zero-extended copy of it.
        lodsb
        movzx   ecx, al
; 3. Bytes in a UTF8 sequence can be of any of three types.
;    0xxxxxxx is a one-byte sequence, i.e. a usual ASCII char.
;    The first byte of a longer sequence is 11xxxxxx, other bytes are 10yyyyyy.
;    After the AND, SF is bit 7 of the byte; PF = 1 for 0xC0 (two bits set)
;    and PF = 0 for 0x80 (one bit set).
        and     al, 0xC0
        jns     .single_byte
        jp      .first_byte
; 4. This byte is not first in a UTF8 sequence.
; 4a. Check that a sequence was started. If not, it is an invalid byte
;     and we simply ignore it.
        cmp     [utf8_bytes_rest], 0
        jz      get_byte_utf8
; 4b. Otherwise it is really the next byte and it gives 6 more bits of the
;     char: eax = (utf8_char << 6) + (byte - 0x80).
        mov     eax, [utf8_char]
        shl     eax, 6
        lea     eax, [eax+ecx-0x80]
; 4c. Decrement the number of bytes left in the sequence.
;     If it goes to zero, the character is complete, so return it.
        dec     [utf8_bytes_rest]
        jz      .got_char
        mov     [utf8_char], eax
        jmp     get_byte_utf8

; 5. If the byte is first in a UTF8 sequence, count its leading 1s.
;    The loop below leaves eax = (number of leading 1s) - 1, which is the
;    number of continuation bytes that follow; the remaining low bits of
;    the byte are the leading bits of the character.
.first_byte:
        mov     eax, -1
@@:
        inc     eax
        add     cl, cl                  ; shift the next bit of the byte into SF
        js      @b
        mov     [utf8_bytes_rest], eax
; cl was shifted left eax+1 times; shift it back by the same amount so that
; only the payload bits remain, with the leading 1s dropped.
        xchg    eax, ecx
        inc     ecx
        shr     al, cl
        mov     [utf8_char], eax
        jmp     get_byte_utf8

; 6. If the byte is an ASCII char, it is the character.
.single_byte:
        xchg    eax, ecx
.got_char:
; We got the character, now abandon a possible sequence in progress.
        and     [utf8_bytes_rest], 0
; Now the second task. The unicode character is in eax, and now we shall
; convert it to cp866.
; Characters below 0x80 are returned unchanged; JB is taken with CF = 1,
; which is the "character returned" result.
        cmp     eax, 0x80
        jb      .done
; 0x410-0x43F -> 0x80-0xAF, 0x440-0x44F -> 0xE0-0xEF, 0x401 -> 0xF0, 0x451 -> 0xF1
        cmp     eax, 0x401
        jz      .YO
        cmp     eax, 0x451
        jz      .yo
        cmp     eax, 0x410
        jb      .unrecognized
        cmp     eax, 0x440
        jb      .part1
        cmp     eax, 0x450
        jb      .part2
; 0x2500-0x259F are looked up in cp866_boxes; everything else is unknown.
        cmp     eax, 0x25a0
        jae     .unrecognized
        sub     eax, 0x2500
        jb      .unrecognized
        mov     al, [cp866_boxes+eax]
; The SUB above did not borrow, so CF is clear here and MOV does not touch
; flags. Set CF explicitly so that this path reports a character like every
; other successful return does.
        stc
        ret

.part1:
; al is 0x10-0x3F here. Subtracting 0x90 (modulo 256) gives 0x80-0xAF and
; always borrows, so CF = 1 on return.
        sub     al, 0x10-0x80
.nothing:
.done:
        ret

.part2:
; al is 0x40-0x4F here. Subtracting 0x60 (modulo 256) gives 0xE0-0xEF and
; always borrows, so CF = 1 on return.
        sub     al, (0x40-0xE0) and 0xFF
        ret

.unrecognized:
        mov     al, '?'
        stc
        ret

.YO:
        mov     al, 0xF0
        stc
        ret

.yo:
        mov     al, 0xF1
        stc
        ret
|
||
|
|
||
|
|
||
|
|
||
|
;recode_to_cp866:
|
||
|
; rep movsb
|
||
|
; ret
|
||
|
;
|
||
|
;recode_to_cp1251:
|
||
|
; xor eax, eax
|
||
|
; jecxz .nothing
|
||
|
; .loop:
|
||
|
; lodsb
|
||
|
; cmp al,0x80
|
||
|
; jb @f
|
||
|
; mov al, [cp866_table-0x80+eax]
|
||
|
; @@: stosb
|
||
|
; loop .loop
|
||
|
; .nothing:
|
||
|
; ret
|
||
|
|
||
|
;-----------------------------------------------------------------------
; recode_to_utf8
; Copy ecx bytes from [esi] to [edi], expanding every byte >= 0x80 into
; the two-byte word stored for it in utf8_table.
;
; In:    esi = source bytes
;        edi = destination buffer
;        ecx = number of source bytes (0 is allowed, nothing is written)
; Out:   esi, edi advanced past the data read / written; ecx = 0
; Clobb: eax, flags
; NOTE(review): one source byte can produce two output bytes; the size of
;        the destination buffer is not checked here - confirm callers
;        reserve enough room.
;-----------------------------------------------------------------------
recode_to_utf8:
        jecxz   .finished
.next_byte:
        lodsb
        cmp     al, 0x80
        jae     .table_lookup
; Bytes below 0x80 are copied unchanged.
        stosb
        jmp     .advance
.table_lookup:
; Index the table with the low 7 bits; each entry is one 16-bit word that
; is written to the output as is.
        and     eax, 0x7F
        mov     ax, [utf8_table+eax*2]
        stosw
.advance:
        loop    .next_byte
.finished:
        ret
|
||
|
|
||
|
;recode:
|
||
|
; mov eax, [encoding]
|
||
|
; jmp [recode_proc+eax*4]
|
||
|
|
||
|
|
||
|
|
||
|
;encoding dd UTF8
|
||
|
;recode_proc dd recode_to_cp866, recode_to_cp1251, recode_to_utf8
|
||
|
;get_byte_table dd get_byte_cp866, get_byte_cp1251, get_byte_utf8
|
||
|
|
||
|
|
||
|
;cp1251_table:
|
||
|
; db '?','?','?','?','?','?','?','?' , '?','?','?','?','?','?','?','?' ; 8
|
||
|
; db '?','?','?','?','?',$F9,'?','?' , '?','?','?','?','?','?','?','?' ; 9
|
||
|
; db '?',$F6,$F7,'?',$FD,'?','?','?' , $F0,'?',$F2,'?','?','?','?',$F4 ; A
|
||
|
; db $F8,'?','?','?','?','?','?',$FA , $F1,$FC,$F3,'?','?','?','?',$F5 ; B
|
||
|
; db $80,$81,$82,$83,$84,$85,$86,$87 , $88,$89,$8A,$8B,$8C,$8D,$8E,$8F ; C
|
||
|
; db $90,$91,$92,$93,$94,$95,$96,$97 , $98,$99,$9A,$9B,$9C,$9D,$9E,$9F ; D
|
||
|
; db $A0,$A1,$A2,$A3,$A4,$A5,$A6,$A7 , $A8,$A9,$AA,$AB,$AC,$AD,$AE,$AF ; E
|
||
|
; db $E0,$E1,$E2,$E3,$E4,$E5,$E6,$E7 , $E8,$E9,$EA,$EB,$EC,$ED,$EE,$EF ; F
|
||
|
|
||
|
; 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||
|
|
||
|
; Translation table used by recode_to_utf8: one 16-bit word per byte
; 0x80-0xFF, indexed by (byte - 0x80). The word is written to the output
; low byte first, so e.g. the word 0x90D0 produces the bytes D0 90.
utf8_table:
        times 0x80 dw 0x98C3            ; default placeholder for every entry

; 0x80-0xAF -> 0x90D0-0xBFD0
repeat 0x30
        store word 0x90D0 + ((%-1) shl 8) at utf8_table + 2*(0x80-0x80+%-1)
end repeat

; 0xE0-0xEF -> 0x80D1-0x8FD1
repeat 0x10
        store word 0x80D1 + ((%-1) shl 8) at utf8_table + 2*(0xE0-0x80+%-1)
end repeat

; 0xF0 -> 0x81D0, 0xF1 -> 0x91D1
        store word 0x81D0 at utf8_table + 2*(0xF0-0x80)
        store word 0x91D1 at utf8_table + 2*(0xF1-0x80)
|
||
|
|
||
|
;cp866_table:
|
||
|
; db $C0,$C1,$C2,$C3,$C4,$C5,$C6,$C7 , $C8,$C9,$CA,$CB,$CC,$CD,$CE,$CF ; 8
|
||
|
; db $D0,$D1,$D2,$D3,$D4,$D5,$D6,$D7 , $D8,$D9,$DA,$DB,$DC,$DD,$DE,$DF ; 9
|
||
|
; db $E0,$E1,$E2,$E3,$E4,$E5,$E6,$E7 , $E8,$E9,$EA,$EB,$EC,$ED,$EE,$EF ; A
|
||
|
; db '?','?','?','?','?','?','?','?' , '?','?','?','?','?','?','?','?' ; B
|
||
|
; db '?','?','?','?','?','?','?','?' , '?','?','?','?','?','?','?','?' ; C
|
||
|
; db '?','?','?','?','?','?','?','?' , '?','?','?','?','?','?','?','?' ; D
|
||
|
; db $F0,$F1,$F2,$F3,$F4,$F5,$F6,$F7 , $F8,$F9,$FA,$FB,$FC,$FD,$FE,$FF ; E
|
||
|
; db $A8,$B8,$AA,$BA,$AF,$BF,$A1,$A2 , $B0,$95,$B7,'?',$B9,$A4,'?','?' ; F
|
||
|
|
||
|
; 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||
|
|
||
|
|
||
|
; Lookup table used by get_byte_utf8 for unicode 0x2500-0x259F:
; entry [N] is the cp866 byte for the character 0x2500+N.
; It covers the cp866 codes 0xB0-0xDF; every other entry stays '?'.
; The entries below are listed in ascending unicode order.
cp866_boxes:
        times 0xA0 db '?'

; U+2500-U+253C
        store byte 0xC4 at cp866_boxes+0x00     ; U+2500
        store byte 0xB3 at cp866_boxes+0x02     ; U+2502
        store byte 0xDA at cp866_boxes+0x0C     ; U+250C
        store byte 0xBF at cp866_boxes+0x10     ; U+2510
        store byte 0xC0 at cp866_boxes+0x14     ; U+2514
        store byte 0xD9 at cp866_boxes+0x18     ; U+2518
        store byte 0xC3 at cp866_boxes+0x1C     ; U+251C
        store byte 0xB4 at cp866_boxes+0x24     ; U+2524
        store byte 0xC2 at cp866_boxes+0x2C     ; U+252C
        store byte 0xC1 at cp866_boxes+0x34     ; U+2534
        store byte 0xC5 at cp866_boxes+0x3C     ; U+253C

; U+2550-U+256C
        store byte 0xCD at cp866_boxes+0x50     ; U+2550
        store byte 0xBA at cp866_boxes+0x51     ; U+2551
        store byte 0xD5 at cp866_boxes+0x52     ; U+2552
        store byte 0xD6 at cp866_boxes+0x53     ; U+2553
        store byte 0xC9 at cp866_boxes+0x54     ; U+2554
        store byte 0xB8 at cp866_boxes+0x55     ; U+2555
        store byte 0xB7 at cp866_boxes+0x56     ; U+2556
        store byte 0xBB at cp866_boxes+0x57     ; U+2557
        store byte 0xD4 at cp866_boxes+0x58     ; U+2558
        store byte 0xD3 at cp866_boxes+0x59     ; U+2559
        store byte 0xC8 at cp866_boxes+0x5A     ; U+255A
        store byte 0xBE at cp866_boxes+0x5B     ; U+255B
        store byte 0xBD at cp866_boxes+0x5C     ; U+255C
        store byte 0xBC at cp866_boxes+0x5D     ; U+255D
        store byte 0xC6 at cp866_boxes+0x5E     ; U+255E
        store byte 0xC7 at cp866_boxes+0x5F     ; U+255F
        store byte 0xCC at cp866_boxes+0x60     ; U+2560
        store byte 0xB5 at cp866_boxes+0x61     ; U+2561
        store byte 0xB6 at cp866_boxes+0x62     ; U+2562
        store byte 0xB9 at cp866_boxes+0x63     ; U+2563
        store byte 0xD1 at cp866_boxes+0x64     ; U+2564
        store byte 0xD2 at cp866_boxes+0x65     ; U+2565
        store byte 0xCB at cp866_boxes+0x66     ; U+2566
        store byte 0xCF at cp866_boxes+0x67     ; U+2567
        store byte 0xD0 at cp866_boxes+0x68     ; U+2568
        store byte 0xCA at cp866_boxes+0x69     ; U+2569
        store byte 0xD8 at cp866_boxes+0x6A     ; U+256A
        store byte 0xD7 at cp866_boxes+0x6B     ; U+256B
        store byte 0xCE at cp866_boxes+0x6C     ; U+256C

; U+2580-U+2593
        store byte 0xDF at cp866_boxes+0x80     ; U+2580
        store byte 0xDC at cp866_boxes+0x84     ; U+2584
        store byte 0xDB at cp866_boxes+0x88     ; U+2588
        store byte 0xDD at cp866_boxes+0x8C     ; U+258C
        store byte 0xDE at cp866_boxes+0x90     ; U+2590
        store byte 0xB0 at cp866_boxes+0x91     ; U+2591
        store byte 0xB1 at cp866_boxes+0x92     ; U+2592
        store byte 0xB2 at cp866_boxes+0x93     ; U+2593
|