Add initial libunicode for UTF-8 parsing and example
- Added libunicode.asm to parse UTF-8 strings. - Implemented count_utf8_codepoints to skip continuation bytes. - Implemented count_utf8_graphemes to handle ZWJ (E2 80 8D) and combining marks (CC/CD). - Added console.asm to the examples folder to test and print the results. - Submitted for GSoC qualification task.
This commit is contained in:
81
programs/develop/libraries/libunicode/libunicode.asm
Normal file
81
programs/develop/libraries/libunicode/libunicode.asm
Normal file
@@ -0,0 +1,81 @@
|
||||
;=============================================================
; count_utf8_codepoints
;   Counts the code points in a zero-terminated UTF-8 string.
;   Each code point is counted once, at its first byte; bytes of
;   the form 10xxxxxx (continuation bytes) are skipped. The input
;   is not validated: a stray continuation byte is simply ignored.
;
; In:    eax = address of the zero-terminated string
; Out:   eax = number of code points
; Clobb: ecx, edx, flags (ebx, esi, edi, ebp are left untouched)
;=============================================================
count_utf8_codepoints:
        xor     edx, edx                ; edx = code points seen so far

.next_byte:
        mov     cl, byte [eax]
        test    cl, cl                  ; terminating zero byte?
        jz      .finished

        and     cl, 0xC0                ; keep only the two top bits
        cmp     cl, 0x80                ; 10xxxxxx = continuation byte
        je      .advance                ; part of a code point already counted
        inc     edx                     ; any other byte starts a code point

.advance:
        inc     eax
        jmp     .next_byte

.finished:
        mov     eax, edx
        ret
|
||||
|
||||
;=============================================================
; count_utf8_graphemes (also reachable as count_utf8_gramphene)
;   Approximate count of user-perceived characters (graphemes)
;   in a zero-terminated UTF-8 string. This applies a small set
;   of rules, not the full Unicode segmentation algorithm:
;     * every code point starts a new grapheme, except
;     * combining diacritical marks U+0300..U+036F
;       (bytes CC 80..BF and CD 80..AF), which attach to the
;       grapheme before them and are not counted;
;     * ZERO WIDTH JOINER U+200D (bytes E2 80 8D), which is not
;       counted itself and, when a grapheme precedes it, merges
;       the code point that follows into that grapheme.
;   A joiner with nothing before it or nothing after it has no
;   effect on the count. The input is not validated.
;
; In:    eax = address of the zero-terminated string
; Out:   eax = number of graphemes
; Clobb: ecx, edx, flags (ebx, esi, edi, ebp are left untouched)
;=============================================================
count_utf8_graphemes:                   ; correctly spelled alias
count_utf8_gramphene:                   ; original name, kept for callers
        xor     edx, edx                ; edx = graphemes seen so far

.next_byte:
        mov     cl, byte [eax]
        test    cl, cl                  ; terminating zero byte?
        jz      .finished

        ; --- combining diacritical marks U+0300..U+036F ---
        cmp     cl, 0xCC                ; CC xx = U+0300..U+033F, all combining
        je      .advance
        cmp     cl, 0xCD
        jne     .check_zwj
        cmp     byte [eax+1], 0xB0      ; CD 80..AF = U+0340..U+036F, combining
        jb      .advance
        jmp     .count                  ; CD B0..BF = U+0370..U+037F, Greek

        ; --- zero width joiner, E2 80 8D ---
.check_zwj:
        cmp     cl, 0xE2
        jne     .plain
        cmp     byte [eax+1], 0x80      ; safe: [eax] is non-zero
        jne     .plain
        cmp     byte [eax+2], 0x8D      ; safe: [eax+1] is non-zero
        jne     .plain

        add     eax, 3                  ; step over the joiner itself
        test    edx, edx
        jz      .next_byte              ; no grapheme before it to join to
        cmp     byte [eax], 0
        je      .finished               ; nothing after it to join
        jmp     .advance                ; swallow the first byte of the joined
                                        ; code point uncounted; the loop skips
                                        ; its continuation bytes

        ; --- ordinary byte ---
.plain:
        and     cl, 0xC0                ; keep only the two top bits
        cmp     cl, 0x80                ; 10xxxxxx = continuation byte
        je      .advance

.count:
        inc     edx

.advance:
        inc     eax
        jmp     .next_byte

.finished:
        mov     eax, edx
        ret
|
||||
|
||||
Reference in New Issue
Block a user