Add initial libunicode for UTF-8 parsing and example
- Added libunicode.asm to parse UTF-8 strings. - Implemented count_utf8_codepoints to skip continuation bytes. - Implemented count_utf8_graphemes to handle ZWJ (E2 80 8D) and combining marks (CC/CD). - Added console.asm to the examples folder to test and print the results. - Submitted for GSoC qualification task.
This commit is contained in:
70
programs/develop/libraries/libunicode/examples/console.asm
Normal file
70
programs/develop/libraries/libunicode/examples/console.asm
Normal file
@@ -0,0 +1,70 @@
|
|||||||
|
format binary as ""

use32
org 0

; Executable header. The field comments below follow the KolibriOS
; MENUET01 header layout (NOTE(review): confirm against the OS docs).
        db      'MENUET01'              ; signature
        dd      1                       ; header version
        dd      START                   ; entry point
        dd      I_END                   ; end of the loaded image
        dd      0x100000                ; memory for the program: 1 MB
        dd      0x100000                ; initial stack pointer (top of that 1 MB)
        dd      0                       ; not used by this example (presumably parameters pointer)
        dd      0                       ; not used by this example (presumably path pointer)
|
||||||
|
|
||||||
|
include '../../../../proc32.inc'
|
||||||
|
include '../../../../macros.inc'
|
||||||
|
include '../../../../dll.inc'
|
||||||
|
include '../libunicode.asm'
|
||||||
|
|
||||||
|
START:
        ; Load console.obj and resolve the entries listed in import_table.
        stdcall dll.Load, import_table
        test    eax, eax
        jnz     EXIT                    ; non-zero result: skip the demo and terminate

        ; Open the console window; the four size arguments are all -1.
        invoke  con_init, -1, -1, -1, -1, window_title
        invoke  con_write_asciiz, my_text

        ; Code point count of the sample string.
        mov     eax, test_combo
        call    count_utf8_codepoints   ; eax = count (ebx/ecx are overwritten)
        cinvoke con_printf, fmt_codepoints, eax

        ; Grapheme count of the same sample string.
        mov     eax, test_combo
        call    count_utf8_gramphene    ; eax = count (ebx/ecx are overwritten)
        cinvoke con_printf, fmt_graphemes, eax

        invoke  con_exit, 0

EXIT:
        mcall   -1                      ; terminate the program
|
||||||
|
|
||||||
|
; -------------------------------------------------------------
; DATA SECTION
; -------------------------------------------------------------

; Console strings (zero-terminated; 10 = line feed).
window_title    db 'Debug Console', 0
my_text         db 'Console loaded successfully!', 10, 0
fmt_codepoints  db "Total Codepoints: %d", 10, 0
fmt_graphemes   db "Total Graphemes: %d", 10, 0

; Extra sample strings; the code shown in this example does not reference them.
unitxt          db 'AП👨👩👦qwerty', 0
test_tech       db 'cafe', 0xCC, 0x81, 0            ; "cafe" + U+0301

; Sample passed to both counters, written one code point per line.
test_combo      db 'A'                              ; U+0041
                db 0xD0, 0x9F                       ; U+041F
                db 0xF0, 0x9F, 0x91, 0xA9           ; U+1F469
                db 0xE2, 0x80, 0x8D                 ; U+200D zero width joiner
                db 0xF0, 0x9F, 0x92, 0xBB           ; U+1F4BB
                db 'e'                              ; U+0065
                db 0xCC, 0x81                       ; U+0301 combining acute accent
                db 0                                ; terminator
|
||||||
|
|
||||||
|
align 4
import_table:
; Library to load and the entry points this example calls from it.
library console, '/sys/lib/console.obj'

import  console, \
        con_init,         'con_init', \
        con_write_asciiz, 'con_write_asciiz', \
        con_exit,         'con_exit', \
        con_printf,       'con_printf'
|
||||||
|
|
||||||
|
I_END:
|
||||||
81
programs/develop/libraries/libunicode/libunicode.asm
Normal file
81
programs/develop/libraries/libunicode/libunicode.asm
Normal file
@@ -0,0 +1,81 @@
|
|||||||
|
;=============================================================
; count_utf8_codepoints
;   Counts the code points of a zero-terminated UTF-8 string.
;   Every byte that is not a continuation byte (10xxxxxx) is
;   counted as the start of one code point.
;
; In:     eax = pointer to the zero-terminated string
; Out:    eax = number of code points
; Clobb:  ebx (running count), cl (current byte), flags
;   * Do not keep live values in other registers across the
;     call; they may be overwritten.
;
; The input is not validated: malformed sequences are counted
; by their non-continuation bytes.
;
; Only local (dot) labels are used, so that including this file
; does not define generic global names such as "done".
;=============================================================
count_utf8_codepoints:
        xor     ebx, ebx                ; ebx = code points seen so far

.next_byte:
        mov     cl, byte [eax]
        test    cl, cl                  ; terminating zero?
        jz      .finished

        and     cl, 0xC0                ; keep the two top bits
        cmp     cl, 0x80                ; 10xxxxxx = continuation byte
        je      .advance                ; its code point was counted at the lead byte
        inc     ebx

.advance:
        inc     eax
        jmp     .next_byte

.finished:
        mov     eax, ebx
        ret
|
||||||
|
|
||||||
|
;=============================================================
; count_utf8_gramphene  (alias: count_utf8_graphemes)
;   Approximate grapheme count of a zero-terminated UTF-8
;   string. A code point starts a new grapheme unless it is
;     - a combining mark U+0300..U+036F (CC 80..BF, CD 80..AF),
;     - a zero width joiner U+200D (E2 80 8D), or
;     - the first code point after a zero width joiner that
;       itself followed an already counted grapheme.
;
; In:     eax = pointer to the zero-terminated string
; Out:    eax = number of graphemes
; Clobb:  ebx (running count), ecx (cl = current byte,
;         ch = 1 while a joiner waits for its right-hand side),
;         flags
;   * Do not keep live values in other registers across the
;     call; they may be overwritten.
;
; Notes:
;   * This is a simplification of the Unicode segmentation
;     rules: only the marks and the joiner listed above are
;     recognised, and a joiner glues ANY following code point.
;   * A joiner never lowers the count. A leading, trailing or
;     repeated joiner therefore cannot underflow the result.
;   * A mark or joiner with no preceding grapheme is not counted.
;   * The input is not validated as UTF-8.
;   * Only local (dot) labels are used, so that including this
;     file does not define generic global names.
;=============================================================
count_utf8_graphemes:                   ; correctly spelled alias, same entry point
count_utf8_gramphene:
        xor     ebx, ebx                ; ebx = graphemes counted so far
        xor     ecx, ecx                ; ch = 0: no joiner pending

.next_byte:
        mov     cl, byte [eax]
        test    cl, cl                  ; terminating zero?
        jz      .finished

        ; --- combining marks U+0300..U+036F -----------------
        cmp     cl, 0xCC                ; CC xx = U+0300..U+033F, all combining
        je      .advance
        cmp     cl, 0xCD
        jne     .check_zwj
        ; CD 80..AF = U+0340..U+036F (combining), CD B0..BF = U+0370..
        ; (ordinary letters). Valid second bytes are >= 0x80, so only
        ; the upper bound is tested. [eax+1] is readable because
        ; [eax] is not the terminator.
        cmp     byte [eax+1], 0xAF
        jbe     .advance

.check_zwj:
        ; --- zero width joiner E2 80 8D ---------------------
        ; Each following byte is only read after the previous one
        ; matched a non-zero value, so the terminator is never passed.
        cmp     cl, 0xE2
        jne     .classify
        cmp     byte [eax+1], 0x80
        jne     .classify
        cmp     byte [eax+2], 0x8D
        jne     .classify

        add     eax, 3                  ; step over the joiner
        test    ebx, ebx
        jz      .next_byte              ; nothing counted yet: nothing to join to
        mov     ch, 1                   ; next base code point joins the previous grapheme
        jmp     .next_byte

.classify:
        and     cl, 0xC0                ; keep the two top bits
        cmp     cl, 0x80                ; 10xxxxxx = continuation byte
        je      .advance

        test    ch, ch                  ; joined to the previous grapheme?
        jnz     .absorb
        inc     ebx                     ; start of a new grapheme
        jmp     .advance

.absorb:
        xor     ch, ch                  ; the joiner has been consumed

.advance:
        inc     eax
        jmp     .next_byte

.finished:
        mov     eax, ebx
        ret
|
||||||
|
|
||||||
Reference in New Issue
Block a user