Add initial libunicode for UTF-8 parsing and example

- Added libunicode.asm to parse UTF-8 strings.
- Implemented count_utf8_codepoints to skip continuation bytes.
- Implemented count_utf8_graphemes to handle ZWJ (E2 80 8D) and combining marks (CC/CD).
- Added console.asm to the examples folder to test and print the results.
- Submitted for GSoC qualification task.
This commit is contained in:
2026-03-09 13:08:25 +05:30
parent 91dc4d8cad
commit 6de88d5fd0
2 changed files with 151 additions and 0 deletions

View File

@@ -0,0 +1,70 @@
; Example program: prints the code point and grapheme counts of a test
; string to a console window, using the routines from libunicode.asm.
; Syntax: flat assembler (fasm), 32-bit code.

; Emit a flat binary whose file name has no extension.
format binary as ""
use32
; All addresses in the image are computed relative to 0.
org 0x0
; ---- MENUET01 executable header ----
db 'MENUET01'
; Header version 1, entry point (START), end of the image (I_END).
dd 0x01, START, I_END
dd 0x100000 ; Memory requested for the program: 1 MB
dd 0x100000 ; Initial stack pointer: top of that 1 MB
; Two remaining header fields, left at 0 (unused here).
; NOTE(review): in the MENUET01 layout these are presumably the pointers to
; the parameter string and the program path - confirm against KolibriOS docs.
dd 0x0
dd 0x0
; Macro/helper includes. `stdcall` and `dll.Load` used below presumably come
; from proc32.inc / dll.inc, and `mcall` from macros.inc - verify in those files.
include '../../../../proc32.inc'
include '../../../../macros.inc'
include '../../../../dll.inc'
; The library under test. Its code is assembled here, ahead of START;
; execution still begins at START because the header names it as the entry.
include '../libunicode.asm'
START:
        ; Resolve the console.obj imports listed in import_table.
        ; A non-zero eax after dll.Load is treated as failure: skip straight
        ; to EXIT without touching the console.
        stdcall dll.Load, import_table
        test    eax, eax
        jnz     EXIT

        ; Open the console window (-1 for all four size arguments) and
        ; print the greeting. Both imports are called stdcall-style: the
        ; callee removes its own arguments.
        invoke  con_init, -1, -1, -1, -1, window_title
        invoke  con_write_asciiz, my_text

        ; Code point count of test_combo.
        ; count_utf8_* take the string address in eax and return the count in eax.
        mov     eax, test_combo
        call    count_utf8_codepoints
        ; con_printf is called cdecl-style: the caller pops the 8 bytes of arguments.
        cinvoke con_printf, fmt_codepoints, eax

        ; Grapheme count of the same string.
        mov     eax, test_combo
        call    count_utf8_gramphene
        cinvoke con_printf, fmt_graphemes, eax

        ; Release the console, passing 0 as its single argument.
        invoke  con_exit, 0

EXIT:
        mcall   -1                      ; terminate the program (system function -1)
; DATA SECTION
; Title passed to con_init and the greeting printed right after it.
window_title db 'Debug Console', 0
my_text db 'Console loaded successfully!', 10, 0
; con_printf format strings; %d receives the count returned in eax.
fmt_codepoints db "Total Codepoints: %d", 10, 0
fmt_graphemes db "Total Graphemes: %d", 10, 0
; Extra sample strings. Neither unitxt nor test_tech is referenced by the
; code in this file; the bytes of unitxt depend on this source file's encoding.
unitxt db 'AП👨👩👦qwerty', 0
; "cafe" followed by CC 81 = U+0301 COMBINING ACUTE ACCENT.
test_tech db 'c', 'a', 'f', 'e', 0xCC, 0x81, 0
; The string actually measured by START, spelled out byte by byte:
;   'A'            U+0041
;   D0 9F          U+041F  CYRILLIC CAPITAL LETTER PE
;   F0 9F 91 A9    U+1F469 WOMAN
;   E2 80 8D       U+200D  ZERO WIDTH JOINER
;   F0 9F 92 BB    U+1F4BB PERSONAL COMPUTER
;   'e'            U+0065
;   CC 81          U+0301  COMBINING ACUTE ACCENT
; That is 7 code points; with the joiner and the accent attached to their
; neighbours the library counts 4 graphemes.
test_combo db 'A', 0xD0, 0x9F, 0xF0, 0x9F, 0x91, 0xA9, 0xE2, 0x80, 0x8D, 0xF0, 0x9F, 0x92, 0xBB, 'e', 0xCC, 0x81, 0
align 4
; Import table handed to dll.Load in START: one library (console.obj) and
; the four functions called through [con_init], [con_write_asciiz],
; [con_printf] and [con_exit].
import_table:
library console, '/sys/lib/console.obj'
import console, con_init, 'con_init', con_write_asciiz, 'con_write_asciiz', con_exit, 'con_exit', \
con_printf, 'con_printf'
; End of the image; referenced from the header.
I_END:

View File

@@ -0,0 +1,81 @@
;=============================================================
; count_utf8_codepoints
;
; Counts the code points in a NUL-terminated UTF-8 string by counting
; every byte that is NOT a continuation byte (10xxxxxx).
; The input is not validated: stray continuation bytes are ignored and
; a truncated multi-byte sequence still counts as one code point.
;
; In:    eax = address of the NUL-terminated string
; Out:   eax = number of code points
; Clobb: ecx, flags
;        (ebx holds the running count but is saved and restored, so the
;         caller's ebx survives the call)
;
; All labels inside the routine are local (dot-prefixed) so that this
; include file does not leak generic names into the including program.
;=============================================================
count_utf8_codepoints:
        push    ebx
        xor     ebx, ebx                ; ebx = code points counted so far
.next_byte:
        mov     cl, byte [eax]
        test    cl, cl
        jz      .done                   ; NUL terminator ends the scan
        and     cl, 0xC0                ; keep only the two top bits
        cmp     cl, 0x80                ; 10xxxxxx = continuation byte
        je      .advance
        inc     ebx                     ; any other byte starts a code point
.advance:
        inc     eax
        jmp     .next_byte
.done:
        mov     eax, ebx
        pop     ebx
        ret
;=============================================================
; count_utf8_graphemes  (also reachable under the original, misspelled
;                        name count_utf8_gramphene)
;
; Counts user-perceived characters ("graphemes") in a NUL-terminated
; UTF-8 string using a small approximation of the Unicode rules:
;   * every code point starts a new grapheme, except
;   * combining diacritical marks U+0300..U+036F
;     (bytes CC 80..BF and CD 80..AF), which attach to the grapheme
;     before them, and
;   * ZERO WIDTH JOINER U+200D (bytes E2 80 8D), which attaches to the
;     grapheme before it AND glues the next code point onto it.
; A mark or joiner with nothing in front of it forms a grapheme of its
; own, so the result can never go negative and a trailing joiner does
; not erase the grapheme it follows.
; The input is not validated; other combining ranges, variation
; selectors, regional indicators etc. are not handled.
;
; In:    eax = address of the NUL-terminated string
; Out:   eax = number of graphemes
; Clobb: ecx, flags
;        (ebx holds the running count but is saved and restored)
;
; Register roles inside the loop:
;   eax = read pointer
;   ebx = graphemes counted so far
;   cl  = current byte
;   ch  = 1 while a joiner is waiting to absorb the next code point
;=============================================================
count_utf8_graphemes:
count_utf8_gramphene:
        push    ebx
        xor     ebx, ebx
        xor     ecx, ecx                ; ch = 0: no joiner pending
.next_byte:
        mov     cl, byte [eax]
        test    cl, cl
        jz      .done                   ; NUL terminator ends the scan

        cmp     cl, 0xCC                ; CC xx = U+0300..U+033F, all combining
        je      .mark
        cmp     cl, 0xCD                ; CD xx = U+0340..U+037F, only part combining
        je      .maybe_mark_cd
        cmp     cl, 0xE2                ; possible start of E2 80 8D (ZWJ)
        je      .maybe_zwj

.plain:
        and     cl, 0xC0                ; keep only the two top bits
        cmp     cl, 0x80                ; 10xxxxxx = continuation byte
        je      .advance                ; never starts anything, never eats the flag
        test    ch, ch
        jnz     .glued                  ; joined onto the previous grapheme
        inc     ebx                     ; ordinary code point: new grapheme
        jmp     .advance
.glued:
        xor     ch, ch                  ; the joiner has been consumed
.advance:
        inc     eax
        jmp     .next_byte

.maybe_mark_cd:
        ; Only CD 80..AF (U+0340..U+036F) are combining marks;
        ; CD B0..BF are Greek letters and count normally.
        cmp     byte [eax+1], 0x80
        jb      .plain
        cmp     byte [eax+1], 0xB0
        jae     .plain
.mark:
        ; Combining mark: extends the current grapheme. Its continuation
        ; byte is skipped by the generic path on the next iteration.
        test    ebx, ebx
        jnz     .advance
        inc     ebx                     ; nothing to attach to: stands alone
        jmp     .advance

.maybe_zwj:
        ; [eax+2] is only read when [eax+1] is 0x80, i.e. not the
        ; terminator, so this never reads past the end of the string.
        cmp     byte [eax+1], 0x80
        jne     .plain
        cmp     byte [eax+2], 0x8D
        jne     .plain
        add     eax, 3                  ; step over the whole joiner
        test    ebx, ebx
        jz      .lone_zwj
        mov     ch, 1                   ; next code point joins this grapheme
        jmp     .next_byte
.lone_zwj:
        inc     ebx                     ; joiner with nothing before it
        jmp     .next_byte

.done:
        mov     eax, ebx
        pop     ebx
        ret