Add initial libunicode for UTF-8 parsing and example

- Added libunicode.asm to parse UTF-8 strings. - Implemented count_utf8_codepoints, which counts codepoints by skipping continuation bytes. - Implemented count_utf8_gramphene (the grapheme counter) to handle ZWJ (E2 80 8D) and combining marks (lead bytes CC/CD). - Added console.asm to the examples folder to test and print the results. - Submitted for GSoC qualification task.
2026-03-09 13:08:25 +05:30
parent 91dc4d8cad
commit 6de88d5fd0
2 changed files with 151 additions and 0 deletions
--- a/programs/develop/libraries/libunicode/examples/console.asm
+++ b/programs/develop/libraries/libunicode/examples/console.asm
@@ -0,0 +1,70 @@
+format binary as ""     ; flat binary output; the empty string requests no file extension
+use32                   ; assemble 32-bit code
+org 0x0                 ; label addresses are computed from offset 0
+
+db 'MENUET01'           ; 8-byte signature that opens the header
+dd 0x01, START, I_END   ; 1, the START label, the I_END label (presumably version / entry / image end - confirm against the header spec)
+dd 0x100000 ; 1MB Memory
+dd 0x100000 ; Stack pointer (same value as the memory size above)
+dd 0x0                  ; zero field; presumably the parameters pointer, unused - TODO confirm
+dd 0x0                  ; zero field; presumably the path pointer, unused - TODO confirm
+
+include '../../../../proc32.inc'
+include '../../../../macros.inc'
+include '../../../../dll.inc'
+include '../libunicode.asm'
+
+START:
+    ; Resolve the console.obj imports listed in import_table.
+    ; A nonzero eax after dll.Load is treated as failure: skip straight
+    ; to EXIT without touching the console.
+    stdcall dll.Load, import_table
+    test    eax, eax
+    jnz     EXIT
+
+    ; Open the console. invoke pushes its arguments right to left, so
+    ; window_title goes on the stack first and the four -1 values after it.
+    invoke  con_init, -1, -1, -1, -1, window_title
+
+    ; Greeting line.
+    invoke  con_write_asciiz, my_text
+
+    ; Codepoints in test_combo: pointer goes in via eax, count comes back
+    ; in eax.
+    mov     eax, test_combo
+    call    count_utf8_codepoints
+
+    ; cinvoke removes the two pushed arguments (8 bytes) after the call.
+    cinvoke con_printf, fmt_codepoints, eax
+
+    ; Same string again, this time counted in graphemes.
+    mov     eax, test_combo
+    call    count_utf8_gramphene
+
+    cinvoke con_printf, fmt_graphemes, eax
+
+    ; Hand the console back; 0 is the single argument con_exit receives.
+    invoke  con_exit, 0
+
+EXIT:
+    ; Reached by fall-through on the normal path and by the jnz above
+    ; when the library could not be loaded; mcall -1 ends the program.
+    mcall   -1
+
+; DATA SECTION - zero-terminated strings; the 10 before the terminator is a line feed
+
+window_title    db 'Debug Console', 0                          ; first value pushed for con_init
+my_text         db 'Console loaded successfully!', 10, 0       ; printed with con_write_asciiz
+fmt_codepoints  db "Total Codepoints: %d", 10, 0               ; con_printf format for the codepoint count
+fmt_graphemes   db "Total  Graphemes: %d", 10, 0               ; con_printf format for the grapheme count
+unitxt          db 'AП👨‍👩‍👦qwerty', 0                           ; not referenced by START
+test_tech       db 'c', 'a', 'f', 'e', 0xCC, 0x81, 0           ; "cafe" + U+0301 (CC 81); not referenced by START
+test_combo db 'A', 0xD0, 0x9F, 0xF0, 0x9F, 0x91, 0xA9, 0xE2, 0x80, 0x8D, 0xF0, 0x9F, 0x92, 0xBB, 'e', 0xCC, 0x81, 0 ; A, U+041F, U+1F469, ZWJ U+200D, U+1F4BB, e, U+0301: 7 codepoints
+
+align 4                 ; put the import table on a 4-byte boundary
+import_table:           ; passed to dll.Load in START
+library console, '/sys/lib/console.obj'
+import console, con_init, 'con_init', con_write_asciiz, 'con_write_asciiz', con_exit, 'con_exit', \
+                con_printf, 'con_printf'
+
+I_END:                  ; end of the image; referenced from the header at the top of the file
--- a/programs/develop/libraries/libunicode/libunicode.asm
+++ b/programs/develop/libraries/libunicode/libunicode.asm
@@ -0,0 +1,81 @@
+;=============================================================
+;   count_utf8_codepoints
+;   Counts the codepoints in a zero-terminated UTF-8 string.
+;   Every byte that is not a continuation byte (10xxxxxx) is
+;   taken as the start of a codepoint; input is not validated.
+;   In:    eax = pointer to the string
+;   Out:   eax = number of codepoints
+;   Clobb: ecx, flags (ebx is saved and restored)
+;   Labels are local (.name) so including files can reuse names.
+;=============================================================
+count_utf8_codepoints:
+        push    ebx
+        xor     ebx, ebx                ; ebx = codepoints counted so far
+.next_byte:
+        mov     cl, byte [eax]
+        test    cl, cl                  ; terminating zero?
+        jz      .done
+        and     cl, 0xC0                ; keep the two top bits
+        cmp     cl, 0x80                ; 10xxxxxx = continuation byte
+        je      .advance
+        inc     ebx                     ; ASCII or lead byte
+.advance:
+        inc     eax
+        jmp     .next_byte
+.done:
+        mov     eax, ebx
+        pop     ebx
+        ret
+
+;=============================================================
+;   count_utf8_gramphene: approximate grapheme count of a
+;   zero-terminated UTF-8 string. Lead bytes CC/CD (combining
+;   marks) and ZWJ (E2 80 8D) extend the previous grapheme; ZWJ
+;   also glues the next codepoint to it. Never returns below 0.
+;   In: eax = string  Out: eax = count  Clobb: ecx, flags
+;=============================================================
+count_utf8_gramphene:
+        push    ebx
+        push    edx
+        xor     ebx, ebx                ; ebx = graphemes counted so far
+        xor     edx, edx                ; edx != 0: glue next codepoint (ZWJ)
+.next_byte:
+        mov     cl, byte [eax]
+        test    cl, cl                  ; terminating zero?
+        jz      .done
+        cmp     cl, 0xCC
+        je      .mark
+        cmp     cl, 0xCD
+        je      .mark
+        cmp     cl, 0xE2                ; ZWJ candidate
+        jne     .plain
+        cmp     byte [eax+1], 0x80      ; a terminator here fails the compare,
+        jne     .plain                  ; so [eax+2] is never read past the end
+        cmp     byte [eax+2], 0x8D
+        jne     .plain
+        mov     edx, ebx                ; glue only if a grapheme precedes
+        jmp     .extend                 ; 80 8D are skipped as continuations
+.plain:
+        and     cl, 0xC0
+        cmp     cl, 0x80                ; 10xxxxxx = continuation byte
+        je      .advance
+        test    edx, edx
+        jz      .count
+        xor     edx, edx                ; glued by the preceding ZWJ: no count
+        jmp     .advance
+.mark:
+        xor     edx, edx                ; a mark after ZWJ cancels the glue
+.extend:
+        test    ebx, ebx                ; nothing to attach to?
+        jnz     .advance
+.count:                                 ; (leading mark/ZWJ counts as one)
+        inc     ebx
+.advance:
+        inc     eax
+        jmp     .next_byte
+.done:
+        mov     eax, ebx
+        pop     edx
+        pop     ebx
+        ret
+