; diff --git a/programs/develop/libraries/libunicode/examples/console.asm b/programs/develop/libraries/libunicode/examples/console.asm
; new file mode 100644
; index 000000000..122db49e3
; --- /dev/null
; +++ b/programs/develop/libraries/libunicode/examples/console.asm
; @@ -0,0 +1,70 @@
;
; Example program for libunicode: opens a console window, prints a
; greeting, then prints the code point count and the grapheme count that
; libunicode reports for test_combo.

format binary as ""
use32
org 0x0

; ---- executable header ------------------------------------------------
        db      'MENUET01'
        dd      0x01, START, I_END
        dd      0x100000                ; 1MB Memory
        dd      0x100000                ; Stack pointer
        dd      0x0, 0x0

include '../../../../proc32.inc'
include '../../../../macros.inc'
include '../../../../dll.inc'
include '../libunicode.asm'

; ---- entry point ------------------------------------------------------
START:
        stdcall dll.Load, import_table
        test    eax, eax
        jnz     EXIT                    ; eax != 0: skip the demo and leave
                                        ; (presumably dll.Load signals failure
                                        ;  with non-zero eax - confirm in dll.inc)

        ; Arguments are pushed right-to-left, so window_title goes first,
        ; followed by the four -1 values, exactly as a manual push sequence.
        invoke  con_init, -1, -1, -1, -1, window_title
        invoke  con_write_asciiz, my_text

        ; Code points in test_combo.
        mov     eax, test_combo
        call    count_utf8_codepoints
        cinvoke con_printf, fmt_codepoints, eax ; caller pops the 8 argument bytes

        ; Graphemes in test_combo.
        mov     eax, test_combo
        call    count_utf8_gramphene
        cinvoke con_printf, fmt_graphemes, eax  ; caller pops the 8 argument bytes

        invoke  con_exit, 0

EXIT:
        mcall   -1                      ; Exit cleanly

; ---- data -------------------------------------------------------------

window_title    db 'Debug Console', 0
my_text         db 'Console loaded successfully!', 10, 0
fmt_codepoints  db "Total Codepoints: %d", 10, 0
fmt_graphemes   db "Total Graphemes: %d", 10, 0

; Extra sample strings; nothing in this program reads them.
unitxt          db 'AП👨‍👩‍👦qwerty', 0
test_tech       db 'cafe', 0xCC, 0x81, 0

; String passed to both counters, spelled out as raw UTF-8 bytes.
test_combo      db 'A'                          ; U+0041
                db 0xD0, 0x9F                   ; U+041F
                db 0xF0, 0x9F, 0x91, 0xA9       ; U+1F469
                db 0xE2, 0x80, 0x8D             ; U+200D zero width joiner
                db 0xF0, 0x9F, 0x92, 0xBB       ; U+1F4BB
                db 'e', 0xCC, 0x81              ; U+0065 followed by U+0301
                db 0                            ; terminator

; ---- imports ----------------------------------------------------------
align 4
import_table:
library console, '/sys/lib/console.obj'
import  console, \
        con_init,         'con_init', \
        con_write_asciiz, 'con_write_asciiz', \
        con_exit,         'con_exit', \
        con_printf,       'con_printf'

I_END:

; diff --git a/programs/develop/libraries/libunicode/libunicode.asm b/programs/develop/libraries/libunicode/libunicode.asm
; new file mode 100644
; index 000000000..94ef07b76
; --- /dev/null
; +++ b/programs/develop/libraries/libunicode/libunicode.asm
; @@ -0,0 +1,81 @@
;=============================================================
; count_utf8_codepoints
;   Counts the code points of a zero-terminated UTF-8 string: every byte
;   that is not a continuation byte (10xxxxxx) starts one code point.
;   The input is not validated.
;
; In:    eax = pointer to the zero-terminated string
; Out:   eax = number of code points
; Clobb: ebx (running count), ecx (cl = current byte), flags
;        * Do not keep live values in ebx/ecx across the call.
;=============================================================
count_utf8_codepoints:
        xor     ebx, ebx                ; count = 0

.next_byte:
        mov     cl, byte [eax]
        test    cl, cl                  ; terminating zero?
        jz      .done

        and     cl, 0xC0                ; keep only the two top bits
        cmp     cl, 0x80                ; 10xxxxxx = continuation byte
        je      .advance
        inc     ebx                     ; lead byte or ASCII: new code point

.advance:
        inc     eax
        jmp     .next_byte

.done:
        mov     eax, ebx
        ret

;=============================================================
; count_utf8_gramphene   (sic - the label keeps its original spelling)
;   Approximate grapheme count of a zero-terminated UTF-8 string.
;   Only the following rules are applied; everything else counts as one
;   grapheme per code point:
;     * U+0300..U+036F (combining diacritical marks: lead byte 0xCC, or
;       0xCD with a second byte below 0xB0) attach to the grapheme
;       before them. A mark with nothing before it counts as one
;       grapheme of its own.
;     * U+200D (zero width joiner, bytes E2 80 8D) merges the next base
;       code point into the grapheme before it. A joiner with nothing
;       before it, a trailing joiner and repeated joiners never lower
;       the count.
;   The input is not validated.
;
; In:    eax = pointer to the zero-terminated string
; Out:   eax = number of graphemes (never negative)
; Clobb: ebx (running count),
;        ecx (cl = current byte, ch = 1 while a joiner is pending), flags
;        * Do not keep live values in ebx/ecx across the call.
;=============================================================
count_utf8_gramphene:
        xor     ebx, ebx                ; count = 0
        xor     ecx, ecx                ; ch = 0: no joiner pending

.next_byte:
        mov     cl, byte [eax]          ; writes cl only, ch keeps the flag
        test    cl, cl                  ; terminating zero?
        jz      .done

        ; --- combining diacritical mark? ---
        cmp     cl, 0xCC                ; 0xCC xx = U+0300..U+033F
        je      .combining
        cmp     cl, 0xCD                ; 0xCD xx = U+0340..U+037F
        jne     .check_zwj
        cmp     byte [eax+1], 0xB0      ; 0xCD 0xB0.. = U+0370.. is a base
        jae     .base                   ; (unsigned compare on the raw byte)

.combining:
        test    ebx, ebx                ; is there a grapheme to attach to?
        jnz     .advance                ; yes: the mark adds nothing
        inc     ebx                     ; no: a leading mark stands alone
        jmp     .advance

        ; --- zero width joiner (E2 80 8D)? ---
        ; Reading [eax+1] / [eax+2] stays inside the string: each is only
        ; read after the byte before it was found to be non-zero.
.check_zwj:
        cmp     cl, 0xE2
        jne     .base
        cmp     byte [eax+1], 0x80
        jne     .base
        cmp     byte [eax+2], 0x8D
        jne     .base

        add     eax, 3                  ; step over the whole joiner
        test    ebx, ebx                ; nothing before it to join onto?
        jz      .next_byte              ; then the joiner is simply ignored
        mov     ch, 1                   ; next base joins the current grapheme
        jmp     .next_byte

        ; --- ordinary byte ---
.base:
        and     cl, 0xC0
        cmp     cl, 0x80                ; continuation byte of a sequence
        je      .advance                ; already handled by its lead byte
        test    ch, ch                  ; joiner pending?
        jnz     .joined
        inc     ebx                     ; start of a new grapheme
        jmp     .advance

.joined:
        xor     ch, ch                  ; the joiner is consumed, no new count

.advance:
        inc     eax
        jmp     .next_byte

.done:
        mov     eax, ebx
        ret