Files
kolibrios/programs/develop/libraries/libunicode/libunicode.asm
Swarnadeep Paul 6de88d5fd0
Some checks are pending
Build system / Check kernel codestyle (pull_request) Blocked by required conditions
Build system / Build (pull_request) Blocked by required conditions
Add initial libunicode for UTF-8 parsing and example
- Added libunicode.asm to parse UTF-8 strings.
- Implemented count_utf8_codepoints to skip continuation bytes.
- Implemented count_utf8_graphemes to handle ZWJ (E2 80 8D) and combining marks (CC/CD).
- Added console.asm to the examples folder to test and print the results.
- Submitted for GSoC qualification task.
2026-03-09 13:08:25 +05:30

82 lines
1.9 KiB
NASM

;=============================================================
; count_utf8_codepoints
;
; Count the code points of a NUL-terminated UTF-8 string.
; Every byte that is NOT a continuation byte (bit pattern
; 10xxxxxx) starts a new code point, so counting those bytes
; gives the number of code points.  The input is not validated.
;
; In:    eax = pointer to the NUL-terminated string
; Out:   eax = number of code points
; Clobb: ebx, cl, flags  (caller must save them if still needed)
; -------PRIVATE--------
; ebx <- counter of code points
; cl  <- current byte
;
; All labels inside the routine are local (.name) so that they
; cannot clash with labels of the program including this file.
;=============================================================
count_utf8_codepoints:
        xor     ebx, ebx                ; no code points seen yet
.next_byte:
        mov     cl, byte [eax]          ; fetch the current byte ...
        inc     eax                     ; ... and step to the following one
        test    cl, cl                  ; terminating NUL reached?
        jz      .finished
        and     cl, 0xC0                ; keep only the two top bits
        cmp     cl, 0x80                ; 10xxxxxx = continuation byte
        je      .next_byte              ; belongs to a code point already counted
        inc     ebx                     ; any other byte starts a code point
        jmp     .next_byte
.finished:
        mov     eax, ebx                ; hand the result back in eax
        ret
;=============================================================
; count_utf8_gramphene  (alias: count_utf8_graphemes)
;
; Approximate the number of user-perceived characters (grapheme
; clusters) of a NUL-terminated UTF-8 string.  Only a small
; subset of the Unicode segmentation rules is applied:
;   * continuation bytes (10xxxxxx) never start a cluster;
;   * combining diacritical marks U+0300..U+036F
;     (CC 80..BF and CD 80..AF) extend the previous cluster;
;   * ZERO WIDTH JOINER U+200D (E2 80 8D) glues the next code
;     point onto the previous cluster.
; Zero-width code points at the very start of the string have
; nothing to attach to and are simply not counted.
; The input is not validated.
;
; In:    eax = pointer to the NUL-terminated string
; Out:   eax = number of grapheme clusters (never negative)
; Clobb: ebx, cl, ch, flags  (caller must save them if needed)
; -------PRIVATE--------
; ebx <- counter of graphemes
; cl  <- current byte
; ch  <- 1 while a ZWJ waits for the code point it joins, else 0
;
; All labels inside the routine are local (.name) so that they
; cannot clash with labels of the program including this file.
;=============================================================
count_utf8_gramphene:                   ; historical (misspelled) name, kept for callers
count_utf8_graphemes:                   ; correctly spelled alias, same entry point
        xor     ebx, ebx                ; no clusters seen yet
        xor     ch, ch                  ; no joiner pending
.next_byte:
        mov     cl, byte [eax]
        test    cl, cl                  ; terminating NUL reached?
        jz      .finished

        ; --- combining diacritical marks -------------------------
        cmp     cl, 0xCC                ; CC xx = U+0300..U+033F, all combining
        je      .advance
        cmp     cl, 0xCD
        jne     .check_zwj
        ; [eax] is non-zero here, so [eax+1] is at worst the NUL.
        cmp     byte [eax+1], 0xB0      ; CD 80..AF = U+0340..U+036F, combining
        jb      .advance
        jmp     .classify               ; CD B0..BF = U+0370..U+037F, Greek letters

        ; --- zero width joiner (E2 80 8D) ------------------------
.check_zwj:
        cmp     cl, 0xE2
        jne     .classify
        cmp     byte [eax+1], 0x80      ; compared byte by byte, so a short
        jne     .classify               ; string is never read past its NUL
        cmp     byte [eax+2], 0x8D
        jne     .classify
        add     eax, 3                  ; step over the whole joiner
        test    ebx, ebx
        jz      .next_byte              ; leading ZWJ: nothing to join onto
        mov     ch, 1                   ; next code point merges into the last cluster
        jmp     .next_byte

        ; --- ordinary bytes --------------------------------------
.classify:
        and     cl, 0xC0                ; keep only the two top bits
        cmp     cl, 0x80                ; 10xxxxxx = continuation byte
        je      .advance
        test    ch, ch                  ; is this code point joined by a ZWJ?
        jnz     .joined
        inc     ebx                     ; no: it opens a new cluster
        jmp     .advance
.joined:
        xor     ch, ch                  ; yes: consumed by the previous cluster
.advance:
        inc     eax
        jmp     .next_byte

.finished:
        mov     eax, ebx                ; hand the result back in eax
        ret