; libunicode -- KolibriOS Unicode Library
;
; Copyright (C) <2026> KolibriOS.org Team
; Author:
;       1. Swarnadeep Paul
;
; This program is free software: you can redistribute it and/or modify it under
; the terms of the GNU General Public License as published by the Free Software
; Foundation, either version 2 of the License, or (at your option) any later
; version.
;
; This program is distributed in the hope that it will be useful, but WITHOUT
; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
; FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License along with
; this program. If not, see <https://www.gnu.org/licenses/>.

; Syntax: flat assembler (Intel operand order), 32-bit x86.
; All procedures are built with the proc32.inc `proc` macro, so arguments are
; passed on the stack and `ret` expands to the macro-generated epilogue.

format MS COFF

public @EXPORT as 'EXPORTS'

include "macros.inc"
include "proc32.inc"

; A UTF-8 continuation byte has the bit pattern 10xxxxxx.
UTF8_CONT_MASK = 0xC0                   ; selects the two tag bits
UTF8_CONT_TAG  = 0x80                   ; value of the tag bits for a continuation

section '.flat' code readable align 16

;=============================================================
; Valid UTF-8 character checking routine
; Parameters:
;       _str_ptr = pointer to the start of the character
;                  should be part of null terminated string
; Return values:
;       eax = 1,2,3,4 <- if Valid (length of the sequence in bytes)
;             0       <- if not Valid (also returned for the
;                        terminating 0 byte)
; Clobbers:
;       ecx, flags (ebx is saved and restored)
; Examples:
;       invoke  is_valid_utf8_char, my_string
;       test    eax, eax        ; Check if valid
;       jz      .handle_error   ; error handling by user
;       add     esi, eax        ; Advance the pointer by the length!
;=============================================================
; PRIVATE:
;       eax = pointer to str
;       ebx = trailing byte currently being tested
;       ecx = lead byte (cl)
;
; A 0x00 trailing byte fails every range / continuation test
; below, and each trailing byte is validated before the next
; one is loaded, so the routine never reads past the string
; terminator.
;=============================================================
proc is_valid_utf8_char uses ebx, _str_ptr
        mov     eax, [_str_ptr]
        movzx   ecx, byte [eax]         ; ecx = lead byte

        test    ecx, ecx
        jz      .is_invalid             ; the terminator is not a character

        cmp     cl, 0x80
        jb      .is_valid_1_byte        ; 0x01..0x7F: plain ASCII

        cmp     cl, 0xC2
        jb      .is_invalid             ; 0x80..0xBF stray continuation,
                                        ; 0xC0/0xC1 overlong encodings

        cmp     cl, 0xE0                ; 0xE0 starts the 3-byte range
        jb      .check_valid_2_bytes    ; 0xC2..0xDF

        cmp     cl, 0xF0                ; 0xF0 starts the 4-byte range
        jb      .check_valid_3_bytes    ; 0xE0..0xEF

        cmp     cl, 0xF4                ; 0xF4 is the last lead byte that
        ja      .is_invalid             ; can stay within U+10FFFF

        ; 1-, 2- and 3-byte leads are dispatched above, so only the
        ; 4-byte leads 0xF0..0xF4 reach this point.
.check_valid_4_bytes:
        movzx   ebx, byte [eax+1]       ; ebx = second byte

        ; lead 0xF0: second byte must be 0x90..0xBF (no overlong forms)
        cmp     cl, 0xF0
        jne     .check_f4
        cmp     ebx, 0x90
        jb      .is_invalid
        cmp     ebx, 0xBF
        ja      .is_invalid
        jmp     .check_bytes_3_4

        ; lead 0xF4: second byte must be 0x80..0x8F (stay <= U+10FFFF)
.check_f4:
        cmp     cl, 0xF4
        jne     .check_general_f1_f3
        cmp     ebx, 0x80
        jb      .is_invalid
        cmp     ebx, 0x8F
        ja      .is_invalid
        jmp     .check_bytes_3_4

        ; leads 0xF1..0xF3: any continuation byte is acceptable
.check_general_f1_f3:
        and     ebx, UTF8_CONT_MASK
        cmp     ebx, UTF8_CONT_TAG
        jne     .is_invalid

.check_bytes_3_4:
        movzx   ebx, byte [eax+2]       ; third byte
        and     ebx, UTF8_CONT_MASK
        cmp     ebx, UTF8_CONT_TAG
        jne     .is_invalid

        movzx   ebx, byte [eax+3]       ; fourth byte
        and     ebx, UTF8_CONT_MASK
        cmp     ebx, UTF8_CONT_TAG
        jne     .is_invalid

        mov     eax, 4                  ; valid 4-byte sequence
        ret

        ; ---- 2-byte sequence: lead 0xC2..0xDF ----
.check_valid_2_bytes:
        movzx   ebx, byte [eax+1]       ; second byte
        and     ebx, UTF8_CONT_MASK
        cmp     ebx, UTF8_CONT_TAG
        jne     .is_invalid

        mov     eax, 2                  ; valid 2-byte sequence
        ret

        ; ---- 3-byte sequence: lead 0xE0..0xEF ----
.check_valid_3_bytes:
        movzx   ebx, byte [eax+1]       ; ebx = second byte

        ; lead 0xE0: second byte must be 0xA0..0xBF (no overlong forms)
        cmp     cl, 0xE0
        jne     .check_ed
        cmp     ebx, 0xA0
        jb      .is_invalid
        cmp     ebx, 0xBF
        ja      .is_invalid
        jmp     .check_byte_3

        ; lead 0xED: second byte must be 0x80..0x9F (excludes the
        ; surrogate range U+D800..U+DFFF)
.check_ed:
        cmp     cl, 0xED
        jne     .check_general_exx
        cmp     ebx, 0x80
        jb      .is_invalid
        cmp     ebx, 0x9F
        ja      .is_invalid
        jmp     .check_byte_3

        ; remaining 3-byte leads: any continuation byte is acceptable
.check_general_exx:
        and     ebx, UTF8_CONT_MASK
        cmp     ebx, UTF8_CONT_TAG
        jne     .is_invalid

.check_byte_3:
        movzx   ebx, byte [eax+2]       ; third byte
        and     ebx, UTF8_CONT_MASK
        cmp     ebx, UTF8_CONT_TAG
        jne     .is_invalid

        mov     eax, 3                  ; valid 3-byte sequence
        ret

.is_valid_1_byte:
        mov     eax, 1                  ; valid single byte (ASCII)
        ret

.is_invalid:
        xor     eax, eax                ; 0 = not a valid UTF-8 character
        ret
endp

;=============================================================
; CodePoint Counting function
; Parameters:
;       _str_ptr = pointer to the memory address of the null
;                  terminated string
; Return values:
;       eax = total codepoints
; Clobbers:
;       cl, flags (ebx is saved and restored)
; Notes:
;       Every byte that is not a continuation byte (10xxxxxx)
;       is counted as the start of one code point. The string
;       is not validated; use is_valid_utf8_char for that.
;=============================================================
; PRIVATE:
;       eax = read pointer
;       ebx = running code point count
;       cl  = current byte
;=============================================================
proc count_utf8_codepoints uses ebx, _str_ptr
        mov     eax, [_str_ptr]
        xor     ebx, ebx                ; count = 0

.read_loop:
        mov     cl, byte [eax]
        test    cl, cl
        jz      .done                   ; reached the terminating 0 byte

        and     cl, UTF8_CONT_MASK
        cmp     cl, UTF8_CONT_TAG
        je      .skip_count             ; continuation bytes start nothing
        inc     ebx

.skip_count:
        inc     eax
        jmp     .read_loop

.done:
        mov     eax, ebx
        ret
endp

;=============================================================
; Grapheme Counting function
; Parameters:
;       _str_ptr = pointer to the memory address of the null
;                  terminated string
; Return values:
;       eax = total grapheme count
; Clobbers:
;       cl, flags (ebx and edx are saved and restored)
; Notes:
;       This is a heuristic, not full grapheme cluster
;       segmentation. Only two rules are applied on top of
;       code point counting:
;         * Combining Diacritical Marks U+0300..U+036F
;           (CC 80 .. CD AF) attach to the previous grapheme.
;         * ZERO WIDTH JOINER U+200D (E2 80 8D) glues the next
;           code point onto the previous grapheme. A joiner
;           with nothing before it, or nothing after it, has
;           no effect on the count.
;=============================================================
; PRIVATE:
;       eax = read pointer
;       ebx = running grapheme count
;       edx = 1 while a joiner is waiting for its right-hand
;             code point, otherwise 0
;       cl  = current byte
;=============================================================
proc count_utf8_graphemes uses ebx edx, _str_ptr
        mov     eax, [_str_ptr]
        xor     ebx, ebx                ; count = 0
        xor     edx, edx                ; no joiner pending

.read_loop:
        mov     cl, byte [eax]
        test    cl, cl
        jz      .done                   ; reached the terminating 0 byte

        ; Lead byte 0xCC: U+0300..U+033F, always a combining mark.
        cmp     cl, 0xCC
        je      .skip_count

        ; Lead byte 0xCD: U+0340..U+037F. Only second bytes below 0xB0
        ; (U+0340..U+036F) are combining marks; 0xB0..0xBF encodes
        ; U+0370..U+037F, which are ordinary Greek and Coptic characters.
        cmp     cl, 0xCD
        jne     .check_zwj
        cmp     byte [eax+1], 0xB0
        jb      .skip_count
        jmp     .not_any_special

        ; Zero width joiner: E2 80 8D. The compares short-circuit, so
        ; [eax+2] is only read when [eax+1] is a non-zero byte.
.check_zwj:
        cmp     cl, 0xE2
        jne     .not_any_special
        cmp     byte [eax+1], 0x80
        jne     .not_any_special
        cmp     byte [eax+2], 0x8D
        jne     .not_any_special

        add     eax, 3                  ; step over the joiner
        test    ebx, ebx
        jz      .read_loop              ; nothing on the left to join to
        mov     edx, 1                  ; swallow the next code point
        jmp     .read_loop

.not_any_special:
        and     cl, UTF8_CONT_MASK
        cmp     cl, UTF8_CONT_TAG
        je      .skip_count             ; continuation bytes start nothing

        test    edx, edx
        jnz     .joined                 ; joined to the previous grapheme
        inc     ebx
        jmp     .skip_count

.joined:
        xor     edx, edx                ; the joiner has been consumed

.skip_count:
        inc     eax
        jmp     .read_loop

.done:
        mov     eax, ebx
        ret
endp

align 4
@EXPORT:
export \
        count_utf8_codepoints,  "utf8.count_codepoints", \
        count_utf8_graphemes,   "utf8.count_graphemes", \
        is_valid_utf8_char,     "utf8.is_valid_char"