Files
kolibrios/programs/develop/libraries/libunicode/libunicode.asm
Swarnadeep Paul 05ecfe005c Add robust UTF-8 character validation and length check
Implemented the `is_valid_utf8_char` procedure to safely validate UTF-8
sequences and return their byte length (1-4, or 0 if invalid).

This routine implements strict Unicode compliance checks, including:
- Rejection of overlong encodings (e.g., checking 0xC0/0xC1, and strict
bounds for 0xE0/0xF0).
- Prevention of surrogate half decoding (restricting 0xED bounds).
- Enforcement of the maximum Unicode scalar value limit (U+10FFFF).
- Safe handling of null-terminators and truncated sequences.

This provides a secure foundation for upgrading the codepoint and
grapheme counting functions in upcoming commits.
2026-03-21 12:37:43 +05:30

311 lines
7.6 KiB
NASM
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
; libunicode -- KolibriOS Unicode Library
;
; Copyright (C) <2026> KolibriOS.org Team
; Author:
; 1. Swarnadeep Paul <swarnadeep@mail.com>
;
; This program is free software: you can redistribute it and/or modify it under
; the terms of the GNU General Public License as published by the Free Software
; Foundation, either version 2 of the License, or (at your option) any later
; version.
;
; This program is distributed in the hope that it will be useful, but WITHOUT
; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
; FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License along with
; this program. If not, see <http://www.gnu.org/licenses/>.
format MS COFF
public @EXPORT as 'EXPORTS'
include "macros.inc"
include "proc32.inc"
section '.flat' code readable align 16
;=============================================================
; Valid UTF-8 character checking routine
; Parameters:
; _str_ptr = pointer to the first byte of the character to check;
; it must point into a null-terminated string
; Return values:
; eax = 1,2,3,4 <- if Valid
; 0 <- if not Valid
; Examples:
; invoke is_valid_utf8_char, my_string
; test eax, eax ; Check if valid
; jz .handle_error ; error handling by user
; add esi, eax ; Advance the pointer by the length!
;=============================================================
; PRIVATE:
; eax = pointer to str
; ebx = internal variable
; ecx = load the bytes for testing
;=============================================================
proc is_valid_utf8_char uses ebx, _str_ptr
; Classifies the UTF-8 sequence that starts at [_str_ptr].
;   eax = 1..4  length in bytes of one well-formed sequence
;   eax = 0     anything else (a leading 0x00 byte included)
; Register roles: eax = pointer into the string, ecx = lead byte,
; ebx = trailing byte under test (saved/restored through `uses`).
; A trailing byte is loaded only after every byte in front of it has been
; accepted as non-zero, so no load goes past the string's null terminator.
        mov     eax, [_str_ptr]
        movzx   ecx, byte [eax]         ; ecx = lead byte, zero-extended

        ; ---- lead bytes that need no trailing byte ----
        cmp     ecx, 0x80
        jae     .multi_byte
        test    ecx, ecx
        jz      .reject                 ; 0x00 is the terminator, not a character
        mov     eax, 1                  ; 0x01..0x7F: plain ASCII
        ret

  .multi_byte:
        cmp     ecx, 0xC2
        jb      .reject                 ; 0x80..0xC1: stray continuation / overlong lead
        cmp     ecx, 0xF4
        ja      .reject                 ; leads above 0xF4 would exceed U+10FFFF

        ; ---- second byte: allowed range depends on the lead ----
        movzx   ebx, byte [eax+1]
        cmp     ecx, 0xE0
        je      .second_after_E0
        cmp     ecx, 0xED
        je      .second_after_ED
        cmp     ecx, 0xF0
        je      .second_after_F0
        cmp     ecx, 0xF4
        je      .second_after_F4

        ; Range test idiom: (byte - low) compared unsigned against (high - low).
        ; Bytes below `low` (0x00 included) wrap to a huge value and fail too.
        sub     ebx, 0x80               ; general case: 0x80..0xBF
        cmp     ebx, 0x3F
        ja      .reject
        jmp     .second_ok

  .second_after_E0:                     ; 0xA0..0xBF, lower values are overlong
        sub     ebx, 0xA0
        cmp     ebx, 0x1F
        ja      .reject
        jmp     .second_ok

  .second_after_ED:                     ; 0x80..0x9F, higher values are surrogates
        sub     ebx, 0x80
        cmp     ebx, 0x1F
        ja      .reject
        jmp     .second_ok

  .second_after_F0:                     ; 0x90..0xBF, lower values are overlong
        sub     ebx, 0x90
        cmp     ebx, 0x2F
        ja      .reject
        jmp     .second_ok

  .second_after_F4:                     ; 0x80..0x8F, higher values exceed U+10FFFF
        sub     ebx, 0x80
        cmp     ebx, 0x0F
        ja      .reject

  .second_ok:
        cmp     ecx, 0xE0
        jb      .accept_2               ; leads 0xC2..0xDF end after two bytes

        ; ---- third byte: plain continuation 0x80..0xBF ----
        movzx   ebx, byte [eax+2]
        sub     ebx, 0x80
        cmp     ebx, 0x3F
        ja      .reject
        cmp     ecx, 0xF0
        jb      .accept_3               ; leads 0xE0..0xEF end after three bytes

        ; ---- fourth byte: plain continuation 0x80..0xBF ----
        movzx   ebx, byte [eax+3]
        sub     ebx, 0x80
        cmp     ebx, 0x3F
        ja      .reject
        mov     eax, 4
        ret

  .accept_3:
        mov     eax, 3
        ret

  .accept_2:
        mov     eax, 2
        ret

  .reject:
        xor     eax, eax                ; 0 = not a valid sequence
        ret
endp
;=============================================================
; CodePoint Counting function
; Parameters:
; _str_ptr = pointer to the memory address of the null
; terminated string
; Return values:
; eax = total codepoints
;=============================================================
proc count_utf8_codepoints uses ebx, _str_ptr
; Walks the null-terminated string at [_str_ptr] one byte at a time and
; returns in eax the number of bytes that are NOT continuation bytes
; (continuation bytes have the bit pattern 10xxxxxx). No validation is done.
; Register roles: ebx = read cursor (saved/restored through `uses`),
; eax = running count, cl = byte under test.
        mov     ebx, [_str_ptr]
        xor     eax, eax                ; count = 0
        jmp     .fetch

  .lead_byte:
        inc     eax                     ; byte starts a new code point
  .advance:
        inc     ebx
  .fetch:
        mov     cl, byte [ebx]
        test    cl, cl
        jz      .finished               ; reached the 0 terminator
        and     cl, 0xC0                ; keep only the two top bits
        cmp     cl, 0x80
        jne     .lead_byte              ; anything except 10xxxxxx is counted
        jmp     .advance

  .finished:
        ret                             ; eax already holds the total
endp
;=============================================================
; Grapheme Counting function
; Parameters:
; _str_ptr = pointer to the memory address of the null
; terminated string
; Return values:
; eax = total grapheme count
;=============================================================
proc count_utf8_graphemes uses ebx edx, _str_ptr
; Approximate grapheme count for the null-terminated UTF-8 string at
; [_str_ptr]; the total is returned in eax. No validation is done.
;
; Heuristic applied per code point (detected by its lead byte):
;   * U+0300..U+036F (combining diacritical marks, bytes CC 80 .. CD AF)
;     attach to the previous grapheme and are not counted.
;   * U+200D ZERO WIDTH JOINER (bytes E2 80 8D) is not counted and glues the
;     next counted code point onto the previous grapheme.
;   * every other non-continuation byte starts a new grapheme.
;
; Behaviour fixed relative to the earlier revision:
;   * a ZWJ used to decrement the running total, so a string starting with
;     a ZWJ returned 0xFFFFFFFF and a trailing ZWJ removed a real grapheme;
;     a "join pending" flag is used instead, so the total never decreases.
;   * lead byte 0xCD followed by 0xB0..0xBF is U+0370..U+037F (Greek and
;     Coptic), which are not combining marks; they are now counted.
;
; Register roles: eax = read cursor, ebx = running count, cl = byte under
; test, edx = 1 while a ZWJ join is pending. ebx and edx are saved/restored
; through `uses`, so the caller-visible clobbers stay eax, ecx and flags.
        mov     eax, [_str_ptr]
        xor     ebx, ebx                ; count = 0
        xor     edx, edx                ; no join pending

  .read_loop:
        mov     cl, byte [eax]
        test    cl, cl
        jz      .done                   ; reached the 0 terminator

        ; ---- combining diacritical marks ----
        cmp     cl, 0xCC                ; CC 80..BF = U+0300..U+033F
        je      .skip_count
        cmp     cl, 0xCD
        jne     .check_zwj
        ; [eax] is non-zero here, so [eax+1] is still inside the string
        ; (at worst it is the terminator).
        cmp     byte [eax+1], 0xAF      ; CD 80..AF = U+0340..U+036F
        jbe     .skip_count
        jmp     .not_any_special        ; CD B0..BF = U+0370..U+037F: count it

        ; ---- zero width joiner, E2 80 8D ----
  .check_zwj:
        cmp     cl, 0xE2
        jne     .not_any_special
        cmp     byte [eax+1], 0x80      ; each byte is read only after the
        jne     .not_any_special        ; previous one matched a non-zero value
        cmp     byte [eax+2], 0x8D
        jne     .not_any_special
        add     eax, 3                  ; step over the whole joiner
        test    ebx, ebx
        jz      .read_loop              ; nothing counted yet: nothing to join onto
        mov     edx, 1                  ; next counted code point joins the previous
        jmp     .read_loop

  .not_any_special:
        and     cl, 0xC0                ; continuation bytes are 10xxxxxx
        cmp     cl, 0x80
        je      .skip_count
        test    edx, edx
        jnz     .joined
        inc     ebx                     ; lead byte of a new grapheme
        jmp     .skip_count
  .joined:
        xor     edx, edx                ; join consumed; this code point is not counted

  .skip_count:
        inc     eax
        jmp     .read_loop

  .done:
        mov     eax, ebx
        ret
endp
; Export table. The label @EXPORT is declared public under the name
; 'EXPORTS' near the top of this file; each pair below associates a
; procedure defined in this file with the string name it is exported under.
; NOTE(review): the `export` macro is not defined in this file - presumably
; it comes from macros.inc; confirm there the exact table layout it emits.
align 4
@EXPORT:
export \
count_utf8_codepoints, "utf8.count_codepoints", \
count_utf8_graphemes, "utf8.count_graphemes", \
is_valid_utf8_char, "utf8.is_valid_char"