forked from KolibriOS/kolibrios
Implemented the `is_valid_utf8_char` procedure to safely validate UTF-8 sequences and return their byte length (1-4, or 0 if invalid). This routine implements strict Unicode compliance checks, including: (1) rejection of overlong encodings (lead bytes 0xC0/0xC1, and strict second-byte bounds after 0xE0 and 0xF0); (2) rejection of encoded surrogate halves (restricted second-byte bounds after 0xED); (3) enforcement of the maximum Unicode scalar value (U+10FFFF); and (4) safe handling of null terminators and truncated sequences. This provides a secure foundation for upgrading the code point and grapheme counting functions in upcoming commits.
311 lines
7.6 KiB
NASM
311 lines
7.6 KiB
NASM
; libunicode -- KolibriOS Unicode Library
|
||
;
|
||
; Copyright (C) <2026> KolibriOS.org Team
|
||
; Author:
|
||
; 1. Swarnadeep Paul <swarnadeep@mail.com>
|
||
;
|
||
; This program is free software: you can redistribute it and/or modify it under
|
||
; the terms of the GNU General Public License as published by the Free Software
|
||
; Foundation, either version 2 of the License, or (at your option) any later
|
||
; version.
|
||
;
|
||
; This program is distributed in the hope that it will be useful, but WITHOUT
|
||
; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||
; FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
||
;
|
||
; You should have received a copy of the GNU General Public License along with
|
||
; this program. If not, see <http://www.gnu.org/licenses/>.
|
||
|
||
format MS COFF
|
||
|
||
public @EXPORT as 'EXPORTS'
|
||
|
||
include "macros.inc"
|
||
include "proc32.inc"
|
||
|
||
section '.flat' code readable align 16
|
||
|
||
;=============================================================
; Valid UTF-8 character checking routine
;
; Validates the single UTF-8 encoded character that starts at
; _str_ptr and reports how many bytes it occupies.
;
; Parameters:
;       _str_ptr = pointer to the first byte of the character;
;                  should be part of a null terminated string
; Return values:
;       eax = 1, 2, 3 or 4  <- length in bytes, if valid
;             0             <- if not valid (a pointer to the
;                              terminating 0 byte is reported
;                              as not valid as well)
; Accepted sequences (first byte -> allowed second byte; every
; byte after the second must be 0x80-0xBF):
;       0x01-0x7F   1 byte
;       0xC2-0xDF   2 bytes, second 0x80-0xBF
;       0xE0        3 bytes, second 0xA0-0xBF (no overlong forms)
;       0xED        3 bytes, second 0x80-0x9F (no surrogates)
;       0xE1-0xEC,
;       0xEE-0xEF   3 bytes, second 0x80-0xBF
;       0xF0        4 bytes, second 0x90-0xBF (no overlong forms)
;       0xF1-0xF3   4 bytes, second 0x80-0xBF
;       0xF4        4 bytes, second 0x80-0x8F (max U+10FFFF)
; Examples:
;       invoke  is_valid_utf8_char, my_string
;       test    eax, eax        ; Check if valid
;       jz      .handle_error   ; error handling by user
;       add     esi, eax        ; Advance the pointer by the length!
;=============================================================
; PRIVATE:
;       eax = pointer to str
;       ebx = trailing byte currently being tested
;             (saved and restored through "uses")
;       ecx = first byte of the character, zero extended
;             (modified, not restored)
; A trailing byte is loaded only after every byte before it has
; passed its check, and a 0 byte fails every check, so nothing
; is read beyond the string terminator.
;=============================================================

proc is_valid_utf8_char uses ebx, _str_ptr

        mov     eax, [_str_ptr]
        movzx   ecx, byte [eax]         ; cl = first (lead) byte

        test    cl, cl
        jz      .reject                 ; terminating 0 byte
        cmp     cl, 0x80
        jb      .accept_1               ; 0x01-0x7F: plain ASCII
        cmp     cl, 0xC2
        jb      .reject                 ; 0x80-0xBF stray continuation,
                                        ; 0xC0/0xC1 overlong 2 byte forms
        cmp     cl, 0xF4
        ja      .reject                 ; would encode above U+10FFFF

        ; From here on the character has at least two bytes.
        movzx   ebx, byte [eax+1]       ; ebx = second byte

        ; Lead bytes with a narrowed second byte range.
        cmp     cl, 0xE0
        je      .second_a0_bf
        cmp     cl, 0xED
        je      .second_80_9f
        cmp     cl, 0xF0
        je      .second_90_bf
        cmp     cl, 0xF4
        je      .second_80_8f

        ; Every other lead byte takes any continuation byte.
        ; Range test idiom: after subtracting the lower bound a
        ; value below it wraps around to a huge unsigned number,
        ; so a single unsigned compare covers both bounds.
        sub     ebx, 0x80
        cmp     ebx, 0xBF - 0x80
        ja      .reject
        jmp     .second_ok

.second_a0_bf:                          ; lead 0xE0
        sub     ebx, 0xA0
        cmp     ebx, 0xBF - 0xA0
        ja      .reject
        jmp     .second_ok

.second_80_9f:                          ; lead 0xED
        sub     ebx, 0x80
        cmp     ebx, 0x9F - 0x80
        ja      .reject
        jmp     .second_ok

.second_90_bf:                          ; lead 0xF0
        sub     ebx, 0x90
        cmp     ebx, 0xBF - 0x90
        ja      .reject
        jmp     .second_ok

.second_80_8f:                          ; lead 0xF4
        sub     ebx, 0x80
        cmp     ebx, 0x8F - 0x80
        ja      .reject

.second_ok:
        cmp     cl, 0xE0
        jb      .accept_2               ; lead 0xC2-0xDF: done

        movzx   ebx, byte [eax+2]       ; third byte must be 10xxxxxx
        and     ebx, 0xC0
        cmp     ebx, 0x80
        jne     .reject

        cmp     cl, 0xF0
        jb      .accept_3               ; lead 0xE0-0xEF: done

        movzx   ebx, byte [eax+3]       ; fourth byte must be 10xxxxxx
        and     ebx, 0xC0
        cmp     ebx, 0x80
        jne     .reject

        mov     eax, 4                  ; valid 4 byte character
        ret

.accept_3:
        mov     eax, 3                  ; valid 3 byte character
        ret

.accept_2:
        mov     eax, 2                  ; valid 2 byte character
        ret

.accept_1:
        mov     eax, 1                  ; valid 1 byte character
        ret

.reject:
        xor     eax, eax                ; 0 = not valid
        ret
endp
|
||
|
||
;=============================================================
; CodePoint Counting function
;
; Walks the string one byte at a time and counts every byte
; that is not a UTF-8 continuation byte (10xxxxxx). The input
; is not validated.
;
; Parameters:
;       _str_ptr = pointer to the memory address of the null
;                  terminated string
; Return values:
;       eax = total codepoints
;=============================================================
; PRIVATE:
;       eax = running count
;       ebx = read cursor (saved and restored through "uses")
;       cl  = current byte (modified, not restored)
;=============================================================

proc count_utf8_codepoints uses ebx, _str_ptr

        mov     ebx, [_str_ptr]
        xor     eax, eax
        jmp     .fetch

.advance:
        inc     ebx

.fetch:
        mov     cl, byte [ebx]
        test    cl, cl
        jz      .finished               ; terminating 0 byte

        cmp     cl, 0x80
        jb      .count_it               ; 0x01-0x7F: ASCII
        cmp     cl, 0xC0
        jb      .advance                ; 0x80-0xBF: continuation byte

.count_it:
        inc     eax                     ; first byte of a codepoint
        jmp     .advance

.finished:
        ret
endp
|
||
|
||
;=============================================================
; Grapheme Counting function
;
; Byte pattern heuristic, not a full grapheme segmentation
; algorithm. The input is not validated. Rules applied while
; walking the string:
;   - continuation bytes (0x80-0xBF) are never counted;
;   - combining diacritical marks U+0300-U+036F (encoded as
;     0xCC 0x80 .. 0xCD 0xAF) are not counted, they belong to
;     the character before them;
;   - ZERO WIDTH JOINER U+200D (0xE2 0x80 0x8D) is not counted
;     and glues the character after it to the one before it,
;     so "X ZWJ Y" counts as one;
;   - every other lead byte starts a new grapheme.
; A ZWJ with nothing counted before it, or with nothing after
; it, joins nothing; the count can therefore never go below
; zero and a run of several ZWJs joins only once.
;
; Parameters:
;       _str_ptr = pointer to the memory address of the null
;                  terminated string
; Return values:
;       eax = total grapheme count
;=============================================================
; PRIVATE:
;       eax = read cursor
;       ebx = running count   (saved/restored through "uses")
;       edx = 1 while a ZWJ is waiting for the next character,
;             otherwise 0     (saved/restored through "uses")
;       cl  = current byte    (modified, not restored)
;=============================================================

proc count_utf8_graphemes uses ebx edx, _str_ptr

        mov     eax, [_str_ptr]
        xor     ebx, ebx
        xor     edx, edx

.read_loop:
        mov     cl, byte [eax]
        test    cl, cl                  ; terminating 0 byte
        jz      .done

        ; Combining diacritical marks, U+0300-U+036F.
        cmp     cl, 0xCC
        je      .skip_count             ; 0xCC xx = U+0300-U+033F
        cmp     cl, 0xCD
        jne     .check_zwj
        ; 0xCD 0xB0 and above is U+0370..U+037F, which lies outside
        ; the combining block and must be counted normally.
        ; [eax] is not 0 here, so [eax+1] is still inside the string.
        cmp     byte [eax+1], 0xAF
        jbe     .skip_count             ; 0xCD 0x80-0xAF = U+0340-U+036F
        jmp     .ordinary

.check_zwj:
        ; ZERO WIDTH JOINER is exactly 0xE2 0x80 0x8D. Each byte is
        ; compared only after the previous one matched, so the
        ; reads stop at the terminator of a truncated string.
        cmp     cl, 0xE2
        jne     .ordinary
        cmp     byte [eax+1], 0x80
        jne     .ordinary
        cmp     byte [eax+2], 0x8D
        jne     .ordinary

        add     eax, 3                  ; step over the joiner
        test    ebx, ebx
        jz      .read_loop              ; nothing counted yet to join to
        mov     edx, 1                  ; next character joins the previous
        jmp     .read_loop

.ordinary:
        and     cl, 0xC0                ; Is this a continuation byte
        cmp     cl, 0x80
        je      .skip_count

        test    edx, edx
        jnz     .joined
        inc     ebx                     ; start of a new grapheme
        jmp     .skip_count

.joined:
        xor     edx, edx                ; merged into the previous grapheme

.skip_count:
        inc     eax
        jmp     .read_loop

.done:
        mov     eax, ebx
        ret
endp
|
||
|
||
; Export table, published through the public symbol @EXPORT.
; Each entry pairs a procedure label with the name string it is
; exported under.
align 4
@EXPORT:

export \
        count_utf8_codepoints,  "utf8.count_codepoints", \
        count_utf8_graphemes,   "utf8.count_graphemes", \
        is_valid_utf8_char,     "utf8.is_valid_char"
|
||
|