forked from KolibriOS/kolibrios
123 lines
3.0 KiB
NASM
123 lines
3.0 KiB
NASM
; libunicode -- KolibriOS Unicode Library
|
|
;
|
|
; Copyright (C) <2026> KolibriOS.org Team
|
|
; Author:
|
|
; 1. Swarnadeep Paul <swarnadeep@mail.com>
|
|
;
|
|
; This program is free software: you can redistribute it and/or modify it under
|
|
; the terms of the GNU General Public License as published by the Free Software
|
|
; Foundation, either version 2 of the License, or (at your option) any later
|
|
; version.
|
|
;
|
|
; This program is distributed in the hope that it will be useful, but WITHOUT
|
|
; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
|
; FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
|
;
|
|
; You should have received a copy of the GNU General Public License along with
|
|
; this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
; Assemble this file as an MS COFF object file.
format MS COFF
|
|
|
|
; Publish the @EXPORT label (defined at the end of this file) under the
; public symbol name 'EXPORTS'.
public @EXPORT as 'EXPORTS'
|
|
|
|
; NOTE(review): presumably macros.inc supplies the `export` macro and
; proc32.inc the `proc`/`endp` macros used below -- confirm against the includes.
include "macros.inc"
|
|
include "proc32.inc"
|
|
|
|
; Everything below is placed in a readable code section named '.flat',
; aligned to 16 bytes.
section '.flat' code readable align 16
|
|
|
|
|
|
;=============================================================
; count_utf8_codepoints -- CodePoint Counting function
;
; Walks the string one byte at a time and counts every byte
; that is NOT a UTF-8 continuation byte (bit pattern 10xxxxxx),
; i.e. every byte that starts a code point. The input is not
; validated; malformed sequences are counted by the same rule.
;
; Parameters:
;       eax = pointer to the null terminated string
; Return values:
;       eax = total codepoints
; Registers:
;       ebx = running codepoint count (saved/restored via `uses`)
;       cl  = scratch, holds the byte being examined (destroyed)
;       flags are destroyed
;=============================================================
proc count_utf8_codepoints uses ebx
        xor     ebx, ebx                ; no codepoints seen yet

.read_loop:
        mov     cl, byte [eax]

        test    cl, cl                  ; terminating zero byte?
        jz      .done

        and     cl, 0xC0                ; keep the two top bits only
        cmp     cl, 0x80                ; 10xxxxxx = continuation byte,
        je      .skip_count             ; already covered by its lead byte
        inc     ebx

.skip_count:
        inc     eax                     ; advance to the next byte
        jmp     .read_loop

.done:
        mov     eax, ebx                ; return the count in eax
        ret
endp
|
|
|
|
;=============================================================
; count_utf8_graphemes -- approximate grapheme counting function
;
; Counts code points like count_utf8_codepoints, with two
; adjustments (this is a heuristic, not full UAX #29 segmentation):
;   * Combining Diacritical Marks U+0300..U+036F are not counted.
;     In UTF-8 these are 0xCC 0x80..0xBF and 0xCD 0x80..0xAF.
;     (0xCD 0xB0..0xBF is U+0370..U+037F, which IS counted.)
;   * ZERO WIDTH JOINER U+200D (0xE2 0x80 0x8D) is not counted and
;     glues the next counted code point onto the previous grapheme.
;     A joiner with nothing counted before it, or nothing after it,
;     has no effect, so the count can never go below zero.
;
; Parameters:
;       eax = pointer to the null terminated string
; Return values:
;       eax = total graphemes
; Registers:
;       ebx = running grapheme count      (saved/restored via `uses`)
;       edx = 1 while a joiner is pending (saved/restored via `uses`)
;       cl  = scratch, holds the byte being examined (destroyed)
;       flags are destroyed
;=============================================================
proc count_utf8_graphemes uses ebx edx
        xor     ebx, ebx                ; no graphemes seen yet
        xor     edx, edx                ; no joiner pending

.read_loop:
        mov     cl, byte [eax]

        test    cl, cl                  ; terminating zero byte?
        jz      .done

        ; --- combining accent? -------------------------------
        cmp     cl, 0xCC                ; 0xCC xx = U+0300..U+033F, all combining
        je      .skip_count

        cmp     cl, 0xCD                ; 0xCD xx = U+0340..U+037F
        jne     .check_joiner
        mov     cl, byte [eax+1]        ; safe: [eax] is non-zero, so [eax+1]
                                        ; is at worst the terminator
        sub     cl, 0x80                ; map 0x80..0xAF onto 0x00..0x2F
        cmp     cl, 0x2F
        jbe     .skip_count             ; U+0340..U+036F: combining, not counted
        jmp     .base_codepoint         ; U+0370..U+037F (or malformed): counted

        ; --- zero width joiner (0xE2 0x80 0x8D)? --------------
.check_joiner:
        cmp     cl, 0xE2
        jne     .not_any_special

        cmp     byte [eax+1], 0x80      ; a mismatch (including the terminator)
        jne     .not_any_special        ; stops before [eax+2] is touched

        cmp     byte [eax+2], 0x8D
        jne     .not_any_special

        mov     edx, 1                  ; remember the joiner for the next code point
        add     eax, 3                  ; step over the whole joiner sequence
        jmp     .read_loop

.not_any_special:
        and     cl, 0xC0                ; Is this a continuation byte (10xxxxxx)?
        cmp     cl, 0x80
        je      .skip_count

.base_codepoint:
        test    edx, edx                ; joiner pending?
        jz      .count
        xor     edx, edx                ; the joiner is consumed either way
        test    ebx, ebx
        jnz     .skip_count             ; merged into the previous grapheme
                                        ; (nothing before the joiner: count normally)
.count:
        inc     ebx

.skip_count:
        inc     eax                     ; advance to the next byte
        jmp     .read_loop

.done:
        mov     eax, ebx                ; return the count in eax
        ret
endp
|
|
|
|
; Export table, reachable through the @EXPORT label.
; Each entry pairs a procedure label with the name string it is exported under.
align 4
@EXPORT:
export  count_utf8_codepoints, "utf8.count_codepoints", \
        count_utf8_graphemes,  "utf8.count_graphemes"
|
|
|