; kolibrios/programs/develop/libraries/libunicode/libunicode.asm

; libunicode -- KolibriOS Unicode Library
;
; Copyright (C) <2026> KolibriOS.org Team
; Author:
;           1. Swarnadeep Paul <swarnadeep@mail.com>
;
; This program is free software: you can redistribute it and/or modify it under
; the terms of the GNU General Public License as published by the Free Software
; Foundation, either version 2 of the License, or (at your option) any later
; version.
;
; This program is distributed in the hope that it will be useful, but WITHOUT
; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
; FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License along with
; this program. If not, see <http://www.gnu.org/licenses/>.

format MS COFF

public @EXPORT as 'EXPORTS'

include "macros.inc"
include "proc32.inc"

section '.flat' code readable align 16


;=============================================================
;   count_utf8_codepoints
;   Counts the code points in a NUL-terminated UTF-8 string.
;   Every byte that is not a continuation byte (10xxxxxx)
;   starts a code point, so only those bytes are counted.
;   The input is not validated: stray continuation bytes are
;   ignored and a truncated sequence still counts as one.
;
;   Parameters:
;       eax = pointer to the memory address of the null
;             terminated string
;   Return values:
;       eax = total codepoints
;   Clobbers:
;       ecx, flags (ebx is saved and restored by the proc)
;   -------PRIVATE--------
;   ebx <- counter of codepoints
;   cl  <- current byte
;
;=============================================================
proc count_utf8_codepoints uses ebx
        xor     ebx, ebx                ; no code points seen yet

.read_loop:
        mov     cl, byte [eax]

        test    cl, cl                  ; NUL terminator ends the scan
        jz      .done

        and     cl, 0xC0                ; keep only the two top bits
        cmp     cl, 0x80                ; 10xxxxxx = continuation byte
        je      .skip_count
        inc     ebx                     ; any other byte starts a code point

.skip_count:
        inc     eax
        jmp     .read_loop

.done:
        mov     eax, ebx
        ret
endp

;=============================================================
;   count_utf8_graphemes
;   Approximate count of the user-perceived characters
;   (graphemes) in a NUL-terminated UTF-8 string.
;   This is a heuristic, not full Unicode segmentation:
;     * combining diacritical marks U+0300..U+036F
;       (bytes CC 80 .. CD AF) attach to the previous
;       character and are not counted;
;     * ZERO WIDTH JOINER U+200D (bytes E2 80 8D) is not
;       counted and glues the next counted code point onto
;       the grapheme before it. A joiner with nothing before
;       it, or nothing after it, changes nothing.
;   The input is not validated.
;
;   Parameters:
;       eax = pointer to the memory address of the null
;             terminated string
;   Return values:
;       eax = total graphemes
;   Clobbers:
;       ecx, flags (ebx and edx are saved and restored)
;   -------PRIVATE--------
;   ebx <- counter of graphemes
;   edx <- non-zero while a joiner waits for its right side
;   cl  <- current byte
;
;=============================================================
proc count_utf8_graphemes uses ebx edx
        xor     ebx, ebx                ; no graphemes seen yet
        xor     edx, edx                ; no joiner pending

.read_loop:
        mov     cl, byte [eax]

        test    cl, cl                  ; NUL terminator ends the scan
        jz      .done

; Is this a combining accent (U+0300..U+036F)?
        cmp     cl, 0xCC                ; CC xx = U+0300..U+033F, all combining
        je      .skip_count

        cmp     cl, 0xCD
        jne     .check_joiner
        ; CD 80..AF = U+0340..U+036F (combining); CD B0..BF = U+0370..U+037F
        ; are ordinary Greek characters and must be counted.
        ; [eax+1] is safe to read: the byte at eax is not the terminator.
        cmp     byte [eax+1], 0xB0
        jb      .skip_count
        ; otherwise fall through: 0xCD is not 0xE2, so it is counted as a lead

; Is this a zero width joiner (E2 80 8D)?
.check_joiner:
        cmp     cl, 0xE2
        jne     .not_any_special

        cmp     byte [eax+1], 0x80
        jne     .not_any_special

        cmp     byte [eax+2], 0x8D
        jne     .not_any_special

        add     eax, 3                  ; step over the whole joiner
        test    ebx, ebx
        jz      .read_loop              ; nothing on the left to join to
        mov     edx, 1                  ; next counted code point is absorbed
        jmp     .read_loop

.not_any_special:
        and     cl, 0xC0                ; Is this a continuation byte?
        cmp     cl, 0x80
        je      .skip_count

        test    edx, edx                ; joined to the previous grapheme?
        jnz     .joined
        inc     ebx                     ; a new grapheme starts here
        jmp     .skip_count

.joined:
        xor     edx, edx                ; the joiner has been consumed

.skip_count:
        inc     eax
        jmp     .read_loop

.done:
        mov     eax, ebx
        ret
endp

; Export table of the library. The @EXPORT label is the symbol made public
; as 'EXPORTS' near the top of this file.
align 4
@EXPORT:

; Each pair below is <procedure label>, <exported name string>.
; NOTE(review): the table layout is produced by the `export` macro, presumably
; defined in macros.inc -- confirm there before changing the entry format.
export \
        count_utf8_codepoints, "utf8.count_codepoints", \
        count_utf8_graphemes, "utf8.count_graphemes"