pyTermTk/tests/timeit/05.wcwidth.bisearch.py

#!/usr/bin/env python3

# MIT License
#
# Copyright (c) 2021 Eugenio Parodi <ceccopierangiolieugenio AT googlemail DOT com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import sys, os

import timeit
import random
import unicodedata

import wcwidth
from functools import lru_cache

sys.path.append(os.path.join(sys.path[0],'../..'))
sys.path.append(os.path.join(sys.path[0],'.'))
import TermTk as ttk


_unicode_version = "13.0.0"
zw = wcwidth.ZERO_WIDTH[_unicode_version]
# zwcf = wcwidth.ZERO_WIDTH_CF
we = wcwidth.WIDE_EASTASIAN[_unicode_version]
zwcf = [
    0,       # Null (Cc)
    0x034F,  # Combining grapheme joiner (Mn)
    0x200B,  # Zero width space
    0x200C,  # Zero width non-joiner
    0x200D,  # Zero width joiner
    0x200E,  # Left-to-right mark
    0x200F,  # Right-to-left mark
    0x2028,  # Line separator (Zl)
    0x2029,  # Paragraph separator (Zp)
    0x202A,  # Left-to-right embedding
    0x202B,  # Right-to-left embedding
    0x202C,  # Pop directional formatting
    0x202D,  # Left-to-right override
    0x202E,  # Right-to-left override
    0x2060,  # Word joiner
    0x2061,  # Function application
    0x2062,  # Invisible times
    0x2063,  # Invisible separator
]


def set2binmask(s):
    ret = []
    for v in s:
        id = v >> 5
        mask = v & 0x1F
        bit = 1 << mask
        if id >= len(ret):
            ret += [0]*(id-len(ret)+2)
        ret[id] |= bit
    return ret


print(f"Create Set...")
zset = []
for a,b in zw:
    for v in range(a,b+1):
        zset.append(v)
for v in zwcf:
    zset.append(v)
zset = set(zset)

wset = []
for a,b in we:
    for v in range(a,b+1):
        wset.append(v)
wset = set(wset)

print(f"Create Set DONE!!!")

print(f"Create CharSetStringTest...")
cstr = ""
for _ in range(0x4000):
    cstr += chr(random.randint(0x100,0x20000))
print(f"Create CharSetStringTest DONE!!!")

# print(f"{set2binmask(zset)}")

bzset = set2binmask(zset)
bwset = set2binmask(wset)

print(f"len zset 0x{len(zset):04x}")
print(f"len zset 0x{len(bzset):04x}")
print(f"len wset 0x{len(wset):04x}")
print(f"len wset 0x{len(bwset):04x}")
print(f"len cstr 0x{len(cstr):04x}")

print([f"'{ch}':{unicodedata.east_asian_width(ch)}:{unicodedata.category(ch)}" for ch in cstr])

# @lru_cache(maxsize=3)
# def ttt(val):
#     return random.randint(10,100)
#
# print(f"{ttt(1)=}")
# print(f"{ttt(2)=}")
# print(f"{ttt(3)=}")
# print(f"{ttt(1)=}")unicodedata.category
# print(f"{ttt(1)=}")
# print(f"{ttt(3)=}")
# print(f"{ttt(2)=}")

def _bisearch(ucs, table):
    lbound = 0
    ubound = len(table) - 1

    if ucs < table[0][0] or ucs > table[ubound][1]:
        return 0
    while ubound >= lbound:
        mid = (lbound + ubound) // 2
        if ucs > table[mid][1]:
            lbound = mid + 1
        elif ucs < table[mid][0]:
            ubound = mid - 1
        else:
            return 1

    return 0

@lru_cache(maxsize=1000)
def _bicache(ucs, table):
    lbound = 0
    ubound = len(table) - 1

    if ucs < table[0][0] or ucs > table[ubound][1]:
        return 0
    while ubound >= lbound:
        mid = (lbound + ubound) // 2
        if ucs > table[mid][1]:
            lbound = mid + 1
        elif ucs < table[mid][0]:
            ubound = mid - 1
        else:
            return 1

    return 0

def test1():
    cw = 0
    for ch in cstr:
        cw += _bisearch(ord(ch), zw)
    return cw

def test2():
    cw = 0
    for ch in cstr:
        cw += _bicache(ord(ch), zw)
    return cw

def test3():
    return  wcwidth.wcswidth(cstr)

def test4():
    cw = 0
    for ch in cstr:
        cw += 1 if ord(ch) in wset else 0
    return cw

def test5():
    cw = sum([1 if ord(ch) in wset else 0 for ch in cstr])
    return cw

def test6():
    return len(cstr) + sum([ord(ch) in wset for ch in cstr]) - sum([ord(ch) in zset for ch in cstr])

def test7():
    return len(cstr) + sum([bwset[ord(ch)>>5]>>(ord(ch)&0x1F)&1 for ch in cstr]) - sum([bzset[ord(ch)>>5]>>(ord(ch)&0x1F)&1 for ch in cstr])

def test8():
    return len(cstr) + sum([bwset[ord(ch)>>5]>>(ord(ch)&0x1F)&1 for ch in cstr]) - sum([ord(ch) in zset for ch in cstr])

def test9():
    return len(cstr) + sum([0!=(bwset[ord(ch)>>5]&(1<<(ord(ch)&0x1F))) for ch in cstr]) - sum([ord(ch) in zset for ch in cstr])

def test10():
    return ( len(cstr) +
             sum(['W'==unicodedata.east_asian_width(ch) for ch in cstr]) -
             sum(['Me'==(c:=unicodedata.category(ch)) or 'Mn'==c for ch in cstr]) )
def test11():
    return ( len(cstr) +
             sum([unicodedata.east_asian_width(ch) == 'W' for ch in cstr]) -
             sum([unicodedata.category(ch) in ('Me','Mn') for ch in cstr]) )

def test12():
    retTxt = []
    retCol = []
    for i,ch in enumerate(cstr):
        if unicodedata.east_asian_width(ch) == 'W':
            retTxt += (ch,'')
            retCol += (ch,ch)
        if unicodedata.category(ch) in ('Me','Mn'):
            retTxt[-1]+=ch
        else:
            retTxt.append(ch)
            retCol.append(ch)
    return (len(retTxt), len(retCol))


loop = 100

result = timeit.timeit('test4()', globals=globals(), number=loop)
print(f"4  {result / loop:.10f} - {result / loop} {test4()}")
result = timeit.timeit('test5()', globals=globals(), number=loop)
print(f"5  {result / loop:.10f} - {result / loop} {test5()}")
result = timeit.timeit('test6()', globals=globals(), number=loop)
print(f"6  {result / loop:.10f} - {result / loop} {test6()}")
result = timeit.timeit('test10()', globals=globals(), number=loop)
print(f"10 {result / loop:.10f} - {result / loop} {test10()}")
result = timeit.timeit('test11()', globals=globals(), number=loop)
print(f"11 {result / loop:.10f} - {result / loop} {test11()}")
result = timeit.timeit('test7()', globals=globals(), number=loop)
print(f"7  {result / loop:.10f} - {result / loop} {test7()}")
result = timeit.timeit('test8()', globals=globals(), number=loop)
print(f"8  {result / loop:.10f} - {result / loop} {test8()}")
result = timeit.timeit('test9()', globals=globals(), number=loop)
print(f"9  {result / loop:.10f} - {result / loop} {test9()}")
result = timeit.timeit('test12()', globals=globals(), number=loop)
print(f"12 {result / loop:.10f} - {result / loop} {test12()}")


result = timeit.timeit('test3()', globals=globals(), number=loop)
print(f"3w {result / loop:.10f} - {result / loop} {test3()}")
result = timeit.timeit('test1()', globals=globals(), number=loop)
print(f"1w {result / loop:.10f} - {result / loop} {test1()}")
result = timeit.timeit('test2()', globals=globals(), number=loop)
print(f"2w {result / loop:.10f} - {result / loop} {test2()}")