You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

27 lines
629 B

#!/usr/bin/env python
from typing import List, Tuple
import sudachipy
import segmenter_lib
class JaTokenizer():
    """Callable tokenizer that reports token boundaries as UTF-8 byte offsets.

    An instance wraps a SudachiPy tokenizer. Calling it with UTF-8 encoded
    bytes returns, for every token, the byte position where the token starts
    and the byte position just past its end, both measured in the input bytes.
    """

    # Split mode handed to every ``tokenize`` call.
    _MODE = sudachipy.SplitMode.C

    def __init__(self) -> None:
        """Create the underlying tokenizer from the default dictionary."""
        self._tokenizer = sudachipy.Dictionary().create()

    def __call__(self, text: bytes) -> Tuple[List[int], List[int]]:
        """Tokenize ``text`` and return the byte offsets of each token.

        Args:
            text: UTF-8 encoded input.

        Returns:
            A ``(starts, ends)`` pair of equally long lists; ``starts[i]`` and
            ``ends[i]`` are the begin and end byte offsets of the i-th token
            within ``text``.

        Raises:
            UnicodeDecodeError: If ``text`` is not valid UTF-8.
        """
        unicode_text = text.decode()
        # Pass the mode explicitly so that ``_MODE`` actually takes effect
        # instead of silently relying on the tokenizer's default.
        tokens = self._tokenizer.tokenize(unicode_text, mode=self._MODE)

        # byte_offsets[i] is the UTF-8 length of the first i characters.
        # Building the table once keeps the conversion linear in the text
        # length rather than re-encoding a growing prefix for every token.
        byte_offsets = [0]
        running_total = 0
        for char in unicode_text:
            running_total += len(char.encode())
            byte_offsets.append(running_total)

        starts = []
        ends = []
        for token in tokens:
            # begin()/end() are used as character indices into unicode_text.
            starts.append(byte_offsets[token.begin()])
            ends.append(byte_offsets[token.end()])
        return starts, ends
if __name__ == "__main__":
    # Script entry point: hand a tokenizer instance to the segmenter driver.
    ja_tokenizer = JaTokenizer()
    segmenter_lib.Main(ja_tokenizer)