You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
28 lines
629 B
28 lines
629 B
|
4 years ago
|
#!/usr/bin/env python
|
||
|
|
from typing import List, Tuple
|
||
|
|
|
||
|
|
import sudachipy
|
||
|
|
|
||
|
|
import segmenter_lib
|
||
|
|
|
||
|
|
|
||
|
|
class JaTokenizer:
    """Callable that segments UTF-8 encoded text with SudachiPy.

    Calling an instance with raw bytes returns the token boundaries as
    byte offsets into those same bytes.
    """

    # Split mode handed to the SudachiPy tokenizer on every call.
    _MODE = sudachipy.SplitMode.C

    def __init__(self) -> None:
        """Create the underlying SudachiPy tokenizer."""
        self._tokenizer = sudachipy.Dictionary().create()

    @staticmethod
    def _utf8_prefix_lengths(unicode_text: str) -> List[int]:
        """Return a table mapping character index -> UTF-8 byte offset.

        Entry ``i`` is the number of bytes needed to encode
        ``unicode_text[:i]``; the table has ``len(unicode_text) + 1``
        entries so that an end index equal to the text length is valid.
        """
        offsets = [0]
        running_total = 0
        for char in unicode_text:
            running_total += len(char.encode("utf-8"))
            offsets.append(running_total)
        return offsets

    def __call__(self, text: bytes) -> Tuple[List[int], List[int]]:
        """Tokenize ``text`` and return token boundaries as byte offsets.

        Args:
            text: UTF-8 encoded input.

        Returns:
            A ``(starts, ends)`` pair of equally long lists; entry ``i`` of
            each list is the byte offset in ``text`` where token ``i``
            begins / ends.

        Raises:
            UnicodeDecodeError: if ``text`` is not valid UTF-8.
        """
        unicode_text = text.decode("utf-8")
        tokens = self._tokenizer.tokenize(unicode_text, mode=self._MODE)

        # The code treats token.begin()/token.end() as character indices
        # into unicode_text. Build the character -> byte table once instead
        # of re-encoding the whole prefix for every token, which was
        # quadratic in the length of the text.
        byte_offsets = self._utf8_prefix_lengths(unicode_text)

        starts = []
        ends = []
        for token in tokens:
            starts.append(byte_offsets[token.begin()])
            ends.append(byte_offsets[token.end()])
        return starts, ends
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == "__main__":
    # Script entry point: build the tokenizer and pass it to the
    # segmenter_lib driver.
    tokenizer = JaTokenizer()
    segmenter_lib.Main(tokenizer)
|