You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

21 lines
527 B

#!/usr/bin/env python
from typing import List, Tuple
import tensorflow_text
import segmenter_lib
class ZhTokenizer():
_MODEL_HANDLE = "https://tfhub.dev/google/zh_segmentation/1"
def __init__(self) -> None:
self._tokenizer = tensorflow_text.HubModuleTokenizer(self._MODEL_HANDLE)
def __call__(self, text: bytes) -> Tuple[List[int], List[int]]:
_tokens, starts, ends = self._tokenizer.tokenize_with_offsets(text)
return starts.numpy(), ends.numpy()
if __name__ == "__main__":
segmenter_lib.Main(ZhTokenizer())