diff --git a/tools/segmenter/README.md b/tools/segmenter/README.md index e08ded124..00302c5a2 100644 --- a/tools/segmenter/README.md +++ b/tools/segmenter/README.md @@ -47,7 +47,13 @@ You can also see the segmenter output for a given string like this: tools/segmenter/segment_zh.py --debug '返回到 {:d} 层' ``` ``` -返回|到| |{|:d}| |层 +返回|到 {:d} 层 +``` + +When inspecting the diffs, you can use `sed` to display the segments, e.g.: + +```bash +git diff --color | sed "s/$(echo -ne '\u200B')/|/g" ``` [ZWSP]: https://en.wikipedia.org/wiki/Zero-width_space diff --git a/tools/segmenter/segmenter_lib.py b/tools/segmenter/segmenter_lib.py index ff7747697..fb0250b4d 100644 --- a/tools/segmenter/segmenter_lib.py +++ b/tools/segmenter/segmenter_lib.py @@ -66,7 +66,7 @@ def _RecoverGaps(text: bytes, starts: List[int], ends: List[int]) -> Tuple[List[ _DISALLOW_SEGMENTATION = re.compile( - r'(?:\{.*?\}|\\[a-z]|[\w/-](?::[\w/-]?)?|[.,;?!=+/#]|.?’.?|.:)+'.encode()) + r'(?:\{.*?\}|\\[a-z]|[\w/-](?::[\w/-]?)?|[.,;?!=+/#]|.?’.?|.:|%[0-9a-z.]+)+'.encode()) def _MergeDisallowedPositions(text: bytes, starts: List[int], ends: List[int]) -> Tuple[List[int], List[int]]: