From bc6bd228b817e6a8b6c2f7e67e473b48319c4c72 Mon Sep 17 00:00:00 2001
From: Gleb Mazovetskiy
Date: Mon, 8 Nov 2021 00:52:14 +0000
Subject: [PATCH] A tool to segment Chinese .po files with ZWSP

This tool can insert ZWSP between words in Chinese .po files using this
model: https://tfhub.dev/google/zh_segmentation/1

We will use this tool to be able to word wrap Chinese translations at
runtime without bundling a segmenter.

Doing this offline also allows us to use a segmenter that would
otherwise be too slow for runtime.
---
 tools/zh_segmenter/README.md  |  37 +++++++
 tools/zh_segmenter/segment.py | 186 ++++++++++++++++++++++++++++++++++
 2 files changed, 223 insertions(+)
 create mode 100644 tools/zh_segmenter/README.md
 create mode 100755 tools/zh_segmenter/segment.py

diff --git a/tools/zh_segmenter/README.md b/tools/zh_segmenter/README.md
new file mode 100644
index 000000000..4c3ddab53
--- /dev/null
+++ b/tools/zh_segmenter/README.md
@@ -0,0 +1,37 @@
+# Chinese segmenter for gettext (.po) translation files
+
+Inserts [ZWSP] between the segments of Chinese text.
+
+Uses a high quality `zh_segmentation` model from Google: <https://tfhub.dev/google/zh_segmentation/1>.
+
+## Pre-requisites
+
+1. Python. The easiest way to install Python on any Linux system is .
+
+2. ```bash
+   pip install 'tensorflow_text>=2.4.0b0'
+   ```
+
+## Usage
+
+To re-segment the current translation files:
+
+```shell
+tools/zh_segmenter/segment.py --input_path Translations/zh_CN.po
+tools/zh_segmenter/segment.py --input_path Translations/zh_TW.po
+```
+
+Additionally, you can provide a different separator, such as `--separator='|'`, for debugging.
+
+This tool performs a number of replacements to make sure interpolations are not affected etc.
+
+You can also see the segmenter output for a given string like this:
+
+```console
+tools/zh_segmenter/segment.py --debug '返回到 {:d} 层'
+```
+```
+返回|到 {:d} 层
+```
+
+[ZWSP]: https://en.wikipedia.org/wiki/Zero-width_space
diff --git a/tools/zh_segmenter/segment.py b/tools/zh_segmenter/segment.py
new file mode 100755
index 000000000..f550a6c8a
--- /dev/null
+++ b/tools/zh_segmenter/segment.py
@@ -0,0 +1,292 @@
+#!/usr/bin/env python
+"""Segments the Chinese translations of a gettext .po file.
+
+Every msgstr is split into words with the zh_segmentation model and the words
+are re-joined with a separator (ZWSP by default), so that the text can be word
+wrapped at runtime without bundling a segmenter.
+"""
+import argparse
+import functools
+import os
+import pathlib
+import re
+import sys
+
+# Runs of text that are never split: {...} interpolations, backslash escapes,
+# ASCII word characters (plus "/", "-", ":"), ASCII punctuation and whitespace.
+_DISALLOW_SEGMENTATION = re.compile(
+    r'(?:\{[^\}]*\}|\\[a-z]|[\w/-](?::[\w/-]?)?|[.,;?!=+/#]|\s)+'.encode())
+
+_ZWSP = "\u200B"
+
+_MODEL_HANDLE = "https://tfhub.dev/google/zh_segmentation/1"
+
+_MSGSTR_PREFIX = b'msgstr '
+
+# Maximum number of characters of text on one quoted msgstr line.
+_WRAP_WIDTH = 63
+
+# Separators next to these characters are removed by _RemoveRedundantMarkers.
+# NOTE(review): this list used to contain the space twice, so a full-width
+# character (e.g. U+3000) may have been lost in transit -- please confirm.
+_REDUNDANT_MARKER_CHARS = ' ,、。?!'
+
+# One unit of .po string content: a backslash escape or a single character.
+_PO_UNIT = re.compile(r'\\.|.', re.DOTALL)
+
+_DEFAULT_INPUT_PATH = pathlib.Path(__file__).resolve(
+).parent.parent.parent.joinpath("Translations/zh_CN.po")
+
+
+def _ParseArgs(argv=None):
+    """Parses the command line; argv=None means sys.argv."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input_path", default=_DEFAULT_INPUT_PATH,
+                        help="Path to the input .po file")
+    parser.add_argument("--output_path", help="Output path (default: in-place)")
+    parser.add_argument("--separator", default=_ZWSP,
+                        help="Separator to use between segments")
+    parser.add_argument("--debug",
+                        help="If this flag is given, segments the given string, joins with \"|\", and prints it.")
+    return parser.parse_args(argv)
+
+
+@functools.lru_cache(maxsize=None)
+def _GetSegmenter():
+    """Loads the segmentation model on first use and caches it."""
+    # Imported here so that --help and the pure helpers work without the model.
+    import tensorflow_text
+    return tensorflow_text.HubModuleTokenizer(_MODEL_HANDLE)
+
+
+def _RunSegmenter(text, separator):
+    """Runs the segmenter and produces a separator-joined string.
+
+    Args:
+      text: UTF-8 encoded text; markers that are already present are dropped.
+      separator: UTF-8 encoded separator to insert between the segments.
+
+    Returns:
+      The segmented text as bytes; b'' when there is nothing to segment.
+    """
+    if text:
+        text = _RemoveAllMarkers(text, separator)
+    if not text:
+        # Always bytes: the callers write the result to a binary stream.
+        return b''
+    _tokens, starts, ends = _GetSegmenter().tokenize_with_offsets(text)
+    starts, ends = _RecoverGaps(text, starts.numpy(), ends.numpy())
+    starts, ends = _MergeDisallowedPositions(text, starts, ends)
+    output = separator.join(text[i:j] for i, j in zip(starts, ends))
+    # Clean up with the separator that was just inserted, also for --debug.
+    return _RemoveRedundantMarkers(output, separator)
+
+
+def _RecoverGaps(text, starts, ends):
+    """Recovers gaps from the segmenter-produced start and end indices.
+
+    The segmenter may produce gaps around spaces, e.g. segment("hello world") => "hello|world"
+    Each gap becomes a segment of its own, except for a gap after the last token,
+    which is added to that token.
+
+    Returns:
+      (starts, ends) as lists of ints whose segments cover all of text.
+    """
+    out_starts = []
+    out_ends = []
+    prev_end = 0
+    for start, end in zip(starts, ends):
+        start, end = int(start), int(end)
+        if start != prev_end:
+            out_starts.append(prev_end)
+            out_ends.append(start)
+        out_starts.append(start)
+        out_ends.append(end)
+        prev_end = end
+    if not out_ends:
+        # No tokens at all: the whole text, if any, is a single segment.
+        return ([0], [len(text)]) if text else ([], [])
+    out_ends[-1] = len(text)
+    return out_starts, out_ends
+
+
+def _MergeDisallowedPositions(text, starts, ends):
+    """Merges segments disallowed by _DISALLOW_SEGMENTATION.
+
+    A segment that starts inside a _DISALLOW_SEGMENTATION match is appended to
+    the segment before it; the first position of a match stays a boundary.
+    """
+    if not len(starts):
+        return [], []
+    disallowed = set()
+    for m in _DISALLOW_SEGMENTATION.finditer(text):
+        disallowed.update(range(m.start() + 1, m.end()))
+
+    out_starts = [starts[0]]
+    out_ends = [ends[0]]
+    for start, end in zip(starts[1:], ends[1:]):
+        if start in disallowed:
+            out_ends[-1] = end
+        else:
+            out_starts.append(start)
+            out_ends.append(end)
+    return out_starts, out_ends
+
+
+@functools.lru_cache(maxsize=None)
+def _RedundantMarkersPattern(separator):
+    """Compiles the pattern used by _RemoveRedundantMarkers for separator."""
+    # The pattern is matched against UTF-8 bytes, so a multi-byte separator has
+    # to be grouped before "?" and the characters have to be alternatives: in a
+    # [...] class every byte of a multi-byte character would match on its own.
+    optional_separator = b'(?:' + re.escape(separator) + b')?'
+    chars = b'|'.join(re.escape(c.encode()) for c in _REDUNDANT_MARKER_CHARS)
+    return re.compile(optional_separator + b'(' + chars + b')' + optional_separator)
+
+
+def _RemoveRedundantMarkers(text, separator=_ZWSP.encode()):
+    """Removes segmentation markers for cases that are handled at runtime anyway."""
+    return _RedundantMarkersPattern(separator).sub(rb'\1', text)
+
+
+@functools.lru_cache(maxsize=None)
+def _AllMarkersPattern(separator):
+    """Compiles a pattern that matches runs of separator and ZWSP."""
+    return re.compile(b''.join(
+        [b'(?:', re.escape(separator), b'|', re.escape(_ZWSP.encode()), b')+']))
+
+
+def _RemoveAllMarkers(text, separator=_ZWSP.encode()):
+    """Remove the existing segmentation markers to allow for re-segmenting."""
+    return _AllMarkersPattern(separator).sub(b'', text)
+
+
+def _QuoteLine(line):
+    """Wraps line in double quotes and terminates it with a newline."""
+    return f'"{line}"\n'
+
+
+def _WrapUnits(units, width):
+    """Packs units into chunks of at most width characters without splitting a unit."""
+    chunks = []
+    current = ''
+    for unit in units:
+        if current and len(current) + len(unit) > width:
+            chunks.append(current)
+            current = ''
+        current += unit
+    if current:
+        chunks.append(current)
+    return chunks
+
+
+def _FormatMsgStr(text_bytes):
+    """A rough approximation of poedit formatting and wrapping.
+
+    Args:
+      text_bytes: UTF-8 encoded msgstr contents with .po escapes, unquoted.
+
+    Returns:
+      The quoted lines that follow the "msgstr " prefix, as bytes.
+    """
+    if not text_bytes:
+        return b'""\n'
+
+    # Work on whole escapes so that neither the "\n" detection nor the wrapping
+    # can cut between a backslash and the character that it escapes.
+    logical_lines = [[]]
+    for unit in _PO_UNIT.findall(text_bytes.decode()):
+        logical_lines[-1].append(unit)
+        if unit == '\\n':
+            logical_lines.append([])
+
+    output_lines = []
+    for units in logical_lines:
+        ends_with_newline = bool(units) and units[-1] == '\\n'
+        if ends_with_newline:
+            units = units[:-1]
+        chunks = _WrapUnits(units, _WRAP_WIDTH)
+        if ends_with_newline:
+            # The "\n" escape goes on the last chunk and is not counted.
+            chunks[-1:] = [(chunks[-1] if chunks else '') + '\\n']
+        output_lines.extend(map(_QuoteLine, chunks))
+
+    if len(output_lines) > 1:
+        output_lines.insert(0, '""\n')
+    return ''.join(output_lines).encode()
+
+
+def _Unquote(quoted):
+    """Strips trailing whitespace and the closing quote from the rest of a line."""
+    body = quoted.rstrip()
+    return body[:-1] if body.endswith(b'"') else body
+
+
+def _SegmentPoLines(lines, separator):
+    """Returns the lines of a .po file with the msgstr entries re-segmented.
+
+    Args:
+      lines: The lines of the file as bytes, including their line endings.
+      separator: UTF-8 encoded separator to insert between the segments.
+    """
+    # Skip poedit header: everything up to and including the first blank line.
+    # Without a blank line the whole file is header and is passed through.
+    header_end = next(
+        (i + 1 for i, line in enumerate(lines) if not line.strip()), len(lines))
+    output = list(lines[:header_end])
+
+    msgstr_prefix = b''
+    msgstr = []
+
+    def flush():
+        """Segments the collected msgstr, if any, and appends it to output."""
+        if msgstr:
+            text = _RunSegmenter(b''.join(msgstr), separator)
+            output.append(msgstr_prefix + _FormatMsgStr(text))
+            msgstr.clear()
+
+    in_msgstr = False
+    for line in lines[header_end:]:
+        # NOTE(review): plural entries ("msgstr[0] ...") do not match the prefix
+        # and are passed through unsegmented -- confirm that this is intended.
+        if line.startswith(_MSGSTR_PREFIX) and b'"' in line:
+            flush()
+            msgstr_prefix, rest = line.split(b'"', maxsplit=1)
+            msgstr.append(_Unquote(rest))
+            in_msgstr = True
+        elif in_msgstr and line.startswith(b'"'):
+            msgstr.append(_Unquote(line[1:]))
+        else:
+            flush()
+            output.append(line)
+            in_msgstr = False
+    flush()
+    return output
+
+
+def _OpenOutput(path):
+    """Opens path for binary writing; "/dev/stdout" is the process's own stdout."""
+    if str(path) == "/dev/stdout":
+        return os.fdopen(sys.stdout.fileno(), "wb", closefd=False)
+    return open(path, 'wb')
+
+
+def main():
+    """Runs the tool as configured on the command line."""
+    args = _ParseArgs()
+    # "is not None" so that --debug '' does not fall through to rewriting files.
+    if args.debug is not None:
+        with _OpenOutput("/dev/stdout") as f:
+            f.write(_RunSegmenter(args.debug.encode(), separator=b'|'))
+            f.write(b'\n')
+        return
+
+    with open(args.input_path, 'rb') as f:
+        lines = f.readlines()
+    output = _SegmentPoLines(lines, args.separator.encode())
+    with _OpenOutput(args.output_path or args.input_path) as f:
+        f.write(b''.join(output))
+
+
+if __name__ == "__main__":
+    main()