|
|
|
|
|
|
|
|
|
#!/usr/bin/env python |
|
|
|
|
import argparse |
|
|
|
|
import pathlib |
|
|
|
|
import os |
|
|
|
|
import re |
|
|
|
|
import sys |
|
|
|
|
|
|
|
|
|
import tensorflow_text |
|
|
|
|
|
|
|
|
|
# Byte-level pattern for spans inside which segmentation markers must not be
# inserted: "{...}" format placeholders, backslash escapes like "\n", runs of
# ASCII word/slash/hyphen characters (optionally with a ":" option suffix),
# common ASCII punctuation, and whitespace. Interior positions of each match
# are merged away by _MergeDisallowedPositions.
# Compiled over bytes (.encode()) because the .po file is processed as raw
# bytes throughout this script.
_DISALLOW_SEGMENTATION = re.compile(
    r'(?:\{[^\}]*\}|\\[a-z]|[\w/-](?::[\w/-]?)?|[.,;?!=+/#]|\s)+'.encode())
|
|
|
|
|
|
|
|
|
# Zero-width space: the default marker inserted between segments.
_ZWSP = "\u200B"

# TF-Hub handle of the Chinese text segmentation model.
_MODEL_HANDLE = "https://tfhub.dev/google/zh_segmentation/1"
|
|
|
|
|
|
|
|
|
# Default .po file: "Translations/zh_CN.po" three directory levels above this
# script's resolved location.
default_input_path = (
    pathlib.Path(__file__).resolve().parents[2] / "Translations" / "zh_CN.po")
|
|
|
|
|
|
|
|
|
# Command-line interface. Note: arguments are parsed at import time — this
# module is a script, not an importable library.
parser = argparse.ArgumentParser()
parser.add_argument("--input_path", default=default_input_path,
                    help="Path to the input .po file")
parser.add_argument("--output_path", help="Output path (default: in-place)")
parser.add_argument("--separator", default=_ZWSP,
                    help="Separator to use between segments")
# NOTE(review): --debug takes a string value (there is no action="store_true"),
# so despite the word "flag" in the help text it must be invoked as
# --debug=<text-to-segment>.
parser.add_argument("--debug",
                    help="If this flag is given, segments the given string, joins with \"|\", and prints it.")
args = parser.parse_args()
|
|
|
|
|
|
|
|
|
# The separator is applied to raw bytes, so encode it once up front.
_SEPARATOR = args.separator.encode()
# Instantiates the TF-Hub segmentation model; this may download the model on
# first use and can take noticeable time.
_SEGMENTER = tensorflow_text.HubModuleTokenizer(_MODEL_HANDLE)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _RunSegmenter(text, separator):
  """Segments `text` and produces a separator-joined bytes string.

  Args:
    text: The msgstr content as bytes; any pre-existing segmentation
      markers are stripped before re-segmenting.
    separator: Bytes inserted between adjacent segments.

  Returns:
    The separator-joined segmentation as bytes (b'' for empty input).
  """
  if not text:
    # BUG FIX: previously returned [] (a list) although this function is
    # documented to produce a string; callers such as the --debug path's
    # f.write() require bytes.
    return b''
  text = _RemoveAllMarkers(text)
  _tokens, starts, ends = _SEGMENTER.tokenize_with_offsets(text)
  starts, ends = starts.numpy(), ends.numpy()
  if not len(starts):
    # No tokens produced: nothing to segment, return the cleaned text.
    return text
  starts, ends = _RecoverGaps(text, starts, ends)
  starts, ends = _MergeDisallowedPositions(text, starts, ends)
  output = separator.join([text[i:j] for i, j in zip(starts, ends)])
  return _RemoveRedundantMarkers(output)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _RecoverGaps(text, starts, ends): |
|
|
|
|
"""Recovers gaps from the segmenter-produced start and end indices. |
|
|
|
|
|
|
|
|
|
The segmenter may produce gaps around spaces, e.g. segment("hello world") => "hello|world" |
|
|
|
|
""" |
|
|
|
|
out_starts = [] |
|
|
|
|
out_ends = [] |
|
|
|
|
prev_end = 0 |
|
|
|
|
for start, end in zip(starts, ends): |
|
|
|
|
if start != prev_end: |
|
|
|
|
out_starts.append(prev_end) |
|
|
|
|
out_ends.append(start) |
|
|
|
|
out_starts.append(start) |
|
|
|
|
out_ends.append(end) |
|
|
|
|
prev_end = end |
|
|
|
|
if out_ends[-1] != len(text): |
|
|
|
|
out_ends[-1] = len(text) |
|
|
|
|
return out_starts, out_ends |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _MergeDisallowedPositions(text, starts, ends):
  """Merges adjacent segments whose boundary falls in a disallowed span.

  A boundary is disallowed when it lies strictly inside a match of
  _DISALLOW_SEGMENTATION (e.g. inside a "{placeholder}" or an ASCII token);
  the two segments around such a boundary are merged into one.

  Args:
    text: The bytes being segmented.
    starts: Segment start offsets, ascending.
    ends: Segment end offsets.

  Returns:
    A (starts, ends) pair of lists with disallowed boundaries merged away.
  """
  if not len(starts):
    # BUG FIX: previously raised IndexError on starts[0] for empty input.
    return [], []
  disallowed = set()
  for m in re.finditer(_DISALLOW_SEGMENTATION, text):
    # Only interior positions are disallowed; breaking at a match's outer
    # edges is still permitted.
    disallowed.update(range(m.start() + 1, m.end()))

  out_starts = [starts[0]]
  out_ends = [ends[0]]
  for start, end in zip(starts[1:], ends[1:]):
    if start in disallowed:
      # Extend the previous segment instead of starting a new one.
      out_ends[-1] = end
    else:
      out_starts.append(start)
      out_ends.append(end)
  return out_starts, out_ends
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Matches an optional separator on either side of punctuation after which a
# line break may happen at runtime anyway, making the marker redundant.
# BUG FIX: the '?' quantifiers previously applied only to the LAST character
# of the escaped separator, which is wrong for any multi-character
# --separator; wrap each separator in a non-capturing group so the whole
# separator is optional.
_REMOVE_REDUNDANT_MARKERS = re.compile(b''.join([
    b'(?:', re.escape(_SEPARATOR), b')?',
    '([ ,、。?!])'.encode(),
    b'(?:', re.escape(_SEPARATOR), b')?']))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _RemoveRedundantMarkers(text):
  """Removes segmentation markers for cases that are handled at runtime anyway.

  Args:
    text: Bytes possibly containing separator markers around punctuation.

  Returns:
    `text` with redundant markers dropped; the punctuation itself is kept
    via the backreference.
  """
  # BUG FIX: the replacement must be bytes (rb'\1') to match the bytes
  # pattern and bytes input; the previous str replacement r'\1' raises
  # TypeError at substitution time.
  return re.sub(_REMOVE_REDUNDANT_MARKERS, rb'\1', text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# One or more consecutive markers: either the configured separator or a
# literal zero-width space (the default marker left by earlier runs).
_SEGMENTATION_MARKERS = re.compile(
    b'(?:' + re.escape(_SEPARATOR) + b'|' + re.escape(_ZWSP.encode()) + b')+')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _RemoveAllMarkers(text):
  """Strips every existing segmentation marker so `text` can be re-segmented."""
  return _SEGMENTATION_MARKERS.sub(b'', text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _QuoteLine(line): |
|
|
|
|
return f'"{line}"\n' |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _SplitEveryN(input, n): |
|
|
|
|
return [input[i:i + n] for i in range(0, len(input), n)] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _FormatMsgStr(text_bytes):
  """A rough approximation of poedit formatting and wrapping.

  Splits on the literal two-character escape "\\n" (how .po files store
  newlines), wraps each logical line every 63 characters, quotes every
  physical line, and prepends a lone '""' line when the value spans
  multiple physical lines.

  Args:
    text_bytes: The msgstr content as bytes (may be empty).

  Returns:
    The formatted msgstr value as bytes; each physical line is quoted and
    newline-terminated.
  """
  if not text_bytes:
    return b'""\n'

  text = text_bytes.decode()
  output_lines = []
  # Split on the escape sequence "\n", not a real newline character.
  lines_with_newline = text.split('\\n')
  for line_i, line_with_newline in enumerate(lines_with_newline):
    if not line_with_newline and line_i != len(lines_with_newline) - 1:
      # Empty logical line (consecutive "\n" escapes): emit a bare '"\n"'.
      output_lines.append('"\\n"\n')
      continue
    # NOTE(review): 63 presumably mirrors poedit's wrap width — confirm.
    lines = _SplitEveryN(line_with_newline, 63)
    if line_i == len(lines_with_newline) - 1:
      # Last logical line: no trailing "\n" escape to restore.
      lines = map(_QuoteLine, lines)
    else:
      lines[0:-1] = map(_QuoteLine, lines[0:-1])
      # Restore the "\n" escape that split() consumed onto the last chunk.
      lines[-1] = f'"{lines[-1]}\\n"\n'
    output_lines.extend(lines)

  if len(output_lines) > 1:
    # Multi-line values conventionally start with an empty quoted string.
    output_lines.insert(0, '""\n')
  return ''.join(output_lines).encode()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if args.debug:
  # Debug mode: segment the --debug string, join segments with "|", print
  # the result, and quit without touching any .po file.
  # Write raw bytes to stdout; closefd=False keeps sys.stdout usable after
  # the with-block closes the wrapper.
  with os.fdopen(sys.stdout.fileno(), "wb", closefd=False) as f:
    f.write(_RunSegmenter(args.debug.encode(), separator=b'|'))
    f.write(b'\n')
  # BUG FIX: use sys.exit() rather than the interactive-only exit() helper,
  # which is injected by the site module and may be absent (e.g. python -S).
  sys.exit()
|
|
|
|
|
|
|
|
|
# Default to rewriting the input file in place.
if not args.output_path:
  args.output_path = args.input_path

# Read the whole .po file as raw bytes to preserve its exact encoding.
with open(args.input_path, 'rb') as f:
  input = f.readlines()
|
|
|
|
|
|
|
|
|
# Prefix of lines that start a msgstr entry, e.g. b'msgstr "..."'.
_MSGSTR_PREFIX = b'msgstr '

# Re-assembled output lines (bytes), written back at the end of the script.
output = []

# Skip poedit header: everything up to and including the first blank line is
# copied through untouched so the header's msgstr is never re-segmented.
# NOTE(review): input.index raises ValueError if the file has no blank line —
# assumes a well-formed .po file.
header_end = input.index(b'\n') + 1
output.extend(input[:header_end])
input = input[header_end:]

# Parser state: whether we are inside a (possibly multi-line) msgstr, the
# bytes preceding the entry's first '"', and the accumulated unquoted content.
# NOTE(review): msgstr_prefix is initialized to a str but holds bytes once
# assigned in the loop; it is only consumed after such an assignment (guarded
# by `if msgstr:`), so the mismatch is harmless — consider b"" for clarity.
in_msgstr = False
msgstr_prefix = ""
msgstr = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _ProcessMsgStr():
  """Segments the accumulated msgstr and appends it, re-formatted, to output.

  Reads the module-level `msgstr` (list of raw content chunks) and
  `msgstr_prefix` (bytes before the opening quote), and appends one bytes
  entry to the module-level `output` list.
  """
  text = _RunSegmenter(b''.join(msgstr), separator=_SEPARATOR)
  output.append(msgstr_prefix + _FormatMsgStr(text))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# State machine over the remaining lines: collect each msgstr (including its
# quoted continuation lines), re-segment it, and copy all other lines through.
for line in input:
  if line.startswith(_MSGSTR_PREFIX):
    # Start of a msgstr entry: keep everything before the opening quote as
    # the prefix; line[:-2] drops the closing quote and newline (assumes
    # '"\n' endings — i.e. Unix line endings).
    msgstr_prefix, line = line.split(b'"', maxsplit=1)
    msgstr.append(line[:-2])
    in_msgstr = True
  elif in_msgstr and line.startswith(b'"'):
    # Continuation line: strip the surrounding quotes and trailing newline.
    msgstr.append(line[1:-2])
  else:
    # Any other line terminates the msgstr being collected, if any.
    if msgstr:
      _ProcessMsgStr()
      msgstr.clear()
      msgstr_prefix = ""
    output.append(line)
    in_msgstr = False

# Flush a msgstr that runs to the very end of the file.
if msgstr:
  _ProcessMsgStr()
|
|
|
|
|
|
|
|
|
# Write the rebuilt file. "/dev/stdout" is special-cased so the underlying
# stdout file descriptor is not closed out from under the interpreter.
if args.output_path != "/dev/stdout":
  _out_file = open(args.output_path, 'wb')
else:
  _out_file = os.fdopen(sys.stdout.fileno(), "wb", closefd=False)
with _out_file as f:
  f.write(b''.join(output))