Segmenter: Use gettext to line-wrap po files

The line wrapping algorithm of gettext is somewhat complicated. Rather than trying to approximate it, use `msgcat` from gettext directly. This is also what poedit does.
4 years ago · 395dbb18f0
3 changed files with 41 additions and 59 deletions
--- a/tools/segmenter/segment_all.py
+++ b/tools/segmenter/segment_all.py
@ -5,22 +5,14 @@ import segment_zh
 import segment_ja
 import segmenter_lib

-
-def _Run(tokenizer, path):
-	with open(path, 'rb') as f:
-		input = f.readlines()
-
-	output = segmenter_lib.SegmentPo(input, tokenizer)
-
-	with open(path, 'wb') as f:
-		f.write(b''.join(output))
-
-
 root = pathlib.Path(__file__).resolve().parent.parent.parent

 zh_tokenizer = segment_zh.ZhTokenizer()
-_Run(zh_tokenizer, root.joinpath("Translations/zh_CN.po"))
-_Run(zh_tokenizer, root.joinpath("Translations/zh_TW.po"))
+segmenter_lib.ProcessPoFile(tokenizer=zh_tokenizer,
+                            input_path=root.joinpath("Translations/zh_CN.po"))
+segmenter_lib.ProcessPoFile(tokenizer=zh_tokenizer,
+                            input_path=root.joinpath("Translations/zh_TW.po"))

 ja_tokenizer = segment_ja.JaTokenizer()
-_Run(ja_tokenizer, root.joinpath("Translations/ja.po"))
+segmenter_lib.ProcessPoFile(tokenizer=ja_tokenizer,
+                            input_path=root.joinpath("Translations/ja.po"))
--- a/tools/segmenter/segment_ja.py
+++ b/tools/segmenter/segment_ja.py
@ -9,7 +9,6 @@ import segmenter_lib
 class JaTokenizer():
 	_MODE = sudachipy.SplitMode.C

-
 	def __init__(self) -> None:
 		self._tokenizer = sudachipy.Dictionary().create()

--- a/tools/segmenter/segmenter_lib.py
+++ b/tools/segmenter/segmenter_lib.py
@ -1,7 +1,7 @@

 import re

-from typing import List, Callable, Tuple
+from typing import List, Callable, Tuple, Union

 _Tokenizer = Callable[[bytes], Tuple[List[int], List[int]]]

@ -22,7 +22,7 @@ class Segmenter():
 	def __call__(self, text: bytes) -> bytes:
 		"""Runs the segmenter and produces a separator-joined string."""
 		if not text:
-			return []
+			return b''
 		text = self._RemoveAllMarkers(text)
 		starts, ends = self._tokenizer(text)
 		starts, ends = _RecoverGaps(text, starts, ends)
@ -88,36 +88,7 @@ def _MergeDisallowedPositions(text: bytes, starts: List[int], ends: List[int]) -


 def _QuoteLine(line):
-	return f'"{line}"\n'
-
-
-def _SplitEveryN(input, n):
-	return [input[i:i + n] for i in range(0, len(input), n)]
-
-
-def _FormatMsgStr(text_bytes):
-	"""A rough approximation of poedit formatting and wrapping."""
-	if not text_bytes:
-		return b'""\n'
-
-	text = text_bytes.decode()
-	output_lines = []
-	lines_with_newline = text.split('\\n')
-	for line_i, line_with_newline in enumerate(lines_with_newline):
-		if not line_with_newline and line_i != len(lines_with_newline) - 1:
-			output_lines.append('"\\n"\n')
-			continue
-		lines = _SplitEveryN(line_with_newline, 63)
-		if line_i == len(lines_with_newline) - 1:
-			lines = map(_QuoteLine, lines)
-		else:
-			lines[0:-1] = map(_QuoteLine, lines[0:-1])
-			lines[-1] = f'"{lines[-1]}\\n"\n'
-		output_lines.extend(lines)
-
-	if len(output_lines) > 1:
-		output_lines.insert(0, '""\n')
-	return ''.join(output_lines).encode()
+	return b'"%s"\n' % line


 _MSGSTR_PREFIX = b'msgstr '
@ -138,7 +109,7 @@ def SegmentPo(input: List[bytes], tokenizer: _Tokenizer, separator: bytes = ZWSP

 	def _ProcessMsgStr():
 		text = segmenter(b''.join(msgstr))
-		output.append(msgstr_prefix + _FormatMsgStr(text))
+		output.append(msgstr_prefix + _QuoteLine(text))

 	for line in input:
 		if line.startswith(_MSGSTR_PREFIX):
@ -161,6 +132,32 @@ def SegmentPo(input: List[bytes], tokenizer: _Tokenizer, separator: bytes = ZWSP
 	return output


+_DEFAULT_SEPARATOR = ZWSP
+_DEFAULT_PO_WRAP_WIDTH = 79
+
+
+def ProcessPoFile(tokenizer: _Tokenizer, input_path: str, output_path: Union[str, None] = None,
+                  separator: str = _DEFAULT_SEPARATOR, wrap_width: int = _DEFAULT_PO_WRAP_WIDTH):
+	import subprocess
+	import os
+	import sys
+
+	if not output_path:
+		output_path = input_path
+
+	with open(input_path, 'rb') as f:
+		input = f.readlines()
+
+	output = SegmentPo(input, tokenizer, separator=separator.encode())
+
+	with open(output_path, 'wb') if output_path != "/dev/stdout" else os.fdopen(sys.stdout.fileno(), "wb", closefd=False) as f:
+		f.write(b''.join(output))
+
+	# Run gettext's msgcat to reformat the file.
+	subprocess.run(['msgcat', '--force-po', f'--width={wrap_width}', '-o', output_path, output_path],
+                stdout=sys.stdout, stderr=sys.stderr)
+
+
 def Main(tokenizer: _Tokenizer):
 	import argparse
 	import os
@ -169,8 +166,10 @@ def Main(tokenizer: _Tokenizer):
 	parser = argparse.ArgumentParser()
 	parser.add_argument("--input_path", help="Path to the input .po file")
 	parser.add_argument("--output_path", help="Output path (default: in-place)")
-	parser.add_argument("--separator", default=ZWSP,
+	parser.add_argument("--separator", default=_DEFAULT_SEPARATOR,
                     help="Separator to use between segments")
+	parser.add_argument("--po_wrap_width", default=_DEFAULT_PO_WRAP_WIDTH,
+                     help="Wrap .po text to this many lines")
 	parser.add_argument("--debug",
                     help="If this flag is given, segments the given string, joins with \"|\", and prints it.")
 	args = parser.parse_args()
@ -182,13 +181,5 @@ def Main(tokenizer: _Tokenizer):
 			f.write(b'\n')
 		exit()

-	if not args.output_path:
-		args.output_path = args.input_path
-
-	with open(args.input_path, 'rb') as f:
-		input = f.readlines()
-
-	output = SegmentPo(input, tokenizer, separator=args.separator.encode())
-
-	with open(args.output_path, 'wb') if args.output_path != "/dev/stdout" else os.fdopen(sys.stdout.fileno(), "wb", closefd=False) as f:
-		f.write(b''.join(output))
+	ProcessPoFile(input_path=args.input_path, output_path=args.output_path,
+	              tokenizer=tokenizer, separator=args.separator, wrap_width=args.po_wrap_width)