diff --git a/tools/segmenter/segment_all.py b/tools/segmenter/segment_all.py index 3d111749c..9f8c41d9e 100755 --- a/tools/segmenter/segment_all.py +++ b/tools/segmenter/segment_all.py @@ -5,22 +5,14 @@ import segment_zh import segment_ja import segmenter_lib - -def _Run(tokenizer, path): - with open(path, 'rb') as f: - input = f.readlines() - - output = segmenter_lib.SegmentPo(input, tokenizer) - - with open(path, 'wb') as f: - f.write(b''.join(output)) - - root = pathlib.Path(__file__).resolve().parent.parent.parent zh_tokenizer = segment_zh.ZhTokenizer() -_Run(zh_tokenizer, root.joinpath("Translations/zh_CN.po")) -_Run(zh_tokenizer, root.joinpath("Translations/zh_TW.po")) +segmenter_lib.ProcessPoFile(tokenizer=zh_tokenizer, + input_path=root.joinpath("Translations/zh_CN.po")) +segmenter_lib.ProcessPoFile(tokenizer=zh_tokenizer, + input_path=root.joinpath("Translations/zh_TW.po")) ja_tokenizer = segment_ja.JaTokenizer() -_Run(ja_tokenizer, root.joinpath("Translations/ja.po")) +segmenter_lib.ProcessPoFile(tokenizer=ja_tokenizer, + input_path=root.joinpath("Translations/ja.po")) diff --git a/tools/segmenter/segment_ja.py b/tools/segmenter/segment_ja.py index c6b2869f2..218295687 100755 --- a/tools/segmenter/segment_ja.py +++ b/tools/segmenter/segment_ja.py @@ -9,7 +9,6 @@ import segmenter_lib class JaTokenizer(): _MODE = sudachipy.SplitMode.C - def __init__(self) -> None: self._tokenizer = sudachipy.Dictionary().create() diff --git a/tools/segmenter/segmenter_lib.py b/tools/segmenter/segmenter_lib.py index fb0250b4d..5c86c8a1c 100644 --- a/tools/segmenter/segmenter_lib.py +++ b/tools/segmenter/segmenter_lib.py @@ -1,7 +1,7 @@ import re -from typing import List, Callable, Tuple +from typing import List, Callable, Tuple, Union _Tokenizer = Callable[[bytes], Tuple[List[int], List[int]]] @@ -22,7 +22,7 @@ class Segmenter(): def __call__(self, text: bytes) -> bytes: """Runs the segmenter and produces a separator-joined string.""" if not text: - return [] + return b'' text = self._RemoveAllMarkers(text) starts, ends = self._tokenizer(text) starts, ends = _RecoverGaps(text, starts, ends) @@ -88,36 +88,7 @@ def _MergeDisallowedPositions(text: bytes, starts: List[int], ends: List[int]) - def _QuoteLine(line): - return f'"{line}"\n' - - -def _SplitEveryN(input, n): - return [input[i:i + n] for i in range(0, len(input), n)] - - -def _FormatMsgStr(text_bytes): - """A rough approximation of poedit formatting and wrapping.""" - if not text_bytes: - return b'""\n' - - text = text_bytes.decode() - output_lines = [] - lines_with_newline = text.split('\\n') - for line_i, line_with_newline in enumerate(lines_with_newline): - if not line_with_newline and line_i != len(lines_with_newline) - 1: - output_lines.append('"\\n"\n') - continue - lines = _SplitEveryN(line_with_newline, 63) - if line_i == len(lines_with_newline) - 1: - lines = map(_QuoteLine, lines) - else: - lines[0:-1] = map(_QuoteLine, lines[0:-1]) - lines[-1] = f'"{lines[-1]}\\n"\n' - output_lines.extend(lines) - - if len(output_lines) > 1: - output_lines.insert(0, '""\n') - return ''.join(output_lines).encode() + return b'"%s"\n' % line _MSGSTR_PREFIX = b'msgstr ' @@ -138,7 +109,7 @@ def SegmentPo(input: List[bytes], tokenizer: _Tokenizer, separator: bytes = ZWSP def _ProcessMsgStr(): text = segmenter(b''.join(msgstr)) - output.append(msgstr_prefix + _FormatMsgStr(text)) + output.append(msgstr_prefix + _QuoteLine(text)) for line in input: if line.startswith(_MSGSTR_PREFIX): @@ -161,6 +132,32 @@ def SegmentPo(input: List[bytes], tokenizer: _Tokenizer, separator: bytes = ZWSP return output +_DEFAULT_SEPARATOR = ZWSP +_DEFAULT_PO_WRAP_WIDTH = 79 + + +def ProcessPoFile(tokenizer: _Tokenizer, input_path: str, output_path: Union[str, None] = None, + separator: str = _DEFAULT_SEPARATOR, wrap_width: int = _DEFAULT_PO_WRAP_WIDTH): + import subprocess + import os + import sys + + if not output_path: + output_path = input_path + + with open(input_path, 'rb') as f: + input = f.readlines() + + output = SegmentPo(input, tokenizer, separator=separator.encode()) + + with open(output_path, 'wb') if output_path != "/dev/stdout" else os.fdopen(sys.stdout.fileno(), "wb", closefd=False) as f: + f.write(b''.join(output)) + + # Run gettext's msgcat to reformat the file. + subprocess.run(['msgcat', '--force-po', f'--width={wrap_width}', '-o', output_path, output_path], + stdout=sys.stdout, stderr=sys.stderr) + + def Main(tokenizer: _Tokenizer): import argparse import os @@ -169,8 +166,10 @@ def Main(tokenizer: _Tokenizer): parser = argparse.ArgumentParser() parser.add_argument("--input_path", help="Path to the input .po file") parser.add_argument("--output_path", help="Output path (default: in-place)") - parser.add_argument("--separator", default=ZWSP, + parser.add_argument("--separator", default=_DEFAULT_SEPARATOR, help="Separator to use between segments") + parser.add_argument("--po_wrap_width", default=_DEFAULT_PO_WRAP_WIDTH, + help="Wrap .po text to this many lines") parser.add_argument("--debug", help="If this flag is given, segments the given string, joins with \"|\", and prints it.") args = parser.parse_args() @@ -182,13 +181,5 @@ def Main(tokenizer: _Tokenizer): f.write(b'\n') exit() - if not args.output_path: - args.output_path = args.input_path - - with open(args.input_path, 'rb') as f: - input = f.readlines() - - output = SegmentPo(input, tokenizer, separator=args.separator.encode()) - - with open(args.output_path, 'wb') if args.output_path != "/dev/stdout" else os.fdopen(sys.stdout.fileno(), "wb", closefd=False) as f: - f.write(b''.join(output)) + ProcessPoFile(input_path=args.input_path, output_path=args.output_path, + tokenizer=tokenizer, separator=args.separator, wrap_width=args.po_wrap_width)