Browse Source

Segmenter: Use gettext to line-wrap po files

The line wrapping algorithm of gettext is somewhat complicated.

Rather than trying to approximate it, use `msgcat` from gettext
directly. This is also what poedit does.
pull/3544/head
Gleb Mazovetskiy 4 years ago
parent
commit
395dbb18f0
  1. 20
      tools/segmenter/segment_all.py
  2. 1
      tools/segmenter/segment_ja.py
  3. 79
      tools/segmenter/segmenter_lib.py

20
tools/segmenter/segment_all.py

@ -5,22 +5,14 @@ import segment_zh
import segment_ja
import segmenter_lib
def _Run(tokenizer, path):
with open(path, 'rb') as f:
input = f.readlines()
output = segmenter_lib.SegmentPo(input, tokenizer)
with open(path, 'wb') as f:
f.write(b''.join(output))
root = pathlib.Path(__file__).resolve().parent.parent.parent
zh_tokenizer = segment_zh.ZhTokenizer()
_Run(zh_tokenizer, root.joinpath("Translations/zh_CN.po"))
_Run(zh_tokenizer, root.joinpath("Translations/zh_TW.po"))
segmenter_lib.ProcessPoFile(tokenizer=zh_tokenizer,
input_path=root.joinpath("Translations/zh_CN.po"))
segmenter_lib.ProcessPoFile(tokenizer=zh_tokenizer,
input_path=root.joinpath("Translations/zh_TW.po"))
ja_tokenizer = segment_ja.JaTokenizer()
_Run(ja_tokenizer, root.joinpath("Translations/ja.po"))
segmenter_lib.ProcessPoFile(tokenizer=ja_tokenizer,
input_path=root.joinpath("Translations/ja.po"))

1
tools/segmenter/segment_ja.py

@ -9,7 +9,6 @@ import segmenter_lib
class JaTokenizer():
_MODE = sudachipy.SplitMode.C
def __init__(self) -> None:
self._tokenizer = sudachipy.Dictionary().create()

79
tools/segmenter/segmenter_lib.py

@ -1,7 +1,7 @@
import re
from typing import List, Callable, Tuple
from typing import List, Callable, Tuple, Union
_Tokenizer = Callable[[bytes], Tuple[List[int], List[int]]]
@ -22,7 +22,7 @@ class Segmenter():
def __call__(self, text: bytes) -> bytes:
"""Runs the segmenter and produces a separator-joined string."""
if not text:
return []
return b''
text = self._RemoveAllMarkers(text)
starts, ends = self._tokenizer(text)
starts, ends = _RecoverGaps(text, starts, ends)
@ -88,36 +88,7 @@ def _MergeDisallowedPositions(text: bytes, starts: List[int], ends: List[int]) -
def _QuoteLine(line):
return f'"{line}"\n'
def _SplitEveryN(input, n):
return [input[i:i + n] for i in range(0, len(input), n)]
def _FormatMsgStr(text_bytes):
"""A rough approximation of poedit formatting and wrapping."""
if not text_bytes:
return b'""\n'
text = text_bytes.decode()
output_lines = []
lines_with_newline = text.split('\\n')
for line_i, line_with_newline in enumerate(lines_with_newline):
if not line_with_newline and line_i != len(lines_with_newline) - 1:
output_lines.append('"\\n"\n')
continue
lines = _SplitEveryN(line_with_newline, 63)
if line_i == len(lines_with_newline) - 1:
lines = map(_QuoteLine, lines)
else:
lines[0:-1] = map(_QuoteLine, lines[0:-1])
lines[-1] = f'"{lines[-1]}\\n"\n'
output_lines.extend(lines)
if len(output_lines) > 1:
output_lines.insert(0, '""\n')
return ''.join(output_lines).encode()
return b'"%s"\n' % line
_MSGSTR_PREFIX = b'msgstr '
@ -138,7 +109,7 @@ def SegmentPo(input: List[bytes], tokenizer: _Tokenizer, separator: bytes = ZWSP
def _ProcessMsgStr():
text = segmenter(b''.join(msgstr))
output.append(msgstr_prefix + _FormatMsgStr(text))
output.append(msgstr_prefix + _QuoteLine(text))
for line in input:
if line.startswith(_MSGSTR_PREFIX):
@ -161,6 +132,32 @@ def SegmentPo(input: List[bytes], tokenizer: _Tokenizer, separator: bytes = ZWSP
return output
_DEFAULT_SEPARATOR = ZWSP
_DEFAULT_PO_WRAP_WIDTH = 79
def ProcessPoFile(tokenizer: _Tokenizer, input_path: str, output_path: Union[str, None] = None,
separator: str = _DEFAULT_SEPARATOR, wrap_width: int = _DEFAULT_PO_WRAP_WIDTH):
import subprocess
import os
import sys
if not output_path:
output_path = input_path
with open(input_path, 'rb') as f:
input = f.readlines()
output = SegmentPo(input, tokenizer, separator=separator.encode())
with open(output_path, 'wb') if output_path != "/dev/stdout" else os.fdopen(sys.stdout.fileno(), "wb", closefd=False) as f:
f.write(b''.join(output))
# Run gettext's msgcat to reformat the file.
subprocess.run(['msgcat', '--force-po', f'--width={wrap_width}', '-o', output_path, output_path],
stdout=sys.stdout, stderr=sys.stderr)
def Main(tokenizer: _Tokenizer):
import argparse
import os
@ -169,8 +166,10 @@ def Main(tokenizer: _Tokenizer):
parser = argparse.ArgumentParser()
parser.add_argument("--input_path", help="Path to the input .po file")
parser.add_argument("--output_path", help="Output path (default: in-place)")
parser.add_argument("--separator", default=ZWSP,
parser.add_argument("--separator", default=_DEFAULT_SEPARATOR,
help="Separator to use between segments")
parser.add_argument("--po_wrap_width", default=_DEFAULT_PO_WRAP_WIDTH,
help="Wrap .po text to this many lines")
parser.add_argument("--debug",
help="If this flag is given, segments the given string, joins with \"|\", and prints it.")
args = parser.parse_args()
@ -182,13 +181,5 @@ def Main(tokenizer: _Tokenizer):
f.write(b'\n')
exit()
if not args.output_path:
args.output_path = args.input_path
with open(args.input_path, 'rb') as f:
input = f.readlines()
output = SegmentPo(input, tokenizer, separator=args.separator.encode())
with open(args.output_path, 'wb') if args.output_path != "/dev/stdout" else os.fdopen(sys.stdout.fileno(), "wb", closefd=False) as f:
f.write(b''.join(output))
ProcessPoFile(input_path=args.input_path, output_path=args.output_path,
tokenizer=tokenizer, separator=args.separator, wrap_width=args.po_wrap_width)

Loading…
Cancel
Save