You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

186 lines
5.1 KiB

#!/usr/bin/env python
import argparse
import pathlib
import os
import re
import sys
import tensorflow_text
_DISALLOW_SEGMENTATION = re.compile(
r'(?:\{[^\}]*\}|\\[a-z]|[\w/-](?::[\w/-]?)?|[.,;?!=+/#]|\s)+'.encode())
_ZWSP = "\u200B"
_MODEL_HANDLE = "https://tfhub.dev/google/zh_segmentation/1"
default_input_path = pathlib.Path(__file__).resolve(
).parent.parent.parent.joinpath("Translations/zh_CN.po")
parser = argparse.ArgumentParser()
parser.add_argument("--input_path", default=default_input_path,
help="Path to the input .po file")
parser.add_argument("--output_path", help="Output path (default: in-place)")
parser.add_argument("--separator", default=_ZWSP,
help="Separator to use between segments")
parser.add_argument("--debug",
help="If this flag is given, segments the given string, joins with \"|\", and prints it.")
args = parser.parse_args()
_SEPARATOR = args.separator.encode()
_SEGMENTER = tensorflow_text.HubModuleTokenizer(_MODEL_HANDLE)
def _RunSegmenter(text, separator):
"""Runs the segmenter and produces a separator-joined string."""
if not text:
return []
text = _RemoveAllMarkers(text)
_tokens, starts, ends = _SEGMENTER.tokenize_with_offsets(text)
starts, ends = starts.numpy(), ends.numpy()
starts, ends = _RecoverGaps(text, starts, ends)
starts, ends = _MergeDisallowedPositions(text, starts, ends)
output = separator.join([text[i:j] for i, j in zip(starts, ends)])
return _RemoveRedundantMarkers(output)
def _RecoverGaps(text, starts, ends):
"""Recovers gaps from the segmenter-produced start and end indices.
The segmenter may produce gaps around spaces, e.g. segment("hello world") => "hello|world"
"""
out_starts = []
out_ends = []
prev_end = 0
for start, end in zip(starts, ends):
if start != prev_end:
out_starts.append(prev_end)
out_ends.append(start)
out_starts.append(start)
out_ends.append(end)
prev_end = end
if out_ends[-1] != len(text):
out_ends[-1] = len(text)
return out_starts, out_ends
def _MergeDisallowedPositions(text, starts, ends):
"""Merges segments disallowed by _DISALLOW_SEGMENTATION."""
disallowed = set()
for m in re.finditer(_DISALLOW_SEGMENTATION, text):
for i in range(m.start() + 1, m.end()):
disallowed.add(i)
out_starts = [starts[0]]
out_ends = [ends[0]]
for start, end in zip(starts[1:], ends[1:]):
if start in disallowed:
out_ends[-1] = end
else:
out_starts.append(start)
out_ends.append(end)
return out_starts, out_ends
_REMOVE_REDUNDANT_MARKERS = re.compile(b''.join(
[re.escape(_SEPARATOR), '?([  ,、。?!])'.encode(), re.escape(_SEPARATOR), b'?']))
def _RemoveRedundantMarkers(text):
"""Removes segmentation markers for cases that are handled at runtime anyway."""
return re.sub(_REMOVE_REDUNDANT_MARKERS, r'\1', text)
_SEGMENTATION_MARKERS = re.compile(b''.join(
[b'(?:', re.escape(_SEPARATOR), b'|', re.escape(_ZWSP.encode()), b')+']))
def _RemoveAllMarkers(text):
"""Remove the existing segmentation markers to allow for re-segmenting."""
return re.sub(_SEGMENTATION_MARKERS, b'', text)
def _QuoteLine(line):
return f'"{line}"\n'
def _SplitEveryN(input, n):
return [input[i:i + n] for i in range(0, len(input), n)]
def _FormatMsgStr(text_bytes):
"""A rough approximation of poedit formatting and wrapping."""
if not text_bytes:
return b'""\n'
text = text_bytes.decode()
output_lines = []
lines_with_newline = text.split('\\n')
for line_i, line_with_newline in enumerate(lines_with_newline):
if not line_with_newline and line_i != len(lines_with_newline) - 1:
output_lines.append('"\\n"\n')
continue
lines = _SplitEveryN(line_with_newline, 63)
if line_i == len(lines_with_newline) - 1:
lines = map(_QuoteLine, lines)
else:
lines[0:-1] = map(_QuoteLine, lines[0:-1])
lines[-1] = f'"{lines[-1]}\\n"\n'
output_lines.extend(lines)
if len(output_lines) > 1:
output_lines.insert(0, '""\n')
return ''.join(output_lines).encode()
if args.debug:
with os.fdopen(sys.stdout.fileno(), "wb", closefd=False) as f:
f.write(_RunSegmenter(args.debug.encode(), separator=b'|'))
f.write(b'\n')
exit()
if not args.output_path:
args.output_path = args.input_path
with open(args.input_path, 'rb') as f:
input = f.readlines()
_MSGSTR_PREFIX = b'msgstr '
output = []
# Skip poedit header
header_end = input.index(b'\n') + 1
output.extend(input[:header_end])
input = input[header_end:]
in_msgstr = False
msgstr_prefix = ""
msgstr = []
def _ProcessMsgStr():
text = _RunSegmenter(b''.join(msgstr), separator=_SEPARATOR)
output.append(msgstr_prefix + _FormatMsgStr(text))
for line in input:
if line.startswith(_MSGSTR_PREFIX):
msgstr_prefix, line = line.split(b'"', maxsplit=1)
msgstr.append(line[:-2])
in_msgstr = True
elif in_msgstr and line.startswith(b'"'):
msgstr.append(line[1:-2])
else:
if msgstr:
_ProcessMsgStr()
msgstr.clear()
msgstr_prefix = ""
output.append(line)
in_msgstr = False
if msgstr:
_ProcessMsgStr()
with open(args.output_path, 'wb') if args.output_path != "/dev/stdout" else os.fdopen(sys.stdout.fileno(), "wb", closefd=False) as f:
f.write(b''.join(output))