@ -1,7 +1,7 @@
import re
from typing import List , Callable , Tuple
from typing import List , Callable , Tuple , Union
_Tokenizer = Callable [ [ bytes ] , Tuple [ List [ int ] , List [ int ] ] ]
@ -22,7 +22,7 @@ class Segmenter():
def __call__ ( self , text : bytes ) - > bytes :
""" Runs the segmenter and produces a separator-joined string. """
if not text :
return [ ]
return b ' '
text = self . _RemoveAllMarkers ( text )
starts , ends = self . _tokenizer ( text )
starts , ends = _RecoverGaps ( text , starts , ends )
@ -88,36 +88,7 @@ def _MergeDisallowedPositions(text: bytes, starts: List[int], ends: List[int]) -
def _QuoteLine ( line ) :
return f ' " { line } " \n '
def _SplitEveryN ( input , n ) :
return [ input [ i : i + n ] for i in range ( 0 , len ( input ) , n ) ]
def _FormatMsgStr ( text_bytes ) :
""" A rough approximation of poedit formatting and wrapping. """
if not text_bytes :
return b ' " " \n '
text = text_bytes . decode ( )
output_lines = [ ]
lines_with_newline = text . split ( ' \\ n ' )
for line_i , line_with_newline in enumerate ( lines_with_newline ) :
if not line_with_newline and line_i != len ( lines_with_newline ) - 1 :
output_lines . append ( ' " \\ n " \n ' )
continue
lines = _SplitEveryN ( line_with_newline , 63 )
if line_i == len ( lines_with_newline ) - 1 :
lines = map ( _QuoteLine , lines )
else :
lines [ 0 : - 1 ] = map ( _QuoteLine , lines [ 0 : - 1 ] )
lines [ - 1 ] = f ' " { lines [ - 1 ] } \\ n " \n '
output_lines . extend ( lines )
if len ( output_lines ) > 1 :
output_lines . insert ( 0 , ' " " \n ' )
return ' ' . join ( output_lines ) . encode ( )
return b ' " %s " \n ' % line
_MSGSTR_PREFIX = b ' msgstr '
@ -138,7 +109,7 @@ def SegmentPo(input: List[bytes], tokenizer: _Tokenizer, separator: bytes = ZWSP
def _ProcessMsgStr ( ) :
text = segmenter ( b ' ' . join ( msgstr ) )
output . append ( msgstr_prefix + _FormatMsgStr ( text ) )
output . append ( msgstr_prefix + _QuoteLine ( text ) )
for line in input :
if line . startswith ( _MSGSTR_PREFIX ) :
@ -161,6 +132,32 @@ def SegmentPo(input: List[bytes], tokenizer: _Tokenizer, separator: bytes = ZWSP
return output
_DEFAULT_SEPARATOR = ZWSP
_DEFAULT_PO_WRAP_WIDTH = 79
def ProcessPoFile ( tokenizer : _Tokenizer , input_path : str , output_path : Union [ str , None ] = None ,
separator : str = _DEFAULT_SEPARATOR , wrap_width : int = _DEFAULT_PO_WRAP_WIDTH ) :
import subprocess
import os
import sys
if not output_path :
output_path = input_path
with open ( input_path , ' rb ' ) as f :
input = f . readlines ( )
output = SegmentPo ( input , tokenizer , separator = separator . encode ( ) )
with open ( output_path , ' wb ' ) if output_path != " /dev/stdout " else os . fdopen ( sys . stdout . fileno ( ) , " wb " , closefd = False ) as f :
f . write ( b ' ' . join ( output ) )
# Run gettext's msgcat to reformat the file.
subprocess . run ( [ ' msgcat ' , ' --force-po ' , f ' --width= { wrap_width } ' , ' -o ' , output_path , output_path ] ,
stdout = sys . stdout , stderr = sys . stderr )
def Main ( tokenizer : _Tokenizer ) :
import argparse
import os
@ -169,8 +166,10 @@ def Main(tokenizer: _Tokenizer):
parser = argparse . ArgumentParser ( )
parser . add_argument ( " --input_path " , help = " Path to the input .po file " )
parser . add_argument ( " --output_path " , help = " Output path (default: in-place) " )
parser . add_argument ( " --separator " , default = ZWSP ,
parser . add_argument ( " --separator " , default = _DEFAULT_SEPARATOR ,
help = " Separator to use between segments " )
parser . add_argument ( " --po_wrap_width " , default = _DEFAULT_PO_WRAP_WIDTH ,
help = " Wrap .po text to this many lines " )
parser . add_argument ( " --debug " ,
help = " If this flag is given, segments the given string, joins with \" | \" , and prints it. " )
args = parser . parse_args ( )
@ -182,13 +181,5 @@ def Main(tokenizer: _Tokenizer):
f . write ( b ' \n ' )
exit ( )
if not args . output_path :
args . output_path = args . input_path
with open ( args . input_path , ' rb ' ) as f :
input = f . readlines ( )
output = SegmentPo ( input , tokenizer , separator = args . separator . encode ( ) )
with open ( args . output_path , ' wb ' ) if args . output_path != " /dev/stdout " else os . fdopen ( sys . stdout . fileno ( ) , " wb " , closefd = False ) as f :
f . write ( b ' ' . join ( output ) )
ProcessPoFile ( input_path = args . input_path , output_path = args . output_path ,
tokenizer = tokenizer , separator = args . separator , wrap_width = args . po_wrap_width )