JustinLin610
update
8437114
raw
history blame
No virus
1.55 kB
import gzip
import argparse
from string import punctuation
def len_no_punc(s, punc):
return len([ch for ch in s if ch in punc])
def filter_overpunc(len_npunc, len_sen):
return len_npunc < 0.5*len_sen
def main(args):
punc = punctuation + "—|–"
print('Processing file {}'.format(args.input))
with gzip.open(args.input, 'rt', encoding=args.encoding) as tsv:
with open(args.bitext + '.' + args.src_lang, 'wt', encoding=args.encoding) as fsrc:
with open(args.bitext + '.' + args.tgt_lang, 'wt', encoding=args.encoding) as ftgt:
line = tsv.readline()
fields = line.split('\t')
src, tgt = fields[1], fields[2]
nchar_npunc_src = len_no_punc(src, punc)
nchar_npunc_tgt = len_no_punc(tgt, punc)
if filter_overpunc(nchar_npunc_src, len(src)) and filter_overpunc(nchar_npunc_tgt, len(tgt)):
fsrc.write(src.strip() + '\n')
ftgt.write(tgt.strip() + '\n')
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--input", required=True, type=str)
parser.add_argument('--encoding', default='utf-8', help='character encoding for input/output')
parser.add_argument('--bitext', type=str, required=True, help='language direction')
parser.add_argument('--src-lang', type=str, required=True, help='Source language')
parser.add_argument('--tgt-lang', type=str, required=True, help='Target language')
main(parser.parse_args())