# en_to_indic_translation/scripts/add_joint_tags_translate.py
# harveen
# Adding code
# 9bbf386
import sys
from tqdm import tqdm
import os
def add_token(sent, tag_infos):
    """Prefix a sentence with the special tokens described by tag_infos.

    sent: the sentence (a string) to be prefixed.
    tag_infos: iterable of tuples (tag_type, tag).
        Each tag_info results in a token of the form: __{tag_type}__{tag}__

    Returns the tokens, in the order given, followed by ``sent``, all
    separated by single spaces. When ``tag_infos`` is empty, ``sent`` is
    returned unchanged (no leading space is added).
    """
    tokens = ['__' + tag_type + '__' + tag + '__'
              for tag_type, tag in tag_infos]
    if not tokens:
        # Without this guard the result would start with a stray space.
        return sent
    return ' '.join(tokens) + ' ' + sent
def generate_lang_tag_iterator(infname):
    """Yield (src, tgt) tuples as described by the file ``infname``.

    Every line of the file is stripped and split on tabs into exactly
    three fields: src, tgt and an integer count. The (src, tgt) tuple of
    a line is yielded ``count`` times before the next line is read.

    Raises ValueError if a line does not split into exactly three fields
    or if its count field is not an integer.
    """
    with open(infname, 'r', encoding='utf-8') as meta_file:
        for record in meta_file:
            fields = record.strip().split('\t')
            src, tgt, count = fields
            lang_pair = (src, tgt)
            remaining = int(count)
            # Emit the same pair once per counted repetition.
            while remaining > 0:
                yield lang_pair
                remaining -= 1
if __name__ == '__main__':
    # Script entry point.
    #
    # Usage: <script> <expdir> <dset>
    #
    # Reads   {expdir}/bpe/{dset}.SRC, {expdir}/bpe/{dset}.TGT and
    #         {expdir}/data/{dset}_lang_pairs.txt
    # Writes  {expdir}/final/{dset}.SRC (each line prefixed with
    #         __src__{l1}__ __tgt__{l2}__ tokens) and
    #         {expdir}/final/{dset}.TGT (each line stripped).
    from itertools import zip_longest

    if len(sys.argv) < 3:
        sys.exit('usage: {} <expdir> <dset>'.format(sys.argv[0]))

    expdir = sys.argv[1]
    dset = sys.argv[2]

    src_fname = '{expdir}/bpe/{dset}.SRC'.format(
        expdir=expdir, dset=dset)
    tgt_fname = '{expdir}/bpe/{dset}.TGT'.format(
        expdir=expdir, dset=dset)
    meta_fname = '{expdir}/data/{dset}_lang_pairs.txt'.format(
        expdir=expdir, dset=dset)

    out_src_fname = '{expdir}/final/{dset}.SRC'.format(
        expdir=expdir, dset=dset)
    out_tgt_fname = '{expdir}/final/{dset}.TGT'.format(
        expdir=expdir, dset=dset)

    lang_tag_iterator = generate_lang_tag_iterator(meta_fname)

    os.makedirs('{expdir}/final'.format(expdir=expdir), exist_ok=True)

    # Unique marker used to detect that one input ran out before the others.
    exhausted = object()

    with open(src_fname, 'r', encoding='utf-8') as srcfile, \
            open(tgt_fname, 'r', encoding='utf-8') as tgtfile, \
            open(out_src_fname, 'w', encoding='utf-8') as outsrcfile, \
            open(out_tgt_fname, 'w', encoding='utf-8') as outtgtfile:

        # zip() would stop silently at the shortest input, producing
        # truncated output; zip_longest lets us fail loudly instead.
        rows = zip_longest(lang_tag_iterator, srcfile, tgtfile,
                           fillvalue=exhausted)

        for lineno, (lang_pair, src_sent, tgt_sent) in enumerate(
                tqdm(rows), start=1):
            if (lang_pair is exhausted or src_sent is exhausted
                    or tgt_sent is exhausted):
                raise ValueError(
                    'line count mismatch at line {lineno}: '
                    '{meta}, {src} and {tgt} must describe the same '
                    'number of sentences'.format(
                        lineno=lineno, meta=meta_fname,
                        src=src_fname, tgt=tgt_fname))

            l1, l2 = lang_pair
            outsrcfile.write(add_token(src_sent.strip(), [
                ('src', l1), ('tgt', l2)]) + '\n')
            outtgtfile.write(tgt_sent.strip() + '\n')