Spaces:
Runtime error
Runtime error
File size: 3,636 Bytes
e50fe35 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
import os
import shutil
import sys

from tqdm import tqdm
# Language codes that the script entry point pairs with 'en' to build the
# list of language-pair directories to concatenate. "ur" is currently disabled.
LANGS = [
    "as", "bn", "gu", "hi", "kn", "ml",
    "mr", "or", "pa", "ta", "te",
]
def add_token(sent, tag_infos):
    """Prefix a sentence with special tag tokens.

    sent: the sentence (string) to be prefixed
    tag_infos: list of tuples (tag_type, tag); each tuple yields one token
        of the form __{tag_type}__{tag}__

    Returns the tokens joined by single spaces, followed by one space and
    the sentence. With an empty tag_infos the result is ' ' + sent.
    """
    # String joining (rather than str.format) keeps the requirement that
    # tag_type and tag are already strings.
    tag_tokens = [
        ''.join(('__', tag_type, '__', tag, '__'))
        for tag_type, tag in tag_infos
    ]
    prefix = ' '.join(tag_tokens)
    return prefix + ' ' + sent
def _append_file(in_fname, out_fname):
    """Append the raw bytes of in_fname to out_fname, creating it if needed."""
    with open(in_fname, 'rb') as infile, open(out_fname, 'ab') as outfile:
        shutil.copyfileobj(infile, outfile)


def concat_data(data_dir, outdir, lang_pair_list,
                out_src_lang='SRC', out_trg_lang='TGT', split='train'):
    """Concatenate per-language-pair corpora into one source/target file pair.

    For every (src_lang, trg_lang) in lang_pair_list, the files
    {data_dir}/{src_lang}-{trg_lang}/{split}.{src_lang} and
    {data_dir}/{src_lang}-{trg_lang}/{split}.{trg_lang} are appended, in
    list order, to {outdir}/{split}.{out_src_lang} and
    {outdir}/{split}.{out_trg_lang} respectively. A pair is skipped when
    either of its two input files is missing. Afterwards corpus_stats() is
    called with the same data_dir, outdir, lang_pair_list and split.

    data_dir: input dir, contains directories for language pairs named l1-l2
    outdir: output dir; created if it does not exist
    lang_pair_list: iterable of (src_lang, trg_lang) pairs
    out_src_lang: file extension of the concatenated source-side file
    out_trg_lang: file extension of the concatenated target-side file
    split: file-name stem of the split to concatenate (e.g. 'train')
    """
    os.makedirs(outdir, exist_ok=True)

    out_src_fname = '{}/{}.{}'.format(outdir, split, out_src_lang)
    out_trg_fname = '{}/{}.{}'.format(outdir, split, out_trg_lang)

    print()
    print(out_src_fname)
    print(out_trg_fname)

    # The loop below only appends, so remove outputs left by a previous run.
    for out_fname in (out_src_fname, out_trg_fname):
        if os.path.isfile(out_fname):
            os.unlink(out_fname)

    for src_lang, trg_lang in tqdm(lang_pair_list):
        print('src: {}, tgt:{}'.format(src_lang, trg_lang))

        in_src_fname = '{}/{}-{}/{}.{}'.format(
            data_dir, src_lang, trg_lang, split, src_lang)
        in_trg_fname = '{}/{}-{}/{}.{}'.format(
            data_dir, src_lang, trg_lang, split, trg_lang)

        # Both sides must be present, otherwise the concatenated source and
        # target files would go out of alignment.
        if not (os.path.exists(in_src_fname)
                and os.path.exists(in_trg_fname)):
            continue

        # Append in-process instead of shelling out to `cat ... >> ...`:
        # safe for paths with spaces or shell metacharacters, portable, and
        # I/O failures raise instead of being silently ignored.
        print(in_src_fname)
        _append_file(in_src_fname, out_src_fname)

        print(in_trg_fname)
        _append_file(in_trg_fname, out_trg_fname)

    corpus_stats(data_dir, outdir, lang_pair_list, split)
def corpus_stats(data_dir, outdir, lang_pair_list, split):
    """Write the number of lines of each language pair's source file.

    Creates {outdir}/{split}_lang_pairs.txt (overwriting any existing file)
    containing one tab-separated row "src_lang, trg_lang, line count" for
    every pair whose source-side file
    {data_dir}/{src_lang}-{trg_lang}/{split}.{src_lang} exists. Only the
    source-side file is checked and counted; pairs without it get no row.

    data_dir: input dir, contains directories for language pairs named l1-l2
    outdir: directory in which the stats file is written
    lang_pair_list: iterable of (src_lang, trg_lang) pairs
    split: file-name stem of the split being counted
    """
    stats_fname = '{}/{}_lang_pairs.txt'.format(outdir, split)

    with open(stats_fname, 'w', encoding='utf-8') as stats_file:
        for src_lang, trg_lang in tqdm(lang_pair_list):
            print('src: {}, tgt:{}'.format(src_lang, trg_lang))

            src_path = '{}/{}-{}/{}.{}'.format(
                data_dir, src_lang, trg_lang, split, src_lang)

            if os.path.exists(src_path):
                print(src_path)

                # One line of the file == one sentence of the corpus.
                with open(src_path, 'r', encoding='utf-8') as src_file:
                    n_lines = sum(1 for _ in src_file)

                row = '{}\t{}\t{}\n'.format(src_lang, trg_lang, n_lines)
                stats_file.write(row)
if __name__ == '__main__':
    # Positional arguments: <in_dir> <out_dir> <src_lang> <tgt_lang> <split>
    in_dir, out_dir = sys.argv[1], sys.argv[2]
    src_lang, tgt_lang = sys.argv[3], sys.argv[4]  # tgt_lang is not used below
    split = sys.argv[5]

    # English is always one side of every pair; src_lang only decides
    # whether it is the source side or the target side.
    if src_lang == 'en':
        lang_pair_list = [['en', lang] for lang in LANGS]
    else:
        lang_pair_list = [[lang, 'en'] for lang in LANGS]

    concat_data(in_dir, out_dir, lang_pair_list, split=split)
|