# en_to_indic_translation/scripts/extract_non_english_pairs.py
# Author: harveen ("Adding code", commit 9bbf386)
from tqdm import tqdm
import os
from collections import defaultdict
def read_file(fname):
    """Lazily yield the lines of *fname* (UTF-8), stripped of surrounding whitespace."""
    with open(fname, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            yield raw_line.strip()
def extract_non_english_pairs(indir, outdir, LANGS):
    """
    Extract non-English pair parallel corpora by pivoting through English.

    indir: contains English-centric data in the following form:
        - a directory named en-xx for each language xx
        - each directory contains a train.en and train.xx
    outdir: output directory to store the mined data for each pair.
        One directory (named xx-yy) is created for each pair.
    LANGS: list of languages in the corpus (other than English).
        The language codes must correspond to the ones used in the
        files and directories in indir. Preferably, sort the languages
        in this list in alphabetic order: outdir will contain data for
        xx-yy but not for yy-xx, so it is convenient to have this list
        in sorted order.
    """
    for i in tqdm(range(len(LANGS) - 1)):
        print()
        for j in range(i + 1, len(LANGS)):
            lang1 = LANGS[i]
            lang2 = LANGS[j]
            print("{} {}".format(lang1, lang2))

            # English sides of the two corpora; sentences occurring in
            # both act as the pivot for pairing lang1 with lang2.
            fname1 = "{}/en-{}/train.en".format(indir, lang1)
            fname2 = "{}/en-{}/train.en".format(indir, lang2)
            enset_l1 = set(read_file(fname1))
            common_en_set = enset_l1.intersection(read_file(fname2))

            # Map each shared English sentence to its lang1 translation.
            # NOTE: a plain dict keeps only ONE translation per English
            # sentence (the last one read). To keep multiple translations,
            # switch to a defaultdict(list) and iterate the stored list in
            # the write loop below.
            il_fname1 = "{}/en-{}/train.{}".format(indir, lang1, lang1)
            en_lang1_dict = {}
            for en_line, il_line in zip(read_file(fname1), read_file(il_fname1)):
                if en_line in common_en_set:
                    en_lang1_dict[en_line] = il_line

            os.makedirs("{}/{}-{}".format(outdir, lang1, lang2), exist_ok=True)
            out_l1_fname = "{o}/{l1}-{l2}/train.{l1}".format(
                o=outdir, l1=lang1, l2=lang2
            )
            out_l2_fname = "{o}/{l1}-{l2}/train.{l2}".format(
                o=outdir, l1=lang1, l2=lang2
            )
            il_fname2 = "{}/en-{}/train.{}".format(indir, lang2, lang2)
            with open(out_l1_fname, "w", encoding="utf-8") as out_l1_file, open(
                out_l2_fname, "w", encoding="utf-8"
            ) as out_l2_file:
                for en_line, il_line in zip(read_file(fname2), read_file(il_fname2)):
                    if en_line in en_lang1_dict:
                        # BUGFIX: the previous code iterated over
                        # en_lang1_dict[en_line] — a *string* — character
                        # by character, writing each sentence pair once per
                        # character of the lang1 translation. Write each
                        # pair exactly once instead.
                        out_l1_file.write(en_lang1_dict[en_line] + "\n")
                        out_l2_file.write(il_line + "\n")
def get_extracted_stats(outdir, LANGS):
    """
    Gather sentence-pair counts from the extracted pair directories.

    outdir: directory containing the mined data, one subdirectory per
        pair (as produced by extract_non_english_pairs).
    LANGS: list of languages in the corpus (other than English).
        The language codes must correspond to the ones used in the
        files and directories in outdir. Preferably sorted, so that the
        xx-yy directories exist for every i < j.

    Returns a list of (lang1, lang2, count) tuples. Each pair is
    reported in both directions since the extracted corpus is symmetric.
    """
    common_stats = []
    for i in tqdm(range(len(LANGS) - 1)):
        for j in range(i + 1, len(LANGS)):
            lang1 = LANGS[i]
            lang2 = LANGS[j]
            out_l1_fname = "{o}/{l1}-{l2}/train.{l1}".format(
                o=outdir, l1=lang1, l2=lang2
            )
            # Count lines lazily instead of materializing a throwaway list.
            cnt = sum(1 for _ in read_file(out_l1_fname))
            common_stats.append((lang1, lang2, cnt))
            common_stats.append((lang2, lang1, cnt))
    return common_stats