File size: 4,694 Bytes
e50fe35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
from tqdm import tqdm
import os
from collections import defaultdict


def read_file(fname):
    """
    Lazily yields the lines of a UTF-8 text file, one at a time.

    fname: path of the file to read.

    Each yielded line has its leading and trailing whitespace (including
    the line terminator) removed. The file is opened only when iteration
    starts, because this is a generator.
    """
    with open(fname, mode="r", encoding="utf-8") as handle:
        # str.strip with no arguments trims whitespace on both ends.
        yield from map(str.strip, handle)


def extract_non_english_pairs(indir, outdir, LANGS):
    """
    Extracts non-english pair parallel corpora by pivoting through English.

    For every pair (xx, yy) with xx before yy in LANGS, the English sentences
    that occur in both en-xx and en-yy are used to align the xx side with
    the yy side, and the aligned sentences are written to outdir.

    indir: contains english centric data in the following form:
            - directory named en-xx for language xx
            - each directory contains a train.en and train.xx
    outdir: output directory to store mined data for each pair.
            One directory (xx-yy, holding train.xx and train.yy) is created
            for each pair.
    LANGS: list of languages in the corpus (other than English).
            The language codes must correspond to the ones used in the
            files and directories in indir. Preferably, sort the languages
            in this list in alphabetic order. outdir will contain data for xx-yy,
            but not for yy-xx, so it will be convenient to have this list in sorted order.

    Only one xx translation is kept per English sentence: if the same English
    sentence occurs several times in en-xx, the last occurrence wins. Every
    matching line of en-yy produces exactly one output pair.
    """

    for i in tqdm(range(len(LANGS) - 1)):
        print()
        for j in range(i + 1, len(LANGS)):
            lang1 = LANGS[i]
            lang2 = LANGS[j]
            print("{} {}".format(lang1, lang2))

            fname1 = "{}/en-{}/train.en".format(indir, lang1)
            fname2 = "{}/en-{}/train.en".format(indir, lang2)
            il_fname1 = "{}/en-{}/train.{}".format(indir, lang1, lang1)
            il_fname2 = "{}/en-{}/train.{}".format(indir, lang2, lang2)

            # English sentences present in both English-centric corpora.
            enset_l1 = set(read_file(fname1))
            common_en_set = enset_l1.intersection(read_file(fname2))

            # Map each shared English sentence to a single lang1 translation.
            # NOTE: to keep multiple translations instead, store a list per
            # English sentence here and emit one pair per list entry below.
            en_lang1_dict = {}
            for en_line, il_line in zip(read_file(fname1), read_file(il_fname1)):
                if en_line in common_en_set:
                    en_lang1_dict[en_line] = il_line

            os.makedirs("{}/{}-{}".format(outdir, lang1, lang2), exist_ok=True)
            out_l1_fname = "{o}/{l1}-{l2}/train.{l1}".format(
                o=outdir, l1=lang1, l2=lang2
            )
            out_l2_fname = "{o}/{l1}-{l2}/train.{l2}".format(
                o=outdir, l1=lang1, l2=lang2
            )

            with open(out_l1_fname, "w", encoding="utf-8") as out_l1_file, open(
                out_l2_fname, "w", encoding="utf-8"
            ) as out_l2_file:
                for en_line, il_line in zip(read_file(fname2), read_file(il_fname2)):
                    if en_line in en_lang1_dict:
                        # Write the pair exactly once. The dict value is a
                        # plain string, so it must not be iterated over
                        # (that would emit one copy per character).
                        out_l1_file.write(en_lang1_dict[en_line] + "\n")
                        out_l2_file.write(il_line + "\n")


def get_extracted_stats(outdir, LANGS):
    """
    Counts the extracted sentence pairs for every language pair.

    outdir: directory holding one xx-yy sub-directory per language pair;
            the number of lines in xx-yy/train.xx is used as the pair count.
    LANGS: list of language codes, in the same order that was used to
            name the xx-yy sub-directories (xx precedes yy in the list).

    Returns a list of (source, target, count) tuples. Each pair is
    reported in both directions with the same count.
    """
    stats = []
    for first_idx in tqdm(range(len(LANGS) - 1)):
        first = LANGS[first_idx]
        for second in LANGS[first_idx + 1:]:
            # Only the first language's file is read to obtain the count.
            first_side_path = "{}/{}-{}/train.{}".format(
                outdir, first, second, first
            )
            num_lines = sum(1 for _ in read_file(first_side_path))
            stats.extend(
                [(first, second, num_lines), (second, first, num_lines)]
            )
    return stats