Add files using upload-large-folder tool
This view is limited to 50 files because it contains too many changes.
- fairseq/examples/mms/lid_rerank/mms/merge_by_lang.py +33 -0
- fairseq/examples/mms/lid_rerank/mms/prep_wav_list.py +23 -0
- fairseq/examples/mms/lid_rerank/mms/split_by_lang.py +90 -0
- fairseq/examples/mms/lid_rerank/nllb/infer.py +46 -0
- fairseq/examples/mms/lid_rerank/rerank/rerank.py +132 -0
- fairseq/examples/mms/lid_rerank/rerank/tune_coefficients.py +138 -0
- fairseq/examples/mms/lid_rerank/whisper/infer_lid.py +65 -0
- fairseq/examples/moe_lm/data_card.md +221 -0
- fairseq/examples/moe_lm/model_card.md +170 -0
- fairseq/examples/mr_hubert/README.md +187 -0
- fairseq/examples/mr_hubert/config/decode/infer.yaml +30 -0
- fairseq/examples/mr_hubert/config/decode/infer_lm.yaml +37 -0
- fairseq/examples/mr_hubert/config/decode/run/submitit_slurm.yaml +17 -0
- fairseq/examples/mr_hubert/config/decode/run/submitit_slurm_8gpu.yaml +17 -0
- fairseq/examples/mr_hubert/config/finetune/base_100h.yaml +97 -0
- fairseq/examples/mr_hubert/config/finetune/base_100h_large.yaml +97 -0
- fairseq/examples/mr_hubert/config/finetune/base_10h.yaml +101 -0
- fairseq/examples/mr_hubert/config/finetune/base_10h_large.yaml +101 -0
- fairseq/examples/mr_hubert/config/finetune/base_1h.yaml +100 -0
- fairseq/examples/mr_hubert/config/finetune/base_1h_large.yaml +99 -0
- fairseq/examples/mr_hubert/config/pretrain/mrhubert_base_librispeech.yaml +103 -0
- fairseq/examples/mr_hubert/config/pretrain/mrhubert_large_librilight.yaml +107 -0
- fairseq/examples/mr_hubert/config/pretrain/run/submitit_reg.yaml +20 -0
- fairseq/examples/mr_hubert/train.sh +45 -0
- fairseq/examples/multilingual/ML50_langs.txt +52 -0
- fairseq/examples/multilingual/README.md +158 -0
- fairseq/examples/multilingual/data_scripts/README.md +24 -0
- fairseq/examples/multilingual/data_scripts/binarize.py +200 -0
- fairseq/examples/multilingual/data_scripts/check_iswlt_test_data.py +67 -0
- fairseq/examples/multilingual/data_scripts/check_self_overlaps.py +103 -0
- fairseq/examples/multilingual/data_scripts/check_valid_test_overlaps.py +124 -0
- fairseq/examples/multilingual/data_scripts/dedup_all.py +52 -0
- fairseq/examples/multilingual/data_scripts/download_ML50_v1.sh +30 -0
- fairseq/examples/multilingual/data_scripts/download_af_xh.sh +164 -0
- fairseq/examples/multilingual/data_scripts/download_flores_data.sh +246 -0
- fairseq/examples/multilingual/data_scripts/download_iitb.sh +35 -0
- fairseq/examples/multilingual/data_scripts/download_iwslt_and_extract.sh +225 -0
- fairseq/examples/multilingual/data_scripts/download_lotus.sh +46 -0
- fairseq/examples/multilingual/data_scripts/download_ted_and_extract.py +338 -0
- fairseq/examples/multilingual/data_scripts/download_wat19_my.sh +36 -0
- fairseq/examples/multilingual/data_scripts/download_wmt19_and_before.py +899 -0
- fairseq/examples/multilingual/data_scripts/download_wmt20.sh +547 -0
- fairseq/examples/multilingual/data_scripts/preprocess_ML50_v1.sh +27 -0
- fairseq/examples/multilingual/data_scripts/remove_valid_test_in_train.py +290 -0
- fairseq/examples/multilingual/data_scripts/requirement.txt +2 -0
- fairseq/examples/multilingual/data_scripts/utils/dedup.py +41 -0
- fairseq/examples/multilingual/data_scripts/utils/fasttext_multi_filter.py +63 -0
- fairseq/examples/multilingual/data_scripts/utils/strip_sgm.sh +1 -0
- fairseq/examples/multilingual/finetune_multilingual_model.sh +32 -0
- fairseq/examples/multilingual/multilingual_fairseq_gen.sh +26 -0
fairseq/examples/mms/lid_rerank/mms/merge_by_lang.py
ADDED
@@ -0,0 +1,33 @@
```python
import argparse
import json
from collections import defaultdict
import os
import soundfile as sf
from tqdm import tqdm

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Example argument parser')
    parser.add_argument('--exp', type=str)
    parser.add_argument('--dump', type=str)
    args = parser.parse_args()

    langs = [d for d in os.listdir(args.dump) if os.path.isdir(os.path.join(args.dump, d))]

    data = {}

    for lang in langs:
        ids = [int(x.strip()) for x in open(args.dump + "/" + lang + "/ids.txt", "r").readlines()]
        word_hyps = [x.strip() for x in open(args.exp + "/" + lang + "/hypo.word.reord", "r").readlines()]
        scores = [x.strip() for x in open(args.exp + "/" + lang + "/asr_score.reord", "r").readlines()]
        assert len(ids) == len(word_hyps)
        assert len(ids) == len(scores)
        for id, word_hyp, s in zip(ids, word_hyps, scores):
            if id in data:
                print("Duplicate ID found")
                import pdb; pdb.set_trace()
            data[id] = (word_hyp, s)

    with open(args.exp + "/nbest_asr_hyp", "w") as f1, open(args.exp + "/asr_score", "w") as f2:
        for i in range(len(data.keys())):
            f1.write(data[i][0] + "\n")
            f2.write(data[i][1] + "\n")
```
fairseq/examples/mms/lid_rerank/mms/prep_wav_list.py
ADDED
@@ -0,0 +1,23 @@
```python
import soundfile as sf
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Example argument parser')
    parser.add_argument('--src', type=str)
    parser.add_argument('--dst', type=str)
    args = parser.parse_args()

    wavs = [x.strip() for x in open(args.src, "r").readlines()]

    new_lines = ["/"]
    for wav in wavs:
        # Read the wav file
        data, sample_rate = sf.read(wav)

        # Number of samples is the length of the data array
        num_samples = len(data)

        new_lines.append(wav + "\t" + str(num_samples))

    with open(args.dst, "w") as f:
        f.writelines([x + "\n" for x in new_lines])
```
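For reference, a toy illustration of the manifest this script emits (paths and sample counts below are made up); the first line is the root placeholder `/`, followed by one tab-separated `<wav path>\t<num samples>` row per file:

```python
# Hypothetical inputs, for illustration only; the real script measures
# num_samples with soundfile.read() on each wav.
wavs = ["/data/audio/utt1.wav", "/data/audio/utt2.wav"]
num_samples = [63840, 129920]

lines = ["/"] + [f"{w}\t{n}" for w, n in zip(wavs, num_samples)]
print("\n".join(lines))
# "/" on the first line, then one "<path>\t<num_samples>" row per wav.
```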
fairseq/examples/mms/lid_rerank/mms/split_by_lang.py
ADDED
@@ -0,0 +1,90 @@
```python
import argparse
import json
from collections import defaultdict
import os
import soundfile as sf
from tqdm import tqdm

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Example argument parser')
    parser.add_argument('--wavs_tsv', type=str)
    parser.add_argument('--lid_preds', type=str)
    parser.add_argument('--dst', type=str)
    parser.add_argument('--refs', type=str, default=None)
    parser.add_argument('--langs', type=str, default=None)
    parser.add_argument('--confs', type=str, default=None)
    args = parser.parse_args()

    # split wavs into dst/lang/wav.txt and dst/lang/ids.txt
    # uses lid_preds to create topk asr; 1 wav has k different lid

    wavs_tsv = [x for x in open(args.wavs_tsv, "r").readlines()]
    root = wavs_tsv[0]
    wavs = wavs_tsv[1:]
    lid_preds = [eval(x) for x in open(args.lid_preds, "r").readlines()]
    if args.refs is not None:
        refs = [x.strip() for x in open(args.refs, "r").readlines()]
        assert len(wavs) == len(refs)
        refs_filt = []
    if args.langs is not None:
        langs = [x.strip() for x in open(args.langs, "r").readlines()]
        assert len(wavs) == len(langs)
        langs_filt = []
    if args.confs is not None:
        confs = [x.strip() for x in open(args.confs, "r").readlines()]
        assert len(wavs) == len(confs)
        confs_filt = []

    assert len(wavs) == len(lid_preds)

    topk_wavs = []
    topk_langs = []

    for i, (w, p) in enumerate(zip(wavs, lid_preds)):
        if p == "n/a":
            continue

        assert len(p) == len(lid_preds[0])

        for l, _ in p:
            topk_wavs.append(w)
            topk_langs.append(l)

        if args.refs is not None:
            refs_filt.append(refs[i])
        if args.langs is not None:
            langs_filt.append(langs[i])
        if args.confs is not None:
            confs_filt.append(confs[i])

    lang_split = defaultdict(list)
    for id, (wav, lid) in enumerate(zip(topk_wavs, topk_langs)):
        lang_split[lid].append((id, wav))

    for lang in tqdm(lang_split.keys()):
        if not os.path.exists(args.dst + "/" + lang):
            os.makedirs(args.dst + "/" + lang)

        with open(args.dst + "/" + lang + "/test.tsv", "w") as f1, \
                open(args.dst + "/" + lang + "/ids.txt", "w") as f2:
            f1.write(root)
            f1.writelines([x[1] for x in lang_split[lang]])
            f2.writelines([str(x[0]) + "\n" for x in lang_split[lang]])

        with open(args.dst + "/" + lang + "/test.ltr", "w") as fw:
            fw.write("d u m m y | d u m m y |\n" * len(lang_split[lang]))
        with open(args.dst + "/" + lang + "/test.wrd", "w") as fw:
            fw.write("dummy dummy\n" * len(lang_split[lang]))

    with open(args.dst + "/lid.txt", "w") as f:
        f.writelines([x + "\n" for x in topk_langs])

    if args.refs is not None:
        with open(args.dst + "/refs.txt", "w") as f:
            f.writelines([x + "\n" for x in refs_filt])
    if args.langs is not None:
        with open(args.dst + "/langs.txt", "w") as f:
            f.writelines([x + "\n" for x in langs_filt])
    if args.confs is not None:
        with open(args.dst + "/confs.txt", "w") as f:
            f.writelines([x + "\n" for x in confs_filt])
```
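To make the bookkeeping concrete, here is a toy sketch (invented wavs and top-2 LID predictions) of how each utterance is expanded into its top-k language candidates and how the stored ids record each candidate's position in the flattened n-best list, which `merge_by_lang.py` later uses to restore the global order:

```python
from collections import defaultdict

# Hypothetical top-2 (language, score) predictions per wav, as in lid_preds.
wavs = ["utt1.wav", "utt2.wav"]
lid_preds = [[("eng", -0.1), ("deu", -2.3)],
             [("fra", -0.2), ("eng", -1.9)]]

# Flatten: every wav appears once per candidate language (k entries per wav).
topk_wavs, topk_langs = [], []
for w, preds in zip(wavs, lid_preds):
    for lang, _ in preds:
        topk_wavs.append(w)
        topk_langs.append(lang)

# Group by language; the stored id is the position in the flattened list.
lang_split = defaultdict(list)
for idx, (w, lang) in enumerate(zip(topk_wavs, topk_langs)):
    lang_split[lang].append((idx, w))

print(dict(lang_split))
# {'eng': [(0, 'utt1.wav'), (3, 'utt2.wav')],
#  'deu': [(1, 'utt1.wav')],
#  'fra': [(2, 'utt2.wav')]}
```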
fairseq/examples/mms/lid_rerank/nllb/infer.py
ADDED
@@ -0,0 +1,46 @@
```python
#!/usr/bin/env python3
# -*- encoding: utf8 -*-
import fasttext
from tqdm import tqdm
import argparse
import os
import math

parser = argparse.ArgumentParser()
parser.add_argument("--txt", type=str)
parser.add_argument("--dst", type=str)
parser.add_argument("--model", type=str)
parser.add_argument('--lid', type=str)
args = parser.parse_args()

mapping = {"arb": "ara", "azj": "aze", "pes": "fas", "fuv": "ful", "lvs": "lav", "khk": "mon", "zsm": "zlm", "gaz": "orm", "pbt": "pus", "uzn": "uzb", "zho": "cmn"}

def fix_code(x):
    code = x.split("_")[-2]
    if code in mapping:
        code = mapping[code]
    return code

if __name__ == "__main__":
    if not os.path.exists(args.dst):
        os.makedirs(args.dst)

    pretrained_lang_model = args.model
    model = fasttext.load_model(pretrained_lang_model)

    txts = [x.strip() for x in open(args.txt, "r").readlines()]
    lids = [x.strip() for x in open(args.lid, "r").readlines()]
    assert len(txts) == len(lids)

    with open(args.dst + "/wlid_score", "w") as f:
        for t, l in tqdm(zip(txts, lids)):
            predictions = model.predict(t, k=218)  # max 218
            predictions = [(fix_code(x), y) for x, y in zip(predictions[0], predictions[1])]

            try:
                pred_langs = [x[0] for x in predictions]
                idx = pred_langs.index(l)
                score = math.log(predictions[idx][-1])
            except:
                score = -1000
            f.write(str(score) + "\n")
```
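The written-LID score emitted above is simply the log-probability that the LID model assigns to the hypothesized language for a given transcript, with a floor of -1000 when the language is not among the predictions. A minimal sketch of that scoring rule with a stubbed-out predictor (toy probabilities; the real script uses `fasttext.load_model` and `model.predict`):

```python
import math

def wlid_score(text, hyp_lang, predict):
    """Log-prob of hyp_lang under a LID predictor; -1000 if absent."""
    langs, probs = predict(text)  # stand-in for model.predict(text, k=218)
    try:
        return math.log(probs[langs.index(hyp_lang)])
    except ValueError:
        return -1000

# Stub predictor with made-up probabilities, for illustration only.
fake_predict = lambda text: (["eng", "deu", "fra"], [0.90, 0.07, 0.03])
print(wlid_score("hello world", "eng", fake_predict))  # about -0.105
print(wlid_score("hello world", "spa", fake_predict))  # -1000
```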
fairseq/examples/mms/lid_rerank/rerank/rerank.py
ADDED
@@ -0,0 +1,132 @@
```python
import argparse
import json
from collections import defaultdict
import os
from tqdm import tqdm
import sys
import subprocess
import re
import math
import numpy as np
import editdistance
from sklearn.preprocessing import StandardScaler
from multiprocessing import Pool
from functools import partial
import random

cer_langs = [x.strip() for x in open("cer_langs.txt", "r").readlines()]

def select(w, feats, ref_lid, nbest_lid, ref_asr, nbest_asr, n=10, exclude=None):
    assert len(w) == len(feats[0])
    scores = []
    for f in feats:
        s = 0
        for i in range(len(w)):
            s += w[i] * f[i]
        scores.append(s)

    lid_correct = 0
    lid_total = 0
    asr_err = 0
    asr_total = 0
    text = []
    lang = []

    for i in range(len(ref_lid)):
        if exclude is not None:
            if ref_lid[i] in exclude:
                continue

        start_idx = i * n
        end_idx = start_idx + n
        cand_scores = scores[start_idx:end_idx]
        max_idx, max_val = max(enumerate(cand_scores), key=lambda x: x[1])

        cand_feats = feats[start_idx:end_idx]

        lang.append(nbest_lid[start_idx:end_idx][max_idx])
        if ref_lid[i] == nbest_lid[start_idx:end_idx][max_idx]:
            lid_correct += 1
        lid_total += 1

        hyp = nbest_asr[start_idx:end_idx][max_idx]
        text.append(hyp)
        ref = ref_asr[i]
        hyp = hyp.lower()
        ref = ref.lower()
        hyp = hyp.replace(".", "").replace(",", "").replace("?", "").replace("!", "").replace(":", "").replace(")", "").replace("(", "").replace("-", "")
        ref = ref.replace(".", "").replace(",", "").replace("?", "").replace("!", "").replace(":", "").replace(")", "").replace("(", "").replace("-", "")
        if ref_lid[i] in cer_langs:
            hyp = " ".join(hyp)
            ref = " ".join(ref)

        hyp_words = hyp.split()
        tgt_words = ref.split()
        errs = editdistance.eval(hyp_words, tgt_words)
        asr_err += errs
        asr_total += len(tgt_words)

    results = {"lid_acc": lid_correct / lid_total, "asr_wer": asr_err / asr_total, "weights": w}

    return results, text, lang

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Example argument parser')
    parser.add_argument('--slid', type=str)
    parser.add_argument('--wlid', type=str)
    parser.add_argument('--asr', type=str)
    parser.add_argument('--lm', type=str)
    parser.add_argument('--uasr', type=str)
    parser.add_argument('--n', type=int, default=10)
    parser.add_argument('--dst', type=str)
    parser.add_argument('--ref_lid', type=str)
    parser.add_argument('--nbest_lid', type=str)
    parser.add_argument('--ref_asr', type=str)
    parser.add_argument('--nbest_asr', type=str)
    parser.add_argument('--w', type=str)
    parser.add_argument('--tag', type=str, default=None)
    parser.add_argument('--exclude', nargs="*", default=None)  # exclude langs
    args = parser.parse_args()

    slid = [float(x.strip()) for x in open(args.slid, "r").readlines()]
    wlid = [float(x.strip()) for x in open(args.wlid, "r").readlines()]
    asr = [float(x.strip()) for x in open(args.asr, "r").readlines()]
    lm = [float(x.strip()) for x in open(args.lm, "r").readlines()]
    uasr = [float(x.strip()) for x in open(args.uasr, "r").readlines()]

    assert len(slid) == len(wlid)
    assert len(wlid) == len(asr)
    assert len(asr) == len(lm)
    assert len(lm) == len(uasr)

    ref_lid = [x.strip() for x in open(args.ref_lid, "r").readlines()]
    nbest_lid = [x.strip() for x in open(args.nbest_lid, "r").readlines()]
    ref_asr = [x.strip() for x in open(args.ref_asr, "r").readlines()]
    nbest_asr = [x.strip() for x in open(args.nbest_asr, "r").readlines()]

    assert len(ref_lid) * args.n == len(nbest_lid)
    assert len(ref_asr) * args.n == len(nbest_asr)
    assert len(ref_lid) == len(ref_asr)

    lengths = [len(x) for x in nbest_asr]

    feats = [[s, w, a, l, u, le] for s, w, a, l, u, le in zip(slid, wlid, asr, lm, uasr, lengths)]

    weight = eval(open(args.w, "r").read())['weights']

    results, text, lang = select(weight, feats, ref_lid, nbest_lid, ref_asr, nbest_asr, n=args.n, exclude=args.exclude)

    if args.tag is not None:
        tag_text = "." + args.tag
    else:
        tag_text = ""

    with open(args.dst + "/reranked_1best_asr_hyp" + tag_text, "w") as f_out:
        f_out.writelines([x + "\n" for x in text])

    with open(args.dst + "/reranked_1best_lid" + tag_text, "w") as f_out:
        f_out.writelines([x + "\n" for x in lang])

    with open(args.dst + "/text.result" + tag_text, "w") as f_out:
        for k in results.keys():
            f_out.write(k + "\t" + str(results[k]) + "\n")
```
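The core of `select()` is a weighted sum of the six features (SLID, WLID, ASR, LM, unsupervised ASR, hypothesis length) for each of the n candidates of an utterance, followed by an argmax. A self-contained toy example, with invented weights and feature values:

```python
# Toy rerank: 1 utterance, n=3 candidates, features = [slid, wlid, asr, lm, uasr, length].
feats = [
    [-0.1, -0.2, -5.0, -30.0, -6.0, 22],   # candidate 0
    [-1.5, -0.4, -4.2, -28.0, -5.5, 25],   # candidate 1
    [-2.0, -3.0, -6.1, -35.0, -7.0, 18],   # candidate 2
]
w = [10.0, 10.0, 1.0, 0.1, 1.0, -0.01]     # hypothetical tuned coefficients

scores = [sum(wi * fi for wi, fi in zip(w, f)) for f in feats]
best_idx = max(range(len(scores)), key=lambda i: scores[i])
print(scores, "-> picks candidate", best_idx)  # candidate 0 here
```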
fairseq/examples/mms/lid_rerank/rerank/tune_coefficients.py
ADDED
@@ -0,0 +1,138 @@
```python
import argparse
import os
from tqdm import tqdm
import numpy as np
import editdistance
from multiprocessing import Pool
from functools import partial

cer_langs = [x.strip() for x in open("cer_langs.txt", "r").readlines()]

def compute(w, feats, ref_lid, nbest_lid, ref_asr, nbest_asr, n=10, exclude=None):
    assert len(w) == len(feats[0])
    scores = []
    for f in feats:
        s = 0
        for i in range(len(w)):
            s += w[i] * f[i]
        scores.append(s)

    lid_correct = 0
    lid_total = 0
    asr_err = 0
    asr_total = 0

    for i in range(len(ref_lid)):
        if exclude is not None:
            if ref_lid[i] in exclude:
                continue

        start_idx = i * n
        end_idx = start_idx + n
        cand_scores = scores[start_idx:end_idx]
        max_idx, max_val = max(enumerate(cand_scores), key=lambda x: x[1])

        if ref_lid[i] == nbest_lid[start_idx:end_idx][max_idx]:
            lid_correct += 1
        lid_total += 1

        hyp = nbest_asr[start_idx:end_idx][max_idx]
        ref = ref_asr[i]
        hyp = hyp.lower()
        ref = ref.lower()
        hyp = hyp.replace(".", "").replace(",", "").replace("?", "").replace("!", "").replace(":", "").replace(")", "").replace("(", "").replace("-", "")
        ref = ref.replace(".", "").replace(",", "").replace("?", "").replace("!", "").replace(":", "").replace(")", "").replace("(", "").replace("-", "")
        if ref_lid[i] in cer_langs:
            hyp = " ".join(hyp)
            ref = " ".join(ref)

        hyp_words = hyp.split()
        tgt_words = ref.split()
        errs = editdistance.eval(hyp_words, tgt_words)
        asr_err += errs
        asr_total += len(tgt_words)

    return {"lid_acc": lid_correct / lid_total, "asr_wer": asr_err / asr_total, "weights": w}

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Example argument parser')
    parser.add_argument('--slid', type=str)
    parser.add_argument('--wlid', type=str)
    parser.add_argument('--asr', type=str)
    parser.add_argument('--lm', type=str)
    parser.add_argument('--uasr', type=str)
    parser.add_argument('--n', type=int, default=10)
    parser.add_argument('--dst', type=str)
    parser.add_argument('--ref_lid', type=str)
    parser.add_argument('--nbest_lid', type=str)
    parser.add_argument('--ref_asr', type=str)
    parser.add_argument('--nbest_asr', type=str)
    parser.add_argument('--iters', type=int, default=10000)
    parser.add_argument('--slid_scale', type=int, default=100)
    parser.add_argument('--wlid_scale', type=int, default=100)
    parser.add_argument('--asr_scale', type=int, default=10)
    parser.add_argument('--lm_scale', type=int, default=10)
    parser.add_argument('--uasr_scale', type=int, default=10)
    parser.add_argument('--len_scale', type=int, default=1)
    parser.add_argument('--num_jobs', type=int, default=64)
    parser.add_argument('--exclude', nargs="*", default=None)  # exclude langs
    args = parser.parse_args()

    slid = [float(x.strip()) for x in open(args.slid, "r").readlines()]
    wlid = [float(x.strip()) for x in open(args.wlid, "r").readlines()]
    asr = [float(x.strip()) for x in open(args.asr, "r").readlines()]
    lm = [float(x.strip()) for x in open(args.lm, "r").readlines()]
    uasr = [float(x.strip()) for x in open(args.uasr, "r").readlines()]

    assert len(slid) == len(wlid)
    assert len(wlid) == len(asr)
    assert len(asr) == len(lm)
    assert len(lm) == len(uasr)

    ref_lid = [x.strip() for x in open(args.ref_lid, "r").readlines()]
    nbest_lid = [x.strip() for x in open(args.nbest_lid, "r").readlines()]
    ref_asr = [x.strip() for x in open(args.ref_asr, "r").readlines()]
    nbest_asr = [x.strip() for x in open(args.nbest_asr, "r").readlines()]

    assert len(ref_lid) * args.n == len(nbest_lid)
    assert len(ref_asr) * args.n == len(nbest_asr)
    assert len(ref_lid) == len(ref_asr)

    lengths = [len(x) for x in nbest_asr]

    feats = [[s, w, a, l, u, le] for s, w, a, l, u, le in zip(slid, wlid, asr, lm, uasr, lengths)]

    weights = []
    for i in range(args.iters):
        s_w = np.random.rand() * args.slid_scale
        w_w = np.random.rand() * args.wlid_scale
        a_w = np.random.rand() * args.asr_scale
        l_w = np.random.rand() * args.lm_scale
        u_w = np.random.rand() * args.uasr_scale
        le_w = (np.random.rand() - 0.5) * args.len_scale
        weights.append([s_w, w_w, a_w, l_w, u_w, le_w])

    num_tries = len(weights)
    print("Total number of search points", num_tries)
    threads = args.num_jobs
    pool = Pool(threads)
    compute_fxn = partial(compute, feats=feats, ref_lid=ref_asr, nbest_lid=nbest_lid, ref_asr=ref_asr, nbest_asr=nbest_asr, n=args.n, exclude=args.exclude)
    results = pool.map(compute_fxn, weights)
    pool.close()
    pool.join()

    assert len(results) == len(weights)

    wer_best = 100
    best = ""
    if not os.path.exists(args.dst):
        os.makedirs(args.dst)
    with open(args.dst + "/results.all", "w") as f_out:
        for result in results:
            f_out.write(str(result) + "\n")
            if result["asr_wer"] < wer_best:
                wer_best = result["asr_wer"]
                best = result

    with open(args.dst + "/best_coefficients", "w") as f_out:
        f_out.write(str(best) + "\n")
```
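The tuning step above is a plain random search: sample coefficient vectors within per-feature scales, score each with `compute()` on a dev set, and keep the vector with the lowest WER. A compressed sketch of that loop, where the toy `eval_wer` stands in for `compute()` (its behavior is made up for illustration):

```python
import numpy as np

def eval_wer(w):
    # Stand-in for compute(): returns a fake "WER" for a weight vector.
    target = np.array([50.0, 50.0, 5.0, 5.0, 5.0, 0.0])
    return float(np.abs(np.array(w) - target).mean())

rng = np.random.default_rng(0)
scales = [100, 100, 10, 10, 10, 1]      # mirrors the *_scale defaults above
best, best_wer = None, float("inf")
for _ in range(1000):                   # plays the role of --iters
    w = [rng.random() * s for s in scales]
    w[-1] -= 0.5 * scales[-1]           # the length weight is centered at 0
    wer = eval_wer(w)
    if wer < best_wer:
        best_wer, best = wer, w
print(best_wer, best)
```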
fairseq/examples/mms/lid_rerank/whisper/infer_lid.py
ADDED
@@ -0,0 +1,65 @@
```python
#!/usr/bin/env python3
# -*- encoding: utf8 -*-
import argparse
import itertools
import os
import re
import sys
from pathlib import Path
import math

import whisper
from tqdm import tqdm


parser = argparse.ArgumentParser()
parser.add_argument("--wavs", type=str)
parser.add_argument("--dst", type=str)
parser.add_argument("--model", type=str)
parser.add_argument("--n", type=int, default=10)
parser.add_argument("--mapping", type=str, default="whisper/lid_mapping.txt")
args = parser.parse_args()

if __name__ == "__main__":
    model = whisper.load_model(args.model)

    print(args)

    wavs = [x.strip() for x in open(args.wavs, "r").readlines()]
    if not os.path.exists(args.dst):
        os.makedirs(args.dst)

    if args.mapping is not None:
        # whisper_lid_code:mms_lid_code
        mapping = {x[0]: x[1] for x in [l.strip().split(";", 1) for l in open(args.mapping, "r").readlines()]}
    else:
        mapping = None

    with open(args.dst + "/predictions", "w") as f:
        for wav in tqdm(wavs):
            # load audio and pad/trim it to fit 30 seconds
            audio = whisper.load_audio(wav)
            audio = whisper.pad_or_trim(audio)

            # make log-Mel spectrogram and move to the same device as the model
            mel = whisper.log_mel_spectrogram(audio).to(model.device)

            _, probs = model.detect_language(mel)
            result = sorted(probs.items(), key=lambda x: x[1], reverse=True)[:args.n]
            f.write(str(result) + "\n")

    lid_preds = [eval(x) for x in open(args.dst + "/predictions", "r").readlines()]
    lids = []
    scores = []
    for p in lid_preds:
        assert len(p) == len(lid_preds[0])
        for l, s in p:
            if args.mapping is not None:
                lids.append(mapping[l])
            else:
                lids.append(l)
            scores.append(math.log(s))
    with open(args.dst + "/nbest_lid", "w") as f:
        f.writelines([x + "\n" for x in lids])
    with open(args.dst + "/slid_score", "w") as f:
        f.writelines([str(x) + "\n" for x in scores])
```
fairseq/examples/moe_lm/data_card.md
ADDED
@@ -0,0 +1,221 @@

# Data card for the paper "Efficient Large Scale Language Modeling with Mixtures of Experts"
## Version 1.0.0

We follow the recommendations of Gebru et al. (2018) and provide a data card for the dataset used to train the 1.1T parameter model.

## Motivation
* **For what purpose was the dataset created? Was there a specific task in mind? Was there a specific gap that needed to be filled? Please provide a description.**
The pre-training data for the 1.1T model was created from the union of six English-language datasets: the five datasets used by RoBERTa (Liu et al., 2019) and the English subset of CC100. The purpose of creating this dataset was to pre-train the language model.

* **Who created the dataset (e.g., which team, research group) and on behalf of which entity (e.g., company, institution, organization)?**
FAIR (Fundamental Artificial Intelligence Research)

* **Who funded the creation of the dataset? If there is an associated grant, please provide the name of the grantor and the grant name and number.**
FAIR (Fundamental Artificial Intelligence Research)

* **Any other comments?**
No.

## Composition

* **What do the instances that comprise the dataset represent (e.g., documents, photos, people, countries)? Are there multiple types of instances (e.g., movies, users, and ratings; people and interactions between them; nodes and edges)? Please provide a description.**
The instances are textual documents. The overall dataset is composed from a union of the following datasets:
    * BookCorpus (Zhu et al., 2019) consists of more than 10K unpublished books (4GB);
    * English Wikipedia, excluding lists, tables and headers (12GB);
    * CC-News (Nagel, 2016) contains 63 million English news articles crawled between September 2016 and February 2019 (76GB);
    * OpenWebText (Gokaslan and Cohen, 2019), an open source recreation of the WebText dataset used to train GPT-2 (38GB);
    * CC-Stories (Trinh and Le, 2018) contains a subset of CommonCrawl data filtered to match the story-like style of Winograd schemas (31GB);
    * English CC100 (Wenzek et al., 2020), a dataset extracted from CommonCrawl snapshots between January 2018 and December 2018, filtered to match the style of Wikipedia (292GB).

* **How many instances are there in total (of each type, if appropriate)?**
The training data contains 112B tokens corresponding to 453 GB of data.

* **Does the dataset contain all possible instances or is it a sample (not necessarily random) of instances from a larger set? If the dataset is a sample, then what is the larger set? Is the sample representative of the larger set (e.g., geographic coverage)? If so, please describe how this representativeness was validated/verified. If it is not representative of the larger set, please describe why not (e.g., to cover a more diverse range of instances, because instances were withheld or unavailable).**
The English CC100 section of the dataset is a subset of CommonCrawl snapshots extracted between January 2018 and December 2018, filtered to match the style of Wikipedia. The CC-Stories dataset contains a subset of CommonCrawl data filtered to match the story-like style of Winograd schemas.

* **What data does each instance consist of? "Raw" data (e.g., unprocessed text or images) or features? In either case, please provide a description.**
Each instance consists of raw text data.

* **Is there a label or target associated with each instance? If so, please provide a description.**
No.

* **Is any information missing from individual instances? If so, please provide a description, explaining why this information is missing (e.g., because it was unavailable). This does not include intentionally removed information, but might include, e.g., redacted text.**
No.

* **Are relationships between individual instances made explicit (e.g., users' movie ratings, social network links)? If so, please describe how these relationships are made explicit.**
There are no explicit relationships between individual instances.

* **Are there recommended data splits (e.g., training, development/validation, testing)? If so, please provide a description of these splits, explaining the rationale behind them.**
We hold out a random validation set of approximately 150MB from the pretraining data, sampled proportionally to each dataset's size in the pretraining corpus.

* **Are there any errors, sources of noise, or redundancies in the dataset? If so, please provide a description.**
N/A

* **Is the dataset self-contained, or does it link to or otherwise rely on external resources (e.g., websites, tweets, other datasets)?**
It's self-contained.

* **Does the dataset contain data that might be considered confidential (e.g., data that is protected by legal privilege or by doctor-patient confidentiality, data that includes the content of individuals' non-public communications)? If so, please provide a description.**
The datasets used are publicly available, and the information in them is not considered confidential.

* **Does the dataset contain data that, if viewed directly, might be offensive, insulting, threatening, or might otherwise cause anxiety? If so, please describe why.**
Parts of the dataset are a subset of public Common Crawl data, which could contain sentences that, if viewed directly, might be offensive, insulting, threatening, or might otherwise cause anxiety.

* **Does the dataset relate to people? If not, you may skip the remaining questions in this section.**
Some documents of this data relate to people, such as news articles, Wikipedia descriptions, etc.

* **Does the dataset identify any subpopulations (e.g., by age, gender)? If so, please describe how these subpopulations are identified and provide a description of their respective distributions within the dataset.**
No.

* **Is it possible to identify individuals (i.e., one or more natural persons), either directly or indirectly (i.e., in combination with other data) from the dataset? If so, please describe how.**
In addition to individuals who have Wikipedia pages (celebrities, politicians, etc.), it may be possible to identify other individuals by their names, Twitter account names, etc. if that information is present in Common Crawl.

* **Does the dataset contain data that might be considered sensitive in any way (e.g., data that reveals racial or ethnic origins, sexual orientations, religious beliefs, political opinions or union memberships, or locations; financial or health data; biometric or genetic data; forms of government identification, such as social security numbers; criminal history)? If so, please provide a description.**
The training dataset is partially derived from Common Crawl, which may contain some sensitive information.

* **Any other comments?**
No.


## Collection Process

* **How was the data associated with each instance acquired? Was the data directly observable (e.g., raw text, movie ratings), reported by subjects (e.g., survey responses), or indirectly inferred/derived from other data (e.g., part-of-speech tags, model-based guesses for age or language)? If data was reported by subjects or indirectly inferred/derived from other data, was the data validated/verified? If so, please describe how.**
N/A. The dataset is a union of six publicly available datasets.

* **What mechanisms or procedures were used to collect the data (e.g., hardware apparatus or sensor, manual human curation, software program, software API)? How were these mechanisms or procedures validated?**
N/A

* **If the dataset is a sample from a larger set, what was the sampling strategy (e.g., deterministic, probabilistic with specific sampling probabilities)?**
Please refer to the main document for details.

* **Who was involved in the data collection process (e.g., students, crowdworkers, contractors) and how were they compensated (e.g., how much were crowdworkers paid)?**
This data is mined, filtered and sampled by machines.

* **Over what timeframe was the data collected? Does this timeframe match the creation timeframe of the data associated with the instances (e.g., recent crawl of old news articles)? If not, please describe the timeframe in which the data associated with the instances was created.**
Different parts of the dataset were mined over different time periods.
1. The CC-News dataset contains English news articles crawled between September 2016 and February 2019.
2. The English CC100 dataset was extracted from CommonCrawl snapshots between January 2018 and December 2018.

* **Were any ethical review processes conducted (e.g., by an institutional review board)? If so, please provide a description of these review processes, including the outcomes, as well as a link or other access point to any supporting documentation.**
No.

* **Does the dataset relate to people? If not, you may skip the remainder of the questions in this section.**
No.

* **Did you collect the data from the individuals in question directly, or obtain it via third parties or other sources (e.g., websites)?**
N/A

* **Were the individuals in question notified about the data collection? If so, please describe (or show with screenshots or other information) how notice was provided, and provide a link or other access point to, or otherwise reproduce, the exact language of the notification itself.**
N/A

* **Did the individuals in question consent to the collection and use of their data? If so, please describe (or show with screenshots or other information) how consent was requested and provided, and provide a link or other access point to, or otherwise reproduce, the exact language to which the individuals consented.**
N/A

* **If consent was obtained, were the consenting individuals provided with a mechanism to revoke their consent in the future or for certain uses? If so, please provide a description, as well as a link or other access point to the mechanism (if appropriate).**
N/A

* **Has an analysis of the potential impact of the dataset and its use on data subjects (e.g., a data protection impact analysis) been conducted? If so, please provide a description of this analysis, including the outcomes, as well as a link or other access point to any supporting documentation.**
Some responsible AI related evaluations were performed. Please refer to the main document and the model card for the paper.

* **Any other comments?**
No.


## Preprocessing/cleaning/labeling

* **Was any preprocessing/cleaning/labeling of the data done (e.g., discretization or bucketing, tokenization, part-of-speech tagging, SIFT feature extraction, removal of instances, processing of missing values)? If so, please provide a description. If not, you may skip the remainder of the questions in this section.**
The component datasets went through standard cleaning and re-formatting practices, including removing repetitive/non-informative text like "Chapter One" or "This ebook by Project Gutenberg".

* **Was the "raw" data saved in addition to the preprocessed/cleaned/labeled data (e.g., to support unanticipated future uses)? If so, please provide a link or other access point to the "raw" data.**
The "raw" component datasets are publicly available in their respective locations (more details can be seen in the respective papers linked in the references).

* **Is the software used to preprocess/clean/label the instances available? If so, please provide a link or other access point.**
The software is proprietary to Meta Platforms and currently unavailable publicly.

* **Any other comments?**
No.


## Uses

* **Has the dataset been used for any tasks already? If so, please provide a description.**
Yes, this dataset was used to pre-train the models described in the paper.

* **Is there a repository that links to any or all papers or systems that use the dataset? If so, please provide a link or other access point.**
No.

* **What (other) tasks could the dataset be used for?**
This data can be used to pretrain English language models, which are foundational to many current and future language tasks.

* **Is there anything about the composition of the dataset or the way it was collected and preprocessed/cleaned/labeled that might impact future uses? For example, is there anything that a future user might need to know to avoid uses that could result in unfair treatment of individuals or groups (e.g., stereotyping, quality of service issues) or other undesirable harms (e.g., financial harms, legal risks) If so, please provide a description. Is there anything a future user could do to mitigate these undesirable harms?**
The pipeline for creating this dataset paves the way for building a scalable infrastructure for mining datasets to be used for training large-scale models.

* **Are there tasks for which the dataset should not be used? If so, please provide a description.**
No.

* **Any other comments?**
No.

## Distribution

* **Will the dataset be distributed to third parties outside of the entity (e.g., company, institution, organization) on behalf of which the dataset was created? If so, please provide a description.**
No.

* **How will the dataset be distributed (e.g., tarball on website, API, GitHub)? Does the dataset have a digital object identifier (DOI)?**
N/A

* **When will the dataset be distributed?**
N/A

* **Will the dataset be distributed under a copyright or other intellectual property (IP) license, and/or under applicable terms of use (ToU)? If so, please describe this license and/or ToU, and provide a link or other access point to, or otherwise reproduce, any relevant licensing terms or ToU, as well as any fees associated with these restrictions.**
No.

* **Have any third parties imposed IP-based or other restrictions on the data associated with the instances? If so, please describe these restrictions, and provide a link or other access point to, or otherwise reproduce, any relevant licensing terms, as well as any fees associated with these restrictions.**
No.

* **Do any export controls or other regulatory restrictions apply to the dataset or to individual instances? If so, please describe these restrictions, and provide a link or other access point to, or otherwise reproduce, any supporting documentation.**
N/A

* **Any other comments?**
No.

## Maintenance

* **Who is supporting/hosting/maintaining the dataset?**
FAIR (Fundamental Artificial Intelligence Research)

* **How can the owner/curator/manager of the dataset be contacted (e.g., email address)?**
Refer to the main document.

* **Is there an erratum? If so, please provide a link or other access point.**
N/A

* **Will the dataset be updated (e.g., to correct labeling errors, add new instances, delete instances)? If so, please describe how often, by whom, and how updates will be communicated to users (e.g., mailing list, GitHub)?**
No plan for updating.

* **If the dataset relates to people, are there applicable limits on the retention of the data associated with the instances (e.g., were individuals in question told that their data would be retained for a fixed period of time and then deleted)? If so, please describe these limits and explain how they will be enforced.**
N/A

* **Will older versions of the dataset continue to be supported/hosted/maintained? If so, please describe how. If not, please describe how its obsolescence will be communicated to users.**
N/A

* **If others want to extend/augment/build on/contribute to the dataset, is there a mechanism for them to do so? If so, please provide a description. Will these contributions be validated/verified? If so, please describe how. If not, why not? Is there a process for communicating/distributing these contributions to other users? If so, please provide a description.**
No.

* **Any other comments?**
No.

## References
Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. RoBERTa: A robustly optimized BERT pretraining approach. arXiv preprint arXiv:1907.11692.

Yukun Zhu, Ryan Kiros, Richard Zemel, Ruslan Salakhutdinov, Raquel Urtasun, Antonio Torralba, and Sanja Fidler. 2019. Aligning books and movies: Towards story-like visual explanations by watching movies and reading books. arXiv:1506.06724.

Sebastian Nagel. 2016. CC-News. http://web.archive.org/save/http://commoncrawl.org/2016/10/news-dataset-available.

Aaron Gokaslan and Vanya Cohen. 2019. OpenWebText corpus. http://web.archive.org/save/http://Skylion007.github.io/OpenWebTextCorpus

Trieu H. Trinh and Quoc V. Le. 2018. A simple method for commonsense reasoning. arXiv preprint arXiv:1806.02847.

Guillaume Wenzek, Marie-Anne Lachaux, Alexis Conneau, Vishrav Chaudhary, Francisco Guzmán, Armand Joulin, and Edouard Grave. 2020. CCNet: Extracting high quality monolingual datasets from web crawl data. In Proceedings of the 12th Language Resources and Evaluation Conference, pages 4003–4012, Marseille, France. European Language Resources Association.
fairseq/examples/moe_lm/model_card.md
ADDED
@@ -0,0 +1,170 @@

# Model card for the paper "Efficient Large Scale Language Modeling with Mixtures of Experts"
## Version 1.0.0

### Model developer
FAIR (Fundamental Artificial Intelligence Research)

### Model type
An autoregressive English language model trained on a union of six English-language datasets. We explore dense and sparse (MoE based) architectures in the paper.
* Dense models - Our dense models range from 125M parameters to 13B parameters.
* Sparse (MoE) models - Our MoE based models range from 15B parameters to 1.1 trillion parameters.
This model card focuses on the 1.1 trillion parameter model, but the discussion applies to all of the models explored in this work.

### Citation details
Artetxe et al. (2021): Efficient Large Scale Language Modeling with Mixtures of Experts

### Model Feedback Channel
fairseq

## Intended use
### Primary intended use
For research purposes only, e.g. reproducing model evaluation results. Generation is only used in a limited capacity for explanation/justification or for prompting/probing/priming for class labels.

### Out of scope uses
The primary purpose of the model is not to generate language, although the model is capable of doing that.

## Factors influencing model performance
This section discusses potential risks associated with using the model.

### Relevant factors
Based on known problems with NLP technology, potential relevant factors include bias (gender, profession, race and religion).

### Evaluation factors
The 1.1T model was evaluated on the StereoSet and CrowS-Pairs datasets to quantify encoded bias in the model.

## Metrics
### Model performance measures
The 1.1T parameter model was primarily evaluated on
1. In-domain and out-of-domain language modeling perplexity.
2. Zero-shot and few-shot priming.
3. Fully supervised finetuning.

### Approaches to handle uncertainty
For few-shot learning, we report the average results across 25 runs, randomly sampling a different set of few-shot examples from the training set each time.

## Evaluation data
## Zero Shot evaluation

### HellaSwag
#### Description
HellaSwag is a dataset for evaluating commonsense reasoning.

### PIQA
#### Description
PIQA is a dataset designed to evaluate reasoning about physical commonsense in natural language.

### ReCoRD
#### Description
Reading Comprehension with Commonsense Reasoning Dataset (ReCoRD) is a large-scale reading comprehension dataset which requires commonsense reasoning. ReCoRD consists of queries automatically generated from CNN/Daily Mail news articles; the answer to each query is a text span from a summarizing passage of the corresponding news. The goal of ReCoRD is to evaluate a machine's ability of commonsense reasoning in reading comprehension.

## Few Shot evaluation
### Winogrande
#### Description
Winogrande is a benchmark for commonsense reasoning. The dataset contains pronoun resolution problems originally designed to be unsolvable for statistical models that rely on selectional preferences or word associations.

### StoryCloze
#### Description
StoryCloze is a new commonsense reasoning framework for evaluating story understanding, story generation, and script learning. This test requires a system to choose the correct ending to a four-sentence story.

### OpenBookQA
#### Description
OpenBookQA is a new kind of question-answering dataset modeled after open book exams for assessing human understanding of a subject. It consists of 5,957 multiple-choice elementary-level science questions (4,957 train, 500 dev, 500 test), which probe the understanding of a small "book" of 1,326 core science facts and the application of these facts to novel situations.

## Fully supervised evaluation

### BoolQ
#### Description
BoolQ is a question answering dataset for yes/no questions containing 15942 examples. These questions are naturally occurring – they are generated in unprompted and unconstrained settings. Each example is a triplet of (question, passage, answer), with the title of the page as optional additional context.

### SST-2
#### Description
SST-2 (or SST-binary) is a binary classification dataset where the goal is to differentiate between negative or somewhat negative vs somewhat positive or positive.

### MNLI
#### Description
The Multi-Genre Natural Language Inference (MultiNLI) corpus is a crowd-sourced collection of 433k sentence pairs annotated with textual entailment information. The corpus is modeled on the SNLI corpus, but differs in that it covers a range of genres of spoken and written text, and supports a distinctive cross-genre generalization evaluation.

## Responsible AI (RAI) evaluation
### StereoSet
#### Description
A large-scale natural dataset in English to measure stereotypical biases in four domains: gender, profession, race, and religion.

#### Motivation for dataset use
The motivation for evaluating the 1.1T parameter model on this dataset is to evaluate the model's stereotype bias in gender, profession, race, and religion.

### CrowS-Pairs
#### Description
A challenge dataset for measuring social biases in masked language models.

#### Motivation for dataset use
The motivation for evaluating the 1.1T parameter model on this dataset is to evaluate the model's bias in the domains of race, religion and age.

----

## Training data
### BookCorpus
#### Description
A dataset consisting of more than 10K unpublished books. 4GB in size. (Zhu et al., 2019)

### English Wikipedia
#### Description
Data from English Wikipedia, excluding lists, tables and headers. 12GB in size.

### CC-News
#### Description
A dataset containing 63 million English news articles crawled between September 2016 and February 2019. 76GB in size. (Nagel, 2016)

### OpenWebText
#### Description
An open source recreation of the WebText dataset used to train GPT-2. 38GB in size. (Gokaslan and Cohen, 2019)

### CC-Stories
#### Description
A dataset containing a subset of CommonCrawl data filtered to match the story-like style of Winograd schemas. 31GB in size. (Trinh and Le, 2018)

### English CC100
#### Description
A dataset extracted from CommonCrawl snapshots between January 2018 and December 2018, filtered to match the style of Wikipedia following the methodology introduced in CCNet (https://arxiv.org/abs/1911.00359). 292GB in size. (Wenzek et al., 2020)

## Responsible AI (RAI) Dimensions
### Fairness (Bias and inclusion)
The 1.1T parameter model was evaluated on the StereoSet and CrowS-Pairs datasets for inherent bias in the model, and bias as a result of the data. Similar to StereoSet, we observe that both the dense and MoE models get worse in terms of the Stereotype Score (SS) with scale.

### Privacy and security
The 1.1T model did not have any special privacy and security considerations. The training data and evaluation data were both public and went through standard Meta privacy and licensing procedures.

### Transparency and control
In the spirit of transparency and accountability we have created this model card for the 1.1T parameter model and a data card for the training data (referenced in Artetxe et al. (2021)).

### Efficiency (Green AI)
The 1.1T parameter model is trained as a Mixture of Experts (MoE) model. MoE models are efficient because they leverage sparse computation, i.e., only a small fraction of parameters are active for any given input. For instance, our 1.1T parameter MoE model requires only 30% more FLOPS than a 6.7B parameter dense model, i.e., a 160x increase in parameters with only a 30% increase in FLOPS. Notably, MoE models achieve much better validation perplexity for a given compute budget compared to dense models.
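A quick back-of-the-envelope check of the ~160x figure, using only the parameter counts quoted above (it says nothing about FLOPS):

```python
# 1.1T vs. 6.7B parameters: roughly a 160x increase, as stated above.
moe_params, dense_params = 1.1e12, 6.7e9
print(round(moe_params / dense_params))  # ~164
```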
142 |
+
|
143 |
+
## References

Rowan Zellers, Ari Holtzman, Yonatan Bisk, Ali Farhadi, and Yejin Choi. 2019. HellaSwag: Can a machine really finish your sentence? In Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, pages 4791–4800, Florence, Italy. Association for Computational Linguistics.

Yonatan Bisk, Rowan Zellers, Ronan Le Bras, Jianfeng Gao, and Yejin Choi. 2020. PIQA: Reasoning about physical commonsense in natural language. Proceedings of the AAAI Conference on Artificial Intelligence, 34(05):7432–7439.

Sheng Zhang, Xiaodong Liu, Jingjing Liu, Jianfeng Gao, Kevin Duh, and Benjamin Van Durme. 2018. ReCoRD: Bridging the gap between human and machine commonsense reading comprehension. arXiv preprint arXiv:1810.12885.

Keisuke Sakaguchi, Ronan Le Bras, Chandra Bhagavatula, and Yejin Choi. 2020. WinoGrande: An adversarial Winograd schema challenge at scale. Proceedings of the AAAI Conference on Artificial Intelligence, 34(05):8732–8740.

Nasrin Mostafazadeh, Nathanael Chambers, Xiaodong He, Devi Parikh, Dhruv Batra, Lucy Vanderwende, Pushmeet Kohli, and James Allen. 2016. A corpus and cloze evaluation for deeper understanding of commonsense stories. In Proceedings of the 2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pages 839–849, San Diego, California. Association for Computational Linguistics.

Todor Mihaylov, Peter Clark, Tushar Khot, and Ashish Sabharwal. 2018. Can a suit of armor conduct electricity? A new dataset for open book question answering. In Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, pages 2381–2391, Brussels, Belgium. Association for Computational Linguistics.

Christopher Clark, Kenton Lee, Ming-Wei Chang, Tom Kwiatkowski, Michael Collins, and Kristina Toutanova. 2019. BoolQ: Exploring the surprising difficulty of natural yes/no questions.

Moin Nadeem, Anna Bethke, and Siva Reddy. 2021. StereoSet: Measuring stereotypical bias in pretrained language models. In Association for Computational Linguistics (ACL).

Nikita Nangia, Clara Vania, Rasika Bhalerao, and Samuel R. Bowman. 2020. CrowS-Pairs: A challenge dataset for measuring social biases in masked language models. In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pages 1953–1967, Online. Association for Computational Linguistics.

Yukun Zhu, Ryan Kiros, Richard Zemel, Ruslan Salakhutdinov, Raquel Urtasun, Antonio Torralba, and Sanja Fidler. 2019. Aligning books and movies: Towards story-like visual explanations by watching movies and reading books. arXiv:1506.06724.

Sebastian Nagel. 2016. CC-News. http://web.archive.org/save/http://commoncrawl.org/2016/10/news-dataset-available.

Aaron Gokaslan and Vanya Cohen. 2019. OpenWebText corpus. http://web.archive.org/save/http://Skylion007.github.io/OpenWebTextCorpus

Trieu H. Trinh and Quoc V. Le. 2018. A simple method for commonsense reasoning. arXiv preprint arXiv:1806.02847.

Guillaume Wenzek, Marie-Anne Lachaux, Alexis Conneau, Vishrav Chaudhary, Francisco Guzmán, Armand Joulin, and Edouard Grave. 2020. CCNet: Extracting high quality monolingual datasets from web crawl data. In Proceedings of the 12th Language Resources and Evaluation Conference, pages 4003–4012, Marseille, France. European Language Resources Association.
fairseq/examples/mr_hubert/README.md
ADDED
@@ -0,0 +1,187 @@
1 |
+
# MR-HuBERT
|
2 |
+
|
3 |
+
## Pre-trained models
|
4 |
+
|
5 |
+
### Main models
|
6 |
+
Model | Pretraining Data | Model | Paper Reference
|
7 |
+
|---|---|---|---
|
8 |
+
MR-HuBERT Base (~97M) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/mono_base/mrhubert_mono_base.pt) | mono\_base
|
9 |
+
MR-HuBERT Base (~321M) | [Libri-Light](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/mono_large/mrhubert_mono_large.pt) | mono\_large
|
10 |
+
Multilingual MR-HuBERT Base (~97M) | [Voxpopuli](https://github.com/facebookresearch/voxpopuli) 100k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/multi_base/multi_base.pt) | multi\_base
|
11 |
+
Multilingual MR-HuBERT Large (~321M) | [Voxpopuli](https://github.com/facebookresearch/voxpopuli) 100k hr | [download 400k steps](https://dl.fbaipublicfiles.com/mrhubert/multi_large/multi_large_400k.pt) or [download 600k steps](https://dl.fbaipublicfiles.com/mrhubert/multi_large/multi_large_600k.pt) | Not in the paper
|
12 |
+
|
13 |
+
|
14 |
+
### Ablation models
|
15 |
+
Model | Pretraining Data | Model | Paper Reference
|
16 |
+
|---|---|---|---
|
17 |
+
MR-HuBERT Base (2-4-6 lyrs) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b1-a/b1-a.pt) | (B.1)-a
|
18 |
+
MR-HuBERT Base (5-2-5 lyrs) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b1-b/b1-b.pt) | (B.1)-b
|
19 |
+
MR-HuBERT Base (6-4-2 lyrs) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b1-c/b1-c.pt) | (B.1)-c
|
20 |
+
MR-HuBERT Base (3res 3-2-2-2-3 lyrs) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b2-a/b2-a.pt) | (B.2)-a
|
21 |
+
MR-HuBERT Base (3res 2-2-4-2-2 lyrs) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b2-b/b2-b.pt) | (B.2)-b
|
22 |
+
MR-HuBERT Base (3res 2-2-2-2-2 lyrs) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b2-c/b2-c.pt) | (B.2)-c
|
23 |
+
MR-HuBERT Base (Simple sampling) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b3-a/b3-a.pt) | (B.3)-a
|
24 |
+
MR-HuBERT Base (Single target) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b4-a/b4-a.pt) | (B.4)-a
|
25 |
+
MR-HuBERT Base (Simple Sampling + single target) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b4-b/b4-b.pt) | (B.4)-b
|
26 |
+
MR-HuBERT Base (Mono-resolution 20ms) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b5-a/b5-a.pt) | (B.5)-a
|
27 |
+
MR-HuBERT Base (3-3-3 lyrs) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b6-a/b6-a.pt) | (B.6)-a
|
28 |
+
MR-HuBERT Base (Mono-resolution 20ms, 3-3-3 lyrs) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b6-b/b6-b.pt) | (B.6)-b
|
29 |
+
MR-HuBERT Base (HuBERT 20ms&40ms units) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b7-a/b7-a.pt) | (B.7)-a
|
30 |
+
MR-HuBERT Base (Encodec 50Hz unit) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b7-b/b7-b.pt) | (B.7)-b
|
31 |
+
MR-HuBERT Base (Encodec 50Hz units and 25Hz units) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b7-c/b7-c.pt) | (B.7)-c
|
32 |
+
MR-HuBERT Base (Encodec 50Hz units stream 0&1 ) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b7-d/b7-d.pt) | (B.7)-d
|
33 |
+
MR-HuBERT Large (no audio norm) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-a/b8-a.pt) | (B.8)-a
|
34 |
+
MR-HuBERT Large (check paper ) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-b/b8-b.pt) | (B.8)-b
|
35 |
+
MR-HuBERT Large (check paper ) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-c/b8-c.pt) | (B.8)-c
|
36 |
+
MR-HuBERT Large (check paper ) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-d/b8-d.pt) | (B.8)-d
|
37 |
+
MR-HuBERT Large (check paper ) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-e/b8-e.pt) | (B.8)-e
|
38 |
+
MR-HuBERT Large (check paper ) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-f/b8-f.pt) | (B.8)-f
|
39 |
+
MR-HuBERT Large (check paper ) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-g/b8-g.pt) | (B.8)-g
|
40 |
+
MR-HuBERT Large (check paper ) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-h/b8-h.pt) | (B.8)-h
|
41 |
+
MR-HuBERT Large (check paper ) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-i/b8-i.pt) | (B.8)-i
|
42 |
+
MR-HuBERT Large (check paper ) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-j/b8-j.pt) | (B.8)-j
|
43 |
+
Multilingual MR-HuBERT Large (Simple sampling) | [Voxpopuli](https://github.com/facebookresearch/voxpopuli) 100k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/multi_large_simple/multi_large_simple.pt) | Not in paper
|
44 |
+
MR-HuBERT xLarge (from HuBERT-base label) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/mono_xlarge/v1.pt) | Not in paper
|
45 |
+
MR-HuBERT xLarge (from HuBERT-large label) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/mono_xlarge/v2.pt) | Not in paper
|
46 |
+
|
47 |
+
## Load a model
|
48 |
+
```python
|
49 |
+
import fairseq

ckpt_path = "/path/to/the/checkpoint.pt"
|
50 |
+
models, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([ckpt_path])
|
51 |
+
model = models[0]
|
52 |
+
```
|
53 |
+
|
54 |
+
## Train a new model
|
55 |
+
|
56 |
+
### Data preparation
|
57 |
+
|
58 |
+
Follow the steps in `./simple_kmeans` to create:
|
59 |
+
- `{train,valid}.tsv` waveform list files with length information
|
60 |
+
```
|
61 |
+
/path/to/your/audio/files
|
62 |
+
file1.wav\t160000
|
63 |
+
file2.wav\t154600
|
64 |
+
...
|
65 |
+
filen.wav\t54362
|
66 |
+
```
|
67 |
+
- `{train,valid}.km` frame-aligned pseudo label files (in the same order as the wave files in the tsv file).
|
68 |
+
```
|
69 |
+
44 44 44 48 48 962 962 962 962 962 962 962 962 967 967 967 967 967 967 967 967 370 852 370 ... 18 18 745 745
|
70 |
+
44 44 44 48 48 962 962 962 147 147 147 147 147 147 147 147 147 147 147 147 176 176 271 271 ... 27 27 745 745
|
71 |
+
...
|
72 |
+
44 44 44 48 962 962 962 962 962 962 377 377 377 77 77 852 696 694 433 578 578 82 740 622 ... 27 27 745 745
|
73 |
+
```
|
74 |
+
- `dict.km.txt` a dummy dictionary (the first column is the unit id, the second is a dummy count)
|
75 |
+
```
|
76 |
+
0 1
|
77 |
+
1 1
|
78 |
+
2 1
|
79 |
+
...
|
80 |
+
999 1
|
81 |
+
```
|
82 |
+
|
83 |
+
The `label_rate` is the same as the feature frame rate used for clustering,
|
84 |
+
which is 100Hz for MFCC features and 50Hz for HuBERT features by default.
|
85 |
+
|
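As a quick sanity check on the prepared files, the hypothetical helper below verifies that each `.km` file has the same number of lines as its `.tsv` waveform list and that the number of labels per utterance roughly matches the audio length divided by the label rate. It is only an illustrative sketch under the format assumptions above (a root-directory header line followed by tab-separated `path<TAB>num_samples` rows, 16 kHz audio, 50 Hz labels by default) and is not part of the fairseq tooling.

```python
import sys

def check_split(tsv_path, km_path, sample_rate=16000, label_rate=50, tol=2):
    with open(tsv_path) as f:
        lines = f.read().splitlines()
    root, entries = lines[0], lines[1:]  # first line is the audio root directory
    with open(km_path) as f:
        label_rows = f.read().splitlines()
    assert len(entries) == len(label_rows), (
        f"{tsv_path} has {len(entries)} utterances but {km_path} has {len(label_rows)} label rows")
    for entry, row in zip(entries, label_rows):
        path, num_samples = entry.split("\t")
        expected = int(num_samples) / sample_rate * label_rate
        n_labels = len(row.split())
        if abs(n_labels - expected) > tol:  # allow a small boundary difference
            print(f"length mismatch for {path}: {n_labels} labels vs ~{expected:.1f} expected")

if __name__ == "__main__":
    # e.g. python check_alignment.py /path/to/data/train.tsv /path/to/labels/train.km
    check_split(sys.argv[1], sys.argv[2])
```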
86 |
+
### Pre-train a MR-HuBERT model
|
87 |
+
|
88 |
+
Suppose `{train,valid}.tsv` are saved at `/path/to/data`, `{train,valid}.km`
|
89 |
+
are saved at `/path/to/labels`, and the label rate is 100Hz.
|
90 |
+
|
91 |
+
To train a base model (12 layer transformer), run:
|
92 |
+
```sh
|
93 |
+
$ python fairseq_cli/hydra_train.py \
|
94 |
+
--config-dir /path/to/fairseq-py/examples/mr_hubert/config/pretrain \
|
95 |
+
--config-name mrhubert_base_librispeech \
|
96 |
+
task.data=/path/to/data task.label_dir=/path/to/labels \
|
97 |
+
task.labels='["km"]' model.label_rate=100 \
|
98 |
+
task.label_rate_ratios='[1, 2]'
|
99 |
+
```
|
100 |
+
|
101 |
+
Please see the sample pre-training script `train.sh` for an example.
|
102 |
+
|
103 |
+
### Fine-tune a MR-HuBERT model with a CTC loss
|
104 |
+
|
105 |
+
Suppose `{train,valid}.tsv` are saved at `/path/to/data`, and their
|
106 |
+
corresponding character transcripts `{train,valid}.ltr` are saved at
|
107 |
+
`/path/to/trans`. A typical .ltr file follows the same order as the waveform files in the tsv file:
|
108 |
+
```
|
109 |
+
HOW | ARE | YOU
|
110 |
+
...
|
111 |
+
THANK | YOU
|
112 |
+
```
|
113 |
+
|
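If your transcripts start out as plain text, a small helper like the hypothetical one below writes them in the format shown above (one utterance per line, in the same order as the corresponding `.tsv`); the input file name is an assumption, so adapt it to however your transcripts are stored.

```python
def to_ltr_line(transcript: str) -> str:
    """Uppercase a raw transcript and join its words with ' | ', as in the example above."""
    return " | ".join(transcript.strip().upper().split())

# "train.txt" is a hypothetical file with one raw transcript per line
with open("train.txt") as fin, open("train.ltr", "w") as fout:
    for line in fin:
        fout.write(to_ltr_line(line) + "\n")
```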
114 |
+
To fine-tune a pre-trained MR-HuBERT model at `/path/to/checkpoint`, run
|
115 |
+
```sh
|
116 |
+
$ python fairseq_cli/hydra_train.py \
|
117 |
+
--config-dir /path/to/fairseq-py/examples/mr_hubert/config/finetune \
|
118 |
+
--config-name base_10h \
|
119 |
+
task.data=/path/to/data task.label_dir=/path/to/trans \
|
120 |
+
model.w2v_path=/path/to/checkpoint
|
121 |
+
```
|
122 |
+
|
123 |
+
Please see the sample fine-tuning script `finetune.sh` for an example.
|
124 |
+
|
125 |
+
### Decode a MR-HuBERT model
|
126 |
+
|
127 |
+
Suppose the `test.tsv` and `test.ltr` are the waveform list and transcripts of
|
128 |
+
the split to be decoded, saved at `/path/to/data`, and the fine-tuned model is
|
129 |
+
saved at `/path/to/checkpoint`.
|
130 |
+
|
131 |
+
|
132 |
+
We support three decoding modes:
|
133 |
+
- Viterbi decoding: greedy decoding without a language model
|
134 |
+
- KenLM decoding: decoding with an arpa-format KenLM n-gram language model
|
135 |
+
- Fairseq-LM decoding: decoding with a Fairseq neural language model (not fully tested)
|
136 |
+
|
137 |
+
|
138 |
+
#### Viterbi decoding
|
139 |
+
|
140 |
+
`task.normalize` needs to be consistent with the value used during fine-tuning.
|
141 |
+
Decoding results will be saved at
|
142 |
+
`/path/to/experiment/directory/decode/viterbi/test`.
|
143 |
+
|
144 |
+
```sh
|
145 |
+
$ python examples/speech_recognition/new/infer.py \
|
146 |
+
--config-dir /path/to/fairseq-py/examples/mr_hubert/config/decode \
|
147 |
+
--config-name infer \
|
148 |
+
task.data=/path/to/data \
|
149 |
+
task.normalize=[true|false] \
|
150 |
+
decoding.exp_dir=/path/to/experiment/directory \
|
151 |
+
common_eval.path=/path/to/checkpoint \
|
152 |
+
dataset.gen_subset=test
|
153 |
+
```
|
154 |
+
|
155 |
+
#### KenLM / Fairseq-LM decoding
|
156 |
+
|
157 |
+
Suppose the pronunciation lexicon and the n-gram LM are saved at
|
158 |
+
`/path/to/lexicon` and `/path/to/arpa`, respectively. Decoding results will be
|
159 |
+
saved at `/path/to/experiment/directory/decode/kenlm/test`.
|
160 |
+
|
161 |
+
```sh
|
162 |
+
$ python examples/speech_recognition/new/infer.py \
|
163 |
+
--config-dir /path/to/fairseq-py/examples/mr_hubert/config/decode \
|
164 |
+
--config-name infer_lm \
|
165 |
+
task.data=/path/to/data \
|
166 |
+
task.normalize=[true|false] \
|
167 |
+
decoding.exp_dir=/path/to/experiment/directory \
|
168 |
+
common_eval.path=/path/to/checkpoint \
|
169 |
+
dataset.gen_subset=test \
|
170 |
+
decoding.decoder.lexicon=/path/to/lexicon \
|
171 |
+
decoding.decoder.lmpath=/path/to/arpa
|
172 |
+
```
|
173 |
+
|
174 |
+
The command above uses the default decoding hyperparameters, which can be found
|
175 |
+
in `examples/speech_recognition/hydra/decoder.py`. These parameters can be
|
176 |
+
configured from the command line. For example, to search with a beam size of
|
177 |
+
500, we can append the command above with `decoding.decoder.beam=500`.
|
178 |
+
Important parameters include:
|
179 |
+
- decoding.decoder.beam
|
180 |
+
- decoding.decoder.beamthreshold
|
181 |
+
- decoding.decoder.lmweight
|
182 |
+
- decoding.decoder.wordscore
|
183 |
+
- decoding.decoder.silweight
|
184 |
+
|
185 |
+
To decode with a Fairseq LM, see the usage examples in the wav2vec 2.0 or HuBERT examples.
|
186 |
+
|
187 |
+
Please see the sample decoding script `decode.sh` for an example.
|
fairseq/examples/mr_hubert/config/decode/infer.yaml
ADDED
@@ -0,0 +1,30 @@
1 |
+
# @package _group_
|
2 |
+
|
3 |
+
defaults:
|
4 |
+
- model: null
|
5 |
+
|
6 |
+
hydra:
|
7 |
+
run:
|
8 |
+
dir: ${common_eval.results_path}/viterbi
|
9 |
+
sweep:
|
10 |
+
dir: ${common_eval.results_path}
|
11 |
+
subdir: viterbi
|
12 |
+
|
13 |
+
task:
|
14 |
+
_name: multires_hubert_pretraining
|
15 |
+
single_target: true
|
16 |
+
fine_tuning: true
|
17 |
+
label_rate_ratios: ???
|
18 |
+
data: ???
|
19 |
+
normalize: false
|
20 |
+
|
21 |
+
decoding:
|
22 |
+
type: viterbi
|
23 |
+
unique_wer_file: true
|
24 |
+
common_eval:
|
25 |
+
results_path: ???
|
26 |
+
path: ???
|
27 |
+
post_process: letter
|
28 |
+
dataset:
|
29 |
+
max_tokens: 1100000
|
30 |
+
gen_subset: ???
|
fairseq/examples/mr_hubert/config/decode/infer_lm.yaml
ADDED
@@ -0,0 +1,37 @@
1 |
+
# @package _group_
|
2 |
+
|
3 |
+
defaults:
|
4 |
+
- model: null
|
5 |
+
|
6 |
+
hydra:
|
7 |
+
run:
|
8 |
+
dir: ${common_eval.results_path}/beam${decoding.beam}_th${decoding.beamthreshold}_lmw${decoding.lmweight}_wrd${decoding.wordscore}_sil${decoding.silweight}
|
9 |
+
sweep:
|
10 |
+
dir: ${common_eval.results_path}
|
11 |
+
subdir: beam${decoding.beam}_th${decoding.beamthreshold}_lmw${decoding.lmweight}_wrd${decoding.wordscore}_sil${decoding.silweight}
|
12 |
+
|
13 |
+
task:
|
14 |
+
_name: multires_hubert_pretraining
|
15 |
+
single_target: true
|
16 |
+
fine_tuning: true
|
17 |
+
data: ???
|
18 |
+
label_rate_ratios: ???
|
19 |
+
normalize: ???
|
20 |
+
|
21 |
+
decoding:
|
22 |
+
type: kenlm
|
23 |
+
lexicon: ???
|
24 |
+
lmpath: ???
|
25 |
+
beamthreshold: 100
|
26 |
+
beam: 500
|
27 |
+
lmweight: 1.5
|
28 |
+
wordscore: -1
|
29 |
+
silweight: 0
|
30 |
+
unique_wer_file: true
|
31 |
+
common_eval:
|
32 |
+
results_path: ???
|
33 |
+
path: ???
|
34 |
+
post_process: letter
|
35 |
+
dataset:
|
36 |
+
max_tokens: 1100000
|
37 |
+
gen_subset: ???
|
fairseq/examples/mr_hubert/config/decode/run/submitit_slurm.yaml
ADDED
@@ -0,0 +1,17 @@
1 |
+
# @package _global_
|
2 |
+
hydra:
|
3 |
+
launcher:
|
4 |
+
cpus_per_task: ${distributed_training.distributed_world_size}
|
5 |
+
gpus_per_node: ${distributed_training.distributed_world_size}
|
6 |
+
tasks_per_node: ${hydra.launcher.gpus_per_node}
|
7 |
+
nodes: 1
|
8 |
+
mem_gb: 200
|
9 |
+
timeout_min: 4320
|
10 |
+
max_num_timeout: 50
|
11 |
+
name: ${hydra.job.config_name}
|
12 |
+
submitit_folder: ${hydra.sweep.dir}/submitit
|
13 |
+
|
14 |
+
distributed_training:
|
15 |
+
distributed_world_size: 1
|
16 |
+
distributed_no_spawn: true
|
17 |
+
distributed_port: 29761
|
fairseq/examples/mr_hubert/config/decode/run/submitit_slurm_8gpu.yaml
ADDED
@@ -0,0 +1,17 @@
1 |
+
# @package _global_
|
2 |
+
hydra:
|
3 |
+
launcher:
|
4 |
+
cpus_per_task: ${distributed_training.distributed_world_size}
|
5 |
+
gpus_per_node: ${distributed_training.distributed_world_size}
|
6 |
+
tasks_per_node: ${hydra.launcher.gpus_per_node}
|
7 |
+
nodes: 1
|
8 |
+
mem_gb: 200
|
9 |
+
timeout_min: 4320
|
10 |
+
max_num_timeout: 50
|
11 |
+
name: ${hydra.job.config_name}
|
12 |
+
submitit_folder: ${hydra.sweep.dir}/submitit
|
13 |
+
|
14 |
+
distributed_training:
|
15 |
+
distributed_world_size: 8
|
16 |
+
distributed_no_spawn: true
|
17 |
+
distributed_port: 29761
|
fairseq/examples/mr_hubert/config/finetune/base_100h.yaml
ADDED
@@ -0,0 +1,97 @@
1 |
+
# @package _group_
|
2 |
+
|
3 |
+
common:
|
4 |
+
fp16: true
|
5 |
+
log_format: json
|
6 |
+
log_interval: 200
|
7 |
+
tensorboard_logdir: tblog
|
8 |
+
seed: 1337
|
9 |
+
|
10 |
+
checkpoint:
|
11 |
+
no_epoch_checkpoints: true
|
12 |
+
best_checkpoint_metric: wer
|
13 |
+
|
14 |
+
distributed_training:
|
15 |
+
ddp_backend: c10d
|
16 |
+
find_unused_parameters: true
|
17 |
+
distributed_world_size: 8
|
18 |
+
distributed_port: 29671
|
19 |
+
nprocs_per_node: 8
|
20 |
+
|
21 |
+
task:
|
22 |
+
_name: multires_hubert_pretraining
|
23 |
+
data: ???
|
24 |
+
fine_tuning: true
|
25 |
+
label_dir: ???
|
26 |
+
label_rate_ratios: ???
|
27 |
+
normalize: false # must be consistent with pre-training
|
28 |
+
labels: ["ltr"]
|
29 |
+
single_target: true
|
30 |
+
|
31 |
+
dataset:
|
32 |
+
num_workers: 0
|
33 |
+
max_tokens: 3200000
|
34 |
+
validate_after_updates: ${model.freeze_finetune_updates}
|
35 |
+
validate_interval: 5
|
36 |
+
train_subset: train_100h
|
37 |
+
valid_subset: dev_other
|
38 |
+
|
39 |
+
criterion:
|
40 |
+
_name: ctc
|
41 |
+
zero_infinity: true
|
42 |
+
|
43 |
+
optimization:
|
44 |
+
max_update: 80000
|
45 |
+
lr: [3e-5]
|
46 |
+
sentence_avg: true
|
47 |
+
update_freq: [1]
|
48 |
+
|
49 |
+
optimizer:
|
50 |
+
_name: adam
|
51 |
+
adam_betas: (0.9,0.98)
|
52 |
+
adam_eps: 1e-08
|
53 |
+
|
54 |
+
lr_scheduler:
|
55 |
+
_name: tri_stage
|
56 |
+
phase_ratio: [0.1, 0.4, 0.5]
|
57 |
+
final_lr_scale: 0.05
|
58 |
+
|
59 |
+
model:
|
60 |
+
_name: multires_hubert_ctc
|
61 |
+
multires_hubert_path: ???
|
62 |
+
apply_mask: true
|
63 |
+
mask_selection: static
|
64 |
+
mask_length: 10
|
65 |
+
mask_other: 0
|
66 |
+
mask_prob: 0.75
|
67 |
+
mask_channel_selection: static
|
68 |
+
mask_channel_length: 64
|
69 |
+
mask_channel_other: 0
|
70 |
+
mask_channel_prob: 0.5
|
71 |
+
layerdrop: 0.1
|
72 |
+
dropout: 0.0
|
73 |
+
activation_dropout: 0.1
|
74 |
+
attention_dropout: 0.0
|
75 |
+
feature_grad_mult: 0.0
|
76 |
+
freeze_finetune_updates: 10000
|
77 |
+
|
78 |
+
hydra:
|
79 |
+
job:
|
80 |
+
config:
|
81 |
+
override_dirname:
|
82 |
+
kv_sep: '-'
|
83 |
+
item_sep: '__'
|
84 |
+
exclude_keys:
|
85 |
+
- run
|
86 |
+
- task.data
|
87 |
+
- task.label_dir
|
88 |
+
- model.multires_hubert_path
|
89 |
+
- dataset.train_subset
|
90 |
+
- dataset.valid_subset
|
91 |
+
- criterion.wer_kenlm_model
|
92 |
+
- criterion.wer_lexicon
|
93 |
+
run:
|
94 |
+
dir: ???
|
95 |
+
sweep:
|
96 |
+
dir: ???
|
97 |
+
subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
|
fairseq/examples/mr_hubert/config/finetune/base_100h_large.yaml
ADDED
@@ -0,0 +1,97 @@
1 |
+
# @package _group_
|
2 |
+
|
3 |
+
common:
|
4 |
+
fp16: true
|
5 |
+
log_format: json
|
6 |
+
log_interval: 200
|
7 |
+
tensorboard_logdir: tblog
|
8 |
+
seed: 1337
|
9 |
+
|
10 |
+
checkpoint:
|
11 |
+
no_epoch_checkpoints: true
|
12 |
+
best_checkpoint_metric: wer
|
13 |
+
|
14 |
+
distributed_training:
|
15 |
+
ddp_backend: c10d
|
16 |
+
find_unused_parameters: true
|
17 |
+
distributed_world_size: 8
|
18 |
+
distributed_port: 29671
|
19 |
+
nprocs_per_node: 8
|
20 |
+
|
21 |
+
task:
|
22 |
+
_name: multires_hubert_pretraining
|
23 |
+
data: ???
|
24 |
+
fine_tuning: true
|
25 |
+
label_dir: ???
|
26 |
+
label_rate_ratios: ???
|
27 |
+
normalize: true # must be consistent with pre-training
|
28 |
+
labels: ["ltr"]
|
29 |
+
single_target: true
|
30 |
+
|
31 |
+
dataset:
|
32 |
+
num_workers: 0
|
33 |
+
max_tokens: 1600000
|
34 |
+
validate_after_updates: ${model.freeze_finetune_updates}
|
35 |
+
validate_interval: 5
|
36 |
+
train_subset: train_100h
|
37 |
+
valid_subset: dev_other
|
38 |
+
|
39 |
+
criterion:
|
40 |
+
_name: ctc
|
41 |
+
zero_infinity: true
|
42 |
+
|
43 |
+
optimization:
|
44 |
+
max_update: 80000
|
45 |
+
lr: [3e-5]
|
46 |
+
sentence_avg: true
|
47 |
+
update_freq: [2]
|
48 |
+
|
49 |
+
optimizer:
|
50 |
+
_name: adam
|
51 |
+
adam_betas: (0.9,0.98)
|
52 |
+
adam_eps: 1e-08
|
53 |
+
|
54 |
+
lr_scheduler:
|
55 |
+
_name: tri_stage
|
56 |
+
phase_ratio: [0.1, 0.4, 0.5]
|
57 |
+
final_lr_scale: 0.05
|
58 |
+
|
59 |
+
model:
|
60 |
+
_name: multires_hubert_ctc
|
61 |
+
multires_hubert_path: ???
|
62 |
+
apply_mask: true
|
63 |
+
mask_selection: static
|
64 |
+
mask_length: 10
|
65 |
+
mask_other: 0
|
66 |
+
mask_prob: 0.75
|
67 |
+
mask_channel_selection: static
|
68 |
+
mask_channel_length: 64
|
69 |
+
mask_channel_other: 0
|
70 |
+
mask_channel_prob: 0.5
|
71 |
+
layerdrop: 0.1
|
72 |
+
dropout: 0.0
|
73 |
+
activation_dropout: 0.1
|
74 |
+
attention_dropout: 0.0
|
75 |
+
feature_grad_mult: 0.0
|
76 |
+
freeze_finetune_updates: 10000
|
77 |
+
|
78 |
+
hydra:
|
79 |
+
job:
|
80 |
+
config:
|
81 |
+
override_dirname:
|
82 |
+
kv_sep: '-'
|
83 |
+
item_sep: '__'
|
84 |
+
exclude_keys:
|
85 |
+
- run
|
86 |
+
- task.data
|
87 |
+
- task.label_dir
|
88 |
+
- model.multires_hubert_path
|
89 |
+
- dataset.train_subset
|
90 |
+
- dataset.valid_subset
|
91 |
+
- criterion.wer_kenlm_model
|
92 |
+
- criterion.wer_lexicon
|
93 |
+
run:
|
94 |
+
dir: ???
|
95 |
+
sweep:
|
96 |
+
dir: ???
|
97 |
+
subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
|
fairseq/examples/mr_hubert/config/finetune/base_10h.yaml
ADDED
@@ -0,0 +1,101 @@
1 |
+
# @package _group_
|
2 |
+
|
3 |
+
common:
|
4 |
+
fp16: true
|
5 |
+
log_format: json
|
6 |
+
log_interval: 200
|
7 |
+
tensorboard_logdir: tblog
|
8 |
+
seed: 1337
|
9 |
+
|
10 |
+
checkpoint:
|
11 |
+
save_interval: 5
|
12 |
+
keep_interval_updates: 1
|
13 |
+
no_epoch_checkpoints: true
|
14 |
+
best_checkpoint_metric: wer
|
15 |
+
|
16 |
+
distributed_training:
|
17 |
+
ddp_backend: c10d
|
18 |
+
find_unused_parameters: true
|
19 |
+
distributed_world_size: 8
|
20 |
+
distributed_port: 29671
|
21 |
+
nprocs_per_node: 8
|
22 |
+
|
23 |
+
task:
|
24 |
+
_name: multires_hubert_pretraining
|
25 |
+
data: ???
|
26 |
+
fine_tuning: true
|
27 |
+
label_dir: ???
|
28 |
+
label_rate_ratios: ???
|
29 |
+
normalize: false # must be consistent with pre-training
|
30 |
+
labels: ["ltr"]
|
31 |
+
single_target: true
|
32 |
+
|
33 |
+
dataset:
|
34 |
+
num_workers: 0
|
35 |
+
max_tokens: 3200000
|
36 |
+
validate_after_updates: ${model.freeze_finetune_updates}
|
37 |
+
validate_interval: 5
|
38 |
+
train_subset: train_10h
|
39 |
+
valid_subset: dev
|
40 |
+
|
41 |
+
criterion:
|
42 |
+
_name: ctc
|
43 |
+
zero_infinity: true
|
44 |
+
|
45 |
+
optimization:
|
46 |
+
max_update: 25000
|
47 |
+
lr: [2e-5]
|
48 |
+
sentence_avg: true
|
49 |
+
update_freq: [1]
|
50 |
+
|
51 |
+
optimizer:
|
52 |
+
_name: adam
|
53 |
+
adam_betas: (0.9,0.98)
|
54 |
+
adam_eps: 1e-08
|
55 |
+
|
56 |
+
lr_scheduler:
|
57 |
+
_name: tri_stage
|
58 |
+
warmup_steps: 8000
|
59 |
+
hold_steps: 0
|
60 |
+
decay_steps: 72000
|
61 |
+
final_lr_scale: 0.05
|
62 |
+
|
63 |
+
model:
|
64 |
+
_name: multires_hubert_ctc
|
65 |
+
multires_hubert_path: ???
|
66 |
+
apply_mask: true
|
67 |
+
mask_selection: static
|
68 |
+
mask_length: 10
|
69 |
+
mask_other: 0
|
70 |
+
mask_prob: 0.75
|
71 |
+
mask_channel_selection: static
|
72 |
+
mask_channel_length: 64
|
73 |
+
mask_channel_other: 0
|
74 |
+
mask_channel_prob: 0.5
|
75 |
+
layerdrop: 0.1
|
76 |
+
dropout: 0.0
|
77 |
+
activation_dropout: 0.1
|
78 |
+
attention_dropout: 0.0
|
79 |
+
feature_grad_mult: 0.0
|
80 |
+
freeze_finetune_updates: 10000
|
81 |
+
|
82 |
+
hydra:
|
83 |
+
job:
|
84 |
+
config:
|
85 |
+
override_dirname:
|
86 |
+
kv_sep: '-'
|
87 |
+
item_sep: '__'
|
88 |
+
exclude_keys:
|
89 |
+
- run
|
90 |
+
- task.data
|
91 |
+
- task.label_dir
|
92 |
+
- model.multires_hubert_path
|
93 |
+
- dataset.train_subset
|
94 |
+
- dataset.valid_subset
|
95 |
+
- criterion.wer_kenlm_model
|
96 |
+
- criterion.wer_lexicon
|
97 |
+
run:
|
98 |
+
dir: ???
|
99 |
+
sweep:
|
100 |
+
dir: ???
|
101 |
+
subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
|
fairseq/examples/mr_hubert/config/finetune/base_10h_large.yaml
ADDED
@@ -0,0 +1,101 @@
1 |
+
# @package _group_
|
2 |
+
|
3 |
+
common:
|
4 |
+
fp16: true
|
5 |
+
log_format: json
|
6 |
+
log_interval: 200
|
7 |
+
tensorboard_logdir: tblog
|
8 |
+
seed: 1337
|
9 |
+
|
10 |
+
checkpoint:
|
11 |
+
save_interval: 5
|
12 |
+
keep_interval_updates: 1
|
13 |
+
no_epoch_checkpoints: true
|
14 |
+
best_checkpoint_metric: wer
|
15 |
+
|
16 |
+
distributed_training:
|
17 |
+
ddp_backend: c10d
|
18 |
+
find_unused_parameters: true
|
19 |
+
distributed_world_size: 8
|
20 |
+
distributed_port: 29671
|
21 |
+
nprocs_per_node: 8
|
22 |
+
|
23 |
+
task:
|
24 |
+
_name: multires_hubert_pretraining
|
25 |
+
data: ???
|
26 |
+
fine_tuning: true
|
27 |
+
label_dir: ???
|
28 |
+
label_rate_ratios: ???
|
29 |
+
normalize: true # must be consistent with pre-training
|
30 |
+
labels: ["ltr"]
|
31 |
+
single_target: true
|
32 |
+
|
33 |
+
dataset:
|
34 |
+
num_workers: 0
|
35 |
+
max_tokens: 3200000
|
36 |
+
validate_after_updates: ${model.freeze_finetune_updates}
|
37 |
+
validate_interval: 5
|
38 |
+
train_subset: train_10h
|
39 |
+
valid_subset: dev
|
40 |
+
|
41 |
+
criterion:
|
42 |
+
_name: ctc
|
43 |
+
zero_infinity: true
|
44 |
+
|
45 |
+
optimization:
|
46 |
+
max_update: 25000
|
47 |
+
lr: [2e-5]
|
48 |
+
sentence_avg: true
|
49 |
+
update_freq: [1]
|
50 |
+
|
51 |
+
optimizer:
|
52 |
+
_name: adam
|
53 |
+
adam_betas: (0.9,0.98)
|
54 |
+
adam_eps: 1e-08
|
55 |
+
|
56 |
+
lr_scheduler:
|
57 |
+
_name: tri_stage
|
58 |
+
warmup_steps: 8000
|
59 |
+
hold_steps: 0
|
60 |
+
decay_steps: 72000
|
61 |
+
final_lr_scale: 0.05
|
62 |
+
|
63 |
+
model:
|
64 |
+
_name: multires_hubert_ctc
|
65 |
+
multires_hubert_path: ???
|
66 |
+
apply_mask: true
|
67 |
+
mask_selection: static
|
68 |
+
mask_length: 10
|
69 |
+
mask_other: 0
|
70 |
+
mask_prob: 0.75
|
71 |
+
mask_channel_selection: static
|
72 |
+
mask_channel_length: 64
|
73 |
+
mask_channel_other: 0
|
74 |
+
mask_channel_prob: 0.5
|
75 |
+
layerdrop: 0.1
|
76 |
+
dropout: 0.0
|
77 |
+
activation_dropout: 0.1
|
78 |
+
attention_dropout: 0.0
|
79 |
+
feature_grad_mult: 0.0
|
80 |
+
freeze_finetune_updates: 10000
|
81 |
+
|
82 |
+
hydra:
|
83 |
+
job:
|
84 |
+
config:
|
85 |
+
override_dirname:
|
86 |
+
kv_sep: '-'
|
87 |
+
item_sep: '__'
|
88 |
+
exclude_keys:
|
89 |
+
- run
|
90 |
+
- task.data
|
91 |
+
- task.label_dir
|
92 |
+
- model.multires_hubert_path
|
93 |
+
- dataset.train_subset
|
94 |
+
- dataset.valid_subset
|
95 |
+
- criterion.wer_kenlm_model
|
96 |
+
- criterion.wer_lexicon
|
97 |
+
run:
|
98 |
+
dir: ???
|
99 |
+
sweep:
|
100 |
+
dir: ???
|
101 |
+
subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
|
fairseq/examples/mr_hubert/config/finetune/base_1h.yaml
ADDED
@@ -0,0 +1,100 @@
1 |
+
# @package _group_
|
2 |
+
|
3 |
+
common:
|
4 |
+
fp16: true
|
5 |
+
log_format: json
|
6 |
+
log_interval: 200
|
7 |
+
tensorboard_logdir: tblog
|
8 |
+
seed: 1337
|
9 |
+
|
10 |
+
checkpoint:
|
11 |
+
save_interval: 50
|
12 |
+
keep_interval_updates: 1
|
13 |
+
save_interval_updates: 1000
|
14 |
+
no_epoch_checkpoints: true
|
15 |
+
best_checkpoint_metric: wer
|
16 |
+
|
17 |
+
distributed_training:
|
18 |
+
ddp_backend: c10d
|
19 |
+
find_unused_parameters: true
|
20 |
+
distributed_world_size: 8
|
21 |
+
distributed_port: 29671
|
22 |
+
nprocs_per_node: 8
|
23 |
+
|
24 |
+
task:
|
25 |
+
_name: multires_hubert_pretraining
|
26 |
+
data: ???
|
27 |
+
fine_tuning: true
|
28 |
+
label_dir: ???
|
29 |
+
label_rate_ratios: ???
|
30 |
+
normalize: false # must be consistent with pre-training
|
31 |
+
labels: ["ltr"]
|
32 |
+
single_target: true
|
33 |
+
|
34 |
+
dataset:
|
35 |
+
num_workers: 0
|
36 |
+
max_tokens: 3200000
|
37 |
+
validate_after_updates: ${model.freeze_finetune_updates}
|
38 |
+
validate_interval: 1000
|
39 |
+
train_subset: train_1h
|
40 |
+
valid_subset: dev_other
|
41 |
+
|
42 |
+
criterion:
|
43 |
+
_name: ctc
|
44 |
+
zero_infinity: true
|
45 |
+
|
46 |
+
optimization:
|
47 |
+
max_update: 13000
|
48 |
+
lr: [5e-5]
|
49 |
+
sentence_avg: true
|
50 |
+
update_freq: [4]
|
51 |
+
|
52 |
+
optimizer:
|
53 |
+
_name: adam
|
54 |
+
adam_betas: (0.9,0.98)
|
55 |
+
adam_eps: 1e-08
|
56 |
+
|
57 |
+
lr_scheduler:
|
58 |
+
_name: tri_stage
|
59 |
+
phase_ratio: [0.1, 0.4, 0.5]
|
60 |
+
final_lr_scale: 0.05
|
61 |
+
|
62 |
+
model:
|
63 |
+
_name: multires_hubert_ctc
|
64 |
+
multires_hubert_path: ???
|
65 |
+
apply_mask: true
|
66 |
+
mask_selection: static
|
67 |
+
mask_length: 10
|
68 |
+
mask_other: 0
|
69 |
+
mask_prob: 0.75
|
70 |
+
mask_channel_selection: static
|
71 |
+
mask_channel_length: 64
|
72 |
+
mask_channel_other: 0
|
73 |
+
mask_channel_prob: 0.5
|
74 |
+
layerdrop: 0.1
|
75 |
+
dropout: 0.0
|
76 |
+
activation_dropout: 0.1
|
77 |
+
attention_dropout: 0.0
|
78 |
+
feature_grad_mult: 0.0
|
79 |
+
freeze_finetune_updates: 10000
|
80 |
+
|
81 |
+
hydra:
|
82 |
+
job:
|
83 |
+
config:
|
84 |
+
override_dirname:
|
85 |
+
kv_sep: '-'
|
86 |
+
item_sep: '__'
|
87 |
+
exclude_keys:
|
88 |
+
- run
|
89 |
+
- task.data
|
90 |
+
- task.label_dir
|
91 |
+
- model.multires_hubert_path
|
92 |
+
- dataset.train_subset
|
93 |
+
- dataset.valid_subset
|
94 |
+
- criterion.wer_kenlm_model
|
95 |
+
- criterion.wer_lexicon
|
96 |
+
run:
|
97 |
+
dir: ???
|
98 |
+
sweep:
|
99 |
+
dir: ???
|
100 |
+
subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
|
fairseq/examples/mr_hubert/config/finetune/base_1h_large.yaml
ADDED
@@ -0,0 +1,99 @@
1 |
+
# @package _group_
|
2 |
+
|
3 |
+
common:
|
4 |
+
fp16: true
|
5 |
+
log_format: json
|
6 |
+
log_interval: 200
|
7 |
+
tensorboard_logdir: tblog
|
8 |
+
seed: 1337
|
9 |
+
|
10 |
+
checkpoint:
|
11 |
+
save_interval: 1000
|
12 |
+
keep_interval_updates: 1
|
13 |
+
no_epoch_checkpoints: true
|
14 |
+
best_checkpoint_metric: wer
|
15 |
+
|
16 |
+
distributed_training:
|
17 |
+
ddp_backend: c10d
|
18 |
+
find_unused_parameters: true
|
19 |
+
distributed_world_size: 8
|
20 |
+
distributed_port: 29671
|
21 |
+
nprocs_per_node: 8
|
22 |
+
|
23 |
+
task:
|
24 |
+
_name: multires_hubert_pretraining
|
25 |
+
data: ???
|
26 |
+
fine_tuning: true
|
27 |
+
label_dir: ???
|
28 |
+
label_rate_ratios: ???
|
29 |
+
normalize: true # must be consistent with pre-training
|
30 |
+
labels: ["ltr"]
|
31 |
+
single_target: true
|
32 |
+
|
33 |
+
dataset:
|
34 |
+
num_workers: 0
|
35 |
+
max_tokens: 1280000
|
36 |
+
validate_after_updates: ${model.freeze_finetune_updates}
|
37 |
+
validate_interval: 5
|
38 |
+
train_subset: train_10h
|
39 |
+
valid_subset: dev
|
40 |
+
|
41 |
+
criterion:
|
42 |
+
_name: ctc
|
43 |
+
zero_infinity: true
|
44 |
+
|
45 |
+
optimization:
|
46 |
+
max_update: 25000
|
47 |
+
lr: [3e-4]
|
48 |
+
sentence_avg: true
|
49 |
+
update_freq: [5]
|
50 |
+
|
51 |
+
optimizer:
|
52 |
+
_name: adam
|
53 |
+
adam_betas: (0.9,0.98)
|
54 |
+
adam_eps: 1e-08
|
55 |
+
|
56 |
+
lr_scheduler:
|
57 |
+
_name: tri_stage
|
58 |
+
phase_ratio: [0.1, 0.4, 0.5]
|
59 |
+
final_lr_scale: 0.05
|
60 |
+
|
61 |
+
model:
|
62 |
+
_name: multires_hubert_ctc
|
63 |
+
multires_hubert_path: ???
|
64 |
+
apply_mask: true
|
65 |
+
mask_selection: static
|
66 |
+
mask_length: 10
|
67 |
+
mask_other: 0
|
68 |
+
mask_prob: 0.75
|
69 |
+
mask_channel_selection: static
|
70 |
+
mask_channel_length: 64
|
71 |
+
mask_channel_other: 0
|
72 |
+
mask_channel_prob: 0.5
|
73 |
+
layerdrop: 0.1
|
74 |
+
dropout: 0.0
|
75 |
+
activation_dropout: 0.1
|
76 |
+
attention_dropout: 0.0
|
77 |
+
feature_grad_mult: 0.0
|
78 |
+
freeze_finetune_updates: 10000
|
79 |
+
|
80 |
+
hydra:
|
81 |
+
job:
|
82 |
+
config:
|
83 |
+
override_dirname:
|
84 |
+
kv_sep: '-'
|
85 |
+
item_sep: '__'
|
86 |
+
exclude_keys:
|
87 |
+
- run
|
88 |
+
- task.data
|
89 |
+
- task.label_dir
|
90 |
+
- model.multires_hubert_path
|
91 |
+
- dataset.train_subset
|
92 |
+
- dataset.valid_subset
|
93 |
+
- criterion.wer_kenlm_model
|
94 |
+
- criterion.wer_lexicon
|
95 |
+
run:
|
96 |
+
dir: ???
|
97 |
+
sweep:
|
98 |
+
dir: ???
|
99 |
+
subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
|
fairseq/examples/mr_hubert/config/pretrain/mrhubert_base_librispeech.yaml
ADDED
@@ -0,0 +1,103 @@
1 |
+
# @package _group_
|
2 |
+
|
3 |
+
common:
|
4 |
+
fp16: true
|
5 |
+
log_format: json
|
6 |
+
log_interval: 200
|
7 |
+
seed: 1337
|
8 |
+
tensorboard_logdir: tblog
|
9 |
+
min_loss_scale: 1e-8
|
10 |
+
|
11 |
+
checkpoint:
|
12 |
+
save_interval_updates: 25000
|
13 |
+
keep_interval_updates: 1
|
14 |
+
no_epoch_checkpoints: true
|
15 |
+
|
16 |
+
distributed_training:
|
17 |
+
ddp_backend: no_c10d
|
18 |
+
distributed_backend: 'nccl'
|
19 |
+
distributed_world_size: 32
|
20 |
+
distributed_port: 29671
|
21 |
+
nprocs_per_node: 8
|
22 |
+
find_unused_parameters: true
|
23 |
+
|
24 |
+
task:
|
25 |
+
_name: multires_hubert_pretraining
|
26 |
+
data: ???
|
27 |
+
label_dir: ???
|
28 |
+
labels: ???
|
29 |
+
label_rate: ${model.label_rate}
|
30 |
+
label_rate_ratios: ???
|
31 |
+
sample_rate: 16000
|
32 |
+
max_sample_size: 250000
|
33 |
+
min_sample_size: 32000
|
34 |
+
pad_audio: false
|
35 |
+
random_crop: true
|
36 |
+
normalize: false # must be consistent with extractor
|
37 |
+
# max_keep_size: 300000
|
38 |
+
# max_keep_size: 50000
|
39 |
+
|
40 |
+
|
41 |
+
dataset:
|
42 |
+
num_workers: 0
|
43 |
+
max_tokens: 1000000
|
44 |
+
skip_invalid_size_inputs_valid_test: true
|
45 |
+
validate_interval: 5
|
46 |
+
validate_interval_updates: 10000
|
47 |
+
|
48 |
+
criterion:
|
49 |
+
_name: hubert
|
50 |
+
pred_masked_weight: 1.0
|
51 |
+
pred_nomask_weight: 0.0
|
52 |
+
loss_weights: [10,]
|
53 |
+
|
54 |
+
optimization:
|
55 |
+
max_update: 400000
|
56 |
+
lr: [0.0005]
|
57 |
+
clip_norm: 10.0
|
58 |
+
|
59 |
+
optimizer:
|
60 |
+
_name: adam
|
61 |
+
adam_betas: (0.9,0.98)
|
62 |
+
adam_eps: 1e-06
|
63 |
+
weight_decay: 0.01
|
64 |
+
|
65 |
+
lr_scheduler:
|
66 |
+
_name: polynomial_decay
|
67 |
+
warmup_updates: 32000
|
68 |
+
|
69 |
+
model:
|
70 |
+
_name: multires_hubert
|
71 |
+
label_rate: ???
|
72 |
+
label_rate_ratios: ${task.label_rate_ratios}
|
73 |
+
skip_masked: false
|
74 |
+
skip_nomask: false
|
75 |
+
mask_prob: 0.80
|
76 |
+
extractor_mode: default
|
77 |
+
conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
|
78 |
+
final_dim: 256
|
79 |
+
encoder_layers: 4
|
80 |
+
encoder_layerdrop: 0.05
|
81 |
+
dropout_input: 0.1
|
82 |
+
dropout_features: 0.1
|
83 |
+
dropout: 0.1
|
84 |
+
attention_dropout: 0.1
|
85 |
+
feature_grad_mult: 0.1
|
86 |
+
untie_final_proj: true
|
87 |
+
activation_dropout: 0.0
|
88 |
+
conv_adapator_kernal: 1
|
89 |
+
use_single_target: true
|
90 |
+
|
91 |
+
hydra:
|
92 |
+
job:
|
93 |
+
config:
|
94 |
+
override_dirname:
|
95 |
+
kv_sep: '-'
|
96 |
+
item_sep: '/'
|
97 |
+
exclude_keys:
|
98 |
+
- run
|
99 |
+
- task.data
|
100 |
+
- task.label_dir
|
101 |
+
- common.min_loss_scale
|
102 |
+
- common.log_interval
|
103 |
+
- optimization.clip_norm
|
fairseq/examples/mr_hubert/config/pretrain/mrhubert_large_librilight.yaml
ADDED
@@ -0,0 +1,107 @@
1 |
+
# @package _group_
|
2 |
+
|
3 |
+
common:
|
4 |
+
memory_efficient_fp16: true
|
5 |
+
log_format: json
|
6 |
+
log_interval: 200
|
7 |
+
seed: 1337
|
8 |
+
tensorboard_logdir: tblog
|
9 |
+
|
10 |
+
checkpoint:
|
11 |
+
save_interval_updates: 25000
|
12 |
+
keep_interval_updates: 1
|
13 |
+
no_epoch_checkpoints: true
|
14 |
+
|
15 |
+
|
16 |
+
distributed_training:
|
17 |
+
ddp_backend: no_c10d
|
18 |
+
distributed_backend: 'nccl'
|
19 |
+
distributed_world_size: 128
|
20 |
+
distributed_port: 29671
|
21 |
+
nprocs_per_node: 8
|
22 |
+
find_unused_parameters: true
|
23 |
+
|
24 |
+
task:
|
25 |
+
_name: multires_hubert_pretraining
|
26 |
+
data: ???
|
27 |
+
label_dir: ???
|
28 |
+
labels: ???
|
29 |
+
label_rate: ${model.label_rate}
|
30 |
+
label_rate_ratios: ???
|
31 |
+
sample_rate: 16000
|
32 |
+
max_sample_size: 250000
|
33 |
+
min_sample_size: 32000
|
34 |
+
pad_audio: false
|
35 |
+
random_crop: true
|
36 |
+
normalize: true # must be consistent with extractor
|
37 |
+
# max_keep_size: 50000
|
38 |
+
|
39 |
+
dataset:
|
40 |
+
num_workers: 0
|
41 |
+
max_tokens: 300000
|
42 |
+
skip_invalid_size_inputs_valid_test: true
|
43 |
+
validate_interval: 5
|
44 |
+
validate_interval_updates: 10000
|
45 |
+
|
46 |
+
criterion:
|
47 |
+
_name: hubert
|
48 |
+
pred_masked_weight: 1.0
|
49 |
+
pred_nomask_weight: 0.0
|
50 |
+
loss_weights: [10,]
|
51 |
+
|
52 |
+
optimization:
|
53 |
+
max_update: 400000
|
54 |
+
lr: [0.0015]
|
55 |
+
clip_norm: 1.0
|
56 |
+
update_freq: [3]
|
57 |
+
|
58 |
+
optimizer:
|
59 |
+
_name: adam
|
60 |
+
adam_betas: (0.9,0.98)
|
61 |
+
adam_eps: 1e-06
|
62 |
+
weight_decay: 0.01
|
63 |
+
|
64 |
+
lr_scheduler:
|
65 |
+
_name: polynomial_decay
|
66 |
+
warmup_updates: 32000
|
67 |
+
|
68 |
+
model:
|
69 |
+
_name: multires_hubert
|
70 |
+
label_rate: ???
|
71 |
+
label_rate_ratios: ${task.label_rate_ratios}
|
72 |
+
encoder_layers: 8
|
73 |
+
encoder_embed_dim: 1024
|
74 |
+
encoder_ffn_embed_dim: 4096
|
75 |
+
encoder_attention_heads: 16
|
76 |
+
final_dim: 768
|
77 |
+
skip_masked: false
|
78 |
+
skip_nomask: false
|
79 |
+
mask_prob: 0.80
|
80 |
+
extractor_mode: layer_norm
|
81 |
+
conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
|
82 |
+
encoder_layerdrop: 0.0
|
83 |
+
dropout_input: 0.0
|
84 |
+
dropout_features: 0.0
|
85 |
+
dropout: 0.0
|
86 |
+
attention_dropout: 0.0
|
87 |
+
layer_norm_first: true
|
88 |
+
feature_grad_mult: 1.0
|
89 |
+
untie_final_proj: true
|
90 |
+
activation_dropout: 0.0
|
91 |
+
conv_adapator_kernal: 1
|
92 |
+
use_single_target: true
|
93 |
+
|
94 |
+
hydra:
|
95 |
+
job:
|
96 |
+
config:
|
97 |
+
override_dirname:
|
98 |
+
kv_sep: '-'
|
99 |
+
item_sep: '__'
|
100 |
+
exclude_keys:
|
101 |
+
- run
|
102 |
+
- task.data
|
103 |
+
run:
|
104 |
+
dir: /checkpoint/wnhsu/w2v/hubert_final/hydra_pt
|
105 |
+
sweep:
|
106 |
+
dir: /checkpoint/wnhsu/w2v/hubert_final/hydra_pt
|
107 |
+
subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
|
fairseq/examples/mr_hubert/config/pretrain/run/submitit_reg.yaml
ADDED
@@ -0,0 +1,20 @@
1 |
+
# @package _global_
|
2 |
+
|
3 |
+
hydra:
|
4 |
+
launcher:
|
5 |
+
cpus_per_task: 8
|
6 |
+
gpus_per_node: 8
|
7 |
+
tasks_per_node: ${hydra.launcher.gpus_per_node}
|
8 |
+
nodes: 4
|
9 |
+
comment: null
|
10 |
+
mem_gb: 384
|
11 |
+
timeout_min: 4320
|
12 |
+
max_num_timeout: 100
|
13 |
+
constraint: volta32gb
|
14 |
+
name: ${hydra.job.config_name}/${hydra.job.override_dirname}
|
15 |
+
submitit_folder: ${hydra.sweep.dir}/submitit/%j
|
16 |
+
|
17 |
+
distributed_training:
|
18 |
+
distributed_world_size: 32
|
19 |
+
distributed_port: 29671
|
20 |
+
nprocs_per_node: 8
|
fairseq/examples/mr_hubert/train.sh
ADDED
@@ -0,0 +1,45 @@
1 |
+
#!/bin/bash
|
2 |
+
|
3 |
+
FAIRSEQ= # Setup your fairseq directory
|
4 |
+
|
5 |
+
config_dir=${FAIRSEQ}/examples/mr_hubert/config
|
6 |
+
config_name=mrhubert_base_librispeech
|
7 |
+
|
8 |
+
# Prepared Data Directory
|
9 |
+
data_dir=librispeech
|
10 |
+
# -- data_dir
|
11 |
+
# -- train.tsv
|
12 |
+
# -- valid.tsv
|
13 |
+
|
14 |
+
label_dir=labels
|
15 |
+
# -- label_dir
|
16 |
+
# -- train.km
|
17 |
+
# -- valid.km
|
18 |
+
# -- dict.km.txt
|
19 |
+
|
20 |
+
|
21 |
+
exp_dir=exp # Target experiments directory
|
22 |
+
ratios="[1, 2]" # Default label rate ratios
|
23 |
+
label_rate=50 # Base label rate
|
24 |
+
|
25 |
+
|
26 |
+
_opts=
|
27 |
+
|
28 |
+
# If use slurm, uncomment this line and modify the job submission at
|
29 |
+
# _opts="${_opts} hydra/launcher=submitit_slurm +hydra.launcher.partition=${your_slurm_partition} +run=submitit_reg"
|
30 |
+
|
31 |
+
# If want to set additional experiment tag, uncomment this line
|
32 |
+
# _opts="${_opts} hydra.sweep.subdir=${your_experiment_tag}"
|
33 |
+
|
34 |
+
|
35 |
+
python ${FAIRSEQ}/fairseq_cli/hydra_train.py \
|
36 |
+
-m --config-dir ${config_dir} --config-name ${config_name} ${_opts} \
|
37 |
+
task.data=${data_dir} \
|
38 |
+
task.label_dir=${label_dir} \
|
39 |
+
task.labels='["km"]' \
|
40 |
+
model.label_rate=${label_rate} \
|
41 |
+
task.label_rate_ratios="${ratios}" \
|
42 |
+
hydra.sweep.dir=${exp_dir} &
|
43 |
+
|
44 |
+
|
45 |
+
|
fairseq/examples/multilingual/ML50_langs.txt
ADDED
@@ -0,0 +1,52 @@
1 |
+
ar_AR
|
2 |
+
cs_CZ
|
3 |
+
de_DE
|
4 |
+
en_XX
|
5 |
+
es_XX
|
6 |
+
et_EE
|
7 |
+
fi_FI
|
8 |
+
fr_XX
|
9 |
+
gu_IN
|
10 |
+
hi_IN
|
11 |
+
it_IT
|
12 |
+
ja_XX
|
13 |
+
kk_KZ
|
14 |
+
ko_KR
|
15 |
+
lt_LT
|
16 |
+
lv_LV
|
17 |
+
my_MM
|
18 |
+
ne_NP
|
19 |
+
nl_XX
|
20 |
+
ro_RO
|
21 |
+
ru_RU
|
22 |
+
si_LK
|
23 |
+
tr_TR
|
24 |
+
vi_VN
|
25 |
+
zh_CN
|
26 |
+
af_ZA
|
27 |
+
az_AZ
|
28 |
+
bn_IN
|
29 |
+
fa_IR
|
30 |
+
he_IL
|
31 |
+
hr_HR
|
32 |
+
id_ID
|
33 |
+
ka_GE
|
34 |
+
km_KH
|
35 |
+
mk_MK
|
36 |
+
ml_IN
|
37 |
+
mn_MN
|
38 |
+
mr_IN
|
39 |
+
pl_PL
|
40 |
+
ps_AF
|
41 |
+
pt_XX
|
42 |
+
sv_SE
|
43 |
+
sw_KE
|
44 |
+
ta_IN
|
45 |
+
te_IN
|
46 |
+
th_TH
|
47 |
+
tl_XX
|
48 |
+
uk_UA
|
49 |
+
ur_PK
|
50 |
+
xh_ZA
|
51 |
+
gl_ES
|
52 |
+
sl_SI
|
fairseq/examples/multilingual/README.md
ADDED
@@ -0,0 +1,158 @@
1 |
+
# Multilingual Translation
|
2 |
+
|
3 |
+
[[Multilingual Translation with Extensible Multilingual Pretraining and Finetuning, https://arxiv.org/abs/2008.00401]](https://arxiv.org/abs/2008.00401)
|
4 |
+
|
5 |
+
## Introduction
|
6 |
+
|
7 |
+
This work is for training multilingual translation models with multiple bitext datasets. The framework supports the following (see the [[training section]](#Training) and [[finetuning section]](#Finetuning) for examples):
|
8 |
+
|
9 |
+
* temperature-based sampling over unbalanced datasets of different translation directions (see the sampling sketch after this list)
  - `--sampling-method` with choices=['uniform', 'temperature', 'concat']
|
12 |
+
- --sampling-temperature
|
13 |
+
* configurable to automatically add source and/or target language tokens to source/target sentences, using data prepared in the same way as for bilingual training
|
14 |
+
- --encoder-langtok with choices=['src', 'tgt', None] to specify whether to add source or target language tokens to the source sentences
|
15 |
+
- --decoder-langtok (binary option) to specify whether to add target language tokens to the target sentences or not
|
16 |
+
* finetuning mBART pretrained models for multilingual translation
|
17 |
+
- --finetune-from-model to specify the path from which to load the pretrained model
|
18 |
+
|
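As a sketch of what temperature-based sampling does (this mirrors the usual formula and is not the fairseq implementation itself), a direction with dataset size n_i is sampled with probability proportional to (n_i / Σ_j n_j)^(1/T): T=1 reproduces proportional sampling, and larger T flattens the distribution toward uniform, upsampling low-resource directions.

```python
def temperature_sampling_probs(sizes, temperature=1.5):
    """Illustrative sampling weights for unbalanced translation directions."""
    total = sum(sizes)
    weights = [(n / total) ** (1.0 / temperature) for n in sizes]
    z = sum(weights)
    return [w / z for w in weights]

# e.g. two high-resource directions with 10M pairs each and one with only 100k pairs
print(temperature_sampling_probs([10_000_000, 10_000_000, 100_000], temperature=1.5))
# temperature=1.0 keeps the raw proportions; higher temperatures move toward uniform sampling
```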
19 |
+
## Preprocessing data
|
20 |
+
Multilingual training requires a joint BPE vocab. Please follow [mBART's preprocessing steps](https://github.com/pytorch/fairseq/tree/main/examples/mbart#bpe-data) to reuse our pretrained sentence-piece model.
|
21 |
+
|
22 |
+
You can also train a joint BPE model on your own dataset and then follow the steps in [[link]](https://github.com/pytorch/fairseq/tree/main/examples/translation#multilingual-translation).
|
23 |
+
|
24 |
+
## Training
|
25 |
+
|
26 |
+
|
27 |
+
```bash
|
28 |
+
lang_pairs=<language pairs to be trained, e.g. "en-cs,cs-en">
|
29 |
+
path_2_data=<set to data path>
|
30 |
+
lang_list=<a file which contains a list of languages separated by new lines>
|
31 |
+
|
32 |
+
fairseq-train $path_2_data \
|
33 |
+
--encoder-normalize-before --decoder-normalize-before \
|
34 |
+
--arch transformer --layernorm-embedding \
|
35 |
+
--task translation_multi_simple_epoch \
|
36 |
+
--sampling-method "temperature" \
|
37 |
+
--sampling-temperature 1.5 \
|
38 |
+
--encoder-langtok "src" \
|
39 |
+
--decoder-langtok \
|
40 |
+
--lang-dict "$lang_list" \
|
41 |
+
--lang-pairs "$lang_pairs" \
|
42 |
+
--criterion label_smoothed_cross_entropy --label-smoothing 0.2 \
|
43 |
+
--optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \
|
44 |
+
--lr-scheduler inverse_sqrt --lr 3e-05 --warmup-updates 2500 --max-update 40000 \
|
45 |
+
--dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \
|
46 |
+
--max-tokens 1024 --update-freq 2 \
|
47 |
+
--save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \
|
48 |
+
--seed 222 --log-format simple --log-interval 2
|
49 |
+
```
|
50 |
+
|
51 |
+
## Finetuning
|
52 |
+
We can also finetune multilingual translation models from a pretrained model such as [mBART](https://github.com/pytorch/fairseq/tree/main/examples/mbart).
|
53 |
+
```bash
|
54 |
+
lang_pairs=<language pairs to be trained, e.g. "en-cs,cs-en">
|
55 |
+
path_2_data=<set to data path>
|
56 |
+
lang_list=<a file which contains a list of languages separated by new lines>
|
57 |
+
pretrained_model=<path to the pretrained model, e.g. mbart or another trained multilingual model>
|
58 |
+
|
59 |
+
fairseq-train $path_2_data \
|
60 |
+
--finetune-from-model $pretrained_model \
|
61 |
+
--encoder-normalize-before --decoder-normalize-before \
|
62 |
+
--arch transformer --layernorm-embedding \
|
63 |
+
--task translation_multi_simple_epoch \
|
64 |
+
--sampling-method "temperature" \
|
65 |
+
--sampling-temperature 1.5 \
|
66 |
+
--encoder-langtok "src" \
|
67 |
+
--decoder-langtok \
|
68 |
+
--lang-dict "$lang_list" \
|
69 |
+
--lang-pairs "$lang_pairs" \
|
70 |
+
--criterion label_smoothed_cross_entropy --label-smoothing 0.2 \
|
71 |
+
--optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \
|
72 |
+
--lr-scheduler inverse_sqrt --lr 3e-05 --warmup-updates 2500 --max-update 40000 \
|
73 |
+
--dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \
|
74 |
+
--max-tokens 1024 --update-freq 2 \
|
75 |
+
--save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \
|
76 |
+
--seed 222 --log-format simple --log-interval 2
|
77 |
+
```
|
78 |
+
## Generate
|
79 |
+
The following command uses the multilingual task (translation_multi_simple_epoch) to generate translations from $source_lang to $target_lang on the test dataset. During generation, the source language token is added to the source sentences and the target language token is used as the starting token when decoding the target sentences. The options --lang-dict and --lang-pairs tell the generation process the ordered list of languages and the translation directions that the trained model is aware of; they need to be consistent with training.
|
80 |
+
|
81 |
+
```bash
|
82 |
+
model=<multilingual model>
|
83 |
+
source_lang=<source language>
|
84 |
+
target_lang=<target language>
|
85 |
+
|
86 |
+
fairseq-generate $path_2_data \
|
87 |
+
--path $model \
|
88 |
+
--task translation_multi_simple_epoch \
|
89 |
+
--gen-subset test \
|
90 |
+
--source-lang $source_lang \
|
91 |
+
--target-lang $target_lang \
|
92 |
+
--sacrebleu --remove-bpe 'sentencepiece'\
|
93 |
+
--batch-size 32 \
|
94 |
+
--encoder-langtok "src" \
|
95 |
+
--decoder-langtok \
|
96 |
+
--lang-dict "$lang_list" \
|
97 |
+
--lang-pairs "$lang_pairs" > ${source_lang}_${target_lang}.txt
|
98 |
+
```
|
99 |
+
Fairseq will write the generated translations to the file ${source_lang}_${target_lang}.txt, with the sacrebleu score at the end.
|
100 |
+
|
101 |
+
You can also use a customized tokenizer to compare performance with the literature. For example, you can get a tokenizer [here](https://github.com/rsennrich/wmt16-scripts) and do the following:
|
102 |
+
```bash
|
103 |
+
TOKENIZER=<path to a customized tokenizer for decoding evaluation>
|
104 |
+
TOK_CMD=<"$TOKENIZER $target_lang" or cat for sacrebleu>
|
105 |
+
|
106 |
+
cat ${source_lang}_${target_lang}.txt | grep -P "^H" | sort -V | cut -f 3- | $TOK_CMD > ${source_lang}_${target_lang}.hyp
|
107 |
+
cat ${source_lang}_${target_lang}.txt | grep -P "^T" | sort -V | cut -f 2- | $TOK_CMD > ${source_lang}_${target_lang}.ref
|
108 |
+
sacrebleu -tok 'none' -s 'none' ${source_lang}_${target_lang}.ref < ${source_lang}_${target_lang}.hyp
|
109 |
+
```
|
110 |
+
|
111 |
+
# mBART50 models
|
112 |
+
|
113 |
+
* [mBART 50 pretrained model](https://dl.fbaipublicfiles.com/fairseq/models/mbart50/mbart50.pretrained.tar.gz).
* [mBART 50 finetuned many-to-one](https://dl.fbaipublicfiles.com/fairseq/models/mbart50/mbart50.ft.n1.tar.gz).
* [mBART 50 finetuned one-to-many](https://dl.fbaipublicfiles.com/fairseq/models/mbart50/mbart50.ft.1n.tar.gz).
* [mBART 50 finetuned many-to-many](https://dl.fbaipublicfiles.com/fairseq/models/mbart50/mbart50.ft.nn.tar.gz).
|
117 |
+
|
118 |
+
Please download and extract from the above tarballs. Each tarball contains
|
119 |
+
* The fairseq model checkpoint: model.pt
|
120 |
+
* The list of supported languages: ML50_langs.txt
|
121 |
+
* Sentence piece model: sentence.bpe.model
|
122 |
+
* Fairseq dictionary of each language: dict.{lang}.txt (please replace lang with a language specified in ML50_langs.txt)
|
123 |
+
|
124 |
+
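For example, a minimal sketch of fetching one of the checkpoints (the layout inside the tarball is an assumption here; inspect it with `tar -tzf` first):
```bash
# download and unpack the many-to-many finetuned checkpoint
wget https://dl.fbaipublicfiles.com/fairseq/models/mbart50/mbart50.ft.nn.tar.gz
tar -tzf mbart50.ft.nn.tar.gz | head   # check the archive layout (expected: model.pt, ML50_langs.txt, sentence.bpe.model, dict.*.txt)
tar -xzvf mbart50.ft.nn.tar.gz
```
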
To use the trained models:
* use the tool [binarize.py](./data_scripts/binarize.py) to binarize your data using sentence.bpe.model and dict.{lang}.txt, and copy the dictionaries to your data path;
* then run the generation command:
```bash
path_2_data=<path to your binarized data with fairseq dictionaries>
model=<path_to_extracted_folder>/model.pt
lang_list=<path_to_extracted_folder>/ML50_langs.txt
source_lang=<source language>
target_lang=<target language>

fairseq-generate $path_2_data \
--path $model \
--task translation_multi_simple_epoch \
--gen-subset test \
--source-lang $source_lang \
--target-lang $target_lang \
--sacrebleu --remove-bpe 'sentencepiece' \
--batch-size 32 \
--encoder-langtok "src" \
--decoder-langtok \
--lang-dict "$lang_list"
```

## Citation

```bibtex
@article{tang2020multilingual,
    title={Multilingual Translation with Extensible Multilingual Pretraining and Finetuning},
    author={Yuqing Tang and Chau Tran and Xian Li and Peng-Jen Chen and Naman Goyal and Vishrav Chaudhary and Jiatao Gu and Angela Fan},
    year={2020},
    eprint={2008.00401},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}
```
fairseq/examples/multilingual/data_scripts/README.md
ADDED
@@ -0,0 +1,24 @@
# Install dependency
```bash
pip install -r requirement.txt
```

# Download the data set
```bash
export WORKDIR_ROOT=<a directory which will hold all working files>
bash ./download_ML50_v1.sh
```
The downloaded data will be at $WORKDIR_ROOT/ML50

# Preprocess the data
Install SPM [here](https://github.com/google/sentencepiece)
```bash
export WORKDIR_ROOT=<a directory which will hold all working files>
export SPM_PATH=<a path pointing to sentencepiece spm_encode.py>
```
* $WORKDIR_ROOT/ML50/raw: extracted raw data
* $WORKDIR_ROOT/ML50/dedup: deduplicated data
* $WORKDIR_ROOT/ML50/clean: data with valid and test sentences removed from the dedup data

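As a rough illustration (not part of the original README; it relies on the argument defaults visible in the scripts of this directory), the folders above can be chained as follows:
```bash
# 1) deduplicate the raw bitext pairs
mkdir -p $WORKDIR_ROOT/ML50/dedup
python dedup_all.py --from-folder $WORKDIR_ROOT/ML50/raw --to-folder $WORKDIR_ROOT/ML50/dedup
# 2) apply the mBART50 SentencePiece model and binarize
#    (binarize.py reads from ML50/raw by default; pass --raw-folder dedup to use step 1's output)
python binarize.py --data_root $WORKDIR_ROOT/ML50
```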
fairseq/examples/multilingual/data_scripts/binarize.py
ADDED
@@ -0,0 +1,200 @@
import shutil
import os, sys
from subprocess import check_call, check_output
import glob
import argparse
import pathlib
import itertools

def call_output(cmd):
    print(f"Executing: {cmd}")
    ret = check_output(cmd, shell=True)
    print(ret)
    return ret

def call(cmd):
    print(cmd)
    check_call(cmd, shell=True)


WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None)

if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip():
    print('please specify your working directory root in OS environment variable WORKDIR_ROOT. Exiting...')
    sys.exit(-1)

SPM_PATH = os.environ.get('SPM_PATH', None)

if SPM_PATH is None or not SPM_PATH.strip():
    print("Please install sentencepiece from https://github.com/google/sentencepiece and set SPM_PATH to point to the installed spm_encode.py. Exiting...")
    sys.exit(-1)


SPM_MODEL = f'{WORKDIR_ROOT}/sentence.bpe.model'
SPM_VOCAB = f'{WORKDIR_ROOT}/dict_250k.txt'

SPM_ENCODE = f'{SPM_PATH}'

# fetch the mBART50 SentencePiece model and 250k dictionary if they are not present
if not os.path.exists(SPM_MODEL):
    call(f"wget https://dl.fbaipublicfiles.com/fairseq/models/mbart50/sentence.bpe.model -O {SPM_MODEL}")

if not os.path.exists(SPM_VOCAB):
    call(f"wget https://dl.fbaipublicfiles.com/fairseq/models/mbart50/dict_250k.txt -O {SPM_VOCAB}")


def get_data_size(raw):
    cmd = f'wc -l {raw}'
    ret = call_output(cmd)
    return int(ret.split()[0])

# apply the SentencePiece model to the raw bitext of one direction
def encode_spm(model, direction, prefix='', splits=['train', 'test', 'valid'], pairs_per_shard=None):
    src, tgt = direction.split('-')

    for split in splits:
        src_raw, tgt_raw = f'{RAW_DIR}/{split}{prefix}.{direction}.{src}', f'{RAW_DIR}/{split}{prefix}.{direction}.{tgt}'
        if os.path.exists(src_raw) and os.path.exists(tgt_raw):
            cmd = f"""python {SPM_ENCODE} \
            --model {model} \
            --output_format=piece \
            --inputs {src_raw} {tgt_raw} \
            --outputs {BPE_DIR}/{direction}{prefix}/{split}.bpe.{src} {BPE_DIR}/{direction}{prefix}/{split}.bpe.{tgt} """
            print(cmd)
            call(cmd)


# run fairseq-preprocess for one direction over the SPM-encoded files
def binarize_(
    bpe_dir,
    databin_dir,
    direction, spm_vocab=SPM_VOCAB,
    splits=['train', 'test', 'valid'],
):
    src, tgt = direction.split('-')

    try:
        shutil.rmtree(f'{databin_dir}', ignore_errors=True)
        os.mkdir(f'{databin_dir}')
    except OSError as error:
        print(error)
    cmds = [
        "fairseq-preprocess",
        f"--source-lang {src} --target-lang {tgt}",
        f"--destdir {databin_dir}/",
        f"--workers 8",
    ]
    if isinstance(spm_vocab, tuple):
        src_vocab, tgt_vocab = spm_vocab
        cmds.extend(
            [
                f"--srcdict {src_vocab}",
                f"--tgtdict {tgt_vocab}",
            ]
        )
    else:
        cmds.extend(
            [
                f"--joined-dictionary",
                f"--srcdict {spm_vocab}",
            ]
        )
    input_options = []
    if 'train' in splits and glob.glob(f"{bpe_dir}/train.bpe*"):
        input_options.append(
            f"--trainpref {bpe_dir}/train.bpe",
        )
    if 'valid' in splits and glob.glob(f"{bpe_dir}/valid.bpe*"):
        input_options.append(f"--validpref {bpe_dir}/valid.bpe")
    if 'test' in splits and glob.glob(f"{bpe_dir}/test.bpe*"):
        input_options.append(f"--testpref {bpe_dir}/test.bpe")
    if len(input_options) > 0:
        cmd = " ".join(cmds + input_options)
        print(cmd)
        call(cmd)


# binarize one direction, optionally handling pre-sharded training data
def binarize(
    databin_dir,
    direction, spm_vocab=SPM_VOCAB, prefix='',
    splits=['train', 'test', 'valid'],
    pairs_per_shard=None,
):
    def move_databin_files(from_folder, to_folder):
        for bin_file in glob.glob(f"{from_folder}/*.bin") \
                + glob.glob(f"{from_folder}/*.idx") \
                + glob.glob(f"{from_folder}/dict*"):
            try:
                shutil.move(bin_file, to_folder)
            except OSError as error:
                print(error)
    bpe_databin_dir = f"{BPE_DIR}/{direction}{prefix}_databin"
    bpe_dir = f"{BPE_DIR}/{direction}{prefix}"
    if pairs_per_shard is None:
        binarize_(bpe_dir, bpe_databin_dir, direction, spm_vocab=spm_vocab, splits=splits)
        move_databin_files(bpe_databin_dir, databin_dir)
    else:
        # binarize valid and test which will not be sharded
        binarize_(
            bpe_dir, bpe_databin_dir, direction,
            spm_vocab=spm_vocab, splits=[s for s in splits if s != "train"])
        for shard_bpe_dir in glob.glob(f"{bpe_dir}/shard*"):
            path_strs = os.path.split(shard_bpe_dir)
            shard_str = path_strs[-1]
            shard_folder = f"{bpe_databin_dir}/{shard_str}"
            databin_shard_folder = f"{databin_dir}/{shard_str}"
            print(f'working from {shard_folder} to {databin_shard_folder}')
            os.makedirs(databin_shard_folder, exist_ok=True)
            binarize_(
                shard_bpe_dir, shard_folder, direction,
                spm_vocab=spm_vocab, splits=["train"])

            # link the shared valid/test binaries into each shard folder
            for test_data in glob.glob(f"{bpe_databin_dir}/valid.*") + glob.glob(f"{bpe_databin_dir}/test.*"):
                filename = os.path.split(test_data)[-1]
                try:
                    os.symlink(test_data, f"{databin_shard_folder}/{filename}")
                except OSError as error:
                    print(error)
            move_databin_files(shard_folder, databin_shard_folder)


def load_langs(path):
    with open(path) as fr:
        langs = [l.strip() for l in fr]
    return langs

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_root", default=f"{WORKDIR_ROOT}/ML50")
    parser.add_argument("--raw-folder", default='raw')
    parser.add_argument("--bpe-folder", default='bpe')
    parser.add_argument("--databin-folder", default='databin')

    args = parser.parse_args()

    DATA_PATH = args.data_root #'/private/home/yuqtang/public_data/ML50'
    RAW_DIR = f'{DATA_PATH}/{args.raw_folder}'
    BPE_DIR = f'{DATA_PATH}/{args.bpe_folder}'
    DATABIN_DIR = f'{DATA_PATH}/{args.databin_folder}'
    os.makedirs(BPE_DIR, exist_ok=True)

    raw_files = itertools.chain(
        glob.glob(f'{RAW_DIR}/train*'),
        glob.glob(f'{RAW_DIR}/valid*'),
        glob.glob(f'{RAW_DIR}/test*'),
    )

    directions = [os.path.split(file_path)[-1].split('.')[1] for file_path in raw_files]

    for direction in directions:
        prefix = ""
        splits = ['train', 'valid', 'test']
        try:
            shutil.rmtree(f'{BPE_DIR}/{direction}{prefix}', ignore_errors=True)
            os.mkdir(f'{BPE_DIR}/{direction}{prefix}')
            os.makedirs(DATABIN_DIR, exist_ok=True)
        except OSError as error:
            print(error)
        spm_model, spm_vocab = SPM_MODEL, SPM_VOCAB
        encode_spm(spm_model, direction=direction, splits=splits)
        binarize(DATABIN_DIR, direction, spm_vocab=spm_vocab, splits=splits)
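A minimal invocation sketch for the script above (paths are placeholders; it assumes raw bitext already sits under $WORKDIR_ROOT/ML50/raw with the train/valid/test.{src-tgt}.{lang} naming used by the download scripts):
```bash
export WORKDIR_ROOT=/path/to/workdir     # sentence.bpe.model and dict_250k.txt are fetched here if missing
export SPM_PATH=/path/to/spm_encode.py   # SentencePiece encoder script invoked by encode_spm()
python binarize.py --data_root $WORKDIR_ROOT/ML50 --raw-folder raw --bpe-folder bpe --databin-folder databin
```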
fairseq/examples/multilingual/data_scripts/check_iswlt_test_data.py
ADDED
@@ -0,0 +1,67 @@
1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
|
7 |
+
import os, sys
|
8 |
+
import subprocess
|
9 |
+
import re
|
10 |
+
from subprocess import check_call, check_output
|
11 |
+
|
12 |
+
WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None)
|
13 |
+
|
14 |
+
if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip():
|
15 |
+
print('please specify your working directory root in OS environment variable WORKDIR_ROOT. Exiting...')
|
16 |
+
sys.exit(-1)
|
17 |
+
|
18 |
+
|
19 |
+
BLEU_REGEX = re.compile("^BLEU\\S* = (\\S+) ")
|
20 |
+
def run_eval_bleu(cmd):
|
21 |
+
output = check_output(cmd, shell=True, stderr=subprocess.STDOUT).decode("utf-8").strip()
|
22 |
+
print(output)
|
23 |
+
bleu = -1.0
|
24 |
+
for line in output.strip().split('\n'):
|
25 |
+
m = BLEU_REGEX.search(line)
|
26 |
+
if m is not None:
|
27 |
+
bleu = m.groups()[0]
|
28 |
+
bleu = float(bleu)
|
29 |
+
break
|
30 |
+
return bleu
|
31 |
+
|
32 |
+
def check_data_test_bleu(raw_folder, data_lang_pairs):
|
33 |
+
not_matchings = []
|
34 |
+
for sacrebleu_set, src_tgts in data_lang_pairs:
|
35 |
+
for src_tgt in src_tgts:
|
36 |
+
print(f'checking test bleus for: {src_tgt} at {sacrebleu_set}')
|
37 |
+
src, tgt = src_tgt.split('-')
|
38 |
+
ssrc, stgt = src[:2], tgt[:2]
|
39 |
+
if os.path.exists(f'{raw_folder}/test.{tgt}-{src}.{src}'):
|
40 |
+
# reversed direction may have different test set
|
41 |
+
test_src = f'{raw_folder}/test.{tgt}-{src}.{src}'
|
42 |
+
else:
|
43 |
+
test_src = f'{raw_folder}/test.{src}-{tgt}.{src}'
|
44 |
+
cmd1 = f'cat {test_src} | sacrebleu -t "{sacrebleu_set}" -l {stgt}-{ssrc}; [ $? -eq 0 ] || echo ""'
|
45 |
+
test_tgt = f'{raw_folder}/test.{src}-{tgt}.{tgt}'
|
46 |
+
cmd2 = f'cat {test_tgt} | sacrebleu -t "{sacrebleu_set}" -l {ssrc}-{stgt}; [ $? -eq 0 ] || echo ""'
|
47 |
+
bleu1 = run_eval_bleu(cmd1)
|
48 |
+
if bleu1 != 100.0:
|
49 |
+
not_matchings.append(f'{sacrebleu_set}:{src_tgt} source side not matching: {test_src}')
|
50 |
+
bleu2 = run_eval_bleu(cmd2)
|
51 |
+
if bleu2 != 100.0:
|
52 |
+
not_matchings.append(f'{sacrebleu_set}:{src_tgt} target side not matching: {test_tgt}')
|
53 |
+
return not_matchings
|
54 |
+
|
55 |
+
if __name__ == "__main__":
|
56 |
+
to_data_path = f'{WORKDIR_ROOT}/iwsltv2'
|
57 |
+
not_matching = check_data_test_bleu(
|
58 |
+
f'{to_data_path}/raw',
|
59 |
+
[
|
60 |
+
('iwslt17', ['en_XX-ar_AR', 'en_XX-ko_KR', 'ar_AR-en_XX', 'ko_KR-en_XX']),
|
61 |
+
('iwslt17', ['en_XX-it_IT', 'en_XX-nl_XX', 'it_IT-en_XX', 'nl_XX-en_XX']),
|
62 |
+
('iwslt17/tst2015', ['en_XX-vi_VN', "vi_VN-en_XX"]),
|
63 |
+
]
|
64 |
+
)
|
65 |
+
if len(not_matching) > 0:
|
66 |
+
print('the following datasets do not have matching test datasets:\n\t', '\n\t'.join(not_matching))
|
67 |
+
|
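A usage sketch (not part of the script): it only needs WORKDIR_ROOT and a sacrebleu installation, and it prints any IWSLT test files whose contents do not match the official sacrebleu test sets.
```bash
export WORKDIR_ROOT=/path/to/workdir
pip install sacrebleu
python check_iswlt_test_data.py
```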
fairseq/examples/multilingual/data_scripts/check_self_overlaps.py
ADDED
@@ -0,0 +1,103 @@
1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
|
7 |
+
import os
|
8 |
+
import glob
|
9 |
+
import argparse
|
10 |
+
from utils.dedup import deup
|
11 |
+
import sys
|
12 |
+
|
13 |
+
WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None)
|
14 |
+
|
15 |
+
if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip():
|
16 |
+
print('please specify your working directory root in OS environment variable WORKDIR_ROOT. Exiting...')
|
17 |
+
sys.exit(-1)
|
18 |
+
|
19 |
+
def get_directions(folder):
|
20 |
+
raw_files = glob.glob(f'{folder}/train*')
|
21 |
+
directions = [os.path.split(file_path)[-1].split('.')[1] for file_path in raw_files]
|
22 |
+
return directions
|
23 |
+
|
24 |
+
def diff_list(lhs, rhs):
|
25 |
+
return set(lhs).difference(set(rhs))
|
26 |
+
|
27 |
+
def check_diff(
|
28 |
+
from_src_file, from_tgt_file,
|
29 |
+
to_src_file, to_tgt_file,
|
30 |
+
):
|
31 |
+
seen_in_from = set()
|
32 |
+
seen_src_in_from = set()
|
33 |
+
seen_tgt_in_from = set()
|
34 |
+
from_count = 0
|
35 |
+
with open(from_src_file, encoding='utf-8') as fsrc, \
|
36 |
+
open(from_tgt_file, encoding='utf-8') as ftgt:
|
37 |
+
for s, t in zip(fsrc, ftgt):
|
38 |
+
seen_in_from.add((s, t))
|
39 |
+
seen_src_in_from.add(s)
|
40 |
+
seen_tgt_in_from.add(t)
|
41 |
+
from_count += 1
|
42 |
+
common = 0
|
43 |
+
common_src = 0
|
44 |
+
common_tgt = 0
|
45 |
+
to_count = 0
|
46 |
+
seen = set()
|
47 |
+
|
48 |
+
with open(to_src_file, encoding='utf-8') as fsrc, \
|
49 |
+
open(to_tgt_file, encoding='utf-8') as ftgt:
|
50 |
+
for s, t in zip(fsrc, ftgt):
|
51 |
+
to_count += 1
|
52 |
+
if (s, t) not in seen:
|
53 |
+
if (s, t) in seen_in_from:
|
54 |
+
common += 1
|
55 |
+
if s in seen_src_in_from:
|
56 |
+
common_src += 1
|
57 |
+
seen_src_in_from.remove(s)
|
58 |
+
if t in seen_tgt_in_from:
|
59 |
+
common_tgt += 1
|
60 |
+
seen_tgt_in_from.remove(t)
|
61 |
+
seen.add((s, t))
|
62 |
+
return common, common_src, common_tgt, from_count, to_count
|
63 |
+
|
64 |
+
def main():
|
65 |
+
parser = argparse.ArgumentParser()
|
66 |
+
parser.add_argument("--folder", type=str, required=True,
|
67 |
+
help="the data folder ")
|
68 |
+
parser.add_argument("--split", type=str, default='test',
|
69 |
+
help="split (valid, test) to check against training data")
|
70 |
+
parser.add_argument('--directions', type=str, default=None, required=False)
|
71 |
+
|
72 |
+
args = parser.parse_args()
|
73 |
+
|
74 |
+
if args.directions is None:
|
75 |
+
directions = set(get_directions(args.folder))
|
76 |
+
directions = sorted(directions)
|
77 |
+
else:
|
78 |
+
directions = args.directions.split(',')
|
79 |
+
directions = sorted(set(directions))
|
80 |
+
|
81 |
+
results = []
|
82 |
+
print(f'checking where {args.split} split data are in training')
|
83 |
+
print(f'direction\tcommon_count\tsrc common\ttgt common\tfrom_size\tto_size')
|
84 |
+
|
85 |
+
for direction in directions:
|
86 |
+
src, tgt = direction.split('-')
|
87 |
+
from_src_file = f'{args.folder}/{args.split}.{src}-{tgt}.{src}'
|
88 |
+
from_tgt_file = f'{args.folder}/{args.split}.{src}-{tgt}.{tgt}'
|
89 |
+
if not os.path.exists(from_src_file):
|
90 |
+
# some test/valid data might be in reverse directions:
|
91 |
+
from_src_file = f'{args.folder}/{args.split}.{tgt}-{src}.{src}'
|
92 |
+
from_tgt_file = f'{args.folder}/{args.split}.{tgt}-{src}.{tgt}'
|
93 |
+
to_src_file = f'{args.folder}/train.{src}-{tgt}.{src}'
|
94 |
+
to_tgt_file = f'{args.folder}/train.{src}-{tgt}.{tgt}'
|
95 |
+
if not os.path.exists(to_src_file) or not os.path.exists(from_src_file):
|
96 |
+
continue
|
97 |
+
r = check_diff(from_src_file, from_tgt_file, to_src_file, to_tgt_file)
|
98 |
+
results.append(r)
|
99 |
+
print(f'{direction}\t', '\t'.join(map(str, r)))
|
100 |
+
|
101 |
+
|
102 |
+
if __name__ == "__main__":
|
103 |
+
main()
|
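A usage sketch based on the argparse flags above (the folder path and directions are illustrative):
```bash
# report test sentences that also occur in the training data of the same folder
python check_self_overlaps.py --folder $WORKDIR_ROOT/ML50/raw --split test
# or restrict the check to explicit directions
python check_self_overlaps.py --folder $WORKDIR_ROOT/ML50/raw --split valid --directions hi_IN-en_XX,ml_IN-en_XX
```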
fairseq/examples/multilingual/data_scripts/check_valid_test_overlaps.py
ADDED
@@ -0,0 +1,124 @@
1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
|
7 |
+
import os
|
8 |
+
import argparse
|
9 |
+
import pandas as pd
|
10 |
+
import sys
|
11 |
+
|
12 |
+
|
13 |
+
WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None)
|
14 |
+
|
15 |
+
if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip():
|
16 |
+
print('please specify your working directory root in OS environment variable WORKDIR_ROOT. Exiting...')
|
17 |
+
sys.exit(-1)
|
18 |
+
|
19 |
+
def load_langs(path):
|
20 |
+
with open(path) as fr:
|
21 |
+
langs = [l.strip() for l in fr]
|
22 |
+
return langs
|
23 |
+
|
24 |
+
|
25 |
+
|
26 |
+
def load_sentences(raw_data, split, direction):
|
27 |
+
src, tgt = direction.split('-')
|
28 |
+
src_path = f"{raw_data}/{split}.{direction}.{src}"
|
29 |
+
tgt_path = f"{raw_data}/{split}.{direction}.{tgt}"
|
30 |
+
if os.path.exists(src_path) and os.path.exists(tgt_path):
|
31 |
+
return [(src, open(src_path).read().splitlines()), (tgt, open(tgt_path).read().splitlines())]
|
32 |
+
else:
|
33 |
+
return []
|
34 |
+
|
35 |
+
def swap_direction(d):
|
36 |
+
src, tgt = d.split('-')
|
37 |
+
return f'{tgt}-{src}'
|
38 |
+
|
39 |
+
def get_all_test_data(raw_data, directions, split='test'):
|
40 |
+
test_data = [
|
41 |
+
x
|
42 |
+
for dd in directions
|
43 |
+
for d in [dd, swap_direction(dd)]
|
44 |
+
for x in load_sentences(raw_data, split, d)
|
45 |
+
]
|
46 |
+
# all_test_data = {s for _, d in test_data for s in d}
|
47 |
+
all_test_data = {}
|
48 |
+
for lang, d in test_data:
|
49 |
+
for s in d:
|
50 |
+
s = s.strip()
|
51 |
+
lgs = all_test_data.get(s, set())
|
52 |
+
lgs.add(lang)
|
53 |
+
all_test_data[s] = lgs
|
54 |
+
return all_test_data, test_data
|
55 |
+
|
56 |
+
|
57 |
+
def check_train_sentences(src_path, tgt_path, direction, all_test_data, mess_up_train={}):
|
58 |
+
# src, tgt = direction.split('-')
|
59 |
+
print(f'check training data for {direction} in {src_path} and {tgt_path}')
|
60 |
+
size = 0
|
61 |
+
overlapped_size_counted_dup = 0
|
62 |
+
if not os.path.exists(tgt_path) or not os.path.exists(src_path):
|
63 |
+
return mess_up_train, size, overlapped_size_counted_dup
|
64 |
+
|
65 |
+
with open(src_path) as f, open(tgt_path) as g:
|
66 |
+
for src_line, tgt_line in zip(f, g):
|
67 |
+
s = src_line.strip()
|
68 |
+
t = tgt_line.strip()
|
69 |
+
size += 1
|
70 |
+
if s in all_test_data:
|
71 |
+
langs = mess_up_train.get(s, set())
|
72 |
+
langs.add(direction)
|
73 |
+
mess_up_train[s] = langs
|
74 |
+
overlapped_size_counted_dup += 1
|
75 |
+
if t in all_test_data:
|
76 |
+
langs = mess_up_train.get(t, set())
|
77 |
+
langs.add(direction)
|
78 |
+
mess_up_train[t] = langs
|
79 |
+
overlapped_size_counted_dup += 1
|
80 |
+
print(f'{direction}: size={size}, overlapped={overlapped_size_counted_dup}')
|
81 |
+
return mess_up_train, size, overlapped_size_counted_dup
|
82 |
+
|
83 |
+
def check_train_all(raw_data, directions, all_test_data):
|
84 |
+
mess_up_train = {}
|
85 |
+
data_sizes = {}
|
86 |
+
# raw_data = '~chau/data-bin/MineBART/multilingual_mined_100M/en_XX/et_EE-en_XX/all.{en_XX, et_EE}'
|
87 |
+
print(f'checking training data against {len(all_test_data)} test sentences')
|
88 |
+
print(f'example test data: ', [s for i, s in enumerate(all_test_data.keys()) if i < 10])
|
89 |
+
for direction in directions:
|
90 |
+
src, tgt = direction.split('-')
|
91 |
+
path = f'{raw_data}/en_XX/{direction}/all'
|
92 |
+
src_path = f'{path}.{src}'
|
93 |
+
tgt_path = f'{path}.{tgt}'
|
94 |
+
print(f'checking {src_path} {tgt_path}')
|
95 |
+
_, size, overlapped_size_counted_dup = check_train_sentences(src_path, tgt_path, direction, all_test_data, mess_up_train)
|
96 |
+
data_sizes[direction] = (size, overlapped_size_counted_dup)
|
97 |
+
return mess_up_train, data_sizes
|
98 |
+
|
99 |
+
|
100 |
+
|
101 |
+
|
102 |
+
def main():
|
103 |
+
parser = argparse.ArgumentParser()
|
104 |
+
parser.add_argument("--folder", type=str, required=True,
|
105 |
+
help="the data folder ")
|
106 |
+
parser.add_argument("--test-data", type=str, required=True,
|
107 |
+
help="the test data folder ")
|
108 |
+
parser.add_argument('--directions', type=str, default=None, required=False)
|
109 |
+
|
110 |
+
args = parser.parse_args()
|
111 |
+
directions = args.directions.split(',')
|
112 |
+
directions = sorted(set(directions))
|
113 |
+
|
114 |
+
results = []
|
115 |
+
# print(f'checking where {args.split} split data are in training')
|
116 |
+
# print(f'direction\tcommon_count\tsrc common\ttgt common\tfrom_size\tto_size')
|
117 |
+
raw_data = args.folder
|
118 |
+
all_test_data, test_data = get_all_test_data(args.test_data, directions, split='test')
|
119 |
+
mess_up_train, data_sizes = check_train_all(raw_data, directions, all_test_data)
|
120 |
+
print(data_sizes)
|
121 |
+
|
122 |
+
|
123 |
+
if __name__ == "__main__":
|
124 |
+
main()
|
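A usage sketch based on the flags above; --directions is effectively required since the script splits it unconditionally, and --folder is expected to follow the mined-data layout <folder>/en_XX/<direction>/all.<lang> used in check_train_all (all paths here are illustrative):
```bash
python check_valid_test_overlaps.py \
  --folder $WORKDIR_ROOT/mined_data \
  --test-data $WORKDIR_ROOT/ML50/raw \
  --directions hi_IN-en_XX,ml_IN-en_XX
```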
fairseq/examples/multilingual/data_scripts/dedup_all.py
ADDED
@@ -0,0 +1,52 @@
1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
|
7 |
+
|
8 |
+
import os
|
9 |
+
import glob
|
10 |
+
import argparse
|
11 |
+
from utils.dedup import deup
|
12 |
+
|
13 |
+
import sys
|
14 |
+
WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None)
|
15 |
+
|
16 |
+
if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip():
|
17 |
+
print('please specify your working directory root in OS environment variable WORKDIR_ROOT. Exiting...')
|
18 |
+
sys.exit(-1)
|
19 |
+
|
20 |
+
|
21 |
+
def main():
|
22 |
+
parser = argparse.ArgumentParser()
|
23 |
+
parser.add_argument("--from-folder", type=str, required=True,
|
24 |
+
help="the data folder to be dedup")
|
25 |
+
parser.add_argument("--to-folder", type=str, required=True,
|
26 |
+
help="the data folder to save deduped data")
|
27 |
+
parser.add_argument('--directions', type=str, default=None, required=False)
|
28 |
+
|
29 |
+
args = parser.parse_args()
|
30 |
+
|
31 |
+
if args.directions is None:
|
32 |
+
raw_files = glob.glob(f'{args.from_folder}/train*')
|
33 |
+
|
34 |
+
directions = [os.path.split(file_path)[-1].split('.')[1] for file_path in raw_files]
|
35 |
+
else:
|
36 |
+
directions = args.directions.split(',')
|
37 |
+
directions = sorted(set(directions))
|
38 |
+
|
39 |
+
for direction in directions:
|
40 |
+
src, tgt = direction.split('-')
|
41 |
+
src_file = f'{args.from_folder}/train.{src}-{tgt}.{src}'
|
42 |
+
tgt_file = f'{args.from_folder}/train.{src}-{tgt}.{tgt}'
|
43 |
+
src_file_out = f'{args.to_folder}/train.{src}-{tgt}.{src}'
|
44 |
+
tgt_file_out = f'{args.to_folder}/train.{src}-{tgt}.{tgt}'
|
45 |
+
assert src_file != src_file_out
|
46 |
+
assert tgt_file != tgt_file_out
|
47 |
+
print(f'deduping {src_file}, {tgt_file}')
|
48 |
+
deup(src_file, tgt_file, src_file_out, tgt_file_out)
|
49 |
+
|
50 |
+
|
51 |
+
if __name__ == "__main__":
|
52 |
+
main()
|
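A usage sketch based on the flags above (the dedup folder follows the layout described in the data_scripts README):
```bash
mkdir -p $WORKDIR_ROOT/ML50/dedup
# dedup every train.* pair found under raw/, or restrict it with --directions
python dedup_all.py --from-folder $WORKDIR_ROOT/ML50/raw --to-folder $WORKDIR_ROOT/ML50/dedup --directions hi_IN-en_XX
```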
fairseq/examples/multilingual/data_scripts/download_ML50_v1.sh
ADDED
@@ -0,0 +1,30 @@
1 |
+
#!/bin/bash
|
2 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
3 |
+
# All rights reserved.
|
4 |
+
#
|
5 |
+
# This source code is licensed under the license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
|
8 |
+
if [ -z $WORKDIR_ROOT ] ;
|
9 |
+
then
|
10 |
+
echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..."
|
11 |
+
exit
|
12 |
+
fi
|
13 |
+
|
14 |
+
# first run download_wmt20.sh; it will install a few useful tools for other scripts
|
15 |
+
# TODO: need to print out instructions on downloading a few files which requires manually authentication from the websites
|
16 |
+
bash ./download_wmt20.sh
|
17 |
+
|
18 |
+
python ./download_wmt19_and_before.py
|
19 |
+
bash ./download_wat19_my.sh
|
20 |
+
python ./download_ted_and_extract.py
|
21 |
+
bash ./download_lotus.sh
|
22 |
+
bash ./download_iitb.sh
|
23 |
+
bash ./download_af_xh.sh
|
24 |
+
|
25 |
+
|
26 |
+
# IWSLT downloading URLs have changed in between; TODO: fix them:
|
27 |
+
bash ./download_iwslt_and_extract.sh
|
28 |
+
|
29 |
+
# TODO: globalvoices URLs changed; need to be fixed
|
30 |
+
bash ./download_flores_data.sh
|
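A usage sketch (not part of the script): the master script only needs WORKDIR_ROOT and must be run from the data_scripts directory, since it invokes the other download scripts with relative paths.
```bash
export WORKDIR_ROOT=/path/to/workdir
cd fairseq/examples/multilingual/data_scripts
bash ./download_ML50_v1.sh
```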
fairseq/examples/multilingual/data_scripts/download_af_xh.sh
ADDED
@@ -0,0 +1,164 @@
1 |
+
#!/bin/bash
|
2 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
3 |
+
# All rights reserved.
|
4 |
+
#
|
5 |
+
# This source code is licensed under the license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
|
8 |
+
# set -x -e
|
9 |
+
|
10 |
+
if [ -z $WORKDIR_ROOT ] ;
|
11 |
+
then
|
12 |
+
echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..."
|
13 |
+
exit
|
14 |
+
fi
|
15 |
+
|
16 |
+
|
17 |
+
# put intermediate files
|
18 |
+
TMP_DIR=$WORKDIR_ROOT/temp/af_xhv2
|
19 |
+
# output {train,valid,test} files to dest
|
20 |
+
DEST=${WORKDIR_ROOT}/ML50/raw
|
21 |
+
|
22 |
+
|
23 |
+
|
24 |
+
ROOT=${WORKDIR_ROOT}
|
25 |
+
UTILS=$PWD/utils
|
26 |
+
TMX2CORPUS="${UTILS}/tmx2corpus"
|
27 |
+
TMX_TOOL="python ${TMX2CORPUS}/tmx2corpus.py"
|
28 |
+
|
29 |
+
mkdir -p $TMP_DIR
|
30 |
+
mkdir -p $DEST
|
31 |
+
mkdir -p $UTILS
|
32 |
+
|
33 |
+
function download_opus(){
|
34 |
+
src=$1
|
35 |
+
tgt=$2
|
36 |
+
subset=$3
|
37 |
+
url=$4
|
38 |
+
|
39 |
+
mkdir extract_$subset.$src-$tgt
|
40 |
+
pushd extract_$subset.$src-$tgt
|
41 |
+
if [ ! -f "$subset.$src-$tgt.tmx.gz" ]; then
|
42 |
+
wget $url -O "$subset.$src-$tgt.tmx.gz"
|
43 |
+
gzip -d "$subset.$src-$tgt.tmx.gz"
|
44 |
+
f=$subset.$src-$tgt.tmx
|
45 |
+
$TMX_TOOL $f
|
46 |
+
mv bitext.$src ../$subset.$src-$tgt.$src
|
47 |
+
mv bitext.$tgt ../$subset.$src-$tgt.$tgt
|
48 |
+
fi
|
49 |
+
popd
|
50 |
+
}
|
51 |
+
|
52 |
+
function concat_subsets(){
|
53 |
+
src=$1
|
54 |
+
tgt=$2
|
55 |
+
subsets=$3
|
56 |
+
src_train=raw_train.$src-$tgt.$src
|
57 |
+
tgt_train=raw_train.$src-$tgt.$tgt
|
58 |
+
> $src_train
|
59 |
+
> $tgt_train
|
60 |
+
for subset in $subsets; do
|
61 |
+
cat $subset.$src-$tgt.$src >> $src_train
|
62 |
+
cat $subset.$src-$tgt.$tgt >> $tgt_train
|
63 |
+
done
|
64 |
+
}
|
65 |
+
|
66 |
+
|
67 |
+
|
68 |
+
function get_seeded_random()
|
69 |
+
{
|
70 |
+
seed="$1"
|
71 |
+
openssl enc -aes-256-ctr -pass pass:"$seed" -nosalt \
|
72 |
+
</dev/zero 2>/dev/null
|
73 |
+
}
|
74 |
+
|
75 |
+
function split_train_valid(){
|
76 |
+
src=$1
|
77 |
+
tgt=$2
|
78 |
+
raw_src_train=raw_train.$src-$tgt.$src
|
79 |
+
raw_tgt_train=raw_train.$src-$tgt.$tgt
|
80 |
+
|
81 |
+
shuf --random-source=<(get_seeded_random 43) $raw_src_train > shuffled.$src-$tgt.$src
|
82 |
+
shuf --random-source=<(get_seeded_random 43) $raw_tgt_train > shuffled.$src-$tgt.$tgt
|
83 |
+
|
84 |
+
head -n 1500 shuffled.$src-$tgt.$src > valid.$src-$tgt.$src
|
85 |
+
head -n 1500 shuffled.$src-$tgt.$tgt > valid.$src-$tgt.$tgt
|
86 |
+
|
87 |
+
tail +1501 shuffled.$src-$tgt.$src > train.$src-$tgt.$src
|
88 |
+
tail +1501 shuffled.$src-$tgt.$tgt > train.$src-$tgt.$tgt
|
89 |
+
}
|
90 |
+
|
91 |
+
function copy2dst(){
|
92 |
+
lsrc=$1
|
93 |
+
ltgt=$2
|
94 |
+
src=${lsrc:0:2}
|
95 |
+
tgt=${ltgt:0:2}
|
96 |
+
|
97 |
+
|
98 |
+
cp valid.$src-$tgt.$src $DEST/valid.$lsrc-$ltgt.$lsrc
|
99 |
+
cp valid.$src-$tgt.$tgt $DEST/valid.$lsrc-$ltgt.$ltgt
|
100 |
+
|
101 |
+
cp train.$src-$tgt.$src $DEST/train.$lsrc-$ltgt.$lsrc
|
102 |
+
cp train.$src-$tgt.$tgt $DEST/train.$lsrc-$ltgt.$ltgt
|
103 |
+
}
|
104 |
+
|
105 |
+
|
106 |
+
|
107 |
+
|
108 |
+
#for xh-en
|
109 |
+
declare -A xh_en_urls
|
110 |
+
xh_en_urls=(
|
111 |
+
[Tatoeba]=https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/tmx/en-xh.tmx.gz
|
112 |
+
[wikimedia]=https://object.pouta.csc.fi/OPUS-wikimedia/v20190628/tmx/en-xh.tmx.gz
|
113 |
+
[memat]=https://object.pouta.csc.fi/OPUS-memat/v1/tmx/en-xh.tmx.gz
|
114 |
+
[uedin]=https://object.pouta.csc.fi/OPUS-bible-uedin/v1/tmx/en-xh.tmx.gz
|
115 |
+
[GNOME]=https://object.pouta.csc.fi/OPUS-GNOME/v1/tmx/en-xh.tmx.gz
|
116 |
+
[XhosaNavy]=https://object.pouta.csc.fi/OPUS-XhosaNavy/v1/tmx/en-xh.tmx.gz
|
117 |
+
[KDE4]=https://object.pouta.csc.fi/OPUS-KDE4/v2/tmx/en-xh.tmx.gz
|
118 |
+
[Ubuntu]=https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/tmx/en-xh.tmx.gz
|
119 |
+
)
|
120 |
+
|
121 |
+
mkdir $TMP_DIR/xh-en
|
122 |
+
pushd $TMP_DIR/xh-en
|
123 |
+
for k in "${!xh_en_urls[@]}"
|
124 |
+
do
|
125 |
+
name=$k
|
126 |
+
url=${xh_en_urls[$k]}
|
127 |
+
echo "$name: $url"
|
128 |
+
download_opus xh en $name $url
|
129 |
+
done
|
130 |
+
concat_subsets xh en "${!xh_en_urls[@]}"
|
131 |
+
split_train_valid xh en
|
132 |
+
copy2dst xh_ZA en_XX
|
133 |
+
popd
|
134 |
+
|
135 |
+
|
136 |
+
##
|
137 |
+
#for af-en
|
138 |
+
declare -A af_en_urls
|
139 |
+
af_en_urls=(
|
140 |
+
[Tatoeba]=https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/tmx/af-en.tmx.gz
|
141 |
+
[uedin]=https://object.pouta.csc.fi/OPUS-bible-uedin/v1/tmx/af-en.tmx.gz
|
142 |
+
[GNOME]=https://object.pouta.csc.fi/OPUS-GNOME/v1/tmx/af-en.tmx.gz
|
143 |
+
[QED]=https://object.pouta.csc.fi/OPUS-QED/v2.0a/tmx/af-en.tmx.gz
|
144 |
+
[KDE4]=https://object.pouta.csc.fi/OPUS-KDE4/v2/tmx/af-en.tmx.gz
|
145 |
+
[OpenSubtitles]=https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/tmx/af-en.tmx.gz
|
146 |
+
[SPC]=https://object.pouta.csc.fi/OPUS-SPC/v1/tmx/af-en.tmx.gz
|
147 |
+
[Ubuntu]=https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/tmx/af-en.tmx.gz
|
148 |
+
)
|
149 |
+
|
150 |
+
mkdir $TMP_DIR/af-en
|
151 |
+
pushd $TMP_DIR/af-en
|
152 |
+
for k in "${!af_en_urls[@]}"
|
153 |
+
do
|
154 |
+
name=$k
|
155 |
+
url=${af_en_urls[$k]}
|
156 |
+
echo "$name: $url"
|
157 |
+
download_opus af en $name $url
|
158 |
+
done
|
159 |
+
concat_subsets af en "${!af_en_urls[@]}"
|
160 |
+
split_train_valid af en
|
161 |
+
copy2dst af_ZA en_XX
|
162 |
+
popd
|
163 |
+
|
164 |
+
|
fairseq/examples/multilingual/data_scripts/download_flores_data.sh
ADDED
@@ -0,0 +1,246 @@
1 |
+
#!/bin/bash
|
2 |
+
|
3 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
4 |
+
# All rights reserved.
|
5 |
+
#
|
6 |
+
# This source code is licensed under the license found in the
|
7 |
+
# LICENSE file in the root directory of this source tree.
|
8 |
+
#
|
9 |
+
|
10 |
+
if [ -z $WORKDIR_ROOT ] ;
|
11 |
+
then
|
12 |
+
echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..."
|
13 |
+
exit
|
14 |
+
fi
|
15 |
+
|
16 |
+
|
17 |
+
set -e
|
18 |
+
set -o pipefail
|
19 |
+
|
20 |
+
SRC=en
|
21 |
+
SI_TGT=si
|
22 |
+
NE_TGT=ne
|
23 |
+
|
24 |
+
DESTDIR=${WORKDIR_ROOT}/ML50/raw/
|
25 |
+
|
26 |
+
ROOT=${WORKDIR_ROOT}/tmp
|
27 |
+
mkdir -p $ROOT
|
28 |
+
DATA=$ROOT/data
|
29 |
+
NE_ROOT=$DATA/all-clean-ne
|
30 |
+
SI_ROOT=$DATA/all-clean-si
|
31 |
+
|
32 |
+
mkdir -p $DATA $NE_ROOT $SI_ROOT
|
33 |
+
|
34 |
+
SI_OPUS_DATASETS=(
|
35 |
+
"$SI_ROOT/GNOME.en-si"
|
36 |
+
"$SI_ROOT/Ubuntu.en-si"
|
37 |
+
"$SI_ROOT/KDE4.en-si"
|
38 |
+
"$SI_ROOT/OpenSubtitles.en-si"
|
39 |
+
)
|
40 |
+
|
41 |
+
SI_OPUS_URLS=(
|
42 |
+
"https://object.pouta.csc.fi/OPUS-GNOME/v1/moses/en-si.txt.zip"
|
43 |
+
"https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/moses/en-si.txt.zip"
|
44 |
+
"https://object.pouta.csc.fi/OPUS-KDE4/v2/moses/en-si.txt.zip"
|
45 |
+
"https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/moses/en-si.txt.zip"
|
46 |
+
)
|
47 |
+
|
48 |
+
NE_OPUS_DATASETS=(
|
49 |
+
"$NE_ROOT/GNOME.en-ne"
|
50 |
+
"$NE_ROOT/Ubuntu.en-ne"
|
51 |
+
"$NE_ROOT/KDE4.en-ne"
|
52 |
+
)
|
53 |
+
|
54 |
+
NE_OPUS_URLS=(
|
55 |
+
"https://object.pouta.csc.fi/OPUS-GNOME/v1/moses/en-ne.txt.zip"
|
56 |
+
"https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/moses/en-ne.txt.zip"
|
57 |
+
"https://object.pouta.csc.fi/OPUS-KDE4/v2/moses/en-ne.txt.zip"
|
58 |
+
)
|
59 |
+
|
60 |
+
REMOVE_FILE_PATHS=()
|
61 |
+
|
62 |
+
# Download data
|
63 |
+
download_data() {
|
64 |
+
CORPORA=$1
|
65 |
+
URL=$2
|
66 |
+
|
67 |
+
if [ -f $CORPORA ]; then
|
68 |
+
echo "$CORPORA already exists, skipping download"
|
69 |
+
else
|
70 |
+
echo "Downloading $URL"
|
71 |
+
wget $URL -O $CORPORA --no-check-certificate || rm -f $CORPORA
|
72 |
+
if [ -f $CORPORA ]; then
|
73 |
+
echo "$URL successfully downloaded."
|
74 |
+
else
|
75 |
+
echo "$URL not successfully downloaded."
|
76 |
+
rm -f $CORPORA
|
77 |
+
exit -1
|
78 |
+
fi
|
79 |
+
fi
|
80 |
+
}
|
81 |
+
|
82 |
+
# Example: download_opus_data $LANG_ROOT $TGT
|
83 |
+
download_opus_data() {
|
84 |
+
LANG_ROOT=$1
|
85 |
+
TGT=$2
|
86 |
+
|
87 |
+
if [ "$TGT" = "si" ]; then
|
88 |
+
URLS=("${SI_OPUS_URLS[@]}")
|
89 |
+
DATASETS=("${SI_OPUS_DATASETS[@]}")
|
90 |
+
else
|
91 |
+
URLS=("${NE_OPUS_URLS[@]}")
|
92 |
+
DATASETS=("${NE_OPUS_DATASETS[@]}")
|
93 |
+
fi
|
94 |
+
|
95 |
+
# Download and extract data
|
96 |
+
for ((i=0;i<${#URLS[@]};++i)); do
|
97 |
+
URL=${URLS[i]}
|
98 |
+
CORPORA=${DATASETS[i]}
|
99 |
+
|
100 |
+
download_data $CORPORA $URL
|
101 |
+
unzip -o $CORPORA -d $LANG_ROOT
|
102 |
+
REMOVE_FILE_PATHS+=( $CORPORA $CORPORA.xml $CORPORA.ids $LANG_ROOT/README $LANG_ROOT/LICENSE )
|
103 |
+
done
|
104 |
+
|
105 |
+
cat ${DATASETS[0]}.$SRC ${DATASETS[1]}.$SRC ${DATASETS[2]}.$SRC > $LANG_ROOT/GNOMEKDEUbuntu.$SRC-$TGT.$SRC
|
106 |
+
cat ${DATASETS[0]}.$TGT ${DATASETS[1]}.$TGT ${DATASETS[2]}.$TGT > $LANG_ROOT/GNOMEKDEUbuntu.$SRC-$TGT.$TGT
|
107 |
+
|
108 |
+
REMOVE_FILE_PATHS+=( ${DATASETS[0]}.$SRC ${DATASETS[1]}.$SRC ${DATASETS[2]}.$SRC )
|
109 |
+
REMOVE_FILE_PATHS+=( ${DATASETS[0]}.$TGT ${DATASETS[1]}.$TGT ${DATASETS[2]}.$TGT )
|
110 |
+
}
|
111 |
+
|
112 |
+
download_opus_data $SI_ROOT $SI_TGT
|
113 |
+
cp ${SI_OPUS_DATASETS[3]}.$SRC $SI_ROOT/OpenSubtitles2018.$SRC-$SI_TGT.$SRC
|
114 |
+
cp ${SI_OPUS_DATASETS[3]}.$SI_TGT $SI_ROOT/OpenSubtitles2018.$SRC-$SI_TGT.$SI_TGT
|
115 |
+
REMOVE_FILE_PATHS+=( ${SI_OPUS_DATASETS[3]}.$SRC ${SI_OPUS_DATASETS[3]}.$SI_TGT )
|
116 |
+
|
117 |
+
download_opus_data $NE_ROOT $NE_TGT
|
118 |
+
|
119 |
+
|
120 |
+
# Download and extract Global Voices data
|
121 |
+
GLOBAL_VOICES="$NE_ROOT/globalvoices.2018q4.ne-en"
|
122 |
+
GLOBAL_VOICES_URL="http://www.casmacat.eu/corpus/global-voices/globalvoices.ne-en.xliff.gz"
|
123 |
+
|
124 |
+
download_data $GLOBAL_VOICES.gz $GLOBAL_VOICES_URL
|
125 |
+
gunzip -Nf $GLOBAL_VOICES.gz
|
126 |
+
|
127 |
+
sed -ne 's?.*<source>\(.*\)</source>.*?\1?p' $GLOBAL_VOICES > $GLOBAL_VOICES.$NE_TGT
|
128 |
+
sed -ne 's?.*<target[^>]*>\(.*\)</target>.*?\1?p' $GLOBAL_VOICES > $GLOBAL_VOICES.$SRC
|
129 |
+
|
130 |
+
REMOVE_FILE_PATHS+=( $GLOBAL_VOICES )
|
131 |
+
|
132 |
+
# Download and extract the bible dataset
|
133 |
+
BIBLE_TOOLS=bible-corpus-tools
|
134 |
+
XML_BIBLES=XML_Bibles
|
135 |
+
XML_BIBLES_DUP=XML_Bibles_dup
|
136 |
+
|
137 |
+
if [ ! -e $BIBLE_TOOLS ]; then
|
138 |
+
echo "Cloning bible-corpus-tools repository..."
|
139 |
+
git clone https://github.com/christos-c/bible-corpus-tools.git
|
140 |
+
fi
|
141 |
+
|
142 |
+
mkdir -p $BIBLE_TOOLS/bin $XML_BIBLES $XML_BIBLES_DUP
|
143 |
+
javac -cp "$BIBLE_TOOLS/lib/*" -d $BIBLE_TOOLS/bin $BIBLE_TOOLS/src/bible/readers/*.java $BIBLE_TOOLS/src/bible/*.java
|
144 |
+
|
145 |
+
download_data bible.tar.gz "https://github.com/christos-c/bible-corpus/archive/v1.2.1.tar.gz"
|
146 |
+
tar xvzf bible.tar.gz
|
147 |
+
|
148 |
+
cp bible-corpus-1.2.1/bibles/{Greek.xml,English.xml,Nepali.xml} $XML_BIBLES/
|
149 |
+
cp bible-corpus-1.2.1/bibles/{Greek.xml,English-WEB.xml,Nepali.xml} $XML_BIBLES_DUP/
|
150 |
+
|
151 |
+
java -cp $BIBLE_TOOLS/lib/*:$BIBLE_TOOLS/bin bible.CreateMLBooks $XML_BIBLES
|
152 |
+
java -cp $BIBLE_TOOLS/lib/*:$BIBLE_TOOLS/bin bible.CreateMLBooks $XML_BIBLES_DUP
|
153 |
+
java -cp $BIBLE_TOOLS/lib/*:$BIBLE_TOOLS/bin bible.CreateVerseAlignedBooks $XML_BIBLES
|
154 |
+
java -cp $BIBLE_TOOLS/lib/*:$BIBLE_TOOLS/bin bible.CreateVerseAlignedBooks $XML_BIBLES_DUP
|
155 |
+
|
156 |
+
cat $XML_BIBLES/aligned/*/English.txt > $NE_ROOT/bible.$SRC-$NE_TGT.$SRC
|
157 |
+
cat $XML_BIBLES/aligned/*/Nepali.txt > $NE_ROOT/bible.$SRC-$NE_TGT.$NE_TGT
|
158 |
+
cat $XML_BIBLES_DUP/aligned/*/English-WEB.txt > $NE_ROOT/bible_dup.$SRC-$NE_TGT.$SRC
|
159 |
+
cat $XML_BIBLES_DUP/aligned/*/Nepali.txt > $NE_ROOT/bible_dup.$SRC-$NE_TGT.$NE_TGT
|
160 |
+
REMOVE_FILE_PATHS+=( bible-corpus-1.2.1 bible.tar.gz $BIBLE_TOOLS $XML_BIBLES $XML_BIBLES_DUP )
|
161 |
+
|
162 |
+
# Download and extract the Penn Treebank dataset
|
163 |
+
NE_TAGGED=$ROOT/new_submissions_parallel_corpus_project_Nepal
|
164 |
+
NE_TAGGED_URL="http://www.cle.org.pk/Downloads/ling_resources/parallelcorpus/NepaliTaggedCorpus.zip"
|
165 |
+
EN_TAGGED_PATCH_URL="https://dl.fbaipublicfiles.com/fairseq/data/nepali-penn-treebank.en.patch"
|
166 |
+
NE_TAGGED_PATCH_URL="https://dl.fbaipublicfiles.com/fairseq/data/nepali-penn-treebank.ne.patch"
|
167 |
+
MOSES=mosesdecoder
|
168 |
+
MOSES_TOK=$MOSES/scripts/tokenizer
|
169 |
+
EN_PATCH_REGEX="{s:\\\/:\/:g;s/\*\T\*\-\n+//g;s/\-LCB\-/\{/g;s/\-RCB\-/\}/g; s/\-LSB\-/\[/g; s/\-RSB\-/\]/g;s/\-LRB\-/\(/g; s/\-RRB\-/\)/g; s/\'\'/\"/g; s/\`\`/\"/g; s/\ +\'s\ +/\'s /g; s/\ +\'re\ +/\'re /g; s/\"\ +/\"/g; s/\ +\"/\"/g; s/\ n't([\ \.\"])/n't\1/g; s/\r+(.)/\1/g;}"
|
170 |
+
NE_PATCH_REGEX="{s:\p{Cf}::g;s:\\\/:\/:g;s/\*\T\*\-\n+//g;s/\-LCB\-/\{/g;s/\-RCB\-/\}/g; s/\-LSB\-/\[/g; s/\-RSB\-/\]/g;s/\-LRB\-/\(/g; s/\-RRB\-/\)/g; s/\'\'/\"/g; s/\`\`/\"/g; s/\ +\'s\ +/\'s /g; s/\ +\'re\ +/\'re /g; s/\"\ +/\"/g; s/\ +\"/\"/g; s/\ n't([\ \.\"])/n't\1/g; s/\r+(.)/\1/g;}"
|
171 |
+
|
172 |
+
download_data $DATA/nepali-penn-treebank.$SRC.patch $EN_TAGGED_PATCH_URL
|
173 |
+
download_data $DATA/nepali-penn-treebank.$NE_TGT.patch $NE_TAGGED_PATCH_URL
|
174 |
+
download_data original.zip $NE_TAGGED_URL
|
175 |
+
unzip -o original.zip -d $ROOT
|
176 |
+
|
177 |
+
cat $NE_TAGGED/00.txt $NE_TAGGED/01.txt $NE_TAGGED/02.txt > $NE_TAGGED/nepali-penn-treebank.$SRC
|
178 |
+
cat $NE_TAGGED/00ne_revised.txt $NE_TAGGED/01ne_revised.txt $NE_TAGGED/02ne_revised.txt > $NE_TAGGED/nepali-penn-treebank.$NE_TGT
|
179 |
+
|
180 |
+
patch $NE_TAGGED/nepali-penn-treebank.$SRC -i $DATA/nepali-penn-treebank.$SRC.patch -o $NE_TAGGED/nepali-penn-treebank-patched.$SRC
|
181 |
+
patch $NE_TAGGED/nepali-penn-treebank.$NE_TGT -i $DATA/nepali-penn-treebank.$NE_TGT.patch -o $NE_TAGGED/nepali-penn-treebank-patched.$NE_TGT
|
182 |
+
|
183 |
+
if [ ! -e $MOSES ]; then
|
184 |
+
echo "Cloning moses repository..."
|
185 |
+
git clone https://github.com/moses-smt/mosesdecoder.git
|
186 |
+
fi
|
187 |
+
|
188 |
+
cat $NE_TAGGED/nepali-penn-treebank-patched.$SRC | \
|
189 |
+
perl -anpe "$EN_PATCH_REGEX" | \
|
190 |
+
$MOSES_TOK/tokenizer.perl -l $SRC | \
|
191 |
+
$MOSES_TOK/detokenizer.perl -l $SRC > $NE_ROOT/nepali-penn-treebank.$SRC
|
192 |
+
|
193 |
+
cat $NE_TAGGED/nepali-penn-treebank-patched.$NE_TGT | \
|
194 |
+
perl -CIO -anpe "$NE_PATCH_REGEX" | \
|
195 |
+
$MOSES_TOK/detokenizer.perl -l $SRC > $NE_ROOT/nepali-penn-treebank.$NE_TGT
|
196 |
+
|
197 |
+
|
198 |
+
# Download nepali dictionary data
|
199 |
+
NE_DICT=$NE_ROOT/dictionaries
|
200 |
+
download_data $NE_DICT "http://www.seas.upenn.edu/~nlp/resources/TACL-data-release/dictionaries.tar.gz"
|
201 |
+
tar xvzf $NE_DICT
|
202 |
+
cp dictionaries/dict.ne $NE_ROOT/dictionary.$NE_TGT-$SRC
|
203 |
+
REMOVE_FILE_PATHS+=( $NE_DICT dictionaries )
|
204 |
+
|
205 |
+
REMOVE_FILE_PATHS+=( $MOSES $NE_TAGGED original.zip $DATA/nepali-penn-treebank.$SRC.patch $DATA/nepali-penn-treebank.$NE_TGT.patch )
|
206 |
+
|
207 |
+
|
208 |
+
# Remove the temporary files
|
209 |
+
for ((i=0;i<${#REMOVE_FILE_PATHS[@]};++i)); do
|
210 |
+
rm -rf ${REMOVE_FILE_PATHS[i]}
|
211 |
+
done
|
212 |
+
|
213 |
+
# Copy the training data
|
214 |
+
si=si_LK
|
215 |
+
ne=ne_NP
|
216 |
+
en=en_XX
|
217 |
+
cat $SI_ROOT/GNOMEKDEUbuntu.en-si.si $SI_ROOT/OpenSubtitles2018.en-si.si > $DESTDIR/train.$si-$en.$si
|
218 |
+
cat $SI_ROOT/GNOMEKDEUbuntu.en-si.en $SI_ROOT/OpenSubtitles2018.en-si.en > $DESTDIR/train.$si-$en.$en
|
219 |
+
|
220 |
+
cat $NE_ROOT/bible_dup.en-ne.ne $NE_ROOT/bible.en-ne.ne $NE_ROOT/globalvoices.2018q4.ne-en.ne $NE_ROOT/GNOMEKDEUbuntu.en-ne.ne $NE_ROOT/nepali-penn-treebank.ne > $DESTDIR/train.$ne-$en.$ne
|
221 |
+
cat $NE_ROOT/bible_dup.en-ne.en $NE_ROOT/bible.en-ne.en $NE_ROOT/globalvoices.2018q4.ne-en.en $NE_ROOT/GNOMEKDEUbuntu.en-ne.en $NE_ROOT/nepali-penn-treebank.en > $DESTDIR/train.$ne-$en.$en
|
222 |
+
|
223 |
+
|
224 |
+
#Download the test sets
|
225 |
+
wget https://github.com/facebookresearch/flores/raw/master/data/wikipedia_en_ne_si_test_sets.tgz
|
226 |
+
tar -xvzf wikipedia_en_ne_si_test_sets.tgz
|
227 |
+
|
228 |
+
cp wikipedia_en_ne_si_test_sets/wikipedia.dev.ne-en.ne $DESTDIR/valid.$ne-$en.$ne
|
229 |
+
cp wikipedia_en_ne_si_test_sets/wikipedia.dev.ne-en.en $DESTDIR/valid.$ne-$en.$en
|
230 |
+
|
231 |
+
cp wikipedia_en_ne_si_test_sets/wikipedia.dev.si-en.si $DESTDIR/valid.$si-$en.$si
|
232 |
+
cp wikipedia_en_ne_si_test_sets/wikipedia.dev.si-en.en $DESTDIR/valid.$si-$en.$en
|
233 |
+
|
234 |
+
cp wikipedia_en_ne_si_test_sets/wikipedia.devtest.ne-en.ne $DESTDIR/devtest.$ne-$en.$ne
|
235 |
+
cp wikipedia_en_ne_si_test_sets/wikipedia.devtest.ne-en.en $DESTDIR/devtest.$ne-$en.$en
|
236 |
+
|
237 |
+
cp wikipedia_en_ne_si_test_sets/wikipedia.devtest.si-en.si $DESTDIR/devtest.$si-$en.$si
|
238 |
+
cp wikipedia_en_ne_si_test_sets/wikipedia.devtest.si-en.en $DESTDIR/devtest.$si-$en.$en
|
239 |
+
|
240 |
+
cp wikipedia_en_ne_si_test_sets/wikipedia.test.ne-en.ne $DESTDIR/test.$ne-$en.$ne
|
241 |
+
cp wikipedia_en_ne_si_test_sets/wikipedia.test.ne-en.en $DESTDIR/test.$ne-$en.$en
|
242 |
+
|
243 |
+
cp wikipedia_en_ne_si_test_sets/wikipedia.test.si-en.si $DESTDIR/test.$si-$en.$si
|
244 |
+
cp wikipedia_en_ne_si_test_sets/wikipedia.test.si-en.en $DESTDIR/test.$si-$en.$en
|
245 |
+
|
246 |
+
rm -rf wikipedia_en_ne_si_test_sets.tgz wikipedia_en_ne_si_test_sets
|
fairseq/examples/multilingual/data_scripts/download_iitb.sh
ADDED
@@ -0,0 +1,35 @@
1 |
+
#!/bin/bash
|
2 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
3 |
+
# All rights reserved.
|
4 |
+
#
|
5 |
+
# This source code is licensed under the license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
|
8 |
+
|
9 |
+
if [ -z $WORKDIR_ROOT ] ;
|
10 |
+
then
|
11 |
+
echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..."
|
12 |
+
exit
|
13 |
+
fi
|
14 |
+
|
15 |
+
IITB=$WORKDIR_ROOT/IITB
|
16 |
+
mkdir -p $IITB
|
17 |
+
pushd $IITB
|
18 |
+
|
19 |
+
wget http://www.cfilt.iitb.ac.in/~moses/iitb_en_hi_parallel/iitb_corpus_download/parallel.tgz
|
20 |
+
tar -xvzf parallel.tgz
|
21 |
+
|
22 |
+
wget http://www.cfilt.iitb.ac.in/~moses/iitb_en_hi_parallel/iitb_corpus_download/dev_test.tgz
|
23 |
+
tar -xvzf dev_test.tgz
|
24 |
+
|
25 |
+
DESTDIR=${WORKDIR_ROOT}/ML50/raw/
|
26 |
+
|
27 |
+
cp parallel/IITB.en-hi.en $DESTDIR/train.hi_IN-en_XX.en_XX
|
28 |
+
cp parallel/IITB.en-hi.hi $DESTDIR/train.hi_IN-en_XX.hi_IN
|
29 |
+
|
30 |
+
cp dev_test/dev.en $DESTDIR/valid.hi_IN-en_XX.en_XX
|
31 |
+
cp dev_test/dev.hi $DESTDIR/valid.hi_IN-en_XX.hi_IN
|
32 |
+
|
33 |
+
cp dev_test/test.en $DESTDIR/test.hi_IN-en_XX.en_XX
|
34 |
+
cp dev_test/test.hi $DESTDIR/test.hi_IN-en_XX.hi_IN
|
35 |
+
popd
|
fairseq/examples/multilingual/data_scripts/download_iwslt_and_extract.sh
ADDED
@@ -0,0 +1,225 @@
1 |
+
#!/bin/bash
|
2 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
3 |
+
# All rights reserved.
|
4 |
+
#
|
5 |
+
# This source code is licensed under the license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
|
8 |
+
#echo 'Cloning Moses github repository (for tokenization scripts)...'
|
9 |
+
#git clone https://github.com/moses-smt/mosesdecoder.git
|
10 |
+
|
11 |
+
if [ -z $WORKDIR_ROOT ] ;
|
12 |
+
then
|
13 |
+
echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..."
|
14 |
+
exit
|
15 |
+
fi
|
16 |
+
|
17 |
+
|
18 |
+
|
19 |
+
data_root=${WORKDIR_ROOT}/iwsltv2
|
20 |
+
DESTDIR=${WORKDIR_ROOT}/ML50/raw
|
21 |
+
|
22 |
+
|
23 |
+
langs="ar_AR it_IT nl_XX ko_KR vi_VN"
|
24 |
+
echo "data_root: $data_root"
|
25 |
+
|
26 |
+
download_path=${data_root}/downloads
|
27 |
+
raw=${DESTDIR}
|
28 |
+
tmp=${data_root}/tmp
|
29 |
+
orig=${data_root}/orig
|
30 |
+
|
31 |
+
mkdir -p $download_path $orig $raw $tmp
|
32 |
+
#######################
|
33 |
+
download_iwslt(){
|
34 |
+
iwslt_key=$1
|
35 |
+
src=$2
|
36 |
+
tgt=$3
|
37 |
+
save_prefix=$4
|
38 |
+
pushd ${download_path}
|
39 |
+
if [[ ! -f ${save_prefix}$src-$tgt.tgz ]]; then
|
40 |
+
wget https://wit3.fbk.eu/archive/${iwslt_key}/texts/$src/$tgt/$src-$tgt.tgz -O ${save_prefix}$src-$tgt.tgz
|
41 |
+
[ $? -eq 0 ] && return 0
|
42 |
+
fi
|
43 |
+
popd
|
44 |
+
}
|
45 |
+
|
46 |
+
extract_iwslt(){
|
47 |
+
src=$1
|
48 |
+
tgt=$2
|
49 |
+
prefix=$3
|
50 |
+
pushd $orig
|
51 |
+
tar zxvf ${download_path}/${prefix}$src-${tgt}.tgz
|
52 |
+
popd
|
53 |
+
}
|
54 |
+
|
55 |
+
generate_train(){
|
56 |
+
lsrc=$1
|
57 |
+
ltgt=$2
|
58 |
+
src=${lsrc:0:2}
|
59 |
+
tgt=${ltgt:0:2}
|
60 |
+
for ll in $lsrc $ltgt; do
|
61 |
+
l=${ll:0:2}
|
62 |
+
f="$orig/*/train.tags.$src-$tgt.$l"
|
63 |
+
f_raw=$raw/train.$lsrc-$ltgt.$ll
|
64 |
+
cat $f \
|
65 |
+
| grep -v '<url>' \
|
66 |
+
| grep -v '<talkid>' \
|
67 |
+
| grep -v '<keywords>' \
|
68 |
+
| grep -v '<speaker>' \
|
69 |
+
| grep -v '<reviewer' \
|
70 |
+
| grep -v '<translator' \
|
71 |
+
| grep -v '<doc' \
|
72 |
+
| grep -v '</doc>' \
|
73 |
+
| sed -e 's/<title>//g' \
|
74 |
+
| sed -e 's/<\/title>//g' \
|
75 |
+
| sed -e 's/<description>//g' \
|
76 |
+
| sed -e 's/<\/description>//g' \
|
77 |
+
| sed 's/^\s*//g' \
|
78 |
+
| sed 's/\s*$//g' \
|
79 |
+
> $f_raw
|
80 |
+
[ $? -eq 0 ] && echo "extracted $f to $f_raw"
|
81 |
+
done
|
82 |
+
return 0
|
83 |
+
}
|
84 |
+
|
85 |
+
convert_valid_test(){
|
86 |
+
src=$1
|
87 |
+
tgt=$2
|
88 |
+
for l in $src $tgt; do
|
89 |
+
echo "lang: ${l}"
|
90 |
+
for o in `ls $orig/*/IWSLT*.TED*.$src-$tgt.$l.xml`; do
|
91 |
+
fname=${o##*/}
|
92 |
+
f=$tmp/${fname%.*}
|
93 |
+
echo "$o => $f"
|
94 |
+
grep '<seg id' $o \
|
95 |
+
| sed -e 's/<seg id="[0-9]*">\s*//g' \
|
96 |
+
| sed -e 's/\s*<\/seg>\s*//g' \
|
97 |
+
| sed -e "s/\’/\'/g" \
|
98 |
+
> $f
|
99 |
+
echo ""
|
100 |
+
done
|
101 |
+
done
|
102 |
+
}
|
103 |
+
|
104 |
+
generate_subset(){
|
105 |
+
lsrc=$1
|
106 |
+
ltgt=$2
|
107 |
+
src=${lsrc:0:2}
|
108 |
+
tgt=${ltgt:0:2}
|
109 |
+
subset=$3
|
110 |
+
prefix=$4
|
111 |
+
for ll in $lsrc $ltgt; do
|
112 |
+
l=${ll:0:2}
|
113 |
+
f=$tmp/$prefix.${src}-${tgt}.$l
|
114 |
+
if [[ -f $f ]]; then
|
115 |
+
cp $f $raw/$subset.${lsrc}-$ltgt.${ll}
|
116 |
+
fi
|
117 |
+
done
|
118 |
+
}
|
119 |
+
#################
|
120 |
+
|
121 |
+
echo "downloading iwslt training and dev data"
|
122 |
+
# using multilingual for it, nl
|
123 |
+
download_iwslt "2017-01-trnmted" DeEnItNlRo DeEnItNlRo
|
124 |
+
download_iwslt "2017-01-trnted" ar en
|
125 |
+
download_iwslt "2017-01-trnted" en ar
|
126 |
+
download_iwslt "2017-01-trnted" ko en
|
127 |
+
download_iwslt "2017-01-trnted" en ko
|
128 |
+
download_iwslt "2015-01" vi en
|
129 |
+
download_iwslt "2015-01" en vi
|
130 |
+
|
131 |
+
echo "donwloading iwslt test data"
|
132 |
+
download_iwslt "2017-01-mted-test" it en "test."
|
133 |
+
download_iwslt "2017-01-mted-test" en it "test."
|
134 |
+
download_iwslt "2017-01-mted-test" nl en "test."
|
135 |
+
download_iwslt "2017-01-mted-test" en nl "test."
|
136 |
+
|
137 |
+
download_iwslt "2017-01-ted-test" ar en "test."
|
138 |
+
download_iwslt "2017-01-ted-test" en ar "test."
|
139 |
+
download_iwslt "2017-01-ted-test" ko en "test."
|
140 |
+
download_iwslt "2017-01-ted-test" en ko "test."
|
141 |
+
download_iwslt "2015-01-test" vi en "test."
|
142 |
+
download_iwslt "2015-01-test" en vi "test."
|
143 |
+
|
144 |
+
echo "extract training data tar balls"
|
145 |
+
extract_iwslt DeEnItNlRo DeEnItNlRo
|
146 |
+
extract_iwslt ar en
|
147 |
+
extract_iwslt en ar
|
148 |
+
extract_iwslt ko en
|
149 |
+
extract_iwslt en ko
|
150 |
+
extract_iwslt vi en
|
151 |
+
extract_iwslt en vi
|
152 |
+
|
153 |
+
|
154 |
+
echo "extracting iwslt test data"
|
155 |
+
for lang in $langs; do
|
156 |
+
l=${lang:0:2}
|
157 |
+
extract_iwslt $l en "test."
|
158 |
+
extract_iwslt en $l "test."
|
159 |
+
done
|
160 |
+
|
161 |
+
echo "convert dev and test data"
|
162 |
+
for lang in $langs; do
|
163 |
+
s_lang=${lang:0:2}
|
164 |
+
convert_valid_test $s_lang en
|
165 |
+
convert_valid_test en $s_lang
|
166 |
+
done
|
167 |
+
|
168 |
+
|
169 |
+
|
170 |
+
echo "creating training data into $raw"
|
171 |
+
for lang in $langs; do
|
172 |
+
generate_train $lang en_XX
|
173 |
+
generate_train en_XX $lang
|
174 |
+
done
|
175 |
+
|
176 |
+
echo "creating iwslt dev data into raw"
|
177 |
+
generate_subset en_XX vi_VN valid "IWSLT15.TED.tst2013"
|
178 |
+
generate_subset vi_VN en_XX valid "IWSLT15.TED.tst2013"
|
179 |
+
|
180 |
+
generate_subset en_XX ar_AR valid "IWSLT17.TED.tst2016"
|
181 |
+
generate_subset ar_AR en_XX valid "IWSLT17.TED.tst2016"
|
182 |
+
generate_subset en_XX ko_KR valid "IWSLT17.TED.tst2016"
|
183 |
+
generate_subset ko_KR en_XX valid "IWSLT17.TED.tst2016"
|
184 |
+
|
185 |
+
|
186 |
+
generate_subset en_XX it_IT valid "IWSLT17.TED.tst2010"
|
187 |
+
generate_subset it_IT en_XX valid "IWSLT17.TED.tst2010"
|
188 |
+
generate_subset en_XX nl_XX valid "IWSLT17.TED.tst2010"
|
189 |
+
generate_subset nl_XX en_XX valid "IWSLT17.TED.tst2010"
|
190 |
+
|
191 |
+
echo "creating iswslt test data into raw"
|
192 |
+
generate_subset en_XX vi_VN test "IWSLT15.TED.tst2015"
|
193 |
+
generate_subset vi_VN en_XX test "IWSLT15.TED.tst2015"
|
194 |
+
|
195 |
+
generate_subset en_XX ar_AR test "IWSLT17.TED.tst2017"
|
196 |
+
generate_subset ar_AR en_XX test "IWSLT17.TED.tst2017"
|
197 |
+
generate_subset en_XX ko_KR test "IWSLT17.TED.tst2017"
|
198 |
+
generate_subset ko_KR en_XX test "IWSLT17.TED.tst2017"
|
199 |
+
|
200 |
+
generate_subset en_XX it_IT test "IWSLT17.TED.tst2017.mltlng"
|
201 |
+
generate_subset it_IT en_XX test "IWSLT17.TED.tst2017.mltlng"
|
202 |
+
generate_subset en_XX nl_XX test "IWSLT17.TED.tst2017.mltlng"
|
203 |
+
generate_subset nl_XX en_XX test "IWSLT17.TED.tst2017.mltlng"
|
204 |
+
|
205 |
+
# normalze iwslt directions into x-en
|
206 |
+
pushd $raw
|
207 |
+
for lang in $langs; do
|
208 |
+
for split in test valid; do
|
209 |
+
x_en_f1=$split.$lang-en_XX.en_XX
|
210 |
+
x_en_f2=$split.$lang-en_XX.${lang}
|
211 |
+
|
212 |
+
en_x_f1=$split.en_XX-$lang.en_XX
|
213 |
+
en_x_f2=$split.en_XX-$lang.${lang}
|
214 |
+
|
215 |
+
if [ -f $en_x_f1 ] && [ ! -f $x_en_f1 ]; then
|
216 |
+
echo "cp $en_x_f1 $x_en_f1"
|
217 |
+
cp $en_x_f1 $x_en_f1
|
218 |
+
fi
|
219 |
+
if [ -f $x_en_f2 ] && [ ! -f $x_en_f2 ]; then
|
220 |
+
echo "cp $en_x_f2 $x_en_f2"
|
221 |
+
cp $en_x_f2 $x_en_f2
|
222 |
+
fi
|
223 |
+
done
|
224 |
+
done
|
225 |
+
popd
|
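For reference, a minimal sketch (Python, illustrative only; the directory and the example codes below are assumptions, not part of the script) of the <split>.<src>-<tgt>.<lang> naming convention that generate_subset and the normalization loop above write into the raw directory:

import os

def raw_file_name(raw_dir, split, src, tgt, lang):
    # e.g. raw_file_name("ML50/raw", "valid", "vi_VN", "en_XX", "en_XX")
    #      -> "ML50/raw/valid.vi_VN-en_XX.en_XX"
    return os.path.join(raw_dir, f"{split}.{src}-{tgt}.{lang}")

print(raw_file_name("ML50/raw", "valid", "vi_VN", "en_XX", "en_XX"))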
fairseq/examples/multilingual/data_scripts/download_lotus.sh
ADDED
@@ -0,0 +1,46 @@
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.


if [ -z $WORKDIR_ROOT ] ;
then
    echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exiting..."
    exit
fi


SRCDIR=$WORKDIR_ROOT/indic_languages_corpus
DESTDIR=${WORKDIR_ROOT}/ML50/raw/
mkdir -p $SRCDIR
mkdir -p $DESTDIR

cd $SRCDIR
wget http://lotus.kuee.kyoto-u.ac.jp/WAT/indic-multilingual/indic_languages_corpus.tar.gz
tar -xvzf indic_languages_corpus.tar.gz

SRC_EXTRACT_DIR=$SRCDIR/indic_languages_corpus/bilingual

cp $SRC_EXTRACT_DIR/ml-en/train.ml $DESTDIR/train.ml_IN-en_XX.ml_IN
cp $SRC_EXTRACT_DIR/ml-en/train.en $DESTDIR/train.ml_IN-en_XX.en_XX
cp $SRC_EXTRACT_DIR/ml-en/dev.ml $DESTDIR/valid.ml_IN-en_XX.ml_IN
cp $SRC_EXTRACT_DIR/ml-en/dev.en $DESTDIR/valid.ml_IN-en_XX.en_XX
cp $SRC_EXTRACT_DIR/ml-en/test.ml $DESTDIR/test.ml_IN-en_XX.ml_IN
cp $SRC_EXTRACT_DIR/ml-en/test.en $DESTDIR/test.ml_IN-en_XX.en_XX

cp $SRC_EXTRACT_DIR/ur-en/train.ur $DESTDIR/train.ur_PK-en_XX.ur_PK
cp $SRC_EXTRACT_DIR/ur-en/train.en $DESTDIR/train.ur_PK-en_XX.en_XX
cp $SRC_EXTRACT_DIR/ur-en/dev.ur $DESTDIR/valid.ur_PK-en_XX.ur_PK
cp $SRC_EXTRACT_DIR/ur-en/dev.en $DESTDIR/valid.ur_PK-en_XX.en_XX
cp $SRC_EXTRACT_DIR/ur-en/test.ur $DESTDIR/test.ur_PK-en_XX.ur_PK
cp $SRC_EXTRACT_DIR/ur-en/test.en $DESTDIR/test.ur_PK-en_XX.en_XX

cp $SRC_EXTRACT_DIR/te-en/train.te $DESTDIR/train.te_IN-en_XX.te_IN
cp $SRC_EXTRACT_DIR/te-en/train.en $DESTDIR/train.te_IN-en_XX.en_XX
cp $SRC_EXTRACT_DIR/te-en/dev.te $DESTDIR/valid.te_IN-en_XX.te_IN
cp $SRC_EXTRACT_DIR/te-en/dev.en $DESTDIR/valid.te_IN-en_XX.en_XX
cp $SRC_EXTRACT_DIR/te-en/test.te $DESTDIR/test.te_IN-en_XX.te_IN
cp $SRC_EXTRACT_DIR/te-en/test.en $DESTDIR/test.te_IN-en_XX.en_XX
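After running download_lotus.sh it can be worth confirming that each copied source/target pair is line-aligned before binarization. A minimal sanity-check sketch, assuming the ML50/raw layout produced above (the pair names and WORKDIR_ROOT come from the script; everything else is illustrative):

import os

raw = os.path.join(os.environ["WORKDIR_ROOT"], "ML50", "raw")
pairs = ["ml_IN-en_XX", "ur_PK-en_XX", "te_IN-en_XX"]
for split in ("train", "valid", "test"):
    for pair in pairs:
        src_lang, tgt_lang = pair.split("-")
        src = os.path.join(raw, f"{split}.{pair}.{src_lang}")
        tgt = os.path.join(raw, f"{split}.{pair}.{tgt_lang}")
        # count lines on both sides; a mismatch points to a broken download or copy
        with open(src, encoding="utf8") as f_s, open(tgt, encoding="utf8") as f_t:
            n_s = sum(1 for _ in f_s)
            n_t = sum(1 for _ in f_t)
        assert n_s == n_t, f"{split}.{pair}: {n_s} != {n_t} lines"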
fairseq/examples/multilingual/data_scripts/download_ted_and_extract.py
ADDED
@@ -0,0 +1,338 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.


import itertools
import os
import csv
from collections import defaultdict
from six.moves import zip
import io
import wget
import sys

from subprocess import check_call, check_output

# scripts and data locations
CWD = os.getcwd()
UTILS = f"{CWD}/utils"

MOSES = f"{UTILS}/mosesdecoder"

WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None)

if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip():
    print('please specify your working directory root in OS environment variable WORKDIR_ROOT. Exiting...')
    sys.exit(-1)


# please download mosesdecoder here:
detok_cmd = f'{MOSES}/scripts/tokenizer/detokenizer.perl'


def call(cmd):
    print(f"Executing: {cmd}")
    check_call(cmd, shell=True)

class MultiLingualAlignedCorpusReader(object):
    """A class to read TED talk dataset
    """

    def __init__(self, corpus_path, delimiter='\t',
                 target_token=True, bilingual=True, corpus_type='file',
                 lang_dict={'source': ['fr'], 'target': ['en']},
                 eval_lang_dict=None, zero_shot=False,
                 detok=True,
                 ):

        self.empty_line_flag = 'NULL'
        self.corpus_path = corpus_path
        self.delimiter = delimiter
        self.bilingual = bilingual
        self.lang_dict = lang_dict
        self.lang_set = set()
        self.target_token = target_token
        self.zero_shot = zero_shot
        self.eval_lang_dict = eval_lang_dict
        self.corpus_type = corpus_type
        self.detok = detok

        for list_ in self.lang_dict.values():
            for lang in list_:
                self.lang_set.add(lang)

        self.data = dict()
        self.data['train'] = self.read_aligned_corpus(split_type='train')
        self.data['test'] = self.read_aligned_corpus(split_type='test')
        self.data['dev'] = self.read_aligned_corpus(split_type='dev')

    def read_data(self, file_loc_):
        data_list = list()
        with io.open(file_loc_, 'r', encoding='utf8') as fp:
            for line in fp:
                try:
                    text = line.strip()
                except IndexError:
                    text = self.empty_line_flag
                data_list.append(text)
        return data_list

    def filter_text(self, dict_):
        if self.target_token:
            field_index = 1
        else:
            field_index = 0
        data_dict = defaultdict(list)
        list1 = dict_['source']
        list2 = dict_['target']
        for sent1, sent2 in zip(list1, list2):
            try:
                src_sent = ' '.join(sent1.split()[field_index:])
            except IndexError:
                src_sent = 'NULL'

            if src_sent.find(self.empty_line_flag) != -1 or len(src_sent) == 0:
                continue

            elif sent2.find(self.empty_line_flag) != -1 or len(sent2) == 0:
                continue

            else:
                data_dict['source'].append(sent1)
                data_dict['target'].append(sent2)
        return data_dict

    def read_file(self, split_type, data_type):
        return self.data[split_type][data_type]

    def save_file(self, path_, split_type, data_type, lang):
        tok_file = tok_file_name(path_, lang)
        with io.open(tok_file, 'w', encoding='utf8') as fp:
            for line in self.data[split_type][data_type]:
                fp.write(line + '\n')
        if self.detok:
            de_tok(tok_file, lang)

    def add_target_token(self, list_, lang_id):
        new_list = list()
        token = '__' + lang_id + '__'
        for sent in list_:
            new_list.append(token + ' ' + sent)
        return new_list

    def read_from_single_file(self, path_, s_lang, t_lang):
        data_dict = defaultdict(list)
        with io.open(path_, 'r', encoding='utf8') as fp:
            reader = csv.DictReader(fp, delimiter='\t', quoting=csv.QUOTE_NONE)
            for row in reader:
                data_dict['source'].append(row[s_lang])
                data_dict['target'].append(row[t_lang])

        if self.target_token:
            text = self.add_target_token(data_dict['source'], t_lang)
            data_dict['source'] = text

        return data_dict['source'], data_dict['target']

    def read_aligned_corpus(self, split_type='train'):
        data_dict = defaultdict(list)
        iterable = []
        s_list = []
        t_list = []

        if self.zero_shot:
            if split_type == "train":
                iterable = zip(self.lang_dict['source'], self.lang_dict['target'])
            else:
                iterable = zip(self.eval_lang_dict['source'], self.eval_lang_dict['target'])

        elif self.bilingual:
            iterable = itertools.product(self.lang_dict['source'], self.lang_dict['target'])

        for s_lang, t_lang in iterable:
            if s_lang == t_lang:
                continue
            if self.corpus_type == 'file':
                split_type_file_path = os.path.join(self.corpus_path,
                                                    "all_talks_{}.tsv".format(split_type))
                s_list, t_list = self.read_from_single_file(split_type_file_path,
                                                            s_lang=s_lang,
                                                            t_lang=t_lang)
            data_dict['source'] += s_list
            data_dict['target'] += t_list
        new_data_dict = self.filter_text(data_dict)
        return new_data_dict


def read_langs(corpus_path):
    split_type_file_path = os.path.join(corpus_path, 'extracted',
                                        "all_talks_dev.tsv")
    with io.open(split_type_file_path, 'r', encoding='utf8') as fp:
        reader = csv.DictReader(fp, delimiter='\t', quoting=csv.QUOTE_NONE)
        header = next(reader)
        return [k for k in header.keys() if k != 'talk_name']

def extra_english(corpus_path, split):
    split_type_file_path = os.path.join(corpus_path,
                                        f"all_talks_{split}.tsv")
    output_split_type_file_path = os.path.join(corpus_path,
                                               f"all_talks_{split}.en")
    with io.open(split_type_file_path, 'r', encoding='utf8') as fp, io.open(output_split_type_file_path, 'w', encoding='utf8') as fw:
        reader = csv.DictReader(fp, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            line = row['en']
            fw.write(line + '\n')
    de_tok(output_split_type_file_path, 'en')



def tok_file_name(filename, lang):
    seps = filename.split('.')
    seps.insert(-1, 'tok')
    tok_file = '.'.join(seps)
    return tok_file

def de_tok(tok_file, lang):
    # seps = tok_file.split('.')
    # seps.insert(-1, 'detok')
    # de_tok_file = '.'.join(seps)
    de_tok_file = tok_file.replace('.tok.', '.')
    cmd = 'perl {detok_cmd} -l {lang} < {tok_file} > {de_tok_file}'.format(
        detok_cmd=detok_cmd, tok_file=tok_file,
        de_tok_file=de_tok_file, lang=lang[:2])
    call(cmd)

def extra_bitex(
    ted_data_path,
    lsrc_lang,
    ltrg_lang,
    target_token,
    output_data_path,
):
    def get_ted_lang(lang):
        long_langs = ['pt-br', 'zh-cn', 'zh-tw', 'fr-ca']
        if lang[:5] in long_langs:
            return lang[:5]
        elif lang[:4] == 'calv':
            return lang[:5]
        elif lang in ['pt_BR', 'zh_CN', 'zh_TW', 'fr_CA']:
            return lang.lower().replace('_', '-')
        return lang[:2]
    src_lang = get_ted_lang(lsrc_lang)
    trg_lang = get_ted_lang(ltrg_lang)
    train_lang_dict = {'source': [src_lang], 'target': [trg_lang]}
    eval_lang_dict = {'source': [src_lang], 'target': [trg_lang]}

    obj = MultiLingualAlignedCorpusReader(corpus_path=ted_data_path,
                                          lang_dict=train_lang_dict,
                                          target_token=target_token,
                                          corpus_type='file',
                                          eval_lang_dict=eval_lang_dict,
                                          zero_shot=False,
                                          bilingual=True)

    os.makedirs(output_data_path, exist_ok=True)
    lsrc_lang = lsrc_lang.replace('-', '_')
    ltrg_lang = ltrg_lang.replace('-', '_')
    obj.save_file(output_data_path + f"/train.{lsrc_lang}-{ltrg_lang}.{lsrc_lang}",
                  split_type='train', data_type='source', lang=src_lang)
    obj.save_file(output_data_path + f"/train.{lsrc_lang}-{ltrg_lang}.{ltrg_lang}",
                  split_type='train', data_type='target', lang=trg_lang)

    obj.save_file(output_data_path + f"/test.{lsrc_lang}-{ltrg_lang}.{lsrc_lang}",
                  split_type='test', data_type='source', lang=src_lang)
    obj.save_file(output_data_path + f"/test.{lsrc_lang}-{ltrg_lang}.{ltrg_lang}",
                  split_type='test', data_type='target', lang=trg_lang)

    obj.save_file(output_data_path + f"/valid.{lsrc_lang}-{ltrg_lang}.{lsrc_lang}",
                  split_type='dev', data_type='source', lang=src_lang)
    obj.save_file(output_data_path + f"/valid.{lsrc_lang}-{ltrg_lang}.{ltrg_lang}",
                  split_type='dev', data_type='target', lang=trg_lang)


def bar_custom(current, total, width=80):
    print("Downloading: %d%% [%d / %d] Ks" % (current / total * 100, current / 1000, total / 1000), end='\r')


def download_and_extract(download_to, extract_to):
    url = 'http://phontron.com/data/ted_talks.tar.gz'
    filename = f"{download_to}/ted_talks.tar.gz"
    if os.path.exists(filename):
        print(f'{filename} has already been downloaded so skip')
    else:
        filename = wget.download(url, filename, bar=bar_custom)
    if os.path.exists(f'{extract_to}/all_talks_train.tsv'):
        print('Already extracted so skip')
    else:
        extract_cmd = f'tar xzfv "{filename}" -C "{extract_to}"'
        call(extract_cmd)


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--ted_data_path', type=str, default=WORKDIR_ROOT, required=False)
    parser.add_argument(
        '--direction-list',
        type=str,
        # default=None,
        # for ML50
        default=(
            "bn_IN-en_XX,he_IL-en_XX,fa_IR-en_XX,id_ID-en_XX,sv_SE-en_XX,pt_XX-en_XX,ka_GE-en_XX,ka_GE-en_XX,th_TH-en_XX,"
            "mr_IN-en_XX,hr_HR-en_XX,uk_UA-en_XX,az_AZ-en_XX,mk_MK-en_XX,gl_ES-en_XX,sl_SI-en_XX,mn_MN-en_XX,"
            # non-english directions
            # "fr_XX-de_DE,"  # replaced with wmt20
            # "ja_XX-ko_KR,es_XX-pt_XX,ru_RU-sv_SE,hi_IN-bn_IN,id_ID-ar_AR,cs_CZ-pl_PL,ar_AR-tr_TR"
        ),
        required=False)
    parser.add_argument('--target-token', action='store_true', default=False)
    parser.add_argument('--extract-all-english', action='store_true', default=False)

    args = parser.parse_args()

    import sys
    import json

    # TED Talks data directory
    ted_data_path = args.ted_data_path

    download_to = f'{ted_data_path}/downloads'
    extract_to = f'{ted_data_path}/extracted'

    # DESTDIR=${WORKDIR_ROOT}/ML50/raw/
    output_path = f'{ted_data_path}/ML50/raw'
    os.makedirs(download_to, exist_ok=True)
    os.makedirs(extract_to, exist_ok=True)
    os.makedirs(output_path, exist_ok=True)
    download_and_extract(download_to, extract_to)


    if args.extract_all_english:
        for split in ['train', 'dev', 'test']:
            extra_english(ted_data_path, split)
        exit(0)
    if args.direction_list is not None:
        directions = args.direction_list.strip().split(',')
        directions = [tuple(d.strip().split('-', 1)) for d in directions if d]
    else:
        langs = read_langs(ted_data_path)
        # directions = [
        #     '{}.{}'.format(src, tgt)
        #     for src in langs
        #     for tgt in langs
        #     if src < tgt
        # ]
        directions = [('en', tgt) for tgt in langs if tgt != 'en']
    print(f'num directions={len(directions)}: {directions}')

    for src_lang, trg_lang in directions:
        print('--working on {}-{}'.format(src_lang, trg_lang))
        extra_bitex(
            extract_to,
            src_lang,
            trg_lang,
            target_token=args.target_token,
            output_data_path=output_path
        )
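The --direction-list default in the script above is a single comma-separated string of xx_XX-yy_YY pairs. A minimal sketch of how such a string is parsed into (source, target) tuples, mirroring the split('-', 1) logic in the __main__ block (the example string is illustrative only):

directions_arg = "bn_IN-en_XX,he_IL-en_XX,mr_IN-en_XX"
directions = [tuple(d.strip().split("-", 1)) for d in directions_arg.strip().split(",") if d]
print(directions)  # [('bn_IN', 'en_XX'), ('he_IL', 'en_XX'), ('mr_IN', 'en_XX')]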
fairseq/examples/multilingual/data_scripts/download_wat19_my.sh
ADDED
@@ -0,0 +1,36 @@
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.


if [ -z $WORKDIR_ROOT ] ;
then
    echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exiting..."
    exit
fi


SRCDIR=$WORKDIR_ROOT/indic_languages_corpus
DESTDIR=$WORKDIR_ROOT/ML50/raw
mkdir -p $SRCDIR
mkdir -p $DESTDIR

WAT_MY_EN=wat2020.my-en.zip
cd $SRCDIR
# please refer to http://lotus.kuee.kyoto-u.ac.jp/WAT/my-en-data/ for the latest URL if the following one has expired
# - The data used for WAT2020 are identical to those used in WAT2019.
wget http://lotus.kuee.kyoto-u.ac.jp/WAT/my-en-data/$WAT_MY_EN
unzip $WAT_MY_EN


SRC_EXTRACT_DIR=$SRCDIR/wat2020.my-en/alt

cp $SRC_EXTRACT_DIR/train.alt.en $DESTDIR/train.my_MM-en_XX.en_XX
cp $SRC_EXTRACT_DIR/train.alt.my $DESTDIR/train.my_MM-en_XX.my_MM
cp $SRC_EXTRACT_DIR/dev.alt.en $DESTDIR/valid.my_MM-en_XX.en_XX
cp $SRC_EXTRACT_DIR/dev.alt.my $DESTDIR/valid.my_MM-en_XX.my_MM
cp $SRC_EXTRACT_DIR/test.alt.en $DESTDIR/test.my_MM-en_XX.en_XX
cp $SRC_EXTRACT_DIR/test.alt.my $DESTDIR/test.my_MM-en_XX.my_MM
fairseq/examples/multilingual/data_scripts/download_wmt19_and_before.py
ADDED
@@ -0,0 +1,899 @@
1 |
+
from typing import NamedTuple, List
|
2 |
+
from urllib.parse import urlparse
|
3 |
+
import os, sys
|
4 |
+
import subprocess
|
5 |
+
from subprocess import check_call, check_output
|
6 |
+
import glob
|
7 |
+
import wget
|
8 |
+
import re
|
9 |
+
import multiprocessing as mp
|
10 |
+
from functools import partial
|
11 |
+
import pathlib
|
12 |
+
from collections import OrderedDict
|
13 |
+
|
14 |
+
WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None)
|
15 |
+
|
16 |
+
if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip():
|
17 |
+
print('please specify your working directory root in OS environment variable WORKDIR_ROOT. Exitting..."')
|
18 |
+
sys.exit(-1)
|
19 |
+
|
20 |
+
# scripts and data locations
|
21 |
+
CWD = os.getcwd()
|
22 |
+
UTILS = f"{CWD}/utils"
|
23 |
+
|
24 |
+
MOSES = f"{UTILS}/mosesdecoder"
|
25 |
+
SGM_TOOL = f'{MOSES}/scripts/ems/support/input-from-sgm.perl'
|
26 |
+
|
27 |
+
TMX2CORPUS = f"{UTILS}/tmx2corpus"
|
28 |
+
TMX_TOOL = f'python {TMX2CORPUS}/tmx2corpus.py'
|
29 |
+
|
30 |
+
to_data_path = f'{WORKDIR_ROOT}/wmt'
|
31 |
+
download_to = f'{to_data_path}/downloads'
|
32 |
+
manually_downloads = f'{to_data_path}/downloads'
|
33 |
+
extract_to = f'{to_data_path}/extracted'
|
34 |
+
#DESTDIR=${WORKDIR_ROOT}/ML50/raw/
|
35 |
+
raw_data = f'{WORKDIR_ROOT}/ML50/raw'
|
36 |
+
####
|
37 |
+
|
38 |
+
class DLDataset(NamedTuple):
|
39 |
+
name: str
|
40 |
+
train_urls: List[str]
|
41 |
+
valid_urls: List[str]
|
42 |
+
test_urls: List[str]
|
43 |
+
train_files_patterns: List[str] = []
|
44 |
+
valid_files_patterns: List[str] = []
|
45 |
+
test_files_patterns: List[str] = []
|
46 |
+
|
47 |
+
|
48 |
+
|
49 |
+
def bar_custom(current, total, width=80):
|
50 |
+
print("Downloading: %d%% [%d / %d] Ks" % (current / total * 100, current / 1000, total / 1000), end='\r')
|
51 |
+
|
52 |
+
def get_downloaded_file(dl_folder, url):
|
53 |
+
if isinstance(url, tuple):
|
54 |
+
url, f = url
|
55 |
+
else:
|
56 |
+
url_f = urlparse(url)
|
57 |
+
# f = os.path.split(url_f.path)[-1]
|
58 |
+
f = '_'.join(url_f.path.split('/')[1:])
|
59 |
+
return url, f"{dl_folder}/{f}"
|
60 |
+
|
61 |
+
def download_parts_and_combine(dl_folder, urls, filename):
|
62 |
+
parts = []
|
63 |
+
for url_record in urls:
|
64 |
+
url, part_file = get_downloaded_file(dl_folder, url_record)
|
65 |
+
if os.path.exists(part_file):
|
66 |
+
print(f'{part_file} has already been downloaded so skip')
|
67 |
+
else:
|
68 |
+
part_file = wget.download(url, part_file, bar=bar_custom)
|
69 |
+
parts.append(part_file)
|
70 |
+
|
71 |
+
def get_combine_cmd(parts):
|
72 |
+
#default as tar.gz.??
|
73 |
+
return f'cat {" ".join(parts)} > {filename}'
|
74 |
+
|
75 |
+
combine_cmd = get_combine_cmd(parts)
|
76 |
+
call(combine_cmd, debug=True)
|
77 |
+
return filename
|
78 |
+
|
79 |
+
def download_a_url(dl_folder, url):
|
80 |
+
url, filename = get_downloaded_file(dl_folder, url)
|
81 |
+
if os.path.exists(filename):
|
82 |
+
print(f'{filename} has already been downloaded so skip')
|
83 |
+
return filename
|
84 |
+
|
85 |
+
print(f'downloading {url} to {filename}')
|
86 |
+
if isinstance(url, list) or isinstance(url, tuple):
|
87 |
+
download_parts_and_combine(dl_folder, url, filename)
|
88 |
+
else:
|
89 |
+
wget.download(url, filename, bar=bar_custom)
|
90 |
+
print(f'dowloaded: {filename}')
|
91 |
+
return filename
|
92 |
+
|
93 |
+
def download_files(dl_folder, urls, completed_urls={}):
|
94 |
+
for url_record in urls:
|
95 |
+
url, _ = get_downloaded_file(dl_folder, url_record)
|
96 |
+
filename = download_a_url(dl_folder, url_record)
|
97 |
+
completed_urls[str(url)] = filename
|
98 |
+
return completed_urls
|
99 |
+
|
100 |
+
def check_need_manual_downalod(dl_folder, to_manually_download_urls):
|
101 |
+
to_be_manually_dowloaded = []
|
102 |
+
manually_completed_urls = {}
|
103 |
+
for url_record, instruction in to_manually_download_urls:
|
104 |
+
url, filename = get_downloaded_file(dl_folder, url_record)
|
105 |
+
if not os.path.exists(filename):
|
106 |
+
print(f'{url} need to be download manually, please download it manually following {instruction}; and copy it to {filename}')
|
107 |
+
to_be_manually_dowloaded.append((url, filename))
|
108 |
+
else:
|
109 |
+
manually_completed_urls[url] = filename
|
110 |
+
# if len(to_be_manually_dowloaded) > 0:
|
111 |
+
# raise ValueError('Missing files that need to be downloaded manually; stop the process now.')
|
112 |
+
return to_be_manually_dowloaded
|
113 |
+
|
114 |
+
def download_dataset(to_folder, dl_dataset, completed_urls={}):
|
115 |
+
download_files(to_folder, dl_dataset.train_urls, completed_urls)
|
116 |
+
download_files(to_folder, dl_dataset.valid_urls, completed_urls)
|
117 |
+
download_files(to_folder, dl_dataset.test_urls, completed_urls)
|
118 |
+
print('completed downloading')
|
119 |
+
return completed_urls
|
120 |
+
|
121 |
+
def call(cmd, debug=False):
|
122 |
+
if debug:
|
123 |
+
print(cmd)
|
124 |
+
check_call(cmd, shell=True)
|
125 |
+
|
126 |
+
|
127 |
+
def get_extract_name(file_path):
|
128 |
+
path = os.path.split(file_path)
|
129 |
+
return path[-1] + '_extract' #.split('.')[0]
|
130 |
+
|
131 |
+
def extract_file(downloaded_file, extract_folder, get_extract_name=get_extract_name, debug=False):
|
132 |
+
extract_name = get_extract_name(downloaded_file)
|
133 |
+
extract_to = f'{extract_folder}/{extract_name}'
|
134 |
+
os.makedirs(extract_to, exist_ok=True)
|
135 |
+
if os.path.exists(f'{extract_to}/DONE'):
|
136 |
+
print(f'{downloaded_file} has already been extracted to {extract_to} so skip')
|
137 |
+
return extract_to
|
138 |
+
def get_extract_cmd(filename):
|
139 |
+
if filename.endswith('.tgz') or filename.endswith('tar.gz'):
|
140 |
+
return f'tar xzfv {filename} -C {extract_to}'
|
141 |
+
elif filename.endswith('.gz.tar'):
|
142 |
+
return f'tar xfv {filename} -C {extract_to}; (cd {extract_to}; gzip -d *.gz; [ $? -eq 0 ] || gzip -d */*.gz)'
|
143 |
+
elif filename.endswith('.tar'):
|
144 |
+
return f'tar xfv {filename} -C {extract_to}'
|
145 |
+
elif filename.endswith('.gz'):
|
146 |
+
return f'cp {filename} {extract_to}; (cd {extract_to}; gzip -d *.gz)'
|
147 |
+
elif filename.endswith('.zip'):
|
148 |
+
return f'unzip {filename} -d {extract_to}'
|
149 |
+
extract_cmd = get_extract_cmd(downloaded_file)
|
150 |
+
print(f'extracting {downloaded_file}')
|
151 |
+
if isinstance(extract_cmd, list):
|
152 |
+
for c in extract_cmd:
|
153 |
+
call(c, debug=debug)
|
154 |
+
else:
|
155 |
+
call(extract_cmd, debug=debug)
|
156 |
+
call(f'echo DONE > {extract_to}/DONE')
|
157 |
+
return extract_to
|
158 |
+
|
159 |
+
|
160 |
+
def extract_all_files(
|
161 |
+
completed_urls, extract_folder,
|
162 |
+
get_extract_name=get_extract_name,
|
163 |
+
completed_extraction={},
|
164 |
+
debug=False):
|
165 |
+
extracted_folders = OrderedDict()
|
166 |
+
for url, downloaded_file in set(completed_urls.items()):
|
167 |
+
if downloaded_file in completed_extraction:
|
168 |
+
print(f'{downloaded_file} is already extracted; so skip')
|
169 |
+
continue
|
170 |
+
folder = extract_file(downloaded_file, extract_folder, get_extract_name, debug)
|
171 |
+
extracted_folders[url] = folder
|
172 |
+
return extracted_folders
|
173 |
+
|
174 |
+
|
175 |
+
def my_glob(folder):
|
176 |
+
for p in [f'{folder}/*', f'{folder}/*/*', f'{folder}/*/*/*']:
|
177 |
+
for f in glob.glob(p):
|
178 |
+
yield f
|
179 |
+
|
180 |
+
|
181 |
+
def sgm2raw(sgm, debug):
|
182 |
+
to_file = sgm[0:len(sgm) - len('.sgm')]
|
183 |
+
if os.path.exists(to_file):
|
184 |
+
debug and print(f'{sgm} already converted to {to_file}; so skip')
|
185 |
+
return to_file
|
186 |
+
cmd = f'{SGM_TOOL} < {sgm} > {to_file}'
|
187 |
+
call(cmd, debug)
|
188 |
+
return to_file
|
189 |
+
|
190 |
+
def tmx2raw(tmx, debug):
|
191 |
+
to_file = tmx[0:len(tmx) - len('.tmx')]
|
192 |
+
to_folder = os.path.join(*os.path.split(tmx)[:-1])
|
193 |
+
if os.path.exists(f'{to_folder}/bitext.en'):
|
194 |
+
debug and print(f'{tmx} already extracted to {to_file}; so skip')
|
195 |
+
return to_file
|
196 |
+
cmd = f'(cd {to_folder}; {TMX_TOOL} {tmx})'
|
197 |
+
call(cmd, debug)
|
198 |
+
return to_file
|
199 |
+
|
200 |
+
CZENG16_REGEX = re.compile(r'.*?data.plaintext-format/0[0-9]train$')
|
201 |
+
WMT19_WIKITITLES_REGEX = re.compile(r'.*?wikititles-v1.(\w\w)-en.tsv.gz')
|
202 |
+
TSV_REGEX = re.compile(r'.*?(\w\w)-(\w\w).tsv$')
|
203 |
+
|
204 |
+
|
205 |
+
|
206 |
+
def cut_wikitles(wiki_file, debug):
|
207 |
+
# different languages have different file names:
|
208 |
+
if wiki_file.endswith('wiki/fi-en/titles.fi-en'):
|
209 |
+
to_file1 = f'{wiki_file}.fi'
|
210 |
+
to_file2 = f'{wiki_file}.en'
|
211 |
+
BACKSLASH = '\\'
|
212 |
+
cmd1 = f"cat {wiki_file} | sed 's/|||/{BACKSLASH}t/g' |cut -f1 |awk '{{$1=$1}};1' > {to_file1}"
|
213 |
+
cmd2 = f"cat {wiki_file} | sed 's/|||/{BACKSLASH}t/g' |cut -f2 |awk '{{$1=$1}};1' > {to_file2}"
|
214 |
+
# elif WMT19_WIKITITLES_REGEX.match(wiki_file):
|
215 |
+
# src = WMT19_WIKITITLES_REGEX.match(wiki_file).groups()[0]
|
216 |
+
# to_file1 = f'{wiki_file}.{src}'
|
217 |
+
# to_file2 = f'{wiki_file}.en'
|
218 |
+
# cmd1 = f"cat {wiki_file} | cut -f1 |awk '{{$1=$1}};1' > {to_file1}"
|
219 |
+
# cmd2 = f"cat {wiki_file} | cut -f2 |awk '{{$1=$1}};1' > {to_file2}"
|
220 |
+
else:
|
221 |
+
return None
|
222 |
+
if os.path.exists(to_file1) and os.path.exists(to_file2):
|
223 |
+
debug and print(f'{wiki_file} already processed to {to_file1} and {to_file2}; so skip')
|
224 |
+
return wiki_file
|
225 |
+
|
226 |
+
call(cmd1, debug=debug)
|
227 |
+
call(cmd2, debug=debug)
|
228 |
+
return wiki_file
|
229 |
+
|
230 |
+
def cut_tsv(file, debug):
|
231 |
+
m = TSV_REGEX.match(file)
|
232 |
+
if m is None:
|
233 |
+
raise ValueError(f'{file} is not matching tsv pattern')
|
234 |
+
src = m.groups()[0]
|
235 |
+
tgt = m.groups()[1]
|
236 |
+
|
237 |
+
to_file1 = f'{file}.{src}'
|
238 |
+
to_file2 = f'{file}.{tgt}'
|
239 |
+
cmd1 = f"cat {file} | cut -f1 |awk '{{$1=$1}};1' > {to_file1}"
|
240 |
+
cmd2 = f"cat {file} | cut -f2 |awk '{{$1=$1}};1' > {to_file2}"
|
241 |
+
if os.path.exists(to_file1) and os.path.exists(to_file2):
|
242 |
+
debug and print(f'{file} already processed to {to_file1} and {to_file2}; so skip')
|
243 |
+
return file
|
244 |
+
|
245 |
+
call(cmd1, debug=debug)
|
246 |
+
call(cmd2, debug=debug)
|
247 |
+
return file
|
248 |
+
|
249 |
+
|
250 |
+
def convert_file_if_needed(file, debug):
|
251 |
+
if file.endswith('.sgm'):
|
252 |
+
return sgm2raw(file, debug)
|
253 |
+
elif file.endswith('.tmx'):
|
254 |
+
return tmx2raw(file, debug)
|
255 |
+
elif file.endswith('wiki/fi-en/titles.fi-en'):
|
256 |
+
return cut_wikitles(file, debug)
|
257 |
+
# elif WMT19_WIKITITLES_REGEX.match(file):
|
258 |
+
# return cut_wikitles(file, debug)
|
259 |
+
elif file.endswith('.tsv'):
|
260 |
+
return cut_tsv(file, debug)
|
261 |
+
elif CZENG16_REGEX.match(file):
|
262 |
+
return convert2czeng17(file, debug)
|
263 |
+
else:
|
264 |
+
return file
|
265 |
+
|
266 |
+
|
267 |
+
def convert_files_if_needed(extracted_foldrs, my_glob=my_glob, debug=False):
|
268 |
+
return {
|
269 |
+
url: list(sorted(set(convert_file_if_needed(f, debug)) for f in sorted(set(my_glob(folder)))))
|
270 |
+
for url, folder in extracted_foldrs.items()
|
271 |
+
}
|
272 |
+
|
273 |
+
def match_patt(file_path, file_pattern, src, tgt, lang):
|
274 |
+
return file_pattern.format(src=src, tgt=tgt, lang=lang) in file_path
|
275 |
+
|
276 |
+
def match_patts(file_path, file_patterns, src, tgt, lang):
|
277 |
+
for file_pattern in file_patterns:
|
278 |
+
params = { k: v for k, v in [('src', src), ('tgt', tgt), ('lang', lang)] if k in file_pattern}
|
279 |
+
matching = file_pattern.format(**params)
|
280 |
+
|
281 |
+
if isinstance(file_pattern, tuple):
|
282 |
+
pattern, directions = file_pattern
|
283 |
+
if f'{src}-{tgt}' in directions and matching in file_path:
|
284 |
+
return True
|
285 |
+
else:
|
286 |
+
if matching in file_path:
|
287 |
+
return True
|
288 |
+
return False
|
289 |
+
|
290 |
+
def extracted_glob(extracted_folder, file_patterns, src, tgt, lang):
|
291 |
+
def get_matching_pattern(file_pattern):
|
292 |
+
params = {
|
293 |
+
k: v
|
294 |
+
for k, v in [('src', src), ('tgt', tgt), ('lang', lang)]
|
295 |
+
if '{' + k + '}' in file_pattern
|
296 |
+
}
|
297 |
+
file_pattern = re.sub(r'{src:(.*?)}', r'\1' if lang == src else '', file_pattern)
|
298 |
+
file_pattern = re.sub(r'{tgt:(.*?)}', r'\1' if lang == tgt else '', file_pattern)
|
299 |
+
file_pattern = file_pattern.format(**params)
|
300 |
+
return file_pattern
|
301 |
+
for file_pattern in file_patterns:
|
302 |
+
if isinstance(file_pattern, tuple):
|
303 |
+
file_pattern, lang_pairs = file_pattern
|
304 |
+
if f'{src}-{tgt}' not in lang_pairs:
|
305 |
+
continue
|
306 |
+
# print('working on pattern: ', file_pattern, lang_pairs )
|
307 |
+
matching_pattern = get_matching_pattern(file_pattern)
|
308 |
+
if matching_pattern is None:
|
309 |
+
continue
|
310 |
+
glob_patterns = f'{extracted_folder}/{matching_pattern}'
|
311 |
+
# print('glob_patterns: ', glob_patterns)
|
312 |
+
for f in glob.glob(glob_patterns):
|
313 |
+
yield f
|
314 |
+
|
315 |
+
# for debug usage
|
316 |
+
def all_extracted_files(split, src, tgt, extracted_folders, split_urls):
|
317 |
+
def get_url(url):
|
318 |
+
if isinstance(url, tuple):
|
319 |
+
url, downloaded_file = url
|
320 |
+
return url
|
321 |
+
return [
|
322 |
+
f
|
323 |
+
for url in split_urls
|
324 |
+
for f in my_glob(extracted_folders[str(get_url(url))])
|
325 |
+
]
|
326 |
+
|
327 |
+
def concat_files(split, src, tgt, extracted_folders, split_urls, path_patterns, to_folder, debug=False):
|
328 |
+
# if debug:
|
329 |
+
# print('extracted files to be filtered by patterns: ',
|
330 |
+
# '\n\t'.join(sorted(all_extracted_files(split, src, tgt, extracted_folders, split_urls))))
|
331 |
+
for lang in [src, tgt]:
|
332 |
+
to_file = f'{to_folder}/{split}.{src}-{tgt}.{lang}'
|
333 |
+
s_src, s_tgt, s_lang = src.split('_')[0], tgt.split('_')[0], lang.split('_')[0]
|
334 |
+
files = []
|
335 |
+
for url in split_urls:
|
336 |
+
if isinstance(url, tuple):
|
337 |
+
url, downloaded_file = url
|
338 |
+
if str(url) not in extracted_folders:
|
339 |
+
print(f'warning: {url} not in extracted files')
|
340 |
+
for extracted_file in set(
|
341 |
+
extracted_glob(
|
342 |
+
extracted_folders[str(url)], path_patterns,
|
343 |
+
s_src, s_tgt, s_lang)):
|
344 |
+
files.append(extracted_file)
|
345 |
+
if len(files) == 0:
|
346 |
+
print('warning: ', f'No files found for split {to_file}')
|
347 |
+
continue
|
348 |
+
files = sorted(set(files))
|
349 |
+
print(f'concating {len(files)} files into {to_file}')
|
350 |
+
cmd = ['cat'] + [f'"{f}"' for f in files] + [f'>{to_file}']
|
351 |
+
cmd = " ".join(cmd)
|
352 |
+
call(cmd, debug=debug)
|
353 |
+
|
354 |
+
UTILS = os.path.join(pathlib.Path(__file__).parent, 'utils')
|
355 |
+
LID_MODEL = f'{download_to}/lid.176.bin'
|
356 |
+
LID_MULTI = f'{UTILS}/fasttext_multi_filter.py'
|
357 |
+
|
358 |
+
def lid_filter(split, src, tgt, from_folder, to_folder, debug=False):
|
359 |
+
if not os.path.exists(LID_MODEL):
|
360 |
+
call(f'wget -nc https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -O {LID_MODEL}')
|
361 |
+
from_prefix = f'{from_folder}/{split}.{src}-{tgt}'
|
362 |
+
to_prefix = f'{to_folder}/{split}.{src}-{tgt}'
|
363 |
+
if os.path.exists(f'{from_prefix}.{src}') and os.path.exists(f'{from_prefix}.{tgt}'):
|
364 |
+
s_src, s_tgt = src.split('_')[0], tgt.split('_')[0]
|
365 |
+
cmd = (
|
366 |
+
f'python {LID_MULTI} --model {LID_MODEL} --inputs {from_prefix}.{src} {from_prefix}.{tgt} '
|
367 |
+
f'--langs {s_src} {s_tgt} --outputs {to_prefix}.{src} {to_prefix}.{tgt}'
|
368 |
+
)
|
369 |
+
print(f'filtering {from_prefix}')
|
370 |
+
call(cmd, debug=debug)
|
371 |
+
|
372 |
+
def concat_into_splits(dl_dataset, src, tgt, extracted_folders, to_folder, debug):
|
373 |
+
to_folder_tmp = f"{to_folder}_tmp"
|
374 |
+
os.makedirs(to_folder_tmp, exist_ok=True)
|
375 |
+
concat_files('train', src, tgt,
|
376 |
+
extracted_folders,
|
377 |
+
split_urls=dl_dataset.train_urls,
|
378 |
+
path_patterns=dl_dataset.train_files_patterns,
|
379 |
+
to_folder=to_folder_tmp, debug=debug)
|
380 |
+
lid_filter('train', src, tgt, to_folder_tmp, to_folder, debug)
|
381 |
+
|
382 |
+
concat_files('valid', src, tgt,
|
383 |
+
extracted_folders,
|
384 |
+
split_urls=dl_dataset.valid_urls,
|
385 |
+
path_patterns=dl_dataset.valid_files_patterns,
|
386 |
+
to_folder=to_folder, debug=debug)
|
387 |
+
concat_files('test', src, tgt,
|
388 |
+
extracted_folders,
|
389 |
+
split_urls=dl_dataset.test_urls,
|
390 |
+
path_patterns=dl_dataset.test_files_patterns,
|
391 |
+
to_folder=to_folder, debug=debug)
|
392 |
+
|
393 |
+
|
394 |
+
def download_multi(dl_folder, extract_folder, urls, num_processes=8, debug=False):
|
395 |
+
pool = mp.Pool(processes=num_processes)
|
396 |
+
download_f = partial(download_a_url, dl_folder)
|
397 |
+
downloaded_files = pool.imap_unordered(download_f, urls)
|
398 |
+
pool.close()
|
399 |
+
pool.join()
|
400 |
+
|
401 |
+
BLEU_REGEX = re.compile("^BLEU\\S* = (\\S+) ")
|
402 |
+
def run_eval_bleu(cmd):
|
403 |
+
output = check_output(cmd, shell=True, stderr=subprocess.STDOUT).decode("utf-8").strip()
|
404 |
+
print(output)
|
405 |
+
bleu = -1.0
|
406 |
+
for line in output.strip().split('\n'):
|
407 |
+
m = BLEU_REGEX.search(line)
|
408 |
+
if m is not None:
|
409 |
+
bleu = m.groups()[0]
|
410 |
+
bleu = float(bleu)
|
411 |
+
break
|
412 |
+
return bleu
|
413 |
+
|
414 |
+
def check_wmt_test_bleu(raw_folder, wmt_lang_pairs):
|
415 |
+
not_matchings = []
|
416 |
+
for wmt, src_tgts in wmt_lang_pairs:
|
417 |
+
for src_tgt in src_tgts:
|
418 |
+
print(f'checking test bleus for: {src_tgt} at {wmt}')
|
419 |
+
src, tgt = src_tgt.split('-')
|
420 |
+
ssrc, stgt = src[:2], tgt[:2]
|
421 |
+
if os.path.exists(f'{raw_folder}/test.{tgt}-{src}.{src}'):
|
422 |
+
# reversed direction may have different test set
|
423 |
+
test_src = f'{raw_folder}/test.{tgt}-{src}.{src}'
|
424 |
+
else:
|
425 |
+
test_src = f'{raw_folder}/test.{src}-{tgt}.{src}'
|
426 |
+
cmd1 = f'cat {test_src} | sacrebleu -t "{wmt}" -l {stgt}-{ssrc}; [ $? -eq 0 ] || echo ""'
|
427 |
+
test_tgt = f'{raw_folder}/test.{src}-{tgt}.{tgt}'
|
428 |
+
cmd2 = f'cat {test_tgt} | sacrebleu -t "{wmt}" -l {ssrc}-{stgt}; [ $? -eq 0 ] || echo ""'
|
429 |
+
bleu1 = run_eval_bleu(cmd1)
|
430 |
+
if bleu1 != 100.0:
|
431 |
+
not_matchings.append(f'{wmt}:{src_tgt} source side not matching: {test_src}')
|
432 |
+
bleu2 = run_eval_bleu(cmd2)
|
433 |
+
if bleu2 != 100.0:
|
434 |
+
not_matchings.append(f'{wmt}:{src_tgt} target side not matching: {test_tgt}')
|
435 |
+
return not_matchings
|
436 |
+
|
437 |
+
def download_and_extract(
|
438 |
+
to_folder, lang_pairs, dl_dataset,
|
439 |
+
to_manually_download_urls,
|
440 |
+
completed_urls={}, completed_extraction={},
|
441 |
+
debug=False):
|
442 |
+
|
443 |
+
dl_folder = f'{to_folder}/downloads'
|
444 |
+
extract_folder = f'{to_folder}/extracted'
|
445 |
+
raw_folder = f'{to_folder}/raw'
|
446 |
+
lid_filtered = f'{to_folder}/lid_filtered'
|
447 |
+
|
448 |
+
os.makedirs(extract_folder, exist_ok=True)
|
449 |
+
os.makedirs(raw_folder, exist_ok=True)
|
450 |
+
os.makedirs(lid_filtered, exist_ok=True)
|
451 |
+
|
452 |
+
|
453 |
+
to_be_manually_dowloaded = check_need_manual_downalod(dl_folder, to_manually_download_urls)
|
454 |
+
|
455 |
+
completed_urls = download_dataset(
|
456 |
+
dl_folder, dl_dataset, completed_urls)
|
457 |
+
if debug:
|
458 |
+
print('completed urls: ', completed_urls)
|
459 |
+
|
460 |
+
|
461 |
+
extracted_folders = extract_all_files(
|
462 |
+
completed_urls,
|
463 |
+
extract_folder=extract_folder,
|
464 |
+
completed_extraction=completed_extraction,
|
465 |
+
debug=debug)
|
466 |
+
if debug:
|
467 |
+
print('download files have been extracted to folders: ', extracted_folders)
|
468 |
+
|
469 |
+
converted_files = convert_files_if_needed(extracted_folders, debug=False)
|
470 |
+
for src_tgt in lang_pairs:
|
471 |
+
print(f'working on {dl_dataset.name}: {src_tgt}')
|
472 |
+
src, tgt = src_tgt.split('-')
|
473 |
+
concat_into_splits(dl_dataset,
|
474 |
+
src=src, tgt=tgt,
|
475 |
+
extracted_folders=extracted_folders,
|
476 |
+
to_folder=raw_folder, debug=debug)
|
477 |
+
print('completed data into: ', raw_folder)
|
478 |
+
|
479 |
+
def download_czang16(download_to, username=None):
|
480 |
+
wgets = [
|
481 |
+
f'wget --user={username} --password=czeng -P {download_to} http://ufallab.ms.mff.cuni.cz/~bojar/czeng16-data/data-plaintext-format.{i}.tar'
|
482 |
+
for i in range(10)]
|
483 |
+
cmds = []
|
484 |
+
for i, cmd in enumerate(wgets):
|
485 |
+
filename = f'{download_to}/data-plaintext-format.{i}.tar'
|
486 |
+
if os.path.exists(filename):
|
487 |
+
print(f'{filename} has already been downloaded; so skip')
|
488 |
+
continue
|
489 |
+
cmds.append(cmd)
|
490 |
+
if cmds and username is None:
|
491 |
+
raise ValueError('No czeng username is given; please register at http://ufal.mff.cuni.cz/czeng/czeng16 to obtain username to download')
|
492 |
+
for cmd in cmds:
|
493 |
+
call(cmd)
|
494 |
+
print('done with downloading czeng1.6')
|
495 |
+
|
496 |
+
def download_czeng17_script(download_to, extract_folder, debug=False):
|
497 |
+
url = 'http://ufal.mff.cuni.cz/czeng/download.php?f=convert_czeng16_to_17.pl.zip'
|
498 |
+
filename = f'{download_to}/convert_czeng16_to_17.pl.zip'
|
499 |
+
extract_to = f'{extract_folder}/{get_extract_name(filename)}'
|
500 |
+
script_path = f'{extract_to}/convert_czeng16_to_17.pl'
|
501 |
+
|
502 |
+
if not os.path.exists(script_path):
|
503 |
+
wget.download(url, filename, bar=bar_custom)
|
504 |
+
extract_to = extract_file(f'{download_to}/convert_czeng16_to_17.pl.zip', extract_folder, get_extract_name=get_extract_name, debug=debug)
|
505 |
+
return script_path
|
506 |
+
|
507 |
+
czeng17_script_path = ""
|
508 |
+
def convert2czeng17(file, debug):
|
509 |
+
en_file = f'{file}.en'
|
510 |
+
cs_file = f'{file}.cs'
|
511 |
+
|
512 |
+
if not os.path.exists(en_file) or not os.path.exists(cs_file):
|
513 |
+
cs_cmd = f'cat {file} | perl {czeng17_script_path} | cut -f3 > {cs_file}'
|
514 |
+
en_cmd = f'cat {file} | perl {czeng17_script_path} | cut -f4 > {en_file}'
|
515 |
+
call(cs_cmd, debug)
|
516 |
+
call(en_cmd, debug)
|
517 |
+
else:
|
518 |
+
print(f'already extracted: {en_file} and {cs_file}')
|
519 |
+
return file
|
520 |
+
|
521 |
+
def extract_czeng17(extract_folder, debug=False):
|
522 |
+
url = 'http://ufal.mff.cuni.cz/czeng/download.php?f=convert_czeng16_to_17.pl.zip'
|
523 |
+
filename = f'{download_to}/convert_czeng16_to_17.pl.zip'
|
524 |
+
extract_to = f'{extract_folder}/{get_extract_name(filename)}'
|
525 |
+
script_path = f'{extract_to}/convert_czeng16_to_17.pl'
|
526 |
+
|
527 |
+
if not os.path.exists(script_path):
|
528 |
+
wget.download(url, filename, bar=bar_custom)
|
529 |
+
extract_to = extract_file(f'{download_to}/convert_czeng16_to_17.pl.zip', extract_folder, get_extract_name=get_extract_name, debug=debug)
|
530 |
+
return script_path
|
531 |
+
|
532 |
+
#########
|
533 |
+
# definitions of wmt data sources
|
534 |
+
# for es-en
|
535 |
+
# Punctuation in the official test sets will be encoded with ASCII characters (not complex Unicode characters) as much as possible. You may want to normalize your system's output before submission. You are able able to use a rawer version of the test sets that does not have this normalization.
|
536 |
+
# script to normalize punctuation: http://www.statmt.org/wmt11/normalize-punctuation.perl
|
537 |
+
wmt13_es_en = DLDataset(
|
538 |
+
name='wmt13_es-en',
|
539 |
+
train_urls=[
|
540 |
+
'http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz',
|
541 |
+
'http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz',
|
542 |
+
'http://www.statmt.org/wmt13/training-parallel-un.tgz',
|
543 |
+
'http://www.statmt.org/wmt13/training-parallel-nc-v8.tgz',
|
544 |
+
],
|
545 |
+
valid_urls=[
|
546 |
+
('http://www.statmt.org/wmt13/dev.tgz', 'wmt13_dev.tgz')
|
547 |
+
],
|
548 |
+
test_urls=[
|
549 |
+
('http://www.statmt.org/wmt13/test.tgz', 'wmt13_test.tgz')
|
550 |
+
],
|
551 |
+
train_files_patterns=[
|
552 |
+
('*/europarl-v7.{src}-{tgt}.{lang}', ['es-en']),
|
553 |
+
('*commoncrawl.{src}-{tgt}.{lang}', ['es-en']),
|
554 |
+
('*/news-commentary-v8.{src}-{tgt}.{lang}', ['es-en']),
|
555 |
+
('un/*undoc.2000.{src}-{tgt}.{lang}', ['es-en']),
|
556 |
+
] ,
|
557 |
+
valid_files_patterns=[
|
558 |
+
('dev/newstest2012.{lang}', ['es-en'])
|
559 |
+
],
|
560 |
+
test_files_patterns=[
|
561 |
+
('test/newstest*.{lang}', ['es-en'])
|
562 |
+
],
|
563 |
+
)
|
564 |
+
|
565 |
+
wmt14_de_fr_en = DLDataset(
|
566 |
+
name='wmt14_de_fr_en',
|
567 |
+
train_urls=[
|
568 |
+
'http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz',
|
569 |
+
'http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz',
|
570 |
+
'http://www.statmt.org/wmt13/training-parallel-un.tgz',
|
571 |
+
'http://www.statmt.org/wmt14/training-parallel-nc-v9.tgz',
|
572 |
+
('http://www.statmt.org/wmt10/training-giga-fren.tar', 'training-giga-fren.gz.tar'), #it is actuall a gz.tar
|
573 |
+
],
|
574 |
+
valid_urls=[
|
575 |
+
('http://www.statmt.org/wmt14/dev.tgz', 'wmt14_dev.tgz'),
|
576 |
+
],
|
577 |
+
test_urls=[
|
578 |
+
('http://www.statmt.org/wmt14/test-full.tgz', 'wmt14_test_full.tgz'), # cleaned test sets
|
579 |
+
],
|
580 |
+
train_files_patterns=[
|
581 |
+
('*/europarl-v7.{src}-{tgt}.{lang}', ['fr-en', 'de-en']),
|
582 |
+
('*commoncrawl.{src}-{tgt}.{lang}', ['fr-en', 'de-en']),
|
583 |
+
('*/*news-commentary-v9.{src}-{tgt}.{lang}', ['fr-en', 'de-en']),
|
584 |
+
('un/undoc.2000.{src}-{tgt}.{lang}', ['fr-en']),
|
585 |
+
('*giga-{src}{tgt}*{lang}', ['fr-en'])
|
586 |
+
],
|
587 |
+
valid_files_patterns=[
|
588 |
+
('dev/newstest2013.{lang}', ['fr-en', 'de-en'])
|
589 |
+
],
|
590 |
+
test_files_patterns=[
|
591 |
+
('test-full/newstest*{src}{tgt}-{src:src}{tgt:ref}.{lang}', ['en-de', 'de-en', 'fr-en', 'en-fr']),
|
592 |
+
],
|
593 |
+
)
|
594 |
+
|
595 |
+
# pip install git+https://github.com/amake/tmx2corpus.git
|
596 |
+
wmt16_ro_en = DLDataset(
|
597 |
+
name='wmt16_ro-en',
|
598 |
+
train_urls=[
|
599 |
+
('http://data.statmt.org/wmt16/translation-task/training-parallel-ep-v8.tgz', 'wmt16_training-parallel-ep-v8.tgz'),
|
600 |
+
('http://opus.nlpl.eu/download.php?f=SETIMES/v2/tmx/en-ro.tmx.gz', 'en-ro.tmx.gz'),
|
601 |
+
],
|
602 |
+
valid_urls=[
|
603 |
+
('http://data.statmt.org/wmt16/translation-task/dev-romanian-updated.tgz', 'wmt16_dev.tgz')
|
604 |
+
],
|
605 |
+
test_urls=[
|
606 |
+
('http://data.statmt.org/wmt16/translation-task/test.tgz', 'wmt16_test.tgz')
|
607 |
+
],
|
608 |
+
train_files_patterns=[
|
609 |
+
('*/*europarl-v8.{src}-{tgt}.{lang}', ['ro-en']),
|
610 |
+
('bitext.{lang}', ['ro-en']) #setimes from tmux
|
611 |
+
] ,
|
612 |
+
valid_files_patterns=[
|
613 |
+
('dev/newsdev2016*{src}{tgt}*.{lang}', ['ro-en', 'ro-en'])
|
614 |
+
],
|
615 |
+
test_files_patterns=[
|
616 |
+
('test/newstest*{src}{tgt}*.{lang}', ['ro-en', 'en-ro'])
|
617 |
+
],
|
618 |
+
)
|
619 |
+
|
620 |
+
cwmt_wmt_instruction = 'cwmt download instruction at: http://nlp.nju.edu.cn/cwmt-wmt'
|
621 |
+
wmt17_fi_lv_tr_zh_en_manual_downloads = [
|
622 |
+
# fake urls to have unique keys for the data
|
623 |
+
( ('http://nlp.nju.edu.cn/cwmt-wmt/CASIA2015.zip', 'CASIA2015.zip'), cwmt_wmt_instruction),
|
624 |
+
( ('http://nlp.nju.edu.cn/cwmt-wmt/CASICT2011.zip', 'CASICT2011.zip'), cwmt_wmt_instruction),
|
625 |
+
( ('http://nlp.nju.edu.cn/cwmt-wmt/CASICT2015.zip', 'CASICT2015.zip'), cwmt_wmt_instruction),
|
626 |
+
( ('http://nlp.nju.edu.cn/cwmt-wmt/Datum2015.zip', 'Datum2015.zip'), cwmt_wmt_instruction),
|
627 |
+
( ('http://nlp.nju.edu.cn/cwmt-wmt/Datum2017.zip', 'Datum2017.zip'), cwmt_wmt_instruction),
|
628 |
+
( ('http://nlp.nju.edu.cn/cwmt-wmt/NEU2017.zip', 'NEU2017.zip'), cwmt_wmt_instruction),
|
629 |
+
]
|
630 |
+
wmt17_fi_lv_tr_zh_en = DLDataset(
|
631 |
+
name='wmt17_fi_lv_tr_zh_en',
|
632 |
+
train_urls=[
|
633 |
+
('http://data.statmt.org/wmt17/translation-task/training-parallel-ep-v8.tgz', 'wmt17_training-parallel-ep-v8.tgz'),
|
634 |
+
'http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz',
|
635 |
+
'http://www.statmt.org/wmt15/wiki-titles.tgz',
|
636 |
+
('http://opus.nlpl.eu/download.php?f=SETIMES/v2/tmx/en-tr.tmx.gz', 'en-tr.tmx.gz'),
|
637 |
+
('http://data.statmt.org/wmt17/translation-task/rapid2016.tgz', 'wmt17_rapid2016.tgz'),
|
638 |
+
'http://data.statmt.org/wmt17/translation-task/leta.v1.tgz',
|
639 |
+
'http://data.statmt.org/wmt17/translation-task/dcep.lv-en.v1.tgz',
|
640 |
+
'http://data.statmt.org/wmt17/translation-task/books.lv-en.v1.tgz',
|
641 |
+
(('https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.00',
|
642 |
+
'https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.01',), 'UNv1.0.en-zh.tar.gz'),
|
643 |
+
#manually download files:
|
644 |
+
('http://nlp.nju.edu.cn/cwmt-wmt/CASIA2015.zip', 'CASIA2015.zip'),
|
645 |
+
('http://nlp.nju.edu.cn/cwmt-wmt/CASICT2011.zip', 'CASICT2011.zip'),
|
646 |
+
('http://nlp.nju.edu.cn/cwmt-wmt/CASICT2015.zip', 'CASICT2015.zip'),
|
647 |
+
('http://nlp.nju.edu.cn/cwmt-wmt/Datum2015.zip', 'Datum2015.zip'),
|
648 |
+
('http://nlp.nju.edu.cn/cwmt-wmt/Datum2017.zip', 'Datum2017.zip'),
|
649 |
+
('http://nlp.nju.edu.cn/cwmt-wmt/NEU2017.zip', 'NEU2017.zip'),
|
650 |
+
],
|
651 |
+
valid_urls=[
|
652 |
+
('http://data.statmt.org/wmt17/translation-task/dev.tgz', 'wmt17_dev.tgz'),
|
653 |
+
],
|
654 |
+
test_urls=[
|
655 |
+
#NEW: Improved translations for zh test sets
|
656 |
+
        ('http://data.statmt.org/wmt17/translation-task/test-update-1.tgz', 'wmt17_test_zh_en.tgz'),
        ('http://data.statmt.org/wmt17/translation-task/test.tgz', 'wmt17_test_others.tgz')
    ],
    train_files_patterns=[
        ('casict*/cas*{src:ch}{tgt:en}.txt', ['zh-en', 'zh-en']),
        ('casia*/cas*{src:ch}{tgt:en}.txt', ['zh-en', 'zh-en']),
        ('dataum*/Book*{src:cn}{tgt:en}.txt', ['zh-en', 'zh-en']),
        ('neu*/NEU*{src:cn}{tgt:en}.txt', ['zh-en', 'zh-en']),
        ('*/*UNv1.0.en-zh.{src:zh}{tgt:en}', ['zh-en']),
        ('training/*news-commentary-v12.{src}-{tgt}.{lang}', ['zh-en', ]),

        ('*/*europarl-v8.{src}-{tgt}.{lang}', ['fi-en', 'lv-en']),
        ('wiki/fi-en/titles.{src}-{tgt}.{lang}', ['fi-en', ]),
        ('rapid2016.{tgt}-{src}.{lang}', ['fi-en', 'lv-en']),
        ('*/leta.{lang}', ['lv-en']),
        ('*/dcep.{lang}', ['lv-en']),
        ('*/farewell.{lang}', ['lv-en']),
        ('bitext.{lang}', ['tr-en']),
    ],
    valid_files_patterns=[
        ('dev/newsdev2017*{src}{tgt}-{src:src}{tgt:ref}.{lang}',
            [
                'fi-en', 'lv-en', 'tr-en', 'zh-en',
                'en-fi', 'en-lv', 'en-tr', 'en-zh'
            ]),
        ('dev/newstest2016*{src}{tgt}-{src:src}{tgt:ref}.{lang}',
            [
                'fi-en', 'tr-en',
                'en-fi', 'en-tr',
            ]),
    ],
    test_files_patterns=[
        ('test/newstest2017-{src}{tgt}-{src:src}{tgt:ref}.{lang}',
            [
                'fi-en', 'lv-en', 'tr-en',
                'en-fi', 'en-lv', 'en-tr',
            ]),
        ('newstest2017-{src}{tgt}-{src:src}{tgt:ref}.{lang}',
            [
                'zh-en',
                'en-zh'
            ]),
    ],
)

czeng_instruction = 'download instruction at: http://ufal.mff.cuni.cz/czeng/czeng16'
#alternative: use the prepared data but detokenize it?
wmt18_cs_et_en_manual_downloads = [
    #for cs, need to register and download; Register and download CzEng 1.6.
    #Better results can be obtained by using a subset of sentences, released under a new version name CzEng 1.7.
    # ((f'http://ufallab.ms.mff.cuni.cz/~bojar/czeng16-data/data-plaintext-format.{i}.tar',
    #   f'data-plaintext-format.{i}.tar'), czeng_instruction)
    # for i in range(10)
]

wmt18_cs_et_en = DLDataset(
    name='wmt18_cs_et_en',
    train_urls=[
        'http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz',
        'http://data.statmt.org/wmt18/translation-task/training-parallel-ep-v8.tgz',
        'https://s3.amazonaws.com/web-language-models/paracrawl/release1/paracrawl-release1.en-cs.zipporah0-dedup-clean.tgz',
        'https://s3.amazonaws.com/web-language-models/paracrawl/release1/paracrawl-release1.en-et.zipporah0-dedup-clean.tgz',
        'http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz',
        'http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz',
        ('http://data.statmt.org/wmt18/translation-task/rapid2016.tgz', 'wmt18_rapid2016.tgz'),
        # (tuple(
        #     (f'http://ufallab.ms.mff.cuni.cz/~bojar/czeng16-data/data-plaintext-format.{i}.tar',
        #      f'data-plaintext-format.{i}.tar')
        #     for i in range(10)
        # ),
        # 'czeng16_data_plaintext.gz.tar'),
    ],
    valid_urls=[
        ('http://data.statmt.org/wmt18/translation-task/dev.tgz', 'wmt18_dev.tgz'),
    ],
    test_urls=[
        ('http://data.statmt.org/wmt18/translation-task/test.tgz', 'wmt18_test.tgz'),
    ],
    train_files_patterns=[
        # ('*/*europarl-v7.{src}-{tgt}.{lang}', ['cs-en']),
        ('*/*europarl-v8.{src}-{tgt}.{lang}', ['et-en']),
        # ('*paracrawl-release1.{tgt}-{src}.zipporah0-dedup-clean.{lang}', ['cs-en', 'et-en']),
        ('*paracrawl-release1.{tgt}-{src}.zipporah0-dedup-clean.{lang}', ['et-en']),
        # ('*commoncrawl.{src}-{tgt}.{lang}', ['cs-en']),
        # ('*/news-commentary-v13.{src}-{tgt}.{lang}', ['cs-en']),
        # ('data.plaintext-format/*train.{lang}', ['cs-en']),
        ('rapid2016.{tgt}-{src}.{lang}', ['et-en']),
    ],
    valid_files_patterns=[
        ('dev/newsdev2018*{src}{tgt}-{src:src}{tgt:ref}.{lang}', ['et-en']),
        # ('dev/newstest2017*{src}{tgt}-{src:src}{tgt:ref}.{lang}', ['cs-en'])
    ],
    test_files_patterns=[
        ('test/newstest2018-{src}{tgt}-{src:src}{tgt:ref}.{lang}',
            # ['cs-en', 'et-en']),
            ['et-en']),
    ]
)

ru_en_yandex_instruction = 'Yandex Corpus download instruction at: https://translate.yandex.ru/corpus?lang=en'
wmt19_ru_gu_kk_lt_manual_downloads = [
    (('https://translate.yandex.ru/corpus?lang=en', 'wmt19_1mcorpus.zip'), ru_en_yandex_instruction)
]
wmt19_ru_gu_kk_lt = DLDataset(
    name='wmt19_ru_gu_kk_lt',
    train_urls=[
        'http://www.statmt.org/europarl/v9/training/europarl-v9.lt-en.tsv.gz',
        'https://s3.amazonaws.com/web-language-models/paracrawl/release3/en-lt.bicleaner07.tmx.gz',
        'https://s3.amazonaws.com/web-language-models/paracrawl/release1/paracrawl-release1.en-ru.zipporah0-dedup-clean.tgz',
        'http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz',
        'http://data.statmt.org/news-commentary/v14/training/news-commentary-v14-wmt19.en-kk.tsv.gz',
        'http://data.statmt.org/news-commentary/v14/training/news-commentary-v14.en-ru.tsv.gz',
        'http://data.statmt.org/wikititles/v1/wikititles-v1.kk-en.tsv.gz',
        'http://data.statmt.org/wikititles/v1/wikititles-v1.ru-en.tsv.gz',
        'http://data.statmt.org/wikititles/v1/wikititles-v1.kk-en.tsv.gz',
        'http://data.statmt.org/wikititles/v1/wikititles-v1.lt-en.tsv.gz',
        'http://data.statmt.org/wikititles/v1/wikititles-v1.gu-en.tsv.gz',
        (('https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.00',
          'https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.01',
          'https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.02',),
         'wmt19_UNv1.0.en-ru.tar.gz'),
        'https://tilde-model.s3-eu-west-1.amazonaws.com/rapid2016.en-lt.tmx.zip',
        ('https://translate.yandex.ru/corpus?lang=en', 'wmt19_1mcorpus.zip'),
    ],
    valid_urls=[
        ('http://data.statmt.org/wmt19/translation-task/dev.tgz', 'wmt19_dev.tgz'),
    ],
    test_urls=[
        ('http://data.statmt.org/wmt19/translation-task/test.tgz', 'wmt19_test.tgz'),
    ],
    train_files_patterns=[
        ('*europarl-v9.{src}-{tgt}.tsv.{lang}', ['lt-en']),
        #paracrawl
        ('*paracrawl-release1.{tgt}-{src}.zipporah0-dedup-clean.{lang}', ['ru-en']),
        ('bitext.{lang}', ['lt-en', ]),
        ('*commoncrawl.{src}-{tgt}.{lang}', ['ru-en', ]),
        ('*news-commentary-v14-wmt19.{tgt}-{src}.tsv.{lang}', ['kk-en', ]),
        ('*news-commentary-v14.{tgt}-{src}.tsv.{lang}', ['ru-en']),
        #yandex
        ('corpus.{tgt}_{src}.1m.{lang}', ['ru-en']),
        ('wikititles_v1_wikititles-v1.{src}-{tgt}.tsv.{lang}', ['ru-en', 'kk-en', 'lt-en', 'gu-en']),
        ('*/UNv1.0.{tgt}-{src}.{lang}', ['ru-en']),
        #rapid
        ('bitext.{lang}', ['lt-en'])
    ],
    valid_files_patterns=[
        ('dev/newsdev2019*{src}{tgt}-{src:src}{tgt:ref}.{lang}', ['gu-en', 'kk-en', 'lt-en']),
        ('dev/newstest2018*{src}{tgt}-{src:src}{tgt:ref}.{lang}', ['ru-en']),
    ],
    test_files_patterns=[
        ('sgm/newstest2019-{src}{tgt}-{src:src}{tgt:ref}.{lang}',
            ['ru-en', 'gu-en', 'kk-en', 'lt-en', 'en-ru', 'en-gu', 'en-kk', 'en-lt']),
    ]
)


#########

if __name__ == "__main__":
    # speed up the downloads with multiple processing
    dl_folder = f'{to_data_path}/downloads'
    extract_folder = f'{to_data_path}/extracted'

    urls = [
        url
        for dataset in [wmt13_es_en, wmt14_de_fr_en, wmt16_ro_en, wmt18_cs_et_en, wmt19_ru_gu_kk_lt]
        for urls in [dataset.train_urls, dataset.valid_urls, dataset.test_urls]
        for url in urls
    ]
    urls = set(urls)
    download_multi(dl_folder, extract_folder, urls, num_processes=8, debug=True)

    # check manual downloads
    to_manually_download_urls = (
        wmt17_fi_lv_tr_zh_en_manual_downloads + wmt18_cs_et_en_manual_downloads + wmt19_ru_gu_kk_lt_manual_downloads
    )
    to_be_manually_dowloaded = check_need_manual_downalod(dl_folder, to_manually_download_urls)
    if len(to_be_manually_dowloaded) > 0:
        print('Missing files that need to be downloaded manually; stop the process now.')
        exit(-1)

    completed_urls = {}
    completed_extraction = {}
    def work_on_wmt(directions, wmt_data):
        download_and_extract(
            to_data_path,
            directions,
            wmt_data,
            to_manually_download_urls=to_manually_download_urls,
            completed_urls=completed_urls, completed_extraction=completed_extraction, debug=True)

    work_on_wmt(
        ['es_XX-en_XX'],
        wmt13_es_en,)
    work_on_wmt(
        [
            'fr_XX-en_XX', 'en_XX-fr_XX',
            # 'en_XX-de_DE', 'de_DE-en_XX',
        ],
        wmt14_de_fr_en,)
    work_on_wmt(
        ['ro_RO-en_XX', 'en_XX-ro_XX'],
        wmt16_ro_en,)
    work_on_wmt(
        [
            # 'zh_CN-en_XX',
            'lv_LV-en_XX', 'fi_FI-en_XX', 'tr_TR-en_XX',
            #in case the reversed directions have different train/valid/test data
            # 'en_XX-zh_CN',
            'en_XX-lv_LV', 'en_XX-fi_FI', 'en_XX-tr_TR',
        ],
        wmt17_fi_lv_tr_zh_en, )
    # czeng17_script_path = download_czeng17_script(download_to, extract_to, debug=False)
    # cz_username = None
    work_on_wmt(
        [
            # 'cs_CZ-en_XX',
            'et_EE-en_XX'],
        wmt18_cs_et_en,)
    work_on_wmt(
        [
            # 'ru_RU-en_XX', 'en_XX-ru_RU',
            'gu_IN-en_XX', 'kk_KZ-en_XX', 'lt_LT-en_XX',
            #in case the reversed directions have different train/valid/test data
            'en_XX-gu_IN', 'en_XX-kk_KZ', 'en_XX-lt_LT'
        ],
        wmt19_ru_gu_kk_lt,)

    not_matching = check_wmt_test_bleu(
        f'{to_data_path}/raw',
        [
            ('wmt13', ['es_XX-en_XX']),
            ('wmt14/full', ['fr_XX-en_XX', ]),
            ('wmt16', ['ro_RO-en_XX', ]),
            # ('wmt17/improved', ['zh_CN-en_XX']),
            ('wmt17', ['lv_LV-en_XX', 'fi_FI-en_XX', 'tr_TR-en_XX']),
            ('wmt18', ['cs_CZ-en_XX', 'et_EE-en_XX']),
            ('wmt19', ['gu_IN-en_XX', 'kk_KZ-en_XX', 'lt_LT-en_XX']),
            #'ru_RU-en_XX',
        ]
    )
    if len(not_matching) > 0:
        print('the following datasets do not have matching test datasets:\n\t', '\n\t'.join(not_matching))
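
A usage sketch for the script above (it relies on to_data_path and the helper functions defined earlier in the same file, and it stops early if the manual downloads such as the CzEng and Yandex corpora are missing; the working directory below is illustrative):
    cd fairseq/examples/multilingual/data_scripts
    python download_wmt19_and_before.py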
fairseq/examples/multilingual/data_scripts/download_wmt20.sh
ADDED
@@ -0,0 +1,547 @@
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

if [ -z $WORKDIR_ROOT ] ;
then
        echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exiting..."
        exit
fi



set -x -e

# TODO update the workdir and dest dir name
# put fasttext model
WORKDIR=$WORKDIR_ROOT
# put intermediate files
TMP_DIR=$WORKDIR_ROOT/tmp/tmp_wmt20_lowres_download
# output {train,valid,test} files to dest
DEST=$WORKDIR_ROOT/ML50/raw

UTILS=$PWD/utils

# per dataset locations
COMMONCRAWL_DIR=$TMP_DIR/commoncrawl
YANDEX_CORPUS=$WORKDIR_ROOT/wmt20/official/ru/yandex/1mcorpus.zip
# unzipped
CZENG_CORPUS=$WORKDIR_ROOT/wmt20/official/cs/czeng/czeng20-train
CCMT_DIR=$WORKDIR_ROOT/wmt20/official/zh/ccmt/parallel

download_and_select() {
  SUBFOLDER=$1
  URL=$2
  UNCOMPRESS_CMD=$3
  LANG=$4
  INPUT_FILEPATH=$5
  if [[ $# -gt 5 ]]; then
    LANG_COL=$6
    EN_COL=$7
  fi

  mkdir -p $SUBFOLDER
  cd $SUBFOLDER
  wget -nc --content-disposition $URL
  $UNCOMPRESS_CMD

  if [[ $# -gt 5 ]]; then
    cut -f$LANG_COL $INPUT_FILEPATH > $INPUT_FILEPATH.$LANG
    cut -f$EN_COL $INPUT_FILEPATH > $INPUT_FILEPATH.en
  fi
  cd ..

  ln -sf $SUBFOLDER/$INPUT_FILEPATH.$LANG $SUBFOLDER.$LANG
  ln -sf $SUBFOLDER/$INPUT_FILEPATH.en $SUBFOLDER.en
}

prepare_lid() {
  pip install fasttext

  # TODO specify global workdir
  MODEL=$WORKDIR/fasttext/lid.176.bin
  LID_MULTI=$UTILS/fasttext_multi_filter.py

  if [ ! -f "$MODEL" ]; then
    echo "downloading fasttext lid model..."
    mkdir -p $WORKDIR/fasttext
    wget -nc https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -O $MODEL
  fi
}

prepare_moses() {
  pushd $UTILS
  echo 'Cloning Moses github repository (for tokenization scripts)...'
  git clone https://github.com/moses-smt/mosesdecoder.git
  popd
}

lid_filter() {
  # TODO specify global workdir
  MODEL=$WORKDIR/fasttext/lid.176.bin
  LID_MULTI=$UTILS/fasttext_multi_filter.py

  prepare_lid

  SRC=$1
  SRC_FILE=$2
  SRC_OUTPUT=$3
  TGT=$4
  TGT_FILE=$5
  TGT_OUTPUT=$6
  python $LID_MULTI --model $MODEL --inputs $SRC_FILE $TGT_FILE --langs $SRC $TGT --outputs $SRC_OUTPUT $TGT_OUTPUT
}

prepare_ja_ted() {
  mkdir -p ted
  cd ted

  wget -nc https://wit3.fbk.eu/archive/2017-01-trnted//texts/en/ja/en-ja.tgz
  tar -zxvf en-ja.tgz
  cat en-ja/train.tags.en-ja.en | grep -v -P "^[ ]*\<" | sed 's/^[ \t]*//g' | sed 's/[ \t]*$//g' > en-ja/train.en-ja.en
  cat en-ja/train.tags.en-ja.ja | grep -v -P "^[ ]*\<" | sed 's/^[ \t]*//g' | sed 's/[ \t]*$//g' > en-ja/train.en-ja.ja

  cd ..
  ln -sf ted/en-ja/train.en-ja.ja ted.ja
  ln -sf ted/en-ja/train.en-ja.en ted.en
}

prepare_ja() {
  OUTPUT_DIR=$TMP_DIR/ja
  mkdir -p $OUTPUT_DIR
  cd $OUTPUT_DIR

  download_and_select paracrawl "http://www.kecl.ntt.co.jp/icl/lirg/jparacrawl/release/2.0/bitext/en-ja.tar.gz" "tar -zxvf en-ja.tar.gz" ja en-ja/en-ja.bicleaner05.txt 4 3 &
  download_and_select newscommentary "http://data.statmt.org/news-commentary/v15/training/news-commentary-v15.en-ja.tsv.gz" "gunzip -f news-commentary-v15.en-ja.tsv.gz" ja news-commentary-v15.en-ja.tsv 2 1 &
  download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.ja-en.tsv.gz" "gunzip -f wikititles-v2.ja-en.tsv.gz" ja wikititles-v2.ja-en.tsv 1 2 &
  download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.en-ja.langid.tsv.gz" "gunzip -f WikiMatrix.v1.en-ja.langid.tsv.gz" ja WikiMatrix.v1.en-ja.langid.tsv 3 2 &
  download_and_select subtitle "https://nlp.stanford.edu/projects/jesc/data/split.tar.gz" "tar -zxvf split.tar.gz" ja split/train 2 1 &
  download_and_select kftt "http://www.phontron.com/kftt/download/kftt-data-1.0.tar.gz" "tar -zxvf kftt-data-1.0.tar.gz" ja kftt-data-1.0/data/orig/kyoto-train &

  prepare_ja_ted &

  # ted data needs to

  wait

  # remove previous results
  rm -f all.??
  find ./ -maxdepth 1 -name "*.ja" | sort -V | xargs cat > all.ja
  find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
  lid_filter ja all.ja $DEST/train.ja_XX-en_XX.ja_XX en all.en $DEST/train.ja_XX-en_XX.en_XX
}

prepare_ta() {
  OUTPUT_DIR=$TMP_DIR/ta
  mkdir -p $OUTPUT_DIR
  cd $OUTPUT_DIR

  download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.ta-en.tsv.gz" "gunzip -f wikititles-v2.ta-en.tsv.gz" ta wikititles-v2.ta-en.tsv 1 2 &
  download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.en-ta.langid.tsv.gz" "gunzip -f WikiMatrix.v1.en-ta.langid.tsv.gz" ta WikiMatrix.v1.en-ta.langid.tsv 3 2 &
  download_and_select pmindia "http://data.statmt.org/pmindia/v1/parallel/pmindia.v1.ta-en.tsv" "" ta pmindia.v1.ta-en.tsv 2 1 &
  download_and_select tanzil "https://object.pouta.csc.fi/OPUS-Tanzil/v1/moses/en-ta.txt.zip" "unzip en-ta.txt.zip" ta Tanzil.en-ta &
  download_and_select pib "http://preon.iiit.ac.in/~jerin/resources/datasets/pib-v0.tar" "tar -xvf pib-v0.tar" ta pib/en-ta/train &
  download_and_select mkb "http://preon.iiit.ac.in/~jerin/resources/datasets/mkb-v0.tar" "tar -xvf mkb-v0.tar" ta mkb/en-ta/mkb &
  download_and_select ufal "http://ufal.mff.cuni.cz/~ramasamy/parallel/data/v2/en-ta-parallel-v2.tar.gz" "tar -zxvf en-ta-parallel-v2.tar.gz" ta en-ta-parallel-v2/corpus.bcn.train &

  wait

  # need special handling for nlpc
  mkdir -p nlpc
  cd nlpc
  wget -nc https://raw.githubusercontent.com/nlpc-uom/English-Tamil-Parallel-Corpus/master/En-Ta%20Corpus/En-Ta%20English.txt
  wget -nc https://github.com/nlpc-uom/English-Tamil-Parallel-Corpus/raw/master/En-Ta%20Corpus/En-Ta%20Tamil.txt
  tail -n +4 "En-Ta English.txt" > en-ta.en
  tail -n +4 "En-Ta Tamil.txt" > en-ta.ta
  cd ..
  ln -sf nlpc/en-ta.en nlpc.en
  ln -sf nlpc/en-ta.ta nlpc.ta

  # remove previous results
  rm -f all.??
  find ./ -maxdepth 1 -name "*.ta" | sort -V | xargs cat > all.ta
  find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
  lid_filter ta all.ta $DEST/train.ta_IN-en_XX.ta_IN en all.en $DEST/train.ta_IN-en_XX.en_XX
}

prepare_iu() {
  OUTPUT_DIR=$TMP_DIR/iu
  mkdir -p $OUTPUT_DIR
  cd $OUTPUT_DIR

  download_and_select nh "https://nrc-digital-repository.canada.ca/eng/view/dataset/?id=c7e34fa7-7629-43c2-bd6d-19b32bf64f60" "tar -zxvf Nunavut-Hansard-Inuktitut-English-Parallel-Corpus-3.0.1.tgz" iu Nunavut-Hansard-Inuktitut-English-Parallel-Corpus-3.0/NunavutHansard > /dev/null &
  download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.iu-en.tsv.gz" "gunzip -f wikititles-v2.iu-en.tsv.gz" iu wikititles-v2.iu-en.tsv 1 2 &

  wait

  # remove previous results
  rm -f all.??
  find ./ -maxdepth 1 -name "*.iu" | sort -V | xargs cat | nh/Nunavut-Hansard-Inuktitut-English-Parallel-Corpus-3.0/scripts/normalize-iu-spelling.pl > all.iu
  find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
  paste all.iu all.en | awk -F $'\t' '$1!=""&&$2!=""' > all.iuen
  cut -f1 all.iuen > $DEST/train.iu_CA-en_XX.iu_CA
  cut -f2 all.iuen > $DEST/train.iu_CA-en_XX.en_XX
}

prepare_km() {
  OUTPUT_DIR=$TMP_DIR/km
  mkdir -p $OUTPUT_DIR
  cd $OUTPUT_DIR

  download_and_select paracrawl "http://data.statmt.org/wmt20/translation-task/ps-km/wmt20-sent.en-km.xz" "unxz wmt20-sent.en-km.xz" km wmt20-sent.en-km 2 1 &

  # km-parallel has multiple sets, concat all of them together
  mkdir -p opus
  cd opus
  wget -nc "http://data.statmt.org/wmt20/translation-task/ps-km/km-parallel.tgz"
  tar -zxvf km-parallel.tgz
  find ./km-parallel -maxdepth 1 -name "*.km" | sort -V | xargs cat > opus.km
  find ./km-parallel -maxdepth 1 -name "*.en" | sort -V | xargs cat > opus.en
  cd ..
  ln -sf opus/opus.km .
  ln -sf opus/opus.en .

  wait

  # remove previous results
  rm -f all.??
  find ./ -maxdepth 1 -name "*.km" | sort -V | xargs cat > all.km
  find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
  lid_filter km all.km $DEST/train.km_KH-en_XX.km_KH en all.en $DEST/train.km_KH-en_XX.en_XX
}

prepare_ps() {
  OUTPUT_DIR=$TMP_DIR/ps
  mkdir -p $OUTPUT_DIR
  cd $OUTPUT_DIR

  download_and_select paracrawl "http://data.statmt.org/wmt20/translation-task/ps-km/wmt20-sent.en-ps.xz" "unxz wmt20-sent.en-ps.xz" ps wmt20-sent.en-ps 2 1 &
  download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.ps-en.tsv.gz" "gunzip -f wikititles-v2.ps-en.tsv.gz" ps wikititles-v2.ps-en.tsv 1 2 &
  # ps-parallel has multiple sets, concat all of them together
  mkdir -p opus
  cd opus
  wget -nc "http://data.statmt.org/wmt20/translation-task/ps-km/ps-parallel.tgz"
  tar -zxvf ps-parallel.tgz
  find ./ps-parallel -maxdepth 1 -name "*.ps" | sort -V | xargs cat > opus.ps
  find ./ps-parallel -maxdepth 1 -name "*.en" | sort -V | xargs cat > opus.en
  cd ..
  ln -sf opus/opus.ps opus.ps
  ln -sf opus/opus.en opus.en

  wait

  # remove previous results
  rm -f all.??
  find ./ -maxdepth 1 -name "*.ps" | sort -V | xargs cat > all.ps
  find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
  lid_filter ps all.ps $DEST/train.ps_AF-en_XX.ps_AF en all.en $DEST/train.ps_AF-en_XX.en_XX
}

download_commoncrawl() {
  mkdir -p $COMMONCRAWL_DIR
  cd $COMMONCRAWL_DIR

  wget -nc "http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz"
  tar -zxvf training-parallel-commoncrawl.tgz
}
link_commoncrawl() {
  LANG=$1
  ln -sf $COMMONCRAWL_DIR/commoncrawl.$LANG-en.en commoncrawl.en
  ln -sf $COMMONCRAWL_DIR/commoncrawl.$LANG-en.$LANG commoncrawl.$LANG
}

strip_xlf() {
  INPUT_FILE=$1
  SRC=$2
  TGT=$3
  grep '<source xml:lang=' $INPUT_FILE | sed 's/^<[^<>]*>//g' | sed 's/<[^<>]*>$//g' > $INPUT_FILE.$SRC
  grep '<target xml:lang=' $INPUT_FILE | sed 's/^<[^<>]*>//g' | sed 's/<[^<>]*>$//g' > $INPUT_FILE.$TGT
}

download_and_process_tilde() {
  URL=$1
  UNCOMPRESS_CMD=$2
  FILENAME=$3
  LANG=$4
  PROCESS_CMD=$5

  mkdir -p tilde
  cd tilde
  wget -nc $URL
  $UNCOMPRESS_CMD
  echo "executing cmd"
  echo $PROCESS_CMD
  $PROCESS_CMD
  cd ..
  ln -sf tilde/$FILENAME.$LANG tilde.$LANG
  ln -sf tilde/$FILENAME.en tilde.en
}

prepare_cs() {
  OUTPUT_DIR=$TMP_DIR/cs
  mkdir -p $OUTPUT_DIR
  cd $OUTPUT_DIR

  #download_and_select europarl "http://www.statmt.org/europarl/v10/training/europarl-v10.cs-en.tsv.gz" "gunzip europarl-v10.cs-en.tsv.gz" cs europarl-v10.cs-en.tsv 1 2 &
  #download_and_select paracrawl "https://s3.amazonaws.com/web-language-models/paracrawl/release5.1/en-cs.txt.gz" "gunzip en-cs.txt.gz" cs en-cs.txt 2 1 &
  #link_commoncrawl cs
  #download_and_select newscommentary "http://data.statmt.org/news-commentary/v15/training/news-commentary-v15.cs-en.tsv.gz" "gunzip news-commentary-v15.cs-en.tsv.gz" cs news-commentary-v15.cs-en.tsv 1 2 &
  #download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.cs-en.tsv.gz" "gunzip wikititles-v2.cs-en.tsv.gz" cs wikititles-v2.cs-en.tsv 1 2 &
  #download_and_process_tilde "http://data.statmt.org/wmt20/translation-task/rapid/RAPID_2019.cs-en.xlf.gz" "gunzip RAPID_2019.cs-en.xlf.gz" RAPID_2019.cs-en.xlf cs "strip_xlf RAPID_2019.cs-en.xlf cs en" &
  #download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.cs-en.langid.tsv.gz" "gunzip WikiMatrix.v1.cs-en.langid.tsv.gz" cs WikiMatrix.v1.cs-en.langid.tsv 2 3 &

  #wait

  # remove previous results
  #rm -f all.??
  #find ./ -maxdepth 1 -name "*.cs" | sort -V | xargs cat > all.cs
  #find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
  if [ -z $CZENG_CORPUS ] ;
  then
    echo "Please download CZENG_CORPUS manually and place it at $CZENG_CORPUS. Exiting..."
    exit
  fi
  cat $CZENG_CORPUS | sed '/^$/d' | cut -f5 > all.cs
  cat $CZENG_CORPUS | sed '/^$/d' | cut -f6 > all.en

  lid_filter cs all.cs $DEST/train.cs_CZ-en_XX.cs_CZ en all.en $DEST/train.cs_CZ-en_XX.en_XX
}

prepare_de() {
  OUTPUT_DIR=$TMP_DIR/de
  mkdir -p $OUTPUT_DIR
  cd $OUTPUT_DIR

  download_and_select europarl "http://www.statmt.org/europarl/v10/training/europarl-v10.de-en.tsv.gz" "gunzip europarl-v10.de-en.tsv.gz" de europarl-v10.de-en.tsv 1 2 &
  download_and_select paracrawl "https://s3.amazonaws.com/web-language-models/paracrawl/release5.1/en-de.txt.gz" "gunzip en-de.txt.gz" de en-de.txt 2 1 &
  link_commoncrawl de
  download_and_select newscommentary "http://data.statmt.org/news-commentary/v15/training/news-commentary-v15.de-en.tsv.gz" "gunzip news-commentary-v15.de-en.tsv.gz" de news-commentary-v15.de-en.tsv 1 2 &
  download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.de-en.tsv.gz" "gunzip wikititles-v2.de-en.tsv.gz" de wikititles-v2.de-en.tsv 1 2 &
  download_and_process_tilde "http://data.statmt.org/wmt20/translation-task/rapid/RAPID_2019.de-en.xlf.gz" "gunzip RAPID_2019.de-en.xlf.gz" RAPID_2019.de-en.xlf de "strip_xlf RAPID_2019.de-en.xlf de en" &
  download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.de-en.langid.tsv.gz" "gunzip WikiMatrix.v1.de-en.langid.tsv.gz" de WikiMatrix.v1.de-en.langid.tsv 2 3 &

  wait

  # remove previous results
  rm -f all.??
  find ./ -maxdepth 1 -name "*.de" | sort -V | xargs cat > all.de
  find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
  lid_filter de all.de $DEST/train.de_DE-en_XX.de_DE en all.en $DEST/train.de_DE-en_XX.en_XX
}

prepare_tmx() {
  TMX_FILE=$1
  git clone https://github.com/amake/TMX2Corpus $UTILS/tmx2corpus
  pip install tinysegmenter

  python $UTILS/tmx2corpus/tmx2corpus.py $TMX_FILE
}

prepare_pl() {
  OUTPUT_DIR=$TMP_DIR/pl
  mkdir -p $OUTPUT_DIR
  cd $OUTPUT_DIR

  # download_and_select europarl "http://www.statmt.org/europarl/v10/training/europarl-v10.pl-en.tsv.gz" "gunzip europarl-v10.pl-en.tsv.gz" pl europarl-v10.pl-en.tsv 1 2 &
  # download_and_select paracrawl "https://s3.amazonaws.com/web-language-models/paracrawl/release5.1/en-pl.txt.gz" "gunzip en-pl.txt.gz" pl en-pl.txt 2 1 &
  # download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.pl-en.tsv.gz" "gunzip wikititles-v2.pl-en.tsv.gz" pl wikititles-v2.pl-en.tsv 1 2 &
  download_and_select tilde "https://tilde-model.s3-eu-west-1.amazonaws.com/rapid2019.en-pl.tmx.zip" "gunzip rapid2019.en-pl.tmx.zip" bitext pl "prepare_tmx RAPID_2019.UNIQUE.en-pl.tmx" &
  # download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.en-pl.langid.tsv.gz" "gunzip WikiMatrix.v1.en-pl.langid.tsv.gz" pl WikiMatrix.v1.en-pl.langid.tsv 3 2 &

  wait

  # remove previous results
  rm -f all.??
  find ./ -maxdepth 1 -name "*.pl" | sort -V | xargs cat > all.pl
  find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
  lid_filter pl all.pl $DEST/train.pl_PL-en_XX.pl_PL en all.en $DEST/train.pl_PL-en_XX.en_XX
}

prepare_uncorpus() {
  URLS=$1
  FILES=$2

  mkdir -p uncorpus
  cd uncorpus

  for URL in $URLS; do
    wget -nc $URL
  done
  cat $FILES > uncorpus.tar.gz
  tar -zxvf uncorpus.tar.gz

  cd ..
  ln -sf uncorpus/en-$LANG/UNv1.0.en-$LANG.$LANG uncorpus.$LANG
  ln -sf uncorpus/en-$LANG/UNv1.0.en-$LANG.en uncorpus.en
}

prepare_yandex() {
  mkdir -p yandex
  cd yandex
  unzip $YANDEX_CORPUS ./
  cd ..
  ln -s yandex/corpus.en_ru.1m.en yandex.en
  ln -s yandex/corpus.en_ru.1m.ru yandex.ru
}

prepare_ru() {
  OUTPUT_DIR=$TMP_DIR/ru
  mkdir -p $OUTPUT_DIR
  cd $OUTPUT_DIR

  download_and_select paracrawl "https://s3.amazonaws.com/web-language-models/paracrawl/release1/paracrawl-release1.en-ru.zipporah0-dedup-clean.tgz" "tar -zxvf paracrawl-release1.en-ru.zipporah0-dedup-clean.tgz" ru paracrawl-release1.en-ru.zipporah0-dedup-clean &
  link_commoncrawl ru
  download_and_select newscommentary "http://data.statmt.org/news-commentary/v15/training/news-commentary-v15.en-ru.tsv.gz" "gunzip news-commentary-v15.en-ru.tsv.gz" ru news-commentary-v15.en-ru.tsv 2 1 &
  prepare_yandex &
  download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.ru-en.tsv.gz" "gunzip wikititles-v2.ru-en.tsv.gz" ru wikititles-v2.ru-en.tsv 1 2 &
  prepare_uncorpus "https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.00 https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.01 https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.02" "UNv1.0.en-ru.tar.gz.00 UNv1.0.en-ru.tar.gz.01 UNv1.0.en-ru.tar.gz.02" &
  download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.en-ru.langid.tsv.gz" "gunzip WikiMatrix.v1.en-ru.langid.tsv.gz" ru WikiMatrix.v1.en-ru.langid.tsv 3 2 &

  wait

  # remove previous results
  rm -f all.??
  find ./ -maxdepth 1 -name "*.ru" | sort -V | xargs cat > all.ru
  find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
  lid_filter ru all.ru $DEST/train.ru_RU-en_XX.ru_RU en all.en $DEST/train.ru_RU-en_XX.en_XX
}

prepare_ccmt() {
  mkdir -p ccmt
  cd ccmt
  # assume ccmt data is already unzipped under CCMT_DIR folder
  cat $CCMT_DIR/datum2017/Book*_cn.txt | sed 's/ //g' > datum2017.detok.zh
  cat $CCMT_DIR/datum2017/Book*_en.txt > datum2017.detok.en
  cat $CCMT_DIR/casict2011/casict-A_ch.txt $CCMT_DIR/casict2011/casict-B_ch.txt $CCMT_DIR/casict2015/casict2015_ch.txt $CCMT_DIR/datum2015/datum_ch.txt $CCMT_DIR/neu2017/NEU_cn.txt datum2017.detok.zh > ccmt.zh
  cat $CCMT_DIR/casict2011/casict-A_en.txt $CCMT_DIR/casict2011/casict-B_en.txt $CCMT_DIR/casict2015/casict2015_en.txt $CCMT_DIR/datum2015/datum_en.txt $CCMT_DIR/neu2017/NEU_en.txt datum2017.detok.en > ccmt.en
  cd ..
  ln -sf ccmt/ccmt.zh ccmt.zh
  ln -sf ccmt/ccmt.en ccmt.en
}

prepare_zh() {
  OUTPUT_DIR=$TMP_DIR/zh
  mkdir -p $OUTPUT_DIR
  cd $OUTPUT_DIR

  download_and_select newscommentary "http://data.statmt.org/news-commentary/v15/training/news-commentary-v15.en-zh.tsv.gz" "gunzip news-commentary-v15.en-zh.tsv.gz" zh news-commentary-v15.en-zh.tsv 2 1 &
  download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.zh-en.tsv.gz" "gunzip wikititles-v2.zh-en.tsv.gz" zh wikititles-v2.zh-en.tsv 1 2 &
  prepare_uncorpus "https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.00 https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.01" "UNv1.0.en-zh.tar.gz.00 UNv1.0.en-zh.tar.gz.01" &
  prepare_ccmt &
  download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.en-zh.langid.tsv.gz" "gunzip WikiMatrix.v1.en-zh.langid.tsv.gz" zh WikiMatrix.v1.en-zh.langid.tsv 3 2 &

  wait

  # remove previous results
  rm -f all.??
  find ./ -maxdepth 1 -name "*.zh" | sort -V | xargs cat > all.zh
  find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
  lid_filter zh all.zh $DEST/train.zh_CN-en_XX.zh_CN en all.en $DEST/train.zh_CN-en_XX.en_XX
}

prepare_tests() {
  OUTPUT_DIR=$TMP_DIR
  mkdir -p $OUTPUT_DIR
  cd $OUTPUT_DIR
  wget -nc http://data.statmt.org/wmt20/translation-task/dev.tgz
  tar -zxvf dev.tgz
  cd dev

  cat newsdev2020-jaen-src.ja.sgm | $UTILS/strip_sgm.sh > newsdev2020-jaen.ja
  cat newsdev2020-jaen-ref.en.sgm | $UTILS/strip_sgm.sh > newsdev2020-jaen.en
  split newsdev2020-jaen.ja -a 0 -n r/1/2 > $DEST/valid.ja_XX-en_XX.ja_XX
  split newsdev2020-jaen.en -a 0 -n r/1/2 > $DEST/valid.ja_XX-en_XX.en_XX
  split newsdev2020-jaen.ja -a 0 -n r/2/2 > $DEST/test.ja_XX-en_XX.ja_XX
  split newsdev2020-jaen.en -a 0 -n r/2/2 > $DEST/test.ja_XX-en_XX.en_XX

  cat newsdev2020-iuen-src.iu.sgm | strip_sgm.sh > newsdev2020-iuen.iu
  cat newsdev2020-iuen-ref.en.sgm | strip_sgm.sh > newsdev2020-iuen.en
  split newsdev2020-iuen.iu -a 0 -n r/1/2 > $DEST/valid.iu_CA-en_XX.iu_CA
  split newsdev2020-iuen.en -a 0 -n r/1/2 > $DEST/valid.iu_CA-en_XX.en_XX
  split newsdev2020-iuen.iu -a 0 -n r/2/2 > $DEST/test.iu_CA-en_XX.iu_CA
  split newsdev2020-iuen.en -a 0 -n r/2/2 > $DEST/test.iu_CA-en_XX.en_XX

  cat newsdev2020-taen-src.ta.sgm | strip_sgm.sh > newsdev2020-taen.ta
  cat newsdev2020-taen-ref.en.sgm | strip_sgm.sh > newsdev2020-taen.en
  split newsdev2020-taen.ta -a 0 -n r/1/2 > $DEST/valid.ta_IN-en_XX.ta_IN
  split newsdev2020-taen.en -a 0 -n r/1/2 > $DEST/valid.ta_IN-en_XX.en_XX
  split newsdev2020-taen.ta -a 0 -n r/2/2 > $DEST/test.ta_IN-en_XX.ta_IN
  split newsdev2020-taen.en -a 0 -n r/2/2 > $DEST/test.ta_IN-en_XX.en_XX

  cp wikipedia.dev.km-en.km $DEST/valid.km_KH-en_XX.km_KH
  cp wikipedia.dev.km-en.en $DEST/valid.km_KH-en_XX.en_XX
  cp wikipedia.devtest.km-en.km $DEST/test.km_KH-en_XX.km_KH
  cp wikipedia.devtest.km-en.en $DEST/test.km_KH-en_XX.en_XX

  cp wikipedia.dev.ps-en.ps $DEST/valid.ps_AF-en_XX.ps_AF
  cp wikipedia.dev.ps-en.en $DEST/valid.ps_AF-en_XX.en_XX
  cp wikipedia.devtest.ps-en.ps $DEST/test.ps_AF-en_XX.ps_AF
  cp wikipedia.devtest.ps-en.en $DEST/test.ps_AF-en_XX.en_XX

  cat newsdev2020-plen-src.pl.sgm | strip_sgm.sh > newsdev2020-plen.pl
  cat newsdev2020-plen-ref.en.sgm | strip_sgm.sh > newsdev2020-plen.en
  split newsdev2020-plen.pl -a 0 -n r/1/2 > $DEST/valid.pl_PL-en_XX.pl_PL
  split newsdev2020-plen.en -a 0 -n r/1/2 > $DEST/valid.pl_PL-en_XX.en_XX
  split newsdev2020-plen.pl -a 0 -n r/2/2 > $DEST/test.pl_PL-en_XX.pl_PL
  split newsdev2020-plen.en -a 0 -n r/2/2 > $DEST/test.pl_PL-en_XX.en_XX

  cat newstest2018-encs-src.en.sgm | strip_sgm.sh > $DEST/valid.en_XX-cs_CZ.en_XX
  cat newstest2018-encs-ref.cs.sgm | strip_sgm.sh > $DEST/valid.en_XX-cs_CZ.cs_CZ
  cat newstest2019-encs-src.en.sgm | strip_sgm.sh > $DEST/test.en_XX-cs_CZ.en_XX
  cat newstest2019-encs-ref.cs.sgm | strip_sgm.sh > $DEST/test.en_XX-cs_CZ.cs_CZ

  cat newstest2018-deen-src.de.sgm | strip_sgm.sh > $DEST/valid.de_DE-en_XX.de_DE
  cat newstest2018-deen-ref.en.sgm | strip_sgm.sh > $DEST/valid.de_DE-en_XX.en_XX
  cat newstest2018-ende-src.en.sgm | strip_sgm.sh > $DEST/valid.en_XX-de_DE.en_XX
  cat newstest2018-ende-ref.de.sgm | strip_sgm.sh > $DEST/valid.en_XX-de_DE.de_DE
  cat newstest2019-deen-src.de.sgm | strip_sgm.sh > $DEST/test.de_DE-en_XX.de_DE
  cat newstest2019-deen-ref.en.sgm | strip_sgm.sh > $DEST/test.de_DE-en_XX.en_XX
  cat newstest2019-ende-src.en.sgm | strip_sgm.sh > $DEST/test.en_XX-de_DE.en_XX
  cat newstest2019-ende-ref.de.sgm | strip_sgm.sh > $DEST/test.en_XX-de_DE.de_DE

  cat newstest2018-ruen-src.ru.sgm | strip_sgm.sh > $DEST/valid.ru_RU-en_XX.ru_RU
  cat newstest2018-ruen-ref.en.sgm | strip_sgm.sh > $DEST/valid.ru_RU-en_XX.en_XX
  cat newstest2018-enru-src.en.sgm | strip_sgm.sh > $DEST/valid.en_XX-ru_RU.en_XX
  cat newstest2018-enru-ref.ru.sgm | strip_sgm.sh > $DEST/valid.en_XX-ru_RU.ru_RU
  cat newstest2019-ruen-src.ru.sgm | strip_sgm.sh > $DEST/test.ru_RU-en_XX.ru_RU
  cat newstest2019-ruen-ref.en.sgm | strip_sgm.sh > $DEST/test.ru_RU-en_XX.en_XX
  cat newstest2019-enru-src.en.sgm | strip_sgm.sh > $DEST/test.en_XX-ru_RU.en_XX
  cat newstest2019-enru-ref.ru.sgm | strip_sgm.sh > $DEST/test.en_XX-ru_RU.ru_RU

  cat newstest2018-zhen-src.zh.sgm | strip_sgm.sh > $DEST/valid.zh_CN-en_XX.zh_CN
  cat newstest2018-zhen-ref.en.sgm | strip_sgm.sh > $DEST/valid.zh_CN-en_XX.en_XX
  cat newstest2018-enzh-src.en.sgm | strip_sgm.sh > $DEST/valid.en_XX-zh_CN.en_XX
  cat newstest2018-enzh-ref.zh.sgm | strip_sgm.sh > $DEST/valid.en_XX-zh_CN.zh_CN
  cat newstest2019-zhen-src.zh.sgm | strip_sgm.sh > $DEST/test.zh_CN-en_XX.zh_CN
  cat newstest2019-zhen-ref.en.sgm | strip_sgm.sh > $DEST/test.zh_CN-en_XX.en_XX
  cat newstest2019-enzh-src.en.sgm | strip_sgm.sh > $DEST/test.en_XX-zh_CN.en_XX
  cat newstest2019-enzh-ref.zh.sgm | strip_sgm.sh > $DEST/test.en_XX-zh_CN.zh_CN
}

mkdir -p $DEST

prepare_lid
prepare_moses
download_commoncrawl

prepare_ja &
prepare_ta &
prepare_km &
prepare_ps &
prepare_iu &
prepare_cs &
prepare_de &
prepare_pl &
prepare_ru &
prepare_zh &

# prepare valid/test set
prepare_tests &

# wait

# TODO remove intermediate files
# rm -rf $TMP_DIR
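
A usage sketch for download_wmt20.sh (WORKDIR_ROOT is required by the check at the top of the script, and the CzEng, CCMT and Yandex corpora must already sit at the paths configured there; the directory below is illustrative):
    export WORKDIR_ROOT=/path/to/workdir
    bash fairseq/examples/multilingual/data_scripts/download_wmt20.sh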
fairseq/examples/multilingual/data_scripts/preprocess_ML50_v1.sh
ADDED
@@ -0,0 +1,27 @@
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

if [ -z $WORKDIR_ROOT ] ;
then
        echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exiting..."
        exit
fi

if [ -z $SPM_PATH ] ;
then
        echo "Please install sentencepiece from https://github.com/google/sentencepiece and set SPM_PATH pointing to the installed spm_encode.py. Exiting..."
        exit
fi

ML50=${WORKDIR_ROOT}/ML50

mkdir -p $ML50/dedup
mkdir -p $ML50/cleaned_dedup

python ./dedup_all.py --from-folder $ML50/raw --to-folder $ML50/dedup
python ./remove_valid_test_in_train.py --from-folder $ML50/dedup --to-folder $ML50/clean
python ./binarize.py --raw-folder $ML50/clean
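
A usage sketch for preprocess_ML50_v1.sh, which expects both environment variables checked above (the paths are illustrative):
    export WORKDIR_ROOT=/path/to/workdir
    export SPM_PATH=/path/to/sentencepiece/spm_encode.py
    bash fairseq/examples/multilingual/data_scripts/preprocess_ML50_v1.sh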
fairseq/examples/multilingual/data_scripts/remove_valid_test_in_train.py
ADDED
@@ -0,0 +1,290 @@
import os, sys
import glob, itertools
import pandas as pd

WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None)

if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip():
    print('please specify your working directory root in OS environment variable WORKDIR_ROOT. Exiting...')
    sys.exit(-1)


def load_langs(path):
    with open(path) as fr:
        langs = [l.strip() for l in fr]
    return langs



def load_sentences(raw_data, split, direction):
    src, tgt = direction.split('-')
    src_path = f"{raw_data}/{split}.{direction}.{src}"
    tgt_path = f"{raw_data}/{split}.{direction}.{tgt}"
    if os.path.exists(src_path) and os.path.exists(tgt_path):
        return [(src, open(src_path).read().splitlines()), (tgt, open(tgt_path).read().splitlines())]
    else:
        return []

def swap_direction(d):
    src, tgt = d.split('-')
    return f'{tgt}-{src}'

def get_all_test_data(raw_data, directions, split='test'):
    test_data = [
        x
        for dd in directions
        for d in [dd, swap_direction(dd)]
        for x in load_sentences(raw_data, split, d)
    ]
    # all_test_data = {s for _, d in test_data for s in d}
    all_test_data = {}
    for lang, d in test_data:
        for s in d:
            s = s.strip()
            lgs = all_test_data.get(s, set())
            lgs.add(lang)
            all_test_data[s] = lgs
    return all_test_data, test_data

def check_train_sentences(raw_data, direction, all_test_data, mess_up_train={}):
    src, tgt = direction.split('-')
    tgt_path = f"{raw_data}/train.{direction}.{tgt}"
    src_path = f"{raw_data}/train.{direction}.{src}"
    print(f'check training data in {raw_data}/train.{direction}')
    size = 0
    if not os.path.exists(tgt_path) or not os.path.exists(src_path):
        return mess_up_train, size
    with open(src_path) as f, open(tgt_path) as g:
        for src_line, tgt_line in zip(f, g):
            s = src_line.strip()
            t = tgt_line.strip()
            size += 1
            if s in all_test_data:
                langs = mess_up_train.get(s, set())
                langs.add(direction)
                mess_up_train[s] = langs
            if t in all_test_data:
                langs = mess_up_train.get(t, set())
                langs.add(direction)
                mess_up_train[t] = langs
    return mess_up_train, size

def check_train_all(raw_data, directions, all_test_data):
    mess_up_train = {}
    data_sizes = {}
    for direction in directions:
        _, size = check_train_sentences(raw_data, direction, all_test_data, mess_up_train)
        data_sizes[direction] = size
    return mess_up_train, data_sizes

def count_train_in_other_set(mess_up_train):
    train_in_others = [(direction, s) for s, directions in mess_up_train.items() for direction in directions]
    counts = {}
    for direction, s in train_in_others:
        counts[direction] = counts.get(direction, 0) + 1
    return counts

def train_size_if_remove_in_otherset(data_sizes, mess_up_train):
    counts_in_other = count_train_in_other_set(mess_up_train)
    remain_sizes = []
    for direction, count in counts_in_other.items():
        remain_sizes.append((direction, data_sizes[direction] - count, data_sizes[direction], count, 100 * count / data_sizes[direction]))
    return remain_sizes


def remove_messed_up_sentences(raw_data, direction, mess_up_train, mess_up_train_pairs, corrected_langs):
    split = 'train'
    src_lang, tgt_lang = direction.split('-')

    tgt = f"{raw_data}/{split}.{direction}.{tgt_lang}"
    src = f"{raw_data}/{split}.{direction}.{src_lang}"
    print(f'working on {direction}: ', src, tgt)
    if not os.path.exists(tgt) or not os.path.exists(src):
        return 0, 0  # keep the caller's tuple unpacking and size check working when the files are missing

    corrected_tgt = f"{to_folder}/{split}.{direction}.{tgt_lang}"
    corrected_src = f"{to_folder}/{split}.{direction}.{src_lang}"
    line_num = 0
    keep_num = 0
    with open(src, encoding='utf8',) as fsrc, \
        open(tgt, encoding='utf8',) as ftgt, \
        open(corrected_src, 'w', encoding='utf8') as fsrc_corrected, \
        open(corrected_tgt, 'w', encoding='utf8') as ftgt_corrected:
        for s, t in zip(fsrc, ftgt):
            s = s.strip()
            t = t.strip()
            if t not in mess_up_train \
                    and s not in mess_up_train \
                    and (s, t) not in mess_up_train_pairs \
                    and (t, s) not in mess_up_train_pairs:
                corrected_langs.add(direction)
                print(s, file=fsrc_corrected)
                print(t, file=ftgt_corrected)
                keep_num += 1
            line_num += 1
            if line_num % 1000 == 0:
                print(f'completed {line_num} lines', end='\r')
    return line_num, keep_num

##########


def merge_valid_test_messup(mess_up_train_valid, mess_up_train_test):
    merged_mess = []
    for s in set(list(mess_up_train_valid.keys()) + list(mess_up_train_test.keys())):
        if not s:
            continue
        valid = mess_up_train_valid.get(s, set())
        test = mess_up_train_test.get(s, set())
        merged_mess.append((s, valid | test))
    return dict(merged_mess)



#########
def check_train_pairs(raw_data, direction, all_test_data, mess_up_train={}):
    src, tgt = direction.split('-')
    #a hack; TODO: check the reversed directions
    path1 = f"{raw_data}/train.{src}-{tgt}.{src}"
    path2 = f"{raw_data}/train.{src}-{tgt}.{tgt}"
    if not os.path.exists(path1) or not os.path.exists(path2):
        return

    with open(path1) as f1, open(path2) as f2:
        for src_line, tgt_line in zip(f1, f2):
            s = src_line.strip()
            t = tgt_line.strip()
            if (s, t) in all_test_data or (t, s) in all_test_data:
                langs = mess_up_train.get((s, t), set())
                langs.add(src)
                langs.add(tgt)
                mess_up_train[(s, t)] = langs


def load_pairs(raw_data, split, direction):
    src, tgt = direction.split('-')
    src_f = f"{raw_data}/{split}.{direction}.{src}"
    tgt_f = f"{raw_data}/{split}.{direction}.{tgt}"
    if tgt != 'en_XX':
        src_f, tgt_f = tgt_f, src_f
    if os.path.exists(src_f) and os.path.exists(tgt_f):
        return list(zip(open(src_f).read().splitlines(),
                        open(tgt_f).read().splitlines(),
                        ))
    else:
        return []

# skip_langs = ['cs_CZ', 'en_XX', 'tl_XX', 'tr_TR']
def get_messed_up_test_pairs(split, directions):
    test_pairs = [
        (d, load_pairs(raw_data, split, d))
        for d in directions
    ]
    # all_test_data = {s for _, d in test_data for s in d}
    all_test_pairs = {}
    for direction, d in test_pairs:
        src, tgt = direction.split('-')
        for s in d:
            langs = all_test_pairs.get(s, set())
            langs.add(src)
            langs.add(tgt)
            all_test_pairs[s] = langs
    mess_up_train_pairs = {}
    for direction in directions:
        check_train_pairs(raw_data, direction, all_test_pairs, mess_up_train_pairs)
    return all_test_pairs, mess_up_train_pairs



if __name__ == "__main__":
    #######
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--from-folder',
        required=True,
        type=str)
    parser.add_argument(
        '--to-folder',
        required=True,
        type=str)
    parser.add_argument(
        '--directions',
        default=None,
        type=str)

    args = parser.parse_args()
    raw_data = args.from_folder
    to_folder = args.to_folder
    os.makedirs(to_folder, exist_ok=True)

    if args.directions:
        directions = args.directions.split(',')
    else:
        raw_files = itertools.chain(
            glob.glob(f'{raw_data}/train*'),
            glob.glob(f'{raw_data}/valid*'),
            glob.glob(f'{raw_data}/test*'),
        )
        directions = [os.path.split(file_path)[-1].split('.')[1] for file_path in raw_files]
    print('working on directions: ', directions)

    ##########


    all_test_data, test_data = get_all_test_data(raw_data, directions, 'test')
    print('==loaded test data==')
    all_valid_data, valid_data = get_all_test_data(raw_data, directions, 'valid')
    print('==loaded valid data==')
    all_valid_test_data = merge_valid_test_messup(all_test_data, all_valid_data)
    mess_up_train, data_sizes = check_train_all(raw_data, directions, all_valid_test_data)
    print('training messing up with valid, test data:', len(mess_up_train))
    data_situation = train_size_if_remove_in_otherset(data_sizes, mess_up_train)
    df = pd.DataFrame(data_situation, columns=['direction', 'train_size_after_remove', 'orig_size', 'num_to_remove', 'remove_percent'])
    df.sort_values('remove_percent', ascending=False)
    df.to_csv(f'{raw_data}/clean_summary.tsv', sep='\t')
    print(f'projected data clean summary in: {raw_data}/clean_summary.tsv')

    # correct the dataset:
    all_test_pairs, mess_up_test_train_pairs = get_messed_up_test_pairs('test', directions)
    all_valid_pairs, mess_up_valid_train_pairs = get_messed_up_test_pairs('valid', directions)

    all_messed_pairs = set(mess_up_test_train_pairs.keys()).union(set(mess_up_valid_train_pairs.keys()))
    corrected_directions = set()

    real_data_situation = []
    for direction in directions:
        org_size, new_size = remove_messed_up_sentences(raw_data, direction, mess_up_train, all_messed_pairs, corrected_directions)
        if org_size == 0:
            print(f"{direction} has size 0")
            continue
        real_data_situation.append(
            (direction, new_size, org_size, org_size - new_size, (org_size - new_size) / org_size * 100)
        )
    print('corrected directions: ', corrected_directions)
    df = pd.DataFrame(real_data_situation, columns=['direction', 'train_size_after_remove', 'orig_size', 'num_to_remove', 'remove_percent'])
    df.sort_values('remove_percent', ascending=False)
    df.to_csv(f'{raw_data}/actual_clean_summary.tsv', sep='\t')
    print(f'actual data clean summary (which can be different from the projected one because of duplications) in: {raw_data}/actual_clean_summary.tsv')

    import shutil
    for direction in directions:
        src_lang, tgt_lang = direction.split('-')
        for split in ['train', 'valid', 'test']:
            # copying valid, test and uncorrected train
            if direction in corrected_directions and split == 'train':
                continue
            tgt = f"{raw_data}/{split}.{direction}.{tgt_lang}"
            src = f"{raw_data}/{split}.{direction}.{src_lang}"
            if not (os.path.exists(src) and os.path.exists(tgt)):
                continue
            corrected_tgt = f"{to_folder}/{split}.{direction}.{tgt_lang}"
            corrected_src = f"{to_folder}/{split}.{direction}.{src_lang}"
            print(f'copying {src} to {corrected_src}')
            shutil.copyfile(src, corrected_src)
            print(f'copying {tgt} to {corrected_tgt}')
            shutil.copyfile(tgt, corrected_tgt)

    print('completed')
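
A usage sketch for remove_valid_test_in_train.py, mirroring how preprocess_ML50_v1.sh invokes it (the working directory is illustrative):
    export WORKDIR_ROOT=/path/to/workdir
    python remove_valid_test_in_train.py --from-folder $WORKDIR_ROOT/ML50/dedup --to-folder $WORKDIR_ROOT/ML50/clean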
fairseq/examples/multilingual/data_scripts/requirement.txt
ADDED
@@ -0,0 +1,2 @@
wget
pandas
fairseq/examples/multilingual/data_scripts/utils/dedup.py
ADDED
@@ -0,0 +1,41 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.


import argparse

def deup(src_file, tgt_file, src_file_out, tgt_file_out):
    seen = set()
    dup_count = 0
    with open(src_file, encoding='utf-8') as fsrc, \
        open(tgt_file, encoding='utf-8') as ftgt, \
        open(src_file_out, 'w', encoding='utf-8') as fsrc_out, \
        open(tgt_file_out, 'w', encoding='utf-8') as ftgt_out:
        for s, t in zip(fsrc, ftgt):
            if (s, t) not in seen:
                fsrc_out.write(s)
                ftgt_out.write(t)
                seen.add((s, t))
            else:
                dup_count += 1
    print(f'number of duplicates: {dup_count}')


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--src-file", type=str, required=True,
                        help="src file")
    parser.add_argument("--tgt-file", type=str, required=True,
                        help="tgt file")
    parser.add_argument("--src-file-out", type=str, required=True,
                        help="src output file")
    parser.add_argument("--tgt-file-out", type=str, required=True,
                        help="tgt output file")
    args = parser.parse_args()
    deup(args.src_file, args.tgt_file, args.src_file_out, args.tgt_file_out)


if __name__ == "__main__":
    main()
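
A usage sketch for dedup.py on a single direction (the file names are illustrative, following the train.{direction}.{lang} convention used elsewhere in these scripts):
    python utils/dedup.py --src-file raw/train.de_DE-en_XX.de_DE --tgt-file raw/train.de_DE-en_XX.en_XX \
        --src-file-out dedup/train.de_DE-en_XX.de_DE --tgt-file-out dedup/train.de_DE-en_XX.en_XX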
fairseq/examples/multilingual/data_scripts/utils/fasttext_multi_filter.py
ADDED
@@ -0,0 +1,63 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+#!/bin/python
+
+import fasttext
+from multiprocessing import Pool
+import contextlib
+import sys
+import argparse
+from functools import partial
+import io
+
+model = None
+def init(model_path):
+    global model
+    model = fasttext.load_model(model_path)
+
+def pred(lines):
+    # fastText LID labels look like "__label__en"; strip the 9-character prefix
+    return lines, [model.predict(line.strip())[0][0][9:] for line in lines]
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", type=str, required=True,
+                        help="model to load")
+    parser.add_argument("--inputs", nargs="+", default=['-'],
+                        help="input files to filter")
+    parser.add_argument("--langs", nargs="+", required=True,
+                        help="lang ids of each input file")
+    parser.add_argument("--outputs", nargs="+", default=['-'],
+                        help="path to save lid filtered outputs")
+    parser.add_argument("--num-workers", type=int, metavar="N", default=10,
+                        help="number of processes in parallel")
+    args = parser.parse_args()
+
+    assert len(args.inputs) == len(args.langs) and len(args.inputs) == len(args.outputs)
+
+    with contextlib.ExitStack() as stack:
+        inputs = [
+            stack.enter_context(open(input, "r", encoding="utf-8", newline="\n", errors="replace"))
+            if input != "-" else io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8', errors="replace")
+            for input in args.inputs
+        ]
+        outputs = [
+            stack.enter_context(open(output, "w", encoding="utf-8", newline="\n"))
+            if output != "-" else sys.stdout
+            for output in args.outputs
+        ]
+        with Pool(args.num_workers, initializer=partial(init, args.model)) as p:
+            skip_cnt = 0
+            for lines, preds in p.imap(pred, list(zip(*inputs)), chunksize=500):
+                if not all(a == b for a, b in zip(preds, args.langs)):
+                    skip_cnt += 1
+                    continue
+                for line, output_h in zip(lines, outputs):
+                    print(line.strip(), file=output_h)
+            print(f"Skipped {skip_cnt} lines.")
+
+if __name__ == "__main__":
+    main()
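A minimal usage sketch for the LID filter above, assuming a downloaded fastText language-identification model (e.g. lid.176.bin); all data paths are placeholders. A parallel pair is written out only when every side is predicted as its expected language:

# Hypothetical usage: keep only pairs whose sides match the expected languages.
python fairseq/examples/multilingual/data_scripts/utils/fasttext_multi_filter.py \
    --model lid.176.bin \
    --inputs dedup/train.en dedup/train.fr \
    --langs en fr \
    --outputs filtered/train.en filtered/train.fr \
    --num-workers 10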
fairseq/examples/multilingual/data_scripts/utils/strip_sgm.sh
ADDED
@@ -0,0 +1 @@
+grep "seg id" | sed 's/<seg id="[0-9]\+">//g' | sed 's/<\/seg>//g'
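This one-liner reads WMT-style SGM test sets on stdin and emits the plain text inside each <seg> tag. A sketch of how it might be used (the input and output file names are placeholders):

# Hypothetical usage: strip SGM markup from a downloaded test set.
bash fairseq/examples/multilingual/data_scripts/utils/strip_sgm.sh \
    < newstest2019-csen-src.cs.sgm > newstest2019.cs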
fairseq/examples/multilingual/finetune_multilingual_model.sh
ADDED
@@ -0,0 +1,32 @@
+#!/bin/bash
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+path_2_data=$1  # <path to data> which contains binarized data for each direction
+lang_list=$2  # <path to a file which contains a list of languages separated by new lines>
+lang_pairs=$3  # a list of language pairs to train multilingual models, e.g. "en-fr,en-cs,fr-en,cs-en"
+# pretrained can be an mBART pretrained model as well
+pretrained_model=$4  # <path to a pretrained model>
+
+
+fairseq-train "$path_2_data" \
+  --encoder-normalize-before --decoder-normalize-before \
+  --arch transformer --layernorm-embedding \
+  --task translation_multi_simple_epoch \
+  --finetune-from-model "$pretrained_model" \
+  --sampling-method "temperature" \
+  --sampling-temperature "1.5" \
+  --encoder-langtok "src" \
+  --decoder-langtok \
+  --lang-dict "$lang_list" \
+  --lang-pairs "$lang_pairs" \
+  --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \
+  --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \
+  --lr-scheduler inverse_sqrt --lr 3e-05 --warmup-updates 2500 --max-update 40000 \
+  --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \
+  --max-tokens 1024 --update-freq 2 \
+  --save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \
+  --seed 222 --log-format simple --log-interval 2
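A sketch of how the fine-tuning script might be invoked; the data directory, language-list file, and checkpoint path are placeholders (the script takes them as the four positional arguments defined above):

# Hypothetical usage: fine-tune a pretrained (e.g. mBART) checkpoint.
bash fairseq/examples/multilingual/finetune_multilingual_model.sh \
    data_bin/ml50 \
    data_bin/ml50/langs.txt \
    "en-fr,en-cs,fr-en,cs-en" \
    checkpoints/mbart.pt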
fairseq/examples/multilingual/multilingual_fairseq_gen.sh
ADDED
@@ -0,0 +1,26 @@
+#!/bin/bash
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+lang_pairs="en-fr,en-cs,fr-en,cs-en"
+path_2_data=$1  # <path to data>
+lang_list=$2  # <path to a file which contains a list of languages separated by new lines>
+model=$3  # <path to a trained model>
+source_lang=cs
+target_lang=en
+
+fairseq-generate "$path_2_data" \
+  --path "$model" \
+  --task translation_multi_simple_epoch \
+  --gen-subset test \
+  --source-lang "$source_lang" \
+  --target-lang "$target_lang" \
+  --sacrebleu --remove-bpe 'sentencepiece' \
+  --batch-size 32 \
+  --encoder-langtok "src" \
+  --decoder-langtok \
+  --lang-dict "$lang_list" \
+  --lang-pairs "$lang_pairs"
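A matching decoding sketch; the paths are placeholders, and source_lang/target_lang are hard-coded to cs-en in the script, so edit them for other directions:

# Hypothetical usage: generate cs-en translations for the test split.
bash fairseq/examples/multilingual/multilingual_fairseq_gen.sh \
    data_bin/ml50 \
    data_bin/ml50/langs.txt \
    checkpoints/checkpoint_best.pt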